#include <ctype.h>
#include <errno.h>
#include <limits.h>
+#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "alloc.h"
+#include "arena.h"
#include "bench.h"
#include "bits.h"
#include "dstr.h"
# include <cpuid.h>
# define CPUID_1D_TSC (1u << 4)
# define CPUID_1xD_TSCP (1u << 27)
+# define USE_X86_RDTSC 1
#endif
#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
+# include <sys/syscall.h>
# include <sys/types.h>
# include <unistd.h>
# include <linux/perf_event.h>
-# include <asm/unistd.h>
+# ifdef HAVE_VALGRIND_VALGRIND_H
+# include <valgrind/valgrind.h>
+# endif
+# define USE_LINUX_PERFEVENT 1
# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
# include <sys/mman.h>
+# define USE_LINUX_PERFEVRDPMC 1
# endif
#endif
struct timer {
struct bench_timer _t;
+ arena *a;
const struct timer_ops *ops[NTIMER]; /* subtimers for clock and cycles */
union {
+#ifdef USE_X86_RDTSC
unsigned tscaux; /* `ia32_tsc_aux' for `ldtscp' */
+#endif
+#ifdef USE_LINUX_PERFEVENT
int fd; /* vanilla `perf_event_open' */
- struct { const volatile void *map; size_t sz; } pmc; /* `perf_event_open'
- * with `rdpmc' */
+#endif
+#ifdef USE_LINUX_PERFEVRDPMC
+ struct { /* `perf_event_open' with `rdpmc' */
+ const volatile void *map; size_t sz; /* memory-mapped info */
+ pid_t owner; /* owning thread id */
+ } pmc;
+#endif
} u_cy; /* state for cycle measurement */
};
struct timer_ops {
const char *name; /* timer name */
unsigned f; /* flags */
-#define TF_SECRET 1u /* don't try this automatically */
+/* ... @BTF_...OK@ flags */ /* expected results */
+#define TF_SECRET 16u /* don't try this automatically */
int (*init)(struct timer */*t*/); /* initialization function */
+ int (*preflight)(struct timer */*t*/); /* preflight checks */
int (*now)(struct timer */*t*/, /* read current */
struct bench_time */*t_out*/, unsigned /*f*/);
void (*diff)(struct timer */*t*/, /* difference */
}
}
-/*----- Difference utilities ----------------------------------------------*/
-
#ifdef HAVE_UINT64
# define FLOATK64(k) ((double)(k).i)
#else
const struct bench_time *t0, const struct bench_time *t1)
{
unsigned f = t0->f&t1->f;
- kludge64 k;
+ kludge64 delta_s;
+ uint32 delta_ns;
if (f&BTF_TIMEOK) {
- /* Calculate the integer difference in seconds. */
- SUB64(k, t1->t.ts.s, t0->t.ts.s);
+ /* Calculate the integer differences in seconds and nanoseconds
+ * independently. To avoid underflow, though, add a second's worth of
+ * nanoseconds which we'll subtract off later.
+ */
+ SUB64(delta_s, t1->t.ts.s, t0->t.ts.s);
+ delta_ns = t1->t.ts.ns + NS_PER_S - t0->t.ts.ns;
+
+ /* Hack if they're both equal. */
+ if (ZERO64(delta_s) && !delta_ns) delta_ns = 1;
- /* And apply the nanoseconds difference. To prevent underflow,
- * pre-emptively borrow one from the integer difference.
+ /* And apply the nanoseconds difference. To prevent underflow, pre-
+ * emptively borrow one from the integer difference.
*/
- delta_inout->t =
- FLOATK64(k) - 1.0 +
- (t1->t.ts.ns + NS_PER_S - t0->t.ts.ns)/(double)NS_PER_S;
+ delta_inout->t = FLOATK64(delta_s) - 1.0 + delta_ns/(double)NS_PER_S;
/* Done. */
delta_inout->f |= BTF_TIMEOK;
const struct bench_time *t1)
{
unsigned f = t0->f&t1->f;
- kludge64 k;
+ kludge64 delta_cy;
if (f&BTF_CYOK) {
- SUB64(k, t1->cy, t0->cy); delta_inout->cy = FLOATK64(k);
+ SUB64(delta_cy, t1->cy, t0->cy); delta_inout->cy = FLOATK64(delta_cy);
+ if (!delta_inout->cy) delta_inout->cy = 1;
delta_inout->f |= BTF_CYOK;
}
}
#undef FLOATK64
/* --- @normalize@ --- *
 *
 * Arguments:	@double *x_inout@ = address of a value to normalize
 *		@const char **unit_out@ = address to store unit prefix
 *		@double scale@ = scale factor for unit steps
 *
 * Returns:	---
 *
 * Use:		Adjust @*x_inout@ by a power of @scale@, and set @*unit_out@
 *		so that printing the two reflects the original value with an
 *		appropriate SI unit scaling.  The @scale@ should be 1024 for
 *		binary quantities, most notably memory sizes, or 1000 for
 *		other quantities.
 */

static void normalize(double *x_inout, const char **unit_out, double scale)
{
  static const char
    *const nothing = "",			/* no prefix at all */
    *const big[] = { "k", "M", "G", "T", "P", "E", 0 },
    *const little[] = { "m", "µ", "n", "p", "f", "a", 0 };
  const char *const *u;
  double x = *x_inout;

  /* Walk up or down the prefix table until the value lands in [1, scale),
   * or we run off the end of the table.
   */
  if (x < 1)
    for (u = little, x *= scale; x < 1 && u[1]; u++, x *= scale);
  else if (x >= scale)
    for (u = big, x /= scale; x >= scale && u[1]; u++, x /= scale);
  else
    u = &nothing;

  *x_inout = x; *unit_out = *u;
}
+
/*----- The null timer ----------------------------------------------------*/
/* This is a timer which does nothing, in case we don't have any better
static int null_init(struct timer *t) { return (0); }
static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
{ return (0); }
+static int null_preflight(struct timer *t) { return (0); }
static void null_diff(struct timer *t, struct bench_timing *delta_inout,
const struct bench_time *t0,
const struct bench_time *t1)
static void null_teardown(struct timer *t) { ; }
static const struct timer_ops null_ops =
- { "null", 0, null_init, null_now, null_diff, null_teardown };
+ { "null", 0,
+ null_init, null_preflight, null_now, null_diff, null_teardown };
#define NULL_ENT &null_ops,
/*----- The broken clock --------------------------------------------------*/
static int broken_init(struct timer *t) { return (-1); }
static const struct timer_ops broken_ops =
- { "broken", TF_SECRET, broken_init, null_now, null_diff, null_teardown };
+ { "broken", TF_SECRET,
+ broken_init, null_preflight, null_now, null_diff, null_teardown };
#define BROKEN_ENT &broken_ops,
/*----- Linux performance counters ----------------------------------------*/
attr.exclude_kernel = 1;
attr.exclude_hv = 1;
- fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
+ fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
if (fd < 0) {
debug("couldn't open perf event: %s", strerror(errno));
return (-1);
/* Open a hardware-cycles perf event and stash its descriptor in the timer
 * state.  Returns zero on success, %$-1$% on failure.
 */
static int perfevent_init(struct timer *t)
{
  int fd, rc;

  fd = perfevent_open();
  if (fd < 0)
    rc = -1;
  else {
    t->u_cy.fd = fd;
    rc = 0;
  }
  return (rc);
}
static const struct timer_ops perfevent_ops =
- { "linux-perf-read-hw-cycles", 0,
- perfevent_init, perfevent_now, diff_cycles, perfevent_teardown };
+ { "linux-perf-read-hw-cycles", BTF_CYOK,
+ perfevent_init, null_preflight, perfevent_now,
+ diff_cycles, perfevent_teardown };
#define PERFEVENT_VANILLA_CYENT &perfevent_ops,
# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
const struct bench_time *t0,
const struct bench_time *t1)
{
+ unsigned long long delta_ns;
unsigned f = t0->f&t1->f;
if (f&BTF_TIMEOK) {
- delta_inout->t = (t1->t.rawns.i - t0->t.rawns.i)/(double)NS_PER_S;
+ delta_ns = t1->t.rawns.i - t0->t.rawns.i; if (!delta_ns) delta_ns = 1;
+ delta_inout->t = delta_ns/(double)NS_PER_S;
delta_inout->f |= BTF_TIMEOK;
}
if (f&BTF_CYOK) {
delta_inout->cy = t1->cy.i - t0->cy.i;
+ if (!delta_inout->cy) delta_inout->cy = 1;
delta_inout->f |= BTF_CYOK;
}
}
+static void perfevrdpmc_unmap
+ (const volatile struct perf_event_mmap_page *map, size_t mapsz)
+ { if (map) munmap(UNQUALIFY(struct perf_event_mmap_page, map), mapsz); }
+
static void perfevrdpmc_teardown(struct timer *t)
- { munmap((/*unconst unvolatile*/ void *)t->u_cy.pmc.map, t->u_cy.pmc.sz); }
+ { perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz); }
-static int perfevrdpmc_cyinit(struct timer *t)
+static int perfevrdpmc_setup(struct timer *t)
{
const volatile struct perf_event_mmap_page *map = 0;
- unsigned a, b, c, d, q0, q1, f;
- int pgsz, mapsz, fd = -1, rc;
-
- /* We need `rdtsc' to do the passage-of-time measurement. */
- if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
- { debug("no `rdtsc' instrunction"); return (-1); }
+ int pgsz, mapsz = 0, fd = -1, rc;
/* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
* big a page is.
return (-1);
}
- /* Check that it's revealed the necessary information. */
- q0 = map->lock;
- __atomic_thread_fence(__ATOMIC_ACQ_REL);
- for (;;) {
- f = 0;
- if (map->cap_user_time) f |= BTF_TIMEOK;
- if (map->cap_user_rdpmc) f |= BTF_CYOK;
- __atomic_thread_fence(__ATOMIC_ACQ_REL);
- q1 = map->lock;
- if (q0 == q1) break;
- q0 = q1;
- }
- if (!(f&BTF_TIMEOK))
- { debug("kernel refused user time measurement"); rc = -1; goto end; }
- if (!(f&BTF_TIMEOK))
- { debug("kernel refused user cycle measurement"); rc = -1; goto end; }
-
- /* All done. We can close the descriptor here: the mapping will keep the
- * performance-measurement machinery alive.
- */
- t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0; rc = 0;
+ t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0;
+ t->u_cy.pmc.owner = syscall(SYS_gettid); rc = 0;
end:
if (fd != -1) close(fd);
- if (map) munmap((/*unconst unvolatile*/ void *)map, mapsz);
+ perfevrdpmc_unmap(map, mapsz);
return (rc);
}
+static int perfevrdpmc_preflight(struct timer *t)
+{
+ if (!t->u_cy.pmc.map) { debug("retry perf event map setup"); goto reopen; }
+ if (t->u_cy.pmc.owner != syscall(SYS_gettid)) {
+ debug("pid changed: reopen perf event map");
+ perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz);
+ t->u_cy.pmc.map = 0; goto reopen;
+ }
+ return (0);
+
+reopen:
+ if (perfevrdpmc_setup(t)) return (-1);
+ return (0);
+}
+
+static int perfevrdpmc_cyinit(struct timer *t)
+{
+ unsigned a, b, c, d;
+
+# ifdef HAVE_VALGRIND_VALGRIND_H
+ /* Valgrind doesn't like `rdpmc' instructions, so just bail. */
+ if (RUNNING_ON_VALGRIND) return (-1);
+# endif
+
+ /* We need `rdtsc' to do the passage-of-time measurement. */
+ if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
+ { debug("no `rdtsc' instrunction"); return (-1); }
+
+ /* Set things up. */
+ if (perfevrdpmc_setup(t)) return (-1);
+ return (0);
+}
+
static const struct timer_ops perfevrdpmc_cyops =
- { "linux-x86-perf-rdpmc-hw-cycles", 0,
- perfevrdpmc_cyinit, perfevrdpmc_now,
+ { "linux-x86-perf-rdpmc-hw-cycles", BTF_TIMEOK | BTF_CYOK,
+ perfevrdpmc_cyinit, perfevrdpmc_preflight, perfevrdpmc_now,
perfevrdpmc_diff, perfevrdpmc_teardown };
static int perfevrdpmc_clkinit(struct timer *t)
{
- if (t->ops[CLK] != &perfevrdpmc_cyops) {
- debug("linux-x86-perf-rdpmc-hw-cycles not set as cycle subtimer");
+ if (t->ops[CY] != &perfevrdpmc_cyops) {
+ debug("`linux-x86-perf-rdpmc-hw-cycles' not set as cycle subtimer");
return(-1);
}
return (0);
static const struct timer_ops perfevrdpmc_clkops =
{ "linux-x86-perf-rdpmc-hw-cycles", 0,
- perfevrdpmc_clkinit, null_now,
+ perfevrdpmc_clkinit, null_preflight, null_now,
null_diff, null_teardown };
# define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
}
static const struct timer_ops x86rdtsc_ops =
- { "x86-rdtsc", 0,
- x86rdtsc_init, x86rdtsc_now, diff_cycles, null_teardown };
+ { "x86-rdtsc", BTF_CYOK,
+ x86rdtsc_init, null_preflight, x86rdtsc_now,
+ diff_cycles, null_teardown };
static const struct timer_ops x86rdtscp_ops =
- { "x86-rdtscp", 0,
- x86rdtscp_init, x86rdtscp_now, diff_cycles, null_teardown };
+ { "x86-rdtscp", BTF_CYOK,
+ x86rdtscp_init, null_preflight,
+ x86rdtscp_now, diff_cycles, null_teardown };
# define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
#else
t_out->f |= BTF_TIMEOK; return (0);
}
-static int gettime_init(struct timer *t)
-{
- struct bench_time tm;
-
- tm.f = 0; gettime_now(t, &tm, 0); if (!tm.f&BTF_TIMEOK) return (-1);
- return (0);
-}
-
static const struct timer_ops gettime_ops =
- { "posix-thread-cputime", 0,
- gettime_init, gettime_now, diff_ts, null_teardown };
+ { "posix-thread-cputime", BTF_TIMEOK,
+ null_init, null_preflight, gettime_now, diff_ts, null_teardown };
# define GETTIME_CLKENT &gettime_ops,
#else
const struct bench_time *t0,
const struct bench_time *t1)
{
+ clock_t delta_clk;
unsigned f = t0->f&t1->f;
if (f&BTF_TIMEOK) {
- delta_inout->t = (t1->t.clk - t0->t.clk)/(double)CLOCKS_PER_SEC;
+ delta_clk = t1->t.clk - t0->t.clk; if (!delta_clk) delta_clk = 1;
+ delta_inout->t = delta_clk/(double)CLOCKS_PER_SEC;
delta_inout->f |= BTF_TIMEOK;
}
}
-static int clock_init(struct timer *t)
-{
- struct bench_time tm;
-
- tm.f = 0; clock_now(t, &tm, 0); if (!tm.f&BTF_TIMEOK) return (-1);
- return (0);
-}
-
static const struct timer_ops clock_ops =
- { "stdc-clock", 0, clock_init, clock_now, clock_diff, null_teardown };
+ { "stdc-clock", BTF_TIMEOK, null_init, null_preflight, clock_now,
+ clock_diff, null_teardown };
#define CLOCK_CLKENT &clock_ops,
const char *env;
const struct timer_ops *const *opstab;
} timertab[] = {
- { "clock", "MLIB_BENCH_CLKTIMER", clktab },
- { "cycle", "MLIB_BENCH_CYCLETIMER", cytab }
+ { "clock", "MLIB_BENCH_CLKTIMER", clktab },
+ { "cycle", "MLIB_BENCH_CYCLETIMER", cytab }
};
/* --- @find_timer@ --- *
static int try_timer(struct timer *t,
const struct timer_ops *ops, unsigned tm)
{
- if (ops->init(t)) return (-1);
+ struct bench_time t0, t1;
+ struct bench_timing delta;
+ int rc;
+ unsigned f = 0;
+#define f_teardown 1u
+
+ if (ops->init(t)) { rc = -1; goto end; }
+ f |= f_teardown;
+
+ if (ops->preflight(t)) { rc = -1; goto end; }
+ t0.f = t1.f = 0;
+ do {
+ while (ops->now(t, &t0, BTF_T0));
+ } while (ops->now(t, &t1, BTF_T1));
+ delta.f = 0; ops->diff(t, &delta, &t0, &t1);
+ if ((ops->f ^ delta.f)&BTF_ANY) { rc = -1; goto end; }
+
debug("selected %s timer `%s'", timertab[tm].what, ops->name);
- t->ops[tm] = ops; return (0);
+ t->ops[tm] = ops; f &= ~f_teardown; rc = 0;
+
+end:
+ if (f&f_teardown) ops->teardown(t);
+ return (rc);
+
+#undef f_teardown
}
/* --- @select_timer@ --- *
}
}
+static int timer_preflight(struct bench_timer *tm)
+{
+ struct timer *t = (struct timer *)tm;
+ unsigned i;
+
+ for (i = 0; i < NTIMER; i++) if (t->ops[i]->preflight(t)) return (-1);
+ return (0);
+}
+
static int timer_now(struct bench_timer *tm,
struct bench_time *t_out, unsigned f)
{
if (!t) return;
for (i = 0; i < NTIMER; i++)
if (t->ops[i]) t->ops[i]->teardown(t);
- xfree(t);
+ x_free(t->a, t);
}
static const struct bench_timerops timer_ops =
- { timer_describe, timer_now, timer_diff, timer_destroy };
+ { timer_describe, timer_preflight, timer_now, timer_diff, timer_destroy };
/* --- @bench_createtimer@ --- *
*
for (i = 0; i < NTIMER; i++) {
printf("%s timers:", timertab[i].what);
for (tt = timertab[i].opstab; *tt; tt++)
- if (!((*tt)->f)&TF_SECRET) printf(" %s", (*tt)->name);
+ if (!((*tt)->f&TF_SECRET)) printf(" %s", (*tt)->name);
putchar('\n');
}
goto next_config;
}
/* All seems well. Allocate the timer object. */
- t = xmalloc(sizeof(*t));
+ XNEW(t); t->a = arena_global;
for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
/* Try to set up the subtimers. */
void bench_destroy(struct bench_state *b)
{ if (b->tm) { b->tm->ops->destroy(b->tm); b->tm = 0; } }
-/* --- @do_nothing@ --- *
+/* --- @spin@ --- *
*
* Arguments: @unsigned long n@ = iteration count
* @void *ctx@ = context pointer (ignored)
* the benchmarking state.
*/
-static void do_nothing(unsigned long n, void *ctx)
+static void spin(unsigned long n, void *ctx)
{ while (n--) RELAX; }
-/* --- @measure@ --- *
- *
- * Arguments: @struct bench_state *b@ = bench state
- * @struct bench_timing *delta_out@ = where to leave the timing
- * @bench_fn *fn@ = function to measure
- * @void *ctx@ = context for the function
- * @double n@ = number of iterations
- *
- * Returns: ---
- *
- * Use: Run the function @n@ times, and report how long it took.
- *
- * This function deals with retrying the measurements if the
- * timer reports a temporary failure, and all of the
- * difficulties if @n@ is too large to fit in a machine integer.
- */
-
-static void measure(struct bench_state *b, struct bench_timing *delta_out,
- bench_fn *fn, void *ctx, double n)
-{
- struct bench_timer *tm = b->tm;
- struct bench_time t0, t1;
- unsigned long n0, n1;
- double R = ULONG_MAX;
-
- if (n <= R) {
- n0 = n;
- do {
- while (tm->ops->now(tm, &t0, BTF_T0));
- fn(n0, ctx);
- } while (tm->ops->now(tm, &t1, BTF_T1));
- } else {
- n1 = n/R; n0 = n - n1*R;
- do {
- while (tm->ops->now(tm, &t0, BTF_T0));
- while (n1--) fn(ULONG_MAX, ctx);
- fn(n0, ctx);
- } while (tm->ops->now(tm, &t1, BTF_T1));
- }
- tm->ops->diff(tm, delta_out, &t0, &t1);
-}
-
/* --- @bench_calibrate@ --- *
*
* Arguments: @struct bench_state *b@ = bench state
+ * @unsigned f@ = calibration flags
*
* Returns: Zero on success, %$-1$% if calibration failed.
*
* Use: Calibrate the benchmark state, so that it can be used to
* measure performance reasonably accurately.
+ *
+ * Calibration will take into account how the subject code is
+ * going to be located. If you're going to use @BENCH_MEASURE@
+ * to measure a piece of literal code, then leave @f@ zero. If
+ * the code to be measured is going to be executed via an
+ * indirect branch, e.g., through the @measure@ function, then
+ * set @BTF_INDIRECT@.
*/
#define T_CLB 0.0625 /* calibration time limit */
-int bench_calibrate(struct bench_state *b)
+int bench_calibrate(struct bench_state *b, unsigned f)
{
struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
+ struct bench_timer *tm = b->tm;
struct bench_timing delta;
double n, r;
- bench_fn *fn = LAUNDER(&do_nothing);
- unsigned i, f = BTF_ANY;
+ unsigned i, tf = BTF_ANY;
+ BENCH_TIMELOOP_DECLS;
int rc;
/* The model here is that a timing loop has a fixed overhead as we enter
/* If we've already calibrated then there's nothing to do. */
if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
- /* Exercise the inner loop a few times to educate the branch predictor. */
- for (i = 0; i < 50; i++) measure(b, &delta, fn, 0, 10000);
+ /* Run the timer preflight check. */
+ if (tm->ops->preflight(tm)) { rc = -1; goto end; }
+
+ /* Exercise the inner loop a few times to educate the branch predictor.
+ * This is only useful if we're executing via an indirect call.
+ */
+ if (f&BTF_INDIRECT) {
+ for (i = 0; i < 50; i++)
+ BENCH_TIMELOOP_TAG(setup, b, &delta, 10000)
+ LAUNDER(&spin)(_bench_n, 0);
+ }
/* Now we measure idle loops until they take sufficiently long -- or we run
* out of counter.
for (;;) {
/* Measure @n@ iterations of the idle loop. */
- measure(b, &delta, fn, 0, n); f &= delta.f;
- if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }
+ if (f&BTF_INDIRECT)
+ BENCH_TIMELOOP_TAG(calibrate, b, &delta, n)
+ LAUNDER(&spin)(_bench_n, 0);
+ else
+ BENCH_TIMELOOP_TAG(calibrate, b, &delta, n)
+ while (_bench_n--) RELAX;
+ tf &= delta.f; if (!(tf&BTF_TIMEOK)) { rc = -1; goto end; }
/* Register the timings with the regression machinery. */
linreg_update(&lr_clk, n, delta.t);
- if (!(f&BTF_CYOK))
+ if (!(tf&BTF_CYOK))
debug(" n = %10.0f; t = %12g s", n, delta.t);
else {
linreg_update(&lr_cy, n, delta.cy);
*/
linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
- if (f&BTF_CYOK) {
+ if (tf&BTF_CYOK) {
linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
}
/* We're done. */
rc = 0;
end:
- b->f |= f | BTF_CLB; /* no point trying again */
+ b->f |= tf | BTF_CLB; /* no point trying again */
return (rc);
}
-/* --- @bench_measure@ --- *
+/* --- @bench_preflight@ --- *
*
* Arguments: @struct bench_state *b@ = benchmark state
- * @struct bench_timing *t_out@ = where to leave the timing
- * @double base@ = number of internal units per call
- * @bench_fn *fn@, @void *ctx@ = benchmark function to run
*
- * Returns: Zero on success, %$-1$% if timing failed.
+ * Returns: Zero on success, %$-1$% on failure.
*
- * Use: Measure a function. The function @fn@ is called adaptively
- * with an iteration count @n@ set so as to run for
- * approximately @b->target_s@ seconds.
+ * Use: Prepares for benchmarking on the current thread. Current
+ * checks are that the timer is calibrated and that it can
+ * successfully measure time; the timer preflight is also run.
*
- * The result is left in @*t_out@, with @t_out->n@ counting the
- * final product of the iteration count and @base@ (which might,
- * e.g., reflect the number of inner iterations the function
- * performs, or the number of bytes it processes per iteration).
+ * Users are unlikely to find this function useful: it's called
+ * automatically by the @BENCH_MEASURE@ macro and the
+ * @bench_measure@ function.
*/
-int bench_measure(struct bench_state *b, struct bench_timing *t_out,
- double base, bench_fn *fn, void *ctx)
+int bench_preflight(struct bench_state *b)
{
- double n, nn;
+ struct bench_timer *tm = b->tm;
- /* Make sure the state is calibrated and usable. */
- if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1);
+ if (!(b->f&BTF_CLB)) return (-1);
if (!(b->f&BTF_TIMEOK)) return (-1);
+ if (tm->ops->preflight(tm)) return (-1);
+ debug("measuring...");
+ return (0);
+}
+
+/* --- @bench_adapt@ --- *
+ *
+ * Arguments: @struct bench_state *b@ = benchmark state
+ * @double *n_inout@ = number of iterations, updated
+ * @const struct bench_timing *t@ = timing from the previous run
+ *
+ * Returns: Nonzero if the measurement is sufficient; zero to run again.
+ *
+ * Use: This function determines a suitable number of iterations of a
+ * benchmark function to perform next. It is used in a loop
+ * such as the following.
+ *
+ * @double n = 1.0;@
+ * @struct bench_timing t;@
+ *
+ * @do {@
+ * (run @n@ iterations; set @t@ to the timing)
+ * @} while (!bench_adapt(b, &n, &t));@
+ *
+ * On entry, @*n_inout@ should be the number of iterations
+ * performed by the previous pass, and @*t@ the resulting time;
+ * the @BTF_TIMEOK@ flag must be set @t->f@. If the timing is
+ * sufficient -- @t->t@ is sufficiently close to @b->target_s@
+ * -- then the function returns nonzero to indicate that
+ * measurement is complete. Otherwise, it sets @*n_inout@ to a
+ * new, larger iteration count and returns zero to indicate that
+ * a further pass is necessary.
+ */
- /* Main adaptive measurement loop.
- *
- * Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal
+int bench_adapt(struct bench_state *b,
+ double *n_inout, const struct bench_timing *t)
+{
+ double n = *n_inout, nn;
+
+ /* Dump the results for debugging. */
+ if (!(t->f&BTF_CYOK)) debug(" n = %10.0f; t = %12g", n, t->t);
+ else debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t->t, t->cy);
+
+ /* Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal
* time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy.
* Otherwise, we need to scale up the iteration count. The obvious next
* choice is %$n' = n T/t$%. Alas, rounding is a problem: if
* hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
* again with %$n' = n + 1$% iterations will very likely work.
*/
- debug("measuring..."); n = 1.0;
- for (;;) {
- measure(b, t_out, fn, ctx, n); t_out->f &= b->f;
- if (!(t_out->f&BTF_TIMEOK)) return (-1);
- if (!(t_out->f&BTF_CYOK))
- debug(" n = %10.0f; t = %12g", n, t_out->t);
- else
- debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
+ if (t->t >= 0.707*b->target_s) return (1);
+ nn = n*b->target_s/t->t; modf(nn, &nn);
+ *n_inout = nn > n ? nn : n + 1;
+ return (0);
+}
- if (t_out->t >= 0.707*b->target_s) break;
- nn = n*b->target_s/t_out->t;
- if (n > ULONG_MAX || nn > (unsigned long)n + 1) n = nn;
- else n++;
- }
+/* --- @bench_adjust@ --- *
+ *
+ * Arguments: @struct bench_state *b@ = benchmark state
+ * @struct bench_timing *t_inout@ = timing to adjust
+ * @double n@ = number of external iterations performed
+ * @double base@ = number of internal operations per external
+ * iteration
+ *
+ * Returns: ---
+ *
+ * Use: Adjusts a raw timing, as captured by @BENCH_TIMELOOP@,
+ * according to the calibration data captured in @b@.
+ * On exit, the timing data is updated, and @t->n@ is set to the
+ * product @n*base@.
+ */
+
+void bench_adjust(struct bench_state *b,
+ struct bench_timing *t_inout, double n, double base)
+{
/* Adjust according to the calibration. */
- t_out->t -= n*b->clk.m + b->clk.c;
- if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c;
+ t_inout->t -= n*b->clk.m + b->clk.c;
+ if (t_inout->f&BTF_CYOK) t_inout->cy -= n*b->cy.m + b->cy.c;
/* Report the results, if debugging. */
- if (!(t_out->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_out->t);
- else debug(" adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy);
- if (!(t_out->f&BTF_CYOK))
- debug(" %g s per op; %g ops/s", t_out->t/n, n/t_out->t);
+ if (!(t_inout->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_inout->t);
+ else debug(" adjusted t' = %12g, cy' = %10.0f", t_inout->t, t_inout->cy);
+ if (!(t_inout->f&BTF_CYOK))
+ debug(" %g s per iter; %g iters/s", t_inout->t/n, n/t_inout->t);
else
- debug(" %g s (%g cy) per op; %g ops/s",
- t_out->t/n, t_out->cy/n, n/t_out->t);
+ debug(" %g s (%g cy) per iter; %g iters/s",
+ t_inout->t/n, t_inout->cy/n, n/t_inout->t);
/* All done. */
- t_out->n = n*base; return (0);
+ t_inout->n = n*base;
+}
+
+/* --- @bench_measure@ --- *
+ *
+ * Arguments: @struct bench_state *b@ = benchmark state
+ * @struct bench_timing *t_out@ = where to leave the timing
+ * @double base@ = number of internal units per call
+ * @bench_fn *fn@, @void *ctx@ = benchmark function to run
+ *
+ * Returns: Zero on success, %$-1$% if timing failed.
+ *
+ * Use: Measure a function. The function @fn@ is called adaptively
+ * with an iteration count @n@ set so as to run for
+ * approximately @b->target_s@ seconds.
+ *
+ * The result is left in @*t_out@, with @t_out->n@ counting the
+ * final product of the iteration count and @base@ (which might,
+ * e.g., reflect the number of inner iterations the function
+ * performs, or the number of bytes it processes per iteration).
+ *
+ * To get useful results, the benchmark state should have been
+ * calibrated for indirect calling -- i.e., with @BTF_INDIRECT@.
+ */
+
+int bench_measure(struct bench_state *b, struct bench_timing *t_out,
+ double base, bench_fn *fn, void *ctx)
+{
+ BENCH_MEASURE_DECLS;
+ int rc;
+
+ BENCH_MEASURE_TAG(bench_measure, b, rc, t_out, base)
+ fn(_bench_n, ctx);
+ return (rc);
+}
+
+/*----- Reporting ---------------------------------------------------------*/
+
+/* --- @bench_report@ --- *
+ *
+ * Arguments: @const struct gprintf_ops *gops, void *gp@ = output formatter
+ * @unsigned unit@ = unit processed by the benchmark function
+ * @const struct bench_timing *t@ = benchmark result
+ *
+ * Returns: ---
+ *
+ * Use: Format, to the output identified by @gops@ and @go@, a
+ * human-readable report of the benchmarking result @t@. No
+ * newline is appended.
+ *
+ * The output format is subject to change in later versions.
+ */
+
+void bench_report(const struct gprintf_ops *gops, void *go,
+ unsigned unit, const struct bench_timing *t)
+{
+ double scale, x, n = t->n;
+ const char *u, *what, *whats;
+
+ assert(t->f&BTF_TIMEOK);
+
+ switch (unit) {
+ case BTU_OP:
+ gprintf(gops, go, "%.0f iterations ", n);
+ what = "op"; whats = "ops"; scale = 1000;
+ break;
+ case BTU_BYTE:
+ x = n; normalize(&x, &u, 1024); gprintf(gops, go, "%.3f %sB ", x, u);
+ what = whats = "B"; scale = 1024;
+ break;
+ default:
+ assert(0);
+ }
+
+ x = t->t; normalize(&x, &u, 1000);
+ gprintf(gops, go, "in %.3f %ss", x, u);
+ if (t->f&BTF_CYOK) {
+ x = t->cy; normalize(&x, &u, 1000);
+ gprintf(gops, go, " (%.3f %scy)", x, u);
+ }
+ gprintf(gops, go, ": ");
+
+ x = n/t->t; normalize(&x, &u, scale);
+ gprintf(gops, go, "%.3f %s%s/s", x, u, whats);
+ x = t->t/n; normalize(&x, &u, 1000);
+ gprintf(gops, go, ", %.3f %ss/%s", x, u, what);
+ if (t->f&BTF_CYOK) {
+ x = t->cy/n; normalize(&x, &u, 1000);
+ gprintf(gops, go, " (%.3f %scy/%s)", x, u, what);
+ }
}
/*----- That's all, folks -------------------------------------------------*/