@@@ man wip

[mLib] / test / bench.c
diff --git a/test/bench.c b/test/bench.c

index d1a00ea..7fedc9d 100644 (file)
--- a/test/bench.c
+++ b/test/bench.c
@@ -29,6 +29,7 @@
  
  #include "config.h"
  
+#include <ctype.h>
  #include <errno.h>
  #include <stdarg.h>
  #include <stdio.h>
@@ -39,27 +40,44 @@
  #include "alloc.h"
  #include "bench.h"
  #include "bits.h"
+#include "dstr.h"
  #include "linreg.h"
  #include "macros.h"
  
  /*----- Data structures ---------------------------------------------------*/
  
+enum { CLK, CY, NTIMER };
+
  struct timer {
    struct bench_timer _t;
-  const struct timer_ops *clkops, *cyops;
-  union { int fd; } u_cy;
+  const struct timer_ops *ops[NTIMER]; /* subtimers for clock and cycles */
+  union { int fd; } u_cy;              /* state for cycle measurement */
  };
  
  struct timer_ops {
-  void (*now)(struct bench_time *t_out, struct timer *t);
-  void (*teardown)(struct timer *t);
+  const char *name;                    /* timer name */
+  unsigned f;                          /* flags */
+#define TF_SECRET 1u                   /*   don't try this automatically */
+  int (*init)(struct timer */*t*/);    /* initialization function */
+  void (*now)(struct bench_time *t_out, struct timer *t); /* read current */
+  void (*teardown)(struct timer *t);   /* release held resources */
  };
  
  /*----- Preliminaries -----------------------------------------------------*/
  
  #define NS_PER_S 1000000000
  
-static void PRINTF_LIKE(1, 2) debug(const char *fmt, ...)
+/* --- @debug@ --- *
+ *
+ * Arguments:  @const char *fmt@ = format control string
+ *             @...@ = format arguemnts
+ *
+ * Returns:    ---
+ *
+ * Use:                Maybe report a debugging message to standard error.
+ */
+
+static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
  {
    const char *p;
    va_list ap;
@@ -74,19 +92,84 @@ static void PRINTF_LIKE(1, 2) debug(const char *fmt, ...)
    }
  }
  
-/*----- The null clock ----------------------------------------------------*/
+/* --- @timer_diff@ --- *
+ *
+ * Arguments:  @struct bench_timing *delta_out@ = where to putt the result
+ *             @const struct bench_time *t0, *t1@ = two times captured by a
+ *                     timer's @now@ function
+ *
+ * Returns:    ---
+ *
+ * Use:                Calculates the difference between two captured times.  The
+ *             flags are set according to whether the differences are
+ *             meaningful; @delta_out->n@ is left unset.
+ */
+
+static void timer_diff(struct bench_timing *delta_out,
+                      const struct bench_time *t0,
+                      const struct bench_time *t1)
+{
+  unsigned f = t0->f&t1->f;
+  kludge64 k;
+
+#ifdef HAVE_UINT64
+#  define FLOATK64(k) ((double)(k).i)
+#else
+#  define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi)
+#endif
  
+  if (!(f&BTF_TIMEOK))
+    delta_out->t = 0.0;
+  else {
+    SUB64(k, t1->s, t0->s);
+    delta_out->t = FLOATK64(k) - 1 +
+      (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S;
+  }
+
+  if (!(f&BTF_CYOK))
+    delta_out->cy = 0.0;
+  else {
+    SUB64(k, t1->cy, t0->cy);
+    delta_out->cy = FLOATK64(k);
+  }
+
+  delta_out->f = f;
+
+#undef FLOATK64
+}
+
+/*----- The null timer ----------------------------------------------------*/
+
+/* This is a timer which does nothing, in case we don't have any better
+ * ideas.
+ */
+
+static int null_init(struct timer *t) { return (0); }
  static void null_now(struct bench_time *t_out, struct timer *t) { ; }
  static void null_teardown(struct timer *t) { ; }
-static const struct timer_ops null_ops = { null_now, null_teardown };
  
-static int null_cyinit(struct timer *t)
-  { t->cyops = &null_ops; return (0); }
+static const struct timer_ops null_ops =
+  { "null", 0, null_init, null_now, null_teardown };
+#define NULL_ENT &null_ops,
  
-#define NULL_CYENT { "null", null_cyinit },
+/*----- The broken clock --------------------------------------------------*/
+
+/* This is a cycle counter which does nothing, in case we don't have any
+ * better ideas.
+ */
+
+static int broken_init(struct timer *t) { return (-1); }
+
+static const struct timer_ops broken_ops =
+  { "broken", TF_SECRET, broken_init, null_now, null_teardown };
+#define BROKEN_ENT &broken_ops,
  
  /*----- Linux performance counters ----------------------------------------*/
  
+/* This is a cycle counter which uses the Linux performance event system,
+ * which is probably the best choice if it's available.
+ */
+
  #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
  
  #include <sys/types.h>
@@ -110,9 +193,6 @@ static void perfevent_now(struct bench_time *t_out, struct timer *t)
  static void perfevent_teardown(struct timer *t)
    { close(t->u_cy.fd); }
  
-static const struct timer_ops perfevent_ops =
-  { perfevent_now, perfevent_teardown };
-
  static int perfevent_init(struct timer *t)
  {
    struct perf_event_attr attr = { 0 };
@@ -134,96 +214,57 @@ static int perfevent_init(struct timer *t)
    tm.f = 0; perfevent_now(&tm, t);
    if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); }
  
-  t->cyops = &perfevent_ops; return (0);
+  return (0);
  }
-#  define PERFEVENT_CYENT { "linux-perf-event", perfevent_init },
+
+static const struct timer_ops perfevent_ops =
+  { "linux-perf-hw-cycles", 0,
+    perfevent_init, perfevent_now, perfevent_teardown };
+
+#  define PERFEVENT_CYENT &perfevent_ops,
  #else
  #  define PERFEVENT_CYENT
  #endif
  
  /*----- Intel time-stamp counter ------------------------------------------*/
  
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-
-#define EFLAGS_ID (1u << 21)
-#define CPUID_1D_TSC (1u << 4)
-
-static uint32 set_flags(unsigned long m, unsigned long x)
-{
-  unsigned long r;
-
-#ifdef __x86_64__
-#  define TMP "%%rcx"
-#else
-#  define TMP "%%ecx"
-#endif
+/* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
+ * really suitable for performance measurement because it gets confused by
+ * CPU frequency adjustments.
+ */
  
-  __asm__ ("pushf\n\t"
-          "pop %0\n\t"
-          "mov %0, " TMP "\n\t"
-          "and %1, %0\n\t"
-          "xor %2, %0\n\t"
-          "push %0\n\t"
-          "popf\n\t"
-          "pushf\n\t"
-          "pop %0\n\t"
-          "push " TMP "\n\t"
-          "popf"
-          : "=r"(r)
-          : "g"(m), "g"(x)
-          : "ecx");
-  return (r);
-}
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
  
-struct cpuid { uint32 a, b, c, d; };
+#include <cpuid.h>
  
-static void cpuid(struct cpuid *info_out, uint32 a, uint32 c)
-{
-  __asm__ ("movl %1, %%eax\n\t"
-          "movl %2, %%ecx\n\t"
-          "cpuid\n\t"
-          "movl %%eax, 0(%0)\n\t"
-          "movl %%ebx, 4(%0)\n\t"
-          "movl %%ecx, 8(%0)\n\t"
-          "movl %%edx, 12(%0)\n\t"
-          : /* no outputs */
-          : "r"(info_out), "g"(a), "g"(c)
-          : "eax", "ebx", "ecx", "edx", "cc");
-}
+#define CPUID_1D_TSC (1u << 4)
  
  static void x86rdtsc_now(struct bench_time *t_out, struct timer *t)
-{
-  uint32 lo, hi;
-
-  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-  SET64(t_out->cy, hi, lo); t_out->f |= BTF_CYOK;
-}
-
-static const struct timer_ops x86rdtsc_ops =
-  { x86rdtsc_now, null_teardown };
+  { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; }
  
  static int x86rdtsc_init(struct timer *t)
  {
-  struct cpuid info;
-
-  if ((set_flags(~EFLAGS_ID, 0)&EFLAGS_ID) ||
-      !(set_flags(~EFLAGS_ID, EFLAGS_ID)&EFLAGS_ID))
-    { debug("no `cpuid' instruction"); return (-1); }
-  cpuid(&info, 0, 0);
-  if (info.a < 1) { debug("no `cpuid' leaf 1"); return (-1); }
-  cpuid(&info, 1, 0);
-  if (!(info.d&CPUID_1D_TSC))
+  unsigned a, b, c, d;
+
+  if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
      { debug("no `rdtsc' instrunction"); return (-1); }
-  t->cyops = &x86rdtsc_ops; return (0);
+  return (0);
  }
  
-#  define X86RDTSC_CYENT { "x86-rdtsc", x86rdtsc_init },
+static const struct timer_ops x86rdtsc_ops =
+  { "x86-rdtsc", 0, x86rdtsc_init, x86rdtsc_now, null_teardown };
+
+#  define X86RDTSC_CYENT &x86rdtsc_ops,
  #else
-#  define X86RDTWC_CYENT
+#  define X86RDTSC_CYENT
  #endif
  
  /*----- POSIX `clock_gettime' ---------------------------------------------*/
  
+/* This is a real-time clock based on the POSIX time interface, with up to
+ * nanosecond precision.
+ */
+
  #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
  
  static void gettime_now(struct bench_time *t_out, struct timer *t)
@@ -236,23 +277,28 @@ static void gettime_now(struct bench_time *t_out, struct timer *t)
    t_out->f |= BTF_TIMEOK;
  }
  
-static const struct timer_ops gettime_ops = { gettime_now, null_teardown };
-
  static int gettime_init(struct timer *t)
  {
    struct bench_time tm;
  
    tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
-  t->clkops = &gettime_ops; return (0);
+  return (0);
  }
  
-#  define GETTIME_CLKENT { "posix-clock_gettime", gettime_init },
+static const struct timer_ops gettime_ops =
+  { "posix-thread-cputime", 0, gettime_init, gettime_now, null_teardown };
+
+#  define GETTIME_CLKENT &gettime_ops,
  #else
  #  define GETTIME_CLKENT
  #endif
  
  /*----- Standard C `clock' ------------------------------------------------*/
  
+/* This is a real-time clock based on the C `clock' function which is
+ * guaranteed to be available, though it's not likely to be very good.
+ */
+
  static void clock_now(struct bench_time *t_out, struct timer *t)
  {
    clock_t now, x;
@@ -276,142 +322,334 @@ static void clock_now(struct bench_time *t_out, struct timer *t)
    ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK;
  }
  
-static const struct timer_ops clock_ops = { clock_now, null_teardown };
-
  static int clock_init(struct timer *t)
  {
    struct bench_time tm;
  
    tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
-  t->clkops = &clock_ops; return (0);
+  return (0);
  }
  
-#define CLOCK_CLKENT { "clock", clock_init },
+static const struct timer_ops clock_ops =
+  { "stdc-clock", 0, clock_init, clock_now, null_teardown };
+
+#define CLOCK_CLKENT &clock_ops,
  
  /*----- Timing setup ------------------------------------------------------*/
  
-static const struct timerent {
-  const char *name;
-  int (*init)(struct timer */*t*/);
-}
-  clktab[] = { GETTIME_CLKENT CLOCK_CLKENT { 0, 0 } },
-  cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_CYENT { 0, 0 } };
+/* Tables of timing sources. */
+static const struct timer_ops
+  *const clktab[] = { GETTIME_CLKENT CLOCK_CLKENT BROKEN_ENT 0 },
+  *const cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_ENT BROKEN_ENT 0 };
+
+static const struct timertab {
+  const char *what;
+  const char *env;
+  const struct timer_ops *const *opstab;
+} timertab[] = {
+  { "clock", "MLIB_BENCH_CLKTIMER", clktab },
+  { "cycle", "MLIB_BENCH_CYCLETIMER", cytab }
+};
+
+/* --- @find_timer@ --- *
+ *
+ * Arguments:  @const char *name@ = timer name
+ *             @size_t sz@ = length of name
+ *             @unsigned tm@ = which subtimer we're looking for
+ *
+ * Returns:    The table entry matching the given name, or null if there
+ *             isn't one.
+ */
  
-static const struct timerent *find_timer_n(const char *name, size_t sz,
-                                          const struct timerent *timers,
-                                          const char *what)
+static const struct timer_ops *find_timer(const char *name, size_t sz,
+                                         unsigned tm)
  {
-  while (timers->name) {
-    if (strlen(timers->name) == sz && MEMCMP(name, ==, timers->name, sz))
-      return (timers);
-    timers++;
+  const struct timer_ops *const *tt;
+
+  for (tt = timertab[tm].opstab; *tt; tt++) {
+    if (strlen((*tt)->name) == sz &&
+       MEMCMP(name, ==, (*tt)->name, sz))
+      return (*tt);
    }
-  debug("%s timer `%.*s' not found", what, (int)sz, name); return (0);
+  debug("%s timer `%.*s' not found",
+       timertab[tm].what, (int)sz, name); return (0);
  }
  
+/* --- @try_timer@ --- *
+ *
+ * Arguments:  @struct timer *t@ = timer structure
+ *             @const struct timer_ops *ops@ = timer ops
+ *             @unsigned tm@ = which subtimer we're setting
+ *
+ * Returns:    Zero on success, %$-1$% if timer failed.
+ *
+ * Use:                Tries to initialize the timer @t@, reporting a debug message
+ *             if it worked.
+ */
+
  static int try_timer(struct timer *t,
-                    const struct timerent *timer, const char *what)
+                    const struct timer_ops *ops, unsigned tm)
  {
-  if (timer->init(t)) return (-1);
-  debug("selected %s timer `%s'", what, timer->name); return (0);
+  if (ops->init(t)) return (-1);
+  debug("selected %s timer `%s'", timertab[tm].what, ops->name);
+  t->ops[tm] = ops; return (0);
  }
  
-static int select_timer(struct timer *t, const struct timerent *timers,
-                       const char *varname, const char *what)
+/* --- @select_timer@ --- *
+ *
+ * Arguments:  @struct timer *t@ = timer structure
+ *             @unsigned tm@ = which subtimer we're setting
+ *             @const char *config@, @size_t sz@ = config string
+ *
+ * Returns:    Zero on success, %$-1$% if timer failed.
+ *
+ * Use:                Select a timer from the table.  If the environment variable
+ *             is set, then parse a comma-separated list of timer names and
+ *             use the first one listed that seems to work; otherwise, try
+ *             the timers in the table in order.
+ */
+
+static int select_timer(struct timer *t, unsigned tm,
+                       const char *config, size_t sz)
  {
-  const char *p; size_t n;
-  const struct timerent *timer;
+  const char *p, *l;
+  const struct timer_ops *ops, *const *tt;
  
-  p = getenv(varname);
-  if (!p) {
-    while (timers->name)
-      if (!try_timer(t, timers++, what)) return (0);
+  if (!config) {
+    for (tt = timertab[tm].opstab; *tt; tt++)
+      if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0);
    } else {
+    l = config + sz;
      for (;;) {
-      n = strcspn(p, ",");
-      timer = find_timer_n(p, n, timers, what);
-      if (timer && !try_timer(t, timer, what)) return (0);
-      if (!p[n]) break;
-      p += n + 1;
+      p = memchr(config, ',', l - config); if (!p) p = l;
+      ops = find_timer(config, p - config, tm);
+      if (ops && !try_timer(t, ops, tm)) return (0);
+      if (p >= l) break;
+      config = p + 1;
      }
    }
-  debug("no suitable %s timer found", what); return (-1);
+  debug("no suitable %s timer found", timertab[tm].what); return (-1);
+}
+
+/* Bench timer operations. */
+static void timer_describe(struct bench_timer *tm, dstr *d)
+{
+  struct timer *t = (struct timer *)tm;
+  unsigned i;
+
+  dstr_puts(d, "builtin: ");
+  for (i = 0; i < NTIMER; i++) {
+    if (i) dstr_puts(d, ", ");
+    dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
+  }
  }
  
  static void timer_now(struct bench_timer *tm, struct bench_time *t_out)
  {
    struct timer *t = (struct timer *)tm;
+  unsigned i;
  
-  t->clkops->now(t_out, t);
-  t->cyops->now(t_out, t);
+  for (i = 0; i < NTIMER; i++) t->ops[i]->now(t_out, t);
  }
  
  static void timer_destroy(struct bench_timer *tm)
  {
    struct timer *t = (struct timer *)tm;
+  unsigned i;
  
    if (!t) return;
-  if (t->clkops) t->clkops->teardown(t);
-  if (t->cyops) t->cyops->teardown(t);
+  for (i = 0; i < NTIMER; i++)
+    if (t->ops[i]) t->ops[i]->teardown(t);
    xfree(t);
  }
  
-static const struct bench_timerops timer_ops = { timer_now, timer_destroy };
+static const struct bench_timerops timer_ops =
+  { timer_describe, timer_now, timer_destroy };
+
+/* --- @bench_createtimer@ --- *
+ *
+ * Arguments:  @const char *config@ = timer configuration string
+ *
+ * Returns:    A freshly constructed standard timer object.
+ *
+ * Use:                Allocate a timer.  Dispose of it by calling
+ *             @tm->ops->destroy(tm)@ when you're done.
+ *
+ *             Applications should not set configuration strings except as
+ *             established by user action, e.g., from a command-line option,
+ *             environment variable, or configuration file.
+ */
  
-struct bench_timer *bench_createtimer(void)
+struct bench_timer *bench_createtimer(const char *config)
  {
    struct timer *t = 0;
    struct bench_timer *ret = 0;
+  struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
+  const struct timer_ops *const *tt;
+  const char *p, *l; size_t n, nn;
+  unsigned i;
  
-  t = xmalloc(sizeof(*t)); t->cyops = 0; t->clkops = 0;
-  if (select_timer(t, clktab, "MLIB_BENCH_CLKTIMER", "clock")) goto end;
-  if (select_timer(t, cytab, "MLIB_BENCH_CYCLETIMER", "cycle")) goto end;
+  /* Parse the configuration string. */
+  if (config) {
+
+    /* The first thing to do is find the end of the string. */
+    l = config + strlen(config);
+
+    for (;;) {
+      /* Process the whitespace-sparated words of the string one by one. */
+
+      /* Skip over any initial whitespace.  If we hit the end of the string
+       * then we're done.
+       */
+      for (;;)
+       if (config >= l) goto done_config;
+       else if (!ISSPACE(*config)) break;
+       else config++;
+
+      /* There's definitely a word here.  Find the end of it. */
+      for (p = config; p < l && !ISSPACE(*p); p++);
+      nn = p - config;
+
+      /* Try various simple keywords. */
+#define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))
+
+      if (MATCHP("list")) {
+       /* The `list' keyword requests lists of the available timer
+        * implementations.
+        */
+
+       for (i = 0; i < NTIMER; i++) {
+         printf("%s timers:", timertab[i].what);
+         for (tt = timertab[i].opstab; *tt; tt++)
+           if (!((*tt)->f)&TF_SECRET) printf(" %s", (*tt)->name);
+         putchar('\n');
+       }
+       goto next_config;
+      }
+
+#undef MATCHP
+
+      /* Otherwise it's an assignment, setting a subtimer list. */
+      p = memchr(config, '=', nn);
+      if (!p)
+       n = nn;
+      else {
+       n = p - config;
+       for (i = 0; i < NTIMER; i++)
+         if (STRNCMP(config, ==, timertab[i].what, n) &&
+             !timertab[i].what[n]) {
+           if (tmconf[i].p)
+             debug("duplicate %s timer list", timertab[i].what);
+           tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
+           goto next_config;
+         }
+      }
+      debug("unrecognized config keyword `%.*s'", (int)n, config);
+
+      /* Move on to the next word. */
+    next_config:
+      config += nn;
+    }
+  done_config:;
+  }
+
+  /* Override these settings from the environment. */
+  for (i = 0; i < NTIMER; i++) {
+    p = getenv(timertab[i].env);
+    if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
+  }
+
+  /* All seems well.  Allocate the timer object. */
+  t = xmalloc(sizeof(*t));
+  for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
+
+  /* Try to set up the subtimers. */
+  for (i = 0; i < NTIMER; i++)
+    if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;
+
+  /* All is done. */
    t->_t.ops = &timer_ops; ret = &t->_t; t = 0;
  end:
    if (t) timer_destroy(&t->_t);
    return (ret);
  }
  
-#ifdef HAVE_UINT64
-#  define FLOATK64(k) ((double)(k).i)
-#else
-#  define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi)
-#endif
+/*----- Benchmarking ------------------------------------------------------*/
  
-static void timer_diff(struct bench_timing *delta_out,
-                      const struct bench_time *t0,
-                      const struct bench_time *t1)
+/* --- @bench_init@ --- *
+ *
+ * Arguments:  @struct bench_state *b@ = bench state to initialize
+ *             @struct bench_timer *tm@ = timer to attach, or null
+ *
+ * Returns:    Zero on success, %$-1$% on failure.
+ *
+ * Use:                Initialize the benchmark state.  On success, the timer state
+ *             still needs to be calibrated (use @bench_calibrate@) before
+ *             it can be used, but this will be done automatically by
+ *             @bench_measure@ if it's not done by hand earlier.  The timer
+ *             is now owned by the benchmark state and will be destroyed by
+ *             @bench_destroy@.
+ *
+ *             The only reason for failure is if @tm@ was null on entry,
+ *             and automatic construction of a timer failed.  The state is
+ *             safe to discard, but calling @bench_destroy@ is safe too.
+ */
+
+int bench_init(struct bench_state *b, struct bench_timer *tm)
  {
-  delta_out->f = t0->f&t1->f;
-  kludge64 k;
+  int rc;
  
-  if (!(delta_out->f&BTF_TIMEOK))
-    delta_out->t = 0.0;
-  else {
-    SUB64(k, t1->s, t0->s);
-    delta_out->t = FLOATK64(k) - 1 +
-      (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S;
-  }
+  b->tm = 0;
  
-  if (!(delta_out->f&BTF_CYOK))
-    delta_out->cy = 0.0;
-  else {
-    SUB64(k, t1->cy, t0->cy);
-    delta_out->cy = FLOATK64(k);
+  if (!tm) {
+    tm = bench_createtimer(0);
+    if (!tm) { rc = -1; goto end; }
    }
-}
  
-/*----- Calibration -------------------------------------------------------*/
+  b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0;
+end:
+  return (rc);
+}
  
-void bench_init(struct bench_state *b, struct bench_timer *tm)
-  { b->tm = tm; b->target_s = 1.0; b->f = 0; }
+/* --- @bench_destroy@ --- *
+ *
+ * Arguments:  @struct bench_state *b@ = bench state
+ *
+ * Returns:    ---
+ *
+ * Use:                Destroy the benchmark state, releasing the resources that it
+ *             holds.
+ */
  
  void bench_destroy(struct bench_state *b)
-  { b->tm->ops->destroy(b->tm); }
+  { if (b->tm) { b->tm->ops->destroy(b->tm); b->tm = 0; } }
+
+/* --- @do_nothing@ --- *
+ *
+ * Arguments:  @unsigned long n@ = iteration count
+ *             @void *ctx@ = context pointer (ignored)
+ *
+ * Returns:    ---
+ *
+ * Use:                Does nothing at all for @n@ iterations.  Used to calibrate
+ *             the benchmarking state.
+ */
  
-static void do_nothing(unsigned long n, void *p)
+static void do_nothing(unsigned long n, void *ctx)
    { while (n--) RELAX; }
  
+/* --- @bench_calibrate@ --- *
+ *
+ * Arguments:  @struct bench_state *b@ = bench state
+ *
+ * Returns:    Zero on success, %$-1$% if calibration failed.
+ *
+ * Use:                Calibrate the benchmark state, so that it can be used to
+ *             measure performance reasonably accurately.
+ */
+
+#define T_CLB 0.0625                   /* calibration time limit */
+
  int bench_calibrate(struct bench_state *b)
  {
    struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
@@ -420,21 +658,37 @@ int bench_calibrate(struct bench_state *b)
    struct bench_timer *tm = b->tm;
    struct bench_time t0, t1;
    struct bench_timing delta;
+  double r;
    bench_fn *fn = LAUNDER(&do_nothing);
    unsigned f = BTF_ANY;
    int rc;
  
-  if (b->f&BTF_ANY) return (0);
+  /* The model here is that a timing loop has a fixed overhead as we enter
+   * and leave (e.g., to do with the indirect branch into the code), and
+   * per-iteration overheads as we check the counter and loop back.  We aim
+   * to split these apart using linear regression.
+   */
  
+  /* If we've already calibrated then there's nothing to do. */
+  if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
+
+  /* Exercise the inner loop a few times to educate the branch predictor. */
    for (i = 0; i < 10; i++)
-    { tm->ops->now(tm, &t0); fn(1, 0); tm->ops->now(tm, &t1); }
+    { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); }
  
+  /* Now we measure idle loops until they take sufficiently long -- or we run
+   * out of counter.
+   */
    debug("calibrating...");
    n = 1;
    for (;;) {
+
+    /* Measure @n@ iterations of the idle loop. */
      tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1);
      timer_diff(&delta, &t0, &t1); f &= delta.f;
      if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }
+
+    /* Register the timings with the regression machinery. */
      linreg_update(&lr_clk, n, delta.t);
      if (!(f&BTF_CYOK))
        debug("  n = %10lu; t = %12g s", n, delta.t);
@@ -442,42 +696,92 @@ int bench_calibrate(struct bench_state *b)
        linreg_update(&lr_cy, n, delta.cy);
        debug("  n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
      }
-    if (delta.t >= b->target_s/20.0) break;
+
+    /* If we're done then stop. */
+    if (delta.t >= T_CLB) break;
      if (n >= ULONG_MAX - n/3) break;
+
+    /* Update the counter and continue. */
      n += n/3 + 1;
    }
  
-  linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0);
-  debug("clock overhead = (%g n + %g) s", b->clk.m, b->clk.c);
+  /* Now run the linear regression to extract the constant and per-iteration
+   * overheads.
+   */
+  linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
+  debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
    if (f&BTF_CYOK) {
-    linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0);
-    debug("cycle overhead = (%g n + %g) cy", b->cy.m, b->cy.c);
+    linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
+    debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
    }
-  b->f |= f; rc = 0;
+
+  /* We're done. */
+  rc = 0;
  end:
+  b->f |= f | BTF_CLB;                 /* no point trying again */
    return (rc);
  }
  
-int bench_measure(struct bench_timing *t_out, struct bench_state *b,
-                 double base, bench_fn *fn, void *p)
+/* --- @bench_measure@ --- *
+ *
+ * Arguments:  @struct bench_state *b@ = benchmark state
+ *             @struct bench_timing *t_out@ = where to leave the timing
+ *             @double base@ = number of internal units per call
+ *             @bench_fn *fn@, @void *ctx@ = benchmark function to run
+ *
+ * Returns:    Zero on success, %$-1$% if timing failed.
+ *
+ * Use:                Measure a function.  The function @fn@ is called adaptively
+ *             with an iteration count @n@ set so as to run for
+ *             approximately @b->target_s@ seconds.
+ *
+ *             The result is left in @*t_out@, with @t_out->n@ counting the
+ *             final product of the iteration count and @base@ (which might,
+ *             e.g., reflect the number of inner iterations the function
+ *             performs, or the number of bytes it processes per iteration).
+ */
+
+int bench_measure(struct bench_state *b, struct bench_timing *t_out,
+                 double base, bench_fn *fn, void *ctx)
  {
    struct bench_timer *tm = b->tm;
    struct bench_time t0, t1;
-  unsigned long n;
-
-  if (bench_calibrate(b)) return (-1);
+  unsigned long n, nn;
+
+  /* Make sure the state is calibrated and usable. */
+  if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1);
+  if (!(b->f&BTF_TIMEOK)) return (-1);
+
+  /* Main adaptive measurement loop.
+   *
+   * Suppose the timer loop %$n$% iterations in %$t$% seconds.  Our ideal
+   * time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
+   * Otherwise, we need to scale up the iteration count.  The obvious next
+   * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
+   * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
+   * progress.  We know that %$T/t > \sqrt{2}%, so this can only happen when
+   * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
+   * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
+   * again with %$n' = n + 1$% iterations will very likely work.
+   */
    debug("measuring..."); n = 1;
    for (;;) {
-    tm->ops->now(tm, &t0); fn(n, p); tm->ops->now(tm, &t1);
+    tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1);
      timer_diff(t_out, &t0, &t1);
      if (!(t_out->f&BTF_TIMEOK)) return (-1);
      if (!(t_out->f&BTF_CYOK)) debug("  n = %10lu; t = %12g", n, t_out->t);
      else debug("  n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
-    if (t_out->t >= 0.72*b->target_s) break;
-    n *= 1.44*b->target_s/t_out->t;
+    if (t_out->t >= 0.707*b->target_s) break;
+    nn = n*b->target_s/t_out->t;
+    if (nn > n) n = nn;
+    else n++;
    }
+
+  /* Adjust according to the calibration. */
    t_out->t -= n*b->clk.m + b->clk.c;
    if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c;
+
+  /* Report the results, if debugging. */
    if (!(t_out->f&BTF_CYOK)) debug("  adjusted t' = %12g", t_out->t);
    else debug("  adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy);
    if (!(t_out->f&BTF_CYOK))
@@ -485,6 +789,8 @@ int bench_measure(struct bench_timing *t_out, struct bench_state *b,
    else
      debug("  %g s (%g cy) per op; %g ops/s",
           t_out->t/n, t_out->cy/n, n/t_out->t);
+
+  /* All done. */
    t_out->n = n*base; return (0);
  }