[mLib] / test / bench.c

/* -*-c-*-
 *
 * Benchmarking support
 *
 * (c) 2023 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

/*----- Header files ------------------------------------------------------*/

#include "config.h"

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "alloc.h"
#include "arena.h"
#include "bench.h"
#include "bits.h"
#include "dstr.h"
#include "linreg.h"
#include "macros.h"

#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
#  include <cpuid.h>
#  define CPUID_1D_TSC (1u << 4)
#  define CPUID_1xD_TSCP (1u << 27)
#  define USE_X86_RDTSC 1
#endif

#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
#  include <sys/syscall.h>
#  include <sys/types.h>
#  include <unistd.h>
#  include <linux/perf_event.h>
#  ifdef HAVE_VALGRIND_VALGRIND_H
#    include <valgrind/valgrind.h>
#  endif
#  define USE_LINUX_PERFEVENT 1
#  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
#    include <sys/mman.h>
#    define USE_LINUX_PERFEVRDPMC 1
#  endif
#endif

/*----- Data structures ---------------------------------------------------*/

/* Subtimer indices: @CLK@ selects the clock subtimer, @CY@ selects the
 * cycle subtimer, and @NTIMER@ is the number of subtimers.  These index the
 * @ops@ vector in @struct timer@ and the @timertab@ table.
 */
enum { CLK, CY, NTIMER };

/* The built-in timer object.  The generic @struct bench_timer@ comes first:
 * the generic timer operations in this file cast a @struct bench_timer *@
 * back to a @struct timer *@.
 */
struct timer {
  struct bench_timer _t;		/* generic timer base */
  arena *a;				/* arena passed to @x_free@ */
  const struct timer_ops *ops[NTIMER];	/* subtimers for clock and cycles */
  union {
#ifdef USE_X86_RDTSC
    unsigned tscaux;			/* `ia32_tsc_aux' for `rdtscp' */
#endif
#ifdef USE_LINUX_PERFEVENT
    int fd;				/* vanilla `perf_event_open' */
#endif
#ifdef USE_LINUX_PERFEVRDPMC
    struct {				/* `perf_event_open' with `rdpmc' */
      const volatile void *map; size_t sz; /* memory-mapped info */
      pid_t owner;			/*   owning thread id */
    } pmc;
#endif
  } u_cy;				/* state for cycle measurement */
};

/* Operations table for one subtimer implementation.  The functions which
 * return @int@ report success as zero; callers in this file treat any
 * nonzero return as failure.
 */
struct timer_ops {
  const char *name;			/* timer name */
  unsigned f;				/* flags */
/* ... @BTF_...OK@ flags */		/*   expected results */
#define TF_SECRET 16u			/*   don't try this automatically */
  int (*init)(struct timer */*t*/);	/* initialization function */
  int (*preflight)(struct timer */*t*/); /* preflight checks */
  int (*now)(struct timer */*t*/,	/* read current */
	     struct bench_time */*t_out*/, unsigned /*f*/);
  void (*diff)(struct timer */*t*/,	/* difference */
	       struct bench_timing */*t_inout*/,
	       const struct bench_time */*t0*/,
	       const struct bench_time */*t1*/);
  void (*teardown)(struct timer */*t*/); /* release held resources */
};

/*----- Preliminaries -----------------------------------------------------*/

/* The number of nanoseconds in a second. */
#define NS_PER_S 1000000000

/* --- @debug@ --- *
 *
 * Arguments:	@const char *fmt@ = format control string
 *		@...@ = format arguments
 *
 * Returns:	---
 *
 * Use:		Write a debugging message to standard error, prefixed by
 *		`mLib BENCH: ' and followed by a newline -- but only if the
 *		environment variable @MLIB_BENCH_DEBUG@ is set to a value
 *		which doesn't begin with `n' or `0'.
 */

static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
{
  const char *flag = getenv("MLIB_BENCH_DEBUG");
  va_list args;

  /* Debugging output is off unless explicitly requested. */
  if (!flag || *flag == 'n' || *flag == '0') return;

  va_start(args, fmt);
  fputs("mLib BENCH: ", stderr);
  vfprintf(stderr, fmt, args);
  fputc('\n', stderr);
  va_end(args);
}

/* Convert a @kludge64@ value @k@ to @double@.  With a native 64-bit integer
 * type, the @i@ member is converted directly; otherwise the value is
 * assembled from the @lo@ and @hi@ halves, with @hi@ scaled by %$2^{32}$%.
 */
#ifdef HAVE_UINT64
#  define FLOATK64(k) ((double)(k).i)
#else
#  define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
#endif

/* --- @diff_ts@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure (unused)
 *		@struct bench_timing *delta_inout@ = where to put the result
 *		@const struct bench_time *t0, *t1@ = two input times
 *
 * Returns:	---
 *
 * Use:		Calculates a time difference for timers using the
 *		@struct timespec@-like time format.  If both inputs have
 *		@BTF_TIMEOK@ set, then the difference in seconds is stored
 *		in @delta_inout->t@ and @BTF_TIMEOK@ is set in
 *		@delta_inout->f@; otherwise @*delta_inout@ is left alone.
 *		Equal input times are reported as a difference of one
 *		nanosecond, so that the result is never zero.
 */

static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
		    const struct bench_time *t0, const struct bench_time *t1)
{
  unsigned f = t0->f&t1->f;
  kludge64 delta_s;
  uint32 delta_ns;

  if (f&BTF_TIMEOK) {

    /* Calculate the integer differences in seconds and nanoseconds
     * independently.  To avoid underflow, though, add a second's worth of
     * nanoseconds which we'll subtract off later.
     */
    SUB64(delta_s, t1->t.ts.s, t0->t.ts.s);
    delta_ns = t1->t.ts.ns + NS_PER_S - t0->t.ts.ns;

    /* Hack if they're both equal.  Because of the bias added above, equal
     * times show up here as exactly one second's worth of nanoseconds, not
     * as zero: bump the difference to a single nanosecond.
     */
    if (ZERO64(delta_s) && delta_ns == NS_PER_S) delta_ns++;

    /* And apply the nanoseconds difference.  To prevent underflow, pre-
     * emptively borrow one from the integer difference.
     */
    delta_inout->t = FLOATK64(delta_s) - 1.0 + delta_ns/(double)NS_PER_S;

    /* Done. */
    delta_inout->f |= BTF_TIMEOK;
  }
}

/* --- @diff_cycles@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure (unused)
 *		@struct bench_timing *delta_inout@ = where to put the result
 *		@const struct bench_time *t0, *t1@ = two input times
 *
 * Returns:	---
 *
 * Use:		Calculates a cycle-count difference for cycle-counting
 *		timers.  Nothing happens unless both inputs have @BTF_CYOK@
 *		set.  A zero difference is reported as one cycle.
 */

static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
			const struct bench_time *t0,
			const struct bench_time *t1)
{
  kludge64 d;

  /* Both endpoints must carry a valid cycle count. */
  if (!(t0->f&t1->f&BTF_CYOK)) return;

  /* Subtract, convert, and make sure the answer isn't zero. */
  SUB64(d, t1->cy, t0->cy);
  delta_inout->cy = FLOATK64(d);
  if (!delta_inout->cy) delta_inout->cy = 1;
  delta_inout->f |= BTF_CYOK;
}

#undef FLOATK64

/* --- @normalize@ --- *
 *
 * Arguments:	@double *x_inout@ = address of a value to normalize
 *		@const char **unit_out@ = address to store unit prefix
 *		@double scale@ = scale factor for unit steps
 *
 * Returns:	---
 *
 * Use:		Scale @*x_inout@ by a power of @scale@, and set @*unit_out@
 *		to the matching SI prefix, so that printing the two together
 *		reflects the original value.  Values in the range
 *		%$1 \le x < \textit{scale}$% are left alone and get an empty
 *		prefix.  Scaling stops at the last available prefix (`E' or
 *		`a') even if the value is still out of range.  The @scale@
 *		should be 1024 for binary quantities, most notably memory
 *		sizes, or 1000 for other quantities.
 */

static void normalize(double *x_inout, const char **unit_out, double scale)
{
  static const char *const up[] = { "k", "M", "G", "T", "P", "E" };
  static const char *const down[] = { "m", "µ", "n", "p", "f", "a" };
  const unsigned last = sizeof(up)/sizeof(up[0]) - 1;
  const char *prefix = "";
  double v = *x_inout;
  unsigned i = 0;

  if (v < 1) {
    /* Too small: step down through the submultiple prefixes. */
    v *= scale;
    while (v < 1 && i < last) { i++; v *= scale; }
    prefix = down[i];
  } else if (v >= scale) {
    /* Too large: step up through the multiple prefixes. */
    v /= scale;
    while (v >= scale && i < last) { i++; v /= scale; }
    prefix = up[i];
  }

  *x_inout = v; *unit_out = prefix;
}

/*----- The null timer ----------------------------------------------------*/

/* This is a timer which does nothing, in case we don't have any better
 * ideas.
 */

/* Initialization: nothing to set up, so always succeed. */
static int null_init(struct timer *t)
{
  return (0);
}

/* Reading: leave @*t_out@ untouched and report success. */
static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
  return (0);
}

/* Preflight: nothing to check, so always succeed. */
static int null_preflight(struct timer *t)
{
  return (0);
}

/* Difference: leave @*delta_inout@ untouched. */
static void null_diff(struct timer *t, struct bench_timing *delta_inout,
		      const struct bench_time *t0,
		      const struct bench_time *t1)
{
  ;
}

/* Teardown: nothing to release. */
static void null_teardown(struct timer *t)
{
  ;
}

/* Operations table for the null timer: every operation succeeds and does
 * nothing, and the flags word promises no results.  @NULL_ENT@ is the
 * corresponding entry for a table of timers.
 */
static const struct timer_ops null_ops =
  { "null", 0,
    null_init, null_preflight, null_now, null_diff, null_teardown };
#define NULL_ENT &null_ops,

/*----- The broken clock --------------------------------------------------*/

/* This is a timer whose initialization always fails.  It's marked as
 * secret, so it's never tried automatically.
 */

/* Initialization for the broken timer: always fail. */
static int broken_init(struct timer *t)
{
  return (-1);
}

/* Operations table for the broken timer.  Its @init@ always fails, and it's
 * flagged @TF_SECRET@ so that automatic selection and the `list' output
 * skip it.  @BROKEN_ENT@ is the corresponding entry for a table of timers.
 */
static const struct timer_ops broken_ops =
  { "broken", TF_SECRET,
    broken_init, null_preflight, null_now, null_diff, null_teardown };
#define BROKEN_ENT &broken_ops,

/*----- Linux performance counters ----------------------------------------*/

/* This is a cycle counter which uses the Linux performance event system,
 * which is probably the best choice if it's available.
 */

#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)

/* --- @perfevent_open@ --- *
 *
 * Arguments:	---
 *
 * Returns:	File descriptor, or %$-1$%.
 *
 * Use:		Open a performance measurement descriptor set up to count
 *		hardware CPU cycles, excluding kernel and hypervisor
 *		activity.  Failure is reported through @debug@.
 */

static int perfevent_open(void)
{
  struct perf_event_attr pea = { 0 };
  int fd;

  /* Describe the event we want. */
  pea.size = sizeof(pea);
  pea.type = PERF_TYPE_HARDWARE;
  pea.config = PERF_COUNT_HW_CPU_CYCLES;
  pea.disabled = 0;
  pea.exclude_kernel = 1;
  pea.exclude_hv = 1;

  /* Ask the kernel for it. */
  fd = syscall(SYS_perf_event_open, &pea, 0, -1, -1, 0);
  if (fd >= 0) return (fd);

  debug("couldn't open perf event: %s", strerror(errno));
  return (-1);
}

/* Read the cycle counter from the perf-event descriptor into @t_out->cy@,
 * setting @BTF_CYOK@ on success.  A failed or short read is reported through
 * @debug@ and leaves the flag unset.  The function itself always returns
 * zero.
 */
static int perfevent_now(struct timer *t,
			 struct bench_time *t_out, unsigned f)
{
  const size_t want = sizeof(t_out->cy.i);

  if (read(t->u_cy.fd, &t_out->cy.i, want) == (ssize_t)want)
    t_out->f |= BTF_CYOK;
  else
    debug("failed to read perf-event counter: %s", strerror(errno));
  return (0);
}

/* Release the perf-event descriptor. */
static void perfevent_teardown(struct timer *t)
{
  close(t->u_cy.fd);
}

/* Open a cycle-counting perf-event descriptor and store it in the timer.
 * Returns zero on success or %$-1$% if the descriptor couldn't be opened.
 */
static int perfevent_init(struct timer *t)
{
  int fd = perfevent_open();

  if (fd < 0) return (-1);
  t->u_cy.fd = fd;
  return (0);
}

/* Operations table for the cycle counter which reads the perf-event
 * descriptor with @read@.  It promises cycle counts only.
 * @PERFEVENT_VANILLA_CYENT@ is the corresponding cycle-table entry.
 */
static const struct timer_ops perfevent_ops =
  { "linux-perf-read-hw-cycles", BTF_CYOK,
    perfevent_init, null_preflight, perfevent_now,
    diff_cycles, perfevent_teardown };
#define PERFEVENT_VANILLA_CYENT &perfevent_ops,

#  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))

/* Special syscall-free version for x86 using the `rdpmc' instruction.
 *
 * This is a bit weird because it does both kinds of measurement in a single
 * operation.
 */

/* Read the time and the cycle count using the memory-mapped perf-event page
 * at @t->u_cy.pmc.map@.  Each kind of reading is taken only if the page
 * advertises the matching capability (@cap_user_time@, @cap_user_rdpmc@);
 * the results are stored in @t_out->t.rawns@ and @t_out->cy@, and
 * @BTF_TIMEOK@ and @BTF_CYOK@ are set in @t_out->f@ accordingly.  The flags
 * argument @f@ is unused.  Always returns zero.
 */
static int perfevrdpmc_now(struct timer *t,
			   struct bench_time *t_out, unsigned f)
{
  const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;

  /* These are initialized from themselves -- presumably just to quieten
   * `maybe uninitialized' warnings.  Each is assigned in the loop below
   * before the corresponding flag in @ff@ permits it to be read.
   */
  unsigned long long tsc = tsc, toff = toff, tenb = tenb;
  unsigned long long cy = cy, cyoff = cyoff;
  unsigned long long m, hi, lo;
  unsigned tshift = tshift, tmult = tmult, q0, q1, ff;

  /* Repeat until we can complete this job without the buffer changing in the
   * middle.
   */
  q0 = map->lock;
  __atomic_thread_fence(__ATOMIC_ACQ_REL);
  for (;;) {
    ff = 0;				/* forget any earlier attempt */

    /* Read the passage-of-time information. */
    if (map->cap_user_time) {
      tenb = map->time_enabled;
      tsc = __builtin_ia32_rdtsc();
      tshift = map->time_shift;
      tmult = map->time_mult;
      toff = map->time_offset;
      ff |= BTF_TIMEOK;
    }

    /* Read the performance-counter information.  The counter to read is
     * selected by @map->index - 1@.
     */
    if (map->cap_user_rdpmc) {
      cy = __builtin_ia32_rdpmc(map->index - 1);
      cyoff = map->offset;
      ff |= BTF_CYOK;
    }

    /* Check the sequence number again.  If it changed while we were
     * reading, go round again, starting from the new value.
     */
    __atomic_thread_fence(__ATOMIC_ACQ_REL);
    q1 = map->lock;
    if (q0 == q1) break;
    q0 = q1;
  }

  if (ff&BTF_TIMEOK) {
    /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
     * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
     * nanoseconds.
     *
     * The count is split into the parts above and below bit %$w$% and the
     * two are scaled separately, rather than multiplying the whole count by
     * %$a$% at once.  The @time_enabled@ value read from the page is added
     * to the result.
     */

    m = (1ull << tshift) - 1;
    hi = tsc >> tshift; lo = tsc&m;
    t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
    t_out->f |= BTF_TIMEOK;
  }

  if (ff&BTF_CYOK) {
    /* We have the cycle count: the raw counter value plus the offset read
     * from the page.
     */

    t_out->cy.i = cy + cyoff;
    t_out->f |= BTF_CYOK;
  }
  return (0);
}

/* Calculate both the time and the cycle differences between two readings
 * taken by @perfevrdpmc_now@.  Each is computed only if both readings have
 * the relevant flag set, and each is bumped to one unit (a nanosecond or a
 * cycle) if it would otherwise be zero.
 */
static void perfevrdpmc_diff(struct timer *t,
			     struct bench_timing *delta_inout,
			     const struct bench_time *t0,
			     const struct bench_time *t1)
{
  const unsigned both = t0->f&t1->f;
  unsigned long long ns;

  /* Elapsed time, from the raw nanosecond counts. */
  if (both&BTF_TIMEOK) {
    ns = t1->t.rawns.i - t0->t.rawns.i;
    delta_inout->t = (ns ? ns : 1)/(double)NS_PER_S;
    delta_inout->f |= BTF_TIMEOK;
  }

  /* Elapsed cycles. */
  if (both&BTF_CYOK) {
    delta_inout->cy = t1->cy.i - t0->cy.i;
    if (!delta_inout->cy) delta_inout->cy = 1;
    delta_inout->f |= BTF_CYOK;
  }
}

/* Unmap a perf-event page of size @mapsz@, if @map@ isn't null. */
static void perfevrdpmc_unmap
  (const volatile struct perf_event_mmap_page *map, size_t mapsz)
{
  if (!map) return;
  munmap(UNQUALIFY(struct perf_event_mmap_page, map), mapsz);
}

/* Release the timer's perf-event mapping. */
static void perfevrdpmc_teardown(struct timer *t)
{
  perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz);
}

/* Open a cycle-counting perf event, map its information page, and record
 * the mapping, its size, and the calling thread's id in @t->u_cy.pmc@.
 *
 * Returns zero on success or %$-1$% on failure.  On failure, nothing is
 * left open or mapped, and @t->u_cy.pmc@ is not modified.  The descriptor
 * is closed before returning in either case: only the mapping is kept.
 */
static int perfevrdpmc_setup(struct timer *t)
{
  const volatile struct perf_event_mmap_page *map = 0;
  void *p;
  int pgsz, mapsz = 0, fd = -1, rc;

  /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
   * big a page is.
   */
  pgsz = sysconf(_SC_PAGESIZE);
  if (pgsz < 0) {
    debug("failed to discover page size!: %s", strerror(errno));
    rc = -1; goto end;
  }

  /* Open the measurement descriptor.  Failure is signalled by a negative
   * return; note that zero is a perfectly good descriptor.
   */
  fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }

  /* Map the information page.  Hold the result separately until we know
   * it's good, so that the cleanup code never sees @MAP_FAILED@; and leave
   * via the cleanup code on failure so that the descriptor gets closed.
   */
  mapsz = 2*pgsz;
  p = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
  if (p == MAP_FAILED) {
    debug("failed to map perf event: %s", strerror(errno));
    rc = -1; goto end;
  }
  map = p;

  /* Hand the mapping over to the timer, and remember who owns it. */
  t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0;
  t->u_cy.pmc.owner = syscall(SYS_gettid); rc = 0;
end:
  if (fd != -1) close(fd);
  perfevrdpmc_unmap(map, mapsz);
  return (rc);
}

/* Check that the perf-event mapping is present and was set up by the
 * calling thread.  If there's no mapping, or it belongs to some other
 * thread, then (discarding the stale mapping in the latter case) try to set
 * up a new one.  Returns zero if a usable mapping is in place, or %$-1$% if
 * setting one up failed.
 */
static int perfevrdpmc_preflight(struct timer *t)
{
  if (!t->u_cy.pmc.map)
    debug("retry perf event map setup");
  else if (t->u_cy.pmc.owner != syscall(SYS_gettid)) {
    debug("pid changed: reopen perf event map");
    perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz);
    t->u_cy.pmc.map = 0;
  } else
    return (0);

  /* Something was amiss: start afresh. */
  return (perfevrdpmc_setup(t) ? -1 : 0);
}

/* Initialize the `rdpmc'-based cycle subtimer.  This fails (returning
 * %$-1$%) when running under Valgrind, if the processor doesn't report
 * `rdtsc' support, or if the perf-event mapping can't be set up; otherwise
 * it returns zero.
 */
static int perfevrdpmc_cyinit(struct timer *t)
{
  unsigned eax, ebx, ecx, edx;
  int have_tsc;

#  ifdef HAVE_VALGRIND_VALGRIND_H
  /* Valgrind doesn't like `rdpmc' instructions, so just bail. */
  if (RUNNING_ON_VALGRIND) return (-1);
#  endif

  /* We need `rdtsc' to do the passage-of-time measurement. */
  have_tsc = __get_cpuid(1, &eax, &ebx, &ecx, &edx) && (edx&CPUID_1D_TSC);
  if (!have_tsc) { debug("no `rdtsc' instrunction"); return (-1); }

  /* Set things up. */
  return (perfevrdpmc_setup(t) ? -1 : 0);
}

/* Operations table for the `rdpmc'-based cycle subtimer.  Unusually for a
 * cycle subtimer, it promises both time and cycle measurements.
 */
static const struct timer_ops perfevrdpmc_cyops =
  { "linux-x86-perf-rdpmc-hw-cycles", BTF_TIMEOK | BTF_CYOK,
    perfevrdpmc_cyinit, perfevrdpmc_preflight, perfevrdpmc_now,
    perfevrdpmc_diff, perfevrdpmc_teardown };

/* Initialize the clock-side companion of the `rdpmc' cycle subtimer.  This
 * succeeds only if @perfevrdpmc_cyops@ is already installed as the cycle
 * subtimer.
 */
static int perfevrdpmc_clkinit(struct timer *t)
{
  if (t->ops[CY] == &perfevrdpmc_cyops) return (0);

  debug("`linux-x86-perf-rdpmc-hw-cycles' not set as cycle subtimer");
  return (-1);
}

/* Operations table for the clock-side companion of the `rdpmc' cycle
 * subtimer.  It shares the cycle subtimer's name, promises no results, and
 * does nothing once initialized: the cycle subtimer's operations take the
 * time measurement too.
 */
static const struct timer_ops perfevrdpmc_clkops =
  { "linux-x86-perf-rdpmc-hw-cycles", 0,
    perfevrdpmc_clkinit, null_preflight, null_now,
    null_diff, null_teardown };

#    define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
#    define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,

#  else
#    define PERFEVENT_RDPMC_CLKENT
#    define PERFEVENT_RDPMC_CYENT
#  endif

#  define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
#  define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
#else
#  define PERFEVENT_CLKENT
#  define PERFEVENT_CYENT
#endif

/*----- Intel time-stamp counter ------------------------------------------*/

/* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
 * really suitable for performance measurement because it gets confused by
 * CPU frequency adjustments.
 */

#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))

/* Read the time-stamp counter into @t_out->cy@ and set @BTF_CYOK@.  Always
 * returns zero.
 */
static int x86rdtsc_now(struct timer *t,
			struct bench_time *t_out, unsigned f)
{
  t_out->cy.i = __builtin_ia32_rdtsc();
  t_out->f |= BTF_CYOK;
  return (0);
}

/* Initialize the `rdtsc' cycle subtimer.  Returns zero if the processor
 * reports `rdtsc' support, setting the stored @tscaux@ to all-bits-set;
 * otherwise returns %$-1$%.
 */
static int x86rdtsc_init(struct timer *t)
{
  unsigned eax, ebx, ecx, edx;

  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (edx&CPUID_1D_TSC)) {
    t->u_cy.tscaux = ~0u;
    return (0);
  }

  debug("no `rdtsc' instrunction");
  return (-1);
}

/* Read the time-stamp counter using `rdtscp'.  Unless @BTF_T1@ is set in
 * @f@, the auxiliary value returned by the instruction is remembered in the
 * timer.  If @BTF_T1@ is set, then the auxiliary value must match the
 * remembered one: if it doesn't, the reading is rejected and %$-1$% is
 * returned.  Otherwise the count is stored in @t_out->cy@, @BTF_CYOK@ is
 * set, and zero is returned.
 */
static int x86rdtscp_now(struct timer *t,
			 struct bench_time *t_out, unsigned f)
{
  unsigned aux;
  unsigned long long tsc = __builtin_ia32_rdtscp(&aux);

  if (f&BTF_T1) {
    /* Second reading: must agree with the first. */
    if (aux != t->u_cy.tscaux) {
      debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
	    aux, t->u_cy.tscaux);
      return (-1);
    }
  } else {
    /* Not a second reading: remember the auxiliary value for later. */
    t->u_cy.tscaux = aux;
  }

  t_out->cy.i = tsc;
  t_out->f |= BTF_CYOK;
  return (0);
}

/* Initialize the `rdtscp' cycle subtimer.  Returns zero if the processor
 * reports `rdtscp' support, or %$-1$% if not.
 */
static int x86rdtscp_init(struct timer *t)
{
  unsigned eax, ebx, ecx, edx;

  if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) &&
      (edx&CPUID_1xD_TSCP))
    return (0);

  debug("no `rdtscp' instrunction");
  return (-1);
}

/* Operations tables for the two time-stamp-counter cycle subtimers, using
 * the `rdtsc' and `rdtscp' instructions respectively.  Both promise cycle
 * counts only, and share @diff_cycles@ for computing differences.
 */
static const struct timer_ops x86rdtsc_ops =
  { "x86-rdtsc", BTF_CYOK,
    x86rdtsc_init, null_preflight, x86rdtsc_now,
    diff_cycles, null_teardown };
static const struct timer_ops x86rdtscp_ops =
  { "x86-rdtscp", BTF_CYOK,
    x86rdtscp_init, null_preflight,
    x86rdtscp_now, diff_cycles, null_teardown };

#  define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
#else
#  define X86RDTSC_CYENT
#endif

/*----- POSIX `clock_gettime' ---------------------------------------------*/

/* This is a clock based on the POSIX time interface, with up to nanosecond
 * precision.  It reads the calling thread's CPU-time clock
 * (@CLOCK_THREAD_CPUTIME_ID@).
 */

#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)

static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
  struct timespec now;

  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
    { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
  ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
  t_out->f |= BTF_TIMEOK; return (0);
}

/* Operations table for the POSIX thread-CPU-time clock subtimer.  It
 * promises time measurements only, and needs no setup or teardown.
 */
static const struct timer_ops gettime_ops =
  { "posix-thread-cputime", BTF_TIMEOK,
    null_init, null_preflight, gettime_now, diff_ts, null_teardown };

#  define GETTIME_CLKENT &gettime_ops,
#else
#  define GETTIME_CLKENT
#endif

/*----- Standard C `clock' ------------------------------------------------*/

/* This is a real-time clock based on the C `clock' function which is
 * guaranteed to be available, though it's not likely to be very good.
 */

/* Read the standard C @clock@ into @t_out->t.clk@, setting @BTF_TIMEOK@ on
 * success.  If @clock@ reports failure, this is noted through @debug@ and
 * the flag is left unset.  The function itself always returns zero.
 */
static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
  const clock_t ticks = clock();

  if (ticks != (clock_t)-1) {
    t_out->t.clk = ticks;
    t_out->f |= BTF_TIMEOK;
  } else {
    debug("error reading standard clock: %s", strerror(errno));
  }
  return (0);
}

/* Calculate the difference between two @clock@ readings, in seconds.
 * Nothing happens unless both readings have @BTF_TIMEOK@ set.  A zero
 * difference is reported as a single clock tick.
 */
static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
			const struct bench_time *t0,
			const struct bench_time *t1)
{
  clock_t ticks;

  /* Both endpoints must carry a valid time. */
  if (!(t0->f&t1->f&BTF_TIMEOK)) return;

  ticks = t1->t.clk - t0->t.clk;
  if (!ticks) ticks = 1;
  delta_inout->t = ticks/(double)CLOCKS_PER_SEC;
  delta_inout->f |= BTF_TIMEOK;
}

/* Operations table for the standard C @clock@ subtimer.  It promises time
 * measurements only, and needs no setup or teardown.
 */
static const struct timer_ops clock_ops =
  { "stdc-clock", BTF_TIMEOK, null_init, null_preflight, clock_now,
    clock_diff, null_teardown };

#define CLOCK_CLKENT &clock_ops,

/*----- Timing setup ------------------------------------------------------*/

/* Tables of timing sources.
 *
 * Each table is a null-terminated vector of subtimer operations: @clktab@
 * for clock subtimers and @cytab@ for cycle subtimers.  Automatic selection
 * tries the entries in order, so earlier entries are preferred.  The
 * @..._CLKENT@ and @..._CYENT@ macros expand to nothing when the
 * corresponding implementation isn't compiled in.
 */
static const struct timer_ops
  *const clktab[] = { PERFEVENT_CLKENT
		      GETTIME_CLKENT
		      CLOCK_CLKENT
		      BROKEN_ENT
		      0 },
  *const cytab[] = { PERFEVENT_CYENT
		     X86RDTSC_CYENT
		     NULL_ENT
		     BROKEN_ENT
		     0 };

/* Table of subtimer kinds, indexed by @CLK@ and @CY@. */
static const struct timertab {
  const char *what;			/* name of this kind of subtimer */
  const char *env;			/* environment variable to consult */
  const struct timer_ops *const *opstab; /* null-terminated table of ops */
} timertab[] = {
  { "clock",	"MLIB_BENCH_CLKTIMER",		clktab },
  { "cycle",	"MLIB_BENCH_CYCLETIMER",	cytab }
};

/* --- @find_timer@ --- *
 *
 * Arguments:	@const char *name@ = timer name
 *		@size_t sz@ = length of name
 *		@unsigned tm@ = which subtimer we're looking for
 *
 * Returns:	The table entry matching the given name, or null if there
 *		isn't one.
 *
 * Use:		Looks up a subtimer by name in the table for kind @tm@.  The
 *		name needn't be null-terminated; it must match an entry's
 *		name exactly.  A failed lookup is reported through @debug@.
 */

static const struct timer_ops *find_timer(const char *name, size_t sz,
					  unsigned tm)
{
  const struct timer_ops *const *tt = timertab[tm].opstab;
  const struct timer_ops *ops;

  /* Scan the table for an entry of the right length and content. */
  while ((ops = *tt++) != 0)
    if (strlen(ops->name) == sz && MEMCMP(name, ==, ops->name, sz))
      return (ops);

  debug("%s timer `%.*s' not found",
	timertab[tm].what, (int)sz, name);
  return (0);
}

/* --- @try_timer@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure
 *		@const struct timer_ops *ops@ = timer ops
 *		@unsigned tm@ = which subtimer we're setting
 *
 * Returns:	Zero on success, %$-1$% if timer failed.
 *
 * Use:		Tries to initialize the timer @t@, reporting a debug message
 *		if it worked.
 */

static int try_timer(struct timer *t,
		     const struct timer_ops *ops, unsigned tm)
{
  struct bench_time t0, t1;
  struct bench_timing delta;
  int rc;
  unsigned f = 0;
#define f_teardown 1u

  if (ops->init(t)) { rc = -1; goto end; }
  f |= f_teardown;

  if (ops->preflight(t)) { rc = -1; goto end; }
  t0.f = t1.f = 0;
  do {
    while (ops->now(t, &t0, BTF_T0));
  } while (ops->now(t, &t1, BTF_T1));
  delta.f = 0; ops->diff(t, &delta, &t0, &t1);
  if ((ops->f ^ delta.f)&BTF_ANY) { rc = -1; goto end; }

  debug("selected %s timer `%s'", timertab[tm].what, ops->name);
  t->ops[tm] = ops; f &= ~f_teardown; rc = 0;

end:
  if (f&f_teardown) ops->teardown(t);
  return (rc);

#undef f_teardown
}

/* --- @select_timer@ --- *
 *
 * Arguments:	@struct timer *t@ = timer structure
 *		@unsigned tm@ = which subtimer we're setting
 *		@const char *config@, @size_t sz@ = config string
 *
 * Returns:	Zero on success, %$-1$% if timer failed.
 *
 * Use:		Select a timer from the table.  If @config@ is not null,
 *		then parse it as a comma-separated list of timer names and
 *		use the first one listed that seems to work; otherwise, try
 *		the timers in the table in order, skipping those marked as
 *		secret.
 */

static int select_timer(struct timer *t, unsigned tm,
			const char *config, size_t sz)
{
  const struct timer_ops *const *tt;
  const struct timer_ops *ops;
  const char *end, *comma;

  if (config) {
    /* Work through the explicit list, one name at a time. */
    for (end = config + sz; ; config = comma + 1) {
      comma = memchr(config, ',', end - config);
      if (!comma) comma = end;
      ops = find_timer(config, comma - config, tm);
      if (ops && !try_timer(t, ops, tm)) return (0);
      if (comma >= end) break;
    }
  } else {
    /* No list: take the first public entry which works. */
    for (tt = timertab[tm].opstab; *tt; tt++) {
      if ((*tt)->f&TF_SECRET) continue;
      if (!try_timer(t, *tt, tm)) return (0);
    }
  }

  debug("no suitable %s timer found", timertab[tm].what);
  return (-1);
}

/* Bench timer operations. */
/* Append a description of the timer to @d@, naming the implementation
 * selected for each subtimer.
 */
static void timer_describe(struct bench_timer *tm, dstr *d)
{
  struct timer *t = (struct timer *)tm;
  const char *sep = "builtin: ";
  unsigned i;

  for (i = 0; i < NTIMER; i++) {
    dstr_puts(d, sep); sep = ", ";
    dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
  }
}

/* Run each subtimer's preflight check in turn.  Returns zero if they all
 * pass, or %$-1$% as soon as one fails.
 */
static int timer_preflight(struct bench_timer *tm)
{
  struct timer *t = (struct timer *)tm;
  unsigned i = 0;

  while (i < NTIMER)
    if (t->ops[i++]->preflight(t)) return (-1);
  return (0);
}

/* Take a reading: clear the flags in @*t_out@ and then have each subtimer
 * in turn fill in its part.  Returns zero on success, or %$-1$% as soon as
 * a subtimer fails.
 */
static int timer_now(struct bench_timer *tm,
		     struct bench_time *t_out, unsigned f)
{
  struct timer *t = (struct timer *)tm;
  unsigned i = 0;

  t_out->f = 0;
  while (i < NTIMER)
    if (t->ops[i++]->now(t, t_out, f)) return (-1);
  return (0);
}

/* Compute the difference between two readings: clear the flags in @*t_out@
 * and then have each subtimer in turn fill in its part.
 */
static void timer_diff(struct bench_timer *tm,
		       struct bench_timing *t_out,
		       const struct bench_time *t0,
		       const struct bench_time *t1)
{
  struct timer *t = (struct timer *)tm;
  unsigned i = 0;

  t_out->f = 0;
  while (i < NTIMER) {
    t->ops[i]->diff(t, t_out, t0, t1);
    i++;
  }
}

static void timer_destroy(struct bench_timer *tm)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  if (!t) return;
  for (i = 0; i < NTIMER; i++)
    if (t->ops[i]) t->ops[i]->teardown(t);
  x_free(t->a, t);
}

/* The generic timer operations table for the built-in timer, installed in
 * each timer made by @bench_createtimer@.
 */
static const struct bench_timerops timer_ops =
  { timer_describe, timer_preflight, timer_now, timer_diff, timer_destroy };

/* --- @bench_createtimer@ --- *
 *
 * Arguments:	@const char *config@ = timer configuration string, or null
 *
 * Returns:	A freshly constructed standard timer object, or null if no
 *		suitable subtimer could be set up.
 *
 * Use:		Allocate a timer.  Dispose of it by calling
 *		@tm->ops->destroy(tm)@ when you're done.
 *
 *		The configuration string is a sequence of whitespace-
 *		separated words.  The word `list' prints the names of the
 *		available (non-secret) subtimers to standard output.  A word
 *		of the form `clock=LIST' or `cycle=LIST' sets a comma-
 *		separated list of subtimer names to try, in order.  Anything
 *		else is reported through @debug@ and ignored.  The
 *		environment variables named in @timertab@ override the
 *		corresponding lists.
 *
 *		Applications should not set configuration strings except as
 *		established by user action, e.g., from a command-line option,
 *		environment variable, or configuration file.
 */

struct bench_timer *bench_createtimer(const char *config)
{
  struct timer *t = 0;
  struct bench_timer *ret = 0;
  struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
  const struct timer_ops *const *tt;
  const char *p, *l; size_t n, nn;
  unsigned i;

  /* Parse the configuration string. */
  if (config) {

    /* The first thing to do is find the end of the string. */
    l = config + strlen(config);

    for (;;) {
      /* Process the whitespace-separated words of the string one by one. */

      /* Skip over any initial whitespace.  If we hit the end of the string
       * then we're done.
       */
      for (;;)
	if (config >= l) goto done_config;
	else if (!ISSPACE(*config)) break;
	else config++;

      /* There's definitely a word here.  Find the end of it. */
      for (p = config; p < l && !ISSPACE(*p); p++);
      nn = p - config;

      /* Try various simple keywords. */
#define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))

      if (MATCHP("list")) {
	/* The `list' keyword requests lists of the available timer
	 * implementations.
	 */

	for (i = 0; i < NTIMER; i++) {
	  printf("%s timers:", timertab[i].what);
	  for (tt = timertab[i].opstab; *tt; tt++)
	    if (!((*tt)->f&TF_SECRET)) printf(" %s", (*tt)->name);
	  putchar('\n');
	}
	goto next_config;
      }

#undef MATCHP

      /* Otherwise it's an assignment, setting a subtimer list.  Here, @n@
       * is the length of the keyword before the `=', or of the whole word
       * if there isn't one.
       */
      p = memchr(config, '=', nn);
      if (!p)
	n = nn;
      else {
	n = p - config;
	for (i = 0; i < NTIMER; i++)
	  if (STRNCMP(config, ==, timertab[i].what, n) &&
	      !timertab[i].what[n]) {
	    /* A later list for the same subtimer replaces an earlier one. */
	    if (tmconf[i].p)
	      debug("duplicate %s timer list", timertab[i].what);
	    tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
	    goto next_config;
	  }
      }
      debug("unrecognized config keyword `%.*s'", (int)n, config);

      /* Move on to the next word. */
    next_config:
      config += nn;
    }
  done_config:;
  }

  /* Override these settings from the environment. */
  for (i = 0; i < NTIMER; i++) {
    p = getenv(timertab[i].env);
    if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
  }

  /* All seems well.  Allocate the timer object. */
  XNEW(t); t->a = arena_global;
  for (i = 0; i < NTIMER; i++) t->ops[i] = 0;

  /* Try to set up the subtimers.  Work downwards, so that the cycle
   * subtimer is chosen before the clock subtimer.
   */
  for (i = NTIMER; i--; )
    if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;

  /* All is done.  Clear @t@ so that the cleanup below leaves the new timer
   * alone.
   */
  t->_t.ops = &timer_ops; t->_t.ref = 1; ret = &t->_t; t = 0;
end:
  if (t) timer_destroy(&t->_t);
  return (ret);
}

/*----- Benchmarking ------------------------------------------------------*/

/* --- @bench_init@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state to initialize
 *		@struct bench_timer *tm@ = timer to attach, or null
 *
 * Returns:	Zero on success, %$-1$% on failure.
 *
 * Use:		Initialize the benchmark state.  On success, the timer state
 *		still needs to be calibrated (use @bench_calibrate@) before
 *		it can be used, but this will be done automatically by
 *		@bench_measure@ if it's not done by hand earlier.  The timer
 *		is now owned by the benchmark state and will be destroyed by
 *		@bench_destroy@.
 *
 *		If @tm@ is null, then a timer is constructed automatically,
 *		as if by @bench_createtimer(0)@.  The only reason for
 *		failure is if @tm@ was null on entry, and this automatic
 *		construction failed.  The state is then safe to discard, but
 *		calling @bench_destroy@ is safe too.
 */

int bench_init(struct bench_state *b, struct bench_timer *tm)
{
  /* Make sure the state is safe to destroy, whatever happens next. */
  b->tm = 0;

  /* Find a timer if the caller didn't provide one. */
  if (!tm) tm = bench_createtimer(0);
  if (!tm) return (-1);

  /* Fill in the state. */
  b->tm = tm; b->target_s = 1.0; b->f = 0;
  return (0);
}

/* --- @bench_destroy@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state
 *
 * Returns:	---
 *
 * Use:		Destroy the benchmark state, releasing the resources that it
 *		holds.
 */

void bench_destroy(struct bench_state *b)
  { if (b->tm && !--b->tm->ref) { b->tm->ops->destroy(b->tm); b->tm = 0; } }

/* --- @spin@ --- *
 *
 * Arguments:	@unsigned long n@ = iteration count
 *		@void *ctx@ = context pointer (ignored)
 *
 * Returns:	---
 *
 * Use:		Does nothing at all for @n@ iterations, other than whatever
 *		the @RELAX@ macro expands to.  Used to calibrate the
 *		benchmarking state: @bench_calibrate@ calls it when asked to
 *		calibrate for indirect calls.
 */

static void spin(unsigned long n, void *ctx)
  { while (n--) RELAX; }

/* --- @bench_calibrate@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state
 *		@unsigned f@ = calibration flags
 *
 * Returns:	Zero on success, %$-1$% if calibration failed.
 *
 * Use:		Calibrate the benchmark state, so that it can be used to
 *		measure performance reasonably accurately.
 *
 *		Calibration will take into account how the subject code is
 *		going to be located.  If you're going to use @BENCH_MEASURE@
 *		to measure a piece of literal code, then leave @f@ zero.  If
 *		the code to be measured is going to be executed via an
 *		indirect branch, e.g., through the @measure@ function, then
 *		set @BTF_INDIRECT@.
 *
 *		Calibration is attempted only once per state: later calls
 *		simply report the outcome of the first.  If calibration
 *		fails, none of the @BTF_...OK@ flags is set in the state.
 */

#define T_CLB 0.0625			/* calibration time limit */

int bench_calibrate(struct bench_state *b, unsigned f)
{
  struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
  struct bench_timer *tm = b->tm;
  struct bench_timing delta;
  double n, r;
  unsigned i, tf = BTF_ANY;
  BENCH_TIMELOOP_DECLS;
  int rc;

  /* The model here is that a timing loop has a fixed overhead as we enter
   * and leave (e.g., to do with the indirect branch into the code), and
   * per-iteration overheads as we check the counter and loop back.  We aim
   * to split these apart using linear regression.
   */

  /* If we've already calibrated then there's nothing to do. */
  if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);

  /* Run the timer preflight check. */
  if (tm->ops->preflight(tm)) { rc = -1; goto end; }

  /* Exercise the inner loop a few times to educate the branch predictor.
   * This is only useful if we're executing via an indirect call.
   */
  if (f&BTF_INDIRECT) {
    for (i = 0; i < 50; i++)
      BENCH_TIMELOOP_TAG(setup, b->tm, &delta, 10000, ;)
	LAUNDER(&spin)(_bench_n, 0);
  }

  /* Now we measure idle loops until they take sufficiently long -- or we run
   * out of counter.
   */
  debug("calibrating...");
  n = 1.0;
  for (;;) {

    /* Measure @n@ iterations of the idle loop. */
    if (f&BTF_INDIRECT)
      BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
	LAUNDER(&spin)(_bench_n, 0);
    else
      BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
	while (_bench_n--) RELAX;
    tf &= delta.f; if (!(tf&BTF_TIMEOK)) { rc = -1; goto end; }

    /* Register the timings with the regression machinery. */
    linreg_update(&lr_clk, n, delta.t);
    if (!(tf&BTF_CYOK))
      debug("  n = %10.0f; t = %12g s", n, delta.t);
    else {
      linreg_update(&lr_cy, n, delta.cy);
      debug("  n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
    }

    /* If we're done then stop. */
    if (delta.t >= T_CLB) break;
    if (n >= ULONG_MAX - n/3) break;

    /* Update the counter and continue. */
    n += n/3.0 + 1.0;
  }

  /* Now run the linear regression to extract the constant and per-iteration
   * overheads.
   */
  linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
  debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
  if (tf&BTF_CYOK) {
    linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
    debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
  }

  /* We're done. */
  rc = 0;
end:
  /* On failure, the overhead parameters haven't been filled in, so don't
   * claim that any measurement is OK.  In particular, if the preflight
   * check failed then @tf@ still has its initial value; recording that
   * would make later calls report success.
   */
  if (rc) tf = 0;
  b->f |= tf | BTF_CLB;			/* no point trying again */
  return (rc);
}

/* --- @bench_preflight@ --- *
 *
 * Arguments:	@struct bench_state *b@ = benchmark state
 *
 * Returns:	Zero on success, %$-1$% on failure.
 *
 * Use:		Prepares for benchmarking on the current thread.  Current
 *		checks are that the timer is calibrated and that it can
 *		successfully measure time; the timer preflight is also run.
 *
 *		Users are unlikely to find this function useful: it's called
 *		automatically by the @BENCH_MEASURE@ macro and the
 *		@bench_measure@ function.
 */

int bench_preflight(struct bench_state *b)
{
  struct bench_timer *timer = b->tm;
  int rc = -1;

  /* Both flags are required: calibration must already have been run on this
   * state, and the timer must have shown that it can measure elapsed time.
   * Only then is it worth invoking the timer's own preflight hook, which
   * reports failure by returning nonzero.
   */
  if ((b->f&BTF_CLB) && (b->f&BTF_TIMEOK)) {
    if (!timer->ops->preflight(timer)) {
      debug("measuring...");
      rc = 0;
    }
  }

  return (rc);
}

/* --- @bench_adapt@ --- *
 *
 * Arguments:	@double *n_inout@ = number of iterations, updated
 *		@double target_s@ = target time in seconds
 *		@const struct bench_timing *t@ = timing from the previous run
 *
 * Returns:	Nonzero if the measurement is sufficient; zero to run again.
 *
 * Use:		This function determines a suitable number of iterations of a
 *		benchmark function to perform next.  It is used in a loop
 *		such as the following.
 *
 *			@double n = 1.0;@
 *			@struct bench_timing t;@
 *
 *			@do {@
 *			  (run @n@ iterations; set @t@ to the timing)
 *			@} while (!bench_adapt(&n, target_s, &t));@
 *
 *		On entry, @*n_inout@ should be the number of iterations
 *		performed by the previous pass, and @*t@ the resulting time;
 *		the @BTF_TIMEOK@ flag must be set in @t->f@.  If the timing is
 *		sufficient -- @t->t@ is sufficiently close to @target_s@
 *		-- then the function returns nonzero to indicate that
 *		measurement is complete.  Otherwise, it sets @*n_inout@ to a
 *		new, larger iteration count and returns zero to indicate that
 *		a further pass is necessary.
 */

int bench_adapt(double *n_inout, double target_s,
		const struct bench_timing *t)
{
  double n = *n_inout, nn;

  /* Dump the results for debugging. */
  if (!(t->f&BTF_CYOK)) debug("  n = %10.0f; t = %12g", n, t->t);
  else debug("  n = %10.0f; t = %12g, cy = %10.0f", n, t->t, t->cy);

  /* Suppose the timed loop ran %$n$% iterations in %$t$% seconds.  Our
   * ideal time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
   * Otherwise, we need to scale up the iteration count.  The obvious next
   * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
   * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
   * progress.  We know that %$T/t > \sqrt{2}$%, so this can only happen when
   * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
   * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
   * again with %$n' = n + 1$% iterations will very likely work.
   */
  if (t->t >= 0.707*target_s) return (1);

  /* The extrapolation only makes sense for a positive elapsed time.  If the
   * clock was too coarse to register the run at all then %$t = 0$%, and
   * dividing by it would hand back an infinite iteration count.  There is
   * no rate to extrapolate from in that case, so simply double the count
   * and let the caller try again.
   */
  if (t->t > 0.0) { nn = n*target_s/t->t; modf(nn, &nn); }
  else nn = 2.0*n;

  /* Always make progress, even if rounding left us where we started. */
  *n_inout = nn > n ? nn : n + 1;
  return (0);
}

/* --- @bench_adjust@ --- *
 *
 * Arguments:	@struct bench_state *b@ = benchmark state
 *		@struct bench_timing *t_inout@ = timing to adjust
 *		@double n@ = number of external iterations performed
 *		@double base@ = number of internal operations per external
 *			iteration
 *
 * Returns:	---
 *
 * Use:		Adjusts a raw timing, as captured by @BENCH_TIMELOOP@,
 *		according to the calibration data captured in @b@.
 *		On exit, the timing data is updated, and @t_inout->n@ is set
 *		to the product @n*base@.
 */

void bench_adjust(struct bench_state *b,
		  struct bench_timing *t_inout, double n, double base)
{
  int have_cy = (t_inout->f&BTF_CYOK) != 0;

  /* Subtract the calibrated overhead -- a per-iteration slope @m@ times
   * the iteration count, plus a constant @c@ -- from the elapsed time.
   */
  t_inout->t -= n*b->clk.m + b->clk.c;

  /* The cycle count gets the same treatment, but only if the timing
   * actually carries one.  The debugging output differs accordingly.
   */
  if (have_cy) {
    t_inout->cy -= n*b->cy.m + b->cy.c;
    debug("  adjusted t' = %12g, cy' = %10.0f", t_inout->t, t_inout->cy);
    debug("  %g s (%g cy) per iter; %g iters/s",
	  t_inout->t/n, t_inout->cy/n, n/t_inout->t);
  } else {
    debug("  adjusted t' = %12g", t_inout->t);
    debug("  %g s per iter; %g iters/s", t_inout->t/n, n/t_inout->t);
  }

  /* Record the total number of internal operations performed. */
  t_inout->n = n*base;
}

/* --- @bench_measure@ --- *
 *
 * Arguments:	@struct bench_state *b@ = benchmark state
 *		@struct bench_timing *t_out@ = where to leave the timing
 *		@double base@ = number of internal units per call
 *		@bench_fn *fn@, @void *ctx@ = benchmark function to run
 *
 * Returns:	Zero on success, %$-1$% if timing failed.
 *
 * Use:		Measure a function.  The function @fn@ is called adaptively
 *		with an iteration count @n@ set so as to run for
 *		approximately @b->target_s@ seconds.
 *
 *		The result is left in @*t_out@, with @t_out->n@ counting the
 *		final product of the iteration count and @base@ (which might,
 *		e.g., reflect the number of inner iterations the function
 *		performs, or the number of bytes it processes per iteration).
 *
 *		To get useful results, the benchmark state should have been
 *		calibrated for indirect calling -- i.e., with @BTF_INDIRECT@.
 */

int bench_measure(struct bench_state *b, struct bench_timing *t_out,
		  double base, bench_fn *fn, void *ctx)
{
  /* Declarations required by @BENCH_MEASURE@.  NOTE(review): the macro
   * definitions are not in this part of the file; presumably this is where
   * @_bench_n@, used below, comes from -- confirm against @bench.h@.
   */
  BENCH_MEASURE_DECLS;
  int rc;

  /* @BENCH_MEASURE@ is followed by the statement to be timed, here a
   * single indirect call to @fn@, passing @_bench_n@ as the iteration count
   * along with the caller's context pointer.  The macro is handed @rc@ and
   * @t_out@ to fill in; @rc@ is what we return.
   */
  BENCH_MEASURE(b, rc, t_out, base) fn(_bench_n, ctx);
  return (rc);
}

/*----- Reporting ---------------------------------------------------------*/

/* --- @bench_report@ --- *
 *
 * Arguments:	@const struct gprintf_ops *gops, void *go@ = output formatter
 *		@unsigned unit@ = unit processed by the benchmark function
 *		@const struct bench_timing *t@ = benchmark result
 *
 * Returns:	---
 *
 * Use:		Format, to the output identified by @gops@ and @go@, a
 *		human-readable report of the benchmarking result @t@.  No
 *		newline is appended.
 *
 *		The output format is subject to change in later versions.
 */

void bench_report(const struct gprintf_ops *gops, void *go,
		  unsigned unit, const struct bench_timing *t)
{
  double scale, x, n = t->n;
  const char *u, *what, *whats;

  /* A timing without a valid elapsed time can't be reported. */
  assert(t->f&BTF_TIMEOK);

  /* Print the amount of work done, and choose the singular and plural unit
   * names and the scaling base used for the rate further down: 1000 for
   * operations, 1024 for bytes.
   */
  switch (unit) {
    case BTU_OP:
      gprintf(gops, go, "%.0f iterations ", n);
      what = "op"; whats = "ops"; scale = 1000;
      break;
    case BTU_BYTE:
      x = n; normalize(&x, &u, 1024); gprintf(gops, go, "%.3f %sB ", x, u);
      what = whats = "B"; scale = 1024;
      break;
    default:
      /* An unrecognized unit is a caller bug.  If assertions are compiled
       * out (@NDEBUG@) we still mustn't continue, because @what@, @whats@,
       * and @scale@ would be read uninitialized below.
       */
      assert(0);
      abort();
  }

  /* Elapsed time, and cycle count if we have one.  Each value is passed
   * through @normalize@, which leaves in @u@ the prefix string printed in
   * front of the unit name.
   */
  x = t->t; normalize(&x, &u, 1000);
  gprintf(gops, go, "in %.3f %ss", x, u);
  if (t->f&BTF_CYOK) {
    x = t->cy; normalize(&x, &u, 1000);
    gprintf(gops, go, " (%.3f %scy)", x, u);
  }
  gprintf(gops, go, ": ");

  /* Throughput (units per second), then its reciprocal (seconds per unit),
   * and finally cycles per unit if available.
   */
  x = n/t->t; normalize(&x, &u, scale);
  gprintf(gops, go, "%.3f %s%s/s", x, u, whats);
  x = t->t/n; normalize(&x, &u, 1000);
  gprintf(gops, go, ", %.3f %ss/%s", x, u, what);
  if (t->f&BTF_CYOK) {
    x = t->cy/n; normalize(&x, &u, 1000);
    gprintf(gops, go, " (%.3f %scy/%s)", x, u, what);
  }
}

/*----- That's all, folks -------------------------------------------------*/