From 6e683a79101025ee0d371f0b9bece811856edd8d Mon Sep 17 00:00:00 2001
From: Mark Wooding <mdw@distorted.org.uk>
Date: Thu, 14 Mar 2024 09:15:46 +0000
Subject: [PATCH] @@@ misc mess

---
 struct/buf.3.in     |   2 +-
 struct/dstr.3.in    |   2 +-
 test/bench.3.in     | 235 +++++++++++++--------
 test/bench.c        | 577 +++++++++++++++++++++++++++++++++++++++++-----------
 test/bench.h        |  25 ++-
 test/tvec-bench.c   |   3 +-
 test/tvec-core.c    |  14 +-
 test/tvec-env.3.in  |   8 +-
 test/tvec-remote.c  |   8 +-
 test/tvec-timeout.c |   4 +-
 test/tvec.3.in      |  12 ++
 test/tvec.h         |  33 +--
 utils/macros.3.in   |  30 ++-
 utils/macros.h      | 254 ++++++++++++++++++++---
 14 files changed, 939 insertions(+), 268 deletions(-)

diff --git a/struct/buf.3.in b/struct/buf.3.in
index c39d412..083c599 100644
--- a/struct/buf.3.in
+++ b/struct/buf.3.in
@@ -1097,7 +1097,7 @@ or the macro equivalent
 these leave the buffer in the state established by initialization:
 the buffer holds no resources, but is ready for immediate use.
 .PP
-A dynamic buffer contains an 
+A dynamic buffer contains a
 .B buf
 buffer,
 called its
diff --git a/struct/dstr.3.in b/struct/dstr.3.in
index 919bf09..e7b527e 100644
--- a/struct/dstr.3.in
+++ b/struct/dstr.3.in
@@ -365,7 +365,7 @@ functions are implemented using
 The output operations table is exposed as
 .BR dstr_printops ;
 the functions expect the output pointer to be the address of the output
-.BR dstr 
+.BR dstr .
 .PP
 The function
 .B dstr_putd
diff --git a/test/bench.3.in b/test/bench.3.in
index d5953ad..ef0b4db 100644
--- a/test/bench.3.in
+++ b/test/bench.3.in
@@ -45,11 +45,14 @@ bench \- low-level benchmarking tools
 .nf
 .B "#include <mLib/bench.h>"
 .PP
-.ta 2n
+.ta 2n +2n +2n
 .B "struct bench_time {"
 .B "	unsigned f;"
-.B "	kludge64 s;"
-.B "	uint32 ns;"
+.B "	union {"
+.B "		struct { kludge64 s; uint32 ns; } ts;"
+.B "		clock_t clk;"
+.B "		kludge64 rawns;"
+.B "	} t;"
 .B "	kludge64 cy;"
 .B "};"
 .PP
@@ -60,9 +63,18 @@ bench \- low-level benchmarking tools
 .B "	double cy;"
 .B "};"
 .PP
+.B "#define BTF_T0 0u"
+.B "#define BTF_T1 ..."
 .B "struct bench_timerops {"
 .BI "	void (*describe)(struct bench_timer *" bt ", dstr *" d );
-.BI "	void (*now)(struct bench_timer *" bt ", struct bench_time *" t_out );
+.ta 2n +\w'\fBint (*now)('u
+.BI "	int (*now)(struct bench_timer *" bt ,
+.BI "		struct bench_time *" t_out ", unsigned " f );
+.ta 2n +\w'\fBvoid (*diff)('u
+.BI "	void (*diff)(struct bench_timer *" bt ,
+.BI "		struct bench_timing *" delta_out ,
+.BI "		const struct bench_time *" t0 ,
+.BI "		const struct bench_time *" t1 );
 .BI "	void (*destroy)(struct bench_timer *" bt );
 .B "};"
 .B "struct bench_timer {"
@@ -140,49 +152,54 @@ must always point to the timer object itself.
 Write a description of the timer to the dynamic string
 .IR d .
 .TP
-.IB tm ->ops->now( tm ", " t_out)
+.IB tm ->ops->now( tm ", " t_out ", " f )
 Store the current time in
-.IR t_out .
+.BI * t_out \fR.
 The
-.B struct bench_time
-used to represent the time reported by a timer
-is described in detail below.
+.B BTF_T1
+flag is set in
+.I f
+to indicate that this is the second call in a pair;
+leave it clear for the first call.
+(A fake
+.B BTF_T0
+flag is defined to be zero for symmetry.)
+Return zero on success
+.I or
+permanent failure;
+return \-1 if timing failed but
+trying again immediately has a reasonable chance of success.
+.TP
+.IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
+Store in
+.BI * delta_out
+the difference between the two times
+.I t0
+and
+.IR t1 .
 .TP
 .IB tm ->ops->destroy( tm )
 Destroy the timer,
 releasing all of the resources that it holds.
 .PP
-A time, a reported by a timer, is represented by the
-.BR "struct bench_time" .
-A passage-of-time measurement is stored in the
-.B s
-and
-.B ns
-members, holding seconds and nanoseconds respectively.
-(A timer need not have nanosecond precision.
-The exact interpretation of the time \(en
-e.g., whether it measures wallclock time,
-user-mode CPU time,
-or total thread CPU time \(en
-is a matter for the specific timer implementation.)
-A cycle count is stored in the
-.B cy
-member.
-The
+A
+.B bench_timing
+structure reports the difference between two times,
+as determined by a timer's
+.B diff
+function.
+It has four members.
+.TP
 .B f
-member stores flags:
+A flags word.
 .B BTF_TIMEOK
-is set if the passage-of-time measurement
-.B s
-and
-.B ns
-are valid; and
+is set if the passage-of-time measurement in
+.B t
+is valid;
 .B BTF_CYOK
-is set if the cycle count
+is set if the cycle count in
 .B cy
 is valid.
-Neither the time nor the cycle count need be measured
-relative to any particular origin.
 The mask
 .B BTF_ANY
 covers the
@@ -191,9 +208,57 @@ and
 .B BTF_CYOK
 bits:
 hence,
-.IB f &BTF_ANY
+.B f&BTF_ANY
 is nonzero (true)
 if the timer returned any valid timing information.
+.TP
+.B n
+The number of iterations performed by the benchmark function
+on its satisfactory run,
+multiplied by
+.IR base .
+.TP
+.B t
+The time taken for the satisfactory run of the benchmark function,
+in seconds.
+Only valid if
+.B BTF_TIMEOK
+is set in
+.BR f .
+.TP
+.B cy
+The number of CPU cycles used
+in the satisfactory run
+of the benchmark function.
+Only valid if
+.B BTF_CYOK
+is set in
+.BR f .
+.PP
+A
+.B "struct bench_time"
+represents a single instant in time,
+as captured by a timer's
+.B now
+function.
+The use of this structure is a private matter for the timer:
+the only hard requirement is that the
+.B diff
+function should be able to compute the difference between two times.
+However, the intent is that
+a passage-of-time measurement is stored in the
+.B t
+union,
+a cycle count is stored in the
+.B cy
+member, and
+the
+.B f
+member stores flags
+.B BTF_TIMEOK
+and/or
+.B BTF_CYOK
+if the passage-of-time or cycle count respectively are valid.
 .
 .SS The built-in timer
 The function
@@ -249,6 +314,10 @@ then construction of the timer as a whole fails.
 The clock subtimers are as follows.
 Not all of them will be available on every platform.
 .TP
+.B linux-x86-perf-rdpmc-hw-cycles
+This is a dummy companion to the similarly named cycle subtimer;
+see its description below.
+.TP
 .B posix-thread-cputime
 Measures the passage of time using
 .BR clock_gettime (2),
@@ -269,8 +338,8 @@ if other threads are running.
 The cycle subtimers are as follows.
 Not all of them will be available on every platform.
 .TP
-.B linux-perf-event
-Counts CPU cycles using the Linux-specific 
+.B linux-perf-read-hw-cycles
+Counts CPU cycles using the Linux-specific
 .BR perf_event_open (2)
 function to read the
 .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
@@ -282,13 +351,48 @@ e.g., because the
 .B /proc/sys/kernel/perf_event_paranoid
 level is too high.
 .TP
-.B x86-rdtsc
-Counts CPU cycles using the x86
+.B linux-x86-perf-rdpmc-hw-cycles
+Counts CPU cycles using the Linux-specific
+.BR perf_event_open (2)
+function,
+as for
+.B linux-perf-read-hw-cycles
+above,
+except that it additionally uses the i386/AMD64
 .B rdtsc
+and
+.B rdpmc
+instructions,
+together with information provided by the kernel
+through a memory-mapped page
+to do its measurements without any system call overheads.
+It does passage-of-time and cycle counting in a single operation,
+so no separate clock subtimer is required:
+the similarly-named clock subtimer does nothing
+except check that the
+.B linux-x86-perf-rdpmc-hw-cycles
+cycle subtimer has been selected.
+This is almost certainly the best choice if it's available.
+.TP
+.B x86-rdtscp
+Counts CPU cycles using the x86
+.B rdtscp
 instruction.
 This instruction is not really suitable for performance measurement:
 it gives misleading results on CPUs with variable clock frequency.
 .TP
+.B x86-rdtsc
+Counts CPU cycles using the x86
+.B rdtsc
+instruction.
+This has the downsides of
+.B rdtscp
+above,
+but also fails to detect when the thread has been suspended
+or transferred to a different CPU core
+and gives misleading answers in this case.
+Not really recommended.
+.TP
 .B null
 A dummy cycle counter,
 which will initialize successfully
@@ -297,15 +401,21 @@ This is a reasonable fallback in many situations.
 .PP
 The built-in preference order for clock subtimers,
 from most to least preferred, is
-.B posix-thread-cputime
+.BR linux-x86-perf-rdpmc-hw-cycles ,
 followed by
+.BR posix-thread-cputime ,
+and finally
 .BR stdc-clock .
 The built-in preference order for cycle subtimers,
 from most to least preferred, is
-.B linux-perf-event
+.BR linux-x86-perf-rdpmc-hw-cycles
+then
+.BR linux-perf-read-hw-cycles ,
 followed by
+.BR x86-rdtscp ,
+and
 .BR x86-rdtsc ,
-and then
+and finally
 .BR null .
 .
 .SS The benchmark state
@@ -483,45 +593,6 @@ returns zero.
 If it fails \(en
 most likely because the timer failed \(en
 then it returns \-1.
-.PP
-A
-.B bench_timing
-structure reports the outcome of a successful measurement.
-It has four members.
-.TP
-.B f
-A flags word.
-.B BTF_TIMEOK
-is set if the passage-of-time measurement in 
-.B t
-is valid;
-.B BTF_CYOK
-is set if the cycle count in
-.B cy
-is valid.
-.TP
-.B n
-The number of iterations performed by the benchmark function
-on its satisfactory run,
-multiplied by
-.IR base .
-.TP
-.B t
-The time taken for the satisfactory run of the benchmark function,
-in seconds.
-Only valid if
-.B BTF_TIMEOK
-is set in
-.BR f .
-.TP
-.B cy
-The number of CPU cycles used
-in the satisfactory run of the benchmark function,
-in seconds.
-Only valid if
-.B BTF_CYOK
-is set in
-.BR f .
 .
 .\"--------------------------------------------------------------------------
 .SH "SEE ALSO"
diff --git a/test/bench.c b/test/bench.c
index 7fedc9d..9cb84e5 100644
--- a/test/bench.c
+++ b/test/bench.c
@@ -31,6 +31,7 @@
 
 #include <ctype.h>
 #include <errno.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -44,6 +45,22 @@
 #include "linreg.h"
 #include "macros.h"
 
+#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
+#  include <cpuid.h>
+#  define CPUID_1D_TSC (1u << 4)
+#  define CPUID_1xD_TSCP (1u << 27)
+#endif
+
+#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
+#  include <sys/types.h>
+#  include <unistd.h>
+#  include <linux/perf_event.h>
+#  include <asm/unistd.h>
+#  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
+#    include <sys/mman.h>
+#  endif
+#endif
+
 /*----- Data structures ---------------------------------------------------*/
 
 enum { CLK, CY, NTIMER };
@@ -51,7 +68,12 @@ enum { CLK, CY, NTIMER };
 struct timer {
   struct bench_timer _t;
   const struct timer_ops *ops[NTIMER];	/* subtimers for clock and cycles */
-  union { int fd; } u_cy;		/* state for cycle measurement */
+  union {
+    unsigned tscaux;			/* `ia32_tsc_aux' for `rdtscp' */
+    int fd;				/* vanilla `perf_event_open' */
+    struct { const volatile void *map; size_t sz; } pmc; /* `perf_event_open'
+					 * with `rdpmc' */
+  } u_cy;				/* state for cycle measurement */
 };
 
 struct timer_ops {
@@ -59,8 +81,13 @@ struct timer_ops {
   unsigned f;				/* flags */
 #define TF_SECRET 1u			/*   don't try this automatically */
   int (*init)(struct timer */*t*/);	/* initialization function */
-  void (*now)(struct bench_time *t_out, struct timer *t); /* read current */
-  void (*teardown)(struct timer *t);	/* release held resources */
+  int (*now)(struct timer */*t*/,	/* read current */
+	     struct bench_time */*t_out*/, unsigned /*f*/);
+  void (*diff)(struct timer */*t*/,	/* difference */
+	       struct bench_timing */*t_inout*/,
+	       const struct bench_time */*t0*/,
+	       const struct bench_time */*t1*/);
+  void (*teardown)(struct timer */*t*/); /* release held resources */
 };
 
 /*----- Preliminaries -----------------------------------------------------*/
@@ -92,52 +119,75 @@ static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
   }
 }
 
-/* --- @timer_diff@ --- *
+/*----- Difference utilities ----------------------------------------------*/
+
+#ifdef HAVE_UINT64
+#  define FLOATK64(k) ((double)(k).i)
+#else
+#  define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
+#endif
+
+/* --- @diff_ts@ --- *
  *
- * Arguments:	@struct bench_timing *delta_out@ = where to putt the result
- *		@const struct bench_time *t0, *t1@ = two times captured by a
- *			timer's @now@ function
+ * Arguments:	@struct timer *t@ = timer structure
+ *		@struct bench_timing *delta_inout@ = where to put the result
+ *		@const struct bench_time *t0, *t1@ = two input times
  *
  * Returns:	---
  *
- * Use:		Calculates the difference between two captured times.  The
- *		flags are set according to whether the differences are
- *		meaningful; @delta_out->n@ is left unset.
+ * Use:		Calculates a time difference for timers using the
+ *		@struct timespec@-like time format.
  */
 
-static void timer_diff(struct bench_timing *delta_out,
-		       const struct bench_time *t0,
-		       const struct bench_time *t1)
+static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
+		    const struct bench_time *t0, const struct bench_time *t1)
 {
   unsigned f = t0->f&t1->f;
   kludge64 k;
 
-#ifdef HAVE_UINT64
-#  define FLOATK64(k) ((double)(k).i)
-#else
-#  define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi)
-#endif
+  if (f&BTF_TIMEOK) {
 
-  if (!(f&BTF_TIMEOK))
-    delta_out->t = 0.0;
-  else {
-    SUB64(k, t1->s, t0->s);
-    delta_out->t = FLOATK64(k) - 1 +
-      (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S;
-  }
+    /* Calculate the integer difference in seconds. */
+    SUB64(k, t1->t.ts.s, t0->t.ts.s);
 
-  if (!(f&BTF_CYOK))
-    delta_out->cy = 0.0;
-  else {
-    SUB64(k, t1->cy, t0->cy);
-    delta_out->cy = FLOATK64(k);
+    /* And apply the nanoseconds difference.  To prevent underflow,
+     * pre-emptively borrow one from the integer difference.
+     */
+    delta_inout->t =
+      FLOATK64(k) - 1.0 +
+      (t1->t.ts.ns + NS_PER_S - t0->t.ts.ns)/(double)NS_PER_S;
+
+    /* Done. */
+    delta_inout->f |= BTF_TIMEOK;
   }
+}
 
-  delta_out->f = f;
+/* --- @diff_cycles@ --- *
+ *
+ * Arguments:	@struct timer *t@ = timer structure
+ *		@struct bench_timing *delta_inout@ = where to put the result
+ *		@const struct bench_time *t0, *t1@ = two input times
+ *
+ * Returns:	---
+ *
+ * Use:		Calculates a time difference for cycle-counting timers.
+ */
 
-#undef FLOATK64
+static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
+			const struct bench_time *t0,
+			const struct bench_time *t1)
+{
+  unsigned f = t0->f&t1->f;
+  kludge64 k;
+
+  if (f&BTF_CYOK) {
+    SUB64(k, t1->cy, t0->cy); delta_inout->cy = FLOATK64(k);
+    delta_inout->f |= BTF_CYOK;
+  }
 }
 
+#undef FLOATK64
+
 /*----- The null timer ----------------------------------------------------*/
 
 /* This is a timer which does nothing, in case we don't have any better
@@ -145,11 +195,16 @@ static void timer_diff(struct bench_timing *delta_out,
  */
 
 static int null_init(struct timer *t) { return (0); }
-static void null_now(struct bench_time *t_out, struct timer *t) { ; }
+static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
+  { return (0); }
+static void null_diff(struct timer *t, struct bench_timing *delta_inout,
+		      const struct bench_time *t0,
+		      const struct bench_time *t1)
+  { ; }
 static void null_teardown(struct timer *t) { ; }
 
 static const struct timer_ops null_ops =
-  { "null", 0, null_init, null_now, null_teardown };
+  { "null", 0, null_init, null_now, null_diff, null_teardown };
 #define NULL_ENT &null_ops,
 
 /*----- The broken clock --------------------------------------------------*/
@@ -161,7 +216,7 @@ static const struct timer_ops null_ops =
 static int broken_init(struct timer *t) { return (-1); }
 
 static const struct timer_ops broken_ops =
-  { "broken", TF_SECRET, broken_init, null_now, null_teardown };
+  { "broken", TF_SECRET, broken_init, null_now, null_diff, null_teardown };
 #define BROKEN_ENT &broken_ops,
 
 /*----- Linux performance counters ----------------------------------------*/
@@ -172,22 +227,48 @@ static const struct timer_ops broken_ops =
 
 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
 
-#include <sys/types.h>
-#include <unistd.h>
+/* --- @perfevent_open@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	File descriptor, or %$-1$%.
+ *
+ * Use:		Open a performance measurement descriptor set up to count CPU
+ *		cycles.
+ */
 
-#include <linux/perf_event.h>
-#include <asm/unistd.h>
+static int perfevent_open(void)
+{
+  struct perf_event_attr attr = { 0 };
+  int fd;
 
-static void perfevent_now(struct bench_time *t_out, struct timer *t)
+  attr.type = PERF_TYPE_HARDWARE;
+  attr.size = sizeof(attr);
+  attr.config = PERF_COUNT_HW_CPU_CYCLES;
+  attr.disabled = 0;
+  attr.exclude_kernel = 1;
+  attr.exclude_hv = 1;
+
+  fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
+  if (fd < 0) {
+    debug("couldn't open perf event: %s", strerror(errno));
+    return (-1);
+  }
+
+  return (fd);
+}
+
+static int perfevent_now(struct timer *t,
+			 struct bench_time *t_out, unsigned f)
 {
   ssize_t n;
 
   n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
     if (n != sizeof(t_out->cy.i)) {
       debug("failed to read perf-event counter: %s", strerror(errno));
-      return;
+      return (0);
     }
-  t_out->f |= BTF_CYOK;
+  t_out->f |= BTF_CYOK; return (0);
 }
 
 static void perfevent_teardown(struct timer *t)
@@ -195,34 +276,201 @@ static void perfevent_teardown(struct timer *t)
 
 static int perfevent_init(struct timer *t)
 {
-  struct perf_event_attr attr = { 0 };
   struct bench_time tm;
+  int fd = -1, rc;			/* @fd@ is ours until handed to @t@ */
 
-  attr.type = PERF_TYPE_HARDWARE;
-  attr.size = sizeof(attr);
-  attr.config = PERF_COUNT_HW_CPU_CYCLES;
-  attr.disabled = 0;
-  attr.exclude_kernel = 1;
-  attr.exclude_hv = 1;
+  fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }
 
-  t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
-  if (t->u_cy.fd < 0) {
-    debug("couldn't open perf evvent: %s", strerror(errno));
-    return (-1);
+  t->u_cy.fd = fd; tm.f = 0; perfevent_now(t, &tm, 0); /* trial read */
+  if (!(tm.f&BTF_CYOK)) { rc = -1; goto end; }
+  fd = -1; rc = 0;			/* success: @t@ now owns the fd */
+end:
+  if (fd != -1) close(fd);
+  return (rc);
+}
+
+static const struct timer_ops perfevent_ops =
+  { "linux-perf-read-hw-cycles", 0,
+    perfevent_init, perfevent_now, diff_cycles, perfevent_teardown };
+#define PERFEVENT_VANILLA_CYENT &perfevent_ops,
+
+#  if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
+
+/* Special syscall-free version for x86 using `rdpmc' instruction. *
+ *
+ * This is a bit weird because it does both kinds of measurement in a single
+ * operation.
+ */
+
+static int perfevrdpmc_now(struct timer *t,
+			   struct bench_time *t_out, unsigned f)
+{
+  const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
+  unsigned long long tsc = tsc, toff = toff, tenb = tenb;
+  unsigned long long cy = cy, cyoff = cyoff;
+  unsigned long long m, hi, lo;
+  unsigned tshift = tshift, tmult = tmult, q0, q1, ff;
+
+  /* Repeat until we can complete this job without the buffer changing in the
+   * middle.
+   */
+  q0 = map->lock;
+  __atomic_thread_fence(__ATOMIC_ACQ_REL);
+  for (;;) {
+    ff = 0;
+
+    /* Read the passage-of-time information. */
+    if (map->cap_user_time) {
+      tenb = map->time_enabled;
+      tsc = __builtin_ia32_rdtsc();
+      tshift = map->time_shift;
+      tmult = map->time_mult;
+      toff = map->time_offset;
+      ff |= BTF_TIMEOK;
+    }
+
+    /* Read the performance-counter information. */
+    if (map->cap_user_rdpmc) {
+      cy = __builtin_ia32_rdpmc(map->index - 1);
+      cyoff = map->offset;
+      ff |= BTF_CYOK;
+    }
+
+    /* Check the sequence number again. */
+    __atomic_thread_fence(__ATOMIC_ACQ_REL);
+    q1 = map->lock;
+    if (q0 == q1) break;
+    q0 = q1;
+  }
+
+  if (ff&BTF_TIMEOK) {
+    /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
+     * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
+     * nanoseconds.
+     */
+
+    m = (1ull << tshift) - 1;
+    hi = tsc >> tshift; lo = tsc&m;
+    t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
+    t_out->f |= BTF_TIMEOK;
   }
 
-  tm.f = 0; perfevent_now(&tm, t);
-  if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); }
+  if (ff&BTF_CYOK) {
+    /* We have the cycle count. */
 
+    t_out->cy.i = cy + cyoff;
+    t_out->f |= BTF_CYOK;
+  }
   return (0);
 }
 
-static const struct timer_ops perfevent_ops =
-  { "linux-perf-hw-cycles", 0,
-    perfevent_init, perfevent_now, perfevent_teardown };
+static void perfevrdpmc_diff(struct timer *t,
+			     struct bench_timing *delta_inout,
+			     const struct bench_time *t0,
+			     const struct bench_time *t1)
+{
+  unsigned f = t0->f&t1->f;
 
-#  define PERFEVENT_CYENT &perfevent_ops,
+  if (f&BTF_TIMEOK) {
+    delta_inout->t = (t1->t.rawns.i - t0->t.rawns.i)/(double)NS_PER_S;
+    delta_inout->f |= BTF_TIMEOK;
+  }
+
+  if (f&BTF_CYOK) {
+    delta_inout->cy = t1->cy.i - t0->cy.i;
+    delta_inout->f |= BTF_CYOK;
+  }
+}
+
+static void perfevrdpmc_teardown(struct timer *t)
+  { munmap((/*unconst unvolatile*/ void *)t->u_cy.pmc.map, t->u_cy.pmc.sz); }
+
+static int perfevrdpmc_cyinit(struct timer *t)
+{
+  const volatile struct perf_event_mmap_page *map = 0;
+  unsigned a, b, c, d, q0, q1, f;
+  int pgsz, mapsz, fd = -1, rc;
+
+  /* We need `rdtsc' to do the passage-of-time measurement. */
+  if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
+    { debug("no `rdtsc' instrunction"); return (-1); }
+
+  /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
+   * big a page is.
+   */
+  pgsz = sysconf(_SC_PAGESIZE);
+    if (pgsz < 0) {
+      debug("failed to discover page size!: %s", strerror(errno));
+      rc = -1; goto end;
+    }
+
+  /* Open the measurement descriptor and map it.  On failure, leave via
+   * @end@ so that the descriptor is closed; clear @map@ first so that we
+   * don't try to unmap @MAP_FAILED@.
+   */
+  fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }
+  mapsz = 2*pgsz; map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
+    if (map == MAP_FAILED) {
+      debug("failed to map perf event: %s", strerror(errno));
+      map = 0; rc = -1; goto end;
+    }
+  /* Check that it's revealed the necessary information. */
+  q0 = map->lock;
+  __atomic_thread_fence(__ATOMIC_ACQ_REL);
+  for (;;) {
+    f = 0;
+    if (map->cap_user_time) f |= BTF_TIMEOK;
+    if (map->cap_user_rdpmc) f |= BTF_CYOK;
+    __atomic_thread_fence(__ATOMIC_ACQ_REL);
+    q1 = map->lock;
+    if (q0 == q1) break;
+    q0 = q1;
+  }
+  if (!(f&BTF_TIMEOK))
+    { debug("kernel refused user time measurement"); rc = -1; goto end; }
+  if (!(f&BTF_CYOK))
+    { debug("kernel refused user cycle measurement"); rc = -1; goto end; }
+
+  /* All done.  We can close the descriptor here: the mapping will keep the
+   * performance-measurement machinery alive.
+   */
+  t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0; rc = 0;
+end:
+  if (fd != -1) close(fd);
+  if (map) munmap((/*unconst unvolatile*/ void *)map, mapsz);
+  return (rc);
+}
+
+static const struct timer_ops perfevrdpmc_cyops =
+  { "linux-x86-perf-rdpmc-hw-cycles", 0,
+    perfevrdpmc_cyinit, perfevrdpmc_now,
+    perfevrdpmc_diff, perfevrdpmc_teardown };
+
+static int perfevrdpmc_clkinit(struct timer *t)
+{
+  if (t->ops[CY] != &perfevrdpmc_cyops) { /* the cycle slot does the work */
+    debug("linux-x86-perf-rdpmc-hw-cycles not set as cycle subtimer");
+    return(-1);
+  }
+  return (0);
+}
+
+static const struct timer_ops perfevrdpmc_clkops =
+  { "linux-x86-perf-rdpmc-hw-cycles", 0,
+    perfevrdpmc_clkinit, null_now,
+    null_diff, null_teardown };
+
+#    define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
+#    define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
+
+#  else
+#    define PERFEVENT_RDPMC_CLKENT
+#    define PERFEVENT_RDPMC_CYENT
+#  endif
+
+#  define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
+#  define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
 #else
+#  define PERFEVENT_CLKENT
 #  define PERFEVENT_CYENT
 #endif
 
@@ -233,14 +481,11 @@ static const struct timer_ops perfevent_ops =
  * CPU frequency adjustments.
  */
 
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
 
-#include <cpuid.h>
-
-#define CPUID_1D_TSC (1u << 4)
-
-static void x86rdtsc_now(struct bench_time *t_out, struct timer *t)
-  { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; }
+static int x86rdtsc_now(struct timer *t,
+			struct bench_time *t_out, unsigned f)
+  { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); }
 
 static int x86rdtsc_init(struct timer *t)
 {
@@ -248,13 +493,44 @@ static int x86rdtsc_init(struct timer *t)
 
   if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
     { debug("no `rdtsc' instrunction"); return (-1); }
+  t->u_cy.tscaux = ~0u;
+  return (0);
+}
+
+static int x86rdtscp_now(struct timer *t,
+			 struct bench_time *t_out, unsigned f)
+{
+  unsigned tscaux;
+  unsigned long long n;
+
+  n = __builtin_ia32_rdtscp(&tscaux);
+  if (!(f&BTF_T1))
+    t->u_cy.tscaux = tscaux;
+  else if (t->u_cy.tscaux != tscaux) {
+    debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
+	  tscaux, t->u_cy.tscaux);
+    return (-1);
+  }
+  t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
+}
+
+static int x86rdtscp_init(struct timer *t)
+{
+  unsigned a, b, c, d;
+
+  if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
+    { debug("no `rdtscp' instrunction"); return (-1); }
   return (0);
 }
 
 static const struct timer_ops x86rdtsc_ops =
-  { "x86-rdtsc", 0, x86rdtsc_init, x86rdtsc_now, null_teardown };
+  { "x86-rdtsc", 0,
+    x86rdtsc_init, x86rdtsc_now, diff_cycles, null_teardown };
+static const struct timer_ops x86rdtscp_ops =
+  { "x86-rdtscp", 0,
+    x86rdtscp_init, x86rdtscp_now, diff_cycles, null_teardown };
 
-#  define X86RDTSC_CYENT &x86rdtsc_ops,
+#  define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
 #else
 #  define X86RDTSC_CYENT
 #endif
@@ -267,26 +543,27 @@ static const struct timer_ops x86rdtsc_ops =
 
 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
 
-static void gettime_now(struct bench_time *t_out, struct timer *t)
+static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
 {
   struct timespec now;
 
   if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
-    { debug("error reading POSIX clock: %s", strerror(errno)); return; }
-  ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec;
-  t_out->f |= BTF_TIMEOK;
+    { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
+  ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
+  t_out->f |= BTF_TIMEOK; return (0);
 }
 
 static int gettime_init(struct timer *t)
 {
   struct bench_time tm;
 
-  tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
+  tm.f = 0; gettime_now(t, &tm, 0); if (!(tm.f&BTF_TIMEOK)) return (-1);
   return (0);
 }
 
 static const struct timer_ops gettime_ops =
-  { "posix-thread-cputime", 0, gettime_init, gettime_now, null_teardown };
+  { "posix-thread-cputime", 0,
+    gettime_init, gettime_now, diff_ts, null_teardown };
 
 #  define GETTIME_CLKENT &gettime_ops,
 #else
@@ -299,39 +576,40 @@ static const struct timer_ops gettime_ops =
  * guaranteed to be available, though it's not likely to be very good.
  */
 
-static void clock_now(struct bench_time *t_out, struct timer *t)
+static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
 {
-  clock_t now, x;
-  unsigned long s; uint32 ns;
+  clock_t now;
 
   now = clock();
     if (now == (clock_t)-1) {
       debug("error reading standard clock: %s", strerror(errno));
-      return;
+      return (0);
     }
-  x = now/CLOCKS_PER_SEC;
-    if (x > ULONG_MAX) { debug("standard clock out of range"); return; }
-
-  s = x; x = now - CLOCKS_PER_SEC*s;
-  if (!(NS_PER_S%CLOCKS_PER_SEC))
-    ns = x*(NS_PER_S/CLOCKS_PER_SEC);
-  else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC)
-    ns = (x*NS_PER_S)/CLOCKS_PER_SEC;
-  else
-    ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC);
-  ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK;
+  t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0);
+}
+
+static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
+			const struct bench_time *t0,
+			const struct bench_time *t1)
+{
+  unsigned f = t0->f&t1->f;
+
+  if (f&BTF_TIMEOK) {
+    delta_inout->t = (t1->t.clk - t0->t.clk)/(double)CLOCKS_PER_SEC;
+    delta_inout->f |= BTF_TIMEOK;
+  }
 }
 
 static int clock_init(struct timer *t)
 {
   struct bench_time tm;
 
-  tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1);
+  tm.f = 0; clock_now(t, &tm, 0); if (!(tm.f&BTF_TIMEOK)) return (-1);
   return (0);
 }
 
 static const struct timer_ops clock_ops =
-  { "stdc-clock", 0, clock_init, clock_now, null_teardown };
+  { "stdc-clock", 0, clock_init, clock_now, clock_diff, null_teardown };
 
 #define CLOCK_CLKENT &clock_ops,
 
@@ -339,8 +617,16 @@ static const struct timer_ops clock_ops =
 
 /* Tables of timing sources. */
 static const struct timer_ops
-  *const clktab[] = { GETTIME_CLKENT CLOCK_CLKENT BROKEN_ENT 0 },
-  *const cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_ENT BROKEN_ENT 0 };
+  *const clktab[] = { PERFEVENT_CLKENT
+		      GETTIME_CLKENT
+		      CLOCK_CLKENT
+		      BROKEN_ENT
+		      0 },
+  *const cytab[] = { PERFEVENT_CYENT
+		     X86RDTSC_CYENT
+		     NULL_ENT
+		     BROKEN_ENT
+		     0 };
 
 static const struct timertab {
   const char *what;
@@ -444,12 +730,27 @@ static void timer_describe(struct bench_timer *tm, dstr *d)
   }
 }
 
-static void timer_now(struct bench_timer *tm, struct bench_time *t_out)
+static int timer_now(struct bench_timer *tm,
+		     struct bench_time *t_out, unsigned f)
 {
   struct timer *t = (struct timer *)tm;
   unsigned i;
 
-  for (i = 0; i < NTIMER; i++) t->ops[i]->now(t_out, t);
+  t_out->f = 0;
+  for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1);
+  return (0);
+}
+
+static void timer_diff(struct bench_timer *tm,
+		       struct bench_timing *t_out,
+		       const struct bench_time *t0,
+		       const struct bench_time *t1)
+{
+  struct timer *t = (struct timer *)tm;
+  unsigned i;
+
+  t_out->f = 0;
+  for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1);
 }
 
 static void timer_destroy(struct bench_timer *tm)
@@ -464,7 +765,7 @@ static void timer_destroy(struct bench_timer *tm)
 }
 
 static const struct bench_timerops timer_ops =
-  { timer_describe, timer_now, timer_destroy };
+  { timer_describe, timer_now, timer_diff, timer_destroy };
 
 /* --- @bench_createtimer@ --- *
  *
@@ -564,7 +865,7 @@ struct bench_timer *bench_createtimer(const char *config)
   for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
 
   /* Try to set up the subtimers. */
-  for (i = 0; i < NTIMER; i++)
+  for (i = NTIMER; i--; )
     if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;
 
   /* All is done. */
@@ -638,6 +939,48 @@ void bench_destroy(struct bench_state *b)
 static void do_nothing(unsigned long n, void *ctx)
   { while (n--) RELAX; }
 
+/* --- @measure@ --- *
+ *
+ * Arguments:	@struct bench_state *b@ = bench state
+ *		@struct bench_timing *delta_out@ = where to leave the timing
+ *		@bench_fn *fn@ = function to measure
+ *		@void *ctx@ = context for the function
+ *		@double n@ = number of iterations
+ *
+ * Returns:	---
+ *
+ * Use:		Run the function @n@ times, and report how long it took.
+ *
+ *		This function deals with retrying the measurements if the
+ *		timer reports a temporary failure, and all of the
+ *		difficulties if @n@ is too large to fit in a machine integer.
+ */
+
+static void measure(struct bench_state *b, struct bench_timing *delta_out,
+		    bench_fn *fn, void *ctx, double n)
+{
+  struct bench_timer *tm = b->tm;
+  struct bench_time t0, t1;
+  unsigned long n0, n1;
+  double R = ULONG_MAX;
+
+  if (n <= R) {
+    n0 = n;
+    do {
+      while (tm->ops->now(tm, &t0, BTF_T0));
+      fn(n0, ctx);
+    } while (tm->ops->now(tm, &t1, BTF_T1));
+  } else {
+    n1 = n/R; n0 = n - n1*R;
+    do {
+      while (tm->ops->now(tm, &t0, BTF_T0));
+      while (n1--) fn(ULONG_MAX, ctx);
+      fn(n0, ctx);
+    } while (tm->ops->now(tm, &t1, BTF_T1));
+  }
+  tm->ops->diff(tm, delta_out, &t0, &t1);
+}
+
 /* --- @bench_calibrate@ --- *
  *
  * Arguments:	@struct bench_state *b@ = bench state
@@ -653,14 +996,10 @@ static void do_nothing(unsigned long n, void *ctx)
 int bench_calibrate(struct bench_state *b)
 {
   struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
-  unsigned long n;
-  unsigned i;
-  struct bench_timer *tm = b->tm;
-  struct bench_time t0, t1;
   struct bench_timing delta;
-  double r;
+  double n, r;
   bench_fn *fn = LAUNDER(&do_nothing);
-  unsigned f = BTF_ANY;
+  unsigned i, f = BTF_ANY;
   int rc;
 
   /* The model here is that a timing loop has a fixed overhead as we enter
@@ -673,28 +1012,26 @@ int bench_calibrate(struct bench_state *b)
   if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
 
   /* Exercise the inner loop a few times to educate the branch predictor. */
-  for (i = 0; i < 10; i++)
-    { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); }
+  for (i = 0; i < 50; i++) measure(b, &delta, fn, 0, 10000);
 
   /* Now we measure idle loops until they take sufficiently long -- or we run
    * out of counter.
    */
   debug("calibrating...");
-  n = 1;
+  n = 1.0;
   for (;;) {
 
     /* Measure @n@ iterations of the idle loop. */
-    tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1);
-    timer_diff(&delta, &t0, &t1); f &= delta.f;
+    measure(b, &delta, fn, 0, n); f &= delta.f;
     if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }
 
     /* Register the timings with the regression machinery. */
     linreg_update(&lr_clk, n, delta.t);
     if (!(f&BTF_CYOK))
-      debug("  n = %10lu; t = %12g s", n, delta.t);
+      debug("  n = %10.0f; t = %12g s", n, delta.t);
     else {
       linreg_update(&lr_cy, n, delta.cy);
-      debug("  n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
+      debug("  n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
     }
 
     /* If we're done then stop. */
@@ -702,7 +1039,7 @@ int bench_calibrate(struct bench_state *b)
     if (n >= ULONG_MAX - n/3) break;
 
     /* Update the counter and continue. */
-    n += n/3 + 1;
+    n += n/3.0 + 1.0;
   }
 
   /* Now run the linear regression to extract the constant and per-iteration
@@ -744,9 +1081,7 @@ end:
 int bench_measure(struct bench_state *b, struct bench_timing *t_out,
 		  double base, bench_fn *fn, void *ctx)
 {
-  struct bench_timer *tm = b->tm;
-  struct bench_time t0, t1;
-  unsigned long n, nn;
+  double n, nn;
 
   /* Make sure the state is calibrated and usable. */
   if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1);
@@ -764,16 +1099,18 @@ int bench_measure(struct bench_state *b, struct bench_timing *t_out,
    * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
    * again with %$n' = n + 1$% iterations will very likely work.
    */
-  debug("measuring..."); n = 1;
+  debug("measuring..."); n = 1.0;
   for (;;) {
-    tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1);
-    timer_diff(t_out, &t0, &t1);
+    measure(b, t_out, fn, ctx, n); t_out->f &= b->f;
     if (!(t_out->f&BTF_TIMEOK)) return (-1);
-    if (!(t_out->f&BTF_CYOK)) debug("  n = %10lu; t = %12g", n, t_out->t);
-    else debug("  n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
+    if (!(t_out->f&BTF_CYOK))
+      debug("  n = %10.0f; t = %12g", n, t_out->t);
+    else
+      debug("  n = %10.0f; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
+
     if (t_out->t >= 0.707*b->target_s) break;
     nn = n*b->target_s/t_out->t;
-    if (nn > n) n = nn;
+    if (n > ULONG_MAX || nn > (unsigned long)n + 1) n = nn;
     else n++;
   }
 
diff --git a/test/bench.h b/test/bench.h
index 0645068..2484584 100644
--- a/test/bench.h
+++ b/test/bench.h
@@ -34,6 +34,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include <time.h>
+
 #ifndef MLIB_BITS_H
 #  include "bits.h"
 #endif
@@ -49,7 +51,11 @@ struct bench_time {
 #define BTF_TIMEOK 1u			/*   @s@ ad @ns@ slots are value */
 #define BTF_CYOK 2u			/*   @cy@ slot is valid */
 #define BTF_ANY (BTF_TIMEOK | BTF_CYOK)	/*   some part is useful */
-  kludge64 s; uint32 ns;		/* real time, seconds and nanos */
+  union {
+    struct { kludge64 s; uint32 ns; } ts; /* @struct timespec@-ish */
+    clock_t clk;			/* @clock@ */
+    kludge64 rawns;			/* raw nanosecond count */
+  } t;					/* time */
   kludge64 cy;				/* count of CPU cycles */
 };
 
@@ -64,8 +70,21 @@ struct bench_timerops {
   void (*describe)(struct bench_timer */*bt*/, dstr */*d*/);
     /* Write a description of the timer to @d@. */
 
-  void (*now)(struct bench_timer */*bt*/, struct bench_time */*t_out*/);
-    /* Fill in @*t_out@ with the current time. v*/
+  int (*now)(struct bench_timer */*bt*/, struct bench_time */*t_out*/,
+	      unsigned /*f*/);
+#define BTF_T0 0u			/* fetching first time of a pair */
+#define BTF_T1 1u			/* fetching second time of a pair */
+    /* Fill in @*t_out@ with the current time.  Return zero on success
+     * %%\emph{or}%% permanent failure; return %$-1$% on temporary failure.
+     */
+
+  void (*diff)(struct bench_timer */*bt*/,
+	       struct bench_timing */*delta_out*/,
+	       const struct bench_time */*t0*/,
+	       const struct bench_time */*t1*/);
+    /* Subtract the time @t0@ from the time @t1@, leaving the result in
+     * @*delta_out@, setting flags as appropriate.
+     */
 
   void (*destroy)(struct bench_timer */*bt*/);
     /* Release the timer and any resources it holds. */
diff --git a/test/tvec-bench.c b/test/tvec-bench.c
index bff3c3a..f61d5e1 100644
--- a/test/tvec-bench.c
+++ b/test/tvec-bench.c
@@ -287,7 +287,7 @@ static int setvar(struct tvec_state *tv, const char *var,
   struct tvec_benchctx *bc = ctx;
 
   if (STRCMP(var, ==, "@target")) {
-    if (bc->f&TVBF_SETTRG) return (tvec_dupreg(tv, var));
+    if (bc->f&TVBF_SETTRG) return (tvec_dupregerr(tv, var));
     bc->bst->target_s = rv->f; bc->f |= TVBF_SETTRG;
   } else assert("unknown var");
   return (0);
@@ -385,6 +385,7 @@ void tvec_benchrun(struct tvec_state *tv, tvec_testfn *fn, void *ctx)
       rd->ty->dump(&TVEC_REG(tv, in, rd->i)->v, rd,
 		   TVSF_COMPACT, &dstr_printops, &d);
     }
+  DPUTZ(&d);
 
   /* Run the benchmark. */
   o->ops->bbench(o, d.buf, unit);
diff --git a/test/tvec-core.c b/test/tvec-core.c
index e4f9f84..fc1b413 100644
--- a/test/tvec-core.c
+++ b/test/tvec-core.c
@@ -318,7 +318,7 @@ int tvec_syntax_v(struct tvec_state *tv, int ch,
   dstr_destroy(&d); return (-1);
 }
 
-/* --- @tvec_unkreg@ --- *
+/* --- @tvec_unkregerr@ --- *
  *
  * Arguments:	@struct tvec_state *tv@ = test-vector state
  *		@const char *name@ = register or pseudoregister name
@@ -329,13 +329,13 @@ int tvec_syntax_v(struct tvec_state *tv, int ch,
  *		unrecognized.
  */
 
-int tvec_unkreg(struct tvec_state *tv, const char *name)
+int tvec_unkregerr(struct tvec_state *tv, const char *name)
 {
   return (tvec_error(tv, "unknown special register `%s' for test `%s'",
 		     name, tv->test->name));
 }
 
-/* --- @tvec_dupreg@ --- *
+/* --- @tvec_dupregerr@ --- *
  *
  * Arguments:	@struct tvec_state *tv@ = test-vector state
  *		@const char *name@ = register or pseudoregister name
@@ -346,7 +346,7 @@ int tvec_unkreg(struct tvec_state *tv, const char *name)
  *		assigned already in the current test.
  */
 
-int tvec_dupreg(struct tvec_state *tv, const char *name)
+int tvec_dupregerr(struct tvec_state *tv, const char *name)
   { return (tvec_error(tv, "register `%s' is already set", name)); }
 
 /* --- @tvec_skipspc@ --- *
@@ -956,7 +956,7 @@ static int core_setvar(struct tvec_state *tv, const char *name,
   struct groupstate *g = ctx;
 
   if (STRCMP(name, ==, "@outcome")) {
-    if (g->f&GRPF_SETOUTC) return (tvec_dupreg(tv, name));
+    if (g->f&GRPF_SETOUTC) return (tvec_dupregerr(tv, name));
     if (rv->u == XFAIL) tvec_xfail(tv);
     g->f |= GRPF_SETOUTC;
   } else assert(!"unknown var");
@@ -1113,7 +1113,7 @@ int tvec_read(struct tvec_state *tv, const char *infile, FILE *fp)
 	      vd = env->findvar(tv, d.buf, &varctx, g.ctx);
 		if (vd) goto found_var;
 	    }
-	    tvec_unkreg(tv, d.buf); goto flush_line;
+	    tvec_unkregerr(tv, d.buf); goto flush_line;
 	  found_var:
 	    rd = &vd->def;
 	  } else {
@@ -1130,7 +1130,7 @@ int tvec_read(struct tvec_state *tv, const char *infile, FILE *fp)
 	    /* Complain if the register is already set. */
 	    r = TVEC_REG(tv, in, rd->i);
 	    if (r->f&TVRF_SEEN)
-	      { tvec_dupreg(tv, rd->name); goto flush_line; }
+	      { tvec_dupregerr(tv, rd->name); goto flush_line; }
 	  }
 
 	  /* If there's no test, then report an error.  Set the muffle flag,
diff --git a/test/tvec-env.3.in b/test/tvec-env.3.in
index eb71889..8508bd6 100644
--- a/test/tvec-env.3.in
+++ b/test/tvec-env.3.in
@@ -49,8 +49,8 @@
 .\" @tvec_report_v
 .\" @tvec_error
 .\" @tvec_notice
-.\" @tvec_unkreg
-.\" @tvec_dupreg
+.\" @tvec_unkregerr
+.\" @tvec_dupregerr
 .
 .\" @tvec_serialize
 .\" @tvec_deserialize
@@ -163,8 +163,8 @@ tvec-env \- test vector framework environments
 .BI "	const char *" msg ", va_list *" ap );
 .BI "int tvec_error(struct tvec_state *" tv ", const char *" msg ", ...);"
 .BI "void tvec_notice(struct tvec_state *" tv ", const char *" msg ", ...);"
-.BI "int tvec_unkreg(struct tvec_state *" tv ", const char *" name );
-.BI "int tvec_dupreg(struct tvec_state *" tv ", const char *" name );
+.BI "int tvec_unkregerr(struct tvec_state *" tv ", const char *" name );
+.BI "int tvec_dupregerr(struct tvec_state *" tv ", const char *" name );
 .PP
 .ta \w'\fBint tvec_serialize('u
 .BI "int tvec_serialize(const struct tvec_reg *" rv ", buf *" b ,
diff --git a/test/tvec-remote.c b/test/tvec-remote.c
index 3c25282..3087d1b 100644
--- a/test/tvec-remote.c
+++ b/test/tvec-remote.c
@@ -700,7 +700,7 @@ int tvec_remoteserver(int infd, int outfd, const struct tvec_config *config)
 		vd = env->findvar(&srvtv, d.buf, &varctx, ctx);
 		  if (vd) goto found_var;
 	      }
-	      rc = tvec_unkreg(&srvtv, d.buf); goto setvar_end;
+	      rc = tvec_unkregerr(&srvtv, d.buf); goto setvar_end;
 	    found_var:
 
 	      /* Set up the register. */
@@ -1929,15 +1929,15 @@ static int setvar_local(struct tvec_state *tv, const char *var,
   struct tvec_remotectx *r = ctx;
 
   if (STRCMP(var, ==, "@exit")) {
-    if (r->rc.f&TVRF_SETEXIT) return (tvec_dupreg(tv, var));
+    if (r->rc.f&TVRF_SETEXIT) return (tvec_dupregerr(tv, var));
     r->exwant = rv->u; r->rc.f |= TVRF_SETEXIT; return (0);
   } else if (STRCMP(var, ==, "@progress")) {
-    if (r->rc.f&TVRF_SETPRG) return (tvec_dupreg(tv, var));
+    if (r->rc.f&TVRF_SETPRG) return (tvec_dupregerr(tv, var));
     DRESET(&r->prgwant); DPUTM(&r->prgwant, rv->text.p, rv->text.sz);
     DPUTZ(&r->prgwant);
     r->rc.f |= TVRF_SETPRG; return (0);
   } else if (STRCMP(var, ==, "@reconnect")) {
-    if (r->rc.f&TVRF_SETRCN) return (tvec_dupreg(tv, var));
+    if (r->rc.f&TVRF_SETRCN) return (tvec_dupregerr(tv, var));
     r->rc.f = (r->rc.f&~TVRF_RCNMASK) | (rv->u&TVRF_RCNMASK) | TVRF_SETRCN;
     return (0);
   } else assert(!"unknown var");
diff --git a/test/tvec-timeout.c b/test/tvec-timeout.c
index f2c9208..f1b0b0c 100644
--- a/test/tvec-timeout.c
+++ b/test/tvec-timeout.c
@@ -107,10 +107,10 @@ static int setvar(struct tvec_state *tv, const char *var,
   struct tvec_timeoutctx *tc = ctx;
 
   if (STRCMP(var, ==, "@timeout")) {
-    if (tc->f&TVTF_SETTMO) return (tvec_dupreg(tv, var));
+    if (tc->f&TVTF_SETTMO) return (tvec_dupregerr(tv, var));
     tc->t = rv->f; tc->f |= TVTF_SETTMO;
   } else if (STRCMP(var, ==, "@timer")) {
-    if (tc->f&TVTF_SETTMR) return (tvec_dupreg(tv, var));
+    if (tc->f&TVTF_SETTMR) return (tvec_dupregerr(tv, var));
     tc->timer = rv->i; tc->f |= TVTF_SETTMR;
   } else assert(!"unknown var");
   return (0);
diff --git a/test/tvec.3.in b/test/tvec.3.in
index bc254d2..90e7781 100644
--- a/test/tvec.3.in
+++ b/test/tvec.3.in
@@ -301,3 +301,15 @@ the corresponding
 .I value
 is stored in the named register.
 .PP
+A test environment fits in between
+the framework and the test function.
+It can establish hook functions which are called
+at various stages during the test group.
+.hP \*o
+The
+.I setup
+hook is called once at the start of the test group.
+.hP \*o
+The
+.I teardown
+hook is called once at the end of the test group.
diff --git a/test/tvec.h b/test/tvec.h
index 3a18fc5..f7512bd 100644
--- a/test/tvec.h
+++ b/test/tvec.h
@@ -1204,8 +1204,8 @@ extern tvec_envteardownfn tvec_benchteardown;
  * Returns:	---
  *
  * Use:		Formats a report about the benchmark performance.  This
- *		function is intended to be called on by an output
- *		@ebench@ function.
+ *		function is intended to be called by an output @ebench@
+ *		function.
  */
 
 extern void tvec_benchreport
@@ -1493,7 +1493,7 @@ extern PRINTF_LIKE(2, 3)
 extern PRINTF_LIKE(2, 3)
   void tvec_notice(struct tvec_state */*tv*/, const char */*msg*/, ...);
 
-/* --- @tvec_unkreg@ --- *
+/* --- @tvec_unkregerr@ --- *
  *
  * Arguments:	@struct tvec_state *tv@ = test-vector state
  *		@const char *name@ = register or pseudoregister name
@@ -1504,9 +1504,9 @@ extern PRINTF_LIKE(2, 3)
  *		unrecognized.
  */
 
-extern int tvec_unkreg(struct tvec_state */*tv*/, const char */*name*/);
+extern int tvec_unkregerr(struct tvec_state */*tv*/, const char */*name*/);
 
-/* --- @tvec_dupreg@ --- *
+/* --- @tvec_dupregerr@ --- *
  *
  * Arguments:	@struct tvec_state *tv@ = test-vector state
  *		@const char *name@ = register or pseudoregister name
@@ -1517,7 +1517,7 @@ extern int tvec_unkreg(struct tvec_state */*tv*/, const char */*name*/);
  *		assigned already in the current test.
  */
 
-extern int tvec_dupreg(struct tvec_state */*tv*/, const char */*name*/);
+extern int tvec_dupregerr(struct tvec_state */*tv*/, const char */*name*/);
 
 /* --- @tvec_humanoutput@ --- *
  *
@@ -1545,16 +1545,17 @@ extern struct tvec_output *tvec_humanoutput(FILE */*fp*/);
  *		(`Test Anything Protocol') format.
  *
  *		TAP comes from the Perl community, but has spread rather
- *		further.  This driver produces TAP version 14, but pretends
- *		to be version 13.  The driver produces a TAP `test point' --
- *		i.e., a result reported as `ok' or `not ok' -- for each input
- *		test group.  Failure reports and register dumps are produced
- *		as diagnostic messages before the final group result.  (TAP
- *		permits structuerd YAML data after the test-point result,
- *		which could be used to report details, but (a) postponing the
- *		details until after the report is inconvenient, and (b) there
- *		is no standardization for the YAML anyway, so in practice
- *		it's no more useful than the unstructured diagnostics.
+ *		further.  This driver currently produces TAP version 14, but
+ *		pretends to be version 13.  The driver produces a TAP `test
+ *		point' -- i.e., a result reported as `ok' or `not ok' -- for
+ *		each input test group.  Failure reports and register dumps
+ *		are produced as diagnostic messages before the final group
+ *		result.  (TAP permits structured YAML data after the
+ *		test-point result, which could be used to report details, but
+ *		(a) postponing the details until after the report is
+ *		inconvenient, and (b) there is no standardization for the
+ *		YAML anyway, so in practice it's no more useful than the
+ *		unstructured diagnostics.)
  */
 
 extern struct tvec_output *tvec_tapoutput(FILE */*fp*/);
diff --git a/utils/macros.3.in b/utils/macros.3.in
index 6bb18d4..91052c2 100644
--- a/utils/macros.3.in
+++ b/utils/macros.3.in
@@ -33,6 +33,8 @@
 .\" @STR
 .\" @GLUE
 .\" @STATIC_ASSERT
+.\" @COMMA
+.
 .\" @ISALNUM
 .\" @ISALPHA
 .\" @ISASCII
@@ -49,18 +51,24 @@
 .\" @TOASCII
 .\" @TOLOWER
 .\" @TOUPPER
+.
 .\" @MEMCMP
 .\" @STRCMP
 .\" @STRNCMP
+.
 .\" @DISCARD
 .\" @IGNORE
+.\" @LAUNDER
+.\" @RELAX
+.
 .\" @DEPRECATED
-.\" @EXECL_LIKE
 .\" @IGNORABLE
 .\" @MUST_CHECK
 .\" @NORETURN
 .\" @PRINTF_LIKE
 .\" @SCANF_LIKE
+.\" @EXECL_LIKE
+.
 .\" @MUFFLE_WARNINGS_DECL
 .\" @MUFFLE_WARNINGS_EXPR
 .\" @MUFFLE_WARNINGS_STMT
@@ -147,7 +155,7 @@ preprocessing token.
 .PP
 The
 .B STATIC_ASSERT
-causes compilation to fail if the integer constant expression
+macro causes compilation to fail if the integer constant expression
 .I cond
 evaluates to zero.  This macro uses the C11
 .B static_assert
@@ -158,6 +166,13 @@ falls back to a somewhat ugly hack which currently ignores the
 .IR msg .
 .PP
 The
+.B COMMA
+macro expands to a comma
+.BR ` , ',
+which is useful for smuggling commas into macro arguments
+if they can't be protected by parentheses.
+.PP
+The
 .BR IS ...\&
 and
 .BR TO ...\&
@@ -201,6 +216,17 @@ The
 .B IGNORE
 macro ignores its argument, which may be an expression of any type.
 This can be useful in muffling warnings about unused variables.
+.PP
+The
+.B LAUNDER
+macro tries to confuse a compiler so that it `forgets' what it knows
+about a particular value.  This is most useful in benchmarking or
+similar applications.
+.PP
+The
+.B RELAX
+macro tries to do nothing, but in a way that a compiler won't optimize
+away.
 .
 .SS Annotations
 The following annotations can be attached to function declarations and
diff --git a/utils/macros.h b/utils/macros.h
index 95d0ddb..dba22a7 100644
--- a/utils/macros.h
+++ b/utils/macros.h
@@ -42,14 +42,47 @@
 
 /*----- Miscellaneous utility macros --------------------------------------*/
 
+/* --- @N@ --- *
+ *
+ * Arguments:	@type v[]@ = an actual array, not a pointer
+ *
+ * Returns:	The number of elements in @v@.
+ */
+
 #define N(v) (sizeof(v)/sizeof(*(v)))
 
+/* --- @STR@ --- *
+ *
+ * Arguments:	@x@ = some tokens
+ *
+ * Returns:	A string literal containing the macro-expanded text of @x@.
+ */
+
 #define MLIB__STR(x) #x
 #define STR(x) MLIB__STR(x)
 
+/* --- @GLUE@ --- *
+ *
+ * Arguments:	@x, y@ = two sequences of tokens
+ *
+ * Returns:	A single token formed by gluing together the macro-expansions
+ *		of @x@ and @y@.
+ */
+
 #define MLIB__GLUE(x, y) x##y
 #define GLUE(x, y) MLIB__GLUE(x, y)
 
+/* --- @STATIC_ASSERT@ --- *
+ *
+ * Arguments:	@int cond@ = a condition
+ *		@msg@ = a string literal message
+ *
+ * Returns:	---
+ *
+ * Use:		Fail at compile time unless @cond@ is nonzero.  The failure
+ *		might report @msg@.
+ */
+
 #ifdef static_assert
 #  define STATIC_ASSERT(cond, msg) static_assert(!!(cond), msg)
 #else
@@ -57,10 +90,31 @@
 	IGNORABLE extern char static_assert_failed[2*!!(cond) - 1]
 #endif
 
+/* --- @COMMA@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	A `%|,|%' token, which can be usefully passed to macros to
+ *		avoid argument splitting.
+ */
+
 #define COMMA ,
 
 /*----- String and character hacks ----------------------------------------*/
 
+/* --- @IS...@ --- *
+ *
+ * Arguments:	@int ch@ = a character code, but not @EOF@
+ *
+ * Returns:	Nonzero if @ch@ is in the relevant @<ctype.h>@ category.
+ *
+ * Use:		Classifies characters, but safely even if characters are
+ *		signed.
+ *
+ *		There is a macro for each of the @<ctype.h>@ @is...@
+ *		functions.
+ */
+
 #define CTYPE_HACK(func, ch) (func((unsigned char)(ch)))
 
 #define ISALNUM(ch) CTYPE_HACK(isalnum, ch)
@@ -77,17 +131,47 @@
 #define ISUPPER(ch) CTYPE_HACK(isupper, ch)
 #define ISXDIGIT(ch) CTYPE_HACK(isxdigit, ch)
 
+/* --- @TO...@ --- *
+ *
+ * Arguments:	@int ch@ = a character code, but not @EOF@
+ *
+ * Returns:	The converted character code.
+ *
+ * Use:		Converts characters, but safely even if characters are
+ *		signed.
+ *
+ *		There is a macro for each of the @<ctype.h>@ @to...@
+ *		functions.
+ */
+
 #define TOASCII(ch) CTYPE_HACK(toascii, ch)
 #define TOLOWER(ch) CTYPE_HACK(tolower, ch)
 #define TOUPPER(ch) CTYPE_HACK(toupper, ch)
 
+/* --- @MEMCMP@, @STRCMP@, @STRNCMP@ --- *
+ *
+ * Arguments:	@const type *x, *y@ = pointers to strings
+ *		@op@ = a relational operator symbol
+ *		@size_t n@ = length of the strings
+ *
+ * Returns:	Nonzero if the relationship between the strings satisfies the
+ *		operator @op@, otherwise zero.
+ *
+ * Use:		These macros mitigate the author's frequent error of failing
+ *		to compare the result of the underlying standard functions
+ *		against zero, effectively reversing the sense of an intended
+ *		test for equality.
+ */
+
 #define MEMCMP(x, op, y, n) (memcmp((x), (y), (n)) op 0)
 #define STRCMP(x, op, y) (strcmp((x), (y)) op 0)
 #define STRNCMP(x, op, y, n) (strncmp((x), (y), (n)) op 0)
 
-/*----- Compiler diagnostics ----------------------------------------------*/
+/*----- Compiler-specific definitions -------------------------------------*/
 
-/* --- Compiler-specific definitions --- */
+/* The descriptions of these are given below, with the fallback
+ * definitions.
+ */
 
 #if GCC_VERSION_P(2, 5) || CLANG_VERSION_P(3, 3)
 #  define NORETURN __attribute__((__noreturn__))
@@ -193,40 +277,84 @@
 
 /* --- Fallback definitions, mostly trivial --- */
 
-#ifndef DEPRECATED
-#  define DEPRECATED(msg)
-#endif
-
-#ifndef EXECL_LIKE
-#  define EXECL_LIKE(ntrail)
-#endif
+/* --- @DISCARD@ --- *
+ *
+ * Arguments:	@x@ = a function call
+ *
+ * Returns:	---
+ *
+ * Use:		Explicitly discard the result of @x@.  This counteracts a
+ *		@MUST_CHECK@ attribute on the called function.
+ */
 
 #ifndef DISCARD
 #  define DISCARD(x) do if (x); while (0)
 #endif
 
+/* --- @IGNORE@ --- *
+ *
+ * Arguments:	@x@ = any expression
+ *
+ * Returns:	---
+ *
+ * Use:		Ignore the value of @x@, overriding compiler warnings.
+ */
+
 #ifndef IGNORE
 #  define IGNORE(x) ((void)(x))
 #endif
 
-#ifndef MUFFLE_WARNINGS_DECL
-#  define MUFFLE_WARNINGS_DECL(warns, body) body
-#endif
+/* --- @LAUNDER@ --- *
+ *
+ * Arguments:	@x@ = some integer expression
+ *
+ * Returns:	@x@.
+ *
+ * Use:		Causes a compiler to know nothing about the value of @x@,
+ *		even if it looks obvious, e.g., it's a constant.
+ */
 
-#ifndef MUFFLE_WARNINGS_EXPR
-#  define MUFFLE_WARNINGS_EXPR(warns, body) (body)
+#ifndef LAUNDER
+#  define LAUNDER(x) (x)
 #endif
 
-#ifndef MUFFLE_WARNINGS_STMT
-#  define MUFFLE_WARNINGS_STMT(warns, body) do { body } while (0)
-#endif
+/* --- @RELAX@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	---
+ *
+ * Use:		Does nothing, but the compiler doesn't know that.
+ */
 
-#ifndef PRINTF_LIKE
-#  define PRINF_LIKE(fmtix, argix)
+#ifndef RELAX
+#  define RELAX
 #endif
 
-#ifndef SCANF_LIKE
-#  define SCANF_LIKE(fmtix, argix)
+/* --- @DEPRECATED@, @NORETURN@, @IGNORABLE@, @MUST_CHECK@ --- *
+ *
+ * Use:		These are (mostly) function attributes; write them among the
+ *		declaration specifiers for a function definition or
+ *		declaration.  These may not do anything, but the intended
+ *		behaviour is as follows.
+ *
+ *		  * @DEPRECATED(msg)@ -- report a warning, quoting the string
+ *		    literal @msg@, if the function is called.
+ *
+ *		  * @NORETURN@ -- promise that the function doesn't return to
+ *		    its caller: either it kills the process, or it performs
+ *		    some nonlocal transfer.
+ *
+ *		  * @IGNORABLE@ -- the item (which might be data rather than
+ *		    a function) might not be referred to, but that's OK:
+ *		    don't warn about it.
+ *
+ *		  * @MUST_CHECK@ -- warn if the return value of a function is
+ *		    ignored.  Use @DISCARD@ if you really don't care.
+ */
+
+#ifndef DEPRECATED
+#  define DEPRECATED(msg)
 #endif
 
 #ifndef NORETURN
@@ -241,18 +369,94 @@
 #  define MUST_CHECK
 #endif
 
-#ifndef LAUNDER
-#  define LAUNDER
+/* --- @PRINTF_LIKE@, @SCANF_LIKE@, @EXECL_LIKE@ --- *
+ *
+ * Arguments:	@int fmtix@ = format string argument index (starting from 1)
+ *		@int argix@ = variable format argument tail index (starting
+ *			from 1)
+ *		@int ntrail@ = number of arguments following terminator
+ *
+ * Use:		These are function attributes.  Again, they might not do
+ *		anything at all.  By intention, they give the compiler
+ *		information about a variadic function's arguments, so that it
+ *		can warn about misuse.
+ *
+ *		  * @PRINTF_LIKE@ -- the function takes a @printf@-style
+ *		    format string as argument @fmtix@ and an argument tail
+ *		    (which may be empty) beginning with argument @argix@.
+ *
+ *		  * @SCANF_LIKE@ -- the function takes a @scanf@-style
+ *		    format string as argument @fmtix@ and an argument tail
+ *		    (which may be empty) beginning with argument @argix@.
+ *
+ *		  * @EXECL_LIKE@ -- the function takes a sequence of pointer
+ *		    arguments terminated by a null pointer, followed by
+ *		    @ntrail@ further arguments.
+ */
+
+#ifndef PRINTF_LIKE
+#  define PRINTF_LIKE(fmtix, argix)
+#endif
 
-#ifndef RELAX
-#  define RELAX
+#ifndef SCANF_LIKE
+#  define SCANF_LIKE(fmtix, argix)
 #endif
 
+#ifndef EXECL_LIKE
+#  define EXECL_LIKE(ntrail)
+#endif
+
+/* --- @MUFFLE_WARNINGS_...@ --- *
+ *
+ * Arguments:	@warns@ = a sequence of @..._WARNING@ calls (see below)
+ *		@body@ = some program text
+ *
+ * Use:		Muffle specific warnings within the program text.
+ *
+ *		For @MUFFLE_WARNINGS_DECL@, the program text is a
+ *		declaration; for @MUFFLE_WARNINGS_EXPR@, it is an expression,
+ *		and for @MUFFLE_WARNINGS_STMT@, it is a statement.
+ *
+ *		The warnings to be muffled are given as a list of
+ *		@..._WARNING@ macros, with no separators.  The list can
+ *		list warnings from multiple different compilers: entries for
+ *		irrelevant compilers will be ignored.
+ */
+
+#ifndef MUFFLE_WARNINGS_DECL
+#  define MUFFLE_WARNINGS_DECL(warns, body) body
+#endif
+
+#ifndef MUFFLE_WARNINGS_EXPR
+#  define MUFFLE_WARNINGS_EXPR(warns, body) (body)
+#endif
+
+#ifndef MUFFLE_WARNINGS_STMT
+#  define MUFFLE_WARNINGS_STMT(warns, body) do { body } while (0)
+#endif
+
+/* --- @GCC_WARNING@ --- *
+ *
+ * Arguments:	@warn@ = a string literal naming a warning, with `%|-W...|%'
+ *			prefix
+ *
+ * Use:		Names a GCC warning: use within @MUFFLE_WARNINGS_...@.
+ *
+ *		Note that GCC's warning suppression is very buggy.
+ */
+
 #ifndef GCC_WARNING
 #  define GCC_WARNING(warn)
 #endif
 
+/* --- @CLANG_WARNING@ --- *
+ *
+ * Arguments:	@warn@ = a string literal naming a warning, with `%|-W...|%'
+ *			prefix
+ *
+ * Use:		Names a Clang warning: use within @MUFFLE_WARNINGS_...@.
+ */
+
 #ifndef CLANG_WARNING
 #  define CLANG_WARNING(warn)
 #endif
-- 
2.11.0