@@@ mostly bench docs
[mLib] / test / bench.c
CommitLineData
b64eb60f
MW
1/* -*-c-*-
2 *
3 * Benchmarking support
4 *
5 * (c) 2023 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of the mLib utilities library.
11 *
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
16 *
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include "config.h"
31
13ee7406 32#include <ctype.h>
b64eb60f 33#include <errno.h>
6e683a79 34#include <limits.h>
b1a20bee 35#include <math.h>
b64eb60f
MW
36#include <stdarg.h>
37#include <stdio.h>
38#include <stdlib.h>
39#include <string.h>
40#include <time.h>
41
42#include "alloc.h"
b1a20bee 43#include "arena.h"
b64eb60f
MW
44#include "bench.h"
45#include "bits.h"
13ee7406 46#include "dstr.h"
b64eb60f
MW
47#include "linreg.h"
48#include "macros.h"
49
6e683a79
MW
50#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
51# include <cpuid.h>
52# define CPUID_1D_TSC (1u << 4)
53# define CPUID_1xD_TSCP (1u << 27)
b1a20bee 54# define USE_X86_RDTSC 1
6e683a79
MW
55#endif
56
57#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
b1a20bee 58# include <sys/syscall.h>
6e683a79
MW
59# include <sys/types.h>
60# include <unistd.h>
61# include <linux/perf_event.h>
b1a20bee
MW
62# ifdef HAVE_VALGRIND_VALGRIND_H
63# include <valgrind/valgrind.h>
64# endif
65# define USE_LINUX_PERFEVENT 1
6e683a79
MW
66# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
67# include <sys/mman.h>
b1a20bee 68# define USE_LINUX_PERFEVRDPMC 1
6e683a79
MW
69# endif
70#endif
71
b64eb60f
MW
72/*----- Data structures ---------------------------------------------------*/
73
13ee7406
MW
74enum { CLK, CY, NTIMER };
75
b64eb60f
MW
76struct timer {
77 struct bench_timer _t;
b1a20bee 78 arena *a;
13ee7406 79 const struct timer_ops *ops[NTIMER]; /* subtimers for clock and cycles */
6e683a79 80 union {
b1a20bee 81#ifdef USE_X86_RDTSC
6e683a79 82 unsigned tscaux; /* `ia32_tsc_aux' for `ldtscp' */
b1a20bee
MW
83#endif
84#ifdef USE_LINUX_PERFEVENT
6e683a79 85 int fd; /* vanilla `perf_event_open' */
b1a20bee
MW
86#endif
87#ifdef USE_LINUX_PERFEVRDPMC
88 struct { /* `perf_event_open' with `rdpmc' */
89 const volatile void *map; size_t sz; /* memory-mapped info */
90 pid_t owner; /* owning thread id */
91 } pmc;
92#endif
6e683a79 93 } u_cy; /* state for cycle measurement */
b64eb60f
MW
94};
95
96struct timer_ops {
13ee7406
MW
97 const char *name; /* timer name */
98 unsigned f; /* flags */
b1a20bee
MW
99/* ... @BTF_...OK@ flags */ /* expected results */
100#define TF_SECRET 16u /* don't try this automatically */
13ee7406 101 int (*init)(struct timer */*t*/); /* initialization function */
b1a20bee 102 int (*preflight)(struct timer */*t*/); /* preflight checks */
6e683a79
MW
103 int (*now)(struct timer */*t*/, /* read current */
104 struct bench_time */*t_out*/, unsigned /*f*/);
105 void (*diff)(struct timer */*t*/, /* difference */
106 struct bench_timing */*t_inout*/,
107 const struct bench_time */*t0*/,
108 const struct bench_time */*t1*/);
109 void (*teardown)(struct timer */*t*/); /* release held resources */
b64eb60f
MW
110};
111
112/*----- Preliminaries -----------------------------------------------------*/
113
114#define NS_PER_S 1000000000
115
67b5031e
MW
116/* --- @debug@ --- *
117 *
118 * Arguments: @const char *fmt@ = format control string
119 * @...@ = format arguemnts
120 *
121 * Returns: ---
122 *
123 * Use: Maybe report a debugging message to standard error.
124 */
125
31d0247c 126static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
b64eb60f
MW
127{
128 const char *p;
129 va_list ap;
130
131 p = getenv("MLIB_BENCH_DEBUG");
132 if (p && *p != 'n' && *p != '0') {
133 va_start(ap, fmt);
134 fputs("mLib BENCH: ", stderr);
135 vfprintf(stderr, fmt, ap);
136 fputc('\n', stderr);
137 va_end(ap);
138 }
139}
140
6e683a79
MW
141#ifdef HAVE_UINT64
142# define FLOATK64(k) ((double)(k).i)
143#else
144# define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
145#endif
146
147/* --- @diff_ts@ --- *
67b5031e 148 *
6e683a79
MW
149 * Arguments: @struct timer *t@ = timer structure
150 * @struct bench_timing *delta_inout@ = where to put the result
151 * @const struct time *t0, *t1@ = two input times
67b5031e
MW
152 *
153 * Returns: ---
154 *
6e683a79
MW
155 * Use: Calculates a time difference for timers using the
156 * @struct timespec@-like time format.
67b5031e
MW
157 */
158
6e683a79
MW
159static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
160 const struct bench_time *t0, const struct bench_time *t1)
67b5031e
MW
161{
162 unsigned f = t0->f&t1->f;
b1a20bee
MW
163 kludge64 delta_s;
164 uint32 delta_ns;
67b5031e 165
6e683a79 166 if (f&BTF_TIMEOK) {
67b5031e 167
b1a20bee
MW
168 /* Calculate the integer differences in seconds and nanoseconds
169 * independently. To avoid underflow, though, add a second's worth of
170 * nanoseconds which we'll subtract off later.
171 */
172 SUB64(delta_s, t1->t.ts.s, t0->t.ts.s);
173 delta_ns = t1->t.ts.ns + NS_PER_S - t0->t.ts.ns;
174
175 /* Hack if they're both equal. */
176 if (ZERO64(delta_s) && !delta_ns) delta_ns = 1;
67b5031e 177
b1a20bee
MW
178 /* And apply the nanoseconds difference. To prevent underflow, pre-
179 * emptively borrow one from the integer difference.
6e683a79 180 */
b1a20bee 181 delta_inout->t = FLOATK64(delta_s) - 1.0 + delta_ns/(double)NS_PER_S;
6e683a79
MW
182
183 /* Done. */
184 delta_inout->f |= BTF_TIMEOK;
67b5031e 185 }
6e683a79 186}
67b5031e 187
6e683a79
MW
188/* --- @diff_cycles@ --- *
189 *
190 * Arguments: @struct timer *t@ = timer structure
191 * @struct bench_timing *delta_inout@ = where to put the result
192 * @const struct time *t0, *t1@ = two input times
193 *
194 * Returns: ---
195 *
196 * Use: Calculates a time difference for cycle-counting timers.
197 */
67b5031e 198
6e683a79
MW
199static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
200 const struct bench_time *t0,
201 const struct bench_time *t1)
202{
203 unsigned f = t0->f&t1->f;
b1a20bee 204 kludge64 delta_cy;
6e683a79
MW
205
206 if (f&BTF_CYOK) {
b1a20bee
MW
207 SUB64(delta_cy, t1->cy, t0->cy); delta_inout->cy = FLOATK64(delta_cy);
208 if (!delta_inout->cy) delta_inout->cy = 1;
6e683a79
MW
209 delta_inout->f |= BTF_CYOK;
210 }
67b5031e
MW
211}
212
6e683a79
MW
213#undef FLOATK64
214
b1a20bee
MW
215/* --- @normalize@ --- *
216 *
217 * Arguments: @double *x_inout@ = address of a value to normalize
218 * @const char **unit_out@ = address to store unit prefix
219 * @double scale@ = scale factor for unit steps
220 *
221 * Returns: ---
222 *
223 * Use: Adjust @*x_inout@ by a power of @scale@, and set @*unit_out@
224 * so that printing the two reflects the original value with an
225 * appropriate SI unit scaling. The @scale@ should be 1024 for
226 * binary quantities, most notably memory sizes, or 1000 for
227 * other quantities.
228 */
229
230static void normalize(double *x_inout, const char **unit_out, double scale)
231{
232 static const char
233 *const nothing = "",
234 *const big[] = { "k", "M", "G", "T", "P", "E", 0 },
235 *const little[] = { "m", "µ", "n", "p", "f", "a", 0 };
236 const char *const *u;
237 double x = *x_inout;
238
239 if (x < 1)
240 for (u = little, x *= scale; x < 1 && u[1]; u++, x *= scale);
241 else if (x >= scale)
242 for (u = big, x /= scale; x >= scale && u[1]; u++, x /= scale);
243 else
244 u = &nothing;
245
246 *x_inout = x; *unit_out = *u;
247}
248
13ee7406 249/*----- The null timer ----------------------------------------------------*/
b64eb60f 250
13ee7406
MW
251/* This is a timer which does nothing, in case we don't have any better
252 * ideas.
67b5031e
MW
253 */
254
13ee7406 255static int null_init(struct timer *t) { return (0); }
6e683a79
MW
256static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
257 { return (0); }
b1a20bee 258static int null_preflight(struct timer *t) { return (0); }
6e683a79
MW
259static void null_diff(struct timer *t, struct bench_timing *delta_inout,
260 const struct bench_time *t0,
261 const struct bench_time *t1)
262 { ; }
b64eb60f 263static void null_teardown(struct timer *t) { ; }
b64eb60f 264
13ee7406 265static const struct timer_ops null_ops =
b1a20bee
MW
266 { "null", 0,
267 null_init, null_preflight, null_now, null_diff, null_teardown };
13ee7406
MW
268#define NULL_ENT &null_ops,
269
270/*----- The broken clock --------------------------------------------------*/
271
272/* This is a cycle counter which does nothing, in case we don't have any
273 * better ideas.
274 */
b64eb60f 275
13ee7406
MW
276static int broken_init(struct timer *t) { return (-1); }
277
278static const struct timer_ops broken_ops =
b1a20bee
MW
279 { "broken", TF_SECRET,
280 broken_init, null_preflight, null_now, null_diff, null_teardown };
13ee7406 281#define BROKEN_ENT &broken_ops,
b64eb60f
MW
282
283/*----- Linux performance counters ----------------------------------------*/
284
67b5031e
MW
285/* This is a cycle counter which uses the Linux performance event system,
286 * which is probably the best choice if it's available.
287 */
288
b64eb60f
MW
289#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
290
6e683a79
MW
291/* --- @perfevent_open@ --- *
292 *
293 * Arguments: ---
294 *
295 * Returns: File descriptor, or %$-1$%.
296 *
297 * Use: Open a performance measurement descriptor set up to count CPU
298 * cycles.
299 */
b64eb60f 300
6e683a79
MW
301static int perfevent_open(void)
302{
303 struct perf_event_attr attr = { 0 };
304 int fd;
b64eb60f 305
6e683a79
MW
306 attr.type = PERF_TYPE_HARDWARE;
307 attr.size = sizeof(attr);
308 attr.config = PERF_COUNT_HW_CPU_CYCLES;
309 attr.disabled = 0;
310 attr.exclude_kernel = 1;
311 attr.exclude_hv = 1;
312
b1a20bee 313 fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
6e683a79
MW
314 if (fd < 0) {
315 debug("couldn't open perf event: %s", strerror(errno));
316 return (-1);
317 }
318
319 return (fd);
320}
321
322static int perfevent_now(struct timer *t,
323 struct bench_time *t_out, unsigned f)
b64eb60f
MW
324{
325 ssize_t n;
326
327 n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
328 if (n != sizeof(t_out->cy.i)) {
329 debug("failed to read perf-event counter: %s", strerror(errno));
6e683a79 330 return (0);
b64eb60f 331 }
6e683a79 332 t_out->f |= BTF_CYOK; return (0);
b64eb60f
MW
333}
334
335static void perfevent_teardown(struct timer *t)
336 { close(t->u_cy.fd); }
337
b64eb60f
MW
338static int perfevent_init(struct timer *t)
339{
6e683a79 340 int fd = -1, rc;
b64eb60f 341
b1a20bee
MW
342 fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }
343 t->u_cy.fd = fd; fd = -1; rc = 0;
6e683a79
MW
344end:
345 if (fd != -1) close(fd);
346 return (rc);
347}
348
349static const struct timer_ops perfevent_ops =
b1a20bee
MW
350 { "linux-perf-read-hw-cycles", BTF_CYOK,
351 perfevent_init, null_preflight, perfevent_now,
352 diff_cycles, perfevent_teardown };
6e683a79
MW
353#define PERFEVENT_VANILLA_CYENT &perfevent_ops,
354
355# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
356
357/* Special syscall-free version for x86 using `rdpmc' instruction. *
358 *
359 * This is a bit weird because it does both kinds of measurement in a single
360 * operation.
361 */
362
363static int perfevrdpmc_now(struct timer *t,
364 struct bench_time *t_out, unsigned f)
365{
366 const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
367 unsigned long long tsc = tsc, toff = toff, tenb = tenb;
368 unsigned long long cy = cy, cyoff = cyoff;
369 unsigned long long m, hi, lo;
370 unsigned tshift = tshift, tmult = tmult, q0, q1, ff;
371
372 /* Repeat until we can complete this job without the buffer changing in the
373 * middle.
374 */
375 q0 = map->lock;
376 __atomic_thread_fence(__ATOMIC_ACQ_REL);
377 for (;;) {
378 ff = 0;
379
380 /* Read the passage-of-time information. */
381 if (map->cap_user_time) {
382 tenb = map->time_enabled;
383 tsc = __builtin_ia32_rdtsc();
384 tshift = map->time_shift;
385 tmult = map->time_mult;
386 toff = map->time_offset;
387 ff |= BTF_TIMEOK;
388 }
389
390 /* Read the performance-counter information. */
391 if (map->cap_user_rdpmc) {
392 cy = __builtin_ia32_rdpmc(map->index - 1);
393 cyoff = map->offset;
394 ff |= BTF_CYOK;
395 }
396
397 /* Check the sequence number again. */
398 __atomic_thread_fence(__ATOMIC_ACQ_REL);
399 q1 = map->lock;
400 if (q0 == q1) break;
401 q0 = q1;
402 }
403
404 if (ff&BTF_TIMEOK) {
405 /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
406 * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
407 * nanoseconds.
408 */
409
410 m = (1ull << tshift) - 1;
411 hi = tsc >> tshift; lo = tsc&m;
412 t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
413 t_out->f |= BTF_TIMEOK;
b64eb60f
MW
414 }
415
6e683a79
MW
416 if (ff&BTF_CYOK) {
417 /* We have the cycle count. */
b64eb60f 418
6e683a79
MW
419 t_out->cy.i = cy + cyoff;
420 t_out->f |= BTF_CYOK;
421 }
13ee7406 422 return (0);
b64eb60f 423}
13ee7406 424
6e683a79
MW
425static void perfevrdpmc_diff(struct timer *t,
426 struct bench_timing *delta_inout,
427 const struct bench_time *t0,
428 const struct bench_time *t1)
429{
b1a20bee 430 unsigned long long delta_ns;
6e683a79 431 unsigned f = t0->f&t1->f;
13ee7406 432
6e683a79 433 if (f&BTF_TIMEOK) {
b1a20bee
MW
434 delta_ns = t1->t.rawns.i - t0->t.rawns.i; if (!delta_ns) delta_ns = 1;
435 delta_inout->t = delta_ns/(double)NS_PER_S;
6e683a79
MW
436 delta_inout->f |= BTF_TIMEOK;
437 }
438
439 if (f&BTF_CYOK) {
440 delta_inout->cy = t1->cy.i - t0->cy.i;
b1a20bee 441 if (!delta_inout->cy) delta_inout->cy = 1;
6e683a79
MW
442 delta_inout->f |= BTF_CYOK;
443 }
444}
445
b1a20bee
MW
446static void perfevrdpmc_unmap
447 (const volatile struct perf_event_mmap_page *map, size_t mapsz)
448 { if (map) munmap(UNQUALIFY(struct perf_event_mmap_page, map), mapsz); }
449
6e683a79 450static void perfevrdpmc_teardown(struct timer *t)
b1a20bee 451 { perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz); }
6e683a79 452
b1a20bee 453static int perfevrdpmc_setup(struct timer *t)
6e683a79
MW
454{
455 const volatile struct perf_event_mmap_page *map = 0;
b1a20bee 456 int pgsz, mapsz = 0, fd = -1, rc;
6e683a79
MW
457
458 /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
459 * big a page is.
460 */
461 pgsz = sysconf(_SC_PAGESIZE);
462 if (pgsz < 0) {
463 debug("failed to discover page size!: %s", strerror(errno));
464 rc = -1; goto end;
465 }
466
467 /* Open the measurement descriptor and map it. */
468 fd = perfevent_open(); if (!fd) return (-1);
469 mapsz = 2*pgsz;
470 map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
471 if (map == MAP_FAILED) {
472 debug("failed to map perf event: %s", strerror(errno));
473 return (-1);
474 }
475
b1a20bee
MW
476 t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0;
477 t->u_cy.pmc.owner = syscall(SYS_gettid); rc = 0;
6e683a79
MW
478end:
479 if (fd != -1) close(fd);
b1a20bee 480 perfevrdpmc_unmap(map, mapsz);
6e683a79
MW
481 return (rc);
482}
483
b1a20bee
MW
484static int perfevrdpmc_preflight(struct timer *t)
485{
486 if (!t->u_cy.pmc.map) { debug("retry perf event map setup"); goto reopen; }
487 if (t->u_cy.pmc.owner != syscall(SYS_gettid)) {
488 debug("pid changed: reopen perf event map");
489 perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz);
490 t->u_cy.pmc.map = 0; goto reopen;
491 }
492 return (0);
493
494reopen:
495 if (perfevrdpmc_setup(t)) return (-1);
496 return (0);
497}
498
499static int perfevrdpmc_cyinit(struct timer *t)
500{
501 unsigned a, b, c, d;
502
503# ifdef HAVE_VALGRIND_VALGRIND_H
504 /* Valgrind doesn't like `rdpmc' instructions, so just bail. */
505 if (RUNNING_ON_VALGRIND) return (-1);
506# endif
507
508 /* We need `rdtsc' to do the passage-of-time measurement. */
509 if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
510 { debug("no `rdtsc' instrunction"); return (-1); }
511
512 /* Set things up. */
513 if (perfevrdpmc_setup(t)) return (-1);
514 return (0);
515}
516
6e683a79 517static const struct timer_ops perfevrdpmc_cyops =
b1a20bee
MW
518 { "linux-x86-perf-rdpmc-hw-cycles", BTF_TIMEOK | BTF_CYOK,
519 perfevrdpmc_cyinit, perfevrdpmc_preflight, perfevrdpmc_now,
6e683a79
MW
520 perfevrdpmc_diff, perfevrdpmc_teardown };
521
522static int perfevrdpmc_clkinit(struct timer *t)
523{
b1a20bee
MW
524 if (t->ops[CY] != &perfevrdpmc_cyops) {
525 debug("`linux-x86-perf-rdpmc-hw-cycles' not set as cycle subtimer");
6e683a79
MW
526 return(-1);
527 }
528 return (0);
529}
530
531static const struct timer_ops perfevrdpmc_clkops =
532 { "linux-x86-perf-rdpmc-hw-cycles", 0,
b1a20bee 533 perfevrdpmc_clkinit, null_preflight, null_now,
6e683a79
MW
534 null_diff, null_teardown };
535
536# define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
537# define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
538
539# else
540# define PERFEVENT_RDPMC_CLKENT
541# define PERFEVENT_RDPMC_CYENT
542# endif
543
544# define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
545# define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
b64eb60f 546#else
6e683a79 547# define PERFEVENT_CLKENT
b64eb60f
MW
548# define PERFEVENT_CYENT
549#endif
550
551/*----- Intel time-stamp counter ------------------------------------------*/
552
67b5031e
MW
553/* This is a cycle counter based on the Intel `rdtsc' instruction. It's not
554 * really suitable for performance measurement because it gets confused by
555 * CPU frequency adjustments.
556 */
557
6e683a79 558#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
b64eb60f 559
6e683a79
MW
560static int x86rdtsc_now(struct timer *t,
561 struct bench_time *t_out, unsigned f)
562 { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); }
b64eb60f
MW
563
564static int x86rdtsc_init(struct timer *t)
565{
13ee7406
MW
566 unsigned a, b, c, d;
567
568 if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
b64eb60f 569 { debug("no `rdtsc' instrunction"); return (-1); }
6e683a79
MW
570 t->u_cy.tscaux = ~0u;
571 return (0);
572}
573
574static int x86rdtscp_now(struct timer *t,
575 struct bench_time *t_out, unsigned f)
576{
577 unsigned tscaux;
578 unsigned long long n;
579
580 n = __builtin_ia32_rdtscp(&tscaux);
581 if (!(f&BTF_T1))
582 t->u_cy.tscaux = tscaux;
583 else if (t->u_cy.tscaux != tscaux) {
584 debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
585 tscaux, t->u_cy.tscaux);
586 return (-1);
587 }
588 t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
589}
590
591static int x86rdtscp_init(struct timer *t)
592{
593 unsigned a, b, c, d;
594
595 if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
596 { debug("no `rdtscp' instrunction"); return (-1); }
13ee7406 597 return (0);
b64eb60f
MW
598}
599
13ee7406 600static const struct timer_ops x86rdtsc_ops =
b1a20bee
MW
601 { "x86-rdtsc", BTF_CYOK,
602 x86rdtsc_init, null_preflight, x86rdtsc_now,
603 diff_cycles, null_teardown };
6e683a79 604static const struct timer_ops x86rdtscp_ops =
b1a20bee
MW
605 { "x86-rdtscp", BTF_CYOK,
606 x86rdtscp_init, null_preflight,
607 x86rdtscp_now, diff_cycles, null_teardown };
13ee7406 608
6e683a79 609# define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
b64eb60f 610#else
13ee7406 611# define X86RDTSC_CYENT
b64eb60f
MW
612#endif
613
614/*----- POSIX `clock_gettime' ---------------------------------------------*/
615
67b5031e
MW
616/* This is a real-time clock based on the POSIX time interface, with up to
617 * nanosecond precision.
618 */
619
b64eb60f
MW
620#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
621
6e683a79 622static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
b64eb60f
MW
623{
624 struct timespec now;
625
626 if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
6e683a79
MW
627 { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
628 ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
629 t_out->f |= BTF_TIMEOK; return (0);
b64eb60f
MW
630}
631
13ee7406 632static const struct timer_ops gettime_ops =
b1a20bee
MW
633 { "posix-thread-cputime", BTF_TIMEOK,
634 null_init, null_preflight, gettime_now, diff_ts, null_teardown };
13ee7406
MW
635
636# define GETTIME_CLKENT &gettime_ops,
b64eb60f
MW
637#else
638# define GETTIME_CLKENT
639#endif
640
641/*----- Standard C `clock' ------------------------------------------------*/
642
67b5031e
MW
643/* This is a real-time clock based on the C `clock' function which is
644 * guaranteed to be available, though it's not likely to be very good.
645 */
646
6e683a79 647static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
b64eb60f 648{
6e683a79 649 clock_t now;
b64eb60f
MW
650
651 now = clock();
652 if (now == (clock_t)-1) {
653 debug("error reading standard clock: %s", strerror(errno));
6e683a79 654 return (0);
b64eb60f 655 }
6e683a79
MW
656 t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0);
657}
658
659static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
660 const struct bench_time *t0,
661 const struct bench_time *t1)
662{
b1a20bee 663 clock_t delta_clk;
6e683a79
MW
664 unsigned f = t0->f&t1->f;
665
666 if (f&BTF_TIMEOK) {
b1a20bee
MW
667 delta_clk = t1->t.clk - t0->t.clk; if (!delta_clk) delta_clk = 1;
668 delta_inout->t = delta_clk/(double)CLOCKS_PER_SEC;
6e683a79
MW
669 delta_inout->f |= BTF_TIMEOK;
670 }
b64eb60f
MW
671}
672
13ee7406 673static const struct timer_ops clock_ops =
b1a20bee
MW
674 { "stdc-clock", BTF_TIMEOK, null_init, null_preflight, clock_now,
675 clock_diff, null_teardown };
13ee7406
MW
676
677#define CLOCK_CLKENT &clock_ops,
b64eb60f
MW
678
679/*----- Timing setup ------------------------------------------------------*/
680
67b5031e 681/* Tables of timing sources. */
13ee7406 682static const struct timer_ops
6e683a79
MW
683 *const clktab[] = { PERFEVENT_CLKENT
684 GETTIME_CLKENT
685 CLOCK_CLKENT
686 BROKEN_ENT
687 0 },
688 *const cytab[] = { PERFEVENT_CYENT
689 X86RDTSC_CYENT
690 NULL_ENT
691 BROKEN_ENT
692 0 };
13ee7406
MW
693
694static const struct timertab {
695 const char *what;
696 const char *env;
697 const struct timer_ops *const *opstab;
698} timertab[] = {
b1a20bee
MW
699 { "clock", "MLIB_BENCH_CLKTIMER", clktab },
700 { "cycle", "MLIB_BENCH_CYCLETIMER", cytab }
13ee7406 701};
b64eb60f 702
67b5031e
MW
703/* --- @find_timer@ --- *
704 *
705 * Arguments: @const char *name@ = timer name
706 * @size_t sz@ = length of name
13ee7406 707 * @unsigned tm@ = which subtimer we're looking for
67b5031e
MW
708 *
709 * Returns: The table entry matching the given name, or null if there
710 * isn't one.
711 */
712
13ee7406
MW
713static const struct timer_ops *find_timer(const char *name, size_t sz,
714 unsigned tm)
b64eb60f 715{
13ee7406
MW
716 const struct timer_ops *const *tt;
717
718 for (tt = timertab[tm].opstab; *tt; tt++) {
719 if (strlen((*tt)->name) == sz &&
720 MEMCMP(name, ==, (*tt)->name, sz))
721 return (*tt);
b64eb60f 722 }
13ee7406
MW
723 debug("%s timer `%.*s' not found",
724 timertab[tm].what, (int)sz, name); return (0);
b64eb60f
MW
725}
726
67b5031e
MW
727/* --- @try_timer@ --- *
728 *
729 * Arguments: @struct timer *t@ = timer structure
13ee7406
MW
730 * @const struct timer_ops *ops@ = timer ops
731 * @unsigned tm@ = which subtimer we're setting
67b5031e 732 *
13ee7406 733 * Returns: Zero on success, %$-1$% if timer failed.
67b5031e
MW
734 *
735 * Use: Tries to initialize the timer @t@, reporting a debug message
736 * if it worked.
737 */
738
b64eb60f 739static int try_timer(struct timer *t,
13ee7406 740 const struct timer_ops *ops, unsigned tm)
b64eb60f 741{
b1a20bee
MW
742 struct bench_time t0, t1;
743 struct bench_timing delta;
744 int rc;
745 unsigned f = 0;
746#define f_teardown 1u
747
748 if (ops->init(t)) { rc = -1; goto end; }
749 f |= f_teardown;
750
751 if (ops->preflight(t)) { rc = -1; goto end; }
752 t0.f = t1.f = 0;
753 do {
754 while (ops->now(t, &t0, BTF_T0));
755 } while (ops->now(t, &t1, BTF_T1));
756 delta.f = 0; ops->diff(t, &delta, &t0, &t1);
757 if ((ops->f ^ delta.f)&BTF_ANY) { rc = -1; goto end; }
758
13ee7406 759 debug("selected %s timer `%s'", timertab[tm].what, ops->name);
b1a20bee
MW
760 t->ops[tm] = ops; f &= ~f_teardown; rc = 0;
761
762end:
763 if (f&f_teardown) ops->teardown(t);
764 return (rc);
765
766#undef f_teardown
b64eb60f
MW
767}
768
67b5031e
MW
769/* --- @select_timer@ --- *
770 *
771 * Arguments: @struct timer *t@ = timer structure
13ee7406
MW
772 * @unsigned tm@ = which subtimer we're setting
773 * @const char *config@, @size_t sz@ = config string
67b5031e 774 *
13ee7406 775 * Returns: Zero on success, %$-1$% if timer failed.
67b5031e
MW
776 *
777 * Use: Select a timer from the table. If the environment variable
778 * is set, then parse a comma-separated list of timer names and
779 * use the first one listed that seems to work; otherwise, try
780 * the timers in the table in order.
781 */
782
13ee7406
MW
783static int select_timer(struct timer *t, unsigned tm,
784 const char *config, size_t sz)
b64eb60f 785{
13ee7406
MW
786 const char *p, *l;
787 const struct timer_ops *ops, *const *tt;
b64eb60f 788
13ee7406
MW
789 if (!config) {
790 for (tt = timertab[tm].opstab; *tt; tt++)
791 if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0);
b64eb60f 792 } else {
13ee7406 793 l = config + sz;
b64eb60f 794 for (;;) {
13ee7406
MW
795 p = memchr(config, ',', l - config); if (!p) p = l;
796 ops = find_timer(config, p - config, tm);
797 if (ops && !try_timer(t, ops, tm)) return (0);
798 if (p >= l) break;
799 config = p + 1;
b64eb60f
MW
800 }
801 }
13ee7406 802 debug("no suitable %s timer found", timertab[tm].what); return (-1);
b64eb60f
MW
803}
804
67b5031e 805/* Bench timer operations. */
13ee7406
MW
806static void timer_describe(struct bench_timer *tm, dstr *d)
807{
808 struct timer *t = (struct timer *)tm;
809 unsigned i;
810
811 dstr_puts(d, "builtin: ");
812 for (i = 0; i < NTIMER; i++) {
813 if (i) dstr_puts(d, ", ");
814 dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
815 }
816}
817
b1a20bee
MW
818static int timer_preflight(struct bench_timer *tm)
819{
820 struct timer *t = (struct timer *)tm;
821 unsigned i;
822
823 for (i = 0; i < NTIMER; i++) if (t->ops[i]->preflight(t)) return (-1);
824 return (0);
825}
826
6e683a79
MW
827static int timer_now(struct bench_timer *tm,
828 struct bench_time *t_out, unsigned f)
b64eb60f
MW
829{
830 struct timer *t = (struct timer *)tm;
13ee7406 831 unsigned i;
b64eb60f 832
6e683a79
MW
833 t_out->f = 0;
834 for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1);
835 return (0);
836}
837
838static void timer_diff(struct bench_timer *tm,
839 struct bench_timing *t_out,
840 const struct bench_time *t0,
841 const struct bench_time *t1)
842{
843 struct timer *t = (struct timer *)tm;
844 unsigned i;
845
846 t_out->f = 0;
847 for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1);
b64eb60f 848}
13ee7406 849
b64eb60f
MW
850static void timer_destroy(struct bench_timer *tm)
851{
852 struct timer *t = (struct timer *)tm;
13ee7406 853 unsigned i;
b64eb60f
MW
854
855 if (!t) return;
13ee7406
MW
856 for (i = 0; i < NTIMER; i++)
857 if (t->ops[i]) t->ops[i]->teardown(t);
b1a20bee 858 x_free(t->a, t);
b64eb60f
MW
859}
860
13ee7406 861static const struct bench_timerops timer_ops =
b1a20bee 862 { timer_describe, timer_preflight, timer_now, timer_diff, timer_destroy };
b64eb60f 863
67b5031e
MW
864/* --- @bench_createtimer@ --- *
865 *
13ee7406 866 * Arguments: @const char *config@ = timer configuration string
67b5031e
MW
867 *
868 * Returns: A freshly constructed standard timer object.
869 *
870 * Use: Allocate a timer. Dispose of it by calling
871 * @tm->ops->destroy(tm)@ when you're done.
13ee7406
MW
872 *
873 * Applications should not set configuration strings except as
874 * established by user action, e.g., from a command-line option,
875 * environment variable, or configuration file.
67b5031e
MW
876 */
877
13ee7406 878struct bench_timer *bench_createtimer(const char *config)
b64eb60f
MW
879{
880 struct timer *t = 0;
881 struct bench_timer *ret = 0;
13ee7406
MW
882 struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
883 const struct timer_ops *const *tt;
884 const char *p, *l; size_t n, nn;
885 unsigned i;
b64eb60f 886
13ee7406
MW
887 /* Parse the configuration string. */
888 if (config) {
889
890 /* The first thing to do is find the end of the string. */
891 l = config + strlen(config);
892
893 for (;;) {
894 /* Process the whitespace-sparated words of the string one by one. */
895
896 /* Skip over any initial whitespace. If we hit the end of the string
897 * then we're done.
898 */
899 for (;;)
900 if (config >= l) goto done_config;
901 else if (!ISSPACE(*config)) break;
902 else config++;
903
904 /* There's definitely a word here. Find the end of it. */
905 for (p = config; p < l && !ISSPACE(*p); p++);
906 nn = p - config;
907
908 /* Try various simple keywords. */
909#define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))
910
911 if (MATCHP("list")) {
912 /* The `list' keyword requests lists of the available timer
913 * implementations.
914 */
915
916 for (i = 0; i < NTIMER; i++) {
917 printf("%s timers:", timertab[i].what);
918 for (tt = timertab[i].opstab; *tt; tt++)
b1a20bee 919 if (!((*tt)->f&TF_SECRET)) printf(" %s", (*tt)->name);
13ee7406
MW
920 putchar('\n');
921 }
922 goto next_config;
923 }
924
925#undef MATCHP
926
927 /* Otherwise it's an assignment, setting a subtimer list. */
928 p = memchr(config, '=', nn);
929 if (!p)
930 n = nn;
931 else {
932 n = p - config;
933 for (i = 0; i < NTIMER; i++)
934 if (STRNCMP(config, ==, timertab[i].what, n) &&
935 !timertab[i].what[n]) {
936 if (tmconf[i].p)
937 debug("duplicate %s timer list", timertab[i].what);
938 tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
939 goto next_config;
940 }
941 }
942 debug("unrecognized config keyword `%.*s'", (int)n, config);
943
944 /* Move on to the next word. */
945 next_config:
946 config += nn;
947 }
948 done_config:;
949 }
950
951 /* Override these settings from the environment. */
952 for (i = 0; i < NTIMER; i++) {
953 p = getenv(timertab[i].env);
954 if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
955 }
956
957 /* All seems well. Allocate the timer object. */
b1a20bee 958 XNEW(t); t->a = arena_global;
13ee7406
MW
959 for (i = 0; i < NTIMER; i++) t->ops[i] = 0;
960
961 /* Try to set up the subtimers. */
6e683a79 962 for (i = NTIMER; i--; )
13ee7406
MW
963 if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;
964
965 /* All is done. */
289651a7 966 t->_t.ops = &timer_ops; t->_t.ref = 1; ret = &t->_t; t = 0;
b64eb60f
MW
967end:
968 if (t) timer_destroy(&t->_t);
969 return (ret);
970}
971
67b5031e 972/*----- Benchmarking ------------------------------------------------------*/
b64eb60f 973
67b5031e
MW
974/* --- @bench_init@ --- *
975 *
976 * Arguments: @struct bench_state *b@ = bench state to initialize
13ee7406 977 * @struct bench_timer *tm@ = timer to attach, or null
67b5031e 978 *
13ee7406 979 * Returns: Zero on success, %$-1$% on failure.
67b5031e 980 *
13ee7406
MW
981 * Use: Initialize the benchmark state. On success, the timer state
982 * still needs to be calibrated (use @bench_calibrate@) before
983 * it can be used, but this will be done automatically by
984 * @bench_measure@ if it's not done by hand earlier. The timer
985 * is now owned by the benchmark state and will be destroyed by
986 * @bench_destroy@.
987 *
988 * The only reason for failure is if @tm@ was null on entry,
989 * and automatic construction of a timer failed. The state is
990 * safe to discard, but calling @bench_destroy@ is safe too.
67b5031e 991 */
b64eb60f 992
13ee7406
MW
993int bench_init(struct bench_state *b, struct bench_timer *tm)
994{
995 int rc;
996
997 b->tm = 0;
998
999 if (!tm) {
1000 tm = bench_createtimer(0);
1001 if (!tm) { rc = -1; goto end; }
1002 }
1003
1004 b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0;
1005end:
1006 return (rc);
1007}
b64eb60f 1008
67b5031e
MW
1009/* --- @bench_destroy@ --- *
1010 *
1011 * Arguments: @struct bench_state *b@ = bench state
1012 *
1013 * Returns: ---
1014 *
1015 * Use: Destroy the benchmark state, releasing the resources that it
1016 * holds.
1017 */
1018
b64eb60f 1019void bench_destroy(struct bench_state *b)
289651a7 1020 { if (b->tm && !--b->tm->ref) { b->tm->ops->destroy(b->tm); b->tm = 0; } }
b64eb60f 1021
b1a20bee 1022/* --- @spin@ --- *
67b5031e
MW
1023 *
1024 * Arguments: @unsigned long n@ = iteration count
1025 * @void *ctx@ = context pointer (ignored)
1026 *
1027 * Returns: ---
1028 *
1029 * Use: Does nothing at all for @n@ iterations. Used to calibrate
1030 * the benchmarking state.
1031 */
1032
b1a20bee 1033static void spin(unsigned long n, void *ctx)
b64eb60f
MW
1034 { while (n--) RELAX; }
1035
67b5031e
MW
1036/* --- @bench_calibrate@ --- *
1037 *
1038 * Arguments: @struct bench_state *b@ = bench state
b1a20bee 1039 * @unsigned f@ = calibration flags
67b5031e 1040 *
13ee7406 1041 * Returns: Zero on success, %$-1$% if calibration failed.
67b5031e
MW
1042 *
1043 * Use: Calibrate the benchmark state, so that it can be used to
1044 * measure performance reasonably accurately.
b1a20bee
MW
1045 *
1046 * Calibration will take into account how the subject code is
1047 * going to be located. If you're going to use @BENCH_MEASURE@
1048 * to measure a piece of literal code, then leave @f@ zero. If
1049 * the code to be measured is going to be executed via an
1050 * indirect branch, e.g., through the @measure@ function, then
1051 * set @BTF_INDIRECT@.
67b5031e
MW
1052 */
1053
13ee7406
MW
1054#define T_CLB 0.0625 /* calibration time limit */
1055
b1a20bee 1056int bench_calibrate(struct bench_state *b, unsigned f)
b64eb60f
MW
1057{
1058 struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
b1a20bee 1059 struct bench_timer *tm = b->tm;
b64eb60f 1060 struct bench_timing delta;
6e683a79 1061 double n, r;
b1a20bee
MW
1062 unsigned i, tf = BTF_ANY;
1063 BENCH_TIMELOOP_DECLS;
b64eb60f
MW
1064 int rc;
1065
67b5031e
MW
1066 /* The model here is that a timing loop has a fixed overhead as we enter
1067 * and leave (e.g., to do with the indirect branch into the code), and
1068 * per-iteration overheads as we check the counter and loop back. We aim
1069 * to split these apart using linear regression.
1070 */
1071
1072 /* If we've already calibrated then there's nothing to do. */
13ee7406 1073 if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);
b64eb60f 1074
b1a20bee
MW
1075 /* Run the timer preflight check. */
1076 if (tm->ops->preflight(tm)) { rc = -1; goto end; }
1077
1078 /* Exercise the inner loop a few times to educate the branch predictor.
1079 * This is only useful if we're executing via an indirect call.
1080 */
1081 if (f&BTF_INDIRECT) {
1082 for (i = 0; i < 50; i++)
289651a7 1083 BENCH_TIMELOOP_TAG(setup, b->tm, &delta, 10000, ;)
b1a20bee
MW
1084 LAUNDER(&spin)(_bench_n, 0);
1085 }
b64eb60f 1086
67b5031e
MW
1087 /* Now we measure idle loops until they take sufficiently long -- or we run
1088 * out of counter.
1089 */
b64eb60f 1090 debug("calibrating...");
6e683a79 1091 n = 1.0;
b64eb60f 1092 for (;;) {
67b5031e
MW
1093
1094 /* Measure @n@ iterations of the idle loop. */
b1a20bee 1095 if (f&BTF_INDIRECT)
289651a7 1096 BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
b1a20bee
MW
1097 LAUNDER(&spin)(_bench_n, 0);
1098 else
289651a7 1099 BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;)
b1a20bee
MW
1100 while (_bench_n--) RELAX;
1101 tf &= delta.f; if (!(tf&BTF_TIMEOK)) { rc = -1; goto end; }
67b5031e
MW
1102
1103 /* Register the timings with the regression machinery. */
b64eb60f 1104 linreg_update(&lr_clk, n, delta.t);
b1a20bee 1105 if (!(tf&BTF_CYOK))
6e683a79 1106 debug(" n = %10.0f; t = %12g s", n, delta.t);
b64eb60f
MW
1107 else {
1108 linreg_update(&lr_cy, n, delta.cy);
6e683a79 1109 debug(" n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
b64eb60f 1110 }
67b5031e
MW
1111
1112 /* If we're done then stop. */
13ee7406 1113 if (delta.t >= T_CLB) break;
b64eb60f 1114 if (n >= ULONG_MAX - n/3) break;
67b5031e
MW
1115
1116 /* Update the counter and continue. */
6e683a79 1117 n += n/3.0 + 1.0;
b64eb60f
MW
1118 }
1119
67b5031e
MW
1120 /* Now run the linear regression to extract the constant and per-iteration
1121 * overheads.
1122 */
13ee7406
MW
1123 linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
1124 debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
b1a20bee 1125 if (tf&BTF_CYOK) {
13ee7406
MW
1126 linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
1127 debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
b64eb60f 1128 }
67b5031e
MW
1129
1130 /* We're done. */
13ee7406 1131 rc = 0;
b64eb60f 1132end:
b1a20bee 1133 b->f |= tf | BTF_CLB; /* no point trying again */
b64eb60f
MW
1134 return (rc);
1135}
1136
b1a20bee 1137/* --- @bench_preflight@ --- *
67b5031e 1138 *
8d372122 1139 * Arguments: @struct bench_state *b@ = benchmark state
67b5031e 1140 *
b1a20bee 1141 * Returns: Zero on success, %$-1$% on failure.
67b5031e 1142 *
b1a20bee
MW
1143 * Use: Prepares for benchmarking on the current thread. Current
1144 * checks are that the timer is calibrated and that it can
1145 * successfully measure time; the timer preflight is also run.
67b5031e 1146 *
b1a20bee
MW
1147 * Users are unlikely to find this function useful: it's called
1148 * automatically by the @BENCH_MEASURE@ macro and the
1149 * @bench_measure@ function.
67b5031e
MW
1150 */
1151
b1a20bee 1152int bench_preflight(struct bench_state *b)
b64eb60f 1153{
b1a20bee 1154 struct bench_timer *tm = b->tm;
b64eb60f 1155
b1a20bee 1156 if (!(b->f&BTF_CLB)) return (-1);
8d372122 1157 if (!(b->f&BTF_TIMEOK)) return (-1);
b1a20bee
MW
1158 if (tm->ops->preflight(tm)) return (-1);
1159 debug("measuring...");
1160 return (0);
1161}
1162
1163/* --- @bench_adapt@ --- *
1164 *
289651a7
MW
1165 * Arguments: @double *n_inout@ = number of iterations, updated
1166 * @double target_s@ = target time in seconds
b1a20bee
MW
1167 * @const struct bench_timing *t@ = timing from the previous run
1168 *
1169 * Returns: Nonzero if the measurement is sufficient; zero to run again.
1170 *
1171 * Use: This function determines a suitable number of iterations of a
1172 * benchmark function to perform next. It is used in a loop
1173 * such as the following.
1174 *
1175 * @double n = 1.0;@
1176 * @struct bench_timing t;@
1177 *
1178 * @do {@
1179 * (run @n@ iterations; set @t@ to the timing)
1180 * @} while (!bench_adapt(b, &n, &t));@
1181 *
1182 * On entry, @*n_inout@ should be the number of iterations
1183 * performed by the previous pass, and @*t@ the resulting time;
1184 * the @BTF_TIMEOK@ flag must be set @t->f@. If the timing is
289651a7 1185 * sufficient -- @t->t@ is sufficiently close to @target_s@
b1a20bee
MW
1186 * -- then the function returns nonzero to indicate that
1187 * measurement is complete. Otherwise, it sets @*n_inout@ to a
1188 * new, larger iteration count and returns zero to indicate that
1189 * a further pass is necessary.
1190 */
67b5031e 1191
289651a7
MW
1192int bench_adapt(double *n_inout, double target_s,
1193 const struct bench_timing *t)
b1a20bee
MW
1194{
1195 double n = *n_inout, nn;
1196
1197 /* Dump the results for debugging. */
1198 if (!(t->f&BTF_CYOK)) debug(" n = %10.0f; t = %12g", n, t->t);
1199 else debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t->t, t->cy);
1200
1201 /* Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal
c81c35df
MW
1202 * time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy.
1203 * Otherwise, we need to scale up the iteration count. The obvious next
1204 * choice is %$n' = n T/t$%. Alas, rounding is a problem: if
1205 * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
1206 * progress. We know that %$T/t > \sqrt{2}%, so this can only happen when
1207 * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%. On the other
1208 * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
1209 * again with %$n' = n + 1$% iterations will very likely work.
1210 */
289651a7
MW
1211 if (t->t >= 0.707*target_s) return (1);
1212 nn = n*target_s/t->t; modf(nn, &nn);
b1a20bee
MW
1213 *n_inout = nn > n ? nn : n + 1;
1214 return (0);
1215}
6e683a79 1216
b1a20bee
MW
1217/* --- @bench_adjust@ --- *
1218 *
1219 * Arguments: @struct bench_state *b@ = benchmark state
1220 * @struct bench_timing *t_inout@ = timing to adjust
1221 * @double n@ = number of external iterations performed
1222 * @double base@ = number of internal operations per external
1223 * iteration
1224 *
1225 * Returns: ---
1226 *
1227 * Use: Adjusts a raw timing, as captured by @BENCH_TIMELOOP@,
1228 * according to the calibration data captured in @b@.
1229 * On exit, the timing data is updated, and @t->n@ is set to the
1230 * product @n*base@.
1231 */
1232
1233void bench_adjust(struct bench_state *b,
1234 struct bench_timing *t_inout, double n, double base)
1235{
67b5031e
MW
1236
1237 /* Adjust according to the calibration. */
b1a20bee
MW
1238 t_inout->t -= n*b->clk.m + b->clk.c;
1239 if (t_inout->f&BTF_CYOK) t_inout->cy -= n*b->cy.m + b->cy.c;
67b5031e
MW
1240
1241 /* Report the results, if debugging. */
b1a20bee
MW
1242 if (!(t_inout->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_inout->t);
1243 else debug(" adjusted t' = %12g, cy' = %10.0f", t_inout->t, t_inout->cy);
1244 if (!(t_inout->f&BTF_CYOK))
1245 debug(" %g s per iter; %g iters/s", t_inout->t/n, n/t_inout->t);
b64eb60f 1246 else
b1a20bee
MW
1247 debug(" %g s (%g cy) per iter; %g iters/s",
1248 t_inout->t/n, t_inout->cy/n, n/t_inout->t);
67b5031e
MW
1249
1250 /* All done. */
b1a20bee
MW
1251 t_inout->n = n*base;
1252}
1253
1254/* --- @bench_measure@ --- *
1255 *
1256 * Arguments: @struct bench_state *b@ = benchmark state
1257 * @struct bench_timing *t_out@ = where to leave the timing
1258 * @double base@ = number of internal units per call
1259 * @bench_fn *fn@, @void *ctx@ = benchmark function to run
1260 *
1261 * Returns: Zero on success, %$-1$% if timing failed.
1262 *
1263 * Use: Measure a function. The function @fn@ is called adaptively
1264 * with an iteration count @n@ set so as to run for
1265 * approximately @b->target_s@ seconds.
1266 *
1267 * The result is left in @*t_out@, with @t_out->n@ counting the
1268 * final product of the iteration count and @base@ (which might,
1269 * e.g., reflect the number of inner iterations the function
1270 * performs, or the number of bytes it processes per iteration).
1271 *
1272 * To get useful results, the benchmark state should have been
1273 * calibrated for indirect calling -- i.e., with @BTF_INDIRECT@.
1274 */
1275
1276int bench_measure(struct bench_state *b, struct bench_timing *t_out,
1277 double base, bench_fn *fn, void *ctx)
1278{
1279 BENCH_MEASURE_DECLS;
1280 int rc;
1281
289651a7 1282 BENCH_MEASURE(b, rc, t_out, base) fn(_bench_n, ctx);
b1a20bee
MW
1283 return (rc);
1284}
1285
1286/*----- Reporting ---------------------------------------------------------*/
1287
1288/* --- @bench_report@ --- *
1289 *
1290 * Arguments: @const struct gprintf_ops *gops, void *gp@ = output formatter
1291 * @unsigned unit@ = unit processed by the benchmark function
1292 * @const struct bench_timing *t@ = benchmark result
1293 *
1294 * Returns: ---
1295 *
1296 * Use: Format, to the output identified by @gops@ and @go@, a
1297 * human-readable report of the benchmarking result @t@. No
1298 * newline is appended.
1299 *
1300 * The output format is subject to change in later versions.
1301 */
1302
1303void bench_report(const struct gprintf_ops *gops, void *go,
1304 unsigned unit, const struct bench_timing *t)
1305{
1306 double scale, x, n = t->n;
1307 const char *u, *what, *whats;
1308
1309 assert(t->f&BTF_TIMEOK);
1310
1311 switch (unit) {
1312 case BTU_OP:
1313 gprintf(gops, go, "%.0f iterations ", n);
1314 what = "op"; whats = "ops"; scale = 1000;
1315 break;
1316 case BTU_BYTE:
1317 x = n; normalize(&x, &u, 1024); gprintf(gops, go, "%.3f %sB ", x, u);
1318 what = whats = "B"; scale = 1024;
1319 break;
1320 default:
1321 assert(0);
1322 }
1323
1324 x = t->t; normalize(&x, &u, 1000);
1325 gprintf(gops, go, "in %.3f %ss", x, u);
1326 if (t->f&BTF_CYOK) {
1327 x = t->cy; normalize(&x, &u, 1000);
1328 gprintf(gops, go, " (%.3f %scy)", x, u);
1329 }
1330 gprintf(gops, go, ": ");
1331
1332 x = n/t->t; normalize(&x, &u, scale);
1333 gprintf(gops, go, "%.3f %s%s/s", x, u, whats);
1334 x = t->t/n; normalize(&x, &u, 1000);
1335 gprintf(gops, go, ", %.3f %ss/%s", x, u, what);
1336 if (t->f&BTF_CYOK) {
1337 x = t->cy/n; normalize(&x, &u, 1000);
1338 gprintf(gops, go, " (%.3f %scy/%s)", x, u, what);
1339 }
b64eb60f
MW
1340}
1341
1342/*----- That's all, folks -------------------------------------------------*/