@@@ misc mess
[mLib] / test / bench.c
/* -*-c-*-
 *
 * Benchmarking support
 *
 * (c) 2023 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

/*----- Header files ------------------------------------------------------*/

#include "config.h"

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "alloc.h"
#include "bench.h"
#include "bits.h"
#include "dstr.h"
#include "linreg.h"
#include "macros.h"

#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
# include <cpuid.h>
# define CPUID_1D_TSC (1u << 4)
# define CPUID_1xD_TSCP (1u << 27)
#endif

#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
# include <sys/types.h>
# include <unistd.h>
# include <linux/perf_event.h>
# include <asm/unistd.h>
# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
#  include <sys/mman.h>
# endif
#endif

/*----- Data structures ---------------------------------------------------*/

enum { CLK, CY, NTIMER };

struct timer {
  struct bench_timer _t;
  const struct timer_ops *ops[NTIMER];  /* subtimers for clock and cycles */
  union {
    unsigned tscaux;                    /* `ia32_tsc_aux' for `rdtscp' */
    int fd;                             /* vanilla `perf_event_open' */
    struct { const volatile void *map; size_t sz; } pmc; /* `perf_event_open'
                                                           * with `rdpmc' */
  } u_cy;                               /* state for cycle measurement */
};

struct timer_ops {
  const char *name;                     /* timer name */
  unsigned f;                           /* flags */
#define TF_SECRET 1u                    /* don't try this automatically */
  int (*init)(struct timer */*t*/);     /* initialization function */
  int (*now)(struct timer */*t*/,       /* read current */
             struct bench_time */*t_out*/, unsigned /*f*/);
  void (*diff)(struct timer */*t*/,     /* difference */
               struct bench_timing */*t_inout*/,
               const struct bench_time */*t0*/,
               const struct bench_time */*t1*/);
  void (*teardown)(struct timer */*t*/); /* release held resources */
};

/*----- Preliminaries -----------------------------------------------------*/

#define NS_PER_S 1000000000

/* --- @debug@ --- *
 *
 * Arguments:   @const char *fmt@ = format control string
 *              @...@ = format arguments
 *
 * Returns:     ---
 *
 * Use:         Maybe report a debugging message to standard error.
 */

static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...)
{
  const char *p;
  va_list ap;

  p = getenv("MLIB_BENCH_DEBUG");
  if (p && *p != 'n' && *p != '0') {
    va_start(ap, fmt);
    fputs("mLib BENCH: ", stderr);
    vfprintf(stderr, fmt, ap);
    fputc('\n', stderr);
    va_end(ap);
  }
}

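/* Illustrative examples (the program name `mytest' is hypothetical): running
 *
 *      MLIB_BENCH_DEBUG=1 ./mytest
 *
 * turns the tracing on, while `MLIB_BENCH_DEBUG=no' or `MLIB_BENCH_DEBUG=0'
 * keeps it quiet; any value not beginning with `n' or `0' enables it.
 */
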
/*----- Difference utilities ----------------------------------------------*/

#ifdef HAVE_UINT64
# define FLOATK64(k) ((double)(k).i)
#else
# define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
#endif

/* --- @diff_ts@ --- *
 *
 * Arguments:   @struct timer *t@ = timer structure
 *              @struct bench_timing *delta_inout@ = where to put the result
 *              @const struct time *t0, *t1@ = two input times
 *
 * Returns:     ---
 *
 * Use:         Calculates a time difference for timers using the
 *              @struct timespec@-like time format.
 */

static void diff_ts(struct timer *t, struct bench_timing *delta_inout,
                    const struct bench_time *t0, const struct bench_time *t1)
{
  unsigned f = t0->f&t1->f;
  kludge64 k;

  if (f&BTF_TIMEOK) {

    /* Calculate the integer difference in seconds. */
    SUB64(k, t1->t.ts.s, t0->t.ts.s);

    /* And apply the nanoseconds difference.  To prevent underflow,
     * pre-emptively borrow one from the integer difference.
     */
    delta_inout->t =
      FLOATK64(k) - 1.0 +
      (t1->t.ts.ns + NS_PER_S - t0->t.ts.ns)/(double)NS_PER_S;

    /* Done. */
    delta_inout->f |= BTF_TIMEOK;
  }
}
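
/* Worked example (illustrative): with @t0@ = 5 s + 800000000 ns and
 * @t1@ = 7 s + 100000000 ns, the integer difference is 2 s; borrowing one
 * second gives 2.0 - 1.0 + (100000000 + 1000000000 - 800000000)/1e9 =
 * 1.3 s, which is the correct elapsed time.
 */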

/* --- @diff_cycles@ --- *
 *
 * Arguments:   @struct timer *t@ = timer structure
 *              @struct bench_timing *delta_inout@ = where to put the result
 *              @const struct time *t0, *t1@ = two input times
 *
 * Returns:     ---
 *
 * Use:         Calculates a time difference for cycle-counting timers.
 */

static void diff_cycles(struct timer *t, struct bench_timing *delta_inout,
                        const struct bench_time *t0,
                        const struct bench_time *t1)
{
  unsigned f = t0->f&t1->f;
  kludge64 k;

  if (f&BTF_CYOK) {
    SUB64(k, t1->cy, t0->cy); delta_inout->cy = FLOATK64(k);
    delta_inout->f |= BTF_CYOK;
  }
}

#undef FLOATK64

/*----- The null timer ----------------------------------------------------*/

/* This is a timer which does nothing, in case we don't have any better
 * ideas.
 */

static int null_init(struct timer *t) { return (0); }
static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
  { return (0); }
static void null_diff(struct timer *t, struct bench_timing *delta_inout,
                      const struct bench_time *t0,
                      const struct bench_time *t1)
  { ; }
static void null_teardown(struct timer *t) { ; }

static const struct timer_ops null_ops =
  { "null", 0, null_init, null_now, null_diff, null_teardown };
#define NULL_ENT &null_ops,

/*----- The broken clock --------------------------------------------------*/

/* This is a cycle counter which does nothing, in case we don't have any
 * better ideas.
 */

static int broken_init(struct timer *t) { return (-1); }

static const struct timer_ops broken_ops =
  { "broken", TF_SECRET, broken_init, null_now, null_diff, null_teardown };
#define BROKEN_ENT &broken_ops,

/*----- Linux performance counters ----------------------------------------*/

/* This is a cycle counter which uses the Linux performance event system,
 * which is probably the best choice if it's available.
 */

#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)

/* --- @perfevent_open@ --- *
 *
 * Arguments:   ---
 *
 * Returns:     File descriptor, or %$-1$%.
 *
 * Use:         Open a performance measurement descriptor set up to count CPU
 *              cycles.
 */

static int perfevent_open(void)
{
  struct perf_event_attr attr = { 0 };
  int fd;

  attr.type = PERF_TYPE_HARDWARE;
  attr.size = sizeof(attr);
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.disabled = 0;
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;

  fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  if (fd < 0) {
    debug("couldn't open perf event: %s", strerror(errno));
    return (-1);
  }

  return (fd);
}

static int perfevent_now(struct timer *t,
                         struct bench_time *t_out, unsigned f)
{
  ssize_t n;

  n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
  if (n != sizeof(t_out->cy.i)) {
    debug("failed to read perf-event counter: %s", strerror(errno));
    return (0);
  }
  t_out->f |= BTF_CYOK; return (0);
}

static void perfevent_teardown(struct timer *t)
  { close(t->u_cy.fd); }

static int perfevent_init(struct timer *t)
{
  struct bench_time tm;
  int fd = -1, rc;

  fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; }

  t->u_cy.fd = fd; tm.f = 0; perfevent_now(t, &tm, 0);
  if (!(tm.f&BTF_CYOK)) { rc = -1; goto end; }
  fd = -1; rc = 0;
end:
  if (fd != -1) close(fd);
  return (rc);
}

static const struct timer_ops perfevent_ops =
  { "linux-perf-read-hw-cycles", 0,
    perfevent_init, perfevent_now, diff_cycles, perfevent_teardown };
#define PERFEVENT_VANILLA_CYENT &perfevent_ops,

# if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))

/* Special syscall-free version for x86 using the `rdpmc' instruction.
 *
 * This is a bit weird because it does both kinds of measurement in a single
 * operation.
 */

static int perfevrdpmc_now(struct timer *t,
                           struct bench_time *t_out, unsigned f)
{
  const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map;
  unsigned long long tsc = tsc, toff = toff, tenb = tenb;
  unsigned long long cy = cy, cyoff = cyoff;
  unsigned long long m, hi, lo;
  unsigned tshift = tshift, tmult = tmult, q0, q1, ff;

  /* Repeat until we can complete this job without the buffer changing in the
   * middle.
   */
  q0 = map->lock;
  __atomic_thread_fence(__ATOMIC_ACQ_REL);
  for (;;) {
    ff = 0;

    /* Read the passage-of-time information. */
    if (map->cap_user_time) {
      tenb = map->time_enabled;
      tsc = __builtin_ia32_rdtsc();
      tshift = map->time_shift;
      tmult = map->time_mult;
      toff = map->time_offset;
      ff |= BTF_TIMEOK;
    }

    /* Read the performance-counter information. */
    if (map->cap_user_rdpmc) {
      cy = __builtin_ia32_rdpmc(map->index - 1);
      cyoff = map->offset;
      ff |= BTF_CYOK;
    }

    /* Check the sequence number again. */
    __atomic_thread_fence(__ATOMIC_ACQ_REL);
    q1 = map->lock;
    if (q0 == q1) break;
    q0 = q1;
  }

  if (ff&BTF_TIMEOK) {
    /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
     * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
     * nanoseconds.
     */
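    /* (Added note: the multiplication below is done in two halves, splitting
     * @tsc@ at bit @tshift@, using
     * %$n a/2^w = \lfloor n/2^w \rfloor a + (n \bmod 2^w) a/2^w$%,
     * so the intermediate products stay within 64 bits for realistic values
     * of @tsc@.)
     */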

    m = (1ull << tshift) - 1;
    hi = tsc >> tshift; lo = tsc&m;
    t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb;
    t_out->f |= BTF_TIMEOK;
  }

  if (ff&BTF_CYOK) {
    /* We have the cycle count. */

    t_out->cy.i = cy + cyoff;
    t_out->f |= BTF_CYOK;
  }
  return (0);
}

static void perfevrdpmc_diff(struct timer *t,
                             struct bench_timing *delta_inout,
                             const struct bench_time *t0,
                             const struct bench_time *t1)
{
  unsigned f = t0->f&t1->f;

  if (f&BTF_TIMEOK) {
    delta_inout->t = (t1->t.rawns.i - t0->t.rawns.i)/(double)NS_PER_S;
    delta_inout->f |= BTF_TIMEOK;
  }

  if (f&BTF_CYOK) {
    delta_inout->cy = t1->cy.i - t0->cy.i;
    delta_inout->f |= BTF_CYOK;
  }
}

static void perfevrdpmc_teardown(struct timer *t)
  { munmap((/*unconst unvolatile*/ void *)t->u_cy.pmc.map, t->u_cy.pmc.sz); }

static int perfevrdpmc_cyinit(struct timer *t)
{
  const volatile struct perf_event_mmap_page *map = 0;
  unsigned a, b, c, d, q0, q1, f;
  int pgsz, mapsz, fd = -1, rc;

  /* We need `rdtsc' to do the passage-of-time measurement. */
  if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
    { debug("no `rdtsc' instruction"); return (-1); }

  /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
   * big a page is.
   */
  pgsz = sysconf(_SC_PAGESIZE);
  if (pgsz < 0) {
    debug("failed to discover page size!: %s", strerror(errno));
    rc = -1; goto end;
  }

  /* Open the measurement descriptor and map it. */
  fd = perfevent_open(); if (fd < 0) return (-1);
  mapsz = 2*pgsz;
  map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0);
  if (map == MAP_FAILED) {
    debug("failed to map perf event: %s", strerror(errno));
    map = 0; rc = -1; goto end;
  }

  /* Check that it's revealed the necessary information. */
  q0 = map->lock;
  __atomic_thread_fence(__ATOMIC_ACQ_REL);
  for (;;) {
    f = 0;
    if (map->cap_user_time) f |= BTF_TIMEOK;
    if (map->cap_user_rdpmc) f |= BTF_CYOK;
    __atomic_thread_fence(__ATOMIC_ACQ_REL);
    q1 = map->lock;
    if (q0 == q1) break;
    q0 = q1;
  }
  if (!(f&BTF_TIMEOK))
    { debug("kernel refused user time measurement"); rc = -1; goto end; }
  if (!(f&BTF_CYOK))
    { debug("kernel refused user cycle measurement"); rc = -1; goto end; }

  /* All done.  We can close the descriptor here: the mapping will keep the
   * performance-measurement machinery alive.
   */
  t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0; rc = 0;
end:
  if (fd != -1) close(fd);
  if (map) munmap((/*unconst unvolatile*/ void *)map, mapsz);
  return (rc);
}

static const struct timer_ops perfevrdpmc_cyops =
  { "linux-x86-perf-rdpmc-hw-cycles", 0,
    perfevrdpmc_cyinit, perfevrdpmc_now,
    perfevrdpmc_diff, perfevrdpmc_teardown };

static int perfevrdpmc_clkinit(struct timer *t)
{
  if (t->ops[CY] != &perfevrdpmc_cyops) {
    debug("linux-x86-perf-rdpmc-hw-cycles not set as cycle subtimer");
    return (-1);
  }
  return (0);
}

static const struct timer_ops perfevrdpmc_clkops =
  { "linux-x86-perf-rdpmc-hw-cycles", 0,
    perfevrdpmc_clkinit, null_now,
    null_diff, null_teardown };

# define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
# define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,

# else
# define PERFEVENT_RDPMC_CLKENT
# define PERFEVENT_RDPMC_CYENT
# endif

# define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
# define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
#else
# define PERFEVENT_CLKENT
# define PERFEVENT_CYENT
#endif

/*----- Intel time-stamp counter ------------------------------------------*/

/* This is a cycle counter based on the Intel `rdtsc' instruction.  It's not
 * really suitable for performance measurement because it gets confused by
 * CPU frequency adjustments.
 */

#if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))

static int x86rdtsc_now(struct timer *t,
                        struct bench_time *t_out, unsigned f)
  { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); }

static int x86rdtsc_init(struct timer *t)
{
  unsigned a, b, c, d;

  if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC))
    { debug("no `rdtsc' instruction"); return (-1); }
  t->u_cy.tscaux = ~0u;
  return (0);
}

static int x86rdtscp_now(struct timer *t,
                         struct bench_time *t_out, unsigned f)
{
  unsigned tscaux;
  unsigned long long n;

  n = __builtin_ia32_rdtscp(&tscaux);
  if (!(f&BTF_T1))
    t->u_cy.tscaux = tscaux;
  else if (t->u_cy.tscaux != tscaux) {
    debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
          tscaux, t->u_cy.tscaux);
    return (-1);
  }
  t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0);
}

static int x86rdtscp_init(struct timer *t)
{
  unsigned a, b, c, d;

  if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP))
    { debug("no `rdtscp' instruction"); return (-1); }
  return (0);
}

static const struct timer_ops x86rdtsc_ops =
  { "x86-rdtsc", 0,
    x86rdtsc_init, x86rdtsc_now, diff_cycles, null_teardown };
static const struct timer_ops x86rdtscp_ops =
  { "x86-rdtscp", 0,
    x86rdtscp_init, x86rdtscp_now, diff_cycles, null_teardown };

# define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
#else
# define X86RDTSC_CYENT
#endif

/*----- POSIX `clock_gettime' ---------------------------------------------*/

/* This is a real-time clock based on the POSIX time interface, with up to
 * nanosecond precision.
 */

#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)

static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
  struct timespec now;

  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
    { debug("error reading POSIX clock: %s", strerror(errno)); return (0); }
  ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec;
  t_out->f |= BTF_TIMEOK; return (0);
}

static int gettime_init(struct timer *t)
{
  struct bench_time tm;

  tm.f = 0; gettime_now(t, &tm, 0); if (!(tm.f&BTF_TIMEOK)) return (-1);
  return (0);
}

static const struct timer_ops gettime_ops =
  { "posix-thread-cputime", 0,
    gettime_init, gettime_now, diff_ts, null_teardown };

# define GETTIME_CLKENT &gettime_ops,
#else
# define GETTIME_CLKENT
#endif

/*----- Standard C `clock' ------------------------------------------------*/

/* This is a real-time clock based on the C `clock' function which is
 * guaranteed to be available, though it's not likely to be very good.
 */

static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f)
{
  clock_t now;

  now = clock();
  if (now == (clock_t)-1) {
    debug("error reading standard clock: %s", strerror(errno));
    return (0);
  }
  t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0);
}

static void clock_diff(struct timer *t, struct bench_timing *delta_inout,
                       const struct bench_time *t0,
                       const struct bench_time *t1)
{
  unsigned f = t0->f&t1->f;

  if (f&BTF_TIMEOK) {
    delta_inout->t = (t1->t.clk - t0->t.clk)/(double)CLOCKS_PER_SEC;
    delta_inout->f |= BTF_TIMEOK;
  }
}

static int clock_init(struct timer *t)
{
  struct bench_time tm;

  tm.f = 0; clock_now(t, &tm, 0); if (!(tm.f&BTF_TIMEOK)) return (-1);
  return (0);
}

static const struct timer_ops clock_ops =
  { "stdc-clock", 0, clock_init, clock_now, clock_diff, null_teardown };

#define CLOCK_CLKENT &clock_ops,

/*----- Timing setup ------------------------------------------------------*/

/* Tables of timing sources. */
static const struct timer_ops
  *const clktab[] = { PERFEVENT_CLKENT
                      GETTIME_CLKENT
                      CLOCK_CLKENT
                      BROKEN_ENT
                      0 },
  *const cytab[] = { PERFEVENT_CYENT
                     X86RDTSC_CYENT
                     NULL_ENT
                     BROKEN_ENT
                     0 };

static const struct timertab {
  const char *what;
  const char *env;
  const struct timer_ops *const *opstab;
} timertab[] = {
  { "clock",    "MLIB_BENCH_CLKTIMER",          clktab },
  { "cycle",    "MLIB_BENCH_CYCLETIMER",        cytab }
};

/* --- @find_timer@ --- *
 *
 * Arguments:   @const char *name@ = timer name
 *              @size_t sz@ = length of name
 *              @unsigned tm@ = which subtimer we're looking for
 *
 * Returns:     The table entry matching the given name, or null if there
 *              isn't one.
 */

static const struct timer_ops *find_timer(const char *name, size_t sz,
                                          unsigned tm)
{
  const struct timer_ops *const *tt;

  for (tt = timertab[tm].opstab; *tt; tt++) {
    if (strlen((*tt)->name) == sz &&
        MEMCMP(name, ==, (*tt)->name, sz))
      return (*tt);
  }
  debug("%s timer `%.*s' not found",
        timertab[tm].what, (int)sz, name); return (0);
}

/* --- @try_timer@ --- *
 *
 * Arguments:   @struct timer *t@ = timer structure
 *              @const struct timer_ops *ops@ = timer ops
 *              @unsigned tm@ = which subtimer we're setting
 *
 * Returns:     Zero on success, %$-1$% if timer failed.
 *
 * Use:         Tries to initialize the timer @t@, reporting a debug message
 *              if it worked.
 */

static int try_timer(struct timer *t,
                     const struct timer_ops *ops, unsigned tm)
{
  if (ops->init(t)) return (-1);
  debug("selected %s timer `%s'", timertab[tm].what, ops->name);
  t->ops[tm] = ops; return (0);
}

/* --- @select_timer@ --- *
 *
 * Arguments:   @struct timer *t@ = timer structure
 *              @unsigned tm@ = which subtimer we're setting
 *              @const char *config@, @size_t sz@ = config string
 *
 * Returns:     Zero on success, %$-1$% if timer failed.
 *
 * Use:         Select a timer from the table.  If a config string is
 *              supplied, then parse it as a comma-separated list of timer
 *              names and use the first one listed that seems to work;
 *              otherwise, try the timers in the table in order.
 */

static int select_timer(struct timer *t, unsigned tm,
                        const char *config, size_t sz)
{
  const char *p, *l;
  const struct timer_ops *ops, *const *tt;

  if (!config) {
    for (tt = timertab[tm].opstab; *tt; tt++)
      if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0);
  } else {
    l = config + sz;
    for (;;) {
      p = memchr(config, ',', l - config); if (!p) p = l;
      ops = find_timer(config, p - config, tm);
      if (ops && !try_timer(t, ops, tm)) return (0);
      if (p >= l) break;
      config = p + 1;
    }
  }
  debug("no suitable %s timer found", timertab[tm].what); return (-1);
}
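
/* Illustrative example (the program name `mytest' is hypothetical): a user
 * could run
 *
 *      MLIB_BENCH_CYCLETIMER=x86-rdtscp,x86-rdtsc ./mytest
 *
 * to prefer the `rdtscp' cycle counter and fall back to plain `rdtsc';
 * `MLIB_BENCH_CLKTIMER' works the same way for the clock subtimer.
 */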

/* Bench timer operations. */
static void timer_describe(struct bench_timer *tm, dstr *d)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  dstr_puts(d, "builtin: ");
  for (i = 0; i < NTIMER; i++) {
    if (i) dstr_puts(d, ", ");
    dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name);
  }
}

static int timer_now(struct bench_timer *tm,
                     struct bench_time *t_out, unsigned f)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  t_out->f = 0;
  for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1);
  return (0);
}

static void timer_diff(struct bench_timer *tm,
                       struct bench_timing *t_out,
                       const struct bench_time *t0,
                       const struct bench_time *t1)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  t_out->f = 0;
  for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1);
}

static void timer_destroy(struct bench_timer *tm)
{
  struct timer *t = (struct timer *)tm;
  unsigned i;

  if (!t) return;
  for (i = 0; i < NTIMER; i++)
    if (t->ops[i]) t->ops[i]->teardown(t);
  xfree(t);
}

static const struct bench_timerops timer_ops =
  { timer_describe, timer_now, timer_diff, timer_destroy };

/* --- @bench_createtimer@ --- *
 *
 * Arguments:   @const char *config@ = timer configuration string
 *
 * Returns:     A freshly constructed standard timer object.
 *
 * Use:         Allocate a timer.  Dispose of it by calling
 *              @tm->ops->destroy(tm)@ when you're done.
 *
 *              Applications should not set configuration strings except as
 *              established by user action, e.g., from a command-line option,
 *              environment variable, or configuration file.
 */

struct bench_timer *bench_createtimer(const char *config)
{
  struct timer *t = 0;
  struct bench_timer *ret = 0;
  struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 };
  const struct timer_ops *const *tt;
  const char *p, *l; size_t n, nn;
  unsigned i;

  /* Parse the configuration string. */
  if (config) {

    /* The first thing to do is find the end of the string. */
    l = config + strlen(config);

    for (;;) {
      /* Process the whitespace-separated words of the string one by one. */

      /* Skip over any initial whitespace.  If we hit the end of the string
       * then we're done.
       */
      for (;;)
        if (config >= l) goto done_config;
        else if (!ISSPACE(*config)) break;
        else config++;

      /* There's definitely a word here.  Find the end of it. */
      for (p = config; p < l && !ISSPACE(*p); p++);
      nn = p - config;

      /* Try various simple keywords. */
#define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))

      if (MATCHP("list")) {
        /* The `list' keyword requests lists of the available timer
         * implementations.
         */

        for (i = 0; i < NTIMER; i++) {
          printf("%s timers:", timertab[i].what);
          for (tt = timertab[i].opstab; *tt; tt++)
            if (!((*tt)->f&TF_SECRET)) printf(" %s", (*tt)->name);
          putchar('\n');
        }
        goto next_config;
      }

#undef MATCHP

      /* Otherwise it's an assignment, setting a subtimer list. */
      p = memchr(config, '=', nn);
      if (!p)
        n = nn;
      else {
        n = p - config;
        for (i = 0; i < NTIMER; i++)
          if (STRNCMP(config, ==, timertab[i].what, n) &&
              !timertab[i].what[n]) {
            if (tmconf[i].p)
              debug("duplicate %s timer list", timertab[i].what);
            tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1;
            goto next_config;
          }
      }
      debug("unrecognized config keyword `%.*s'", (int)n, config);

      /* Move on to the next word. */
    next_config:
      config += nn;
    }
  done_config:;
  }

  /* Override these settings from the environment. */
  for (i = 0; i < NTIMER; i++) {
    p = getenv(timertab[i].env);
    if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); }
  }

  /* All seems well.  Allocate the timer object. */
  t = xmalloc(sizeof(*t));
  for (i = 0; i < NTIMER; i++) t->ops[i] = 0;

  /* Try to set up the subtimers. */
  for (i = NTIMER; i--; )
    if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end;

  /* All is done. */
  t->_t.ops = &timer_ops; ret = &t->_t; t = 0;
end:
  if (t) timer_destroy(&t->_t);
  return (ret);
}
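
/* Illustrative examples of the configuration-string syntax accepted above
 * (words are whitespace-separated; subtimer lists are comma-separated):
 *
 *      "list"
 *      "clock=posix-thread-cputime cycle=x86-rdtscp,x86-rdtsc"
 *
 * The environment variables `MLIB_BENCH_CLKTIMER' and
 * `MLIB_BENCH_CYCLETIMER' override the respective subtimer lists.
 */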

/*----- Benchmarking ------------------------------------------------------*/

/* --- @bench_init@ --- *
 *
 * Arguments:   @struct bench_state *b@ = bench state to initialize
 *              @struct bench_timer *tm@ = timer to attach, or null
 *
 * Returns:     Zero on success, %$-1$% on failure.
 *
 * Use:         Initialize the benchmark state.  On success, the timer state
 *              still needs to be calibrated (use @bench_calibrate@) before
 *              it can be used, but this will be done automatically by
 *              @bench_measure@ if it's not done by hand earlier.  The timer
 *              is now owned by the benchmark state and will be destroyed by
 *              @bench_destroy@.
 *
 *              The only reason for failure is if @tm@ was null on entry,
 *              and automatic construction of a timer failed.  The state is
 *              safe to discard, but calling @bench_destroy@ is safe too.
 */

int bench_init(struct bench_state *b, struct bench_timer *tm)
{
  int rc;

  b->tm = 0;

  if (!tm) {
    tm = bench_createtimer(0);
    if (!tm) { rc = -1; goto end; }
  }

  b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0;
end:
  return (rc);
}

/* --- @bench_destroy@ --- *
 *
 * Arguments:   @struct bench_state *b@ = bench state
 *
 * Returns:     ---
 *
 * Use:         Destroy the benchmark state, releasing the resources that it
 *              holds.
 */

void bench_destroy(struct bench_state *b)
  { if (b->tm) { b->tm->ops->destroy(b->tm); b->tm = 0; } }

/* --- @do_nothing@ --- *
 *
 * Arguments:   @unsigned long n@ = iteration count
 *              @void *ctx@ = context pointer (ignored)
 *
 * Returns:     ---
 *
 * Use:         Does nothing at all for @n@ iterations.  Used to calibrate
 *              the benchmarking state.
 */

static void do_nothing(unsigned long n, void *ctx)
  { while (n--) RELAX; }

/* --- @measure@ --- *
 *
 * Arguments:   @struct bench_state *b@ = bench state
 *              @struct bench_timing *delta_out@ = where to leave the timing
 *              @bench_fn *fn@ = function to measure
 *              @void *ctx@ = context for the function
 *              @double n@ = number of iterations
 *
 * Returns:     ---
 *
 * Use:         Run the function @n@ times, and report how long it took.
 *
 *              This function deals with retrying the measurements if the
 *              timer reports a temporary failure, and all of the
 *              difficulties if @n@ is too large to fit in a machine integer.
 */

static void measure(struct bench_state *b, struct bench_timing *delta_out,
                    bench_fn *fn, void *ctx, double n)
{
  struct bench_timer *tm = b->tm;
  struct bench_time t0, t1;
  unsigned long n0, n1;
  double R = ULONG_MAX;

  if (n <= R) {
    n0 = n;
    do {
      while (tm->ops->now(tm, &t0, BTF_T0));
      fn(n0, ctx);
    } while (tm->ops->now(tm, &t1, BTF_T1));
  } else {
    n1 = n/R; n0 = n - n1*R;
    do {
      while (tm->ops->now(tm, &t0, BTF_T0));
      while (n1--) fn(ULONG_MAX, ctx);
      fn(n0, ctx);
    } while (tm->ops->now(tm, &t1, BTF_T1));
  }
  tm->ops->diff(tm, delta_out, &t0, &t1);
}
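
/* Worked example of the splitting above (illustrative): on a platform where
 * @unsigned long@ is 32 bits wide, @ULONG_MAX@ is %$2^{32} - 1$%, so a
 * request for %$10^{10}$% iterations gives %$n_1 = 2$% full-sized calls and
 * %$n_0 = 10^{10} - 2 (2^{32} - 1) \approx 1.41 \times 10^9$% iterations for
 * the final call.
 */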

/* --- @bench_calibrate@ --- *
 *
 * Arguments:   @struct bench_state *b@ = bench state
 *
 * Returns:     Zero on success, %$-1$% if calibration failed.
 *
 * Use:         Calibrate the benchmark state, so that it can be used to
 *              measure performance reasonably accurately.
 */

#define T_CLB 0.0625                    /* calibration time limit */

int bench_calibrate(struct bench_state *b)
{
  struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT;
  struct bench_timing delta;
  double n, r;
  bench_fn *fn = LAUNDER(&do_nothing);
  unsigned i, f = BTF_ANY;
  int rc;

  /* The model here is that a timing loop has a fixed overhead as we enter
   * and leave (e.g., to do with the indirect branch into the code), and
   * per-iteration overheads as we check the counter and loop back.  We aim
   * to split these apart using linear regression.
   */

  /* If we've already calibrated then there's nothing to do. */
  if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1);

  /* Exercise the inner loop a few times to educate the branch predictor. */
  for (i = 0; i < 50; i++) measure(b, &delta, fn, 0, 10000);

  /* Now we measure idle loops until they take sufficiently long -- or we run
   * out of counter.
   */
  debug("calibrating...");
  n = 1.0;
  for (;;) {

    /* Measure @n@ iterations of the idle loop. */
    measure(b, &delta, fn, 0, n); f &= delta.f;
    if (!(f&BTF_TIMEOK)) { rc = -1; goto end; }

    /* Register the timings with the regression machinery. */
    linreg_update(&lr_clk, n, delta.t);
    if (!(f&BTF_CYOK))
      debug(" n = %10.0f; t = %12g s", n, delta.t);
    else {
      linreg_update(&lr_cy, n, delta.cy);
      debug(" n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy);
    }

    /* If we're done then stop. */
    if (delta.t >= T_CLB) break;
    if (n >= ULONG_MAX - n/3) break;

    /* Update the counter and continue. */
    n += n/3.0 + 1.0;
  }

  /* Now run the linear regression to extract the constant and per-iteration
   * overheads.
   */
  linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r);
  debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r);
  if (f&BTF_CYOK) {
    linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r);
    debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r);
  }

  /* We're done. */
  rc = 0;
end:
  b->f |= f | BTF_CLB;                  /* no point trying again */
  return (rc);
}

/* --- @bench_measure@ --- *
 *
 * Arguments:   @struct bench_state *b@ = benchmark state
 *              @struct bench_timing *t_out@ = where to leave the timing
 *              @double base@ = number of internal units per call
 *              @bench_fn *fn@, @void *ctx@ = benchmark function to run
 *
 * Returns:     Zero on success, %$-1$% if timing failed.
 *
 * Use:         Measure a function.  The function @fn@ is called adaptively
 *              with an iteration count @n@ set so as to run for
 *              approximately @b->target_s@ seconds.
 *
 *              The result is left in @*t_out@, with @t_out->n@ counting the
 *              final product of the iteration count and @base@ (which might,
 *              e.g., reflect the number of inner iterations the function
 *              performs, or the number of bytes it processes per iteration).
 */

int bench_measure(struct bench_state *b, struct bench_timing *t_out,
                  double base, bench_fn *fn, void *ctx)
{
  double n, nn;

  /* Make sure the state is calibrated and usable. */
  if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1);
  if (!(b->f&BTF_TIMEOK)) return (-1);

  /* Main adaptive measurement loop.
   *
   * Suppose the timed loop runs %$n$% iterations in %$t$% seconds.  Our
   * ideal time is %$T$% seconds.  If %$t \ge T/\sqrt{2}$%, we're happy.
   * Otherwise, we need to scale up the iteration count.  The obvious next
   * choice is %$n' = n T/t$%.  Alas, rounding is a problem: if
   * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
   * progress.  We know that %$T/t > \sqrt{2}$%, so this can only happen when
   * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%.  On the other
   * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
   * again with %$n' = n + 1$% iterations will very likely work.
   */
  debug("measuring..."); n = 1.0;
  for (;;) {
    measure(b, t_out, fn, ctx, n); t_out->f &= b->f;
    if (!(t_out->f&BTF_TIMEOK)) return (-1);
    if (!(t_out->f&BTF_CYOK))
      debug(" n = %10.0f; t = %12g", n, t_out->t);
    else
      debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);

    if (t_out->t >= 0.707*b->target_s) break;
    nn = n*b->target_s/t_out->t;
    if (n > ULONG_MAX || nn > (unsigned long)n + 1) n = nn;
    else n++;
  }

  /* Adjust according to the calibration. */
  t_out->t -= n*b->clk.m + b->clk.c;
  if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c;

  /* Report the results, if debugging. */
  if (!(t_out->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_out->t);
  else debug(" adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy);
  if (!(t_out->f&BTF_CYOK))
    debug(" %g s per op; %g ops/s", t_out->t/n, n/t_out->t);
  else
    debug(" %g s (%g cy) per op; %g ops/s",
          t_out->t/n, t_out->cy/n, n/t_out->t);

  /* All done. */
  t_out->n = n*base; return (0);
}

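/* Typical usage (a minimal sketch; @run_candidate@, @ctx@, and @fail@ are
 * hypothetical application code, not part of this library):
 *
 *      struct bench_state b;
 *      struct bench_timing tm;
 *
 *      if (bench_init(&b, 0)) fail();
 *      if (bench_measure(&b, &tm, 1.0, run_candidate, &ctx)) fail();
 *      printf("%g s per op\n", tm.t/tm.n);
 *      bench_destroy(&b);
 *
 * Passing a null timer to @bench_init@ asks it to construct a default one
 * with @bench_createtimer(0)@; calibration happens automatically on the
 * first call to @bench_measure@.
 */
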
/*----- That's all, folks -------------------------------------------------*/