5 * (c) 2023 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of the mLib utilities library.
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
28 /*----- Header files ------------------------------------------------------*/
50 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
52 # define CPUID_1D_TSC (1u << 4)
53 # define CPUID_1xD_TSCP (1u << 27)
54 # define USE_X86_RDTSC 1
57 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
58 # include <sys/syscall.h>
59 # include <sys/types.h>
61 # include <linux/perf_event.h>
62 # ifdef HAVE_VALGRIND_VALGRIND_H
63 # include <valgrind/valgrind.h>
65 # define USE_LINUX_PERFEVENT 1
66 # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
67 # include <sys/mman.h>
68 # define USE_LINUX_PERFEVRDPMC 1
72 /*----- Data structures ---------------------------------------------------*/
74 enum { CLK
, CY
, NTIMER
};
77 struct bench_timer _t
;
79 const struct timer_ops
*ops
[NTIMER
]; /* subtimers for clock and cycles */
82 unsigned tscaux
; /* `ia32_tsc_aux' for `ldtscp' */
84 #ifdef USE_LINUX_PERFEVENT
85 int fd
; /* vanilla `perf_event_open' */
87 #ifdef USE_LINUX_PERFEVRDPMC
88 struct { /* `perf_event_open' with `rdpmc' */
89 const volatile void *map
; size_t sz
; /* memory-mapped info */
90 pid_t owner
; /* owning thread id */
93 } u_cy
; /* state for cycle measurement */
97 const char *name
; /* timer name */
98 unsigned f
; /* flags */
99 /* ... @BTF_...OK@ flags */ /* expected results */
100 #define TF_SECRET 16u /* don't try this automatically */
101 int (*init
)(struct timer */
*t*/
); /* initialization function */
102 int (*preflight
)(struct timer */
*t*/
); /* preflight checks */
103 int (*now
)(struct timer */
*t*/
, /* read current */
104 struct bench_time */
*t_out*/
, unsigned /*f*/);
105 void (*diff
)(struct timer */
*t*/
, /* difference */
106 struct bench_timing */
*t_inout*/
,
107 const struct bench_time */
*t0*/
,
108 const struct bench_time */
*t1*/
);
109 void (*teardown
)(struct timer */
*t*/
); /* release held resources */
112 /*----- Preliminaries -----------------------------------------------------*/
114 #define NS_PER_S 1000000000
118 * Arguments: @const char *fmt@ = format control string
119 * @...@ = format arguemnts
123 * Use: Maybe report a debugging message to standard error.
126 static PRINTF_LIKE(1, 2) void debug(const char *fmt
, ...)
131 p
= getenv("MLIB_BENCH_DEBUG");
132 if (p
&& *p
!= 'n' && *p
!= '0') {
134 fputs("mLib BENCH: ", stderr
);
135 vfprintf(stderr
, fmt
, ap
);
142 # define FLOATK64(k) ((double)(k).i)
144 # define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
147 /* --- @diff_ts@ --- *
149 * Arguments: @struct timer *t@ = timer structure
150 * @struct bench_timing *delta_inout@ = where to put the result
151 * @const struct time *t0, *t1@ = two input times
155 * Use: Calculates a time difference for timers using the
156 * @struct timespec@-like time format.
159 static void diff_ts(struct timer
*t
, struct bench_timing
*delta_inout
,
160 const struct bench_time
*t0
, const struct bench_time
*t1
)
162 unsigned f
= t0
->f
&t1
->f
;
168 /* Calculate the integer differences in seconds and nanoseconds
169 * independently. To avoid underflow, though, add a second's worth of
170 * nanoseconds which we'll subtract off later.
172 SUB64(delta_s
, t1
->t
.ts
.s
, t0
->t
.ts
.s
);
173 delta_ns
= t1
->t
.ts
.ns
+ NS_PER_S
- t0
->t
.ts
.ns
;
175 /* Hack if they're both equal. */
176 if (ZERO64(delta_s
) && !delta_ns
) delta_ns
= 1;
178 /* And apply the nanoseconds difference. To prevent underflow, pre-
179 * emptively borrow one from the integer difference.
181 delta_inout
->t
= FLOATK64(delta_s
) - 1.0 + delta_ns
/(double)NS_PER_S
;
184 delta_inout
->f
|= BTF_TIMEOK
;
188 /* --- @diff_cycles@ --- *
190 * Arguments: @struct timer *t@ = timer structure
191 * @struct bench_timing *delta_inout@ = where to put the result
192 * @const struct time *t0, *t1@ = two input times
196 * Use: Calculates a time difference for cycle-counting timers.
199 static void diff_cycles(struct timer
*t
, struct bench_timing
*delta_inout
,
200 const struct bench_time
*t0
,
201 const struct bench_time
*t1
)
203 unsigned f
= t0
->f
&t1
->f
;
207 SUB64(delta_cy
, t1
->cy
, t0
->cy
); delta_inout
->cy
= FLOATK64(delta_cy
);
208 if (!delta_inout
->cy
) delta_inout
->cy
= 1;
209 delta_inout
->f
|= BTF_CYOK
;
215 /* --- @normalize@ --- *
217 * Arguments: @double *x_inout@ = address of a value to normalize
218 * @const char **unit_out@ = address to store unit prefix
219 * @double scale@ = scale factor for unit steps
223 * Use: Adjust @*x_inout@ by a power of @scale@, and set @*unit_out@
224 * so that printing the two reflects the original value with an
225 * appropriate SI unit scaling. The @scale@ should be 1024 for
226 * binary quantities, most notably memory sizes, or 1000 for
230 static void normalize(double *x_inout
, const char **unit_out
, double scale
)
234 *const big
[] = { "k", "M", "G", "T", "P", "E", 0 },
235 *const little
[] = { "m", "ยต", "n", "p", "f", "a", 0 };
236 const char *const *u
;
240 for (u
= little
, x
*= scale
; x
< 1 && u
[1]; u
++, x
*= scale
);
242 for (u
= big
, x
/= scale
; x
>= scale
&& u
[1]; u
++, x
/= scale
);
246 *x_inout
= x
; *unit_out
= *u
;
249 /*----- The null timer ----------------------------------------------------*/
251 /* This is a timer which does nothing, in case we don't have any better
255 static int null_init(struct timer
*t
) { return (0); }
256 static int null_now(struct timer
*t
, struct bench_time
*t_out
, unsigned f
)
258 static int null_preflight(struct timer
*t
) { return (0); }
259 static void null_diff(struct timer
*t
, struct bench_timing
*delta_inout
,
260 const struct bench_time
*t0
,
261 const struct bench_time
*t1
)
263 static void null_teardown(struct timer
*t
) { ; }
265 static const struct timer_ops null_ops
=
267 null_init
, null_preflight
, null_now
, null_diff
, null_teardown
};
268 #define NULL_ENT &null_ops,
270 /*----- The broken clock --------------------------------------------------*/
272 /* This is a cycle counter which does nothing, in case we don't have any
276 static int broken_init(struct timer
*t
) { return (-1); }
278 static const struct timer_ops broken_ops
=
279 { "broken", TF_SECRET
,
280 broken_init
, null_preflight
, null_now
, null_diff
, null_teardown
};
281 #define BROKEN_ENT &broken_ops,
283 /*----- Linux performance counters ----------------------------------------*/
285 /* This is a cycle counter which uses the Linux performance event system,
286 * which is probably the best choice if it's available.
289 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
291 /* --- @perfevent_open@ --- *
295 * Returns: File descriptor, or %$-1$%.
297 * Use: Open a performance measurement descriptor set up to count CPU
301 static int perfevent_open(void)
303 struct perf_event_attr attr
= { 0 };
306 attr
.type
= PERF_TYPE_HARDWARE
;
307 attr
.size
= sizeof(attr
);
308 attr
.config
= PERF_COUNT_HW_CPU_CYCLES
;
310 attr
.exclude_kernel
= 1;
313 fd
= syscall(SYS_perf_event_open
, &attr
, 0, -1, -1, 0);
315 debug("couldn't open perf event: %s", strerror(errno
));
322 static int perfevent_now(struct timer
*t
,
323 struct bench_time
*t_out
, unsigned f
)
327 n
= read(t
->u_cy
.fd
, &t_out
->cy
.i
, sizeof(t_out
->cy
.i
));
328 if (n
!= sizeof(t_out
->cy
.i
)) {
329 debug("failed to read perf-event counter: %s", strerror(errno
));
332 t_out
->f
|= BTF_CYOK
; return (0);
335 static void perfevent_teardown(struct timer
*t
)
336 { close(t
->u_cy
.fd
); }
338 static int perfevent_init(struct timer
*t
)
342 fd
= perfevent_open(); if (fd
< 0) { rc
= -1; goto end
; }
343 t
->u_cy
.fd
= fd
; fd
= -1; rc
= 0;
345 if (fd
!= -1) close(fd
);
349 static const struct timer_ops perfevent_ops
=
350 { "linux-perf-read-hw-cycles", BTF_CYOK
,
351 perfevent_init
, null_preflight
, perfevent_now
,
352 diff_cycles
, perfevent_teardown
};
353 #define PERFEVENT_VANILLA_CYENT &perfevent_ops,
355 # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
357 /* Special syscall-free version for x86 using `rdpmc' instruction. *
359 * This is a bit weird because it does both kinds of measurement in a single
363 static int perfevrdpmc_now(struct timer
*t
,
364 struct bench_time
*t_out
, unsigned f
)
366 const volatile struct perf_event_mmap_page
*map
= t
->u_cy
.pmc
.map
;
367 unsigned long long tsc
= tsc
, toff
= toff
, tenb
= tenb
;
368 unsigned long long cy
= cy
, cyoff
= cyoff
;
369 unsigned long long m
, hi
, lo
;
370 unsigned tshift
= tshift
, tmult
= tmult
, q0
, q1
, ff
;
372 /* Repeat until we can complete this job without the buffer changing in the
376 __atomic_thread_fence(__ATOMIC_ACQ_REL
);
380 /* Read the passage-of-time information. */
381 if (map
->cap_user_time
) {
382 tenb
= map
->time_enabled
;
383 tsc
= __builtin_ia32_rdtsc();
384 tshift
= map
->time_shift
;
385 tmult
= map
->time_mult
;
386 toff
= map
->time_offset
;
390 /* Read the performance-counter information. */
391 if (map
->cap_user_rdpmc
) {
392 cy
= __builtin_ia32_rdpmc(map
->index
- 1);
397 /* Check the sequence number again. */
398 __atomic_thread_fence(__ATOMIC_ACQ_REL
);
405 /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
406 * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
410 m
= (1ull << tshift
) - 1;
411 hi
= tsc
>> tshift
; lo
= tsc
&m
;
412 t_out
->t
.rawns
.i
= hi
*tmult
+ (lo
*tmult
>> tshift
) + toff
+ tenb
;
413 t_out
->f
|= BTF_TIMEOK
;
417 /* We have the cycle count. */
419 t_out
->cy
.i
= cy
+ cyoff
;
420 t_out
->f
|= BTF_CYOK
;
425 static void perfevrdpmc_diff(struct timer
*t
,
426 struct bench_timing
*delta_inout
,
427 const struct bench_time
*t0
,
428 const struct bench_time
*t1
)
430 unsigned long long delta_ns
;
431 unsigned f
= t0
->f
&t1
->f
;
434 delta_ns
= t1
->t
.rawns
.i
- t0
->t
.rawns
.i
; if (!delta_ns
) delta_ns
= 1;
435 delta_inout
->t
= delta_ns
/(double)NS_PER_S
;
436 delta_inout
->f
|= BTF_TIMEOK
;
440 delta_inout
->cy
= t1
->cy
.i
- t0
->cy
.i
;
441 if (!delta_inout
->cy
) delta_inout
->cy
= 1;
442 delta_inout
->f
|= BTF_CYOK
;
446 static void perfevrdpmc_unmap
447 (const volatile struct perf_event_mmap_page
*map
, size_t mapsz
)
448 { if (map
) munmap(UNQUALIFY(struct perf_event_mmap_page
, map
), mapsz
); }
450 static void perfevrdpmc_teardown(struct timer
*t
)
451 { perfevrdpmc_unmap(t
->u_cy
.pmc
.map
, t
->u_cy
.pmc
.sz
); }
453 static int perfevrdpmc_setup(struct timer
*t
)
455 const volatile struct perf_event_mmap_page
*map
= 0;
456 int pgsz
, mapsz
= 0, fd
= -1, rc
;
458 /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
461 pgsz
= sysconf(_SC_PAGESIZE
);
463 debug("failed to discover page size!: %s", strerror(errno
));
467 /* Open the measurement descriptor and map it. */
468 fd
= perfevent_open(); if (!fd
) return (-1);
470 map
= mmap(0, mapsz
, PROT_READ
, MAP_SHARED
, fd
, 0);
471 if (map
== MAP_FAILED
) {
472 debug("failed to map perf event: %s", strerror(errno
));
476 t
->u_cy
.pmc
.map
= map
; t
->u_cy
.pmc
.sz
= mapsz
; map
= 0;
477 t
->u_cy
.pmc
.owner
= syscall(SYS_gettid
); rc
= 0;
479 if (fd
!= -1) close(fd
);
480 perfevrdpmc_unmap(map
, mapsz
);
484 static int perfevrdpmc_preflight(struct timer
*t
)
486 if (!t
->u_cy
.pmc
.map
) { debug("retry perf event map setup"); goto reopen
; }
487 if (t
->u_cy
.pmc
.owner
!= syscall(SYS_gettid
)) {
488 debug("pid changed: reopen perf event map");
489 perfevrdpmc_unmap(t
->u_cy
.pmc
.map
, t
->u_cy
.pmc
.sz
);
490 t
->u_cy
.pmc
.map
= 0; goto reopen
;
495 if (perfevrdpmc_setup(t
)) return (-1);
499 static int perfevrdpmc_cyinit(struct timer
*t
)
503 # ifdef HAVE_VALGRIND_VALGRIND_H
504 /* Valgrind doesn't like `rdpmc' instructions, so just bail. */
505 if (RUNNING_ON_VALGRIND
) return (-1);
508 /* We need `rdtsc' to do the passage-of-time measurement. */
509 if (!__get_cpuid(1, &a
, &b
, &c
, &d
) || !(d
&CPUID_1D_TSC
))
510 { debug("no `rdtsc' instrunction"); return (-1); }
513 if (perfevrdpmc_setup(t
)) return (-1);
517 static const struct timer_ops perfevrdpmc_cyops
=
518 { "linux-x86-perf-rdpmc-hw-cycles", BTF_TIMEOK
| BTF_CYOK
,
519 perfevrdpmc_cyinit
, perfevrdpmc_preflight
, perfevrdpmc_now
,
520 perfevrdpmc_diff
, perfevrdpmc_teardown
};
522 static int perfevrdpmc_clkinit(struct timer
*t
)
524 if (t
->ops
[CY
] != &perfevrdpmc_cyops
) {
525 debug("`linux-x86-perf-rdpmc-hw-cycles' not set as cycle subtimer");
531 static const struct timer_ops perfevrdpmc_clkops
=
532 { "linux-x86-perf-rdpmc-hw-cycles", 0,
533 perfevrdpmc_clkinit
, null_preflight
, null_now
,
534 null_diff
, null_teardown
};
536 # define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
537 # define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
540 # define PERFEVENT_RDPMC_CLKENT
541 # define PERFEVENT_RDPMC_CYENT
544 # define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
545 # define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
547 # define PERFEVENT_CLKENT
548 # define PERFEVENT_CYENT
551 /*----- Intel time-stamp counter ------------------------------------------*/
553 /* This is a cycle counter based on the Intel `rdtsc' instruction. It's not
554 * really suitable for performance measurement because it gets confused by
555 * CPU frequency adjustments.
558 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
560 static int x86rdtsc_now(struct timer
*t
,
561 struct bench_time
*t_out
, unsigned f
)
562 { t_out
->cy
.i
= __builtin_ia32_rdtsc(); t_out
->f
|= BTF_CYOK
; return (0); }
564 static int x86rdtsc_init(struct timer
*t
)
568 if (!__get_cpuid(1, &a
, &b
, &c
, &d
) || !(d
&CPUID_1D_TSC
))
569 { debug("no `rdtsc' instrunction"); return (-1); }
570 t
->u_cy
.tscaux
= ~0u;
574 static int x86rdtscp_now(struct timer
*t
,
575 struct bench_time
*t_out
, unsigned f
)
578 unsigned long long n
;
580 n
= __builtin_ia32_rdtscp(&tscaux
);
582 t
->u_cy
.tscaux
= tscaux
;
583 else if (t
->u_cy
.tscaux
!= tscaux
) {
584 debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
585 tscaux
, t
->u_cy
.tscaux
);
588 t_out
->cy
.i
= n
; t_out
->f
|= BTF_CYOK
; return (0);
591 static int x86rdtscp_init(struct timer
*t
)
595 if (!__get_cpuid(0x80000001, &a
, &b
, &c
, &d
) || !(d
&CPUID_1xD_TSCP
))
596 { debug("no `rdtscp' instrunction"); return (-1); }
600 static const struct timer_ops x86rdtsc_ops
=
601 { "x86-rdtsc", BTF_CYOK
,
602 x86rdtsc_init
, null_preflight
, x86rdtsc_now
,
603 diff_cycles
, null_teardown
};
604 static const struct timer_ops x86rdtscp_ops
=
605 { "x86-rdtscp", BTF_CYOK
,
606 x86rdtscp_init
, null_preflight
,
607 x86rdtscp_now
, diff_cycles
, null_teardown
};
609 # define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
611 # define X86RDTSC_CYENT
614 /*----- POSIX `clock_gettime' ---------------------------------------------*/
616 /* This is a real-time clock based on the POSIX time interface, with up to
617 * nanosecond precision.
620 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
622 static int gettime_now(struct timer
*t
, struct bench_time
*t_out
, unsigned f
)
626 if (clock_gettime(CLOCK_THREAD_CPUTIME_ID
, &now
))
627 { debug("error reading POSIX clock: %s", strerror(errno
)); return (0); }
628 ASSIGN64(t_out
->t
.ts
.s
, now
.tv_sec
); t_out
->t
.ts
.ns
= now
.tv_nsec
;
629 t_out
->f
|= BTF_TIMEOK
; return (0);
632 static const struct timer_ops gettime_ops
=
633 { "posix-thread-cputime", BTF_TIMEOK
,
634 null_init
, null_preflight
, gettime_now
, diff_ts
, null_teardown
};
636 # define GETTIME_CLKENT &gettime_ops,
638 # define GETTIME_CLKENT
641 /*----- Standard C `clock' ------------------------------------------------*/
643 /* This is a real-time clock based on the C `clock' function which is
644 * guaranteed to be available, though it's not likely to be very good.
647 static int clock_now(struct timer
*t
, struct bench_time
*t_out
, unsigned f
)
652 if (now
== (clock_t)-1) {
653 debug("error reading standard clock: %s", strerror(errno
));
656 t_out
->t
.clk
= now
; t_out
->f
|= BTF_TIMEOK
; return (0);
659 static void clock_diff(struct timer
*t
, struct bench_timing
*delta_inout
,
660 const struct bench_time
*t0
,
661 const struct bench_time
*t1
)
664 unsigned f
= t0
->f
&t1
->f
;
667 delta_clk
= t1
->t
.clk
- t0
->t
.clk
; if (!delta_clk
) delta_clk
= 1;
668 delta_inout
->t
= delta_clk
/(double)CLOCKS_PER_SEC
;
669 delta_inout
->f
|= BTF_TIMEOK
;
673 static const struct timer_ops clock_ops
=
674 { "stdc-clock", BTF_TIMEOK
, null_init
, null_preflight
, clock_now
,
675 clock_diff
, null_teardown
};
677 #define CLOCK_CLKENT &clock_ops,
679 /*----- Timing setup ------------------------------------------------------*/
681 /* Tables of timing sources. */
682 static const struct timer_ops
683 *const clktab
[] = { PERFEVENT_CLKENT
688 *const cytab
[] = { PERFEVENT_CYENT
694 static const struct timertab
{
697 const struct timer_ops
*const *opstab
;
699 { "clock", "MLIB_BENCH_CLKTIMER", clktab
},
700 { "cycle", "MLIB_BENCH_CYCLETIMER", cytab
}
703 /* --- @find_timer@ --- *
705 * Arguments: @const char *name@ = timer name
706 * @size_t sz@ = length of name
707 * @unsigned tm@ = which subtimer we're looking for
709 * Returns: The table entry matching the given name, or null if there
713 static const struct timer_ops
*find_timer(const char *name
, size_t sz
,
716 const struct timer_ops
*const *tt
;
718 for (tt
= timertab
[tm
].opstab
; *tt
; tt
++) {
719 if (strlen((*tt
)->name
) == sz
&&
720 MEMCMP(name
, ==, (*tt
)->name
, sz
))
723 debug("%s timer `%.*s' not found",
724 timertab
[tm
].what
, (int)sz
, name
); return (0);
727 /* --- @try_timer@ --- *
729 * Arguments: @struct timer *t@ = timer structure
730 * @const struct timer_ops *ops@ = timer ops
731 * @unsigned tm@ = which subtimer we're setting
733 * Returns: Zero on success, %$-1$% if timer failed.
735 * Use: Tries to initialize the timer @t@, reporting a debug message
739 static int try_timer(struct timer
*t
,
740 const struct timer_ops
*ops
, unsigned tm
)
742 struct bench_time t0
, t1
;
743 struct bench_timing delta
;
746 #define f_teardown 1u
748 if (ops
->init(t
)) { rc
= -1; goto end
; }
751 if (ops
->preflight(t
)) { rc
= -1; goto end
; }
754 while (ops
->now(t
, &t0
, BTF_T0
));
755 } while (ops
->now(t
, &t1
, BTF_T1
));
756 delta
.f
= 0; ops
->diff(t
, &delta
, &t0
, &t1
);
757 if ((ops
->f
^ delta
.f
)&BTF_ANY
) { rc
= -1; goto end
; }
759 debug("selected %s timer `%s'", timertab
[tm
].what
, ops
->name
);
760 t
->ops
[tm
] = ops
; f
&= ~f_teardown
; rc
= 0;
763 if (f
&f_teardown
) ops
->teardown(t
);
769 /* --- @select_timer@ --- *
771 * Arguments: @struct timer *t@ = timer structure
772 * @unsigned tm@ = which subtimer we're setting
773 * @const char *config@, @size_t sz@ = config string
775 * Returns: Zero on success, %$-1$% if timer failed.
777 * Use: Select a timer from the table. If the environment variable
778 * is set, then parse a comma-separated list of timer names and
779 * use the first one listed that seems to work; otherwise, try
780 * the timers in the table in order.
783 static int select_timer(struct timer
*t
, unsigned tm
,
784 const char *config
, size_t sz
)
787 const struct timer_ops
*ops
, *const *tt
;
790 for (tt
= timertab
[tm
].opstab
; *tt
; tt
++)
791 if (!((*tt
)->f
&TF_SECRET
) && !try_timer(t
, *tt
, tm
)) return (0);
795 p
= memchr(config
, ',', l
- config
); if (!p
) p
= l
;
796 ops
= find_timer(config
, p
- config
, tm
);
797 if (ops
&& !try_timer(t
, ops
, tm
)) return (0);
802 debug("no suitable %s timer found", timertab
[tm
].what
); return (-1);
805 /* Bench timer operations. */
806 static void timer_describe(struct bench_timer
*tm
, dstr
*d
)
808 struct timer
*t
= (struct timer
*)tm
;
811 dstr_puts(d
, "builtin: ");
812 for (i
= 0; i
< NTIMER
; i
++) {
813 if (i
) dstr_puts(d
, ", ");
814 dstr_putf(d
, "%s = %s", timertab
[i
].what
, t
->ops
[i
]->name
);
818 static int timer_preflight(struct bench_timer
*tm
)
820 struct timer
*t
= (struct timer
*)tm
;
823 for (i
= 0; i
< NTIMER
; i
++) if (t
->ops
[i
]->preflight(t
)) return (-1);
827 static int timer_now(struct bench_timer
*tm
,
828 struct bench_time
*t_out
, unsigned f
)
830 struct timer
*t
= (struct timer
*)tm
;
834 for (i
= 0; i
< NTIMER
; i
++) if (t
->ops
[i
]->now(t
, t_out
, f
)) return (-1);
838 static void timer_diff(struct bench_timer
*tm
,
839 struct bench_timing
*t_out
,
840 const struct bench_time
*t0
,
841 const struct bench_time
*t1
)
843 struct timer
*t
= (struct timer
*)tm
;
847 for (i
= 0; i
< NTIMER
; i
++) t
->ops
[i
]->diff(t
, t_out
, t0
, t1
);
850 static void timer_destroy(struct bench_timer
*tm
)
852 struct timer
*t
= (struct timer
*)tm
;
856 for (i
= 0; i
< NTIMER
; i
++)
857 if (t
->ops
[i
]) t
->ops
[i
]->teardown(t
);
861 static const struct bench_timerops timer_ops
=
862 { timer_describe
, timer_preflight
, timer_now
, timer_diff
, timer_destroy
};
864 /* --- @bench_createtimer@ --- *
866 * Arguments: @const char *config@ = timer configuration string
868 * Returns: A freshly constructed standard timer object.
870 * Use: Allocate a timer. Dispose of it by calling
871 * @tm->ops->destroy(tm)@ when you're done.
873 * Applications should not set configuration strings except as
874 * established by user action, e.g., from a command-line option,
875 * environment variable, or configuration file.
878 struct bench_timer
*bench_createtimer(const char *config
)
881 struct bench_timer
*ret
= 0;
882 struct { const char *p
; size_t sz
; } tmconf
[NTIMER
] = { 0 };
883 const struct timer_ops
*const *tt
;
884 const char *p
, *l
; size_t n
, nn
;
887 /* Parse the configuration string. */
890 /* The first thing to do is find the end of the string. */
891 l
= config
+ strlen(config
);
894 /* Process the whitespace-sparated words of the string one by one. */
896 /* Skip over any initial whitespace. If we hit the end of the string
900 if (config
>= l
) goto done_config
;
901 else if (!ISSPACE(*config
)) break;
904 /* There's definitely a word here. Find the end of it. */
905 for (p
= config
; p
< l
&& !ISSPACE(*p
); p
++);
908 /* Try various simple keywords. */
909 #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))
911 if (MATCHP("list")) {
912 /* The `list' keyword requests lists of the available timer
916 for (i
= 0; i
< NTIMER
; i
++) {
917 printf("%s timers:", timertab
[i
].what
);
918 for (tt
= timertab
[i
].opstab
; *tt
; tt
++)
919 if (!((*tt
)->f
&TF_SECRET
)) printf(" %s", (*tt
)->name
);
927 /* Otherwise it's an assignment, setting a subtimer list. */
928 p
= memchr(config
, '=', nn
);
933 for (i
= 0; i
< NTIMER
; i
++)
934 if (STRNCMP(config
, ==, timertab
[i
].what
, n
) &&
935 !timertab
[i
].what
[n
]) {
937 debug("duplicate %s timer list", timertab
[i
].what
);
938 tmconf
[i
].p
= config
+ n
+ 1; tmconf
[i
].sz
= nn
- n
- 1;
942 debug("unrecognized config keyword `%.*s'", (int)n
, config
);
944 /* Move on to the next word. */
951 /* Override these settings from the environment. */
952 for (i
= 0; i
< NTIMER
; i
++) {
953 p
= getenv(timertab
[i
].env
);
954 if (p
) { tmconf
[i
].p
= p
; tmconf
[i
].sz
= strlen(p
); }
957 /* All seems well. Allocate the timer object. */
958 XNEW(t
); t
->a
= arena_global
;
959 for (i
= 0; i
< NTIMER
; i
++) t
->ops
[i
] = 0;
961 /* Try to set up the subtimers. */
962 for (i
= NTIMER
; i
--; )
963 if (select_timer(t
, i
, tmconf
[i
].p
, tmconf
[i
].sz
)) goto end
;
966 t
->_t
.ops
= &timer_ops
; ret
= &t
->_t
; t
= 0;
968 if (t
) timer_destroy(&t
->_t
);
972 /*----- Benchmarking ------------------------------------------------------*/
974 /* --- @bench_init@ --- *
976 * Arguments: @struct bench_state *b@ = bench state to initialize
977 * @struct bench_timer *tm@ = timer to attach, or null
979 * Returns: Zero on success, %$-1$% on failure.
981 * Use: Initialize the benchmark state. On success, the timer state
982 * still needs to be calibrated (use @bench_calibrate@) before
983 * it can be used, but this will be done automatically by
984 * @bench_measure@ if it's not done by hand earlier. The timer
985 * is now owned by the benchmark state and will be destroyed by
988 * The only reason for failure is if @tm@ was null on entry,
989 * and automatic construction of a timer failed. The state is
990 * safe to discard, but calling @bench_destroy@ is safe too.
993 int bench_init(struct bench_state
*b
, struct bench_timer
*tm
)
1000 tm
= bench_createtimer(0);
1001 if (!tm
) { rc
= -1; goto end
; }
1004 b
->tm
= tm
; b
->target_s
= 1.0; b
->f
= 0; rc
= 0;
1009 /* --- @bench_destroy@ --- *
1011 * Arguments: @struct bench_state *b@ = bench state
1015 * Use: Destroy the benchmark state, releasing the resources that it
1019 void bench_destroy(struct bench_state
*b
)
1020 { if (b
->tm
) { b
->tm
->ops
->destroy(b
->tm
); b
->tm
= 0; } }
1024 * Arguments: @unsigned long n@ = iteration count
1025 * @void *ctx@ = context pointer (ignored)
1029 * Use: Does nothing at all for @n@ iterations. Used to calibrate
1030 * the benchmarking state.
1033 static void spin(unsigned long n
, void *ctx
)
1034 { while (n
--) RELAX
; }
1036 /* --- @bench_calibrate@ --- *
1038 * Arguments: @struct bench_state *b@ = bench state
1039 * @unsigned f@ = calibration flags
1041 * Returns: Zero on success, %$-1$% if calibration failed.
1043 * Use: Calibrate the benchmark state, so that it can be used to
1044 * measure performance reasonably accurately.
1046 * Calibration will take into account how the subject code is
1047 * going to be located. If you're going to use @BENCH_MEASURE@
1048 * to measure a piece of literal code, then leave @f@ zero. If
1049 * the code to be measured is going to be executed via an
1050 * indirect branch, e.g., through the @measure@ function, then
1051 * set @BTF_INDIRECT@.
1054 #define T_CLB 0.0625 /* calibration time limit */
1056 int bench_calibrate(struct bench_state
*b
, unsigned f
)
1058 struct linreg lr_clk
= LINREG_INIT
, lr_cy
= LINREG_INIT
;
1059 struct bench_timer
*tm
= b
->tm
;
1060 struct bench_timing delta
;
1062 unsigned i
, tf
= BTF_ANY
;
1063 BENCH_TIMELOOP_DECLS
;
1066 /* The model here is that a timing loop has a fixed overhead as we enter
1067 * and leave (e.g., to do with the indirect branch into the code), and
1068 * per-iteration overheads as we check the counter and loop back. We aim
1069 * to split these apart using linear regression.
1072 /* If we've already calibrated then there's nothing to do. */
1073 if (b
->f
&BTF_CLB
) return (b
->f
&BTF_ANY ?
0 : -1);
1075 /* Run the timer preflight check. */
1076 if (tm
->ops
->preflight(tm
)) { rc
= -1; goto end
; }
1078 /* Exercise the inner loop a few times to educate the branch predictor.
1079 * This is only useful if we're executing via an indirect call.
1081 if (f
&BTF_INDIRECT
) {
1082 for (i
= 0; i
< 50; i
++)
1083 BENCH_TIMELOOP_TAG(setup
, b
, &delta
, 10000)
1084 LAUNDER(&spin
)(_bench_n
, 0);
1087 /* Now we measure idle loops until they take sufficiently long -- or we run
1090 debug("calibrating...");
1094 /* Measure @n@ iterations of the idle loop. */
1096 BENCH_TIMELOOP_TAG(calibrate
, b
, &delta
, n
)
1097 LAUNDER(&spin
)(_bench_n
, 0);
1099 BENCH_TIMELOOP_TAG(calibrate
, b
, &delta
, n
)
1100 while (_bench_n
--) RELAX
;
1101 tf
&= delta
.f
; if (!(tf
&BTF_TIMEOK
)) { rc
= -1; goto end
; }
1103 /* Register the timings with the regression machinery. */
1104 linreg_update(&lr_clk
, n
, delta
.t
);
1106 debug(" n = %10.0f; t = %12g s", n
, delta
.t
);
1108 linreg_update(&lr_cy
, n
, delta
.cy
);
1109 debug(" n = %10.0f; t = %12g s, cy = %10.0f", n
, delta
.t
, delta
.cy
);
1112 /* If we're done then stop. */
1113 if (delta
.t
>= T_CLB
) break;
1114 if (n
>= ULONG_MAX
- n
/3) break;
1116 /* Update the counter and continue. */
1120 /* Now run the linear regression to extract the constant and per-iteration
1123 linreg_fit(&lr_clk
, &b
->clk
.m
, &b
->clk
.c
, &r
);
1124 debug("clock overhead = (%g n + %g) s (r = %g)", b
->clk
.m
, b
->clk
.c
, r
);
1126 linreg_fit(&lr_cy
, &b
->cy
.m
, &b
->cy
.c
, &r
);
1127 debug("cycle overhead = (%g n + %g) cy (r = %g)", b
->cy
.m
, b
->cy
.c
, r
);
1133 b
->f
|= tf
| BTF_CLB
; /* no point trying again */
1137 /* --- @bench_preflight@ --- *
1139 * Arguments: @struct bench_state *b@ = benchmark state
1141 * Returns: Zero on success, %$-1$% on failure.
1143 * Use: Prepares for benchmarking on the current thread. Current
1144 * checks are that the timer is calibrated and that it can
1145 * successfully measure time; the timer preflight is also run.
1147 * Users are unlikely to find this function useful: it's called
1148 * automatically by the @BENCH_MEASURE@ macro and the
1149 * @bench_measure@ function.
1152 int bench_preflight(struct bench_state
*b
)
1154 struct bench_timer
*tm
= b
->tm
;
1156 if (!(b
->f
&BTF_CLB
)) return (-1);
1157 if (!(b
->f
&BTF_TIMEOK
)) return (-1);
1158 if (tm
->ops
->preflight(tm
)) return (-1);
1159 debug("measuring...");
1163 /* --- @bench_adapt@ --- *
1165 * Arguments: @struct bench_state *b@ = benchmark state
1166 * @double *n_inout@ = number of iterations, updated
1167 * @const struct bench_timing *t@ = timing from the previous run
1169 * Returns: Nonzero if the measurement is sufficient; zero to run again.
1171 * Use: This function determines a suitable number of iterations of a
1172 * benchmark function to perform next. It is used in a loop
1173 * such as the following.
1176 * @struct bench_timing t;@
1179 * (run @n@ iterations; set @t@ to the timing)
1180 * @} while (!bench_adapt(b, &n, &t));@
1182 * On entry, @*n_inout@ should be the number of iterations
1183 * performed by the previous pass, and @*t@ the resulting time;
1184 * the @BTF_TIMEOK@ flag must be set @t->f@. If the timing is
1185 * sufficient -- @t->t@ is sufficiently close to @b->target_s@
1186 * -- then the function returns nonzero to indicate that
1187 * measurement is complete. Otherwise, it sets @*n_inout@ to a
1188 * new, larger iteration count and returns zero to indicate that
1189 * a further pass is necessary.
1192 int bench_adapt(struct bench_state
*b
,
1193 double *n_inout
, const struct bench_timing
*t
)
1195 double n
= *n_inout
, nn
;
1197 /* Dump the results for debugging. */
1198 if (!(t
->f
&BTF_CYOK
)) debug(" n = %10.0f; t = %12g", n
, t
->t
);
1199 else debug(" n = %10.0f; t = %12g, cy = %10.0f", n
, t
->t
, t
->cy
);
1201 /* Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal
1202 * time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy.
1203 * Otherwise, we need to scale up the iteration count. The obvious next
1204 * choice is %$n' = n T/t$%. Alas, rounding is a problem: if
1205 * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
1206 * progress. We know that %$T/t > \sqrt{2}%, so this can only happen when
1207 * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%. On the other
1208 * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
1209 * again with %$n' = n + 1$% iterations will very likely work.
1211 if (t
->t
>= 0.707*b
->target_s
) return (1);
1212 nn
= n
*b
->target_s
/t
->t
; modf(nn
, &nn
);
1213 *n_inout
= nn
> n ? nn
: n
+ 1;
1217 /* --- @bench_adjust@ --- *
1219 * Arguments: @struct bench_state *b@ = benchmark state
1220 * @struct bench_timing *t_inout@ = timing to adjust
1221 * @double n@ = number of external iterations performed
1222 * @double base@ = number of internal operations per external
1227 * Use: Adjusts a raw timing, as captured by @BENCH_TIMELOOP@,
1228 * according to the calibration data captured in @b@.
1229 * On exit, the timing data is updated, and @t->n@ is set to the
1233 void bench_adjust(struct bench_state
*b
,
1234 struct bench_timing
*t_inout
, double n
, double base
)
1237 /* Adjust according to the calibration. */
1238 t_inout
->t
-= n
*b
->clk
.m
+ b
->clk
.c
;
1239 if (t_inout
->f
&BTF_CYOK
) t_inout
->cy
-= n
*b
->cy
.m
+ b
->cy
.c
;
1241 /* Report the results, if debugging. */
1242 if (!(t_inout
->f
&BTF_CYOK
)) debug(" adjusted t' = %12g", t_inout
->t
);
1243 else debug(" adjusted t' = %12g, cy' = %10.0f", t_inout
->t
, t_inout
->cy
);
1244 if (!(t_inout
->f
&BTF_CYOK
))
1245 debug(" %g s per iter; %g iters/s", t_inout
->t
/n
, n
/t_inout
->t
);
1247 debug(" %g s (%g cy) per iter; %g iters/s",
1248 t_inout
->t
/n
, t_inout
->cy
/n
, n
/t_inout
->t
);
1251 t_inout
->n
= n
*base
;
1254 /* --- @bench_measure@ --- *
1256 * Arguments: @struct bench_state *b@ = benchmark state
1257 * @struct bench_timing *t_out@ = where to leave the timing
1258 * @double base@ = number of internal units per call
1259 * @bench_fn *fn@, @void *ctx@ = benchmark function to run
1261 * Returns: Zero on success, %$-1$% if timing failed.
1263 * Use: Measure a function. The function @fn@ is called adaptively
1264 * with an iteration count @n@ set so as to run for
1265 * approximately @b->target_s@ seconds.
1267 * The result is left in @*t_out@, with @t_out->n@ counting the
1268 * final product of the iteration count and @base@ (which might,
1269 * e.g., reflect the number of inner iterations the function
1270 * performs, or the number of bytes it processes per iteration).
1272 * To get useful results, the benchmark state should have been
1273 * calibrated for indirect calling -- i.e., with @BTF_INDIRECT@.
1276 int bench_measure(struct bench_state
*b
, struct bench_timing
*t_out
,
1277 double base
, bench_fn
*fn
, void *ctx
)
1279 BENCH_MEASURE_DECLS
;
1282 BENCH_MEASURE_TAG(bench_measure
, b
, rc
, t_out
, base
)
1287 /*----- Reporting ---------------------------------------------------------*/
1289 /* --- @bench_report@ --- *
1291 * Arguments: @const struct gprintf_ops *gops, void *gp@ = output formatter
1292 * @unsigned unit@ = unit processed by the benchmark function
1293 * @const struct bench_timing *t@ = benchmark result
1297 * Use: Format, to the output identified by @gops@ and @go@, a
1298 * human-readable report of the benchmarking result @t@. No
1299 * newline is appended.
1301 * The output format is subject to change in later versions.
1304 void bench_report(const struct gprintf_ops
*gops
, void *go
,
1305 unsigned unit
, const struct bench_timing
*t
)
1307 double scale
, x
, n
= t
->n
;
1308 const char *u
, *what
, *whats
;
1310 assert(t
->f
&BTF_TIMEOK
);
1314 gprintf(gops
, go
, "%.0f iterations ", n
);
1315 what
= "op"; whats
= "ops"; scale
= 1000;
1318 x
= n
; normalize(&x
, &u
, 1024); gprintf(gops
, go
, "%.3f %sB ", x
, u
);
1319 what
= whats
= "B"; scale
= 1024;
1325 x
= t
->t
; normalize(&x
, &u
, 1000);
1326 gprintf(gops
, go
, "in %.3f %ss", x
, u
);
1327 if (t
->f
&BTF_CYOK
) {
1328 x
= t
->cy
; normalize(&x
, &u
, 1000);
1329 gprintf(gops
, go
, " (%.3f %scy)", x
, u
);
1331 gprintf(gops
, go
, ": ");
1333 x
= n
/t
->t
; normalize(&x
, &u
, scale
);
1334 gprintf(gops
, go
, "%.3f %s%s/s", x
, u
, whats
);
1335 x
= t
->t
/n
; normalize(&x
, &u
, 1000);
1336 gprintf(gops
, go
, ", %.3f %ss/%s", x
, u
, what
);
1337 if (t
->f
&BTF_CYOK
) {
1338 x
= t
->cy
/n
; normalize(&x
, &u
, 1000);
1339 gprintf(gops
, go
, " (%.3f %scy/%s)", x
, u
, what
);
1343 /*----- That's all, folks -------------------------------------------------*/