Commit | Line | Data |
---|---|---|
b64eb60f MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Benchmarking support | |
4 | * | |
5 | * (c) 2023 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include "config.h" | |
31 | ||
13ee7406 | 32 | #include <ctype.h> |
b64eb60f | 33 | #include <errno.h> |
6e683a79 | 34 | #include <limits.h> |
b1a20bee | 35 | #include <math.h> |
b64eb60f MW |
36 | #include <stdarg.h> |
37 | #include <stdio.h> | |
38 | #include <stdlib.h> | |
39 | #include <string.h> | |
40 | #include <time.h> | |
41 | ||
42 | #include "alloc.h" | |
b1a20bee | 43 | #include "arena.h" |
b64eb60f MW |
44 | #include "bench.h" |
45 | #include "bits.h" | |
13ee7406 | 46 | #include "dstr.h" |
b64eb60f MW |
47 | #include "linreg.h" |
48 | #include "macros.h" | |
49 | ||
6e683a79 MW |
50 | #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) |
51 | # include <cpuid.h> | |
52 | # define CPUID_1D_TSC (1u << 4) | |
53 | # define CPUID_1xD_TSCP (1u << 27) | |
b1a20bee | 54 | # define USE_X86_RDTSC 1 |
6e683a79 MW |
55 | #endif |
56 | ||
57 | #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64) | |
b1a20bee | 58 | # include <sys/syscall.h> |
6e683a79 MW |
59 | # include <sys/types.h> |
60 | # include <unistd.h> | |
61 | # include <linux/perf_event.h> | |
b1a20bee MW |
62 | # ifdef HAVE_VALGRIND_VALGRIND_H |
63 | # include <valgrind/valgrind.h> | |
64 | # endif | |
65 | # define USE_LINUX_PERFEVENT 1 | |
6e683a79 MW |
66 | # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) |
67 | # include <sys/mman.h> | |
b1a20bee | 68 | # define USE_LINUX_PERFEVRDPMC 1 |
6e683a79 MW |
69 | # endif |
70 | #endif | |
71 | ||
b64eb60f MW |
72 | /*----- Data structures ---------------------------------------------------*/ |
73 | ||
13ee7406 MW |
74 | enum { CLK, CY, NTIMER }; |
75 | ||
b64eb60f MW |
76 | struct timer { |
77 | struct bench_timer _t; | |
b1a20bee | 78 | arena *a; |
13ee7406 | 79 | const struct timer_ops *ops[NTIMER]; /* subtimers for clock and cycles */ |
6e683a79 | 80 | union { |
b1a20bee | 81 | #ifdef USE_X86_RDTSC |
6e683a79 | 82 | unsigned tscaux; /* `ia32_tsc_aux' for `ldtscp' */ |
b1a20bee MW |
83 | #endif |
84 | #ifdef USE_LINUX_PERFEVENT | |
6e683a79 | 85 | int fd; /* vanilla `perf_event_open' */ |
b1a20bee MW |
86 | #endif |
87 | #ifdef USE_LINUX_PERFEVRDPMC | |
88 | struct { /* `perf_event_open' with `rdpmc' */ | |
89 | const volatile void *map; size_t sz; /* memory-mapped info */ | |
90 | pid_t owner; /* owning thread id */ | |
91 | } pmc; | |
92 | #endif | |
6e683a79 | 93 | } u_cy; /* state for cycle measurement */ |
b64eb60f MW |
94 | }; |
95 | ||
96 | struct timer_ops { | |
13ee7406 MW |
97 | const char *name; /* timer name */ |
98 | unsigned f; /* flags */ | |
b1a20bee MW |
99 | /* ... @BTF_...OK@ flags */ /* expected results */ |
100 | #define TF_SECRET 16u /* don't try this automatically */ | |
13ee7406 | 101 | int (*init)(struct timer */*t*/); /* initialization function */ |
b1a20bee | 102 | int (*preflight)(struct timer */*t*/); /* preflight checks */ |
6e683a79 MW |
103 | int (*now)(struct timer */*t*/, /* read current */ |
104 | struct bench_time */*t_out*/, unsigned /*f*/); | |
105 | void (*diff)(struct timer */*t*/, /* difference */ | |
106 | struct bench_timing */*t_inout*/, | |
107 | const struct bench_time */*t0*/, | |
108 | const struct bench_time */*t1*/); | |
109 | void (*teardown)(struct timer */*t*/); /* release held resources */ | |
b64eb60f MW |
110 | }; |
111 | ||
112 | /*----- Preliminaries -----------------------------------------------------*/ | |
113 | ||
114 | #define NS_PER_S 1000000000 | |
115 | ||
67b5031e MW |
116 | /* --- @debug@ --- * |
117 | * | |
118 | * Arguments: @const char *fmt@ = format control string | |
119 | * @...@ = format arguemnts | |
120 | * | |
121 | * Returns: --- | |
122 | * | |
123 | * Use: Maybe report a debugging message to standard error. | |
124 | */ | |
125 | ||
31d0247c | 126 | static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...) |
b64eb60f MW |
127 | { |
128 | const char *p; | |
129 | va_list ap; | |
130 | ||
131 | p = getenv("MLIB_BENCH_DEBUG"); | |
132 | if (p && *p != 'n' && *p != '0') { | |
133 | va_start(ap, fmt); | |
134 | fputs("mLib BENCH: ", stderr); | |
135 | vfprintf(stderr, fmt, ap); | |
136 | fputc('\n', stderr); | |
137 | va_end(ap); | |
138 | } | |
139 | } | |
140 | ||
6e683a79 MW |
141 | #ifdef HAVE_UINT64 |
142 | # define FLOATK64(k) ((double)(k).i) | |
143 | #else | |
144 | # define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi) | |
145 | #endif | |
146 | ||
147 | /* --- @diff_ts@ --- * | |
67b5031e | 148 | * |
6e683a79 MW |
149 | * Arguments: @struct timer *t@ = timer structure |
150 | * @struct bench_timing *delta_inout@ = where to put the result | |
151 | * @const struct time *t0, *t1@ = two input times | |
67b5031e MW |
152 | * |
153 | * Returns: --- | |
154 | * | |
6e683a79 MW |
155 | * Use: Calculates a time difference for timers using the |
156 | * @struct timespec@-like time format. | |
67b5031e MW |
157 | */ |
158 | ||
6e683a79 MW |
159 | static void diff_ts(struct timer *t, struct bench_timing *delta_inout, |
160 | const struct bench_time *t0, const struct bench_time *t1) | |
67b5031e MW |
161 | { |
162 | unsigned f = t0->f&t1->f; | |
b1a20bee MW |
163 | kludge64 delta_s; |
164 | uint32 delta_ns; | |
67b5031e | 165 | |
6e683a79 | 166 | if (f&BTF_TIMEOK) { |
67b5031e | 167 | |
b1a20bee MW |
168 | /* Calculate the integer differences in seconds and nanoseconds |
169 | * independently. To avoid underflow, though, add a second's worth of | |
170 | * nanoseconds which we'll subtract off later. | |
171 | */ | |
172 | SUB64(delta_s, t1->t.ts.s, t0->t.ts.s); | |
173 | delta_ns = t1->t.ts.ns + NS_PER_S - t0->t.ts.ns; | |
174 | ||
175 | /* Hack if they're both equal. */ | |
176 | if (ZERO64(delta_s) && !delta_ns) delta_ns = 1; | |
67b5031e | 177 | |
b1a20bee MW |
178 | /* And apply the nanoseconds difference. To prevent underflow, pre- |
179 | * emptively borrow one from the integer difference. | |
6e683a79 | 180 | */ |
b1a20bee | 181 | delta_inout->t = FLOATK64(delta_s) - 1.0 + delta_ns/(double)NS_PER_S; |
6e683a79 MW |
182 | |
183 | /* Done. */ | |
184 | delta_inout->f |= BTF_TIMEOK; | |
67b5031e | 185 | } |
6e683a79 | 186 | } |
67b5031e | 187 | |
6e683a79 MW |
188 | /* --- @diff_cycles@ --- * |
189 | * | |
190 | * Arguments: @struct timer *t@ = timer structure | |
191 | * @struct bench_timing *delta_inout@ = where to put the result | |
192 | * @const struct time *t0, *t1@ = two input times | |
193 | * | |
194 | * Returns: --- | |
195 | * | |
196 | * Use: Calculates a time difference for cycle-counting timers. | |
197 | */ | |
67b5031e | 198 | |
6e683a79 MW |
199 | static void diff_cycles(struct timer *t, struct bench_timing *delta_inout, |
200 | const struct bench_time *t0, | |
201 | const struct bench_time *t1) | |
202 | { | |
203 | unsigned f = t0->f&t1->f; | |
b1a20bee | 204 | kludge64 delta_cy; |
6e683a79 MW |
205 | |
206 | if (f&BTF_CYOK) { | |
b1a20bee MW |
207 | SUB64(delta_cy, t1->cy, t0->cy); delta_inout->cy = FLOATK64(delta_cy); |
208 | if (!delta_inout->cy) delta_inout->cy = 1; | |
6e683a79 MW |
209 | delta_inout->f |= BTF_CYOK; |
210 | } | |
67b5031e MW |
211 | } |
212 | ||
6e683a79 MW |
213 | #undef FLOATK64 |
214 | ||
b1a20bee MW |
215 | /* --- @normalize@ --- * |
216 | * | |
217 | * Arguments: @double *x_inout@ = address of a value to normalize | |
218 | * @const char **unit_out@ = address to store unit prefix | |
219 | * @double scale@ = scale factor for unit steps | |
220 | * | |
221 | * Returns: --- | |
222 | * | |
223 | * Use: Adjust @*x_inout@ by a power of @scale@, and set @*unit_out@ | |
224 | * so that printing the two reflects the original value with an | |
225 | * appropriate SI unit scaling. The @scale@ should be 1024 for | |
226 | * binary quantities, most notably memory sizes, or 1000 for | |
227 | * other quantities. | |
228 | */ | |
229 | ||
230 | static void normalize(double *x_inout, const char **unit_out, double scale) | |
231 | { | |
232 | static const char | |
233 | *const nothing = "", | |
234 | *const big[] = { "k", "M", "G", "T", "P", "E", 0 }, | |
235 | *const little[] = { "m", "µ", "n", "p", "f", "a", 0 }; | |
236 | const char *const *u; | |
237 | double x = *x_inout; | |
238 | ||
239 | if (x < 1) | |
240 | for (u = little, x *= scale; x < 1 && u[1]; u++, x *= scale); | |
241 | else if (x >= scale) | |
242 | for (u = big, x /= scale; x >= scale && u[1]; u++, x /= scale); | |
243 | else | |
244 | u = ¬hing; | |
245 | ||
246 | *x_inout = x; *unit_out = *u; | |
247 | } | |
248 | ||
13ee7406 | 249 | /*----- The null timer ----------------------------------------------------*/ |
b64eb60f | 250 | |
13ee7406 MW |
251 | /* This is a timer which does nothing, in case we don't have any better |
252 | * ideas. | |
67b5031e MW |
253 | */ |
254 | ||
13ee7406 | 255 | static int null_init(struct timer *t) { return (0); } |
6e683a79 MW |
256 | static int null_now(struct timer *t, struct bench_time *t_out, unsigned f) |
257 | { return (0); } | |
b1a20bee | 258 | static int null_preflight(struct timer *t) { return (0); } |
6e683a79 MW |
259 | static void null_diff(struct timer *t, struct bench_timing *delta_inout, |
260 | const struct bench_time *t0, | |
261 | const struct bench_time *t1) | |
262 | { ; } | |
b64eb60f | 263 | static void null_teardown(struct timer *t) { ; } |
b64eb60f | 264 | |
13ee7406 | 265 | static const struct timer_ops null_ops = |
b1a20bee MW |
266 | { "null", 0, |
267 | null_init, null_preflight, null_now, null_diff, null_teardown }; | |
13ee7406 MW |
268 | #define NULL_ENT &null_ops, |
269 | ||
270 | /*----- The broken clock --------------------------------------------------*/ | |
271 | ||
272 | /* This is a cycle counter which does nothing, in case we don't have any | |
273 | * better ideas. | |
274 | */ | |
b64eb60f | 275 | |
13ee7406 MW |
276 | static int broken_init(struct timer *t) { return (-1); } |
277 | ||
278 | static const struct timer_ops broken_ops = | |
b1a20bee MW |
279 | { "broken", TF_SECRET, |
280 | broken_init, null_preflight, null_now, null_diff, null_teardown }; | |
13ee7406 | 281 | #define BROKEN_ENT &broken_ops, |
b64eb60f MW |
282 | |
283 | /*----- Linux performance counters ----------------------------------------*/ | |
284 | ||
67b5031e MW |
285 | /* This is a cycle counter which uses the Linux performance event system, |
286 | * which is probably the best choice if it's available. | |
287 | */ | |
288 | ||
b64eb60f MW |
289 | #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64) |
290 | ||
6e683a79 MW |
291 | /* --- @perfevent_open@ --- * |
292 | * | |
293 | * Arguments: --- | |
294 | * | |
295 | * Returns: File descriptor, or %$-1$%. | |
296 | * | |
297 | * Use: Open a performance measurement descriptor set up to count CPU | |
298 | * cycles. | |
299 | */ | |
b64eb60f | 300 | |
6e683a79 MW |
301 | static int perfevent_open(void) |
302 | { | |
303 | struct perf_event_attr attr = { 0 }; | |
304 | int fd; | |
b64eb60f | 305 | |
6e683a79 MW |
306 | attr.type = PERF_TYPE_HARDWARE; |
307 | attr.size = sizeof(attr); | |
308 | attr.config = PERF_COUNT_HW_CPU_CYCLES; | |
309 | attr.disabled = 0; | |
310 | attr.exclude_kernel = 1; | |
311 | attr.exclude_hv = 1; | |
312 | ||
b1a20bee | 313 | fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0); |
6e683a79 MW |
314 | if (fd < 0) { |
315 | debug("couldn't open perf event: %s", strerror(errno)); | |
316 | return (-1); | |
317 | } | |
318 | ||
319 | return (fd); | |
320 | } | |
321 | ||
322 | static int perfevent_now(struct timer *t, | |
323 | struct bench_time *t_out, unsigned f) | |
b64eb60f MW |
324 | { |
325 | ssize_t n; | |
326 | ||
327 | n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i)); | |
328 | if (n != sizeof(t_out->cy.i)) { | |
329 | debug("failed to read perf-event counter: %s", strerror(errno)); | |
6e683a79 | 330 | return (0); |
b64eb60f | 331 | } |
6e683a79 | 332 | t_out->f |= BTF_CYOK; return (0); |
b64eb60f MW |
333 | } |
334 | ||
335 | static void perfevent_teardown(struct timer *t) | |
336 | { close(t->u_cy.fd); } | |
337 | ||
b64eb60f MW |
338 | static int perfevent_init(struct timer *t) |
339 | { | |
6e683a79 | 340 | int fd = -1, rc; |
b64eb60f | 341 | |
b1a20bee MW |
342 | fd = perfevent_open(); if (fd < 0) { rc = -1; goto end; } |
343 | t->u_cy.fd = fd; fd = -1; rc = 0; | |
6e683a79 MW |
344 | end: |
345 | if (fd != -1) close(fd); | |
346 | return (rc); | |
347 | } | |
348 | ||
349 | static const struct timer_ops perfevent_ops = | |
b1a20bee MW |
350 | { "linux-perf-read-hw-cycles", BTF_CYOK, |
351 | perfevent_init, null_preflight, perfevent_now, | |
352 | diff_cycles, perfevent_teardown }; | |
6e683a79 MW |
353 | #define PERFEVENT_VANILLA_CYENT &perfevent_ops, |
354 | ||
355 | # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) | |
356 | ||
357 | /* Special syscall-free version for x86 using `rdpmc' instruction. * | |
358 | * | |
359 | * This is a bit weird because it does both kinds of measurement in a single | |
360 | * operation. | |
361 | */ | |
362 | ||
363 | static int perfevrdpmc_now(struct timer *t, | |
364 | struct bench_time *t_out, unsigned f) | |
365 | { | |
366 | const volatile struct perf_event_mmap_page *map = t->u_cy.pmc.map; | |
367 | unsigned long long tsc = tsc, toff = toff, tenb = tenb; | |
368 | unsigned long long cy = cy, cyoff = cyoff; | |
369 | unsigned long long m, hi, lo; | |
370 | unsigned tshift = tshift, tmult = tmult, q0, q1, ff; | |
371 | ||
372 | /* Repeat until we can complete this job without the buffer changing in the | |
373 | * middle. | |
374 | */ | |
375 | q0 = map->lock; | |
376 | __atomic_thread_fence(__ATOMIC_ACQ_REL); | |
377 | for (;;) { | |
378 | ff = 0; | |
379 | ||
380 | /* Read the passage-of-time information. */ | |
381 | if (map->cap_user_time) { | |
382 | tenb = map->time_enabled; | |
383 | tsc = __builtin_ia32_rdtsc(); | |
384 | tshift = map->time_shift; | |
385 | tmult = map->time_mult; | |
386 | toff = map->time_offset; | |
387 | ff |= BTF_TIMEOK; | |
388 | } | |
389 | ||
390 | /* Read the performance-counter information. */ | |
391 | if (map->cap_user_rdpmc) { | |
392 | cy = __builtin_ia32_rdpmc(map->index - 1); | |
393 | cyoff = map->offset; | |
394 | ff |= BTF_CYOK; | |
395 | } | |
396 | ||
397 | /* Check the sequence number again. */ | |
398 | __atomic_thread_fence(__ATOMIC_ACQ_REL); | |
399 | q1 = map->lock; | |
400 | if (q0 == q1) break; | |
401 | q0 = q1; | |
402 | } | |
403 | ||
404 | if (ff&BTF_TIMEOK) { | |
405 | /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters | |
406 | * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in | |
407 | * nanoseconds. | |
408 | */ | |
409 | ||
410 | m = (1ull << tshift) - 1; | |
411 | hi = tsc >> tshift; lo = tsc&m; | |
412 | t_out->t.rawns.i = hi*tmult + (lo*tmult >> tshift) + toff + tenb; | |
413 | t_out->f |= BTF_TIMEOK; | |
b64eb60f MW |
414 | } |
415 | ||
6e683a79 MW |
416 | if (ff&BTF_CYOK) { |
417 | /* We have the cycle count. */ | |
b64eb60f | 418 | |
6e683a79 MW |
419 | t_out->cy.i = cy + cyoff; |
420 | t_out->f |= BTF_CYOK; | |
421 | } | |
13ee7406 | 422 | return (0); |
b64eb60f | 423 | } |
13ee7406 | 424 | |
6e683a79 MW |
425 | static void perfevrdpmc_diff(struct timer *t, |
426 | struct bench_timing *delta_inout, | |
427 | const struct bench_time *t0, | |
428 | const struct bench_time *t1) | |
429 | { | |
b1a20bee | 430 | unsigned long long delta_ns; |
6e683a79 | 431 | unsigned f = t0->f&t1->f; |
13ee7406 | 432 | |
6e683a79 | 433 | if (f&BTF_TIMEOK) { |
b1a20bee MW |
434 | delta_ns = t1->t.rawns.i - t0->t.rawns.i; if (!delta_ns) delta_ns = 1; |
435 | delta_inout->t = delta_ns/(double)NS_PER_S; | |
6e683a79 MW |
436 | delta_inout->f |= BTF_TIMEOK; |
437 | } | |
438 | ||
439 | if (f&BTF_CYOK) { | |
440 | delta_inout->cy = t1->cy.i - t0->cy.i; | |
b1a20bee | 441 | if (!delta_inout->cy) delta_inout->cy = 1; |
6e683a79 MW |
442 | delta_inout->f |= BTF_CYOK; |
443 | } | |
444 | } | |
445 | ||
b1a20bee MW |
446 | static void perfevrdpmc_unmap |
447 | (const volatile struct perf_event_mmap_page *map, size_t mapsz) | |
448 | { if (map) munmap(UNQUALIFY(struct perf_event_mmap_page, map), mapsz); } | |
449 | ||
6e683a79 | 450 | static void perfevrdpmc_teardown(struct timer *t) |
b1a20bee | 451 | { perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz); } |
6e683a79 | 452 | |
b1a20bee | 453 | static int perfevrdpmc_setup(struct timer *t) |
6e683a79 MW |
454 | { |
455 | const volatile struct perf_event_mmap_page *map = 0; | |
b1a20bee | 456 | int pgsz, mapsz = 0, fd = -1, rc; |
6e683a79 MW |
457 | |
458 | /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how | |
459 | * big a page is. | |
460 | */ | |
461 | pgsz = sysconf(_SC_PAGESIZE); | |
462 | if (pgsz < 0) { | |
463 | debug("failed to discover page size!: %s", strerror(errno)); | |
464 | rc = -1; goto end; | |
465 | } | |
466 | ||
467 | /* Open the measurement descriptor and map it. */ | |
468 | fd = perfevent_open(); if (!fd) return (-1); | |
469 | mapsz = 2*pgsz; | |
470 | map = mmap(0, mapsz, PROT_READ, MAP_SHARED, fd, 0); | |
471 | if (map == MAP_FAILED) { | |
472 | debug("failed to map perf event: %s", strerror(errno)); | |
473 | return (-1); | |
474 | } | |
475 | ||
b1a20bee MW |
476 | t->u_cy.pmc.map = map; t->u_cy.pmc.sz = mapsz; map = 0; |
477 | t->u_cy.pmc.owner = syscall(SYS_gettid); rc = 0; | |
6e683a79 MW |
478 | end: |
479 | if (fd != -1) close(fd); | |
b1a20bee | 480 | perfevrdpmc_unmap(map, mapsz); |
6e683a79 MW |
481 | return (rc); |
482 | } | |
483 | ||
b1a20bee MW |
484 | static int perfevrdpmc_preflight(struct timer *t) |
485 | { | |
486 | if (!t->u_cy.pmc.map) { debug("retry perf event map setup"); goto reopen; } | |
487 | if (t->u_cy.pmc.owner != syscall(SYS_gettid)) { | |
488 | debug("pid changed: reopen perf event map"); | |
489 | perfevrdpmc_unmap(t->u_cy.pmc.map, t->u_cy.pmc.sz); | |
490 | t->u_cy.pmc.map = 0; goto reopen; | |
491 | } | |
492 | return (0); | |
493 | ||
494 | reopen: | |
495 | if (perfevrdpmc_setup(t)) return (-1); | |
496 | return (0); | |
497 | } | |
498 | ||
499 | static int perfevrdpmc_cyinit(struct timer *t) | |
500 | { | |
501 | unsigned a, b, c, d; | |
502 | ||
503 | # ifdef HAVE_VALGRIND_VALGRIND_H | |
504 | /* Valgrind doesn't like `rdpmc' instructions, so just bail. */ | |
505 | if (RUNNING_ON_VALGRIND) return (-1); | |
506 | # endif | |
507 | ||
508 | /* We need `rdtsc' to do the passage-of-time measurement. */ | |
509 | if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC)) | |
510 | { debug("no `rdtsc' instrunction"); return (-1); } | |
511 | ||
512 | /* Set things up. */ | |
513 | if (perfevrdpmc_setup(t)) return (-1); | |
514 | return (0); | |
515 | } | |
516 | ||
6e683a79 | 517 | static const struct timer_ops perfevrdpmc_cyops = |
b1a20bee MW |
518 | { "linux-x86-perf-rdpmc-hw-cycles", BTF_TIMEOK | BTF_CYOK, |
519 | perfevrdpmc_cyinit, perfevrdpmc_preflight, perfevrdpmc_now, | |
6e683a79 MW |
520 | perfevrdpmc_diff, perfevrdpmc_teardown }; |
521 | ||
522 | static int perfevrdpmc_clkinit(struct timer *t) | |
523 | { | |
b1a20bee MW |
524 | if (t->ops[CY] != &perfevrdpmc_cyops) { |
525 | debug("`linux-x86-perf-rdpmc-hw-cycles' not set as cycle subtimer"); | |
6e683a79 MW |
526 | return(-1); |
527 | } | |
528 | return (0); | |
529 | } | |
530 | ||
531 | static const struct timer_ops perfevrdpmc_clkops = | |
532 | { "linux-x86-perf-rdpmc-hw-cycles", 0, | |
b1a20bee | 533 | perfevrdpmc_clkinit, null_preflight, null_now, |
6e683a79 MW |
534 | null_diff, null_teardown }; |
535 | ||
536 | # define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops, | |
537 | # define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops, | |
538 | ||
539 | # else | |
540 | # define PERFEVENT_RDPMC_CLKENT | |
541 | # define PERFEVENT_RDPMC_CYENT | |
542 | # endif | |
543 | ||
544 | # define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT | |
545 | # define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT | |
b64eb60f | 546 | #else |
6e683a79 | 547 | # define PERFEVENT_CLKENT |
b64eb60f MW |
548 | # define PERFEVENT_CYENT |
549 | #endif | |
550 | ||
551 | /*----- Intel time-stamp counter ------------------------------------------*/ | |
552 | ||
67b5031e MW |
553 | /* This is a cycle counter based on the Intel `rdtsc' instruction. It's not |
554 | * really suitable for performance measurement because it gets confused by | |
555 | * CPU frequency adjustments. | |
556 | */ | |
557 | ||
6e683a79 | 558 | #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__)) |
b64eb60f | 559 | |
6e683a79 MW |
560 | static int x86rdtsc_now(struct timer *t, |
561 | struct bench_time *t_out, unsigned f) | |
562 | { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; return (0); } | |
b64eb60f MW |
563 | |
564 | static int x86rdtsc_init(struct timer *t) | |
565 | { | |
13ee7406 MW |
566 | unsigned a, b, c, d; |
567 | ||
568 | if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC)) | |
b64eb60f | 569 | { debug("no `rdtsc' instrunction"); return (-1); } |
6e683a79 MW |
570 | t->u_cy.tscaux = ~0u; |
571 | return (0); | |
572 | } | |
573 | ||
574 | static int x86rdtscp_now(struct timer *t, | |
575 | struct bench_time *t_out, unsigned f) | |
576 | { | |
577 | unsigned tscaux; | |
578 | unsigned long long n; | |
579 | ||
580 | n = __builtin_ia32_rdtscp(&tscaux); | |
581 | if (!(f&BTF_T1)) | |
582 | t->u_cy.tscaux = tscaux; | |
583 | else if (t->u_cy.tscaux != tscaux) { | |
584 | debug("tscaux mismatch: new 0x%08x /= old 0x%08x", | |
585 | tscaux, t->u_cy.tscaux); | |
586 | return (-1); | |
587 | } | |
588 | t_out->cy.i = n; t_out->f |= BTF_CYOK; return (0); | |
589 | } | |
590 | ||
591 | static int x86rdtscp_init(struct timer *t) | |
592 | { | |
593 | unsigned a, b, c, d; | |
594 | ||
595 | if (!__get_cpuid(0x80000001, &a, &b, &c, &d) || !(d&CPUID_1xD_TSCP)) | |
596 | { debug("no `rdtscp' instrunction"); return (-1); } | |
13ee7406 | 597 | return (0); |
b64eb60f MW |
598 | } |
599 | ||
13ee7406 | 600 | static const struct timer_ops x86rdtsc_ops = |
b1a20bee MW |
601 | { "x86-rdtsc", BTF_CYOK, |
602 | x86rdtsc_init, null_preflight, x86rdtsc_now, | |
603 | diff_cycles, null_teardown }; | |
6e683a79 | 604 | static const struct timer_ops x86rdtscp_ops = |
b1a20bee MW |
605 | { "x86-rdtscp", BTF_CYOK, |
606 | x86rdtscp_init, null_preflight, | |
607 | x86rdtscp_now, diff_cycles, null_teardown }; | |
13ee7406 | 608 | |
6e683a79 | 609 | # define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops, |
b64eb60f | 610 | #else |
13ee7406 | 611 | # define X86RDTSC_CYENT |
b64eb60f MW |
612 | #endif |
613 | ||
614 | /*----- POSIX `clock_gettime' ---------------------------------------------*/ | |
615 | ||
67b5031e MW |
616 | /* This is a real-time clock based on the POSIX time interface, with up to |
617 | * nanosecond precision. | |
618 | */ | |
619 | ||
b64eb60f MW |
620 | #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID) |
621 | ||
6e683a79 | 622 | static int gettime_now(struct timer *t, struct bench_time *t_out, unsigned f) |
b64eb60f MW |
623 | { |
624 | struct timespec now; | |
625 | ||
626 | if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now)) | |
6e683a79 MW |
627 | { debug("error reading POSIX clock: %s", strerror(errno)); return (0); } |
628 | ASSIGN64(t_out->t.ts.s, now.tv_sec); t_out->t.ts.ns = now.tv_nsec; | |
629 | t_out->f |= BTF_TIMEOK; return (0); | |
b64eb60f MW |
630 | } |
631 | ||
13ee7406 | 632 | static const struct timer_ops gettime_ops = |
b1a20bee MW |
633 | { "posix-thread-cputime", BTF_TIMEOK, |
634 | null_init, null_preflight, gettime_now, diff_ts, null_teardown }; | |
13ee7406 MW |
635 | |
636 | # define GETTIME_CLKENT &gettime_ops, | |
b64eb60f MW |
637 | #else |
638 | # define GETTIME_CLKENT | |
639 | #endif | |
640 | ||
641 | /*----- Standard C `clock' ------------------------------------------------*/ | |
642 | ||
67b5031e MW |
643 | /* This is a real-time clock based on the C `clock' function which is |
644 | * guaranteed to be available, though it's not likely to be very good. | |
645 | */ | |
646 | ||
6e683a79 | 647 | static int clock_now(struct timer *t, struct bench_time *t_out, unsigned f) |
b64eb60f | 648 | { |
6e683a79 | 649 | clock_t now; |
b64eb60f MW |
650 | |
651 | now = clock(); | |
652 | if (now == (clock_t)-1) { | |
653 | debug("error reading standard clock: %s", strerror(errno)); | |
6e683a79 | 654 | return (0); |
b64eb60f | 655 | } |
6e683a79 MW |
656 | t_out->t.clk = now; t_out->f |= BTF_TIMEOK; return (0); |
657 | } | |
658 | ||
659 | static void clock_diff(struct timer *t, struct bench_timing *delta_inout, | |
660 | const struct bench_time *t0, | |
661 | const struct bench_time *t1) | |
662 | { | |
b1a20bee | 663 | clock_t delta_clk; |
6e683a79 MW |
664 | unsigned f = t0->f&t1->f; |
665 | ||
666 | if (f&BTF_TIMEOK) { | |
b1a20bee MW |
667 | delta_clk = t1->t.clk - t0->t.clk; if (!delta_clk) delta_clk = 1; |
668 | delta_inout->t = delta_clk/(double)CLOCKS_PER_SEC; | |
6e683a79 MW |
669 | delta_inout->f |= BTF_TIMEOK; |
670 | } | |
b64eb60f MW |
671 | } |
672 | ||
13ee7406 | 673 | static const struct timer_ops clock_ops = |
b1a20bee MW |
674 | { "stdc-clock", BTF_TIMEOK, null_init, null_preflight, clock_now, |
675 | clock_diff, null_teardown }; | |
13ee7406 MW |
676 | |
677 | #define CLOCK_CLKENT &clock_ops, | |
b64eb60f MW |
678 | |
679 | /*----- Timing setup ------------------------------------------------------*/ | |
680 | ||
67b5031e | 681 | /* Tables of timing sources. */ |
13ee7406 | 682 | static const struct timer_ops |
6e683a79 MW |
683 | *const clktab[] = { PERFEVENT_CLKENT |
684 | GETTIME_CLKENT | |
685 | CLOCK_CLKENT | |
686 | BROKEN_ENT | |
687 | 0 }, | |
688 | *const cytab[] = { PERFEVENT_CYENT | |
689 | X86RDTSC_CYENT | |
690 | NULL_ENT | |
691 | BROKEN_ENT | |
692 | 0 }; | |
13ee7406 MW |
693 | |
694 | static const struct timertab { | |
695 | const char *what; | |
696 | const char *env; | |
697 | const struct timer_ops *const *opstab; | |
698 | } timertab[] = { | |
b1a20bee MW |
699 | { "clock", "MLIB_BENCH_CLKTIMER", clktab }, |
700 | { "cycle", "MLIB_BENCH_CYCLETIMER", cytab } | |
13ee7406 | 701 | }; |
b64eb60f | 702 | |
67b5031e MW |
703 | /* --- @find_timer@ --- * |
704 | * | |
705 | * Arguments: @const char *name@ = timer name | |
706 | * @size_t sz@ = length of name | |
13ee7406 | 707 | * @unsigned tm@ = which subtimer we're looking for |
67b5031e MW |
708 | * |
709 | * Returns: The table entry matching the given name, or null if there | |
710 | * isn't one. | |
711 | */ | |
712 | ||
13ee7406 MW |
713 | static const struct timer_ops *find_timer(const char *name, size_t sz, |
714 | unsigned tm) | |
b64eb60f | 715 | { |
13ee7406 MW |
716 | const struct timer_ops *const *tt; |
717 | ||
718 | for (tt = timertab[tm].opstab; *tt; tt++) { | |
719 | if (strlen((*tt)->name) == sz && | |
720 | MEMCMP(name, ==, (*tt)->name, sz)) | |
721 | return (*tt); | |
b64eb60f | 722 | } |
13ee7406 MW |
723 | debug("%s timer `%.*s' not found", |
724 | timertab[tm].what, (int)sz, name); return (0); | |
b64eb60f MW |
725 | } |
726 | ||
67b5031e MW |
727 | /* --- @try_timer@ --- * |
728 | * | |
729 | * Arguments: @struct timer *t@ = timer structure | |
13ee7406 MW |
730 | * @const struct timer_ops *ops@ = timer ops |
731 | * @unsigned tm@ = which subtimer we're setting | |
67b5031e | 732 | * |
13ee7406 | 733 | * Returns: Zero on success, %$-1$% if timer failed. |
67b5031e MW |
734 | * |
735 | * Use: Tries to initialize the timer @t@, reporting a debug message | |
736 | * if it worked. | |
737 | */ | |
738 | ||
b64eb60f | 739 | static int try_timer(struct timer *t, |
13ee7406 | 740 | const struct timer_ops *ops, unsigned tm) |
b64eb60f | 741 | { |
b1a20bee MW |
742 | struct bench_time t0, t1; |
743 | struct bench_timing delta; | |
744 | int rc; | |
745 | unsigned f = 0; | |
746 | #define f_teardown 1u | |
747 | ||
748 | if (ops->init(t)) { rc = -1; goto end; } | |
749 | f |= f_teardown; | |
750 | ||
751 | if (ops->preflight(t)) { rc = -1; goto end; } | |
752 | t0.f = t1.f = 0; | |
753 | do { | |
754 | while (ops->now(t, &t0, BTF_T0)); | |
755 | } while (ops->now(t, &t1, BTF_T1)); | |
756 | delta.f = 0; ops->diff(t, &delta, &t0, &t1); | |
757 | if ((ops->f ^ delta.f)&BTF_ANY) { rc = -1; goto end; } | |
758 | ||
13ee7406 | 759 | debug("selected %s timer `%s'", timertab[tm].what, ops->name); |
b1a20bee MW |
760 | t->ops[tm] = ops; f &= ~f_teardown; rc = 0; |
761 | ||
762 | end: | |
763 | if (f&f_teardown) ops->teardown(t); | |
764 | return (rc); | |
765 | ||
766 | #undef f_teardown | |
b64eb60f MW |
767 | } |
768 | ||
67b5031e MW |
769 | /* --- @select_timer@ --- * |
770 | * | |
771 | * Arguments: @struct timer *t@ = timer structure | |
13ee7406 MW |
772 | * @unsigned tm@ = which subtimer we're setting |
773 | * @const char *config@, @size_t sz@ = config string | |
67b5031e | 774 | * |
13ee7406 | 775 | * Returns: Zero on success, %$-1$% if timer failed. |
67b5031e MW |
776 | * |
777 | * Use: Select a timer from the table. If the environment variable | |
778 | * is set, then parse a comma-separated list of timer names and | |
779 | * use the first one listed that seems to work; otherwise, try | |
780 | * the timers in the table in order. | |
781 | */ | |
782 | ||
13ee7406 MW |
783 | static int select_timer(struct timer *t, unsigned tm, |
784 | const char *config, size_t sz) | |
b64eb60f | 785 | { |
13ee7406 MW |
786 | const char *p, *l; |
787 | const struct timer_ops *ops, *const *tt; | |
b64eb60f | 788 | |
13ee7406 MW |
789 | if (!config) { |
790 | for (tt = timertab[tm].opstab; *tt; tt++) | |
791 | if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0); | |
b64eb60f | 792 | } else { |
13ee7406 | 793 | l = config + sz; |
b64eb60f | 794 | for (;;) { |
13ee7406 MW |
795 | p = memchr(config, ',', l - config); if (!p) p = l; |
796 | ops = find_timer(config, p - config, tm); | |
797 | if (ops && !try_timer(t, ops, tm)) return (0); | |
798 | if (p >= l) break; | |
799 | config = p + 1; | |
b64eb60f MW |
800 | } |
801 | } | |
13ee7406 | 802 | debug("no suitable %s timer found", timertab[tm].what); return (-1); |
b64eb60f MW |
803 | } |
804 | ||
67b5031e | 805 | /* Bench timer operations. */ |
13ee7406 MW |
806 | static void timer_describe(struct bench_timer *tm, dstr *d) |
807 | { | |
808 | struct timer *t = (struct timer *)tm; | |
809 | unsigned i; | |
810 | ||
811 | dstr_puts(d, "builtin: "); | |
812 | for (i = 0; i < NTIMER; i++) { | |
813 | if (i) dstr_puts(d, ", "); | |
814 | dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name); | |
815 | } | |
816 | } | |
817 | ||
b1a20bee MW |
818 | static int timer_preflight(struct bench_timer *tm) |
819 | { | |
820 | struct timer *t = (struct timer *)tm; | |
821 | unsigned i; | |
822 | ||
823 | for (i = 0; i < NTIMER; i++) if (t->ops[i]->preflight(t)) return (-1); | |
824 | return (0); | |
825 | } | |
826 | ||
6e683a79 MW |
827 | static int timer_now(struct bench_timer *tm, |
828 | struct bench_time *t_out, unsigned f) | |
b64eb60f MW |
829 | { |
830 | struct timer *t = (struct timer *)tm; | |
13ee7406 | 831 | unsigned i; |
b64eb60f | 832 | |
6e683a79 MW |
833 | t_out->f = 0; |
834 | for (i = 0; i < NTIMER; i++) if (t->ops[i]->now(t, t_out, f)) return (-1); | |
835 | return (0); | |
836 | } | |
837 | ||
838 | static void timer_diff(struct bench_timer *tm, | |
839 | struct bench_timing *t_out, | |
840 | const struct bench_time *t0, | |
841 | const struct bench_time *t1) | |
842 | { | |
843 | struct timer *t = (struct timer *)tm; | |
844 | unsigned i; | |
845 | ||
846 | t_out->f = 0; | |
847 | for (i = 0; i < NTIMER; i++) t->ops[i]->diff(t, t_out, t0, t1); | |
b64eb60f | 848 | } |
13ee7406 | 849 | |
b64eb60f MW |
850 | static void timer_destroy(struct bench_timer *tm) |
851 | { | |
852 | struct timer *t = (struct timer *)tm; | |
13ee7406 | 853 | unsigned i; |
b64eb60f MW |
854 | |
855 | if (!t) return; | |
13ee7406 MW |
856 | for (i = 0; i < NTIMER; i++) |
857 | if (t->ops[i]) t->ops[i]->teardown(t); | |
b1a20bee | 858 | x_free(t->a, t); |
b64eb60f MW |
859 | } |
860 | ||
13ee7406 | 861 | static const struct bench_timerops timer_ops = |
b1a20bee | 862 | { timer_describe, timer_preflight, timer_now, timer_diff, timer_destroy }; |
b64eb60f | 863 | |
67b5031e MW |
864 | /* --- @bench_createtimer@ --- * |
865 | * | |
13ee7406 | 866 | * Arguments: @const char *config@ = timer configuration string |
67b5031e MW |
867 | * |
868 | * Returns: A freshly constructed standard timer object. | |
869 | * | |
870 | * Use: Allocate a timer. Dispose of it by calling | |
871 | * @tm->ops->destroy(tm)@ when you're done. | |
13ee7406 MW |
872 | * |
873 | * Applications should not set configuration strings except as | |
874 | * established by user action, e.g., from a command-line option, | |
875 | * environment variable, or configuration file. | |
67b5031e MW |
876 | */ |
877 | ||
13ee7406 | 878 | struct bench_timer *bench_createtimer(const char *config) |
b64eb60f MW |
879 | { |
880 | struct timer *t = 0; | |
881 | struct bench_timer *ret = 0; | |
13ee7406 MW |
882 | struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 }; |
883 | const struct timer_ops *const *tt; | |
884 | const char *p, *l; size_t n, nn; | |
885 | unsigned i; | |
b64eb60f | 886 | |
13ee7406 MW |
887 | /* Parse the configuration string. */ |
888 | if (config) { | |
889 | ||
890 | /* The first thing to do is find the end of the string. */ | |
891 | l = config + strlen(config); | |
892 | ||
893 | for (;;) { | |
894 | /* Process the whitespace-sparated words of the string one by one. */ | |
895 | ||
896 | /* Skip over any initial whitespace. If we hit the end of the string | |
897 | * then we're done. | |
898 | */ | |
899 | for (;;) | |
900 | if (config >= l) goto done_config; | |
901 | else if (!ISSPACE(*config)) break; | |
902 | else config++; | |
903 | ||
904 | /* There's definitely a word here. Find the end of it. */ | |
905 | for (p = config; p < l && !ISSPACE(*p); p++); | |
906 | nn = p - config; | |
907 | ||
908 | /* Try various simple keywords. */ | |
909 | #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn)) | |
910 | ||
911 | if (MATCHP("list")) { | |
912 | /* The `list' keyword requests lists of the available timer | |
913 | * implementations. | |
914 | */ | |
915 | ||
916 | for (i = 0; i < NTIMER; i++) { | |
917 | printf("%s timers:", timertab[i].what); | |
918 | for (tt = timertab[i].opstab; *tt; tt++) | |
b1a20bee | 919 | if (!((*tt)->f&TF_SECRET)) printf(" %s", (*tt)->name); |
13ee7406 MW |
920 | putchar('\n'); |
921 | } | |
922 | goto next_config; | |
923 | } | |
924 | ||
925 | #undef MATCHP | |
926 | ||
927 | /* Otherwise it's an assignment, setting a subtimer list. */ | |
928 | p = memchr(config, '=', nn); | |
929 | if (!p) | |
930 | n = nn; | |
931 | else { | |
932 | n = p - config; | |
933 | for (i = 0; i < NTIMER; i++) | |
934 | if (STRNCMP(config, ==, timertab[i].what, n) && | |
935 | !timertab[i].what[n]) { | |
936 | if (tmconf[i].p) | |
937 | debug("duplicate %s timer list", timertab[i].what); | |
938 | tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1; | |
939 | goto next_config; | |
940 | } | |
941 | } | |
942 | debug("unrecognized config keyword `%.*s'", (int)n, config); | |
943 | ||
944 | /* Move on to the next word. */ | |
945 | next_config: | |
946 | config += nn; | |
947 | } | |
948 | done_config:; | |
949 | } | |
950 | ||
951 | /* Override these settings from the environment. */ | |
952 | for (i = 0; i < NTIMER; i++) { | |
953 | p = getenv(timertab[i].env); | |
954 | if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); } | |
955 | } | |
956 | ||
957 | /* All seems well. Allocate the timer object. */ | |
b1a20bee | 958 | XNEW(t); t->a = arena_global; |
13ee7406 MW |
959 | for (i = 0; i < NTIMER; i++) t->ops[i] = 0; |
960 | ||
961 | /* Try to set up the subtimers. */ | |
6e683a79 | 962 | for (i = NTIMER; i--; ) |
13ee7406 MW |
963 | if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end; |
964 | ||
965 | /* All is done. */ | |
289651a7 | 966 | t->_t.ops = &timer_ops; t->_t.ref = 1; ret = &t->_t; t = 0; |
b64eb60f MW |
967 | end: |
968 | if (t) timer_destroy(&t->_t); | |
969 | return (ret); | |
970 | } | |
971 | ||
67b5031e | 972 | /*----- Benchmarking ------------------------------------------------------*/ |
b64eb60f | 973 | |
67b5031e MW |
974 | /* --- @bench_init@ --- * |
975 | * | |
976 | * Arguments: @struct bench_state *b@ = bench state to initialize | |
13ee7406 | 977 | * @struct bench_timer *tm@ = timer to attach, or null |
67b5031e | 978 | * |
13ee7406 | 979 | * Returns: Zero on success, %$-1$% on failure. |
67b5031e | 980 | * |
13ee7406 MW |
981 | * Use: Initialize the benchmark state. On success, the timer state |
982 | * still needs to be calibrated (use @bench_calibrate@) before | |
983 | * it can be used, but this will be done automatically by | |
984 | * @bench_measure@ if it's not done by hand earlier. The timer | |
985 | * is now owned by the benchmark state and will be destroyed by | |
986 | * @bench_destroy@. | |
987 | * | |
988 | * The only reason for failure is if @tm@ was null on entry, | |
989 | * and automatic construction of a timer failed. The state is | |
990 | * safe to discard, but calling @bench_destroy@ is safe too. | |
67b5031e | 991 | */ |
b64eb60f | 992 | |
13ee7406 MW |
993 | int bench_init(struct bench_state *b, struct bench_timer *tm) |
994 | { | |
995 | int rc; | |
996 | ||
997 | b->tm = 0; | |
998 | ||
999 | if (!tm) { | |
1000 | tm = bench_createtimer(0); | |
1001 | if (!tm) { rc = -1; goto end; } | |
1002 | } | |
1003 | ||
1004 | b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0; | |
1005 | end: | |
1006 | return (rc); | |
1007 | } | |
b64eb60f | 1008 | |
67b5031e MW |
1009 | /* --- @bench_destroy@ --- * |
1010 | * | |
1011 | * Arguments: @struct bench_state *b@ = bench state | |
1012 | * | |
1013 | * Returns: --- | |
1014 | * | |
1015 | * Use: Destroy the benchmark state, releasing the resources that it | |
1016 | * holds. | |
1017 | */ | |
1018 | ||
b64eb60f | 1019 | void bench_destroy(struct bench_state *b) |
289651a7 | 1020 | { if (b->tm && !--b->tm->ref) { b->tm->ops->destroy(b->tm); b->tm = 0; } } |
b64eb60f | 1021 | |
b1a20bee | 1022 | /* --- @spin@ --- * |
67b5031e MW |
1023 | * |
1024 | * Arguments: @unsigned long n@ = iteration count | |
1025 | * @void *ctx@ = context pointer (ignored) | |
1026 | * | |
1027 | * Returns: --- | |
1028 | * | |
1029 | * Use: Does nothing at all for @n@ iterations. Used to calibrate | |
1030 | * the benchmarking state. | |
1031 | */ | |
1032 | ||
b1a20bee | 1033 | static void spin(unsigned long n, void *ctx) |
b64eb60f MW |
1034 | { while (n--) RELAX; } |
1035 | ||
67b5031e MW |
1036 | /* --- @bench_calibrate@ --- * |
1037 | * | |
1038 | * Arguments: @struct bench_state *b@ = bench state | |
b1a20bee | 1039 | * @unsigned f@ = calibration flags |
67b5031e | 1040 | * |
13ee7406 | 1041 | * Returns: Zero on success, %$-1$% if calibration failed. |
67b5031e MW |
1042 | * |
1043 | * Use: Calibrate the benchmark state, so that it can be used to | |
1044 | * measure performance reasonably accurately. | |
b1a20bee MW |
1045 | * |
1046 | * Calibration will take into account how the subject code is | |
1047 | * going to be located. If you're going to use @BENCH_MEASURE@ | |
1048 | * to measure a piece of literal code, then leave @f@ zero. If | |
1049 | * the code to be measured is going to be executed via an | |
1050 | * indirect branch, e.g., through the @measure@ function, then | |
1051 | * set @BTF_INDIRECT@. | |
67b5031e MW |
1052 | */ |
1053 | ||
13ee7406 MW |
1054 | #define T_CLB 0.0625 /* calibration time limit */ |
1055 | ||
b1a20bee | 1056 | int bench_calibrate(struct bench_state *b, unsigned f) |
b64eb60f MW |
1057 | { |
1058 | struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT; | |
b1a20bee | 1059 | struct bench_timer *tm = b->tm; |
b64eb60f | 1060 | struct bench_timing delta; |
6e683a79 | 1061 | double n, r; |
b1a20bee MW |
1062 | unsigned i, tf = BTF_ANY; |
1063 | BENCH_TIMELOOP_DECLS; | |
b64eb60f MW |
1064 | int rc; |
1065 | ||
67b5031e MW |
1066 | /* The model here is that a timing loop has a fixed overhead as we enter |
1067 | * and leave (e.g., to do with the indirect branch into the code), and | |
1068 | * per-iteration overheads as we check the counter and loop back. We aim | |
1069 | * to split these apart using linear regression. | |
1070 | */ | |
1071 | ||
1072 | /* If we've already calibrated then there's nothing to do. */ | |
13ee7406 | 1073 | if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1); |
b64eb60f | 1074 | |
b1a20bee MW |
1075 | /* Run the timer preflight check. */ |
1076 | if (tm->ops->preflight(tm)) { rc = -1; goto end; } | |
1077 | ||
1078 | /* Exercise the inner loop a few times to educate the branch predictor. | |
1079 | * This is only useful if we're executing via an indirect call. | |
1080 | */ | |
1081 | if (f&BTF_INDIRECT) { | |
1082 | for (i = 0; i < 50; i++) | |
289651a7 | 1083 | BENCH_TIMELOOP_TAG(setup, b->tm, &delta, 10000, ;) |
b1a20bee MW |
1084 | LAUNDER(&spin)(_bench_n, 0); |
1085 | } | |
b64eb60f | 1086 | |
67b5031e MW |
1087 | /* Now we measure idle loops until they take sufficiently long -- or we run |
1088 | * out of counter. | |
1089 | */ | |
b64eb60f | 1090 | debug("calibrating..."); |
6e683a79 | 1091 | n = 1.0; |
b64eb60f | 1092 | for (;;) { |
67b5031e MW |
1093 | |
1094 | /* Measure @n@ iterations of the idle loop. */ | |
b1a20bee | 1095 | if (f&BTF_INDIRECT) |
289651a7 | 1096 | BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;) |
b1a20bee MW |
1097 | LAUNDER(&spin)(_bench_n, 0); |
1098 | else | |
289651a7 | 1099 | BENCH_TIMELOOP_TAG(calibrate, b->tm, &delta, n, ;) |
b1a20bee MW |
1100 | while (_bench_n--) RELAX; |
1101 | tf &= delta.f; if (!(tf&BTF_TIMEOK)) { rc = -1; goto end; } | |
67b5031e MW |
1102 | |
1103 | /* Register the timings with the regression machinery. */ | |
b64eb60f | 1104 | linreg_update(&lr_clk, n, delta.t); |
b1a20bee | 1105 | if (!(tf&BTF_CYOK)) |
6e683a79 | 1106 | debug(" n = %10.0f; t = %12g s", n, delta.t); |
b64eb60f MW |
1107 | else { |
1108 | linreg_update(&lr_cy, n, delta.cy); | |
6e683a79 | 1109 | debug(" n = %10.0f; t = %12g s, cy = %10.0f", n, delta.t, delta.cy); |
b64eb60f | 1110 | } |
67b5031e MW |
1111 | |
1112 | /* If we're done then stop. */ | |
13ee7406 | 1113 | if (delta.t >= T_CLB) break; |
b64eb60f | 1114 | if (n >= ULONG_MAX - n/3) break; |
67b5031e MW |
1115 | |
1116 | /* Update the counter and continue. */ | |
6e683a79 | 1117 | n += n/3.0 + 1.0; |
b64eb60f MW |
1118 | } |
1119 | ||
67b5031e MW |
1120 | /* Now run the linear regression to extract the constant and per-iteration |
1121 | * overheads. | |
1122 | */ | |
13ee7406 MW |
1123 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r); |
1124 | debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r); | |
b1a20bee | 1125 | if (tf&BTF_CYOK) { |
13ee7406 MW |
1126 | linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r); |
1127 | debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r); | |
b64eb60f | 1128 | } |
67b5031e MW |
1129 | |
1130 | /* We're done. */ | |
13ee7406 | 1131 | rc = 0; |
b64eb60f | 1132 | end: |
b1a20bee | 1133 | b->f |= tf | BTF_CLB; /* no point trying again */ |
b64eb60f MW |
1134 | return (rc); |
1135 | } | |
1136 | ||
b1a20bee | 1137 | /* --- @bench_preflight@ --- * |
67b5031e | 1138 | * |
8d372122 | 1139 | * Arguments: @struct bench_state *b@ = benchmark state |
67b5031e | 1140 | * |
b1a20bee | 1141 | * Returns: Zero on success, %$-1$% on failure. |
67b5031e | 1142 | * |
b1a20bee MW |
1143 | * Use: Prepares for benchmarking on the current thread. Current |
1144 | * checks are that the timer is calibrated and that it can | |
1145 | * successfully measure time; the timer preflight is also run. | |
67b5031e | 1146 | * |
b1a20bee MW |
1147 | * Users are unlikely to find this function useful: it's called |
1148 | * automatically by the @BENCH_MEASURE@ macro and the | |
1149 | * @bench_measure@ function. | |
67b5031e MW |
1150 | */ |
1151 | ||
b1a20bee | 1152 | int bench_preflight(struct bench_state *b) |
b64eb60f | 1153 | { |
b1a20bee | 1154 | struct bench_timer *tm = b->tm; |
b64eb60f | 1155 | |
b1a20bee | 1156 | if (!(b->f&BTF_CLB)) return (-1); |
8d372122 | 1157 | if (!(b->f&BTF_TIMEOK)) return (-1); |
b1a20bee MW |
1158 | if (tm->ops->preflight(tm)) return (-1); |
1159 | debug("measuring..."); | |
1160 | return (0); | |
1161 | } | |
1162 | ||
1163 | /* --- @bench_adapt@ --- * | |
1164 | * | |
289651a7 MW |
1165 | * Arguments: @double *n_inout@ = number of iterations, updated |
1166 | * @double target_s@ = target time in seconds | |
b1a20bee MW |
1167 | * @const struct bench_timing *t@ = timing from the previous run |
1168 | * | |
1169 | * Returns: Nonzero if the measurement is sufficient; zero to run again. | |
1170 | * | |
1171 | * Use: This function determines a suitable number of iterations of a | |
1172 | * benchmark function to perform next. It is used in a loop | |
1173 | * such as the following. | |
1174 | * | |
1175 | * @double n = 1.0;@ | |
1176 | * @struct bench_timing t;@ | |
1177 | * | |
1178 | * @do {@ | |
1179 | * (run @n@ iterations; set @t@ to the timing) | |
1180 | * @} while (!bench_adapt(b, &n, &t));@ | |
1181 | * | |
1182 | * On entry, @*n_inout@ should be the number of iterations | |
1183 | * performed by the previous pass, and @*t@ the resulting time; | |
1184 | * the @BTF_TIMEOK@ flag must be set @t->f@. If the timing is | |
289651a7 | 1185 | * sufficient -- @t->t@ is sufficiently close to @target_s@ |
b1a20bee MW |
1186 | * -- then the function returns nonzero to indicate that |
1187 | * measurement is complete. Otherwise, it sets @*n_inout@ to a | |
1188 | * new, larger iteration count and returns zero to indicate that | |
1189 | * a further pass is necessary. | |
1190 | */ | |
67b5031e | 1191 | |
289651a7 MW |
1192 | int bench_adapt(double *n_inout, double target_s, |
1193 | const struct bench_timing *t) | |
b1a20bee MW |
1194 | { |
1195 | double n = *n_inout, nn; | |
1196 | ||
1197 | /* Dump the results for debugging. */ | |
1198 | if (!(t->f&BTF_CYOK)) debug(" n = %10.0f; t = %12g", n, t->t); | |
1199 | else debug(" n = %10.0f; t = %12g, cy = %10.0f", n, t->t, t->cy); | |
1200 | ||
1201 | /* Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal | |
c81c35df MW |
1202 | * time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy. |
1203 | * Otherwise, we need to scale up the iteration count. The obvious next | |
1204 | * choice is %$n' = n T/t$%. Alas, rounding is a problem: if | |
1205 | * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no | |
1206 | * progress. We know that %$T/t > \sqrt{2}%, so this can only happen when | |
1207 | * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%. On the other | |
1208 | * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying | |
1209 | * again with %$n' = n + 1$% iterations will very likely work. | |
1210 | */ | |
289651a7 MW |
1211 | if (t->t >= 0.707*target_s) return (1); |
1212 | nn = n*target_s/t->t; modf(nn, &nn); | |
b1a20bee MW |
1213 | *n_inout = nn > n ? nn : n + 1; |
1214 | return (0); | |
1215 | } | |
6e683a79 | 1216 | |
b1a20bee MW |
1217 | /* --- @bench_adjust@ --- * |
1218 | * | |
1219 | * Arguments: @struct bench_state *b@ = benchmark state | |
1220 | * @struct bench_timing *t_inout@ = timing to adjust | |
1221 | * @double n@ = number of external iterations performed | |
1222 | * @double base@ = number of internal operations per external | |
1223 | * iteration | |
1224 | * | |
1225 | * Returns: --- | |
1226 | * | |
1227 | * Use: Adjusts a raw timing, as captured by @BENCH_TIMELOOP@, | |
1228 | * according to the calibration data captured in @b@. | |
1229 | * On exit, the timing data is updated, and @t->n@ is set to the | |
1230 | * product @n*base@. | |
1231 | */ | |
1232 | ||
1233 | void bench_adjust(struct bench_state *b, | |
1234 | struct bench_timing *t_inout, double n, double base) | |
1235 | { | |
67b5031e MW |
1236 | |
1237 | /* Adjust according to the calibration. */ | |
b1a20bee MW |
1238 | t_inout->t -= n*b->clk.m + b->clk.c; |
1239 | if (t_inout->f&BTF_CYOK) t_inout->cy -= n*b->cy.m + b->cy.c; | |
67b5031e MW |
1240 | |
1241 | /* Report the results, if debugging. */ | |
b1a20bee MW |
1242 | if (!(t_inout->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_inout->t); |
1243 | else debug(" adjusted t' = %12g, cy' = %10.0f", t_inout->t, t_inout->cy); | |
1244 | if (!(t_inout->f&BTF_CYOK)) | |
1245 | debug(" %g s per iter; %g iters/s", t_inout->t/n, n/t_inout->t); | |
b64eb60f | 1246 | else |
b1a20bee MW |
1247 | debug(" %g s (%g cy) per iter; %g iters/s", |
1248 | t_inout->t/n, t_inout->cy/n, n/t_inout->t); | |
67b5031e MW |
1249 | |
1250 | /* All done. */ | |
b1a20bee MW |
1251 | t_inout->n = n*base; |
1252 | } | |
1253 | ||
1254 | /* --- @bench_measure@ --- * | |
1255 | * | |
1256 | * Arguments: @struct bench_state *b@ = benchmark state | |
1257 | * @struct bench_timing *t_out@ = where to leave the timing | |
1258 | * @double base@ = number of internal units per call | |
1259 | * @bench_fn *fn@, @void *ctx@ = benchmark function to run | |
1260 | * | |
1261 | * Returns: Zero on success, %$-1$% if timing failed. | |
1262 | * | |
1263 | * Use: Measure a function. The function @fn@ is called adaptively | |
1264 | * with an iteration count @n@ set so as to run for | |
1265 | * approximately @b->target_s@ seconds. | |
1266 | * | |
1267 | * The result is left in @*t_out@, with @t_out->n@ counting the | |
1268 | * final product of the iteration count and @base@ (which might, | |
1269 | * e.g., reflect the number of inner iterations the function | |
1270 | * performs, or the number of bytes it processes per iteration). | |
1271 | * | |
1272 | * To get useful results, the benchmark state should have been | |
1273 | * calibrated for indirect calling -- i.e., with @BTF_INDIRECT@. | |
1274 | */ | |
1275 | ||
1276 | int bench_measure(struct bench_state *b, struct bench_timing *t_out, | |
1277 | double base, bench_fn *fn, void *ctx) | |
1278 | { | |
1279 | BENCH_MEASURE_DECLS; | |
1280 | int rc; | |
1281 | ||
289651a7 | 1282 | BENCH_MEASURE(b, rc, t_out, base) fn(_bench_n, ctx); |
b1a20bee MW |
1283 | return (rc); |
1284 | } | |
1285 | ||
1286 | /*----- Reporting ---------------------------------------------------------*/ | |
1287 | ||
1288 | /* --- @bench_report@ --- * | |
1289 | * | |
1290 | * Arguments: @const struct gprintf_ops *gops, void *gp@ = output formatter | |
1291 | * @unsigned unit@ = unit processed by the benchmark function | |
1292 | * @const struct bench_timing *t@ = benchmark result | |
1293 | * | |
1294 | * Returns: --- | |
1295 | * | |
1296 | * Use: Format, to the output identified by @gops@ and @go@, a | |
1297 | * human-readable report of the benchmarking result @t@. No | |
1298 | * newline is appended. | |
1299 | * | |
1300 | * The output format is subject to change in later versions. | |
1301 | */ | |
1302 | ||
1303 | void bench_report(const struct gprintf_ops *gops, void *go, | |
1304 | unsigned unit, const struct bench_timing *t) | |
1305 | { | |
1306 | double scale, x, n = t->n; | |
1307 | const char *u, *what, *whats; | |
1308 | ||
1309 | assert(t->f&BTF_TIMEOK); | |
1310 | ||
1311 | switch (unit) { | |
1312 | case BTU_OP: | |
1313 | gprintf(gops, go, "%.0f iterations ", n); | |
1314 | what = "op"; whats = "ops"; scale = 1000; | |
1315 | break; | |
1316 | case BTU_BYTE: | |
1317 | x = n; normalize(&x, &u, 1024); gprintf(gops, go, "%.3f %sB ", x, u); | |
1318 | what = whats = "B"; scale = 1024; | |
1319 | break; | |
1320 | default: | |
1321 | assert(0); | |
1322 | } | |
1323 | ||
1324 | x = t->t; normalize(&x, &u, 1000); | |
1325 | gprintf(gops, go, "in %.3f %ss", x, u); | |
1326 | if (t->f&BTF_CYOK) { | |
1327 | x = t->cy; normalize(&x, &u, 1000); | |
1328 | gprintf(gops, go, " (%.3f %scy)", x, u); | |
1329 | } | |
1330 | gprintf(gops, go, ": "); | |
1331 | ||
1332 | x = n/t->t; normalize(&x, &u, scale); | |
1333 | gprintf(gops, go, "%.3f %s%s/s", x, u, whats); | |
1334 | x = t->t/n; normalize(&x, &u, 1000); | |
1335 | gprintf(gops, go, ", %.3f %ss/%s", x, u, what); | |
1336 | if (t->f&BTF_CYOK) { | |
1337 | x = t->cy/n; normalize(&x, &u, 1000); | |
1338 | gprintf(gops, go, " (%.3f %scy/%s)", x, u, what); | |
1339 | } | |
b64eb60f MW |
1340 | } |
1341 | ||
1342 | /*----- That's all, folks -------------------------------------------------*/ |