Commit | Line | Data |
---|---|---|
b64eb60f MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Benchmarking support | |
4 | * | |
5 | * (c) 2023 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include "config.h" | |
31 | ||
32 | #include <errno.h> | |
33 | #include <stdarg.h> | |
34 | #include <stdio.h> | |
35 | #include <stdlib.h> | |
36 | #include <string.h> | |
37 | #include <time.h> | |
38 | ||
39 | #include "alloc.h" | |
40 | #include "bench.h" | |
41 | #include "bits.h" | |
42 | #include "linreg.h" | |
43 | #include "macros.h" | |
44 | ||
45 | /*----- Data structures ---------------------------------------------------*/ | |
46 | ||
47 | struct timer { | |
48 | struct bench_timer _t; | |
67b5031e MW |
49 | const struct timer_ops *clkops, *cyops; /* time and cycle measurements */ |
50 | union { int fd; } u_cy; /* state for cycle measurement */ | |
b64eb60f MW |
51 | }; |
52 | ||
53 | struct timer_ops { | |
67b5031e MW |
54 | void (*now)(struct bench_time *t_out, struct timer *t); /* read current */ |
55 | void (*teardown)(struct timer *t); /* release held resources */ | |
b64eb60f MW |
56 | }; |
57 | ||
58 | /*----- Preliminaries -----------------------------------------------------*/ | |
59 | ||
60 | #define NS_PER_S 1000000000 | |
61 | ||
67b5031e MW |
62 | /* --- @debug@ --- * |
63 | * | |
64 | * Arguments: @const char *fmt@ = format control string | |
65 | * @...@ = format arguemnts | |
66 | * | |
67 | * Returns: --- | |
68 | * | |
69 | * Use: Maybe report a debugging message to standard error. | |
70 | */ | |
71 | ||
31d0247c | 72 | static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...) |
b64eb60f MW |
73 | { |
74 | const char *p; | |
75 | va_list ap; | |
76 | ||
77 | p = getenv("MLIB_BENCH_DEBUG"); | |
78 | if (p && *p != 'n' && *p != '0') { | |
79 | va_start(ap, fmt); | |
80 | fputs("mLib BENCH: ", stderr); | |
81 | vfprintf(stderr, fmt, ap); | |
82 | fputc('\n', stderr); | |
83 | va_end(ap); | |
84 | } | |
85 | } | |
86 | ||
67b5031e MW |
87 | /* --- @timer_diff@ --- * |
88 | * | |
89 | * Arguments: @struct bench_timing *delta_out@ = where to putt the result | |
90 | * @const struct bench_time *t0, *t1@ = two times captured by a | |
91 | * timer's @now@ function | |
92 | * | |
93 | * Returns: --- | |
94 | * | |
95 | * Use: Calculates the difference between two captured times. The | |
96 | * flags are set according to whether the differences are | |
97 | * meaningful; @delta_out->n@ is left unset. | |
98 | */ | |
99 | ||
100 | static void timer_diff(struct bench_timing *delta_out, | |
101 | const struct bench_time *t0, | |
102 | const struct bench_time *t1) | |
103 | { | |
104 | unsigned f = t0->f&t1->f; | |
105 | kludge64 k; | |
106 | ||
107 | #ifdef HAVE_UINT64 | |
108 | # define FLOATK64(k) ((double)(k).i) | |
109 | #else | |
110 | # define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi) | |
111 | #endif | |
112 | ||
113 | if (!(f&BTF_TIMEOK)) | |
114 | delta_out->t = 0.0; | |
115 | else { | |
116 | SUB64(k, t1->s, t0->s); | |
117 | delta_out->t = FLOATK64(k) - 1 + | |
118 | (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S; | |
119 | } | |
120 | ||
121 | if (!(f&BTF_CYOK)) | |
122 | delta_out->cy = 0.0; | |
123 | else { | |
124 | SUB64(k, t1->cy, t0->cy); | |
125 | delta_out->cy = FLOATK64(k); | |
126 | } | |
127 | ||
128 | delta_out->f = f; | |
129 | ||
130 | #undef FLOATK64 | |
131 | } | |
132 | ||
b64eb60f MW |
133 | /*----- The null clock ----------------------------------------------------*/ |
134 | ||
67b5031e MW |
135 | /* This is a cycle counter which does nothing, in case we don't have any |
136 | * better ideas. | |
137 | */ | |
138 | ||
b64eb60f MW |
139 | static void null_now(struct bench_time *t_out, struct timer *t) { ; } |
140 | static void null_teardown(struct timer *t) { ; } | |
141 | static const struct timer_ops null_ops = { null_now, null_teardown }; | |
142 | ||
143 | static int null_cyinit(struct timer *t) | |
144 | { t->cyops = &null_ops; return (0); } | |
145 | ||
146 | #define NULL_CYENT { "null", null_cyinit }, | |
147 | ||
148 | /*----- Linux performance counters ----------------------------------------*/ | |
149 | ||
67b5031e MW |
150 | /* This is a cycle counter which uses the Linux performance event system, |
151 | * which is probably the best choice if it's available. | |
152 | */ | |
153 | ||
b64eb60f MW |
154 | #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64) |
155 | ||
156 | #include <sys/types.h> | |
157 | #include <unistd.h> | |
158 | ||
159 | #include <linux/perf_event.h> | |
160 | #include <asm/unistd.h> | |
161 | ||
162 | static void perfevent_now(struct bench_time *t_out, struct timer *t) | |
163 | { | |
164 | ssize_t n; | |
165 | ||
166 | n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i)); | |
167 | if (n != sizeof(t_out->cy.i)) { | |
168 | debug("failed to read perf-event counter: %s", strerror(errno)); | |
169 | return; | |
170 | } | |
171 | t_out->f |= BTF_CYOK; | |
172 | } | |
173 | ||
174 | static void perfevent_teardown(struct timer *t) | |
175 | { close(t->u_cy.fd); } | |
176 | ||
177 | static const struct timer_ops perfevent_ops = | |
178 | { perfevent_now, perfevent_teardown }; | |
179 | ||
180 | static int perfevent_init(struct timer *t) | |
181 | { | |
182 | struct perf_event_attr attr = { 0 }; | |
183 | struct bench_time tm; | |
184 | ||
185 | attr.type = PERF_TYPE_HARDWARE; | |
186 | attr.size = sizeof(attr); | |
187 | attr.config = PERF_COUNT_HW_CPU_CYCLES; | |
188 | attr.disabled = 0; | |
189 | attr.exclude_kernel = 1; | |
190 | attr.exclude_hv = 1; | |
191 | ||
192 | t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); | |
193 | if (t->u_cy.fd < 0) { | |
194 | debug("couldn't open perf evvent: %s", strerror(errno)); | |
195 | return (-1); | |
196 | } | |
197 | ||
198 | tm.f = 0; perfevent_now(&tm, t); | |
199 | if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); } | |
200 | ||
201 | t->cyops = &perfevent_ops; return (0); | |
202 | } | |
203 | # define PERFEVENT_CYENT { "linux-perf-event", perfevent_init }, | |
204 | #else | |
205 | # define PERFEVENT_CYENT | |
206 | #endif | |
207 | ||
208 | /*----- Intel time-stamp counter ------------------------------------------*/ | |
209 | ||
67b5031e MW |
210 | /* This is a cycle counter based on the Intel `rdtsc' instruction. It's not |
211 | * really suitable for performance measurement because it gets confused by | |
212 | * CPU frequency adjustments. | |
213 | */ | |
214 | ||
b64eb60f MW |
215 | #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) |
216 | ||
217 | #define EFLAGS_ID (1u << 21) | |
218 | #define CPUID_1D_TSC (1u << 4) | |
219 | ||
220 | static uint32 set_flags(unsigned long m, unsigned long x) | |
221 | { | |
222 | unsigned long r; | |
223 | ||
224 | #ifdef __x86_64__ | |
225 | # define TMP "%%rcx" | |
226 | #else | |
227 | # define TMP "%%ecx" | |
228 | #endif | |
229 | ||
230 | __asm__ ("pushf\n\t" | |
231 | "pop %0\n\t" | |
232 | "mov %0, " TMP "\n\t" | |
233 | "and %1, %0\n\t" | |
234 | "xor %2, %0\n\t" | |
235 | "push %0\n\t" | |
236 | "popf\n\t" | |
237 | "pushf\n\t" | |
238 | "pop %0\n\t" | |
239 | "push " TMP "\n\t" | |
240 | "popf" | |
241 | : "=r"(r) | |
242 | : "g"(m), "g"(x) | |
243 | : "ecx"); | |
244 | return (r); | |
245 | } | |
246 | ||
247 | struct cpuid { uint32 a, b, c, d; }; | |
248 | ||
249 | static void cpuid(struct cpuid *info_out, uint32 a, uint32 c) | |
250 | { | |
251 | __asm__ ("movl %1, %%eax\n\t" | |
252 | "movl %2, %%ecx\n\t" | |
253 | "cpuid\n\t" | |
254 | "movl %%eax, 0(%0)\n\t" | |
255 | "movl %%ebx, 4(%0)\n\t" | |
256 | "movl %%ecx, 8(%0)\n\t" | |
257 | "movl %%edx, 12(%0)\n\t" | |
258 | : /* no outputs */ | |
259 | : "r"(info_out), "g"(a), "g"(c) | |
260 | : "eax", "ebx", "ecx", "edx", "cc"); | |
261 | } | |
262 | ||
263 | static void x86rdtsc_now(struct bench_time *t_out, struct timer *t) | |
264 | { | |
265 | uint32 lo, hi; | |
266 | ||
267 | __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); | |
268 | SET64(t_out->cy, hi, lo); t_out->f |= BTF_CYOK; | |
269 | } | |
270 | ||
271 | static const struct timer_ops x86rdtsc_ops = | |
272 | { x86rdtsc_now, null_teardown }; | |
273 | ||
274 | static int x86rdtsc_init(struct timer *t) | |
275 | { | |
276 | struct cpuid info; | |
277 | ||
278 | if ((set_flags(~EFLAGS_ID, 0)&EFLAGS_ID) || | |
279 | !(set_flags(~EFLAGS_ID, EFLAGS_ID)&EFLAGS_ID)) | |
280 | { debug("no `cpuid' instruction"); return (-1); } | |
281 | cpuid(&info, 0, 0); | |
282 | if (info.a < 1) { debug("no `cpuid' leaf 1"); return (-1); } | |
283 | cpuid(&info, 1, 0); | |
284 | if (!(info.d&CPUID_1D_TSC)) | |
285 | { debug("no `rdtsc' instrunction"); return (-1); } | |
286 | t->cyops = &x86rdtsc_ops; return (0); | |
287 | } | |
288 | ||
289 | # define X86RDTSC_CYENT { "x86-rdtsc", x86rdtsc_init }, | |
290 | #else | |
291 | # define X86RDTWC_CYENT | |
292 | #endif | |
293 | ||
294 | /*----- POSIX `clock_gettime' ---------------------------------------------*/ | |
295 | ||
67b5031e MW |
296 | /* This is a real-time clock based on the POSIX time interface, with up to |
297 | * nanosecond precision. | |
298 | */ | |
299 | ||
b64eb60f MW |
300 | #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID) |
301 | ||
302 | static void gettime_now(struct bench_time *t_out, struct timer *t) | |
303 | { | |
304 | struct timespec now; | |
305 | ||
306 | if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now)) | |
307 | { debug("error reading POSIX clock: %s", strerror(errno)); return; } | |
308 | ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec; | |
309 | t_out->f |= BTF_TIMEOK; | |
310 | } | |
311 | ||
312 | static const struct timer_ops gettime_ops = { gettime_now, null_teardown }; | |
313 | ||
314 | static int gettime_init(struct timer *t) | |
315 | { | |
316 | struct bench_time tm; | |
317 | ||
318 | tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1); | |
319 | t->clkops = &gettime_ops; return (0); | |
320 | } | |
321 | ||
322 | # define GETTIME_CLKENT { "posix-clock_gettime", gettime_init }, | |
323 | #else | |
324 | # define GETTIME_CLKENT | |
325 | #endif | |
326 | ||
327 | /*----- Standard C `clock' ------------------------------------------------*/ | |
328 | ||
67b5031e MW |
329 | /* This is a real-time clock based on the C `clock' function which is |
330 | * guaranteed to be available, though it's not likely to be very good. | |
331 | */ | |
332 | ||
b64eb60f MW |
333 | static void clock_now(struct bench_time *t_out, struct timer *t) |
334 | { | |
335 | clock_t now, x; | |
336 | unsigned long s; uint32 ns; | |
337 | ||
338 | now = clock(); | |
339 | if (now == (clock_t)-1) { | |
340 | debug("error reading standard clock: %s", strerror(errno)); | |
341 | return; | |
342 | } | |
343 | x = now/CLOCKS_PER_SEC; | |
344 | if (x > ULONG_MAX) { debug("standard clock out of range"); return; } | |
345 | ||
346 | s = x; x = now - CLOCKS_PER_SEC*s; | |
347 | if (!(NS_PER_S%CLOCKS_PER_SEC)) | |
348 | ns = x*(NS_PER_S/CLOCKS_PER_SEC); | |
349 | else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC) | |
350 | ns = (x*NS_PER_S)/CLOCKS_PER_SEC; | |
351 | else | |
352 | ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC); | |
353 | ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK; | |
354 | } | |
355 | ||
356 | static const struct timer_ops clock_ops = { clock_now, null_teardown }; | |
357 | ||
358 | static int clock_init(struct timer *t) | |
359 | { | |
360 | struct bench_time tm; | |
361 | ||
362 | tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1); | |
363 | t->clkops = &clock_ops; return (0); | |
364 | } | |
365 | ||
366 | #define CLOCK_CLKENT { "clock", clock_init }, | |
367 | ||
368 | /*----- Timing setup ------------------------------------------------------*/ | |
369 | ||
67b5031e | 370 | /* Tables of timing sources. */ |
b64eb60f MW |
371 | static const struct timerent { |
372 | const char *name; | |
373 | int (*init)(struct timer */*t*/); | |
374 | } | |
375 | clktab[] = { GETTIME_CLKENT CLOCK_CLKENT { 0, 0 } }, | |
376 | cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_CYENT { 0, 0 } }; | |
377 | ||
67b5031e MW |
378 | /* --- @find_timer@ --- * |
379 | * | |
380 | * Arguments: @const char *name@ = timer name | |
381 | * @size_t sz@ = length of name | |
382 | * @const struct timerent *timers@ = table to search | |
383 | * @const char *what@ = adjective describing table | |
384 | * | |
385 | * Returns: The table entry matching the given name, or null if there | |
386 | * isn't one. | |
387 | */ | |
388 | ||
b64eb60f MW |
389 | static const struct timerent *find_timer_n(const char *name, size_t sz, |
390 | const struct timerent *timers, | |
391 | const char *what) | |
392 | { | |
393 | while (timers->name) { | |
394 | if (strlen(timers->name) == sz && MEMCMP(name, ==, timers->name, sz)) | |
395 | return (timers); | |
396 | timers++; | |
397 | } | |
398 | debug("%s timer `%.*s' not found", what, (int)sz, name); return (0); | |
399 | } | |
400 | ||
67b5031e MW |
401 | /* --- @try_timer@ --- * |
402 | * | |
403 | * Arguments: @struct timer *t@ = timer structure | |
404 | * @const struct timerent *timer@ = timer table entry | |
405 | * @const char *what@ = adjective describing table | |
406 | * | |
407 | * Returns: Zero on success, @-1@ if timer failed. | |
408 | * | |
409 | * Use: Tries to initialize the timer @t@, reporting a debug message | |
410 | * if it worked. | |
411 | */ | |
412 | ||
b64eb60f MW |
413 | static int try_timer(struct timer *t, |
414 | const struct timerent *timer, const char *what) | |
415 | { | |
416 | if (timer->init(t)) return (-1); | |
417 | debug("selected %s timer `%s'", what, timer->name); return (0); | |
418 | } | |
419 | ||
67b5031e MW |
420 | /* --- @select_timer@ --- * |
421 | * | |
422 | * Arguments: @struct timer *t@ = timer structure | |
423 | * @const struct timerent *timer@ = timer table | |
424 | * @const char *varname@ = environment variable to consult | |
425 | * @const char *what@ = adjective describing table | |
426 | * | |
427 | * Returns: Zero on success, @-1@ if timer failed. | |
428 | * | |
429 | * Use: Select a timer from the table. If the environment variable | |
430 | * is set, then parse a comma-separated list of timer names and | |
431 | * use the first one listed that seems to work; otherwise, try | |
432 | * the timers in the table in order. | |
433 | */ | |
434 | ||
b64eb60f MW |
435 | static int select_timer(struct timer *t, const struct timerent *timers, |
436 | const char *varname, const char *what) | |
437 | { | |
438 | const char *p; size_t n; | |
439 | const struct timerent *timer; | |
440 | ||
441 | p = getenv(varname); | |
442 | if (!p) { | |
443 | while (timers->name) | |
444 | if (!try_timer(t, timers++, what)) return (0); | |
445 | } else { | |
446 | for (;;) { | |
447 | n = strcspn(p, ","); | |
448 | timer = find_timer_n(p, n, timers, what); | |
449 | if (timer && !try_timer(t, timer, what)) return (0); | |
450 | if (!p[n]) break; | |
451 | p += n + 1; | |
452 | } | |
453 | } | |
454 | debug("no suitable %s timer found", what); return (-1); | |
455 | } | |
456 | ||
67b5031e | 457 | /* Bench timer operations. */ |
b64eb60f MW |
458 | static void timer_now(struct bench_timer *tm, struct bench_time *t_out) |
459 | { | |
460 | struct timer *t = (struct timer *)tm; | |
461 | ||
462 | t->clkops->now(t_out, t); | |
463 | t->cyops->now(t_out, t); | |
464 | } | |
b64eb60f MW |
465 | static void timer_destroy(struct bench_timer *tm) |
466 | { | |
467 | struct timer *t = (struct timer *)tm; | |
468 | ||
469 | if (!t) return; | |
470 | if (t->clkops) t->clkops->teardown(t); | |
471 | if (t->cyops) t->cyops->teardown(t); | |
472 | xfree(t); | |
473 | } | |
474 | ||
475 | static const struct bench_timerops timer_ops = { timer_now, timer_destroy }; | |
476 | ||
67b5031e MW |
477 | /* --- @bench_createtimer@ --- * |
478 | * | |
479 | * Arguments: --- | |
480 | * | |
481 | * Returns: A freshly constructed standard timer object. | |
482 | * | |
483 | * Use: Allocate a timer. Dispose of it by calling | |
484 | * @tm->ops->destroy(tm)@ when you're done. | |
485 | */ | |
486 | ||
b64eb60f MW |
487 | struct bench_timer *bench_createtimer(void) |
488 | { | |
489 | struct timer *t = 0; | |
490 | struct bench_timer *ret = 0; | |
491 | ||
492 | t = xmalloc(sizeof(*t)); t->cyops = 0; t->clkops = 0; | |
493 | if (select_timer(t, clktab, "MLIB_BENCH_CLKTIMER", "clock")) goto end; | |
494 | if (select_timer(t, cytab, "MLIB_BENCH_CYCLETIMER", "cycle")) goto end; | |
495 | t->_t.ops = &timer_ops; ret = &t->_t; t = 0; | |
496 | end: | |
497 | if (t) timer_destroy(&t->_t); | |
498 | return (ret); | |
499 | } | |
500 | ||
67b5031e | 501 | /*----- Benchmarking ------------------------------------------------------*/ |
b64eb60f | 502 | |
67b5031e MW |
503 | /* --- @bench_init@ --- * |
504 | * | |
505 | * Arguments: @struct bench_state *b@ = bench state to initialize | |
506 | * @struct bench_timer *tm@ = timer to attach | |
507 | * | |
508 | * Returns: --- | |
509 | * | |
510 | * Use: Initialize the benchmark state. It still needs to be | |
511 | * calibrated (use @bench_calibrate@) before it can be used, but | |
512 | * this will be done automatically by @bench_measure@ if it's | |
513 | * not done by hand earlier. The timer is now owned by the | |
514 | * benchmark state and will be destroyed by @bench_destroy@. | |
515 | */ | |
b64eb60f MW |
516 | |
517 | void bench_init(struct bench_state *b, struct bench_timer *tm) | |
518 | { b->tm = tm; b->target_s = 1.0; b->f = 0; } | |
519 | ||
67b5031e MW |
520 | /* --- @bench_destroy@ --- * |
521 | * | |
522 | * Arguments: @struct bench_state *b@ = bench state | |
523 | * | |
524 | * Returns: --- | |
525 | * | |
526 | * Use: Destroy the benchmark state, releasing the resources that it | |
527 | * holds. | |
528 | */ | |
529 | ||
b64eb60f MW |
530 | void bench_destroy(struct bench_state *b) |
531 | { b->tm->ops->destroy(b->tm); } | |
532 | ||
67b5031e MW |
533 | /* --- @do_nothing@ --- * |
534 | * | |
535 | * Arguments: @unsigned long n@ = iteration count | |
536 | * @void *ctx@ = context pointer (ignored) | |
537 | * | |
538 | * Returns: --- | |
539 | * | |
540 | * Use: Does nothing at all for @n@ iterations. Used to calibrate | |
541 | * the benchmarking state. | |
542 | */ | |
543 | ||
544 | static void do_nothing(unsigned long n, void *ctx) | |
b64eb60f MW |
545 | { while (n--) RELAX; } |
546 | ||
67b5031e MW |
547 | /* --- @bench_calibrate@ --- * |
548 | * | |
549 | * Arguments: @struct bench_state *b@ = bench state | |
550 | * | |
551 | * Returns: Zero on success, @-1@ if calibration failed. | |
552 | * | |
553 | * Use: Calibrate the benchmark state, so that it can be used to | |
554 | * measure performance reasonably accurately. | |
555 | */ | |
556 | ||
b64eb60f MW |
557 | int bench_calibrate(struct bench_state *b) |
558 | { | |
559 | struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT; | |
560 | unsigned long n; | |
561 | unsigned i; | |
562 | struct bench_timer *tm = b->tm; | |
563 | struct bench_time t0, t1; | |
564 | struct bench_timing delta; | |
565 | bench_fn *fn = LAUNDER(&do_nothing); | |
566 | unsigned f = BTF_ANY; | |
567 | int rc; | |
568 | ||
67b5031e MW |
569 | /* The model here is that a timing loop has a fixed overhead as we enter |
570 | * and leave (e.g., to do with the indirect branch into the code), and | |
571 | * per-iteration overheads as we check the counter and loop back. We aim | |
572 | * to split these apart using linear regression. | |
573 | */ | |
574 | ||
575 | /* If we've already calibrated then there's nothing to do. */ | |
b64eb60f MW |
576 | if (b->f&BTF_ANY) return (0); |
577 | ||
67b5031e | 578 | /* Exercise the inner loop a few times to educate the branch predictor. */ |
b64eb60f | 579 | for (i = 0; i < 10; i++) |
67b5031e | 580 | { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); } |
b64eb60f | 581 | |
67b5031e MW |
582 | /* Now we measure idle loops until they take sufficiently long -- or we run |
583 | * out of counter. | |
584 | */ | |
b64eb60f MW |
585 | debug("calibrating..."); |
586 | n = 1; | |
587 | for (;;) { | |
67b5031e MW |
588 | |
589 | /* Measure @n@ iterations of the idle loop. */ | |
b64eb60f MW |
590 | tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1); |
591 | timer_diff(&delta, &t0, &t1); f &= delta.f; | |
592 | if (!(f&BTF_TIMEOK)) { rc = -1; goto end; } | |
67b5031e MW |
593 | |
594 | /* Register the timings with the regression machinery. */ | |
b64eb60f MW |
595 | linreg_update(&lr_clk, n, delta.t); |
596 | if (!(f&BTF_CYOK)) | |
597 | debug(" n = %10lu; t = %12g s", n, delta.t); | |
598 | else { | |
599 | linreg_update(&lr_cy, n, delta.cy); | |
600 | debug(" n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy); | |
601 | } | |
67b5031e MW |
602 | |
603 | /* If we're done then stop. */ | |
b64eb60f MW |
604 | if (delta.t >= b->target_s/20.0) break; |
605 | if (n >= ULONG_MAX - n/3) break; | |
67b5031e MW |
606 | |
607 | /* Update the counter and continue. */ | |
b64eb60f MW |
608 | n += n/3 + 1; |
609 | } | |
610 | ||
67b5031e MW |
611 | /* Now run the linear regression to extract the constant and per-iteration |
612 | * overheads. | |
613 | */ | |
b64eb60f MW |
614 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0); |
615 | debug("clock overhead = (%g n + %g) s", b->clk.m, b->clk.c); | |
616 | if (f&BTF_CYOK) { | |
617 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0); | |
618 | debug("cycle overhead = (%g n + %g) cy", b->cy.m, b->cy.c); | |
619 | } | |
67b5031e MW |
620 | |
621 | /* We're done. */ | |
b64eb60f MW |
622 | b->f |= f; rc = 0; |
623 | end: | |
624 | return (rc); | |
625 | } | |
626 | ||
67b5031e MW |
627 | /* --- @bench_measure@ --- * |
628 | * | |
629 | * Arguments: @struct bench_timing *t_out@ = where to leave the timing | |
630 | * @struct bench_state *b@ = benchmark state | |
631 | * @double base@ = number of internal units per call | |
632 | * @bench_fn *fn@, @void *ctx@ = benchmark function to run | |
633 | * | |
634 | * Returns: Zero on success, @-1@ if timing failed. | |
635 | * | |
636 | * Use: Measure a function. The function @fn@ is called adaptively | |
637 | * with an iteration count @n@ set so as to run for | |
638 | * approximately @b->target_s@ seconds. | |
639 | * | |
640 | * The result is left in @*t_out@, with @t_out->n@ counting the | |
641 | * final product of the iteration count and @base@ (which might, | |
642 | * e.g., reflect the number of inner iterations the function | |
643 | * performs, or the number of bytes it processes per iteration). | |
644 | */ | |
645 | ||
b64eb60f | 646 | int bench_measure(struct bench_timing *t_out, struct bench_state *b, |
67b5031e | 647 | double base, bench_fn *fn, void *ctx) |
b64eb60f MW |
648 | { |
649 | struct bench_timer *tm = b->tm; | |
650 | struct bench_time t0, t1; | |
c81c35df | 651 | unsigned long n, nn; |
b64eb60f | 652 | |
67b5031e | 653 | /* Make sure the state is calibrated. */ |
b64eb60f | 654 | if (bench_calibrate(b)) return (-1); |
67b5031e | 655 | |
c81c35df MW |
656 | /* Main adaptive measurement loop. |
657 | * | |
658 | * Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal | |
659 | * time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy. | |
660 | * Otherwise, we need to scale up the iteration count. The obvious next | |
661 | * choice is %$n' = n T/t$%. Alas, rounding is a problem: if | |
662 | * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no | |
663 | * progress. We know that %$T/t > \sqrt{2}%, so this can only happen when | |
664 | * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%. On the other | |
665 | * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying | |
666 | * again with %$n' = n + 1$% iterations will very likely work. | |
667 | */ | |
b64eb60f MW |
668 | debug("measuring..."); n = 1; |
669 | for (;;) { | |
67b5031e | 670 | tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1); |
b64eb60f MW |
671 | timer_diff(t_out, &t0, &t1); |
672 | if (!(t_out->f&BTF_TIMEOK)) return (-1); | |
673 | if (!(t_out->f&BTF_CYOK)) debug(" n = %10lu; t = %12g", n, t_out->t); | |
674 | else debug(" n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy); | |
c81c35df MW |
675 | if (t_out->t >= 0.707*b->target_s) break; |
676 | nn = n*b->target_s/t_out->t; | |
677 | if (nn > n) n = nn; | |
678 | else n++; | |
b64eb60f | 679 | } |
67b5031e MW |
680 | |
681 | /* Adjust according to the calibration. */ | |
c7da785d MW |
682 | t_out->t -= n*b->clk.m + b->clk.c; |
683 | if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c; | |
67b5031e MW |
684 | |
685 | /* Report the results, if debugging. */ | |
c7da785d MW |
686 | if (!(t_out->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_out->t); |
687 | else debug(" adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy); | |
b64eb60f MW |
688 | if (!(t_out->f&BTF_CYOK)) |
689 | debug(" %g s per op; %g ops/s", t_out->t/n, n/t_out->t); | |
690 | else | |
691 | debug(" %g s (%g cy) per op; %g ops/s", | |
692 | t_out->t/n, t_out->cy/n, n/t_out->t); | |
67b5031e MW |
693 | |
694 | /* All done. */ | |
e63124bc | 695 | t_out->n = n*base; return (0); |
b64eb60f MW |
696 | } |
697 | ||
698 | /*----- That's all, folks -------------------------------------------------*/ |