| 1 | /* -*-c-*- |
| 2 | * |
| 3 | * Benchmarking support |
| 4 | * |
| 5 | * (c) 2023 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of the mLib utilities library. |
| 11 | * |
| 12 | * mLib is free software: you can redistribute it and/or modify it under |
| 13 | * the terms of the GNU Library General Public License as published by |
| 14 | * the Free Software Foundation; either version 2 of the License, or (at |
| 15 | * your option) any later version. |
| 16 | * |
| 17 | * mLib is distributed in the hope that it will be useful, but WITHOUT |
| 18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public |
| 20 | * License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU Library General Public |
| 23 | * License along with mLib. If not, write to the Free Software |
| 24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 25 | * USA. |
| 26 | */ |
| 27 | |
| 28 | /*----- Header files ------------------------------------------------------*/ |
| 29 | |
#include "config.h"

#include <errno.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "alloc.h"
#include "bench.h"
#include "bits.h"
#include "linreg.h"
#include "macros.h"
| 44 | |
| 45 | /*----- Data structures ---------------------------------------------------*/ |
| 46 | |
struct timer {
  struct bench_timer _t;		/* base object: must be first, since
					 * pointers are cast between
					 * @struct timer@ and
					 * @struct bench_timer@ */
  const struct timer_ops *clkops, *cyops; /* time and cycle measurements */
  union { int fd; } u_cy;		/* state for cycle measurement */
};

struct timer_ops {
  void (*now)(struct bench_time *t_out, struct timer *t); /* read current */
  void (*teardown)(struct timer *t);	/* release held resources */
};
| 57 | |
| 58 | /*----- Preliminaries -----------------------------------------------------*/ |
| 59 | |
#define NS_PER_S 1000000000		/* nanoseconds per second */
| 61 | |
| 62 | /* --- @debug@ --- * |
| 63 | * |
| 64 | * Arguments: @const char *fmt@ = format control string |
| 65 | * @...@ = format arguemnts |
| 66 | * |
| 67 | * Returns: --- |
| 68 | * |
| 69 | * Use: Maybe report a debugging message to standard error. |
| 70 | */ |
| 71 | |
| 72 | static void PRINTF_LIKE(1, 2) debug(const char *fmt, ...) |
| 73 | { |
| 74 | const char *p; |
| 75 | va_list ap; |
| 76 | |
| 77 | p = getenv("MLIB_BENCH_DEBUG"); |
| 78 | if (p && *p != 'n' && *p != '0') { |
| 79 | va_start(ap, fmt); |
| 80 | fputs("mLib BENCH: ", stderr); |
| 81 | vfprintf(stderr, fmt, ap); |
| 82 | fputc('\n', stderr); |
| 83 | va_end(ap); |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | /* --- @timer_diff@ --- * |
| 88 | * |
| 89 | * Arguments: @struct bench_timing *delta_out@ = where to putt the result |
| 90 | * @const struct bench_time *t0, *t1@ = two times captured by a |
| 91 | * timer's @now@ function |
| 92 | * |
| 93 | * Returns: --- |
| 94 | * |
| 95 | * Use: Calculates the difference between two captured times. The |
| 96 | * flags are set according to whether the differences are |
| 97 | * meaningful; @delta_out->n@ is left unset. |
| 98 | */ |
| 99 | |
| 100 | static void timer_diff(struct bench_timing *delta_out, |
| 101 | const struct bench_time *t0, |
| 102 | const struct bench_time *t1) |
| 103 | { |
| 104 | unsigned f = t0->f&t1->f; |
| 105 | kludge64 k; |
| 106 | |
| 107 | #ifdef HAVE_UINT64 |
| 108 | # define FLOATK64(k) ((double)(k).i) |
| 109 | #else |
| 110 | # define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi) |
| 111 | #endif |
| 112 | |
| 113 | if (!(f&BTF_TIMEOK)) |
| 114 | delta_out->t = 0.0; |
| 115 | else { |
| 116 | SUB64(k, t1->s, t0->s); |
| 117 | delta_out->t = FLOATK64(k) - 1 + |
| 118 | (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S; |
| 119 | } |
| 120 | |
| 121 | if (!(f&BTF_CYOK)) |
| 122 | delta_out->cy = 0.0; |
| 123 | else { |
| 124 | SUB64(k, t1->cy, t0->cy); |
| 125 | delta_out->cy = FLOATK64(k); |
| 126 | } |
| 127 | |
| 128 | delta_out->f = f; |
| 129 | |
| 130 | #undef FLOATK64 |
| 131 | } |
| 132 | |
| 133 | /*----- The null clock ----------------------------------------------------*/ |
| 134 | |
/* This is a cycle counter which does nothing, in case we don't have any
 * better ideas.
 */

/* Do nothing: in particular, @BTF_CYOK@ is never set, so cycle counts
 * read through this timer are never considered valid.
 */
static void null_now(struct bench_time *t_out, struct timer *t) { ; }
static void null_teardown(struct timer *t) { ; }
static const struct timer_ops null_ops = { null_now, null_teardown };

/* Install the null cycle counter; this always succeeds. */
static int null_cyinit(struct timer *t)
{ t->cyops = &null_ops; return (0); }

#define NULL_CYENT { "null", null_cyinit },
| 147 | |
/*----- Linux performance counters ----------------------------------------*/

/* This is a cycle counter which uses the Linux performance event system,
 * which is probably the best choice if it's available.
 */

#if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)

#include <sys/types.h>
#include <unistd.h>

#include <linux/perf_event.h>
#include <asm/unistd.h>

/* Read the current cycle count from the performance counter, setting
 * @BTF_CYOK@ on success; leave the flags alone if the read fails.
 */
static void perfevent_now(struct bench_time *t_out, struct timer *t)
{
  ssize_t n;

  n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i));
  if (n != sizeof(t_out->cy.i)) {
    debug("failed to read perf-event counter: %s", strerror(errno));
    return;
  }
  t_out->f |= BTF_CYOK;
}

static void perfevent_teardown(struct timer *t)
{ close(t->u_cy.fd); }

static const struct timer_ops perfevent_ops =
  { perfevent_now, perfevent_teardown };

/* Open a hardware cycle counter for the calling thread, counting userland
 * cycles only, and check that it can actually be read before committing.
 */
static int perfevent_init(struct timer *t)
{
  struct perf_event_attr attr = { 0 };
  struct bench_time tm;

  attr.type = PERF_TYPE_HARDWARE;
  attr.size = sizeof(attr);
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.disabled = 0;			/* counting starts immediately */
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;

  t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  if (t->u_cy.fd < 0) {
    debug("couldn't open perf event: %s", strerror(errno));
    return (-1);
  }

  /* Probe the counter: if the read fails now it'll never work. */
  tm.f = 0; perfevent_now(&tm, t);
  if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); }

  t->cyops = &perfevent_ops; return (0);
}
# define PERFEVENT_CYENT { "linux-perf-event", perfevent_init },
#else
# define PERFEVENT_CYENT
#endif
| 207 | |
| 208 | /*----- Intel time-stamp counter ------------------------------------------*/ |
| 209 | |
| 210 | /* This is a cycle counter based on the Intel `rdtsc' instruction. It's not |
| 211 | * really suitable for performance measurement because it gets confused by |
| 212 | * CPU frequency adjustments. |
| 213 | */ |
| 214 | |
| 215 | #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) |
| 216 | |
| 217 | #define EFLAGS_ID (1u << 21) |
| 218 | #define CPUID_1D_TSC (1u << 4) |
| 219 | |
| 220 | static uint32 set_flags(unsigned long m, unsigned long x) |
| 221 | { |
| 222 | unsigned long r; |
| 223 | |
| 224 | #ifdef __x86_64__ |
| 225 | # define TMP "%%rcx" |
| 226 | #else |
| 227 | # define TMP "%%ecx" |
| 228 | #endif |
| 229 | |
| 230 | __asm__ ("pushf\n\t" |
| 231 | "pop %0\n\t" |
| 232 | "mov %0, " TMP "\n\t" |
| 233 | "and %1, %0\n\t" |
| 234 | "xor %2, %0\n\t" |
| 235 | "push %0\n\t" |
| 236 | "popf\n\t" |
| 237 | "pushf\n\t" |
| 238 | "pop %0\n\t" |
| 239 | "push " TMP "\n\t" |
| 240 | "popf" |
| 241 | : "=r"(r) |
| 242 | : "g"(m), "g"(x) |
| 243 | : "ecx"); |
| 244 | return (r); |
| 245 | } |
| 246 | |
| 247 | struct cpuid { uint32 a, b, c, d; }; |
| 248 | |
| 249 | static void cpuid(struct cpuid *info_out, uint32 a, uint32 c) |
| 250 | { |
| 251 | __asm__ ("movl %1, %%eax\n\t" |
| 252 | "movl %2, %%ecx\n\t" |
| 253 | "cpuid\n\t" |
| 254 | "movl %%eax, 0(%0)\n\t" |
| 255 | "movl %%ebx, 4(%0)\n\t" |
| 256 | "movl %%ecx, 8(%0)\n\t" |
| 257 | "movl %%edx, 12(%0)\n\t" |
| 258 | : /* no outputs */ |
| 259 | : "r"(info_out), "g"(a), "g"(c) |
| 260 | : "eax", "ebx", "ecx", "edx", "cc"); |
| 261 | } |
| 262 | |
| 263 | static void x86rdtsc_now(struct bench_time *t_out, struct timer *t) |
| 264 | { |
| 265 | uint32 lo, hi; |
| 266 | |
| 267 | __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); |
| 268 | SET64(t_out->cy, hi, lo); t_out->f |= BTF_CYOK; |
| 269 | } |
| 270 | |
| 271 | static const struct timer_ops x86rdtsc_ops = |
| 272 | { x86rdtsc_now, null_teardown }; |
| 273 | |
| 274 | static int x86rdtsc_init(struct timer *t) |
| 275 | { |
| 276 | struct cpuid info; |
| 277 | |
| 278 | if ((set_flags(~EFLAGS_ID, 0)&EFLAGS_ID) || |
| 279 | !(set_flags(~EFLAGS_ID, EFLAGS_ID)&EFLAGS_ID)) |
| 280 | { debug("no `cpuid' instruction"); return (-1); } |
| 281 | cpuid(&info, 0, 0); |
| 282 | if (info.a < 1) { debug("no `cpuid' leaf 1"); return (-1); } |
| 283 | cpuid(&info, 1, 0); |
| 284 | if (!(info.d&CPUID_1D_TSC)) |
| 285 | { debug("no `rdtsc' instrunction"); return (-1); } |
| 286 | t->cyops = &x86rdtsc_ops; return (0); |
| 287 | } |
| 288 | |
| 289 | # define X86RDTSC_CYENT { "x86-rdtsc", x86rdtsc_init }, |
| 290 | #else |
| 291 | # define X86RDTWC_CYENT |
| 292 | #endif |
| 293 | |
/*----- POSIX `clock_gettime' ---------------------------------------------*/

/* This is a real-time clock based on the POSIX time interface, with up to
 * nanosecond precision.
 */

#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)

/* Read the POSIX per-thread CPU clock, setting @BTF_TIMEOK@ on success;
 * leave the flags alone if the read fails.
 */
static void gettime_now(struct bench_time *t_out, struct timer *t)
{
  struct timespec now;

  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now))
    { debug("error reading POSIX clock: %s", strerror(errno)); return; }
  ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec;
  t_out->f |= BTF_TIMEOK;
}

static const struct timer_ops gettime_ops = { gettime_now, null_teardown };

/* Probe the clock once; succeed only if it reports a valid time. */
static int gettime_init(struct timer *t)
{
  struct bench_time tm;

  tm.f = 0; gettime_now(&tm, t);
  if (!(tm.f&BTF_TIMEOK)) return (-1);
  t->clkops = &gettime_ops; return (0);
}

# define GETTIME_CLKENT { "posix-clock_gettime", gettime_init },
#else
# define GETTIME_CLKENT
#endif
| 326 | |
| 327 | /*----- Standard C `clock' ------------------------------------------------*/ |
| 328 | |
| 329 | /* This is a real-time clock based on the C `clock' function which is |
| 330 | * guaranteed to be available, though it's not likely to be very good. |
| 331 | */ |
| 332 | |
| 333 | static void clock_now(struct bench_time *t_out, struct timer *t) |
| 334 | { |
| 335 | clock_t now, x; |
| 336 | unsigned long s; uint32 ns; |
| 337 | |
| 338 | now = clock(); |
| 339 | if (now == (clock_t)-1) { |
| 340 | debug("error reading standard clock: %s", strerror(errno)); |
| 341 | return; |
| 342 | } |
| 343 | x = now/CLOCKS_PER_SEC; |
| 344 | if (x > ULONG_MAX) { debug("standard clock out of range"); return; } |
| 345 | |
| 346 | s = x; x = now - CLOCKS_PER_SEC*s; |
| 347 | if (!(NS_PER_S%CLOCKS_PER_SEC)) |
| 348 | ns = x*(NS_PER_S/CLOCKS_PER_SEC); |
| 349 | else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC) |
| 350 | ns = (x*NS_PER_S)/CLOCKS_PER_SEC; |
| 351 | else |
| 352 | ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC); |
| 353 | ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK; |
| 354 | } |
| 355 | |
| 356 | static const struct timer_ops clock_ops = { clock_now, null_teardown }; |
| 357 | |
| 358 | static int clock_init(struct timer *t) |
| 359 | { |
| 360 | struct bench_time tm; |
| 361 | |
| 362 | tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1); |
| 363 | t->clkops = &clock_ops; return (0); |
| 364 | } |
| 365 | |
| 366 | #define CLOCK_CLKENT { "clock", clock_init }, |
| 367 | |
| 368 | /*----- Timing setup ------------------------------------------------------*/ |
| 369 | |
/* Tables of timing sources.  Each table is tried in order (unless
 * overridden by an environment variable) and is terminated by an
 * all-null entry.
 */
static const struct timerent {
  const char *name;			/* name, for selection by the user */
  int (*init)(struct timer */*t*/);	/* install; returns zero on success */
}
clktab[] = { GETTIME_CLKENT CLOCK_CLKENT { 0, 0 } },
cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_CYENT { 0, 0 } };
| 377 | |
| 378 | /* --- @find_timer@ --- * |
| 379 | * |
| 380 | * Arguments: @const char *name@ = timer name |
| 381 | * @size_t sz@ = length of name |
| 382 | * @const struct timerent *timers@ = table to search |
| 383 | * @const char *what@ = adjective describing table |
| 384 | * |
| 385 | * Returns: The table entry matching the given name, or null if there |
| 386 | * isn't one. |
| 387 | */ |
| 388 | |
| 389 | static const struct timerent *find_timer_n(const char *name, size_t sz, |
| 390 | const struct timerent *timers, |
| 391 | const char *what) |
| 392 | { |
| 393 | while (timers->name) { |
| 394 | if (strlen(timers->name) == sz && MEMCMP(name, ==, timers->name, sz)) |
| 395 | return (timers); |
| 396 | timers++; |
| 397 | } |
| 398 | debug("%s timer `%.*s' not found", what, (int)sz, name); return (0); |
| 399 | } |
| 400 | |
| 401 | /* --- @try_timer@ --- * |
| 402 | * |
| 403 | * Arguments: @struct timer *t@ = timer structure |
| 404 | * @const struct timerent *timer@ = timer table entry |
| 405 | * @const char *what@ = adjective describing table |
| 406 | * |
| 407 | * Returns: Zero on success, @-1@ if timer failed. |
| 408 | * |
| 409 | * Use: Tries to initialize the timer @t@, reporting a debug message |
| 410 | * if it worked. |
| 411 | */ |
| 412 | |
| 413 | static int try_timer(struct timer *t, |
| 414 | const struct timerent *timer, const char *what) |
| 415 | { |
| 416 | if (timer->init(t)) return (-1); |
| 417 | debug("selected %s timer `%s'", what, timer->name); return (0); |
| 418 | } |
| 419 | |
| 420 | /* --- @select_timer@ --- * |
| 421 | * |
| 422 | * Arguments: @struct timer *t@ = timer structure |
| 423 | * @const struct timerent *timer@ = timer table |
| 424 | * @const char *varname@ = environment variable to consult |
| 425 | * @const char *what@ = adjective describing table |
| 426 | * |
| 427 | * Returns: Zero on success, @-1@ if timer failed. |
| 428 | * |
| 429 | * Use: Select a timer from the table. If the environment variable |
| 430 | * is set, then parse a comma-separated list of timer names and |
| 431 | * use the first one listed that seems to work; otherwise, try |
| 432 | * the timers in the table in order. |
| 433 | */ |
| 434 | |
| 435 | static int select_timer(struct timer *t, const struct timerent *timers, |
| 436 | const char *varname, const char *what) |
| 437 | { |
| 438 | const char *p; size_t n; |
| 439 | const struct timerent *timer; |
| 440 | |
| 441 | p = getenv(varname); |
| 442 | if (!p) { |
| 443 | while (timers->name) |
| 444 | if (!try_timer(t, timers++, what)) return (0); |
| 445 | } else { |
| 446 | for (;;) { |
| 447 | n = strcspn(p, ","); |
| 448 | timer = find_timer_n(p, n, timers, what); |
| 449 | if (timer && !try_timer(t, timer, what)) return (0); |
| 450 | if (!p[n]) break; |
| 451 | p += n + 1; |
| 452 | } |
| 453 | } |
| 454 | debug("no suitable %s timer found", what); return (-1); |
| 455 | } |
| 456 | |
| 457 | /* Bench timer operations. */ |
| 458 | static void timer_now(struct bench_timer *tm, struct bench_time *t_out) |
| 459 | { |
| 460 | struct timer *t = (struct timer *)tm; |
| 461 | |
| 462 | t->clkops->now(t_out, t); |
| 463 | t->cyops->now(t_out, t); |
| 464 | } |
| 465 | static void timer_destroy(struct bench_timer *tm) |
| 466 | { |
| 467 | struct timer *t = (struct timer *)tm; |
| 468 | |
| 469 | if (!t) return; |
| 470 | if (t->clkops) t->clkops->teardown(t); |
| 471 | if (t->cyops) t->cyops->teardown(t); |
| 472 | xfree(t); |
| 473 | } |
| 474 | |
| 475 | static const struct bench_timerops timer_ops = { timer_now, timer_destroy }; |
| 476 | |
| 477 | /* --- @bench_createtimer@ --- * |
| 478 | * |
| 479 | * Arguments: --- |
| 480 | * |
| 481 | * Returns: A freshly constructed standard timer object. |
| 482 | * |
| 483 | * Use: Allocate a timer. Dispose of it by calling |
| 484 | * @tm->ops->destroy(tm)@ when you're done. |
| 485 | */ |
| 486 | |
| 487 | struct bench_timer *bench_createtimer(void) |
| 488 | { |
| 489 | struct timer *t = 0; |
| 490 | struct bench_timer *ret = 0; |
| 491 | |
| 492 | t = xmalloc(sizeof(*t)); t->cyops = 0; t->clkops = 0; |
| 493 | if (select_timer(t, clktab, "MLIB_BENCH_CLKTIMER", "clock")) goto end; |
| 494 | if (select_timer(t, cytab, "MLIB_BENCH_CYCLETIMER", "cycle")) goto end; |
| 495 | t->_t.ops = &timer_ops; ret = &t->_t; t = 0; |
| 496 | end: |
| 497 | if (t) timer_destroy(&t->_t); |
| 498 | return (ret); |
| 499 | } |
| 500 | |
| 501 | /*----- Benchmarking ------------------------------------------------------*/ |
| 502 | |
/* --- @bench_init@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state to initialize
 *		@struct bench_timer *tm@ = timer to attach
 *
 * Returns:	---
 *
 * Use:		Initialize the benchmark state.  It still needs to be
 *		calibrated (use @bench_calibrate@) before it can be used, but
 *		this will be done automatically by @bench_measure@ if it's
 *		not done by hand earlier.  The timer is now owned by the
 *		benchmark state and will be destroyed by @bench_destroy@.
 */

/* Default target measurement time is one second; @f@ clear means `not yet
 * calibrated'.
 */
void bench_init(struct bench_state *b, struct bench_timer *tm)
  { b->tm = tm; b->target_s = 1.0; b->f = 0; }
| 519 | |
/* --- @bench_destroy@ --- *
 *
 * Arguments:	@struct bench_state *b@ = bench state
 *
 * Returns:	---
 *
 * Use:		Destroy the benchmark state, releasing the resources that it
 *		holds.  (The attached timer is destroyed too, since
 *		@bench_init@ took ownership of it.)
 */

void bench_destroy(struct bench_state *b)
  { b->tm->ops->destroy(b->tm); }
| 532 | |
/* --- @do_nothing@ --- *
 *
 * Arguments:	@unsigned long n@ = iteration count
 *		@void *ctx@ = context pointer (ignored)
 *
 * Returns:	---
 *
 * Use:		Does nothing at all for @n@ iterations.  Used to calibrate
 *		the benchmarking state.  @RELAX@ keeps the compiler from
 *		optimizing the empty loop away.
 */

static void do_nothing(unsigned long n, void *ctx)
  { while (n--) RELAX; }
| 546 | |
| 547 | /* --- @bench_calibrate@ --- * |
| 548 | * |
| 549 | * Arguments: @struct bench_state *b@ = bench state |
| 550 | * |
| 551 | * Returns: Zero on success, @-1@ if calibration failed. |
| 552 | * |
| 553 | * Use: Calibrate the benchmark state, so that it can be used to |
| 554 | * measure performance reasonably accurately. |
| 555 | */ |
| 556 | |
| 557 | int bench_calibrate(struct bench_state *b) |
| 558 | { |
| 559 | struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT; |
| 560 | unsigned long n; |
| 561 | unsigned i; |
| 562 | struct bench_timer *tm = b->tm; |
| 563 | struct bench_time t0, t1; |
| 564 | struct bench_timing delta; |
| 565 | bench_fn *fn = LAUNDER(&do_nothing); |
| 566 | unsigned f = BTF_ANY; |
| 567 | int rc; |
| 568 | |
| 569 | /* The model here is that a timing loop has a fixed overhead as we enter |
| 570 | * and leave (e.g., to do with the indirect branch into the code), and |
| 571 | * per-iteration overheads as we check the counter and loop back. We aim |
| 572 | * to split these apart using linear regression. |
| 573 | */ |
| 574 | |
| 575 | /* If we've already calibrated then there's nothing to do. */ |
| 576 | if (b->f&BTF_ANY) return (0); |
| 577 | |
| 578 | /* Exercise the inner loop a few times to educate the branch predictor. */ |
| 579 | for (i = 0; i < 10; i++) |
| 580 | { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); } |
| 581 | |
| 582 | /* Now we measure idle loops until they take sufficiently long -- or we run |
| 583 | * out of counter. |
| 584 | */ |
| 585 | debug("calibrating..."); |
| 586 | n = 1; |
| 587 | for (;;) { |
| 588 | |
| 589 | /* Measure @n@ iterations of the idle loop. */ |
| 590 | tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1); |
| 591 | timer_diff(&delta, &t0, &t1); f &= delta.f; |
| 592 | if (!(f&BTF_TIMEOK)) { rc = -1; goto end; } |
| 593 | |
| 594 | /* Register the timings with the regression machinery. */ |
| 595 | linreg_update(&lr_clk, n, delta.t); |
| 596 | if (!(f&BTF_CYOK)) |
| 597 | debug(" n = %10lu; t = %12g s", n, delta.t); |
| 598 | else { |
| 599 | linreg_update(&lr_cy, n, delta.cy); |
| 600 | debug(" n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy); |
| 601 | } |
| 602 | |
| 603 | /* If we're done then stop. */ |
| 604 | if (delta.t >= b->target_s/20.0) break; |
| 605 | if (n >= ULONG_MAX - n/3) break; |
| 606 | |
| 607 | /* Update the counter and continue. */ |
| 608 | n += n/3 + 1; |
| 609 | } |
| 610 | |
| 611 | /* Now run the linear regression to extract the constant and per-iteration |
| 612 | * overheads. |
| 613 | */ |
| 614 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0); |
| 615 | debug("clock overhead = (%g n + %g) s", b->clk.m, b->clk.c); |
| 616 | if (f&BTF_CYOK) { |
| 617 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0); |
| 618 | debug("cycle overhead = (%g n + %g) cy", b->cy.m, b->cy.c); |
| 619 | } |
| 620 | |
| 621 | /* We're done. */ |
| 622 | b->f |= f; rc = 0; |
| 623 | end: |
| 624 | return (rc); |
| 625 | } |
| 626 | |
/* --- @bench_measure@ --- *
 *
 * Arguments:	@struct bench_timing *t_out@ = where to leave the timing
 *		@struct bench_state *b@ = benchmark state
 *		@double base@ = number of internal units per call
 *		@bench_fn *fn@, @void *ctx@ = benchmark function to run
 *
 * Returns:	Zero on success, @-1@ if timing failed.
 *
 * Use:		Measure a function.  The function @fn@ is called adaptively
 *		with an iteration count @n@ set so as to run for
 *		approximately @b->target_s@ seconds.
 *
 *		The result is left in @*t_out@, with @t_out->n@ counting the
 *		final product of the iteration count and @base@ (which might,
 *		e.g., reflect the number of inner iterations the function
 *		performs, or the number of bytes it processes per iteration).
 */

int bench_measure(struct bench_timing *t_out, struct bench_state *b,
		  double base, bench_fn *fn, void *ctx)
{
  struct bench_timer *tm = b->tm;
  struct bench_time t0, t1;
  unsigned long n;

  /* Make sure the state is calibrated. */
  if (bench_calibrate(b)) return (-1);

  /* Main adaptive measurement loop.
   *
   * Grow the iteration count until a run takes long enough: accept a run
   * at 72% of the target time, and otherwise scale up towards 144% so
   * that the next run should overshoot the acceptance threshold.
   */
  debug("measuring..."); n = 1;
  for (;;) {
    tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1);
    timer_diff(t_out, &t0, &t1);
    if (!(t_out->f&BTF_TIMEOK)) return (-1);
    if (!(t_out->f&BTF_CYOK)) debug("  n = %10lu; t = %12g", n, t_out->t);
    else debug("  n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy);
    if (t_out->t >= 0.72*b->target_s) break;
    n *= 1.44*b->target_s/t_out->t;
  }

  /* Adjust according to the calibration: subtract the constant and
   * per-iteration overheads determined by @bench_calibrate@.
   */
  t_out->t -= n*b->clk.m + b->clk.c;
  if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c;

  /* Report the results, if debugging. */
  if (!(t_out->f&BTF_CYOK)) debug("  adjusted t' = %12g", t_out->t);
  else debug("  adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy);
  if (!(t_out->f&BTF_CYOK))
    debug("  %g s per op; %g ops/s", t_out->t/n, n/t_out->t);
  else
    debug("  %g s (%g cy) per op; %g ops/s",
	  t_out->t/n, t_out->cy/n, n/t_out->t);

  /* All done. */
  t_out->n = n*base; return (0);
}
| 684 | |
| 685 | /*----- That's all, folks -------------------------------------------------*/ |