| 1 | /* -*-c-*- |
| 2 | * |
| 3 | * Benchmarking support |
| 4 | * |
| 5 | * (c) 2023 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of the mLib utilities library. |
| 11 | * |
| 12 | * mLib is free software: you can redistribute it and/or modify it under |
| 13 | * the terms of the GNU Library General Public License as published by |
| 14 | * the Free Software Foundation; either version 2 of the License, or (at |
| 15 | * your option) any later version. |
| 16 | * |
| 17 | * mLib is distributed in the hope that it will be useful, but WITHOUT |
| 18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public |
| 20 | * License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU Library General Public |
| 23 | * License along with mLib. If not, write to the Free Software |
| 24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 25 | * USA. |
| 26 | */ |
| 27 | |
| 28 | /*----- Header files ------------------------------------------------------*/ |
| 29 | |
| 30 | #include "config.h" |
| 31 | |
| 32 | #include <ctype.h> |
| 33 | #include <errno.h> |
| 34 | #include <stdarg.h> |
| 35 | #include <stdio.h> |
| 36 | #include <stdlib.h> |
| 37 | #include <string.h> |
| 38 | #include <time.h> |
| 39 | |
| 40 | #include "alloc.h" |
| 41 | #include "bench.h" |
| 42 | #include "bits.h" |
| 43 | #include "dstr.h" |
| 44 | #include "linreg.h" |
| 45 | #include "macros.h" |
| 46 | |
| 47 | /*----- Data structures ---------------------------------------------------*/ |
| 48 | |
| 49 | enum { CLK, CY, NTIMER }; |
| 50 | |
| 51 | struct timer { |
| 52 | struct bench_timer _t; |
| 53 | const struct timer_ops *ops[NTIMER]; /* subtimers for clock and cycles */ |
| 54 | union { int fd; } u_cy; /* state for cycle measurement */ |
| 55 | }; |
| 56 | |
| 57 | struct timer_ops { |
| 58 | const char *name; /* timer name */ |
| 59 | unsigned f; /* flags */ |
| 60 | #define TF_SECRET 1u /* don't try this automatically */ |
| 61 | int (*init)(struct timer */*t*/); /* initialization function */ |
| 62 | void (*now)(struct bench_time *t_out, struct timer *t); /* read current */ |
| 63 | void (*teardown)(struct timer *t); /* release held resources */ |
| 64 | }; |
| 65 | |
| 66 | /*----- Preliminaries -----------------------------------------------------*/ |
| 67 | |
| 68 | #define NS_PER_S 1000000000 |
| 69 | |
| 70 | /* --- @debug@ --- * |
| 71 | * |
| 72 | * Arguments: @const char *fmt@ = format control string |
| 73 | * @...@ = format arguemnts |
| 74 | * |
| 75 | * Returns: --- |
| 76 | * |
| 77 | * Use: Maybe report a debugging message to standard error. |
| 78 | */ |
| 79 | |
| 80 | static PRINTF_LIKE(1, 2) void debug(const char *fmt, ...) |
| 81 | { |
| 82 | const char *p; |
| 83 | va_list ap; |
| 84 | |
| 85 | p = getenv("MLIB_BENCH_DEBUG"); |
| 86 | if (p && *p != 'n' && *p != '0') { |
| 87 | va_start(ap, fmt); |
| 88 | fputs("mLib BENCH: ", stderr); |
| 89 | vfprintf(stderr, fmt, ap); |
| 90 | fputc('\n', stderr); |
| 91 | va_end(ap); |
| 92 | } |
| 93 | } |
| 94 | |
| 95 | /* --- @timer_diff@ --- * |
| 96 | * |
| 97 | * Arguments: @struct bench_timing *delta_out@ = where to putt the result |
| 98 | * @const struct bench_time *t0, *t1@ = two times captured by a |
| 99 | * timer's @now@ function |
| 100 | * |
| 101 | * Returns: --- |
| 102 | * |
| 103 | * Use: Calculates the difference between two captured times. The |
| 104 | * flags are set according to whether the differences are |
| 105 | * meaningful; @delta_out->n@ is left unset. |
| 106 | */ |
| 107 | |
| 108 | static void timer_diff(struct bench_timing *delta_out, |
| 109 | const struct bench_time *t0, |
| 110 | const struct bench_time *t1) |
| 111 | { |
| 112 | unsigned f = t0->f&t1->f; |
| 113 | kludge64 k; |
| 114 | |
| 115 | #ifdef HAVE_UINT64 |
| 116 | # define FLOATK64(k) ((double)(k).i) |
| 117 | #else |
| 118 | # define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi) |
| 119 | #endif |
| 120 | |
| 121 | if (!(f&BTF_TIMEOK)) |
| 122 | delta_out->t = 0.0; |
| 123 | else { |
| 124 | SUB64(k, t1->s, t0->s); |
| 125 | delta_out->t = FLOATK64(k) - 1 + |
| 126 | (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S; |
| 127 | } |
| 128 | |
| 129 | if (!(f&BTF_CYOK)) |
| 130 | delta_out->cy = 0.0; |
| 131 | else { |
| 132 | SUB64(k, t1->cy, t0->cy); |
| 133 | delta_out->cy = FLOATK64(k); |
| 134 | } |
| 135 | |
| 136 | delta_out->f = f; |
| 137 | |
| 138 | #undef FLOATK64 |
| 139 | } |
| 140 | |
| 141 | /*----- The null timer ----------------------------------------------------*/ |
| 142 | |
| 143 | /* This is a timer which does nothing, in case we don't have any better |
| 144 | * ideas. |
| 145 | */ |
| 146 | |
| 147 | static int null_init(struct timer *t) { return (0); } |
| 148 | static void null_now(struct bench_time *t_out, struct timer *t) { ; } |
| 149 | static void null_teardown(struct timer *t) { ; } |
| 150 | |
| 151 | static const struct timer_ops null_ops = |
| 152 | { "null", 0, null_init, null_now, null_teardown }; |
| 153 | #define NULL_ENT &null_ops, |
| 154 | |
| 155 | /*----- The broken clock --------------------------------------------------*/ |
| 156 | |
| 157 | /* This is a cycle counter which does nothing, in case we don't have any |
| 158 | * better ideas. |
| 159 | */ |
| 160 | |
| 161 | static int broken_init(struct timer *t) { return (-1); } |
| 162 | |
| 163 | static const struct timer_ops broken_ops = |
| 164 | { "broken", TF_SECRET, broken_init, null_now, null_teardown }; |
| 165 | #define BROKEN_ENT &broken_ops, |
| 166 | |
| 167 | /*----- Linux performance counters ----------------------------------------*/ |
| 168 | |
| 169 | /* This is a cycle counter which uses the Linux performance event system, |
| 170 | * which is probably the best choice if it's available. |
| 171 | */ |
| 172 | |
| 173 | #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64) |
| 174 | |
| 175 | #include <sys/types.h> |
| 176 | #include <unistd.h> |
| 177 | |
| 178 | #include <linux/perf_event.h> |
| 179 | #include <asm/unistd.h> |
| 180 | |
| 181 | static void perfevent_now(struct bench_time *t_out, struct timer *t) |
| 182 | { |
| 183 | ssize_t n; |
| 184 | |
| 185 | n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i)); |
| 186 | if (n != sizeof(t_out->cy.i)) { |
| 187 | debug("failed to read perf-event counter: %s", strerror(errno)); |
| 188 | return; |
| 189 | } |
| 190 | t_out->f |= BTF_CYOK; |
| 191 | } |
| 192 | |
| 193 | static void perfevent_teardown(struct timer *t) |
| 194 | { close(t->u_cy.fd); } |
| 195 | |
| 196 | static int perfevent_init(struct timer *t) |
| 197 | { |
| 198 | struct perf_event_attr attr = { 0 }; |
| 199 | struct bench_time tm; |
| 200 | |
| 201 | attr.type = PERF_TYPE_HARDWARE; |
| 202 | attr.size = sizeof(attr); |
| 203 | attr.config = PERF_COUNT_HW_CPU_CYCLES; |
| 204 | attr.disabled = 0; |
| 205 | attr.exclude_kernel = 1; |
| 206 | attr.exclude_hv = 1; |
| 207 | |
| 208 | t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); |
| 209 | if (t->u_cy.fd < 0) { |
| 210 | debug("couldn't open perf evvent: %s", strerror(errno)); |
| 211 | return (-1); |
| 212 | } |
| 213 | |
| 214 | tm.f = 0; perfevent_now(&tm, t); |
| 215 | if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); } |
| 216 | |
| 217 | return (0); |
| 218 | } |
| 219 | |
| 220 | static const struct timer_ops perfevent_ops = |
| 221 | { "linux-perf-hw-cycles", 0, |
| 222 | perfevent_init, perfevent_now, perfevent_teardown }; |
| 223 | |
| 224 | # define PERFEVENT_CYENT &perfevent_ops, |
| 225 | #else |
| 226 | # define PERFEVENT_CYENT |
| 227 | #endif |
| 228 | |
| 229 | /*----- Intel time-stamp counter ------------------------------------------*/ |
| 230 | |
| 231 | /* This is a cycle counter based on the Intel `rdtsc' instruction. It's not |
| 232 | * really suitable for performance measurement because it gets confused by |
| 233 | * CPU frequency adjustments. |
| 234 | */ |
| 235 | |
| 236 | #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) |
| 237 | |
| 238 | #include <cpuid.h> |
| 239 | |
| 240 | #define CPUID_1D_TSC (1u << 4) |
| 241 | |
| 242 | static void x86rdtsc_now(struct bench_time *t_out, struct timer *t) |
| 243 | { t_out->cy.i = __builtin_ia32_rdtsc(); t_out->f |= BTF_CYOK; } |
| 244 | |
| 245 | static int x86rdtsc_init(struct timer *t) |
| 246 | { |
| 247 | unsigned a, b, c, d; |
| 248 | |
| 249 | if (!__get_cpuid(1, &a, &b, &c, &d) || !(d&CPUID_1D_TSC)) |
| 250 | { debug("no `rdtsc' instrunction"); return (-1); } |
| 251 | return (0); |
| 252 | } |
| 253 | |
| 254 | static const struct timer_ops x86rdtsc_ops = |
| 255 | { "x86-rdtsc", 0, x86rdtsc_init, x86rdtsc_now, null_teardown }; |
| 256 | |
| 257 | # define X86RDTSC_CYENT &x86rdtsc_ops, |
| 258 | #else |
| 259 | # define X86RDTSC_CYENT |
| 260 | #endif |
| 261 | |
| 262 | /*----- POSIX `clock_gettime' ---------------------------------------------*/ |
| 263 | |
| 264 | /* This is a real-time clock based on the POSIX time interface, with up to |
| 265 | * nanosecond precision. |
| 266 | */ |
| 267 | |
| 268 | #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID) |
| 269 | |
| 270 | static void gettime_now(struct bench_time *t_out, struct timer *t) |
| 271 | { |
| 272 | struct timespec now; |
| 273 | |
| 274 | if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now)) |
| 275 | { debug("error reading POSIX clock: %s", strerror(errno)); return; } |
| 276 | ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec; |
| 277 | t_out->f |= BTF_TIMEOK; |
| 278 | } |
| 279 | |
| 280 | static int gettime_init(struct timer *t) |
| 281 | { |
| 282 | struct bench_time tm; |
| 283 | |
| 284 | tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1); |
| 285 | return (0); |
| 286 | } |
| 287 | |
| 288 | static const struct timer_ops gettime_ops = |
| 289 | { "posix-thread-cputime", 0, gettime_init, gettime_now, null_teardown }; |
| 290 | |
| 291 | # define GETTIME_CLKENT &gettime_ops, |
| 292 | #else |
| 293 | # define GETTIME_CLKENT |
| 294 | #endif |
| 295 | |
| 296 | /*----- Standard C `clock' ------------------------------------------------*/ |
| 297 | |
| 298 | /* This is a real-time clock based on the C `clock' function which is |
| 299 | * guaranteed to be available, though it's not likely to be very good. |
| 300 | */ |
| 301 | |
| 302 | static void clock_now(struct bench_time *t_out, struct timer *t) |
| 303 | { |
| 304 | clock_t now, x; |
| 305 | unsigned long s; uint32 ns; |
| 306 | |
| 307 | now = clock(); |
| 308 | if (now == (clock_t)-1) { |
| 309 | debug("error reading standard clock: %s", strerror(errno)); |
| 310 | return; |
| 311 | } |
| 312 | x = now/CLOCKS_PER_SEC; |
| 313 | if (x > ULONG_MAX) { debug("standard clock out of range"); return; } |
| 314 | |
| 315 | s = x; x = now - CLOCKS_PER_SEC*s; |
| 316 | if (!(NS_PER_S%CLOCKS_PER_SEC)) |
| 317 | ns = x*(NS_PER_S/CLOCKS_PER_SEC); |
| 318 | else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC) |
| 319 | ns = (x*NS_PER_S)/CLOCKS_PER_SEC; |
| 320 | else |
| 321 | ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC); |
| 322 | ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK; |
| 323 | } |
| 324 | |
| 325 | static int clock_init(struct timer *t) |
| 326 | { |
| 327 | struct bench_time tm; |
| 328 | |
| 329 | tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1); |
| 330 | return (0); |
| 331 | } |
| 332 | |
| 333 | static const struct timer_ops clock_ops = |
| 334 | { "stdc-clock", 0, clock_init, clock_now, null_teardown }; |
| 335 | |
| 336 | #define CLOCK_CLKENT &clock_ops, |
| 337 | |
| 338 | /*----- Timing setup ------------------------------------------------------*/ |
| 339 | |
| 340 | /* Tables of timing sources. */ |
| 341 | static const struct timer_ops |
| 342 | *const clktab[] = { GETTIME_CLKENT CLOCK_CLKENT BROKEN_ENT 0 }, |
| 343 | *const cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_ENT BROKEN_ENT 0 }; |
| 344 | |
| 345 | static const struct timertab { |
| 346 | const char *what; |
| 347 | const char *env; |
| 348 | const struct timer_ops *const *opstab; |
| 349 | } timertab[] = { |
| 350 | { "clock", "MLIB_BENCH_CLKTIMER", clktab }, |
| 351 | { "cycle", "MLIB_BENCH_CYCLETIMER", cytab } |
| 352 | }; |
| 353 | |
| 354 | /* --- @find_timer@ --- * |
| 355 | * |
| 356 | * Arguments: @const char *name@ = timer name |
| 357 | * @size_t sz@ = length of name |
| 358 | * @unsigned tm@ = which subtimer we're looking for |
| 359 | * |
| 360 | * Returns: The table entry matching the given name, or null if there |
| 361 | * isn't one. |
| 362 | */ |
| 363 | |
| 364 | static const struct timer_ops *find_timer(const char *name, size_t sz, |
| 365 | unsigned tm) |
| 366 | { |
| 367 | const struct timer_ops *const *tt; |
| 368 | |
| 369 | for (tt = timertab[tm].opstab; *tt; tt++) { |
| 370 | if (strlen((*tt)->name) == sz && |
| 371 | MEMCMP(name, ==, (*tt)->name, sz)) |
| 372 | return (*tt); |
| 373 | } |
| 374 | debug("%s timer `%.*s' not found", |
| 375 | timertab[tm].what, (int)sz, name); return (0); |
| 376 | } |
| 377 | |
| 378 | /* --- @try_timer@ --- * |
| 379 | * |
| 380 | * Arguments: @struct timer *t@ = timer structure |
| 381 | * @const struct timer_ops *ops@ = timer ops |
| 382 | * @unsigned tm@ = which subtimer we're setting |
| 383 | * |
| 384 | * Returns: Zero on success, %$-1$% if timer failed. |
| 385 | * |
| 386 | * Use: Tries to initialize the timer @t@, reporting a debug message |
| 387 | * if it worked. |
| 388 | */ |
| 389 | |
| 390 | static int try_timer(struct timer *t, |
| 391 | const struct timer_ops *ops, unsigned tm) |
| 392 | { |
| 393 | if (ops->init(t)) return (-1); |
| 394 | debug("selected %s timer `%s'", timertab[tm].what, ops->name); |
| 395 | t->ops[tm] = ops; return (0); |
| 396 | } |
| 397 | |
| 398 | /* --- @select_timer@ --- * |
| 399 | * |
| 400 | * Arguments: @struct timer *t@ = timer structure |
| 401 | * @unsigned tm@ = which subtimer we're setting |
| 402 | * @const char *config@, @size_t sz@ = config string |
| 403 | * |
| 404 | * Returns: Zero on success, %$-1$% if timer failed. |
| 405 | * |
| 406 | * Use: Select a timer from the table. If the environment variable |
| 407 | * is set, then parse a comma-separated list of timer names and |
| 408 | * use the first one listed that seems to work; otherwise, try |
| 409 | * the timers in the table in order. |
| 410 | */ |
| 411 | |
| 412 | static int select_timer(struct timer *t, unsigned tm, |
| 413 | const char *config, size_t sz) |
| 414 | { |
| 415 | const char *p, *l; |
| 416 | const struct timer_ops *ops, *const *tt; |
| 417 | |
| 418 | if (!config) { |
| 419 | for (tt = timertab[tm].opstab; *tt; tt++) |
| 420 | if (!((*tt)->f&TF_SECRET) && !try_timer(t, *tt, tm)) return (0); |
| 421 | } else { |
| 422 | l = config + sz; |
| 423 | for (;;) { |
| 424 | p = memchr(config, ',', l - config); if (!p) p = l; |
| 425 | ops = find_timer(config, p - config, tm); |
| 426 | if (ops && !try_timer(t, ops, tm)) return (0); |
| 427 | if (p >= l) break; |
| 428 | config = p + 1; |
| 429 | } |
| 430 | } |
| 431 | debug("no suitable %s timer found", timertab[tm].what); return (-1); |
| 432 | } |
| 433 | |
| 434 | /* Bench timer operations. */ |
| 435 | static void timer_describe(struct bench_timer *tm, dstr *d) |
| 436 | { |
| 437 | struct timer *t = (struct timer *)tm; |
| 438 | unsigned i; |
| 439 | |
| 440 | dstr_puts(d, "builtin: "); |
| 441 | for (i = 0; i < NTIMER; i++) { |
| 442 | if (i) dstr_puts(d, ", "); |
| 443 | dstr_putf(d, "%s = %s", timertab[i].what, t->ops[i]->name); |
| 444 | } |
| 445 | } |
| 446 | |
| 447 | static void timer_now(struct bench_timer *tm, struct bench_time *t_out) |
| 448 | { |
| 449 | struct timer *t = (struct timer *)tm; |
| 450 | unsigned i; |
| 451 | |
| 452 | for (i = 0; i < NTIMER; i++) t->ops[i]->now(t_out, t); |
| 453 | } |
| 454 | |
| 455 | static void timer_destroy(struct bench_timer *tm) |
| 456 | { |
| 457 | struct timer *t = (struct timer *)tm; |
| 458 | unsigned i; |
| 459 | |
| 460 | if (!t) return; |
| 461 | for (i = 0; i < NTIMER; i++) |
| 462 | if (t->ops[i]) t->ops[i]->teardown(t); |
| 463 | xfree(t); |
| 464 | } |
| 465 | |
| 466 | static const struct bench_timerops timer_ops = |
| 467 | { timer_describe, timer_now, timer_destroy }; |
| 468 | |
| 469 | /* --- @bench_createtimer@ --- * |
| 470 | * |
| 471 | * Arguments: @const char *config@ = timer configuration string |
| 472 | * |
| 473 | * Returns: A freshly constructed standard timer object. |
| 474 | * |
| 475 | * Use: Allocate a timer. Dispose of it by calling |
| 476 | * @tm->ops->destroy(tm)@ when you're done. |
| 477 | * |
| 478 | * Applications should not set configuration strings except as |
| 479 | * established by user action, e.g., from a command-line option, |
| 480 | * environment variable, or configuration file. |
| 481 | */ |
| 482 | |
| 483 | struct bench_timer *bench_createtimer(const char *config) |
| 484 | { |
| 485 | struct timer *t = 0; |
| 486 | struct bench_timer *ret = 0; |
| 487 | struct { const char *p; size_t sz; } tmconf[NTIMER] = { 0 }; |
| 488 | const struct timer_ops *const *tt; |
| 489 | const char *p, *l; size_t n, nn; |
| 490 | unsigned i; |
| 491 | |
| 492 | /* Parse the configuration string. */ |
| 493 | if (config) { |
| 494 | |
| 495 | /* The first thing to do is find the end of the string. */ |
| 496 | l = config + strlen(config); |
| 497 | |
| 498 | for (;;) { |
| 499 | /* Process the whitespace-sparated words of the string one by one. */ |
| 500 | |
| 501 | /* Skip over any initial whitespace. If we hit the end of the string |
| 502 | * then we're done. |
| 503 | */ |
| 504 | for (;;) |
| 505 | if (config >= l) goto done_config; |
| 506 | else if (!ISSPACE(*config)) break; |
| 507 | else config++; |
| 508 | |
| 509 | /* There's definitely a word here. Find the end of it. */ |
| 510 | for (p = config; p < l && !ISSPACE(*p); p++); |
| 511 | nn = p - config; |
| 512 | |
| 513 | /* Try various simple keywords. */ |
| 514 | #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn)) |
| 515 | |
| 516 | if (MATCHP("list")) { |
| 517 | /* The `list' keyword requests lists of the available timer |
| 518 | * implementations. |
| 519 | */ |
| 520 | |
| 521 | for (i = 0; i < NTIMER; i++) { |
| 522 | printf("%s timers:", timertab[i].what); |
| 523 | for (tt = timertab[i].opstab; *tt; tt++) |
| 524 | if (!((*tt)->f)&TF_SECRET) printf(" %s", (*tt)->name); |
| 525 | putchar('\n'); |
| 526 | } |
| 527 | goto next_config; |
| 528 | } |
| 529 | |
| 530 | #undef MATCHP |
| 531 | |
| 532 | /* Otherwise it's an assignment, setting a subtimer list. */ |
| 533 | p = memchr(config, '=', nn); |
| 534 | if (!p) |
| 535 | n = nn; |
| 536 | else { |
| 537 | n = p - config; |
| 538 | for (i = 0; i < NTIMER; i++) |
| 539 | if (STRNCMP(config, ==, timertab[i].what, n) && |
| 540 | !timertab[i].what[n]) { |
| 541 | if (tmconf[i].p) |
| 542 | debug("duplicate %s timer list", timertab[i].what); |
| 543 | tmconf[i].p = config + n + 1; tmconf[i].sz = nn - n - 1; |
| 544 | goto next_config; |
| 545 | } |
| 546 | } |
| 547 | debug("unrecognized config keyword `%.*s'", (int)n, config); |
| 548 | |
| 549 | /* Move on to the next word. */ |
| 550 | next_config: |
| 551 | config += nn; |
| 552 | } |
| 553 | done_config:; |
| 554 | } |
| 555 | |
| 556 | /* Override these settings from the environment. */ |
| 557 | for (i = 0; i < NTIMER; i++) { |
| 558 | p = getenv(timertab[i].env); |
| 559 | if (p) { tmconf[i].p = p; tmconf[i].sz = strlen(p); } |
| 560 | } |
| 561 | |
| 562 | /* All seems well. Allocate the timer object. */ |
| 563 | t = xmalloc(sizeof(*t)); |
| 564 | for (i = 0; i < NTIMER; i++) t->ops[i] = 0; |
| 565 | |
| 566 | /* Try to set up the subtimers. */ |
| 567 | for (i = 0; i < NTIMER; i++) |
| 568 | if (select_timer(t, i, tmconf[i].p, tmconf[i].sz)) goto end; |
| 569 | |
| 570 | /* All is done. */ |
| 571 | t->_t.ops = &timer_ops; ret = &t->_t; t = 0; |
| 572 | end: |
| 573 | if (t) timer_destroy(&t->_t); |
| 574 | return (ret); |
| 575 | } |
| 576 | |
| 577 | /*----- Benchmarking ------------------------------------------------------*/ |
| 578 | |
| 579 | /* --- @bench_init@ --- * |
| 580 | * |
| 581 | * Arguments: @struct bench_state *b@ = bench state to initialize |
| 582 | * @struct bench_timer *tm@ = timer to attach, or null |
| 583 | * |
| 584 | * Returns: Zero on success, %$-1$% on failure. |
| 585 | * |
| 586 | * Use: Initialize the benchmark state. On success, the timer state |
| 587 | * still needs to be calibrated (use @bench_calibrate@) before |
| 588 | * it can be used, but this will be done automatically by |
| 589 | * @bench_measure@ if it's not done by hand earlier. The timer |
| 590 | * is now owned by the benchmark state and will be destroyed by |
| 591 | * @bench_destroy@. |
| 592 | * |
| 593 | * The only reason for failure is if @tm@ was null on entry, |
| 594 | * and automatic construction of a timer failed. The state is |
| 595 | * safe to discard, but calling @bench_destroy@ is safe too. |
| 596 | */ |
| 597 | |
| 598 | int bench_init(struct bench_state *b, struct bench_timer *tm) |
| 599 | { |
| 600 | int rc; |
| 601 | |
| 602 | b->tm = 0; |
| 603 | |
| 604 | if (!tm) { |
| 605 | tm = bench_createtimer(0); |
| 606 | if (!tm) { rc = -1; goto end; } |
| 607 | } |
| 608 | |
| 609 | b->tm = tm; b->target_s = 1.0; b->f = 0; rc = 0; |
| 610 | end: |
| 611 | return (rc); |
| 612 | } |
| 613 | |
| 614 | /* --- @bench_destroy@ --- * |
| 615 | * |
| 616 | * Arguments: @struct bench_state *b@ = bench state |
| 617 | * |
| 618 | * Returns: --- |
| 619 | * |
| 620 | * Use: Destroy the benchmark state, releasing the resources that it |
| 621 | * holds. |
| 622 | */ |
| 623 | |
| 624 | void bench_destroy(struct bench_state *b) |
| 625 | { if (b->tm) { b->tm->ops->destroy(b->tm); b->tm = 0; } } |
| 626 | |
| 627 | /* --- @do_nothing@ --- * |
| 628 | * |
| 629 | * Arguments: @unsigned long n@ = iteration count |
| 630 | * @void *ctx@ = context pointer (ignored) |
| 631 | * |
| 632 | * Returns: --- |
| 633 | * |
| 634 | * Use: Does nothing at all for @n@ iterations. Used to calibrate |
| 635 | * the benchmarking state. |
| 636 | */ |
| 637 | |
| 638 | static void do_nothing(unsigned long n, void *ctx) |
| 639 | { while (n--) RELAX; } |
| 640 | |
| 641 | /* --- @bench_calibrate@ --- * |
| 642 | * |
| 643 | * Arguments: @struct bench_state *b@ = bench state |
| 644 | * |
| 645 | * Returns: Zero on success, %$-1$% if calibration failed. |
| 646 | * |
| 647 | * Use: Calibrate the benchmark state, so that it can be used to |
| 648 | * measure performance reasonably accurately. |
| 649 | */ |
| 650 | |
| 651 | #define T_CLB 0.0625 /* calibration time limit */ |
| 652 | |
| 653 | int bench_calibrate(struct bench_state *b) |
| 654 | { |
| 655 | struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT; |
| 656 | unsigned long n; |
| 657 | unsigned i; |
| 658 | struct bench_timer *tm = b->tm; |
| 659 | struct bench_time t0, t1; |
| 660 | struct bench_timing delta; |
| 661 | double r; |
| 662 | bench_fn *fn = LAUNDER(&do_nothing); |
| 663 | unsigned f = BTF_ANY; |
| 664 | int rc; |
| 665 | |
| 666 | /* The model here is that a timing loop has a fixed overhead as we enter |
| 667 | * and leave (e.g., to do with the indirect branch into the code), and |
| 668 | * per-iteration overheads as we check the counter and loop back. We aim |
| 669 | * to split these apart using linear regression. |
| 670 | */ |
| 671 | |
| 672 | /* If we've already calibrated then there's nothing to do. */ |
| 673 | if (b->f&BTF_CLB) return (b->f&BTF_ANY ? 0 : -1); |
| 674 | |
| 675 | /* Exercise the inner loop a few times to educate the branch predictor. */ |
| 676 | for (i = 0; i < 10; i++) |
| 677 | { tm->ops->now(tm, &t0); fn(50, 0); tm->ops->now(tm, &t1); } |
| 678 | |
| 679 | /* Now we measure idle loops until they take sufficiently long -- or we run |
| 680 | * out of counter. |
| 681 | */ |
| 682 | debug("calibrating..."); |
| 683 | n = 1; |
| 684 | for (;;) { |
| 685 | |
| 686 | /* Measure @n@ iterations of the idle loop. */ |
| 687 | tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1); |
| 688 | timer_diff(&delta, &t0, &t1); f &= delta.f; |
| 689 | if (!(f&BTF_TIMEOK)) { rc = -1; goto end; } |
| 690 | |
| 691 | /* Register the timings with the regression machinery. */ |
| 692 | linreg_update(&lr_clk, n, delta.t); |
| 693 | if (!(f&BTF_CYOK)) |
| 694 | debug(" n = %10lu; t = %12g s", n, delta.t); |
| 695 | else { |
| 696 | linreg_update(&lr_cy, n, delta.cy); |
| 697 | debug(" n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy); |
| 698 | } |
| 699 | |
| 700 | /* If we're done then stop. */ |
| 701 | if (delta.t >= T_CLB) break; |
| 702 | if (n >= ULONG_MAX - n/3) break; |
| 703 | |
| 704 | /* Update the counter and continue. */ |
| 705 | n += n/3 + 1; |
| 706 | } |
| 707 | |
| 708 | /* Now run the linear regression to extract the constant and per-iteration |
| 709 | * overheads. |
| 710 | */ |
| 711 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, &r); |
| 712 | debug("clock overhead = (%g n + %g) s (r = %g)", b->clk.m, b->clk.c, r); |
| 713 | if (f&BTF_CYOK) { |
| 714 | linreg_fit(&lr_cy, &b->cy.m, &b->cy.c, &r); |
| 715 | debug("cycle overhead = (%g n + %g) cy (r = %g)", b->cy.m, b->cy.c, r); |
| 716 | } |
| 717 | |
| 718 | /* We're done. */ |
| 719 | rc = 0; |
| 720 | end: |
| 721 | b->f |= f | BTF_CLB; /* no point trying again */ |
| 722 | return (rc); |
| 723 | } |
| 724 | |
| 725 | /* --- @bench_measure@ --- * |
| 726 | * |
| 727 | * Arguments: @struct bench_state *b@ = benchmark state |
| 728 | * @struct bench_timing *t_out@ = where to leave the timing |
| 729 | * @double base@ = number of internal units per call |
| 730 | * @bench_fn *fn@, @void *ctx@ = benchmark function to run |
| 731 | * |
| 732 | * Returns: Zero on success, %$-1$% if timing failed. |
| 733 | * |
| 734 | * Use: Measure a function. The function @fn@ is called adaptively |
| 735 | * with an iteration count @n@ set so as to run for |
| 736 | * approximately @b->target_s@ seconds. |
| 737 | * |
| 738 | * The result is left in @*t_out@, with @t_out->n@ counting the |
| 739 | * final product of the iteration count and @base@ (which might, |
| 740 | * e.g., reflect the number of inner iterations the function |
| 741 | * performs, or the number of bytes it processes per iteration). |
| 742 | */ |
| 743 | |
| 744 | int bench_measure(struct bench_state *b, struct bench_timing *t_out, |
| 745 | double base, bench_fn *fn, void *ctx) |
| 746 | { |
| 747 | struct bench_timer *tm = b->tm; |
| 748 | struct bench_time t0, t1; |
| 749 | unsigned long n, nn; |
| 750 | |
| 751 | /* Make sure the state is calibrated and usable. */ |
| 752 | if (!(b->f&BTF_CLB) && bench_calibrate(b)) return (-1); |
| 753 | if (!(b->f&BTF_TIMEOK)) return (-1); |
| 754 | |
| 755 | /* Main adaptive measurement loop. |
| 756 | * |
| 757 | * Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal |
| 758 | * time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy. |
| 759 | * Otherwise, we need to scale up the iteration count. The obvious next |
| 760 | * choice is %$n' = n T/t$%. Alas, rounding is a problem: if |
| 761 | * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no |
| 762 | * progress. We know that %$T/t > \sqrt{2}%, so this can only happen when |
| 763 | * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%. On the other |
| 764 | * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying |
| 765 | * again with %$n' = n + 1$% iterations will very likely work. |
| 766 | */ |
| 767 | debug("measuring..."); n = 1; |
| 768 | for (;;) { |
| 769 | tm->ops->now(tm, &t0); fn(n, ctx); tm->ops->now(tm, &t1); |
| 770 | timer_diff(t_out, &t0, &t1); |
| 771 | if (!(t_out->f&BTF_TIMEOK)) return (-1); |
| 772 | if (!(t_out->f&BTF_CYOK)) debug(" n = %10lu; t = %12g", n, t_out->t); |
| 773 | else debug(" n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy); |
| 774 | if (t_out->t >= 0.707*b->target_s) break; |
| 775 | nn = n*b->target_s/t_out->t; |
| 776 | if (nn > n) n = nn; |
| 777 | else n++; |
| 778 | } |
| 779 | |
| 780 | /* Adjust according to the calibration. */ |
| 781 | t_out->t -= n*b->clk.m + b->clk.c; |
| 782 | if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c; |
| 783 | |
| 784 | /* Report the results, if debugging. */ |
| 785 | if (!(t_out->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_out->t); |
| 786 | else debug(" adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy); |
| 787 | if (!(t_out->f&BTF_CYOK)) |
| 788 | debug(" %g s per op; %g ops/s", t_out->t/n, n/t_out->t); |
| 789 | else |
| 790 | debug(" %g s (%g cy) per op; %g ops/s", |
| 791 | t_out->t/n, t_out->cy/n, n/t_out->t); |
| 792 | |
| 793 | /* All done. */ |
| 794 | t_out->n = n*base; return (0); |
| 795 | } |
| 796 | |
| 797 | /*----- That's all, folks -------------------------------------------------*/ |