| 1 | /* -*-c-*- |
| 2 | * |
| 3 | * Benchmarking support |
| 4 | * |
| 5 | * (c) 2023 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of the mLib utilities library. |
| 11 | * |
| 12 | * mLib is free software: you can redistribute it and/or modify it under |
| 13 | * the terms of the GNU Library General Public License as published by |
| 14 | * the Free Software Foundation; either version 2 of the License, or (at |
| 15 | * your option) any later version. |
| 16 | * |
| 17 | * mLib is distributed in the hope that it will be useful, but WITHOUT |
| 18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public |
| 20 | * License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU Library General Public |
| 23 | * License along with mLib. If not, write to the Free Software |
| 24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 25 | * USA. |
| 26 | */ |
| 27 | |
| 28 | /*----- Header files ------------------------------------------------------*/ |
| 29 | |
| 30 | #include "config.h" |
| 31 | |
| 32 | #include <errno.h> |
| 33 | #include <stdarg.h> |
| 34 | #include <stdio.h> |
| 35 | #include <stdlib.h> |
| 36 | #include <string.h> |
| 37 | #include <time.h> |
| 38 | |
| 39 | #include "alloc.h" |
| 40 | #include "bench.h" |
| 41 | #include "bits.h" |
| 42 | #include "linreg.h" |
| 43 | #include "macros.h" |
| 44 | |
| 45 | /*----- Data structures ---------------------------------------------------*/ |
| 46 | |
| 47 | struct timer { |
| 48 | struct bench_timer _t; |
| 49 | const struct timer_ops *clkops, *cyops; |
| 50 | union { int fd; } u_cy; |
| 51 | }; |
| 52 | |
| 53 | struct timer_ops { |
| 54 | void (*now)(struct bench_time *t_out, struct timer *t); |
| 55 | void (*teardown)(struct timer *t); |
| 56 | }; |
| 57 | |
| 58 | /*----- Preliminaries -----------------------------------------------------*/ |
| 59 | |
| 60 | #define NS_PER_S 1000000000 |
| 61 | |
| 62 | static void PRINTF_LIKE(1, 2) debug(const char *fmt, ...) |
| 63 | { |
| 64 | const char *p; |
| 65 | va_list ap; |
| 66 | |
| 67 | p = getenv("MLIB_BENCH_DEBUG"); |
| 68 | if (p && *p != 'n' && *p != '0') { |
| 69 | va_start(ap, fmt); |
| 70 | fputs("mLib BENCH: ", stderr); |
| 71 | vfprintf(stderr, fmt, ap); |
| 72 | fputc('\n', stderr); |
| 73 | va_end(ap); |
| 74 | } |
| 75 | } |
| 76 | |
| 77 | /*----- The null clock ----------------------------------------------------*/ |
| 78 | |
| 79 | static void null_now(struct bench_time *t_out, struct timer *t) { ; } |
| 80 | static void null_teardown(struct timer *t) { ; } |
| 81 | static const struct timer_ops null_ops = { null_now, null_teardown }; |
| 82 | |
| 83 | static int null_cyinit(struct timer *t) |
| 84 | { t->cyops = &null_ops; return (0); } |
| 85 | |
| 86 | #define NULL_CYENT { "null", null_cyinit }, |
| 87 | |
| 88 | /*----- Linux performance counters ----------------------------------------*/ |
| 89 | |
| 90 | #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64) |
| 91 | |
| 92 | #include <sys/types.h> |
| 93 | #include <unistd.h> |
| 94 | |
| 95 | #include <linux/perf_event.h> |
| 96 | #include <asm/unistd.h> |
| 97 | |
| 98 | static void perfevent_now(struct bench_time *t_out, struct timer *t) |
| 99 | { |
| 100 | ssize_t n; |
| 101 | |
| 102 | n = read(t->u_cy.fd, &t_out->cy.i, sizeof(t_out->cy.i)); |
| 103 | if (n != sizeof(t_out->cy.i)) { |
| 104 | debug("failed to read perf-event counter: %s", strerror(errno)); |
| 105 | return; |
| 106 | } |
| 107 | t_out->f |= BTF_CYOK; |
| 108 | } |
| 109 | |
| 110 | static void perfevent_teardown(struct timer *t) |
| 111 | { close(t->u_cy.fd); } |
| 112 | |
| 113 | static const struct timer_ops perfevent_ops = |
| 114 | { perfevent_now, perfevent_teardown }; |
| 115 | |
| 116 | static int perfevent_init(struct timer *t) |
| 117 | { |
| 118 | struct perf_event_attr attr = { 0 }; |
| 119 | struct bench_time tm; |
| 120 | |
| 121 | attr.type = PERF_TYPE_HARDWARE; |
| 122 | attr.size = sizeof(attr); |
| 123 | attr.config = PERF_COUNT_HW_CPU_CYCLES; |
| 124 | attr.disabled = 0; |
| 125 | attr.exclude_kernel = 1; |
| 126 | attr.exclude_hv = 1; |
| 127 | |
| 128 | t->u_cy.fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); |
| 129 | if (t->u_cy.fd < 0) { |
| 130 | debug("couldn't open perf evvent: %s", strerror(errno)); |
| 131 | return (-1); |
| 132 | } |
| 133 | |
| 134 | tm.f = 0; perfevent_now(&tm, t); |
| 135 | if (!(tm.f&BTF_CYOK)) { close(t->u_cy.fd); return (-1); } |
| 136 | |
| 137 | t->cyops = &perfevent_ops; return (0); |
| 138 | } |
| 139 | # define PERFEVENT_CYENT { "linux-perf-event", perfevent_init }, |
| 140 | #else |
| 141 | # define PERFEVENT_CYENT |
| 142 | #endif |
| 143 | |
| 144 | /*----- Intel time-stamp counter ------------------------------------------*/ |
| 145 | |
| 146 | #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) |
| 147 | |
| 148 | #define EFLAGS_ID (1u << 21) |
| 149 | #define CPUID_1D_TSC (1u << 4) |
| 150 | |
| 151 | static uint32 set_flags(unsigned long m, unsigned long x) |
| 152 | { |
| 153 | unsigned long r; |
| 154 | |
| 155 | #ifdef __x86_64__ |
| 156 | # define TMP "%%rcx" |
| 157 | #else |
| 158 | # define TMP "%%ecx" |
| 159 | #endif |
| 160 | |
| 161 | __asm__ ("pushf\n\t" |
| 162 | "pop %0\n\t" |
| 163 | "mov %0, " TMP "\n\t" |
| 164 | "and %1, %0\n\t" |
| 165 | "xor %2, %0\n\t" |
| 166 | "push %0\n\t" |
| 167 | "popf\n\t" |
| 168 | "pushf\n\t" |
| 169 | "pop %0\n\t" |
| 170 | "push " TMP "\n\t" |
| 171 | "popf" |
| 172 | : "=r"(r) |
| 173 | : "g"(m), "g"(x) |
| 174 | : "ecx"); |
| 175 | return (r); |
| 176 | } |
| 177 | |
| 178 | struct cpuid { uint32 a, b, c, d; }; |
| 179 | |
| 180 | static void cpuid(struct cpuid *info_out, uint32 a, uint32 c) |
| 181 | { |
| 182 | __asm__ ("movl %1, %%eax\n\t" |
| 183 | "movl %2, %%ecx\n\t" |
| 184 | "cpuid\n\t" |
| 185 | "movl %%eax, 0(%0)\n\t" |
| 186 | "movl %%ebx, 4(%0)\n\t" |
| 187 | "movl %%ecx, 8(%0)\n\t" |
| 188 | "movl %%edx, 12(%0)\n\t" |
| 189 | : /* no outputs */ |
| 190 | : "r"(info_out), "g"(a), "g"(c) |
| 191 | : "eax", "ebx", "ecx", "edx", "cc"); |
| 192 | } |
| 193 | |
| 194 | static void x86rdtsc_now(struct bench_time *t_out, struct timer *t) |
| 195 | { |
| 196 | uint32 lo, hi; |
| 197 | |
| 198 | __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); |
| 199 | SET64(t_out->cy, hi, lo); t_out->f |= BTF_CYOK; |
| 200 | } |
| 201 | |
| 202 | static const struct timer_ops x86rdtsc_ops = |
| 203 | { x86rdtsc_now, null_teardown }; |
| 204 | |
| 205 | static int x86rdtsc_init(struct timer *t) |
| 206 | { |
| 207 | struct cpuid info; |
| 208 | |
| 209 | if ((set_flags(~EFLAGS_ID, 0)&EFLAGS_ID) || |
| 210 | !(set_flags(~EFLAGS_ID, EFLAGS_ID)&EFLAGS_ID)) |
| 211 | { debug("no `cpuid' instruction"); return (-1); } |
| 212 | cpuid(&info, 0, 0); |
| 213 | if (info.a < 1) { debug("no `cpuid' leaf 1"); return (-1); } |
| 214 | cpuid(&info, 1, 0); |
| 215 | if (!(info.d&CPUID_1D_TSC)) |
| 216 | { debug("no `rdtsc' instrunction"); return (-1); } |
| 217 | t->cyops = &x86rdtsc_ops; return (0); |
| 218 | } |
| 219 | |
| 220 | # define X86RDTSC_CYENT { "x86-rdtsc", x86rdtsc_init }, |
| 221 | #else |
| 222 | # define X86RDTWC_CYENT |
| 223 | #endif |
| 224 | |
| 225 | /*----- POSIX `clock_gettime' ---------------------------------------------*/ |
| 226 | |
| 227 | #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID) |
| 228 | |
| 229 | static void gettime_now(struct bench_time *t_out, struct timer *t) |
| 230 | { |
| 231 | struct timespec now; |
| 232 | |
| 233 | if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now)) |
| 234 | { debug("error reading POSIX clock: %s", strerror(errno)); return; } |
| 235 | ASSIGN64(t_out->s, now.tv_sec); t_out->ns = now.tv_nsec; |
| 236 | t_out->f |= BTF_TIMEOK; |
| 237 | } |
| 238 | |
| 239 | static const struct timer_ops gettime_ops = { gettime_now, null_teardown }; |
| 240 | |
| 241 | static int gettime_init(struct timer *t) |
| 242 | { |
| 243 | struct bench_time tm; |
| 244 | |
| 245 | tm.f = 0; gettime_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1); |
| 246 | t->clkops = &gettime_ops; return (0); |
| 247 | } |
| 248 | |
| 249 | # define GETTIME_CLKENT { "posix-clock_gettime", gettime_init }, |
| 250 | #else |
| 251 | # define GETTIME_CLKENT |
| 252 | #endif |
| 253 | |
| 254 | /*----- Standard C `clock' ------------------------------------------------*/ |
| 255 | |
| 256 | static void clock_now(struct bench_time *t_out, struct timer *t) |
| 257 | { |
| 258 | clock_t now, x; |
| 259 | unsigned long s; uint32 ns; |
| 260 | |
| 261 | now = clock(); |
| 262 | if (now == (clock_t)-1) { |
| 263 | debug("error reading standard clock: %s", strerror(errno)); |
| 264 | return; |
| 265 | } |
| 266 | x = now/CLOCKS_PER_SEC; |
| 267 | if (x > ULONG_MAX) { debug("standard clock out of range"); return; } |
| 268 | |
| 269 | s = x; x = now - CLOCKS_PER_SEC*s; |
| 270 | if (!(NS_PER_S%CLOCKS_PER_SEC)) |
| 271 | ns = x*(NS_PER_S/CLOCKS_PER_SEC); |
| 272 | else if (NS_PER_S <= ULONG_MAX/CLOCKS_PER_SEC) |
| 273 | ns = (x*NS_PER_S)/CLOCKS_PER_SEC; |
| 274 | else |
| 275 | ns = x*((NS_PER_S + 0.0)/CLOCKS_PER_SEC); |
| 276 | ASSIGN64(t_out->s, s); t_out->ns = ns; t_out->f |= BTF_TIMEOK; |
| 277 | } |
| 278 | |
| 279 | static const struct timer_ops clock_ops = { clock_now, null_teardown }; |
| 280 | |
| 281 | static int clock_init(struct timer *t) |
| 282 | { |
| 283 | struct bench_time tm; |
| 284 | |
| 285 | tm.f = 0; clock_now(&tm, t); if (!tm.f&BTF_TIMEOK) return (-1); |
| 286 | t->clkops = &clock_ops; return (0); |
| 287 | } |
| 288 | |
| 289 | #define CLOCK_CLKENT { "clock", clock_init }, |
| 290 | |
| 291 | /*----- Timing setup ------------------------------------------------------*/ |
| 292 | |
| 293 | static const struct timerent { |
| 294 | const char *name; |
| 295 | int (*init)(struct timer */*t*/); |
| 296 | } |
| 297 | clktab[] = { GETTIME_CLKENT CLOCK_CLKENT { 0, 0 } }, |
| 298 | cytab[] = { PERFEVENT_CYENT X86RDTSC_CYENT NULL_CYENT { 0, 0 } }; |
| 299 | |
| 300 | static const struct timerent *find_timer_n(const char *name, size_t sz, |
| 301 | const struct timerent *timers, |
| 302 | const char *what) |
| 303 | { |
| 304 | while (timers->name) { |
| 305 | if (strlen(timers->name) == sz && MEMCMP(name, ==, timers->name, sz)) |
| 306 | return (timers); |
| 307 | timers++; |
| 308 | } |
| 309 | debug("%s timer `%.*s' not found", what, (int)sz, name); return (0); |
| 310 | } |
| 311 | |
| 312 | static int try_timer(struct timer *t, |
| 313 | const struct timerent *timer, const char *what) |
| 314 | { |
| 315 | if (timer->init(t)) return (-1); |
| 316 | debug("selected %s timer `%s'", what, timer->name); return (0); |
| 317 | } |
| 318 | |
| 319 | static int select_timer(struct timer *t, const struct timerent *timers, |
| 320 | const char *varname, const char *what) |
| 321 | { |
| 322 | const char *p; size_t n; |
| 323 | const struct timerent *timer; |
| 324 | |
| 325 | p = getenv(varname); |
| 326 | if (!p) { |
| 327 | while (timers->name) |
| 328 | if (!try_timer(t, timers++, what)) return (0); |
| 329 | } else { |
| 330 | for (;;) { |
| 331 | n = strcspn(p, ","); |
| 332 | timer = find_timer_n(p, n, timers, what); |
| 333 | if (timer && !try_timer(t, timer, what)) return (0); |
| 334 | if (!p[n]) break; |
| 335 | p += n + 1; |
| 336 | } |
| 337 | } |
| 338 | debug("no suitable %s timer found", what); return (-1); |
| 339 | } |
| 340 | |
| 341 | static void timer_now(struct bench_timer *tm, struct bench_time *t_out) |
| 342 | { |
| 343 | struct timer *t = (struct timer *)tm; |
| 344 | |
| 345 | t->clkops->now(t_out, t); |
| 346 | t->cyops->now(t_out, t); |
| 347 | } |
| 348 | |
| 349 | static void timer_destroy(struct bench_timer *tm) |
| 350 | { |
| 351 | struct timer *t = (struct timer *)tm; |
| 352 | |
| 353 | if (!t) return; |
| 354 | if (t->clkops) t->clkops->teardown(t); |
| 355 | if (t->cyops) t->cyops->teardown(t); |
| 356 | xfree(t); |
| 357 | } |
| 358 | |
| 359 | static const struct bench_timerops timer_ops = { timer_now, timer_destroy }; |
| 360 | |
| 361 | struct bench_timer *bench_createtimer(void) |
| 362 | { |
| 363 | struct timer *t = 0; |
| 364 | struct bench_timer *ret = 0; |
| 365 | |
| 366 | t = xmalloc(sizeof(*t)); t->cyops = 0; t->clkops = 0; |
| 367 | if (select_timer(t, clktab, "MLIB_BENCH_CLKTIMER", "clock")) goto end; |
| 368 | if (select_timer(t, cytab, "MLIB_BENCH_CYCLETIMER", "cycle")) goto end; |
| 369 | t->_t.ops = &timer_ops; ret = &t->_t; t = 0; |
| 370 | end: |
| 371 | if (t) timer_destroy(&t->_t); |
| 372 | return (ret); |
| 373 | } |
| 374 | |
| 375 | #ifdef HAVE_UINT64 |
| 376 | # define FLOATK64(k) ((double)(k).i) |
| 377 | #else |
| 378 | # define FLOATK64(k) ((double)(k).lo + 4275123318.0*(double)(k).hi) |
| 379 | #endif |
| 380 | |
| 381 | static void timer_diff(struct bench_timing *delta_out, |
| 382 | const struct bench_time *t0, |
| 383 | const struct bench_time *t1) |
| 384 | { |
| 385 | delta_out->f = t0->f&t1->f; |
| 386 | kludge64 k; |
| 387 | |
| 388 | if (!(delta_out->f&BTF_TIMEOK)) |
| 389 | delta_out->t = 0.0; |
| 390 | else { |
| 391 | SUB64(k, t1->s, t0->s); |
| 392 | delta_out->t = FLOATK64(k) - 1 + |
| 393 | (t1->ns + NS_PER_S - t0->ns)/(double)NS_PER_S; |
| 394 | } |
| 395 | |
| 396 | if (!(delta_out->f&BTF_CYOK)) |
| 397 | delta_out->cy = 0.0; |
| 398 | else { |
| 399 | SUB64(k, t1->cy, t0->cy); |
| 400 | delta_out->cy = FLOATK64(k); |
| 401 | } |
| 402 | } |
| 403 | |
| 404 | /*----- Calibration -------------------------------------------------------*/ |
| 405 | |
| 406 | void bench_init(struct bench_state *b, struct bench_timer *tm) |
| 407 | { b->tm = tm; b->target_s = 1.0; b->f = 0; } |
| 408 | |
| 409 | void bench_destroy(struct bench_state *b) |
| 410 | { b->tm->ops->destroy(b->tm); } |
| 411 | |
| 412 | static void do_nothing(unsigned long n, void *p) |
| 413 | { while (n--) RELAX; } |
| 414 | |
| 415 | int bench_calibrate(struct bench_state *b) |
| 416 | { |
| 417 | struct linreg lr_clk = LINREG_INIT, lr_cy = LINREG_INIT; |
| 418 | unsigned long n; |
| 419 | unsigned i; |
| 420 | struct bench_timer *tm = b->tm; |
| 421 | struct bench_time t0, t1; |
| 422 | struct bench_timing delta; |
| 423 | bench_fn *fn = LAUNDER(&do_nothing); |
| 424 | unsigned f = BTF_ANY; |
| 425 | int rc; |
| 426 | |
| 427 | if (b->f&BTF_ANY) return (0); |
| 428 | |
| 429 | for (i = 0; i < 10; i++) |
| 430 | { tm->ops->now(tm, &t0); fn(1, 0); tm->ops->now(tm, &t1); } |
| 431 | |
| 432 | debug("calibrating..."); |
| 433 | n = 1; |
| 434 | for (;;) { |
| 435 | tm->ops->now(tm, &t0); fn(n, 0); tm->ops->now(tm, &t1); |
| 436 | timer_diff(&delta, &t0, &t1); f &= delta.f; |
| 437 | if (!(f&BTF_TIMEOK)) { rc = -1; goto end; } |
| 438 | linreg_update(&lr_clk, n, delta.t); |
| 439 | if (!(f&BTF_CYOK)) |
| 440 | debug(" n = %10lu; t = %12g s", n, delta.t); |
| 441 | else { |
| 442 | linreg_update(&lr_cy, n, delta.cy); |
| 443 | debug(" n = %10lu; t = %12g s, cy = %10.0f", n, delta.t, delta.cy); |
| 444 | } |
| 445 | if (delta.t >= b->target_s/20.0) break; |
| 446 | if (n >= ULONG_MAX - n/3) break; |
| 447 | n += n/3 + 1; |
| 448 | } |
| 449 | |
| 450 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0); |
| 451 | debug("clock overhead = (%g n + %g) s", b->clk.m, b->clk.c); |
| 452 | if (f&BTF_CYOK) { |
| 453 | linreg_fit(&lr_clk, &b->clk.m, &b->clk.c, 0); |
| 454 | debug("cycle overhead = (%g n + %g) cy", b->cy.m, b->cy.c); |
| 455 | } |
| 456 | b->f |= f; rc = 0; |
| 457 | end: |
| 458 | return (rc); |
| 459 | } |
| 460 | |
| 461 | int bench_measure(struct bench_timing *t_out, struct bench_state *b, |
| 462 | bench_fn *fn, void *p) |
| 463 | { |
| 464 | struct bench_timer *tm = b->tm; |
| 465 | struct bench_time t0, t1; |
| 466 | unsigned long n; |
| 467 | |
| 468 | if (bench_calibrate(b)) return (-1); |
| 469 | debug("measuring..."); n = 1; |
| 470 | for (;;) { |
| 471 | tm->ops->now(tm, &t0); fn(n, p); tm->ops->now(tm, &t1); |
| 472 | timer_diff(t_out, &t0, &t1); |
| 473 | if (!(t_out->f&BTF_TIMEOK)) return (-1); |
| 474 | if (!(t_out->f&BTF_CYOK)) debug(" n = %10lu; t = %12g", n, t_out->t); |
| 475 | else debug(" n = %10lu; t = %12g, cy = %10.0f", n, t_out->t, t_out->cy); |
| 476 | if (t_out->t >= 0.72*b->target_s) break; |
| 477 | n *= 1.44*b->target_s/t_out->t; |
| 478 | } |
| 479 | t_out->t -= n*b->clk.m + b->clk.c; |
| 480 | if (t_out->f&BTF_CYOK) t_out->cy -= n*b->cy.m + b->cy.c; |
| 481 | if (!(t_out->f&BTF_CYOK)) debug(" adjusted t' = %12g", t_out->t); |
| 482 | else debug(" adjusted t = %12g, cy = %10.0f", t_out->t, t_out->cy); |
| 483 | if (!(t_out->f&BTF_CYOK)) |
| 484 | debug(" %g s per op; %g ops/s", t_out->t/n, n/t_out->t); |
| 485 | else |
| 486 | debug(" %g s (%g cy) per op; %g ops/s", |
| 487 | t_out->t/n, t_out->cy/n, n/t_out->t); |
| 488 | t_out->n = n; return (0); |
| 489 | } |
| 490 | |
| 491 | /*----- That's all, folks -------------------------------------------------*/ |