5 * (c) 2023 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of the mLib utilities library.
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
28 /*----- Header files ------------------------------------------------------*/
48 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
50 # define CPUID_1D_TSC (1u << 4)
51 # define CPUID_1xD_TSCP (1u << 27)
54 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
55 # include <sys/types.h>
57 # include <linux/perf_event.h>
58 # include <asm/unistd.h>
59 # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
60 # include <sys/mman.h>
64 /*----- Data structures ---------------------------------------------------*/
66 enum { CLK
, CY
, NTIMER
};
69 struct bench_timer _t
;
70 const struct timer_ops
*ops
[NTIMER
]; /* subtimers for clock and cycles */
72 unsigned tscaux
; /* `ia32_tsc_aux' for `ldtscp' */
73 int fd
; /* vanilla `perf_event_open' */
74 struct { const volatile void *map
; size_t sz
; } pmc
; /* `perf_event_open'
76 } u_cy
; /* state for cycle measurement */
80 const char *name
; /* timer name */
81 unsigned f
; /* flags */
82 #define TF_SECRET 1u /* don't try this automatically */
83 int (*init
)(struct timer */
*t*/
); /* initialization function */
84 int (*now
)(struct timer */
*t*/
, /* read current */
85 struct bench_time */
*t_out*/
, unsigned /*f*/);
86 void (*diff
)(struct timer */
*t*/
, /* difference */
87 struct bench_timing */
*t_inout*/
,
88 const struct bench_time */
*t0*/
,
89 const struct bench_time */
*t1*/
);
90 void (*teardown
)(struct timer */
*t*/
); /* release held resources */
93 /*----- Preliminaries -----------------------------------------------------*/
95 #define NS_PER_S 1000000000
99 * Arguments: @const char *fmt@ = format control string
100 * @...@ = format arguments
104 * Use: Maybe report a debugging message to standard error.
107 static PRINTF_LIKE(1, 2) void debug(const char *fmt
, ...)
112 p
= getenv("MLIB_BENCH_DEBUG");
113 if (p
&& *p
!= 'n' && *p
!= '0') {
115 fputs("mLib BENCH: ", stderr
);
116 vfprintf(stderr
, fmt
, ap
);
122 /*----- Difference utilities ----------------------------------------------*/
125 # define FLOATK64(k) ((double)(k).i)
127 # define FLOATK64(k) ((double)(k).lo + 4294967296.0*(double)(k).hi)
130 /* --- @diff_ts@ --- *
132 * Arguments: @struct timer *t@ = timer structure
133 * @struct bench_timing *delta_inout@ = where to put the result
134 * @const struct time *t0, *t1@ = two input times
138 * Use: Calculates a time difference for timers using the
139 * @struct timespec@-like time format.
142 static void diff_ts(struct timer
*t
, struct bench_timing
*delta_inout
,
143 const struct bench_time
*t0
, const struct bench_time
*t1
)
145 unsigned f
= t0
->f
&t1
->f
;
150 /* Calculate the integer difference in seconds. */
151 SUB64(k
, t1
->t
.ts
.s
, t0
->t
.ts
.s
);
153 /* And apply the nanoseconds difference. To prevent underflow,
154 * pre-emptively borrow one from the integer difference.
158 (t1
->t
.ts
.ns
+ NS_PER_S
- t0
->t
.ts
.ns
)/(double)NS_PER_S
;
161 delta_inout
->f
|= BTF_TIMEOK
;
165 /* --- @diff_cycles@ --- *
167 * Arguments: @struct timer *t@ = timer structure
168 * @struct bench_timing *delta_inout@ = where to put the result
169 * @const struct time *t0, *t1@ = two input times
173 * Use: Calculates a time difference for cycle-counting timers.
176 static void diff_cycles(struct timer
*t
, struct bench_timing
*delta_inout
,
177 const struct bench_time
*t0
,
178 const struct bench_time
*t1
)
180 unsigned f
= t0
->f
&t1
->f
;
184 SUB64(k
, t1
->cy
, t0
->cy
); delta_inout
->cy
= FLOATK64(k
);
185 delta_inout
->f
|= BTF_CYOK
;
191 /*----- The null timer ----------------------------------------------------*/
193 /* This is a timer which does nothing, in case we don't have any better
/* Trivially succeed: the null timer needs no setup. */
static int null_init(struct timer *t)
  { return (0); }
/* Record nothing: the null timer can't measure anything, so the reading
 * succeeds vacuously without setting any validity flags.
 */
static int null_now(struct timer *t, struct bench_time *t_out, unsigned f)
  { return (0); }
/* Do nothing: there are no readings to compare. */
static void null_diff(struct timer *t, struct bench_timing *delta_inout,
                      const struct bench_time *t0,
                      const struct bench_time *t1)
  { ; }
/* Nothing to release. */
static void null_teardown(struct timer *t)
  { ; }
/* Operations table for the do-nothing fallback timer. */
static const struct timer_ops null_ops =
  { "null", 0, null_init, null_now, null_diff, null_teardown };
208 #define NULL_ENT &null_ops,
210 /*----- The broken clock --------------------------------------------------*/
212 /* This is a cycle counter which does nothing, in case we don't have any
/* Always fail: the broken timer can never be selected by initialization. */
static int broken_init(struct timer *t)
  { return (-1); }
/* Operations table for the always-failing placeholder timer; marked
 * `TF_SECRET' so it is only chosen by explicit request, never
 * automatically.
 */
static const struct timer_ops broken_ops =
  { "broken", TF_SECRET, broken_init, null_now, null_diff, null_teardown };
220 #define BROKEN_ENT &broken_ops,
222 /*----- Linux performance counters ----------------------------------------*/
224 /* This is a cycle counter which uses the Linux performance event system,
225 * which is probably the best choice if it's available.
228 #if defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
230 /* --- @perfevent_open@ --- *
234 * Returns: File descriptor, or %$-1$%.
236 * Use: Open a performance measurement descriptor set up to count CPU
240 static int perfevent_open(void)
242 struct perf_event_attr attr
= { 0 };
245 attr
.type
= PERF_TYPE_HARDWARE
;
246 attr
.size
= sizeof(attr
);
247 attr
.config
= PERF_COUNT_HW_CPU_CYCLES
;
249 attr
.exclude_kernel
= 1;
252 fd
= syscall(__NR_perf_event_open
, &attr
, 0, -1, -1, 0);
254 debug("couldn't open perf event: %s", strerror(errno
));
261 static int perfevent_now(struct timer
*t
,
262 struct bench_time
*t_out
, unsigned f
)
266 n
= read(t
->u_cy
.fd
, &t_out
->cy
.i
, sizeof(t_out
->cy
.i
));
267 if (n
!= sizeof(t_out
->cy
.i
)) {
268 debug("failed to read perf-event counter: %s", strerror(errno
));
271 t_out
->f
|= BTF_CYOK
; return (0);
274 static void perfevent_teardown(struct timer
*t
)
275 { close(t
->u_cy
.fd
); }
277 static int perfevent_init(struct timer
*t
)
279 struct bench_time tm
;
282 fd
= perfevent_open(); if (!fd
) { rc
= -1; goto end
; }
284 t
->u_cy
.fd
= fd
; tm
.f
= 0; perfevent_now(t
, &tm
, 0);
285 if (!(tm
.f
&BTF_CYOK
)) { rc
= -1; goto end
; }
288 if (fd
!= -1) close(fd
);
/* Operations table for the vanilla `perf_event_open' cycle counter. */
static const struct timer_ops perfevent_ops =
  { "linux-perf-read-hw-cycles", 0,
    perfevent_init, perfevent_now, diff_cycles, perfevent_teardown };
295 #define PERFEVENT_VANILLA_CYENT &perfevent_ops,
297 # if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
299 /* Special syscall-free version for x86 using `rdpmc' instruction. *
301 * This is a bit weird because it does both kinds of measurement in a single
305 static int perfevrdpmc_now(struct timer
*t
,
306 struct bench_time
*t_out
, unsigned f
)
308 const volatile struct perf_event_mmap_page
*map
= t
->u_cy
.pmc
.map
;
309 unsigned long long tsc
= tsc
, toff
= toff
, tenb
= tenb
;
310 unsigned long long cy
= cy
, cyoff
= cyoff
;
311 unsigned long long m
, hi
, lo
;
312 unsigned tshift
= tshift
, tmult
= tmult
, q0
, q1
, ff
;
314 /* Repeat until we can complete this job without the buffer changing in the
318 __atomic_thread_fence(__ATOMIC_ACQ_REL
);
322 /* Read the passage-of-time information. */
323 if (map
->cap_user_time
) {
324 tenb
= map
->time_enabled
;
325 tsc
= __builtin_ia32_rdtsc();
326 tshift
= map
->time_shift
;
327 tmult
= map
->time_mult
;
328 toff
= map
->time_offset
;
332 /* Read the performance-counter information. */
333 if (map
->cap_user_rdpmc
) {
334 cy
= __builtin_ia32_rdpmc(map
->index
- 1);
339 /* Check the sequence number again. */
340 __atomic_thread_fence(__ATOMIC_ACQ_REL
);
347 /* We have a raw reference-cycle count %$n$% (@tsc@), and parameters
348 * %$a$%, %$w$% and %$t_0$%, such that %$a n/2^w + t_0$% gives a time in
352 m
= (1ull << tshift
) - 1;
353 hi
= tsc
>> tshift
; lo
= tsc
&m
;
354 t_out
->t
.rawns
.i
= hi
*tmult
+ (lo
*tmult
>> tshift
) + toff
+ tenb
;
355 t_out
->f
|= BTF_TIMEOK
;
359 /* We have the cycle count. */
361 t_out
->cy
.i
= cy
+ cyoff
;
362 t_out
->f
|= BTF_CYOK
;
367 static void perfevrdpmc_diff(struct timer
*t
,
368 struct bench_timing
*delta_inout
,
369 const struct bench_time
*t0
,
370 const struct bench_time
*t1
)
372 unsigned f
= t0
->f
&t1
->f
;
375 delta_inout
->t
= (t1
->t
.rawns
.i
- t0
->t
.rawns
.i
)/(double)NS_PER_S
;
376 delta_inout
->f
|= BTF_TIMEOK
;
380 delta_inout
->cy
= t1
->cy
.i
- t0
->cy
.i
;
381 delta_inout
->f
|= BTF_CYOK
;
385 static void perfevrdpmc_teardown(struct timer
*t
)
386 { munmap((/*unconst unvolatile*/ void *)t
->u_cy
.pmc
.map
, t
->u_cy
.pmc
.sz
); }
388 static int perfevrdpmc_cyinit(struct timer
*t
)
390 const volatile struct perf_event_mmap_page
*map
= 0;
391 unsigned a
, b
, c
, d
, q0
, q1
, f
;
392 int pgsz
, mapsz
, fd
= -1, rc
;
394 /* We need `rdtsc' to do the passage-of-time measurement. */
395 if (!__get_cpuid(1, &a
, &b
, &c
, &d
) || !(d
&CPUID_1D_TSC
))
396 { debug("no `rdtsc' instrunction"); return (-1); }
398 /* The rules say we must allocate %$1 + 2^n$% pages, so we need to know how
401 pgsz
= sysconf(_SC_PAGESIZE
);
403 debug("failed to discover page size!: %s", strerror(errno
));
407 /* Open the measurement descriptor and map it. */
408 fd
= perfevent_open(); if (!fd
) return (-1);
410 map
= mmap(0, mapsz
, PROT_READ
, MAP_SHARED
, fd
, 0);
411 if (map
== MAP_FAILED
) {
412 debug("failed to map perf event: %s", strerror(errno
));
416 /* Check that it's revealed the necessary information. */
418 __atomic_thread_fence(__ATOMIC_ACQ_REL
);
421 if (map
->cap_user_time
) f
|= BTF_TIMEOK
;
422 if (map
->cap_user_rdpmc
) f
|= BTF_CYOK
;
423 __atomic_thread_fence(__ATOMIC_ACQ_REL
);
429 { debug("kernel refused user time measurement"); rc
= -1; goto end
; }
431 { debug("kernel refused user cycle measurement"); rc
= -1; goto end
; }
433 /* All done. We can close the descriptor here: the mapping will keep the
434 * performance-measurement machinery alive.
436 t
->u_cy
.pmc
.map
= map
; t
->u_cy
.pmc
.sz
= mapsz
; map
= 0; rc
= 0;
438 if (fd
!= -1) close(fd
);
439 if (map
) munmap((/*unconst unvolatile*/ void *)map
, mapsz
);
/* Operations table for the syscall-free `rdpmc'-based cycle counter. */
static const struct timer_ops perfevrdpmc_cyops =
  { "linux-x86-perf-rdpmc-hw-cycles", 0,
    perfevrdpmc_cyinit, perfevrdpmc_now,
    perfevrdpmc_diff, perfevrdpmc_teardown };
448 static int perfevrdpmc_clkinit(struct timer
*t
)
450 if (t
->ops
[CLK
] != &perfevrdpmc_cyops
) {
451 debug("linux-x86-perf-rdpmc-hw-cycles not set as cycle subtimer");
/* Operations table for the `rdpmc' clock timer.  It shares its name with
 * the cycle timer -- apparently deliberate: @perfevrdpmc_clkinit@ only
 * succeeds when the matching cycle timer is installed, and the real
 * readings come from the cycle timer's @now@/@diff@ routines.
 */
static const struct timer_ops perfevrdpmc_clkops =
  { "linux-x86-perf-rdpmc-hw-cycles", 0,
    perfevrdpmc_clkinit, null_now,
    null_diff, null_teardown };
462 # define PERFEVENT_RDPMC_CLKENT &perfevrdpmc_clkops,
463 # define PERFEVENT_RDPMC_CYENT &perfevrdpmc_cyops,
466 # define PERFEVENT_RDPMC_CLKENT
467 # define PERFEVENT_RDPMC_CYENT
470 # define PERFEVENT_CLKENT PERFEVENT_RDPMC_CLKENT
471 # define PERFEVENT_CYENT PERFEVENT_RDPMC_CYENT PERFEVENT_VANILLA_CYENT
473 # define PERFEVENT_CLKENT
474 # define PERFEVENT_CYENT
477 /*----- Intel time-stamp counter ------------------------------------------*/
479 /* This is a cycle counter based on the Intel `rdtsc' instruction. It's not
480 * really suitable for performance measurement because it gets confused by
481 * CPU frequency adjustments.
484 #if GCC_VERSION_P(4, 5) && (defined(__i386__) || defined(__x86_64__))
486 static int x86rdtsc_now(struct timer
*t
,
487 struct bench_time
*t_out
, unsigned f
)
488 { t_out
->cy
.i
= __builtin_ia32_rdtsc(); t_out
->f
|= BTF_CYOK
; return (0); }
490 static int x86rdtsc_init(struct timer
*t
)
494 if (!__get_cpuid(1, &a
, &b
, &c
, &d
) || !(d
&CPUID_1D_TSC
))
495 { debug("no `rdtsc' instrunction"); return (-1); }
496 t
->u_cy
.tscaux
= ~0u;
500 static int x86rdtscp_now(struct timer
*t
,
501 struct bench_time
*t_out
, unsigned f
)
504 unsigned long long n
;
506 n
= __builtin_ia32_rdtscp(&tscaux
);
508 t
->u_cy
.tscaux
= tscaux
;
509 else if (t
->u_cy
.tscaux
!= tscaux
) {
510 debug("tscaux mismatch: new 0x%08x /= old 0x%08x",
511 tscaux
, t
->u_cy
.tscaux
);
514 t_out
->cy
.i
= n
; t_out
->f
|= BTF_CYOK
; return (0);
517 static int x86rdtscp_init(struct timer
*t
)
521 if (!__get_cpuid(0x80000001, &a
, &b
, &c
, &d
) || !(d
&CPUID_1xD_TSCP
))
522 { debug("no `rdtscp' instrunction"); return (-1); }
526 static const struct timer_ops x86rdtsc_ops
=
528 x86rdtsc_init
, x86rdtsc_now
, diff_cycles
, null_teardown
};
529 static const struct timer_ops x86rdtscp_ops
=
531 x86rdtscp_init
, x86rdtscp_now
, diff_cycles
, null_teardown
};
533 # define X86RDTSC_CYENT &x86rdtscp_ops, &x86rdtsc_ops,
535 # define X86RDTSC_CYENT
538 /*----- POSIX `clock_gettime' ---------------------------------------------*/
540 /* This is a real-time clock based on the POSIX time interface, with up to
541 * nanosecond precision.
544 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
546 static int gettime_now(struct timer
*t
, struct bench_time
*t_out
, unsigned f
)
550 if (clock_gettime(CLOCK_THREAD_CPUTIME_ID
, &now
))
551 { debug("error reading POSIX clock: %s", strerror(errno
)); return (0); }
552 ASSIGN64(t_out
->t
.ts
.s
, now
.tv_sec
); t_out
->t
.ts
.ns
= now
.tv_nsec
;
553 t_out
->f
|= BTF_TIMEOK
; return (0);
556 static int gettime_init(struct timer
*t
)
558 struct bench_time tm
;
560 tm
.f
= 0; gettime_now(t
, &tm
, 0); if (!tm
.f
&BTF_TIMEOK
) return (-1);
/* Operations table for the POSIX per-thread CPU-time clock. */
static const struct timer_ops gettime_ops =
  { "posix-thread-cputime", 0,
    gettime_init, gettime_now, diff_ts, null_teardown };
568 # define GETTIME_CLKENT &gettime_ops,
570 # define GETTIME_CLKENT
573 /*----- Standard C `clock' ------------------------------------------------*/
575 /* This is a real-time clock based on the C `clock' function which is
576 * guaranteed to be available, though it's not likely to be very good.
579 static int clock_now(struct timer
*t
, struct bench_time
*t_out
, unsigned f
)
584 if (now
== (clock_t)-1) {
585 debug("error reading standard clock: %s", strerror(errno
));
588 t_out
->t
.clk
= now
; t_out
->f
|= BTF_TIMEOK
; return (0);
591 static void clock_diff(struct timer
*t
, struct bench_timing
*delta_inout
,
592 const struct bench_time
*t0
,
593 const struct bench_time
*t1
)
595 unsigned f
= t0
->f
&t1
->f
;
598 delta_inout
->t
= (t1
->t
.clk
- t0
->t
.clk
)/(double)CLOCKS_PER_SEC
;
599 delta_inout
->f
|= BTF_TIMEOK
;
603 static int clock_init(struct timer
*t
)
605 struct bench_time tm
;
607 tm
.f
= 0; clock_now(t
, &tm
, 0); if (!tm
.f
&BTF_TIMEOK
) return (-1);
/* Operations table for the always-available standard C `clock' timer. */
static const struct timer_ops clock_ops =
  { "stdc-clock", 0, clock_init, clock_now, clock_diff, null_teardown };
614 #define CLOCK_CLKENT &clock_ops,
616 /*----- Timing setup ------------------------------------------------------*/
618 /* Tables of timing sources. */
619 static const struct timer_ops
620 *const clktab
[] = { PERFEVENT_CLKENT
625 *const cytab
[] = { PERFEVENT_CYENT
631 static const struct timertab
{
634 const struct timer_ops
*const *opstab
;
636 { "clock", "MLIB_BENCH_CLKTIMER", clktab
},
637 { "cycle", "MLIB_BENCH_CYCLETIMER", cytab
}
640 /* --- @find_timer@ --- *
642 * Arguments: @const char *name@ = timer name
643 * @size_t sz@ = length of name
644 * @unsigned tm@ = which subtimer we're looking for
646 * Returns: The table entry matching the given name, or null if there
650 static const struct timer_ops
*find_timer(const char *name
, size_t sz
,
653 const struct timer_ops
*const *tt
;
655 for (tt
= timertab
[tm
].opstab
; *tt
; tt
++) {
656 if (strlen((*tt
)->name
) == sz
&&
657 MEMCMP(name
, ==, (*tt
)->name
, sz
))
660 debug("%s timer `%.*s' not found",
661 timertab
[tm
].what
, (int)sz
, name
); return (0);
664 /* --- @try_timer@ --- *
666 * Arguments: @struct timer *t@ = timer structure
667 * @const struct timer_ops *ops@ = timer ops
668 * @unsigned tm@ = which subtimer we're setting
670 * Returns: Zero on success, %$-1$% if timer failed.
672 * Use: Tries to initialize the timer @t@, reporting a debug message
676 static int try_timer(struct timer
*t
,
677 const struct timer_ops
*ops
, unsigned tm
)
679 if (ops
->init(t
)) return (-1);
680 debug("selected %s timer `%s'", timertab
[tm
].what
, ops
->name
);
681 t
->ops
[tm
] = ops
; return (0);
684 /* --- @select_timer@ --- *
686 * Arguments: @struct timer *t@ = timer structure
687 * @unsigned tm@ = which subtimer we're setting
688 * @const char *config@, @size_t sz@ = config string
690 * Returns: Zero on success, %$-1$% if timer failed.
692 * Use: Select a timer from the table. If the environment variable
693 * is set, then parse a comma-separated list of timer names and
694 * use the first one listed that seems to work; otherwise, try
695 * the timers in the table in order.
698 static int select_timer(struct timer
*t
, unsigned tm
,
699 const char *config
, size_t sz
)
702 const struct timer_ops
*ops
, *const *tt
;
705 for (tt
= timertab
[tm
].opstab
; *tt
; tt
++)
706 if (!((*tt
)->f
&TF_SECRET
) && !try_timer(t
, *tt
, tm
)) return (0);
710 p
= memchr(config
, ',', l
- config
); if (!p
) p
= l
;
711 ops
= find_timer(config
, p
- config
, tm
);
712 if (ops
&& !try_timer(t
, ops
, tm
)) return (0);
717 debug("no suitable %s timer found", timertab
[tm
].what
); return (-1);
720 /* Bench timer operations. */
721 static void timer_describe(struct bench_timer
*tm
, dstr
*d
)
723 struct timer
*t
= (struct timer
*)tm
;
726 dstr_puts(d
, "builtin: ");
727 for (i
= 0; i
< NTIMER
; i
++) {
728 if (i
) dstr_puts(d
, ", ");
729 dstr_putf(d
, "%s = %s", timertab
[i
].what
, t
->ops
[i
]->name
);
733 static int timer_now(struct bench_timer
*tm
,
734 struct bench_time
*t_out
, unsigned f
)
736 struct timer
*t
= (struct timer
*)tm
;
740 for (i
= 0; i
< NTIMER
; i
++) if (t
->ops
[i
]->now(t
, t_out
, f
)) return (-1);
744 static void timer_diff(struct bench_timer
*tm
,
745 struct bench_timing
*t_out
,
746 const struct bench_time
*t0
,
747 const struct bench_time
*t1
)
749 struct timer
*t
= (struct timer
*)tm
;
753 for (i
= 0; i
< NTIMER
; i
++) t
->ops
[i
]->diff(t
, t_out
, t0
, t1
);
756 static void timer_destroy(struct bench_timer
*tm
)
758 struct timer
*t
= (struct timer
*)tm
;
762 for (i
= 0; i
< NTIMER
; i
++)
763 if (t
->ops
[i
]) t
->ops
[i
]->teardown(t
);
/* Bench timer operations vtable, combining the subtimer dispatchers. */
static const struct bench_timerops timer_ops =
  { timer_describe, timer_now, timer_diff, timer_destroy };
770 /* --- @bench_createtimer@ --- *
772 * Arguments: @const char *config@ = timer configuration string
774 * Returns: A freshly constructed standard timer object.
776 * Use: Allocate a timer. Dispose of it by calling
777 * @tm->ops->destroy(tm)@ when you're done.
779 * Applications should not set configuration strings except as
780 * established by user action, e.g., from a command-line option,
781 * environment variable, or configuration file.
784 struct bench_timer
*bench_createtimer(const char *config
)
787 struct bench_timer
*ret
= 0;
788 struct { const char *p
; size_t sz
; } tmconf
[NTIMER
] = { 0 };
789 const struct timer_ops
*const *tt
;
790 const char *p
, *l
; size_t n
, nn
;
793 /* Parse the configuration string. */
796 /* The first thing to do is find the end of the string. */
797 l
= config
+ strlen(config
);
800     /* Process the whitespace-separated words of the string one by one. */
802 /* Skip over any initial whitespace. If we hit the end of the string
806 if (config
>= l
) goto done_config
;
807 else if (!ISSPACE(*config
)) break;
810 /* There's definitely a word here. Find the end of it. */
811 for (p
= config
; p
< l
&& !ISSPACE(*p
); p
++);
814 /* Try various simple keywords. */
815 #define MATCHP(lit) (nn == sizeof(lit) - 1 && MEMCMP(config, ==, lit, nn))
817 if (MATCHP("list")) {
818 /* The `list' keyword requests lists of the available timer
822 for (i
= 0; i
< NTIMER
; i
++) {
823 printf("%s timers:", timertab
[i
].what
);
824 for (tt
= timertab
[i
].opstab
; *tt
; tt
++)
825 if (!((*tt
)->f
)&TF_SECRET
) printf(" %s", (*tt
)->name
);
833 /* Otherwise it's an assignment, setting a subtimer list. */
834 p
= memchr(config
, '=', nn
);
839 for (i
= 0; i
< NTIMER
; i
++)
840 if (STRNCMP(config
, ==, timertab
[i
].what
, n
) &&
841 !timertab
[i
].what
[n
]) {
843 debug("duplicate %s timer list", timertab
[i
].what
);
844 tmconf
[i
].p
= config
+ n
+ 1; tmconf
[i
].sz
= nn
- n
- 1;
848 debug("unrecognized config keyword `%.*s'", (int)n
, config
);
850 /* Move on to the next word. */
857 /* Override these settings from the environment. */
858 for (i
= 0; i
< NTIMER
; i
++) {
859 p
= getenv(timertab
[i
].env
);
860 if (p
) { tmconf
[i
].p
= p
; tmconf
[i
].sz
= strlen(p
); }
863 /* All seems well. Allocate the timer object. */
864 t
= xmalloc(sizeof(*t
));
865 for (i
= 0; i
< NTIMER
; i
++) t
->ops
[i
] = 0;
867 /* Try to set up the subtimers. */
868 for (i
= NTIMER
; i
--; )
869 if (select_timer(t
, i
, tmconf
[i
].p
, tmconf
[i
].sz
)) goto end
;
872 t
->_t
.ops
= &timer_ops
; ret
= &t
->_t
; t
= 0;
874 if (t
) timer_destroy(&t
->_t
);
878 /*----- Benchmarking ------------------------------------------------------*/
880 /* --- @bench_init@ --- *
882 * Arguments: @struct bench_state *b@ = bench state to initialize
883 * @struct bench_timer *tm@ = timer to attach, or null
885 * Returns: Zero on success, %$-1$% on failure.
887 * Use: Initialize the benchmark state. On success, the timer state
888 * still needs to be calibrated (use @bench_calibrate@) before
889 * it can be used, but this will be done automatically by
890 * @bench_measure@ if it's not done by hand earlier. The timer
891 * is now owned by the benchmark state and will be destroyed by
894 * The only reason for failure is if @tm@ was null on entry,
895 * and automatic construction of a timer failed. The state is
896 * safe to discard, but calling @bench_destroy@ is safe too.
899 int bench_init(struct bench_state
*b
, struct bench_timer
*tm
)
906 tm
= bench_createtimer(0);
907 if (!tm
) { rc
= -1; goto end
; }
910 b
->tm
= tm
; b
->target_s
= 1.0; b
->f
= 0; rc
= 0;
915 /* --- @bench_destroy@ --- *
917 * Arguments: @struct bench_state *b@ = bench state
921 * Use: Destroy the benchmark state, releasing the resources that it
925 void bench_destroy(struct bench_state
*b
)
926 { if (b
->tm
) { b
->tm
->ops
->destroy(b
->tm
); b
->tm
= 0; } }
928 /* --- @do_nothing@ --- *
930 * Arguments: @unsigned long n@ = iteration count
931 * @void *ctx@ = context pointer (ignored)
935 * Use: Does nothing at all for @n@ iterations. Used to calibrate
936 * the benchmarking state.
939 static void do_nothing(unsigned long n
, void *ctx
)
940 { while (n
--) RELAX
; }
942 /* --- @measure@ --- *
944 * Arguments: @struct bench_state *b@ = bench state
945 * @struct bench_timing *delta_out@ = where to leave the timing
946 * @bench_fn *fn@ = function to measure
947 * @void *ctx@ = context for the function
948 * @double n@ = number of iterations
952 * Use: Run the function @n@ times, and report how long it took.
954 * This function deals with retrying the measurements if the
955 * timer reports a temporary failure, and all of the
956 * difficulties if @n@ is too large to fit in a machine integer.
959 static void measure(struct bench_state
*b
, struct bench_timing
*delta_out
,
960 bench_fn
*fn
, void *ctx
, double n
)
962 struct bench_timer
*tm
= b
->tm
;
963 struct bench_time t0
, t1
;
964 unsigned long n0
, n1
;
965 double R
= ULONG_MAX
;
970 while (tm
->ops
->now(tm
, &t0
, BTF_T0
));
972 } while (tm
->ops
->now(tm
, &t1
, BTF_T1
));
974 n1
= n
/R
; n0
= n
- n1
*R
;
976 while (tm
->ops
->now(tm
, &t0
, BTF_T0
));
977 while (n1
--) fn(ULONG_MAX
, ctx
);
979 } while (tm
->ops
->now(tm
, &t1
, BTF_T1
));
981 tm
->ops
->diff(tm
, delta_out
, &t0
, &t1
);
984 /* --- @bench_calibrate@ --- *
986 * Arguments: @struct bench_state *b@ = bench state
988 * Returns: Zero on success, %$-1$% if calibration failed.
990 * Use: Calibrate the benchmark state, so that it can be used to
991 * measure performance reasonably accurately.
994 #define T_CLB 0.0625 /* calibration time limit */
996 int bench_calibrate(struct bench_state
*b
)
998 struct linreg lr_clk
= LINREG_INIT
, lr_cy
= LINREG_INIT
;
999 struct bench_timing delta
;
1001 bench_fn
*fn
= LAUNDER(&do_nothing
);
1002 unsigned i
, f
= BTF_ANY
;
1005 /* The model here is that a timing loop has a fixed overhead as we enter
1006 * and leave (e.g., to do with the indirect branch into the code), and
1007 * per-iteration overheads as we check the counter and loop back. We aim
1008 * to split these apart using linear regression.
1011 /* If we've already calibrated then there's nothing to do. */
1012 if (b
->f
&BTF_CLB
) return (b
->f
&BTF_ANY ?
0 : -1);
1014 /* Exercise the inner loop a few times to educate the branch predictor. */
1015 for (i
= 0; i
< 50; i
++) measure(b
, &delta
, fn
, 0, 10000);
1017 /* Now we measure idle loops until they take sufficiently long -- or we run
1020 debug("calibrating...");
1024 /* Measure @n@ iterations of the idle loop. */
1025 measure(b
, &delta
, fn
, 0, n
); f
&= delta
.f
;
1026 if (!(f
&BTF_TIMEOK
)) { rc
= -1; goto end
; }
1028 /* Register the timings with the regression machinery. */
1029 linreg_update(&lr_clk
, n
, delta
.t
);
1031 debug(" n = %10.0f; t = %12g s", n
, delta
.t
);
1033 linreg_update(&lr_cy
, n
, delta
.cy
);
1034 debug(" n = %10.0f; t = %12g s, cy = %10.0f", n
, delta
.t
, delta
.cy
);
1037 /* If we're done then stop. */
1038 if (delta
.t
>= T_CLB
) break;
1039 if (n
>= ULONG_MAX
- n
/3) break;
1041 /* Update the counter and continue. */
1045 /* Now run the linear regression to extract the constant and per-iteration
1048 linreg_fit(&lr_clk
, &b
->clk
.m
, &b
->clk
.c
, &r
);
1049 debug("clock overhead = (%g n + %g) s (r = %g)", b
->clk
.m
, b
->clk
.c
, r
);
1051 linreg_fit(&lr_cy
, &b
->cy
.m
, &b
->cy
.c
, &r
);
1052 debug("cycle overhead = (%g n + %g) cy (r = %g)", b
->cy
.m
, b
->cy
.c
, r
);
1058 b
->f
|= f
| BTF_CLB
; /* no point trying again */
1062 /* --- @bench_measure@ --- *
1064 * Arguments: @struct bench_state *b@ = benchmark state
1065 * @struct bench_timing *t_out@ = where to leave the timing
1066 * @double base@ = number of internal units per call
1067 * @bench_fn *fn@, @void *ctx@ = benchmark function to run
1069 * Returns: Zero on success, %$-1$% if timing failed.
1071 * Use: Measure a function. The function @fn@ is called adaptively
1072 * with an iteration count @n@ set so as to run for
1073 * approximately @b->target_s@ seconds.
1075 * The result is left in @*t_out@, with @t_out->n@ counting the
1076 * final product of the iteration count and @base@ (which might,
1077 * e.g., reflect the number of inner iterations the function
1078 * performs, or the number of bytes it processes per iteration).
1081 int bench_measure(struct bench_state
*b
, struct bench_timing
*t_out
,
1082 double base
, bench_fn
*fn
, void *ctx
)
1086 /* Make sure the state is calibrated and usable. */
1087 if (!(b
->f
&BTF_CLB
) && bench_calibrate(b
)) return (-1);
1088 if (!(b
->f
&BTF_TIMEOK
)) return (-1);
1090 /* Main adaptive measurement loop.
1092 * Suppose the timer loop %$n$% iterations in %$t$% seconds. Our ideal
1093 * time is %$T$% seconds. If %$t \ge T/\sqrt{2}$%, we're happy.
1094 * Otherwise, we need to scale up the iteration count. The obvious next
1095 * choice is %$n' = n T/t$%. Alas, rounding is a problem: if
1096 * %$T/t < 1 + 1/n$% then %$\floor{n T/t} = n$% and we will make no
1097 * progress.  We know that %$T/t > \sqrt{2}$%, so this can only happen when
1098 * %$1 + 1/n > \sqrt{2}$%, i.e., when %$n < \sqrt{2} + 1$%. On the other
1099 * hand, if %$T/t < 1 + 1/n$% then %$t (n + 1)/n > T$%, so just trying
1100 * again with %$n' = n + 1$% iterations will very likely work.
1102 debug("measuring..."); n
= 1.0;
1104 measure(b
, t_out
, fn
, ctx
, n
); t_out
->f
&= b
->f
;
1105 if (!(t_out
->f
&BTF_TIMEOK
)) return (-1);
1106 if (!(t_out
->f
&BTF_CYOK
))
1107 debug(" n = %10.0f; t = %12g", n
, t_out
->t
);
1109 debug(" n = %10.0f; t = %12g, cy = %10.0f", n
, t_out
->t
, t_out
->cy
);
1111 if (t_out
->t
>= 0.707*b
->target_s
) break;
1112 nn
= n
*b
->target_s
/t_out
->t
;
1113 if (n
> ULONG_MAX
|| nn
> (unsigned long)n
+ 1) n
= nn
;
1117 /* Adjust according to the calibration. */
1118 t_out
->t
-= n
*b
->clk
.m
+ b
->clk
.c
;
1119 if (t_out
->f
&BTF_CYOK
) t_out
->cy
-= n
*b
->cy
.m
+ b
->cy
.c
;
1121 /* Report the results, if debugging. */
1122 if (!(t_out
->f
&BTF_CYOK
)) debug(" adjusted t' = %12g", t_out
->t
);
1123 else debug(" adjusted t = %12g, cy = %10.0f", t_out
->t
, t_out
->cy
);
1124 if (!(t_out
->f
&BTF_CYOK
))
1125 debug(" %g s per op; %g ops/s", t_out
->t
/n
, n
/t_out
->t
);
1127 debug(" %g s (%g cy) per op; %g ops/s",
1128 t_out
->t
/n
, t_out
->cy
/n
, n
/t_out
->t
);
1131 t_out
->n
= n
*base
; return (0);
1134 /*----- That's all, folks -------------------------------------------------*/