Commit | Line | Data |
---|---|---|
d056fbdf | 1 | .\" -*-nroff-*- |
c4ccbbf9 MW |
2 | .\" |
3 | .\" Manual for benchmarking core | |
4 | .\" | |
5 | .\" (c) 2024 Straylight/Edgeware | |
6 | .\" | |
7 | . | |
8 | .\"----- Licensing notice --------------------------------------------------- | |
9 | .\" | |
10 | .\" This file is part of the mLib utilities library. | |
11 | .\" | |
12 | .\" mLib is free software: you can redistribute it and/or modify it under | |
13 | .\" the terms of the GNU Library General Public License as published by | |
14 | .\" the Free Software Foundation; either version 2 of the License, or (at | |
15 | .\" your option) any later version. | |
16 | .\" | |
17 | .\" mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | .\" License for more details. | |
21 | .\" | |
22 | .\" You should have received a copy of the GNU Library General Public | |
23 | .\" License along with mLib. If not, write to the Free Software | |
24 | .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | .\" USA. | |
26 | . | |
27 | .\"-------------------------------------------------------------------------- | |
28 | .so ../defs.man \" @@@PRE@@@ | |
29 | . | |
30 | .\"-------------------------------------------------------------------------- | |
31 | .TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library" | |
d056fbdf | 32 | .\" @bench_createtimer |
289651a7 MW |
33 | .\" @BENCH_TIMELOOP_DECLS |
34 | .\" @BENCH_TIMELOOP_TAG | |
35 | . | |
d056fbdf MW |
36 | .\" @bench_init |
37 | .\" @bench_destroy | |
38 | .\" @bench_calibrate | |
289651a7 MW |
39 | .\" @bench_preflight |
40 | .\" @bench_adapt | |
41 | .\" @bench_adjust | |
42 | .\" @BENCH_MEASURE_DECLS | |
43 | .\" @BENCH_MEASURE_TAG | |
44 | .\" @BENCH_MEASURE | |
d056fbdf MW |
45 | .\" @bench_measure |
46 | . | |
289651a7 MW |
47 | .\" @bench_report |
48 | . | |
c4ccbbf9 MW |
49 | .\"-------------------------------------------------------------------------- |
50 | .SH NAME | |
51 | bench \- low-level benchmarking tools | |
52 | . | |
53 | .\"-------------------------------------------------------------------------- | |
d056fbdf | 54 | .SH SYNOPSIS |
c4ccbbf9 | 55 | . |
d056fbdf MW |
56 | .nf |
57 | .B "#include <mLib/bench.h>" | |
58 | .PP | |
289651a7 MW |
59 | .B "#define BTF_TIMEOK ..." |
60 | .B "#define BTF_CYOK ..." | |
61 | .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)" | |
62 | .PP | |
6e683a79 | 63 | .ta 2n +2n +2n |
d056fbdf MW |
64 | .B "struct bench_time {" |
65 | .B " unsigned f;" | |
6e683a79 MW |
66 | .B " union {" |
67 | .B " struct { kludge64 s; uint32 ns; } ts;" | |
68 | .B " clock_t clk;" | |
69 | .B " kludge64 rawns;" | |
70 | .B " } t;" | |
d056fbdf MW |
71 | .B " kludge64 cy;" |
72 | .B "};" | |
73 | .PP | |
74 | .B "struct bench_timing {" | |
75 | .B " unsigned f;" | |
76 | .B " double n;" | |
77 | .B " double t;" | |
78 | .B " double cy;" | |
79 | .B "};" | |
80 | .PP | |
6e683a79 MW |
81 | .B "#define BTF_T0 0u" |
82 | .B "#define BTF_T1 ..." | |
d056fbdf MW |
83 | .B "struct bench_timerops {" |
84 | .BI " void (*describe)(struct bench_timer *" bt ", dstr *" d ); | |
b1a20bee | 85 | .BI " int (*preflight)(struct bench_timer *" bt ); |
6e683a79 MW |
86 | .ta 2n +\w'\fBint (*now)('u |
87 | .BI " int (*now)(struct bench_timer *" bt , | |
88 | .BI " struct bench_time *" t_out ", unsigned " f ); | |
289651a7 | 89 | .ta 2n +\w'\fBvoid (*diff)('u |
6e683a79 MW |
90 | .BI " void (*diff)(struct bench_timer *" bt , |
91 | .BI " struct bench_timing *" delta_out , | |
92 | .BI " const struct bench_time *" t0 , | |
93 | .BI " const struct bench_time *" t1 ); | |
d056fbdf MW |
94 | .BI " void (*destroy)(struct bench_timer *" bt ); |
95 | .B "};" | |
96 | .B "struct bench_timer {" | |
97 | .B " const struct bench_timerops *ops;" | |
289651a7 | 98 | .B " unsigned ref;" |
d056fbdf MW |
99 | .B "};" |
100 | .PP | |
289651a7 MW |
101 | .B "struct bench_timer *bench_createtimer(void);" |
102 | .B "BENCH_TIMELOOP_DECLS;" | |
103 | .ta 2n \w'\fBBENCH_TIMELOOP_TAG('u | |
104 | .BI "BENCH_TIMELOOP_TAG(" tag ", struct bench_timer *" tm , | |
105 | .BI " struct bench_timing *" delta_out ", double " n , | |
106 | .BI " " onbreak ) | |
107 | .BI " " stmt | |
108 | .PP | |
109 | .B "#define BTF_CLB ..." | |
110 | .B "#define BTF_INDIRECT ..." | |
111 | .PP | |
112 | .ta 2n | |
d056fbdf MW |
113 | .B "struct bench_state {" |
114 | .B " unsigned f;" | |
115 | .B " double target_s;" | |
116 | .B " ..." | |
117 | .B "};" | |
118 | .PP | |
119 | .BI "typedef void bench_fn(unsigned long " n ", void *" ctx ); | |
120 | .PP | |
d056fbdf MW |
121 | .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm ); |
122 | .BI "void bench_destroy(struct bench_state *" b ); | |
289651a7 MW |
123 | .BI "int bench_calibrate(struct bench_state *" b ", unsigned " f ); |
124 | .BI "int bench_preflight(struct bench_state *" b ); | |
125 | .ta \w'\fBint bench_adapt('u | |
126 | .BI "int bench_adapt(struct bench_state *" b ", double *" n_inout , | |
127 | .BI " const struct bench_timing *" t ); | |
128 | .ta \w'\fBint bench_adjust('u | |
129 | .BI "int bench_adjust(struct bench_state *" b ", struct bench_timing *" t_inout , | |
130 | .BI " double " n ", double " base ); | |
131 | .B "BENCH_MEASURE_DECLS;" | |
132 | .ta 2n \w'\fBBENCH_MEASURE_TAG('u | |
133 | .BI "BENCH_MEASURE_TAG(" tag ", struct bench_state *" b , | |
134 | .BI "	int &" rc ", struct bench_timing *" t_out ", double " base ) | |
135 | .BI " " stmt | |
136 | .ta 2n \w'\fBBENCH_MEASURE('u | |
137 | .BI "BENCH_MEASURE(struct bench_state *" b , | |
138 | .BI "	int &" rc ", struct bench_timing *" t_out ", double " base ) | |
139 | .BI " " stmt | |
d056fbdf MW |
140 | .ta \w'\fBint bench_measure('u |
141 | .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out , | |
142 | .BI " double " base ", bench_fn *" fn ", void *" ctx ); | |
289651a7 MW |
143 | .PP |
144 | .ta 2n | |
145 | .B "enum {" | |
146 | .B " BTU_OP = 0," | |
147 | .B " BTU_BYTE = 1," | |
148 | .B " ..." | |
149 | .BI " BTU_LIMIT = " n | |
150 | .B "};" | |
151 | .ta \w'\fBvoid bench_report('u | |
152 | .BI "void bench_report(const struct gprintf_ops *" gops ", void *" go , | |
153 | .BI " unsigned " unit ", const struct bench_timing *" t ); | |
154 | .PP | |
d056fbdf MW |
155 | .fi |
156 | . | |
c4ccbbf9 | 157 | .\"-------------------------------------------------------------------------- |
d056fbdf | 158 | .SH DESCRIPTION |
c4ccbbf9 | 159 | . |
d056fbdf MW |
160 | The header file |
161 | .B "<mLib/bench.h>" | |
162 | provides declarations and definitions | |
163 | for performing low-level benchmarks. | |
164 | .PP | |
289651a7 MW |
165 | The `main event' are the |
166 | .B BENCH_MEASURE | |
167 | macro and | |
168 | .B bench_measure | |
169 | function. | |
170 | These will be described in detail later, | |
d056fbdf | 171 | but, in brief, |
289651a7 | 172 | they execute a caller-provided piece of code |
d056fbdf MW |
173 | instructing it to run adaptively chosen numbers of iterations, |
174 | in order to get a reasonably reliable measurement of its running time, | |
289651a7 | 175 | and then report the results by filling in a structure. |
d056fbdf | 176 | .PP |
289651a7 MW |
177 | With understanding these as our objective, |
178 | we must examine all of the pieces involved in making them work. | |
d056fbdf MW |
179 | . |
180 | .SS Timers in general | |
181 | A | |
182 | .I timer | |
183 | is a gadget which is capable of reporting the current time, | |
184 | in seconds (ideally precise to tiny fractions of a second), | |
185 | and/or in CPU cycles. | |
186 | A timer is represented by a pointer to an object of type | |
187 | .BR "struct bench_timer" . | |
289651a7 | 188 | This structure has two members: |
d056fbdf MW |
189 | .BR ops , |
190 | pointing to a | |
191 | .BR "struct bench_timerops" , | |
289651a7 MW |
192 | which is a table of function pointers, |
193 | and | |
194 | .BR ref , | |
195 | which is a simple reference count; | |
d056fbdf MW |
196 | typically, a timer has more data following this, |
197 | but this fact is not exposed to applications. | |
198 | .PP | |
199 | The function pointers in | |
200 | .B "struct bench_timerops" | |
201 | are as follows. | |
202 | The first argument, | |
203 | named | |
204 | .IR tm , | |
205 | must always point to the timer object itself. | |
206 | .TP | |
207 | .IB tm ->ops->describe( tm ", " d ) | |
208 | Write a description of the timer to the dynamic string | |
209 | .IR d . | |
289651a7 | 210 | .TP |
b1a20bee MW |
211 | .IB tm ->ops->preflight( tm ) |
212 | Ensure that the timer is in working order, | |
213 | and perform any necessary per-thread or per-process setup. | |
214 | Return zero if the | |
215 | .B now | |
216 | function is likely to work properly | |
217 | when called from the same thread | |
218 | in the near future; | |
219 | otherwise return \-1. | |
d056fbdf | 220 | .TP |
6e683a79 | 221 | .IB tm ->ops->now( tm ", " t_out ", " f ) |
d056fbdf | 222 | Store the current time in |
6e683a79 | 223 | .BI * t_out \fR. |
d056fbdf | 224 | The |
6e683a79 MW |
225 | .B BTF_T1 |
226 | flag is set in | |
227 | .I f | |
228 | to indicate that this is the second call in a pair; | |
229 | leave it clear for the first call. | |
230 | (A fake | |
231 | .B BTF_T0 | |
232 | flag is defined to be zero for symmetry.) | |
233 | Return zero on success | |
234 | .I or | |
235 | permanent failure; | |
236 | return \-1 if timing failed but | |
237 | trying again immediately has a reasonable chance of success. | |
238 | .TP | |
239 | .IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 ) | |
240 | Store in | |
241 | .BI * delta_out | |
242 | the difference between the two times | |
243 | .I t0 | |
244 | and | |
245 | .IR t1 . | |
d056fbdf MW |
246 | .TP |
247 | .IB tm ->ops->destroy( tm ) | |
248 | Destroy the timer, | |
249 | releasing all of the resources that it holds. | |
250 | .PP | |
289651a7 MW |
251 | In a freshly-created timer, the |
252 | .B ref | |
253 | member is 1. | |
254 | Applications are expected to handle the reference count themselves; | |
255 | the | |
256 | .B destroy | |
257 | function does not check or decrement the count. | |
258 | Code for destroying the timer when it's no longer needed | |
259 | might look like this. | |
260 | .VS | |
261 | if (!--tm->ref) tm->ops->destroy(tm); | |
262 | .VE | |
6e683a79 MW |
263 | A |
264 | .B bench_timing | |
265 | structure reports the difference between two times, | |
266 | as determined by a timer's | |
267 | .B diff | |
268 | function. | |
269 | It has four members. | |
270 | .TP | |
d056fbdf | 271 | .B f |
6e683a79 | 272 | A flags word. |
d056fbdf | 273 | .B BTF_TIMEOK |
6e683a79 MW |
274 | is set if the passage-of-time measurement in |
275 | .B t | |
276 | is valid; | |
d056fbdf | 277 | .B BTF_CYOK |
6e683a79 | 278 | is set if the cycle count in |
d056fbdf MW |
279 | .B cy |
280 | is valid. | |
d056fbdf MW |
281 | The mask |
282 | .B BTF_ANY | |
283 | covers the | |
284 | .B BTF_TIMEOK | |
285 | and | |
286 | .B BTF_CYOK | |
287 | bits: | |
288 | hence, | |
6e683a79 | 289 | .B f&BTF_ANY |
d056fbdf MW |
290 | is nonzero (true) |
291 | if the timer returned any valid timing information. | |
6e683a79 MW |
292 | .TP |
293 | .B n | |
289651a7 | 294 | The number of units processed by the benchmark computation |
6e683a79 | 295 | on its satisfactory run, |
289651a7 MW |
296 | multiplied by a given |
297 | .IR base | |
298 | \(en see | |
299 | .BR BENCH_MEASURE , | |
300 | .BR bench_measure , | |
301 | and | |
302 | .BR bench_adjust . | |
6e683a79 MW |
303 | .TP |
304 | .B t | |
305 | The time taken for the satisfactory run of the benchmark function, | |
306 | in seconds. | |
307 | Only valid if | |
308 | .B BTF_TIMEOK | |
309 | is set in | |
310 | .BR f . | |
311 | .TP | |
312 | .B cy | |
313 | The number of CPU cycles used | |
314 | in the satisfactory run | |
315 | of the benchmark function. | |
316 | Only valid if | |
317 | .B BTF_CYOK | |
318 | is set in | |
319 | .BR f . | |
320 | .PP | |
321 | A | |
322 | .B "struct bench_time" | |
5c0f2e08 | 323 | represents a single instant in time, |
6e683a79 MW |
324 | as captured by a timer's |
325 | .B now | |
326 | function. | |
327 | The use of this structure is a private matter for the timer: | |
328 | the only hard requirement is that the | |
329 | .B diff | |
330 | function should be able to compute the difference between two times. | |
331 | However, the intent is that | |
332 | a passage-of-time measurement is stored in the | |
333 | .B t | |
334 | union, | |
335 | a cycle count is stored in the | |
336 | .B cy | |
337 | member, and | |
338 | the | |
339 | .B f | |
340 | member stores flags | |
341 | .B BTF_TIMEOK | |
342 | and/or | |
343 | .B BTF_CYOK | |
344 | if the passage-of-time or cycle count respectively are valid. | |
289651a7 MW |
345 | .PP |
346 | The | |
347 | .B BENCH_TIMELOOP_TAG | |
348 | macro uses a timer to measure a number of iterations of a computation. | |
349 | It requires the declarations made by | |
350 | .B BENCH_TIMELOOP_DECLS | |
351 | to be in scope, | |
352 | ideally within an enclosing block | |
353 | (rather than at top-level, | |
354 | where they'll have static storage duration, | |
355 | and take longer to access). | |
356 | The macro's expansion is syntactically a statement head; | |
357 | see | |
358 | .BR control (3) | |
359 | for details about the underlying machinery. | |
360 | In more detail, the macro is invoked as | |
361 | .IP | |
362 | .nf | |
363 | .ta 2n | |
364 | .BI "BENCH_TIMELOOP_TAG(" tag ", " tm ", " delta_out ", " n ", " onbreak ) | |
365 | .BI " " stmt | |
366 | .fi | |
367 | .PP | |
368 | The | |
369 | .I tag | |
370 | argument is used to distinguish | |
371 | the labels used internally by the macro: | |
372 | see | |
373 | .BR control (3) | |
374 | for details about tags. | |
375 | The macro calls on the timer | |
376 | .I tm | |
377 | to determine the initial time and cycle counts, | |
378 | performs | |
379 | .I n | |
380 | iterations of some computation, | |
381 | and calls on the timer a second time | |
382 | to determine the final time and cycle counts, | |
383 | and to store the difference in | |
384 | .BI * delta_out \fR. | |
385 | The | |
386 | .I stmt | |
387 | may be any C statement: | |
388 | when it is executed, | |
389 | the variable | |
390 | .BR _bench_n , | |
391 | of type | |
392 | .BR "unsigned long" , | |
393 | is in scope. | |
394 | The statement should perform | |
395 | .B _bench_n | |
396 | iterations of the computation to be measured | |
397 | \(en and do as little else as possible. | |
398 | The argument | |
399 | .I n | |
400 | to the macro | |
401 | may be larger than | |
402 | .BR ULONG_MAX : | |
403 | the macro will execute | |
404 | .I stmt | |
405 | multiple times if necessary. | |
406 | The statement is allowed to clobber | |
407 | .BR _bench_n . | |
408 | Note that | |
409 | .B BENCH_TIMELOOP_TAG | |
410 | does | |
411 | .I not | |
412 | call the timer's | |
413 | .B preflight | |
414 | function. | |
415 | If the | |
416 | .I stmt | |
417 | executes a free | |
418 | .B break | |
419 | statement | |
420 | then the statement | |
421 | .I onbreak | |
422 | is executed; | |
423 | a free | |
424 | .B continue | |
425 | statement within | |
426 | .I stmt | |
427 | currently does not have a useful behaviour. | |
428 | Free | |
429 | .B break | |
430 | and | |
431 | .B continue | |
432 | statements within | |
433 | .I onbreak | |
434 | behave normally. | |
435 | (See | |
436 | .BR control (3) | |
437 | for a definition of | |
438 | `free' | |
439 | .B break | |
440 | and | |
441 | .B continue | |
442 | statements.) | |
d056fbdf MW |
443 | . |
444 | .SS The built-in timer | |
445 | The function | |
446 | .B bench_createtimer | |
447 | constructs and returns a timer. | |
448 | It takes a single argument, | |
449 | a string | |
450 | .IR config , | |
451 | from which it reads configuration information. | |
452 | If | |
453 | .B bench_createtimer | |
454 | fails, it returns a null pointer. | |
455 | .PP | |
456 | The | |
457 | .I config | |
458 | pointer may safely be null, | |
459 | in which case a default configuration will be used. | |
460 | Applications | |
461 | .I should only | |
462 | set this pointer to a value supplied by a user, | |
463 | e.g., through a command-line argument, | |
464 | environment variable, or | |
465 | configuration file. | |
466 | .PP | |
467 | The built-in timer makes use of one or two | |
468 | .IR subtimers : | |
469 | a `clock' subtimer to measure the passage of time, | |
470 | and possibly a `cycle' subtimer to count CPU cycles. | |
471 | .PP | |
472 | The configuration string consists of a sequence of words | |
473 | separated by whitespace. | |
474 | There may be additional whitespace at the start and end of the string. | |
475 | The words recognized are as follows. | |
476 | .TP | |
477 | .B list | |
478 | Prints a list of the available clock and cycle subtimers | |
479 | to standard output. | |
480 | .TP | |
481 | .BI clock= t , ... | |
482 | Use the first of the listed clock subtimers | |
483 | to initialize successfully | |
484 | as the clock subtimer. | |
485 | If none of the subtimers can be initialized, | |
486 | then construction of the timer as a whole fails. | |
487 | .TP | |
488 | .BI cycle= t , ... | |
489 | Use the first of the listed subtimers | |
490 | to initialize successfully | |
491 | as the cycle subtimer. | |
492 | If none of the subtimers can be initialized, | |
493 | then construction of the timer as a whole fails. | |
494 | .PP | |
495 | The clock subtimers are as follows. | |
496 | Not all of them will be available on every platform. | |
497 | .TP | |
6e683a79 MW |
498 | .B linux-x86-perf-rdpmc-hw-cycles |
499 | This is a dummy companion to the similarly named cycle subtimer; | |
500 | see its description below. | |
501 | .TP | |
d056fbdf MW |
502 | .B posix-thread-cputime |
503 | Measures the passage of time using | |
504 | .BR clock_gettime (2), | |
505 | specifying the | |
506 | .B CLOCK_\%THREAD_\%CPUTIME_\%ID | |
507 | clock. | |
508 | .TP | |
509 | .B stdc-clock | |
510 | Measures the passage of time using | |
511 | .BR clock (3). | |
512 | Since | |
513 | .BR clock (3) | |
514 | is part of the original ANSI\ C standard, | |
515 | this subtimer should always be available. | |
516 | However, it may produce unhelpful results | |
517 | if other threads are running. | |
518 | .PP | |
519 | The cycle subtimers are as follows. | |
520 | Not all of them will be available on every platform. | |
521 | .TP | |
6e683a79 MW |
522 | .B linux-perf-read-hw-cycles |
523 | Counts CPU cycles using the Linux-specific | |
d056fbdf MW |
524 | .BR perf_event_open (2) |
525 | function to read the | |
526 | .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES | |
527 | counter. | |
528 | Only available on Linux. | |
529 | It will fail to initialize | |
530 | if access to performance counters is restricted, | |
531 | e.g., because the | |
532 | .B /proc/sys/kernel/perf_event_paranoid | |
533 | level is too high. | |
534 | .TP | |
6e683a79 MW |
535 | .B linux-x86-perf-rdpmc-hw-cycles |
536 | Counts CPU cycles using the Linux-specific | |
537 | .BR perf_event_open (2) | |
538 | function, | |
539 | as for | |
540 | .B linux-perf-read-hw-cycles | |
541 | above, | |
542 | except that it additionally uses the i386/AMD64 | |
d056fbdf | 543 | .B rdtsc |
6e683a79 MW |
544 | and |
545 | .B rdpmc | |
546 | instructions, | |
547 | together with information provided by the kernel | |
548 | through a memory-mapped page | |
549 | to do its measurements without any system call overheads. | |
550 | It does passage-of-time and cycle counting in a single operation, | |
551 | so no separate clock subtimer is required: | |
552 | the similarly-named clock subtimer does nothing | |
553 | except check that the | |
554 | .B linux-x86-perf-rdpmc-hw-cycles | |
555 | cycle subtimer has been selected. | |
289651a7 MW |
556 | This is almost certainly the best choice if it's available. |
557 | It is, however, not compatible with (at least some versions of) | |
558 | .BR valgrind (1); | |
559 | it will detect that it is running under | |
560 | .B valgrind | |
561 | and fail to initialize. | |
6e683a79 MW |
562 | .TP |
563 | .B x86-rdtscp | |
564 | Counts CPU cycles using the x86 | |
565 | .B rdtscp | |
d056fbdf MW |
566 | instruction. |
567 | This instruction is not really suitable for performance measurement: | |
568 | it gives misleading results on CPUs with variable clock frequency. | |
569 | .TP | |
6e683a79 MW |
570 | .B x86-rdtsc |
571 | Counts CPU cycles using the x86 | |
572 | .B rdtsc | |
573 | instruction. | |
574 | This has the downsides of | |
575 | .B rdtscp | |
576 | above, | |
577 | but also fails to detect when the thread has been suspended | |
578 | or transferred to a different CPU core | |
579 | and gives misleading answers in this case. | |
580 | Not really recommended. | |
581 | .TP | |
d056fbdf MW |
582 | .B null |
583 | A dummy cycle counter, | |
584 | which will initialize successfully | |
585 | and then fail to report cycle counts. | |
586 | This is a reasonable fallback in many situations. | |
587 | .PP | |
588 | The built-in preference order for clock subtimers, | |
589 | from most to least preferred, is | |
6e683a79 | 590 | .BR linux-x86-perf-rdpmc-hw-cycles , |
d056fbdf | 591 | followed by |
6e683a79 MW |
592 | .BR posix-thread-cputime , |
593 | and finally | |
d056fbdf MW |
594 | .BR stdc-clock . |
595 | The built-in preference order for cycle subtimers, | |
596 | from most to least preferred, is | |
6e683a79 MW |
597 | .BR linux-x86-perf-rdpmc-hw-cycles |
598 | then | |
599 | .BR linux-perf-read-hw-cycles , | |
d056fbdf | 600 | followed by |
6e683a79 MW |
601 | .BR x86-rdtscp , |
602 | and | |
d056fbdf | 603 | .BR x86-rdtsc , |
6e683a79 | 604 | and finally |
d056fbdf MW |
605 | .BR null . |
606 | . | |
607 | .SS The benchmark state | |
608 | A | |
609 | .I benchmark state | |
610 | tracks the information needed to measure performance of functions. | |
611 | It is represented by a | |
612 | .B struct bench_state | |
613 | structure. | |
614 | .PP | |
615 | The benchmark state is initialized by calling | |
616 | .BR bench_init , | |
617 | passing the address of the state structure to be initialized, | |
618 | and a pointer to a timer. | |
619 | If | |
620 | .B bench_init | |
621 | is called with a non-null timer pointer, | |
622 | then it will not fail; | |
623 | the benchmark state will be initialized, | |
289651a7 MW |
624 | and the function returns zero; |
625 | the timer's reference count is | |
626 | .I not | |
627 | incremented. | |
d056fbdf MW |
628 | If the timer pointer is null, |
629 | then | |
630 | .B bench_init | |
631 | attempts to construct a timer for itself | |
632 | by calling | |
633 | .BR bench_createtimer . | |
634 | If this succeeds, | |
635 | then the benchmark state will be initialized, | |
636 | and the function returns zero. | |
637 | In both cases, | |
289651a7 | 638 | the timer reference becomes owned by the benchmark state: |
d056fbdf MW |
639 | calling |
640 | .B bench_destroy | |
289651a7 MW |
641 | on the benchmark state will decrement the timer's reference count, |
642 | and destroy it unless it has additional outstanding references. | |
d056fbdf MW |
643 | If |
644 | .B bench_init | |
645 | is called with a null timer pointer, | |
646 | and its attempt to create a timer for itself fails, | |
647 | then | |
648 | .B bench_init | |
289651a7 | 649 | returns \-1: |
d056fbdf | 650 | the benchmark state is not initialized |
289651a7 | 651 | and can safely be discarded. |
d056fbdf MW |
652 | .PP |
653 | Calling | |
654 | .B bench_destroy | |
655 | on a benchmark state | |
656 | releases any resources it holds, | |
657 | most notably its timer, if any. | |
289651a7 MW |
658 | Calling |
659 | .B bench_destroy | |
660 | on an unsuccessfully initialized benchmark state | |
661 | is safe but has no effect. | |
d056fbdf MW |
662 | .PP |
663 | Although | |
664 | .B struct bench_state | |
665 | is defined in the header file, | |
666 | only two members are available for use by applications. | |
667 | .TP | |
668 | .B f | |
669 | A word containing flags. | |
670 | .TP | |
671 | .B target_s | |
672 | The target time for which to try to run a benchmark, in seconds. | |
673 | After initialization, this is set to 1.0, | |
674 | though applications can override it. | |
675 | .PP | |
676 | Before the benchmark state can be used in measurements, | |
677 | it must be | |
678 | .IR calibrated . | |
679 | This is performed by calling | |
680 | .B bench_calibrate | |
681 | on the benchmark state. | |
682 | Calibration takes a noticeable amount of time | |
683 | (currently about 0.25\*,s), | |
684 | so it makes sense to defer it until it's known to be necessary. | |
685 | .PP | |
686 | Calibration is carried out separately, but in parallel, | |
687 | for the timer's passage-of-time measurement and cycle counter. | |
688 | Either or both of these calibrations can succeed or fail; | |
689 | if passage-of-time calibration fails, | |
690 | then cycle count calibration is impossible. | |
691 | .PP | |
289651a7 MW |
692 | The benchmarking state must be calibrated differently |
693 | for different kinds of timing loop; | |
694 | this is controlled by the flags passed as the | |
695 | .I f | |
696 | argument to | |
697 | .BR bench_calibrate . | |
698 | The main difference lies in whether the code to be measured | |
699 | is called | |
700 | .IR indirectly , | |
701 | i.e., via a function pointer. | |
702 | Set | |
703 | .B BTF_INDIRECT | |
704 | if the code is to be called indirectly; | |
705 | leave this flag clear if the code is called directly. | |
706 | The | |
707 | .B bench_measure | |
708 | function always makes indirect calls; | |
709 | the | |
710 | .B BENCH_MEASURE | |
711 | macro does not itself make indirect calls. | |
712 | Usually, a program needs only one or the other; | |
713 | if both are necessary for some reason, | |
714 | the best approach is just to set up two benchmarking states | |
715 | sharing the same timer, | |
716 | and calibrate them separately. | |
717 | .PP | |
d056fbdf MW |
718 | When it completes, |
719 | .B bench_calibrate | |
289651a7 | 720 | sets flags in the benchmark state's |
d056fbdf MW |
721 | .B f |
722 | member: | |
723 | if passage-of-time calibration succeeded, | |
724 | .B BTF_TIMEOK | |
725 | is set; | |
726 | if cycle-count calibration succeeded, | |
727 | .B BTF_CYOK | |
728 | is set; | |
729 | and the flag | |
730 | .B BTF_CLB | |
731 | is set unconditionally, | |
732 | as a persistent indication that calibration has been attempted. | |
733 | .PP | |
734 | The | |
735 | .B bench_calibrate | |
736 | function returns zero if it successfully calibrated | |
737 | at least the passage-of-time measurement; | |
738 | otherwise, it returns \-1. | |
739 | If | |
740 | .B bench_calibrate | |
741 | is called for a second or subsequent time on the same benchmark state, | |
742 | it returns immediately, | |
743 | either returning 0 or \-1 | |
744 | according to whether passage-of-time had previously been calibrated. | |
289651a7 MW |
745 | .PP |
746 | The | |
747 | .B BENCH_MEASURE | |
748 | macro measures the performance of a computation. | |
749 | It requires the declarations made by | |
750 | .B BENCH_MEASURE_DECLS | |
751 | to be in scope, | |
752 | ideally within an enclosing block | |
753 | (rather than at top-level, | |
754 | where they'll have static storage duration, | |
755 | and take longer to access). | |
756 | The macro's expansion is syntactically a statement head; | |
757 | see | |
758 | .BR control (3) | |
759 | for details about the underlying machinery. | |
760 | In more detail, the macro is invoked as | |
761 | .IP | |
762 | .nf | |
763 | .ta 2n | |
764 | .BI "BENCH_MEASURE(" b ", " rc ", " t_out ", " base ) | |
765 | .BI " " stmt | |
766 | .fi | |
767 | .PP | |
768 | The | |
769 | .I stmt | |
770 | can be any C statement; | |
771 | it should perform | |
772 | .B _bench_n | |
773 | iterations of the computation to be measured. | |
774 | (The variable | |
775 | .B _bench_n | |
776 | is declared as part of | |
777 | .B BENCH_MEASURE_DECLS | |
778 | and has type | |
779 | .BR "unsigned long" .) | |
780 | Before commencing measurement proper, | |
781 | the macro calls | |
782 | .BR bench_preflight , | |
783 | described below, | |
784 | to check that everything is set up properly | |
785 | for measurements on the current thread; | |
786 | if this fails, then the macro sets | |
787 | .I rc | |
788 | to \-1. | |
789 | Otherwise, the macro executes | |
790 | .I stmt | |
791 | one or more times, | |
792 | with the objective of finding an iteration count | |
793 | .I n | |
794 | such that | |
795 | .I n | |
796 | iterations of the computation take more than | |
797 | .IB b ->target_s "" \fR/\(sr2 | |
798 | seconds. | |
799 | If measurement fails, | |
800 | then | |
801 | .I rc | |
802 | is set to \-1; | |
803 | otherwise, | |
804 | .I rc | |
805 | is set to zero, and | |
806 | .BI * t_out | |
807 | is filled in with the measurement; | |
808 | .IB t_out ->n | |
809 | is set to | |
810 | .IR n "\ \(mu\ " base . | |
811 | .PP | |
812 | The | |
813 | .B BENCH_MEASURE_TAG | |
814 | macro works just like | |
815 | .B BENCH_MEASURE | |
816 | except that it takes an additional | |
817 | .I tag | |
818 | argument used to distinguish the internal labels | |
819 | used by the macro's implementation; | |
820 | this makes it possible to use | |
821 | .B BENCH_MEASURE_TAG | |
822 | as a component in more complex macros. | |
823 | See | |
824 | .BR control (3) | |
825 | for details about control-structure macros | |
826 | and the meaning and format of tags. | |
827 | .PP | |
828 | The function | |
829 | .B bench_measure | |
830 | is similar, | |
831 | except that it calls a | |
d056fbdf | 832 | .I benchmark function |
289651a7 MW |
833 | to perform the computation. |
834 | A benchmark function has the signature | |
d056fbdf MW |
835 | .IP |
836 | .BI "void " fn "(unsigned long " n ", void *" ctx ); | |
837 | .PP | |
838 | When called, it should perform the operation to be measured | |
839 | .I n | |
840 | times. | |
841 | The | |
842 | .I ctx | |
843 | argument is a pointer passed into | |
844 | .B bench_measure | |
845 | for the benchmark function's own purposes. | |
289651a7 | 846 | The |
d056fbdf | 847 | .B bench_measure |
289651a7 MW |
848 | function returns zero on success, |
849 | or \-1 on failure. | |
850 | Note that | |
d056fbdf | 851 | .B bench_measure |
289651a7 MW |
852 | invokes the benchmark indirectly, |
853 | so the benchmark state should have been calibrated with | |
854 | .BR BTF_INDIRECT . | |
855 | . | |
856 | .SS Measurement utilities | |
857 | The following functions are primarily exported for the benefit of the | |
858 | .B BENCH_MEASURE | |
859 | macro, | |
860 | but are documented here in case they are useful. | |
d056fbdf MW |
861 | .PP |
862 | The | |
289651a7 MW |
863 | .B bench_preflight |
864 | function prepares a benchmarking state for use. | |
865 | It checks that the timer is calibrated | |
866 | and suitable for measuring passage-of-time; | |
867 | it also calls the timer's | |
868 | .B preflight | |
869 | function to prepare it for measurements on the current thread. | |
870 | If these checks succeed, then | |
871 | .B bench_preflight | |
872 | returns zero; | |
873 | otherwise it returns \-1 | |
874 | and the caller should not proceed with measurements. | |
875 | .PP | |
876 | The | |
877 | .B bench_adapt | |
878 | function is used to determine iteration counts. | |
879 | It is used in a loop such as the following. | |
880 | .IP | |
881 | .nf | |
882 | .ta 2n +2n | |
883 | .B "BENCH_TIMELOOP_DECLS;" | |
884 | .B "struct bench_timer *tm;" | |
885 | .B "struct bench_timing t;" | |
886 | .B "double n = 1.0, target_s = 1.0;" | |
887 | .IP | |
888 | .B "do {" | |
889 | .B " BENCH_TIMELOOP_TAG(time, tm, &t, n, { break; })" | |
890 | .BI " " "(do " _bench_n " iterations of some computation)" ; | |
891 | .B "} while (!bench_adapt(&n, target_s, &t));" | |
892 | .fi | |
893 | .PP | |
894 | On entry, | |
895 | .BI * n_inout | |
896 | should be the number of iterations performed by the previous step, | |
897 | and | |
898 | .BI * t | |
899 | the resulting time; | |
900 | the | |
901 | .B BTF_TIMEOK | |
902 | flag must be set in | |
903 | .IB t ->f \fR. | |
904 | If the timing is sufficient \(en if | |
905 | .IR t\fB->t "\ \*(>=\ " target_s /\(sr2 | |
906 | \(en then | |
907 | .B bench_adapt | |
908 | returns a nonzero value to indicate that measurement is complete. | |
909 | Otherwise, it sets | |
910 | .BI * n_inout | |
911 | to a new, larger iteration count | |
912 | and returns zero to indicate that a further pass is necessary. | |
913 | .PP | |
914 | The | |
915 | .B bench_adjust | |
916 | function adjusts a raw timing, | |
917 | as captured by | |
918 | .BR BENCH_TIMELOOP_TAG , | |
919 | according to the calibration data captured in | |
920 | .IR b . | |
921 | On exit, the timing data is updated, | |
922 | and | |
923 | .IB t ->n | |
924 | is set to the product | |
925 | .IR n "\ \(mu\ " base . | |
926 | . | |
927 | .SS Reporting results | |
928 | The | |
929 | .B bench_report | |
930 | function formats a measurement result | |
931 | into a human-readable string. | |
932 | The function writes its output using the | |
933 | generalized output formatting operations | |
934 | .I gops | |
935 | and output pointer | |
936 | .IR go ; | |
937 | see | |
938 | .BR gprintf (3) | |
939 | for details on generalized output formatting. | |
940 | The | |
941 | .I unit | |
942 | argument describes the unit of activity being measured: | |
943 | .TP | |
944 | .B BTU_OP | |
945 | counts operations of some unspecified nature, while | |
946 | .TP | |
947 | .B BTU_BYTE | |
948 | counts a number of bytes processed. | |
949 | .PP | |
950 | These are presented differently | |
951 | \(em in particular, | |
952 | quantities of bytes are expressed using binary scaling rather than decimal. | |
953 | The timing to report is given by the | |
954 | .I t | |
955 | argument; | |
956 | .IB t ->n | |
957 | gives the number of units processed. | |
958 | . | |
959 | .\"-------------------------------------------------------------------------- | |
960 | .SH EXAMPLE | |
d056fbdf | 961 | . |
289651a7 MW |
962 | The following macros offer a fairly simple example of |
963 | how the benchmarking functions and macros can be used. | |
964 | .VS | |
965 | .ta 2n +2n +2n 2n+\w'\fBBENCH_MEASURE_TAG('u \n(.lu-\n(.iu-4n | |
966 | #define BENCHMARK_DECLS \e | |
967 | struct bench_timing _bmark_t; \e | |
968 | int _bmark_rc; \e | |
969 | BENCH_MEASURE_DECLS | |
970 | .VP | |
971 | #define BENCHMARK_TAG(tag, b, unit, base) \e | |
972 | MC_BEFORE(tag##__benchmark_before, { fflush(stdout); }) \e | |
973 | MC_AFTER(tag##__benchmark_after, { \e | |
974 | if (_bmark_rc) \e | |
c752173d | 975 | puts(": FAILED"); \e |
289651a7 MW |
976 | else { \e |
977 | fputs(": ", stdout); \e | |
978 | bench_report(&file_printops, stdout, (unit), &_bmark_t);\ \e | |
c752173d | 979 | putchar('\en'); \e |
289651a7 MW |
980 | } \e |
981 | }) \e | |
982 | BENCH_MEASURE_TAG(tag##__benchmark_measure, \e | |
983 | (b), _bmark_rc, &_bmark_t, (base)) | |
984 | #define BENCHMARK(b, unit, base) \e | |
985 | BENCHMARK_TAG(bench, b, unit, base) | |
986 | .VE | |
987 | ||
c4ccbbf9 | 988 | .\"-------------------------------------------------------------------------- |
d056fbdf | 989 | .SH "SEE ALSO" |
c4ccbbf9 | 990 | . |
289651a7 MW |
991 | .BR control (3), |
992 | .BR macros (3), | |
c4ccbbf9 | 993 | .BR tvec-bench (3), |
d056fbdf MW |
994 | .BR mLib (3). |
995 | . | |
c4ccbbf9 | 996 | .\"-------------------------------------------------------------------------- |
d056fbdf | 997 | .SH AUTHOR |
c4ccbbf9 | 998 | . |
d056fbdf | 999 | Mark Wooding, <mdw@distorted.org.uk> |
c4ccbbf9 MW |
1000 | . |
1001 | .\"----- That's all, folks -------------------------------------------------- |