Commit | Line | Data |
---|---|---|
d056fbdf | 1 | .\" -*-nroff-*- |
c4ccbbf9 MW |
2 | .\" |
3 | .\" Manual for benchmarking core | |
4 | .\" | |
5 | .\" (c) 2024 Straylight/Edgeware | |
6 | .\" | |
7 | . | |
8 | .\"----- Licensing notice --------------------------------------------------- | |
9 | .\" | |
10 | .\" This file is part of the mLib utilities library. | |
11 | .\" | |
12 | .\" mLib is free software: you can redistribute it and/or modify it under | |
13 | .\" the terms of the GNU Library General Public License as published by | |
14 | .\" the Free Software Foundation; either version 2 of the License, or (at | |
15 | .\" your option) any later version. | |
16 | .\" | |
17 | .\" mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | .\" License for more details. | |
21 | .\" | |
22 | .\" You should have received a copy of the GNU Library General Public | |
23 | .\" License along with mLib. If not, write to the Free Software | |
24 | .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | .\" USA. | |
26 | . | |
27 | .\"-------------------------------------------------------------------------- | |
28 | .so ../defs.man \" @@@PRE@@@ | |
29 | . | |
30 | .\"-------------------------------------------------------------------------- | |
31 | .TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library" | |
d056fbdf MW |
32 | .\" @bench_createtimer |
33 | .\" @bench_init | |
34 | .\" @bench_destroy | |
35 | .\" @bench_calibrate | |
36 | .\" @bench_measure | |
37 | . | |
c4ccbbf9 MW |
38 | .\"-------------------------------------------------------------------------- |
39 | .SH NAME | |
40 | bench \- low-level benchmarking tools | |
41 | . | |
42 | .\"-------------------------------------------------------------------------- | |
d056fbdf | 43 | .SH SYNOPSIS |
c4ccbbf9 | 44 | . |
d056fbdf MW |
45 | .nf |
46 | .B "#include <mLib/bench.h>" | |
47 | .PP | |
6e683a79 | 48 | .ta 2n +2n +2n |
d056fbdf MW |
49 | .B "struct bench_time {" |
50 | .B " unsigned f;" | |
6e683a79 MW |
51 | .B " union {" |
52 | .B " struct { kludge64 s; uint32 ns; } ts;" | |
53 | .B " clock_t clk;" | |
54 | .B " kludge64 rawns;" | |
55 | .B " } t;" | |
d056fbdf MW |
56 | .B " kludge64 cy;" |
57 | .B "};" | |
58 | .PP | |
59 | .B "struct bench_timing {" | |
60 | .B " unsigned f;" | |
61 | .B " double n;" | |
62 | .B " double t;" | |
63 | .B " double cy;" | |
64 | .B "};" | |
65 | .PP | |
6e683a79 MW |
66 | .B "#define BTF_T0 0u" |
67 | .B "#define BTF_T1 ..." | |
d056fbdf MW |
68 | .B "struct bench_timerops {" |
69 | .BI " void (*describe)(struct bench_timer *" bt ", dstr *" d ); | |
6e683a79 MW |
70 | .ta 2n +\w'\fBint (*now)('u |
71 | .BI " int (*now)(struct bench_timer *" bt , | |
72 | .BI " struct bench_time *" t_out ", unsigned " f ); | |
73 | .ta 2n +\w'\void (*diff)('u | |
74 | .BI " void (*diff)(struct bench_timer *" bt , | |
75 | .BI " struct bench_timing *" delta_out , | |
76 | .BI " const struct bench_time *" t0 , | |
77 | .BI " const struct bench_time *" t1 ); | |
d056fbdf MW |
78 | .BI " void (*destroy)(struct bench_timer *" bt ); |
79 | .B "};" | |
80 | .B "struct bench_timer {" | |
81 | .B " const struct bench_timerops *ops;" | |
82 | .B "};" | |
83 | .PP | |
84 | .B "struct bench_state {" | |
85 | .B " unsigned f;" | |
86 | .B " double target_s;" | |
87 | .B " ..." | |
88 | .B "}"; | |
89 | .PP | |
90 | .BI "typedef void bench_fn(unsigned long " n ", void *" ctx ); | |
91 | .PP | |
92 | .B "#define BTF_TIMEOK ..." | |
93 | .B "#define BTF_CYOK ..." | |
94 | .B "#define BTF_CLB ..." | |
95 | .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)" | |
96 | .PP | |
97 | .B "struct bench_timer *bench_createtimer(void);" | |
98 | .PP | |
99 | .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm ); | |
100 | .BI "void bench_destroy(struct bench_state *" b ); | |
101 | .BI "int bench_calibrate(struct bench_state *" b ); | |
102 | .ta \w'\fBint bench_measure('u | |
103 | .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out , | |
104 | .BI " double " base ", bench_fn *" fn ", void *" ctx ); | |
105 | .fi | |
106 | . | |
c4ccbbf9 | 107 | .\"-------------------------------------------------------------------------- |
d056fbdf | 108 | .SH DESCRIPTION |
c4ccbbf9 | 109 | . |
d056fbdf MW |
110 | The header file |
111 | .B "<mLib/bench.h>" | |
112 | provides declarations and defintions | |
113 | for performing low-level benchmarks. | |
114 | .PP | |
115 | The `main event' is | |
116 | .BR bench_measure . | |
117 | This function will be described in detail later, | |
118 | but, in brief, | |
119 | it calls a caller-provided function, | |
120 | instructing it to run adaptively chosen numbers of iterations, | |
121 | in order to get a reasonably reliable measurement of its running time, | |
122 | and then reports its results by filling in a structure. | |
123 | .PP | |
124 | With understanding this function as our objective, | |
125 | we must examine all of the pieces involved in making it work. | |
126 | . | |
127 | .SS Timers in general | |
128 | A | |
129 | .I timer | |
130 | is a gadget which is capable of reporting the current time, | |
131 | in seconds (ideally precise to tiny fractions of a second), | |
132 | and/or in CPU cycles. | |
133 | A timer is represented by a pointer to an object of type | |
134 | .BR "struct bench_timer" . | |
135 | This structure has a single member, | |
136 | .BR ops , | |
137 | pointing to a | |
138 | .BR "struct bench_timerops" , | |
139 | which is a table of function pointers; | |
140 | typically, a timer has more data following this, | |
141 | but this fact is not exposed to applications. | |
142 | .PP | |
143 | The function pointers in | |
144 | .B "struct bench_timerops" | |
145 | are as follows. | |
146 | The first argument, | |
147 | named | |
148 | .I tm | |
149 | must always point to the timer object itself. | |
150 | .TP | |
151 | .IB tm ->ops->describe( tm ", " d) | |
152 | Write a description of the timer to the dynamic string | |
153 | .IR d . | |
154 | .TP | |
6e683a79 | 155 | .IB tm ->ops->now( tm ", " t_out ", " f ) |
d056fbdf | 156 | Store the current time in |
6e683a79 | 157 | .BI * t_out \fR. |
d056fbdf | 158 | The |
6e683a79 MW |
159 | .B BTF_T1 |
160 | flag in | |
161 | .I f | |
162 | to indicate that this is the second call in a pair; | |
163 | leave it clear for the first call. | |
164 | (A fake | |
165 | .B BTF_T0 | |
166 | flag is defined to be zero for symmetry.) | |
167 | Return zero on success | |
168 | .I or | |
169 | permanent failure; | |
170 | return \-1 if timing failed but | |
171 | trying again immediately has a reasonable chance of success. | |
172 | .TP | |
173 | .IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 ) | |
174 | Store in | |
175 | .BI * delta_out | |
176 | the difference between the two times | |
177 | .I t0 | |
178 | and | |
179 | .IR t1 . | |
d056fbdf MW |
180 | .TP |
181 | .IB tm ->ops->destroy( tm ) | |
182 | Destroy the timer, | |
183 | releasing all of the resources that it holds. | |
184 | .PP | |
6e683a79 MW |
185 | A |
186 | .B bench_timing | |
187 | structure reports the difference between two times, | |
188 | as determined by a timer's | |
189 | .B diff | |
190 | function. | |
191 | It has four members. | |
192 | .TP | |
d056fbdf | 193 | .B f |
6e683a79 | 194 | A flags word. |
d056fbdf | 195 | .B BTF_TIMEOK |
6e683a79 MW |
196 | is set if the passage-of-time measurement in |
197 | .B t | |
198 | is valid; | |
d056fbdf | 199 | .B BTF_CYOK |
6e683a79 | 200 | is set if the cycle count in |
d056fbdf MW |
201 | .B cy |
202 | is valid. | |
d056fbdf MW |
203 | The mask |
204 | .B BTF_ANY | |
205 | covers the | |
206 | .B BTF_TIMEOK | |
207 | and | |
208 | .B BTF_CYOK | |
209 | bits: | |
210 | hence, | |
6e683a79 | 211 | .B f&BTF_ANY |
d056fbdf MW |
212 | is nonzero (true) |
213 | if the timer returned any valid timing information. | |
6e683a79 MW |
214 | .TP |
215 | .B n | |
216 | The number of iterations performed by the benchmark function | |
217 | on its satisfactory run, | |
218 | multiplied by | |
219 | .IR base . | |
220 | .TP | |
221 | .B t | |
222 | The time taken for the satisfactory run of the benchmark function, | |
223 | in seconds. | |
224 | Only valid if | |
225 | .B BTF_TIMEOK | |
226 | is set in | |
227 | .BR f . | |
228 | .TP | |
229 | .B cy | |
230 | The number of CPU cycles used | |
231 | in the satisfactory run of the benchmark function, | |
232 | in seconds. | |
233 | Only valid if | |
234 | .B BTF_CYOK | |
235 | is set in | |
236 | .BR f . | |
237 | .PP | |
238 | A | |
239 | .B "struct bench_time" | |
5c0f2e08 | 240 | represents a single instant in time, |
6e683a79 MW |
241 | as captured by a timer's |
242 | .B now | |
243 | function. | |
244 | The use of this structure is a private matter for the timer: | |
245 | the only hard requirement is that the | |
246 | .B diff | |
247 | function should be able to compute the difference between two times. | |
248 | However, the intent is that | |
249 | a passage-of-time measurement is stored in the | |
250 | .B t | |
251 | union, | |
252 | a cycle count is stored in the | |
253 | .B cy | |
254 | member, and | |
255 | the | |
256 | .B f | |
257 | member stores flags | |
258 | .B BTF_TIMEOK | |
259 | and or | |
260 | .B BTF_CYOK | |
261 | if the passage-of-time or cycle count respectively are valid. | |
d056fbdf MW |
262 | . |
263 | .SS The built-in timer | |
264 | The function | |
265 | .B bench_createtimer | |
266 | constructs and returns a timer. | |
267 | It takes a single argument, | |
268 | a string | |
269 | .IR config , | |
270 | from which it reads configuration information. | |
271 | If | |
272 | .B bench_createtimer | |
273 | fails, it returns a null pointer. | |
274 | .PP | |
275 | The | |
276 | .I config | |
277 | pointer may safely be null, | |
278 | in which case a default configuration will be used. | |
279 | Applications | |
280 | .I should only | |
281 | set this pointer to a value supplied by a user, | |
282 | e.g., through a command-line argument, | |
283 | environment variable, or | |
284 | configuration file. | |
285 | .PP | |
286 | The built-in timer makes use of one or two | |
287 | .IR subtimers : | |
288 | a `clock' subtimer to measure the passage of time, | |
289 | and possibly a `cycle' subtimer to count CPU cycles. | |
290 | .PP | |
291 | The configuration string consists of a sequence of words | |
292 | separated by whitespace. | |
293 | There may be additional whitespace at the start and end of the string. | |
294 | The words recognized are as follows. | |
295 | .TP | |
296 | .B list | |
297 | Prints a list of the available clock and cycle subtimers | |
298 | to standard output. | |
299 | .TP | |
300 | .BI clock= t , ... | |
301 | Use the first of the listed clock subtimers | |
302 | to initialize successfully | |
303 | as the clock subtimer. | |
304 | If none of the subtimers can be initialized, | |
305 | then construction of the timer as a whole fails. | |
306 | .TP | |
307 | .BI cycle= t , ... | |
308 | Use the first of the listed subtimers | |
309 | to initialize successfully | |
310 | as the cycle subtimer. | |
311 | If none of the subtimers can be initialized, | |
312 | then construction of the timer as a whole fails. | |
313 | .PP | |
314 | The clock subtimers are as follows. | |
315 | Not all of them will be available on every platform. | |
316 | .TP | |
6e683a79 MW |
317 | .B linux-x86-perf-rdpmc-hw-cycles |
318 | This is a dummy companion to the similarly named cycle subtimer; | |
319 | see its description below. | |
320 | .TP | |
d056fbdf MW |
321 | .B posix-thread-cputime |
322 | Measures the passage of time using | |
323 | .BR clock_gettime (2), | |
324 | specifying the | |
325 | .B CLOCK_\%THREAD_\%CPUTIME_\%ID | |
326 | clock. | |
327 | .TP | |
328 | .B stdc-clock | |
329 | Measures the passage of time using | |
330 | .BR clock (3). | |
331 | Since | |
332 | .BR clock (3) | |
333 | is part of the original ANSI\ C standard, | |
334 | this subtimer should always be available. | |
335 | However, it may produce unhelpful results | |
336 | if other threads are running. | |
337 | .PP | |
338 | The cycle subtimers are as follows. | |
339 | Not all of them will be available on every platform. | |
340 | .TP | |
6e683a79 MW |
341 | .B linux-perf-read-hw-cycles |
342 | Counts CPU cycles using the Linux-specific | |
d056fbdf MW |
343 | .BR perf_event_open (2) |
344 | function to read the | |
345 | .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES | |
346 | counter. | |
347 | Only available on Linux. | |
348 | It will fail to initialize | |
349 | if access to performance counters is restricted, | |
350 | e.g., because the | |
351 | .B /proc/sys/kernel/perf_event_paranoid | |
352 | level is too high. | |
353 | .TP | |
6e683a79 MW |
354 | .B linux-perf-rdpmc-hw-cycles |
355 | Counts CPU cycles using the Linux-specific | |
356 | .BR perf_event_open (2) | |
357 | function, | |
358 | as for | |
359 | .B linux-x86-perf-read-hw-cycles | |
360 | above, | |
361 | except that it additionally uses the i386/AMD64 | |
d056fbdf | 362 | .B rdtsc |
6e683a79 MW |
363 | and |
364 | .B rdpmc | |
365 | instructions, | |
366 | together with information provided by the kernel | |
367 | through a memory-mapped page | |
368 | to do its measurements without any system call overheads. | |
369 | It does passage-of-time and cycle counting in a single operation, | |
370 | so no separate clock subtimer is required: | |
371 | the similarly-named clock subtimer does nothing | |
372 | except check that the | |
373 | .B linux-x86-perf-rdpmc-hw-cycles | |
374 | cycle subtimer has been selected. | |
375 | This is almost certainly the best choice if it's available. | |
376 | .TP | |
377 | .B x86-rdtscp | |
378 | Counts CPU cycles using the x86 | |
379 | .B rdtscp | |
d056fbdf MW |
380 | instruction. |
381 | This instruction is not really suitable for performance measurement: | |
382 | it gives misleading results on CPUs with variable clock frequency. | |
383 | .TP | |
6e683a79 MW |
384 | .B x86-rdtsc |
385 | Counts CPU cycles using the x86 | |
386 | .B rdtsc | |
387 | instruction. | |
388 | This has the downsides of | |
389 | .B rdtscp | |
390 | above, | |
391 | but also fails to detect when the thread has been suspended | |
392 | or transferred to a different CPU core | |
393 | and gives misleading answers in this case. | |
394 | Not really recommended. | |
395 | .TP | |
d056fbdf MW |
396 | .B null |
397 | A dummy cycle counter, | |
398 | which will initialize successfully | |
399 | and then fail to report cycle counts. | |
400 | This is a reasonable fallback in many situations. | |
401 | .PP | |
402 | The built-in preference order for clock subtimers, | |
403 | from most to least preferred, is | |
6e683a79 | 404 | .BR linux-x86-perf-rdpmc-hw-cycles , |
d056fbdf | 405 | followed by |
6e683a79 MW |
406 | .BR posix-thread-cputime , |
407 | and finally | |
d056fbdf MW |
408 | .BR stdc-clock . |
409 | The built-in preference order for cycle subtimers, | |
410 | from most to least preferred, is | |
6e683a79 MW |
411 | .BR linux-x86-perf-rdpmc-hw-cycles |
412 | then | |
413 | .BR linux-x86-perf-read-hw-cycles , | |
d056fbdf | 414 | followed by |
6e683a79 MW |
415 | .BR x86-rdtscp , |
416 | and | |
d056fbdf | 417 | .BR x86-rdtsc , |
6e683a79 | 418 | and finally |
d056fbdf MW |
419 | .BR null . |
420 | . | |
421 | .SS The benchmark state | |
422 | A | |
423 | .I benchmark state | |
424 | tracks the information needed to measure performance of functions. | |
425 | It is represented by a | |
426 | .B struct bench_state | |
427 | structure. | |
428 | .PP | |
429 | The benchmark state is initialized by calling | |
430 | .BR bench_init , | |
431 | passing the address of the state structure to be initialized, | |
432 | and a pointer to a timer. | |
433 | If | |
434 | .B bench_init | |
435 | is called with a non-null timer pointer, | |
436 | then it will not fail; | |
437 | the benchmark state will be initialized, | |
438 | and the function returns zero. | |
439 | If the timer pointer is null, | |
440 | then | |
441 | .B bench_init | |
442 | attempts to construct a timer for itself | |
443 | by calling | |
444 | .BR bench_createtimer . | |
445 | If this succeeds, | |
446 | then the benchmark state will be initialized, | |
447 | and the function returns zero. | |
448 | In both cases, | |
449 | the timer becomes owned by the benchmark state: | |
450 | calling | |
451 | .B bench_destroy | |
452 | on the benchmark state will destroy the timer. | |
453 | If | |
454 | .B bench_init | |
455 | is called with a null timer pointer, | |
456 | and its attempt to create a timer for itself fails, | |
457 | then | |
458 | .B bench_init | |
459 | returns \-1; | |
460 | the benchmark state is not initialized | |
461 | and can safely be discarded; | |
462 | calling | |
463 | safe to call | |
464 | .B bench_destroy | |
465 | on the unsuccessfully benchmark state is safe and has no effect. | |
466 | .PP | |
467 | Calling | |
468 | .B bench_destroy | |
469 | on a benchmark state | |
470 | releases any resources it holds, | |
471 | most notably its timer, if any. | |
472 | .PP | |
473 | Although | |
474 | .B struct bench_state | |
475 | is defined in the header file, | |
476 | only two members are available for use by applications. | |
477 | .TP | |
478 | .B f | |
479 | A word containing flags. | |
480 | .TP | |
481 | .B target_s | |
482 | The target time for which to try run a benchmark, in seconds. | |
483 | After initialization, this is set to 1.0, | |
484 | though applications can override it. | |
485 | .PP | |
486 | Before the benchmark state can be used in measurements, | |
487 | it must be | |
488 | .IR calibrated . | |
489 | This is performed by calling | |
490 | .B bench_calibrate | |
491 | on the benchmark state. | |
492 | Calibration takes a noticeable amount of time | |
493 | (currently about 0.25\*,s), | |
494 | so it makes sense to defer it until it's known to be necessary. | |
495 | .PP | |
496 | Calibration is carried out separately, but in parallel, | |
497 | for the timer's passage-of-time measurement and cycle counter. | |
498 | Either or both of these calibrations can succeed or fail; | |
499 | if passage-of-time calibration fails, | |
500 | then cycle count calibration is impossible. | |
501 | .PP | |
502 | When it completes, | |
503 | .B bench_calibrate | |
504 | sets flag in the benchmark state's | |
505 | .B f | |
506 | member: | |
507 | if passage-of-time calibration succeeded, | |
508 | .B BTF_TIMEOK | |
509 | is set; | |
510 | if cycle-count calibration succeeded, | |
511 | .B BTF_CYOK | |
512 | is set; | |
513 | and the flag | |
514 | .B BTF_CLB | |
515 | is set unconditionally, | |
516 | as a persistent indication that calibration has been attempted. | |
517 | .PP | |
518 | The | |
519 | .B bench_calibrate | |
520 | function returns zero if it successfully calibrated | |
521 | at least the passage-of-time measurement; | |
522 | otherwise, it returns \-1. | |
523 | If | |
524 | .B bench_calibrate | |
525 | is called for a second or subsequent time on the same benchmark state, | |
526 | it returns immediately, | |
527 | either returning 0 or \-1 | |
528 | according to whether passage-of-time had previously been calibrated. | |
529 | . | |
530 | .SS Timing functions | |
531 | A | |
532 | .I benchmark function | |
533 | has the signature | |
534 | .IP | |
535 | .BI "void " fn "(unsigned long " n ", void *" ctx ); | |
536 | .PP | |
537 | When called, it should perform the operation to be measured | |
538 | .I n | |
539 | times. | |
540 | The | |
541 | .I ctx | |
542 | argument is a pointer passed into | |
543 | .B bench_measure | |
544 | for the benchmark function's own purposes. | |
545 | .PP | |
546 | The function | |
547 | .B bench_measure | |
548 | receives five arguments. | |
549 | .TP | |
550 | .I b | |
551 | points to the benchmark state to be used. | |
552 | .TP | |
553 | .I t_out | |
554 | is the address of a | |
555 | .BR struct bench_timing | |
556 | in which the measurement should be left. | |
557 | This structure is described below. | |
558 | .TP | |
559 | .I base | |
560 | is a count of the number of operations performed | |
561 | by each iteration of the benchmark function. | |
562 | .TP | |
563 | .I fn | |
564 | is a benchmark function, described above. | |
565 | .TP | |
566 | .I ctx | |
567 | is a pointer to be passed to the benchmark function. | |
568 | .B bench_measure | |
569 | does not interpret this pointer in any way. | |
570 | .PP | |
571 | The | |
572 | .B bench_measure | |
573 | function calls its benchark function repeatedly | |
574 | with different iteration counts | |
575 | .IR n , | |
576 | with the objective that the call take approximately | |
577 | .B target_s | |
578 | seconds, as established in the benchmark state. | |
579 | (Currently, if | |
580 | .B target_s | |
581 | holds the value | |
582 | .IR t , | |
583 | then | |
584 | .B bench_measure | |
585 | is satisfied when a call takes at least | |
586 | .IR t /\(sr2\*,s.) | |
587 | Once the function finds a satisfactory number of iterations, | |
588 | it stores the results in | |
589 | .BI * t_out \fR. | |
590 | If measurement succeeds, then | |
591 | .B bench_measure | |
592 | returns zero. | |
593 | If it fails \(en | |
594 | most likely because the timer failed \(en | |
595 | then it returns \-1. | |
d056fbdf | 596 | . |
c4ccbbf9 | 597 | .\"-------------------------------------------------------------------------- |
d056fbdf | 598 | .SH "SEE ALSO" |
c4ccbbf9 MW |
599 | . |
600 | .BR tvec-bench (3), | |
d056fbdf MW |
601 | .BR mLib (3). |
602 | . | |
c4ccbbf9 | 603 | .\"-------------------------------------------------------------------------- |
d056fbdf | 604 | .SH AUTHOR |
c4ccbbf9 | 605 | . |
d056fbdf | 606 | Mark Wooding, <mdw@distorted.org.uk> |
c4ccbbf9 MW |
607 | . |
608 | .\"----- That's all, folks -------------------------------------------------- |