@@@ misc mess
[mLib] / test / bench.3.in
1 .\" -*-nroff-*-
2 .\"
3 .\" Manual for benchmarking core
4 .\"
5 .\" (c) 2024 Straylight/Edgeware
6 .\"
7 .
8 .\"----- Licensing notice ---------------------------------------------------
9 .\"
10 .\" This file is part of the mLib utilities library.
11 .\"
12 .\" mLib is free software: you can redistribute it and/or modify it under
13 .\" the terms of the GNU Library General Public License as published by
14 .\" the Free Software Foundation; either version 2 of the License, or (at
15 .\" your option) any later version.
16 .\"
17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 .\" License for more details.
21 .\"
22 .\" You should have received a copy of the GNU Library General Public
23 .\" License along with mLib. If not, write to the Free Software
24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 .\" USA.
26 .
27 .\"--------------------------------------------------------------------------
28 .so ../defs.man \" @@@PRE@@@
29 .
30 .\"--------------------------------------------------------------------------
31 .TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
32 .\" @bench_createtimer
33 .\" @bench_init
34 .\" @bench_destroy
35 .\" @bench_calibrate
36 .\" @bench_measure
37 .
38 .\"--------------------------------------------------------------------------
39 .SH NAME
40 bench \- low-level benchmarking tools
41 .
42 .\"--------------------------------------------------------------------------
43 .SH SYNOPSIS
44 .
45 .nf
46 .B "#include <mLib/bench.h>"
47 .PP
48 .ta 2n +2n +2n
49 .B "struct bench_time {"
50 .B " unsigned f;"
51 .B " union {"
52 .B " struct { kludge64 s; uint32 ns; } ts;"
53 .B " clock_t clk;"
54 .B " kludge64 rawns;"
55 .B " } t;"
56 .B " kludge64 cy;"
57 .B "};"
58 .PP
59 .B "struct bench_timing {"
60 .B " unsigned f;"
61 .B " double n;"
62 .B " double t;"
63 .B " double cy;"
64 .B "};"
65 .PP
66 .B "#define BTF_T0 0u"
67 .B "#define BTF_T1 ..."
68 .B "struct bench_timerops {"
69 .BI " void (*describe)(struct bench_timer *" bt ", dstr *" d );
70 .ta 2n +\w'\fBint (*now)('u
71 .BI " int (*now)(struct bench_timer *" bt ,
72 .BI " struct bench_time *" t_out ", unsigned " f );
73 .ta 2n +\w'\void (*diff)('u
74 .BI " void (*diff)(struct bench_timer *" bt ,
75 .BI " struct bench_timing *" delta_out ,
76 .BI " const struct bench_time *" t0 ,
77 .BI " const struct bench_time *" t1 );
78 .BI " void (*destroy)(struct bench_timer *" bt );
79 .B "};"
80 .B "struct bench_timer {"
81 .B " const struct bench_timerops *ops;"
82 .B "};"
83 .PP
84 .B "struct bench_state {"
85 .B " unsigned f;"
86 .B " double target_s;"
87 .B " ..."
88 .B "}";
89 .PP
90 .BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
91 .PP
92 .B "#define BTF_TIMEOK ..."
93 .B "#define BTF_CYOK ..."
94 .B "#define BTF_CLB ..."
95 .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
96 .PP
97 .B "struct bench_timer *bench_createtimer(void);"
98 .PP
99 .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
100 .BI "void bench_destroy(struct bench_state *" b );
101 .BI "int bench_calibrate(struct bench_state *" b );
102 .ta \w'\fBint bench_measure('u
103 .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
104 .BI " double " base ", bench_fn *" fn ", void *" ctx );
105 .fi
106 .
107 .\"--------------------------------------------------------------------------
108 .SH DESCRIPTION
109 .
110 The header file
111 .B "<mLib/bench.h>"
112 provides declarations and defintions
113 for performing low-level benchmarks.
114 .PP
115 The `main event' is
116 .BR bench_measure .
117 This function will be described in detail later,
118 but, in brief,
119 it calls a caller-provided function,
120 instructing it to run adaptively chosen numbers of iterations,
121 in order to get a reasonably reliable measurement of its running time,
122 and then reports its results by filling in a structure.
123 .PP
124 With understanding this function as our objective,
125 we must examine all of the pieces involved in making it work.
126 .
127 .SS Timers in general
128 A
129 .I timer
130 is a gadget which is capable of reporting the current time,
131 in seconds (ideally precise to tiny fractions of a second),
132 and/or in CPU cycles.
133 A timer is represented by a pointer to an object of type
134 .BR "struct bench_timer" .
135 This structure has a single member,
136 .BR ops ,
137 pointing to a
138 .BR "struct bench_timerops" ,
139 which is a table of function pointers;
140 typically, a timer has more data following this,
141 but this fact is not exposed to applications.
142 .PP
143 The function pointers in
144 .B "struct bench_timerops"
145 are as follows.
146 The first argument,
147 named
148 .I tm
149 must always point to the timer object itself.
150 .TP
151 .IB tm ->ops->describe( tm ", " d)
152 Write a description of the timer to the dynamic string
153 .IR d .
154 .TP
155 .IB tm ->ops->now( tm ", " t_out ", " f )
156 Store the current time in
157 .BI * t_out \fR.
158 The
159 .B BTF_T1
160 flag in
161 .I f
162 to indicate that this is the second call in a pair;
163 leave it clear for the first call.
164 (A fake
165 .B BTF_T0
166 flag is defined to be zero for symmetry.)
167 Return zero on success
168 .I or
169 permanent failure;
170 return \-1 if timing failed but
171 trying again immediately has a reasonable chance of success.
172 .TP
173 .IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
174 Store in
175 .BI * delta_out
176 the difference between the two times
177 .I t0
178 and
179 .IR t1 .
180 .TP
181 .IB tm ->ops->destroy( tm )
182 Destroy the timer,
183 releasing all of the resources that it holds.
184 .PP
185 A
186 .B bench_timing
187 structure reports the difference between two times,
188 as determined by a timer's
189 .B diff
190 function.
191 It has four members.
192 .TP
193 .B f
194 A flags word.
195 .B BTF_TIMEOK
196 is set if the passage-of-time measurement in
197 .B t
198 is valid;
199 .B BTF_CYOK
200 is set if the cycle count in
201 .B cy
202 is valid.
203 The mask
204 .B BTF_ANY
205 covers the
206 .B BTF_TIMEOK
207 and
208 .B BTF_CYOK
209 bits:
210 hence,
211 .B f&BTF_ANY
212 is nonzero (true)
213 if the timer returned any valid timing information.
214 .TP
215 .B n
216 The number of iterations performed by the benchmark function
217 on its satisfactory run,
218 multiplied by
219 .IR base .
220 .TP
221 .B t
222 The time taken for the satisfactory run of the benchmark function,
223 in seconds.
224 Only valid if
225 .B BTF_TIMEOK
226 is set in
227 .BR f .
228 .TP
229 .B cy
230 The number of CPU cycles used
231 in the satisfactory run of the benchmark function,
232 in seconds.
233 Only valid if
234 .B BTF_CYOK
235 is set in
236 .BR f .
237 .PP
238 A
239 .B "struct bench_time"
240 representats a single instant in time,
241 as captured by a timer's
242 .B now
243 function.
244 The use of this structure is a private matter for the timer:
245 the only hard requirement is that the
246 .B diff
247 function should be able to compute the difference between two times.
248 However, the intent is that
249 a passage-of-time measurement is stored in the
250 .B t
251 union,
252 a cycle count is stored in the
253 .B cy
254 member, and
255 the
256 .B f
257 member stores flags
258 .B BTF_TIMEOK
259 and or
260 .B BTF_CYOK
261 if the passage-of-time or cycle count respectively are valid.
262 .
263 .SS The built-in timer
264 The function
265 .B bench_createtimer
266 constructs and returns a timer.
267 It takes a single argument,
268 a string
269 .IR config ,
270 from which it reads configuration information.
271 If
272 .B bench_createtimer
273 fails, it returns a null pointer.
274 .PP
275 The
276 .I config
277 pointer may safely be null,
278 in which case a default configuration will be used.
279 Applications
280 .I should only
281 set this pointer to a value supplied by a user,
282 e.g., through a command-line argument,
283 environment variable, or
284 configuration file.
285 .PP
286 The built-in timer makes use of one or two
287 .IR subtimers :
288 a `clock' subtimer to measure the passage of time,
289 and possibly a `cycle' subtimer to count CPU cycles.
290 .PP
291 The configuration string consists of a sequence of words
292 separated by whitespace.
293 There may be additional whitespace at the start and end of the string.
294 The words recognized are as follows.
295 .TP
296 .B list
297 Prints a list of the available clock and cycle subtimers
298 to standard output.
299 .TP
300 .BI clock= t , ...
301 Use the first of the listed clock subtimers
302 to initialize successfully
303 as the clock subtimer.
304 If none of the subtimers can be initialized,
305 then construction of the timer as a whole fails.
306 .TP
307 .BI cycle= t , ...
308 Use the first of the listed subtimers
309 to initialize successfully
310 as the cycle subtimer.
311 If none of the subtimers can be initialized,
312 then construction of the timer as a whole fails.
313 .PP
314 The clock subtimers are as follows.
315 Not all of them will be available on every platform.
316 .TP
317 .B linux-x86-perf-rdpmc-hw-cycles
318 This is a dummy companion to the similarly named cycle subtimer;
319 see its description below.
320 .TP
321 .B posix-thread-cputime
322 Measures the passage of time using
323 .BR clock_gettime (2),
324 specifying the
325 .B CLOCK_\%THREAD_\%CPUTIME_\%ID
326 clock.
327 .TP
328 .B stdc-clock
329 Measures the passage of time using
330 .BR clock (3).
331 Since
332 .BR clock (3)
333 is part of the original ANSI\ C standard,
334 this subtimer should always be available.
335 However, it may produce unhelpful results
336 if other threads are running.
337 .PP
338 The cycle subtimers are as follows.
339 Not all of them will be available on every platform.
340 .TP
341 .B linux-perf-read-hw-cycles
342 Counts CPU cycles using the Linux-specific
343 .BR perf_event_open (2)
344 function to read the
345 .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
346 counter.
347 Only available on Linux.
348 It will fail to initialize
349 if access to performance counters is restricted,
350 e.g., because the
351 .B /proc/sys/kernel/perf_event_paranoid
352 level is too high.
353 .TP
354 .B linux-perf-rdpmc-hw-cycles
355 Counts CPU cycles using the Linux-specific
356 .BR perf_event_open (2)
357 function,
358 as for
359 .B linux-x86-perf-read-hw-cycles
360 above,
361 except that it additionally uses the i386/AMD64
362 .B rdtsc
363 and
364 .B rdpmc
365 instructions,
366 together with information provided by the kernel
367 through a memory-mapped page
368 to do its measurements without any system call overheads.
369 It does passage-of-time and cycle counting in a single operation,
370 so no separate clock subtimer is required:
371 the similarly-named clock subtimer does nothing
372 except check that the
373 .B linux-x86-perf-rdpmc-hw-cycles
374 cycle subtimer has been selected.
375 This is almost certainly the best choice if it's available.
376 .TP
377 .B x86-rdtscp
378 Counts CPU cycles using the x86
379 .B rdtscp
380 instruction.
381 This instruction is not really suitable for performance measurement:
382 it gives misleading results on CPUs with variable clock frequency.
383 .TP
384 .B x86-rdtsc
385 Counts CPU cycles using the x86
386 .B rdtsc
387 instruction.
388 This has the downsides of
389 .B rdtscp
390 above,
391 but also fails to detect when the thread has been suspended
392 or transferred to a different CPU core
393 and gives misleading answers in this case.
394 Not really recommended.
395 .TP
396 .B null
397 A dummy cycle counter,
398 which will initialize successfully
399 and then fail to report cycle counts.
400 This is a reasonable fallback in many situations.
401 .PP
402 The built-in preference order for clock subtimers,
403 from most to least preferred, is
404 .BR linux-x86-perf-rdpmc-hw-cycles ,
405 followed by
406 .BR posix-thread-cputime ,
407 and finally
408 .BR stdc-clock .
409 The built-in preference order for cycle subtimers,
410 from most to least preferred, is
411 .BR linux-x86-perf-rdpmc-hw-cycles
412 then
413 .BR linux-x86-perf-read-hw-cycles ,
414 followed by
415 .BR x86-rdtscp ,
416 and
417 .BR x86-rdtsc ,
418 and finally
419 .BR null .
420 .
421 .SS The benchmark state
422 A
423 .I benchmark state
424 tracks the information needed to measure performance of functions.
425 It is represented by a
426 .B struct bench_state
427 structure.
428 .PP
429 The benchmark state is initialized by calling
430 .BR bench_init ,
431 passing the address of the state structure to be initialized,
432 and a pointer to a timer.
433 If
434 .B bench_init
435 is called with a non-null timer pointer,
436 then it will not fail;
437 the benchmark state will be initialized,
438 and the function returns zero.
439 If the timer pointer is null,
440 then
441 .B bench_init
442 attempts to construct a timer for itself
443 by calling
444 .BR bench_createtimer .
445 If this succeeds,
446 then the benchmark state will be initialized,
447 and the function returns zero.
448 In both cases,
449 the timer becomes owned by the benchmark state:
450 calling
451 .B bench_destroy
452 on the benchmark state will destroy the timer.
453 If
454 .B bench_init
455 is called with a null timer pointer,
456 and its attempt to create a timer for itself fails,
457 then
458 .B bench_init
459 returns \-1;
460 the benchmark state is not initialized
461 and can safely be discarded;
462 calling
463 safe to call
464 .B bench_destroy
465 on the unsuccessfully benchmark state is safe and has no effect.
466 .PP
467 Calling
468 .B bench_destroy
469 on a benchmark state
470 releases any resources it holds,
471 most notably its timer, if any.
472 .PP
473 Although
474 .B struct bench_state
475 is defined in the header file,
476 only two members are available for use by applications.
477 .TP
478 .B f
479 A word containing flags.
480 .TP
481 .B target_s
482 The target time for which to try run a benchmark, in seconds.
483 After initialization, this is set to 1.0,
484 though applications can override it.
485 .PP
486 Before the benchmark state can be used in measurements,
487 it must be
488 .IR calibrated .
489 This is performed by calling
490 .B bench_calibrate
491 on the benchmark state.
492 Calibration takes a noticeable amount of time
493 (currently about 0.25\*,s),
494 so it makes sense to defer it until it's known to be necessary.
495 .PP
496 Calibration is carried out separately, but in parallel,
497 for the timer's passage-of-time measurement and cycle counter.
498 Either or both of these calibrations can succeed or fail;
499 if passage-of-time calibration fails,
500 then cycle count calibration is impossible.
501 .PP
502 When it completes,
503 .B bench_calibrate
504 sets flag in the benchmark state's
505 .B f
506 member:
507 if passage-of-time calibration succeeded,
508 .B BTF_TIMEOK
509 is set;
510 if cycle-count calibration succeeded,
511 .B BTF_CYOK
512 is set;
513 and the flag
514 .B BTF_CLB
515 is set unconditionally,
516 as a persistent indication that calibration has been attempted.
517 .PP
518 The
519 .B bench_calibrate
520 function returns zero if it successfully calibrated
521 at least the passage-of-time measurement;
522 otherwise, it returns \-1.
523 If
524 .B bench_calibrate
525 is called for a second or subsequent time on the same benchmark state,
526 it returns immediately,
527 either returning 0 or \-1
528 according to whether passage-of-time had previously been calibrated.
529 .
530 .SS Timing functions
531 A
532 .I benchmark function
533 has the signature
534 .IP
535 .BI "void " fn "(unsigned long " n ", void *" ctx );
536 .PP
537 When called, it should perform the operation to be measured
538 .I n
539 times.
540 The
541 .I ctx
542 argument is a pointer passed into
543 .B bench_measure
544 for the benchmark function's own purposes.
545 .PP
546 The function
547 .B bench_measure
548 receives five arguments.
549 .TP
550 .I b
551 points to the benchmark state to be used.
552 .TP
553 .I t_out
554 is the address of a
555 .BR struct bench_timing
556 in which the measurement should be left.
557 This structure is described below.
558 .TP
559 .I base
560 is a count of the number of operations performed
561 by each iteration of the benchmark function.
562 .TP
563 .I fn
564 is a benchmark function, described above.
565 .TP
566 .I ctx
567 is a pointer to be passed to the benchmark function.
568 .B bench_measure
569 does not interpret this pointer in any way.
570 .PP
571 The
572 .B bench_measure
573 function calls its benchark function repeatedly
574 with different iteration counts
575 .IR n ,
576 with the objective that the call take approximately
577 .B target_s
578 seconds, as established in the benchmark state.
579 (Currently, if
580 .B target_s
581 holds the value
582 .IR t ,
583 then
584 .B bench_measure
585 is satisfied when a call takes at least
586 .IR t /\(sr2\*,s.)
587 Once the function finds a satisfactory number of iterations,
588 it stores the results in
589 .BI * t_out \fR.
590 If measurement succeeds, then
591 .B bench_measure
592 returns zero.
593 If it fails \(en
594 most likely because the timer failed \(en
595 then it returns \-1.
596 .
597 .\"--------------------------------------------------------------------------
598 .SH "SEE ALSO"
599 .
600 .BR tvec-bench (3),
601 .BR mLib (3).
602 .
603 .\"--------------------------------------------------------------------------
604 .SH AUTHOR
605 .
606 Mark Wooding, <mdw@distorted.org.uk>
607 .
608 .\"----- That's all, folks --------------------------------------------------