@@@ fltfmt mess
[mLib] / test / bench.3.in
1 .\" -*-nroff-*-
2 .\"
3 .\" Manual for benchmarking core
4 .\"
5 .\" (c) 2024 Straylight/Edgeware
6 .\"
7 .
8 .\"----- Licensing notice ---------------------------------------------------
9 .\"
10 .\" This file is part of the mLib utilities library.
11 .\"
12 .\" mLib is free software: you can redistribute it and/or modify it under
13 .\" the terms of the GNU Library General Public License as published by
14 .\" the Free Software Foundation; either version 2 of the License, or (at
15 .\" your option) any later version.
16 .\"
17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 .\" License for more details.
21 .\"
22 .\" You should have received a copy of the GNU Library General Public
23 .\" License along with mLib. If not, write to the Free Software
24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 .\" USA.
26 .
27 .\"--------------------------------------------------------------------------
28 .so ../defs.man \" @@@PRE@@@
29 .
30 .\"--------------------------------------------------------------------------
31 .TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
32 .\" @bench_createtimer
33 .\" @bench_init
34 .\" @bench_destroy
35 .\" @bench_calibrate
36 .\" @bench_measure
37 .
38 .\"--------------------------------------------------------------------------
39 .SH NAME
40 bench \- low-level benchmarking tools
41 .
42 .\"--------------------------------------------------------------------------
43 .SH SYNOPSIS
44 .
45 .nf
46 .B "#include <mLib/bench.h>"
47 .PP
48 .ta 2n +2n +2n
49 .B "struct bench_time {"
50 .B " unsigned f;"
51 .B " union {"
52 .B " struct { kludge64 s; uint32 ns; } ts;"
53 .B " clock_t clk;"
54 .B " kludge64 rawns;"
55 .B " } t;"
56 .B " kludge64 cy;"
57 .B "};"
58 .PP
59 .B "struct bench_timing {"
60 .B " unsigned f;"
61 .B " double n;"
62 .B " double t;"
63 .B " double cy;"
64 .B "};"
65 .PP
66 .B "#define BTF_T0 0u"
67 .B "#define BTF_T1 ..."
68 .B "struct bench_timerops {"
69 .BI " void (*describe)(struct bench_timer *" bt ", dstr *" d );
70 .BI " int (*preflight)(struct bench_timer *" bt );
71 .ta 2n +\w'\fBint (*now)('u
72 .BI " int (*now)(struct bench_timer *" bt ,
73 .BI " struct bench_time *" t_out ", unsigned " f );
74 .ta 2n +\w'\void (*diff)('u
75 .BI " void (*diff)(struct bench_timer *" bt ,
76 .BI " struct bench_timing *" delta_out ,
77 .BI " const struct bench_time *" t0 ,
78 .BI " const struct bench_time *" t1 );
79 .BI " void (*destroy)(struct bench_timer *" bt );
80 .B "};"
81 .B "struct bench_timer {"
82 .B " const struct bench_timerops *ops;"
83 .B "};"
84 .PP
85 .B "struct bench_state {"
86 .B " unsigned f;"
87 .B " double target_s;"
88 .B " ..."
89 .B "}";
90 .PP
91 .BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
92 .PP
93 .B "#define BTF_TIMEOK ..."
94 .B "#define BTF_CYOK ..."
95 .B "#define BTF_CLB ..."
96 .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
97 .PP
98 .B "struct bench_timer *bench_createtimer(void);"
99 .PP
100 .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
101 .BI "void bench_destroy(struct bench_state *" b );
102 .BI "int bench_calibrate(struct bench_state *" b );
103 .ta \w'\fBint bench_measure('u
104 .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
105 .BI " double " base ", bench_fn *" fn ", void *" ctx );
106 .fi
107 .
108 .\"--------------------------------------------------------------------------
109 .SH DESCRIPTION
110 .
111 The header file
112 .B "<mLib/bench.h>"
113 provides declarations and defintions
114 for performing low-level benchmarks.
115 .PP
116 The `main event' is
117 .BR bench_measure .
118 This function will be described in detail later,
119 but, in brief,
120 it calls a caller-provided function,
121 instructing it to run adaptively chosen numbers of iterations,
122 in order to get a reasonably reliable measurement of its running time,
123 and then reports its results by filling in a structure.
124 .PP
125 With understanding this function as our objective,
126 we must examine all of the pieces involved in making it work.
127 .
128 .SS Timers in general
129 A
130 .I timer
131 is a gadget which is capable of reporting the current time,
132 in seconds (ideally precise to tiny fractions of a second),
133 and/or in CPU cycles.
134 A timer is represented by a pointer to an object of type
135 .BR "struct bench_timer" .
136 This structure has a single member,
137 .BR ops ,
138 pointing to a
139 .BR "struct bench_timerops" ,
140 which is a table of function pointers;
141 typically, a timer has more data following this,
142 but this fact is not exposed to applications.
143 .PP
144 The function pointers in
145 .B "struct bench_timerops"
146 are as follows.
147 The first argument,
148 named
149 .I tm
150 must always point to the timer object itself.
151 .TP
152 .IB tm ->ops->describe( tm ", " d)
153 Write a description of the timer to the dynamic string
154 .IR d .
155 .TPc
156 .IB tm ->ops->preflight( tm )
157 Ensure that the timer is in working order,
158 and perform any necessary per-thread or per-process setup.
159 Return zero if the
160 .B now
161 function is likely to work properly
162 when called from the same thread
163 in the near future;
164 otherwise return \-1.
165 .TP
166 .IB tm ->ops->now( tm ", " t_out ", " f )
167 Store the current time in
168 .BI * t_out \fR.
169 The
170 .B BTF_T1
171 flag in
172 .I f
173 to indicate that this is the second call in a pair;
174 leave it clear for the first call.
175 (A fake
176 .B BTF_T0
177 flag is defined to be zero for symmetry.)
178 Return zero on success
179 .I or
180 permanent failure;
181 return \-1 if timing failed but
182 trying again immediately has a reasonable chance of success.
183 .TP
184 .IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
185 Store in
186 .BI * delta_out
187 the difference between the two times
188 .I t0
189 and
190 .IR t1 .
191 .TP
192 .IB tm ->ops->destroy( tm )
193 Destroy the timer,
194 releasing all of the resources that it holds.
195 .PP
196 A
197 .B bench_timing
198 structure reports the difference between two times,
199 as determined by a timer's
200 .B diff
201 function.
202 It has four members.
203 .TP
204 .B f
205 A flags word.
206 .B BTF_TIMEOK
207 is set if the passage-of-time measurement in
208 .B t
209 is valid;
210 .B BTF_CYOK
211 is set if the cycle count in
212 .B cy
213 is valid.
214 The mask
215 .B BTF_ANY
216 covers the
217 .B BTF_TIMEOK
218 and
219 .B BTF_CYOK
220 bits:
221 hence,
222 .B f&BTF_ANY
223 is nonzero (true)
224 if the timer returned any valid timing information.
225 .TP
226 .B n
227 The number of iterations performed by the benchmark function
228 on its satisfactory run,
229 multiplied by
230 .IR base .
231 .TP
232 .B t
233 The time taken for the satisfactory run of the benchmark function,
234 in seconds.
235 Only valid if
236 .B BTF_TIMEOK
237 is set in
238 .BR f .
239 .TP
240 .B cy
241 The number of CPU cycles used
242 in the satisfactory run of the benchmark function,
243 in seconds.
244 Only valid if
245 .B BTF_CYOK
246 is set in
247 .BR f .
248 .PP
249 A
250 .B "struct bench_time"
251 represents a single instant in time,
252 as captured by a timer's
253 .B now
254 function.
255 The use of this structure is a private matter for the timer:
256 the only hard requirement is that the
257 .B diff
258 function should be able to compute the difference between two times.
259 However, the intent is that
260 a passage-of-time measurement is stored in the
261 .B t
262 union,
263 a cycle count is stored in the
264 .B cy
265 member, and
266 the
267 .B f
268 member stores flags
269 .B BTF_TIMEOK
270 and or
271 .B BTF_CYOK
272 if the passage-of-time or cycle count respectively are valid.
273 .
274 .SS The built-in timer
275 The function
276 .B bench_createtimer
277 constructs and returns a timer.
278 It takes a single argument,
279 a string
280 .IR config ,
281 from which it reads configuration information.
282 If
283 .B bench_createtimer
284 fails, it returns a null pointer.
285 .PP
286 The
287 .I config
288 pointer may safely be null,
289 in which case a default configuration will be used.
290 Applications
291 .I should only
292 set this pointer to a value supplied by a user,
293 e.g., through a command-line argument,
294 environment variable, or
295 configuration file.
296 .PP
297 The built-in timer makes use of one or two
298 .IR subtimers :
299 a `clock' subtimer to measure the passage of time,
300 and possibly a `cycle' subtimer to count CPU cycles.
301 .PP
302 The configuration string consists of a sequence of words
303 separated by whitespace.
304 There may be additional whitespace at the start and end of the string.
305 The words recognized are as follows.
306 .TP
307 .B list
308 Prints a list of the available clock and cycle subtimers
309 to standard output.
310 .TP
311 .BI clock= t , ...
312 Use the first of the listed clock subtimers
313 to initialize successfully
314 as the clock subtimer.
315 If none of the subtimers can be initialized,
316 then construction of the timer as a whole fails.
317 .TP
318 .BI cycle= t , ...
319 Use the first of the listed subtimers
320 to initialize successfully
321 as the cycle subtimer.
322 If none of the subtimers can be initialized,
323 then construction of the timer as a whole fails.
324 .PP
325 The clock subtimers are as follows.
326 Not all of them will be available on every platform.
327 .TP
328 .B linux-x86-perf-rdpmc-hw-cycles
329 This is a dummy companion to the similarly named cycle subtimer;
330 see its description below.
331 .TP
332 .B posix-thread-cputime
333 Measures the passage of time using
334 .BR clock_gettime (2),
335 specifying the
336 .B CLOCK_\%THREAD_\%CPUTIME_\%ID
337 clock.
338 .TP
339 .B stdc-clock
340 Measures the passage of time using
341 .BR clock (3).
342 Since
343 .BR clock (3)
344 is part of the original ANSI\ C standard,
345 this subtimer should always be available.
346 However, it may produce unhelpful results
347 if other threads are running.
348 .PP
349 The cycle subtimers are as follows.
350 Not all of them will be available on every platform.
351 .TP
352 .B linux-perf-read-hw-cycles
353 Counts CPU cycles using the Linux-specific
354 .BR perf_event_open (2)
355 function to read the
356 .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
357 counter.
358 Only available on Linux.
359 It will fail to initialize
360 if access to performance counters is restricted,
361 e.g., because the
362 .B /proc/sys/kernel/perf_event_paranoid
363 level is too high.
364 .TP
365 .B linux-perf-rdpmc-hw-cycles
366 Counts CPU cycles using the Linux-specific
367 .BR perf_event_open (2)
368 function,
369 as for
370 .B linux-x86-perf-read-hw-cycles
371 above,
372 except that it additionally uses the i386/AMD64
373 .B rdtsc
374 and
375 .B rdpmc
376 instructions,
377 together with information provided by the kernel
378 through a memory-mapped page
379 to do its measurements without any system call overheads.
380 It does passage-of-time and cycle counting in a single operation,
381 so no separate clock subtimer is required:
382 the similarly-named clock subtimer does nothing
383 except check that the
384 .B linux-x86-perf-rdpmc-hw-cycles
385 cycle subtimer has been selected.
386 This is almost certainly the best choice if it's available.
387 .TP
388 .B x86-rdtscp
389 Counts CPU cycles using the x86
390 .B rdtscp
391 instruction.
392 This instruction is not really suitable for performance measurement:
393 it gives misleading results on CPUs with variable clock frequency.
394 .TP
395 .B x86-rdtsc
396 Counts CPU cycles using the x86
397 .B rdtsc
398 instruction.
399 This has the downsides of
400 .B rdtscp
401 above,
402 but also fails to detect when the thread has been suspended
403 or transferred to a different CPU core
404 and gives misleading answers in this case.
405 Not really recommended.
406 .TP
407 .B null
408 A dummy cycle counter,
409 which will initialize successfully
410 and then fail to report cycle counts.
411 This is a reasonable fallback in many situations.
412 .PP
413 The built-in preference order for clock subtimers,
414 from most to least preferred, is
415 .BR linux-x86-perf-rdpmc-hw-cycles ,
416 followed by
417 .BR posix-thread-cputime ,
418 and finally
419 .BR stdc-clock .
420 The built-in preference order for cycle subtimers,
421 from most to least preferred, is
422 .BR linux-x86-perf-rdpmc-hw-cycles
423 then
424 .BR linux-x86-perf-read-hw-cycles ,
425 followed by
426 .BR x86-rdtscp ,
427 and
428 .BR x86-rdtsc ,
429 and finally
430 .BR null .
431 .
432 .SS The benchmark state
433 A
434 .I benchmark state
435 tracks the information needed to measure performance of functions.
436 It is represented by a
437 .B struct bench_state
438 structure.
439 .PP
440 The benchmark state is initialized by calling
441 .BR bench_init ,
442 passing the address of the state structure to be initialized,
443 and a pointer to a timer.
444 If
445 .B bench_init
446 is called with a non-null timer pointer,
447 then it will not fail;
448 the benchmark state will be initialized,
449 and the function returns zero.
450 If the timer pointer is null,
451 then
452 .B bench_init
453 attempts to construct a timer for itself
454 by calling
455 .BR bench_createtimer .
456 If this succeeds,
457 then the benchmark state will be initialized,
458 and the function returns zero.
459 In both cases,
460 the timer becomes owned by the benchmark state:
461 calling
462 .B bench_destroy
463 on the benchmark state will destroy the timer.
464 If
465 .B bench_init
466 is called with a null timer pointer,
467 and its attempt to create a timer for itself fails,
468 then
469 .B bench_init
470 returns \-1;
471 the benchmark state is not initialized
472 and can safely be discarded;
473 calling
474 safe to call
475 .B bench_destroy
476 on the unsuccessfully benchmark state is safe and has no effect.
477 .PP
478 Calling
479 .B bench_destroy
480 on a benchmark state
481 releases any resources it holds,
482 most notably its timer, if any.
483 .PP
484 Although
485 .B struct bench_state
486 is defined in the header file,
487 only two members are available for use by applications.
488 .TP
489 .B f
490 A word containing flags.
491 .TP
492 .B target_s
493 The target time for which to try run a benchmark, in seconds.
494 After initialization, this is set to 1.0,
495 though applications can override it.
496 .PP
497 Before the benchmark state can be used in measurements,
498 it must be
499 .IR calibrated .
500 This is performed by calling
501 .B bench_calibrate
502 on the benchmark state.
503 Calibration takes a noticeable amount of time
504 (currently about 0.25\*,s),
505 so it makes sense to defer it until it's known to be necessary.
506 .PP
507 Calibration is carried out separately, but in parallel,
508 for the timer's passage-of-time measurement and cycle counter.
509 Either or both of these calibrations can succeed or fail;
510 if passage-of-time calibration fails,
511 then cycle count calibration is impossible.
512 .PP
513 When it completes,
514 .B bench_calibrate
515 sets flag in the benchmark state's
516 .B f
517 member:
518 if passage-of-time calibration succeeded,
519 .B BTF_TIMEOK
520 is set;
521 if cycle-count calibration succeeded,
522 .B BTF_CYOK
523 is set;
524 and the flag
525 .B BTF_CLB
526 is set unconditionally,
527 as a persistent indication that calibration has been attempted.
528 .PP
529 The
530 .B bench_calibrate
531 function returns zero if it successfully calibrated
532 at least the passage-of-time measurement;
533 otherwise, it returns \-1.
534 If
535 .B bench_calibrate
536 is called for a second or subsequent time on the same benchmark state,
537 it returns immediately,
538 either returning 0 or \-1
539 according to whether passage-of-time had previously been calibrated.
540 .
541 .SS Timing functions
542 A
543 .I benchmark function
544 has the signature
545 .IP
546 .BI "void " fn "(unsigned long " n ", void *" ctx );
547 .PP
548 When called, it should perform the operation to be measured
549 .I n
550 times.
551 The
552 .I ctx
553 argument is a pointer passed into
554 .B bench_measure
555 for the benchmark function's own purposes.
556 .PP
557 The function
558 .B bench_measure
559 receives five arguments.
560 .TP
561 .I b
562 points to the benchmark state to be used.
563 .TP
564 .I t_out
565 is the address of a
566 .BR struct bench_timing
567 in which the measurement should be left.
568 This structure is described below.
569 .TP
570 .I base
571 is a count of the number of operations performed
572 by each iteration of the benchmark function.
573 .TP
574 .I fn
575 is a benchmark function, described above.
576 .TP
577 .I ctx
578 is a pointer to be passed to the benchmark function.
579 .B bench_measure
580 does not interpret this pointer in any way.
581 .PP
582 The
583 .B bench_measure
584 function calls its benchark function repeatedly
585 with different iteration counts
586 .IR n ,
587 with the objective that the call take approximately
588 .B target_s
589 seconds, as established in the benchmark state.
590 (Currently, if
591 .B target_s
592 holds the value
593 .IR t ,
594 then
595 .B bench_measure
596 is satisfied when a call takes at least
597 .IR t /\(sr2\*,s.)
598 Once the function finds a satisfactory number of iterations,
599 it stores the results in
600 .BI * t_out \fR.
601 If measurement succeeds, then
602 .B bench_measure
603 returns zero.
604 If it fails \(en
605 most likely because the timer failed \(en
606 then it returns \-1.
607 .
608 .\"--------------------------------------------------------------------------
609 .SH "SEE ALSO"
610 .
611 .BR tvec-bench (3),
612 .BR mLib (3).
613 .
614 .\"--------------------------------------------------------------------------
615 .SH AUTHOR
616 .
617 Mark Wooding, <mdw@distorted.org.uk>
618 .
619 .\"----- That's all, folks --------------------------------------------------