@@@ misc wip
[mLib] / test / bench.3.in
CommitLineData
d056fbdf 1.\" -*-nroff-*-
c4ccbbf9
MW
2.\"
3.\" Manual for benchmarking core
4.\"
5.\" (c) 2024 Straylight/Edgeware
6.\"
7.
8.\"----- Licensing notice ---------------------------------------------------
9.\"
10.\" This file is part of the mLib utilities library.
11.\"
12.\" mLib is free software: you can redistribute it and/or modify it under
13.\" the terms of the GNU Library General Public License as published by
14.\" the Free Software Foundation; either version 2 of the License, or (at
15.\" your option) any later version.
16.\"
17.\" mLib is distributed in the hope that it will be useful, but WITHOUT
18.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19.\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20.\" License for more details.
21.\"
22.\" You should have received a copy of the GNU Library General Public
23.\" License along with mLib. If not, write to the Free Software
24.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25.\" USA.
26.
27.\"--------------------------------------------------------------------------
28.so ../defs.man \" @@@PRE@@@
29.
30.\"--------------------------------------------------------------------------
31.TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
d056fbdf
MW
32.\" @bench_createtimer
33.\" @bench_init
34.\" @bench_destroy
35.\" @bench_calibrate
36.\" @bench_measure
37.
c4ccbbf9
MW
38.\"--------------------------------------------------------------------------
39.SH NAME
40bench \- low-level benchmarking tools
41.
42.\"--------------------------------------------------------------------------
d056fbdf 43.SH SYNOPSIS
c4ccbbf9 44.
d056fbdf
MW
45.nf
46.B "#include <mLib/bench.h>"
47.PP
6e683a79 48.ta 2n +2n +2n
d056fbdf
MW
49.B "struct bench_time {"
50.B " unsigned f;"
6e683a79
MW
51.B " union {"
52.B " struct { kludge64 s; uint32 ns; } ts;"
53.B " clock_t clk;"
54.B " kludge64 rawns;"
55.B " } t;"
d056fbdf
MW
56.B " kludge64 cy;"
57.B "};"
58.PP
59.B "struct bench_timing {"
60.B " unsigned f;"
61.B " double n;"
62.B " double t;"
63.B " double cy;"
64.B "};"
65.PP
6e683a79
MW
66.B "#define BTF_T0 0u"
67.B "#define BTF_T1 ..."
d056fbdf
MW
68.B "struct bench_timerops {"
69.BI " void (*describe)(struct bench_timer *" bt ", dstr *" d );
6e683a79
MW
70.ta 2n +\w'\fBint (*now)('u
71.BI " int (*now)(struct bench_timer *" bt ,
72.BI " struct bench_time *" t_out ", unsigned " f );
73.ta 2n +\w'\void (*diff)('u
74.BI " void (*diff)(struct bench_timer *" bt ,
75.BI " struct bench_timing *" delta_out ,
76.BI " const struct bench_time *" t0 ,
77.BI " const struct bench_time *" t1 );
d056fbdf
MW
78.BI " void (*destroy)(struct bench_timer *" bt );
79.B "};"
80.B "struct bench_timer {"
81.B " const struct bench_timerops *ops;"
82.B "};"
83.PP
84.B "struct bench_state {"
85.B " unsigned f;"
86.B " double target_s;"
87.B " ..."
88.B "}";
89.PP
90.BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
91.PP
92.B "#define BTF_TIMEOK ..."
93.B "#define BTF_CYOK ..."
94.B "#define BTF_CLB ..."
95.B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
96.PP
97.B "struct bench_timer *bench_createtimer(void);"
98.PP
99.BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
100.BI "void bench_destroy(struct bench_state *" b );
101.BI "int bench_calibrate(struct bench_state *" b );
102.ta \w'\fBint bench_measure('u
103.BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
104.BI " double " base ", bench_fn *" fn ", void *" ctx );
105.fi
106.
c4ccbbf9 107.\"--------------------------------------------------------------------------
d056fbdf 108.SH DESCRIPTION
c4ccbbf9 109.
d056fbdf
MW
110The header file
111.B "<mLib/bench.h>"
112provides declarations and defintions
113for performing low-level benchmarks.
114.PP
115The `main event' is
116.BR bench_measure .
117This function will be described in detail later,
118but, in brief,
119it calls a caller-provided function,
120instructing it to run adaptively chosen numbers of iterations,
121in order to get a reasonably reliable measurement of its running time,
122and then reports its results by filling in a structure.
123.PP
124With understanding this function as our objective,
125we must examine all of the pieces involved in making it work.
126.
127.SS Timers in general
128A
129.I timer
130is a gadget which is capable of reporting the current time,
131in seconds (ideally precise to tiny fractions of a second),
132and/or in CPU cycles.
133A timer is represented by a pointer to an object of type
134.BR "struct bench_timer" .
135This structure has a single member,
136.BR ops ,
137pointing to a
138.BR "struct bench_timerops" ,
139which is a table of function pointers;
140typically, a timer has more data following this,
141but this fact is not exposed to applications.
142.PP
143The function pointers in
144.B "struct bench_timerops"
145are as follows.
146The first argument,
147named
148.I tm
149must always point to the timer object itself.
150.TP
151.IB tm ->ops->describe( tm ", " d)
152Write a description of the timer to the dynamic string
153.IR d .
154.TP
6e683a79 155.IB tm ->ops->now( tm ", " t_out ", " f )
d056fbdf 156Store the current time in
6e683a79 157.BI * t_out \fR.
d056fbdf 158The
6e683a79
MW
159.B BTF_T1
160flag in
161.I f
162to indicate that this is the second call in a pair;
163leave it clear for the first call.
164(A fake
165.B BTF_T0
166flag is defined to be zero for symmetry.)
167Return zero on success
168.I or
169permanent failure;
170return \-1 if timing failed but
171trying again immediately has a reasonable chance of success.
172.TP
173.IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
174Store in
175.BI * delta_out
176the difference between the two times
177.I t0
178and
179.IR t1 .
d056fbdf
MW
180.TP
181.IB tm ->ops->destroy( tm )
182Destroy the timer,
183releasing all of the resources that it holds.
184.PP
6e683a79
MW
185A
186.B bench_timing
187structure reports the difference between two times,
188as determined by a timer's
189.B diff
190function.
191It has four members.
192.TP
d056fbdf 193.B f
6e683a79 194A flags word.
d056fbdf 195.B BTF_TIMEOK
6e683a79
MW
196is set if the passage-of-time measurement in
197.B t
198is valid;
d056fbdf 199.B BTF_CYOK
6e683a79 200is set if the cycle count in
d056fbdf
MW
201.B cy
202is valid.
d056fbdf
MW
203The mask
204.B BTF_ANY
205covers the
206.B BTF_TIMEOK
207and
208.B BTF_CYOK
209bits:
210hence,
6e683a79 211.B f&BTF_ANY
d056fbdf
MW
212is nonzero (true)
213if the timer returned any valid timing information.
6e683a79
MW
214.TP
215.B n
216The number of iterations performed by the benchmark function
217on its satisfactory run,
218multiplied by
219.IR base .
220.TP
221.B t
222The time taken for the satisfactory run of the benchmark function,
223in seconds.
224Only valid if
225.B BTF_TIMEOK
226is set in
227.BR f .
228.TP
229.B cy
230The number of CPU cycles used
231in the satisfactory run of the benchmark function,
232in seconds.
233Only valid if
234.B BTF_CYOK
235is set in
236.BR f .
237.PP
238A
239.B "struct bench_time"
5c0f2e08 240represents a single instant in time,
6e683a79
MW
241as captured by a timer's
242.B now
243function.
244The use of this structure is a private matter for the timer:
245the only hard requirement is that the
246.B diff
247function should be able to compute the difference between two times.
248However, the intent is that
249a passage-of-time measurement is stored in the
250.B t
251union,
252a cycle count is stored in the
253.B cy
254member, and
255the
256.B f
257member stores flags
258.B BTF_TIMEOK
259and or
260.B BTF_CYOK
261if the passage-of-time or cycle count respectively are valid.
d056fbdf
MW
262.
263.SS The built-in timer
264The function
265.B bench_createtimer
266constructs and returns a timer.
267It takes a single argument,
268a string
269.IR config ,
270from which it reads configuration information.
271If
272.B bench_createtimer
273fails, it returns a null pointer.
274.PP
275The
276.I config
277pointer may safely be null,
278in which case a default configuration will be used.
279Applications
280.I should only
281set this pointer to a value supplied by a user,
282e.g., through a command-line argument,
283environment variable, or
284configuration file.
285.PP
286The built-in timer makes use of one or two
287.IR subtimers :
288a `clock' subtimer to measure the passage of time,
289and possibly a `cycle' subtimer to count CPU cycles.
290.PP
291The configuration string consists of a sequence of words
292separated by whitespace.
293There may be additional whitespace at the start and end of the string.
294The words recognized are as follows.
295.TP
296.B list
297Prints a list of the available clock and cycle subtimers
298to standard output.
299.TP
300.BI clock= t , ...
301Use the first of the listed clock subtimers
302to initialize successfully
303as the clock subtimer.
304If none of the subtimers can be initialized,
305then construction of the timer as a whole fails.
306.TP
307.BI cycle= t , ...
308Use the first of the listed subtimers
309to initialize successfully
310as the cycle subtimer.
311If none of the subtimers can be initialized,
312then construction of the timer as a whole fails.
313.PP
314The clock subtimers are as follows.
315Not all of them will be available on every platform.
316.TP
6e683a79
MW
317.B linux-x86-perf-rdpmc-hw-cycles
318This is a dummy companion to the similarly named cycle subtimer;
319see its description below.
320.TP
d056fbdf
MW
321.B posix-thread-cputime
322Measures the passage of time using
323.BR clock_gettime (2),
324specifying the
325.B CLOCK_\%THREAD_\%CPUTIME_\%ID
326clock.
327.TP
328.B stdc-clock
329Measures the passage of time using
330.BR clock (3).
331Since
332.BR clock (3)
333is part of the original ANSI\ C standard,
334this subtimer should always be available.
335However, it may produce unhelpful results
336if other threads are running.
337.PP
338The cycle subtimers are as follows.
339Not all of them will be available on every platform.
340.TP
6e683a79
MW
341.B linux-perf-read-hw-cycles
342Counts CPU cycles using the Linux-specific
d056fbdf
MW
343.BR perf_event_open (2)
344function to read the
345.BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
346counter.
347Only available on Linux.
348It will fail to initialize
349if access to performance counters is restricted,
350e.g., because the
351.B /proc/sys/kernel/perf_event_paranoid
352level is too high.
353.TP
6e683a79
MW
354.B linux-perf-rdpmc-hw-cycles
355Counts CPU cycles using the Linux-specific
356.BR perf_event_open (2)
357function,
358as for
359.B linux-x86-perf-read-hw-cycles
360above,
361except that it additionally uses the i386/AMD64
d056fbdf 362.B rdtsc
6e683a79
MW
363and
364.B rdpmc
365instructions,
366together with information provided by the kernel
367through a memory-mapped page
368to do its measurements without any system call overheads.
369It does passage-of-time and cycle counting in a single operation,
370so no separate clock subtimer is required:
371the similarly-named clock subtimer does nothing
372except check that the
373.B linux-x86-perf-rdpmc-hw-cycles
374cycle subtimer has been selected.
375This is almost certainly the best choice if it's available.
376.TP
377.B x86-rdtscp
378Counts CPU cycles using the x86
379.B rdtscp
d056fbdf
MW
380instruction.
381This instruction is not really suitable for performance measurement:
382it gives misleading results on CPUs with variable clock frequency.
383.TP
6e683a79
MW
384.B x86-rdtsc
385Counts CPU cycles using the x86
386.B rdtsc
387instruction.
388This has the downsides of
389.B rdtscp
390above,
391but also fails to detect when the thread has been suspended
392or transferred to a different CPU core
393and gives misleading answers in this case.
394Not really recommended.
395.TP
d056fbdf
MW
396.B null
397A dummy cycle counter,
398which will initialize successfully
399and then fail to report cycle counts.
400This is a reasonable fallback in many situations.
401.PP
402The built-in preference order for clock subtimers,
403from most to least preferred, is
6e683a79 404.BR linux-x86-perf-rdpmc-hw-cycles ,
d056fbdf 405followed by
6e683a79
MW
406.BR posix-thread-cputime ,
407and finally
d056fbdf
MW
408.BR stdc-clock .
409The built-in preference order for cycle subtimers,
410from most to least preferred, is
6e683a79
MW
411.BR linux-x86-perf-rdpmc-hw-cycles
412then
413.BR linux-x86-perf-read-hw-cycles ,
d056fbdf 414followed by
6e683a79
MW
415.BR x86-rdtscp ,
416and
d056fbdf 417.BR x86-rdtsc ,
6e683a79 418and finally
d056fbdf
MW
419.BR null .
420.
421.SS The benchmark state
422A
423.I benchmark state
424tracks the information needed to measure performance of functions.
425It is represented by a
426.B struct bench_state
427structure.
428.PP
429The benchmark state is initialized by calling
430.BR bench_init ,
431passing the address of the state structure to be initialized,
432and a pointer to a timer.
433If
434.B bench_init
435is called with a non-null timer pointer,
436then it will not fail;
437the benchmark state will be initialized,
438and the function returns zero.
439If the timer pointer is null,
440then
441.B bench_init
442attempts to construct a timer for itself
443by calling
444.BR bench_createtimer .
445If this succeeds,
446then the benchmark state will be initialized,
447and the function returns zero.
448In both cases,
449the timer becomes owned by the benchmark state:
450calling
451.B bench_destroy
452on the benchmark state will destroy the timer.
453If
454.B bench_init
455is called with a null timer pointer,
456and its attempt to create a timer for itself fails,
457then
458.B bench_init
459returns \-1;
460the benchmark state is not initialized
461and can safely be discarded;
462calling
463safe to call
464.B bench_destroy
465on the unsuccessfully benchmark state is safe and has no effect.
466.PP
467Calling
468.B bench_destroy
469on a benchmark state
470releases any resources it holds,
471most notably its timer, if any.
472.PP
473Although
474.B struct bench_state
475is defined in the header file,
476only two members are available for use by applications.
477.TP
478.B f
479A word containing flags.
480.TP
481.B target_s
482The target time for which to try run a benchmark, in seconds.
483After initialization, this is set to 1.0,
484though applications can override it.
485.PP
486Before the benchmark state can be used in measurements,
487it must be
488.IR calibrated .
489This is performed by calling
490.B bench_calibrate
491on the benchmark state.
492Calibration takes a noticeable amount of time
493(currently about 0.25\*,s),
494so it makes sense to defer it until it's known to be necessary.
495.PP
496Calibration is carried out separately, but in parallel,
497for the timer's passage-of-time measurement and cycle counter.
498Either or both of these calibrations can succeed or fail;
499if passage-of-time calibration fails,
500then cycle count calibration is impossible.
501.PP
502When it completes,
503.B bench_calibrate
504sets flag in the benchmark state's
505.B f
506member:
507if passage-of-time calibration succeeded,
508.B BTF_TIMEOK
509is set;
510if cycle-count calibration succeeded,
511.B BTF_CYOK
512is set;
513and the flag
514.B BTF_CLB
515is set unconditionally,
516as a persistent indication that calibration has been attempted.
517.PP
518The
519.B bench_calibrate
520function returns zero if it successfully calibrated
521at least the passage-of-time measurement;
522otherwise, it returns \-1.
523If
524.B bench_calibrate
525is called for a second or subsequent time on the same benchmark state,
526it returns immediately,
527either returning 0 or \-1
528according to whether passage-of-time had previously been calibrated.
529.
530.SS Timing functions
531A
532.I benchmark function
533has the signature
534.IP
535.BI "void " fn "(unsigned long " n ", void *" ctx );
536.PP
537When called, it should perform the operation to be measured
538.I n
539times.
540The
541.I ctx
542argument is a pointer passed into
543.B bench_measure
544for the benchmark function's own purposes.
545.PP
546The function
547.B bench_measure
548receives five arguments.
549.TP
550.I b
551points to the benchmark state to be used.
552.TP
553.I t_out
554is the address of a
555.BR struct bench_timing
556in which the measurement should be left.
557This structure is described below.
558.TP
559.I base
560is a count of the number of operations performed
561by each iteration of the benchmark function.
562.TP
563.I fn
564is a benchmark function, described above.
565.TP
566.I ctx
567is a pointer to be passed to the benchmark function.
568.B bench_measure
569does not interpret this pointer in any way.
570.PP
571The
572.B bench_measure
573function calls its benchark function repeatedly
574with different iteration counts
575.IR n ,
576with the objective that the call take approximately
577.B target_s
578seconds, as established in the benchmark state.
579(Currently, if
580.B target_s
581holds the value
582.IR t ,
583then
584.B bench_measure
585is satisfied when a call takes at least
586.IR t /\(sr2\*,s.)
587Once the function finds a satisfactory number of iterations,
588it stores the results in
589.BI * t_out \fR.
590If measurement succeeds, then
591.B bench_measure
592returns zero.
593If it fails \(en
594most likely because the timer failed \(en
595then it returns \-1.
d056fbdf 596.
c4ccbbf9 597.\"--------------------------------------------------------------------------
d056fbdf 598.SH "SEE ALSO"
c4ccbbf9
MW
599.
600.BR tvec-bench (3),
d056fbdf
MW
601.BR mLib (3).
602.
c4ccbbf9 603.\"--------------------------------------------------------------------------
d056fbdf 604.SH AUTHOR
c4ccbbf9 605.
d056fbdf 606Mark Wooding, <mdw@distorted.org.uk>
c4ccbbf9
MW
607.
608.\"----- That's all, folks --------------------------------------------------