Commit | Line | Data |
---|---|---|
d056fbdf MW |
1 | .\" -*-nroff-*- |
2 | .ie t .ds , \h'\w'\ 'u/2u' | |
3 | .el .ds , \ \" | |
4 | .TH bench 3 "9 March 2024" "Straylight/Edgeware" "mLib utilities library" | |
5 | .\" @bench_createtimer | |
6 | .\" @bench_init | |
7 | .\" @bench_destroy | |
8 | .\" @bench_calibrate | |
9 | .\" @bench_measure | |
10 | . | |
11 | .SH SYNOPSIS | |
12 | .nf | |
13 | .B "#include <mLib/bench.h>" | |
14 | .PP | |
15 | .ta 2n | |
16 | .B "struct bench_time {" | |
17 | .B " unsigned f;" | |
18 | .B " kludge64 s;" | |
19 | .B " uint32 ns;" | |
20 | .B " kludge64 cy;" | |
21 | .B "};" | |
22 | .PP | |
23 | .B "struct bench_timing {" | |
24 | .B " unsigned f;" | |
25 | .B " double n;" | |
26 | .B " double t;" | |
27 | .B " double cy;" | |
28 | .B "};" | |
29 | .PP | |
30 | .B "struct bench_timerops {" | |
31 | .BI " void (*describe)(struct bench_timer *" bt ", dstr *" d ); | |
32 | .BI " void (*now)(struct bench_timer *" bt ", struct bench_time *" t_out ); | |
33 | .BI " void (*destroy)(struct bench_timer *" bt ); | |
34 | .B "};" | |
35 | .B "struct bench_timer {" | |
36 | .B " const struct bench_timerops *ops;" | |
37 | .B "};" | |
38 | .PP | |
39 | .B "struct bench_state {" | |
40 | .B " unsigned f;" | |
41 | .B " double target_s;" | |
42 | .B " ..." | |
43 | .B "};" | |
44 | .PP | |
45 | .BI "typedef void bench_fn(unsigned long " n ", void *" ctx ); | |
46 | .PP | |
47 | .B "#define BTF_TIMEOK ..." | |
48 | .B "#define BTF_CYOK ..." | |
49 | .B "#define BTF_CLB ..." | |
50 | .B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)" | |
51 | .PP | |
52 | .BI "struct bench_timer *bench_createtimer(const char *" config ); | |
53 | .PP | |
54 | .BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm ); | |
55 | .BI "void bench_destroy(struct bench_state *" b ); | |
56 | .BI "int bench_calibrate(struct bench_state *" b ); | |
57 | .ta \w'\fBint bench_measure('u | |
58 | .BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out , | |
59 | .BI " double " base ", bench_fn *" fn ", void *" ctx ); | |
60 | .fi | |
61 | . | |
62 | .SH DESCRIPTION | |
63 | The header file | |
64 | .B "<mLib/bench.h>" | |
65 | provides declarations and definitions | |
66 | for performing low-level benchmarks. | |
67 | .PP | |
68 | The `main event' is | |
69 | .BR bench_measure . | |
70 | This function will be described in detail later, | |
71 | but, in brief, | |
72 | it calls a caller-provided function, | |
73 | instructing it to run adaptively chosen numbers of iterations, | |
74 | in order to get a reasonably reliable measurement of its running time, | |
75 | and then reports its results by filling in a structure. | |
76 | .PP | |
77 | With understanding this function as our objective, | |
78 | we must examine all of the pieces involved in making it work. | |
79 | . | |
80 | .SS Timers in general | |
81 | A | |
82 | .I timer | |
83 | is a gadget which is capable of reporting the current time, | |
84 | in seconds (ideally precise to tiny fractions of a second), | |
85 | and/or in CPU cycles. | |
86 | A timer is represented by a pointer to an object of type | |
87 | .BR "struct bench_timer" . | |
88 | This structure has a single member, | |
89 | .BR ops , | |
90 | pointing to a | |
91 | .BR "struct bench_timerops" , | |
92 | which is a table of function pointers; | |
93 | typically, a timer has more data following this, | |
94 | but this fact is not exposed to applications. | |
95 | .PP | |
96 | The function pointers in | |
97 | .B "struct bench_timerops" | |
98 | are as follows. | |
99 | The first argument, | |
100 | named | |
101 | .IR tm , | |
102 | must always point to the timer object itself. | |
103 | .TP | |
104 | .IB tm ->ops->describe( tm ", " d ) | |
105 | Write a description of the timer to the dynamic string | |
106 | .IR d . | |
107 | .TP | |
108 | .IB tm ->ops->now( tm ", " t_out ) | |
109 | Store the current time in | |
110 | .IR t_out . | |
111 | The | |
112 | .B struct bench_time | |
113 | used to represent the time reported by a timer | |
114 | is described in detail below. | |
115 | .TP | |
116 | .IB tm ->ops->destroy( tm ) | |
117 | Destroy the timer, | |
118 | releasing all of the resources that it holds. | |
119 | .PP | |
120 | A time, as reported by a timer, is represented by the | |
121 | .BR "struct bench_time" . | |
122 | A passage-of-time measurement is stored in the | |
123 | .B s | |
124 | and | |
125 | .B ns | |
126 | members, holding seconds and nanoseconds respectively. | |
127 | (A timer need not have nanosecond precision. | |
128 | The exact interpretation of the time \(en | |
129 | e.g., whether it measures wallclock time, | |
130 | user-mode CPU time, | |
131 | or total thread CPU time \(en | |
132 | is a matter for the specific timer implementation.) | |
133 | A cycle count is stored in the | |
134 | .B cy | |
135 | member. | |
136 | The | |
137 | .B f | |
138 | member stores flags: | |
139 | .B BTF_TIMEOK | |
140 | is set if the passage-of-time measurement | |
141 | .B s | |
142 | and | |
143 | .B ns | |
144 | are valid; and | |
145 | .B BTF_CYOK | |
146 | is set if the cycle count | |
147 | .B cy | |
148 | is valid. | |
149 | Neither the time nor the cycle count need be measured | |
150 | relative to any particular origin. | |
151 | The mask | |
152 | .B BTF_ANY | |
153 | covers the | |
154 | .B BTF_TIMEOK | |
155 | and | |
156 | .B BTF_CYOK | |
157 | bits: | |
158 | hence, | |
159 | .IB f &BTF_ANY | |
160 | is nonzero (true) | |
161 | if the timer returned any valid timing information. | |
162 | . | |
163 | .SS The built-in timer | |
164 | The function | |
165 | .B bench_createtimer | |
166 | constructs and returns a timer. | |
167 | It takes a single argument, | |
168 | a string | |
169 | .IR config , | |
170 | from which it reads configuration information. | |
171 | If | |
172 | .B bench_createtimer | |
173 | fails, it returns a null pointer. | |
174 | .PP | |
175 | The | |
176 | .I config | |
177 | pointer may safely be null, | |
178 | in which case a default configuration will be used. | |
179 | Applications | |
180 | .I should only | |
181 | set this pointer to a value supplied by a user, | |
182 | e.g., through a command-line argument, | |
183 | environment variable, or | |
184 | configuration file. | |
185 | .PP | |
186 | The built-in timer makes use of one or two | |
187 | .IR subtimers : | |
188 | a `clock' subtimer to measure the passage of time, | |
189 | and possibly a `cycle' subtimer to count CPU cycles. | |
190 | .PP | |
191 | The configuration string consists of a sequence of words | |
192 | separated by whitespace. | |
193 | There may be additional whitespace at the start and end of the string. | |
194 | The words recognized are as follows. | |
195 | .TP | |
196 | .B list | |
197 | Prints a list of the available clock and cycle subtimers | |
198 | to standard output. | |
199 | .TP | |
200 | .BI clock= t , ... | |
201 | Use the first of the listed clock subtimers | |
202 | to initialize successfully | |
203 | as the clock subtimer. | |
204 | If none of the subtimers can be initialized, | |
205 | then construction of the timer as a whole fails. | |
206 | .TP | |
207 | .BI cycle= t , ... | |
208 | Use the first of the listed subtimers | |
209 | to initialize successfully | |
210 | as the cycle subtimer. | |
211 | If none of the subtimers can be initialized, | |
212 | then construction of the timer as a whole fails. | |
213 | .PP | |
214 | The clock subtimers are as follows. | |
215 | Not all of them will be available on every platform. | |
216 | .TP | |
217 | .B posix-thread-cputime | |
218 | Measures the passage of time using | |
219 | .BR clock_gettime (2), | |
220 | specifying the | |
221 | .B CLOCK_\%THREAD_\%CPUTIME_\%ID | |
222 | clock. | |
223 | .TP | |
224 | .B stdc-clock | |
225 | Measures the passage of time using | |
226 | .BR clock (3). | |
227 | Since | |
228 | .BR clock (3) | |
229 | is part of the original ANSI\ C standard, | |
230 | this subtimer should always be available. | |
231 | However, it may produce unhelpful results | |
232 | if other threads are running. | |
233 | .PP | |
234 | The cycle subtimers are as follows. | |
235 | Not all of them will be available on every platform. | |
236 | .TP | |
237 | .B linux-perf-event | |
238 | Counts CPU cycles using the Linux-specific | |
239 | .BR perf_event_open (2) | |
240 | function to read the | |
241 | .BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES | |
242 | counter. | |
243 | Only available on Linux. | |
244 | It will fail to initialize | |
245 | if access to performance counters is restricted, | |
246 | e.g., because the | |
247 | .B /proc/sys/kernel/perf_event_paranoid | |
248 | level is too high. | |
249 | .TP | |
250 | .B x86-rdtsc | |
251 | Counts CPU cycles using the x86 | |
252 | .B rdtsc | |
253 | instruction. | |
254 | This instruction is not really suitable for performance measurement: | |
255 | it gives misleading results on CPUs with variable clock frequency. | |
256 | .TP | |
257 | .B null | |
258 | A dummy cycle counter, | |
259 | which will initialize successfully | |
260 | and then fail to report cycle counts. | |
261 | This is a reasonable fallback in many situations. | |
262 | .PP | |
263 | The built-in preference order for clock subtimers, | |
264 | from most to least preferred, is | |
265 | .B posix-thread-cputime | |
266 | followed by | |
267 | .BR stdc-clock . | |
268 | The built-in preference order for cycle subtimers, | |
269 | from most to least preferred, is | |
270 | .B linux-perf-event | |
271 | followed by | |
272 | .BR x86-rdtsc , | |
273 | and then | |
274 | .BR null . | |
275 | . | |
276 | .SS The benchmark state | |
277 | A | |
278 | .I benchmark state | |
279 | tracks the information needed to measure performance of functions. | |
280 | It is represented by a | |
281 | .B struct bench_state | |
282 | structure. | |
283 | .PP | |
284 | The benchmark state is initialized by calling | |
285 | .BR bench_init , | |
286 | passing the address of the state structure to be initialized, | |
287 | and a pointer to a timer. | |
288 | If | |
289 | .B bench_init | |
290 | is called with a non-null timer pointer, | |
291 | then it will not fail; | |
292 | the benchmark state will be initialized, | |
293 | and the function returns zero. | |
294 | If the timer pointer is null, | |
295 | then | |
296 | .B bench_init | |
297 | attempts to construct a timer for itself | |
298 | by calling | |
299 | .BR bench_createtimer . | |
300 | If this succeeds, | |
301 | then the benchmark state will be initialized, | |
302 | and the function returns zero. | |
303 | In both cases, | |
304 | the timer becomes owned by the benchmark state: | |
305 | calling | |
306 | .B bench_destroy | |
307 | on the benchmark state will destroy the timer. | |
308 | If | |
309 | .B bench_init | |
310 | is called with a null timer pointer, | |
311 | and its attempt to create a timer for itself fails, | |
312 | then | |
313 | .B bench_init | |
314 | returns \-1; | |
315 | the benchmark state is not initialized | |
316 | and can safely be discarded; | |
317 | calling | |
318 | .B bench_destroy | |
319 | on the unsuccessfully initialized benchmark state | |
320 | is safe and has no effect. | |
321 | .PP | |
322 | Calling | |
323 | .B bench_destroy | |
324 | on a benchmark state | |
325 | releases any resources it holds, | |
326 | most notably its timer, if any. | |
327 | .PP | |
328 | Although | |
329 | .B struct bench_state | |
330 | is defined in the header file, | |
331 | only two members are available for use by applications. | |
332 | .TP | |
333 | .B f | |
334 | A word containing flags. | |
335 | .TP | |
336 | .B target_s | |
337 | The target time for which to try to run a benchmark, in seconds. | |
338 | After initialization, this is set to 1.0, | |
339 | though applications can override it. | |
340 | .PP | |
341 | Before the benchmark state can be used in measurements, | |
342 | it must be | |
343 | .IR calibrated . | |
344 | This is performed by calling | |
345 | .B bench_calibrate | |
346 | on the benchmark state. | |
347 | Calibration takes a noticeable amount of time | |
348 | (currently about 0.25\*,s), | |
349 | so it makes sense to defer it until it's known to be necessary. | |
350 | .PP | |
351 | Calibration is carried out separately, but in parallel, | |
352 | for the timer's passage-of-time measurement and cycle counter. | |
353 | Either or both of these calibrations can succeed or fail; | |
354 | if passage-of-time calibration fails, | |
355 | then cycle count calibration is impossible. | |
356 | .PP | |
357 | When it completes, | |
358 | .B bench_calibrate | |
359 | sets flags in the benchmark state's | |
360 | .B f | |
361 | member: | |
362 | if passage-of-time calibration succeeded, | |
363 | .B BTF_TIMEOK | |
364 | is set; | |
365 | if cycle-count calibration succeeded, | |
366 | .B BTF_CYOK | |
367 | is set; | |
368 | and the flag | |
369 | .B BTF_CLB | |
370 | is set unconditionally, | |
371 | as a persistent indication that calibration has been attempted. | |
372 | .PP | |
373 | The | |
374 | .B bench_calibrate | |
375 | function returns zero if it successfully calibrated | |
376 | at least the passage-of-time measurement; | |
377 | otherwise, it returns \-1. | |
378 | If | |
379 | .B bench_calibrate | |
380 | is called for a second or subsequent time on the same benchmark state, | |
381 | it returns immediately, | |
382 | returning either 0 or \-1 | |
383 | according to whether passage-of-time measurement was previously calibrated successfully. | |
384 | . | |
385 | .SS Timing functions | |
386 | A | |
387 | .I benchmark function | |
388 | has the signature | |
389 | .IP | |
390 | .BI "void " fn "(unsigned long " n ", void *" ctx ); | |
391 | .PP | |
392 | When called, it should perform the operation to be measured | |
393 | .I n | |
394 | times. | |
395 | The | |
396 | .I ctx | |
397 | argument is a pointer passed into | |
398 | .B bench_measure | |
399 | for the benchmark function's own purposes. | |
400 | .PP | |
401 | The function | |
402 | .B bench_measure | |
403 | receives five arguments. | |
404 | .TP | |
405 | .I b | |
406 | points to the benchmark state to be used. | |
407 | .TP | |
408 | .I t_out | |
409 | is the address of a | |
410 | .B "struct bench_timing" | |
411 | in which the measurement should be left. | |
412 | This structure is described below. | |
413 | .TP | |
414 | .I base | |
415 | is a count of the number of operations performed | |
416 | by each iteration of the benchmark function. | |
417 | .TP | |
418 | .I fn | |
419 | is a benchmark function, described above. | |
420 | .TP | |
421 | .I ctx | |
422 | is a pointer to be passed to the benchmark function. | |
423 | .B bench_measure | |
424 | does not interpret this pointer in any way. | |
425 | .PP | |
426 | The | |
427 | .B bench_measure | |
428 | function calls its benchmark function repeatedly | |
429 | with different iteration counts | |
430 | .IR n , | |
431 | with the objective that the call take approximately | |
432 | .B target_s | |
433 | seconds, as established in the benchmark state. | |
434 | (Currently, if | |
435 | .B target_s | |
436 | holds the value | |
437 | .IR t , | |
438 | then | |
439 | .B bench_measure | |
440 | is satisfied when a call takes at least | |
441 | .IR t /\(sr2\*,s.) | |
442 | Once the function finds a satisfactory number of iterations, | |
443 | it stores the results in | |
444 | .BI * t_out \fR. | |
445 | If measurement succeeds, then | |
446 | .B bench_measure | |
447 | returns zero. | |
448 | If it fails \(en | |
449 | most likely because the timer failed \(en | |
450 | then it returns \-1. | |
451 | .PP | |
452 | A | |
453 | .B "struct bench_timing" | |
454 | structure reports the outcome of a successful measurement. | |
455 | It has four members. | |
456 | .TP | |
457 | .B f | |
458 | A flags word. | |
459 | .B BTF_TIMEOK | |
460 | is set if the passage-of-time measurement in | |
461 | .B t | |
462 | is valid; | |
463 | .B BTF_CYOK | |
464 | is set if the cycle count in | |
465 | .B cy | |
466 | is valid. | |
467 | .TP | |
468 | .B n | |
469 | The number of iterations performed by the benchmark function | |
470 | on its satisfactory run, | |
471 | multiplied by | |
472 | .IR base . | |
473 | .TP | |
474 | .B t | |
475 | The time taken for the satisfactory run of the benchmark function, | |
476 | in seconds. | |
477 | Only valid if | |
478 | .B BTF_TIMEOK | |
479 | is set in | |
480 | .BR f . | |
481 | .TP | |
482 | .B cy | |
483 | The number of CPU cycles used | |
484 | in the satisfactory run | |
485 | of the benchmark function. | |
486 | Only valid if | |
487 | .B BTF_CYOK | |
488 | is set in | |
489 | .BR f . | |
490 | . | |
491 | .SH "SEE ALSO" | |
492 | .BR mLib (3). | |
493 | . | |
494 | .SH AUTHOR | |
495 | Mark Wooding, <mdw@distorted.org.uk> |