.\" -*-nroff-*-
.\"
.\" Manual for benchmarking core
.\"
.\" (c) 2024 Straylight/Edgeware
.\"
.
.\"----- Licensing notice ---------------------------------------------------
.\"
.\" This file is part of the mLib utilities library.
.\"
.\" mLib is free software: you can redistribute it and/or modify it under
.\" the terms of the GNU Library General Public License as published by
.\" the Free Software Foundation; either version 2 of the License, or (at
.\" your option) any later version.
.\"
.\" mLib is distributed in the hope that it will be useful, but WITHOUT
.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
.\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
.\" License for more details.
.\"
.\" You should have received a copy of the GNU Library General Public
.\" License along with mLib.  If not, write to the Free Software
.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
.\" USA.
.
.\"--------------------------------------------------------------------------
.so ../defs.man \" @@@PRE@@@
.
.\"--------------------------------------------------------------------------
.TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
.\" @bench_createtimer
.\" @bench_init
.\" @bench_destroy
.\" @bench_calibrate
.\" @bench_measure
.
.\"--------------------------------------------------------------------------
.SH NAME
bench \- low-level benchmarking tools
.
.\"--------------------------------------------------------------------------
.SH SYNOPSIS
.
.nf
.B "#include <mLib/bench.h>"
.PP
.ta 2n +2n +2n
.B "struct bench_time {"
.B "	unsigned f;"
.B "	union {"
.B "		struct { kludge64 s; uint32 ns; } ts;"
.B "		clock_t clk;"
.B "		kludge64 rawns;"
.B "	} t;"
.B "	kludge64 cy;"
.B "};"
.PP
.B "struct bench_timing {"
.B "	unsigned f;"
.B "	double n;"
.B "	double t;"
.B "	double cy;"
.B "};"
.PP
.B "#define BTF_T0 0u"
.B "#define BTF_T1 ..."
.B "struct bench_timerops {"
.BI "	void (*describe)(struct bench_timer *" bt ", dstr *" d );
.ta 2n +\w'\fBint (*now)('u
.BI "	int (*now)(struct bench_timer *" bt ,
.BI "		struct bench_time *" t_out ", unsigned " f );
.ta 2n +\w'\fBvoid (*diff)('u
.BI "	void (*diff)(struct bench_timer *" bt ,
.BI "		struct bench_timing *" delta_out ,
.BI "		const struct bench_time *" t0 ,
.BI "		const struct bench_time *" t1 );
.BI "	void (*destroy)(struct bench_timer *" bt );
.B "};"
.B "struct bench_timer {"
.B "	const struct bench_timerops *ops;"
.B "};"
.PP
.B "struct bench_state {"
.B "	unsigned f;"
.B "	double target_s;"
.B "	..."
.B "};"
.PP
.BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
.PP
.B "#define BTF_TIMEOK ..."
.B "#define BTF_CYOK ..."
.B "#define BTF_CLB ..."
.B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
.PP
.BI "struct bench_timer *bench_createtimer(const char *" config );
.PP
.BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
.BI "void bench_destroy(struct bench_state *" b );
.BI "int bench_calibrate(struct bench_state *" b );
.ta \w'\fBint bench_measure('u
.BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
.BI "	double " base ", bench_fn *" fn ", void *" ctx );
.fi
.
.\"--------------------------------------------------------------------------
.SH DESCRIPTION
.
The header file
.B "<mLib/bench.h>"
provides declarations and definitions
for performing low-level benchmarks.
.PP
The `main event' is
.BR bench_measure .
This function will be described in detail later,
but, in brief,
it calls a caller-provided function,
instructing it to run adaptively chosen numbers of iterations,
in order to get a reasonably reliable measurement of its running time,
and then reports its results by filling in a structure.
.PP
With understanding this function as our objective,
we must examine all of the pieces involved in making it work.
.
.SS Timers in general
A
.I timer
is a gadget which is capable of reporting the current time,
in seconds (ideally precise to tiny fractions of a second),
and/or in CPU cycles.
A timer is represented by a pointer to an object of type
.BR "struct bench_timer" .
This structure has a single member,
.BR ops ,
pointing to a
.BR "struct bench_timerops" ,
which is a table of function pointers;
typically, a timer has more data following this,
but this fact is not exposed to applications.
.PP
The function pointers in
.B "struct bench_timerops"
are as follows.
The first argument,
named
.IR tm ,
must always point to the timer object itself.
.TP
.IB tm ->ops->describe( tm ", " d )
Write a description of the timer to the dynamic string
.IR d .
.TP
.IB tm ->ops->now( tm ", " t_out ", " f )
Store the current time in
.BI * t_out \fR.
The
.B BTF_T1
flag is set in
.I f
to indicate that this is the second call in a pair;
leave it clear for the first call.
(A fake
.B BTF_T0
flag is defined to be zero for symmetry.)
Return zero on success
.I or
permanent failure;
return \-1 if timing failed but
trying again immediately has a reasonable chance of success.
.TP
.IB tm ->ops->diff( tm ", " delta_out ", " t0 ", " t1 )
Store in
.BI * delta_out
the difference between the two times
.I t0
and
.IR t1 .
.TP
.IB tm ->ops->destroy( tm )
Destroy the timer,
releasing all of the resources that it holds.
.PP
A
.B "struct bench_timing"
structure reports the difference between two times,
as determined by a timer's
.B diff
function.
It has four members.
.TP
.B f
A flags word.
.B BTF_TIMEOK
is set if the passage-of-time measurement in
.B t
is valid;
.B BTF_CYOK
is set if the cycle count in
.B cy
is valid.
The mask
.B BTF_ANY
covers the
.B BTF_TIMEOK
and
.B BTF_CYOK
bits:
hence,
.B f&BTF_ANY
is nonzero (true)
if the timer returned any valid timing information.
.TP
.B n
The number of iterations performed by the benchmark function
on its satisfactory run,
multiplied by
.IR base .
.TP
.B t
The time taken for the satisfactory run of the benchmark function,
in seconds.
Only valid if
.B BTF_TIMEOK
is set in
.BR f .
.TP
.B cy
The number of CPU cycles used
in the satisfactory run of the benchmark function.
Only valid if
.B BTF_CYOK
is set in
.BR f .
.PP
A
.B "struct bench_time"
represents a single instant in time,
as captured by a timer's
.B now
function.
The use of this structure is a private matter for the timer:
the only hard requirement is that the
.B diff
function should be able to compute the difference between two times.
However, the intent is that
a passage-of-time measurement is stored in the
.B t
union,
a cycle count is stored in the
.B cy
member, and
the
.B f
member stores flags
.B BTF_TIMEOK
and/or
.B BTF_CYOK
if the passage-of-time or cycle count respectively are valid.
.
.SS The built-in timer
The function
.B bench_createtimer
constructs and returns a timer.
It takes a single argument,
a string
.IR config ,
from which it reads configuration information.
If
.B bench_createtimer
fails, it returns a null pointer.
.PP
The
.I config
pointer may safely be null,
in which case a default configuration will be used.
Applications
.I should only
set this pointer to a value supplied by a user,
e.g., through a command-line argument,
environment variable, or
configuration file.
.PP
The built-in timer makes use of one or two
.IR subtimers :
a `clock' subtimer to measure the passage of time,
and possibly a `cycle' subtimer to count CPU cycles.
.PP
The configuration string consists of a sequence of words
separated by whitespace.
There may be additional whitespace at the start and end of the string.
The words recognized are as follows.
.TP
.B list
Prints a list of the available clock and cycle subtimers
to standard output.
.TP
.BI clock= t , ...
Use the first of the listed clock subtimers
to initialize successfully
as the clock subtimer.
If none of the subtimers can be initialized,
then construction of the timer as a whole fails.
.TP
.BI cycle= t , ...
Use the first of the listed subtimers
to initialize successfully
as the cycle subtimer.
If none of the subtimers can be initialized,
then construction of the timer as a whole fails.
.PP
The clock subtimers are as follows.
Not all of them will be available on every platform.
.TP
.B linux-x86-perf-rdpmc-hw-cycles
This is a dummy companion to the similarly named cycle subtimer;
see its description below.
.TP
.B posix-thread-cputime
Measures the passage of time using
.BR clock_gettime (2),
specifying the
.B CLOCK_\%THREAD_\%CPUTIME_\%ID
clock.
.TP
.B stdc-clock
Measures the passage of time using
.BR clock (3).
Since
.BR clock (3)
is part of the original ANSI\ C standard,
this subtimer should always be available.
However, it may produce unhelpful results
if other threads are running.
.PP
The cycle subtimers are as follows.
Not all of them will be available on every platform.
.TP
.B linux-perf-read-hw-cycles
Counts CPU cycles using the Linux-specific
.BR perf_event_open (2)
function to read the
.BR PERF_\%COUNT_\%HW_\%CPU_\%CYCLES
counter.
Only available on Linux.
It will fail to initialize
if access to performance counters is restricted,
e.g., because the
.B /proc/sys/kernel/perf_event_paranoid
level is too high.
.TP
.B linux-x86-perf-rdpmc-hw-cycles
Counts CPU cycles using the Linux-specific
.BR perf_event_open (2)
function,
as for
.B linux-perf-read-hw-cycles
above,
except that it additionally uses the i386/AMD64
.B rdtsc
and
.B rdpmc
instructions,
together with information provided by the kernel
through a memory-mapped page
to do its measurements without any system call overheads.
It does passage-of-time and cycle counting in a single operation,
so no separate clock subtimer is required:
the similarly-named clock subtimer does nothing
except check that the
.B linux-x86-perf-rdpmc-hw-cycles
cycle subtimer has been selected.
This is almost certainly the best choice if it's available.
.TP
.B x86-rdtscp
Counts CPU cycles using the x86
.B rdtscp
instruction.
This instruction is not really suitable for performance measurement:
it gives misleading results on CPUs with variable clock frequency.
.TP
.B x86-rdtsc
Counts CPU cycles using the x86
.B rdtsc
instruction.
This has the downsides of
.B rdtscp
above,
but also fails to detect when the thread has been suspended
or transferred to a different CPU core
and gives misleading answers in this case.
Not really recommended.
.TP
.B null
A dummy cycle counter,
which will initialize successfully
and then fail to report cycle counts.
This is a reasonable fallback in many situations.
.PP
The built-in preference order for clock subtimers,
from most to least preferred, is
.BR linux-x86-perf-rdpmc-hw-cycles ,
followed by
.BR posix-thread-cputime ,
and finally
.BR stdc-clock .
The built-in preference order for cycle subtimers,
from most to least preferred, is
.BR linux-x86-perf-rdpmc-hw-cycles
then
.BR linux-perf-read-hw-cycles ,
followed by
.BR x86-rdtscp ,
and
.BR x86-rdtsc ,
and finally
.BR null .
.
.SS The benchmark state
A
.I benchmark state
tracks the information needed to measure performance of functions.
It is represented by a
.B struct bench_state
structure.
.PP
The benchmark state is initialized by calling
.BR bench_init ,
passing the address of the state structure to be initialized,
and a pointer to a timer.
If
.B bench_init
is called with a non-null timer pointer,
then it will not fail;
the benchmark state will be initialized,
and the function returns zero.
If the timer pointer is null,
then
.B bench_init
attempts to construct a timer for itself
by calling
.BR bench_createtimer .
If this succeeds,
then the benchmark state will be initialized,
and the function returns zero.
In both cases,
the timer becomes owned by the benchmark state:
calling
.B bench_destroy
on the benchmark state will destroy the timer.
If
.B bench_init
is called with a null timer pointer,
and its attempt to create a timer for itself fails,
then
.B bench_init
returns \-1;
the benchmark state is not initialized
and can safely be discarded;
calling
.B bench_destroy
on the unsuccessfully initialized benchmark state is safe and has no effect.
.PP
Calling
.B bench_destroy
on a benchmark state
releases any resources it holds,
most notably its timer, if any.
.PP
Although
.B struct bench_state
is defined in the header file,
only two members are available for use by applications.
.TP
.B f
A word containing flags.
.TP
.B target_s
The target time for which to try to run a benchmark, in seconds.
After initialization, this is set to 1.0,
though applications can override it.
.PP
Before the benchmark state can be used in measurements,
it must be
.IR calibrated .
This is performed by calling
.B bench_calibrate
on the benchmark state.
Calibration takes a noticeable amount of time
(currently about 0.25\*,s),
so it makes sense to defer it until it's known to be necessary.
.PP
Calibration is carried out separately, but in parallel,
for the timer's passage-of-time measurement and cycle counter.
Either or both of these calibrations can succeed or fail;
if passage-of-time calibration fails,
then cycle count calibration is impossible.
.PP
When it completes,
.B bench_calibrate
sets flags in the benchmark state's
.B f
member:
if passage-of-time calibration succeeded,
.B BTF_TIMEOK
is set;
if cycle-count calibration succeeded,
.B BTF_CYOK
is set;
and the flag
.B BTF_CLB
is set unconditionally,
as a persistent indication that calibration has been attempted.
.PP
The
.B bench_calibrate
function returns zero if it successfully calibrated
at least the passage-of-time measurement;
otherwise, it returns \-1.
If
.B bench_calibrate
is called for a second or subsequent time on the same benchmark state,
it returns immediately,
either returning 0 or \-1
according to whether passage-of-time had previously been calibrated.
.
.SS Timing functions
A
.I benchmark function
has the signature
.IP
.BI "void " fn "(unsigned long " n ", void *" ctx );
.PP
When called, it should perform the operation to be measured
.I n
times.
The
.I ctx
argument is a pointer passed into
.B bench_measure
for the benchmark function's own purposes.
.PP
The function
.B bench_measure
receives five arguments.
.TP
.I b
points to the benchmark state to be used.
.TP
.I t_out
is the address of a
.B "struct bench_timing"
in which the measurement should be left.
This structure is described above.
.TP
.I base
is a count of the number of operations performed
by each iteration of the benchmark function.
.TP
.I fn
is a benchmark function, described above.
.TP
.I ctx
is a pointer to be passed to the benchmark function.
.B bench_measure
does not interpret this pointer in any way.
.PP
The
.B bench_measure
function calls its benchmark function repeatedly
with different iteration counts
.IR n ,
with the objective that the call take approximately
.B target_s
seconds, as established in the benchmark state.
(Currently, if
.B target_s
holds the value
.IR t ,
then
.B bench_measure
is satisfied when a call takes at least
.IR t /\(sr2\*,s.)
Once the function finds a satisfactory number of iterations,
it stores the results in
.BI * t_out \fR.
If measurement succeeds, then
.B bench_measure
returns zero.
If it fails \(en
most likely because the timer failed \(en
then it returns \-1.
.
.\"--------------------------------------------------------------------------
.SH "SEE ALSO"
.
.BR tvec-bench (3),
.BR mLib (3).
.
.\"--------------------------------------------------------------------------
.SH AUTHOR
.
Mark Wooding, <mdw@distorted.org.uk>
.
.\"----- That's all, folks --------------------------------------------------