.\"--------------------------------------------------------------------------
.TH bench 3mLib "9 March 2024" "Straylight/Edgeware" "mLib utilities library"
.\" @bench_createtimer
+.\" @BENCH_TIMELOOP_DECLS
+.\" @BENCH_TIMELOOP_TAG
+.
.\" @bench_init
.\" @bench_destroy
.\" @bench_calibrate
+.\" @bench_preflight
+.\" @bench_adapt
+.\" @bench_adjust
+.\" @BENCH_MEASURE_DECLS
+.\" @BENCH_MEASURE_TAG
+.\" @BENCH_MEASURE
.\" @bench_measure
.
+.\" @bench_report
+.
.\"--------------------------------------------------------------------------
.SH NAME
bench \- low-level benchmarking tools
.nf
.B "#include <mLib/bench.h>"
.PP
+.B "#define BTF_TIMEOK ..."
+.B "#define BTF_CYOK ..."
+.B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
+.PP
.ta 2n +2n +2n
.B "struct bench_time {"
.B " unsigned f;"
.ta 2n +\w'\fBint (*now)('u
.BI " int (*now)(struct bench_timer *" bt ,
.BI " struct bench_time *" t_out ", unsigned " f );
-.ta 2n +\w'\void (*diff)('u
+.ta 2n +\w'\fBvoid (*diff)('u
.BI " void (*diff)(struct bench_timer *" bt ,
.BI " struct bench_timing *" delta_out ,
.BI " const struct bench_time *" t0 ,
.B "};"
.B "struct bench_timer {"
.B " const struct bench_timerops *ops;"
+.B " unsigned ref;"
.B "};"
.PP
+.B "struct bench_timer *bench_createtimer(void);"
+.B "BENCH_TIMELOOP_DECLS;"
+.ta 2n \w'\fBBENCH_TIMELOOP_TAG('u
+.BI "BENCH_TIMELOOP_TAG(" tag ", struct bench_timer *" tm ,
+.BI " struct bench_timing *" delta_out ", double " n ,
+.BI " " onbreak )
+.BI " " stmt
+.PP
+.B "#define BTF_CLB ..."
+.B "#define BTF_INDIRECT ..."
+.PP
+.ta 2n
.B "struct bench_state {"
.B " unsigned f;"
.B " double target_s;"
.PP
.BI "typedef void bench_fn(unsigned long " n ", void *" ctx );
.PP
-.B "#define BTF_TIMEOK ..."
-.B "#define BTF_CYOK ..."
-.B "#define BTF_CLB ..."
-.B "#define BTF_ANY (BTF_TIMEOK | BTF_CYOK)"
-.PP
-.B "struct bench_timer *bench_createtimer(void);"
-.PP
.BI "int bench_init(struct bench_state *" b ", struct bench_timer *" tm );
.BI "void bench_destroy(struct bench_state *" b );
-.BI "int bench_calibrate(struct bench_state *" b );
+.BI "int bench_calibrate(struct bench_state *" b ", unsigned " f );
+.BI "int bench_preflight(struct bench_state *" b );
+.ta \w'\fBint bench_adapt('u
+.BI "int bench_adapt(struct bench_state *" b ", double *" n_inout ,
+.BI " const struct bench_timing *" t );
+.ta \w'\fBint bench_adjust('u
+.BI "int bench_adjust(struct bench_state *" b ", struct bench_timing *" t_inout ,
+.BI " double " n ", double " base );
+.B "BENCH_MEASURE_DECLS;"
+.ta 2n \w'\fBBENCH_MEASURE_TAG('u
+.BI "BENCH_MEASURE_TAG(" tag ", struct bench_state *" b ,
+.BI " int &" rc ", struct bench_timing *" t_out ", double " bsae )
+.BI " " stmt
+.ta 2n \w'\fBBENCH_MEASURE('u
+.BI "BENCH_MEASURE(struct bench_state *" b ,
+.BI " int &" rc ", struct bench_timing *" t_out ", double " bsae )
+.BI " " stmt
.ta \w'\fBint bench_measure('u
.BI "int bench_measure(struct bench_state *" b ", struct bench_timing *" t_out ,
.BI " double " base ", bench_fn *" fn ", void *" ctx );
+.PP
+.ta 2n
+.B "enum {"
+.B " BTU_OP = 0,"
+.B " BTU_BYTE = 1,"
+.B " ..."
+.BI " BTU_LIMIT = " n
+.B "};"
+.ta \w'\fBvoid bench_report('u
+.BI "void bench_report(const struct gprintf_ops *" gops ", void *" go ,
+.BI " unsigned " unit ", const struct bench_timing *" t );
+.PP
.fi
.
.\"--------------------------------------------------------------------------
provides declarations and defintions
for performing low-level benchmarks.
.PP
-The `main event' is
-.BR bench_measure .
-This function will be described in detail later,
+The `main event' are the
+.B BENCH_MEASURE
+macro and
+.B bench_measure
+function.
+These will be described in detail later,
but, in brief,
-it calls a caller-provided function,
+they execute a caller-provided piece of code
instructing it to run adaptively chosen numbers of iterations,
in order to get a reasonably reliable measurement of its running time,
-and then reports its results by filling in a structure.
+and then report the results by filling in a structure.
.PP
-With understanding this function as our objective,
-we must examine all of the pieces involved in making it work.
+With understanding these as our objective,
+we must examine all of the pieces involved in making them work.
.
.SS Timers in general
A
and/or in CPU cycles.
A timer is represented by a pointer to an object of type
.BR "struct bench_timer" .
-This structure has a single member,
+This structure has two members:
.BR ops ,
pointing to a
.BR "struct bench_timerops" ,
-which is a table of function pointers;
+which is a table of function pointers,
+and
+.BR ref ,
+which is a simple reference count;
typically, a timer has more data following this,
but this fact is not exposed to applications.
.PP
.IB tm ->ops->describe( tm ", " d)
Write a description of the timer to the dynamic string
.IR d .
-.TPc
+.TP
.IB tm ->ops->preflight( tm )
Ensure that the timer is in working order,
and perform any necessary per-thread or per-process setup.
Destroy the timer,
releasing all of the resources that it holds.
.PP
+In a freshly-created timer, the
+.B ref
+member is 1.
+Applications are expected to handle the reference count themselves;
+the
+.B destroy
+function does not check or decrement the count.
+Code for destroying the timer when it's no longer needed
+might look like this.
+.VS
+if (!--tm->ref) tm->ops->destroy(tm);
+.VE
A
.B bench_timing
structure reports the difference between two times,
if the timer returned any valid timing information.
.TP
.B n
-The number of iterations performed by the benchmark function
+The number of units processed the benchmark computation
on its satisfactory run,
-multiplied by
-.IR base .
+multiplied by a given
+.IR base
+\(en see
+.BR BENCH_MEASURE ,
+.BR bench_measure ,
+and
+.BR bench_adjust .
.TP
.B t
The time taken for the satisfactory run of the benchmark function,
and or
.B BTF_CYOK
if the passage-of-time or cycle count respectively are valid.
+.PP
+The
+.B BENCH_TIMELOOP_TAG
+macro uses a timer to measure a number of iterations of a computation.
+It requires the declarations made by
+.B BENCH_TIMELOOP_DECLS
+to be in scope,
+ideally within an enclosing block
+(rather than at top-level,
+where they'll have static storage duration,
+and take longer to access).
+The macro's expansion is syntactically a statement head;
+see
+.BR control (3)
+for details about the underlying machinery.
+In more detail, the macro is invoked as
+.IP
+.nf
+.ta 2n
+.BI "BENCH_TIMELOOP_TAG(" tag ", " tm ", " delta_out ", " n ", " onbreak )
+.BI " " stmt
+.fi
+.PP
+The
+.I tag
+argument is used to distinguish
+the labels used internally by the macro:
+see
+.BR control (3)
+for details about tags.
+The macro calls on the timer
+.I tm
+to determine the initial time and cycle counts,
+performs
+.I n
+iterations of some computation,
+and calls on the timer a second time
+to determine the final time and cycle counts,
+and to store the difference in
+.BI * delta_out \fR.
+The
+.I stmt
+may be any C statement:
+when it is executed,
+the variable
+.BR _bench_n ,
+of type
+.BR "unsigned long" ,
+is in scope.
+The statement should perform
+.B _bench_n
+iterations of the computation to be measured
+\(en and do as little else as possible.
+The argument
+.I n
+to the macro
+may be larger than
+.BR ULONG_MAX :
+the macro will execute
+.I stmt
+multiple times if necessary.
+The statement is allowed to clobber
+.BR _bench_n .
+Note that
+.B BENCH_TIMELOOP_TAG
+does
+.I not
+call the timer's
+.B preflight
+function.
+If the
+.I stmt
+executes a free
+.B break
+statement
+then the statement
+.I onbreak
+is executed;
+a free
+.B continue
+statement within
+.I stmt
+currently does not have a useful behaviour.
+Free
+.B break
+and
+.B continue
+statements within
+.I onbreak
+behave normally.
+(See
+.BR control (3)
+for a definition of
+`free'
+.B break
+and
+.B continue
+statements.)
.
.SS The built-in timer
The function
except check that the
.B linux-x86-perf-rdpmc-hw-cycles
cycle subtimer has been selected.
-This is almost certainly the best choice if it's available.
+This is almost certainly the best choice if it's available;
+It is, however, not compatible with (at least some versions of)
+.BR valgrind (1);
+it will detect that it is running under
+.B valgrind
+and fail to initialize.
.TP
.B x86-rdtscp
Counts CPU cycles using the x86
is called with a non-null timer pointer,
then it will not fail;
the benchmark state will be initialized,
-and the function returns zero.
+and the function returns zero;
+the timer's reference count is
+.I not
+incremented.
If the timer pointer is null,
then
.B bench_init
then the benchmark state will be initialized,
and the function returns zero.
In both cases,
-the timer becomes owned by the benchmark state:
+the timer reference becomes owned by the benchmark state:
calling
.B bench_destroy
-on the benchmark state will destroy the timer.
+on the benchmark state will decrement the timer's reference count,
+and destroy it unless it has additional outstanding references.
If
.B bench_init
is called with a null timer pointer,
and its attempt to create a timer for itself fails,
then
.B bench_init
-returns \-1;
+returns \-1:
the benchmark state is not initialized
-and can safely be discarded;
-calling
-safe to call
-.B bench_destroy
-on the unsuccessfully benchmark state is safe and has no effect.
+and can safely be discarded.
.PP
Calling
.B bench_destroy
on a benchmark state
releases any resources it holds,
most notably its timer, if any.
+Calling
+.B bench_destroy
+on an unsuccessfully initialized benchmark state
+is safe but has no effect.
.PP
Although
.B struct bench_state
if passage-of-time calibration fails,
then cycle count calibration is impossible.
.PP
+The benchmarking state must be calibrated differently
+for different kinds of timing loop;
+this is controlled by the flags passed as the
+.I f
+argument to
+.BR bench_calibrate .
+The main difference lies in whether the code to be measured
+is called
+.IR indirectly ,
+i.e., via a function pointer.
+Set
+.B BTF_INDIRECT
+if the code is to be called indirectly;
+leave this flag clear if the code is called directly.
+The
+.B bench_measure
+function always makes indirect calls;
+the
+.B BENCH_MEASURE
+macro does not itself make indirect calls.
+Usually, a program needs only one or the other;
+if both are necessary for some reason,
+the best approach is just to set up two benchmarking states
+sharing the same timer,
+and calibrate them separately.
+.PP
When it completes,
.B bench_calibrate
-sets flag in the benchmark state's
+sets flags in the benchmark state's
.B f
member:
if passage-of-time calibration succeeded,
it returns immediately,
either returning 0 or \-1
according to whether passage-of-time had previously been calibrated.
-.
-.SS Timing functions
-A
+.PP
+The
+.B BENCH_MEASURE
+macro measures the performance of a computation.
+It requires the declarations made by
+.B BENCH_MEASURE_DECLS
+to be in scope,
+ideally within an enclosing block
+(rather than at top-level,
+where they'll have static storage duration,
+and take longer to access).
+The macro's expansion is syntactically a statement head;
+see
+.BR control (3)
+for details about the underlying machinery.
+In more detail, the macro is invoked as
+.IP
+.nf
+.ta 2n
+.BI "BENCH_MEASURE(" b ", " rc ", " t_out ", " base )
+.BI " " stmt
+.fi
+.PP
+The
+.I stmt
+can be any C statement;
+it should perform
+.B _bench_n
+iterations of the computation to be measured.
+(The variable
+.B _bench_n
+is declarared as part of
+.B BENCH_MEASURE_DECLS
+and has type
+.BR "unsigned long" .
+Before commencing measurement proper,
+the macro calls
+.BR bench_preflight ,
+described below,
+to check that everything is set up properly
+for measurements on the current thread;
+if this fails, then the macro sets
+.I rc
+to \-1.
+Otherwise, the macro executes
+.I stmt
+one or more times,
+with the objective of finding an iteration count
+.I n
+such that
+.I n
+iterations of the computation take more than
+.IB b ->target_s "" \fR/\(sr2
+seconds.
+If measurement fails,
+then
+.I rc
+is set to \-1;
+otherwise,
+.I rc
+is set to zero, and
+.BI * t_out
+is filled in with the measurement;
+.IB t_out ->n
+is set to
+.IR n "\ \(mu\ " base .
+.PP
+The
+.B BENCH_MEASURE_TAG
+macro works just like
+.B BENCH_MEASURE
+except that it takes an additional
+.I tag
+argument used to distinguish the internal labels
+used by the macro's implementation;
+this makes it possible to use
+.B BENCH_MEASURE_TAG
+as a component in more complex macros.
+See
+.BR control (3)
+for details about control-structure macros
+and the meaning and format of tags.
+.PP
+The function
+.B bench_measure
+is similar,
+except that it calls a
.I benchmark function
-has the signature
+to perform the computation.
+A benchmark function has the signature
.IP
.BI "void " fn "(unsigned long " n ", void *" ctx );
.PP
argument is a pointer passed into
.B bench_measure
for the benchmark function's own purposes.
-.PP
-The function
+The
.B bench_measure
-receives five arguments.
-.TP
-.I b
-points to the benchmark state to be used.
-.TP
-.I t_out
-is the address of a
-.BR struct bench_timing
-in which the measurement should be left.
-This structure is described below.
-.TP
-.I base
-is a count of the number of operations performed
-by each iteration of the benchmark function.
-.TP
-.I fn
-is a benchmark function, described above.
-.TP
-.I ctx
-is a pointer to be passed to the benchmark function.
+function returns zero on success,
+or \-1 on failure.
+Note that
.B bench_measure
-does not interpret this pointer in any way.
+invokes the benchmark indirectly,
+so the benchmark state should have been calibrated with
+.BR BTF_INDIRECT .
+.
+.SS Measurement utilities
+The following functions are primarily exported for the benefit of the
+.B BENCH_MEASURE
+macro,
+but are documented here in case they are useful.
.PP
The
-.B bench_measure
-function calls its benchark function repeatedly
-with different iteration counts
-.IR n ,
-with the objective that the call take approximately
-.B target_s
-seconds, as established in the benchmark state.
-(Currently, if
-.B target_s
-holds the value
-.IR t ,
-then
-.B bench_measure
-is satisfied when a call takes at least
-.IR t /\(sr2\*,s.)
-Once the function finds a satisfactory number of iterations,
-it stores the results in
-.BI * t_out \fR.
-If measurement succeeds, then
-.B bench_measure
-returns zero.
-If it fails \(en
-most likely because the timer failed \(en
-then it returns \-1.
+.B bench_preflight
+function prepares a benchmarking state for use.
+It checks that the timer is calibrated
+and suitable for measuring passage-of-time;
+it also calls the timer's
+.B preflight
+function to prepare it for measurements on the current thread.
+If these checks succeed, then
+.B bench_preflight
+returns zero;
+otherwise it returns \-1
+and the caller should not proceed with measurements.
+.PP
+The
+.B bench_adapt
+function is used to determine iteration counts.
+It is used in a loop such as the following.
+.IP
+.nf
+.ta 2n +2n
+.B "BENCH_TIMELOOP_DECLS;"
+.B "struct bench_timer *tm;"
+.B "struct bench_timing t;"
+.B "double n = 1.0, target_s = 1.0;"
+.IP
+.B "do {"
+.B " BENCH_TIMELOOP_TAG(time, tm, &t, n, { break; })"
+.BI " " "(do " _bench_n " iterations of some computation)" ;
+.B "} while (!bench_adapt(&n, target_s, &t));"
+.fi
+.PP
+On entry,
+.BI *n_inout
+should be the number of iterations performed by the previous step,
+and
+.BI * t
+the resulting time;
+the
+.B BTF_TIMEOK
+flag must be set in
+.IB t ->f \fR.
+If the timing is sufficient \(en if
+.IR t\fB->t "\ \*(>=\ " target_s /\(sr2
+\(en then
+.B bench_adapt
+returns a nonzero value to indicate that measurement is complete.
+Otherwise, it sets
+.BI * n_inout
+to a new, larger iteration count
+and returns zero to indicate that a further pass is necessary.
+.PP
+The
+.B bench_adjust
+function adjusts a raw timing,
+as captured by
+.BR BENCH_TIMELOOP_TAG ,
+according to the calibration data captured in
+.IR b .
+On exit, the timing data is updated,
+and
+.IB t ->n
+is set to the product
+.IR n "\ \(mu\ " base .
+.
+.SS Reporting results
+The
+.B bench_report
+function formats a measurement result
+into a human-readable string.
+The function writes its output using the
+generalized output formatting operations
+.I gops
+and output pointer
+.IR go ;
+see
+.BR gprintf (3)
+for details on generalized output formatting.
+The
+.I unit
+argument describes the unit of activity being measured:
+.TP
+.B BTU_OP
+counts operations of some unspecified nature, while
+.TP
+.B BTU_BYTE
+counts a number of bytes processed.
+.PP
+These are presented differently
+\(em in particular,
+quantities bytes are expressed using binary scaling rather than decimal.
+The timing to report is given by the
+.I t
+argument;
+.IB t ->n
+gives the number of units processed.
+.
+.\"--------------------------------------------------------------------------
+.SH EXAMPLE
.
+The following macros offer a fairly simple example of
+how the benchmarking functions and macros can be used.
+.VS
+.ta 2n +2n +2n 2n+\w'\fBBENCH_MEASURE_TAG('u \n(.lu-\n(.iu-4n
+#define BENCHMARK_DECLS \e
+ struct bench_timing _bmark_t; \e
+ int _bmark_rc; \e
+ BENCH_MEASURE_DECLS
+.VP
+#define BENCHMARK_TAG(tag, b, unit, base) \e
+ MC_BEFORE(tag##__benchmark_before, { fflush(stdout); }) \e
+ MC_AFTER(tag##__benchmark_after, { \e
+ if (_bmark_rc) \e
+ printf(": FAILED\en"); \e
+ else { \e
+ fputs(": ", stdout); \e
+ bench_report(&file_printops, stdout, (unit), &_bmark_tm);\ \e
+ putchar('\n'); \e
+ } \e
+ }) \e
+ BENCH_MEASURE_TAG(tag##__bmarkmark_measure, \e
+ (b), _bmark_rc, &_bmark_t, (base))
+#define BENCHMARK(b, unit, base) \e
+ BENCHMARK_TAG(bench, b, unit, base)
+.VE
+
.\"--------------------------------------------------------------------------
.SH "SEE ALSO"
.
+.BR control (3),
+.BR macros (3),
.BR tvec-bench (3),
.BR mLib (3).
.