[mLib] / codec / codec.3.in

.\" -*-nroff-*-
.\"
.\" Manual for new-fangled binary encoding and decoding
.\"
.\" (c) 2009, 2014, 2015, 2019, 2023, 2024 Straylight/Edgeware
.\"
.
.\"----- Licensing notice ---------------------------------------------------
.\"
.\" This file is part of the mLib utilities library.
.\"
.\" mLib is free software: you can redistribute it and/or modify it under
.\" the terms of the GNU Library General Public License as published by
.\" the Free Software Foundation; either version 2 of the License, or (at
.\" your option) any later version.
.\"
.\" mLib is distributed in the hope that it will be useful, but WITHOUT
.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
.\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
.\" License for more details.
.\"
.\" You should have received a copy of the GNU Library General Public
.\" License along with mLib.  If not, write to the Free Software
.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
.\" USA.
.
.\"--------------------------------------------------------------------------
.so ../defs.man \" @@@PRE@@@
.
.\"--------------------------------------------------------------------------
.TH codec 3mLib "9 January 2009" "Straylight/Edgeware" "mLib utilities library"
.\" @codec_class
.\" @codec_strerror
.\" @null_codec_class
.\" @base64_class
.\" @file64_class
.\" @base64url_class
.\" @base32_class
.\" @base32hex_class
.\" @hex_class
.
.\"--------------------------------------------------------------------------
.SH NAME
codec \- binary encoding and decoding
.
.\"--------------------------------------------------------------------------
.SH SYNOPSIS
.
.nf
.B "#include <mLib/codec.h>"
.B "#include <mLib/base64.h>"
.B "#include <mLib/base32.h>"
.B "#include <mLib/hex.h>"
.PP
.B "#define CDCF_LOWERC ..."
.B "#define CDCF_IGNCASE ..."
.B "#define CDCF_NOEQPAD ..."
.B "#define CDCF_IGNEQPAD ..."
.B "#define CDCF_IGNEQMID ..."
.B "#define CDCF_IGNZPAD ..."
.B "#define CDCF_IGNNEWL ..."
.B "#define CDCF_IGNINVCH ..."
.B "#define CDCF_IGNSPC ..."
.B "#define CDCF_IGNJUNK ..."
.PP
.ta 2n
.B "enum {"
.B "	CDCERR_OK = ...,"
.B "	CDCERR_INVCH = ...,"
.B "	CDCERR_INVEQPAD = ...,"
.B "	CDCERR_INVZPAD = ..."
.B "};"
.PP
.B "typedef struct {"
.B "	const char *name;"
.ta 2n +\w'\fBcodec *(*encoder)('u
.BI "	codec *(*encoder)(unsigned " flags ,
.BI "		const char *" indent ", unsigned " maxline );
.BI "	codec *(*decoder)(unsigned " flags );
.B "	...\&"
.B "} codec_class;"
.PP
.B "typedef struct {"
.B "	const codec_ops *ops;"
.B "} codec;"
.PP
.B "typedef struct {"
.B "	const codec_class *c;"
.BI "	int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d );
.BI "	void (*destroy)(codec *" c );
.B "} codec_ops;"
.PP
.B "codec_class null_codec_class;"
.B "codec_class base64_class, file64_class, base64url_class;"
.B "codec_class base32_class, base32hex_class;"
.B "codec_class hex_class;"
.PP
.BI "const char *codec_strerror(int " err ");"
.fi
.
.\"--------------------------------------------------------------------------
.SH DESCRIPTION
.
The
.B codec
system provides an object-based interface to functions which encode
binary data as plain text and decode the result to recover the original
binary data.  The interface makes it easy to support multiple encodings
and select an appropriate one at runtime.
.
.SS "The codec_class structure"
The
.B codec_class
structure represents a particular encoding format.  The structure has
the following members.
.TP
.B "const char *name"
The name of the class, as a null-terminated string.  The name should not
contain whitespace characters.
.TP
.BI "codec *(*encoder)(unsigned " flags ", const char *" indent ", unsigned " maxline ")"
Pointer to a function which constructs a new encoder object, of type
.BR codec .
The
.I flags
configure the behaviour of the object; the
.I indent
string is written to separate lines of output; the integer
.I maxline
is the maximum length of line to be produced, or zero to forbid line
breaking.
.TP
.BI "codec *(*decoder)(unsigned " flags ")"
Pointer to a function which constructs a new decoder object, also of
type
.BR codec .
The
.I flags
configure the behaviour of the object.
.PP
The
.I flags
to the
.B encoder
and
.B decoder
functions have the following meanings.
.TP
.B CDCF_LOWERC
For codecs which produce output using a single alphabetic case (e.g.,
.BR base32 ,
.BR hex ),
emit and accept only lower case; the default is to emit and accept only
upper case, for compatibility with RFC4648.  If the codec usually
produces mixed-case output, then this flag is ignored.
.TP
.B CDCF_IGNCASE
For codecs which produce output using a single alphabetic case, ignore
the case of the input when decoding.  If the codec usually produces
mixed-case output, then this flag is ignored.
.TP
.B CDCF_NOEQPAD
For codecs which usually pad their output (e.g.,
.BR base64 ,
.BR base32 ),
do not emit or accept padding characters.  If the codec does not usually
produce padding, or the padding is not redundant, then this flag is
ignored.
.TP
.B CDCF_IGNEQPAD
For codecs which usually pad their output, do not treat incorrect (e.g.,
missing or excessive) padding as an error when decoding.  If the codec
does not usually produce padding, or the padding is required for
unambiguous decoding, then this flag is ignored.
.TP
.B CDCF_IGNEQMID
For codecs which usually pad their output, ignore padding characters
wherever they may appear when decoding.  Usually padding characters
indicate the end of the input, and further input characters are
considered erroneous.  If the codec does not usually produce padding, or
it is impossible to resume decoding correctly having seen padding
characters, then this flag is ignored.
.TP
.B CDCF_IGNZPAD
For codecs which need to pad their input, ignore unusual padding bits
when decoding.  (This is not at all the same thing as the padding
characters controlled by the flags above: they deal with padding the
length of the encoding
.I output
up to a suitable multiple of characters; this option deals with padding
of the
.I input
prior to encoding.)  If the codec does not add padding bits, or specific
values are required for unambiguous decoding, then this flag is ignored.
.TP
.B CDCF_IGNNEWL
Ignore newline (and carriage-return) characters when decoding: the
default for RFC4648 codecs is to reject newline characters.  If these
characters are significant in the encoding, then this flag is ignored.
.TP
.B CDCF_IGNSPC
Ignore whitespace characters (other than newlines) when decoding: the
default for RFC4648 codecs is to reject whitespace characters.  If these
characters are significant in the encoding, then this flag is ignored.
.TP
.B CDCF_IGNINVCH
Ignore any other invalid characters appearing in the input when
decoding.
.TP
.B CDCF_IGNJUNK
Ignore all `junk' in the input.  This should suppress almost all
decoding errors.
.PP
If you do not set any of the
.BR CDCF_IGN ...\&
flags, a decoder should only accept the exact encoding that the
corresponding encoder would produce (with
.I maxline
= 0 to inhibit line-breaking).
.
.SS "The codec and codec_ops structures"
The
.B codec
structure, which represents the state of an encoder or decoder as
returned by the
.B encoder
and
.B decoder
functions described above, contains a single member.
.TP
.B "const codec_ops *ops"
Pointer to a
.B codec_ops
structure which contains operations and metadata for use with the
encoder or decoder.
.PP
The
.B codec_ops
structure contains the following members.
.TP
.B "const codec_class *c"
Pointer back to the
.B codec_class
which was used to construct the
.B codec
object.
.TP
.BI "int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d ")"
Encode or decode, using the codec
.IR c ,
the data in the buffer at address
.I p
and continuing for
.I sz
bytes, appending the output to the dynamic string
.I d
(see
.BR dstr (3)).
If the operation was successful, the function returns zero; otherwise it
returns a nonzero error code, as described below.
.TP
.BI "void (*destroy)(codec *" c ")"
Destroy the codec object
.IR c ,
freeing any resources it may hold.
.PP
A codec may buffer its input (e.g., if it needs to see more in order to
decide what output to produce next); it may also need to take special
action at the end of the input (e.g., flushing buffers, and applying
padding).  To signal the codec that there is no more input, call the
.B code
function with a null
.I p
pointer.  It will then write any final output to
.IR d .
.PP
The following error conditions may be reported.
.TP
.B CDCERR_INVCH
An invalid character was encountered while decoding.  This includes
encountering padding characters if padding is disabled using the
.B CDCF_NOEQPAD
flag.
.TP
.B CDCERR_INVEQPAD
Invalid padding characters (e.g., wrong characters, or too few, too
many, or none at all) were found during decoding.  This may also
indicate that the input is truncated, even if the codec does not usually
perform output padding.
.TP
.B CDCERR_INVZPAD
Invalid padding bits were found during decoding.
.PP
The
.B codec_strerror
function converts these error codes to brief, (moderately)
human-readable strings.
.
.SS "Provided codecs"
The library provides a number of standard codecs.
.TP
.B base64
Implements Base64 encoding, as defined by RFC4648.  Output is
mixed-case, so the
.B CDCF_LOWERC
and
.B CDCF_IGNCASE
flags are ignored.
.TP
.B file64
Implements a variant of the Base64 encoding which uses
.RB ` % '
in place of
.RB ` / ',
so that its output is suitable for use as a Unix filename.
.TP
.B base64url
Implements the filename- and URL-safe variant of Base64 encoding, as
defined by RFC4648.
.TP
.B base32
Implements Base32 encoding, as defined by RFC4648.  Output is in upper
case by default.
.TP
.B base32hex
Implements the extended-hex variant of Base32, as defined by RFC4648.
This encoding has the property that it preserves the ordering of
messages if padding is suppressed.
.TP
.B hex
Implements hex encoding, defined by RFC4648 under the name Base16.  For
compatibility with that specification, output is in upper case by
default.
.
.\"--------------------------------------------------------------------------
.SH "SEE ALSO"
.
.BR bincode (1),
.BR dstr (3),
.BR mLib (3).
.
.\"--------------------------------------------------------------------------
.SH AUTHOR
.
Mark Wooding, <mdw@distorted.org.uk>
.
.\"----- That's all, folks --------------------------------------------------
Commit	Line	Data
236f657b	1	.\" --nroff--
c4ccbbf9 MW	2	.\"
	3	.\" Manual for new-fangled binary encoding and decoding
	4	.\"
	5	.\" (c) 2009, 2014, 2015, 2019, 2023, 2024 Straylight/Edgeware
	6	.\"
	7	.
	8	.\"----- Licensing notice ---------------------------------------------------
	9	.\"
	10	.\" This file is part of the mLib utilities library.
	11	.\"
	12	.\" mLib is free software: you can redistribute it and/or modify it under
	13	.\" the terms of the GNU Library General Public License as published by
	14	.\" the Free Software Foundation; either version 2 of the License, or (at
	15	.\" your option) any later version.
	16	.\"
	17	.\" mLib is distributed in the hope that it will be useful, but WITHOUT
	18	.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	19	.\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
	20	.\" License for more details.
	21	.\"
	22	.\" You should have received a copy of the GNU Library General Public
	23	.\" License along with mLib. If not, write to the Free Software
	24	.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	.\" USA.
	26	.
	27	.\"--------------------------------------------------------------------------
	28	.so ../defs.man \" @@@PRE@@@
	29	.
	30	.\"--------------------------------------------------------------------------
	31	.TH codec 3mLib "9 January 2009" "Straylight/Edgeware" "mLib utilities library"
236f657b MW	32	.\" @codec_class
	33	.\" @codec_strerror
	34	.\" @null_codec_class
	35	.\" @base64_class
	36	.\" @file64_class
	37	.\" @base64url_class
	38	.\" @base32_class
	39	.\" @base32hex_class
	40	.\" @hex_class
c4ccbbf9 MW	41	.
	42	.\"--------------------------------------------------------------------------
	43	.SH NAME
	44	codec \- binary encoding and decoding
	45	.
	46	.\"--------------------------------------------------------------------------
236f657b	47	.SH SYNOPSIS
c4ccbbf9	48	.
236f657b MW	49	.nf
	50	.B "#include <mLib/codec.h>"
	51	.B "#include <mLib/base64.h>"
	52	.B "#include <mLib/base32.h>"
	53	.B "#include <mLib/hex.h>"
d056fbdf	54	.PP
4729aa69 MW	55	.B "#define CDCF_LOWERC ..."
	56	.B "#define CDCF_IGNCASE ..."
	57	.B "#define CDCF_NOEQPAD ..."
	58	.B "#define CDCF_IGNEQPAD ..."
	59	.B "#define CDCF_IGNEQMID ..."
	60	.B "#define CDCF_IGNZPAD ..."
	61	.B "#define CDCF_IGNNEWL ..."
	62	.B "#define CDCF_IGNINVCH ..."
	63	.B "#define CDCF_IGNSPC ..."
	64	.B "#define CDCF_IGNJUNK ..."
d056fbdf	65	.PP
adec5584	66	.ta 2n
4729aa69	67	.B "enum {"
adec5584 MW	68	.B " CDCERR_OK = ...,"
	69	.B " CDCERR_INVCH = ...,"
	70	.B " CDCERR_INVEQPAD = ...,"
	71	.B " CDCERR_INVZPAD = ..."
4729aa69	72	.B "};"
d056fbdf	73	.PP
4729aa69	74	.B "typedef struct {"
adec5584 MW	75	.B " const char *name;"
	76	.ta 2n +\w'\fBcodec (encoder)('u
	77	.BI " codec (encoder)(unsigned " flags ,
	78	.BI " const char *" indent ", unsigned " maxlen );
	79	.BI " codec (decoder)(unsigned " flags );
	80	.B " ...\&"
4729aa69	81	.B "} codec_class;"
d056fbdf	82	.PP
4729aa69	83	.B "typedef struct {"
adec5584	84	.B " const codec_ops *ops;"
4729aa69	85	.B "} codec;"
d056fbdf	86	.PP
4729aa69	87	.B "typedef struct {"
adec5584 MW	88	.B " const codec_class *c;"
	89	.BI " int (code)(codec " c ", const void " p ", size_t " sz ", dstr " d );
	90	.BI " void (destroy)(codec " c );
4729aa69	91	.B "} codec_ops;"
d056fbdf	92	.PP
236f657b MW	93	.B "codec_class null_codec_class;"
	94	.B "codec_class base64_class, file64_class, base64url_class;"
	95	.B "codec_class base32_class, base32hex_class;"
	96	.B "codec_class hex_class;"
d056fbdf	97	.PP
236f657b MW	98	.BI "const char *codec_strerror(int " err ");"
236f657b MW	99	.fi
c4ccbbf9 MW	100	.
c4ccbbf9 MW	101	.\"--------------------------------------------------------------------------
236f657b	102	.SH DESCRIPTION
c4ccbbf9	103	.
236f657b MW	104	The
	105	.B codec
	106	system provides an object-based interface to functions which encode
	107	binary data as plain text and decode the result to recover the original
	108	binary data. The interface makes it easy to support multiple encodings
	109	and select an appropriate one at runtime.
c4ccbbf9	110	.
236f657b MW	111	.SS "The codec_class structure"
	112	The
	113	.B codec_class
	114	structure represents a particular encoding format. The structure has
	115	the following members.
	116	.TP
	117	.B "const char *name"
	118	The name of the class, as a null-terminated string. The name should not
	119	contain whitespace characters.
	120	.TP
	121	.BI "codec (encoder)(unsigned " flags ", const char *" indent ", unsigned " maxline ")"
	122	Pointer to a function which constructs a new encoder object, of type
	123	.BR codec .
	124	The
	125	.I flags
	126	configure the behaviour of the object; the
	127	.I indent
	128	string is written to separate lines of output; the integer
	129	.I maxline
	130	is the maximum length of line to be produced, or zero to forbid line
	131	breaking.
	132	.TP
	133	.BI "codec (decoder)(unsigned " flags ")"
	134	Pointer to a function which constructs a new decoder object, also of
	135	type
	136	.BR codec .
	137	The
	138	.I flags
	139	configure the behaviour of the object.
	140	.PP
	141	The
	142	.I flags
	143	to the
	144	.B encoder
	145	and
	146	.B decoder
	147	functions have the following meanings.
	148	.TP
	149	.B CDCF_LOWERC
	150	For codecs which produce output using a single alphabetic case (e.g.,
	151	.BR base32 ,
	152	.BR hex ),
	153	emit and accept only lower case; the default to emit and accept only
	154	upper case, for compatibility with RFC4648. If the codec usually
	155	produces mixed-case output, then this flag is ignored.
	156	.TP
	157	.B CDCF_IGNCASE
	158	For codecs which produce output using a single alphabetic case, ignore
	159	the case of the input when decoding. If the codec usually produces
	160	mixed-case output, then this flag is ignored.
	161	.TP
	162	.B CDCF_NOEQPAD
	163	For codecs which usually pad their output (e.g.,
	164	.BR base64 ,
	165	.BR base32 ),
	166	do not emit or accept padding characters. If the codec does not usually
	167	produce padding, or the padding is not redundant, then this flag is
	168	ignored.
	169	.TP
	170	.B CDCF_IGNEQPAD
	171	For codecs which usually pad their output, do not treat incorrect (e.g.,
	172	missing or excessive) padding as an error when decoding. If the codec
	173	does not usually produce padding, or the padding is required for
	174	unambiguous decoding, then this flag is ignored.
175	.TP
176	.B CDCF_IGNEQMID
177	For codecs which usually pad their output, ignore padding characters
178	wherever they may appear when decoding. Usually padding characters
179	indicate the end of the input, and further input characters are
180	considered erroneous. If the codec does not usually produce padding, or
181	it is impossible to resume decoding correctly having seen padding
182	characters, then this flag is ignored.
183	.TP
184	.B CDCF_IGNZPAD
185	For codecs which need to pad their input, ignore unusual padding bits
186	when decoding. (This is not at all the same thing as the padding
187	characters controlled by the flags above: they deal with padding the
188	length of the encoding
189	.I output
190	up to a suitable multiple of characters; this option deals with padding
191	of the
192	.I input
193	prior to encoding.) If the codec does not add padding bits, or specific
194	values are required for unambiguous decoding, then this flag is ignored.
195	.TP
196	.B CDCF_IGNNEWL
197	Ignore newline (and carriage-return) characters when decoding: the
198	default for RFC4648 codecs is to reject newline characters. If these
199	characters are significant in the encoding, then this flag is ignored.
200	.TP
09fbf4d0 MW	201	.B CDCF_IGNSPC
	202	Ignore whitespace characters (other than newlines) when decoding: the
	203	default for RFC4648 codecs is to reject whitespace characters. If these
	204	characters are significant in the encoding, then this flag is ignored.
	205	.TP
236f657b MW	206	.B CDCF_IGNINVCH
	207	Ignore any other invalid characters appearing in the input when
	208	decoding.
	209	.TP
	210	.B CDCF_IGNJUNK
	211	Ignore all `junk' in the input. This should suppress almost all
	212	decoding errors.
	213	.PP
	214	If you do not set any of the
c3dd6b29	215	.BR CDCF_IGN ...\&
236f657b MW	216	flags, a decoder should only accept the exact encoding that the
	217	corresponding encoder would produce (with
	218	.I maxline
	219	= 0 to inhibit line-breaking).
c4ccbbf9	220	.
236f657b MW	221	.SS "The codec and codec_ops structures"
	222	The
	223	.B codec
	224	structure represents the state of an encoder or decoder, as returned by
	225	the
	226	.B encoder
	227	and
	228	.B decoder
	229	functions described above, contains a single member.
	230	.TP
	231	.B "const codec_ops *ops"
	232	Pointer to a
	233	.B codec_ops
	234	structure which contains operations and metadata for use with the
	235	encoder or decoder.
	236	.PP
	237	The
	238	.B codec_ops
	239	structure contains the following members.
	240	.TP
	241	.B "const codec_class *c"
	242	Pointer back to the
	243	.B codec_class
	244	which was used to construct the
	245	.B codec
	246	object.
	247	.TP
	248	.BI "int (code)(codec " c ", const void " p ", size_t " sz ", dstr " d ")"
	249	Encode or decode, using the codec
63ba7202	250	.IR c ,
236f657b MW	251	the data in the buffer at address
	252	.I p
	253	and continuing for
	254	.I sz
	255	bytes, appending the output to the dynamic string
	256	.I d
	257	(see
	258	.BR dstr (3)).
	259	If the operation was successful, the function returns zero; otherwise it
	260	returns a nonzero error code, as described below.
	261	.TP
	262	.BI "void (destroy)(codec " c ")"
	263	Destroy the codec object
	264	.IR c ,
	265	freeing any resources it may hold.
	266	.PP
	267	A codec may buffer its input (e.g., if needs to see more in order to
	268	decide what output to produce next); it may also need to take special
	269	action at the end of the input (e.g., flushing buffers, and applying
	270	padding). To signal the codec that there is no more input, call the
	271	.B code
	272	function with a null
	273	.I p
	274	pointer. It will then write any final output to
	275	.IR d .
	276	.PP
	277	The following error conditions may be reported.
	278	.TP
	279	.B CDCERR_INVCH
	280	An invalid character was encountered while decoding. This includes
	281	encoutering padding characters if padding is disabled using the
	282	.B CDCF_NOEQPAD
	283	flag.
	284	.TP
	285	.B CDCERR_INVEQPAD
	286	Invalid padding characters (e.g., wrong characters, or too few, too
	287	many, or none at all) were found during decoding. This may also
	288	indicate that the input is truncated, even if the codec does not usually
	289	perform output padding.
	290	.TP
	291	.B CDCERR_INVZPAD
	292	Invalid padding bits were found during decoding.
	293	.PP
	294	The
	295	.B codec_strerror
	296	function converts these error codes to brief, (moderately)
	297	human-readable strings.
c4ccbbf9	298	.
236f657b MW	299	.SS "Provided codecs"
	300	The library provides a number of standard codecs.
	301	.TP
	302	.B base64
	303	Implements Base64 encoding, as defined by RFC4648. Output is
	304	mixed-case, so the
	305	.B CDCF_LOWERC
	306	and
	307	.B CDCF_IGNCASE
	308	flags are ignored.
	309	.TP
	310	.B safe64
	311	Implements a variant of the Base64 encoding which uses
	312	.RB ` % '
	313	in place of
	314	.RB ` / ',
	315	so that its output is suitable for use as a Unix filename.
	316	.TP
	317	.B base64url
	318	Implements the filename- and URL-safe variant of Base64 encoding, as
	319	defined by RFC4648.
	320	.TP
	321	.B base32
	322	Implements Base32 encoding, as defined by RFC4648. Output is in upper
	323	case by default.
	324	.TP
	325	.B base32hex
	326	Implements the extended-hex variant of Base32, as defined by RFC4648.
	327	This encoding has the property that the encoding preserves the ordering
	328	of messages if padding is suppressed.
	329	.TP
	330	.B hex
	331	Implements hex encoding, defined by RFC4648 under the name Base16. For
	332	compatibility with that specification, output is in upper case by
	333	default.
c4ccbbf9 MW	334	.
c4ccbbf9 MW	335	.\"--------------------------------------------------------------------------
236f657b	336	.SH "SEE ALSO"
c4ccbbf9	337	.
236f657b MW	338	.BR bincode (1),
	339	.BR dstr (3),
	340	.BR mLib (3).
c4ccbbf9 MW	341	.
c4ccbbf9 MW	342	.\"--------------------------------------------------------------------------
236f657b	343	.SH AUTHOR
c4ccbbf9	344	.
236f657b	345	Mark Wooding, <mdw@distorted.org.uk>
c4ccbbf9 MW	346	.
c4ccbbf9 MW	347	.\"----- That's all, folks --------------------------------------------------