codec, baseconv: Cleanup of the various binary encoding functions.

[mLib] / codec / codec.3
diff --git a/codec/codec.3 b/codec/codec.3

new file mode 100644 (file)

index 0000000..96174f3
--- /dev/null
+++ b/codec/codec.3
@@ -0,0 +1,256 @@
+.\" -*-nroff-*-
+.TH codec 3 "9 January 2009" "Straylight/Edgeware" "mLib utilities library"
+.SH NAME
+codec \- binary encoding and decoding
+.\" @codec_class
+.\" @codec_strerror
+.\" @null_codec_class
+.\" @base64_class
+.\" @file64_class
+.\" @base64url_class
+.\" @base32_class
+.\" @base32hex_class
+.\" @hex_class
+.SH SYNOPSIS
+.nf
+.B "#include <mLib/codec.h>"
+.B "#include <mLib/base64.h>"
+.B "#include <mLib/base32.h>"
+.B "#include <mLib/hex.h>"
+
+.B "codec_class null_codec_class;"
+.B "codec_class base64_class, file64_class, base64url_class;"
+.B "codec_class base32_class, base32hex_class;"
+.B "codec_class hex_class;"
+
+.BI "const char *codec_strerror(int " err ");"
+.fi
+.SH DESCRIPTION
+The
+.B codec
+system provides an object-based interface to functions which encode
+binary data as plain text and decode the result to recover the original
+binary data.  The interface makes it easy to support multiple encodings
+and select an appropriate one at runtime.
+.SS "The codec_class structure"
+The
+.B codec_class
+structure represents a particular encoding format.  The structure has
+the following members.
+.TP
+.B "const char *name"
+The name of the class, as a null-terminated string.  The name should not
+contain whitespace characters.
+.TP
+.BI "codec *(*encoder)(unsigned " flags ", const char *" indent ", unsigned " maxline ")"
+Pointer to a function which constructs a new encoder object, of type
+.BR codec .
+The
+.I flags
+configure the behaviour of the object; the
+.I indent
+string is written to separate lines of output; the integer
+.I maxline
+is the maximum length of line to be produced, or zero to forbid line
+breaking.
+.TP
+.BI "codec *(*decoder)(unsigned " flags ")"
+Pointer to a function which constructs a new decoder object, also of
+type
+.BR codec .
+The
+.I flags
+configure the behaviour of the object.
+.PP
+The
+.I flags
+to the
+.B encoder
+and
+.B decoder
+functions have the following meanings.
+.TP
+.B CDCF_LOWERC
+For codecs which produce output using a single alphabetic case (e.g.,
+.BR base32 ,
+.BR hex ),
+emit and accept only lower case; the default is to emit and accept only
+upper case, for compatibility with RFC4648.  If the codec usually
+produces mixed-case output, then this flag is ignored.
+.TP
+.B CDCF_IGNCASE
+For codecs which produce output using a single alphabetic case, ignore
+the case of the input when decoding.  If the codec usually produces
+mixed-case output, then this flag is ignored.
+.TP
+.B CDCF_NOEQPAD
+For codecs which usually pad their output (e.g.,
+.BR base64 ,
+.BR base32 ),
+do not emit or accept padding characters.  If the codec does not usually
+produce padding, or the padding is not redundant, then this flag is
+ignored.
+.TP
+.B CDCF_IGNEQPAD
+For codecs which usually pad their output, do not treat incorrect (e.g.,
+missing or excessive) padding as an error when decoding.  If the codec
+does not usually produce padding, or the padding is required for
+unambiguous decoding, then this flag is ignored.
+.TP
+.B CDCF_IGNEQMID
+For codecs which usually pad their output, ignore padding characters
+wherever they may appear when decoding.  Usually padding characters
+indicate the end of the input, and further input characters are
+considered erroneous.  If the codec does not usually produce padding, or
+it is impossible to resume decoding correctly having seen padding
+characters, then this flag is ignored.
+.TP
+.B CDCF_IGNZPAD
+For codecs which need to pad their input, ignore unusual padding bits
+when decoding.  (This is not at all the same thing as the padding
+characters controlled by the flags above: they deal with padding the
+length of the encoding
+.I output
+up to a suitable multiple of characters; this option deals with padding
+of the
+.I input
+prior to encoding.)  If the codec does not add padding bits, or specific
+values are required for unambiguous decoding, then this flag is ignored.
+.TP
+.B CDCF_IGNNEWL
+Ignore newline (and carriage-return) characters when decoding: the
+default for RFC4648 codecs is to reject newline characters.  If these
+characters are significant in the encoding, then this flag is ignored.
+.TP
+.B CDCF_IGNINVCH
+Ignore any other invalid characters appearing in the input when
+decoding.
+.TP
+.B CDCF_IGNJUNK
+Ignore all `junk' in the input.  This should suppress almost all
+decoding errors.
+.PP
+If you do not set any of the
+.BR CDCF_IGN ...
+flags, a decoder should only accept the exact encoding that the
+corresponding encoder would produce (with
+.I maxline
+= 0 to inhibit line-breaking).
+.SS "The codec and codec_ops structures"
+The
+.B codec
+structure represents the state of an encoder or decoder, as returned by
+the
+.B encoder
+and
+.B decoder
+functions described above.  It contains a single member.
+.TP
+.B "const codec_ops *ops"
+Pointer to a
+.B codec_ops
+structure which contains operations and metadata for use with the
+encoder or decoder.
+.PP
+The
+.B codec_ops
+structure contains the following members.
+.TP
+.B "const codec_class *c"
+Pointer back to the
+.B codec_class
+which was used to construct the
+.B codec
+object.
+.TP
+.BI "int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d ")"
+Encode or decode, using the codec
+.IR c ,
+the data in the buffer at address
+.I p
+and continuing for
+.I sz
+bytes, appending the output to the dynamic string
+.I d
+(see
+.BR dstr (3)).
+If the operation was successful, the function returns zero; otherwise it
+returns a nonzero error code, as described below.
+.TP
+.BI "void (*destroy)(codec *" c ")"
+Destroy the codec object
+.IR c ,
+freeing any resources it may hold.
+.PP
+A codec may buffer its input (e.g., if it needs to see more in order to
+decide what output to produce next); it may also need to take special
+action at the end of the input (e.g., flushing buffers, and applying
+padding).  To signal the codec that there is no more input, call the
+.B code
+function with a null
+.I p
+pointer.  It will then write any final output to
+.IR d .
+.PP
+The following error conditions may be reported.
+.TP
+.B CDCERR_INVCH
+An invalid character was encountered while decoding.  This includes
+encountering padding characters if padding is disabled using the
+.B CDCF_NOEQPAD
+flag.
+.TP
+.B CDCERR_INVEQPAD
+Invalid padding characters (e.g., wrong characters, or too few, too
+many, or none at all) were found during decoding.  This may also
+indicate that the input is truncated, even if the codec does not usually
+perform output padding.
+.TP
+.B CDCERR_INVZPAD
+Invalid padding bits were found during decoding.
+.PP
+The
+.B codec_strerror
+function converts these error codes to brief, (moderately)
+human-readable strings.
+.SS "Provided codecs"
+The library provides a number of standard codecs.
+.TP
+.B base64
+Implements Base64 encoding, as defined by RFC4648.  Output is
+mixed-case, so the
+.B CDCF_LOWERC
+and
+.B CDCF_IGNCASE
+flags are ignored.
+.TP
+.B file64
+Implements a variant of the Base64 encoding which uses
+.RB ` % '
+in place of
+.RB ` / ',
+so that its output is suitable for use as a Unix filename.
+.TP
+.B base64url
+Implements the filename- and URL-safe variant of Base64 encoding, as
+defined by RFC4648.
+.TP
+.B base32
+Implements Base32 encoding, as defined by RFC4648.  Output is in upper
+case by default.
+.TP
+.B base32hex
+Implements the extended-hex variant of Base32, as defined by RFC4648.
+This encoding has the property that it preserves the ordering of
+messages if padding is suppressed.
+.TP
+.B hex
+Implements hex encoding, defined by RFC4648 under the name Base16.  For
+compatibility with that specification, output is in upper case by
+default.
+.SH "SEE ALSO"
+.BR bincode (1),
+.BR dstr (3),
+.BR mLib (3).
+.SH AUTHOR
+Mark Wooding, <mdw@distorted.org.uk>