mdw@git.distorted.org.uk Git - mLib/blob - codec/codec.3

   1 .\" -*-nroff-*-
   2 .TH codec 3 "9 January 2009" "Straylight/Edgeware" "mLib utilities library"
   3 .SH NAME
   4 codec \- binary encoding and decoding
   5 .\" @codec_class
   6 .\" @codec_strerror
   7 .\" @null_codec_class
   8 .\" @base64_class
   9 .\" @file64_class
  10 .\" @base64url_class
  11 .\" @base32_class
  12 .\" @base32hex_class
  13 .\" @hex_class
  14 .SH SYNOPSIS
  15 .nf
  16 .B "#include <mLib/codec.h>"
  17 .B "#include <mLib/base64.h>"
  18 .B "#include <mLib/base32.h>"
  19 .B "#include <mLib/hex.h>"
  20
  21 .B "#define CDCF_LOWERC ..."
  22 .B "#define CDCF_IGNCASE ..."
  23 .B "#define CDCF_NOEQPAD ..."
  24 .B "#define CDCF_IGNEQPAD ..."
  25 .B "#define CDCF_IGNEQMID ..."
  26 .B "#define CDCF_IGNZPAD ..."
  27 .B "#define CDCF_IGNNEWL ..."
  28 .B "#define CDCF_IGNINVCH ..."
  29 .B "#define CDCF_IGNSPC ..."
  30 .B "#define CDCF_IGNJUNK ..."
  31
  32 .B "enum {"
  33 .B "\h'4n'CDCERR_OK = ...,"
  34 .B "\h'4n'CDCERR_INVCH = ...,"
  35 .B "\h'4n'CDCERR_INVEQPAD = ...,"
  36 .B "\h'4n'CDCERR_INVZPAD = ..."
  37 .B "};"
  38
  39 .B "typedef struct {"
  40 .B "\h'4n'const char *name;"
  41 .ds mT \fBcodec *(*encoder)(
  42 .BI "\h'4n'\*(mTunsigned " flags ,
  43 .BI "\h'4n+\w'\*(mT'u'const char *" indent ", unsigned " maxline );
  44 .BI "\h'4n'codec *(*decoder)(unsigned " flags );
  45 .B "\h'4n'...\&"
  46 .B "} codec_class;"
  47
  48 .B "typedef struct {"
  49 .B "\h'4n'const codec_ops *ops;"
  50 .B "} codec;"
  51
  52 .B "typedef struct {"
  53 .B "\h'4n'const codec_class *c;"
  54 .BI "\h'4n'int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d );
  55 .BI "\h'4n'void (*destroy)(codec *" c );
  56 .B "} codec_ops;"
  57
  58 .B "codec_class null_codec_class;"
  59 .B "codec_class base64_class, file64_class, base64url_class;"
  60 .B "codec_class base32_class, base32hex_class;"
  61 .B "codec_class hex_class;"
  62
  63 .BI "const char *codec_strerror(int " err ");"
  64 .fi
  65 .SH DESCRIPTION
  66 The
  67 .B codec
  68 system provides an object-based interface to functions which encode
  69 binary data as plain text and decode the result to recover the original
  70 binary data.  The interface makes it easy to support multiple encodings
  71 and select an appropriate one at runtime.
  72 .SS "The codec_class structure"
  73 The
  74 .B codec_class
  75 structure represents a particular encoding format.  The structure has
  76 the following members.
  77 .TP
  78 .B "const char *name"
  79 The name of the class, as a null-terminated string.  The name should not
  80 contain whitespace characters.
  81 .TP
  82 .BI "codec *(*encoder)(unsigned " flags ", const char *" indent ", unsigned " maxline ")"
  83 Pointer to a function which constructs a new encoder object, of type
  84 .BR codec .
  85 The
  86 .I flags
  87 configure the behaviour of the object; the
  88 .I indent
  89 string is written to separate lines of output; the integer
  90 .I maxline
  91 is the maximum length of line to be produced, or zero to forbid line
  92 breaking.
  93 .TP
  94 .BI "codec *(*decoder)(unsigned " flags ")"
  95 Pointer to a function which constructs a new decoder object, also of
  96 type
  97 .BR codec .
  98 The
  99 .I flags
 100 configure the behaviour of the object.
 101 .PP
 102 The
 103 .I flags
 104 to the
 105 .B encoder
 106 and
 107 .B decoder
 108 functions have the following meanings.
 109 .TP
 110 .B CDCF_LOWERC
 111 For codecs which produce output using a single alphabetic case (e.g.,
 112 .BR base32 ,
 113 .BR hex ),
 114 emit and accept only lower case; the default is to emit and accept only
 115 upper case, for compatibility with RFC4648.  If the codec usually
 116 produces mixed-case output, then this flag is ignored.
 117 .TP
 118 .B CDCF_IGNCASE
 119 For codecs which produce output using a single alphabetic case, ignore
 120 the case of the input when decoding.  If the codec usually produces
 121 mixed-case output, then this flag is ignored.
 122 .TP
 123 .B CDCF_NOEQPAD
 124 For codecs which usually pad their output (e.g.,
 125 .BR base64 ,
 126 .BR base32 ),
 127 do not emit or accept padding characters.  If the codec does not usually
 128 produce padding, or the padding is not redundant, then this flag is
 129 ignored.
 130 .TP
 131 .B CDCF_IGNEQPAD
 132 For codecs which usually pad their output, do not treat incorrect (e.g.,
 133 missing or excessive) padding as an error when decoding.  If the codec
 134 does not usually produce padding, or the padding is required for
 135 unambiguous decoding, then this flag is ignored.
 136 .TP
 137 .B CDCF_IGNEQMID
 138 For codecs which usually pad their output, ignore padding characters
 139 wherever they may appear when decoding.  Usually padding characters
 140 indicate the end of the input, and further input characters are
 141 considered erroneous.  If the codec does not usually produce padding, or
 142 it is impossible to resume decoding correctly having seen padding
 143 characters, then this flag is ignored.
 144 .TP
 145 .B CDCF_IGNZPAD
 146 For codecs which need to pad their input, ignore unusual padding bits
 147 when decoding.  (This is not at all the same thing as the padding
 148 characters controlled by the flags above: they deal with padding the
 149 length of the encoding
 150 .I output
 151 up to a suitable multiple of characters; this option deals with padding
 152 of the
 153 .I input
 154 prior to encoding.)  If the codec does not add padding bits, or specific
 155 values are required for unambiguous decoding, then this flag is ignored.
 156 .TP
 157 .B CDCF_IGNNEWL
 158 Ignore newline (and carriage-return) characters when decoding: the
 159 default for RFC4648 codecs is to reject newline characters.  If these
 160 characters are significant in the encoding, then this flag is ignored.
 161 .TP
 162 .B CDCF_IGNSPC
 163 Ignore whitespace characters (other than newlines) when decoding: the
 164 default for RFC4648 codecs is to reject whitespace characters.  If these
 165 characters are significant in the encoding, then this flag is ignored.
 166 .TP
 167 .B CDCF_IGNINVCH
 168 Ignore any other invalid characters appearing in the input when
 169 decoding.
 170 .TP
 171 .B CDCF_IGNJUNK
 172 Ignore all `junk' in the input.  This should suppress almost all
 173 decoding errors.
 174 .PP
 175 If you do not set any of the
 176 .BR CDCF_IGN ...\&
 177 flags, a decoder should only accept the exact encoding that the
 178 corresponding encoder would produce (with
 179 .I maxline
 180 = 0 to inhibit line-breaking).
 181 .SS "The codec and codec_ops structures"
 182 The
 183 .B codec
 184 structure represents the state of an encoder or decoder, as returned by
 185 the
 186 .B encoder
 187 and
 188 .B decoder
 189 functions described above.  It contains a single member.
 190 .TP
 191 .B "const codec_ops *ops"
 192 Pointer to a
 193 .B codec_ops
 194 structure which contains operations and metadata for use with the
 195 encoder or decoder.
 196 .PP
 197 The
 198 .B codec_ops
 199 structure contains the following members.
 200 .TP
 201 .B "const codec_class *c"
 202 Pointer back to the
 203 .B codec_class
 204 which was used to construct the
 205 .B codec
 206 object.
 207 .TP
 208 .BI "int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d ")"
 209 Encode or decode, using the codec
 210 .IR c ,
 211 the data in the buffer at address
 212 .I p
 213 and continuing for
 214 .I sz
 215 bytes, appending the output to the dynamic string
 216 .I d
 217 (see
 218 .BR dstr (3)).
 219 If the operation was successful, the function returns zero; otherwise it
 220 returns a nonzero error code, as described below.
 221 .TP
 222 .BI "void (*destroy)(codec *" c ")"
 223 Destroy the codec object
 224 .IR c ,
 225 freeing any resources it may hold.
 226 .PP
 227 A codec may buffer its input (e.g., if it needs to see more in order to
 228 decide what output to produce next); it may also need to take special
 229 action at the end of the input (e.g., flushing buffers, and applying
 230 padding).  To signal the codec that there is no more input, call the
 231 .B code
 232 function with a null
 233 .I p
 234 pointer.  It will then write any final output to
 235 .IR d .
 236 .PP
 237 The following error conditions may be reported.
 238 .TP
 239 .B CDCERR_INVCH
 240 An invalid character was encountered while decoding.  This includes
 241 encountering padding characters if padding is disabled using the
 242 .B CDCF_NOEQPAD
 243 flag.
 244 .TP
 245 .B CDCERR_INVEQPAD
 246 Invalid padding characters (e.g., wrong characters, or too few, too
 247 many, or none at all) were found during decoding.  This may also
 248 indicate that the input is truncated, even if the codec does not usually
 249 perform output padding.
 250 .TP
 251 .B CDCERR_INVZPAD
 252 Invalid padding bits were found during decoding.
 253 .PP
 254 The
 255 .B codec_strerror
 256 function converts these error codes to brief, (moderately)
 257 human-readable strings.
 258 .SS "Provided codecs"
 259 The library provides a number of standard codecs.
 260 .TP
 261 .B base64
 262 Implements Base64 encoding, as defined by RFC4648.  Output is
 263 mixed-case, so the
 264 .B CDCF_LOWERC
 265 and
 266 .B CDCF_IGNCASE
 267 flags are ignored.
 268 .TP
 269 .B file64
 270 Implements a variant of the Base64 encoding which uses
 271 .RB ` % '
 272 in place of
 273 .RB ` / ',
 274 so that its output is suitable for use as a Unix filename.
 275 .TP
 276 .B base64url
 277 Implements the filename- and URL-safe variant of Base64 encoding, as
 278 defined by RFC4648.
 279 .TP
 280 .B base32
 281 Implements Base32 encoding, as defined by RFC4648.  Output is in upper
 282 case by default.
 283 .TP
 284 .B base32hex
 285 Implements the extended-hex variant of Base32, as defined by RFC4648.
 286 This encoding has the property that it preserves the ordering
 287 of messages if padding is suppressed.
 288 .TP
 289 .B hex
 290 Implements hex encoding, defined by RFC4648 under the name Base16.  For
 291 compatibility with that specification, output is in upper case by
 292 default.
 293 .SH "SEE ALSO"
 294 .BR bincode (1),
 295 .BR dstr (3),
 296 .BR mLib (3).
 297 .SH AUTHOR
 298 Mark Wooding, <mdw@distorted.org.uk>