mdw@git.distorted.org.uk Git - mLib/blob - codec/codec.3.in

   1 .\" -*-nroff-*-
   2 .\"
   3 .\" Manual for new-fangled binary encoding and decoding
   4 .\"
   5 .\" (c) 2009, 2014, 2015, 2019, 2023, 2024 Straylight/Edgeware
   6 .\"
   7 .
   8 .\"----- Licensing notice ---------------------------------------------------
   9 .\"
  10 .\" This file is part of the mLib utilities library.
  11 .\"
  12 .\" mLib is free software: you can redistribute it and/or modify it under
  13 .\" the terms of the GNU Library General Public License as published by
  14 .\" the Free Software Foundation; either version 2 of the License, or (at
  15 .\" your option) any later version.
  16 .\"
  17 .\" mLib is distributed in the hope that it will be useful, but WITHOUT
  18 .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19 .\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20 .\" License for more details.
  21 .\"
  22 .\" You should have received a copy of the GNU Library General Public
  23 .\" License along with mLib.  If not, write to the Free Software
  24 .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25 .\" USA.
  26 .
  27 .\"--------------------------------------------------------------------------
  28 .so ../defs.man \" @@@PRE@@@
  29 .
  30 .\"--------------------------------------------------------------------------
  31 .TH codec 3mLib "9 January 2009" "Straylight/Edgeware" "mLib utilities library"
  32 .\" @codec_class
  33 .\" @codec_strerror
  34 .\" @null_codec_class
  35 .\" @base64_class
  36 .\" @file64_class
  37 .\" @base64url_class
  38 .\" @base32_class
  39 .\" @base32hex_class
  40 .\" @hex_class
  41 .
  42 .\"--------------------------------------------------------------------------
  43 .SH NAME
  44 codec \- binary encoding and decoding
  45 .
  46 .\"--------------------------------------------------------------------------
  47 .SH SYNOPSIS
  48 .
  49 .nf
  50 .B "#include <mLib/codec.h>"
  51 .B "#include <mLib/base64.h>"
  52 .B "#include <mLib/base32.h>"
  53 .B "#include <mLib/hex.h>"
  54 .PP
  55 .B "#define CDCF_LOWERC ..."
  56 .B "#define CDCF_IGNCASE ..."
  57 .B "#define CDCF_NOEQPAD ..."
  58 .B "#define CDCF_IGNEQPAD ..."
  59 .B "#define CDCF_IGNEQMID ..."
  60 .B "#define CDCF_IGNZPAD ..."
  61 .B "#define CDCF_IGNNEWL ..."
  62 .B "#define CDCF_IGNINVCH ..."
  63 .B "#define CDCF_IGNSPC ..."
  64 .B "#define CDCF_IGNJUNK ..."
  65 .PP
  66 .ta 2n
  67 .B "enum {"
  68 .B "    CDCERR_OK = ...,"
  69 .B "    CDCERR_INVCH = ...,"
  70 .B "    CDCERR_INVEQPAD = ...,"
  71 .B "    CDCERR_INVZPAD = ..."
  72 .B "};"
  73 .PP
  74 .B "typedef struct {"
  75 .B "    const char *name;"
  76 .ta 2n +\w'\fBcodec *(*encoder)('u
  77 .BI "   codec *(*encoder)(unsigned " flags ,
  78 .BI "           const char *" indent ", unsigned " maxline );
  79 .BI "   codec *(*decoder)(unsigned " flags );
  80 .B "    ...\&"
  81 .B "} codec_class;"
  82 .PP
  83 .B "typedef struct {"
  84 .B "    const codec_ops *ops;"
  85 .B "} codec;"
  86 .PP
  87 .B "typedef struct {"
  88 .B "    const codec_class *c;"
  89 .BI "   int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d );
  90 .BI "   void (*destroy)(codec *" c );
  91 .B "} codec_ops;"
  92 .PP
  93 .B "codec_class null_codec_class;"
  94 .B "codec_class base64_class, file64_class, base64url_class;"
  95 .B "codec_class base32_class, base32hex_class;"
  96 .B "codec_class hex_class;"
  97 .PP
  98 .BI "const char *codec_strerror(int " err ");"
  99 .fi
 100 .
 101 .\"--------------------------------------------------------------------------
 102 .SH DESCRIPTION
 103 .
 104 The
 105 .B codec
 106 system provides an object-based interface to functions which encode
 107 binary data as plain text and decode the result to recover the original
 108 binary data.  The interface makes it easy to support multiple encodings
 109 and select an appropriate one at runtime.
 110 .
 111 .SS "The codec_class structure"
 112 The
 113 .B codec_class
 114 structure represents a particular encoding format.  The structure has
 115 the following members.
 116 .TP
 117 .B "const char *name"
 118 The name of the class, as a null-terminated string.  The name should not
 119 contain whitespace characters.
 120 .TP
 121 .BI "codec *(*encoder)(unsigned " flags ", const char *" indent ", unsigned " maxline ")"
 122 Pointer to a function which constructs a new encoder object, of type
 123 .BR codec .
 124 The
 125 .I flags
 126 configure the behaviour of the object; the
 127 .I indent
 128 string is written to separate lines of output; the integer
 129 .I maxline
 130 is the maximum length of line to be produced, or zero to forbid line
 131 breaking.
 132 .TP
 133 .BI "codec *(*decoder)(unsigned " flags ")"
 134 Pointer to a function which constructs a new decoder object, also of
 135 type
 136 .BR codec .
 137 The
 138 .I flags
 139 configure the behaviour of the object.
 140 .PP
 141 The
 142 .I flags
 143 to the
 144 .B encoder
 145 and
 146 .B decoder
 147 functions have the following meanings.
 148 .TP
 149 .B CDCF_LOWERC
 150 For codecs which produce output using a single alphabetic case (e.g.,
 151 .BR base32 ,
 152 .BR hex ),
 153 emit and accept only lower case; the default is to emit and accept only
 154 upper case, for compatibility with RFC4648.  If the codec usually
 155 produces mixed-case output, then this flag is ignored.
 156 .TP
 157 .B CDCF_IGNCASE
 158 For codecs which produce output using a single alphabetic case, ignore
 159 the case of the input when decoding.  If the codec usually produces
 160 mixed-case output, then this flag is ignored.
 161 .TP
 162 .B CDCF_NOEQPAD
 163 For codecs which usually pad their output (e.g.,
 164 .BR base64 ,
 165 .BR base32 ),
 166 do not emit or accept padding characters.  If the codec does not usually
 167 produce padding, or the padding is not redundant, then this flag is
 168 ignored.
 169 .TP
 170 .B CDCF_IGNEQPAD
 171 For codecs which usually pad their output, do not treat incorrect (e.g.,
 172 missing or excessive) padding as an error when decoding.  If the codec
 173 does not usually produce padding, or the padding is required for
 174 unambiguous decoding, then this flag is ignored.
 175 .TP
 176 .B CDCF_IGNEQMID
 177 For codecs which usually pad their output, ignore padding characters
 178 wherever they may appear when decoding.  Usually padding characters
 179 indicate the end of the input, and further input characters are
 180 considered erroneous.  If the codec does not usually produce padding, or
 181 it is impossible to resume decoding correctly having seen padding
 182 characters, then this flag is ignored.
 183 .TP
 184 .B CDCF_IGNZPAD
 185 For codecs which need to pad their input, ignore unusual padding bits
 186 when decoding.  (This is not at all the same thing as the padding
 187 characters controlled by the flags above: they deal with padding the
 188 length of the encoding
 189 .I output
 190 up to a suitable multiple of characters; this option deals with padding
 191 of the
 192 .I input
 193 prior to encoding.)  If the codec does not add padding bits, or specific
 194 values are required for unambiguous decoding, then this flag is ignored.
 195 .TP
 196 .B CDCF_IGNNEWL
 197 Ignore newline (and carriage-return) characters when decoding: the
 198 default for RFC4648 codecs is to reject newline characters.  If these
 199 characters are significant in the encoding, then this flag is ignored.
 200 .TP
 201 .B CDCF_IGNSPC
 202 Ignore whitespace characters (other than newlines) when decoding: the
 203 default for RFC4648 codecs is to reject whitespace characters.  If these
 204 characters are significant in the encoding, then this flag is ignored.
 205 .TP
 206 .B CDCF_IGNINVCH
 207 Ignore any other invalid characters appearing in the input when
 208 decoding.
 209 .TP
 210 .B CDCF_IGNJUNK
 211 Ignore all `junk' in the input.  This should suppress almost all
 212 decoding errors.
 213 .PP
 214 If you do not set any of the
 215 .BR CDCF_IGN ...\&
 216 flags, a decoder should only accept the exact encoding that the
 217 corresponding encoder would produce (with
 218 .I maxline
 219 = 0 to inhibit line-breaking).
 220 .
 221 .SS "The codec and codec_ops structures"
 222 The
 223 .B codec
 224 structure represents the state of an encoder or decoder, as returned by
 225 the
 226 .B encoder
 227 and
 228 .B decoder
 229 functions described above.  It contains a single member.
 230 .TP
 231 .B "const codec_ops *ops"
 232 Pointer to a
 233 .B codec_ops
 234 structure which contains operations and metadata for use with the
 235 encoder or decoder.
 236 .PP
 237 The
 238 .B codec_ops
 239 structure contains the following members.
 240 .TP
 241 .B "const codec_class *c"
 242 Pointer back to the
 243 .B codec_class
 244 which was used to construct the
 245 .B codec
 246 object.
 247 .TP
 248 .BI "int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d ")"
 249 Encode or decode, using the codec
 250 .IR c ,
 251 the data in the buffer at address
 252 .I p
 253 and continuing for
 254 .I sz
 255 bytes, appending the output to the dynamic string
 256 .I d
 257 (see
 258 .BR dstr (3)).
 259 If the operation was successful, the function returns zero; otherwise it
 260 returns a nonzero error code, as described below.
 261 .TP
 262 .BI "void (*destroy)(codec *" c ")"
 263 Destroy the codec object
 264 .IR c ,
 265 freeing any resources it may hold.
 266 .PP
 267 A codec may buffer its input (e.g., if it needs to see more in order to
 268 decide what output to produce next); it may also need to take special
 269 action at the end of the input (e.g., flushing buffers, and applying
 270 padding).  To signal the codec that there is no more input, call the
 271 .B code
 272 function with a null
 273 .I p
 274 pointer.  It will then write any final output to
 275 .IR d .
 276 .PP
 277 The following error conditions may be reported.
 278 .TP
 279 .B CDCERR_INVCH
 280 An invalid character was encountered while decoding.  This includes
 281 encountering padding characters if padding is disabled using the
 282 .B CDCF_NOEQPAD
 283 flag.
 284 .TP
 285 .B CDCERR_INVEQPAD
 286 Invalid padding characters (e.g., wrong characters, or too few, too
 287 many, or none at all) were found during decoding.  This may also
 288 indicate that the input is truncated, even if the codec does not usually
 289 perform output padding.
 290 .TP
 291 .B CDCERR_INVZPAD
 292 Invalid padding bits were found during decoding.
 293 .PP
 294 The
 295 .B codec_strerror
 296 function converts these error codes to brief, (moderately)
 297 human-readable strings.
 298 .
 299 .SS "Provided codecs"
 300 The library provides a number of standard codecs.
 301 .TP
 302 .B base64
 303 Implements Base64 encoding, as defined by RFC4648.  Output is
 304 mixed-case, so the
 305 .B CDCF_LOWERC
 306 and
 307 .B CDCF_IGNCASE
 308 flags are ignored.
 309 .TP
 310 .B file64
 311 Implements a variant of the Base64 encoding which uses
 312 .RB ` % '
 313 in place of
 314 .RB ` / ',
 315 so that its output is suitable for use as a Unix filename.
 316 .TP
 317 .B base64url
 318 Implements the filename- and URL-safe variant of Base64 encoding, as
 319 defined by RFC4648.
 320 .TP
 321 .B base32
 322 Implements Base32 encoding, as defined by RFC4648.  Output is in upper
 323 case by default.
 324 .TP
 325 .B base32hex
 326 Implements the extended-hex variant of Base32, as defined by RFC4648.
 327 This encoding has the property that the encoding preserves the ordering
 328 of messages if padding is suppressed.
 329 .TP
 330 .B hex
 331 Implements hex encoding, defined by RFC4648 under the name Base16.  For
 332 compatibility with that specification, output is in upper case by
 333 default.
 334 .
 335 .\"--------------------------------------------------------------------------
 336 .SH "SEE ALSO"
 337 .
 338 .BR bincode (1),
 339 .BR dstr (3),
 340 .BR mLib (3).
 341 .
 342 .\"--------------------------------------------------------------------------
 343 .SH AUTHOR
 344 .
 345 Mark Wooding, <mdw@distorted.org.uk>
 346 .
 347 .\"----- That's all, folks --------------------------------------------------