mdw@git.distorted.org.uk Git - mLib/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* $Id: unihash.h,v 1.4 2004/04/08 01:36:13 mdw Exp $
	4	*
	5	* Simple and efficient universal hashing for hashtables
	6	*
	7	* (c) 2003 Straylight/Edgeware
	8	*/
	9
	10	/----- Licensing notice --------------------------------------------------
	11	*
	12	* This file is part of the mLib utilities library.
	13	*
	14	* mLib is free software; you can redistribute it and/or modify
	15	* it under the terms of the GNU Library General Public License as
	16	* published by the Free Software Foundation; either version 2 of the
	17	* License, or (at your option) any later version.
	18	*
	19	* mLib is distributed in the hope that it will be useful,
	20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	22	* GNU Library General Public License for more details.
	23	*
	24	* You should have received a copy of the GNU Library General Public
	25	* License along with mLib; if not, write to the Free
	26	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	27	* MA 02111-1307, USA.
	28	*/
	29
	30	#ifndef MLIB_UNIHASH_H
	31	#define MLIB_UNIHASH_H
	32
	33	#ifdef __cplusplus
	34	extern "C" {
	35	#endif
	36
	37
	38	/----- Concept -----------------------------------------------------------
	39	*
	40	* Let %$\gf{q}$% be a finite field. Choose an arbitrary %$k \inr \gf{q}$%.
	41	* Let %$M$% be a message. Injectively pad %$M$% and split it into blocks
	42	* $m_{n-1}, m_{n-2}, \ldots, m_2, m_1, m_0$% in %$\gf{q}%.
	43	* Then we compute
	44	*
	45	* %$H_k(M) = k^{n+1} + \sum_{0\le i<n} m_i k^{i+1}.$%
	46	*
	47	* Note that %$H_0(M) = 0$% for all messages %$M$%.
	48	*
	49	* If we deal with messages at most %$\ell$% blocks long then %$H_k(\cdot)$%
	50	* is %$(\ell + 1)/q$%-almost universal. Moreover, if %$q = 2^f$% then
	51	* %$H_k(\cdot)$% is %$(\ell + 1)/q$%-almost XOR-universal.
	52	*
	53	* Proof. Let %$A$% and %$B$% be two messages, represented by
	54	* %$a_{n-1}, \ldots, a_0$% and %$b_{m-1}, \ldots, b_0$% respectively; and
	55	* choose any %$\delta \in \gf{q}$%. We must bound the probability that
	56	*
	57	* %$k^{n+1} + a_{n-1} k^{n} + \cdots + a_1 k^2 + a_0 k - {}$%
	58	* %$k^{m+1} - b_{m-1} k^{m} - \cdots - b_1 k^2 - b_0 k = \delta$%.
	59	*
	60	* Firstly, we claim that if %$A$% and %$B$% are distinct, there is some
	61	* nonzero coefficient of %$k$%. For if %$n \ne m$% then, without loss of
	62	* generality, let %$n > m$%, and hence the coefficient of %$k_n$% is
	63	* nonzero. Alternatively, if %$n = m$% then there must be some
	64	* %$i \in \{ 0, \ldots, n - 1 \}$% with %$a_i \ne b_i$%, for otherwise the
	65	* messages would be identical; but then the coefficient of %$k^{i+1}$% is
	66	* %$a_i - b_i \ne 0$%.
	67	*
	68	* Hence we have a polynomial equation with degree at most %$\ell + 1$%;
	69	* there must be at most %$\ell + 1$% solutions for %$k$%; but we choose
	70	* %$k$% at random from a set of %$q$%; so the equation is true with
	71	* probability at most %$(\ell + 1)/q$%.
	72	*
	73	* This function can be used as a simple MAC with provable security against
	74	* computationally unbounded adversaries. Simply XOR the hash with a random
	75	* string indexed from a large random pad by some nonce sent with the
	76	* message. The probability of a forgery attempt being successful is then
	77	* %$(\ell + 1)/2^t$%, where %$t$% is the tag length and %$\ell$% is the
	78	* longest message permitted.
	79	*/
	80
	81	/----- Practicalities ----------------------------------------------------
	82	*
	83	* We work in %$\gf{2^32}$%, represented as a field of polynomials modulo
	84	* %$\texttt{104c11db7}_x$% (this is the standard CRC-32 polynomial). Our
	85	* blocks are bytes.
	86	*
	87	* The choice of a 32-bit hash is made for pragmatic reasons: we're never
	88	* likely to actually want all 32 bits for a real hashtable anyway. The
	89	* truncation result is needed to keep us afloat with smaller tables.
	90	*
	91	* We compute hashes using a slightly unrolled version of Horner's rule,
	92	* using the recurrence:
	93	*
	94	* %$a_{i+b} = (a_i + m_i) k^b + m_{i+1} k^{b-1} + \cdots + m_{i+b-1} k$%
	95	*
	96	* which involves one full-width multiply and %$b - 1$% one-byte multiplies;
	97	* the latter may be efficiently computed using a table lookup. Start with
	98	* %$a_0 = k$%.
	99	*
	100	* We precompute tables %$S[\cdot][\cdot][\cdot]$%, where
	101	*
	102	* %$S[u][v][w] = k^{u+1} x^{8v} w$%
	103	* for %$0 \le u < b$%, %$0 \le v < 4$%, %$0 \le w < 256)$%.
	104	*
	105	* A one-byte multiply is one lookup; a full-width multiply is four lookups
	106	* and three XORs. The processing required is then %$b + 3$% lookups and
	107	* %$b + 3$% XORs per batch, or %$(b + 3)/b$% lookups and XORs per byte, at
	108	* the expense of %$4 b$% kilobytes of tables. This compares relatively
	109	* favorably with CRC32. Indeed, in tests, this implementation with $b = 4$%
	110	* is faster than a 32-bit CRC.
	111	*/
	112
	113	/----- Header files ------------------------------------------------------/
	114
	115	#include <stddef.h>
	116
	117	#ifndef MLIB_BITS_H
	118	# include "bits.h"
	119	#endif
	120
	121	/----- Data structures ---------------------------------------------------/
	122
	123	#define UNIHASH_NBATCH 4
	124	#define UNIHASH_POLY 0x04c11db7 /* From CRC32 */
	125
	126	typedef struct unihash_info {
	127	uint32 s[UNIHASH_NBATCH][4][256]; /* S-tables as described */
	128	} unihash_info;
	129
	130	/----- A global hash-info table ------------------------------------------/
	131
	132	extern unihash_info unihash_global; /* Key this if you like */
	133
	134	/----- Functions provided ------------------------------------------------/
	135
	136	/* --- @unihash_setkey@ --- *
	137	*
	138	* Arguments: @unihash_info *i@ = where to store the precomputed tables
	139	* @uint32 k@ = the key to set, randomly chosen
	140	*
	141	* Returns: ---
	142	*
	143	* Use: Calculates the tables required for efficient hashing.
	144	*/
	145
	146	extern void unihash_setkey(unihash_info /i/, uint32 /k*/);
	147
	148	/* --- @unihash_hash@ --- *
	149	*
	150	* Arguments: @const unihash_info *i@ = pointer to precomputed table
	151	* @uint32 a@ = @UNIHASH_INIT(i)@ or value from previous call
	152	* @const void *p@ = pointer to data to hash
	153	* @size_t sz@ = size of the data
	154	*
	155	* Returns: Hash of data so far.
	156	*
	157	* Use: Hashes data. Call this as many times as needed.
	158	*/
	159
	160	#define UNIHASH_INIT(i) ((i)->s[0][0][1]) /* %$k$% */
	161
	162	extern uint32 unihash_hash(const unihash_info /i/, uint32 /a*/,
	163	const void /p/, size_t /sz*/);
	164
	165	/* --- @unihash@ --- *
	166	*
	167	* Arguments: @const unihash_info *i@ = precomputed tables
	168	* @const void *p@ = pointer to data to hash
	169	* @size_t sz@ = size of the data
	170	*
	171	* Returns: The hash value computed.
	172	*
	173	* Use: All-in-one hashing function. No faster than using the
	174	* separate calls, but more convenient.
	175	*/
	176
	177	#define UNIHASH(i, p, sz) (unihash_hash((i), UNIHASH_INIT((i)), (p), (sz)))
	178
	179	extern uint32 unihash(const unihash_info /i*/,
	180	const void /p/, size_t /sz*/);
	181
	182	/----- That's all, folks -------------------------------------------------/
	183
	184	#ifdef __cplusplus
	185	}
	186	#endif
	187
	188	#endif