mdw@git.distorted.org.uk Git - mLib/blame_incremental

... / ...

Commit	Line	Data
	1	.\" --nroff--
	2	.de VS
	3	.sp 1
	4	.RS
	5	.nf
	6	.ft B
	7	..
	8	.de VE
	9	.ft R
	10	.fi
	11	.RE
	12	.sp 1
	13	..
	14	.de hP
	15	.IP
	16	.ft B
	17	\h'-\w'\\$1\ 'u'\\$1\ \c
	18	.ft P
	19	..
	20	.ie t \{\
	21	. ds ss \s8\u
	22	. ds se \d\s0
	23	. ds us \s8\d
	24	. ds ue \u\s0
	25	. ds d \(d
	26	. ds >= \(>=
	27	.\}
	28	.el \{\
	29	. ds ss ^
	30	. ds se
	31	. ds us _
	32	. ds ue
	33	. ds *d \fIdelta\fP
	34	. ds >= >=
	35	.\}
	36	.TH unihash 3 "5 July 2003" "Straylight/Edgeware" "mLib utilities library"
	37	.SH NAME
	38	unihash \- simple and efficient universal hashing for hashtables
	39	.\" @unihash_setkey
	40	.\" @UNIHASH_INIT
	41	.\" @unihash_hash
	42	.\" @UNIHASH
	43	.\" @unihash
	44	.SH SYNOPSIS
	45	.nf
	46	.B "#include <mLib/unihash.h>"
	47
	48	.B "unihash_info unihash_global;"
	49
	50	.BI "void unihash_setkey(unihash_info *" i ", uint32 " k );
	51	.BI "uint32 UNIHASH_INIT(const unihash_info *" i );
	52	.BI "uint32 unihash_hash(const unihash_info *" i ", uint32 " a ,
	53	.BI " const void *" p ", size_t " sz );
	54	.BI "uint32 unihash(const unihash_info " i ", const void " p ", size_t " sz );
	55	.BI "uint32 UNIHASH(const unihash_info " i ", const void " p ", size_t " sz );
	56	.fi
	57	.SH DESCRIPTION
	58	The
	59	.B unihash
	60	system implements a simple and relatively efficient
	61	.IR "universal hashing family" .
	62	Using a such a universal hashing family means that it's provably
	63	difficult for an adversary to choose input data whose hashes collide,
	64	thus guaranteeing good average performance even on maliciously chosen
	65	data.
	66	.PP
	67	Unlike, say,
	68	.BR crc32 (3),
	69	the
	70	.B unihash
	71	function is
	72	.I keyed
	73	\- in addition to the data to be hashed, the function takes as input a
	74	32-bit key. This key should be chosen at random each time the program
	75	runs.
	76	.SS "Preprocessing a key"
	77	Before use, a key must be
	78	.I preprocessed
	79	into a large (16K) table which is used by the main hashing functions.
	80	The preprocessing is done by
	81	.BR unihash_setkey :
	82	pass it a pointer to a
	83	.B unihash_info
	84	structure and the 32-bit key you've chosen, and it stores the table in
	85	the structure.
	86	.PP
	87	Objects of type
	88	.B unihash_info
	89	don't contain any pointers to other data and are safe to free when
	90	you've finished with them; or you can just allocate them statically or
	91	on the stack if that's more convenient.
	92	.SS "Hashing data"
	93	The function
	94	.B unihash_hash
	95	takes as input:
	96	.TP
	97	.BI "const unihash_info *" i
	98	A pointer to the precomputed tables for a key.
	99	.TP
	100	.BI "uint32 " a
	101	An accumulator value. This should be
	102	.BI UNIHASH_INIT( i )
	103	for the first chunk of a multi-chunk input, or the result of the
	104	previous
	105	.B unihash_hash
	106	call for subsequent chunks.
	107	.TP
	108	.BI "const void *" p
	109	A pointer to the start of a buffer containing this chunk of data.
	110	.TP
	111	.BI "size_t " sz
	112	The length of the chunk.
	113	.PP
	114	The function returns a new accumulator value, which is also the hash of
	115	the data so far. So, to hash multiple chunks of data, do something like
	116	.VS
	117	uint32 a = UNIHASH_INIT(i);
	118	a = unihash_hash(i, a, p_0, sz_0);
	119	a = unihash_hash(i, a, p_1, sz_1);
	120	/* ... */
	121	a = unihash_hash(i, a, p_n, sz_n);
	122	.VE
	123	The macro
	124	.B UNIHASH
	125	and function
	126	.B unihash
	127	are convenient interfaces to
	128	.B unihash_hash
	129	if you only wanted to hash one chunk.
	130	.SS "Global hash info table"
	131	There's no problem with using the same key for several purposes, as long
	132	as it's secret from all of your adversaries. Therefore, there is a
	133	global
	134	.B unihash_info
	135	table define, called
	136	.BR unihash_global .
	137	This initially contains information for a fixed key which the author
	138	chose at random, but if you need to you can set a different key into it
	139	.I before
	140	it gets used to hash any data (otherwise your hash tables will become
	141	messed up).
	142	.SS "Theoretical issues"
	143	The hash function implemented by
	144	.B unihash
	145	is
	146	.RI ( l \ +\ 1)/2\(ss32\(se-almost
	147	XOR-universal, where
	148	.I l
	149	is the length (in bytes) of the longest string you hash. That means
	150	that, for any pair of strings
	151	.I x
	152	and
	153	.I y
	154	and any 32-bit value \(d, the probability taken over all choices of the
	155	key
	156	.I k
	157	that
	158	.IR H\(usk\(ue ( x )\ \c
	159	.BR xor \c
	160	.RI \ H\(usk\(ue ( y )\ =\ \(d
	161	is no greater than
	162	.RI ( l \ +\ 1)/2\(ss32\(se.
	163	.PP
	164	This fact is proven in the header file, but it requires more
	165	sophisticated typesetting than is available here.
	166	.PP
	167	The function evaluates a polynomial over GF(2\(ss32\(se) whose
	168	coefficients are the bytes of the message and whose variable is the key.
	169	Details are given in the header file.
	170	.PP
	171	For best results, you should choose the key as a random 32-bit number
	172	each time your program starts. Choosing a different key for different
	173	hashtables isn't necessary. It's probably a good idea to avoid the keys
	174	0 and 1. This raises the collision bound to
	175	.RI ( l \ +\ 1)/(2\(ss32\(se\ \-\ 2)
	176	(which isn't a significant increase) but eliminates keys for which the
	177	hash's behaviour is particularly poor.
	178	.PP
	179	In tests,
	180	.B unihash
	181	actually performed better than
	182	.BR crc32 ,
	183	so if you want to just use it as a fast-ish hash with good statistical
	184	properties, choose some fixed key
	185	.IR k \ \*(>=\ 2.
	186	.PP
	187	We emphasize that the proof of this function's collision behaviour is
	188	.I not
	189	dependent on any unproven assumptions (unlike many `proofs' of
	190	cryptographic security, which actually reduce the security of some
	191	construction to the security of its components). It's just a fact.
	192	.SH SEE ALSO
	193	.BR unihash-mkstatic (3),
	194	.BR crc32 (3),
	195	.BR mLib (3).
	196	.SH AUTHOR
	197	Mark Wooding (mdw@distorted.org.uk).