Commit | Line | Data |
---|---|---|
03d53b73 | 1 | .\" -*-nroff-*- |
2 | .de VS | |
3 | .sp 1 | |
4 | .RS | |
5 | .nf | |
6 | .ft B | |
7 | .. | |
8 | .de VE | |
9 | .ft R | |
10 | .fi | |
11 | .RE | |
12 | .sp 1 | |
13 | .. | |
14 | .de hP | |
15 | .IP | |
16 | .ft B | |
17 | \h'-\w'\\$1\ 'u'\\$1\ \c | |
18 | .ft P | |
19 | .. | |
d056fbdf MW |
20 | .ie t \{\ |
21 | . ds o \(bu | |
22 | . de VP | |
23 | . sp .4v | |
24 | .. | |
25 | \} | |
26 | .el \{\ | |
27 | . ds o o | |
28 | . de VP | |
29 | . sp | |
30 | .. | |
31 | \} | |
fbf20b5b | 32 | .TH hash 3 "2 August 1999" "Straylight/Edgeware" "mLib utilities library" |
03d53b73 | 33 | .SH "NAME" |
34 | hash \- low-level hashtable implementation | |
35 | .\" @hash_create | |
36 | .\" @hash_destroy | |
37 | .\" @hash_bin | |
38 | .\" @hash_extend | |
39 | .\" @hash_remove | |
40 | .\" @hash_mkiter | |
41 | .\" @hash_next | |
42 | .\" | |
43 | .\" @HASH_BIN | |
44 | .\" @HASH_MKITER | |
45 | .\" @HASH_NEXT | |
46 | .\" | |
47 | .SH "SYNOPSIS" | |
48 | .nf | |
49 | .B "#include <mLib/hash.h>" | |
d056fbdf | 50 | .PP |
adec5584 | 51 | .ta 2n |
4729aa69 | 52 | .B "typedef struct {" |
adec5584 MW |
53 | .B " uint32 mask;" |
54 | .B " hash_base **v;" | |
55 | .B " arena *a;" | |
4729aa69 | 56 | .B "} hash_table;" |
d056fbdf | 57 | .PP |
4729aa69 | 58 | .B "typedef struct {" |
adec5584 MW |
59 | .B " hash_base *next;" |
60 | .B " uint32 hash;" | |
4729aa69 | 61 | .B "} hash_base;" |
d056fbdf | 62 | .PP |
4729aa69 | 63 | .B "typedef struct { ...\& } hash_iter;" |
d056fbdf | 64 | .PP |
03d53b73 | 65 | .BI "void hash_create(hash_table *" t ", size_t " n ); |
66 | .BI "void hash_destroy(hash_table *" t ); | |
67 | .BI "hash_base **hash_bin(hash_table *" t ", uint32 " hash ); | |
68 | .BI "int hash_extend(hash_table *" t ); | |
db0e70a1 | 69 | .BI "void hash_remove(hash_table *" t ", hash_base *" b ); |
03d53b73 | 70 | .BI "void hash_mkiter(hash_iter *" i ", hash_table *" t ); |
71 | .BI "hash_base *hash_next(hash_iter *" i ); | |
d056fbdf | 72 | .PP |
03d53b73 | 73 | .BI "hash_base **HASH_BIN(hash_table *" t ", uint32 " hash ); |
74 | .BI "void HASH_MKITER(hash_iter *" i ", hash_table *" t ); | |
75 | .BI "void HASH_NEXT(hash_iter *" i ", " b ); | |
76 | .fi | |
77 | .SH "OVERVIEW" | |
78 | The | |
79 | .B hash | |
80 | functions provide the basis for an extensible hashtable implementation. | |
81 | The implementation is not complete. Many decisions have been left to | |
82 | the user, including: | |
83 | .hP \*o | |
528c8b4d | 84 | how keys should be represented, hashed and compared; |
03d53b73 | 85 | .hP \*o |
528c8b4d | 86 | how objects contained within the table should be allocated; and |
03d53b73 | 87 | .hP \*o |
528c8b4d | 88 | when the hashtable should be extended. |
03d53b73 | 89 | .PP |
90 | A complete hashtable implementation will need to take the above | |
91 | decisions. If you just want a prepackaged solution, see | |
92 | .BR sym (3) | |
93 | which provides one. | |
94 | .SH "IMPLEMENTATION DETAILS" | |
95 | Each item in the hashtable is assigned a 32-bit integer | |
96 | .IR hash : | |
97 | a number computed somehow from the item's data such that two items which | |
98 | are considered equal will yield the same hash. Ideally, different items | |
99 | will yield different hashes. It is important for this implementation | |
100 | that all bits of the hash are similarly good. | |
101 | .PP | |
102 | In order to look up an item, the high bits of the hash are masked off | |
103 | and the remainder used as an index into a vector of | |
104 | .IR "bin lists" . | |
105 | Each bin contains a list of items with identical low-order bits of their | |
106 | hashes. | |
107 | .PP | |
108 | A table expansion involves doubling the size of the bin vector. Each | |
109 | bin list is then split into two, items being placed into a new bin | |
110 | depending on the setting of the next lowest hash bit. By expanding the | |
111 | hashtable as needed, lookups remain constant-time. | |
112 | .PP | |
113 | A low-level hashtable is represented by a | |
114 | .B hash_table | |
115 | structure. It contains two members: | |
116 | .TP | |
ff76c38f | 117 | .B "uint32 mask" |
03d53b73 | 118 | The current bitmask to be applied to hashes. This is one less than the |
119 | current number of bins in the hashtable, and is applied to hash values | |
120 | in order to decide which bin an item should be in. | |
121 | .TP | |
ff76c38f | 122 | .B "hash_base **v" |
03d53b73 | 123 | The bin vector. It is an array of pointers to hashtable items. |
124 | .PP | |
125 | A hashtable item consists of a | |
126 | .B hash_base | |
127 | structure. A full hashtable implementation will need to extend this | |
128 | structure by adding keying information and other data; the | |
129 | .B hash_base | |
130 | only contains the bare minimum of information needed to maintain the | |
131 | hashtable at a low level. It contains the following members: | |
132 | .TP | |
ff76c38f | 133 | .B "hash_base *next" |
03d53b73 | 134 | Pointer to the next item in the bin list. The final item has a null |
135 | .B next | |
136 | pointer. The entry in the bin vector is null if the bin list is empty. | |
137 | It is up to the high-level implementation to insert items into the list. | |
138 | .TP | |
ff76c38f | 139 | .B "uint32 hash" |
03d53b73 | 140 | The hash for this item. This must be the full 32-bit hash for the |
141 | current item. It is used during hashtable expansion to determine which | |
142 | bin an item should be moved to. | |
143 | .SH "FUNCTIONALITY PROVIDED" | |
144 | This section describes the functions and macros provided for building | |
145 | hashtables. Code examples are given throughout. They assume the | |
146 | following definitions: | |
147 | .VS | |
d056fbdf | 148 | .ta 2n |
03d53b73 | 149 | /* --- A table of items --- */ |
d056fbdf | 150 | .VP |
03d53b73 | 151 | typedef struct item_table { |
d056fbdf MW |
152 | hash_table t; |
153 | size_t load; | |
03d53b73 | 154 | }; |
d056fbdf | 155 | .VP |
03d53b73 | 156 | /* --- An item --- */ |
d056fbdf | 157 | .VP |
03d53b73 | 158 | typedef struct item { |
d056fbdf MW |
159 | hash_base b; |
160 | const char *k; | |
161 | /* ... */ | |
03d53b73 | 162 | } item; |
163 | .VE | |
164 | The implementation presented here is simple but relatively bad. The | |
165 | source file | |
166 | .B sym.c | |
167 | presents a more realistic example, but is rather more complex. | |
168 | .SS "Initialization and finalization" | |
169 | An empty hashtable is initialized by calling | |
170 | .B hash_create | |
171 | with the address of a | |
172 | .B hash_table | |
173 | structure to be filled in and the initial number of hash bins to create. | |
174 | .PP | |
175 | For example, an item table might be initialized like this: | |
176 | .VS | |
d056fbdf | 177 | .ta 2n |
03d53b73 | 178 | void item_createtab(item_table *t) |
179 | { | |
d056fbdf MW |
180 | hash_create(&t->t, ITEM_INITSZ); |
181 | t->load = ITEM_INITLOAD; | |
03d53b73 | 182 | } |
183 | .VE | |
184 | A hashtable can be destroyed by calling | |
185 | .B hash_destroy | |
186 | with the address of the | |
187 | .B hash_table | |
188 | structure. This does not attempt to deallocate the individual items; | |
189 | that must be done beforehand. | |
190 | .PP | |
191 | The usual way to deallocate the individual hashtable items is using the | |
192 | iteration constructs described below. | |
193 | .VS | |
d056fbdf | 194 | .ta 2n +2n |
03d53b73 | 195 | void item_destroytab(item_table *t) |
196 | { | |
d056fbdf MW |
197 | hash_iter i; |
198 | hash_base *b; | |
199 | for (hash_mkiter(&i, &t->t); (b = hash_next(&i)) != 0; ) { | |
200 | item *ii = (item *)b; | |
201 | free(ii->k); | |
202 | /* ... */ | |
203 | DESTROY(ii); | |
204 | } | |
205 | hash_destroy(&t->t); | |
03d53b73 | 206 | } |
207 | .VE | |
208 | .sp -1 | |
209 | .SS "Searching, adding and removing" | |
210 | Items must be searched for and added by hand. | |
211 | .PP | |
212 | The macro | |
213 | .B HASH_BIN | |
214 | returns the address of the bin list haed for a particular hashtable and | |
215 | hash value. The function | |
216 | .B hash_bin | |
217 | works the same way and provides the same result, but since the macro is | |
218 | very simple its use is preferred. However, it will evaluate its | |
219 | arguments multiple times. | |
220 | .PP | |
221 | Once the bin list has been found, it's fairly easy to search for an | |
222 | exact match. A simple search might look something like this: | |
223 | .VS | |
d056fbdf | 224 | .ta 2n +2n +2n 20m |
03d53b73 | 225 | item *lookup(item_table *t, const char *k) |
226 | { | |
d056fbdf MW |
227 | uint32 h = hash(k); /* Hash \fIk\fP somehow */ |
228 | hash_base **bin = HASH_BIN(&t->t, h); | |
229 | hash_base *b; | |
230 | for (b = *bin; b; b = b->next) { | |
231 | item *i = (item *)b; | |
232 | if (h == i->b.hash && strcmp(k, i->k) == 0) | |
233 | return (i); | |
234 | } | |
235 | return (0); | |
03d53b73 | 236 | } |
237 | .VE | |
238 | Insertion is also relatively trivial given the bin list head. Insertion | |
239 | may make the hashtable too large, so it might be necessary to extend | |
240 | it. Extension is performed by | |
241 | .BR hash_extend , | |
242 | which is passed only the address of the hashtable. It returns nonzero | |
243 | if extension was successful. | |
244 | .VS | |
d056fbdf | 245 | .ta 2n +2n |
03d53b73 | 246 | item *add(item_table *t, const char *k, /* ... */) |
247 | { | |
d056fbdf MW |
248 | item *i; |
249 | uint32 h; | |
250 | hash_base **bin; | |
251 | .VP | |
252 | /* --- See if the item is already there --- */ | |
253 | .VP | |
254 | if ((i = lookup(t, k)) != 0) | |
255 | return (i); | |
256 | .VP | |
257 | /* --- Make a new hashtable item --- */ | |
258 | .VP | |
259 | i = CREATE(item); | |
260 | i->k = xstrdup(k); | |
261 | /* ... */ | |
262 | .VP | |
263 | /* --- Link it into the bin list --- */ | |
264 | .VP | |
265 | h = i->b.hash = hash(k); | |
266 | bin = HASH_BIN(&t->t, h); | |
267 | i->b.next = *bin; | |
268 | *bin = &i->b.next; | |
269 | .VP | |
270 | /* --- Maybe extend the hashtable --- */ | |
271 | .VP | |
272 | if (t->load) | |
273 | t->load--; | |
274 | else if (hash_extend(&t->t)) | |
275 | t->load = recalc_load(t); | |
276 | .VP | |
277 | /* --- Done --- */ | |
278 | .VP | |
279 | return (i); | |
03d53b73 | 280 | } |
281 | .VE | |
282 | The | |
283 | .B sym | |
284 | implementation is rather more sophisticated in its approach but the idea | |
285 | is the same. In particular, | |
286 | .B sym | |
287 | provides a single interface for lookup and insertion which prevents the | |
288 | unnecessary rehashing performed by the above code. | |
289 | .PP | |
290 | Removal of items is more straightforward. The function | |
291 | .B hash_remove | |
292 | will unlink a given item from its bin list, after which point it is safe | |
293 | to remove. | |
294 | .SS "Iteration" | |
295 | Iteration allows code to be performed on all the items in a hashtable. | |
296 | This is done using an | |
297 | .I iterator | |
298 | object, represented by a | |
299 | .B hash_iter | |
300 | structure. An iterator is initialized by calling | |
301 | .BR hash_mkiter . | |
302 | Each call to | |
303 | .B hash_next | |
304 | yields a different item from the hashtable until there are none left, a | |
305 | condition signified by a null return value. | |
306 | .PP | |
307 | The macros | |
308 | .B HASH_MKITER | |
309 | and | |
310 | .B HASH_NEXT | |
311 | do the same jobs as the above functions. However, | |
312 | .B HASH_NEXT | |
313 | has a slightly bizarre argument passing convention: its second argument | |
314 | is an | |
315 | .I lvalue | |
316 | which is updated to contain the address of the next item. | |
317 | .PP | |
318 | The finalization code above contained an example of iteration. | |
319 | .SH "SEE ALSO" | |
320 | .BR sym (3), | |
321 | .BR mLib (3). | |
322 | .SH "AUTHOR" | |
9b5ac6ff | 323 | Mark Wooding, <mdw@distorted.org.uk> |