Commit | Line | Data |
---|---|---|
d3f33b9a MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Bit permutations | |
4 | * | |
5 | * (c) 2024 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of Catacomb. | |
11 | * | |
12 | * Catacomb is free software: you can redistribute it and/or modify it | |
13 | * under the terms of the GNU Library General Public License as published | |
14 | * by the Free Software Foundation; either version 2 of the License, or | |
15 | * (at your option) any later version. | |
16 | * | |
17 | * Catacomb is distributed in the hope that it will be useful, but | |
18 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | * Library General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with Catacomb. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | #ifndef CATACOMB_PERMUTE_H | |
29 | #define CATACOMB_PERMUTE_H | |
30 | ||
31 | #ifdef __cplusplus | |
32 | extern "C" { | |
33 | #endif | |
34 | ||
35 | /*----- Header files ------------------------------------------------------*/ | |
36 | ||
37 | #include <mLib/macros.h> | |
38 | ||
39 | /*----- Macros provided ---------------------------------------------------*/ | |
40 | ||
41 | /* --- Theory lesson --- * | |
42 | * | |
43 | * It's often useful to rearrange the bits in a word, or a value split across | |
44 | * two (or more) words, so it's worth taking a moment to consider how this | |
45 | * might be done efficiently. Throughout this discussion, we use the | |
46 | * standard bit numbering, where the least significant bit in a word is bit | |
47 | * zero, with numbering increasing with significance. Equivalently, bit | |
48 | * %$k$% has the numerical value %$2^k$%. | |
49 | * | |
50 | * An essential primitive is the `swizzle', which exchanges two similarly | |
51 | * arranged but disjoint groups of bits within a word which are separated by | |
52 | * a constant offset. The groups of bits don't have to be contiguous, but | |
53 | * they must be identified by shifts of the same mask. | |
54 | * | |
55 | * An especially important class of swizzle permutations considers the | |
56 | * individual bits of bit indices. Permutations of the bits in a word can be | |
57 | * interpreted as operations on the bits of the indices. For %$i \ge 0$%, | |
58 | * let %$\mu_i$% be the mask such that bit %$k$% of %$\mu_i$% is set if and | |
59 | * only if bit %$i$% is clear in %$k$%. Hence | |
60 | * | |
61 | * %$\mu_0 = (\ldots 01010101010101010101010101010101)_2 = -1/3$% , | |
62 | * %$\mu_1 = (\ldots 00110011001100110011001100110011)_2 = -1/5$% , | |
63 | * %$\mu_2 = (\ldots 00001111000011110000111100001111)_2 = -1/17$% , | |
64 | * %$\mu_3 = (\ldots 00000000111111110000000011111111)_2 = -1/257$% , | |
65 | * etc. | |
66 | * | |
67 | * In general, the low %$2^i$% bits of %$\mu_i$% are set, the next least | |
68 | * significant %$2^i$% bits are clear, the next %$2^i$% bits are set, and so | |
69 | * on. Hence, %$\mu_i \lsl 2^i = \bar{\mu}_i$%, or, in the %$2$%-adic | |
70 | * numbers %$\Z_2$%, %$2^{2^i} \mu_k = -1 - \mu_i$%, whence, in general, | |
71 | * %$\mu_i = -1/(2^{2^i} + 1)$%. Let %$x$% be some binary value; now we can | |
72 | * describe some important swizzles. | |
73 | * | |
74 | * * Let %$y=(x\bitand\mu_i)\lsl 2^i \bitor (x\bitand\bar{\mu}_i)\lsr 2^i%, | |
75 | * or %$y = (x\bitand\mu_i) \lsl 2^k \bitor (x \lsr 2^k)\bitand\mu_i$%. | |
76 | * This exchanges the two sub-blocks of %$2^i$% bits in each %$2^{i+1}$% | |
77 | * block in %$x$%. In terms of indices, now the bits at indices in which | |
78 | * bit %$i$% is set precede those in which bit %$i$% is clear. | |
79 | * %%\emph{We have inverted index bit %$i$%.}%% | |
80 | * | |
81 | * * Suppose that %$i < j$%, and let %$m = \bar{\mu}_i \bitand \mu_j$% and | |
82 | * %$s = 2^j - 2^i$%; let %$y = (x \bitand m) \lsl s \bitor {}$% | |
83 | * %$(x \lsr s)\bitand m \bitor (x\bitand \overline{m \bitor m \lsl s$%. | |
84 | * Now, %$m$% has its bit %$k$% set if and only if bit %$i$% of %$k$% is | |
85 | * set and bit %$j$% of %$k$% is clear. The related mask %$m \lsl s$% | |
86 | * has bit %$k + s$% set if %$k% has the same property; but, %$k$% | |
87 | * will have bit %$i$% set and bit %$j$% clear if and only if bit %$i$% | |
88 | * is clear and bit %$j$% is set in %$k + s$%. Combined, the mask | |
89 | * %$m \bitor (m \lsl s)$% selects bits at indices in which bits %$i$% | |
90 | * and %$j$% differ, so %$\overline{m \bitor (m \lsl s)}$% selects the | |
91 | * bits at indices where bits %$i$% and %$j$% are equal. | |
92 | * | |
93 | * This swizzle therefore exchanges the bits of %$x$% at indices where | |
94 | * bit %$i$% is set and bit %$j$% is clear with those at indices where | |
95 | * bit %$j$% is set and %$i$% is clear, leaving alone those bits at | |
96 | * indices where bits %$i$% and %$j$% are either both clear or both set. | |
97 | * %%\emph{We have exchanged index bits %$i$% and %$j$%.}%% | |
98 | * | |
99 | * * Rounding off this little collection, suppose again that %$i < j$%, and | |
100 | * let %$m = \mu_i \bitand \mu_j$% and %$s = 2^i + 2^j$%; and again, let | |
101 | * %$y = (x \bitand m) \lsl s \bitor (x \lsr s) \bitand m \bitor {}$% | |
102 | * %$(x \bitand \overline{m \bitor m \lsl s$%. Now, %$m$% has its bit | |
103 | * %$k$% set if and only if bits %$i$% and %$j$% of %$k$% are both clear. | |
104 | * This swizzle therefore exchanges the bits of %$x$% at indices where | |
105 | * bits %$i$% and %$j$% are both clear with those at indices where | |
106 | * bits %$i$% and %$j$% are both set, leaving alone those bits at indices | |
107 | * where bits %$i$% and %$j$% differ. It takes a little work to (left as | |
108 | * an exercise) to see that the effect combines the previous two. | |
109 | * %%\emph{We have exchanged and inverted index bits %$i$% and %$j$%.}%% | |
110 | * | |
111 | * Related is the `twizzle', which exchanges similarly arranged groups of | |
112 | * bits within two different words. This can be seen as a multiprecision | |
113 | * variant of the swizzle. | |
114 | * | |
c7c44436 MW |
115 | * Finally, we consider general permutations. These can be implemented using |
116 | * Beneš networks. Pick some index bit number %$i$%. By applying a swizzle | |
117 | * with a shift by %$2^i$% to the inputs, and another to the outputs, we can | |
118 | * reduce the problem to finding two independent permutations, one affecting | |
119 | * bits whose index has bit %$i$% clear, and the other affecting bits whose | |
120 | * index has bit %$i$% set. This doesn't sound so helpful, except that (a) | |
121 | * the smaller permutations can each be implemented in the same way, and (b) | |
122 | * they can be performed in parallel. Small Beneš networks can be | |
123 | * constructed by hand, but computer assistance is useful for larger ones; | |
124 | * there are some utilities in `utils/benes.lisp'. | |
125 | * | |
d3f33b9a MW |
126 | * The machinery here expects some parameters to have been defined: |
127 | * | |
128 | * * @regty@ should be an unsigned integer type, and | |
129 | * | |
130 | * * @REGWD@ should be a power of two such that @regty@ can store at least | |
131 | * @REGWD@ bits. | |
132 | */ | |
133 | ||
134 | /* We begin with some internal utilities. @CATACOMB__REPLICATE_n_(x)@ | |
135 | * produces a hexadecimal constant consisting of %$n$% copies of the digits | |
136 | * @x@. | |
137 | */ | |
138 | #define CATACOMB__REPLICATE_16_(x) CATACOMB__REPLICATE_8_(GLUE(x, x)) | |
139 | #define CATACOMB__REPLICATE_8_(x) CATACOMB__REPLICATE_4_(GLUE(x, x)) | |
140 | #define CATACOMB__REPLICATE_4_(x) CATACOMB__REPLICATE_2_(GLUE(x, x)) | |
141 | #define CATACOMB__REPLICATE_2_(x) CATACOMB__REPLICATE_1_(GLUE(x, x)) | |
142 | #define CATACOMB__REPLICATE_1_(x) GLUE(0x, x) | |
143 | ||
144 | /* More internal utilities. @CATACOMB__REPLi_Un(x)@ returns an %$n$%-bit | |
145 | * hexadecimal constant formed by replicating the %$i$%-bit constant (which | |
146 | * must have leading zeros) %$n/i$% times. | |
147 | */ | |
148 | #define CATACOMB__REPL8_U8 CATACOMB__REPLICATE_1_ | |
149 | #define CATACOMB__REPL8_U16 CATACOMB__REPLICATE_2_ | |
150 | #define CATACOMB__REPL8_U32 CATACOMB__REPLICATE_4_ | |
151 | #define CATACOMB__REPL8_U64 CATACOMB__REPLICATE_8_ | |
152 | #define CATACOMB__REPL8_U128 CATACOMB__REPLICATE_16_ | |
153 | ||
154 | #define CATACOMB__REPL16_U16 CATACOMB__REPLICATE_1_ | |
155 | #define CATACOMB__REPL16_U32 CATACOMB__REPLICATE_2_ | |
156 | #define CATACOMB__REPL16_U64 CATACOMB__REPLICATE_4_ | |
157 | #define CATACOMB__REPL16_U128 CATACOMB__REPLICATE_8_ | |
158 | ||
159 | #define CATACOMB__REPL32_U32 CATACOMB__REPLICATE_1_ | |
160 | #define CATACOMB__REPL32_U64 CATACOMB__REPLICATE_2_ | |
161 | #define CATACOMB__REPL32_U128 CATACOMB__REPLICATE_4_ | |
162 | ||
163 | #define CATACOMB__REPL64_U64 CATACOMB__REPLICATE_1_ | |
164 | #define CATACOMB__REPL64_U128 CATACOMB__REPLICATE_2_ | |
165 | ||
166 | #define CATACOMB__REPL128_U128 CATACOMB__REPLICATE_1_ | |
167 | ||
168 | /* Finally, @CATACOMB__REPLi(x)@ returns a hexadecimal constant formed by | |
169 | * replicating the %$i$%-bit constant (including leading zeros) sufficiently | |
170 | * many times as to fill a @REGWD@-bit wide register. | |
171 | */ | |
172 | #define CATACOMB__REPL8(x) GLUE(CATACOMB__REPL8_U, REGWD)(x) | |
173 | #define CATACOMB__REPL16(x) GLUE(CATACOMB__REPL16_U, REGWD)(x) | |
174 | #define CATACOMB__REPL32(x) GLUE(CATACOMB__REPL32_U, REGWD)(x) | |
175 | #define CATACOMB__REPL64(x) GLUE(CATACOMB__REPL64_U, REGWD)(x) | |
176 | #define CATACOMB__REPL128(x) GLUE(CATACOMB__REPL128_U, REGWD)(x) | |
177 | ||
178 | /* The macro @CATACOMB__IXMASK_Bi(_)@ evaluates to the low @REGWD@ bits of | |
179 | * the constant %$\mu_i$% defined above. The argument is ignored; it's | |
180 | * necessary to prevent technical problems with macro expansion | |
181 | * (specifically, to allow the blue paint on @GLUE@ to be washed off before | |
182 | * invoking @CATACOMB__REPLi@). | |
183 | */ | |
184 | #define CATACOMB__IXMASK_B0(_) CATACOMB__REPL8(55) | |
185 | #define CATACOMB__IXMASK_B1(_) CATACOMB__REPL8(33) | |
186 | #define CATACOMB__IXMASK_B2(_) CATACOMB__REPL8(0f) | |
187 | #define CATACOMB__IXMASK_B3(_) CATACOMB__REPL16(00ff) | |
188 | #define CATACOMB__IXMASK_B4(_) CATACOMB__REPL32(0000ffff) | |
189 | #define CATACOMB__IXMASK_B5(_) CATACOMB__REPL64(00000000ffffffff) | |
190 | #define CATACOMB__IXMASK_B6(_) \ | |
191 | CATACOMB__REPL128(0000000000000000ffffffffffffffff) | |
192 | ||
193 | /* @IXMASK(i)@ returns the low @REGWD@ bits of %$\mu_i$%. The argument @i@ | |
194 | * must be a decimal integer constant, without leading zeros. | |
195 | */ | |
196 | #define IXMASK(i) GLUE(CATACOMB__IXMASK_B, i)(hunoz) | |
197 | ||
198 | /* @IXMASK_xy(i, j)@ returns a @REGWD@-bit mask in which bit %$k$% is set if | |
199 | * bit %$i$% of %$k$% is equal to %$x$% and bit %$j$% of %$k$% is equal to | |
200 | * %$y$%. The arguments @i@ and @j@ must be decimal integer constants, | |
201 | * without leading zeros. | |
202 | */ | |
203 | #define IXMASK_00(i, j) (IXMASK(i)&IXMASK(j)) | |
204 | #define IXMASK_01(i, j) (IXMASK(i)&~IXMASK(j)) | |
205 | #define IXMASK_10(i, j) (~IXMASK(i)&IXMASK(j)) | |
206 | #define IXMASK_11(i, j) (~IXMASK(i)&~IXMASK(j)) | |
207 | ||
208 | /* The general swizzle operation. Exchange the bits in @x@ selected by | |
209 | * @mask@ with those selected by @mask << shift@. | |
210 | */ | |
211 | #define SWIZZLE(x, shift, mask) do { \ | |
212 | regty _t = ((x) ^ ((x) >> (shift)))&(mask); \ | |
213 | (x) ^= _t | (_t << (shift)); \ | |
214 | } while (0) | |
215 | ||
216 | /* A swizzle on two words @x@ and @y@, using the same shift, but different | |
217 | * masks @mask0@ and @mask1@. This is just a simple abbreviation. | |
218 | */ | |
219 | #define SWIZZLE_2(x, y, shift, mask0, mask1) do { \ | |
220 | SWIZZLE(x, shift, mask0); SWIZZLE(y, shift, mask1); \ | |
221 | } while (0) | |
222 | ||
223 | /* A `twizzle', or a swizzle across two words. | |
224 | * | |
225 | * The @TWIZZLE_0@ macro exchanges the bits of @x@ and @y selected by | |
226 | * @mask@. The @TWIZZLE_L@ and @TWIZZLE_R@ macros exchange the bits selected | |
227 | * by @mask@ in @y@ with the bits in @x@ selected by @mask << shift@ or | |
228 | * @mask >> shift@ respectively. (The names are from the direction in which | |
229 | * @x@ is shifted, not the direction the mask is shifted.) | |
230 | * | |
231 | * These are used to synthesize swizzles within multiprecision words: if the | |
232 | * intended shift is %$a w + b$%, where %$w$% is the word width, then %$a$% | |
233 | * gives the difference between word indices of the words to be processed, | |
234 | * and %$\abs{b$}% gives the @shift@ argument; use @TWIZZLE_R@ if | |
235 | * %$b \ge 0$%, @TWIZZLE_L@ if %$b \le 0$% is nonpositive, or @TWIZZLE_0@ if | |
236 | * %$b = 0$%. (We can easily distinguish which of %$a w \pm b$% or | |
237 | * %$(a \pm 1) w \mp (w - b)$%, since one kind of shift will keep @mask@ | |
238 | * within the same word, and the other will shift it out completely.) | |
239 | */ | |
240 | #define TWIZZLE_0(x, y, mask) do { \ | |
241 | regty _t = ((y) ^ ((x)))&(mask); \ | |
242 | (x) ^= _t; (y) ^= _t; \ | |
243 | } while (0) | |
244 | #define TWIZZLE_L(x, y, shift, mask) do { \ | |
245 | regty _t = ((y) ^ ((x) << (shift)))&(mask); \ | |
246 | (x) ^= _t >> (shift); (y) ^= _t; \ | |
247 | } while (0) | |
248 | #define TWIZZLE_R(x, y, shift, mask) do { \ | |
249 | regty _t = ((y) ^ ((x) >> (shift)))&(mask); \ | |
250 | (x) ^= _t << (shift); (y) ^= _t; \ | |
251 | } while (0) | |
252 | ||
253 | /* @SWIZZLE_CPL@ applies a swizzle to @x@ which complements index bit @i@; | |
254 | * @SWIZZLE_EXCH@ applies a swizzle to exchange index bits @i@ and @j@; and | |
255 | * @SWIZZLE_XCPL@ applies a swizzle to exchange and invert index bits @i@ and | |
256 | * @j@. The arguments @i@ and @j@ must be decimal integer constants without | |
257 | * leading zeros, with %$i \le j$%. (The macros do nothing if %$i = j$%.) | |
258 | * | |
259 | * The variants with @2@ in their names act identically on @x@ and @y@, and | |
260 | * are intended as a simple convenience. | |
261 | */ | |
262 | #define SWIZZLE_CPL(x, i) SWIZZLE(x, (1 << (i)), IXMASK(i)) | |
263 | #define SWIZZLE_EXCH(x, i, j) \ | |
264 | SWIZZLE(x, (1 << (j)) - (1 << (i)), IXMASK_10(i, j)) | |
265 | #define SWIZZLE_XCPL(x, i, j) \ | |
266 | SWIZZLE(x, (1 << (j)) + (1 << (i)), IXMASK_00(i, j)) | |
267 | ||
268 | #define SWIZZLE_CPL2(x, y, i) \ | |
269 | SWIZZLE_2(x, y, (1 << (i)), IXMASK(i), IXMASK(i)) | |
270 | #define SWIZZLE_EXCH2(x, y, i, j) \ | |
271 | SWIZZLE_2(x, y, (1 << (j)) - (1 << (i)), \ | |
272 | IXMASK_10(i, j), IXMASK_10(i, j)) | |
273 | #define SWIZZLE_XCPL2(x, y, i, j) \ | |
274 | SWIZZLE_2(x, y, (1 << (j)) + (1 << (i)), \ | |
275 | IXMASK_00(i, j), IXMASK_00(i, j)) | |
276 | ||
277 | /* The @TWIZZLE_EXCH@ and @TWIZZLE_XCPL@ macros act like the similarly named | |
278 | * @SWIZZLE_...@ macros above, except that (a) they act on two words from a | |
279 | * multiprecision value, and the @j@ index is implicit in the selection of | |
280 | * the operands @x@ and @y@. If the word width is %$w$%, and %$2^j = n w$%, | |
281 | * then %$x$% should be chosen to be %$n$% slots more significant than | |
282 | * %$y$%. | |
283 | * | |
284 | * Note that there is no @TWIZZLE_CPL@, since this would simply involve | |
285 | * exchanging two entries in an array. | |
286 | */ | |
287 | #define TWIZZLE_EXCH(x, y, i) TWIZZLE_L(x, y, (1 << (i)), ~IXMASK(i)) | |
288 | #define TWIZZLE_XCPL(x, y, i) TWIZZLE_R(x, y, (1 << (i)), IXMASK(i)) | |
289 | ||
290 | /*----- That's all, folks -------------------------------------------------*/ | |
291 | ||
292 | #ifdef __cplusplus | |
293 | } | |
294 | #endif | |
295 | ||
296 | #endif |