progs/perftest.c: Use from Glibc syscall numbers.
[catacomb] / base / permute.h
1 /* -*-c-*-
2 *
3 * Bit permutations
4 *
5 * (c) 2024 Straylight/Edgeware
6 */
7
8 /*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software: you can redistribute it and/or modify it
13 * under the terms of the GNU Library General Public License as published
14 * by the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28 #ifndef CATACOMB_PERMUTE_H
29 #define CATACOMB_PERMUTE_H
30
31 #ifdef __cplusplus
32 extern "C" {
33 #endif
34
35 /*----- Header files ------------------------------------------------------*/
36
37 #include <mLib/macros.h>
38
39 /*----- Macros provided ---------------------------------------------------*/
40
41 /* --- Theory lesson --- *
42 *
43 * It's often useful to rearrange the bits in a word, or a value split across
44 * two (or more) words, so it's worth taking a moment to consider how this
45 * might be done efficiently. Throughout this discussion, we use the
46 * standard bit numbering, where the least significant bit in a word is bit
47 * zero, with numbering increasing with significance. Equivalently, bit
48 * %$k$% has the numerical value %$2^k$%.
49 *
50 * An essential primitive is the `swizzle', which exchanges two similarly
51 * arranged but disjoint groups of bits within a word which are separated by
52 * a constant offset. The groups of bits don't have to be contiguous, but
53 * they must be identified by shifts of the same mask.
54 *
55 * An especially important class of swizzle permutations considers the
56 * individual bits of bit indices. Permutations of the bits in a word can be
57 * interpreted as operations on the bits of the indices. For %$i \ge 0$%,
58 * let %$\mu_i$% be the mask such that bit %$k$% of %$\mu_i$% is set if and
59 * only if bit %$i$% is clear in %$k$%. Hence
60 *
61 * %$\mu_0 = (\ldots 01010101010101010101010101010101)_2 = -1/3$% ,
62 * %$\mu_1 = (\ldots 00110011001100110011001100110011)_2 = -1/5$% ,
63 * %$\mu_2 = (\ldots 00001111000011110000111100001111)_2 = -1/17$% ,
64 * %$\mu_3 = (\ldots 00000000111111110000000011111111)_2 = -1/257$% ,
65 * etc.
66 *
67 * In general, the low %$2^i$% bits of %$\mu_i$% are set, the next least
68 * significant %$2^i$% bits are clear, the next %$2^i$% bits are set, and so
69 * on. Hence, %$\mu_i \lsl 2^i = \bar{\mu}_i$%, or, in the %$2$%-adic
70 * numbers %$\Z_2$%, %$2^{2^i} \mu_i = -1 - \mu_i$%, whence, in general,
71 * %$\mu_i = -1/(2^{2^i} + 1)$%. Let %$x$% be some binary value; now we can
72 * describe some important swizzles.
73 *
74 * * Let %$y=(x\bitand\mu_i)\lsl 2^i \bitor (x\bitand\bar{\mu}_i)\lsr 2^i$%,
75 * or %$y = (x\bitand\mu_i) \lsl 2^i \bitor (x \lsr 2^i)\bitand\mu_i$%.
76 * This exchanges the two sub-blocks of %$2^i$% bits in each %$2^{i+1}$%
77 * block in %$x$%. In terms of indices, now the bits at indices in which
78 * bit %$i$% is set precede those in which bit %$i$% is clear.
79 * %%\emph{We have inverted index bit %$i$%.}%%
80 *
81 * * Suppose that %$i < j$%, and let %$m = \bar{\mu}_i \bitand \mu_j$% and
82 * %$s = 2^j - 2^i$%; let %$y = (x \bitand m) \lsl s \bitor {}$%
83 * %$(x \lsr s)\bitand m \bitor (x\bitand \overline{m \bitor m \lsl s})$%.
84 * Now, %$m$% has its bit %$k$% set if and only if bit %$i$% of %$k$% is
85 * set and bit %$j$% of %$k$% is clear. The related mask %$m \lsl s$%
86 * has bit %$k + s$% set if %$k$% has the same property; but, %$k$%
87 * will have bit %$i$% set and bit %$j$% clear if and only if bit %$i$%
88 * is clear and bit %$j$% is set in %$k + s$%. Combined, the mask
89 * %$m \bitor (m \lsl s)$% selects bits at indices in which bits %$i$%
90 * and %$j$% differ, so %$\overline{m \bitor (m \lsl s)}$% selects the
91 * bits at indices where bits %$i$% and %$j$% are equal.
92 *
93 * This swizzle therefore exchanges the bits of %$x$% at indices where
94 * bit %$i$% is set and bit %$j$% is clear with those at indices where
95 * bit %$j$% is set and %$i$% is clear, leaving alone those bits at
96 * indices where bits %$i$% and %$j$% are either both clear or both set.
97 * %%\emph{We have exchanged index bits %$i$% and %$j$%.}%%
98 *
99 * * Rounding off this little collection, suppose again that %$i < j$%, and
100 * let %$m = \mu_i \bitand \mu_j$% and %$s = 2^i + 2^j$%; and again, let
101 * %$y = (x \bitand m) \lsl s \bitor (x \lsr s) \bitand m \bitor {}$%
102 * %$(x \bitand \overline{m \bitor m \lsl s})$%. Now, %$m$% has its bit
103 * %$k$% set if and only if bits %$i$% and %$j$% of %$k$% are both clear.
104 * This swizzle therefore exchanges the bits of %$x$% at indices where
105 * bits %$i$% and %$j$% are both clear with those at indices where
106 * bits %$i$% and %$j$% are both set, leaving alone those bits at indices
107 * where bits %$i$% and %$j$% differ. It takes a little work (left as
108 * an exercise) to see that the effect combines the previous two.
109 * %%\emph{We have exchanged and inverted index bits %$i$% and %$j$%.}%%
110 *
111 * Related is the `twizzle', which exchanges similarly arranged groups of
112 * bits within two different words. This can be seen as a multiprecision
113 * variant of the swizzle.
114 *
115 * Finally, we consider general permutations. These can be implemented using
116 * Beneš networks. Pick some index bit number %$i$%. By applying a swizzle
117 * with a shift by %$2^i$% to the inputs, and another to the outputs, we can
118 * reduce the problem to finding two independent permutations, one affecting
119 * bits whose index has bit %$i$% clear, and the other affecting bits whose
120 * index has bit %$i$% set. This doesn't sound so helpful, except that (a)
121 * the smaller permutations can each be implemented in the same way, and (b)
122 * they can be performed in parallel. Small Beneš networks can be
123 * constructed by hand, but computer assistance is useful for larger ones;
124 * there are some utilities in `utils/benes.lisp'.
125 *
126 * The machinery here expects some parameters to have been defined:
127 *
128 * * @regty@ should be an unsigned integer type, and
129 *
130 * * @REGWD@ should be a power of two such that @regty@ can store at least
131 * @REGWD@ bits.
132 */
133
/* We begin with some internal utilities.  @CATACOMB__REPLICATE_n_(x)@
 * produces a hexadecimal constant consisting of %$n$% copies of the digits
 * @x@.
 *
 * The argument @x@ must be a bare run of hexadecimal digits, with no `0x'
 * prefix and no suffix.  Each level joins its argument to itself with
 * @GLUE@ and passes the doubled digit string to the next level down; only
 * the bottom level, @CATACOMB__REPLICATE_1_@, attaches the `0x' prefix which
 * turns the digits into a constant.
 */
#define CATACOMB__REPLICATE_16_(x) CATACOMB__REPLICATE_8_(GLUE(x, x))
#define CATACOMB__REPLICATE_8_(x) CATACOMB__REPLICATE_4_(GLUE(x, x))
#define CATACOMB__REPLICATE_4_(x) CATACOMB__REPLICATE_2_(GLUE(x, x))
#define CATACOMB__REPLICATE_2_(x) CATACOMB__REPLICATE_1_(GLUE(x, x))
#define CATACOMB__REPLICATE_1_(x) GLUE(0x, x)
143
/* More internal utilities.  @CATACOMB__REPLi_Un(x)@ returns an %$n$%-bit
 * hexadecimal constant formed by replicating the %$i$%-bit constant @x@
 * %$n/i$% times.  The constant @x@ must be written out as bare hexadecimal
 * digits covering all %$i$% bits, i.e., including any leading zeros.
 *
 * Each name is simply an alias for the @CATACOMB__REPLICATE_k_@ macro with
 * %$k = n/i$%.  Only combinations with %$i \le n$% are defined.
 */
#define CATACOMB__REPL8_U8 CATACOMB__REPLICATE_1_
#define CATACOMB__REPL8_U16 CATACOMB__REPLICATE_2_
#define CATACOMB__REPL8_U32 CATACOMB__REPLICATE_4_
#define CATACOMB__REPL8_U64 CATACOMB__REPLICATE_8_
#define CATACOMB__REPL8_U128 CATACOMB__REPLICATE_16_

#define CATACOMB__REPL16_U16 CATACOMB__REPLICATE_1_
#define CATACOMB__REPL16_U32 CATACOMB__REPLICATE_2_
#define CATACOMB__REPL16_U64 CATACOMB__REPLICATE_4_
#define CATACOMB__REPL16_U128 CATACOMB__REPLICATE_8_

#define CATACOMB__REPL32_U32 CATACOMB__REPLICATE_1_
#define CATACOMB__REPL32_U64 CATACOMB__REPLICATE_2_
#define CATACOMB__REPL32_U128 CATACOMB__REPLICATE_4_

#define CATACOMB__REPL64_U64 CATACOMB__REPLICATE_1_
#define CATACOMB__REPL64_U128 CATACOMB__REPLICATE_2_

#define CATACOMB__REPL128_U128 CATACOMB__REPLICATE_1_
167
/* Finally, @CATACOMB__REPLi(x)@ returns a hexadecimal constant formed by
 * replicating the %$i$%-bit constant (including leading zeros) sufficiently
 * many times as to fill a @REGWD@-bit wide register.
 *
 * The value of @REGWD@ is joined onto @CATACOMB__REPLi_U@ to select the
 * right replication macro, so @REGWD@ must be a plain decimal constant for
 * which that macro exists: one of 8, 16, 32, 64, or 128, and no smaller
 * than %$i$%.
 */
#define CATACOMB__REPL8(x) GLUE(CATACOMB__REPL8_U, REGWD)(x)
#define CATACOMB__REPL16(x) GLUE(CATACOMB__REPL16_U, REGWD)(x)
#define CATACOMB__REPL32(x) GLUE(CATACOMB__REPL32_U, REGWD)(x)
#define CATACOMB__REPL64(x) GLUE(CATACOMB__REPL64_U, REGWD)(x)
#define CATACOMB__REPL128(x) GLUE(CATACOMB__REPL128_U, REGWD)(x)
177
/* The macro @CATACOMB__IXMASK_Bi(_)@ evaluates to the low @REGWD@ bits of
 * the constant %$\mu_i$% defined above.  The argument is ignored; it's
 * necessary to prevent technical problems with macro expansion
 * (specifically, to allow the blue paint on @GLUE@ to be washed off before
 * invoking @CATACOMB__REPLi@).
 *
 * For %$i < 3$%, the pattern repeats within a byte, so a single byte is
 * replicated.  For larger %$i$%, one whole period of %$2^{i+1}$% bits is
 * written out -- low half set, high half clear -- and replicated; this only
 * works if the period fits, i.e., if %$2^{i+1} \le {}$% @REGWD@.
 */
#define CATACOMB__IXMASK_B0(_) CATACOMB__REPL8(55)
#define CATACOMB__IXMASK_B1(_) CATACOMB__REPL8(33)
#define CATACOMB__IXMASK_B2(_) CATACOMB__REPL8(0f)
#define CATACOMB__IXMASK_B3(_) CATACOMB__REPL16(00ff)
#define CATACOMB__IXMASK_B4(_) CATACOMB__REPL32(0000ffff)
#define CATACOMB__IXMASK_B5(_) CATACOMB__REPL64(00000000ffffffff)
#define CATACOMB__IXMASK_B6(_)						\
  CATACOMB__REPL128(0000000000000000ffffffffffffffff)
192
/* @IXMASK(i)@ returns the low @REGWD@ bits of %$\mu_i$%, i.e., the mask in
 * which bit %$k$% is set if and only if bit %$i$% of %$k$% is clear.  The
 * argument @i@ must be a decimal integer constant, without leading zeros,
 * because it is joined onto @CATACOMB__IXMASK_B@ to form a macro name.  The
 * @hunoz@ argument is a dummy which the selected macro ignores.
 */
#define IXMASK(i) GLUE(CATACOMB__IXMASK_B, i)(hunoz)
197
/* @IXMASK_xy(i, j)@ returns a @REGWD@-bit mask in which bit %$k$% is set if
 * bit %$i$% of %$k$% is equal to %$x$% and bit %$j$% of %$k$% is equal to
 * %$y$%.  The arguments @i@ and @j@ must be decimal integer constants,
 * without leading zeros.
 *
 * Since @IXMASK(i)@ selects the positions whose index has bit %$i$% clear,
 * a `0' in the name uses the mask as it is and a `1' uses its complement.
 */
#define IXMASK_00(i, j) (IXMASK(i)&IXMASK(j))
#define IXMASK_01(i, j) (IXMASK(i)&~IXMASK(j))
#define IXMASK_10(i, j) (~IXMASK(i)&IXMASK(j))
#define IXMASK_11(i, j) (~IXMASK(i)&~IXMASK(j))
207
/* The general swizzle operation.  Exchange the bits in @x@ selected by
 * @mask@ with those selected by @mask << shift@.
 *
 * The argument @x@ must be a modifiable lvalue, and is evaluated more than
 * once.  The type @regty@ must be in scope where the macro is used.  The
 * two groups of bits must not overlap, i.e., @mask@ and @mask << shift@
 * must be disjoint.
 *
 * The temporary collects, at each position selected by @mask@, the XOR of
 * the two bits to be exchanged; XORing it back into both positions swaps
 * them.
 */
#define SWIZZLE(x, shift, mask) do {					\
  regty _t = ((x) ^ ((x) >> (shift)))&(mask);				\
  (x) ^= _t | (_t << (shift));						\
} while (0)

/* A swizzle on two words @x@ and @y@, using the same shift, but different
 * masks @mask0@ and @mask1@.  This is just a simple abbreviation: @x@ is
 * swizzled using @mask0@, and then @y@ using @mask1@.
 */
#define SWIZZLE_2(x, y, shift, mask0, mask1) do {			\
  SWIZZLE(x, shift, mask0); SWIZZLE(y, shift, mask1);			\
} while (0)
222
/* A `twizzle', or a swizzle across two words.
 *
 * The @TWIZZLE_0@ macro exchanges the bits of @x@ and @y@ selected by
 * @mask@.  The @TWIZZLE_L@ and @TWIZZLE_R@ macros exchange the bits selected
 * by @mask@ in @y@ with the bits in @x@ selected by @mask >> shift@ or
 * @mask << shift@ respectively.  (The names are from the direction in which
 * @x@ is shifted, not the direction the mask is shifted.)
 *
 * These are used to synthesize swizzles within multiprecision words: if the
 * intended shift is %$a w + b$%, where %$w$% is the word width, then %$a$%
 * gives the difference between word indices of the words to be processed,
 * and %$\abs{b}$% gives the @shift@ argument; use @TWIZZLE_R@ if
 * %$b > 0$%, @TWIZZLE_L@ if %$b < 0$%, or @TWIZZLE_0@ if %$b = 0$%.  (We
 * can easily distinguish which of %$a w \pm b$% or
 * %$(a \pm 1) w \mp (w - b)$% is wanted, since one kind of shift will keep
 * @mask@ within the same word, and the other will shift it out completely.)
 *
 * Both @x@ and @y@ must be modifiable lvalues, and are evaluated more than
 * once.  The type @regty@ must be in scope where the macros are used.  The
 * shifted mask must still fit within the word, or the bits shifted out are
 * lost rather than exchanged.
 */
#define TWIZZLE_0(x, y, mask) do {					\
  regty _t = ((y) ^ ((x)))&(mask);					\
  (x) ^= _t; (y) ^= _t;							\
} while (0)
#define TWIZZLE_L(x, y, shift, mask) do {				\
  regty _t = ((y) ^ ((x) << (shift)))&(mask);				\
  (x) ^= _t >> (shift); (y) ^= _t;					\
} while (0)
#define TWIZZLE_R(x, y, shift, mask) do {				\
  regty _t = ((y) ^ ((x) >> (shift)))&(mask);				\
  (x) ^= _t << (shift); (y) ^= _t;					\
} while (0)
252
/* @SWIZZLE_CPL@ applies a swizzle to @x@ which complements index bit @i@;
 * @SWIZZLE_EXCH@ applies a swizzle to exchange index bits @i@ and @j@; and
 * @SWIZZLE_XCPL@ applies a swizzle to exchange and invert index bits @i@ and
 * @j@.  The arguments @i@ and @j@ must be decimal integer constants without
 * leading zeros (they are used to form macro names), with %$i < j$%.
 * (@SWIZZLE_EXCH@ happens to do nothing if %$i = j$%, since its mask is then
 * empty; @SWIZZLE_XCPL@ is not meaningful in that case, because its mask
 * overlaps the shifted mask.)
 *
 * In terms of the general @SWIZZLE@ operation:
 *
 *   * @SWIZZLE_CPL@ shifts by %$2^i$%, with mask %$\mu_i$%;
 *
 *   * @SWIZZLE_EXCH@ shifts by %$2^j - 2^i$%, with the mask selecting
 *     indices in which bit %$i$% is set and bit %$j$% is clear; and
 *
 *   * @SWIZZLE_XCPL@ shifts by %$2^j + 2^i$%, with the mask selecting
 *     indices in which bits %$i$% and %$j$% are both clear.
 *
 * The variants with @2@ in their names act identically on @x@ and @y@, and
 * are intended as a simple convenience.
 */
#define SWIZZLE_CPL(x, i) SWIZZLE(x, (1 << (i)), IXMASK(i))
#define SWIZZLE_EXCH(x, i, j)						\
  SWIZZLE(x, (1 << (j)) - (1 << (i)), IXMASK_10(i, j))
#define SWIZZLE_XCPL(x, i, j)						\
  SWIZZLE(x, (1 << (j)) + (1 << (i)), IXMASK_00(i, j))

#define SWIZZLE_CPL2(x, y, i)						\
  SWIZZLE_2(x, y, (1 << (i)), IXMASK(i), IXMASK(i))
#define SWIZZLE_EXCH2(x, y, i, j)					\
  SWIZZLE_2(x, y, (1 << (j)) - (1 << (i)),				\
	    IXMASK_10(i, j), IXMASK_10(i, j))
#define SWIZZLE_XCPL2(x, y, i, j)					\
  SWIZZLE_2(x, y, (1 << (j)) + (1 << (i)),				\
	    IXMASK_00(i, j), IXMASK_00(i, j))
276
/* The @TWIZZLE_EXCH@ and @TWIZZLE_XCPL@ macros act like the similarly named
 * @SWIZZLE_...@ macros above, except that (a) they act on two words from a
 * multiprecision value, and (b) the @j@ index is implicit in the selection
 * of the operands @x@ and @y@.  If the word width is %$w$%, and
 * %$2^j = n w$%, then @x@ should be chosen to be %$n$% slots more
 * significant than @y@.
 *
 * Hence @TWIZZLE_EXCH@ exchanges the bits of @y@ at indices with bit %$i$%
 * set with the bits of @x@ at indices with bit %$i$% clear, and
 * @TWIZZLE_XCPL@ exchanges the bits of @y@ at indices with bit %$i$% clear
 * with the bits of @x@ at indices with bit %$i$% set.  As for @IXMASK@, the
 * argument @i@ must be a decimal integer constant without leading zeros.
 *
 * Note that there is no @TWIZZLE_CPL@, since this would simply involve
 * exchanging two entries in an array.
 */
#define TWIZZLE_EXCH(x, y, i) TWIZZLE_L(x, y, (1 << (i)), ~IXMASK(i))
#define TWIZZLE_XCPL(x, y, i) TWIZZLE_R(x, y, (1 << (i)), IXMASK(i))
289
290 /*----- That's all, folks -------------------------------------------------*/
291
292 #ifdef __cplusplus
293 }
294 #endif
295
296 #endif