| 1 | /* -*-c-*- |
| 2 | * |
| 3 | * Bit permutations |
| 4 | * |
| 5 | * (c) 2024 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of Catacomb. |
| 11 | * |
| 12 | * Catacomb is free software: you can redistribute it and/or modify it |
| 13 | * under the terms of the GNU Library General Public License as published |
| 14 | * by the Free Software Foundation; either version 2 of the License, or |
| 15 | * (at your option) any later version. |
| 16 | * |
| 17 | * Catacomb is distributed in the hope that it will be useful, but |
| 18 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 20 | * Library General Public License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU Library General Public |
| 23 | * License along with Catacomb. If not, write to the Free Software |
| 24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 25 | * USA. |
| 26 | */ |
| 27 | |
| 28 | #ifndef CATACOMB_PERMUTE_H |
| 29 | #define CATACOMB_PERMUTE_H |
| 30 | |
| 31 | #ifdef __cplusplus |
| 32 | extern "C" { |
| 33 | #endif |
| 34 | |
| 35 | /*----- Header files ------------------------------------------------------*/ |
| 36 | |
| 37 | #include <mLib/macros.h> |
| 38 | |
| 39 | /*----- Macros provided ---------------------------------------------------*/ |
| 40 | |
| 41 | /* --- Theory lesson --- * |
| 42 | * |
| 43 | * It's often useful to rearrange the bits in a word, or a value split across |
| 44 | * two (or more) words, so it's worth taking a moment to consider how this |
| 45 | * might be done efficiently. Throughout this discussion, we use the |
| 46 | * standard bit numbering, where the least significant bit in a word is bit |
| 47 | * zero, with numbering increasing with significance. Equivalently, bit |
| 48 | * %$k$% has the numerical value %$2^k$%. |
| 49 | * |
| 50 | * An essential primitive is the `swizzle', which exchanges two similarly |
| 51 | * arranged but disjoint groups of bits within a word which are separated by |
| 52 | * a constant offset. The groups of bits don't have to be contiguous, but |
| 53 | * they must be identified by shifts of the same mask. |
| 54 | * |
| 55 | * An especially important class of swizzle permutations considers the |
| 56 | * individual bits of bit indices. Permutations of the bits in a word can be |
| 57 | * interpreted as operations on the bits of the indices. For %$i \ge 0$%, |
| 58 | * let %$\mu_i$% be the mask such that bit %$k$% of %$\mu_i$% is set if and |
| 59 | * only if bit %$i$% is clear in %$k$%. Hence |
| 60 | * |
| 61 | * %$\mu_0 = (\ldots 01010101010101010101010101010101)_2 = -1/3$% , |
| 62 | * %$\mu_1 = (\ldots 00110011001100110011001100110011)_2 = -1/5$% , |
| 63 | * %$\mu_2 = (\ldots 00001111000011110000111100001111)_2 = -1/17$% , |
| 64 | * %$\mu_3 = (\ldots 00000000111111110000000011111111)_2 = -1/257$% , |
| 65 | * etc. |
| 66 | * |
| 67 | * In general, the low %$2^i$% bits of %$\mu_i$% are set, the next least |
| 68 | * significant %$2^i$% bits are clear, the next %$2^i$% bits are set, and so |
| 69 | * on. Hence, %$\mu_i \lsl 2^i = \bar{\mu}_i$%, or, in the %$2$%-adic |
| 70 | * numbers %$\Z_2$%, %$2^{2^i} \mu_k = -1 - \mu_i$%, whence, in general, |
| 71 | * %$\mu_i = -1/(2^{2^i} + 1)$%. Let %$x$% be some binary value; now we can |
| 72 | * describe some important swizzles. |
| 73 | * |
| 74 | * * Let %$y=(x\bitand\mu_i)\lsl 2^i \bitor (x\bitand\bar{\mu}_i)\lsr 2^i%, |
| 75 | * or %$y = (x\bitand\mu_i) \lsl 2^k \bitor (x \lsr 2^k)\bitand\mu_i$%. |
| 76 | * This exchanges the two sub-blocks of %$2^i$% bits in each %$2^{i+1}$% |
| 77 | * block in %$x$%. In terms of indices, now the bits at indices in which |
| 78 | * bit %$i$% is set precede those in which bit %$i$% is clear. |
| 79 | * %%\emph{We have inverted index bit %$i$%.}%% |
| 80 | * |
| 81 | * * Suppose that %$i < j$%, and let %$m = \bar{\mu}_i \bitand \mu_j$% and |
| 82 | * %$s = 2^j - 2^i$%; let %$y = (x \bitand m) \lsl s \bitor {}$% |
| 83 | * %$(x \lsr s)\bitand m \bitor (x\bitand \overline{m \bitor m \lsl s$%. |
| 84 | * Now, %$m$% has its bit %$k$% set if and only if bit %$i$% of %$k$% is |
| 85 | * set and bit %$j$% of %$k$% is clear. The related mask %$m \lsl s$% |
| 86 | * has bit %$k + s$% set if %$k% has the same property; but, %$k$% |
| 87 | * will have bit %$i$% set and bit %$j$% clear if and only if bit %$i$% |
| 88 | * is clear and bit %$j$% is set in %$k + s$%. Combined, the mask |
| 89 | * %$m \bitor (m \lsl s)$% selects bits at indices in which bits %$i$% |
| 90 | * and %$j$% differ, so %$\overline{m \bitor (m \lsl s)}$% selects the |
| 91 | * bits at indices where bits %$i$% and %$j$% are equal. |
| 92 | * |
| 93 | * This swizzle therefore exchanges the bits of %$x$% at indices where |
| 94 | * bit %$i$% is set and bit %$j$% is clear with those at indices where |
| 95 | * bit %$j$% is set and %$i$% is clear, leaving alone those bits at |
| 96 | * indices where bits %$i$% and %$j$% are either both clear or both set. |
| 97 | * %%\emph{We have exchanged index bits %$i$% and %$j$%.}%% |
| 98 | * |
| 99 | * * Rounding off this little collection, suppose again that %$i < j$%, and |
| 100 | * let %$m = \mu_i \bitand \mu_j$% and %$s = 2^i + 2^j$%; and again, let |
| 101 | * %$y = (x \bitand m) \lsl s \bitor (x \lsr s) \bitand m \bitor {}$% |
| 102 | * %$(x \bitand \overline{m \bitor m \lsl s$%. Now, %$m$% has its bit |
| 103 | * %$k$% set if and only if bits %$i$% and %$j$% of %$k$% are both clear. |
| 104 | * This swizzle therefore exchanges the bits of %$x$% at indices where |
| 105 | * bits %$i$% and %$j$% are both clear with those at indices where |
| 106 | * bits %$i$% and %$j$% are both set, leaving alone those bits at indices |
| 107 | * where bits %$i$% and %$j$% differ. It takes a little work to (left as |
| 108 | * an exercise) to see that the effect combines the previous two. |
| 109 | * %%\emph{We have exchanged and inverted index bits %$i$% and %$j$%.}%% |
| 110 | * |
| 111 | * Related is the `twizzle', which exchanges similarly arranged groups of |
| 112 | * bits within two different words. This can be seen as a multiprecision |
| 113 | * variant of the swizzle. |
| 114 | * |
| 115 | * Finally, we consider general permutations. These can be implemented using |
| 116 | * Beneš networks. Pick some index bit number %$i$%. By applying a swizzle |
| 117 | * with a shift by %$2^i$% to the inputs, and another to the outputs, we can |
| 118 | * reduce the problem to finding two independent permutations, one affecting |
| 119 | * bits whose index has bit %$i$% clear, and the other affecting bits whose |
| 120 | * index has bit %$i$% set. This doesn't sound so helpful, except that (a) |
| 121 | * the smaller permutations can each be implemented in the same way, and (b) |
| 122 | * they can be performed in parallel. Small Beneš networks can be |
| 123 | * constructed by hand, but computer assistance is useful for larger ones; |
| 124 | * there are some utilities in `utils/benes.lisp'. |
| 125 | * |
| 126 | * The machinery here expects some parameters to have been defined: |
| 127 | * |
| 128 | * * @regty@ should be an unsigned integer type, and |
| 129 | * |
| 130 | * * @REGWD@ should be a power of two such that @regty@ can store at least |
| 131 | * @REGWD@ bits. |
| 132 | */ |
| 133 | |
| 134 | /* We begin with some internal utilities. @CATACOMB__REPLICATE_n_(x)@ |
| 135 | * produces a hexadecimal constant consisting of %$n$% copies of the digits |
| 136 | * @x@. |
| 137 | */ |
| 138 | #define CATACOMB__REPLICATE_16_(x) CATACOMB__REPLICATE_8_(GLUE(x, x)) |
| 139 | #define CATACOMB__REPLICATE_8_(x) CATACOMB__REPLICATE_4_(GLUE(x, x)) |
| 140 | #define CATACOMB__REPLICATE_4_(x) CATACOMB__REPLICATE_2_(GLUE(x, x)) |
| 141 | #define CATACOMB__REPLICATE_2_(x) CATACOMB__REPLICATE_1_(GLUE(x, x)) |
| 142 | #define CATACOMB__REPLICATE_1_(x) GLUE(0x, x) |
| 143 | |
| 144 | /* More internal utilities. @CATACOMB__REPLi_Un(x)@ returns an %$n$%-bit |
| 145 | * hexadecimal constant formed by replicating the %$i$%-bit constant (which |
| 146 | * must have leading zeros) %$n/i$% times. |
| 147 | */ |
| 148 | #define CATACOMB__REPL8_U8 CATACOMB__REPLICATE_1_ |
| 149 | #define CATACOMB__REPL8_U16 CATACOMB__REPLICATE_2_ |
| 150 | #define CATACOMB__REPL8_U32 CATACOMB__REPLICATE_4_ |
| 151 | #define CATACOMB__REPL8_U64 CATACOMB__REPLICATE_8_ |
| 152 | #define CATACOMB__REPL8_U128 CATACOMB__REPLICATE_16_ |
| 153 | |
| 154 | #define CATACOMB__REPL16_U16 CATACOMB__REPLICATE_1_ |
| 155 | #define CATACOMB__REPL16_U32 CATACOMB__REPLICATE_2_ |
| 156 | #define CATACOMB__REPL16_U64 CATACOMB__REPLICATE_4_ |
| 157 | #define CATACOMB__REPL16_U128 CATACOMB__REPLICATE_8_ |
| 158 | |
| 159 | #define CATACOMB__REPL32_U32 CATACOMB__REPLICATE_1_ |
| 160 | #define CATACOMB__REPL32_U64 CATACOMB__REPLICATE_2_ |
| 161 | #define CATACOMB__REPL32_U128 CATACOMB__REPLICATE_4_ |
| 162 | |
| 163 | #define CATACOMB__REPL64_U64 CATACOMB__REPLICATE_1_ |
| 164 | #define CATACOMB__REPL64_U128 CATACOMB__REPLICATE_2_ |
| 165 | |
| 166 | #define CATACOMB__REPL128_U128 CATACOMB__REPLICATE_1_ |
| 167 | |
| 168 | /* Finally, @CATACOMB__REPLi(x)@ returns a hexadecimal constant formed by |
| 169 | * replicating the %$i$%-bit constant (including leading zeros) sufficiently |
| 170 | * many times as to fill a @REGWD@-bit wide register. |
| 171 | */ |
| 172 | #define CATACOMB__REPL8(x) GLUE(CATACOMB__REPL8_U, REGWD)(x) |
| 173 | #define CATACOMB__REPL16(x) GLUE(CATACOMB__REPL16_U, REGWD)(x) |
| 174 | #define CATACOMB__REPL32(x) GLUE(CATACOMB__REPL32_U, REGWD)(x) |
| 175 | #define CATACOMB__REPL64(x) GLUE(CATACOMB__REPL64_U, REGWD)(x) |
| 176 | #define CATACOMB__REPL128(x) GLUE(CATACOMB__REPL128_U, REGWD)(x) |
| 177 | |
| 178 | /* The macro @CATACOMB__IXMASK_Bi(_)@ evaluates to the low @REGWD@ bits of |
| 179 | * the constant %$\mu_i$% defined above. The argument is ignored; it's |
| 180 | * necessary to prevent technical problems with macro expansion |
| 181 | * (specifically, to allow the blue paint on @GLUE@ to be washed off before |
| 182 | * invoking @CATACOMB__REPLi@). |
| 183 | */ |
| 184 | #define CATACOMB__IXMASK_B0(_) CATACOMB__REPL8(55) |
| 185 | #define CATACOMB__IXMASK_B1(_) CATACOMB__REPL8(33) |
| 186 | #define CATACOMB__IXMASK_B2(_) CATACOMB__REPL8(0f) |
| 187 | #define CATACOMB__IXMASK_B3(_) CATACOMB__REPL16(00ff) |
| 188 | #define CATACOMB__IXMASK_B4(_) CATACOMB__REPL32(0000ffff) |
| 189 | #define CATACOMB__IXMASK_B5(_) CATACOMB__REPL64(00000000ffffffff) |
| 190 | #define CATACOMB__IXMASK_B6(_) \ |
| 191 | CATACOMB__REPL128(0000000000000000ffffffffffffffff) |
| 192 | |
| 193 | /* @IXMASK(i)@ returns the low @REGWD@ bits of %$\mu_i$%. The argument @i@ |
| 194 | * must be a decimal integer constant, without leading zeros. |
| 195 | */ |
| 196 | #define IXMASK(i) GLUE(CATACOMB__IXMASK_B, i)(hunoz) |
| 197 | |
| 198 | /* @IXMASK_xy(i, j)@ returns a @REGWD@-bit mask in which bit %$k$% is set if |
| 199 | * bit %$i$% of %$k$% is equal to %$x$% and bit %$j$% of %$k$% is equal to |
| 200 | * %$y$%. The arguments @i@ and @j@ must be decimal integer constants, |
| 201 | * without leading zeros. |
| 202 | */ |
| 203 | #define IXMASK_00(i, j) (IXMASK(i)&IXMASK(j)) |
| 204 | #define IXMASK_01(i, j) (IXMASK(i)&~IXMASK(j)) |
| 205 | #define IXMASK_10(i, j) (~IXMASK(i)&IXMASK(j)) |
| 206 | #define IXMASK_11(i, j) (~IXMASK(i)&~IXMASK(j)) |
| 207 | |
| 208 | /* The general swizzle operation. Exchange the bits in @x@ selected by |
| 209 | * @mask@ with those selected by @mask << shift@. |
| 210 | */ |
| 211 | #define SWIZZLE(x, shift, mask) do { \ |
| 212 | regty _t = ((x) ^ ((x) >> (shift)))&(mask); \ |
| 213 | (x) ^= _t | (_t << (shift)); \ |
| 214 | } while (0) |
| 215 | |
| 216 | /* A swizzle on two words @x@ and @y@, using the same shift, but different |
| 217 | * masks @mask0@ and @mask1@. This is just a simple abbreviation. |
| 218 | */ |
| 219 | #define SWIZZLE_2(x, y, shift, mask0, mask1) do { \ |
| 220 | SWIZZLE(x, shift, mask0); SWIZZLE(y, shift, mask1); \ |
| 221 | } while (0) |
| 222 | |
| 223 | /* A `twizzle', or a swizzle across two words. |
| 224 | * |
| 225 | * The @TWIZZLE_0@ macro exchanges the bits of @x@ and @y selected by |
| 226 | * @mask@. The @TWIZZLE_L@ and @TWIZZLE_R@ macros exchange the bits selected |
| 227 | * by @mask@ in @y@ with the bits in @x@ selected by @mask << shift@ or |
| 228 | * @mask >> shift@ respectively. (The names are from the direction in which |
| 229 | * @x@ is shifted, not the direction the mask is shifted.) |
| 230 | * |
| 231 | * These are used to synthesize swizzles within multiprecision words: if the |
| 232 | * intended shift is %$a w + b$%, where %$w$% is the word width, then %$a$% |
| 233 | * gives the difference between word indices of the words to be processed, |
| 234 | * and %$\abs{b$}% gives the @shift@ argument; use @TWIZZLE_R@ if |
| 235 | * %$b \ge 0$%, @TWIZZLE_L@ if %$b \le 0$% is nonpositive, or @TWIZZLE_0@ if |
| 236 | * %$b = 0$%. (We can easily distinguish which of %$a w \pm b$% or |
| 237 | * %$(a \pm 1) w \mp (w - b)$%, since one kind of shift will keep @mask@ |
| 238 | * within the same word, and the other will shift it out completely.) |
| 239 | */ |
| 240 | #define TWIZZLE_0(x, y, mask) do { \ |
| 241 | regty _t = ((y) ^ ((x)))&(mask); \ |
| 242 | (x) ^= _t; (y) ^= _t; \ |
| 243 | } while (0) |
| 244 | #define TWIZZLE_L(x, y, shift, mask) do { \ |
| 245 | regty _t = ((y) ^ ((x) << (shift)))&(mask); \ |
| 246 | (x) ^= _t >> (shift); (y) ^= _t; \ |
| 247 | } while (0) |
| 248 | #define TWIZZLE_R(x, y, shift, mask) do { \ |
| 249 | regty _t = ((y) ^ ((x) >> (shift)))&(mask); \ |
| 250 | (x) ^= _t << (shift); (y) ^= _t; \ |
| 251 | } while (0) |
| 252 | |
| 253 | /* @SWIZZLE_CPL@ applies a swizzle to @x@ which complements index bit @i@; |
| 254 | * @SWIZZLE_EXCH@ applies a swizzle to exchange index bits @i@ and @j@; and |
| 255 | * @SWIZZLE_XCPL@ applies a swizzle to exchange and invert index bits @i@ and |
| 256 | * @j@. The arguments @i@ and @j@ must be decimal integer constants without |
| 257 | * leading zeros, with %$i \le j$%. (The macros do nothing if %$i = j$%.) |
| 258 | * |
| 259 | * The variants with @2@ in their names act identically on @x@ and @y@, and |
| 260 | * are intended as a simple convenience. |
| 261 | */ |
| 262 | #define SWIZZLE_CPL(x, i) SWIZZLE(x, (1 << (i)), IXMASK(i)) |
| 263 | #define SWIZZLE_EXCH(x, i, j) \ |
| 264 | SWIZZLE(x, (1 << (j)) - (1 << (i)), IXMASK_10(i, j)) |
| 265 | #define SWIZZLE_XCPL(x, i, j) \ |
| 266 | SWIZZLE(x, (1 << (j)) + (1 << (i)), IXMASK_00(i, j)) |
| 267 | |
| 268 | #define SWIZZLE_CPL2(x, y, i) \ |
| 269 | SWIZZLE_2(x, y, (1 << (i)), IXMASK(i), IXMASK(i)) |
| 270 | #define SWIZZLE_EXCH2(x, y, i, j) \ |
| 271 | SWIZZLE_2(x, y, (1 << (j)) - (1 << (i)), \ |
| 272 | IXMASK_10(i, j), IXMASK_10(i, j)) |
| 273 | #define SWIZZLE_XCPL2(x, y, i, j) \ |
| 274 | SWIZZLE_2(x, y, (1 << (j)) + (1 << (i)), \ |
| 275 | IXMASK_00(i, j), IXMASK_00(i, j)) |
| 276 | |
| 277 | /* The @TWIZZLE_EXCH@ and @TWIZZLE_XCPL@ macros act like the similarly named |
| 278 | * @SWIZZLE_...@ macros above, except that (a) they act on two words from a |
| 279 | * multiprecision value, and the @j@ index is implicit in the selection of |
| 280 | * the operands @x@ and @y@. If the word width is %$w$%, and %$2^j = n w$%, |
| 281 | * then %$x$% should be chosen to be %$n$% slots more significant than |
| 282 | * %$y$%. |
| 283 | * |
| 284 | * Note that there is no @TWIZZLE_CPL@, since this would simply involve |
| 285 | * exchanging two entries in an array. |
| 286 | */ |
| 287 | #define TWIZZLE_EXCH(x, y, i) TWIZZLE_L(x, y, (1 << (i)), ~IXMASK(i)) |
| 288 | #define TWIZZLE_XCPL(x, y, i) TWIZZLE_R(x, y, (1 << (i)), IXMASK(i)) |
| 289 | |
| 290 | /*----- That's all, folks -------------------------------------------------*/ |
| 291 | |
| 292 | #ifdef __cplusplus |
| 293 | } |
| 294 | #endif |
| 295 | |
| 296 | #endif |