X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/ac4a43c1da1d1554247212289cf483934b9e6d1c..d3f33b9a37f45bc5809af314b1b436b712d59a80:/base/permute.h diff --git a/base/permute.h b/base/permute.h new file mode 100644 index 00000000..ef8eb169 --- /dev/null +++ b/base/permute.h @@ -0,0 +1,285 @@ +/* -*-c-*- + * + * Bit permutations + * + * (c) 2024 Straylight/Edgeware + */ + +/*----- Licensing notice --------------------------------------------------* + * + * This file is part of Catacomb. + * + * Catacomb is free software: you can redistribute it and/or modify it + * under the terms of the GNU Library General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Catacomb is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with Catacomb. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + */ + +#ifndef CATACOMB_PERMUTE_H +#define CATACOMB_PERMUTE_H + +#ifdef __cplusplus + extern "C" { +#endif + +/*----- Header files ------------------------------------------------------*/ + +#include + +/*----- Macros provided ---------------------------------------------------*/ + +/* --- Theory lesson --- * + * + * It's often useful to rearrange the bits in a word, or a value split across + * two (or more) words, so it's worth taking a moment to consider how this + * might be done efficiently. Throughout this discussion, we use the + * standard bit numbering, where the least significant bit in a word is bit + * zero, with numbering increasing with significance. Equivalently, bit + * %$k$% has the numerical value %$2^k$%. + * + * An essential primitive is the `swizzle', which exchanges two similarly + * arranged but disjoint groups of bits within a word which are separated by + * a constant offset. The groups of bits don't have to be contiguous, but + * they must be identified by shifts of the same mask. + * + * An especially important class of swizzle permutations considers the + * individual bits of bit indices. Permutations of the bits in a word can be + * interpreted as operations on the bits of the indices. For %$i \ge 0$%, + * let %$\mu_i$% be the mask such that bit %$k$% of %$\mu_i$% is set if and + * only if bit %$i$% is clear in %$k$%. Hence + * + * %$\mu_0 = (\ldots 01010101010101010101010101010101)_2 = -1/3$% , + * %$\mu_1 = (\ldots 00110011001100110011001100110011)_2 = -1/5$% , + * %$\mu_2 = (\ldots 00001111000011110000111100001111)_2 = -1/17$% , + * %$\mu_3 = (\ldots 00000000111111110000000011111111)_2 = -1/257$% , + * etc. + * + * In general, the low %$2^i$% bits of %$\mu_i$% are set, the next least + * significant %$2^i$% bits are clear, the next %$2^i$% bits are set, and so + * on. Hence, %$\mu_i \lsl 2^i = \bar{\mu}_i$%, or, in the %$2$%-adic + * numbers %$\Z_2$%, %$2^{2^i} \mu_k = -1 - \mu_i$%, whence, in general, + * %$\mu_i = -1/(2^{2^i} + 1)$%. Let %$x$% be some binary value; now we can + * describe some important swizzles. + * + * * Let %$y=(x\bitand\mu_i)\lsl 2^i \bitor (x\bitand\bar{\mu}_i)\lsr 2^i%, + * or %$y = (x\bitand\mu_i) \lsl 2^k \bitor (x \lsr 2^k)\bitand\mu_i$%. + * This exchanges the two sub-blocks of %$2^i$% bits in each %$2^{i+1}$% + * block in %$x$%. In terms of indices, now the bits at indices in which + * bit %$i$% is set precede those in which bit %$i$% is clear. + * %%\emph{We have inverted index bit %$i$%.}%% + * + * * Suppose that %$i < j$%, and let %$m = \bar{\mu}_i \bitand \mu_j$% and + * %$s = 2^j - 2^i$%; let %$y = (x \bitand m) \lsl s \bitor {}$% + * %$(x \lsr s)\bitand m \bitor (x\bitand \overline{m \bitor m \lsl s$%. + * Now, %$m$% has its bit %$k$% set if and only if bit %$i$% of %$k$% is + * set and bit %$j$% of %$k$% is clear. The related mask %$m \lsl s$% + * has bit %$k + s$% set if %$k% has the same property; but, %$k$% + * will have bit %$i$% set and bit %$j$% clear if and only if bit %$i$% + * is clear and bit %$j$% is set in %$k + s$%. Combined, the mask + * %$m \bitor (m \lsl s)$% selects bits at indices in which bits %$i$% + * and %$j$% differ, so %$\overline{m \bitor (m \lsl s)}$% selects the + * bits at indices where bits %$i$% and %$j$% are equal. + * + * This swizzle therefore exchanges the bits of %$x$% at indices where + * bit %$i$% is set and bit %$j$% is clear with those at indices where + * bit %$j$% is set and %$i$% is clear, leaving alone those bits at + * indices where bits %$i$% and %$j$% are either both clear or both set. + * %%\emph{We have exchanged index bits %$i$% and %$j$%.}%% + * + * * Rounding off this little collection, suppose again that %$i < j$%, and + * let %$m = \mu_i \bitand \mu_j$% and %$s = 2^i + 2^j$%; and again, let + * %$y = (x \bitand m) \lsl s \bitor (x \lsr s) \bitand m \bitor {}$% + * %$(x \bitand \overline{m \bitor m \lsl s$%. Now, %$m$% has its bit + * %$k$% set if and only if bits %$i$% and %$j$% of %$k$% are both clear. + * This swizzle therefore exchanges the bits of %$x$% at indices where + * bits %$i$% and %$j$% are both clear with those at indices where + * bits %$i$% and %$j$% are both set, leaving alone those bits at indices + * where bits %$i$% and %$j$% differ. It takes a little work to (left as + * an exercise) to see that the effect combines the previous two. + * %%\emph{We have exchanged and inverted index bits %$i$% and %$j$%.}%% + * + * Related is the `twizzle', which exchanges similarly arranged groups of + * bits within two different words. This can be seen as a multiprecision + * variant of the swizzle. + * + * The machinery here expects some parameters to have been defined: + * + * * @regty@ should be an unsigned integer type, and + * + * * @REGWD@ should be a power of two such that @regty@ can store at least + * @REGWD@ bits. + */ + +/* We begin with some internal utilities. @CATACOMB__REPLICATE_n_(x)@ + * produces a hexadecimal constant consisting of %$n$% copies of the digits + * @x@. + */ +#define CATACOMB__REPLICATE_16_(x) CATACOMB__REPLICATE_8_(GLUE(x, x)) +#define CATACOMB__REPLICATE_8_(x) CATACOMB__REPLICATE_4_(GLUE(x, x)) +#define CATACOMB__REPLICATE_4_(x) CATACOMB__REPLICATE_2_(GLUE(x, x)) +#define CATACOMB__REPLICATE_2_(x) CATACOMB__REPLICATE_1_(GLUE(x, x)) +#define CATACOMB__REPLICATE_1_(x) GLUE(0x, x) + +/* More internal utilities. @CATACOMB__REPLi_Un(x)@ returns an %$n$%-bit + * hexadecimal constant formed by replicating the %$i$%-bit constant (which + * must have leading zeros) %$n/i$% times. + */ +#define CATACOMB__REPL8_U8 CATACOMB__REPLICATE_1_ +#define CATACOMB__REPL8_U16 CATACOMB__REPLICATE_2_ +#define CATACOMB__REPL8_U32 CATACOMB__REPLICATE_4_ +#define CATACOMB__REPL8_U64 CATACOMB__REPLICATE_8_ +#define CATACOMB__REPL8_U128 CATACOMB__REPLICATE_16_ + +#define CATACOMB__REPL16_U16 CATACOMB__REPLICATE_1_ +#define CATACOMB__REPL16_U32 CATACOMB__REPLICATE_2_ +#define CATACOMB__REPL16_U64 CATACOMB__REPLICATE_4_ +#define CATACOMB__REPL16_U128 CATACOMB__REPLICATE_8_ + +#define CATACOMB__REPL32_U32 CATACOMB__REPLICATE_1_ +#define CATACOMB__REPL32_U64 CATACOMB__REPLICATE_2_ +#define CATACOMB__REPL32_U128 CATACOMB__REPLICATE_4_ + +#define CATACOMB__REPL64_U64 CATACOMB__REPLICATE_1_ +#define CATACOMB__REPL64_U128 CATACOMB__REPLICATE_2_ + +#define CATACOMB__REPL128_U128 CATACOMB__REPLICATE_1_ + +/* Finally, @CATACOMB__REPLi(x)@ returns a hexadecimal constant formed by + * replicating the %$i$%-bit constant (including leading zeros) sufficiently + * many times as to fill a @REGWD@-bit wide register. + */ +#define CATACOMB__REPL8(x) GLUE(CATACOMB__REPL8_U, REGWD)(x) +#define CATACOMB__REPL16(x) GLUE(CATACOMB__REPL16_U, REGWD)(x) +#define CATACOMB__REPL32(x) GLUE(CATACOMB__REPL32_U, REGWD)(x) +#define CATACOMB__REPL64(x) GLUE(CATACOMB__REPL64_U, REGWD)(x) +#define CATACOMB__REPL128(x) GLUE(CATACOMB__REPL128_U, REGWD)(x) + +/* The macro @CATACOMB__IXMASK_Bi(_)@ evaluates to the low @REGWD@ bits of + * the constant %$\mu_i$% defined above. The argument is ignored; it's + * necessary to prevent technical problems with macro expansion + * (specifically, to allow the blue paint on @GLUE@ to be washed off before + * invoking @CATACOMB__REPLi@). + */ +#define CATACOMB__IXMASK_B0(_) CATACOMB__REPL8(55) +#define CATACOMB__IXMASK_B1(_) CATACOMB__REPL8(33) +#define CATACOMB__IXMASK_B2(_) CATACOMB__REPL8(0f) +#define CATACOMB__IXMASK_B3(_) CATACOMB__REPL16(00ff) +#define CATACOMB__IXMASK_B4(_) CATACOMB__REPL32(0000ffff) +#define CATACOMB__IXMASK_B5(_) CATACOMB__REPL64(00000000ffffffff) +#define CATACOMB__IXMASK_B6(_) \ + CATACOMB__REPL128(0000000000000000ffffffffffffffff) + +/* @IXMASK(i)@ returns the low @REGWD@ bits of %$\mu_i$%. The argument @i@ + * must be a decimal integer constant, without leading zeros. + */ +#define IXMASK(i) GLUE(CATACOMB__IXMASK_B, i)(hunoz) + +/* @IXMASK_xy(i, j)@ returns a @REGWD@-bit mask in which bit %$k$% is set if + * bit %$i$% of %$k$% is equal to %$x$% and bit %$j$% of %$k$% is equal to + * %$y$%. The arguments @i@ and @j@ must be decimal integer constants, + * without leading zeros. + */ +#define IXMASK_00(i, j) (IXMASK(i)&IXMASK(j)) +#define IXMASK_01(i, j) (IXMASK(i)&~IXMASK(j)) +#define IXMASK_10(i, j) (~IXMASK(i)&IXMASK(j)) +#define IXMASK_11(i, j) (~IXMASK(i)&~IXMASK(j)) + +/* The general swizzle operation. Exchange the bits in @x@ selected by + * @mask@ with those selected by @mask << shift@. + */ +#define SWIZZLE(x, shift, mask) do { \ + regty _t = ((x) ^ ((x) >> (shift)))&(mask); \ + (x) ^= _t | (_t << (shift)); \ +} while (0) + +/* A swizzle on two words @x@ and @y@, using the same shift, but different + * masks @mask0@ and @mask1@. This is just a simple abbreviation. + */ +#define SWIZZLE_2(x, y, shift, mask0, mask1) do { \ + SWIZZLE(x, shift, mask0); SWIZZLE(y, shift, mask1); \ +} while (0) + +/* A `twizzle', or a swizzle across two words. + * + * The @TWIZZLE_0@ macro exchanges the bits of @x@ and @y selected by + * @mask@. The @TWIZZLE_L@ and @TWIZZLE_R@ macros exchange the bits selected + * by @mask@ in @y@ with the bits in @x@ selected by @mask << shift@ or + * @mask >> shift@ respectively. (The names are from the direction in which + * @x@ is shifted, not the direction the mask is shifted.) + * + * These are used to synthesize swizzles within multiprecision words: if the + * intended shift is %$a w + b$%, where %$w$% is the word width, then %$a$% + * gives the difference between word indices of the words to be processed, + * and %$\abs{b$}% gives the @shift@ argument; use @TWIZZLE_R@ if + * %$b \ge 0$%, @TWIZZLE_L@ if %$b \le 0$% is nonpositive, or @TWIZZLE_0@ if + * %$b = 0$%. (We can easily distinguish which of %$a w \pm b$% or + * %$(a \pm 1) w \mp (w - b)$%, since one kind of shift will keep @mask@ + * within the same word, and the other will shift it out completely.) + */ +#define TWIZZLE_0(x, y, mask) do { \ + regty _t = ((y) ^ ((x)))&(mask); \ + (x) ^= _t; (y) ^= _t; \ +} while (0) +#define TWIZZLE_L(x, y, shift, mask) do { \ + regty _t = ((y) ^ ((x) << (shift)))&(mask); \ + (x) ^= _t >> (shift); (y) ^= _t; \ +} while (0) +#define TWIZZLE_R(x, y, shift, mask) do { \ + regty _t = ((y) ^ ((x) >> (shift)))&(mask); \ + (x) ^= _t << (shift); (y) ^= _t; \ +} while (0) + +/* @SWIZZLE_CPL@ applies a swizzle to @x@ which complements index bit @i@; + * @SWIZZLE_EXCH@ applies a swizzle to exchange index bits @i@ and @j@; and + * @SWIZZLE_XCPL@ applies a swizzle to exchange and invert index bits @i@ and + * @j@. The arguments @i@ and @j@ must be decimal integer constants without + * leading zeros, with %$i \le j$%. (The macros do nothing if %$i = j$%.) + * + * The variants with @2@ in their names act identically on @x@ and @y@, and + * are intended as a simple convenience. + */ +#define SWIZZLE_CPL(x, i) SWIZZLE(x, (1 << (i)), IXMASK(i)) +#define SWIZZLE_EXCH(x, i, j) \ + SWIZZLE(x, (1 << (j)) - (1 << (i)), IXMASK_10(i, j)) +#define SWIZZLE_XCPL(x, i, j) \ + SWIZZLE(x, (1 << (j)) + (1 << (i)), IXMASK_00(i, j)) + +#define SWIZZLE_CPL2(x, y, i) \ + SWIZZLE_2(x, y, (1 << (i)), IXMASK(i), IXMASK(i)) +#define SWIZZLE_EXCH2(x, y, i, j) \ + SWIZZLE_2(x, y, (1 << (j)) - (1 << (i)), \ + IXMASK_10(i, j), IXMASK_10(i, j)) +#define SWIZZLE_XCPL2(x, y, i, j) \ + SWIZZLE_2(x, y, (1 << (j)) + (1 << (i)), \ + IXMASK_00(i, j), IXMASK_00(i, j)) + +/* The @TWIZZLE_EXCH@ and @TWIZZLE_XCPL@ macros act like the similarly named + * @SWIZZLE_...@ macros above, except that (a) they act on two words from a + * multiprecision value, and the @j@ index is implicit in the selection of + * the operands @x@ and @y@. If the word width is %$w$%, and %$2^j = n w$%, + * then %$x$% should be chosen to be %$n$% slots more significant than + * %$y$%. + * + * Note that there is no @TWIZZLE_CPL@, since this would simply involve + * exchanging two entries in an array. + */ +#define TWIZZLE_EXCH(x, y, i) TWIZZLE_L(x, y, (1 << (i)), ~IXMASK(i)) +#define TWIZZLE_XCPL(x, y, i) TWIZZLE_R(x, y, (1 << (i)), IXMASK(i)) + +/*----- That's all, folks -------------------------------------------------*/ + +#ifdef __cplusplus + } +#endif + +#endif