mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* Bit permutations
	4	*
	5	* (c) 2024 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software: you can redistribute it and/or modify it
	13	* under the terms of the GNU Library General Public License as published
	14	* by the Free Software Foundation; either version 2 of the License, or
	15	* (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful, but
	18	* WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	* Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	#ifndef CATACOMB_PERMUTE_H
	29	#define CATACOMB_PERMUTE_H
	30
	31	#ifdef __cplusplus
	32	extern "C" {
	33	#endif
	34
	35	/----- Header files ------------------------------------------------------/
	36
	37	#include <mLib/macros.h>
	38
	39	/----- Macros provided ---------------------------------------------------/
	40
	41	/* --- Theory lesson --- *
	42	*
	43	* It's often useful to rearrange the bits in a word, or a value split across
	44	* two (or more) words, so it's worth taking a moment to consider how this
	45	* might be done efficiently. Throughout this discussion, we use the
	46	* standard bit numbering, where the least significant bit in a word is bit
	47	* zero, with numbering increasing with significance. Equivalently, bit
	48	* %$k$% has the numerical value %$2^k$%.
	49	*
	50	* An essential primitive is the `swizzle', which exchanges two similarly
	51	* arranged but disjoint groups of bits within a word which are separated by
	52	* a constant offset. The groups of bits don't have to be contiguous, but
	53	* they must be identified by shifts of the same mask.
	54	*
	55	* An especially important class of swizzle permutations considers the
	56	* individual bits of bit indices. Permutations of the bits in a word can be
	57	* interpreted as operations on the bits of the indices. For %$i \ge 0$%,
	58	* let %$\mu_i$% be the mask such that bit %$k$% of %$\mu_i$% is set if and
	59	* only if bit %$i$% is clear in %$k$%. Hence
	60	*
	61	* %$\mu_0 = (\ldots 01010101010101010101010101010101)_2 = -1/3$% ,
	62	* %$\mu_1 = (\ldots 00110011001100110011001100110011)_2 = -1/5$% ,
	63	* %$\mu_2 = (\ldots 00001111000011110000111100001111)_2 = -1/17$% ,
	64	* %$\mu_3 = (\ldots 00000000111111110000000011111111)_2 = -1/257$% ,
	65	* etc.
	66	*
	67	* In general, the low %$2^i$% bits of %$\mu_i$% are set, the next least
	68	* significant %$2^i$% bits are clear, the next %$2^i$% bits are set, and so
	69	* on. Hence, %$\mu_i \lsl 2^i = \bar{\mu}_i$%, or, in the %$2$%-adic
	70	* numbers %$\Z_2$%, %$2^{2^i} \mu_k = -1 - \mu_i$%, whence, in general,
	71	* %$\mu_i = -1/(2^{2^i} + 1)$%. Let %$x$% be some binary value; now we can
	72	* describe some important swizzles.
	73	*
	74	* * Let %$y=(x\bitand\mu_i)\lsl 2^i \bitor (x\bitand\bar{\mu}_i)\lsr 2^i%,
	75	* or %$y = (x\bitand\mu_i) \lsl 2^k \bitor (x \lsr 2^k)\bitand\mu_i$%.
	76	* This exchanges the two sub-blocks of %$2^i$% bits in each %$2^{i+1}$%
	77	* block in %$x$%. In terms of indices, now the bits at indices in which
	78	* bit %$i$% is set precede those in which bit %$i$% is clear.
	79	* %%\emph{We have inverted index bit %$i$%.}%%
	80	*
	81	* * Suppose that %$i < j$%, and let %$m = \bar{\mu}_i \bitand \mu_j$% and
	82	* %$s = 2^j - 2^i$%; let %$y = (x \bitand m) \lsl s \bitor {}$%
	83	* %$(x \lsr s)\bitand m \bitor (x\bitand \overline{m \bitor m \lsl s$%.
	84	* Now, %$m$% has its bit %$k$% set if and only if bit %$i$% of %$k$% is
	85	* set and bit %$j$% of %$k$% is clear. The related mask %$m \lsl s$%
	86	* has bit %$k + s$% set if %$k% has the same property; but, %$k$%
	87	* will have bit %$i$% set and bit %$j$% clear if and only if bit %$i$%
	88	* is clear and bit %$j$% is set in %$k + s$%. Combined, the mask
	89	* %$m \bitor (m \lsl s)$% selects bits at indices in which bits %$i$%
	90	* and %$j$% differ, so %$\overline{m \bitor (m \lsl s)}$% selects the
	91	* bits at indices where bits %$i$% and %$j$% are equal.
	92	*
	93	* This swizzle therefore exchanges the bits of %$x$% at indices where
	94	* bit %$i$% is set and bit %$j$% is clear with those at indices where
	95	* bit %$j$% is set and %$i$% is clear, leaving alone those bits at
	96	* indices where bits %$i$% and %$j$% are either both clear or both set.
	97	* %%\emph{We have exchanged index bits %$i$% and %$j$%.}%%
	98	*
	99	* * Rounding off this little collection, suppose again that %$i < j$%, and
	100	* let %$m = \mu_i \bitand \mu_j$% and %$s = 2^i + 2^j$%; and again, let
	101	* %$y = (x \bitand m) \lsl s \bitor (x \lsr s) \bitand m \bitor {}$%
	102	* %$(x \bitand \overline{m \bitor m \lsl s$%. Now, %$m$% has its bit
	103	* %$k$% set if and only if bits %$i$% and %$j$% of %$k$% are both clear.
	104	* This swizzle therefore exchanges the bits of %$x$% at indices where
	105	* bits %$i$% and %$j$% are both clear with those at indices where
	106	* bits %$i$% and %$j$% are both set, leaving alone those bits at indices
	107	* where bits %$i$% and %$j$% differ. It takes a little work to (left as
	108	* an exercise) to see that the effect combines the previous two.
	109	* %%\emph{We have exchanged and inverted index bits %$i$% and %$j$%.}%%
	110	*
	111	* Related is the `twizzle', which exchanges similarly arranged groups of
	112	* bits within two different words. This can be seen as a multiprecision
	113	* variant of the swizzle.
	114	*
	115	* Finally, we consider general permutations. These can be implemented using
	116	* Beneš networks. Pick some index bit number %$i$%. By applying a swizzle
	117	* with a shift by %$2^i$% to the inputs, and another to the outputs, we can
	118	* reduce the problem to finding two independent permutations, one affecting
	119	* bits whose index has bit %$i$% clear, and the other affecting bits whose
	120	* index has bit %$i$% set. This doesn't sound so helpful, except that (a)
	121	* the smaller permutations can each be implemented in the same way, and (b)
	122	* they can be performed in parallel. Small Beneš networks can be
	123	* constructed by hand, but computer assistance is useful for larger ones;
	124	* there are some utilities in `utils/benes.lisp'.
	125	*
	126	* The machinery here expects some parameters to have been defined:
	127	*
	128	* * @regty@ should be an unsigned integer type, and
	129	*
	130	* * @REGWD@ should be a power of two such that @regty@ can store at least
	131	* @REGWD@ bits.
	132	*/
	133
	134	/* We begin with some internal utilities. @CATACOMB__REPLICATE_n_(x)@
	135	* produces a hexadecimal constant consisting of %$n$% copies of the digits
	136	* @x@.
	137	*/
	138	#define CATACOMB__REPLICATE_16_(x) CATACOMB__REPLICATE_8_(GLUE(x, x))
	139	#define CATACOMB__REPLICATE_8_(x) CATACOMB__REPLICATE_4_(GLUE(x, x))
	140	#define CATACOMB__REPLICATE_4_(x) CATACOMB__REPLICATE_2_(GLUE(x, x))
	141	#define CATACOMB__REPLICATE_2_(x) CATACOMB__REPLICATE_1_(GLUE(x, x))
	142	#define CATACOMB__REPLICATE_1_(x) GLUE(0x, x)
	143
	144	/* More internal utilities. @CATACOMB__REPLi_Un(x)@ returns an %$n$%-bit
	145	* hexadecimal constant formed by replicating the %$i$%-bit constant (which
	146	* must have leading zeros) %$n/i$% times.
	147	*/
	148	#define CATACOMB__REPL8_U8 CATACOMB__REPLICATE_1_
	149	#define CATACOMB__REPL8_U16 CATACOMB__REPLICATE_2_
	150	#define CATACOMB__REPL8_U32 CATACOMB__REPLICATE_4_
	151	#define CATACOMB__REPL8_U64 CATACOMB__REPLICATE_8_
	152	#define CATACOMB__REPL8_U128 CATACOMB__REPLICATE_16_
	153
	154	#define CATACOMB__REPL16_U16 CATACOMB__REPLICATE_1_
	155	#define CATACOMB__REPL16_U32 CATACOMB__REPLICATE_2_
	156	#define CATACOMB__REPL16_U64 CATACOMB__REPLICATE_4_
	157	#define CATACOMB__REPL16_U128 CATACOMB__REPLICATE_8_
	158
	159	#define CATACOMB__REPL32_U32 CATACOMB__REPLICATE_1_
	160	#define CATACOMB__REPL32_U64 CATACOMB__REPLICATE_2_
	161	#define CATACOMB__REPL32_U128 CATACOMB__REPLICATE_4_
	162
	163	#define CATACOMB__REPL64_U64 CATACOMB__REPLICATE_1_
	164	#define CATACOMB__REPL64_U128 CATACOMB__REPLICATE_2_
	165
	166	#define CATACOMB__REPL128_U128 CATACOMB__REPLICATE_1_
	167
	168	/* Finally, @CATACOMB__REPLi(x)@ returns a hexadecimal constant formed by
	169	* replicating the %$i$%-bit constant (including leading zeros) sufficiently
	170	* many times as to fill a @REGWD@-bit wide register.
	171	*/
	172	#define CATACOMB__REPL8(x) GLUE(CATACOMB__REPL8_U, REGWD)(x)
	173	#define CATACOMB__REPL16(x) GLUE(CATACOMB__REPL16_U, REGWD)(x)
	174	#define CATACOMB__REPL32(x) GLUE(CATACOMB__REPL32_U, REGWD)(x)
	175	#define CATACOMB__REPL64(x) GLUE(CATACOMB__REPL64_U, REGWD)(x)
	176	#define CATACOMB__REPL128(x) GLUE(CATACOMB__REPL128_U, REGWD)(x)
	177
	178	/* The macro @CATACOMB__IXMASK_Bi(_)@ evaluates to the low @REGWD@ bits of
	179	* the constant %$\mu_i$% defined above. The argument is ignored; it's
	180	* necessary to prevent technical problems with macro expansion
	181	* (specifically, to allow the blue paint on @GLUE@ to be washed off before
	182	* invoking @CATACOMB__REPLi@).
	183	*/
	184	#define CATACOMB__IXMASK_B0(_) CATACOMB__REPL8(55)
	185	#define CATACOMB__IXMASK_B1(_) CATACOMB__REPL8(33)
	186	#define CATACOMB__IXMASK_B2(_) CATACOMB__REPL8(0f)
	187	#define CATACOMB__IXMASK_B3(_) CATACOMB__REPL16(00ff)
	188	#define CATACOMB__IXMASK_B4(_) CATACOMB__REPL32(0000ffff)
	189	#define CATACOMB__IXMASK_B5(_) CATACOMB__REPL64(00000000ffffffff)
	190	#define CATACOMB__IXMASK_B6(_) \
	191	CATACOMB__REPL128(0000000000000000ffffffffffffffff)
	192
	193	/* @IXMASK(i)@ returns the low @REGWD@ bits of %$\mu_i$%. The argument @i@
	194	* must be a decimal integer constant, without leading zeros.
	195	*/
	196	#define IXMASK(i) GLUE(CATACOMB__IXMASK_B, i)(hunoz)
	197
	198	/* @IXMASK_xy(i, j)@ returns a @REGWD@-bit mask in which bit %$k$% is set if
	199	* bit %$i$% of %$k$% is equal to %$x$% and bit %$j$% of %$k$% is equal to
	200	* %$y$%. The arguments @i@ and @j@ must be decimal integer constants,
	201	* without leading zeros.
	202	*/
	203	#define IXMASK_00(i, j) (IXMASK(i)&IXMASK(j))
	204	#define IXMASK_01(i, j) (IXMASK(i)&~IXMASK(j))
	205	#define IXMASK_10(i, j) (~IXMASK(i)&IXMASK(j))
	206	#define IXMASK_11(i, j) (~IXMASK(i)&~IXMASK(j))
	207
	208	/* The general swizzle operation. Exchange the bits in @x@ selected by
	209	* @mask@ with those selected by @mask << shift@.
	210	*/
	211	#define SWIZZLE(x, shift, mask) do { \
	212	regty _t = ((x) ^ ((x) >> (shift)))&(mask); \
	213	(x) ^= _t \| (_t << (shift)); \
	214	} while (0)
	215
	216	/* A swizzle on two words @x@ and @y@, using the same shift, but different
	217	* masks @mask0@ and @mask1@. This is just a simple abbreviation.
	218	*/
	219	#define SWIZZLE_2(x, y, shift, mask0, mask1) do { \
	220	SWIZZLE(x, shift, mask0); SWIZZLE(y, shift, mask1); \
	221	} while (0)
	222
	223	/* A `twizzle', or a swizzle across two words.
	224	*
	225	* The @TWIZZLE_0@ macro exchanges the bits of @x@ and @y selected by
	226	* @mask@. The @TWIZZLE_L@ and @TWIZZLE_R@ macros exchange the bits selected
	227	* by @mask@ in @y@ with the bits in @x@ selected by @mask << shift@ or
	228	* @mask >> shift@ respectively. (The names are from the direction in which
	229	* @x@ is shifted, not the direction the mask is shifted.)
	230	*
	231	* These are used to synthesize swizzles within multiprecision words: if the
	232	* intended shift is %$a w + b$%, where %$w$% is the word width, then %$a$%
	233	* gives the difference between word indices of the words to be processed,
	234	* and %$\abs{b$}% gives the @shift@ argument; use @TWIZZLE_R@ if
	235	* %$b \ge 0$%, @TWIZZLE_L@ if %$b \le 0$% is nonpositive, or @TWIZZLE_0@ if
	236	* %$b = 0$%. (We can easily distinguish which of %$a w \pm b$% or
	237	* %$(a \pm 1) w \mp (w - b)$%, since one kind of shift will keep @mask@
	238	* within the same word, and the other will shift it out completely.)
	239	*/
	240	#define TWIZZLE_0(x, y, mask) do { \
	241	regty _t = ((y) ^ ((x)))&(mask); \
	242	(x) ^= _t; (y) ^= _t; \
	243	} while (0)
	244	#define TWIZZLE_L(x, y, shift, mask) do { \
	245	regty _t = ((y) ^ ((x) << (shift)))&(mask); \
	246	(x) ^= _t >> (shift); (y) ^= _t; \
	247	} while (0)
	248	#define TWIZZLE_R(x, y, shift, mask) do { \
	249	regty _t = ((y) ^ ((x) >> (shift)))&(mask); \
	250	(x) ^= _t << (shift); (y) ^= _t; \
	251	} while (0)
	252
	253	/* @SWIZZLE_CPL@ applies a swizzle to @x@ which complements index bit @i@;
	254	* @SWIZZLE_EXCH@ applies a swizzle to exchange index bits @i@ and @j@; and
	255	* @SWIZZLE_XCPL@ applies a swizzle to exchange and invert index bits @i@ and
	256	* @j@. The arguments @i@ and @j@ must be decimal integer constants without
	257	* leading zeros, with %$i \le j$%. (The macros do nothing if %$i = j$%.)
	258	*
	259	* The variants with @2@ in their names act identically on @x@ and @y@, and
	260	* are intended as a simple convenience.
	261	*/
	262	#define SWIZZLE_CPL(x, i) SWIZZLE(x, (1 << (i)), IXMASK(i))
	263	#define SWIZZLE_EXCH(x, i, j) \
	264	SWIZZLE(x, (1 << (j)) - (1 << (i)), IXMASK_10(i, j))
	265	#define SWIZZLE_XCPL(x, i, j) \
	266	SWIZZLE(x, (1 << (j)) + (1 << (i)), IXMASK_00(i, j))
	267
	268	#define SWIZZLE_CPL2(x, y, i) \
	269	SWIZZLE_2(x, y, (1 << (i)), IXMASK(i), IXMASK(i))
	270	#define SWIZZLE_EXCH2(x, y, i, j) \
	271	SWIZZLE_2(x, y, (1 << (j)) - (1 << (i)), \
	272	IXMASK_10(i, j), IXMASK_10(i, j))
	273	#define SWIZZLE_XCPL2(x, y, i, j) \
	274	SWIZZLE_2(x, y, (1 << (j)) + (1 << (i)), \
	275	IXMASK_00(i, j), IXMASK_00(i, j))
	276
	277	/* The @TWIZZLE_EXCH@ and @TWIZZLE_XCPL@ macros act like the similarly named
	278	* @SWIZZLE_...@ macros above, except that (a) they act on two words from a
	279	* multiprecision value, and the @j@ index is implicit in the selection of
	280	* the operands @x@ and @y@. If the word width is %$w$%, and %$2^j = n w$%,
	281	* then %$x$% should be chosen to be %$n$% slots more significant than
	282	* %$y$%.
	283	*
	284	* Note that there is no @TWIZZLE_CPL@, since this would simply involve
	285	* exchanging two entries in an array.
	286	*/
	287	#define TWIZZLE_EXCH(x, y, i) TWIZZLE_L(x, y, (1 << (i)), ~IXMASK(i))
	288	#define TWIZZLE_XCPL(x, y, i) TWIZZLE_R(x, y, (1 << (i)), IXMASK(i))
	289
	290	/----- That's all, folks -------------------------------------------------/
	291
	292	#ifdef __cplusplus
	293	}
	294	#endif
	295
	296	#endif