X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/48af823d9b5218ce32d7590b786d3b1cd089b68c..d9d9e6456f3f6d03a0ae1d1eb78e6796ecdb7fab:/math/gfx-sqr.c diff --git a/math/gfx-sqr.c b/math/gfx-sqr.c index ba54da88..0192738b 100644 --- a/math/gfx-sqr.c +++ b/math/gfx-sqr.c @@ -29,10 +29,7 @@ #include "mpx.h" #include "gfx.h" - -/*----- Static variables --------------------------------------------------*/ - -extern const uint16 gfx_sqrtab[256]; +#include "permute.h" /*----- Main code ---------------------------------------------------------*/ @@ -48,8 +45,25 @@ extern const uint16 gfx_sqrtab[256]; void gfx_sqr(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl) { - mpd a = 0, aa = 0; - unsigned b = 0, bb = 0; +#if MPW_BITS <= 16 +# define REGWD 16 +#elif MPW_BITS <= 32 +# define REGWD 32 +#elif MPW_BITS <= 64 +# define REGWD 64 +#elif MPW_BITS <= 128 +# define REGWD 128 +#else +# error "unsupported limb width: extend `gfx-sqr.c'" +#endif + +#if MPW_BITS == REGWD + typedef mpw regty; +#else + typedef mpd regty; +#endif + + regty a, t; /* --- Simple stuff --- */ @@ -62,76 +76,51 @@ void gfx_sqr(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl) * Our method depends on the fact that, in a field of characteristic 2, we * have that %$(a + b)^2 = a^2 + b^2$%. Thus, to square a polynomial, it's * sufficient just to put a zero bit between each of the bits of the - * original argument. We use a precomputed table for this, and work on - * entire octets at a time. Life is more complicated because we've got to - * be careful of bizarre architectures which don't have words with a - * multiple of 8 bits in them. + * original argument. */ - for (;;) { - - /* --- Input buffering --- */ - - if (b < 8) { - if (av >= avl) - break; - a |= *av++ << b; - b += MPW_BITS; - } - - /* --- Do the work in the middle --- */ - - aa |= (mpd)(gfx_sqrtab[U8(a)]) << bb; - bb += 16; - a >>= 8; - b -= 8; - - /* --- Output buffering --- */ - - if (bb >= MPW_BITS) { - *dv++ = MPW(aa); - if (dv >= dvl) - return; - aa >>= MPW_BITS; - bb -= MPW_BITS; - } - } - - /* --- Flush the input buffer --- */ - - if (b) for (;;) { - aa |= (mpd)(gfx_sqrtab[U8(a)]) << bb; - bb += 16; - if (bb > MPW_BITS) { - *dv++ = MPW(aa); - if (dv >= dvl) - return; - aa >>= MPW_BITS; - bb -= MPW_BITS; - } - a >>= 8; - if (b <= 8) - break; - else - b -= 8; - } + while (av < avl) { + a = *av++; + /* ..., 7, 6, 5, 4, 3, 2, 1, 0 */ + SWIZZLE_EXCH(a, 0, 1); /* ..., 7, 6, 5, 4, 3, 2, 0, 1 */ + SWIZZLE_EXCH(a, 0, 2); /* ..., 7, 6, 5, 4, 3, 1, 0, 2 */ + SWIZZLE_EXCH(a, 0, 3); /* ..., 7, 6, 5, 4, 2, 1, 0, 3 */ +#if MPW_BITS > 16 + SWIZZLE_EXCH(a, 0, 4); /* ..., 7, 6, 5, 3, 2, 1, 0, 4 */ +#endif +#if MPW_BITS > 32 + SWIZZLE_EXCH(a, 0, 5); /* ..., 7, 6, 4, 3, 2, 1, 0, 5 */ +#endif +#if MPW_BITS > 64 + SWIZZLE_EXCH(a, 0, 6); /* ..., 7, 5, 4, 3, 2, 1, 0, 6 */ +#endif +#if MPW_BITS > 128 +# error "unsupported limb width: extend `gfx-sqr.c'" +#endif - /* --- Flush the output buffer --- */ - - if (bb) for (;;) { - *dv++ = MPW(aa); - if (dv >= dvl) - return; - aa >>= MPW_BITS; - if (bb <= MPW_BITS) - break; - else - bb -= MPW_BITS; + /* Write out the low half, which consists of the low-order bits in + * even-numbered positions. + */ + *dv++ = MPW(a&IXMASK(0)); if (dv >= dvl) break; + + /* Write the high half. This is trickier, since, in general, the bits + * are split across even-numbered bits in the high part of the wider + * register and odd-numbered bits in the low part. Alas, we can't + * resolve this mess without special cases because shifts are broken in + * C. + */ + t = ((a >> 1)&IXMASK(0)) << (REGWD - MPW_BITS); +#if MPW_BITS < REGWD + t |= (a&IXMASK(0)) >> MPW_BITS; +#endif + *dv++ = MPW(t); if (dv >= dvl) break; } /* --- Zero the rest of everything --- */ MPX_ZERO(dv, dvl); + +#undef REGWD } /*----- Test rig ----------------------------------------------------------*/