#include "mpx.h"
#include "gfx.h"
-
-/*----- Static variables --------------------------------------------------*/
-
-extern const uint16 gfx_sqrtab[256];
+#include "permute.h"
/*----- Main code ---------------------------------------------------------*/
void gfx_sqr(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl)
{
- mpd a = 0, aa = 0;
- unsigned b = 0, bb = 0;
+#if MPW_BITS <= 16
+# define REGWD 16
+#elif MPW_BITS <= 32
+# define REGWD 32
+#elif MPW_BITS <= 64
+# define REGWD 64
+#elif MPW_BITS <= 128
+# define REGWD 128
+#else
+# error "unsupported limb width: extend `gfx-sqr.c'"
+#endif
+
+#if MPW_BITS == REGWD
+ typedef mpw regty;
+#else
+ typedef mpd regty;
+#endif
+
+ regty a, t;
/* --- Simple stuff --- */
* Our method depends on the fact that, in a field of characteristic 2, we
* have that %$(a + b)^2 = a^2 + b^2$%. Thus, to square a polynomial, it's
* sufficient just to put a zero bit between each of the bits of the
- * original argument. We use a precomputed table for this, and work on
- * entire octets at a time. Life is more complicated because we've got to
- * be careful of bizarre architectures which don't have words with a
- * multiple of 8 bits in them.
+ * original argument.
*/
- for (;;) {
-
- /* --- Input buffering --- */
-
- if (b < 8) {
- if (av >= avl)
- break;
- a |= *av++ << b;
- b += MPW_BITS;
- }
-
- /* --- Do the work in the middle --- */
-
- aa |= (mpd)(gfx_sqrtab[U8(a)]) << bb;
- bb += 16;
- a >>= 8;
- b -= 8;
-
- /* --- Output buffering --- */
-
- if (bb >= MPW_BITS) {
- *dv++ = MPW(aa);
- if (dv >= dvl)
- return;
- aa >>= MPW_BITS;
- bb -= MPW_BITS;
- }
- }
-
- /* --- Flush the input buffer --- */
-
- if (b) for (;;) {
- aa |= (mpd)(gfx_sqrtab[U8(a)]) << bb;
- bb += 16;
- if (bb > MPW_BITS) {
- *dv++ = MPW(aa);
- if (dv >= dvl)
- return;
- aa >>= MPW_BITS;
- bb -= MPW_BITS;
- }
- a >>= 8;
- if (b <= 8)
- break;
- else
- b -= 8;
- }
+ while (av < avl) {
+ a = *av++;
+ /* ..., 7, 6, 5, 4, 3, 2, 1, 0 */
+ SWIZZLE_EXCH(a, 0, 1); /* ..., 7, 6, 5, 4, 3, 2, 0, 1 */
+ SWIZZLE_EXCH(a, 0, 2); /* ..., 7, 6, 5, 4, 3, 1, 0, 2 */
+ SWIZZLE_EXCH(a, 0, 3); /* ..., 7, 6, 5, 4, 2, 1, 0, 3 */
+#if MPW_BITS > 16
+ SWIZZLE_EXCH(a, 0, 4); /* ..., 7, 6, 5, 3, 2, 1, 0, 4 */
+#endif
+#if MPW_BITS > 32
+ SWIZZLE_EXCH(a, 0, 5); /* ..., 7, 6, 4, 3, 2, 1, 0, 5 */
+#endif
+#if MPW_BITS > 64
+ SWIZZLE_EXCH(a, 0, 6); /* ..., 7, 5, 4, 3, 2, 1, 0, 6 */
+#endif
+#if MPW_BITS > 128
+# error "unsupported limb width: extend `gfx-sqr.c'"
+#endif
- /* --- Flush the output buffer --- */
-
- if (bb) for (;;) {
- *dv++ = MPW(aa);
- if (dv >= dvl)
- return;
- aa >>= MPW_BITS;
- if (bb <= MPW_BITS)
- break;
- else
- bb -= MPW_BITS;
+ /* Write out the low half, which consists of the low-order bits in
+ * even-numbered positions.
+ */
+ *dv++ = MPW(a&IXMASK(0)); if (dv >= dvl) break;
+
+ /* Write the high half. This is trickier, since, in general, the bits
+ * are split across even-numbered bits in the high part of the wider
+ * register and odd-numbered bits in the low part. Alas, we can't
+ * resolve this mess without special cases because shifts are broken in
+ * C.
+ */
+ t = ((a >> 1)&IXMASK(0)) << (REGWD - MPW_BITS);
+#if MPW_BITS < REGWD
+ t |= (a&IXMASK(0)) >> MPW_BITS;
+#endif
+ *dv++ = MPW(t); if (dv >= dvl) break;
}
/* --- Zero the rest of everything --- */
MPX_ZERO(dv, dvl);
+
+#undef REGWD
}
/*----- Test rig ----------------------------------------------------------*/