X-Git-Url: https://git.distorted.org.uk/u/mdw/catacomb/blobdiff_plain/5bf74deaebe06922cc0e03bd9118f207b31f211e..c760149fcb65296defd1af967fbfa098bd83143a:/mpx-ksqr.c

diff --git a/mpx-ksqr.c b/mpx-ksqr.c
index 45c49c9..226b99a 100644
--- a/mpx-ksqr.c
+++ b/mpx-ksqr.c
@@ -1,6 +1,6 @@
 /* -*-c-*-
  *
- * $Id: mpx-ksqr.c,v 1.1 1999/12/11 10:57:43 mdw Exp $
+ * $Id: mpx-ksqr.c,v 1.3 2000/06/17 11:42:54 mdw Exp $
  *
  * Karatsuba-based squaring algorithm
  *
@@ -30,6 +30,14 @@
 /*----- Revision history --------------------------------------------------* 
  *
  * $Log: mpx-ksqr.c,v $
+ * Revision 1.3  2000/06/17 11:42:54  mdw
+ * Moved the Karatsuba macros into a separate file for better sharing.
+ * Fixed some comments.  Use an improved technique so that all the
+ * operations are squarings.
+ *
+ * Revision 1.2  1999/12/13 15:35:01  mdw
+ * Simplify and improve.
+ *
  * Revision 1.1  1999/12/11 10:57:43  mdw
  * Karatsuba squaring algorithm.
  *
@@ -37,9 +45,11 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include <assert.h>
 #include <stdio.h>
 
 #include "mpx.h"
+#include "mpx-kmac.h"
 
 /*----- Tweakables --------------------------------------------------------*/
 
@@ -48,42 +58,6 @@
 #  define KARATSUBA_CUTOFF 2
 #endif
 
-/*----- Addition macros ---------------------------------------------------*/
-
-#define ULSL1(dv, av, avl) do {						\
-  mpw *_dv = (dv);							\
-  const mpw *_av = (av), *_avl = (avl);					\
-  mpw _c = 0;								\
-									\
-  while (_av < _avl) {							\
-    mpw _x = *_av++;							\
-    *_dv++ = MPW(_c | (_x << 1));					\
-    _c = MPW(_x >> (MPW_BITS - 1));					\
-  }									\
-  *_dv++ = _c;								\
-} while (0)
-
-#define UADD(dv, av, avl) do {						\
-  mpw *_dv = (dv);							\
-  const mpw *_av = (av), *_avl = (avl);					\
-  mpw _c = 0;								\
-									\
-  while (_av < _avl) {							\
-    mpw _a, _b;								\
-    mpd _x;								\
-    _a = *_av++;							\
-    _b = *_dv;								\
-    _x = (mpd)_a + (mpd)_b + _c;					\
-    *_dv++ = MPW(_x);							\
-    _c = _x >> MPW_BITS;						\
-  }									\
-  while (_c) {								\
-    mpd _x = (mpd)*_dv + (mpd)_c;					\
-    *_dv++ = MPW(_x);							\
-    _c = _x >> MPW_BITS;						\
-  }									\
-} while (0)
-
 /*----- Main code ---------------------------------------------------------*/
 
 /* --- @mpx_ksqr@ --- *
@@ -129,10 +103,10 @@ void mpx_ksqr(mpw *dv, mpw *dvl,
 
   /* --- How the algorithm works --- *
    *
-   * Unlike Karatsuba's identity for multiplication which isn't particularly
-   * obvious, the identity for multiplication is known to all schoolchildren.
-   * Let %$A = xb + y$%.  Then %$A^2 = x^2 b^x + 2 x y b + y^2$%.  So now I
-   * have three multiplications, each four times easier, and that's a win.
+   * The identity for squaring is known to all schoolchildren.
+   * Let %$A = xb + y$%.  Then %$A^2 = x^2 b^2 + 2 x y b + y^2$%.  Now,
+   * %$(x + y)^2 - x^2 - y^2 = 2 x y$%, which means I only need to do three
+   * squarings.
    */
 
   /* --- First things --- *
@@ -143,38 +117,39 @@ void mpx_ksqr(mpw *dv, mpw *dvl,
   m = (avl - av + 1) >> 1;
   avm = av + m;
 
+  assert(((void)"Destination too small for Karatsuba square",
+	  dvl - dv >= 4 * m));
+  assert(((void)"Not enough workspace for Karatsuba square",
+	  svl - sv >= 4 * m));
+
   /* --- Sort out everything --- */
 
   {
-    mpw *ssv = sv + 2 * m;
+    mpw *svm = sv + m, *svn = svm + m, *ssv = svn + 4;
     mpw *tdv = dv + m;
     mpw *rdv = tdv + m;
 
-    /* --- The cross term in the middle needs a multiply --- *
-     *
-     * This isn't actually true, since %$x y = ((x + y)^2 - (x - y)^2)/4%.
-     * But that's two squarings, versus one multiplication.
-     */
-
+    UADD2(sv, svm, av, avm, avm, avl);
     if (m > KARATSUBA_CUTOFF)
-      mpx_kmul(sv, ssv, av, avm, avm, avl, ssv, svl);
+      mpx_ksqr(tdv, rdv + m + 4, sv, svm + 1, ssv, svl);
     else
-      mpx_umul(sv, ssv, av, avm, avm, avl);
-    ULSL1(tdv, sv, ssv);
-    MPX_ZERO(dv, tdv);
-    MPX_ZERO(rdv + m + 1, dvl);
+      mpx_usqr(tdv, rdv + m + 4, sv, svm + 1);
 
     if (m > KARATSUBA_CUTOFF)
       mpx_ksqr(sv, ssv, avm, avl, ssv, svl);
     else
       mpx_usqr(sv, ssv, avm, avl);
-    UADD(rdv, sv, ssv);
+    MPX_COPY(rdv + m + 1, dvl, svm + 1, svn);
+    UADD(rdv, sv, svm + 1);
+    USUB(tdv, sv, svn);
     
     if (m > KARATSUBA_CUTOFF)
       mpx_ksqr(sv, ssv, av, avm, ssv, svl);
     else
       mpx_usqr(sv, ssv, av, avm);
-    UADD(dv, sv, ssv);
+    MPX_COPY(dv, tdv, sv, svm);
+    UADD(tdv, svm, svn);
+    USUB(tdv, sv, svn);
   }
 }