+ if (len > KARATSUBA_THRESHOLD) {
+
+ /*
+ * Karatsuba divide-and-conquer algorithm. Cut each input in
+ * half, so that it's expressed as two big 'digits' in a giant
+ * base D:
+ *
+ * a = a_1 D + a_0
+ * b = b_1 D + b_0
+ *
+ * Then the product is of course
+ *
+ * ab = a_1 b_1 D^2 + (a_1 b_0 + a_0 b_1) D + a_0 b_0
+ *
+ * and we compute the three coefficients by recursively
+ * calling ourself to do half-length multiplications.
+ *
+ * The clever bit that makes this worth doing is that we only
+ * need _one_ half-length multiplication for the central
+ * coefficient rather than the two that it obviouly looks
+ * like, because we can use a single multiplication to compute
+ *
+ * (a_1 + a_0) (b_1 + b_0) = a_1 b_1 + a_1 b_0 + a_0 b_1 + a_0 b_0
+ *
+ * and then we subtract the other two coefficients (a_1 b_1
+ * and a_0 b_0) which we were computing anyway.
+ *
+ * Hence we get to multiply two numbers of length N in about
+ * three times as much work as it takes to multiply numbers of
+ * length N/2, which is obviously better than the four times
+ * as much work it would take if we just did a long
+ * conventional multiply.
+ */
+
+ int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */
+ int midlen = botlen + 1;
+ BignumInt *scratch;
+ BignumDblInt carry;
+#ifdef KARA_DEBUG
+ int i;
+#endif
+
+ /*
+ * The coefficients a_1 b_1 and a_0 b_0 just avoid overlapping
+ * in the output array, so we can compute them immediately in
+ * place.
+ */
+
+#ifdef KARA_DEBUG
+ printf("a1,a0 = 0x");
+ for (i = 0; i < len; i++) {
+ if (i == toplen) printf(", 0x");
+ printf("%0*x", BIGNUM_INT_BITS/4, a[i]);
+ }
+ printf("\n");
+ printf("b1,b0 = 0x");
+ for (i = 0; i < len; i++) {
+ if (i == toplen) printf(", 0x");
+ printf("%0*x", BIGNUM_INT_BITS/4, b[i]);
+ }
+ printf("\n");
+#endif
+
+ /* a_1 b_1 */
+ internal_mul(a, b, c, toplen);
+#ifdef KARA_DEBUG
+ printf("a1b1 = 0x");
+ for (i = 0; i < 2*toplen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, c[i]);
+ }
+ printf("\n");
+#endif
+
+ /* a_0 b_0 */
+ internal_mul(a + toplen, b + toplen, c + 2*toplen, botlen);
+#ifdef KARA_DEBUG
+ printf("a0b0 = 0x");
+ for (i = 0; i < 2*botlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, c[2*toplen+i]);
+ }
+ printf("\n");
+#endif
+
+ /*
+ * We must allocate scratch space for the central coefficient,
+ * and also for the two input values that we multiply when
+ * computing it. Since either or both may carry into the
+ * (botlen+1)th word, we must use a slightly longer length
+ * 'midlen'.
+ */
+ scratch = snewn(4 * midlen, BignumInt);
+
+ /* Zero padding. midlen exceeds toplen by at most 2, so just
+ * zero the first two words of each input and the rest will be
+ * copied over. */
+ scratch[0] = scratch[1] = scratch[midlen] = scratch[midlen+1] = 0;
+
+ for (j = 0; j < toplen; j++) {
+ scratch[midlen - toplen + j] = a[j]; /* a_1 */
+ scratch[2*midlen - toplen + j] = b[j]; /* b_1 */
+ }
+
+ /* compute a_1 + a_0 */
+ scratch[0] = internal_add(scratch+1, a+toplen, scratch+1, botlen);
+#ifdef KARA_DEBUG
+ printf("a1plusa0 = 0x");
+ for (i = 0; i < midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[i]);
+ }
+ printf("\n");
+#endif
+ /* compute b_1 + b_0 */
+ scratch[midlen] = internal_add(scratch+midlen+1, b+toplen,
+ scratch+midlen+1, botlen);
+#ifdef KARA_DEBUG
+ printf("b1plusb0 = 0x");
+ for (i = 0; i < midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[midlen+i]);
+ }
+ printf("\n");
+#endif
+
+ /*
+ * Now we can do the third multiplication.
+ */
+ internal_mul(scratch, scratch + midlen, scratch + 2*midlen, midlen);
+#ifdef KARA_DEBUG
+ printf("a1plusa0timesb1plusb0 = 0x");
+ for (i = 0; i < 2*midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[2*midlen+i]);
+ }
+ printf("\n");
+#endif
+
+ /*
+ * Now we can reuse the first half of 'scratch' to compute the
+ * sum of the outer two coefficients, to subtract from that
+ * product to obtain the middle one.
+ */
+ scratch[0] = scratch[1] = scratch[2] = scratch[3] = 0;
+ for (j = 0; j < 2*toplen; j++)
+ scratch[2*midlen - 2*toplen + j] = c[j];
+ scratch[1] = internal_add(scratch+2, c + 2*toplen,
+ scratch+2, 2*botlen);
+#ifdef KARA_DEBUG
+ printf("a1b1plusa0b0 = 0x");
+ for (i = 0; i < 2*midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[i]);
+ }
+ printf("\n");
+#endif
+
+ internal_sub(scratch + 2*midlen, scratch,
+ scratch + 2*midlen, 2*midlen);
+#ifdef KARA_DEBUG
+ printf("a1b0plusa0b1 = 0x");
+ for (i = 0; i < 2*midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[2*midlen+i]);
+ }
+ printf("\n");
+#endif
+
+ /*
+ * And now all we need to do is to add that middle coefficient
+ * back into the output. We may have to propagate a carry
+ * further up the output, but we can be sure it won't
+ * propagate right the way off the top.
+ */
+ carry = internal_add(c + 2*len - botlen - 2*midlen,
+ scratch + 2*midlen,
+ c + 2*len - botlen - 2*midlen, 2*midlen);
+ j = 2*len - botlen - 2*midlen - 1;
+ while (carry) {
+ assert(j >= 0);
+ carry += c[j];
+ c[j] = (BignumInt)carry;
+ carry >>= BIGNUM_INT_BITS;
+ j--;
+ }
+#ifdef KARA_DEBUG
+ printf("ab = 0x");
+ for (i = 0; i < 2*len; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, c[i]);
+ }
+ printf("\n");
+#endif
+
+ /* Free scratch. */
+ for (j = 0; j < 4 * midlen; j++)
+ scratch[j] = 0;
+ sfree(scratch);
+
+ } else {
+
+ /*
+ * Multiply in the ordinary O(N^2) way.
+ */
+
+ for (j = 0; j < 2 * len; j++)
+ c[j] = 0;
+
+ for (i = len - 1; i >= 0; i--) {
+ t = 0;
+ for (j = len - 1; j >= 0; j--) {
+ t += MUL_WORD(a[i], (BignumDblInt) b[j]);
+ t += (BignumDblInt) c[i + j + 1];
+ c[i + j + 1] = (BignumInt) t;
+ t = t >> BIGNUM_INT_BITS;
+ }
+ c[i] = (BignumInt) t;
+ }
+ }
+}
+
+/*
+ * Variant form of internal_mul used for the initial step of
+ * Montgomery reduction. Only bothers outputting 'len' words
+ * (everything above that is thrown away).
+ */
+static void internal_mul_low(const BignumInt *a, const BignumInt *b,
+ BignumInt *c, int len)
+{
+ int i, j;
+ BignumDblInt t;
+
+ if (len > KARATSUBA_THRESHOLD) {
+
+ /*
+ * Karatsuba-aware version of internal_mul_low. As before, we
+ * express each input value as a shifted combination of two
+ * halves:
+ *
+ * a = a_1 D + a_0
+ * b = b_1 D + b_0
+ *
+ * Then the full product is, as before,
+ *
+ * ab = a_1 b_1 D^2 + (a_1 b_0 + a_0 b_1) D + a_0 b_0
+ *
+ * Provided we choose D on the large side (so that a_0 and b_0
+ * are _at least_ as long as a_1 and b_1), we don't need the
+ * topmost term at all, and we only need half of the middle
+ * term. So there's no point in doing the proper Karatsuba
+ * optimisation which computes the middle term using the top
+ * one, because we'd take as long computing the top one as
+ * just computing the middle one directly.
+ *
+ * So instead, we do a much more obvious thing: we call the
+ * fully optimised internal_mul to compute a_0 b_0, and we
+ * recursively call ourself to compute the _bottom halves_ of
+ * a_1 b_0 and a_0 b_1, each of which we add into the result
+ * in the obvious way.
+ *
+ * In other words, there's no actual Karatsuba _optimisation_
+ * in this function; the only benefit in doing it this way is
+ * that we call internal_mul proper for a large part of the
+ * work, and _that_ can optimise its operation.
+ */
+
+ int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */
+ BignumInt *scratch;
+
+ /*
+ * Allocate scratch space for the various bits and pieces
+ * we're going to be adding together. We need botlen*2 words
+ * for a_0 b_0 (though we may end up throwing away its topmost
+ * word), and toplen words for each of a_1 b_0 and a_0 b_1.
+ * That adds up to exactly 2*len.
+ */
+ scratch = snewn(len*2, BignumInt);
+
+ /* a_0 b_0 */
+ internal_mul(a + toplen, b + toplen, scratch + 2*toplen, botlen);
+
+ /* a_1 b_0 */
+ internal_mul_low(a, b + len - toplen, scratch + toplen, toplen);
+
+ /* a_0 b_1 */
+ internal_mul_low(a + len - toplen, b, scratch, toplen);
+
+ /* Copy the bottom half of the big coefficient into place */
+ for (j = 0; j < botlen; j++)
+ c[toplen + j] = scratch[2*toplen + botlen + j];
+
+ /* Add the two small coefficients, throwing away the returned carry */
+ internal_add(scratch, scratch + toplen, scratch, toplen);
+
+ /* And add that to the large coefficient, leaving the result in c. */
+ internal_add(scratch, scratch + 2*toplen + botlen - toplen,
+ c, toplen);
+
+ /* Free scratch. */
+ for (j = 0; j < len*2; j++)
+ scratch[j] = 0;
+ sfree(scratch);
+
+ } else {
+
+ for (j = 0; j < len; j++)
+ c[j] = 0;
+
+ for (i = len - 1; i >= 0; i--) {
+ t = 0;
+ for (j = len - 1; j >= len - i - 1; j--) {
+ t += MUL_WORD(a[i], (BignumDblInt) b[j]);
+ t += (BignumDblInt) c[i + j + 1 - len];
+ c[i + j + 1 - len] = (BignumInt) t;
+ t = t >> BIGNUM_INT_BITS;
+ }
+ }