+ /* compute a_1 + a_0 */
+ scratch[midlen - 1] = internal_add(scratch, a, scratch, botlen);
+#ifdef KARA_DEBUG
+ printf("a1plusa0 = 0x");
+ for (i = 0; i < midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[midlen - 1 - i]);
+ }
+ printf("\n");
+#endif
+ /* compute b_1 + b_0 */
+ scratch[2*midlen - 1] = internal_add(scratch+midlen, b,
+ scratch+midlen, botlen);
+#ifdef KARA_DEBUG
+ printf("b1plusb0 = 0x");
+ for (i = 0; i < midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[2*midlen - 1 - i]);
+ }
+ printf("\n");
+#endif
+
+ /*
+ * Now we can do the third multiplication.
+ */
+ internal_mul(scratch, scratch + midlen, scratch + 2*midlen, midlen,
+ scratch + 4*midlen);
+#ifdef KARA_DEBUG
+ printf("a1plusa0timesb1plusb0 = 0x");
+ for (i = 0; i < 2*midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[4*midlen - 1 - i]);
+ }
+ printf("\n");
+#endif
+
+ /*
+ * Now we can reuse the first half of 'scratch' to compute the
+ * sum of the outer two coefficients, to subtract from that
+ * product to obtain the middle one.
+ */
+ scratch[2*botlen - 2] = scratch[2*botlen - 1] = 0;
+ for (i = 0; i < 2*toplen; i++)
+ scratch[i] = c[2*botlen + i];
+ scratch[2*botlen] = internal_add(scratch, c, scratch, 2*botlen);
+ scratch[2*botlen + 1] = 0;
+#ifdef KARA_DEBUG
+ printf("a1b1plusa0b0 = 0x");
+ for (i = 0; i < 2*midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[2*midlen - 1 - i]);
+ }
+ printf("\n");
+#endif
+
+ internal_sub(scratch + 2*midlen, scratch, scratch, 2*midlen);
+#ifdef KARA_DEBUG
+ printf("a1b0plusa0b1 = 0x");
+ for (i = 0; i < 2*midlen; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, scratch[4*midlen - 1 - i]);
+ }
+ printf("\n");
+#endif
+
+ /*
+ * And now all we need to do is to add that middle coefficient
+ * back into the output. We may have to propagate a carry
+ * further up the output, but we can be sure it won't
+ * propagate right the way off the top.
+ */
+ carry = internal_add(c + botlen, scratch, c + botlen, 2*midlen);
+ i = botlen + 2*midlen;
+ while (carry) {
+ assert(i <= 2*len);
+ carry += c[i];
+ c[i] = (BignumInt)carry;
+ carry >>= BIGNUM_INT_BITS;
+ i++;
+ }
+#ifdef KARA_DEBUG
+ printf("ab = 0x");
+ for (i = 0; i < 2*len; i++) {
+ printf("%0*x", BIGNUM_INT_BITS/4, c[2*len - i]);
+ }
+ printf("\n");
+#endif
+
+ } else {
+ int i;
+ BignumInt carry;
+ BignumDblInt t;
+ const BignumInt *ap, *alim = a + len, *bp, *blim = b + len;
+ BignumInt *cp, *cps;
+
+ /*
+ * Multiply in the ordinary O(N^2) way.
+ */
+
+ for (i = 0; i < 2 * len; i++)
+ c[i] = 0;
+
+ for (cps = c, ap = a; ap < alim; ap++, cps++) {
+ carry = 0;
+ for (cp = cps, bp = b; bp < blim; bp++, cp++) {
+ t = (MUL_WORD(*ap, *bp) + carry) + *cp;
+ *cp = (BignumInt) t;
+ carry = (BignumInt)(t >> BIGNUM_INT_BITS);
+ }
+ *cp = carry;
+ }
+ }
+}
+
+/*
+ * Variant form of internal_mul used for the initial step of
+ * Montgomery reduction. Only bothers outputting 'len' words
+ * (everything above that is thrown away).
+ */
+static void internal_mul_low(const BignumInt *a, const BignumInt *b,
+ BignumInt *c, int len, BignumInt *scratch)
+{
+ if (len > KARATSUBA_THRESHOLD) {
+ int i;
+
+ /*
+ * Karatsuba-aware version of internal_mul_low. As before, we
+ * express each input value as a shifted combination of two
+ * halves:
+ *
+ * a = a_1 D + a_0
+ * b = b_1 D + b_0
+ *
+ * Then the full product is, as before,
+ *
+ * ab = a_1 b_1 D^2 + (a_1 b_0 + a_0 b_1) D + a_0 b_0
+ *
+ * Provided we choose D on the large side (so that a_0 and b_0
+ * are _at least_ as long as a_1 and b_1), we don't need the
+ * topmost term at all, and we only need half of the middle
+ * term. So there's no point in doing the proper Karatsuba
+ * optimisation which computes the middle term using the top
+ * one, because we'd take as long computing the top one as
+ * just computing the middle one directly.
+ *
+ * So instead, we do a much more obvious thing: we call the
+ * fully optimised internal_mul to compute a_0 b_0, and we
+ * recursively call ourself to compute the _bottom halves_ of
+ * a_1 b_0 and a_0 b_1, each of which we add into the result
+ * in the obvious way.
+ *
+ * In other words, there's no actual Karatsuba _optimisation_
+ * in this function; the only benefit in doing it this way is
+ * that we call internal_mul proper for a large part of the
+ * work, and _that_ can optimise its operation.
+ */
+
+ int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */
+
+ /*
+ * Scratch space for the various bits and pieces we're going
+ * to be adding together: we need botlen*2 words for a_0 b_0
+ * (though we may end up throwing away its topmost word), and
+ * toplen words for each of a_1 b_0 and a_0 b_1. That adds up
+ * to exactly 2*len.
+ */
+
+ /* a_0 b_0 */
+ internal_mul(a, b, scratch + 2*toplen, botlen, scratch + 2*len);
+
+ /* a_1 b_0 */
+ internal_mul_low(a + botlen, b, scratch + toplen, toplen,
+ scratch + 2*len);
+
+ /* a_0 b_1 */
+ internal_mul_low(a, b + botlen, scratch, toplen, scratch + 2*len);
+
+ /* Copy the bottom half of the big coefficient into place */
+ for (i = 0; i < botlen; i++)
+ c[i] = scratch[2*toplen + i];
+
+ /* Add the two small coefficients, throwing away the returned carry */
+ internal_add(scratch, scratch + toplen, scratch, toplen);
+
+ /* And add that to the large coefficient, leaving the result in c. */
+ internal_add(scratch, scratch + 2*toplen + botlen,
+ c + botlen, toplen);
+
+ } else {
+ int i;
+ BignumInt carry;
+ BignumDblInt t;
+ const BignumInt *ap, *alen = a + len, *bp;
+ BignumInt *cp, *cps, *clen = c + len;
+
+ /*
+ * Multiply in the ordinary O(N^2) way.
+ */
+
+ for (i = 0; i < len; i++)
+ c[i] = 0;
+
+ for (cps = c, ap = a; ap < alen; ap++, cps++) {
+ carry = 0;
+ for (cp = cps, bp = b; cp < clen; bp++, cp++) {
+ t = (MUL_WORD(*ap, *bp) + carry) + *cp;
+ *cp = (BignumInt) t;
+ carry = (BignumInt)(t >> BIGNUM_INT_BITS);
+ }
+ }
+ }
+}
+
+/*
+ * Montgomery reduction. Expects x to be a little-endian array of 2*len
+ * BignumInts whose value satisfies 0 <= x < rn (where r = 2^(len *
+ * BIGNUM_INT_BITS) is the Montgomery base). Returns in the same array
+ * a value x' which is congruent to xr^{-1} mod n, and satisfies 0 <=
+ * x' < n.
+ *
+ * 'n' and 'mninv' should be little-endian arrays of 'len' BignumInts
+ * each, containing respectively n and the multiplicative inverse of
+ * -n mod r.
+ *
+ * 'tmp' is an array of BignumInt used as scratch space, of length at
+ * least 3*len + mul_compute_scratch(len).
+ */
+static void monty_reduce(BignumInt *x, const BignumInt *n,
+ const BignumInt *mninv, BignumInt *tmp, int len)
+{
+ int i;
+ BignumInt carry;
+
+ /*
+ * Multiply x by (-n)^{-1} mod r. This gives us a value m such
+ * that mn is congruent to -x mod r. Hence, mn+x is an exact
+ * multiple of r, and is also (obviously) congruent to x mod n.
+ */
+ internal_mul_low(x, mninv, tmp, len, tmp + 3*len);
+
+ /*
+ * Compute t = (mn+x)/r in ordinary, non-modular, integer
+ * arithmetic. By construction this is exact, and is congruent mod
+ * n to x * r^{-1}, i.e. the answer we want.
+ *
+ * The following multiply leaves that answer in the _most_
+ * significant half of the 'x' array, so then we must shift it
+ * down.
+ */
+ internal_mul(tmp, n, tmp+len, len, tmp + 3*len);
+ carry = internal_add(x, tmp+len, x, 2*len);
+ for (i = 0; i < len; i++)
+ x[i] = x[len + i], x[len + i] = 0;
+
+ /*
+ * Reduce t mod n. This doesn't require a full-on division by n,
+ * but merely a test and single optional subtraction, since we can
+ * show that 0 <= t < 2n.
+ *
+ * Proof:
+ * + we computed m mod r, so 0 <= m < r.
+ * + so 0 <= mn < rn, obviously
+ * + hence we only need 0 <= x < rn to guarantee that 0 <= mn+x < 2rn
+ * + yielding 0 <= (mn+x)/r < 2n as required.
+ */
+ if (!carry) {
+ for (i = len; i-- > 0; )
+ if (x[i] != n[i])
+ break;