From a86e33af1ea11503350b37dbf64a7518c27b583e Mon Sep 17 00:00:00 2001 From: mdw Date: Fri, 10 Dec 1999 23:26:40 +0000 Subject: [PATCH] Karatsuba-Ofman multiplication algorithm. --- mpx-kmul.c | 354 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mpx.h | 37 ++++++- tests/mpx | 46 +++++++- 3 files changed, 432 insertions(+), 5 deletions(-) create mode 100644 mpx-kmul.c diff --git a/mpx-kmul.c b/mpx-kmul.c new file mode 100644 index 0000000..d97700f --- /dev/null +++ b/mpx-kmul.c @@ -0,0 +1,354 @@ +/* -*-c-*- + * + * $Id: mpx-kmul.c,v 1.1 1999/12/10 23:23:51 mdw Exp $ + * + * Karatsuba's multiplication algorithm + * + * (c) 1999 Straylight/Edgeware + */ + +/*----- Licensing notice --------------------------------------------------* + * + * This file is part of Catacomb. + * + * Catacomb is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Catacomb is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with Catacomb; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + */ + +/*----- Revision history --------------------------------------------------* + * + * $Log: mpx-kmul.c,v $ + * Revision 1.1 1999/12/10 23:23:51 mdw + * Karatsuba-Ofman multiplication algorithm. + * + */ + +/*----- Header files ------------------------------------------------------*/ + +#include + +#include "mpx.h" + +/*----- Tweakables --------------------------------------------------------*/ + +/* --- @KARATSUBA_CUTOFF@ --- * + * + * If either of the arguments to @mpx_kmul@ contains this number of words or + * fewer, the job is dumped out to @mpx_umul@ instead. Reduce the size when + * testing, to ensure better coverage. + */ + +#ifdef TEST_RIG +# undef KARATSUBA_CUTOFF +# define KARATSUBA_CUTOFF 2 +#endif + +/*----- Addition macros ---------------------------------------------------*/ + +#define UADD(dv, av, avl) do { \ + mpw *_dv = (dv); \ + const mpw *_av = (av), *_avl = (avl); \ + mpw _c = 0; \ + \ + while (_av < _avl) { \ + mpw _a, _b; \ + mpd _x; \ + _a = *_av++; \ + _b = *_dv; \ + _x = (mpd)_a + (mpd)_b + _c; \ + *_dv++ = MPW(_x); \ + _c = _x >> MPW_BITS; \ + } \ + while (_c) { \ + mpd _x = (mpd)*_dv + (mpd)_c; \ + *_dv++ = MPW(_x); \ + _c = _x >> MPW_BITS; \ + } \ +} while (0) + +#define UADD2(dv, dvl, av, avl, bv, bvl) do { \ + mpw *_dv = (dv), *_dvl = (dvl); \ + const mpw *_av = (av), *_avl = (avl); \ + const mpw *_bv = (bv), *_bvl = (bvl); \ + mpw _c = 0; \ + \ + while (_av < _avl || _bv < _bvl) { \ + mpw _a, _b; \ + mpd _x; \ + _a = (_av < _avl) ? *_av++ : 0; \ + _b = (_bv < _bvl) ? *_bv++ : 0; \ + _x = (mpd)_a + (mpd)_b + _c; \ + *_dv++ = MPW(_x); \ + _c = _x >> MPW_BITS; \ + } \ + *_dv++ = _c; \ + while (_dv < _dvl) \ + *_dv++ = 0; \ +} while (0) + +#define USUB(dv, av, avl) do { \ + mpw *_dv = (dv); \ + const mpw *_av = (av), *_avl = (avl); \ + mpw _c = 0; \ + \ + while (_av < _avl) { \ + mpw _a, _b; \ + mpd _x; \ + _a = *_av++; \ + _b = *_dv; \ + _x = (mpd)_b - (mpd)_a - _c; \ + *_dv++ = MPW(_x); \ + if (_x >> MPW_BITS) \ + _c = 1; \ + else \ + _c = 0; \ + } \ + while (_c) { \ + mpd _x = (mpd)*_dv - (mpd)_c; \ + *_dv++ = MPW(_x); \ + if (_x >> MPW_BITS) \ + _c = 1; \ + else \ + _c = 0; \ + } \ +} while (0) + +/*----- Main code ---------------------------------------------------------*/ + +/* --- @mpx_kmul@ --- * + * + * Arguments: @mpw *dv, *dvl@ = pointer to destination buffer + * @const mpw *av, *avl@ = pointer to first argument + * @const mpw *bv, *bvl@ = pointer to second argument + * @mpw *sv, *svl@ = pointer to scratch workspace + * + * Returns: --- + * + * Use: Multiplies two multiprecision integers using Karatsuba's + * algorithm. This is rather faster than traditional long + * multiplication (e.g., @mpx_umul@) on large numbers, although + * more expensive on small ones. + * + * The destination must be twice as large as the larger + * argument. The scratch space must be twice as large as the + * larger argument, plus the magic number @KARATSUBA_SLOP@. + * (Actually, a number of words proportional to the depth of + * recursion, but since recusion is strongly bounded by memory, + * I can replace it with a constant as long as it's `big + * enough'.) + */ + +void mpx_kmul(mpw *dv, mpw *dvl, + const mpw *av, const mpw *avl, + const mpw *bv, const mpw *bvl, + mpw *sv, mpw *svl) +{ + const mpw *avm, *bvm; + size_t m; + + /* --- Dispose of easy cases to @mpx_umul@ --- * + * + * Karatsuba is only a win on large numbers, because of all the + * recursiveness and bookkeeping. The recursive calls make a quick check + * to see whether to bottom out to @mpx_umul@ which should help quite a + * lot, but sometimes the only way to know is to make sure... + */ + + MPX_SHRINK(av, avl); + MPX_SHRINK(bv, bvl); + + if (avl - av <= KARATSUBA_CUTOFF || bvl - bv <= KARATSUBA_CUTOFF) { + mpx_umul(dv, dvl, av, avl, bv, bvl); + return; + } + + /* --- How the algorithm works --- * + * + * Let %$A = xb + y$% and %$B = ub + v$%. Then, simply by expanding, %$AB + * = x u b^2 + b(x v + y u) + y v$%. That's not helped any, because I've + * got four multiplications, each four times easier than the one I started + * with. However, note that I can rewrite the coefficient of %$b$% as + * %$xv + yu = (x + y)(u + v) - xu - yv$%. The terms %$xu$% and %$yv$% + * I've already calculated, and that leaves only one more multiplication to + * do. So now I have three multiplications, each four times easier, and + * that's a win. + */ + + /* --- First things --- * + * + * Sort out where to break the factors in half. I'll choose the midpoint + * of the largest one, since this minimizes the amount of work I have to do + * most effectively. + */ + + if (avl - av > bvl - bv) { + m = (avl - av + 1) >> 1; + avm = av + m; + if (bvl - bv > m) + bvm = bv + m; + else + bvm = bvl; + } else { + m = (bvl - bv + 1) >> 1; + bvm = bv + m; + if (avl - av > m) + avm = av + m; + else + avm = avl; + } + + /* --- Sort out the middle term --- * + * + * I'm going to keep track of the carry by hand rather than pass it down to + * the next level, because it means multiplication by one or zero, which I + * can do easily myself. + */ + + { + unsigned f = 0; + enum { + carry_a = 1, + carry_b = 2 + }; + + mpw *bsv = sv + m, *ssv = bsv + m; + mpw *rdv = dv + m, *rdvl = rdv + 2 * m; + + UADD2(sv, bsv + 1, av, avm, avm, avl); + if (*bsv) + f |= carry_a; + UADD2(bsv, ssv + 1, bv, bvm, bvm, bvl); + if (*ssv) + f |= carry_b; + MPX_ZERO(dv, rdv); + if (m > KARATSUBA_CUTOFF) + mpx_kmul(rdv, rdvl, sv, bsv, bsv, ssv, ssv, svl); + else + mpx_umul(rdv, rdvl, sv, bsv, bsv, ssv); + MPX_ZERO(rdvl, dvl); + rdv += m; rdvl += m; + if (f & carry_b) + UADD(rdv, sv, bsv); + if (f & carry_a) + UADD(rdv, bsv, ssv); + if (!(~f & (carry_a | carry_b))) + MPX_UADDN(rdv + m, rdvl, 1); + } + + /* --- Sort out the other two terms --- */ + + { + mpw *ssv = sv + 2 * m; + mpw *tdv = dv + m; + mpw *rdv = tdv + m; + + if (m > KARATSUBA_CUTOFF) + mpx_kmul(sv, ssv, avm, avl, bvm, bvl, ssv, svl); + else + mpx_umul(sv, ssv, avm, avl, bvm, bvl); + UADD(rdv, sv, ssv); + USUB(tdv, sv, ssv); + + if (m > KARATSUBA_CUTOFF) + mpx_kmul(sv, ssv, av, avm, bv, bvm, ssv, svl); + else + mpx_umul(sv, ssv, av, avm, bv, bvm); + USUB(tdv, sv, ssv); + UADD(dv, sv, ssv); + } +} + +/*----- Test rig ----------------------------------------------------------*/ + +#ifdef TEST_RIG + +#include +#include + +#include "mpscan.h" + +#define ALLOC(v, vl, sz) do { \ + size_t _sz = (sz); \ + mpw *_vv = xmalloc(MPWS(_sz)); \ + mpw *_vvl = _vv + _sz; \ + (v) = _vv; \ + (vl) = _vvl; \ +} while (0) + +#define LOAD(v, vl, d) do { \ + const dstr *_d = (d); \ + mpw *_v, *_vl; \ + ALLOC(_v, _vl, MPW_RQ(_d->len)); \ + mpx_loadb(_v, _vl, _d->buf, _d->len); \ + (v) = _v; \ + (vl) = _vl; \ +} while (0) + +#define MAX(x, y) ((x) > (y) ? (x) : (y)) + +static void dumpmp(const char *msg, const mpw *v, const mpw *vl) +{ + fputs(msg, stderr); + MPX_SHRINK(v, vl); + while (v < vl) + fprintf(stderr, " %08lx", (unsigned long)*--vl); + fputc('\n', stderr); +} + +static int umul(dstr *v) +{ + mpw *a, *al; + mpw *b, *bl; + mpw *c, *cl; + mpw *d, *dl; + mpw *s, *sl; + size_t m; + int ok = 1; + + LOAD(a, al, &v[0]); + LOAD(b, bl, &v[1]); + LOAD(c, cl, &v[2]); + m = MAX(al - a, bl - b) + 1; + ALLOC(d, dl, 2 * m); + ALLOC(s, sl, 2 * m + 32); + + mpx_kmul(d, dl, a, al, b, bl, s, sl); + if (MPX_UCMP(d, dl, !=, c, cl)) { + fprintf(stderr, "\n*** umul failed\n"); + dumpmp(" a", a, al); + dumpmp(" b", b, bl); + dumpmp("expected", c, cl); + dumpmp(" result", d, dl); + ok = 0; + } + + free(a); free(b); free(c); free(d); free(s); + return (ok); +} + +static test_chunk defs[] = { + { "umul", umul, { &type_hex, &type_hex, &type_hex, 0 } }, + { 0, 0, { 0 } } +}; + +int main(int argc, char *argv[]) +{ + test_run(argc, argv, defs, SRCDIR"/tests/mpx"); + return (0); +} + +#endif + +/*----- That's all, folks -------------------------------------------------*/ diff --git a/mpx.h b/mpx.h index 96bb358..7d8a2cd 100644 --- a/mpx.h +++ b/mpx.h @@ -1,6 +1,6 @@ /* -*-c-*- * - * $Id: mpx.h,v 1.5 1999/11/20 22:23:27 mdw Exp $ + * $Id: mpx.h,v 1.6 1999/12/10 23:23:51 mdw Exp $ * * Low level multiprecision arithmetic * @@ -30,6 +30,9 @@ /*----- Revision history --------------------------------------------------* * * $Log: mpx.h,v $ + * Revision 1.6 1999/12/10 23:23:51 mdw + * Karatsuba-Ofman multiplication algorithm. + * * Revision 1.5 1999/11/20 22:23:27 mdw * Add function versions of some low-level macros with wider use. * @@ -47,8 +50,8 @@ * */ -#ifndef MPX_H -#define MPX_H +#ifndef CATACOMB_MPX_H +#define CATACOMB_MPX_H #ifdef __cplusplus extern "C" { @@ -71,7 +74,7 @@ #include -#ifndef MPW_H +#ifndef CATACOMB_MPW_H # include "mpw.h" #endif @@ -509,6 +512,32 @@ extern void mpx_umlan(mpw */*dv*/, mpw */*dvl*/, extern void mpx_usqr(mpw */*dv*/, mpw */*dvl*/, const mpw */*av*/, const mpw */*avl*/); +/* --- @mpx_kmul@ --- * + * + * Arguments: @mpw *dv, *dvl@ = pointer to destination buffer + * @const mpw *av, *avl@ = pointer to first argument + * @const mpw *bv, *bvl@ = pointer to second argument + * @mpw *sv, *svl@ = pointer to scratch workspace + * + * Returns: --- + * + * Use: Multiplies two multiprecision integers using Karatsuba's + * algorithm. This is rather faster than traditional long + * multiplication (e.g., @mpx_umul@) on large numbers, although + * more expensive on small ones. + * + * The destination and scratch buffers must be twice as large as + * the larger argument. + */ + +#define KARATSUBA_CUTOFF 16 +#define KARATSUBA_SLOP 32 + +extern void mpx_kmul(mpw */*dv*/, mpw */*dvl*/, + const mpw */*av*/, const mpw */*avl*/, + const mpw */*bv*/, const mpw */*bvl*/, + mpw */*sv*/, mpw */*svl*/); + /* --- @mpx_udiv@ --- * * * Arguments: @mpw *qv, *qvl@ = quotient vector base and limit diff --git a/tests/mpx b/tests/mpx index 45f7705..a9490cd 100644 --- a/tests/mpx +++ b/tests/mpx @@ -1,6 +1,6 @@ # Test vectors for low-level MP functions # -# $Id: mpx,v 1.2 1999/11/14 13:53:12 mdw Exp $ +# $Id: mpx,v 1.3 1999/12/10 23:26:40 mdw Exp $ # --- Load-store tests --- # @@ -490,6 +490,50 @@ umul { 08d95fc1d6dd6b9423c7bb033598df0c 6c03f5958677efd383509141bf257375 03bbd76f19ba19e3f255c24063f6384d4ac913d9e582392589a525195bcc547c; + + # --- Karatsuba regression --- + # + # This bug was caused by kmul (a) choosing the split point too low on + # odd-sized inputs and (b) not allocating enough workspace. + + 21a9269d7b8b63cf18faa933b3c868ba1e8cb3f00b57e197709abf96eeb9bf12e8fe22b3 + 0144c992b68e3ca712678215d5bc968702ccfea17717737ba501a38d26fa5091ba + 2ab495f91afd7c36f85ece6fd58577f995de88d62a98a07c6d9e3500ae67b0f100bc709d1f30894662774d0cadfba091788c427cc6f4bacb26e42cf92f6e4494e03c990e; + + # --- Larger number tests for Karatsuba --- + + 416e63549e2cf08fb225058b3545cb4a47cbf9 + de38c473c27f7bdef02a084192b3e17f435cf7 + 38cc3c7f360737411df7b52a222a3672c6e0d39f0a868479176a6143e1129d44d5aa61be493f; + + aa20b1355073f21c57530d2f90bc40e47ac463 + 8315dfa60e97ff3dab7a6f61fcec2cd5b6f127 + 571d43fda6ce14a78534ac72c50b58738d62630766a59a7cec1a63433e499b1b5eac5ef71e15; + + f641594177c8c364d922c659a8f7ae0460c7d74b266c8cc258ad5f + 5948dd29fc5172c37c31da6957779a1bebe452d8deba26c5d3d390 + 55e2cf27aa49f938584dca4044d944077e226206c6f8c7688e8760f3b5c106413fd0ef4b63a97991da86fd113ff4822a41f76913d270; + + a4170f55dfa135c4bdd3a921a8c1567eebc6b799fb62b0dd27b089 + 7b7d619e07a5d01427348c05605f67196b2923b074787c375977ac + 4f277232c75290f0c5fed384dba2aaa23fe4a360ea63ee45fb6c0134b36a09a9163f3c767d498b8dcd31e5deaef386d4a9b7d85b4b0c; + + 250e7a0c7035df81429572d3f772720e723b710d54b9eb5f16814117980f0559bf12b82e00c5b3904e + 1be3c88b01ce53a70c12f74dcc247823846ac6c06a9cb41b86794900006045f05e29da23b81523aa9f + 04097fbcf75616fef7b6b91680963f7e0cf1bf72bc5f453e46136fc92b20ae8a30d7ae7965f8271de854442b93562ebaa9ee09fba4a7b79ef8b26718b12424419dc301496dc0d6cfb04e4f7a3a3729046c72; + + af3148a72dcf1340f6b5a3b2fc1cacd7e6f9e60a13de5c91c37bb850f0e930683c2dc96882a9f62b76 + 48603cc656908b34c70ed826da8c3414d5845100f53cb6f9f370a4c7708a9b8ffd787537048f89493a + 3187b8818bf644805c880d189bbf606ca23e01431cf5b3b633db48a1202aa346f6a0e3958c7264fa1de2d92660345e820f4f3659cf0040a28fb9b725f7ab83610c9c056062326f776ce871696eede0507ebc; + + 0b1cc934a2f6244b93c8ac10881de20349d133642ac19fa0be3acacbf4429d0ec7bbad2f41534a693647b7c02e683cde249e36f008fb68e96de65c8a268eb1ea + 12444899c13f0ee85a4f47024f06d8a5746f0d9fec02e57c6d87a7bba17c1454fe6387bff5b96e38ce6142b9eebe249865c52b617b8966e6f93b16f612a91155 + cafcf1ea9f16a56f9ac4635d58992a789c51b6d7e53b5e1d8b59d5c1850c5c6ac2297839af44b29d5cf440772f98fd9d090ccec2d56adeb113bd3459d620b0b02679b72f3170ea8e2bd4486eebf2496d8be01cfe86923e5bbeb6f91582dfb95c6ef0a52cbc068081dc363b31dbd2ed80e3267d973ea39f82e276002eab9cb2; + + 3ba5a9c550b8cf6c3b87cc106b6551221a0dc90ac193ebcc526e4e5f53cf012fa6e05b155dcb3c4c0e1a90a01062a67ec434f6744195349194770711ea836a8b + 54f04f121d22db842523e9bf75727d5b0e9ef17e6d727918894927fade87ceeb2106684c4af7c49653425e29f7a91abf8adec4de2ca499df2534644397e454ac + 13ca59703f4c087c16a9a7bc7022904a37a469c1d0fd9fa7ffabe8f7d887fe1572c0bf5c75fde6913b565f8106bba9c26c9bbce190a9b8967112d74c0ac3d4ff9d2a385b96833e3c456d5601c74d8d2c9fff35abc60e7cc15d7c680f20757c13a415f1b8fbe3c6c32434aa36c528473dd20ea39f0e5ee22d1cd23040900d3164; + } usqr { -- 2.11.0