X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/16efb197c59c4b4cfaef7b2f23bd39f70176dd9e..a1a9ee0a7240087e202a7855e470573de0e59c09:/math/mpmont.c

diff --git a/math/mpmont.c b/math/mpmont.c
index 1f9143b6..2ed1e113 100644
--- a/math/mpmont.c
+++ b/math/mpmont.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+#include "dispatch.h"
 #include "mp.h"
 #include "mpmont.h"
 
@@ -40,6 +42,222 @@
 
 /* #define MPMONT_DISABLE */
 
+#define MPMONT_KTHRESH (16*MPK_THRESH)
+
+/*----- Low-level implementation ------------------------------------------*/
+
+#ifndef MPMONT_DISABLE
+
+/* --- @redccore@ --- *
+ *
+ * Arguments: @mpw *dv, *dvl@ = base and limit of source/destination
+ *            @const mpw *mv@ = base of modulus %$m$%
+ *            @size_t n@ = length of modulus
+ *            @const mpw *mi@ = base of REDC coefficient %$m'$%
+ *
+ * Returns:   ---
+ *
+ * Use:       Let %$a$% be the input operand.  Store in %$d$% the value
+ *            %$a + (m' a \bmod R) m$%.  The destination has space for at
+ *            least %$2 n + 1$% words of result.
+ */
+
+CPU_DISPATCH(static, (void), void, redccore,
+             (mpw *dv, mpw *dvl, const mpw *mv, size_t n, const mpw *mi),
+             (dv, dvl, mv, n, mi), pick_redccore, simple_redccore);
+
+static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv,
+                            size_t n, const mpw *mi)
+{
+  mpw mi0 = *mi;
+  size_t i;
+
+  for (i = 0; i < n; i++) {
+    MPX_UMLAN(dv, dvl, mv, mv + n, MPW(*dv*mi0));
+    dv++;
+  }
+}
+
+#define MAYBE_REDC4(impl)                                               \
+  extern void mpxmont_redc4_##impl(mpw *dv, mpw *dvl, const mpw *mv,    \
+                                   size_t n, const mpw *mi);            \
+  static void maybe_redc4_##impl(mpw *dv, mpw *dvl, const mpw *mv,      \
+                                 size_t n, const mpw *mi)               \
+  {                                                                     \
+    if (n%4) simple_redccore(dv, dvl, mv, n, mi);                       \
+    else mpxmont_redc4_##impl(dv, dvl, mv, n, mi);                      \
+  }
+
+#if CPUFAM_X86
+  MAYBE_REDC4(x86_sse2)
+  MAYBE_REDC4(x86_avx)
+#endif
+
+#if CPUFAM_AMD64
+  MAYBE_REDC4(amd64_sse2)
+  MAYBE_REDC4(amd64_avx)
+#endif
+
+static redccore__functype *pick_redccore(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+                     cpu_feature_p(CPUFEAT_X86_AVX));
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
+                     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+#if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+                     cpu_feature_p(CPUFEAT_X86_AVX));
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
+                     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpmont_reduce, simple_redccore);
+}
+
+/* --- @mulcore@ --- *
+ *
+ * Arguments: @mpw *dv, *dvl@ = base and limit of source/destination
+ *            @const mpw *av, *avl@ = base and limit of first multiplicand
+ *            @const mpw *bv, *bvl@ = base and limit of second multiplicand
+ *            @const mpw *mv@ = base of modulus %$m$%
+ *            @size_t n@ = length of modulus
+ *            @const mpw *mi@ = base of REDC coefficient %$m'$%
+ *
+ * Returns:   ---
+ *
+ * Use:       Let %$a$% and %$b$% be the multiplicands, and let
+ *            %$w = a b$%.  Store in %$d$% the value
+ *            %$w + (m' w \bmod R) m$%.
+ */
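+
+/* A small decimal analogue may make the formula concrete (illustrative
+ * only, not part of the original source).  Take %$m = 17$% and
+ * %$R = 100$%, so %$m' = 47$%, since
+ * %$17 \cdot 47 = 799 \equiv -1 \pmod{100}$%.  With %$a = 3$% and
+ * %$b = 4$% we have %$w = 12$%; then %$m' w \bmod R = 64$%, and
+ * %$w + 64 m = 12 + 1088 = 1100$%, a multiple of %$R$% as required,
+ * with %$1100/R = 11 = w R^{-1} \bmod m$%.
+ */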
+
+CPU_DISPATCH(static, (void), void, mulcore,
+             (mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+              const mpw *bv, const mpw *bvl, const mpw *mv,
+              size_t n, const mpw *mi),
+             (dv, dvl, av, avl, bv, bvl, mv, n, mi),
+             pick_mulcore, simple_mulcore);
+
+static void simple_mulcore(mpw *dv, mpw *dvl,
+                           const mpw *av, const mpw *avl,
+                           const mpw *bv, const mpw *bvl,
+                           const mpw *mv, size_t n, const mpw *mi)
+{
+  mpw ai, b0, y, mi0 = *mi;
+  const mpw *tv, *tvl;
+  const mpw *mvl = mv + n;
+  size_t i = 0;
+
+  /* --- Initial setup --- */
+
+  MPX_ZERO(dv, dvl);
+  if (avl - av > bvl - bv) {
+    tv = av; av = bv; bv = tv;
+    tvl = avl; avl = bvl; bvl = tvl;
+  }
+  b0 = *bv;
+
+  /* --- Multiply, until we run out of multiplicand --- */
+
+  while (i < n && av < avl) {
+    ai = *av++;
+    y = MPW((*dv + ai*b0)*mi0);
+    MPX_UMLAN(dv, dvl, bv, bvl, ai);
+    MPX_UMLAN(dv, dvl, mv, mvl, y);
+    dv++; i++;
+  }
+
+  /* --- Continue reducing until we run out of modulus --- */
+
+  while (i < n) {
+    y = MPW(*dv*mi0);
+    MPX_UMLAN(dv, dvl, mv, mvl, y);
+    dv++; i++;
+  }
+}
+
+#define MAYBE_MUL4(impl)                                                \
+  extern void mpxmont_mul4_##impl(mpw *dv,                              \
+                                  const mpw *av, const mpw *bv,         \
+                                  const mpw *mv,                        \
+                                  size_t n, const mpw *mi);             \
+  static void maybe_mul4_##impl(mpw *dv, mpw *dvl,                      \
+                                const mpw *av, const mpw *avl,          \
+                                const mpw *bv, const mpw *bvl,          \
+                                const mpw *mv, size_t n, const mpw *mi) \
+  {                                                                     \
+    size_t an = avl - av, bn = bvl - bv;                                \
+    if (n%4 || an != n || bn != n)                                      \
+      simple_mulcore(dv, dvl, av, avl, bv, bvl, mv, n, mi);             \
+    else {                                                              \
+      mpxmont_mul4_##impl(dv, av, bv, mv, n, mi);                       \
+      MPX_ZERO(dv + 2*n + 1, dvl);                                      \
+    }                                                                   \
+  }
+
+#if CPUFAM_X86
+  MAYBE_MUL4(x86_sse2)
+  MAYBE_MUL4(x86_avx)
+#endif
+
+#if CPUFAM_AMD64
+  MAYBE_MUL4(amd64_sse2)
+  MAYBE_MUL4(amd64_avx)
+#endif
+
+static mulcore__functype *pick_mulcore(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+                     cpu_feature_p(CPUFEAT_X86_AVX));
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
+                     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+#if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+                     cpu_feature_p(CPUFEAT_X86_AVX));
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
+                     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpmont_mul, simple_mulcore);
+}
+
+/* --- @finish@ --- *
+ *
+ * Arguments: @const mpmont *mm@ = pointer to a Montgomery reduction
+ *              context
+ *            @mp *d@ = pointer to mostly-reduced operand
+ *
+ * Returns:   ---
+ *
+ * Use:       Applies the finishing touches to Montgomery reduction.  The
+ *            operand @d@ is a multiple of %$R$% at this point, so it
+ *            needs to be shifted down; the result might need a further
+ *            subtraction to get it into the right interval; and we may
+ *            need to do an additional subtraction if %$d$% is negative.
+ */
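+
+/* Continuing the decimal analogue from above (%$m = 17$%, %$R = 100$%,
+ * %$m' = 47$%; illustrative only): reducing %$a = 1690$% gives
+ * %$m' a \bmod R = 30$% and %$a + 30 m = 2200$%.  Shifting down by a
+ * factor of %$R$% leaves 22, which still exceeds %$m$%, so the final
+ * subtraction fires: %$22 - 17 = 5 = a R^{-1} \bmod m$%.
+ */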
+
+static void finish(const mpmont *mm, mp *d)
+{
+  mpw *dv = d->v, *dvl = d->vl;
+  size_t n = mm->n;
+
+  memmove(dv, dv + n, MPWS(dvl - (dv + n)));
+  dvl -= n;
+
+  if (MPX_UCMP(dv, dvl, >=, mm->m->v, mm->m->vl))
+    mpx_usub(dv, dvl, dv, dvl, mm->m->v, mm->m->vl);
+
+  if (d->f & MP_NEG) {
+    mpx_usub(dv, dvl, mm->m->v, mm->m->vl, dv, dvl);
+    d->f &= ~MP_NEG;
+  }
+
+  d->vl = dvl;
+  MP_SHRINK(d);
+}
+
+#endif
+
 /*----- Reduction and multiplication --------------------------------------*/
 
 /* --- @mpmont_create@ --- *
@@ -90,6 +308,7 @@ int mpmont_create(mpmont *mm, mp *m)
   mp_build(&r, r2->v + n, r2->vl);
   mm->mi = mp_modinv(MP_NEW, m, &r);
   mm->mi = mp_sub(mm->mi, &r, mm->mi);
+  MP_ENSURE(mm->mi, n);
 
   /* --- Discover the values %$R \bmod m$% and %$R^2 \bmod m$% --- */
 
@@ -122,7 +341,7 @@ void mpmont_destroy(mpmont *mm)
 
 /* --- @mpmont_reduce@ --- *
  *
- * Arguments: @mpmont *mm@ = pointer to Montgomery reduction context
+ * Arguments: @const mpmont *mm@ = pointer to Montgomery reduction context
  *            @mp *d@ = destination
  *            @mp *a@ = source, assumed positive
  *
@@ -131,7 +350,7 @@ void mpmont_destroy(mpmont *mm)
 
 #ifdef MPMONT_DISABLE
 
-mp *mpmont_reduce(mpmont *mm, mp *d, mp *a)
+mp *mpmont_reduce(const mpmont *mm, mp *d, mp *a)
 {
   mp_div(0, &d, a, mm->m);
   return (d);
@@ -139,25 +358,22 @@ mp *mpmont_reduce(mpmont *mm, mp *d, mp *a)
 
 #else
 
-mp *mpmont_reduce(mpmont *mm, mp *d, mp *a)
+mp *mpmont_reduce(const mpmont *mm, mp *d, mp *a)
 {
   size_t n = mm->n;
 
   /* --- Check for serious Karatsuba reduction --- */
 
-  if (n > MPK_THRESH * 3) {
+  if (n > MPMONT_KTHRESH) {
     mp al;
     mpw *vl;
     mp *u;
 
-    if (MP_LEN(a) >= n)
-      vl = a->v + n;
-    else
-      vl = a->vl;
+    if (MP_LEN(a) >= n) vl = a->v + n;
+    else vl = a->vl;
     mp_build(&al, a->v, vl);
     u = mp_mul(MP_NEW, &al, mm->mi);
-    if (MP_LEN(u) > n)
-      u->vl = u->v + n;
+    if (MP_LEN(u) > n) u->vl = u->v + n;
     u = mp_mul(u, u, mm->m);
     d = mp_add(d, a, u);
     MP_ENSURE(d, n);
@@ -167,43 +383,16 @@ mp *mpmont_reduce(mpmont *mm, mp *d, mp *a)
 
   /* --- Otherwise do it the hard way --- */
 
   else {
-    mpw *dv, *dvl;
-    mpw *mv, *mvl;
-    mpw mi;
-    size_t k = n;
-
-    /* --- Initial conditioning of the arguments --- */
-
     a = MP_COPY(a);
-    if (d)
-      MP_DROP(d);
+    if (d) MP_DROP(d);
     d = a;
-    MP_DEST(d, 2 * n + 1, a->f);
-
-    dv = d->v; dvl = d->vl;
-    mv = mm->m->v; mvl = mm->m->vl;
-
-    /* --- Let's go to work --- */
-
-    mi = mm->mi->v[0];
-    while (k--) {
-      mpw u = MPW(*dv * mi);
-      MPX_UMLAN(dv, dvl, mv, mvl, u);
-      dv++;
-    }
+    MP_DEST(d, 2*mm->n + 1, a->f);
+    redccore(d->v, d->vl, mm->m->v, mm->n, mm->mi->v);
   }
 
   /* --- Wrap everything up --- */
 
-  memmove(d->v, d->v + n, MPWS(MP_LEN(d) - n));
-  d->vl -= n;
-  if (MPX_UCMP(d->v, d->vl, >=, mm->m->v, mm->m->vl))
-    mpx_usub(d->v, d->vl, d->v, d->vl, mm->m->v, mm->m->vl);
-  if (d->f & MP_NEG) {
-    mpx_usub(d->v, d->vl, mm->m->v, mm->m->vl, d->v, d->vl);
-    d->f &= ~MP_NEG;
-  }
-  MP_SHRINK(d);
+  finish(mm, d);
 
   return (d);
 }
@@ -211,7 +400,7 @@ mp *mpmont_reduce(mpmont *mm, mp *d, mp *a)
 
 /* --- @mpmont_mul@ --- *
  *
- * Arguments: @mpmont *mm@ = pointer to Montgomery reduction context
+ * Arguments: @const mpmont *mm@ = pointer to Montgomery reduction context
  *            @mp *d@ = destination
  *            @mp *a, *b@ = sources, assumed positive
  *
@@ -220,7 +409,7 @@ mp *mpmont_reduce(mpmont *mm, mp *d, mp *a)
 
 #ifdef MPMONT_DISABLE
 
-mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
+mp *mpmont_mul(const mpmont *mm, mp *d, mp *a, mp *b)
 {
   d = mp_mul(d, a, b);
   mp_div(0, &d, d, mm->m);
@@ -229,71 +418,21 @@ mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
 
 #else
 
-mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
+mp *mpmont_mul(const mpmont *mm, mp *d, mp *a, mp *b)
 {
-  if (mm->n > MPK_THRESH * 3) {
+  size_t n = mm->n;
+
+  if (n > MPMONT_KTHRESH) {
     d = mp_mul(d, a, b);
     d = mpmont_reduce(mm, d, d);
   } else {
-    mpw *dv, *dvl;
-    mpw *av, *avl;
-    mpw *bv, *bvl;
-    mpw *mv, *mvl;
-    mpw y;
-    size_t n, i;
-    mpw mi;
-
-    /* --- Initial conditioning of the arguments --- */
-
-    if (MP_LEN(a) > MP_LEN(b)) {
-      mp *t = a; a = b; b = t;
-    }
-    n = MP_LEN(mm->m);
-
-    a = MP_COPY(a);
-    b = MP_COPY(b);
-    MP_DEST(d, 2 * n + 1, a->f | b->f | MP_UNDEF);
-    dv = d->v; dvl = d->vl;
-    MPX_ZERO(dv, dvl);
-    av = a->v; avl = a->vl;
-    bv = b->v; bvl = b->vl;
-    mv = mm->m->v; mvl = mm->m->vl;
-    y = *bv;
-
-    /* --- Montgomery multiplication phase --- */
-
-    i = 0;
-    mi = mm->mi->v[0];
-    while (i < n && av < avl) {
-      mpw x = *av++;
-      mpw u = MPW((*dv + x * y) * mi);
-      MPX_UMLAN(dv, dvl, bv, bvl, x);
-      MPX_UMLAN(dv, dvl, mv, mvl, u);
-      dv++;
-      i++;
-    }
-
-    /* --- Simpler Montgomery reduction phase --- */
-
-    while (i < n) {
-      mpw u = MPW(*dv * mi);
-      MPX_UMLAN(dv, dvl, mv, mvl, u);
-      dv++;
-      i++;
-    }
-
-    /* --- Done --- */
-
-    memmove(d->v, dv, MPWS(dvl - dv));
-    d->vl -= dv - d->v;
-    if (MPX_UCMP(d->v, d->vl, >=, mm->m->v, mm->m->vl))
-      mpx_usub(d->v, d->vl, d->v, d->vl, mm->m->v, mm->m->vl);
-    if ((a->f ^ b->f) & MP_NEG)
-      mpx_usub(d->v, d->vl, mm->m->v, mm->m->vl, d->v, d->vl);
-    MP_SHRINK(d);
-    d->f = (a->f | b->f) & MP_BURN;
-    MP_DROP(a);
-    MP_DROP(b);
+    a = MP_COPY(a); b = MP_COPY(b);
+    MP_DEST(d, 2*n + 1, a->f | b->f | MP_UNDEF);
+    mulcore(d->v, d->vl, a->v, a->vl, b->v, b->vl,
+            mm->m->v, mm->n, mm->mi->v);
+    d->f = ((a->f | b->f) & MP_BURN) | ((a->f ^ b->f) & MP_NEG);
+    finish(mm, d);
+    MP_DROP(a); MP_DROP(b);
   }
 
   return (d);
@@ -305,6 +444,10 @@ mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
 
 #ifdef TEST_RIG
 
+#ifdef ENABLE_ASM_DEBUG
+#  include "regdump.h"
+#endif
+
 static int tcreate(dstr *v)
 {
   mp *m = *(mp **)v[0].buf;
@@ -400,7 +543,6 @@ static int tmul(dstr *v)
     mp_drop(mr);
   }
 
-  MP_DROP(m);
   MP_DROP(a);
   MP_DROP(b);
 
@@ -419,6 +561,9 @@ static test_chunk tests[] = {
 int main(int argc, char *argv[])
 {
   sub_init();
+#ifdef ENABLE_ASM_DEBUG
+  regdump_init();
+#endif
   test_run(argc, argv, tests, SRCDIR "/t/mpmont");
   return (0);
 }
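
For context, here is a minimal sketch of how the interface exercised by this
patch fits together, computing %$a b \bmod m$% through a Montgomery context.
It is illustrative rather than part of the patch: it assumes an in-tree
build (hence the unqualified includes) and that the @r2@ member of @mpmont@
holds %$R^2 \bmod m$%, as the creation code above suggests.

#include "mp.h"
#include "mpmont.h"

/* Multiply a by b mod m (m odd), using the mpmont interface: a sketch
 * only, with error handling elided.
 */
static mp *modmul(mp *a, mp *b, mp *m)
{
  mpmont mm;
  mp *d = MP_NEW;

  mpmont_create(&mm, m);            /* precomputes m', R mod m, R^2 mod m */
  d = mpmont_mul(&mm, d, a, mm.r2); /* to Montgomery form: a R^2 R^{-1} = a R */
  d = mpmont_mul(&mm, d, d, b);     /* (a R) b R^{-1} = a b (mod m) */
  mpmont_destroy(&mm);              /* release the precomputed values */
  return (d);
}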