From: Mark Wooding
Date: Wed, 18 May 2016 09:29:03 +0000 (+0100)
Subject: Merge branch 'mdw/cpu-dispatch'
X-Git-Tag: 2.2.3~1^2~22
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/1da1ed6a5815deef6c33d74f1eb3c856793df3e5?hp=1c8e76bd932668c8e26f9984ad008c0027dc1167

Merge branch 'mdw/cpu-dispatch'

* mdw/cpu-dispatch:
  Add support machinery for ARM hosts.
  base/dispatch.c: Add (unused) machinery for probing ELF auxiliary vector.
  Add support for AMD64 processors and Microsoft Windows.
  symm/rijndael-x86-aesni.S: Unify encryption and decryption with a macro.
  symm/rijndael-x86-aesni.S: Use xmm5 instead of xmm7.
  symm/*.S: Symbolic names for shuffles.
  symm/chacha-x86-sse2.S: Fix the register allocation comment.
  Preprocess the assembler files.
  configure.ac: Improve the host CPU family detection.
  base/dispatch.c: Indent some preprocessor definitions properly.
  Add a pile of debug output around the CPU dispatching machinery.
  base/dispatch.c: Add documentation for some internal functions.
  base/dispatch.c: Add in more useful section markers.
  Support Intel's AES Native Instructions where available on x86 hardware.
  symm/: New SSE2 implementations of Salsa20 and ChaCha.
  symm/salsa20.c, symm/salsa20-core.h: Permute input matrix for SIMD.
  debian/rules: Run tests twice, once without any detected CPU features.
  base/dispatch.c: Check operating system support for XMM registers.
  configure.ac, base/dispatch.[ch]: CPU-specific implementations.
  configure.ac: Arrange to have an assembler available.

Conflicts:
	configure.ac
	symm/Makefile.am
---

diff --git a/base/Makefile.am b/base/Makefile.am
index 35c86ff5..0ac43f2e 100644
--- a/base/Makefile.am
+++ b/base/Makefile.am
@@ -40,6 +40,10 @@ libbase_la_SOURCES += arena.c
 pkginclude_HEADERS += ct.h
 libbase_la_SOURCES += ct.c
 
+## CPU-specific dispatch.
+pkginclude_HEADERS += dispatch.h
+libbase_la_SOURCES += dispatch.c
+
 ## Acceptable key-size descriptions.
 pkginclude_HEADERS += keysz.h
 libbase_la_SOURCES += keysz.c keysz-conv.c
@@ -51,4 +55,7 @@ libbase_la_SOURCES += lmem.c
 ## Clearing secrets from memory.
 pkginclude_HEADERS += paranoia.h
 
+## Base definitions for assembler source.
+EXTRA_DIST += asm-common.h
+
 ###----- That's all, folks --------------------------------------------------

diff --git a/base/asm-common.h b/base/asm-common.h
new file mode 100644
index 00000000..0af9cb58
--- /dev/null
+++ b/base/asm-common.h
@@ -0,0 +1,221 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Base definitions for assembler source
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// General definitions.
+
+// Announcing an external function.
+#define FUNC(name) \
+	.globl F(name); \
+	TYPE_FUNC(name); \
+	.macro ENDFUNC; _ENDFUNC(name); .endm; \
+	FUNC_PREHOOK(name); \
+F(name): \
+	FUNC_POSTHOOK(name)
+
+// Marking the end of a function.
+#define _ENDFUNC(name) \
+	.purgem ENDFUNC; \
+	SIZE_OBJ(name); \
+	ENDFUNC_HOOK(name)
+
+///--------------------------------------------------------------------------
+/// ELF-specific hacking.
+
+#if __ELF__
+
+#if __PIC__ || __PIE__
+# define WANT_PIC 1
+#endif
+
+#define TYPE_FUNC(name) .type name, STT_FUNC
+
+#define SIZE_OBJ(name) .size name, . - name
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Windows-specific hacking.
+
+#if ABI_WIN
+
+#if CPUFAM_X86
+# define F(name) _##name
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86- and amd64-specific hacking.
+///
+/// It's (slightly) easier to deal with both of these in one go.
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 16
+
+// Don't use the wretched AT&T syntax.  It's festooned with pointless
+// punctuation, and all of the data movement is backwards.  Ugh!
+	.intel_syntax noprefix
+
+// Call external subroutine at ADDR, possibly via PLT.
+	.macro callext addr
+#if WANT_PIC
+	call	\addr@PLT
+#else
+	call	\addr
+#endif
+	.endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC && CPUFAM_X86
+# define NEED_GOT 1
+#endif
+#define GOTREG ebx		// Not needed in AMD64 so don't care.
+
+// Maybe load GOT address into GOT.
+	.macro ldgot got=GOTREG
+#if WANT_PIC && CPUFAM_X86
+	call	_where_am_i.\got
+	add	\got, offset _GLOBAL_OFFSET_TABLE_
+#endif
+	.endm
+
+// Maybe build a helper subroutine for `ldgot GOT'.
+	.macro gotaux got=GOTREG
+#if WANT_PIC && CPUFAM_X86
+	.align	16
+_where_am_i.\got :
+	mov	\got, [esp]
+	ret
+#endif
+	.endm
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
+	.macro leaext reg, addr, got=GOTREG
+#if WANT_PIC
+# if CPUFAM_X86
+	mov	\reg, [\got + \addr@GOT]
+# endif
+# if CPUFAM_AMD64
+	mov	\reg, \addr@GOTPCREL[rip]
+# endif
+#else
+# if CPUFAM_X86
+	mov	\reg, offset \addr
+# endif
+# if CPUFAM_AMD64
+	lea	\reg, \addr[rip]
+# endif
+#endif
+	.endm
+
+// Address expression (possibly using a base register, and a displacement)
+// referring to ADDR, which is within our module, maybe using GOT.
+#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
+#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
+#if CPUFAM_AMD64
+# define INTADDR__1(addr, got) addr + rip
+#elif WANT_PIC
+# define INTADDR__1(addr, got) got + addr@GOTOFF
+#else
+# define INTADDR__1(addr, got) addr
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// ARM-specific hacking.
+
+#if CPUFAM_ARM
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 4
+#define ENDFUNC_HOOK(name) .ltorg
+
+// Call external subroutine at ADDR, possibly via PLT.
+	.macro callext addr, cond=
+#if WANT_PIC
+	bl\cond	\addr(PLT)
+#else
+	bl\cond	\addr
+#endif
+	.endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC
+# define NEED_GOT 1
+#endif
+#define GOTREG r9
+
+// Maybe load GOT address into GOT.
+	.macro ldgot got=r9
+#if WANT_PIC
+	ldr	\got, =_GLOBAL_OFFSET_TABLE_ - . - 12
+	add	\got, pc, \got
+#endif
+	.endm
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
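+// (For illustration: under PIC, `leaext r1, SYM' expands, per the macro
+// below, to
+//	ldr	r1, =SYM(GOT)
+//	ldr	r1, [r9, r1]
+// i.e., fetch the offset of SYM's GOT slot from the literal pool, then
+// load SYM's address out of the GOT itself.)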
+	.macro leaext reg, addr, cond=, got=GOTREG
+#if WANT_PIC
+	ldr\cond	\reg, =\addr(GOT)
+	ldr\cond	\reg, [\got, \reg]
+#else
+	ldr\cond	\reg, =\addr
+#endif
+	.endm
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Final stuff.
+
+// Default values for the various hooks.
+#ifndef FUNC_PREHOOK
+# define FUNC_PREHOOK(name)
+#endif
+#ifndef FUNC_POSTHOOK
+# define FUNC_POSTHOOK(name)
+#endif
+#ifndef ENDFUNC_HOOK
+# define ENDFUNC_HOOK(name)
+#endif
+
+#ifndef F
+# define F(name) name
+#endif
+
+#ifndef TYPE_FUNC
+# define TYPE_FUNC(name)
+#endif
+
+#ifndef SIZE_OBJ
+# define SIZE_OBJ(name)
+#endif
+
+///----- That's all, folks --------------------------------------------------

diff --git a/base/dispatch.c b/base/dispatch.c
new file mode 100644
index 00000000..1b0ab2b1
--- /dev/null
+++ b/base/dispatch.c
@@ -0,0 +1,532 @@
+/* -*-c-*-
+ *
+ * CPU-specific dispatch
+ *
+ * (c) 2015 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include "config.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mLib/macros.h>
+
+#include "dispatch.h"
+
+/*----- Intel x86/AMD64 feature probing -----------------------------------*/
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+
+# define EFLAGS_ID (1u << 21)
+# define CPUID1D_SSE2 (1u << 26)
+# define CPUID1D_FXSR (1u << 24)
+# define CPUID1C_AESNI (1u << 25)
+
+struct cpuid { unsigned a, b, c, d; };
+
+/* --- @cpuid@ --- *
+ *
+ * Arguments:	@struct cpuid *cc@ = where to write the result
+ *		@unsigned a, c@ = EAX and ECX registers to set
+ *
+ * Returns:	---
+ *
+ * Use:		Minimal C wrapper around the x86 `CPUID' instruction.  Checks
+ *		that the instruction is actually available before invoking
+ *		it; fills the output structure with zero if it's not going to
+ *		work.
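+ *
+ *		(Illustrative use, as in @cpuid_features_p@ below:
+ *		@struct cpuid c; cpuid(&c, 1, 0);@ reads leaf 1, after
+ *		which @c.d & CPUID1D_SSE2@ is nonzero exactly when the CPU
+ *		claims SSE2.)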
+ */
+
+#ifdef __GNUC__
+# if CPUFAM_X86
+static __inline__ unsigned getflags(void)
+  { unsigned f; __asm__ ("pushf; popl %0" : "=g" (f)); return (f); }
+static __inline__ unsigned setflags(unsigned f)
+{
+  unsigned ff;
+  __asm__ ("pushf; pushl %1; popf; pushf; popl %0; popf"
+	   : "=g" (ff)
+	   : "g" (f));
+  return (ff);
+}
+# else
+static __inline__ unsigned long getflags(void)
+  { unsigned long f; __asm__ ("pushf; popq %0" : "=g" (f)); return (f); }
+static __inline__ unsigned long setflags(unsigned long f)
+{
+  unsigned long ff;
+  __asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
+	   : "=g" (ff)
+	   : "g" (f));
+  return (ff);
+}
+# endif
+#endif
+
+static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
+{
+#ifdef __GNUC__
+  unsigned f;
+#endif
+
+  cc->a = cc->b = cc->c = cc->d = 0;
+
+#ifdef __GNUC__
+  /* Stupid dance to detect whether the CPUID instruction is available. */
+  f = getflags();
+  if (!(setflags(f | EFLAGS_ID) & EFLAGS_ID) ||
+      setflags(f & ~EFLAGS_ID) & EFLAGS_ID) {
+    dispatch_debug("CPUID instruction not available");
+    return;
+  }
+  setflags(f);
+
+  /* Alas, EBX is magical in PIC code, so abuse ESI instead.  This isn't
+   * pretty, but it works.
+   */
+# if CPUFAM_X86
+  __asm__ ("pushl %%ebx; cpuid; movl %%ebx, %%esi; popl %%ebx"
+	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+	   : "a" (a) , "c" (c));
+# elif CPUFAM_AMD64
+  __asm__ ("pushq %%rbx; cpuid; movl %%ebx, %%esi; popq %%rbx"
+	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+	   : "a" (a) , "c" (c));
+# else
+#  error "I'm confused."
+# endif
+  dispatch_debug("CPUID(%08x, %08x) -> %08x, %08x, %08x, %08x",
+		 a, c, cc->a, cc->b, cc->c, cc->d);
+#else
+  dispatch_debug("GNU inline assembler not available; can't CPUID");
+#endif
+}
+
+static unsigned cpuid_maxleaf(void)
+  { struct cpuid c; cpuid(&c, 0, 0); return (c.a); }
+
+/* --- @cpuid_features_p@ --- *
+ *
+ * Arguments:	@unsigned dbits@ = bits to check in EDX
+ *		@unsigned cbits@ = bits to check in ECX
+ *
+ * Returns:	Nonzero if all the requested bits are set in the CPUID result
+ *		on leaf 1.
+ */
+
+static int cpuid_features_p(unsigned dbits, unsigned cbits)
+{
+  struct cpuid c;
+  if (cpuid_maxleaf() < 1) return (0);
+  cpuid(&c, 1, 0);
+  return ((c.d & dbits) == dbits && (c.c & cbits) == cbits);
+}
+
+/* --- @xmm_registers_available_p@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	Nonzero if the operating system has made the XMM registers
+ *		available for use.
+ */
+
+static int xmm_registers_available_p(void)
+{
+#ifdef __GNUC__
+  unsigned f;
+  /* This hack is by Agner Fog.  Use FXSAVE/FXRSTOR to figure out whether the
+   * XMM registers are actually alive.
+   */
+  if (!cpuid_features_p(CPUID1D_FXSR, 0)) return (0);
+# if CPUFAM_X86
+  __asm__ ("movl %%esp, %%edx; subl $512, %%esp; andl $~15, %%esp\n"
+	   "fxsave (%%esp)\n"
+	   "movl 160(%%esp), %%eax; xorl $0xaaaa5555, 160(%%esp)\n"
+	   "fxrstor (%%esp); fxsave (%%esp)\n"
+	   "movl 160(%%esp), %%ecx; movl %%eax, 160(%%esp)\n"
+	   "fxrstor (%%esp); movl %%edx, %%esp\n"
+	   "xorl %%ecx, %%eax"
+	   : "=a" (f)
+	   : /* no inputs */
+	   : "%ecx", "%edx");
+# elif CPUFAM_AMD64
+  __asm__ ("movq %%rsp, %%rdx; subq $512, %%rsp; andq $~15, %%rsp\n"
+	   "fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%eax; xorl $0xaaaa5555, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%ecx; movl %%eax, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); movq %%rdx, %%rsp\n"
+	   "xorl %%ecx, %%eax"
+	   : "=a" (f)
+	   : /* no inputs */
+	   : "%ecx", "%rdx");
+# else
+#  error "I'm confused."
+# endif
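+  /* (Bytes 160--175 of the FXSAVE image hold %xmm0, so offset 160 is its
+   * low 32 bits.  The sequence above flips bits in the saved copy, reloads
+   * it, saves again, and then repairs the damage; @f@ ends up zero
+   * precisely when the second save didn't reflect the change, i.e., when
+   * the operating system isn't keeping the XMM registers alive.)
+   */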
+  dispatch_debug("XMM registers %savailable", f ? "" : "not ");
+  return (f);
+#else
+  dispatch_debug("GNU inline assembler not available; can't check for XMM");
+  return (0);
+#endif
+}
+
+#endif
+
+/*----- General feature probing using auxiliary vectors -------------------*/
+
+/* Try to find the system's definitions for auxiliary vector entries. */
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#else
+# ifdef HAVE_LINUX_AUXVEC_H
+#  include <linux/auxvec.h>
+# endif
+# ifdef HAVE_ASM_HWCAP_H
+#  include <asm/hwcap.h>
+# endif
+#endif
+
+/* The type of entries in the auxiliary vector.  I'm assuming that `unsigned
+ * long' matches each platform's word length; if this is false then we'll
+ * need some host-specific tweaking here.
+ */
+union auxval { long i; unsigned long u; const void *p; };
+struct auxentry { unsigned long type; union auxval value; };
+
+/* Register each CPU family's interest in the auxiliary vector.  Make sure
+ * that the necessary entry types are defined.  This is primarily ordered by
+ * entry type to minimize duplication.
+ */
+#if defined(AT_HWCAP) && CPUFAM_ARMEL
+# define WANT_ANY 1
+# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap)
+#endif
+
+/* If we couldn't find any interesting entries then we can switch all of this
+ * machinery off.  Also do that if we have no means for atomic updates.
+ */
+#if WANT_ANY && CPU_DISPATCH_P
+
+/* The main output of this section is a bitmask of detected features.  The
+ * least significant bit will be set if we've tried to probe.  Always access
+ * this using `DISPATCH_LOAD' and `DISPATCH_STORE'.
+ */
+static unsigned hwcaps = 0;
+
+/* For each potentially interesting type which turned out not to exist or be
+ * wanted, define a dummy macro for the sake of the next step.
+ */
+#ifndef WANT_AT_HWCAP
+# define WANT_AT_HWCAP(_)
+#endif
+
+/* For each CPU family, define two lists.
+ *
+ *   * `WANTAUX' is a list of the `WANT_AT_MUMBLE' macros which the CPU
+ *     family tried to register interest in above.  Each entry contains the
+ *     interesting auxiliary vector entry type, the name of the union branch
+ *     for its value, and the name of the slot in `struct auxprobe' in which
+ *     to store the value.
+ *
+ *   * `CAPMAP' is a list describing the output features which the CPU family
+ *     intends to satisfy from the auxiliary vector.  Each entry contains a
+ *     feature name suffix, and the token name (for `check_env').
+ */
+#if CPUFAM_ARMEL
+# define WANTAUX(_) \
+	WANT_AT_HWCAP(_)
+# define CAPMAP(_) \
+	_(ARM_VFP, "arm:vfp") \
+	_(ARM_NEON, "arm:neon") \
+	_(ARM_V4, "arm:v4") \
+	_(ARM_D32, "arm:d32")
+#endif
+
+/* Build the bitmask for `hwcaps' from the `CAPMAP' list. */
+enum {
+  HFI_PROBED = 0,
+#define HFI__ENUM(feat, tok) HFI_##feat,
+  CAPMAP(HFI__ENUM)
+#undef HFI__ENUM
+  HFI__END
+};
+enum {
+  HF_PROBED = 1,
+#define HF__FLAG(feat, tok) HF_##feat = 1 << HFI_##feat,
+  CAPMAP(HF__FLAG)
+#undef HF__FLAG
+  HF__END
+};
+
+/* Build a structure in which we can capture the interesting data from the
+ * auxiliary vector.
+ */
+#define AUXUTYPE_i long
+#define AUXUTYPE_u unsigned long
+#define AUXUTYPE_p const void *
+struct auxprobe {
+#define AUXPROBE__SLOT(type, ubranch, slot) AUXUTYPE_##ubranch slot;
+  WANTAUX(AUXPROBE__SLOT)
+#undef AUXPROBE__SLOT
+};
+
+/* --- @probe_hwcaps@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	---
+ *
+ * Use:		Attempt to find the auxiliary vector (which is well hidden)
+ *		and discover interesting features from it.
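+ *
+ *		(The least significant bit, @HF_PROBED@, is set in the
+ *		result before anything else happens, so a mask with no
+ *		feature bits set is still distinguishable from `never
+ *		probed'.)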
+ */
+
+static void probe_hwcaps(void)
+{
+  unsigned hw = HF_PROBED;
+  struct auxprobe probed = { 0 };
+
+  /* Populate `probed' with the information we manage to retrieve from the
+   * auxiliary vector.  Slots we couldn't find are left zero-valued.
+   */
+#if defined(HAVE_GETAUXVAL)
+  /* Shiny new libc lets us request individual entry types.  This is almost
+   * too easy.
+   */
+# define CAP__GET(type, ubranch, slot) \
+	probed.slot = (AUXUTYPE_##ubranch)getauxval(type);
+  WANTAUX(CAP__GET)
+#else
+  /* Otherwise we're a bit stuck, really.  Modern Linux kernels make a copy
+   * of the vector available in `/proc' so we could try that.
+   *
+   * The usual place is stuck on the end of the environment vector, but that
+   * may well have moved, and we have no way of telling whether it has or
+   * whether there was ever an auxiliary vector there at all; so don't do
+   * that.
+   */
+  {
+    FILE *fp = 0;
+    unsigned char *p = 0, *q = 0;
+    const struct auxentry *a;
+    size_t sz, off, n;
+
+    /* Open the file and read it into a memory chunk. */
+    if ((fp = fopen("/proc/self/auxv", "rb")) == 0) goto clean;
+    sz = 4096; off = 0;
+    if ((p = malloc(sz)) == 0) goto clean;
+    for (;;) {
+      n = fread(p + off, 1, sz - off, fp);
+      off += n;
+      if (off < sz) break;
+      sz *= 2; if ((q = realloc(p, sz)) == 0) break;
+      p = q;
+    }
+
+    /* Work through the vector (or as much of it as we found) and extract the
+     * types we're interested in.
+     */
+    for (a = (const struct auxentry *)p,
+	   n = off/sizeof(struct auxentry);
+	 n--; a++) {
+      switch (a->type) {
+#define CAP__SWITCH(type, ubranch, slot) \
+	case type: probed.slot = a->value.ubranch; break;
+	WANTAUX(CAP__SWITCH)
+      }
+    }
+
+  clean:
+    if (p) free(p);
+    if (fp) fclose(fp);
+  }
+#endif
+
+  /* Each CPU family now has to pick through what was found and stashed in
+   * `probed', and set the appropriate flag bits in `hw'.
+   */
+#if CPUFAM_ARMEL
+  if (probed.hwcap & HWCAP_VFPv3) hw |= HF_ARM_VFP;
+  if (probed.hwcap & HWCAP_NEON) hw |= HF_ARM_NEON;
+  if (probed.hwcap & HWCAP_VFPD32) hw |= HF_ARM_D32;
+  if (probed.hwcap & HWCAP_VFPv4) hw |= HF_ARM_V4;
+#endif
+
+  /* Store the bitmask of features we probed for everyone to see. */
+  DISPATCH_STORE(hwcaps, hw);
+
+  /* Finally, make a report about the things we found.  (Doing this earlier
+   * will pointlessly widen the window in which multiple threads will do the
+   * above auxiliary-vector probing.)
+   */
+#define CAP__DEBUG(feat, tok) \
+  dispatch_debug("check auxv for feature `%s': %s", tok, \
+		 hw & HF_##feat ? "available" : "absent");
+  CAPMAP(CAP__DEBUG)
+#undef CAP__DEBUG
+}
+
+/* --- @get_hwcaps@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	A mask of hardware capabilities and other features, as probed
+ *		from the auxiliary vector.
+ */
+
+static unsigned get_hwcaps(void)
+{
+  unsigned hw;
+
+  DISPATCH_LOAD(hwcaps, hw);
+  if (!(hw & HF_PROBED)) { probe_hwcaps(); DISPATCH_LOAD(hwcaps, hw); }
+  return (hw);
+}
+
+#endif
+
+/*----- External interface ------------------------------------------------*/
+
+/* --- @dispatch_debug@ --- *
+ *
+ * Arguments:	@const char *fmt@ = a format string
+ *		@...@ = additional arguments
+ *
+ * Returns:	---
+ *
+ * Use:		Writes a formatted message to standard error if dispatch
+ *		debugging is enabled.
+ */
+
+void dispatch_debug(const char *fmt, ...)
+{
+  va_list ap;
+  const char *e = getenv("CATACOMB_CPUDISPATCH_DEBUG");
+
+  if (e && *e != 'n' && *e != '0') {
+    va_start(ap, fmt);
+    fputs("Catacomb CPUDISPATCH: ", stderr);
+    vfprintf(stderr, fmt, ap);
+    fputc('\n', stderr);
+    va_end(ap);
+  }
+}
+
+/* --- @check_env@ --- *
+ *
+ * Arguments:	@const char *ftok@ = feature token
+ *
+ * Returns:	Zero if the feature is forced off; positive if it's forced
+ *		on; negative if the user hasn't decided.
+ *
+ * Use:		Checks the environment variable `CATACOMB_CPUFEAT' for the
+ *		feature token @ftok@.  The variable, if it exists, should be
+ *		a space-separated sequence of `+tok' and `-tok' items.  These
+ *		tokens may end in `*', which matches any suffix.
+ */
+
+static int IGNORABLE check_env(const char *ftok)
+{
+  const char *p, *q, *pp;
+  int d;
+
+  p = getenv("CATACOMB_CPUFEAT");
+  if (!p) return (-1);
+
+  for (;;) {
+    while (isspace((unsigned char)*p)) p++;
+    if (!*p) return (-1);
+    switch (*p) {
+      case '+': d = +1; p++; break;
+      case '-': d = 0; p++; break;
+      default: d = -1; break;
+    }
+    for (q = p; *q && !isspace((unsigned char)*q); q++);
+    if (d >= 0) {
+      for (pp = ftok; p < q && *pp && *p == *pp; p++, pp++);
+      if ((p == q && !*pp) || (*p == '*' && p + 1 == q)) return (d);
+    }
+    p = q;
+  }
+  return (-1);
+}
+
+/* --- @cpu_feature_p@ --- *
+ *
+ * Arguments:	@unsigned feat@ = a @CPUFEAT_...@ code
+ *
+ * Returns:	Nonzero if the feature is available.
+ */
+
+static int IGNORABLE
+  feat_debug(const char *ftok, const char *check, int verdict)
+{
+  if (verdict >= 0) {
+    dispatch_debug("feature `%s': %s -> %s", ftok, check,
+		   verdict ? "available" : "absent");
+  }
+  return (verdict);
+}
+
+int cpu_feature_p(int feat)
+{
+  int IGNORABLE f;
+  IGNORE(f);
+#define CASE_CPUFEAT(feat, ftok, cond) case CPUFEAT_##feat: \
+    if ((f = feat_debug(ftok, "environment override", \
+			check_env(ftok))) >= 0) \
+      return (f); \
+    else \
+      return (feat_debug(ftok, "runtime probe", cond));
+
+  switch (feat) {
+#if CPUFAM_X86 || CPUFAM_AMD64
+    CASE_CPUFEAT(X86_SSE2, "x86:sse2",
+		 xmm_registers_available_p() &&
+		 cpuid_features_p(CPUID1D_SSE2, 0));
+    CASE_CPUFEAT(X86_AESNI, "x86:aesni",
+		 xmm_registers_available_p() &&
+		 cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
+#endif
+#ifdef CAPMAP
+# define FEATP__CASE(feat, tok) \
+	CASE_CPUFEAT(feat, tok, get_hwcaps() & HF_##feat)
+  CAPMAP(FEATP__CASE)
+#undef FEATP__CASE
+#endif
+  default:
+    dispatch_debug("denying unknown feature %d", feat);
+    return (0);
+  }
+#undef CASE_CPUFEAT
+}
+
+/*----- That's all, folks -------------------------------------------------*/

diff --git a/base/dispatch.h b/base/dispatch.h
new file mode 100644
index 00000000..1e463bdc
--- /dev/null
+++ b/base/dispatch.h
@@ -0,0 +1,193 @@
+/* -*-c-*-
+ *
+ * CPU-specific dispatch
+ *
+ * (c) 2015 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+#ifndef CATACOMB_DISPATCH_H
+#define CATACOMB_DISPATCH_H
+
+#ifdef __cplusplus
+  extern "C" {
+#endif
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <mLib/macros.h>
+
+/*----- Macros ------------------------------------------------------------*/
+
+/* --- Atomic data access machinery --- *
+ *
+ * If they're available, use GCC's `__atomic_*' intrinsics.  If that doesn't
+ * work and we're using one of a small number of processors I'm sure won't
+ * mind, then just stick with simple memory access.  Otherwise turn
+ * dispatching off, because it probably isn't thread-safe.
+ */
+
+#if GCC_VERSION_P(4, 7)
+# define CPU_DISPATCH_P 1
+# define DISPATCH_LOAD(g, v) \
+	((v) = __atomic_load_n(&(g), __ATOMIC_RELAXED))
+# define DISPATCH_STORE(g, v) \
+	(__atomic_store_n(&(g), (v), __ATOMIC_RELAXED))
+#elif defined(__i386__) || defined(__amd64__) || \
+      defined(__arm__) || defined(__aarch64__) || \
+      defined(__mips__)
+# define CPU_DISPATCH_P 1
+# define DISPATCH_LOAD(g, v) ((v) = (g))
+# define DISPATCH_STORE(g, v) ((g) = (v))
+#endif
+
+/* --- A simple hack --- */
+
+#ifndef EMPTY
+# define EMPTY
+#endif
+
+/* --- @CPU_DISPATCH@ --- *
+ *
+ * Arguments:	@stcls@ = storage class for the main @ext@ function
+ *			(typically either @static@ or @EMPTY@)
+ *		@rtn@ = prefix for tail-calling a function of the appropriate
+ *			type (either @(void)@ or @return@)
+ *		@ret@ = return type for the function
+ *		@ext@ = name for the main function (other names are derived
+ *			from this)
+ *		@argdecls@ = parenthesis-enclosed list of argument types
+ *		@args@ = parenthesis-enclosed list of argument names only
+ *		@pick@ = function to select appropriate implementation
+ *		@dflt@ = fallback implementation
+ *
+ * Use:		Main machinery for CPU-specific dispatching.
+ *
+ *		The macro defines a function
+ *
+ *			@stcls ret ext argdecls@
+ *
+ *		The first time @ext@ is called, it will invoke @pick@ to
+ *		select and return a pointer to an appropriate
+ *		implementation for the runtime environment.  Subsequent calls
+ *		to @ext@ will (usually) call this preferred implementation
+ *		directly.
+ *
+ *		Some target platforms may not be able to establish the
+ *		necessary function pointer in a threadsafe way.  On such
+ *		platforms, the dispatch machinery is disabled and @ext@ will
+ *		simply call @dflt@.
+ *
+ *		Some additional declarations are made.  As a convenience,
+ *		@ext__functype@ is the function type of @ext@.  Declarations
+ *		are made for @pick@ and @dflt@, as @static@ functions.
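+ *
+ *		By way of a minimal usage sketch (the names here are made up
+ *		for illustration; the real callers, e.g., in @symm/chacha.c@
+ *		below, have exactly this shape):
+ *
+ *		  CPU_DISPATCH(static, return, int, madd,
+ *			       (int x, int y, int z), (x, y, z),
+ *			       pick_madd, simple_madd)
+ *		  static int simple_madd(int x, int y, int z)
+ *		    { return (x*y + z); }
+ *		  extern madd__functype madd_sse2; /* hypothetical asm version */
+ *		  static madd__functype *pick_madd(void)
+ *		  {
+ *		    DISPATCH_PICK_COND(madd, madd_sse2,
+ *				       cpu_feature_p(CPUFEAT_X86_SSE2));
+ *		    DISPATCH_PICK_FALLBACK(madd, simple_madd);
+ *		  }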
+ */
+
+#ifdef CPU_DISPATCH_P
+
+#define CPU_DISPATCH(stcls, rtn, ret, ext, argdecls, args, pick, dflt) \
+									\
+typedef ret ext##__functype argdecls;					\
+static ret dflt argdecls;						\
+static ret ext##__dispatch argdecls;					\
+static ext##__functype *pick(void);					\
+static ext##__functype *ext##__ptr = ext##__dispatch;			\
+									\
+static ret ext##__dispatch argdecls					\
+{									\
+  ext##__functype *f = pick();						\
+  DISPATCH_STORE(ext##__ptr, f);					\
+  rtn f args;								\
+}									\
+									\
+stcls ret ext argdecls							\
+{									\
+  ext##__functype *f;							\
+  DISPATCH_LOAD(ext##__ptr, f);						\
+  rtn f args;								\
+}
+
+#else
+
+#define CPU_DISPATCH(stcls, rtn, ret, ext, argdecls, args, pick, dflt) \
+									\
+typedef ret ext##__functype argdecls;					\
+static ret dflt argdecls;						\
+static ext##__functype *pick(void) IGNORABLE;				\
+									\
+stcls ret ext argdecls { rtn dflt args; }
+
+#endif
+
+/* --- Some macros for producing useful debugging --- */
+
+#define DISPATCH_PICK_COND(what, func, cond) do {			\
+  if (cond) {								\
+    dispatch_debug("picked `%s' for `%s'", #func, #what);		\
+    return (func);							\
+  }									\
+} while (0)
+#define DISPATCH_PICK_FALLBACK(what, func) do {				\
+  dispatch_debug("using default `%s'", #what);				\
+  return (func);							\
+} while (0)
+
+/*----- Functions provided ------------------------------------------------*/
+
+/* --- @dispatch_debug@ --- *
+ *
+ * Arguments:	@const char *fmt@ = a format string
+ *		@...@ = additional arguments
+ *
+ * Returns:	---
+ *
+ * Use:		Writes a formatted message to standard error if dispatch
+ *		debugging is enabled.
+ */
+
+extern void dispatch_debug(const char */*fmt*/, ...);
+
+/* --- @cpu_feature_p@ --- *
+ *
+ * Arguments:	@unsigned feat@ = a @CPUFEAT_...@ code
+ *
+ * Returns:	Nonzero if the feature is available.
+ */
+
+enum {
+  CPUFEAT_X86_SSE2,			/* Streaming SIMD Extensions 2 */
+  CPUFEAT_X86_AESNI,			/* AES Native Instructions */
+  CPUFEAT_ARM_VFP,			/* VFP floating-point (v3 or v4) */
+  CPUFEAT_ARM_NEON,			/* Advanced SIMD (v1 or v2) */
+  CPUFEAT_ARM_V4,			/* VFPv4 and/or SIMD v2 */
+  CPUFEAT_ARM_D32			/* 32 double registers, not 16 */
+};
+
+extern int cpu_feature_p(int /*feat*/);
+
+/*----- That's all, folks -------------------------------------------------*/
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif

diff --git a/configure.ac b/configure.ac
index c3e55ca4..715a5c28 100644
--- a/configure.ac
+++ b/configure.ac
@@ -32,6 +32,7 @@ AC_INIT([catacomb], AUTO_VERSION, [mdw@distorted.org.uk])
 AC_CONFIG_SRCDIR([catacomb.pc.in])
 AC_CONFIG_AUX_DIR([config])
 AM_INIT_AUTOMAKE([foreign parallel-tests color-tests subdir-objects])
+AC_CANONICAL_HOST
 
 mdw_SILENT_RULES
 
 AC_PROG_CC
@@ -39,11 +40,102 @@ AX_CFLAGS_WARN_ALL
 AM_PROG_LIBTOOL
 mdw_LIBTOOL_VERSION_INFO
 
+AM_PROG_AS
+
 AC_PROG_YACC
 
 AC_SUBST(AM_CFLAGS)
 
 dnl--------------------------------------------------------------------------
+dnl Host-specific configuration.
+
+AC_MSG_CHECKING([CPU family and ABI])
+
+dnl The table of CPU families and ABIs which we might support.  Support is
+dnl not uniform: each dispatched function might or might not have an
+dnl implementation for any particular CPU/ABI combination.
+AC_DEFUN([catacomb_CPU_FAMILIES],
+  [$1([i[[3-6]]86,cygwin], [x86], [win])
+   $1([i[[3-6]]86,*], [x86], [sysv])
+   $1([x86_64,cygwin], [amd64], [win])
+   $1([x86_64,*], [amd64], [sysv])
+   $1([armv*,*-gnueabi | armv*,*-gnueabihf], [armel], [gnueabi])])
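+
+dnl (For illustration: on an `x86_64-unknown-linux-gnu' host, the case
+dnl statement generated from this table matches `x86_64,*' and sets
+dnl CPUFAM=amd64 and ABI=sysv; an `i686-pc-cygwin' host matches
+dnl `i[3-6]86,cygwin' and gets CPUFAM=x86 and ABI=win.)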
+
+dnl A utility to clear the `seen' flags, used so as to process each CPU or
+dnl ABI once.
+m4_define([catacomb_CLEAR_FLAGS],
+[m4_ifdef([catacomb_seen_cpu/$2],
+	  [m4_undefine([catacomb_seen_cpu/$2])])dnl
+m4_ifdef([catacomb_seen_abi/$3],
+	  [m4_undefine([catacomb_seen_abi/$3])])])
+
+dnl Identify the current host.
+case $host_cpu,$host_os in
+  m4_define([catacomb_CPU_CASE],
+    [$1) CPUFAM=$2 ABI=$3 ;;
+])
+  catacomb_CPU_FAMILIES([catacomb_CPU_CASE])
+  *) CPUFAM=nil ABI=nil ;;
+esac
+
+dnl Figure out the current CPU.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+case $CPUFAM in
+  m4_define([catacomb_DEFINE_CPU],
+    [m4_ifdef([catacomb_seen_cpu/$2], [],
+      [$2)
+	AC_DEFINE([CPUFAM_]m4_translit([$2], [a-z], [A-Z]), [1],
+		  [Define if host CPU family is \`$2\'.])
+	;;m4_define([catacomb_seen_cpu/$2], [t])])])
+  catacomb_CPU_FAMILIES([catacomb_DEFINE_CPU])
+  nil) ;;
+  *) AC_MSG_ERROR([BUG: unexpected cpufam \`$CPUFAM']) ;;
+esac
+AC_SUBST([CPUFAM])
+
+dnl Figure out the current ABI.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+case $ABI in
+  m4_define([catacomb_DEFINE_ABI],
+    [m4_ifdef([catacomb_seen_abi/$3], [],
+      [$3)
+	AC_DEFINE([ABI_]m4_translit([$3], [a-z], [A-Z]), [1],
+		  [Define if host ABI variant is \`$3\'.])
+	;;m4_define([catacomb_seen_abi/$3], [t])])])
+  catacomb_CPU_FAMILIES([catacomb_DEFINE_ABI])
+  nil) ;;
+  *) AC_MSG_ERROR([BUG: unexpected ABI \`$ABI']) ;;
+esac
+AC_SUBST([ABI])
+
+dnl Establish Automake conditions for things.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+m4_define([catacomb_COND_CPU],
+[m4_define([_CPU], m4_translit([$2], [a-z], [A-Z]))
+m4_define([_ABI], m4_translit([$3], [a-z], [A-Z]))
+AM_CONDITIONAL([CPUABI_]_CPU[_]_ABI, [test x$CPUFAM/$ABI = x$2/$3])
+m4_ifdef([catacomb_seen_cpu/$2], [],
+[AM_CONDITIONAL([CPUFAM_]_CPU, [test x$CPUFAM = x$2])dnl
+m4_define([catacomb_seen_cpu/$2], [t])])
+m4_ifdef([catacomb_seen_abi/$3], [],
+[AM_CONDITIONAL([ABI_]_ABI, [test x$ABI = x$3])dnl
+m4_define([catacomb_seen_abi/$3], [t])])])
+catacomb_CPU_FAMILIES([catacomb_COND_CPU])
+AM_CONDITIONAL([KNOWN_CPUFAM], [test x$CPUFAM != xnil])
+
+dnl Report on what we found.
+case $CPUFAM in
+  nil) AC_MSG_RESULT([not supported]) ;;
+  *) AC_MSG_RESULT([$CPUFAM/$ABI]) ;;
+esac
+
+dnl Some equipment wanted for checking CPU features at runtime.
+AC_CHECK_HEADERS([asm/hwcap.h])
+AC_CHECK_HEADERS([sys/auxv.h])
+AC_CHECK_HEADERS([linux/auxvec.h])
+AC_CHECK_FUNCS([getauxval])
+
+dnl--------------------------------------------------------------------------
 dnl C programming environment.
 
 dnl Find out if we're cross-compiling.

diff --git a/debian/rules b/debian/rules
index cec98bbe..e64e6b43 100755
--- a/debian/rules
+++ b/debian/rules
@@ -1,2 +1,6 @@
 #! /usr/bin/make -f
 %:; dh $@ --parallel -Bdebian/build
+
+override_dh_auto_test:
+	dh_auto_test --parallel -Bdebian/build
+	CATACOMB_CPUFEAT="-*" dh_auto_test --parallel -Bdebian/build

diff --git a/symm/Makefile.am b/symm/Makefile.am
index 69c1013f..1d3374f5 100644
--- a/symm/Makefile.am
+++ b/symm/Makefile.am
@@ -182,6 +182,12 @@ BLKCS += rc5
 
 ## Daemen and Rijmen's `Rijndael' block cipher, selected as AES.
 BLKCS += rijndael rijndael192 rijndael256
 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c
+if CPUFAM_X86
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
 nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c
 PRECOMPS += $(precomp)/symm/rijndael-tab.c
 PRECOMP_PROGS += rijndael-mktab
@@ -388,6 +394,12 @@ ALL_CIPHERS += seal
 EXTRA_DIST += salsa20-tvconv
 pkginclude_HEADERS += salsa20.h salsa20-core.h
 libsymm_la_SOURCES += salsa20.c
+if CPUFAM_X86
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
 TESTS += salsa20.t$(EXEEXT)
 ALL_CIPHERS += salsa20 salsa2012 salsa208
 ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208
@@ -414,6 +426,12 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV)
 ## Bernstein's `ChaCha' stream cipher.
 pkginclude_HEADERS += chacha.h chacha-core.h
 libsymm_la_SOURCES += chacha.c
+if CPUFAM_X86
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
 TESTS += chacha.t$(EXEEXT)
 EXTRA_DIST += t/chacha
 ALL_CIPHERS += chacha20 chacha12 chacha8

diff --git a/symm/chacha-core.h b/symm/chacha-core.h
index ad6b05f5..1c0efcd9 100644
--- a/symm/chacha-core.h
+++ b/symm/chacha-core.h
@@ -95,9 +95,12 @@
 
 /* The ChaCha feedforward step, used at the end of the core function.  Here,
  * @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated.  This is the same as Salsa20.
+ * updated.  This is the same as Salsa20, only without the final permutation.
  */
-#define CHACHA_FFWD(z, y) SALSA20_FFWD(z, y)
+#define CHACHA_FFWD(z, y) do {						\
+  int _i;								\
+  for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i];			\
+} while (0)
 
 /* Various numbers of rounds, unrolled.  Read from @y@, and write to @z@. */
 #define CHACHA_4R(z, y)							\

diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
new file mode 100644
index 00000000..f36bf90f
--- /dev/null
+++ b/symm/chacha-x86ish-sse2.S
@@ -0,0 +1,260 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of ChaCha
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
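+
+// (A reminder of how these shuffles act on a register holding doubleword
+// lanes [x0, x1, x2, x3]: `pshufd ..., ROTL' yields [x3, x0, x1, x2],
+// `ROT2' yields [x2, x3, x0, x1], and `ROTR' yields [x1, x2, x3, x0].)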
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch pentium4
+	.section .text
+
+FUNC(chacha_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work,
+	// but we'll run out of XMM registers and will need some properly
+	// aligned space which we'll steal from the stack.  I don't trust the
+	// stack pointer's alignment, so I'll have to mask the stack pointer,
+	// which in turn means I'll need to keep track of the old value.
+	// Hence I'm making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 [esp]
+
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 16
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
+	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  We
+	// only need to save a copy of the input for the feedforward at the
+	// end, so we might as well use memory rather than spill extra
+	// registers.  (We need an extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm5
+# define SAVE1 [rsp + 0]
+# define SAVE2 [rsp + 16]
+# define SAVE3 [rsp + 32]
+
+	sub	rsp, 48 + 8
+#endif
+
+	// First job is to slurp the matrix into XMM registers.  Be careful:
+	// the input matrix isn't likely to be properly aligned.
+	//
+	//	[ 0  1  2  3] (a, xmm0)
+	//	[ 4  5  6  7] (b, xmm1)
+	//	[ 8  9 10 11] (c, xmm2)
+	//	[12 13 14 15] (d, xmm3)
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
+
+	// Take a copy for later.  This one is aligned properly, by
+	// construction.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
+
+loop:
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, ROTL
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// The not-quite-transpose conveniently only involves reordering
+	// elements of individual rows, which can be done quite easily.  It
+	// doesn't involve any movement of elements between rows, or even
+	// renaming of the rows.
+	//
+	//	[ 0  1  2  3]      [ 0  1  2  3] (a, xmm0)
+	//	[ 4  5  6  7] -->  [ 5  6  7  4] (b, xmm1)
+	//	[ 8  9 10 11]      [10 11  8  9] (c, xmm2)
+	//	[12 13 14 15]      [15 12 13 14] (d, xmm3)
+	//
+	// The shuffles have quite high latency, so they've mostly been
+	// pushed upwards.  The remaining one can't be moved, though.
+	pshufd	xmm1, xmm1, ROTR
+
+	// Apply the diagonal quarterround to each of the columns
+	// simultaneously.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, ROTR
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// Finally, finish off undoing the transpose, and we're done for this
+	// doubleround.  Again, most of this was done above so we don't have
+	// to wait for the shuffles.
+	pshufd	xmm1, xmm1, ROTL
+
+	// Decrement the loop counter and see if we should go round again.
+	sub	NR, 2
+	ja	loop
+
+	// Almost there.  Firstly, the feedforward addition.
+	paddd	xmm0, SAVE0
+	paddd	xmm1, SAVE1
+	paddd	xmm2, SAVE2
+	paddd	xmm3, SAVE3
+
+	// And now we write out the result.  This one won't be aligned
+	// either.
+	movdqu	[OUT +  0], xmm0
+	movdqu	[OUT + 16], xmm1
+	movdqu	[OUT + 32], xmm2
+	movdqu	[OUT + 48], xmm3
+
+	// Tidy things up.
+#if CPUFAM_X86
+	mov	esp, ebp
+	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	add	rsp, 48 + 8
+#endif
+
+	// And with that, we're done.
+	ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------

diff --git a/symm/chacha.c b/symm/chacha.c
index e694ad22..0c8aa003 100644
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+
 #include
 #include
 
@@ -34,6 +36,7 @@
 #include "arena.h"
 #include "chacha.h"
 #include "chacha-core.h"
+#include "dispatch.h"
 #include "gcipher.h"
 #include "grand.h"
 #include "keysz.h"
@@ -59,9 +62,29 @@ const octet chacha_keysz[] = { KSZ_SET, 32, 16, 10, 0 };
 * the feedforward step.
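+ *
+ *		(On the first call, the machinery below picks either the
+ *		portable @simple_core@ or, when @cpu_feature_p@ reports
+ *		SSE2, the assembler @chacha_core_x86ish_sse2@; later calls
+ *		go straight to the chosen implementation.)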
*/ -static void core(unsigned r, const chacha_matrix src, chacha_matrix dest) +CPU_DISPATCH(static, (void), + void, core, (unsigned r, const chacha_matrix src, + chacha_matrix dest), + (r, src, dest), + pick_core, simple_core); + +static void simple_core(unsigned r, const chacha_matrix src, + chacha_matrix dest) { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); } +#if CPUFAM_X86 || CPUFAM_AMD64 +extern core__functype chacha_core_x86ish_sse2; +#endif + +static core__functype *pick_core(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif + DISPATCH_PICK_FALLBACK(chacha_core, simple_core); +} + /* --- @populate@ --- * * * Arguments: @chacha_matrix a@ = a matrix to fill in diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c index bfab63a2..b0505a66 100644 --- a/symm/rijndael-base.c +++ b/symm/rijndael-base.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -55,25 +58,14 @@ const octet rijndael_keysz[] = { KSZ_RANGE, RIJNDAEL_KEYSZ, 4, 32, 4 }; * Use: Low-level key-scheduling. */ -void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +static void simple_setup(rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk) { - unsigned nk, nr, nw; + unsigned nr = k->nr, nw; unsigned i, j, jj; const octet *p; uint32 ww; - /* --- Sort out the key size --- */ - - KSZ_ASSERT(rijndael, sz); - nk = sz / 4; - - /* --- Select the number of rounds --- */ - - nr = (nk > nb ? nk : nb) + 6; - if (nr < 10) - nr = 10; - k->nr = nr; - /* --- Fetch the first key words out --- */ p = buf; @@ -120,4 +112,42 @@ void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) k->wi[i] = k->w[j + jj++]; } +CPU_DISPATCH(static, EMPTY, void, setup, (rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk), + (k, nb, buf, nk), pick_setup, simple_setup) + +#if CPUFAM_X86 || CPUFAM_AMD64 +extern setup__functype rijndael_setup_x86ish_aesni; +#endif + +static setup__functype *pick_setup(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni, + cpu_feature_p(CPUFEAT_X86_AESNI)); +#endif + DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup); +} + +void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +{ + unsigned nk, nr; + + /* --- Sort out the key size --- */ + + KSZ_ASSERT(rijndael, sz); + nk = sz / 4; + + /* --- Select the number of rounds --- */ + + nr = (nk > nb ? nk : nb) + 6; + if (nr < 10) + nr = 10; + k->nr = nr; + + /* --- Do the main setup --- */ + + setup(k, nb, buf, nk); +} + /*----- That's all, folks -------------------------------------------------*/ diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S new file mode 100644 index 00000000..91fcc352 --- /dev/null +++ b/symm/rijndael-x86ish-aesni.S @@ -0,0 +1,604 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// AESNI-based implementation of Rijndael +/// +/// (c) 2015 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. 
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// External objects.
+
+	.globl	F(abort)
+	.globl	F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch .aes
+	.section .text
+
+/// The AESNI instructions implement a little-endian version of AES, but
+/// Catacomb's internal interface presents as big-endian so as to work better
+/// with things like GCM.  We therefore maintain the round keys in
+/// little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the AESNI instructions don't implement the
+/// larger-block versions of Rijndael, so we have to end-swap the keys if
+/// we're preparing for one of those.
+
+	// Useful constants.
+	.equ	maxrounds, 16		// maximum number of rounds
+	.equ	maxblksz, 32		// maximum block size, in bytes
+	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-schedule buffer
+
+	// Context structure.
+	.equ	nr, 0			// number of rounds
+	.equ	w, nr + 4		// encryption key words
+	.equ	wi, w + kbufsz		// decryption key words
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_x86ish_aesni)
+
+#if CPUFAM_X86
+	// Arguments are on the stack.  We'll need to stack the caller's
+	// register variables, but we'll manage.
+
+# define CTX ebp			// context pointer
+# define BLKSZ [esp + 24]		// block size
+
+# define SI esi				// source pointer
+# define DI edi				// destination pointer
+
+# define KSZ ebx			// key size
+# define KSZo ebx			// ... as address offset
+# define NKW edx			// total number of key words
+# define NKW_NEEDS_REFRESH 1		// ... needs recalculating
+# define RCON ecx			// round constants table
+# define LIM edx			// limit pointer
+# define LIMn edx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo eax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo edx			// ... as address offset
+
+	// Stack the caller's registers.
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	// Set up our own variables.
+	mov	CTX, [esp + 20]		// context base pointer
+	mov	SI, [esp + 28]		// key material
+	mov	KSZ, [esp + 32]		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// Arguments are in registers.  We have plenty, but, to be honest,
+	// the initial register allocation is a bit annoying.
+
+# define CTX r8				// context pointer
+# define BLKSZ r9d			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ edx			// key size
+# define KSZo rdx			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF r9d			// block size in bytes
+# define BLKOFFo r9			// ... as address offset
+
+	// Move arguments to more useful places.
+	mov	CTX, rdi		// context base pointer
+	mov	BLKSZ, esi		// block size in words
+	mov	SI, rdx			// key material
+	mov	KSZ, ecx		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments are in different registers, and they're a little tight.
+
+# define CTX r8				// context pointer
+# define BLKSZ edx			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ r9d			// key size
+# define KSZo r9			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo rdx			// ... as address offset
+
+	// We'll need the index registers, which belong to the caller in this
+	// ABI.
+	push	rsi
+	push	rdi
+
+	// Move arguments to more useful places.
+	mov	SI, r8			// key material
+	mov	CTX, rcx		// context base pointer
+#endif
+
+	// The initial round key material is taken directly from the input
+	// key, so copy it over.
+#if CPUFAM_AMD64 && ABI_SYSV
+	// We've been lucky.  We already have a copy of the context pointer
+	// in rdi, and the key size in ecx.
+	add	DI, w
+#else
+	lea	DI, [CTX + w]
+	mov	ecx, KSZ
+#endif
+	rep	movsd
+
+	// Find out other useful things.
+	mov	NKW, [CTX + nr]		// number of rounds
+	add	NKW, 1
+	imul	NKW, BLKSZ		// total key size in words
+#if !NKW_NEEDS_REFRESH
+	// If we can't keep NKW for later, then we use the same register for
+	// it and LIM, so this move is unnecessary.
+	mov	LIMn, NKW
+#endif
+	sub	LIMn, KSZ		// offset by the key size
+
+	// Find the round constants.
+	ldgot	ecx
+	leaext	RCON, rijndael_rcon, ecx
+
+	// Prepare for the main loop.
+	lea	SI, [CTX + w]
+	mov	eax, [SI + 4*KSZo - 4]	// most recent key word
+	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
+
+	// Main key expansion loop.  The first word of each key-length chunk
+	// needs special treatment.
+	//
+	// This is rather tedious because the Intel `AESKEYGENASSIST'
+	// instruction is very strangely shaped.  Firstly, it wants to
+	// operate on vast SSE registers, even though we're data-blocked from
+	// doing more than one operation at a time unless we're doing two key
+	// schedules simultaneously -- and even then we can't do more than
+	// two, because the instruction ignores two of its input words
+	// entirely, and produces two different outputs for each of the other
+	// two.  And secondly it insists on taking the magic round constant
+	// as an immediate, so it's kind of annoying if you're not
+	// open-coding the whole thing.  It's much easier to leave that as
+	// zero and XOR in the round constant by hand.
+9:	movd	xmm0, eax
+	pshufd	xmm0, xmm0, ROTR
+	aeskeygenassist xmm1, xmm0, 0
+	pshufd	xmm1, xmm1, ROTL
+	movd	eax, xmm1
+	xor	eax, [SI]
+	xor	al, [RCON]
+	inc	RCON
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// The next three words are simple...
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 2...)
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 3...)
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// Word 4.  If the key is /more/ than 6 words long, then we must
+	// apply a substitution here.
+	cmp	KSZ, 5
+	jb	9b
+	cmp	KSZ, 7
+	jb	0f
+	movd	xmm0, eax
+	pshufd	xmm0, xmm0, ROTL
+	aeskeygenassist xmm1, xmm0, 0
+	movd	eax, xmm1
+0:	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 5...)
+	cmp	KSZ, 6
+	jb	9b
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 6...)
+	cmp	KSZ, 7
+	jb	9b
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 7...)
+	cmp	KSZ, 8
+	jb	9b
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// Must be done by now.
+	jmp	9b
+
+	// Next job is to construct the decryption keys.  The keys for the
+	// first and last rounds don't need to be mangled, but the remaining
+	// ones do -- and they all need to be reordered too.
+	//
+	// The plan of action, then, is to copy the final encryption round's
+	// keys into place first, then to do each of the intermediate rounds
+	// in reverse order, and finally do the first round.
+	//
+	// Do all of the heavy lifting with SSE registers.  The order we're
+	// doing this in means that it's OK if we read or write too much, and
+	// there's easily enough buffer space for the over-enthusiastic reads
+	// and writes because the context has space for 32-byte blocks, which
+	// is our maximum and an exact fit for two SSE registers.
+8:	mov	NR, [CTX + nr]		// number of rounds
+#if NKW_NEEDS_REFRESH
+	mov	BLKOFF, BLKSZ
+	mov	LRK, NR
+	imul	LRK, BLKOFF
+#else
+	// If we retain NKW, then BLKSZ and BLKOFF are the same register
+	// because we won't need the former again.
+	mov	LRK, NKW
+	sub	LRK, BLKSZ
+#endif
+	lea	DI, [CTX + wi]
+	lea	SI, [CTX + w + 4*LRKo]	// last round's keys
+	shl	BLKOFF, 2		// block size (in bytes now)
+
+	// Copy the last encryption round's keys.
+	movdqu	xmm0, [SI]
+	movdqu	[DI], xmm0
+	cmp	BLKOFF, 16
+	jbe	9f
+	movdqu	xmm0, [SI + 16]
+	movdqu	[DI + 16], xmm0
+
+	// Update the loop variables and stop if we've finished.
+9:	add	DI, BLKOFFo
+	sub	SI, BLKOFFo
+	sub	NR, 1
+	jbe	0f
+
+	// Do another middle round's keys...
+	movdqu	xmm0, [SI]
+	aesimc	xmm0, xmm0
+	movdqu	[DI], xmm0
+	cmp	BLKOFF, 16
+	jbe	9b
+	movdqu	xmm0, [SI + 16]
+	aesimc	xmm0, xmm0
+	movdqu	[DI + 16], xmm0
+	jmp	9b
+
+	// Finally do the first encryption round.
+0:	movdqu	xmm0, [SI]
+	movdqu	[DI], xmm0
+	cmp	BLKOFF, 16
+	jbe	0f
+	movdqu	xmm0, [SI + 16]
+	movdqu	[DI + 16], xmm0
+
+	// If the block size is not exactly four words then we must end-swap
+	// everything.  We can use fancy SSE toys for this.
+0:	cmp	BLKOFF, 16
+	je	0f
+
+	// Find the byte-reordering table.
+	ldgot	ecx
+	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]
+
+#if NKW_NEEDS_REFRESH
+	// Calculate the number of subkey words again.  (It's a good job
+	// we've got a fast multiplier.)
+	mov	NKW, [CTX + nr]
+	add	NKW, 1
+	imul	NKW, BLKSZ
+#endif
+
+	// End-swap the encryption keys.
+	mov	ecx, NKW
+	lea	SI, [CTX + w]
+	call	endswap_block
+
+	// And the decryption keys.
+	mov	ecx, NKW
+	lea	SI, [CTX + wi]
+	call	endswap_block
+
+0:	// All done.
+#if CPUFAM_X86
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	pop	rdi
+	pop	rsi
+#endif
+	ret
+
+	.align	16
+endswap_block:
+	// End-swap ECX words starting at SI.  The end-swapping table is
+	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
+	movdqu	xmm1, [SI]
+	pshufb	xmm1, xmm5
+	movdqu	[SI], xmm1
+	add	SI, 16
+	sub	ecx, 4
+	ja	endswap_block
+	ret
+
+#undef CTX
+#undef BLKSZ
+#undef SI
+#undef DI
+#undef KSZ
+#undef KSZo
+#undef RCON
+#undef LIMn
+#undef LIM
+#undef NR
+#undef LRK
+#undef LRKo
+#undef BLKOFF
+#undef BLKOFFo
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Encrypting and decrypting blocks.
+
+	.macro	encdec	op, aes, koff
+FUNC(rijndael_\op\()_x86ish_aesni)
+
+	// Find the magic endianness-swapping table.
+	ldgot	ecx
+	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and need to be collected.  We
+	// don't have a shortage of registers.
+
+# define K ecx
+# define SRC edx
+# define DST edx
+# define NR eax
+
+	mov	K, [esp + 4]
+	mov	SRC, [esp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// Arguments come in registers.  All is good.
+
+# define K rdi
+# define SRC rsi
+# define DST rdx
+# define NR eax
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in different registers.
+
+# define K rcx
+# define SRC rdx
+# define DST r8
+# define NR eax
+#endif
+
+	// Initial setup.
+	movdqu	xmm0, [SRC]
+	pshufb	xmm0, xmm5
+	mov	NR, [K + nr]
+	add	K, \koff
+
+	// Initial whitening.
+	movdqu	xmm1, [K]
+	add	K, 16
+	pxor	xmm0, xmm1
+
+	// Dispatch to the correct code.
+	cmp	NR, 10
+	je	10f
+	jb	bogus
+	cmp	NR, 14
+	je	14f
+	ja	bogus
+	cmp	NR, 12
+	je	12f
+	jb	11f
+	jmp	13f
+
+	.align	2
+
+	// 14 rounds...
+14:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 13 rounds...
+13:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 12 rounds...
+12:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 11 rounds...
+11:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 10 rounds...
+10:	movdqu	xmm1, [K]
+	\aes	xmm0, xmm1
+
+	// 9 rounds...
+	movdqu	xmm1, [K + 16]
+	\aes	xmm0, xmm1
+
+	// 8 rounds...
+	movdqu	xmm1, [K + 32]
+	\aes	xmm0, xmm1
+
+	// 7 rounds...
+	movdqu	xmm1, [K + 48]
+	\aes	xmm0, xmm1
+
+	// 6 rounds...
+	movdqu	xmm1, [K + 64]
+	\aes	xmm0, xmm1
+
+	// 5 rounds...
+	movdqu	xmm1, [K + 80]
+	\aes	xmm0, xmm1
+
+	// 4 rounds...
+	movdqu	xmm1, [K + 96]
+	\aes	xmm0, xmm1
+
+	// 3 rounds...
+	movdqu	xmm1, [K + 112]
+	\aes	xmm0, xmm1
+
+	// 2 rounds...
+	movdqu	xmm1, [K + 128]
+	\aes	xmm0, xmm1
+
+	// Final round...
+	movdqu	xmm1, [K + 144]
+	\aes\()last xmm0, xmm1
+
+	// Unpermute the ciphertext block and store it.
+	pshufb	xmm0, xmm5
+#if CPUFAM_X86
+	mov	DST, [esp + 12]
+#endif
+	movdqu	[DST], xmm0
+
+	// And we're done.
+	ret
+
+#undef K
+#undef SRC
+#undef DST
+#undef NR
+
+ENDFUNC
+	.endm
+
+	encdec	eblk, aesenc, w
+	encdec	dblk, aesdec, wi
+
+///--------------------------------------------------------------------------
+/// Random utilities.
+
+	.align	16
+	// Abort the process because of a programming error.  Indirecting
Indirecting + // through this point serves several purposes: (a) by CALLing, rather + // than branching to, `abort', we can save the return address, which + // might at least provide a hint as to what went wrong; (b) we don't + // have conditional CALLs (and they'd be big anyway); and (c) we can + // write a HLT here as a backstop against `abort' being mad. +bogus: callext F(abort) +0: hlt + jmp 0b + + gotaux ecx + +///-------------------------------------------------------------------------- +/// Data tables. + + .align 16 +endswap_tab: + .byte 3, 2, 1, 0 + .byte 7, 6, 5, 4 + .byte 11, 10, 9, 8 + .byte 15, 14, 13, 12 + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael.c b/symm/rijndael.c index 9d8e739f..293f28da 100644 --- a/symm/rijndael.c +++ b/symm/rijndael.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -69,6 +72,39 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) * Use: Low-level block encryption and decryption. */ +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_eblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_eblk, simple_eblk) + +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_dblk, simple_dblk) + +#if CPUFAM_X86 || CPUFAM_AMD64 +extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni; +extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni; +#endif + +static rijndael_eblk__functype *pick_eblk(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni, + cpu_feature_p(CPUFEAT_X86_AESNI)); +#endif + DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk); +} + +static rijndael_dblk__functype *pick_dblk(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni, + cpu_feature_p(CPUFEAT_X86_AESNI)); +#endif + DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk); +} + #define DO(what, t, aa, bb, cc, dd, a, b, c, d, w) do { \ aa = what(t, a, b, c, d) ^ *w++; \ bb = what(t, b, c, d, a) ^ *w++; \ @@ -83,7 +119,7 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) dd = what(t, d, c, b, a) ^ *w++; \ } while (0) -void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; @@ -118,7 +154,7 @@ void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d; } -void rijndael_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; diff --git a/symm/salsa20-core.h b/symm/salsa20-core.h index 98efa72e..b27f2220 100644 --- a/symm/salsa20-core.h +++ b/symm/salsa20-core.h @@ -43,6 +43,28 @@ /*----- The Salsa20 core function -----------------------------------------*/ +/* It makes life somewhat easier if we don't actually store and maintain the + * input matrix in the textbook order. Instead, we rotate the columns other + * than the leftmost one upwards, so that the constants which were originally + * along the diagonal end up on the top row. 
We'll need to undo this
+ * permutation on output, but that's not too terrible an imposition.
+ *
+ * The permutation we're applying to the matrix elements is this:
+ *
+ *	[  0  1  2  3 ]		[  0  5 10 15 ]
+ *	[  4  5  6  7 ]	 -->	[  4  9 14  3 ]
+ *	[  8  9 10 11 ]		[  8 13  2  7 ]
+ *	[ 12 13 14 15 ]		[ 12  1  6 11 ]
+ *
+ * and as a result, we need to apply this inverse permutation to figure out
+ * which indices to use in the double-round function and elsewhere.
+ *
+ *	[  0 13 10  7 ]
+ *	[  4  1 14 11 ]
+ *	[  8  5  2 15 ]
+ *	[ 12  9  6  3 ]
+ */
+
 /* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
  * @c@, and @d@; and write to the corresponding elements of @z@.
  */
@@ -58,22 +80,31 @@
  */
 #define SALSA20_DR(z, y) do {						\
   SALSA20_QR(z, y,  0,  4,  8, 12);					\
-  SALSA20_QR(z, y,  5,  9, 13,  1);					\
-  SALSA20_QR(z, y, 10, 14,  2,  6);					\
-  SALSA20_QR(z, y, 15,  3,  7, 11);					\
-  SALSA20_QR(z, z,  0,  1,  2,  3);					\
-  SALSA20_QR(z, z,  5,  6,  7,  4);					\
-  SALSA20_QR(z, z, 10, 11,  8,  9);					\
-  SALSA20_QR(z, z, 15, 12, 13, 14);					\
+  SALSA20_QR(z, y,  1,  5,  9, 13);					\
+  SALSA20_QR(z, y,  2,  6, 10, 14);					\
+  SALSA20_QR(z, y,  3,  7, 11, 15);					\
+  SALSA20_QR(z, z,  0, 13, 10,  7);					\
+  SALSA20_QR(z, z,  1, 14, 11,  4);					\
+  SALSA20_QR(z, z,  2, 15,  8,  5);					\
+  SALSA20_QR(z, z,  3, 12,  9,  6);					\
 } while (0)
 
 /* The Salsa20 feedforward step, used at the end of the core function.  Here,
  * @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated.
+ * updated.  The result is rendered in canonical order, ready for output.
  */
 #define SALSA20_FFWD(z, y) do {						\
-  int _i;								\
-  for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i];			\
+  const uint32 *_y = (y);						\
+  uint32 *_z = (z);							\
+  int _t;								\
+  _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4];			\
+  _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12];			\
+  _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13];			\
+  _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t;	\
+  _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t;		\
+  _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t;		\
+  _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7];			\
+  _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t;	\
 } while (0)
 
 /* Various numbers of rounds, unrolled.  Read from @y@, and write to @z@. */
@@ -98,7 +129,7 @@
 
 /* Step the counter in the Salsa20 state matrix @a@. */
 #define SALSA20_STEP(a)							\
-  do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0)
+  do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0)
 
 /*----- Buffering and output ----------------------------------------------*
  *
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
new file mode 100644
index 00000000..a168d79a
--- /dev/null
+++ b/symm/salsa20-x86ish-sse2.S
@@ -0,0 +1,331 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch pentium4
+	.section .text
+
+FUNC(salsa20_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work, but
+	// we'll run out of XMM registers and will need some properly aligned
+	// space which we'll steal from the stack.  I don't trust the stack
+	// pointer's alignment, so I'll have to mask the stack pointer, which
+	// in turn means I'll need to keep track of the old value.  Hence I'm
+	// making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]
+
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 32
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
+	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  Two
+	// of them we need only for a copy of the input for the feedforward
+	// at the end; but the other two we want for the final permutation,
+	// so save the old values on the stack.  (We need an extra 8 bytes to
+	// align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+	sub	rsp, 64 + 8
+	movdqa	[rsp +  0], xmm6
+	movdqa	[rsp + 16], xmm7
+#endif
+
+	// First job is to slurp the matrix into XMM registers.  The words
+	// have already been permuted conveniently to make them line up
+	// better for SIMD processing.
+	//
+	// The textbook arrangement of the matrix is this.
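+	//
+	// (In these diagrams, C stands for a constant word, K for a key
+	// word, N for a nonce word, and T for a word of the 64-bit block
+	// counter.)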
+	//
+	//	[C K K K]
+	//	[K C N N]
+	//	[T T C K]
+	//	[K K K C]
+	//
+	// But we've rotated the columns up so that the main diagonal with
+	// the constants on it ends up in the first row, giving something
+	// more like
+	//
+	//	[C C C C]
+	//	[K T K K]
+	//	[T K K N]
+	//	[K K N K]
+	//
+	// so the transformation looks like this:
+	//
+	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
+	//	[ 4  5  6  7]	-->	[ 4  9 14  3] (b, xmm1)
+	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
+	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
+
+	// Take a copy for later.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
+
+0:
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
+
+	// b ^= (a + d) <<< 7
+	movdqa	xmm4, xmm0
+	paddd	xmm4, xmm3
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 7
+	psrld	xmm5, 25
+	por	xmm4, xmm5
+	pxor	xmm1, xmm4
+
+	// c ^= (b + a) <<< 9
+	movdqa	xmm4, xmm1
+	paddd	xmm4, xmm0
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 9
+	psrld	xmm5, 23
+	por	xmm4, xmm5
+	pxor	xmm2, xmm4
+
+	// d ^= (c + b) <<< 13
+	movdqa	xmm4, xmm2
+	paddd	xmm4, xmm1
+	pshufd	xmm1, xmm1, ROTL
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 13
+	psrld	xmm5, 19
+	por	xmm4, xmm5
+	pxor	xmm3, xmm4
+
+	// a ^= (d + c) <<< 18
+	movdqa	xmm4, xmm3
+	pshufd	xmm3, xmm3, ROTR
+	paddd	xmm4, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 18
+	psrld	xmm5, 14
+	por	xmm4, xmm5
+	pxor	xmm0, xmm4
+
+	// The transpose conveniently only involves reordering elements of
+	// individual rows, which can be done quite easily, and reordering
+	// the rows themselves, which is a trivial renaming.  It doesn't
+	// involve any movement of elements between rows.
+	//
+	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
+	//	[ 4  9 14  3]	-->	[ 1  6 11 12] (b, xmm3)
+	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
+	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
+	//
+	// The shuffles have quite high latency, so they've been pushed
+	// backwards into the main instruction list.
+
+	// Apply the row quarterround to each of the columns (yes!)
+	// simultaneously.
+
+	// b ^= (a + d) <<< 7
+	movdqa	xmm4, xmm0
+	paddd	xmm4, xmm1
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 7
+	psrld	xmm5, 25
+	por	xmm4, xmm5
+	pxor	xmm3, xmm4
+
+	// c ^= (b + a) <<< 9
+	movdqa	xmm4, xmm3
+	paddd	xmm4, xmm0
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 9
+	psrld	xmm5, 23
+	por	xmm4, xmm5
+	pxor	xmm2, xmm4
+
+	// d ^= (c + b) <<< 13
+	movdqa	xmm4, xmm2
+	paddd	xmm4, xmm3
+	pshufd	xmm3, xmm3, ROTL
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 13
+	psrld	xmm5, 19
+	por	xmm4, xmm5
+	pxor	xmm1, xmm4
+
+	// a ^= (d + c) <<< 18
+	movdqa	xmm4, xmm1
+	pshufd	xmm1, xmm1, ROTR
+	paddd	xmm4, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 18
+	psrld	xmm5, 14
+	por	xmm4, xmm5
+	pxor	xmm0, xmm4
+
+	// We had to undo the transpose ready for the next loop.  Again, push
+	// back the shuffles because they take a long time coming through.
+	// Decrement the loop counter and see if we should go round again.
+	// Later processors fuse this pair into a single uop.
+	sub	NR, 2
+	ja	0b
+
+	// Almost there.  Firstly, the feedforward addition, and then we have
+	// to write out the result.  Here we have to undo the permutation
+	// which was already applied to the input.  Shuffling has quite high
+	// latency, so arrange to start a new shuffle into a temporary as
+	// soon as we've written out the old value.
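+	//
+	// (In intrinsics terms, each four-instruction group below is,
+	// roughly,
+	//
+	//	xmm0 = _mm_add_epi32(xmm0, save0);	// feedforward
+	//	xmm4 = _mm_shuffle_epi32(xmm0, ROTR);	// start next shuffle
+	//	*(uint32 *)(out + 0) = _mm_cvtsi128_si32(xmm0);
+	//
+	// with the stores of the rotated copies interleaved afterwards, once
+	// their shuffles have completed.)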
+	paddd	xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+	movd	[OUT +  0], xmm0
+
+	paddd	xmm1, SAVE1
+	pshufd	xmm5, xmm1, ROTL
+	movd	[OUT + 16], xmm1
+
+	paddd	xmm2, SAVE2
+	pshufd	xmm6, xmm2, ROT2
+	movd	[OUT + 32], xmm2
+
+	paddd	xmm3, SAVE3
+	pshufd	xmm7, xmm3, ROTR
+	movd	[OUT + 48], xmm3
+
+	movd	[OUT +  4], xmm7
+	pshufd	xmm7, xmm3, ROT2
+	movd	[OUT + 24], xmm7
+	pshufd	xmm3, xmm3, ROTL
+	movd	[OUT + 44], xmm3
+
+	movd	[OUT +  8], xmm6
+	pshufd	xmm6, xmm2, ROTL
+	movd	[OUT + 28], xmm6
+	pshufd	xmm2, xmm2, ROTR
+	movd	[OUT + 52], xmm2
+
+	movd	[OUT + 12], xmm5
+	pshufd	xmm5, xmm1, ROTR
+	movd	[OUT + 36], xmm5
+	pshufd	xmm1, xmm1, ROT2
+	movd	[OUT + 56], xmm1
+
+	movd	[OUT + 20], xmm4
+	pshufd	xmm4, xmm0, ROT2
+	movd	[OUT + 40], xmm4
+	pshufd	xmm0, xmm0, ROTL
+	movd	[OUT + 60], xmm0
+
+	// Tidy things up.
+
+#if CPUFAM_X86
+	mov	esp, ebp
+	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	movdqa	xmm6, [rsp +  0]
+	movdqa	xmm7, [rsp + 16]
+	add	rsp, 64 + 8
+#endif
+
+	// And with that, we're done.
+	ret
+
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20.c b/symm/salsa20.c
index 4b35cbd7..40f28fc0 100644
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -7,11 +7,14 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+
 #include 
 #include 
 
 #include "arena.h"
+#include "dispatch.h"
 #include "gcipher.h"
 #include "grand.h"
 #include "keysz.h"
@@ -39,9 +42,29 @@ const octet salsa20_keysz[] = { KSZ_SET, 32, 16, 10, 0 };
  * the feedforward step.
  */
 
-static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest)
+CPU_DISPATCH(static, (void),
+	     void, core, (unsigned r, const salsa20_matrix src,
+			  salsa20_matrix dest),
+	     (r, src, dest),
+	     pick_core, simple_core);
+
+static void simple_core(unsigned r, const salsa20_matrix src,
+			salsa20_matrix dest)
 {
   SALSA20_nR(dest, src, r);
   SALSA20_FFWD(dest, src);
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype salsa20_core_x86ish_sse2;
+#endif
+
+static core__functype *pick_core(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
+}
+
 /* --- @populate@ --- *
  *
  * Arguments:	@salsa20_matrix a@ = a matrix to fill in
  *
@@ -61,33 +84,42 @@
 
   KSZ_ASSERT(salsa20, ksz);
 
-  a[ 1] = LOAD32_L(k +  0);
-  a[ 2] = LOAD32_L(k +  4);
+  /* Here's the pattern of key, constant, nonce, and counter pieces in the
+   * matrix, before and after our permutation.
+   *
+   *	[ C0 K0 K1 K2 ]		[ C0 C1 C2 C3 ]
+   *	[ K3 C1 N0 N1 ]	 -->	[ K3 T1 K7 K2 ]
+   *	[ T0 T1 C2 K4 ]		[ T0 K6 K1 N1 ]
+   *	[ K5 K6 K7 C3 ]		[ K5 K0 N0 K4 ]
+   */
+
+  a[13] = LOAD32_L(k +  0);
+  a[10] = LOAD32_L(k +  4);
   if (ksz == 10) {
-    a[ 3] = LOAD16_L(k + 8);
+    a[ 7] = LOAD16_L(k + 8);
     a[ 4] = 0;
   } else {
-    a[ 3] = LOAD32_L(k + 8);
+    a[ 7] = LOAD32_L(k + 8);
     a[ 4] = LOAD32_L(k + 12);
   }
   if (ksz <= 16) {
-    a[11] = a[ 1];
-    a[12] = a[ 2];
-    a[13] = a[ 3];
-    a[14] = a[ 4];
+    a[15] = a[13];
+    a[12] = a[10];
+    a[ 9] = a[ 7];
+    a[ 6] = a[ 4];
     a[ 0] = SALSA20_A128;
-    a[ 5] = SALSA20_B128;
-    a[10] = ksz == 10 ? SALSA20_C80 : SALSA20_C128;
-    a[15] = SALSA20_D128;
+    a[ 1] = SALSA20_B128;
+    a[ 2] = ksz == 10 ? 
SALSA20_C80 : SALSA20_C128; + a[ 3] = SALSA20_D128; } else { - a[11] = LOAD32_L(k + 16); + a[15] = LOAD32_L(k + 16); a[12] = LOAD32_L(k + 20); - a[13] = LOAD32_L(k + 24); - a[14] = LOAD32_L(k + 28); + a[ 9] = LOAD32_L(k + 24); + a[ 6] = LOAD32_L(k + 28); a[ 0] = SALSA20_A256; - a[ 5] = SALSA20_B256; - a[10] = SALSA20_C256; - a[15] = SALSA20_D256; + a[ 1] = SALSA20_B256; + a[ 2] = SALSA20_C256; + a[ 3] = SALSA20_D256; } } @@ -130,8 +162,8 @@ void salsa20_setnonce(salsa20_ctx *ctx, const void *nonce) { const octet *n = nonce; - ctx->a[6] = LOAD32_L(n + 0); - ctx->a[7] = LOAD32_L(n + 4); + ctx->a[14] = LOAD32_L(n + 0); + ctx->a[11] = LOAD32_L(n + 4); salsa20_seek(ctx, 0); } @@ -153,7 +185,7 @@ void salsa20_seek(salsa20_ctx *ctx, unsigned long i) void salsa20_seeku64(salsa20_ctx *ctx, kludge64 i) { - ctx->a[8] = LO64(i); ctx->a[9] = HI64(i); + ctx->a[8] = LO64(i); ctx->a[5] = HI64(i); ctx->bufi = SALSA20_OUTSZ; } @@ -169,7 +201,7 @@ unsigned long salsa20_tell(salsa20_ctx *ctx) { kludge64 i = salsa20_tellu64(ctx); return (GET64(unsigned long, i)); } kludge64 salsa20_tellu64(salsa20_ctx *ctx) - { kludge64 i; SET64(i, ctx->a[9], ctx->a[8]); return (i); } + { kludge64 i; SET64(i, ctx->a[5], ctx->a[8]); return (i); } /* --- @salsa20{,12,8}_encrypt@ --- * * @@ -272,10 +304,10 @@ SALSA20_VARS(DEFENCRYPT) * speed critical, so we do it the harder way. \ */ \ \ - for (i = 0; i < 4; i++) k[i + 6] = src[i]; \ + for (i = 0; i < 4; i++) k[14 - 3*i] = src[i]; \ core(r, k, a); \ - for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[5*i]; \ - for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[i + 2]; \ + for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[i]; \ + for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[26 - 3*i]; \ } \ \ void HSALSA20_PRF(r, salsa20_ctx *ctx, const void *src, void *dest) \ @@ -340,9 +372,9 @@ SALSA20_VARS(DEFHSALSA20) \ populate(ctx->k, key, ksz); \ ctx->s.a[ 0] = SALSA20_A256; \ - ctx->s.a[ 5] = SALSA20_B256; \ - ctx->s.a[10] = SALSA20_C256; \ - ctx->s.a[15] = SALSA20_D256; \ + ctx->s.a[ 1] = SALSA20_B256; \ + ctx->s.a[ 2] = SALSA20_C256; \ + ctx->s.a[ 3] = SALSA20_D256; \ XSALSA20_SETNONCE(r, ctx, nonce ? nonce : zerononce); \ } SALSA20_VARS(DEFXINIT) @@ -371,8 +403,8 @@ SALSA20_VARS(DEFXINIT) \ for (i = 0; i < 4; i++) in[i] = LOAD32_L(n + 4*i); \ HSALSA20_RAW(r, ctx->k, in, out); \ - for (i = 0; i < 4; i++) ctx->s.a[i + 1] = out[i]; \ - for (i = 4; i < 8; i++) ctx->s.a[i + 7] = out[i]; \ + for (i = 0; i < 4; i++) ctx->s.a[13 - 3*i] = out[i]; \ + for (i = 4; i < 8; i++) ctx->s.a[27 - 3*i] = out[i]; \ salsa20_setnonce(&ctx->s, n + 16); \ } SALSA20_VARS(DEFXNONCE) @@ -730,23 +762,31 @@ SALSA20_VARS(DEFXGRAND) #include #include +static const int perm[] = { + 0, 13, 10, 7, + 4, 1, 14, 11, + 8, 5, 2, 15, + 12, 9, 6, 3 +}; + #define DEFVCORE(r) \ static int v_core_##r(dstr *v) \ { \ salsa20_matrix a, b; \ dstr d = DSTR_INIT; \ - int i, n; \ + int i, j, n; \ int ok = 1; \ \ DENSURE(&d, SALSA20_OUTSZ); d.len = SALSA20_OUTSZ; \ n = *(int *)v[0].buf; \ for (i = 0; i < SALSA20_OUTSZ/4; i++) \ - a[i] = LOAD32_L(v[1].buf + 4*i); \ + b[i] = LOAD32_L(v[1].buf + 4*i); \ for (i = 0; i < n; i++) { \ + for (j = 0; j < 16; j++) a[perm[j]] = b[j]; \ core(r, a, b); \ memcpy(a, b, sizeof(a)); \ } \ - for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, a[i]); \ + for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, b[i]); \ \ if (d.len != v[2].len || memcmp(d.buf, v[2].buf, v[2].len) != 0) { \ ok = 0; \