From 1a0c09c4d4ed8a4a1a1679f793431cc4f5c24c80 Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Wed, 18 May 2016 10:29:03 +0100
Subject: [PATCH] Preprocess the assembler files.

  * Rename the `*.s' files to `*.S'.

  * Create a new header `base/asm-common.h' containing useful
    definitions, particularly for dealing with the peculiarities of
    shared library code.

  * Convert the assembler files to use the new macros.

  * Convert the assembler files to use `//' for comments rather than
    `#' (as currently).  This is a bit annoying, but `#' is wanted by
    the preprocessor, and `/* ... */' doesn't work in Emacs's
    `asm-mode'.

The reason for doing all of this is that the C preprocessor will let me
do things like invent symbolic names for registers, which will be handy
later when I add support for AMD64 processors: most of the code will be
identical between 32- and 64-bit machines.

This change has the side effect that the AESNI implementation no longer
uses PIC-ish means to find things when it doesn't need to.
---
 base/Makefile.am          |   3 +
 base/asm-common.h         | 152 +++++++++++++
 symm/Makefile.am          |   6 +-
 symm/chacha-x86-sse2.S    | 195 ++++++++++++++++
 symm/chacha-x86-sse2.s    | 188 ----------------
 symm/rijndael-x86-aesni.S | 548 +++++++++++++++++++++++++++++++++++++++++++++
 symm/rijndael-x86-aesni.s | 553 ----------------------------------------------
 symm/salsa20-x86-sse2.S   | 254 +++++++++++++++++++++
 symm/salsa20-x86-sse2.s   | 247 ---------------------
 9 files changed, 1155 insertions(+), 991 deletions(-)
 create mode 100644 base/asm-common.h
 create mode 100644 symm/chacha-x86-sse2.S
 delete mode 100644 symm/chacha-x86-sse2.s
 create mode 100644 symm/rijndael-x86-aesni.S
 delete mode 100644 symm/rijndael-x86-aesni.s
 create mode 100644 symm/salsa20-x86-sse2.S
 delete mode 100644 symm/salsa20-x86-sse2.s

diff --git a/base/Makefile.am b/base/Makefile.am
index c8608ed4..0ac43f2e 100644
--- a/base/Makefile.am
+++ b/base/Makefile.am
@@ -55,4 +55,7 @@ libbase_la_SOURCES += lmem.c
 ## Clearing secrets from memory.
 pkginclude_HEADERS += paranoia.h
 
+## Base definitions for assembler source.
+EXTRA_DIST += asm-common.h
+
 ###----- That's all, folks --------------------------------------------------
diff --git a/base/asm-common.h b/base/asm-common.h
new file mode 100644
index 00000000..7e62eb54
--- /dev/null
+++ b/base/asm-common.h
@@ -0,0 +1,152 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Common definitions for assembler source files
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// General definitions.
+
+// Announcing an external function.
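+//
+// A sketch of the intended usage, as in the files converted below
+// (`foo' is a hypothetical function name):
+//
+//	FUNC(foo)		// runs FUNC_PREHOOK (e.g., alignment),
+//				//   exports and labels the entry point
+//		...body...
+//	ENDFUNC			// records the symbol size and runs
+//				//   ENDFUNC_HOOK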
+#define FUNC(name)						\
+	.globl	F(name);					\
+	TYPE_FUNC(name);					\
+	.macro ENDFUNC; _ENDFUNC(name); .endm;			\
+	FUNC_PREHOOK(name);					\
+F(name):							\
+	FUNC_POSTHOOK(name)
+
+// Marking the end of a function.
+#define _ENDFUNC(name)						\
+	.purgem ENDFUNC;					\
+	SIZE_OBJ(name);						\
+	ENDFUNC_HOOK(name)
+
+///--------------------------------------------------------------------------
+/// ELF-specific hacking.
+
+#if __ELF__
+
+#if __PIC__ || __PIE__
+#  define WANT_PIC 1
+#endif
+
+#define TYPE_FUNC(name) .type name, STT_FUNC
+
+#define SIZE_OBJ(name) .size name, . - name
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86-specific hacking.
+
+#if CPUFAM_X86
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 16
+
+// Don't use the wretched AT&T syntax.  It's festooned with pointless
+// punctuation, and all of the data movement is backwards.  Ugh!
+	.intel_syntax noprefix
+
+// Call external subroutine at ADDR, possibly via PLT.
+	.macro	callext addr
+#if WANT_PIC
+	call	\addr@PLT
+#else
+	call	\addr
+#endif
+	.endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC && CPUFAM_X86
+#  define NEED_GOT 1
+#endif
+#define GOTREG ebx		// Not needed on AMD64, so don't care.
+
+// Maybe load the GOT address into the GOT register.
+	.macro	ldgot got=GOTREG
+#if WANT_PIC
+	call	_where_am_i.\got
+	add	\got, offset _GLOBAL_OFFSET_TABLE_
+#endif
+	.endm
+
+// Maybe build a helper subroutine for `ldgot GOT'.
+	.macro	gotaux got=GOTREG
+#if WANT_PIC
+	.align	16
+_where_am_i.\got :
+	mov	\got, [esp]
+	ret
+#endif
+	.endm
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
+	.macro	leaext reg, addr, got=GOTREG
+#if WANT_PIC
+	mov	\reg, [\got + \addr@GOT]
+#else
+	mov	\reg, offset \addr
+#endif
+	.endm
+
+// Address expression (possibly using a base register, and a displacement)
+// referring to ADDR, which is within our module, maybe using GOT.
+#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
+#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
+#if WANT_PIC
+#  define INTADDR__1(addr, got) got + addr@GOTOFF
+#else
+#  define INTADDR__1(addr, got) addr
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Final stuff.
+
+// Default values for the various hooks.
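+// For instance, the x86 section above arranges
+//
+//	#define FUNC_PREHOOK(_) .balign 16
+//
+// so that every entry point is 16-byte aligned.  Anything still
+// undefined by this point becomes a no-op below.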
+#ifndef FUNC_PREHOOK +# define FUNC_PREHOOK(name) +#endif +#ifndef FUNC_POSTHOOK +# define FUNC_POSTHOOK(name) +#endif +#ifndef ENDFUNC_HOOK +# define ENDFUNC_HOOK(name) +#endif + +#ifndef F +# define F(name) name +#endif + +#ifndef TYPE_FUNC +# define TYPE_FUNC(name) +#endif + +#ifndef SIZE_OBJ +# define SIZE_OBJ(name) +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/Makefile.am b/symm/Makefile.am index 6a63993d..ba037cd5 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -181,7 +181,7 @@ BLKCS += rc5 BLKCS += rijndael rijndael192 rijndael256 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c if CPUFAM_X86 -libsymm_la_SOURCES += rijndael-x86-aesni.s +libsymm_la_SOURCES += rijndael-x86-aesni.S endif libsymm_la_SOURCES += $(precomp)/rijndael-tab.c PRECOMPS += $(precomp)/rijndael-tab.c @@ -382,7 +382,7 @@ EXTRA_DIST += salsa20-tvconv pkginclude_HEADERS += salsa20.h salsa20-core.h libsymm_la_SOURCES += salsa20.c if CPUFAM_X86 -libsymm_la_SOURCES += salsa20-x86-sse2.s +libsymm_la_SOURCES += salsa20-x86-sse2.S endif TESTS += salsa20.$t ALL_CIPHERS += salsa20 salsa2012 salsa208 @@ -411,7 +411,7 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV) pkginclude_HEADERS += chacha.h chacha-core.h libsymm_la_SOURCES += chacha.c if CPUFAM_X86 -libsymm_la_SOURCES += chacha-x86-sse2.s +libsymm_la_SOURCES += chacha-x86-sse2.S endif TESTS += chacha.$t EXTRA_DIST += t/chacha diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86-sse2.S new file mode 100644 index 00000000..f9ae1c4e --- /dev/null +++ b/symm/chacha-x86-sse2.S @@ -0,0 +1,195 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Fancy SIMD implementation of ChaCha +/// +/// (c) 2015 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Main code. + + .arch pentium4 + .section .text + +FUNC(chacha_core_x86_sse2) + + // Initial state. We have three arguments: + // [ebp + 8] is the number of rounds to do + // [ebp + 12] points to the input matrix + // [ebp + 16] points to the output matrix + push ebp + mov ebp, esp + sub esp, 16 + mov edx, [ebp + 12] + and esp, ~15 + + // First job is to slurp the matrix into XMM registers. Be careful: + // the input matrix isn't likely to be properly aligned. 
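+	// (This is why the loads below use `movdqu' rather than
+	// `movdqa': the latter faults on a misaligned operand.)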
+	//
+	// [ 0  1  2  3]		(a, xmm0)
+	// [ 4  5  6  7]		(b, xmm1)
+	// [ 8  9 10 11]		(c, xmm2)
+	// [12 13 14 15]		(d, xmm3)
+	movdqu	xmm0, [edx +  0]
+	movdqu	xmm1, [edx + 16]
+	movdqu	xmm2, [edx + 32]
+	movdqu	xmm3, [edx + 48]
+
+	// Prepare for the main loop.
+	mov	ecx, [ebp + 8]
+
+	// Take a copy for later.  This one is aligned properly, by
+	// construction.
+	movdqa	[esp], xmm0
+	movdqa	xmm5, xmm1
+	movdqa	xmm6, xmm2
+	movdqa	xmm7, xmm3
+
+loop:
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, 0x93
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, 0x4e
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// The not-quite-transpose conveniently only involves reordering
+	// elements of individual rows, which can be done quite easily.  It
+	// doesn't involve any movement of elements between rows, or even
+	// renaming of the rows.
+	//
+	// [ 0  1  2  3]     [ 0  1  2  3]	(a, xmm0)
+	// [ 4  5  6  7] --> [ 5  6  7  4]	(b, xmm1)
+	// [ 8  9 10 11]     [10 11  8  9]	(c, xmm2)
+	// [12 13 14 15]     [15 12 13 14]	(d, xmm3)
+	//
+	// The shuffles have quite high latency, so they've mostly been
+	// pushed upwards.  The remaining one can't be moved, though.
+	pshufd	xmm1, xmm1, 0x39
+
+	// Apply the diagonal quarterround to each of the columns
+	// simultaneously.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, 0x39
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, 0x4e
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// Finally, finish off undoing the transpose, and we're done for this
+	// doubleround.  Again, most of this was done above so we don't have
+	// to wait for the shuffles.
+	pshufd	xmm1, xmm1, 0x93
+
+	// Decrement the loop counter and see if we should go round again.
+	sub	ecx, 2
+	ja	loop
+
+	// Almost there.  Firstly, the feedforward addition.
+	mov	edx, [ebp + 16]
+	paddd	xmm0, [esp]
+	paddd	xmm1, xmm5
+	paddd	xmm2, xmm6
+	paddd	xmm3, xmm7
+
+	// And now we write out the result.  This one won't be aligned
+	// either.
+	movdqu	[edx +  0], xmm0
+	movdqu	[edx + 16], xmm1
+	movdqu	[edx + 32], xmm2
+	movdqu	[edx + 48], xmm3
+
+	// Tidy things up.
+	mov	esp, ebp
+	pop	ebp
+
+	// And with that, we're done.
+ ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/chacha-x86-sse2.s b/symm/chacha-x86-sse2.s deleted file mode 100644 index 7b790107..00000000 --- a/symm/chacha-x86-sse2.s +++ /dev/null @@ -1,188 +0,0 @@ -### -*- mode: asm; asm-comment-char: ?# -*- -### -### Fancy SIMD implementation of ChaCha -### -### (c) 2015 Straylight/Edgeware -### - -###----- Licensing notice --------------------------------------------------- -### -### This file is part of Catacomb. -### -### Catacomb is free software; you can redistribute it and/or modify -### it under the terms of the GNU Library General Public License as -### published by the Free Software Foundation; either version 2 of the -### License, or (at your option) any later version. -### -### Catacomb is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU Library General Public License for more details. -### -### You should have received a copy of the GNU Library General Public -### License along with Catacomb; if not, write to the Free -### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -### MA 02111-1307, USA. - - .intel_syntax noprefix - .arch pentium4 - - .section .text - - .globl chacha_core_x86_sse2 - .type chacha_core_x86_sse2, STT_FUNC -chacha_core_x86_sse2: - - ## Initial state. We have three arguments: - ## [ebp + 8] is the number of rounds to do - ## [ebp + 12] points to the input matrix - ## [ebp + 16] points to the output matrix - push ebp - mov ebp, esp - sub esp, 16 - mov edx, [ebp + 12] - and esp, ~15 - - ## First job is to slurp the matrix into XMM registers. Be careful: - ## the input matrix isn't likely to be properly aligned. - ## - ## [ 0 1 2 3] (a, xmm0) - ## [ 4 5 6 7] (b, xmm0) - ## [ 8 9 10 11] (c, xmm0) - ## [12 13 14 15] (d, xmm0) - movdqu xmm0, [edx + 0] - movdqu xmm1, [edx + 16] - movdqu xmm2, [edx + 32] - movdqu xmm3, [edx + 48] - - ## Prepare for the main loop. - mov ecx, [ebp + 8] - - ## Take a copy for later. This one is aligned properly, by - ## construction. - movdqa [esp], xmm0 - movdqa xmm5, xmm1 - movdqa xmm6, xmm2 - movdqa xmm7, xmm3 - -loop: - ## Apply a column quarterround to each of the columns simultaneously. - ## Alas, there doesn't seem to be a packed doubleword rotate, so we - ## have to synthesize it. - - ## a += b; d ^= a; d <<<= 16 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm4, xmm3 - pslld xmm3, 16 - psrld xmm4, 16 - por xmm3, xmm4 - - ## c += d; b ^= c; b <<<= 12 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm4, xmm1 - pslld xmm1, 12 - psrld xmm4, 20 - por xmm1, xmm4 - - ## a += b; d ^= a; d <<<= 8 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm4, xmm3 - pslld xmm3, 8 - psrld xmm4, 24 - por xmm3, xmm4 - - ## c += d; b ^= c; b <<<= 7 - paddd xmm2, xmm3 - pshufd xmm3, xmm3, 0x93 - pxor xmm1, xmm2 - pshufd xmm2, xmm2, 0x4e - movdqa xmm4, xmm1 - pslld xmm1, 7 - psrld xmm4, 25 - por xmm1, xmm4 - - ## The not-quite-transpose conveniently only involves reordering - ## elements of individual rows, which can be done quite easily. It - ## doesn't involve any movement of elements between rows, or even - ## renaming of the rows. - ## - ## [ 0 1 2 3] [ 0 1 2 3] (a, xmm0) - ## [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1) - ## [ 8 9 10 11] [10 11 8 9] (c, xmm2) - ## [12 13 14 15] [15 12 13 14] (d, xmm3) - ## - ## The shuffles have quite high latency, so they've mostly been - ## pushed upwards. 
The remaining one can't be moved, though.
-	pshufd	xmm1, xmm1, 0x39
-
-	## Apply the diagonal quarterround to each of the columns
-	## simultaneously.
-
-	## a += b; d ^= a; d <<<= 16
-	paddd	xmm0, xmm1
-	pxor	xmm3, xmm0
-	movdqa	xmm4, xmm3
-	pslld	xmm3, 16
-	psrld	xmm4, 16
-	por	xmm3, xmm4
-
-	## c += d; b ^= c; b <<<= 12
-	paddd	xmm2, xmm3
-	pxor	xmm1, xmm2
-	movdqa	xmm4, xmm1
-	pslld	xmm1, 12
-	psrld	xmm4, 20
-	por	xmm1, xmm4
-
-	## a += b; d ^= a; d <<<= 8
-	paddd	xmm0, xmm1
-	pxor	xmm3, xmm0
-	movdqa	xmm4, xmm3
-	pslld	xmm3, 8
-	psrld	xmm4, 24
-	por	xmm3, xmm4
-
-	## c += d; b ^= c; b <<<= 7
-	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, 0x39
-	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, 0x4e
-	movdqa	xmm4, xmm1
-	pslld	xmm1, 7
-	psrld	xmm4, 25
-	por	xmm1, xmm4
-
-	## Finally, finish off undoing the transpose, and we're done for this
-	## doubleround.  Again, most of this was done above so we don't have
-	## to wait for the shuffles.
-	pshufd	xmm1, xmm1, 0x93
-
-	## Decrement the loop counter and see if we should go round again.
-	sub	ecx, 2
-	ja	loop
-
-	## Almost there.  Firstly, the feedforward addition.
-	mov	edx, [ebp + 16]
-	paddd	xmm0, [esp]
-	paddd	xmm1, xmm5
-	paddd	xmm2, xmm6
-	paddd	xmm3, xmm7
-
-	## And now we write out the result.  This one won't be aligned
-	## either.
-	movdqu	[edx +  0], xmm0
-	movdqu	[edx + 16], xmm1
-	movdqu	[edx + 32], xmm2
-	movdqu	[edx + 48], xmm3
-
-	## And with that, we're done.
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-	.size	chacha_core_x86_sse2, . - chacha_core_x86_sse2
-
-###----- That's all, folks --------------------------------------------------
diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86-aesni.S
new file mode 100644
index 00000000..d9aa9dc9
--- /dev/null
+++ b/symm/rijndael-x86-aesni.S
@@ -0,0 +1,548 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// AESNI-based implementation of Rijndael
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// External symbols.
+
+	.globl	F(abort)
+	.globl	F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch	.aes
+	.section .text
+
+/// The AESNI instructions implement a little-endian version of AES, but
+/// Catacomb's internal interface presents as big-endian so as to work better
+/// with things like GCM.  We therefore maintain the round keys in
+/// little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the AESNI instructions don't implement the
+/// larger-block versions of Rijndael, so we have to end-swap the keys if
+/// we're preparing for one of those.
+
+	// Useful constants.
+	.equ	maxrounds, 16		// maximum number of rounds
+	.equ	maxblksz, 32		// maximum block size, in bytes
+	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
+
+	// Context structure.
+	.equ	nr, 0			// number of rounds
+	.equ	w, nr + 4		// encryption key words
+	.equ	wi, w + kbufsz		// decryption key words
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_x86_aesni)
+
+	// Initial state.  We have four arguments:
+	// [esp + 20] is the context pointer
+	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
+	// [esp + 28] points to the key material, unaligned
+	// [esp + 32] is the size of the key, in words
+	// The key size has already been checked for validity, and the number
+	// of rounds has been computed.  Our job is only to fill in the `w'
+	// and `wi' vectors.
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	// The initial round key material is taken directly from the input
+	// key, so copy it over.
+	mov	ebp, [esp + 20]		// context base pointer
+	mov	ebx, [esp + 32]		// key size, in words
+	mov	ecx, ebx
+	mov	esi, [esp + 28]
+	lea	edi, [ebp + w]
+	rep	movsd
+
+	// Find out other useful things.
+	mov	edx, [ebp + nr]		// number of rounds
+	add	edx, 1
+	imul	edx, [esp + 24]		// total key size in words
+	sub	edx, ebx		// offset by the key size
+
+	// Find the round constants.
+	ldgot	ecx
+	leaext	ecx, rijndael_rcon, ecx
+
+	// Prepare for the main loop.
+	lea	esi, [ebp + w]
+	mov	eax, [esi + 4*ebx - 4]	// most recent key word
+	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion
+
+	// Main key expansion loop.  The first word of each key-length chunk
+	// needs special treatment.
+	//
+	// This is rather tedious because the Intel `AESKEYGENASSIST'
+	// instruction is very strangely shaped.  Firstly, it wants to
+	// operate on vast SSE registers, even though we're data-blocked from
+	// doing more than one operation at a time unless we're doing two key
+	// schedules simultaneously -- and even then we can't do more than
+	// two, because the instruction ignores two of its input words
+	// entirely, and produces two different outputs for each of the other
+	// two.  And secondly it insists on taking the magic round constant
+	// as an immediate, so it's kind of annoying if you're not
+	// open-coding the whole thing.  It's much easier to leave that as
+	// zero and XOR in the round constant by hand.
+9:	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x39
+	aeskeygenassist xmm1, xmm0, 0
+	pshufd	xmm1, xmm1, 0x93
+	movd	eax, xmm1
+	xor	eax, [esi]
+	xor	al, [ecx]
+	inc	ecx
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// The next three words are simple...
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// (Word 2...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// (Word 3...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// Word 4.  If the key is /more/ than 6 words long, then we must
+	// apply a substitution here.
+	cmp	ebx, 5
+	jb	9b
+	cmp	ebx, 7
+	jb	0f
+	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x93
+	aeskeygenassist xmm1, xmm0, 0
+	movd	eax, xmm1
+0:	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// (Word 5...)
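+	// (Words 5, 6 and 7 of a chunk exist only for the longer key
+	// sizes; each `cmp'/`jb' pair below loops back to label 9 once
+	// we've run out of key words.)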
+ cmp ebx, 6 + jb 9b + xor eax, [esi] + mov [esi + 4*ebx], eax + add esi, 4 + cmp esi, edx + jae 8f + + // (Word 6...) + cmp ebx, 7 + jb 9b + xor eax, [esi] + mov [esi + 4*ebx], eax + add esi, 4 + cmp esi, edx + jae 8f + + // (Word 7...) + cmp ebx, 8 + jb 9b + xor eax, [esi] + mov [esi + 4*ebx], eax + add esi, 4 + cmp esi, edx + jae 8f + + // Must be done by now. + jmp 9b + + // Next job is to construct the decryption keys. The keys for the + // first and last rounds don't need to be mangled, but the remaining + // ones do -- and they all need to be reordered too. + // + // The plan of action, then, is to copy the final encryption round's + // keys into place first, then to do each of the intermediate rounds + // in reverse order, and finally do the first round. + // + // Do all of the heavy lifting with SSE registers. The order we're + // doing this in means that it's OK if we read or write too much, and + // there's easily enough buffer space for the over-enthusiastic reads + // and writes because the context has space for 32-byte blocks, which + // is our maximum and an exact fit for two SSE registers. +8: mov ecx, [ebp + nr] // number of rounds + mov ebx, [esp + 24] // block size (in words) + mov edx, ecx + imul edx, ebx + lea edi, [ebp + wi] + lea esi, [ebp + 4*edx + w] // last round's keys + shl ebx, 2 // block size (in bytes now) + + // Copy the last encryption round's keys. + movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + // Update the loop variables and stop if we've finished. +9: add edi, ebx + sub esi, ebx + sub ecx, 1 + jbe 0f + + // Do another middle round's keys... + movdqu xmm0, [esi] + aesimc xmm0, xmm0 + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9b + movdqu xmm0, [esi + 16] + aesimc xmm0, xmm0 + movdqu [edi + 16], xmm0 + jmp 9b + + // Finally do the first encryption round. +0: movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 0f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + // If the block size is not exactly four words then we must end-swap + // everything. We can use fancy SSE toys for this. +0: cmp ebx, 16 + je 0f + + // Find the byte-reordering table. + ldgot ecx + movdqa xmm7, [INTADDR(endswap_tab, ecx)] + + // Calculate the number of subkey words again. (It's a good job + // we've got a fast multiplier.) + mov ecx, [ebp + nr] + add ecx, 1 + imul ecx, [esp + 24] // total keys in words + + // End-swap the encryption keys. + mov eax, ecx + lea esi, [ebp + w] + call endswap_block + + // And the decryption keys. + mov ecx, eax + lea esi, [ebp + wi] + call endswap_block + + // All done. +0: pop edi + pop esi + pop ebx + pop ebp + ret + + .align 16 +endswap_block: + // End-swap ECX words starting at ESI. The end-swapping table is + // already loaded into XMM7; and it's OK to work in 16-byte chunks. + movdqu xmm1, [esi] + pshufb xmm1, xmm7 + movdqu [esi], xmm1 + add esi, 16 + sub ecx, 4 + ja endswap_block + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Encrypting and decrypting blocks. + +FUNC(rijndael_eblk_x86_aesni) + + // On entry, we have: + // [esp + 4] points to the context block + // [esp + 8] points to the input data block + // [esp + 12] points to the output buffer + + // Find the magic endianness-swapping table. + ldgot ecx + movdqa xmm7, [INTADDR(endswap_tab, ecx)] + + // Load the input block and end-swap it. Also, start loading the + // keys. 
+	mov	eax, [esp + 8]
+	movdqu	xmm0, [eax]
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 4]
+	lea	edx, [eax + w]
+	mov	eax, [eax + nr]
+
+	// Initial whitening.
+	movdqu	xmm1, [edx]
+	add	edx, 16
+	pxor	xmm0, xmm1
+
+	// Dispatch to the correct code.
+	cmp	eax, 10
+	je	er10
+	jb	bogus
+	cmp	eax, 14
+	je	er14
+	ja	bogus
+	cmp	eax, 12
+	je	er12
+	jb	er11
+	jmp	er13
+
+	.align	2
+
+	// 14 rounds...
+er14:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 13 rounds...
+er13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 12 rounds...
+er12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 11 rounds...
+er11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 10 rounds...
+er10:	movdqu	xmm1, [edx]
+	aesenc	xmm0, xmm1
+
+	// 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesenc	xmm0, xmm1
+
+	// 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesenc	xmm0, xmm1
+
+	// 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesenc	xmm0, xmm1
+
+	// 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesenc	xmm0, xmm1
+
+	// 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesenc	xmm0, xmm1
+
+	// 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesenc	xmm0, xmm1
+
+	// 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesenc	xmm0, xmm1
+
+	// 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesenc	xmm0, xmm1
+
+	// Final round...
+	movdqu	xmm1, [edx + 144]
+	aesenclast xmm0, xmm1
+
+	// Unpermute the ciphertext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	// And we're done.
+	ret
+
+ENDFUNC
+
+FUNC(rijndael_dblk_x86_aesni)
+
+	// On entry, we have:
+	// [esp + 4] points to the context block
+	// [esp + 8] points to the input data block
+	// [esp + 12] points to the output buffer
+
+	// Find the magic endianness-swapping table.
+	ldgot	ecx
+	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]
+
+	// Load the input block and end-swap it.  Also, start loading the
+	// keys.
+	mov	eax, [esp + 8]
+	movdqu	xmm0, [eax]
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 4]
+	lea	edx, [eax + wi]
+	mov	eax, [eax + nr]
+
+	// Initial whitening.
+	movdqu	xmm1, [edx]
+	add	edx, 16
+	pxor	xmm0, xmm1
+
+	// Dispatch to the correct code.
+	cmp	eax, 10
+	je	dr10
+	jb	bogus
+	cmp	eax, 14
+	je	dr14
+	ja	bogus
+	cmp	eax, 12
+	je	dr12
+	jb	dr11
+	jmp	dr13
+
+	.align	2
+
+	// 14 rounds...
+dr14:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 13 rounds...
+dr13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 12 rounds...
+dr12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 11 rounds...
+dr11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 10 rounds...
+dr10:	movdqu	xmm1, [edx]
+	aesdec	xmm0, xmm1
+
+	// 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesdec	xmm0, xmm1
+
+	// 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesdec	xmm0, xmm1
+
+	// 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesdec	xmm0, xmm1
+
+	// 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesdec	xmm0, xmm1
+
+	// 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesdec	xmm0, xmm1
+
+	// 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesdec	xmm0, xmm1
+
+	// 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesdec	xmm0, xmm1
+
+	// 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesdec	xmm0, xmm1
+
+	// Final round...
+	movdqu	xmm1, [edx + 144]
+	aesdeclast xmm0, xmm1
+
+	// Unpermute the plaintext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	// And we're done.
+	ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Random utilities.
+ + .align 16 + // Abort the process because of a programming error. Indirecting + // through this point serves several purposes: (a) by CALLing, rather + // than branching to, `abort', we can save the return address, which + // might at least provide a hint as to what went wrong; (b) we don't + // have conditional CALLs (and they'd be big anyway); and (c) we can + // write a HLT here as a backstop against `abort' being mad. +bogus: callext F(abort) +0: hlt + jmp 0b + + gotaux ecx + +///-------------------------------------------------------------------------- +/// Data tables. + + .align 16 +endswap_tab: + .byte 3, 2, 1, 0 + .byte 7, 6, 5, 4 + .byte 11, 10, 9, 8 + .byte 15, 14, 13, 12 + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael-x86-aesni.s b/symm/rijndael-x86-aesni.s deleted file mode 100644 index 79b33584..00000000 --- a/symm/rijndael-x86-aesni.s +++ /dev/null @@ -1,553 +0,0 @@ -### -*- mode: asm; asm-comment-char: ?# -*- -### -### AESNI-based implementation of Rijndael -### -### (c) 2015 Straylight/Edgeware -### - -###----- Licensing notice --------------------------------------------------- -### -### This file is part of Catacomb. -### -### Catacomb is free software; you can redistribute it and/or modify -### it under the terms of the GNU Library General Public License as -### published by the Free Software Foundation; either version 2 of the -### License, or (at your option) any later version. -### -### Catacomb is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU Library General Public License for more details. -### -### You should have received a copy of the GNU Library General Public -### License along with Catacomb; if not, write to the Free -### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -### MA 02111-1307, USA. - - .intel_syntax noprefix - .arch .aes - - .globl abort - .globl rijndael_rcon - - .section .text - -### The AESNI instructions implement a little-endian version of AES, but -### Catacomb's internal interface presents as big-endian so as to work better -### with things like GCM. We therefore maintain the round keys in -### little-endian form, and have to end-swap blocks in and out. -### -### For added amusement, the AESNI instructions don't implement the -### larger-block versions of Rijndael, so we have to end-swap the keys if -### we're preparing for one of those. - - ## Useful constants. - .equ maxrounds, 16 # maximum number of rounds - .equ maxblksz, 32 # maximum block size, in bytes - .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer - - ## Context structure. - .equ nr, 0 # number of rounds - .equ w, nr + 4 # encryption key words - .equ wi, w + kbufsz # decryption key words - -###-------------------------------------------------------------------------- -### Key setup. - - .globl rijndael_setup_x86_aesni - .type rijndael_setup_x86_aesni, STT_FUNC - .align 16 -rijndael_setup_x86_aesni: - - ## Initial state. We have four arguments: - ## [esp + 20] is the context pointer - ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8) - ## [esp + 28] points to the key material, unaligned - ## [esp + 32] is the size of the key, in words - ## The key size has already been checked for validity, and the number - ## of rounds has been computed. Our job is only to fill in the `w' - ## and `wi' vectors. 
- - push ebp - push ebx - push esi - push edi - - ## The initial round key material is taken directly from the input - ## key, so copy it over. - mov ebp, [esp + 20] # context base pointer - mov ebx, [esp + 32] # key size, in words - mov ecx, ebx - mov esi, [esp + 28] - lea edi, [ebp + w] - rep movsd - - ## Find out other useful things. - mov edx, [ebp + nr] # number of rounds - add edx, 1 - imul edx, [esp + 24] # total key size in words - sub edx, ebx # offset by the key size - - ## Find the round constants. - call where_am_i_ecx - add ecx, offset _GLOBAL_OFFSET_TABLE_ - mov ecx, [ecx + rijndael_rcon@GOT] - - ## Prepare for the main loop. - lea esi, [ebp + w] - mov eax, [esi + 4*ebx - 4] # most recent key word - lea edx, [esi + 4*edx] # limit, offset by one key expansion - - ## Main key expansion loop. The first word of each key-length chunk - ## needs special treatment. - ## - ## This is rather tedious because the Intel `AESKEYGENASSIST' - ## instruction is very strangely shaped. Firstly, it wants to - ## operate on vast SSE registers, even though we're data-blocked from - ## doing more than operation at a time unless we're doing two key - ## schedules simultaneously -- and even then we can't do more than - ## two, because the instruction ignores two of its input words - ## entirely, and produces two different outputs for each of the other - ## two. And secondly it insists on taking the magic round constant - ## as an immediate, so it's kind of annoying if you're not - ## open-coding the whole thing. It's much easier to leave that as - ## zero and XOR in the round constant by hand. -9: movd xmm0, eax - pshufd xmm0, xmm0, 0x39 - aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, 0x93 - movd eax, xmm1 - xor eax, [esi] - xor al, [ecx] - inc ecx - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## The next three words are simple... - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 2...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 3...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## Word 4. If the key is /more/ than 6 words long, then we must - ## apply a substitution here. - cmp ebx, 5 - jb 9b - cmp ebx, 7 - jb 0f - movd xmm0, eax - pshufd xmm0, xmm0, 0x93 - aeskeygenassist xmm1, xmm0, 0 - movd eax, xmm1 -0: xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 5...) - cmp ebx, 6 - jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 6...) - cmp ebx, 7 - jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 7...) - cmp ebx, 8 - jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## Must be done by now. - jmp 9b - - ## Next job is to construct the decryption keys. The keys for the - ## first and last rounds don't need to be mangled, but the remaining - ## ones do -- and they all need to be reordered too. - ## - ## The plan of action, then, is to copy the final encryption round's - ## keys into place first, then to do each of the intermediate rounds - ## in reverse order, and finally do the first round. - ## - ## Do all of the heavy lifting with SSE registers. 
The order we're - ## doing this in means that it's OK if we read or write too much, and - ## there's easily enough buffer space for the over-enthusiastic reads - ## and writes because the context has space for 32-byte blocks, which - ## is our maximum and an exact fit for two SSE registers. -8: mov ecx, [ebp + nr] # number of rounds - mov ebx, [esp + 24] # block size (in words) - mov edx, ecx - imul edx, ebx - lea edi, [ebp + wi] - lea esi, [ebp + 4*edx + w] # last round's keys - shl ebx, 2 # block size (in bytes now) - - ## Copy the last encryption round's keys. - movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 - jbe 9f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 - - ## Update the loop variables and stop if we've finished. -9: add edi, ebx - sub esi, ebx - sub ecx, 1 - jbe 0f - - ## Do another middle round's keys... - movdqu xmm0, [esi] - aesimc xmm0, xmm0 - movdqu [edi], xmm0 - cmp ebx, 16 - jbe 9b - movdqu xmm0, [esi + 16] - aesimc xmm0, xmm0 - movdqu [edi + 16], xmm0 - jmp 9b - - ## Finally do the first encryption round. -0: movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 - jbe 0f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 - - ## If the block size is not exactly four words then we must end-swap - ## everything. We can use fancy SSE toys for this. -0: cmp ebx, 16 - je 0f - - ## Find the byte-reordering table. - call where_am_i_ecx - movdqa xmm7, [ecx + endswap_tab - .] - - ## Calculate the number of subkey words again. (It's a good job - ## we've got a fast multiplier.) - mov ecx, [ebp + nr] - add ecx, 1 - imul ecx, [esp + 24] # total keys in words - - ## End-swap the encryption keys. - mov eax, ecx - lea esi, [ebp + w] - call endswap_block - - ## And the decryption keys. - mov ecx, eax - lea esi, [ebp + wi] - call endswap_block - - ## All done. -0: pop edi - pop esi - pop ebx - pop ebp - ret - - .align 16 -endswap_block: - ## End-swap ECX words starting at ESI. The end-swapping table is - ## already loaded into XMM7; and it's OK to work in 16-byte chunks. - movdqu xmm1, [esi] - pshufb xmm1, xmm7 - movdqu [esi], xmm1 - add esi, 16 - sub ecx, 4 - ja endswap_block - ret - - .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni - -###-------------------------------------------------------------------------- -### Encrypting and decrypting blocks. - - .globl rijndael_eblk_x86_aesni - .type rijndael_eblk_x86_aesni, STT_FUNC - .align 16 -rijndael_eblk_x86_aesni: - - ## On entry, we have: - ## [esp + 4] points to the context block - ## [esp + 8] points to the input data block - ## [esp + 12] points to the output buffer - - ## Find the magic endianness-swapping table. - call where_am_i_ecx - movdqa xmm7, [ecx + endswap_tab - .] - - ## Load the input block and end-swap it. Also, start loading the - ## keys. - mov eax, [esp + 8] - movdqu xmm0, [eax] - pshufb xmm0, xmm7 - mov eax, [esp + 4] - lea edx, [eax + w] - mov eax, [eax + nr] - - ## Initial whitening. - movdqu xmm1, [edx] - add edx, 16 - pxor xmm0, xmm1 - - ## Dispatch to the correct code. - cmp eax, 10 - je er10 - jb bogus - cmp eax, 14 - je er14 - ja bogus - cmp eax, 12 - je er12 - jb er11 - jmp er13 - - .align 2 - - ## 14 rounds... -er14: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 13 rounds... -er13: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 12 rounds... -er12: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 11 rounds... -er11: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 10 rounds... 
-er10: movdqu xmm1, [edx] - aesenc xmm0, xmm1 - - ## 9 rounds... - movdqu xmm1, [edx + 16] - aesenc xmm0, xmm1 - - ## 8 rounds... - movdqu xmm1, [edx + 32] - aesenc xmm0, xmm1 - - ## 7 rounds... - movdqu xmm1, [edx + 48] - aesenc xmm0, xmm1 - - ## 6 rounds... - movdqu xmm1, [edx + 64] - aesenc xmm0, xmm1 - - ## 5 rounds... - movdqu xmm1, [edx + 80] - aesenc xmm0, xmm1 - - ## 4 rounds... - movdqu xmm1, [edx + 96] - aesenc xmm0, xmm1 - - ## 3 rounds... - movdqu xmm1, [edx + 112] - aesenc xmm0, xmm1 - - ## 2 rounds... - movdqu xmm1, [edx + 128] - aesenc xmm0, xmm1 - - ## Final round... - movdqu xmm1, [edx + 144] - aesenclast xmm0, xmm1 - - ## Unpermute the ciphertext block and store it. - pshufb xmm0, xmm7 - mov eax, [esp + 12] - movdqu [eax], xmm0 - - ## And we're done. - ret - - .size rijndael_eblk_x86_aesni, . - rijndael_dblk_x86_aesni - - .globl rijndael_dblk_x86_aesni - .type rijndael_dblk_x86_aesni, STT_FUNC - .align 16 -rijndael_dblk_x86_aesni: - - ## On entry, we have: - ## [esp + 4] points to the context block - ## [esp + 8] points to the input data block - ## [esp + 12] points to the output buffer - - ## Find the magic endianness-swapping table. - call where_am_i_ecx - movdqa xmm7, [ecx + endswap_tab - .] - - ## Load the input block and end-swap it. Also, start loading the - ## keys. - mov eax, [esp + 8] - movdqu xmm0, [eax] - pshufb xmm0, xmm7 - mov eax, [esp + 4] - lea edx, [eax + wi] - mov eax, [eax + nr] - - ## Initial whitening. - movdqu xmm1, [edx] - add edx, 16 - pxor xmm0, xmm1 - - ## Dispatch to the correct code. - cmp eax, 10 - je dr10 - jb bogus - cmp eax, 14 - je dr14 - ja bogus - cmp eax, 12 - je dr12 - jb dr11 - jmp dr13 - - .align 2 - - ## 14 rounds... -dr14: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 13 rounds... -dr13: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 12 rounds... -dr12: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 11 rounds... -dr11: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 10 rounds... -dr10: movdqu xmm1, [edx] - aesdec xmm0, xmm1 - - ## 9 rounds... - movdqu xmm1, [edx + 16] - aesdec xmm0, xmm1 - - ## 8 rounds... - movdqu xmm1, [edx + 32] - aesdec xmm0, xmm1 - - ## 7 rounds... - movdqu xmm1, [edx + 48] - aesdec xmm0, xmm1 - - ## 6 rounds... - movdqu xmm1, [edx + 64] - aesdec xmm0, xmm1 - - ## 5 rounds... - movdqu xmm1, [edx + 80] - aesdec xmm0, xmm1 - - ## 4 rounds... - movdqu xmm1, [edx + 96] - aesdec xmm0, xmm1 - - ## 3 rounds... - movdqu xmm1, [edx + 112] - aesdec xmm0, xmm1 - - ## 2 rounds... - movdqu xmm1, [edx + 128] - aesdec xmm0, xmm1 - - ## Final round... - movdqu xmm1, [edx + 144] - aesdeclast xmm0, xmm1 - - ## Unpermute the ciphertext block and store it. - pshufb xmm0, xmm7 - mov eax, [esp + 12] - movdqu [eax], xmm0 - - ## And we're done. - ret - - .size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni - -###-------------------------------------------------------------------------- -### Random utilities. - - .align 16 - ## Abort the process because of a programming error. Indirecting - ## through this point serves several purposes: (a) by CALLing, rather - ## than branching to, `abort', we can save the return address, which - ## might at least provide a hint as to what went wrong; (b) we don't - ## have conditional CALLs (and they'd be big anyway); and (c) we can - ## write a HLT here as a backstop against `abort' being mad. -bogus: call abort@PLT -0: hlt - jmp 0b - - .align 16 - ## Return the address of the instruction following the CALL here in - ## ECX. 
This is useful for doing position-independent addressing.
-where_am_i_ecx:
-	mov	ecx, [esp]
-	ret
-
-###--------------------------------------------------------------------------
-### Data tables.
-
-	.align	16
-endswap_tab:
-	.byte	 3,  2,  1,  0
-	.byte	 7,  6,  5,  4
-	.byte	11, 10,  9,  8
-	.byte	15, 14, 13, 12
-
-###----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86-sse2.S
new file mode 100644
index 00000000..5a13fd49
--- /dev/null
+++ b/symm/salsa20-x86-sse2.S
@@ -0,0 +1,254 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch	pentium4
+	.section .text
+
+FUNC(salsa20_core_x86_sse2)
+
+	// Initial state.  We have three arguments:
+	// [ebp + 8] is the number of rounds to do
+	// [ebp + 12] points to the input matrix
+	// [ebp + 16] points to the output matrix
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 32
+	mov	edx, [ebp + 12]
+	and	esp, ~15
+
+	// Prepare for the main loop.
+	mov	ecx, [ebp + 8]
+
+	// First job is to slurp the matrix into XMM registers.  The words
+	// have already been permuted conveniently to make them line up
+	// better for SIMD processing.
+	//
+	// The textbook arrangement of the matrix is this.
+	//
+	//	[C K K K]
+	//	[K C N N]
+	//	[T T C K]
+	//	[K K K C]
+	//
+	// But we've rotated the columns up so that the main diagonal with
+	// the constants on it ends up in the first row, giving something
+	// more like
+	//
+	//	[C C C C]
+	//	[K T K K]
+	//	[T K K N]
+	//	[K K N K]
+	//
+	// so the transformation looks like this:
+	//
+	//	[ 0  1  2  3]		[ 0  5 10 15]	(a, xmm0)
+	//	[ 4  5  6  7]    -->	[ 4  9 14  3]	(b, xmm1)
+	//	[ 8  9 10 11]		[ 8 13  2  7]	(c, xmm2)
+	//	[12 13 14 15]		[12  1  6 11]	(d, xmm3)
+	movdqu	xmm0, [edx +  0]
+	movdqu	xmm1, [edx + 16]
+	movdqu	xmm2, [edx + 32]
+	movdqu	xmm3, [edx + 48]
+
+	// Take a copy for later.
+	movdqa	[esp +  0], xmm0
+	movdqa	[esp + 16], xmm1
+	movdqa	xmm6, xmm2
+	movdqa	xmm7, xmm3
+
+loop:
+
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
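+	// (Each rotate by r below is synthesized the same way: take a
+	// copy of the sum, shift one left by r and the other right by
+	// 32 - r, and OR the two halves together.)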
+ + // b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm3 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm1, xmm4 + + // c ^= (b + a) <<< 9 + movdqa xmm4, xmm1 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + // d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm1 + pshufd xmm1, xmm1, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm3, xmm4 + + // a ^= (d + c) <<< 18 + movdqa xmm4, xmm3 + pshufd xmm3, xmm3, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + // The transpose conveniently only involves reordering elements of + // individual rows, which can be done quite easily, and reordering + // the rows themselves, which is a trivial renaming. It doesn't + // involve any movement of elements between rows. + // + // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0) + // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3) + // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2) + // [12 1 6 11] [ 3 4 9 14] (d, xmm1) + // + // The shuffles have quite high latency, so they've been pushed + // backwards into the main instruction list. + + // Apply the row quarterround to each of the columns (yes!) + // simultaneously. + + // b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm1 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm3, xmm4 + + // c ^= (b + a) <<< 9 + movdqa xmm4, xmm3 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + // d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm3 + pshufd xmm3, xmm3, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm1, xmm4 + + // a ^= (d + c) <<< 18 + movdqa xmm4, xmm1 + pshufd xmm1, xmm1, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + // We had to undo the transpose ready for the next loop. Again, push + // back the shuffles because they take a long time coming through. + // Decrement the loop counter and see if we should go round again. + // Later processors fuse this pair into a single uop. + sub ecx, 2 + ja loop + + // Almost there. Firstly, the feedforward addition, and then we have + // to write out the result. Here we have to undo the permutation + // which was already applied to the input. Shuffling has quite high + // latency, so arrange to start a new shuffle into a temporary as + // soon as we've written out the old value. + mov edx, [ebp + 16] + + paddd xmm0, [esp + 0] + pshufd xmm4, xmm0, 0x39 + movd [edx + 0], xmm0 + + paddd xmm1, [esp + 16] + pshufd xmm5, xmm1, 0x93 + movd [edx + 16], xmm1 + + paddd xmm2, xmm6 + pshufd xmm6, xmm2, 0x4e + movd [edx + 32], xmm2 + + paddd xmm3, xmm7 + pshufd xmm7, xmm3, 0x39 + movd [edx + 48], xmm3 + + movd [edx + 4], xmm7 + pshufd xmm7, xmm3, 0x4e + movd [edx + 24], xmm7 + pshufd xmm3, xmm3, 0x93 + movd [edx + 44], xmm3 + + movd [edx + 8], xmm6 + pshufd xmm6, xmm2, 0x93 + movd [edx + 28], xmm6 + pshufd xmm2, xmm2, 0x39 + movd [edx + 52], xmm2 + + movd [edx + 12], xmm5 + pshufd xmm5, xmm1, 0x39 + movd [edx + 36], xmm5 + pshufd xmm1, xmm1, 0x4e + movd [edx + 56], xmm1 + + movd [edx + 20], xmm4 + pshufd xmm4, xmm0, 0x4e + movd [edx + 40], xmm4 + pshufd xmm0, xmm0, 0x93 + movd [edx + 60], xmm0 + + // Tidy things up. + mov esp, ebp + pop ebp + + // And with that, we're done. 
+ ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/salsa20-x86-sse2.s b/symm/salsa20-x86-sse2.s deleted file mode 100644 index ef2b73ef..00000000 --- a/symm/salsa20-x86-sse2.s +++ /dev/null @@ -1,247 +0,0 @@ -### -*- mode: asm; asm-comment-char: ?# -*- -### -### Fancy SIMD implementation of Salsa20 -### -### (c) 2015 Straylight/Edgeware -### - -###----- Licensing notice --------------------------------------------------- -### -### This file is part of Catacomb. -### -### Catacomb is free software; you can redistribute it and/or modify -### it under the terms of the GNU Library General Public License as -### published by the Free Software Foundation; either version 2 of the -### License, or (at your option) any later version. -### -### Catacomb is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU Library General Public License for more details. -### -### You should have received a copy of the GNU Library General Public -### License along with Catacomb; if not, write to the Free -### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -### MA 02111-1307, USA. - - .intel_syntax noprefix - .arch pentium4 - - .section .text - - .globl salsa20_core_x86_sse2 - .type salsa20_core_x86_sse2, STT_FUNC -salsa20_core_x86_sse2: - - ## Initial state. We have three arguments: - ## [ebp + 8] is the number of rounds to do - ## [ebp + 12] points to the input matrix - ## [ebp + 16] points to the output matrix - push ebp - mov ebp, esp - sub esp, 32 - mov edx, [ebp + 12] - and esp, ~15 - - ## Prepare for the main loop. - mov ecx, [ebp + 8] - - ## First job is to slurp the matrix into XMM registers. The words - ## have already been permuted conveniently to make them line up - ## better for SIMD processing. - ## - ## The textbook arrangement of the matrix is this. - ## - ## [C K K K] - ## [K C N N] - ## [T T C K] - ## [K K K C] - ## - ## But we've rotated the columns up so that the main diagonal with - ## the constants on it end up in the first row, giving something more - ## like - ## - ## [C C C C] - ## [K T K K] - ## [T K K N] - ## [K K N K] - ## - ## so the transformation looks like this: - ## - ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0) - ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1) - ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2) - ## [12 13 14 15] [12 1 6 11] (d, xmm3) - movdqu xmm0, [edx + 0] - movdqu xmm1, [edx + 16] - movdqu xmm2, [edx + 32] - movdqu xmm3, [edx + 48] - - ## Take a copy for later. - movdqa [esp + 0], xmm0 - movdqa [esp + 16], xmm1 - movdqa xmm6, xmm2 - movdqa xmm7, xmm3 - -loop: - - ## Apply a column quarterround to each of the columns simultaneously. - ## Alas, there doesn't seem to be a packed doubleword rotate, so we - ## have to synthesize it. 
- - ## b ^= (a + d) <<< 7 - movdqa xmm4, xmm0 - paddd xmm4, xmm3 - movdqa xmm5, xmm4 - pslld xmm4, 7 - psrld xmm5, 25 - por xmm4, xmm5 - pxor xmm1, xmm4 - - ## c ^= (b + a) <<< 9 - movdqa xmm4, xmm1 - paddd xmm4, xmm0 - movdqa xmm5, xmm4 - pslld xmm4, 9 - psrld xmm5, 23 - por xmm4, xmm5 - pxor xmm2, xmm4 - - ## d ^= (c + b) <<< 13 - movdqa xmm4, xmm2 - paddd xmm4, xmm1 - pshufd xmm1, xmm1, 0x93 - movdqa xmm5, xmm4 - pslld xmm4, 13 - psrld xmm5, 19 - por xmm4, xmm5 - pxor xmm3, xmm4 - - ## a ^= (d + c) <<< 18 - movdqa xmm4, xmm3 - pshufd xmm3, xmm3, 0x39 - paddd xmm4, xmm2 - pshufd xmm2, xmm2, 0x4e - movdqa xmm5, xmm4 - pslld xmm4, 18 - psrld xmm5, 14 - por xmm4, xmm5 - pxor xmm0, xmm4 - - ## The transpose conveniently only involves reordering elements of - ## individual rows, which can be done quite easily, and reordering - ## the rows themselves, which is a trivial renaming. It doesn't - ## involve any movement of elements between rows. - ## - ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0) - ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3) - ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2) - ## [12 1 6 11] [ 3 4 9 14] (d, xmm1) - ## - ## The shuffles have quite high latency, so they've been pushed - ## backwards into the main instruction list. - - ## Apply the row quarterround to each of the columns (yes!) - ## simultaneously. - - ## b ^= (a + d) <<< 7 - movdqa xmm4, xmm0 - paddd xmm4, xmm1 - movdqa xmm5, xmm4 - pslld xmm4, 7 - psrld xmm5, 25 - por xmm4, xmm5 - pxor xmm3, xmm4 - - ## c ^= (b + a) <<< 9 - movdqa xmm4, xmm3 - paddd xmm4, xmm0 - movdqa xmm5, xmm4 - pslld xmm4, 9 - psrld xmm5, 23 - por xmm4, xmm5 - pxor xmm2, xmm4 - - ## d ^= (c + b) <<< 13 - movdqa xmm4, xmm2 - paddd xmm4, xmm3 - pshufd xmm3, xmm3, 0x93 - movdqa xmm5, xmm4 - pslld xmm4, 13 - psrld xmm5, 19 - por xmm4, xmm5 - pxor xmm1, xmm4 - - ## a ^= (d + c) <<< 18 - movdqa xmm4, xmm1 - pshufd xmm1, xmm1, 0x39 - paddd xmm4, xmm2 - pshufd xmm2, xmm2, 0x4e - movdqa xmm5, xmm4 - pslld xmm4, 18 - psrld xmm5, 14 - por xmm4, xmm5 - pxor xmm0, xmm4 - - ## We had to undo the transpose ready for the next loop. Again, push - ## back the shuffles because they take a long time coming through. - ## Decrement the loop counter and see if we should go round again. - ## Later processors fuse this pair into a single uop. - sub ecx, 2 - ja loop - - ## Almost there. Firstly, the feedforward addition, and then we have - ## to write out the result. Here we have to undo the permutation - ## which was already applied to the input. Shuffling has quite high - ## latency, so arrange to start a new shuffle into a temporary as - ## soon as we've written out the old value. - mov edx, [ebp + 16] - - paddd xmm0, [esp + 0] - pshufd xmm4, xmm0, 0x39 - movd [edx + 0], xmm0 - - paddd xmm1, [esp + 16] - pshufd xmm5, xmm1, 0x93 - movd [edx + 16], xmm1 - - paddd xmm2, xmm6 - pshufd xmm6, xmm2, 0x4e - movd [edx + 32], xmm2 - - paddd xmm3, xmm7 - pshufd xmm7, xmm3, 0x39 - movd [edx + 48], xmm3 - - movd [edx + 4], xmm7 - pshufd xmm7, xmm3, 0x4e - movd [edx + 24], xmm7 - pshufd xmm3, xmm3, 0x93 - movd [edx + 44], xmm3 - - movd [edx + 8], xmm6 - pshufd xmm6, xmm2, 0x93 - movd [edx + 28], xmm6 - pshufd xmm2, xmm2, 0x39 - movd [edx + 52], xmm2 - - movd [edx + 12], xmm5 - pshufd xmm5, xmm1, 0x39 - movd [edx + 36], xmm5 - pshufd xmm1, xmm1, 0x4e - movd [edx + 56], xmm1 - - movd [edx + 20], xmm4 - pshufd xmm4, xmm0, 0x4e - movd [edx + 40], xmm4 - pshufd xmm0, xmm0, 0x93 - movd [edx + 60], xmm0 - - ## And with that, we're done. - mov esp, ebp - pop ebp - ret - - .size salsa20_core_x86_sse2, . 
- salsa20_core_x86_sse2 - -###----- That's all, folks -------------------------------------------------- -- 2.11.0