* Rename the `*.s' files to `*.S'.
* Create a new header `base/asm-common.h' containing useful
definitions, particularly for dealing with the peculiarities of
shared library code.
* Convert the assembler files to use the new macros.
* Convert the assembler files to use `//' for comments rather than
`#' (as currently). This is a bit annoying, but `#' is wanted by
the preprocessor, and `/* ... */' doesn't work in Emacs's
`asm-mode'.
The reason for doing all of this is that the C preprocessor will let
me do things like inventing symbolic names for registers, which will be
handy later when I add support for AMD64 processors, since most of the
code will be identical between 32- and 64-bit machines.
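
As a quick sketch of the idea (the register name `NR' and the
`CPUFAM_AMD64' symbol here are illustrative only, not taken from the
actual code):

	#if CPUFAM_X86
	#  define NR ecx		// round counter on 32-bit x86
	#endif
	#if CPUFAM_AMD64
	#  define NR rcx		// ... and on AMD64
	#endif
		sub	NR, 2		// one doubleround done

so the same instruction sequence can name the right register on either
architecture.
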
This change has the side-effect that the AESNI implementation no longer
uses PIC-ish means to find things when it doesn't need to.
## Clearing secrets from memory.
pkginclude_HEADERS += paranoia.h
+## Base definitions for assembler source.
+EXTRA_DIST += asm-common.h
+
###----- That's all, folks --------------------------------------------------
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// General definitions.
+
+// Announcing an external function.
+#define FUNC(name) \
+ .globl F(name); \
+ TYPE_FUNC(name); \
+ .macro ENDFUNC; _ENDFUNC(name); .endm; \
+ FUNC_PREHOOK(name); \
+F(name): \
+ FUNC_POSTHOOK(name)
+
+// Marking the end of a function.
+#define _ENDFUNC(name) \
+ .purgem ENDFUNC; \
+ SIZE_OBJ(name); \
+ ENDFUNC_HOOK(name)
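+
+// A minimal usage sketch (`example' is an illustrative name only):
+//
+//	FUNC(example)
+//		ret
+//	ENDFUNC
+//
+// This expands to the appropriate `.globl', `.type', `.size', and
+// alignment decoration for the target platform.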
+
+///--------------------------------------------------------------------------
+/// ELF-specific hacking.
+
+#if __ELF__
+
+#if __PIC__ || __PIE__
+# define WANT_PIC 1
+#endif
+
+#define TYPE_FUNC(name) .type name, STT_FUNC
+
+#define SIZE_OBJ(name) .size name, . - name
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86-specific hacking.
+
+#if CPUFAM_X86
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 16
+
+// Don't use the wretched AT&T syntax. It's festooned with pointless
+// punctuation, and all of the data movement is backwards. Ugh!
+ .intel_syntax noprefix
+
+// Call external subroutine at ADDR, possibly via PLT.
+ .macro callext addr
+#if WANT_PIC
+ call \addr@PLT
+#else
+ call \addr
+#endif
+ .endm
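+
+// For example, the AESNI code in this change says `callext F(abort)':
+// under WANT_PIC this assembles to `call abort@PLT', and to a plain
+// `call abort' otherwise.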
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC && CPUFAM_X86
+# define NEED_GOT 1
+#endif
+#define GOTREG ebx // Not needed in AMD64 so don't care.
+
+// Maybe load GOT address into GOT.
+ .macro ldgot got=GOTREG
+#if WANT_PIC
+ call _where_am_i.\got
+ add \got, offset _GLOBAL_OFFSET_TABLE_
+#endif
+ .endm
+
+// Maybe build a helper subroutine for `ldgot GOT'.
+ .macro gotaux got=GOTREG
+#if WANT_PIC
+ .align 16
+_where_am_i.\got :
+ mov \got, [esp]
+ ret
+#endif
+ .endm
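+
+// Note that a file whose code uses `ldgot GOT' must also instantiate
+// `gotaux GOT' exactly once for that register, to emit the tiny helper
+// subroutine which `ldgot' calls.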
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
+ .macro leaext reg, addr, got=GOTREG
+#if WANT_PIC
+ mov \reg, [\got + \addr@GOT]
+#else
+ mov \reg, offset \addr
+#endif
+ .endm
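+
+// Typical prologue usage, as in the key-setup code in this change:
+// load the GOT pointer, then find an external table.
+//
+//	ldgot	ecx
+//	leaext	ecx, rijndael_rcon, ecx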
+
+// Address expression (possibly using a base register, and a displacement)
+// referring to ADDR, which is within our module, maybe using GOT.
+#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
+#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
+#if WANT_PIC
+# define INTADDR__1(addr, got) got + addr@GOTOFF
+#else
+# define INTADDR__1(addr, got) addr
+#endif
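+
+// For example, the AESNI code in this change says
+//	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]
+// to reach a table in the same module: with WANT_PIC this becomes
+// `[ecx + endswap_tab@GOTOFF]', and a bare `[endswap_tab]' otherwise.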
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Final stuff.
+
+// Default values for the various hooks.
+#ifndef FUNC_PREHOOK
+# define FUNC_PREHOOK(name)
+#endif
+#ifndef FUNC_POSTHOOK
+# define FUNC_POSTHOOK(name)
+#endif
+#ifndef ENDFUNC_HOOK
+# define ENDFUNC_HOOK(name)
+#endif
+
+#ifndef F
+# define F(name) name
+#endif
+
+#ifndef TYPE_FUNC
+# define TYPE_FUNC(name)
+#endif
+
+#ifndef SIZE_OBJ
+# define SIZE_OBJ(name)
+#endif
+
+///----- That's all, folks --------------------------------------------------
BLKCS += rijndael rijndael192 rijndael256
libsymm_la_SOURCES += rijndael-base.h rijndael-base.c
if CPUFAM_X86
-libsymm_la_SOURCES += rijndael-x86-aesni.s
+libsymm_la_SOURCES += rijndael-x86-aesni.S
endif
libsymm_la_SOURCES += $(precomp)/rijndael-tab.c
PRECOMPS += $(precomp)/rijndael-tab.c
pkginclude_HEADERS += salsa20.h salsa20-core.h
libsymm_la_SOURCES += salsa20.c
if CPUFAM_X86
-libsymm_la_SOURCES += salsa20-x86-sse2.s
+libsymm_la_SOURCES += salsa20-x86-sse2.S
endif
TESTS += salsa20.$t
ALL_CIPHERS += salsa20 salsa2012 salsa208
pkginclude_HEADERS += chacha.h chacha-core.h
libsymm_la_SOURCES += chacha.c
if CPUFAM_X86
-libsymm_la_SOURCES += chacha-x86-sse2.s
+libsymm_la_SOURCES += chacha-x86-sse2.S
endif
TESTS += chacha.$t
EXTRA_DIST += t/chacha
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of ChaCha
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch pentium4
+ .section .text
+
+FUNC(chacha_core_x86_sse2)
+
+ // Initial state. We have three arguments:
+ // [ebp + 8] is the number of rounds to do
+ // [ebp + 12] points to the input matrix
+ // [ebp + 16] points to the output matrix
+ push ebp
+ mov ebp, esp
+ sub esp, 16
+ mov edx, [ebp + 12]
+ and esp, ~15
+
+ // First job is to slurp the matrix into XMM registers. Be careful:
+ // the input matrix isn't likely to be properly aligned.
+ //
+ // [ 0 1 2 3] (a, xmm0)
+ // [ 4 5 6 7] (b, xmm1)
+ // [ 8 9 10 11] (c, xmm2)
+ // [12 13 14 15] (d, xmm3)
+ movdqu xmm0, [edx + 0]
+ movdqu xmm1, [edx + 16]
+ movdqu xmm2, [edx + 32]
+ movdqu xmm3, [edx + 48]
+
+ // Prepare for the main loop.
+ mov ecx, [ebp + 8]
+
+ // Take a copy for later. This one is aligned properly, by
+ // construction.
+ movdqa [esp], xmm0
+ movdqa xmm5, xmm1
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm3
+
+loop:
+ // Apply a column quarterround to each of the columns simultaneously.
+ // Alas, there doesn't seem to be a packed doubleword rotate, so we
+ // have to synthesize it.
+
+ // a += b; d ^= a; d <<<= 16
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 16
+ psrld xmm4, 16
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 12
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm4, xmm1
+ pslld xmm1, 12
+ psrld xmm4, 20
+ por xmm1, xmm4
+
+ // a += b; d ^= a; d <<<= 8
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 8
+ psrld xmm4, 24
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 7
+ paddd xmm2, xmm3
+ pshufd xmm3, xmm3, 0x93
+ pxor xmm1, xmm2
+ pshufd xmm2, xmm2, 0x4e
+ movdqa xmm4, xmm1
+ pslld xmm1, 7
+ psrld xmm4, 25
+ por xmm1, xmm4
+
+ // The not-quite-transpose conveniently only involves reordering
+ // elements of individual rows, which can be done quite easily. It
+ // doesn't involve any movement of elements between rows, or even
+ // renaming of the rows.
+ //
+ // [ 0 1 2 3] [ 0 1 2 3] (a, xmm0)
+ // [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1)
+ // [ 8 9 10 11] [10 11 8 9] (c, xmm2)
+ // [12 13 14 15] [15 12 13 14] (d, xmm3)
+ //
+ // The shuffles have quite high latency, so they've mostly been
+ // pushed upwards. The remaining one can't be moved, though.
+ pshufd xmm1, xmm1, 0x39
+
+ // Apply the diagonal quarterround to each of the columns
+ // simultaneously.
+
+ // a += b; d ^= a; d <<<= 16
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 16
+ psrld xmm4, 16
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 12
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm4, xmm1
+ pslld xmm1, 12
+ psrld xmm4, 20
+ por xmm1, xmm4
+
+ // a += b; d ^= a; d <<<= 8
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 8
+ psrld xmm4, 24
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 7
+ paddd xmm2, xmm3
+ pshufd xmm3, xmm3, 0x39
+ pxor xmm1, xmm2
+ pshufd xmm2, xmm2, 0x4e
+ movdqa xmm4, xmm1
+ pslld xmm1, 7
+ psrld xmm4, 25
+ por xmm1, xmm4
+
+ // Finally, finish off undoing the transpose, and we're done for this
+ // doubleround. Again, most of this was done above so we don't have
+ // to wait for the shuffles.
+ pshufd xmm1, xmm1, 0x93
+
+ // Decrement the loop counter and see if we should go round again.
+ sub ecx, 2
+ ja loop
+
+ // Almost there. Firstly, the feedforward addition.
+ mov edx, [ebp + 16]
+ paddd xmm0, [esp]
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+
+ // And now we write out the result. This one won't be aligned
+ // either.
+ movdqu [edx + 0], xmm0
+ movdqu [edx + 16], xmm1
+ movdqu [edx + 32], xmm2
+ movdqu [edx + 48], xmm3
+
+ // Tidy things up.
+ mov esp, ebp
+ pop ebp
+
+ // And with that, we're done.
+ ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
+++ /dev/null
-### -*- mode: asm; asm-comment-char: ?# -*-
-###
-### Fancy SIMD implementation of ChaCha
-###
-### (c) 2015 Straylight/Edgeware
-###
-
-###----- Licensing notice ---------------------------------------------------
-###
-### This file is part of Catacomb.
-###
-### Catacomb is free software; you can redistribute it and/or modify
-### it under the terms of the GNU Library General Public License as
-### published by the Free Software Foundation; either version 2 of the
-### License, or (at your option) any later version.
-###
-### Catacomb is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU Library General Public License for more details.
-###
-### You should have received a copy of the GNU Library General Public
-### License along with Catacomb; if not, write to the Free
-### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-### MA 02111-1307, USA.
-
- .intel_syntax noprefix
- .arch pentium4
-
- .section .text
-
- .globl chacha_core_x86_sse2
- .type chacha_core_x86_sse2, STT_FUNC
-chacha_core_x86_sse2:
-
- ## Initial state. We have three arguments:
- ## [ebp + 8] is the number of rounds to do
- ## [ebp + 12] points to the input matrix
- ## [ebp + 16] points to the output matrix
- push ebp
- mov ebp, esp
- sub esp, 16
- mov edx, [ebp + 12]
- and esp, ~15
-
- ## First job is to slurp the matrix into XMM registers. Be careful:
- ## the input matrix isn't likely to be properly aligned.
- ##
- ## [ 0 1 2 3] (a, xmm0)
- ## [ 4 5 6 7] (b, xmm0)
- ## [ 8 9 10 11] (c, xmm0)
- ## [12 13 14 15] (d, xmm0)
- movdqu xmm0, [edx + 0]
- movdqu xmm1, [edx + 16]
- movdqu xmm2, [edx + 32]
- movdqu xmm3, [edx + 48]
-
- ## Prepare for the main loop.
- mov ecx, [ebp + 8]
-
- ## Take a copy for later. This one is aligned properly, by
- ## construction.
- movdqa [esp], xmm0
- movdqa xmm5, xmm1
- movdqa xmm6, xmm2
- movdqa xmm7, xmm3
-
-loop:
- ## Apply a column quarterround to each of the columns simultaneously.
- ## Alas, there doesn't seem to be a packed doubleword rotate, so we
- ## have to synthesize it.
-
- ## a += b; d ^= a; d <<<= 16
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm4, xmm3
- pslld xmm3, 16
- psrld xmm4, 16
- por xmm3, xmm4
-
- ## c += d; b ^= c; b <<<= 12
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm4, xmm1
- pslld xmm1, 12
- psrld xmm4, 20
- por xmm1, xmm4
-
- ## a += b; d ^= a; d <<<= 8
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm4, xmm3
- pslld xmm3, 8
- psrld xmm4, 24
- por xmm3, xmm4
-
- ## c += d; b ^= c; b <<<= 7
- paddd xmm2, xmm3
- pshufd xmm3, xmm3, 0x93
- pxor xmm1, xmm2
- pshufd xmm2, xmm2, 0x4e
- movdqa xmm4, xmm1
- pslld xmm1, 7
- psrld xmm4, 25
- por xmm1, xmm4
-
- ## The not-quite-transpose conveniently only involves reordering
- ## elements of individual rows, which can be done quite easily. It
- ## doesn't involve any movement of elements between rows, or even
- ## renaming of the rows.
- ##
- ## [ 0 1 2 3] [ 0 1 2 3] (a, xmm0)
- ## [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1)
- ## [ 8 9 10 11] [10 11 8 9] (c, xmm2)
- ## [12 13 14 15] [15 12 13 14] (d, xmm3)
- ##
- ## The shuffles have quite high latency, so they've mostly been
- ## pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, 0x39
-
- ## Apply the diagonal quarterround to each of the columns
- ## simultaneously.
-
- ## a += b; d ^= a; d <<<= 16
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm4, xmm3
- pslld xmm3, 16
- psrld xmm4, 16
- por xmm3, xmm4
-
- ## c += d; b ^= c; b <<<= 12
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm4, xmm1
- pslld xmm1, 12
- psrld xmm4, 20
- por xmm1, xmm4
-
- ## a += b; d ^= a; d <<<= 8
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm4, xmm3
- pslld xmm3, 8
- psrld xmm4, 24
- por xmm3, xmm4
-
- ## c += d; b ^= c; b <<<= 7
- paddd xmm2, xmm3
- pshufd xmm3, xmm3, 0x39
- pxor xmm1, xmm2
- pshufd xmm2, xmm2, 0x4e
- movdqa xmm4, xmm1
- pslld xmm1, 7
- psrld xmm4, 25
- por xmm1, xmm4
-
- ## Finally, finish off undoing the transpose, and we're done for this
- ## doubleround. Again, most of this was done above so we don't have
- ## to wait for the shuffles.
- pshufd xmm1, xmm1, 0x93
-
- ## Decrement the loop counter and see if we should go round again.
- sub ecx, 2
- ja loop
-
- ## Almost there. Firstly, the feedforward addition.
- mov edx, [ebp + 16]
- paddd xmm0, [esp]
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
-
- ## And now we write out the result. This one won't be aligned
- ## either.
- movdqu [edx + 0], xmm0
- movdqu [edx + 16], xmm1
- movdqu [edx + 32], xmm2
- movdqu [edx + 48], xmm3
-
- ## And with that, we're done.
- mov esp, ebp
- pop ebp
- ret
-
- .size chacha_core_x86_sse2, . - chacha_core_x86_sse2
-
-###----- That's all, folks --------------------------------------------------
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// AESNI-based implementation of Rijndael
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// External symbols.
+
+ .globl F(abort)
+ .globl F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch .aes
+ .section .text
+
+/// The AESNI instructions implement a little-endian version of AES, but
+/// Catacomb's internal interface presents as big-endian so as to work better
+/// with things like GCM. We therefore maintain the round keys in
+/// little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the AESNI instructions don't implement the
+/// larger-block versions of Rijndael, so we have to end-swap the keys if
+/// we're preparing for one of those.
+
+ // Useful constants.
+ .equ maxrounds, 16 // maximum number of rounds
+ .equ maxblksz, 32 // maximum block size, in bytes
+ .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
+
+ // Context structure.
+ .equ nr, 0 // number of rounds
+ .equ w, nr + 4 // encryption key words
+ .equ wi, w + kbufsz // decryption key words
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_x86_aesni)
+
+ // Initial state. We have four arguments:
+ // [esp + 20] is the context pointer
+ // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
+ // [esp + 28] points to the key material, unaligned
+ // [esp + 32] is the size of the key, in words
+ // The key size has already been checked for validity, and the number
+ // of rounds has been computed. Our job is only to fill in the `w'
+ // and `wi' vectors.
+
+ push ebp
+ push ebx
+ push esi
+ push edi
+
+ // The initial round key material is taken directly from the input
+ // key, so copy it over.
+ mov ebp, [esp + 20] // context base pointer
+ mov ebx, [esp + 32] // key size, in words
+ mov ecx, ebx
+ mov esi, [esp + 28]
+ lea edi, [ebp + w]
+ rep movsd
+
+ // Find out other useful things.
+ mov edx, [ebp + nr] // number of rounds
+ add edx, 1
+ imul edx, [esp + 24] // total key size in words
+ sub edx, ebx // offset by the key size
+
+ // Find the round constants.
+ ldgot ecx
+ leaext ecx, rijndael_rcon, ecx
+
+ // Prepare for the main loop.
+ lea esi, [ebp + w]
+ mov eax, [esi + 4*ebx - 4] // most recent key word
+ lea edx, [esi + 4*edx] // limit, offset by one key expansion
+
+ // Main key expansion loop. The first word of each key-length chunk
+ // needs special treatment.
+ //
+ // This is rather tedious because the Intel `AESKEYGENASSIST'
+ // instruction is very strangely shaped. Firstly, it wants to
+ // operate on vast SSE registers, even though we're data-blocked from
+ // doing more than one operation at a time unless we're doing two key
+ // schedules simultaneously -- and even then we can't do more than
+ // two, because the instruction ignores two of its input words
+ // entirely, and produces two different outputs for each of the other
+ // two. And secondly it insists on taking the magic round constant
+ // as an immediate, so it's kind of annoying if you're not
+ // open-coding the whole thing. It's much easier to leave that as
+ // zero and XOR in the round constant by hand.
+9: movd xmm0, eax
+ pshufd xmm0, xmm0, 0x39
+ aeskeygenassist xmm1, xmm0, 0
+ pshufd xmm1, xmm1, 0x93
+ movd eax, xmm1
+ xor eax, [esi]
+ xor al, [ecx]
+ inc ecx
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // The next three words are simple...
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // (Word 2...)
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // (Word 3...)
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // Word 4. If the key is /more/ than 6 words long, then we must
+ // apply a substitution here.
+ cmp ebx, 5
+ jb 9b
+ cmp ebx, 7
+ jb 0f
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0x93
+ aeskeygenassist xmm1, xmm0, 0
+ movd eax, xmm1
+0: xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // (Word 5...)
+ cmp ebx, 6
+ jb 9b
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // (Word 6...)
+ cmp ebx, 7
+ jb 9b
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // (Word 7...)
+ cmp ebx, 8
+ jb 9b
+ xor eax, [esi]
+ mov [esi + 4*ebx], eax
+ add esi, 4
+ cmp esi, edx
+ jae 8f
+
+ // Must be done by now.
+ jmp 9b
+
+ // Next job is to construct the decryption keys. The keys for the
+ // first and last rounds don't need to be mangled, but the remaining
+ // ones do -- and they all need to be reordered too.
+ //
+ // The plan of action, then, is to copy the final encryption round's
+ // keys into place first, then to do each of the intermediate rounds
+ // in reverse order, and finally do the first round.
+ //
+ // Do all of the heavy lifting with SSE registers. The order we're
+ // doing this in means that it's OK if we read or write too much, and
+ // there's easily enough buffer space for the over-enthusiastic reads
+ // and writes because the context has space for 32-byte blocks, which
+ // is our maximum and an exact fit for two SSE registers.
+8: mov ecx, [ebp + nr] // number of rounds
+ mov ebx, [esp + 24] // block size (in words)
+ mov edx, ecx
+ imul edx, ebx
+ lea edi, [ebp + wi]
+ lea esi, [ebp + 4*edx + w] // last round's keys
+ shl ebx, 2 // block size (in bytes now)
+
+ // Copy the last encryption round's keys.
+ movdqu xmm0, [esi]
+ movdqu [edi], xmm0
+ cmp ebx, 16
+ jbe 9f
+ movdqu xmm0, [esi + 16]
+ movdqu [edi + 16], xmm0
+
+ // Update the loop variables and stop if we've finished.
+9: add edi, ebx
+ sub esi, ebx
+ sub ecx, 1
+ jbe 0f
+
+ // Do another middle round's keys...
+ movdqu xmm0, [esi]
+ aesimc xmm0, xmm0
+ movdqu [edi], xmm0
+ cmp ebx, 16
+ jbe 9b
+ movdqu xmm0, [esi + 16]
+ aesimc xmm0, xmm0
+ movdqu [edi + 16], xmm0
+ jmp 9b
+
+ // Finally do the first encryption round.
+0: movdqu xmm0, [esi]
+ movdqu [edi], xmm0
+ cmp ebx, 16
+ jbe 0f
+ movdqu xmm0, [esi + 16]
+ movdqu [edi + 16], xmm0
+
+ // If the block size is not exactly four words then we must end-swap
+ // everything. We can use fancy SSE toys for this.
+0: cmp ebx, 16
+ je 0f
+
+ // Find the byte-reordering table.
+ ldgot ecx
+ movdqa xmm7, [INTADDR(endswap_tab, ecx)]
+
+ // Calculate the number of subkey words again. (It's a good job
+ // we've got a fast multiplier.)
+ mov ecx, [ebp + nr]
+ add ecx, 1
+ imul ecx, [esp + 24] // total keys in words
+
+ // End-swap the encryption keys.
+ mov eax, ecx
+ lea esi, [ebp + w]
+ call endswap_block
+
+ // And the decryption keys.
+ mov ecx, eax
+ lea esi, [ebp + wi]
+ call endswap_block
+
+ // All done.
+0: pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+ .align 16
+endswap_block:
+ // End-swap ECX words starting at ESI. The end-swapping table is
+ // already loaded into XMM7; and it's OK to work in 16-byte chunks.
+ movdqu xmm1, [esi]
+ pshufb xmm1, xmm7
+ movdqu [esi], xmm1
+ add esi, 16
+ sub ecx, 4
+ ja endswap_block
+ ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Encrypting and decrypting blocks.
+
+FUNC(rijndael_eblk_x86_aesni)
+
+ // On entry, we have:
+ // [esp + 4] points to the context block
+ // [esp + 8] points to the input data block
+ // [esp + 12] points to the output buffer
+
+ // Find the magic endianness-swapping table.
+ ldgot ecx
+ movdqa xmm7, [INTADDR(endswap_tab, ecx)]
+
+ // Load the input block and end-swap it. Also, start loading the
+ // keys.
+ mov eax, [esp + 8]
+ movdqu xmm0, [eax]
+ pshufb xmm0, xmm7
+ mov eax, [esp + 4]
+ lea edx, [eax + w]
+ mov eax, [eax + nr]
+
+ // Initial whitening.
+ movdqu xmm1, [edx]
+ add edx, 16
+ pxor xmm0, xmm1
+
+ // Dispatch to the correct code.
+ cmp eax, 10
+ je er10
+ jb bogus
+ cmp eax, 14
+ je er14
+ ja bogus
+ cmp eax, 12
+ je er12
+ jb er11
+ jmp er13
+
+ .align 2
+
+ // 14 rounds...
+er14: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ // 13 rounds...
+er13: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ // 12 rounds...
+er12: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ // 11 rounds...
+er11: movdqu xmm1, [edx]
+ add edx, 16
+ aesenc xmm0, xmm1
+
+ // 10 rounds...
+er10: movdqu xmm1, [edx]
+ aesenc xmm0, xmm1
+
+ // 9 rounds...
+ movdqu xmm1, [edx + 16]
+ aesenc xmm0, xmm1
+
+ // 8 rounds...
+ movdqu xmm1, [edx + 32]
+ aesenc xmm0, xmm1
+
+ // 7 rounds...
+ movdqu xmm1, [edx + 48]
+ aesenc xmm0, xmm1
+
+ // 6 rounds...
+ movdqu xmm1, [edx + 64]
+ aesenc xmm0, xmm1
+
+ // 5 rounds...
+ movdqu xmm1, [edx + 80]
+ aesenc xmm0, xmm1
+
+ // 4 rounds...
+ movdqu xmm1, [edx + 96]
+ aesenc xmm0, xmm1
+
+ // 3 rounds...
+ movdqu xmm1, [edx + 112]
+ aesenc xmm0, xmm1
+
+ // 2 rounds...
+ movdqu xmm1, [edx + 128]
+ aesenc xmm0, xmm1
+
+ // Final round...
+ movdqu xmm1, [edx + 144]
+ aesenclast xmm0, xmm1
+
+ // Unpermute the ciphertext block and store it.
+ pshufb xmm0, xmm7
+ mov eax, [esp + 12]
+ movdqu [eax], xmm0
+
+ // And we're done.
+ ret
+
+ENDFUNC
+
+FUNC(rijndael_dblk_x86_aesni)
+
+ // On entry, we have:
+ // [esp + 4] points to the context block
+ // [esp + 8] points to the input data block
+ // [esp + 12] points to the output buffer
+
+ // Find the magic endianness-swapping table.
+ ldgot ecx
+ movdqa xmm7, [INTADDR(endswap_tab, ecx)]
+
+ // Load the input block and end-swap it. Also, start loading the
+ // keys.
+ mov eax, [esp + 8]
+ movdqu xmm0, [eax]
+ pshufb xmm0, xmm7
+ mov eax, [esp + 4]
+ lea edx, [eax + wi]
+ mov eax, [eax + nr]
+
+ // Initial whitening.
+ movdqu xmm1, [edx]
+ add edx, 16
+ pxor xmm0, xmm1
+
+ // Dispatch to the correct code.
+ cmp eax, 10
+ je dr10
+ jb bogus
+ cmp eax, 14
+ je dr14
+ ja bogus
+ cmp eax, 12
+ je dr12
+ jb dr11
+ jmp dr13
+
+ .align 2
+
+ // 14 rounds...
+dr14: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ // 13 rounds...
+dr13: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ // 12 rounds...
+dr12: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ // 11 rounds...
+dr11: movdqu xmm1, [edx]
+ add edx, 16
+ aesdec xmm0, xmm1
+
+ // 10 rounds...
+dr10: movdqu xmm1, [edx]
+ aesdec xmm0, xmm1
+
+ // 9 rounds...
+ movdqu xmm1, [edx + 16]
+ aesdec xmm0, xmm1
+
+ // 8 rounds...
+ movdqu xmm1, [edx + 32]
+ aesdec xmm0, xmm1
+
+ // 7 rounds...
+ movdqu xmm1, [edx + 48]
+ aesdec xmm0, xmm1
+
+ // 6 rounds...
+ movdqu xmm1, [edx + 64]
+ aesdec xmm0, xmm1
+
+ // 5 rounds...
+ movdqu xmm1, [edx + 80]
+ aesdec xmm0, xmm1
+
+ // 4 rounds...
+ movdqu xmm1, [edx + 96]
+ aesdec xmm0, xmm1
+
+ // 3 rounds...
+ movdqu xmm1, [edx + 112]
+ aesdec xmm0, xmm1
+
+ // 2 rounds...
+ movdqu xmm1, [edx + 128]
+ aesdec xmm0, xmm1
+
+ // Final round...
+ movdqu xmm1, [edx + 144]
+ aesdeclast xmm0, xmm1
+
+ // Unpermute the ciphertext block and store it.
+ pshufb xmm0, xmm7
+ mov eax, [esp + 12]
+ movdqu [eax], xmm0
+
+ // And we're done.
+ ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Random utilities.
+
+ .align 16
+ // Abort the process because of a programming error. Indirecting
+ // through this point serves several purposes: (a) by CALLing, rather
+ // than branching to, `abort', we can save the return address, which
+ // might at least provide a hint as to what went wrong; (b) we don't
+ // have conditional CALLs (and they'd be big anyway); and (c) we can
+ // write a HLT here as a backstop against `abort' being mad.
+bogus: callext F(abort)
+0: hlt
+ jmp 0b
+
+ gotaux ecx
+
+///--------------------------------------------------------------------------
+/// Data tables.
+
+ .align 16
+endswap_tab:
+ .byte 3, 2, 1, 0
+ .byte 7, 6, 5, 4
+ .byte 11, 10, 9, 8
+ .byte 15, 14, 13, 12
+
+///----- That's all, folks --------------------------------------------------
+++ /dev/null
-### -*- mode: asm; asm-comment-char: ?# -*-
-###
-### AESNI-based implementation of Rijndael
-###
-### (c) 2015 Straylight/Edgeware
-###
-
-###----- Licensing notice ---------------------------------------------------
-###
-### This file is part of Catacomb.
-###
-### Catacomb is free software; you can redistribute it and/or modify
-### it under the terms of the GNU Library General Public License as
-### published by the Free Software Foundation; either version 2 of the
-### License, or (at your option) any later version.
-###
-### Catacomb is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU Library General Public License for more details.
-###
-### You should have received a copy of the GNU Library General Public
-### License along with Catacomb; if not, write to the Free
-### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-### MA 02111-1307, USA.
-
- .intel_syntax noprefix
- .arch .aes
-
- .globl abort
- .globl rijndael_rcon
-
- .section .text
-
-### The AESNI instructions implement a little-endian version of AES, but
-### Catacomb's internal interface presents as big-endian so as to work better
-### with things like GCM. We therefore maintain the round keys in
-### little-endian form, and have to end-swap blocks in and out.
-###
-### For added amusement, the AESNI instructions don't implement the
-### larger-block versions of Rijndael, so we have to end-swap the keys if
-### we're preparing for one of those.
-
- ## Useful constants.
- .equ maxrounds, 16 # maximum number of rounds
- .equ maxblksz, 32 # maximum block size, in bytes
- .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer
-
- ## Context structure.
- .equ nr, 0 # number of rounds
- .equ w, nr + 4 # encryption key words
- .equ wi, w + kbufsz # decryption key words
-
-###--------------------------------------------------------------------------
-### Key setup.
-
- .globl rijndael_setup_x86_aesni
- .type rijndael_setup_x86_aesni, STT_FUNC
- .align 16
-rijndael_setup_x86_aesni:
-
- ## Initial state. We have four arguments:
- ## [esp + 20] is the context pointer
- ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
- ## [esp + 28] points to the key material, unaligned
- ## [esp + 32] is the size of the key, in words
- ## The key size has already been checked for validity, and the number
- ## of rounds has been computed. Our job is only to fill in the `w'
- ## and `wi' vectors.
-
- push ebp
- push ebx
- push esi
- push edi
-
- ## The initial round key material is taken directly from the input
- ## key, so copy it over.
- mov ebp, [esp + 20] # context base pointer
- mov ebx, [esp + 32] # key size, in words
- mov ecx, ebx
- mov esi, [esp + 28]
- lea edi, [ebp + w]
- rep movsd
-
- ## Find out other useful things.
- mov edx, [ebp + nr] # number of rounds
- add edx, 1
- imul edx, [esp + 24] # total key size in words
- sub edx, ebx # offset by the key size
-
- ## Find the round constants.
- call where_am_i_ecx
- add ecx, offset _GLOBAL_OFFSET_TABLE_
- mov ecx, [ecx + rijndael_rcon@GOT]
-
- ## Prepare for the main loop.
- lea esi, [ebp + w]
- mov eax, [esi + 4*ebx - 4] # most recent key word
- lea edx, [esi + 4*edx] # limit, offset by one key expansion
-
- ## Main key expansion loop. The first word of each key-length chunk
- ## needs special treatment.
- ##
- ## This is rather tedious because the Intel `AESKEYGENASSIST'
- ## instruction is very strangely shaped. Firstly, it wants to
- ## operate on vast SSE registers, even though we're data-blocked from
- ## doing more than operation at a time unless we're doing two key
- ## schedules simultaneously -- and even then we can't do more than
- ## two, because the instruction ignores two of its input words
- ## entirely, and produces two different outputs for each of the other
- ## two. And secondly it insists on taking the magic round constant
- ## as an immediate, so it's kind of annoying if you're not
- ## open-coding the whole thing. It's much easier to leave that as
- ## zero and XOR in the round constant by hand.
-9: movd xmm0, eax
- pshufd xmm0, xmm0, 0x39
- aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, 0x93
- movd eax, xmm1
- xor eax, [esi]
- xor al, [ecx]
- inc ecx
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## The next three words are simple...
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 2...)
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 3...)
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## Word 4. If the key is /more/ than 6 words long, then we must
- ## apply a substitution here.
- cmp ebx, 5
- jb 9b
- cmp ebx, 7
- jb 0f
- movd xmm0, eax
- pshufd xmm0, xmm0, 0x93
- aeskeygenassist xmm1, xmm0, 0
- movd eax, xmm1
-0: xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 5...)
- cmp ebx, 6
- jb 9b
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 6...)
- cmp ebx, 7
- jb 9b
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 7...)
- cmp ebx, 8
- jb 9b
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## Must be done by now.
- jmp 9b
-
- ## Next job is to construct the decryption keys. The keys for the
- ## first and last rounds don't need to be mangled, but the remaining
- ## ones do -- and they all need to be reordered too.
- ##
- ## The plan of action, then, is to copy the final encryption round's
- ## keys into place first, then to do each of the intermediate rounds
- ## in reverse order, and finally do the first round.
- ##
- ## Do all of the heavy lifting with SSE registers. The order we're
- ## doing this in means that it's OK if we read or write too much, and
- ## there's easily enough buffer space for the over-enthusiastic reads
- ## and writes because the context has space for 32-byte blocks, which
- ## is our maximum and an exact fit for two SSE registers.
-8: mov ecx, [ebp + nr] # number of rounds
- mov ebx, [esp + 24] # block size (in words)
- mov edx, ecx
- imul edx, ebx
- lea edi, [ebp + wi]
- lea esi, [ebp + 4*edx + w] # last round's keys
- shl ebx, 2 # block size (in bytes now)
-
- ## Copy the last encryption round's keys.
- movdqu xmm0, [esi]
- movdqu [edi], xmm0
- cmp ebx, 16
- jbe 9f
- movdqu xmm0, [esi + 16]
- movdqu [edi + 16], xmm0
-
- ## Update the loop variables and stop if we've finished.
-9: add edi, ebx
- sub esi, ebx
- sub ecx, 1
- jbe 0f
-
- ## Do another middle round's keys...
- movdqu xmm0, [esi]
- aesimc xmm0, xmm0
- movdqu [edi], xmm0
- cmp ebx, 16
- jbe 9b
- movdqu xmm0, [esi + 16]
- aesimc xmm0, xmm0
- movdqu [edi + 16], xmm0
- jmp 9b
-
- ## Finally do the first encryption round.
-0: movdqu xmm0, [esi]
- movdqu [edi], xmm0
- cmp ebx, 16
- jbe 0f
- movdqu xmm0, [esi + 16]
- movdqu [edi + 16], xmm0
-
- ## If the block size is not exactly four words then we must end-swap
- ## everything. We can use fancy SSE toys for this.
-0: cmp ebx, 16
- je 0f
-
- ## Find the byte-reordering table.
- call where_am_i_ecx
- movdqa xmm7, [ecx + endswap_tab - .]
-
- ## Calculate the number of subkey words again. (It's a good job
- ## we've got a fast multiplier.)
- mov ecx, [ebp + nr]
- add ecx, 1
- imul ecx, [esp + 24] # total keys in words
-
- ## End-swap the encryption keys.
- mov eax, ecx
- lea esi, [ebp + w]
- call endswap_block
-
- ## And the decryption keys.
- mov ecx, eax
- lea esi, [ebp + wi]
- call endswap_block
-
- ## All done.
-0: pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-
- .align 16
-endswap_block:
- ## End-swap ECX words starting at ESI. The end-swapping table is
- ## already loaded into XMM7; and it's OK to work in 16-byte chunks.
- movdqu xmm1, [esi]
- pshufb xmm1, xmm7
- movdqu [esi], xmm1
- add esi, 16
- sub ecx, 4
- ja endswap_block
- ret
-
- .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni
-
-###--------------------------------------------------------------------------
-### Encrypting and decrypting blocks.
-
- .globl rijndael_eblk_x86_aesni
- .type rijndael_eblk_x86_aesni, STT_FUNC
- .align 16
-rijndael_eblk_x86_aesni:
-
- ## On entry, we have:
- ## [esp + 4] points to the context block
- ## [esp + 8] points to the input data block
- ## [esp + 12] points to the output buffer
-
- ## Find the magic endianness-swapping table.
- call where_am_i_ecx
- movdqa xmm7, [ecx + endswap_tab - .]
-
- ## Load the input block and end-swap it. Also, start loading the
- ## keys.
- mov eax, [esp + 8]
- movdqu xmm0, [eax]
- pshufb xmm0, xmm7
- mov eax, [esp + 4]
- lea edx, [eax + w]
- mov eax, [eax + nr]
-
- ## Initial whitening.
- movdqu xmm1, [edx]
- add edx, 16
- pxor xmm0, xmm1
-
- ## Dispatch to the correct code.
- cmp eax, 10
- je er10
- jb bogus
- cmp eax, 14
- je er14
- ja bogus
- cmp eax, 12
- je er12
- jb er11
- jmp er13
-
- .align 2
-
- ## 14 rounds...
-er14: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 13 rounds...
-er13: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 12 rounds...
-er12: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 11 rounds...
-er11: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 10 rounds...
-er10: movdqu xmm1, [edx]
- aesenc xmm0, xmm1
-
- ## 9 rounds...
- movdqu xmm1, [edx + 16]
- aesenc xmm0, xmm1
-
- ## 8 rounds...
- movdqu xmm1, [edx + 32]
- aesenc xmm0, xmm1
-
- ## 7 rounds...
- movdqu xmm1, [edx + 48]
- aesenc xmm0, xmm1
-
- ## 6 rounds...
- movdqu xmm1, [edx + 64]
- aesenc xmm0, xmm1
-
- ## 5 rounds...
- movdqu xmm1, [edx + 80]
- aesenc xmm0, xmm1
-
- ## 4 rounds...
- movdqu xmm1, [edx + 96]
- aesenc xmm0, xmm1
-
- ## 3 rounds...
- movdqu xmm1, [edx + 112]
- aesenc xmm0, xmm1
-
- ## 2 rounds...
- movdqu xmm1, [edx + 128]
- aesenc xmm0, xmm1
-
- ## Final round...
- movdqu xmm1, [edx + 144]
- aesenclast xmm0, xmm1
-
- ## Unpermute the ciphertext block and store it.
- pshufb xmm0, xmm7
- mov eax, [esp + 12]
- movdqu [eax], xmm0
-
- ## And we're done.
- ret
-
- .size rijndael_eblk_x86_aesni, . - rijndael_dblk_x86_aesni
-
- .globl rijndael_dblk_x86_aesni
- .type rijndael_dblk_x86_aesni, STT_FUNC
- .align 16
-rijndael_dblk_x86_aesni:
-
- ## On entry, we have:
- ## [esp + 4] points to the context block
- ## [esp + 8] points to the input data block
- ## [esp + 12] points to the output buffer
-
- ## Find the magic endianness-swapping table.
- call where_am_i_ecx
- movdqa xmm7, [ecx + endswap_tab - .]
-
- ## Load the input block and end-swap it. Also, start loading the
- ## keys.
- mov eax, [esp + 8]
- movdqu xmm0, [eax]
- pshufb xmm0, xmm7
- mov eax, [esp + 4]
- lea edx, [eax + wi]
- mov eax, [eax + nr]
-
- ## Initial whitening.
- movdqu xmm1, [edx]
- add edx, 16
- pxor xmm0, xmm1
-
- ## Dispatch to the correct code.
- cmp eax, 10
- je dr10
- jb bogus
- cmp eax, 14
- je dr14
- ja bogus
- cmp eax, 12
- je dr12
- jb dr11
- jmp dr13
-
- .align 2
-
- ## 14 rounds...
-dr14: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 13 rounds...
-dr13: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 12 rounds...
-dr12: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 11 rounds...
-dr11: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 10 rounds...
-dr10: movdqu xmm1, [edx]
- aesdec xmm0, xmm1
-
- ## 9 rounds...
- movdqu xmm1, [edx + 16]
- aesdec xmm0, xmm1
-
- ## 8 rounds...
- movdqu xmm1, [edx + 32]
- aesdec xmm0, xmm1
-
- ## 7 rounds...
- movdqu xmm1, [edx + 48]
- aesdec xmm0, xmm1
-
- ## 6 rounds...
- movdqu xmm1, [edx + 64]
- aesdec xmm0, xmm1
-
- ## 5 rounds...
- movdqu xmm1, [edx + 80]
- aesdec xmm0, xmm1
-
- ## 4 rounds...
- movdqu xmm1, [edx + 96]
- aesdec xmm0, xmm1
-
- ## 3 rounds...
- movdqu xmm1, [edx + 112]
- aesdec xmm0, xmm1
-
- ## 2 rounds...
- movdqu xmm1, [edx + 128]
- aesdec xmm0, xmm1
-
- ## Final round...
- movdqu xmm1, [edx + 144]
- aesdeclast xmm0, xmm1
-
- ## Unpermute the ciphertext block and store it.
- pshufb xmm0, xmm7
- mov eax, [esp + 12]
- movdqu [eax], xmm0
-
- ## And we're done.
- ret
-
- .size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni
-
-###--------------------------------------------------------------------------
-### Random utilities.
-
- .align 16
- ## Abort the process because of a programming error. Indirecting
- ## through this point serves several purposes: (a) by CALLing, rather
- ## than branching to, `abort', we can save the return address, which
- ## might at least provide a hint as to what went wrong; (b) we don't
- ## have conditional CALLs (and they'd be big anyway); and (c) we can
- ## write a HLT here as a backstop against `abort' being mad.
-bogus: call abort@PLT
-0: hlt
- jmp 0b
-
- .align 16
- ## Return the address of the instruction following the CALL here in
- ## ECX. This is useful for doing position-independent addressing.
-where_am_i_ecx:
- mov ecx, [esp]
- ret
-
-###--------------------------------------------------------------------------
-### Data tables.
-
- .align 16
-endswap_tab:
- .byte 3, 2, 1, 0
- .byte 7, 6, 5, 4
- .byte 11, 10, 9, 8
- .byte 15, 14, 13, 12
-
-###----- That's all, folks --------------------------------------------------
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch pentium4
+ .section .text
+
+FUNC(salsa20_core_x86_sse2)
+
+ // Initial state. We have three arguments:
+ // [ebp + 8] is the number of rounds to do
+ // [ebp + 12] points to the input matrix
+ // [ebp + 16] points to the output matrix
+ push ebp
+ mov ebp, esp
+ sub esp, 32
+ mov edx, [ebp + 12]
+ and esp, ~15
+
+ // Prepare for the main loop.
+ mov ecx, [ebp + 8]
+
+ // First job is to slurp the matrix into XMM registers. The words
+ // have already been permuted conveniently to make them line up
+ // better for SIMD processing.
+ //
+ // The textbook arrangement of the matrix is this.
+ //
+ // [C K K K]
+ // [K C N N]
+ // [T T C K]
+ // [K K K C]
+ //
+ // But we've rotated the columns up so that the main diagonal with
+ // the constants on it ends up in the first row, giving something more
+ // like
+ //
+ // [C C C C]
+ // [K T K K]
+ // [T K K N]
+ // [K K N K]
+ //
+ // so the transformation looks like this:
+ //
+ // [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
+ // [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
+ // [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
+ // [12 13 14 15] [12 1 6 11] (d, xmm3)
+ movdqu xmm0, [edx + 0]
+ movdqu xmm1, [edx + 16]
+ movdqu xmm2, [edx + 32]
+ movdqu xmm3, [edx + 48]
+
+ // Take a copy for later.
+ movdqa [esp + 0], xmm0
+ movdqa [esp + 16], xmm1
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm3
+
+loop:
+
+ // Apply a column quarterround to each of the columns simultaneously.
+ // Alas, there doesn't seem to be a packed doubleword rotate, so we
+ // have to synthesize it.
+
+ // b ^= (a + d) <<< 7
+ movdqa xmm4, xmm0
+ paddd xmm4, xmm3
+ movdqa xmm5, xmm4
+ pslld xmm4, 7
+ psrld xmm5, 25
+ por xmm4, xmm5
+ pxor xmm1, xmm4
+
+ // c ^= (b + a) <<< 9
+ movdqa xmm4, xmm1
+ paddd xmm4, xmm0
+ movdqa xmm5, xmm4
+ pslld xmm4, 9
+ psrld xmm5, 23
+ por xmm4, xmm5
+ pxor xmm2, xmm4
+
+ // d ^= (c + b) <<< 13
+ movdqa xmm4, xmm2
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm1, 0x93
+ movdqa xmm5, xmm4
+ pslld xmm4, 13
+ psrld xmm5, 19
+ por xmm4, xmm5
+ pxor xmm3, xmm4
+
+ // a ^= (d + c) <<< 18
+ movdqa xmm4, xmm3
+ pshufd xmm3, xmm3, 0x39
+ paddd xmm4, xmm2
+ pshufd xmm2, xmm2, 0x4e
+ movdqa xmm5, xmm4
+ pslld xmm4, 18
+ psrld xmm5, 14
+ por xmm4, xmm5
+ pxor xmm0, xmm4
+
+ // The transpose conveniently only involves reordering elements of
+ // individual rows, which can be done quite easily, and reordering
+ // the rows themselves, which is a trivial renaming. It doesn't
+ // involve any movement of elements between rows.
+ //
+ // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
+ // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
+ // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
+ // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
+ //
+ // The shuffles have quite high latency, so they've been pushed
+ // backwards into the main instruction list.
+
+ // Apply the row quarterround to each of the columns (yes!)
+ // simultaneously.
+
+ // b ^= (a + d) <<< 7
+ movdqa xmm4, xmm0
+ paddd xmm4, xmm1
+ movdqa xmm5, xmm4
+ pslld xmm4, 7
+ psrld xmm5, 25
+ por xmm4, xmm5
+ pxor xmm3, xmm4
+
+ // c ^= (b + a) <<< 9
+ movdqa xmm4, xmm3
+ paddd xmm4, xmm0
+ movdqa xmm5, xmm4
+ pslld xmm4, 9
+ psrld xmm5, 23
+ por xmm4, xmm5
+ pxor xmm2, xmm4
+
+ // d ^= (c + b) <<< 13
+ movdqa xmm4, xmm2
+ paddd xmm4, xmm3
+ pshufd xmm3, xmm3, 0x93
+ movdqa xmm5, xmm4
+ pslld xmm4, 13
+ psrld xmm5, 19
+ por xmm4, xmm5
+ pxor xmm1, xmm4
+
+ // a ^= (d + c) <<< 18
+ movdqa xmm4, xmm1
+ pshufd xmm1, xmm1, 0x39
+ paddd xmm4, xmm2
+ pshufd xmm2, xmm2, 0x4e
+ movdqa xmm5, xmm4
+ pslld xmm4, 18
+ psrld xmm5, 14
+ por xmm4, xmm5
+ pxor xmm0, xmm4
+
+ // We have now undone the transpose, ready for the next loop; again,
+ // the shuffles were pushed backwards because they take a long time
+ // coming through. Decrement the loop counter and see if we should
+ // go round again. (Recent processors fuse this pair of instructions
+ // into a single uop.)
+ sub ecx, 2
+ ja loop
+
+ // Almost there. Firstly, the feedforward addition, and then we have
+ // to write out the result. Here we have to undo the permutation
+ // which was already applied to the input. Shuffling has quite high
+ // latency, so arrange to start a new shuffle into a temporary as
+ // soon as we've written out the old value.
+ mov edx, [ebp + 16]
+
+ paddd xmm0, [esp + 0]
+ pshufd xmm4, xmm0, 0x39
+ movd [edx + 0], xmm0
+
+ paddd xmm1, [esp + 16]
+ pshufd xmm5, xmm1, 0x93
+ movd [edx + 16], xmm1
+
+ paddd xmm2, xmm6
+ pshufd xmm6, xmm2, 0x4e
+ movd [edx + 32], xmm2
+
+ paddd xmm3, xmm7
+ pshufd xmm7, xmm3, 0x39
+ movd [edx + 48], xmm3
+
+ movd [edx + 4], xmm7
+ pshufd xmm7, xmm3, 0x4e
+ movd [edx + 24], xmm7
+ pshufd xmm3, xmm3, 0x93
+ movd [edx + 44], xmm3
+
+ movd [edx + 8], xmm6
+ pshufd xmm6, xmm2, 0x93
+ movd [edx + 28], xmm6
+ pshufd xmm2, xmm2, 0x39
+ movd [edx + 52], xmm2
+
+ movd [edx + 12], xmm5
+ pshufd xmm5, xmm1, 0x39
+ movd [edx + 36], xmm5
+ pshufd xmm1, xmm1, 0x4e
+ movd [edx + 56], xmm1
+
+ movd [edx + 20], xmm4
+ pshufd xmm4, xmm0, 0x4e
+ movd [edx + 40], xmm4
+ pshufd xmm0, xmm0, 0x93
+ movd [edx + 60], xmm0
+
+ // Tidy things up.
+ mov esp, ebp
+ pop ebp
+
+ // And with that, we're done.
+ ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
+++ /dev/null
-### -*- mode: asm; asm-comment-char: ?# -*-
-###
-### Fancy SIMD implementation of Salsa20
-###
-### (c) 2015 Straylight/Edgeware
-###
-
-###----- Licensing notice ---------------------------------------------------
-###
-### This file is part of Catacomb.
-###
-### Catacomb is free software; you can redistribute it and/or modify
-### it under the terms of the GNU Library General Public License as
-### published by the Free Software Foundation; either version 2 of the
-### License, or (at your option) any later version.
-###
-### Catacomb is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU Library General Public License for more details.
-###
-### You should have received a copy of the GNU Library General Public
-### License along with Catacomb; if not, write to the Free
-### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-### MA 02111-1307, USA.
-
- .intel_syntax noprefix
- .arch pentium4
-
- .section .text
-
- .globl salsa20_core_x86_sse2
- .type salsa20_core_x86_sse2, STT_FUNC
-salsa20_core_x86_sse2:
-
- ## Initial state. We have three arguments:
- ## [ebp + 8] is the number of rounds to do
- ## [ebp + 12] points to the input matrix
- ## [ebp + 16] points to the output matrix
- push ebp
- mov ebp, esp
- sub esp, 32
- mov edx, [ebp + 12]
- and esp, ~15
-
- ## Prepare for the main loop.
- mov ecx, [ebp + 8]
-
- ## First job is to slurp the matrix into XMM registers. The words
- ## have already been permuted conveniently to make them line up
- ## better for SIMD processing.
- ##
- ## The textbook arrangement of the matrix is this.
- ##
- ## [C K K K]
- ## [K C N N]
- ## [T T C K]
- ## [K K K C]
- ##
- ## But we've rotated the columns up so that the main diagonal with
- ## the constants on it end up in the first row, giving something more
- ## like
- ##
- ## [C C C C]
- ## [K T K K]
- ## [T K K N]
- ## [K K N K]
- ##
- ## so the transformation looks like this:
- ##
- ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
- ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
- ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
- ## [12 13 14 15] [12 1 6 11] (d, xmm3)
- movdqu xmm0, [edx + 0]
- movdqu xmm1, [edx + 16]
- movdqu xmm2, [edx + 32]
- movdqu xmm3, [edx + 48]
-
- ## Take a copy for later.
- movdqa [esp + 0], xmm0
- movdqa [esp + 16], xmm1
- movdqa xmm6, xmm2
- movdqa xmm7, xmm3
-
-loop:
-
- ## Apply a column quarterround to each of the columns simultaneously.
- ## Alas, there doesn't seem to be a packed doubleword rotate, so we
- ## have to synthesize it.
-
- ## b ^= (a + d) <<< 7
- movdqa xmm4, xmm0
- paddd xmm4, xmm3
- movdqa xmm5, xmm4
- pslld xmm4, 7
- psrld xmm5, 25
- por xmm4, xmm5
- pxor xmm1, xmm4
-
- ## c ^= (b + a) <<< 9
- movdqa xmm4, xmm1
- paddd xmm4, xmm0
- movdqa xmm5, xmm4
- pslld xmm4, 9
- psrld xmm5, 23
- por xmm4, xmm5
- pxor xmm2, xmm4
-
- ## d ^= (c + b) <<< 13
- movdqa xmm4, xmm2
- paddd xmm4, xmm1
- pshufd xmm1, xmm1, 0x93
- movdqa xmm5, xmm4
- pslld xmm4, 13
- psrld xmm5, 19
- por xmm4, xmm5
- pxor xmm3, xmm4
-
- ## a ^= (d + c) <<< 18
- movdqa xmm4, xmm3
- pshufd xmm3, xmm3, 0x39
- paddd xmm4, xmm2
- pshufd xmm2, xmm2, 0x4e
- movdqa xmm5, xmm4
- pslld xmm4, 18
- psrld xmm5, 14
- por xmm4, xmm5
- pxor xmm0, xmm4
-
- ## The transpose conveniently only involves reordering elements of
- ## individual rows, which can be done quite easily, and reordering
- ## the rows themselves, which is a trivial renaming. It doesn't
- ## involve any movement of elements between rows.
- ##
- ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
- ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
- ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
- ## [12 1 6 11] [ 3 4 9 14] (d, xmm1)
- ##
- ## The shuffles have quite high latency, so they've been pushed
- ## backwards into the main instruction list.
-
- ## Apply the row quarterround to each of the columns (yes!)
- ## simultaneously.
-
- ## b ^= (a + d) <<< 7
- movdqa xmm4, xmm0
- paddd xmm4, xmm1
- movdqa xmm5, xmm4
- pslld xmm4, 7
- psrld xmm5, 25
- por xmm4, xmm5
- pxor xmm3, xmm4
-
- ## c ^= (b + a) <<< 9
- movdqa xmm4, xmm3
- paddd xmm4, xmm0
- movdqa xmm5, xmm4
- pslld xmm4, 9
- psrld xmm5, 23
- por xmm4, xmm5
- pxor xmm2, xmm4
-
- ## d ^= (c + b) <<< 13
- movdqa xmm4, xmm2
- paddd xmm4, xmm3
- pshufd xmm3, xmm3, 0x93
- movdqa xmm5, xmm4
- pslld xmm4, 13
- psrld xmm5, 19
- por xmm4, xmm5
- pxor xmm1, xmm4
-
- ## a ^= (d + c) <<< 18
- movdqa xmm4, xmm1
- pshufd xmm1, xmm1, 0x39
- paddd xmm4, xmm2
- pshufd xmm2, xmm2, 0x4e
- movdqa xmm5, xmm4
- pslld xmm4, 18
- psrld xmm5, 14
- por xmm4, xmm5
- pxor xmm0, xmm4
-
- ## We had to undo the transpose ready for the next loop. Again, push
- ## back the shuffles because they take a long time coming through.
- ## Decrement the loop counter and see if we should go round again.
- ## Later processors fuse this pair into a single uop.
- sub ecx, 2
- ja loop
-
- ## Almost there. Firstly, the feedforward addition, and then we have
- ## to write out the result. Here we have to undo the permutation
- ## which was already applied to the input. Shuffling has quite high
- ## latency, so arrange to start a new shuffle into a temporary as
- ## soon as we've written out the old value.
- mov edx, [ebp + 16]
-
- paddd xmm0, [esp + 0]
- pshufd xmm4, xmm0, 0x39
- movd [edx + 0], xmm0
-
- paddd xmm1, [esp + 16]
- pshufd xmm5, xmm1, 0x93
- movd [edx + 16], xmm1
-
- paddd xmm2, xmm6
- pshufd xmm6, xmm2, 0x4e
- movd [edx + 32], xmm2
-
- paddd xmm3, xmm7
- pshufd xmm7, xmm3, 0x39
- movd [edx + 48], xmm3
-
- movd [edx + 4], xmm7
- pshufd xmm7, xmm3, 0x4e
- movd [edx + 24], xmm7
- pshufd xmm3, xmm3, 0x93
- movd [edx + 44], xmm3
-
- movd [edx + 8], xmm6
- pshufd xmm6, xmm2, 0x93
- movd [edx + 28], xmm6
- pshufd xmm2, xmm2, 0x39
- movd [edx + 52], xmm2
-
- movd [edx + 12], xmm5
- pshufd xmm5, xmm1, 0x39
- movd [edx + 36], xmm5
- pshufd xmm1, xmm1, 0x4e
- movd [edx + 56], xmm1
-
- movd [edx + 20], xmm4
- pshufd xmm4, xmm0, 0x4e
- movd [edx + 40], xmm4
- pshufd xmm0, xmm0, 0x93
- movd [edx + 60], xmm0
-
- ## And with that, we're done.
- mov esp, ebp
- pop ebp
- ret
-
- .size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2
-
-###----- That's all, folks --------------------------------------------------