## These are chosen at runtime if the CPU is suitable.
EXTRA_DIST += salsa20-tvconv
pkginclude_HEADERS += salsa20.h salsa20-core.h
libsymm_la_SOURCES += salsa20.c
+if CPUFAM_X86
+libsymm_la_SOURCES += salsa20-x86-sse2.s
+endif
TESTS += salsa20.$t
ALL_CIPHERS += salsa20 salsa2012 salsa208
ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208
## Bernstein's `ChaCha' stream cipher.
pkginclude_HEADERS += chacha.h chacha-core.h
libsymm_la_SOURCES += chacha.c
+if CPUFAM_X86
+libsymm_la_SOURCES += chacha-x86-sse2.s
+endif
TESTS += chacha.$t
EXTRA_DIST += t/chacha
ALL_CIPHERS += chacha20 chacha12 chacha8
--- /dev/null
+### -*- mode: asm; asm-comment-char: ?# -*-
+###
+### Fancy SIMD implementation of ChaCha
+###
+### (c) 2015 Straylight/Edgeware
+###
+
+###----- Licensing notice ---------------------------------------------------
+###
+### This file is part of Catacomb.
+###
+### Catacomb is free software; you can redistribute it and/or modify
+### it under the terms of the GNU Library General Public License as
+### published by the Free Software Foundation; either version 2 of the
+### License, or (at your option) any later version.
+###
+### Catacomb is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+### GNU Library General Public License for more details.
+###
+### You should have received a copy of the GNU Library General Public
+### License along with Catacomb; if not, write to the Free
+### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+### MA 02111-1307, USA.
+
+ .intel_syntax noprefix
+ .arch pentium4
+
+ .section .text
+
+	.globl chacha_core_x86_sse2
+	.type chacha_core_x86_sse2, STT_FUNC
+chacha_core_x86_sse2:
+
+	## Initial state.  We have three arguments, passed cdecl-style on
+	## the stack and addressed through ebp once the frame is set up:
+	## [ebp + 8] is the number of rounds to do
+	## [ebp + 12] points to the input matrix
+	## [ebp + 16] points to the output matrix
+	push ebp
+	mov ebp, esp
+	## Reserve 16 bytes and align the stack pointer down, leaving a
+	## 16-byte-aligned scratch slot below the frame for saving row `a'.
+	sub esp, 16
+	mov edx, [ebp + 12]
+	and esp, ~15
+
+	## First job is to slurp the matrix into XMM registers.  Be careful:
+	## the input matrix isn't likely to be properly aligned, hence the
+	## unaligned loads.
+	##
+	## [ 0  1  2  3] (a, xmm0)
+	## [ 4  5  6  7] (b, xmm1)
+	## [ 8  9 10 11] (c, xmm2)
+	## [12 13 14 15] (d, xmm3)
+	movdqu xmm0, [edx + 0]
+	movdqu xmm1, [edx + 16]
+	movdqu xmm2, [edx + 32]
+	movdqu xmm3, [edx + 48]
+
+	## Prepare for the main loop: ecx counts the rounds remaining.
+	mov ecx, [ebp + 8]
+
+	## Take a copy for later (the feedforward addition at the end).
+	## This one is aligned properly, by construction.  Row `a' goes to
+	## the stack because xmm4 is needed as a rotate temporary; b, c, d
+	## fit in the remaining registers.
+	movdqa [esp], xmm0
+	movdqa xmm5, xmm1
+	movdqa xmm6, xmm2
+	movdqa xmm7, xmm3
+
+loop:
+	## Apply a column quarterround to each of the columns simultaneously.
+	## Alas, there doesn't seem to be a packed doubleword rotate, so we
+	## have to synthesize it from a pair of shifts and an OR.
+
+	## a += b; d ^= a; d <<<= 16
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm4, xmm3
+	pslld xmm3, 16
+	psrld xmm4, 16
+	por xmm3, xmm4
+
+	## c += d; b ^= c; b <<<= 12
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm4, xmm1
+	pslld xmm1, 12
+	psrld xmm4, 20
+	por xmm1, xmm4
+
+	## a += b; d ^= a; d <<<= 8
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm4, xmm3
+	pslld xmm3, 8
+	psrld xmm4, 24
+	por xmm3, xmm4
+
+	## c += d; b ^= c; b <<<= 7
+	paddd xmm2, xmm3
+	pshufd xmm3, xmm3, 0x93
+	pxor xmm1, xmm2
+	pshufd xmm2, xmm2, 0x4e
+	movdqa xmm4, xmm1
+	pslld xmm1, 7
+	psrld xmm4, 25
+	por xmm1, xmm4
+
+	## The not-quite-transpose conveniently only involves reordering
+	## elements of individual rows, which can be done quite easily.  It
+	## doesn't involve any movement of elements between rows, or even
+	## renaming of the rows.
+	##
+	## [ 0  1  2  3]     [ 0  1  2  3] (a, xmm0)
+	## [ 4  5  6  7] --> [ 5  6  7  4] (b, xmm1)
+	## [ 8  9 10 11]     [10 11  8  9] (c, xmm2)
+	## [12 13 14 15]     [15 12 13 14] (d, xmm3)
+	##
+	## The shuffles have quite high latency, so they've mostly been
+	## pushed upwards (the pshufds of c and d above are part of this
+	## reordering).  The remaining one can't be moved, though.
+	pshufd xmm1, xmm1, 0x39
+
+	## Apply the diagonal quarterround to each of the columns
+	## simultaneously.
+
+	## a += b; d ^= a; d <<<= 16
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm4, xmm3
+	pslld xmm3, 16
+	psrld xmm4, 16
+	por xmm3, xmm4
+
+	## c += d; b ^= c; b <<<= 12
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm4, xmm1
+	pslld xmm1, 12
+	psrld xmm4, 20
+	por xmm1, xmm4
+
+	## a += b; d ^= a; d <<<= 8
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm4, xmm3
+	pslld xmm3, 8
+	psrld xmm4, 24
+	por xmm3, xmm4
+
+	## c += d; b ^= c; b <<<= 7
+	paddd xmm2, xmm3
+	pshufd xmm3, xmm3, 0x39
+	pxor xmm1, xmm2
+	pshufd xmm2, xmm2, 0x4e
+	movdqa xmm4, xmm1
+	pslld xmm1, 7
+	psrld xmm4, 25
+	por xmm1, xmm4
+
+	## Finally, finish off undoing the transpose, and we're done for this
+	## doubleround.  Again, most of this was done above so we don't have
+	## to wait for the shuffles.
+	pshufd xmm1, xmm1, 0x93
+
+	## Decrement the loop counter and see if we should go round again.
+	## Each trip through the loop is one doubleround (column round plus
+	## diagonal round), hence the step of two; `ja' treats the count as
+	## unsigned.
+	sub ecx, 2
+	ja loop
+
+	## Almost there.  Firstly, the feedforward addition of the saved
+	## input matrix.
+	mov edx, [ebp + 16]
+	paddd xmm0, [esp]
+	paddd xmm1, xmm5
+	paddd xmm2, xmm6
+	paddd xmm3, xmm7
+
+	## And now we write out the result.  This one won't be aligned
+	## either.
+	movdqu [edx + 0], xmm0
+	movdqu [edx + 16], xmm1
+	movdqu [edx + 32], xmm2
+	movdqu [edx + 48], xmm3
+
+	## And with that, we're done.  Restore the caller's stack pointer
+	## and frame.
+	mov esp, ebp
+	pop ebp
+	ret
+
+	.size chacha_core_x86_sse2, . - chacha_core_x86_sse2
+
+###----- That's all, folks --------------------------------------------------
/*----- Header files ------------------------------------------------------*/
+#include "config.h"
+
#include <stdarg.h>
#include <mLib/bits.h>
#include "arena.h"
#include "chacha.h"
#include "chacha-core.h"
+#include "dispatch.h"
#include "gcipher.h"
#include "grand.h"
#include "keysz.h"
* the feedforward step.
*/
-static void core(unsigned r, const chacha_matrix src, chacha_matrix dest)
+/* Dispatched entry point for the ChaCha core: perform @r@ rounds over the
+ * matrix @src@, with the feedforward addition, writing the result to
+ * @dest@.  The implementation is chosen at runtime by @pick_core@
+ * (presumably CPU_DISPATCH, from `dispatch.h', generates the plumbing and
+ * the `core__functype' typedef -- confirm against that header).
+ */
+CPU_DISPATCH(static, (void),
+	     void, core, (unsigned r, const chacha_matrix src,
+			  chacha_matrix dest),
+	     (r, src, dest),
+	     pick_core, simple_core);
+
+/* Portable C fallback implementation of @core@. */
+static void simple_core(unsigned r, const chacha_matrix src,
+			chacha_matrix dest)
{ CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); }
+#ifdef CPUFAM_X86
+/* SSE2 implementation, defined in `chacha-x86-sse2.s'. */
+extern core__functype chacha_core_x86_sse2;
+#endif
+
+/* Select the best available @core@ implementation for the runtime CPU. */
+static core__functype *pick_core(void)
+{
+#ifdef CPUFAM_X86
+  if (cpu_feature_p(CPUFEAT_X86_SSE2)) return chacha_core_x86_sse2;
+#endif
+  return simple_core;
+}
+
/* --- @populate@ --- *
*
* Arguments: @chacha_matrix a@ = a matrix to fill in
--- /dev/null
+### -*- mode: asm; asm-comment-char: ?# -*-
+###
+### Fancy SIMD implementation of Salsa20
+###
+### (c) 2015 Straylight/Edgeware
+###
+
+###----- Licensing notice ---------------------------------------------------
+###
+### This file is part of Catacomb.
+###
+### Catacomb is free software; you can redistribute it and/or modify
+### it under the terms of the GNU Library General Public License as
+### published by the Free Software Foundation; either version 2 of the
+### License, or (at your option) any later version.
+###
+### Catacomb is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+### GNU Library General Public License for more details.
+###
+### You should have received a copy of the GNU Library General Public
+### License along with Catacomb; if not, write to the Free
+### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+### MA 02111-1307, USA.
+
+ .intel_syntax noprefix
+ .arch pentium4
+
+ .section .text
+
+	.globl salsa20_core_x86_sse2
+	.type salsa20_core_x86_sse2, STT_FUNC
+salsa20_core_x86_sse2:
+
+	## Initial state.  We have three arguments, passed cdecl-style on
+	## the stack and addressed through ebp once the frame is set up:
+	## [ebp + 8] is the number of rounds to do
+	## [ebp + 12] points to the input matrix
+	## [ebp + 16] points to the output matrix
+	push ebp
+	mov ebp, esp
+	## Reserve 32 bytes and align the stack pointer down, leaving two
+	## 16-byte-aligned scratch slots for saving rows `a' and `b'.
+	sub esp, 32
+	mov edx, [ebp + 12]
+	and esp, ~15
+
+	## Prepare for the main loop: ecx counts the rounds remaining.
+	mov ecx, [ebp + 8]
+
+	## First job is to slurp the matrix into XMM registers.  The words
+	## have already been permuted conveniently to make them line up
+	## better for SIMD processing.
+	##
+	## The textbook arrangement of the matrix is this.
+	##
+	##	[C K K K]
+	##	[K C N N]
+	##	[T T C K]
+	##	[K K K C]
+	##
+	## But we've rotated the columns up so that the main diagonal with
+	## the constants on it end up in the first row, giving something more
+	## like
+	##
+	##	[C C C C]
+	##	[K T K K]
+	##	[T K K N]
+	##	[K K N K]
+	##
+	## so the transformation looks like this:
+	##
+	## [ 0  1  2  3]     [ 0  5 10 15] (a, xmm0)
+	## [ 4  5  6  7] --> [ 4  9 14  3] (b, xmm1)
+	## [ 8  9 10 11]     [ 8 13  2  7] (c, xmm2)
+	## [12 13 14 15]     [12  1  6 11] (d, xmm3)
+	movdqu xmm0, [edx + 0]
+	movdqu xmm1, [edx + 16]
+	movdqu xmm2, [edx + 32]
+	movdqu xmm3, [edx + 48]
+
+	## Take a copy for later (the feedforward addition at the end).
+	## Rows a and b go to the aligned stack slots because xmm4 and xmm5
+	## are both needed as temporaries; c and d fit in xmm6 and xmm7.
+	movdqa [esp + 0], xmm0
+	movdqa [esp + 16], xmm1
+	movdqa xmm6, xmm2
+	movdqa xmm7, xmm3
+
+loop:
+
+	## Apply a column quarterround to each of the columns simultaneously.
+	## Alas, there doesn't seem to be a packed doubleword rotate, so we
+	## have to synthesize it from a pair of shifts and an OR.
+
+	## b ^= (a + d) <<< 7
+	movdqa xmm4, xmm0
+	paddd xmm4, xmm3
+	movdqa xmm5, xmm4
+	pslld xmm4, 7
+	psrld xmm5, 25
+	por xmm4, xmm5
+	pxor xmm1, xmm4
+
+	## c ^= (b + a) <<< 9
+	movdqa xmm4, xmm1
+	paddd xmm4, xmm0
+	movdqa xmm5, xmm4
+	pslld xmm4, 9
+	psrld xmm5, 23
+	por xmm4, xmm5
+	pxor xmm2, xmm4
+
+	## d ^= (c + b) <<< 13
+	movdqa xmm4, xmm2
+	paddd xmm4, xmm1
+	pshufd xmm1, xmm1, 0x93
+	movdqa xmm5, xmm4
+	pslld xmm4, 13
+	psrld xmm5, 19
+	por xmm4, xmm5
+	pxor xmm3, xmm4
+
+	## a ^= (d + c) <<< 18
+	movdqa xmm4, xmm3
+	pshufd xmm3, xmm3, 0x39
+	paddd xmm4, xmm2
+	pshufd xmm2, xmm2, 0x4e
+	movdqa xmm5, xmm4
+	pslld xmm4, 18
+	psrld xmm5, 14
+	por xmm4, xmm5
+	pxor xmm0, xmm4
+
+	## The transpose conveniently only involves reordering elements of
+	## individual rows, which can be done quite easily, and reordering
+	## the rows themselves, which is a trivial renaming.  It doesn't
+	## involve any movement of elements between rows.
+	##
+	## [ 0  5 10 15]     [ 0  5 10 15] (a, xmm0)
+	## [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
+	## [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
+	## [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
+	##
+	## The shuffles have quite high latency, so they've been pushed
+	## backwards into the main instruction list (the pshufds interleaved
+	## with the quarterround above).
+
+	## Apply the row quarterround to each of the columns (yes!)
+	## simultaneously.  Note that b and d have swapped names (xmm3 and
+	## xmm1) after the renaming above.
+
+	## b ^= (a + d) <<< 7
+	movdqa xmm4, xmm0
+	paddd xmm4, xmm1
+	movdqa xmm5, xmm4
+	pslld xmm4, 7
+	psrld xmm5, 25
+	por xmm4, xmm5
+	pxor xmm3, xmm4
+
+	## c ^= (b + a) <<< 9
+	movdqa xmm4, xmm3
+	paddd xmm4, xmm0
+	movdqa xmm5, xmm4
+	pslld xmm4, 9
+	psrld xmm5, 23
+	por xmm4, xmm5
+	pxor xmm2, xmm4
+
+	## d ^= (c + b) <<< 13
+	movdqa xmm4, xmm2
+	paddd xmm4, xmm3
+	pshufd xmm3, xmm3, 0x93
+	movdqa xmm5, xmm4
+	pslld xmm4, 13
+	psrld xmm5, 19
+	por xmm4, xmm5
+	pxor xmm1, xmm4
+
+	## a ^= (d + c) <<< 18
+	movdqa xmm4, xmm1
+	pshufd xmm1, xmm1, 0x39
+	paddd xmm4, xmm2
+	pshufd xmm2, xmm2, 0x4e
+	movdqa xmm5, xmm4
+	pslld xmm4, 18
+	psrld xmm5, 14
+	por xmm4, xmm5
+	pxor xmm0, xmm4
+
+	## We had to undo the transpose ready for the next loop.  Again, push
+	## back the shuffles because they take a long time coming through.
+	## Decrement the loop counter and see if we should go round again:
+	## each trip is one doubleround, hence the step of two, and `ja'
+	## treats the count as unsigned.  Later processors fuse this pair
+	## into a single uop.
+	sub ecx, 2
+	ja loop
+
+	## Almost there.  Firstly, the feedforward addition, and then we have
+	## to write out the result.  Here we have to undo the permutation
+	## which was already applied to the input.  Shuffling has quite high
+	## latency, so arrange to start a new shuffle into a temporary as
+	## soon as we've written out the old value.  Note that `movd' stores
+	## only the low 32 bits of its source, so the output is assembled a
+	## word at a time, each pshufd rotating the next word into lane 0.
+	mov edx, [ebp + 16]
+
+	paddd xmm0, [esp + 0]
+	pshufd xmm4, xmm0, 0x39
+	movd [edx + 0], xmm0
+
+	paddd xmm1, [esp + 16]
+	pshufd xmm5, xmm1, 0x93
+	movd [edx + 16], xmm1
+
+	paddd xmm2, xmm6
+	pshufd xmm6, xmm2, 0x4e
+	movd [edx + 32], xmm2
+
+	paddd xmm3, xmm7
+	pshufd xmm7, xmm3, 0x39
+	movd [edx + 48], xmm3
+
+	movd [edx + 4], xmm7
+	pshufd xmm7, xmm3, 0x4e
+	movd [edx + 24], xmm7
+	pshufd xmm3, xmm3, 0x93
+	movd [edx + 44], xmm3
+
+	movd [edx + 8], xmm6
+	pshufd xmm6, xmm2, 0x93
+	movd [edx + 28], xmm6
+	pshufd xmm2, xmm2, 0x39
+	movd [edx + 52], xmm2
+
+	movd [edx + 12], xmm5
+	pshufd xmm5, xmm1, 0x39
+	movd [edx + 36], xmm5
+	pshufd xmm1, xmm1, 0x4e
+	movd [edx + 56], xmm1
+
+	movd [edx + 20], xmm4
+	pshufd xmm4, xmm0, 0x4e
+	movd [edx + 40], xmm4
+	pshufd xmm0, xmm0, 0x93
+	movd [edx + 60], xmm0
+
+	## And with that, we're done.  Restore the caller's stack pointer
+	## and frame.
+	mov esp, ebp
+	pop ebp
+	ret
+
+	.size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2
+
+###----- That's all, folks --------------------------------------------------
/*----- Header files ------------------------------------------------------*/
+#include "config.h"
+
#include <stdarg.h>
#include <mLib/bits.h>
#include "arena.h"
+#include "dispatch.h"
#include "gcipher.h"
#include "grand.h"
#include "keysz.h"
* the feedforward step.
*/
-static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest)
+/* Dispatched entry point for the Salsa20 core: perform @r@ rounds over the
+ * matrix @src@, with the feedforward addition, writing the result to
+ * @dest@.  The implementation is chosen at runtime by @pick_core@
+ * (presumably CPU_DISPATCH, from `dispatch.h', generates the plumbing and
+ * the `core__functype' typedef -- confirm against that header).
+ */
+CPU_DISPATCH(static, (void),
+	     void, core, (unsigned r, const salsa20_matrix src,
+			  salsa20_matrix dest),
+	     (r, src, dest),
+	     pick_core, simple_core);
+
+/* Portable C fallback implementation of @core@. */
+static void simple_core(unsigned r, const salsa20_matrix src,
+			salsa20_matrix dest)
{ SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); }
+#ifdef CPUFAM_X86
+/* SSE2 implementation, defined in `salsa20-x86-sse2.s'. */
+extern core__functype salsa20_core_x86_sse2;
+#endif
+
+/* Select the best available @core@ implementation for the runtime CPU. */
+static core__functype *pick_core(void)
+{
+#ifdef CPUFAM_X86
+  if (cpu_feature_p(CPUFEAT_X86_SSE2)) return salsa20_core_x86_sse2;
+#endif
+  return simple_core;
+}
+
/* --- @populate@ --- *
*
* Arguments: @salsa20_matrix a@ = a matrix to fill in