From e10e6494b18a62339497db09d9712cd5df555714 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Sat, 2 May 2015 17:05:20 +0100 Subject: [PATCH] symm/: New SSE2 implementations of Salsa20 and ChaCha. These are chosen at runtime if the CPU is suitable. --- symm/Makefile.am | 6 ++ symm/chacha-x86-sse2.s | 188 ++++++++++++++++++++++++++++++++++++ symm/chacha.c | 24 ++++- symm/salsa20-x86-sse2.s | 247 ++++++++++++++++++++++++++++++++++++++++++++++++ symm/salsa20.c | 24 ++++- 5 files changed, 487 insertions(+), 2 deletions(-) create mode 100644 symm/chacha-x86-sse2.s create mode 100644 symm/salsa20-x86-sse2.s diff --git a/symm/Makefile.am b/symm/Makefile.am index 63bf26b7..5bb82f1f 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -378,6 +378,9 @@ ALL_CIPHERS += seal EXTRA_DIST += salsa20-tvconv pkginclude_HEADERS += salsa20.h salsa20-core.h libsymm_la_SOURCES += salsa20.c +if CPUFAM_X86 +libsymm_la_SOURCES += salsa20-x86-sse2.s +endif TESTS += salsa20.$t ALL_CIPHERS += salsa20 salsa2012 salsa208 ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208 @@ -404,6 +407,9 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV) ## Bernstein's `ChaCha' stream cipher. pkginclude_HEADERS += chacha.h chacha-core.h libsymm_la_SOURCES += chacha.c +if CPUFAM_X86 +libsymm_la_SOURCES += chacha-x86-sse2.s +endif TESTS += chacha.$t EXTRA_DIST += t/chacha ALL_CIPHERS += chacha20 chacha12 chacha8 diff --git a/symm/chacha-x86-sse2.s b/symm/chacha-x86-sse2.s new file mode 100644 index 00000000..7b790107 --- /dev/null +++ b/symm/chacha-x86-sse2.s @@ -0,0 +1,188 @@ +### -*- mode: asm; asm-comment-char: ?# -*- +### +### Fancy SIMD implementation of ChaCha +### +### (c) 2015 Straylight/Edgeware +### + +###----- Licensing notice --------------------------------------------------- +### +### This file is part of Catacomb. 
+###
+### Catacomb is free software; you can redistribute it and/or modify
+### it under the terms of the GNU Library General Public License as
+### published by the Free Software Foundation; either version 2 of the
+### License, or (at your option) any later version.
+###
+### Catacomb is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+### GNU Library General Public License for more details.
+###
+### You should have received a copy of the GNU Library General Public
+### License along with Catacomb; if not, write to the Free
+### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+### MA 02111-1307, USA.
+
+	.intel_syntax noprefix
+	.arch pentium4
+
+	.section .text
+
+	.globl chacha_core_x86_sse2
+	.type chacha_core_x86_sse2, STT_FUNC
+chacha_core_x86_sse2:
+
+	## Initial state. We have three arguments:
+	## [ebp + 8] is the number of rounds to do
+	## [ebp + 12] points to the input matrix
+	## [ebp + 16] points to the output matrix
+	push ebp
+	mov ebp, esp
+	sub esp, 16
+	mov edx, [ebp + 12]
+	and esp, ~15
+
+	## First job is to slurp the matrix into XMM registers. Be careful:
+	## the input matrix isn't likely to be properly aligned.
+	##
+	## [ 0 1 2 3] (a, xmm0)
+	## [ 4 5 6 7] (b, xmm1)
+	## [ 8 9 10 11] (c, xmm2)
+	## [12 13 14 15] (d, xmm3)
+	movdqu xmm0, [edx + 0]
+	movdqu xmm1, [edx + 16]
+	movdqu xmm2, [edx + 32]
+	movdqu xmm3, [edx + 48]
+
+	## Prepare for the main loop.
+	mov ecx, [ebp + 8]
+
+	## Take a copy for later. This one is aligned properly, by
+	## construction.
+	movdqa [esp], xmm0
+	movdqa xmm5, xmm1
+	movdqa xmm6, xmm2
+	movdqa xmm7, xmm3
+
+loop:
+	## Apply a column quarterround to each of the columns simultaneously.
+	## Alas, there doesn't seem to be a packed doubleword rotate, so we
+	## have to synthesize it.
+ + ## a += b; d ^= a; d <<<= 16 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 16 + psrld xmm4, 16 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 12 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm4, xmm1 + pslld xmm1, 12 + psrld xmm4, 20 + por xmm1, xmm4 + + ## a += b; d ^= a; d <<<= 8 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 8 + psrld xmm4, 24 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 7 + paddd xmm2, xmm3 + pshufd xmm3, xmm3, 0x93 + pxor xmm1, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm4, xmm1 + pslld xmm1, 7 + psrld xmm4, 25 + por xmm1, xmm4 + + ## The not-quite-transpose conveniently only involves reordering + ## elements of individual rows, which can be done quite easily. It + ## doesn't involve any movement of elements between rows, or even + ## renaming of the rows. + ## + ## [ 0 1 2 3] [ 0 1 2 3] (a, xmm0) + ## [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1) + ## [ 8 9 10 11] [10 11 8 9] (c, xmm2) + ## [12 13 14 15] [15 12 13 14] (d, xmm3) + ## + ## The shuffles have quite high latency, so they've mostly been + ## pushed upwards. The remaining one can't be moved, though. + pshufd xmm1, xmm1, 0x39 + + ## Apply the diagonal quarterround to each of the columns + ## simultaneously. + + ## a += b; d ^= a; d <<<= 16 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 16 + psrld xmm4, 16 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 12 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm4, xmm1 + pslld xmm1, 12 + psrld xmm4, 20 + por xmm1, xmm4 + + ## a += b; d ^= a; d <<<= 8 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 8 + psrld xmm4, 24 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 7 + paddd xmm2, xmm3 + pshufd xmm3, xmm3, 0x39 + pxor xmm1, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm4, xmm1 + pslld xmm1, 7 + psrld xmm4, 25 + por xmm1, xmm4 + + ## Finally, finish off undoing the transpose, and we're done for this + ## doubleround. 
Again, most of this was done above so we don't have + ## to wait for the shuffles. + pshufd xmm1, xmm1, 0x93 + + ## Decrement the loop counter and see if we should go round again. + sub ecx, 2 + ja loop + + ## Almost there. Firstly, the feedforward addition. + mov edx, [ebp + 16] + paddd xmm0, [esp] + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + + ## And now we write out the result. This one won't be aligned + ## either. + movdqu [edx + 0], xmm0 + movdqu [edx + 16], xmm1 + movdqu [edx + 32], xmm2 + movdqu [edx + 48], xmm3 + + ## And with that, we're done. + mov esp, ebp + pop ebp + ret + + .size chacha_core_x86_sse2, . - chacha_core_x86_sse2 + +###----- That's all, folks -------------------------------------------------- diff --git a/symm/chacha.c b/symm/chacha.c index bd94ffde..8fe50e19 100644 --- a/symm/chacha.c +++ b/symm/chacha.c @@ -27,6 +27,8 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include @@ -34,6 +36,7 @@ #include "arena.h" #include "chacha.h" #include "chacha-core.h" +#include "dispatch.h" #include "gcipher.h" #include "grand.h" #include "keysz.h" @@ -59,9 +62,28 @@ const octet chacha_keysz[] = { KSZ_SET, 32, 16, 10, 0 }; * the feedforward step. 
*/ -static void core(unsigned r, const chacha_matrix src, chacha_matrix dest) +CPU_DISPATCH(static, (void), + void, core, (unsigned r, const chacha_matrix src, + chacha_matrix dest), + (r, src, dest), + pick_core, simple_core); + +static void simple_core(unsigned r, const chacha_matrix src, + chacha_matrix dest) { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); } +#ifdef CPUFAM_X86 +extern core__functype chacha_core_x86_sse2; +#endif + +static core__functype *pick_core(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_SSE2)) return chacha_core_x86_sse2; +#endif + return simple_core; +} + /* --- @populate@ --- * * * Arguments: @chacha_matrix a@ = a matrix to fill in diff --git a/symm/salsa20-x86-sse2.s b/symm/salsa20-x86-sse2.s new file mode 100644 index 00000000..ef2b73ef --- /dev/null +++ b/symm/salsa20-x86-sse2.s @@ -0,0 +1,247 @@ +### -*- mode: asm; asm-comment-char: ?# -*- +### +### Fancy SIMD implementation of Salsa20 +### +### (c) 2015 Straylight/Edgeware +### + +###----- Licensing notice --------------------------------------------------- +### +### This file is part of Catacomb. +### +### Catacomb is free software; you can redistribute it and/or modify +### it under the terms of the GNU Library General Public License as +### published by the Free Software Foundation; either version 2 of the +### License, or (at your option) any later version. +### +### Catacomb is distributed in the hope that it will be useful, +### but WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +### GNU Library General Public License for more details. +### +### You should have received a copy of the GNU Library General Public +### License along with Catacomb; if not, write to the Free +### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +### MA 02111-1307, USA. 
+ + .intel_syntax noprefix + .arch pentium4 + + .section .text + + .globl salsa20_core_x86_sse2 + .type salsa20_core_x86_sse2, STT_FUNC +salsa20_core_x86_sse2: + + ## Initial state. We have three arguments: + ## [ebp + 8] is the number of rounds to do + ## [ebp + 12] points to the input matrix + ## [ebp + 16] points to the output matrix + push ebp + mov ebp, esp + sub esp, 32 + mov edx, [ebp + 12] + and esp, ~15 + + ## Prepare for the main loop. + mov ecx, [ebp + 8] + + ## First job is to slurp the matrix into XMM registers. The words + ## have already been permuted conveniently to make them line up + ## better for SIMD processing. + ## + ## The textbook arrangement of the matrix is this. + ## + ## [C K K K] + ## [K C N N] + ## [T T C K] + ## [K K K C] + ## + ## But we've rotated the columns up so that the main diagonal with + ## the constants on it end up in the first row, giving something more + ## like + ## + ## [C C C C] + ## [K T K K] + ## [T K K N] + ## [K K N K] + ## + ## so the transformation looks like this: + ## + ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0) + ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1) + ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2) + ## [12 13 14 15] [12 1 6 11] (d, xmm3) + movdqu xmm0, [edx + 0] + movdqu xmm1, [edx + 16] + movdqu xmm2, [edx + 32] + movdqu xmm3, [edx + 48] + + ## Take a copy for later. + movdqa [esp + 0], xmm0 + movdqa [esp + 16], xmm1 + movdqa xmm6, xmm2 + movdqa xmm7, xmm3 + +loop: + + ## Apply a column quarterround to each of the columns simultaneously. + ## Alas, there doesn't seem to be a packed doubleword rotate, so we + ## have to synthesize it. 
+ + ## b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm3 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm1, xmm4 + + ## c ^= (b + a) <<< 9 + movdqa xmm4, xmm1 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + ## d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm1 + pshufd xmm1, xmm1, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm3, xmm4 + + ## a ^= (d + c) <<< 18 + movdqa xmm4, xmm3 + pshufd xmm3, xmm3, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + ## The transpose conveniently only involves reordering elements of + ## individual rows, which can be done quite easily, and reordering + ## the rows themselves, which is a trivial renaming. It doesn't + ## involve any movement of elements between rows. + ## + ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0) + ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3) + ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2) + ## [12 1 6 11] [ 3 4 9 14] (d, xmm1) + ## + ## The shuffles have quite high latency, so they've been pushed + ## backwards into the main instruction list. + + ## Apply the row quarterround to each of the columns (yes!) + ## simultaneously. 
+ + ## b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm1 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm3, xmm4 + + ## c ^= (b + a) <<< 9 + movdqa xmm4, xmm3 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + ## d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm3 + pshufd xmm3, xmm3, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm1, xmm4 + + ## a ^= (d + c) <<< 18 + movdqa xmm4, xmm1 + pshufd xmm1, xmm1, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + ## We had to undo the transpose ready for the next loop. Again, push + ## back the shuffles because they take a long time coming through. + ## Decrement the loop counter and see if we should go round again. + ## Later processors fuse this pair into a single uop. + sub ecx, 2 + ja loop + + ## Almost there. Firstly, the feedforward addition, and then we have + ## to write out the result. Here we have to undo the permutation + ## which was already applied to the input. Shuffling has quite high + ## latency, so arrange to start a new shuffle into a temporary as + ## soon as we've written out the old value. 
+ mov edx, [ebp + 16] + + paddd xmm0, [esp + 0] + pshufd xmm4, xmm0, 0x39 + movd [edx + 0], xmm0 + + paddd xmm1, [esp + 16] + pshufd xmm5, xmm1, 0x93 + movd [edx + 16], xmm1 + + paddd xmm2, xmm6 + pshufd xmm6, xmm2, 0x4e + movd [edx + 32], xmm2 + + paddd xmm3, xmm7 + pshufd xmm7, xmm3, 0x39 + movd [edx + 48], xmm3 + + movd [edx + 4], xmm7 + pshufd xmm7, xmm3, 0x4e + movd [edx + 24], xmm7 + pshufd xmm3, xmm3, 0x93 + movd [edx + 44], xmm3 + + movd [edx + 8], xmm6 + pshufd xmm6, xmm2, 0x93 + movd [edx + 28], xmm6 + pshufd xmm2, xmm2, 0x39 + movd [edx + 52], xmm2 + + movd [edx + 12], xmm5 + pshufd xmm5, xmm1, 0x39 + movd [edx + 36], xmm5 + pshufd xmm1, xmm1, 0x4e + movd [edx + 56], xmm1 + + movd [edx + 20], xmm4 + pshufd xmm4, xmm0, 0x4e + movd [edx + 40], xmm4 + pshufd xmm0, xmm0, 0x93 + movd [edx + 60], xmm0 + + ## And with that, we're done. + mov esp, ebp + pop ebp + ret + + .size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2 + +###----- That's all, folks -------------------------------------------------- diff --git a/symm/salsa20.c b/symm/salsa20.c index 0104e5e5..d3fb69a7 100644 --- a/symm/salsa20.c +++ b/symm/salsa20.c @@ -7,11 +7,14 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include "arena.h" +#include "dispatch.h" #include "gcipher.h" #include "grand.h" #include "keysz.h" @@ -39,9 +42,28 @@ const octet salsa20_keysz[] = { KSZ_SET, 32, 16, 10, 0 }; * the feedforward step. 
*/ -static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest) +CPU_DISPATCH(static, (void), + void, core, (unsigned r, const salsa20_matrix src, + salsa20_matrix dest), + (r, src, dest), + pick_core, simple_core); + +static void simple_core(unsigned r, const salsa20_matrix src, + salsa20_matrix dest) { SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); } +#ifdef CPUFAM_X86 +extern core__functype salsa20_core_x86_sse2; +#endif + +static core__functype *pick_core(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_SSE2)) return salsa20_core_x86_sse2; +#endif + return simple_core; +} + /* --- @populate@ --- * * * Arguments: @salsa20_matrix a@ = a matrix to fill in -- 2.11.0