From e10e6494b18a62339497db09d9712cd5df555714 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Sat, 2 May 2015 17:05:20 +0100 Subject: [PATCH] symm/: New SSE2 implementations of Salsa20 and ChaCha. These are chosen at runtime if the CPU is suitable. --- symm/Makefile.am | 6 ++ symm/chacha-x86-sse2.s | 188 ++++++++++++++++++++++++++++++++++++ symm/chacha.c | 24 ++++- symm/salsa20-x86-sse2.s | 247 ++++++++++++++++++++++++++++++++++++++++++++++++ symm/salsa20.c | 24 ++++- 5 files changed, 487 insertions(+), 2 deletions(-) create mode 100644 symm/chacha-x86-sse2.s create mode 100644 symm/salsa20-x86-sse2.s diff --git a/symm/Makefile.am b/symm/Makefile.am index 63bf26b7..5bb82f1f 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -378,6 +378,9 @@ ALL_CIPHERS += seal EXTRA_DIST += salsa20-tvconv pkginclude_HEADERS += salsa20.h salsa20-core.h libsymm_la_SOURCES += salsa20.c +if CPUFAM_X86 +libsymm_la_SOURCES += salsa20-x86-sse2.s +endif TESTS += salsa20.$t ALL_CIPHERS += salsa20 salsa2012 salsa208 ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208 @@ -404,6 +407,9 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV) ## Bernstein's `ChaCha' stream cipher. pkginclude_HEADERS += chacha.h chacha-core.h libsymm_la_SOURCES += chacha.c +if CPUFAM_X86 +libsymm_la_SOURCES += chacha-x86-sse2.s +endif TESTS += chacha.$t EXTRA_DIST += t/chacha ALL_CIPHERS += chacha20 chacha12 chacha8 diff --git a/symm/chacha-x86-sse2.s b/symm/chacha-x86-sse2.s new file mode 100644 index 00000000..7b790107 --- /dev/null +++ b/symm/chacha-x86-sse2.s @@ -0,0 +1,188 @@ +### -*- mode: asm; asm-comment-char: ?# -*- +### +### Fancy SIMD implementation of ChaCha +### +### (c) 2015 Straylight/Edgeware +### + +###----- Licensing notice --------------------------------------------------- +### +### This file is part of Catacomb. 
+###
+### Catacomb is free software; you can redistribute it and/or modify
+### it under the terms of the GNU Library General Public License as
+### published by the Free Software Foundation; either version 2 of the
+### License, or (at your option) any later version.
+###
+### Catacomb is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+### GNU Library General Public License for more details.
+###
+### You should have received a copy of the GNU Library General Public
+### License along with Catacomb; if not, write to the Free
+### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+### MA 02111-1307, USA.
+
+	.intel_syntax noprefix
+	.arch pentium4
+
+	.section .text
+
+	.globl chacha_core_x86_sse2
+	.type chacha_core_x86_sse2, STT_FUNC
+chacha_core_x86_sse2:
+
+	## Initial state. We have three arguments:
+	## [ebp + 8] is the number of rounds to do
+	## [ebp + 12] points to the input matrix
+	## [ebp + 16] points to the output matrix
+	push ebp
+	mov ebp, esp
+	sub esp, 16
+	mov edx, [ebp + 12]
+	and esp, ~15
+
+	## First job is to slurp the matrix into XMM registers. Be careful:
+	## the input matrix isn't likely to be properly aligned.
+	##
+	## [ 0 1 2 3] (a, xmm0)
+	## [ 4 5 6 7] (b, xmm1)
+	## [ 8 9 10 11] (c, xmm2)
+	## [12 13 14 15] (d, xmm3)
+	movdqu xmm0, [edx + 0]
+	movdqu xmm1, [edx + 16]
+	movdqu xmm2, [edx + 32]
+	movdqu xmm3, [edx + 48]
+
+	## Prepare for the main loop.
+	mov ecx, [ebp + 8]
+
+	## Take a copy for later. This one is aligned properly, by
+	## construction.
+	movdqa [esp], xmm0
+	movdqa xmm5, xmm1
+	movdqa xmm6, xmm2
+	movdqa xmm7, xmm3
+
+loop:
+	## Apply a column quarterround to each of the columns simultaneously.
+	## Alas, there doesn't seem to be a packed doubleword rotate, so we
+	## have to synthesize it.
+ + ## a += b; d ^= a; d <<<= 16 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 16 + psrld xmm4, 16 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 12 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm4, xmm1 + pslld xmm1, 12 + psrld xmm4, 20 + por xmm1, xmm4 + + ## a += b; d ^= a; d <<<= 8 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 8 + psrld xmm4, 24 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 7 + paddd xmm2, xmm3 + pshufd xmm3, xmm3, 0x93 + pxor xmm1, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm4, xmm1 + pslld xmm1, 7 + psrld xmm4, 25 + por xmm1, xmm4 + + ## The not-quite-transpose conveniently only involves reordering + ## elements of individual rows, which can be done quite easily. It + ## doesn't involve any movement of elements between rows, or even + ## renaming of the rows. + ## + ## [ 0 1 2 3] [ 0 1 2 3] (a, xmm0) + ## [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1) + ## [ 8 9 10 11] [10 11 8 9] (c, xmm2) + ## [12 13 14 15] [15 12 13 14] (d, xmm3) + ## + ## The shuffles have quite high latency, so they've mostly been + ## pushed upwards. The remaining one can't be moved, though. + pshufd xmm1, xmm1, 0x39 + + ## Apply the diagonal quarterround to each of the columns + ## simultaneously. + + ## a += b; d ^= a; d <<<= 16 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 16 + psrld xmm4, 16 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 12 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm4, xmm1 + pslld xmm1, 12 + psrld xmm4, 20 + por xmm1, xmm4 + + ## a += b; d ^= a; d <<<= 8 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm4, xmm3 + pslld xmm3, 8 + psrld xmm4, 24 + por xmm3, xmm4 + + ## c += d; b ^= c; b <<<= 7 + paddd xmm2, xmm3 + pshufd xmm3, xmm3, 0x39 + pxor xmm1, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm4, xmm1 + pslld xmm1, 7 + psrld xmm4, 25 + por xmm1, xmm4 + + ## Finally, finish off undoing the transpose, and we're done for this + ## doubleround. 
Again, most of this was done above so we don't have + ## to wait for the shuffles. + pshufd xmm1, xmm1, 0x93 + + ## Decrement the loop counter and see if we should go round again. + sub ecx, 2 + ja loop + + ## Almost there. Firstly, the feedforward addition. + mov edx, [ebp + 16] + paddd xmm0, [esp] + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + + ## And now we write out the result. This one won't be aligned + ## either. + movdqu [edx + 0], xmm0 + movdqu [edx + 16], xmm1 + movdqu [edx + 32], xmm2 + movdqu [edx + 48], xmm3 + + ## And with that, we're done. + mov esp, ebp + pop ebp + ret + + .size chacha_core_x86_sse2, . - chacha_core_x86_sse2 + +###----- That's all, folks -------------------------------------------------- diff --git a/symm/chacha.c b/symm/chacha.c index bd94ffde..8fe50e19 100644 --- a/symm/chacha.c +++ b/symm/chacha.c @@ -27,6 +27,8 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include @@ -34,6 +36,7 @@ #include "arena.h" #include "chacha.h" #include "chacha-core.h" +#include "dispatch.h" #include "gcipher.h" #include "grand.h" #include "keysz.h" @@ -59,9 +62,28 @@ const octet chacha_keysz[] = { KSZ_SET, 32, 16, 10, 0 }; * the feedforward step. 
*/ -static void core(unsigned r, const chacha_matrix src, chacha_matrix dest) +CPU_DISPATCH(static, (void), + void, core, (unsigned r, const chacha_matrix src, + chacha_matrix dest), + (r, src, dest), + pick_core, simple_core); + +static void simple_core(unsigned r, const chacha_matrix src, + chacha_matrix dest) { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); } +#ifdef CPUFAM_X86 +extern core__functype chacha_core_x86_sse2; +#endif + +static core__functype *pick_core(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_SSE2)) return chacha_core_x86_sse2; +#endif + return simple_core; +} + /* --- @populate@ --- * * * Arguments: @chacha_matrix a@ = a matrix to fill in diff --git a/symm/salsa20-x86-sse2.s b/symm/salsa20-x86-sse2.s new file mode 100644 index 00000000..ef2b73ef --- /dev/null +++ b/symm/salsa20-x86-sse2.s @@ -0,0 +1,247 @@ +### -*- mode: asm; asm-comment-char: ?# -*- +### +### Fancy SIMD implementation of Salsa20 +### +### (c) 2015 Straylight/Edgeware +### + +###----- Licensing notice --------------------------------------------------- +### +### This file is part of Catacomb. +### +### Catacomb is free software; you can redistribute it and/or modify +### it under the terms of the GNU Library General Public License as +### published by the Free Software Foundation; either version 2 of the +### License, or (at your option) any later version. +### +### Catacomb is distributed in the hope that it will be useful, +### but WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +### GNU Library General Public License for more details. +### +### You should have received a copy of the GNU Library General Public +### License along with Catacomb; if not, write to the Free +### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +### MA 02111-1307, USA. 
+ + .intel_syntax noprefix + .arch pentium4 + + .section .text + + .globl salsa20_core_x86_sse2 + .type salsa20_core_x86_sse2, STT_FUNC +salsa20_core_x86_sse2: + + ## Initial state. We have three arguments: + ## [ebp + 8] is the number of rounds to do + ## [ebp + 12] points to the input matrix + ## [ebp + 16] points to the output matrix + push ebp + mov ebp, esp + sub esp, 32 + mov edx, [ebp + 12] + and esp, ~15 + + ## Prepare for the main loop. + mov ecx, [ebp + 8] + + ## First job is to slurp the matrix into XMM registers. The words + ## have already been permuted conveniently to make them line up + ## better for SIMD processing. + ## + ## The textbook arrangement of the matrix is this. + ## + ## [C K K K] + ## [K C N N] + ## [T T C K] + ## [K K K C] + ## + ## But we've rotated the columns up so that the main diagonal with + ## the constants on it end up in the first row, giving something more + ## like + ## + ## [C C C C] + ## [K T K K] + ## [T K K N] + ## [K K N K] + ## + ## so the transformation looks like this: + ## + ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0) + ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1) + ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2) + ## [12 13 14 15] [12 1 6 11] (d, xmm3) + movdqu xmm0, [edx + 0] + movdqu xmm1, [edx + 16] + movdqu xmm2, [edx + 32] + movdqu xmm3, [edx + 48] + + ## Take a copy for later. + movdqa [esp + 0], xmm0 + movdqa [esp + 16], xmm1 + movdqa xmm6, xmm2 + movdqa xmm7, xmm3 + +loop: + + ## Apply a column quarterround to each of the columns simultaneously. + ## Alas, there doesn't seem to be a packed doubleword rotate, so we + ## have to synthesize it. 
+ + ## b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm3 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm1, xmm4 + + ## c ^= (b + a) <<< 9 + movdqa xmm4, xmm1 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + ## d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm1 + pshufd xmm1, xmm1, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm3, xmm4 + + ## a ^= (d + c) <<< 18 + movdqa xmm4, xmm3 + pshufd xmm3, xmm3, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + ## The transpose conveniently only involves reordering elements of + ## individual rows, which can be done quite easily, and reordering + ## the rows themselves, which is a trivial renaming. It doesn't + ## involve any movement of elements between rows. + ## + ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0) + ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3) + ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2) + ## [12 1 6 11] [ 3 4 9 14] (d, xmm1) + ## + ## The shuffles have quite high latency, so they've been pushed + ## backwards into the main instruction list. + + ## Apply the row quarterround to each of the columns (yes!) + ## simultaneously. 
+ + ## b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm1 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm3, xmm4 + + ## c ^= (b + a) <<< 9 + movdqa xmm4, xmm3 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + ## d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm3 + pshufd xmm3, xmm3, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm1, xmm4 + + ## a ^= (d + c) <<< 18 + movdqa xmm4, xmm1 + pshufd xmm1, xmm1, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + ## We had to undo the transpose ready for the next loop. Again, push + ## back the shuffles because they take a long time coming through. + ## Decrement the loop counter and see if we should go round again. + ## Later processors fuse this pair into a single uop. + sub ecx, 2 + ja loop + + ## Almost there. Firstly, the feedforward addition, and then we have + ## to write out the result. Here we have to undo the permutation + ## which was already applied to the input. Shuffling has quite high + ## latency, so arrange to start a new shuffle into a temporary as + ## soon as we've written out the old value. 
+ mov edx, [ebp + 16] + + paddd xmm0, [esp + 0] + pshufd xmm4, xmm0, 0x39 + movd [edx + 0], xmm0 + + paddd xmm1, [esp + 16] + pshufd xmm5, xmm1, 0x93 + movd [edx + 16], xmm1 + + paddd xmm2, xmm6 + pshufd xmm6, xmm2, 0x4e + movd [edx + 32], xmm2 + + paddd xmm3, xmm7 + pshufd xmm7, xmm3, 0x39 + movd [edx + 48], xmm3 + + movd [edx + 4], xmm7 + pshufd xmm7, xmm3, 0x4e + movd [edx + 24], xmm7 + pshufd xmm3, xmm3, 0x93 + movd [edx + 44], xmm3 + + movd [edx + 8], xmm6 + pshufd xmm6, xmm2, 0x93 + movd [edx + 28], xmm6 + pshufd xmm2, xmm2, 0x39 + movd [edx + 52], xmm2 + + movd [edx + 12], xmm5 + pshufd xmm5, xmm1, 0x39 + movd [edx + 36], xmm5 + pshufd xmm1, xmm1, 0x4e + movd [edx + 56], xmm1 + + movd [edx + 20], xmm4 + pshufd xmm4, xmm0, 0x4e + movd [edx + 40], xmm4 + pshufd xmm0, xmm0, 0x93 + movd [edx + 60], xmm0 + + ## And with that, we're done. + mov esp, ebp + pop ebp + ret + + .size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2 + +###----- That's all, folks -------------------------------------------------- diff --git a/symm/salsa20.c b/symm/salsa20.c index 0104e5e5..d3fb69a7 100644 --- a/symm/salsa20.c +++ b/symm/salsa20.c @@ -7,11 +7,14 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include "arena.h" +#include "dispatch.h" #include "gcipher.h" #include "grand.h" #include "keysz.h" @@ -39,9 +42,28 @@ const octet salsa20_keysz[] = { KSZ_SET, 32, 16, 10, 0 }; * the feedforward step. 
*/ -static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest) +CPU_DISPATCH(static, (void), + void, core, (unsigned r, const salsa20_matrix src, + salsa20_matrix dest), + (r, src, dest), + pick_core, simple_core); + +static void simple_core(unsigned r, const salsa20_matrix src, + salsa20_matrix dest) { SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); } +#ifdef CPUFAM_X86 +extern core__functype salsa20_core_x86_sse2; +#endif + +static core__functype *pick_core(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_SSE2)) return salsa20_core_x86_sse2; +#endif + return simple_core; +} + /* --- @populate@ --- * * * Arguments: @salsa20_matrix a@ = a matrix to fill in -- 2.11.0