From 1a0c09c4d4ed8a4a1a1679f793431cc4f5c24c80 Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Wed, 18 May 2016 10:29:03 +0100
Subject: [PATCH] Preprocess the assembler files.

  * Rename the `*.s' files to `*.S'.

  * Create a new header `base/asm-common.h' containing useful
    definitions, particularly for dealing with the peculiarities of
    shared library code.

  * Convert the assembler files to use the new macros.

  * Convert the assembler files to use `//' for comments rather than
    `#' (as currently).  This is a bit annoying, but `#' is wanted by
    the preprocessor, and `/* ... */' doesn't work in Emacs's
    `asm-mode'.

The reason for doing all of this is that the C preprocessor will let me
do things like invent symbolic names for registers, which will be handy
later when I add support for AMD64 processors: most of the code will be
identical between 32- and 64-bit machines.

This change has the side effect that the AESNI implementation no longer
uses PIC-ish means to find things when it doesn't need to.
---
 base/Makefile.am          |   3 +
 base/asm-common.h         | 152 +++++++++++++
 symm/Makefile.am          |   6 +-
 symm/chacha-x86-sse2.S    | 195 ++++++++++++++++
 symm/chacha-x86-sse2.s    | 188 ----------------
 symm/rijndael-x86-aesni.S | 548 +++++++++++++++++++++++++++++++++++++++++++++
 symm/rijndael-x86-aesni.s | 553 ----------------------------------------------
 symm/salsa20-x86-sse2.S   | 254 +++++++++++++++++++++
 symm/salsa20-x86-sse2.s   | 247 ---------------------
 9 files changed, 1155 insertions(+), 991 deletions(-)
 create mode 100644 base/asm-common.h
 create mode 100644 symm/chacha-x86-sse2.S
 delete mode 100644 symm/chacha-x86-sse2.s
 create mode 100644 symm/rijndael-x86-aesni.S
 delete mode 100644 symm/rijndael-x86-aesni.s
 create mode 100644 symm/salsa20-x86-sse2.S
 delete mode 100644 symm/salsa20-x86-sse2.s

diff --git a/base/Makefile.am b/base/Makefile.am
index c8608ed4..0ac43f2e 100644
--- a/base/Makefile.am
+++ b/base/Makefile.am
@@ -55,4 +55,7 @@ libbase_la_SOURCES += lmem.c
 ## Clearing secrets from memory.
 pkginclude_HEADERS += paranoia.h
 
+## Base definitions for assembler source.
+EXTRA_DIST += asm-common.h
+
 ###----- That's all, folks --------------------------------------------------
diff --git a/base/asm-common.h b/base/asm-common.h
new file mode 100644
index 00000000..7e62eb54
--- /dev/null
+++ b/base/asm-common.h
@@ -0,0 +1,152 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Common definitions for assembler source files
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// General definitions.
+
+// Announcing an external function.
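+//
+// A sketch of the intended usage, as in the files converted below
+// (`foo' is a hypothetical function name):
+//
+//	FUNC(foo)		// runs FUNC_PREHOOK (e.g., alignment),
+//				//   exports and labels the entry point
+//		...body...
+//	ENDFUNC			// records the symbol size and runs
+//				//   ENDFUNC_HOOK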
+#define FUNC(name)						\
+	.globl	F(name);					\
+	TYPE_FUNC(name);					\
+	.macro ENDFUNC; _ENDFUNC(name); .endm;			\
+	FUNC_PREHOOK(name);					\
+F(name):							\
+	FUNC_POSTHOOK(name)
+
+// Marking the end of a function.
+#define _ENDFUNC(name)						\
+	.purgem ENDFUNC;					\
+	SIZE_OBJ(name);						\
+	ENDFUNC_HOOK(name)
+
+///--------------------------------------------------------------------------
+/// ELF-specific hacking.
+
+#if __ELF__
+
+#if __PIC__ || __PIE__
+#  define WANT_PIC 1
+#endif
+
+#define TYPE_FUNC(name) .type name, STT_FUNC
+
+#define SIZE_OBJ(name) .size name, . - name
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86-specific hacking.
+
+#if CPUFAM_X86
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 16
+
+// Don't use the wretched AT&T syntax.  It's festooned with pointless
+// punctuation, and all of the data movement is backwards.  Ugh!
+	.intel_syntax noprefix
+
+// Call external subroutine at ADDR, possibly via PLT.
+	.macro	callext addr
+#if WANT_PIC
+	call	\addr@PLT
+#else
+	call	\addr
+#endif
+	.endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC && CPUFAM_X86
+#  define NEED_GOT 1
+#endif
+#define GOTREG ebx		// Not needed on AMD64, so don't care.
+
+// Maybe load the GOT address into the GOT register.
+	.macro	ldgot got=GOTREG
+#if WANT_PIC
+	call	_where_am_i.\got
+	add	\got, offset _GLOBAL_OFFSET_TABLE_
+#endif
+	.endm
+
+// Maybe build a helper subroutine for `ldgot GOT'.
+	.macro	gotaux got=GOTREG
+#if WANT_PIC
+	.align	16
+_where_am_i.\got :
+	mov	\got, [esp]
+	ret
+#endif
+	.endm
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
+	.macro	leaext reg, addr, got=GOTREG
+#if WANT_PIC
+	mov	\reg, [\got + \addr@GOT]
+#else
+	mov	\reg, offset \addr
+#endif
+	.endm
+
+// Address expression (possibly using a base register, and a displacement)
+// referring to ADDR, which is within our module, maybe using GOT.
+#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
+#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
+#if WANT_PIC
+#  define INTADDR__1(addr, got) got + addr@GOTOFF
+#else
+#  define INTADDR__1(addr, got) addr
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Final stuff.
+
+// Default values for the various hooks.
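+// For instance, the x86 section above arranges
+//
+//	#define FUNC_PREHOOK(_) .balign 16
+//
+// so that every entry point is 16-byte aligned.  Anything still
+// undefined by this point becomes a no-op below.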
+#ifndef FUNC_PREHOOK +# define FUNC_PREHOOK(name) +#endif +#ifndef FUNC_POSTHOOK +# define FUNC_POSTHOOK(name) +#endif +#ifndef ENDFUNC_HOOK +# define ENDFUNC_HOOK(name) +#endif + +#ifndef F +# define F(name) name +#endif + +#ifndef TYPE_FUNC +# define TYPE_FUNC(name) +#endif + +#ifndef SIZE_OBJ +# define SIZE_OBJ(name) +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/Makefile.am b/symm/Makefile.am index 6a63993d..ba037cd5 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -181,7 +181,7 @@ BLKCS += rc5 BLKCS += rijndael rijndael192 rijndael256 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c if CPUFAM_X86 -libsymm_la_SOURCES += rijndael-x86-aesni.s +libsymm_la_SOURCES += rijndael-x86-aesni.S endif libsymm_la_SOURCES += $(precomp)/rijndael-tab.c PRECOMPS += $(precomp)/rijndael-tab.c @@ -382,7 +382,7 @@ EXTRA_DIST += salsa20-tvconv pkginclude_HEADERS += salsa20.h salsa20-core.h libsymm_la_SOURCES += salsa20.c if CPUFAM_X86 -libsymm_la_SOURCES += salsa20-x86-sse2.s +libsymm_la_SOURCES += salsa20-x86-sse2.S endif TESTS += salsa20.$t ALL_CIPHERS += salsa20 salsa2012 salsa208 @@ -411,7 +411,7 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV) pkginclude_HEADERS += chacha.h chacha-core.h libsymm_la_SOURCES += chacha.c if CPUFAM_X86 -libsymm_la_SOURCES += chacha-x86-sse2.s +libsymm_la_SOURCES += chacha-x86-sse2.S endif TESTS += chacha.$t EXTRA_DIST += t/chacha diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86-sse2.S new file mode 100644 index 00000000..f9ae1c4e --- /dev/null +++ b/symm/chacha-x86-sse2.S @@ -0,0 +1,195 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Fancy SIMD implementation of ChaCha +/// +/// (c) 2015 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Main code. + + .arch pentium4 + .section .text + +FUNC(chacha_core_x86_sse2) + + // Initial state. We have three arguments: + // [ebp + 8] is the number of rounds to do + // [ebp + 12] points to the input matrix + // [ebp + 16] points to the output matrix + push ebp + mov ebp, esp + sub esp, 16 + mov edx, [ebp + 12] + and esp, ~15 + + // First job is to slurp the matrix into XMM registers. Be careful: + // the input matrix isn't likely to be properly aligned. 
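+	// (This is why the loads below use `movdqu' rather than
+	// `movdqa': the latter faults on a misaligned operand.)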
+	//
+	// [ 0  1  2  3]		(a, xmm0)
+	// [ 4  5  6  7]		(b, xmm1)
+	// [ 8  9 10 11]		(c, xmm2)
+	// [12 13 14 15]		(d, xmm3)
+	movdqu	xmm0, [edx +  0]
+	movdqu	xmm1, [edx + 16]
+	movdqu	xmm2, [edx + 32]
+	movdqu	xmm3, [edx + 48]
+
+	// Prepare for the main loop.
+	mov	ecx, [ebp + 8]
+
+	// Take a copy for later.  This one is aligned properly, by
+	// construction.
+	movdqa	[esp], xmm0
+	movdqa	xmm5, xmm1
+	movdqa	xmm6, xmm2
+	movdqa	xmm7, xmm3
+
+loop:
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, 0x93
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, 0x4e
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// The not-quite-transpose conveniently only involves reordering
+	// elements of individual rows, which can be done quite easily.  It
+	// doesn't involve any movement of elements between rows, or even
+	// renaming of the rows.
+	//
+	// [ 0  1  2  3]     [ 0  1  2  3]	(a, xmm0)
+	// [ 4  5  6  7] --> [ 5  6  7  4]	(b, xmm1)
+	// [ 8  9 10 11]     [10 11  8  9]	(c, xmm2)
+	// [12 13 14 15]     [15 12 13 14]	(d, xmm3)
+	//
+	// The shuffles have quite high latency, so they've mostly been
+	// pushed upwards.  The remaining one can't be moved, though.
+	pshufd	xmm1, xmm1, 0x39
+
+	// Apply the diagonal quarterround to each of the columns
+	// simultaneously.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, 0x39
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, 0x4e
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// Finally, finish off undoing the transpose, and we're done for this
+	// doubleround.  Again, most of this was done above so we don't have
+	// to wait for the shuffles.
+	pshufd	xmm1, xmm1, 0x93
+
+	// Decrement the loop counter and see if we should go round again.
+	sub	ecx, 2
+	ja	loop
+
+	// Almost there.  Firstly, the feedforward addition.
+	mov	edx, [ebp + 16]
+	paddd	xmm0, [esp]
+	paddd	xmm1, xmm5
+	paddd	xmm2, xmm6
+	paddd	xmm3, xmm7
+
+	// And now we write out the result.  This one won't be aligned
+	// either.
+	movdqu	[edx +  0], xmm0
+	movdqu	[edx + 16], xmm1
+	movdqu	[edx + 32], xmm2
+	movdqu	[edx + 48], xmm3
+
+	// Tidy things up.
+	mov	esp, ebp
+	pop	ebp
+
+	// And with that, we're done.
+ ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/chacha-x86-sse2.s b/symm/chacha-x86-sse2.s deleted file mode 100644 index 7b790107..00000000 --- a/symm/chacha-x86-sse2.s +++ /dev/null @@ -1,188 +0,0 @@ -### -*- mode: asm; asm-comment-char: ?# -*- -### -### Fancy SIMD implementation of ChaCha -### -### (c) 2015 Straylight/Edgeware -### - -###----- Licensing notice --------------------------------------------------- -### -### This file is part of Catacomb. -### -### Catacomb is free software; you can redistribute it and/or modify -### it under the terms of the GNU Library General Public License as -### published by the Free Software Foundation; either version 2 of the -### License, or (at your option) any later version. -### -### Catacomb is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU Library General Public License for more details. -### -### You should have received a copy of the GNU Library General Public -### License along with Catacomb; if not, write to the Free -### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -### MA 02111-1307, USA. - - .intel_syntax noprefix - .arch pentium4 - - .section .text - - .globl chacha_core_x86_sse2 - .type chacha_core_x86_sse2, STT_FUNC -chacha_core_x86_sse2: - - ## Initial state. We have three arguments: - ## [ebp + 8] is the number of rounds to do - ## [ebp + 12] points to the input matrix - ## [ebp + 16] points to the output matrix - push ebp - mov ebp, esp - sub esp, 16 - mov edx, [ebp + 12] - and esp, ~15 - - ## First job is to slurp the matrix into XMM registers. Be careful: - ## the input matrix isn't likely to be properly aligned. - ## - ## [ 0 1 2 3] (a, xmm0) - ## [ 4 5 6 7] (b, xmm0) - ## [ 8 9 10 11] (c, xmm0) - ## [12 13 14 15] (d, xmm0) - movdqu xmm0, [edx + 0] - movdqu xmm1, [edx + 16] - movdqu xmm2, [edx + 32] - movdqu xmm3, [edx + 48] - - ## Prepare for the main loop. - mov ecx, [ebp + 8] - - ## Take a copy for later. This one is aligned properly, by - ## construction. - movdqa [esp], xmm0 - movdqa xmm5, xmm1 - movdqa xmm6, xmm2 - movdqa xmm7, xmm3 - -loop: - ## Apply a column quarterround to each of the columns simultaneously. - ## Alas, there doesn't seem to be a packed doubleword rotate, so we - ## have to synthesize it. - - ## a += b; d ^= a; d <<<= 16 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm4, xmm3 - pslld xmm3, 16 - psrld xmm4, 16 - por xmm3, xmm4 - - ## c += d; b ^= c; b <<<= 12 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm4, xmm1 - pslld xmm1, 12 - psrld xmm4, 20 - por xmm1, xmm4 - - ## a += b; d ^= a; d <<<= 8 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm4, xmm3 - pslld xmm3, 8 - psrld xmm4, 24 - por xmm3, xmm4 - - ## c += d; b ^= c; b <<<= 7 - paddd xmm2, xmm3 - pshufd xmm3, xmm3, 0x93 - pxor xmm1, xmm2 - pshufd xmm2, xmm2, 0x4e - movdqa xmm4, xmm1 - pslld xmm1, 7 - psrld xmm4, 25 - por xmm1, xmm4 - - ## The not-quite-transpose conveniently only involves reordering - ## elements of individual rows, which can be done quite easily. It - ## doesn't involve any movement of elements between rows, or even - ## renaming of the rows. - ## - ## [ 0 1 2 3] [ 0 1 2 3] (a, xmm0) - ## [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1) - ## [ 8 9 10 11] [10 11 8 9] (c, xmm2) - ## [12 13 14 15] [15 12 13 14] (d, xmm3) - ## - ## The shuffles have quite high latency, so they've mostly been - ## pushed upwards. 
The remaining one can't be moved, though.
-	pshufd	xmm1, xmm1, 0x39
-
-	## Apply the diagonal quarterround to each of the columns
-	## simultaneously.
-
-	## a += b; d ^= a; d <<<= 16
-	paddd	xmm0, xmm1
-	pxor	xmm3, xmm0
-	movdqa	xmm4, xmm3
-	pslld	xmm3, 16
-	psrld	xmm4, 16
-	por	xmm3, xmm4
-
-	## c += d; b ^= c; b <<<= 12
-	paddd	xmm2, xmm3
-	pxor	xmm1, xmm2
-	movdqa	xmm4, xmm1
-	pslld	xmm1, 12
-	psrld	xmm4, 20
-	por	xmm1, xmm4
-
-	## a += b; d ^= a; d <<<= 8
-	paddd	xmm0, xmm1
-	pxor	xmm3, xmm0
-	movdqa	xmm4, xmm3
-	pslld	xmm3, 8
-	psrld	xmm4, 24
-	por	xmm3, xmm4
-
-	## c += d; b ^= c; b <<<= 7
-	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, 0x39
-	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, 0x4e
-	movdqa	xmm4, xmm1
-	pslld	xmm1, 7
-	psrld	xmm4, 25
-	por	xmm1, xmm4
-
-	## Finally, finish off undoing the transpose, and we're done for this
-	## doubleround.  Again, most of this was done above so we don't have
-	## to wait for the shuffles.
-	pshufd	xmm1, xmm1, 0x93
-
-	## Decrement the loop counter and see if we should go round again.
-	sub	ecx, 2
-	ja	loop
-
-	## Almost there.  Firstly, the feedforward addition.
-	mov	edx, [ebp + 16]
-	paddd	xmm0, [esp]
-	paddd	xmm1, xmm5
-	paddd	xmm2, xmm6
-	paddd	xmm3, xmm7
-
-	## And now we write out the result.  This one won't be aligned
-	## either.
-	movdqu	[edx +  0], xmm0
-	movdqu	[edx + 16], xmm1
-	movdqu	[edx + 32], xmm2
-	movdqu	[edx + 48], xmm3
-
-	## And with that, we're done.
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-	.size	chacha_core_x86_sse2, . - chacha_core_x86_sse2
-
-###----- That's all, folks --------------------------------------------------
diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86-aesni.S
new file mode 100644
index 00000000..d9aa9dc9
--- /dev/null
+++ b/symm/rijndael-x86-aesni.S
@@ -0,0 +1,548 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// AESNI-based implementation of Rijndael
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// External symbols.
+
+	.globl	F(abort)
+	.globl	F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch	.aes
+	.section .text
+
+/// The AESNI instructions implement a little-endian version of AES, but
+/// Catacomb's internal interface presents as big-endian so as to work better
+/// with things like GCM.  We therefore maintain the round keys in
+/// little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the AESNI instructions don't implement the
+/// larger-block versions of Rijndael, so we have to end-swap the keys if
+/// we're preparing for one of those.
+
+	// Useful constants.
+	.equ	maxrounds, 16		// maximum number of rounds
+	.equ	maxblksz, 32		// maximum block size, in bytes
+	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
+
+	// Context structure.
+	.equ	nr, 0			// number of rounds
+	.equ	w, nr + 4		// encryption key words
+	.equ	wi, w + kbufsz		// decryption key words
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_x86_aesni)
+
+	// Initial state.  We have four arguments:
+	// [esp + 20] is the context pointer
+	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
+	// [esp + 28] points to the key material, unaligned
+	// [esp + 32] is the size of the key, in words
+	// The key size has already been checked for validity, and the number
+	// of rounds has been computed.  Our job is only to fill in the `w'
+	// and `wi' vectors.
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	// The initial round key material is taken directly from the input
+	// key, so copy it over.
+	mov	ebp, [esp + 20]		// context base pointer
+	mov	ebx, [esp + 32]		// key size, in words
+	mov	ecx, ebx
+	mov	esi, [esp + 28]
+	lea	edi, [ebp + w]
+	rep	movsd
+
+	// Find out other useful things.
+	mov	edx, [ebp + nr]		// number of rounds
+	add	edx, 1
+	imul	edx, [esp + 24]		// total key size in words
+	sub	edx, ebx		// offset by the key size
+
+	// Find the round constants.
+	ldgot	ecx
+	leaext	ecx, rijndael_rcon, ecx
+
+	// Prepare for the main loop.
+	lea	esi, [ebp + w]
+	mov	eax, [esi + 4*ebx - 4]	// most recent key word
+	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion
+
+	// Main key expansion loop.  The first word of each key-length chunk
+	// needs special treatment.
+	//
+	// This is rather tedious because the Intel `AESKEYGENASSIST'
+	// instruction is very strangely shaped.  Firstly, it wants to
+	// operate on vast SSE registers, even though we're data-blocked from
+	// doing more than one operation at a time unless we're doing two key
+	// schedules simultaneously -- and even then we can't do more than
+	// two, because the instruction ignores two of its input words
+	// entirely, and produces two different outputs for each of the other
+	// two.  And secondly it insists on taking the magic round constant
+	// as an immediate, so it's kind of annoying if you're not
+	// open-coding the whole thing.  It's much easier to leave that as
+	// zero and XOR in the round constant by hand.
+9:	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x39
+	aeskeygenassist xmm1, xmm0, 0
+	pshufd	xmm1, xmm1, 0x93
+	movd	eax, xmm1
+	xor	eax, [esi]
+	xor	al, [ecx]
+	inc	ecx
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// The next three words are simple...
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// (Word 2...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// (Word 3...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// Word 4.  If the key is /more/ than 6 words long, then we must
+	// apply a substitution here.
+	cmp	ebx, 5
+	jb	9b
+	cmp	ebx, 7
+	jb	0f
+	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x93
+	aeskeygenassist xmm1, xmm0, 0
+	movd	eax, xmm1
+0:	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	// (Word 5...)
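+	// (Words 5, 6 and 7 of a chunk exist only for the longer key
+	// sizes; each `cmp'/`jb' pair below loops back to label 9 once
+	// we've run out of key words.)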
+ cmp ebx, 6 + jb 9b + xor eax, [esi] + mov [esi + 4*ebx], eax + add esi, 4 + cmp esi, edx + jae 8f + + // (Word 6...) + cmp ebx, 7 + jb 9b + xor eax, [esi] + mov [esi + 4*ebx], eax + add esi, 4 + cmp esi, edx + jae 8f + + // (Word 7...) + cmp ebx, 8 + jb 9b + xor eax, [esi] + mov [esi + 4*ebx], eax + add esi, 4 + cmp esi, edx + jae 8f + + // Must be done by now. + jmp 9b + + // Next job is to construct the decryption keys. The keys for the + // first and last rounds don't need to be mangled, but the remaining + // ones do -- and they all need to be reordered too. + // + // The plan of action, then, is to copy the final encryption round's + // keys into place first, then to do each of the intermediate rounds + // in reverse order, and finally do the first round. + // + // Do all of the heavy lifting with SSE registers. The order we're + // doing this in means that it's OK if we read or write too much, and + // there's easily enough buffer space for the over-enthusiastic reads + // and writes because the context has space for 32-byte blocks, which + // is our maximum and an exact fit for two SSE registers. +8: mov ecx, [ebp + nr] // number of rounds + mov ebx, [esp + 24] // block size (in words) + mov edx, ecx + imul edx, ebx + lea edi, [ebp + wi] + lea esi, [ebp + 4*edx + w] // last round's keys + shl ebx, 2 // block size (in bytes now) + + // Copy the last encryption round's keys. + movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + // Update the loop variables and stop if we've finished. +9: add edi, ebx + sub esi, ebx + sub ecx, 1 + jbe 0f + + // Do another middle round's keys... + movdqu xmm0, [esi] + aesimc xmm0, xmm0 + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9b + movdqu xmm0, [esi + 16] + aesimc xmm0, xmm0 + movdqu [edi + 16], xmm0 + jmp 9b + + // Finally do the first encryption round. +0: movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 0f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + // If the block size is not exactly four words then we must end-swap + // everything. We can use fancy SSE toys for this. +0: cmp ebx, 16 + je 0f + + // Find the byte-reordering table. + ldgot ecx + movdqa xmm7, [INTADDR(endswap_tab, ecx)] + + // Calculate the number of subkey words again. (It's a good job + // we've got a fast multiplier.) + mov ecx, [ebp + nr] + add ecx, 1 + imul ecx, [esp + 24] // total keys in words + + // End-swap the encryption keys. + mov eax, ecx + lea esi, [ebp + w] + call endswap_block + + // And the decryption keys. + mov ecx, eax + lea esi, [ebp + wi] + call endswap_block + + // All done. +0: pop edi + pop esi + pop ebx + pop ebp + ret + + .align 16 +endswap_block: + // End-swap ECX words starting at ESI. The end-swapping table is + // already loaded into XMM7; and it's OK to work in 16-byte chunks. + movdqu xmm1, [esi] + pshufb xmm1, xmm7 + movdqu [esi], xmm1 + add esi, 16 + sub ecx, 4 + ja endswap_block + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Encrypting and decrypting blocks. + +FUNC(rijndael_eblk_x86_aesni) + + // On entry, we have: + // [esp + 4] points to the context block + // [esp + 8] points to the input data block + // [esp + 12] points to the output buffer + + // Find the magic endianness-swapping table. + ldgot ecx + movdqa xmm7, [INTADDR(endswap_tab, ecx)] + + // Load the input block and end-swap it. Also, start loading the + // keys. 
+	mov	eax, [esp + 8]
+	movdqu	xmm0, [eax]
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 4]
+	lea	edx, [eax + w]
+	mov	eax, [eax + nr]
+
+	// Initial whitening.
+	movdqu	xmm1, [edx]
+	add	edx, 16
+	pxor	xmm0, xmm1
+
+	// Dispatch to the correct code.
+	cmp	eax, 10
+	je	er10
+	jb	bogus
+	cmp	eax, 14
+	je	er14
+	ja	bogus
+	cmp	eax, 12
+	je	er12
+	jb	er11
+	jmp	er13
+
+	.align	2
+
+	// 14 rounds...
+er14:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 13 rounds...
+er13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 12 rounds...
+er12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 11 rounds...
+er11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	// 10 rounds...
+er10:	movdqu	xmm1, [edx]
+	aesenc	xmm0, xmm1
+
+	// 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesenc	xmm0, xmm1
+
+	// 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesenc	xmm0, xmm1
+
+	// 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesenc	xmm0, xmm1
+
+	// 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesenc	xmm0, xmm1
+
+	// 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesenc	xmm0, xmm1
+
+	// 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesenc	xmm0, xmm1
+
+	// 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesenc	xmm0, xmm1
+
+	// 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesenc	xmm0, xmm1
+
+	// Final round...
+	movdqu	xmm1, [edx + 144]
+	aesenclast xmm0, xmm1
+
+	// Unpermute the ciphertext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	// And we're done.
+	ret
+
+ENDFUNC
+
+FUNC(rijndael_dblk_x86_aesni)
+
+	// On entry, we have:
+	// [esp + 4] points to the context block
+	// [esp + 8] points to the input data block
+	// [esp + 12] points to the output buffer
+
+	// Find the magic endianness-swapping table.
+	ldgot	ecx
+	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]
+
+	// Load the input block and end-swap it.  Also, start loading the
+	// keys.
+	mov	eax, [esp + 8]
+	movdqu	xmm0, [eax]
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 4]
+	lea	edx, [eax + wi]
+	mov	eax, [eax + nr]
+
+	// Initial whitening.
+	movdqu	xmm1, [edx]
+	add	edx, 16
+	pxor	xmm0, xmm1
+
+	// Dispatch to the correct code.
+	cmp	eax, 10
+	je	dr10
+	jb	bogus
+	cmp	eax, 14
+	je	dr14
+	ja	bogus
+	cmp	eax, 12
+	je	dr12
+	jb	dr11
+	jmp	dr13
+
+	.align	2
+
+	// 14 rounds...
+dr14:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 13 rounds...
+dr13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 12 rounds...
+dr12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 11 rounds...
+dr11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	// 10 rounds...
+dr10:	movdqu	xmm1, [edx]
+	aesdec	xmm0, xmm1
+
+	// 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesdec	xmm0, xmm1
+
+	// 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesdec	xmm0, xmm1
+
+	// 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesdec	xmm0, xmm1
+
+	// 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesdec	xmm0, xmm1
+
+	// 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesdec	xmm0, xmm1
+
+	// 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesdec	xmm0, xmm1
+
+	// 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesdec	xmm0, xmm1
+
+	// 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesdec	xmm0, xmm1
+
+	// Final round...
+	movdqu	xmm1, [edx + 144]
+	aesdeclast xmm0, xmm1
+
+	// Unpermute the plaintext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	// And we're done.
+	ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Random utilities.
+ + .align 16 + // Abort the process because of a programming error. Indirecting + // through this point serves several purposes: (a) by CALLing, rather + // than branching to, `abort', we can save the return address, which + // might at least provide a hint as to what went wrong; (b) we don't + // have conditional CALLs (and they'd be big anyway); and (c) we can + // write a HLT here as a backstop against `abort' being mad. +bogus: callext F(abort) +0: hlt + jmp 0b + + gotaux ecx + +///-------------------------------------------------------------------------- +/// Data tables. + + .align 16 +endswap_tab: + .byte 3, 2, 1, 0 + .byte 7, 6, 5, 4 + .byte 11, 10, 9, 8 + .byte 15, 14, 13, 12 + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael-x86-aesni.s b/symm/rijndael-x86-aesni.s deleted file mode 100644 index 79b33584..00000000 --- a/symm/rijndael-x86-aesni.s +++ /dev/null @@ -1,553 +0,0 @@ -### -*- mode: asm; asm-comment-char: ?# -*- -### -### AESNI-based implementation of Rijndael -### -### (c) 2015 Straylight/Edgeware -### - -###----- Licensing notice --------------------------------------------------- -### -### This file is part of Catacomb. -### -### Catacomb is free software; you can redistribute it and/or modify -### it under the terms of the GNU Library General Public License as -### published by the Free Software Foundation; either version 2 of the -### License, or (at your option) any later version. -### -### Catacomb is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU Library General Public License for more details. -### -### You should have received a copy of the GNU Library General Public -### License along with Catacomb; if not, write to the Free -### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -### MA 02111-1307, USA. - - .intel_syntax noprefix - .arch .aes - - .globl abort - .globl rijndael_rcon - - .section .text - -### The AESNI instructions implement a little-endian version of AES, but -### Catacomb's internal interface presents as big-endian so as to work better -### with things like GCM. We therefore maintain the round keys in -### little-endian form, and have to end-swap blocks in and out. -### -### For added amusement, the AESNI instructions don't implement the -### larger-block versions of Rijndael, so we have to end-swap the keys if -### we're preparing for one of those. - - ## Useful constants. - .equ maxrounds, 16 # maximum number of rounds - .equ maxblksz, 32 # maximum block size, in bytes - .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer - - ## Context structure. - .equ nr, 0 # number of rounds - .equ w, nr + 4 # encryption key words - .equ wi, w + kbufsz # decryption key words - -###-------------------------------------------------------------------------- -### Key setup. - - .globl rijndael_setup_x86_aesni - .type rijndael_setup_x86_aesni, STT_FUNC - .align 16 -rijndael_setup_x86_aesni: - - ## Initial state. We have four arguments: - ## [esp + 20] is the context pointer - ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8) - ## [esp + 28] points to the key material, unaligned - ## [esp + 32] is the size of the key, in words - ## The key size has already been checked for validity, and the number - ## of rounds has been computed. Our job is only to fill in the `w' - ## and `wi' vectors. 
- - push ebp - push ebx - push esi - push edi - - ## The initial round key material is taken directly from the input - ## key, so copy it over. - mov ebp, [esp + 20] # context base pointer - mov ebx, [esp + 32] # key size, in words - mov ecx, ebx - mov esi, [esp + 28] - lea edi, [ebp + w] - rep movsd - - ## Find out other useful things. - mov edx, [ebp + nr] # number of rounds - add edx, 1 - imul edx, [esp + 24] # total key size in words - sub edx, ebx # offset by the key size - - ## Find the round constants. - call where_am_i_ecx - add ecx, offset _GLOBAL_OFFSET_TABLE_ - mov ecx, [ecx + rijndael_rcon@GOT] - - ## Prepare for the main loop. - lea esi, [ebp + w] - mov eax, [esi + 4*ebx - 4] # most recent key word - lea edx, [esi + 4*edx] # limit, offset by one key expansion - - ## Main key expansion loop. The first word of each key-length chunk - ## needs special treatment. - ## - ## This is rather tedious because the Intel `AESKEYGENASSIST' - ## instruction is very strangely shaped. Firstly, it wants to - ## operate on vast SSE registers, even though we're data-blocked from - ## doing more than operation at a time unless we're doing two key - ## schedules simultaneously -- and even then we can't do more than - ## two, because the instruction ignores two of its input words - ## entirely, and produces two different outputs for each of the other - ## two. And secondly it insists on taking the magic round constant - ## as an immediate, so it's kind of annoying if you're not - ## open-coding the whole thing. It's much easier to leave that as - ## zero and XOR in the round constant by hand. -9: movd xmm0, eax - pshufd xmm0, xmm0, 0x39 - aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, 0x93 - movd eax, xmm1 - xor eax, [esi] - xor al, [ecx] - inc ecx - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## The next three words are simple... - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 2...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 3...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## Word 4. If the key is /more/ than 6 words long, then we must - ## apply a substitution here. - cmp ebx, 5 - jb 9b - cmp ebx, 7 - jb 0f - movd xmm0, eax - pshufd xmm0, xmm0, 0x93 - aeskeygenassist xmm1, xmm0, 0 - movd eax, xmm1 -0: xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 5...) - cmp ebx, 6 - jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 6...) - cmp ebx, 7 - jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## (Word 7...) - cmp ebx, 8 - jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx - jae 8f - - ## Must be done by now. - jmp 9b - - ## Next job is to construct the decryption keys. The keys for the - ## first and last rounds don't need to be mangled, but the remaining - ## ones do -- and they all need to be reordered too. - ## - ## The plan of action, then, is to copy the final encryption round's - ## keys into place first, then to do each of the intermediate rounds - ## in reverse order, and finally do the first round. - ## - ## Do all of the heavy lifting with SSE registers. 
The order we're - ## doing this in means that it's OK if we read or write too much, and - ## there's easily enough buffer space for the over-enthusiastic reads - ## and writes because the context has space for 32-byte blocks, which - ## is our maximum and an exact fit for two SSE registers. -8: mov ecx, [ebp + nr] # number of rounds - mov ebx, [esp + 24] # block size (in words) - mov edx, ecx - imul edx, ebx - lea edi, [ebp + wi] - lea esi, [ebp + 4*edx + w] # last round's keys - shl ebx, 2 # block size (in bytes now) - - ## Copy the last encryption round's keys. - movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 - jbe 9f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 - - ## Update the loop variables and stop if we've finished. -9: add edi, ebx - sub esi, ebx - sub ecx, 1 - jbe 0f - - ## Do another middle round's keys... - movdqu xmm0, [esi] - aesimc xmm0, xmm0 - movdqu [edi], xmm0 - cmp ebx, 16 - jbe 9b - movdqu xmm0, [esi + 16] - aesimc xmm0, xmm0 - movdqu [edi + 16], xmm0 - jmp 9b - - ## Finally do the first encryption round. -0: movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 - jbe 0f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 - - ## If the block size is not exactly four words then we must end-swap - ## everything. We can use fancy SSE toys for this. -0: cmp ebx, 16 - je 0f - - ## Find the byte-reordering table. - call where_am_i_ecx - movdqa xmm7, [ecx + endswap_tab - .] - - ## Calculate the number of subkey words again. (It's a good job - ## we've got a fast multiplier.) - mov ecx, [ebp + nr] - add ecx, 1 - imul ecx, [esp + 24] # total keys in words - - ## End-swap the encryption keys. - mov eax, ecx - lea esi, [ebp + w] - call endswap_block - - ## And the decryption keys. - mov ecx, eax - lea esi, [ebp + wi] - call endswap_block - - ## All done. -0: pop edi - pop esi - pop ebx - pop ebp - ret - - .align 16 -endswap_block: - ## End-swap ECX words starting at ESI. The end-swapping table is - ## already loaded into XMM7; and it's OK to work in 16-byte chunks. - movdqu xmm1, [esi] - pshufb xmm1, xmm7 - movdqu [esi], xmm1 - add esi, 16 - sub ecx, 4 - ja endswap_block - ret - - .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni - -###-------------------------------------------------------------------------- -### Encrypting and decrypting blocks. - - .globl rijndael_eblk_x86_aesni - .type rijndael_eblk_x86_aesni, STT_FUNC - .align 16 -rijndael_eblk_x86_aesni: - - ## On entry, we have: - ## [esp + 4] points to the context block - ## [esp + 8] points to the input data block - ## [esp + 12] points to the output buffer - - ## Find the magic endianness-swapping table. - call where_am_i_ecx - movdqa xmm7, [ecx + endswap_tab - .] - - ## Load the input block and end-swap it. Also, start loading the - ## keys. - mov eax, [esp + 8] - movdqu xmm0, [eax] - pshufb xmm0, xmm7 - mov eax, [esp + 4] - lea edx, [eax + w] - mov eax, [eax + nr] - - ## Initial whitening. - movdqu xmm1, [edx] - add edx, 16 - pxor xmm0, xmm1 - - ## Dispatch to the correct code. - cmp eax, 10 - je er10 - jb bogus - cmp eax, 14 - je er14 - ja bogus - cmp eax, 12 - je er12 - jb er11 - jmp er13 - - .align 2 - - ## 14 rounds... -er14: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 13 rounds... -er13: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 12 rounds... -er12: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 11 rounds... -er11: movdqu xmm1, [edx] - add edx, 16 - aesenc xmm0, xmm1 - - ## 10 rounds... 
-er10: movdqu xmm1, [edx] - aesenc xmm0, xmm1 - - ## 9 rounds... - movdqu xmm1, [edx + 16] - aesenc xmm0, xmm1 - - ## 8 rounds... - movdqu xmm1, [edx + 32] - aesenc xmm0, xmm1 - - ## 7 rounds... - movdqu xmm1, [edx + 48] - aesenc xmm0, xmm1 - - ## 6 rounds... - movdqu xmm1, [edx + 64] - aesenc xmm0, xmm1 - - ## 5 rounds... - movdqu xmm1, [edx + 80] - aesenc xmm0, xmm1 - - ## 4 rounds... - movdqu xmm1, [edx + 96] - aesenc xmm0, xmm1 - - ## 3 rounds... - movdqu xmm1, [edx + 112] - aesenc xmm0, xmm1 - - ## 2 rounds... - movdqu xmm1, [edx + 128] - aesenc xmm0, xmm1 - - ## Final round... - movdqu xmm1, [edx + 144] - aesenclast xmm0, xmm1 - - ## Unpermute the ciphertext block and store it. - pshufb xmm0, xmm7 - mov eax, [esp + 12] - movdqu [eax], xmm0 - - ## And we're done. - ret - - .size rijndael_eblk_x86_aesni, . - rijndael_dblk_x86_aesni - - .globl rijndael_dblk_x86_aesni - .type rijndael_dblk_x86_aesni, STT_FUNC - .align 16 -rijndael_dblk_x86_aesni: - - ## On entry, we have: - ## [esp + 4] points to the context block - ## [esp + 8] points to the input data block - ## [esp + 12] points to the output buffer - - ## Find the magic endianness-swapping table. - call where_am_i_ecx - movdqa xmm7, [ecx + endswap_tab - .] - - ## Load the input block and end-swap it. Also, start loading the - ## keys. - mov eax, [esp + 8] - movdqu xmm0, [eax] - pshufb xmm0, xmm7 - mov eax, [esp + 4] - lea edx, [eax + wi] - mov eax, [eax + nr] - - ## Initial whitening. - movdqu xmm1, [edx] - add edx, 16 - pxor xmm0, xmm1 - - ## Dispatch to the correct code. - cmp eax, 10 - je dr10 - jb bogus - cmp eax, 14 - je dr14 - ja bogus - cmp eax, 12 - je dr12 - jb dr11 - jmp dr13 - - .align 2 - - ## 14 rounds... -dr14: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 13 rounds... -dr13: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 12 rounds... -dr12: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 11 rounds... -dr11: movdqu xmm1, [edx] - add edx, 16 - aesdec xmm0, xmm1 - - ## 10 rounds... -dr10: movdqu xmm1, [edx] - aesdec xmm0, xmm1 - - ## 9 rounds... - movdqu xmm1, [edx + 16] - aesdec xmm0, xmm1 - - ## 8 rounds... - movdqu xmm1, [edx + 32] - aesdec xmm0, xmm1 - - ## 7 rounds... - movdqu xmm1, [edx + 48] - aesdec xmm0, xmm1 - - ## 6 rounds... - movdqu xmm1, [edx + 64] - aesdec xmm0, xmm1 - - ## 5 rounds... - movdqu xmm1, [edx + 80] - aesdec xmm0, xmm1 - - ## 4 rounds... - movdqu xmm1, [edx + 96] - aesdec xmm0, xmm1 - - ## 3 rounds... - movdqu xmm1, [edx + 112] - aesdec xmm0, xmm1 - - ## 2 rounds... - movdqu xmm1, [edx + 128] - aesdec xmm0, xmm1 - - ## Final round... - movdqu xmm1, [edx + 144] - aesdeclast xmm0, xmm1 - - ## Unpermute the ciphertext block and store it. - pshufb xmm0, xmm7 - mov eax, [esp + 12] - movdqu [eax], xmm0 - - ## And we're done. - ret - - .size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni - -###-------------------------------------------------------------------------- -### Random utilities. - - .align 16 - ## Abort the process because of a programming error. Indirecting - ## through this point serves several purposes: (a) by CALLing, rather - ## than branching to, `abort', we can save the return address, which - ## might at least provide a hint as to what went wrong; (b) we don't - ## have conditional CALLs (and they'd be big anyway); and (c) we can - ## write a HLT here as a backstop against `abort' being mad. -bogus: call abort@PLT -0: hlt - jmp 0b - - .align 16 - ## Return the address of the instruction following the CALL here in - ## ECX. 
This is useful for doing position-independent addressing.
-where_am_i_ecx:
-	mov	ecx, [esp]
-	ret
-
-###--------------------------------------------------------------------------
-### Data tables.
-
-	.align	16
-endswap_tab:
-	.byte	 3,  2,  1,  0
-	.byte	 7,  6,  5,  4
-	.byte	11, 10,  9,  8
-	.byte	15, 14, 13, 12
-
-###----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86-sse2.S
new file mode 100644
index 00000000..5a13fd49
--- /dev/null
+++ b/symm/salsa20-x86-sse2.S
@@ -0,0 +1,254 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch	pentium4
+	.section .text
+
+FUNC(salsa20_core_x86_sse2)
+
+	// Initial state.  We have three arguments:
+	// [ebp + 8] is the number of rounds to do
+	// [ebp + 12] points to the input matrix
+	// [ebp + 16] points to the output matrix
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 32
+	mov	edx, [ebp + 12]
+	and	esp, ~15
+
+	// Prepare for the main loop.
+	mov	ecx, [ebp + 8]
+
+	// First job is to slurp the matrix into XMM registers.  The words
+	// have already been permuted conveniently to make them line up
+	// better for SIMD processing.
+	//
+	// The textbook arrangement of the matrix is this.
+	//
+	//	[C K K K]
+	//	[K C N N]
+	//	[T T C K]
+	//	[K K K C]
+	//
+	// But we've rotated the columns up so that the main diagonal with
+	// the constants on it ends up in the first row, giving something
+	// more like
+	//
+	//	[C C C C]
+	//	[K T K K]
+	//	[T K K N]
+	//	[K K N K]
+	//
+	// so the transformation looks like this:
+	//
+	//	[ 0  1  2  3]		[ 0  5 10 15]	(a, xmm0)
+	//	[ 4  5  6  7]    -->	[ 4  9 14  3]	(b, xmm1)
+	//	[ 8  9 10 11]		[ 8 13  2  7]	(c, xmm2)
+	//	[12 13 14 15]		[12  1  6 11]	(d, xmm3)
+	movdqu	xmm0, [edx +  0]
+	movdqu	xmm1, [edx + 16]
+	movdqu	xmm2, [edx + 32]
+	movdqu	xmm3, [edx + 48]
+
+	// Take a copy for later.
+	movdqa	[esp +  0], xmm0
+	movdqa	[esp + 16], xmm1
+	movdqa	xmm6, xmm2
+	movdqa	xmm7, xmm3
+
+loop:
+
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
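+	// (Each rotate by r below is synthesized the same way: take a
+	// copy of the sum, shift one left by r and the other right by
+	// 32 - r, and OR the two halves together.)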
+ + // b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm3 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm1, xmm4 + + // c ^= (b + a) <<< 9 + movdqa xmm4, xmm1 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + // d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm1 + pshufd xmm1, xmm1, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm3, xmm4 + + // a ^= (d + c) <<< 18 + movdqa xmm4, xmm3 + pshufd xmm3, xmm3, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + // The transpose conveniently only involves reordering elements of + // individual rows, which can be done quite easily, and reordering + // the rows themselves, which is a trivial renaming. It doesn't + // involve any movement of elements between rows. + // + // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0) + // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3) + // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2) + // [12 1 6 11] [ 3 4 9 14] (d, xmm1) + // + // The shuffles have quite high latency, so they've been pushed + // backwards into the main instruction list. + + // Apply the row quarterround to each of the columns (yes!) + // simultaneously. + + // b ^= (a + d) <<< 7 + movdqa xmm4, xmm0 + paddd xmm4, xmm1 + movdqa xmm5, xmm4 + pslld xmm4, 7 + psrld xmm5, 25 + por xmm4, xmm5 + pxor xmm3, xmm4 + + // c ^= (b + a) <<< 9 + movdqa xmm4, xmm3 + paddd xmm4, xmm0 + movdqa xmm5, xmm4 + pslld xmm4, 9 + psrld xmm5, 23 + por xmm4, xmm5 + pxor xmm2, xmm4 + + // d ^= (c + b) <<< 13 + movdqa xmm4, xmm2 + paddd xmm4, xmm3 + pshufd xmm3, xmm3, 0x93 + movdqa xmm5, xmm4 + pslld xmm4, 13 + psrld xmm5, 19 + por xmm4, xmm5 + pxor xmm1, xmm4 + + // a ^= (d + c) <<< 18 + movdqa xmm4, xmm1 + pshufd xmm1, xmm1, 0x39 + paddd xmm4, xmm2 + pshufd xmm2, xmm2, 0x4e + movdqa xmm5, xmm4 + pslld xmm4, 18 + psrld xmm5, 14 + por xmm4, xmm5 + pxor xmm0, xmm4 + + // We had to undo the transpose ready for the next loop. Again, push + // back the shuffles because they take a long time coming through. + // Decrement the loop counter and see if we should go round again. + // Later processors fuse this pair into a single uop. + sub ecx, 2 + ja loop + + // Almost there. Firstly, the feedforward addition, and then we have + // to write out the result. Here we have to undo the permutation + // which was already applied to the input. Shuffling has quite high + // latency, so arrange to start a new shuffle into a temporary as + // soon as we've written out the old value. + mov edx, [ebp + 16] + + paddd xmm0, [esp + 0] + pshufd xmm4, xmm0, 0x39 + movd [edx + 0], xmm0 + + paddd xmm1, [esp + 16] + pshufd xmm5, xmm1, 0x93 + movd [edx + 16], xmm1 + + paddd xmm2, xmm6 + pshufd xmm6, xmm2, 0x4e + movd [edx + 32], xmm2 + + paddd xmm3, xmm7 + pshufd xmm7, xmm3, 0x39 + movd [edx + 48], xmm3 + + movd [edx + 4], xmm7 + pshufd xmm7, xmm3, 0x4e + movd [edx + 24], xmm7 + pshufd xmm3, xmm3, 0x93 + movd [edx + 44], xmm3 + + movd [edx + 8], xmm6 + pshufd xmm6, xmm2, 0x93 + movd [edx + 28], xmm6 + pshufd xmm2, xmm2, 0x39 + movd [edx + 52], xmm2 + + movd [edx + 12], xmm5 + pshufd xmm5, xmm1, 0x39 + movd [edx + 36], xmm5 + pshufd xmm1, xmm1, 0x4e + movd [edx + 56], xmm1 + + movd [edx + 20], xmm4 + pshufd xmm4, xmm0, 0x4e + movd [edx + 40], xmm4 + pshufd xmm0, xmm0, 0x93 + movd [edx + 60], xmm0 + + // Tidy things up. + mov esp, ebp + pop ebp + + // And with that, we're done. 
+ ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/salsa20-x86-sse2.s b/symm/salsa20-x86-sse2.s deleted file mode 100644 index ef2b73ef..00000000 --- a/symm/salsa20-x86-sse2.s +++ /dev/null @@ -1,247 +0,0 @@ -### -*- mode: asm; asm-comment-char: ?# -*- -### -### Fancy SIMD implementation of Salsa20 -### -### (c) 2015 Straylight/Edgeware -### - -###----- Licensing notice --------------------------------------------------- -### -### This file is part of Catacomb. -### -### Catacomb is free software; you can redistribute it and/or modify -### it under the terms of the GNU Library General Public License as -### published by the Free Software Foundation; either version 2 of the -### License, or (at your option) any later version. -### -### Catacomb is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU Library General Public License for more details. -### -### You should have received a copy of the GNU Library General Public -### License along with Catacomb; if not, write to the Free -### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -### MA 02111-1307, USA. - - .intel_syntax noprefix - .arch pentium4 - - .section .text - - .globl salsa20_core_x86_sse2 - .type salsa20_core_x86_sse2, STT_FUNC -salsa20_core_x86_sse2: - - ## Initial state. We have three arguments: - ## [ebp + 8] is the number of rounds to do - ## [ebp + 12] points to the input matrix - ## [ebp + 16] points to the output matrix - push ebp - mov ebp, esp - sub esp, 32 - mov edx, [ebp + 12] - and esp, ~15 - - ## Prepare for the main loop. - mov ecx, [ebp + 8] - - ## First job is to slurp the matrix into XMM registers. The words - ## have already been permuted conveniently to make them line up - ## better for SIMD processing. - ## - ## The textbook arrangement of the matrix is this. - ## - ## [C K K K] - ## [K C N N] - ## [T T C K] - ## [K K K C] - ## - ## But we've rotated the columns up so that the main diagonal with - ## the constants on it end up in the first row, giving something more - ## like - ## - ## [C C C C] - ## [K T K K] - ## [T K K N] - ## [K K N K] - ## - ## so the transformation looks like this: - ## - ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0) - ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1) - ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2) - ## [12 13 14 15] [12 1 6 11] (d, xmm3) - movdqu xmm0, [edx + 0] - movdqu xmm1, [edx + 16] - movdqu xmm2, [edx + 32] - movdqu xmm3, [edx + 48] - - ## Take a copy for later. - movdqa [esp + 0], xmm0 - movdqa [esp + 16], xmm1 - movdqa xmm6, xmm2 - movdqa xmm7, xmm3 - -loop: - - ## Apply a column quarterround to each of the columns simultaneously. - ## Alas, there doesn't seem to be a packed doubleword rotate, so we - ## have to synthesize it. 
- - ## b ^= (a + d) <<< 7 - movdqa xmm4, xmm0 - paddd xmm4, xmm3 - movdqa xmm5, xmm4 - pslld xmm4, 7 - psrld xmm5, 25 - por xmm4, xmm5 - pxor xmm1, xmm4 - - ## c ^= (b + a) <<< 9 - movdqa xmm4, xmm1 - paddd xmm4, xmm0 - movdqa xmm5, xmm4 - pslld xmm4, 9 - psrld xmm5, 23 - por xmm4, xmm5 - pxor xmm2, xmm4 - - ## d ^= (c + b) <<< 13 - movdqa xmm4, xmm2 - paddd xmm4, xmm1 - pshufd xmm1, xmm1, 0x93 - movdqa xmm5, xmm4 - pslld xmm4, 13 - psrld xmm5, 19 - por xmm4, xmm5 - pxor xmm3, xmm4 - - ## a ^= (d + c) <<< 18 - movdqa xmm4, xmm3 - pshufd xmm3, xmm3, 0x39 - paddd xmm4, xmm2 - pshufd xmm2, xmm2, 0x4e - movdqa xmm5, xmm4 - pslld xmm4, 18 - psrld xmm5, 14 - por xmm4, xmm5 - pxor xmm0, xmm4 - - ## The transpose conveniently only involves reordering elements of - ## individual rows, which can be done quite easily, and reordering - ## the rows themselves, which is a trivial renaming. It doesn't - ## involve any movement of elements between rows. - ## - ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0) - ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3) - ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2) - ## [12 1 6 11] [ 3 4 9 14] (d, xmm1) - ## - ## The shuffles have quite high latency, so they've been pushed - ## backwards into the main instruction list. - - ## Apply the row quarterround to each of the columns (yes!) - ## simultaneously. - - ## b ^= (a + d) <<< 7 - movdqa xmm4, xmm0 - paddd xmm4, xmm1 - movdqa xmm5, xmm4 - pslld xmm4, 7 - psrld xmm5, 25 - por xmm4, xmm5 - pxor xmm3, xmm4 - - ## c ^= (b + a) <<< 9 - movdqa xmm4, xmm3 - paddd xmm4, xmm0 - movdqa xmm5, xmm4 - pslld xmm4, 9 - psrld xmm5, 23 - por xmm4, xmm5 - pxor xmm2, xmm4 - - ## d ^= (c + b) <<< 13 - movdqa xmm4, xmm2 - paddd xmm4, xmm3 - pshufd xmm3, xmm3, 0x93 - movdqa xmm5, xmm4 - pslld xmm4, 13 - psrld xmm5, 19 - por xmm4, xmm5 - pxor xmm1, xmm4 - - ## a ^= (d + c) <<< 18 - movdqa xmm4, xmm1 - pshufd xmm1, xmm1, 0x39 - paddd xmm4, xmm2 - pshufd xmm2, xmm2, 0x4e - movdqa xmm5, xmm4 - pslld xmm4, 18 - psrld xmm5, 14 - por xmm4, xmm5 - pxor xmm0, xmm4 - - ## We had to undo the transpose ready for the next loop. Again, push - ## back the shuffles because they take a long time coming through. - ## Decrement the loop counter and see if we should go round again. - ## Later processors fuse this pair into a single uop. - sub ecx, 2 - ja loop - - ## Almost there. Firstly, the feedforward addition, and then we have - ## to write out the result. Here we have to undo the permutation - ## which was already applied to the input. Shuffling has quite high - ## latency, so arrange to start a new shuffle into a temporary as - ## soon as we've written out the old value. - mov edx, [ebp + 16] - - paddd xmm0, [esp + 0] - pshufd xmm4, xmm0, 0x39 - movd [edx + 0], xmm0 - - paddd xmm1, [esp + 16] - pshufd xmm5, xmm1, 0x93 - movd [edx + 16], xmm1 - - paddd xmm2, xmm6 - pshufd xmm6, xmm2, 0x4e - movd [edx + 32], xmm2 - - paddd xmm3, xmm7 - pshufd xmm7, xmm3, 0x39 - movd [edx + 48], xmm3 - - movd [edx + 4], xmm7 - pshufd xmm7, xmm3, 0x4e - movd [edx + 24], xmm7 - pshufd xmm3, xmm3, 0x93 - movd [edx + 44], xmm3 - - movd [edx + 8], xmm6 - pshufd xmm6, xmm2, 0x93 - movd [edx + 28], xmm6 - pshufd xmm2, xmm2, 0x39 - movd [edx + 52], xmm2 - - movd [edx + 12], xmm5 - pshufd xmm5, xmm1, 0x39 - movd [edx + 36], xmm5 - pshufd xmm1, xmm1, 0x4e - movd [edx + 56], xmm1 - - movd [edx + 20], xmm4 - pshufd xmm4, xmm0, 0x4e - movd [edx + 40], xmm4 - pshufd xmm0, xmm0, 0x93 - movd [edx + 60], xmm0 - - ## And with that, we're done. - mov esp, ebp - pop ebp - ret - - .size salsa20_core_x86_sse2, . 
- salsa20_core_x86_sse2 - -###----- That's all, folks -------------------------------------------------- -- 2.11.0