From 226639f3441312d535bccf45b1c0d15d0ae156a1 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Mon, 25 May 2015 10:34:14 +0100 Subject: [PATCH] Support Intel's AES Native Instructions where available on x86 hardware. * Add a detector for the CPU feature. * Implement AES in terms of the Intel AESNI instructions. We can't use the fancy instructions to implement Rijndael with large blocks, unfortunately; we /can/ (and do) use the rather cumbersome key-scheduling instructions. There's a slightly annoying endianness difference between Catacomb (big-endian) and AESNI (little-endian). Resolve this by (a) maintaining the key schedule in little-endian order if we're using AESNI (and blocks are exactly 128 bits); and (b) end-swapping the block on entry and exit to the block cipher operations. --- base/dispatch.c | 6 + base/dispatch.h | 3 +- symm/Makefile.am | 3 + symm/rijndael-base.c | 57 +++-- symm/rijndael-x86-aesni.s | 553 ++++++++++++++++++++++++++++++++++++++++++++++ symm/rijndael.c | 38 +++- 6 files changed, 643 insertions(+), 17 deletions(-) create mode 100644 symm/rijndael-x86-aesni.s diff --git a/base/dispatch.c b/base/dispatch.c index 87d41b3a..fe3dcfb8 100644 --- a/base/dispatch.c +++ b/base/dispatch.c @@ -44,6 +44,7 @@ #define EFLAGS_ID (1u << 21) #define CPUID1D_SSE2 (1u << 26) #define CPUID1D_FXSR (1u << 24) +#define CPUID1C_AESNI (1u << 25) struct cpuid { unsigned a, b, c, d; }; @@ -196,6 +197,11 @@ int cpu_feature_p(int feat) return (xmm_registers_available_p() && cpuid_features_p(CPUID1D_SSE2, 0)); } + case CPUFEAT_X86_AESNI: { + check_env("x86:aesni"); + return (xmm_registers_available_p() && + cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI)); + } #endif default: return (0); diff --git a/base/dispatch.h b/base/dispatch.h index bcf9a13d..612cfcd3 100644 --- a/base/dispatch.h +++ b/base/dispatch.h @@ -148,7 +148,8 @@ stcls ret ext argdecls { rtn dflt args; } */ enum { - CPUFEAT_X86_SSE2 /* Streaming SIMD Extensions 2 */ + CPUFEAT_X86_SSE2, /* Streaming SIMD Extensions 2 */ + CPUFEAT_X86_AESNI /* AES Native Instructions */ }; extern int cpu_feature_p(int /*feat*/); diff --git a/symm/Makefile.am b/symm/Makefile.am index 5bb82f1f..6a63993d 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -180,6 +180,9 @@ BLKCS += rc5 ## Daemen and Rijmen's `Rijndael' block cipher, selected as AES. BLKCS += rijndael rijndael192 rijndael256 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c +if CPUFAM_X86 +libsymm_la_SOURCES += rijndael-x86-aesni.s +endif libsymm_la_SOURCES += $(precomp)/rijndael-tab.c PRECOMPS += $(precomp)/rijndael-tab.c PRECOMP_PROGS += rijndael-mktab diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c index bfab63a2..6e59130c 100644 --- a/symm/rijndael-base.c +++ b/symm/rijndael-base.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -55,25 +58,14 @@ const octet rijndael_keysz[] = { KSZ_RANGE, RIJNDAEL_KEYSZ, 4, 32, 4 }; * Use: Low-level key-scheduling. */ -void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +static void simple_setup(rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk) { - unsigned nk, nr, nw; + unsigned nr = k->nr, nw; unsigned i, j, jj; const octet *p; uint32 ww; - /* --- Sort out the key size --- */ - - KSZ_ASSERT(rijndael, sz); - nk = sz / 4; - - /* --- Select the number of rounds --- */ - - nr = (nk > nb ? 
nk : nb) + 6; - if (nr < 10) - nr = 10; - k->nr = nr; - /* --- Fetch the first key words out --- */ p = buf; @@ -120,4 +112,41 @@ void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) k->wi[i] = k->w[j + jj++]; } +CPU_DISPATCH(static, EMPTY, void, setup, (rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk), + (k, nb, buf, nk), pick_setup, simple_setup) + +#ifdef CPUFAM_X86 +extern setup__functype rijndael_setup_x86_aesni; +#endif + +static setup__functype *pick_setup(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_AESNI)) return rijndael_setup_x86_aesni; +#endif + return simple_setup; +} + +void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +{ + unsigned nk, nr; + + /* --- Sort out the key size --- */ + + KSZ_ASSERT(rijndael, sz); + nk = sz / 4; + + /* --- Select the number of rounds --- */ + + nr = (nk > nb ? nk : nb) + 6; + if (nr < 10) + nr = 10; + k->nr = nr; + + /* --- Do the main setup --- */ + + setup(k, nb, buf, nk); +} + /*----- That's all, folks -------------------------------------------------*/ diff --git a/symm/rijndael-x86-aesni.s b/symm/rijndael-x86-aesni.s new file mode 100644 index 00000000..79b33584 --- /dev/null +++ b/symm/rijndael-x86-aesni.s @@ -0,0 +1,553 @@ +### -*- mode: asm; asm-comment-char: ?# -*- +### +### AESNI-based implementation of Rijndael +### +### (c) 2015 Straylight/Edgeware +### + +###----- Licensing notice --------------------------------------------------- +### +### This file is part of Catacomb. +### +### Catacomb is free software; you can redistribute it and/or modify +### it under the terms of the GNU Library General Public License as +### published by the Free Software Foundation; either version 2 of the +### License, or (at your option) any later version. +### +### Catacomb is distributed in the hope that it will be useful, +### but WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +### GNU Library General Public License for more details. +### +### You should have received a copy of the GNU Library General Public +### License along with Catacomb; if not, write to the Free +### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +### MA 02111-1307, USA. + + .intel_syntax noprefix + .arch .aes + + .globl abort + .globl rijndael_rcon + + .section .text + +### The AESNI instructions implement a little-endian version of AES, but +### Catacomb's internal interface presents as big-endian so as to work better +### with things like GCM. We therefore maintain the round keys in +### little-endian form, and have to end-swap blocks in and out. +### +### For added amusement, the AESNI instructions don't implement the +### larger-block versions of Rijndael, so we have to end-swap the keys if +### we're preparing for one of those. + + ## Useful constants. + .equ maxrounds, 16 # maximum number of rounds + .equ maxblksz, 32 # maximum block size, in bytes + .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer + + ## Context structure. + .equ nr, 0 # number of rounds + .equ w, nr + 4 # encryption key words + .equ wi, w + kbufsz # decryption key words + +###-------------------------------------------------------------------------- +### Key setup. + + .globl rijndael_setup_x86_aesni + .type rijndael_setup_x86_aesni, STT_FUNC + .align 16 +rijndael_setup_x86_aesni: + + ## Initial state. 
+	## We have four arguments:
+	##	[esp + 20] is the context pointer
+	##	[esp + 24] is the block size, in 32-bit words (4, 6, or 8)
+	##	[esp + 28] points to the key material, unaligned
+	##	[esp + 32] is the size of the key, in words
+	## The key size has already been checked for validity, and the number
+	## of rounds has been computed.  Our job is only to fill in the `w'
+	## and `wi' vectors.
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	## The initial round key material is taken directly from the input
+	## key, so copy it over.
+	mov	ebp, [esp + 20]		# context base pointer
+	mov	ebx, [esp + 32]		# key size, in words
+	mov	ecx, ebx
+	mov	esi, [esp + 28]
+	lea	edi, [ebp + w]
+	rep	movsd
+
+	## Find out other useful things.
+	mov	edx, [ebp + nr]		# number of rounds
+	add	edx, 1
+	imul	edx, [esp + 24]		# total key size in words
+	sub	edx, ebx		# offset by the key size
+
+	## Find the round constants.
+	call	where_am_i_ecx
+	add	ecx, offset _GLOBAL_OFFSET_TABLE_
+	mov	ecx, [ecx + rijndael_rcon@GOT]
+
+	## Prepare for the main loop.
+	lea	esi, [ebp + w]
+	mov	eax, [esi + 4*ebx - 4]	# most recent key word
+	lea	edx, [esi + 4*edx]	# limit, offset by one key expansion
+
+	## Main key expansion loop.  The first word of each key-length chunk
+	## needs special treatment.
+	##
+	## This is rather tedious because the Intel `AESKEYGENASSIST'
+	## instruction is very strangely shaped.  Firstly, it wants to
+	## operate on vast SSE registers, even though we're data-blocked from
+	## doing more than one operation at a time unless we're doing two key
+	## schedules simultaneously -- and even then we can't do more than
+	## two, because the instruction ignores two of its input words
+	## entirely, and produces two different outputs for each of the other
+	## two.  And secondly it insists on taking the magic round constant
+	## as an immediate, so it's kind of annoying if you're not
+	## open-coding the whole thing.  It's much easier to leave that as
+	## zero and XOR in the round constant by hand.
+9:	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x39
+	aeskeygenassist xmm1, xmm0, 0
+	pshufd	xmm1, xmm1, 0x93
+	movd	eax, xmm1
+	xor	eax, [esi]
+	xor	al, [ecx]
+	inc	ecx
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## The next three words are simple...
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 2...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 3...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## Word 4.  If the key is /more/ than 6 words long, then we must
+	## apply a substitution here.
+	cmp	ebx, 5
+	jb	9b
+	cmp	ebx, 7
+	jb	0f
+	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x93
+	aeskeygenassist xmm1, xmm0, 0
+	movd	eax, xmm1
+0:	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 5...)
+	cmp	ebx, 6
+	jb	9b
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 6...)
+	cmp	ebx, 7
+	jb	9b
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 7...)
+	cmp	ebx, 8
+	jb	9b
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## Must be done by now.
+	jmp	9b
+
+	## Next job is to construct the decryption keys.  The keys for the
+	## first and last rounds don't need to be mangled, but the remaining
+	## ones do -- and they all need to be reordered too.
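+	##
+	## (Specifically, the mangling wanted here is InvMixColumns: we
+	## build FIPS-197's `equivalent inverse cipher', which needs
+	## InvMixColumns applied to each of the middle rounds' keys -- and
+	## that's exactly what the `AESIMC' instruction below is for.)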
+ ## + ## The plan of action, then, is to copy the final encryption round's + ## keys into place first, then to do each of the intermediate rounds + ## in reverse order, and finally do the first round. + ## + ## Do all of the heavy lifting with SSE registers. The order we're + ## doing this in means that it's OK if we read or write too much, and + ## there's easily enough buffer space for the over-enthusiastic reads + ## and writes because the context has space for 32-byte blocks, which + ## is our maximum and an exact fit for two SSE registers. +8: mov ecx, [ebp + nr] # number of rounds + mov ebx, [esp + 24] # block size (in words) + mov edx, ecx + imul edx, ebx + lea edi, [ebp + wi] + lea esi, [ebp + 4*edx + w] # last round's keys + shl ebx, 2 # block size (in bytes now) + + ## Copy the last encryption round's keys. + movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + ## Update the loop variables and stop if we've finished. +9: add edi, ebx + sub esi, ebx + sub ecx, 1 + jbe 0f + + ## Do another middle round's keys... + movdqu xmm0, [esi] + aesimc xmm0, xmm0 + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9b + movdqu xmm0, [esi + 16] + aesimc xmm0, xmm0 + movdqu [edi + 16], xmm0 + jmp 9b + + ## Finally do the first encryption round. +0: movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 0f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + ## If the block size is not exactly four words then we must end-swap + ## everything. We can use fancy SSE toys for this. +0: cmp ebx, 16 + je 0f + + ## Find the byte-reordering table. + call where_am_i_ecx + movdqa xmm7, [ecx + endswap_tab - .] + + ## Calculate the number of subkey words again. (It's a good job + ## we've got a fast multiplier.) + mov ecx, [ebp + nr] + add ecx, 1 + imul ecx, [esp + 24] # total keys in words + + ## End-swap the encryption keys. + mov eax, ecx + lea esi, [ebp + w] + call endswap_block + + ## And the decryption keys. + mov ecx, eax + lea esi, [ebp + wi] + call endswap_block + + ## All done. +0: pop edi + pop esi + pop ebx + pop ebp + ret + + .align 16 +endswap_block: + ## End-swap ECX words starting at ESI. The end-swapping table is + ## already loaded into XMM7; and it's OK to work in 16-byte chunks. + movdqu xmm1, [esi] + pshufb xmm1, xmm7 + movdqu [esi], xmm1 + add esi, 16 + sub ecx, 4 + ja endswap_block + ret + + .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni + +###-------------------------------------------------------------------------- +### Encrypting and decrypting blocks. + + .globl rijndael_eblk_x86_aesni + .type rijndael_eblk_x86_aesni, STT_FUNC + .align 16 +rijndael_eblk_x86_aesni: + + ## On entry, we have: + ## [esp + 4] points to the context block + ## [esp + 8] points to the input data block + ## [esp + 12] points to the output buffer + + ## Find the magic endianness-swapping table. + call where_am_i_ecx + movdqa xmm7, [ecx + endswap_tab - .] + + ## Load the input block and end-swap it. Also, start loading the + ## keys. + mov eax, [esp + 8] + movdqu xmm0, [eax] + pshufb xmm0, xmm7 + mov eax, [esp + 4] + lea edx, [eax + w] + mov eax, [eax + nr] + + ## Initial whitening. + movdqu xmm1, [edx] + add edx, 16 + pxor xmm0, xmm1 + + ## Dispatch to the correct code. + cmp eax, 10 + je er10 + jb bogus + cmp eax, 14 + je er14 + ja bogus + cmp eax, 12 + je er12 + jb er11 + jmp er13 + + .align 2 + + ## 14 rounds... +er14: movdqu xmm1, [edx] + add edx, 16 + aesenc xmm0, xmm1 + + ## 13 rounds... 
+er13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	## 12 rounds...
+er12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	## 11 rounds...
+er11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	## 10 rounds...
+er10:	movdqu	xmm1, [edx]
+	aesenc	xmm0, xmm1
+
+	## 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesenc	xmm0, xmm1
+
+	## 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesenc	xmm0, xmm1
+
+	## 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesenc	xmm0, xmm1
+
+	## 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesenc	xmm0, xmm1
+
+	## 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesenc	xmm0, xmm1
+
+	## 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesenc	xmm0, xmm1
+
+	## 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesenc	xmm0, xmm1
+
+	## 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesenc	xmm0, xmm1
+
+	## Final round...
+	movdqu	xmm1, [edx + 144]
+	aesenclast xmm0, xmm1
+
+	## Unpermute the ciphertext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	## And we're done.
+	ret
+
+	.size	rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni
+
+	.globl	rijndael_dblk_x86_aesni
+	.type	rijndael_dblk_x86_aesni, STT_FUNC
+	.align	16
+rijndael_dblk_x86_aesni:
+
+	## On entry, we have:
+	##	[esp + 4] points to the context block
+	##	[esp + 8] points to the input data block
+	##	[esp + 12] points to the output buffer
+
+	## Find the magic endianness-swapping table.
+	call	where_am_i_ecx
+	movdqa	xmm7, [ecx + endswap_tab - .]
+
+	## Load the input block and end-swap it.  Also, start loading the
+	## keys.
+	mov	eax, [esp + 8]
+	movdqu	xmm0, [eax]
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 4]
+	lea	edx, [eax + wi]
+	mov	eax, [eax + nr]
+
+	## Initial whitening.
+	movdqu	xmm1, [edx]
+	add	edx, 16
+	pxor	xmm0, xmm1
+
+	## Dispatch to the correct code.
+	cmp	eax, 10
+	je	dr10
+	jb	bogus
+	cmp	eax, 14
+	je	dr14
+	ja	bogus
+	cmp	eax, 12
+	je	dr12
+	jb	dr11
+	jmp	dr13
+
+	.align	2
+
+	## 14 rounds...
+dr14:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 13 rounds...
+dr13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 12 rounds...
+dr12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 11 rounds...
+dr11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 10 rounds...
+dr10:	movdqu	xmm1, [edx]
+	aesdec	xmm0, xmm1
+
+	## 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesdec	xmm0, xmm1
+
+	## 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesdec	xmm0, xmm1
+
+	## 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesdec	xmm0, xmm1
+
+	## 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesdec	xmm0, xmm1
+
+	## 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesdec	xmm0, xmm1
+
+	## 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesdec	xmm0, xmm1
+
+	## 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesdec	xmm0, xmm1
+
+	## 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesdec	xmm0, xmm1
+
+	## Final round...
+	movdqu	xmm1, [edx + 144]
+	aesdeclast xmm0, xmm1
+
+	## Unpermute the plaintext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	## And we're done.
+	ret
+
+	.size	rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni
+
+###--------------------------------------------------------------------------
+### Random utilities.
+
+	.align	16
+	## Abort the process because of a programming error.
Indirecting + ## through this point serves several purposes: (a) by CALLing, rather + ## than branching to, `abort', we can save the return address, which + ## might at least provide a hint as to what went wrong; (b) we don't + ## have conditional CALLs (and they'd be big anyway); and (c) we can + ## write a HLT here as a backstop against `abort' being mad. +bogus: call abort@PLT +0: hlt + jmp 0b + + .align 16 + ## Return the address of the instruction following the CALL here in + ## ECX. This is useful for doing position-independent addressing. +where_am_i_ecx: + mov ecx, [esp] + ret + +###-------------------------------------------------------------------------- +### Data tables. + + .align 16 +endswap_tab: + .byte 3, 2, 1, 0 + .byte 7, 6, 5, 4 + .byte 11, 10, 9, 8 + .byte 15, 14, 13, 12 + +###----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael.c b/symm/rijndael.c index 9d8e739f..9ee8aa28 100644 --- a/symm/rijndael.c +++ b/symm/rijndael.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -69,6 +72,37 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) * Use: Low-level block encryption and decryption. */ +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_eblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_eblk, simple_eblk) + +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_dblk, simple_dblk) + +#ifdef CPUFAM_X86 +extern rijndael_eblk__functype rijndael_eblk_x86_aesni; +extern rijndael_dblk__functype rijndael_dblk_x86_aesni; +#endif + +static rijndael_eblk__functype *pick_eblk(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_AESNI)) return rijndael_eblk_x86_aesni; +#endif + return simple_eblk; +} + +static rijndael_dblk__functype *pick_dblk(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_AESNI)) return rijndael_dblk_x86_aesni; +#endif + return simple_dblk; +} + #define DO(what, t, aa, bb, cc, dd, a, b, c, d, w) do { \ aa = what(t, a, b, c, d) ^ *w++; \ bb = what(t, b, c, d, a) ^ *w++; \ @@ -83,7 +117,7 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) dd = what(t, d, c, b, a) ^ *w++; \ } while (0) -void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; @@ -118,7 +152,7 @@ void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d; } -void rijndael_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; -- 2.11.0
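
For testing the new dispatch paths, a minimal smoke test along the following lines should work, using the AES-128 vector from FIPS-197 appendix C.1 and expressing blocks as big-endian 32-bit words per the convention described in the commit message. This is a sketch only, not part of the patch: the <catacomb/...> and <mLib/...> include paths and the build line are assumptions about an installed Catacomb; rijndael_init, rijndael_eblk and rijndael_dblk are the public entry points shown in the diff above.

/* Smoke test for the dispatched Rijndael entry points -- a sketch only.
 * Assumed build line (adjust to taste):
 *
 *	cc -o aestest aestest.c -lcatacomb -lmLib
 */

#include <stdio.h>
#include <string.h>

#include <mLib/bits.h>
#include <catacomb/rijndael.h>

int main(void)
{
  rijndael_ctx k;

  /* FIPS-197, appendix C.1: AES-128. */
  static const octet key[16] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
  };

  /* Plaintext 00112233445566778899aabbccddeeff and expected ciphertext
   * 69c4e0d86a7b0430d8cdb78070b4c55a, as big-endian 32-bit words to
   * match Catacomb's block convention.
   */
  const uint32 p[4] = { 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff };
  const uint32 want[4] = { 0x69c4e0d8, 0x6a7b0430, 0xd8cdb780, 0x70b4c55a };
  uint32 c[4], q[4];

  rijndael_init(&k, key, sizeof(key));
  rijndael_eblk(&k, p, c);		/* takes the AESNI path when
					 * cpu_feature_p() says so */
  rijndael_dblk(&k, c, q);

  if (memcmp(c, want, sizeof(want)) || memcmp(q, p, sizeof(p))) {
    fprintf(stderr, "mismatch: AESNI path broken?\n");
    return (1);
  }
  puts("ok");
  return (0);
}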