From 226639f3441312d535bccf45b1c0d15d0ae156a1 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Mon, 25 May 2015 10:34:14 +0100 Subject: [PATCH] Support Intel's AES Native Instructions where available on x86 hardware. * Add a detector for the CPU feature. * Implement AES in terms of the Intel AESNI instructions. We can't use the fancy instructions to implement Rijndael with large blocks, unfortunately; we /can/ (and do) use the rather cumbersome key-scheduling instructions. There's a slightly annoying endianness difference between Catacomb (big-endian) and AESNI (little-endian). Resolve this by (a) maintaining the key schedule in little-endian order if we're using AESNI (and blocks are exactly 128 bits); and (b) end-swapping the block on entry and exit to the block cipher operations. --- base/dispatch.c | 6 + base/dispatch.h | 3 +- symm/Makefile.am | 3 + symm/rijndael-base.c | 57 +++-- symm/rijndael-x86-aesni.s | 553 ++++++++++++++++++++++++++++++++++++++++++++++ symm/rijndael.c | 38 +++- 6 files changed, 643 insertions(+), 17 deletions(-) create mode 100644 symm/rijndael-x86-aesni.s diff --git a/base/dispatch.c b/base/dispatch.c index 87d41b3a..fe3dcfb8 100644 --- a/base/dispatch.c +++ b/base/dispatch.c @@ -44,6 +44,7 @@ #define EFLAGS_ID (1u << 21) #define CPUID1D_SSE2 (1u << 26) #define CPUID1D_FXSR (1u << 24) +#define CPUID1C_AESNI (1u << 25) struct cpuid { unsigned a, b, c, d; }; @@ -196,6 +197,11 @@ int cpu_feature_p(int feat) return (xmm_registers_available_p() && cpuid_features_p(CPUID1D_SSE2, 0)); } + case CPUFEAT_X86_AESNI: { + check_env("x86:aesni"); + return (xmm_registers_available_p() && + cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI)); + } #endif default: return (0); diff --git a/base/dispatch.h b/base/dispatch.h index bcf9a13d..612cfcd3 100644 --- a/base/dispatch.h +++ b/base/dispatch.h @@ -148,7 +148,8 @@ stcls ret ext argdecls { rtn dflt args; } */ enum { - CPUFEAT_X86_SSE2 /* Streaming SIMD Extensions 2 */ + CPUFEAT_X86_SSE2, /* Streaming SIMD Extensions 2 */ + CPUFEAT_X86_AESNI /* AES Native Instructions */ }; extern int cpu_feature_p(int /*feat*/); diff --git a/symm/Makefile.am b/symm/Makefile.am index 5bb82f1f..6a63993d 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -180,6 +180,9 @@ BLKCS += rc5 ## Daemen and Rijmen's `Rijndael' block cipher, selected as AES. BLKCS += rijndael rijndael192 rijndael256 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c +if CPUFAM_X86 +libsymm_la_SOURCES += rijndael-x86-aesni.s +endif libsymm_la_SOURCES += $(precomp)/rijndael-tab.c PRECOMPS += $(precomp)/rijndael-tab.c PRECOMP_PROGS += rijndael-mktab diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c index bfab63a2..6e59130c 100644 --- a/symm/rijndael-base.c +++ b/symm/rijndael-base.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -55,25 +58,14 @@ const octet rijndael_keysz[] = { KSZ_RANGE, RIJNDAEL_KEYSZ, 4, 32, 4 }; * Use: Low-level key-scheduling. */ -void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +static void simple_setup(rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk) { - unsigned nk, nr, nw; + unsigned nr = k->nr, nw; unsigned i, j, jj; const octet *p; uint32 ww; - /* --- Sort out the key size --- */ - - KSZ_ASSERT(rijndael, sz); - nk = sz / 4; - - /* --- Select the number of rounds --- */ - - nr = (nk > nb ? 
nk : nb) + 6; - if (nr < 10) - nr = 10; - k->nr = nr; - /* --- Fetch the first key words out --- */ p = buf; @@ -120,4 +112,41 @@ void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) k->wi[i] = k->w[j + jj++]; } +CPU_DISPATCH(static, EMPTY, void, setup, (rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk), + (k, nb, buf, nk), pick_setup, simple_setup) + +#ifdef CPUFAM_X86 +extern setup__functype rijndael_setup_x86_aesni; +#endif + +static setup__functype *pick_setup(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_AESNI)) return rijndael_setup_x86_aesni; +#endif + return simple_setup; +} + +void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +{ + unsigned nk, nr; + + /* --- Sort out the key size --- */ + + KSZ_ASSERT(rijndael, sz); + nk = sz / 4; + + /* --- Select the number of rounds --- */ + + nr = (nk > nb ? nk : nb) + 6; + if (nr < 10) + nr = 10; + k->nr = nr; + + /* --- Do the main setup --- */ + + setup(k, nb, buf, nk); +} + /*----- That's all, folks -------------------------------------------------*/ diff --git a/symm/rijndael-x86-aesni.s b/symm/rijndael-x86-aesni.s new file mode 100644 index 00000000..79b33584 --- /dev/null +++ b/symm/rijndael-x86-aesni.s @@ -0,0 +1,553 @@ +### -*- mode: asm; asm-comment-char: ?# -*- +### +### AESNI-based implementation of Rijndael +### +### (c) 2015 Straylight/Edgeware +### + +###----- Licensing notice --------------------------------------------------- +### +### This file is part of Catacomb. +### +### Catacomb is free software; you can redistribute it and/or modify +### it under the terms of the GNU Library General Public License as +### published by the Free Software Foundation; either version 2 of the +### License, or (at your option) any later version. +### +### Catacomb is distributed in the hope that it will be useful, +### but WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +### GNU Library General Public License for more details. +### +### You should have received a copy of the GNU Library General Public +### License along with Catacomb; if not, write to the Free +### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +### MA 02111-1307, USA. + + .intel_syntax noprefix + .arch .aes + + .globl abort + .globl rijndael_rcon + + .section .text + +### The AESNI instructions implement a little-endian version of AES, but +### Catacomb's internal interface presents as big-endian so as to work better +### with things like GCM. We therefore maintain the round keys in +### little-endian form, and have to end-swap blocks in and out. +### +### For added amusement, the AESNI instructions don't implement the +### larger-block versions of Rijndael, so we have to end-swap the keys if +### we're preparing for one of those. + + ## Useful constants. + .equ maxrounds, 16 # maximum number of rounds + .equ maxblksz, 32 # maximum block size, in bytes + .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer + + ## Context structure. + .equ nr, 0 # number of rounds + .equ w, nr + 4 # encryption key words + .equ wi, w + kbufsz # decryption key words + +###-------------------------------------------------------------------------- +### Key setup. + + .globl rijndael_setup_x86_aesni + .type rijndael_setup_x86_aesni, STT_FUNC + .align 16 +rijndael_setup_x86_aesni: + + ## Initial state. 
+	## We have four arguments:
+	##	[esp + 20] is the context pointer
+	##	[esp + 24] is the block size, in 32-bit words (4, 6, or 8)
+	##	[esp + 28] points to the key material, unaligned
+	##	[esp + 32] is the size of the key, in words
+	## The key size has already been checked for validity, and the number
+	## of rounds has been computed.  Our job is only to fill in the `w'
+	## and `wi' vectors.
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	## The initial round key material is taken directly from the input
+	## key, so copy it over.
+	mov	ebp, [esp + 20]		# context base pointer
+	mov	ebx, [esp + 32]		# key size, in words
+	mov	ecx, ebx
+	mov	esi, [esp + 28]
+	lea	edi, [ebp + w]
+	rep	movsd
+
+	## Find out other useful things.
+	mov	edx, [ebp + nr]		# number of rounds
+	add	edx, 1
+	imul	edx, [esp + 24]		# total key size in words
+	sub	edx, ebx		# offset by the key size
+
+	## Find the round constants.
+	call	where_am_i_ecx
+	add	ecx, offset _GLOBAL_OFFSET_TABLE_
+	mov	ecx, [ecx + rijndael_rcon@GOT]
+
+	## Prepare for the main loop.
+	lea	esi, [ebp + w]
+	mov	eax, [esi + 4*ebx - 4]	# most recent key word
+	lea	edx, [esi + 4*edx]	# limit, offset by one key expansion
+
+	## Main key expansion loop.  The first word of each key-length chunk
+	## needs special treatment.
+	##
+	## This is rather tedious because the Intel `AESKEYGENASSIST'
+	## instruction is very strangely shaped.  Firstly, it wants to
+	## operate on vast SSE registers, even though we're data-blocked from
+	## doing more than one operation at a time unless we're doing two key
+	## schedules simultaneously -- and even then we can't do more than
+	## two, because the instruction ignores two of its input words
+	## entirely, and produces two different outputs for each of the other
+	## two.  And secondly it insists on taking the magic round constant
+	## as an immediate, so it's kind of annoying if you're not
+	## open-coding the whole thing.  It's much easier to leave that as
+	## zero and XOR in the round constant by hand.
+9:	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x39
+	aeskeygenassist xmm1, xmm0, 0
+	pshufd	xmm1, xmm1, 0x93
+	movd	eax, xmm1
+	xor	eax, [esi]
+	xor	al, [ecx]
+	inc	ecx
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## The next three words are simple...
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 2...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 3...)
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## Word 4.  If the key is /more/ than 6 words long, then we must
+	## apply a substitution here.
+	cmp	ebx, 5
+	jb	9b
+	cmp	ebx, 7
+	jb	0f
+	movd	xmm0, eax
+	pshufd	xmm0, xmm0, 0x93
+	aeskeygenassist xmm1, xmm0, 0
+	movd	eax, xmm1
+0:	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 5...)
+	cmp	ebx, 6
+	jb	9b
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 6...)
+	cmp	ebx, 7
+	jb	9b
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## (Word 7...)
+	cmp	ebx, 8
+	jb	9b
+	xor	eax, [esi]
+	mov	[esi + 4*ebx], eax
+	add	esi, 4
+	cmp	esi, edx
+	jae	8f
+
+	## Must be done by now.
+	jmp	9b
+
+	## Next job is to construct the decryption keys.  The keys for the
+	## first and last rounds don't need to be mangled, but the remaining
+	## ones do -- and they all need to be reordered too.
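+	##
+	## (Specifically, the mangling wanted here is InvMixColumns: we
+	## build FIPS-197's `equivalent inverse cipher', which needs
+	## InvMixColumns applied to each of the middle rounds' keys -- and
+	## that's exactly what the `AESIMC' instruction below is for.)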
+ ## + ## The plan of action, then, is to copy the final encryption round's + ## keys into place first, then to do each of the intermediate rounds + ## in reverse order, and finally do the first round. + ## + ## Do all of the heavy lifting with SSE registers. The order we're + ## doing this in means that it's OK if we read or write too much, and + ## there's easily enough buffer space for the over-enthusiastic reads + ## and writes because the context has space for 32-byte blocks, which + ## is our maximum and an exact fit for two SSE registers. +8: mov ecx, [ebp + nr] # number of rounds + mov ebx, [esp + 24] # block size (in words) + mov edx, ecx + imul edx, ebx + lea edi, [ebp + wi] + lea esi, [ebp + 4*edx + w] # last round's keys + shl ebx, 2 # block size (in bytes now) + + ## Copy the last encryption round's keys. + movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + ## Update the loop variables and stop if we've finished. +9: add edi, ebx + sub esi, ebx + sub ecx, 1 + jbe 0f + + ## Do another middle round's keys... + movdqu xmm0, [esi] + aesimc xmm0, xmm0 + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 9b + movdqu xmm0, [esi + 16] + aesimc xmm0, xmm0 + movdqu [edi + 16], xmm0 + jmp 9b + + ## Finally do the first encryption round. +0: movdqu xmm0, [esi] + movdqu [edi], xmm0 + cmp ebx, 16 + jbe 0f + movdqu xmm0, [esi + 16] + movdqu [edi + 16], xmm0 + + ## If the block size is not exactly four words then we must end-swap + ## everything. We can use fancy SSE toys for this. +0: cmp ebx, 16 + je 0f + + ## Find the byte-reordering table. + call where_am_i_ecx + movdqa xmm7, [ecx + endswap_tab - .] + + ## Calculate the number of subkey words again. (It's a good job + ## we've got a fast multiplier.) + mov ecx, [ebp + nr] + add ecx, 1 + imul ecx, [esp + 24] # total keys in words + + ## End-swap the encryption keys. + mov eax, ecx + lea esi, [ebp + w] + call endswap_block + + ## And the decryption keys. + mov ecx, eax + lea esi, [ebp + wi] + call endswap_block + + ## All done. +0: pop edi + pop esi + pop ebx + pop ebp + ret + + .align 16 +endswap_block: + ## End-swap ECX words starting at ESI. The end-swapping table is + ## already loaded into XMM7; and it's OK to work in 16-byte chunks. + movdqu xmm1, [esi] + pshufb xmm1, xmm7 + movdqu [esi], xmm1 + add esi, 16 + sub ecx, 4 + ja endswap_block + ret + + .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni + +###-------------------------------------------------------------------------- +### Encrypting and decrypting blocks. + + .globl rijndael_eblk_x86_aesni + .type rijndael_eblk_x86_aesni, STT_FUNC + .align 16 +rijndael_eblk_x86_aesni: + + ## On entry, we have: + ## [esp + 4] points to the context block + ## [esp + 8] points to the input data block + ## [esp + 12] points to the output buffer + + ## Find the magic endianness-swapping table. + call where_am_i_ecx + movdqa xmm7, [ecx + endswap_tab - .] + + ## Load the input block and end-swap it. Also, start loading the + ## keys. + mov eax, [esp + 8] + movdqu xmm0, [eax] + pshufb xmm0, xmm7 + mov eax, [esp + 4] + lea edx, [eax + w] + mov eax, [eax + nr] + + ## Initial whitening. + movdqu xmm1, [edx] + add edx, 16 + pxor xmm0, xmm1 + + ## Dispatch to the correct code. + cmp eax, 10 + je er10 + jb bogus + cmp eax, 14 + je er14 + ja bogus + cmp eax, 12 + je er12 + jb er11 + jmp er13 + + .align 2 + + ## 14 rounds... +er14: movdqu xmm1, [edx] + add edx, 16 + aesenc xmm0, xmm1 + + ## 13 rounds... 
+er13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	## 12 rounds...
+er12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	## 11 rounds...
+er11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesenc	xmm0, xmm1
+
+	## 10 rounds...
+er10:	movdqu	xmm1, [edx]
+	aesenc	xmm0, xmm1
+
+	## 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesenc	xmm0, xmm1
+
+	## 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesenc	xmm0, xmm1
+
+	## 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesenc	xmm0, xmm1
+
+	## 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesenc	xmm0, xmm1
+
+	## 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesenc	xmm0, xmm1
+
+	## 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesenc	xmm0, xmm1
+
+	## 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesenc	xmm0, xmm1
+
+	## 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesenc	xmm0, xmm1
+
+	## Final round...
+	movdqu	xmm1, [edx + 144]
+	aesenclast xmm0, xmm1
+
+	## Unpermute the ciphertext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	## And we're done.
+	ret
+
+	.size	rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni
+
+	.globl	rijndael_dblk_x86_aesni
+	.type	rijndael_dblk_x86_aesni, STT_FUNC
+	.align	16
+rijndael_dblk_x86_aesni:
+
+	## On entry, we have:
+	##	[esp + 4] points to the context block
+	##	[esp + 8] points to the input data block
+	##	[esp + 12] points to the output buffer
+
+	## Find the magic endianness-swapping table.
+	call	where_am_i_ecx
+	movdqa	xmm7, [ecx + endswap_tab - .]
+
+	## Load the input block and end-swap it.  Also, start loading the
+	## keys.
+	mov	eax, [esp + 8]
+	movdqu	xmm0, [eax]
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 4]
+	lea	edx, [eax + wi]
+	mov	eax, [eax + nr]
+
+	## Initial whitening.
+	movdqu	xmm1, [edx]
+	add	edx, 16
+	pxor	xmm0, xmm1
+
+	## Dispatch to the correct code.
+	cmp	eax, 10
+	je	dr10
+	jb	bogus
+	cmp	eax, 14
+	je	dr14
+	ja	bogus
+	cmp	eax, 12
+	je	dr12
+	jb	dr11
+	jmp	dr13
+
+	.align	2
+
+	## 14 rounds...
+dr14:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 13 rounds...
+dr13:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 12 rounds...
+dr12:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 11 rounds...
+dr11:	movdqu	xmm1, [edx]
+	add	edx, 16
+	aesdec	xmm0, xmm1
+
+	## 10 rounds...
+dr10:	movdqu	xmm1, [edx]
+	aesdec	xmm0, xmm1
+
+	## 9 rounds...
+	movdqu	xmm1, [edx + 16]
+	aesdec	xmm0, xmm1
+
+	## 8 rounds...
+	movdqu	xmm1, [edx + 32]
+	aesdec	xmm0, xmm1
+
+	## 7 rounds...
+	movdqu	xmm1, [edx + 48]
+	aesdec	xmm0, xmm1
+
+	## 6 rounds...
+	movdqu	xmm1, [edx + 64]
+	aesdec	xmm0, xmm1
+
+	## 5 rounds...
+	movdqu	xmm1, [edx + 80]
+	aesdec	xmm0, xmm1
+
+	## 4 rounds...
+	movdqu	xmm1, [edx + 96]
+	aesdec	xmm0, xmm1
+
+	## 3 rounds...
+	movdqu	xmm1, [edx + 112]
+	aesdec	xmm0, xmm1
+
+	## 2 rounds...
+	movdqu	xmm1, [edx + 128]
+	aesdec	xmm0, xmm1
+
+	## Final round...
+	movdqu	xmm1, [edx + 144]
+	aesdeclast xmm0, xmm1
+
+	## Unpermute the plaintext block and store it.
+	pshufb	xmm0, xmm7
+	mov	eax, [esp + 12]
+	movdqu	[eax], xmm0
+
+	## And we're done.
+	ret
+
+	.size	rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni
+
+###--------------------------------------------------------------------------
+### Random utilities.
+
+	.align	16
+	## Abort the process because of a programming error.
Indirecting + ## through this point serves several purposes: (a) by CALLing, rather + ## than branching to, `abort', we can save the return address, which + ## might at least provide a hint as to what went wrong; (b) we don't + ## have conditional CALLs (and they'd be big anyway); and (c) we can + ## write a HLT here as a backstop against `abort' being mad. +bogus: call abort@PLT +0: hlt + jmp 0b + + .align 16 + ## Return the address of the instruction following the CALL here in + ## ECX. This is useful for doing position-independent addressing. +where_am_i_ecx: + mov ecx, [esp] + ret + +###-------------------------------------------------------------------------- +### Data tables. + + .align 16 +endswap_tab: + .byte 3, 2, 1, 0 + .byte 7, 6, 5, 4 + .byte 11, 10, 9, 8 + .byte 15, 14, 13, 12 + +###----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael.c b/symm/rijndael.c index 9d8e739f..9ee8aa28 100644 --- a/symm/rijndael.c +++ b/symm/rijndael.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -69,6 +72,37 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) * Use: Low-level block encryption and decryption. */ +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_eblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_eblk, simple_eblk) + +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_dblk, simple_dblk) + +#ifdef CPUFAM_X86 +extern rijndael_eblk__functype rijndael_eblk_x86_aesni; +extern rijndael_dblk__functype rijndael_dblk_x86_aesni; +#endif + +static rijndael_eblk__functype *pick_eblk(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_AESNI)) return rijndael_eblk_x86_aesni; +#endif + return simple_eblk; +} + +static rijndael_dblk__functype *pick_dblk(void) +{ +#ifdef CPUFAM_X86 + if (cpu_feature_p(CPUFEAT_X86_AESNI)) return rijndael_dblk_x86_aesni; +#endif + return simple_dblk; +} + #define DO(what, t, aa, bb, cc, dd, a, b, c, d, w) do { \ aa = what(t, a, b, c, d) ^ *w++; \ bb = what(t, b, c, d, a) ^ *w++; \ @@ -83,7 +117,7 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) dd = what(t, d, c, b, a) ^ *w++; \ } while (0) -void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; @@ -118,7 +152,7 @@ void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d; } -void rijndael_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; -- 2.11.0
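
For testing the new dispatch paths, a minimal smoke test along the following lines should work, using the AES-128 vector from FIPS-197 appendix C.1 and expressing blocks as big-endian 32-bit words per the convention described in the commit message. This is a sketch only, not part of the patch: the <catacomb/...> and <mLib/...> include paths and the build line are assumptions about an installed Catacomb; rijndael_init, rijndael_eblk and rijndael_dblk are the public entry points shown in the diff above.

/* Smoke test for the dispatched Rijndael entry points -- a sketch only.
 * Assumed build line (adjust to taste):
 *
 *	cc -o aestest aestest.c -lcatacomb -lmLib
 */

#include <stdio.h>
#include <string.h>

#include <mLib/bits.h>
#include <catacomb/rijndael.h>

int main(void)
{
  rijndael_ctx k;

  /* FIPS-197, appendix C.1: AES-128. */
  static const octet key[16] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
  };

  /* Plaintext 00112233445566778899aabbccddeeff and expected ciphertext
   * 69c4e0d86a7b0430d8cdb78070b4c55a, as big-endian 32-bit words to
   * match Catacomb's block convention.
   */
  const uint32 p[4] = { 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff };
  const uint32 want[4] = { 0x69c4e0d8, 0x6a7b0430, 0xd8cdb780, 0x70b4c55a };
  uint32 c[4], q[4];

  rijndael_init(&k, key, sizeof(key));
  rijndael_eblk(&k, p, c);		/* takes the AESNI path when
					 * cpu_feature_p() says so */
  rijndael_dblk(&k, c, q);

  if (memcmp(c, want, sizeof(want)) || memcmp(q, p, sizeof(p))) {
    fprintf(stderr, "mismatch: AESNI path broken?\n");
    return (1);
  }
  puts("ok");
  return (0);
}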