mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// AESNI-based implementation of Rijndael
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// External definitions.
	35
	36	.globl F(abort) // called from `bogus' on an unsupported round count
	37	.globl F(rijndael_rcon) // round-constant bytes consumed by key setup
	38
	39	///--------------------------------------------------------------------------
	40	/// Main code.
	41
	42	.arch .aes // let the assembler accept the AES instructions used below
	43	.section .text
	44
	45	/// The AESNI instructions implement a little-endian version of AES, but
	46	/// Catacomb's internal interface presents as big-endian so as to work better
	47	/// with things like GCM. We therefore maintain the round keys in
	48	/// little-endian form, and have to end-swap blocks in and out.
	49	///
	50	/// For added amusement, the AESNI instructions don't implement the
	51	/// larger-block versions of Rijndael, so we have to end-swap the keys if
	52	/// we're preparing for one of those.
	53
	54	// Useful constants. A key schedule holds one round key per round, plus one.
	55	.equ maxrounds, 16 // maximum number of rounds
	56	.equ maxblksz, 32 // maximum block size, in bytes
	57	.equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes
	58
	59	// Context structure: byte offsets of the fields from the context pointer.
	60	.equ nr, 0 // number of rounds (a 32-bit word)
	61	.equ w, nr + 4 // encryption key words (kbufsz bytes)
	62	.equ wi, w + kbufsz // decryption key words
	63
	64	///--------------------------------------------------------------------------
	65	/// Key setup.
	66
	67	FUNC(rijndael_setup_x86_aesni)
	68
	69	// We have four arguments. Once the four pushes below are done:
	70	// [esp + 20] is the context pointer
	71	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	72	// [esp + 28] points to the key material, unaligned
	73	// [esp + 32] is the size of the key, in words
	74	// The key size has already been checked for validity, and the number
	75	// of rounds has been computed. Our job is only to fill in the `w'
	76	// and `wi' vectors.
	77
	78	push ebp // these four registers are restored before we return
	79	push ebx
	80	push esi
	81	push edi
	82
	83	// The initial round key material is taken directly from the input
	84	// key, so copy it over.
	85	mov ebp, [esp + 20] // context base pointer
	86	mov ebx, [esp + 32] // key size, in words
	87	mov ecx, ebx // REP count: the whole key
	88	mov esi, [esp + 28] // source: the caller's key material
	89	lea edi, [ebp + w] // destination: start of `w'
	90	rep movsd // copy ECX 32-bit words
	91
	92	// Find out other useful things.
	93	mov edx, [ebp + nr] // number of rounds
	94	add edx, 1 // there is one more round key than rounds
	95	imul edx, [esp + 24] // total key size in words
	96	sub edx, ebx // offset by the key size
	97
	98	// Find the round constants.
	99	ldgot ecx
	100	leaext ecx, rijndael_rcon, ecx // ECX -> next round constant
	101
	102	// Prepare for the main loop.
	103	lea esi, [ebp + w] // ESI -> word one key length behind the output
	104	mov eax, [esi + 4*ebx - 4] // most recent key word
	105	lea edx, [esi + 4*edx] // limit, offset by one key expansion
	106
	107	// Main key expansion loop. The first word of each key-length chunk
	108	// needs special treatment.
	109	//
	110	// This is rather tedious because the Intel `AESKEYGENASSIST'
	111	// instruction is very strangely shaped. Firstly, it wants to
	112	// operate on vast SSE registers, even though we're data-blocked from
	113	// doing more than one operation at a time unless we're doing two key
	114	// schedules simultaneously -- and even then we can't do more than
	115	// two, because the instruction ignores two of its input words
	116	// entirely, and produces two different outputs for each of the other
	117	// two. And secondly it insists on taking the magic round constant
	118	// as an immediate, so it's kind of annoying if you're not
	119	// open-coding the whole thing. It's much easier to leave that as
	120	// zero and XOR in the round constant by hand.
	121	9: movd xmm0, eax // XMM0 = (EAX, 0, 0, 0)
	122	pshufd xmm0, xmm0, 0x39 // move the word from lane 0 up to lane 3
	123	aeskeygenassist xmm1, xmm0, 0 // lane 3 := RotWord(SubWord(lane 3))
	124	pshufd xmm1, xmm1, 0x93 // bring lane 3 back down to lane 0
	125	movd eax, xmm1
	126	xor eax, [esi] // combine with the word one key length back
	127	xor al, [ecx] // the round constant is a single byte
	128	inc ecx // step to the next round constant
	129	mov [esi + 4*ebx], eax // store the new key word
	130	add esi, 4
	131	cmp esi, edx
	132	jae 8f // the schedule is full: go and do the decryption keys
	133
	134	// The next three words are simple...
	135	xor eax, [esi]
	136	mov [esi + 4*ebx], eax
	137	add esi, 4
	138	cmp esi, edx
	139	jae 8f
	140
	141	// (Word 2...)
	142	xor eax, [esi]
	143	mov [esi + 4*ebx], eax
	144	add esi, 4
	145	cmp esi, edx
	146	jae 8f
	147
	148	// (Word 3...)
	149	xor eax, [esi]
	150	mov [esi + 4*ebx], eax
	151	add esi, 4
	152	cmp esi, edx
	153	jae 8f
	154
	155	// Word 4. If the key is /more/ than 6 words long, then we must
	156	// apply a substitution here.
	157	cmp ebx, 5
	158	jb 9b // key shorter than five words: start the next chunk
	159	cmp ebx, 7
	160	jb 0f // key shorter than seven words: plain XOR only
	161	movd xmm0, eax // XMM0 = (EAX, 0, 0, 0)
	162	pshufd xmm0, xmm0, 0x93 // move the word from lane 0 up to lane 1
	163	aeskeygenassist xmm1, xmm0, 0 // lane 0 := SubWord(lane 1), unrotated
	164	movd eax, xmm1
	165	0: xor eax, [esi]
	166	mov [esi + 4*ebx], eax
	167	add esi, 4
	168	cmp esi, edx
	169	jae 8f
	170
	171	// (Word 5...)
	172	cmp ebx, 6
	173	jb 9b // key shorter than six words: start the next chunk
	174	xor eax, [esi]
	175	mov [esi + 4*ebx], eax
	176	add esi, 4
	177	cmp esi, edx
	178	jae 8f
	179
	180	// (Word 6...)
	181	cmp ebx, 7
	182	jb 9b // key shorter than seven words: start the next chunk
	183	xor eax, [esi]
	184	mov [esi + 4*ebx], eax
	185	add esi, 4
	186	cmp esi, edx
	187	jae 8f
	188
	189	// (Word 7...)
	190	cmp ebx, 8
	191	jb 9b // key shorter than eight words: start the next chunk
	192	xor eax, [esi]
	193	mov [esi + 4*ebx], eax
	194	add esi, 4
	195	cmp esi, edx
	196	jae 8f
	197
	198	// That was the eighth word, so this chunk must be done by now.
	199	jmp 9b
	200
	201	// Next job is to construct the decryption keys. The keys for the
	202	// first and last rounds don't need to be mangled, but the remaining
	203	// ones do -- and they all need to be reordered too.
	204	//
	205	// The plan of action, then, is to copy the final encryption round's
	206	// keys into place first, then to do each of the intermediate rounds
	207	// in reverse order, and finally do the first round.
	208	//
	209	// Do all of the heavy lifting with SSE registers. The order we're
	210	// doing this in means that it's OK if we read or write too much, and
	211	// there's easily enough buffer space for the over-enthusiastic reads
	212	// and writes because the context has space for 32-byte blocks, which
	213	// is our maximum and an exact fit for two SSE registers.
	214	8: mov ecx, [ebp + nr] // number of rounds
	215	mov ebx, [esp + 24] // block size (in words)
	216	mov edx, ecx
	217	imul edx, ebx // number of key words before the last round's
	218	lea edi, [ebp + wi] // EDI -> decryption keys, filled in upwards
	219	lea esi, [ebp + 4*edx + w] // last round's keys
	220	shl ebx, 2 // block size (in bytes now)
	221
	222	// Copy the last encryption round's keys.
	223	movdqu xmm0, [esi]
	224	movdqu [edi], xmm0
	225	cmp ebx, 16
	226	jbe 9f // a 16-byte block fits in one register
	227	movdqu xmm0, [esi + 16]
	228	movdqu [edi + 16], xmm0
	229
	230	// Update the loop variables and stop if we've finished.
	231	9: add edi, ebx // next slot for a decryption round key
	232	sub esi, ebx // previous encryption round key
	233	sub ecx, 1
	234	jbe 0f // counter exhausted: only the first round's keys remain
	235
	236	// Do another middle round's keys...
	237	movdqu xmm0, [esi]
	238	aesimc xmm0, xmm0 // apply InvMixColumns to the round key
	239	movdqu [edi], xmm0
	240	cmp ebx, 16
	241	jbe 9b // a 16-byte block fits in one register
	242	movdqu xmm0, [esi + 16]
	243	aesimc xmm0, xmm0
	244	movdqu [edi + 16], xmm0
	245	jmp 9b
	246
	247	// Finally do the first encryption round.
	248	0: movdqu xmm0, [esi]
	249	movdqu [edi], xmm0
	250	cmp ebx, 16
	251	jbe 0f
	252	movdqu xmm0, [esi + 16]
	253	movdqu [edi + 16], xmm0
	254
	255	// If the block size is not exactly four words then we must end-swap
	256	// everything. We can use fancy SSE toys for this.
	257	0: cmp ebx, 16
	258	je 0f // 16-byte blocks: leave the keys as they are
	259
	260	// Find the byte-reordering table.
	261	ldgot ecx
	262	movdqa xmm7, [INTADDR(endswap_tab, ecx)] // aligned load: the table is 16-byte aligned
	263
	264	// Calculate the number of subkey words again. (It's a good job
	265	// we've got a fast multiplier.)
	266	mov ecx, [ebp + nr]
	267	add ecx, 1
	268	imul ecx, [esp + 24] // total keys in words
	269
	270	// End-swap the encryption keys.
	271	mov eax, ecx // keep a copy: `endswap_block' counts ECX down
	272	lea esi, [ebp + w]
	273	call endswap_block
	274
	275	// And the decryption keys.
	276	mov ecx, eax // recover the word count
	277	lea esi, [ebp + wi]
	278	call endswap_block
	279
	280	// All done.
	281	0: pop edi // restore the caller's registers, in reverse order
	282	pop esi
	283	pop ebx
	284	pop ebp
	285	ret
	286
	287	.align 16
	288	endswap_block:
	289	// End-swap ECX words starting at ESI. The end-swapping table is
	290	// already loaded into XMM7; and it's OK to work in 16-byte chunks.
	291	movdqu xmm1, [esi]
	292	pshufb xmm1, xmm7 // reverse the bytes within each 32-bit word
	293	movdqu [esi], xmm1
	294	add esi, 16
	295	sub ecx, 4 // four words handled per pass
	296	ja endswap_block // loop while words remain; a final partial chunk is swapped whole
	297	ret
	298
	299	ENDFUNC
	300
	301	///--------------------------------------------------------------------------
	302	/// Encrypting and decrypting blocks.
	303
	304	FUNC(rijndael_eblk_x86_aesni)
	305
	306	// Encrypt one 16-byte block. On entry, we have:
	307	// [esp + 4] points to the context block
	308	// [esp + 8] points to the input data block
	309	// [esp + 12] points to the output buffer
	310
	311	// Find the magic endianness-swapping table.
	312	ldgot ecx
	313	movdqa xmm7, [INTADDR(endswap_tab, ecx)] // XMM7 = PSHUFB mask for end-swapping
	314
	315	// Load the input block and end-swap it. Also, start loading the
	316	// keys.
	317	mov eax, [esp + 8] // EAX -> input block
	318	movdqu xmm0, [eax] // unaligned load of the block
	319	pshufb xmm0, xmm7 // reverse the bytes within each 32-bit word
	320	mov eax, [esp + 4] // EAX -> context
	321	lea edx, [eax + w] // EDX -> encryption round keys
	322	mov eax, [eax + nr] // EAX = number of rounds
	323
	324	// Initial whitening.
	325	movdqu xmm1, [edx]
	326	add edx, 16 // step past the whitening key
	327	pxor xmm0, xmm1
	328
	329	// Dispatch to the correct code.
	330	cmp eax, 10
	331	je er10
	332	jb bogus // fewer than 10 rounds: abort
	333	cmp eax, 14
	334	je er14
	335	ja bogus // more than 14 rounds: abort
	336	cmp eax, 12
	337	je er12
	338	jb er11 // between 10 and 12: must be 11
	339	jmp er13 // between 12 and 14: must be 13
	340
	341	.align 2
	342
	343	// 14 rounds...
	344	er14: movdqu xmm1, [edx] // entry points are named for the rounds still to do
	345	add edx, 16
	346	aesenc xmm0, xmm1
	347
	348	// 13 rounds...
	349	er13: movdqu xmm1, [edx]
	350	add edx, 16
	351	aesenc xmm0, xmm1
	352
	353	// 12 rounds...
	354	er12: movdqu xmm1, [edx]
	355	add edx, 16
	356	aesenc xmm0, xmm1
	357
	358	// 11 rounds...
	359	er11: movdqu xmm1, [edx]
	360	add edx, 16
	361	aesenc xmm0, xmm1
	362
	363	// 10 rounds...
	364	er10: movdqu xmm1, [edx] // from here the keys are at fixed offsets from EDX
	365	aesenc xmm0, xmm1
	366
	367	// 9 rounds...
	368	movdqu xmm1, [edx + 16]
	369	aesenc xmm0, xmm1
	370
	371	// 8 rounds...
	372	movdqu xmm1, [edx + 32]
	373	aesenc xmm0, xmm1
	374
	375	// 7 rounds...
	376	movdqu xmm1, [edx + 48]
	377	aesenc xmm0, xmm1
	378
	379	// 6 rounds...
	380	movdqu xmm1, [edx + 64]
	381	aesenc xmm0, xmm1
	382
	383	// 5 rounds...
	384	movdqu xmm1, [edx + 80]
	385	aesenc xmm0, xmm1
	386
	387	// 4 rounds...
	388	movdqu xmm1, [edx + 96]
	389	aesenc xmm0, xmm1
	390
	391	// 3 rounds...
	392	movdqu xmm1, [edx + 112]
	393	aesenc xmm0, xmm1
	394
	395	// 2 rounds...
	396	movdqu xmm1, [edx + 128]
	397	aesenc xmm0, xmm1
	398
	399	// Final round...
	400	movdqu xmm1, [edx + 144]
	401	aesenclast xmm0, xmm1 // the last round uses the distinct AESENCLAST form
	402
	403	// Unpermute the ciphertext block and store it.
	404	pshufb xmm0, xmm7
	405	mov eax, [esp + 12] // EAX -> output buffer
	406	movdqu [eax], xmm0 // unaligned store of the result
	407
	408	// And we're done.
	409	ret
	410
	411	ENDFUNC
	412
	413	FUNC(rijndael_dblk_x86_aesni)
	414
	415	// Decrypt one 16-byte block. On entry, we have:
	416	// [esp + 4] points to the context block
	417	// [esp + 8] points to the input data block
	418	// [esp + 12] points to the output buffer
	419
	420	// Find the magic endianness-swapping table.
	421	ldgot ecx
	422	movdqa xmm7, [INTADDR(endswap_tab, ecx)] // XMM7 = PSHUFB mask for end-swapping
	423
	424	// Load the input block and end-swap it. Also, start loading the
	425	// keys.
	426	mov eax, [esp + 8] // EAX -> input block
	427	movdqu xmm0, [eax] // unaligned load of the block
	428	pshufb xmm0, xmm7 // reverse the bytes within each 32-bit word
	429	mov eax, [esp + 4] // EAX -> context
	430	lea edx, [eax + wi] // EDX -> decryption round keys
	431	mov eax, [eax + nr] // EAX = number of rounds
	432
	433	// Initial whitening.
	434	movdqu xmm1, [edx]
	435	add edx, 16 // step past the whitening key
	436	pxor xmm0, xmm1
	437
	438	// Dispatch to the correct code.
	439	cmp eax, 10
	440	je dr10
	441	jb bogus // fewer than 10 rounds: abort
	442	cmp eax, 14
	443	je dr14
	444	ja bogus // more than 14 rounds: abort
	445	cmp eax, 12
	446	je dr12
	447	jb dr11 // between 10 and 12: must be 11
	448	jmp dr13 // between 12 and 14: must be 13
	449
	450	.align 2
	451
	452	// 14 rounds...
	453	dr14: movdqu xmm1, [edx] // entry points are named for the rounds still to do
	454	add edx, 16
	455	aesdec xmm0, xmm1
	456
	457	// 13 rounds...
	458	dr13: movdqu xmm1, [edx]
	459	add edx, 16
	460	aesdec xmm0, xmm1
	461
	462	// 12 rounds...
	463	dr12: movdqu xmm1, [edx]
	464	add edx, 16
	465	aesdec xmm0, xmm1
	466
	467	// 11 rounds...
	468	dr11: movdqu xmm1, [edx]
	469	add edx, 16
	470	aesdec xmm0, xmm1
	471
	472	// 10 rounds...
	473	dr10: movdqu xmm1, [edx] // from here the keys are at fixed offsets from EDX
	474	aesdec xmm0, xmm1
	475
	476	// 9 rounds...
	477	movdqu xmm1, [edx + 16]
	478	aesdec xmm0, xmm1
	479
	480	// 8 rounds...
	481	movdqu xmm1, [edx + 32]
	482	aesdec xmm0, xmm1
	483
	484	// 7 rounds...
	485	movdqu xmm1, [edx + 48]
	486	aesdec xmm0, xmm1
	487
	488	// 6 rounds...
	489	movdqu xmm1, [edx + 64]
	490	aesdec xmm0, xmm1
	491
	492	// 5 rounds...
	493	movdqu xmm1, [edx + 80]
	494	aesdec xmm0, xmm1
	495
	496	// 4 rounds...
	497	movdqu xmm1, [edx + 96]
	498	aesdec xmm0, xmm1
	499
	500	// 3 rounds...
	501	movdqu xmm1, [edx + 112]
	502	aesdec xmm0, xmm1
	503
	504	// 2 rounds...
	505	movdqu xmm1, [edx + 128]
	506	aesdec xmm0, xmm1
	507
	508	// Final round...
	509	movdqu xmm1, [edx + 144]
	510	aesdeclast xmm0, xmm1 // the last round uses the distinct AESDECLAST form
	511
	512	// Unpermute the plaintext block and store it.
	513	pshufb xmm0, xmm7
	514	mov eax, [esp + 12] // EAX -> output buffer
	515	movdqu [eax], xmm0 // unaligned store of the result
	516
	517	// And we're done.
	518	ret
	519
	520	ENDFUNC
	521
	522	///--------------------------------------------------------------------------
	523	/// Random utilities.
	524
	525	.align 16
	526	// Abort the process because of a programming error. Indirecting
	527	// through this point serves several purposes: (a) by CALLing, rather
	528	// than branching to, `abort', we can save the return address, which
	529	// might at least provide a hint as to what went wrong; (b) we don't
	530	// have conditional CALLs (and they'd be big anyway); and (c) we can
	531	// write a HLT here as a backstop against `abort' being mad.
	532	bogus: callext F(abort) // reached when the round count is outside 10--14
	533	0: hlt // backstop, in case `abort' ever returns
	534	jmp 0b // ... and halt again if execution continues past the HLT
	535
	536	gotaux ecx // NOTE(review): presumably emits the helper behind `ldgot ecx' -- confirm in asm-common.h
	537
	538	///--------------------------------------------------------------------------
	539	/// Data tables.
	540
	541	.align 16 // the table is fetched with MOVDQA, which needs 16-byte alignment
	542	endswap_tab: // PSHUFB mask which reverses the bytes within each 32-bit word
	543	.byte 3, 2, 1, 0
	544	.byte 7, 6, 5, 4
	545	.byte 11, 10, 9, 8
	546	.byte 15, 14, 13, 12
	547
	548	///----- That's all, folks --------------------------------------------------