mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// AESNI-based implementation of Rijndael
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// External definitions.
	35
	36	.globl F(abort)
	37	.globl F(rijndael_rcon)
	38
	39	///--------------------------------------------------------------------------
	40	/// Local utilities.
	41
	42	// Magic constants for shuffling. These are used below as `pshufd'
		// immediates, where they rotate the four 32-bit words of an SSE
		// register: ROTL moves word i to position i + 1 (mod 4), ROTR moves
		// word i to position i - 1 (mod 4), and ROT2 exchanges the two
		// 64-bit halves.
	43	#define ROTL 0x93
	44	#define ROT2 0x4e
	45	#define ROTR 0x39
	46
	47	///--------------------------------------------------------------------------
	48	/// Main code.
	49
	50	.arch .aes
	51	.section .text
	52
	53	/// The AESNI instructions implement a little-endian version of AES, but
	54	/// Catacomb's internal interface presents as big-endian so as to work better
	55	/// with things like GCM. We therefore maintain the round keys in
	56	/// little-endian form, and have to end-swap blocks in and out.
	57	///
	58	/// For added amusement, the AESNI instructions don't implement the
	59	/// larger-block versions of Rijndael, so we have to end-swap the keys if
	60	/// we're preparing for one of those.
	61
	62	// Useful constants, used to lay out the key-schedule buffers.
	63	.equ maxrounds, 16 // maximum number of rounds
	64	.equ maxblksz, 32 // maximum block size, in bytes
	65	.equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes: one more round key than rounds
	66
	67	// Context structure: byte offsets of the fields used in this file.
	68	.equ nr, 0 // number of rounds (four bytes: `w' follows at nr + 4)
	69	.equ w, nr + 4 // encryption key words (kbufsz bytes)
	70	.equ wi, w + kbufsz // decryption key words
	71
	72	///--------------------------------------------------------------------------
	73	/// Key setup.
	74
	75	FUNC(rijndael_setup_x86_aesni)
	76
		// Fill in the encryption (`w') and decryption (`wi') round-key
		// vectors of a context from the caller's key material. The number
		// of rounds is read from the context rather than computed here.
		//
		// EBP, EBX, ESI and EDI are saved on entry and restored on exit.
		// EAX, ECX, EDX, XMM0 and XMM1 are used as scratch, as is XMM5 when
		// the block size is not four words.

	77	// Initial state, as seen after the four pushes below. We have four arguments:
	78	// [esp + 20] is the context pointer
	79	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	80	// [esp + 28] points to the key material, unaligned
	81	// [esp + 32] is the size of the key, in words
	82	// The key size has already been checked for validity, and the number
	83	// of rounds has been computed. Our job is only to fill in the `w'
	84	// and `wi' vectors.
	85
	86	push ebp
	87	push ebx
	88	push esi
	89	push edi
	90
	91	// The initial round key material is taken directly from the input
	92	// key, so copy it over.
	93	mov ebp, [esp + 20] // context base pointer
	94	mov ebx, [esp + 32] // key size, in words
	95	mov ecx, ebx // word count for the copy
	96	mov esi, [esp + 28] // source: the key material
	97	lea edi, [ebp + w] // destination: start of `w'
	98	rep movsd // copy ECX 32-bit words
	99
	100	// Find out other useful things.
	101	mov edx, [ebp + nr] // number of rounds
	102	add edx, 1 // one more round key than there are rounds
	103	imul edx, [esp + 24] // total key size in words
	104	sub edx, ebx // offset by the key size
	105
	106	// Find the round constants. (`ldgot' and `leaext' are macros which
		// aren't defined in this file; from the way ECX is used below, they
		// leave it pointing at the `rijndael_rcon' bytes.)
	107	ldgot ecx
	108	leaext ecx, rijndael_rcon, ecx
	109
	110	// Prepare for the main loop. Throughout, ESI points one key length
		// behind the word being generated, which is stored at
		// [esi + 4*ebx], and EAX holds the most recently generated word.
	111	lea esi, [ebp + w]
	112	mov eax, [esi + 4*ebx - 4] // most recent key word
	113	lea edx, [esi + 4*edx] // limit, offset by one key expansion
	114
	115	// Main key expansion loop. The first word of each key-length chunk
	116	// needs special treatment.
	117	//
	118	// This is rather tedious because the Intel `AESKEYGENASSIST'
	119	// instruction is very strangely shaped. Firstly, it wants to
	120	// operate on vast SSE registers, even though we're data-blocked from
	121	// doing more than one operation at a time unless we're doing two key
	122	// schedules simultaneously -- and even then we can't do more than
	123	// two, because the instruction ignores two of its input words
	124	// entirely, and produces two different outputs for each of the other
	125	// two. And secondly it insists on taking the magic round constant
	126	// as an immediate, so it's kind of annoying if you're not
	127	// open-coding the whole thing. It's much easier to leave that as
	128	// zero and XOR in the round constant by hand.
	129	9: movd xmm0, eax // word 0 = previous key word, the rest zero
	130	pshufd xmm0, xmm0, ROTR // move it round to word 3
	131	aeskeygenassist xmm1, xmm0, 0 // word 3 of the result is the substituted and byte-rotated input word 3
	132	pshufd xmm1, xmm1, ROTL // bring that back round to word 0
	133	movd eax, xmm1
	134	xor eax, [esi] // combine with the word one key length back
	135	xor al, [ecx] // mix the round constant into the low byte
	136	inc ecx // step on to the next round constant
	137	mov [esi + 4*ebx], eax // store the new key word
	138	add esi, 4
	139	cmp esi, edx // have we filled the whole schedule?
	140	jae 8f
	141
	142	// The next three words are simple...
	143	xor eax, [esi]
	144	mov [esi + 4*ebx], eax
	145	add esi, 4
	146	cmp esi, edx
	147	jae 8f
	148
	149	// (Word 2...)
	150	xor eax, [esi]
	151	mov [esi + 4*ebx], eax
	152	add esi, 4
	153	cmp esi, edx
	154	jae 8f
	155
	156	// (Word 3...)
	157	xor eax, [esi]
	158	mov [esi + 4*ebx], eax
	159	add esi, 4
	160	cmp esi, edx
	161	jae 8f
	162
	163	// Word 4. If the key is /more/ than 6 words long, then we must
	164	// apply a substitution here.
	165	cmp ebx, 5
	166	jb 9b // four-word key: start the next chunk
	167	cmp ebx, 7
	168	jb 0f // five- or six-word key: no substitution
	169	movd xmm0, eax
	170	pshufd xmm0, xmm0, ROTL // move the word round to position 1
	171	aeskeygenassist xmm1, xmm0, 0 // word 0 of the result is the substituted (unrotated) input word 1
	172	movd eax, xmm1
	173	0: xor eax, [esi]
	174	mov [esi + 4*ebx], eax
	175	add esi, 4
	176	cmp esi, edx
	177	jae 8f
	178
	179	// (Word 5...)
	180	cmp ebx, 6
	181	jb 9b // five-word key: start the next chunk
	182	xor eax, [esi]
	183	mov [esi + 4*ebx], eax
	184	add esi, 4
	185	cmp esi, edx
	186	jae 8f
	187
	188	// (Word 6...)
	189	cmp ebx, 7
	190	jb 9b // six-word key: start the next chunk
	191	xor eax, [esi]
	192	mov [esi + 4*ebx], eax
	193	add esi, 4
	194	cmp esi, edx
	195	jae 8f
	196
	197	// (Word 7...)
	198	cmp ebx, 8
	199	jb 9b // seven-word key: start the next chunk
	200	xor eax, [esi]
	201	mov [esi + 4*ebx], eax
	202	add esi, 4
	203	cmp esi, edx
	204	jae 8f
	205
	206	// Must be done by now.
	207	jmp 9b
	208
	209	// Next job is to construct the decryption keys. The keys for the
	210	// first and last rounds don't need to be mangled, but the remaining
	211	// ones do -- and they all need to be reordered too.
	212	//
	213	// The plan of action, then, is to copy the final encryption round's
	214	// keys into place first, then to do each of the intermediate rounds
	215	// in reverse order, and finally do the first round.
	216	//
	217	// Do all of the heavy lifting with SSE registers. The order we're
	218	// doing this in means that it's OK if we read or write too much, and
	219	// there's easily enough buffer space for the over-enthusiastic reads
	220	// and writes because the context has space for 32-byte blocks, which
	221	// is our maximum and an exact fit for two SSE registers.
	222	8: mov ecx, [ebp + nr] // number of rounds
	223	mov ebx, [esp + 24] // block size (in words)
	224	mov edx, ecx
	225	imul edx, ebx // word offset of the last round's keys
	226	lea edi, [ebp + wi] // output pointer, moving upwards
	227	lea esi, [ebp + 4*edx + w] // last round's keys
	228	shl ebx, 2 // block size (in bytes now)
	229
	230	// Copy the last encryption round's keys.
	231	movdqu xmm0, [esi]
	232	movdqu [edi], xmm0
	233	cmp ebx, 16
	234	jbe 9f // 16-byte blocks need only one register's worth
	235	movdqu xmm0, [esi + 16]
	236	movdqu [edi + 16], xmm0
	237
	238	// Update the loop variables and stop if we've finished.
	239	9: add edi, ebx
	240	sub esi, ebx
	241	sub ecx, 1
	242	jbe 0f // only the first round's keys are left
	243
	244	// Do another middle round's keys...
	245	movdqu xmm0, [esi]
	246	aesimc xmm0, xmm0
	247	movdqu [edi], xmm0
	248	cmp ebx, 16
	249	jbe 9b
	250	movdqu xmm0, [esi + 16]
	251	aesimc xmm0, xmm0
	252	movdqu [edi + 16], xmm0
	253	jmp 9b
	254
	255	// Finally do the first encryption round.
	256	0: movdqu xmm0, [esi]
	257	movdqu [edi], xmm0
	258	cmp ebx, 16
	259	jbe 0f
	260	movdqu xmm0, [esi + 16]
	261	movdqu [edi + 16], xmm0
	262
	263	// If the block size is not exactly four words then we must end-swap
	264	// everything. We can use fancy SSE toys for this.
	265	0: cmp ebx, 16
	266	je 0f // four-word blocks: leave the keys as they are
	267
	268	// Find the byte-reordering table.
	269	ldgot ecx
	270	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
	271
	272	// Calculate the number of subkey words again. (It's a good job
	273	// we've got a fast multiplier.)
	274	mov ecx, [ebp + nr]
	275	add ecx, 1
	276	imul ecx, [esp + 24] // total keys in words
	277
	278	// End-swap the encryption keys.
	279	mov eax, ecx // keep the count: `endswap_block' uses up ECX
	280	lea esi, [ebp + w]
	281	call endswap_block
	282
	283	// And the decryption keys.
	284	mov ecx, eax
	285	lea esi, [ebp + wi]
	286	call endswap_block
	287
	288	// All done.
	289	0: pop edi
	290	pop esi
	291	pop ebx
	292	pop ebp
	293	ret
	294
	295	.align 16
	296	endswap_block:
	297	// End-swap ECX words starting at ESI. The end-swapping table is
	298	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
		// Because of the chunking, a count which isn't a multiple of four is
		// effectively rounded up, and at least one chunk is always
		// processed. Clobbers ECX, ESI and XMM1.
	299	movdqu xmm1, [esi]
	300	pshufb xmm1, xmm5
	301	movdqu [esi], xmm1
	302	add esi, 16
	303	sub ecx, 4
	304	ja endswap_block // loop while words remain (unsigned test)
	305	ret
	306
	307	ENDFUNC
	308
	309	///--------------------------------------------------------------------------
	310	/// Encrypting and decrypting blocks.
	311
	312	FUNC(rijndael_eblk_x86_aesni)
	313
		// Encrypt a single 16-byte block using the encryption round keys
		// (`w') in the context. The context's round count must be between
		// 10 and 14 inclusive; anything else branches to `bogus'.
		//
		// Uses EAX, ECX, EDX, XMM0, XMM1 and XMM5 as scratch; nothing is
		// pushed on the stack.

	314	// On entry, we have:
	315	// [esp + 4] points to the context block
	316	// [esp + 8] points to the input data block
	317	// [esp + 12] points to the output buffer
	318
	319	// Find the magic endianness-swapping table.
	320	ldgot ecx
	321	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
	322
	323	// Load the input block and end-swap it. Also, start loading the
	324	// keys.
	325	mov eax, [esp + 8]
	326	movdqu xmm0, [eax]
	327	pshufb xmm0, xmm5
	328	mov eax, [esp + 4]
	329	lea edx, [eax + w] // EDX = current round-key pointer
	330	mov eax, [eax + nr] // EAX = number of rounds
	331
	332	// Initial whitening.
	333	movdqu xmm1, [edx]
	334	add edx, 16
	335	pxor xmm0, xmm1
	336
	337	// Dispatch to the correct code. Each entry point below does one
		// round and falls through into the next, so we enter at the label
		// matching the number of rounds still to do.
	338	cmp eax, 10
	339	je er10
	340	jb bogus // fewer than 10 rounds: invalid
	341	cmp eax, 14
	342	je er14
	343	ja bogus // more than 14 rounds: invalid
	344	cmp eax, 12
	345	je er12
	346	jb er11 // between 10 and 12: must be 11
	347	jmp er13 // between 12 and 14: must be 13
	348
	349	.align 2
	350
	351	// 14 rounds...
	352	er14: movdqu xmm1, [edx]
	353	add edx, 16
	354	aesenc xmm0, xmm1
	355
	356	// 13 rounds...
	357	er13: movdqu xmm1, [edx]
	358	add edx, 16
	359	aesenc xmm0, xmm1
	360
	361	// 12 rounds...
	362	er12: movdqu xmm1, [edx]
	363	add edx, 16
	364	aesenc xmm0, xmm1
	365
	366	// 11 rounds...
	367	er11: movdqu xmm1, [edx]
	368	add edx, 16
	369	aesenc xmm0, xmm1
	370
	371	// 10 rounds... From here on EDX stays put and the remaining keys
		// are addressed at fixed offsets from it.
	372	er10: movdqu xmm1, [edx]
	373	aesenc xmm0, xmm1
	374
	375	// 9 rounds...
	376	movdqu xmm1, [edx + 16]
	377	aesenc xmm0, xmm1
	378
	379	// 8 rounds...
	380	movdqu xmm1, [edx + 32]
	381	aesenc xmm0, xmm1
	382
	383	// 7 rounds...
	384	movdqu xmm1, [edx + 48]
	385	aesenc xmm0, xmm1
	386
	387	// 6 rounds...
	388	movdqu xmm1, [edx + 64]
	389	aesenc xmm0, xmm1
	390
	391	// 5 rounds...
	392	movdqu xmm1, [edx + 80]
	393	aesenc xmm0, xmm1
	394
	395	// 4 rounds...
	396	movdqu xmm1, [edx + 96]
	397	aesenc xmm0, xmm1
	398
	399	// 3 rounds...
	400	movdqu xmm1, [edx + 112]
	401	aesenc xmm0, xmm1
	402
	403	// 2 rounds...
	404	movdqu xmm1, [edx + 128]
	405	aesenc xmm0, xmm1
	406
	407	// Final round...
	408	movdqu xmm1, [edx + 144]
	409	aesenclast xmm0, xmm1
	410
	411	// Unpermute the ciphertext block and store it.
	412	pshufb xmm0, xmm5
	413	mov eax, [esp + 12]
	414	movdqu [eax], xmm0
	415
	416	// And we're done.
	417	ret
	418
	419	ENDFUNC
	420
	421	FUNC(rijndael_dblk_x86_aesni)
	422
		// Decrypt a single 16-byte block using the decryption round keys
		// (`wi') in the context. The context's round count must be between
		// 10 and 14 inclusive; anything else branches to `bogus'.
		//
		// Uses EAX, ECX, EDX, XMM0, XMM1 and XMM5 as scratch; nothing is
		// pushed on the stack.

	423	// On entry, we have:
	424	// [esp + 4] points to the context block
	425	// [esp + 8] points to the input data block
	426	// [esp + 12] points to the output buffer
	427
	428	// Find the magic endianness-swapping table.
	429	ldgot ecx
	430	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
	431
	432	// Load the input block and end-swap it. Also, start loading the
	433	// keys.
	434	mov eax, [esp + 8]
	435	movdqu xmm0, [eax]
	436	pshufb xmm0, xmm5
	437	mov eax, [esp + 4]
	438	lea edx, [eax + wi] // EDX = current round-key pointer
	439	mov eax, [eax + nr] // EAX = number of rounds
	440
	441	// Initial whitening.
	442	movdqu xmm1, [edx]
	443	add edx, 16
	444	pxor xmm0, xmm1
	445
	446	// Dispatch to the correct code. Each entry point below does one
		// round and falls through into the next, so we enter at the label
		// matching the number of rounds still to do.
	447	cmp eax, 10
	448	je dr10
	449	jb bogus // fewer than 10 rounds: invalid
	450	cmp eax, 14
	451	je dr14
	452	ja bogus // more than 14 rounds: invalid
	453	cmp eax, 12
	454	je dr12
	455	jb dr11 // between 10 and 12: must be 11
	456	jmp dr13 // between 12 and 14: must be 13
	457
	458	.align 2
	459
	460	// 14 rounds...
	461	dr14: movdqu xmm1, [edx]
	462	add edx, 16
	463	aesdec xmm0, xmm1
	464
	465	// 13 rounds...
	466	dr13: movdqu xmm1, [edx]
	467	add edx, 16
	468	aesdec xmm0, xmm1
	469
	470	// 12 rounds...
	471	dr12: movdqu xmm1, [edx]
	472	add edx, 16
	473	aesdec xmm0, xmm1
	474
	475	// 11 rounds...
	476	dr11: movdqu xmm1, [edx]
	477	add edx, 16
	478	aesdec xmm0, xmm1
	479
	480	// 10 rounds... From here on EDX stays put and the remaining keys
		// are addressed at fixed offsets from it.
	481	dr10: movdqu xmm1, [edx]
	482	aesdec xmm0, xmm1
	483
	484	// 9 rounds...
	485	movdqu xmm1, [edx + 16]
	486	aesdec xmm0, xmm1
	487
	488	// 8 rounds...
	489	movdqu xmm1, [edx + 32]
	490	aesdec xmm0, xmm1
	491
	492	// 7 rounds...
	493	movdqu xmm1, [edx + 48]
	494	aesdec xmm0, xmm1
	495
	496	// 6 rounds...
	497	movdqu xmm1, [edx + 64]
	498	aesdec xmm0, xmm1
	499
	500	// 5 rounds...
	501	movdqu xmm1, [edx + 80]
	502	aesdec xmm0, xmm1
	503
	504	// 4 rounds...
	505	movdqu xmm1, [edx + 96]
	506	aesdec xmm0, xmm1
	507
	508	// 3 rounds...
	509	movdqu xmm1, [edx + 112]
	510	aesdec xmm0, xmm1
	511
	512	// 2 rounds...
	513	movdqu xmm1, [edx + 128]
	514	aesdec xmm0, xmm1
	515
	516	// Final round...
	517	movdqu xmm1, [edx + 144]
	518	aesdeclast xmm0, xmm1
	519
	520	// Unpermute the plaintext block and store it.
	521	pshufb xmm0, xmm5
	522	mov eax, [esp + 12]
	523	movdqu [eax], xmm0
	524
	525	// And we're done.
	526	ret
	527
	528	ENDFUNC
	529
	530	///--------------------------------------------------------------------------
	531	/// Random utilities.
	532
	533	.align 16
	534	// Abort the process because of a programming error. Indirecting
	535	// through this point serves several purposes: (a) by CALLing, rather
	536	// than branching to, `abort', we can save the return address, which
	537	// might at least provide a hint as to what went wrong; (b) we don't
	538	// have conditional CALLs (and they'd be big anyway); and (c) we can
	539	// write a HLT here as a backstop against `abort' being mad.
		//
		// The block encryption and decryption functions branch here when
		// the context's round count is outside the range they handle.
	540	bogus: callext F(abort)
	541	0: hlt // only reached if `abort' returns
	542	jmp 0b // and if HLT itself is got past, just do it again
	543
	544	gotaux ecx // macro not defined in this file; presumably provides the support code for `ldgot ecx' -- TODO confirm against asm-common.h
	545
	546	///--------------------------------------------------------------------------
	547	/// Data tables.
	548
	549	.align 16
		// Byte-shuffling table for `pshufb': reverses the order of the bytes
		// within each of the four 32-bit words of an SSE register. It's
		// loaded with `movdqa', so the 16-byte alignment above is required.
	550	endswap_tab:
	551	.byte 3, 2, 1, 0
	552	.byte 7, 6, 5, 4
	553	.byte 11, 10, 9, 8
	554	.byte 15, 14, 13, 12
	555
	556	///----- That's all, folks --------------------------------------------------