mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Fancy SIMD implementation of Salsa20
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// Main code.
	35
	36	.arch pentium4
	37	.text
	38
	39	FUNC(salsa20_core_x86ish_sse2)
		// Salsa20 core using SSE2.  Reads a 64-byte state matrix from IN
		// (already permuted as described below), runs the double-round
		// loop, adds the original input back in (the feedforward), undoes
		// the permutation, and writes the 64-byte result to OUT.
		//
		// Arguments, in order: NR, IN, OUT.  NR is a 32-bit count which is
		// reduced by 2 for every pass through the loop (presumably the
		// number of rounds -- confirm against the C prototype); IN and OUT
		// are only used as addresses, and are accessed with unaligned
		// loads and stores.
		//
		// Working registers are xmm0--xmm5, plus the four SAVE0--SAVE3
		// locations, whose definitions depend on the ABI.
	40
	41	// Initial setup.
	42
	43	#if CPUFAM_X86
	44	// Arguments come in on the stack, and will need to be collected. We
	45	// can get away with just the scratch registers for integer work,
	46	// but we'll run out of XMM registers and will need some properly
	47	// aligned space which we'll steal from the stack. I don't trust the
	48	// stack pointer's alignment, so I'll have to mask the stack pointer,
	49	// which in turn means I'll need to keep track of the old value.
	50	// Hence I'm making a full i386-style stack frame here.
	51	//
	52	// The Windows and SysV ABIs are sufficiently similar that we don't
	53	// need to worry about the differences here.
		//
		// Relative to the frame pointer, NR is at [ebp + 8], IN is at
		// [ebp + 12] and OUT is at [ebp + 16].  SAVE2 and SAVE3 occupy the
		// 32 bytes at the (masked, hence 16-byte aligned) stack pointer.
	54
	55	# define NR ecx
	56	# define IN eax
	57	# define OUT edx
	58	# define SAVE0 xmm6
	59	# define SAVE1 xmm7
	60	# define SAVE2 [esp + 0]
	61	# define SAVE3 [esp + 16]
	62
	63	push ebp
	64	mov ebp, esp
	65	sub esp, 32
	66	mov IN, [ebp + 12]
	67	mov OUT, [ebp + 16]
	68	and esp, ~15
	69	mov NR, [ebp + 8]
	70	#endif
	71
	72	#if CPUFAM_AMD64 && ABI_SYSV
	73	// This is nice. We have plenty of XMM registers, and the arguments
	74	// are in useful places. There's no need to spill anything and we
	75	// can just get on with the code.
	76
	77	# define NR edi
	78	# define IN rsi
	79	# define OUT rdx
	80	# define SAVE0 xmm6
	81	# define SAVE1 xmm7
	82	# define SAVE2 xmm8
	83	# define SAVE3 xmm9
	84	#endif
	85
	86	# if CPUFAM_AMD64 && ABI_WIN
	87	// Arguments come in registers, but they're different between Windows
	88	// and everyone else (and everyone else is saner).
	89	//
	90	// The Windows ABI insists that we preserve some of the XMM
	91	// registers, but we want more than we can use as scratch space. Two
	92	// of the SAVE slots can simply live in memory, since they only hold a
	93	// copy of the input for the feedforward at the end; the other two
	94	// use xmm6 and xmm7, so save the old values of those on the stack.
	95	// (We need an extra 8 bytes to align the stack.)
		//
		// Stack layout after the adjustment below:
		//
		//	[rsp +  0]	caller's xmm6
		//	[rsp + 16]	caller's xmm7
		//	[rsp + 32]	SAVE2
		//	[rsp + 48]	SAVE3
		//	[rsp + 64]	8 bytes of alignment padding
	96
	97	# define NR ecx
	98	# define IN rdx
	99	# define OUT r8
	100	# define SAVE0 xmm6
	101	# define SAVE1 xmm7
	102	# define SAVE2 [rsp + 32]
	103	# define SAVE3 [rsp + 48]
	104
	105	sub rsp, 64 + 8
	106	.seh_stackalloc 64 + 8
	107	movdqa [rsp + 0], xmm6
	108	.seh_savexmm xmm6, 0
	109	movdqa [rsp + 16], xmm7
	110	.seh_savexmm xmm7, 16
	111	.seh_endprologue
	112	#endif
	113
	114	// First job is to slurp the matrix into XMM registers. The words
	115	// have already been permuted conveniently to make them line up
	116	// better for SIMD processing.
	117	//
	118	// The textbook arrangement of the matrix is this.
	119	//
	120	// [C K K K]
	121	// [K C N N]
	122	// [T T C K]
	123	// [K K K C]
	124	//
	125	// But we've rotated the columns up so that the main diagonal with
	126	// the constants on it ends up in the first row, giving something more
	127	// like
	128	//
	129	// [C C C C]
	130	// [K T K K]
	131	// [T K K N]
	132	// [K K N K]
	133	//
	134	// so the transformation looks like this:
	135	//
	136	// [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
	137	// [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
	138	// [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
	139	// [12 13 14 15] [12 1 6 11] (d, xmm3)
		//
		// The input pointer's alignment isn't assumed, hence the unaligned
		// loads.
	140	movdqu xmm0, [IN + 0]
	141	movdqu xmm1, [IN + 16]
	142	movdqu xmm2, [IN + 32]
	143	movdqu xmm3, [IN + 48]
	144
	145	// Take a copy for later.
		// (These are the values added back in by the feedforward after the
		// loop.  The SAVE locations are either registers or aligned stack
		// slots, so aligned moves are fine here.)
	146	movdqa SAVE0, xmm0
	147	movdqa SAVE1, xmm1
	148	movdqa SAVE2, xmm2
	149	movdqa SAVE3, xmm3
	150
	151	0:
	152	// Apply a column quarterround to each of the columns simultaneously.
	153	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	154	// have to synthesize it.
		//
		// Each rotation `x <<< n' below is built as (x << n) | (x >> (32 - n)),
		// using xmm4 and xmm5 as the temporaries.
	155
	156	// b ^= (a + d) <<< 7
	157	movdqa xmm4, xmm0
	158	paddd xmm4, xmm3
	159	movdqa xmm5, xmm4
	160	pslld xmm4, 7
	161	psrld xmm5, 25
	162	por xmm4, xmm5
	163	pxor xmm1, xmm4
	164
	165	// c ^= (b + a) <<< 9
	166	movdqa xmm4, xmm1
	167	paddd xmm4, xmm0
	168	movdqa xmm5, xmm4
	169	pslld xmm4, 9
	170	psrld xmm5, 23
	171	por xmm4, xmm5
	172	pxor xmm2, xmm4
	173
	174	// d ^= (c + b) <<< 13
	175	movdqa xmm4, xmm2
	176	paddd xmm4, xmm1
	177	pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // b done with: 3, 4, 9, 14
	178	movdqa xmm5, xmm4
	179	pslld xmm4, 13
	180	psrld xmm5, 19
	181	por xmm4, xmm5
	182	pxor xmm3, xmm4
	183
	184	// a ^= (d + c) <<< 18
	185	movdqa xmm4, xmm3
	186	pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // d copied out: 1, 6, 11, 12
	187	paddd xmm4, xmm2
	188	pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // c done with: 2, 7, 8, 13
	189	movdqa xmm5, xmm4
	190	pslld xmm4, 18
	191	psrld xmm5, 14
	192	por xmm4, xmm5
	193	pxor xmm0, xmm4
	194
	195	// The transpose conveniently only involves reordering elements of
	196	// individual rows, which can be done quite easily, and reordering
	197	// the rows themselves, which is a trivial renaming. It doesn't
	198	// involve any movement of elements between rows.
	199	//
	200	// [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
	201	// [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
	202	// [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
	203	// [12 1 6 11] [ 3 4 9 14] (d, xmm1)
	204	//
	205	// The shuffles have quite high latency, so they've been pushed
	206	// backwards into the main instruction list.
		//
		// Note that, as a result, the roles of xmm1 and xmm3 are swapped in
		// the row round below: b is now xmm3 and d is now xmm1.
	207
	208	// Apply the row quarterround to each of the columns (yes!)
	209	// simultaneously.
	210
	211	// b ^= (a + d) <<< 7
	212	movdqa xmm4, xmm0
	213	paddd xmm4, xmm1
	214	movdqa xmm5, xmm4
	215	pslld xmm4, 7
	216	psrld xmm5, 25
	217	por xmm4, xmm5
	218	pxor xmm3, xmm4
	219
	220	// c ^= (b + a) <<< 9
	221	movdqa xmm4, xmm3
	222	paddd xmm4, xmm0
	223	movdqa xmm5, xmm4
	224	pslld xmm4, 9
	225	psrld xmm5, 23
	226	por xmm4, xmm5
	227	pxor xmm2, xmm4
	228
	229	// d ^= (c + b) <<< 13
	230	movdqa xmm4, xmm2
	231	paddd xmm4, xmm3
	232	pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) // b done with: 12, 1, 6, 11
	233	movdqa xmm5, xmm4
	234	pslld xmm4, 13
	235	psrld xmm5, 19
	236	por xmm4, xmm5
	237	pxor xmm1, xmm4
	238
	239	// a ^= (d + c) <<< 18
	240	movdqa xmm4, xmm1
	241	pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) // d copied out: 4, 9, 14, 3
	242	paddd xmm4, xmm2
	243	pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // c done with: 8, 13, 2, 7
	244	movdqa xmm5, xmm4
	245	pslld xmm4, 18
	246	psrld xmm5, 14
	247	por xmm4, xmm5
	248	pxor xmm0, xmm4
	249
	250	// We had to undo the transpose ready for the next loop. Again, push
	251	// back the shuffles because they take a long time coming through.
	252	// Decrement the loop counter and see if we should go round again.
	253	// Later processors fuse this pair into a single uop.
		//
		// The test is unsigned and comes after the loop body, so the body
		// always runs at least once, and we go round again only if NR was
		// strictly greater than 2 before the subtraction.
	254	sub NR, 2
	255	ja 0b
	256
	257	// Almost there. Firstly, the feedforward addition.
	258	paddd xmm0, SAVE0 // 0, 5, 10, 15
	259	paddd xmm1, SAVE1 // 4, 9, 14, 3
	260	paddd xmm2, SAVE2 // 8, 13, 2, 7
	261	paddd xmm3, SAVE3 // 12, 1, 6, 11
	262
	263	// Next we must undo the permutation which was already applied to the
	264	// input. This can be done by juggling values in registers, with the
	265	// following fancy footwork: some row rotations, a transpose, and
	266	// some more rotations.
		//
		// The transpose is done in two rounds of interleaving, with xmm4
		// and xmm5 holding copies so that both the low and high halves can
		// be interleaved.
	267	pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14
	268	pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13
	269	pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12
	270
	271	movdqa xmm4, xmm0
	272	movdqa xmm5, xmm3
	273	punpckldq xmm0, xmm2 // 0, 2, 5, 7
	274	punpckldq xmm3, xmm1 // 1, 3, 6, 4
	275	punpckhdq xmm4, xmm2 // 10, 8, 15, 13
	276	punpckhdq xmm5, xmm1 // 11, 9, 12, 14
	277
	278	movdqa xmm1, xmm0
	279	movdqa xmm2, xmm4
	280	punpckldq xmm0, xmm3 // 0, 1, 2, 3
	281	punpckldq xmm4, xmm5 // 10, 11, 8, 9
	282	punpckhdq xmm1, xmm3 // 5, 6, 7, 4
	283	punpckhdq xmm2, xmm5 // 15, 12, 13, 14
	284
	285	pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7
	286	pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11
	287	pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15
	288
	289	// Finally we have to write out the result.
		// (The rows have ended up in xmm0, xmm1, xmm4, xmm2, in that order.
		// The output pointer's alignment isn't assumed either.)
	290	movdqu [OUT + 0], xmm0
	291	movdqu [OUT + 16], xmm1
	292	movdqu [OUT + 32], xmm4
	293	movdqu [OUT + 48], xmm2
	294
	295	// Tidy things up.
		// On x86, recover the unmasked stack pointer from the frame pointer;
		// on 64-bit Windows, put back the caller's xmm6 and xmm7 and release
		// the stack space.  There's nothing to undo for 64-bit SysV.
	296	#if CPUFAM_X86
	297	mov esp, ebp
	298	pop ebp
	299	#endif
	300	#if CPUFAM_AMD64 && ABI_WIN
	301	movdqa xmm6, [rsp + 0]
	302	movdqa xmm7, [rsp + 16]
	303	add rsp, 64 + 8
	304	#endif
	305
	306	// And with that, we're done.
	307	ret
	308
	309	#undef NR
	310	#undef IN
	311	#undef OUT
	312	#undef SAVE0
	313	#undef SAVE1
	314	#undef SAVE2
	315	#undef SAVE3
	316
	317	ENDFUNC
	318
	319	///----- That's all, folks --------------------------------------------------