mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Fancy SIMD implementation of Salsa20
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// Local utilities.
	35
	36	// Magic constants for shuffling: `pshufd' immediates which rotate lanes.
	37	#define ROTL 0x93
	38	#define ROT2 0x4e
	39	#define ROTR 0x39
	40
	41	///--------------------------------------------------------------------------
	42	/// Main code.
	43	// Assemble for the Pentium 4 or later, so SSE2 instructions are accepted.
	44	.arch pentium4
	45	.text
	46
	47	FUNC(salsa20_core_x86ish_sse2)
	48		// Run NR rounds of Salsa20 on the 64-byte matrix at IN, add the
	49		// original input back in, and store the 64-byte result at OUT. The
	50		// arguments arrive in the order NR, IN, OUT. Setup varies by ABI.
	51	#if CPUFAM_X86
	52		// Arguments come in on the stack, and will need to be collected. We
	53		// can get away with just the scratch registers for integer work,
	54		// but we'll run out of XMM registers and will need some properly
	55		// aligned space which we'll steal from the stack. I don't trust the
	56		// stack pointer's alignment, so I'll have to mask the stack pointer,
	57		// which in turn means I'll need to keep track of the old value.
	58		// Hence I'm making a full i386-style stack frame here.
	59		//
	60		// The Windows and SysV ABIs are sufficiently similar that we don't
	61		// need to worry about the differences here.
	62
	63	# define NR ecx
	64	# define IN eax
	65	# define OUT edx
	66	# define SAVE0 xmm6
	67	# define SAVE1 xmm7
	68	# define SAVE2 [esp + 0]
	69	# define SAVE3 [esp + 16]
	70
	71		push	ebp
	72		mov	ebp, esp
	73		sub	esp, 32
	74		mov	IN, [ebp + 12]
	75		mov	OUT, [ebp + 16]
	76		and	esp, ~15
	77		mov	NR, [ebp + 8]
	78	#endif
	79
	80	#if CPUFAM_AMD64 && ABI_SYSV
	81		// This is nice. We have plenty of XMM registers, and the arguments
	82		// are in useful places. There's no need to spill anything and we
	83		// can just get on with the code.
	84
	85	# define NR edi
	86	# define IN rsi
	87	# define OUT rdx
	88	# define SAVE0 xmm6
	89	# define SAVE1 xmm7
	90	# define SAVE2 xmm8
	91	# define SAVE3 xmm9
	92	#endif
	93
	94	#if CPUFAM_AMD64 && ABI_WIN
	95		// Arguments come in registers, but they're different between Windows
	96		// and everyone else (and everyone else is saner).
	97		//
	98		// The Windows ABI insists that we preserve some of the XMM registers,
	99		// but we want more than we can use as scratch space. In two places
	100		// we only need to save a copy of the input for the feedforward at
	101		// the end; but the other two we want for the final permutation, so
	102		// save the old values on the stack. (We need an extra 8 bytes to
	103		// align the stack.)
	104
	105	# define NR ecx
	106	# define IN rdx
	107	# define OUT r8
	108	# define SAVE0 xmm6
	109	# define SAVE1 xmm7
	110	# define SAVE2 [rsp + 32]
	111	# define SAVE3 [rsp + 48]
	112
	113		sub	rsp, 64 + 8
	114		.seh_stackalloc 64 + 8
	115		movdqa	[rsp + 0], xmm6
	116		.seh_savexmm xmm6, 0
	117		movdqa	[rsp + 16], xmm7
	118		.seh_savexmm xmm7, 16
	119		.seh_endprologue
	120	#endif
	121
	122		// First job is to slurp the matrix into XMM registers. The words
	123		// have already been permuted conveniently to make them line up
	124		// better for SIMD processing.
	125		//
	126		// The textbook arrangement of the matrix is this.
	127		//
	128		//   [C K K K]
	129		//   [K C N N]
	130		//   [T T C K]
	131		//   [K K K C]
	132		//
	133		// But we've rotated the columns up so that the main diagonal with
	134		// the constants on it ends up in the first row, giving something more
	135		// like
	136		//
	137		//   [C C C C]
	138		//   [K T K K]
	139		//   [T K K N]
	140		//   [K K N K]
	141		//
	142		// so the transformation looks like this:
	143		//
	144		//   [ 0  1  2  3]       [ 0  5 10 15]  (a, xmm0)
	145		//   [ 4  5  6  7]  -->  [ 4  9 14  3]  (b, xmm1)
	146		//   [ 8  9 10 11]       [ 8 13  2  7]  (c, xmm2)
	147		//   [12 13 14 15]       [12  1  6 11]  (d, xmm3)
	148		movdqu	xmm0, [IN + 0]
	149		movdqu	xmm1, [IN + 16]
	150		movdqu	xmm2, [IN + 32]
	151		movdqu	xmm3, [IN + 48]
	152
	153		// Take a copy for later.
	154		movdqa	SAVE0, xmm0
	155		movdqa	SAVE1, xmm1
	156		movdqa	SAVE2, xmm2
	157		movdqa	SAVE3, xmm3
	158
	159	0:
	160		// Apply a column quarterround to each of the columns simultaneously.
	161		// Alas, there doesn't seem to be a packed doubleword rotate, so we
	162		// have to synthesize it.
	163
	164		// b ^= (a + d) <<< 7
	165		movdqa	xmm4, xmm0
	166		paddd	xmm4, xmm3
	167		movdqa	xmm5, xmm4
	168		pslld	xmm4, 7
	169		psrld	xmm5, 25
	170		por	xmm4, xmm5
	171		pxor	xmm1, xmm4
	172
	173		// c ^= (b + a) <<< 9
	174		movdqa	xmm4, xmm1
	175		paddd	xmm4, xmm0
	176		movdqa	xmm5, xmm4
	177		pslld	xmm4, 9
	178		psrld	xmm5, 23
	179		por	xmm4, xmm5
	180		pxor	xmm2, xmm4
	181
	182		// d ^= (c + b) <<< 13
	183		movdqa	xmm4, xmm2
	184		paddd	xmm4, xmm1
	185		pshufd	xmm1, xmm1, ROTL
	186		movdqa	xmm5, xmm4
	187		pslld	xmm4, 13
	188		psrld	xmm5, 19
	189		por	xmm4, xmm5
	190		pxor	xmm3, xmm4
	191
	192		// a ^= (d + c) <<< 18
	193		movdqa	xmm4, xmm3
	194		pshufd	xmm3, xmm3, ROTR
	195		paddd	xmm4, xmm2
	196		pshufd	xmm2, xmm2, ROT2
	197		movdqa	xmm5, xmm4
	198		pslld	xmm4, 18
	199		psrld	xmm5, 14
	200		por	xmm4, xmm5
	201		pxor	xmm0, xmm4
	202
	203		// The transpose conveniently only involves reordering elements of
	204		// individual rows, which can be done quite easily, and reordering
	205		// the rows themselves, which is a trivial renaming. It doesn't
	206		// involve any movement of elements between rows.
	207		//
	208		//   [ 0  5 10 15]       [ 0  5 10 15]  (a, xmm0)
	209		//   [ 4  9 14  3]  -->  [ 1  6 11 12]  (b, xmm3)
	210		//   [ 8 13  2  7]       [ 2  7  8 13]  (c, xmm2)
	211		//   [12  1  6 11]       [ 3  4  9 14]  (d, xmm1)
	212		//
	213		// The shuffles have quite high latency, so they've been pushed
	214		// backwards into the main instruction list.
	215
	216		// Apply the row quarterround to each of the columns (yes!)
	217		// simultaneously.
	218
	219		// b ^= (a + d) <<< 7
	220		movdqa	xmm4, xmm0
	221		paddd	xmm4, xmm1
	222		movdqa	xmm5, xmm4
	223		pslld	xmm4, 7
	224		psrld	xmm5, 25
	225		por	xmm4, xmm5
	226		pxor	xmm3, xmm4
	227
	228		// c ^= (b + a) <<< 9
	229		movdqa	xmm4, xmm3
	230		paddd	xmm4, xmm0
	231		movdqa	xmm5, xmm4
	232		pslld	xmm4, 9
	233		psrld	xmm5, 23
	234		por	xmm4, xmm5
	235		pxor	xmm2, xmm4
	236
	237		// d ^= (c + b) <<< 13
	238		movdqa	xmm4, xmm2
	239		paddd	xmm4, xmm3
	240		pshufd	xmm3, xmm3, ROTL
	241		movdqa	xmm5, xmm4
	242		pslld	xmm4, 13
	243		psrld	xmm5, 19
	244		por	xmm4, xmm5
	245		pxor	xmm1, xmm4
	246
	247		// a ^= (d + c) <<< 18
	248		movdqa	xmm4, xmm1
	249		pshufd	xmm1, xmm1, ROTR
	250		paddd	xmm4, xmm2
	251		pshufd	xmm2, xmm2, ROT2
	252		movdqa	xmm5, xmm4
	253		pslld	xmm4, 18
	254		psrld	xmm5, 14
	255		por	xmm4, xmm5
	256		pxor	xmm0, xmm4
	257
	258		// We had to undo the transpose ready for the next loop. Again, push
	259		// back the shuffles because they take a long time coming through.
	260		// Each pass did two rounds, so take two off the counter and go round
	261		// again if any remain. Later processors fuse this pair into one uop.
	262		sub	NR, 2
	263		ja	0b
	264
	265		// Almost there. Firstly, the feedforward addition, and then we have
	266		// to write out the result. Here we have to undo the permutation
	267		// which was already applied to the input. Shuffling has quite high
	268		// latency, so arrange to start a new shuffle into a temporary as
	269		// soon as we've written out the old value.
	270		paddd	xmm0, SAVE0
	271		pshufd	xmm4, xmm0, ROTR
	272		movd	[OUT + 0], xmm0
	273
	274		paddd	xmm1, SAVE1
	275		pshufd	xmm5, xmm1, ROTL
	276		movd	[OUT + 16], xmm1
	277
	278		paddd	xmm2, SAVE2
	279		pshufd	xmm6, xmm2, ROT2
	280		movd	[OUT + 32], xmm2
	281
	282		paddd	xmm3, SAVE3
	283		pshufd	xmm7, xmm3, ROTR
	284		movd	[OUT + 48], xmm3
	285
	286		movd	[OUT + 4], xmm7
	287		pshufd	xmm7, xmm3, ROT2
	288		movd	[OUT + 24], xmm7
	289		pshufd	xmm3, xmm3, ROTL
	290		movd	[OUT + 44], xmm3
	291
	292		movd	[OUT + 8], xmm6
	293		pshufd	xmm6, xmm2, ROTL
	294		movd	[OUT + 28], xmm6
	295		pshufd	xmm2, xmm2, ROTR
	296		movd	[OUT + 52], xmm2
	297
	298		movd	[OUT + 12], xmm5
	299		pshufd	xmm5, xmm1, ROTR
	300		movd	[OUT + 36], xmm5
	301		pshufd	xmm1, xmm1, ROT2
	302		movd	[OUT + 56], xmm1
	303
	304		movd	[OUT + 20], xmm4
	305		pshufd	xmm4, xmm0, ROT2
	306		movd	[OUT + 40], xmm4
	307		pshufd	xmm0, xmm0, ROTL
	308		movd	[OUT + 60], xmm0
	309
	310		// Tidy things up.
	311
	312	#if CPUFAM_X86
	313		mov	esp, ebp
	314		pop	ebp
	315	#endif
	316	#if CPUFAM_AMD64 && ABI_WIN
	317		movdqa	xmm6, [rsp + 0]
	318		movdqa	xmm7, [rsp + 16]
	319		add	rsp, 64 + 8
	320	#endif
	321
	322		// And with that, we're done.
	323		ret
	324
	325	#undef NR
	326	#undef IN
	327	#undef OUT
	328	#undef SAVE0
	329	#undef SAVE1
	330	#undef SAVE2
	331	#undef SAVE3
	332
	333	ENDFUNC
	334
	335	///----- That's all, folks --------------------------------------------------