symm/salsa20-x86ish-sse2.S: Fix stray `##' comment to be `//'.
[catacomb] / symm / salsa20-x86ish-sse2.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// Fancy SIMD implementation of Salsa20
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
33///--------------------------------------------------------------------------
47103664
MW
34/// Local utilities.
35
36// Magic constants for shuffling.  These are immediates for `pshufd':
// with selector byte s, dst[i] = src[(s >> 2*i) & 3] for lanes i = 0..3.
37#define ROTL 0x93	// 0b10_01_00_11: dst[i] = src[(i - 1) mod 4] — rotate lanes up one place
38#define ROT2 0x4e	// 0b01_00_11_10: dst[i] = src[(i + 2) mod 4] — swap the two halves
39#define ROTR 0x39	// 0b00_11_10_01: dst[i] = src[(i + 1) mod 4] — rotate lanes down one place
40
41///--------------------------------------------------------------------------
1a0c09c4
MW
42/// Main code.
43
44 .arch pentium4
45 .section .text
46
0f23f75f
MW
///--------------------------------------------------------------------------
/// salsa20_core_x86ish_sse2(nr, src, dst):
///
/// Perform `nr' rounds (two per loop iteration, so `nr' should be even)
/// of the Salsa20 core over the sixteen 32-bit words at `src', apply the
/// feedforward addition of the original input, and scatter the 64-byte
/// result to `dst', undoing the input permutation described below.
/// NOTE(review): the (nr, src, dst) argument order is inferred from the
/// register/stack assignments below — confirm against the C prototype.
47FUNC(salsa20_core_x86ish_sse2)
48
49 // Initial setup.
50
51#if CPUFAM_X86
52 // Arguments come in on the stack, and will need to be collected. We
53 // can get away with just the scratch registers for integer work,
54 // but we'll run out of XMM registers and will need some properly
55 // aligned space which we'll steal from the stack. I don't trust the
56 // stack pointer's alignment, so I'll have to mask the stack pointer,
57 // which in turn means I'll need to keep track of the old value.
58 // Hence I'm making a full i386-style stack frame here.
59 //
60 // The Windows and SysV ABIs are sufficiently similar that we don't
61 // need to worry about the differences here.
62
63# define NR ecx
64# define IN eax
65# define OUT edx
66# define SAVE0 xmm6
67# define SAVE1 xmm7
68# define SAVE2 [esp + 0]
69# define SAVE3 [esp + 16]
1a0c09c4 70
1a0c09c4
MW
71 push ebp
72 mov ebp, esp
73 sub esp, 32
0f23f75f
MW
74 mov IN, [ebp + 12]
75 mov OUT, [ebp + 16]
1a0c09c4 76 and esp, ~15	// 16-byte align: SAVE2/SAVE3 are accessed with movdqa
0f23f75f
MW
77 mov NR, [ebp + 8]
78#endif
79
80#if CPUFAM_AMD64 && ABI_SYSV
81 // This is nice. We have plenty of XMM registers, and the arguments
82 // are in useful places. There's no need to spill anything and we
83 // can just get on with the code.
84
85# define NR edi
86# define IN rsi
87# define OUT rdx
88# define SAVE0 xmm6
89# define SAVE1 xmm7
90# define SAVE2 xmm8
91# define SAVE3 xmm9
92#endif
93
94# if CPUFAM_AMD64 && ABI_WIN
95 // Arguments come in registers, but they're different between Windows
96 // and everyone else (and everyone else is saner).
97 //
98 // The Windows ABI insists that we preserve some of the XMM
99 // registers, but we want more than we can use as scratch space. For
100 // two of them we only need to save a copy of the input for the
101 // feedforward at the end; but the other two we want for the final
102 // permutation, so save the old values on the stack. (We need an extra
103 // 8 bytes to align the stack.)
104
105# define NR ecx
106# define IN rdx
107# define OUT r8
108# define SAVE0 xmm6
109# define SAVE1 xmm7
110# define SAVE2 [rsp + 32]
111# define SAVE3 [rsp + 48]
112
113 sub rsp, 64 + 8
114 movdqa [rsp + 0], xmm6	// stash callee-saved xmm6/xmm7 (Win64 ABI)
115 movdqa [rsp + 16], xmm7
116#endif
1a0c09c4
MW
117
118 // First job is to slurp the matrix into XMM registers. The words
119 // have already been permuted conveniently to make them line up
120 // better for SIMD processing.
121 //
122 // The textbook arrangement of the matrix is this.
123 //
124 // [C K K K]
125 // [K C N N]
126 // [T T C K]
127 // [K K K C]
128 //
129 // But we've rotated the columns up so that the main diagonal with
130 // the constants on it end up in the first row, giving something more
131 // like
132 //
133 // [C C C C]
134 // [K T K K]
135 // [T K K N]
136 // [K K N K]
137 //
138 // so the transformation looks like this:
139 //
140 // [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
141 // [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
142 // [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
143 // [12 13 14 15] [12 1 6 11] (d, xmm3)
0f23f75f
MW
144 movdqu xmm0, [IN + 0]
145 movdqu xmm1, [IN + 16]
146 movdqu xmm2, [IN + 32]
147 movdqu xmm3, [IN + 48]
1a0c09c4 148
7afb1dc9 149 // Take a copy for later (the feedforward addition at the end).
0f23f75f
MW
150 movdqa SAVE0, xmm0
151 movdqa SAVE1, xmm1
152 movdqa SAVE2, xmm2
153 movdqa SAVE3, xmm3
1a0c09c4
MW
154
155loop:	// each iteration does one column round and one row round (two rounds)
1a0c09c4
MW
156 // Apply a column quarterround to each of the columns simultaneously.
157 // Alas, there doesn't seem to be a packed doubleword rotate, so we
158 // have to synthesize it.
159
160 // b ^= (a + d) <<< 7
161 movdqa xmm4, xmm0
162 paddd xmm4, xmm3
163 movdqa xmm5, xmm4
164 pslld xmm4, 7
165 psrld xmm5, 25	// 7 + 25 = 32: shift pair synthesizes the rotate
166 por xmm4, xmm5
167 pxor xmm1, xmm4
168
169 // c ^= (b + a) <<< 9
170 movdqa xmm4, xmm1
171 paddd xmm4, xmm0
172 movdqa xmm5, xmm4
173 pslld xmm4, 9
174 psrld xmm5, 23
175 por xmm4, xmm5
176 pxor xmm2, xmm4
177
178 // d ^= (c + b) <<< 13
179 movdqa xmm4, xmm2
180 paddd xmm4, xmm1
47103664 181 pshufd xmm1, xmm1, ROTL	// b no longer needed: start the transpose early
1a0c09c4
MW
182 movdqa xmm5, xmm4
183 pslld xmm4, 13
184 psrld xmm5, 19
185 por xmm4, xmm5
186 pxor xmm3, xmm4
187
188 // a ^= (d + c) <<< 18
189 movdqa xmm4, xmm3
47103664 190 pshufd xmm3, xmm3, ROTR
1a0c09c4 191 paddd xmm4, xmm2
47103664 192 pshufd xmm2, xmm2, ROT2
1a0c09c4
MW
193 movdqa xmm5, xmm4
194 pslld xmm4, 18
195 psrld xmm5, 14
196 por xmm4, xmm5
197 pxor xmm0, xmm4
198
199 // The transpose conveniently only involves reordering elements of
200 // individual rows, which can be done quite easily, and reordering
201 // the rows themselves, which is a trivial renaming. It doesn't
202 // involve any movement of elements between rows.
203 //
204 // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
0f23f75f
MW
205 // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
206 // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
207 // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
1a0c09c4
MW
208 //
209 // The shuffles have quite high latency, so they've been pushed
210 // backwards into the main instruction list.
211
212 // Apply the row quarterround to each of the columns (yes!)
213 // simultaneously.
214
215 // b ^= (a + d) <<< 7
216 movdqa xmm4, xmm0
217 paddd xmm4, xmm1
218 movdqa xmm5, xmm4
219 pslld xmm4, 7
220 psrld xmm5, 25
221 por xmm4, xmm5
222 pxor xmm3, xmm4
223
224 // c ^= (b + a) <<< 9
225 movdqa xmm4, xmm3
226 paddd xmm4, xmm0
227 movdqa xmm5, xmm4
228 pslld xmm4, 9
229 psrld xmm5, 23
230 por xmm4, xmm5
231 pxor xmm2, xmm4
232
233 // d ^= (c + b) <<< 13
234 movdqa xmm4, xmm2
235 paddd xmm4, xmm3
47103664 236 pshufd xmm3, xmm3, ROTL	// start undoing the transpose early
1a0c09c4
MW
237 movdqa xmm5, xmm4
238 pslld xmm4, 13
239 psrld xmm5, 19
240 por xmm4, xmm5
241 pxor xmm1, xmm4
242
243 // a ^= (d + c) <<< 18
244 movdqa xmm4, xmm1
47103664 245 pshufd xmm1, xmm1, ROTR
1a0c09c4 246 paddd xmm4, xmm2
47103664 247 pshufd xmm2, xmm2, ROT2
1a0c09c4
MW
248 movdqa xmm5, xmm4
249 pslld xmm4, 18
250 psrld xmm5, 14
251 por xmm4, xmm5
252 pxor xmm0, xmm4
253
254 // We have to undo the transpose ready for the next loop. Again, push
255 // back the shuffles because they take a long time coming through.
256 // Decrement the loop counter and see if we should go round again.
257 // Later processors fuse this pair into a single uop.
0f23f75f 258 sub NR, 2	// two Salsa20 rounds completed per iteration
1a0c09c4
MW
259 ja loop	// continue while rounds remain (unsigned compare)
260
261 // Almost there. Firstly, the feedforward addition, and then we have
262 // to write out the result. Here we have to undo the permutation
263 // which was already applied to the input. Shuffling has quite high
264 // latency, so arrange to start a new shuffle into a temporary as
265 // soon as we've written out the old value.
0f23f75f
MW
266 paddd xmm0, SAVE0
267 pshufd xmm4, xmm0, 0x39	// 0x39 == ROTR
268 movd [OUT + 0], xmm0
1a0c09c4 269
0f23f75f 270 paddd xmm1, SAVE1
47103664 271 pshufd xmm5, xmm1, ROTL
0f23f75f 272 movd [OUT + 16], xmm1
1a0c09c4 273
0f23f75f 274 paddd xmm2, SAVE2
47103664 275 pshufd xmm6, xmm2, ROT2
0f23f75f 276 movd [OUT + 32], xmm2
1a0c09c4 277
0f23f75f 278 paddd xmm3, SAVE3
47103664 279 pshufd xmm7, xmm3, ROTR
0f23f75f 280 movd [OUT + 48], xmm3
1a0c09c4 281
0f23f75f 282 movd [OUT + 4], xmm7
47103664 283 pshufd xmm7, xmm3, ROT2
0f23f75f 284 movd [OUT + 24], xmm7
47103664 285 pshufd xmm3, xmm3, ROTL
0f23f75f 286 movd [OUT + 44], xmm3
1a0c09c4 287
0f23f75f 288 movd [OUT + 8], xmm6
47103664 289 pshufd xmm6, xmm2, ROTL
0f23f75f 290 movd [OUT + 28], xmm6
47103664 291 pshufd xmm2, xmm2, ROTR
0f23f75f 292 movd [OUT + 52], xmm2
1a0c09c4 293
0f23f75f 294 movd [OUT + 12], xmm5
47103664 295 pshufd xmm5, xmm1, ROTR
0f23f75f 296 movd [OUT + 36], xmm5
47103664 297 pshufd xmm1, xmm1, ROT2
0f23f75f 298 movd [OUT + 56], xmm1
1a0c09c4 299
0f23f75f 300 movd [OUT + 20], xmm4
47103664 301 pshufd xmm4, xmm0, ROT2
0f23f75f 302 movd [OUT + 40], xmm4
47103664 303 pshufd xmm0, xmm0, ROTL
0f23f75f 304 movd [OUT + 60], xmm0
1a0c09c4
MW
305
306 // Tidy things up.
0f23f75f
MW
307
308#if CPUFAM_X86
1a0c09c4
MW
309 mov esp, ebp	// restore the unmasked stack pointer saved in the prologue
310 pop ebp
0f23f75f
MW
311#endif
312#if CPUFAM_AMD64 && ABI_WIN
313 movdqa xmm6, [rsp + 0]	// restore callee-saved xmm6/xmm7 (Win64 ABI)
314 movdqa xmm7, [rsp + 16]
315 add rsp, 64 + 8
316#endif
1a0c09c4
MW
317
318 // And with that, we're done.
319 ret
320
0f23f75f
MW
321#undef NR
322#undef IN
323#undef OUT
324#undef SAVE0
325#undef SAVE1
326#undef SAVE2
327#undef SAVE3
328
1a0c09c4
MW
329ENDFUNC
330
331///----- That's all, folks --------------------------------------------------