/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
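	// (The Pentium 4 was the first x86 processor to implement SSE2,
	// so this is the weakest .arch setting under which the assembler
	// will accept the vector instructions below.)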
	.text

FUNC(chacha_core_x86ish_sse2)

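	// A hedged sketch of the calling convention, inferred from the
	// argument collection below (parameter names and types here are
	// hypothetical; the authoritative prototype lives in the C
	// sources):
	//
	//	void chacha_core_x86ish_sse2(unsigned nr,	// round count
	//				     const void *src,	// input matrix
	//				     void *dest);	// output matrix
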
	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected. We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack. I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value. Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [esp]

	pushreg	ebp
	setfp	ebp
	sub	esp, 16
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
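
	// Masking off the low four bits aligns esp down to a 16-byte
	// boundary, which the movdqa access to SAVE3 needs; the argument
	// fetches are unaffected because they go through ebp, which
	// still points into the original frame.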
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice. We have plenty of XMM registers, and the arguments
	// are in useful places. There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space. We
	// only need to save a copy of the input for the feedforward at the
	// end, so we might as well use memory rather than spill extra
	// registers. (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [rsp + 0]
# define SAVE2 [rsp + 16]
# define SAVE3 [rsp + 32]

	stalloc	48 + 8
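
	// (On entry, the return address pushed by our caller leaves rsp
	// only 8-byte aligned; the extra 8 bytes restore the 16-byte
	// alignment that the movdqa spills to SAVE1..SAVE3 rely on.)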
#endif

	endprologue

	// First job is to slurp the matrix into XMM registers. Be careful:
	// the input matrix isn't likely to be properly aligned.
	//
	//	[ 0  1  2  3]		(a, xmm0)
	//	[ 4  5  6  7]		(b, xmm1)
	//	[ 8  9 10 11]		(c, xmm2)
	//	[12 13 14 15]		(d, xmm3)
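	//
	// For reference, a single ChaCha quarterround on (a, b, c, d)
	// is, in C-ish pseudocode (illustrative only),
	//
	//	a += b; d ^= a; d <<<= 16;
	//	c += d; b ^= c; b <<<= 12;
	//	a += b; d ^= a; d <<<=  8;
	//	c += d; b ^= c; b <<<=  7;
	//
	// and the code below runs four quarterrounds in parallel, one in
	// each 32-bit lane of the vectors.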
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later. This one is aligned properly, by
	// construction.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.
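	//
	// Each rotate instead uses the standard identity (a C-style
	// sketch, illustrative only)
	//
	//	x <<<= n    ==    x = (x << n) | (x >> (32 - n))
	//
	// at a cost of a copy, two shifts, and an OR per rotate.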

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily. It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3]	(a, xmm0)
	//	[ 4  5  6  7]	-->	[ 5  6  7  4]	(b, xmm1)
	//	[ 8  9 10 11]		[10 11  8  9]	(c, xmm2)
	//	[12 13 14 15]		[15 12 13 14]	(d, xmm3)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards. The remaining one can't be moved, though.
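	//
	// (As used here, SHUF(d, c, b, a) evidently builds a pshufd
	// immediate selecting source lanes d, c, b, a for destination
	// lanes 3, 2, 1, 0 respectively: e.g., SHUF(0, 3, 2, 1) sends
	// [ 4  5  6  7] to [ 5  6  7  4], as needed for row b.)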
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround. Again, most of this was done above so we don't have
	// to wait for the shuffles.
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)

	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
	ja	0b
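
	// (NR counts single rounds, and each pass round the loop is a
	// doubleround, hence the decrement by 2: ChaCha20, for example,
	// would pass nr = 20 and go round ten times.)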

	// Almost there. Firstly, the feedforward addition.
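	// (Without this, the core would compute a bare permutation,
	// which anyone could invert by running the doublerounds
	// backwards; adding the original input back in is what makes the
	// output one-way.)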
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3

	// And now we write out the result. This one won't be aligned
	// either.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------