x86ish *.S: Use `stalloc' consistently to allocate space on the stack.
[catacomb] / symm / chacha-x86ish-sse2.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_x86ish_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// drop through...
ENDFUNC

	.arch	pentium4

FUNC(chacha_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [SP]

	pushreg	BP
	setfp
	stalloc	16
	mov	IN, [BP + 12]
	mov	OUT, [BP + 16]
	and	SP, ~15
	mov	NR, [BP + 8]
#endif
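
	// (A sketch of the alignment dance above in C-flavoured pseudocode;
	// illustrative only, not part of the build, and it assumes the usual
	// downward-growing stack.)
	//
	//	sp -= 16;	/* stalloc 16: room for the SAVE3 slot */
	//	sp &= ~15;	/* and SP, ~15: round down to a 16-byte line */
	//
	// After the masking, [SP] is 16-byte aligned, so `movdqa' on SAVE3
	// is safe; BP still addresses the arguments and lets us restore the
	// old SP on the way out.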

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  We
	// only need to save a copy of the input for the feedforward at the
	// end, so we might as well use memory rather than spill extra
	// registers.  (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [SP + 0]
# define SAVE2 [SP + 16]
# define SAVE3 [SP + 32]

	stalloc	48 + 8
#endif
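
	// (A sketch of the arithmetic, assuming the standard Win64 entry
	// state; illustrative only.  The `call' pushed an 8-byte return
	// address, so RSP == 8 (mod 16) on entry; allocating 48 + 8 = 56
	// bytes makes RSP == 0 (mod 16) again, so the SAVE1..SAVE3 slots at
	// [SP + 0], [SP + 16] and [SP + 32] are properly aligned for
	// `movdqa'.)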

	endprologue

	// First job is to slurp the matrix into XMM registers.  Be careful:
	// the input matrix isn't likely to be properly aligned.
	//
	//	[ 0  1  2  3]		(a, xmm0)
	//	[ 4  5  6  7]		(b, xmm1)
	//	[ 8  9 10 11]		(c, xmm2)
	//	[12 13 14 15]		(d, xmm3)
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]
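
	// (Illustrative C equivalent of the loads above, using the SSE2
	// intrinsics from <emmintrin.h>; not part of the build.  `in' stands
	// for the caller's byte pointer, which may be unaligned, hence the
	// `loadu' form matching `movdqu'.)
	//
	//	__m128i a = _mm_loadu_si128((const __m128i *)(in +  0));
	//	__m128i b = _mm_loadu_si128((const __m128i *)(in + 16));
	//	__m128i c = _mm_loadu_si128((const __m128i *)(in + 32));
	//	__m128i d = _mm_loadu_si128((const __m128i *)(in + 48));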

	// Take a copy for later.  This one is aligned properly, by
	// construction.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4
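
	// (Illustrative sketch of the synthesized rotate as a C function
	// with SSE2 intrinsics; the name `rotl32' is made up, and this is
	// not part of the build.)
	//
	//	static __m128i rotl32(__m128i x, int n)
	//	{
	//		/* No packed doubleword rotate in SSE2: shift each
	//		 * lane left and right, then OR the halves back
	//		 * together, just like the pslld/psrld/por trio. */
	//		return _mm_or_si128(_mm_slli_epi32(x, n),
	//				    _mm_srli_epi32(x, 32 - n));
	//	}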

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily.  It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3]	(a, xmm0)
	//	[ 4  5  6  7]	-->	[ 5  6  7  4]	(b, xmm1)
	//	[ 8  9 10 11]		[10 11  8  9]	(c, xmm2)
	//	[12 13 14 15]		[15 12 13 14]	(d, xmm3)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards.  The remaining one can't be moved, though.
	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
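
	// (Illustrative C equivalent of the three row-rotations, with SSE2
	// intrinsics; not part of the build.  Note that _MM_SHUFFLE takes
	// its lane indices high-lane-first, whereas the SHUF macro used
	// above appears to take them low-lane-first.)
	//
	//	b = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 3, 2, 1)); // [ 5  6  7  4]
	//	c = _mm_shuffle_epi32(c, _MM_SHUFFLE(1, 0, 3, 2)); // [10 11  8  9]
	//	d = _mm_shuffle_epi32(d, _MM_SHUFFLE(2, 1, 0, 3)); // [15 12 13 14]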

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the shuffles.
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)

	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
	ja	0b
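
	// (Illustrative only, not part of the build: each loop iteration
	// performs one column round and one diagonal round -- a doubleround
	// -- which is why NR drops by two.  In scalar C, on uint32_t values,
	// the quarterround each SIMD lane computes is:)
	//
	//	a += b; d ^= a; d = (d << 16) | (d >> 16);
	//	c += d; b ^= c; b = (b << 12) | (b >> 20);
	//	a += b; d ^= a; d = (d <<  8) | (d >> 24);
	//	c += d; b ^= c; b = (b <<  7) | (b >> 25);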

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3

	// And now we write out the result.  This one won't be aligned
	// either.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3
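
	// (Illustrative C equivalent of the feedforward and store, with
	// SSE2 intrinsics; not part of the build.  `a0' .. `d0' stand for
	// the saved input rows and `out' for the caller's output pointer,
	// again possibly unaligned, hence `storeu'.)
	//
	//	a = _mm_add_epi32(a, a0);  b = _mm_add_epi32(b, b0);
	//	c = _mm_add_epi32(c, c0);  d = _mm_add_epi32(d, d0);
	//	_mm_storeu_si128((__m128i *)(out +  0), a);
	//	_mm_storeu_si128((__m128i *)(out + 16), b);
	//	_mm_storeu_si128((__m128i *)(out + 32), c);
	//	_mm_storeu_si128((__m128i *)(out + 48), d);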

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------