mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Fancy SIMD implementation of ChaCha
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.text
	34
	35	///--------------------------------------------------------------------------
	36	/// Main code.
	37
	38	FUNC(chacha_core_x86ish_avx)
		// AVX entry point. It contains no ChaCha code of its own: it
		// issues `vzeroupper' and then runs straight on into
		// chacha_core_x86ish_sse2, which must therefore follow this
		// function immediately, with nothing in between that emits code.
		// NOTE(review): presumably the point is to clear the upper halves
		// of the vector registers before the legacy-SSE code below runs;
		// confirm against the code which selects this entry point.
		//
		// `.arch .avx' is needed so that the assembler will accept
		// `vzeroupper' at all.
	39	.arch .avx
	40	vzeroupper
	41	endprologue
	42	// drop through into chacha_core_x86ish_sse2...
	43	ENDFUNC
	44
		// Restrict the assembler to the Pentium 4 instruction set again
		// for the code which follows.
	45	.arch pentium4
	46
	47	FUNC(chacha_core_x86ish_sse2)
		// Compute the ChaCha core function using SSE2 instructions.
		//
		// Three arguments arrive, in this order: the round count (NR), a
		// pointer to the 64-byte input matrix (IN), and a pointer to the
		// 64-byte output buffer (OUT). Neither pointer needs to be
		// aligned: both are accessed with `movdqu'.
		//
		// Each pass through the main loop performs one doubleround (a
		// column round followed by a diagonal round) and subtracts 2 from
		// NR. The loop test is at the bottom, so at least one doubleround
		// is always performed. The output is the final state plus the
		// original input, added as packed 32-bit words.
		//
		// Register use: xmm0--xmm3 hold the four rows a, b, c, d of the
		// state; xmm4 is scratch for synthesizing rotations; SAVE0--SAVE3
		// (registers or stack slots, depending on the target) hold the
		// copy of the input needed for the feedforward.
		//
		// This function is also entered by falling through from
		// chacha_core_x86ish_avx above.
	48
	49	// Initial setup.
	50
	51	#if CPUFAM_X86
	52	// Arguments come in on the stack, and will need to be collected. We
	53	// can get away with just the scratch registers for integer work, but
	54	// we'll run out of XMM registers and will need some properly aligned
	55	// space which we'll steal from the stack. I don't trust the stack
	56	// pointer's alignment, so I'll have to mask the stack pointer, which
	57	// in turn means I'll need to keep track of the old value. Hence I'm
	58	// making a full i386-style stack frame here.
	59	//
	60	// The Windows and SysV ABIs are sufficiently similar that we don't
	61	// need to worry about the differences here.
	62
	63	# define NR ecx
	64	# define IN eax
	65	# define OUT edx
	66	# define SAVE0 xmm5
	67	# define SAVE1 xmm6
	68	# define SAVE2 xmm7
	69	# define SAVE3 [SP]
	70
	71	pushreg BP
	72	setfp
	73	stalloc 16
	74	mov IN, [BP + 12]
	75	mov OUT, [BP + 16]
		// Round SP down to a 16-byte boundary, so that SAVE3 = [SP] is a
		// valid operand for `movdqa'. The arguments are addressed through
		// BP, so they remain reachable after SP has been masked.
	76	and SP, ~15
	77	mov NR, [BP + 8]
	78	#endif
	79
	80	#if CPUFAM_AMD64 && ABI_SYSV
	81	// This is nice. We have plenty of XMM registers, and the arguments
	82	// are in useful places. There's no need to spill anything and we
	83	// can just get on with the code.
	84
	85	# define NR edi
	86	# define IN rsi
	87	# define OUT rdx
	88	# define SAVE0 xmm5
	89	# define SAVE1 xmm6
	90	# define SAVE2 xmm7
	91	# define SAVE3 xmm8
	92	#endif
	93
	94	#if CPUFAM_AMD64 && ABI_WIN
	95	// Arguments come in registers, but they're different between Windows
	96	// and everyone else (and everyone else is saner).
	97	//
	98	// The Windows ABI insists that we preserve some of the XMM
	99	// registers, but we want more than we can use as scratch space. We
	100	// only need to save a copy of the input for the feedforward at the
	101	// end, so we might as well use memory rather than spill extra
	102	// registers. (We need an extra 8 bytes to align the stack.)
	103
	104	# define NR ecx
	105	# define IN rdx
	106	# define OUT r8
	107	# define SAVE0 xmm5
	108	# define SAVE1 [SP + 0]
	109	# define SAVE2 [SP + 16]
	110	# define SAVE3 [SP + 32]
	111
		// Three 16-byte save slots, plus 8 bytes of padding for alignment.
	112	stalloc 48 + 8
	113	#endif
	114
	115	endprologue
	116
	117	// First job is to slurp the matrix into XMM registers. Be careful:
	118	// the input matrix isn't likely to be properly aligned.
	119	//
	120	// [ 0  1  2  3] (a, xmm0)
	121	// [ 4  5  6  7] (b, xmm1)
	122	// [ 8  9 10 11] (c, xmm2)
	123	// [12 13 14 15] (d, xmm3)
	124	movdqu xmm0, [IN + 0]
	125	movdqu xmm1, [IN + 16]
	126	movdqu xmm2, [IN + 32]
	127	movdqu xmm3, [IN + 48]
	128
	129	// Take a copy for later. This one is aligned properly, by
	130	// construction: each SAVEn is either a register or a stack slot
		// which the prologue above arranged to be 16-byte aligned.
	131	movdqa SAVE0, xmm0
	132	movdqa SAVE1, xmm1
	133	movdqa SAVE2, xmm2
	134	movdqa SAVE3, xmm3
	135
		// Top of the main loop: one doubleround per iteration.
	136	0:
	137	// Apply a column quarterround to each of the columns simultaneously.
	138	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	139	// have to synthesize it: a left-rotate by n is built as
		// (x << n) | (x >> (32 - n)), using xmm4 as the temporary.
	140
	141	// a += b; d ^= a; d <<<= 16
	142	paddd xmm0, xmm1
	143	pxor xmm3, xmm0
	144	movdqa xmm4, xmm3
	145	pslld xmm3, 16
	146	psrld xmm4, 16
	147	por xmm3, xmm4
	148
	149	// c += d; b ^= c; b <<<= 12
	150	paddd xmm2, xmm3
	151	pxor xmm1, xmm2
	152	movdqa xmm4, xmm1
	153	pslld xmm1, 12
	154	psrld xmm4, 20
	155	por xmm1, xmm4
	156
	157	// a += b; d ^= a; d <<<= 8
	158	paddd xmm0, xmm1
	159	pxor xmm3, xmm0
	160	movdqa xmm4, xmm3
	161	pslld xmm3, 8
	162	psrld xmm4, 24
	163	por xmm3, xmm4
	164
	165	// c += d; b ^= c; b <<<= 7
		//
		// The shuffles of d and c are interleaved here: each is issued
		// immediately after the last use of that row in this round.
	166	paddd xmm2, xmm3
	167	pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
	168	pxor xmm1, xmm2
	169	pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
	170	movdqa xmm4, xmm1
	171	pslld xmm1, 7
	172	psrld xmm4, 25
	173	por xmm1, xmm4
	174
	175	// The not-quite-transpose conveniently only involves reordering
	176	// elements of individual rows, which can be done quite easily. It
	177	// doesn't involve any movement of elements between rows, or even
	178	// renaming of the rows.
	179	//
	180	// [ 0  1  2  3]      [ 0  1  2  3] (a, xmm0)
	181	// [ 4  5  6  7]  --> [ 5  6  7  4] (b, xmm1)
	182	// [ 8  9 10 11]      [10 11  8  9] (c, xmm2)
	183	// [12 13 14 15]      [15 12 13 14] (d, xmm3)
	184	//
	185	// The shuffles have quite high latency, so they've mostly been
	186	// pushed upwards. The remaining one can't be moved, though.
	187	pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
	188
	189	// Apply the diagonal quarterround to each of the columns
	190	// simultaneously. (After the reordering above, the diagonals of
		// the original matrix are lined up as columns.)
	191
	192	// a += b; d ^= a; d <<<= 16
	193	paddd xmm0, xmm1
	194	pxor xmm3, xmm0
	195	movdqa xmm4, xmm3
	196	pslld xmm3, 16
	197	psrld xmm4, 16
	198	por xmm3, xmm4
	199
	200	// c += d; b ^= c; b <<<= 12
	201	paddd xmm2, xmm3
	202	pxor xmm1, xmm2
	203	movdqa xmm4, xmm1
	204	pslld xmm1, 12
	205	psrld xmm4, 20
	206	por xmm1, xmm4
	207
	208	// a += b; d ^= a; d <<<= 8
	209	paddd xmm0, xmm1
	210	pxor xmm3, xmm0
	211	movdqa xmm4, xmm3
	212	pslld xmm3, 8
	213	psrld xmm4, 24
	214	por xmm3, xmm4
	215
	216	// c += d; b ^= c; b <<<= 7
		//
		// As before, the shuffles of d and c are interleaved; this time
		// they are the inverses of the ones applied in the column round.
	217	paddd xmm2, xmm3
	218	pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
	219	pxor xmm1, xmm2
	220	pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
	221	movdqa xmm4, xmm1
	222	pslld xmm1, 7
	223	psrld xmm4, 25
	224	por xmm1, xmm4
	225
	226	// Finally, finish off undoing the transpose, and we're done for this
	227	// doubleround. Again, most of this was done above so we don't have
	228	// to wait for the shuffles.
	229	pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
	230
	231	// Decrement the loop counter and see if we should go round again.
		// `ja' branches only if the subtraction neither borrowed nor left
		// zero, so the loop also terminates if NR was odd.
	232	sub NR, 2
	233	ja 0b
	234
	235	// Almost there. Firstly, the feedforward addition: add the saved
		// copy of the input back into the state, word by word.
	236	paddd xmm0, SAVE0
	237	paddd xmm1, SAVE1
	238	paddd xmm2, SAVE2
	239	paddd xmm3, SAVE3
	240
	241	// And now we write out the result. This one won't be aligned
	242	// either.
	243	movdqu [OUT + 0], xmm0
	244	movdqu [OUT + 16], xmm1
	245	movdqu [OUT + 32], xmm2
	246	movdqu [OUT + 48], xmm3
	247
	248	// Tidy things up: undo whatever stack adjustment the prologue made
		// for this target. (The SysV AMD64 variant made none.)
	249	#if CPUFAM_X86
		// NOTE(review): `dropfp' presumably restores SP from BP, discarding
		// the masked stack pointer; the macro is defined outside this file.
	250	dropfp
	251	popreg BP
	252	#endif
	253	#if CPUFAM_AMD64 && ABI_WIN
	254	stfree 48 + 8
	255	#endif
	256
	257	// And with that, we're done.
	258	ret
	259
	260	ENDFUNC
	261
	262	///----- That's all, folks --------------------------------------------------