mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Fancy SIMD implementation of Salsa20
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// Local utilities.
	35
	36	// Magic constants for shuffling.
		//
		// Each one is an immediate operand for pshufd: bits 2i+1:2i of the
		// immediate select which source doubleword lands in destination
		// position i. Writing a register as [e0 e1 e2 e3], lowest
		// doubleword first, the effect of each constant is:
		//
		//   ROTL: [e0 e1 e2 e3] -> [e3 e0 e1 e2]
		//   ROT2: [e0 e1 e2 e3] -> [e2 e3 e0 e1]
		//   ROTR: [e0 e1 e2 e3] -> [e1 e2 e3 e0]
	37	#define ROTL 0x93
	38	#define ROT2 0x4e
	39	#define ROTR 0x39
	40
	41	///--------------------------------------------------------------------------
	42	/// Main code.
	43
		// The function below uses SSE2 instructions on the xmm registers
		// (movdqa, movdqu, pshufd, paddd, ...), so ask the assembler to
		// accept the Pentium 4 instruction set before emitting any code.
	44	.arch pentium4
	45	.section .text
	46
	47	FUNC(salsa20_core_x86_sse2)
	48
		// Salsa20 core, SSE2 version: apply the requested number of rounds
		// to a 16-word matrix, add the original input back in, and store
		// the result. The input is read in the permuted word order
		// described further down; the output is written in the textbook
		// order.
		//
		// Nothing here requires the input or output buffers to be aligned:
		// the input is read with movdqu, and the output is written one
		// doubleword at a time with movd.
		//
		// Registers written: ecx, edx, xmm0--xmm7 and the flags. ebp and
		// esp are restored before returning.
		//
	49	// Initial state. We have three arguments:
	50	// [ebp + 8] is the number of rounds to do
	51	// [ebp + 12] points to the input matrix
	52	// [ebp + 16] points to the output matrix
		//
		// The 32 bytes reserved below hold the saved copies of rows a and
		// b. esp is rounded down to a multiple of 16 so that they can be
		// accessed with movdqa (and as a paddd memory operand), both of
		// which need 16-byte-aligned addresses. ebp keeps the original
		// stack pointer for the epilogue.
	53	push ebp
	54	mov ebp, esp
	55	sub esp, 32
	56	mov edx, [ebp + 12]
	57	and esp, ~15
	58
	59	// Prepare for the main loop.
		// ecx counts the rounds still to do; it drops by two per iteration.
	60	mov ecx, [ebp + 8]
	61
	62	// First job is to slurp the matrix into XMM registers. The words
	63	// have already been permuted conveniently to make them line up
	64	// better for SIMD processing.
	65	//
	66	// The textbook arrangement of the matrix is this.
	67	//
	68	// [C K K K]
	69	// [K C N N]
	70	// [T T C K]
	71	// [K K K C]
	72	//
	73	// But we've rotated the columns up so that the main diagonal with
	74	// the constants on it ends up in the first row, giving something more
	75	// like
	76	//
	77	// [C C C C]
	78	// [K T K K]
	79	// [T K K N]
	80	// [K K N K]
	81	//
	82	// so the transformation looks like this:
	83	//
	84	// [ 0  1  2  3]      [ 0  5 10 15]  (a, xmm0)
	85	// [ 4  5  6  7]  --> [ 4  9 14  3]  (b, xmm1)
	86	// [ 8  9 10 11]      [ 8 13  2  7]  (c, xmm2)
	87	// [12 13 14 15]      [12  1  6 11]  (d, xmm3)
	88	movdqu xmm0, [edx + 0]
	89	movdqu xmm1, [edx + 16]
	90	movdqu xmm2, [edx + 32]
	91	movdqu xmm3, [edx + 48]
	92
	93	// Take a copy for later.
		// Rows a and b go to the aligned stack scratch; rows c and d are
		// parked in xmm6 and xmm7, which the main loop leaves alone. These
		// four copies supply the feedforward addition at the end.
	94	movdqa [esp + 0], xmm0
	95	movdqa [esp + 16], xmm1
	96	movdqa xmm6, xmm2
	97	movdqa xmm7, xmm3
	98
	99	loop:
	100
	101	// Apply a column quarterround to each of the columns simultaneously.
	102	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	103	// have to synthesize it.
		//
		// Each rotation x <<< n is built as (x << n) | (x >> (32 - n)),
		// with xmm4 and xmm5 as scratch registers.
	104
	105	// b ^= (a + d) <<< 7
	106	movdqa xmm4, xmm0
	107	paddd xmm4, xmm3
	108	movdqa xmm5, xmm4
	109	pslld xmm4, 7
	110	psrld xmm5, 25
	111	por xmm4, xmm5
	112	pxor xmm1, xmm4
	113
	114	// c ^= (b + a) <<< 9
	115	movdqa xmm4, xmm1
	116	paddd xmm4, xmm0
	117	movdqa xmm5, xmm4
	118	pslld xmm4, 9
	119	psrld xmm5, 23
	120	por xmm4, xmm5
	121	pxor xmm2, xmm4
	122
	123	// d ^= (c + b) <<< 13
	124	movdqa xmm4, xmm2
	125	paddd xmm4, xmm1
		// b has now been read for the last time in this column round, so
		// start shuffling it into the layout the row round wants (where
		// xmm1 plays the part of d).
	126	pshufd xmm1, xmm1, ROTL
	127	movdqa xmm5, xmm4
	128	pslld xmm4, 13
	129	psrld xmm5, 19
	130	por xmm4, xmm5
	131	pxor xmm3, xmm4
	132
	133	// a ^= (d + c) <<< 18
	134	movdqa xmm4, xmm3
		// d is safely copied into xmm4 and c is about to be added in, so
		// both can be shuffled for the row round while the rotation is
		// still being computed.
	135	pshufd xmm3, xmm3, ROTR
	136	paddd xmm4, xmm2
	137	pshufd xmm2, xmm2, ROT2
	138	movdqa xmm5, xmm4
	139	pslld xmm4, 18
	140	psrld xmm5, 14
	141	por xmm4, xmm5
	142	pxor xmm0, xmm4
	143
	144	// The transpose conveniently only involves reordering elements of
	145	// individual rows, which can be done quite easily, and reordering
	146	// the rows themselves, which is a trivial renaming. It doesn't
	147	// involve any movement of elements between rows.
	148	//
	149	// [ 0  5 10 15]      [ 0  5 10 15]  (a, xmm0)
	150	// [ 4  9 14  3]  --> [ 1  6 11 12]  (b, xmm3)
	151	// [ 8 13  2  7]      [ 2  7  8 13]  (c, xmm2)
	152	// [12  1  6 11]      [ 3  4  9 14]  (d, xmm1)
	153	//
	154	// The shuffles have quite high latency, so they've been pushed
	155	// backwards into the main instruction list.
	156
	157	// Apply the row quarterround to each of the columns (yes!)
	158	// simultaneously.
		//
		// Note the renaming: in this half b lives in xmm3 and d in xmm1.
	159
	160	// b ^= (a + d) <<< 7
	161	movdqa xmm4, xmm0
	162	paddd xmm4, xmm1
	163	movdqa xmm5, xmm4
	164	pslld xmm4, 7
	165	psrld xmm5, 25
	166	por xmm4, xmm5
	167	pxor xmm3, xmm4
	168
	169	// c ^= (b + a) <<< 9
	170	movdqa xmm4, xmm3
	171	paddd xmm4, xmm0
	172	movdqa xmm5, xmm4
	173	pslld xmm4, 9
	174	psrld xmm5, 23
	175	por xmm4, xmm5
	176	pxor xmm2, xmm4
	177
	178	// d ^= (c + b) <<< 13
	179	movdqa xmm4, xmm2
	180	paddd xmm4, xmm3
		// xmm3 has been read for the last time in this row round: start
		// putting it back into the column-round layout.
	181	pshufd xmm3, xmm3, ROTL
	182	movdqa xmm5, xmm4
	183	pslld xmm4, 13
	184	psrld xmm5, 19
	185	por xmm4, xmm5
	186	pxor xmm1, xmm4
	187
	188	// a ^= (d + c) <<< 18
	189	movdqa xmm4, xmm1
		// As in the column round, restore xmm1 and xmm2 to the
		// column-round layout as soon as their values have been consumed.
	190	pshufd xmm1, xmm1, ROTR
	191	paddd xmm4, xmm2
	192	pshufd xmm2, xmm2, ROT2
	193	movdqa xmm5, xmm4
	194	pslld xmm4, 18
	195	psrld xmm5, 14
	196	por xmm4, xmm5
	197	pxor xmm0, xmm4
	198
	199	// We had to undo the transpose ready for the next loop. Again, push
	200	// back the shuffles because they take a long time coming through.
	201	// Decrement the loop counter and see if we should go round again.
	202	// Later processors fuse this pair into a single uop.
		//
		// The branch is taken only if the subtraction neither borrowed nor
		// reached zero. So the body always runs at least once, and an odd
		// round count is rounded up to the next even number.
	203	sub ecx, 2
	204	ja loop
	205
	206	// Almost there. Firstly, the feedforward addition, and then we have
	207	// to write out the result. Here we have to undo the permutation
	208	// which was already applied to the input. Shuffling has quite high
	209	// latency, so arrange to start a new shuffle into a temporary as
	210	// soon as we've written out the old value.
	211	mov edx, [ebp + 16]
	212
		// For each row: add back the saved input, queue up the shuffle that
		// brings the next wanted word down to the low doubleword, and store
		// the word which is already there (movd stores only the low 32
		// bits). This first pass writes words 0, 4, 8 and 12.
	213	paddd xmm0, [esp + 0]
	214	pshufd xmm4, xmm0, ROTR
	215	movd [edx + 0], xmm0
	216
	217	paddd xmm1, [esp + 16]
	218	pshufd xmm5, xmm1, ROTL
	219	movd [edx + 16], xmm1
	220
	221	paddd xmm2, xmm6
	222	pshufd xmm6, xmm2, ROT2
	223	movd [edx + 32], xmm2
	224
	225	paddd xmm3, xmm7
	226	pshufd xmm7, xmm3, ROTR
	227	movd [edx + 48], xmm3
	228
		// Row d holds words 12, 1, 6, 11: store 1, 6 and 11.
	229	movd [edx + 4], xmm7
	230	pshufd xmm7, xmm3, ROT2
	231	movd [edx + 24], xmm7
	232	pshufd xmm3, xmm3, ROTL
	233	movd [edx + 44], xmm3
	234
		// Row c holds words 8, 13, 2, 7: store 2, 7 and 13.
	235	movd [edx + 8], xmm6
	236	pshufd xmm6, xmm2, ROTL
	237	movd [edx + 28], xmm6
	238	pshufd xmm2, xmm2, ROTR
	239	movd [edx + 52], xmm2
	240
		// Row b holds words 4, 9, 14, 3: store 3, 9 and 14.
	241	movd [edx + 12], xmm5
	242	pshufd xmm5, xmm1, ROTR
	243	movd [edx + 36], xmm5
	244	pshufd xmm1, xmm1, ROT2
	245	movd [edx + 56], xmm1
	246
		// Row a holds words 0, 5, 10, 15: store 5, 10 and 15.
	247	movd [edx + 20], xmm4
	248	pshufd xmm4, xmm0, ROT2
	249	movd [edx + 40], xmm4
	250	pshufd xmm0, xmm0, ROTL
	251	movd [edx + 60], xmm0
	252
	253	// Tidy things up.
		// Reloading esp from ebp discards the scratch area and the
		// alignment padding in one go.
	254	mov esp, ebp
	255	pop ebp
	256
	257	// And with that, we're done.
	258	ret
	259
	260	ENDFUNC
	261
	262	///----- That's all, folks --------------------------------------------------