[catacomb] / symm / salsa20-x86-sse2.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Local utilities.

// Magic constants for shuffling.  These are immediate operands for pshufd:
// reading the byte from its least significant end, each two-bit field
// selects the source lane which is copied into destination lane 0, 1, 2, 3
// in turn.  Writing the lanes of a register lowest first, the effect of
// each constant is as follows.
//
//	ROTL	[x0 x1 x2 x3] --> [x3 x0 x1 x2]
//	ROT2	[x0 x1 x2 x3] --> [x2 x3 x0 x1]
//	ROTR	[x0 x1 x2 x3] --> [x1 x2 x3 x0]
#define ROTL 0x93
#define ROT2 0x4e
#define ROTR 0x39

///--------------------------------------------------------------------------
/// Main code.

	// Tell the assembler which instruction set to accept (the code below
	// uses SSE2 instructions), and assemble into the text section.
	.arch pentium4
	.section .text

FUNC(salsa20_core_x86_sse2)

	// Apply the Salsa20 core to a matrix of sixteen 32-bit words, four
	// columns at a time, using SSE2.
	//
	// Initial state.  We have three arguments, all on the stack:
	// [ebp +  8] is the number of rounds to do
	// [ebp + 12] points to the input matrix
	// [ebp + 16] points to the output matrix
	//
	// Rounds are done in pairs (a column round followed by a row round),
	// and the first pair is done before the count is examined: a count
	// of zero therefore behaves like two, and an odd count is rounded up
	// to the next even number.
	//
	// The input is read with unaligned loads and the output is written a
	// word at a time, so neither pointer needs any particular alignment.
	// The whole input is read before any of the output is written, so
	// the two matrices may share storage.
	//
	// Register use: ecx counts rounds; edx holds the input pointer and
	// later the output pointer; xmm0--xmm3 hold the working rows a--d;
	// xmm4 and xmm5 are scratch; xmm6 and xmm7, together with 32 bytes
	// of 16-byte-aligned stack, keep a copy of the input for the final
	// feedforward addition.  The flags are clobbered too.  ebp and esp
	// are restored before returning.
	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	edx, [ebp + 12]
	and	esp, ~15		// align the scratch area for movdqa

	// Prepare for the main loop.
	mov	ecx, [ebp + 8]

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
	movdqu	xmm0, [edx +  0]
	movdqu	xmm1, [edx + 16]
	movdqu	xmm2, [edx + 32]
	movdqu	xmm3, [edx + 48]

	// Take a copy for later.  Rows a and b go in the aligned stack
	// scratch area; rows c and d fit in the two spare registers.
	movdqa	[esp +  0], xmm0
	movdqa	[esp + 16], xmm1
	movdqa	xmm6, xmm2
	movdqa	xmm7, xmm3

	// Top of the main loop: each pass does one column round and one row
	// round.  This is a numeric local label so that it stays out of the
	// object file's symbol table.
0:

	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it from a pair of shifts and an OR.

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, ROTL	// b is finished with: start transposing
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, ROTR	// d has been copied: transpose it
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, ROT2	// c has been added in: transpose it
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.  Note that b and d have swapped registers.

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, ROTL	// b is finished with: start untransposing
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, ROTR	// d has been copied: untranspose it
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, ROT2	// c has been added in: untranspose it
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We have to undo the transpose ready for the next iteration.
	// Again, the shuffles were pushed back into the instructions above
	// because they take a long time coming through.  Decrement the loop
	// counter and see if we should go round again: the unsigned test
	// stops the loop when the count reaches zero or would go below it.
	// Later processors fuse this pair into a single uop.
	sub	ecx, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition, and then we have
	// to write out the result.  Here we have to undo the permutation
	// which was already applied to the input.  Shuffling has quite high
	// latency, so arrange to start a new shuffle into a temporary as
	// soon as we've written out the old value.  Each store writes only
	// the lowest word of its source register; the trailing comments give
	// the matrix word being written.
	mov	edx, [ebp + 16]

	paddd	xmm0, [esp +  0]
	pshufd	xmm4, xmm0, ROTR
	movd	[edx +  0], xmm0	// word  0

	paddd	xmm1, [esp + 16]
	pshufd	xmm5, xmm1, ROTL
	movd	[edx + 16], xmm1	// word  4

	paddd	xmm2, xmm6
	pshufd	xmm6, xmm2, ROT2
	movd	[edx + 32], xmm2	// word  8

	paddd	xmm3, xmm7
	pshufd	xmm7, xmm3, ROTR
	movd	[edx + 48], xmm3	// word 12

	movd	[edx +  4], xmm7	// word  1
	pshufd	xmm7, xmm3, ROT2
	movd	[edx + 24], xmm7	// word  6
	pshufd	xmm3, xmm3, ROTL
	movd	[edx + 44], xmm3	// word 11

	movd	[edx +  8], xmm6	// word  2
	pshufd	xmm6, xmm2, ROTL
	movd	[edx + 28], xmm6	// word  7
	pshufd	xmm2, xmm2, ROTR
	movd	[edx + 52], xmm2	// word 13

	movd	[edx + 12], xmm5	// word  3
	pshufd	xmm5, xmm1, ROTR
	movd	[edx + 36], xmm5	// word  9
	pshufd	xmm1, xmm1, ROT2
	movd	[edx + 56], xmm1	// word 14

	movd	[edx + 20], xmm4	// word  5
	pshufd	xmm4, xmm0, ROT2
	movd	[edx + 40], xmm4	// word 10
	pshufd	xmm0, xmm0, ROTL
	movd	[edx + 60], xmm0	// word 15

	// Tidy things up: drop the aligned scratch area and the frame.
	mov	esp, ebp
	pop	ebp

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
1a0c09c4 MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// Fancy SIMD implementation of Salsa20
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
47103664 MW	34	/// Local utilities.
	35
	36	// Magic constants for shuffling.
	37	#define ROTL 0x93
	38	#define ROT2 0x4e
	39	#define ROTR 0x39
	40
	41	///--------------------------------------------------------------------------
1a0c09c4 MW	42	/// Main code.
	43
	44	.arch pentium4
	45	.section .text
	46
	47	FUNC(salsa20_core_x86_sse2)
	48
	49	// Initial state. We have three arguments:
	50	// [ebp + 8] is the number of rounds to do
	51	// [ebp + 12] points to the input matrix
	52	// [ebp + 16] points to the output matrix
	53	push ebp
	54	mov ebp, esp
	55	sub esp, 32
	56	mov edx, [ebp + 12]
	57	and esp, ~15
	58
	59	// Prepare for the main loop.
	60	mov ecx, [ebp + 8]
	61
	62	// First job is to slurp the matrix into XMM registers. The words
	63	// have already been permuted conveniently to make them line up
	64	// better for SIMD processing.
	65	//
	66	// The textbook arrangement of the matrix is this.
	67	//
	68	// [C K K K]
	69	// [K C N N]
	70	// [T T C K]
	71	// [K K K C]
	72	//
	73	// But we've rotated the columns up so that the main diagonal with
	74	// the constants on it end up in the first row, giving something more
	75	// like
	76	//
	77	// [C C C C]
	78	// [K T K K]
	79	// [T K K N]
	80	// [K K N K]
	81	//
	82	// so the transformation looks like this:
	83	//
	84	// [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
	85	// [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
	86	// [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
	87	// [12 13 14 15] [12 1 6 11] (d, xmm3)
	88	movdqu xmm0, [edx + 0]
	89	movdqu xmm1, [edx + 16]
	90	movdqu xmm2, [edx + 32]
	91	movdqu xmm3, [edx + 48]
	92
	93	// Take a copy for later.
	94	movdqa [esp + 0], xmm0
	95	movdqa [esp + 16], xmm1
	96	movdqa xmm6, xmm2
	97	movdqa xmm7, xmm3
	98
	99	loop:
	100
	101	// Apply a column quarterround to each of the columns simultaneously.
	102	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	103	// have to synthesize it.
	104
	105	// b ^= (a + d) <<< 7
106	movdqa xmm4, xmm0
107	paddd xmm4, xmm3
108	movdqa xmm5, xmm4
109	pslld xmm4, 7
110	psrld xmm5, 25
111	por xmm4, xmm5
112	pxor xmm1, xmm4
113
114	// c ^= (b + a) <<< 9
115	movdqa xmm4, xmm1
116	paddd xmm4, xmm0
117	movdqa xmm5, xmm4
118	pslld xmm4, 9
119	psrld xmm5, 23
120	por xmm4, xmm5
121	pxor xmm2, xmm4
122
123	// d ^= (c + b) <<< 13
124	movdqa xmm4, xmm2
125	paddd xmm4, xmm1
47103664	126	pshufd xmm1, xmm1, ROTL
1a0c09c4 MW	127	movdqa xmm5, xmm4
	128	pslld xmm4, 13
	129	psrld xmm5, 19
	130	por xmm4, xmm5
	131	pxor xmm3, xmm4
	132
	133	// a ^= (d + c) <<< 18
	134	movdqa xmm4, xmm3
47103664	135	pshufd xmm3, xmm3, ROTR
1a0c09c4	136	paddd xmm4, xmm2
47103664	137	pshufd xmm2, xmm2, ROT2
1a0c09c4 MW	138	movdqa xmm5, xmm4
	139	pslld xmm4, 18
	140	psrld xmm5, 14
	141	por xmm4, xmm5
	142	pxor xmm0, xmm4
	143
	144	// The transpose conveniently only involves reordering elements of
	145	// individual rows, which can be done quite easily, and reordering
	146	// the rows themselves, which is a trivial renaming. It doesn't
	147	// involve any movement of elements between rows.
	148	//
	149	// [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
	150	// [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
	151	// [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
	152	// [12 1 6 11] [ 3 4 9 14] (d, xmm1)
	153	//
	154	// The shuffles have quite high latency, so they've been pushed
	155	// backwards into the main instruction list.
	156
	157	// Apply the row quarterround to each of the columns (yes!)
	158	// simultaneously.
	159
	160	// b ^= (a + d) <<< 7
	161	movdqa xmm4, xmm0
	162	paddd xmm4, xmm1
	163	movdqa xmm5, xmm4
	164	pslld xmm4, 7
	165	psrld xmm5, 25
	166	por xmm4, xmm5
	167	pxor xmm3, xmm4
	168
	169	// c ^= (b + a) <<< 9
	170	movdqa xmm4, xmm3
	171	paddd xmm4, xmm0
	172	movdqa xmm5, xmm4
	173	pslld xmm4, 9
	174	psrld xmm5, 23
	175	por xmm4, xmm5
	176	pxor xmm2, xmm4
	177
	178	// d ^= (c + b) <<< 13
	179	movdqa xmm4, xmm2
	180	paddd xmm4, xmm3
47103664	181	pshufd xmm3, xmm3, ROTL
1a0c09c4 MW	182	movdqa xmm5, xmm4
	183	pslld xmm4, 13
	184	psrld xmm5, 19
	185	por xmm4, xmm5
	186	pxor xmm1, xmm4
	187
	188	// a ^= (d + c) <<< 18
	189	movdqa xmm4, xmm1
47103664	190	pshufd xmm1, xmm1, ROTR
1a0c09c4	191	paddd xmm4, xmm2
47103664	192	pshufd xmm2, xmm2, ROT2
1a0c09c4 MW	193	movdqa xmm5, xmm4
	194	pslld xmm4, 18
	195	psrld xmm5, 14
	196	por xmm4, xmm5
	197	pxor xmm0, xmm4
	198
	199	// We had to undo the transpose ready for the next loop. Again, push
	200	// back the shuffles because they take a long time coming through.
	201	// Decrement the loop counter and see if we should go round again.
	202	// Later processors fuse this pair into a single uop.
	203	sub ecx, 2
	204	ja loop
	205
	206	// Almost there. Firstly, the feedforward addition, and then we have
	207	// to write out the result. Here we have to undo the permutation
	208	// which was already applied to the input. Shuffling has quite high
	209	// latency, so arrange to start a new shuffle into a temporary as
	210	// soon as we've written out the old value.
	211	mov edx, [ebp + 16]
	212
	213	paddd xmm0, [esp + 0]
47103664	214	pshufd xmm4, xmm0, ROTR
1a0c09c4 MW	215	movd [edx + 0], xmm0
	216
	217	paddd xmm1, [esp + 16]
47103664	218	pshufd xmm5, xmm1, ROTL
1a0c09c4 MW	219	movd [edx + 16], xmm1
	220
	221	paddd xmm2, xmm6
47103664	222	pshufd xmm6, xmm2, ROT2
1a0c09c4 MW	223	movd [edx + 32], xmm2
	224
	225	paddd xmm3, xmm7
47103664	226	pshufd xmm7, xmm3, ROTR
1a0c09c4 MW	227	movd [edx + 48], xmm3
	228
	229	movd [edx + 4], xmm7
47103664	230	pshufd xmm7, xmm3, ROT2
1a0c09c4	231	movd [edx + 24], xmm7
47103664	232	pshufd xmm3, xmm3, ROTL
1a0c09c4 MW	233	movd [edx + 44], xmm3
	234
	235	movd [edx + 8], xmm6
47103664	236	pshufd xmm6, xmm2, ROTL
1a0c09c4	237	movd [edx + 28], xmm6
47103664	238	pshufd xmm2, xmm2, ROTR
1a0c09c4 MW	239	movd [edx + 52], xmm2
	240
	241	movd [edx + 12], xmm5
47103664	242	pshufd xmm5, xmm1, ROTR
1a0c09c4	243	movd [edx + 36], xmm5
47103664	244	pshufd xmm1, xmm1, ROT2
1a0c09c4 MW	245	movd [edx + 56], xmm1
	246
	247	movd [edx + 20], xmm4
47103664	248	pshufd xmm4, xmm0, ROT2
1a0c09c4	249	movd [edx + 40], xmm4
47103664	250	pshufd xmm0, xmm0, ROTL
1a0c09c4 MW	251	movd [edx + 60], xmm0
	252
	253	// Tidy things up.
	254	mov esp, ebp
	255	pop ebp
	256
	257	// And with that, we're done.
	258	ret
	259
	260	ENDFUNC
	261
	262	///----- That's all, folks --------------------------------------------------