[catacomb] / symm / salsa20-x86-sse2.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch pentium4
	.section .text

/// salsa20_core_x86_sse2 -- Salsa20 rounds plus feedforward, using SSE2
///
/// Arguments (all passed on the stack):
///
///	[ebp +  8]	the number of rounds to do
///	[ebp + 12]	address of the input matrix (16 32-bit words)
///	[ebp + 16]	address of the output matrix (16 32-bit words)
///
/// NOTE(review): presumably called from C with a prototype along the
/// lines of `(unsigned nrounds, const uint32 *src, uint32 *dest)' --
/// confirm against the caller.
///
/// The input matrix is expected in the permuted order described below;
/// the output matrix is written in the textbook order.  The input is
/// fetched with unaligned loads and the output is stored one word at a
/// time, so neither pointer need be aligned to 16 bytes.  All of the
/// input is read before any of the output is written, so the two
/// matrices may occupy the same storage.
///
/// Each trip round the main loop does two rounds (a column round and a
/// row round), and the counter is only tested at the bottom of the loop.
/// Hence at least two rounds are always done, and an odd round count is
/// effectively rounded up to the next even number.
///
/// Clobbers ecx, edx, xmm0--xmm7 and the flags; ebp and esp are restored
/// before returning.  Uses 32 bytes of stack (plus alignment padding) to
/// hold a copy of the first two input rows.

FUNC(salsa20_core_x86_sse2)

	// Initial state.  We have three arguments:
	// [ebp +  8] is the number of rounds to do
	// [ebp + 12] points to the input matrix
	// [ebp + 16] points to the output matrix
	//
	// Make space for two 16-byte rows on the stack, and align the stack
	// pointer so that we can use aligned moves on them.  The original
	// stack pointer is safe in ebp.
	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	edx, [ebp + 12]
	and	esp, ~15

	// Prepare for the main loop.
	mov	ecx, [ebp + 8]

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
	movdqu	xmm0, [edx +  0]
	movdqu	xmm1, [edx + 16]
	movdqu	xmm2, [edx + 32]
	movdqu	xmm3, [edx + 48]

	// Take a copy for later.  There are only eight XMM registers, and
	// the loop needs xmm4 and xmm5 as scratch, so two of the rows have
	// to live on the stack.
	movdqa	[esp +  0], xmm0
	movdqa	[esp + 16], xmm1
	movdqa	xmm6, xmm2
	movdqa	xmm7, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it: x <<< n = (x << n) | (x >> (32 - n)).
	//
	// The shuffle immediates used throughout are as follows, writing
	// [s0 s1 s2 s3] for the source with s0 in the low lane.
	//
	//	0x39	[s1 s2 s3 s0]
	//	0x4e	[s2 s3 s0 s1]
	//	0x93	[s3 s0 s1 s2]

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	//
	// We've finished with b once it's been added in, so start
	// shuffling it into place for the row round.
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, 0x93
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	//
	// Similarly, shuffle d and c as soon as copies have been taken.
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, 0x39
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, 0x4e
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.  Note that xmm1 and xmm3 have swapped roles: b is
	// now in xmm3 and d is in xmm1.

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, 0x93
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, 0x39
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, 0x4e
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again, push
	// back the shuffles because they take a long time coming through.
	// Decrement the loop counter and see if we should go round again.
	// Later processors fuse this pair into a single uop.
	//
	// The branch is taken only if the subtraction neither borrowed nor
	// left zero, so we stop when there were two or fewer rounds left to
	// do before this iteration.
	sub	ecx, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition, and then we have
	// to write out the result.  Here we have to undo the permutation
	// which was already applied to the input.  Shuffling has quite high
	// latency, so arrange to start a new shuffle into a temporary as
	// soon as we've written out the old value.
	//
	// Each `movd' stores only the low lane of its source, so every
	// other word has to be shuffled down into lane 0 first.  The rows
	// are back in their initial arrangement:
	//
	//	xmm0 = [ 0  5 10 15]	xmm2 = [ 8 13  2  7]
	//	xmm1 = [ 4  9 14  3]	xmm3 = [12  1  6 11]
	mov	edx, [ebp + 16]

	paddd	xmm0, [esp +  0]
	pshufd	xmm4, xmm0, 0x39	// word 5 to lane 0
	movd	[edx +  0], xmm0	// word 0

	paddd	xmm1, [esp + 16]
	pshufd	xmm5, xmm1, 0x93	// word 3 to lane 0
	movd	[edx + 16], xmm1	// word 4

	paddd	xmm2, xmm6
	pshufd	xmm6, xmm2, 0x4e	// word 2 to lane 0
	movd	[edx + 32], xmm2	// word 8

	paddd	xmm3, xmm7
	pshufd	xmm7, xmm3, 0x39	// word 1 to lane 0
	movd	[edx + 48], xmm3	// word 12

	movd	[edx +  4], xmm7	// word 1
	pshufd	xmm7, xmm3, 0x4e
	movd	[edx + 24], xmm7	// word 6
	pshufd	xmm3, xmm3, 0x93
	movd	[edx + 44], xmm3	// word 11

	movd	[edx +  8], xmm6	// word 2
	pshufd	xmm6, xmm2, 0x93
	movd	[edx + 28], xmm6	// word 7
	pshufd	xmm2, xmm2, 0x39
	movd	[edx + 52], xmm2	// word 13

	movd	[edx + 12], xmm5	// word 3
	pshufd	xmm5, xmm1, 0x39
	movd	[edx + 36], xmm5	// word 9
	pshufd	xmm1, xmm1, 0x4e
	movd	[edx + 56], xmm1	// word 14

	movd	[edx + 20], xmm4	// word 5
	pshufd	xmm4, xmm0, 0x4e
	movd	[edx + 40], xmm4	// word 10
	pshufd	xmm0, xmm0, 0x93
	movd	[edx + 60], xmm0	// word 15

	// Tidy things up.  Restoring esp from ebp discards the scratch space
	// and undoes the alignment adjustment in one go.
	mov	esp, ebp
	pop	ebp

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
1a0c09c4 MW	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Fancy SIMD implementation of Salsa20
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// Main code.
	35
	36	.arch pentium4
	37	.section .text
	38
	39	FUNC(salsa20_core_x86_sse2)
	40
	41	// Initial state. We have three arguments:
	42	// [ebp + 8] is the number of rounds to do
	43	// [ebp + 12] points to the input matrix
	44	// [ebp + 16] points to the output matrix
	45	push ebp
	46	mov ebp, esp
	47	sub esp, 32
	48	mov edx, [ebp + 12]
	49	and esp, ~15
	50
	51	// Prepare for the main loop.
	52	mov ecx, [ebp + 8]
	53
	54	// First job is to slurp the matrix into XMM registers. The words
	55	// have already been permuted conveniently to make them line up
	56	// better for SIMD processing.
	57	//
	58	// The textbook arrangement of the matrix is this.
	59	//
	60	// [C K K K]
	61	// [K C N N]
	62	// [T T C K]
	63	// [K K K C]
	64	//
65	// But we've rotated the columns up so that the main diagonal with
66	// the constants on it ends up in the first row, giving something more
67	// like
68	//
69	// [C C C C]
70	// [K T K K]
71	// [T K K N]
72	// [K K N K]
73	//
74	// so the transformation looks like this:
75	//
76	// [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
77	// [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
78	// [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
79	// [12 13 14 15] [12 1 6 11] (d, xmm3)
80	movdqu xmm0, [edx + 0]
81	movdqu xmm1, [edx + 16]
82	movdqu xmm2, [edx + 32]
83	movdqu xmm3, [edx + 48]
84
85	// Take a copy for later.
86	movdqa [esp + 0], xmm0
87	movdqa [esp + 16], xmm1
88	movdqa xmm6, xmm2
89	movdqa xmm7, xmm3
90
91	loop:
92
93	// Apply a column quarterround to each of the columns simultaneously.
94	// Alas, there doesn't seem to be a packed doubleword rotate, so we
95	// have to synthesize it.
96
97	// b ^= (a + d) <<< 7
98	movdqa xmm4, xmm0
99	paddd xmm4, xmm3
100	movdqa xmm5, xmm4
101	pslld xmm4, 7
102	psrld xmm5, 25
103	por xmm4, xmm5
104	pxor xmm1, xmm4
105
106	// c ^= (b + a) <<< 9
107	movdqa xmm4, xmm1
108	paddd xmm4, xmm0
109	movdqa xmm5, xmm4
110	pslld xmm4, 9
111	psrld xmm5, 23
112	por xmm4, xmm5
113	pxor xmm2, xmm4
114
115	// d ^= (c + b) <<< 13
116	movdqa xmm4, xmm2
117	paddd xmm4, xmm1
118	pshufd xmm1, xmm1, 0x93
119	movdqa xmm5, xmm4
120	pslld xmm4, 13
121	psrld xmm5, 19
122	por xmm4, xmm5
123	pxor xmm3, xmm4
124
125	// a ^= (d + c) <<< 18
126	movdqa xmm4, xmm3
127	pshufd xmm3, xmm3, 0x39
128	paddd xmm4, xmm2
129	pshufd xmm2, xmm2, 0x4e
130	movdqa xmm5, xmm4
131	pslld xmm4, 18
132	psrld xmm5, 14
133	por xmm4, xmm5
134	pxor xmm0, xmm4
135
136	// The transpose conveniently only involves reordering elements of
137	// individual rows, which can be done quite easily, and reordering
138	// the rows themselves, which is a trivial renaming. It doesn't
139	// involve any movement of elements between rows.
140	//
141	// [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
142	// [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
143	// [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
144	// [12 1 6 11] [ 3 4 9 14] (d, xmm1)
145	//
146	// The shuffles have quite high latency, so they've been pushed
147	// backwards into the main instruction list.
148
149	// Apply the row quarterround to each of the columns (yes!)
150	// simultaneously.
151
152	// b ^= (a + d) <<< 7
153	movdqa xmm4, xmm0
154	paddd xmm4, xmm1
155	movdqa xmm5, xmm4
156	pslld xmm4, 7
157	psrld xmm5, 25
158	por xmm4, xmm5
159	pxor xmm3, xmm4
160
161	// c ^= (b + a) <<< 9
162	movdqa xmm4, xmm3
163	paddd xmm4, xmm0
164	movdqa xmm5, xmm4
165	pslld xmm4, 9
166	psrld xmm5, 23
167	por xmm4, xmm5
168	pxor xmm2, xmm4
169
170	// d ^= (c + b) <<< 13
171	movdqa xmm4, xmm2
172	paddd xmm4, xmm3
173	pshufd xmm3, xmm3, 0x93
174	movdqa xmm5, xmm4
175	pslld xmm4, 13
176	psrld xmm5, 19
177	por xmm4, xmm5
178	pxor xmm1, xmm4
179
180	// a ^= (d + c) <<< 18
181	movdqa xmm4, xmm1
182	pshufd xmm1, xmm1, 0x39
183	paddd xmm4, xmm2
184	pshufd xmm2, xmm2, 0x4e
185	movdqa xmm5, xmm4
186	pslld xmm4, 18
187	psrld xmm5, 14
188	por xmm4, xmm5
189	pxor xmm0, xmm4
190
191	// We had to undo the transpose ready for the next loop. Again, push
192	// back the shuffles because they take a long time coming through.
193	// Decrement the loop counter and see if we should go round again.
194	// Later processors fuse this pair into a single uop.
195	sub ecx, 2
196	ja loop
197
198	// Almost there. Firstly, the feedforward addition, and then we have
199	// to write out the result. Here we have to undo the permutation
200	// which was already applied to the input. Shuffling has quite high
201	// latency, so arrange to start a new shuffle into a temporary as
202	// soon as we've written out the old value.
203	mov edx, [ebp + 16]
204
205	paddd xmm0, [esp + 0]
206	pshufd xmm4, xmm0, 0x39
207	movd [edx + 0], xmm0
208
209	paddd xmm1, [esp + 16]
210	pshufd xmm5, xmm1, 0x93
211	movd [edx + 16], xmm1
212
213	paddd xmm2, xmm6
214	pshufd xmm6, xmm2, 0x4e
215	movd [edx + 32], xmm2
216
217	paddd xmm3, xmm7
218	pshufd xmm7, xmm3, 0x39
219	movd [edx + 48], xmm3
220
221	movd [edx + 4], xmm7
222	pshufd xmm7, xmm3, 0x4e
223	movd [edx + 24], xmm7
224	pshufd xmm3, xmm3, 0x93
225	movd [edx + 44], xmm3
226
227	movd [edx + 8], xmm6
228	pshufd xmm6, xmm2, 0x93
229	movd [edx + 28], xmm6
230	pshufd xmm2, xmm2, 0x39
231	movd [edx + 52], xmm2
232
233	movd [edx + 12], xmm5
234	pshufd xmm5, xmm1, 0x39
235	movd [edx + 36], xmm5
236	pshufd xmm1, xmm1, 0x4e
237	movd [edx + 56], xmm1
238
239	movd [edx + 20], xmm4
240	pshufd xmm4, xmm0, 0x4e
241	movd [edx + 40], xmm4
242	pshufd xmm0, xmm0, 0x93
243	movd [edx + 60], xmm0
244
245	// Tidy things up.
246	mov esp, ebp
247	pop ebp
248
249	// And with that, we're done.
250	ret
251
252	ENDFUNC
253
254	///----- That's all, folks --------------------------------------------------