[catacomb] / symm / salsa20-x86-sse2.s

### -*- mode: asm; asm-comment-char: ?# -*-
###
### Fancy SIMD implementation of Salsa20
###
### (c) 2015 Straylight/Edgeware
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of Catacomb.
###
### Catacomb is free software; you can redistribute it and/or modify
### it under the terms of the GNU Library General Public License as
### published by the Free Software Foundation; either version 2 of the
### License, or (at your option) any later version.
###
### Catacomb is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU Library General Public License for more details.
###
### You should have received a copy of the GNU Library General Public
### License along with Catacomb; if not, write to the Free
### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
### MA 02111-1307, USA.

	.intel_syntax noprefix
	.arch pentium4

	.section .text

	.globl	salsa20_core_x86_sse2
	.type	salsa20_core_x86_sse2, STT_FUNC
salsa20_core_x86_sse2:

	## Initial state.  We have three arguments:
	## [ebp +  8] is the number of rounds to do
	## [ebp + 12] points to the input matrix
	## [ebp + 16] points to the output matrix
	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	edx, [ebp + 12]
	and	esp, ~15

	## Prepare for the main loop.
	mov	ecx, [ebp + 8]

	## First job is to slurp the matrix into XMM registers.  The words
	## have already been permuted conveniently to make them line up
	## better for SIMD processing.
	##
	## The textbook arrangement of the matrix is this.
	##
	##	[C K K K]
	##	[K C N N]
	##	[T T C K]
	##	[K K K C]
	##
	## But we've rotated the columns up so that the main diagonal with
	## the constants on it end up in the first row, giving something more
	## like
	##
	##	[C C C C]
	##	[K T K K]
	##	[T K K N]
	##	[K K N K]
	##
	## so the transformation looks like this:
	##
	##	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	##	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	##	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	##	[12 13 14 15]		[12  1  6 11] (d, xmm3)
	movdqu	xmm0, [edx +  0]
	movdqu	xmm1, [edx + 16]
	movdqu	xmm2, [edx + 32]
	movdqu	xmm3, [edx + 48]

	## Take a copy for later.
	movdqa	[esp +  0], xmm0
	movdqa	[esp + 16], xmm1
	movdqa	xmm6, xmm2
	movdqa	xmm7, xmm3

loop:

	## Apply a column quarterround to each of the columns simultaneously.
	## Alas, there doesn't seem to be a packed doubleword rotate, so we
	## have to synthesize it.

	## b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	## c ^= (b + a) <<<  9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	## d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, 0x93
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	## a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, 0x39
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, 0x4e
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	## The transpose conveniently only involves reordering elements of
	## individual rows, which can be done quite easily, and reordering
	## the rows themselves, which is a trivial renaming.  It doesn't
	## involve any movement of elements between rows.
	##
	##	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	##	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	##	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	##	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	##
	## The shuffles have quite high latency, so they've been pushed
	## backwards into the main instruction list.

	## Apply the row quarterround to each of the columns (yes!)
	## simultaneously.

	## b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	## c ^= (b + a) <<<  9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	## d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, 0x93
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	## a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, 0x39
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, 0x4e
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	## We had to undo the transpose ready for the next loop.  Again, push
	## back the shuffles because they take a long time coming through.
	## Decrement the loop counter and see if we should go round again.
	## Later processors fuse this pair into a single uop.
	sub	ecx, 2
	ja	loop

	## Almost there.  Firstly, the feedforward addition, and then we have
	## to write out the result.  Here we have to undo the permutation
	## which was already applied to the input.  Shuffling has quite high
	## latency, so arrange to start a new shuffle into a temporary as
	## soon as we've written out the old value.
	mov	edx, [ebp + 16]

	paddd	xmm0, [esp +  0]
	pshufd	xmm4, xmm0, 0x39
	movd	[edx +  0], xmm0

	paddd	xmm1, [esp + 16]
	pshufd	xmm5, xmm1, 0x93
	movd	[edx + 16], xmm1

	paddd	xmm2, xmm6
	pshufd	xmm6, xmm2, 0x4e
	movd	[edx + 32], xmm2

	paddd	xmm3, xmm7
	pshufd	xmm7, xmm3, 0x39
	movd	[edx + 48], xmm3

	movd	[edx +  4], xmm7
	pshufd	xmm7, xmm3, 0x4e
	movd	[edx + 24], xmm7
	pshufd	xmm3, xmm3, 0x93
	movd	[edx + 44], xmm3

	movd	[edx +  8], xmm6
	pshufd	xmm6, xmm2, 0x93
	movd	[edx + 28], xmm6
	pshufd	xmm2, xmm2, 0x39
	movd	[edx + 52], xmm2

	movd	[edx + 12], xmm5
	pshufd	xmm5, xmm1, 0x39
	movd	[edx + 36], xmm5
	pshufd	xmm1, xmm1, 0x4e
	movd	[edx + 56], xmm1

	movd	[edx + 20], xmm4
	pshufd	xmm4, xmm0, 0x4e
	movd	[edx + 40], xmm4
	pshufd	xmm0, xmm0, 0x93
	movd	[edx + 60], xmm0

	## And with that, we're done.
	mov	esp, ebp
	pop	ebp
	ret

	.size	salsa20_core_x86_sse2, . - salsa20_core_x86_sse2

###----- That's all, folks --------------------------------------------------
Commit	Line	Data
e10e6494 MW	1	### -- mode: asm; asm-comment-char: ?# --
	2	###
	3	### Fancy SIMD implementation of Salsa20
	4	###
	5	### (c) 2015 Straylight/Edgeware
	6	###
	7
	8	###----- Licensing notice ---------------------------------------------------
	9	###
	10	### This file is part of Catacomb.
	11	###
	12	### Catacomb is free software; you can redistribute it and/or modify
	13	### it under the terms of the GNU Library General Public License as
	14	### published by the Free Software Foundation; either version 2 of the
	15	### License, or (at your option) any later version.
	16	###
	17	### Catacomb is distributed in the hope that it will be useful,
	18	### but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	### GNU Library General Public License for more details.
	21	###
	22	### You should have received a copy of the GNU Library General Public
	23	### License along with Catacomb; if not, write to the Free
	24	### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	### MA 02111-1307, USA.
	26
	27	.intel_syntax noprefix
	28	.arch pentium4
	29
	30	.section .text
	31
	32	.globl salsa20_core_x86_sse2
	33	.type salsa20_core_x86_sse2, STT_FUNC
	34	salsa20_core_x86_sse2:
	35
	36	## Initial state. We have three arguments:
	37	## [ebp + 8] is the number of rounds to do
	38	## [ebp + 12] points to the input matrix
	39	## [ebp + 16] points to the output matrix
	40	push ebp
	41	mov ebp, esp
	42	sub esp, 32
	43	mov edx, [ebp + 12]
	44	and esp, ~15
	45
	46	## Prepare for the main loop.
	47	mov ecx, [ebp + 8]
	48
	49	## First job is to slurp the matrix into XMM registers. The words
	50	## have already been permuted conveniently to make them line up
	51	## better for SIMD processing.
	52	##
	53	## The textbook arrangement of the matrix is this.
	54	##
	55	## [C K K K]
	56	## [K C N N]
	57	## [T T C K]
	58	## [K K K C]
	59	##
	60	## But we've rotated the columns up so that the main diagonal with
	61	## the constants on it end up in the first row, giving something more
	62	## like
	63	##
	64	## [C C C C]
65	## [K T K K]
66	## [T K K N]
67	## [K K N K]
68	##
69	## so the transformation looks like this:
70	##
71	## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
72	## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
73	## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
74	## [12 13 14 15] [12 1 6 11] (d, xmm3)
75	movdqu xmm0, [edx + 0]
76	movdqu xmm1, [edx + 16]
77	movdqu xmm2, [edx + 32]
78	movdqu xmm3, [edx + 48]
79
80	## Take a copy for later.
81	movdqa [esp + 0], xmm0
82	movdqa [esp + 16], xmm1
83	movdqa xmm6, xmm2
84	movdqa xmm7, xmm3
85
86	loop:
87
88	## Apply a column quarterround to each of the columns simultaneously.
89	## Alas, there doesn't seem to be a packed doubleword rotate, so we
90	## have to synthesize it.
91
92	## b ^= (a + d) <<< 7
93	movdqa xmm4, xmm0
94	paddd xmm4, xmm3
95	movdqa xmm5, xmm4
96	pslld xmm4, 7
97	psrld xmm5, 25
98	por xmm4, xmm5
99	pxor xmm1, xmm4
100
101	## c ^= (b + a) <<< 9
102	movdqa xmm4, xmm1
103	paddd xmm4, xmm0
104	movdqa xmm5, xmm4
105	pslld xmm4, 9
106	psrld xmm5, 23
107	por xmm4, xmm5
108	pxor xmm2, xmm4
109
110	## d ^= (c + b) <<< 13
111	movdqa xmm4, xmm2
112	paddd xmm4, xmm1
113	pshufd xmm1, xmm1, 0x93
114	movdqa xmm5, xmm4
115	pslld xmm4, 13
116	psrld xmm5, 19
117	por xmm4, xmm5
118	pxor xmm3, xmm4
119
120	## a ^= (d + c) <<< 18
121	movdqa xmm4, xmm3
122	pshufd xmm3, xmm3, 0x39
123	paddd xmm4, xmm2
124	pshufd xmm2, xmm2, 0x4e
125	movdqa xmm5, xmm4
126	pslld xmm4, 18
127	psrld xmm5, 14
128	por xmm4, xmm5
129	pxor xmm0, xmm4
130
131	## The transpose conveniently only involves reordering elements of
132	## individual rows, which can be done quite easily, and reordering
133	## the rows themselves, which is a trivial renaming. It doesn't
134	## involve any movement of elements between rows.
135	##
136	## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
137	## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
138	## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
139	## [12 1 6 11] [ 3 4 9 14] (d, xmm1)
140	##
141	## The shuffles have quite high latency, so they've been pushed
142	## backwards into the main instruction list.
143
144	## Apply the row quarterround to each of the columns (yes!)
145	## simultaneously.
146
147	## b ^= (a + d) <<< 7
148	movdqa xmm4, xmm0
149	paddd xmm4, xmm1
150	movdqa xmm5, xmm4
151	pslld xmm4, 7
152	psrld xmm5, 25
153	por xmm4, xmm5
154	pxor xmm3, xmm4
155
156	## c ^= (b + a) <<< 9
157	movdqa xmm4, xmm3
158	paddd xmm4, xmm0
159	movdqa xmm5, xmm4
160	pslld xmm4, 9
161	psrld xmm5, 23
162	por xmm4, xmm5
163	pxor xmm2, xmm4
164
165	## d ^= (c + b) <<< 13
166	movdqa xmm4, xmm2
167	paddd xmm4, xmm3
168	pshufd xmm3, xmm3, 0x93
169	movdqa xmm5, xmm4
170	pslld xmm4, 13
171	psrld xmm5, 19
172	por xmm4, xmm5
173	pxor xmm1, xmm4
174
175	## a ^= (d + c) <<< 18
176	movdqa xmm4, xmm1
177	pshufd xmm1, xmm1, 0x39
178	paddd xmm4, xmm2
179	pshufd xmm2, xmm2, 0x4e
180	movdqa xmm5, xmm4
181	pslld xmm4, 18
182	psrld xmm5, 14
183	por xmm4, xmm5
184	pxor xmm0, xmm4
185
186	## We had to undo the transpose ready for the next loop. Again, push
187	## back the shuffles because they take a long time coming through.
188	## Decrement the loop counter and see if we should go round again.
189	## Later processors fuse this pair into a single uop.
190	sub ecx, 2
191	ja loop
192
193	## Almost there. Firstly, the feedforward addition, and then we have
194	## to write out the result. Here we have to undo the permutation
195	## which was already applied to the input. Shuffling has quite high
196	## latency, so arrange to start a new shuffle into a temporary as
197	## soon as we've written out the old value.
198	mov edx, [ebp + 16]
199
200	paddd xmm0, [esp + 0]
201	pshufd xmm4, xmm0, 0x39
202	movd [edx + 0], xmm0
203
204	paddd xmm1, [esp + 16]
205	pshufd xmm5, xmm1, 0x93
206	movd [edx + 16], xmm1
207
208	paddd xmm2, xmm6
209	pshufd xmm6, xmm2, 0x4e
210	movd [edx + 32], xmm2
211
212	paddd xmm3, xmm7
213	pshufd xmm7, xmm3, 0x39
214	movd [edx + 48], xmm3
215
216	movd [edx + 4], xmm7
217	pshufd xmm7, xmm3, 0x4e
218	movd [edx + 24], xmm7
219	pshufd xmm3, xmm3, 0x93
220	movd [edx + 44], xmm3
221
222	movd [edx + 8], xmm6
223	pshufd xmm6, xmm2, 0x93
224	movd [edx + 28], xmm6
225	pshufd xmm2, xmm2, 0x39
226	movd [edx + 52], xmm2
227
228	movd [edx + 12], xmm5
229	pshufd xmm5, xmm1, 0x39
230	movd [edx + 36], xmm5
231	pshufd xmm1, xmm1, 0x4e
232	movd [edx + 56], xmm1
233
234	movd [edx + 20], xmm4
235	pshufd xmm4, xmm0, 0x4e
236	movd [edx + 40], xmm4
237	pshufd xmm0, xmm0, 0x93
238	movd [edx + 60], xmm0
239
240	## And with that, we're done.
241	mov esp, ebp
242	pop ebp
243	ret
244
245	.size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2
246
247	###----- That's all, folks --------------------------------------------------