symm/: New SSE2 implementations of Salsa20 and ChaCha.
[catacomb] / symm / salsa20-x86-sse2.s
1 ### -*- mode: asm; asm-comment-char: ?# -*-
2 ###
3 ### Fancy SIMD implementation of Salsa20
4 ###
5 ### (c) 2015 Straylight/Edgeware
6 ###
7
8 ###----- Licensing notice ---------------------------------------------------
9 ###
10 ### This file is part of Catacomb.
11 ###
12 ### Catacomb is free software; you can redistribute it and/or modify
13 ### it under the terms of the GNU Library General Public License as
14 ### published by the Free Software Foundation; either version 2 of the
15 ### License, or (at your option) any later version.
16 ###
17 ### Catacomb is distributed in the hope that it will be useful,
18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ### GNU Library General Public License for more details.
21 ###
22 ### You should have received a copy of the GNU Library General Public
23 ### License along with Catacomb; if not, write to the Free
24 ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 ### MA 02111-1307, USA.
26
27 .intel_syntax noprefix
28 .arch pentium4
29
30 .section .text
31
32 .globl salsa20_core_x86_sse2
33 .type salsa20_core_x86_sse2, STT_FUNC
34 salsa20_core_x86_sse2:
35
36 ## Initial state. We have three arguments:
37 ## [ebp + 8] is the number of rounds to do
38 ## [ebp + 12] points to the input matrix
39 ## [ebp + 16] points to the output matrix
40 push ebp
41 mov ebp, esp
42 sub esp, 32
43 mov edx, [ebp + 12]
44 and esp, ~15
45
46 ## Prepare for the main loop.
47 mov ecx, [ebp + 8]
48
49 ## First job is to slurp the matrix into XMM registers. The words
50 ## have already been permuted conveniently to make them line up
51 ## better for SIMD processing.
52 ##
53 ## The textbook arrangement of the matrix is this.
54 ##
55 ## [C K K K]
56 ## [K C N N]
57 ## [T T C K]
58 ## [K K K C]
59 ##
60 ## But we've rotated the columns up so that the main diagonal with
61 ## the constants on it end up in the first row, giving something more
62 ## like
63 ##
64 ## [C C C C]
65 ## [K T K K]
66 ## [T K K N]
67 ## [K K N K]
68 ##
69 ## so the transformation looks like this:
70 ##
71 ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
72 ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
73 ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
74 ## [12 13 14 15] [12 1 6 11] (d, xmm3)
75 movdqu xmm0, [edx + 0]
76 movdqu xmm1, [edx + 16]
77 movdqu xmm2, [edx + 32]
78 movdqu xmm3, [edx + 48]
79
80 ## Take a copy for later.
81 movdqa [esp + 0], xmm0
82 movdqa [esp + 16], xmm1
83 movdqa xmm6, xmm2
84 movdqa xmm7, xmm3
85
86 loop:
87
88 ## Apply a column quarterround to each of the columns simultaneously.
89 ## Alas, there doesn't seem to be a packed doubleword rotate, so we
90 ## have to synthesize it.
91
92 ## b ^= (a + d) <<< 7
93 movdqa xmm4, xmm0
94 paddd xmm4, xmm3
95 movdqa xmm5, xmm4
96 pslld xmm4, 7
97 psrld xmm5, 25
98 por xmm4, xmm5
99 pxor xmm1, xmm4
100
101 ## c ^= (b + a) <<< 9
102 movdqa xmm4, xmm1
103 paddd xmm4, xmm0
104 movdqa xmm5, xmm4
105 pslld xmm4, 9
106 psrld xmm5, 23
107 por xmm4, xmm5
108 pxor xmm2, xmm4
109
110 ## d ^= (c + b) <<< 13
111 movdqa xmm4, xmm2
112 paddd xmm4, xmm1
113 pshufd xmm1, xmm1, 0x93
114 movdqa xmm5, xmm4
115 pslld xmm4, 13
116 psrld xmm5, 19
117 por xmm4, xmm5
118 pxor xmm3, xmm4
119
120 ## a ^= (d + c) <<< 18
121 movdqa xmm4, xmm3
122 pshufd xmm3, xmm3, 0x39
123 paddd xmm4, xmm2
124 pshufd xmm2, xmm2, 0x4e
125 movdqa xmm5, xmm4
126 pslld xmm4, 18
127 psrld xmm5, 14
128 por xmm4, xmm5
129 pxor xmm0, xmm4
130
131 ## The transpose conveniently only involves reordering elements of
132 ## individual rows, which can be done quite easily, and reordering
133 ## the rows themselves, which is a trivial renaming. It doesn't
134 ## involve any movement of elements between rows.
135 ##
136 ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
137 ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
138 ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
139 ## [12 1 6 11] [ 3 4 9 14] (d, xmm1)
140 ##
141 ## The shuffles have quite high latency, so they've been pushed
142 ## backwards into the main instruction list.
143
144 ## Apply the row quarterround to each of the columns (yes!)
145 ## simultaneously.
146
147 ## b ^= (a + d) <<< 7
148 movdqa xmm4, xmm0
149 paddd xmm4, xmm1
150 movdqa xmm5, xmm4
151 pslld xmm4, 7
152 psrld xmm5, 25
153 por xmm4, xmm5
154 pxor xmm3, xmm4
155
156 ## c ^= (b + a) <<< 9
157 movdqa xmm4, xmm3
158 paddd xmm4, xmm0
159 movdqa xmm5, xmm4
160 pslld xmm4, 9
161 psrld xmm5, 23
162 por xmm4, xmm5
163 pxor xmm2, xmm4
164
165 ## d ^= (c + b) <<< 13
166 movdqa xmm4, xmm2
167 paddd xmm4, xmm3
168 pshufd xmm3, xmm3, 0x93
169 movdqa xmm5, xmm4
170 pslld xmm4, 13
171 psrld xmm5, 19
172 por xmm4, xmm5
173 pxor xmm1, xmm4
174
175 ## a ^= (d + c) <<< 18
176 movdqa xmm4, xmm1
177 pshufd xmm1, xmm1, 0x39
178 paddd xmm4, xmm2
179 pshufd xmm2, xmm2, 0x4e
180 movdqa xmm5, xmm4
181 pslld xmm4, 18
182 psrld xmm5, 14
183 por xmm4, xmm5
184 pxor xmm0, xmm4
185
186 ## We had to undo the transpose ready for the next loop. Again, push
187 ## back the shuffles because they take a long time coming through.
188 ## Decrement the loop counter and see if we should go round again.
189 ## Later processors fuse this pair into a single uop.
190 sub ecx, 2
191 ja loop
192
193 ## Almost there. Firstly, the feedforward addition, and then we have
194 ## to write out the result. Here we have to undo the permutation
195 ## which was already applied to the input. Shuffling has quite high
196 ## latency, so arrange to start a new shuffle into a temporary as
197 ## soon as we've written out the old value.
198 mov edx, [ebp + 16]
199
200 paddd xmm0, [esp + 0]
201 pshufd xmm4, xmm0, 0x39
202 movd [edx + 0], xmm0
203
204 paddd xmm1, [esp + 16]
205 pshufd xmm5, xmm1, 0x93
206 movd [edx + 16], xmm1
207
208 paddd xmm2, xmm6
209 pshufd xmm6, xmm2, 0x4e
210 movd [edx + 32], xmm2
211
212 paddd xmm3, xmm7
213 pshufd xmm7, xmm3, 0x39
214 movd [edx + 48], xmm3
215
216 movd [edx + 4], xmm7
217 pshufd xmm7, xmm3, 0x4e
218 movd [edx + 24], xmm7
219 pshufd xmm3, xmm3, 0x93
220 movd [edx + 44], xmm3
221
222 movd [edx + 8], xmm6
223 pshufd xmm6, xmm2, 0x93
224 movd [edx + 28], xmm6
225 pshufd xmm2, xmm2, 0x39
226 movd [edx + 52], xmm2
227
228 movd [edx + 12], xmm5
229 pshufd xmm5, xmm1, 0x39
230 movd [edx + 36], xmm5
231 pshufd xmm1, xmm1, 0x4e
232 movd [edx + 56], xmm1
233
234 movd [edx + 20], xmm4
235 pshufd xmm4, xmm0, 0x4e
236 movd [edx + 40], xmm4
237 pshufd xmm0, xmm0, 0x93
238 movd [edx + 60], xmm0
239
240 ## And with that, we're done.
241 mov esp, ebp
242 pop ebp
243 ret
244
245 .size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2
246
247 ###----- That's all, folks --------------------------------------------------