x86ish *.S: Use `stalloc' consistently to allocate space on the stack.
symm/salsa20-x86ish-sse2.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .text

///--------------------------------------------------------------------------
/// Main code.

FUNC(salsa20_core_x86ish_avx)
        .arch   .avx
        vzeroupper
        endprologue
        // drop through...
ENDFUNC

        .arch   pentium4

FUNC(salsa20_core_x86ish_sse2)

        // Initial setup.

#if CPUFAM_X86
        // Arguments come in on the stack, and will need to be collected.  We
        // can get away with just the scratch registers for integer work, but
        // we'll run out of XMM registers and will need some properly aligned
        // space which we'll steal from the stack.  I don't trust the stack
        // pointer's alignment, so I'll have to mask the stack pointer, which
        // in turn means I'll need to keep track of the old value.  Hence I'm
        // making a full i386-style stack frame here.
        //
        // The Windows and SysV ABIs are sufficiently similar that we don't
        // need to worry about the differences here.

#  define NR ecx
#  define IN eax
#  define OUT edx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [SP + 0]
#  define SAVE3 [SP + 16]

        pushreg BP
        setfp
        stalloc 32
        mov     IN, [BP + 12]
        mov     OUT, [BP + 16]
        and     SP, ~15
        mov     NR, [BP + 8]
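
        // The frame now looks like this (a sketch; the argument offsets
        // are the usual i386 stack-args layout):
        //
        //      [BP + 16]       OUT (destination pointer)
        //      [BP + 12]       IN (source pointer)
        //      [BP +  8]       NR (number of rounds)
        //      [BP +  4]       return address
        //      [BP +  0]       caller's BP
        //         ...          alignment slop
        //      [SP + 16]       SAVE3
        //      [SP +  0]       SAVE2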
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // This is nice.  We have plenty of XMM registers, and the arguments
        // are in useful places.  There's no need to spill anything and we
        // can just get on with the code.

#  define NR edi
#  define IN rsi
#  define OUT rdx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 xmm8
#  define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments come in registers, but they're different between Windows
        // and everyone else (and everyone else is saner).
        //
        // The Windows ABI insists that we preserve some of the XMM
        // registers, but we want more save slots than we have scratch
        // registers.  Two of the slots are only needed to keep a copy of the
        // input for the feedforward at the end, so they can live on the
        // stack; but the other two we want in registers for the final
        // permutation, so we stash the old values of xmm6 and xmm7 on the
        // stack instead.  (We need an extra 8 bytes to keep the stack
        // aligned.)

#  define NR ecx
#  define IN rdx
#  define OUT r8
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [SP + 32]
#  define SAVE3 [SP + 48]
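
        // The stack layout will then be (a sketch):
        //
        //      [SP +  0]       caller's xmm6
        //      [SP + 16]       caller's xmm7
        //      [SP + 32]       SAVE2
        //      [SP + 48]       SAVE3
        //      [SP + 64]       8 bytes of padding, realigning the stack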

        stalloc 64 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
#endif

        endprologue

        // First job is to slurp the matrix into XMM registers.  The words
        // have already been permuted conveniently to make them line up
        // better for SIMD processing.
        //
        // The textbook arrangement of the matrix is this.
        //
        //      [C K K K]
        //      [K C N N]
        //      [T T C K]
        //      [K K K C]
        //
        // But we've rotated the columns up so that the main diagonal with
        // the constants on it ends up in the first row, giving something
        // more like
        //
        //      [C C C C]
        //      [K T K K]
        //      [T K K N]
        //      [K K N K]
        //
        // so the transformation looks like this:
        //
        //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
        //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
        //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
        //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
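        //
        // Equivalently, in C (a sketch, just to pin down the permutation;
        // `textbook' here is a hypothetical unpermuted matrix, not what IN
        // actually points to):
        //
        //      for (i = 0; i < 4; i++)
        //              for (j = 0; j < 4; j++)
        //                      row[i][j] = textbook[(4*i + 5*j)%16];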
        movdqu  xmm0, [IN + 0]
        movdqu  xmm1, [IN + 16]
        movdqu  xmm2, [IN + 32]
        movdqu  xmm3, [IN + 48]

        // Take a copy for later.
        movdqa  SAVE0, xmm0
        movdqa  SAVE1, xmm1
        movdqa  SAVE2, xmm2
        movdqa  SAVE3, xmm3

0:
        // Apply a column quarterround to each of the columns simultaneously.
        // Alas, there doesn't seem to be a packed doubleword rotate, so we
        // have to synthesize it.
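        //
        // Each rotation below is therefore the standard shift-and-OR pair;
        // in C (a sketch, with 32-bit lanes):
        //
        //      t = x + y;  z ^= (t << n) | (t >> (32 - n));
        //
        // which is exactly what the movdqa/pslld/psrld/por sequences
        // compute.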

        // b ^= (a + d) <<< 7
        movdqa  xmm4, xmm0
        paddd   xmm4, xmm3
        movdqa  xmm5, xmm4
        pslld   xmm4, 7
        psrld   xmm5, 25
        por     xmm4, xmm5
        pxor    xmm1, xmm4

        // c ^= (b + a) <<< 9
        movdqa  xmm4, xmm1
        paddd   xmm4, xmm0
        movdqa  xmm5, xmm4
        pslld   xmm4, 9
        psrld   xmm5, 23
        por     xmm4, xmm5
        pxor    xmm2, xmm4

        // d ^= (c + b) <<< 13
        movdqa  xmm4, xmm2
        paddd   xmm4, xmm1
        pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
        movdqa  xmm5, xmm4
        pslld   xmm4, 13
        psrld   xmm5, 19
        por     xmm4, xmm5
        pxor    xmm3, xmm4

        // a ^= (d + c) <<< 18
        movdqa  xmm4, xmm3
        pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)
        paddd   xmm4, xmm2
        pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)
        movdqa  xmm5, xmm4
        pslld   xmm4, 18
        psrld   xmm5, 14
        por     xmm4, xmm5
        pxor    xmm0, xmm4

        // The transpose conveniently only involves reordering elements of
        // individual rows, which can be done quite easily, and reordering
        // the rows themselves, which is a trivial renaming.  It doesn't
        // involve any movement of elements between rows.
        //
        //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
        //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
        //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
        //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
        //
        // The shuffles have quite high latency, so they've been pushed
        // backwards into the main instruction list.
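        //
        // (For reference: `pshufd dst, src, SHUF(a, b, c, d)' sets
        // dst = [src_a, src_b, src_c, src_d], so SHUF(1, 2, 3, 0) rotates
        // the lanes one place towards lower indices and SHUF(3, 0, 1, 2)
        // rotates them one place the other way.)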

        // Apply the row quarterround to each of the columns (yes!)
        // simultaneously.

        // b ^= (a + d) <<< 7
        movdqa  xmm4, xmm0
        paddd   xmm4, xmm1
        movdqa  xmm5, xmm4
        pslld   xmm4, 7
        psrld   xmm5, 25
        por     xmm4, xmm5
        pxor    xmm3, xmm4

        // c ^= (b + a) <<< 9
        movdqa  xmm4, xmm3
        paddd   xmm4, xmm0
        movdqa  xmm5, xmm4
        pslld   xmm4, 9
        psrld   xmm5, 23
        por     xmm4, xmm5
        pxor    xmm2, xmm4

        // d ^= (c + b) <<< 13
        movdqa  xmm4, xmm2
        paddd   xmm4, xmm3
        pshufd  xmm3, xmm3, SHUF(3, 0, 1, 2)
        movdqa  xmm5, xmm4
        pslld   xmm4, 13
        psrld   xmm5, 19
        por     xmm4, xmm5
        pxor    xmm1, xmm4

        // a ^= (d + c) <<< 18
        movdqa  xmm4, xmm1
        pshufd  xmm1, xmm1, SHUF(1, 2, 3, 0)
        paddd   xmm4, xmm2
        pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)
        movdqa  xmm5, xmm4
        pslld   xmm4, 18
        psrld   xmm5, 14
        por     xmm4, xmm5
        pxor    xmm0, xmm4

        // We have to undo the transpose ready for the next loop.  Again,
        // the shuffles have been pushed backwards into the instruction list
        // because they take a long time coming through.  Decrement the loop
        // counter and see if we should go round again.  Later processors
        // fuse this sub/ja pair into a single uop.
        sub     NR, 2
        ja      0b
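
        // (For example, standard Salsa20 has NR = 20, so the loop runs ten
        // times, one trip per doubleround.)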

        // Almost there.  Firstly, the feedforward addition.
        paddd   xmm0, SAVE0                     //  0,  5, 10, 15
        paddd   xmm1, SAVE1                     //  4,  9, 14,  3
        paddd   xmm2, SAVE2                     //  8, 13,  2,  7
        paddd   xmm3, SAVE3                     // 12,  1,  6, 11

        // Next we must undo the permutation which was already applied to the
        // input.  This can be done by juggling values in registers, with the
        // following fancy footwork: some row rotations, a transpose, and
        // some more rotations.
        pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  3,  4,  9, 14
        pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)    //  2,  7,  8, 13
        pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)    //  1,  6, 11, 12

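        // (Recall the interleave semantics: `punpckldq dst, src' sets
        // dst = [dst_0, src_0, dst_1, src_1], and `punpckhdq' likewise
        // gives dst = [dst_2, src_2, dst_3, src_3]; the index traces on
        // the right all follow from this.)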
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm3
        punpckldq xmm0, xmm2                    //  0,  2,  5,  7
        punpckldq xmm3, xmm1                    //  1,  3,  6,  4
        punpckhdq xmm4, xmm2                    // 10,  8, 15, 13
        punpckhdq xmm5, xmm1                    // 11,  9, 12, 14

        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm4
        punpckldq xmm0, xmm3                    //  0,  1,  2,  3
        punpckldq xmm4, xmm5                    // 10, 11,  8,  9
        punpckhdq xmm1, xmm3                    //  5,  6,  7,  4
        punpckhdq xmm2, xmm5                    // 15, 12, 13, 14

        pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  4,  5,  6,  7
        pshufd  xmm4, xmm4, SHUF(2, 3, 0, 1)    //  8,  9, 10, 11
        pshufd  xmm2, xmm2, SHUF(1, 2, 3, 0)    // 12, 13, 14, 15

        // Finally we have to write out the result.
        movdqu  [OUT + 0], xmm0
        movdqu  [OUT + 16], xmm1
        movdqu  [OUT + 32], xmm4
        movdqu  [OUT + 48], xmm2

        // Tidy things up.
#if CPUFAM_X86
        dropfp
        popreg  BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        stfree  64 + 8
#endif

        // And with that, we're done.
        ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------