base/asm-common.h, *.S: Include metadata for 64-bit Windows stack unwinding.
[catacomb] symm/salsa20-x86ish-sse2.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Local utilities.

// Magic constants for shuffling.
#define ROTL 0x93
#define ROT2 0x4e
#define ROTR 0x39

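// These immediates drive PSHUFD, whose selector byte is four 2-bit
// fields: field i picks the source lane that lands in destination lane
// i.  As a worked decoding, for reference (our own note; in the row
// pictures below, the leftmost entry is lane 0):
//
//      ROTL = 0x93 = 10 01 00 11:  lane i moves to lane i + 1 (mod 4)
//      ROT2 = 0x4e = 01 00 11 10:  lane i moves to lane i + 2 (mod 4)
//      ROTR = 0x39 = 00 11 10 01:  lane i moves to lane i + 3 (mod 4)
//
// i.e., ROTL rotates the four lanes up one place, ROTR rotates them
// down one place, and ROT2 swaps the two halves.
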
///--------------------------------------------------------------------------
/// Main code.

        .arch pentium4
        .text

FUNC(salsa20_core_x86ish_sse2)

        // Initial setup.

#if CPUFAM_X86
        // Arguments come in on the stack, and will need to be collected.
        // We can get away with just the scratch registers for integer
        // work, but we'll run out of XMM registers and will need some
        // properly aligned space which we'll steal from the stack.  I
        // don't trust the stack pointer's alignment, so I'll have to mask
        // the stack pointer, which in turn means I'll need to keep track
        // of the old value.  Hence I'm making a full i386-style stack
        // frame here.
        //
        // The Windows and SysV ABIs are sufficiently similar that we
        // don't need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

        push ebp
        mov ebp, esp
        sub esp, 32
        mov IN, [ebp + 12]
        mov OUT, [ebp + 16]
        and esp, ~15
        mov NR, [ebp + 8]
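
        // (For reference: the masked esp gives two aligned 16-byte
        // slots, SAVE2 at [esp + 0] and SAVE3 at [esp + 16], while ebp
        // lets the epilogue restore the caller's esp regardless of how
        // far the `and' moved it.)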
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // This is nice.  We have plenty of XMM registers, and the
        // arguments are in useful places.  There's no need to spill
        // anything and we can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments come in registers, but they're different between
        // Windows and everyone else (and everyone else is saner).
        //
        // The Windows ABI insists that we preserve some of the XMM
        // registers, but we want more than we can use as scratch space.
        // Two of the saved input copies are needed only for the
        // feedforward at the end, so they can live in memory; the other
        // two go in xmm6 and xmm7, which we also want as scratch for the
        // final permutation, so we must spill those registers' old
        // values onto the stack.  (We need an extra 8 bytes to align the
        // stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

        sub rsp, 64 + 8
        .seh_stackalloc 64 + 8
        movdqa [rsp + 0], xmm6
        .seh_savexmm xmm6, 0
        movdqa [rsp + 16], xmm7
        .seh_savexmm xmm7, 16
        .seh_endprologue
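
        // (Frame layout, for reference: [rsp + 0] and [rsp + 16] hold
        // the caller's xmm6 and xmm7; [rsp + 32] and [rsp + 48] are the
        // SAVE2 and SAVE3 slots; and the odd 8 bytes restore 16-byte
        // stack alignment after the call.)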
#endif

        // First job is to slurp the matrix into XMM registers.  The
        // words have already been permuted conveniently to make them
        // line up better for SIMD processing.
        //
        // The textbook arrangement of the matrix is this.
        //
        //      [C K K K]
        //      [K C N N]
        //      [T T C K]
        //      [K K K C]
        //
        // But we've rotated the columns up so that the main diagonal
        // with the constants on it ends up in the first row, giving
        // something more like
        //
        //      [C C C C]
        //      [K T K K]
        //      [T K K N]
        //      [K K N K]
        //
        // so the transformation looks like this:
        //
        //      [ 0  1  2  3]        [ 0  5 10 15]   (a, xmm0)
        //      [ 4  5  6  7]  -->   [ 4  9 14  3]   (b, xmm1)
        //      [ 8  9 10 11]        [ 8 13  2  7]   (c, xmm2)
        //      [12 13 14 15]        [12  1  6 11]   (d, xmm3)
        movdqu xmm0, [IN + 0]
        movdqu xmm1, [IN + 16]
        movdqu xmm2, [IN + 32]
        movdqu xmm3, [IN + 48]

        // Take a copy for later.
        movdqa SAVE0, xmm0
        movdqa SAVE1, xmm1
        movdqa SAVE2, xmm2
        movdqa SAVE3, xmm3
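
        // (Note the asymmetry: the loads are MOVDQU because we make no
        // assumption about IN's alignment, while the saves can be MOVDQA
        // because each SAVEn is either a register or a 16-byte-aligned
        // stack slot.)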

0:
        // Apply a column quarterround to each of the columns
        // simultaneously.  Alas, there doesn't seem to be a packed
        // doubleword rotate, so we have to synthesize it.
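        //
        // (A minimal scalar sketch of the synthesized rotate, in C; the
        // names t and ROTL32 are ours, not from this file:
        //
        //      uint32_t t = a + d;
        //      b ^= (t << 7) | (t >> 25);   /* b ^= ROTL32(a + d, 7) */
        //
        // and likewise for the 9-, 13- and 18-bit rotations below.)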

        // b ^= (a + d) <<< 7
        movdqa xmm4, xmm0
        paddd xmm4, xmm3
        movdqa xmm5, xmm4
        pslld xmm4, 7
        psrld xmm5, 25
        por xmm4, xmm5
        pxor xmm1, xmm4

        // c ^= (b + a) <<< 9
        movdqa xmm4, xmm1
        paddd xmm4, xmm0
        movdqa xmm5, xmm4
        pslld xmm4, 9
        psrld xmm5, 23
        por xmm4, xmm5
        pxor xmm2, xmm4

        // d ^= (c + b) <<< 13
        movdqa xmm4, xmm2
        paddd xmm4, xmm1
        pshufd xmm1, xmm1, ROTL
        movdqa xmm5, xmm4
        pslld xmm4, 13
        psrld xmm5, 19
        por xmm4, xmm5
        pxor xmm3, xmm4

        // a ^= (d + c) <<< 18
        movdqa xmm4, xmm3
        pshufd xmm3, xmm3, ROTR
        paddd xmm4, xmm2
        pshufd xmm2, xmm2, ROT2
        movdqa xmm5, xmm4
        pslld xmm4, 18
        psrld xmm5, 14
        por xmm4, xmm5
        pxor xmm0, xmm4

        // The transpose conveniently only involves reordering elements
        // of individual rows, which can be done quite easily, and
        // reordering the rows themselves, which is a trivial renaming.
        // It doesn't involve any movement of elements between rows.
        //
        //      [ 0  5 10 15]        [ 0  5 10 15]   (a, xmm0)
        //      [ 4  9 14  3]  -->   [ 1  6 11 12]   (b, xmm3)
        //      [ 8 13  2  7]        [ 2  7  8 13]   (c, xmm2)
        //      [12  1  6 11]        [ 3  4  9 14]   (d, xmm1)
        //
        // The shuffles have quite high latency, so they've been pushed
        // backwards into the main instruction list.
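        //
        // (Concretely: the ROTR shuffle of the old d above yields the
        // new b, ROT2 of the old c yields the new c, and the ROTL
        // shuffle of the old b yields the new d -- hence the three
        // PSHUFDs interleaved with the quarterround above.)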

        // Apply the row quarterround to each of the columns (yes!)
        // simultaneously.

        // b ^= (a + d) <<< 7
        movdqa xmm4, xmm0
        paddd xmm4, xmm1
        movdqa xmm5, xmm4
        pslld xmm4, 7
        psrld xmm5, 25
        por xmm4, xmm5
        pxor xmm3, xmm4

        // c ^= (b + a) <<< 9
        movdqa xmm4, xmm3
        paddd xmm4, xmm0
        movdqa xmm5, xmm4
        pslld xmm4, 9
        psrld xmm5, 23
        por xmm4, xmm5
        pxor xmm2, xmm4

        // d ^= (c + b) <<< 13
        movdqa xmm4, xmm2
        paddd xmm4, xmm3
        pshufd xmm3, xmm3, ROTL
        movdqa xmm5, xmm4
        pslld xmm4, 13
        psrld xmm5, 19
        por xmm4, xmm5
        pxor xmm1, xmm4

        // a ^= (d + c) <<< 18
        movdqa xmm4, xmm1
        pshufd xmm1, xmm1, ROTR
        paddd xmm4, xmm2
        pshufd xmm2, xmm2, ROT2
        movdqa xmm5, xmm4
        pslld xmm4, 18
        psrld xmm5, 14
        por xmm4, xmm5
        pxor xmm0, xmm4

        // We had to undo the transpose ready for the next loop; again,
        // the shuffles were pushed backwards because they take a long
        // time coming through.  Now decrement the loop counter and see
        // whether we should go round again.  Later processors fuse this
        // pair into a single uop.
        sub NR, 2
        ja 0b
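
        // (NR is therefore counted in single rounds: each trip through
        // the loop does one column round and one row round -- a double
        // round -- so, presumably, the standard Salsa20/20 core is
        // obtained by passing nr = 20.)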

        // Almost there.  Firstly, the feedforward addition, and then we
        // have to write out the result.  Here we have to undo the
        // permutation which was already applied to the input.  Shuffling
        // has quite high latency, so arrange to start a new shuffle into
        // a temporary as soon as we've written out the old value.
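        //
        // (For reference, tracing the shuffles below, the four lanes of
        // each register land at these output offsets:
        //
        //      xmm0 = [ 0  5 10 15]  ->  OUT +  0, 20, 40, 60
        //      xmm1 = [ 4  9 14  3]  ->  OUT + 16, 36, 56, 12
        //      xmm2 = [ 8 13  2  7]  ->  OUT + 32, 52,  8, 28
        //      xmm3 = [12  1  6 11]  ->  OUT + 48,  4, 24, 44
        //
        // i.e. word w of the matrix ends up at OUT + 4*w.)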
        paddd xmm0, SAVE0
        pshufd xmm4, xmm0, ROTR
        movd [OUT + 0], xmm0

        paddd xmm1, SAVE1
        pshufd xmm5, xmm1, ROTL
        movd [OUT + 16], xmm1

        paddd xmm2, SAVE2
        pshufd xmm6, xmm2, ROT2
        movd [OUT + 32], xmm2

        paddd xmm3, SAVE3
        pshufd xmm7, xmm3, ROTR
        movd [OUT + 48], xmm3

        movd [OUT + 4], xmm7
        pshufd xmm7, xmm3, ROT2
        movd [OUT + 24], xmm7
        pshufd xmm3, xmm3, ROTL
        movd [OUT + 44], xmm3

        movd [OUT + 8], xmm6
        pshufd xmm6, xmm2, ROTL
        movd [OUT + 28], xmm6
        pshufd xmm2, xmm2, ROTR
        movd [OUT + 52], xmm2

        movd [OUT + 12], xmm5
        pshufd xmm5, xmm1, ROTR
        movd [OUT + 36], xmm5
        pshufd xmm1, xmm1, ROT2
        movd [OUT + 56], xmm1

        movd [OUT + 20], xmm4
        pshufd xmm4, xmm0, ROT2
        movd [OUT + 40], xmm4
        pshufd xmm0, xmm0, ROTL
        movd [OUT + 60], xmm0

        // Tidy things up.

#if CPUFAM_X86
        mov esp, ebp
        pop ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
        movdqa xmm6, [rsp + 0]
        movdqa xmm7, [rsp + 16]
        add rsp, 64 + 8
#endif

        // And with that, we're done.
        ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------