/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_x86ish_avx)
        .arch   .avx
        vzeroupper
        endprologue
        // drop through...
ENDFUNC
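
/// The AVX entry point above only needs to clear the upper halves of the
/// YMM registers (avoiding the SSE/AVX transition penalty on some CPUs)
/// before dropping through into the SSE2 code which follows.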

        .arch   pentium4

FUNC(chacha_core_x86ish_sse2)

        // Initial setup.

#if CPUFAM_X86
        // Arguments come in on the stack, and will need to be collected.  We
        // can get away with just the scratch registers for integer work, but
        // we'll run out of XMM registers and will need some properly aligned
        // space which we'll steal from the stack.  I don't trust the stack
        // pointer's alignment, so I'll have to mask the stack pointer, which
        // in turn means I'll need to keep track of the old value.  Hence I'm
        // making a full i386-style stack frame here.
        //
        // The Windows and SysV ABIs are sufficiently similar that we don't
        // need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [SP]

        pushreg BP
        setfp
        stalloc 16
        mov     IN, [BP + 12]
        mov     OUT, [BP + 16]
        and     SP, ~15
        mov     NR, [BP + 8]
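
        // The i386 frame now looks like this:
        //
        //      [BP + 16]       output pointer
        //      [BP + 12]       input pointer
        //      [BP +  8]       number of rounds
        //      [BP +  4]       return address
        //      [BP +  0]       caller's BP
        //        ...
        //      [SP + 0 .. 15]  16-byte-aligned scratch used for SAVE3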
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // This is nice.  We have plenty of XMM registers, and the arguments
        // are in useful places.  There's no need to spill anything and we
        // can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments come in registers, but they're different between Windows
        // and everyone else (and everyone else is saner).
        //
        // The Windows ABI insists that we preserve some of the XMM
        // registers, but we want more than we can use as scratch space.  We
        // only need to save a copy of the input for the feedforward at the
        // end, so we might as well use memory rather than spill extra
        // registers.  (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [SP + 0]
# define SAVE2 [SP + 16]
# define SAVE3 [SP + 32]

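        // On entry the return address leaves SP only 8-byte aligned, so 48
        // bytes of save area plus 8 bytes of padding brings the three XMM
        // save slots back to 16-byte alignment.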
        stalloc 48 + 8
#endif

        endprologue

        // First job is to slurp the matrix into XMM registers.  Be careful:
        // the input matrix isn't likely to be properly aligned.
        //
        //      [ 0  1  2  3] (a, xmm0)
        //      [ 4  5  6  7] (b, xmm1)
        //      [ 8  9 10 11] (c, xmm2)
        //      [12 13 14 15] (d, xmm3)
        movdqu  xmm0, [IN + 0]
        movdqu  xmm1, [IN + 16]
        movdqu  xmm2, [IN + 32]
        movdqu  xmm3, [IN + 48]

        // Take a copy for later.  This one is aligned properly, by
        // construction.
        movdqa  SAVE0, xmm0
        movdqa  SAVE1, xmm1
        movdqa  SAVE2, xmm2
        movdqa  SAVE3, xmm3

0:
        // Apply a column quarterround to each of the columns simultaneously.
        // Alas, there doesn't seem to be a packed doubleword rotate, so we
        // have to synthesize it.

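        // Each rotation is synthesized from two shifts and an OR: roughly,
        // for each 32-bit lane,
        //
        //      t = x >> (32 - n);  x = (x << n) | t;
        //
        // with a spare XMM register playing the part of t.
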
        // a += b; d ^= a; d <<<= 16
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 16
        psrld   xmm4, 16
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 12
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm4, xmm1
        pslld   xmm1, 12
        psrld   xmm4, 20
        por     xmm1, xmm4

        // a += b; d ^= a; d <<<= 8
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 8
        psrld   xmm4, 24
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 7
        paddd   xmm2, xmm3
        pshufd  xmm3, xmm3, SHUF(3, 0, 1, 2)
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)
        movdqa  xmm4, xmm1
        pslld   xmm1, 7
        psrld   xmm4, 25
        por     xmm1, xmm4

        // The not-quite-transpose conveniently only involves reordering
        // elements of individual rows, which can be done quite easily.  It
        // doesn't involve any movement of elements between rows, or even
        // renaming of the rows.
        //
        //      [ 0  1  2  3]           [ 0  1  2  3] (a, xmm0)
        //      [ 4  5  6  7]    -->    [ 5  6  7  4] (b, xmm1)
        //      [ 8  9 10 11]           [10 11  8  9] (c, xmm2)
        //      [12 13 14 15]           [15 12 13 14] (d, xmm3)
        //
        // The shuffles have quite high latency, so they've mostly been
        // pushed upwards.  The remaining one can't be moved, though.
        pshufd  xmm1, xmm1, SHUF(1, 2, 3, 0)

        // Apply the diagonal quarterround to each of the diagonals
        // simultaneously.  (Thanks to the rotation above, the diagonals are
        // now lined up in the columns.)

        // a += b; d ^= a; d <<<= 16
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 16
        psrld   xmm4, 16
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 12
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm4, xmm1
        pslld   xmm1, 12
        psrld   xmm4, 20
        por     xmm1, xmm4

        // a += b; d ^= a; d <<<= 8
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 8
        psrld   xmm4, 24
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 7
        paddd   xmm2, xmm3
        pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)
        movdqa  xmm4, xmm1
        pslld   xmm1, 7
        psrld   xmm4, 25
        por     xmm1, xmm4

        // Finally, finish off undoing the transpose, and we're done for this
        // doubleround.  Again, most of this was done above so we don't have
        // to wait for the shuffles.
        pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)

        // Decrement the loop counter and see if we should go round again.
        sub     NR, 2
        ja      0b
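
        // NR counts rounds: each pass through the loop is one doubleround
        // (a column round followed by a diagonal round), so ChaCha20, with
        // its twenty rounds, makes ten trips.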

        // Almost there.  Firstly, the feedforward addition.
        paddd   xmm0, SAVE0
        paddd   xmm1, SAVE1
        paddd   xmm2, SAVE2
        paddd   xmm3, SAVE3

        // And now we write out the result.  This one won't be aligned
        // either.
        movdqu  [OUT + 0], xmm0
        movdqu  [OUT + 16], xmm1
        movdqu  [OUT + 32], xmm2
        movdqu  [OUT + 48], xmm3

        // Tidy things up.
#if CPUFAM_X86
        dropfp
        popreg  BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stfree  48 + 8
#endif

        // And with that, we're done.
        ret

ENDFUNC

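/// For reference, the argument handling above corresponds roughly to a C
/// prototype along these lines (the names are illustrative, not taken from
/// Catacomb's headers):
///
///     void chacha_core_x86ish_sse2(unsigned nrounds,
///                                  const uint32_t src[16],
///                                  uint32_t dst[16]);
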
FUNC(chacha_multi_i386_sse2)
        // Arguments are on the stack:
        //
        //      [sp +  4]       pointer to state
        //      [sp +  8]       input pointer (or null)
        //      [sp + 12]       output pointer
        //      [sp + 16]       number of blocks to process
        //      [sp + 20]       number of rounds per block

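        // In C terms this is roughly (the names here are illustrative, not
        // taken from Catacomb's headers):
        //
        //      void chacha_multi_i386_sse2(void *state, const void *src,
        //                                  void *dst, size_t nblocks,
        //                                  unsigned nrounds);
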
        pushreg SI
        pushreg DI
        pushreg BX
        stalloc 4*64
        endprologue

        // Load the arguments.
        mov     BX, [SP + 272]          // = state pointer
        mov     SI, [SP + 276]          // = source pointer
        mov     DI, [SP + 280]          // = destination pointer
        mov     CX, [SP + 284]          // = block count
        mov     DX, [SP + 288]          // = (initial) round count
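        // (These offsets are the [sp + 4] ... [sp + 20] slots above, pushed
        // further out by the three saved registers and the 256 bytes of
        // scratch: 4 + 12 + 256 = 272, and so on in steps of four.)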

        // Do chunks of four blocks at a time.
        sub     CX, 4
        jb      8f

        // Inhale the initial state.
        movdqu  xmm1, [BX + 0]
        movdqu  xmm3, [BX + 16]
        movdqu  xmm5, [BX + 32]
        movdqu  xmm0, [BX + 48]

        // Set the counters and initialize the working blocks.
        pxor    xmm2, xmm2
        pxor    xmm4, xmm4
        pxor    xmm6, xmm6
        pxor    xmm7, xmm7

        xor     eax, eax
        mov     al, 1
        pinsrw  xmm2, eax, 4
        mov     al, 2
        pinsrw  xmm4, eax, 4
        mov     al, 3
        pinsrw  xmm6, eax, 4
        mov     al, 4
        pinsrw  xmm7, eax, 4

        movdqa  [SP + 16], xmm3
        movdqa  [SP + 32], xmm5
        movdqa  [SP + 48], xmm0

        paddq   xmm2, xmm0              // d + 1 (xmm0 holds the d row)
        paddq   xmm4, xmm0              // d + 2
        paddq   xmm6, xmm0              // d + 3
        paddq   xmm7, xmm0              // d + 4, written back below

        movdqu  [BX + 48], xmm7

        // a += b; d ^= a; d <<<= 16
        paddd   xmm1, xmm3              // a += b

        movdqa  [SP + 0], xmm1

        pxor    xmm0, xmm1              // d ^= a
        pxor    xmm2, xmm1
        pxor    xmm4, xmm1
        pxor    xmm6, xmm1

        movdqa  xmm1, xmm0
        movdqa  xmm3, xmm2
        movdqa  xmm5, xmm4
        movdqa  xmm7, xmm6

        pslld   xmm0, 16                // d << 16
        pslld   xmm2, 16
        pslld   xmm4, 16
        pslld   xmm6, 16

        psrld   xmm1, 16                // d >> 16
        psrld   xmm3, 16
        psrld   xmm5, 16
        psrld   xmm7, 16

        por     xmm0, xmm1              // d <<<= 16
        movdqa  xmm1, [SP + 32]         // reload the c row
        por     xmm2, xmm3
        movdqa  xmm3, [SP + 16]         // reload the b row
        por     xmm4, xmm5
        por     xmm6, xmm7

        movdqa  [SP + 48], xmm0
        movdqa  [SP + 112], xmm2
        movdqa  [SP + 176], xmm4
        movdqa  [SP + 240], xmm6

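        // The 4*64 bytes of scratch hold one 64-byte working block per
        // chunk member: block k's rows live at [SP + 64*k], 16 bytes per
        // row.  Block 0's slots double as the parking space for the shared
        // rows loaded above.
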
        // c += d; b ^= c; b <<<= 12
        paddd   xmm0, xmm1              // c += d
        paddd   xmm2, xmm1
        paddd   xmm4, xmm1
        paddd   xmm6, xmm1

        movdqa  [SP + 32], xmm0         // stash the new c rows
        movdqa  [SP + 96], xmm2
        movdqa  [SP + 160], xmm4
        movdqa  [SP + 224], xmm6

        pxor    xmm0, xmm3              // b ^= c
        pxor    xmm2, xmm3
        pxor    xmm4, xmm3
        pxor    xmm6, xmm3

        movdqa  xmm1, xmm0
        movdqa  xmm3, xmm2
        movdqa  xmm5, xmm4
        movdqa  xmm7, xmm6

        pslld   xmm0, 12                // b << 12
        pslld   xmm2, 12
        pslld   xmm4, 12
        pslld   xmm6, 12

        psrld   xmm1, 20                // b >> 20
        psrld   xmm3, 20
        psrld   xmm5, 20
        psrld   xmm7, 20

        por     xmm0, xmm1              // b <<<= 12
        por     xmm2, xmm3
        por     xmm4, xmm5
        por     xmm6, xmm7

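        // (Work in progress: the rest of the doubleround, the feedforward
        // and the output stage for the four-block path are still to be
        // written, as is the `8:' tail for fewer than four blocks.)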
ENDFUNC

///----- That's all, folks --------------------------------------------------