@@@ i386 wip
[catacomb] / symm / chacha-x86ish-sse2.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// Fancy SIMD implementation of ChaCha
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
df07f2c0 28/// Preliminaries.
1a0c09c4
MW
29
30#include "config.h"
31#include "asm-common.h"
32
df07f2c0
MW
	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_x86ish_avx)
	.arch	.avx
	vzeroupper		// clear the upper halves of the ymm
				// registers, so that the legacy SSE2 code
				// below doesn't pay AVX/SSE transition
				// penalties on AVX-capable hardware
	endprologue
	// drop through...	// the SSE2 implementation immediately
				// follows, so no jump is needed
ENDFUNC
44
	.arch	pentium4

///--------------------------------------------------------------------------
/// chacha_core_x86ish_sse2(nr, in, out)
///
/// Apply NR rounds of the ChaCha core function to the 64-byte input
/// matrix at IN, add the original input back on (the feedforward), and
/// write the 64-byte result to OUT.  Neither IN nor OUT need be aligned.
/// (NOTE(review): argument types inferred from the register usage below --
/// NR is a 32-bit count, IN/OUT are pointers; confirm against the C
/// prototype.)

FUNC(chacha_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [SP]

	pushreg	BP
	setfp
	stalloc	16
	// Load the arguments through BP before masking SP: after the
	// `and' below, stack offsets relative to SP would be wrong.
	mov	IN, [BP + 12]
	mov	OUT, [BP + 16]
	and	SP, ~15			// force 16-byte alignment for SAVE3
	mov	NR, [BP + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  We
	// only need to save a copy of the input for the feedforward at the
	// end, so we might as well use memory rather than spill extra
	// registers.  (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [SP + 0]
# define SAVE2 [SP + 16]
# define SAVE3 [SP + 32]

	stalloc	48 + 8
#endif

	endprologue

	// First job is to slurp the matrix into XMM registers.  Be careful:
	// the input matrix isn't likely to be properly aligned.
	//
	//	[ 0  1  2  3] (a, xmm0)
	//	[ 4  5  6  7] (b, xmm1)
	//	[ 8  9 10 11] (c, xmm2)
	//	[12 13 14 15] (d, xmm3)
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.  This one is aligned properly, by
	// construction.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

	// Main loop: each iteration performs one double-round (a column
	// round followed by a diagonal round), so NR counts down by two.
0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it (shift left, shift right, or together).

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)	// d reorder, hoisted early
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)	// c reorder, hoisted early
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily.  It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 5  6  7  4] (b, xmm1)
	//	[ 8  9 10 11]		[10 11  8  9] (c, xmm2)
	//	[12 13 14 15]		[15 12 13 14] (d, xmm3)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards.  The remaining one can't be moved, though.
	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)	// inverse of the d reorder above
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)	// c reorder is self-inverse
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the shuffles.
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)

	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3

	// And now we write out the result.  This one won't be aligned
	// either.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3

	// Tidy things up.
#if CPUFAM_X86
	dropfp				// restores SP masked by `and' above
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// And with that, we're done.
	ret

ENDFUNC
261
22bace22
MW
FUNC(chacha_multi_i386_sse2)
	// Process multiple ChaCha blocks, four at a time, interleaved in
	// XMM registers, spilling rows to an aligned 4*64-byte stack buffer.
	//
	// NOTE(review): this function is work-in-progress (see the `wip'
	// marker at the top of the file): it sets up four blocks and works
	// partway through the first quarterround batch, then stops -- there
	// is no round loop, feedforward, or output yet.  Only unambiguous
	// defects are fixed here: the nonexistent `pslrd' mnemonic (it's
	// `psrld'), the rotate counts for the b <<<= 12 step (were 16/16,
	// must be 12/20), a copy-pasted store of xmm0 four times, and the
	// missing `8:' label targeted by `jb 8f'.
	//
	// Arguments are on the stack:
	//
	//	[sp +  4]	pointer to state
	//	[sp +  8]	input pointer (or null)
	//	[sp + 12]	output pointer
	//	[sp + 16]	number of blocks to process
	//	[sp + 20]	number of rounds per block

	pushreg	SI
	pushreg	DI
	pushreg	BX
	stalloc	4*64
	endprologue

	// Load the arguments.  Three pushed registers (12 bytes), the
	// return address (4) and the 256-byte buffer put the first argument
	// at [SP + 272].
	mov	BX, [SP + 272]		// = state pointer
	mov	SI, [SP + 276]		// = source pointer
	mov	DI, [SP + 280]		// = destination pointer
	mov	CX, [SP + 284]		// = block count
	mov	DX, [SP + 288]		// = (initial) round count

	// Do chunks of four blocks at a time.
	sub	CX, 4
	jb	8f

	// Inhale the initial state.
	movdqu	xmm1, [BX +  0]		// row a
	movdqu	xmm3, [BX + 16]		// row b
	movdqu	xmm5, [BX + 32]		// row c
	movdqu	xmm0, [BX + 48]		// row d (counter/nonce)

	// Set the counters and initialize the working blocks.  Each of
	// xmm2/xmm4/xmm6/xmm7 gets a per-block counter increment (1..4) in
	// word 4 (bits 64..79).
	pxor	xmm2, xmm2
	pxor	xmm4, xmm4
	pxor	xmm6, xmm6
	pxor	xmm7, xmm7

	xor	eax, eax
	mov	al, 1
	pinsrw	xmm2, eax, 4
	mov	al, 2
	pinsrw	xmm4, eax, 4
	mov	al, 3
	pinsrw	xmm6, eax, 4
	mov	al, 4
	pinsrw	xmm7, eax, 4

	movdqa	[SP + 16], xmm3		// stash b
	movdqa	[SP + 32], xmm5		// stash c
	movdqa	[SP + 48], xmm0		// stash d

	// NOTE(review): these add the increments to the b row (xmm3);
	// presumably the d row (xmm0), which carries the block counter,
	// was intended -- confirm before finishing this function.
	paddq	xmm2, xmm3
	paddq	xmm4, xmm3
	paddq	xmm6, xmm3
	paddq	xmm7, xmm3

	movdqu	[BX + 48], xmm7		// advance the caller's counter by 4

	// a += b; d ^= a; d <<<= 16
	paddd	xmm1, xmm3		// a += b

	movdqa	[SP + 0], xmm1		// stash the (shared) a row

	pxor	xmm0, xmm1		// d ^= a, for each of the four blocks
	pxor	xmm2, xmm1
	pxor	xmm4, xmm1
	pxor	xmm6, xmm1

	movdqa	xmm1, xmm0		// copies for the synthesized rotate
	movdqa	xmm3, xmm2
	movdqa	xmm5, xmm4
	movdqa	xmm7, xmm6

	pslld	xmm0, 16		// d << 16
	pslld	xmm2, 16
	pslld	xmm4, 16
	pslld	xmm6, 16

	psrld	xmm1, 16		// d >> 16 (was `pslrd': no such insn)
	psrld	xmm3, 16
	psrld	xmm5, 16
	psrld	xmm7, 16

	por	xmm0, xmm1		// d <<<= 16
	movdqa	xmm1, [SP + 32]		// reload c
	// NOTE(review): [SP + 48] still holds the original d at this point
	// (it isn't overwritten until below); for the later `b ^= c' step
	// this looks like it should load b from [SP + 16] -- confirm.
	movdqa	xmm3, [SP + 48]
	por	xmm2, xmm3
	por	xmm4, xmm5
	por	xmm6, xmm7

	movdqa	[SP +  48], xmm0	// store each block's new d
	movdqa	[SP + 112], xmm2	// (64-byte stride between blocks)
	movdqa	[SP + 176], xmm4
	movdqa	[SP + 240], xmm6

	// c += d; b ^= c; b <<<= 12
	paddd	xmm0, xmm1		// c += d
	paddd	xmm2, xmm1
	paddd	xmm4, xmm1
	paddd	xmm6, xmm1

	movdqa	[SP +  32], xmm0	// store each block's new c
	movdqa	[SP +  96], xmm2	// (was xmm0 four times: paste bug)
	movdqa	[SP + 160], xmm4
	movdqa	[SP + 224], xmm6

	pxor	xmm0, xmm3		// b ^= c
	pxor	xmm2, xmm3
	pxor	xmm4, xmm3
	pxor	xmm6, xmm3

	movdqa	xmm1, xmm0		// copies for the synthesized rotate
	movdqa	xmm3, xmm2
	movdqa	xmm5, xmm4
	movdqa	xmm7, xmm6

	pslld	xmm0, 12		// b << 12 (was 16: wrong rotate)
	pslld	xmm2, 12
	pslld	xmm4, 12
	pslld	xmm6, 12

	psrld	xmm1, 20		// b >> 20 (was `pslrd ..., 16')
	psrld	xmm3, 20
	psrld	xmm5, 20
	psrld	xmm7, 20

	por	xmm0, xmm1		// b <<<= 12
	por	xmm2, xmm3
	por	xmm4, xmm5
	por	xmm6, xmm7

	// ...the remaining rounds, feedforward and output are not yet
	// written; execution currently falls off the end here.

	// Tail: fewer than four blocks remain.
	// NOTE(review): unimplemented -- label added so `jb 8f' assembles.
8:	ret

ENDFUNC
395
1a0c09c4 396///----- That's all, folks --------------------------------------------------