/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected. We
	// can get away with just the scratch registers for integer work,
	// but we'll run out of XMM registers and will need some properly
	// aligned space which we'll steal from the stack. I don't trust the
	// stack pointer's alignment, so I'll have to mask the stack pointer,
	// which in turn means I'll need to keep track of the old value.
	// Hence I'm making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.
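	//
	// (A sketch of the resulting frame, as the code below assumes it;
	// the offsets follow from the pushes and loads here, not from any
	// ABI document:
	//
	//	[ebp + 16]	output pointer
	//	[ebp + 12]	input pointer
	//	[ebp +  8]	round count
	//	[ebp +  4]	return address
	//	[ebp +  0]	caller's ebp
	//	[esp +  0]	SAVE2 \  32 bytes of 16-byte-aligned
	//	[esp + 16]	SAVE3 /  scratch for spilled rows
	// )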

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	push ebp
	mov ebp, esp
	sub esp, 32
	mov IN, [ebp + 12]
	mov OUT, [ebp + 16]
	and esp, ~15
	mov NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice. We have plenty of XMM registers, and the arguments
	// are in useful places. There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space. In
	// two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack. (We need an
	// extra 8 bytes to align the stack.)
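	//
	// (A sketch of the stack area this allocates, as used below; the
	// layout is inferred from the code rather than stated anywhere:
	//
	//	[rsp +  0]	caller's xmm6, restored in the epilogue
	//	[rsp + 16]	caller's xmm7, restored in the epilogue
	//	[rsp + 32]	SAVE2
	//	[rsp + 48]	SAVE3
	//	[rsp + 64]	8 bytes of padding for alignment
	// )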

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

	sub rsp, 64 + 8
	.seh_stackalloc 64 + 8
	movdqa [rsp + 0], xmm6
	.seh_savexmm xmm6, 0
	movdqa [rsp + 16], xmm7
	.seh_savexmm xmm7, 16
	.seh_endprologue
#endif

	// First job is to slurp the matrix into XMM registers. The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]	    [ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7] -->   [ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]	    [ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]	    [12  1  6 11] (d, xmm3)
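	//
	// (Equivalently, word i of the permuted layout is word 5*i mod 16
	// of the textbook matrix. A rough C sketch of the permutation the
	// caller is assumed to have applied -- the names m and x here are
	// invented for illustration only:
	//
	//	for (i = 0; i < 16; i++) x[i] = m[5*i % 16];
	// )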
	movdqu xmm0, [IN + 0]
	movdqu xmm1, [IN + 16]
	movdqu xmm2, [IN + 32]
	movdqu xmm3, [IN + 48]

	// Take a copy for later.
	movdqa SAVE0, xmm0
	movdqa SAVE1, xmm1
	movdqa SAVE2, xmm2
	movdqa SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.

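	// (For reference, the whole quarterround in C, as the step comments
	// below spell out; rotl32 is a made-up helper for the synthesized
	// rotate:
	//
	//	uint32_t rotl32(uint32_t x, unsigned n)
	//		{ return (x << n) | (x >> (32 - n)); }
	//
	//	b ^= rotl32(a + d,  7);
	//	c ^= rotl32(b + a,  9);
	//	d ^= rotl32(c + b, 13);
	//	a ^= rotl32(d + c, 18);
	//
	// Each pslld/psrld/por triple below computes rotl32 on all four
	// lanes at once, with xmm5 holding the right-shifted copy.)
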
	// b ^= (a + d) <<< 7
	movdqa xmm4, xmm0
	paddd xmm4, xmm3
	movdqa xmm5, xmm4
	pslld xmm4, 7
	psrld xmm5, 25
	por xmm4, xmm5
	pxor xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa xmm4, xmm1
	paddd xmm4, xmm0
	movdqa xmm5, xmm4
	pslld xmm4, 9
	psrld xmm5, 23
	por xmm4, xmm5
	pxor xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa xmm4, xmm2
	paddd xmm4, xmm1
	pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa xmm5, xmm4
	pslld xmm4, 13
	psrld xmm5, 19
	por xmm4, xmm5
	pxor xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa xmm4, xmm3
	pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd xmm4, xmm2
	pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa xmm5, xmm4
	pslld xmm4, 18
	psrld xmm5, 14
	por xmm4, xmm5
	pxor xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming. It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]	    [ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3] -->   [ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]	    [ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]	    [ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

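	// (For reference: pshufd dst, src, imm sets lane j of dst to lane
	// (imm >> 2*j) & 3 of src, i.e. in C terms
	//
	//	for (j = 0; j < 4; j++) dst[j] = src[(imm >> 2*j) & 3];
	//
	// Assuming SHUF packs its arguments into the immediate
	// most-significant first, SHUF(2, 1, 0, 3) therefore rotates the
	// lanes up one place, SHUF(0, 3, 2, 1) rotates them down one place,
	// and SHUF(1, 0, 3, 2) swaps the two halves.)
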
	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa xmm4, xmm0
	paddd xmm4, xmm1
	movdqa xmm5, xmm4
	pslld xmm4, 7
	psrld xmm5, 25
	por xmm4, xmm5
	pxor xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa xmm4, xmm3
	paddd xmm4, xmm0
	movdqa xmm5, xmm4
	pslld xmm4, 9
	psrld xmm5, 23
	por xmm4, xmm5
	pxor xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa xmm4, xmm2
	paddd xmm4, xmm3
	pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa xmm5, xmm4
	pslld xmm4, 13
	psrld xmm5, 19
	por xmm4, xmm5
	pxor xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa xmm4, xmm1
	pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd xmm4, xmm2
	pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa xmm5, xmm4
	pslld xmm4, 18
	psrld xmm5, 14
	por xmm4, xmm5
	pxor xmm0, xmm4

	// We had to undo the transpose ready for the next loop. Again, push
	// back the shuffles because they take a long time coming through.
	// Decrement the loop counter and see if we should go round again:
	// each trip round the loop is two rounds (a column round and a row
	// round), hence the step of two. Recent processors fuse the
	// sub/branch pair into a single uop.
	sub NR, 2
	ja 0b

	// Almost there. Firstly, the feedforward addition, and then we have
	// to write out the result. Here we have to undo the permutation
	// which was already applied to the input. Shuffling has quite high
	// latency, so arrange to start a new shuffle into a temporary as
	// soon as we've written out the old value.
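	//
	// (The scattered store offsets below implement the inverse of the
	// input permutation sketched above: out[5*i % 16] = x[i], with each
	// movd writing lane 0 and a pshufd bringing the next word of
	// interest down into lane 0.)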
	paddd xmm0, SAVE0
	pshufd xmm4, xmm0, SHUF(0, 3, 2, 1)
	movd [OUT + 0], xmm0

	paddd xmm1, SAVE1
	pshufd xmm5, xmm1, SHUF(2, 1, 0, 3)
	movd [OUT + 16], xmm1

	paddd xmm2, SAVE2
	pshufd xmm6, xmm2, SHUF(1, 0, 3, 2)
	movd [OUT + 32], xmm2

	paddd xmm3, SAVE3
	pshufd xmm7, xmm3, SHUF(0, 3, 2, 1)
	movd [OUT + 48], xmm3

	movd [OUT + 4], xmm7
	pshufd xmm7, xmm3, SHUF(1, 0, 3, 2)
	movd [OUT + 24], xmm7
	pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
	movd [OUT + 44], xmm3

	movd [OUT + 8], xmm6
	pshufd xmm6, xmm2, SHUF(2, 1, 0, 3)
	movd [OUT + 28], xmm6
	pshufd xmm2, xmm2, SHUF(0, 3, 2, 1)
	movd [OUT + 52], xmm2

	movd [OUT + 12], xmm5
	pshufd xmm5, xmm1, SHUF(0, 3, 2, 1)
	movd [OUT + 36], xmm5
	pshufd xmm1, xmm1, SHUF(1, 0, 3, 2)
	movd [OUT + 56], xmm1

	movd [OUT + 20], xmm4
	pshufd xmm4, xmm0, SHUF(1, 0, 3, 2)
	movd [OUT + 40], xmm4
	pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
	movd [OUT + 60], xmm0

	// Tidy things up.
#if CPUFAM_X86
	mov esp, ebp
	pop ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa xmm6, [rsp + 0]
	movdqa xmm7, [rsp + 16]
	add rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------