/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.
	// We can get away with just the scratch registers for integer
	// work, but we'll run out of XMM registers and will need some
	// properly aligned space which we'll steal from the stack.  I
	// don't trust the stack pointer's alignment, so I'll have to mask
	// the stack pointer, which in turn means I'll need to keep track
	// of the old value.  Hence I'm making a full i386-style stack
	// frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we
	// don't need to worry about the differences here.

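	// As a sketch of the frame we're about to build (inferred from the
	// loads below; the argument meanings are an inference from how
	// they're used, not stated in the original):
	//
	//	[ebp + 16]	third argument: output pointer (OUT)
	//	[ebp + 12]	second argument: input pointer (IN)
	//	[ebp +  8]	first argument: round count (NR)
	//	[ebp +  4]	return address
	//	[ebp +  0]	caller's ebp
	//	...
	//	[esp +  0]	32 bytes of 16-byte-aligned scratch
	//			(SAVE2 and SAVE3)
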
# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between
	// Windows and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.
	// In two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack.  (We need an
	// extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

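	// A rough picture of the area reserved below (an inference from
	// the offsets used, not original commentary):
	//
	//	[rsp +  0]	saved xmm6
	//	[rsp + 16]	saved xmm7
	//	[rsp + 32]	SAVE2
	//	[rsp + 48]	SAVE3
	//	[rsp + 64]	8 spare bytes that keep the stack aligned
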
	sub	rsp, 64 + 8
	.seh_stackalloc 64 + 8
	movdqa	[rsp + 0], xmm6
	.seh_savexmm xmm6, 0
	movdqa	[rsp + 16], xmm7
	.seh_savexmm xmm7, 16
	.seh_endprologue
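	// (The .seh_* directives above emit no instructions; they just
	// describe the stack allocation and saved XMM registers to the
	// Windows unwinder.)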
#endif

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]  -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
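	//
	// Purely as an illustration (this C table is not part of the
	// library's interface), the diagram above means that word k of the
	// input buffer loaded below is textbook word perm[k], where
	//
	//	static const unsigned char perm[16] = {
	//		 0,  5, 10, 15,		/* a */
	//		 4,  9, 14,  3,		/* b */
	//		 8, 13,  2,  7,		/* c */
	//		12,  1,  6, 11,		/* d */
	//	};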
	movdqu	xmm0, [IN + 0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.

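	// For reference, here's one quarterround in scalar C, with the
	// rotate spelled out the same way the SSE2 code synthesizes it
	// (shift left, shift right, OR).  This is an illustrative sketch,
	// not code from the library:
	//
	//	uint32_t t;
	//	t = a + d;  b ^= (t <<  7) | (t >> 25);
	//	t = b + a;  c ^= (t <<  9) | (t >> 23);
	//	t = c + b;  d ^= (t << 13) | (t >> 19);
	//	t = d + c;  a ^= (t << 18) | (t >> 14);
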
	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]  -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

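	// (After the renaming above, the row round that follows is the
	// same quarterround again, just computed with a = xmm0, b = xmm3,
	// c = xmm2 and d = xmm1.)
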
	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again,
	// push back the shuffles because they take a long time coming
	// through.  Decrement the loop counter and see if we should go
	// round again.  Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11

	// Next we must undo the permutation which was already applied to
	// the input.  This can be done by juggling values in registers,
	// with the following fancy footwork: some row rotations, a
	// transpose, and some more rotations.
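	//
	// As a rough scalar sketch (illustrative only, reusing the perm[]
	// table suggested where the input was loaded): if y[0..15] are the
	// sixteen lanes in the working order above, the juggling below has
	// the same effect as
	//
	//	for (i = 0; i < 16; i++) out[perm[i]] = y[i];
	//
	// i.e. each word is scattered back to its textbook position.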
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	//  1,  6, 11, 12

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15

	// Finally we have to write out the result.
	movdqu	[OUT + 0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up.
#if CPUFAM_X86
	mov	esp, ebp
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa	xmm6, [rsp + 0]
	movdqa	xmm7, [rsp + 16]
	add	rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------