math/Makefile.am, symm/Makefile.am: Use `--no-install' on oddball tests.
[catacomb] / symm / salsa20-x86ish-sse2.S
CommitLineData
1a0c09c4
MW
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	// Minimum ISA baseline: SSE2 (movdqu/pshufd etc. below).
	.arch	pentium4
	.text

///--------------------------------------------------------------------------
/// salsa20_core_x86ish_sse2 -- Salsa20 core in SSE2 for i386 and amd64.
///
/// Reads a 64-byte (16-word) input matrix from IN, applies NR/2 double-
/// rounds of the Salsa20 core plus the feedforward addition, and writes the
/// 64-byte result to OUT.  The caller supplies (round count, source,
/// destination) -- presumably `void f(unsigned nr, const uint32 *src,
/// uint32 *dst)'; confirm against the C prototype, which is not visible
/// here.
///
/// The input words are expected pre-permuted (diagonals rotated into rows;
/// see the matrix diagrams below); the permutation is undone before the
/// final store, so IN and OUT hold the conventional layout externally.
/// Loads and stores are unaligned (movdqu), so no alignment of IN/OUT is
/// assumed.  Clobbers xmm0--xmm5 everywhere, plus flags; per-ABI register
/// assignments and spills are set up below.

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	pushreg	ebp
	setfp	ebp
	sub	esp, 32			// two 16-byte save slots
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15		// force 16-byte alignment for movdqa
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  In
	// two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack.  (We need an
	// extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

	stalloc	64 + 8			// xmm6/7 saves + SAVE2/3 + align
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif

	endprologue

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it end up in the first row, giving something more
	// like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
	movdqu	xmm0, [IN + 0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later (the feedforward addition).
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it from a shift pair and an OR.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	 pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	// pre-rotate b for the
						// row round (see below)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	 pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	// pre-rotate d likewise
	paddd	xmm4, xmm2
	 pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	// pre-rotate c likewise
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list above.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	 pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)	// undo the transpose early,
						// ready for the next loop
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	 pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)	// more transpose-undoing
	paddd	xmm4, xmm2
	 pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	// ... and the last of it
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again, push
	// back the shuffles because they take a long time coming through.
	// Decrement the loop counter and see if we should go round again.
	// Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11

	// Next we must undo the permutation which was already applied to the
	// input.  This can be done by juggling values in registers, with the
	// following fancy footwork: some row rotations, a transpose, and
	// some more rotations.
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	//  1,  6, 11, 12

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15

	// Finally we have to write out the result.  (Note rows 2 and 3 live
	// in xmm4 and xmm2 respectively after the juggling above.)
	movdqu	[OUT + 0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------