/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Local utilities.

// Magic constants for shuffling.
#define ROTL 0x93
#define ROT2 0x4e
#define ROTR 0x39
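
// A decoding note (not from the original source, but easily checked
// against the instruction set manuals): pshufd's immediate packs four
// 2-bit source-lane indices, destination lane 0 in the low bits, so
//
//	ROTL = 0x93 = (2, 1, 0, 3): rotate lanes up one place
//	ROT2 = 0x4e = (1, 0, 3, 2): rotate lanes by two places
//	ROTR = 0x39 = (0, 3, 2, 1): rotate lanes down one place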

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

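	// As a sketch (deduced from the code below; the layout is ours, not
	// mandated by any ABI), the frame ends up like this, with esp
	// freshly aligned to a 16-byte boundary:
	//
	//	[ebp + 16]	OUT argument
	//	[ebp + 12]	IN argument
	//	[ebp +  8]	NR argument
	//	[ebp +  4]	return address
	//	[ebp +  0]	caller's ebp
	//	   ...		alignment slop
	//	[esp + 16]	SAVE3
	//	[esp +  0]	SAVE2
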
#  define NR ecx
#  define IN eax
#  define OUT edx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [esp + 0]
#  define SAVE3 [esp + 16]

	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

#  define NR edi
#  define IN rsi
#  define OUT rdx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 xmm8
#  define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  In
	// two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack.  (We need an
	// extra 8 bytes to align the stack.)

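	// As a sketch, the 64 + 8 byte frame built below looks like this
	// (the odd 8 bytes restore the 16-byte stack alignment that the
	// call dropped to 8):
	//
	//	[rsp + 64]	alignment padding
	//	[rsp + 48]	SAVE3
	//	[rsp + 32]	SAVE2
	//	[rsp + 16]	caller's xmm7
	//	[rsp +  0]	caller's xmm6
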
#  define NR ecx
#  define IN rdx
#  define OUT r8
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [rsp + 32]
#  define SAVE3 [rsp + 48]

	sub	rsp, 64 + 8
	.seh_stackalloc 64 + 8
	movdqa	[rsp + 0], xmm6
	.seh_savexmm xmm6, 0
	movdqa	[rsp + 16], xmm7
	.seh_savexmm xmm7, 16
	.seh_endprologue
#endif

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]	-->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
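	//
	// In C terms, the permutation the caller is expected to have applied
	// is just this (a sketch, not code from Catacomb):
	//
	//	for (i = 0; i < 16; i++) out[i] = in[5*i % 16];
	//
	// which matches the index pattern in the right-hand matrix above.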
	movdqu	xmm0, [IN + 0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.

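	// Each of the four steps below computes, in C terms (a sketch; x, y,
	// z stand for whichever rows the step's comment names, n for the
	// rotation count):
	//
	//	uint32_t t = x + y;
	//	z ^= (t << n) | (t >> (32 - n));
	//
	// with xmm4 carrying the left shift and xmm5 the right shift.
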
	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, ROTL
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, ROTR
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, ROT2
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]	-->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

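	// (To see how the shuffles already issued above achieve this: ROTL
	// turned the old b into the new d, ROTR turned the old d into the
	// new b, and ROT2 rotated c into place; the rest is just renaming,
	// which costs nothing.)
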
	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, ROTL
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, ROTR
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, ROT2
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again, push
	// back the shuffles because they take a long time coming through.
	// Decrement the loop counter and see if we should go round again.
	// Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition, and then we have
	// to write out the result.  Here we have to undo the permutation
	// which was already applied to the input.  Shuffling has quite high
	// latency, so arrange to start a new shuffle into a temporary as
	// soon as we've written out the old value.
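	//
	// In C terms, the stores below implement (a sketch with hypothetical
	// arrays)
	//
	//	for (i = 0; i < 16; i++) out[5*i % 16] = a[i];
	//
	// i.e., exactly the inverse of the input permutation.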
	paddd	xmm0, SAVE0
	pshufd	xmm4, xmm0, ROTR
	movd	[OUT + 0], xmm0

	paddd	xmm1, SAVE1
	pshufd	xmm5, xmm1, ROTL
	movd	[OUT + 16], xmm1

	paddd	xmm2, SAVE2
	pshufd	xmm6, xmm2, ROT2
	movd	[OUT + 32], xmm2

	paddd	xmm3, SAVE3
	pshufd	xmm7, xmm3, ROTR
	movd	[OUT + 48], xmm3

	movd	[OUT + 4], xmm7
	pshufd	xmm7, xmm3, ROT2
	movd	[OUT + 24], xmm7
	pshufd	xmm3, xmm3, ROTL
	movd	[OUT + 44], xmm3

	movd	[OUT + 8], xmm6
	pshufd	xmm6, xmm2, ROTL
	movd	[OUT + 28], xmm6
	pshufd	xmm2, xmm2, ROTR
	movd	[OUT + 52], xmm2

	movd	[OUT + 12], xmm5
	pshufd	xmm5, xmm1, ROTR
	movd	[OUT + 36], xmm5
	pshufd	xmm1, xmm1, ROT2
	movd	[OUT + 56], xmm1

	movd	[OUT + 20], xmm4
	pshufd	xmm4, xmm0, ROT2
	movd	[OUT + 40], xmm4
	pshufd	xmm0, xmm0, ROTL
	movd	[OUT + 60], xmm0

	// Tidy things up.

#if CPUFAM_X86
	mov	esp, ebp
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa	xmm6, [rsp + 0]
	movdqa	xmm7, [rsp + 16]
	add	rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------