symm/salsa20-x86ish-sse2.S: Fix stray `##' comment to be `//'.
[catacomb] / symm / salsa20-x86ish-sse2.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// Fancy SIMD implementation of Salsa20
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
33///--------------------------------------------------------------------------
47103664
MW
34/// Local utilities.
35
36// Magic constants for shuffling.  These are immediates for `pshufd':
// with selector byte s, dst[i] = src[(s >> 2*i) & 3] for lanes i = 0..3.
37#define ROTL 0x93	// 0b10_01_00_11: dst[i] = src[(i - 1) mod 4] — rotate lanes up one place
38#define ROT2 0x4e	// 0b01_00_11_10: dst[i] = src[(i + 2) mod 4] — swap the two halves
39#define ROTR 0x39	// 0b00_11_10_01: dst[i] = src[(i + 1) mod 4] — rotate lanes down one place
40
41///--------------------------------------------------------------------------
1a0c09c4
MW
42/// Main code.
43
44 .arch pentium4
45 .section .text
46
0f23f75f
MW
///--------------------------------------------------------------------------
/// salsa20_core_x86ish_sse2(nr, src, dst):
///
/// Perform `nr' rounds (two per loop iteration, so `nr' should be even)
/// of the Salsa20 core over the sixteen 32-bit words at `src', apply the
/// feedforward addition of the original input, and scatter the 64-byte
/// result to `dst', undoing the input permutation described below.
/// NOTE(review): the (nr, src, dst) argument order is inferred from the
/// register/stack assignments below — confirm against the C prototype.
47FUNC(salsa20_core_x86ish_sse2)
48
49 // Initial setup.
50
51#if CPUFAM_X86
52 // Arguments come in on the stack, and will need to be collected. We
53 // can get away with just the scratch registers for integer work,
54 // but we'll run out of XMM registers and will need some properly
55 // aligned space which we'll steal from the stack. I don't trust the
56 // stack pointer's alignment, so I'll have to mask the stack pointer,
57 // which in turn means I'll need to keep track of the old value.
58 // Hence I'm making a full i386-style stack frame here.
59 //
60 // The Windows and SysV ABIs are sufficiently similar that we don't
61 // need to worry about the differences here.
62
63# define NR ecx
64# define IN eax
65# define OUT edx
66# define SAVE0 xmm6
67# define SAVE1 xmm7
68# define SAVE2 [esp + 0]
69# define SAVE3 [esp + 16]
1a0c09c4 70
1a0c09c4
MW
71 push ebp
72 mov ebp, esp
73 sub esp, 32
0f23f75f
MW
74 mov IN, [ebp + 12]
75 mov OUT, [ebp + 16]
1a0c09c4 76 and esp, ~15	// 16-byte align: SAVE2/SAVE3 are accessed with movdqa
0f23f75f
MW
77 mov NR, [ebp + 8]
78#endif
79
80#if CPUFAM_AMD64 && ABI_SYSV
81 // This is nice. We have plenty of XMM registers, and the arguments
82 // are in useful places. There's no need to spill anything and we
83 // can just get on with the code.
84
85# define NR edi
86# define IN rsi
87# define OUT rdx
88# define SAVE0 xmm6
89# define SAVE1 xmm7
90# define SAVE2 xmm8
91# define SAVE3 xmm9
92#endif
93
94# if CPUFAM_AMD64 && ABI_WIN
95 // Arguments come in registers, but they're different between Windows
96 // and everyone else (and everyone else is saner).
97 //
98 // The Windows ABI insists that we preserve some of the XMM
99 // registers, but we want more than we can use as scratch space. For
100 // two of them we only need to save a copy of the input for the
101 // feedforward at the end; but the other two we want for the final
102 // permutation, so save the old values on the stack. (We need an extra
103 // 8 bytes to align the stack.)
104
105# define NR ecx
106# define IN rdx
107# define OUT r8
108# define SAVE0 xmm6
109# define SAVE1 xmm7
110# define SAVE2 [rsp + 32]
111# define SAVE3 [rsp + 48]
112
113 sub rsp, 64 + 8
114 movdqa [rsp + 0], xmm6	// stash callee-saved xmm6/xmm7 (Win64 ABI)
115 movdqa [rsp + 16], xmm7
116#endif
1a0c09c4
MW
117
118 // First job is to slurp the matrix into XMM registers. The words
119 // have already been permuted conveniently to make them line up
120 // better for SIMD processing.
121 //
122 // The textbook arrangement of the matrix is this.
123 //
124 // [C K K K]
125 // [K C N N]
126 // [T T C K]
127 // [K K K C]
128 //
129 // But we've rotated the columns up so that the main diagonal with
130 // the constants on it end up in the first row, giving something more
131 // like
132 //
133 // [C C C C]
134 // [K T K K]
135 // [T K K N]
136 // [K K N K]
137 //
138 // so the transformation looks like this:
139 //
140 // [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
141 // [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
142 // [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
143 // [12 13 14 15] [12 1 6 11] (d, xmm3)
0f23f75f
MW
144 movdqu xmm0, [IN + 0]
145 movdqu xmm1, [IN + 16]
146 movdqu xmm2, [IN + 32]
147 movdqu xmm3, [IN + 48]
1a0c09c4 148
7afb1dc9 149 // Take a copy for later (the feedforward addition at the end).
0f23f75f
MW
150 movdqa SAVE0, xmm0
151 movdqa SAVE1, xmm1
152 movdqa SAVE2, xmm2
153 movdqa SAVE3, xmm3
1a0c09c4
MW
154
155loop:	// each iteration does one column round and one row round (two rounds)
1a0c09c4
MW
156 // Apply a column quarterround to each of the columns simultaneously.
157 // Alas, there doesn't seem to be a packed doubleword rotate, so we
158 // have to synthesize it.
159
160 // b ^= (a + d) <<< 7
161 movdqa xmm4, xmm0
162 paddd xmm4, xmm3
163 movdqa xmm5, xmm4
164 pslld xmm4, 7
165 psrld xmm5, 25	// 7 + 25 = 32: shift pair synthesizes the rotate
166 por xmm4, xmm5
167 pxor xmm1, xmm4
168
169 // c ^= (b + a) <<< 9
170 movdqa xmm4, xmm1
171 paddd xmm4, xmm0
172 movdqa xmm5, xmm4
173 pslld xmm4, 9
174 psrld xmm5, 23
175 por xmm4, xmm5
176 pxor xmm2, xmm4
177
178 // d ^= (c + b) <<< 13
179 movdqa xmm4, xmm2
180 paddd xmm4, xmm1
47103664 181 pshufd xmm1, xmm1, ROTL	// b no longer needed: start the transpose early
1a0c09c4
MW
182 movdqa xmm5, xmm4
183 pslld xmm4, 13
184 psrld xmm5, 19
185 por xmm4, xmm5
186 pxor xmm3, xmm4
187
188 // a ^= (d + c) <<< 18
189 movdqa xmm4, xmm3
47103664 190 pshufd xmm3, xmm3, ROTR
1a0c09c4 191 paddd xmm4, xmm2
47103664 192 pshufd xmm2, xmm2, ROT2
1a0c09c4
MW
193 movdqa xmm5, xmm4
194 pslld xmm4, 18
195 psrld xmm5, 14
196 por xmm4, xmm5
197 pxor xmm0, xmm4
198
199 // The transpose conveniently only involves reordering elements of
200 // individual rows, which can be done quite easily, and reordering
201 // the rows themselves, which is a trivial renaming. It doesn't
202 // involve any movement of elements between rows.
203 //
204 // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
0f23f75f
MW
205 // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
206 // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
207 // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
1a0c09c4
MW
208 //
209 // The shuffles have quite high latency, so they've been pushed
210 // backwards into the main instruction list.
211
212 // Apply the row quarterround to each of the columns (yes!)
213 // simultaneously.
214
215 // b ^= (a + d) <<< 7
216 movdqa xmm4, xmm0
217 paddd xmm4, xmm1
218 movdqa xmm5, xmm4
219 pslld xmm4, 7
220 psrld xmm5, 25
221 por xmm4, xmm5
222 pxor xmm3, xmm4
223
224 // c ^= (b + a) <<< 9
225 movdqa xmm4, xmm3
226 paddd xmm4, xmm0
227 movdqa xmm5, xmm4
228 pslld xmm4, 9
229 psrld xmm5, 23
230 por xmm4, xmm5
231 pxor xmm2, xmm4
232
233 // d ^= (c + b) <<< 13
234 movdqa xmm4, xmm2
235 paddd xmm4, xmm3
47103664 236 pshufd xmm3, xmm3, ROTL	// start undoing the transpose early
1a0c09c4
MW
237 movdqa xmm5, xmm4
238 pslld xmm4, 13
239 psrld xmm5, 19
240 por xmm4, xmm5
241 pxor xmm1, xmm4
242
243 // a ^= (d + c) <<< 18
244 movdqa xmm4, xmm1
47103664 245 pshufd xmm1, xmm1, ROTR
1a0c09c4 246 paddd xmm4, xmm2
47103664 247 pshufd xmm2, xmm2, ROT2
1a0c09c4
MW
248 movdqa xmm5, xmm4
249 pslld xmm4, 18
250 psrld xmm5, 14
251 por xmm4, xmm5
252 pxor xmm0, xmm4
253
254 // We have to undo the transpose ready for the next loop. Again, push
255 // back the shuffles because they take a long time coming through.
256 // Decrement the loop counter and see if we should go round again.
257 // Later processors fuse this pair into a single uop.
0f23f75f 258 sub NR, 2	// two Salsa20 rounds completed per iteration
1a0c09c4
MW
259 ja loop	// continue while rounds remain (unsigned compare)
260
261 // Almost there. Firstly, the feedforward addition, and then we have
262 // to write out the result. Here we have to undo the permutation
263 // which was already applied to the input. Shuffling has quite high
264 // latency, so arrange to start a new shuffle into a temporary as
265 // soon as we've written out the old value.
0f23f75f
MW
266 paddd xmm0, SAVE0
267 pshufd xmm4, xmm0, 0x39	// 0x39 == ROTR
268 movd [OUT + 0], xmm0
1a0c09c4 269
0f23f75f 270 paddd xmm1, SAVE1
47103664 271 pshufd xmm5, xmm1, ROTL
0f23f75f 272 movd [OUT + 16], xmm1
1a0c09c4 273
0f23f75f 274 paddd xmm2, SAVE2
47103664 275 pshufd xmm6, xmm2, ROT2
0f23f75f 276 movd [OUT + 32], xmm2
1a0c09c4 277
0f23f75f 278 paddd xmm3, SAVE3
47103664 279 pshufd xmm7, xmm3, ROTR
0f23f75f 280 movd [OUT + 48], xmm3
1a0c09c4 281
0f23f75f 282 movd [OUT + 4], xmm7
47103664 283 pshufd xmm7, xmm3, ROT2
0f23f75f 284 movd [OUT + 24], xmm7
47103664 285 pshufd xmm3, xmm3, ROTL
0f23f75f 286 movd [OUT + 44], xmm3
1a0c09c4 287
0f23f75f 288 movd [OUT + 8], xmm6
47103664 289 pshufd xmm6, xmm2, ROTL
0f23f75f 290 movd [OUT + 28], xmm6
47103664 291 pshufd xmm2, xmm2, ROTR
0f23f75f 292 movd [OUT + 52], xmm2
1a0c09c4 293
0f23f75f 294 movd [OUT + 12], xmm5
47103664 295 pshufd xmm5, xmm1, ROTR
0f23f75f 296 movd [OUT + 36], xmm5
47103664 297 pshufd xmm1, xmm1, ROT2
0f23f75f 298 movd [OUT + 56], xmm1
1a0c09c4 299
0f23f75f 300 movd [OUT + 20], xmm4
47103664 301 pshufd xmm4, xmm0, ROT2
0f23f75f 302 movd [OUT + 40], xmm4
47103664 303 pshufd xmm0, xmm0, ROTL
0f23f75f 304 movd [OUT + 60], xmm0
1a0c09c4
MW
305
306 // Tidy things up.
0f23f75f
MW
307
308#if CPUFAM_X86
1a0c09c4
MW
309 mov esp, ebp	// restore the unmasked stack pointer saved in the prologue
310 pop ebp
0f23f75f
MW
311#endif
312#if CPUFAM_AMD64 && ABI_WIN
313 movdqa xmm6, [rsp + 0]	// restore callee-saved xmm6/xmm7 (Win64 ABI)
314 movdqa xmm7, [rsp + 16]
315 add rsp, 64 + 8
316#endif
1a0c09c4
MW
317
318 // And with that, we're done.
319 ret
320
0f23f75f
MW
321#undef NR
322#undef IN
323#undef OUT
324#undef SAVE0
325#undef SAVE1
326#undef SAVE2
327#undef SAVE3
328
1a0c09c4
MW
329ENDFUNC
330
331///----- That's all, folks --------------------------------------------------