/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

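/// For orientation, here is a rough C sketch of what this function
/// computes.  This is an illustration only, not Catacomb's definitive
/// interface; the helper names `rotl32' and `quarterround' are invented
/// for this comment.
///
///	static uint32_t rotl32(uint32_t x, unsigned n)
///		{ return (x << n) | (x >> (32 - n)); }
///
///	static void quarterround(uint32_t *a, uint32_t *b,
///				 uint32_t *c, uint32_t *d)
///	{
///		*b ^= rotl32(*a + *d,  7); *c ^= rotl32(*b + *a,  9);
///		*d ^= rotl32(*c + *b, 13); *a ^= rotl32(*d + *c, 18);
///	}
///
/// The code below performs NR/2 double-rounds, each being four such
/// quarterrounds down the columns followed by four along the rows, and
/// finally adds the original input back on, word by word (the
/// `feedforward'), before writing the result out.
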
	.arch	pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.
	// We can get away with just the scratch registers for integer
	// work, but we'll run out of XMM registers and will need some
	// properly aligned space, which we'll steal from the stack.  I
	// don't trust the stack pointer's alignment, so I'll have to mask
	// the stack pointer, which in turn means I'll need to keep track
	// of the old value.  Hence I'm making a full i386-style stack
	// frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we
	// don't need to worry about the differences here.

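	// (The masking trick, in C terms; a sketch only:
	//
	//	sp = (void *)((uintptr_t)sp & ~(uintptr_t)15);
	//
	// i.e., round the pointer down to the next 16-byte boundary.)
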
# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the
	// arguments are in useful places.  There's no need to spill
	// anything and we can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between
	// Windows and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.
	// The final permutation needs all six volatile XMM registers, so
	// the saved copies of the input can't live there.  Two of them go
	// in xmm6 and xmm7, whose old values we must therefore preserve;
	// the other two are only needed for the feedforward at the end,
	// so they can stay in memory on the stack.  (We need an extra 8
	// bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

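	// Stack layout, as a sketch read off the code below:
	//
	//	[rsp +  0]	caller's xmm6
	//	[rsp + 16]	caller's xmm7
	//	[rsp + 32]	SAVE2
	//	[rsp + 48]	SAVE3
	//
	// plus the extra 8 bytes which keep rsp 16-byte aligned.
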
	sub	rsp, 64 + 8
	.seh_stackalloc	64 + 8
	movdqa	[rsp + 0], xmm6
	.seh_savexmm	xmm6, 0
	movdqa	[rsp + 16], xmm7
	.seh_savexmm	xmm7, 16
	.seh_endprologue
#endif

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]	-->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
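	//
	// In C terms, the caller's permutation amounts to the following
	// sketch; the table is read straight off the diagram above, and
	// `map' and `x' are names invented for this comment.
	//
	//	static const unsigned char map[16] =
	//		{ 0, 5, 10, 15,   4, 9, 14, 3,
	//		  8, 13, 2, 7,    12, 1, 6, 11 };
	//	for (i = 0; i < 16; i++) in[i] = x[map[i]];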
	movdqu	xmm0, [IN + 0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns
	// simultaneously.  Alas, there doesn't seem to be a packed
	// doubleword rotate, so we have to synthesize it.

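	// Each rotation therefore costs a copy, two shifts, and an OR:
	// in C terms (sketch), t = a + d; b ^= (t << 7) | (t >> 25).
	// The movdqa makes the second copy that the pair of shifts
	// consumes.
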
	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]	-->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

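	// In C terms the transpose is just lane rotations plus renaming
	// (a sketch, reading the diagram above; the names are invented
	// for this comment):
	//
	//	new_b[i] = old_d[(i + 1) % 4];
	//	new_c[i] = old_c[(i + 2) % 4];
	//	new_d[i] = old_b[(i + 3) % 4];
	//
	// and a is untouched; no word ever crosses between rows.
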
	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We have to undo the transpose ready for the next round.  Again,
	// the shuffles have been pushed back up the instruction list
	// because they take a long time coming through.  Decrement the
	// loop counter (by two, since each trip through the loop does a
	// column round and a row round) and see if we should go round
	// again.  Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11
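
	// (In C terms: y[i] = x[i] + in[i] mod 2^32 for each of the
	// sixteen words; this is the step which stops the core from
	// being an easily invertible permutation.)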

	// Next we must undo the permutation which was already applied to
	// the input.  This can be done by juggling values in registers,
	// with the following fancy footwork: some row rotations, a
	// transpose, and some more rotations.
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	//  1,  6, 11, 12

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15

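	// The net effect, in terms of the earlier C sketch, is
	// out[map[i]] = y[i] for each i: exactly the inverse of the
	// load-time permutation, so the matrix is stored in its natural
	// order.
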
	// Finally we have to write out the result.
	movdqu	[OUT + 0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up.
#if CPUFAM_X86
	mov	esp, ebp
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa	xmm6, [rsp + 0]
	movdqa	xmm7, [rsp + 16]
	add	rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------