x86ish *.S: Use `stalloc' consistently to allocate space on the stack.
[catacomb] / symm / chacha-x86ish-sse2.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// Fancy SIMD implementation of ChaCha
4 ///
5 /// (c) 2015 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .text
34
35 ///--------------------------------------------------------------------------
36 /// Main code.
37
/// chacha_core_x86ish_avx
///
/// Alternative entry point to the ChaCha core.  It issues a single
/// `vzeroupper' and then, having no `ret' of its own, runs on into
/// chacha_core_x86ish_sse2, which must therefore follow immediately.
/// Arguments are left untouched for the SSE2 code to pick up.
///
/// NOTE(review): the `vzeroupper' is presumably here to avoid the
/// penalty for mixing dirty upper YMM state with legacy SSE code --
/// confirm against the caller's CPU-dispatch logic.
FUNC(chacha_core_x86ish_avx)
	.arch	.avx			// permit the AVX mnemonic below
	vzeroupper
  endprologue
	// Deliberately no return: execution continues in the SSE2 function.
ENDFUNC

	// Restrict the assembler to the baseline instruction set again for
	// the code which follows.
	.arch	pentium4
46
/// chacha_core_x86ish_sse2
///
/// Run the ChaCha core over one 64-byte matrix, using SSE2.
///
/// Arguments, in ABI order:
///	1. NR	number of rounds (32-bit); consumed two at a time
///	2. IN	address of the 64-byte input matrix (any alignment)
///	3. OUT	address of the 64-byte output matrix (any alignment)
///
/// NOTE(review): presumably declared in C along the lines of
/// `void f(unsigned nr, const uint32_t in[16], uint32_t out[16])' --
/// confirm against the C prototype.
///
/// The loop body is a `doubleround' (column round, then diagonal round);
/// afterwards NR is reduced by 2 and the loop repeats while the result is
/// still above zero (unsigned).  The body is therefore always executed at
/// least once.  Finally the original input is added back in (the
/// feedforward) and the result is stored at OUT.
///
/// Register roles:
///	xmm0--xmm3	rows a, b, c, d of the working matrix
///	xmm4		scratch for the synthesized rotates
///	SAVE0--SAVE3	copy of the input rows for the feedforward
///
/// Clobbers (besides flags):
///	x86		eax, ecx, edx, xmm0--xmm7
///	AMD64 SysV	edi, xmm0--xmm8
///	AMD64 Windows	ecx, xmm0--xmm5

	// Rotate every 32-bit lane of `reg' left by `bits' places, trashing
	// `tmp'.  There is no packed doubleword rotate in SSE2, so take a
	// copy, shift the two copies in opposite directions, and merge.
	.macro	chacha_rotl reg, tmp, bits
	movdqa	\tmp, \reg
	pslld	\reg, \bits
	psrld	\tmp, 32 - \bits
	por	\reg, \tmp
	.endm

FUNC(chacha_core_x86ish_sse2)

	// Per-ABI setup: name the argument registers and decide where the
	// feedforward copy of the input lives.

#if CPUFAM_X86
	// 32-bit x86.  The arguments are on the stack and must be fetched.
	// The integer scratch registers are enough for the pointers and the
	// counter, but there are only eight XMM registers, so one row of the
	// saved input has to go to memory.  That memory must be 16-byte
	// aligned for `movdqa'/`paddd'; the incoming stack pointer isn't
	// trusted to be aligned, so SP is masked down, and a conventional
	// frame pointer is kept so that the old SP can be recovered.
	//
	// Windows and SysV agree closely enough here that one version of
	// the code serves both.

#  define NR	ecx
#  define IN	eax
#  define OUT	edx
#  define SAVE0	xmm5
#  define SAVE1	xmm6
#  define SAVE2	xmm7
#  define SAVE3	[SP]

	pushreg	BP
	setfp
	stalloc	16			// room for SAVE3
	mov	IN, [BP + 12]		// second argument
	mov	OUT, [BP + 16]		// third argument
	and	SP, ~15			// force 16-byte alignment of SAVE3
	mov	NR, [BP + 8]		// first argument
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// 64-bit SysV.  The arguments are already in registers we can use
	// directly, and there are XMM registers to spare, so nothing is
	// spilled and no stack space is needed.

#  define NR	edi
#  define IN	rsi
#  define OUT	rdx
#  define SAVE0	xmm5
#  define SAVE1	xmm6
#  define SAVE2	xmm7
#  define SAVE3	xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// 64-bit Windows.  Arguments arrive in registers, though not the
	// same ones as under SysV.
	//
	// This ABI requires some of the XMM registers to be preserved, and
	// the scratch ones alone aren't enough.  The saved input is only
	// wanted once, for the feedforward, so three of its rows are kept
	// in memory instead of saving and restoring extra registers.  The
	// allocation is 48 bytes for those rows plus 8 bytes to bring the
	// stack pointer to a 16-byte boundary.

#  define NR	ecx
#  define IN	rdx
#  define OUT	r8
#  define SAVE0	xmm5
#  define SAVE1	[SP +  0]
#  define SAVE2	[SP + 16]
#  define SAVE3	[SP + 32]

	stalloc	48 + 8
#endif

  endprologue

	// Load the matrix, one row per register.  The caller's buffer may
	// well be unaligned, hence `movdqu'.
	//
	//	[ 0  1  2  3]	(a, xmm0)
	//	[ 4  5  6  7]	(b, xmm1)
	//	[ 8  9 10 11]	(c, xmm2)
	//	[12 13 14 15]	(d, xmm3)
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Stash the input for the feedforward.  Any memory destinations
	// here were aligned by the setup code above, so `movdqa' is safe.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// ---- Column round: four quarterrounds at once, one per lane ----

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	chacha_rotl xmm3, xmm4, 16

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	chacha_rotl xmm1, xmm4, 12

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	chacha_rotl xmm3, xmm4, 8

	// c += d; b ^= c; b <<<= 7
	//
	// Rows c and d are final once the addition is done, so their
	// shuffles for the next step are issued early, interleaved with
	// the remaining work on b, to hide the shuffle latency.
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	chacha_rotl xmm1, xmm4, 7

	// Rearrange so that the diagonals line up as columns.  Only the
	// order of elements within each row changes; nothing moves between
	// rows and the rows keep their registers.
	//
	//	[ 0  1  2  3]	    [ 0  1  2  3]	(a, xmm0)
	//	[ 4  5  6  7]  -->  [ 5  6  7  4]	(b, xmm1)
	//	[ 8  9 10 11]	    [10 11  8  9]	(c, xmm2)
	//	[12 13 14 15]	    [15 12 13 14]	(d, xmm3)
	//
	// Rows c and d were dealt with above; b had to wait for its rotate.
	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)

	// ---- Diagonal round: same arithmetic on the rearranged rows ----

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	chacha_rotl xmm3, xmm4, 16

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	chacha_rotl xmm1, xmm4, 12

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	chacha_rotl xmm3, xmm4, 8

	// c += d; b ^= c; b <<<= 7
	//
	// As before, start putting c and d back in their original order
	// while b is still being finished.
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	chacha_rotl xmm1, xmm4, 7

	// Put b back in order too, which completes the doubleround.
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)

	// Two rounds done.  Go again while the remaining count is positive.
	sub	NR, 2
	ja	0b

	// Feedforward: add the saved input to the permuted matrix.
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3

	// Store the result.  The output buffer may be unaligned as well.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3

	// Undo whatever the setup code did to the stack.
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// All done.
	ret

ENDFUNC
261
262 ///----- That's all, folks --------------------------------------------------