x86ish *.S: Use `stalloc' consistently to allocate space on the stack.
[catacomb] / symm / chacha-x86ish-sse2.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_x86ish_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// drop through...
ENDFUNC

	.arch	pentium4

FUNC(chacha_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [SP]

	pushreg	BP
	setfp
	stalloc	16
	mov	IN, [BP + 12]
	mov	OUT, [BP + 16]
	and	SP, ~15
	mov	NR, [BP + 8]
#endif
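
	// (A sketch of the alignment dance above in C-flavoured pseudocode;
	// illustrative only, not part of the build, and it assumes the usual
	// downward-growing stack.)
	//
	//	sp -= 16;	/* stalloc 16: room for the SAVE3 slot */
	//	sp &= ~15;	/* and SP, ~15: round down to a 16-byte line */
	//
	// After the masking, [SP] is 16-byte aligned, so `movdqa' on SAVE3
	// is safe; BP still addresses the arguments and lets us restore the
	// old SP on the way out.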

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  We
	// only need to save a copy of the input for the feedforward at the
	// end, so we might as well use memory rather than spill extra
	// registers.  (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [SP + 0]
# define SAVE2 [SP + 16]
# define SAVE3 [SP + 32]

	stalloc	48 + 8
#endif
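
	// (A sketch of the arithmetic, assuming the standard Win64 entry
	// state; illustrative only.  The `call' pushed an 8-byte return
	// address, so RSP == 8 (mod 16) on entry; allocating 48 + 8 = 56
	// bytes makes RSP == 0 (mod 16) again, so the SAVE1..SAVE3 slots at
	// [SP + 0], [SP + 16] and [SP + 32] are properly aligned for
	// `movdqa'.)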

	endprologue

	// First job is to slurp the matrix into XMM registers.  Be careful:
	// the input matrix isn't likely to be properly aligned.
	//
	//	[ 0  1  2  3]		(a, xmm0)
	//	[ 4  5  6  7]		(b, xmm1)
	//	[ 8  9 10 11]		(c, xmm2)
	//	[12 13 14 15]		(d, xmm3)
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]
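
	// (Illustrative C equivalent of the loads above, using the SSE2
	// intrinsics from <emmintrin.h>; not part of the build.  `in' stands
	// for the caller's byte pointer, which may be unaligned, hence the
	// `loadu' form matching `movdqu'.)
	//
	//	__m128i a = _mm_loadu_si128((const __m128i *)(in +  0));
	//	__m128i b = _mm_loadu_si128((const __m128i *)(in + 16));
	//	__m128i c = _mm_loadu_si128((const __m128i *)(in + 32));
	//	__m128i d = _mm_loadu_si128((const __m128i *)(in + 48));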

	// Take a copy for later.  This one is aligned properly, by
	// construction.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4
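
	// (Illustrative sketch of the synthesized rotate as a C function
	// with SSE2 intrinsics; the name `rotl32' is made up, and this is
	// not part of the build.)
	//
	//	static __m128i rotl32(__m128i x, int n)
	//	{
	//		/* No packed doubleword rotate in SSE2: shift each
	//		 * lane left and right, then OR the halves back
	//		 * together, just like the pslld/psrld/por trio. */
	//		return _mm_or_si128(_mm_slli_epi32(x, n),
	//				    _mm_srli_epi32(x, 32 - n));
	//	}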

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily.  It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3]	(a, xmm0)
	//	[ 4  5  6  7]	-->	[ 5  6  7  4]	(b, xmm1)
	//	[ 8  9 10 11]		[10 11  8  9]	(c, xmm2)
	//	[12 13 14 15]		[15 12 13 14]	(d, xmm3)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards.  The remaining one can't be moved, though.
	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
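
	// (Illustrative C equivalent of the three row-rotations, with SSE2
	// intrinsics; not part of the build.  Note that _MM_SHUFFLE takes
	// its lane indices high-lane-first, whereas the SHUF macro used
	// above appears to take them low-lane-first.)
	//
	//	b = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 3, 2, 1)); // [ 5  6  7  4]
	//	c = _mm_shuffle_epi32(c, _MM_SHUFFLE(1, 0, 3, 2)); // [10 11  8  9]
	//	d = _mm_shuffle_epi32(d, _MM_SHUFFLE(2, 1, 0, 3)); // [15 12 13 14]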

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the shuffles.
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)

	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
	ja	0b
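
	// (Illustrative only, not part of the build: each loop iteration
	// performs one column round and one diagonal round -- a doubleround
	// -- which is why NR drops by two.  In scalar C, on uint32_t values,
	// the quarterround each SIMD lane computes is:)
	//
	//	a += b; d ^= a; d = (d << 16) | (d >> 16);
	//	c += d; b ^= c; b = (b << 12) | (b >> 20);
	//	a += b; d ^= a; d = (d <<  8) | (d >> 24);
	//	c += d; b ^= c; b = (b <<  7) | (b >> 25);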

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3

	// And now we write out the result.  This one won't be aligned
	// either.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3
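
	// (Illustrative C equivalent of the feedforward and store, with
	// SSE2 intrinsics; not part of the build.  `a0' .. `d0' stand for
	// the saved input rows and `out' for the caller's output pointer,
	// again possibly unaligned, hence `storeu'.)
	//
	//	a = _mm_add_epi32(a, a0);  b = _mm_add_epi32(b, b0);
	//	c = _mm_add_epi32(c, c0);  d = _mm_add_epi32(d, d0);
	//	_mm_storeu_si128((__m128i *)(out +  0), a);
	//	_mm_storeu_si128((__m128i *)(out + 16), b);
	//	_mm_storeu_si128((__m128i *)(out + 32), c);
	//	_mm_storeu_si128((__m128i *)(out + 48), d);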

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------