/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
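	// (The Pentium 4 was the first x86 processor to implement SSE2,
	// so this is the weakest .arch setting under which the assembler
	// will accept the vector instructions below.)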
	.text

FUNC(chacha_core_x86ish_sse2)

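	// A hedged sketch of the calling convention, inferred from the
	// argument collection below (parameter names and types here are
	// hypothetical; the authoritative prototype lives in the C
	// sources):
	//
	//	void chacha_core_x86ish_sse2(unsigned nr,	// round count
	//				     const void *src,	// input matrix
	//				     void *dest);	// output matrix
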
	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected. We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack. I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value. Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [esp]

	pushreg	ebp
	setfp	ebp
	sub	esp, 16
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
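
	// Masking off the low four bits aligns esp down to a 16-byte
	// boundary, which the movdqa access to SAVE3 needs; the argument
	// fetches are unaffected because they go through ebp, which
	// still points into the original frame.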
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice. We have plenty of XMM registers, and the arguments
	// are in useful places. There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space. We
	// only need to save a copy of the input for the feedforward at the
	// end, so we might as well use memory rather than spill extra
	// registers. (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [rsp + 0]
# define SAVE2 [rsp + 16]
# define SAVE3 [rsp + 32]

	stalloc	48 + 8
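
	// (On entry, the return address pushed by our caller leaves rsp
	// only 8-byte aligned; the extra 8 bytes restore the 16-byte
	// alignment that the movdqa spills to SAVE1..SAVE3 rely on.)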
#endif

	endprologue

	// First job is to slurp the matrix into XMM registers. Be careful:
	// the input matrix isn't likely to be properly aligned.
	//
	//	[ 0  1  2  3]		(a, xmm0)
	//	[ 4  5  6  7]		(b, xmm1)
	//	[ 8  9 10 11]		(c, xmm2)
	//	[12 13 14 15]		(d, xmm3)
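	//
	// For reference, a single ChaCha quarterround on (a, b, c, d)
	// is, in C-ish pseudocode (illustrative only),
	//
	//	a += b; d ^= a; d <<<= 16;
	//	c += d; b ^= c; b <<<= 12;
	//	a += b; d ^= a; d <<<=  8;
	//	c += d; b ^= c; b <<<=  7;
	//
	// and the code below runs four quarterrounds in parallel, one in
	// each 32-bit lane of the vectors.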
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later. This one is aligned properly, by
	// construction.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.
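	//
	// Each rotate instead uses the standard identity (a C-style
	// sketch, illustrative only)
	//
	//	x <<<= n    ==    x = (x << n) | (x >> (32 - n))
	//
	// at a cost of a copy, two shifts, and an OR per rotate.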

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily. It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3]	(a, xmm0)
	//	[ 4  5  6  7]	-->	[ 5  6  7  4]	(b, xmm1)
	//	[ 8  9 10 11]		[10 11  8  9]	(c, xmm2)
	//	[12 13 14 15]		[15 12 13 14]	(d, xmm3)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards. The remaining one can't be moved, though.
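	//
	// (As used here, SHUF(d, c, b, a) evidently builds a pshufd
	// immediate selecting source lanes d, c, b, a for destination
	// lanes 3, 2, 1, 0 respectively: e.g., SHUF(0, 3, 2, 1) sends
	// [ 4  5  6  7] to [ 5  6  7  4], as needed for row b.)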
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround. Again, most of this was done above so we don't have
	// to wait for the shuffles.
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)

	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
	ja	0b
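
	// (NR counts single rounds, and each pass round the loop is a
	// doubleround, hence the decrement by 2: ChaCha20, for example,
	// would pass nr = 20 and go round ten times.)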

	// Almost there. Firstly, the feedforward addition.
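	// (Without this, the core would compute a bare permutation,
	// which anyone could invert by running the doublerounds
	// backwards; adding the original input back in is what makes the
	// output one-way.)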
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3

	// And now we write out the result. This one won't be aligned
	// either.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------