/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
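	// (SSE2 first appeared in the Pentium 4, so that's the minimum
	// architecture this code can run on.)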
	.text

FUNC(chacha_core_x86ish_sse2)
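	// As a guide to the register assignments below, the function's
	// apparent C signature is as follows; the parameter names are only
	// illustrative.
	//
	//	void chacha_core_x86ish_sse2(unsigned nr,
	//				     const uint32_t in[16],
	//				     uint32_t out[16]);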

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected. We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack. I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value. Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [esp]

	pushreg	ebp
	setfp	ebp
	sub	esp, 16
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
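	// (The arguments are all fetched relative to ebp, so masking esp
	// above doesn't disturb them; after the `and', [esp] is a 16-byte
	// aligned slot ready for SAVE3.)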
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice. We have plenty of XMM registers, and the arguments
	// are in useful places. There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space. We
	// only need to save a copy of the input for the feedforward at the
	// end, so we might as well use memory rather than spill extra
	// registers. (We need an extra 8 bytes to align the stack.)
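	//
	// (Specifically, xmm0-xmm5 are volatile under this ABI, and
	// xmm6-xmm15 are callee-preserved, which is why SAVE0 takes the
	// one remaining scratch register and the rest live in memory.)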

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [rsp + 0]
# define SAVE2 [rsp + 16]
# define SAVE3 [rsp + 32]
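	// (So the frame allocated below holds SAVE1 at [rsp + 0], SAVE2 at
	// [rsp + 16], and SAVE3 at [rsp + 32]; the final 8 bytes bring rsp
	// back to 16-byte alignment.)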

	stalloc	48 + 8
#endif

	endprologue

	// First job is to slurp the matrix into XMM registers. Be careful:
	// the input matrix isn't likely to be properly aligned.
	//
	//	[ 0  1  2  3]		(a, xmm0)
	//	[ 4  5  6  7]		(b, xmm1)
	//	[ 8  9 10 11]		(c, xmm2)
	//	[12 13 14 15]		(d, xmm3)
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]
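	// (For the standard ChaCha stream cipher, the rows would hold the
	// four constant words, the eight key words, and the counter and
	// nonce respectively; but this core is happy with any matrix.)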

	// Take a copy for later. This one is aligned properly, by
	// construction.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.
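	//
	// For reference, a single ChaCha quarterround on (a, b, c, d), in
	// C-ish pseudocode (this is Bernstein's standard definition):
	//
	//	a += b; d ^= a; d <<<= 16;
	//	c += d; b ^= c; b <<<= 12;
	//	a += b; d ^= a; d <<<=  8;
	//	c += d; b ^= c; b <<<=  7;
	//
	// and each rotate below is synthesized as
	//
	//	t = x >> (32 - n); x = (x << n) | t;
	//
	// since SSE2 offers neither a packed rotate nor the byte shuffle
	// (SSSE3's `pshufb') which could handle the whole-byte cases.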

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
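	// (The pshufd instructions interleaved with this step aren't part
	// of the quarterround: they make an early start on the reordering
	// described below, to help hide their latency.)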
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily. It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3] (a, xmm0)
	//	[ 4  5  6  7]	 -->	[ 5  6  7  4] (b, xmm1)
	//	[ 8  9 10 11]		[10 11  8  9] (c, xmm2)
	//	[12 13 14 15]		[15 12 13 14] (d, xmm3)
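	//
	// (In other words, row i rotates left by i places, which lines the
	// matrix's diagonals up in the columns, ready to be processed just
	// like the columns were.)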
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards. The remaining one can't be moved, though.
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)

	// Apply the diagonal quarterround to each of the diagonals, now
	// conveniently arranged in the columns, simultaneously.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround. Again, most of this was done above so we don't have
	// to wait for the shuffles.
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)

	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
	ja	0b
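	// (NR counts individual rounds, and each trip round the loop is a
	// doubleround, hence the step of two: ChaCha20, for instance,
	// would pass nr = 20 and get ten doublerounds.)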

	// Almost there. Firstly, the feedforward addition.
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3
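	// (The overall effect is out = in + permute(in): folding the input
	// back in is what makes the core hard to invert.)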

	// And now we write out the result. This one won't be aligned
	// either.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------