Commit | Line | Data |
---|---|---|
1a0c09c4 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// Fancy SIMD implementation of Salsa20 | |
4 | /// | |
5 | /// (c) 2015 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software; you can redistribute it and/or modify | |
13 | /// it under the terms of the GNU Library General Public License as | |
14 | /// published by the Free Software Foundation; either version 2 of the | |
15 | /// License, or (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, | |
18 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | /// GNU Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb; if not, write to the Free | |
24 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | /// MA 02111-1307, USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
28 | /// General definitions. | |
29 | ||
898f32b3 MW |
30 | // Preprocessor hacks.  Each helper is two-level so that its arguments are macro-expanded before `#' or `##' is applied. |
31 | #define STRINGY(x) _STRINGY(x) // Expand X, then turn it into a string literal. | |
32 | #define _STRINGY(x) #x | |
33 | #define GLUE(x, y) _GLUE(x, y) // Expand X and Y, then paste them into one token. | |
34 | #define _GLUE(x, y) x##y | |
35 | #define _EMPTY | |
36 | ||
f8e509a9 MW |
37 | // Some useful variables.  `.L$_subsec' holds the number of the `.text' subsection currently selected for code. |
38 | .L$_subsec = 0 | |
39 | ||
40 | // Literal pools done the hard way.  `_LIT' selects subsection `.L$_subsec + 1' and `_ENDLIT' returns to the code subsection; `_LTORG' bumps `.L$_subsec' by two and selects the new code subsection. | |
41 | #define _LIT .text .L$_subsec + 1 | |
42 | #define _ENDLIT .text .L$_subsec | |
6c54cbd3 | 43 | #define _LTORG .L$_subsec = .L$_subsec + 2; .text .L$_subsec |
f8e509a9 | 44 | |
645fcce0 MW |
45 | // ELF section types.  `_SECTTY(ty)' spells a section-type operand as `%ty' when CPUFAM_ARMEL is set and as `@ty' otherwise; it is only defined when __ELF__ is set. |
46 | #if __ELF__ | |
47 | # if CPUFAM_ARMEL | |
48 | # define _SECTTY(ty) %ty | |
49 | # else | |
50 | # define _SECTTY(ty) @ty | |
51 | # endif | |
52 | #endif | |
53 | ||
54 | // Section selection.  `TEXT' selects the current code subsection; `RODATA' selects `.rdata' under ABI_WIN, `.rodata' on ELF, and otherwise falls back to `TEXT'; `DATA' selects `.data'. | |
55 | #define TEXT .text .L$_subsec | |
56 | #if ABI_WIN | |
57 | # define RODATA .section .rdata, "dr" | |
58 | #elif __ELF__ | |
59 | # define RODATA .section .rodata, "a", _SECTTY(progbits) | |
60 | #else | |
61 | # define RODATA TEXT | |
62 | #endif | |
63 | #define DATA .data | |
64 | ||
1a0c09c4 MW |
65 | // Announcing an external function.  `FUNC(name)' makes `F(name)' global, applies `TYPE_FUNC', defines an assembler macro `ENDFUNC' bound to this NAME, applies `FUNC_PREHOOK', emits the `F(name)' label, and applies `FUNC_POSTHOOK'. |
66 | #define FUNC(name) \ | |
67 | .globl F(name); \ | |
68 | TYPE_FUNC(name); \ | |
8a1aa284 | 69 | .macro ENDFUNC; _ENDFUNC(name); .endm; \ |
1a0c09c4 MW |
70 | FUNC_PREHOOK(name); \ |
71 | F(name): \ | |
72 | FUNC_POSTHOOK(name) | |
73 | ||
74 | // Marking the end of a function.  Reached through the `ENDFUNC' macro which `FUNC' defines: purges that macro (so the next `FUNC' can define it afresh), then applies `SIZE_OBJ', `ENDFUNC_HOOK' and `_LTORG'. | |
75 | #define _ENDFUNC(name) \ | |
76 | .purgem ENDFUNC; \ | |
77 | SIZE_OBJ(name); \ | |
f8e509a9 | 78 | ENDFUNC_HOOK(name); \ |
6c54cbd3 | 79 | _LTORG |
1a0c09c4 | 80 | |
8ae4c946 MW |
81 | // Make a helper function, if necessary.  The body between `AUXFN(name)' and `ENDAUXFN' is assembled only while the symbol `.L$_auxfn_def.NAME' is undefined; it is placed in `.text' subsection 7128 under the label NAME, after which the symbol is set and the current code subsection is reselected. |
82 | #define AUXFN(name) \ | |
83 | .ifndef .L$_auxfn_def.name; \ | |
84 | .text 7128; \ | |
85 | .macro _ENDAUXFN; _ENDAUXFN_TAIL(name); .endm; \ | |
86 | FUNC_PREHOOK(name); \ | |
87 | name: | |
88 | #define _ENDAUXFN_TAIL(name) \ | |
89 | .purgem _ENDAUXFN; \ | |
90 | .text .L$_subsec; \ | |
91 | .L$_auxfn_def.name = 1 | |
92 | #define ENDAUXFN _ENDAUXFN; .endif // Closes the `.ifndef' opened by `AUXFN'. | |
93 | ||
1a0c09c4 MW |
94 | ///-------------------------------------------------------------------------- |
95 | /// ELF-specific hacking. | |
96 | ||
97 | #if __ELF__ | |
98 | ||
99 | #if __PIC__ || __PIE__ | |
100 | # define WANT_PIC 1 // Set whenever __PIC__ or __PIE__ is nonzero. | |
101 | #endif | |
102 | ||
103 | #define TYPE_FUNC(name) .type name, STT_FUNC // Give NAME the function symbol type. | |
104 | ||
105 | #define SIZE_OBJ(name) .size name, . - name // Size is the distance from NAME's label to the current location. | |
106 | ||
107 | #endif // __ELF__ | |
108 | ||
109 | ///-------------------------------------------------------------------------- | |
0f23f75f MW |
110 | /// Windows-specific hacking. |
111 | ||
112 | #if ABI_WIN | |
1a0c09c4 MW |
113 | |
114 | #if CPUFAM_X86 | |
0f23f75f MW |
115 | # define F(name) _##name // With ABI_WIN and CPUFAM_X86, function labels get a leading underscore. |
116 | #endif | |
117 | ||
118 | #endif // ABI_WIN | |
119 | ||
120 | ///-------------------------------------------------------------------------- | |
121 | /// x86- and amd64-specific hacking. | |
122 | /// | |
123 | /// It's (slightly) easier to deal with both of these in one go. | |
124 | ||
125 | #if CPUFAM_X86 || CPUFAM_AMD64 | |
1a0c09c4 MW |
126 | |
127 | // Set the function hooks.  Every function entry point is aligned to a 16-byte boundary. | |
128 | #define FUNC_PREHOOK(_) .balign 16 | |
129 | ||
f71dd54d MW |
130 | // On Windows, arrange to install stack-unwinding data: on amd64, `FUNC' opens a `.seh_proc' region and `ENDFUNC' closes it. |
131 | #if CPUFAM_AMD64 && ABI_WIN | |
132 | # define FUNC_POSTHOOK(name) .seh_proc name | |
133 | # define ENDFUNC_HOOK(_) .seh_endproc | |
134 | // Procedures are expected to invoke `.seh_setframe' if necessary, and | |
135 | // `.seh_pushreg' and friends, and `.seh_endprologue'. | |
136 | #endif | |
137 | ||
1a0c09c4 MW |
138 | // Don't use the wretched AT&T syntax. It's festooned with pointless |
139 | // punctuation, and all of the data movement is backwards. Ugh! | |
140 | .intel_syntax noprefix | |
141 | ||
142 | // Call external subroutine at ADDR, possibly via PLT: with WANT_PIC the call targets `ADDR@PLT', otherwise ADDR itself. | |
8a1aa284 | 143 | .macro callext addr |
1a0c09c4 MW |
144 | #if WANT_PIC |
145 | call \addr@PLT | |
146 | #else | |
147 | call \addr | |
148 | #endif | |
8a1aa284 | 149 | .endm |
1a0c09c4 MW |
150 | |
151 | // Do I need to arrange a spare GOT register?  Only for position-independent code on CPUFAM_X86. | |
152 | #if WANT_PIC && CPUFAM_X86 | |
153 | # define NEED_GOT 1 | |
154 | #endif | |
155 | #define GOTREG ebx // Not needed in AMD64 so don't care. | |
156 | ||
157 | // Maybe load the GOT address into register GOT (default GOTREG).  Expands to nothing unless WANT_PIC && CPUFAM_X86. | |
8a1aa284 | 158 | .macro ldgot got=GOTREG |
0f23f75f | 159 | #if WANT_PIC && CPUFAM_X86 |
8ae4c946 | 160 | AUXFN(_ldgot.\got) |
1a0c09c4 MW |
161 | mov \got, [esp] // Helper: copy our return address into GOT. |
162 | ret | |
8ae4c946 MW |
163 | ENDAUXFN |
164 | call _ldgot.\got // GOT = address of the instruction after this call. | |
165 | add \got, offset _GLOBAL_OFFSET_TABLE_ // Adjust that address to point at the GOT. | |
1a0c09c4 | 166 | #endif |
8a1aa284 | 167 | .endm |
1a0c09c4 MW |
168 | |
169 | // Load address of external symbol ADDR into REG, maybe using GOT.  With WANT_PIC the address is read from ADDR's GOT entry (indexed from register GOT on x86, rip-relative on amd64); otherwise it is formed directly (`mov ..., offset' on x86, rip-relative `lea' on amd64). | |
8a1aa284 | 170 | .macro leaext reg, addr, got=GOTREG |
1a0c09c4 | 171 | #if WANT_PIC |
0f23f75f | 172 | # if CPUFAM_X86 |
1a0c09c4 | 173 | mov \reg, [\got + \addr@GOT] |
0f23f75f MW |
174 | # endif |
175 | # if CPUFAM_AMD64 | |
176 | mov \reg, \addr@GOTPCREL[rip] | |
177 | # endif | |
1a0c09c4 | 178 | #else |
0f23f75f | 179 | # if CPUFAM_X86 |
1a0c09c4 | 180 | mov \reg, offset \addr |
0f23f75f MW |
181 | # endif |
182 | # if CPUFAM_AMD64 | |
183 | lea \reg, \addr[rip] | |
184 | # endif | |
1a0c09c4 | 185 | #endif |
8a1aa284 | 186 | .endm |
1a0c09c4 MW |
187 | |
188 | // Address expression (possibly using a base register, and a displacement) | |
189 | // referring to ADDR, which is within our module, maybe using GOT.  Use as INTADDR(addr) or INTADDR(addr, got): the extra arguments appended by `INTADDR' make GOT default to GOTREG when it is omitted.  The result is rip-relative on amd64, GOT-relative (`@GOTOFF') for x86 with WANT_PIC, and plain ADDR otherwise. | |
190 | #define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy) | |
191 | #define INTADDR__0(addr, got, ...) INTADDR__1(addr, got) | |
0f23f75f MW |
192 | #if CPUFAM_AMD64 |
193 | # define INTADDR__1(addr, got) addr + rip | |
194 | #elif WANT_PIC | |
1a0c09c4 MW |
195 | # define INTADDR__1(addr, got) got + addr@GOTOFF |
196 | #else | |
197 | # define INTADDR__1(addr, got) addr | |
198 | #endif | |
199 | ||
a13b5730 MW |
200 | // Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate, |
201 | // suitable for use in `pshufd' or `shufps', which copies element D | |
202 | // (0 <= D < 4) of the source to element 3 of the destination, element C to | |
203 | // element 2, element B to element 1, and element A to element 0.  Each selector occupies two bits of the result, with A in the lowest bits. | |
204 | #define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a)) | |
205 | ||
1a0c09c4 MW |
206 | #endif |
207 | ||
17de5b2e MW |
208 | #if CPUFAM_X86 |
209 | ||
210 | .macro _reg.0 | |
211 | // Stash GP registers and establish temporary stack frame.  Five dwords are pushed (flags, eax, ecx, edx, ebp), then EBP marks the frame base. | |
212 | pushfd | |
213 | push eax | |
214 | push ecx | |
215 | push edx | |
216 | push ebp | |
217 | mov ebp, esp | |
218 | and esp, ~15 // Round the stack pointer down to a 16-byte boundary. | |
219 | sub esp, 512 // Reserve the save area written by `fxsave'. | |
220 | fxsave [esp] | |
221 | .endm | |
222 | ||
223 | .macro _reg.1 // Intentionally empty in this (x86) variant. | |
224 | .endm | |
225 | ||
226 | .macro _reg.2 // Intentionally empty in this (x86) variant. | |
227 | .endm | |
228 | ||
229 | .macro _reg.3 fmt | |
230 | // Print FMT and the other established arguments, then undo everything `_reg.0' did. | |
231 | lea eax, .L$_reg$msg.\@ | |
232 | push eax | |
233 | call printf | |
234 | jmp .L$_reg$cont.\@ // Skip over the message text, which is assembled inline. | |
235 | .L$_reg$msg.\@: | |
236 | .ascii ";; \fmt\n\0" | |
237 | .L$_reg$cont.\@: | |
238 | mov eax, ebp // Recompute the save-area address exactly as `_reg.0' derived it. | |
239 | and eax, ~15 | |
240 | sub eax, 512 | |
241 | fxrstor [eax] | |
242 | mov esp, ebp // Discard the arguments and save area in one step. | |
243 | pop ebp | |
244 | pop edx | |
245 | pop ecx | |
246 | pop eax | |
247 | popfd | |
248 | .endm | |
249 | ||
250 | .macro msg msg // Print the message MSG via `printf', preserving the registers saved by `_reg.0'. | |
251 | _reg.0 | |
252 | _reg.1 | |
253 | _reg.2 | |
254 | _reg.3 "\msg" | |
255 | .endm | |
256 | ||
257 | .macro reg r, msg // Print MSG and the value of 32-bit register R in hex. | |
258 | _reg.0 | |
259 | .ifeqs "\r", "esp" | |
260 | lea eax, [ebp + 20] // ESP as it was on entry: `_reg.0' pushed five dwords below it. | |
261 | push eax | |
262 | .else | |
263 | .ifeqs "\r", "ebp" | |
264 | push [ebp] // The entry EBP was saved at the frame base by `_reg.0'. | |
265 | .else | |
266 | push \r | |
267 | .endif | |
268 | .endif | |
269 | _reg.1 | |
270 | _reg.2 | |
271 | _reg.3 "\msg: \r = %08x" | |
272 | .endm | |
273 | ||
274 | .macro xmmreg r, msg // Print MSG and the four 32-bit words of XMM register R. | |
275 | _reg.0 | |
276 | _reg.1 | |
277 | _reg.2 | |
278 | movdqu xmm0, \r | |
279 | pshufd xmm0, xmm0, 0x1b // Reverse the word order so element 3 is passed (and printed) first. | |
280 | sub esp, 16 | |
281 | movdqa [esp], xmm0 | |
282 | _reg.3 "\msg: \r = %08x %08x %08x %08x" | |
283 | .endm | |
284 | ||
285 | .macro mmreg r, msg // Print MSG and the two 32-bit halves of MMX register R. | |
286 | _reg.0 | |
287 | _reg.1 | |
288 | _reg.2 | |
289 | pshufw \r, \r, 0x4e // Swap the two 32-bit halves so the high half is passed first. | |
290 | sub esp, 8 | |
291 | movq [esp], \r | |
292 | _reg.3 "\msg: \r = %08x %08x" | |
293 | .endm | |
294 | ||
295 | .macro freg i, msg // Print MSG and floating-point stack register I using `%.20Lg'. | |
296 | _reg.0 | |
297 | _reg.1 | |
298 | _reg.2 | |
299 | finit | |
300 | fldt [esp + 32 + 16*\i] // Reload the value from the `fxsave' image made by `_reg.0'. | |
301 | sub esp, 12 | |
302 | fstpt [esp] | |
303 | _reg.3 "\msg: st(\i) = %.20Lg" | |
304 | .endm | |
305 | ||
306 | .macro fxreg i, msg // As `freg', but prints with `%La'. | |
307 | _reg.0 | |
308 | _reg.1 | |
309 | _reg.2 | |
310 | finit | |
311 | fldt [esp + 32 + 16*\i] // Reload the value from the `fxsave' image made by `_reg.0'. | |
312 | sub esp, 12 | |
313 | fstpt [esp] | |
314 | _reg.3 "\msg: st(\i) = %La" | |
315 | .endm | |
317 | #endif | |
318 | ||
1a0c09c4 | 319 | ///-------------------------------------------------------------------------- |
61bd904b MW |
320 | /// ARM-specific hacking. |
321 | ||
59d86860 | 322 | #if CPUFAM_ARMEL |
61bd904b | 323 | |
9f6eb05d MW |
324 | // ARM/Thumb mode things. Use ARM by default.  Each selector also sets `.L$_pcoff' (8 for ARM, 4 for Thumb), which the pc-relative literals below subtract as the pc read-ahead. |
325 | #define ARM .arm; .L$_pcoff = 8 | |
326 | #define THUMB .thumb; .L$_pcoff = 4 | |
327 | ARM | |
328 | ||
61bd904b MW |
329 | // Set the function hooks.  Functions are aligned to 4 bytes, and `ENDFUNC' dumps the assembler's literal pool. |
330 | #define FUNC_PREHOOK(_) .balign 4 | |
331 | #define ENDFUNC_HOOK(name) .ltorg | |
332 | ||
333 | // Call external subroutine at ADDR, possibly via PLT.  COND is an optional condition-code suffix for the `bl' instruction. | |
8a1aa284 | 334 | .macro callext addr, cond= |
61bd904b MW |
335 | #if WANT_PIC |
336 | bl\cond \addr(PLT) | |
337 | #else | |
338 | bl\cond \addr | |
339 | #endif | |
8a1aa284 | 340 | .endm |
61bd904b MW |
341 | |
342 | // Do I need to arrange a spare GOT register?  Yes, whenever WANT_PIC is set. | |
343 | #if WANT_PIC | |
344 | # define NEED_GOT 1 | |
345 | #endif | |
346 | #define GOTREG r9 | |
347 | ||
348 | // Maybe load the GOT address into register GOT (default GOTREG); COND is an optional condition-code suffix.  Expands to nothing without WANT_PIC. | |
8a1aa284 | 349 | .macro ldgot cond=, got=GOTREG |
61bd904b | 350 | #if WANT_PIC |
adca2a18 MW |
351 | ldr\cond \got, .L$_ldgot$\@ // Fetch the pc-relative offset of the GOT from the literal below. |
352 | .L$_ldgot_pc$\@: | |
2d03a881 | 353 | add\cond \got, pc, \got |
8a1aa284 | 354 | _LIT |
adca2a18 MW |
355 | .balign 4 |
356 | .L$_ldgot$\@: | |
9f6eb05d | 357 | .word _GLOBAL_OFFSET_TABLE_ - .L$_ldgot_pc$\@ - .L$_pcoff |
8a1aa284 | 358 | _ENDLIT |
61bd904b | 359 | #endif |
8a1aa284 | 360 | .endm |
61bd904b MW |
361 | |
362 | // Load address of external symbol ADDR into REG, maybe using GOT.  With WANT_PIC, REG first receives ADDR's GOT offset from a literal and is then used to index register GOT; otherwise the address comes from an `ldr =' literal.  COND is an optional condition-code suffix. | |
8a1aa284 | 363 | .macro leaext reg, addr, cond=, got=GOTREG |
61bd904b | 364 | #if WANT_PIC |
adca2a18 | 365 | ldr\cond \reg, .L$_leaext$\@ |
2d03a881 | 366 | ldr\cond \reg, [\got, \reg] |
8a1aa284 | 367 | _LIT |
adca2a18 MW |
368 | .balign 4 |
369 | .L$_leaext$\@: | |
370 | .word \addr(GOT) | |
8a1aa284 | 371 | _ENDLIT |
61bd904b | 372 | #else |
2d03a881 | 373 | ldr\cond \reg, =\addr |
61bd904b | 374 | #endif |
8a1aa284 | 375 | .endm |
61bd904b | 376 | |
0c53ac58 | 377 | // Load address of external symbol ADDR into REG directly, i.e. pc-relative and without using a GOT register.  COND is an optional condition-code suffix. |
8a1aa284 | 378 | .macro leaextq reg, addr, cond= |
0c53ac58 MW |
379 | #if WANT_PIC |
380 | ldr\cond \reg, .L$_leaextq$\@ | |
381 | .L$_leaextq_pc$\@: | |
9f6eb05d | 382 | .if .L$_pcoff == 8 |
0c53ac58 | 383 | ldr\cond \reg, [pc, \reg] |
9f6eb05d MW |
384 | .else |
385 | add\cond \reg, pc | |
386 | ldr\cond \reg, [\reg] | |
387 | .endif | |
8a1aa284 | 388 | _LIT |
0c53ac58 MW |
389 | .balign 4 |
390 | .L$_leaextq$\@: | |
9f6eb05d | 391 | .word \addr(GOT_PREL) + (. - .L$_leaextq_pc$\@ - .L$_pcoff) |
8a1aa284 | 392 | _ENDLIT |
0c53ac58 MW |
393 | #else |
394 | ldr\cond \reg, =\addr | |
395 | #endif | |
8a1aa284 | 396 | .endm |
0c53ac58 | 397 | |
61bd904b MW |
398 | #endif |
399 | ||
400 | ///-------------------------------------------------------------------------- | |
1a0c09c4 MW |
401 | /// Final stuff. |
402 | ||
403 | // Default values for the various hooks.  Anything not defined by a platform section above expands to nothing, except `F', which defaults to the unmodified name. | |
404 | #ifndef FUNC_PREHOOK | |
405 | # define FUNC_PREHOOK(name) | |
406 | #endif | |
407 | #ifndef FUNC_POSTHOOK | |
408 | # define FUNC_POSTHOOK(name) | |
409 | #endif | |
410 | #ifndef ENDFUNC_HOOK | |
411 | # define ENDFUNC_HOOK(name) | |
412 | #endif | |
413 | ||
414 | #ifndef F | |
415 | # define F(name) name | |
416 | #endif | |
417 | ||
418 | #ifndef TYPE_FUNC | |
419 | # define TYPE_FUNC(name) | |
420 | #endif | |
421 | ||
422 | #ifndef SIZE_OBJ | |
423 | # define SIZE_OBJ(name) | |
424 | #endif | |
425 | ||
1aa5bfa8 MW |
426 | #if __ELF__ && !defined(WANT_EXECUTABLE_STACK) // A `.note.GNU-stack' section with empty flags declares that a non-executable stack is fine, so emit it unless an executable stack was requested. |
427 | .pushsection .note.GNU-stack, "", _SECTTY(progbits) | |
428 | .popsection | |
429 | #endif | |
430 | ||
1a0c09c4 | 431 | ///----- That's all, folks -------------------------------------------------- |