Commit | Line | Data |
---|---|---|
1a0c09c4 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// Common definitions for assembler source files | |
4 | /// | |
5 | /// (c) 2015 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software; you can redistribute it and/or modify | |
13 | /// it under the terms of the GNU Library General Public License as | |
14 | /// published by the Free Software Foundation; either version 2 of the | |
15 | /// License, or (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, | |
18 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | /// GNU Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb; if not, write to the Free | |
24 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | /// MA 02111-1307, USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
28 | /// General definitions. | |
29 | ||
898f32b3 MW |
30 | // Preprocessor hacks.  `STRINGY(x)' macro-expands x and then turns the result into a string literal; `GLUE(x, y)' macro-expands x and y and then pastes the results into a single token.  The underscore-prefixed forms do the same to their arguments exactly as written, without expanding them first.  `_EMPTY' expands to nothing. |
31 | #define STRINGY(x) _STRINGY(x) | |
32 | #define _STRINGY(x) #x | |
33 | #define GLUE(x, y) _GLUE(x, y) | |
34 | #define _GLUE(x, y) x##y | |
35 | #define _EMPTY | |
36 | ||
f8e509a9 MW |
37 | // Some useful variables.  `.L$_subsec' holds the number of the `.text' subsection currently used for code; it starts at zero. |
38 | .L$_subsec = 0 | |
39 | ||
40 | // Literal pools done the hard way.  `_LIT' switches to the subsection numbered one above the current code subsection and `_ENDLIT' switches back to the code subsection; `_LTORG' advances `.L$_subsec' by two and selects the new code subsection. | |
41 | #define _LIT .text .L$_subsec + 1 | |
42 | #define _ENDLIT .text .L$_subsec | |
6c54cbd3 | 43 | #define _LTORG .L$_subsec = .L$_subsec + 2; .text .L$_subsec |
f8e509a9 | 44 | |
645fcce0 MW |
45 | // ELF section types.  `_SECTTY(ty)' spells the section-type argument of a `.section' directive: `%ty' when CPUFAM_ARMEL is set and `@ty' otherwise.  It is only defined when __ELF__ is set. |
46 | #if __ELF__ | |
47 | # if CPUFAM_ARMEL | |
48 | # define _SECTTY(ty) %ty | |
49 | # else | |
50 | # define _SECTTY(ty) @ty | |
51 | # endif | |
52 | #endif | |
53 | ||
54 | // Section selection.  `TEXT' selects the current code subsection of `.text'; `RODATA' selects `.rdata' under ABI_WIN, `.rodata' under ELF, and otherwise falls back to `TEXT'; `DATA' selects `.data'. | |
55 | #define TEXT .text .L$_subsec | |
56 | #if ABI_WIN | |
57 | # define RODATA .section .rdata, "dr" | |
58 | #elif __ELF__ | |
59 | # define RODATA .section .rodata, "a", _SECTTY(progbits) | |
60 | #else | |
61 | # define RODATA TEXT | |
62 | #endif | |
63 | #define DATA .data | |
64 | ||
1a0c09c4 MW |
65 | // Announcing an external function.  `FUNC(name)' exports `F(name)', applies `TYPE_FUNC', defines a one-shot assembler macro `ENDFUNC' which remembers NAME, runs `FUNC_PREHOOK', emits the label `F(name):', and finally runs `FUNC_POSTHOOK'.  Close the function body with a bare `ENDFUNC'. |
66 | #define FUNC(name) \ | |
67 | .globl F(name); \ | |
68 | TYPE_FUNC(name); \ | |
8a1aa284 | 69 | .macro ENDFUNC; _ENDFUNC(name); .endm; \ |
1a0c09c4 MW |
70 | FUNC_PREHOOK(name); \ |
71 | F(name): \ | |
72 | FUNC_POSTHOOK(name) | |
73 | ||
74 | // Marking the end of a function.  This is what the `ENDFUNC' macro defined by `FUNC' expands to: it purges `ENDFUNC' (so that the next `FUNC' can define it afresh), applies `SIZE_OBJ', runs `ENDFUNC_HOOK', and moves on to a fresh code subsection with `_LTORG'. | |
75 | #define _ENDFUNC(name) \ | |
76 | .purgem ENDFUNC; \ | |
77 | SIZE_OBJ(name); \ | |
f8e509a9 | 78 | ENDFUNC_HOOK(name); \ |
6c54cbd3 | 79 | _LTORG |
1a0c09c4 | 80 | |
8ae4c946 MW |
81 | // Make a helper function, if necessary.  `AUXFN(name)' opens an `.ifndef' on the symbol `.L$_auxfn_def.name', so everything up to the matching `ENDAUXFN' is assembled only if that symbol is not yet defined.  The helper is placed in `.text' subsection 7128 under the label `name:'; `ENDAUXFN' switches back to the current code subsection, defines `.L$_auxfn_def.name', and closes the `.ifndef'. |
82 | #define AUXFN(name) \ | |
83 | .ifndef .L$_auxfn_def.name; \ | |
84 | .text 7128; \ | |
85 | .macro _ENDAUXFN; _ENDAUXFN_TAIL(name); .endm; \ | |
86 | FUNC_PREHOOK(name); \ | |
87 | name: | |
88 | #define _ENDAUXFN_TAIL(name) \ | |
89 | .purgem _ENDAUXFN; \ | |
90 | .text .L$_subsec; \ | |
91 | .L$_auxfn_def.name = 1 | |
92 | #define ENDAUXFN _ENDAUXFN; .endif | |
93 | ||
1a0c09c4 MW |
94 | ///-------------------------------------------------------------------------- |
95 | /// ELF-specific hacking.  Sets WANT_PIC when the compiler reports __PIC__ or __PIE__, and defines `TYPE_FUNC' and `SIZE_OBJ' to emit `.type' and `.size' directives for a symbol. | |
96 | ||
97 | #if __ELF__ | |
98 | ||
99 | #if __PIC__ || __PIE__ | |
100 | # define WANT_PIC 1 | |
101 | #endif | |
102 | ||
103 | #define TYPE_FUNC(name) .type name, STT_FUNC | |
104 | ||
105 | #define SIZE_OBJ(name) .size name, . - name | |
106 | ||
107 | #endif | |
108 | ||
109 | ///-------------------------------------------------------------------------- | |
0f23f75f MW |
110 | /// Windows-specific hacking.  On x86 Windows, `F(name)' prefixes the symbol name with an underscore; elsewhere `F' is left for the default at the end of this file to define. |
111 | ||
112 | #if ABI_WIN | |
1a0c09c4 MW |
113 | |
114 | #if CPUFAM_X86 | |
0f23f75f MW |
115 | # define F(name) _##name |
116 | #endif | |
117 | ||
118 | #endif | |
119 | ||
120 | ///-------------------------------------------------------------------------- | |
121 | /// x86- and amd64-specific hacking. | |
122 | /// | |
123 | /// It's (slightly) easier to deal with both of these in one go. | |
124 | ||
125 | #if CPUFAM_X86 || CPUFAM_AMD64 | |
1a0c09c4 MW |
126 | |
127 | // Set the function hooks.  Every function (and AUXFN helper) entry point is aligned to a 16-byte boundary. | |
128 | #define FUNC_PREHOOK(_) .balign 16 | |
129 | ||
f71dd54d MW |
130 | // On Windows, arrange to install stack-unwinding data: on AMD64 Windows, `FUNC' opens an `.seh_proc' region and `ENDFUNC' closes it with `.seh_endproc'. |
131 | #if CPUFAM_AMD64 && ABI_WIN | |
132 | # define FUNC_POSTHOOK(name) .seh_proc name | |
133 | # define ENDFUNC_HOOK(_) .seh_endproc | |
134 | // Procedures are expected to invoke `.seh_setframe' if necessary, and | |
135 | // `.seh_pushreg' and friends, and `.seh_endprologue'. | |
136 | #endif | |
137 | ||
1a0c09c4 MW |
138 | // Don't use the wretched AT&T syntax. It's festooned with pointless |
139 | // punctuation, and all of the data movement is backwards. Ugh!  (So everything below, and every x86/AMD64 file using these definitions, is written in Intel syntax without `%' register prefixes.) | |
140 | .intel_syntax noprefix | |
141 | ||
142 | // Call external subroutine at ADDR, possibly via PLT: under WANT_PIC the call goes to `ADDR@PLT', otherwise directly to ADDR. | |
8a1aa284 | 143 | .macro callext addr |
1a0c09c4 MW |
144 | #if WANT_PIC |
145 | call \addr@PLT | |
146 | #else | |
147 | call \addr | |
148 | #endif | |
8a1aa284 | 149 | .endm |
1a0c09c4 MW |
150 | |
151 | // Do I need to arrange a spare GOT register?  NEED_GOT is set only for position-independent x86 code; GOTREG names the default register used to hold the GOT address. | |
152 | #if WANT_PIC && CPUFAM_X86 | |
153 | # define NEED_GOT 1 | |
154 | #endif | |
155 | #define GOTREG ebx // Not needed in AMD64 so don't care. | |
156 | ||
157 | // Maybe load GOT address into GOT.  Under WANT_PIC on x86 this defines (once per register, via AUXFN) a tiny helper `_ldgot.GOT' which copies its own return address from the top of the stack into GOT and returns; the macro calls the helper and then adds `offset _GLOBAL_OFFSET_TABLE_' to GOT.  In every other configuration the macro expands to nothing. | |
8a1aa284 | 158 | .macro ldgot got=GOTREG |
0f23f75f | 159 | #if WANT_PIC && CPUFAM_X86 |
8ae4c946 | 160 | AUXFN(_ldgot.\got) |
1a0c09c4 MW |
161 | mov \got, [esp] |
162 | ret | |
8ae4c946 MW |
163 | ENDAUXFN |
164 | call _ldgot.\got | |
165 | add \got, offset _GLOBAL_OFFSET_TABLE_ | |
1a0c09c4 | 166 | #endif |
8a1aa284 | 167 | .endm |
1a0c09c4 MW |
168 | |
169 | // Load address of external symbol ADDR into REG, maybe using GOT.  Under WANT_PIC the address is loaded from memory: from `[GOT + ADDR@GOT]' on x86, or from `ADDR@GOTPCREL[rip]' on AMD64.  Otherwise it is formed directly: `mov REG, offset ADDR' on x86, or `lea REG, ADDR[rip]' on AMD64.  The GOT argument is only used in the x86 WANT_PIC case. | |
8a1aa284 | 170 | .macro leaext reg, addr, got=GOTREG |
1a0c09c4 | 171 | #if WANT_PIC |
0f23f75f | 172 | # if CPUFAM_X86 |
1a0c09c4 | 173 | mov \reg, [\got + \addr@GOT] |
0f23f75f MW |
174 | # endif |
175 | # if CPUFAM_AMD64 | |
176 | mov \reg, \addr@GOTPCREL[rip] | |
177 | # endif | |
1a0c09c4 | 178 | #else |
0f23f75f | 179 | # if CPUFAM_X86 |
1a0c09c4 | 180 | mov \reg, offset \addr |
0f23f75f MW |
181 | # endif |
182 | # if CPUFAM_AMD64 | |
183 | lea \reg, \addr[rip] | |
184 | # endif | |
1a0c09c4 | 185 | #endif |
8a1aa284 | 186 | .endm |
1a0c09c4 MW |
187 | |
188 | // Address expression (possibly using a base register, and a displacement) | |
189 | // referring to ADDR, which is within our module, maybe using GOT.  Use as `INTADDR(addr)' or `INTADDR(addr, got)'; the variadic indirection through `INTADDR__0' supplies GOTREG when no register is given.  The result is `addr + rip' on AMD64, `got + addr@GOTOFF' for position-independent x86, and plain `addr' otherwise. | |
190 | #define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy) | |
191 | #define INTADDR__0(addr, got, ...) INTADDR__1(addr, got) | |
0f23f75f MW |
192 | #if CPUFAM_AMD64 |
193 | # define INTADDR__1(addr, got) addr + rip | |
194 | #elif WANT_PIC | |
1a0c09c4 MW |
195 | # define INTADDR__1(addr, got) got + addr@GOTOFF |
196 | #else | |
197 | # define INTADDR__1(addr, got) addr | |
198 | #endif | |
199 | ||
a13b5730 MW |
200 | // Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate, |
201 | // suitable for use in `pshufd' or `shufps', which copies element D | |
202 | // (0 <= D < 4) of the source to element 3 of the destination, element C to | |
203 | // element 2, element B to element 1, and element A to element 0.  Each selector occupies two bits of the result, with A in the least significant position. | |
204 | #define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a)) | |
205 | ||
43ea7558 MW |
206 | // Map register names to their individual pieces. |
207 | ||
208 | // Apply decoration decor to (internal) register name reg of type ty. | |
209 | // | |
210 | // See `R_...' for internal register names. Decorations are as follows. | |
211 | // | |
212 | // b low byte (e.g., `al', `r8b') | |
213 | // h high byte (e.g., `ah'); defined for type `abcd' only | |
214 | // w word (e.g., `ax', `r8w') | |
215 | // d doubleword (e.g., `eax', `r8d') | |
216 | // q quadword (e.g., `rax', `r8'); defined only when CPUFAM_AMD64 is set | |
217 | // r whole register (doubleword on x86, quadword on amd64) | |
218 | // | |
219 | // And types are as follows.  (There is also a type `mem', used by `MEM': its `register name' is an address expression, and its decorations `b', `w', `d', `q' prefix the matching `... ptr' size keyword.) | |
220 | // | |
221 | // abcd the four traditional registers `a', `b', `c', `d' | |
222 | // xp the four pointer registers `si', `di', `bp', `sp' | |
223 | // ip the instruction pointer `ip' | |
224 | // rn the AMD64 numbered registers `r8'--`r15' | |
225 | #define _DECOR(ty, decor, reg) _DECOR_##ty##_##decor(reg) | |
226 | ||
227 | // Internal macros: _DECOR_ty_decor(reg) applies decoration decor to | |
228 | // (internal) register name reg of type ty.  Each one simply pastes the appropriate prefix and/or suffix onto reg. | |
229 | ||
230 | #define _DECOR_abcd_b(reg) reg##l | |
231 | #define _DECOR_abcd_h(reg) reg##h | |
232 | #define _DECOR_abcd_w(reg) reg##x | |
233 | #define _DECOR_abcd_d(reg) e##reg##x | |
234 | #if CPUFAM_AMD64 | |
235 | # define _DECOR_abcd_q(reg) r##reg##x | |
236 | #endif | |
237 | ||
238 | #define _DECOR_xp_b(reg) reg##l | |
239 | #define _DECOR_xp_w(reg) reg | |
240 | #define _DECOR_xp_d(reg) e##reg | |
241 | #if CPUFAM_AMD64 | |
242 | # define _DECOR_xp_q(reg) r##reg | |
243 | #endif | |
244 | ||
245 | #define _DECOR_ip_w(reg) reg | |
246 | #define _DECOR_ip_d(reg) e##reg | |
247 | #if CPUFAM_AMD64 | |
248 | # define _DECOR_ip_q(reg) r##reg | |
249 | #endif | |
250 | ||
251 | #if CPUFAM_AMD64 | |
252 | # define _DECOR_rn_b(reg) reg##b | |
253 | # define _DECOR_rn_w(reg) reg##w | |
254 | # define _DECOR_rn_d(reg) reg##d | |
255 | # define _DECOR_rn_q(reg) reg | |
256 | # define _DECOR_rn_r(reg) reg | |
257 | #endif | |
258 | ||
259 | #if CPUFAM_X86 | |
260 | # define _DECOR_abcd_r(reg) e##reg##x | |
261 | # define _DECOR_xp_r(reg) e##reg | |
262 | # define _DECOR_ip_r(reg) e##reg | |
263 | #endif | |
264 | #if CPUFAM_AMD64 | |
265 | # define _DECOR_abcd_r(reg) r##reg##x | |
266 | # define _DECOR_xp_r(reg) r##reg | |
267 | # define _DECOR_ip_r(reg) r##reg | |
268 | #endif | |
269 | ||
270 | #define _DECOR_mem_b(addr) byte ptr addr | |
271 | #define _DECOR_mem_w(addr) word ptr addr | |
272 | #define _DECOR_mem_d(addr) dword ptr addr | |
273 | #if CPUFAM_AMD64 | |
274 | # define _DECOR_mem_q(addr) qword ptr addr | |
275 | #endif | |
276 | ||
277 | // R_r(decor) applies decoration decor to register r, which is an internal | |
278 | // register name. The internal register names are: `ip', `a', `b', `c', `d', | |
279 | // `si', `di', `bp', `sp', `r8'--`r15'.  For example, `R_a(w)' expands to `ax' and `R_si(d)' to `esi'.  The `r8'--`r15' forms exist only when CPUFAM_AMD64 is set. | |
280 | #define R_ip(decor) _DECOR(ip, decor, ip) | |
281 | #define R_a(decor) _DECOR(abcd, decor, a) | |
282 | #define R_b(decor) _DECOR(abcd, decor, b) | |
283 | #define R_c(decor) _DECOR(abcd, decor, c) | |
284 | #define R_d(decor) _DECOR(abcd, decor, d) | |
285 | #define R_si(decor) _DECOR(xp, decor, si) | |
286 | #define R_di(decor) _DECOR(xp, decor, di) | |
287 | #define R_bp(decor) _DECOR(xp, decor, bp) | |
288 | #define R_sp(decor) _DECOR(xp, decor, sp) | |
289 | #if CPUFAM_AMD64 | |
290 | # define R_r8(decor) _DECOR(rn, decor, r8) | |
291 | # define R_r9(decor) _DECOR(rn, decor, r9) | |
292 | # define R_r10(decor) _DECOR(rn, decor, r10) | |
293 | # define R_r11(decor) _DECOR(rn, decor, r11) | |
294 | # define R_r12(decor) _DECOR(rn, decor, r12) | |
295 | # define R_r13(decor) _DECOR(rn, decor, r13) | |
296 | # define R_r14(decor) _DECOR(rn, decor, r14) | |
297 | # define R_r15(decor) _DECOR(rn, decor, r15) | |
298 | #endif | |
299 | ||
300 | // Refer to an in-memory datum of the type implied by decor residing at | |
301 | // address addr (which should supply its own square-brackets).  For example, `MEM(d, [eax + 4])' expands to `dword ptr [eax + 4]'. | |
302 | #define MEM(decor, addr) _DECOR(mem, decor, addr) | |
303 | ||
304 | // Applies decoration decor to assembler-level register name reg, by dispatching to the `_REGFORM_reg' macro for that name. | |
305 | #define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor) | |
306 | ||
307 | // Internal macros: _REGFORM_r(decor) applies decoration decor to an | |
308 | // assembler-level register name, in place of any decoration that register | |
309 | // name has already.  Every spelling of a register maps to the same `R_...' macro: e.g., `_REGFORM_al', `_REGFORM_ax' and `_REGFORM_eax' all expand to `R_a(decor)'.  The 64-bit spellings and the numbered registers are only available when CPUFAM_AMD64 is set. | |
310 | ||
311 | #define _REGFORM_ip(decor) R_ip(decor) | |
312 | #define _REGFORM_eip(decor) R_ip(decor) | |
313 | ||
314 | #define _REGFORM_a(decor) R_a(decor) | |
315 | #define _REGFORM_al(decor) R_a(decor) | |
316 | #define _REGFORM_ah(decor) R_a(decor) | |
317 | #define _REGFORM_ax(decor) R_a(decor) | |
318 | #define _REGFORM_eax(decor) R_a(decor) | |
319 | ||
320 | #define _REGFORM_b(decor) R_b(decor) | |
321 | #define _REGFORM_bl(decor) R_b(decor) | |
322 | #define _REGFORM_bh(decor) R_b(decor) | |
323 | #define _REGFORM_bx(decor) R_b(decor) | |
324 | #define _REGFORM_ebx(decor) R_b(decor) | |
325 | ||
326 | #define _REGFORM_c(decor) R_c(decor) | |
327 | #define _REGFORM_cl(decor) R_c(decor) | |
328 | #define _REGFORM_ch(decor) R_c(decor) | |
329 | #define _REGFORM_cx(decor) R_c(decor) | |
330 | #define _REGFORM_ecx(decor) R_c(decor) | |
331 | ||
332 | #define _REGFORM_d(decor) R_d(decor) | |
333 | #define _REGFORM_dl(decor) R_d(decor) | |
334 | #define _REGFORM_dh(decor) R_d(decor) | |
335 | #define _REGFORM_dx(decor) R_d(decor) | |
336 | #define _REGFORM_edx(decor) R_d(decor) | |
337 | ||
338 | #define _REGFORM_si(decor) R_si(decor) | |
339 | #define _REGFORM_sil(decor) R_si(decor) | |
340 | #define _REGFORM_esi(decor) R_si(decor) | |
341 | ||
342 | #define _REGFORM_di(decor) R_di(decor) | |
343 | #define _REGFORM_dil(decor) R_di(decor) | |
344 | #define _REGFORM_edi(decor) R_di(decor) | |
345 | ||
346 | #define _REGFORM_bp(decor) R_bp(decor) | |
347 | #define _REGFORM_bpl(decor) R_bp(decor) | |
348 | #define _REGFORM_ebp(decor) R_bp(decor) | |
349 | ||
350 | #define _REGFORM_sp(decor) R_sp(decor) | |
351 | #define _REGFORM_spl(decor) R_sp(decor) | |
352 | #define _REGFORM_esp(decor) R_sp(decor) | |
353 | ||
354 | #if CPUFAM_AMD64 | |
355 | ||
356 | # define _REGFORM_rip(decor) R_ip(decor) | |
357 | # define _REGFORM_rsp(decor) R_sp(decor) | |
358 | # define _REGFORM_rbp(decor) R_bp(decor) | |
359 | # define _REGFORM_rdi(decor) R_di(decor) | |
360 | # define _REGFORM_rsi(decor) R_si(decor) | |
361 | # define _REGFORM_rdx(decor) R_d(decor) | |
362 | # define _REGFORM_rcx(decor) R_c(decor) | |
363 | # define _REGFORM_rbx(decor) R_b(decor) | |
364 | # define _REGFORM_rax(decor) R_a(decor) | |
365 | ||
366 | # define _REGFORM_r8(decor) R_r8(decor) | |
367 | # define _REGFORM_r8b(decor) R_r8(decor) | |
368 | # define _REGFORM_r8w(decor) R_r8(decor) | |
369 | # define _REGFORM_r8d(decor) R_r8(decor) | |
370 | ||
371 | # define _REGFORM_r9(decor) R_r9(decor) | |
372 | # define _REGFORM_r9b(decor) R_r9(decor) | |
373 | # define _REGFORM_r9w(decor) R_r9(decor) | |
374 | # define _REGFORM_r9d(decor) R_r9(decor) | |
375 | ||
376 | # define _REGFORM_r10(decor) R_r10(decor) | |
377 | # define _REGFORM_r10b(decor) R_r10(decor) | |
378 | # define _REGFORM_r10w(decor) R_r10(decor) | |
379 | # define _REGFORM_r10d(decor) R_r10(decor) | |
380 | ||
381 | # define _REGFORM_r11(decor) R_r11(decor) | |
382 | # define _REGFORM_r11b(decor) R_r11(decor) | |
383 | # define _REGFORM_r11w(decor) R_r11(decor) | |
384 | # define _REGFORM_r11d(decor) R_r11(decor) | |
385 | ||
386 | # define _REGFORM_r12(decor) R_r12(decor) | |
387 | # define _REGFORM_r12b(decor) R_r12(decor) | |
388 | # define _REGFORM_r12w(decor) R_r12(decor) | |
389 | # define _REGFORM_r12d(decor) R_r12(decor) | |
390 | ||
391 | # define _REGFORM_r13(decor) R_r13(decor) | |
392 | # define _REGFORM_r13b(decor) R_r13(decor) | |
393 | # define _REGFORM_r13w(decor) R_r13(decor) | |
394 | # define _REGFORM_r13d(decor) R_r13(decor) | |
395 | ||
396 | # define _REGFORM_r14(decor) R_r14(decor) | |
397 | # define _REGFORM_r14b(decor) R_r14(decor) | |
398 | # define _REGFORM_r14w(decor) R_r14(decor) | |
399 | # define _REGFORM_r14d(decor) R_r14(decor) | |
400 | ||
401 | # define _REGFORM_r15(decor) R_r15(decor) | |
402 | # define _REGFORM_r15b(decor) R_r15(decor) | |
403 | # define _REGFORM_r15w(decor) R_r15(decor) | |
404 | # define _REGFORM_r15d(decor) R_r15(decor) | |
405 | ||
406 | #endif | |
407 | ||
408 | // Macros for converting register names: given any spelling of a register, return the spelling of its low byte, high byte, word, doubleword, quadword (AMD64 only) or whole-register form.  For example, `WORD(eax)' expands to `ax', and `WHOLE(si)' to `esi' on x86 or `rsi' on AMD64. | |
409 | #define BYTE(reg) _REGFORM(reg, b) | |
410 | #define HIBYTE(reg) _REGFORM(reg, h) | |
411 | #define WORD(reg) _REGFORM(reg, w) | |
412 | #define DWORD(reg) _REGFORM(reg, d) | |
413 | #if CPUFAM_AMD64 | |
414 | # define QWORD(reg) _REGFORM(reg, q) | |
415 | #endif | |
416 | #define WHOLE(reg) _REGFORM(reg, r) | |
417 | ||
1a0c09c4 MW |
418 | #endif |
419 | ||
17de5b2e MW |
420 | #if CPUFAM_X86 |
421 | ||
422 | .macro _reg.0 | |
423 | // Stash GP registers and establish temporary stack frame: push the flags, eax, ecx, edx and ebp; point ebp at the saved ebp; align esp down to a 16-byte boundary; and dump the FPU/SIMD state with `fxsave' into 512 fresh bytes of stack, leaving esp pointing at that save area.  This is the first stage of the debugging-print macros below; `_reg.3' undoes all of it.  (The intermediate stages `_reg.1' and `_reg.2' are empty here.) | |
424 | pushfd | |
425 | push eax | |
426 | push ecx | |
427 | push edx | |
428 | push ebp | |
429 | mov ebp, esp | |
430 | and esp, ~15 | |
431 | sub esp, 512 | |
432 | fxsave [esp] | |
433 | .endm | |
434 | ||
435 | .macro _reg.1 | |
436 | .endm | |
437 | ||
438 | .macro _reg.2 | |
439 | .endm | |
440 | ||
441 | .macro _reg.3 fmt | |
442 | // Print FMT and the other established arguments: push the address of an inline message string (`;; ', FMT, newline, NUL -- jumped over after the call) and call `printf' with whatever arguments the caller has already pushed.  Then recompute the address of the save area made by `_reg.0' from ebp, restore the FPU/SIMD state with `fxrstor', discard the temporary frame, and pop ebp, edx, ecx, eax and the flags. | |
443 | lea eax, .L$_reg$msg.\@ | |
444 | push eax | |
445 | call printf | |
446 | jmp .L$_reg$cont.\@ | |
447 | .L$_reg$msg.\@: | |
448 | .ascii ";; \fmt\n\0" | |
449 | .L$_reg$cont.\@: | |
450 | mov eax, ebp | |
451 | and eax, ~15 | |
452 | sub eax, 512 | |
453 | fxrstor [eax] | |
454 | mov esp, ebp | |
455 | pop ebp | |
456 | pop edx | |
457 | pop ecx | |
458 | pop eax | |
459 | popfd | |
460 | .endm | |
461 | ||
// Debugging aid: print the message MSG (prefixed by `;; ') using `printf'.
// Registers, flags and FPU/SIMD state are saved by `_reg.0' and restored by
// `_reg.3'.
462 | .macro msg msg | |
463 | _reg.0 | |
464 | _reg.1 | |
465 | _reg.2 | |
466 | _reg.3 "\msg" | |
467 | .endm | |
468 | ||
// Debugging aid: print MSG followed by the value of 32-bit register R in
// hex.  `esp' and `ebp' are special-cased because `_reg.0' has changed them:
// the caller's esp is reconstructed as ebp + 20 (five words were pushed), and
// the caller's ebp is fetched from its save slot at [ebp].
469 | .macro reg r, msg | |
470 | _reg.0 | |
471 | .ifeqs "\r", "esp" | |
472 | lea eax, [ebp + 20] | |
473 | push eax | |
474 | .else | |
475 | .ifeqs "\r", "ebp" | |
476 | push [ebp] | |
477 | .else | |
478 | push \r | |
479 | .endif | |
480 | .endif | |
481 | _reg.1 | |
482 | _reg.2 | |
483 | _reg.3 "\msg: \r = %08x" | |
484 | .endm | |
485 | ||
// Debugging aid: print MSG followed by the four 32-bit words of XMM register
// R in hex.  The words are reversed with `pshufd' (immediate 0x1b) before
// being stored on the stack as the printf arguments.
486 | .macro xmmreg r, msg | |
487 | _reg.0 | |
488 | _reg.1 | |
489 | _reg.2 | |
490 | movdqu xmm0, \r | |
491 | pshufd xmm0, xmm0, 0x1b | |
492 | sub esp, 16 | |
493 | movdqa [esp], xmm0 | |
494 | _reg.3 "\msg: \r = %08x %08x %08x %08x" | |
495 | .endm | |
496 | ||
// Debugging aid: print MSG followed by the two 32-bit words of MMX register
// R in hex.  The two halves are exchanged with `pshufw' (immediate 0x4e)
// before being stored on the stack as the printf arguments.
497 | .macro mmreg r, msg | |
498 | _reg.0 | |
499 | _reg.1 | |
500 | _reg.2 | |
501 | pshufw \r, \r, 0x4e | |
502 | sub esp, 8 | |
503 | movq [esp], \r | |
504 | _reg.3 "\msg: \r = %08x %08x" | |
505 | .endm | |
506 | ||
// Debugging aid: print MSG followed by the value of x87 register st(I),
// formatted with `%.20Lg'.  The value is reloaded from offset 32 + 16*I of
// the `fxsave' area written by `_reg.0' (esp still points at it here) and
// stored in 12 bytes of stack as the printf argument.
507 | .macro freg i, msg | |
508 | _reg.0 | |
509 | _reg.1 | |
510 | _reg.2 | |
511 | finit | |
512 | fldt [esp + 32 + 16*\i] | |
513 | sub esp, 12 | |
514 | fstpt [esp] | |
515 | _reg.3 "\msg: st(\i) = %.20Lg" | |
516 | .endm | |
517 | ||
// Debugging aid: as `freg', but the value is formatted with `%La'.
518 | .macro fxreg i, msg | |
519 | _reg.0 | |
520 | _reg.1 | |
521 | _reg.2 | |
522 | finit | |
523 | fldt [esp + 32 + 16*\i] | |
524 | sub esp, 12 | |
525 | fstpt [esp] | |
526 | _reg.3 "\msg: st(\i) = %La" | |
527 | .endm | |
528 | ||
529 | #endif | |
530 | ||
1a0c09c4 | 531 | ///-------------------------------------------------------------------------- |
61bd904b MW |
532 | /// ARM-specific hacking. |
533 | ||
59d86860 | 534 | #if CPUFAM_ARMEL |
61bd904b | 535 | |
9f6eb05d MW |
536 | // ARM/Thumb mode things. Use ARM by default.  `ARM' and `THUMB' select the instruction set and record in `.L$_pcoff' the offset (8 or 4 respectively) which the PC-relative macros below subtract when computing displacements. |
537 | #define ARM .arm; .L$_pcoff = 8 | |
538 | #define THUMB .thumb; .L$_pcoff = 4 | |
539 | ARM | |
540 | ||
61bd904b MW |
541 | // Set the function hooks.  Function entry points are aligned to a 4-byte boundary, and `.ltorg' is issued at the end of each function. |
542 | #define FUNC_PREHOOK(_) .balign 4 | |
543 | #define ENDFUNC_HOOK(name) .ltorg | |
544 | ||
545 | // Call external subroutine at ADDR, possibly via PLT: under WANT_PIC the branch goes to `ADDR(PLT)', otherwise directly to ADDR.  COND is an optional condition-code suffix for the `bl' instruction. | |
8a1aa284 | 546 | .macro callext addr, cond= |
61bd904b MW |
547 | #if WANT_PIC |
548 | bl\cond \addr(PLT) | |
549 | #else | |
550 | bl\cond \addr | |
551 | #endif | |
8a1aa284 | 552 | .endm |
61bd904b MW |
553 | |
554 | // Do I need to arrange a spare GOT register?  NEED_GOT is set for all position-independent ARM code; GOTREG names the default register used to hold the GOT address. | |
555 | #if WANT_PIC | |
556 | # define NEED_GOT 1 | |
557 | #endif | |
558 | #define GOTREG r9 | |
559 | ||
560 | // Maybe load GOT address into GOT.  Under WANT_PIC this loads a literal holding the offset of `_GLOBAL_OFFSET_TABLE_' from the following `add' instruction (less `.L$_pcoff') and adds pc to it; the literal is placed in the literal subsection with `_LIT' ... `_ENDLIT'.  COND is an optional condition-code suffix.  Without WANT_PIC the macro expands to nothing. | |
8a1aa284 | 561 | .macro ldgot cond=, got=GOTREG |
61bd904b | 562 | #if WANT_PIC |
adca2a18 MW |
563 | ldr\cond \got, .L$_ldgot$\@ |
564 | .L$_ldgot_pc$\@: | |
2d03a881 | 565 | add\cond \got, pc, \got |
8a1aa284 | 566 | _LIT |
adca2a18 MW |
567 | .balign 4 |
568 | .L$_ldgot$\@: | |
9f6eb05d | 569 | .word _GLOBAL_OFFSET_TABLE_ - .L$_ldgot_pc$\@ - .L$_pcoff |
8a1aa284 | 570 | _ENDLIT |
61bd904b | 571 | #endif |
8a1aa284 | 572 | .endm |
61bd904b MW |
573 | |
574 | // Load address of external symbol ADDR into REG, maybe using GOT.  Under WANT_PIC this loads the literal `ADDR(GOT)' into REG and then loads REG from `[GOT, REG]'; GOT must already hold the GOT address (see `ldgot').  Otherwise it is simply `ldr REG, =ADDR'.  COND is an optional condition-code suffix. | |
8a1aa284 | 575 | .macro leaext reg, addr, cond=, got=GOTREG |
61bd904b | 576 | #if WANT_PIC |
adca2a18 | 577 | ldr\cond \reg, .L$_leaext$\@ |
2d03a881 | 578 | ldr\cond \reg, [\got, \reg] |
8a1aa284 | 579 | _LIT |
adca2a18 MW |
580 | .balign 4 |
581 | .L$_leaext$\@: | |
582 | .word \addr(GOT) | |
8a1aa284 | 583 | _ENDLIT |
61bd904b | 584 | #else |
2d03a881 | 585 | ldr\cond \reg, =\addr |
61bd904b | 586 | #endif |
8a1aa284 | 587 | .endm |
61bd904b | 588 | |
0c53ac58 | 589 | // Load address of external symbol ADDR into REG directly. |
8a1aa284 | 590 | .macro leaextq reg, addr, cond= |
0c53ac58 MW |
591 | #if WANT_PIC |
592 | ldr\cond \reg, .L$_leaextq$\@ | |
593 | .L$_leaextq_pc$\@: | |
9f6eb05d | 594 | .if .L$_pcoff == 8 |
0c53ac58 | 595 | ldr\cond \reg, [pc, \reg] |
9f6eb05d MW |
596 | .else |
597 | add\cond \reg, pc | |
598 | ldr\cond \reg, [\reg] | |
599 | .endif | |
8a1aa284 | 600 | _LIT |
0c53ac58 MW |
601 | .balign 4 |
602 | .L$_leaextq$\@: | |
9f6eb05d | 603 | .word \addr(GOT_PREL) + (. - .L$_leaextq_pc$\@ - .L$_pcoff) |
8a1aa284 | 604 | _ENDLIT |
0c53ac58 MW |
605 | #else |
606 | ldr\cond \reg, =\addr | |
607 | #endif | |
8a1aa284 | 608 | .endm |
0c53ac58 | 609 | |
43ea7558 MW |
610 | // Apply decoration decor to register name reg, by dispatching to the `_REGFORM_reg' macro for that name. |
611 | #define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor) | |
612 | ||
613 | // Internal macros: `_REGFORM_r(decor)' applies decoration decor to register | |
614 | // name r.  Each one splits the name into its type letter (`s', `d' or `q') and its number, and hands both to `_DECOR'.  Names covered are `s0'--`s31', `d0'--`d31' and `q0'--`q15'. | |
615 | ||
616 | #define _REGFORM_s0(decor) _DECOR(s, decor, 0) | |
617 | #define _REGFORM_s1(decor) _DECOR(s, decor, 1) | |
618 | #define _REGFORM_s2(decor) _DECOR(s, decor, 2) | |
619 | #define _REGFORM_s3(decor) _DECOR(s, decor, 3) | |
620 | #define _REGFORM_s4(decor) _DECOR(s, decor, 4) | |
621 | #define _REGFORM_s5(decor) _DECOR(s, decor, 5) | |
622 | #define _REGFORM_s6(decor) _DECOR(s, decor, 6) | |
623 | #define _REGFORM_s7(decor) _DECOR(s, decor, 7) | |
624 | #define _REGFORM_s8(decor) _DECOR(s, decor, 8) | |
625 | #define _REGFORM_s9(decor) _DECOR(s, decor, 9) | |
626 | #define _REGFORM_s10(decor) _DECOR(s, decor, 10) | |
627 | #define _REGFORM_s11(decor) _DECOR(s, decor, 11) | |
628 | #define _REGFORM_s12(decor) _DECOR(s, decor, 12) | |
629 | #define _REGFORM_s13(decor) _DECOR(s, decor, 13) | |
630 | #define _REGFORM_s14(decor) _DECOR(s, decor, 14) | |
631 | #define _REGFORM_s15(decor) _DECOR(s, decor, 15) | |
632 | #define _REGFORM_s16(decor) _DECOR(s, decor, 16) | |
633 | #define _REGFORM_s17(decor) _DECOR(s, decor, 17) | |
634 | #define _REGFORM_s18(decor) _DECOR(s, decor, 18) | |
635 | #define _REGFORM_s19(decor) _DECOR(s, decor, 19) | |
636 | #define _REGFORM_s20(decor) _DECOR(s, decor, 20) | |
637 | #define _REGFORM_s21(decor) _DECOR(s, decor, 21) | |
638 | #define _REGFORM_s22(decor) _DECOR(s, decor, 22) | |
639 | #define _REGFORM_s23(decor) _DECOR(s, decor, 23) | |
640 | #define _REGFORM_s24(decor) _DECOR(s, decor, 24) | |
641 | #define _REGFORM_s25(decor) _DECOR(s, decor, 25) | |
642 | #define _REGFORM_s26(decor) _DECOR(s, decor, 26) | |
643 | #define _REGFORM_s27(decor) _DECOR(s, decor, 27) | |
644 | #define _REGFORM_s28(decor) _DECOR(s, decor, 28) | |
645 | #define _REGFORM_s29(decor) _DECOR(s, decor, 29) | |
646 | #define _REGFORM_s30(decor) _DECOR(s, decor, 30) | |
647 | #define _REGFORM_s31(decor) _DECOR(s, decor, 31) | |
648 | ||
649 | #define _REGFORM_d0(decor) _DECOR(d, decor, 0) | |
650 | #define _REGFORM_d1(decor) _DECOR(d, decor, 1) | |
651 | #define _REGFORM_d2(decor) _DECOR(d, decor, 2) | |
652 | #define _REGFORM_d3(decor) _DECOR(d, decor, 3) | |
653 | #define _REGFORM_d4(decor) _DECOR(d, decor, 4) | |
654 | #define _REGFORM_d5(decor) _DECOR(d, decor, 5) | |
655 | #define _REGFORM_d6(decor) _DECOR(d, decor, 6) | |
656 | #define _REGFORM_d7(decor) _DECOR(d, decor, 7) | |
657 | #define _REGFORM_d8(decor) _DECOR(d, decor, 8) | |
658 | #define _REGFORM_d9(decor) _DECOR(d, decor, 9) | |
659 | #define _REGFORM_d10(decor) _DECOR(d, decor, 10) | |
660 | #define _REGFORM_d11(decor) _DECOR(d, decor, 11) | |
661 | #define _REGFORM_d12(decor) _DECOR(d, decor, 12) | |
662 | #define _REGFORM_d13(decor) _DECOR(d, decor, 13) | |
663 | #define _REGFORM_d14(decor) _DECOR(d, decor, 14) | |
664 | #define _REGFORM_d15(decor) _DECOR(d, decor, 15) | |
665 | #define _REGFORM_d16(decor) _DECOR(d, decor, 16) | |
666 | #define _REGFORM_d17(decor) _DECOR(d, decor, 17) | |
667 | #define _REGFORM_d18(decor) _DECOR(d, decor, 18) | |
668 | #define _REGFORM_d19(decor) _DECOR(d, decor, 19) | |
669 | #define _REGFORM_d20(decor) _DECOR(d, decor, 20) | |
670 | #define _REGFORM_d21(decor) _DECOR(d, decor, 21) | |
671 | #define _REGFORM_d22(decor) _DECOR(d, decor, 22) | |
672 | #define _REGFORM_d23(decor) _DECOR(d, decor, 23) | |
673 | #define _REGFORM_d24(decor) _DECOR(d, decor, 24) | |
674 | #define _REGFORM_d25(decor) _DECOR(d, decor, 25) | |
675 | #define _REGFORM_d26(decor) _DECOR(d, decor, 26) | |
676 | #define _REGFORM_d27(decor) _DECOR(d, decor, 27) | |
677 | #define _REGFORM_d28(decor) _DECOR(d, decor, 28) | |
678 | #define _REGFORM_d29(decor) _DECOR(d, decor, 29) | |
679 | #define _REGFORM_d30(decor) _DECOR(d, decor, 30) | |
680 | #define _REGFORM_d31(decor) _DECOR(d, decor, 31) | |
681 | ||
682 | #define _REGFORM_q0(decor) _DECOR(q, decor, 0) | |
683 | #define _REGFORM_q1(decor) _DECOR(q, decor, 1) | |
684 | #define _REGFORM_q2(decor) _DECOR(q, decor, 2) | |
685 | #define _REGFORM_q3(decor) _DECOR(q, decor, 3) | |
686 | #define _REGFORM_q4(decor) _DECOR(q, decor, 4) | |
687 | #define _REGFORM_q5(decor) _DECOR(q, decor, 5) | |
688 | #define _REGFORM_q6(decor) _DECOR(q, decor, 6) | |
689 | #define _REGFORM_q7(decor) _DECOR(q, decor, 7) | |
690 | #define _REGFORM_q8(decor) _DECOR(q, decor, 8) | |
691 | #define _REGFORM_q9(decor) _DECOR(q, decor, 9) | |
692 | #define _REGFORM_q10(decor) _DECOR(q, decor, 10) | |
693 | #define _REGFORM_q11(decor) _DECOR(q, decor, 11) | |
694 | #define _REGFORM_q12(decor) _DECOR(q, decor, 12) | |
695 | #define _REGFORM_q13(decor) _DECOR(q, decor, 13) | |
696 | #define _REGFORM_q14(decor) _DECOR(q, decor, 14) | |
697 | #define _REGFORM_q15(decor) _DECOR(q, decor, 15) | |
698 | ||
699 | // `_LOPART(n)' and `_HIPART(n)' return the numbers of the register halves of | |
700 | // register n, i.e., 2*n and 2*n + 1 respectively.  The argument is macro-expanded first, so the two can be nested; the lookup tables cover n from 0 to 15. | |
701 | #define _LOPART(n) _GLUE(_LOPART_, n) | |
702 | #define _HIPART(n) _GLUE(_HIPART_, n) | |
703 | ||
704 | // Internal macros: `_LOPART_n' and `_HIPART_n' return the numbers of the | |
705 | // register halves of register n, i.e., 2*n and 2*n + 1 respectively. | |
706 | ||
707 | #define _LOPART_0 0 | |
708 | #define _HIPART_0 1 | |
709 | #define _LOPART_1 2 | |
710 | #define _HIPART_1 3 | |
711 | #define _LOPART_2 4 | |
712 | #define _HIPART_2 5 | |
713 | #define _LOPART_3 6 | |
714 | #define _HIPART_3 7 | |
715 | #define _LOPART_4 8 | |
716 | #define _HIPART_4 9 | |
717 | #define _LOPART_5 10 | |
718 | #define _HIPART_5 11 | |
719 | #define _LOPART_6 12 | |
720 | #define _HIPART_6 13 | |
721 | #define _LOPART_7 14 | |
722 | #define _HIPART_7 15 | |
723 | #define _LOPART_8 16 | |
724 | #define _HIPART_8 17 | |
725 | #define _LOPART_9 18 | |
726 | #define _HIPART_9 19 | |
727 | #define _LOPART_10 20 | |
728 | #define _HIPART_10 21 | |
729 | #define _LOPART_11 22 | |
730 | #define _HIPART_11 23 | |
731 | #define _LOPART_12 24 | |
732 | #define _HIPART_12 25 | |
733 | #define _LOPART_13 26 | |
734 | #define _HIPART_13 27 | |
735 | #define _LOPART_14 28 | |
736 | #define _HIPART_14 29 | |
737 | #define _LOPART_15 30 | |
738 | #define _HIPART_15 31 | |
739 | ||
740 | // Return the register number of the pair containing register n, i.e., | |
741 | // floor(n/2).  The argument is macro-expanded first, so `_PAIR(_PAIR(n))' gives floor(n/4); the lookup table covers n from 0 to 31. | |
742 | #define _PAIR(n) _GLUE(_PAIR_, n) | |
743 | ||
744 | // Internal macros: `_PAIR_n' returns the register number of the pair | |
745 | // containing register n, i.e., floor(n/2). | |
746 | #define _PAIR_0 0 | |
747 | #define _PAIR_1 0 | |
748 | #define _PAIR_2 1 | |
749 | #define _PAIR_3 1 | |
750 | #define _PAIR_4 2 | |
751 | #define _PAIR_5 2 | |
752 | #define _PAIR_6 3 | |
753 | #define _PAIR_7 3 | |
754 | #define _PAIR_8 4 | |
755 | #define _PAIR_9 4 | |
756 | #define _PAIR_10 5 | |
757 | #define _PAIR_11 5 | |
758 | #define _PAIR_12 6 | |
759 | #define _PAIR_13 6 | |
760 | #define _PAIR_14 7 | |
761 | #define _PAIR_15 7 | |
762 | #define _PAIR_16 8 | |
763 | #define _PAIR_17 8 | |
764 | #define _PAIR_18 9 | |
765 | #define _PAIR_19 9 | |
766 | #define _PAIR_20 10 | |
767 | #define _PAIR_21 10 | |
768 | #define _PAIR_22 11 | |
769 | #define _PAIR_23 11 | |
770 | #define _PAIR_24 12 | |
771 | #define _PAIR_25 12 | |
772 | #define _PAIR_26 13 | |
773 | #define _PAIR_27 13 | |
774 | #define _PAIR_28 14 | |
775 | #define _PAIR_29 14 | |
776 | #define _PAIR_30 15 | |
777 | #define _PAIR_31 15 | |
778 | ||
779 | // Apply decoration decor to register number n of type ty. Decorations are | |
780 | // as follows. | |
781 | // | |
782 | // decor types meaning | |
783 | // Q s, d the NEON qN register containing this one | |
784 | // D s the NEON dN register containing this one | |
785 | // D0 q the low 64-bit half of this one | |
786 | // D1 q the high 64-bit half of this one | |
787 | // S0 d, q the first 32-bit piece of this one | |
788 | // S1 d, q the second 32-bit piece of this one | |
789 | // S2 q the third 32-bit piece of this one | |
790 | // S3 q the fourth 32-bit piece of this one | |
791 | // Bn q the nth byte of this register, as a scalar | |
792 | // Hn q the nth halfword of this register, as a scalar | |
793 | // Wn q the nth word of this register, as a scalar | |
794 | #define _DECOR(ty, decor, n) _DECOR_##ty##_##decor(n) | |
795 | ||
796 | // Internal macros: `_DECOR_ty_decor(n)' applies decoration decor to register | |
797 | // number n of type ty.  Register numbers are mapped with `_LOPART'/`_HIPART' (2*n and 2*n + 1) and `_PAIR' (floor(n/2)): dN is made of s(2N) and s(2N + 1), and qN of d(2N) and d(2N + 1).  The scalar forms `Bn', `Hn' and `Wn' select the d register holding the element and index within it.  The `S0' and `S1' decorations of a d register only resolve for n <= 15, because the `_LOPART_n'/`_HIPART_n' tables stop there. | |
798 | ||
799 | #define _DECOR_s_Q(n) GLUE(q, _PAIR(_PAIR(n))) | |
800 | #define _DECOR_s_D(n) GLUE(d, _PAIR(n)) | |
801 | ||
802 | #define _DECOR_d_Q(n) GLUE(q, _PAIR(n)) | |
803 | #define _DECOR_d_S0(n) GLUE(s, _LOPART(n)) | |
804 | #define _DECOR_d_S1(n) GLUE(s, _HIPART(n)) | |
805 | ||
806 | #define _DECOR_q_D0(n) GLUE(d, _LOPART(n)) | |
807 | #define _DECOR_q_D1(n) GLUE(d, _HIPART(n)) | |
808 | #define _DECOR_q_S0(n) GLUE(s, _LOPART(_LOPART(n))) | |
809 | #define _DECOR_q_S1(n) GLUE(s, _HIPART(_LOPART(n))) | |
810 | #define _DECOR_q_S2(n) GLUE(s, _LOPART(_HIPART(n))) | |
811 | #define _DECOR_q_S3(n) GLUE(s, _HIPART(_HIPART(n))) | |
812 | #define _DECOR_q_W0(n) GLUE(d, _LOPART(n))[0] | |
813 | #define _DECOR_q_W1(n) GLUE(d, _LOPART(n))[1] | |
814 | #define _DECOR_q_W2(n) GLUE(d, _HIPART(n))[0] | |
815 | #define _DECOR_q_W3(n) GLUE(d, _HIPART(n))[1] | |
816 | #define _DECOR_q_H0(n) GLUE(d, _LOPART(n))[0] | |
817 | #define _DECOR_q_H1(n) GLUE(d, _LOPART(n))[1] | |
818 | #define _DECOR_q_H2(n) GLUE(d, _LOPART(n))[2] | |
819 | #define _DECOR_q_H3(n) GLUE(d, _LOPART(n))[3] | |
820 | #define _DECOR_q_H4(n) GLUE(d, _HIPART(n))[0] | |
821 | #define _DECOR_q_H5(n) GLUE(d, _HIPART(n))[1] | |
822 | #define _DECOR_q_H6(n) GLUE(d, _HIPART(n))[2] | |
823 | #define _DECOR_q_H7(n) GLUE(d, _HIPART(n))[3] | |
824 | #define _DECOR_q_B0(n) GLUE(d, _LOPART(n))[0] | |
825 | #define _DECOR_q_B1(n) GLUE(d, _LOPART(n))[1] | |
826 | #define _DECOR_q_B2(n) GLUE(d, _LOPART(n))[2] | |
827 | #define _DECOR_q_B3(n) GLUE(d, _LOPART(n))[3] | |
828 | #define _DECOR_q_B4(n) GLUE(d, _LOPART(n))[4] | |
829 | #define _DECOR_q_B5(n) GLUE(d, _LOPART(n))[5] | |
830 | #define _DECOR_q_B6(n) GLUE(d, _LOPART(n))[6] | |
831 | #define _DECOR_q_B7(n) GLUE(d, _LOPART(n))[7] | |
832 | #define _DECOR_q_B8(n) GLUE(d, _HIPART(n))[0] | |
833 | #define _DECOR_q_B9(n) GLUE(d, _HIPART(n))[1] | |
834 | #define _DECOR_q_B10(n) GLUE(d, _HIPART(n))[2] | |
835 | #define _DECOR_q_B11(n) GLUE(d, _HIPART(n))[3] | |
836 | #define _DECOR_q_B12(n) GLUE(d, _HIPART(n))[4] | |
837 | #define _DECOR_q_B13(n) GLUE(d, _HIPART(n))[5] | |
838 | #define _DECOR_q_B14(n) GLUE(d, _HIPART(n))[6] | |
839 | #define _DECOR_q_B15(n) GLUE(d, _HIPART(n))[7] | |
840 | ||
841 | // Macros for navigating the NEON register hierarchy.  Given a register name, `S0'--`S3' return its 32-bit pieces, `D' the d register containing an s register, `D0'/`D1' the halves of a q register, and `Q' the q register containing an s or d register.  For example, `D0(q1)' expands to `d2' and `Q(d5)' to `q2'. | |
842 | #define S0(reg) _REGFORM(reg, S0) | |
843 | #define S1(reg) _REGFORM(reg, S1) | |
844 | #define S2(reg) _REGFORM(reg, S2) | |
845 | #define S3(reg) _REGFORM(reg, S3) | |
846 | #define D(reg) _REGFORM(reg, D) | |
847 | #define D0(reg) _REGFORM(reg, D0) | |
848 | #define D1(reg) _REGFORM(reg, D1) | |
849 | #define Q(reg) _REGFORM(reg, Q) | |
850 | ||
851 | // Macros for indexing quadword registers: `QB(reg, i)', `QH(reg, i)' and `QW(reg, i)' return the ith byte, halfword or word of q register reg as a d-register scalar; e.g., `QW(q1, 2)' expands to `d3[0]'. | |
852 | #define QB(reg, i) _REGFORM(reg, B##i) | |
853 | #define QH(reg, i) _REGFORM(reg, H##i) | |
854 | #define QW(reg, i) _REGFORM(reg, W##i) | |
855 | ||
856 | // Macros for converting vldm/vstm ranges: `QQ(qlo, qhi)' expands to the d-register range running from the low half of qlo to the high half of qhi. | |
857 | #define QQ(qlo, qhi) D0(qlo)-D1(qhi) | |
858 | ||
61bd904b MW |
859 | #endif |
860 | ||
861 | ///-------------------------------------------------------------------------- | |
1a0c09c4 MW |
862 | /// Final stuff. |
863 | ||
864 | // Default values for the various hooks.  Anything the platform-specific sections above did not define becomes a no-op here: the three function hooks, `TYPE_FUNC' and `SIZE_OBJ' expand to nothing, and `F(name)' expands to the name unchanged. | |
865 | #ifndef FUNC_PREHOOK | |
1e5664a6 | 866 | # define FUNC_PREHOOK(_) |
1a0c09c4 MW |
867 | #endif |
868 | #ifndef FUNC_POSTHOOK | |
1e5664a6 | 869 | # define FUNC_POSTHOOK(_) |
1a0c09c4 MW |
870 | #endif |
871 | #ifndef ENDFUNC_HOOK | |
1e5664a6 | 872 | # define ENDFUNC_HOOK(_) |
1a0c09c4 MW |
873 | #endif |
874 | ||
875 | #ifndef F | |
876 | # define F(name) name | |
877 | #endif | |
878 | ||
879 | #ifndef TYPE_FUNC | |
880 | # define TYPE_FUNC(name) | |
881 | #endif | |
882 | ||
883 | #ifndef SIZE_OBJ | |
884 | # define SIZE_OBJ(name) | |
885 | #endif | |
886 | ||
1aa5bfa8 MW |
// Unless the including file explicitly asks for an executable stack (by
// defining WANT_EXECUTABLE_STACK), emit an empty `.note.GNU-stack' section
// with no flags, which tells the linker that this object does not need its
// stack to be executable.
887 | #if __ELF__ && !defined(WANT_EXECUTABLE_STACK) |
888 | .pushsection .note.GNU-stack, "", _SECTTY(progbits) | |
889 | .popsection | |
890 | #endif | |
891 | ||
1a0c09c4 | 892 | ///----- That's all, folks -------------------------------------------------- |