From: Mark Wooding
Date: Thu, 29 Dec 2016 15:21:08 +0000 (+0000)
Subject: base/asm-common.h, */*.S: New macros for making stack-unwinding tables.
X-Git-Tag: 2.3.0~17
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/0923a413958b0e778a3f059c76355ab58e5be414

base/asm-common.h, */*.S: New macros for making stack-unwinding tables.

Previously, I only supported Microsoft SEH tables, because they're
basically essential to having a working 64-bit binary (because
Microsoft are crazy and throw asynchronous exceptions).  But there are
three variants of stack-unwinding tables which are useful to make:

  * Microsoft's SEH tables for AMD64, constructed using `.seh_...'
    directives;

  * ARM's `.ARM.exidx' and `.ARM.extab' tables; and

  * Dwarf `.eh_frame' and `.debug_frame' tables.

These are all quite similar in flavour, but different in detail.
Rather than write lots of hairy conditional stuff around subroutine
prologues and epilogues, wrap the whole lot up in some target-specific
macros.
---

diff --git a/base/asm-common.h b/base/asm-common.h
index fdd7fad1..22bb44d6 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -66,6 +66,7 @@
 #define INTFUNC(name)                                          \
        TYPE_FUNC(name);                                        \
        .macro ENDFUNC; _ENDFUNC(name); .endm;                  \
+       .L$_prologue_p = 0; .L$_frameptr_p = 0;                 \
        FUNC_PREHOOK(name);                                     \
 name:                                                          \
        FUNC_POSTHOOK(name)
@@ -77,6 +78,8 @@ INTFUNC(F(name))

 // Marking the end of a function.
 #define _ENDFUNC(name)                                         \
+       .if ~ .L$_prologue_p; .error "Missing `endprologue'"; .endif; \
+       .if .L$_frameptr_p; .purgem dropfp; .endif;             \
        .purgem ENDFUNC;                                        \
        SIZE_OBJ(name);                                         \
        ENDFUNC_HOOK(name);                                     \
@@ -147,6 +150,11 @@ name:
 //     `.seh_pushreg' and friends, and `.seh_endprologue'.
 #endif

+#if __ELF__
+# define FUNC_POSTHOOK(_) .cfi_startproc
+# define ENDFUNC_HOOK(_) .cfi_endproc
+#endif
+
 // Don't use the wretched AT&T syntax.  It's festooned with pointless
 // punctuation, and all of the data movement is backwards.  Ugh!
        .intel_syntax noprefix
@@ -427,6 +435,101 @@ name:
 #endif
 #define WHOLE(reg) _REGFORM(reg, r)

+// Stack management and unwinding.
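+//
+// These macros are meant to be used in matched pairs, with
+// `endprologue' marking the point past which no more unwinding
+// information is needed.  As a sketch, a hypothetical function (for
+// illustration only; it's not part of this commit) might look like
+// this:
+//
+//     FUNC(example)
+//             pushreg ebp             // save caller's frame pointer
+//             setfp   ebp             // establish our own frame
+//             stalloc 64              // reserve space for locals
+//             endprologue             // prologue is now complete
+//             ...
+//             stfree  64              // release the locals
+//             dropfp                  // take the frame down again
+//             popreg  ebp             // restore caller's frame pointer
+//             ret
+//     ENDFUNC
+//
+// `_ENDFUNC' raises an error if `endprologue' is missing; and `setfp'
+// defines a matching `dropfp', which `_ENDFUNC' purges again at the
+// end of the function.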
+.macro setfp fp, offset = 0
+  .if \offset == 0
+       mov     \fp, R_sp(r)
+#if __ELF__
+       .cfi_def_cfa_register \fp
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_setframe \fp, 0
+#endif
+  .else
+       lea     \fp, [R_sp(r) + \offset]
+#if __ELF__
+       .cfi_def_cfa_register \fp
+       .cfi_adjust_cfa_offset -\offset
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_setframe \fp, \offset
+#endif
+  .endif
+       .L$_frameptr_p = -1
+       .macro dropfp; _dropfp \fp, \offset; .endm
+.endm
+
+.macro _dropfp fp, offset = 0
+  .if \offset == 0
+       mov     R_sp(r), \fp
+#if __ELF__
+       .cfi_def_cfa_register R_sp(r)
+#endif
+  .else
+       lea     R_sp(r), [\fp - \offset]
+#if __ELF__
+       .cfi_def_cfa_register R_sp(r)
+       .cfi_adjust_cfa_offset +\offset
+#endif
+  .endif
+       .L$_frameptr_p = 0
+       .purgem dropfp
+.endm
+
+.macro stalloc n
+       sub     R_sp(r), \n
+#if __ELF__
+       .cfi_adjust_cfa_offset +\n
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_stackalloc \n
+#endif
+.endm
+
+.macro stfree n
+       add     R_sp(r), \n
+#if __ELF__
+       .cfi_adjust_cfa_offset -\n
+#endif
+.endm
+
+.macro pushreg r
+       push    \r
+#if __ELF__
+       .cfi_adjust_cfa_offset +WORDSZ
+       .cfi_rel_offset \r, 0
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_pushreg \r
+#endif
+.endm
+
+.macro popreg r
+       pop     \r
+#if __ELF__
+       .cfi_adjust_cfa_offset -WORDSZ
+       .cfi_restore \r
+#endif
+.endm
+
+.macro savexmm r, offset
+       movdqa  [R_sp(r) + \offset], \r
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_savexmm \r, \offset
+#endif
+.endm
+
+.macro rstrxmm r, offset
+       movdqa  \r, [R_sp(r) + \offset]
+.endm
+
+.macro endprologue
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_endprologue
+#endif
+       .L$_prologue_p = -1
+.endm
+
 #endif

 #if CPUFAM_X86
@@ -551,8 +654,8 @@ name:
        ARM

 // Set the function hooks.
-#define FUNC_PREHOOK(_) .balign 4
-#define ENDFUNC_HOOK(name) .ltorg
+#define FUNC_PREHOOK(_) .balign 4; .fnstart
+#define ENDFUNC_HOOK(_) .fnend; .ltorg

 // Call external subroutine at ADDR, possibly via PLT.
        .macro callext addr, cond=
@@ -868,6 +971,63 @@ name:
 // Macros for converting vldm/vstm ranges.
 #define QQ(qlo, qhi) D0(qlo)-D1(qhi)

+// Stack management and unwinding.
+.macro setfp fp, offset = 0
+  .if \offset == 0
+       mov     \fp, sp
+       .setfp  \fp, sp
+  .else
+       add     \fp, sp, #\offset
+       .setfp  \fp, sp, #\offset
+  .endif
+       .macro dropfp; _dropfp \fp, \offset; .endm
+       .L$_frameptr_p = -1
+.endm
+
+.macro _dropfp fp, offset = 0
+  .if \offset == 0
+       mov     sp, \fp
+  .else
+       sub     sp, \fp, #\offset
+  .endif
+       .purgem dropfp
+       .L$_frameptr_p = 0
+.endm
+
+.macro stalloc n
+       sub     sp, sp, #\n
+       .pad    #\n
+.endm
+
+.macro stfree n
+       add     sp, sp, #\n
+       .pad    #-\n
+.endm
+
+.macro pushreg rr:vararg
+       stmfd   sp!, {\rr}
+       .save   {\rr}
+.endm
+
+.macro popreg rr:vararg
+       ldmfd   sp!, {\rr}
+.endm
+
+.macro pushvfp rr:vararg
+       vstmdb  sp!, {\rr}
+       .vsave  {\rr}
+.endm
+
+.macro popvfp rr:vararg
+       vldmia  sp!, {\rr}
+.endm
+
+.macro endprologue
+.endm
+
+// No need for prologue markers on ARM.
+#define FUNC_POSTHOOK(_) .L$_prologue_p = -1
+
 #endif

 ///--------------------------------------------------------------------------

diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 8f69a559..a6613ed0 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -315,6 +315,8 @@ INTFUNC(carryprop)
        // form.  Store the low 128 bits of the represented carry to [EDI] as
        // a packed 128-bit value, and leave the remaining 16 bits in the low
        // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
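+       // (The prologue here is empty: carryprop saves no registers and
+       // allocates no stack, so `endprologue' comes first thing.)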
+       endprologue
+
        propout [edi + 0], xmm4, xmm5
        propout [edi + 4], xmm5, xmm6
        propout [edi + 8], xmm6, nil
@@ -333,6 +335,8 @@ INTFUNC(dmul4)
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        mulacc  [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5
@@ -365,6 +369,8 @@ INTFUNC(dmla4)
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        carryadd

        mulacc  [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
@@ -395,6 +401,8 @@ INTFUNC(mul4zc)
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
        propout [edi + 0], xmm4, xmm5
@@ -421,6 +429,8 @@ INTFUNC(mul4)
        // and update the carry registers with the carry out.  The registers
        // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
        propout [edi + 0], xmm4, xmm5
@@ -446,6 +456,8 @@ INTFUNC(mla4zc)
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
@@ -478,6 +490,8 @@ INTFUNC(mla4)
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        carryadd

        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
@@ -508,7 +522,8 @@ INTFUNC(mmul4)
        // of the sum U V + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
-       sub     esp, 64                 // space for the carries
+       stalloc 64                      // space for the carries
+       endprologue

        // Calculate W = U V, and leave it in the destination.  Stash the
        // carry pieces for later.
@@ -532,7 +547,9 @@ INTFUNC(mmla4)
        // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
        // XMM3, and XMM7 are clobbered; the general-purpose registers are
        // preserved.
-       sub     esp, 64                 // space for the carries
+       stalloc 64                      // space for the carries
+       endprologue
+
        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
@@ -599,7 +616,7 @@ INTFUNC(mmla4)
        paddq   xmm6, [esp + 32]

        // And, with that, we're done.
-       add     esp, 64
+       stfree  64
        ret

 ENDFUNC

@@ -614,6 +631,7 @@ INTFUNC(mont4)
        // of the sum W + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
+       endprologue

        // Calculate Y = W M.
        mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
@@ -680,13 +698,14 @@ FUNC(mpx_umul4_x86_sse2)
        //
        //      esp +  0        expanded Y (32 bytes)
        //      esp + 32        (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 32
+       endprologue

        // Prepare for the first iteration.
        mov     esi, [ebp + 32]         // -> bv[0]
@@ -753,7 +772,7 @@ FUNC(mpx_umul4_x86_sse2)
        jb      1b

        // All over.
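+       // (With a zero offset, `dropfp' below is just `mov esp, ebp',
+       // undoing the earlier `and esp, ~15' and `sub esp, 32' in one go.)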
-9:     mov     esp, ebp
+9:     dropfp
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

 ENDFUNC

@@ -787,13 +806,14 @@ FUNC(mpxmont_mul4_x86_sse2)
        //      esp + 108       bv limit
        //      esp + 112       (gap)
        //      esp + 124       (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 124
+       endprologue

        // Establish the expanded operands.
        pxor    xmm7, xmm7
@@ -894,11 +914,11 @@ FUNC(mpxmont_mul4_x86_sse2)
        movd    [edi + 16], xmm4

        // All done.
-9:     mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+9:     dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
        ret

 ENDFUNC

@@ -924,13 +944,14 @@ FUNC(mpxmont_redc4_x86_sse2)
        //      esp + 12        expanded M (32 bytes)
        //      esp + 44        expanded Y (32 bytes)
        //      esp + 76        (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 76
+       endprologue

        // Establish the expanded operands and the blocks-of-4 dv limit.
        mov     edi, [ebp + 20]         // -> Z = dv[0]
@@ -1019,11 +1040,11 @@ FUNC(mpxmont_redc4_x86_sse2)
        jmp     5b

        // All over.
-9:     mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+9:     dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
        ret

 ENDFUNC

@@ -1052,13 +1073,14 @@ ENDFUNC
 .endm

 .macro testprologue
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 3*32 + 12
+       endprologue
        // vars:
        //      esp +  0 = cycles
        //      esp + 12 = v expanded
@@ -1067,11 +1089,11 @@ ENDFUNC
 .endm

 .macro testepilogue
-       mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+       dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
        ret
 .endm

@@ -1133,8 +1155,7 @@ ENDFUNC
        movdqu  [ecx + 32], xmm6
 .endm

-       .globl  test_dmul4
-test_dmul4:
+FUNC(test_dmul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
@@ -1144,9 +1165,9 @@ test_dmul4:
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_dmla4
-test_dmla4:
+FUNC(test_dmla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
@@ -1156,9 +1177,9 @@ test_dmla4:
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mul4
-test_mul4:
+FUNC(test_mul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
@@ -1168,9 +1189,9 @@ test_mul4:
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mla4
-test_mla4:
+FUNC(test_mla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
@@ -1180,9 +1201,9 @@ test_mla4:
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mmul4
-test_mmul4:
+FUNC(test_mmul4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
@@ -1196,9 +1217,9 @@ test_mmul4:
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mmla4
-test_mmla4:
+FUNC(test_mmla4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
@@ -1212,9 +1233,9 @@ test_mmla4:
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mont4
-test_mont4:
+FUNC(test_mont4)
        testprologue
        testexpand nil, [ebp + 36]
        mov     edi, [ebp + 20]
@@ -1228,6 +1249,7 @@ test_mont4:
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

 #endif

diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index a7ff68b5..0989fd4b 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -60,8 +60,8 @@ FUNC(chacha_core_x86ish_sse2)
 # define SAVE2 xmm7
 # define SAVE3 [esp]

-       push    ebp
-       mov     ebp, esp
+       pushreg ebp
+       setfp   ebp
        sub     esp, 16
        mov     IN, [ebp + 12]
        mov     OUT, [ebp + 16]
@@ -101,11 +101,11 @@ FUNC(chacha_core_x86ish_sse2)
 # define SAVE2 [rsp + 16]
 # define SAVE3 [rsp + 32]

-       sub     rsp, 48 + 8
-       .seh_stackalloc 48 + 8
-       .seh_endprologue
+       stalloc 48 + 8
 #endif

+       endprologue
+
        // First job is to slurp the matrix into XMM registers.  Be careful:
        // the input matrix isn't likely to be properly aligned.
        //
@@ -239,11 +239,11 @@ FUNC(chacha_core_x86ish_sse2)
        // Tidy things up.
 #if CPUFAM_X86
-       mov     esp, ebp
-       pop     ebp
+       dropfp
+       popreg  ebp
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
-       add     rsp, 48 + 8
+       stfree  48 + 8
 #endif

        // And with that, we're done.

diff --git a/symm/rijndael-arm-crypto.S b/symm/rijndael-arm-crypto.S
index 4d7312d4..1e551698 100644
--- a/symm/rijndael-arm-crypto.S
+++ b/symm/rijndael-arm-crypto.S
@@ -70,7 +70,7 @@ FUNC(rijndael_setup_arm_crypto)
        //      r2 = pointer to key material
        //      r3 = key size in words

-       stmfd   sp!, {r4-r9, r14}
+       pushreg r4-r9, r14

        // The initial round key material is taken directly from the input
        // key, so copy it over.  Unfortunately, the key material is not
@@ -209,7 +209,7 @@ FUNC(rijndael_setup_arm_crypto)
        bl      endswap_block

        // All done.
-9:     ldmfd   sp!, {r4-r9, pc}
+9:     popreg  r4-r9, pc

 ENDFUNC

diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S
index b0b880a4..2b99b5c7 100644
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -85,10 +85,10 @@ FUNC(rijndael_setup_x86ish_aesni)
 # define BLKOFF edx                    // block size in bytes

        // Stack the caller's registers.
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi

        // Set up our own variables.
        mov     CTX, [esp + 20]         // context base pointer
@@ -138,17 +138,16 @@ FUNC(rijndael_setup_x86ish_aesni)
        // We'll need the index registers, which belong to the caller in this
        // ABI.
-       push    rsi
-       .seh_pushreg rsi
-       push    rdi
-       .seh_pushreg rdi
-       .seh_endprologue
+       pushreg rsi
+       pushreg rdi

        // Move arguments to more useful places.
        mov     rsi, r8                 // key material
        mov     CTX, rcx                // context base pointer
 #endif

+       endprologue
+
        // The initial round key material is taken directly from the input
        // key, so copy it over.
 #if CPUFAM_AMD64 && ABI_SYSV
@@ -321,14 +320,14 @@ FUNC(rijndael_setup_x86ish_aesni)

 9:     // All done.
 #if CPUFAM_X86
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
-       pop     rdi
-       pop     rsi
+       popreg  rdi
+       popreg  rsi
 #endif
        ret

@@ -337,9 +336,7 @@ ENDFUNC
 INTFUNC(endswap_block)
        // End-swap NKW words starting at SI.  The end-swapping table is
        // already loaded into XMM5; and it's OK to work in 16-byte chunks.
-#if CPUFAM_AMD64 && ABI_WIN
-       .seh_endprologue
-#endif
+       endprologue

        mov     ecx, NKW
 0:     movdqu  xmm1, [SI]
@@ -399,9 +396,10 @@ ENDFUNC
 # define SRC rdx
 # define DST r8
 # define NR eax
-       .seh_endprologue
 #endif

+       endprologue
+
        // Find the magic endianness-swapping table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
@@ -522,9 +520,7 @@ INTFUNC(bogus)
        // might at least provide a hint as to what went wrong; (b) we don't
        // have conditional CALLs (and they'd be big anyway); and (c) we can
        // write a HLT here as a backstop against `abort' being mad.
-#if CPUFAM_AMD64 && ABI_WIN
-       .seh_endprologue
-#endif
+       endprologue

        callext F(abort)
 0:     hlt

diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
index a05cb4e4..ca677f17 100644
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -60,8 +60,8 @@ FUNC(salsa20_core_x86ish_sse2)
 # define SAVE2 [esp + 0]
 # define SAVE3 [esp + 16]

-       push    ebp
-       mov     ebp, esp
+       pushreg ebp
+       setfp   ebp
        sub     esp, 32
        mov     IN, [ebp + 12]
        mov     OUT, [ebp + 16]
@@ -102,15 +102,13 @@ FUNC(salsa20_core_x86ish_sse2)
 # define SAVE2 [rsp + 32]
 # define SAVE3 [rsp + 48]

-       sub     rsp, 64 + 8
-       .seh_stackalloc 64 + 8
-       movdqa  [rsp + 0], xmm6
-       .seh_savexmm xmm6, 0
-       movdqa  [rsp + 16], xmm7
-       .seh_savexmm xmm7, 16
-       .seh_endprologue
+       stalloc 64 + 8
+       savexmm xmm6, 0
+       savexmm xmm7, 16
 #endif

+       endprologue
+
        // First job is to slurp the matrix into XMM registers.  The words
        // have already been permuted conveniently to make them line up
        // better for SIMD processing.
@@ -294,13 +292,13 @@ FUNC(salsa20_core_x86ish_sse2)
        // Tidy things up.
 #if CPUFAM_X86
-       mov     esp, ebp
-       pop     ebp
+       dropfp
+       popreg  ebp
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
-       movdqa  xmm6, [rsp + 0]
-       movdqa  xmm7, [rsp + 16]
-       add     rsp, 64 + 8
+       rstrxmm xmm6, 0
+       rstrxmm xmm7, 16
+       stfree  64 + 8
 #endif

        // And with that, we're done.