X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/1a517bb3785891ff6940c73af7c5a136d0250ebf..4ff9d579bfb438187bb69ed60a5b23b0c7a55dfd:/base/asm-common.h diff --git a/base/asm-common.h b/base/asm-common.h index fdd7fad1..ebcba2c6 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -1,6 +1,6 @@ /// -*- mode: asm; asm-comment-char: ?/ -*- /// -/// Fancy SIMD implementation of Salsa20 +/// Common definitions for assembler source files /// /// (c) 2015 Straylight/Edgeware /// @@ -24,6 +24,9 @@ /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, /// MA 02111-1307, USA. +#ifndef CATACOMB_ASM_COMMON_H +#define CATACOMB_ASM_COMMON_H + ///-------------------------------------------------------------------------- /// General definitions. @@ -66,6 +69,7 @@ #define INTFUNC(name) \ TYPE_FUNC(name); \ .macro ENDFUNC; _ENDFUNC(name); .endm; \ + .L$_prologue_p = 0; .L$_frameptr_p = 0; \ FUNC_PREHOOK(name); \ name: \ FUNC_POSTHOOK(name) @@ -77,6 +81,8 @@ INTFUNC(F(name)) // Marking the end of a function. #define _ENDFUNC(name) \ + .if ~ .L$_prologue_p; .error "Missing `endprologue'"; .endif; \ + .if .L$_frameptr_p; .purgem dropfp; .endif; \ .purgem ENDFUNC; \ SIZE_OBJ(name); \ ENDFUNC_HOOK(name); \ @@ -147,6 +153,11 @@ name: // `.seh_pushreg' and friends, and `.seh_endprologue'. #endif +#if __ELF__ +# define FUNC_POSTHOOK(_) .cfi_startproc +# define ENDFUNC_HOOK(_) .cfi_endproc +#endif + // Don't use the wretched AT&T syntax. It's festooned with pointless // punctuation, and all of the data movement is backwards. Ugh! .intel_syntax noprefix @@ -209,11 +220,11 @@ name: # define INTADDR__1(addr, got) addr #endif -// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate, -// suitable for use in `pshufd' or `shufpd', which copies element D -// (0 <= D < 4) of the source to element 3 of the destination, element C to -// element 2, element B to element 1, and element A to element 0. 
-#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a)) +// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate, +// suitable for use in `pshufd' or `shufps', which copies element A +// (0 <= A < 4) of the source to element 0 of the destination, element B to +// element 1, element C to element 2, and element D to element 3. +#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d)) // Map register names to their individual pieces. @@ -247,10 +258,10 @@ name: # define _DECOR_abcd_q(reg) r##reg##x #endif -#define _DECOR_xp_b(reg) reg##l #define _DECOR_xp_w(reg) reg #define _DECOR_xp_d(reg) e##reg #if CPUFAM_AMD64 +# define _DECOR_xp_b(reg) reg##l # define _DECOR_xp_q(reg) r##reg #endif @@ -268,27 +279,39 @@ name: # define _DECOR_rn_r(reg) reg #endif +#define _DECOR_mem_b(addr) byte ptr addr +#define _DECOR_mem_w(addr) word ptr addr +#define _DECOR_mem_d(addr) dword ptr addr +#if CPUFAM_AMD64 +# define _DECOR_mem_q(addr) qword ptr addr +#endif + +#define _DECOR_imm_b(imm) byte imm +#define _DECOR_imm_w(imm) word imm +#define _DECOR_imm_d(imm) dword imm +#if CPUFAM_AMD64 +# define _DECOR_imm_q(imm) qword imm +#endif + #if CPUFAM_X86 # define _DECOR_abcd_r(reg) e##reg##x # define _DECOR_xp_r(reg) e##reg # define _DECOR_ip_r(reg) e##reg +# define _DECOR_mem_r(addr) dword ptr addr +# define _DECOR_imm_r(imm) dword imm #endif #if CPUFAM_AMD64 # define _DECOR_abcd_r(reg) r##reg##x # define _DECOR_xp_r(reg) r##reg # define _DECOR_ip_r(reg) r##reg -#endif - -#define _DECOR_mem_b(addr) byte ptr addr -#define _DECOR_mem_w(addr) word ptr addr -#define _DECOR_mem_d(addr) dword ptr addr -#if CPUFAM_AMD64 -# define _DECOR_mem_q(addr) qword ptr addr +# define _DECOR_mem_r(addr) qword ptr addr +# define _DECOR_imm_r(imm) qword imm #endif // R_r(decor) applies decoration decor to register r, which is an internal // register name. The internal register names are: `ip', `a', `b', `c', `d', // `si', `di', `bp', `sp', `r8'--`r15'. 
+#define R_nil(decor) nil #define R_ip(decor) _DECOR(ip, decor, ip) #define R_a(decor) _DECOR(abcd, decor, a) #define R_b(decor) _DECOR(abcd, decor, b) @@ -313,6 +336,9 @@ name: // address addr (which should supply its own square-brackets). #define MEM(decor, addr) _DECOR(mem, decor, addr) +// Refer to an immediate datum of the type implied by decor. +#define IMM(decor, imm) _DECOR(imm, decor, imm) + // Applies decoration decor to assembler-level register name reg. #define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor) @@ -320,6 +346,8 @@ name: // assembler-level register name, in place of any decoration that register // name has already. +#define _REGFORM_nil(decor) R_nil(decor) + #define _REGFORM_ip(decor) R_ip(decor) #define _REGFORM_eip(decor) R_ip(decor) @@ -427,115 +455,109 @@ name: #endif #define WHOLE(reg) _REGFORM(reg, r) +// Macros for some common registers. +#define AX R_a(r) +#define BX R_b(r) +#define CX R_c(r) +#define DX R_d(r) +#define SI R_si(r) +#define DI R_di(r) +#define BP R_bp(r) +#define SP R_sp(r) + +// Stack management and unwinding. +.macro setfp fp=BP, offset=0 + .if \offset == 0 + mov \fp, SP +#if __ELF__ + .cfi_def_cfa_register \fp #endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_setframe \fp, 0 +#endif + .else + lea \fp, [SP + \offset] +#if __ELF__ + .cfi_def_cfa_register \fp + .cfi_adjust_cfa_offset -\offset +#endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_setframe \fp, \offset +#endif + .endif + .L$_frameptr_p = -1 + .macro dropfp; _dropfp \fp, \offset; .endm +.endm -#if CPUFAM_X86 - -.macro _reg.0 - // Stash GP registers and establish temporary stack frame. - pushfd - push eax - push ecx - push edx - push ebp - mov ebp, esp - and esp, ~15 - sub esp, 512 - fxsave [esp] -.endm - -.macro _reg.1 -.endm - -.macro _reg.2 -.endm - -.macro _reg.3 fmt - // Print FMT and the other established arguments. 
- lea eax, .L$_reg$msg.\@ - push eax - call printf - jmp .L$_reg$cont.\@ -.L$_reg$msg.\@: - .ascii ";; \fmt\n\0" -.L$_reg$cont.\@: - mov eax, ebp - and eax, ~15 - sub eax, 512 - fxrstor [eax] - mov esp, ebp - pop ebp - pop edx - pop ecx - pop eax - popfd -.endm - -.macro msg msg - _reg.0 - _reg.1 - _reg.2 - _reg.3 "\msg" -.endm - -.macro reg r, msg - _reg.0 - .ifeqs "\r", "esp" - lea eax, [ebp + 20] - push eax +.macro _dropfp fp, offset=0 + .if \offset == 0 + mov SP, \fp +#if __ELF__ + .cfi_def_cfa_register SP +#endif .else - .ifeqs "\r", "ebp" - push [ebp] - .else - push \r - .endif + lea SP, [\fp - \offset] +#if __ELF__ + .cfi_def_cfa_register SP + .cfi_adjust_cfa_offset +\offset +#endif .endif - _reg.1 - _reg.2 - _reg.3 "\msg: \r = %08x" + .L$_frameptr_p = 0 + .purgem dropfp .endm -.macro xmmreg r, msg - _reg.0 - _reg.1 - _reg.2 - movdqu xmm0, \r - pshufd xmm0, xmm0, 0x1b - sub esp, 16 - movdqa [esp], xmm0 - _reg.3 "\msg: \r = %08x %08x %08x %08x" +.macro stalloc n + sub SP, \n +#if __ELF__ + .cfi_adjust_cfa_offset +\n +#endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_stackalloc \n +#endif .endm -.macro mmreg r, msg - _reg.0 - _reg.1 - _reg.2 - pshufw \r, \r, 0x4e - sub esp, 8 - movq [esp], \r - _reg.3 "\msg: \r = %08x %08x" +.macro stfree n + add SP, \n +#if __ELF__ + .cfi_adjust_cfa_offset -\n +#endif .endm -.macro freg i, msg - _reg.0 - _reg.1 - _reg.2 - finit - fldt [esp + 32 + 16*\i] - sub esp, 12 - fstpt [esp] - _reg.3 "\msg: st(\i) = %.20Lg" +.macro pushreg r + push \r +#if __ELF__ + .cfi_adjust_cfa_offset +WORDSZ + .cfi_rel_offset \r, 0 +#endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_pushreg \r +#endif .endm -.macro fxreg i, msg - _reg.0 - _reg.1 - _reg.2 - finit - fldt [esp + 32 + 16*\i] - sub esp, 12 - fstpt [esp] - _reg.3 "\msg: st(\i) = %La" +.macro popreg r + pop \r +#if __ELF__ + .cfi_adjust_cfa_offset -WORDSZ + .cfi_restore \r +#endif +.endm + +.macro savexmm r, offset + movdqa [SP + \offset], \r +#if ABI_WIN && CPUFAM_AMD64 + .seh_savexmm \r, \offset 
+#endif +.endm + +.macro rstrxmm r, offset + movdqa \r, [SP + \offset] +.endm + +.macro endprologue +#if ABI_WIN && CPUFAM_AMD64 + .seh_endprologue +#endif + .L$_prologue_p = -1 .endm #endif @@ -551,8 +573,8 @@ name: ARM // Set the function hooks. -#define FUNC_PREHOOK(_) .balign 4 -#define ENDFUNC_HOOK(name) .ltorg +#define FUNC_PREHOOK(_) .balign 4; .fnstart +#define ENDFUNC_HOOK(_) .fnend; .ltorg // Call external subroutine at ADDR, possibly via PLT. .macro callext addr, cond= @@ -603,12 +625,12 @@ name: #if WANT_PIC ldr\cond \reg, .L$_leaextq$\@ .L$_leaextq_pc$\@: - .if .L$_pcoff == 8 + .if .L$_pcoff == 8 ldr\cond \reg, [pc, \reg] - .else + .else add\cond \reg, pc ldr\cond \reg, [\reg] - .endif + .endif _LIT .balign 4 .L$_leaextq$\@: @@ -619,12 +641,37 @@ name: #endif .endm +.macro vzero vz=q15 + // Set VZ (default q15) to zero. + vmov.u32 \vz, #0 +.endm + +.macro vshl128 vd, vn, nbit, vz=q15 + // Set VD to VN shifted left by NBIT. Assume VZ (default q15) is + // all-bits-zero. NBIT must be a multiple of 8. + .if \nbit&7 != 0 + .error "shift quantity must be whole number of bytes" + .endif + vext.8 \vd, \vz, \vn, #16 - (\nbit >> 3) +.endm + +.macro vshr128 vd, vn, nbit, vz=q15 + // Set VD to VN shifted right by NBIT. Assume VZ (default q15) is + // all-bits-zero. NBIT must be a multiple of 8. + .if \nbit&7 != 0 + .error "shift quantity must be whole number of bytes" + .endif + vext.8 \vd, \vn, \vz, #\nbit >> 3 +.endm + // Apply decoration decor to register name reg. #define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor) // Internal macros: `_REGFORM_r(decor)' applies decoration decor to register // name r. +#define _REGFORM_nil(decor) nil + #define _REGFORM_s0(decor) _DECOR(s, decor, 0) #define _REGFORM_s1(decor) _DECOR(s, decor, 1) #define _REGFORM_s2(decor) _DECOR(s, decor, 2) @@ -868,6 +915,205 @@ name: // Macros for converting vldm/vstm ranges. #define QQ(qlo, qhi) D0(qlo)-D1(qhi) +// Stack management and unwinding. 
+.macro setfp fp=r11, offset=0 + .if \offset == 0 + mov \fp, sp + .setfp \fp, sp + .else + add \fp, sp, #\offset + .setfp \fp, sp, #\offset + .endif + .macro dropfp; _dropfp \fp, \offset; .endm + .L$_frameptr_p = -1 +.endm + +.macro _dropfp fp, offset=0 + .if \offset == 0 + mov sp, \fp + .else + sub sp, \fp, #\offset + .endif + .purgem dropfp + .L$_frameptr_p = 0 +.endm + +.macro stalloc n + sub sp, sp, #\n + .pad #\n +.endm + +.macro stfree n + add sp, sp, #\n + .pad #-\n +.endm + +.macro pushreg rr:vararg + push {\rr} + .save {\rr} +.endm + +.macro popreg rr:vararg + pop {\rr} +.endm + +.macro pushvfp rr:vararg + vstmdb sp!, {\rr} + .vsave {\rr} +.endm + +.macro popvfp rr:vararg + vldmia sp!, {\rr} +.endm + +.macro endprologue +.endm + +// No need for prologue markers on ARM. +#define FUNC_POSTHOOK(_) .L$_prologue_p = -1 + +#endif + +///-------------------------------------------------------------------------- +/// AArch64-specific hacking. + +#if CPUFAM_ARM64 + +// Set the function hooks. +#define FUNC_PREHOOK(_) .balign 4 +#define FUNC_POSTHOOK(_) .cfi_startproc; .L$_prologue_p = -1 +#define ENDFUNC_HOOK(_) .cfi_endproc + +// Call external subroutine at ADDR, possibly via PLT. +.macro callext addr + bl \addr +.endm + +// Load address of external symbol ADDR into REG. +.macro leaext reg, addr +#if WANT_PIC + adrp \reg, :got:\addr + ldr \reg, [\reg, #:got_lo12:\addr] +#else + adrp \reg, \addr + add \reg, \reg, #:lo12:\addr +#endif +.endm + +.macro vzero vz=v31 + // Set VZ (default v31) to zero. + dup \vz\().4s, wzr +.endm + +.macro vshl128 vd, vn, nbit, vz=v31 + // Set VD to VN shifted left by NBIT. Assume VZ (default v31) is + // all-bits-zero. NBIT must be a multiple of 8. + .if \nbit&7 != 0 + .error "shift quantity must be whole number of bytes" + .endif + ext \vd\().16b, \vz\().16b, \vn\().16b, #16 - (\nbit >> 3) +.endm + +.macro vshr128 vd, vn, nbit, vz=v31 + // Set VD to VN shifted right by NBIT. Assume VZ (default v31) is + // all-bits-zero. 
NBIT must be a multiple of 8. + .if \nbit&7 != 0 + .error "shift quantity must be whole number of bytes" + .endif + ext \vd\().16b, \vn\().16b, \vz\().16b, #\nbit >> 3 +.endm + +// Stack management and unwinding. +.macro setfp fp=x29, offset=0 + // If you're just going through the motions with a fixed-size stack frame, + // then you want to say `add x29, sp, #OFFSET' directly, which will avoid + // pointlessly restoring sp later. + .if \offset == 0 + mov \fp, sp + .cfi_def_cfa_register \fp + .else + add \fp, sp, #\offset + .cfi_def_cfa_register \fp + .cfi_adjust_cfa_offset -\offset + .endif + .macro dropfp; _dropfp \fp, \offset; .endm + .L$_frameptr_p = -1 +.endm + +.macro _dropfp fp, offset=0 + .if \offset == 0 + mov sp, \fp + .cfi_def_cfa_register sp + .else + sub sp, \fp, #\offset + .cfi_def_cfa_register sp + .cfi_adjust_cfa_offset +\offset + .endif + .purgem dropfp + .L$_frameptr_p = 0 +.endm + +.macro stalloc n + sub sp, sp, #\n + .cfi_adjust_cfa_offset +\n +.endm + +.macro stfree n + add sp, sp, #\n + .cfi_adjust_cfa_offset -\n +.endm + +.macro pushreg x, y=nil + .ifeqs "\y", "nil" + str \x, [sp, #-16]! + .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .else + stp \x, \y, [sp, #-16]! 
+ .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .cfi_rel_offset \y, 8 + .endif +.endm + +.macro popreg x, y=nil + .ifeqs "\y", "nil" + ldr \x, [sp], #16 + .cfi_restore \x + .cfi_adjust_cfa_offset -16 + .else + ldp \x, \y, [sp], #16 + .cfi_restore \x + .cfi_restore \y + .cfi_adjust_cfa_offset -16 + .endif +.endm + +.macro savereg x, y, z=nil + .ifeqs "\z", "nil" + str \x, [sp, \y] + .cfi_rel_offset \x, \y + .else + stp \x, \y, [sp, #\z] + .cfi_rel_offset \x, \z + .cfi_rel_offset \y, \z + 8 + .endif +.endm + +.macro rstrreg x, y, z=nil + .ifeqs "\z", "nil" + ldr \x, [sp, \y] + .cfi_restore \x + .else + ldp \x, \y, [sp, #\z] + .cfi_restore \x + .cfi_restore \y + .endif +.endm + +.macro endprologue +.endm + #endif ///-------------------------------------------------------------------------- @@ -885,7 +1131,11 @@ name: #endif #ifndef F -# define F(name) name +# ifdef SYM_USCORE +# define F(name) _##name +# else +# define F(name) name +# endif #endif #ifndef TYPE_FUNC @@ -896,9 +1146,11 @@ name: # define SIZE_OBJ(name) #endif -#if __ELF__ && defined(WANT_EXECUTABLE_STACK) +#if __ELF__ && !defined(WANT_EXECUTABLE_STACK) .pushsection .note.GNU-stack, "", _SECTTY(progbits) .popsection #endif ///----- That's all, folks -------------------------------------------------- + +#endif