+// INTFUNC(NAME): open an internal function NAME.  Sets the symbol type,
+// defines `ENDFUNC' to close the function again, clears the per-function
+// prologue-seen and frame-pointer-active flags, and emits the entry
+// hooks around the label.
#define INTFUNC(name) \
TYPE_FUNC(name); \
.macro ENDFUNC; _ENDFUNC(name); .endm; \
+ .L$_prologue_p = 0; .L$_frameptr_p = 0; \
FUNC_PREHOOK(name); \
name: \
FUNC_POSTHOOK(name)
// Marking the end of a function.
+// _ENDFUNC(NAME): close function NAME.  Insist that `endprologue' was
+// issued, discard the `dropfp' helper if a frame pointer is still
+// active, retire the `ENDFUNC' macro, and emit the size and end hooks.
+// (The state flags are 0 for clear and -1 for set, so `~' tests clear.)
#define _ENDFUNC(name) \
+ .if ~ .L$_prologue_p; .error "Missing `endprologue'"; .endif; \
+ .if .L$_frameptr_p; .purgem dropfp; .endif; \
.purgem ENDFUNC; \
SIZE_OBJ(name); \
ENDFUNC_HOOK(name); \
// `.seh_pushreg' and friends, and `.seh_endprologue'.
#endif
+// On ELF targets, bracket each function with DWARF call-frame
+// information so unwinders and debuggers can walk the stack through it.
+#if __ELF__
+# define FUNC_POSTHOOK(_) .cfi_startproc
+# define ENDFUNC_HOOK(_) .cfi_endproc
+#endif
+
// Don't use the wretched AT&T syntax. It's festooned with pointless
// punctuation, and all of the data movement is backwards. Ugh!
.intel_syntax noprefix
#endif
+// WHOLE(reg): presumably the full-width form of REG, via _REGFORM's `r'
+// regform -- TODO confirm against the _REGFORM definition.
#define WHOLE(reg) _REGFORM(reg, r)
+// Stack management and unwinding.
+
+// setfp FP, [OFFSET = 0]: establish FP as the frame pointer, with
+// FP = sp + OFFSET.  Emits the matching DWARF CFI and (on Win64) SEH
+// annotations, notes that a frame pointer is active, and defines
+// `dropfp' to tear the frame back down again.
+.macro setfp fp, offset = 0
+ .if \offset == 0
+ mov \fp, R_sp(r)
+#if __ELF__
+ .cfi_def_cfa_register \fp
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+ .seh_setframe \fp, 0
+#endif
+ .else
+ lea \fp, [R_sp(r) + \offset]
+#if __ELF__
+ .cfi_def_cfa_register \fp
+ .cfi_adjust_cfa_offset -\offset
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+ .seh_setframe \fp, \offset
+#endif
+ .endif
+ .L$_frameptr_p = -1
+ .macro dropfp; _dropfp \fp, \offset; .endm
+.endm
+
+// _dropfp FP, [OFFSET = 0]: inverse of `setfp': restore sp from FP,
+// move the CFA back onto the stack pointer, clear the frame-pointer
+// flag, and retire the `dropfp' wrapper that `setfp' defined.  (Win64
+// SEH annotates prologues only, so no `.seh_*' is needed here.)
+.macro _dropfp fp, offset = 0
+ .if \offset == 0
+ mov R_sp(r), \fp
+#if __ELF__
+ .cfi_def_cfa_register R_sp(r)
+#endif
+ .else
+ lea R_sp(r), [\fp - \offset]
+#if __ELF__
+ .cfi_def_cfa_register R_sp(r)
+ .cfi_adjust_cfa_offset +\offset
+#endif
+ .endif
+ .L$_frameptr_p = 0
+ .purgem dropfp
+.endm
+
+// stalloc N: allocate N bytes of stack, with CFI and Win64 SEH
+// annotations to match.
+.macro stalloc n
+ sub R_sp(r), \n
+#if __ELF__
+ .cfi_adjust_cfa_offset +\n
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+ .seh_stackalloc \n
+#endif
+.endm
+
+// stfree N: release N bytes of stack.  Epilogue-side, so only the DWARF
+// CFA needs adjusting (Win64 SEH describes prologues only).
+.macro stfree n
+ add R_sp(r), \n
+#if __ELF__
+ .cfi_adjust_cfa_offset -\n
+#endif
+.endm
+
+// pushreg R: push callee-save register R in the prologue, recording the
+// save in the DWARF CFI and (on Win64) the SEH unwind data.
+.macro pushreg r
+ push \r
+#if __ELF__
+ .cfi_adjust_cfa_offset +WORDSZ
+ .cfi_rel_offset \r, 0
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+ .seh_pushreg \r
+#endif
+.endm
+
+// popreg R: pop callee-save register R in the epilogue and retract the
+// CFI bookkeeping.  (Win64 SEH annotates prologues only, so there is no
+// `.seh_*' counterpart.)
+.macro popreg r
+ pop \r
+#if __ELF__
+ .cfi_adjust_cfa_offset -WORDSZ
+ .cfi_restore \r
+#endif
+.endm
+
+// savexmm R, OFFSET: store XMM register R (aligned) at sp + OFFSET and
+// record the save for Win64 SEH.  NOTE(review): no DWARF annotation is
+// emitted for the saved register -- confirm ELF unwinders don't need it.
+.macro savexmm r, offset
+ movdqa [R_sp(r) + \offset], \r
+#if ABI_WIN && CPUFAM_AMD64
+ .seh_savexmm \r, \offset
+#endif
+.endm
+
+// rstrxmm R, OFFSET: reload XMM register R from sp + OFFSET (epilogue
+// side, so no unwind annotations are required).
+.macro rstrxmm r, offset
+ movdqa \r, [R_sp(r) + \offset]
+.endm
+
+// endprologue: mark the end of the function prologue.  Emits the Win64
+// SEH terminator and sets the flag that `_ENDFUNC' insists upon.
+.macro endprologue
+#if ABI_WIN && CPUFAM_AMD64
+ .seh_endprologue
+#endif
+ .L$_prologue_p = -1
+.endm
+
#endif
#if CPUFAM_X86
ARM
// Set the function hooks.
-#define FUNC_PREHOOK(_) .balign 4
-#define ENDFUNC_HOOK(name) .ltorg
+#define FUNC_PREHOOK(_) .balign 4; .fnstart
+#define ENDFUNC_HOOK(_) .fnend; .ltorg
// Call external subroutine at ADDR, possibly via PLT.
.macro callext addr, cond=
// Macros for converting vldm/vstm ranges.
+// QQ(QLO, QHI): D-register range spanning Q-registers QLO..QHI for
+// vldm/vstm -- presumably D0/D1 select the low/high D halves; confirm
+// against their definitions.
#define QQ(qlo, qhi) D0(qlo)-D1(qhi)
+// Stack management and unwinding.
+
+// setfp FP, [OFFSET = 0]: establish FP = sp + OFFSET as the frame
+// pointer, recording it in the EHABI unwind data; note that a frame
+// pointer is active, and define `dropfp' to undo the whole thing.
+.macro setfp fp, offset = 0
+ .if \offset == 0
+ mov \fp, sp
+ .setfp \fp, sp
+ .else
+ add \fp, sp, #\offset
+ .setfp \fp, sp, #\offset
+ .endif
+ .macro dropfp; _dropfp \fp, \offset; .endm
+ .L$_frameptr_p = -1
+.endm
+
+// _dropfp FP, [OFFSET = 0]: inverse of `setfp': recover sp from FP,
+// clear the frame-pointer flag, and retire the `dropfp' wrapper.
+.macro _dropfp fp, offset = 0
+ .if \offset == 0
+ mov sp, \fp
+ .else
+ sub sp, \fp, #\offset
+ .endif
+ .purgem dropfp
+ .L$_frameptr_p = 0
+.endm
+
+// stalloc N: allocate N bytes of stack, recording it with `.pad'.
+.macro stalloc n
+ sub sp, sp, #\n
+ .pad #\n
+.endm
+
+// stfree N: release N bytes of stack, rewinding the `.pad' accounting.
+.macro stfree n
+ add sp, sp, #\n
+ .pad #-\n
+.endm
+
+// pushreg REGS...: push the listed registers, recording them with
+// `.save' for unwinding.
+.macro pushreg rr:vararg
+ stmfd sp!, {\rr}
+ .save {\rr}
+.endm
+
+// popreg REGS...: pop the listed registers (epilogue side; no unwind
+// annotation required).
+.macro popreg rr:vararg
+ ldmfd sp!, {\rr}
+.endm
+
+// pushvfp REGS...: push the listed VFP registers, recording them with
+// `.vsave' for unwinding.
+.macro pushvfp rr:vararg
+ vstmdb sp!, {\rr}
+ .vsave {\rr}
+.endm
+
+// popvfp REGS...: pop the listed VFP registers (epilogue side; no
+// unwind annotation required).
+.macro popvfp rr:vararg
+ vldmia sp!, {\rr}
+.endm
+
+// endprologue: nothing to do on ARM; the prologue-seen flag is set
+// directly by FUNC_POSTHOOK below.
+.macro endprologue
+.endm
+
+// No need for prologue markers on ARM.
+#define FUNC_POSTHOOK(_) .L$_prologue_p = -1
+
#endif
///--------------------------------------------------------------------------
// form. Store the low 128 bits of the represented carry to [EDI] as
// a packed 128-bit value, and leave the remaining 16 bits in the low
// 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
+ endprologue
+
propout [edi + 0], xmm4, xmm5
propout [edi + 4], xmm5, xmm6
propout [edi + 8], xmm6, nil
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
propout [edi + 0], xmm4, xmm5
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
// and update the carry registers with the carry out. The registers
// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
propout [edi + 0], xmm4, xmm5
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- sub esp, 64 // space for the carries
+ stalloc 64 // space for the carries
+ endprologue
// Calculate W = U V, and leave it in the destination. Stash the
// carry pieces for later.
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- sub esp, 64 // space for the carries
+ stalloc 64 // space for the carries
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
paddq xmm6, [esp + 32]
// And, with that, we're done.
- add esp, 64
+ stfree 64
ret
ENDFUNC
// of the sum W + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
+ endprologue
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
//
// esp + 0 expanded Y (32 bytes)
// esp + 32 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 32
+ endprologue
// Prepare for the first iteration.
mov esi, [ebp + 32] // -> bv[0]
jb 1b
// All over.
-9: mov esp, ebp
+9: dropfp
pop edi
pop esi
pop ebx
// esp + 108 bv limit
// esp + 112 (gap)
// esp + 124 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 124
+ endprologue
// Establish the expanded operands.
pxor xmm7, xmm7
movd [edi + 16], xmm4
// All done.
-9: mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+9: dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
ENDFUNC
// esp + 12 expanded M (32 bytes)
// esp + 44 expanded Y (32 bytes)
// esp + 76 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 76
+ endprologue
// Establish the expanded operands and the blocks-of-4 dv limit.
mov edi, [ebp + 20] // -> Z = dv[0]
jmp 5b
// All over.
-9: mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+9: dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
ENDFUNC
.endm
+// testprologue: common prologue for the test functions -- save the
+// callee-save registers, establish EBP as frame pointer, align the
+// stack to 16 bytes, and allocate local workspace.
.macro testprologue
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 3*32 + 12
+ endprologue
// vars:
// esp + 0 = cycles
// esp + 12 = v expanded
.endm
+// testepilogue: tear down the frame built by `testprologue', restore
+// the saved callee-save registers, and return to the caller.
.macro testepilogue
- mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+ dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
.endm
movdqu [ecx + 32], xmm6
.endm
- .globl test_dmul4
-test_dmul4:
+FUNC(test_dmul4)
+ // Exported test harness: drive this primitive via the shared test*
+ // macros, with operands and carries taken EBP-relative from the
+ // caller's stack frame.
testprologue
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
testtail [ebp + 48], [ebp + 44]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_dmla4
-test_dmla4:
+FUNC(test_dmla4)
+ // Exported test harness: drive this primitive via the shared test*
+ // macros, with operands and carries taken EBP-relative from the
+ // caller's stack frame.
testprologue
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
testtail [ebp + 48], [ebp + 44]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mul4
-test_mul4:
+FUNC(test_mul4)
+ // Exported test harness: drive this primitive via the shared test*
+ // macros; note only one operand is expanded (first argument `nil').
testprologue
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
testtail [ebp + 40], [ebp + 36]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mla4
-test_mla4:
+FUNC(test_mla4)
+ // Exported test harness: drive this primitive via the shared test*
+ // macros; note only one operand is expanded (first argument `nil').
testprologue
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
testtail [ebp + 40], [ebp + 36]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mmul4
-test_mmul4:
+FUNC(test_mmul4)
+ // Exported test harness: drive this primitive via the shared test*
+ // macros; results are written through EDI (loaded from [ebp + 20]).
testprologue
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mmla4
-test_mmla4:
+FUNC(test_mmla4)
+ // Exported test harness: drive this primitive via the shared test*
+ // macros; results are written through EDI (loaded from [ebp + 20]).
testprologue
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mont4
-test_mont4:
+FUNC(test_mont4)
+ // Exported test harness: drive this primitive via the shared test*
+ // macros; only one operand is expanded, and results are written
+ // through EDI (loaded from [ebp + 20]).
testprologue
testexpand nil, [ebp + 36]
mov edi, [ebp + 20]
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
#endif