From: Mark Wooding
Date: Thu, 29 Dec 2016 15:21:08 +0000 (+0000)
Subject: base/asm-common.h, */*.S: New macros for making stack-unwinding tables.
X-Git-Tag: 2.3.0~17
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/0923a413958b0e778a3f059c76355ab58e5be414

base/asm-common.h, */*.S: New macros for making stack-unwinding tables.

Previously, I only supported Microsoft SEH tables, because they're
basically essential to having a working 64-bit binary (because
Microsoft are crazy and throw asynchronous exceptions).  But there are
three variants of stack-unwinding tables which are useful to make:

  * Microsoft's SEH tables for AMD64, constructed using `.seh_...'
    directives;

  * ARM's `.ARM.exidx' and `.ARM.extab' tables; and

  * Dwarf `.eh_frame' and `.debug_frame' tables.

These are all quite similar in flavour, but different in detail.
Rather than write lots of hairy conditional stuff around subroutine
prologues and epilogues, wrap the whole lot up in some target-specific
macros.
---

diff --git a/base/asm-common.h b/base/asm-common.h
index fdd7fad1..22bb44d6 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -66,6 +66,7 @@
 #define INTFUNC(name)                                          \
        TYPE_FUNC(name);                                        \
        .macro ENDFUNC; _ENDFUNC(name); .endm;                  \
+       .L$_prologue_p = 0; .L$_frameptr_p = 0;                 \
        FUNC_PREHOOK(name);                                     \
 name:                                                          \
        FUNC_POSTHOOK(name)
@@ -77,6 +78,8 @@ INTFUNC(F(name))

 // Marking the end of a function.
 #define _ENDFUNC(name)                                         \
+       .if ~ .L$_prologue_p; .error "Missing `endprologue'"; .endif; \
+       .if .L$_frameptr_p; .purgem dropfp; .endif;             \
        .purgem ENDFUNC;                                        \
        SIZE_OBJ(name);                                         \
        ENDFUNC_HOOK(name);                                     \
@@ -147,6 +150,11 @@ name:
 //     `.seh_pushreg' and friends, and `.seh_endprologue'.
 #endif

+#if __ELF__
+# define FUNC_POSTHOOK(_) .cfi_startproc
+# define ENDFUNC_HOOK(_) .cfi_endproc
+#endif
+
 // Don't use the wretched AT&T syntax.  It's festooned with pointless
 // punctuation, and all of the data movement is backwards.  Ugh!
        .intel_syntax noprefix
@@ -427,6 +435,101 @@ name:
 #endif
 #define WHOLE(reg) _REGFORM(reg, r)

+// Stack management and unwinding.
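+//
+// These macros are meant to be used in matched pairs, with
+// `endprologue' marking the point past which no more unwinding
+// information is needed.  As a sketch, a hypothetical function (for
+// illustration only; it's not part of this commit) might look like
+// this:
+//
+//     FUNC(example)
+//             pushreg ebp             // save caller's frame pointer
+//             setfp   ebp             // establish our own frame
+//             stalloc 64              // reserve space for locals
+//             endprologue             // prologue is now complete
+//             ...
+//             stfree  64              // release the locals
+//             dropfp                  // take the frame down again
+//             popreg  ebp             // restore caller's frame pointer
+//             ret
+//     ENDFUNC
+//
+// `_ENDFUNC' raises an error if `endprologue' is missing; and `setfp'
+// defines a matching `dropfp', which `_ENDFUNC' purges again at the
+// end of the function.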
+.macro setfp fp, offset = 0
+  .if \offset == 0
+       mov     \fp, R_sp(r)
+#if __ELF__
+       .cfi_def_cfa_register \fp
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_setframe \fp, 0
+#endif
+  .else
+       lea     \fp, [R_sp(r) + \offset]
+#if __ELF__
+       .cfi_def_cfa_register \fp
+       .cfi_adjust_cfa_offset -\offset
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_setframe \fp, \offset
+#endif
+  .endif
+       .L$_frameptr_p = -1
+       .macro dropfp; _dropfp \fp, \offset; .endm
+.endm
+
+.macro _dropfp fp, offset = 0
+  .if \offset == 0
+       mov     R_sp(r), \fp
+#if __ELF__
+       .cfi_def_cfa_register R_sp(r)
+#endif
+  .else
+       lea     R_sp(r), [\fp - \offset]
+#if __ELF__
+       .cfi_def_cfa_register R_sp(r)
+       .cfi_adjust_cfa_offset +\offset
+#endif
+  .endif
+       .L$_frameptr_p = 0
+       .purgem dropfp
+.endm
+
+.macro stalloc n
+       sub     R_sp(r), \n
+#if __ELF__
+       .cfi_adjust_cfa_offset +\n
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_stackalloc \n
+#endif
+.endm
+
+.macro stfree n
+       add     R_sp(r), \n
+#if __ELF__
+       .cfi_adjust_cfa_offset -\n
+#endif
+.endm
+
+.macro pushreg r
+       push    \r
+#if __ELF__
+       .cfi_adjust_cfa_offset +WORDSZ
+       .cfi_rel_offset \r, 0
+#endif
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_pushreg \r
+#endif
+.endm
+
+.macro popreg r
+       pop     \r
+#if __ELF__
+       .cfi_adjust_cfa_offset -WORDSZ
+       .cfi_restore \r
+#endif
+.endm
+
+.macro savexmm r, offset
+       movdqa  [R_sp(r) + \offset], \r
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_savexmm \r, \offset
+#endif
+.endm
+
+.macro rstrxmm r, offset
+       movdqa  \r, [R_sp(r) + \offset]
+.endm
+
+.macro endprologue
+#if ABI_WIN && CPUFAM_AMD64
+       .seh_endprologue
+#endif
+       .L$_prologue_p = -1
+.endm
+
 #endif

 #if CPUFAM_X86
@@ -551,8 +654,8 @@ name:
        ARM

 // Set the function hooks.
-#define FUNC_PREHOOK(_) .balign 4
-#define ENDFUNC_HOOK(name) .ltorg
+#define FUNC_PREHOOK(_) .balign 4; .fnstart
+#define ENDFUNC_HOOK(_) .fnend; .ltorg

 // Call external subroutine at ADDR, possibly via PLT.
        .macro callext addr, cond=
@@ -868,6 +971,63 @@ name:
 // Macros for converting vldm/vstm ranges.
 #define QQ(qlo, qhi) D0(qlo)-D1(qhi)

+// Stack management and unwinding.
+.macro setfp fp, offset = 0
+  .if \offset == 0
+       mov     \fp, sp
+       .setfp  \fp, sp
+  .else
+       add     \fp, sp, #\offset
+       .setfp  \fp, sp, #\offset
+  .endif
+       .macro dropfp; _dropfp \fp, \offset; .endm
+       .L$_frameptr_p = -1
+.endm
+
+.macro _dropfp fp, offset = 0
+  .if \offset == 0
+       mov     sp, \fp
+  .else
+       sub     sp, \fp, #\offset
+  .endif
+       .purgem dropfp
+       .L$_frameptr_p = 0
+.endm
+
+.macro stalloc n
+       sub     sp, sp, #\n
+       .pad    #\n
+.endm
+
+.macro stfree n
+       add     sp, sp, #\n
+       .pad    #-\n
+.endm
+
+.macro pushreg rr:vararg
+       stmfd   sp!, {\rr}
+       .save   {\rr}
+.endm
+
+.macro popreg rr:vararg
+       ldmfd   sp!, {\rr}
+.endm
+
+.macro pushvfp rr:vararg
+       vstmdb  sp!, {\rr}
+       .vsave  {\rr}
+.endm
+
+.macro popvfp rr:vararg
+       vldmia  sp!, {\rr}
+.endm
+
+.macro endprologue
+.endm
+
+// No need for prologue markers on ARM.
+#define FUNC_POSTHOOK(_) .L$_prologue_p = -1
+
 #endif

 ///--------------------------------------------------------------------------

diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index 8f69a559..a6613ed0 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -315,6 +315,8 @@ INTFUNC(carryprop)
        // form.  Store the low 128 bits of the represented carry to [EDI] as
        // a packed 128-bit value, and leave the remaining 16 bits in the low
        // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
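+       // (The prologue here is empty: carryprop saves no registers and
+       // allocates no stack, so `endprologue' comes first thing.)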
+       endprologue
+
        propout [edi + 0], xmm4, xmm5
        propout [edi + 4], xmm5, xmm6
        propout [edi + 8], xmm6, nil
@@ -333,6 +335,8 @@ INTFUNC(dmul4)
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        mulacc  [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
        propout [edi + 0], xmm4, xmm5
@@ -365,6 +369,8 @@ INTFUNC(dmla4)
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        carryadd

        mulacc  [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
@@ -395,6 +401,8 @@ INTFUNC(mul4zc)
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
        propout [edi + 0], xmm4, xmm5
@@ -421,6 +429,8 @@ INTFUNC(mul4)
        // and update the carry registers with the carry out.  The registers
        // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
        propout [edi + 0], xmm4, xmm5
@@ -446,6 +456,8 @@ INTFUNC(mla4zc)
        // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
        // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
@@ -478,6 +490,8 @@ INTFUNC(mla4)
        // [EDI], and update the carry registers with the carry out.  The
        // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
        // general-purpose registers are preserved.
+       endprologue
+
        carryadd

        mulacc  [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
@@ -508,7 +522,8 @@ INTFUNC(mmul4)
        // of the sum U V + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
-       sub     esp, 64                 // space for the carries
+       stalloc 64                      // space for the carries
+       endprologue

        // Calculate W = U V, and leave it in the destination.  Stash the
        // carry pieces for later.
@@ -532,7 +547,9 @@ INTFUNC(mmla4)
        // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
        // XMM3, and XMM7 are clobbered; the general-purpose registers are
        // preserved.
-       sub     esp, 64                 // space for the carries
+       stalloc 64                      // space for the carries
+       endprologue
+
        movd    xmm4, [edi + 0]
        movd    xmm5, [edi + 4]
        movd    xmm6, [edi + 8]
@@ -599,7 +616,7 @@ INTFUNC(mmla4)
        paddq   xmm6, [esp + 32]

        // And, with that, we're done.
-       add     esp, 64
+       stfree  64
        ret

 ENDFUNC

@@ -614,6 +631,7 @@ INTFUNC(mont4)
        // of the sum W + N Y to [EDI], leaving the remaining carry in
        // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
        // XMM7 are clobbered; the general-purpose registers are preserved.
+       endprologue

        // Calculate Y = W M.
        mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
@@ -680,13 +698,14 @@ FUNC(mpx_umul4_x86_sse2)
        //
        //      esp +  0        expanded Y (32 bytes)
        //      esp + 32        (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 32
+       endprologue

        // Prepare for the first iteration.
        mov     esi, [ebp + 32]         // -> bv[0]
@@ -753,7 +772,7 @@ FUNC(mpx_umul4_x86_sse2)
        jb      1b

        // All over.
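+       // (With a zero offset, `dropfp' below is just `mov esp, ebp',
+       // undoing the earlier `and esp, ~15' and `sub esp, 32' in one go.)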
-9:     mov     esp, ebp
+9:     dropfp
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

 ENDFUNC

@@ -787,13 +806,14 @@ FUNC(mpxmont_mul4_x86_sse2)
        //      esp + 108       bv limit
        //      esp + 112       (gap)
        //      esp + 124       (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 124
+       endprologue

        // Establish the expanded operands.
        pxor    xmm7, xmm7
@@ -894,11 +914,11 @@ FUNC(mpxmont_mul4_x86_sse2)
        movd    [edi + 16], xmm4

        // All done.
-9:     mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+9:     dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
        ret

 ENDFUNC

@@ -924,13 +944,14 @@ FUNC(mpxmont_redc4_x86_sse2)
        //      esp + 12        expanded M (32 bytes)
        //      esp + 44        expanded Y (32 bytes)
        //      esp + 76        (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 76
+       endprologue

        // Establish the expanded operands and the blocks-of-4 dv limit.
        mov     edi, [ebp + 20]         // -> Z = dv[0]
@@ -1019,11 +1040,11 @@ FUNC(mpxmont_redc4_x86_sse2)
        jmp     5b

        // All over.
-9:     mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+9:     dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
        ret

 ENDFUNC

@@ -1052,13 +1073,14 @@ ENDFUNC
 .endm

 .macro testprologue
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
        and     esp, ~15
        sub     esp, 3*32 + 12
+       endprologue
        // vars:
        //      esp +  0 = cycles
        //      esp + 12 = v expanded
@@ -1067,11 +1089,11 @@ ENDFUNC
 .endm

 .macro testepilogue
-       mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+       dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
        ret
 .endm

@@ -1133,8 +1155,7 @@ ENDFUNC
        movdqu  [ecx + 32], xmm6
 .endm

-       .globl  test_dmul4
-test_dmul4:
+FUNC(test_dmul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
@@ -1144,9 +1165,9 @@ test_dmul4:
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_dmla4
-test_dmla4:
+FUNC(test_dmla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
@@ -1156,9 +1177,9 @@ test_dmla4:
        testtail [ebp + 48], [ebp + 44]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mul4
-test_mul4:
+FUNC(test_mul4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
@@ -1168,9 +1189,9 @@ test_mul4:
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mla4
-test_mla4:
+FUNC(test_mla4)
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
@@ -1180,9 +1201,9 @@ test_mla4:
        testtail [ebp + 40], [ebp + 36]
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mmul4
-test_mmul4:
+FUNC(test_mmul4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
@@ -1196,9 +1217,9 @@ test_mmul4:
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mmla4
-test_mmla4:
+FUNC(test_mmla4)
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
@@ -1212,9 +1233,9 @@ test_mmla4:
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

-       .globl  test_mont4
-test_mont4:
+FUNC(test_mont4)
        testprologue
        testexpand nil, [ebp + 36]
        mov     edi, [ebp + 20]
@@ -1228,6 +1249,7 @@ test_mont4:
        movdqu  [edi + 16], xmm1
        testcarryout [ebp + 24]
        testepilogue
+ENDFUNC

 #endif

diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index a7ff68b5..0989fd4b 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -60,8 +60,8 @@ FUNC(chacha_core_x86ish_sse2)
 # define SAVE2 xmm7
 # define SAVE3 [esp]

-       push    ebp
-       mov     ebp, esp
+       pushreg ebp
+       setfp   ebp
        sub     esp, 16
        mov     IN, [ebp + 12]
        mov     OUT, [ebp + 16]
@@ -101,11 +101,11 @@ FUNC(chacha_core_x86ish_sse2)
 # define SAVE2 [rsp + 16]
 # define SAVE3 [rsp + 32]

-       sub     rsp, 48 + 8
-       .seh_stackalloc 48 + 8
-       .seh_endprologue
+       stalloc 48 + 8
 #endif

+       endprologue
+
        // First job is to slurp the matrix into XMM registers.  Be careful:
        // the input matrix isn't likely to be properly aligned.
        //
@@ -239,11 +239,11 @@ FUNC(chacha_core_x86ish_sse2)
        // Tidy things up.
 #if CPUFAM_X86
-       mov     esp, ebp
-       pop     ebp
+       dropfp
+       popreg  ebp
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
-       add     rsp, 48 + 8
+       stfree  48 + 8
 #endif

        // And with that, we're done.

diff --git a/symm/rijndael-arm-crypto.S b/symm/rijndael-arm-crypto.S
index 4d7312d4..1e551698 100644
--- a/symm/rijndael-arm-crypto.S
+++ b/symm/rijndael-arm-crypto.S
@@ -70,7 +70,7 @@ FUNC(rijndael_setup_arm_crypto)
        //      r2 = pointer to key material
        //      r3 = key size in words

-       stmfd   sp!, {r4-r9, r14}
+       pushreg r4-r9, r14

        // The initial round key material is taken directly from the input
        // key, so copy it over.  Unfortunately, the key material is not
@@ -209,7 +209,7 @@ FUNC(rijndael_setup_arm_crypto)
        bl      endswap_block

        // All done.
-9:     ldmfd   sp!, {r4-r9, pc}
+9:     popreg  r4-r9, pc

 ENDFUNC

diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S
index b0b880a4..2b99b5c7 100644
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -85,10 +85,10 @@ FUNC(rijndael_setup_x86ish_aesni)
 # define BLKOFF edx                    // block size in bytes

        // Stack the caller's registers.
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi

        // Set up our own variables.
        mov     CTX, [esp + 20]         // context base pointer
@@ -138,17 +138,16 @@ FUNC(rijndael_setup_x86ish_aesni)
        // We'll need the index registers, which belong to the caller in this
        // ABI.
-       push    rsi
-       .seh_pushreg rsi
-       push    rdi
-       .seh_pushreg rdi
-       .seh_endprologue
+       pushreg rsi
+       pushreg rdi

        // Move arguments to more useful places.
        mov     rsi, r8                 // key material
        mov     CTX, rcx                // context base pointer
 #endif

+       endprologue
+
        // The initial round key material is taken directly from the input
        // key, so copy it over.
 #if CPUFAM_AMD64 && ABI_SYSV
@@ -321,14 +320,14 @@ FUNC(rijndael_setup_x86ish_aesni)

 9:     // All done.
 #if CPUFAM_X86
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
-       pop     rdi
-       pop     rsi
+       popreg  rdi
+       popreg  rsi
 #endif
        ret

@@ -337,9 +336,7 @@ ENDFUNC
 INTFUNC(endswap_block)
        // End-swap NKW words starting at SI.  The end-swapping table is
        // already loaded into XMM5; and it's OK to work in 16-byte chunks.
-#if CPUFAM_AMD64 && ABI_WIN
-       .seh_endprologue
-#endif
+       endprologue

        mov     ecx, NKW
 0:     movdqu  xmm1, [SI]
@@ -399,9 +396,10 @@ ENDFUNC
 # define SRC rdx
 # define DST r8
 # define NR eax
-       .seh_endprologue
 #endif

+       endprologue
+
        // Find the magic endianness-swapping table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
@@ -522,9 +520,7 @@ INTFUNC(bogus)
        // might at least provide a hint as to what went wrong; (b) we don't
        // have conditional CALLs (and they'd be big anyway); and (c) we can
        // write a HLT here as a backstop against `abort' being mad.
-#if CPUFAM_AMD64 && ABI_WIN
-       .seh_endprologue
-#endif
+       endprologue

        callext F(abort)
 0:     hlt

diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
index a05cb4e4..ca677f17 100644
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -60,8 +60,8 @@ FUNC(salsa20_core_x86ish_sse2)
 # define SAVE2 [esp + 0]
 # define SAVE3 [esp + 16]

-       push    ebp
-       mov     ebp, esp
+       pushreg ebp
+       setfp   ebp
        sub     esp, 32
        mov     IN, [ebp + 12]
        mov     OUT, [ebp + 16]
@@ -102,15 +102,13 @@ FUNC(salsa20_core_x86ish_sse2)
 # define SAVE2 [rsp + 32]
 # define SAVE3 [rsp + 48]

-       sub     rsp, 64 + 8
-       .seh_stackalloc 64 + 8
-       movdqa  [rsp + 0], xmm6
-       .seh_savexmm xmm6, 0
-       movdqa  [rsp + 16], xmm7
-       .seh_savexmm xmm7, 16
-       .seh_endprologue
+       stalloc 64 + 8
+       savexmm xmm6, 0
+       savexmm xmm7, 16
 #endif

+       endprologue
+
        // First job is to slurp the matrix into XMM registers.  The words
        // have already been permuted conveniently to make them line up
        // better for SIMD processing.
@@ -294,13 +292,13 @@ FUNC(salsa20_core_x86ish_sse2)
        // Tidy things up.
 #if CPUFAM_X86
-       mov     esp, ebp
-       pop     ebp
+       dropfp
+       popreg  ebp
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
-       movdqa  xmm6, [rsp + 0]
-       movdqa  xmm7, [rsp + 16]
-       add     rsp, 64 + 8
+       rstrxmm xmm6, 0
+       rstrxmm xmm7, 16
+       stfree  64 + 8
 #endif

        // And with that, we're done.