From: Mark Wooding
Date: Tue, 29 Oct 2019 18:55:16 +0000 (+0000)
Subject: base/asm-common.h (x86), and knock-on: Add macros for full-size regs.
X-Git-Tag: 2.6.0~26
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/a90d420cbe87490c844ae422c966e746d3134b07

base/asm-common.h (x86), and knock-on: Add macros for full-size regs.

These registers get used a lot as pointers, so it's useful to be able
to refer to them as full-width registers more conveniently than
`R_sp(r)'.

Introduce (C preprocessor) macros `AX', ..., for this purpose, and use
them extensively. (Delete the existing `SI' and `DI' macros from
`rijndael-x86ish-aesni.S' which had the same purpose.)
---

diff --git a/base/asm-common.h b/base/asm-common.h
index 44c223da..d162a5d9 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -452,10 +452,20 @@ name:
 #endif
 #define WHOLE(reg) _REGFORM(reg, r)

+// Macros for some common registers.
+#define AX R_a(r)
+#define BX R_b(r)
+#define CX R_c(r)
+#define DX R_d(r)
+#define SI R_si(r)
+#define DI R_di(r)
+#define BP R_bp(r)
+#define SP R_sp(r)
+
 // Stack management and unwinding.
-.macro setfp fp=R_bp(r), offset=0
+.macro setfp fp=BP, offset=0
 .if \offset == 0
- mov \fp, R_sp(r)
+ mov \fp, SP
 #if __ELF__
 .cfi_def_cfa_register \fp
 #endif
@@ -463,7 +473,7 @@ name:
 .seh_setframe \fp, 0
 #endif
 .else
- lea \fp, [R_sp(r) + \offset]
+ lea \fp, [SP + \offset]
 #if __ELF__
 .cfi_def_cfa_register \fp
 .cfi_adjust_cfa_offset -\offset
@@ -478,14 +488,14 @@ name:
 .macro _dropfp fp, offset=0
 .if \offset == 0
- mov R_sp(r), \fp
+ mov SP, \fp
 #if __ELF__
- .cfi_def_cfa_register R_sp(r)
+ .cfi_def_cfa_register SP
 #endif
 .else
- lea R_sp(r), [\fp - \offset]
+ lea SP, [\fp - \offset]
 #if __ELF__
- .cfi_def_cfa_register R_sp(r)
+ .cfi_def_cfa_register SP
 .cfi_adjust_cfa_offset +\offset
 #endif
 .endif
@@ -494,7 +504,7 @@ name:
 .endm

 .macro stalloc n
- sub R_sp(r), \n
+ sub SP, \n
 #if __ELF__
 .cfi_adjust_cfa_offset +\n
 #endif
@@ -504,7 +514,7 @@ name:
 .endm

 .macro stfree n
- add R_sp(r), \n
+ add SP, \n
 #if __ELF__
 .cfi_adjust_cfa_offset -\n
 #endif
@@ -530,14 +540,14 @@ name:
 .endm

 .macro savexmm r, offset
- movdqa [R_sp(r) + \offset], \r
+ movdqa [SP + \offset], \r
 #if ABI_WIN && CPUFAM_AMD64
 .seh_savexmm \r, \offset
 #endif
 .endm

 .macro rstrxmm r, offset
- movdqa \r, [R_sp(r) + \offset]
+ movdqa \r, [SP + \offset]
 .endm

 .macro endprologue
diff --git a/base/dispatch-x86ish.S b/base/dispatch-x86ish.S
index 8c6a2a8f..c3725fc6 100644
--- a/base/dispatch-x86ish.S
+++ b/base/dispatch-x86ish.S
@@ -46,9 +46,9 @@ FUNC(dispatch_x86ish_cpuid)
 #if CPUFAM_X86
 pushreg ebx
 pushreg edi
- mov edi, [esp + 12]
- mov eax, [esp + 16]
- mov ecx, [esp + 20]
+ mov edi, [SP + 12]
+ mov eax, [SP + 16]
+ mov ecx, [SP + 20]
 # define OUT edi
 #endif
 #if CPUFAM_AMD64 && ABI_SYSV
@@ -69,21 +69,21 @@ FUNC(dispatch_x86ish_cpuid)
 // First, check that this is even a thing, using the complicated
 // dance with the flags register.
 pushf
- pop R_d(r) // current flags in d
- or R_d(r), EFLAGS_ID // force the id bit on and check it
- push R_d(r)
+ pop DX // current flags in d
+ or DX, EFLAGS_ID // force the id bit on and check it
+ push DX
 popf
 pushf
- pop R_d(r)
+ pop DX
 test edx, EFLAGS_ID
 jz 8f

- and R_d(r), ~EFLAGS_ID // force the id bit off and check it
- push R_d(r)
+ and DX, ~EFLAGS_ID // force the id bit off and check it
+ push DX
 popf
 pushf
- pop R_d(r)
+ pop DX
 test edx, EFLAGS_ID
 jnz 8f
@@ -124,32 +124,32 @@ FUNC(dispatch_x86ish_xmmregisters_p)
 // Enter with no arguments. Return nonzero if the XMM registers are
 // usable.
- pushreg R_bp(r) + pushreg BP setfp stalloc 512 - and R_sp(r), ~15 + and SP, ~15 endprologue // Save the floating point and SIMD registers, and try to clobber // xmm0. - fxsave [R_sp(r)] - mov eax, [R_sp(r) + 160] - xor dword ptr [R_sp(r) + 160], 0xaaaa5555 - fxrstor [R_sp(r)] + fxsave [SP] + mov eax, [SP + 160] + xor dword ptr [SP + 160], 0xaaaa5555 + fxrstor [SP] // Save them again, and read back the low word of xmm0. Undo the // clobbering and restore. - fxsave [R_sp(r)] - mov ecx, [R_sp(r) + 160] - mov [R_sp(r) + 160], eax - fxrstor [R_sp(r)] + fxsave [SP] + mov ecx, [SP + 160] + mov [SP + 160], eax + fxrstor [SP] // The register are live if we read different things. xor eax, ecx // Done. dropfp - popreg R_bp(r) + popreg BP ret ENDFUNC @@ -164,7 +164,7 @@ FUNC(dispatch_x86ish_rdrand) #if CPUFAM_X86 # define X_OUT edx # define COUNT ecx - mov X_OUT, [esp + 4] + mov X_OUT, [SP + 4] #endif #if CPUFAM_AMD64 && ABI_SYSV # define X_OUT rdi diff --git a/base/regdump-x86ish.S b/base/regdump-x86ish.S index e4dd8e80..67a4ae0e 100644 --- a/base/regdump-x86ish.S +++ b/base/regdump-x86ish.S @@ -56,48 +56,48 @@ FUNC(regdump_gpsave) cld // Save r/ebp and establish it pointing to the save area. - mov [R_sp(r) + WORDSZ + REGIX_BP*WORDSZ], R_bp(r) - lea R_bp(r), [R_sp(r) + WORDSZ] + mov [SP + WORDSZ + REGIX_BP*WORDSZ], BP + lea BP, [SP + WORDSZ] // Save the other easy general-purpose registers. #if !CPUFAM_X86 - mov [R_bp(r) + REGIX_BX*WORDSZ], R_b(r) + mov [BP + REGIX_BX*WORDSZ], BX #endif - mov [R_bp(r) + REGIX_CX*WORDSZ], R_c(r) - mov [R_bp(r) + REGIX_DX*WORDSZ], R_d(r) - mov [R_bp(r) + REGIX_SI*WORDSZ], R_si(r) - mov [R_bp(r) + REGIX_DI*WORDSZ], R_di(r) + mov [BP + REGIX_CX*WORDSZ], CX + mov [BP + REGIX_DX*WORDSZ], DX + mov [BP + REGIX_SI*WORDSZ], SI + mov [BP + REGIX_DI*WORDSZ], DI #if CPUFAM_AMD64 - mov [R_bp(r) + REGIX_R8*WORDSZ], R_r8(r) - mov [R_bp(r) + REGIX_R9*WORDSZ], R_r9(r) - mov [R_bp(r) + REGIX_R10*WORDSZ], R_r10(r) - mov [R_bp(r) + REGIX_R11*WORDSZ], R_r11(r) - mov [R_bp(r) + REGIX_R12*WORDSZ], R_r12(r) - mov [R_bp(r) + REGIX_R13*WORDSZ], R_r13(r) - mov [R_bp(r) + REGIX_R14*WORDSZ], R_r14(r) - mov [R_bp(r) + REGIX_R15*WORDSZ], R_r15(r) + mov [BP + REGIX_R8*WORDSZ], r8 + mov [BP + REGIX_R9*WORDSZ], r9 + mov [BP + REGIX_R10*WORDSZ], r10 + mov [BP + REGIX_R11*WORDSZ], r11 + mov [BP + REGIX_R12*WORDSZ], r12 + mov [BP + REGIX_R13*WORDSZ], r13 + mov [BP + REGIX_R14*WORDSZ], r14 + mov [BP + REGIX_R15*WORDSZ], r15 #endif // Determine the previous stack pointer and save it. #if CPUFAM_AMD64 && ABI_SYSV - lea R_a(r), [R_bp(r) + 128 + REGDUMP_GPSIZE] + lea AX, [BP + 128 + REGDUMP_GPSIZE] #else - lea R_a(r), [R_bp(r) + REGDUMP_GPSIZE] + lea AX, [BP + REGDUMP_GPSIZE] #endif - mov [R_bp(r) + REGIX_SP*WORDSZ], R_a(r) + mov [BP + REGIX_SP*WORDSZ], AX // Collect the return address and save it as r/eip. - mov R_a(r), [R_sp(r)] - mov [R_bp(r) + REGIX_IP*WORDSZ], R_a(r) + mov AX, [SP] + mov [BP + REGIX_IP*WORDSZ], AX // Save the segment registers. - lea R_a(r), [R_bp(r) + REGIX_GPLIM*WORDSZ] - mov [R_a(r) + 2*REGIX_CS], cs - mov [R_a(r) + 2*REGIX_DS], ds - mov [R_a(r) + 2*REGIX_SS], ss - mov [R_a(r) + 2*REGIX_ES], es - mov [R_a(r) + 2*REGIX_FS], fs - mov [R_a(r) + 2*REGIX_GS], gs + lea AX, [BP + REGIX_GPLIM*WORDSZ] + mov [AX + 2*REGIX_CS], cs + mov [AX + 2*REGIX_DS], ds + mov [AX + 2*REGIX_SS], ss + mov [AX + 2*REGIX_ES], es + mov [AX + 2*REGIX_FS], fs + mov [AX + 2*REGIX_GS], gs // Determine the extended save area size. 
Preserve ebx on 32-bit x86 // here, because the caller needs it for PLT-indirect calls. @@ -135,23 +135,23 @@ FUNC(regdump_gprstr) // We assume nobody actually fiddled with the segment registers. So // just the actual integer registers to do. - mov R_a(r), [R_bp(r) + REGIX_AX*WORDSZ] - mov R_b(r), [R_bp(r) + REGIX_BX*WORDSZ] - mov R_c(r), [R_bp(r) + REGIX_CX*WORDSZ] - mov R_d(r), [R_bp(r) + REGIX_DX*WORDSZ] - mov R_si(r), [R_bp(r) + REGIX_SI*WORDSZ] - mov R_di(r), [R_bp(r) + REGIX_DI*WORDSZ] + mov AX, [BP + REGIX_AX*WORDSZ] + mov BX, [BP + REGIX_BX*WORDSZ] + mov CX, [BP + REGIX_CX*WORDSZ] + mov DX, [BP + REGIX_DX*WORDSZ] + mov SI, [BP + REGIX_SI*WORDSZ] + mov DI, [BP + REGIX_DI*WORDSZ] #if CPUFAM_AMD64 - mov R_r8(r), [R_bp(r) + REGIX_R8*WORDSZ] - mov R_r9(r), [R_bp(r) + REGIX_R9*WORDSZ] - mov R_r10(r), [R_bp(r) + REGIX_R10*WORDSZ] - mov R_r11(r), [R_bp(r) + REGIX_R11*WORDSZ] - mov R_r12(r), [R_bp(r) + REGIX_R12*WORDSZ] - mov R_r13(r), [R_bp(r) + REGIX_R13*WORDSZ] - mov R_r14(r), [R_bp(r) + REGIX_R14*WORDSZ] - mov R_r15(r), [R_bp(r) + REGIX_R15*WORDSZ] + mov r8, [BP + REGIX_R8*WORDSZ] + mov r9, [BP + REGIX_R9*WORDSZ] + mov r10, [BP + REGIX_R10*WORDSZ] + mov r11, [BP + REGIX_R11*WORDSZ] + mov r12, [BP + REGIX_R12*WORDSZ] + mov r13, [BP + REGIX_R13*WORDSZ] + mov r14, [BP + REGIX_R14*WORDSZ] + mov r15, [BP + REGIX_R15*WORDSZ] #endif - mov R_bp(r), [R_bp(r) + REGIX_BP*WORDSZ] + mov BP, [BP + REGIX_BP*WORDSZ] // Done. ret @@ -175,11 +175,11 @@ FUNC(regdump_xtsave) // general registers are clobbered. // Start by filling in the easy parts of the map. - mov [R_sp(r) + WORDSZ + regmap_gp], R_bp(r) - lea R_bp(r), [R_sp(r) + WORDSZ] + mov [SP + WORDSZ + regmap_gp], BP + lea BP, [SP + WORDSZ] xor eax, eax // clears rax too on amd64 - mov [R_bp(r) + regmap_avx], R_a(r) + mov [BP + regmap_avx], AX // Find out whether we use `xsave'. (Preserve ebx.) #if CPUFAM_X86 @@ -191,40 +191,40 @@ FUNC(regdump_xtsave) je 5f // We have the `xsave' machinery. Select the base address. - lea R_si(r), [R_sp(r) + WORDSZ + regmap_size + 63] - and R_si(r), ~63 - mov [R_bp(r) + regmap_fx], R_si(r) + lea SI, [SP + WORDSZ + regmap_size + 63] + and SI, ~63 + mov [BP + regmap_fx], SI // Clear out the header area. xor eax, eax - lea R_di(r), [R_si(r) + 512] + lea DI, [SI + 512] mov ecx, 16 rep stosd // Save the registers. mov eax, 0x00000007 xor edx, edx - xsave [R_si(r)] + xsave [SI] // Establish the AVX pointer, if available. - test dword ptr [R_si(r) + 512], 4 // = xstate_bv + test dword ptr [SI + 512], 4 // = xstate_bv je 8f mov eax, 13 mov ecx, 2 cpuid - add R_b(r), R_si(r) - mov [R_bp(r) + regmap_avx], R_b(r) + add BX, SI + mov [BP + regmap_avx], BX jmp 8f // We have only `fxsave'. Set the base address. -5: lea R_si(r), [R_sp(r) + WORDSZ + regmap_size + 15] - and R_si(r), ~15 - mov [R_bp(r) + regmap_fx], R_si(r) +5: lea SI, [SP + WORDSZ + regmap_size + 15] + and SI, ~15 + mov [BP + regmap_fx], SI // Save the registers. - fxsave [R_si(r)] + fxsave [SI] // Clear the x87 state; otherwise it can cause trouble later. 8: fninit @@ -245,7 +245,7 @@ FUNC(regdump_xtrstr) // 32-bit x86, and the other general registers are clobbered. // Find the extended register dump. - mov R_si(r), [R_bp(r) + regmap_fx] + mov SI, [BP + regmap_fx] // Probe to find out whether we have `xsave'. #if CPUFAM_X86 @@ -259,14 +259,14 @@ FUNC(regdump_xtrstr) // We have the `xsave' machinery. mov eax, 0x00000007 xor edx, edx - xrstor [R_si(r)] + xrstor [SI] jmp 8f // We must fake it up. -1: fxrstor [R_si(r)] +1: fxrstor [SI] // Done. 
-8: mov R_bp(r), [R_bp(r) + regmap_gp] +8: mov BP, [BP + regmap_gp] #if CPUFAM_X86 pop ebx #endif diff --git a/base/regdump.h b/base/regdump.h index db968642..3ada7eac 100644 --- a/base/regdump.h +++ b/base/regdump.h @@ -381,32 +381,32 @@ DO8(REGDEF_SIMD) // Stash r/eax. This is bletcherous: hope we don't get a signal in // the next few instructions. - mov [R_sp(r) - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], R_a(r) + mov [SP - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], AX .ifnes "\addr", "nil" // Collect the effective address for the following dump, leaving it // in the `addr' slot of the dump. - lea R_a(r), \addr - mov [R_sp(r) - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], R_a(r) + lea AX, \addr + mov [SP - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], AX .endif // Make space for the register save area. On AMD64 with System/V // ABI, also skip the red zone. Use `lea' here to preserve the // flags. - lea R_sp(r), [R_sp(r) - REGDUMP_SPADJ] + lea SP, [SP - REGDUMP_SPADJ] // Save flags and general-purpose registers. On 32-bit x86, we save // ebx here and establish a GOT pointer here for the benefit of the // PLT-indirect calls made later on. pushf # if CPUFAM_X86 - mov [esp + 4*REGIX_BX], ebx + mov [SP + 4*REGIX_BX], ebx ldgot # endif callext F(regdump_gpsave) // Make space for the extended registers. - sub R_sp(r), R_c(r) + sub SP, CX callext F(regdump_xtsave) // Prepare for calling back into C. On 32-bit x86, leave space for @@ -414,11 +414,11 @@ DO8(REGDEF_SIMD) // the `shadow space' for the called-function's arguments. Also, // forcibly align the stack pointer to a 16-byte boundary. # if CPUFAM_X86 - sub esp, 16 + sub SP, 16 # elif ABI_WIN - sub rsp, 32 + sub SP, 32 # endif - and R_sp(r), ~15 + and SP, ~15 .endm .macro _rstrregs @@ -426,27 +426,27 @@ DO8(REGDEF_SIMD) // We assume r/ebp still points to the register map. 
callext F(regdump_xtrstr) - mov R_sp(r), R_bp(r) + mov SP, BP callext F(regdump_gprstr) popf - lea R_sp(r), [R_sp(r) + REGDUMP_SPADJ] + lea SP, [SP + REGDUMP_SPADJ] .endm .macro _regbase # if CPUFAM_X86 - mov [esp + 0], ebp + mov [SP + 0], BP # elif ABI_SYSV - mov rdi, rbp + mov rdi, BP # elif ABI_WIN - mov rcx, rbp + mov rcx, BP # endif .endm .macro _membase - mov R_a(r), [R_bp(r) + regmap_gp] + mov AX, [BP + regmap_gp] # if CPUFAM_X86 mov eax, [eax + REGIX_ADDR*WORDSZ] - mov [esp + 0], eax + mov [SP + 0], eax # elif ABI_SYSV mov rdi, [rax + REGIX_ADDR*WORDSZ] # elif ABI_WIN @@ -457,7 +457,7 @@ DO8(REGDEF_SIMD) .macro _reglbl msg .ifeqs "\msg", "" # if CPUFAM_X86 - mov dword ptr [esp + 4], 0 + mov dword ptr [SP + 4], 0 # elif ABI_SYSV xor esi, esi # elif ABI_WIN @@ -466,7 +466,7 @@ DO8(REGDEF_SIMD) .else # if CPUFAM_X86 lea eax, [INTADDR(.L$_reglbl$\@)] - mov [esp + 4], eax + mov [SP + 4], eax # elif ABI_SYSV lea rsi, [INTADDR(.L$_reglbl$\@)] # elif ABI_WIN @@ -481,7 +481,7 @@ DO8(REGDEF_SIMD) .macro _regfmt arg # if CPUFAM_X86 - mov dword ptr [esp + 8], \arg + mov dword ptr [SP + 8], \arg # elif ABI_SYSV mov edx, \arg # elif ABI_WIN diff --git a/base/test-regdump-x86ish.S b/base/test-regdump-x86ish.S index a8c8d435..41ba77f7 100644 --- a/base/test-regdump-x86ish.S +++ b/base/test-regdump-x86ish.S @@ -10,9 +10,9 @@ vec: FUNC(main) - pushreg R_bp(r) + pushreg BP setfp - and R_sp(r), ~15 + and SP, ~15 endprologue fldz @@ -32,7 +32,7 @@ FUNC(main) xor eax, eax dropfp - popreg R_bp(r) + popreg BP ret ENDFUNC diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index d313765f..51e94c58 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -575,10 +575,10 @@ INTFUNC(mmla4) movdqu xmm4, [rax] #if ABI_WIN stalloc 48 + 8 // space for the carries -# define STKTMP(i) [rsp + i] +# define STKTMP(i) [SP + i] #endif #if ABI_SYSV -# define STKTMP(i) [rsp + i - 48 - 8] // use red zone +# define STKTMP(i) [SP + i - 48 - 8] // use red zone #endif endprologue @@ -811,7 +811,7 @@ FUNC(mpx_umul4_amd64_sse2) endprologue mov rdi, DV - mov BVL, [rsp + 224] + mov BVL, [SP + 224] #endif @@ -978,8 +978,8 @@ FUNC(mpxmont_mul4_amd64_sse2) endprologue mov rdi, DV - mov N, [rsp + 224] - mov MI, [rsp + 232] + mov N, [SP + 224] + mov MI, [SP + 232] #endif @@ -1183,7 +1183,7 @@ FUNC(mpxmont_redc4_amd64_sse2) endprologue mov rdi, DV - mov MI, [rsp + 224] + mov MI, [SP + 224] #endif @@ -1329,7 +1329,7 @@ ENDFUNC # define ARG8 STKARG(4) # define STKARG_OFFSET 224 #endif -#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] +#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)] // sysv win // dmul smul mmul mont dmul smul mmul mont diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 904c0d0a..ba7ae6a3 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -561,9 +561,9 @@ INTFUNC(mmla4) mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t propout [edi + 12], xmm7, xmm4 - movdqa [esp + 0], xmm4 - movdqa [esp + 16], xmm5 - movdqa [esp + 32], xmm6 + movdqa [SP + 0], xmm4 + movdqa [SP + 16], xmm5 + movdqa [SP + 32], xmm6 // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 @@ -606,9 +606,9 @@ INTFUNC(mmla4) propout [edi + 12], xmm7, xmm4 // Add add on the carry we calculated earlier. - paddq xmm4, [esp + 0] - paddq xmm5, [esp + 16] - paddq xmm6, [esp + 32] + paddq xmm4, [SP + 0] + paddq xmm5, [SP + 16] + paddq xmm6, [SP + 32] // And, with that, we're done. 
stfree 48 + 12 @@ -688,40 +688,40 @@ FUNC(mpx_umul4_x86_sse2) // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl, // const mpw *bv, const mpw *bvl); - // Build a stack frame. Arguments will be relative to EBP, as + // Build a stack frame. Arguments will be relative to BP, as // follows. // - // ebp + 20 dv - // ebp + 24 av - // ebp + 28 avl - // ebp + 32 bv - // ebp + 36 bvl + // BP + 20 dv + // BP + 24 av + // BP + 28 avl + // BP + 32 bv + // BP + 36 bvl // - // Locals are relative to ESP, as follows. + // Locals are relative to SP, as follows. // - // esp + 0 expanded Y (32 bytes) - // esp + 32 (top of locals) - pushreg ebp + // SP + 0 expanded Y (32 bytes) + // SP + 32 (top of locals) + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 32 + and SP, ~15 + sub SP, 32 endprologue // Prepare for the first iteration. - mov esi, [ebp + 32] // -> bv[0] + mov esi, [BP + 32] // -> bv[0] pxor xmm7, xmm7 movdqu xmm0, [esi] // bv[0] - mov edi, [ebp + 20] // -> dv[0] + mov edi, [BP + 20] // -> dv[0] mov ecx, edi // outer loop dv cursor expand xmm7, xmm0, xmm1 - mov ebx, [ebp + 24] // -> av[0] - mov eax, [ebp + 28] // -> av[m] = av limit - mov edx, esp // -> expanded Y = bv[0] - movdqa [esp + 0], xmm0 // bv[0] expanded low - movdqa [esp + 16], xmm1 // bv[0] expanded high + mov ebx, [BP + 24] // -> av[0] + mov eax, [BP + 28] // -> av[m] = av limit + mov edx, SP // -> expanded Y = bv[0] + movdqa [SP + 0], xmm0 // bv[0] expanded low + movdqa [SP + 16], xmm1 // bv[0] expanded high call mul4zc add ebx, 16 add edi, 16 @@ -740,7 +740,7 @@ FUNC(mpx_umul4_x86_sse2) // Write out the leftover carry. There can be no tail here. 8: call carryprop - cmp esi, [ebp + 36] // more passes to do? + cmp esi, [BP + 36] // more passes to do? jae 9f .p2align 4 @@ -749,9 +749,9 @@ FUNC(mpx_umul4_x86_sse2) mov edi, ecx // -> dv[i] pxor xmm7, xmm7 expand xmm7, xmm0, xmm1 - mov ebx, [ebp + 24] // -> av[0] - movdqa [esp + 0], xmm0 // bv[i] expanded low - movdqa [esp + 16], xmm1 // bv[i] expanded high + mov ebx, [BP + 24] // -> av[0] + movdqa [SP + 0], xmm0 // bv[i] expanded low + movdqa [SP + 16], xmm1 // bv[i] expanded high call mla4zc add edi, 16 add ebx, 16 @@ -771,7 +771,7 @@ FUNC(mpx_umul4_x86_sse2) // Finish off this pass. There was no tail on the previous pass, and // there can be none on this pass. 8: call carryprop - cmp esi, [ebp + 36] + cmp esi, [BP + 36] jb 1b // All over. @@ -779,7 +779,7 @@ FUNC(mpx_umul4_x86_sse2) pop edi pop esi pop ebx - pop ebp + pop BP ret ENDFUNC @@ -796,69 +796,69 @@ FUNC(mpxmont_mul4_x86_sse2) // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); - // Build a stack frame. Arguments will be relative to EBP, as + // Build a stack frame. Arguments will be relative to BP, as // follows. // - // ebp + 20 dv - // ebp + 24 av - // ebp + 28 bv - // ebp + 32 nv - // ebp + 36 n (nonzero multiple of 4) - // ebp + 40 mi + // BP + 20 dv + // BP + 24 av + // BP + 28 bv + // BP + 32 nv + // BP + 36 n (nonzero multiple of 4) + // BP + 40 mi // - // Locals are relative to ESP, which 16-byte aligned, as follows. + // Locals are relative to SP, which 16-byte aligned, as follows. 
// - // esp + 0 expanded V (32 bytes) - // esp + 32 expanded M (32 bytes) - // esp + 64 expanded Y (32 bytes) - // esp + 96 outer loop dv - // esp + 100 outer loop bv - // esp + 104 av limit (mostly in ESI) - // esp + 108 bv limit - // esp + 112 (top of locals) - pushreg ebp + // SP + 0 expanded V (32 bytes) + // SP + 32 expanded M (32 bytes) + // SP + 64 expanded Y (32 bytes) + // SP + 96 outer loop dv + // SP + 100 outer loop bv + // SP + 104 av limit (mostly in ESI) + // SP + 108 bv limit + // SP + 112 (top of locals) + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 112 + and SP, ~15 + sub SP, 112 endprologue // Establish the expanded operands. pxor xmm7, xmm7 - mov ecx, [ebp + 28] // -> bv - mov edx, [ebp + 40] // -> mi + mov ecx, [BP + 28] // -> bv + mov edx, [BP + 40] // -> mi movdqu xmm0, [ecx] // bv[0] movdqu xmm2, [edx] // mi expand xmm7, xmm0, xmm1, xmm2, xmm3 - movdqa [esp + 0], xmm0 // bv[0] expanded low - movdqa [esp + 16], xmm1 // bv[0] expanded high - movdqa [esp + 32], xmm2 // mi expanded low - movdqa [esp + 48], xmm3 // mi expanded high + movdqa [SP + 0], xmm0 // bv[0] expanded low + movdqa [SP + 16], xmm1 // bv[0] expanded high + movdqa [SP + 32], xmm2 // mi expanded low + movdqa [SP + 48], xmm3 // mi expanded high // Set up the outer loop state and prepare for the first iteration. - mov edx, [ebp + 36] // n - mov eax, [ebp + 24] // -> U = av[0] - mov ebx, [ebp + 32] // -> X = nv[0] - mov edi, [ebp + 20] // -> Z = dv[0] - mov [esp + 100], ecx + mov edx, [BP + 36] // n + mov eax, [BP + 24] // -> U = av[0] + mov ebx, [BP + 32] // -> X = nv[0] + mov edi, [BP + 20] // -> Z = dv[0] + mov [SP + 100], ecx lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit lea edx, [eax + 4*edx] // -> av[n/4] = av limit - mov [esp + 96], edi - mov [esp + 104], edx - mov [esp + 108], ecx - lea ecx, [esp + 0] // -> expanded V = bv[0] - lea esi, [esp + 32] // -> expanded M = mi - lea edx, [esp + 64] // -> space for Y + mov [SP + 96], edi + mov [SP + 104], edx + mov [SP + 108], ecx + lea ecx, [SP + 0] // -> expanded V = bv[0] + lea esi, [SP + 32] // -> expanded M = mi + lea edx, [SP + 64] // -> space for Y call mmul4 - mov esi, [esp + 104] // recover av limit + mov esi, [SP + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 cmp eax, esi // done already? jae 8f - mov [esp + 96], edi + mov [SP + 96], edi .p2align 4 // Complete the first inner loop. @@ -877,26 +877,26 @@ FUNC(mpxmont_mul4_x86_sse2) // Embark on the next iteration. (There must be one. If n = 1, then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) -1: mov eax, [esp + 100] // -> bv[i - 1] - mov edi, [esp + 96] // -> Z = dv[i] +1: mov eax, [SP + 100] // -> bv[i - 1] + mov edi, [SP + 96] // -> Z = dv[i] add eax, 16 // -> bv[i] pxor xmm7, xmm7 - mov [esp + 100], eax - cmp eax, [esp + 108] // done yet? + mov [SP + 100], eax + cmp eax, [SP + 108] // done yet? 
jae 9f movdqu xmm0, [eax] // bv[i] - mov ebx, [ebp + 32] // -> X = nv[0] - lea esi, [esp + 32] // -> expanded M = mi - mov eax, [ebp + 24] // -> U = av[0] + mov ebx, [BP + 32] // -> X = nv[0] + lea esi, [SP + 32] // -> expanded M = mi + mov eax, [BP + 24] // -> U = av[0] expand xmm7, xmm0, xmm1 - movdqa [esp + 0], xmm0 // bv[i] expanded low - movdqa [esp + 16], xmm1 // bv[i] expanded high + movdqa [SP + 0], xmm0 // bv[i] expanded low + movdqa [SP + 16], xmm1 // bv[i] expanded high call mmla4 - mov esi, [esp + 104] // recover av limit + mov esi, [SP + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 - mov [esp + 96], edi + mov [SP + 96], edi .p2align 4 // Complete the next inner loop. @@ -928,7 +928,7 @@ FUNC(mpxmont_mul4_x86_sse2) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret ENDFUNC @@ -945,55 +945,55 @@ FUNC(mpxmont_redc4_x86_sse2) // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); - // Build a stack frame. Arguments will be relative to EBP, as + // Build a stack frame. Arguments will be relative to BP, as // follows. // - // ebp + 20 dv - // ebp + 24 dvl - // ebp + 28 nv - // ebp + 32 n (nonzero multiple of 4) - // ebp + 36 mi + // BP + 20 dv + // BP + 24 dvl + // BP + 28 nv + // BP + 32 n (nonzero multiple of 4) + // BP + 36 mi // - // Locals are relative to ESP, as follows. + // Locals are relative to SP, as follows. // - // esp + 0 outer loop dv - // esp + 4 outer dv limit - // esp + 8 blocks-of-4 dv limit - // esp + 12 expanded M (32 bytes) - // esp + 44 expanded Y (32 bytes) - // esp + 76 (top of locals) - pushreg ebp + // SP + 0 outer loop dv + // SP + 4 outer dv limit + // SP + 8 blocks-of-4 dv limit + // SP + 12 expanded M (32 bytes) + // SP + 44 expanded Y (32 bytes) + // SP + 76 (top of locals) + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 76 + and SP, ~15 + sub SP, 76 endprologue // Establish the expanded operands and the blocks-of-4 dv limit. - mov edi, [ebp + 20] // -> Z = dv[0] + mov edi, [BP + 20] // -> Z = dv[0] pxor xmm7, xmm7 - mov eax, [ebp + 24] // -> dv[n] = dv limit + mov eax, [BP + 24] // -> dv[n] = dv limit sub eax, edi // length of dv in bytes - mov edx, [ebp + 36] // -> mi + mov edx, [BP + 36] // -> mi movdqu xmm0, [edx] // mi and eax, ~15 // mask off the tail end expand xmm7, xmm0, xmm1 add eax, edi // find limit - movdqa [esp + 12], xmm0 // mi expanded low - movdqa [esp + 28], xmm1 // mi expanded high - mov [esp + 8], eax + movdqa [SP + 12], xmm0 // mi expanded low + movdqa [SP + 28], xmm1 // mi expanded high + mov [SP + 8], eax // Set up the outer loop state and prepare for the first iteration. - mov ecx, [ebp + 32] // n - mov ebx, [ebp + 28] // -> X = nv[0] + mov ecx, [BP + 32] // n + mov ebx, [BP + 28] // -> X = nv[0] lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit - mov [esp + 0], edi - mov [esp + 4], edx - lea esi, [esp + 12] // -> expanded M = mi - lea edx, [esp + 44] // -> space for Y + mov [SP + 0], edi + mov [SP + 4], edx + lea esi, [SP + 12] // -> expanded M = mi + lea edx, [SP + 44] // -> space for Y call mont4 add ebx, 16 add edi, 16 @@ -1010,8 +1010,8 @@ FUNC(mpxmont_redc4_x86_sse2) // Still have carries left to propagate. 
8: carryadd - mov esi, [esp + 8] // -> dv blocks limit - mov edx, [ebp + 24] // dv limit + mov esi, [SP + 8] // -> dv blocks limit + mov edx, [BP + 24] // dv limit psllq xmm7, 16 pslldq xmm7, 8 paddq xmm6, xmm7 @@ -1044,14 +1044,14 @@ FUNC(mpxmont_redc4_x86_sse2) // All done for this iteration. Start the next. (This must have at // least one follow-on iteration, or we'd not have started this outer // loop.) -8: mov edi, [esp + 0] // -> dv[i - 1] - mov ebx, [ebp + 28] // -> X = nv[0] - lea edx, [esp + 44] // -> space for Y - lea esi, [esp + 12] // -> expanded M = mi +8: mov edi, [SP + 0] // -> dv[i - 1] + mov ebx, [BP + 28] // -> X = nv[0] + lea edx, [SP + 44] // -> space for Y + lea esi, [SP + 12] // -> expanded M = mi add edi, 16 // -> Z = dv[i] - cmp edi, [esp + 4] // all done yet? + cmp edi, [SP + 4] // all done yet? jae 9f - mov [esp + 0], edi + mov [SP + 0], edi call mont4 add edi, 16 add ebx, 16 @@ -1062,7 +1062,7 @@ FUNC(mpxmont_redc4_x86_sse2) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret ENDFUNC @@ -1091,22 +1091,22 @@ ENDFUNC .endm .macro testprologue n - pushreg ebp + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 3*32 + 4*4 + and SP, ~15 + sub SP, 3*32 + 4*4 endprologue mov eax, \n - mov [esp + 104], eax + mov [SP + 104], eax // vars: - // esp + 0 = v expanded - // esp + 32 = y expanded - // esp + 64 = ? expanded - // esp + 96 = cycles - // esp + 104 = count + // SP + 0 = v expanded + // SP + 32 = y expanded + // SP + 64 = ? expanded + // SP + 96 = cycles + // SP + 104 = count .endm .macro testepilogue @@ -1114,7 +1114,7 @@ ENDFUNC popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret .endm @@ -1131,15 +1131,15 @@ ENDFUNC mov ecx, \v movdqu xmm0, [ecx] expand xmm7, xmm0, xmm1 - movdqa [esp + 0], xmm0 - movdqa [esp + 16], xmm1 + movdqa [SP + 0], xmm0 + movdqa [SP + 16], xmm1 .endif .ifnes "\y", "nil" mov edx, \y movdqu xmm2, [edx] expand xmm7, xmm2, xmm3 - movdqa [esp + 32], xmm2 - movdqa [esp + 48], xmm3 + movdqa [SP + 32], xmm2 + movdqa [SP + 48], xmm3 .endif .endm @@ -1147,25 +1147,25 @@ ENDFUNC .p2align 4 0: .ifnes "\u", "nil" - lea ecx, [esp + 0] + lea ecx, [SP + 0] .endif mov ebx, \x .ifeqs "\mode", "mont" - lea esi, [esp + 32] + lea esi, [SP + 32] .endif - cysetup esp + 96 + cysetup SP + 96 .ifnes "\u", "nil" mov eax, \u .endif .ifeqs "\mode", "mont" - lea edx, [esp + 64] + lea edx, [SP + 64] .else - lea edx, [esp + 32] + lea edx, [SP + 32] .endif .endm .macro testtail cyv - cystore esp + 96, \cyv, esp + 104 + cystore SP + 96, \cyv, SP + 104 jnz 0b .endm @@ -1177,122 +1177,122 @@ ENDFUNC .endm FUNC(test_dmul4) - testprologue [ebp + 44] - testldcarry [ebp + 24] - testexpand [ebp + 36], [ebp + 40] - mov edi, [ebp + 20] - testtop [ebp + 28], [ebp + 32] + testprologue [BP + 44] + testldcarry [BP + 24] + testexpand [BP + 36], [BP + 40] + mov edi, [BP + 20] + testtop [BP + 28], [BP + 32] call dmul4 - testtail [ebp + 48] - testcarryout [ebp + 24] + testtail [BP + 48] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_dmla4) - testprologue [ebp + 44] - testldcarry [ebp + 24] - testexpand [ebp + 36], [ebp + 40] - mov edi, [ebp + 20] - testtop [ebp + 28], [ebp + 32] + testprologue [BP + 44] + testldcarry [BP + 24] + testexpand [BP + 36], [BP + 40] + mov edi, [BP + 20] + testtop [BP + 28], [BP + 32] call dmla4 - testtail [ebp + 48] - testcarryout [ebp + 24] + testtail [BP + 48] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mul4) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov 
edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mul4 - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mul4zc) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mul4zc - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mla4) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mla4 - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mla4zc) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mla4zc - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mmul4) - testprologue [ebp + 48] - testexpand [ebp + 40], [ebp + 44] - mov edi, [ebp + 20] - testtop [ebp + 32], [ebp + 36], mont + testprologue [BP + 48] + testexpand [BP + 40], [BP + 44] + mov edi, [BP + 20] + testtop [BP + 32], [BP + 36], mont call mmul4 - testtail [ebp + 52] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 64] - movdqa xmm1, [esp + 80] + testtail [BP + 52] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mmla4) - testprologue [ebp + 48] - testexpand [ebp + 40], [ebp + 44] - mov edi, [ebp + 20] - testtop [ebp + 32], [ebp + 36], mont + testprologue [BP + 48] + testexpand [BP + 40], [BP + 44] + mov edi, [BP + 20] + testtop [BP + 32], [BP + 36], mont call mmla4 - testtail [ebp + 52] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 64] - movdqa xmm1, [esp + 80] + testtail [BP + 52] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mont4) - testprologue [ebp + 40] - testexpand nil, [ebp + 36] - mov edi, [ebp + 20] - testtop nil, [ebp + 32], mont + testprologue [BP + 40] + testexpand nil, [BP + 36] + mov edi, [BP + 20] + testtop nil, [BP + 32], mont call mont4 - testtail [ebp + 44] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 64] - movdqa xmm1, [esp + 80] + testtail [BP + 44] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC diff --git a/rand/rand-x86ish.S b/rand/rand-x86ish.S index 829bc2cd..61de2b84 100644 --- a/rand/rand-x86ish.S +++ b/rand/rand-x86ish.S @@ -42,7 +42,7 @@ FUNC(rand_quick_x86ish_rdrand) // Return zero on success, or -1 on error. 
#if CPUFAM_X86 - mov edx, [esp + 4] + mov edx, [SP + 4] stalloc 28 # define COUNT ecx #endif @@ -58,7 +58,7 @@ FUNC(rand_quick_x86ish_rdrand) // Try to fetch a random number. mov COUNT, 16 -0: rdrand R_a(r) +0: rdrand AX jc 1f dec COUNT jnz 0b @@ -70,22 +70,22 @@ FUNC(rand_quick_x86ish_rdrand) // Success. 1: #if CPUFAM_X86 - mov [esp + 16], eax - lea ecx, [esp + 16] - mov dword ptr [esp + 12], 32 - mov dword ptr [esp + 8], 4 - mov [esp + 4], ecx - mov [esp + 0], edx + mov [SP + 16], AX + lea ecx, [SP + 16] + mov dword ptr [SP + 12], 32 + mov dword ptr [SP + 8], 4 + mov [SP + 4], ecx + mov [SP + 0], edx #endif #if CPUFAM_AMD64 && ABI_SYSV - mov [rsp + 0], rax - mov rsi, rsp + mov [SP + 0], AX + mov rsi, SP mov edx, 8 mov ecx, 64 #endif #if CPUFAM_AMD64 && ABI_WIN - mov [rsp + 32], rax - lea rdx, [rsp + 32] + mov [SP + 32], AX + lea rdx, [SP + 32] mov r8d, 8 mov r9d, 64 #endif diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S index 3fb623af..33af65f0 100644 --- a/symm/chacha-x86ish-sse2.S +++ b/symm/chacha-x86ish-sse2.S @@ -66,15 +66,15 @@ FUNC(chacha_core_x86ish_sse2) # define SAVE0 xmm5 # define SAVE1 xmm6 # define SAVE2 xmm7 -# define SAVE3 [esp] +# define SAVE3 [SP] - pushreg ebp + pushreg BP setfp - sub esp, 16 - mov IN, [ebp + 12] - mov OUT, [ebp + 16] - and esp, ~15 - mov NR, [ebp + 8] + sub SP, 16 + mov IN, [BP + 12] + mov OUT, [BP + 16] + and SP, ~15 + mov NR, [BP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -105,9 +105,9 @@ FUNC(chacha_core_x86ish_sse2) # define IN rdx # define OUT r8 # define SAVE0 xmm5 -# define SAVE1 [rsp + 0] -# define SAVE2 [rsp + 16] -# define SAVE3 [rsp + 32] +# define SAVE1 [SP + 0] +# define SAVE2 [SP + 16] +# define SAVE3 [SP + 32] stalloc 48 + 8 #endif @@ -248,7 +248,7 @@ FUNC(chacha_core_x86ish_sse2) // Tidy things up. #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN stfree 48 + 8 diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S index e60b7cab..092242bc 100644 --- a/symm/gcm-x86ish-pclmul.S +++ b/symm/gcm-x86ish-pclmul.S @@ -576,7 +576,7 @@ // xmm3 = // v_0 = (v_01; v_00) movdqa xmm4, xmm0 // u_1 again #if CPUFAM_X86 - movdqa [esp + 0], xmm3 + movdqa [SP + 0], xmm3 #elif CPUFAM_AMD64 movdqa xmm8, xmm3 # define V0 xmm8 @@ -608,7 +608,7 @@ pclmullqlqdq xmm4, xmm2 // u_11 v_11 pclmulhqhqdq xmm7, xmm2 // u_10 v_10 #if CPUFAM_X86 - movdqa xmm2, [esp + 0] + movdqa xmm2, [SP + 0] # define V0 xmm2 #endif pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10 @@ -771,8 +771,8 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif endprologue movdqu xmm0, [A] @@ -790,8 +790,8 @@ SSEFUNC(gcm_mulk_128l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif endprologue @@ -811,8 +811,8 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif endprologue movq xmm0, [A] @@ -830,8 +830,8 @@ SSEFUNC(gcm_mulk_64l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif endprologue @@ -852,8 +852,8 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul) // with the product A K. 
#if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif endprologue movq xmm0, [A + 0] @@ -876,8 +876,8 @@ SSEFUNC(gcm_mulk_96l_x86ish_pclmul) // updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif endprologue @@ -901,8 +901,8 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif #if CPUFAM_AMD64 && ABI_WIN stalloc 2*16 + 8 @@ -935,8 +935,8 @@ SSEFUNC(gcm_mulk_192l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif #if CPUFAM_AMD64 && ABI_WIN @@ -970,12 +970,12 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - pushreg ebp + pushreg BP setfp - mov A, [esp + 8] - mov K, [esp + 12] - and esp, ~15 - sub esp, 16 + mov A, [SP + 8] + mov K, [SP + 12] + and SP, ~15 + sub SP, 16 #endif #if CPUFAM_AMD64 && ABI_WIN stalloc 3*16 + 8 @@ -997,7 +997,7 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) movdqu [A + 0], xmm1 #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0 @@ -1014,13 +1014,13 @@ SSEFUNC(gcm_mulk_256l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - pushreg ebp + pushreg BP setfp - mov A, [esp + 8] - mov K, [esp + 12] - and esp, ~15 + mov A, [SP + 8] + mov K, [SP + 12] + and SP, ~15 ldgot ecx - sub esp, 16 + sub SP, 16 #endif #if CPUFAM_AMD64 && ABI_WIN stalloc 3*16 + 8 @@ -1044,7 +1044,7 @@ SSEFUNC(gcm_mulk_256l_x86ish_pclmul) movdqu [A + 0], xmm1 #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0 diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index 6d9b3b22..f5e5cc9c 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -70,15 +70,12 @@ ENDFUNC FUNC(rijndael_setup_x86ish_aesni) -#define SI WHOLE(si) -#define DI WHOLE(di) - #if CPUFAM_X86 // Arguments are on the stack. We'll need to stack the caller's // register veriables, but we'll manage. -# define CTX ebp // context pointer -# define BLKSZ [esp + 24] // block size +# define CTX BP // context pointer +# define BLKSZ [SP + 24] // block size # define KSZ ebx // key size # define NKW edx // total number of key words @@ -92,15 +89,15 @@ FUNC(rijndael_setup_x86ish_aesni) # define BLKOFF edx // block size in bytes // Stack the caller's registers. - pushreg ebp + pushreg BP pushreg ebx pushreg esi pushreg edi // Set up our own variables. - mov CTX, [esp + 20] // context base pointer - mov SI, [esp + 28] // key material - mov KSZ, [esp + 32] // key size, in words + mov CTX, [SP + 20] // context base pointer + mov SI, [SP + 28] // key material + mov KSZ, [SP + 32] // key size, in words #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -330,7 +327,7 @@ FUNC(rijndael_setup_x86ish_aesni) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN popreg rdi @@ -389,8 +386,8 @@ ENDFUNC # define DST edx # define NR ecx - mov K, [esp + 4] - mov SRC, [esp + 8] + mov K, [SP + 4] + mov SRC, [SP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -428,7 +425,7 @@ ENDFUNC add K, 16 pxor xmm0, xmm1 #if CPUFAM_X86 - mov DST, [esp + 12] + mov DST, [SP + 12] #endif // Dispatch to the correct code. 
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S index 5dc9c17c..eb346afe 100644 --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@ -65,16 +65,16 @@ FUNC(salsa20_core_x86ish_sse2) # define OUT edx # define SAVE0 xmm6 # define SAVE1 xmm7 -# define SAVE2 [esp + 0] -# define SAVE3 [esp + 16] +# define SAVE2 [SP + 0] +# define SAVE3 [SP + 16] - pushreg ebp + pushreg BP setfp - sub esp, 32 - mov IN, [ebp + 12] - mov OUT, [ebp + 16] - and esp, ~15 - mov NR, [ebp + 8] + sub SP, 32 + mov IN, [BP + 12] + mov OUT, [BP + 16] + and SP, ~15 + mov NR, [BP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -107,8 +107,8 @@ FUNC(salsa20_core_x86ish_sse2) # define OUT r8 # define SAVE0 xmm6 # define SAVE1 xmm7 -# define SAVE2 [rsp + 32] -# define SAVE3 [rsp + 48] +# define SAVE2 [SP + 32] +# define SAVE3 [SP + 48] stalloc 64 + 8 savexmm xmm6, 0 @@ -301,7 +301,7 @@ FUNC(salsa20_core_x86ish_sse2) // Tidy things up. #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0
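
The hunks above and below all make the same mechanical substitution, so a
rough sketch of what the new names expand to may help when reading them.
This sketch is not part of the commit: the `R_a'/`R_sp' bodies shown are
simplified stand-ins for the real `_REGFORM'-based definitions in
base/asm-common.h, and only two of the registers are shown.

    // Illustrative sketch only -- not part of this patch.  The real
    // definitions in base/asm-common.h go through _REGFORM() and cover
    // every general-purpose register; these stand-ins just show the
    // idea: one spelling that names the full-width register on either
    // CPU family.
    #if CPUFAM_AMD64
    #  define R_a(fmt) rax          // assumed simplification
    #  define R_sp(fmt) rsp
    #elif CPUFAM_X86
    #  define R_a(fmt) eax
    #  define R_sp(fmt) esp
    #endif

    #define AX R_a(r)               // as added by this commit
    #define SP R_sp(r)

        // so that a line of assembler such as
        mov     AX, [SP + 4]
        // assembles as `mov rax, [rsp + 4]' on AMD64 and as
        // `mov eax, [esp + 4]' on 32-bit x86.

The gain is purely notational: `mov AX, [SP + 4]' reads the same in 32- and
64-bit sources, where previously one would write
`mov R_a(r), [R_sp(r) + 4]'.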