base/asm-common.h (x86), and knock-on: Add macros for full-size regs.

author Mark Wooding <mdw@distorted.org.uk>

Tue, 29 Oct 2019 18:55:16 +0000 (18:55 +0000)

committer Mark Wooding <mdw@distorted.org.uk>

Sat, 9 May 2020 19:57:33 +0000 (20:57 +0100)
author Mark Wooding <mdw@distorted.org.uk>
Tue, 29 Oct 2019 18:55:16 +0000 (18:55 +0000)
committer Mark Wooding <mdw@distorted.org.uk>
Sat, 9 May 2020 19:57:33 +0000 (20:57 +0100)
diff --git a/base/asm-common.h b/base/asm-common.h

index 44c223d..d162a5d 100644 (file)
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -452,10 +452,20 @@ name:
  #endif
  #define WHOLE(reg) _REGFORM(reg, r)
  
+// Macros for some common registers.
+#define AX R_a(r)
+#define BX R_b(r)
+#define CX R_c(r)
+#define DX R_d(r)
+#define SI R_si(r)
+#define DI R_di(r)
+#define BP R_bp(r)
+#define SP R_sp(r)
+
  // Stack management and unwinding.
-.macro setfp   fp=R_bp(r), offset=0
+.macro setfp   fp=BP, offset=0
    .if \offset == 0
-       mov     \fp, R_sp(r)
+       mov     \fp, SP
  #if __ELF__
           .cfi_def_cfa_register \fp
  #endif
@@ -463,7 +473,7 @@ name:
           .seh_setframe \fp, 0
  #endif
    .else
-       lea     \fp, [R_sp(r) + \offset]
+       lea     \fp, [SP + \offset]
  #if __ELF__
           .cfi_def_cfa_register \fp
           .cfi_adjust_cfa_offset -\offset
@@ -478,14 +488,14 @@ name:
  
  .macro _dropfp fp, offset=0
    .if \offset == 0
-       mov     R_sp(r), \fp
+       mov     SP, \fp
  #if __ELF__
-         .cfi_def_cfa_register R_sp(r)
+         .cfi_def_cfa_register SP
  #endif
    .else
-       lea     R_sp(r), [\fp - \offset]
+       lea     SP, [\fp - \offset]
  #if __ELF__
-         .cfi_def_cfa_register R_sp(r)
+         .cfi_def_cfa_register SP
           .cfi_adjust_cfa_offset +\offset
  #endif
    .endif
@@ -494,7 +504,7 @@ name:
  .endm
  
  .macro stalloc n
-       sub     R_sp(r), \n
+       sub     SP, \n
  #if __ELF__
           .cfi_adjust_cfa_offset +\n
  #endif
@@ -504,7 +514,7 @@ name:
  .endm
  
  .macro stfree  n
-       add     R_sp(r), \n
+       add     SP, \n
  #if __ELF__
           .cfi_adjust_cfa_offset -\n
  #endif
@@ -530,14 +540,14 @@ name:
  .endm
  
  .macro savexmm r, offset
-       movdqa  [R_sp(r) + \offset], \r
+       movdqa  [SP + \offset], \r
  #if ABI_WIN && CPUFAM_AMD64
           .seh_savexmm \r, \offset
  #endif
  .endm
  
  .macro rstrxmm r, offset
-       movdqa  \r, [R_sp(r) + \offset]
+       movdqa  \r, [SP + \offset]
  .endm
  
  .macro endprologue
diff --git a/base/dispatch-x86ish.S b/base/dispatch-x86ish.S

index 8c6a2a8..c3725fc 100644 (file)
--- a/base/dispatch-x86ish.S
+++ b/base/dispatch-x86ish.S
@@ -46,9 +46,9 @@ FUNC(dispatch_x86ish_cpuid)
  #if CPUFAM_X86
         pushreg ebx
         pushreg edi
-       mov     edi, [esp + 12]
-       mov     eax, [esp + 16]
-       mov     ecx, [esp + 20]
+       mov     edi, [SP + 12]
+       mov     eax, [SP + 16]
+       mov     ecx, [SP + 20]
  #  define OUT edi
  #endif
  #if CPUFAM_AMD64 && ABI_SYSV
@@ -69,21 +69,21 @@ FUNC(dispatch_x86ish_cpuid)
         // First, check that this is even a thing, using the complicated
         // dance with the flags register.
         pushf
-       pop     R_d(r)                  // current flags in d
+       pop     DX                      // current flags in d
  
-       or      R_d(r), EFLAGS_ID       // force the id bit on and check it
-       push    R_d(r)
+       or      DX, EFLAGS_ID           // force the id bit on and check it
+       push    DX
         popf
         pushf
-       pop     R_d(r)
+       pop     DX
         test    edx, EFLAGS_ID
         jz      8f
  
-       and     R_d(r), ~EFLAGS_ID      // force the id bit off and check it
-       push    R_d(r)
+       and     DX, ~EFLAGS_ID          // force the id bit off and check it
+       push    DX
         popf
         pushf
-       pop     R_d(r)
+       pop     DX
         test    edx, EFLAGS_ID
         jnz     8f
  
@@ -124,32 +124,32 @@ FUNC(dispatch_x86ish_xmmregisters_p)
         // Enter with no arguments.  Return nonzero if the XMM registers are
         // usable.
  
-       pushreg R_bp(r)
+       pushreg BP
         setfp
         stalloc 512
-       and     R_sp(r), ~15
+       and     SP, ~15
    endprologue
  
         // Save the floating point and SIMD registers, and try to clobber
         // xmm0.
-       fxsave  [R_sp(r)]
-       mov     eax, [R_sp(r) + 160]
-       xor     dword ptr [R_sp(r) + 160], 0xaaaa5555
-       fxrstor [R_sp(r)]
+       fxsave  [SP]
+       mov     eax, [SP + 160]
+       xor     dword ptr [SP + 160], 0xaaaa5555
+       fxrstor [SP]
  
         // Save them again, and read back the low word of xmm0.  Undo the
         // clobbering and restore.
-       fxsave  [R_sp(r)]
-       mov     ecx, [R_sp(r) + 160]
-       mov     [R_sp(r) + 160], eax
-       fxrstor [R_sp(r)]
+       fxsave  [SP]
+       mov     ecx, [SP + 160]
+       mov     [SP + 160], eax
+       fxrstor [SP]
  
         // The register are live if we read different things.
         xor     eax, ecx
  
         // Done.
         dropfp
-       popreg  R_bp(r)
+       popreg  BP
         ret
  ENDFUNC
  
@@ -164,7 +164,7 @@ FUNC(dispatch_x86ish_rdrand)
  #if CPUFAM_X86
  #  define X_OUT edx
  #  define COUNT ecx
-       mov     X_OUT, [esp + 4]
+       mov     X_OUT, [SP + 4]
  #endif
  #if CPUFAM_AMD64 && ABI_SYSV
  #  define X_OUT rdi
diff --git a/base/regdump-x86ish.S b/base/regdump-x86ish.S

index e4dd8e8..67a4ae0 100644 (file)
--- a/base/regdump-x86ish.S
+++ b/base/regdump-x86ish.S
@@ -56,48 +56,48 @@ FUNC(regdump_gpsave)
         cld
  
         // Save r/ebp and establish it pointing to the save area.
-       mov     [R_sp(r) + WORDSZ + REGIX_BP*WORDSZ], R_bp(r)
-       lea     R_bp(r), [R_sp(r) + WORDSZ]
+       mov     [SP + WORDSZ + REGIX_BP*WORDSZ], BP
+       lea     BP, [SP + WORDSZ]
  
         // Save the other easy general-purpose registers.
  #if !CPUFAM_X86
-       mov     [R_bp(r) + REGIX_BX*WORDSZ], R_b(r)
+       mov     [BP + REGIX_BX*WORDSZ], BX
  #endif
-       mov     [R_bp(r) + REGIX_CX*WORDSZ], R_c(r)
-       mov     [R_bp(r) + REGIX_DX*WORDSZ], R_d(r)
-       mov     [R_bp(r) + REGIX_SI*WORDSZ], R_si(r)
-       mov     [R_bp(r) + REGIX_DI*WORDSZ], R_di(r)
+       mov     [BP + REGIX_CX*WORDSZ], CX
+       mov     [BP + REGIX_DX*WORDSZ], DX
+       mov     [BP + REGIX_SI*WORDSZ], SI
+       mov     [BP + REGIX_DI*WORDSZ], DI
  #if CPUFAM_AMD64
-       mov     [R_bp(r) + REGIX_R8*WORDSZ], R_r8(r)
-       mov     [R_bp(r) + REGIX_R9*WORDSZ], R_r9(r)
-       mov     [R_bp(r) + REGIX_R10*WORDSZ], R_r10(r)
-       mov     [R_bp(r) + REGIX_R11*WORDSZ], R_r11(r)
-       mov     [R_bp(r) + REGIX_R12*WORDSZ], R_r12(r)
-       mov     [R_bp(r) + REGIX_R13*WORDSZ], R_r13(r)
-       mov     [R_bp(r) + REGIX_R14*WORDSZ], R_r14(r)
-       mov     [R_bp(r) + REGIX_R15*WORDSZ], R_r15(r)
+       mov     [BP + REGIX_R8*WORDSZ], r8
+       mov     [BP + REGIX_R9*WORDSZ], r9
+       mov     [BP + REGIX_R10*WORDSZ], r10
+       mov     [BP + REGIX_R11*WORDSZ], r11
+       mov     [BP + REGIX_R12*WORDSZ], r12
+       mov     [BP + REGIX_R13*WORDSZ], r13
+       mov     [BP + REGIX_R14*WORDSZ], r14
+       mov     [BP + REGIX_R15*WORDSZ], r15
  #endif
  
         // Determine the previous stack pointer and save it.
  #if CPUFAM_AMD64 && ABI_SYSV
-       lea     R_a(r), [R_bp(r) + 128 + REGDUMP_GPSIZE]
+       lea     AX, [BP + 128 + REGDUMP_GPSIZE]
  #else
-       lea     R_a(r), [R_bp(r) + REGDUMP_GPSIZE]
+       lea     AX, [BP + REGDUMP_GPSIZE]
  #endif
-       mov     [R_bp(r) + REGIX_SP*WORDSZ], R_a(r)
+       mov     [BP + REGIX_SP*WORDSZ], AX
  
         // Collect the return address and save it as r/eip.
-       mov     R_a(r), [R_sp(r)]
-       mov     [R_bp(r) + REGIX_IP*WORDSZ], R_a(r)
+       mov     AX, [SP]
+       mov     [BP + REGIX_IP*WORDSZ], AX
  
         // Save the segment registers.
-       lea     R_a(r), [R_bp(r) + REGIX_GPLIM*WORDSZ]
-       mov     [R_a(r) + 2*REGIX_CS], cs
-       mov     [R_a(r) + 2*REGIX_DS], ds
-       mov     [R_a(r) + 2*REGIX_SS], ss
-       mov     [R_a(r) + 2*REGIX_ES], es
-       mov     [R_a(r) + 2*REGIX_FS], fs
-       mov     [R_a(r) + 2*REGIX_GS], gs
+       lea     AX, [BP + REGIX_GPLIM*WORDSZ]
+       mov     [AX + 2*REGIX_CS], cs
+       mov     [AX + 2*REGIX_DS], ds
+       mov     [AX + 2*REGIX_SS], ss
+       mov     [AX + 2*REGIX_ES], es
+       mov     [AX + 2*REGIX_FS], fs
+       mov     [AX + 2*REGIX_GS], gs
  
         // Determine the extended save area size.  Preserve ebx on 32-bit x86
         // here, because the caller needs it for PLT-indirect calls.
@@ -135,23 +135,23 @@ FUNC(regdump_gprstr)
  
         // We assume nobody actually fiddled with the segment registers.  So
         // just the actual integer registers to do.
-       mov     R_a(r), [R_bp(r) + REGIX_AX*WORDSZ]
-       mov     R_b(r), [R_bp(r) + REGIX_BX*WORDSZ]
-       mov     R_c(r), [R_bp(r) + REGIX_CX*WORDSZ]
-       mov     R_d(r), [R_bp(r) + REGIX_DX*WORDSZ]
-       mov     R_si(r), [R_bp(r) + REGIX_SI*WORDSZ]
-       mov     R_di(r), [R_bp(r) + REGIX_DI*WORDSZ]
+       mov     AX, [BP + REGIX_AX*WORDSZ]
+       mov     BX, [BP + REGIX_BX*WORDSZ]
+       mov     CX, [BP + REGIX_CX*WORDSZ]
+       mov     DX, [BP + REGIX_DX*WORDSZ]
+       mov     SI, [BP + REGIX_SI*WORDSZ]
+       mov     DI, [BP + REGIX_DI*WORDSZ]
  #if CPUFAM_AMD64
-       mov     R_r8(r), [R_bp(r) + REGIX_R8*WORDSZ]
-       mov     R_r9(r), [R_bp(r) + REGIX_R9*WORDSZ]
-       mov     R_r10(r), [R_bp(r) + REGIX_R10*WORDSZ]
-       mov     R_r11(r), [R_bp(r) + REGIX_R11*WORDSZ]
-       mov     R_r12(r), [R_bp(r) + REGIX_R12*WORDSZ]
-       mov     R_r13(r), [R_bp(r) + REGIX_R13*WORDSZ]
-       mov     R_r14(r), [R_bp(r) + REGIX_R14*WORDSZ]
-       mov     R_r15(r), [R_bp(r) + REGIX_R15*WORDSZ]
+       mov     r8, [BP + REGIX_R8*WORDSZ]
+       mov     r9, [BP + REGIX_R9*WORDSZ]
+       mov     r10, [BP + REGIX_R10*WORDSZ]
+       mov     r11, [BP + REGIX_R11*WORDSZ]
+       mov     r12, [BP + REGIX_R12*WORDSZ]
+       mov     r13, [BP + REGIX_R13*WORDSZ]
+       mov     r14, [BP + REGIX_R14*WORDSZ]
+       mov     r15, [BP + REGIX_R15*WORDSZ]
  #endif
-       mov     R_bp(r), [R_bp(r) + REGIX_BP*WORDSZ]
+       mov     BP, [BP + REGIX_BP*WORDSZ]
  
         // Done.
         ret
@@ -175,11 +175,11 @@ FUNC(regdump_xtsave)
         // general registers are clobbered.
  
         // Start by filling in the easy parts of the map.
-       mov     [R_sp(r) + WORDSZ + regmap_gp], R_bp(r)
-       lea     R_bp(r), [R_sp(r) + WORDSZ]
+       mov     [SP + WORDSZ + regmap_gp], BP
+       lea     BP, [SP + WORDSZ]
  
         xor     eax, eax                // clears rax too on amd64
-       mov     [R_bp(r) + regmap_avx], R_a(r)
+       mov     [BP + regmap_avx], AX
  
         // Find out whether we use `xsave'.  (Preserve ebx.)
  #if CPUFAM_X86
@@ -191,40 +191,40 @@ FUNC(regdump_xtsave)
         je      5f
  
         // We have the `xsave' machinery.  Select the base address.
-       lea     R_si(r), [R_sp(r) + WORDSZ + regmap_size + 63]
-       and     R_si(r), ~63
-       mov     [R_bp(r) + regmap_fx], R_si(r)
+       lea     SI, [SP + WORDSZ + regmap_size + 63]
+       and     SI, ~63
+       mov     [BP + regmap_fx], SI
  
         // Clear out the header area.
         xor     eax, eax
-       lea     R_di(r), [R_si(r) + 512]
+       lea     DI, [SI + 512]
         mov     ecx, 16
         rep stosd
  
         // Save the registers.
         mov     eax, 0x00000007
         xor     edx, edx
-       xsave   [R_si(r)]
+       xsave   [SI]
  
         // Establish the AVX pointer, if available.
-       test    dword ptr [R_si(r) + 512], 4 // = xstate_bv
+       test    dword ptr [SI + 512], 4 // = xstate_bv
         je      8f
  
         mov     eax, 13
         mov     ecx, 2
         cpuid
-       add     R_b(r), R_si(r)
-       mov     [R_bp(r) + regmap_avx], R_b(r)
+       add     BX, SI
+       mov     [BP + regmap_avx], BX
  
         jmp     8f
  
         // We have only `fxsave'.  Set the base address.
-5:     lea     R_si(r), [R_sp(r) + WORDSZ + regmap_size + 15]
-       and     R_si(r), ~15
-       mov     [R_bp(r) + regmap_fx], R_si(r)
+5:     lea     SI, [SP + WORDSZ + regmap_size + 15]
+       and     SI, ~15
+       mov     [BP + regmap_fx], SI
  
         // Save the registers.
-       fxsave  [R_si(r)]
+       fxsave  [SI]
  
         // Clear the x87 state; otherwise it can cause trouble later.
  8:     fninit
@@ -245,7 +245,7 @@ FUNC(regdump_xtrstr)
         // 32-bit x86, and the other general registers are clobbered.
  
         // Find the extended register dump.
-       mov     R_si(r), [R_bp(r) + regmap_fx]
+       mov     SI, [BP + regmap_fx]
  
         // Probe to find out whether we have `xsave'.
  #if CPUFAM_X86
@@ -259,14 +259,14 @@ FUNC(regdump_xtrstr)
         // We have the `xsave' machinery.
         mov     eax, 0x00000007
         xor     edx, edx
-       xrstor  [R_si(r)]
+       xrstor  [SI]
         jmp     8f
  
         // We must fake it up.
-1:     fxrstor [R_si(r)]
+1:     fxrstor [SI]
  
         // Done.
-8:     mov     R_bp(r), [R_bp(r) + regmap_gp]
+8:     mov     BP, [BP + regmap_gp]
  #if CPUFAM_X86
         pop     ebx
  #endif
diff --git a/base/regdump.h b/base/regdump.h

index db96864..3ada7ea 100644 (file)
--- a/base/regdump.h
+++ b/base/regdump.h
@@ -381,32 +381,32 @@ DO8(REGDEF_SIMD)
  
         // Stash r/eax.  This is bletcherous: hope we don't get a signal in
         // the next few instructions.
-       mov     [R_sp(r) - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], R_a(r)
+       mov     [SP - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], AX
  
    .ifnes "\addr", "nil"
         // Collect the effective address for the following dump, leaving it
         // in the `addr' slot of the dump.
-       lea     R_a(r), \addr
-       mov     [R_sp(r) - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], R_a(r)
+       lea     AX, \addr
+       mov     [SP - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], AX
    .endif
  
         // Make space for the register save area.  On AMD64 with System/V
         // ABI, also skip the red zone.  Use `lea' here to preserve the
         // flags.
-       lea     R_sp(r), [R_sp(r) - REGDUMP_SPADJ]
+       lea     SP, [SP - REGDUMP_SPADJ]
  
         // Save flags and general-purpose registers.  On 32-bit x86, we save
         // ebx here and establish a GOT pointer here for the benefit of the
         // PLT-indirect calls made later on.
         pushf
  #  if CPUFAM_X86
-       mov     [esp + 4*REGIX_BX], ebx
+       mov     [SP + 4*REGIX_BX], ebx
         ldgot
  #  endif
         callext F(regdump_gpsave)
  
         // Make space for the extended registers.
-       sub     R_sp(r), R_c(r)
+       sub     SP, CX
         callext F(regdump_xtsave)
  
         // Prepare for calling back into C.  On 32-bit x86, leave space for
@@ -414,11 +414,11 @@ DO8(REGDEF_SIMD)
         // the `shadow space' for the called-function's arguments.  Also,
         // forcibly align the stack pointer to a 16-byte boundary.
  #  if CPUFAM_X86
-       sub     esp, 16
+       sub     SP, 16
  #  elif ABI_WIN
-       sub     rsp, 32
+       sub     SP, 32
  #  endif
-       and     R_sp(r), ~15
+       and     SP, ~15
  .endm
  
  .macro _rstrregs
@@ -426,27 +426,27 @@ DO8(REGDEF_SIMD)
  
         // We assume r/ebp still points to the register map.
         callext F(regdump_xtrstr)
-       mov     R_sp(r), R_bp(r)
+       mov     SP, BP
         callext F(regdump_gprstr)
         popf
-       lea     R_sp(r), [R_sp(r) + REGDUMP_SPADJ]
+       lea     SP, [SP + REGDUMP_SPADJ]
  .endm
  
  .macro _regbase
  #  if CPUFAM_X86
-       mov     [esp + 0], ebp
+       mov     [SP + 0], BP
  #  elif ABI_SYSV
-       mov     rdi, rbp
+       mov     rdi, BP
  #  elif ABI_WIN
-       mov     rcx, rbp
+       mov     rcx, BP
  #  endif
  .endm
  
  .macro _membase
-       mov     R_a(r), [R_bp(r) + regmap_gp]
+       mov     AX, [BP + regmap_gp]
  #  if CPUFAM_X86
         mov     eax, [eax + REGIX_ADDR*WORDSZ]
-       mov     [esp + 0], eax
+       mov     [SP + 0], eax
  #  elif ABI_SYSV
         mov     rdi, [rax + REGIX_ADDR*WORDSZ]
  #  elif ABI_WIN
@@ -457,7 +457,7 @@ DO8(REGDEF_SIMD)
  .macro _reglbl msg
    .ifeqs "\msg", ""
  #  if CPUFAM_X86
-       mov     dword ptr [esp + 4], 0
+       mov     dword ptr [SP + 4], 0
  #  elif ABI_SYSV
         xor     esi, esi
  #  elif ABI_WIN
@@ -466,7 +466,7 @@ DO8(REGDEF_SIMD)
    .else
  #  if CPUFAM_X86
         lea     eax, [INTADDR(.L$_reglbl$\@)]
-       mov     [esp + 4], eax
+       mov     [SP + 4], eax
  #  elif ABI_SYSV
         lea     rsi, [INTADDR(.L$_reglbl$\@)]
  #  elif ABI_WIN
@@ -481,7 +481,7 @@ DO8(REGDEF_SIMD)
  
  .macro _regfmt arg
  #  if CPUFAM_X86
-       mov     dword ptr [esp + 8], \arg
+       mov     dword ptr [SP + 8], \arg
  #  elif ABI_SYSV
         mov     edx, \arg
  #  elif ABI_WIN
diff --git a/base/test-regdump-x86ish.S b/base/test-regdump-x86ish.S

index a8c8d43..41ba77f 100644 (file)
--- a/base/test-regdump-x86ish.S
+++ b/base/test-regdump-x86ish.S
@@ -10,9 +10,9 @@ vec:
  
  FUNC(main)
  
-       pushreg R_bp(r)
+       pushreg BP
         setfp
-       and     R_sp(r), ~15
+       and     SP, ~15
    endprologue
  
         fldz
@@ -32,7 +32,7 @@ FUNC(main)
  
         xor     eax, eax
         dropfp
-       popreg  R_bp(r)
+       popreg  BP
         ret
  
  ENDFUNC
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S

index d313765..51e94c5 100644 (file)
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -575,10 +575,10 @@ INTFUNC(mmla4)
         movdqu  xmm4, [rax]
  #if ABI_WIN
         stalloc 48 + 8                  // space for the carries
-#  define STKTMP(i) [rsp + i]
+#  define STKTMP(i) [SP + i]
  #endif
  #if ABI_SYSV
-#  define STKTMP(i) [rsp + i - 48 - 8] // use red zone
+#  define STKTMP(i) [SP + i - 48 - 8]  // use red zone
  #endif
    endprologue
  
@@ -811,7 +811,7 @@ FUNC(mpx_umul4_amd64_sse2)
    endprologue
  
         mov     rdi, DV
-       mov     BVL, [rsp + 224]
+       mov     BVL, [SP + 224]
  
  #endif
  
@@ -978,8 +978,8 @@ FUNC(mpxmont_mul4_amd64_sse2)
    endprologue
  
         mov     rdi, DV
-       mov     N, [rsp + 224]
-       mov     MI, [rsp + 232]
+       mov     N, [SP + 224]
+       mov     MI, [SP + 232]
  
  #endif
  
@@ -1183,7 +1183,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
    endprologue
  
         mov     rdi, DV
-       mov     MI, [rsp + 224]
+       mov     MI, [SP + 224]
  
  #endif
  
@@ -1329,7 +1329,7 @@ ENDFUNC
  #  define ARG8 STKARG(4)
  #  define STKARG_OFFSET 224
  #endif
-#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
+#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
  
  //               sysv                          win
  //               dmul  smul  mmul  mont        dmul  smul  mmul  mont
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S

index 904c0d0..ba7ae6a 100644 (file)
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -561,9 +561,9 @@ INTFUNC(mmla4)
         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
         propout [edi + 12],      xmm7, xmm4
  
-       movdqa  [esp +  0], xmm4
-       movdqa  [esp + 16], xmm5
-       movdqa  [esp + 32], xmm6
+       movdqa  [SP +  0], xmm4
+       movdqa  [SP + 16], xmm5
+       movdqa  [SP + 32], xmm6
  
         // Calculate Y = W M.
         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
@@ -606,9 +606,9 @@ INTFUNC(mmla4)
         propout [edi + 12],      xmm7, xmm4
  
         // Add add on the carry we calculated earlier.
-       paddq   xmm4, [esp +  0]
-       paddq   xmm5, [esp + 16]
-       paddq   xmm6, [esp + 32]
+       paddq   xmm4, [SP +  0]
+       paddq   xmm5, [SP + 16]
+       paddq   xmm6, [SP + 32]
  
         // And, with that, we're done.
         stfree  48 + 12
@@ -688,40 +688,40 @@ FUNC(mpx_umul4_x86_sse2)
         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
         //                         const mpw *bv, const mpw *bvl);
  
-       // Build a stack frame.  Arguments will be relative to EBP, as
+       // Build a stack frame.  Arguments will be relative to BP, as
         // follows.
         //
-       //      ebp + 20        dv
-       //      ebp + 24        av
-       //      ebp + 28        avl
-       //      ebp + 32        bv
-       //      ebp + 36        bvl
+       //      BP + 20 dv
+       //      BP + 24 av
+       //      BP + 28 avl
+       //      BP + 32 bv
+       //      BP + 36 bvl
         //
-       // Locals are relative to ESP, as follows.
+       // Locals are relative to SP, as follows.
         //
-       //      esp +  0        expanded Y (32 bytes)
-       //      esp + 32        (top of locals)
-       pushreg ebp
+       //      SP +  0 expanded Y (32 bytes)
+       //      SP + 32 (top of locals)
+       pushreg BP
         pushreg ebx
         pushreg esi
         pushreg edi
         setfp
-       and     esp, ~15
-       sub     esp, 32
+       and     SP, ~15
+       sub     SP, 32
    endprologue
  
         // Prepare for the first iteration.
-       mov     esi, [ebp + 32]         // -> bv[0]
+       mov     esi, [BP + 32]          // -> bv[0]
         pxor    xmm7, xmm7
         movdqu  xmm0, [esi]             // bv[0]
-       mov     edi, [ebp + 20]         // -> dv[0]
+       mov     edi, [BP + 20]          // -> dv[0]
         mov     ecx, edi                // outer loop dv cursor
         expand  xmm7, xmm0, xmm1
-       mov     ebx, [ebp + 24]         // -> av[0]
-       mov     eax, [ebp + 28]         // -> av[m] = av limit
-       mov     edx, esp                // -> expanded Y = bv[0]
-       movdqa  [esp + 0], xmm0         // bv[0] expanded low
-       movdqa  [esp + 16], xmm1        // bv[0] expanded high
+       mov     ebx, [BP + 24]          // -> av[0]
+       mov     eax, [BP + 28]          // -> av[m] = av limit
+       mov     edx, SP                 // -> expanded Y = bv[0]
+       movdqa  [SP + 0], xmm0          // bv[0] expanded low
+       movdqa  [SP + 16], xmm1         // bv[0] expanded high
         call    mul4zc
         add     ebx, 16
         add     edi, 16
@@ -740,7 +740,7 @@ FUNC(mpx_umul4_x86_sse2)
  
         // Write out the leftover carry.  There can be no tail here.
  8:     call    carryprop
-       cmp     esi, [ebp + 36]         // more passes to do?
+       cmp     esi, [BP + 36]          // more passes to do?
         jae     9f
  
         .p2align 4
@@ -749,9 +749,9 @@ FUNC(mpx_umul4_x86_sse2)
         mov     edi, ecx                // -> dv[i]
         pxor    xmm7, xmm7
         expand  xmm7, xmm0, xmm1
-       mov     ebx, [ebp + 24]         // -> av[0]
-       movdqa  [esp + 0], xmm0         // bv[i] expanded low
-       movdqa  [esp + 16], xmm1        // bv[i] expanded high
+       mov     ebx, [BP + 24]          // -> av[0]
+       movdqa  [SP + 0], xmm0          // bv[i] expanded low
+       movdqa  [SP + 16], xmm1         // bv[i] expanded high
         call    mla4zc
         add     edi, 16
         add     ebx, 16
@@ -771,7 +771,7 @@ FUNC(mpx_umul4_x86_sse2)
         // Finish off this pass.  There was no tail on the previous pass, and
         // there can be none on this pass.
  8:     call    carryprop
-       cmp     esi, [ebp + 36]
+       cmp     esi, [BP + 36]
         jb      1b
  
         // All over.
@@ -779,7 +779,7 @@ FUNC(mpx_umul4_x86_sse2)
         pop     edi
         pop     esi
         pop     ebx
-       pop     ebp
+       pop     BP
         ret
  
  ENDFUNC
@@ -796,69 +796,69 @@ FUNC(mpxmont_mul4_x86_sse2)
         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
         //                           const mpw *nv, size_t n, const mpw *mi);
  
-       // Build a stack frame.  Arguments will be relative to EBP, as
+       // Build a stack frame.  Arguments will be relative to BP, as
         // follows.
         //
-       //      ebp + 20        dv
-       //      ebp + 24        av
-       //      ebp + 28        bv
-       //      ebp + 32        nv
-       //      ebp + 36        n (nonzero multiple of 4)
-       //      ebp + 40        mi
+       //      BP + 20 dv
+       //      BP + 24 av
+       //      BP + 28 bv
+       //      BP + 32 nv
+       //      BP + 36 n (nonzero multiple of 4)
+       //      BP + 40 mi
         //
-       // Locals are relative to ESP, which 16-byte aligned, as follows.
+       // Locals are relative to SP, which is 16-byte aligned, as follows.
         //
-       //      esp +   0       expanded V (32 bytes)
-       //      esp +  32       expanded M (32 bytes)
-       //      esp +  64       expanded Y (32 bytes)
-       //      esp +  96       outer loop dv
-       //      esp + 100       outer loop bv
-       //      esp + 104       av limit (mostly in ESI)
-       //      esp + 108       bv limit
-       //      esp + 112       (top of locals)
-       pushreg ebp
+       //      SP +   0        expanded V (32 bytes)
+       //      SP +  32        expanded M (32 bytes)
+       //      SP +  64        expanded Y (32 bytes)
+       //      SP +  96        outer loop dv
+       //      SP + 100        outer loop bv
+       //      SP + 104        av limit (mostly in ESI)
+       //      SP + 108        bv limit
+       //      SP + 112        (top of locals)
+       pushreg BP
         pushreg ebx
         pushreg esi
         pushreg edi
         setfp
-       and     esp, ~15
-       sub     esp, 112
+       and     SP, ~15
+       sub     SP, 112
    endprologue
  
         // Establish the expanded operands.
         pxor    xmm7, xmm7
-       mov     ecx, [ebp + 28]         // -> bv
-       mov     edx, [ebp + 40]         // -> mi
+       mov     ecx, [BP + 28]          // -> bv
+       mov     edx, [BP + 40]          // -> mi
         movdqu  xmm0, [ecx]             // bv[0]
         movdqu  xmm2, [edx]             // mi
         expand  xmm7, xmm0, xmm1, xmm2, xmm3
-       movdqa  [esp +  0], xmm0        // bv[0] expanded low
-       movdqa  [esp + 16], xmm1        // bv[0] expanded high
-       movdqa  [esp + 32], xmm2        // mi expanded low
-       movdqa  [esp + 48], xmm3        // mi expanded high
+       movdqa  [SP +  0], xmm0         // bv[0] expanded low
+       movdqa  [SP + 16], xmm1         // bv[0] expanded high
+       movdqa  [SP + 32], xmm2         // mi expanded low
+       movdqa  [SP + 48], xmm3         // mi expanded high
  
         // Set up the outer loop state and prepare for the first iteration.
-       mov     edx, [ebp + 36]         // n
-       mov     eax, [ebp + 24]         // -> U = av[0]
-       mov     ebx, [ebp + 32]         // -> X = nv[0]
-       mov     edi, [ebp + 20]         // -> Z = dv[0]
-       mov     [esp + 100], ecx
+       mov     edx, [BP + 36]          // n
+       mov     eax, [BP + 24]          // -> U = av[0]
+       mov     ebx, [BP + 32]          // -> X = nv[0]
+       mov     edi, [BP + 20]          // -> Z = dv[0]
+       mov     [SP + 100], ecx
         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
-       mov     [esp + 96], edi
-       mov     [esp + 104], edx
-       mov     [esp + 108], ecx
-       lea     ecx, [esp + 0]          // -> expanded V = bv[0]
-       lea     esi, [esp + 32]         // -> expanded M = mi
-       lea     edx, [esp + 64]         // -> space for Y
+       mov     [SP + 96], edi
+       mov     [SP + 104], edx
+       mov     [SP + 108], ecx
+       lea     ecx, [SP + 0]           // -> expanded V = bv[0]
+       lea     esi, [SP + 32]          // -> expanded M = mi
+       lea     edx, [SP + 64]          // -> space for Y
         call    mmul4
-       mov     esi, [esp + 104]        // recover av limit
+       mov     esi, [SP + 104]         // recover av limit
         add     edi, 16
         add     eax, 16
         add     ebx, 16
         cmp     eax, esi                // done already?
         jae     8f
-       mov     [esp + 96], edi
+       mov     [SP + 96], edi
  
         .p2align 4
         // Complete the first inner loop.
@@ -877,26 +877,26 @@ FUNC(mpxmont_mul4_x86_sse2)
         // Embark on the next iteration.  (There must be one.  If n = 1, then
         // we would have bailed above, to label 8.  Similarly, the subsequent
         // iterations can fall into the inner loop immediately.)
-1:     mov     eax, [esp + 100]        // -> bv[i - 1]
-       mov     edi, [esp + 96]         // -> Z = dv[i]
+1:     mov     eax, [SP + 100]         // -> bv[i - 1]
+       mov     edi, [SP + 96]          // -> Z = dv[i]
         add     eax, 16                 // -> bv[i]
         pxor    xmm7, xmm7
-       mov     [esp + 100], eax
-       cmp     eax, [esp + 108]        // done yet?
+       mov     [SP + 100], eax
+       cmp     eax, [SP + 108]         // done yet?
         jae     9f
         movdqu  xmm0, [eax]             // bv[i]
-       mov     ebx, [ebp + 32]         // -> X = nv[0]
-       lea     esi, [esp + 32]         // -> expanded M = mi
-       mov     eax, [ebp + 24]         // -> U = av[0]
+       mov     ebx, [BP + 32]          // -> X = nv[0]
+       lea     esi, [SP + 32]          // -> expanded M = mi
+       mov     eax, [BP + 24]          // -> U = av[0]
         expand  xmm7, xmm0, xmm1
-       movdqa  [esp + 0], xmm0         // bv[i] expanded low
-       movdqa  [esp + 16], xmm1        // bv[i] expanded high
+       movdqa  [SP + 0], xmm0          // bv[i] expanded low
+       movdqa  [SP + 16], xmm1         // bv[i] expanded high
         call    mmla4
-       mov     esi, [esp + 104]        // recover av limit
+       mov     esi, [SP + 104]         // recover av limit
         add     edi, 16
         add     eax, 16
         add     ebx, 16
-       mov     [esp + 96], edi
+       mov     [SP + 96], edi
  
         .p2align 4
         // Complete the next inner loop.
@@ -928,7 +928,7 @@ FUNC(mpxmont_mul4_x86_sse2)
         popreg  edi
         popreg  esi
         popreg  ebx
-       popreg  ebp
+       popreg  BP
         ret
  
  ENDFUNC
@@ -945,55 +945,55 @@ FUNC(mpxmont_redc4_x86_sse2)
         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
         //                             size_t n, const mpw *mi);
  
-       // Build a stack frame.  Arguments will be relative to EBP, as
+       // Build a stack frame.  Arguments will be relative to BP, as
         // follows.
         //
-       //      ebp + 20        dv
-       //      ebp + 24        dvl
-       //      ebp + 28        nv
-       //      ebp + 32        n (nonzero multiple of 4)
-       //      ebp + 36        mi
+       //      BP + 20 dv
+       //      BP + 24 dvl
+       //      BP + 28 nv
+       //      BP + 32 n (nonzero multiple of 4)
+       //      BP + 36 mi
         //
-       // Locals are relative to ESP, as follows.
+       // Locals are relative to SP, as follows.
         //
-       //      esp +  0        outer loop dv
-       //      esp +  4        outer dv limit
-       //      esp +  8        blocks-of-4 dv limit
-       //      esp + 12        expanded M (32 bytes)
-       //      esp + 44        expanded Y (32 bytes)
-       //      esp + 76        (top of locals)
-       pushreg ebp
+       //      SP +  0 outer loop dv
+       //      SP +  4 outer dv limit
+       //      SP +  8 blocks-of-4 dv limit
+       //      SP + 12 expanded M (32 bytes)
+       //      SP + 44 expanded Y (32 bytes)
+       //      SP + 76 (top of locals)
+       pushreg BP
         pushreg ebx
         pushreg esi
         pushreg edi
         setfp
-       and     esp, ~15
-       sub     esp, 76
+       and     SP, ~15
+       sub     SP, 76
    endprologue
  
         // Establish the expanded operands and the blocks-of-4 dv limit.
-       mov     edi, [ebp + 20]         // -> Z = dv[0]
+       mov     edi, [BP + 20]          // -> Z = dv[0]
         pxor    xmm7, xmm7
-       mov     eax, [ebp + 24]         // -> dv[n] = dv limit
+       mov     eax, [BP + 24]          // -> dv[n] = dv limit
         sub     eax, edi                // length of dv in bytes
-       mov     edx, [ebp + 36]         // -> mi
+       mov     edx, [BP + 36]          // -> mi
         movdqu  xmm0, [edx]             // mi
         and     eax, ~15                // mask off the tail end
         expand  xmm7, xmm0, xmm1
         add     eax, edi                // find limit
-       movdqa  [esp + 12], xmm0        // mi expanded low
-       movdqa  [esp + 28], xmm1        // mi expanded high
-       mov     [esp + 8], eax
+       movdqa  [SP + 12], xmm0         // mi expanded low
+       movdqa  [SP + 28], xmm1         // mi expanded high
+       mov     [SP + 8], eax
  
         // Set up the outer loop state and prepare for the first iteration.
-       mov     ecx, [ebp + 32]         // n
-       mov     ebx, [ebp + 28]         // -> X = nv[0]
+       mov     ecx, [BP + 32]          // n
+       mov     ebx, [BP + 28]          // -> X = nv[0]
         lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
         lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
-       mov     [esp + 0], edi
-       mov     [esp + 4], edx
-       lea     esi, [esp + 12]         // -> expanded M = mi
-       lea     edx, [esp + 44]         // -> space for Y
+       mov     [SP + 0], edi
+       mov     [SP + 4], edx
+       lea     esi, [SP + 12]          // -> expanded M = mi
+       lea     edx, [SP + 44]          // -> space for Y
         call    mont4
         add     ebx, 16
         add     edi, 16
@@ -1010,8 +1010,8 @@ FUNC(mpxmont_redc4_x86_sse2)
  
         // Still have carries left to propagate.
  8:     carryadd
-       mov     esi, [esp + 8]          // -> dv blocks limit
-       mov     edx, [ebp + 24]         // dv limit
+       mov     esi, [SP + 8]           // -> dv blocks limit
+       mov     edx, [BP + 24]          // dv limit
         psllq   xmm7, 16
         pslldq  xmm7, 8
         paddq   xmm6, xmm7
@@ -1044,14 +1044,14 @@ FUNC(mpxmont_redc4_x86_sse2)
         // All done for this iteration.  Start the next.  (This must have at
         // least one follow-on iteration, or we'd not have started this outer
         // loop.)
-8:     mov     edi, [esp + 0]          // -> dv[i - 1]
-       mov     ebx, [ebp + 28]         // -> X = nv[0]
-       lea     edx, [esp + 44]         // -> space for Y
-       lea     esi, [esp + 12]         // -> expanded M = mi
+8:     mov     edi, [SP + 0]           // -> dv[i - 1]
+       mov     ebx, [BP + 28]          // -> X = nv[0]
+       lea     edx, [SP + 44]          // -> space for Y
+       lea     esi, [SP + 12]          // -> expanded M = mi
         add     edi, 16                 // -> Z = dv[i]
-       cmp     edi, [esp + 4]          // all done yet?
+       cmp     edi, [SP + 4]           // all done yet?
         jae     9f
-       mov     [esp + 0], edi
+       mov     [SP + 0], edi
         call    mont4
         add     edi, 16
         add     ebx, 16
@@ -1062,7 +1062,7 @@ FUNC(mpxmont_redc4_x86_sse2)
         popreg  edi
         popreg  esi
         popreg  ebx
-       popreg  ebp
+       popreg  BP
         ret
  
  ENDFUNC
@@ -1091,22 +1091,22 @@ ENDFUNC
  .endm
  
  .macro testprologue n
-       pushreg ebp
+       pushreg BP
         pushreg ebx
         pushreg esi
         pushreg edi
         setfp
-       and     esp, ~15
-       sub     esp, 3*32 + 4*4
+       and     SP, ~15
+       sub     SP, 3*32 + 4*4
    endprologue
         mov     eax, \n
-       mov     [esp + 104], eax
+       mov     [SP + 104], eax
         // vars:
-       //      esp +   0 = v expanded
-       //      esp +  32 = y expanded
-       //      esp +  64 = ? expanded
-       //      esp +  96 = cycles
-       //      esp + 104 = count
+       //      SP +   0 = v expanded
+       //      SP +  32 = y expanded
+       //      SP +  64 = ? expanded
+       //      SP +  96 = cycles
+       //      SP + 104 = count
  .endm
  
  .macro testepilogue
@@ -1114,7 +1114,7 @@ ENDFUNC
         popreg  edi
         popreg  esi
         popreg  ebx
-       popreg  ebp
+       popreg  BP
         ret
  .endm
  
@@ -1131,15 +1131,15 @@ ENDFUNC
         mov     ecx, \v
         movdqu  xmm0, [ecx]
         expand  xmm7, xmm0, xmm1
-       movdqa  [esp +  0], xmm0
-       movdqa  [esp + 16], xmm1
+       movdqa  [SP +  0], xmm0
+       movdqa  [SP + 16], xmm1
    .endif
    .ifnes "\y", "nil"
         mov     edx, \y
         movdqu  xmm2, [edx]
         expand  xmm7, xmm2, xmm3
-       movdqa  [esp + 32], xmm2
-       movdqa  [esp + 48], xmm3
+       movdqa  [SP + 32], xmm2
+       movdqa  [SP + 48], xmm3
    .endif
  .endm
  
@@ -1147,25 +1147,25 @@ ENDFUNC
         .p2align 4
  0:
    .ifnes "\u", "nil"
-       lea     ecx, [esp + 0]
+       lea     ecx, [SP + 0]
    .endif
         mov     ebx, \x
    .ifeqs "\mode", "mont"
-       lea     esi, [esp + 32]
+       lea     esi, [SP + 32]
    .endif
-       cysetup esp + 96
+       cysetup SP + 96
    .ifnes "\u", "nil"
         mov     eax, \u
    .endif
    .ifeqs "\mode", "mont"
-       lea     edx, [esp + 64]
+       lea     edx, [SP + 64]
    .else
-       lea     edx, [esp + 32]
+       lea     edx, [SP + 32]
    .endif
  .endm
  
  .macro testtail cyv
-       cystore esp + 96, \cyv, esp + 104
+       cystore SP + 96, \cyv, SP + 104
         jnz     0b
  .endm
  
@@ -1177,122 +1177,122 @@ ENDFUNC
  .endm
  
  FUNC(test_dmul4)
-       testprologue [ebp + 44]
-       testldcarry [ebp + 24]
-       testexpand [ebp + 36], [ebp + 40]
-       mov     edi, [ebp + 20]
-       testtop [ebp + 28], [ebp + 32]
+       testprologue [BP + 44]
+       testldcarry [BP + 24]
+       testexpand [BP + 36], [BP + 40]
+       mov     edi, [BP + 20]
+       testtop [BP + 28], [BP + 32]
         call    dmul4
-       testtail [ebp + 48]
-       testcarryout [ebp + 24]
+       testtail [BP + 48]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_dmla4)
-       testprologue [ebp + 44]
-       testldcarry [ebp + 24]
-       testexpand [ebp + 36], [ebp + 40]
-       mov     edi, [ebp + 20]
-       testtop [ebp + 28], [ebp + 32]
+       testprologue [BP + 44]
+       testldcarry [BP + 24]
+       testexpand [BP + 36], [BP + 40]
+       mov     edi, [BP + 20]
+       testtop [BP + 28], [BP + 32]
         call    dmla4
-       testtail [ebp + 48]
-       testcarryout [ebp + 24]
+       testtail [BP + 48]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_mul4)
-       testprologue [ebp + 36]
-       testldcarry [ebp + 24]
-       testexpand nil, [ebp + 32]
-       mov     edi, [ebp + 20]
-       testtop nil, [ebp + 28]
+       testprologue [BP + 36]
+       testldcarry [BP + 24]
+       testexpand nil, [BP + 32]
+       mov     edi, [BP + 20]
+       testtop nil, [BP + 28]
         call    mul4
-       testtail [ebp + 40]
-       testcarryout [ebp + 24]
+       testtail [BP + 40]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_mul4zc)
-       testprologue [ebp + 36]
-       testldcarry [ebp + 24]
-       testexpand nil, [ebp + 32]
-       mov     edi, [ebp + 20]
-       testtop nil, [ebp + 28]
+       testprologue [BP + 36]
+       testldcarry [BP + 24]
+       testexpand nil, [BP + 32]
+       mov     edi, [BP + 20]
+       testtop nil, [BP + 28]
         call    mul4zc
-       testtail [ebp + 40]
-       testcarryout [ebp + 24]
+       testtail [BP + 40]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_mla4)
-       testprologue [ebp + 36]
-       testldcarry [ebp + 24]
-       testexpand nil, [ebp + 32]
-       mov     edi, [ebp + 20]
-       testtop nil, [ebp + 28]
+       testprologue [BP + 36]
+       testldcarry [BP + 24]
+       testexpand nil, [BP + 32]
+       mov     edi, [BP + 20]
+       testtop nil, [BP + 28]
         call    mla4
-       testtail [ebp + 40]
-       testcarryout [ebp + 24]
+       testtail [BP + 40]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_mla4zc)
-       testprologue [ebp + 36]
-       testldcarry [ebp + 24]
-       testexpand nil, [ebp + 32]
-       mov     edi, [ebp + 20]
-       testtop nil, [ebp + 28]
+       testprologue [BP + 36]
+       testldcarry [BP + 24]
+       testexpand nil, [BP + 32]
+       mov     edi, [BP + 20]
+       testtop nil, [BP + 28]
         call    mla4zc
-       testtail [ebp + 40]
-       testcarryout [ebp + 24]
+       testtail [BP + 40]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_mmul4)
-       testprologue [ebp + 48]
-       testexpand [ebp + 40], [ebp + 44]
-       mov     edi, [ebp + 20]
-       testtop [ebp + 32], [ebp + 36], mont
+       testprologue [BP + 48]
+       testexpand [BP + 40], [BP + 44]
+       mov     edi, [BP + 20]
+       testtop [BP + 32], [BP + 36], mont
         call    mmul4
-       testtail [ebp + 52]
-       mov     edi, [ebp + 28]
-       movdqa  xmm0, [esp + 64]
-       movdqa  xmm1, [esp + 80]
+       testtail [BP + 52]
+       mov     edi, [BP + 28]
+       movdqa  xmm0, [SP + 64]
+       movdqa  xmm1, [SP + 80]
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
-       testcarryout [ebp + 24]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_mmla4)
-       testprologue [ebp + 48]
-       testexpand [ebp + 40], [ebp + 44]
-       mov     edi, [ebp + 20]
-       testtop [ebp + 32], [ebp + 36], mont
+       testprologue [BP + 48]
+       testexpand [BP + 40], [BP + 44]
+       mov     edi, [BP + 20]
+       testtop [BP + 32], [BP + 36], mont
         call    mmla4
-       testtail [ebp + 52]
-       mov     edi, [ebp + 28]
-       movdqa  xmm0, [esp + 64]
-       movdqa  xmm1, [esp + 80]
+       testtail [BP + 52]
+       mov     edi, [BP + 28]
+       movdqa  xmm0, [SP + 64]
+       movdqa  xmm1, [SP + 80]
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
-       testcarryout [ebp + 24]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
  FUNC(test_mont4)
-       testprologue [ebp + 40]
-       testexpand nil, [ebp + 36]
-       mov     edi, [ebp + 20]
-       testtop nil, [ebp + 32], mont
+       testprologue [BP + 40]
+       testexpand nil, [BP + 36]
+       mov     edi, [BP + 20]
+       testtop nil, [BP + 32], mont
         call    mont4
-       testtail [ebp + 44]
-       mov     edi, [ebp + 28]
-       movdqa  xmm0, [esp + 64]
-       movdqa  xmm1, [esp + 80]
+       testtail [BP + 44]
+       mov     edi, [BP + 28]
+       movdqa  xmm0, [SP + 64]
+       movdqa  xmm1, [SP + 80]
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
-       testcarryout [ebp + 24]
+       testcarryout [BP + 24]
         testepilogue
  ENDFUNC
  
diff --git a/rand/rand-x86ish.S b/rand/rand-x86ish.S

index 829bc2c..61de2b8 100644 (file)
--- a/rand/rand-x86ish.S
+++ b/rand/rand-x86ish.S
@@ -42,7 +42,7 @@ FUNC(rand_quick_x86ish_rdrand)
         // Return zero on success, or -1 on error.
  
  #if CPUFAM_X86
-       mov     edx, [esp + 4]
+       mov     edx, [SP + 4]
         stalloc 28
  #  define COUNT ecx
  #endif
@@ -58,7 +58,7 @@ FUNC(rand_quick_x86ish_rdrand)
  
         // Try to fetch a random number.
         mov     COUNT, 16
-0:     rdrand  R_a(r)
+0:     rdrand  AX
         jc      1f
         dec     COUNT
         jnz     0b
@@ -70,22 +70,22 @@ FUNC(rand_quick_x86ish_rdrand)
         // Success.
  1:
  #if CPUFAM_X86
-       mov     [esp + 16], eax
-       lea     ecx, [esp + 16]
-       mov     dword ptr [esp + 12], 32
-       mov     dword ptr [esp + 8], 4
-       mov     [esp + 4], ecx
-       mov     [esp + 0], edx
+       mov     [SP + 16], AX
+       lea     ecx, [SP + 16]
+       mov     dword ptr [SP + 12], 32
+       mov     dword ptr [SP + 8], 4
+       mov     [SP + 4], ecx
+       mov     [SP + 0], edx
  #endif
  #if CPUFAM_AMD64 && ABI_SYSV
-       mov     [rsp + 0], rax
-       mov     rsi, rsp
+       mov     [SP + 0], AX
+       mov     rsi, SP
         mov     edx, 8
         mov     ecx, 64
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
-       mov     [rsp + 32], rax
-       lea     rdx, [rsp + 32]
+       mov     [SP + 32], AX
+       lea     rdx, [SP + 32]
         mov     r8d, 8
         mov     r9d, 64
  #endif
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S

index 3fb623a..33af65f 100644 (file)
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -66,15 +66,15 @@ FUNC(chacha_core_x86ish_sse2)
  #  define SAVE0 xmm5
  #  define SAVE1 xmm6
  #  define SAVE2 xmm7
-#  define SAVE3 [esp]
+#  define SAVE3 [SP]
  
-       pushreg ebp
+       pushreg BP
         setfp
-       sub     esp, 16
-       mov     IN, [ebp + 12]
-       mov     OUT, [ebp + 16]
-       and     esp, ~15
-       mov     NR, [ebp + 8]
+       sub     SP, 16
+       mov     IN, [BP + 12]
+       mov     OUT, [BP + 16]
+       and     SP, ~15
+       mov     NR, [BP + 8]
  #endif
  
  #if CPUFAM_AMD64 && ABI_SYSV
@@ -105,9 +105,9 @@ FUNC(chacha_core_x86ish_sse2)
  #  define IN rdx
  #  define OUT r8
  #  define SAVE0 xmm5
-#  define SAVE1 [rsp +  0]
-#  define SAVE2 [rsp + 16]
-#  define SAVE3 [rsp + 32]
+#  define SAVE1 [SP +  0]
+#  define SAVE2 [SP + 16]
+#  define SAVE3 [SP + 32]
  
         stalloc 48 + 8
  #endif
@@ -248,7 +248,7 @@ FUNC(chacha_core_x86ish_sse2)
         // Tidy things up.
  #if CPUFAM_X86
         dropfp
-       popreg  ebp
+       popreg  BP
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         stfree  48 + 8
diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S

index e60b7ca..092242b 100644 (file)
--- a/symm/gcm-x86ish-pclmul.S
+++ b/symm/gcm-x86ish-pclmul.S
@@ -576,7 +576,7 @@
         // xmm3 =                       // v_0 = (v_01; v_00)
         movdqa  xmm4, xmm0              // u_1 again
  #if CPUFAM_X86
-       movdqa  [esp + 0], xmm3
+       movdqa  [SP + 0], xmm3
  #elif CPUFAM_AMD64
         movdqa  xmm8, xmm3
  #  define V0 xmm8
@@ -608,7 +608,7 @@
         pclmullqlqdq xmm4, xmm2         // u_11 v_11
         pclmulhqhqdq xmm7, xmm2         // u_10 v_10
  #if CPUFAM_X86
-       movdqa  xmm2, [esp + 0]
+       movdqa  xmm2, [SP + 0]
  #  define V0 xmm2
  #endif
         pxor    xmm0, xmm3              // u_10 v_11 + u_11 v_10
@@ -771,8 +771,8 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
         // A is updated with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
  #endif
    endprologue
         movdqu  xmm0, [A]
@@ -790,8 +790,8 @@ SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
         // exit, A is updated with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
         ldgot   ecx
  #endif
    endprologue
@@ -811,8 +811,8 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
         // A is updated with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
  #endif
    endprologue
         movq    xmm0, [A]
@@ -830,8 +830,8 @@ SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
         // exit, A is updated with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
         ldgot   ecx
  #endif
    endprologue
@@ -852,8 +852,8 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
         // with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
  #endif
    endprologue
         movq    xmm0, [A + 0]
@@ -876,8 +876,8 @@ SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
         // updated with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
         ldgot   ecx
  #endif
    endprologue
@@ -901,8 +901,8 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
         // A is updated with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         stalloc 2*16 + 8
@@ -935,8 +935,8 @@ SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
         // exit, A is updated with the product A K.
  
  #if CPUFAM_X86
-       mov     A, [esp + 4]
-       mov     K, [esp + 8]
+       mov     A, [SP + 4]
+       mov     K, [SP + 8]
         ldgot   ecx
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
@@ -970,12 +970,12 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
         // A is updated with the product A K.
  
  #if CPUFAM_X86
-       pushreg ebp
+       pushreg BP
         setfp
-       mov     A, [esp + 8]
-       mov     K, [esp + 12]
-       and     esp, ~15
-       sub     esp, 16
+       mov     A, [SP + 8]
+       mov     K, [SP + 12]
+       and     SP, ~15
+       sub     SP, 16
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         stalloc 3*16 + 8
@@ -997,7 +997,7 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
         movdqu  [A + 0], xmm1
  #if CPUFAM_X86
         dropfp
-       popreg  ebp
+       popreg  BP
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm6, 0
@@ -1014,13 +1014,13 @@ SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
         // exit, A is updated with the product A K.
  
  #if CPUFAM_X86
-       pushreg ebp
+       pushreg BP
         setfp
-       mov     A, [esp + 8]
-       mov     K, [esp + 12]
-       and     esp, ~15
+       mov     A, [SP + 8]
+       mov     K, [SP + 12]
+       and     SP, ~15
         ldgot   ecx
-       sub     esp, 16
+       sub     SP, 16
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         stalloc 3*16 + 8
@@ -1044,7 +1044,7 @@ SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
         movdqu  [A + 0], xmm1
  #if CPUFAM_X86
         dropfp
-       popreg  ebp
+       popreg  BP
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm6, 0
diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S

index 6d9b3b2..f5e5cc9 100644 (file)
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -70,15 +70,12 @@ ENDFUNC
  
  FUNC(rijndael_setup_x86ish_aesni)
  
-#define SI WHOLE(si)
-#define DI WHOLE(di)
-
  #if CPUFAM_X86
         // Arguments are on the stack.  We'll need to stack the caller's
         // register variables, but we'll manage.
  
-#  define CTX ebp                      // context pointer
-#  define BLKSZ [esp + 24]             // block size
+#  define CTX BP                       // context pointer
+#  define BLKSZ [SP + 24]              // block size
  
  #  define KSZ ebx                      // key size
  #  define NKW edx                      // total number of key words
@@ -92,15 +89,15 @@ FUNC(rijndael_setup_x86ish_aesni)
  #  define BLKOFF edx                   // block size in bytes
  
         // Stack the caller's registers.
-       pushreg ebp
+       pushreg BP
         pushreg ebx
         pushreg esi
         pushreg edi
  
         // Set up our own variables.
-       mov     CTX, [esp + 20]         // context base pointer
-       mov     SI, [esp + 28]          // key material
-       mov     KSZ, [esp + 32]         // key size, in words
+       mov     CTX, [SP + 20]          // context base pointer
+       mov     SI, [SP + 28]           // key material
+       mov     KSZ, [SP + 32]          // key size, in words
  #endif
  
  #if CPUFAM_AMD64 && ABI_SYSV
@@ -330,7 +327,7 @@ FUNC(rijndael_setup_x86ish_aesni)
         popreg  edi
         popreg  esi
         popreg  ebx
-       popreg  ebp
+       popreg  BP
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         popreg  rdi
@@ -389,8 +386,8 @@ ENDFUNC
  #  define DST edx
  #  define NR ecx
  
-       mov     K, [esp + 4]
-       mov     SRC, [esp + 8]
+       mov     K, [SP + 4]
+       mov     SRC, [SP + 8]
  #endif
  
  #if CPUFAM_AMD64 && ABI_SYSV
@@ -428,7 +425,7 @@ ENDFUNC
         add     K, 16
         pxor    xmm0, xmm1
  #if CPUFAM_X86
-       mov     DST, [esp + 12]
+       mov     DST, [SP + 12]
  #endif
  
         // Dispatch to the correct code.
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S

index 5dc9c17..eb346af 100644 (file)
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -65,16 +65,16 @@ FUNC(salsa20_core_x86ish_sse2)
  #  define OUT edx
  #  define SAVE0 xmm6
  #  define SAVE1 xmm7
-#  define SAVE2 [esp + 0]
-#  define SAVE3 [esp + 16]
+#  define SAVE2 [SP + 0]
+#  define SAVE3 [SP + 16]
  
-       pushreg ebp
+       pushreg BP
         setfp
-       sub     esp, 32
-       mov     IN, [ebp + 12]
-       mov     OUT, [ebp + 16]
-       and     esp, ~15
-       mov     NR, [ebp + 8]
+       sub     SP, 32
+       mov     IN, [BP + 12]
+       mov     OUT, [BP + 16]
+       and     SP, ~15
+       mov     NR, [BP + 8]
  #endif
  
  #if CPUFAM_AMD64 && ABI_SYSV
@@ -107,8 +107,8 @@ FUNC(salsa20_core_x86ish_sse2)
  #  define OUT r8
  #  define SAVE0 xmm6
  #  define SAVE1 xmm7
-#  define SAVE2 [rsp + 32]
-#  define SAVE3 [rsp + 48]
+#  define SAVE2 [SP + 32]
+#  define SAVE3 [SP + 48]
  
         stalloc 64 + 8
         savexmm xmm6, 0
@@ -301,7 +301,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // Tidy things up.
  #if CPUFAM_X86
         dropfp
-       popreg  ebp
+       popreg  BP
  #endif
  #if CPUFAM_AMD64 && ABI_WIN
         rstrxmm xmm6, 0
author	Mark Wooding <mdw@distorted.org.uk>
	Tue, 29 Oct 2019 18:55:16 +0000 (18:55 +0000)
committer	Mark Wooding <mdw@distorted.org.uk>
	Sat, 9 May 2020 19:57:33 +0000 (20:57 +0100)
base/asm-common.h		patch \| blob \| blame \| history
base/dispatch-x86ish.S		patch \| blob \| blame \| history
base/regdump-x86ish.S		patch \| blob \| blame \| history
base/regdump.h		patch \| blob \| blame \| history
base/test-regdump-x86ish.S		patch \| blob \| blame \| history
math/mpx-mul4-amd64-sse2.S		patch \| blob \| blame \| history
math/mpx-mul4-x86-sse2.S		patch \| blob \| blame \| history
rand/rand-x86ish.S		patch \| blob \| blame \| history
symm/chacha-x86ish-sse2.S		patch \| blob \| blame \| history
symm/gcm-x86ish-pclmul.S		patch \| blob \| blame \| history
symm/rijndael-x86ish-aesni.S		patch \| blob \| blame \| history
symm/salsa20-x86ish-sse2.S		patch \| blob \| blame \| history