These registers get used a lot as pointers, so it's useful to be able to
refer to them as full-width registers more conveniently than `R_sp(r)'.
Introduce (C preprocessor) macros `AX', ..., for this purpose, and use
them extensively.
(Delete the existing `SI' and `DI' macros from `rijndael-x86ish-aesni.S'
which had the same purpose.)
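As a quick illustration of what the macros buy (a sketch, assuming the usual
`_REGFORM' machinery from `asm-common.h', under which `R_a(r)' names the
full-width accumulator for the current target):

	mov AX, [SP]		// x86: mov eax, [esp]; AMD64: mov rax, [rsp]
	lea BP, [SP + WORDSZ]	// x86: lea ebp, [esp + 4]; AMD64: lea rbp, [rsp + 8]

The same source line assembles to the 32- or 64-bit form depending on the
target, without spelling out `R_...(r)' each time.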
12 files changed:
#endif
#define WHOLE(reg) _REGFORM(reg, r)
+// Macros for some common registers.
+#define AX R_a(r)
+#define BX R_b(r)
+#define CX R_c(r)
+#define DX R_d(r)
+#define SI R_si(r)
+#define DI R_di(r)
+#define BP R_bp(r)
+#define SP R_sp(r)
+
// Stack management and unwinding.
-.macro setfp fp=R_bp(r), offset=0
+.macro setfp fp=BP, offset=0
#if __ELF__
.cfi_def_cfa_register \fp
#endif
.seh_setframe \fp, 0
#endif
.else
- lea \fp, [R_sp(r) + \offset]
+ lea \fp, [SP + \offset]
#if __ELF__
.cfi_def_cfa_register \fp
.cfi_adjust_cfa_offset -\offset
.macro _dropfp fp, offset=0
.if \offset == 0
- .cfi_def_cfa_register R_sp(r)
+ .cfi_def_cfa_register SP
- lea R_sp(r), [\fp - \offset]
+ lea SP, [\fp - \offset]
- .cfi_def_cfa_register R_sp(r)
+ .cfi_def_cfa_register SP
.cfi_adjust_cfa_offset +\offset
#endif
.endif
#if __ELF__
.cfi_adjust_cfa_offset +\n
#endif
#if __ELF__
.cfi_adjust_cfa_offset -\n
#endif
.endm
.macro savexmm r, offset
- movdqa [R_sp(r) + \offset], \r
+ movdqa [SP + \offset], \r
#if ABI_WIN && CPUFAM_AMD64
.seh_savexmm \r, \offset
#endif
.endm
.macro rstrxmm r, offset
- movdqa \r, [R_sp(r) + \offset]
+ movdqa \r, [SP + \offset]
#if CPUFAM_X86
pushreg ebx
pushreg edi
- mov edi, [esp + 12]
- mov eax, [esp + 16]
- mov ecx, [esp + 20]
+ mov edi, [SP + 12]
+ mov eax, [SP + 16]
+ mov ecx, [SP + 20]
# define OUT edi
#endif
#if CPUFAM_AMD64 && ABI_SYSV
// First, check that this is even a thing, using the complicated
// dance with the flags register.
pushf
- pop R_d(r) // current flags in d
+ pop DX // current flags in d
- or R_d(r), EFLAGS_ID // force the id bit on and check it
- push R_d(r)
+ or DX, EFLAGS_ID // force the id bit on and check it
+ push DX
test edx, EFLAGS_ID
jz 8f
- and R_d(r), ~EFLAGS_ID // force the id bit off and check it
- push R_d(r)
+ and DX, ~EFLAGS_ID // force the id bit off and check it
+ push DX
test edx, EFLAGS_ID
jnz 8f
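For reference, the ID bit being toggled is EFLAGS bit 21; software that can
flip it and read the change back has the `cpuid' instruction. A sketch of the
constant, in case `asm-common.h' defines it differently:

	#define EFLAGS_ID 0x00200000	// EFLAGS bit 21: toggleable iff `cpuid' exists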
// Enter with no arguments. Return nonzero if the XMM registers are
// usable.
endprologue
// Save the floating point and SIMD registers, and try to clobber
// xmm0.
- fxsave [R_sp(r)]
- mov eax, [R_sp(r) + 160]
- xor dword ptr [R_sp(r) + 160], 0xaaaa5555
- fxrstor [R_sp(r)]
+ fxsave [SP]
+ mov eax, [SP + 160]
+ xor dword ptr [SP + 160], 0xaaaa5555
+ fxrstor [SP]
// Save them again, and read back the low word of xmm0. Undo the
// clobbering and restore.
- fxsave [R_sp(r)]
- mov ecx, [R_sp(r) + 160]
- mov [R_sp(r) + 160], eax
- fxrstor [R_sp(r)]
+ fxsave [SP]
+ mov ecx, [SP + 160]
+ mov [SP + 160], eax
+ fxrstor [SP]
	// The registers are live if we read different things.
xor eax, ecx
// Done.
dropfp
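The magic offset 160 is architected: within the 512-byte `fxsave' image, the
x87 control/status words and ST0-ST7 occupy bytes 0-159, and the XMM
registers follow, so `[SP + 160]' is the low 32 bits of xmm0. A sketch of
the layout relied on here:

	// fxsave area (512 bytes, 16-byte aligned):
	//   [0, 32)	FCW/FSW/FTW/FOP, FPU IP/DP, MXCSR, MXCSR_MASK
	//   [32, 160)	ST0-ST7 (aliasing MM0-MM7), 16 bytes each
	//   [160, 416)	XMM0-XMM15, 16 bytes each -- XMM0 at offset 160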
#if CPUFAM_X86
# define X_OUT edx
# define COUNT ecx
#endif
#if CPUFAM_AMD64 && ABI_SYSV
# define X_OUT rdi
cld
// Save r/ebp and establish it pointing to the save area.
- mov [R_sp(r) + WORDSZ + REGIX_BP*WORDSZ], R_bp(r)
- lea R_bp(r), [R_sp(r) + WORDSZ]
+ mov [SP + WORDSZ + REGIX_BP*WORDSZ], BP
+ lea BP, [SP + WORDSZ]
// Save the other easy general-purpose registers.
#if !CPUFAM_X86
- mov [R_bp(r) + REGIX_BX*WORDSZ], R_b(r)
+ mov [BP + REGIX_BX*WORDSZ], BX
- mov [R_bp(r) + REGIX_CX*WORDSZ], R_c(r)
- mov [R_bp(r) + REGIX_DX*WORDSZ], R_d(r)
- mov [R_bp(r) + REGIX_SI*WORDSZ], R_si(r)
- mov [R_bp(r) + REGIX_DI*WORDSZ], R_di(r)
+ mov [BP + REGIX_CX*WORDSZ], CX
+ mov [BP + REGIX_DX*WORDSZ], DX
+ mov [BP + REGIX_SI*WORDSZ], SI
+ mov [BP + REGIX_DI*WORDSZ], DI
- mov [R_bp(r) + REGIX_R8*WORDSZ], R_r8(r)
- mov [R_bp(r) + REGIX_R9*WORDSZ], R_r9(r)
- mov [R_bp(r) + REGIX_R10*WORDSZ], R_r10(r)
- mov [R_bp(r) + REGIX_R11*WORDSZ], R_r11(r)
- mov [R_bp(r) + REGIX_R12*WORDSZ], R_r12(r)
- mov [R_bp(r) + REGIX_R13*WORDSZ], R_r13(r)
- mov [R_bp(r) + REGIX_R14*WORDSZ], R_r14(r)
- mov [R_bp(r) + REGIX_R15*WORDSZ], R_r15(r)
+ mov [BP + REGIX_R8*WORDSZ], r8
+ mov [BP + REGIX_R9*WORDSZ], r9
+ mov [BP + REGIX_R10*WORDSZ], r10
+ mov [BP + REGIX_R11*WORDSZ], r11
+ mov [BP + REGIX_R12*WORDSZ], r12
+ mov [BP + REGIX_R13*WORDSZ], r13
+ mov [BP + REGIX_R14*WORDSZ], r14
+ mov [BP + REGIX_R15*WORDSZ], r15
#endif
// Determine the previous stack pointer and save it.
#if CPUFAM_AMD64 && ABI_SYSV
- lea R_a(r), [R_bp(r) + 128 + REGDUMP_GPSIZE]
+ lea AX, [BP + 128 + REGDUMP_GPSIZE]
- lea R_a(r), [R_bp(r) + REGDUMP_GPSIZE]
+ lea AX, [BP + REGDUMP_GPSIZE]
- mov [R_bp(r) + REGIX_SP*WORDSZ], R_a(r)
+ mov [BP + REGIX_SP*WORDSZ], AX
// Collect the return address and save it as r/eip.
- mov R_a(r), [R_sp(r)]
- mov [R_bp(r) + REGIX_IP*WORDSZ], R_a(r)
+ mov AX, [SP]
+ mov [BP + REGIX_IP*WORDSZ], AX
// Save the segment registers.
- lea R_a(r), [R_bp(r) + REGIX_GPLIM*WORDSZ]
- mov [R_a(r) + 2*REGIX_CS], cs
- mov [R_a(r) + 2*REGIX_DS], ds
- mov [R_a(r) + 2*REGIX_SS], ss
- mov [R_a(r) + 2*REGIX_ES], es
- mov [R_a(r) + 2*REGIX_FS], fs
- mov [R_a(r) + 2*REGIX_GS], gs
+ lea AX, [BP + REGIX_GPLIM*WORDSZ]
+ mov [AX + 2*REGIX_CS], cs
+ mov [AX + 2*REGIX_DS], ds
+ mov [AX + 2*REGIX_SS], ss
+ mov [AX + 2*REGIX_ES], es
+ mov [AX + 2*REGIX_FS], fs
+ mov [AX + 2*REGIX_GS], gs
// Determine the extended save area size. Preserve ebx on 32-bit x86
// here, because the caller needs it for PLT-indirect calls.
	// We assume nobody actually fiddled with the segment registers, so
	// only the actual integer registers need restoring.
- mov R_a(r), [R_bp(r) + REGIX_AX*WORDSZ]
- mov R_b(r), [R_bp(r) + REGIX_BX*WORDSZ]
- mov R_c(r), [R_bp(r) + REGIX_CX*WORDSZ]
- mov R_d(r), [R_bp(r) + REGIX_DX*WORDSZ]
- mov R_si(r), [R_bp(r) + REGIX_SI*WORDSZ]
- mov R_di(r), [R_bp(r) + REGIX_DI*WORDSZ]
+ mov AX, [BP + REGIX_AX*WORDSZ]
+ mov BX, [BP + REGIX_BX*WORDSZ]
+ mov CX, [BP + REGIX_CX*WORDSZ]
+ mov DX, [BP + REGIX_DX*WORDSZ]
+ mov SI, [BP + REGIX_SI*WORDSZ]
+ mov DI, [BP + REGIX_DI*WORDSZ]
- mov R_r8(r), [R_bp(r) + REGIX_R8*WORDSZ]
- mov R_r9(r), [R_bp(r) + REGIX_R9*WORDSZ]
- mov R_r10(r), [R_bp(r) + REGIX_R10*WORDSZ]
- mov R_r11(r), [R_bp(r) + REGIX_R11*WORDSZ]
- mov R_r12(r), [R_bp(r) + REGIX_R12*WORDSZ]
- mov R_r13(r), [R_bp(r) + REGIX_R13*WORDSZ]
- mov R_r14(r), [R_bp(r) + REGIX_R14*WORDSZ]
- mov R_r15(r), [R_bp(r) + REGIX_R15*WORDSZ]
+ mov r8, [BP + REGIX_R8*WORDSZ]
+ mov r9, [BP + REGIX_R9*WORDSZ]
+ mov r10, [BP + REGIX_R10*WORDSZ]
+ mov r11, [BP + REGIX_R11*WORDSZ]
+ mov r12, [BP + REGIX_R12*WORDSZ]
+ mov r13, [BP + REGIX_R13*WORDSZ]
+ mov r14, [BP + REGIX_R14*WORDSZ]
+ mov r15, [BP + REGIX_R15*WORDSZ]
- mov R_bp(r), [R_bp(r) + REGIX_BP*WORDSZ]
+ mov BP, [BP + REGIX_BP*WORDSZ]
// general registers are clobbered.
// Start by filling in the easy parts of the map.
- mov [R_sp(r) + WORDSZ + regmap_gp], R_bp(r)
- lea R_bp(r), [R_sp(r) + WORDSZ]
+ mov [SP + WORDSZ + regmap_gp], BP
+ lea BP, [SP + WORDSZ]
xor eax, eax // clears rax too on amd64
- mov [R_bp(r) + regmap_avx], R_a(r)
+ mov [BP + regmap_avx], AX
// Find out whether we use `xsave'. (Preserve ebx.)
#if CPUFAM_X86
je 5f
// We have the `xsave' machinery. Select the base address.
- lea R_si(r), [R_sp(r) + WORDSZ + regmap_size + 63]
- and R_si(r), ~63
- mov [R_bp(r) + regmap_fx], R_si(r)
+ lea SI, [SP + WORDSZ + regmap_size + 63]
+ and SI, ~63
+ mov [BP + regmap_fx], SI
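The +63/~63 dance is the usual round-up-to-alignment idiom: `xsave'
architecturally requires its save area to be 64-byte aligned (the
`fxsave'-only path below settles for the 16-byte alignment `fxsave' demands,
hence its +15/~15). In general:

	// round x up to a multiple of 2^k: (x + (2^k - 1)) & ~(2^k - 1)
	lea SI, [SP + WORDSZ + regmap_size + 63]
	and SI, ~63		// SI now 64-byte aligned, as `xsave' requires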
// Clear out the header area.
xor eax, eax
- lea R_di(r), [R_si(r) + 512]
+ lea DI, [SI + 512]
mov ecx, 16
rep stosd
// Save the registers.
mov eax, 0x00000007
xor edx, edx
// Establish the AVX pointer, if available.
- test dword ptr [R_si(r) + 512], 4 // = xstate_bv
+ test dword ptr [SI + 512], 4 // = xstate_bv
je 8f
mov eax, 13
mov ecx, 2
cpuid
- add R_b(r), R_si(r)
- mov [R_bp(r) + regmap_avx], R_b(r)
+ add BX, SI
+ mov [BP + regmap_avx], BX
jmp 8f
// We have only `fxsave'. Set the base address.
-5: lea R_si(r), [R_sp(r) + WORDSZ + regmap_size + 15]
- and R_si(r), ~15
- mov [R_bp(r) + regmap_fx], R_si(r)
+5: lea SI, [SP + WORDSZ + regmap_size + 15]
+ and SI, ~15
+ mov [BP + regmap_fx], SI
// Clear the x87 state; otherwise it can cause trouble later.
8: fninit
// 32-bit x86, and the other general registers are clobbered.
// Find the extended register dump.
- mov R_si(r), [R_bp(r) + regmap_fx]
+ mov SI, [BP + regmap_fx]
// Probe to find out whether we have `xsave'.
#if CPUFAM_X86
// We have the `xsave' machinery.
mov eax, 0x00000007
xor edx, edx
jmp 8f
// We must fake it up.
-8: mov R_bp(r), [R_bp(r) + regmap_gp]
+8: mov BP, [BP + regmap_gp]
#if CPUFAM_X86
pop ebx
#endif
// Stash r/eax. This is bletcherous: hope we don't get a signal in
// the next few instructions.
- mov [R_sp(r) - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], R_a(r)
+ mov [SP - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], AX
.ifnes "\addr", "nil"
// Collect the effective address for the following dump, leaving it
// in the `addr' slot of the dump.
.ifnes "\addr", "nil"
// Collect the effective address for the following dump, leaving it
// in the `addr' slot of the dump.
- lea R_a(r), \addr
- mov [R_sp(r) - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], R_a(r)
+ lea AX, \addr
+ mov [SP - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], AX
.endif
// Make space for the register save area. On AMD64 with System/V
// ABI, also skip the red zone. Use `lea' here to preserve the
// flags.
- lea R_sp(r), [R_sp(r) - REGDUMP_SPADJ]
+ lea SP, [SP - REGDUMP_SPADJ]
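The `lea' is doing arithmetic that `sub' could do, but with one crucial
difference: `lea' computes an effective address without touching the
arithmetic flags, and the flags are part of what's being dumped. Compare:

	sub SP, REGDUMP_SPADJ		// clobbers OF/SF/ZF/AF/PF/CF
	lea SP, [SP - REGDUMP_SPADJ]	// same new SP, flags preserved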
// Save flags and general-purpose registers. On 32-bit x86, we save
// ebx and establish a GOT pointer here for the benefit of the
// PLT-indirect calls made later on.
pushf
# if CPUFAM_X86
- mov [esp + 4*REGIX_BX], ebx
+ mov [SP + 4*REGIX_BX], ebx
ldgot
# endif
callext F(regdump_gpsave)
// Make space for the extended registers.
callext F(regdump_xtsave)
// Prepare for calling back into C. On 32-bit x86, leave space for
// the `shadow space' for the called-function's arguments. Also,
// forcibly align the stack pointer to a 16-byte boundary.
# if CPUFAM_X86
// We assume r/ebp still points to the register map.
callext F(regdump_xtrstr)
callext F(regdump_gprstr)
popf
- lea R_sp(r), [R_sp(r) + REGDUMP_SPADJ]
+ lea SP, [SP + REGDUMP_SPADJ]
.endm
.macro _regbase
# if CPUFAM_X86
# endif
.endm
.macro _membase
- mov R_a(r), [R_bp(r) + regmap_gp]
+ mov AX, [BP + regmap_gp]
# if CPUFAM_X86
mov eax, [eax + REGIX_ADDR*WORDSZ]
# elif ABI_SYSV
mov rdi, [rax + REGIX_ADDR*WORDSZ]
# elif ABI_WIN
.macro _reglbl msg
.ifeqs "\msg", ""
# if CPUFAM_X86
- mov dword ptr [esp + 4], 0
+ mov dword ptr [SP + 4], 0
# elif ABI_SYSV
xor esi, esi
# elif ABI_WIN
.else
# if CPUFAM_X86
lea eax, [INTADDR(.L$_reglbl$\@)]
# elif ABI_SYSV
lea rsi, [INTADDR(.L$_reglbl$\@)]
# elif ABI_WIN
.macro _regfmt arg
# if CPUFAM_X86
- mov dword ptr [esp + 8], \arg
+ mov dword ptr [SP + 8], \arg
# elif ABI_SYSV
mov edx, \arg
# elif ABI_WIN
movdqu xmm4, [rax]
#if ABI_WIN
stalloc 48 + 8 // space for the carries
-# define STKTMP(i) [rsp + i]
+# define STKTMP(i) [SP + i]
-# define STKTMP(i) [rsp + i - 48 - 8] // use red zone
+# define STKTMP(i) [SP + i - 48 - 8] // use red zone
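The second `STKTMP' variant leans on the System/V AMD64 red zone: the 128
bytes below the stack pointer which the ABI promises signal handlers won't
clobber, so leaf code may scribble there without moving SP. The 48 + 8 bytes
of temporaries fit comfortably:

	// SysV red zone: [SP - 128, SP) is safe scratch in a leaf function;
	// STKTMP(i) = [SP + i - 56] stays inside it for 0 <= i < 56.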
- mov N, [rsp + 224]
- mov MI, [rsp + 232]
+ mov N, [SP + 224]
+ mov MI, [SP + 232]
# define ARG8 STKARG(4)
# define STKARG_OFFSET 224
#endif
-#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
+#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
// sysv win
// dmul smul mmul mont dmul smul mmul mont
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 12], xmm7, xmm4
- movdqa [esp + 0], xmm4
- movdqa [esp + 16], xmm5
- movdqa [esp + 32], xmm6
+ movdqa [SP + 0], xmm4
+ movdqa [SP + 16], xmm5
+ movdqa [SP + 32], xmm6
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
propout [edi + 12], xmm7, xmm4
	// Add on the carry we calculated earlier.
- paddq xmm4, [esp + 0]
- paddq xmm5, [esp + 16]
- paddq xmm6, [esp + 32]
+ paddq xmm4, [SP + 0]
+ paddq xmm5, [SP + 16]
+ paddq xmm6, [SP + 32]
// And, with that, we're done.
stfree 48 + 12
// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
- // Build a stack frame. Arguments will be relative to EBP, as
+ // Build a stack frame. Arguments will be relative to BP, as
- // ebp + 20 dv
- // ebp + 24 av
- // ebp + 28 avl
- // ebp + 32 bv
- // ebp + 36 bvl
+ // BP + 20 dv
+ // BP + 24 av
+ // BP + 28 avl
+ // BP + 32 bv
+ // BP + 36 bvl
- // Locals are relative to ESP, as follows.
+ // Locals are relative to SP, as follows.
- // esp + 0 expanded Y (32 bytes)
- // esp + 32 (top of locals)
- pushreg ebp
+ // SP + 0 expanded Y (32 bytes)
+ // SP + 32 (top of locals)
+ pushreg BP
pushreg ebx
pushreg esi
pushreg edi
setfp
- and esp, ~15
- sub esp, 32
+ and SP, ~15
+ sub SP, 32
endprologue
// Prepare for the first iteration.
- mov esi, [ebp + 32] // -> bv[0]
+ mov esi, [BP + 32] // -> bv[0]
pxor xmm7, xmm7
movdqu xmm0, [esi] // bv[0]
- mov edi, [ebp + 20] // -> dv[0]
+ mov edi, [BP + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
expand xmm7, xmm0, xmm1
- mov ebx, [ebp + 24] // -> av[0]
- mov eax, [ebp + 28] // -> av[m] = av limit
- mov edx, esp // -> expanded Y = bv[0]
- movdqa [esp + 0], xmm0 // bv[0] expanded low
- movdqa [esp + 16], xmm1 // bv[0] expanded high
+ mov ebx, [BP + 24] // -> av[0]
+ mov eax, [BP + 28] // -> av[m] = av limit
+ mov edx, SP // -> expanded Y = bv[0]
+ movdqa [SP + 0], xmm0 // bv[0] expanded low
+ movdqa [SP + 16], xmm1 // bv[0] expanded high
call mul4zc
add ebx, 16
add edi, 16
// Write out the leftover carry. There can be no tail here.
8: call carryprop
- cmp esi, [ebp + 36] // more passes to do?
+ cmp esi, [BP + 36] // more passes to do?
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
expand xmm7, xmm0, xmm1
- mov ebx, [ebp + 24] // -> av[0]
- movdqa [esp + 0], xmm0 // bv[i] expanded low
- movdqa [esp + 16], xmm1 // bv[i] expanded high
+ mov ebx, [BP + 24] // -> av[0]
+ movdqa [SP + 0], xmm0 // bv[i] expanded low
+ movdqa [SP + 16], xmm1 // bv[i] expanded high
call mla4zc
add edi, 16
add ebx, 16
// Finish off this pass. There was no tail on the previous pass, and
// there can be none on this pass.
8: call carryprop
// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
- // Build a stack frame. Arguments will be relative to EBP, as
+ // Build a stack frame. Arguments will be relative to BP, as
- // ebp + 20 dv
- // ebp + 24 av
- // ebp + 28 bv
- // ebp + 32 nv
- // ebp + 36 n (nonzero multiple of 4)
- // ebp + 40 mi
+ // BP + 20 dv
+ // BP + 24 av
+ // BP + 28 bv
+ // BP + 32 nv
+ // BP + 36 n (nonzero multiple of 4)
+ // BP + 40 mi
- // Locals are relative to ESP, which is 16-byte aligned, as follows.
+ // Locals are relative to SP, which is 16-byte aligned, as follows.
- // esp + 0 expanded V (32 bytes)
- // esp + 32 expanded M (32 bytes)
- // esp + 64 expanded Y (32 bytes)
- // esp + 96 outer loop dv
- // esp + 100 outer loop bv
- // esp + 104 av limit (mostly in ESI)
- // esp + 108 bv limit
- // esp + 112 (top of locals)
- pushreg ebp
+ // SP + 0 expanded V (32 bytes)
+ // SP + 32 expanded M (32 bytes)
+ // SP + 64 expanded Y (32 bytes)
+ // SP + 96 outer loop dv
+ // SP + 100 outer loop bv
+ // SP + 104 av limit (mostly in ESI)
+ // SP + 108 bv limit
+ // SP + 112 (top of locals)
+ pushreg BP
pushreg ebx
pushreg esi
pushreg edi
setfp
- and esp, ~15
- sub esp, 112
+ and SP, ~15
+ sub SP, 112
endprologue
// Establish the expanded operands.
pxor xmm7, xmm7
- mov ecx, [ebp + 28] // -> bv
- mov edx, [ebp + 40] // -> mi
+ mov ecx, [BP + 28] // -> bv
+ mov edx, [BP + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
expand xmm7, xmm0, xmm1, xmm2, xmm3
- movdqa [esp + 0], xmm0 // bv[0] expanded low
- movdqa [esp + 16], xmm1 // bv[0] expanded high
- movdqa [esp + 32], xmm2 // mi expanded low
- movdqa [esp + 48], xmm3 // mi expanded high
+ movdqa [SP + 0], xmm0 // bv[0] expanded low
+ movdqa [SP + 16], xmm1 // bv[0] expanded high
+ movdqa [SP + 32], xmm2 // mi expanded low
+ movdqa [SP + 48], xmm3 // mi expanded high
// Set up the outer loop state and prepare for the first iteration.
- mov edx, [ebp + 36] // n
- mov eax, [ebp + 24] // -> U = av[0]
- mov ebx, [ebp + 32] // -> X = nv[0]
- mov edi, [ebp + 20] // -> Z = dv[0]
- mov [esp + 100], ecx
+ mov edx, [BP + 36] // n
+ mov eax, [BP + 24] // -> U = av[0]
+ mov ebx, [BP + 32] // -> X = nv[0]
+ mov edi, [BP + 20] // -> Z = dv[0]
+ mov [SP + 100], ecx
lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
lea edx, [eax + 4*edx] // -> av[n/4] = av limit
- mov [esp + 96], edi
- mov [esp + 104], edx
- mov [esp + 108], ecx
- lea ecx, [esp + 0] // -> expanded V = bv[0]
- lea esi, [esp + 32] // -> expanded M = mi
- lea edx, [esp + 64] // -> space for Y
+ mov [SP + 96], edi
+ mov [SP + 104], edx
+ mov [SP + 108], ecx
+ lea ecx, [SP + 0] // -> expanded V = bv[0]
+ lea esi, [SP + 32] // -> expanded M = mi
+ lea edx, [SP + 64] // -> space for Y
- mov esi, [esp + 104] // recover av limit
+ mov esi, [SP + 104] // recover av limit
add edi, 16
add eax, 16
add ebx, 16
cmp eax, esi // done already?
jae 8f
.p2align 4
// Complete the first inner loop.
// Embark on the next iteration. (There must be one. If n = 1, then
// we would have bailed above, to label 8. Similarly, the subsequent
// iterations can fall into the inner loop immediately.)
-1: mov eax, [esp + 100] // -> bv[i - 1]
- mov edi, [esp + 96] // -> Z = dv[i]
+1: mov eax, [SP + 100] // -> bv[i - 1]
+ mov edi, [SP + 96] // -> Z = dv[i]
add eax, 16 // -> bv[i]
pxor xmm7, xmm7
- mov [esp + 100], eax
- cmp eax, [esp + 108] // done yet?
+ mov [SP + 100], eax
+ cmp eax, [SP + 108] // done yet?
jae 9f
movdqu xmm0, [eax] // bv[i]
- mov ebx, [ebp + 32] // -> X = nv[0]
- lea esi, [esp + 32] // -> expanded M = mi
- mov eax, [ebp + 24] // -> U = av[0]
+ mov ebx, [BP + 32] // -> X = nv[0]
+ lea esi, [SP + 32] // -> expanded M = mi
+ mov eax, [BP + 24] // -> U = av[0]
- movdqa [esp + 0], xmm0 // bv[i] expanded low
- movdqa [esp + 16], xmm1 // bv[i] expanded high
+ movdqa [SP + 0], xmm0 // bv[i] expanded low
+ movdqa [SP + 16], xmm1 // bv[i] expanded high
- mov esi, [esp + 104] // recover av limit
+ mov esi, [SP + 104] // recover av limit
add edi, 16
add eax, 16
add ebx, 16
.p2align 4
// Complete the next inner loop.
popreg edi
popreg esi
popreg ebx
// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);
- // Build a stack frame. Arguments will be relative to EBP, as
+ // Build a stack frame. Arguments will be relative to BP, as
- // ebp + 20 dv
- // ebp + 24 dvl
- // ebp + 28 nv
- // ebp + 32 n (nonzero multiple of 4)
- // ebp + 36 mi
+ // BP + 20 dv
+ // BP + 24 dvl
+ // BP + 28 nv
+ // BP + 32 n (nonzero multiple of 4)
+ // BP + 36 mi
- // Locals are relative to ESP, as follows.
+ // Locals are relative to SP, as follows.
- // esp + 0 outer loop dv
- // esp + 4 outer dv limit
- // esp + 8 blocks-of-4 dv limit
- // esp + 12 expanded M (32 bytes)
- // esp + 44 expanded Y (32 bytes)
- // esp + 76 (top of locals)
- pushreg ebp
+ // SP + 0 outer loop dv
+ // SP + 4 outer dv limit
+ // SP + 8 blocks-of-4 dv limit
+ // SP + 12 expanded M (32 bytes)
+ // SP + 44 expanded Y (32 bytes)
+ // SP + 76 (top of locals)
+ pushreg BP
pushreg ebx
pushreg esi
pushreg edi
setfp
- and esp, ~15
- sub esp, 76
+ and SP, ~15
+ sub SP, 76
endprologue
// Establish the expanded operands and the blocks-of-4 dv limit.
- mov edi, [ebp + 20] // -> Z = dv[0]
+ mov edi, [BP + 20] // -> Z = dv[0]
- mov eax, [ebp + 24] // -> dv[n] = dv limit
+ mov eax, [BP + 24] // -> dv[n] = dv limit
sub eax, edi // length of dv in bytes
- mov edx, [ebp + 36] // -> mi
+ mov edx, [BP + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
expand xmm7, xmm0, xmm1
add eax, edi // find limit
- movdqa [esp + 12], xmm0 // mi expanded low
- movdqa [esp + 28], xmm1 // mi expanded high
- mov [esp + 8], eax
+ movdqa [SP + 12], xmm0 // mi expanded low
+ movdqa [SP + 28], xmm1 // mi expanded high
+ mov [SP + 8], eax
// Set up the outer loop state and prepare for the first iteration.
- mov ecx, [ebp + 32] // n
- mov ebx, [ebp + 28] // -> X = nv[0]
+ mov ecx, [BP + 32] // n
+ mov ebx, [BP + 28] // -> X = nv[0]
lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
- mov [esp + 0], edi
- mov [esp + 4], edx
- lea esi, [esp + 12] // -> expanded M = mi
- lea edx, [esp + 44] // -> space for Y
+ mov [SP + 0], edi
+ mov [SP + 4], edx
+ lea esi, [SP + 12] // -> expanded M = mi
+ lea edx, [SP + 44] // -> space for Y
call mont4
add ebx, 16
add edi, 16
// Still have carries left to propagate.
8: carryadd
- mov esi, [esp + 8] // -> dv blocks limit
- mov edx, [ebp + 24] // dv limit
+ mov esi, [SP + 8] // -> dv blocks limit
+ mov edx, [BP + 24] // dv limit
psllq xmm7, 16
pslldq xmm7, 8
paddq xmm6, xmm7
// All done for this iteration. Start the next. (This must have at
// least one follow-on iteration, or we'd not have started this outer
// loop.)
-8: mov edi, [esp + 0] // -> dv[i - 1]
- mov ebx, [ebp + 28] // -> X = nv[0]
- lea edx, [esp + 44] // -> space for Y
- lea esi, [esp + 12] // -> expanded M = mi
+8: mov edi, [SP + 0] // -> dv[i - 1]
+ mov ebx, [BP + 28] // -> X = nv[0]
+ lea edx, [SP + 44] // -> space for Y
+ lea esi, [SP + 12] // -> expanded M = mi
add edi, 16 // -> Z = dv[i]
- cmp edi, [esp + 4] // all done yet?
+ cmp edi, [SP + 4] // all done yet?
call mont4
add edi, 16
add ebx, 16
popreg edi
popreg esi
popreg ebx
.endm
.macro testprologue n
pushreg ebx
pushreg esi
pushreg edi
setfp
- and esp, ~15
- sub esp, 3*32 + 4*4
+ and SP, ~15
+ sub SP, 3*32 + 4*4
- // esp + 0 = v expanded
- // esp + 32 = y expanded
- // esp + 64 = ? expanded
- // esp + 96 = cycles
- // esp + 104 = count
+ // SP + 0 = v expanded
+ // SP + 32 = y expanded
+ // SP + 64 = ? expanded
+ // SP + 96 = cycles
+ // SP + 104 = count
.endm
.macro testepilogue
popreg edi
popreg esi
popreg ebx
mov ecx, \v
movdqu xmm0, [ecx]
expand xmm7, xmm0, xmm1
- movdqa [esp + 0], xmm0
- movdqa [esp + 16], xmm1
+ movdqa [SP + 0], xmm0
+ movdqa [SP + 16], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
expand xmm7, xmm2, xmm3
- movdqa [esp + 32], xmm2
- movdqa [esp + 48], xmm3
+ movdqa [SP + 32], xmm2
+ movdqa [SP + 48], xmm3
.p2align 4
0:
.ifnes "\u", "nil"
.endif
mov ebx, \x
.ifeqs "\mode", "mont"
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
.endif
.endm
.macro testtail cyv
- cystore esp + 96, \cyv, esp + 104
+ cystore SP + 96, \cyv, SP + 104
- testprologue [ebp + 44]
- testldcarry [ebp + 24]
- testexpand [ebp + 36], [ebp + 40]
- mov edi, [ebp + 20]
- testtop [ebp + 28], [ebp + 32]
+ testprologue [BP + 44]
+ testldcarry [BP + 24]
+ testexpand [BP + 36], [BP + 40]
+ mov edi, [BP + 20]
+ testtop [BP + 28], [BP + 32]
- testtail [ebp + 48]
- testcarryout [ebp + 24]
+ testtail [BP + 48]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_dmla4)
- testprologue [ebp + 44]
- testldcarry [ebp + 24]
- testexpand [ebp + 36], [ebp + 40]
- mov edi, [ebp + 20]
- testtop [ebp + 28], [ebp + 32]
+ testprologue [BP + 44]
+ testldcarry [BP + 24]
+ testexpand [BP + 36], [BP + 40]
+ mov edi, [BP + 20]
+ testtop [BP + 28], [BP + 32]
- testtail [ebp + 48]
- testcarryout [ebp + 24]
+ testtail [BP + 48]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_mul4)
- testprologue [ebp + 36]
- testldcarry [ebp + 24]
- testexpand nil, [ebp + 32]
- mov edi, [ebp + 20]
- testtop nil, [ebp + 28]
+ testprologue [BP + 36]
+ testldcarry [BP + 24]
+ testexpand nil, [BP + 32]
+ mov edi, [BP + 20]
+ testtop nil, [BP + 28]
- testtail [ebp + 40]
- testcarryout [ebp + 24]
+ testtail [BP + 40]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_mul4zc)
- testprologue [ebp + 36]
- testldcarry [ebp + 24]
- testexpand nil, [ebp + 32]
- mov edi, [ebp + 20]
- testtop nil, [ebp + 28]
+ testprologue [BP + 36]
+ testldcarry [BP + 24]
+ testexpand nil, [BP + 32]
+ mov edi, [BP + 20]
+ testtop nil, [BP + 28]
- testtail [ebp + 40]
- testcarryout [ebp + 24]
+ testtail [BP + 40]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_mla4)
- testprologue [ebp + 36]
- testldcarry [ebp + 24]
- testexpand nil, [ebp + 32]
- mov edi, [ebp + 20]
- testtop nil, [ebp + 28]
+ testprologue [BP + 36]
+ testldcarry [BP + 24]
+ testexpand nil, [BP + 32]
+ mov edi, [BP + 20]
+ testtop nil, [BP + 28]
- testtail [ebp + 40]
- testcarryout [ebp + 24]
+ testtail [BP + 40]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_mla4zc)
- testprologue [ebp + 36]
- testldcarry [ebp + 24]
- testexpand nil, [ebp + 32]
- mov edi, [ebp + 20]
- testtop nil, [ebp + 28]
+ testprologue [BP + 36]
+ testldcarry [BP + 24]
+ testexpand nil, [BP + 32]
+ mov edi, [BP + 20]
+ testtop nil, [BP + 28]
- testtail [ebp + 40]
- testcarryout [ebp + 24]
+ testtail [BP + 40]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_mmul4)
- testprologue [ebp + 48]
- testexpand [ebp + 40], [ebp + 44]
- mov edi, [ebp + 20]
- testtop [ebp + 32], [ebp + 36], mont
+ testprologue [BP + 48]
+ testexpand [BP + 40], [BP + 44]
+ mov edi, [BP + 20]
+ testtop [BP + 32], [BP + 36], mont
- testtail [ebp + 52]
- mov edi, [ebp + 28]
- movdqa xmm0, [esp + 64]
- movdqa xmm1, [esp + 80]
+ testtail [BP + 52]
+ mov edi, [BP + 28]
+ movdqa xmm0, [SP + 64]
+ movdqa xmm1, [SP + 80]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
- testcarryout [ebp + 24]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_mmla4)
- testprologue [ebp + 48]
- testexpand [ebp + 40], [ebp + 44]
- mov edi, [ebp + 20]
- testtop [ebp + 32], [ebp + 36], mont
+ testprologue [BP + 48]
+ testexpand [BP + 40], [BP + 44]
+ mov edi, [BP + 20]
+ testtop [BP + 32], [BP + 36], mont
- testtail [ebp + 52]
- mov edi, [ebp + 28]
- movdqa xmm0, [esp + 64]
- movdqa xmm1, [esp + 80]
+ testtail [BP + 52]
+ mov edi, [BP + 28]
+ movdqa xmm0, [SP + 64]
+ movdqa xmm1, [SP + 80]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
- testcarryout [ebp + 24]
+ testcarryout [BP + 24]
testepilogue
ENDFUNC
FUNC(test_mont4)
- testprologue [ebp + 40]
- testexpand nil, [ebp + 36]
- mov edi, [ebp + 20]
- testtop nil, [ebp + 32], mont
+ testprologue [BP + 40]
+ testexpand nil, [BP + 36]
+ mov edi, [BP + 20]
+ testtop nil, [BP + 32], mont
- testtail [ebp + 44]
- mov edi, [ebp + 28]
- movdqa xmm0, [esp + 64]
- movdqa xmm1, [esp + 80]
+ testtail [BP + 44]
+ mov edi, [BP + 28]
+ movdqa xmm0, [SP + 64]
+ movdqa xmm1, [SP + 80]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
- testcarryout [ebp + 24]
+ testcarryout [BP + 24]
// Return zero on success, or -1 on error.
#if CPUFAM_X86
stalloc 28
# define COUNT ecx
#endif
// Try to fetch a random number.
mov COUNT, 16
// Success.
1:
#if CPUFAM_X86
- mov [esp + 16], eax
- lea ecx, [esp + 16]
- mov dword ptr [esp + 12], 32
- mov dword ptr [esp + 8], 4
- mov [esp + 4], ecx
- mov [esp + 0], edx
+ mov [SP + 16], AX
+ lea ecx, [SP + 16]
+ mov dword ptr [SP + 12], 32
+ mov dword ptr [SP + 8], 4
+ mov [SP + 4], ecx
+ mov [SP + 0], edx
#endif
#if CPUFAM_AMD64 && ABI_SYSV
- mov [rsp + 0], rax
- mov rsi, rsp
+ mov [SP + 0], AX
+ mov rsi, SP
mov edx, 8
mov ecx, 64
#endif
#if CPUFAM_AMD64 && ABI_WIN
- mov [rsp + 32], rax
- lea rdx, [rsp + 32]
+ mov [SP + 32], AX
+ lea rdx, [SP + 32]
mov r8d, 8
mov r9d, 64
#endif
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
- sub esp, 16
- mov IN, [ebp + 12]
- mov OUT, [ebp + 16]
- and esp, ~15
- mov NR, [ebp + 8]
+ sub SP, 16
+ mov IN, [BP + 12]
+ mov OUT, [BP + 16]
+ and SP, ~15
+ mov NR, [BP + 8]
#endif
#if CPUFAM_AMD64 && ABI_SYSV
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
-# define SAVE1 [rsp + 0]
-# define SAVE2 [rsp + 16]
-# define SAVE3 [rsp + 32]
+# define SAVE1 [SP + 0]
+# define SAVE2 [SP + 16]
+# define SAVE3 [SP + 32]
// Tidy things up.
#if CPUFAM_X86
dropfp
#endif
#if CPUFAM_AMD64 && ABI_WIN
stfree 48 + 8
// xmm3 = // v_0 = (v_01; v_00)
movdqa xmm4, xmm0 // u_1 again
#if CPUFAM_X86
#elif CPUFAM_AMD64
movdqa xmm8, xmm3
# define V0 xmm8
pclmullqlqdq xmm4, xmm2 // u_11 v_11
pclmulhqhqdq xmm7, xmm2 // u_10 v_10
#if CPUFAM_X86
# define V0 xmm2
#endif
pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movdqu xmm0, [A]
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movq xmm0, [A]
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movq xmm0, [A + 0]
// updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 2*16 + 8
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
#if CPUFAM_AMD64 && ABI_WIN
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 8]
- mov K, [esp + 12]
- and esp, ~15
- sub esp, 16
+ mov A, [SP + 8]
+ mov K, [SP + 12]
+ and SP, ~15
+ sub SP, 16
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
movdqu [A + 0], xmm1
#if CPUFAM_X86
dropfp
#endif
#if CPUFAM_AMD64 && ABI_WIN
rstrxmm xmm6, 0
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 8]
- mov K, [esp + 12]
- and esp, ~15
+ mov A, [SP + 8]
+ mov K, [SP + 12]
+ and SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
movdqu [A + 0], xmm1
#if CPUFAM_X86
dropfp
#endif
#if CPUFAM_AMD64 && ABI_WIN
rstrxmm xmm6, 0
FUNC(rijndael_setup_x86ish_aesni)
-#define SI WHOLE(si)
-#define DI WHOLE(di)
-
#if CPUFAM_X86
// Arguments are on the stack. We'll need to stack the caller's
// register variables, but we'll manage.
-# define CTX ebp // context pointer
-# define BLKSZ [esp + 24] // block size
+# define CTX BP // context pointer
+# define BLKSZ [SP + 24] // block size
# define KSZ ebx // key size
# define NKW edx // total number of key words
# define BLKOFF edx // block size in bytes
// Stack the caller's registers.
pushreg ebx
pushreg esi
pushreg edi
// Set up our own variables.
- mov CTX, [esp + 20] // context base pointer
- mov SI, [esp + 28] // key material
- mov KSZ, [esp + 32] // key size, in words
+ mov CTX, [SP + 20] // context base pointer
+ mov SI, [SP + 28] // key material
+ mov KSZ, [SP + 32] // key size, in words
#endif
#if CPUFAM_AMD64 && ABI_SYSV
popreg edi
popreg esi
popreg ebx
#endif
#if CPUFAM_AMD64 && ABI_WIN
popreg rdi
# define DST edx
# define NR ecx
- mov K, [esp + 4]
- mov SRC, [esp + 8]
+ mov K, [SP + 4]
+ mov SRC, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_SYSV
add K, 16
pxor xmm0, xmm1
#if CPUFAM_X86
#endif
// Dispatch to the correct code.
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
-# define SAVE2 [esp + 0]
-# define SAVE3 [esp + 16]
+# define SAVE2 [SP + 0]
+# define SAVE3 [SP + 16]
- sub esp, 32
- mov IN, [ebp + 12]
- mov OUT, [ebp + 16]
- and esp, ~15
- mov NR, [ebp + 8]
+ sub SP, 32
+ mov IN, [BP + 12]
+ mov OUT, [BP + 16]
+ and SP, ~15
+ mov NR, [BP + 8]
#endif
#if CPUFAM_AMD64 && ABI_SYSV
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
-# define SAVE2 [rsp + 32]
-# define SAVE3 [rsp + 48]
+# define SAVE2 [SP + 32]
+# define SAVE3 [SP + 48]
stalloc 64 + 8
savexmm xmm6, 0
// Tidy things up.
#if CPUFAM_X86
dropfp
#endif
#if CPUFAM_AMD64 && ABI_WIN
rstrxmm xmm6, 0