From: Mark Wooding
Date: Tue, 29 Oct 2019 18:55:16 +0000 (+0000)
Subject: base/asm-common.h (x86), and knock-on: Add macros for full-size regs.
X-Git-Tag: 2.6.0~26
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/a90d420cbe87490c844ae422c966e746d3134b07

base/asm-common.h (x86), and knock-on: Add macros for full-size regs.

These registers get used a lot as pointers, so it's useful to be able
to refer to them as full-width registers more conveniently than
`R_sp(r)'.

Introduce (C preprocessor) macros `AX', ..., for this purpose, and use
them extensively. (Delete the existing `SI' and `DI' macros from
`rijndael-x86ish-aesni.S' which had the same purpose.)
---

diff --git a/base/asm-common.h b/base/asm-common.h
index 44c223da..d162a5d9 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -452,10 +452,20 @@ name:
 #endif
 #define WHOLE(reg) _REGFORM(reg, r)

+// Macros for some common registers.
+#define AX R_a(r)
+#define BX R_b(r)
+#define CX R_c(r)
+#define DX R_d(r)
+#define SI R_si(r)
+#define DI R_di(r)
+#define BP R_bp(r)
+#define SP R_sp(r)
+
 // Stack management and unwinding.
-.macro setfp fp=R_bp(r), offset=0
+.macro setfp fp=BP, offset=0
 .if \offset == 0
- mov \fp, R_sp(r)
+ mov \fp, SP
 #if __ELF__
 .cfi_def_cfa_register \fp
 #endif
@@ -463,7 +473,7 @@ name:
 .seh_setframe \fp, 0
 #endif
 .else
- lea \fp, [R_sp(r) + \offset]
+ lea \fp, [SP + \offset]
 #if __ELF__
 .cfi_def_cfa_register \fp
 .cfi_adjust_cfa_offset -\offset
@@ -478,14 +488,14 @@ name:
 .macro _dropfp fp, offset=0
 .if \offset == 0
- mov R_sp(r), \fp
+ mov SP, \fp
 #if __ELF__
- .cfi_def_cfa_register R_sp(r)
+ .cfi_def_cfa_register SP
 #endif
 .else
- lea R_sp(r), [\fp - \offset]
+ lea SP, [\fp - \offset]
 #if __ELF__
- .cfi_def_cfa_register R_sp(r)
+ .cfi_def_cfa_register SP
 .cfi_adjust_cfa_offset +\offset
 #endif
 .endif
@@ -494,7 +504,7 @@ name:
 .endm

 .macro stalloc n
- sub R_sp(r), \n
+ sub SP, \n
 #if __ELF__
 .cfi_adjust_cfa_offset +\n
 #endif
@@ -504,7 +514,7 @@ name:
 .endm

 .macro stfree n
- add R_sp(r), \n
+ add SP, \n
 #if __ELF__
 .cfi_adjust_cfa_offset -\n
 #endif
@@ -530,14 +540,14 @@ name:
 .endm

 .macro savexmm r, offset
- movdqa [R_sp(r) + \offset], \r
+ movdqa [SP + \offset], \r
 #if ABI_WIN && CPUFAM_AMD64
 .seh_savexmm \r, \offset
 #endif
 .endm

 .macro rstrxmm r, offset
- movdqa \r, [R_sp(r) + \offset]
+ movdqa \r, [SP + \offset]
 .endm

 .macro endprologue
diff --git a/base/dispatch-x86ish.S b/base/dispatch-x86ish.S
index 8c6a2a8f..c3725fc6 100644
--- a/base/dispatch-x86ish.S
+++ b/base/dispatch-x86ish.S
@@ -46,9 +46,9 @@ FUNC(dispatch_x86ish_cpuid)
 #if CPUFAM_X86
 pushreg ebx
 pushreg edi
- mov edi, [esp + 12]
- mov eax, [esp + 16]
- mov ecx, [esp + 20]
+ mov edi, [SP + 12]
+ mov eax, [SP + 16]
+ mov ecx, [SP + 20]
 # define OUT edi
 #endif
 #if CPUFAM_AMD64 && ABI_SYSV
@@ -69,21 +69,21 @@ FUNC(dispatch_x86ish_cpuid)
 // First, check that this is even a thing, using the complicated
 // dance with the flags register.
 pushf
- pop R_d(r) // current flags in d
- or R_d(r), EFLAGS_ID // force the id bit on and check it
- push R_d(r)
+ pop DX // current flags in d
+ or DX, EFLAGS_ID // force the id bit on and check it
+ push DX
 popf
 pushf
- pop R_d(r)
+ pop DX
 test edx, EFLAGS_ID
 jz 8f

- and R_d(r), ~EFLAGS_ID // force the id bit off and check it
- push R_d(r)
+ and DX, ~EFLAGS_ID // force the id bit off and check it
+ push DX
 popf
 pushf
- pop R_d(r)
+ pop DX
 test edx, EFLAGS_ID
 jnz 8f
@@ -124,32 +124,32 @@ FUNC(dispatch_x86ish_xmmregisters_p)
 // Enter with no arguments. Return nonzero if the XMM registers are
 // usable.
- pushreg R_bp(r) + pushreg BP setfp stalloc 512 - and R_sp(r), ~15 + and SP, ~15 endprologue // Save the floating point and SIMD registers, and try to clobber // xmm0. - fxsave [R_sp(r)] - mov eax, [R_sp(r) + 160] - xor dword ptr [R_sp(r) + 160], 0xaaaa5555 - fxrstor [R_sp(r)] + fxsave [SP] + mov eax, [SP + 160] + xor dword ptr [SP + 160], 0xaaaa5555 + fxrstor [SP] // Save them again, and read back the low word of xmm0. Undo the // clobbering and restore. - fxsave [R_sp(r)] - mov ecx, [R_sp(r) + 160] - mov [R_sp(r) + 160], eax - fxrstor [R_sp(r)] + fxsave [SP] + mov ecx, [SP + 160] + mov [SP + 160], eax + fxrstor [SP] // The register are live if we read different things. xor eax, ecx // Done. dropfp - popreg R_bp(r) + popreg BP ret ENDFUNC @@ -164,7 +164,7 @@ FUNC(dispatch_x86ish_rdrand) #if CPUFAM_X86 # define X_OUT edx # define COUNT ecx - mov X_OUT, [esp + 4] + mov X_OUT, [SP + 4] #endif #if CPUFAM_AMD64 && ABI_SYSV # define X_OUT rdi diff --git a/base/regdump-x86ish.S b/base/regdump-x86ish.S index e4dd8e80..67a4ae0e 100644 --- a/base/regdump-x86ish.S +++ b/base/regdump-x86ish.S @@ -56,48 +56,48 @@ FUNC(regdump_gpsave) cld // Save r/ebp and establish it pointing to the save area. - mov [R_sp(r) + WORDSZ + REGIX_BP*WORDSZ], R_bp(r) - lea R_bp(r), [R_sp(r) + WORDSZ] + mov [SP + WORDSZ + REGIX_BP*WORDSZ], BP + lea BP, [SP + WORDSZ] // Save the other easy general-purpose registers. #if !CPUFAM_X86 - mov [R_bp(r) + REGIX_BX*WORDSZ], R_b(r) + mov [BP + REGIX_BX*WORDSZ], BX #endif - mov [R_bp(r) + REGIX_CX*WORDSZ], R_c(r) - mov [R_bp(r) + REGIX_DX*WORDSZ], R_d(r) - mov [R_bp(r) + REGIX_SI*WORDSZ], R_si(r) - mov [R_bp(r) + REGIX_DI*WORDSZ], R_di(r) + mov [BP + REGIX_CX*WORDSZ], CX + mov [BP + REGIX_DX*WORDSZ], DX + mov [BP + REGIX_SI*WORDSZ], SI + mov [BP + REGIX_DI*WORDSZ], DI #if CPUFAM_AMD64 - mov [R_bp(r) + REGIX_R8*WORDSZ], R_r8(r) - mov [R_bp(r) + REGIX_R9*WORDSZ], R_r9(r) - mov [R_bp(r) + REGIX_R10*WORDSZ], R_r10(r) - mov [R_bp(r) + REGIX_R11*WORDSZ], R_r11(r) - mov [R_bp(r) + REGIX_R12*WORDSZ], R_r12(r) - mov [R_bp(r) + REGIX_R13*WORDSZ], R_r13(r) - mov [R_bp(r) + REGIX_R14*WORDSZ], R_r14(r) - mov [R_bp(r) + REGIX_R15*WORDSZ], R_r15(r) + mov [BP + REGIX_R8*WORDSZ], r8 + mov [BP + REGIX_R9*WORDSZ], r9 + mov [BP + REGIX_R10*WORDSZ], r10 + mov [BP + REGIX_R11*WORDSZ], r11 + mov [BP + REGIX_R12*WORDSZ], r12 + mov [BP + REGIX_R13*WORDSZ], r13 + mov [BP + REGIX_R14*WORDSZ], r14 + mov [BP + REGIX_R15*WORDSZ], r15 #endif // Determine the previous stack pointer and save it. #if CPUFAM_AMD64 && ABI_SYSV - lea R_a(r), [R_bp(r) + 128 + REGDUMP_GPSIZE] + lea AX, [BP + 128 + REGDUMP_GPSIZE] #else - lea R_a(r), [R_bp(r) + REGDUMP_GPSIZE] + lea AX, [BP + REGDUMP_GPSIZE] #endif - mov [R_bp(r) + REGIX_SP*WORDSZ], R_a(r) + mov [BP + REGIX_SP*WORDSZ], AX // Collect the return address and save it as r/eip. - mov R_a(r), [R_sp(r)] - mov [R_bp(r) + REGIX_IP*WORDSZ], R_a(r) + mov AX, [SP] + mov [BP + REGIX_IP*WORDSZ], AX // Save the segment registers. - lea R_a(r), [R_bp(r) + REGIX_GPLIM*WORDSZ] - mov [R_a(r) + 2*REGIX_CS], cs - mov [R_a(r) + 2*REGIX_DS], ds - mov [R_a(r) + 2*REGIX_SS], ss - mov [R_a(r) + 2*REGIX_ES], es - mov [R_a(r) + 2*REGIX_FS], fs - mov [R_a(r) + 2*REGIX_GS], gs + lea AX, [BP + REGIX_GPLIM*WORDSZ] + mov [AX + 2*REGIX_CS], cs + mov [AX + 2*REGIX_DS], ds + mov [AX + 2*REGIX_SS], ss + mov [AX + 2*REGIX_ES], es + mov [AX + 2*REGIX_FS], fs + mov [AX + 2*REGIX_GS], gs // Determine the extended save area size. 
Preserve ebx on 32-bit x86 // here, because the caller needs it for PLT-indirect calls. @@ -135,23 +135,23 @@ FUNC(regdump_gprstr) // We assume nobody actually fiddled with the segment registers. So // just the actual integer registers to do. - mov R_a(r), [R_bp(r) + REGIX_AX*WORDSZ] - mov R_b(r), [R_bp(r) + REGIX_BX*WORDSZ] - mov R_c(r), [R_bp(r) + REGIX_CX*WORDSZ] - mov R_d(r), [R_bp(r) + REGIX_DX*WORDSZ] - mov R_si(r), [R_bp(r) + REGIX_SI*WORDSZ] - mov R_di(r), [R_bp(r) + REGIX_DI*WORDSZ] + mov AX, [BP + REGIX_AX*WORDSZ] + mov BX, [BP + REGIX_BX*WORDSZ] + mov CX, [BP + REGIX_CX*WORDSZ] + mov DX, [BP + REGIX_DX*WORDSZ] + mov SI, [BP + REGIX_SI*WORDSZ] + mov DI, [BP + REGIX_DI*WORDSZ] #if CPUFAM_AMD64 - mov R_r8(r), [R_bp(r) + REGIX_R8*WORDSZ] - mov R_r9(r), [R_bp(r) + REGIX_R9*WORDSZ] - mov R_r10(r), [R_bp(r) + REGIX_R10*WORDSZ] - mov R_r11(r), [R_bp(r) + REGIX_R11*WORDSZ] - mov R_r12(r), [R_bp(r) + REGIX_R12*WORDSZ] - mov R_r13(r), [R_bp(r) + REGIX_R13*WORDSZ] - mov R_r14(r), [R_bp(r) + REGIX_R14*WORDSZ] - mov R_r15(r), [R_bp(r) + REGIX_R15*WORDSZ] + mov r8, [BP + REGIX_R8*WORDSZ] + mov r9, [BP + REGIX_R9*WORDSZ] + mov r10, [BP + REGIX_R10*WORDSZ] + mov r11, [BP + REGIX_R11*WORDSZ] + mov r12, [BP + REGIX_R12*WORDSZ] + mov r13, [BP + REGIX_R13*WORDSZ] + mov r14, [BP + REGIX_R14*WORDSZ] + mov r15, [BP + REGIX_R15*WORDSZ] #endif - mov R_bp(r), [R_bp(r) + REGIX_BP*WORDSZ] + mov BP, [BP + REGIX_BP*WORDSZ] // Done. ret @@ -175,11 +175,11 @@ FUNC(regdump_xtsave) // general registers are clobbered. // Start by filling in the easy parts of the map. - mov [R_sp(r) + WORDSZ + regmap_gp], R_bp(r) - lea R_bp(r), [R_sp(r) + WORDSZ] + mov [SP + WORDSZ + regmap_gp], BP + lea BP, [SP + WORDSZ] xor eax, eax // clears rax too on amd64 - mov [R_bp(r) + regmap_avx], R_a(r) + mov [BP + regmap_avx], AX // Find out whether we use `xsave'. (Preserve ebx.) #if CPUFAM_X86 @@ -191,40 +191,40 @@ FUNC(regdump_xtsave) je 5f // We have the `xsave' machinery. Select the base address. - lea R_si(r), [R_sp(r) + WORDSZ + regmap_size + 63] - and R_si(r), ~63 - mov [R_bp(r) + regmap_fx], R_si(r) + lea SI, [SP + WORDSZ + regmap_size + 63] + and SI, ~63 + mov [BP + regmap_fx], SI // Clear out the header area. xor eax, eax - lea R_di(r), [R_si(r) + 512] + lea DI, [SI + 512] mov ecx, 16 rep stosd // Save the registers. mov eax, 0x00000007 xor edx, edx - xsave [R_si(r)] + xsave [SI] // Establish the AVX pointer, if available. - test dword ptr [R_si(r) + 512], 4 // = xstate_bv + test dword ptr [SI + 512], 4 // = xstate_bv je 8f mov eax, 13 mov ecx, 2 cpuid - add R_b(r), R_si(r) - mov [R_bp(r) + regmap_avx], R_b(r) + add BX, SI + mov [BP + regmap_avx], BX jmp 8f // We have only `fxsave'. Set the base address. -5: lea R_si(r), [R_sp(r) + WORDSZ + regmap_size + 15] - and R_si(r), ~15 - mov [R_bp(r) + regmap_fx], R_si(r) +5: lea SI, [SP + WORDSZ + regmap_size + 15] + and SI, ~15 + mov [BP + regmap_fx], SI // Save the registers. - fxsave [R_si(r)] + fxsave [SI] // Clear the x87 state; otherwise it can cause trouble later. 8: fninit @@ -245,7 +245,7 @@ FUNC(regdump_xtrstr) // 32-bit x86, and the other general registers are clobbered. // Find the extended register dump. - mov R_si(r), [R_bp(r) + regmap_fx] + mov SI, [BP + regmap_fx] // Probe to find out whether we have `xsave'. #if CPUFAM_X86 @@ -259,14 +259,14 @@ FUNC(regdump_xtrstr) // We have the `xsave' machinery. mov eax, 0x00000007 xor edx, edx - xrstor [R_si(r)] + xrstor [SI] jmp 8f // We must fake it up. -1: fxrstor [R_si(r)] +1: fxrstor [SI] // Done. 
-8: mov R_bp(r), [R_bp(r) + regmap_gp] +8: mov BP, [BP + regmap_gp] #if CPUFAM_X86 pop ebx #endif diff --git a/base/regdump.h b/base/regdump.h index db968642..3ada7eac 100644 --- a/base/regdump.h +++ b/base/regdump.h @@ -381,32 +381,32 @@ DO8(REGDEF_SIMD) // Stash r/eax. This is bletcherous: hope we don't get a signal in // the next few instructions. - mov [R_sp(r) - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], R_a(r) + mov [SP - REGDUMP_SPADJ + (REGIX_AX - 1)*WORDSZ], AX .ifnes "\addr", "nil" // Collect the effective address for the following dump, leaving it // in the `addr' slot of the dump. - lea R_a(r), \addr - mov [R_sp(r) - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], R_a(r) + lea AX, \addr + mov [SP - REGDUMP_SPADJ + (REGIX_ADDR - 1)*WORDSZ], AX .endif // Make space for the register save area. On AMD64 with System/V // ABI, also skip the red zone. Use `lea' here to preserve the // flags. - lea R_sp(r), [R_sp(r) - REGDUMP_SPADJ] + lea SP, [SP - REGDUMP_SPADJ] // Save flags and general-purpose registers. On 32-bit x86, we save // ebx here and establish a GOT pointer here for the benefit of the // PLT-indirect calls made later on. pushf # if CPUFAM_X86 - mov [esp + 4*REGIX_BX], ebx + mov [SP + 4*REGIX_BX], ebx ldgot # endif callext F(regdump_gpsave) // Make space for the extended registers. - sub R_sp(r), R_c(r) + sub SP, CX callext F(regdump_xtsave) // Prepare for calling back into C. On 32-bit x86, leave space for @@ -414,11 +414,11 @@ DO8(REGDEF_SIMD) // the `shadow space' for the called-function's arguments. Also, // forcibly align the stack pointer to a 16-byte boundary. # if CPUFAM_X86 - sub esp, 16 + sub SP, 16 # elif ABI_WIN - sub rsp, 32 + sub SP, 32 # endif - and R_sp(r), ~15 + and SP, ~15 .endm .macro _rstrregs @@ -426,27 +426,27 @@ DO8(REGDEF_SIMD) // We assume r/ebp still points to the register map. 
callext F(regdump_xtrstr) - mov R_sp(r), R_bp(r) + mov SP, BP callext F(regdump_gprstr) popf - lea R_sp(r), [R_sp(r) + REGDUMP_SPADJ] + lea SP, [SP + REGDUMP_SPADJ] .endm .macro _regbase # if CPUFAM_X86 - mov [esp + 0], ebp + mov [SP + 0], BP # elif ABI_SYSV - mov rdi, rbp + mov rdi, BP # elif ABI_WIN - mov rcx, rbp + mov rcx, BP # endif .endm .macro _membase - mov R_a(r), [R_bp(r) + regmap_gp] + mov AX, [BP + regmap_gp] # if CPUFAM_X86 mov eax, [eax + REGIX_ADDR*WORDSZ] - mov [esp + 0], eax + mov [SP + 0], eax # elif ABI_SYSV mov rdi, [rax + REGIX_ADDR*WORDSZ] # elif ABI_WIN @@ -457,7 +457,7 @@ DO8(REGDEF_SIMD) .macro _reglbl msg .ifeqs "\msg", "" # if CPUFAM_X86 - mov dword ptr [esp + 4], 0 + mov dword ptr [SP + 4], 0 # elif ABI_SYSV xor esi, esi # elif ABI_WIN @@ -466,7 +466,7 @@ DO8(REGDEF_SIMD) .else # if CPUFAM_X86 lea eax, [INTADDR(.L$_reglbl$\@)] - mov [esp + 4], eax + mov [SP + 4], eax # elif ABI_SYSV lea rsi, [INTADDR(.L$_reglbl$\@)] # elif ABI_WIN @@ -481,7 +481,7 @@ DO8(REGDEF_SIMD) .macro _regfmt arg # if CPUFAM_X86 - mov dword ptr [esp + 8], \arg + mov dword ptr [SP + 8], \arg # elif ABI_SYSV mov edx, \arg # elif ABI_WIN diff --git a/base/test-regdump-x86ish.S b/base/test-regdump-x86ish.S index a8c8d435..41ba77f7 100644 --- a/base/test-regdump-x86ish.S +++ b/base/test-regdump-x86ish.S @@ -10,9 +10,9 @@ vec: FUNC(main) - pushreg R_bp(r) + pushreg BP setfp - and R_sp(r), ~15 + and SP, ~15 endprologue fldz @@ -32,7 +32,7 @@ FUNC(main) xor eax, eax dropfp - popreg R_bp(r) + popreg BP ret ENDFUNC diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index d313765f..51e94c58 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -575,10 +575,10 @@ INTFUNC(mmla4) movdqu xmm4, [rax] #if ABI_WIN stalloc 48 + 8 // space for the carries -# define STKTMP(i) [rsp + i] +# define STKTMP(i) [SP + i] #endif #if ABI_SYSV -# define STKTMP(i) [rsp + i - 48 - 8] // use red zone +# define STKTMP(i) [SP + i - 48 - 8] // use red zone #endif endprologue @@ -811,7 +811,7 @@ FUNC(mpx_umul4_amd64_sse2) endprologue mov rdi, DV - mov BVL, [rsp + 224] + mov BVL, [SP + 224] #endif @@ -978,8 +978,8 @@ FUNC(mpxmont_mul4_amd64_sse2) endprologue mov rdi, DV - mov N, [rsp + 224] - mov MI, [rsp + 232] + mov N, [SP + 224] + mov MI, [SP + 232] #endif @@ -1183,7 +1183,7 @@ FUNC(mpxmont_redc4_amd64_sse2) endprologue mov rdi, DV - mov MI, [rsp + 224] + mov MI, [SP + 224] #endif @@ -1329,7 +1329,7 @@ ENDFUNC # define ARG8 STKARG(4) # define STKARG_OFFSET 224 #endif -#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] +#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)] // sysv win // dmul smul mmul mont dmul smul mmul mont diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 904c0d0a..ba7ae6a3 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -561,9 +561,9 @@ INTFUNC(mmla4) mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t propout [edi + 12], xmm7, xmm4 - movdqa [esp + 0], xmm4 - movdqa [esp + 16], xmm5 - movdqa [esp + 32], xmm6 + movdqa [SP + 0], xmm4 + movdqa [SP + 16], xmm5 + movdqa [SP + 32], xmm6 // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 @@ -606,9 +606,9 @@ INTFUNC(mmla4) propout [edi + 12], xmm7, xmm4 // Add add on the carry we calculated earlier. - paddq xmm4, [esp + 0] - paddq xmm5, [esp + 16] - paddq xmm6, [esp + 32] + paddq xmm4, [SP + 0] + paddq xmm5, [SP + 16] + paddq xmm6, [SP + 32] // And, with that, we're done. 
stfree 48 + 12 @@ -688,40 +688,40 @@ FUNC(mpx_umul4_x86_sse2) // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl, // const mpw *bv, const mpw *bvl); - // Build a stack frame. Arguments will be relative to EBP, as + // Build a stack frame. Arguments will be relative to BP, as // follows. // - // ebp + 20 dv - // ebp + 24 av - // ebp + 28 avl - // ebp + 32 bv - // ebp + 36 bvl + // BP + 20 dv + // BP + 24 av + // BP + 28 avl + // BP + 32 bv + // BP + 36 bvl // - // Locals are relative to ESP, as follows. + // Locals are relative to SP, as follows. // - // esp + 0 expanded Y (32 bytes) - // esp + 32 (top of locals) - pushreg ebp + // SP + 0 expanded Y (32 bytes) + // SP + 32 (top of locals) + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 32 + and SP, ~15 + sub SP, 32 endprologue // Prepare for the first iteration. - mov esi, [ebp + 32] // -> bv[0] + mov esi, [BP + 32] // -> bv[0] pxor xmm7, xmm7 movdqu xmm0, [esi] // bv[0] - mov edi, [ebp + 20] // -> dv[0] + mov edi, [BP + 20] // -> dv[0] mov ecx, edi // outer loop dv cursor expand xmm7, xmm0, xmm1 - mov ebx, [ebp + 24] // -> av[0] - mov eax, [ebp + 28] // -> av[m] = av limit - mov edx, esp // -> expanded Y = bv[0] - movdqa [esp + 0], xmm0 // bv[0] expanded low - movdqa [esp + 16], xmm1 // bv[0] expanded high + mov ebx, [BP + 24] // -> av[0] + mov eax, [BP + 28] // -> av[m] = av limit + mov edx, SP // -> expanded Y = bv[0] + movdqa [SP + 0], xmm0 // bv[0] expanded low + movdqa [SP + 16], xmm1 // bv[0] expanded high call mul4zc add ebx, 16 add edi, 16 @@ -740,7 +740,7 @@ FUNC(mpx_umul4_x86_sse2) // Write out the leftover carry. There can be no tail here. 8: call carryprop - cmp esi, [ebp + 36] // more passes to do? + cmp esi, [BP + 36] // more passes to do? jae 9f .p2align 4 @@ -749,9 +749,9 @@ FUNC(mpx_umul4_x86_sse2) mov edi, ecx // -> dv[i] pxor xmm7, xmm7 expand xmm7, xmm0, xmm1 - mov ebx, [ebp + 24] // -> av[0] - movdqa [esp + 0], xmm0 // bv[i] expanded low - movdqa [esp + 16], xmm1 // bv[i] expanded high + mov ebx, [BP + 24] // -> av[0] + movdqa [SP + 0], xmm0 // bv[i] expanded low + movdqa [SP + 16], xmm1 // bv[i] expanded high call mla4zc add edi, 16 add ebx, 16 @@ -771,7 +771,7 @@ FUNC(mpx_umul4_x86_sse2) // Finish off this pass. There was no tail on the previous pass, and // there can be none on this pass. 8: call carryprop - cmp esi, [ebp + 36] + cmp esi, [BP + 36] jb 1b // All over. @@ -779,7 +779,7 @@ FUNC(mpx_umul4_x86_sse2) pop edi pop esi pop ebx - pop ebp + pop BP ret ENDFUNC @@ -796,69 +796,69 @@ FUNC(mpxmont_mul4_x86_sse2) // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); - // Build a stack frame. Arguments will be relative to EBP, as + // Build a stack frame. Arguments will be relative to BP, as // follows. // - // ebp + 20 dv - // ebp + 24 av - // ebp + 28 bv - // ebp + 32 nv - // ebp + 36 n (nonzero multiple of 4) - // ebp + 40 mi + // BP + 20 dv + // BP + 24 av + // BP + 28 bv + // BP + 32 nv + // BP + 36 n (nonzero multiple of 4) + // BP + 40 mi // - // Locals are relative to ESP, which 16-byte aligned, as follows. + // Locals are relative to SP, which 16-byte aligned, as follows. 
// - // esp + 0 expanded V (32 bytes) - // esp + 32 expanded M (32 bytes) - // esp + 64 expanded Y (32 bytes) - // esp + 96 outer loop dv - // esp + 100 outer loop bv - // esp + 104 av limit (mostly in ESI) - // esp + 108 bv limit - // esp + 112 (top of locals) - pushreg ebp + // SP + 0 expanded V (32 bytes) + // SP + 32 expanded M (32 bytes) + // SP + 64 expanded Y (32 bytes) + // SP + 96 outer loop dv + // SP + 100 outer loop bv + // SP + 104 av limit (mostly in ESI) + // SP + 108 bv limit + // SP + 112 (top of locals) + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 112 + and SP, ~15 + sub SP, 112 endprologue // Establish the expanded operands. pxor xmm7, xmm7 - mov ecx, [ebp + 28] // -> bv - mov edx, [ebp + 40] // -> mi + mov ecx, [BP + 28] // -> bv + mov edx, [BP + 40] // -> mi movdqu xmm0, [ecx] // bv[0] movdqu xmm2, [edx] // mi expand xmm7, xmm0, xmm1, xmm2, xmm3 - movdqa [esp + 0], xmm0 // bv[0] expanded low - movdqa [esp + 16], xmm1 // bv[0] expanded high - movdqa [esp + 32], xmm2 // mi expanded low - movdqa [esp + 48], xmm3 // mi expanded high + movdqa [SP + 0], xmm0 // bv[0] expanded low + movdqa [SP + 16], xmm1 // bv[0] expanded high + movdqa [SP + 32], xmm2 // mi expanded low + movdqa [SP + 48], xmm3 // mi expanded high // Set up the outer loop state and prepare for the first iteration. - mov edx, [ebp + 36] // n - mov eax, [ebp + 24] // -> U = av[0] - mov ebx, [ebp + 32] // -> X = nv[0] - mov edi, [ebp + 20] // -> Z = dv[0] - mov [esp + 100], ecx + mov edx, [BP + 36] // n + mov eax, [BP + 24] // -> U = av[0] + mov ebx, [BP + 32] // -> X = nv[0] + mov edi, [BP + 20] // -> Z = dv[0] + mov [SP + 100], ecx lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit lea edx, [eax + 4*edx] // -> av[n/4] = av limit - mov [esp + 96], edi - mov [esp + 104], edx - mov [esp + 108], ecx - lea ecx, [esp + 0] // -> expanded V = bv[0] - lea esi, [esp + 32] // -> expanded M = mi - lea edx, [esp + 64] // -> space for Y + mov [SP + 96], edi + mov [SP + 104], edx + mov [SP + 108], ecx + lea ecx, [SP + 0] // -> expanded V = bv[0] + lea esi, [SP + 32] // -> expanded M = mi + lea edx, [SP + 64] // -> space for Y call mmul4 - mov esi, [esp + 104] // recover av limit + mov esi, [SP + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 cmp eax, esi // done already? jae 8f - mov [esp + 96], edi + mov [SP + 96], edi .p2align 4 // Complete the first inner loop. @@ -877,26 +877,26 @@ FUNC(mpxmont_mul4_x86_sse2) // Embark on the next iteration. (There must be one. If n = 1, then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) -1: mov eax, [esp + 100] // -> bv[i - 1] - mov edi, [esp + 96] // -> Z = dv[i] +1: mov eax, [SP + 100] // -> bv[i - 1] + mov edi, [SP + 96] // -> Z = dv[i] add eax, 16 // -> bv[i] pxor xmm7, xmm7 - mov [esp + 100], eax - cmp eax, [esp + 108] // done yet? + mov [SP + 100], eax + cmp eax, [SP + 108] // done yet? 
jae 9f movdqu xmm0, [eax] // bv[i] - mov ebx, [ebp + 32] // -> X = nv[0] - lea esi, [esp + 32] // -> expanded M = mi - mov eax, [ebp + 24] // -> U = av[0] + mov ebx, [BP + 32] // -> X = nv[0] + lea esi, [SP + 32] // -> expanded M = mi + mov eax, [BP + 24] // -> U = av[0] expand xmm7, xmm0, xmm1 - movdqa [esp + 0], xmm0 // bv[i] expanded low - movdqa [esp + 16], xmm1 // bv[i] expanded high + movdqa [SP + 0], xmm0 // bv[i] expanded low + movdqa [SP + 16], xmm1 // bv[i] expanded high call mmla4 - mov esi, [esp + 104] // recover av limit + mov esi, [SP + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 - mov [esp + 96], edi + mov [SP + 96], edi .p2align 4 // Complete the next inner loop. @@ -928,7 +928,7 @@ FUNC(mpxmont_mul4_x86_sse2) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret ENDFUNC @@ -945,55 +945,55 @@ FUNC(mpxmont_redc4_x86_sse2) // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); - // Build a stack frame. Arguments will be relative to EBP, as + // Build a stack frame. Arguments will be relative to BP, as // follows. // - // ebp + 20 dv - // ebp + 24 dvl - // ebp + 28 nv - // ebp + 32 n (nonzero multiple of 4) - // ebp + 36 mi + // BP + 20 dv + // BP + 24 dvl + // BP + 28 nv + // BP + 32 n (nonzero multiple of 4) + // BP + 36 mi // - // Locals are relative to ESP, as follows. + // Locals are relative to SP, as follows. // - // esp + 0 outer loop dv - // esp + 4 outer dv limit - // esp + 8 blocks-of-4 dv limit - // esp + 12 expanded M (32 bytes) - // esp + 44 expanded Y (32 bytes) - // esp + 76 (top of locals) - pushreg ebp + // SP + 0 outer loop dv + // SP + 4 outer dv limit + // SP + 8 blocks-of-4 dv limit + // SP + 12 expanded M (32 bytes) + // SP + 44 expanded Y (32 bytes) + // SP + 76 (top of locals) + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 76 + and SP, ~15 + sub SP, 76 endprologue // Establish the expanded operands and the blocks-of-4 dv limit. - mov edi, [ebp + 20] // -> Z = dv[0] + mov edi, [BP + 20] // -> Z = dv[0] pxor xmm7, xmm7 - mov eax, [ebp + 24] // -> dv[n] = dv limit + mov eax, [BP + 24] // -> dv[n] = dv limit sub eax, edi // length of dv in bytes - mov edx, [ebp + 36] // -> mi + mov edx, [BP + 36] // -> mi movdqu xmm0, [edx] // mi and eax, ~15 // mask off the tail end expand xmm7, xmm0, xmm1 add eax, edi // find limit - movdqa [esp + 12], xmm0 // mi expanded low - movdqa [esp + 28], xmm1 // mi expanded high - mov [esp + 8], eax + movdqa [SP + 12], xmm0 // mi expanded low + movdqa [SP + 28], xmm1 // mi expanded high + mov [SP + 8], eax // Set up the outer loop state and prepare for the first iteration. - mov ecx, [ebp + 32] // n - mov ebx, [ebp + 28] // -> X = nv[0] + mov ecx, [BP + 32] // n + mov ebx, [BP + 28] // -> X = nv[0] lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit - mov [esp + 0], edi - mov [esp + 4], edx - lea esi, [esp + 12] // -> expanded M = mi - lea edx, [esp + 44] // -> space for Y + mov [SP + 0], edi + mov [SP + 4], edx + lea esi, [SP + 12] // -> expanded M = mi + lea edx, [SP + 44] // -> space for Y call mont4 add ebx, 16 add edi, 16 @@ -1010,8 +1010,8 @@ FUNC(mpxmont_redc4_x86_sse2) // Still have carries left to propagate. 
8: carryadd - mov esi, [esp + 8] // -> dv blocks limit - mov edx, [ebp + 24] // dv limit + mov esi, [SP + 8] // -> dv blocks limit + mov edx, [BP + 24] // dv limit psllq xmm7, 16 pslldq xmm7, 8 paddq xmm6, xmm7 @@ -1044,14 +1044,14 @@ FUNC(mpxmont_redc4_x86_sse2) // All done for this iteration. Start the next. (This must have at // least one follow-on iteration, or we'd not have started this outer // loop.) -8: mov edi, [esp + 0] // -> dv[i - 1] - mov ebx, [ebp + 28] // -> X = nv[0] - lea edx, [esp + 44] // -> space for Y - lea esi, [esp + 12] // -> expanded M = mi +8: mov edi, [SP + 0] // -> dv[i - 1] + mov ebx, [BP + 28] // -> X = nv[0] + lea edx, [SP + 44] // -> space for Y + lea esi, [SP + 12] // -> expanded M = mi add edi, 16 // -> Z = dv[i] - cmp edi, [esp + 4] // all done yet? + cmp edi, [SP + 4] // all done yet? jae 9f - mov [esp + 0], edi + mov [SP + 0], edi call mont4 add edi, 16 add ebx, 16 @@ -1062,7 +1062,7 @@ FUNC(mpxmont_redc4_x86_sse2) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret ENDFUNC @@ -1091,22 +1091,22 @@ ENDFUNC .endm .macro testprologue n - pushreg ebp + pushreg BP pushreg ebx pushreg esi pushreg edi setfp - and esp, ~15 - sub esp, 3*32 + 4*4 + and SP, ~15 + sub SP, 3*32 + 4*4 endprologue mov eax, \n - mov [esp + 104], eax + mov [SP + 104], eax // vars: - // esp + 0 = v expanded - // esp + 32 = y expanded - // esp + 64 = ? expanded - // esp + 96 = cycles - // esp + 104 = count + // SP + 0 = v expanded + // SP + 32 = y expanded + // SP + 64 = ? expanded + // SP + 96 = cycles + // SP + 104 = count .endm .macro testepilogue @@ -1114,7 +1114,7 @@ ENDFUNC popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret .endm @@ -1131,15 +1131,15 @@ ENDFUNC mov ecx, \v movdqu xmm0, [ecx] expand xmm7, xmm0, xmm1 - movdqa [esp + 0], xmm0 - movdqa [esp + 16], xmm1 + movdqa [SP + 0], xmm0 + movdqa [SP + 16], xmm1 .endif .ifnes "\y", "nil" mov edx, \y movdqu xmm2, [edx] expand xmm7, xmm2, xmm3 - movdqa [esp + 32], xmm2 - movdqa [esp + 48], xmm3 + movdqa [SP + 32], xmm2 + movdqa [SP + 48], xmm3 .endif .endm @@ -1147,25 +1147,25 @@ ENDFUNC .p2align 4 0: .ifnes "\u", "nil" - lea ecx, [esp + 0] + lea ecx, [SP + 0] .endif mov ebx, \x .ifeqs "\mode", "mont" - lea esi, [esp + 32] + lea esi, [SP + 32] .endif - cysetup esp + 96 + cysetup SP + 96 .ifnes "\u", "nil" mov eax, \u .endif .ifeqs "\mode", "mont" - lea edx, [esp + 64] + lea edx, [SP + 64] .else - lea edx, [esp + 32] + lea edx, [SP + 32] .endif .endm .macro testtail cyv - cystore esp + 96, \cyv, esp + 104 + cystore SP + 96, \cyv, SP + 104 jnz 0b .endm @@ -1177,122 +1177,122 @@ ENDFUNC .endm FUNC(test_dmul4) - testprologue [ebp + 44] - testldcarry [ebp + 24] - testexpand [ebp + 36], [ebp + 40] - mov edi, [ebp + 20] - testtop [ebp + 28], [ebp + 32] + testprologue [BP + 44] + testldcarry [BP + 24] + testexpand [BP + 36], [BP + 40] + mov edi, [BP + 20] + testtop [BP + 28], [BP + 32] call dmul4 - testtail [ebp + 48] - testcarryout [ebp + 24] + testtail [BP + 48] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_dmla4) - testprologue [ebp + 44] - testldcarry [ebp + 24] - testexpand [ebp + 36], [ebp + 40] - mov edi, [ebp + 20] - testtop [ebp + 28], [ebp + 32] + testprologue [BP + 44] + testldcarry [BP + 24] + testexpand [BP + 36], [BP + 40] + mov edi, [BP + 20] + testtop [BP + 28], [BP + 32] call dmla4 - testtail [ebp + 48] - testcarryout [ebp + 24] + testtail [BP + 48] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mul4) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov 
edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mul4 - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mul4zc) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mul4zc - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mla4) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mla4 - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mla4zc) - testprologue [ebp + 36] - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mla4zc - testtail [ebp + 40] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mmul4) - testprologue [ebp + 48] - testexpand [ebp + 40], [ebp + 44] - mov edi, [ebp + 20] - testtop [ebp + 32], [ebp + 36], mont + testprologue [BP + 48] + testexpand [BP + 40], [BP + 44] + mov edi, [BP + 20] + testtop [BP + 32], [BP + 36], mont call mmul4 - testtail [ebp + 52] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 64] - movdqa xmm1, [esp + 80] + testtail [BP + 52] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mmla4) - testprologue [ebp + 48] - testexpand [ebp + 40], [ebp + 44] - mov edi, [ebp + 20] - testtop [ebp + 32], [ebp + 36], mont + testprologue [BP + 48] + testexpand [BP + 40], [BP + 44] + mov edi, [BP + 20] + testtop [BP + 32], [BP + 36], mont call mmla4 - testtail [ebp + 52] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 64] - movdqa xmm1, [esp + 80] + testtail [BP + 52] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mont4) - testprologue [ebp + 40] - testexpand nil, [ebp + 36] - mov edi, [ebp + 20] - testtop nil, [ebp + 32], mont + testprologue [BP + 40] + testexpand nil, [BP + 36] + mov edi, [BP + 20] + testtop nil, [BP + 32], mont call mont4 - testtail [ebp + 44] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 64] - movdqa xmm1, [esp + 80] + testtail [BP + 44] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC diff --git a/rand/rand-x86ish.S b/rand/rand-x86ish.S index 829bc2cd..61de2b84 100644 --- a/rand/rand-x86ish.S +++ b/rand/rand-x86ish.S @@ -42,7 +42,7 @@ FUNC(rand_quick_x86ish_rdrand) // Return zero on success, or -1 on error. 
#if CPUFAM_X86 - mov edx, [esp + 4] + mov edx, [SP + 4] stalloc 28 # define COUNT ecx #endif @@ -58,7 +58,7 @@ FUNC(rand_quick_x86ish_rdrand) // Try to fetch a random number. mov COUNT, 16 -0: rdrand R_a(r) +0: rdrand AX jc 1f dec COUNT jnz 0b @@ -70,22 +70,22 @@ FUNC(rand_quick_x86ish_rdrand) // Success. 1: #if CPUFAM_X86 - mov [esp + 16], eax - lea ecx, [esp + 16] - mov dword ptr [esp + 12], 32 - mov dword ptr [esp + 8], 4 - mov [esp + 4], ecx - mov [esp + 0], edx + mov [SP + 16], AX + lea ecx, [SP + 16] + mov dword ptr [SP + 12], 32 + mov dword ptr [SP + 8], 4 + mov [SP + 4], ecx + mov [SP + 0], edx #endif #if CPUFAM_AMD64 && ABI_SYSV - mov [rsp + 0], rax - mov rsi, rsp + mov [SP + 0], AX + mov rsi, SP mov edx, 8 mov ecx, 64 #endif #if CPUFAM_AMD64 && ABI_WIN - mov [rsp + 32], rax - lea rdx, [rsp + 32] + mov [SP + 32], AX + lea rdx, [SP + 32] mov r8d, 8 mov r9d, 64 #endif diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S index 3fb623af..33af65f0 100644 --- a/symm/chacha-x86ish-sse2.S +++ b/symm/chacha-x86ish-sse2.S @@ -66,15 +66,15 @@ FUNC(chacha_core_x86ish_sse2) # define SAVE0 xmm5 # define SAVE1 xmm6 # define SAVE2 xmm7 -# define SAVE3 [esp] +# define SAVE3 [SP] - pushreg ebp + pushreg BP setfp - sub esp, 16 - mov IN, [ebp + 12] - mov OUT, [ebp + 16] - and esp, ~15 - mov NR, [ebp + 8] + sub SP, 16 + mov IN, [BP + 12] + mov OUT, [BP + 16] + and SP, ~15 + mov NR, [BP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -105,9 +105,9 @@ FUNC(chacha_core_x86ish_sse2) # define IN rdx # define OUT r8 # define SAVE0 xmm5 -# define SAVE1 [rsp + 0] -# define SAVE2 [rsp + 16] -# define SAVE3 [rsp + 32] +# define SAVE1 [SP + 0] +# define SAVE2 [SP + 16] +# define SAVE3 [SP + 32] stalloc 48 + 8 #endif @@ -248,7 +248,7 @@ FUNC(chacha_core_x86ish_sse2) // Tidy things up. #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN stfree 48 + 8 diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S index e60b7cab..092242bc 100644 --- a/symm/gcm-x86ish-pclmul.S +++ b/symm/gcm-x86ish-pclmul.S @@ -576,7 +576,7 @@ // xmm3 = // v_0 = (v_01; v_00) movdqa xmm4, xmm0 // u_1 again #if CPUFAM_X86 - movdqa [esp + 0], xmm3 + movdqa [SP + 0], xmm3 #elif CPUFAM_AMD64 movdqa xmm8, xmm3 # define V0 xmm8 @@ -608,7 +608,7 @@ pclmullqlqdq xmm4, xmm2 // u_11 v_11 pclmulhqhqdq xmm7, xmm2 // u_10 v_10 #if CPUFAM_X86 - movdqa xmm2, [esp + 0] + movdqa xmm2, [SP + 0] # define V0 xmm2 #endif pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10 @@ -771,8 +771,8 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif endprologue movdqu xmm0, [A] @@ -790,8 +790,8 @@ SSEFUNC(gcm_mulk_128l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif endprologue @@ -811,8 +811,8 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif endprologue movq xmm0, [A] @@ -830,8 +830,8 @@ SSEFUNC(gcm_mulk_64l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif endprologue @@ -852,8 +852,8 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul) // with the product A K. 
#if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif endprologue movq xmm0, [A + 0] @@ -876,8 +876,8 @@ SSEFUNC(gcm_mulk_96l_x86ish_pclmul) // updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif endprologue @@ -901,8 +901,8 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] #endif #if CPUFAM_AMD64 && ABI_WIN stalloc 2*16 + 8 @@ -935,8 +935,8 @@ SSEFUNC(gcm_mulk_192l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - mov A, [esp + 4] - mov K, [esp + 8] + mov A, [SP + 4] + mov K, [SP + 8] ldgot ecx #endif #if CPUFAM_AMD64 && ABI_WIN @@ -970,12 +970,12 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) // A is updated with the product A K. #if CPUFAM_X86 - pushreg ebp + pushreg BP setfp - mov A, [esp + 8] - mov K, [esp + 12] - and esp, ~15 - sub esp, 16 + mov A, [SP + 8] + mov K, [SP + 12] + and SP, ~15 + sub SP, 16 #endif #if CPUFAM_AMD64 && ABI_WIN stalloc 3*16 + 8 @@ -997,7 +997,7 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) movdqu [A + 0], xmm1 #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0 @@ -1014,13 +1014,13 @@ SSEFUNC(gcm_mulk_256l_x86ish_pclmul) // exit, A is updated with the product A K. #if CPUFAM_X86 - pushreg ebp + pushreg BP setfp - mov A, [esp + 8] - mov K, [esp + 12] - and esp, ~15 + mov A, [SP + 8] + mov K, [SP + 12] + and SP, ~15 ldgot ecx - sub esp, 16 + sub SP, 16 #endif #if CPUFAM_AMD64 && ABI_WIN stalloc 3*16 + 8 @@ -1044,7 +1044,7 @@ SSEFUNC(gcm_mulk_256l_x86ish_pclmul) movdqu [A + 0], xmm1 #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0 diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index 6d9b3b22..f5e5cc9c 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -70,15 +70,12 @@ ENDFUNC FUNC(rijndael_setup_x86ish_aesni) -#define SI WHOLE(si) -#define DI WHOLE(di) - #if CPUFAM_X86 // Arguments are on the stack. We'll need to stack the caller's // register veriables, but we'll manage. -# define CTX ebp // context pointer -# define BLKSZ [esp + 24] // block size +# define CTX BP // context pointer +# define BLKSZ [SP + 24] // block size # define KSZ ebx // key size # define NKW edx // total number of key words @@ -92,15 +89,15 @@ FUNC(rijndael_setup_x86ish_aesni) # define BLKOFF edx // block size in bytes // Stack the caller's registers. - pushreg ebp + pushreg BP pushreg ebx pushreg esi pushreg edi // Set up our own variables. - mov CTX, [esp + 20] // context base pointer - mov SI, [esp + 28] // key material - mov KSZ, [esp + 32] // key size, in words + mov CTX, [SP + 20] // context base pointer + mov SI, [SP + 28] // key material + mov KSZ, [SP + 32] // key size, in words #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -330,7 +327,7 @@ FUNC(rijndael_setup_x86ish_aesni) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN popreg rdi @@ -389,8 +386,8 @@ ENDFUNC # define DST edx # define NR ecx - mov K, [esp + 4] - mov SRC, [esp + 8] + mov K, [SP + 4] + mov SRC, [SP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -428,7 +425,7 @@ ENDFUNC add K, 16 pxor xmm0, xmm1 #if CPUFAM_X86 - mov DST, [esp + 12] + mov DST, [SP + 12] #endif // Dispatch to the correct code. 
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S index 5dc9c17c..eb346afe 100644 --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@ -65,16 +65,16 @@ FUNC(salsa20_core_x86ish_sse2) # define OUT edx # define SAVE0 xmm6 # define SAVE1 xmm7 -# define SAVE2 [esp + 0] -# define SAVE3 [esp + 16] +# define SAVE2 [SP + 0] +# define SAVE3 [SP + 16] - pushreg ebp + pushreg BP setfp - sub esp, 32 - mov IN, [ebp + 12] - mov OUT, [ebp + 16] - and esp, ~15 - mov NR, [ebp + 8] + sub SP, 32 + mov IN, [BP + 12] + mov OUT, [BP + 16] + and SP, ~15 + mov NR, [BP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -107,8 +107,8 @@ FUNC(salsa20_core_x86ish_sse2) # define OUT r8 # define SAVE0 xmm6 # define SAVE1 xmm7 -# define SAVE2 [rsp + 32] -# define SAVE3 [rsp + 48] +# define SAVE2 [SP + 32] +# define SAVE3 [SP + 48] stalloc 64 + 8 savexmm xmm6, 0 @@ -301,7 +301,7 @@ FUNC(salsa20_core_x86ish_sse2) // Tidy things up. #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0
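
The hunks above and below all make the same mechanical substitution, so a
rough sketch of what the new names expand to may help when reading them.
This sketch is not part of the commit: the `R_a'/`R_sp' bodies shown are
simplified stand-ins for the real `_REGFORM'-based definitions in
base/asm-common.h, and only two of the registers are shown.

    // Illustrative sketch only -- not part of this patch.  The real
    // definitions in base/asm-common.h go through _REGFORM() and cover
    // every general-purpose register; these stand-ins just show the
    // idea: one spelling that names the full-width register on either
    // CPU family.
    #if CPUFAM_AMD64
    #  define R_a(fmt) rax          // assumed simplification
    #  define R_sp(fmt) rsp
    #elif CPUFAM_X86
    #  define R_a(fmt) eax
    #  define R_sp(fmt) esp
    #endif

    #define AX R_a(r)               // as added by this commit
    #define SP R_sp(r)

        // so that a line of assembler such as
        mov     AX, [SP + 4]
        // assembles as `mov rax, [rsp + 4]' on AMD64 and as
        // `mov eax, [esp + 4]' on 32-bit x86.

The gain is purely notational: `mov AX, [SP + 4]' reads the same in 32- and
64-bit sources, where previously one would write
`mov R_a(r), [R_sp(r) + 4]'.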