// u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
//
// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
// Then
//
// ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
// = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
//
// which is almost the bit-reversal of u v, only it's shifted right
// xmm3 = // v_0 = (v_01; v_00)
movdqa xmm4, xmm0 // u_1 again
#if CPUFAM_X86
- movdqa [esp + 0], xmm3
+ movdqa [SP + 0], xmm3
#elif CPUFAM_AMD64
movdqa xmm8, xmm3
# define V0 xmm8
pclmullqlqdq xmm4, xmm2 // u_11 v_11
pclmulhqhqdq xmm7, xmm2 // u_10 v_10
#if CPUFAM_X86
- movdqa xmm2, [esp + 0]
+ movdqa xmm2, [SP + 0]
# define V0 xmm2
#endif
pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movdqu xmm0, [A]
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movq xmm0, [A]
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
endprologue
movq xmm0, [A + 0]
// updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
endprologue
// A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 2*16 + 8
// exit, A is updated with the product A K.
#if CPUFAM_X86
- mov A, [esp + 4]
- mov K, [esp + 8]
+ mov A, [SP + 4]
+ mov K, [SP + 8]
ldgot ecx
#endif
#if CPUFAM_AMD64 && ABI_WIN
// A is updated with the product A K.
#if CPUFAM_X86
- pushreg ebp
+ pushreg BP
setfp
- mov A, [esp + 8]
- mov K, [esp + 12]
- and esp, ~15
- sub esp, 16
+ mov A, [SP + 8]
+ mov K, [SP + 12]
+ stalloc 16
+ and SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
movdqu [A + 0], xmm1
#if CPUFAM_X86
dropfp
- popreg ebp
+ popreg BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
rstrxmm xmm6, 0
// exit, A is updated with the product A K.
#if CPUFAM_X86
- pushreg ebp
+ pushreg BP
setfp
- mov A, [esp + 8]
- mov K, [esp + 12]
- and esp, ~15
+ mov A, [SP + 8]
+ mov K, [SP + 12]
+ stalloc 16
ldgot ecx
- sub esp, 16
+ and SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
stalloc 3*16 + 8
movdqu [A + 0], xmm1
#if CPUFAM_X86
dropfp
- popreg ebp
+ popreg BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
rstrxmm xmm6, 0