///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.
- .p2align 4
-carryprop:
+INTFUNC(carryprop)
// Flush the pending carry out to memory as a single packed value.
//
// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
// form. Store the low 128 bits of the represented carry to [EDI] as
// a packed 128-bit value, and leave the remaining 16 bits in the low
// 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
//
// NOTE(review): `propout' and `endprop' are macros defined outside
// this excerpt. XMM3 is never named below, so it is presumably their
// internal scratch register -- confirm against the macro definitions.
+ endprologue
+
// Four stores at [EDI + 0], [EDI + 4], [EDI + 8] and [EDI + 12] cover
// the 16 bytes of the packed result. Each step names the carry
// register being written out and, presumably, the register which
// receives the overflow from it (`nil' where there is none) -- TODO
// confirm the operand roles against the `propout' definition.
propout [edi + 0], xmm4, xmm5
propout [edi + 4], xmm5, xmm6
propout [edi + 8], xmm6, nil
// Final step: per the contract above, the residual carry must end up
// in XMM4, which is the last operand here.
endprop [edi + 12], xmm6, xmm4
ret
- .p2align 4
-dmul4:
+ENDFUNC
+
+INTFUNC(dmul4)
// On entry, EDI points to the destination buffer; EAX and EBX point
// to the packed operands U and X; ECX and EDX point to the expanded
// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
propout [edi + 0], xmm4, xmm5
ret
- .p2align 4
-dmla4:
+ENDFUNC
+
+INTFUNC(dmla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EAX and EBX point to the
// packed operands U and X; ECX and EDX point to the expanded
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
ret
- .p2align 4
-mul4zc:
+ENDFUNC
+
+INTFUNC(mul4zc)
// On entry, EDI points to the destination buffer; EBX points to a
// packed operand X; and EDX points to an expanded operand Y.
//
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
ret
- .p2align 4
-mul4:
+ENDFUNC
+
+INTFUNC(mul4)
// On entry, EDI points to the destination buffer; EBX points to a
// packed operand X; EDX points to an expanded operand Y; and XMM4,
// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
// and update the carry registers with the carry out. The registers
// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
propout [edi + 0], xmm4, xmm5
ret
- .p2align 4
-mla4zc:
+ENDFUNC
+
+INTFUNC(mla4zc)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EBX points to a packed operand
// X; and EDX points to an expanded operand Y.
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
ret
- .p2align 4
-mla4:
+ENDFUNC
+
+INTFUNC(mla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EBX points to a packed operand
// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil
ret
- .p2align 4
-mmul4:
+ENDFUNC
+
+INTFUNC(mmul4)
// On entry, EDI points to the destination buffer; EAX and EBX point
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- sub esp, 64 // space for the carries
+ stalloc 64 // space for the carries
+ endprologue
// Calculate W = U V, and leave it in the destination. Stash the
// carry pieces for later.
propout [edi + 0], xmm4, xmm5
jmp 5f
- .p2align 4
-mmla4:
+ENDFUNC
+
+INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EAX and EBX point
// to the packed operands U and N; ECX and ESI point to the expanded
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- sub esp, 64 // space for the carries
+ stalloc 64 // space for the carries
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
paddq xmm6, [esp + 32]
// And, with that, we're done.
- add esp, 64
+ stfree 64
ret
- .p2align 4
-mont4:
+ENDFUNC
+
+INTFUNC(mont4)
// On entry, EDI points to the destination buffer holding a packed
// value A; EBX points to a packed operand N; ESI points to an
// expanded operand M; and EDX points to a place to store an expanded
// of the sum W + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
+ endprologue
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
// And, with that, we're done.
ret
+ENDFUNC
+
///--------------------------------------------------------------------------
/// Bulk multipliers.
//
// esp + 0 expanded Y (32 bytes)
// esp + 32 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 32
+ endprologue
// Prepare for the first iteration.
mov esi, [ebp + 32] // -> bv[0]
jb 1b
// All over.
-9: mov esp, ebp
+9: dropfp
pop edi
pop esi
pop ebx
// esp + 108 bv limit
// esp + 112 (gap)
// esp + 124 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 124
+ endprologue
// Establish the expanded operands.
pxor xmm7, xmm7
movd [edi + 16], xmm4
// All done.
-9: mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+9: dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
ENDFUNC
// esp + 12 expanded M (32 bytes)
// esp + 44 expanded Y (32 bytes)
// esp + 76 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 76
+ endprologue
// Establish the expanded operands and the blocks-of-4 dv limit.
mov edi, [ebp + 20] // -> Z = dv[0]
jmp 5b
// All over.
-9: mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+9: dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
ENDFUNC
.endm
.macro testprologue
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
sub esp, 3*32 + 12
+ endprologue
// vars:
// esp + 0 = cycles
// esp + 12 = v expanded
.endm
.macro testepilogue
// Common exit sequence for the `test_*' entry points: tear down the
// frame built by `testprologue' and return to the caller.
//
// `testprologue' saves EBP, EBX, ESI, EDI (in that order), records
// the frame pointer in EBP, and then realigns and lowers ESP. So ESP
// must first be recovered from the frame pointer, after which the
// saved registers are restored in the reverse order.
//
// NOTE(review): `dropfp' and `popreg' are defined outside this
// excerpt; they stand in for the plain `mov esp, ebp' and `pop'
// instructions they replace, presumably adding unwind annotations --
// confirm against their definitions.
- mov esp, ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
+ dropfp
+ popreg edi
+ popreg esi
+ popreg ebx
+ popreg ebp
ret
.endm
movdqu [ecx + 32], xmm6
.endm
- .globl test_dmul4
-test_dmul4:
+FUNC(test_dmul4)
testprologue
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
testtail [ebp + 48], [ebp + 44]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_dmla4
-test_dmla4:
+FUNC(test_dmla4)
testprologue
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
testtail [ebp + 48], [ebp + 44]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mul4
-test_mul4:
+FUNC(test_mul4)
testprologue
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
testtail [ebp + 40], [ebp + 36]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mla4
-test_mla4:
+FUNC(test_mla4)
testprologue
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
testtail [ebp + 40], [ebp + 36]
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mmul4
-test_mmul4:
+FUNC(test_mmul4)
testprologue
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mmla4
-test_mmla4:
+FUNC(test_mmla4)
testprologue
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mont4
-test_mont4:
+FUNC(test_mont4)
testprologue
testexpand nil, [ebp + 36]
mov edi, [ebp + 20]
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
#endif