.ifnes "\d3", "nil"
movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
.endif
- pshufd \d0, \d0, 0b11001100 // (r_i, ?, r_i, ?)
+ pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
.ifnes "\d1", "nil"
psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
.endif
// carry registers. On completion, XMM3 is clobbered. If CC is
// `nil', then the contribution which would have been added to it is
// left in C.
- pshufd xmm3, \c, 0b10111111 // (?, ?, ?, t = c'' mod B)
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
pslldq xmm3, 2 // (t b, 0)
paddq \c, xmm3 // (c' + t b, c'')
punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
.endif
- pshufd \a, \a, 0b11011000 // (a'_0, a'_1, a''_0, a''_1)
- pshufd \b, \b, 0b11011000 // (a'_2, a'_3, a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
.ifnes "\c", "nil"
- pshufd \c, \c, 0b11011000 // (c'_0, c'_1, c''_0, c''_1)
- pshufd \d, \d, 0b11011000 // (c'_2, c'_3, c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
.endif
.endm
///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.
- .p2align 4
-carryprop:
+INTFUNC(carryprop)
// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
// form. Store the low 128 bits of the represented carry to [EDI] as
// a packed 128-bit value, and leave the remaining 16 bits in the low
endprop [edi + 12], xmm6, xmm4
ret
- .p2align 4
-dmul4:
+ENDFUNC
+
+INTFUNC(dmul4)
// On entry, EDI points to the destination buffer; EAX and EBX point
// to the packed operands U and X; ECX and EDX point to the expanded
// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
ret
- .p2align 4
-dmla4:
+ENDFUNC
+
+INTFUNC(dmla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EAX and EBX point to the
// packed operands U and X; ECX and EDX point to the expanded
ret
- .p2align 4
-mul4zc:
+ENDFUNC
+
+INTFUNC(mul4zc)
// On entry, EDI points to the destination buffer; EBX points to a
// packed operand X; and EDX points to an expanded operand Y.
//
ret
- .p2align 4
-mul4:
+ENDFUNC
+
+INTFUNC(mul4)
// On entry, EDI points to the destination buffer; EBX points to a
// packed operand X; EDX points to an expanded operand Y; and XMM4,
// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
ret
- .p2align 4
-mla4zc:
+ENDFUNC
+
+INTFUNC(mla4zc)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EBX points to a packed operand
// X; and EDX points to an expanded operand Y.
ret
- .p2align 4
-mla4:
+ENDFUNC
+
+INTFUNC(mla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EBX points to a packed operand
// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
ret
- .p2align 4
-mmul4:
+ENDFUNC
+
+INTFUNC(mmul4)
// On entry, EDI points to the destination buffer; EAX and EBX point
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
propout [edi + 0], xmm4, xmm5
// This path does not return here: `5f' is a forward reference to the
// next numeric local label `5:'.  The function boundary follows this
// jump directly, so the target lies beyond it -- presumably in the
// shared tail of `mmla4'; confirm against the full file.
jmp 5f
- .p2align 4
-mmla4:
+ENDFUNC
+
+INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EAX and EBX point
// to the packed operands U and N; ECX and ESI point to the expanded
add esp, 64 // release 64 bytes of stack; presumably undoes an earlier reservation -- confirm
ret
- .p2align 4
-mont4:
+ENDFUNC
+
+INTFUNC(mont4)
// On entry, EDI points to the destination buffer holding a packed
// value A; EBX points to a packed operand N; ESI points to an
// expanded operand M; and EDX points to a place to store an expanded
// And, with that, we're done.
ret
+ENDFUNC
+
///--------------------------------------------------------------------------
/// Bulk multipliers.