~mdw
/
catacomb
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
base/dispatch.c: Make `cpuid_feature_p' more easily extensible.
[catacomb]
/
math
/
mpx-mul4-amd64-sse2.S
diff --git
a/math/mpx-mul4-amd64-sse2.S
b/math/mpx-mul4-amd64-sse2.S
index
da3e6d6
..
1c344f4
100644
(file)
--- a/
math/mpx-mul4-amd64-sse2.S
+++ b/
math/mpx-mul4-amd64-sse2.S
@@
-1270,10
+1270,10
@@
FUNC(mpxmont_redc4_amd64_sse2)
jb 7b
// All done for this iteration. Start the next.
jb 7b
// All done for this iteration. Start the next.
-8: mov rdi, DV // -> Z = dv[i]
- mov rbx, NV // -> X = nv[0]
- cmp rdi, DVLO // all done yet?
+ cmp DV, DVLO // all done yet?
jae 9f
jae 9f
+ mov rdi, DV // -> Z = dv[i]
+ mov rbx, NV // -> X = nv[0]
add DV, 16
call mont4
add rdi, 16
add DV, 16
call mont4
add rdi, 16
@@
-1601,6
+1601,8
@@
FUNC(test_mmul4)
testtop r11
call mmul4
testtail
testtop r11
call mmul4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
@@
-1612,6
+1614,8
@@
FUNC(test_mmla4)
testtop r11
call mmla4
testtail
testtop r11
call mmla4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
@@
-1623,6
+1627,8
@@
FUNC(test_mont4)
testtop
call mont4
testtail
testtop
call mont4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout