~mdw
/
catacomb
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
math/mpx-mul4-*.S: Use more portable type syntax for ambiguous instructions.
[catacomb]
/
math
/
mpx-mul4-amd64-sse2.S
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 17c4f1a..5a748c6 100644
(file)
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -1155,7 +1155,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
// outer loop dv r10 rcx
// outer loop dv limit r11 r11
// nv base rdx r8
// outer loop dv r10 rcx
// outer loop dv limit r11 r11
// nv base rdx r8
- // nv limit r9 r12*
+ // nv limit r9 r10*
// n rcx r9
// c rcx r9
// n rcx r9
// c rcx r9
@@ -1183,14 +1183,13 @@ FUNC(mpxmont_redc4_amd64_sse2)
# define DV rcx
# define DVLO r11
# define NV r8
# define DV rcx
# define DVLO r11
# define NV r8
-# define NVL r12
+# define NVL r10
# define N r9
# define C r9d
pushreg rbx
pushreg rdi
# define N r9
# define C r9d
pushreg rbx
pushreg rdi
- pushreg r12
- stalloc 160
+ stalloc 168
savexmm xmm6, 0
savexmm xmm7, 16
savexmm xmm6, 0
savexmm xmm7, 16
@@ -1252,9 +1251,9 @@ FUNC(mpxmont_redc4_amd64_sse2)
// Continue carry propagation until the end of the buffer.
0: add [rdi], C
mov C, 0 // preserves flags
// Continue carry propagation until the end of the buffer.
0: add [rdi], C
mov C, 0 // preserves flags
- adcd [rdi + 4], 0
- adcd [rdi + 8], 0
- adcd [rdi + 12], 0
+ adc dword ptr [rdi + 4], 0
+ adc dword ptr [rdi + 8], 0
+ adc dword ptr [rdi + 12], 0
adc C, 0
add rdi, 16
cmp rdi, DVL4
adc C, 0
add rdi, 16
cmp rdi, DVL4
@@ -1271,10 +1270,10 @@ FUNC(mpxmont_redc4_amd64_sse2)
jb 7b
// All done for this iteration. Start the next.
jb 7b
// All done for this iteration. Start the next.
-8: mov rdi, DV // -> Z = dv[i]
- mov rbx, NV // -> X = nv[0]
- cmp rdi, DVLO // all done yet?
+ cmp DV, DVLO // all done yet?
jae 9f
jae 9f
+ mov rdi, DV // -> Z = dv[i]
+ mov rbx, NV // -> X = nv[0]
add DV, 16
call mont4
add rdi, 16
add DV, 16
call mont4
add rdi, 16
@@ -1300,8 +1299,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
rstrxmm xmm14, 128
rstrxmm xmm15, 144
rstrxmm xmm14, 128
rstrxmm xmm15, 144
- stfree 160
- popreg r12
+ stfree 168
popreg rdi
popreg rbx
#endif
popreg rdi
popreg rbx
#endif
@@ -1603,6 +1601,8 @@ FUNC(test_mmul4)
testtop r11
call mmul4
testtail
testtop r11
call mmul4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
@@ -1614,6 +1614,8 @@ FUNC(test_mmla4)
testtop r11
call mmla4
testtail
testtop r11
call mmla4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
@@ -1625,6 +1627,8 @@ FUNC(test_mont4)
testtop
call mont4
testtail
testtop
call mont4
testtail
+ pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+ pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout