~mdw
/
catacomb
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
progs/perftest.c: Use from Glibc syscall numbers.
[catacomb]
/
math
/
mpx-mul4-arm64-simd.S
diff --git
a/math/mpx-mul4-arm64-simd.S
b/math/mpx-mul4-arm64-simd.S
index
0781e64
..
ee33a00
100644
(file)
--- a/
math/mpx-mul4-arm64-simd.S
+++ b/
math/mpx-mul4-arm64-simd.S
@@
-57,9
+57,9
@@
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
-/// Offset   0       4       8      12
-///   0    v'_0    v''_0   v'_1    v''_1
-///  16    v'_2    v''_2   v'_3    v''_3
+/// Offset  12       8       4       0
+///   0    v''_1   v'_1    v''_0   v'_0
+///  16    v''_3   v'_3    v''_2   v'_2
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
@@
-186,7
+186,7
@@
.endif
.endm
.endif
.endm
-// `mulI': accumulate the B^I and b B^i terms of the polynomial product sum
+// `mulI': accumulate the B^I and b B^I terms of the polynomial product sum
// U V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly
// for X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2)
// + B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
// U V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly
// for X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2)
// + B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
@@
-230,7
+230,7
@@
// leaving a carry in CG.
//
// In detail, what happens is as follows. Suppose initially that ZLO =
// leaving a carry in CG.
//
// In detail, what happens is as follows. Suppose initially that ZLO =
-// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
+// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i;
// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
// a circuit depth of 3; I don't know how to do better.
// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
// a circuit depth of 3; I don't know how to do better.
@@
-1032,12
+1032,12
@@
ENDFUNC
.ifeqs "\mode", "dmul"
ldr q2, [x4]
.ifeqs "\mode", "dmul"
ldr q2, [x4]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
ldr q4, [x5]
ldr q4, [x5]
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x2 // -> u
mov x16, x1
mov x1, x2 // -> u
@@
-1050,8
+1050,8
@@
ENDFUNC
.ifeqs "\mode", "smul"
ldr q4, [x3]
.ifeqs "\mode", "smul"
ldr q4, [x3]
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
// x2 // -> x
mov x3, x1 // -> c
// x2 // -> x
mov x3, x1 // -> c
@@
-1061,12
+1061,12
@@
ENDFUNC
.ifeqs "\mode", "mmul"
ldr q2, [x5]
.ifeqs "\mode", "mmul"
ldr q2, [x5]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
ldr q6, [x6]
ldr q6, [x6]
- zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x3 // -> u
mov x16, x1
mov x1, x3 // -> u
@@
-1082,8
+1082,8
@@
ENDFUNC
.ifeqs "\mode", "mont"
ldr q6, [x4]
.ifeqs "\mode", "mont"
ldr q6, [x4]
- zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3)
- zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0)
mov x4, x2 // -> y
mov x2, x3 // -> x
mov x4, x2 // -> y
mov x2, x3 // -> x