X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/81ceb2c35de440e701d2f4e6960001395d2b7e97..HEAD:/math/mpx-mul4-arm64-simd.S diff --git a/math/mpx-mul4-arm64-simd.S b/math/mpx-mul4-arm64-simd.S index 60eed208..ee33a002 100644 --- a/math/mpx-mul4-arm64-simd.S +++ b/math/mpx-mul4-arm64-simd.S @@ -57,9 +57,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v''_0 v'_1 v''_1 -/// 16 v'_2 v''_2 v'_3 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v'_1 v''_0 v'_0 +/// 16 v''_3 v'_3 v''_2 v'_2 /// /// The `umull' and `umlal' instructions can multiply a vector of two 32-bit /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act @@ -230,7 +230,7 @@ // leaving a carry in CG. // // In detail, what happens is as follows. Suppose initially that ZLO = -// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i; // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has // a circuit depth of 3; I don't know how to do better. @@ -1032,12 +1032,12 @@ ENDFUNC .ifeqs "\mode", "dmul" ldr q2, [x4] - zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) - zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2) + zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0) ldr q4, [x5] - zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) mov x16, x1 mov x1, x2 // -> u @@ -1050,8 +1050,8 @@ ENDFUNC .ifeqs "\mode", "smul" ldr q4, [x3] - zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) // x2 // -> x mov x3, x1 // -> c @@ -1061,12 +1061,12 @@ ENDFUNC .ifeqs "\mode", "mmul" ldr q2, [x5] - zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) - zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2) + zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0) ldr q6, [x6] - zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) mov x16, x1 mov x1, x3 // -> u @@ -1082,8 +1082,8 @@ ENDFUNC .ifeqs "\mode", "mont" ldr q6, [x4] - zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3) - zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1) + zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2) + zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0) mov x4, x2 // -> y mov x2, x3 // -> x