/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v''_0 v'_1 v''_1
-/// 16 v'_2 v''_2 v'_3 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v'_1 v''_0 v'_0
+/// 16 v''_3 v'_3 v''_2 v'_2
///
/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(1)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(2)] // = n
ldr r6, [STKARG(3)] // -> cyv
vld1.32 {q4}, [r3]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(0)] // = n
ldr r6, [STKARG(1)] // -> cyv
ldr r14, [STKARG(1)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(2)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(3)] // = n
ldr r6, [STKARG(4)] // -> cyv
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r5, [STKARG(1)] // = n
ldr r6, [STKARG(2)] // -> cyv