// We need a copy for later. Rather than waste time copying them by
// hand, we'll use the three-address nature of the instruction set.
// But this means that the main loop is offset by a bit.
- vldmia r1, {d24-d31}
+ vldmia r1, {QQ(q12, q15)}
// Apply a column quarterround to each of the columns simultaneously,
// moving the results to their working registers. Alas, there
vext.32 q3, q9, q9, #1 // 12, 13, 14, 15
// And with that, we're done.
- vstmia r2, {d0-d7}
+ vstmia r2, {QQ(q0, q3)}
bx r14
ENDFUNC