Apparently I was asleep when I read the architecture reference because I
missed the `sri' instruction and how it can be used to synthesize
rotations with only two instructions rather than three.
Also replace rotation by 16 with the obvious `rev32'.
// a += b; d ^= a; d <<<= 16
vadd.u32 q8, q12, q13
veor q11, q15, q8
// a += b; d ^= a; d <<<= 16
vadd.u32 q8, q12, q13
veor q11, q15, q8
- vshl.u32 q0, q11, #16
- vshr.u32 q11, q11, #16
- vorr q11, q11, q0
+ vrev32.16 q11, q11
// c += d; b ^= c; b <<<= 12
vadd.u32 q10, q14, q11
// c += d; b ^= c; b <<<= 12
vadd.u32 q10, q14, q11
- veor q9, q13, q10
- vshl.u32 q0, q9, #12
- vshr.u32 q9, q9, #20
- vorr q9, q9, q0
+ veor q0, q13, q10
+ vshl.u32 q9, q0, #12
+ vsri.u32 q9, q0, #20
0:
// Apply (the rest of) a column quarterround to each of the columns
0:
// Apply (the rest of) a column quarterround to each of the columns
// a += b; d ^= a; d <<<= 8
vadd.u32 q8, q8, q9
// a += b; d ^= a; d <<<= 8
vadd.u32 q8, q8, q9
- veor q11, q11, q8
- vshl.u32 q0, q11, #8
- vshr.u32 q11, q11, #24
- vorr q11, q11, q0
+ veor q0, q11, q8
+ vshl.u32 q11, q0, #8
+ vsri.u32 q11, q0, #24
// c += d; b ^= c; b <<<= 7
vadd.u32 q10, q10, q11
vext.32 q11, q11, q11, #3
// c += d; b ^= c; b <<<= 7
vadd.u32 q10, q10, q11
vext.32 q11, q11, q11, #3
vext.32 q10, q10, q10, #2
vext.32 q10, q10, q10, #2
- vshl.u32 q0, q9, #7
- vshr.u32 q9, q9, #25
- vorr q9, q9, q0
+ vshl.u32 q9, q0, #7
+ vsri.u32 q9, q0, #25
// The not-quite-transpose conveniently only involves reordering
// elements of individual rows, which can be done quite easily. It
// The not-quite-transpose conveniently only involves reordering
// elements of individual rows, which can be done quite easily. It
// a += b; d ^= a; d <<<= 16
vadd.u32 q8, q8, q9
veor q11, q11, q8
// a += b; d ^= a; d <<<= 16
vadd.u32 q8, q8, q9
veor q11, q11, q8
- vshl.u32 q0, q11, #16
- vshr.u32 q11, q11, #16
- vorr q11, q11, q0
+ vrev32.16 q11, q11
// c += d; b ^= c; b <<<= 12
vadd.u32 q10, q10, q11
// c += d; b ^= c; b <<<= 12
vadd.u32 q10, q10, q11
- veor q9, q9, q10
- vshl.u32 q0, q9, #12
- vshr.u32 q9, q9, #20
- vorr q9, q9, q0
+ veor q0, q9, q10
+ vshl.u32 q9, q0, #12
+ vsri.u32 q9, q0, #20
// a += b; d ^= a; d <<<= 8
vadd.u32 q8, q8, q9
// a += b; d ^= a; d <<<= 8
vadd.u32 q8, q8, q9
- veor q11, q11, q8
- vshl.u32 q0, q11, #8
- vshr.u32 q11, q11, #24
- vorr q11, q11, q0
+ veor q0, q11, q8
+ vshl.u32 q11, q0, #8
+ vsri.u32 q11, q0, #24
// c += d; b ^= c; b <<<= 7
vadd.u32 q10, q10, q11
vext.32 q11, q11, q11, #1
// c += d; b ^= c; b <<<= 7
vadd.u32 q10, q10, q11
vext.32 q11, q11, q11, #1
vext.32 q10, q10, q10, #2
vext.32 q10, q10, q10, #2
- vshl.u32 q0, q9, #7
- vshr.u32 q9, q9, #25
- vorr q9, q9, q0
+ vshl.u32 q9, q0, #7
+ vsri.u32 q9, q0, #25
// Finally finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// Finally finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// a += b; d ^= a; d <<<= 16
vadd.u32 q8, q8, q9
veor q11, q11, q8
// a += b; d ^= a; d <<<= 16
vadd.u32 q8, q8, q9
veor q11, q11, q8
- vshl.u32 q0, q11, #16
- vshr.u32 q11, q11, #16
- vorr q11, q11, q0
+ vrev32.16 q11, q11
// c += d; b ^= c; b <<<= 12
vadd.u32 q10, q10, q11
// c += d; b ^= c; b <<<= 12
vadd.u32 q10, q10, q11
- veor q9, q9, q10
- vshl.u32 q0, q9, #12
- vshr.u32 q9, q9, #20
- vorr q9, q9, q0
+ veor q0, q9, q10
+ vshl.u32 q9, q0, #12
+ vsri.u32 q9, q0, #20
// a += b; d ^= a; d <<<= 16
add v4.4s, v0.4s, v1.4s
eor v7.16b, v3.16b, v4.16b
// a += b; d ^= a; d <<<= 16
add v4.4s, v0.4s, v1.4s
eor v7.16b, v3.16b, v4.16b
- shl v16.4s, v7.4s, #16
- ushr v7.4s, v7.4s, #16
- orr v7.16b, v7.16b, v16.16b
+ rev32 v7.8h, v7.8h
// c += d; b ^= c; b <<<= 12
add v6.4s, v2.4s, v7.4s
// c += d; b ^= c; b <<<= 12
add v6.4s, v2.4s, v7.4s
- eor v5.16b, v1.16b, v6.16b
- shl v16.4s, v5.4s, #12
- ushr v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v16.16b
+ eor v16.16b, v1.16b, v6.16b
+ shl v5.4s, v16.4s, #12
+ sri v5.4s, v16.4s, #20
0:
// Apply (the rest of) a column quarterround to each of the columns
0:
// Apply (the rest of) a column quarterround to each of the columns
// a += b; d ^= a; d <<<= 8
add v4.4s, v4.4s, v5.4s
// a += b; d ^= a; d <<<= 8
add v4.4s, v4.4s, v5.4s
- eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #8
- ushr v7.4s, v7.4s, #24
- orr v7.16b, v7.16b, v16.16b
+ eor v16.16b, v7.16b, v4.16b
+ shl v7.4s, v16.4s, #8
+ sri v7.4s, v16.4s, #24
// c += d; b ^= c; b <<<= 7
add v6.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #12
// c += d; b ^= c; b <<<= 7
add v6.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #12
- eor v5.16b, v5.16b, v6.16b
+ eor v16.16b, v5.16b, v6.16b
ext v6.16b, v6.16b, v6.16b, #8
ext v6.16b, v6.16b, v6.16b, #8
- shl v16.4s, v5.4s, #7
- ushr v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v16.16b
+ shl v5.4s, v16.4s, #7
+ sri v5.4s, v16.4s, #25
// The not-quite-transpose conveniently only involves reordering
// elements of individual rows, which can be done quite easily. It
// The not-quite-transpose conveniently only involves reordering
// elements of individual rows, which can be done quite easily. It
// a += b; d ^= a; d <<<= 16
add v4.4s, v4.4s, v5.4s
eor v7.16b, v7.16b, v4.16b
// a += b; d ^= a; d <<<= 16
add v4.4s, v4.4s, v5.4s
eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #16
- ushr v7.4s, v7.4s, #16
- orr v7.16b, v7.16b, v16.16b
+ rev32 v7.8h, v7.8h
// c += d; b ^= c; b <<<= 12
add v6.4s, v6.4s, v7.4s
// c += d; b ^= c; b <<<= 12
add v6.4s, v6.4s, v7.4s
- eor v5.16b, v5.16b, v6.16b
- shl v16.4s, v5.4s, #12
- ushr v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v16.16b
+ eor v16.16b, v5.16b, v6.16b
+ shl v5.4s, v16.4s, #12
+ sri v5.4s, v16.4s, #20
// a += b; d ^= a; d <<<= 8
add v4.4s, v4.4s, v5.4s
// a += b; d ^= a; d <<<= 8
add v4.4s, v4.4s, v5.4s
- eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #8
- ushr v7.4s, v7.4s, #24
- orr v7.16b, v7.16b, v16.16b
+ eor v16.16b, v7.16b, v4.16b
+ shl v7.4s, v16.4s, #8
+ sri v7.4s, v16.4s, #24
// c += d; b ^= c; b <<<= 7
add v6.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #4
// c += d; b ^= c; b <<<= 7
add v6.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #4
- eor v5.16b, v5.16b, v6.16b
+ eor v16.16b, v5.16b, v6.16b
ext v6.16b, v6.16b, v6.16b, #8
ext v6.16b, v6.16b, v6.16b, #8
- shl v16.4s, v5.4s, #7
- ushr v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v16.16b
+ shl v5.4s, v16.4s, #7
+ sri v5.4s, v16.4s, #25
// Finally finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// Finally finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// a += b; d ^= a; d <<<= 16
add v4.4s, v4.4s, v5.4s
eor v7.16b, v7.16b, v4.16b
// a += b; d ^= a; d <<<= 16
add v4.4s, v4.4s, v5.4s
eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #16
- ushr v7.4s, v7.4s, #16
- orr v7.16b, v7.16b, v16.16b
+ rev32 v7.8h, v7.8h
// c += d; b ^= c; b <<<= 12
add v6.4s, v6.4s, v7.4s
// c += d; b ^= c; b <<<= 12
add v6.4s, v6.4s, v7.4s
- eor v5.16b, v5.16b, v6.16b
- shl v16.4s, v5.4s, #12
- ushr v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v16.16b
+ eor v16.16b, v5.16b, v6.16b
+ shl v5.4s, v16.4s, #12
+ sri v5.4s, v16.4s, #20
// b ^= (a + d) <<< 7
vadd.u32 q0, q12, q15
vshl.u32 q1, q0, #7
// b ^= (a + d) <<< 7
vadd.u32 q0, q12, q15
vshl.u32 q1, q0, #7
- vshr.u32 q0, q0, #25
- vorr q0, q0, q1
- veor q9, q13, q0
+ vsri.u32 q1, q0, #25
+ veor q9, q13, q1
// c ^= (b + a) <<< 9
vadd.u32 q0, q9, q12
vshl.u32 q1, q0, #9
// c ^= (b + a) <<< 9
vadd.u32 q0, q9, q12
vshl.u32 q1, q0, #9
- vshr.u32 q0, q0, #23
- vorr q0, q0, q1
- veor q10, q14, q0
+ vsri.u32 q1, q0, #23
+ veor q10, q14, q1
// d ^= (c + b) <<< 13
vadd.u32 q0, q10, q9
vext.32 q9, q9, q9, #3
vshl.u32 q1, q0, #13
// d ^= (c + b) <<< 13
vadd.u32 q0, q10, q9
vext.32 q9, q9, q9, #3
vshl.u32 q1, q0, #13
- vshr.u32 q0, q0, #19
- vorr q0, q0, q1
- veor q11, q15, q0
+ vsri.u32 q1, q0, #19
+ veor q11, q15, q1
// a ^= (d + c) <<< 18
vadd.u32 q0, q11, q10
vext.32 q10, q10, q10, #2
vext.32 q11, q11, q11, #1
vshl.u32 q1, q0, #18
// a ^= (d + c) <<< 18
vadd.u32 q0, q11, q10
vext.32 q10, q10, q10, #2
vext.32 q11, q11, q11, #1
vshl.u32 q1, q0, #18
- vshr.u32 q0, q0, #14
- vorr q0, q0, q1
- veor q8, q12, q0
+ vsri.u32 q1, q0, #14
+ veor q8, q12, q1
0:
// The transpose conveniently only involves reordering elements of
0:
// The transpose conveniently only involves reordering elements of
// b ^= (a + d) <<< 7
vadd.u32 q0, q8, q9
vshl.u32 q1, q0, #7
// b ^= (a + d) <<< 7
vadd.u32 q0, q8, q9
vshl.u32 q1, q0, #7
- vshr.u32 q0, q0, #25
- vorr q0, q0, q1
- veor q11, q11, q0
+ vsri.u32 q1, q0, #25
+ veor q11, q11, q1
// c ^= (b + a) <<< 9
vadd.u32 q0, q11, q8
vshl.u32 q1, q0, #9
// c ^= (b + a) <<< 9
vadd.u32 q0, q11, q8
vshl.u32 q1, q0, #9
- vshr.u32 q0, q0, #23
- vorr q0, q0, q1
- veor q10, q10, q0
+ vsri.u32 q1, q0, #23
+ veor q10, q10, q1
// d ^= (c + b) <<< 13
vadd.u32 q0, q10, q11
vext.32 q11, q11, q11, #3
vshl.u32 q1, q0, #13
// d ^= (c + b) <<< 13
vadd.u32 q0, q10, q11
vext.32 q11, q11, q11, #3
vshl.u32 q1, q0, #13
- vshr.u32 q0, q0, #19
- vorr q0, q0, q1
- veor q9, q9, q0
+ vsri.u32 q1, q0, #19
+ veor q9, q9, q1
// a ^= (d + c) <<< 18
vadd.u32 q0, q9, q10
vext.32 q10, q10, q10, #2
vext.32 q9, q9, q9, #1
vshl.u32 q1, q0, #18
// a ^= (d + c) <<< 18
vadd.u32 q0, q9, q10
vext.32 q10, q10, q10, #2
vext.32 q9, q9, q9, #1
vshl.u32 q1, q0, #18
- vshr.u32 q0, q0, #14
- vorr q0, q0, q1
- veor q8, q8, q0
+ vsri.u32 q1, q0, #14
+ veor q8, q8, q1
// We had to undo the transpose ready for the next loop. Again, push
// back the reorderings to reduce latency. Decrement the loop
// We had to undo the transpose ready for the next loop. Again, push
// back the reorderings to reduce latency. Decrement the loop
// b ^= (a + d) <<< 7
vadd.u32 q0, q8, q11
vshl.u32 q1, q0, #7
// b ^= (a + d) <<< 7
vadd.u32 q0, q8, q11
vshl.u32 q1, q0, #7
- vshr.u32 q0, q0, #25
- vorr q0, q0, q1
- veor q9, q9, q0
+ vsri.u32 q1, q0, #25
+ veor q9, q9, q1
// c ^= (b + a) <<< 9
vadd.u32 q0, q9, q8
vshl.u32 q1, q0, #9
// c ^= (b + a) <<< 9
vadd.u32 q0, q9, q8
vshl.u32 q1, q0, #9
- vshr.u32 q0, q0, #23
- vorr q0, q0, q1
- veor q10, q10, q0
+ vsri.u32 q1, q0, #23
+ veor q10, q10, q1
// d ^= (c + b) <<< 13
vadd.u32 q0, q10, q9
vext.32 q9, q9, q9, #3
vshl.u32 q1, q0, #13
// d ^= (c + b) <<< 13
vadd.u32 q0, q10, q9
vext.32 q9, q9, q9, #3
vshl.u32 q1, q0, #13
- vshr.u32 q0, q0, #19
- vorr q0, q0, q1
- veor q11, q11, q0
+ vsri.u32 q1, q0, #19
+ veor q11, q11, q1
// a ^= (d + c) <<< 18
vadd.u32 q0, q11, q10
vext.32 q10, q10, q10, #2
vext.32 q11, q11, q11, #1
vshl.u32 q1, q0, #18
// a ^= (d + c) <<< 18
vadd.u32 q0, q11, q10
vext.32 q10, q10, q10, #2
vext.32 q11, q11, q11, #1
vshl.u32 q1, q0, #18
- vshr.u32 q0, q0, #14
- vorr q0, q0, q1
- veor q8, q8, q0
+ vsri.u32 q1, q0, #14
+ veor q8, q8, q1
// b ^= (a + d) <<< 7
add v16.4s, v0.4s, v3.4s
shl v17.4s, v16.4s, #7
// b ^= (a + d) <<< 7
add v16.4s, v0.4s, v3.4s
shl v17.4s, v16.4s, #7
- ushr v16.4s, v16.4s, #25
- orr v16.16b, v16.16b, v17.16b
- eor v5.16b, v1.16b, v16.16b
+ sri v17.4s, v16.4s, #25
+ eor v5.16b, v1.16b, v17.16b
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v0.4s
shl v17.4s, v16.4s, #9
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v0.4s
shl v17.4s, v16.4s, #9
- ushr v16.4s, v16.4s, #23
- orr v16.16b, v16.16b, v17.16b
- eor v6.16b, v2.16b, v16.16b
+ sri v17.4s, v16.4s, #23
+ eor v6.16b, v2.16b, v17.16b
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
- ushr v16.4s, v16.4s, #19
- orr v16.16b, v16.16b, v17.16b
- eor v7.16b, v3.16b, v16.16b
+ sri v17.4s, v16.4s, #19
+ eor v7.16b, v3.16b, v17.16b
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
- ushr v16.4s, v16.4s, #14
- orr v16.16b, v16.16b, v17.16b
- eor v4.16b, v0.16b, v16.16b
+ sri v17.4s, v16.4s, #14
+ eor v4.16b, v0.16b, v17.16b
0:
// The transpose conveniently only involves reordering elements of
0:
// The transpose conveniently only involves reordering elements of
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v5.4s
shl v17.4s, v16.4s, #7
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v5.4s
shl v17.4s, v16.4s, #7
- ushr v16.4s, v16.4s, #25
- orr v16.16b, v16.16b, v17.16b
- eor v7.16b, v7.16b, v16.16b
+ sri v17.4s, v16.4s, #25
+ eor v7.16b, v7.16b, v17.16b
// c ^= (b + a) <<< 9
add v16.4s, v7.4s, v4.4s
shl v17.4s, v16.4s, #9
// c ^= (b + a) <<< 9
add v16.4s, v7.4s, v4.4s
shl v17.4s, v16.4s, #9
- ushr v16.4s, v16.4s, #23
- orr v16.16b, v16.16b, v17.16b
- eor v6.16b, v6.16b, v16.16b
+ sri v17.4s, v16.4s, #23
+ eor v6.16b, v6.16b, v17.16b
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #12
shl v17.4s, v16.4s, #13
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #12
shl v17.4s, v16.4s, #13
- ushr v16.4s, v16.4s, #19
- orr v16.16b, v16.16b, v17.16b
- eor v5.16b, v5.16b, v16.16b
+ sri v17.4s, v16.4s, #19
+ eor v5.16b, v5.16b, v17.16b
// a ^= (d + c) <<< 18
add v16.4s, v5.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v5.16b, v5.16b, v5.16b, #4
shl v17.4s, v16.4s, #18
// a ^= (d + c) <<< 18
add v16.4s, v5.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v5.16b, v5.16b, v5.16b, #4
shl v17.4s, v16.4s, #18
- ushr v16.4s, v16.4s, #14
- orr v16.16b, v16.16b, v17.16b
- eor v4.16b, v4.16b, v16.16b
+ sri v17.4s, v16.4s, #14
+ eor v4.16b, v4.16b, v17.16b
// We had to undo the transpose ready for the next loop. Again, push
// back the reorderings to reduce latency. Decrement the loop
// We had to undo the transpose ready for the next loop. Again, push
// back the reorderings to reduce latency. Decrement the loop
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v7.4s
shl v17.4s, v16.4s, #7
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v7.4s
shl v17.4s, v16.4s, #7
- ushr v16.4s, v16.4s, #25
- orr v16.16b, v16.16b, v17.16b
- eor v5.16b, v5.16b, v16.16b
+ sri v17.4s, v16.4s, #25
+ eor v5.16b, v5.16b, v17.16b
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v4.4s
shl v17.4s, v16.4s, #9
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v4.4s
shl v17.4s, v16.4s, #9
- ushr v16.4s, v16.4s, #23
- orr v16.16b, v16.16b, v17.16b
- eor v6.16b, v6.16b, v16.16b
+ sri v17.4s, v16.4s, #23
+ eor v6.16b, v6.16b, v17.16b
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
- ushr v16.4s, v16.4s, #19
- orr v16.16b, v16.16b, v17.16b
- eor v7.16b, v7.16b, v16.16b
+ sri v17.4s, v16.4s, #19
+ eor v7.16b, v7.16b, v17.16b
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
- ushr v16.4s, v16.4s, #14
- orr v16.16b, v16.16b, v17.16b
- eor v4.16b, v4.16b, v16.16b
+ sri v17.4s, v16.4s, #14
+ eor v4.16b, v4.16b, v17.16b