Juggle the register allocation in the loop which copies over the first
key-data cycle, so that the last copied key word is left in r4. Then we
can elide the explicit load of r4 in the setup ahead of the main key
expansion loop, because it already has the right value, saving a whole
instruction.
// sort this out.
add r9, r0, #w
mov r14, r3
- ands r4, r2, #3
+ ands r6, r2, #3 // r6 = low two bits of r2; zero takes the aligned path at 1
beq 1f
- mov r4, r4, lsl #3
- rsb r5, r4, #32
+ mov r6, r6, lsl #3 // r6 = that offset times 8, used as the right-shift count
+ rsb r7, r6, #32 // r7 = 32 - r6, used as the left-shift count
bic r2, r2, #3
- ldr r6, [r2], #4
+ ldr r4, [r2], #4 // r4 = first word from the rounded-down address
-0: ldr r7, [r2], #4
- mov r6, r6, lsr r4
- orr r6, r7, lsl r5
- str r6, [r9], #4
- mov r6, r7
+0: ldr r5, [r2], #4 // r5 = following word
+ mov r4, r4, lsr r6 // shift the current word down by r6 bits
+ orr r4, r5, lsl r7 // merge in the following word shifted up by r7 bits
+ str r4, [r9], #4 // store the combined word; r4 still holds it
subs r14, r14, #1
+ movhi r4, r5 // more to copy: following word becomes current; skipped on the last pass so r4 keeps the word just stored (mov leaves the flags for bhi)
bhi 0b
b 9f
-1: ldr r6, [r2], #4
- str r6, [r9], #4
+1: ldr r4, [r2], #4 // aligned path: load each word straight into r4
+ str r4, [r9], #4 // r4 holds the last word stored when the loop exits
subs r14, r14, #1
bhi 1b
// Find out other useful things and prepare for the main loop.
9: ldr r7, [r0, #nr] // number of rounds
mla r2, r1, r7, r1 // total key size in words
- ldr r4, [r9, #-4] // most recent key word
leaextq r5, rijndael_rcon // round constants
sub r8, r2, r3 // minus what we've copied already
veor q1, q1 // all-zero register for the key