sub r8, r2, r3 // minus what we've copied already
veor q1, q1 // all-zero register for the key
add r8, r9, r8, lsl #2 // limit of the key buffer
+ mov r12, #0 // position in current cycle
- // Main key expansion loop. The first word of each key-length chunk
- // needs special treatment.
-0: ldrb r14, [r5], #1 // next round constant
- ldr r6, [r9, -r3, lsl #2]
+ // Main key expansion loop. Dispatch according to the position in
+ // the cycle.
+0: ldr r6, [r9, -r3, lsl #2] // word from previous cycle
+ cmp r12, #0 // first word (index 0) of the cycle?
+ beq 1f
+ cmp r12, #4 // word at index 4 of the cycle?
+ bne 2f
+ cmp r3, #7 // seven or eight words of key?
+ bcc 2f
+
+ // Word at index 4 of the cycle (counting from zero), seven or eight
+ // words of key. We must do the byte substitution.
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
vmov.32 r4, d0[0]
- eor r4, r14, r4, ror #8
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
+ b 2f
- // The next three words are simple.
+ // First word of the cycle. Byte substitution, rotation, and round
+ // constant.
+1: ldrb r14, [r5], #1 // next round constant
ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // (Word 2...)
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // (Word 3...)
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // Word 4. If the key is /more/ than 6 words long, then we must
- // apply a substitution here.
- cmp r3, #5
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- cmp r3, #7
- bcc 1f
vdup.32 q0, r4
aese.8 q0, q1 // effectively, just SubBytes
vmov.32 r4, d0[0]
-1: eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
-
- // (Word 5...)
- cmp r3, #6
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
- cmp r9, r8
- bcs 9f
+ eor r4, r14, r4, ror #8
- // (Word 6...)
- cmp r3, #7
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
+ // Common ending: mix in the word from the previous cycle and store.
+2: eor r4, r4, r6
str r4, [r9], #4
- cmp r9, r8
- bcs 9f
- // (Word 7...)
- cmp r3, #8
- bcc 0b
- ldr r6, [r9, -r3, lsl #2]
- eor r4, r4, r6
- str r4, [r9], #4
+ // Prepare for the next iteration. If we're done, then stop; if
+ // we've finished a cycle then reset the counter.
+ add r12, r12, #1
cmp r9, r8
bcs 9f
-
- // Must be done by now.
+ cmp r12, r3
+ movcs r12, #0
b 0b
// Next job is to construct the decryption keys. The keys for the
# define RCON ecx // round constants table
# define LIM edx // limit pointer
# define LIMn edx // ... as integer offset from base
+# define CYIX edi // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
# define RCON rdi // round constants table
# define LIMn ecx // limit as integer offset from base
# define LIM rcx // limit pointer
+# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
# define RCON rdi // round constants table
# define LIMn ecx // limit as integer offset from base
# define LIM rcx // limit pointer
+# define CYIX r11d // index in shift-register cycle
# define NR ecx // number of rounds
# define LRK eax // distance to last key
lea SI, [CTX + w]
mov eax, [SI + 4*KSZo - 4] // most recent key word
lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
+ xor CYIX, CYIX // start of new cycle
// Main key expansion loop. The first word of each key-length chunk
// needs special treatment.
// as an immediate, so it's kind of annoying if you're not
// open-coding the whole thing. It's much easier to leave that as
// zero and XOR in the round constant by hand.
-0: movd xmm0, eax
+0: cmp CYIX, 0 // first word (index 0) of the cycle?
+ je 1f
+ cmp CYIX, 4 // word at index 4 of the cycle?
+ jne 2f
+ cmp KSZ, 7 // and seven or eight words of key?
+ jb 2f
+
+ // Word at index 4 of the cycle (counting from zero), and seven or
+ // eight words of key. Do a byte substitution.
+ movd xmm0, eax
+ pshufd xmm0, xmm0, ROTL
+ aeskeygenassist xmm1, xmm0, 0
+ movd eax, xmm1
+ jmp 2f
+
+ // First word of the cycle. This is the complicated piece.
+1: movd xmm0, eax
pshufd xmm0, xmm0, ROTR
aeskeygenassist xmm1, xmm0, 0
pshufd xmm1, xmm1, ROTL
movd eax, xmm1
- xor eax, [SI]
xor al, [RCON]
inc RCON
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 9f
-
- // The next three words are simple...
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 9f
-
- // (Word 2...)
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 9f
- // (Word 3...)
- xor eax, [SI]
+ // Common tail. Mix in the corresponding word from the previous
+ // cycle and prepare for the next loop.
+2: xor eax, [SI]
mov [SI + 4*KSZo], eax
add SI, 4
+ inc CYIX
cmp SI, LIM
jae 9f
-
- // Word 4. If the key is /more/ than 6 words long, then we must
- // apply a substitution here.
- cmp KSZ, 5
+ cmp CYIX, KSZ
jb 0b
- cmp KSZ, 7
- jb 1f
- movd xmm0, eax
- pshufd xmm0, xmm0, ROTL
- aeskeygenassist xmm1, xmm0, 0
- movd eax, xmm1
-1: xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 9f
-
- // (Word 5...)
- cmp KSZ, 6
- jb 0b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 9f
-
- // (Word 6...)
- cmp KSZ, 7
- jb 0b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 9f
-
- // (Word 7...)
- cmp KSZ, 8
- jb 0b
- xor eax, [SI]
- mov [SI + 4*KSZo], eax
- add SI, 4
- cmp SI, LIM
- jae 9f
-
- // Must be done by now.
+ xor CYIX, CYIX
jmp 0b
// Next job is to construct the decryption keys. The keys for the