X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/26e182fc3ae2a40dc7d52bab2318d8d1837dfeee..318c3c31be343fdba362cb60f33aab3e88798d8a:/symm/rijndael-arm-crypto.S diff --git a/symm/rijndael-arm-crypto.S b/symm/rijndael-arm-crypto.S index d33cac6b..1df81d97 100644 --- a/symm/rijndael-arm-crypto.S +++ b/symm/rijndael-arm-crypto.S @@ -25,20 +25,22 @@ /// MA 02111-1307, USA. ///-------------------------------------------------------------------------- -/// External definitions. +/// Preliminaries. #include "config.h" #include "asm-common.h" - .globl F(abort) - .globl F(rijndael_rcon) + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + .extern F(abort) + .extern F(rijndael_rcon) + + .text ///-------------------------------------------------------------------------- /// Main code. - .arch armv8-a - .fpu crypto-neon-fp-armv8 - /// The ARM crypto extension implements a little-endian version of AES /// (though the manual doesn't actually spell this out and you have to /// experiment), but Catacomb's internal interface presents as big-endian so @@ -52,7 +54,7 @@ // Useful constants. .equ maxrounds, 16 // maximum number of rounds .equ maxblksz, 32 // maximum block size, in bytes - .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer + .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer // Context structure. .equ nr, 0 // number of rounds @@ -70,7 +72,7 @@ FUNC(rijndael_setup_arm_crypto) // r2 = pointer to key material // r3 = key size in words - stmfd sp!, {r4-r9, r14} + pushreg r4-r9, r14 // The initial round key material is taken directly from the input // key, so copy it over. Unfortunately, the key material is not @@ -78,114 +80,73 @@ FUNC(rijndael_setup_arm_crypto) // sort this out. add r9, r0, #w mov r14, r3 - ands r4, r2, #3 + ands r6, r2, #3 beq 1f - mov r4, r4, lsl #3 - rsb r5, r4, #32 + mov r6, r6, lsl #3 + rsb r7, r6, #32 bic r2, r2, #3 - ldr r6, [r2], #4 + ldr r4, [r2], #4 -0: ldr r7, [r2], #4 - mov r6, r6, lsr r4 - orr r6, r7, lsl r5 - str r6, [r9], #4 - mov r6, r7 +0: ldr r5, [r2], #4 + mov r4, r4, lsr r6 + orr r4, r5, lsl r7 + str r4, [r9], #4 subs r14, r14, #1 + movhi r4, r5 bhi 0b b 9f -1: ldr r6, [r2], #4 - str r6, [r9], #4 +1: ldr r4, [r2], #4 + str r4, [r9], #4 subs r14, r14, #1 bhi 1b // Find out other useful things and prepare for the main loop. - ldr r7, [r0, #nr] // number of rounds +9: ldr r7, [r0, #nr] // number of rounds mla r2, r1, r7, r1 // total key size in words - ldr r4, [r9, #-4] // most recent key word leaextq r5, rijndael_rcon // round constants sub r8, r2, r3 // minus what we've copied already - veor q1, q1 // all-zero register for the key + vmov.i32 q1, #0 // all-zero register for the key add r8, r9, r8, lsl #2 // limit of the key buffer + mov r12, #0 // position in current cycle + + // Main key expansion loop. Dispatch according to the position in + // the cycle. +0: ldr r6, [r9, -r3, lsl #2] // word from previous cycle + cmp r12, #0 // first word of the cycle? + beq 1f + cmp r12, #4 // fourth word of the cycle? + bne 2f + cmp r3, #7 // seven or eight words of key? + bcc 2f - // Main key expansion loop. The first word of each key-length chunk - // needs special treatment. -9: ldrb r14, [r5], #1 // next round constant - ldr r6, [r9, -r3, lsl #2] + // Fourth word of the cycle, seven or eight words of key. We must do + // the byte substitution. vdup.32 q0, r4 aese.8 q0, q1 // effectively, just SubBytes vmov.32 r4, d0[0] - eor r4, r14, r4, ror #8 - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 8f - - // The next three words are simple. - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 8f - - // (Word 2...) - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 8f + b 2f - // (Word 3...) - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 8f - - // Word 4. If the key is /more/ than 6 words long, then we must - // apply a substitution here. - cmp r3, #5 - bcc 9b - ldr r6, [r9, -r3, lsl #2] - cmp r3, #7 - bcc 0f + // First word of the cycle. Byte substitution, rotation, and round + // constant. +1: ldrb r14, [r5], #1 // next round constant vdup.32 q0, r4 aese.8 q0, q1 // effectively, just SubBytes vmov.32 r4, d0[0] -0: eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 8f - - // (Word 5...) - cmp r3, #6 - bcc 9b - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 - cmp r9, r8 - bcs 8f + eor r4, r14, r4, ror #8 - // (Word 6...) - cmp r3, #7 - bcc 9b - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 + // Common ending: mix in the word from the previous cycle and store. +2: eor r4, r4, r6 str r4, [r9], #4 - cmp r9, r8 - bcs 8f - // (Word 7...) - cmp r3, #8 - bcc 9b - ldr r6, [r9, -r3, lsl #2] - eor r4, r4, r6 - str r4, [r9], #4 + // Prepare for the next iteration. If we're done, then stop; if + // we've finished a cycle then reset the counter. + add r12, r12, #1 cmp r9, r8 - bcs 8f - - // Must be done by now. - b 9b + bcs 9f + cmp r12, r3 + movcs r12, #0 + b 0b // Next job is to construct the decryption keys. The keys for the // first and last rounds don't need to be mangled, but the remaining @@ -200,7 +161,7 @@ FUNC(rijndael_setup_arm_crypto) // there's easily enough buffer space for the over-enthusiastic reads // and writes because the context has space for 32-byte blocks, which // is our maximum and an exact fit for two Q-class registers. -8: add r5, r0, #wi +9: add r5, r0, #wi add r4, r0, #w add r4, r4, r2, lsl #2 sub r4, r4, r1, lsl #2 // last round's keys @@ -213,10 +174,10 @@ FUNC(rijndael_setup_arm_crypto) vstmiane r5, {d0-d3} // Update the loop variables and stop if we've finished. -9: sub r4, r4, r1, lsl #2 +0: sub r4, r4, r1, lsl #2 add r5, r5, r1, lsl #2 subs r7, r7, #1 - beq 0f + beq 9f // Do another middle round's keys... teq r1, #4 @@ -224,13 +185,13 @@ FUNC(rijndael_setup_arm_crypto) vldmiane r4, {d0-d3} aesimc.8 q0, q0 vstmiaeq r5, {d0, d1} - beq 9b + beq 0b aesimc.8 q1, q1 vstmia r5, {d0-d3} - b 9b + b 0b // Finally do the first encryption round. -0: teq r1, #4 +9: teq r1, #4 vldmiaeq r4, {d0, d1} vldmiane r4, {d0-d3} vstmiaeq r5, {d0, d1} @@ -238,7 +199,7 @@ FUNC(rijndael_setup_arm_crypto) // If the block size is not exactly four words then we must end-swap // everything. We can use fancy NEON toys for this. - beq 0f + beq 9f // End-swap the encryption keys. add r1, r0, #w @@ -249,11 +210,14 @@ FUNC(rijndael_setup_arm_crypto) bl endswap_block // All done. -0: ldmfd sp!, {r4-r9, pc} +9: popreg r4-r9, pc + +ENDFUNC -endswap_block: +INTFUNC(endswap_block) // End-swap R2 words starting at R1. R1 is clobbered; R2 is not. // It's OK to work in 16-byte chunks. + mov r4, r2 0: vldmia r1, {d0, d1} vrev32.8 q0, q0 @@ -267,7 +231,8 @@ ENDFUNC ///-------------------------------------------------------------------------- /// Encrypting and decrypting blocks. -FUNC(rijndael_eblk_arm_crypto) +.macro encdec op, aes, mc, koff + FUNC(rijndael_\op\()_arm_crypto) // Arguments: // r0 = pointer to context @@ -276,77 +241,95 @@ FUNC(rijndael_eblk_arm_crypto) // Set things up ready. ldr r3, [r0, #nr] - add r0, r0, #w + add r0, r0, #\koff vldmia r1, {d0, d1} vrev32.8 q0, q0 - // Dispatch according to the number of rounds. - add r3, r3, r3, lsl #1 - rsbs r3, r3, #3*14 - addcs pc, pc, r3, lsl #2 + // Check the number of rounds and dispatch. + sub r3, r3, #10 + cmp r3, #5 + addlo pc, pc, r3, lsl #2 callext F(abort) - // The last round doesn't have MixColumns, so do it separately. - .rept 13 - vldmia r0!, {d2, d3} - aese.8 q0, q1 - aesmc.8 q0, q0 - .endr - - // Final round. - vldmia r0!, {d2, d3} - aese.8 q0, q1 - - // Final whitening. - vldmia r0!, {d2, d3} - veor q0, q1 + b 10f + b 11f + b 12f + b 13f + b 14f + + // Eleven rounds. +11: vldmia r0!, {d16, d17} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + b 10f + + // Twelve rounds. +12: vldmia r0!, {d16-d19} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + b 10f + + // Thirteen rounds. +13: vldmia r0!, {d16-d21} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + b 10f + + // Fourteen rounds. (Drops through to the ten round case because + // this is the next most common.) +14: vldmia r0!, {d16-d23} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + \aes\().8 q0, q11 + \mc\().8 q0, q0 + // Drop through... + + // Ten rounds. +10: vldmia r0!, {d16-d25} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + \aes\().8 q0, q11 + \mc\().8 q0, q0 + \aes\().8 q0, q12 + \mc\().8 q0, q0 + + vldmia r0!, {d16-d27} + \aes\().8 q0, q8 + \mc\().8 q0, q0 + \aes\().8 q0, q9 + \mc\().8 q0, q0 + \aes\().8 q0, q10 + \mc\().8 q0, q0 + \aes\().8 q0, q11 + \mc\().8 q0, q0 + + // Final round has no MixColumns, but is followed by final whitening. + \aes\().8 q0, q12 + veor q0, q0, q13 // All done. vrev32.8 q0, q0 vstmia r2, {d0, d1} bx r14 -ENDFUNC - -FUNC(rijndael_dblk_arm_crypto) - - // Arguments: - // r0 = pointer to context - // r1 = pointer to input block - // r2 = pointer to output block + ENDFUNC +.endm - // Set things up ready. - ldr r3, [r0, #nr] - add r0, r0, #wi - vldmia r1, {d0, d1} - vrev32.8 q0, q0 - - // Dispatch according to the number of rounds. - add r3, r3, r3, lsl #1 - rsbs r3, r3, #3*14 - addcs pc, pc, r3, lsl #2 - callext F(abort) - - // The last round doesn't have MixColumns, so do it separately. - .rept 13 - vldmia r0!, {d2, d3} - aesd.8 q0, q1 - aesimc.8 q0, q0 - .endr - - // Final round. - vldmia r0!, {d2, d3} - aesd.8 q0, q1 - - // Final whitening. - vldmia r0!, {d2, d3} - veor q0, q1 - - // All done. - vrev32.8 q0, q0 - vstmia r2, {d0, d1} - bx r14 - -ENDFUNC + encdec eblk, aese, aesmc, w + encdec dblk, aesd, aesimc, wi ///----- That's all, folks --------------------------------------------------