symm/rijndael-arm64-crypto.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

	.extern	F(abort)
	.extern	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	armv8-a+crypto

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.
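///
/// (The end-swapping itself is just REV32 on byte elements, which reverses
/// the order of the bytes within each 32-bit word of a vector register.)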

	// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words
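	//
	// (These offsets are assumed to match the layout of the C-level
	// `rijndael_ctx' structure declared in rijndael.h: the round count
	// first, followed by the encryption key words and then the
	// decryption key words.)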

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	w1 = block size in 32-bit words
	//	x2 = pointer to key material
	//	x3 = key size in words

	pushreg	x29, x30
	mov	x29, sp

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way.  Assume
	// that alignment traps are not enabled.  (Why would they be?  On
	// A32, alignment traps were part of a transition plan which changed
	// the way unaligned loads and stores behaved, but there's never been
	// any other behaviour on A64.)
	mov	x15, x3
	add	x4, x0, #w
0:	sub	x15, x15, #1
	ldr	w14, [x2], #4
	str	w14, [x4], #4
	cbnz	x15, 0b

	// Find out other useful things and prepare for the main loop.
9:	ldr	w9, [x0, #nr]		// number of rounds
	madd	w2, w1, w9, w1		// total key size in words
	leaext	x5, rijndael_rcon	// round constants
	sub	x6, x2, x3		// minus what we've copied already
	add	x7, x0, #w		// position in previous cycle
	movi	v1.4s, #0		// all-zero register for the key
	mov	x8, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
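	//
	// (This is the FIPS-197 key-schedule recurrence: w[i] = w[i-Nk] ^ t,
	// where t = SubWord(RotWord(w[i-1])) ^ Rcon[i/Nk] for the first word
	// of a cycle, t = SubWord(w[i-1]) for the fourth word of a cycle
	// when Nk > 6, and t = w[i-1] otherwise.)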
0:	ldr	w15, [x7], #4		// word from previous cycle
	cbz	x8, 1f			// first word of the cycle?
	cmp	x8, #4			// fourth word of the cycle?
	b.ne	2f
	cmp	x3, #7			// seven or eight words of key?
	b.cc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.
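	// (AESE with an all-zero round key is AddRoundKey with zero -- a
	// no-op -- followed by SubBytes and ShiftRows; since all four lanes
	// hold the same word, ShiftRows shuffles identical columns and
	// changes nothing, so lane 0 ends up holding SubWord of the input.)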
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.
1:	ldrb	w13, [x5], #1		// next round constant
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	eor	w14, w13, w14, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	w14, w14, w15
	str	w14, [x4], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	x8, x8, #1
	sub	x6, x6, #1
	cmp	x8, x3
	cbz	x6, 9f
	csel	x8, x8, xzr, cc
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with the vector registers.  The order
	// we're doing this in means that it's OK if we read or write too
	// much, and there's easily enough buffer space for the
	// over-enthusiastic reads and writes because the context has space
	// for 32-byte blocks, which is our maximum and an exact fit for two
	// full-width registers.
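	//
	// (The `mangling' here is InvMixColumns, done with AESIMC: the usual
	// equivalent-inverse-cipher construction, which lets decryption use
	// the same round structure as encryption.)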
9:	add	x5, x0, #wi
	add	x4, x0, #w
	add	x4, x4, w2, uxtw #2
	sub	x4, x4, w1, uxtw #2	// last round's keys

	// Copy the last encryption round's keys.
	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// Update the loop variables and stop if we've finished.
0:	sub	w9, w9, #1
	add	x5, x5, w1, uxtw #2
	sub	x4, x4, w1, uxtw #2
	cbz	w9, 9f

	// Do another middle round's keys...
	ld1	{v0.4s, v1.4s}, [x4]
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	st1	{v0.4s, v1.4s}, [x5]
	b	0b

	// Finally do the first encryption round.
9:	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy vector toys for this.
	cmp	w1, #4
	b.eq	9f

	// End-swap the encryption keys.
	add	x1, x0, #w
	bl	endswap_block

	// And the decryption keys
	add	x1, x0, #wi
	bl	endswap_block

	// All done.
9:	popreg	x29, x30
	ret

ENDFUNC

INTFUNC(endswap_block)
	// End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
	// It's OK to work in 16-byte chunks.

	mov	w3, w2
0:	subs	w3, w3, #4
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x1], #16
	b.hi	0b
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

	.macro	encdec	op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	x1 = pointer to input block
	//	x2 = pointer to output block

	// Set things up ready.
	ldr	w3, [x0, #nr]
	add	x0, x0, #\koff
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b

	// Check the number of rounds and dispatch.
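	//
	// (Rijndael uses max(Nb, Nk) + 6 rounds, so the only possibilities
	// are 10 to 14; the 14- and 10-round cases -- e.g., AES-256 and
	// AES-128 -- are tested first because they're the most common.)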
	cmp	w3, #14
	b.eq	14f
	cmp	w3, #10
	b.eq	10f
	cmp	w3, #12
	b.eq	12f
	cmp	w3, #13
	b.eq	13f
	cmp	w3, #11
	b.eq	11f
	callext	F(abort)

	// Eleven rounds.
11:	ld1	{v16.4s}, [x0], #16
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Twelve rounds.
12:	ld1	{v16.4s, v17.4s}, [x0], #32
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Thirteen rounds.
13:	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Fourteen rounds.  (Drops through to the ten round case because
	// this is the next most common.)
14:	ld1	{v16.4s-v19.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b
	// Drop through...

	// Ten rounds.
10:	ld1	{v16.4s-v19.4s}, [x0], #64
	ld1	{v20.4s-v23.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b

	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v20.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v21.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v22.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v23.16b
	\mc	v0.16b, v0.16b

	// Penultimate round, then the final round -- which has no
	// MixColumns -- followed by the final whitening.
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	eor	v0.16b, v0.16b, v18.16b

	// All done.
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x2]
	ret

ENDFUNC
	.endm

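// Instantiate the macro twice: encryption uses AESE/AESMC with the
// encryption keys at offset `w'; decryption uses AESD/AESIMC with the
// mangled decryption keys at offset `wi'.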
	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------