symm/rijndael-x86ish-aesni.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   .aes

        .extern F(abort)
        .extern F(rijndael_rcon)

        .text

///--------------------------------------------------------------------------
/// Main code.

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.
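///
/// Concretely, `endswap_tab' below is a PSHUFB control that reverses the
/// bytes within each 32-bit word; it's applied to each block on the way in
/// and again on the way out.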

// Useful constants.
        .equ    maxrounds, 16           // maximum number of rounds
        .equ    maxblksz, 32            // maximum block size, in bytes
        .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

// Context structure.
        .equ    nr, 0                   // number of rounds
        .equ    w, nr + 4               // encryption key words
        .equ    wi, w + kbufsz          // decryption key words
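
// For reference, these offsets amount to a context laid out like the C
// structure sketched below.  (This is only a sketch for orientation; the
// authoritative definition lives in Catacomb's C headers.)
//
//      struct rijndael_ctx {
//              uint32 nr;              // number of rounds, at offset `nr'
//              uint32 w[kbufsz/4];     // encryption keys, at offset `w'
//              uint32 wi[kbufsz/4];    // decryption keys, at offset `wi'
//      };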

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86ish_aesni_avx)
        vzeroupper                      // avoid penalty on `legacy' XMM access
        endprologue
        // and drop through...
ENDFUNC
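
// The AVX entry point exists only to issue VZEROUPPER before falling into
// the plain SSE code: mixing dirty upper-YMM state with legacy SSE
// instructions incurs a state-transition penalty on many CPUs.  The
// fall-through depends on the two functions being assembled adjacently.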

FUNC(rijndael_setup_x86ish_aesni)

#if CPUFAM_X86
        // Arguments are on the stack.  We'll need to stack the caller's
        // register variables, but we'll manage.

# define CTX BP                         // context pointer
# define BLKSZ [SP + 24]                // block size

# define KSZ ebx                        // key size
# define NKW edx                        // total number of key words
# define NKW_NEEDS_REFRESH 1            // ... needs recalculating
# define RCON ecx                       // round constants table
# define LIM edx                        // limit pointer
# define CYIX edi                       // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF edx                     // block size in bytes

        // Stack the caller's registers.
        pushreg BP
        pushreg ebx
        pushreg esi
        pushreg edi

        // Set up our own variables.
        mov     CTX, [SP + 20]          // context base pointer
        mov     SI, [SP + 28]           // key material
        mov     KSZ, [SP + 32]          // key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // Arguments are in registers.  We have plenty, but, to be honest,
        // the initial register allocation is a bit annoying.

# define CTX r8                         // context pointer
# define BLKSZ r9d                      // block size

# define KSZ edx                        // key size
# define NKW r10d                       // total number of key words
# define RCON rdi                       // round constants table
# define LIM rcx                        // limit pointer
# define CYIX r11d                      // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF r9d                     // block size in bytes

        // Move arguments to more useful places.
        mov     CTX, rdi                // context base pointer
        mov     BLKSZ, esi              // block size in words
        mov     SI, rdx                 // key material
        mov     KSZ, ecx                // key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments are in different registers, and they're a little tight.

# define CTX r8                         // context pointer
# define BLKSZ edx                      // block size

# define KSZ r9d                        // key size
# define NKW r10d                       // total number of key words
# define RCON rdi                       // round constants table
# define LIM rcx                        // limit pointer
# define CYIX r11d                      // index in shift-register cycle

# define NR ecx                         // number of rounds
# define LRK eax                        // distance to last key
# define BLKOFF edx                     // block size in bytes

        // We'll need the index registers, which belong to the caller in this
        // ABI.
        pushreg rsi
        pushreg rdi

        // Move arguments to more useful places.
        mov     rsi, r8                 // key material
        mov     CTX, rcx                // context base pointer
#endif

        endprologue

        // The initial round key material is taken directly from the input
        // key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
        // We've been lucky.  We already have a copy of the context pointer
        // in rdi, and the key size in ecx.
        add     rdi, w
#else
        lea     DI, [CTX + w]
        mov     ecx, KSZ
#endif
        rep     movsd                   // copy ecx key words to [DI]

        // Find out other useful things.
        mov     NKW, [CTX + nr]         // number of rounds
        add     NKW, 1
        imul    NKW, BLKSZ              // total key size in words
#if !NKW_NEEDS_REFRESH
        // If we can't keep NKW for later, then we use the same register for
        // it and LIM, so this move is unnecessary.
        mov     DWORD(LIM), NKW
#endif
        sub     DWORD(LIM), KSZ         // offset by the key size

        // Find the round constants.
        ldgot   WHOLE(c)
        leaext  RCON, F(rijndael_rcon), WHOLE(c)

        // Prepare for the main loop.
        lea     SI, [CTX + w]
        mov     eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
        lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
        xor     CYIX, CYIX              // start of new cycle

        // Main key expansion loop.  The first word of each key-length chunk
        // needs special treatment.
        //
        // This is rather tedious because the Intel `AESKEYGENASSIST'
        // instruction is very strangely shaped.  Firstly, it wants to
        // operate on vast SSE registers, even though we're data-blocked from
        // doing more than one operation at a time unless we're doing two key
        // schedules simultaneously -- and even then we can't do more than
        // two, because the instruction ignores two of its input words
        // entirely, and produces two different outputs for each of the other
        // two.  And secondly it insists on taking the magic round constant
        // as an immediate, so it's kind of annoying if you're not
        // open-coding the whole thing.  It's much easier to leave that as
        // zero and XOR in the round constant by hand.
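        //
        // In FIPS-197 terms, the loop below computes w[i] = w[i-Nk] ^ t,
        // where t is w[i-1] fed through RotWord and SubWord (plus the round
        // constant) at the start of each Nk-word cycle, through SubWord
        // alone at word four of the cycle when Nk > 6, and used unchanged
        // otherwise.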
0:      cmp     CYIX, 0                 // first word of the cycle?
        je      1f
        cmp     CYIX, 4                 // fourth word of the cycle?
        jne     2f
        cmp     KSZ, 7                  // and a large key?
        jb      2f

        // Fourth word of the cycle, and seven or eight words of key.  Do a
        // byte substitution.
        movd    xmm0, eax
        pshufd  xmm0, xmm0, SHUF(2, 1, 0, 3)
        aeskeygenassist xmm1, xmm0, 0
        movd    eax, xmm1
        jmp     2f

        // First word of the cycle.  This is the complicated piece.
1:      movd    xmm0, eax
        pshufd  xmm0, xmm0, SHUF(0, 3, 2, 1)
        aeskeygenassist xmm1, xmm0, 0
        pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
        movd    eax, xmm1
        xor     al, [RCON]
        inc     RCON

        // Common tail.  Mix in the corresponding word from the previous
        // cycle and prepare for the next loop.
2:      xor     eax, [SI]
        mov     [SI + 4*WHOLE(KSZ)], eax
        add     SI, 4
        inc     CYIX
        cmp     SI, LIM
        jae     9f
        cmp     CYIX, KSZ
        jb      0b
        xor     CYIX, CYIX
        jmp     0b

        // Next job is to construct the decryption keys.  The keys for the
        // first and last rounds don't need to be mangled, but the remaining
        // ones do -- and they all need to be reordered too.
        //
        // The plan of action, then, is to copy the final encryption round's
        // keys into place first, then to do each of the intermediate rounds
        // in reverse order, and finally do the first round.
        //
        // Do all of the heavy lifting with SSE registers.  The order we're
        // doing this in means that it's OK if we read or write too much, and
        // there's easily enough buffer space for the over-enthusiastic reads
        // and writes because the context has space for 32-byte blocks, which
        // is our maximum and an exact fit for two SSE registers.
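        //
        // (The middle rounds go through AESIMC, which applies the
        // InvMixColumns transformation to each key: AESDEC implements
        // FIPS-197's `equivalent inverse cipher', which wants its round
        // keys premangled in exactly this way.)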
9:      mov     NR, [CTX + nr]          // number of rounds
#if NKW_NEEDS_REFRESH
        mov     BLKOFF, BLKSZ
        mov     LRK, NR
        imul    LRK, BLKOFF
#else
        // If we retain NKW, then BLKSZ and BLKOFF are the same register
        // because we won't need the former again.
        mov     LRK, NKW
        sub     LRK, BLKSZ
#endif
        lea     DI, [CTX + wi]
        lea     SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
        shl     BLKOFF, 2               // block size (in bytes now)

        // Copy the last encryption round's keys.
        movdqu  xmm0, [SI]
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     0f
        movdqu  xmm0, [SI + 16]
        movdqu  [DI + 16], xmm0

        // Update the loop variables and stop if we've finished.
0:      add     DI, WHOLE(BLKOFF)
        sub     SI, WHOLE(BLKOFF)
        sub     NR, 1
        jbe     9f

        // Do another middle round's keys...
        movdqu  xmm0, [SI]
        aesimc  xmm0, xmm0
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     0b
        movdqu  xmm0, [SI + 16]
        aesimc  xmm0, xmm0
        movdqu  [DI + 16], xmm0
        jmp     0b

        // Finally do the first encryption round.
9:      movdqu  xmm0, [SI]
        movdqu  [DI], xmm0
        cmp     BLKOFF, 16
        jbe     1f
        movdqu  xmm0, [SI + 16]
        movdqu  [DI + 16], xmm0

        // If the block size is not exactly four words then we must end-swap
        // everything.  We can use fancy SSE toys for this.
1:      cmp     BLKOFF, 16
        je      9f

        // Find the byte-reordering table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
        // Calculate the number of subkey words again.  (It's a good job
        // we've got a fast multiplier.)
        mov     NKW, [CTX + nr]
        add     NKW, 1
        imul    NKW, BLKSZ
#endif

        // End-swap the encryption keys.
        lea     SI, [CTX + w]
        call    endswap_block

        // And the decryption keys.
        lea     SI, [CTX + wi]
        call    endswap_block

9:      // All done.
#if CPUFAM_X86
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
        popreg  rdi
        popreg  rsi
#endif
        ret

ENDFUNC

INTFUNC(endswap_block)
        // End-swap NKW words starting at SI.  The end-swapping table is
        // already loaded into XMM5; and it's OK to work in 16-byte chunks.
        endprologue

        mov     ecx, NKW
0:      movdqu  xmm1, [SI]
        pshufb  xmm1, xmm5
        movdqu  [SI], xmm1
        add     SI, 16
        sub     ecx, 4
        ja      0b

        ret

ENDFUNC

#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef RCON
#undef LIM
#undef NR
#undef LRK
#undef BLKOFF

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

        .macro  encdec  op, aes, koff
FUNC(rijndael_\op\()_x86ish_aesni_avx)
        vzeroupper                      // avoid XMM penalties
        endprologue
        // and drop through...
ENDFUNC

FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
        // Arguments come in on the stack, and need to be collected.  We
        // don't have a shortage of registers.

# define K eax
# define SRC edx
# define DST edx
# define NR ecx

        mov     K, [SP + 4]
        mov     SRC, [SP + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // Arguments come in registers.  All is good.

# define K rdi
# define SRC rsi
# define DST rdx
# define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments come in different registers.

# define K rcx
# define SRC rdx
# define DST r8
# define NR eax
#endif

        endprologue

        // Find the magic endianness-swapping table.
        ldgot   ecx
        movdqa  xmm5, [INTADDR(endswap_tab, ecx)]

        // Initial setup.
        movdqu  xmm0, [SRC]
        pshufb  xmm0, xmm5
        mov     NR, [K + nr]
        add     K, \koff

        // Initial whitening.
        movdqu  xmm1, [K]
        add     K, 16
        pxor    xmm0, xmm1
#if CPUFAM_X86
        mov     DST, [SP + 12]
#endif

        // Dispatch to the correct code.
        cmp     NR, 10
        je      10f
        jb      bogus
        cmp     NR, 14
        je      14f
        ja      bogus
        cmp     NR, 12
        je      12f
        jb      11f
        jmp     13f
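
        // (Rijndael uses max(Nk, Nb) + 6 rounds, where Nk and Nb are the
        // key and block sizes in 32-bit words, each between 4 and 8; since
        // this path only handles four-word blocks, NR is 10, 12, or 14 for
        // AES-128, -192, or -256, and 11 or 13 for Rijndael's nonstandard
        // 160- and 224-bit keys.  Each label below falls through into the
        // next, so entering at, say, `14' runs all fourteen rounds off the
        // contiguous key schedule.)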

        .align  2

        // 14 rounds...
14:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 13 rounds...
13:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 12 rounds...
12:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 11 rounds...
11:     movdqu  xmm1, [K]
        add     K, 16
        \aes    xmm0, xmm1

        // 10 rounds...
10:     movdqu  xmm1, [K]
        \aes    xmm0, xmm1

        // 9 rounds...
        movdqu  xmm1, [K + 16]
        \aes    xmm0, xmm1

        // 8 rounds...
        movdqu  xmm1, [K + 32]
        \aes    xmm0, xmm1

        // 7 rounds...
        movdqu  xmm1, [K + 48]
        \aes    xmm0, xmm1

        // 6 rounds...
        movdqu  xmm1, [K + 64]
        \aes    xmm0, xmm1

        // 5 rounds...
        movdqu  xmm1, [K + 80]
        \aes    xmm0, xmm1

        // 4 rounds...
        movdqu  xmm1, [K + 96]
        \aes    xmm0, xmm1

        // 3 rounds...
        movdqu  xmm1, [K + 112]
        \aes    xmm0, xmm1

        // 2 rounds...
        movdqu  xmm1, [K + 128]
        \aes    xmm0, xmm1

        // Final round...
        movdqu  xmm1, [K + 144]
        \aes\()last xmm0, xmm1

        // Unpermute the ciphertext block and store it.
        pshufb  xmm0, xmm5
        movdqu  [DST], xmm0

        // And we're done.
        ret

#undef K
#undef SRC
#undef DST
#undef NR

ENDFUNC
        .endm

        encdec  eblk, aesenc, w
        encdec  dblk, aesdec, wi
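
// The macro therefore defines `rijndael_eblk_x86ish_aesni' and
// `rijndael_dblk_x86ish_aesni' (and their `..._avx' variants).  As a
// sketch -- the authoritative prototypes live in the C headers -- each
// behaves like
//
//      void rijndael_eblk(const rijndael_ctx *k,
//                         const uint32 *src, uint32 *dst);
//
// reading one block at `src' and writing the processed block at `dst'.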

///--------------------------------------------------------------------------
/// Random utilities.

INTFUNC(bogus)
        // Abort the process because of a programming error.  Indirecting
        // through this point serves several purposes: (a) by CALLing, rather
        // than branching to, `abort', we can save the return address, which
        // might at least provide a hint as to what went wrong; (b) we don't
        // have conditional CALLs (and they'd be big anyway); and (c) we can
        // write a HLT here as a backstop against `abort' being mad.
        endprologue

        callext F(abort)
0:      hlt
        jmp     0b

ENDFUNC

///--------------------------------------------------------------------------
/// Data tables.

        RODATA

        .align  16
endswap_tab:
        .byte    3,  2,  1,  0
        .byte    7,  6,  5,  4
        .byte   11, 10,  9,  8
        .byte   15, 14, 13, 12
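
// PSHUFB takes output byte i of its destination from input byte
// endswap_tab[i], so this table reverses the bytes within each aligned
// 32-bit word.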

///----- That's all, folks --------------------------------------------------