mdw@git.distorted.org.uk Git - catacomb/blob - symm/rijndael-x86ish-aesni.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AESNI-based implementation of Rijndael
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .extern F(abort)
  34         .extern F(rijndael_rcon)
  35
  36 ///--------------------------------------------------------------------------
  37 /// Main code.
  38
  39         .arch   .aes                    // enable the AES instruction-set extension in the assembler
  40         .text                           // everything up to the data tables is code
  41
  42 /// The AESNI instructions implement a little-endian version of AES, but
  43 /// Catacomb's internal interface presents as big-endian so as to work better
  44 /// with things like GCM.  We therefore maintain the round keys in
  45 /// little-endian form, and have to end-swap blocks in and out.
  46 ///
  47 /// For added amusement, the AESNI instructions don't implement the
  48 /// larger-block versions of Rijndael, so we have to end-swap the keys if
  49 /// we're preparing for one of those.
  50
  51         // Useful constants.
  52         .equ    maxrounds, 16           // maximum number of rounds
  53         .equ    maxblksz, 32            // maximum block size, in bytes
  54         .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes (32*17 = 544)
  55
  56         // Context structure: byte offsets from the context base pointer.
  57         .equ    nr, 0                   // number of rounds (accessed as a 32-bit word)
  58         .equ    w, nr + 4               // encryption key words (offset 4)
  59         .equ    wi, w + kbufsz          // decryption key words (offset 548)
  60
  61 ///--------------------------------------------------------------------------
  62 /// Key setup.
  63
  64 FUNC(rijndael_setup_x86ish_aesni_avx)
             // Alternative entry point to the key-setup code.  It executes
             // VZEROUPPER and then runs straight on into
             // `rijndael_setup_x86ish_aesni', which follows immediately.  No
             // argument register or stack slot is touched here, so the
             // arguments are exactly those of that function.
  65         vzeroupper                    // avoid penalty on `legacy' XMM access
  66   endprologue
  67         // and drop through...
  68 ENDFUNC
  69
  70 FUNC(rijndael_setup_x86ish_aesni)
             // Build the encryption and decryption key schedules in a context.
             //
             // Arguments, in order: the context pointer; the block size, in
             // words; a pointer to the key material; and the key size, in words.
             // The number of rounds is read from the context's `nr' slot and is
             // never written here, so it must already be valid on entry.
             //
             // On exit, the context's `w' area holds the expanded encryption
             // round keys, and `wi' holds the decryption round keys, stored in
             // the reverse order with the intermediate rounds' keys passed
             // through AESIMC.  If the block size is not exactly four words then
             // both areas are end-swapped, word by word.
             //
             // Three ABIs are supported (32-bit x86, AMD64 SysV, AMD64 Windows);
             // each one defines the register aliases below differently, and the
             // common code is written in terms of the aliases.
  71
  72 #define SI WHOLE(si)
  73 #define DI WHOLE(di)
  74
  75 #if CPUFAM_X86
  76         // Arguments are on the stack.  We'll need to stack the caller's
  77         // register variables, but we'll manage.
  78
  79 #  define CTX ebp                       // context pointer
  80 #  define BLKSZ [esp + 24]              // block size (stack slot; offset allows for the four pushes below)
  81
  82 #  define KSZ ebx                       // key size
  83 #  define NKW edx                       // total number of key words
  84 #  define NKW_NEEDS_REFRESH 1           // ... needs recalculating
  85 #  define RCON ecx                      // round constants table
  86 #  define LIM edx                       // limit pointer
  87 #  define CYIX edi                      // index in shift-register cycle
  88
  89 #  define NR ecx                        // number of rounds
  90 #  define LRK eax                       // distance to last key
  91 #  define BLKOFF edx                    // block size in bytes
  92
  93         // Stack the caller's registers.
  94         pushreg ebp
  95         pushreg ebx
  96         pushreg esi
  97         pushreg edi
  98
  99         // Set up our own variables.
 100         mov     CTX, [esp + 20]         // context base pointer
 101         mov     SI, [esp + 28]          // key material
 102         mov     KSZ, [esp + 32]         // key size, in words
 103 #endif
 104
 105 #if CPUFAM_AMD64 && ABI_SYSV
 106         // Arguments are in registers.  We have plenty, but, to be honest,
 107         // the initial register allocation is a bit annoying.
 108
 109 #  define CTX r8                        // context pointer
 110 #  define BLKSZ r9d                     // block size
 111
 112 #  define KSZ edx                       // key size
 113 #  define NKW r10d                      // total number of key words
 114 #  define RCON rdi                      // round constants table
 115 #  define LIM rcx                       // limit pointer
 116 #  define CYIX r11d                     // index in shift-register cycle
 117
 118 #  define NR ecx                        // number of rounds
 119 #  define LRK eax                       // distance to last key
 120 #  define BLKOFF r9d                    // block size in bytes
 121
 122         // Move arguments to more useful places.
 123         mov     CTX, rdi                // context base pointer
 124         mov     BLKSZ, esi              // block size in words
 125         mov     SI, rdx                 // key material
 126         mov     KSZ, ecx                // key size, in words
 127 #endif
 128
 129 #if CPUFAM_AMD64 && ABI_WIN
 130         // Arguments are in different registers, and they're a little tight.
 131
 132 #  define CTX r8                        // context pointer
 133 #  define BLKSZ edx                     // block size
 134
 135 #  define KSZ r9d                       // key size
 136 #  define NKW r10d                      // total number of key words
 137 #  define RCON rdi                      // round constants table
 138 #  define LIM rcx                       // limit pointer
 139 #  define CYIX r11d                     // index in shift-register cycle
 140
 141 #  define NR ecx                        // number of rounds
 142 #  define LRK eax                       // distance to last key
 143 #  define BLKOFF edx                    // block size in bytes
 144
 145         // We'll need the index registers, which belong to the caller in this
 146         // ABI.
 147         pushreg rsi
 148         pushreg rdi
 149
 150         // Move arguments to more useful places.
 151         mov     rsi, r8                 // key material
 152         mov     CTX, rcx                // context base pointer
 153 #endif
 154
 155   endprologue
 156
 157         // The initial round key material is taken directly from the input
 158         // key, so copy it over.
 159 #if CPUFAM_AMD64 && ABI_SYSV
 160         // We've been lucky.  We already have a copy of the context pointer
 161         // in rdi, and the key size in ecx.
 162         add     rdi, w
 163 #else
 164         lea     DI, [CTX + w]
 165         mov     ecx, KSZ
 166 #endif
 167         rep     movsd                   // copy ecx key words from [SI] to [DI]
 168
 169         // Find out other useful things.
 170         mov     NKW, [CTX + nr]         // number of rounds
 171         add     NKW, 1
 172         imul    NKW, BLKSZ              // total key size in words
 173 #if !NKW_NEEDS_REFRESH
 174         // If we can't keep NKW for later, then we use the same register for
 175         // it and LIM, so this move is unnecessary.
 176         mov     DWORD(LIM), NKW
 177 #endif
 178         sub     DWORD(LIM), KSZ         // offset by the key size
 179
 180         // Find the round constants.
 181         ldgot   WHOLE(c)
 182         leaext  RCON, F(rijndael_rcon), WHOLE(c)
 183
 184         // Prepare for the main loop.
 185         lea     SI, [CTX + w]
 186         mov     eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
 187         lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
 188         xor     CYIX, CYIX              // start of new cycle
 189
 190         // Main key expansion loop.  The first word of each key-length chunk
 191         // needs special treatment.
 192         //
 193         // This is rather tedious because the Intel `AESKEYGENASSIST'
 194         // instruction is very strangely shaped.  Firstly, it wants to
 195         // operate on vast SSE registers, even though we're data-blocked from
 196         // doing more than one operation at a time unless we're doing two key
 197         // schedules simultaneously -- and even then we can't do more than
 198         // two, because the instruction ignores two of its input words
 199         // entirely, and produces two different outputs for each of the other
 200         // two.  And secondly it insists on taking the magic round constant
 201         // as an immediate, so it's kind of annoying if you're not
 202         // open-coding the whole thing.  It's much easier to leave that as
 203         // zero and XOR in the round constant by hand.
             //
             // Throughout the loop, eax holds the most recently generated key
             // word, SI points at the word one key-length behind the one about
             // to be written, and CYIX counts words within the current
             // key-length chunk.
 204 0:      cmp     CYIX, 0                 // first word of the cycle?
 205         je      1f
 206         cmp     CYIX, 4                 // fourth word of the cycle?
 207         jne     2f
 208         cmp     KSZ, 7                  // and a large key?
 209         jb      2f
 210
 211         // Fourth word of the cycle, and seven or eight words of key.  Do a
 212         // byte substitution.
 213         movd    xmm0, eax
 214         pshufd  xmm0, xmm0, SHUF(3, 0, 1, 2)
 215         aeskeygenassist xmm1, xmm0, 0
 216         movd    eax, xmm1
 217         jmp     2f
 218
 219         // First word of the cycle.  This is the complicated piece.
 220 1:      movd    xmm0, eax
 221         pshufd  xmm0, xmm0, SHUF(1, 2, 3, 0)
 222         aeskeygenassist xmm1, xmm0, 0
 223         pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
 224         movd    eax, xmm1
 225         xor     al, [RCON]              // mix in this cycle's round constant byte
 226         inc     RCON                    // and step on to the next one
 227
 228         // Common tail.  Mix in the corresponding word from the previous
 229         // cycle and prepare for the next loop.
 230 2:      xor     eax, [SI]
 231         mov     [SI + 4*WHOLE(KSZ)], eax
 232         add     SI, 4
 233         inc     CYIX
 234         cmp     SI, LIM                 // written all of the key words?
 235         jae     9f
 236         cmp     CYIX, KSZ               // finished this key-length chunk?
 237         jb      0b
 238         xor     CYIX, CYIX
 239         jmp     0b
 240
 241         // Next job is to construct the decryption keys.  The keys for the
 242         // first and last rounds don't need to be mangled, but the remaining
 243         // ones do -- and they all need to be reordered too.
 244         //
 245         // The plan of action, then, is to copy the final encryption round's
 246         // keys into place first, then to do each of the intermediate rounds
 247         // in reverse order, and finally do the first round.
 248         //
 249         // Do all of the heavy lifting with SSE registers.  The order we're
 250         // doing this in means that it's OK if we read or write too much, and
 251         // there's easily enough buffer space for the over-enthusiastic reads
 252         // and writes because the context has space for 32-byte blocks, which
 253         // is our maximum and an exact fit for two SSE registers.
 254 9:      mov     NR, [CTX + nr]          // number of rounds
 255 #if NKW_NEEDS_REFRESH
 256         mov     BLKOFF, BLKSZ
 257         mov     LRK, NR
 258         imul    LRK, BLKOFF
 259 #else
 260         // If we retain NKW, then BLKSZ and BLKOFF are the same register
 261         // because we won't need the former again.
 262         mov     LRK, NKW
 263         sub     LRK, BLKSZ
 264 #endif
 265         lea     DI, [CTX + wi]
 266         lea     SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
 267         shl     BLKOFF, 2               // block size (in bytes now)
 268
 269         // Copy the last encryption round's keys.
 270         movdqu  xmm0, [SI]
 271         movdqu  [DI], xmm0
 272         cmp     BLKOFF, 16
 273         jbe     0f
 274         movdqu  xmm0, [SI + 16]
 275         movdqu  [DI + 16], xmm0
 276
 277         // Update the loop variables and stop if we've finished.
 278 0:      add     DI, WHOLE(BLKOFF)       // output advances forwards...
 279         sub     SI, WHOLE(BLKOFF)       // ... while input steps backwards
 280         sub     NR, 1
 281         jbe     9f                      // no middle rounds left
 282
 283         // Do another middle round's keys...
 284         movdqu  xmm0, [SI]
 285         aesimc  xmm0, xmm0
 286         movdqu  [DI], xmm0
 287         cmp     BLKOFF, 16
 288         jbe     0b
 289         movdqu  xmm0, [SI + 16]
 290         aesimc  xmm0, xmm0
 291         movdqu  [DI + 16], xmm0
 292         jmp     0b
 293
 294         // Finally do the first encryption round.
 295 9:      movdqu  xmm0, [SI]
 296         movdqu  [DI], xmm0
 297         cmp     BLKOFF, 16
 298         jbe     1f
 299         movdqu  xmm0, [SI + 16]
 300         movdqu  [DI + 16], xmm0
 301
 302         // If the block size is not exactly four words then we must end-swap
 303         // everything.  We can use fancy SSE toys for this.
 304 1:      cmp     BLKOFF, 16
 305         je      9f
 306
 307         // Find the byte-reordering table.
 308         ldgot   ecx
 309         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 310
 311 #if NKW_NEEDS_REFRESH
 312         // Calculate the number of subkey words again.  (It's a good job
 313         // we've got a fast multiplier.)
 314         mov     NKW, [CTX + nr]
 315         add     NKW, 1
 316         imul    NKW, BLKSZ
 317 #endif
 318
 319         // End-swap the encryption keys.
 320         lea     SI, [CTX + w]
 321         call    endswap_block
 322
 323         // And the decryption keys.
 324         lea     SI, [CTX + wi]
 325         call    endswap_block
 326
 327 9:      // All done.
 328 #if CPUFAM_X86
 329         popreg  edi
 330         popreg  esi
 331         popreg  ebx
 332         popreg  ebp
 333 #endif
 334 #if CPUFAM_AMD64 && ABI_WIN
 335         popreg  rdi
 336         popreg  rsi
 337 #endif
 338         ret
 339
 340 ENDFUNC
 341
 342 INTFUNC(endswap_block)
 343         // End-swap NKW words starting at SI.  The end-swapping table is
 344         // already loaded into XMM5; and it's OK to work in 16-byte chunks.
             //
             // This is an internal helper which uses the key-setup code's
             // register aliases rather than a standard calling convention.  The
             // loop always handles at least one chunk, and rounds the word count
             // up to a whole number of four-word chunks.  NKW is only read; ecx,
             // xmm1 and the flags are clobbered, and SI is left pointing just
             // past the last chunk processed.
 345   endprologue
 346
 347         mov     ecx, NKW                // words still to do
 348 0:      movdqu  xmm1, [SI]
 349         pshufb  xmm1, xmm5              // reverse the bytes of each word
 350         movdqu  [SI], xmm1
 351         add     SI, 16
 352         sub     ecx, 4
 353         ja      0b                      // go again unless we hit or passed zero
 354
 355         ret
 356
 357 ENDFUNC
 358
 359 #undef CTX
 360 #undef BLKSZ
 361 #undef SI
 362 #undef DI
 363 #undef KSZ
 364 #undef RCON
 365 #undef LIM
 366 #undef NR
 367 #undef LRK
 368 #undef BLKOFF
     // The key-setup code also defines these (NKW_NEEDS_REFRESH only on 32-bit
     // x86, but undefining an undefined macro is harmless).  Retire them with
     // the rest so that no register alias leaks into the code below.
     #undef NKW
     #undef NKW_NEEDS_REFRESH
     #undef CYIX
 369
 370 ///--------------------------------------------------------------------------
 371 /// Encrypting and decrypting blocks.
 372
 373 .macro  encdec  op, aes, koff
             // Define a pair of single-block functions named
             // `rijndael_OP_x86ish_aesni_avx' and `rijndael_OP_x86ish_aesni'.
             // AES is the round instruction to use (with `last' appended for the
             // final round), and KOFF is the offset within the context of the
             // round keys to apply.
             //
             // The generated functions take, in order: the context pointer, the
             // source block address, and the destination block address.  They
             // process one 16-byte block, end-swapping it on the way in and on
             // the way out.  Round counts from 10 to 14 inclusive are handled;
             // anything else branches to `bogus'.  They use xmm0, xmm1 and xmm5,
             // plus the general registers aliased below.
 374   FUNC(rijndael_\op\()_x86ish_aesni_avx)
 375         vzeroupper                      // avoid XMM penalties
 376   endprologue
 377         // and drop through...
 378   ENDFUNC
 379
 380   FUNC(rijndael_\op\()_x86ish_aesni)
 381
 382 #if CPUFAM_X86
 383         // Arguments come in on the stack, and need to be collected.  We
 384         // don't have a shortage of registers.
 385
 386 #  define K eax
 387 #  define SRC edx
 388 #  define DST edx
 389 #  define NR ecx
 390
 391         mov     K, [esp + 4]
 392         mov     SRC, [esp + 8]
 393 #endif
 394
 395 #if CPUFAM_AMD64 && ABI_SYSV
 396         // Arguments come in registers.  All is good.
 397
 398 #  define K rdi
 399 #  define SRC rsi
 400 #  define DST rdx
 401 #  define NR eax
 402 #endif
 403
 404 #if CPUFAM_AMD64 && ABI_WIN
 405         // Arguments come in different registers.
 406
 407 #  define K rcx
 408 #  define SRC rdx
 409 #  define DST r8
 410 #  define NR eax
 411 #endif
 412
 413   endprologue
 414
 415         // Find the magic endianness-swapping table.
 416         ldgot   ecx
 417         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 418
 419         // Initial setup.
 420         movdqu  xmm0, [SRC]             // fetch the input block...
 421         pshufb  xmm0, xmm5              // ... and end-swap each of its words
 422         mov     NR, [K + nr]            // number of rounds, from the context
 423         add     K, \koff                // now K points at the round keys
 424
 425         // Initial whitening.
 426         movdqu  xmm1, [K]
 427         add     K, 16
 428         pxor    xmm0, xmm1
 429 #if CPUFAM_X86
             // SRC and DST share a register here, so the destination can only
             // be collected now that the source block has been loaded.
 430         mov     DST, [esp + 12]
 431 #endif
 432
 433         // Dispatch to the correct code.
             //
             // Each of the entry points 14, 13, 12 and 11 below applies one
             // round and advances K by one round key, falling through to the
             // next.  By the time we reach label 10, then, the ten remaining
             // round keys are at fixed offsets from K.
 434         cmp     NR, 10
 435         je      10f
 436         jb      bogus
 437         cmp     NR, 14
 438         je      14f
 439         ja      bogus
 440         cmp     NR, 12
 441         je      12f
 442         jb      11f
 443         jmp     13f
 444
 445         .align  2
 446
 447         // 14 rounds...
 448 14:     movdqu  xmm1, [K]
 449         add     K, 16
 450         \aes    xmm0, xmm1
 451
 452         // 13 rounds...
 453 13:     movdqu  xmm1, [K]
 454         add     K, 16
 455         \aes    xmm0, xmm1
 456
 457         // 12 rounds...
 458 12:     movdqu  xmm1, [K]
 459         add     K, 16
 460         \aes    xmm0, xmm1
 461
 462         // 11 rounds...
 463 11:     movdqu  xmm1, [K]
 464         add     K, 16
 465         \aes    xmm0, xmm1
 466
 467         // 10 rounds...
 468 10:     movdqu  xmm1, [K]
 469         \aes    xmm0, xmm1
 470
 471         // 9 rounds...
 472         movdqu  xmm1, [K + 16]
 473         \aes    xmm0, xmm1
 474
 475         // 8 rounds...
 476         movdqu  xmm1, [K + 32]
 477         \aes    xmm0, xmm1
 478
 479         // 7 rounds...
 480         movdqu  xmm1, [K + 48]
 481         \aes    xmm0, xmm1
 482
 483         // 6 rounds...
 484         movdqu  xmm1, [K + 64]
 485         \aes    xmm0, xmm1
 486
 487         // 5 rounds...
 488         movdqu  xmm1, [K + 80]
 489         \aes    xmm0, xmm1
 490
 491         // 4 rounds...
 492         movdqu  xmm1, [K + 96]
 493         \aes    xmm0, xmm1
 494
 495         // 3 rounds...
 496         movdqu  xmm1, [K + 112]
 497         \aes    xmm0, xmm1
 498
 499         // 2 rounds...
 500         movdqu  xmm1, [K + 128]
 501         \aes    xmm0, xmm1
 502
 503         // Final round...
 504         movdqu  xmm1, [K + 144]
 505         \aes\()last xmm0, xmm1
 506
 507         // Unpermute the ciphertext block and store it.
 508         pshufb  xmm0, xmm5
 509         movdqu  [DST], xmm0
 510
 511         // And we're done.
 512         ret
 513
 514 #undef K
 515 #undef SRC
 516 #undef DST
 517 #undef NR
 518
 519   ENDFUNC
 520 .endm
 521
 522         encdec  eblk, aesenc, w         // rijndael_eblk_x86ish_aesni{,_avx}: AESENC with the keys at `w'
 523         encdec  dblk, aesdec, wi        // rijndael_dblk_x86ish_aesni{,_avx}: AESDEC with the keys at `wi'
 524
 525 ///--------------------------------------------------------------------------
 526 /// Random utilities.
 527
 528 INTFUNC(bogus)
 529         // Abort the process because of a programming error.  Indirecting
 530         // through this point serves several purposes: (a) by CALLing, rather
 531         // than branching to, `abort', we can save the return address, which
 532         // might at least provide a hint as to what went wrong; (b) we don't
 533         // have conditional CALLs (and they'd be big anyway); and (c) we can
 534         // write a HLT here as a backstop against `abort' being mad.
             //
             // The block functions branch here when the context's round count is
             // outside the range 10 to 14.  Control never returns from here.
 535   endprologue
 536
 537         callext F(abort)
 538 0:      hlt                             // only reached if `abort' returns
 539         jmp     0b                      // ... and keep trying if HLT falls through
 540
 541 ENDFUNC
 542
 543 ///--------------------------------------------------------------------------
 544 /// Data tables.
 545
 546         RODATA
 547
             // Byte-shuffle control vector for PSHUFB: it reverses the order of
             // the four bytes within each 32-bit word of a 16-byte vector.  The
             // table is loaded with MOVDQA, hence the 16-byte alignment.
 548         .align  16
 549 endswap_tab:
 550         .byte    3,  2,  1,  0
 551         .byte    7,  6,  5,  4
 552         .byte   11, 10,  9,  8
 553         .byte   15, 14, 13, 12
 554
 555 ///----- That's all, folks --------------------------------------------------