mdw@git.distorted.org.uk Git - catacomb/blob - symm/rijndael-x86ish-aesni.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AESNI-based implementation of Rijndael
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .arch   .aes
  34
  35         .extern F(abort)
  36         .extern F(rijndael_rcon)
  37
  38         .text
  39
  40 ///--------------------------------------------------------------------------
  41 /// Main code.
  42
  43 /// The AESNI instructions implement a little-endian version of AES, but
  44 /// Catacomb's internal interface presents as big-endian so as to work better
  45 /// with things like GCM.  We therefore maintain the round keys in
  46 /// little-endian form, and have to end-swap blocks in and out.
  47 ///
  48 /// For added amusement, the AESNI instructions don't implement the
  49 /// larger-block versions of Rijndael, so we have to end-swap the keys if
  50 /// we're preparing for one of those.
  51
  52         // Sizing limits for the key schedule.
  53         .set    maxrounds, 16           // most rounds we ever schedule
  54         .set    maxblksz, 32            // largest block, in bytes
  55         .set    kbufsz, maxblksz*(maxrounds + 1) // bytes in one schedule buffer
  56
  57         // Byte offsets of the fields in the context structure.
  58         .set    nr, 0                   // round count (one 32-bit word)
  59         .set    w, nr + 4               // encryption round keys
  60         .set    wi, w + kbufsz          // decryption round keys
  61
  62 ///--------------------------------------------------------------------------
  63 /// Key setup.
  64
  65 FUNC(rijndael_setup_x86ish_aesni_avx)
             // Alternative entry point for key setup.  It does nothing except
             // issue VZEROUPPER and then run off its own end into
             // `rijndael_setup_x86ish_aesni', which must therefore follow
             // immediately; the arguments are left exactly as they arrived.
  66         vzeroupper                    // avoid penalty on `legacy' XMM access
  67   endprologue
  68         // and drop through...
  69 ENDFUNC
  70
  71 FUNC(rijndael_setup_x86ish_aesni)
             // Expand a key into the encryption and decryption schedules of a
             // context.  The arguments, in order, are: the context pointer; the
             // block size, in words; the address of the key material; and the
             // key size, in words.  The round count at offset `nr' of the
             // context is read here but never written, so it must have been set
             // before the call.  Encryption round keys are stored from offset
             // `w' and decryption round keys from offset `wi'.
             //
             // On the 64-bit targets only the low half of a register holding a
             // 32-bit argument is meaningful, so such a value is zero-extended
             // before the full register is used as a count or an index.
  72
  73 #define SI WHOLE(si)
  74 #define DI WHOLE(di)
  75
  76 #if CPUFAM_X86
  77         // Arguments are on the stack.  We'll need to stack the caller's
  78         // register variables, but we'll manage.
  79
  80 #  define CTX ebp                       // context pointer
  81 #  define BLKSZ [esp + 24]              // block size
  82
  83 #  define KSZ ebx                       // key size
  84 #  define NKW edx                       // total number of key words
  85 #  define NKW_NEEDS_REFRESH 1           // ... needs recalculating
  86 #  define RCON ecx                      // round constants table
  87 #  define LIM edx                       // limit pointer
  88 #  define CYIX edi                      // index in shift-register cycle
  89
  90 #  define NR ecx                        // number of rounds
  91 #  define LRK eax                       // distance to last key
  92 #  define BLKOFF edx                    // block size in bytes
  93
  94         // Stack the caller's registers.
  95         pushreg ebp
  96         pushreg ebx
  97         pushreg esi
  98         pushreg edi
  99
 100         // Set up our own variables.
 101         mov     CTX, [esp + 20]         // context base pointer
 102         mov     SI, [esp + 28]          // key material
 103         mov     KSZ, [esp + 32]         // key size, in words
 104 #endif
 105
 106 #if CPUFAM_AMD64 && ABI_SYSV
 107         // Arguments are in registers.  We have plenty, but, to be honest,
 108         // the initial register allocation is a bit annoying.
 109
 110 #  define CTX r8                        // context pointer
 111 #  define BLKSZ r9d                     // block size
 112
 113 #  define KSZ edx                       // key size
 114 #  define NKW r10d                      // total number of key words
 115 #  define RCON rdi                      // round constants table
 116 #  define LIM rcx                       // limit pointer
 117 #  define CYIX r11d                     // index in shift-register cycle
 118
 119 #  define NR ecx                        // number of rounds
 120 #  define LRK eax                       // distance to last key
 121 #  define BLKOFF r9d                    // block size in bytes
 122
 123         // Move arguments to more useful places.
 124         mov     CTX, rdi                // context base pointer
 125         mov     BLKSZ, esi              // block size in words
 126         mov     SI, rdx                 // key material
 127         mov     KSZ, ecx                // key size, in words
 128 #endif
 129
 130 #if CPUFAM_AMD64 && ABI_WIN
 131         // Arguments are in different registers, and they're a little tight.
 132
 133 #  define CTX r8                        // context pointer
 134 #  define BLKSZ edx                     // block size
 135
 136 #  define KSZ r9d                       // key size
 137 #  define NKW r10d                      // total number of key words
 138 #  define RCON rdi                      // round constants table
 139 #  define LIM rcx                       // limit pointer
 140 #  define CYIX r11d                     // index in shift-register cycle
 141
 142 #  define NR ecx                        // number of rounds
 143 #  define LRK eax                       // distance to last key
 144 #  define BLKOFF edx                    // block size in bytes
 145
 146         // We'll need the index registers, which belong to the caller in this
 147         // ABI.
 148         pushreg rsi
 149         pushreg rdi
 150
 151         // Move arguments to more useful places.
 152         mov     rsi, r8                 // key material
 153         mov     CTX, rcx                // context base pointer
             mov     KSZ, KSZ                // zero-extend: r9 is used as an index
 154 #endif
 155
 156   endprologue
 157
 158         // The initial round key material is taken directly from the input
 159         // key, so copy it over.
 160 #if CPUFAM_AMD64 && ABI_SYSV
 161         // We've been lucky.  We already have a copy of the context pointer
 162         // in rdi.
 163         add     rdi, w
 164 #else
 165         lea     DI, [CTX + w]
 166 #endif
 167         mov     ecx, KSZ                // word count, zero-extended for `rep'
 168         rep     movsd
 169
 170         // Find out other useful things.
 171         mov     NKW, [CTX + nr]         // number of rounds
 172         add     NKW, 1
 173         imul    NKW, BLKSZ              // total key size in words
 174 #if !NKW_NEEDS_REFRESH
 175         // If we can't keep NKW for later, then we use the same register for
 176         // it and LIM, so this move is unnecessary.
 177         mov     DWORD(LIM), NKW
 178 #endif
 179         sub     DWORD(LIM), KSZ         // offset by the key size
 180
 181         // Find the round constants.
 182         ldgot   WHOLE(c)
 183         leaext  RCON, F(rijndael_rcon), WHOLE(c)
 184
 185         // Prepare for the main loop.
 186         lea     SI, [CTX + w]
 187         mov     eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
 188         lea     LIM, [SI + 4*LIM]       // limit, offset by one key expansion
 189         xor     CYIX, CYIX              // start of new cycle
 190
 191         // Main key expansion loop.  The first word of each key-length chunk
 192         // needs special treatment.
 193         //
 194         // This is rather tedious because the Intel `AESKEYGENASSIST'
 195         // instruction is very strangely shaped.  Firstly, it wants to
 196         // operate on vast SSE registers, even though we're data-blocked from
 197         // doing more than one operation at a time unless we're doing two key
 198         // schedules simultaneously -- and even then we can't do more than
 199         // two, because the instruction ignores two of its input words
 200         // entirely, and produces two different outputs for each of the other
 201         // two.  And secondly it insists on taking the magic round constant
 202         // as an immediate, so it's kind of annoying if you're not
 203         // open-coding the whole thing.  It's much easier to leave that as
 204         // zero and XOR in the round constant by hand.
             //
             // Throughout the loop, eax holds the word most recently produced,
             // SI points at the word one key-length behind the one being made,
             // and CYIX counts words within the current key-length chunk.
 205 0:      cmp     CYIX, 0                 // first word of the cycle?
 206         je      1f
 207         cmp     CYIX, 4                 // fourth word of the cycle?
 208         jne     2f
 209         cmp     KSZ, 7                  // and a large key?
 210         jb      2f
 211
 212         // Fourth word of the cycle, and seven or eight words of key.  Do a
 213         // byte substitution.
 214         movd    xmm0, eax
 215         pshufd  xmm0, xmm0, SHUF(3, 0, 1, 2)
 216         aeskeygenassist xmm1, xmm0, 0
 217         movd    eax, xmm1
 218         jmp     2f
 219
 220         // First word of the cycle.  This is the complicated piece.
 221 1:      movd    xmm0, eax
 222         pshufd  xmm0, xmm0, SHUF(1, 2, 3, 0)
 223         aeskeygenassist xmm1, xmm0, 0
 224         pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
 225         movd    eax, xmm1
 226         xor     al, [RCON]              // round constant, applied by hand
 227         inc     RCON
 228
 229         // Common tail.  Mix in the corresponding word from the previous
 230         // cycle and prepare for the next loop.
 231 2:      xor     eax, [SI]
 232         mov     [SI + 4*WHOLE(KSZ)], eax
 233         add     SI, 4
 234         inc     CYIX
 235         cmp     SI, LIM
 236         jae     9f
 237         cmp     CYIX, KSZ
 238         jb      0b
 239         xor     CYIX, CYIX
 240         jmp     0b
 241
 242         // Next job is to construct the decryption keys.  The keys for the
 243         // first and last rounds don't need to be mangled, but the remaining
 244         // ones do -- and they all need to be reordered too.
 245         //
 246         // The plan of action, then, is to copy the final encryption round's
 247         // keys into place first, then to do each of the intermediate rounds
 248         // in reverse order, and finally do the first round.
 249         //
 250         // Do all of the heavy lifting with SSE registers.  The order we're
 251         // doing this in means that it's OK if we read or write too much, and
 252         // there's easily enough buffer space for the over-enthusiastic reads
 253         // and writes because the context has space for 32-byte blocks, which
 254         // is our maximum and an exact fit for two SSE registers.
 255 9:      mov     NR, [CTX + nr]          // number of rounds
 256 #if NKW_NEEDS_REFRESH
 257         mov     BLKOFF, BLKSZ
 258         mov     LRK, NR
 259         imul    LRK, BLKOFF
 260 #else
 261         // If we retain NKW, then BLKSZ and BLKOFF are the same register
 262         // because we won't need the former again.
 263         mov     LRK, NKW
 264         sub     LRK, BLKSZ
 265 #endif
 266         lea     DI, [CTX + wi]
 267         lea     SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
 268         shl     BLKOFF, 2               // block size (in bytes now)
 269
 270         // Copy the last encryption round's keys.
 271         movdqu  xmm0, [SI]
 272         movdqu  [DI], xmm0
 273         cmp     BLKOFF, 16
 274         jbe     0f
 275         movdqu  xmm0, [SI + 16]
 276         movdqu  [DI + 16], xmm0
 277
 278         // Update the loop variables and stop if we've finished.
 279 0:      add     DI, WHOLE(BLKOFF)
 280         sub     SI, WHOLE(BLKOFF)
 281         sub     NR, 1
 282         jbe     9f
 283
 284         // Do another middle round's keys...
 285         movdqu  xmm0, [SI]
 286         aesimc  xmm0, xmm0
 287         movdqu  [DI], xmm0
 288         cmp     BLKOFF, 16
 289         jbe     0b
 290         movdqu  xmm0, [SI + 16]
 291         aesimc  xmm0, xmm0
 292         movdqu  [DI + 16], xmm0
 293         jmp     0b
 294
 295         // Finally do the first encryption round.
 296 9:      movdqu  xmm0, [SI]
 297         movdqu  [DI], xmm0
 298         cmp     BLKOFF, 16
 299         jbe     1f
 300         movdqu  xmm0, [SI + 16]
 301         movdqu  [DI + 16], xmm0
 302
 303         // If the block size is not exactly four words then we must end-swap
 304         // everything.  We can use fancy SSE toys for this.
 305 1:      cmp     BLKOFF, 16
 306         je      9f
 307
 308         // Find the byte-reordering table.
 309         ldgot   ecx
 310         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 311
 312 #if NKW_NEEDS_REFRESH
 313         // Calculate the number of subkey words again.  (It's a good job
 314         // we've got a fast multiplier.)
 315         mov     NKW, [CTX + nr]
 316         add     NKW, 1
 317         imul    NKW, BLKSZ
 318 #endif
 319
 320         // End-swap the encryption keys.
 321         lea     SI, [CTX + w]
 322         call    endswap_block
 323
 324         // And the decryption keys.
 325         lea     SI, [CTX + wi]
 326         call    endswap_block
 327
 328 9:      // All done.
 329 #if CPUFAM_X86
 330         popreg  edi
 331         popreg  esi
 332         popreg  ebx
 333         popreg  ebp
 334 #endif
 335 #if CPUFAM_AMD64 && ABI_WIN
 336         popreg  rdi
 337         popreg  rsi
 338 #endif
 339         ret
 340
 341 ENDFUNC
 342
 343 INTFUNC(endswap_block)
 344         // End-swap NKW words starting at SI.  The end-swapping table is
 345         // already loaded into XMM5; and it's OK to work in 16-byte chunks.
             //
             // The words are rewritten in place.  On return SI has been advanced
             // past the last chunk handled, and ecx, XMM1 and the flags have
             // been overwritten; NKW itself is left alone.  The loop tests its
             // count only after handling a chunk, so one chunk is always
             // processed, and when NKW is not a multiple of four the final chunk
             // takes in up to three words beyond the requested range.
 346   endprologue
 347
 348         mov     ecx, NKW                // words still to be swapped
 349 0:      movdqu  xmm1, [SI]
 350         pshufb  xmm1, xmm5              // reverse the bytes of each word
 351         movdqu  [SI], xmm1
 352         add     SI, 16
 353         sub     ecx, 4                  // four words per chunk
 354         ja      0b                      // go again unless we hit or passed zero
 355
 356         ret
 357
 358 ENDFUNC
 359
 360 #undef CTX
 361 #undef BLKSZ
 362 #undef SI
 363 #undef DI
 364 #undef KSZ
     #undef NKW                           // also set up by the key-setup code
     #undef NKW_NEEDS_REFRESH             // only defined for 32-bit x86
 365 #undef RCON
 366 #undef LIM
     #undef CYIX                          // also set up by the key-setup code
 367 #undef NR
 368 #undef LRK
 369 #undef BLKOFF
 370
 371 ///--------------------------------------------------------------------------
 372 /// Encrypting and decrypting blocks.
 373
 374 .macro  encdec  op, aes, koff
             // Define a pair of functions which each process one 16-byte block:
             // the plain AESNI version, and an `_avx' entry point which issues
             // VZEROUPPER and then falls into it.  The macro arguments are: OP,
             // the middle part of the function names; AES, the instruction to
             // use for the ordinary rounds (the final round uses the same
             // mnemonic with `last' appended); and KOFF, the offset within the
             // context of the round keys to use.
             //
             // The functions' arguments, in order, are: the context pointer; the
             // address of the input block; and the address for the output block.
             // The round count is read from the context and must be between 10
             // and 14 inclusive: anything else branches to `bogus'.
 375   FUNC(rijndael_\op\()_x86ish_aesni_avx)
 376         vzeroupper                      // avoid XMM penalties
 377   endprologue
 378         // and drop through...
 379   ENDFUNC
 380
 381   FUNC(rijndael_\op\()_x86ish_aesni)
 382
 383 #if CPUFAM_X86
 384         // Arguments come in on the stack, and need to be collected.  We
 385         // don't have a shortage of registers.
             //
             // SRC and DST share a register: the destination address is only
             // fetched once the input block has been loaded.
 386
 387 #  define K eax
 388 #  define SRC edx
 389 #  define DST edx
 390 #  define NR ecx
 391
 392         mov     K, [esp + 4]
 393         mov     SRC, [esp + 8]
 394 #endif
 395
 396 #if CPUFAM_AMD64 && ABI_SYSV
 397         // Arguments come in registers.  All is good.
 398
 399 #  define K rdi
 400 #  define SRC rsi
 401 #  define DST rdx
 402 #  define NR eax
 403 #endif
 404
 405 #if CPUFAM_AMD64 && ABI_WIN
 406         // Arguments come in different registers.
 407
 408 #  define K rcx
 409 #  define SRC rdx
 410 #  define DST r8
 411 #  define NR eax
 412 #endif
 413
 414   endprologue
 415
 416         // Find the magic endianness-swapping table.
 417         ldgot   ecx
 418         movdqa  xmm5, [INTADDR(endswap_tab, ecx)]
 419
 420         // Initial setup.
 421         movdqu  xmm0, [SRC]
 422         pshufb  xmm0, xmm5              // end-swap the input block
 423         mov     NR, [K + nr]
 424         add     K, \koff                // K now points at the first round key
 425
 426         // Initial whitening.
 427         movdqu  xmm1, [K]
 428         add     K, 16
 429         pxor    xmm0, xmm1
 430 #if CPUFAM_X86
 431         mov     DST, [esp + 12]
 432 #endif
 433
 434         // Dispatch to the correct code.
             //
             // The labels below are named for the total number of rounds.  Each
             // of the first four steps consumes one round key and advances K, so
             // that whichever entry is chosen, K has the same position relative
             // to the remaining keys when label 10 is reached; from there on
             // the keys are picked up at fixed offsets.
 435         cmp     NR, 10
 436         je      10f
 437         jb      bogus
 438         cmp     NR, 14
 439         je      14f
 440         ja      bogus
 441         cmp     NR, 12
 442         je      12f
 443         jb      11f
 444         jmp     13f
 445
 446         .align  2
 447
 448         // 14 rounds...
 449 14:     movdqu  xmm1, [K]
 450         add     K, 16
 451         \aes    xmm0, xmm1
 452
 453         // 13 rounds...
 454 13:     movdqu  xmm1, [K]
 455         add     K, 16
 456         \aes    xmm0, xmm1
 457
 458         // 12 rounds...
 459 12:     movdqu  xmm1, [K]
 460         add     K, 16
 461         \aes    xmm0, xmm1
 462
 463         // 11 rounds...
 464 11:     movdqu  xmm1, [K]
 465         add     K, 16
 466         \aes    xmm0, xmm1
 467
 468         // 10 rounds...
 469 10:     movdqu  xmm1, [K]
 470         \aes    xmm0, xmm1
 471
 472         // 9 rounds...
 473         movdqu  xmm1, [K + 16]
 474         \aes    xmm0, xmm1
 475
 476         // 8 rounds...
 477         movdqu  xmm1, [K + 32]
 478         \aes    xmm0, xmm1
 479
 480         // 7 rounds...
 481         movdqu  xmm1, [K + 48]
 482         \aes    xmm0, xmm1
 483
 484         // 6 rounds...
 485         movdqu  xmm1, [K + 64]
 486         \aes    xmm0, xmm1
 487
 488         // 5 rounds...
 489         movdqu  xmm1, [K + 80]
 490         \aes    xmm0, xmm1
 491
 492         // 4 rounds...
 493         movdqu  xmm1, [K + 96]
 494         \aes    xmm0, xmm1
 495
 496         // 3 rounds...
 497         movdqu  xmm1, [K + 112]
 498         \aes    xmm0, xmm1
 499
 500         // 2 rounds...
 501         movdqu  xmm1, [K + 128]
 502         \aes    xmm0, xmm1
 503
 504         // Final round...
 505         movdqu  xmm1, [K + 144]
 506         \aes\()last xmm0, xmm1
 507
 508         // Unpermute the ciphertext block and store it.
 509         pshufb  xmm0, xmm5
 510         movdqu  [DST], xmm0
 511
 512         // And we're done.
 513         ret
 514
 515 #undef K
 516 #undef SRC
 517 #undef DST
 518 #undef NR
 519
 520   ENDFUNC
 521 .endm
 522
             // Instantiate the block functions: `eblk' works through the keys at
             // offset `w' with AESENC, and `dblk' through those at offset `wi'
             // with AESDEC.
 523         encdec  eblk, aesenc, w
 524         encdec  dblk, aesdec, wi
 525
 526 ///--------------------------------------------------------------------------
 527 /// Random utilities.
 528
 529 INTFUNC(bogus)
 530         // Abort the process because of a programming error.  Indirecting
 531         // through this point serves several purposes: (a) by CALLing, rather
 532         // than branching to, `abort', we can save the return address, which
 533         // might at least provide a hint as to what went wrong; (b) we don't
 534         // have conditional CALLs (and they'd be big anyway); and (c) we can
 535         // write a HLT here as a backstop against `abort' being mad.
             //
             // The block functions arrive here by a conditional jump when they
             // find a round count they can't handle, so nothing is pushed on the
             // way in, and there is no path which returns to the caller.
             //
             // NOTE(review): because this is entered by a jump, the stack
             // pointer is still as it was on entry to the block function, and
             // no adjustment is made here before the call -- confirm that
             // `callext' or `abort' copes with whatever alignment that implies.
 536   endprologue
 537
 538         callext F(abort)
 539 0:      hlt                             // only reached if `abort' returns
 540         jmp     0b                      // ... and keep trying if HLT resumes
 541
 542 ENDFUNC
 543
 544 ///--------------------------------------------------------------------------
 545 /// Data tables.
 546
 547         RODATA
 548
 549         .align  16
             // Byte-selection table for PSHUFB.  Each group of four indices runs
             // downwards, so the instruction reverses the order of the bytes
             // within each 32-bit word of a vector while leaving the words
             // themselves where they are.  The table is fetched with MOVDQA,
             // which is why it is aligned to 16 bytes.
 550 endswap_tab:
 551         .byte    3,  2,  1,  0
 552         .byte    7,  6,  5,  4
 553         .byte   11, 10,  9,  8
 554         .byte   15, 14, 13, 12
 555
 556 ///----- That's all, folks --------------------------------------------------