symm/rijndael-x86ish-aesni.S: Fix conflict in 32-bit register allocation.
[catacomb] / symm / rijndael-x86ish-aesni.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// AESNI-based implementation of Rijndael
4 ///
5 /// (c) 2015 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .globl F(abort) // called on fatal internal error (see `bogus' below)
34 .globl F(rijndael_rcon) // round-constant table, defined elsewhere
35
36 ///--------------------------------------------------------------------------
37 /// Main code.
38
39 .arch .aes // permit the AES-NI instructions
40 .text
41
42 /// The AESNI instructions implement a little-endian version of AES, but
43 /// Catacomb's internal interface presents as big-endian so as to work better
44 /// with things like GCM. We therefore maintain the round keys in
45 /// little-endian form, and have to end-swap blocks in and out.
46 ///
47 /// For added amusement, the AESNI instructions don't implement the
48 /// larger-block versions of Rijndael, so we have to end-swap the keys if
49 /// we're preparing for one of those.
50
51 // Useful constants.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
55
56 // Context structure: byte offsets of the fields within the context.
57 .equ nr, 0 // number of rounds
58 .equ w, nr + 4 // encryption key words
59 .equ wi, w + kbufsz // decryption key words
60
61 ///--------------------------------------------------------------------------
62 /// Key setup.
63
64 FUNC(rijndael_setup_x86ish_aesni)
65
66 #if CPUFAM_X86
67 // Arguments are on the stack. We'll need to stack the caller's
68 // register variables, but we'll manage.
69
70 # define CTX ebp // context pointer
71 # define BLKSZ [esp + 24] // block size
72
73 # define SI esi // source pointer
74 # define DI edi // destination pointer
75
76 # define KSZ ebx // key size
77 # define KSZo ebx // ... as address offset
78 # define NKW edx // total number of key words
79 # define NKW_NEEDS_REFRESH 1 // ... needs recalculating
80 # define RCON ecx // round constants table
81 # define LIM edx // limit pointer
82 # define LIMn edx // ... as integer offset from base
83 # define CYIX edi // index in shift-register cycle
84
85 # define NR ecx // number of rounds
86 # define LRK eax // distance to last key
87 # define LRKo eax // ... as address offset
88 # define BLKOFF edx // block size in bytes
89 # define BLKOFFo edx // ... as address offset
90
91 // Stack the caller's registers.
92 push ebp
93 push ebx
94 push esi
95 push edi
96
97 // Set up our own variables.
98 mov CTX, [esp + 20] // context base pointer
99 mov SI, [esp + 28] // key material
100 mov KSZ, [esp + 32] // key size, in words
101 #endif
102
103 #if CPUFAM_AMD64 && ABI_SYSV
104 // Arguments are in registers. We have plenty, but, to be honest,
105 // the initial register allocation is a bit annoying.
106
107 # define CTX r8 // context pointer
108 # define BLKSZ r9d // block size
109
110 # define SI rsi // source pointer
111 # define DI rdi // destination pointer
112
113 # define KSZ edx // key size
114 # define KSZo rdx // ... as address offset
115 # define NKW r10d // total number of key words
116 # define RCON rdi // round constants table
117 # define LIMn ecx // limit, as integer offset from base
118 # define LIM rcx // ... as pointer
119 # define CYIX r11d // index in shift-register cycle
120
121 # define NR ecx // number of rounds
122 # define LRK eax // distance to last key
123 # define LRKo rax // ... as address offset
124 # define BLKOFF r9d // block size in bytes
125 # define BLKOFFo r9 // ... as address offset
126
127 // Move arguments to more useful places.
128 mov CTX, rdi // context base pointer
129 mov BLKSZ, esi // block size in words
130 mov SI, rdx // key material
131 mov KSZ, ecx // key size, in words
132 #endif
133
134 #if CPUFAM_AMD64 && ABI_WIN
135 // Arguments are in different registers, and they're a little tight.
136
137 # define CTX r8 // context pointer
138 # define BLKSZ edx // block size
139
140 # define SI rsi // source pointer
141 # define DI rdi // destination pointer
142
143 # define KSZ r9d // key size
144 # define KSZo r9 // ... as address offset
145 # define NKW r10d // total number of key words
146 # define RCON rdi // round constants table
147 # define LIMn ecx // limit, as integer offset from base
148 # define LIM rcx // ... as pointer
149 # define CYIX r11d // index in shift-register cycle
150
151 # define NR ecx // number of rounds
152 # define LRK eax // distance to last key
153 # define LRKo rax // ... as address offset
154 # define BLKOFF edx // block size in bytes
155 # define BLKOFFo rdx // ... as address offset
156
157 // We'll need the index registers, which belong to the caller in this
158 // ABI.
159 push rsi
160 .seh_pushreg rsi
161 push rdi
162 .seh_pushreg rdi
163 .seh_endprologue
164
165 // Move arguments to more useful places.
166 mov SI, r8 // key material
167 mov CTX, rcx // context base pointer
168 #endif
169
170 // The initial round key material is taken directly from the input
171 // key, so copy it over.
172 #if CPUFAM_AMD64 && ABI_SYSV
173 // We've been lucky. We already have a copy of the context pointer
174 // in rdi, and the key size in ecx.
175 add DI, w
176 #else
177 lea DI, [CTX + w]
178 mov ecx, KSZ
179 #endif
180 rep movsd
181
182 // Find out other useful things.
183 mov NKW, [CTX + nr] // number of rounds
184 add NKW, 1
185 imul NKW, BLKSZ // total key size in words
186 #if !NKW_NEEDS_REFRESH
187 // If we can't keep NKW for later, then we use the same register for
188 // it and LIM, so this move is unnecessary.
189 mov LIMn, NKW
190 #endif
191 sub LIMn, KSZ // offset by the key size
192
193 // Find the round constants.
194 ldgot ecx
195 leaext RCON, F(rijndael_rcon), ecx
196
197 // Prepare for the main loop.
198 lea SI, [CTX + w]
199 mov eax, [SI + 4*KSZo - 4] // most recent key word
200 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
201 xor CYIX, CYIX // start of new cycle
202
203 // Main key expansion loop. The first word of each key-length chunk
204 // needs special treatment.
205 //
206 // This is rather tedious because the Intel `AESKEYGENASSIST'
207 // instruction is very strangely shaped. Firstly, it wants to
208 // operate on vast SSE registers, even though we're data-blocked from
209 // doing more than one operation at a time unless we're doing two key
210 // schedules simultaneously -- and even then we can't do more than
211 // two, because the instruction ignores two of its input words
212 // entirely, and produces two different outputs for each of the other
213 // two. And secondly it insists on taking the magic round constant
214 // as an immediate, so it's kind of annoying if you're not
215 // open-coding the whole thing. It's much easier to leave that as
216 // zero and XOR in the round constant by hand.
217 0: cmp CYIX, 0 // first word of the cycle?
218 je 1f
219 cmp CYIX, 4 // fourth word of the cycle?
220 jne 2f
221 cmp KSZ, 7 // and a large key?
222 jb 2f
223
224 // Fourth word of the cycle, and seven or eight words of key. Do a
225 // byte substitution.
226 movd xmm0, eax
227 pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
228 aeskeygenassist xmm1, xmm0, 0
229 movd eax, xmm1
230 jmp 2f
231
232 // First word of the cycle. This is the complicated piece.
233 1: movd xmm0, eax
234 pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
235 aeskeygenassist xmm1, xmm0, 0
236 pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
237 movd eax, xmm1
238 xor al, [RCON]
239 inc RCON
240
241 // Common tail. Mix in the corresponding word from the previous
242 // cycle and prepare for the next loop.
243 2: xor eax, [SI]
244 mov [SI + 4*KSZo], eax
245 add SI, 4
246 inc CYIX
247 cmp SI, LIM
248 jae 9f
249 cmp CYIX, KSZ
250 jb 0b
251 xor CYIX, CYIX
252 jmp 0b
253
254 // Next job is to construct the decryption keys. The keys for the
255 // first and last rounds don't need to be mangled, but the remaining
256 // ones do -- and they all need to be reordered too.
257 //
258 // The plan of action, then, is to copy the final encryption round's
259 // keys into place first, then to do each of the intermediate rounds
260 // in reverse order, and finally do the first round.
261 //
262 // Do all of the heavy lifting with SSE registers. The order we're
263 // doing this in means that it's OK if we read or write too much, and
264 // there's easily enough buffer space for the over-enthusiastic reads
265 // and writes because the context has space for 32-byte blocks, which
266 // is our maximum and an exact fit for two SSE registers.
267 9: mov NR, [CTX + nr] // number of rounds
268 #if NKW_NEEDS_REFRESH
269 mov BLKOFF, BLKSZ
270 mov LRK, NR
271 imul LRK, BLKOFF
272 #else
273 // If we retain NKW, then BLKSZ and BLKOFF are the same register
274 // because we won't need the former again.
275 mov LRK, NKW
276 sub LRK, BLKSZ
277 #endif
278 lea DI, [CTX + wi]
279 lea SI, [CTX + w + 4*LRKo] // last round's keys
280 shl BLKOFF, 2 // block size (in bytes now)
281
282 // Copy the last encryption round's keys.
283 movdqu xmm0, [SI]
284 movdqu [DI], xmm0
285 cmp BLKOFF, 16
286 jbe 0f
287 movdqu xmm0, [SI + 16]
288 movdqu [DI + 16], xmm0
289
290 // Update the loop variables and stop if we've finished.
291 0: add DI, BLKOFFo
292 sub SI, BLKOFFo
293 sub NR, 1
294 jbe 9f
295
296 // Do another middle round's keys...
297 movdqu xmm0, [SI]
298 aesimc xmm0, xmm0
299 movdqu [DI], xmm0
300 cmp BLKOFF, 16
301 jbe 0b
302 movdqu xmm0, [SI + 16]
303 aesimc xmm0, xmm0
304 movdqu [DI + 16], xmm0
305 jmp 0b
306
307 // Finally do the first encryption round.
308 9: movdqu xmm0, [SI]
309 movdqu [DI], xmm0
310 cmp BLKOFF, 16
311 jbe 1f
312 movdqu xmm0, [SI + 16]
313 movdqu [DI + 16], xmm0
314
315 // If the block size is not exactly four words then we must end-swap
316 // everything. We can use fancy SSE toys for this.
317 1: cmp BLKOFF, 16
318 je 9f
319
320 // Find the byte-reordering table.
321 ldgot ecx
322 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
323
324 #if NKW_NEEDS_REFRESH
325 // Calculate the number of subkey words again. (It's a good job
326 // we've got a fast multiplier.)
327 mov NKW, [CTX + nr]
328 add NKW, 1
329 imul NKW, BLKSZ
330 #endif
331
332 // End-swap the encryption keys.
333 lea SI, [CTX + w]
334 call endswap_block
335
336 // And the decryption keys.
337 lea SI, [CTX + wi]
338 call endswap_block
339
340 9: // All done.
341 #if CPUFAM_X86
342 pop edi
343 pop esi
344 pop ebx
345 pop ebp
346 #endif
347 #if CPUFAM_AMD64 && ABI_WIN
348 pop rdi
349 pop rsi
350 #endif
351 ret
352
353 .align 16
354 endswap_block:
355 // End-swap NKW words starting at SI. The end-swapping table is
356 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
357 mov ecx, NKW
358 0: movdqu xmm1, [SI]
359 pshufb xmm1, xmm5
360 movdqu [SI], xmm1
361 add SI, 16
362 sub ecx, 4
363 ja 0b
364 ret
365
366 #undef CTX
367 #undef BLKSZ
368 #undef SI
369 #undef DI
370 #undef KSZ
371 #undef KSZo
372 #undef RCON
373 #undef LIMn
374 #undef LIM
375 #undef NR
376 #undef LRK
377 #undef LRKo
378 #undef BLKOFF
379 #undef BLKOFFo
380
381 ENDFUNC
382
383 ///--------------------------------------------------------------------------
384 /// Encrypting and decrypting blocks.
385
386 .macro encdec op, aes, koff // op: entry suffix; aes: round insn; koff: key offset
387 FUNC(rijndael_\op\()_x86ish_aesni)
388
389 #if CPUFAM_X86
390 // Arguments come in on the stack, and need to be collected. Note
391 // that SRC and DST share edx: DST is loaded only once SRC is dead.
392
393 # define K eax
394 # define SRC edx // shares edx with DST
395 # define DST edx // ... loaded just before the final store
396 # define NR ecx
397
398 mov K, [esp + 4]
399 mov SRC, [esp + 8]
400 #endif
401
402 #if CPUFAM_AMD64 && ABI_SYSV
403 // Arguments come in registers. All is good.
404
405 # define K rdi
406 # define SRC rsi
407 # define DST rdx
408 # define NR eax
409 #endif
410
411 #if CPUFAM_AMD64 && ABI_WIN
412 // Arguments come in different registers.
413
414 # define K rcx
415 # define SRC rdx
416 # define DST r8
417 # define NR eax
418 .seh_endprologue
419 #endif
420
421 // Find the magic endianness-swapping table.
422 ldgot ecx
423 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
424
425 // Initial setup.
426 movdqu xmm0, [SRC]
427 pshufb xmm0, xmm5
428 mov NR, [K + nr]
429 add K, \koff
430
431 // Initial whitening.
432 movdqu xmm1, [K]
433 add K, 16
434 pxor xmm0, xmm1
435
436 // Dispatch to the correct code.
437 cmp NR, 10
438 je 10f
439 jb bogus
440 cmp NR, 14
441 je 14f
442 ja bogus
443 cmp NR, 12
444 je 12f
445 jb 11f
446 jmp 13f
447
448 .align 2
449
450 // 14 rounds...
451 14: movdqu xmm1, [K]
452 add K, 16
453 \aes xmm0, xmm1
454
455 // 13 rounds...
456 13: movdqu xmm1, [K]
457 add K, 16
458 \aes xmm0, xmm1
459
460 // 12 rounds...
461 12: movdqu xmm1, [K]
462 add K, 16
463 \aes xmm0, xmm1
464
465 // 11 rounds...
466 11: movdqu xmm1, [K]
467 add K, 16
468 \aes xmm0, xmm1
469
470 // 10 rounds...
471 10: movdqu xmm1, [K]
472 \aes xmm0, xmm1
473
474 // 9 rounds...
475 movdqu xmm1, [K + 16]
476 \aes xmm0, xmm1
477
478 // 8 rounds...
479 movdqu xmm1, [K + 32]
480 \aes xmm0, xmm1
481
482 // 7 rounds...
483 movdqu xmm1, [K + 48]
484 \aes xmm0, xmm1
485
486 // 6 rounds...
487 movdqu xmm1, [K + 64]
488 \aes xmm0, xmm1
489
490 // 5 rounds...
491 movdqu xmm1, [K + 80]
492 \aes xmm0, xmm1
493
494 // 4 rounds...
495 movdqu xmm1, [K + 96]
496 \aes xmm0, xmm1
497
498 // 3 rounds...
499 movdqu xmm1, [K + 112]
500 \aes xmm0, xmm1
501
502 // 2 rounds...
503 movdqu xmm1, [K + 128]
504 \aes xmm0, xmm1
505
506 // Final round...
507 movdqu xmm1, [K + 144]
508 \aes\()last xmm0, xmm1
509
510 // Unpermute the output block and store it.
511 pshufb xmm0, xmm5
512 #if CPUFAM_X86
513 mov DST, [esp + 12] // recover DST now that SRC is no longer needed
514 #endif
515 movdqu [DST], xmm0
516
517 // And we're done.
518 ret
519
520 #undef K
521 #undef SRC
522 #undef DST
523 #undef NR
524
525 ENDFUNC
526 .endm
527
528 encdec eblk, aesenc, w // encryption: forward round keys at `w'
529 encdec dblk, aesdec, wi // decryption: inverse round keys at `wi'
530
531 ///--------------------------------------------------------------------------
532 /// Random utilities.
533
534 .align 16
535 // Abort the process because of a programming error. Indirecting
536 // through this point serves several purposes: (a) by CALLing, rather
537 // than branching to, `abort', we can save the return address, which
538 // might at least provide a hint as to what went wrong; (b) we don't
539 // have conditional CALLs (and they'd be big anyway); and (c) we can
540 // write a HLT here as a backstop against `abort' being mad.
541 bogus: callext F(abort) // should not return
542 0: hlt // ... but trap hard if it somehow does
543 jmp 0b // ... and keep trapping
544
545 ///--------------------------------------------------------------------------
546 /// Data tables.
547
548 .align 16
549 endswap_tab: // PSHUFB mask: reverse the bytes within each 32-bit word
550 .byte 3, 2, 1, 0 // word 0
551 .byte 7, 6, 5, 4 // word 1
552 .byte 11, 10, 9, 8 // word 2
553 .byte 15, 14, 13, 12 // word 3
554
555 ///----- That's all, folks --------------------------------------------------