mdw@git.distorted.org.uk Git - catacomb/blob - symm/rijndael-x86-aesni.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// AESNI-based implementation of Rijndael
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// External definitions.
  35
  36         .globl  F(abort)
  37         .globl  F(rijndael_rcon)
  38
  39 ///--------------------------------------------------------------------------
  40 /// Main code.
  41
  42         .arch   .aes                    // let the assembler accept AES-NI
  43         .section .text                  // everything below goes in `.text'
  44
  45 /// The AESNI instructions implement a little-endian version of AES, but
  46 /// Catacomb's internal interface presents as big-endian so as to work better
  47 /// with things like GCM.  We therefore maintain the round keys in
  48 /// little-endian form, and have to end-swap blocks in and out.
  49 ///
  50 /// For added amusement, the AESNI instructions don't implement the
  51 /// larger-block versions of Rijndael, so we have to end-swap the keys if
  52 /// we're preparing for one of those.
  53
  54         // Useful constants.  These size the key-schedule buffers below.
  55         .equ    maxrounds, 16           // maximum number of rounds
  56         .equ    maxblksz, 32            // maximum block size, in bytes
  57         .equ    kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes
  58
  59         // Context structure: byte offsets of the fields used in this file.
  60         .equ    nr, 0                   // number of rounds (one 32-bit word)
  61         .equ    w, nr + 4               // encryption key words (kbufsz bytes)
  62         .equ    wi, w + kbufsz          // decryption key words
  63
  64 ///--------------------------------------------------------------------------
  65 /// Key setup.
  66
  67 FUNC(rijndael_setup_x86_aesni)
  68
  69         // Initial state.  We have four arguments:
  70         // [esp + 20] is the context pointer
  71         // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
  72         // [esp + 28] points to the key material, unaligned
  73         // [esp + 32] is the size of the key, in words
  74         // The key size has already been checked for validity, and the number
  75         // of rounds has been computed.  Our job is only to fill in the `w'
  76         // and `wi' vectors.
  77
  78         push    ebp                     // save the registers we work in
  79         push    ebx
  80         push    esi
  81         push    edi                     // arguments now start at [esp + 20]
  82
  83         // The initial round key material is taken directly from the input
  84         // key, so copy it over.
  85         mov     ebp, [esp + 20]         // context base pointer
  86         mov     ebx, [esp + 32]         // key size, in words
  87         mov     ecx, ebx                // word count for the copy
  88         mov     esi, [esp + 28]         // source: the key material
  89         lea     edi, [ebp + w]          // destination: start of `w'
  90         rep     movsd                   // copy ECX words
  91
  92         // Find out other useful things.
  93         mov     edx, [ebp + nr]         // number of rounds
  94         add     edx, 1                  // one more round key than rounds
  95         imul    edx, [esp + 24]         // total key size in words
  96         sub     edx, ebx                // offset by the key size
  97
  98         // Find the round constants.
  99         ldgot   ecx
 100         leaext  ecx, rijndael_rcon, ecx // ECX -> next round-constant byte
 101
 102         // Prepare for the main loop.
 103         lea     esi, [ebp + w]          // ESI trails the output by EBX words
 104         mov     eax, [esi + 4*ebx - 4]  // most recent key word
 105         lea     edx, [esi + 4*edx]      // limit, offset by one key expansion
 106
 107         // Main key expansion loop.  The first word of each key-length chunk
 108         // needs special treatment.
 109         //
 110         // This is rather tedious because the Intel `AESKEYGENASSIST'
 111         // instruction is very strangely shaped.  Firstly, it wants to
 112         // operate on vast SSE registers, even though we're data-blocked from
 113         // doing more than one operation at a time unless we're doing two key
 114         // schedules simultaneously -- and even then we can't do more than
 115         // two, because the instruction ignores two of its input words
 116         // entirely, and produces two different outputs for each of the other
 117         // two.  And secondly it insists on taking the magic round constant
 118         // as an immediate, so it's kind of annoying if you're not
 119         // open-coding the whole thing.  It's much easier to leave that as
 120         // zero and XOR in the round constant by hand.
 121 9:      movd    xmm0, eax               // previous key word into lane 0
 122         pshufd  xmm0, xmm0, 0x39        // rotate lanes: the word is now in lane 3
 123         aeskeygenassist xmm1, xmm0, 0   // lane 3 := RotWord(SubWord(lane 3))
 124         pshufd  xmm1, xmm1, 0x93        // rotate lanes: result back in lane 0
 125         movd    eax, xmm1
 126         xor     eax, [esi]              // combine with the word one key back
 127         xor     al, [ecx]               // fold in the round constant by hand
 128         inc     ecx                     // step to the next round constant
 129         mov     [esi + 4*ebx], eax      // store the new key word
 130         add     esi, 4
 131         cmp     esi, edx
 132         jae     8f                      // reached the limit: go and build `wi'
 133
 134         // The next three words are simple...
 135         xor     eax, [esi]
 136         mov     [esi + 4*ebx], eax
 137         add     esi, 4
 138         cmp     esi, edx
 139         jae     8f
 140
 141         // (Word 2...)
 142         xor     eax, [esi]
 143         mov     [esi + 4*ebx], eax
 144         add     esi, 4
 145         cmp     esi, edx
 146         jae     8f
 147
 148         // (Word 3...)
 149         xor     eax, [esi]
 150         mov     [esi + 4*ebx], eax
 151         add     esi, 4
 152         cmp     esi, edx
 153         jae     8f
 154
 155         // Word 4.  If the key is /more/ than 6 words long, then we must
 156         // apply a substitution here.
 157         cmp     ebx, 5
 158         jb      9b                      // key under 5 words: chunk finished
 159         cmp     ebx, 7
 160         jb      0f                      // key under 7 words: plain XOR only
 161         movd    xmm0, eax
 162         pshufd  xmm0, xmm0, 0x93        // rotate lanes: the word is now in lane 1
 163         aeskeygenassist xmm1, xmm0, 0   // lane 0 := SubWord(lane 1)
 164         movd    eax, xmm1
 165 0:      xor     eax, [esi]
 166         mov     [esi + 4*ebx], eax
 167         add     esi, 4
 168         cmp     esi, edx
 169         jae     8f
 170
 171         // (Word 5...)
 172         cmp     ebx, 6
 173         jb      9b                      // key under 6 words: chunk finished
 174         xor     eax, [esi]
 175         mov     [esi + 4*ebx], eax
 176         add     esi, 4
 177         cmp     esi, edx
 178         jae     8f
 179
 180         // (Word 6...)
 181         cmp     ebx, 7
 182         jb      9b                      // key under 7 words: chunk finished
 183         xor     eax, [esi]
 184         mov     [esi + 4*ebx], eax
 185         add     esi, 4
 186         cmp     esi, edx
 187         jae     8f
 188
 189         // (Word 7...)
 190         cmp     ebx, 8
 191         jb      9b                      // key under 8 words: chunk finished
 192         xor     eax, [esi]
 193         mov     [esi + 4*ebx], eax
 194         add     esi, 4
 195         cmp     esi, edx
 196         jae     8f
 197
 198         // Must be done by now.
 199         jmp     9b                      // start the next key-length chunk
 200
 201         // Next job is to construct the decryption keys.  The keys for the
 202         // first and last rounds don't need to be mangled, but the remaining
 203         // ones do -- and they all need to be reordered too.
 204         //
 205         // The plan of action, then, is to copy the final encryption round's
 206         // keys into place first, then to do each of the intermediate rounds
 207         // in reverse order, and finally do the first round.
 208         //
 209         // Do all of the heavy lifting with SSE registers.  The order we're
 210         // doing this in means that it's OK if we read or write too much, and
 211         // there's easily enough buffer space for the over-enthusiastic reads
 212         // and writes because the context has space for 32-byte blocks, which
 213         // is our maximum and an exact fit for two SSE registers.
 214 8:      mov     ecx, [ebp + nr]         // number of rounds
 215         mov     ebx, [esp + 24]         // block size (in words)
 216         mov     edx, ecx
 217         imul    edx, ebx                // EDX := rounds * block size in words
 218         lea     edi, [ebp + wi]         // destination: start of `wi'
 219         lea     esi, [ebp + 4*edx + w]  // last round's keys
 220         shl     ebx, 2                  // block size (in bytes now)
 221
 222         // Copy the last encryption round's keys.
 223         movdqu  xmm0, [esi]
 224         movdqu  [edi], xmm0
 225         cmp     ebx, 16
 226         jbe     9f                      // 16-byte block: one register is enough
 227         movdqu  xmm0, [esi + 16]
 228         movdqu  [edi + 16], xmm0
 229
 230         // Update the loop variables and stop if we've finished.
 231 9:      add     edi, ebx                // forwards through `wi'
 232         sub     esi, ebx                // backwards through `w'
 233         sub     ecx, 1
 234         jbe     0f                      // only the first round's keys remain
 235
 236         // Do another middle round's keys...
 237         movdqu  xmm0, [esi]
 238         aesimc  xmm0, xmm0              // InvMixColumns on the round key
 239         movdqu  [edi], xmm0
 240         cmp     ebx, 16
 241         jbe     9b
 242         movdqu  xmm0, [esi + 16]
 243         aesimc  xmm0, xmm0
 244         movdqu  [edi + 16], xmm0
 245         jmp     9b
 246
 247         // Finally do the first encryption round.
 248 0:      movdqu  xmm0, [esi]
 249         movdqu  [edi], xmm0
 250         cmp     ebx, 16
 251         jbe     0f
 252         movdqu  xmm0, [esi + 16]
 253         movdqu  [edi + 16], xmm0
 254
 255         // If the block size is not exactly four words then we must end-swap
 256         // everything.  We can use fancy SSE toys for this.
 257 0:      cmp     ebx, 16
 258         je      0f                      // 16-byte blocks: keys stay as they are
 259
 260         // Find the byte-reordering table.
 261         ldgot   ecx
 262         movdqa  xmm7, [INTADDR(endswap_tab, ecx)] // XMM7 := end-swap mask
 263
 264         // Calculate the number of subkey words again.  (It's a good job
 265         // we've got a fast multiplier.)
 266         mov     ecx, [ebp + nr]
 267         add     ecx, 1
 268         imul    ecx, [esp + 24]         // total keys in words
 269
 270         // End-swap the encryption keys.
 271         mov     eax, ecx                // keep the count for the second call
 272         lea     esi, [ebp + w]
 273         call    endswap_block
 274
 275         // And the decryption keys.
 276         mov     ecx, eax                // `endswap_block' consumed ECX
 277         lea     esi, [ebp + wi]
 278         call    endswap_block
 279
 280         // All done.
 281 0:      pop     edi                     // restore in reverse order of saving
 282         pop     esi
 283         pop     ebx
 284         pop     ebp
 285         ret
 286
 287         .align  16
 288 endswap_block:
 289         // End-swap ECX words starting at ESI, 16 bytes at a time, using the
 290         // table already loaded into XMM7.  Clobbers ECX, ESI, XMM1, and flags.
 291         movdqu  xmm1, [esi]
 292         pshufb  xmm1, xmm7              // reverse the bytes within each word
 293         movdqu  [esi], xmm1
 294         add     esi, 16
 295         sub     ecx, 4                  // four words done (may overshoot)
 296         ja      endswap_block           // loop while words remain
 297         ret
 298
 299 ENDFUNC
 300
 301 ///--------------------------------------------------------------------------
 302 /// Encrypting and decrypting blocks.
 303
 304 FUNC(rijndael_eblk_x86_aesni)
 305         // Encrypt one 16-byte block using the round keys at `w' (10--14 rounds).
 306         // On entry, we have:
 307         // [esp +  4] points to the context block
 308         // [esp +  8] points to the input data block
 309         // [esp + 12] points to the output buffer
 310
 311         // Find the magic endianness-swapping table.
 312         ldgot   ecx
 313         movdqa  xmm7, [INTADDR(endswap_tab, ecx)] // XMM7 := end-swap mask
 314
 315         // Load the input block and end-swap it.  Also, start loading the
 316         // keys.
 317         mov     eax, [esp + 8]          // input block pointer
 318         movdqu  xmm0, [eax]             // unaligned load of the block
 319         pshufb  xmm0, xmm7              // reverse the bytes within each word
 320         mov     eax, [esp + 4]          // context pointer
 321         lea     edx, [eax + w]          // EDX -> first round key
 322         mov     eax, [eax + nr]         // EAX := number of rounds
 323
 324         // Initial whitening.
 325         movdqu  xmm1, [edx]
 326         add     edx, 16                 // EDX -> key for the first full round
 327         pxor    xmm0, xmm1
 328
 329         // Dispatch to the correct code.
 330         cmp     eax, 10
 331         je      er10
 332         jb      bogus                   // fewer than 10 rounds: abort
 333         cmp     eax, 14
 334         je      er14
 335         ja      bogus                   // more than 14 rounds: abort
 336         cmp     eax, 12
 337         je      er12
 338         jb      er11                    // between 10 and 12: must be 11
 339         jmp     er13                    // between 12 and 14: must be 13
 340
 341         .align  2
 342         // Entry ladder: each label does one round and falls into the next.
 343         // 14 rounds...
 344 er14:   movdqu  xmm1, [edx]
 345         add     edx, 16
 346         aesenc  xmm0, xmm1
 347
 348         // 13 rounds...
 349 er13:   movdqu  xmm1, [edx]
 350         add     edx, 16
 351         aesenc  xmm0, xmm1
 352
 353         // 12 rounds...
 354 er12:   movdqu  xmm1, [edx]
 355         add     edx, 16
 356         aesenc  xmm0, xmm1
 357
 358         // 11 rounds...
 359 er11:   movdqu  xmm1, [edx]
 360         add     edx, 16
 361         aesenc  xmm0, xmm1
 362
 363         // 10 rounds...
 364 er10:   movdqu  xmm1, [edx]             // EDX stays put: fixed offsets below
 365         aesenc  xmm0, xmm1
 366
 367         // 9 rounds...
 368         movdqu  xmm1, [edx + 16]
 369         aesenc  xmm0, xmm1
 370
 371         // 8 rounds...
 372         movdqu  xmm1, [edx + 32]
 373         aesenc  xmm0, xmm1
 374
 375         // 7 rounds...
 376         movdqu  xmm1, [edx + 48]
 377         aesenc  xmm0, xmm1
 378
 379         // 6 rounds...
 380         movdqu  xmm1, [edx + 64]
 381         aesenc  xmm0, xmm1
 382
 383         // 5 rounds...
 384         movdqu  xmm1, [edx + 80]
 385         aesenc  xmm0, xmm1
 386
 387         // 4 rounds...
 388         movdqu  xmm1, [edx + 96]
 389         aesenc  xmm0, xmm1
 390
 391         // 3 rounds...
 392         movdqu  xmm1, [edx + 112]
 393         aesenc  xmm0, xmm1
 394
 395         // 2 rounds...
 396         movdqu  xmm1, [edx + 128]
 397         aesenc  xmm0, xmm1
 398
 399         // Final round...
 400         movdqu  xmm1, [edx + 144]
 401         aesenclast xmm0, xmm1           // the last round has its own instruction
 402
 403         // Unpermute the ciphertext block and store it.
 404         pshufb  xmm0, xmm7
 405         mov     eax, [esp + 12]         // output buffer pointer
 406         movdqu  [eax], xmm0             // unaligned store of the result
 407
 408         // And we're done.
 409         ret
 410
 411 ENDFUNC
 412
 413 FUNC(rijndael_dblk_x86_aesni)
 414         // Decrypt one 16-byte block using the round keys at `wi' (10--14 rounds).
 415         // On entry, we have:
 416         // [esp +  4] points to the context block
 417         // [esp +  8] points to the input data block
 418         // [esp + 12] points to the output buffer
 419
 420         // Find the magic endianness-swapping table.
 421         ldgot   ecx
 422         movdqa  xmm7, [INTADDR(endswap_tab, ecx)] // XMM7 := end-swap mask
 423
 424         // Load the input block and end-swap it.  Also, start loading the
 425         // keys.
 426         mov     eax, [esp + 8]          // input block pointer
 427         movdqu  xmm0, [eax]             // unaligned load of the block
 428         pshufb  xmm0, xmm7              // reverse the bytes within each word
 429         mov     eax, [esp + 4]          // context pointer
 430         lea     edx, [eax + wi]         // EDX -> first decryption round key
 431         mov     eax, [eax + nr]         // EAX := number of rounds
 432
 433         // Initial whitening.
 434         movdqu  xmm1, [edx]
 435         add     edx, 16                 // EDX -> key for the first full round
 436         pxor    xmm0, xmm1
 437
 438         // Dispatch to the correct code.
 439         cmp     eax, 10
 440         je      dr10
 441         jb      bogus                   // fewer than 10 rounds: abort
 442         cmp     eax, 14
 443         je      dr14
 444         ja      bogus                   // more than 14 rounds: abort
 445         cmp     eax, 12
 446         je      dr12
 447         jb      dr11                    // between 10 and 12: must be 11
 448         jmp     dr13                    // between 12 and 14: must be 13
 449
 450         .align  2
 451         // Entry ladder: each label does one round and falls into the next.
 452         // 14 rounds...
 453 dr14:   movdqu  xmm1, [edx]
 454         add     edx, 16
 455         aesdec  xmm0, xmm1
 456
 457         // 13 rounds...
 458 dr13:   movdqu  xmm1, [edx]
 459         add     edx, 16
 460         aesdec  xmm0, xmm1
 461
 462         // 12 rounds...
 463 dr12:   movdqu  xmm1, [edx]
 464         add     edx, 16
 465         aesdec  xmm0, xmm1
 466
 467         // 11 rounds...
 468 dr11:   movdqu  xmm1, [edx]
 469         add     edx, 16
 470         aesdec  xmm0, xmm1
 471
 472         // 10 rounds...
 473 dr10:   movdqu  xmm1, [edx]             // EDX stays put: fixed offsets below
 474         aesdec  xmm0, xmm1
 475
 476         // 9 rounds...
 477         movdqu  xmm1, [edx + 16]
 478         aesdec  xmm0, xmm1
 479
 480         // 8 rounds...
 481         movdqu  xmm1, [edx + 32]
 482         aesdec  xmm0, xmm1
 483
 484         // 7 rounds...
 485         movdqu  xmm1, [edx + 48]
 486         aesdec  xmm0, xmm1
 487
 488         // 6 rounds...
 489         movdqu  xmm1, [edx + 64]
 490         aesdec  xmm0, xmm1
 491
 492         // 5 rounds...
 493         movdqu  xmm1, [edx + 80]
 494         aesdec  xmm0, xmm1
 495
 496         // 4 rounds...
 497         movdqu  xmm1, [edx + 96]
 498         aesdec  xmm0, xmm1
 499
 500         // 3 rounds...
 501         movdqu  xmm1, [edx + 112]
 502         aesdec  xmm0, xmm1
 503
 504         // 2 rounds...
 505         movdqu  xmm1, [edx + 128]
 506         aesdec  xmm0, xmm1
 507
 508         // Final round...
 509         movdqu  xmm1, [edx + 144]
 510         aesdeclast xmm0, xmm1           // the last round has its own instruction
 511
 512         // Unpermute the plaintext block and store it.
 513         pshufb  xmm0, xmm7
 514         mov     eax, [esp + 12]         // output buffer pointer
 515         movdqu  [eax], xmm0             // unaligned store of the result
 516
 517         // And we're done.
 518         ret
 519
 520 ENDFUNC
 521
 522 ///--------------------------------------------------------------------------
 523 /// Random utilities.
 524
 525         .align  16
 526         // Abort the process because of a programming error.  Indirecting
 527         // through this point serves several purposes: (a) by CALLing, rather
 528         // than branching to, `abort', we can save the return address, which
 529         // might at least provide a hint as to what went wrong; (b) we don't
 530         // have conditional CALLs (and they'd be big anyway); and (c) we can
 531         // write a HLT here as a backstop against `abort' being mad.
 532 bogus:  callext F(abort)                // reached on a bad round count
 533 0:      hlt                             // backstop, should `abort' return
 534         jmp     0b                      // ... and halt again if we get past it
 535
 536         gotaux  ecx                     // NOTE(review): presumably the support code behind `ldgot ecx' -- confirm in asm-common.h
 537
 538 ///--------------------------------------------------------------------------
 539 /// Data tables.
 540
 541         .align  16                      // the table is fetched with `movdqa'
 542 endswap_tab:                            // PSHUFB mask: reverse bytes in each word
 543         .byte    3,  2,  1,  0          // word 0
 544         .byte    7,  6,  5,  4          // word 1
 545         .byte   11, 10,  9,  8          // word 2
 546         .byte   15, 14, 13, 12          // word 3
 547
 548 ///----- That's all, folks --------------------------------------------------