/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_x86ish_avx)
        .arch   .avx
        vzeroupper
        endprologue
        // drop through...
ENDFUNC
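
/// The AVX entry point above only needs to clear the upper halves of the
/// YMM registers (avoiding the SSE/AVX transition penalty on some CPUs)
/// before dropping through into the SSE2 code which follows.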

        .arch   pentium4

FUNC(chacha_core_x86ish_sse2)

        // Initial setup.

#if CPUFAM_X86
        // Arguments come in on the stack, and will need to be collected.  We
        // can get away with just the scratch registers for integer work, but
        // we'll run out of XMM registers and will need some properly aligned
        // space which we'll steal from the stack.  I don't trust the stack
        // pointer's alignment, so I'll have to mask the stack pointer, which
        // in turn means I'll need to keep track of the old value.  Hence I'm
        // making a full i386-style stack frame here.
        //
        // The Windows and SysV ABIs are sufficiently similar that we don't
        // need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [SP]

        pushreg BP
        setfp
        stalloc 16
        mov     IN, [BP + 12]
        mov     OUT, [BP + 16]
        and     SP, ~15
        mov     NR, [BP + 8]
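
        // The i386 frame now looks like this:
        //
        //      [BP + 16]       output pointer
        //      [BP + 12]       input pointer
        //      [BP +  8]       number of rounds
        //      [BP +  4]       return address
        //      [BP +  0]       caller's BP
        //        ...
        //      [SP + 0 .. 15]  16-byte-aligned scratch used for SAVE3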
#endif

#if CPUFAM_AMD64 && ABI_SYSV
        // This is nice.  We have plenty of XMM registers, and the arguments
        // are in useful places.  There's no need to spill anything and we
        // can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
        // Arguments come in registers, but they're different between Windows
        // and everyone else (and everyone else is saner).
        //
        // The Windows ABI insists that we preserve some of the XMM
        // registers, but we want more than we can use as scratch space.  We
        // only need to save a copy of the input for the feedforward at the
        // end, so we might as well use memory rather than spill extra
        // registers.  (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [SP + 0]
# define SAVE2 [SP + 16]
# define SAVE3 [SP + 32]

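        // On entry the return address leaves SP only 8-byte aligned, so 48
        // bytes of save area plus 8 bytes of padding brings the three XMM
        // save slots back to 16-byte alignment.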
        stalloc 48 + 8
#endif

        endprologue

        // First job is to slurp the matrix into XMM registers.  Be careful:
        // the input matrix isn't likely to be properly aligned.
        //
        //      [ 0  1  2  3] (a, xmm0)
        //      [ 4  5  6  7] (b, xmm1)
        //      [ 8  9 10 11] (c, xmm2)
        //      [12 13 14 15] (d, xmm3)
        movdqu  xmm0, [IN + 0]
        movdqu  xmm1, [IN + 16]
        movdqu  xmm2, [IN + 32]
        movdqu  xmm3, [IN + 48]

        // Take a copy for later.  This one is aligned properly, by
        // construction.
        movdqa  SAVE0, xmm0
        movdqa  SAVE1, xmm1
        movdqa  SAVE2, xmm2
        movdqa  SAVE3, xmm3

0:
        // Apply a column quarterround to each of the columns simultaneously.
        // Alas, there doesn't seem to be a packed doubleword rotate, so we
        // have to synthesize it.

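        // Each rotation is synthesized from two shifts and an OR: roughly,
        // for each 32-bit lane,
        //
        //      t = x >> (32 - n);  x = (x << n) | t;
        //
        // with a spare XMM register playing the part of t.
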
        // a += b; d ^= a; d <<<= 16
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 16
        psrld   xmm4, 16
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 12
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm4, xmm1
        pslld   xmm1, 12
        psrld   xmm4, 20
        por     xmm1, xmm4

        // a += b; d ^= a; d <<<= 8
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 8
        psrld   xmm4, 24
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 7
        paddd   xmm2, xmm3
        pshufd  xmm3, xmm3, SHUF(3, 0, 1, 2)
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)
        movdqa  xmm4, xmm1
        pslld   xmm1, 7
        psrld   xmm4, 25
        por     xmm1, xmm4

        // The not-quite-transpose conveniently only involves reordering
        // elements of individual rows, which can be done quite easily.  It
        // doesn't involve any movement of elements between rows, or even
        // renaming of the rows.
        //
        //      [ 0  1  2  3]           [ 0  1  2  3] (a, xmm0)
        //      [ 4  5  6  7]    -->    [ 5  6  7  4] (b, xmm1)
        //      [ 8  9 10 11]           [10 11  8  9] (c, xmm2)
        //      [12 13 14 15]           [15 12 13 14] (d, xmm3)
        //
        // The shuffles have quite high latency, so they've mostly been
        // pushed upwards.  The remaining one can't be moved, though.
        pshufd  xmm1, xmm1, SHUF(1, 2, 3, 0)

        // Apply the diagonal quarterround to each of the diagonals
        // simultaneously.  (Thanks to the rotation above, the diagonals are
        // now lined up in the columns.)

        // a += b; d ^= a; d <<<= 16
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 16
        psrld   xmm4, 16
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 12
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm4, xmm1
        pslld   xmm1, 12
        psrld   xmm4, 20
        por     xmm1, xmm4

        // a += b; d ^= a; d <<<= 8
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm4, xmm3
        pslld   xmm3, 8
        psrld   xmm4, 24
        por     xmm3, xmm4

        // c += d; b ^= c; b <<<= 7
        paddd   xmm2, xmm3
        pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)
        movdqa  xmm4, xmm1
        pslld   xmm1, 7
        psrld   xmm4, 25
        por     xmm1, xmm4

        // Finally, finish off undoing the transpose, and we're done for this
        // doubleround.  Again, most of this was done above so we don't have
        // to wait for the shuffles.
        pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)

        // Decrement the loop counter and see if we should go round again.
        sub     NR, 2
        ja      0b
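
        // NR counts rounds: each pass through the loop is one doubleround
        // (a column round followed by a diagonal round), so ChaCha20, with
        // its twenty rounds, makes ten trips.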

        // Almost there.  Firstly, the feedforward addition.
        paddd   xmm0, SAVE0
        paddd   xmm1, SAVE1
        paddd   xmm2, SAVE2
        paddd   xmm3, SAVE3

        // And now we write out the result.  This one won't be aligned
        // either.
        movdqu  [OUT + 0], xmm0
        movdqu  [OUT + 16], xmm1
        movdqu  [OUT + 32], xmm2
        movdqu  [OUT + 48], xmm3

        // Tidy things up.
#if CPUFAM_X86
        dropfp
        popreg  BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stfree  48 + 8
#endif

        // And with that, we're done.
        ret

ENDFUNC

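/// For reference, the argument handling above corresponds roughly to a C
/// prototype along these lines (the names are illustrative, not taken from
/// Catacomb's headers):
///
///     void chacha_core_x86ish_sse2(unsigned nrounds,
///                                  const uint32_t src[16],
///                                  uint32_t dst[16]);
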
FUNC(chacha_multi_i386_sse2)
        // Arguments are on the stack:
        //
        //      [sp +  4]       pointer to state
        //      [sp +  8]       input pointer (or null)
        //      [sp + 12]       output pointer
        //      [sp + 16]       number of blocks to process
        //      [sp + 20]       number of rounds per block

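        // In C terms this is roughly (the names here are illustrative, not
        // taken from Catacomb's headers):
        //
        //      void chacha_multi_i386_sse2(void *state, const void *src,
        //                                  void *dst, size_t nblocks,
        //                                  unsigned nrounds);
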
        pushreg SI
        pushreg DI
        pushreg BX
        stalloc 4*64
        endprologue

        // Load the arguments.
        mov     BX, [SP + 272]          // = state pointer
        mov     SI, [SP + 276]          // = source pointer
        mov     DI, [SP + 280]          // = destination pointer
        mov     CX, [SP + 284]          // = block count
        mov     DX, [SP + 288]          // = (initial) round count
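        // (These offsets are the [sp + 4] ... [sp + 20] slots above, pushed
        // further out by the three saved registers and the 256 bytes of
        // scratch: 4 + 12 + 256 = 272, and so on in steps of four.)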

        // Do chunks of four blocks at a time.
        sub     CX, 4
        jb      8f

        // Inhale the initial state.
        movdqu  xmm1, [BX + 0]
        movdqu  xmm3, [BX + 16]
        movdqu  xmm5, [BX + 32]
        movdqu  xmm0, [BX + 48]

        // Set the counters and initialize the working blocks.
        pxor    xmm2, xmm2
        pxor    xmm4, xmm4
        pxor    xmm6, xmm6
        pxor    xmm7, xmm7

        xor     eax, eax
        mov     al, 1
        pinsrw  xmm2, eax, 4
        mov     al, 2
        pinsrw  xmm4, eax, 4
        mov     al, 3
        pinsrw  xmm6, eax, 4
        mov     al, 4
        pinsrw  xmm7, eax, 4

        movdqa  [SP + 16], xmm3
        movdqa  [SP + 32], xmm5
        movdqa  [SP + 48], xmm0

        paddq   xmm2, xmm0              // d + 1 (xmm0 holds the d row)
        paddq   xmm4, xmm0              // d + 2
        paddq   xmm6, xmm0              // d + 3
        paddq   xmm7, xmm0              // d + 4, written back below

        movdqu  [BX + 48], xmm7

        // a += b; d ^= a; d <<<= 16
        paddd   xmm1, xmm3              // a += b

        movdqa  [SP + 0], xmm1

        pxor    xmm0, xmm1              // d ^= a
        pxor    xmm2, xmm1
        pxor    xmm4, xmm1
        pxor    xmm6, xmm1

        movdqa  xmm1, xmm0
        movdqa  xmm3, xmm2
        movdqa  xmm5, xmm4
        movdqa  xmm7, xmm6

        pslld   xmm0, 16                // d << 16
        pslld   xmm2, 16
        pslld   xmm4, 16
        pslld   xmm6, 16

        psrld   xmm1, 16                // d >> 16
        psrld   xmm3, 16
        psrld   xmm5, 16
        psrld   xmm7, 16

        por     xmm0, xmm1              // d <<<= 16
        movdqa  xmm1, [SP + 32]         // reload the c row
        por     xmm2, xmm3
        movdqa  xmm3, [SP + 16]         // reload the b row
        por     xmm4, xmm5
        por     xmm6, xmm7

        movdqa  [SP + 48], xmm0
        movdqa  [SP + 112], xmm2
        movdqa  [SP + 176], xmm4
        movdqa  [SP + 240], xmm6

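        // The 4*64 bytes of scratch hold one 64-byte working block per
        // chunk member: block k's rows live at [SP + 64*k], 16 bytes per
        // row.  Block 0's slots double as the parking space for the shared
        // rows loaded above.
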
        // c += d; b ^= c; b <<<= 12
        paddd   xmm0, xmm1              // c += d
        paddd   xmm2, xmm1
        paddd   xmm4, xmm1
        paddd   xmm6, xmm1

        movdqa  [SP + 32], xmm0         // stash the new c rows
        movdqa  [SP + 96], xmm2
        movdqa  [SP + 160], xmm4
        movdqa  [SP + 224], xmm6

        pxor    xmm0, xmm3              // b ^= c
        pxor    xmm2, xmm3
        pxor    xmm4, xmm3
        pxor    xmm6, xmm3

        movdqa  xmm1, xmm0
        movdqa  xmm3, xmm2
        movdqa  xmm5, xmm4
        movdqa  xmm7, xmm6

        pslld   xmm0, 12                // b << 12
        pslld   xmm2, 12
        pslld   xmm4, 12
        pslld   xmm6, 12

        psrld   xmm1, 20                // b >> 20
        psrld   xmm3, 20
        psrld   xmm5, 20
        psrld   xmm7, 20

        por     xmm0, xmm1              // b <<<= 12
        por     xmm2, xmm3
        por     xmm4, xmm5
        por     xmm6, xmm7

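        // (Work in progress: the rest of the doubleround, the feedforward
        // and the output stage for the four-block path are still to be
        // written, as is the `8:' tail for fewer than four blocks.)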
ENDFUNC

///----- That's all, folks --------------------------------------------------