/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

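/// For orientation, here is a rough C sketch of what this function
/// computes.  This is an illustration only, not Catacomb's definitive
/// interface; the helper names `rotl32' and `quarterround' are invented
/// for this comment.
///
///	static uint32_t rotl32(uint32_t x, unsigned n)
///		{ return (x << n) | (x >> (32 - n)); }
///
///	static void quarterround(uint32_t *a, uint32_t *b,
///				 uint32_t *c, uint32_t *d)
///	{
///		*b ^= rotl32(*a + *d,  7); *c ^= rotl32(*b + *a,  9);
///		*d ^= rotl32(*c + *b, 13); *a ^= rotl32(*d + *c, 18);
///	}
///
/// The code below performs NR/2 double-rounds, each being four such
/// quarterrounds down the columns followed by four along the rows, and
/// finally adds the original input back on, word by word (the
/// `feedforward'), before writing the result out.
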
	.arch	pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.
	// We can get away with just the scratch registers for integer
	// work, but we'll run out of XMM registers and will need some
	// properly aligned space, which we'll steal from the stack.  I
	// don't trust the stack pointer's alignment, so I'll have to mask
	// the stack pointer, which in turn means I'll need to keep track
	// of the old value.  Hence I'm making a full i386-style stack
	// frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we
	// don't need to worry about the differences here.

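	// (The masking trick, in C terms; a sketch only:
	//
	//	sp = (void *)((uintptr_t)sp & ~(uintptr_t)15);
	//
	// i.e., round the pointer down to the next 16-byte boundary.)
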
# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the
	// arguments are in useful places.  There's no need to spill
	// anything and we can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between
	// Windows and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.
	// The final permutation needs all six volatile XMM registers, so
	// the saved copies of the input can't live there.  Two of them go
	// in xmm6 and xmm7, whose old values we must therefore preserve;
	// the other two are only needed for the feedforward at the end,
	// so they can stay in memory on the stack.  (We need an extra 8
	// bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

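	// Stack layout, as a sketch read off the code below:
	//
	//	[rsp +  0]	caller's xmm6
	//	[rsp + 16]	caller's xmm7
	//	[rsp + 32]	SAVE2
	//	[rsp + 48]	SAVE3
	//
	// plus the extra 8 bytes which keep rsp 16-byte aligned.
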
	sub	rsp, 64 + 8
	.seh_stackalloc	64 + 8
	movdqa	[rsp + 0], xmm6
	.seh_savexmm	xmm6, 0
	movdqa	[rsp + 16], xmm7
	.seh_savexmm	xmm7, 16
	.seh_endprologue
#endif

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]	-->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
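	//
	// In C terms, the caller's permutation amounts to the following
	// sketch; the table is read straight off the diagram above, and
	// `map' and `x' are names invented for this comment.
	//
	//	static const unsigned char map[16] =
	//		{ 0, 5, 10, 15,   4, 9, 14, 3,
	//		  8, 13, 2, 7,    12, 1, 6, 11 };
	//	for (i = 0; i < 16; i++) in[i] = x[map[i]];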
	movdqu	xmm0, [IN + 0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns
	// simultaneously.  Alas, there doesn't seem to be a packed
	// doubleword rotate, so we have to synthesize it.

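	// Each rotation therefore costs a copy, two shifts, and an OR:
	// in C terms (sketch), t = a + d; b ^= (t << 7) | (t >> 25).
	// The movdqa makes the second copy that the pair of shifts
	// consumes.
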
	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]	-->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

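	// In C terms the transpose is just lane rotations plus renaming
	// (a sketch, reading the diagram above; the names are invented
	// for this comment):
	//
	//	new_b[i] = old_d[(i + 1) % 4];
	//	new_c[i] = old_c[(i + 2) % 4];
	//	new_d[i] = old_b[(i + 3) % 4];
	//
	// and a is untouched; no word ever crosses between rows.
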
	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We have to undo the transpose ready for the next round.  Again,
	// the shuffles have been pushed back up the instruction list
	// because they take a long time coming through.  Decrement the
	// loop counter (by two, since each trip through the loop does a
	// column round and a row round) and see if we should go round
	// again.  Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11
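
	// (In C terms: y[i] = x[i] + in[i] mod 2^32 for each of the
	// sixteen words; this is the step which stops the core from
	// being an easily invertible permutation.)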

	// Next we must undo the permutation which was already applied to
	// the input.  This can be done by juggling values in registers,
	// with the following fancy footwork: some row rotations, a
	// transpose, and some more rotations.
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	//  1,  6, 11, 12

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15

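	// The net effect, in terms of the earlier C sketch, is
	// out[map[i]] = y[i] for each i: exactly the inverse of the
	// load-time permutation, so the matrix is stored in its natural
	// order.
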
	// Finally we have to write out the result.
	movdqu	[OUT + 0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up.
#if CPUFAM_X86
	mov	esp, ebp
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa	xmm6, [rsp + 0]
	movdqa	xmm7, [rsp + 16]
	add	rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------