mdw@git.distorted.org.uk Git - catacomb/blob - symm/salsa20-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of Salsa20
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Main code.
  35
  36         .arch pentium4
  37         .section .text
  38
  39 FUNC(salsa20_core_x86_sse2)
  40
             // Salsa20 core using SSE2: load the sixteen 32-bit words of the
             // input matrix, run the double-round loop below, add the original
             // input back in, and store the sixteen result words to the output
             // matrix.
             //
             // Registers written: ecx, edx, xmm0--xmm7 and the flags.  ebp and
             // esp are restored before returning.
             //
  41         // Initial state.  We have three arguments:
  42         // [ebp +  8] is the number of rounds to do
  43         // [ebp + 12] points to the input matrix
  44         // [ebp + 16] points to the output matrix
  45         push    ebp
  46         mov     ebp, esp
             // Reserve two 16-byte slots for the saved copies of a and b, and
             // round esp down to a 16-byte boundary so that the aligned
             // movdqa stores to them are legal.  ebp still holds the original
             // frame, which is what the epilogue restores esp from.
  47         sub     esp, 32
  48         mov     edx, [ebp + 12]
  49         and     esp, ~15
  50
  51         // Prepare for the main loop.
             // ecx is the count of rounds still to do; it drops by two per
             // pass through the loop.
  52         mov     ecx, [ebp + 8]
  53
  54         // First job is to slurp the matrix into XMM registers.  The words
  55         // have already been permuted conveniently to make them line up
  56         // better for SIMD processing.
  57         //
  58         // The textbook arrangement of the matrix is this.
  59         //
  60         //      [C K K K]
  61         //      [K C N N]
  62         //      [T T C K]
  63         //      [K K K C]
  64         //
  65         // But we've rotated the columns up so that the main diagonal with
  66         // the constants on it ends up in the first row, giving something
  67         // more like
  68         //
  69         //      [C C C C]
  70         //      [K T K K]
  71         //      [T K K N]
  72         //      [K K N K]
  73         //
  74         // so the transformation looks like this:
  75         //
  76         //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
  77         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
  78         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
  79         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
             //
             // The loads are unaligned (movdqu), so no alignment is demanded
             // of the input pointer.
  80         movdqu  xmm0, [edx +  0]
  81         movdqu  xmm1, [edx + 16]
  82         movdqu  xmm2, [edx + 32]
  83         movdqu  xmm3, [edx + 48]
  84
  85         // Take a copy for later.
             // a and b go to the aligned stack slots; c and d stay in xmm6 and
             // xmm7, which the loop body never touches (it uses xmm0--xmm5).
  86         movdqa  [esp +  0], xmm0
  87         movdqa  [esp + 16], xmm1
  88         movdqa  xmm6, xmm2
  89         movdqa  xmm7, xmm3
  90
  91 loop:
  92
  93         // Apply a column quarterround to each of the columns simultaneously.
  94         // Alas, there doesn't seem to be a packed doubleword rotate, so we
  95         // have to synthesize it.
             //
             // Each rotate left by n is built as (x << n) | (x >> (32 - n)),
             // with xmm4 and xmm5 as scratch.
  96
  97         // b ^= (a + d) <<<  7
  98         movdqa  xmm4, xmm0
  99         paddd   xmm4, xmm3
 100         movdqa  xmm5, xmm4
 101         pslld   xmm4, 7
 102         psrld   xmm5, 25
 103         por     xmm4, xmm5
 104         pxor    xmm1, xmm4
 105
 106         // c ^= (b + a) <<<  9
 107         movdqa  xmm4, xmm1
 108         paddd   xmm4, xmm0
 109         movdqa  xmm5, xmm4
 110         pslld   xmm4, 9
 111         psrld   xmm5, 23
 112         por     xmm4, xmm5
 113         pxor    xmm2, xmm4
 114
 115         // d ^= (c + b) <<< 13
             // (b has been read for the last time in this round once the
             // paddd is done, so its shuffle for the row round starts here.)
 116         movdqa  xmm4, xmm2
 117         paddd   xmm4, xmm1
 118         pshufd  xmm1, xmm1, 0x93
 119         movdqa  xmm5, xmm4
 120         pslld   xmm4, 13
 121         psrld   xmm5, 19
 122         por     xmm4, xmm5
 123         pxor    xmm3, xmm4
 124
 125         // a ^= (d + c) <<< 18
             // (Likewise d and c are shuffled as soon as they have been copied
             // or added into the scratch register.)
 126         movdqa  xmm4, xmm3
 127         pshufd  xmm3, xmm3, 0x39
 128         paddd   xmm4, xmm2
 129         pshufd  xmm2, xmm2, 0x4e
 130         movdqa  xmm5, xmm4
 131         pslld   xmm4, 18
 132         psrld   xmm5, 14
 133         por     xmm4, xmm5
 134         pxor    xmm0, xmm4
 135
 136         // The transpose conveniently only involves reordering elements of
 137         // individual rows, which can be done quite easily, and reordering
 138         // the rows themselves, which is a trivial renaming.  It doesn't
 139         // involve any movement of elements between rows.
 140         //
 141         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 142         //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 143         //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 144         //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 145         //
 146         // The shuffles have quite high latency, so they've been pushed
 147         // backwards into the main instruction list.
             //
             // The shuffle constants used throughout are:
             //
             //      0x93    element i moves to lane i + 1 (wrapping round)
             //      0x39    element i moves to lane i - 1 (wrapping round)
             //      0x4e    the two 64-bit halves change places
 148
 149         // Apply the row quarterround to each of the columns (yes!)
 150         // simultaneously.
             //
             // Note the renaming from the diagram above: for this half of the
             // loop, b lives in xmm3 and d lives in xmm1.
 151
 152         // b ^= (a + d) <<<  7
 153         movdqa  xmm4, xmm0
 154         paddd   xmm4, xmm1
 155         movdqa  xmm5, xmm4
 156         pslld   xmm4, 7
 157         psrld   xmm5, 25
 158         por     xmm4, xmm5
 159         pxor    xmm3, xmm4
 160
 161         // c ^= (b + a) <<<  9
 162         movdqa  xmm4, xmm3
 163         paddd   xmm4, xmm0
 164         movdqa  xmm5, xmm4
 165         pslld   xmm4, 9
 166         psrld   xmm5, 23
 167         por     xmm4, xmm5
 168         pxor    xmm2, xmm4
 169
 170         // d ^= (c + b) <<< 13
 171         movdqa  xmm4, xmm2
 172         paddd   xmm4, xmm3
 173         pshufd  xmm3, xmm3, 0x93
 174         movdqa  xmm5, xmm4
 175         pslld   xmm4, 13
 176         psrld   xmm5, 19
 177         por     xmm4, xmm5
 178         pxor    xmm1, xmm4
 179
 180         // a ^= (d + c) <<< 18
 181         movdqa  xmm4, xmm1
 182         pshufd  xmm1, xmm1, 0x39
 183         paddd   xmm4, xmm2
 184         pshufd  xmm2, xmm2, 0x4e
 185         movdqa  xmm5, xmm4
 186         pslld   xmm4, 18
 187         psrld   xmm5, 14
 188         por     xmm4, xmm5
 189         pxor    xmm0, xmm4
 190
 191         // We had to undo the transpose ready for the next loop.  Again, push
 192         // back the shuffles because they take a long time coming through.
 193         // Decrement the loop counter and see if we should go round again.
 194         // Later processors fuse this pair into a single uop.
             //
             // The branch is taken only if the subtraction neither borrowed nor
             // left zero, i.e., if the count beforehand was more than 2
             // (unsigned).  The loop body therefore always runs at least once,
             // and each pass accounts for two rounds.
 195         sub     ecx, 2
 196         ja      loop
 197
 198         // Almost there.  Firstly, the feedforward addition, and then we have
 199         // to write out the result.  Here we have to undo the permutation
 200         // which was already applied to the input.  Shuffling has quite high
 201         // latency, so arrange to start a new shuffle into a temporary as
 202         // soon as we've written out the old value.
             //
             // movd stores only lane 0 of a register, so each of the other
             // three words in a row is first brought round to lane 0 with a
             // pshufd.  [edx + 4*i] is word i of the output in the textbook
             // order.  xmm4--xmm7 are free for use as temporaries here: the
             // saved copies of c and d in xmm6 and xmm7 are consumed by the
             // paddd instructions before those registers are overwritten.
 203         mov     edx, [ebp + 16]
 204
 205         paddd   xmm0, [esp +  0]
 206         pshufd  xmm4, xmm0, 0x39
 207         movd    [edx +  0], xmm0
 208
 209         paddd   xmm1, [esp + 16]
 210         pshufd  xmm5, xmm1, 0x93
 211         movd    [edx + 16], xmm1
 212
 213         paddd   xmm2, xmm6
 214         pshufd  xmm6, xmm2, 0x4e
 215         movd    [edx + 32], xmm2
 216
 217         paddd   xmm3, xmm7
 218         pshufd  xmm7, xmm3, 0x39
 219         movd    [edx + 48], xmm3
 220
             // Words 1, 6 and 11, from row d.
 221         movd    [edx +  4], xmm7
 222         pshufd  xmm7, xmm3, 0x4e
 223         movd    [edx + 24], xmm7
 224         pshufd  xmm3, xmm3, 0x93
 225         movd    [edx + 44], xmm3
 226
             // Words 2, 7 and 13, from row c.
 227         movd    [edx +  8], xmm6
 228         pshufd  xmm6, xmm2, 0x93
 229         movd    [edx + 28], xmm6
 230         pshufd  xmm2, xmm2, 0x39
 231         movd    [edx + 52], xmm2
 232
             // Words 3, 9 and 14, from row b.
 233         movd    [edx + 12], xmm5
 234         pshufd  xmm5, xmm1, 0x39
 235         movd    [edx + 36], xmm5
 236         pshufd  xmm1, xmm1, 0x4e
 237         movd    [edx + 56], xmm1
 238
             // Words 5, 10 and 15, from row a.
 239         movd    [edx + 20], xmm4
 240         pshufd  xmm4, xmm0, 0x4e
 241         movd    [edx + 40], xmm4
 242         pshufd  xmm0, xmm0, 0x93
 243         movd    [edx + 60], xmm0
 244
 245         // Tidy things up.
             // Restoring esp from ebp discards the aligned scratch area.  The
             // saved copies of a and b are left behind in the abandoned stack
             // space: nothing here overwrites them.
 246         mov     esp, ebp
 247         pop     ebp
 248
 249         // And with that, we're done.
 250         ret
 251
 252 ENDFUNC
 253
 254 ///----- That's all, folks --------------------------------------------------