1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// Fancy SIMD implementation of Salsa20
5 /// (c) 2015 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
28 /// External definitions.
31 #include "asm-common.h"
33 ///--------------------------------------------------------------------------
36 // Magic constants for shuffling.
41 ///--------------------------------------------------------------------------
47 FUNC(salsa20_core_x86_sse2)
// Salsa20 core for x86 using SSE2. The 4x4 matrix of 32-bit words is
// held as four rows a, b, c, d in xmm0-xmm3, and each quarterround step
// is applied to all four columns at once.
//
// NOTE(review): several steps described by the comments below (frame
// setup, the add/rotate/xor sequences, loop control, some of the output
// stores, the return) have no corresponding instructions in this
// listing -- confirm against the complete source before relying on it.
49 // Initial state. We have three arguments:
50 // [ebp + 8] is the number of rounds to do
51 // [ebp + 12] points to the input matrix
52 // [ebp + 16] points to the output matrix
59 // Prepare for the main loop.
62 // First job is to slurp the matrix into XMM registers. The words
63 // have already been permuted conveniently to make them line up
64 // better for SIMD processing.
66 // The textbook arrangement of the matrix is row-major (left-hand side below).
73 // But we've rotated the columns up so that the main diagonal with
74 // the constants on it ends up in the first row,
82 // so the transformation looks like this:
84 // [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
85 // [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
86 // [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
87 // [12 13 14 15] [12 1 6 11] (d, xmm3)
// The numbers above are word indices in the textbook matrix; each row
// is listed lowest element first. Unaligned loads are used, so the
// input buffer needs no particular alignment.
// NOTE(review): edx presumably holds the input pointer from [ebp + 12];
// the instruction which loads it is not visible here.
88 movdqu xmm0, [edx + 0]
89 movdqu xmm1, [edx + 16]
90 movdqu xmm2, [edx + 32]
91 movdqu xmm3, [edx + 48]
93 // Take a copy for later.
// Rows a and b are saved on the stack for the feedforward addition at
// the end. These are aligned stores, so esp must be 16-byte aligned.
// NOTE(review): no save of rows c and d is visible in this listing.
94 movdqa [esp + 0], xmm0
95 movdqa [esp + 16], xmm1
101 // Apply a column quarterround to each of the columns simultaneously.
102 // Alas, there doesn't seem to be a packed doubleword rotate, so we
103 // have to synthesize it.
105 // b ^= (a + d) <<< 7
114 // c ^= (b + a) <<< 9
123 // d ^= (c + b) <<< 13
// Start transposing: old b [ 4 9 14 3] becomes new d [ 3 4 9 14].
126 pshufd xmm1, xmm1, ROTL
133 // a ^= (d + c) <<< 18
// Old d [12 1 6 11] becomes new b [ 1 6 11 12].
135 pshufd xmm3, xmm3, ROTR
// Row c stays in xmm2: [ 8 13 2 7] becomes [ 2 7 8 13].
137 pshufd xmm2, xmm2, ROT2
144 // The transpose conveniently only involves reordering elements of
145 // individual rows, which can be done quite easily, and reordering
146 // the rows themselves, which is a trivial renaming. It doesn't
147 // involve any movement of elements between rows.
149 // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
150 // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
151 // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
152 // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
154 // The shuffles have quite high latency, so they've been pushed
155 // backwards into the main instruction list.
157 // Apply the row quarterround to each of the columns (yes!)
// The matrix is now transposed, so its rows lie along our columns.
// Note that b is now in xmm3 and d is in xmm1.
160 // b ^= (a + d) <<< 7
169 // c ^= (b + a) <<< 9
178 // d ^= (c + b) <<< 13
// Start undoing the transpose: xmm3 goes back to [12 1 6 11] (d).
181 pshufd xmm3, xmm3, ROTL
188 // a ^= (d + c) <<< 18
// xmm1 goes back to [ 4 9 14 3] (b).
190 pshufd xmm1, xmm1, ROTR
// xmm2 goes back to [ 8 13 2 7] (c).
192 pshufd xmm2, xmm2, ROT2
199 // We have to undo the transpose ready for the next loop. Again, push
200 // back the shuffles because they take a long time coming through.
201 // Decrement the loop counter and see if we should go round again.
202 // Later processors fuse this pair into a single uop.
206 // Almost there. Firstly, the feedforward addition, and then we have
207 // to write out the result. Here we have to undo the permutation
208 // which was already applied to the input. Shuffling has quite high
209 // latency, so arrange to start a new shuffle into a temporary as
210 // soon as we've written out the old value.
// Each movd below writes the lowest word of its source register; word i
// of the textbook matrix goes to byte offset 4*i of the output.
// NOTE(review): edx presumably points at the output matrix from
// [ebp + 16] by now; the reload, the feedforward additions for rows c
// and d, and the stores to offsets 0, 4 and 8 are not visible here.
//
// Row a: add the saved input; xmm4 = [ 5 10 15 0] for later.
213 paddd xmm0, [esp + 0]
214 pshufd xmm4, xmm0, ROTR
// Row b: add the saved input; xmm5 = [ 3 4 9 14] for later; write
// word 4.
217 paddd xmm1, [esp + 16]
218 pshufd xmm5, xmm1, ROTL
219 movd [edx + 16], xmm1
// Row c: xmm6 = [ 2 7 8 13] for later; write word 8.
222 pshufd xmm6, xmm2, ROT2
223 movd [edx + 32], xmm2
// Row d: xmm7 = [ 1 6 11 12] for later; write word 12.
226 pshufd xmm7, xmm3, ROTR
227 movd [edx + 48], xmm3
// Remaining words of row d: 6 and 11.
230 pshufd xmm7, xmm3, ROT2
231 movd [edx + 24], xmm7
232 pshufd xmm3, xmm3, ROTL
233 movd [edx + 44], xmm3
// Remaining words of row c: 7 and 13.
236 pshufd xmm6, xmm2, ROTL
237 movd [edx + 28], xmm6
238 pshufd xmm2, xmm2, ROTR
239 movd [edx + 52], xmm2
// Remaining words of row b: 3, 9 and 14.
241 movd [edx + 12], xmm5
242 pshufd xmm5, xmm1, ROTR
243 movd [edx + 36], xmm5
244 pshufd xmm1, xmm1, ROT2
245 movd [edx + 56], xmm1
// Remaining words of row a: 5, 10 and 15.
247 movd [edx + 20], xmm4
248 pshufd xmm4, xmm0, ROT2
249 movd [edx + 40], xmm4
250 pshufd xmm0, xmm0, ROTL
251 movd [edx + 60], xmm0
257 // And with that, we're done.
262 ///----- That's all, folks --------------------------------------------------