1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// Fancy SIMD implementation of Salsa20
5 /// (c) 2015 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
28 /// External definitions.
31 #include "asm-common.h"
33 ///--------------------------------------------------------------------------
39 FUNC(salsa20_core_x86_sse2)
// Salsa20 core using SSE2 on 32-bit x86.  The working matrix lives in
// xmm0--xmm3 as rows a, b, c, d; xmm4--xmm7 serve as temporaries while
// the result is written out; and a copy of rows a and b is parked in
// the 32 bytes at [esp] for the final feedforward addition.
//
// pshufd immediates used throughout (dst lane i <- src lane named by
// bits 2i+1:2i of the immediate):
//	0x39:  [x0 x1 x2 x3] -> [x1 x2 x3 x0]
//	0x4e:  [x0 x1 x2 x3] -> [x2 x3 x0 x1]
//	0x93:  [x0 x1 x2 x3] -> [x3 x0 x1 x2]
41 // Initial state. We have three arguments:
42 // [ebp + 8] is the number of rounds to do
43 // [ebp + 12] points to the input matrix
44 // [ebp + 16] points to the output matrix
51 // Prepare for the main loop.
54 // First job is to slurp the matrix into XMM registers. The words
55 // have already been permuted conveniently to make them line up
56 // better for SIMD processing.
58 // The textbook arrangement of the matrix is this.
65 // But we've rotated the columns up so that the main diagonal with
66 // the constants on it ends up in the first row, giving something more
74 // so the transformation looks like this:
76 // [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
77 // [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
78 // [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
79 // [12 13 14 15] [12 1 6 11] (d, xmm3)
// movdqu tolerates any alignment of the source.  edx is expected to
// hold the input-matrix pointer ([ebp + 12]) at this point -- confirm
// against the prologue.
80 movdqu xmm0, [edx + 0]
81 movdqu xmm1, [edx + 16]
82 movdqu xmm2, [edx + 32]
83 movdqu xmm3, [edx + 48]
85 // Take a copy for later.
// movdqa faults on a misaligned address, so esp must be 16-byte
// aligned by the time we get here.
86 movdqa [esp + 0], xmm0
87 movdqa [esp + 16], xmm1
93 // Apply a column quarterround to each of the columns simultaneously.
94 // Alas, there doesn't seem to be a packed doubleword rotate, so we
95 // have to synthesize it.
106 // c ^= (b + a) <<< 9
115 // d ^= (c + b) <<< 13
118 pshufd xmm1, xmm1, 0x93	// [ 4  9 14  3] -> [ 3  4  9 14]: b becomes next d
125 // a ^= (d + c) <<< 18
127 pshufd xmm3, xmm3, 0x39	// [12  1  6 11] -> [ 1  6 11 12]: d becomes next b
129 pshufd xmm2, xmm2, 0x4e	// [ 8 13  2  7] -> [ 2  7  8 13]: c stays c
136 // The transpose conveniently only involves reordering elements of
137 // individual rows, which can be done quite easily, and reordering
138 // the rows themselves, which is a trivial renaming. It doesn't
139 // involve any movement of elements between rows.
141 // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
142 // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
143 // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
144 // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
146 // The shuffles have quite high latency, so they've been pushed
147 // backwards into the main instruction list.
149 // Apply the row quarterround to each of the columns (yes!)
152 // b ^= (a + d) <<< 7
161 // c ^= (b + a) <<< 9
170 // d ^= (c + b) <<< 13
173 pshufd xmm3, xmm3, 0x93	// [ 1  6 11 12] -> [12  1  6 11]: back to row d
180 // a ^= (d + c) <<< 18
182 pshufd xmm1, xmm1, 0x39	// [ 3  4  9 14] -> [ 4  9 14  3]: back to row b
184 pshufd xmm2, xmm2, 0x4e	// [ 2  7  8 13] -> [ 8 13  2  7]: back to row c
191 // We have to undo the transpose ready for the next loop. Again, push
192 // back the shuffles because they take a long time coming through.
193 // Decrement the loop counter and see if we should go round again.
194 // Later processors fuse this pair into a single uop.
198 // Almost there. Firstly, the feedforward addition, and then we have
199 // to write out the result. Here we have to undo the permutation
200 // which was already applied to the input. Shuffling has quite high
201 // latency, so arrange to start a new shuffle into a temporary as
202 // soon as we've written out the old value.
//
// movd stores only lane 0 of its source, so each word is rotated into
// lane 0 by a pshufd and then written at offset 4*(textbook index).
// The `word N' notes below use the textbook numbering from the layout
// diagram above.  edx is expected to hold the output-matrix pointer
// ([ebp + 16]) from here on -- confirm against the surrounding code.
//
// paddd with a memory operand, like movdqa, needs the address to be
// 16-byte aligned.
205 paddd xmm0, [esp + 0]	// row a += saved input row a
206 pshufd xmm4, xmm0, 0x39	// xmm4 lane 0 = word 5
209 paddd xmm1, [esp + 16]	// row b += saved input row b
210 pshufd xmm5, xmm1, 0x93	// xmm5 lane 0 = word 3
211 movd [edx + 16], xmm1	// word 4
214 pshufd xmm6, xmm2, 0x4e	// xmm6 lane 0 = word 2
215 movd [edx + 32], xmm2	// word 8
218 pshufd xmm7, xmm3, 0x39	// xmm7 lane 0 = word 1
219 movd [edx + 48], xmm3	// word 12
// Remaining words of row d.
222 pshufd xmm7, xmm3, 0x4e
223 movd [edx + 24], xmm7	// word 6
224 pshufd xmm3, xmm3, 0x93
225 movd [edx + 44], xmm3	// word 11
// Remaining words of row c.
228 pshufd xmm6, xmm2, 0x93
229 movd [edx + 28], xmm6	// word 7
230 pshufd xmm2, xmm2, 0x39
231 movd [edx + 52], xmm2	// word 13
// Remaining words of row b.
233 movd [edx + 12], xmm5	// word 3
234 pshufd xmm5, xmm1, 0x39
235 movd [edx + 36], xmm5	// word 9
236 pshufd xmm1, xmm1, 0x4e
237 movd [edx + 56], xmm1	// word 14
// Remaining words of row a.
239 movd [edx + 20], xmm4	// word 5
240 pshufd xmm4, xmm0, 0x4e
241 movd [edx + 40], xmm4	// word 10
242 pshufd xmm0, xmm0, 0x93
243 movd [edx + 60], xmm0	// word 15
249 // And with that, we're done.
254 ///----- That's all, folks --------------------------------------------------