1 ### -*- mode: asm; asm-comment-char: ?# -*-
3 ### Fancy SIMD implementation of Salsa20
5 ### (c) 2015 Straylight/Edgeware
8 ###----- Licensing notice ---------------------------------------------------
10 ### This file is part of Catacomb.
12 ### Catacomb is free software; you can redistribute it and/or modify
13 ### it under the terms of the GNU Library General Public License as
14 ### published by the Free Software Foundation; either version 2 of the
15 ### License, or (at your option) any later version.
17 ### Catacomb is distributed in the hope that it will be useful,
18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ### GNU Library General Public License for more details.
22 ### You should have received a copy of the GNU Library General Public
23 ### License along with Catacomb; if not, write to the Free
24 ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 ### MA 02111-1307, USA.
27 .intel_syntax noprefix
32 .globl salsa20_core_x86_sse2
33 .type salsa20_core_x86_sse2, STT_FUNC
## salsa20_core_x86_sse2: SSE2 implementation of the Salsa20 core
## function, written for 32-bit x86 registers (ebp, esp, edx) in GNU
## assembler Intel syntax.  The state is held as four rows of four
## 32-bit words in xmm0--xmm3; xmm4--xmm7 are used as temporaries in
## the output stage.
##
## NOTE(review): in this view of the file no instruction establishes
## ebp, loads edx, reserves or aligns stack space at esp, performs the
## add/shift/xor steps of the quarterrounds, branches back round the
## loop, or returns.  The annotations below therefore describe only
## the instructions which are visible; confirm the rest against the
## complete source.
34 salsa20_core_x86_sse2:
36 ## Initial state. We have three arguments:
37 ## [ebp + 8] is the number of rounds to do
38 ## [ebp + 12] points to the input matrix
39 ## [ebp + 16] points to the output matrix
## NOTE(review): the instructions which set up ebp and fetch these
## arguments into registers are not visible here.
46 ## Prepare for the main loop.
49 ## First job is to slurp the matrix into XMM registers. The words
50 ## have already been permuted conveniently to make them line up
51 ## better for SIMD processing.
53 ## The textbook arrangement of the matrix is this.
60 ## But we've rotated the columns up so that the main diagonal with
61 ## the constants on it ends up in the first row, giving something more
69 ## so the transformation looks like this:
71 ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
72 ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
73 ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
74 ## [12 13 14 15] [12 1 6 11] (d, xmm3)
## Load the four rows from consecutive 16-byte chunks at edx.  These
## are unaligned loads (movdqu), so no alignment is demanded of the
## buffer.  NOTE(review): presumably edx holds the input pointer from
## [ebp + 12] at this point -- confirm against the full source.
75 movdqu xmm0, [edx + 0]
76 movdqu xmm1, [edx + 16]
77 movdqu xmm2, [edx + 32]
78 movdqu xmm3, [edx + 48]
80 ## Take a copy for later.
## Rows a and b are saved in stack slots at [esp + 0] and [esp + 16];
## they are added back in during the feedforward step at the end.
## movdqa is an aligned store, so esp must be a multiple of 16 here.
## NOTE(review): the copies of rows c and d, and the code which
## reserves and aligns this stack space, are not visible in this view.
81 movdqa [esp + 0], xmm0
82 movdqa [esp + 16], xmm1
88 ## Apply a column quarterround to each of the columns simultaneously.
89 ## Alas, there doesn't seem to be a packed doubleword rotate, so we
90 ## have to synthesize it.
101 ## c ^= (b + a) <<< 9
110 ## d ^= (c + b) <<< 13
## In `pshufd dst, src, imm8', two-bit field i of imm8 (counting from
## the least significant end) selects the source lane which is copied
## to destination lane i.  So 0x93 picks lanes 3, 0, 1, 2: row b
## [ 4 9 14 3] becomes [ 3 4 9 14], which is row d of the transposed
## matrix shown below.
113 pshufd xmm1, xmm1, 0x93
120 ## a ^= (d + c) <<< 18
## 0x39 picks lanes 1, 2, 3, 0: row d [12 1 6 11] becomes
## [ 1 6 11 12], which is row b of the transposed matrix.
122 pshufd xmm3, xmm3, 0x39
## 0x4e picks lanes 2, 3, 0, 1, swapping the two halves: row c
## [ 8 13 2 7] becomes [ 2 7 8 13].
124 pshufd xmm2, xmm2, 0x4e
131 ## The transpose conveniently only involves reordering elements of
132 ## individual rows, which can be done quite easily, and reordering
133 ## the rows themselves, which is a trivial renaming. It doesn't
134 ## involve any movement of elements between rows.
136 ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
137 ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
138 ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
139 ## [12 1 6 11] [ 3 4 9 14] (d, xmm1)
141 ## The shuffles have quite high latency, so they've been pushed
142 ## backwards into the main instruction list.
144 ## Apply the row quarterround to each of the columns (yes!)
147 ## b ^= (a + d) <<< 7
156 ## c ^= (b + a) <<< 9
165 ## d ^= (c + b) <<< 13
## The next three shuffles are the inverses of the three above, and
## put the rows back into the layout used at the top of the loop.
## Here xmm3 (currently b) goes from [ 1 6 11 12] back to
## [12 1 6 11].
168 pshufd xmm3, xmm3, 0x93
175 ## a ^= (d + c) <<< 18
## xmm1 (currently d) goes from [ 3 4 9 14] back to [ 4 9 14 3].
177 pshufd xmm1, xmm1, 0x39
## Swapping the halves of xmm2 a second time restores [ 8 13 2 7].
179 pshufd xmm2, xmm2, 0x4e
186 ## We had to undo the transpose ready for the next loop. Again, push
187 ## back the shuffles because they take a long time coming through.
188 ## Decrement the loop counter and see if we should go round again.
189 ## Later processors fuse this pair into a single uop.
## NOTE(review): the decrement and branch referred to above are not
## visible in this view.
193 ## Almost there. Firstly, the feedforward addition, and then we have
194 ## to write out the result. Here we have to undo the permutation
195 ## which was already applied to the input. Shuffling has quite high
196 ## latency, so arrange to start a new shuffle into a temporary as
197 ## soon as we've written out the old value.
## Each movd below stores only lane 0 of its source register, so every
## output word is first brought into lane 0 by a pshufd.  The word
## numbers quoted assume that the rows are in the layout shown in the
## first diagram above: word n belongs at byte offset 4 n.
## NOTE(review): presumably edx holds the output pointer from
## [ebp + 16] by this point -- the reload is not visible.  The stores
## to offsets 0, 4 and 8, and the feedforward additions for rows c and
## d, are not visible in this view either.
##
## Row a: add the saved input row (the memory operand of paddd must be
## 16-byte aligned), and bring word 5 into lane 0 of xmm4.
200 paddd xmm0, [esp + 0]
201 pshufd xmm4, xmm0, 0x39
## Row b: feedforward, bring word 3 into lane 0 of xmm5, and store
## word 4.
204 paddd xmm1, [esp + 16]
205 pshufd xmm5, xmm1, 0x93
206 movd [edx + 16], xmm1
## Row c: bring word 2 into lane 0 of xmm6, and store word 8.
209 pshufd xmm6, xmm2, 0x4e
210 movd [edx + 32], xmm2
## Row d: bring word 1 into lane 0 of xmm7, and store word 12.
213 pshufd xmm7, xmm3, 0x39
214 movd [edx + 48], xmm3
## Rest of row d: word 6 to offset 24 and word 11 to offset 44.
217 pshufd xmm7, xmm3, 0x4e
218 movd [edx + 24], xmm7
219 pshufd xmm3, xmm3, 0x93
220 movd [edx + 44], xmm3
## Rest of row c: word 7 to offset 28 and word 13 to offset 52.
223 pshufd xmm6, xmm2, 0x93
224 movd [edx + 28], xmm6
225 pshufd xmm2, xmm2, 0x39
226 movd [edx + 52], xmm2
## Rest of row b: word 3 (shuffled into xmm5 earlier) to offset 12,
## then word 9 to offset 36 and word 14 to offset 56.
228 movd [edx + 12], xmm5
229 pshufd xmm5, xmm1, 0x39
230 movd [edx + 36], xmm5
231 pshufd xmm1, xmm1, 0x4e
232 movd [edx + 56], xmm1
## Rest of row a: word 5 (shuffled into xmm4 earlier) to offset 20,
## then word 10 to offset 40 and word 15 to offset 60.
234 movd [edx + 20], xmm4
235 pshufd xmm4, xmm0, 0x4e
236 movd [edx + 40], xmm4
237 pshufd xmm0, xmm0, 0x93
238 movd [edx + 60], xmm0
240 ## And with that, we're done.
## NOTE(review): the stack cleanup and return instruction are not
## visible in this view.
245 .size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2
247 ###----- That's all, folks --------------------------------------------------