Commit | Line | Data |
---|---|---|
e10e6494 MW |
1 | ### -*- mode: asm; asm-comment-char: ?# -*- |
2 | ### | |
3 | ### Fancy SIMD implementation of Salsa20 | |
4 | ### | |
5 | ### (c) 2015 Straylight/Edgeware | |
6 | ### | |
7 | ||
8 | ###----- Licensing notice --------------------------------------------------- | |
9 | ### | |
10 | ### This file is part of Catacomb. | |
11 | ### | |
12 | ### Catacomb is free software; you can redistribute it and/or modify | |
13 | ### it under the terms of the GNU Library General Public License as | |
14 | ### published by the Free Software Foundation; either version 2 of the | |
15 | ### License, or (at your option) any later version. | |
16 | ### | |
17 | ### Catacomb is distributed in the hope that it will be useful, | |
18 | ### but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ### GNU Library General Public License for more details. | |
21 | ### | |
22 | ### You should have received a copy of the GNU Library General Public | |
23 | ### License along with Catacomb; if not, write to the Free | |
24 | ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | ### MA 02111-1307, USA. | |
26 | ||
27 | .intel_syntax noprefix | |
28 | .arch pentium4 | |
29 | ||
30 | .section .text | |
31 | ||
32 | .globl salsa20_core_x86_sse2 | |
33 | .type salsa20_core_x86_sse2, STT_FUNC | |
34 | salsa20_core_x86_sse2: | |
35 | ||
36 | ## Initial state. We have three arguments: | |
37 | ## [ebp + 8] is the number of rounds to do | |
38 | ## [ebp + 12] points to the input matrix | |
39 | ## [ebp + 16] points to the output matrix | |
40 | push ebp | |
41 | mov ebp, esp | |
42 | sub esp, 32 | |
43 | mov edx, [ebp + 12] | |
44 | and esp, ~15 | |
45 | ||
46 | ## Prepare for the main loop. | |
47 | mov ecx, [ebp + 8] | |
48 | ||
49 | ## First job is to slurp the matrix into XMM registers. The words | |
50 | ## have already been permuted conveniently to make them line up | |
51 | ## better for SIMD processing. | |
52 | ## | |
53 | ## The textbook arrangement of the matrix is this. | |
54 | ## | |
55 | ## [C K K K] | |
56 | ## [K C N N] | |
57 | ## [T T C K] | |
58 | ## [K K K C] | |
59 | ## | |
60 | ## But we've rotated the columns up so that the main diagonal with | |
61 | ## the constants on it end up in the first row, giving something more | |
62 | ## like | |
63 | ## | |
64 | ## [C C C C] | |
65 | ## [K T K K] | |
66 | ## [T K K N] | |
67 | ## [K K N K] | |
68 | ## | |
69 | ## so the transformation looks like this: | |
70 | ## | |
71 | ## [ 0 1 2 3] [ 0 5 10 15] (a, xmm0) | |
72 | ## [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1) | |
73 | ## [ 8 9 10 11] [ 8 13 2 7] (c, xmm2) | |
74 | ## [12 13 14 15] [12 1 6 11] (d, xmm3) | |
75 | movdqu xmm0, [edx + 0] | |
76 | movdqu xmm1, [edx + 16] | |
77 | movdqu xmm2, [edx + 32] | |
78 | movdqu xmm3, [edx + 48] | |
79 | ||
80 | ## Take a copy for later. | |
81 | movdqa [esp + 0], xmm0 | |
82 | movdqa [esp + 16], xmm1 | |
83 | movdqa xmm6, xmm2 | |
84 | movdqa xmm7, xmm3 | |
85 | ||
86 | loop: | |
87 | ||
88 | ## Apply a column quarterround to each of the columns simultaneously. | |
89 | ## Alas, there doesn't seem to be a packed doubleword rotate, so we | |
90 | ## have to synthesize it. | |
91 | ||
92 | ## b ^= (a + d) <<< 7 | |
93 | movdqa xmm4, xmm0 | |
94 | paddd xmm4, xmm3 | |
95 | movdqa xmm5, xmm4 | |
96 | pslld xmm4, 7 | |
97 | psrld xmm5, 25 | |
98 | por xmm4, xmm5 | |
99 | pxor xmm1, xmm4 | |
100 | ||
101 | ## c ^= (b + a) <<< 9 | |
102 | movdqa xmm4, xmm1 | |
103 | paddd xmm4, xmm0 | |
104 | movdqa xmm5, xmm4 | |
105 | pslld xmm4, 9 | |
106 | psrld xmm5, 23 | |
107 | por xmm4, xmm5 | |
108 | pxor xmm2, xmm4 | |
109 | ||
110 | ## d ^= (c + b) <<< 13 | |
111 | movdqa xmm4, xmm2 | |
112 | paddd xmm4, xmm1 | |
113 | pshufd xmm1, xmm1, 0x93 | |
114 | movdqa xmm5, xmm4 | |
115 | pslld xmm4, 13 | |
116 | psrld xmm5, 19 | |
117 | por xmm4, xmm5 | |
118 | pxor xmm3, xmm4 | |
119 | ||
120 | ## a ^= (d + c) <<< 18 | |
121 | movdqa xmm4, xmm3 | |
122 | pshufd xmm3, xmm3, 0x39 | |
123 | paddd xmm4, xmm2 | |
124 | pshufd xmm2, xmm2, 0x4e | |
125 | movdqa xmm5, xmm4 | |
126 | pslld xmm4, 18 | |
127 | psrld xmm5, 14 | |
128 | por xmm4, xmm5 | |
129 | pxor xmm0, xmm4 | |
130 | ||
131 | ## The transpose conveniently only involves reordering elements of | |
132 | ## individual rows, which can be done quite easily, and reordering | |
133 | ## the rows themselves, which is a trivial renaming. It doesn't | |
134 | ## involve any movement of elements between rows. | |
135 | ## | |
136 | ## [ 0 5 10 15] [ 0 5 10 15] (a, xmm0) | |
137 | ## [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3) | |
138 | ## [ 8 13 2 7] [ 2 7 8 13] (c, xmm2) | |
139 | ## [12 1 6 11] [ 3 4 9 14] (d, xmm1) | |
140 | ## | |
141 | ## The shuffles have quite high latency, so they've been pushed | |
142 | ## backwards into the main instruction list. | |
143 | ||
144 | ## Apply the row quarterround to each of the columns (yes!) | |
145 | ## simultaneously. | |
146 | ||
147 | ## b ^= (a + d) <<< 7 | |
148 | movdqa xmm4, xmm0 | |
149 | paddd xmm4, xmm1 | |
150 | movdqa xmm5, xmm4 | |
151 | pslld xmm4, 7 | |
152 | psrld xmm5, 25 | |
153 | por xmm4, xmm5 | |
154 | pxor xmm3, xmm4 | |
155 | ||
156 | ## c ^= (b + a) <<< 9 | |
157 | movdqa xmm4, xmm3 | |
158 | paddd xmm4, xmm0 | |
159 | movdqa xmm5, xmm4 | |
160 | pslld xmm4, 9 | |
161 | psrld xmm5, 23 | |
162 | por xmm4, xmm5 | |
163 | pxor xmm2, xmm4 | |
164 | ||
165 | ## d ^= (c + b) <<< 13 | |
166 | movdqa xmm4, xmm2 | |
167 | paddd xmm4, xmm3 | |
168 | pshufd xmm3, xmm3, 0x93 | |
169 | movdqa xmm5, xmm4 | |
170 | pslld xmm4, 13 | |
171 | psrld xmm5, 19 | |
172 | por xmm4, xmm5 | |
173 | pxor xmm1, xmm4 | |
174 | ||
175 | ## a ^= (d + c) <<< 18 | |
176 | movdqa xmm4, xmm1 | |
177 | pshufd xmm1, xmm1, 0x39 | |
178 | paddd xmm4, xmm2 | |
179 | pshufd xmm2, xmm2, 0x4e | |
180 | movdqa xmm5, xmm4 | |
181 | pslld xmm4, 18 | |
182 | psrld xmm5, 14 | |
183 | por xmm4, xmm5 | |
184 | pxor xmm0, xmm4 | |
185 | ||
186 | ## We had to undo the transpose ready for the next loop. Again, push | |
187 | ## back the shuffles because they take a long time coming through. | |
188 | ## Decrement the loop counter and see if we should go round again. | |
189 | ## Later processors fuse this pair into a single uop. | |
190 | sub ecx, 2 | |
191 | ja loop | |
192 | ||
193 | ## Almost there. Firstly, the feedforward addition, and then we have | |
194 | ## to write out the result. Here we have to undo the permutation | |
195 | ## which was already applied to the input. Shuffling has quite high | |
196 | ## latency, so arrange to start a new shuffle into a temporary as | |
197 | ## soon as we've written out the old value. | |
198 | mov edx, [ebp + 16] | |
199 | ||
200 | paddd xmm0, [esp + 0] | |
201 | pshufd xmm4, xmm0, 0x39 | |
202 | movd [edx + 0], xmm0 | |
203 | ||
204 | paddd xmm1, [esp + 16] | |
205 | pshufd xmm5, xmm1, 0x93 | |
206 | movd [edx + 16], xmm1 | |
207 | ||
208 | paddd xmm2, xmm6 | |
209 | pshufd xmm6, xmm2, 0x4e | |
210 | movd [edx + 32], xmm2 | |
211 | ||
212 | paddd xmm3, xmm7 | |
213 | pshufd xmm7, xmm3, 0x39 | |
214 | movd [edx + 48], xmm3 | |
215 | ||
216 | movd [edx + 4], xmm7 | |
217 | pshufd xmm7, xmm3, 0x4e | |
218 | movd [edx + 24], xmm7 | |
219 | pshufd xmm3, xmm3, 0x93 | |
220 | movd [edx + 44], xmm3 | |
221 | ||
222 | movd [edx + 8], xmm6 | |
223 | pshufd xmm6, xmm2, 0x93 | |
224 | movd [edx + 28], xmm6 | |
225 | pshufd xmm2, xmm2, 0x39 | |
226 | movd [edx + 52], xmm2 | |
227 | ||
228 | movd [edx + 12], xmm5 | |
229 | pshufd xmm5, xmm1, 0x39 | |
230 | movd [edx + 36], xmm5 | |
231 | pshufd xmm1, xmm1, 0x4e | |
232 | movd [edx + 56], xmm1 | |
233 | ||
234 | movd [edx + 20], xmm4 | |
235 | pshufd xmm4, xmm0, 0x4e | |
236 | movd [edx + 40], xmm4 | |
237 | pshufd xmm0, xmm0, 0x93 | |
238 | movd [edx + 60], xmm0 | |
239 | ||
240 | ## And with that, we're done. | |
241 | mov esp, ebp | |
242 | pop ebp | |
243 | ret | |
244 | ||
245 | .size salsa20_core_x86_sse2, . - salsa20_core_x86_sse2 | |
246 | ||
247 | ###----- That's all, folks -------------------------------------------------- |