Preprocess the assembler files.
[catacomb] / symm / salsa20-x86-sse2.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// Fancy SIMD implementation of Salsa20
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
33///--------------------------------------------------------------------------
34/// Main code.
35
36 .arch pentium4
37 .section .text
38
FUNC(salsa20_core_x86_sse2)

        // Salsa20 core: run the requested number of rounds over a 4x4
        // matrix of 32-bit words, add the original input back in
        // (feedforward), and store the result.  All four quarterrounds of
        // a column (or row) round are done at once, one matrix row per XMM
        // register.
        //
        // Arguments are read from the stack:
        //
        //      [ebp +  8]      the number of rounds to do
        //      [ebp + 12]      points to the input matrix (64 bytes)
        //      [ebp + 16]      points to the output matrix (64 bytes)
        //
        // NOTE(review): presumably callable from C as something like
        // `f(unsigned rounds, const uint32 *in, uint32 *out)' -- confirm
        // against the caller's prototype.
        //
        // Rounds are consumed two at a time (one column round plus one row
        // round per trip), and the loop is tested at the bottom: at least
        // one double-round is always performed, and an odd count is
        // rounded up to the next even one.
        //
        // The input is loaded in full before anything is stored, and
        // neither pointer needs any particular alignment (unaligned loads,
        // doubleword stores), so input and output may be the same buffer.
        //
        // Clobbers: ecx, edx, xmm0--xmm7, flags.  Uses 32 bytes of
        // 16-byte-aligned scratch space below the frame.
        //
        // NOTE(review): the scratch copy of rows a and b is left on the
        // stack on return; going by the layout below, row b includes key
        // words.  Confirm whether callers expect it to be wiped.

        // Set up a frame, and carve out 32 bytes of scratch.  The stack
        // pointer is rounded down to a 16-byte boundary so that the scratch
        // area can be used with `movdqa' and as a `paddd' memory operand,
        // both of which demand aligned addresses.
        push    ebp
        mov     ebp, esp
        sub     esp, 32
        mov     edx, [ebp + 12]         // edx = input matrix
        and     esp, ~15

        // Prepare for the main loop.
        mov     ecx, [ebp + 8]          // ecx = rounds remaining

        // First job is to slurp the matrix into XMM registers.  The words
        // have already been permuted conveniently to make them line up
        // better for SIMD processing.
        //
        // The textbook arrangement of the matrix is this.
        //
        //      [C K K K]
        //      [K C N N]
        //      [T T C K]
        //      [K K K C]
        //
        // But we've rotated the columns up so that the main diagonal with
        // the constants on it ends up in the first row, giving something
        // more like
        //
        //      [C C C C]
        //      [K T K K]
        //      [T K K N]
        //      [K K N K]
        //
        // so the transformation looks like this:
        //
        //      [ 0  1  2  3]       [ 0  5 10 15]       (a, xmm0)
        //      [ 4  5  6  7]  -->  [ 4  9 14  3]       (b, xmm1)
        //      [ 8  9 10 11]       [ 8 13  2  7]       (c, xmm2)
        //      [12 13 14 15]       [12  1  6 11]       (d, xmm3)
        movdqu  xmm0, [edx +  0]
        movdqu  xmm1, [edx + 16]
        movdqu  xmm2, [edx + 32]
        movdqu  xmm3, [edx + 48]

        // Take a copy for the feedforward later.  Rows a and b go to the
        // aligned scratch area; rows c and d fit in the two XMM registers
        // which the round code doesn't use.
        movdqa  [esp +  0], xmm0
        movdqa  [esp + 16], xmm1
        movdqa  xmm6, xmm2
        movdqa  xmm7, xmm3

        // Top of the double-round loop.  A numeric label is local to the
        // assembler: unlike a named label, it leaves no symbol behind in
        // the object file, and it can't be mistaken for the `loop'
        // instruction.
0:

        // Apply a column quarterround to each of the columns simultaneously.
        // Alas, there doesn't seem to be a packed doubleword rotate, so we
        // have to synthesize it: x <<< n = (x << n) | (x >> (32 - n)),
        // using xmm4 and xmm5 as temporaries.

        // b ^= (a + d) <<<  7
        movdqa  xmm4, xmm0
        paddd   xmm4, xmm3
        movdqa  xmm5, xmm4
        pslld   xmm4, 7
        psrld   xmm5, 25
        por     xmm4, xmm5
        pxor    xmm1, xmm4

        // c ^= (b + a) <<<  9
        movdqa  xmm4, xmm1
        paddd   xmm4, xmm0
        movdqa  xmm5, xmm4
        pslld   xmm4, 9
        psrld   xmm5, 23
        por     xmm4, xmm5
        pxor    xmm2, xmm4

        // d ^= (c + b) <<< 13
        movdqa  xmm4, xmm2
        paddd   xmm4, xmm1
        pshufd  xmm1, xmm1, 0x93        // b is finished with: transpose it
        movdqa  xmm5, xmm4
        pslld   xmm4, 13
        psrld   xmm5, 19
        por     xmm4, xmm5
        pxor    xmm3, xmm4

        // a ^= (d + c) <<< 18
        movdqa  xmm4, xmm3
        pshufd  xmm3, xmm3, 0x39        // d has been copied: transpose it
        paddd   xmm4, xmm2
        pshufd  xmm2, xmm2, 0x4e        // c has been added in: transpose it
        movdqa  xmm5, xmm4
        pslld   xmm4, 18
        psrld   xmm5, 14
        por     xmm4, xmm5
        pxor    xmm0, xmm4

        // The transpose conveniently only involves reordering elements of
        // individual rows, which can be done quite easily, and reordering
        // the rows themselves, which is a trivial renaming.  It doesn't
        // involve any movement of elements between rows.
        //
        //      [ 0  5 10 15]       [ 0  5 10 15]       (a, xmm0)
        //      [ 4  9 14  3]  -->  [ 1  6 11 12]       (b, xmm3)
        //      [ 8 13  2  7]       [ 2  7  8 13]       (c, xmm2)
        //      [12  1  6 11]       [ 3  4  9 14]       (d, xmm1)
        //
        // As `pshufd' immediates: 0x93 moves every element up one lane
        // (lane i takes old lane i - 1), 0x39 moves every element down one
        // lane, and 0x4e swaps the two halves.
        //
        // The shuffles have quite high latency, so they've been pushed
        // backwards into the main instruction list.

        // Apply the row quarterround to each of the columns (yes!)
        // simultaneously.  Note that b and d have traded registers.

        // b ^= (a + d) <<<  7
        movdqa  xmm4, xmm0
        paddd   xmm4, xmm1
        movdqa  xmm5, xmm4
        pslld   xmm4, 7
        psrld   xmm5, 25
        por     xmm4, xmm5
        pxor    xmm3, xmm4

        // c ^= (b + a) <<<  9
        movdqa  xmm4, xmm3
        paddd   xmm4, xmm0
        movdqa  xmm5, xmm4
        pslld   xmm4, 9
        psrld   xmm5, 23
        por     xmm4, xmm5
        pxor    xmm2, xmm4

        // d ^= (c + b) <<< 13
        movdqa  xmm4, xmm2
        paddd   xmm4, xmm3
        pshufd  xmm3, xmm3, 0x93        // b is finished with: put it back
        movdqa  xmm5, xmm4
        pslld   xmm4, 13
        psrld   xmm5, 19
        por     xmm4, xmm5
        pxor    xmm1, xmm4

        // a ^= (d + c) <<< 18
        movdqa  xmm4, xmm1
        pshufd  xmm1, xmm1, 0x39        // d has been copied: put it back
        paddd   xmm4, xmm2
        pshufd  xmm2, xmm2, 0x4e        // c has been added in: put it back
        movdqa  xmm5, xmm4
        pslld   xmm4, 18
        psrld   xmm5, 14
        por     xmm4, xmm5
        pxor    xmm0, xmm4

        // We have to undo the transpose ready for the next loop.  Again,
        // push back the shuffles because they take a long time coming
        // through.  Decrement the loop counter and see if we should go
        // round again.  Later processors fuse this pair into a single uop.
        //
        // The comparison is unsigned: we stop when the count reaches zero
        // or would go below it.
        sub     ecx, 2
        ja      0b

        // Almost there.  Firstly, the feedforward addition, and then we have
        // to write out the result.  Here we have to undo the permutation
        // which was already applied to the input.  Shuffling has quite high
        // latency, so arrange to start a new shuffle into a temporary as
        // soon as we've written out the old value.
        //
        // Each `movd' stores only the low lane of its source register, so
        // every word is first shuffled down into lane zero.  Its
        // destination offset is four times its textbook index.
        mov     edx, [ebp + 16]         // edx = output matrix

        paddd   xmm0, [esp +  0]        // a = [ 0  5 10 15]
        pshufd  xmm4, xmm0, 0x39        // word 5 to the front
        movd    [edx +  0], xmm0

        paddd   xmm1, [esp + 16]        // b = [ 4  9 14  3]
        pshufd  xmm5, xmm1, 0x93        // word 3 to the front
        movd    [edx + 16], xmm1

        paddd   xmm2, xmm6              // c = [ 8 13  2  7]
        pshufd  xmm6, xmm2, 0x4e        // word 2 to the front
        movd    [edx + 32], xmm2

        paddd   xmm3, xmm7              // d = [12  1  6 11]
        pshufd  xmm7, xmm3, 0x39        // word 1 to the front
        movd    [edx + 48], xmm3

        // Remaining words of d: 1, 6, 11.
        movd    [edx +  4], xmm7
        pshufd  xmm7, xmm3, 0x4e
        movd    [edx + 24], xmm7
        pshufd  xmm3, xmm3, 0x93
        movd    [edx + 44], xmm3

        // Remaining words of c: 2, 7, 13.
        movd    [edx +  8], xmm6
        pshufd  xmm6, xmm2, 0x93
        movd    [edx + 28], xmm6
        pshufd  xmm2, xmm2, 0x39
        movd    [edx + 52], xmm2

        // Remaining words of b: 3, 9, 14.
        movd    [edx + 12], xmm5
        pshufd  xmm5, xmm1, 0x39
        movd    [edx + 36], xmm5
        pshufd  xmm1, xmm1, 0x4e
        movd    [edx + 56], xmm1

        // Remaining words of a: 5, 10, 15.
        movd    [edx + 20], xmm4
        pshufd  xmm4, xmm0, 0x4e
        movd    [edx + 40], xmm4
        pshufd  xmm0, xmm0, 0x93
        movd    [edx + 60], xmm0

        // Tidy things up.  Restoring esp from ebp also discards the
        // alignment adjustment made on entry.
        mov     esp, ebp
        pop     ebp

        // And with that, we're done.
        ret

ENDFUNC
253
254///----- That's all, folks --------------------------------------------------