symm/rijndael-x86-aseni.S: Unify encryption and decryption with a macro.
[catacomb] / symm / salsa20-x86-sse2.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// Fancy SIMD implementation of Salsa20
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// External definitions.
29
30#include "config.h"
31#include "asm-common.h"
32
33///--------------------------------------------------------------------------
47103664
MW
34/// Local utilities.
35
// Magic constants for shuffling.  Each is an immediate for `pshufd': bits
// 2i+1:2i of the immediate pick the source lane copied into destination
// lane i.  Writing lanes as [0 1 2 3] (lane 0 first), these give:
//
//	ROTL:	[a b c d] -> [d a b c]		(lane 3 moves to lane 0)
//	ROT2:	[a b c d] -> [c d a b]		(swap the two halves)
//	ROTR:	[a b c d] -> [b c d a]		(lane 1 moves to lane 0)
#define ROTL 0x93
#define ROT2 0x4e
#define ROTR 0x39
40
41///--------------------------------------------------------------------------
1a0c09c4
MW
42/// Main code.
43
// Tell the assembler which instruction set to accept (the code below uses
// SSE2 instructions on the XMM registers), and put the code in the text
// section.
	.arch	pentium4
	.section .text
46
/// salsa20_core_x86_sse2
///
/// Run the Salsa20 core over one 16-word matrix using SSE2.
///
/// Arguments (passed on the stack, 32-bit x86):
///	[ebp +  8]	number of rounds to do
///	[ebp + 12]	address of the input matrix: 16 32-bit words, already
///			permuted into the diagonal-first order shown below;
///			read with unaligned loads
///	[ebp + 16]	address of the output matrix: 16 32-bit words, written
///			in the textbook order, one word at a time
///
/// The loop body does one column round followed by one row round and then
/// subtracts 2 from the round count, continuing while the result is still
/// above zero (unsigned); the body therefore always runs at least once.
///
/// Registers written: ecx, edx, xmm0--xmm7, and the flags.  ebp and esp are
/// restored before returning.  Uses 32 bytes of 16-byte-aligned stack as
/// scratch space for a copy of the first two input rows.
FUNC(salsa20_core_x86_sse2)

	// Initial state.  We have three arguments:
	// [ebp +  8] is the number of rounds to do
	// [ebp + 12] points to the input matrix
	// [ebp + 16] points to the output matrix
	//
	// Set up a frame, and carve out 32 bytes of scratch space.  The
	// stack pointer is rounded down to a 16-byte boundary so that the
	// aligned stores (`movdqa') into the scratch space are safe.
	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	edx, [ebp + 12]
	and	esp, ~15

	// Prepare for the main loop: ecx counts the rounds still to do.
	mov	ecx, [ebp + 8]

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15]	(a, xmm0)
	//	[ 4  5  6  7]	-->	[ 4  9 14  3]	(b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7]	(c, xmm2)
	//	[12 13 14 15]		[12  1  6 11]	(d, xmm3)
	//
	// The loads are unaligned, so the input needn't be aligned.
	movdqu	xmm0, [edx +  0]
	movdqu	xmm1, [edx + 16]
	movdqu	xmm2, [edx + 32]
	movdqu	xmm3, [edx + 48]

	// Take a copy for later: rows a and b go to the aligned scratch
	// space on the stack, and rows c and d stay in xmm6 and xmm7.  These
	// are added back in at the end (the feedforward).
	movdqa	[esp +  0], xmm0
	movdqa	[esp + 16], xmm1
	movdqa	xmm6, xmm2
	movdqa	xmm7, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it: copy the value, shift one copy left by n
	// and the other right by 32 - n, and OR the two together.  xmm4 and
	// xmm5 are the temporaries throughout.

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	//
	// This is the last use of b in this round, so start shuffling it
	// into place for the row round as soon as it has been read.
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, ROTL
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	//
	// Similarly, d and c are shuffled as soon as they've been read.
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, ROTR
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, ROT2
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15]	(a, xmm0)
	//	[ 4  9 14  3]	-->	[ 1  6 11 12]	(b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13]	(c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14]	(d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.  Note that b and d have swapped registers: b is
	// now in xmm3 and d is in xmm1.

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, ROTL
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, ROTR
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, ROT2
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We have to undo the transpose ready for the next loop, so that b
	// is back in xmm1 and d in xmm3.  Again, the shuffles were pushed
	// back into the instruction list above because they take a long
	// time coming through.  Decrement the loop counter and see if we
	// should go round again.  Later processors fuse this pair into a
	// single uop.
	sub	ecx, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition, and then we have
	// to write out the result.  Here we have to undo the permutation
	// which was already applied to the input.  Shuffling has quite high
	// latency, so arrange to start a new shuffle into a temporary as
	// soon as we've written out the old value.
	//
	// Only the low word of a register can be stored with `movd', so
	// each row is rotated three times to bring each of its other words
	// down to the bottom in turn.
	mov	edx, [ebp + 16]

	// Add the saved input back into each row, store the row's low word
	// (output words 0, 4, 8, 12), and start the first rotation of each
	// row into a temporary.  xmm6 and xmm7 are free for reuse as soon as
	// their saved rows have been added in.
	paddd	xmm0, [esp +  0]
	pshufd	xmm4, xmm0, ROTR
	movd	[edx +  0], xmm0

	paddd	xmm1, [esp + 16]
	pshufd	xmm5, xmm1, ROTL
	movd	[edx + 16], xmm1

	paddd	xmm2, xmm6
	pshufd	xmm6, xmm2, ROT2
	movd	[edx + 32], xmm2

	paddd	xmm3, xmm7
	pshufd	xmm7, xmm3, ROTR
	movd	[edx + 48], xmm3

	// Row d, [12  1  6 11]: output words 1, 6, 11.
	movd	[edx +  4], xmm7
	pshufd	xmm7, xmm3, ROT2
	movd	[edx + 24], xmm7
	pshufd	xmm3, xmm3, ROTL
	movd	[edx + 44], xmm3

	// Row c, [ 8 13  2  7]: output words 2, 7, 13.
	movd	[edx +  8], xmm6
	pshufd	xmm6, xmm2, ROTL
	movd	[edx + 28], xmm6
	pshufd	xmm2, xmm2, ROTR
	movd	[edx + 52], xmm2

	// Row b, [ 4  9 14  3]: output words 3, 9, 14.
	movd	[edx + 12], xmm5
	pshufd	xmm5, xmm1, ROTR
	movd	[edx + 36], xmm5
	pshufd	xmm1, xmm1, ROT2
	movd	[edx + 56], xmm1

	// Row a, [ 0  5 10 15]: output words 5, 10, 15.
	movd	[edx + 20], xmm4
	pshufd	xmm4, xmm0, ROT2
	movd	[edx + 40], xmm4
	pshufd	xmm0, xmm0, ROTL
	movd	[edx + 60], xmm0

	// Tidy things up.  Restoring esp from ebp discards both the scratch
	// space and the alignment adjustment.
	mov	esp, ebp
	pop	ebp

	// And with that, we're done.
	ret

ENDFUNC
261
262///----- That's all, folks --------------------------------------------------