symm/: New SSE2 implementations of Salsa20 and ChaCha.
symm/chacha-x86-sse2.s
### -*- mode: asm; asm-comment-char: ?# -*-
###
### Fancy SIMD implementation of ChaCha
###
### (c) 2015 Straylight/Edgeware
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of Catacomb.
###
### Catacomb is free software; you can redistribute it and/or modify
### it under the terms of the GNU Library General Public License as
### published by the Free Software Foundation; either version 2 of the
### License, or (at your option) any later version.
###
### Catacomb is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU Library General Public License for more details.
###
### You should have received a copy of the GNU Library General Public
### License along with Catacomb; if not, write to the Free
### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
### MA 02111-1307, USA.
	.intel_syntax noprefix
	.arch pentium4

	.section .text

	.globl	chacha_core_x86_sse2
	.type	chacha_core_x86_sse2, STT_FUNC
chacha_core_x86_sse2:

	## Initial state. We have three arguments:
	##	[ebp + 8] is the number of rounds to do
	##	[ebp + 12] points to the input matrix
	##	[ebp + 16] points to the output matrix
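	##
	## In C terms this is a cdecl function along the lines of the
	## following (a sketch only; the exact declaration in Catacomb's
	## headers may differ):
	##
	##	void chacha_core_x86_sse2(unsigned nrounds,
	##				  const uint32_t src[16],
	##				  uint32_t dst[16]);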
	push	ebp
	mov	ebp, esp
	sub	esp, 16
	mov	edx, [ebp + 12]
	and	esp, ~15
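	## The subtraction reserves a 16-byte save slot; masking esp with
	## ~15 then rounds it down to a 16-byte boundary, so the slot at
	## [esp] is aligned well enough for `movdqa'.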

	## First job is to slurp the matrix into XMM registers. Be careful:
	## the input matrix isn't likely to be properly aligned.
	##
	##	[ 0  1  2  3]		(a, xmm0)
	##	[ 4  5  6  7]		(b, xmm1)
	##	[ 8  9 10 11]		(c, xmm2)
	##	[12 13 14 15]		(d, xmm3)
	movdqu	xmm0, [edx +  0]
	movdqu	xmm1, [edx + 16]
	movdqu	xmm2, [edx + 32]
	movdqu	xmm3, [edx + 48]

	## Prepare for the main loop.
	mov	ecx, [ebp + 8]

	## Take a copy for later. This one is aligned properly, by
	## construction.
	movdqa	[esp], xmm0
	movdqa	xmm5, xmm1
	movdqa	xmm6, xmm2
	movdqa	xmm7, xmm3
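	## (32-bit code has only eight XMM registers: xmm0--xmm3 hold the
	## state and xmm4 is needed as scratch, so only three of the four
	## saved rows fit in registers; the copy of row a lives in the
	## aligned stack slot instead.)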

0:
	## Apply a column quarterround to each of the columns simultaneously.
	## Alas, there doesn't seem to be a packed doubleword rotate, so we
	## have to synthesize it.

	## a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4
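	## The shift/shift/OR trio above synthesizes the rotation:
	## (x << 16) | (x >> 16) is exactly x <<< 16 in each 32-bit lane.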

	## c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	## a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	## c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, 0x93
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, 0x4e
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	## The not-quite-transpose conveniently only involves reordering
	## elements of individual rows, which can be done quite easily. It
	## doesn't involve any movement of elements between rows, or even
	## renaming of the rows.
	##
	##	[ 0  1  2  3]		[ 0  1  2  3]	(a, xmm0)
	##	[ 4  5  6  7]	-->	[ 5  6  7  4]	(b, xmm1)
	##	[ 8  9 10 11]		[10 11  8  9]	(c, xmm2)
	##	[12 13 14 15]		[15 12 13 14]	(d, xmm3)
	##
	## The shuffles have quite high latency, so they've mostly been
	## pushed upwards. The remaining one can't be moved, though.
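	## (pshufd's immediate selects source elements by two-bit indices,
	## low element first: 0x39 = 0b00111001 picks elements 1, 2, 3, 0;
	## 0x4e picks 2, 3, 0, 1; and 0x93 picks 3, 0, 1, 2.)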
	pshufd	xmm1, xmm1, 0x39

	## Apply the diagonal quarterround to each of the diagonals
	## simultaneously.

	## a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	## c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	## a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	## c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, 0x39
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, 0x4e
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	## Finally, finish off undoing the transpose, and we're done for this
	## doubleround. Again, most of this was done above so we don't have
	## to wait for the shuffles.
	pshufd	xmm1, xmm1, 0x93

	## Decrement the loop counter and see if we should go round again.
	sub	ecx, 2
	ja	0b
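	## (One trip round the loop is a doubleround: a column round and a
	## diagonal round. Hence the counter drops by two each time, and
	## `ja' loops while it's still strictly positive without wrapping.)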

	## Almost there. Firstly, the feedforward addition.
	mov	edx, [ebp + 16]
	paddd	xmm0, [esp]
	paddd	xmm1, xmm5
	paddd	xmm2, xmm6
	paddd	xmm3, xmm7
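	## (The feedforward is what makes the core one-way: the rounds by
	## themselves are invertible, so without this addition the input
	## matrix could be recovered from the output.)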

	## And now we write out the result. This one won't be aligned
	## either.
	movdqu	[edx +  0], xmm0
	movdqu	[edx + 16], xmm1
	movdqu	[edx + 32], xmm2
	movdqu	[edx + 48], xmm3

	## And with that, we're done.
	mov	esp, ebp
	pop	ebp
	ret

	.size	chacha_core_x86_sse2, . - chacha_core_x86_sse2

###----- That's all, folks --------------------------------------------------