1 ### -*- mode: asm; asm-comment-char: ?# -*-
3 ### Fancy SIMD implementation of ChaCha
5 ### (c) 2015 Straylight/Edgeware
8 ###----- Licensing notice ---------------------------------------------------
10 ### This file is part of Catacomb.
12 ### Catacomb is free software; you can redistribute it and/or modify
13 ### it under the terms of the GNU Library General Public License as
14 ### published by the Free Software Foundation; either version 2 of the
15 ### License, or (at your option) any later version.
17 ### Catacomb is distributed in the hope that it will be useful,
18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ### GNU Library General Public License for more details.
22 ### You should have received a copy of the GNU Library General Public
23 ### License along with Catacomb; if not, write to the Free
24 ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 ### MA 02111-1307, USA.
27 .intel_syntax noprefix
32 .globl chacha_core_x86_sse2
33 .type chacha_core_x86_sse2, STT_FUNC
36 ## Initial state. We have three arguments:
37 ## [ebp + 8] is the number of rounds to do
38 ## [ebp + 12] points to the input matrix
39 ## [ebp + 16] points to the output matrix
46 ## First job is to slurp the matrix into XMM registers. Be careful:
47 ## the input matrix isn't likely to be properly aligned.
49 ## [ 0 1 2 3] (a, xmm0)
50 ## [ 4 5 6 7] (b, xmm1)
51 ## [ 8 9 10 11] (c, xmm2)
52 ## [12 13 14 15] (d, xmm3)
## NOTE(review): edx presumably holds the input-matrix pointer loaded from
## [ebp + 12]; that load is not visible in this excerpt -- confirm against
## the full file.
53 movdqu xmm0, [edx + 0]
54 movdqu xmm1, [edx + 16]
55 movdqu xmm2, [edx + 32]
56 movdqu xmm3, [edx + 48]
58 ## Prepare for the main loop.
61 ## Take a copy for later. This one is aligned properly, by
69 ## Apply a column quarterround to each of the columns simultaneously.
70 ## Alas, there doesn't seem to be a packed doubleword rotate, so we
71 ## have to synthesize it.
73 ## a += b; d ^= a; d <<<= 16
81 ## c += d; b ^= c; b <<<= 12
89 ## a += b; d ^= a; d <<<= 8
97 ## c += d; b ^= c; b <<<= 7
## These two shuffles begin the not-quite-transpose described below --
## d becomes [15 12 13 14] and c becomes [10 11 8 9] -- hoisted up here
## to hide the shuffle latency (see the note before the b shuffle).
99 pshufd xmm3, xmm3, 0x93
101 pshufd xmm2, xmm2, 0x4e
107 ## The not-quite-transpose conveniently only involves reordering
108 ## elements of individual rows, which can be done quite easily. It
109 ## doesn't involve any movement of elements between rows, or even
110 ## renaming of the rows.
112 ## [ 0 1 2 3] [ 0 1 2 3] (a, xmm0)
113 ## [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1)
114 ## [ 8 9 10 11] [10 11 8 9] (c, xmm2)
115 ## [12 13 14 15] [15 12 13 14] (d, xmm3)
117 ## The shuffles have quite high latency, so they've mostly been
118 ## pushed upwards. The remaining one can't be moved, though.
119 pshufd xmm1, xmm1, 0x39
121 ## Apply the diagonal quarterround to each of the columns
124 ## a += b; d ^= a; d <<<= 16
132 ## c += d; b ^= c; b <<<= 12
140 ## a += b; d ^= a; d <<<= 8
148 ## c += d; b ^= c; b <<<= 7
## Begin undoing the transpose: these are the inverse shuffles for d
## (0x39 undoes 0x93) and c (0x4e is its own inverse), again hoisted
## above the final b shuffle to hide their latency.
150 pshufd xmm3, xmm3, 0x39
152 pshufd xmm2, xmm2, 0x4e
158 ## Finally, finish off undoing the transpose, and we're done for this
159 ## doubleround. Again, most of this was done above so we don't have
160 ## to wait for the shuffles.
161 pshufd xmm1, xmm1, 0x93
163 ## Decrement the loop counter and see if we should go round again.
167 ## Almost there. Firstly, the feedforward addition.
174 ## And now we write out the result. This one won't be aligned
## NOTE(review): edx presumably now holds the output-matrix pointer from
## [ebp + 16]; the reload is not visible in this excerpt -- verify.
176 movdqu [edx + 0], xmm0
177 movdqu [edx + 16], xmm1
178 movdqu [edx + 32], xmm2
179 movdqu [edx + 48], xmm3
181 ## And with that, we're done.
186 .size chacha_core_x86_sse2, . - chacha_core_x86_sse2
188 ###----- That's all, folks --------------------------------------------------