| 1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
| 2 | /// |
| 3 | /// Common definitions for assembler source files |
| 4 | /// |
| 5 | /// (c) 2015 Straylight/Edgeware |
| 6 | /// |
| 7 | |
| 8 | ///----- Licensing notice --------------------------------------------------- |
| 9 | /// |
| 10 | /// This file is part of Catacomb. |
| 11 | /// |
| 12 | /// Catacomb is free software; you can redistribute it and/or modify |
| 13 | /// it under the terms of the GNU Library General Public License as |
| 14 | /// published by the Free Software Foundation; either version 2 of the |
| 15 | /// License, or (at your option) any later version. |
| 16 | /// |
| 17 | /// Catacomb is distributed in the hope that it will be useful, |
| 18 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | /// GNU Library General Public License for more details. |
| 21 | /// |
| 22 | /// You should have received a copy of the GNU Library General Public |
| 23 | /// License along with Catacomb; if not, write to the Free |
| 24 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
| 25 | /// MA 02111-1307, USA. |
| 26 | |
| 27 | ///-------------------------------------------------------------------------- |
| 28 | /// General definitions. |
| 29 | |
// Subsection bookkeeping.  `.L$_subsec' is the number of the `.text'
// subsection currently receiving code; it starts at zero and is only ever
// advanced in steps of two (see `_LTORG').
	.L$_subsec = 0

// Literal pools done the hard way.
//
//   _LIT	switch to the subsection one above the current code
//		subsection, which is where its literals are collected
//   _ENDLIT	switch back to the current code subsection
//   _LTORG	move on to a fresh pair of subsections, so that the literals
//		gathered so far end up ahead of any code that follows
#define _LIT .text .L$_subsec + 1
#define _ENDLIT .text .L$_subsec
#define _LTORG .L$_subsec = .L$_subsec + 2; .text .L$_subsec
| 37 | |
// Announcing an external function.
//
// FUNC(name) exports the symbol F(name), applies TYPE_FUNC to it, and
// defines a one-shot `ENDFUNC' macro which closes this particular function.
// The label itself is emitted between FUNC_PREHOOK and FUNC_POSTHOOK.
#define FUNC(name)							\
	.globl	F(name);						\
	TYPE_FUNC(name);						\
	.macro	ENDFUNC; _ENDFUNC(name); .endm;				\
	FUNC_PREHOOK(name);						\
F(name):								\
	FUNC_POSTHOOK(name)
| 46 | |
// Marking the end of a function.
//
// This is the body of the `ENDFUNC' macro set up by FUNC: it retires that
// macro (so a stray second `ENDFUNC' is an error), applies SIZE_OBJ, runs
// the end-of-function hook, and starts a new literal-pool subsection pair.
#define _ENDFUNC(name)							\
	.purgem	ENDFUNC;						\
	SIZE_OBJ(name);							\
	ENDFUNC_HOOK(name);						\
	_LTORG
| 53 | |
// Make a helper function, if necessary.
//
// Usage is `AUXFN(name)' ... body ... `ENDAUXFN'.  The whole body is
// wrapped in `.ifndef .L$_auxfn_def.NAME', and that symbol is set once the
// body has been assembled, so a second use with the same name assembles
// nothing.  The helper goes in `.text' subsection 7128; ENDAUXFN switches
// back to the current code subsection.
#define AUXFN(name)							\
	.ifndef	.L$_auxfn_def.name;					\
	.text	7128;							\
	.macro	_ENDAUXFN; _ENDAUXFN_TAIL(name); .endm;			\
	FUNC_PREHOOK(name);						\
name:
#define _ENDAUXFN_TAIL(name)						\
	.purgem	_ENDAUXFN;						\
	.text	.L$_subsec;						\
	.L$_auxfn_def.name = 1
#define ENDAUXFN _ENDAUXFN; .endif
| 66 | |
| 67 | ///-------------------------------------------------------------------------- |
| 68 | /// ELF-specific hacking. |
| 69 | |
#if __ELF__

// Position-independent builds set WANT_PIC, which the `callext', `ldgot',
// `leaext' and INTADDR definitions test to choose PLT/GOT-based forms.
#if __PIC__ || __PIE__
#  define WANT_PIC 1
#endif

// Symbol annotations: mark NAME as a function, and record its size as the
// distance from its label to the current location.
#define TYPE_FUNC(name) .type name, STT_FUNC
#define SIZE_OBJ(name) .size name, . - name

#endif
| 81 | |
| 82 | ///-------------------------------------------------------------------------- |
| 83 | /// Windows-specific hacking. |
| 84 | |
// On 32-bit x86 Windows, F(name) prefixes the symbol with an underscore.
// Everywhere else, F is left undefined here and receives its default
// definition at the end of the file.
#if ABI_WIN && CPUFAM_X86
#  define F(name) _##name
#endif
| 92 | |
| 93 | ///-------------------------------------------------------------------------- |
| 94 | /// x86- and amd64-specific hacking. |
| 95 | /// |
| 96 | /// It's (slightly) easier to deal with both of these in one go. |
| 97 | |
| 98 | #if CPUFAM_X86 || CPUFAM_AMD64 |
| 99 | |
// Set the function hooks.  Every function (and auxiliary function) starts
// on a 16-byte boundary.
#define FUNC_PREHOOK(_) .balign 16

// On 64-bit Windows, bracket each function with the directives which
// generate stack-unwinding data.
#if CPUFAM_AMD64 && ABI_WIN
#  define FUNC_POSTHOOK(name) .seh_proc name
#  define ENDFUNC_HOOK(_) .seh_endproc
   // Procedures are expected to invoke `.seh_setframe' if necessary, and
   // `.seh_pushreg' and friends, and `.seh_endprologue'.
#endif

// Everything from here on is written in Intel syntax: destination operand
// first, and registers named without a `%' prefix.
	.intel_syntax noprefix
| 114 | |
// Call external subroutine at ADDR, possibly via PLT.
//
// Without WANT_PIC this is a plain `call'; with it, the call is routed
// through the symbol's PLT entry.
	.macro	callext addr
#if !WANT_PIC
	call	\addr
#else
	call	\addr@PLT
#endif
	.endm
| 123 | |
// Do I need to arrange a spare GOT register?  Only 32-bit PIC code does:
// NEED_GOT is set in exactly the case where `ldgot' emits any code.
#if CPUFAM_X86 && WANT_PIC
#  define NEED_GOT 1
#endif

// Default register for holding the GOT address.
#define GOTREG ebx			// Not needed in AMD64 so don't care.
| 129 | |
// Maybe load GOT address into GOT.
//
// Only 32-bit PIC builds emit anything.  There, a tiny helper
// `_ldgot.GOT' (assembled at most once per register, courtesy of AUXFN)
// copies its own return address from the top of the stack into GOT; the
// `add' which follows the call then turns that into the address of the
// global offset table.  In every other configuration the macro is empty.
	.macro	ldgot got=GOTREG
#if WANT_PIC && CPUFAM_X86
	AUXFN(_ldgot.\got)
	mov	\got, [esp]		// return address, i.e., the `add' below
	ret
	ENDAUXFN
	call	_ldgot.\got
	add	\got, offset _GLOBAL_OFFSET_TABLE_
#endif
	.endm
| 141 | |
// Load address of external symbol ADDR into REG, maybe using GOT.
//
// One form per combination of PIC-ness and CPU family:
//
//   PIC, x86		fetch the symbol's GOT slot, indexed from GOT
//   PIC, amd64		fetch the symbol's GOT slot, addressed from `rip'
//   non-PIC, x86	load the absolute address as an immediate
//   non-PIC, amd64	form the address relative to `rip'
//
// GOT is only used in the first case.
	.macro	leaext reg, addr, got=GOTREG
#if WANT_PIC && CPUFAM_X86
	mov	\reg, [\got + \addr@GOT]
#endif
#if WANT_PIC && CPUFAM_AMD64
	mov	\reg, \addr@GOTPCREL[rip]
#endif
#if !WANT_PIC && CPUFAM_X86
	mov	\reg, offset \addr
#endif
#if !WANT_PIC && CPUFAM_AMD64
	lea	\reg, \addr[rip]
#endif
	.endm
| 160 | |
// Address expression (possibly using a base register, and a displacement)
// referring to ADDR, which is within our module, maybe using GOT.
//
// Call as INTADDR(addr) or INTADDR(addr, got).  The outer macro appends
// GOTREG and a dummy argument, so that INTADDR__0 always finds a second
// argument: the caller's register if one was given, and GOTREG otherwise.
#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
#define INTADDR__0(sym, base, ...) INTADDR__1(sym, base)
#if CPUFAM_AMD64
#  define INTADDR__1(sym, base) sym + rip
#elif WANT_PIC
#  define INTADDR__1(sym, base) base + sym@GOTOFF
#else
#  define INTADDR__1(sym, base) sym
#endif
| 172 | |
// Permutations for SIMD instructions.  SHUF(D, C, B, A) is an immediate,
// suitable for use in `pshufd' or `shufps', built from four two-bit
// selectors (each 0 <= x < 4): A picks the source element copied to element
// 0 of the destination, B the one copied to element 1, C element 2, and D
// element 3.
#define SHUF(d, c, b, a) ((a) + 4*(b) + 16*(c) + 64*(d))
| 178 | |
| 179 | #endif |
| 180 | |
| 181 | #if CPUFAM_X86 |
| 182 | |
// Debugging output, stage 0: save the machine state.
//
// Pushes the flags and eax, ecx, edx, ebp (20 bytes in all), leaves ebp
// pointing at the saved ebp, and then carves a 16-byte-aligned, 512-byte
// area out of the stack and dumps the FPU/SIMD state into it with `fxsave'.
// On exit esp addresses that area; `_reg.3' undoes all of this.
	.macro	_reg.0
	pushfd
	push	eax
	push	ecx
	push	edx
	push	ebp
	mov	ebp, esp
	and	esp, -16
	sub	esp, 512
	fxsave	[esp]
	.endm

// Stages 1 and 2 are currently empty.
	.macro	_reg.1
	.endm

	.macro	_reg.2
	.endm
| 201 | |
// Debugging output, stage 3: print, then restore the machine state.
//
// Pushes the address of an inline format string (FMT wrapped in `;; ' and a
// newline) on top of whatever arguments the caller has already pushed, and
// calls `printf'.  Afterwards, the `fxsave' area is located again from ebp
// exactly as `_reg.0' computed it, the FPU/SIMD state is reloaded, and the
// stack pointer, GP registers and flags are put back.
	.macro	_reg.3 fmt
	lea	eax, .L$_reg$msg.\@
	push	eax
	call	printf
	jmp	.L$_reg$cont.\@		// hop over the inline string
.L$_reg$msg.\@:
	.ascii	";; \fmt\n\0"
.L$_reg$cont.\@:
	mov	eax, ebp
	and	eax, ~15
	sub	eax, 512
	fxrstor	[eax]
	mov	esp, ebp		// discards printf's arguments too
	pop	ebp
	pop	edx
	pop	ecx
	pop	eax
	popfd
	.endm
| 222 | |
// Print the fixed message MSG, leaving registers and flags as they were.
	.macro	msg msg
	_reg.0
	_reg.1
	_reg.2
	_reg.3	"\msg"
	.endm
| 229 | |
// Print `MSG: R = <hex>' for the general-purpose register R.
//
// By the time the value is pushed, `_reg.0' has already changed esp and
// ebp, so those two are recovered from the frame instead: the caller's ebp
// is the word at [ebp], and the caller's esp is ebp + 20, just above the
// five words which `_reg.0' pushed.
	.macro	reg r, msg
	_reg.0
	.ifeqs	"\r", "esp"
	lea	eax, [ebp + 20]
	push	eax
	.else
	.ifnes	"\r", "ebp"
	push	\r
	.else
	push	[ebp]
	.endif
	.endif
	_reg.1
	_reg.2
	_reg.3	"\msg: \r = %08x"
	.endm
| 246 | |
// Print `MSG: R = ...' for the SSE register R, as four 32-bit hex words.
//
// The elements are reversed (SHUF(0, 1, 2, 3) is 0x1b) before being stored
// as the four printf arguments, so element 3 is printed first.  xmm0 is
// used as scratch; `_reg.3' restores it along with everything else.
	.macro	xmmreg r, msg
	_reg.0
	_reg.1
	_reg.2
	movdqu	xmm0, \r
	pshufd	xmm0, xmm0, SHUF(0, 1, 2, 3)
	sub	esp, 16
	movdqa	[esp], xmm0
	_reg.3	"\msg: \r = %08x %08x %08x %08x"
	.endm
| 257 | |
// Print `MSG: R = ...' for the MMX register R, as two 32-bit hex words.
//
// The two halves are exchanged (SHUF(1, 0, 3, 2) is 0x4e) before being
// stored as the printf arguments, so the upper half is printed first.  R is
// modified in place; `_reg.3' restores it from the `fxsave' image.
	.macro	mmreg r, msg
	_reg.0
	_reg.1
	_reg.2
	pshufw	\r, \r, SHUF(1, 0, 3, 2)
	sub	esp, 8
	movq	[esp], \r
	_reg.3	"\msg: \r = %08x %08x"
	.endm
| 267 | |
// Print `MSG: st(I) = ...' for x87 stack register I, in decimal.
//
// The value is taken from the `fxsave' image which `_reg.0' left at esp,
// at offset 32 + 16*I.  It is loaded into a freshly initialized FPU and
// stored back as a 12-byte `long double' printf argument.
	.macro	freg i, msg
	_reg.0
	_reg.1
	_reg.2
	finit
	fldt	[esp + 32 + 16*\i]
	sub	esp, 12
	fstpt	[esp]
	_reg.3	"\msg: st(\i) = %.20Lg"
	.endm
| 278 | |
// Print `MSG: st(I) = ...' for x87 stack register I, in hexadecimal
// floating-point notation (`%La').
//
// The value is taken from the `fxsave' image which `_reg.0' left at esp,
// at offset 32 + 16*I.  It is loaded into a freshly initialized FPU and
// stored back as a 12-byte `long double' printf argument.
	.macro	fxreg i, msg
	_reg.0
	_reg.1
	_reg.2
	finit
	fldt	[esp + 32 + 16*\i]
	sub	esp, 12
	fstpt	[esp]
	_reg.3	"\msg: st(\i) = %La"
	.endm
| 289 | |
| 290 | #endif |
| 291 | |
| 292 | ///-------------------------------------------------------------------------- |
| 293 | /// ARM-specific hacking. |
| 294 | |
| 295 | #if CPUFAM_ARMEL |
| 296 | |
// ARM/Thumb mode things.  Use ARM by default.
//
// Besides selecting the instruction set, each of these records in
// `.L$_pcoff' the offset which the pc-relative literals below subtract when
// correcting for the value of `pc': 8 in ARM state and 4 in Thumb state.
#define ARM .arm; .L$_pcoff = 8
#define THUMB .thumb; .L$_pcoff = 4
	ARM

// Set the function hooks: functions start on a 4-byte boundary, and the
// assembler's literal pool is dumped at the end of each one.
#define FUNC_PREHOOK(_) .balign 4
#define ENDFUNC_HOOK(name) .ltorg
| 305 | |
// Call external subroutine at ADDR, possibly via PLT.
//
// COND, if given, is a condition-code suffix applied to the `bl'.
	.macro	callext addr, cond=
#if !WANT_PIC
	bl\cond	\addr
#else
	bl\cond	\addr(PLT)
#endif
	.endm
| 314 | |
// Do I need to arrange a spare GOT register?  On ARM, any PIC build does.
#if WANT_PIC
#  define NEED_GOT 1
#endif

// Default register for holding the GOT address.
#define GOTREG r9
| 320 | |
// Maybe load GOT address into GOT.
//
// In PIC builds, a literal holding the displacement from the `add'
// instruction's view of `pc' to the global offset table is loaded into GOT,
// and `pc' is then added to it.  The literal is placed in the literal
// subsection.  COND, if given, is applied to both instructions.  Non-PIC
// builds emit nothing.
	.macro	ldgot cond=, got=GOTREG
#if WANT_PIC
	ldr\cond \got, .L$_ldgot$\@
.L$_ldgot_pc$\@:
	add\cond \got, pc, \got
	_LIT
	.balign	4
.L$_ldgot$\@:
	.word	_GLOBAL_OFFSET_TABLE_ - .L$_ldgot_pc$\@ - .L$_pcoff
	_ENDLIT
#endif
	.endm
| 334 | |
// Load address of external symbol ADDR into REG, maybe using GOT.
//
// In PIC builds, the symbol's GOT offset is fetched from a literal and then
// used to index from GOT, which must already hold the GOT address.  Non-PIC
// builds simply load the absolute address via the assembler's `ldr =...'
// pseudo-instruction.  COND, if given, is applied to every instruction.
	.macro	leaext reg, addr, cond=, got=GOTREG
#if WANT_PIC
	ldr\cond \reg, .L$_leaext$\@
	ldr\cond \reg, [\got, \reg]
	_LIT
	.balign	4
.L$_leaext$\@:
	.word	\addr(GOT)
	_ENDLIT
#else
	ldr\cond \reg, =\addr
#endif
	.endm
| 349 | |
// Load address of external symbol ADDR into REG directly.
//
// Unlike `leaext', no GOT register is involved.  In PIC builds, a literal
// holding the symbol's GOT_PREL offset (adjusted for the position of the
// instruction which reads `pc') is loaded into REG and then combined with
// `pc' to fetch the address.  When `.L$_pcoff' is 8 (ARM state) that is a
// single pc-indexed load; otherwise it is split into an `add' and a load.
// Non-PIC builds load the absolute address.  COND, if given, is applied to
// every instruction.
	.macro	leaextq reg, addr, cond=
#if WANT_PIC
	ldr\cond \reg, .L$_leaextq$\@
.L$_leaextq_pc$\@:
	.if	.L$_pcoff == 8
	ldr\cond \reg, [pc, \reg]
	.else
	add\cond \reg, pc
	ldr\cond \reg, [\reg]
	.endif
	_LIT
	.balign	4
.L$_leaextq$\@:
	.word	\addr(GOT_PREL) + (. - .L$_leaextq_pc$\@ - .L$_pcoff)
	_ENDLIT
#else
	ldr\cond \reg, =\addr
#endif
	.endm
| 370 | |
| 371 | #endif |
| 372 | |
| 373 | ///-------------------------------------------------------------------------- |
| 374 | /// Final stuff. |
| 375 | |
// Fallbacks for anything the platform-specific sections above left
// undefined.

// Symbol naming: by default, F(name) is just NAME.
#ifndef F
#  define F(name) name
#endif

// Symbol annotations: by default, nothing.
#ifndef TYPE_FUNC
#  define TYPE_FUNC(name)
#endif
#ifndef SIZE_OBJ
#  define SIZE_OBJ(name)
#endif

// Function hooks: by default, nothing.
#ifndef FUNC_PREHOOK
#  define FUNC_PREHOOK(name)
#endif
#ifndef FUNC_POSTHOOK
#  define FUNC_POSTHOOK(name)
#endif
#ifndef ENDFUNC_HOOK
#  define ENDFUNC_HOOK(name)
#endif
| 398 | |
// Mark the stack as non-executable unless told otherwise.
//
// An empty `.note.GNU-stack' section whose flags lack `x' is how an ELF
// object declares that it doesn't need an executable stack.  The note must
// therefore be emitted by default, and omitted only when the includer has
// explicitly asked for an executable stack by defining
// WANT_EXECUTABLE_STACK.  (The test used to be the wrong way round, so
// ordinary builds carried no note at all.)

// Section-type syntax, in case nothing earlier has provided it.  ARM
// assemblers treat `@' as a comment character, and spell section types with
// `%' instead.
#ifndef _SECTTY
#  if CPUFAM_ARMEL
#    define _SECTTY(ty) %ty
#  else
#    define _SECTTY(ty) @ty
#  endif
#endif

#if __ELF__ && !defined(WANT_EXECUTABLE_STACK)
	.pushsection .note.GNU-stack, "", _SECTTY(progbits)
	.popsection
#endif
| 403 | |
| 404 | ///----- That's all, folks -------------------------------------------------- |