+// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate,
+// suitable for use in `pshufd' or `shufps', which copies element D
+// (0 <= D < 4) of the source to element 3 of the destination, element C to
+// element 2, element B to element 1, and element A to element 0.
+#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+
+#endif
+
+#if CPUFAM_X86
+
+.macro _reg.0
+ // Stash the flags and the GP registers EAX, ECX, EDX and EBP, then
+ // establish a temporary stack frame. Afterwards EBP points at the
+ // saved EBP, with the saved EDX, ECX, EAX and EFLAGS above it at
+ // [ebp + 4] through [ebp + 16]; the interrupted code's ESP is therefore
+ // EBP + 20. Everything done here is undone by `_reg.3'.
+ pushfd
+ push eax
+ push ecx
+ push edx
+ push ebp
+ mov ebp, esp
+
+ // The print macros go on to call `printf', and the calling convention
+ // expects the direction flag to be clear on function entry. The
+ // interrupted code's flags were saved by `pushfd' above and come back
+ // with the `popfd' in `_reg.3', so clearing DF here is invisible to it.
+ cld
+
+ // Carve out a 16-byte-aligned, 512-byte area below the frame and dump
+ // the x87/MMX/SSE state into it. ESP is left pointing at that area.
+ and esp, ~15
+ sub esp, 512
+ fxsave [esp]
+.endm
+
+.macro _reg.1 // Empty here: a no-op step the print macros invoke between `_reg.0' and `_reg.2'.
+.endm
+
+.macro _reg.2 // Empty here: a no-op step the print macros invoke after `_reg.1', before `_reg.3'.
+.endm
+
+.macro _reg.3 fmt // Final step: call printf with FMT, then undo everything `_reg.0' did.
+ // Print FMT and the other established arguments.
+ lea eax, .L$_reg$msg.\@ // address of the format string emitted inline below
+ push eax // format pointer goes on top of the arguments already pushed
+ call printf // NOTE(review): esp is not re-aligned to 16 here (e.g. `msg' calls with esp = 16n - 4) -- confirm this is acceptable
+ jmp .L$_reg$cont.\@ // hop over the inline string data
+.L$_reg$msg.\@:
+ .ascii ";; \fmt\n\0" // each output line starts with two semicolons and a space
+.L$_reg$cont.\@:
+ mov eax, ebp // recompute the save-area address exactly as `_reg.0' derived it:
+ and eax, ~15 // round the frame pointer down to 16 bytes ...
+ sub eax, 512 // ... and step down over the 512-byte area
+ fxrstor [eax] // reload the x87/MMX/SSE state saved by `fxsave'
+ mov esp, ebp // drop the save area and every pushed printf argument
+ pop ebp // pops mirror the pushes in `_reg.0', in reverse order
+ pop edx
+ pop ecx
+ pop eax
+ popfd // finally restore the interrupted code's flags
+.endm
+
+.macro msg msg // Print the text MSG on a line of its own; no value is pushed for it.
+ _reg.0 // save flags, GP registers and FP/SIMD state
+ _reg.1 // empty in this section
+ _reg.2 // empty in this section
+ _reg.3 "\msg" // MSG is used as the printf format; also restores the saved state
+.endm
+
+.macro reg r, msg
+ // Print MSG followed by the value of general-purpose register R as
+ // eight hex digits. R is matched textually against the lower-case
+ // names `esp' and `ebp', which need special handling because `_reg.0'
+ // repurposes them; the value is pushed as the single printf argument.
+ _reg.0
+ .ifeqs "\r", "esp"
+ // `_reg.0' pushed five dwords before copying ESP into EBP, so the
+ // interrupted code's stack pointer is EBP + 20.
+ lea eax, [ebp + 20]
+ push eax
+ .else
+ .ifeqs "\r", "ebp"
+ // EBP is the frame pointer now; the interrupted code's value was
+ // saved at [ebp]. The operand size is spelled out because a bare
+ // memory operand does not carry one.
+ push dword ptr [ebp]
+ .else
+ // `_reg.0' only pushes the other registers, so R still holds the
+ // interrupted code's value.
+ push \r
+ .endif
+ .endif
+ _reg.1
+ _reg.2
+ _reg.3 "\msg: \r = %08x"
+.endm
+
+.macro xmmreg r, msg
+ // Print MSG followed by the four 32-bit elements of XMM register R in
+ // hex, highest-numbered element first.
+ _reg.0
+ _reg.1
+ _reg.2
+
+ // Reserve the 16-byte argument slot up front. `_reg.0' leaves ESP
+ // 16-byte aligned, so the slot is suitably aligned for `movdqa'.
+ sub esp, 16
+
+ // Work on a copy in XMM0; the `fxrstor' in `_reg.3' puts XMM0 back.
+ // The immediate 0x1b reverses the element order, so that element 3
+ // ends up at the lowest address and becomes the first value printed.
+ movdqu xmm0, \r
+ pshufd xmm0, xmm0, 0x1b
+ movdqa [esp], xmm0
+ _reg.3 "\msg: \r = %08x %08x %08x %08x"
+.endm
+
+.macro mmreg r, msg
+ // Print MSG followed by the two 32-bit halves of MMX register R in
+ // hex, high half first.
+ _reg.0
+ _reg.1
+ _reg.2
+
+ // Make room for the 8-byte value before touching R.
+ sub esp, 8
+
+ // The immediate 0x4e exchanges the two 32-bit halves, so the high half
+ // is stored at the lower address and becomes the first value printed.
+ // This shuffles R in place; the `fxrstor' in `_reg.3' reloads the state
+ // captured by `_reg.0' afterwards.
+ pshufw \r, \r, 0x4e
+ movq [esp], \r
+ _reg.3 "\msg: \r = %08x %08x"
+.endm
+
+.macro freg i, msg
+ // Print MSG followed by x87 register st(I), formatted in decimal with
+ // up to 20 significant digits.
+ _reg.0
+ _reg.1
+ _reg.2
+
+ // Reserve the 12-byte argument slot first. ESP now sits 12 bytes
+ // below the `fxsave' image that `_reg.0' wrote.
+ sub esp, 12
+
+ // Reset the FPU, then fetch st(I) from the saved image rather than
+ // from the live register stack: the image is indexed from offset 32 in
+ // 16-byte steps, and the extra 12 skips the slot just reserved.
+ finit
+ fldt [esp + 12 + 32 + 16*\i]
+ fstpt [esp]
+ _reg.3 "\msg: st(\i) = %.20Lg"
+.endm
+
+.macro fxreg i, msg
+ // Print MSG followed by x87 register st(I), formatted as a hexadecimal
+ // floating-point value (`%La').
+ _reg.0
+ _reg.1
+ _reg.2
+
+ // Reserve the 12-byte argument slot first. ESP now sits 12 bytes
+ // below the `fxsave' image that `_reg.0' wrote.
+ sub esp, 12
+
+ // Reset the FPU, then fetch st(I) from the saved image rather than
+ // from the live register stack: the image is indexed from offset 32 in
+ // 16-byte steps, and the extra 12 skips the slot just reserved.
+ finit
+ fldt [esp + 12 + 32 + 16*\i]
+ fstpt [esp]
+ _reg.3 "\msg: st(\i) = %La"
+.endm
+