+ mov NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // This is nice. We have plenty of XMM registers, and the arguments
+ // are in useful places. There's no need to spill anything and we
+ // can just get on with the code.
+ //
+ // Register assignment for this configuration (this section only
+ // defines names; it emits no instructions):
+ //
+ //   NR        edi          first integer argument register
+ //   IN        rsi          second integer argument register
+ //   OUT       rdx          third integer argument register
+ //   SAVE0-3   xmm6-xmm9    the SysV ABI treats every XMM register
+ //                          as caller-saved, so these can be used
+ //                          freely without preserving old contents
+ //
+ // NOTE(review): judging by the names, NR is a round count and IN and
+ // OUT are the input and output pointers -- confirm against the
+ // function header, which is outside this excerpt.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+# if CPUFAM_AMD64 && ABI_WIN
+ // Arguments come in registers, but they're different between Windows
+ // and everyone else (and everyone else is saner).
+ //
+ // The Windows ABI insists that we preserve some of the XMM
+ // registers, but we want more than we can use as scratch space. Two
+ // places we only need to save a copy of the input for the
+ // feedforward at the end; but the other two we want for the final
+ // permutation, so save the old values on the stack. (We need an extra
+ // 8 bytes to align the stack.)
+ //
+ // Register assignment: NR, IN and OUT are the first three integer
+ // argument registers of the Windows x64 convention. SAVE0 and SAVE1
+ // live in xmm6 and xmm7, which are callee-saved here, so their
+ // incoming contents are spilled first; SAVE2 and SAVE3 are plain
+ // stack slots.
+ //
+ // Stack layout after the `sub' below:
+ //
+ //   [rsp +  0]   caller's xmm6
+ //   [rsp + 16]   caller's xmm7
+ //   [rsp + 32]   SAVE2
+ //   [rsp + 48]   SAVE3
+ //   [rsp + 64]   8 bytes of padding
+ //
+ // `movdqa' needs 16-byte-aligned memory operands. The padding makes
+ // rsp a multiple of 16 assuming rsp still has its function-entry
+ // value (8 mod 16) at this point -- TODO confirm that nothing is
+ // pushed earlier in the function.
+ //
+ // NOTE(review): the matching restore of xmm6/xmm7 and the release of
+ // this frame are not visible in this excerpt; verify that the
+ // epilogue undoes exactly these steps.
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+ sub rsp, 64 + 8
+ movdqa [rsp + 0], xmm6
+ movdqa [rsp + 16], xmm7
+#endif