[catacomb] / symm / salsa20-x86ish-sse2.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(salsa20_core_x86ish_avx)
	// Alternative entry point.  All this does is issue `vzeroupper' (which
	// zeroes the upper halves of the YMM registers) and then run straight
	// into `salsa20_core_x86ish_sse2', which must therefore follow
	// immediately.  There is no `ret' here: the arguments are left
	// untouched for the SSE2 code to collect.
	//
	// NOTE(review): presumably the point is to avoid AVX/SSE transition
	// penalties in the legacy-SSE code below -- confirm.
	.arch	.avx
	vzeroupper
  endprologue
	// drop through...
ENDFUNC

	// Put the assembler back to the baseline instruction set, so that
	// the code below can't pick up AVX instructions by accident.
	.arch	pentium4

FUNC(salsa20_core_x86ish_sse2)

	// Salsa20 core function, working on whole matrix rows in XMM
	// registers.
	//
	// There are three arguments, in this order: the number of rounds
	// (NR), the address of the 16-word input matrix (IN), and the
	// address of the 16-word output (OUT).  NOTE(review): presumably
	// the C view is something like `(unsigned r, const uint32 *src,
	// uint32 *dest)' -- confirm against the caller.
	//
	// The input must already be in the permuted order described below;
	// the output is written in the textbook order.  Neither pointer
	// needs to be aligned: they're only ever accessed using `movdqu'.
	//
	// Each trip round the main loop does a column round and a row round
	// and then knocks two off NR, testing at the bottom, so the loop
	// body always runs at least once.
	//
	// Register use throughout:
	//
	//	xmm0--xmm3	the working rows a, b, c, d
	//	xmm4, xmm5	scratch, for synthesizing the rotations
	//	SAVE0--SAVE3	copy of the input rows, for the feedforward

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.
	//
	// Once the frame is set up, the arguments are at [ebp + 8] (NR),
	// [ebp + 12] (IN) and [ebp + 16] (OUT), and the 32 bytes at the
	// (now 16-byte aligned) stack pointer hold SAVE2 and SAVE3.

#  define NR ecx
#  define IN eax
#  define OUT edx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [esp + 0]
#  define SAVE3 [esp + 16]

	pushreg	ebp
	setfp	ebp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15			// align for `movdqa' spills
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

#  define NR edi
#  define IN rsi
#  define OUT rdx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 xmm8
#  define SAVE3 xmm9
#endif

#  if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  Two
	// places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack.  (We need an
	// extra 8 bytes to align the stack.)
	//
	// The stack space is used like this.  (The first two lines assume
	// that the offset given to `savexmm' is relative to the new stack
	// pointer -- TODO confirm against `asm-common.h'.)
	//
	//	[rsp +  0]	caller's xmm6
	//	[rsp + 16]	caller's xmm7
	//	[rsp + 32]	SAVE2
	//	[rsp + 48]	SAVE3
	//	[rsp + 64]	alignment padding

#  define NR ecx
#  define IN rdx
#  define OUT r8
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [rsp + 32]
#  define SAVE3 [rsp + 48]

	stalloc	64 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif

  endprologue

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it end up in the first row, giving something more
	// like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.  The SAVEn which live in memory are
	// 16-byte aligned, so `movdqa' is fine for all four.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it: take two copies of the sum, shift one left
	// by n and the other right by 32 - n, and OR them together.

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	//
	// (The indented `pshufd' here, and those below, are the transpose
	// described after this quarterround: each is issued as soon as the
	// old arrangement of its row has been read for the last time.)
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	 pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	 pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
	paddd	xmm4, xmm2
	 pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]	 -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.  Note that b and d have swapped registers: b is
	// now in xmm3 and d is in xmm1.

	// b ^= (a + d) <<<  7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<<  9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	 pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	 pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
	paddd	xmm4, xmm2
	 pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose has to be undone ready for the next loop; again,
	// the shuffles which do this were pushed back into the code above
	// because they take a long time coming through.  So, on arrival
	// here, b is back in xmm1 and d is back in xmm3.
	//
	// Decrement the loop counter and see if we should go round again:
	// `ja' loops only if the subtraction didn't borrow and left
	// something nonzero.  Later processors fuse this pair into a single
	// uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11

	// Next we must undo the permutation which was already applied to the
	// input.  This can be done by juggling values in registers, with the
	// following fancy footwork: some row rotations, a transpose, and
	// some more rotations.  The comments show which words of the
	// textbook matrix are in the destination register after each step.
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)	//  1,  6, 11, 12

	// First half of the transpose: interleave pairs of rows.
	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	// Second half: interleave the interleaved pairs.
	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	// Each row now has the right words, but three of them still need
	// rotating into place.
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(2, 3, 0, 1)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(1, 2, 3, 0)	// 12, 13, 14, 15

	// Finally we have to write out the result.  Note that the third and
	// fourth rows have ended up in xmm4 and xmm2 respectively.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up: unwind whatever the prologue did for this ABI.
	// (The SysV AMD64 prologue did nothing, so there's nothing to undo.)
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
1a0c09c4 MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// Fancy SIMD implementation of Salsa20
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
df07f2c0	28	/// Preliminaries.
1a0c09c4 MW	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
df07f2c0 MW	33	.text
df07f2c0 MW	34
1a0c09c4 MW	35	///--------------------------------------------------------------------------
	36	/// Main code.
	37
b9b279b4 MW	38	FUNC(salsa20_core_x86ish_avx)
	39	.arch .avx
	40	vzeroupper
	41	endprologue
	42	// drop through...
	43	ENDFUNC
	44
	45	.arch pentium4
	46
0f23f75f MW	47	FUNC(salsa20_core_x86ish_sse2)
	48
	49	// Initial setup.
	50
	51	#if CPUFAM_X86
	52	// Arguments come in on the stack, and will need to be collected. We
172707cb MW	53	// can get away with just the scratch registers for integer work, but
	54	// we'll run out of XMM registers and will need some properly aligned
	55	// space which we'll steal from the stack. I don't trust the stack
	56	// pointer's alignment, so I'll have to mask the stack pointer, which
	57	// in turn means I'll need to keep track of the old value. Hence I'm
	58	// making a full i386-style stack frame here.
0f23f75f MW	59	//
	60	// The Windows and SysV ABIs are sufficiently similar that we don't
	61	// need to worry about the differences here.
	62
	63	# define NR ecx
	64	# define IN eax
	65	# define OUT edx
	66	# define SAVE0 xmm6
	67	# define SAVE1 xmm7
	68	# define SAVE2 [esp + 0]
	69	# define SAVE3 [esp + 16]
1a0c09c4	70
0923a413 MW	71	pushreg ebp
0923a413 MW	72	setfp ebp
1a0c09c4	73	sub esp, 32
0f23f75f MW	74	mov IN, [ebp + 12]
0f23f75f MW	75	mov OUT, [ebp + 16]
1a0c09c4	76	and esp, ~15
0f23f75f MW	77	mov NR, [ebp + 8]
	78	#endif
	79
	80	#if CPUFAM_AMD64 && ABI_SYSV
	81	// This is nice. We have plenty of XMM registers, and the arguments
	82	// are in useful places. There's no need to spill anything and we
	83	// can just get on with the code.
	84
	85	# define NR edi
	86	# define IN rsi
	87	# define OUT rdx
	88	# define SAVE0 xmm6
	89	# define SAVE1 xmm7
	90	# define SAVE2 xmm8
	91	# define SAVE3 xmm9
	92	#endif
	93
	94	# if CPUFAM_AMD64 && ABI_WIN
	95	// Arguments come in registers, but they're different between Windows
	96	// and everyone else (and everyone else is saner).
	97	//
	98	// The Windows ABI insists that we preserve some of the XMM
	99	// registers, but we want more than we can use as scratch space. Two
	100	// places we only need to save a copy of the input for the
	101	// feedforward at the end; but the other two we want for the final
ae429891 MW	102	// permutation, so save the old values on the stack. (We need an
ae429891 MW	103	// extra 8 bytes to align the stack.)
0f23f75f MW	104
	105	# define NR ecx
	106	# define IN rdx
	107	# define OUT r8
	108	# define SAVE0 xmm6
	109	# define SAVE1 xmm7
	110	# define SAVE2 [rsp + 32]
	111	# define SAVE3 [rsp + 48]
	112
0923a413 MW	113	stalloc 64 + 8
	114	savexmm xmm6, 0
	115	savexmm xmm7, 16
0f23f75f	116	#endif
1a0c09c4	117
0923a413 MW	118	endprologue
0923a413 MW	119
1a0c09c4 MW	120	// First job is to slurp the matrix into XMM registers. The words
	121	// have already been permuted conveniently to make them line up
	122	// better for SIMD processing.
	123	//
	124	// The textbook arrangement of the matrix is this.
	125	//
	126	// [C K K K]
	127	// [K C N N]
	128	// [T T C K]
	129	// [K K K C]
	130	//
	131	// But we've rotated the columns up so that the main diagonal with
	132	// the constants on it end up in the first row, giving something more
	133	// like
	134	//
	135	// [C C C C]
	136	// [K T K K]
	137	// [T K K N]
	138	// [K K N K]
	139	//
	140	// so the transformation looks like this:
	141	//
	142	// [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
	143	// [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
	144	// [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
	145	// [12 13 14 15] [12 1 6 11] (d, xmm3)
0f23f75f MW	146	movdqu xmm0, [IN + 0]
	147	movdqu xmm1, [IN + 16]
	148	movdqu xmm2, [IN + 32]
	149	movdqu xmm3, [IN + 48]
1a0c09c4	150
7afb1dc9	151	// Take a copy for later.
0f23f75f MW	152	movdqa SAVE0, xmm0
	153	movdqa SAVE1, xmm1
	154	movdqa SAVE2, xmm2
	155	movdqa SAVE3, xmm3
1a0c09c4	156
fd3bb67b	157	0:
1a0c09c4 MW	158	// Apply a column quarterround to each of the columns simultaneously.
	159	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	160	// have to synthesize it.
	161
	162	// b ^= (a + d) <<< 7
	163	movdqa xmm4, xmm0
	164	paddd xmm4, xmm3
	165	movdqa xmm5, xmm4
	166	pslld xmm4, 7
	167	psrld xmm5, 25
	168	por xmm4, xmm5
	169	pxor xmm1, xmm4
	170
	171	// c ^= (b + a) <<< 9
	172	movdqa xmm4, xmm1
	173	paddd xmm4, xmm0
	174	movdqa xmm5, xmm4
	175	pslld xmm4, 9
	176	psrld xmm5, 23
	177	por xmm4, xmm5
	178	pxor xmm2, xmm4
	179
	180	// d ^= (c + b) <<< 13
	181	movdqa xmm4, xmm2
	182	paddd xmm4, xmm1
a117c06f	183	pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
1a0c09c4 MW	184	movdqa xmm5, xmm4
	185	pslld xmm4, 13
	186	psrld xmm5, 19
	187	por xmm4, xmm5
	188	pxor xmm3, xmm4
	189
	190	// a ^= (d + c) <<< 18
	191	movdqa xmm4, xmm3
a117c06f	192	pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
1a0c09c4	193	paddd xmm4, xmm2
a117c06f	194	pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
1a0c09c4 MW	195	movdqa xmm5, xmm4
	196	pslld xmm4, 18
	197	psrld xmm5, 14
	198	por xmm4, xmm5
	199	pxor xmm0, xmm4
	200
	201	// The transpose conveniently only involves reordering elements of
	202	// individual rows, which can be done quite easily, and reordering
	203	// the rows themselves, which is a trivial renaming. It doesn't
	204	// involve any movement of elements between rows.
	205	//
	206	// [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
0f23f75f MW	207	// [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
	208	// [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
	209	// [12 1 6 11] [ 3 4 9 14] (d, xmm1)
1a0c09c4 MW	210	//
	211	// The shuffles have quite high latency, so they've been pushed
	212	// backwards into the main instruction list.
	213
	214	// Apply the row quarterround to each of the columns (yes!)
	215	// simultaneously.
	216
	217	// b ^= (a + d) <<< 7
	218	movdqa xmm4, xmm0
	219	paddd xmm4, xmm1
	220	movdqa xmm5, xmm4
	221	pslld xmm4, 7
	222	psrld xmm5, 25
	223	por xmm4, xmm5
	224	pxor xmm3, xmm4
	225
	226	// c ^= (b + a) <<< 9
	227	movdqa xmm4, xmm3
	228	paddd xmm4, xmm0
	229	movdqa xmm5, xmm4
	230	pslld xmm4, 9
	231	psrld xmm5, 23
	232	por xmm4, xmm5
	233	pxor xmm2, xmm4
	234
	235	// d ^= (c + b) <<< 13
	236	movdqa xmm4, xmm2
	237	paddd xmm4, xmm3
a117c06f	238	pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
1a0c09c4 MW	239	movdqa xmm5, xmm4
	240	pslld xmm4, 13
	241	psrld xmm5, 19
	242	por xmm4, xmm5
	243	pxor xmm1, xmm4
	244
	245	// a ^= (d + c) <<< 18
	246	movdqa xmm4, xmm1
a117c06f	247	pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
1a0c09c4	248	paddd xmm4, xmm2
a117c06f	249	pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
1a0c09c4 MW	250	movdqa xmm5, xmm4
	251	pslld xmm4, 18
	252	psrld xmm5, 14
	253	por xmm4, xmm5
	254	pxor xmm0, xmm4
	255
	256	// We had to undo the transpose ready for the next loop. Again, push
	257	// back the shuffles because they take a long time coming through.
	258	// Decrement the loop counter and see if we should go round again.
	259	// Later processors fuse this pair into a single uop.
0f23f75f	260	sub NR, 2
fd3bb67b	261	ja 0b
1a0c09c4	262
3cb47d27 MW	263	// Almost there. Firstly, the feedforward addition.
	264	paddd xmm0, SAVE0 // 0, 5, 10, 15
	265	paddd xmm1, SAVE1 // 4, 9, 14, 3
	266	paddd xmm2, SAVE2 // 8, 13, 2, 7
	267	paddd xmm3, SAVE3 // 12, 1, 6, 11
	268
	269	// Next we must undo the permutation which was already applied to the
	270	// input. This can be done by juggling values in registers, with the
	271	// following fancy footwork: some row rotations, a transpose, and
	272	// some more rotations.
a117c06f MW	273	pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14
	274	pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13
	275	pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12
3cb47d27 MW	276
	277	movdqa xmm4, xmm0
	278	movdqa xmm5, xmm3
	279	punpckldq xmm0, xmm2 // 0, 2, 5, 7
	280	punpckldq xmm3, xmm1 // 1, 3, 6, 4
	281	punpckhdq xmm4, xmm2 // 10, 8, 15, 13
	282	punpckhdq xmm5, xmm1 // 11, 9, 12, 14
	283
	284	movdqa xmm1, xmm0
	285	movdqa xmm2, xmm4
	286	punpckldq xmm0, xmm3 // 0, 1, 2, 3
	287	punpckldq xmm4, xmm5 // 10, 11, 8, 9
	288	punpckhdq xmm1, xmm3 // 5, 6, 7, 4
	289	punpckhdq xmm2, xmm5 // 15, 12, 13, 14
	290
a117c06f MW	291	pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7
	292	pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11
	293	pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15
3cb47d27 MW	294
	295	// Finally we have to write out the result.
	296	movdqu [OUT + 0], xmm0
	297	movdqu [OUT + 16], xmm1
	298	movdqu [OUT + 32], xmm4
	299	movdqu [OUT + 48], xmm2
1a0c09c4 MW	300
1a0c09c4 MW	301	// Tidy things up.
0f23f75f	302	#if CPUFAM_X86
0923a413 MW	303	dropfp
0923a413 MW	304	popreg ebp
0f23f75f MW	305	#endif
0f23f75f MW	306	#if CPUFAM_AMD64 && ABI_WIN
0923a413	307	rstrxmm xmm6, 0
41fb2356	308	rstrxmm xmm7, 16
0923a413	309	stfree 64 + 8
0f23f75f	310	#endif
1a0c09c4 MW	311
	312	// And with that, we're done.
	313	ret
	314
0f23f75f MW	315	#undef NR
	316	#undef IN
	317	#undef OUT
	318	#undef SAVE0
	319	#undef SAVE1
	320	#undef SAVE2
	321	#undef SAVE3
	322
1a0c09c4 MW	323	ENDFUNC
	324
	325	///----- That's all, folks --------------------------------------------------