/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected. We
	// can get away with just the scratch registers for integer work,
	// but we'll run out of XMM registers and will need some properly
	// aligned space which we'll steal from the stack. I don't trust the
	// stack pointer's alignment, so I'll have to mask the stack pointer,
	// which in turn means I'll need to keep track of the old value.
	// Hence I'm making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.
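	//
	// (A sketch of the resulting frame, as the code below assumes it;
	// the offsets follow from the pushes and loads here, not from any
	// ABI document:
	//
	//	[ebp + 16]	output pointer
	//	[ebp + 12]	input pointer
	//	[ebp +  8]	round count
	//	[ebp +  4]	return address
	//	[ebp +  0]	caller's ebp
	//	[esp +  0]	SAVE2 \  32 bytes of 16-byte-aligned
	//	[esp + 16]	SAVE3 /  scratch for spilled rows
	// )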

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	push ebp
	mov ebp, esp
	sub esp, 32
	mov IN, [ebp + 12]
	mov OUT, [ebp + 16]
	and esp, ~15
	mov NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice. We have plenty of XMM registers, and the arguments
	// are in useful places. There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space. In
	// two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack. (We need an
	// extra 8 bytes to align the stack.)
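	//
	// (A sketch of the stack area this allocates, as used below; the
	// layout is inferred from the code rather than stated anywhere:
	//
	//	[rsp +  0]	caller's xmm6, restored in the epilogue
	//	[rsp + 16]	caller's xmm7, restored in the epilogue
	//	[rsp + 32]	SAVE2
	//	[rsp + 48]	SAVE3
	//	[rsp + 64]	8 bytes of padding for alignment
	// )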

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

	sub rsp, 64 + 8
	.seh_stackalloc 64 + 8
	movdqa [rsp + 0], xmm6
	.seh_savexmm xmm6, 0
	movdqa [rsp + 16], xmm7
	.seh_savexmm xmm7, 16
	.seh_endprologue
#endif

	// First job is to slurp the matrix into XMM registers. The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]	    [ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7] -->   [ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]	    [ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]	    [12  1  6 11] (d, xmm3)
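	//
	// (Equivalently, word i of the permuted layout is word 5*i mod 16
	// of the textbook matrix. A rough C sketch of the permutation the
	// caller is assumed to have applied -- the names m and x here are
	// invented for illustration only:
	//
	//	for (i = 0; i < 16; i++) x[i] = m[5*i % 16];
	// )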
	movdqu xmm0, [IN + 0]
	movdqu xmm1, [IN + 16]
	movdqu xmm2, [IN + 32]
	movdqu xmm3, [IN + 48]

	// Take a copy for later.
	movdqa SAVE0, xmm0
	movdqa SAVE1, xmm1
	movdqa SAVE2, xmm2
	movdqa SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.

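	// (For reference, the whole quarterround in C, as the step comments
	// below spell out; rotl32 is a made-up helper for the synthesized
	// rotate:
	//
	//	uint32_t rotl32(uint32_t x, unsigned n)
	//		{ return (x << n) | (x >> (32 - n)); }
	//
	//	b ^= rotl32(a + d,  7);
	//	c ^= rotl32(b + a,  9);
	//	d ^= rotl32(c + b, 13);
	//	a ^= rotl32(d + c, 18);
	//
	// Each pslld/psrld/por triple below computes rotl32 on all four
	// lanes at once, with xmm5 holding the right-shifted copy.)
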
	// b ^= (a + d) <<< 7
	movdqa xmm4, xmm0
	paddd xmm4, xmm3
	movdqa xmm5, xmm4
	pslld xmm4, 7
	psrld xmm5, 25
	por xmm4, xmm5
	pxor xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa xmm4, xmm1
	paddd xmm4, xmm0
	movdqa xmm5, xmm4
	pslld xmm4, 9
	psrld xmm5, 23
	por xmm4, xmm5
	pxor xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa xmm4, xmm2
	paddd xmm4, xmm1
	pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa xmm5, xmm4
	pslld xmm4, 13
	psrld xmm5, 19
	por xmm4, xmm5
	pxor xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa xmm4, xmm3
	pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd xmm4, xmm2
	pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa xmm5, xmm4
	pslld xmm4, 18
	psrld xmm5, 14
	por xmm4, xmm5
	pxor xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming. It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]	    [ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3] -->   [ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]	    [ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]	    [ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

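	// (For reference: pshufd dst, src, imm sets lane j of dst to lane
	// (imm >> 2*j) & 3 of src, i.e. in C terms
	//
	//	for (j = 0; j < 4; j++) dst[j] = src[(imm >> 2*j) & 3];
	//
	// Assuming SHUF packs its arguments into the immediate
	// most-significant first, SHUF(2, 1, 0, 3) therefore rotates the
	// lanes up one place, SHUF(0, 3, 2, 1) rotates them down one place,
	// and SHUF(1, 0, 3, 2) swaps the two halves.)
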
	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa xmm4, xmm0
	paddd xmm4, xmm1
	movdqa xmm5, xmm4
	pslld xmm4, 7
	psrld xmm5, 25
	por xmm4, xmm5
	pxor xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa xmm4, xmm3
	paddd xmm4, xmm0
	movdqa xmm5, xmm4
	pslld xmm4, 9
	psrld xmm5, 23
	por xmm4, xmm5
	pxor xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa xmm4, xmm2
	paddd xmm4, xmm3
	pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa xmm5, xmm4
	pslld xmm4, 13
	psrld xmm5, 19
	por xmm4, xmm5
	pxor xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa xmm4, xmm1
	pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd xmm4, xmm2
	pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa xmm5, xmm4
	pslld xmm4, 18
	psrld xmm5, 14
	por xmm4, xmm5
	pxor xmm0, xmm4

	// We had to undo the transpose ready for the next loop. Again, push
	// back the shuffles because they take a long time coming through.
	// Decrement the loop counter and see if we should go round again:
	// each trip round the loop is two rounds (a column round and a row
	// round), hence the step of two. Recent processors fuse the
	// sub/branch pair into a single uop.
	sub NR, 2
	ja 0b

	// Almost there. Firstly, the feedforward addition, and then we have
	// to write out the result. Here we have to undo the permutation
	// which was already applied to the input. Shuffling has quite high
	// latency, so arrange to start a new shuffle into a temporary as
	// soon as we've written out the old value.
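	//
	// (The scattered store offsets below implement the inverse of the
	// input permutation sketched above: out[5*i % 16] = x[i], with each
	// movd writing lane 0 and a pshufd bringing the next word of
	// interest down into lane 0.)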
	paddd xmm0, SAVE0
	pshufd xmm4, xmm0, SHUF(0, 3, 2, 1)
	movd [OUT + 0], xmm0

	paddd xmm1, SAVE1
	pshufd xmm5, xmm1, SHUF(2, 1, 0, 3)
	movd [OUT + 16], xmm1

	paddd xmm2, SAVE2
	pshufd xmm6, xmm2, SHUF(1, 0, 3, 2)
	movd [OUT + 32], xmm2

	paddd xmm3, SAVE3
	pshufd xmm7, xmm3, SHUF(0, 3, 2, 1)
	movd [OUT + 48], xmm3

	movd [OUT + 4], xmm7
	pshufd xmm7, xmm3, SHUF(1, 0, 3, 2)
	movd [OUT + 24], xmm7
	pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
	movd [OUT + 44], xmm3

	movd [OUT + 8], xmm6
	pshufd xmm6, xmm2, SHUF(2, 1, 0, 3)
	movd [OUT + 28], xmm6
	pshufd xmm2, xmm2, SHUF(0, 3, 2, 1)
	movd [OUT + 52], xmm2

	movd [OUT + 12], xmm5
	pshufd xmm5, xmm1, SHUF(0, 3, 2, 1)
	movd [OUT + 36], xmm5
	pshufd xmm1, xmm1, SHUF(1, 0, 3, 2)
	movd [OUT + 56], xmm1

	movd [OUT + 20], xmm4
	pshufd xmm4, xmm0, SHUF(1, 0, 3, 2)
	movd [OUT + 40], xmm4
	pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
	movd [OUT + 60], xmm0

	// Tidy things up.
#if CPUFAM_X86
	mov esp, ebp
	pop ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa xmm6, [rsp + 0]
	movdqa xmm7, [rsp + 16]
	add rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------