/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.
	// We can get away with just the scratch registers for integer
	// work, but we'll run out of XMM registers and will need some
	// properly aligned space which we'll steal from the stack.  I
	// don't trust the stack pointer's alignment, so I'll have to mask
	// the stack pointer, which in turn means I'll need to keep track
	// of the old value.  Hence I'm making a full i386-style stack
	// frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we
	// don't need to worry about the differences here.

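	// As a sketch of the frame we're about to build (inferred from the
	// loads below; the argument meanings are an inference from how
	// they're used, not stated in the original):
	//
	//	[ebp + 16]	third argument: output pointer (OUT)
	//	[ebp + 12]	second argument: input pointer (IN)
	//	[ebp +  8]	first argument: round count (NR)
	//	[ebp +  4]	return address
	//	[ebp +  0]	caller's ebp
	//	...
	//	[esp +  0]	32 bytes of 16-byte-aligned scratch
	//			(SAVE2 and SAVE3)
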
# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between
	// Windows and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.
	// In two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack.  (We need an
	// extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

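	// A rough picture of the area reserved below (an inference from
	// the offsets used, not original commentary):
	//
	//	[rsp +  0]	saved xmm6
	//	[rsp + 16]	saved xmm7
	//	[rsp + 32]	SAVE2
	//	[rsp + 48]	SAVE3
	//	[rsp + 64]	8 spare bytes that keep the stack aligned
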
	sub	rsp, 64 + 8
	.seh_stackalloc 64 + 8
	movdqa	[rsp + 0], xmm6
	.seh_savexmm xmm6, 0
	movdqa	[rsp + 16], xmm7
	.seh_savexmm xmm7, 16
	.seh_endprologue
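	// (The .seh_* directives above emit no instructions; they just
	// describe the stack allocation and saved XMM registers to the
	// Windows unwinder.)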
#endif

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]  -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
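	//
	// Purely as an illustration (this C table is not part of the
	// library's interface), the diagram above means that word k of the
	// input buffer loaded below is textbook word perm[k], where
	//
	//	static const unsigned char perm[16] = {
	//		 0,  5, 10, 15,		/* a */
	//		 4,  9, 14,  3,		/* b */
	//		 8, 13,  2,  7,		/* c */
	//		12,  1,  6, 11,		/* d */
	//	};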
	movdqu	xmm0, [IN + 0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it.

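	// For reference, here's one quarterround in scalar C, with the
	// rotate spelled out the same way the SSE2 code synthesizes it
	// (shift left, shift right, OR).  This is an illustrative sketch,
	// not code from the library:
	//
	//	uint32_t t;
	//	t = a + d;  b ^= (t <<  7) | (t >> 25);
	//	t = b + a;  c ^= (t <<  9) | (t >> 23);
	//	t = c + b;  d ^= (t << 13) | (t >> 19);
	//	t = d + c;  a ^= (t << 18) | (t >> 14);
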
	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]  -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

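	// (After the renaming above, the row round that follows is the
	// same quarterround again, just computed with a = xmm0, b = xmm3,
	// c = xmm2 and d = xmm1.)
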
	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again,
	// push back the shuffles because they take a long time coming
	// through.  Decrement the loop counter and see if we should go
	// round again.  Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11

	// Next we must undo the permutation which was already applied to
	// the input.  This can be done by juggling values in registers,
	// with the following fancy footwork: some row rotations, a
	// transpose, and some more rotations.
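	//
	// As a rough scalar sketch (illustrative only, reusing the perm[]
	// table suggested where the input was loaded): if y[0..15] are the
	// sixteen lanes in the working order above, the juggling below has
	// the same effect as
	//
	//	for (i = 0; i < 16; i++) out[perm[i]] = y[i];
	//
	// i.e. each word is scattered back to its textbook position.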
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	//  1,  6, 11, 12

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15

	// Finally we have to write out the result.
	movdqu	[OUT + 0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up.
#if CPUFAM_X86
	mov	esp, ebp
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa	xmm6, [rsp + 0]
	movdqa	xmm7, [rsp + 16]
	add	rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------