math/Makefile.am, symm/Makefile.am: Use `--no-install' on oddball tests.
[catacomb] / symm / salsa20-x86ish-sse2.S
CommitLineData
1a0c09c4
MW
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	// Minimum ISA baseline: SSE2 (movdqu/pshufd etc. below).
	.arch	pentium4
	.text

///--------------------------------------------------------------------------
/// salsa20_core_x86ish_sse2 -- Salsa20 core in SSE2 for i386 and amd64.
///
/// Reads a 64-byte (16-word) input matrix from IN, applies NR/2 double-
/// rounds of the Salsa20 core plus the feedforward addition, and writes the
/// 64-byte result to OUT.  The caller supplies (round count, source,
/// destination) -- presumably `void f(unsigned nr, const uint32 *src,
/// uint32 *dst)'; confirm against the C prototype, which is not visible
/// here.
///
/// The input words are expected pre-permuted (diagonals rotated into rows;
/// see the matrix diagrams below); the permutation is undone before the
/// final store, so IN and OUT hold the conventional layout externally.
/// Loads and stores are unaligned (movdqu), so no alignment of IN/OUT is
/// assumed.  Clobbers xmm0--xmm5 everywhere, plus flags; per-ABI register
/// assignments and spills are set up below.

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [esp + 0]
# define SAVE3 [esp + 16]

	pushreg	ebp
	setfp	ebp
	sub	esp, 32			// two 16-byte save slots
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15		// force 16-byte alignment for movdqa
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 xmm8
# define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  In
	// two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack.  (We need an
	// extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm6
# define SAVE1 xmm7
# define SAVE2 [rsp + 32]
# define SAVE3 [rsp + 48]

	stalloc	64 + 8			// xmm6/7 saves + SAVE2/3 + align
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif

	endprologue

	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it end up in the first row, giving something more
	// like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
	movdqu	xmm0, [IN + 0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later (the feedforward addition).
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it from a shift pair and an OR.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	 pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	// pre-rotate b for the
						// row round (see below)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	 pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	// pre-rotate d likewise
	paddd	xmm4, xmm2
	 pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	// pre-rotate c likewise
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list above.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	 pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)	// undo the transpose early,
						// ready for the next loop
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	 pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)	// more transpose-undoing
	paddd	xmm4, xmm2
	 pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	// ... and the last of it
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again, push
	// back the shuffles because they take a long time coming through.
	// Decrement the loop counter and see if we should go round again.
	// Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11

	// Next we must undo the permutation which was already applied to the
	// input.  This can be done by juggling values in registers, with the
	// following fancy footwork: some row rotations, a transpose, and
	// some more rotations.
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	//  1,  6, 11, 12

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15

	// Finally we have to write out the result.  (Note rows 2 and 3 live
	// in xmm4 and xmm2 respectively after the juggling above.)
	movdqu	[OUT + 0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------