@@@ i386 wip
[catacomb] / symm / chacha-x86ish-sse2.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// Fancy SIMD implementation of ChaCha
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
df07f2c0 28/// Preliminaries.
1a0c09c4
MW
29
30#include "config.h"
31#include "asm-common.h"
32
df07f2c0
MW
	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_x86ish_avx)
	.arch	.avx
	vzeroupper		// clear the upper halves of the ymm
				// registers, so that the legacy SSE2 code
				// below doesn't pay AVX/SSE transition
				// penalties on AVX-capable hardware
	endprologue
	// drop through...	// the SSE2 implementation immediately
				// follows, so no jump is needed
ENDFUNC
44
	.arch	pentium4

///--------------------------------------------------------------------------
/// chacha_core_x86ish_sse2(nr, in, out)
///
/// Apply NR rounds of the ChaCha core function to the 64-byte input
/// matrix at IN, add the original input back on (the feedforward), and
/// write the 64-byte result to OUT.  Neither IN nor OUT need be aligned.
/// (NOTE(review): argument types inferred from the register usage below --
/// NR is a 32-bit count, IN/OUT are pointers; confirm against the C
/// prototype.)

FUNC(chacha_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

# define NR ecx
# define IN eax
# define OUT edx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 [SP]

	pushreg	BP
	setfp
	stalloc	16
	// Load the arguments through BP before masking SP: after the
	// `and' below, stack offsets relative to SP would be wrong.
	mov	IN, [BP + 12]
	mov	OUT, [BP + 16]
	and	SP, ~15			// force 16-byte alignment for SAVE3
	mov	NR, [BP + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

# define NR edi
# define IN rsi
# define OUT rdx
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
# define SAVE3 xmm8
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  We
	// only need to save a copy of the input for the feedforward at the
	// end, so we might as well use memory rather than spill extra
	// registers.  (We need an extra 8 bytes to align the stack.)

# define NR ecx
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
# define SAVE1 [SP + 0]
# define SAVE2 [SP + 16]
# define SAVE3 [SP + 32]

	stalloc	48 + 8
#endif

	endprologue

	// First job is to slurp the matrix into XMM registers.  Be careful:
	// the input matrix isn't likely to be properly aligned.
	//
	//	[ 0  1  2  3] (a, xmm0)
	//	[ 4  5  6  7] (b, xmm1)
	//	[ 8  9 10 11] (c, xmm2)
	//	[12 13 14 15] (d, xmm3)
	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.  This one is aligned properly, by
	// construction.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

	// Main loop: each iteration performs one double-round (a column
	// round followed by a diagonal round), so NR counts down by two.
0:
	// Apply a column quarterround to each of the columns simultaneously.
	// Alas, there doesn't seem to be a packed doubleword rotate, so we
	// have to synthesize it (shift left, shift right, or together).

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)	// d reorder, hoisted early
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)	// c reorder, hoisted early
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily.  It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 5  6  7  4] (b, xmm1)
	//	[ 8  9 10 11]		[10 11  8  9] (c, xmm2)
	//	[12 13 14 15]		[15 12 13 14] (d, xmm3)
	//
	// The shuffles have quite high latency, so they've mostly been
	// pushed upwards.  The remaining one can't be moved, though.
	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.

	// a += b; d ^= a; d <<<= 16
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 16
	psrld	xmm4, 16
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 12
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm4, xmm1
	pslld	xmm1, 12
	psrld	xmm4, 20
	por	xmm1, xmm4

	// a += b; d ^= a; d <<<= 8
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	movdqa	xmm4, xmm3
	pslld	xmm3, 8
	psrld	xmm4, 24
	por	xmm3, xmm4

	// c += d; b ^= c; b <<<= 7
	paddd	xmm2, xmm3
	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)	// inverse of the d reorder above
	pxor	xmm1, xmm2
	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)	// c reorder is self-inverse
	movdqa	xmm4, xmm1
	pslld	xmm1, 7
	psrld	xmm4, 25
	por	xmm1, xmm4

	// Finally, finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the shuffles.
	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)

	// Decrement the loop counter and see if we should go round again.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0
	paddd	xmm1, SAVE1
	paddd	xmm2, SAVE2
	paddd	xmm3, SAVE3

	// And now we write out the result.  This one won't be aligned
	// either.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm2
	movdqu	[OUT + 48], xmm3

	// Tidy things up.
#if CPUFAM_X86
	dropfp				// restores SP masked by `and' above
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stfree	48 + 8
#endif

	// And with that, we're done.
	ret

ENDFUNC
261
22bace22
MW
FUNC(chacha_multi_i386_sse2)
	// Process multiple ChaCha blocks, four at a time, interleaved in
	// XMM registers, spilling rows to an aligned 4*64-byte stack buffer.
	//
	// NOTE(review): this function is work-in-progress (see the `wip'
	// marker at the top of the file): it sets up four blocks and works
	// partway through the first quarterround batch, then stops -- there
	// is no round loop, feedforward, or output yet.  Only unambiguous
	// defects are fixed here: the nonexistent `pslrd' mnemonic (it's
	// `psrld'), the rotate counts for the b <<<= 12 step (were 16/16,
	// must be 12/20), a copy-pasted store of xmm0 four times, and the
	// missing `8:' label targeted by `jb 8f'.
	//
	// Arguments are on the stack:
	//
	//	[sp +  4]	pointer to state
	//	[sp +  8]	input pointer (or null)
	//	[sp + 12]	output pointer
	//	[sp + 16]	number of blocks to process
	//	[sp + 20]	number of rounds per block

	pushreg	SI
	pushreg	DI
	pushreg	BX
	stalloc	4*64
	endprologue

	// Load the arguments.  Three pushed registers (12 bytes), the
	// return address (4) and the 256-byte buffer put the first argument
	// at [SP + 272].
	mov	BX, [SP + 272]		// = state pointer
	mov	SI, [SP + 276]		// = source pointer
	mov	DI, [SP + 280]		// = destination pointer
	mov	CX, [SP + 284]		// = block count
	mov	DX, [SP + 288]		// = (initial) round count

	// Do chunks of four blocks at a time.
	sub	CX, 4
	jb	8f

	// Inhale the initial state.
	movdqu	xmm1, [BX +  0]		// row a
	movdqu	xmm3, [BX + 16]		// row b
	movdqu	xmm5, [BX + 32]		// row c
	movdqu	xmm0, [BX + 48]		// row d (counter/nonce)

	// Set the counters and initialize the working blocks.  Each of
	// xmm2/xmm4/xmm6/xmm7 gets a per-block counter increment (1..4) in
	// word 4 (bits 64..79).
	pxor	xmm2, xmm2
	pxor	xmm4, xmm4
	pxor	xmm6, xmm6
	pxor	xmm7, xmm7

	xor	eax, eax
	mov	al, 1
	pinsrw	xmm2, eax, 4
	mov	al, 2
	pinsrw	xmm4, eax, 4
	mov	al, 3
	pinsrw	xmm6, eax, 4
	mov	al, 4
	pinsrw	xmm7, eax, 4

	movdqa	[SP + 16], xmm3		// stash b
	movdqa	[SP + 32], xmm5		// stash c
	movdqa	[SP + 48], xmm0		// stash d

	// NOTE(review): these add the increments to the b row (xmm3);
	// presumably the d row (xmm0), which carries the block counter,
	// was intended -- confirm before finishing this function.
	paddq	xmm2, xmm3
	paddq	xmm4, xmm3
	paddq	xmm6, xmm3
	paddq	xmm7, xmm3

	movdqu	[BX + 48], xmm7		// advance the caller's counter by 4

	// a += b; d ^= a; d <<<= 16
	paddd	xmm1, xmm3		// a += b

	movdqa	[SP + 0], xmm1		// stash the (shared) a row

	pxor	xmm0, xmm1		// d ^= a, for each of the four blocks
	pxor	xmm2, xmm1
	pxor	xmm4, xmm1
	pxor	xmm6, xmm1

	movdqa	xmm1, xmm0		// copies for the synthesized rotate
	movdqa	xmm3, xmm2
	movdqa	xmm5, xmm4
	movdqa	xmm7, xmm6

	pslld	xmm0, 16		// d << 16
	pslld	xmm2, 16
	pslld	xmm4, 16
	pslld	xmm6, 16

	psrld	xmm1, 16		// d >> 16 (was `pslrd': no such insn)
	psrld	xmm3, 16
	psrld	xmm5, 16
	psrld	xmm7, 16

	por	xmm0, xmm1		// d <<<= 16
	movdqa	xmm1, [SP + 32]		// reload c
	// NOTE(review): [SP + 48] still holds the original d at this point
	// (it isn't overwritten until below); for the later `b ^= c' step
	// this looks like it should load b from [SP + 16] -- confirm.
	movdqa	xmm3, [SP + 48]
	por	xmm2, xmm3
	por	xmm4, xmm5
	por	xmm6, xmm7

	movdqa	[SP +  48], xmm0	// store each block's new d
	movdqa	[SP + 112], xmm2	// (64-byte stride between blocks)
	movdqa	[SP + 176], xmm4
	movdqa	[SP + 240], xmm6

	// c += d; b ^= c; b <<<= 12
	paddd	xmm0, xmm1		// c += d
	paddd	xmm2, xmm1
	paddd	xmm4, xmm1
	paddd	xmm6, xmm1

	movdqa	[SP +  32], xmm0	// store each block's new c
	movdqa	[SP +  96], xmm2	// (was xmm0 four times: paste bug)
	movdqa	[SP + 160], xmm4
	movdqa	[SP + 224], xmm6

	pxor	xmm0, xmm3		// b ^= c
	pxor	xmm2, xmm3
	pxor	xmm4, xmm3
	pxor	xmm6, xmm3

	movdqa	xmm1, xmm0		// copies for the synthesized rotate
	movdqa	xmm3, xmm2
	movdqa	xmm5, xmm4
	movdqa	xmm7, xmm6

	pslld	xmm0, 12		// b << 12 (was 16: wrong rotate)
	pslld	xmm2, 12
	pslld	xmm4, 12
	pslld	xmm6, 12

	psrld	xmm1, 20		// b >> 20 (was `pslrd ..., 16')
	psrld	xmm3, 20
	psrld	xmm5, 20
	psrld	xmm7, 20

	por	xmm0, xmm1		// b <<<= 12
	por	xmm2, xmm3
	por	xmm4, xmm5
	por	xmm6, xmm7

	// ...the remaining rounds, feedforward and output are not yet
	// written; execution currently falls off the end here.

	// Tail: fewer than four blocks remain.
	// NOTE(review): unimplemented -- label added so `jb 8f' assembles.
8:	ret

ENDFUNC
395
1a0c09c4 396///----- That's all, folks --------------------------------------------------