From: Mark Wooding Date: Sat, 24 Nov 2018 21:53:58 +0000 (+0000) Subject: Merge branch '2.4.x' X-Git-Tag: 2.5.0~21 X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/925ff94a516478164fdd01d53332637455e0074d?hp=-c Merge branch '2.4.x' * 2.4.x: progs/cc-progress.c: Use `fstat' to discover the file size. math/mpx-mul4-amd64-sse2.S: Always collect iteration count as 32 bits. math/mpx-mul4-amd64-sse2.S: Fix stack-argument offset for 64-bit Windows. symm/salsa20-x86ish-sse2.S: Fix typo in 64-bit Windows code. symm/desx.c, symm/desx.h (desx_init): Fix documentation. symm/t/rijndael256: Add tests for small key sizes. progs/cc-kem.c (getkem): Parse the `kdf' spec after bulk crypto. progs/..., symm/...: Fix 32-bit right-shift idiom. --- 925ff94a516478164fdd01d53332637455e0074d diff --combined math/mpx-mul4-amd64-sse2.S index 64460ca9,9146a63f..29939c1c --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@@ -96,32 -96,32 +96,32 @@@ .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces // of the product in registers D0, D1, D2, D3. - pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?) + pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?) .ifnes "\d1", "nil" - movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1) + movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1) .endif .ifnes "\d3", "nil" - movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3) + movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3) .endif .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) + psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) .endif .ifnes "\d2", "nil" - movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?) + movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) + psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1) + pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3) + pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) .endif .ifnes "\d2", "nil" - pmuludq \d2, \shi // (r_i s'_2, r_i s''_2) + pmuludq \d2, \shi // (r_i s'_2; r_i s''_2) .endif - pmuludq \d0, \slo // (r_i s'_0, r_i s''_0) + pmuludq \d0, \slo // (r_i s'_0; r_i s''_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@@ -163,10 -163,10 +163,10 @@@ // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. - pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) - pslldq xmm3, 2 // (t b, 0) - paddq \c, xmm3 // (c' + t b, c'') + pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) + psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0) + pslldq xmm3, 2 // (t b; 0) + paddq \c, xmm3 // (c' + t b; c'') .ifeqs "\pos", "lo" movdqa \d, \c .else @@@ -183,37 -183,37 +183,37 @@@ // of the value represented in C are written at POS in D, and the // remaining bits are left at the bottom of T. 
movdqa \t, \c - psllq \t, 16 // (?, c'' b) - pslldq \c, 8 // (0, c') - paddq \t, \c // (?, c' + c'' b) - psrldq \t, 8 // c' + c'' b + psllq \t, 16 // (?; c'' b) + pslldq \c, 8 // (0; c') + paddq \t, \c // (?; c' + c'' b) + psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) .ifeqs "\pos", "lo" movdqa \d, \t .else punpckldq \d, \t .endif - psrldq \t, 4 // floor((c' + c'' b)/B) + psrldq \t, 4 // (floor(c/B); 0) .endm .macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. - movdqa \b, \a // (a_0, a_1, a_2, a_3) + movdqa \b, \a // (a_0, a_1; a_2, a_3) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1, c_2, c_3) + movdqa \d, \c // (c_0, c_1; c_2, c_3) .endif - punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3) + punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) + punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) + punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) + punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) .endif - pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1) - pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3) + pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) + pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1) - pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3) + pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) + pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) .endif .endm @@@ -229,10 -229,10 +229,10 @@@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0, y'_2) - punpckhqdq \c0, \c2 // (y''_0, y''_2) - punpcklqdq \u, \c3 // (y'_1, y'_3) - punpckhqdq \c1, \c3 // (y''_1, y''_3) + punpcklqdq \t, \c2 // (y'_0; y'_2) + punpckhqdq \c0, \c2 // (y''_0; y''_2) + punpcklqdq \u, \c3 // (y'_1; y'_3) + punpckhqdq \c1, \c3 // (y''_1; y''_3) // Now split the double-prime pieces. The high (up to) 48 bits will // go up; the low 16 bits go down. @@@ -240,43 -240,43 +240,43 @@@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0, y''_2) - psrlq \c1, 16 // high parts of (y''_1, y''_3) - psrlq \c2, 32 // low parts of (y''_0, y''_2) - psrlq \c3, 32 // low parts of (y''_1, y''_3) + psrlq \c0, 16 // high parts of (y''_0; y''_2) + psrlq \c1, 16 // high parts of (y''_1; y''_3) + psrlq \c2, 32 // low parts of (y''_0; y''_2) + psrlq \c3, 32 // low parts of (y''_1; y''_3) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0, y''_1) + pslldq \c1, 8 // high part of (0; y''_1) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0, y_2) - paddq \u, \c0 // (y_1, y_3) + paddq \t, \c1 // and up: (y_0; y_2) + paddq \u, \c0 // (y_1; y_3) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3, 0) + psrldq \hi, 8 // high part of (y''_3; 0) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0, y_1) - movdqa \lo, \t // (y^*_0, ?, ?, ?) - psrldq \t, 8 // (y_2, 0) - psrlq \c3, 32 // (floor(y_0/B), ?) - paddq \c3, \u // (y_1 + floor(y_0/B), ?) - movdqa \c1, \c3 // (y^*_1, ?, ?, ?) 
- psrldq \u, 8 // (y_3, 0) - psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) - paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) - punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?) - psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) - paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + movdqa \c3, \t // (y_0; ?) + movdqa \lo, \t // (y^*_0, ?; ?, ?) + psrldq \t, 8 // (y_2; 0) + psrlq \c3, 32 // (floor(y_0/B); ?) + paddq \c3, \u // (y_1 + floor(y_0/B); ?) + movdqa \c1, \c3 // (y^*_1, ?; ?, ?) + psrldq \u, 8 // (y_3; 0) + psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) + paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) + punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?) + psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) + paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif - punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?) + punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t @@@ -293,13 -293,13 +293,13 @@@ // On exit, the carry registers, including XMM15, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. - movd xmm0, [rdi + 0] // (a_0, 0) - movd xmm1, [rdi + 4] // (a_1, 0) - movd xmm2, [rdi + 8] // (a_2, 0) - movd xmm15, [rdi + 12] // (a_3, 0) - paddq xmm12, xmm0 // (c'_0 + a_0, c''_0) - paddq xmm13, xmm1 // (c'_1 + a_1, c''_1) - paddq xmm14, xmm2 // (c'_2 + a_2, c''_2 + a_3 b) + movd xmm0, [rdi + 0] // (a_0; 0) + movd xmm1, [rdi + 4] // (a_1; 0) + movd xmm2, [rdi + 8] // (a_2; 0) + movd xmm15, [rdi + 12] // (a_3; 0) + paddq xmm12, xmm0 // (c'_0 + a_0; c''_0) + paddq xmm13, xmm1 // (c'_1 + a_1; c''_1) + paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b) .endm ///-------------------------------------------------------------------------- @@@ -621,8 -621,8 +621,8 @@@ INTFUNC(mmla4 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0) + punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) + punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1 accum xmm5, xmm6 @@@ -634,10 -634,10 +634,10 @@@ mulcore xmm7, 3, xmm10, xmm11, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0, 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0, 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0) + punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) + punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) + punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) + punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) // That's lots of pieces. Now we have to assemble the answer. squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@@ -703,8 -703,8 +703,8 @@@ INTFUNC(mont4 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0) + punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) + punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1 accum xmm5, xmm6 @@@ -716,10 -716,10 +716,10 @@@ mulcore xmm7, 3, xmm8, xmm9, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0, 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0, 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0) + punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) + punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) + punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) + punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) // That's lots of pieces. Now we have to assemble the answer. 
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@@ -752,13 -752,6 +752,13 @@@ ENDFUN ///-------------------------------------------------------------------------- /// Bulk multipliers. +FUNC(mpx_umul4_amd64_avx) + .arch .avx + vzeroupper + endprologue + .arch pentium4 +ENDFUNC + FUNC(mpx_umul4_amd64_sse2) // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl, // const mpw *bv, const mpw *bvl); @@@ -908,13 -901,6 +908,13 @@@ ENDFUNC +FUNC(mpxmont_mul4_amd64_avx) + .arch .avx + vzeroupper + endprologue + .arch pentium4 +ENDFUNC + FUNC(mpxmont_mul4_amd64_sse2) // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); @@@ -1109,13 -1095,6 +1109,13 @@@ ENDFUNC +FUNC(mpxmont_redc4_amd64_avx) + .arch .avx + vzeroupper + endprologue + .arch pentium4 +ENDFUNC + FUNC(mpxmont_redc4_amd64_sse2) // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); @@@ -1329,7 -1308,7 +1329,7 @@@ ENDFUN # define ARG6 STKARG(2) # define ARG7 STKARG(3) # define ARG8 STKARG(4) - # define STKARG_OFFSET 40 + # define STKARG_OFFSET 224 #endif #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] @@@ -1386,7 -1365,7 +1386,7 @@@ mov rbx, r8 movdqu xmm8, [r9] movdqu xmm10, [rax] - mov r8, STKARG(1) + mov r8d, STKARG(1) mov r9, STKARG(2) mov r10, rdx mov r11, rcx @@@ -1395,7 -1374,7 +1395,7 @@@ .ifeqs "\mode", "mont" mov rbx, rcx movdqu xmm8, [r8] - mov r8, r9 + mov r8d, r9d mov r9, STKARG(0) mov r10, rdx mov rcx, rsi @@@ -1423,16 -1402,16 +1423,16 @@@ mov rbx, r9 movdqu xmm8, [r10] movdqu xmm10, [r11] - mov r8, STKARG(2) - mov r9, STKARG(3) mov r11, r8 + mov r8d, STKARG(2) + mov r9, STKARG(3) .endif .ifeqs "\mode", "smul" mov rdi, rcx mov rcx, rdx mov rbx, r8 movdqu xmm10, [r9] - mov r8, STKARG(0) + mov r8d, STKARG(0) mov r9, STKARG(1) .endif .ifeqs "\mode", "mmul" @@@ -1443,10 -1422,10 +1443,10 @@@ mov rbx, STKARG(0) movdqu xmm8, [r10] movdqu xmm10, [r11] - mov r8, STKARG(3) - mov r9, STKARG(4) mov r10, r8 mov r11, r9 + mov r8d, STKARG(3) + mov r9, STKARG(4) .endif .ifeqs "\mode", "mont" mov r10, STKARG(0) @@@ -1454,9 -1433,9 +1454,9 @@@ mov rcx, rdx mov rbx, r9 movdqu xmm8, [r10] - mov r8, STKARG(1) - mov r9, STKARG(2) mov r10, r8 + mov r8d, STKARG(1) + mov r9, STKARG(2) .endif #endif @@@ -1495,9 -1474,9 +1495,9 @@@ .endm .macro testldcarry - movdqu xmm12, [rcx + 0] // (c'_0, c''_0) - movdqu xmm13, [rcx + 16] // (c'_1, c''_1) - movdqu xmm14, [rcx + 32] // (c'_2, c''_2) + movdqu xmm12, [rcx + 0] // (c'_0; c''_0) + movdqu xmm13, [rcx + 16] // (c'_1; c''_1) + movdqu xmm14, [rcx + 32] // (c'_2; c''_2) .endm .macro testtop u=nil diff --combined symm/blkc.h index e94e932b,ff631f09..e0837521 --- a/symm/blkc.h +++ b/symm/blkc.h @@@ -109,7 -109,7 +109,7 @@@ #define BLKC_SHOW(PRE, tag, w) do { \ fputs(tag ": ", stdout); \ - BLKC_SKEL_X(PRE, BLKC_W(w);, printf("%08x ", *_w++);); \ + BLKC_SKEL_X(PRE, const BLKC_W(w);, printf("%08x ", *_w++);); \ fputc('\n', stdout); \ } while (0) @@@ -174,7 -174,7 +174,7 @@@ unsigned _i; BLKC_W(w); unsigned long _x = x; \ for (_i = 0; _i < PRE##_BLKSZ / 4; _i++) { \ *_w++ = U32(_x); \ - _x = ((_x & ~MASK32) >> 16) >> 16; \ + _x = ((_x & ~(unsigned long)MASK32) >> 16) >> 16; \ } \ } while (0) @@@ -182,7 -182,7 +182,7 @@@ unsigned _i; BLKC_W(w); unsigned long _x = x; _w += PRE##_BLKSZ / 4; \ for (_i = 0; _i < PRE##_BLKSZ / 4; _i++) { \ *--_w = U32(_x); \ - _x = ((_x & ~MASK32) >> 16) >> 16; \ + _x = ((_x & ~(unsigned long)MASK32) >> 16) >> 16; \ } \ } while (0) diff --combined 
symm/salsa20-x86ish-sse2.S index ad4e322b,7d8e2e38..06ba3d2c --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@@ -33,17 -33,9 +33,17 @@@ ///-------------------------------------------------------------------------- /// Main code. - .arch pentium4 .text +FUNC(salsa20_core_x86ish_avx) + .arch .avx + vzeroupper + endprologue + // drop through... +ENDFUNC + + .arch pentium4 + FUNC(salsa20_core_x86ish_sse2) // Initial setup. @@@ -180,7 -172,7 +180,7 @@@ // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@@ -189,9 -181,9 +189,9 @@@ // a ^= (d + c) <<< 18 movdqa xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) + pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@@ -235,7 -227,7 +235,7 @@@ // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) + pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@@ -244,9 -236,9 +244,9 @@@ // a ^= (d + c) <<< 18 movdqa xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) + pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@@ -270,9 -262,9 +270,9 @@@ // input. This can be done by juggling values in registers, with the // following fancy footwork: some row rotations, a transpose, and // some more rotations. - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13 - pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12 + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14 + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13 + pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12 movdqa xmm4, xmm0 movdqa xmm5, xmm3 @@@ -288,9 -280,9 +288,9 @@@ punpckhdq xmm1, xmm3 // 5, 6, 7, 4 punpckhdq xmm2, xmm5 // 15, 12, 13, 14 - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7 - pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11 - pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15 + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7 + pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11 + pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15 // Finally we have to write out the result. movdqu [OUT + 0], xmm0 @@@ -305,7 -297,7 +305,7 @@@ #endif #if CPUFAM_AMD64 && ABI_WIN rstrxmm xmm6, 0 - rsrrxmm xmm7, 16 + rstrxmm xmm7, 16 stfree 64 + 8 #endif
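
A note on the `blkc.h' hunks above, with an illustrative sketch.  The old
idiom `(_x & ~MASK32) >> 16 >> 16' computes `~MASK32' in the type of the
MASK32 constant itself; if that constant is 32-bit unsigned, its complement
is zero, and the mask silently discards the top half of a 64-bit `unsigned
long' before the shifts even happen.  Casting to `unsigned long' first keeps
those bits, while shifting twice by 16 (rather than once by 32) stays
well-defined on platforms where `unsigned long' is only 32 bits wide.  The
program below is a minimal sketch, not taken from the project: it defines
its own MASK32 as 0xffffffffu rather than relying on catacomb's or mLib's
headers.

	/* sketch.c -- illustrative only; MASK32 is defined locally here,
	 * on the assumption that the real constant is 32-bit unsigned. */
	#include <stdio.h>

	#define MASK32 0xffffffffu	/* 32-bit all-ones constant */

	int main(void)
	{
	  unsigned long x = 0x0123456789abcdefUL; /* interesting only on LP64 */

	  /* Old idiom: `~MASK32' is evaluated as a 32-bit unsigned int,
	   * giving 0, so the high half of x is lost before the shifts. */
	  unsigned long broken = ((x & ~MASK32) >> 16) >> 16;

	  /* Fixed idiom: widen MASK32 before complementing, so the mask
	   * keeps bits 32 and up; the double 16-bit shift avoids an
	   * undefined `>> 32' when unsigned long is 32 bits wide. */
	  unsigned long fixed = ((x & ~(unsigned long)MASK32) >> 16) >> 16;

	  printf("broken = %#lx\n", broken);	/* prints 0 */
	  printf("fixed  = %#lx\n", fixed);	/* prints 0x1234567 on LP64 */
	  return 0;
	}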
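
The wholesale SHUF(...) rewrites in the `pshufd' immediates also deserve a
gloss.  Judging purely from the pattern of the changes (every call has its
arguments reversed, e.g. SHUF(3, 1, 2, 0) becoming SHUF(0, 2, 1, 3)), the
macro's convention appears to have moved from Intel's `_MM_SHUFFLE' style,
most significant selector first, to little-endian lane order, lane 0 first;
the semicolon in comments such as `(a'_0, a'_1; a''_0, a''_1)' likewise
seems to mark the boundary between the low and high 64-bit halves of the
register.  Either way, both spellings encode the same immediate byte, which
the sketch below checks.  SHUF_OLD and SHUF_NEW are stand-ins written for
this note under the assumption just stated, not catacomb's own definitions.

	/* shuf-check.c -- hypothetical stand-in macros, for illustration. */
	#include <assert.h>

	/* Old convention: selectors given most significant first, as in
	 * Intel's _MM_SHUFFLE(d, c, b, a). */
	#define SHUF_OLD(d, c, b, a) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))

	/* New convention (as inferred): selectors in little-endian lane
	 * order, lane 0 first; same immediate, arguments reversed. */
	#define SHUF_NEW(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))

	int main(void)
	{
	  /* Pairs rewritten in the diff above encode identical immediates. */
	  assert(SHUF_OLD(3, 1, 2, 0) == SHUF_NEW(0, 2, 1, 3));
	  assert(SHUF_OLD(2, 1, 0, 3) == SHUF_NEW(3, 0, 1, 2));
	  assert(SHUF_OLD(2, 3, 3, 3) == SHUF_NEW(3, 3, 3, 2));
	  return 0;
	}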