ENDFUNC
+FUNC(chacha_multi_i386_sse2)
+ // Process ChaCha blocks in chunks of four using SSE2.
+ //
+ // Arguments are on the stack:
+ //
+ // [sp + 4] pointer to state
+ // [sp + 8] input pointer (or null)
+ // [sp + 12] output pointer
+ // [sp + 16] number of blocks to process
+ // [sp + 20] number of rounds per block
+ //
+ // Stack frame: four 64-byte working blocks.  Block i keeps its rows
+ // at [SP + 64*i + 0/16/32/48] for a/b/c/d respectively.  Until they
+ // are overwritten below, the b/c/d slots of block 0 temporarily hold
+ // the rows of the initial state.
+ //
+ // FIXME: this function is unfinished.  Only the first half of the
+ // first quarter-round is implemented; the remainder of the round
+ // loop, the output stage, the short-input path at label `8' and the
+ // epilogue (frame release, register restore, return) are missing.
+
+ pushreg SI
+ pushreg DI
+ pushreg BX
+ stalloc 4*64
+ endprologue
+
+ // Load the arguments.  Their offsets are shifted by the frame
+ // (256 bytes) plus the three saved registers (12 bytes).
+ mov BX, [SP + 272] // = state pointer
+ mov SI, [SP + 276] // = source pointer
+ mov DI, [SP + 280] // = destination pointer
+ mov CX, [SP + 284] // = block count
+ mov DX, [SP + 288] // = (initial) round count
+
+ // Do chunks of four blocks at a time.  If fewer than four blocks
+ // remain then the subtraction borrows and we leave for the tail.
+ // FIXME: label `8' is not defined yet.
+ sub CX, 4
+ jb 8f
+
+ // Inhale the initial state: xmm1 = a, xmm3 = b, xmm5 = c, xmm0 = d.
+ movdqu xmm1, [BX + 0]
+ movdqu xmm3, [BX + 16]
+ movdqu xmm5, [BX + 32]
+ movdqu xmm0, [BX + 48]
+
+ // Set the counters and initialize the working blocks.  Build the
+ // offsets 1, 2, 3, 4 in xmm2, xmm4, xmm6, xmm7.
+ //
+ // NOTE(review): `pinsrw ..., 4' places the offset in the upper
+ // 64-bit lane of the row; confirm that this is where the block
+ // counter lives in this project's state layout.
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ xor eax, eax
+ mov al, 1
+ pinsrw xmm2, eax, 4
+ mov al, 2
+ pinsrw xmm4, eax, 4
+ mov al, 3
+ pinsrw xmm6, eax, 4
+ mov al, 4
+ pinsrw xmm7, eax, 4
+
+ // Save the initial b, c, d rows; the registers are reused below.
+ // NOTE(review): `movdqa' needs SP to be 16-byte aligned here, which
+ // depends on the caller's stack alignment -- confirm.
+ movdqa [SP + 16], xmm3
+ movdqa [SP + 32], xmm5
+ movdqa [SP + 48], xmm0
+
+ // Form the d rows for blocks 1, 2, 3 (block 0 uses xmm0 unchanged),
+ // and the d row for the next chunk in xmm7.  The counter lives in
+ // row d, so the offsets must be added to xmm0, not to b in xmm3.
+ paddq xmm2, xmm0
+ paddq xmm4, xmm0
+ paddq xmm6, xmm0
+ paddq xmm7, xmm0
+
+ // Write the advanced counter row back to the state.
+ movdqu [BX + 48], xmm7
+
+ // a += b; d ^= a; d <<<= 16
+ paddd xmm1, xmm3 // a += b
+
+ // a is still identical in all four blocks at this point.
+ // FIXME: only block 0's a slot is written here.
+ movdqa [SP + 0], xmm1
+
+ pxor xmm0, xmm1 // d ^= a
+ pxor xmm2, xmm1
+ pxor xmm4, xmm1
+ pxor xmm6, xmm1
+
+ movdqa xmm1, xmm0 // copies for the right-shifted half
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ movdqa xmm7, xmm6
+
+ pslld xmm0, 16 // d << 16
+ pslld xmm2, 16
+ pslld xmm4, 16
+ pslld xmm6, 16
+
+ psrld xmm1, 16 // d >> 16
+ psrld xmm3, 16
+ psrld xmm5, 16
+ psrld xmm7, 16
+
+ // Combine the halves; as temporaries become free, reload the
+ // initial c (into xmm1) and b (into xmm3) for the next step.
+ por xmm0, xmm1 // d <<<= 16
+ movdqa xmm1, [SP + 32] // = initial c
+ por xmm2, xmm3
+ movdqa xmm3, [SP + 16] // = initial b
+ por xmm4, xmm5
+ por xmm6, xmm7
+
+ // Store each block's d row.
+ movdqa [SP + 48], xmm0
+ movdqa [SP + 112], xmm2
+ movdqa [SP + 176], xmm4
+ movdqa [SP + 240], xmm6
+
+ // c += d; b ^= c; b <<<= 12
+ paddd xmm0, xmm1 // c += d
+ paddd xmm2, xmm1
+ paddd xmm4, xmm1
+ paddd xmm6, xmm1
+
+ // Store each block's own c row.
+ movdqa [SP + 32], xmm0
+ movdqa [SP + 96], xmm2
+ movdqa [SP + 160], xmm4
+ movdqa [SP + 224], xmm6
+
+ pxor xmm0, xmm3 // b ^= c
+ pxor xmm2, xmm3
+ pxor xmm4, xmm3
+ pxor xmm6, xmm3
+
+ movdqa xmm1, xmm0 // copies for the right-shifted half
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ movdqa xmm7, xmm6
+
+ pslld xmm0, 12 // b << 12
+ pslld xmm2, 12
+ pslld xmm4, 12
+ pslld xmm6, 12
+
+ psrld xmm1, 20 // b >> 20
+ psrld xmm3, 20
+ psrld xmm5, 20
+ psrld xmm7, 20
+
+ por xmm0, xmm1 // b <<<= 12
+ por xmm2, xmm3
+ por xmm4, xmm5
+ por xmm6, xmm7
+
+ // FIXME: the rotated b rows in xmm0, xmm2, xmm4, xmm6 are not yet
+ // stored, and control falls off the end of the function: there is
+ // no frame release, register restore or return.
+
+ENDFUNC
+
///----- That's all, folks --------------------------------------------------