cmp edi, esi
jb 0b
- // Deal with the tail end.
+ // Deal with the tail end. Note that the actual destination length
+ // won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
7: add [edi], eax // add eax into the 32-bit word at [edi]; sets CF on overflow
mov eax, 0 // clear eax; `mov` (unlike `xor`) leaves the flags untouched
// NOTE(review): nothing visible here consumes CF from the `add [edi], eax`
// above before `add edi, 4` overwrites it, and eax is now zero, so every
// later pass adds zero.  Presumably an `adc eax, 0` belongs right after the
// `mov` so the carry ripples into the next word -- confirm against the
// complete file before changing anything.
add edi, 4 // advance to the next 32-bit word
cmp edi, edx
jb 7b // loop while edi is (unsigned) below the limit in edx
- // All done for this iteration. Start the next. (This must have at
- // least one follow-on iteration, or we'd not have started this outer
- // loop.)
+ // All done for this iteration. Start the next.
8: mov edi, [SP + 0] // -> dv[i - 1]
mov ebx, [BP + 28] // -> X = nv[0]
lea edx, [SP + 44] // -> space for Y
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
+ pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
+ pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
+ pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
+ pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
+ pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
+ pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]