* The carry loop is wrong if the destination length is an exact multiple
of four limbs. Fortunately, it never is.
* The initial pass feeds into the main loop unconditionally, unlike
`mpxmont_mul4_...' (from which I think the commentary was
uncritically copied), so being at the end of it doesn't tell you
anything about whether to start another. And, indeed, we do
check the loop-end condition.
- // Deal with the tail end.
+ // Deal with the tail end. Note that the actual destination length
+ // won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
7: add [rdi], C
mov C, 0
add rdi, 4
7: add [rdi], C
mov C, 0
add rdi, 4
- // All done for this iteration. Start the next. (This must have at
- // least one follow-on iteration, or we'd not have started this outer
- // loop.)
+ // All done for this iteration. Start the next.
8: mov rdi, DV // -> Z = dv[i]
mov rbx, NV // -> X = nv[0]
cmp rdi, DVLO // all done yet?
8: mov rdi, DV // -> Z = dv[i]
mov rbx, NV // -> X = nv[0]
cmp rdi, DVLO // all done yet?
- // Deal with the tail end.
+ // Deal with the tail end. Note that the actual destination length
+ // won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
7: add [edi], eax
mov eax, 0
add edi, 4
7: add [edi], eax
mov eax, 0
add edi, 4
- // All done for this iteration. Start the next. (This must have at
- // least one follow-on iteration, or we'd not have started this outer
- // loop.)
+ // All done for this iteration. Start the next.
8: mov edi, [SP + 0] // -> dv[i - 1]
mov ebx, [BP + 28] // -> X = nv[0]
lea edx, [SP + 44] // -> space for Y
8: mov edi, [SP + 0] // -> dv[i - 1]
mov ebx, [BP + 28] // -> X = nv[0]
lea edx, [SP + 44] // -> space for Y