cmp rdi, DVL4
jb 0b
- // Deal with the tail end.
+ // Deal with the tail end. Note that the actual destination length
+ // won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
7: add [rdi], C
mov C, 0
add rdi, 4
cmp rdi, DVL
jb 7b
- // All done for this iteration. Start the next. (This must have at
- // least one follow-on iteration, or we'd not have started this outer
- // loop.)
+ // All done for this iteration. Start the next.
8: mov rdi, DV // -> Z = dv[i]
mov rbx, NV // -> X = nv[0]
cmp rdi, DVLO // all done yet?