Previously, I waited until `rdi' was set up for the next iteration
before comparing it against the limit. But in fact, `rdi' is just a
copy of `DV', so `DV' already has the right value and we can compare
it against the limit earlier, before loading `rdi'.
jb 7b
// All done for this iteration. Start the next.
jb 7b
// All done for this iteration. Start the next.
-8: mov rdi, DV // -> Z = dv[i]
- mov rbx, NV // -> X = nv[0]
- cmp rdi, DVLO // all done yet?
+ cmp DV, DVLO // all done yet?
+ mov rdi, DV // -> Z = dv[i]
+ mov rbx, NV // -> X = nv[0]
add DV, 16
call mont4
add rdi, 16
add DV, 16
call mont4
add rdi, 16