X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/92edc356a312bc64abca0c30bc03d4b6676f3d39..f79c8756d054b2979f79d8277affb988c4f39c49:/math/mpx-mul4-amd64-sse2.S diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index a37aba69..da3e6d61 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -1155,7 +1155,7 @@ FUNC(mpxmont_redc4_amd64_sse2) // outer loop dv r10 rcx // outer loop dv limit r11 r11 // nv base rdx r8 - // nv limit r9 r12* + // nv limit r9 r10* // n rcx r9 // c rcx r9 @@ -1183,14 +1183,13 @@ FUNC(mpxmont_redc4_amd64_sse2) # define DV rcx # define DVLO r11 # define NV r8 -# define NVL r12 +# define NVL r10 # define N r9 # define C r9d pushreg rbx pushreg rdi - pushreg r12 - stalloc 160 + stalloc 168 savexmm xmm6, 0 savexmm xmm7, 16 @@ -1260,7 +1259,9 @@ FUNC(mpxmont_redc4_amd64_sse2) cmp rdi, DVL4 jb 0b - // Deal with the tail end. + // Deal with the tail end. Note that the actual destination length + // won't be an exact number of blocks of four, so it's safe to just + // drop through here. 7: add [rdi], C mov C, 0 add rdi, 4 @@ -1268,9 +1269,7 @@ FUNC(mpxmont_redc4_amd64_sse2) cmp rdi, DVL jb 7b - // All done for this iteration. Start the next. (This must have at - // least one follow-on iteration, or we'd not have started this outer - // loop.) + // All done for this iteration. Start the next. 8: mov rdi, DV // -> Z = dv[i] mov rbx, NV // -> X = nv[0] cmp rdi, DVLO // all done yet? @@ -1300,8 +1299,7 @@ FUNC(mpxmont_redc4_amd64_sse2) rstrxmm xmm14, 128 rstrxmm xmm15, 144 - stfree 160 - popreg r12 + stfree 168 popreg rdi popreg rbx #endif