From aa80ad5e5a5363c53db3a63793cc10849411c1bf Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Sat, 27 Oct 2018 10:43:24 +0100
Subject: [PATCH] math/mpx-mul4-*-sse2.S (squash): We don't care about the top
 half of c3 here.

The previous version of the comment erroneously claimed that the top
half of c3 held y_1; in fact it holds y_2, but we'll clobber it anyway
because the objective is to carry up into y_1, so mark it as don't-care
(like lo).
---
 math/mpx-mul4-amd64-sse2.S | 2 +-
 math/mpx-mul4-x86-sse2.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index d8f54e1f..84f9e3fe 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -260,7 +260,7 @@
 	// Finally extract the answer. This complicated dance is better than
 	// storing to memory and loading, because the piecemeal stores
 	// inhibit store forwarding.
-	movdqa \c3, \t // (y_0, y_1)
+	movdqa \c3, \t // (y_0, ?)
 	movdqa \lo, \t // (y^*_0, ?, ?, ?)
 	psrldq \t, 8 // (y_2, 0)
 	psrlq \c3, 32 // (floor(y_0/B), ?)
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index cdc35967..ee741d21 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -260,7 +260,7 @@
 	// Finally extract the answer. This complicated dance is better than
 	// storing to memory and loading, because the piecemeal stores
 	// inhibit store forwarding.
-	movdqa \c3, \t // (y_0, y_1)
+	movdqa \c3, \t // (y_0, ?)
 	movdqa \lo, \t // (y^*_0, ?, ?, ?)
 	psrldq \t, 8 // (y_2, 0)
 	psrlq \c3, 32 // (floor(y_0/B), ?)
-- 
2.11.0