// the input matrix isn't likely to be properly aligned.
//
// [ 0  1  2  3] (a, xmm0)
// [ 4  5  6  7] (b, xmm1)
// [ 8  9 10 11] (c, xmm2)
// [12 13 14 15] (d, xmm3)
movdqu xmm0, [edx + 0]
movdqu xmm1, [edx + 16]
movdqu xmm2, [edx + 32]