X-Git-Url: https://git.distorted.org.uk/~mdw/xchg-rax-rax/blobdiff_plain/90c4eee32bd52dbdba4913a5030c7b27cffaa103..636f688ac516d2d2bed4e0e8836a7f0720bd9958:/xchg.S diff --git a/xchg.S b/xchg.S index 0dcb6c8..40c5ba9 100644 --- a/xchg.S +++ b/xchg.S @@ -380,18 +380,24 @@ proc call_example #elif defined(__aarch64__) - stp x29, x30, [sp, #-13*8]! + stp x29, x30, [sp, #-14*8]! mov x29, sp stp x19, x20, [sp, #16] stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp x25, x26, [sp, #64] stp x27, x28, [sp, #80] - str x1, [sp, #96] - - mov x16, x0 - - ldr x17, [x1, #128] + str x1, [sp, #104] + + ldp x29, x30, [x1, #224] + msr nzcv, x30 + mov x30, x0 + ldp x27, x28, [x1, #208] + ldp x25, x26, [x1, #192] + ldp x23, x24, [x1, #176] + ldp x21, x22, [x1, #160] + ldp x19, x20, [x1, #144] + ldp x16, x17, [x1, #128] ldp x14, x15, [x1, #112] ldp x12, x13, [x1, #96] ldp x10, x11, [x1, #80] @@ -400,28 +406,36 @@ proc call_example ldp x4, x5, [x1, #32] ldp x2, x3, [x1, #16] ldp x0, x1, [x1, #0] - msr nzcv, x17 - - blr x16 - - ldr x16, [sp, #96] - mrs x17, nzcv - str x17, [x16, #128] - stp x14, x15, [x16, #112] - stp x12, x13, [x16, #96] - stp x10, x11, [x16, #80] - stp x8, x9, [x16, #64] - stp x6, x7, [x16, #48] - stp x4, x5, [x16, #32] - stp x2, x3, [x16, #16] - stp x0, x1, [x16, #0] + + blr x30 + + ldr x30, [sp, #104] + stp x27, x28, [x30, #208] + stp x25, x26, [x30, #192] + stp x23, x24, [x30, #176] + stp x21, x22, [x30, #160] + stp x19, x20, [x30, #144] + stp x16, x17, [x30, #128] + stp x14, x15, [x30, #112] + stp x12, x13, [x30, #96] + stp x10, x11, [x30, #80] + stp x8, x9, [x30, #64] + stp x6, x7, [x30, #48] + stp x4, x5, [x30, #32] + stp x2, x3, [x30, #16] + stp x0, x1, [x30, #0] + mov x0, x30 + mrs x30, nzcv + stp x29, x30, [x0, #224] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp x25, x26, [sp, #64] ldp x27, x28, [sp, #80] - ldp x29, x30, [sp], #13*8 + ldp x29, x30, [sp], #14*8 + + ret #else # error "not supported" @@ -2736,50 +2750,101 @@ endproc 
proc x26

+        // rotate a right by 7: b' = easy way, a' = bad way; d is scratch
+
 #if defined(__x86_64__)

-        notimpl
+        mov     rbx, rax                // b' = a
+        ror     rbx, 7                  // b' = a >>> 7 (better)
+
+        mov     rdx, rax                // d' = a
+        shr     rax, 7                  // a' = a >> 7
+        shl     rdx, 0x39               // d' = a << 57 (57 = 64 - 7)
+        or      rax, rdx                // a' = a >>> 7

 #elif defined(__i386__)

-        notimpl
+        mov     ebx, eax                // b' = a
+        ror     ebx, 7                  // b' = a >>> 7 (better)
+
+        mov     edx, eax                // d' = a
+        shr     eax, 7                  // a' = a >> 7
+        shl     edx, 0x19               // d' = a << 25 (25 = 32 - 7)
+        or      eax, edx                // a' = a >>> 7

 #elif defined(__arm__)

-        notimpl
+        mov     r1, r0, ror #7          // b' = a >>> 7 (easy way)
+
+        // even the hard way is fairly easy on arm: the shifts are free
+        mov     r3, r0, lsl #25         // d' = a << 25 (25 = 32 - 7)
+        orr     r0, r3, r0, lsr #7      // a' = a >>> 7 (hard way)

 #elif defined(__aarch64__)

-        notimpl
+        ror     x1, x0, #7              // b' = a >>> 7 (easy way)
+
+        // even the hard way is fairly easy on arm: the shifts are free
+        lsl     x3, x0, #57             // d' = a << 57 (57 = 64 - 7)
+        orr     x0, x3, x0, lsr #7      // a' = a >>> 7 (hard way)

 #else
         notimpl
 #endif

+        ret
+
 endproc

 proc x27

+        // shift a right by c places, in two halves: floor(c/2), ceil(c/2)
+
 #if defined(__x86_64__)

-        notimpl
+        mov     ch, cl                  // c' = [c, c]
+        inc     ch                      // c' = [c, c + 1]
+        shr     ch, 1                   // c' = [c, ceil(c/2)]
+        shr     cl, 1                   // c' = [floor(c/2), ceil(c/2)]
+        shr     rax, cl                 // a' = a >> floor(c/2)
+        xchg    ch, cl                  // c' = [ceil(c/2), floor(c/2)]
+        shr     rax, cl                 // a' = (a >> floor(c/2)) >> ceil(c/2)

 #elif defined(__i386__)

-        notimpl
+        mov     ch, cl                  // c' = [c, c]
+        inc     ch                      // c' = [c, c + 1]
+        shr     ch, 1                   // c' = [c, ceil(c/2)]
+        shr     cl, 1                   // c' = [floor(c/2), ceil(c/2)]
+        shr     eax, cl                 // a' = a >> floor(c/2)
+        xchg    ch, cl                  // c' = [ceil(c/2), floor(c/2)]
+        shr     eax, cl                 // a' = (a >> floor(c/2)) >> ceil(c/2)

 #elif defined(__arm__)

-        notimpl
+        // it would be clearer and more efficient to say: `mov r12, r2, lsr
+        // #1; sub r2, r2, r12', but that's not the lesson this exercise is
+        // trying to teach.
+        add     r12, r2, #1             // r12 = c + 1
+        mov     r2, r2, lsr #1          // c' = floor(c/2)
+        mov     r12, r12, lsr #1        // r12 = ceil(c/2)
+        mov     r0, r0, lsr r2          // a' = a >> floor(c/2)
+        mov     r0, r0, lsr r12         // a' = (a >> floor(c/2)) >> ceil(c/2)

 #elif defined(__aarch64__)

-        notimpl
+        add     w16, w2, #1             // w16 = c + 1
+        lsr     w2, w2, #1              // c' = floor(c/2)
+        lsr     w16, w16, #1            // w16 = ceil(c/2)
+        lsr     x0, x0, x2              // a' = a >> floor(c/2)
+        lsr     x0, x0, x16             // a' = (a >> floor(c/2)) >> ceil(c/2)

 #else
         notimpl
 #endif

+        ret
+
 endproc

 proc x28