stp x23, x24, [sp, #48]
stp x25, x26, [sp, #64]
stp x27, x28, [sp, #80]
- str x1, [sp, #96]
-
- mov x16, x0
-
- ldr x17, [x1, #128]
+ str x1, [sp, #104]
+
+ ldp x29, x30, [x1, #224]
+ msr nzcv, x30
+ mov x30, x0
+ ldp x27, x28, [x1, #208]
+ ldp x25, x26, [x1, #192]
+ ldp x23, x24, [x1, #176]
+ ldp x21, x22, [x1, #160]
+ ldp x19, x20, [x1, #144]
+ ldp x16, x17, [x1, #128]
ldp x14, x15, [x1, #112]
ldp x12, x13, [x1, #96]
ldp x10, x11, [x1, #80]
ldp x4, x5, [x1, #32]
ldp x2, x3, [x1, #16]
ldp x0, x1, [x1, #0]
- msr nzcv, x17
-
- blr x16
-
- ldr x16, [sp, #96]
- mrs x17, nzcv
- str x17, [x16, #128]
- stp x14, x15, [x16, #112]
- stp x12, x13, [x16, #96]
- stp x10, x11, [x16, #80]
- stp x8, x9, [x16, #64]
- stp x6, x7, [x16, #48]
- stp x4, x5, [x16, #32]
- stp x2, x3, [x16, #16]
- stp x0, x1, [x16, #0]
+
+ blr x30
+
+ ldr x30, [sp, #104]
+ stp x27, x28, [x30, #208]
+ stp x25, x26, [x30, #192]
+ stp x23, x24, [x30, #176]
+ stp x21, x22, [x30, #160]
+ stp x19, x20, [x30, #144]
+ stp x16, x17, [x30, #128]
+ stp x14, x15, [x30, #112]
+ stp x12, x13, [x30, #96]
+ stp x10, x11, [x30, #80]
+ stp x8, x9, [x30, #64]
+ stp x6, x7, [x30, #48]
+ stp x4, x5, [x30, #32]
+ stp x2, x3, [x30, #16]
+ stp x0, x1, [x30, #0]
+ mov x0, x30
+ mrs x30, nzcv
+ stp x29, x30, [x0, #224]
ldp x19, x20, [sp, #16]
ldp x21, x22, [sp, #32]
proc x26
+ // a bad way to rotate a right by 7 places
+
#if defined(__x86_64__)
- notimpl
+ mov rbx, rax
+ ror rbx, 7 // better
+
+ mov rdx, rax // d' = a
+ shr rax, 7 // a' = a >> 7
+ shl rdx, 0x39 // d' = a << 57 (64 - 7)
+ or rax, rdx // a' = a >>> 7
#elif defined(__i386__)
- notimpl
+ mov ebx, eax
+ ror ebx, 7 // better
+
+ mov edx, eax // d' = a
+ shr eax, 7 // a' = a >> 7
+ shl edx, 0x19 // d' = a << 25 (32 - 7; NOT 0x39 — 32-bit counts mask to 5 bits)
+ or eax, edx // a' = a >>> 7
#elif defined(__arm__)
- notimpl
+ mov r1, r0, ror #7 // easy way
+
+ // even the hard way is fairly easy on arm
+ mov r3, r0, lsl #25 // r3 = a << (32 - 7)
+ orr r0, r3, r0, lsr #7 // hard way: a' = a >>> 7
#elif defined(__aarch64__)
- notimpl
+ ror x1, x0, #7 // easy way
+
+ // even the hard way is fairly easy on arm
+ lsl x3, x0, #57 // x3 = a << (64 - 7)
+ orr x0, x3, x0, lsr #7 // hard way: a' = a >>> 7
#else
notimpl
#endif
+ ret
+
endproc
proc x27
+ // shift a right by c places, in two halves:
+ // a >>= floor(c/2); a >>= ceil(c/2) — the two counts sum to c.
+ // NOTE(review): splitting presumably also makes c == operand width
+ // behave as a full shift-out (a single shift count is masked by the
+ // CPU) — confirm against the exercise statement.
+
#if defined(__x86_64__)
- notimpl
+ mov ch, cl // c' = [c, c]
+ inc ch // c' = [c, c + 1]
+ shr ch, 1 // ch = ceil(c/2)
+ shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
+ shr rax, cl // a >>= floor(c/2)
+ xchg ch, cl // swap the two half-counts
+ shr rax, cl // a >>= ceil(c/2); total shift = c
#elif defined(__i386__)
- notimpl
+ mov ch, cl // c' = [c, c]
+ inc ch // c' = [c, c + 1]
+ shr ch, 1 // ch = ceil(c/2)
+ shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
+ shr eax, cl // a >>= floor(c/2)
+ xchg ch, cl // swap the two half-counts
+ shr eax, cl // a >>= ceil(c/2); total shift = c
#elif defined(__arm__)
- notimpl
+ // it would be clearer and more efficient to say: `mov r12, r2, lsr
+ // #1; sub r2, r2, r12', but that's not the lesson this exercise is
+ // trying to teach.
+ add r12, r2, #1 // r12 = c + 1
+ mov r2, r2, lsr #1 // r2 = floor(c/2)
+ mov r12, r12, lsr #1 // r12 = ceil(c/2)
+ mov r0, r0, lsr r2 // a >>= floor(c/2)
+ mov r0, r0, lsr r12 // a >>= ceil(c/2); total shift = c
#elif defined(__aarch64__)
- notimpl
+ add w16, w2, #1 // w16 = c + 1
+ lsr w2, w2, #1 // w2 = floor(c/2)
+ lsr w16, w16, #1 // w16 = ceil(c/2)
+ lsr x0, x0, x2 // a >>= floor(c/2)
+ lsr x0, x0, x16 // a >>= ceil(c/2); total shift = c
#else
notimpl
#endif
+ ret
+
endproc
proc x28