xchg.S: More exercises.

author Mark Wooding <mdw@distorted.org.uk>

Mon, 19 Oct 2020 18:02:48 +0000 (19:02 +0100)

committer Mark Wooding <mdw@distorted.org.uk>

Mon, 19 Oct 2020 18:02:48 +0000 (19:02 +0100)
author Mark Wooding <mdw@distorted.org.uk>
Mon, 19 Oct 2020 18:02:48 +0000 (19:02 +0100)
committer Mark Wooding <mdw@distorted.org.uk>
Mon, 19 Oct 2020 18:02:48 +0000 (19:02 +0100)
diff --git a/xchg.S b/xchg.S

index 40c5ba9..b95c665 100644 (file)
--- a/xchg.S
+++ b/xchg.S
@@ -16,6 +16,7 @@
  .endm
  
         .arch   armv7-a
+       .fpu    neon
  
  #elif defined(__aarch64__)
  
@@ -2703,10 +2704,9 @@ proc     x25
  0:     movzx   ebx, cx
         imul    ebx, ebx
  
-       ror     ecx, 0x10
-       movzx   edx, cx
+       mov     edx, ecx
+       shr     edx, 0x10
         imul    edx, edx
-       rol     ecx, 0x10
  
         add     ebx, edx                // see?
         sbb     eax, 0
@@ -2849,194 +2849,590 @@ endproc
  
  proc   x28
  
+       // divide c-byte little-endian bignum at rsi by 2 (rounding down)
+
  #if defined(__x86_64__)
  
-       notimpl
+       clc
+0:     rcr     byte ptr [rsi], 1
+       inc     rsi
+       loop    0b
  
  #elif defined(__i386__)
  
-       notimpl
+       clc
+0:     rcr     byte ptr [esi], 1
+       inc     esi
+       loop    0b
  
  #elif defined(__arm__)
  
-       notimpl
+       // we could hack this a word at a time using rrx
+       mov     r3, #0
+0:     ldrb    r12, [r4]
+       subs    r2, r2, #1
+       orr     r3, r3, r12, lsr #1
+       strb    r3, [r4], #1
+       mov     r3, r12, lsl #7
+       bne     0b
  
  #elif defined(__aarch64__)
  
-       notimpl
+       mov     w16, #0
+0:     ldrb    w17, [x4]
+       sub     x2, x2, #1
+       orr     w16, w16, w17, lsr #1
+       strb    w16, [x4], #1
+       lsl     w16, w17, #7
+       cbnz    x2, 0b
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  proc   x29
  
+       // fill a buffer with a 3-byte pattern
+
  #if defined(__x86_64__)
  
-       notimpl
+       lea     rdi, [rsi + 3]
+       rep movsb
  
  #elif defined(__i386__)
  
-       notimpl
+       lea     edi, [esi + 3]
+       rep movsb
  
  #elif defined(__arm__)
  
-       notimpl
+       add     r5, r4, #3
+0:     subs    r2, r2, #1
+       ldrhsb  r12, [r4], #1
+       strhsb  r12, [r5], #1
+       bhs     0b
  
  #elif defined(__aarch64__)
  
-       notimpl
+       cbz     x2, 9f
+       add     x5, x4, #3
+0:     sub     x2, x2, #1
+       ldrb    w16, [x4], #1
+       strb    w16, [x5], #1
+       cbnz    x2, 0b
+9:
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  proc   x2a
  
+       // rotate the words in a buffer, so that the last word comes first,
+       // the first comes second, and so on.  this isn't a good way to do
+       // it.
+
  #if defined(__x86_64__)
  
-       notimpl
+       mov     rsi, rbx                // set string pointers
+       mov     rdi, rbx
+0:     lodsq                           // fetch next word
+       xchg    rax, qword ptr [rbx]    // stash it for next iteration and
+                                       // replace it with the previously
+                                       // stashed word
+       stosq                           // store in output
+       // (note that the first iteration doesn't actually do anything)
+       loop    0b                      // continue until all done
  
  #elif defined(__i386__)
  
-       notimpl
+       mov     esi, ebx                // set string pointers
+       mov     edi, ebx
+0:     lodsd                           // fetch next word
+       xchg    eax, dword ptr [ebx]    // stash it for next iteration and
+                                       // replace it with the previously
+                                       // stashed word
+       stosd                           // store in output
+       loop    0b                      // continue until all done
  
  #elif defined(__arm__)
  
-       notimpl
+       // let's do this a sensible way.  (we could go faster using ldm/stm.)
+       add     r0, r1, r2, lsl #2      // find the end of the buffer
+       ldr     r0, [r0, #-4]           // collect final element
+0:     subs    r2, r2, #1
+       ldr     r12, [r1]
+       str     r0, [r1], #4
+       mov     r0, r12
+       bne     0b
  
  #elif defined(__aarch64__)
  
-       notimpl
+       add     x0, x1, x2, lsl #3      // find the end of the buffer
+       ldr     x0, [x0, #-8]           // collect final element
+0:     sub     x2, x2, #1
+       ldr     x16, [x1]
+       str     x0, [x1], #8
+       mov     x0, x16
+       cbnz    x2, 0b
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  proc   x2b
  
-#if defined(__x86_64__)
-
-       notimpl
+       // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}
  
-#elif defined(__i386__)
+#if defined(__x86_64__)
  
-       notimpl
+       // this is floyd's cycle-finding algorithm.
+       //
+       // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
+       // s_{i+1} = f(s_i).  since B is finite, there must be some smallest
+       // t and c such that s(t) = s(t + c); then we have s_i = s_j iff
+       // i >= t, j >= t, and i == j (mod c).
+       //
+       // the algorithm sets two cursors advancing through the sequence: a
+       // /tortoise/ which advances one step at a time, and a /hare/ which
+       // advances by two, so when the tortoise is at element s_i, the hare
+       // is at s_{2i}.  the hare will run around the cycle and catch the
+       // tortoise when i >= t and i == 2 i (mod c); the latter is simply i
+       // == 0 (mod c), which therefore happens first when i = k = t +
+       // (-t mod c).
+       //
+       // i'm not sure what good xlatb does here that mov al, [rbx + al]
+       // doesn't.
+
+       xor     eax, eax                // tortoise starts at 0
+       xor     edx, edx                // hare starts at 0
+0:     xlatb                           // advance tortoise
+       xchg    rax, rdx                // switch to hare
+       xlatb                           // advance hare ...
+       xlatb                           // ... twice
+       xchg    rax, rdx                // switch back
+       cmp     al, dl                  // hare caught the tortoise?
+       jnz     0b                      // no -- go around again
+
+       // now we trace the initial tail: reset the tortoise to s_0, and slow
+       // the hare down so that both take only a single step in each
+       // iteration.  this loop terminates when i >= t and i == i + 2 k
+       // (mod c).  we know k is a multiple of c, so the latter condition
+       // always holds, so this finds the first step of the cycle.
+
+       xor     eax, eax                // reset the tortoise
+0:     xlatb                           // advance tortoise
+       xchg    rax, rdx                // switch to hare
+       xlatb                           // advance hare
+       xchg    rax, rdx                // and switch back
+       cmp     al, dl                  // done?
+       jnz     0b                      // no -- iterate
+
+#elif defined(__i386__)
+
+       xor     eax, eax                // tortoise starts at 0
+       xor     edx, edx                // hare starts at 0
+0:     xlatb                           // advance tortoise
+       xchg    eax, edx                // switch to hare
+       xlatb                           // advance hare ...
+       xlatb                           // ... twice
+       xchg    eax, edx                // switch back
+       cmp     al, dl                  // hare caught the tortoise?
+       jnz     0b                      // no -- go around again
+
+       xor     eax, eax                // reset the tortoise
+0:     xlatb                           // advance tortoise
+       xchg    eax, edx                // switch to hare
+       xlatb                           // advance hare
+       xchg    eax, edx                // and switch back
+       cmp     al, dl                  // done?
+       jnz     0b                      // no -- iterate
  
  #elif defined(__arm__)
  
-       notimpl
+       mov     r0, #0
+       mov     r3, #0
+0:     ldrb    r0, [r1, r0]
+       ldrb    r3, [r1, r3]
+       ldrb    r3, [r1, r3]
+       cmp     r0, r3
+       bne     0b
+
+       mov     r0, #0
+0:     ldrb    r0, [r1, r0]
+       ldrb    r3, [r1, r3]
+       cmp     r0, r3
+       bne     0b
  
  #elif defined(__aarch64__)
  
-       notimpl
+       mov     w0, #0
+       mov     w3, #0
+0:     ldrb    w0, [x1, x0]
+       ldrb    w3, [x1, x3]
+       ldrb    w3, [x1, x3]
+       cmp     w0, w3
+       b.ne    0b
+
+       mov     w0, #0
+0:     ldrb    w0, [x1, x0]
+       ldrb    w3, [x1, x3]
+       cmp     w0, w3
+       b.ne    0b
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  proc   x2c
  
+       // a convoluted way to set rax = rsi
+
  #if defined(__x86_64__)
  
-       notimpl
+       mov     qword ptr [rbx + 8*rcx], 0 // b[c] = 0
+       mov     qword ptr [rbx + 8*rdx], 1 // b[d] = 1
+       mov     rax, [rbx + 8*rcx]      // a' = b[c] = 0
+
+       mov     [rbx], rsi              // b[0] = t
+       mov     [rbx + 8], rdi          // b[1] = u
+       mov     rax, [rbx + 8*rax]      // a' = b[a'] = b[0] = t
  
  #elif defined(__i386__)
  
-       notimpl
+       mov     dword ptr [ebx + 8*ecx], 0 // b[c] = 0
+       mov     dword ptr [ebx + 8*edx], 1 // b[d] = 1
+       mov     eax, [ebx + 8*ecx]      // a' = b[c] = 0
+
+       mov     [ebx], esi              // b[0] = t
+       mov     [ebx + 8], edi          // b[1] = u
+       mov     eax, [ebx + 8*eax]      // a' = b[a'] = b[0] = t
  
  #elif defined(__arm__)
  
-       notimpl
+       mov     r0, #0
+       mov     r12, #1
+
+       str     r0, [r1, r2, lsl #2]
+       str     r12, [r1, r3, lsl #2]
+       ldr     r0, [r1, r2, lsl #2]
+
+       str     r4, [r1]
+       str     r5, [r1, #4]
+       ldr     r0, [r1, r0, lsl #2]
  
  #elif defined(__aarch64__)
  
-       notimpl
+       mov     x16, #1
+
+       str     xzr, [x1, x2, lsl #3]
+       str     x16, [x1, x3, lsl #3]
+       ldr     x0, [x1, x2, lsl #3]
+
+       str     x4, [x1]
+       str     x5, [x1, #8]
+       ldr     x0, [x1, x0, lsl #3]
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  proc   x2d
  
+       // clear the least significant set bit in a, by calculating a' =
+       // a AND (a - 1).
+       //
+       // if a = 0 then a' = 0.  otherwise, a - 1 differs from a exactly in
+       // the least significant /set/ bit of a, and all bits of lesser
+       // significance.  to put it another way: write a = u 2^{k+1} + 2^k;
+       // then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1.  taking the
+       // bitwise AND of these leaves only the bits common to both, i.e.,
+       // u 2^{k+1}.
+
  #if defined(__x86_64__)
  
-       notimpl
+       mov     rdx, rax                // d' = a
+       dec     rax                     // a' = a - 1
+       and     rax, rdx                // a' = a AND (a - 1)
  
  #elif defined(__i386__)
  
-       notimpl
+       mov     edx, eax                // d' = a
+       dec     eax                     // a' = a - 1
+       and     eax, edx                // a' = a AND (a - 1)
  
  #elif defined(__arm__)
  
-       notimpl
+       sub     r3, r0, #1
+       and     r0, r0, r3
  
  #elif defined(__aarch64__)
  
-       notimpl
+       sub     x3, x0, #1
+       and     x0, x0, x3
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  proc   x2e
  
+       // compute a mask of one bits in exactly the positions of the
+       // low-order run of zero bits in a
+
  #if defined(__x86_64__)
  
-       notimpl
+       mov     rdx, rax                // d' = a
+       dec     rdx                     // d' = a - 1
+       xor     rax, rdx                // a = a XOR (a - 1)
+                                       //   set bits are least significant
+                                       //   set bit of a, and all bits of
+                                       //   lesser significance
+       shr     rax, 1                  // now only bits of lesser
+                                       //   significance; a' = 0 iff a odd
+       cmp     rax, rdx                // equal if a = 0 or 2^k; otherwise
+                                       //   strictly less
  
  #elif defined(__i386__)
  
-       notimpl
+       mov     edx, eax
+       dec     edx
+       xor     eax, edx
+       shr     eax, 1
+       cmp     eax, edx
  
  #elif defined(__arm__)
  
-       notimpl
+       sub     r3, r0, #1
+       eor     r0, r0, r3
+       mov     r0, r0, lsr #1          // probably fold shift into next inst
+       cmp     r0, r3
  
  #elif defined(__aarch64__)
  
-       notimpl
+       sub     x3, x0, #1
+       eor     x0, x0, x3
+       mov     x0, x0, lsr #1          // probably fold shift into next inst
+       cmp     x0, x3
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  proc   x2f
  
+       // a slow population count
+
  #if defined(__x86_64__)
  
-       notimpl
+       popcnt  rbx, rcx                // the easy way
+
+       // a fast version in software
+       mov     rax, rcx
+
+       mov     rdx, rcx
+       shr     rdx, 1
+       mov     rsi, 0x5555555555555555
+       and     rax, rsi
+       and     rdx, rsi
+       add     rax, rdx
+
+       mov     rdx, rax
+       shr     rdx, 2
+       mov     rsi, 0x3333333333333333
+       and     rax, rsi
+       and     rdx, rsi
+       add     rax, rdx
+
+       mov     rdx, rax
+       shr     rdx, 32
+       add     rax, rdx
+
+       mov     rdx, rax
+       shr     rdx, 4
+       and     rax, 0x0f0f0f0f
+       and     rdx, 0x0f0f0f0f
+       add     rax, rdx
+
+       mov     rdx, rax
+       shr     rdx, 8
+       add     rax, rdx
+
+       mov     rdx, rax
+       shr     rdx, 16
+       add     rax, rdx
+       movzx   rsi, al
+
+       // the official version
+       xor     eax, eax                // clear iteration counter
+0:     jrcxz   9f                      // bail if c = 0
+       inc     rax                     // bump iteration count
+       mov     rdx, rcx                // d' = c
+       dec     rdx                     // d' = c - 1
+       and     rcx, rdx                // zap least significant set bit of c
+       jmp     0b                      // and go again
+9:
  
  #elif defined(__i386__)
  
-       notimpl
+       popcnt  ebx, ecx                // the easy way
+
+       mov     eax, ecx
+
+       mov     edx, ecx
+       shr     edx, 1
+       and     eax, 0x55555555
+       and     edx, 0x55555555
+       add     eax, edx
+
+       mov     edx, eax
+       shr     edx, 2
+       and     eax, 0x33333333
+       and     edx, 0x33333333
+       add     eax, edx
+
+       mov     edx, eax
+       shr     edx, 4
+       add     eax, edx
+
+       mov     edx, eax
+       shr     edx, 8
+       and     eax, 0x000f000f
+       and     edx, 0x000f000f
+       add     eax, edx
+
+       mov     edx, eax
+       shr     edx, 16
+       add     eax, edx
+       movzx   esi, al
+
+       xor     eax, eax
+0:     jecxz   9f
+       inc     eax
+       mov     edx, ecx
+       dec     edx
+       and     ecx, edx
+       jmp     0b
+9:
  
  #elif defined(__arm__)
  
-       notimpl
+       // the easy-ish way
+       vmov    d0[0], r2
+       vcnt.8  d0, d0
+       vmov    r1, d0[0]
+       add     r1, r1, r1, lsl #8
+       add     r1, r1, r1, lsl #16
+       mov     r1, r1, lsr #24
+
+       // the hard way
+       movw    r12, #0x5555
+       movt    r12, #0x5555
+       and     r3, r12, r2, lsr #1
+       and     r0, r12, r2
+       add     r0, r0, r3
+
+       movw    r12, #0x3333
+       movt    r12, #0x3333
+       and     r3, r12, r0, lsr #2
+       and     r0, r12, r0
+       add     r0, r0, r3
+
+       add     r0, r0, r0, lsl #16
+
+       movt    r12, 0x0f0f
+       and     r3, r12, r0, lsr #4
+       and     r0, r12, r0
+       add     r0, r0, r3
+
+       add     r0, r0, r0, lsl #8
+
+       mov     r4, r0, lsr #24
+
+       // and following the exercise
+       mov     r0, #0
+       cmp     r2, #0
+       beq     9f
+0:     add     r0, r0, #1
+       sub     r3, r2, #1
+       ands    r2, r2, r3
+       bne     0b
+9:
  
  #elif defined(__aarch64__)
  
-       notimpl
+       // the easy-ish way
+       mov     v0.d[0], x2
+       cnt     v0.8b, v0.8b
+       mov     x1, v0.d[0]
+       add     x1, x1, x1, lsl #8
+       add     x1, x1, x1, lsl #16
+       add     x1, x1, x1, lsl #32
+       lsr     x1, x1, #56
+
+       // the hard way -- though arm64's immediate constant encodings and
+       // shifting make this actually rather pleasant.
+       and     x3, x2, #0xaaaaaaaaaaaaaaaa
+       and     x0, x2, #0x5555555555555555
+       add     x0, x0, x3, lsr #1
+
+       and     x3, x0, #0xcccccccccccccccc
+       and     x0, x0, #0x3333333333333333
+       add     x0, x0, x3, lsr #2
+
+       add     x0, x0, x0, lsr #4
+
+       and     x3, x0, #0x0f000f000f000f00
+       and     x0, x0, #0x000f000f000f000f
+       add     x0, x3, x0, lsl #8
+
+       add     x0, x0, x0, lsl #16
+       add     x0, x0, x0, lsl #32
+       lsr     x4, x0, #56
+
+       // and the official way
+       mov     x0, #0
+       cbz     x2, 9f
+0:     add     x0, x0, #1
+       sub     x3, x2, #1
+       and     x2, x2, x3
+       cbnz    x2, 0b
+9:
  
  #else
         notimpl
  #endif
  
+       ret
+
  endproc
  
  ///--------------------------------------------------------------------------
author	Mark Wooding <mdw@distorted.org.uk>
	Mon, 19 Oct 2020 18:02:48 +0000 (19:02 +0100)
committer	Mark Wooding <mdw@distorted.org.uk>
	Mon, 19 Oct 2020 18:02:48 +0000 (19:02 +0100)