From 99fe70cb14a9586fcea77bfea14cb931257807a8 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Mon, 19 Oct 2020 19:02:48 +0100 Subject: [PATCH] xchg.S: More exercises. --- xchg.S | 472 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 434 insertions(+), 38 deletions(-) diff --git a/xchg.S b/xchg.S index 40c5ba9..b95c665 100644 --- a/xchg.S +++ b/xchg.S @@ -16,6 +16,7 @@ .endm .arch armv7-a + .fpu neon #elif defined(__aarch64__) @@ -2703,10 +2704,9 @@ proc x25 0: movzx ebx, cx imul ebx, ebx - ror ecx, 0x10 - movzx edx, cx + mov edx, ecx + shr edx, 0x10 imul edx, edx - rol ecx, 0x10 add ebx, edx // see? sbb eax, 0 @@ -2849,194 +2849,590 @@ endproc proc x28 + // divide c-byte little-endian bignum at rsi by 2 (rounding down) + #if defined(__x86_64__) - notimpl + clc +0: rcr byte ptr [rsi], 1 + inc rsi + loop 0b #elif defined(__i386__) - notimpl + clc +0: rcr byte ptr [esi], 1 + inc esi + loop 0b #elif defined(__arm__) - notimpl + // we could hack this a word at a time using rrx + mov r3, #0 +0: ldrb r12, [r4] + subs r2, r2, #1 + orr r3, r3, r12, lsr #1 + strb r3, [r4], #1 + mov r3, r12, lsl #7 + bne 0b #elif defined(__aarch64__) - notimpl + mov w16, #0 +0: ldrb w17, [x4] + sub x2, x2, #1 + orr w16, w16, w17, lsr #1 + strb w16, [x4], #1 + lsl w16, w17, #7 + cbnz x2, 0b #else notimpl #endif + ret + endproc proc x29 + // fill a buffer with a 3-byte pattern + #if defined(__x86_64__) - notimpl + lea rdi, [rsi + 3] + rep movsb #elif defined(__i386__) - notimpl + lea edi, [esi + 3] + rep movsb #elif defined(__arm__) - notimpl + add r5, r4, #3 +0: subs r2, r2, #1 + ldrhsb r12, [r4], #1 + strhsb r12, [r5], #1 + bhs 0b #elif defined(__aarch64__) - notimpl + cbz x2, 9f + add x5, x4, #3 +0: sub x2, x2, #1 + ldrb w16, [x4], #1 + strb w16, [x5], #1 + cbnz x2, 0b +9: #else notimpl #endif + ret + endproc proc x2a + // rotate the words in a buffer, so that the last word comes first, + // the first comes second, and so on. this isn't a good way to do + // it. + #if defined(__x86_64__) - notimpl + mov rsi, rbx // set string pointers + mov rdi, rbx +0: lodsq // fetch next word + xchg rax, qword ptr [rbx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosq // store in output + // (note that the first iteration doesn't actually do anything) + loop 0b // continue until all done #elif defined(__i386__) - notimpl + mov esi, ebx // set string pointers + mov edi, ebx +0: lodsd // fetch next word + xchg eax, dword ptr [ebx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosd // store in output + loop 0b // continue until all done #elif defined(__arm__) - notimpl + // let's do this a sensible way. (we could go faster using ldm/stm.) + add r0, r1, r2, lsl #2 // find the end of the buffer + ldr r0, [r0, #-4] // collect final element +0: subs r2, r2, #1 + ldr r12, [r1] + str r0, [r1], #4 + mov r0, r12 + bne 0b #elif defined(__aarch64__) - notimpl + add x0, x1, x2, lsl #3 // find the end of the buffer + ldr x0, [x0, #-8] // collect final element +0: sub x2, x2, #1 + ldr x16, [x1] + str x0, [x1], #8 + mov x0, x16 + cbnz x2, 0b #else notimpl #endif + ret + endproc proc x2b -#if defined(__x86_64__) - - notimpl + // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255} -#elif defined(__i386__) +#if defined(__x86_64__) - notimpl + // this is floyd's cycle-finding algorithm. + // + // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ..., + // s_{i+1} = f(s_i). 
since B is finite, there must be some smallest + // t and c such that s(t) = s(t + c); then we have s_i = s_j iff + // i >= t, j >= t, and i == j (mod c). + // + // the algorithm sets two cursors advancing through the sequence: a + // /tortoise/ which advances one step at a time, and a /hare/ which + // advances by two, so when the tortoise is at element s_i, the hare + // is at s_{2i}. the hare will run around the cycle and catch the + // tortoise when i >= t and i == 2 i (mod c); the latter is simply i + // == 0 (mod c), which therefore happens first when i = k = t + + // (-t mod c). + // + // i'm not sure what good xlatb does here that mov al, [rbx + al] + // doesn't. + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg rax, rdx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + // now we trace the initial tail: reset the tortoise to s_0, and slow + // the hare down so that both take only a single step in each + // iteration. this loop terminates when i >= t and i == i + 2 k + // (mod c). we know k is a multiple of c, so the latter condition + // always holds, so this finds the first step of the cycle. + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare + xchg rax, rdx // and switch back + cmp al, dl // done? + jnz 0b // no -- iterate + +#elif defined(__i386__) + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg eax, edx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare + xchg eax, edx // and switch back + cmp al, dl // done? 
+ jnz 0b // no -- iterate #elif defined(__arm__) - notimpl + mov r0, #0 + mov r3, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b + + mov r0, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b #elif defined(__aarch64__) - notimpl + mov w0, #0 + mov w3, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b + + mov w0, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b #else notimpl #endif + ret + endproc proc x2c + // a convoluted way to set rax = rsi + #if defined(__x86_64__) - notimpl + mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0 + mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1 + mov rax, [rbx + 8*rcx] // a' = b[c] = 0 + + mov [rbx], rsi // b[0] = t + mov [rbx + 8], rdi // b[1] = u + mov rax, [rbx + 8*rax] // a' = b[a'] = b[0] = t #elif defined(__i386__) - notimpl + mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0 + mov dword ptr [ebx + 8*edx], 1 // b[d] = 1 + mov eax, [ebx + 8*ecx] // a' = b[c] = 0 + + mov [ebx], esi // b[0] = t + mov [ebx + 8], edi // b[1] = u + mov eax, [ebx + 8*eax] // a' = b[a'] = b[0] = t #elif defined(__arm__) - notimpl + mov r0, #0 + mov r12, #1 + + str r0, [r1, r2, lsl #2] + str r12, [r1, r3, lsl #2] + ldr r0, [r1, r2, lsl #2] + + str r4, [r1] + str r5, [r1, #4] + ldr r0, [r1, r0, lsl #2] #elif defined(__aarch64__) - notimpl + mov x16, #1 + + str xzr, [x1, x2, lsl #3] + str x16, [x1, x3, lsl #3] + ldr x0, [x1, x2, lsl #3] + + str x4, [x1] + str x5, [x1, #8] + ldr x0, [x1, x0, lsl #3] #else notimpl #endif + ret + endproc proc x2d + // clear the least significant set bit in a, by calculating a' = + // a AND (a - 1). + // + // if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in + // the least significant /set/ bit of a, and all bits of lesser + // significance. to put it another way: write a = u 2^{k+1} + 2^k; + // then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1. taking the + // bitwise AND of these leaves only the bits common to both, i.e., + // u 2^{k+1}. 
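+	//
+	// for instance, with the arbitrary example value a = 10110100
+	// (binary), we have k = 2 and u = 10110; then a - 1 = 10110011,
+	// and a AND (a - 1) = 10110000 = u 2^{k+1}, which is a with its
+	// lowest set bit cleared, as required.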
+ #if defined(__x86_64__) - notimpl + mov rdx, rax // d' = a + dec rax // a' = a - 1 + and rax, rdx // a' = a AND (a - 1) #elif defined(__i386__) - notimpl + mov edx, eax // d' = a + dec eax // a' = a - 1 + and eax, edx // a' = a AND (a - 1) #elif defined(__arm__) - notimpl + sub r3, r0, #1 + and r0, r0, r3 #elif defined(__aarch64__) - notimpl + sub x3, x0, #1 + and x0, x0, x3 #else notimpl #endif + ret + endproc proc x2e + // compute a mask of one bits in exactly the positions of the + // low-order run of zero bits in a + #if defined(__x86_64__) - notimpl + mov rdx, rax // d' = a + dec rdx // d' = a - 1 + xor rax, rdx // a = a XOR (a - 1) + // set bits are least significant + // set bit of a, and all bits of + // lesser significance + shr rax, 1 // now only bits of lesser + // significance; a' = 0 iff a odd + cmp rax, rdx // equal if a = 0 or 2^k; otherwise + // strictly less #elif defined(__i386__) - notimpl + mov edx, eax + dec edx + xor eax, edx + shr eax, 1 + cmp eax, edx #elif defined(__arm__) - notimpl + sub r3, r0, #1 + eor r0, r0, r3 + mov r0, r0, lsr #1 // probably fold shift into next inst + cmp r0, r3 #elif defined(__aarch64__) - notimpl + sub x3, x0, #1 + eor x0, x0, x3 + mov x0, x0, lsr #1 // probably fold shift into next inst + cmp x0, x3 #else notimpl #endif + ret + endproc proc x2f + // a slow population count + #if defined(__x86_64__) - notimpl + popcnt rbx, rcx // the easy way + + // a fast version in software + mov rax, rcx + + mov rdx, rcx + shr rdx, 1 + mov rsi, 0x5555555555555555 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 2 + mov rsi, 0x3333333333333333 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 32 + add rax, rdx + + mov rdx, rax + shr rdx, 4 + and rax, 0x0f0f0f0f + and rdx, 0x0f0f0f0f + add rax, rdx + + mov rdx, rax + shr rdx, 8 + add rax, rdx + + mov rdx, rax + shr rdx, 16 + add rax, rdx + movzx rsi, al + + // the official version + xor eax, eax // clear iteration counter +0: jrcxz 9f // bail if c = 0 + inc rax // bump iteration count + mov rdx, rcx // d' = c + dec rdx // d' = c - 1 + and rcx, rdx // zap least significant set bit of c + jmp 0b // and go again +9: #elif defined(__i386__) - notimpl + popcnt ebx, ecx // the easy way + + mov eax, ecx + + mov edx, ecx + shr edx, 1 + and eax, 0x55555555 + and edx, 0x55555555 + add eax, edx + + mov edx, eax + shr edx, 2 + and eax, 0x33333333 + and edx, 0x33333333 + add eax, edx + + mov edx, eax + shr edx, 4 + add eax, edx + + mov edx, eax + shr edx, 8 + and eax, 0x000f000f + and edx, 0x000f000f + add eax, edx + + mov edx, eax + shr edx, 16 + add eax, edx + movzx esi, al + + xor eax, eax +0: jecxz 9f + inc eax + mov edx, ecx + dec edx + and ecx, edx + jmp 0b +9: #elif defined(__arm__) - notimpl + // the easy-ish way + vmov d0[0], r2 + vcnt.8 d0, d0 + vmov r1, d0[0] + add r1, r1, r1, lsl #8 + add r1, r1, r1, lsl #16 + mov r1, r1, lsr #24 + + // the hard way + movw r12, #0x5555 + movt r12, #0x5555 + and r3, r12, r2, lsr #1 + and r0, r12, r2 + add r0, r0, r3 + + movw r12, #0x3333 + movt r12, #0x3333 + and r3, r12, r0, lsr #2 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #16 + + movt r12, 0x0f0f + and r3, r12, r0, lsr #4 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #8 + + mov r4, r0, lsr #24 + + // and following the exercise + mov r0, #0 + cmp r2, #0 + beq 9f +0: add r0, r0, #1 + sub r3, r2, #1 + ands r2, r2, r3 + bne 0b +9: #elif defined(__aarch64__) - notimpl + // the easy-ish way + mov v0.d[0], x2 + cnt v0.8b, v0.8b + mov x1, v0.d[0] + 
add x1, x1, x1, lsl #8 + add x1, x1, x1, lsl #16 + add x1, x1, x1, lsl #32 + lsr x1, x1, #56 + + // the hard way -- though arm64's immediate constant encodings and + // shifting make this actually rather pleasant. + and x3, x2, #0xaaaaaaaaaaaaaaaa + and x0, x2, #0x5555555555555555 + add x0, x0, x3, lsr #1 + + and x3, x0, #0xcccccccccccccccc + and x0, x0, #0x3333333333333333 + add x0, x0, x3, lsr #2 + + add x0, x0, x0, lsr #4 + + and x3, x0, #0x0f000f000f000f00 + and x0, x0, #0x000f000f000f000f + add x0, x3, x0, lsl #8 + + add x0, x0, x0, lsl #16 + add x0, x0, x0, lsl #32 + lsr x4, x0, #56 + + // and the official way + mov x0, #0 + cbz x2, 9f +0: add x0, x0, #1 + sub x3, x2, #1 + and x2, x2, x3 + cbnz x2, 0b +9: #else notimpl #endif + ret + endproc ///-------------------------------------------------------------------------- -- 2.11.0
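
(Not part of the patch: a rough C cross-check of the x2f commentary above.
The `hard way' divide-and-conquer popcount and the `official' clear-the-
lowest-set-bit loop are sketched below; it mirrors the structure of the
exercise rather than transcribing any one of the four assembler versions,
and the function names and the little test in main are invented for
illustration only.)

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* the `hard way': add adjacent bit pairs, then nibbles, then bytes,
 * and finally fold the byte counts together.
 */
static unsigned popcount_swar(uint64_t x)
{
	x = (x & 0x5555555555555555u) + ((x >> 1) & 0x5555555555555555u);
	x = (x & 0x3333333333333333u) + ((x >> 2) & 0x3333333333333333u);
	x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fu;
	x += x >> 8; x += x >> 16; x += x >> 32;
	return x & 0xff;
}

/* the `official' version: count iterations of the x2d trick, clearing
 * the lowest set bit each time round until nothing is left.
 */
static unsigned popcount_slow(uint64_t x)
{
	unsigned n = 0;
	while (x) { x &= x - 1; n++; }
	return n;
}

int main(void)
{
	static const uint64_t t[] = { 0, 1, 0x8000000000000000u,
				      0x0123456789abcdefu, ~(uint64_t)0 };
	for (int i = 0; i < 5; i++)
		assert(popcount_swar(t[i]) == popcount_slow(t[i]));
	printf("popcount sketches agree\n");
	return 0;
}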