From 99fe70cb14a9586fcea77bfea14cb931257807a8 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Mon, 19 Oct 2020 19:02:48 +0100 Subject: [PATCH] xchg.S: More exercises. --- xchg.S | 472 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 434 insertions(+), 38 deletions(-) diff --git a/xchg.S b/xchg.S index 40c5ba9..b95c665 100644 --- a/xchg.S +++ b/xchg.S @@ -16,6 +16,7 @@ .endm .arch armv7-a + .fpu neon #elif defined(__aarch64__) @@ -2703,10 +2704,9 @@ proc x25 0: movzx ebx, cx imul ebx, ebx - ror ecx, 0x10 - movzx edx, cx + mov edx, ecx + shr edx, 0x10 imul edx, edx - rol ecx, 0x10 add ebx, edx // see? sbb eax, 0 @@ -2849,194 +2849,590 @@ endproc proc x28 + // divide c-byte little-endian bignum at rsi by 2 (rounding down) + #if defined(__x86_64__) - notimpl + clc +0: rcr byte ptr [rsi], 1 + inc rsi + loop 0b #elif defined(__i386__) - notimpl + clc +0: rcr byte ptr [esi], 1 + inc esi + loop 0b #elif defined(__arm__) - notimpl + // we could hack this a word at a time using rrx + mov r3, #0 +0: ldrb r12, [r4] + subs r2, r2, #1 + orr r3, r3, r12, lsr #1 + strb r3, [r4], #1 + mov r3, r12, lsl #7 + bne 0b #elif defined(__aarch64__) - notimpl + mov w16, #0 +0: ldrb w17, [x4] + sub x2, x2, #1 + orr w16, w16, w17, lsr #1 + strb w16, [x4], #1 + lsl w16, w17, #7 + cbnz x2, 0b #else notimpl #endif + ret + endproc proc x29 + // fill a buffer with a 3-byte pattern + #if defined(__x86_64__) - notimpl + lea rdi, [rsi + 3] + rep movsb #elif defined(__i386__) - notimpl + lea edi, [esi + 3] + rep movsb #elif defined(__arm__) - notimpl + add r5, r4, #3 +0: subs r2, r2, #1 + ldrhsb r12, [r4], #1 + strhsb r12, [r5], #1 + bhs 0b #elif defined(__aarch64__) - notimpl + cbz x2, 9f + add x5, x4, #3 +0: sub x2, x2, #1 + ldrb w16, [x4], #1 + strb w16, [x5], #1 + cbnz x2, 0b +9: #else notimpl #endif + ret + endproc proc x2a + // rotate the words in a buffer, so that the last word comes first, + // the first comes second, and so on. this isn't a good way to do + // it. + #if defined(__x86_64__) - notimpl + mov rsi, rbx // set string pointers + mov rdi, rbx +0: lodsq // fetch next word + xchg rax, qword ptr [rbx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosq // store in output + // (note that the first iteration doesn't actually do anything) + loop 0b // continue until all done #elif defined(__i386__) - notimpl + mov esi, ebx // set string pointers + mov edi, ebx +0: lodsd // fetch next word + xchg eax, dword ptr [ebx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosd // store in output + loop 0b // continue until all done #elif defined(__arm__) - notimpl + // let's do this a sensible way. (we could go faster using ldm/stm.) + add r0, r1, r2, lsl #2 // find the end of the buffer + ldr r0, [r0, #-4] // collect final element +0: subs r2, r2, #1 + ldr r12, [r1] + str r0, [r1], #4 + mov r0, r12 + bne 0b #elif defined(__aarch64__) - notimpl + add x0, x1, x2, lsl #3 // find the end of the buffer + ldr x0, [x0, #-8] // collect final element +0: sub x2, x2, #1 + ldr x16, [x1] + str x0, [x1], #8 + mov x0, x16 + cbnz x2, 0b #else notimpl #endif + ret + endproc proc x2b -#if defined(__x86_64__) - - notimpl + // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255} -#elif defined(__i386__) +#if defined(__x86_64__) - notimpl + // this is floyd's cycle-finding algorithm. + // + // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ..., + // s_{i+1} = f(s_i). 
since B is finite, there must be some smallest + // t and c such that s(t) = s(t + c); then we have s_i = s_j iff + // i >= t, j >= t, and i == j (mod c). + // + // the algorithm sets two cursors advancing through the sequence: a + // /tortoise/ which advances one step at a time, and a /hare/ which + // advances by two, so when the tortoise is at element s_i, the hare + // is at s_{2i}. the hare will run around the cycle and catch the + // tortoise when i >= t and i == 2 i (mod c); the latter is simply i + // == 0 (mod c), which therefore happens first when i = k = t + + // (-t mod c). + // + // i'm not sure what good xlatb does here that mov al, [rbx + al] + // doesn't. + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg rax, rdx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + // now we trace the initial tail: reset the tortoise to s_0, and slow + // the hare down so that both take only a single step in each + // iteration. this loop terminates when i >= t and i == i + 2 k + // (mod c). we know k is a multiple of c, so the latter condition + // always holds, so this finds the first step of the cycle. + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare + xchg rax, rdx // and switch back + cmp al, dl // done? + jnz 0b // no -- iterate + +#elif defined(__i386__) + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg eax, edx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare + xchg eax, edx // and switch back + cmp al, dl // done? 
+ jnz 0b // no -- iterate #elif defined(__arm__) - notimpl + mov r0, #0 + mov r3, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b + + mov r0, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b #elif defined(__aarch64__) - notimpl + mov w0, #0 + mov w3, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b + + mov w0, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b #else notimpl #endif + ret + endproc proc x2c + // a convoluted way to set rax = rsi + #if defined(__x86_64__) - notimpl + mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0 + mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1 + mov rax, [rbx + 8*rcx] // a' = b[c] = 0 + + mov [rbx], rsi // b[0] = t + mov [rbx + 8], rdi // b[1] = u + mov rax, [rbx + 8*rax] // a' = b[a'] = b[0] = t #elif defined(__i386__) - notimpl + mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0 + mov dword ptr [ebx + 8*edx], 1 // b[d] = 1 + mov eax, [ebx + 8*ecx] // a' = b[c] = 0 + + mov [ebx], esi // b[0] = t + mov [ebx + 8], edi // b[1] = u + mov eax, [ebx + 8*eax] // a' = b[a'] = b[0] = t #elif defined(__arm__) - notimpl + mov r0, #0 + mov r12, #1 + + str r0, [r1, r2, lsl #2] + str r12, [r1, r3, lsl #2] + ldr r0, [r1, r2, lsl #2] + + str r4, [r1] + str r5, [r1, #4] + ldr r0, [r1, r0, lsl #2] #elif defined(__aarch64__) - notimpl + mov x16, #1 + + str xzr, [x1, x2, lsl #3] + str x16, [x1, x3, lsl #3] + ldr x0, [x1, x2, lsl #3] + + str x4, [x1] + str x5, [x1, #8] + ldr x0, [x1, x0, lsl #3] #else notimpl #endif + ret + endproc proc x2d + // clear the least significant set bit in a, by calculating a' = + // a AND (a - 1). + // + // if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in + // the least significant /set/ bit of a, and all bits of lesser + // significance. to put it another way: write a = u 2^{k+1} + 2^k; + // then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1. taking the + // bitwise AND of these leaves only the bits common to both, i.e., + // u 2^{k+1}. 
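+	//
+	// for instance, with the arbitrary example value a = 10110100
+	// (binary), we have k = 2 and u = 10110; then a - 1 = 10110011,
+	// and a AND (a - 1) = 10110000 = u 2^{k+1}, which is a with its
+	// lowest set bit cleared, as required.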
+ #if defined(__x86_64__) - notimpl + mov rdx, rax // d' = a + dec rax // a' = a - 1 + and rax, rdx // a' = a AND (a - 1) #elif defined(__i386__) - notimpl + mov edx, eax // d' = a + dec eax // a' = a - 1 + and eax, edx // a' = a AND (a - 1) #elif defined(__arm__) - notimpl + sub r3, r0, #1 + and r0, r0, r3 #elif defined(__aarch64__) - notimpl + sub x3, x0, #1 + and x0, x0, x3 #else notimpl #endif + ret + endproc proc x2e + // compute a mask of one bits in exactly the positions of the + // low-order run of zero bits in a + #if defined(__x86_64__) - notimpl + mov rdx, rax // d' = a + dec rdx // d' = a - 1 + xor rax, rdx // a = a XOR (a - 1) + // set bits are least significant + // set bit of a, and all bits of + // lesser significance + shr rax, 1 // now only bits of lesser + // significance; a' = 0 iff a odd + cmp rax, rdx // equal if a = 0 or 2^k; otherwise + // strictly less #elif defined(__i386__) - notimpl + mov edx, eax + dec edx + xor eax, edx + shr eax, 1 + cmp eax, edx #elif defined(__arm__) - notimpl + sub r3, r0, #1 + eor r0, r0, r3 + mov r0, r0, lsr #1 // probably fold shift into next inst + cmp r0, r3 #elif defined(__aarch64__) - notimpl + sub x3, x0, #1 + eor x0, x0, x3 + mov x0, x0, lsr #1 // probably fold shift into next inst + cmp x0, x3 #else notimpl #endif + ret + endproc proc x2f + // a slow population count + #if defined(__x86_64__) - notimpl + popcnt rbx, rcx // the easy way + + // a fast version in software + mov rax, rcx + + mov rdx, rcx + shr rdx, 1 + mov rsi, 0x5555555555555555 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 2 + mov rsi, 0x3333333333333333 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 32 + add rax, rdx + + mov rdx, rax + shr rdx, 4 + and rax, 0x0f0f0f0f + and rdx, 0x0f0f0f0f + add rax, rdx + + mov rdx, rax + shr rdx, 8 + add rax, rdx + + mov rdx, rax + shr rdx, 16 + add rax, rdx + movzx rsi, al + + // the official version + xor eax, eax // clear iteration counter +0: jrcxz 9f // bail if c = 0 + inc rax // bump iteration count + mov rdx, rcx // d' = c + dec rdx // d' = c - 1 + and rcx, rdx // zap least significant set bit of c + jmp 0b // and go again +9: #elif defined(__i386__) - notimpl + popcnt ebx, ecx // the easy way + + mov eax, ecx + + mov edx, ecx + shr edx, 1 + and eax, 0x55555555 + and edx, 0x55555555 + add eax, edx + + mov edx, eax + shr edx, 2 + and eax, 0x33333333 + and edx, 0x33333333 + add eax, edx + + mov edx, eax + shr edx, 4 + add eax, edx + + mov edx, eax + shr edx, 8 + and eax, 0x000f000f + and edx, 0x000f000f + add eax, edx + + mov edx, eax + shr edx, 16 + add eax, edx + movzx esi, al + + xor eax, eax +0: jecxz 9f + inc eax + mov edx, ecx + dec edx + and ecx, edx + jmp 0b +9: #elif defined(__arm__) - notimpl + // the easy-ish way + vmov d0[0], r2 + vcnt.8 d0, d0 + vmov r1, d0[0] + add r1, r1, r1, lsl #8 + add r1, r1, r1, lsl #16 + mov r1, r1, lsr #24 + + // the hard way + movw r12, #0x5555 + movt r12, #0x5555 + and r3, r12, r2, lsr #1 + and r0, r12, r2 + add r0, r0, r3 + + movw r12, #0x3333 + movt r12, #0x3333 + and r3, r12, r0, lsr #2 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #16 + + movt r12, 0x0f0f + and r3, r12, r0, lsr #4 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #8 + + mov r4, r0, lsr #24 + + // and following the exercise + mov r0, #0 + cmp r2, #0 + beq 9f +0: add r0, r0, #1 + sub r3, r2, #1 + ands r2, r2, r3 + bne 0b +9: #elif defined(__aarch64__) - notimpl + // the easy-ish way + mov v0.d[0], x2 + cnt v0.8b, v0.8b + mov x1, v0.d[0] + 
add x1, x1, x1, lsl #8 + add x1, x1, x1, lsl #16 + add x1, x1, x1, lsl #32 + lsr x1, x1, #56 + + // the hard way -- though arm64's immediate constant encodings and + // shifting make this actually rather pleasant. + and x3, x2, #0xaaaaaaaaaaaaaaaa + and x0, x2, #0x5555555555555555 + add x0, x0, x3, lsr #1 + + and x3, x0, #0xcccccccccccccccc + and x0, x0, #0x3333333333333333 + add x0, x0, x3, lsr #2 + + add x0, x0, x0, lsr #4 + + and x3, x0, #0x0f000f000f000f00 + and x0, x0, #0x000f000f000f000f + add x0, x3, x0, lsl #8 + + add x0, x0, x0, lsl #16 + add x0, x0, x0, lsl #32 + lsr x4, x0, #56 + + // and the official way + mov x0, #0 + cbz x2, 9f +0: add x0, x0, #1 + sub x3, x2, #1 + and x2, x2, x3 + cbnz x2, 0b +9: #else notimpl #endif + ret + endproc ///-------------------------------------------------------------------------- -- 2.11.0
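
(Not part of the patch: a rough C cross-check of the x2f commentary above.
The `hard way' divide-and-conquer popcount and the `official' clear-the-
lowest-set-bit loop are sketched below; it mirrors the structure of the
exercise rather than transcribing any one of the four assembler versions,
and the function names and the little test in main are invented for
illustration only.)

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* the `hard way': add adjacent bit pairs, then nibbles, then bytes,
 * and finally fold the byte counts together.
 */
static unsigned popcount_swar(uint64_t x)
{
	x = (x & 0x5555555555555555u) + ((x >> 1) & 0x5555555555555555u);
	x = (x & 0x3333333333333333u) + ((x >> 2) & 0x3333333333333333u);
	x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fu;
	x += x >> 8; x += x >> 16; x += x >> 32;
	return x & 0xff;
}

/* the `official' version: count iterations of the x2d trick, clearing
 * the lowest set bit each time round until nothing is left.
 */
static unsigned popcount_slow(uint64_t x)
{
	unsigned n = 0;
	while (x) { x &= x - 1; n++; }
	return n;
}

int main(void)
{
	static const uint64_t t[] = { 0, 1, 0x8000000000000000u,
				      0x0123456789abcdefu, ~(uint64_t)0 };
	for (int i = 0; i < 5; i++)
		assert(popcount_swar(t[i]) == popcount_slow(t[i]));
	printf("popcount sketches agree\n");
	return 0;
}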