X-Git-Url: https://git.distorted.org.uk/~mdw/xchg-rax-rax/blobdiff_plain/90c4eee32bd52dbdba4913a5030c7b27cffaa103..636f688ac516d2d2bed4e0e8836a7f0720bd9958:/xchg.S

diff --git a/xchg.S b/xchg.S
index 0dcb6c8..40c5ba9 100644
--- a/xchg.S
+++ b/xchg.S
@@ -380,18 +380,24 @@ proc	call_example
 
 #elif defined(__aarch64__)
 
-	stp	x29, x30, [sp, #-13*8]!
+	stp	x29, x30, [sp, #-14*8]!
 	mov	x29, sp
 	stp	x19, x20, [sp,  #16]
 	stp	x21, x22, [sp,  #32]
 	stp	x23, x24, [sp,  #48]
 	stp	x25, x26, [sp,  #64]
 	stp	x27, x28, [sp,  #80]
-	str	x1, [sp, #96]
-
-	mov	x16, x0
-
-	ldr	x17,	  [x1, #128]
+	str	      x1, [sp, #104]
+
+	ldp	x29, x30, [x1, #224]
+	msr	nzcv, x30
+	mov	x30, x0
+	ldp	x27, x28, [x1, #208]
+	ldp	x25, x26, [x1, #192]
+	ldp	x23, x24, [x1, #176]
+	ldp	x21, x22, [x1, #160]
+	ldp	x19, x20, [x1, #144]
+	ldp	x16, x17, [x1, #128]
 	ldp	x14, x15, [x1, #112]
 	ldp	x12, x13, [x1,  #96]
 	ldp	x10, x11, [x1,  #80]
@@ -400,28 +406,36 @@ proc	call_example
 	ldp	 x4,  x5, [x1,  #32]
 	ldp	 x2,  x3, [x1,  #16]
 	ldp	 x0,  x1, [x1,   #0]
-	msr	nzcv, x17
-
-	blr	x16
-
-	ldr	x16, [sp, #96]
-	mrs	x17, nzcv
-	str	x17,	  [x16, #128]
-	stp	x14, x15, [x16, #112]
-	stp	x12, x13, [x16,  #96]
-	stp	x10, x11, [x16,  #80]
-	stp	 x8,  x9, [x16,  #64]
-	stp	 x6,  x7, [x16,  #48]
-	stp	 x4,  x5, [x16,  #32]
-	stp	 x2,  x3, [x16,  #16]
-	stp	 x0,  x1, [x16,   #0]
+
+	blr	x30
+
+	ldr	x30, [sp, #104]
+	stp	x27, x28, [x30, #208]
+	stp	x25, x26, [x30, #192]
+	stp	x23, x24, [x30, #176]
+	stp	x21, x22, [x30, #160]
+	stp	x19, x20, [x30, #144]
+	stp	x16, x17, [x30, #128]
+	stp	x14, x15, [x30, #112]
+	stp	x12, x13, [x30,  #96]
+	stp	x10, x11, [x30,  #80]
+	stp	 x8,  x9, [x30,  #64]
+	stp	 x6,  x7, [x30,  #48]
+	stp	 x4,  x5, [x30,  #32]
+	stp	 x2,  x3, [x30,  #16]
+	stp	 x0,  x1, [x30,   #0]
+	mov	x0, x30
+	mrs	x30, nzcv
+	stp	x29, x30,  [x0, #224]
 
 	ldp	x19, x20, [sp,  #16]
 	ldp	x21, x22, [sp,  #32]
 	ldp	x23, x24, [sp,  #48]
 	ldp	x25, x26, [sp,  #64]
 	ldp	x27, x28, [sp,  #80]
-	ldp	x29, x30, [sp], #13*8
+	ldp	x29, x30, [sp], #14*8
+
+	ret
 
 #else
 #  error "not supported"
@@ -2736,50 +2750,101 @@ endproc
 
 proc	x26
 
+	// a bad way to rotate a right by 7 places
+
 #if defined(__x86_64__)
 
-	notimpl
+	mov	rbx, rax		// b' = a
+	ror	rbx, 7			// better
+
+	mov	rdx, rax		// d' = a
+	shr	rax, 7			// a' = a >> 7
+	shl	rdx, 0x39		// d' = a << 57
+	or	rax, rdx		// a' = a >>> 7
 
 #elif defined(__i386__)
 
-	notimpl
+	mov	ebx, eax		// b' = a
+	ror	ebx, 7			// better
+
+	mov	edx, eax		// d' = a
+	shr	eax, 7			// a' = a >> 7
+	shl	edx, 0x19		// d' = a << 25 (= 32 - 7)
+	or	eax, edx		// a' = a >>> 7
 
 #elif defined(__arm__)
 
-	notimpl
+	mov	r1, r0, ror #7		// easy way
+
+	// even the hard way is fairly easy on arm
+	mov	r3, r0, lsl #25		// r3 = a << 25
+	orr	r0, r3, r0, lsr #7	// hard way
 
 #elif defined(__aarch64__)
 
-	notimpl
+	ror	x1, x0, #7		// easy way
+
+	// even the hard way is fairly easy on arm
+	lsl	x3, x0, #57		// x3 = a << 57
+	orr	x0, x3, x0, lsr #7	// hard way
 
 #else
 	notimpl
 #endif
 
+	ret
+
 endproc
 
 proc	x27
 
+	// shift a right by c places, in two halves
+
 #if defined(__x86_64__)
 
-	notimpl
+	mov	ch, cl			// c' = [c, c]
+	inc	ch			// c' = [c, c + 1]
+	shr	ch, 1			// ch = (c + 1) >> 1
+	shr	cl, 1			// c' = [floor(c/2), ceil(c/2)]
+	shr	rax, cl			// shift a by the first half
+	xchg	ch, cl			// bring the second half into cl
+	shr	rax, cl			// shift a by the second half
 
 #elif defined(__i386__)
 
-	notimpl
+	mov	ch, cl			// c' = [c, c]
+	inc	ch			// c' = [c, c + 1]
+	shr	ch, 1			// ch = (c + 1) >> 1
+	shr	cl, 1			// c' = [floor(c/2), ceil(c/2)]
+	shr	eax, cl			// shift a by the first half
+	xchg	ch, cl			// bring the second half into cl
+	shr	eax, cl			// shift a by the second half
 
 #elif defined(__arm__)
 
-	notimpl
+	// it would be clearer and more efficient to say: `mov r12, r2, lsr
+	// #1; sub r2, r2, r12', but that's not the lesson this exercise is
+	// trying to teach.
+	add	r12, r2, #1		// r12 = c + 1
+	mov	r2, r2, lsr #1		// r2 = c >> 1
+	mov	r12, r12, lsr #1	// r12 = (c + 1) >> 1
+	mov	r0, r0, lsr r2		// shift a by the first half
+	mov	r0, r0, lsr r12		// shift a by the second half
 
 #elif defined(__aarch64__)
 
-	notimpl
+	add	w16, w2, #1		// w16 = c + 1
+	lsr	w2, w2, #1		// w2 = c >> 1
+	lsr	w16, w16, #1		// w16 = (c + 1) >> 1
+	lsr	x0, x0, x2		// shift a by the first half
+	lsr	x0, x0, x16		// shift a by the second half
 
 #else
 	notimpl
 #endif
 
+	ret
+
 endproc
 
 proc	x28