/// -*- mode: asm; asm-comment-char: ?/ -*-

	.intel_syntax noprefix

	.section .note.GNU-stack, "", @progbits

// proc name ... endproc
//
// open a function: export name as a global symbol of type STT_FUNC, align
// to 2^4 = 16 bytes and place the label.  opening a function also defines a
// one-shot macro endproc, which records the symbol size (distance from the
// label to the point where endproc is used) and then purges itself, so that
// the next proc can define a fresh endproc for its own name.
.macro	proc	name
	.globl	\name
	.type	\name, STT_FUNC
	.p2align 4
\name\():
  .macro endproc
	.size	\name, . - \name
	.purgem	endproc
  .endm
.endm

// ch c
//
// debugging aid: write the character given by the argument to stdout via
// putchar, then flush stdout, leaving the surrounding code undisturbed.
//
// the flags and every integer register which the sysv calling convention
// lets a callee clobber (rax, rcx, rdx, rsi, rdi, r8--r11) are saved and
// restored around the calls; r10 and r11 are included because library code
// and the plt resolver are free to use them.  vector registers are not
// saved.  the stack pointer is rounded down to a 16-byte boundary for the
// calls and restored from rbp afterwards, so the macro may be used at any
// stack alignment.  it does write below the current rsp.
.macro ch c
	pushf
	push	rax
	push	rcx
	push	rdx
	push	rsi
	push	rdi
	push	r8
	push	r9
	push	r10
	push	r11
	push	rbp
	mov	rbp, rsp		// remember the unaligned rsp
	and	rsp, -16		// align the stack for the calls

	mov	rdi, \c
	call	putchar@plt

	mov	rdi, [rip + stdout]
	call	fflush@plt

	mov	rsp, rbp		// drop the alignment padding
	pop	rbp
	pop	r11
	pop	r10
	pop	r9
	pop	r8
	pop	rdi
	pop	rsi
	pop	rdx
	pop	rcx
	pop	rax
	popf
.endm

	.text

// call_example(func, regs)
//
// call the code at func with registers and flags loaded from the block at
// regs, then store the registers and flags it leaves behind back into the
// same block.
//
// in:	rdi = address of the code to call
//	rsi = address of the register block, eight 64-bit slots:
//	      +0 rax, +8 rbx, +16 rcx, +24 rdx, +32 rsi, +40 rdi,
//	      +48 rbp, +56 flags
//
// only the flag bits in 0x0cd5 (cf, pf, af, zf, sf, df, of) are taken from
// the block on the way in; the remaining bits keep their current values.
// on the way out the whole flags word is stored.
//
// rbx, rbp, r10--r15 and the flags of the caller are preserved.  the called
// code is entered with the stack aligned as after an ordinary call, i.e.,
// rsp + 8 is a multiple of 16.
//
// the trailing comments track the stack contents, top of stack first.
proc	call_example

	push	rbx			// rbx
	push	r10
	push	r11
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbp			// rbp, ..., rbx
	pushf				// flags, rbp, ..., rbx
	sub	rsp, 8			// pad, flags, rbp, ..., rbx
					// (keeps func's entry rsp aligned)

	push	rsi			// regs, pad, flags, rbp, ..., rbx

	lea	rax, [rip + 9f]
	push	rax			// cont, regs, pad, flags, ..., rbx
	push	rdi			// func, cont, regs, pad, flags, ...

	// merge the caller-supplied arithmetic and direction flags into the
	// current flags word
	mov	rax, [rsi + 56]
	pushf
	pop	rcx
	and	rax,  0x0cd5		// wanted bits from the block
	and	rcx, ~0x0cd5		// everything else from the cpu
	or	rax, rcx
	push	rax
	popf

	// load the registers; rsi goes last because it is the base pointer
	mov	rax, [rsi +  0]
	mov	rbx, [rsi +  8]
	mov	rcx, [rsi + 16]
	mov	rdx, [rsi + 24]
	mov	rdi, [rsi + 40]
	mov	rbp, [rsi + 48]
	mov	rsi, [rsi + 32]

	ret				// -> func; cont, regs, pad, flags, ...

	// func returns here, having popped cont
9:	pushf				// rflags, regs, pad, flags, ..., rbx
	push	rsi			// rsi, rflags, regs, pad, flags, ...
	mov	rsi, [rsp + 16]		// recover the block address
	mov	[rsi +  0], rax
	mov	[rsi +  8], rbx
	mov	[rsi + 16], rcx
	mov	[rsi + 24], rdx
	mov	[rsi + 40], rdi
	mov	[rsi + 48], rbp
	pop	rax			// rflags, regs, pad, flags, ..., rbx
	mov	[rsi + 32], rax		// rsi as left by func
	pop	rax			// regs, pad, flags, rbp, ..., rbx
	mov	[rsi + 56], rax		// flags as left by func

	add	rsp, 16			// flags, rbp, ..., rbx
	popf				// rbp, ..., rbx
	pop	rbp			// ..., rbx
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	r11
	pop	r10
	pop	rbx			//
	ret

endproc

proc	nop

	// do nothing: return at once, leaving registers and flags alone
	ret

endproc

///--------------------------------------------------------------------------

proc	x00

	// clear all 64 bits of extended traditional registers, each one in
	// a different way.  writes to a 32-bit register zero the upper half
	// of the 64-bit register, which is why the eax/esi/edi forms work.
	xor      eax,eax		// clear rax
	lea      rbx,[0]		// rbx -> _|_, i.e., address zero
	loop     .			// iterate, decrement rcx until zero
					// (the branch target is the loop
					// instruction itself)
	mov      rdx,0			// set rdx = 0
	and      esi,0			// clear all bits of rsi
	sub      edi,edi		// set rdi = edi - edi = 0
	push     0
	pop      rbp			// pop 0 into rbp

	ret

endproc

proc	x01

	// advance a fibonacci pair by c steps
	//
	// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
	// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
	//
	// the body runs before c is tested, and loop decrements c before
	// comparing it with zero, so c = 0 on entry means 2^64 steps rather
	// than none.
0:	xadd	rax, rdx		// a, d = a + d, a
					//      = f_{i+1} + f_i, f_{i+1}
					//      = f_{i+2}, f_{i+1}
	loop	0b			// advance i, decrement c, iterate

	ret

endproc

proc	x02

	// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
	// set a = 1
	//
	// no branches: the nonzero test is carried through cf.
	neg	rax			// set cf iff a /= 0
	sbb	rax, rax		// a = a - a - cf = -cf
	neg	rax			// a = cf

	ret

endproc

proc	x03

	// set a = min(a, d) (unsigned); clobber c, d
	//
	// branch-free: the borrow from d - a is widened into an all-ones or
	// all-zeros mask which selects whether the difference is added back.
	sub	rdx, rax		// d' = d - a; set cf if a > d
	sbb	rcx, rcx		// c = -cf = -[a > d]
	and	rcx, rdx		// c = a > d ? d - a : 0
	add	rax, rcx		// a' = a > d ? d : a

	ret

endproc

proc	x04

	// switch case?  flip bit 5 (0x20) of the low byte of a, which is
	// the bit distinguishing upper- from lower-case ascii letters; the
	// other bytes of a are untouched.
	xor	al, 0x20

	ret

endproc

proc	x05

	// answer whether 5 <= a </<= 9.
	//
	// the answer is left in the flags (a itself is replaced by a - 5);
	// which range test you get depends on the condition code the caller
	// examines afterwards, as tabulated below.  subtracting the lower
	// bound first lets a single unsigned comparison check both ends.
	sub	rax, 5			// a' = a - 5
	cmp	rax, 4			// is a' </<= 4?

	// cc		a'			a
	//
	// z/e		a' = 4			a = 9
	// nz/ne	a' /= 4			a /= 9
	//
	// a/nbe	a' > 4			a > 9 or a < 5
	// nc/ae/nb	a' >= 4			a >= 9 or a < 5
	// c/b/nae	a' < 4			5 <= a < 9
	// be/na	a' <= 4			5 <= a <= 9
	//
	// o		a' < -2^63 + 4		-2^63 + 5 <= a < -2^63 + 9
	// no		a' >= -2^63 + 4		a >= -2^63 + 9 or
	//						a < -2^63 + 5
	// s		-2^63 + 4 <= a' < 4	-2^63 + 9 <= a < 9
	// ns		a' < -2^63 + 4 or	a < -2^63 + 9 or a >= 9
	//			a' >= 4
	// ge/nl	a' >= 4			a >= 9 or a < -2^63 + 5
	// l/nge	a' < 4			-2^63 + 5 <= a < 9
	// g/nle	a' > 4			a > 9 or a < -2^63 + 5
	// le/ng	a' <= 4			-2^63 + 5 <= a <= 9

	ret

endproc

proc	x06

	// leave a unchanged, but set zf if a = 0, cf if a /= 0, set sf to
	// msb(a); of is clear unless a = -2^63 (the one value whose
	// negation overflows)
	//
	// the flags are those of the final neg, whose operand is -a.
	not	rax			// a' = -a - 1
	inc	rax			// a' = -a
	neg	rax			// a' = a

	ret

endproc

proc	x07

	// same as before (?)  the last instruction is again a neg applied
	// to -a, so a is unchanged and the final flags depend on a in the
	// same way as in x06; only the route to -a differs.
	inc	rax			// a' = a + 1
	neg	rax			// a' = -a - 1
	inc	rax			// a' = -a
	neg	rax			// a' = a

	ret

endproc

proc	x08

	// floor((a + d)/2), correctly handling overflow conditions; final cf
	// is lsb(a + d), probably uninteresting
	//
	// a and d are treated as unsigned; the carry out of the addition is
	// the 65th bit of the sum, and rcr rotates it back in at the top.
	add	rax, rdx		// cf || a' = a + d
	rcr	rax, 1			// shift 65-bit result right by one
					// place; lsb moves into carry

	ret

endproc

proc	x09

	// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
	// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
	//
	// a is unsigned.  the last bit shifted out by shr is bit 2 of a,
	// which is exactly the `at least half' indicator, and it lands in
	// cf ready to be added back.
	shr	rax, 3			// a' = floor(a/8); cf = 1 if a ==
					// 4, 5, 6, 7 (mod 8)
	adc	rax, 0			// a' = floor(a/8) + cf

	ret

endproc

proc	x0a

	// increment little-endian bignum at rdi
	//
	// the first byte is handled before the loop and the loop then runs
	// c times, so bytes [rdi] through [rdi + c] are touched: c + 1
	// bytes in all.  rdi is left pointing at the last byte touched.
	add	byte ptr [rdi], 1	// bump the low byte; cf = carry out
0:	inc	rdi			// next byte (inc leaves cf alone)
	adc	byte ptr [rdi], 0	// propagate the carry
	loop	0b			// loop leaves the flags alone too

	ret

endproc

proc	x0b

	// negate double-precision d:a
	//
	// d is the high word and a the low word of a 128-bit value.  the
	// high word is complemented, and then one is added back unless the
	// negation of the low word produced a borrow.
	not	rdx			// d' = -d - 1
	neg	rax			// a' = -a;
					// cf = 1 iff a /= 0
	sbb	rdx, -1			// d' = -d - cf

	ret

endproc

proc	x0c

	// rotate is distributive over xor.
	//
	// compute ror(a XOR b) in c and ror(a) XOR ror(b) in a, then
	// compare them.  in the comments, x_0 is the low 13 (0xd) bits of x
	// and x_1 the remaining high bits.  clobbers a, b, c.

	// rax				// = a_1 || a_0
	// rbx				// = b_1 || b_0
	mov	rcx, rax		// = a_1 || a_0

	xor	rcx, rbx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	rcx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	ror	rax, 0xd		// = a_0 || a_1
	ror	rbx, 0xd		// = b_0 || b_1
	xor	rax, rbx		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	cmp	rax, rcx		// always equal

	ret

endproc

proc	x0d

	// and is distributive over xor.
	//
	// compute a AND (b XOR c) in b and (a AND b) XOR (a AND c) in a,
	// then compare them.  clobbers a, b, d.

	mov	rdx, rbx		// = b

	xor	rbx, rcx		// = b XOR c
	and	rbx, rax		// = a AND (b XOR c)

	and	rdx, rax		// = a AND b
	and	rax, rcx		// = a AND c
	xor	rax, rdx		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)

	cmp	rax, rbx		// always equal

	ret

endproc

proc	x0e

	// de morgan's law
	//
	// compute NOT (a AND b) in c and (NOT a) OR (NOT b) in a, then
	// compare them.  clobbers a, b, c.

	mov	rcx, rax		// = a

	and	rcx, rbx		// = a AND b
	not	rcx			// = NOT (a AND b)

	not	rax			// = NOT a
	not	rbx			// = NOT b
	or	rax, rbx		// = (NOT a) OR (NOT b)
					// = NOT (a AND b)

	cmp	rax, rcx		// always equal

	ret

endproc

proc	x0f

	// replace input buffer bytes with cumulative XORs with initial a;
	// final a is XOR of all buffer bytes and initial a.
	//
	// the buffer is the c bytes at rsi; only the low byte of a takes
	// part.  rsi is left just past the buffer.
	//
	// not sure why you'd do this.

	cld				// make lodsb step forwards

0:	xor	[rsi], al		// buffer byte ^= running value
	lodsb				// running value = updated byte;
					// advance rsi
	loop	0b

	ret

endproc

proc	x10

	// four different ways to swap a pair of registers.
	//
	// a and c are swapped four times in a row, so they finish with
	// their original values.

	// via the stack: pop in the same order as the pushes
	push	rax
	push	rcx
	pop	rax
	pop	rcx

	// the three-xor trick
	xor	rax, rcx		// a' = a XOR c
	xor	rcx, rax		// c' = c XOR a' = a
	xor	rax, rcx		// a'' = a' XOR c' = c

	// the same idea with addition and subtraction
	add	rax, rcx		// a' = a + c
	sub	rcx, rax		// c' = c - a' = -a
	add	rax, rcx		// a'' = a' + c' = c
	neg	rcx			// c'' = a

	// the instruction made for the job
	xchg	rax, rcx

	ret

endproc

proc	x11

	// assuming a is initialized to zero, set a to the inclusive or of
	// the xor-differences of corresponding bytes in the c-byte strings
	// at si and di.
	//
	// in particular, a will be zero (and zf set) if and only if the two
	// strings are equal.
	//
	// every byte is examined whatever the outcome: there is no early
	// exit.  clobbers d; si and di are left just past the strings.

0:	mov	dl, [rsi]
	xor	dl, [rdi]		// zero iff the two bytes match
	inc	rsi
	inc	rdi
	or	al, dl			// accumulate any difference; this
					// is the last write to zf
	loop	0b

	ret

endproc

proc	x12

	// an obtuse way of adding two registers.  for any bit position, a
	// OR d is set if and only if at least one of a and d has a bit set
	// in that position, and a AND d is set if and only if both have a
	// bit set in that position.  essentially, then, what we've done is
	// move all of the set bits in d to a, unless there's already a bit
	// there.  this clearly doesn't change the sum.
	//
	// the sum is left in a; c and d are clobbered.

	mov	rcx, rdx		// c' = d
	and	rdx, rax		// d' = a AND d
	or	rax, rcx		// a' = a OR d
	add	rax, rdx		// a'' = (a OR d) + (a AND d) = a + d

	ret

endproc

proc	x13

	// ok, so this is a really obtuse way of adding a and b; the sum
	// ends up in a.  but why does it work?
	//
	// each pass splits the addition into a carry-free sum (XOR) and the
	// carries (AND, moved up one place), leaving a + b unchanged modulo
	// 2^64.  after k passes the low k bits of b are clear, so after 64
	// passes b = 0 and a holds the whole sum.
	//
	// d is left holding the value a had at the start of the final pass;
	// that equals the sum only if no carry was still pending then.

	mov	rcx, 0x40		// carry chains at most 64 long
0:	mov	rdx, rax		// copy a'
	xor	rax, rbx		// low bits of each bitwise sum
	and	rbx, rdx		// carry bits from each bitwise sum
	shl	rbx, 001		// carry them into next position
	loop	0b

	ret

endproc

proc	x14

	// floor((a + d)/2), like x08.
	//
	// uses a + d = 2 (a AND d) + (a XOR d): halving the second term and
	// adding the first never needs a 65th bit.  clobbers c.

	mov	rcx, rax		// copy a for later
	and	rcx, rdx		// carry bits

	xor	rax, rdx		// low bits of each bitwise sum
	shr	rax, 1			// divide by 2; carries now in place

	add	rax, rcx		// add the carries; done

	ret

endproc

proc	x15

	// sign extension 32 -> 64 bits.
	//
	// the reasoning below relies on bits 32--63 of a being zero on
	// entry.  clobbers d.

	//movsx	rbx, eax		// like this?

	mov	rdx, 0xffffffff80000000
	add	rax, rdx		// if bit 31 of a is set then bits
					// 31--63 of a' are clear; otherwise,
					// these bits are all set -- which is
					// exactly backwards
	xor	rax, rdx		// so fix it

	ret

endproc

proc	x16

	// purpose not stated.  what the code does: form a XOR b and
	// b XOR c, add them into t (rsi), and then compare t against
	// a XOR c if the addition did not carry, or against zero if it did.
	// the result is left in the flags; a, b and rsi are clobbered.

  //shl rax, 56
  //shl rbx, 56
  //shl rcx, 56

	xor	rax, rbx		// a' = a XOR b
	xor	rbx, rcx		// b' = b XOR c
	mov	rsi, rax		// t = a XOR b
	add	rsi, rbx		// t = (a XOR b) + (b XOR c)
	cmovc	rax, rbx		// a' = cf ? b XOR c : a XOR b
	xor	rax, rbx		// a' = cf ? 0 : a XOR c
	cmp	rax, rsi

	ret

endproc

proc	x17

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x18

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x19

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x1a

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x1b

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x1c

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x1d

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x1e

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x1f

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x20

	// ud2 raises an invalid-opcode exception if this is called, so the
	// ret after it is not reached in normal execution
	ud2

	ret

endproc

proc	x21

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x22

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x23

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x24

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x25

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x26

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x27

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x28

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x29

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x2a

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x2b

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x2c

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x2d

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x2e

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x2f

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x30

	// ud2 raises an invalid-opcode exception if this is called, so the
	// ret after it is not reached in normal execution
	ud2

	ret

endproc

proc	x31

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x32

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x33

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x34

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x35

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x36

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x37

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x38

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x39

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x3a

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x3b

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x3c

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x3d

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x3e

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc

proc	x3f

	// only ud2: raises an invalid-opcode exception if this is called
	ud2

endproc