bsnes/libco/ppc-elf.c

/*
 * libco.ppc-elf
 * author: Kernigh
 * license: public domain
 *
 * PowerPC 32-bit ELF implementation of libco (for compile with GCC),
 * ported from PowerPC Mac OS X implementation (ppc.s) by Vas Crabb.
 * This ELF version works for OpenBSD, and might also work for FreeBSD,
 * NetBSD and Linux.
 *
 * Note 1:	This implementation does not handle the AltiVec/VMX
 *		registers, because the ELF ABI does not mention them,
 *		and my OpenBSD system is not using them.
 *
 * Note 2:	If you want position-independent code, then you must
 *		define __PIC__. gcc -fpic or -fPIC defines __PIC__, but
 *		gcc -fpie or -fPIE might not. If you want to use -fpie
 *		or -fPIE, then you might need a manual definition:
 *			gcc -fpie -D__PIC__=1
 *			gcc -fPIE -D__PIC__=2
 *
 * The ELF ABI is "System V Application Binary Interface, PowerPC
 * Processor Supplement", which you can get from
 *   <http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf>
 * (PDF file, hosted by Linux Foundation).
 *
 * ELF and Mac OS X use similar conventions to allocate the registers,
 * and to pass arguments and return values through registers. The main
 * differences are that ELF has a slightly different stack format, that
 * symbols are different (and without an extra underscore at the start),
 * and that the assembly syntax is different.
 *
 * A function may destroy the values of volatile registers, but must
 * preserve the values of nonvolatile registers. So the co_switch()
 * function only saves the nonvolatile registers.
 *
 * [nonvolatile registers in ELF]
 *   %r1, %r14..%r31
 *   %f14..%f31
 *   %cr2..%cr4 in cr
 *
 * [volatile registers in ELF]
 *   %r0, %r3..%r10
 *   %f0..%f13
 *   %cr0, %cr1, %cr5..%cr7 in cr
 *   ctr, lr, xer
 *
 * lr (link register) is volatile, but it contains the return address,
 * so co_switch must save lr.
 *
 * %r13 is the small data pointer. This is constant across threads, so
 * co_switch() does not touch %r13.
 *
 * %r2 is a reserved register, so co_switch() does not touch %r2. Some
 * systems might borrow an idea from the PowerPC Embedded ABI, and might
 * use %r2 as a small read-only data pointer, which is constant across
 * threads.
 */

#ifdef __cplusplus
extern "C" {
#endif

typedef void * cothread_t;

/*
 * co_active_context is either in a global offset table (if we are
 * compiling -fPIC or -fPIE) or has an absolute position.
 */
static void *co_main_stack_pointer;
static cothread_t co_active_context = &co_main_stack_pointer;

extern cothread_t co_active() {
  return co_active_context;
}

/*
 * Embedded assembly.
 *
 * We are not using the percent-sign substitution feature,
 * so we must write "%r1", not "%%r1".
 *
 * We always write 'bl malloc@plt', not 'bl malloc'. The '@plt'
 * is necessary in position-indepent code and seems to have no
 * significant effect in fixed-position code.
 *
 * We never use the 'lmw' or 'stmw' instructions. The ELF ABI
 * mentions that these instructions "are usually slower than
 * a sequence of other instructions that have the same effect."
 * We instead use sequences of 'lwz' or 'stz' instructions.
 */
__asm__("\n"
"### embedded assembly						\n"
".section \".text\"						\n"
"	.balign	4						\n"
"								\n"
/*
 * void co_switch(co_thread to %r3)
 *
 * Allocate our stack frame of 240 bytes:
 * Old      New        Value
 *  4(%r1)   244(%r1)   return address, used by us
 *  0(%r1)   240(%r1)   frame pointer
 *           232(%r1)   %f31
 *           224(%r1)   %f30
 *                      ...
 *            96(%r1)   %f14
 *            92(%r1)   %r31
 *            88(%r1)   %r30
 *                      ...
 *            24(%r1)   %r14
 *            20(%r1)   condition register
 *             8(%r1)   padding of 12 bytes
 *             4(%r1)   return address, never used
 *             0(%r1)   frame pointer
 *
 * Save our registers in our stack frame.
 * Save our stack pointer in 0(%r4).
 * Switch to the stack of the other thread.
 * Restore registers and return.
 */
"	.globl	co_switch					\n"
"	.type	co_switch, @function				\n"
"co_switch:							\n"
"	mflr	%r0		# %r0 = return address		\n"
"	mfcr	%r9		# %r9 = condition register	\n"
"	stwu	%r1, -240(%r1)	# allocate stack frame		\n"
"								\n"
"	stw	%r0, 244(%r1)	# save return address		\n"
"	stfd	%f31, 232(%r1)	# save floating-point regs	\n"
"	stfd	%f30, 224(%r1)					\n"
"	stfd	%f29, 216(%r1)					\n"
"	stfd	%f28, 208(%r1)					\n"
"	stfd	%f27, 200(%r1)					\n"
"	stfd	%f26, 192(%r1)					\n"
"	stfd	%f25, 184(%r1)					\n"
"	stfd	%f24, 176(%r1)					\n"
"	stfd	%f23, 168(%r1)					\n"
"	stfd	%f22, 160(%r1)					\n"
"	stfd	%f21, 152(%r1)					\n"
"	stfd	%f20, 144(%r1)					\n"
"	stfd	%f19, 136(%r1)					\n"
"	stfd	%f18, 128(%r1)					\n"
"	stfd	%f17, 120(%r1)					\n"
"	stfd	%f16, 112(%r1)					\n"
"	stfd	%f16, 104(%r1)					\n"
"	stfd	%f14, 96(%r1)					\n"
"	stw	%r31, 92(%r1)	# save general-purpose regs	\n"
"	stw	%r30, 88(%r1)					\n"
"	stw	%r29, 84(%r1)					\n"
"	stw	%r28, 80(%r1)					\n"
"	stw	%r27, 76(%r1)					\n"
"	stw	%r26, 72(%r1)					\n"
"	stw	%r25, 68(%r1)					\n"
"	stw	%r24, 64(%r1)					\n"
"	stw	%r23, 60(%r1)					\n"
"	stw	%r22, 56(%r1)					\n"
"	stw	%r21, 52(%r1)					\n"
"	stw	%r20, 48(%r1)					\n"
"	stw	%r19, 44(%r1)					\n"
"	stw	%r18, 40(%r1)					\n"
"	stw	%r17, 36(%r1)					\n"
"	stw	%r16, 32(%r1)					\n"
"	stw	%r15, 28(%r1)					\n"
"	stw	%r14, 24(%r1)					\n"
"	stw	%r9, 20(%r1)	# save condition reg		\n"
"								\n"
"	# save current context, set new context			\n"
"	#   %r4 = co_active_context				\n"
"	#   co_active_context = %r3				\n"
#if __PIC__ == 2
"	# position-independent code, large model (-fPIC)	\n"
"	bl	_GLOBAL_OFFSET_TABLE_@local-4			\n"
"	mflr	%r8		# %r8 = address of got		\n"
"	addis	%r7, %r8, co_active_context@got@ha		\n"
"	lwz	%r6, co_active_context@got@l(%r7)		\n"
"	lwz	%r4, 0(%r6)					\n"
"	stw	%r3, 0(%r6)					\n"
#elif __PIC__ == 1
"	# position-independent code, small model (-fpic)	\n"
"	bl	_GLOBAL_OFFSET_TABLE_@local-4			\n"
"	mflr	%r8		# %r8 = address of got		\n"
"	lwz	%r7, co_active_context@got(%r8)			\n"
"	lwz	%r4, 0(%r7)					\n"
"	stw	%r3, 0(%r7)					\n"
#else
"	# fixed-position code					\n"
"	lis	%r8, co_active_context@ha			\n"
"	lwz	%r4, co_active_context@l(%r8)			\n"
"	stw	%r3, co_active_context@l(%r8)			\n"
#endif
"								\n"
"	# save current stack pointer				\n"
"	stw	%r1, 0(%r4)					\n"
"	# get new stack pointer					\n"
"	lwz	%r1, 0(%r3)					\n"
"								\n"
"	lwz	%r0, 244(%r1)	# get return address		\n"
"	lfd	%f31, 232(%r1)	# restore floating-point regs	\n"
"	lfd	%f30, 224(%r1)					\n"
"	lfd	%f29, 216(%r1)					\n"
"	lfd	%f28, 208(%r1)					\n"
"	lfd	%f27, 200(%r1)					\n"
"	lfd	%f26, 192(%r1)					\n"
"	lfd	%f25, 184(%r1)					\n"
"	lfd	%f24, 176(%r1)					\n"
"	lfd	%f23, 168(%r1)					\n"
"	lfd	%f22, 160(%r1)					\n"
"	lfd	%f21, 152(%r1)					\n"
"	lfd	%f20, 144(%r1)					\n"
"	lfd	%f19, 136(%r1)					\n"
"	lfd	%f18, 128(%r1)					\n"
"	lfd	%f17, 120(%r1)					\n"
"	lfd	%f16, 112(%r1)					\n"
"	lfd	%f16, 104(%r1)					\n"
"	lfd	%f14, 96(%r1)					\n"
"	lwz	%r31, 92(%r1)	# restore general-purpose regs	\n"
"	lwz	%r30, 88(%r1)					\n"
"	lwz	%r29, 84(%r1)					\n"
"	lwz	%r28, 80(%r1)					\n"
"	lwz	%r27, 76(%r1)					\n"
"	lwz	%r26, 72(%r1)					\n"
"	lwz	%r25, 68(%r1)					\n"
"	lwz	%r24, 64(%r1)					\n"
"	lwz	%r23, 60(%r1)					\n"
"	lwz	%r22, 56(%r1)					\n"
"	lwz	%r21, 52(%r1)					\n"
"	lwz	%r20, 48(%r1)					\n"
"	lwz	%r19, 44(%r1)					\n"
"	lwz	%r18, 40(%r1)					\n"
"	lwz	%r17, 36(%r1)					\n"
"	lwz	%r16, 32(%r1)					\n"
"	lwz	%r15, 28(%r1)					\n"
"	lwz	%r14, 24(%r1)					\n"
"	lwz	%r9, 20(%r1)	# get condition reg		\n"
"								\n"
"	addi	%r1, %r1, 240	# free stack frame		\n"
"	mtlr	%r0		# restore return address	\n"
"	mtcr	%r9		# restore condition register	\n"
"	blr			# return			\n"
"	.size	co_switch, . - co_switch			\n"
"								\n"
/*
 * cothread_t %r3 co_create(unsigned int stack_size %r3,
 *			    void (*coentry %r4)())
 *
 * Allocate a new stack, such that when you co_switch to that
 * stack, then co_switch returns to coentry.
 */
"	.globl	co_create					\n"
"	.type	co_create, @function				\n"
"co_create:							\n"
"	mflr	%r0		# %r0 = return address		\n"
"	stwu	%r1, -16(%r1)	# allocate my stack frame	\n"
"	stw	%r0, 20(%r1)	# save return address		\n"
"	stw	%r31, 12(%r1)	# save %r31			\n"
"	stw	%r30, 8(%r1)	# save %r30			\n"
"								\n"
"	mr	%r30, %r3	# %r30 = stack_size		\n"
"	mr	%r31, %r4	# %r31 = coentry		\n"
"								\n"
"	# Call malloc(stack_size %r3) to allocate stack;	\n"
"	# malloc() probably uses good alignment.		\n"
"	#							\n"
"	bl	malloc@plt	# returns %r3 = low end		\n"
"	cmpwi	%r3, 0		# if returned NULL,		\n"
"	beq-	1f		#   then abort			\n"
"								\n"
"	# we return %r3 = low end of stack			\n"
"	add	%r4, %r3, %r30	# %r4 = high end of stack	\n"
"								\n"
"	# uncomment if malloc() uses wrong alignment		\n"
"	#rlwinm	%r4,%r4,0,0,27	# force 16-byte alignment 	\n"
"								\n"
	/*
	 * Allocate two stack frames:
	 *   16 bytes for stack frame with return address
	 *  240 bytes for co_switch stack frame
	 *
	 * Old         New         Value
	 *   -8(%r4)    248(%r5)    padding of 8 bytes
	 *  -12(%r4)    244(%r5)    return address = coentry
	 *  -16(%r4)    240(%r5)    frame pointer = NULL
	 *              232(%r5)    %f31 = 0
	 *                          ...
	 *               20(%r5)    condition register = 0
	 *                0(%r5)    frame pointer
	 */
"	li	%r9, (240-20)/4+1				\n"
"	addi	%r5, %r4, -16	# allocate first stack frame	\n"
"	li	%r0, 0						\n"
"	stwu	%r5, -240(%r5)	# allocate second stack frame	\n"
"	li	%r8, 20						\n"
"	mtctr	%r9		# loop %r9 times		\n"
"2:	# loop to store zero to 20(%r5) through 240(%r5)	\n"
"	stwx	%r0, %r5, %r8					\n"
"	addi	%r8, %r8, 4	# index += 4			\n"
"	bdnz+	2b		# ctr -= 1, branch if nonzero	\n"
"								\n"
"	stw	%r31, 244(%r5)	# return address = coentry	\n"
"	stw	%r5, 0(%r3)	# save stack pointer		\n"
"								\n"
"	lwz	%r0, 20(%r1)	# get return address		\n"
"	lwz	%r31, 12(%r1)	# restore %r31			\n"
"	lwz	%r30, 8(%r1)	# restore %r30			\n"
"	mtlr	%r0		# restore return address	\n"
"	addi	%r1, %r1, 16	# free stack frame		\n"
"	blr			# return			\n"
"								\n"
"1:	b	abort@plt	# branch 1f to abort		\n"
"	.size	co_create, . - co_create			\n"
"								\n"
/*
 * void co_delete(cothread_t) => void free(void *)
 */
"	.globl	co_delete					\n"
"	.type	co_delete, @function				\n"
"co_delete:							\n"
"	b	free@plt					\n"
"								\n"
);

#ifdef __cplusplus
}
#endif