#include "as_reg_compat.h"

// Everything below is emitted into the code section.
.text
	.set		push						// Save the current assembler options; restored by the .set pop at the end of the file
	.set		noreorder					// Branch delay slots are filled by hand below, so the assembler must not reorder or insert nops
	.set		noat						// Do not let the assembler use $at for synthesized instructions

.macro TransformWithColourM	function, fog_mode, texture_mode
.global \function
\function:

//
//	Transforms a run of vertices into world space and projected (clip) space,
//	computes their clip flags and unpacks their RGBA colour. Optionally scales
//	the texture coordinates and replaces the colour alpha with a fog coefficient.
//
//	Macro arguments:
//		function		- name of the global symbol to emit
//		fog_mode		- 1: overwrite colour alpha with the fog coefficient clamped to [0,1]
//		texture_mode	- 1: convert and scale the texture coordinates, 0: leave them unwritten
//
//	Register arguments:
//		a0 - world matrix				- must be aligned to 16 bytes
//		a1 - world projection matrix	- must be aligned to 16 bytes
//		a2 - Fiddled vertices			- stride 16
//		a3 - Output vertices			- must be aligned to 16 bytes, stride 64
//		t0 - num vertices
//		t1 - params						- quad at offset 16 is [fog_m, fog_o, tscale_x, tscale_y]
//										  (read with lv.q, so presumably 16 byte aligned too - TODO confirm)
//
//	Output vertex layout (as written by the stores below):
//		0x00 world position (quad)		0x10 projected position (quad)
//		0x20 colour (quad)				0x30 Texture.x, 0x34 Texture.y (texture_mode 1 only)
//		0x38 ClipFlags (word)
//
//	Clobbers: a2, a3 (advanced past the processed vertices), t0 (becomes the end pointer),
//	          t4, t5, VFPU M000, M100, R200, R201, R202, S203, R701 and, with fog, S600.
//

	lv.q		R000, 0($a0)		// Load mat world
	lv.q		R001, 16($a0)
	lv.q		R002, 32($a0)
	lv.q		R003, 48($a0)

	lv.q		R100, 0($a1)		// Load mat project
	lv.q		R101, 16($a1)
	lv.q		R102, 32($a1)
	lv.q		R103, 48($a1)

	// R701 = [S701, S711, S721, S731] = [fog_m, fog_o, tscale_x, tscale_y]
	lv.q		R701, 16($t1)		// Load params

	// XXXX Todo - can this be improved?
	// Load 1/256 (vuc2i/vi2f end up converting 0xff to 256.0).
	// S203 is not written inside the loop, so it stays valid for every vertex.
	vfim.s		S203, 0.00390625

	sll			$t0, $t0, 4			// count = count * 16
	addu		$t0, $a2, $t0		// end_ptr = start_ptr + count * 16

	beq			$a2, $t0, finished_colour_\function		// Nothing to do for a zero vertex count
	nop

next_vertex_colour_\function:
	// Load and transform this vertex position
	lv.s		S200, 0($a2)					// load word [y,x]
	lv.s		S210, 4($a2)					// load word [?,z] - should align this to 16 bytes so we can do a single load?
	vs2i.p		R200, R200						// Unpack the four 16 bit values: R200 = [y,x,?,z]
	vi2f.q		R200, R200, 16					// int -> float
	vmov.q		R200, R200[y,x,w,1]				// R200 = [x,y,z,1]. Have to permute here, as sadly can't do this with first vtfm4.q

	vtfm4.q		R201, M000, R200				// World transform
	vtfm4.q		R202, M100, R200				// Projection transform

	sv.q		R201, 0($a3)					// Store world transform
	sv.q		R202, 16($a3)					// Store projection transform

.if \fog_mode == 1
	//	Calculate the Colour alpha value from the fog. We do this while the projected point
	//	is still loaded, and merge it in after the rgb components are calculated below.
	//	float eyespace_z = projected.z / projected.w;
	//	fog_coeff = (eyespace_z * m_fFogMult) + m_fFogOffset;
	//	mVtxProjected[i].Colour.w = std::clamp< f32 >( fog_coeff, 0.0f, 1.0f );
	//	S200/S201 are free to use as scratch here: both transforms have already been stored.
	vmov.s		S200, S232					// get w component
	vmov.s		S201, S222					// v = z
	vrcp.s		S200, S200					// 1.0 / projected.w
	vmul.s		S201, S201, S701			// v = z * fog_mult
	vmul.s		S201, S201, S200			// v = z*fog_mult*(1/w)
	// fog_offset is element 1 of R701, i.e. S711. (S710 is part of R700, which is never loaded here.)
	vadd.s		S600[0:1], S201, S711		// v = clamp( z*fog_mult*(1/w) + fog_offset, 0, 1 )
.endif

	// Compute the clip flags: bits 0-2 hold the x/y/z "greater than w" results,
	// bits 3-5 hold the x/y/z "less than -w" results.
	vcmp.q		LT, R202, R202[-w,-w,-w,0]		// x < -w, y < -w, z < -w
	vnop
	mfvc		$t4, $131						// VFPU_CC. Corresponds to X_NEG/Y_NEG/Z_NEG (will become X_POS/Y_POS/Z_POS later)
	andi		$t4, $t4, 0x7					// Mask out the condition codes we don't care about
	sll			$t4, $t4, 3						// Shift up to create X_POS/Y_POS/Z_POS

	vcmp.q		GT, R202, R202[w,w,w,0]			// x > w, y > w, z > w
	vnop
	mfvc		$t5, $131						// VFPU_CC. Corresponds to X_NEG/Y_NEG/Z_NEG
	andi		$t5, $t5, 0x7					// Mask out the condition codes we don't care about
	or			$t4, $t4, $t5

	sw			$t4, 56($a3)					// Store ClipFlags


.if \texture_mode == 0
	// Nothing to do

.elseif \texture_mode == 1
	// Textured
	//	t.x = (float)v.tu * mTextureScale.x
	//	t.y = (float)v.tv * mTextureScale.y
	//	R202 (the projected position) is reused as scratch; it has already been stored and tested.

	lv.s		S202, 8($a2)				// load texture word [tv,tu] (N.B. due to swizzling these are 'backwards' from what you might expect)
	vs2i.s		R202, S202					// Unpack: S202 = tv, S212 = tu
	vi2f.q		R202, R202, 16				// int -> float
	vmul.q		R202, R202, R701[w,z,0,0]	// multiply by mTextureScale: tv * tscale_y, tu * tscale_x
	sv.s		S212, 0x30($a3)				// Store Texture.x
	sv.s		S202, 0x34($a3)				// Store Texture.y

.endif

	// Load and normalise the RGBA colour
	lv.s		S200, 12($a2)					// load colour word [a,b,g,r]
	.word		0xd0380000 | (8<<8) | (40)		// vuc2i.s	R200, S200 (hand encoded)	// R200 = [a,b,g,r]
	vi2f.q		R200, R200, 23					// int -> float, 0xff becomes 256.0
	vscl.q		R200, R200[w,z,y,x], S203		// R200 = [r * 1/256, g * 1/256, b * 1/256, a * 1/256]

.if \fog_mode == 1
	// Merge in the computed fog coefficient as the alpha component
	vmov.s		S230, S600
.endif

	sv.q		R200, 32($a3)					// Store colour

	// Continue with the next vertex
	addiu		$a2, $a2, 16		// Next input vertex
	bne			$a2, $t0, next_vertex_colour_\function
	addiu		$a3, $a3, 64		// Next output vertex (branch delay slot, always executed)

finished_colour_\function:
	jr			$ra
	nop

.endm

// Instantiate one global routine per (fog_mode, texture_mode) combination.
// Arguments are: symbol name, fog_mode, texture_mode.
TransformWithColourM _TransformVerticesWithColour_f0_t0, 0, 0		// no fog, no texture coordinates
TransformWithColourM _TransformVerticesWithColour_f0_t1, 0, 1		// no fog, scaled texture coordinates
TransformWithColourM _TransformVerticesWithColour_f1_t0, 1, 0		// fog in alpha, no texture coordinates
TransformWithColourM _TransformVerticesWithColour_f1_t1, 1, 1		// fog in alpha, scaled texture coordinates

	.set pop		// Restore the assembler options saved by .set push at the top of the file