From 7699fa55de16f4ed14512dd702fbded2ff0daa70 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 6 May 2017 18:58:15 -0700
Subject: [PATCH] arm: Jit throughmode 16-bit texcoords.

It's popular, and this makes decoding such verts much faster.
---
 GPU/Common/VertexDecoderArm.cpp   | 39 ++++++++++++++++++++++++++++++-
 GPU/Common/VertexDecoderArm64.cpp | 22 ++++++++++++++++++++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index 3f9267cab3..ce87e02117 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -127,7 +127,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
@@ -568,6 +568,43 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 	STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
 }
 
+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	LDRH(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
+
+	MOVP2R(scratchReg, &gstate_c.vertBounds.minU);
+
+	auto updateSide = [&](ARMReg r, CCFlags cc, u32 off) {
+		LDRH(tempReg3, scratchReg, off);
+		CMP(r, tempReg3);
+		SetCC(cc);
+		STRH(r, scratchReg, off);
+		SetCC(CC_AL);
+	};
+
+	// TODO: Can this actually be fast?  Hmm, floats aren't better.
+	updateSide(tempReg1, CC_LT, offsetof(KnownVertexBounds, minU));
+	updateSide(tempReg1, CC_GT, offsetof(KnownVertexBounds, maxU));
+	updateSide(tempReg2, CC_LT, offsetof(KnownVertexBounds, minV));
+	updateSide(tempReg2, CC_GT, offsetof(KnownVertexBounds, maxV));
+
+	if (cpu_info.bNEON) {
+		ADD(scratchReg, srcReg, dec_->tcoff);
+		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
+		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
+		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
+		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
+		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
+	} else {
+		VMOV(fpScratchReg, tempReg1);
+		VMOV(fpScratchReg2, tempReg2);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
+		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
+		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
+	}
+}
+
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDR(tempReg1, srcReg, dec_->tcoff);
 	LDR(tempReg2, srcReg, dec_->tcoff + 4);
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index f68f100457..12f58e829b 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -101,7 +101,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
@@ -579,6 +579,26 @@ void VertexDecoderJitCache::Jit_Color5551() {
 	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
 }
 
+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
+
+	auto updateSide = [&](ARM64Reg src, CCFlags cc, ARM64Reg dst) {
+		CMP(src, dst);
+		CSEL(dst, src, dst, cc);
+	};
+
+	updateSide(tempReg1, CC_LT, boundsMinUReg);
+	updateSide(tempReg1, CC_GT, boundsMaxUReg);
+	updateSide(tempReg2, CC_LT, boundsMinVReg);
+	updateSide(tempReg2, CC_GT, boundsMaxVReg);
+
+	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
+	fp.UXTL(16, neonScratchRegQ, neonScratchRegD);  // Widen to 32-bit
+	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
+	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
+}
+
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
 	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
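
For reference, here is a rough C++ sketch of what one Jit_TcU16ThroughToFloat step does per vertex. It is not code from this patch and not the project's actual Step_TcU16ThroughToFloat implementation; KnownVertexBounds mirrors the minU/maxU/minV/maxV fields the patch updates, while TcU16ThroughToFloatRef, srcPtr, dstPtr, tcoff, and uvoff are hypothetical stand-ins for srcReg, dstReg, dec_->tcoff, and dec_->decFmt.uvoff.

#include <algorithm>
#include <cstdint>
#include <cstring>

struct KnownVertexBounds {
	uint16_t minU, maxU;
	uint16_t minV, maxV;
};

// Illustrative scalar equivalent of the emitted code, one vertex at a time.
static void TcU16ThroughToFloatRef(const uint8_t *srcPtr, uint8_t *dstPtr,
                                   int tcoff, int uvoff, KnownVertexBounds &bounds) {
	// Two 16-bit through-mode texcoords, as in the paired LDRH loads.
	uint16_t u, v;
	std::memcpy(&u, srcPtr + tcoff, sizeof(u));
	std::memcpy(&v, srcPtr + tcoff + 2, sizeof(v));

	// Track the UV bounding box, like the conditional STRH (ARM) / CSEL (ARM64) updates.
	bounds.minU = std::min(bounds.minU, u);
	bounds.maxU = std::max(bounds.maxU, u);
	bounds.minV = std::min(bounds.minV, v);
	bounds.maxV = std::max(bounds.maxV, v);

	// Widen to float and store, like VMOVL/VCVT/VST1 (NEON) or UXTL/UCVTF/STUR (ARM64).
	float uv[2] = { (float)u, (float)v };
	std::memcpy(dstPtr + uvoff, uv, sizeof(uv));
}

On 32-bit ARM the vector path (VLD1_lane/VMOVL/VCVT/VST1) is only emitted when cpu_info.bNEON is set, with a scalar VFP VMOV/VCVT/VSTR fallback otherwise; ARM64 always has ASIMD, so only the LDUR/UXTL/UCVTF/STUR sequence is emitted there.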