From 0727df6f0a620b2dcc97ddb220db7ab3142b2503 Mon Sep 17 00:00:00 2001
From: Henrik Rydgard <hrydgard@gmail.com>
Date: Wed, 10 Sep 2014 10:52:59 +0200
Subject: [PATCH] Jit the most common of the "ToFloat" texcoord conversions

---
 GPU/Common/VertexDecoderArm.cpp |  1 +
 GPU/Common/VertexDecoderX86.cpp | 35 +++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index 82ce8cd3bb..f41d0b2d0d 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -51,6 +51,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f};
 
 
 static const float by128 = 1.0f / 128.0f;
+static const float by16384 = 1.0f / 16384.0f;
 static const float by32768 = 1.0f / 32768.0f;
 
 using namespace ArmGen;
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 1666eb10e3..28454c061e 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -39,6 +39,10 @@ static const float MEMORY_ALIGNED16( by32768[4] ) = {
 static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0};
 static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000};
 
+static const float MEMORY_ALIGNED16(by16384[4]) = {
+	1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f,
+};
+
 #ifdef _M_X64
 #ifdef _WIN32
 static const X64Reg tempReg1 = RAX;
@@ -90,6 +94,8 @@ static const JitLookup jitLookup[] = {
 
 	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
 	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
+	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
+	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
 
@@ -98,6 +104,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
 	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
 	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
 
@@ -464,6 +471,26 @@ void VertexDecoderJitCache::Jit_TcU16() {
 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
 }
 
+void VertexDecoderJitCache::Jit_TcU8ToFloat() {
+	// TODO: The first five instructions could be done in 1 or 2 in SSE4
+	MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff));
+	MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1));
+	CVTSI2SS(fpScratchReg, R(tempReg1));
+	CVTSI2SS(fpScratchReg2, R(tempReg2));
+	UNPCKLPS(fpScratchReg, R(fpScratchReg2));
+	MULPS(fpScratchReg, M(&by128));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
+void VertexDecoderJitCache::Jit_TcU16ToFloat() {
+	PXOR(fpScratchReg2, R(fpScratchReg2));
+	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
+	PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
+	CVTDQ2PS(fpScratchReg, R(fpScratchReg));
+	MULPS(fpScratchReg, M(&by32768));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
 void VertexDecoderJitCache::Jit_TcU16Double() {
 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
 	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));
@@ -525,6 +552,14 @@ void VertexDecoderJitCache::Jit_TcU16Through() {
 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
 }
 
+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	PXOR(fpScratchReg2, R(fpScratchReg2));
+	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
+	PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
+	CVTDQ2PS(fpScratchReg, R(fpScratchReg));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
 void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
 	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));