From a0397bce4ca8919db0da0ad42e726845dd127d57 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 13 Apr 2016 23:15:41 -0700
Subject: [PATCH 1/4] Hopefully fix prescale in remasters.

Don't actually have a remaster to test, though.
---
 GPU/Common/VertexDecoderCommon.cpp | 33 ++++++++++++++++++++++--------
 GPU/Common/VertexDecoderCommon.h   |  1 +
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index f8ab3a63d7..9900d38a4d 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -390,6 +390,13 @@ void VertexDecoder::Step_TcU16Prescale() const {
 	uv[1] = (float)uvdata[1] * (1.f / 32768.f) * gstate_c.uv.vScale + gstate_c.uv.vOff;
 }
 
+void VertexDecoder::Step_TcU16DoublePrescale() const {
+	float *uv = (float *)(decoded_ + decFmt.uvoff);
+	const u16 *uvdata = (const u16_le *)(ptr_ + tcoff);
+	uv[0] = (float)uvdata[0] * (1.f / 16384.f) * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	uv[1] = (float)uvdata[1] * (1.f / 16384.f) * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
 void VertexDecoder::Step_TcFloatPrescale() const {
 	float *uv = (float *)(decoded_ + decFmt.uvoff);
 	const float *uvdata = (const float*)(ptr_ + tcoff);
@@ -752,6 +759,13 @@ static const StepFunction tcstep_prescale[4] = {
 	&VertexDecoder::Step_TcFloatPrescale,
 };
 
+static const StepFunction tcstep_prescale_remaster[4] = {
+	0,
+	&VertexDecoder::Step_TcU8Prescale,
+	&VertexDecoder::Step_TcU16DoublePrescale,
+	&VertexDecoder::Step_TcFloatPrescale,
+};
+
 static const StepFunction tcstep_through[4] = {
 	0,
 	&VertexDecoder::Step_TcU8,
@@ -767,28 +781,28 @@ static const StepFunction tcstep_throughToFloat[4] = {
 };
 
 // Some HD Remaster games double the u16 texture coordinates.
-static const StepFunction tcstep_Remaster[4] = {
+static const StepFunction tcstep_remaster[4] = {
 	0,
 	&VertexDecoder::Step_TcU8,
 	&VertexDecoder::Step_TcU16Double,
 	&VertexDecoder::Step_TcFloat,
 };
 
-static const StepFunction tcstep_RemasterToFloat[4] = {
+static const StepFunction tcstep_remasterToFloat[4] = {
 	0,
 	&VertexDecoder::Step_TcU8ToFloat,
 	&VertexDecoder::Step_TcU16DoubleToFloat,
 	&VertexDecoder::Step_TcFloat,
 };
 
-static const StepFunction tcstep_through_Remaster[4] = {
+static const StepFunction tcstep_through_remaster[4] = {
 	0,
 	&VertexDecoder::Step_TcU8,
 	&VertexDecoder::Step_TcU16ThroughDouble,
 	&VertexDecoder::Step_TcFloatThrough,
 };
 
-static const StepFunction tcstep_through_RemasterToFloat[4] = {
+static const StepFunction tcstep_through_remasterToFloat[4] = {
 	0,
 	&VertexDecoder::Step_TcU8ToFloat,
 	&VertexDecoder::Step_TcU16ThroughDoubleToFloat,
@@ -955,19 +969,22 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 			biggest = tcalign[tc];
 
 		// NOTE: That we check getUVGenMode here means that we must include it in the decoder ID!
-		if (g_Config.bPrescaleUV && !throughmode && (gstate.getUVGenMode() == 0 || gstate.getUVGenMode() == 3)) {
-			steps_[numSteps_++] = tcstep_prescale[tc];
+		if (g_Config.bPrescaleUV && !throughmode && (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_COORDS || gstate.getUVGenMode() == GE_TEXMAP_UNKNOWN)) {
+			if (g_DoubleTextureCoordinates)
+				steps_[numSteps_++] = tcstep_prescale_remaster[tc];
+			else
+				steps_[numSteps_++] = tcstep_prescale[tc];
 			decFmt.uvfmt = DEC_FLOAT_2;
 		} else {
 			if (options.expandAllUVtoFloat) {
 				if (g_DoubleTextureCoordinates)
-					steps_[numSteps_++] = throughmode ? tcstep_through_RemasterToFloat[tc] : tcstep_RemasterToFloat[tc];
+					steps_[numSteps_++] = throughmode ? tcstep_through_remasterToFloat[tc] : tcstep_remasterToFloat[tc];
 				else
 					steps_[numSteps_++] = throughmode ? tcstep_throughToFloat[tc] : tcstepToFloat[tc];
 				decFmt.uvfmt = DEC_FLOAT_2;
 			} else {
 				if (g_DoubleTextureCoordinates)
-					steps_[numSteps_++] = throughmode ? tcstep_through_Remaster[tc] : tcstep_Remaster[tc];
+					steps_[numSteps_++] = throughmode ? tcstep_through_remaster[tc] : tcstep_remaster[tc];
 				else
 					steps_[numSteps_++] = throughmode ? tcstep_through[tc] : tcstep[tc];
 
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 14ed122bae..dff07bedc2 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -484,6 +484,7 @@ public:
 
 	void Step_TcU8Prescale() const;
 	void Step_TcU16Prescale() const;
+	void Step_TcU16DoublePrescale() const;
 	void Step_TcFloatPrescale() const;
 
 	void Step_TcU16Double() const;

From 614665068a7691ba917d46a5311e1a6c28b861e0 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 13 Apr 2016 23:34:45 -0700
Subject: [PATCH 2/4] Implement morphing for texcoords.

Tests show that this can be used.
---
 GPU/Common/VertexDecoderCommon.cpp | 162 +++++++++++++++++++++++++++--
 GPU/Common/VertexDecoderCommon.h   |   9 ++
 2 files changed, 164 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index 9900d38a4d..6f6a0dc9d2 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -385,14 +385,14 @@ void VertexDecoder::Step_TcU8Prescale() const {
 
 void VertexDecoder::Step_TcU16Prescale() const {
 	float *uv = (float *)(decoded_ + decFmt.uvoff);
-	const u16 *uvdata = (const u16_le *)(ptr_ + tcoff);
+	const u16_le *uvdata = (const u16_le *)(ptr_ + tcoff);
 	uv[0] = (float)uvdata[0] * (1.f / 32768.f) * gstate_c.uv.uScale + gstate_c.uv.uOff;
 	uv[1] = (float)uvdata[1] * (1.f / 32768.f) * gstate_c.uv.vScale + gstate_c.uv.vOff;
 }
 
 void VertexDecoder::Step_TcU16DoublePrescale() const {
 	float *uv = (float *)(decoded_ + decFmt.uvoff);
-	const u16 *uvdata = (const u16_le *)(ptr_ + tcoff);
+	const u16_le *uvdata = (const u16_le *)(ptr_ + tcoff);
 	uv[0] = (float)uvdata[0] * (1.f / 16384.f) * gstate_c.uv.uScale + gstate_c.uv.uOff;
 	uv[1] = (float)uvdata[1] * (1.f / 16384.f) * gstate_c.uv.vScale + gstate_c.uv.vOff;
 }
@@ -404,6 +404,126 @@ void VertexDecoder::Step_TcFloatPrescale() const {
 	uv[1] = uvdata[1] * gstate_c.uv.vScale + gstate_c.uv.vOff;
 }
 
+void VertexDecoder::Step_TcU8Morph() const {
+	float uv[2] = { 0, 0 };
+	for (int n = 0; n < morphcount; n++) {
+		float w = gstate_c.morphWeights[n];
+		const u8 *uvdata = (const u8 *)(ptr_ + onesize_*n + tcoff);
+
+		uv[0] += (float)uvdata[0] * (1.f / 128.f) * w;
+		uv[1] += (float)uvdata[1] * (1.f / 128.f) * w;
+	}
+
+	float *out = (float *)(decoded_ + decFmt.uvoff);
+	out[0] = uv[0];
+	out[1] = uv[1];
+}
+
+void VertexDecoder::Step_TcU16Morph() const {
+	float uv[2] = { 0, 0 };
+	for (int n = 0; n < morphcount; n++) {
+		float w = gstate_c.morphWeights[n];
+		const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff);
+
+		uv[0] += (float)uvdata[0] * (1.f / 32768.f) * w;
+		uv[1] += (float)uvdata[1] * (1.f / 32768.f) * w;
+	}
+
+	float *out = (float *)(decoded_ + decFmt.uvoff);
+	out[0] = uv[0];
+	out[1] = uv[1];
+}
+
+void VertexDecoder::Step_TcU16DoubleMorph() const {
+	float uv[2] = { 0, 0 };
+	for (int n = 0; n < morphcount; n++) {
+		float w = gstate_c.morphWeights[n];
+		const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff);
+
+		uv[0] += (float)uvdata[0] * (1.f / 16384.f) * w;
+		uv[1] += (float)uvdata[1] * (1.f / 16384.f) * w;
+	}
+
+	float *out = (float *)(decoded_ + decFmt.uvoff);
+	out[0] = uv[0];
+	out[1] = uv[1];
+}
+
+// Weighted blend of the float texcoords of every morph frame, written out as two floats.
+void VertexDecoder::Step_TcFloatMorph() const {
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const float_le *src = (const float_le *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * weight;
+		sumV += (float)src[1] * weight;
+	}
+
+	float *dst = (float *)(decoded_ + decFmt.uvoff);
+	dst[0] = sumU;
+	dst[1] = sumV;
+}
+
+// Morph-blends u8 texcoords (each scaled by 1/128), then applies the gstate_c.uv scale and offset.
+void VertexDecoder::Step_TcU8PrescaleMorph() const {
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const u8 *src = (const u8 *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * (1.f / 128.f) * weight;
+		sumV += (float)src[1] * (1.f / 128.f) * weight;
+	}
+
+	float *dst = (float *)(decoded_ + decFmt.uvoff);
+	dst[0] = sumU * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	dst[1] = sumV * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
+// Morph-blends u16 texcoords (each scaled by 1/32768), then applies the gstate_c.uv scale and offset.
+void VertexDecoder::Step_TcU16PrescaleMorph() const {
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const u16_le *src = (const u16_le *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * (1.f / 32768.f) * weight;
+		sumV += (float)src[1] * (1.f / 32768.f) * weight;
+	}
+
+	float *dst = (float *)(decoded_ + decFmt.uvoff);
+	dst[0] = sumU * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	dst[1] = sumV * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
+// Like Step_TcU16PrescaleMorph, but scales by 1/16384 (twice 1/32768) for doubled texcoords.
+void VertexDecoder::Step_TcU16DoublePrescaleMorph() const {
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const u16_le *src = (const u16_le *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * (1.f / 16384.f) * weight;
+		sumV += (float)src[1] * (1.f / 16384.f) * weight;
+	}
+
+	float *dst = (float *)(decoded_ + decFmt.uvoff);
+	dst[0] = sumU * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	dst[1] = sumV * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
+// Morph-blends float texcoords, then applies the gstate_c.uv scale and offset.
+void VertexDecoder::Step_TcFloatPrescaleMorph() const {
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const float_le *src = (const float_le *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * weight;
+		sumV += (float)src[1] * weight;
+	}
+
+	float *dst = (float *)(decoded_ + decFmt.uvoff);
+	dst[0] = sumU * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	dst[1] = sumV * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
 void VertexDecoder::Step_ColorInvalid() const
 {
 	// Do nothing.  This is only here to prevent crashes.
@@ -766,6 +886,34 @@ static const StepFunction tcstep_prescale_remaster[4] = {
 	&VertexDecoder::Step_TcFloatPrescale,
 };
 
+static const StepFunction tcstep_prescale_morph[4] = {  // Prescale + morph steps; SetVertexType indexes this by tc.
+	0,  // Index 0: no step function.
+	&VertexDecoder::Step_TcU8PrescaleMorph,
+	&VertexDecoder::Step_TcU16PrescaleMorph,
+	&VertexDecoder::Step_TcFloatPrescaleMorph,
+};
+
+static const StepFunction tcstep_prescale_morph_remaster[4] = {  // Chosen instead of tcstep_prescale_morph when g_DoubleTextureCoordinates is set.
+	0,  // Index 0: no step function.
+	&VertexDecoder::Step_TcU8PrescaleMorph,
+	&VertexDecoder::Step_TcU16DoublePrescaleMorph,  // Only the u16 entry differs from tcstep_prescale_morph.
+	&VertexDecoder::Step_TcFloatPrescaleMorph,
+};
+
+static const StepFunction tcstep_morph[4] = {  // Morph steps without prescale; SetVertexType indexes this by tc.
+	0,  // Index 0: no step function.
+	&VertexDecoder::Step_TcU8Morph,
+	&VertexDecoder::Step_TcU16Morph,
+	&VertexDecoder::Step_TcFloatMorph,
+};
+
+static const StepFunction tcstep_morph_remaster[4] = {
+	0,
+	&VertexDecoder::Step_TcU8Morph,
+	&VertexDecoder::Step_TcU16DoubleMorph,
+	&VertexDecoder::Step_TcFloatMorph,
+};
+
 static const StepFunction tcstep_through[4] = {
 	0,
 	&VertexDecoder::Step_TcU8,
@@ -809,9 +957,6 @@ static const StepFunction tcstep_through_remasterToFloat[4] = {
 	&VertexDecoder::Step_TcFloatThrough,
 };
 
-
-// TODO: Tc Morph
-
 static const StepFunction colstep[8] = {
 	0,
 	&VertexDecoder::Step_ColorInvalid,
@@ -971,9 +1116,12 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 		// NOTE: That we check getUVGenMode here means that we must include it in the decoder ID!
 		if (g_Config.bPrescaleUV && !throughmode && (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_COORDS || gstate.getUVGenMode() == GE_TEXMAP_UNKNOWN)) {
 			if (g_DoubleTextureCoordinates)
-				steps_[numSteps_++] = tcstep_prescale_remaster[tc];
+				steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale_remaster[tc] : tcstep_prescale_morph_remaster[tc];
 			else
-				steps_[numSteps_++] = tcstep_prescale[tc];
+				steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale[tc] : tcstep_prescale_morph[tc];
+			decFmt.uvfmt = DEC_FLOAT_2;
+		} else if (morphcount != 1 && !throughmode) {
+			steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remaster[tc] : tcstep_morph[tc];
 			decFmt.uvfmt = DEC_FLOAT_2;
 		} else {
 			if (options.expandAllUVtoFloat) {
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index dff07bedc2..0e30d5caff 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -495,6 +495,15 @@ public:
 	void Step_TcU16ThroughDoubleToFloat() const;
 	void Step_TcFloatThrough() const;
 
+	void Step_TcU8Morph() const;
+	void Step_TcU16Morph() const;
+	void Step_TcU16DoubleMorph() const;
+	void Step_TcFloatMorph() const;
+	void Step_TcU8PrescaleMorph() const;
+	void Step_TcU16PrescaleMorph() const;
+	void Step_TcU16DoublePrescaleMorph() const;
+	void Step_TcFloatPrescaleMorph() const;
+
 	void Step_ColorInvalid() const;
 	void Step_Color4444() const;
 	void Step_Color565() const;

From ff802a983a1f8e6483455f889c41be74c9610c67 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 16 Apr 2016 17:45:05 -0700
Subject: [PATCH 3/4] vertexjit: Implement x86 tc morph decoding.

---
 GPU/Common/VertexDecoderCommon.h |   8 ++
 GPU/Common/VertexDecoderX86.cpp  | 127 ++++++++++++++++++++++++++++++-
 2 files changed, 131 insertions(+), 4 deletions(-)

diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 0e30d5caff..9f9f141d5c 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -637,6 +637,14 @@ public:
 	void Jit_TcU16Prescale();
 	void Jit_TcFloatPrescale();
 
+	void Jit_TcAnyMorph(int bits);
+	void Jit_TcU8Morph();
+	void Jit_TcU16Morph();
+	void Jit_TcFloatMorph();
+	void Jit_TcU8PrescaleMorph();
+	void Jit_TcU16PrescaleMorph();
+	void Jit_TcFloatPrescaleMorph();
+
 	void Jit_TcU16Double();
 	void Jit_TcU16ThroughDouble();
 
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 93db752b7c..ab61a9871e 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -105,6 +105,13 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
+	{&VertexDecoder::Step_TcU8Morph, &VertexDecoderJitCache::Jit_TcU8Morph},
+	{&VertexDecoder::Step_TcU16Morph, &VertexDecoderJitCache::Jit_TcU16Morph},
+	{&VertexDecoder::Step_TcFloatMorph, &VertexDecoderJitCache::Jit_TcFloatMorph},
+	{&VertexDecoder::Step_TcU8PrescaleMorph, &VertexDecoderJitCache::Jit_TcU8PrescaleMorph},
+	{&VertexDecoder::Step_TcU16PrescaleMorph, &VertexDecoderJitCache::Jit_TcU16PrescaleMorph},
+	{&VertexDecoder::Step_TcFloatPrescaleMorph, &VertexDecoderJitCache::Jit_TcFloatPrescaleMorph},
+
 	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
 	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
@@ -185,6 +192,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
 			prescaleStep = true;
 		}
+		if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
+			dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
+			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
+			prescaleStep = true;
+		}
 	}
 
 	// Add code to convert matrices to 4x4.
@@ -747,6 +759,105 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() {
 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
 }
 
+void VertexDecoderJitCache::Jit_TcAnyMorph(int bits) {  // bits: 32 is handled as two floats, 8 as two u8, anything else as two u16.
+	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));  // tempReg1 = address of the first morph weight.
+	if (!cpu_info.bSSE4_1) {
+		PXOR(fpScratchReg4, R(fpScratchReg4));  // Zero source for the PUNPCKL* widening below; the SSE4.1 path uses PMOVZX* instead.
+	}
+
+	bool first = true;
+	for (int n = 0; n < dec_->morphcount; ++n) {
+		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;  // Frame 0 lands directly in the accumulator (fpScratchReg).
+		const OpArg src = MDisp(srcReg, dec_->onesize_ * n + dec_->tcoff);  // Texcoords of frame n.
+
+		// Load the actual values and convert to float.
+		if (bits == 32) {
+			// Two floats: just load as a MOVQ.
+			MOVQ_xmm(reg, src);
+		} else {
+			if (bits == 8) {
+				MOVZX(32, 16, tempReg2, src);  // 16 bits = both u8 coords, zero-extended to 32.
+				MOVD_xmm(reg, R(tempReg2));
+			} else {
+				MOVD_xmm(reg, src);  // 32 bits = both u16 coords.
+			}
+			if (cpu_info.bSSE4_1) {
+				if (bits == 8) {
+					PMOVZXBD(reg, R(reg));  // Zero-extend bytes to dwords.
+				} else {
+					PMOVZXWD(reg, R(reg));  // Zero-extend words to dwords.
+				}
+			} else {
+				if (bits == 8) {
+					PUNPCKLBW(reg, R(fpScratchReg4));  // Bytes -> words by interleaving with zero.
+				}
+				PUNPCKLWD(reg, R(fpScratchReg4));  // Words -> dwords by interleaving with zero.
+			}
+
+			CVTDQ2PS(reg, R(reg));  // Packed int32 -> packed float.
+		}
+
+		// And now scale by the weight.
+		MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float)));
+		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));  // Broadcast the weight to all four lanes.
+		MULPS(reg, R(fpScratchReg3));
+
+		if (!first) {
+			ADDPS(fpScratchReg, R(fpScratchReg2));  // Accumulate this frame into the running sum.
+		} else {
+			first = false;
+		}
+	}
+}
+
+void VertexDecoderJitCache::Jit_TcU8Morph() {
+	Jit_TcAnyMorph(8);
+	// They were all added (weighted) pre-normalize, we normalize once here.
+	MULPS(fpScratchReg, M(&by128));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
+void VertexDecoderJitCache::Jit_TcU16Morph() {
+	Jit_TcAnyMorph(16);
+	// They were all added (weighted) pre-normalize, we normalize once here.
+	MULPS(fpScratchReg, M(&by32768));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
+void VertexDecoderJitCache::Jit_TcFloatMorph() {
+	Jit_TcAnyMorph(32);  // Weighted sum of the float texcoords ends up in fpScratchReg.
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);  // Store the low two floats at the decoded UV offset.
+}
+
+void VertexDecoderJitCache::Jit_TcU8PrescaleMorph() {
+	Jit_TcAnyMorph(8);  // Weighted sum of the raw u8 texcoords ends up in fpScratchReg.
+	// The scale takes into account the u8 normalization.
+	MULPS(fpScratchReg, R(fpScaleOffsetReg));
+	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));  // Swap the 64-bit halves (presumably scale low, offset high - confirm in Compile).
+	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
+	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));  // Swap again to restore the register's original layout.
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);  // Store the low two floats at the decoded UV offset.
+}
+
+void VertexDecoderJitCache::Jit_TcU16PrescaleMorph() {
+	Jit_TcAnyMorph(16);  // Weighted sum of the raw u16 texcoords ends up in fpScratchReg.
+	// The scale takes into account the u16 normalization.
+	MULPS(fpScratchReg, R(fpScaleOffsetReg));
+	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));  // Swap the 64-bit halves (presumably scale low, offset high - confirm in Compile).
+	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
+	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));  // Swap again to restore the register's original layout.
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);  // Store the low two floats at the decoded UV offset.
+}
+
+void VertexDecoderJitCache::Jit_TcFloatPrescaleMorph() {
+	Jit_TcAnyMorph(32);  // Weighted sum of the float texcoords ends up in fpScratchReg.
+	MULPS(fpScratchReg, R(fpScaleOffsetReg));
+	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));  // Swap the 64-bit halves (presumably scale low, offset high - confirm in Compile).
+	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
+	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));  // Swap again to restore the register's original layout.
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);  // Store the low two floats at the decoded UV offset.
+}
+
 void VertexDecoderJitCache::Jit_TcU16Through() {
 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
@@ -960,7 +1071,9 @@ void VertexDecoderJitCache::Jit_Color5551() {
 
 void VertexDecoderJitCache::Jit_Color8888Morph() {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
-	PXOR(fpScratchReg4, R(fpScratchReg4));
+	if (!cpu_info.bSSE4_1) {
+		PXOR(fpScratchReg4, R(fpScratchReg4));
+	}
 
 	bool first = true;
 	for (int n = 0; n < dec_->morphcount; ++n) {
@@ -994,7 +1107,9 @@ static const float MEMORY_ALIGNED16(byColor4444[4]) = { 255.0f / 15.0f, 255.0f /
 
 void VertexDecoderJitCache::Jit_Color4444Morph() {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
-	PXOR(fpScratchReg4, R(fpScratchReg4));
+	if (!cpu_info.bSSE4_1) {
+		PXOR(fpScratchReg4, R(fpScratchReg4));
+	}
 	MOVDQA(XMM5, M(color4444mask));
 	MOVAPS(XMM6, M(byColor4444));
 
@@ -1376,7 +1491,9 @@ void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff, u32 bits) {
 
 void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
-	PXOR(fpScratchReg4, R(fpScratchReg4));
+	if (!cpu_info.bSSE4_1) {
+		PXOR(fpScratchReg4, R(fpScratchReg4));
+	}
 	MOVAPS(XMM5, M(by128));
 
 	// Sum into fpScratchReg.
@@ -1414,7 +1531,9 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 
 void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
-	PXOR(fpScratchReg4, R(fpScratchReg4));
+	if (!cpu_info.bSSE4_1) {
+		PXOR(fpScratchReg4, R(fpScratchReg4));
+	}
 	MOVAPS(XMM5, M(by32768));
 
 	// Sum into fpScratchReg.

From ebce8d275378c7f75f2f53d3ac7c20a56f47b075 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 16 Apr 2016 19:00:34 -0700
Subject: [PATCH 4/4] Don't convert to float with prescale off.

Since we assume we need to normalize, it seems.
---
 GPU/Common/VertexDecoderCommon.cpp | 74 +++++++++++++++++++++++++++---
 GPU/Common/VertexDecoderCommon.h   |  7 ++-
 GPU/Common/VertexDecoderX86.cpp    |  8 ++--
 3 files changed, 76 insertions(+), 13 deletions(-)

diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index 6f6a0dc9d2..0d58cde4ad 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -405,6 +405,51 @@ void VertexDecoder::Step_TcFloatPrescale() const {
 }
 
 void VertexDecoder::Step_TcU8Morph() const {
+	// Blend the raw u8 texcoords of all morph frames; the output stays u8 (truncated, not normalized).
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const u8 *src = (const u8 *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * weight;
+		sumV += (float)src[1] * weight;
+	}
+
+	u8 *dst = decoded_ + decFmt.uvoff;
+	dst[0] = (int)sumU;
+	dst[1] = (int)sumV;
+}
+
+// Blends the raw u16 texcoords of all morph frames; the output stays u16 (truncated, not normalized).
+void VertexDecoder::Step_TcU16Morph() const {
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const u16_le *src = (const u16_le *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * weight;
+		sumV += (float)src[1] * weight;
+	}
+
+	u16_le *dst = (u16_le *)(decoded_ + decFmt.uvoff);
+	dst[0] = (int)sumU;
+	dst[1] = (int)sumV;
+}
+
+// Same blend as Step_TcU16Morph, but the blended value is doubled before being stored as u16.
+void VertexDecoder::Step_TcU16DoubleMorph() const {
+	float sumU = 0.0f, sumV = 0.0f;
+	for (int frame = 0; frame < morphcount; ++frame) {
+		const float weight = gstate_c.morphWeights[frame];
+		const u16_le *src = (const u16_le *)(ptr_ + onesize_ * frame + tcoff);
+		sumU += (float)src[0] * weight;
+		sumV += (float)src[1] * weight;
+	}
+
+	u16_le *dst = (u16_le *)(decoded_ + decFmt.uvoff);
+	dst[0] = (int)(sumU * 2.0f);
+	dst[1] = (int)(sumV * 2.0f);
+}
+
+void VertexDecoder::Step_TcU8MorphToFloat() const {
 	float uv[2] = { 0, 0 };
 	for (int n = 0; n < morphcount; n++) {
 		float w = gstate_c.morphWeights[n];
@@ -419,7 +464,7 @@ void VertexDecoder::Step_TcU8Morph() const {
 	out[1] = uv[1];
 }
 
-void VertexDecoder::Step_TcU16Morph() const {
+void VertexDecoder::Step_TcU16MorphToFloat() const {
 	float uv[2] = { 0, 0 };
 	for (int n = 0; n < morphcount; n++) {
 		float w = gstate_c.morphWeights[n];
@@ -434,7 +479,7 @@ void VertexDecoder::Step_TcU16Morph() const {
 	out[1] = uv[1];
 }
 
-void VertexDecoder::Step_TcU16DoubleMorph() const {
+void VertexDecoder::Step_TcU16DoubleMorphToFloat() const {
 	float uv[2] = { 0, 0 };
 	for (int n = 0; n < morphcount; n++) {
 		float w = gstate_c.morphWeights[n];
@@ -914,6 +959,20 @@ static const StepFunction tcstep_morph_remaster[4] = {
 	&VertexDecoder::Step_TcFloatMorph,
 };
 
+static const StepFunction tcstep_morphToFloat[4] = {  // Morph steps used when options.expandAllUVtoFloat is set; indexed by tc.
+	0,  // Index 0: no step function.
+	&VertexDecoder::Step_TcU8MorphToFloat,
+	&VertexDecoder::Step_TcU16MorphToFloat,
+	&VertexDecoder::Step_TcFloatMorph,  // Float input shares the plain float morph step.
+};
+
+static const StepFunction tcstep_morph_remasterToFloat[4] = {  // Chosen instead of tcstep_morphToFloat when g_DoubleTextureCoordinates is set.
+	0,  // Index 0: no step function.
+	&VertexDecoder::Step_TcU8MorphToFloat,
+	&VertexDecoder::Step_TcU16DoubleMorphToFloat,  // Only the u16 entry differs from tcstep_morphToFloat.
+	&VertexDecoder::Step_TcFloatMorph,  // Float input shares the plain float morph step.
+};
+
 static const StepFunction tcstep_through[4] = {
 	0,
 	&VertexDecoder::Step_TcU8,
@@ -1120,18 +1179,19 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 			else
 				steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale[tc] : tcstep_prescale_morph[tc];
 			decFmt.uvfmt = DEC_FLOAT_2;
-		} else if (morphcount != 1 && !throughmode) {
-			steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remaster[tc] : tcstep_morph[tc];
-			decFmt.uvfmt = DEC_FLOAT_2;
 		} else {
 			if (options.expandAllUVtoFloat) {
-				if (g_DoubleTextureCoordinates)
+				if (morphcount != 1 && !throughmode)
+					steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remasterToFloat[tc] : tcstep_morphToFloat[tc];
+				else if (g_DoubleTextureCoordinates)
 					steps_[numSteps_++] = throughmode ? tcstep_through_remasterToFloat[tc] : tcstep_remasterToFloat[tc];
 				else
 					steps_[numSteps_++] = throughmode ? tcstep_throughToFloat[tc] : tcstepToFloat[tc];
 				decFmt.uvfmt = DEC_FLOAT_2;
 			} else {
-				if (g_DoubleTextureCoordinates)
+				if (morphcount != 1 && !throughmode)
+					steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remaster[tc] : tcstep_morph[tc];
+				else if (g_DoubleTextureCoordinates)
 					steps_[numSteps_++] = throughmode ? tcstep_through_remaster[tc] : tcstep_remaster[tc];
 				else
 					steps_[numSteps_++] = throughmode ? tcstep_through[tc] : tcstep[tc];
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 9f9f141d5c..6163951ecb 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -498,6 +498,9 @@ public:
 	void Step_TcU8Morph() const;
 	void Step_TcU16Morph() const;
 	void Step_TcU16DoubleMorph() const;
+	void Step_TcU8MorphToFloat() const;
+	void Step_TcU16MorphToFloat() const;
+	void Step_TcU16DoubleMorphToFloat() const;
 	void Step_TcFloatMorph() const;
 	void Step_TcU8PrescaleMorph() const;
 	void Step_TcU16PrescaleMorph() const;
@@ -638,8 +641,8 @@ public:
 	void Jit_TcFloatPrescale();
 
 	void Jit_TcAnyMorph(int bits);
-	void Jit_TcU8Morph();
-	void Jit_TcU16Morph();
+	void Jit_TcU8MorphToFloat();
+	void Jit_TcU16MorphToFloat();
 	void Jit_TcFloatMorph();
 	void Jit_TcU8PrescaleMorph();
 	void Jit_TcU16PrescaleMorph();
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index ab61a9871e..1f67637ef7 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -105,8 +105,8 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
-	{&VertexDecoder::Step_TcU8Morph, &VertexDecoderJitCache::Jit_TcU8Morph},
-	{&VertexDecoder::Step_TcU16Morph, &VertexDecoderJitCache::Jit_TcU16Morph},
+	{&VertexDecoder::Step_TcU8MorphToFloat, &VertexDecoderJitCache::Jit_TcU8MorphToFloat},
+	{&VertexDecoder::Step_TcU16MorphToFloat, &VertexDecoderJitCache::Jit_TcU16MorphToFloat},
 	{&VertexDecoder::Step_TcFloatMorph, &VertexDecoderJitCache::Jit_TcFloatMorph},
 	{&VertexDecoder::Step_TcU8PrescaleMorph, &VertexDecoderJitCache::Jit_TcU8PrescaleMorph},
 	{&VertexDecoder::Step_TcU16PrescaleMorph, &VertexDecoderJitCache::Jit_TcU16PrescaleMorph},
@@ -810,14 +810,14 @@ void VertexDecoderJitCache::Jit_TcAnyMorph(int bits) {
 	}
 }
 
-void VertexDecoderJitCache::Jit_TcU8Morph() {
+void VertexDecoderJitCache::Jit_TcU8MorphToFloat() {
 	Jit_TcAnyMorph(8);
 	// They were all added (weighted) pre-normalize, we normalize once here.
 	MULPS(fpScratchReg, M(&by128));
 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
 }
 
-void VertexDecoderJitCache::Jit_TcU16Morph() {
+void VertexDecoderJitCache::Jit_TcU16MorphToFloat() {
 	Jit_TcAnyMorph(16);
 	// They were all added (weighted) pre-normalize, we normalize once here.
 	MULPS(fpScratchReg, M(&by32768));