Force positions to scale by 1/128 and 1/32768, as the PSP does.

This makes everything use floats for positions.  On some hardware/drivers this
may be faster; on others it may be slower.  We'll need testing to see the
performance impact.

Fixes Final Fantasy 4's position misalignments, and probably others (Tekken 5,
I suspect).
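
For context, the change amounts to dividing signed positions by 128 and 32768 (powers of two) instead of 127 and 32767, which matches the PSP and makes inputs like 64 or 16384 decode to exactly 0.5. Below is a minimal standalone C++ sketch of that scaling, not part of this commit; the helper names are hypothetical.

#include <cstdint>
#include <cstdio>

// Hypothetical helpers (names are ours, not PPSSPP's) showing the new scale factors.
static float DecodePosS8(int8_t v)   { return v * (1.0f / 128.0f); }    // previously 1/127
static float DecodePosS16(int16_t v) { return v * (1.0f / 32768.0f); }  // previously 1/32767

int main() {
    // With 1/128 and 1/32768, power-of-two inputs decode exactly:
    // 64/128 = 0.5 and 16384/32768 = 0.5, where 64/127 was ~0.50394.
    std::printf("%f %f\n", DecodePosS8(64), DecodePosS16(16384));
    return 0;
}

The same pair of constants (by128, by32768) is what the ARM and x86 jit paths below now load in place of by127 and by32767.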
Unknown W. Brackets 2014-08-17 12:38:21 -07:00
parent 78ddffee2c
commit 385df1c54e
4 changed files with 119 additions and 83 deletions


@ -118,7 +118,7 @@ public:
pos[2] = u[2] * (1.0f / 65535.0f);
} else {
for (int i = 0; i < 3; i++)
pos[i] = s[i] * (1.f / 32767.f);
pos[i] = s[i] * (1.0f / 32768.0f);
}
}
break;
@ -130,10 +130,10 @@ public:
if (isThrough()) {
for (int i = 0; i < 2; i++)
pos[i] = b[i];
pos[2] = u[2] / 255.0f;
pos[2] = u[2] * (1.0f / 255.0f);
} else {
for (int i = 0; i < 3; i++)
pos[i] = b[i] * (1.f / 127.f);
pos[i] = b[i] * (1.0f / 128.0f);
}
}
break;
@ -168,7 +168,7 @@ public:
pos[2] = u[2];
} else {
for (int i = 0; i < 3; i++)
pos[i] = s[i] * (1.f / 32767.f);
pos[i] = s[i] * (1.0f / 32768.0f);
// TODO: Does depth need conversion?
}
}
@ -184,7 +184,7 @@ public:
pos[2] = u[2];
} else {
for (int i = 0; i < 3; i++)
pos[i] = b[i] * (1.f / 127.f);
pos[i] = b[i] * (1.0f / 128.0f);
// TODO: Does depth need conversion?
}
}
@ -203,7 +203,7 @@ public:
{
const float *f = (const float *)(data_ + decFmt_.nrmoff);
for (int i = 0; i < 3; i++)
nrm[i] = f[i] ;
nrm[i] = f[i];
}
break;
case DEC_S16_3:


@ -93,7 +93,7 @@ void VertexDecoder::Step_WeightsU8Skin() const
for (int j = 0; j < nweights; j++) {
const float *bone = &gstate.boneMatrix[j * 12];
if (wdata[j] != 0) {
float weight = wdata[j] / 128.0f;
float weight = wdata[j] * (1.0f / 128.0f);
for (int i = 0; i < 12; i++) {
skinMatrix[i] += weight * bone[i];
}
@ -109,7 +109,7 @@ void VertexDecoder::Step_WeightsU16Skin() const
for (int j = 0; j < nweights; j++) {
const float *bone = &gstate.boneMatrix[j * 12];
if (wdata[j] != 0) {
float weight = wdata[j] / 32768.0f;
float weight = wdata[j] * (1.0f / 32768.0f);
for (int i = 0; i < 12; i++) {
skinMatrix[i] += weight * bone[i];
}
@ -354,7 +354,7 @@ void VertexDecoder::Step_NormalS8Skin() const
{
float *normal = (float *)(decoded_ + decFmt.nrmoff);
const s8 *sv = (const s8*)(ptr_ + nrmoff);
const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
Norm3ByMatrix43(normal, fn, skinMatrix);
}
@ -362,7 +362,7 @@ void VertexDecoder::Step_NormalS16Skin() const
{
float *normal = (float *)(decoded_ + decFmt.nrmoff);
const s16 *sv = (const s16*)(ptr_ + nrmoff);
const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
Norm3ByMatrix43(normal, fn, skinMatrix);
}
@ -380,7 +380,7 @@ void VertexDecoder::Step_NormalS8Morph() const
for (int n = 0; n < morphcount; n++)
{
const s8 *bv = (const s8*)(ptr_ + onesize_*n + nrmoff);
float multiplier = gstate_c.morphWeights[n] * (1.0f/127.0f);
const float multiplier = gstate_c.morphWeights[n] * (1.0f / 128.0f);
for (int j = 0; j < 3; j++)
normal[j] += bv[j] * multiplier;
}
@ -392,8 +392,8 @@ void VertexDecoder::Step_NormalS16Morph() const
memset(normal, 0, sizeof(float)*3);
for (int n = 0; n < morphcount; n++)
{
float multiplier = gstate_c.morphWeights[n] * (1.0f/32767.0f);
const s16 *sv = (const s16 *)(ptr_ + onesize_*n + nrmoff);
const float multiplier = gstate_c.morphWeights[n] * (1.0f / 32768.0f);
for (int j = 0; j < 3; j++)
normal[j] += sv[j] * multiplier;
}
@ -414,20 +414,18 @@ void VertexDecoder::Step_NormalFloatMorph() const
void VertexDecoder::Step_PosS8() const
{
s8 *v = (s8 *)(decoded_ + decFmt.posoff);
float *pos = (float *)(decoded_ + decFmt.posoff);
const s8 *sv = (const s8*)(ptr_ + posoff);
for (int j = 0; j < 3; j++)
v[j] = sv[j];
v[3] = 0;
pos[j] = sv[j] * (1.0f / 128.0f);
}
void VertexDecoder::Step_PosS16() const
{
s16 *v = (s16 *)(decoded_ + decFmt.posoff);
float *pos = (float *)(decoded_ + decFmt.posoff);
const s16 *sv = (const s16*)(ptr_ + posoff);
for (int j = 0; j < 3; j++)
v[j] = sv[j];
v[3] = 0;
pos[j] = sv[j] * (1.0f / 32768.0f);
}
void VertexDecoder::Step_PosFloat() const
@ -441,7 +439,7 @@ void VertexDecoder::Step_PosS8Skin() const
{
float *pos = (float *)(decoded_ + decFmt.posoff);
const s8 *sv = (const s8*)(ptr_ + posoff);
const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
Vec3ByMatrix43(pos, fn, skinMatrix);
}
@ -449,7 +447,7 @@ void VertexDecoder::Step_PosS16Skin() const
{
float *pos = (float *)(decoded_ + decFmt.posoff);
const s16 *sv = (const s16*)(ptr_ + posoff);
const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
Vec3ByMatrix43(pos, fn, skinMatrix);
}
@ -491,7 +489,7 @@ void VertexDecoder::Step_PosS8Morph() const
float *v = (float *)(decoded_ + decFmt.posoff);
memset(v, 0, sizeof(float) * 3);
for (int n = 0; n < morphcount; n++) {
float multiplier = 1.0f / 127.0f;
const float multiplier = 1.0f / 128.0f;
const s8 *sv = (const s8*)(ptr_ + onesize_*n + posoff);
for (int j = 0; j < 3; j++)
v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
@ -503,7 +501,7 @@ void VertexDecoder::Step_PosS16Morph() const
float *v = (float *)(decoded_ + decFmt.posoff);
memset(v, 0, sizeof(float) * 3);
for (int n = 0; n < morphcount; n++) {
float multiplier = 1.0f / 32767.0f;
const float multiplier = 1.0f / 32768.0f;
const s16 *sv = (const s16*)(ptr_ + onesize_*n + posoff);
for (int j = 0; j < 3; j++)
v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
@ -806,18 +804,7 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) {
decFmt.posfmt = DEC_FLOAT_3;
} else {
steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos];
if (morphcount == 1) {
// The non-through-mode position formats match the gl formats perfectly, let's use 'em.
switch (pos) {
case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break;
case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break;
case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break;
}
} else {
// Actually, temporarily let's not.
decFmt.posfmt = DEC_FLOAT_3;
}
decFmt.posfmt = DEC_FLOAT_3;
}
}
decFmt.posoff = decOff;


@ -50,10 +50,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f};
// TODO: Maybe load all morph weights to Q6+ to avoid memory access?
static const float by127 = 1.0f / 127.0f;
static const float by128 = 1.0f / 128.0f;
static const float by256 = 1.0f / 256.0f;
static const float by32767 = 1.0f / 32767.0f;
static const float by32768 = 1.0f / 32768.0f;
using namespace ArmGen;
@ -1147,8 +1144,8 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
// TODO: SIMD
LDRSB(tempReg1, srcReg, dec_->posoff);
@ -1173,8 +1170,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
// TODO: SIMD
LDRSH(tempReg1, srcReg, dec_->posoff);
@ -1197,24 +1194,69 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
}
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_PosS8() {
LDRB(tempReg1, srcReg, dec_->posoff);
LDRB(tempReg2, srcReg, dec_->posoff + 1);
LDRB(tempReg3, srcReg, dec_->posoff + 2);
ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
STR(tempReg1, dstReg, dec_->decFmt.posoff);
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->posoff);
VMOV_neon(F_32, Q3, by128);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, srcNEON, neonScratchReg, Q3);
} else {
LDRSB(tempReg1, srcReg, dec_->posoff);
LDRSB(tempReg2, srcReg, dec_->posoff + 1);
LDRSB(tempReg3, srcReg, dec_->posoff + 2);
VMOV(src[0], tempReg1);
VMOV(src[1], tempReg2);
VMOV(src[2], tempReg3);
MOVI2F(S15, by128, scratchReg);
VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
VMUL(src[0], src[0], S15);
VMUL(src[1], src[1], S15);
VMUL(src[2], src[2], S15);
}
ADD(scratchReg, dstReg, dec_->decFmt.posoff);
if (NEONSkinning) {
VST1(F_32, srcNEON, scratchReg, 2);
} else {
VSTMIA(scratchReg, false, src[0], 3);
}
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_PosS16() {
LDRH(tempReg1, srcReg, dec_->posoff);
LDRH(tempReg2, srcReg, dec_->posoff + 2);
LDRH(tempReg3, srcReg, dec_->posoff + 4);
ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
STR(tempReg1, dstReg, dec_->decFmt.posoff);
STR(tempReg3, dstReg, dec_->decFmt.posoff + 4);
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->posoff);
VMOV_neon(F_32, Q3, by32768);
VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, srcNEON, neonScratchReg, Q3);
} else {
LDRSH(tempReg1, srcReg, dec_->posoff);
LDRSH(tempReg2, srcReg, dec_->posoff + 2);
LDRSH(tempReg3, srcReg, dec_->posoff + 4);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VMOV(fpScratchReg3, tempReg3);
MOVI2F(S15, by32768, scratchReg);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
VMUL(src[0], fpScratchReg, S15);
VMUL(src[1], fpScratchReg2, S15);
VMUL(src[2], fpScratchReg3, S15);
}
ADD(scratchReg, dstReg, dec_->decFmt.posoff);
if (NEONSkinning) {
VST1(F_32, srcNEON, scratchReg, 2);
} else {
VSTMIA(scratchReg, false, src[0], 3);
}
}
// Just copy 12 bytes.
@ -1304,8 +1346,8 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
}
VST1(F_32, accNEON, scratchReg, 2);
} else {
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
MOVP2R(tempReg1, skinMatrix);
VLDMIA(tempReg1, true, fpScratchReg, 3);
@ -1404,10 +1446,10 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
if (useNEON) {
MOVI2FR(scratchReg2, by127);
MOVI2FR(scratchReg2, by128);
VDUP(I_32, Q5, scratchReg2);
} else {
MOVI2F(S13, by127, scratchReg);
MOVI2F(S13, by128, scratchReg);
}
bool first = true;
@ -1474,10 +1516,10 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
if (useNEON) {
MOVI2FR(scratchReg, by32767);
MOVI2FR(scratchReg, by32768);
VDUP(I_32, Q5, scratchReg);
} else {
MOVI2F(S13, by32767, scratchReg);
MOVI2F(S13, by32768, scratchReg);
}
bool first = true;


@ -29,18 +29,9 @@ static float MEMORY_ALIGNED16(bones[16 * 8]);
using namespace Gen;
static const float MEMORY_ALIGNED16( by127[4] ) = {
1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f
};
static const float MEMORY_ALIGNED16( by128[4] ) = {
1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
};
static const float MEMORY_ALIGNED16( by256[4] ) = {
1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256
};
static const float MEMORY_ALIGNED16( by32767[4] ) = {
1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f,
};
static const float MEMORY_ALIGNED16( by32768[4] ) = {
1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
};
@ -1025,19 +1016,35 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 8), fpScratchReg);
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_PosS8() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
XORPS(XMM3, R(XMM3));
MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff));
if (cpu_info.bSSE4_1) {
PMOVSXBD(XMM1, R(XMM1));
} else {
PUNPCKLBW(XMM1, R(XMM3));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 24);
PSRAD(XMM1, 24);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by128));
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_PosS16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
XORPS(XMM3, R(XMM3));
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
if (cpu_info.bSSE4_1) {
PMOVSXWD(XMM1, R(XMM1));
} else {
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 16);
PSRAD(XMM1, 16);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by32768));
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
}
// Just copy 12 bytes.
@ -1090,7 +1097,7 @@ void VertexDecoderJitCache::Jit_PosFloatSkin() {
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
PXOR(fpScratchReg4, R(fpScratchReg4));
MOVAPS(XMM5, M(by127));
MOVAPS(XMM5, M(by128));
// Sum into fpScratchReg.
bool first = true;
@ -1108,7 +1115,7 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
}
CVTDQ2PS(reg, R(reg));
// Now, It's time to multiply by the weight and 1.0f/127.0f.
// Now, It's time to multiply by the weight and 1.0f/128.0f.
MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
MULSS(fpScratchReg3, R(XMM5));
SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
@ -1128,7 +1135,7 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
PXOR(fpScratchReg4, R(fpScratchReg4));
MOVAPS(XMM5, M(by32767));
MOVAPS(XMM5, M(by32768));
// Sum into fpScratchReg.
bool first = true;
@ -1145,7 +1152,7 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
}
CVTDQ2PS(reg, R(reg));
// Now, It's time to multiply by the weight and 1.0f/32767.0f.
// Now, It's time to multiply by the weight and 1.0f/32768.0f.
MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
MULSS(fpScratchReg3, R(XMM5));
SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));