diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index fa6b417bb3..4e8d76afab 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -160,6 +160,30 @@ void VertexDecoder::Step_WeightsU16() const
 		wt[j++] = 0;
 }
 
+// Converts the vertex's nweights 8-bit weights to floats (scaled by 1/128) at decFmt.w0off.
+void VertexDecoder::Step_WeightsU8ToFloat() const {
+	const u8 *src = (const u8 *)(ptr_);
+	float *dst = (float *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = (float)src[n] * (1.0f / 128.0f);
+		n++;
+	}
+	for (; n & 3; n++) dst[n] = 0;  // Zero-fill the slots up to the next multiple of 4.
+}
+
+// Converts the vertex's nweights 16-bit weights to floats (scaled by 1/32768) at decFmt.w0off.
+void VertexDecoder::Step_WeightsU16ToFloat() const {
+	const u16 *src = (const u16 *)(ptr_);
+	float *dst = (float *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = (float)src[n] * (1.0f / 32768.0f);
+		n++;
+	}
+	for (; n & 3; n++) dst[n] = 0;  // Zero-fill the slots up to the next multiple of 4.
+}
+
 // Float weights should be uncommon, we can live with having to multiply these by 2.0
 // to avoid special checks in the vertex shader generator.
 // (PSP uses 0.0-2.0 fixed point numbers for weights)
@@ -459,6 +483,15 @@ void VertexDecoder::Step_NormalS8() const
 	normal[3] = 0;
 }
 
+// Expands the three signed 8-bit normal components to floats, scaling each by 1/128.
+void VertexDecoder::Step_NormalS8ToFloat() const {
+	const s8 *src = (const s8 *)(ptr_ + nrmoff);
+	float *dst = (float *)(decoded_ + decFmt.nrmoff);
+	const float scale = 1.0f / 128.0f;
+	for (int axis = 0; axis < 3; axis++)
+		dst[axis] = src[axis] * scale;
+}
+
 void VertexDecoder::Step_NormalS16() const
 {
 	s16 *normal = (s16 *)(decoded_ + decFmt.nrmoff);
@@ -649,6 +682,13 @@ static const StepFunction wtstep[4] = {
 	&VertexDecoder::Step_WeightsFloat,
 };
 
+static const StepFunction wtstepToFloat[4] = {  // Indexed by weighttype; chosen by SetVertexType when options.expandAllWeightsToFloat is set.
+	0,  // No decode step for index 0.
+	&VertexDecoder::Step_WeightsU8ToFloat,
+	&VertexDecoder::Step_WeightsU16ToFloat,
+	&VertexDecoder::Step_WeightsFloat,  // Already float: same step the non-expanding wtstep table ends with.
+};
+
 static const StepFunction wtstep_skin[4] = {
 	0,
 	&VertexDecoder::Step_WeightsU8Skin,
@@ -746,6 +786,13 @@ static const StepFunction nrmstep[4] = {
 	&VertexDecoder::Step_NormalFloat,
 };
 
+static const StepFunction nrmstep8BitToFloat[4] = {  // Indexed by nrm; SetVertexType only consults it for 8-bit normals when options.expand8BitNormalsToFloat is set.
+	0,  // No decode step for index 0.
+	&VertexDecoder::Step_NormalS8ToFloat,  // The 8-bit entry widens to float.
+	&VertexDecoder::Step_NormalS16,
+	&VertexDecoder::Step_NormalFloat,
+};
+
 static const StepFunction nrmstep_skin[4] = {
 	0,
 	&VertexDecoder::Step_NormalS8Skin,
@@ -825,17 +872,21 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 
 		if (skinInDecode) {
 			steps_[numSteps_++] = wtstep_skin[weighttype];
-			// No visible output
+			// No visible output, passed in register/external memory to the "pos" step.
 		} else {
-			steps_[numSteps_++] = wtstep[weighttype];
-
 			int fmtBase = DEC_FLOAT_1;
-			if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) {
-				fmtBase = DEC_U8_1;
-			} else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) {
-				fmtBase = DEC_U16_1;
-			} else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) {
+			if (options.expandAllWeightsToFloat) {
+				steps_[numSteps_++] = wtstepToFloat[weighttype];
 				fmtBase = DEC_FLOAT_1;
+			} else {
+				steps_[numSteps_++] = wtstep[weighttype];
+				if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) {
+					fmtBase = DEC_U8_1;
+				} else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) {
+					fmtBase = DEC_U16_1;
+				} else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) {
+					fmtBase = DEC_FLOAT_1;
+				}
 			}
 
 			int numWeights = TranslateNumBones(nweights);
@@ -927,14 +978,28 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 			// After skinning, we always have three floats.
 			decFmt.nrmfmt = DEC_FLOAT_3;
 		} else {
-			steps_[numSteps_++] = morphcount == 1 ? nrmstep[nrm] : nrmstep_morph[nrm];
-
 			if (morphcount == 1) {
-				// The normal formats match the gl formats perfectly, let's use 'em.
+				// The 8-bit and 16-bit normal formats match GL formats nicely, and the 16-bit normal format matches a D3D format so let's use them where possible.
 				switch (nrm) {
-				case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S8_3; break;
-				case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break;
-				case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break;
+				case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT:
+					if (options.expand8BitNormalsToFloat) {
+						decFmt.nrmfmt = DEC_FLOAT_3;
+						steps_[numSteps_++] = nrmstep8BitToFloat[nrm];
+					} else {
+						decFmt.nrmfmt = DEC_S8_3;
+						steps_[numSteps_++] = nrmstep[nrm];
+					}
+					break;
+				case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT:
+					decFmt.nrmfmt = DEC_S16_3;
+					steps_[numSteps_++] = nrmstep[nrm];
+					break;
+				case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT:
+					decFmt.nrmfmt = DEC_FLOAT_3;
+					steps_[numSteps_++] = nrmstep[nrm];
+					break;
 				}
 			} else {
+				// Morph path: every branch pushes its own normal step, so the morph step must be pushed here too.
+				steps_[numSteps_++] = nrmstep_morph[nrm];
 				decFmt.nrmfmt = DEC_FLOAT_3;
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index d07ba5e602..f06d40681f 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -435,6 +435,8 @@ typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
 
 struct VertexDecoderOptions {
 	bool expandAllUVtoFloat;
+	bool expandAllWeightsToFloat;  // When set, SetVertexType uses the wtstepToFloat steps and keeps the weight format base at DEC_FLOAT_1 for every weight type.
+	bool expand8BitNormalsToFloat;  // When set, 8-bit normals on the morphcount == 1 path are decoded to DEC_FLOAT_3 instead of DEC_S8_3.
 };
 
 class VertexDecoder
@@ -457,6 +459,8 @@ public:
 
 	void Step_WeightsU8() const;
 	void Step_WeightsU16() const;
+	void Step_WeightsU8ToFloat() const;
+	void Step_WeightsU16ToFloat() const;
 	void Step_WeightsFloat() const;
 
 	void Step_WeightsU8Skin() const;
@@ -492,6 +496,7 @@ public:
 	void Step_Color8888Morph() const;
 
 	void Step_NormalS8() const;
+	void Step_NormalS8ToFloat() const;
 	void Step_NormalS16() const;
 	void Step_NormalFloat() const;
 
@@ -627,6 +632,7 @@ public:
 	void Jit_Color5551();
 
 	void Jit_NormalS8();
+	void Jit_NormalS8ToFloat();
 	void Jit_NormalS16();
 	void Jit_NormalFloat();
 
@@ -635,6 +641,7 @@ public:
 	void Jit_NormalFloatSkin();
 
 	void Jit_PosS8();
+	void Jit_PosS8ToFloat();  // NOTE(review): no definition or jitLookup entry for this is visible alongside the declaration — confirm one exists.
 	void Jit_PosS16();
 	void Jit_PosFloat();
 	void Jit_PosS8Through();
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 28454c061e..8dbeed6e21 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -109,6 +109,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
+	{&VertexDecoder::Step_NormalS8ToFloat, &VertexDecoderJitCache::Jit_NormalS8ToFloat},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
 	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
 
@@ -956,6 +957,11 @@ void VertexDecoderJitCache::Jit_NormalS8() {
 	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
 }
 
+void VertexDecoderJitCache::Jit_NormalS8ToFloat() {  // JIT counterpart of Step_NormalS8ToFloat (paired with it in jitLookup).
+	Jit_AnyS8ToFloat(dec_->nrmoff);  // Presumably leaves the converted floats in XMM3 — helper not shown here; confirm.
+	MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM3);  // NOTE(review): full 16-byte store, so 4 bytes past the DEC_FLOAT_3 normal are written too — confirm that is harmless.
+}
+
 // Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_NormalS16() {
 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
diff --git a/GPU/Directx9/TransformPipelineDX9.cpp b/GPU/Directx9/TransformPipelineDX9.cpp
index dac3894695..877c9667ee 100644
--- a/GPU/Directx9/TransformPipelineDX9.cpp
+++ b/GPU/Directx9/TransformPipelineDX9.cpp
@@ -147,6 +147,8 @@ TransformDrawEngineDX9::TransformDrawEngineDX9()
 
 	memset(&decOptions_, 0, sizeof(decOptions_));
 	decOptions_.expandAllUVtoFloat = true;
+	decOptions_.expandAllWeightsToFloat = true;
+	decOptions_.expand8BitNormalsToFloat = true;
 
 	decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL;
 	// Allocate nicely aligned memory. Maybe graphics drivers will
diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
index 288d7a6c7f..278474cd3c 100644
--- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
@@ -312,9 +312,6 @@ void GenerateVertexShaderDX9(int prim, char *buffer, bool useHWTransform) {
 		} else {
 			int numWeights = TranslateNumBonesDX9(vertTypeGetNumBoneWeights(vertType));
 
-			static const char *rescale[4] = {"", " * 1.9921875", " * 1.999969482421875", ""}; // 2*127.5f/128.f, 2*32767.5f/32768.f, 1.0f};
-			const char *factor = rescale[vertTypeGetWeightMask(vertType) >> GE_VTYPE_WEIGHT_SHIFT];
-
 			static const char * const boneWeightAttr[8] = {
 				"a_w1.x", "a_w1.y", "a_w1.z", "a_w1.w",
 				"a_w2.x", "a_w2.y", "a_w2.z", "a_w2.w",
@@ -377,11 +374,11 @@ void GenerateVertexShaderDX9(int prim, char *buffer, bool useHWTransform) {
 			WRITE(p, ";\n");
 
 			// Trying to simplify this results in bugs in LBP...
-			WRITE(p, "  float3 skinnedpos = mul(float4(In.position.xyz, 1.0), skinMatrix).xyz %s;\n", factor);
+			WRITE(p, "  float3 skinnedpos = mul(float4(In.position.xyz, 1.0), skinMatrix).xyz;\n");
 			WRITE(p, "  float3 worldpos = mul(float4(skinnedpos, 1.0), u_world).xyz;\n");
 
 			if (hasNormal) {
-				WRITE(p, "  float3 skinnednormal = mul(float4(%sIn.normal, 0.0), skinMatrix).xyz %s;\n", flipNormal ? "-" : "", factor);
+				WRITE(p, "  float3 skinnednormal = mul(float4(%sIn.normal, 0.0), skinMatrix).xyz;\n", flipNormal ? "-" : "");
 				WRITE(p, "  float3 worldnormal = normalize(mul(float4(skinnednormal, 0.0), u_world).xyz);\n");
 			} else {
 				WRITE(p, "  float3 worldnormal = mul( mul( float4(0.0, 0.0, 1.0, 0.0), skinMatrix), u_world).xyz;\n");