From 3e6c2f0c7868c1dd496069cb2d04acd217a7fd78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Wed, 13 Nov 2013 17:10:44 +0100 Subject: [PATCH 01/12] Update native --- native | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native b/native index 483e42f64e..cf895f95d7 160000 --- a/native +++ b/native @@ -1 +1 @@ -Subproject commit 483e42f64e666ee47390b47e3405ad731d011f93 +Subproject commit cf895f95d7ae75d4535cf252687fd4f9c4f1663b From 7e67476b00da917c2bbf0a5d2433df5cc72487d7 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 13:18:52 +0100 Subject: [PATCH 02/12] Simple unoptimized software skinning. Does not take advantage of the possible reduction in state changes yet. --- Core/Config.cpp | 2 + Core/Config.h | 2 + GPU/GLES/GLES_GPU.cpp | 32 ++++- GPU/GLES/TransformPipeline.cpp | 8 +- GPU/GLES/VertexDecoder.cpp | 236 ++++++++++++++++++++++++++------- GPU/GLES/VertexDecoder.h | 12 ++ GPU/GPUState.cpp | 15 ++- GPU/GPUState.h | 4 +- UI/GameSettingsScreen.cpp | 5 +- 9 files changed, 257 insertions(+), 59 deletions(-) diff --git a/Core/Config.cpp b/Core/Config.cpp index 8e11eab12a..bc15c6c1f2 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -144,6 +144,7 @@ void Config::Load(const char *iniFileName, const char *controllerIniFilename) { graphics->Get("RenderingMode", &iRenderingMode, renderingModeDefault); graphics->Get("SoftwareRendering", &bSoftwareRendering, false); graphics->Get("HardwareTransform", &bHardwareTransform, true); + graphics->Get("SoftwareSkinning", &bSoftwareSkinning, false); graphics->Get("TextureFiltering", &iTexFiltering, 1); // Auto on Windows, 1x elsewhere. Maybe change to 2x on large screens? #ifdef _WIN32 @@ -401,6 +402,7 @@ void Config::Save() { graphics->Set("RenderingMode", iRenderingMode); graphics->Set("SoftwareRendering", bSoftwareRendering); graphics->Set("HardwareTransform", bHardwareTransform); + graphics->Set("SoftwareSkinning", bSoftwareSkinning); graphics->Set("TextureFiltering", iTexFiltering); graphics->Set("InternalResolution", iInternalResolution); graphics->Set("FrameSkip", iFrameSkip); diff --git a/Core/Config.h b/Core/Config.h index b5a4073b3f..2922d4c005 100644 --- a/Core/Config.h +++ b/Core/Config.h @@ -64,6 +64,8 @@ public: // GFX bool bSoftwareRendering; bool bHardwareTransform; // only used in the GLES backend + bool bSoftwareSkinning; // may speed up some games + int iRenderingMode; // 0 = non-buffered rendering 1 = buffered rendering 2 = Read Framebuffer to memory (CPU) 3 = Read Framebuffer to memory (GPU) int iTexFiltering; // 1 = off , 2 = nearest , 3 = linear , 4 = linear(CG) #ifdef BLACKBERRY diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index 66a5223254..19343dc1db 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -430,6 +430,12 @@ GLES_GPU::GLES_GPU() commandFlags_[GE_CMD_TEXOFFSETV] &= ~FLAG_FLUSHBEFOREONCHANGE; } + // TODO: Can't turn this optimization on until we don't decode everything in one go + // but instead decode for every draw call when sw skinning + if (g_Config.bSoftwareSkinning) { + // commandFlags_[GE_CMD_VERTEXTYPE] &= ~FLAG_FLUSHBEFOREONCHANGE; + } + BuildReportingInfo(); } @@ -868,8 +874,21 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { break; case GE_CMD_VERTEXTYPE: - if (diff) - shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + if (diff) { + if (!g_Config.bSoftwareSkinning) { + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + } else if (false) { + // TODO: Can't turn this optimization on until we don't decode 
everything in one go + // but instead decode for every draw call when sw skinning + if (diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) { + // Restore and flush + gstate.vertType ^= diff; + Flush(); + gstate.vertType ^= diff; + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + } + } + } break; case GE_CMD_REGION1: @@ -1375,9 +1394,14 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { int num = gstate.boneMatrixNumber & 0x7F; float newVal = getFloat24(data); if (num < 96 && newVal != gstate.boneMatrix[num]) { - Flush(); + // Bone matrices should NOT flush when software skinning is enabled! + // TODO: Also check for morph... + // TODO: Can't turn this optimizatoin on until we decode per drawcall when sw skinning. + if (true || !g_Config.bSoftwareSkinning) { + Flush(); + shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); + } gstate.boneMatrix[num] = newVal; - shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); } num++; gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F); diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index b6de946821..d1f70a69f7 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -524,8 +524,14 @@ void TransformDrawEngine::DoFlush() { int vertexCount = 0; int maxIndex = 0; bool useElements = true; + // Cannot cache vertex data with morph enabled. - if (g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK)) { + bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK); + // Also avoid caching when software skinning. + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) + useCache = false; + + if (useCache) { u32 id = ComputeFastDCID(); auto iter = vai_.find(id); VertexArrayInfo *vai; diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index abe37a98ac..6e848708a5 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -15,11 +15,10 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. -#include "math/lin/matrix4x4.h" - #include "Core/Config.h" #include "Core/MemMap.h" #include "GPU/ge_constants.h" +#include "GPU/Math3D.h" #include "VertexDecoder.h" #include "VertexShaderGenerator.h" @@ -36,6 +35,10 @@ static const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4}; static const u8 possize[4] = {0,3,6,12}, posalign[4] = {0,1,2,4}; static const u8 wtsize[4] = {0,1,2,4}, wtalign[4] = {0,1,2,4}; +// When software skinning. This should be stored in registers instead of memory +// when jitting. 
+float skinMatrix[12]; + inline int align(int n, int align) { return (n + (align - 1)) & ~(align - 1); } @@ -112,6 +115,57 @@ void VertexDecoder::Step_WeightsFloat() const wt[j++] = 0.0f; } +void VertexDecoder::Step_WeightsU8Skin() const +{ + memset(skinMatrix, 0, sizeof(skinMatrix)); + u8 *wt = (u8 *)(decoded_ + decFmt.w0off); + const u8 *wdata = (const u8*)(ptr_); + for (int j = 0; j < nweights; j++) { + const float *bone = &gstate.boneMatrix[j * 12]; + if (wdata[j] != 0) { + float weight = wdata[j] / 128.0f; + for (int i = 0; i < 12; i++) { + skinMatrix[i] += weight * bone[i]; + } + } + } +} + +void VertexDecoder::Step_WeightsU16Skin() const +{ + memset(skinMatrix, 0, sizeof(skinMatrix)); + u16 *wt = (u16 *)(decoded_ + decFmt.w0off); + const u16 *wdata = (const u16*)(ptr_); + for (int j = 0; j < nweights; j++) { + const float *bone = &gstate.boneMatrix[j * 12]; + if (wdata[j] != 0) { + float weight = wdata[j] / 32768.0f; + for (int i = 0; i < 12; i++) { + skinMatrix[i] += weight * bone[i]; + } + } + } +} + +// Float weights should be uncommon, we can live with having to multiply these by 2.0 +// to avoid special checks in the vertex shader generator. +// (PSP uses 0.0-2.0 fixed point numbers for weights) +void VertexDecoder::Step_WeightsFloatSkin() const +{ + memset(skinMatrix, 0, sizeof(skinMatrix)); + float *wt = (float *)(decoded_ + decFmt.w0off); + const float *wdata = (const float*)(ptr_); + for (int j = 0; j < nweights; j++) { + const float *bone = &gstate.boneMatrix[j * 12]; + float weight = wdata[j]; + if (weight > 0.0) { + for (int i = 0; i < 12; i++) { + skinMatrix[i] += weight * bone[i]; + } + } + } +} + void VertexDecoder::Step_TcU8() const { // u32 to write two bytes of zeroes for free. @@ -318,6 +372,29 @@ void VertexDecoder::Step_NormalFloat() const normal[j] = fv[j]; } +void VertexDecoder::Step_NormalS8Skin() const +{ + float *normal = (float *)(decoded_ + decFmt.nrmoff); + const s8 *sv = (const s8*)(ptr_ + nrmoff); + const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f }; + Norm3ByMatrix43(normal, fn, skinMatrix); +} + +void VertexDecoder::Step_NormalS16Skin() const +{ + float *normal = (float *)(decoded_ + decFmt.nrmoff); + const s16 *sv = (const s16*)(ptr_ + nrmoff); + const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f }; + Norm3ByMatrix43(normal, fn, skinMatrix); +} + +void VertexDecoder::Step_NormalFloatSkin() const +{ + float *normal = (float *)(decoded_ + decFmt.nrmoff); + const float *fn = (const float *)(ptr_ + nrmoff); + Norm3ByMatrix43(normal, fn, skinMatrix); +} + void VertexDecoder::Step_NormalS8Morph() const { float *normal = (float *)(decoded_ + decFmt.nrmoff); @@ -382,6 +459,29 @@ void VertexDecoder::Step_PosFloat() const memcpy(v, fv, 12); } +void VertexDecoder::Step_PosS8Skin() const +{ + float *pos = (float *)(decoded_ + decFmt.posoff); + const s8 *sv = (const s8*)(ptr_ + posoff); + const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f }; + Vec3ByMatrix43(pos, fn, skinMatrix); +} + +void VertexDecoder::Step_PosS16Skin() const +{ + float *pos = (float *)(decoded_ + decFmt.posoff); + const s16 *sv = (const s16*)(ptr_ + posoff); + const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f }; + Vec3ByMatrix43(pos, fn, skinMatrix); +} + +void VertexDecoder::Step_PosFloatSkin() const +{ + float *pos = (float *)(decoded_ + decFmt.posoff); + const float *fn = (const float *)(ptr_ + posoff); + Vec3ByMatrix43(pos, fn, skinMatrix); +} + void VertexDecoder::Step_PosS8Through() const { float *v 
= (float *)(decoded_ + decFmt.posoff); @@ -449,6 +549,13 @@ static const StepFunction wtstep[4] = { &VertexDecoder::Step_WeightsFloat, }; +static const StepFunction wtstep_skin[4] = { + 0, + &VertexDecoder::Step_WeightsU8Skin, + &VertexDecoder::Step_WeightsU16Skin, + &VertexDecoder::Step_WeightsFloatSkin, +}; + static const StepFunction tcstep[4] = { 0, &VertexDecoder::Step_TcU8, @@ -510,6 +617,13 @@ static const StepFunction nrmstep[4] = { &VertexDecoder::Step_NormalFloat, }; +static const StepFunction nrmstep_skin[4] = { + 0, + &VertexDecoder::Step_NormalS8Skin, + &VertexDecoder::Step_NormalS16Skin, + &VertexDecoder::Step_NormalFloatSkin, +}; + static const StepFunction nrmstep_morph[4] = { 0, &VertexDecoder::Step_NormalS8Morph, @@ -524,6 +638,13 @@ static const StepFunction posstep[4] = { &VertexDecoder::Step_PosFloat, }; +static const StepFunction posstep_skin[4] = { + 0, + &VertexDecoder::Step_PosS8Skin, + &VertexDecoder::Step_PosS16Skin, + &VertexDecoder::Step_PosFloatSkin, +}; + static const StepFunction posstep_morph[4] = { 0, &VertexDecoder::Step_PosS8Morph, @@ -564,6 +685,8 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { DEBUG_LOG(G3D,"VTYPE: THRU=%i TC=%i COL=%i POS=%i NRM=%i WT=%i NW=%i IDX=%i MC=%i", (int)throughmode, tc,col,pos,nrm,weighttype,nweights,idx,morphcount); } + bool skinInDecode = weighttype != 0 && g_Config.bSoftwareSkinning && morphcount == 1; + if (weighttype) { // && nweights? weightoff = size; //size = align(size, wtalign[weighttype]); unnecessary @@ -571,30 +694,35 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { if (wtalign[weighttype] > biggest) biggest = wtalign[weighttype]; - steps_[numSteps_++] = wtstep[weighttype]; - - int fmtBase = DEC_FLOAT_1; - if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_U8_1; - } else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_U16_1; - } else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_FLOAT_1; - } - - int numWeights = TranslateNumBones(nweights); - - if (numWeights <= 4) { - decFmt.w0off = decOff; - decFmt.w0fmt = fmtBase + numWeights - 1; - decOff += DecFmtSize(decFmt.w0fmt); + if (skinInDecode) { + steps_[numSteps_++] = wtstep_skin[weighttype]; + // No visible output } else { - decFmt.w0off = decOff; - decFmt.w0fmt = fmtBase + 3; - decOff += DecFmtSize(decFmt.w0fmt); - decFmt.w1off = decOff; - decFmt.w1fmt = fmtBase + numWeights - 5; - decOff += DecFmtSize(decFmt.w1fmt); + steps_[numSteps_++] = wtstep[weighttype]; + + int fmtBase = DEC_FLOAT_1; + if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_U8_1; + } else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_U16_1; + } else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_FLOAT_1; + } + + int numWeights = TranslateNumBones(nweights); + + if (numWeights <= 4) { + decFmt.w0off = decOff; + decFmt.w0fmt = fmtBase + numWeights - 1; + decOff += DecFmtSize(decFmt.w0fmt); + } else { + decFmt.w0off = decOff; + decFmt.w0fmt = fmtBase + 3; + decOff += DecFmtSize(decFmt.w0fmt); + decFmt.w1off = decOff; + decFmt.w1fmt = fmtBase + numWeights - 5; + decOff += DecFmtSize(decFmt.w1fmt); + } } } @@ -656,26 +784,29 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { if (nrmalign[nrm] > biggest) biggest = nrmalign[nrm]; - steps_[numSteps_++] = morphcount == 1 ? 
nrmstep[nrm] : nrmstep_morph[nrm]; - - if (morphcount == 1) { - // The normal formats match the gl formats perfectly, let's use 'em. - switch (nrm) { - case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S8_3; break; - case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break; - case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break; - } - } else { + if (skinInDecode) { + steps_[numSteps_++] = nrmstep_skin[nrm]; + // After skinning, we always have three floats. decFmt.nrmfmt = DEC_FLOAT_3; - } + } else { + steps_[numSteps_++] = morphcount == 1 ? nrmstep[nrm] : nrmstep_morph[nrm]; - // Actually, temporarily let's not. + if (morphcount == 1) { + // The normal formats match the gl formats perfectly, let's use 'em. + switch (nrm) { + case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S8_3; break; + case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break; + case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break; + } + } else { + decFmt.nrmfmt = DEC_FLOAT_3; + } + } decFmt.nrmoff = decOff; decOff += DecFmtSize(decFmt.nrmfmt); } - if (pos) // there's always a position - { + if (pos) { // there's always a position size = align(size, posalign[pos]); posoff = size; size += possize[pos]; @@ -686,18 +817,23 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { steps_[numSteps_++] = posstep_through[pos]; decFmt.posfmt = DEC_FLOAT_3; } else { - steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos]; - - if (morphcount == 1) { - // The non-through-mode position formats match the gl formats perfectly, let's use 'em. - switch (pos) { - case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break; - case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break; - case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break; - } - } else { - // Actually, temporarily let's not. + if (skinInDecode) { + steps_[numSteps_++] = posstep_skin[pos]; decFmt.posfmt = DEC_FLOAT_3; + } else { + steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos]; + + if (morphcount == 1) { + // The non-through-mode position formats match the gl formats perfectly, let's use 'em. + switch (pos) { + case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break; + case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break; + case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break; + } + } else { + // Actually, temporarily let's not. 
+ decFmt.posfmt = DEC_FLOAT_3; + } } } decFmt.posoff = decOff; diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index dfa248140c..1ba24af027 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -64,6 +64,10 @@ public: void Step_WeightsU16() const; void Step_WeightsFloat() const; + void Step_WeightsU8Skin() const; + void Step_WeightsU16Skin() const; + void Step_WeightsFloatSkin() const; + void Step_TcU8() const; void Step_TcU16() const; void Step_TcFloat() const; @@ -91,6 +95,10 @@ public: void Step_NormalS16() const; void Step_NormalFloat() const; + void Step_NormalS8Skin() const; + void Step_NormalS16Skin() const; + void Step_NormalFloatSkin() const; + void Step_NormalS8Morph() const; void Step_NormalS16Morph() const; void Step_NormalFloatMorph() const; @@ -99,6 +107,10 @@ public: void Step_PosS16() const; void Step_PosFloat() const; + void Step_PosS8Skin() const; + void Step_PosS16Skin() const; + void Step_PosFloatSkin() const; + void Step_PosS8Morph() const; void Step_PosS16Morph() const; void Step_PosFloatMorph() const; diff --git a/GPU/GPUState.cpp b/GPU/GPUState.cpp index fad43058c7..1a089db3ab 100644 --- a/GPU/GPUState.cpp +++ b/GPU/GPUState.cpp @@ -28,10 +28,14 @@ #include "GPU/Directx9/GPU_DX9.h" #endif #include "Core/CoreParameter.h" +#include "Core/Config.h" #include "Core/System.h" -GPUgstate gstate; -GPUStateCache gstate_c; +// This must be aligned so that the matrices within are aligned. +GPUgstate MEMORY_ALIGNED16(gstate); +// Let's align this one too for good measure. +GPUStateCache MEMORY_ALIGNED16(gstate_c); + GPUInterface *gpu; GPUDebugInterface *gpuDebug; GPUStatistics gpuStats; @@ -200,3 +204,10 @@ void GPUgstate::Restore(u32_le *ptr) { memcpy(projMatrix, matrices, sizeof(projMatrix)); matrices += sizeof(projMatrix); memcpy(tgenMatrix, matrices, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix); } + +bool vertTypeIsSkinningEnabled(u32 vertType) { + if (g_Config.bSoftwareSkinning && ((vertType & GE_VTYPE_MORPHCOUNT_MASK) == 0)) + return false; + else + return ((vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE); +} diff --git a/GPU/GPUState.h b/GPU/GPUState.h index 5e00234693..f1c8aca408 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -18,6 +18,7 @@ #pragma once #include + #include "../Globals.h" #include "ge_constants.h" #include "Common/Common.h" @@ -406,7 +407,8 @@ enum SkipDrawReasonFlags { SKIPDRAW_BAD_FB_TEXTURE = 4, }; -inline bool vertTypeIsSkinningEnabled(u32 vertType) { return ((vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE); } +bool vertTypeIsSkinningEnabled(u32 vertType); + inline int vertTypeGetNumBoneWeights(u32 vertType) { return 1 + ((vertType & GE_VTYPE_WEIGHTCOUNT_MASK) >> GE_VTYPE_WEIGHTCOUNT_SHIFT); } inline int vertTypeGetWeightMask(u32 vertType) { return vertType & GE_VTYPE_WEIGHT_MASK; } inline int vertTypeGetTexCoordMask(u32 vertType) { return vertType & GE_VTYPE_TC_MASK; } diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp index be8044c839..22ce9bf5b1 100644 --- a/UI/GameSettingsScreen.cpp +++ b/UI/GameSettingsScreen.cpp @@ -143,10 +143,13 @@ void GameSettingsScreen::CreateViews() { graphicsSettings->Add(new CheckBox(&g_Config.bVSync, gs->T("VSync"))); #endif graphicsSettings->Add(new CheckBox(&g_Config.bHardwareTransform, gs->T("Hardware Transform"))); + CheckBox *swSkin = graphicsSettings->Add(new CheckBox(&g_Config.bSoftwareSkinning, gs->T("Software Skinning"))); graphicsSettings->Add(new CheckBox(&g_Config.bVertexCache, gs->T("Vertex Cache"))); CheckBox *vtxJit = 
graphicsSettings->Add(new CheckBox(&g_Config.bVertexDecoderJit, gs->T("Vertex Decoder JIT"))); - if (PSP_IsInited()) + if (PSP_IsInited()) { + swSkin->SetEnabled(false); vtxJit->SetEnabled(false); + } graphicsSettings->Add(new CheckBox(&g_Config.bLowQualitySplineBezier, gs->T("LowCurves", "Low quality spline/bezier curves"))); From 4f78eda23b60ed7e947d2ed7a4c0d52dc6ac664d Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 13:30:57 +0100 Subject: [PATCH 03/12] Save a couple of registers in the x86 vertex decoder jit by SIMD-ing prescale UV --- GPU/GLES/VertexDecoder.cpp | 73 ++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 6e848708a5..8844112ac1 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -927,11 +927,11 @@ struct JitLookup { JitStepFunction jitFunc; }; +#ifdef ARM + static const float by128 = 1.0f / 128.0f; static const float by32768 = 1.0f / 32768.0f; -#ifdef ARM - using namespace ArmGen; static const ARMReg tempReg1 = R3; @@ -1373,6 +1373,9 @@ void VertexDecoderJitCache::Jit_PosFloat() { using namespace Gen; +static const float MEMORY_ALIGNED16( by128[4] ) = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f}; +static const float MEMORY_ALIGNED16( by32768[4] ) = {1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f}; + #ifdef _M_X64 #ifdef _WIN32 static const X64Reg tempReg1 = RAX; @@ -1400,12 +1403,13 @@ static const X64Reg counterReg = ECX; // XMM0-XMM5 are volatile on Windows X64 // XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms) -static const X64Reg fpUscaleReg = XMM0; -static const X64Reg fpVscaleReg = XMM1; -static const X64Reg fpUoffsetReg = XMM2; -static const X64Reg fpVoffsetReg = XMM3; -static const X64Reg fpScratchReg = XMM4; -static const X64Reg fpScratchReg2 = XMM5; +static const X64Reg fpScaleReg = XMM0; +static const X64Reg fpOffsetReg = XMM1; +static const X64Reg fpScratchReg = XMM2; +static const X64Reg fpScratchReg2 = XMM3; +// We're gonna keep the current skinning matrix in 3 or 4 XMM regs. Fortunately we easily +// have space for that now. + // To debug, just comment them out one at a time until it works. We fall back // on the interpreter if the compiler fails. 
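
For reference, the packed layout above turns the four scalar multiply/adds per texcoord into one MULPS and one ADDPS. A scalar C++ sketch of what the jitted prescale path computes per vertex (names here are illustrative, not from the codebase; for 8-bit and 16-bit texcoords the scale pair has already been pre-multiplied by 1/128 or 1/32768 in the prologue):

	// scale = {uscale, vscale}, offset = {uoffset, voffset}
	static void PrescaleUV(const float in[2], const float scale[2],
	                       const float offset[2], float out[2]) {
		for (int i = 0; i < 2; i++)
			out[i] = in[i] * scale[i] + offset[i];  // one MULPS + one ADDPS
	}
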
@@ -1495,16 +1499,16 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { #else MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv))); #endif - MOVSS(fpUscaleReg, MDisp(tempReg1, 0)); - MOVSS(fpVscaleReg, MDisp(tempReg1, 4)); - MOVSS(fpUoffsetReg, MDisp(tempReg1, 8)); - MOVSS(fpVoffsetReg, MDisp(tempReg1, 12)); + MOVSS(fpScaleReg, MDisp(tempReg1, 0)); + MOVSS(fpScratchReg, MDisp(tempReg1, 4)); + UNPCKLPS(fpScaleReg, R(fpScratchReg)); + MOVSS(fpOffsetReg, MDisp(tempReg1, 8)); + MOVSS(fpScratchReg, MDisp(tempReg1, 12)); + UNPCKLPS(fpOffsetReg, R(fpScratchReg)); if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - MULSS(fpUscaleReg, M((void *)&by128)); - MULSS(fpVscaleReg, M((void *)&by128)); + MULPS(fpScaleReg, M((void *)&by128)); } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - MULSS(fpUscaleReg, M((void *)&by32768)); - MULSS(fpVscaleReg, M((void *)&by32768)); + MULPS(fpScaleReg, M((void *)&by32768)); } } @@ -1611,43 +1615,34 @@ void VertexDecoderJitCache::Jit_TcFloat() { } void VertexDecoderJitCache::Jit_TcU8Prescale() { - // TODO: SIMD + // TODO: The first five instructions could be done in 1 or 2 in SSE4 MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff)); MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1)); CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); - MULSS(fpScratchReg, R(fpUscaleReg)); - MULSS(fpScratchReg2, R(fpVscaleReg)); - ADDSS(fpScratchReg, R(fpUoffsetReg)); - ADDSS(fpScratchReg2, R(fpVoffsetReg)); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + MULPS(fpScratchReg, R(fpScaleReg)); + ADDPS(fpScratchReg, R(fpOffsetReg)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcU16Prescale() { - // TODO: SIMD + // TODO: The first five instructions could be done in 1 or 2 in SSE4 and probably in 3 in SSE2 MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); - MULSS(fpScratchReg, R(fpUscaleReg)); - MULSS(fpScratchReg2, R(fpVscaleReg)); - ADDSS(fpScratchReg, R(fpUoffsetReg)); - ADDSS(fpScratchReg2, R(fpVoffsetReg)); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + MULPS(fpScratchReg, R(fpScaleReg)); + ADDPS(fpScratchReg, R(fpOffsetReg)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcFloatPrescale() { - // TODO: SIMD - MOVSS(fpScratchReg, MDisp(srcReg, dec_->tcoff)); - MOVSS(fpScratchReg2, MDisp(srcReg, dec_->tcoff + 4)); - MULSS(fpScratchReg, R(fpUscaleReg)); - MULSS(fpScratchReg2, R(fpVscaleReg)); - ADDSS(fpScratchReg, R(fpUoffsetReg)); - ADDSS(fpScratchReg2, R(fpVoffsetReg)); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); + MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); + MULPS(fpScratchReg, R(fpScaleReg)); + ADDPS(fpScratchReg, R(fpOffsetReg)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcU16Through() { From 46313ced55e5153f90b02e65136de1d81f0b64e5 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 15:22:24 +0100 Subject: [PATCH 04/12] Prepare transform pipeline for step by step decoding 
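
This moves the body of the DecodeVerts() loop into a new DecodeVertsStep() without changing behavior; the next patch then calls a single step directly from SubmitPrim() when software skinning. After the split, the outer loop reduces to the following (quoted from the diff below):

	for (; decodeCounter_ < numDrawCalls; decodeCounter_++) {
		if (uvScale)
			gstate_c.uv = uvScale[decodeCounter_];
		DecodeVertsStep();  // may advance decodeCounter_ past merged draw calls
	}
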
--- GPU/GLES/TransformPipeline.cpp | 133 +++++++++++++++++---------------- GPU/GLES/TransformPipeline.h | 4 +- 2 files changed, 72 insertions(+), 65 deletions(-) diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index d1f70a69f7..c88502fed4 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -119,7 +119,8 @@ TransformDrawEngine::TransformDrawEngine() framebufferManager_(0), numDrawCalls(0), vertexCountInDrawCalls(0), - uvScale(0) { + uvScale(0), + decodeCounter_(0) { decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; // Allocate nicely aligned memory. Maybe graphics drivers will // appreciate it. @@ -332,70 +333,11 @@ void TransformDrawEngine::DecodeVerts() { UVScale origUV; if (uvScale) origUV = gstate_c.uv; - for (int i = 0; i < numDrawCalls; i++) { - const DeferredDrawCall &dc = drawCalls[i]; - - indexGen.SetIndex(collectedVerts); - int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; - - u32 indexType = dc.indexType; - void *inds = dc.inds; - if (indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { - // Decode the verts and apply morphing. Simple. - if (uvScale) - gstate_c.uv = uvScale[i]; - dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, - dc.verts, indexLowerBound, indexUpperBound); - collectedVerts += indexUpperBound - indexLowerBound + 1; - indexGen.AddPrim(dc.prim, dc.vertexCount); - } else { - // It's fairly common that games issue long sequences of PRIM calls, with differing - // inds pointer but the same base vertex pointer. We'd like to reuse vertices between - // these as much as possible, so we make sure here to combine as many as possible - // into one nice big drawcall, sharing data. - - // 1. Look ahead to find the max index, only looking as "matching" drawcalls. - // Expand the lower and upper bounds as we go. - int j = i + 1; - int lastMatch = i; - while (j < numDrawCalls) { - if (drawCalls[j].verts != dc.verts) - break; - if (uvScale && memcmp(&uvScale[j], &uvScale[i], sizeof(uvScale[0])) != 0) - break; - - indexLowerBound = std::min(indexLowerBound, (int)drawCalls[j].indexLowerBound); - indexUpperBound = std::max(indexUpperBound, (int)drawCalls[j].indexUpperBound); - lastMatch = j; - j++; - } - - // 2. Loop through the drawcalls, translating indices as we go. - for (j = i; j <= lastMatch; j++) { - switch (indexType) { - case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: - indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u8 *)drawCalls[j].inds, indexLowerBound); - break; - case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: - indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u16 *)drawCalls[j].inds, indexLowerBound); - break; - } - } - - int vertexCount = indexUpperBound - indexLowerBound + 1; - // 3. Decode that range of vertex data. - if (uvScale) - gstate_c.uv = uvScale[i]; - dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, - dc.verts, indexLowerBound, indexUpperBound); - collectedVerts += vertexCount; - - // 4. Advance indexgen vertex counter. 
- indexGen.Advance(vertexCount); - i = lastMatch; - } + for (; decodeCounter_ < numDrawCalls; decodeCounter_++) { + if (uvScale) + gstate_c.uv = uvScale[decodeCounter_]; + DecodeVertsStep(); } - // Sanity check if (indexGen.Prim() < 0) { ERROR_LOG_REPORT(G3D, "DecodeVerts: Failed to deduce prim: %i", indexGen.Prim()); @@ -406,6 +348,68 @@ void TransformDrawEngine::DecodeVerts() { gstate_c.uv = origUV; } +void TransformDrawEngine::DecodeVertsStep() { + const int i = decodeCounter_; + + const DeferredDrawCall &dc = drawCalls[i]; + + indexGen.SetIndex(collectedVerts); + int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; + + u32 indexType = dc.indexType; + void *inds = dc.inds; + if (indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { + // Decode the verts and apply morphing. Simple. + dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, + dc.verts, indexLowerBound, indexUpperBound); + collectedVerts += indexUpperBound - indexLowerBound + 1; + indexGen.AddPrim(dc.prim, dc.vertexCount); + } else { + // It's fairly common that games issue long sequences of PRIM calls, with differing + // inds pointer but the same base vertex pointer. We'd like to reuse vertices between + // these as much as possible, so we make sure here to combine as many as possible + // into one nice big drawcall, sharing data. + + // 1. Look ahead to find the max index, only looking as "matching" drawcalls. + // Expand the lower and upper bounds as we go. + int j = i + 1; + int lastMatch = i; + while (j < numDrawCalls) { + if (drawCalls[j].verts != dc.verts) + break; + if (uvScale && memcmp(&uvScale[j], &uvScale[i], sizeof(uvScale[0])) != 0) + break; + + indexLowerBound = std::min(indexLowerBound, (int)drawCalls[j].indexLowerBound); + indexUpperBound = std::max(indexUpperBound, (int)drawCalls[j].indexUpperBound); + lastMatch = j; + j++; + } + + // 2. Loop through the drawcalls, translating indices as we go. + for (j = i; j <= lastMatch; j++) { + switch (indexType) { + case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u8 *)drawCalls[j].inds, indexLowerBound); + break; + case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u16 *)drawCalls[j].inds, indexLowerBound); + break; + } + } + + int vertexCount = indexUpperBound - indexLowerBound + 1; + // 3. Decode that range of vertex data. + dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, + dc.verts, indexLowerBound, indexUpperBound); + collectedVerts += vertexCount; + + // 4. Advance indexgen vertex counter. 
+ indexGen.Advance(vertexCount); + decodeCounter_ = lastMatch; + } +} + u32 TransformDrawEngine::ComputeHash() { u32 fullhash = 0; int vertexSize = dec_->GetDecVtxFmt().stride; @@ -720,6 +724,7 @@ rotateVBO: collectedVerts = 0; numDrawCalls = 0; vertexCountInDrawCalls = 0; + decodeCounter_ = 0; prevPrim_ = GE_PRIM_INVALID; #ifndef USING_GLES2 diff --git a/GPU/GLES/TransformPipeline.h b/GPU/GLES/TransformPipeline.h index 543a8a2f8c..273cf66982 100644 --- a/GPU/GLES/TransformPipeline.h +++ b/GPU/GLES/TransformPipeline.h @@ -97,7 +97,6 @@ public: void SubmitBezier(void* control_points, void* indices, int count_u, int count_v, GEPatchPrimType prim_type, u32 vertType); bool TestBoundingBox(void* control_points, int vertexCount, u32 vertType); - void DecodeVerts(); void SetShaderManager(ShaderManager *shaderManager) { shaderManager_ = shaderManager; } @@ -127,6 +126,8 @@ public: } private: + void DecodeVerts(); + void DecodeVertsStep(); void DoFlush(); void SoftwareTransformAndDraw(int prim, u8 *decoded, LinkedShader *program, int vertexCount, u32 vertexType, void *inds, int indexType, const DecVtxFormat &decVtxFormat, int maxIndex); void ApplyDrawState(int prim); @@ -195,6 +196,7 @@ private: int vertexCountInDrawCalls; int decimationCounter_; + int decodeCounter_; UVScale *uvScale; }; From 179934ec9fc942fa4a65a664ddb2d199325cca7f Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 15:31:56 +0100 Subject: [PATCH 05/12] Decode step by step when sw skinning --- GPU/GLES/TransformPipeline.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index c88502fed4..9bab41bfca 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -325,8 +325,14 @@ void TransformDrawEngine::SubmitPrim(void *verts, void *inds, GEPrimitiveType pr if (uvScale) { uvScale[numDrawCalls] = gstate_c.uv; } + numDrawCalls++; vertexCountInDrawCalls += vertexCount; + + if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) { + DecodeVertsStep(); + decodeCounter_++; + } } void TransformDrawEngine::DecodeVerts() { From 6976d6a3a07500b11afde1f4f0c5911c4370c0f2 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 16:05:34 +0100 Subject: [PATCH 06/12] Enable the softskinning optimizations that let us merge drawcalls --- GPU/GLES/GLES_GPU.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index 19343dc1db..6009ed75e4 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -430,10 +430,8 @@ GLES_GPU::GLES_GPU() commandFlags_[GE_CMD_TEXOFFSETV] &= ~FLAG_FLUSHBEFOREONCHANGE; } - // TODO: Can't turn this optimization on until we don't decode everything in one go - // but instead decode for every draw call when sw skinning if (g_Config.bSoftwareSkinning) { - // commandFlags_[GE_CMD_VERTEXTYPE] &= ~FLAG_FLUSHBEFOREONCHANGE; + commandFlags_[GE_CMD_VERTEXTYPE] &= ~FLAG_FLUSHBEFOREONCHANGE; } BuildReportingInfo(); @@ -876,16 +874,16 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_VERTEXTYPE: if (diff) { if (!g_Config.bSoftwareSkinning) { - shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); - } else if (false) { - // TODO: Can't turn this optimization on until we don't decode everything in one go - // but instead decode for every draw call when sw skinning + if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK)) + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + } else { if (diff & 
~GE_VTYPE_WEIGHTCOUNT_MASK) { // Restore and flush gstate.vertType ^= diff; Flush(); gstate.vertType ^= diff; - shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK)) + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); } } } @@ -1392,16 +1390,16 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_BONEMATRIXDATA: { int num = gstate.boneMatrixNumber & 0x7F; - float newVal = getFloat24(data); - if (num < 96 && newVal != gstate.boneMatrix[num]) { + u32 newVal = data << 8; + if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) { // Bone matrices should NOT flush when software skinning is enabled! // TODO: Also check for morph... // TODO: Can't turn this optimizatoin on until we decode per drawcall when sw skinning. - if (true || !g_Config.bSoftwareSkinning) { + if (!g_Config.bSoftwareSkinning) { Flush(); shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); } - gstate.boneMatrix[num] = newVal; + ((u32 *)gstate.boneMatrix)[num] = newVal; } num++; gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F); From f0cacf46d080a6b604852fba6d574e2d2d0c215c Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 16:05:57 +0100 Subject: [PATCH 07/12] No reason to involve the FPU when loading matrices --- GPU/GLES/GLES_GPU.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index 6009ed75e4..b296f91e2e 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -1187,9 +1187,9 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_LDC0:case GE_CMD_LDC1:case GE_CMD_LDC2:case GE_CMD_LDC3: case GE_CMD_LSC0:case GE_CMD_LSC1:case GE_CMD_LSC2:case GE_CMD_LSC3: if (diff) { - float r = (float)(data & 0xff)/255.0f; - float g = (float)((data>>8) & 0xff)/255.0f; - float b = (float)(data>>16)/255.0f; + float r = (float)(data & 0xff) * (1.0f / 255.0f); + float g = (float)((data >> 8) & 0xff) * (1.0f / 255.0f); + float b = (float)(data >> 16) * (1.0f / 255.0f); int l = (cmd - GE_CMD_LAC0) / 3; int t = (cmd - GE_CMD_LAC0) % 3; @@ -1318,10 +1318,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_WORLDMATRIXDATA: { int num = gstate.worldmtxnum & 0xF; - float newVal = getFloat24(data); - if (num < 12 && newVal != gstate.worldMatrix[num]) { + u32 newVal = data << 8; + if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) { Flush(); - gstate.worldMatrix[num] = newVal; + ((u32 *)gstate.worldMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_WORLDMATRIX); } num++; @@ -1336,10 +1336,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_VIEWMATRIXDATA: { int num = gstate.viewmtxnum & 0xF; - float newVal = getFloat24(data); - if (num < 12 && newVal != gstate.viewMatrix[num]) { + u32 newVal = data << 8; + if (num < 12 && newVal != ((const u32 *)gstate.viewMatrix)[num]) { Flush(); - gstate.viewMatrix[num] = newVal; + ((u32 *)gstate.viewMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_VIEWMATRIX); } num++; @@ -1354,10 +1354,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_PROJMATRIXDATA: { int num = gstate.projmtxnum & 0xF; - float newVal = getFloat24(data); - if (newVal != gstate.projMatrix[num]) { + u32 newVal = data << 8; + if (newVal != ((const u32 *)gstate.projMatrix)[num]) { Flush(); - gstate.projMatrix[num] = newVal; + ((u32 *)gstate.projMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_PROJMATRIX); } 
num++; @@ -1372,10 +1372,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_TGENMATRIXDATA: { int num = gstate.texmtxnum & 0xF; - float newVal = getFloat24(data); - if (num < 12 && newVal != gstate.tgenMatrix[num]) { + u32 newVal = data << 8; + if (num < 12 && newVal != ((const u32 *)gstate.tgenMatrix)[num]) { Flush(); - gstate.tgenMatrix[num] = newVal; + ((u32 *)gstate.tgenMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_TEXMATRIX); } num++; From 9333d3ea769b75541e60323f6627c328f8cacfcb Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 17:40:38 +0100 Subject: [PATCH 08/12] Vtx dec jit: Combine the scale and offset registers to save 1 more xmm register. --- GPU/GLES/VertexDecoder.cpp | 46 ++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 8844112ac1..d0a67abbaf 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -1403,13 +1403,14 @@ static const X64Reg counterReg = ECX; // XMM0-XMM5 are volatile on Windows X64 // XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms) -static const X64Reg fpScaleReg = XMM0; -static const X64Reg fpOffsetReg = XMM1; -static const X64Reg fpScratchReg = XMM2; -static const X64Reg fpScratchReg2 = XMM3; -// We're gonna keep the current skinning matrix in 3 or 4 XMM regs. Fortunately we easily -// have space for that now. +static const X64Reg fpScaleOffsetReg = XMM0; +static const X64Reg fpScratchReg = XMM1; +static const X64Reg fpScratchReg2 = XMM2; +static const X64Reg fpScratchReg3 = XMM3; + +// We're gonna keep the current skinning matrix in 4 XMM regs. Fortunately we easily +// have space for that now. // To debug, just comment them out one at a time until it works. We fall back // on the interpreter if the compiler fails. @@ -1499,17 +1500,18 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { #else MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv))); #endif - MOVSS(fpScaleReg, MDisp(tempReg1, 0)); + MOVSS(fpScaleOffsetReg, MDisp(tempReg1, 0)); MOVSS(fpScratchReg, MDisp(tempReg1, 4)); - UNPCKLPS(fpScaleReg, R(fpScratchReg)); - MOVSS(fpOffsetReg, MDisp(tempReg1, 8)); - MOVSS(fpScratchReg, MDisp(tempReg1, 12)); - UNPCKLPS(fpOffsetReg, R(fpScratchReg)); + UNPCKLPS(fpScaleOffsetReg, R(fpScratchReg)); if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - MULPS(fpScaleReg, M((void *)&by128)); + MULPS(fpScaleOffsetReg, M((void *)&by128)); } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - MULPS(fpScaleReg, M((void *)&by32768)); + MULPS(fpScaleOffsetReg, M((void *)&by32768)); } + MOVSS(fpScratchReg, MDisp(tempReg1, 8)); + MOVSS(fpScratchReg2, MDisp(tempReg1, 12)); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg)); } // Let's not bother with a proper stack frame. We just grab the arguments and go. 
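
With UNPCKLPD the single register ends up as {uscale, vscale, uoffset, voffset}. The prescale steps below then multiply against the low half, swap the 64-bit halves with SHUFPS to bring the offsets down, add, and swap back so the register is intact for the next vertex. A lane-level sketch of that sequence (illustrative C++, not from the patch):

	#include <utility>

	// so[] models fpScaleOffsetReg = {uscale, vscale, uoffset, voffset}.
	static void PrescaleUV(float uv[2], float so[4]) {
		uv[0] *= so[0]; uv[1] *= so[1];  // MULPS against the scale half
		std::swap(so[0], so[2]);         // SHUFPS _MM_SHUFFLE(1, 0, 3, 2):
		std::swap(so[1], so[3]);         //   {us, vs, uo, vo} -> {uo, vo, us, vs}
		uv[0] += so[0]; uv[1] += so[1];  // ADDPS against the offset half
		std::swap(so[0], so[2]);         // second SHUFPS restores the layout
		std::swap(so[1], so[3]);
	}
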
@@ -1621,8 +1623,10 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() { CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); UNPCKLPS(fpScratchReg, R(fpScratchReg2)); - MULPS(fpScratchReg, R(fpScaleReg)); - ADDPS(fpScratchReg, R(fpOffsetReg)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } @@ -1633,15 +1637,19 @@ void VertexDecoderJitCache::Jit_TcU16Prescale() { CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); UNPCKLPS(fpScratchReg, R(fpScratchReg2)); - MULPS(fpScratchReg, R(fpScaleReg)); - ADDPS(fpScratchReg, R(fpOffsetReg)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcFloatPrescale() { MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); - MULPS(fpScratchReg, R(fpScaleReg)); - ADDPS(fpScratchReg, R(fpOffsetReg)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } From 6221dbaf5d83ea6049e8d47f52c92b841a174bec Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 20:08:46 +0100 Subject: [PATCH 09/12] Optimize software skinning for x86. Can't seem to get a win on Windows vs hardware skinning though, even though draw calls drop by 2/3rd... --- Common/x64Emitter.cpp | 6 + Common/x64Emitter.h | 2 + GPU/GLES/GLES_GPU.cpp | 1 - GPU/GLES/VertexDecoder.cpp | 331 +++++++++++++++++++++++++++++++++++-- GPU/GLES/VertexDecoder.h | 16 +- 5 files changed, 342 insertions(+), 14 deletions(-) diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp index 9abc3eff2c..a4e6db829b 100644 --- a/Common/x64Emitter.cpp +++ b/Common/x64Emitter.cpp @@ -1371,6 +1371,12 @@ void XEmitter::PSLLQ(X64Reg reg, int shift) { Write8(shift); } +void XEmitter::PSLLDQ(X64Reg reg, int shift) { + WriteSSEOp(64, 0x73, true, (X64Reg)7, R(reg)); + Write8(shift); +} + + // WARNING not REX compatible void XEmitter::PSRAW(X64Reg reg, int shift) { if (reg > 7) diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h index ba7c8d0fa1..ccff753935 100644 --- a/Common/x64Emitter.h +++ b/Common/x64Emitter.h @@ -658,6 +658,8 @@ public: void PSLLD(X64Reg reg, int shift); void PSLLQ(X64Reg reg, int shift); + void PSLLDQ(X64Reg reg, int shift); + void PSRAW(X64Reg reg, int shift); void PSRAD(X64Reg reg, int shift); diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index b296f91e2e..79226106e2 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -1394,7 +1394,6 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) { // Bone matrices should NOT flush when software skinning is enabled! // TODO: Also check for morph... - // TODO: Can't turn this optimizatoin on until we decode per drawcall when sw skinning. 
if (!g_Config.bSoftwareSkinning) { Flush(); shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index d0a67abbaf..b58a97d8f8 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -15,6 +15,8 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include "base/basictypes.h" + #include "Core/Config.h" #include "Core/MemMap.h" #include "GPU/ge_constants.h" @@ -35,9 +37,13 @@ static const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4}; static const u8 possize[4] = {0,3,6,12}, posalign[4] = {0,1,2,4}; static const u8 wtsize[4] = {0,1,2,4}, wtalign[4] = {0,1,2,4}; -// When software skinning. This should be stored in registers instead of memory -// when jitting. -float skinMatrix[12]; +// When software skinning. This array is only used when non-jitted - when jitted, the matrix +// is kept in registers. +static float skinMatrix[12]; + +// We start out by converting the active matrices into 4x4 which are easier to multiply with +// using SSE / NEON and store them here. +static float bones[16 * 8]; inline int align(int n, int align) { return (n + (align - 1)) & ~(align - 1); @@ -930,6 +936,7 @@ struct JitLookup { #ifdef ARM static const float by128 = 1.0f / 128.0f; +static const float by256 = 1.0f / 256.0f; static const float by32768 = 1.0f / 32768.0f; using namespace ArmGen; @@ -1373,8 +1380,18 @@ void VertexDecoderJitCache::Jit_PosFloat() { using namespace Gen; -static const float MEMORY_ALIGNED16( by128[4] ) = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f}; -static const float MEMORY_ALIGNED16( by32768[4] ) = {1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f}; +static const float MEMORY_ALIGNED16( by128[4] ) = { + 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f +}; +static const float MEMORY_ALIGNED16( by256[4] ) = { + 1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256 +}; +static const float MEMORY_ALIGNED16( by32768[4] ) = { + 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, +}; + +static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; +static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000}; #ifdef _M_X64 #ifdef _WIN32 @@ -1420,6 +1437,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, + {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, + {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, + {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, + {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, @@ -1437,6 +1458,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat}, + {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin}, + {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin}, + {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin}, + {&VertexDecoder::Step_Color8888, 
&VertexDecoderJitCache::Jit_Color8888}, {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444}, {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565}, @@ -1449,6 +1474,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8}, {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16}, {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat}, + + {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin}, + {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin}, + {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin}, }; // TODO: This should probably be global... @@ -1479,9 +1508,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { // Save XMM4/XMM5 which apparently can be problematic? // Actually, if they are, it must be a compiler bug because they SHOULD be ok. // So I won't bother. - // SUB(PTRBITS, R(ESP), Imm8(32)); - // MOVUPS(MDisp(ESP, 0), XMM4); - // MOVUPS(MDisp(ESP, 16), XMM5); + SUB(PTRBITS, R(ESP), Imm8(64)); + MOVUPS(MDisp(ESP, 0), XMM4); + MOVUPS(MDisp(ESP, 16), XMM5); + MOVUPS(MDisp(ESP, 32), XMM6); + MOVUPS(MDisp(ESP, 48), XMM7); bool prescaleStep = false; // Look for prescaled texcoord steps @@ -1493,6 +1524,28 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { } } + // Add code to convert matrices to 4x4. + // Later we might want to do this when the matrices are loaded instead. + // This is mostly proof of concept. + int boneCount = 0; + if (dec.weighttype) { + for (int i = 0; i < 8; i++) { + MOVUPS(XMM0, M((void *)(gstate.boneMatrix + 12 * i))); + MOVUPS(XMM1, M((void *)(gstate.boneMatrix + 12 * i + 3))); + MOVUPS(XMM2, M((void *)(gstate.boneMatrix + 12 * i + 3 * 2))); + MOVUPS(XMM3, M((void *)(gstate.boneMatrix + 12 * i + 3 * 3))); + ANDPS(XMM0, M((void *)&threeMasks)); + ANDPS(XMM1, M((void *)&threeMasks)); + ANDPS(XMM2, M((void *)&threeMasks)); + ANDPS(XMM3, M((void *)&threeMasks)); + ORPS(XMM3, M((void *)&aOne)); + MOVAPS(M((void *)(bones + 16 * i)), XMM0); + MOVAPS(M((void *)(bones + 16 * i + 4)), XMM1); + MOVAPS(M((void *)(bones + 16 * i + 8)), XMM2); + MOVAPS(M((void *)(bones + 16 * i + 12)), XMM3); + } + } + // Keep the scale/offset in a few fp registers if we need it. 
if (prescaleStep) { #ifdef _M_X64 @@ -1529,9 +1582,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUB(32, R(counterReg), Imm8(1)); J_CC(CC_NZ, loopStart, true); - // MOVUPS(XMM4, MDisp(ESP, 0)); - // MOVUPS(XMM5, MDisp(ESP, 16)); - // ADD(PTRBITS, R(ESP), Imm8(32)); + MOVUPS(XMM4, MDisp(ESP, 0)); + MOVUPS(XMM5, MDisp(ESP, 16)); + MOVUPS(XMM6, MDisp(ESP, 32)); + MOVUPS(XMM7, MDisp(ESP, 48)); + ADD(PTRBITS, R(ESP), Imm8(64)); #ifdef _M_IX86 // Restore register values @@ -1584,6 +1639,118 @@ void VertexDecoderJitCache::Jit_WeightsFloat() { } } +void VertexDecoderJitCache::Jit_WeightsU8Skin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j)); + CVTSI2SS(XMM1, R(tempReg1)); + MULSS(XMM1, M((void *)&by128)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + +void VertexDecoderJitCache::Jit_WeightsU16Skin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2)); + CVTSI2SS(XMM1, R(tempReg1)); + MULSS(XMM1, M((void *)&by32768)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + +void VertexDecoderJitCache::Jit_WeightsFloatSkin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, 
MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + // Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy. void VertexDecoderJitCache::Jit_TcU8() { MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); @@ -1816,6 +1983,77 @@ void VertexDecoderJitCache::Jit_NormalFloat() { MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3)); } +void VertexDecoderJitCache::Jit_NormalS8Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->nrmoff + (2 - i))); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by128)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1); +} + +// Copy 6 bytes and then 2 zeroes. +void VertexDecoderJitCache::Jit_NormalS16Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->nrmoff + (2 - i) * 2)); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by32768)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1); +} + +void VertexDecoderJitCache::Jit_NormalFloatSkin() { + MOVUPS(XMM3, MDisp(srcReg, dec_->nrmoff)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + + MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1); +} + // Through expands into floats, always. Might want to look at changing this. 
void VertexDecoderJitCache::Jit_PosS8Through() { // TODO: SIMD @@ -1861,6 +2099,77 @@ void VertexDecoderJitCache::Jit_PosFloat() { MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3)); } +void VertexDecoderJitCache::Jit_PosS8Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + (2 - i))); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by128)); + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + ADDPS(XMM1, R(XMM7)); + MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1); +} + +void VertexDecoderJitCache::Jit_PosS16Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + (2 - i) * 2)); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by32768)); + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + ADDPS(XMM1, R(XMM7)); + MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1); +} + +// Just copy 12 bytes. +void VertexDecoderJitCache::Jit_PosFloatSkin() { + MOVUPS(XMM3, MDisp(srcReg, dec_->posoff)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + ADDPS(XMM1, R(XMM7)); + MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1); +} + #elif defined(PPC) #error This should not be built for PowerPC, at least not yet. 
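
A note on the data flow in the skin path above: the prologue widens each 12-float PSP bone matrix into a 4x4 row layout in bones[] (w = 0 on the first three rows, w = 1 on the translation row), the Jit_Weights*Skin steps accumulate the weighted sum of those rows into XMM4-XMM7, and the position/normal steps consume the result without touching memory. In scalar C++ the transform they compute is roughly (illustrative names, not from the codebase):

	// m points at four float4 rows of the accumulated skin matrix:
	// x-row, y-row, z-row, translation row (matching XMM4..XMM7).
	static void SkinVec3(const float m[16], const float v[3],
	                     bool translate, float out[4]) {
		for (int i = 0; i < 4; i++) {
			out[i] = v[0] * m[0 + i]      // broadcast v.x, MULPS with row 0
			       + v[1] * m[4 + i]      // v.y times row 1, ADDPS
			       + v[2] * m[8 + i];     // v.z times row 2, ADDPS
			if (translate)
				out[i] += m[12 + i];      // positions add the w row (XMM7)
		}
	}

Positions correspond to translate = true (the trailing ADDPS against XMM7); normals leave the translation row out.
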
diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h
index 1ba24af027..b6586756f6 100644
--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@@ -199,6 +199,10 @@ public:
 	void Jit_WeightsU16();
 	void Jit_WeightsFloat();
 
+	void Jit_WeightsU8Skin();
+	void Jit_WeightsU16Skin();
+	void Jit_WeightsFloatSkin();
+
 	void Jit_TcU8();
 	void Jit_TcU16();
 	void Jit_TcFloat();
@@ -222,11 +226,19 @@ public:
 	void Jit_NormalS16();
 	void Jit_NormalFloat();
 
+	void Jit_NormalS8Skin();
+	void Jit_NormalS16Skin();
+	void Jit_NormalFloatSkin();
+
 	void Jit_PosS8();
-	void Jit_PosS8Through();
 	void Jit_PosS16();
-	void Jit_PosS16Through();
 	void Jit_PosFloat();
+	void Jit_PosS8Through();
+	void Jit_PosS16Through();
+
+	void Jit_PosS8Skin();
+	void Jit_PosS16Skin();
+	void Jit_PosFloatSkin();
 
 private:
 	bool CompileStep(const VertexDecoder &dec, int i);

From 821a2f10f8766b2f29ba911cc40642757de390d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Wed, 13 Nov 2013 10:35:22 +0100
Subject: [PATCH 10/12] Delete obsolete code

---
 GPU/GLES/VertexDecoder.cpp | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index b58a97d8f8..24edc39773 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -49,37 +49,6 @@ inline int align(int n, int align) {
 	return (n + (align - 1)) & ~(align - 1);
 }
 
-#if 0
-// This is what the software transform spits out, and thus w
-DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
-	DecVtxFormat tfm = {0};
-	int size = 0;
-	int offset = 0;
-	// Weights disappear during transform.
-	if (fmt.uvfmt) {
-		// UV always becomes float2.
-		tfm.uvfmt = DEC_FLOAT_2;
-		tfm.uvoff = offset;
-		offset += DecFmtSize(tfm.uvfmt);
-	}
-	// We always (?) get two colors out, they're floats (although we'd probably be fine with less precision).
-	tfm.c0fmt = DEC_FLOAT_4;
-	tfm.c0off = offset;
-	offset += DecFmtSize(tfm.c0fmt);
-	tfm.c1fmt = DEC_FLOAT_3;  // color1 (specular) doesn't have alpha.
-	tfm.c1off = offset;
-	offset += DecFmtSize(tfm.c1fmt);
-	// We never get a normal, it's gone.
-	// But we do get a position, and it's always float3.
-	tfm.posfmt = DEC_FLOAT_3;
-	tfm.posoff = offset;
-	offset += DecFmtSize(tfm.posfmt);
-	// Update stride.
-	tfm.stride = offset;
-	return tfm;
-}
-#endif
-
 VertexDecoder::VertexDecoder() : coloff(0), nrmoff(0), posoff(0), jitted_(0) {
 	memset(stats_, 0, sizeof(stats_));
 }

From 9bbdd1907d0bcad04e893ede9a476bdee2cff47e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Wed, 13 Nov 2013 17:17:14 +0100
Subject: [PATCH 11/12] Kind-of optimized ARM software skinning (non-NEON)

---
 GPU/GLES/VertexDecoder.cpp | 228 ++++++++++++++++++++++++++++++++++++-
 GPU/GLES/VertexDecoder.h   |   1 +
 2 files changed, 226 insertions(+), 3 deletions(-)
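Since non-NEON VFP can't keep the whole 4x3 skin matrix in registers, the Weights*Skin steps below accumulate it into the skinMatrix buffer in memory once per vertex, and Jit_WriteMatrixMul reads it back to transform the position and normal. In scalar terms (a sketch of what the jitted steps compute, assuming the weights have already been converted to float and scaled; AccumulateSkinMatrix itself is a hypothetical name):

	// skinMatrix = sum over j of weight[j] * boneMatrix[j], 12 floats per matrix.
	static void AccumulateSkinMatrix(const float *weights, int nweights,
	                                 const float *boneMatrix, float skinMatrix[12]) {
		for (int i = 0; i < 12; i++)
			skinMatrix[i] = weights[0] * boneMatrix[i];  // first lap: plain store
		for (int j = 1; j < nweights; j++)
			for (int i = 0; i < 12; i++)
				skinMatrix[i] += weights[j] * boneMatrix[j * 12 + i];  // later laps: multiply-accumulate
	}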
diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index 24edc39773..f81514078b 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -16,6 +16,7 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include "base/basictypes.h"
+#include "base/logging.h"
 
 #include "Core/Config.h"
 #include "Core/MemMap.h"
@@ -878,7 +879,7 @@ int VertexDecoder::ToString(char *output) const {
 
 VertexDecoderJitCache::VertexDecoderJitCache() {
 	// 64k should be enough.
-	AllocCodeSpace(1024 * 64);
+	AllocCodeSpace(1024 * 64 * 4);
 
 	// Add some random code to "help" MSVC's buggy disassembler :(
 #if defined(_WIN32)
@@ -914,21 +915,32 @@ static const ARMReg tempReg1 = R3;
 static const ARMReg tempReg2 = R4;
 static const ARMReg tempReg3 = R5;
 static const ARMReg scratchReg = R6;
+static const ARMReg scratchReg2 = R7;
+static const ARMReg scratchReg3 = R12;
 static const ARMReg srcReg = R0;
 static const ARMReg dstReg = R1;
 static const ARMReg counterReg = R2;
 static const ARMReg fpScratchReg = S4;
 static const ARMReg fpScratchReg2 = S5;
+static const ARMReg fpScratchReg3 = S6;
+
 static const ARMReg fpUscaleReg = S0;
 static const ARMReg fpVscaleReg = S1;
 static const ARMReg fpUoffsetReg = S2;
 static const ARMReg fpVoffsetReg = S3;
+// Everything above S6 is fair game for skinning
+static const ARMReg src[3] = {S8, S9, S10};  // skin source
+static const ARMReg acc[3] = {S11, S12, S13};  // skin accumulator
 
 static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
 	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
 	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 
+	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
+	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
+	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
+
 	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
 	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
@@ -946,6 +958,10 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
 	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
 
+	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
+	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
+	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
+
 	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
 	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
 	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
@@ -958,6 +974,10 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
 	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
 	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
+
+	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
+	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
+	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
 };
 
 JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
@@ -965,6 +985,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 	const u8 *start = this->GetCodePtr();
 
 	bool prescaleStep = false;
+	bool skinning = false;
+
 	// Look for prescaled texcoord steps
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
@@ -972,6 +994,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
 			prescaleStep = true;
 		}
+		if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
+			skinning = true;
+		}
 	}
 
 	SetCC(CC_AL);
@@ -996,10 +1023,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 		}
 	}
 
-	// NEON skinning register mapping
+	// TODO: NEON skinning register mapping
 	// The matrix will be built in Q12-Q15.
 	// The temporary matrix to be added to the built matrix will be in Q8-Q11.
 
+	if (skinning) {
+		// TODO: Preload scale factors
+	}
+
 	JumpTarget loopStart = GetCodePtr();
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
@@ -1079,6 +1110,91 @@ void VertexDecoderJitCache::Jit_WeightsFloat() {
 	}
 }
 
+void VertexDecoderJitCache::Jit_WeightsU8Skin() {
+	// No need to zero skinMatrix: the first lap stores with VMUL/VSTR,
+	// then subsequent laps accumulate with VLDR/VMLA/VSTR.
+	MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
+	for (int j = 0; j < dec_->nweights; j++) {
+		const float *bone = &gstate.boneMatrix[j * 12];
+		LDRB(tempReg1, srcReg, dec_->weightoff + j);
+		VMOV(fpScratchReg, tempReg1);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+		MOVI2F(fpScratchReg2, by128, scratchReg);
+		VMUL(fpScratchReg, fpScratchReg, fpScratchReg2);
+		MOVI2R(tempReg1, (u32)bone, scratchReg);
+		// Okay, we have the weight.
+		if (j == 0) {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VMUL(fpScratchReg2, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg2, tempReg2, i * 4);
+			}
+		} else {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VLDR(fpScratchReg3, tempReg2, i * 4);
+				VMLA(fpScratchReg3, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg3, tempReg2, i * 4);
+			}
+		}
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16Skin() {
+	// No need to zero skinMatrix: the first lap stores with VMUL/VSTR,
+	// then subsequent laps accumulate with VLDR/VMLA/VSTR.
+	MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
+	for (int j = 0; j < dec_->nweights; j++) {
+		const float *bone = &gstate.boneMatrix[j * 12];
+		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
+		VMOV(fpScratchReg, tempReg1);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+		MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg);
+		VMUL(fpScratchReg, fpScratchReg, fpScratchReg2);
+		MOVI2R(tempReg1, (u32)bone, scratchReg);
+		// Okay, we have the weight.
+		if (j == 0) {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VMUL(fpScratchReg2, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg2, tempReg2, i * 4);
+			}
+		} else {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VLDR(fpScratchReg3, tempReg2, i * 4);
+				VMLA(fpScratchReg3, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg3, tempReg2, i * 4);
+			}
+		}
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
+	// No need to zero skinMatrix: the first lap stores with VMUL/VSTR,
+	// then subsequent laps accumulate with VLDR/VMLA/VSTR.
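+	// After the last lap, skinMatrix holds the weighted sum of the bone matrices;
+	// Jit_WriteMatrixMul then applies it to each position and normal below.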
+	MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
+	for (int j = 0; j < dec_->nweights; j++) {
+		const float *bone = &gstate.boneMatrix[j * 12];
+		VLDR(fpScratchReg, srcReg, dec_->weightoff + j * 4);
+		MOVI2R(tempReg1, (u32)bone, scratchReg);
+		if (j == 0) {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VMUL(fpScratchReg2, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg2, tempReg2, i * 4);
+			}
+		} else {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VLDR(fpScratchReg3, tempReg2, i * 4);
+				VMLA(fpScratchReg3, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg3, tempReg2, i * 4);
+			}
+		}
+	}
+}
+
 // Fill last two bytes with zeroes to align to 4 bytes. LDRH does it for us, handy.
 void VertexDecoderJitCache::Jit_TcU8() {
 	LDRB(tempReg1, srcReg, dec_->tcoff);
@@ -1336,7 +1452,6 @@ void VertexDecoderJitCache::Jit_PosS16() {
 
 // Just copy 12 bytes.
 void VertexDecoderJitCache::Jit_PosFloat() {
-	// Might not be aligned to 4, so we can't use LDMIA.
 	LDR(tempReg1, srcReg, dec_->posoff);
 	LDR(tempReg2, srcReg, dec_->posoff + 4);
 	LDR(tempReg3, srcReg, dec_->posoff + 8);
@@ -1345,6 +1460,113 @@
 	STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
 }
 
+void VertexDecoderJitCache::Jit_NormalS8Skin() {
+	LDRSB(tempReg1, srcReg, dec_->nrmoff);
+	LDRSB(tempReg2, srcReg, dec_->nrmoff + 1);
+	LDRSB(tempReg3, srcReg, dec_->nrmoff + 2);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/128.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
+}
+
+void VertexDecoderJitCache::Jit_NormalS16Skin() {
+	LDRSH(tempReg1, srcReg, dec_->nrmoff);
+	LDRSH(tempReg2, srcReg, dec_->nrmoff + 2);
+	LDRSH(tempReg3, srcReg, dec_->nrmoff + 4);
+	VMOV(fpScratchReg, tempReg1);
+	VMOV(fpScratchReg2, tempReg2);
+	VMOV(fpScratchReg3, tempReg3);
+	MOVI2F(S15, 1.0f/32768.0f, scratchReg);
+	VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
+	VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
+	VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], fpScratchReg, S15);
+	VMUL(src[1], fpScratchReg2, S15);
+	VMUL(src[2], fpScratchReg3, S15);
+	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
+}
+
+void VertexDecoderJitCache::Jit_NormalFloatSkin() {
+	VLDR(src[0], srcReg, dec_->nrmoff);
+	VLDR(src[1], srcReg, dec_->nrmoff + 4);
+	VLDR(src[2], srcReg, dec_->nrmoff + 8);
+	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
+}
+
+// Multiply src[0..2] by skinMatrix: floats 0-8 are the three basis columns,
+// and for positions the translation in floats 9-11 is added on top.
+void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
+	MOVI2R(tempReg1, (u32)skinMatrix, scratchReg);
+	for (int i = 0; i < 3; i++) {
+		VLDR(fpScratchReg, tempReg1, 4 * i);
+		VMUL(acc[i], fpScratchReg, src[0]);
+	}
+	for (int i = 0; i < 3; i++) {
+		VLDR(fpScratchReg, tempReg1, 12 + 4 * i);
+		VMLA(acc[i], fpScratchReg, src[1]);
+	}
+	for (int i = 0; i < 3; i++) {
+		VLDR(fpScratchReg, tempReg1, 24 + 4 * i);
+		VMLA(acc[i], fpScratchReg, src[2]);
+	}
+	if (pos) {
+		for (int i = 0; i < 3; i++) {
+			VLDR(fpScratchReg, tempReg1, 36 + 4 * i);
+			VADD(acc[i], acc[i], fpScratchReg);
+		}
+	}
+	for (int i = 0; i < 3; i++) {
+		VSTR(acc[i], dstReg, outOff + i * 4);
+	}
+}
+
+void VertexDecoderJitCache::Jit_PosS8Skin() {
+	LDRSB(tempReg1, srcReg, dec_->posoff);
+	LDRSB(tempReg2, srcReg, dec_->posoff + 1);
+	LDRSB(tempReg3, srcReg, dec_->posoff + 2);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/128.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
+void VertexDecoderJitCache::Jit_PosS16Skin() {
+	LDRSH(tempReg1, srcReg, dec_->posoff);
+	LDRSH(tempReg2, srcReg, dec_->posoff + 2);
+	LDRSH(tempReg3, srcReg, dec_->posoff + 4);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/32768.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
+void VertexDecoderJitCache::Jit_PosFloatSkin() {
+	VLDR(src[0], srcReg, dec_->posoff);
+	VLDR(src[1], srcReg, dec_->posoff + 4);
+	VLDR(src[2], srcReg, dec_->posoff + 8);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
 #elif defined(_M_X64) || defined(_M_IX86)
 
 using namespace Gen;
diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h
index b6586756f6..2c4ce40a47 100644
--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@@ -242,5 +242,6 @@ public:
 
 private:
 	bool CompileStep(const VertexDecoder &dec, int i);
+	void Jit_WriteMatrixMul(int outOff, bool pos);
 	const VertexDecoder *dec_;
 };

From da380478f6e8ec14f0901229708557d184f9a256 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Wed, 13 Nov 2013 19:55:20 +0100
Subject: [PATCH 12/12] Enable software skinning by default

---
 Core/Config.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Core/Config.cpp b/Core/Config.cpp
index bc15c6c1f2..39d35cbec1 100644
--- a/Core/Config.cpp
+++ b/Core/Config.cpp
@@ -144,7 +144,7 @@ void Config::Load(const char *iniFileName, const char *controllerIniFilename) {
 	graphics->Get("RenderingMode", &iRenderingMode, renderingModeDefault);
 	graphics->Get("SoftwareRendering", &bSoftwareRendering, false);
 	graphics->Get("HardwareTransform", &bHardwareTransform, true);
-	graphics->Get("SoftwareSkinning", &bSoftwareSkinning, false);
+	graphics->Get("SoftwareSkinning", &bSoftwareSkinning, true);
 	graphics->Get("TextureFiltering", &iTexFiltering, 1);
 	// Auto on Windows, 1x elsewhere. Maybe change to 2x on large screens?
 #ifdef _WIN32