diff --git a/GPU/GLES/DisplayListInterpreter.cpp b/GPU/GLES/DisplayListInterpreter.cpp
index b3a68bd937..98cd0e059f 100644
--- a/GPU/GLES/DisplayListInterpreter.cpp
+++ b/GPU/GLES/DisplayListInterpreter.cpp
@@ -51,6 +51,7 @@ GLES_GPU::GLES_GPU(int renderWidth, int renderHeight)
 {
 	renderWidthFactor_ = (float)renderWidth / 480.0f;
 	renderHeightFactor_ = (float)renderHeight / 272.0f;
+	shaderManager_ = &shaderManager;
 
 	// Sanity check gstate
 	if ((int *)&gstate.transferstart - (int *)&gstate != 0xEA) {
@@ -299,8 +300,7 @@ void GLES_GPU::DrawBezier(int ucount, int vcount)
 		}
 	}
 
-	LinkedShader *linkedShader = shaderManager.ApplyShader();
-	TransformAndDrawPrim(Memory::GetPointer(gstate_c.vertexAddr), &indices[0], GE_PRIM_TRIANGLES, 3 * 3 * 6, linkedShader, customUV, GE_VTYPE_IDX_16BIT);
+	TransformAndDrawPrim(Memory::GetPointer(gstate_c.vertexAddr), &indices[0], GE_PRIM_TRIANGLES, 3 * 3 * 6, customUV, GE_VTYPE_IDX_16BIT);
 }
 
 
@@ -365,14 +365,13 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 			};
 			DEBUG_LOG(G3D, "DL DrawPrim type: %s count: %i vaddr= %08x, iaddr= %08x", type<7 ? types[type] : "INVALID", count, gstate_c.vertexAddr, gstate_c.indexAddr);
 
-			LinkedShader *linkedShader = shaderManager.ApplyShader();
 			// TODO: Split this so that we can collect sequences of primitives, can greatly speed things up
 			// on platforms where draw calls are expensive like mobile and D3D
 			void *verts = Memory::GetPointer(gstate_c.vertexAddr);
 			void *inds = 0;
 			if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE)
 				inds = Memory::GetPointer(gstate_c.indexAddr);
-			TransformAndDrawPrim(verts, inds, type, count, linkedShader, 0, -1);
+			TransformAndDrawPrim(verts, inds, type, count, 0, -1);
 		}
 		break;
 
@@ -1033,7 +1032,7 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 	case GE_CMD_PROJMATRIXDATA:
 		DEBUG_LOG(G3D,"DL PROJECTION matrix data # %f", getFloat24(data));
 		{
-			int num = gstate.projmtxnum & 0xF;
+			int num = gstate.projmtxnum & 0xF;	
 			gstate.projMatrix[num++] = getFloat24(data);
 			gstate.projmtxnum = (gstate.projmtxnum & 0xFF000000) | (num & 0xF);
 		}
@@ -1049,21 +1048,24 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 		DEBUG_LOG(G3D,"DL TGEN matrix data # %f", getFloat24(data));
 		{
 			int num = gstate.texmtxnum & 0xF;
-			gstate.tgenMatrix[num++] = getFloat24(data);
+			if (num < 12)
+				gstate.tgenMatrix[num++] = getFloat24(data);
 			gstate.texmtxnum = (gstate.texmtxnum & 0xFF000000) | (num & 0xF);
 		}
 		break;
 
 	case GE_CMD_BONEMATRIXNUMBER:
 		DEBUG_LOG(G3D,"DL BONE matrix #%i", data);
-		gstate.boneMatrixNumber &= 0xFF000007F;
+		gstate.boneMatrixNumber &= 0xFF00007F;
 		break;
 
 	case GE_CMD_BONEMATRIXDATA:
 		DEBUG_LOG(G3D,"DL BONE matrix data #%i %f", gstate.boneMatrixNumber & 0x7f, getFloat24(data));
 		{
 			int num = gstate.boneMatrixNumber & 0x7F;
-			gstate.boneMatrix[num++] = getFloat24(data);
+			if (num < 96) {
+				gstate.boneMatrix[num++] = getFloat24(data);
+			}
 			gstate.boneMatrixNumber = (gstate.boneMatrixNumber & 0xFF000000) | (num & 0x7F);
 		}
 		break;
diff --git a/GPU/GLES/DisplayListInterpreter.h b/GPU/GLES/DisplayListInterpreter.h
index 7bdf1af1d7..7eb4c8a46c 100644
--- a/GPU/GLES/DisplayListInterpreter.h
+++ b/GPU/GLES/DisplayListInterpreter.h
@@ -48,10 +48,13 @@ public:
 
 private:
 	// TransformPipeline.cpp
-	void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, LinkedShader *program, float *customUV, int forceIndexType);
+	void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType);
+	void UpdateViewportAndProjection();
 	void DrawBezier(int ucount, int vcount);
 	void DoBlockTransfer();
 	bool ProcessDLQueue();
+
+	ShaderManager *shaderManager_;
 	bool interruptsEnabled_;
 
 	u32 displayFramebufPtr_;
diff --git a/GPU/GLES/ShaderManager.cpp b/GPU/GLES/ShaderManager.cpp
index 01a0917bdf..5def745e84 100644
--- a/GPU/GLES/ShaderManager.cpp
+++ b/GPU/GLES/ShaderManager.cpp
@@ -111,8 +111,20 @@ void LinkedShader::use() {
 		else
 		{
-			glUniformMatrix4fv(u_proj, 1, GL_FALSE, gstate.projMatrix);
-		}
+			// Upload a copy of the projection matrix with the Y terms (5, 13) and/or X terms (0, 12)
+			// negated when the cached viewport extent on that axis (gstate_c.vpHeight / vpWidth) is negative.
+			float flippedMatrix[16];
+			memcpy(flippedMatrix, gstate.projMatrix, 16 * sizeof(float));
+			if (gstate_c.vpHeight < 0) {
+				flippedMatrix[5] = -flippedMatrix[5];
+				flippedMatrix[13] = -flippedMatrix[13];
+			}
+			if (gstate_c.vpWidth < 0) {
+				flippedMatrix[0] = -flippedMatrix[0];
+				flippedMatrix[12] = -flippedMatrix[12];
+			}
 
+			glUniformMatrix4fv(u_proj, 1, GL_FALSE, flippedMatrix);
+		}
 	}
 	if (u_texenv != -1 && dirtyUniforms & DIRTY_TEXENV) {
 		glUniform4f(u_texenv, 1.0, 1.0, 1.0, 1.0);	// TODO
diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp
index 96a799df1e..27c49edd5a 100644
--- a/GPU/GLES/TransformPipeline.cpp
+++ b/GPU/GLES/TransformPipeline.cpp
@@ -196,7 +196,7 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[
 // primitives correctly. Other primitives are possible to transform and light in hardware
 // using vertex shader, which will be way, way faster, especially on mobile. This has
 // not yet been implemented though.
-void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, LinkedShader *program, float *customUV, int forceIndexType)
+void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType)
 {
 	int indexLowerBound, indexUpperBound;
 	// First, decode the verts and apply morphing
@@ -217,7 +217,7 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 	}
 	gpuStats.numDrawCalls++;
 	gpuStats.numVertsTransformed += vertexCount;
-
+	
 	bool throughmode = (gstate.vertType & GE_VTYPE_THROUGH_MASK) != 0;
 	// Then, transform and draw in one big swoop (urgh!)
 	// need to move this to the shader.
@@ -312,54 +312,43 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 
 			// Perform lighting here if enabled. don't need to check through, it's checked above.
 			float dots[4] = {0,0,0,0};
-			if (program->a_color0 != -1)
-			{
-				float unlitColor[4];
-				for (int j = 0; j < 4; j++) {
-					unlitColor[j] = decoded[index].color[j] / 255.0f;
-				}
-				float litColor0[4];
-				float litColor1[4];
-				lighter.Light(litColor0, litColor1, unlitColor, out, norm, dots);
+			float unlitColor[4];
+			for (int j = 0; j < 4; j++) {
+				unlitColor[j] = decoded[index].color[j] / 255.0f;
+			}
+			float litColor0[4];
+			float litColor1[4];
+			lighter.Light(litColor0, litColor1, unlitColor, out, norm, dots);
 				
-				if (gstate.lightingEnable & 1)
-				{
-					// TODO: don't ignore gstate.lmode - we should send two colors in that case
-					if (gstate.lmode & 1) {
-						// Separate colors
-						for (int j = 0; j < 4; j++) {
-							c0[j] = litColor0[j];
-							c1[j] = litColor1[j];
-						}
-					} else {
-						// Summed color into c0
-						for (int j = 0; j < 4; j++) {
-							c0[j] = litColor0[j] + litColor1[j];
-							c1[j] = 0.0f;
-						}
+			if (gstate.lightingEnable & 1)
+			{
+				// TODO: don't ignore gstate.lmode - we should send two colors in that case
+				if (gstate.lmode & 1) {
+					// Separate colors
+					for (int j = 0; j < 4; j++) {
+						c0[j] = litColor0[j];
+						c1[j] = litColor1[j];
 					}
-				}
-				else
-				{
-					if(dec.hasColor()) {
-						for (int j = 0; j < 4; j++) {
-							c0[j] = unlitColor[j];
-							c1[j] = 0.0f;
-						}
-					} else {
-						c0[0] = (gstate.materialambient & 0xFF) / 255.f;
-						c0[1] = ((gstate.materialambient >> 8) & 0xFF) / 255.f;
-						c0[2] = ((gstate.materialambient >> 16) & 0xFF) / 255.f;
-						c0[3] = (gstate.materialalpha & 0xFF) / 255.f;
+				} else {
+					// Summed color into c0
+					for (int j = 0; j < 4; j++) {
+						c0[j] = litColor0[j] + litColor1[j];
+						c1[j] = 0.0f;
 					}
 				}
 			}
 			else
 			{
-				// no color in the fragment program???
-				for (int j = 0; j < 4; j++) {
-					c0[j] = decoded[index].color[j] / 255.0f;
-					c1[j] = 0.0f;
+				if(dec.hasColor()) {
+					for (int j = 0; j < 4; j++) {
+						c0[j] = unlitColor[j];
+						c1[j] = 0.0f;
+					}
+				} else {
+					c0[0] = (gstate.materialambient & 0xFF) / 255.f;
+					c0[1] = ((gstate.materialambient >> 8) & 0xFF) / 255.f;
+					c0[2] = ((gstate.materialambient >> 16) & 0xFF) / 255.f;
+					c0[3] = (gstate.materialalpha & 0xFF) / 255.f;
 				}
 			}
 
@@ -623,57 +612,8 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 
 	glstate.depthRange.set(gstate_c.zOff - gstate_c.zScale, gstate_c.zOff + gstate_c.zScale);
 
-
-	// Debugging code to mess around with the viewport
-#if 1
-	// We can probably use these to simply set scissors? Maybe we need to offset by regionX1/Y1
-	int regionX1 = gstate.region1 & 0x3FF;
-	int regionY1 = (gstate.region1 >> 10) & 0x3FF;
-	int regionX2 = (gstate.region2 & 0x3FF) + 1;
-	int regionY2 = ((gstate.region2 >> 10) & 0x3FF) + 1;
-
-	float offsetX = (float)(gstate.offsetx & 0xFFFF) / 16.0f;
-	float offsetY = (float)(gstate.offsety & 0xFFFF) / 16.0f;
-
-	if (throughmode) {
-		// No viewport transform here. Let's experiment with using region.
-		glViewport((0 + regionX1) * renderWidthFactor_, (0 - regionY1) * renderHeightFactor_, (regionX2 - regionX1) * renderWidthFactor_, (regionY2 - regionY1) * renderHeightFactor_);
-	} else {
-		// These we can turn into a glViewport call, offset by offsetX and offsetY. Math after.
-		float vpXa = getFloat24(gstate.viewportx1);
-		float vpXb = getFloat24(gstate.viewportx2);
-		float vpYa = getFloat24(gstate.viewporty1);
-		float vpYb = getFloat24(gstate.viewporty2);
-		float vpZa = getFloat24(gstate.viewportz1);  //  / 65536.0f   should map it to OpenGL's 0.0-1.0 Z range
-		float vpZb = getFloat24(gstate.viewportz2);  //  / 65536.0f
-
-		// The viewport transform appears to go like this: 
-		// Xscreen = -offsetX + vpXb + vpXa * Xview
-		// Yscreen = -offsetY + vpYb + vpYa * Yview
-		// Zscreen = vpZb + vpZa * Zview
-	
-		// This means that to get the analogue glViewport we must:
-		float vpX0 = vpXb - offsetX - vpXa;
-		float vpY0 = vpYb - offsetY + vpYa;   // Need to account for sign of Y
-		float vpWidth = vpXa * 2;
-		float vpHeight = -vpYa * 2;
-
-		// TODO: These two should feed into glDepthRange somehow.
-		float vpZ0 = (vpZb - vpZa) / 65536.0f;
-		float vpZ1 = (vpZa * 2) / 65536.0f;
-
-		vpX0 *= renderWidthFactor_;
-		vpY0 *= renderHeightFactor_;
-		vpWidth *= renderWidthFactor_;
-		vpHeight *= renderHeightFactor_;
-
-		// Flip vpY0 to match the OpenGL coordinate system.
-		vpY0 = renderHeight_ - (vpY0 + vpHeight);
-		glViewport(vpX0, vpY0, vpWidth, vpHeight); 
-		// Sadly, as glViewport takes integers, we will not be able to support sub pixel offsets this way. But meh.
-	}
-
-#endif
+	UpdateViewportAndProjection();
+	LinkedShader *program = shaderManager_->ApplyShader();
 
 	// TODO: Make a cache for glEnableVertexAttribArray and glVertexAttribPtr states, these spam the gDebugger log.
 	glEnableVertexAttribArray(program->a_position);
@@ -696,3 +636,62 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 	if (program->a_color0 != -1) glDisableVertexAttribArray(program->a_color0);
 	if (program->a_color1 != -1) glDisableVertexAttribArray(program->a_color1);
 }
+
+void GLES_GPU::UpdateViewportAndProjection()
+{
+	// Caches the signed viewport extents in gstate_c.vpWidth/vpHeight (non-through mode only); no GL call is reachable at present.
+	bool throughmode = (gstate.vertType & GE_VTYPE_THROUGH_MASK) != 0;
+	// We can probably use these to simply set scissors? Maybe we need to offset by regionX1/Y1
+	int regionX1 = gstate.region1 & 0x3FF;
+	int regionY1 = (gstate.region1 >> 10) & 0x3FF;
+	int regionX2 = (gstate.region2 & 0x3FF) + 1;
+	int regionY2 = ((gstate.region2 >> 10) & 0x3FF) + 1;
+
+	float offsetX = (float)(gstate.offsetx & 0xFFFF) / 16.0f;
+	float offsetY = (float)(gstate.offsety & 0xFFFF) / 16.0f;
+
+	if (throughmode) {
+		return;  // Disabled experiment: the region-based glViewport call below is unreachable.
+		// No viewport transform here. Let's experiment with using region.
+		glViewport((0 + regionX1) * renderWidthFactor_, (0 - regionY1) * renderHeightFactor_, (regionX2 - regionX1) * renderWidthFactor_, (regionY2 - regionY1) * renderHeightFactor_);
+	} else {
+		// These we can turn into a glViewport call, offset by offsetX and offsetY. Math after.
+		float vpXa = getFloat24(gstate.viewportx1);
+		float vpXb = getFloat24(gstate.viewportx2);
+		float vpYa = getFloat24(gstate.viewporty1);
+		float vpYb = getFloat24(gstate.viewporty2);
+		float vpZa = getFloat24(gstate.viewportz1);  //  / 65536.0f   should map it to OpenGL's 0.0-1.0 Z range
+		float vpZb = getFloat24(gstate.viewportz2);  //  / 65536.0f
+
+		// The viewport transform appears to go like this: 
+		// Xscreen = -offsetX + vpXb + vpXa * Xview
+		// Yscreen = -offsetY + vpYb + vpYa * Yview
+		// Zscreen = vpZb + vpZa * Zview
+
+		// This means that to get the analogue glViewport we must:
+		float vpX0 = vpXb - offsetX - vpXa;
+		float vpY0 = vpYb - offsetY + vpYa;   // Need to account for sign of Y
+		gstate_c.vpWidth = vpXa * 2;
+		gstate_c.vpHeight = -vpYa * 2;
+		// LinkedShader::use() reads these and negates projection-matrix terms when they are negative.
+		return;
+		// NOTE(review): everything below is unreachable, including the DIRTY_PROJMATRIX request - confirm that is intended.
+		float vpWidth = fabsf(gstate_c.vpWidth);
+		float vpHeight = fabsf(gstate_c.vpHeight);
+
+		// TODO: These two should feed into glDepthRange somehow.
+		float vpZ0 = (vpZb - vpZa) / 65536.0f;
+		float vpZ1 = (vpZa * 2) / 65536.0f;
+
+		vpX0 *= renderWidthFactor_;
+		vpY0 *= renderHeightFactor_;  // vertical quantities use the height factor
+		vpWidth *= renderWidthFactor_;
+		vpHeight *= renderHeightFactor_;
+
+		// Flip vpY0 to match the OpenGL coordinate system.
+		vpY0 = renderHeight_ - (vpY0 + vpHeight);
+		glViewport(vpX0, vpY0, vpWidth, vpHeight); 
+		// Sadly, as glViewport takes integers, we will not be able to support sub pixel offsets this way. But meh.
+		shaderManager_->DirtyUniform(DIRTY_PROJMATRIX);
+	}
+}
\ No newline at end of file
diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index 167aab7fc0..194a3cb4b7 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -283,8 +283,12 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 			}
 			switch (nrm)
 			{
-			case 0:
-				//no normals
+			case GE_VTYPE_NRM_8BIT >> 5:	// shifted like the sibling GE_VTYPE_NRM_FLOAT >> 5 label
+				{
+					const s8 *sv = (const s8*)(ptr + onesize_*n + nrmoff);	// three signed 8-bit components
+					for (int j = 0; j < 3; j++)
+						normal[j] += (sv[j]/127.0f) * multiplier;	// scale by 1/127, weight by multiplier
+				}
 				break;
 
 			case GE_VTYPE_NRM_FLOAT >> 5:
@@ -302,10 +306,6 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 						normal[j] += (sv[j]/32767.0f) * multiplier;
 				}
 				break;
-
-			default:
-				DEBUG_LOG(G3D,"Unknown normal format %i",nrm);
-				break;
 			}
 		}
 
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index cc464f377c..de4d7e5849 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -279,6 +279,9 @@ struct GPUStateCache
 
 	u32 curTextureWidth;
 	u32 curTextureHeight;
+
+	float vpWidth;
+	float vpHeight;
 };
 
 // TODO: Implement support for these.