From bf5129152780dcde7a5e683adb5f593a820fbcef Mon Sep 17 00:00:00 2001
From: Henrik Rydgard <hrydgard@gmail.com>
Date: Mon, 4 Feb 2013 23:09:01 +0100
Subject: [PATCH] Don't translate bone weights to floats unnecessarily. Minor
 optimization to ApplyShader, pushing it down the profile from 2% to 0.5% in
 Wipeout.

---
 GPU/GLES/ShaderManager.cpp         | 22 +++++++++++++++++----
 GPU/GLES/ShaderManager.h           |  9 +++------
 GPU/GLES/TransformPipeline.cpp     | 15 +++++++++------
 GPU/GLES/VertexDecoder.cpp         | 31 ++++++++++++++++++++++--------
 GPU/GLES/VertexDecoder.h           | 18 +++++++++++++++--
 GPU/GLES/VertexShaderGenerator.cpp |  2 +-
 6 files changed, 70 insertions(+), 27 deletions(-)

diff --git a/GPU/GLES/ShaderManager.cpp b/GPU/GLES/ShaderManager.cpp
index b000b7dc3b..71957ec335 100644
--- a/GPU/GLES/ShaderManager.cpp
+++ b/GPU/GLES/ShaderManager.cpp
@@ -315,6 +315,15 @@ void LinkedShader::updateUniforms() {
 	dirtyUniforms = 0;
 }
 
+ShaderManager::ShaderManager() : lastShader(NULL), globalDirty(0xFFFFFFFF), shaderSwitchDirty(0) {  // no shader bound yet; all 32 globalDirty bits start set
+	codeBuffer_ = new char[16384];  // 16384-byte buffer owned by the manager, freed in ~ShaderManager(); presumably scratch space for generated shader source - confirm
+}
+
+ShaderManager::~ShaderManager() {
+	delete [] codeBuffer_;  // array delete, matching the new char[] in the constructor
+}
+
+
 void ShaderManager::DirtyUniform(u32 what) {
 	globalDirty |= what;
 }
@@ -353,10 +362,9 @@ void ShaderManager::DirtyShader()
 LinkedShader *ShaderManager::ApplyShader(int prim)
 {
 	if (globalDirty) {
-		// Deferred dirtying! Let's see if we can make this even more clever later.
-		for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) {
-			iter->second->dirtyUniforms |= globalDirty;
-		}
+		if (lastShader)
+			lastShader->dirtyUniforms |= globalDirty;
+		shaderSwitchDirty |= globalDirty;
 		globalDirty = 0;
 	}
 
@@ -376,6 +384,12 @@ LinkedShader *ShaderManager::ApplyShader(int prim)
 		lastShader->stop();
 	}
 
+	// Deferred dirtying! Let's see if we can make this even more clever later.
+	for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) {
+		iter->second->dirtyUniforms |= shaderSwitchDirty;
+	}
+	shaderSwitchDirty = 0;
+
 	lastVSID = VSID;
 	lastFSID = FSID;
 
diff --git a/GPU/GLES/ShaderManager.h b/GPU/GLES/ShaderManager.h
index 90b34fdf47..c83ad3a66a 100644
--- a/GPU/GLES/ShaderManager.h
+++ b/GPU/GLES/ShaderManager.h
@@ -133,12 +133,8 @@ private:
 class ShaderManager
 {
 public:
-	ShaderManager() : lastShader(NULL), globalDirty(0xFFFFFFFF) {
-		codeBuffer_ = new char[16384];
-	}
-	~ShaderManager() {
-		delete [] codeBuffer_;
-	}
+	ShaderManager();
+	~ShaderManager();
 
 	void ClearCache(bool deleteThem);  // TODO: deleteThem currently not respected
 	LinkedShader *ApplyShader(int prim);
@@ -160,6 +156,7 @@ private:
 
 	LinkedShader *lastShader;
 	u32 globalDirty;
+	u32 shaderSwitchDirty;
 	char *codeBuffer_;
 
 	typedef std::map<FragmentShaderID, Shader *> FSCache;
diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp
index ee2bc3a47a..e43a0d5d1a 100644
--- a/GPU/GLES/TransformPipeline.cpp
+++ b/GPU/GLES/TransformPipeline.cpp
@@ -298,12 +298,12 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[
 }
 
 struct GlTypeInfo {
-	GLuint type;
-	int count;
-	GLboolean normalized;
+	u16 type;  // GL component type enum, e.g. GL_FLOAT or GL_UNSIGNED_BYTE
+	u8 count;  // second field of each GLComp entry; presumably the component count - confirm against VertexAttribSetup
+	u8 normalized;  // GL_TRUE or GL_FALSE
 };
 
-const GlTypeInfo GLComp[] = {
+static const GlTypeInfo GLComp[] = {
 	{0}, // 	DEC_NONE,
 	{GL_FLOAT, 1, GL_FALSE}, // 	DEC_FLOAT_1,
 	{GL_FLOAT, 2, GL_FALSE}, // 	DEC_FLOAT_2,
@@ -311,8 +311,10 @@ const GlTypeInfo GLComp[] = {
 	{GL_FLOAT, 4, GL_FALSE}, // 	DEC_FLOAT_4,
 	{GL_BYTE, 4, GL_TRUE}, // 	DEC_S8_3,
 	{GL_SHORT, 4, GL_TRUE},// 	DEC_S16_3,
-	{GL_UNSIGNED_BYTE, 4, GL_TRUE},// 	DEC_U8_4,
+	{GL_UNSIGNED_BYTE, 1, GL_TRUE},// 	DEC_U8_1,
+	{GL_UNSIGNED_BYTE, 2, GL_TRUE},// 	DEC_U8_2,
 	{GL_UNSIGNED_BYTE, 3, GL_TRUE},// 	DEC_U8_3,
+	{GL_UNSIGNED_BYTE, 4, GL_TRUE},// 	DEC_U8_4,
 };
 
 static inline void VertexAttribSetup(int attrib, int fmt, int stride, u8 *ptr) {
@@ -838,8 +840,9 @@ void TransformDrawEngine::ClearTrackedVertexArrays() {
 }
 
 void TransformDrawEngine::DecimateTrackedVertexArrays() {
+	int threshold = gpuStats.numFrames - VAI_KILL_AGE;
 	for (auto iter = vai_.begin(); iter != vai_.end(); ) {
-		if (iter->second->lastFrame + VAI_KILL_AGE < gpuStats.numFrames) {
+		if (iter->second->lastFrame < threshold ) {
 			delete iter->second;
 			vai_.erase(iter++);
 		}
diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index a26b8b93ad..d615b6d4d0 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -69,6 +69,9 @@ int DecFmtSize(u8 fmt) {
 	case DEC_FLOAT_4: return 16;
 	case DEC_S8_3: return 4;
 	case DEC_S16_3: return 8;
+	case DEC_U8_1: return 4;
+	case DEC_U8_2: return 4;
+	case DEC_U8_3: return 4;
 	case DEC_U8_4: return 4;
 	default:
 		return 0;
@@ -107,10 +110,10 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
 
 void VertexDecoder::Step_WeightsU8() const
 {
-	float *wt = (float *)(decoded_ + decFmt.w0off);
+	u8 *wt = (u8 *)(decoded_ + decFmt.w0off);  // destination is now raw bytes at the w0 offset, not floats
 	const u8 *wdata = (const u8*)(ptr_);
 	for (int j = 0; j < nweights; j++)
-		wt[j] = (float)wdata[j] / 128.0f;
+		wt[j] = wdata[j];  // copied unchanged; ReadWeights divides DEC_U8_* weights by 128 when floats are needed
 }
 
 void VertexDecoder::Step_WeightsU16() const
@@ -118,14 +121,19 @@ void VertexDecoder::Step_WeightsU16() const
 	float *wt = (float *)(decoded_  + decFmt.w0off);
 	const u16 *wdata = (const u16*)(ptr_);
 	for (int j = 0; j < nweights; j++)
-		wt[j] = (float)wdata[j] / 32768.0f;
+		wt[j] = (float)wdata[j] / 65535.0f;
 }
 
+// Float weights should be uncommon, so we can live with halving them here: the vertex shader
+// multiplies the skinned position by 2.0, which avoids special checks in the shader generator.
+// (PSP uses 0.0-2.0 fixed point numbers for weights)
 void VertexDecoder::Step_WeightsFloat() const
 {
 	float *wt = (float *)(decoded_ + decFmt.w0off);
 	const float *wdata = (const float*)(ptr_);
-	memcpy(wt, wdata, nweights * sizeof(float));
+	for (int i = 0; i < nweights; i++) {
+		wt[i] = wdata[i] * 0.5f;  // pre-halved to cancel the 2.0 factor applied in the vertex shader
+	}
 }
 
 void VertexDecoder::Step_TcU8() const
@@ -544,14 +552,21 @@ void VertexDecoder::SetVertexType(u32 fmt) {
 
 		steps_[numSteps_++] = wtstep[weighttype];
 
+		int fmtBase = DEC_FLOAT_1;
+		int weightSize = 4;
+		if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) {
+			fmtBase = DEC_U8_1;
+			weightSize = 1;
+		}
+
 		if (nweights < 5) {
 			decFmt.w0off = decOff;
-			decFmt.w0fmt = DEC_FLOAT_1 + nweights - 1;
+			decFmt.w0fmt = fmtBase + nweights - 1;
 		} else {
 			decFmt.w0off = decOff;
-			decFmt.w0fmt = DEC_FLOAT_4;
-			decFmt.w1off = decOff + 4 * 4;
-			decFmt.w1fmt = DEC_FLOAT_1 + nweights - 5;
+			decFmt.w0fmt = fmtBase + 3;
+			decFmt.w1off = decOff + 4 * weightSize;
+			decFmt.w1fmt = fmtBase + nweights - 5;
 		}
 		decOff += nweights * 4;
 	}
diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h
index b8c7da3c8d..04be647258 100644
--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@@ -34,6 +34,9 @@ enum {
 	DEC_FLOAT_4,
 	DEC_S8_3,
 	DEC_S16_3,
+	DEC_U8_1,
+	DEC_U8_2,
+	DEC_U8_3,
 	DEC_U8_4,
 };
 
@@ -243,7 +246,7 @@ public:
 		switch (decFmt_.c0fmt) {
 		case DEC_U8_4:
 			{
-				u8 *p = (u8 *)(data_ + decFmt_.c0off);
+				const u8 *p = (const u8 *)(data_ + decFmt_.c0off);
 				for (int i = 0; i < 4; i++)
 					color[i] = p[i] / 255.0f;
 			}
@@ -260,7 +263,7 @@ public:
 		switch (decFmt_.c1fmt) {
 		case DEC_U8_4:
 			{
-				u8 *p = (u8 *)(data_ + decFmt_.c1off);
+				const u8 *p = (const u8 *)(data_ + decFmt_.c1off);
 				for (int i = 0; i < 3; i++)
 					color[i] = p[i] / 255.0f;
 			}
@@ -274,15 +277,22 @@ public:
 	}
 
 	void ReadWeights(float weights[8]) {
+		const u8 *p = (const u8 *)(data_ + decFmt_.w0off);
 		switch (decFmt_.w0fmt) {
 		case DEC_FLOAT_1: memcpy(weights, data_ + decFmt_.w0off, 4); break;
 		case DEC_FLOAT_2: memcpy(weights, data_ + decFmt_.w0off, 8); break;
 		case DEC_FLOAT_3: memcpy(weights, data_ + decFmt_.w0off, 12); break;
 		case DEC_FLOAT_4: memcpy(weights, data_ + decFmt_.w0off, 16); break;
+		case DEC_U8_1: weights[0] = p[0] / 128.f; break;
+		case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i] = p[i] / 128.f; break;
+		case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i] = p[i] / 128.f; break;
+		case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i] = p[i] / 128.f; break;
 		default:
 			ERROR_LOG(G3D, "Reader: Unsupported W0 Format");
 			break;
 		}
+
+		p = (const u8 *)(data_ + decFmt_.w1off);
 		switch (decFmt_.w1fmt) {
 		case 0:
 			// It's fine for there to be w0 weights but not w1.
@@ -291,6 +301,10 @@ public:
 		case DEC_FLOAT_2: memcpy(weights + 4, data_ + decFmt_.w1off, 8); break;
 		case DEC_FLOAT_3: memcpy(weights + 4, data_ + decFmt_.w1off, 12); break;
 		case DEC_FLOAT_4: memcpy(weights + 4, data_ + decFmt_.w1off, 16); break;
+		case DEC_U8_1: weights[4] = p[0] / 128.f; break;
+		case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i+4] = p[i] / 128.f; break;
+		case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i+4] = p[i] / 128.f; break;
+		case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i+4] = p[i] / 128.f; break;
 		default:
 			ERROR_LOG(G3D, "Reader: Unsupported W1 Format");
 			break;
diff --git a/GPU/GLES/VertexShaderGenerator.cpp b/GPU/GLES/VertexShaderGenerator.cpp
index 5e0e0b4218..6bcfa1bc09 100644
--- a/GPU/GLES/VertexShaderGenerator.cpp
+++ b/GPU/GLES/VertexShaderGenerator.cpp
@@ -275,7 +275,7 @@ void GenerateVertexShader(int prim, char *buffer) {
 					WRITE(p, "  worldnormal += %s * (u_bone%i * vec4(a_normal, 0.0)).xyz;\n", weightAttr, i);
 			}
 			// Finally, multiply by world matrix (yes, we have to).
-			WRITE(p, "  worldpos = (u_world * vec4(worldpos, 1.0)).xyz;\n");
+			WRITE(p, "  worldpos = (u_world * vec4(worldpos * 2.0, 1.0)).xyz;\n");
 			if (hasNormal)
 				WRITE(p, "  worldnormal = (u_world * vec4(worldnormal, 0.0)).xyz;\n");
 		}