From bf5129152780dcde7a5e683adb5f593a820fbcef Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 4 Feb 2013 23:09:01 +0100 Subject: [PATCH] Don't translate bone weights to floats unnecessarily. Minor optimization to ApplyShader, pushing it down the profile from 2% to 0.5% in Wipeout. --- GPU/GLES/ShaderManager.cpp | 22 +++++++++++++++++---- GPU/GLES/ShaderManager.h | 9 +++------ GPU/GLES/TransformPipeline.cpp | 15 +++++++++------ GPU/GLES/VertexDecoder.cpp | 31 ++++++++++++++++++++++-------- GPU/GLES/VertexDecoder.h | 18 +++++++++++++++-- GPU/GLES/VertexShaderGenerator.cpp | 2 +- 6 files changed, 70 insertions(+), 27 deletions(-) diff --git a/GPU/GLES/ShaderManager.cpp b/GPU/GLES/ShaderManager.cpp index b000b7dc3b..71957ec335 100644 --- a/GPU/GLES/ShaderManager.cpp +++ b/GPU/GLES/ShaderManager.cpp @@ -315,6 +315,15 @@ void LinkedShader::updateUniforms() { dirtyUniforms = 0; } +ShaderManager::ShaderManager() : lastShader(NULL), globalDirty(0xFFFFFFFF), shaderSwitchDirty(0) { + codeBuffer_ = new char[16384]; +} + +ShaderManager::~ShaderManager() { + delete [] codeBuffer_; +} + + void ShaderManager::DirtyUniform(u32 what) { globalDirty |= what; } @@ -353,10 +362,9 @@ void ShaderManager::DirtyShader() LinkedShader *ShaderManager::ApplyShader(int prim) { if (globalDirty) { - // Deferred dirtying! Let's see if we can make this even more clever later. - for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) { - iter->second->dirtyUniforms |= globalDirty; - } + if (lastShader) + lastShader->dirtyUniforms |= globalDirty; + shaderSwitchDirty |= globalDirty; globalDirty = 0; } @@ -376,6 +384,12 @@ LinkedShader *ShaderManager::ApplyShader(int prim) lastShader->stop(); } + // Deferred dirtying! Let's see if we can make this even more clever later. + for (LinkedShaderCache::iterator iter = linkedShaderCache.begin(); iter != linkedShaderCache.end(); ++iter) { + iter->second->dirtyUniforms |= shaderSwitchDirty; + } + shaderSwitchDirty = 0; + lastVSID = VSID; lastFSID = FSID; diff --git a/GPU/GLES/ShaderManager.h b/GPU/GLES/ShaderManager.h index 90b34fdf47..c83ad3a66a 100644 --- a/GPU/GLES/ShaderManager.h +++ b/GPU/GLES/ShaderManager.h @@ -133,12 +133,8 @@ private: class ShaderManager { public: - ShaderManager() : lastShader(NULL), globalDirty(0xFFFFFFFF) { - codeBuffer_ = new char[16384]; - } - ~ShaderManager() { - delete [] codeBuffer_; - } + ShaderManager(); + ~ShaderManager(); void ClearCache(bool deleteThem); // TODO: deleteThem currently not respected LinkedShader *ApplyShader(int prim); @@ -160,6 +156,7 @@ private: LinkedShader *lastShader; u32 globalDirty; + u32 shaderSwitchDirty; char *codeBuffer_; typedef std::map FSCache; diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index ee2bc3a47a..e43a0d5d1a 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -298,12 +298,12 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[ } struct GlTypeInfo { - GLuint type; - int count; - GLboolean normalized; + u16 type; + u8 count; + u8 normalized; }; -const GlTypeInfo GLComp[] = { +static const GlTypeInfo GLComp[] = { {0}, // DEC_NONE, {GL_FLOAT, 1, GL_FALSE}, // DEC_FLOAT_1, {GL_FLOAT, 2, GL_FALSE}, // DEC_FLOAT_2, @@ -311,8 +311,10 @@ const GlTypeInfo GLComp[] = { {GL_FLOAT, 4, GL_FALSE}, // DEC_FLOAT_4, {GL_BYTE, 4, GL_TRUE}, // DEC_S8_3, {GL_SHORT, 4, GL_TRUE},// DEC_S16_3, - {GL_UNSIGNED_BYTE, 4, GL_TRUE},// DEC_U8_4, + {GL_UNSIGNED_BYTE, 1, GL_TRUE},// DEC_U8_1, + {GL_UNSIGNED_BYTE, 2, GL_TRUE},// DEC_U8_2, {GL_UNSIGNED_BYTE, 3, GL_TRUE},// DEC_U8_3, + {GL_UNSIGNED_BYTE, 4, GL_TRUE},// DEC_U8_4, }; static inline void VertexAttribSetup(int attrib, int fmt, int stride, u8 *ptr) { @@ -838,8 +840,9 @@ void TransformDrawEngine::ClearTrackedVertexArrays() { } void TransformDrawEngine::DecimateTrackedVertexArrays() { + int threshold = gpuStats.numFrames - VAI_KILL_AGE; for (auto iter = vai_.begin(); iter != vai_.end(); ) { - if (iter->second->lastFrame + VAI_KILL_AGE < gpuStats.numFrames) { + if (iter->second->lastFrame < threshold ) { delete iter->second; vai_.erase(iter++); } diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index a26b8b93ad..d615b6d4d0 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -69,6 +69,9 @@ int DecFmtSize(u8 fmt) { case DEC_FLOAT_4: return 16; case DEC_S8_3: return 4; case DEC_S16_3: return 8; + case DEC_U8_1: return 4; + case DEC_U8_2: return 4; + case DEC_U8_3: return 4; case DEC_U8_4: return 4; default: return 0; @@ -107,10 +110,10 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) { void VertexDecoder::Step_WeightsU8() const { - float *wt = (float *)(decoded_ + decFmt.w0off); + u8 *wt = (u8 *)(decoded_ + decFmt.w0off); const u8 *wdata = (const u8*)(ptr_); for (int j = 0; j < nweights; j++) - wt[j] = (float)wdata[j] / 128.0f; + wt[j] = wdata[j]; } void VertexDecoder::Step_WeightsU16() const @@ -118,14 +121,19 @@ void VertexDecoder::Step_WeightsU16() const float *wt = (float *)(decoded_ + decFmt.w0off); const u16 *wdata = (const u16*)(ptr_); for (int j = 0; j < nweights; j++) - wt[j] = (float)wdata[j] / 32768.0f; + wt[j] = (float)wdata[j] / 65535.0f; } +// Float weights should be uncommon, we can live with having to multiply these by 2.0 +// to avoid special checks in the vertex shader generator. +// (PSP uses 0.0-2.0 fixed point numbers for weights) void VertexDecoder::Step_WeightsFloat() const { float *wt = (float *)(decoded_ + decFmt.w0off); const float *wdata = (const float*)(ptr_); - memcpy(wt, wdata, nweights * sizeof(float)); + for (int i = 0; i < nweights; i++) { + wt[i] = wdata[i] * 0.5f; + } } void VertexDecoder::Step_TcU8() const @@ -544,14 +552,21 @@ void VertexDecoder::SetVertexType(u32 fmt) { steps_[numSteps_++] = wtstep[weighttype]; + int fmtBase = DEC_FLOAT_1; + int weightSize = 4; + if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_U8_1; + weightSize = 1; + } + if (nweights < 5) { decFmt.w0off = decOff; - decFmt.w0fmt = DEC_FLOAT_1 + nweights - 1; + decFmt.w0fmt = fmtBase + nweights - 1; } else { decFmt.w0off = decOff; - decFmt.w0fmt = DEC_FLOAT_4; - decFmt.w1off = decOff + 4 * 4; - decFmt.w1fmt = DEC_FLOAT_1 + nweights - 5; + decFmt.w0fmt = fmtBase + 3; + decFmt.w1off = decOff + 4 * weightSize; + decFmt.w1fmt = fmtBase + nweights - 5; } decOff += nweights * 4; } diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index b8c7da3c8d..04be647258 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -34,6 +34,9 @@ enum { DEC_FLOAT_4, DEC_S8_3, DEC_S16_3, + DEC_U8_1, + DEC_U8_2, + DEC_U8_3, DEC_U8_4, }; @@ -243,7 +246,7 @@ public: switch (decFmt_.c0fmt) { case DEC_U8_4: { - u8 *p = (u8 *)(data_ + decFmt_.c0off); + const u8 *p = (const u8 *)(data_ + decFmt_.c0off); for (int i = 0; i < 4; i++) color[i] = p[i] / 255.0f; } @@ -260,7 +263,7 @@ public: switch (decFmt_.c1fmt) { case DEC_U8_4: { - u8 *p = (u8 *)(data_ + decFmt_.c1off); + const u8 *p = (const u8 *)(data_ + decFmt_.c1off); for (int i = 0; i < 3; i++) color[i] = p[i] / 255.0f; } @@ -274,15 +277,22 @@ public: } void ReadWeights(float weights[8]) { + const u8 *p = (const u8 *)(data_ + decFmt_.w0off); switch (decFmt_.w0fmt) { case DEC_FLOAT_1: memcpy(weights, data_ + decFmt_.w0off, 4); break; case DEC_FLOAT_2: memcpy(weights, data_ + decFmt_.w0off, 8); break; case DEC_FLOAT_3: memcpy(weights, data_ + decFmt_.w0off, 12); break; case DEC_FLOAT_4: memcpy(weights, data_ + decFmt_.w0off, 16); break; + case DEC_U8_1: weights[0] = p[0] / 128.f; break; + case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i] = p[i] / 128.f; break; + case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i] = p[i] / 128.f; break; + case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i] = p[i] / 128.f; break; default: ERROR_LOG(G3D, "Reader: Unsupported W0 Format"); break; } + + p = (const u8 *)(data_ + decFmt_.w1off); switch (decFmt_.w1fmt) { case 0: // It's fine for there to be w0 weights but not w1. @@ -291,6 +301,10 @@ public: case DEC_FLOAT_2: memcpy(weights + 4, data_ + decFmt_.w1off, 8); break; case DEC_FLOAT_3: memcpy(weights + 4, data_ + decFmt_.w1off, 12); break; case DEC_FLOAT_4: memcpy(weights + 4, data_ + decFmt_.w1off, 16); break; + case DEC_U8_1: weights[4] = p[0] / 128.f; break; + case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i+4] = p[i] / 128.f; break; + case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i+4] = p[i] / 128.f; break; + case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i+4] = p[i] / 128.f; break; default: ERROR_LOG(G3D, "Reader: Unsupported W1 Format"); break; diff --git a/GPU/GLES/VertexShaderGenerator.cpp b/GPU/GLES/VertexShaderGenerator.cpp index 5e0e0b4218..6bcfa1bc09 100644 --- a/GPU/GLES/VertexShaderGenerator.cpp +++ b/GPU/GLES/VertexShaderGenerator.cpp @@ -275,7 +275,7 @@ void GenerateVertexShader(int prim, char *buffer) { WRITE(p, " worldnormal += %s * (u_bone%i * vec4(a_normal, 0.0)).xyz;\n", weightAttr, i); } // Finally, multiply by world matrix (yes, we have to). - WRITE(p, " worldpos = (u_world * vec4(worldpos, 1.0)).xyz;\n"); + WRITE(p, " worldpos = (u_world * vec4(worldpos * 2.0, 1.0)).xyz;\n"); if (hasNormal) WRITE(p, " worldnormal = (u_world * vec4(worldnormal, 0.0)).xyz;\n"); }