From 3e6c2f0c7868c1dd496069cb2d04acd217a7fd78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Wed, 13 Nov 2013 17:10:44 +0100 Subject: [PATCH 01/12] Update native --- native | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native b/native index 483e42f64e..cf895f95d7 160000 --- a/native +++ b/native @@ -1 +1 @@ -Subproject commit 483e42f64e666ee47390b47e3405ad731d011f93 +Subproject commit cf895f95d7ae75d4535cf252687fd4f9c4f1663b From 7e67476b00da917c2bbf0a5d2433df5cc72487d7 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 13:18:52 +0100 Subject: [PATCH 02/12] Simple unoptimized software skinning. Does not take advantage of the possible reduction in state changes yet. --- Core/Config.cpp | 2 + Core/Config.h | 2 + GPU/GLES/GLES_GPU.cpp | 32 ++++- GPU/GLES/TransformPipeline.cpp | 8 +- GPU/GLES/VertexDecoder.cpp | 236 ++++++++++++++++++++++++++------- GPU/GLES/VertexDecoder.h | 12 ++ GPU/GPUState.cpp | 15 ++- GPU/GPUState.h | 4 +- UI/GameSettingsScreen.cpp | 5 +- 9 files changed, 257 insertions(+), 59 deletions(-) diff --git a/Core/Config.cpp b/Core/Config.cpp index 8e11eab12a..bc15c6c1f2 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -144,6 +144,7 @@ void Config::Load(const char *iniFileName, const char *controllerIniFilename) { graphics->Get("RenderingMode", &iRenderingMode, renderingModeDefault); graphics->Get("SoftwareRendering", &bSoftwareRendering, false); graphics->Get("HardwareTransform", &bHardwareTransform, true); + graphics->Get("SoftwareSkinning", &bSoftwareSkinning, false); graphics->Get("TextureFiltering", &iTexFiltering, 1); // Auto on Windows, 1x elsewhere. Maybe change to 2x on large screens? #ifdef _WIN32 @@ -401,6 +402,7 @@ void Config::Save() { graphics->Set("RenderingMode", iRenderingMode); graphics->Set("SoftwareRendering", bSoftwareRendering); graphics->Set("HardwareTransform", bHardwareTransform); + graphics->Set("SoftwareSkinning", bSoftwareSkinning); graphics->Set("TextureFiltering", iTexFiltering); graphics->Set("InternalResolution", iInternalResolution); graphics->Set("FrameSkip", iFrameSkip); diff --git a/Core/Config.h b/Core/Config.h index b5a4073b3f..2922d4c005 100644 --- a/Core/Config.h +++ b/Core/Config.h @@ -64,6 +64,8 @@ public: // GFX bool bSoftwareRendering; bool bHardwareTransform; // only used in the GLES backend + bool bSoftwareSkinning; // may speed up some games + int iRenderingMode; // 0 = non-buffered rendering 1 = buffered rendering 2 = Read Framebuffer to memory (CPU) 3 = Read Framebuffer to memory (GPU) int iTexFiltering; // 1 = off , 2 = nearest , 3 = linear , 4 = linear(CG) #ifdef BLACKBERRY diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index 66a5223254..19343dc1db 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -430,6 +430,12 @@ GLES_GPU::GLES_GPU() commandFlags_[GE_CMD_TEXOFFSETV] &= ~FLAG_FLUSHBEFOREONCHANGE; } + // TODO: Can't turn this optimization on until we don't decode everything in one go + // but instead decode for every draw call when sw skinning + if (g_Config.bSoftwareSkinning) { + // commandFlags_[GE_CMD_VERTEXTYPE] &= ~FLAG_FLUSHBEFOREONCHANGE; + } + BuildReportingInfo(); } @@ -868,8 +874,21 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { break; case GE_CMD_VERTEXTYPE: - if (diff) - shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + if (diff) { + if (!g_Config.bSoftwareSkinning) { + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + } else if (false) { + // TODO: Can't turn this optimization on until we don't decode 
everything in one go + // but instead decode for every draw call when sw skinning + if (diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) { + // Restore and flush + gstate.vertType ^= diff; + Flush(); + gstate.vertType ^= diff; + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + } + } + } break; case GE_CMD_REGION1: @@ -1375,9 +1394,14 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { int num = gstate.boneMatrixNumber & 0x7F; float newVal = getFloat24(data); if (num < 96 && newVal != gstate.boneMatrix[num]) { - Flush(); + // Bone matrices should NOT flush when software skinning is enabled! + // TODO: Also check for morph... + // TODO: Can't turn this optimizatoin on until we decode per drawcall when sw skinning. + if (true || !g_Config.bSoftwareSkinning) { + Flush(); + shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); + } gstate.boneMatrix[num] = newVal; - shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); } num++; gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F); diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index b6de946821..d1f70a69f7 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -524,8 +524,14 @@ void TransformDrawEngine::DoFlush() { int vertexCount = 0; int maxIndex = 0; bool useElements = true; + // Cannot cache vertex data with morph enabled. - if (g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK)) { + bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK); + // Also avoid caching when software skinning. + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) + useCache = false; + + if (useCache) { u32 id = ComputeFastDCID(); auto iter = vai_.find(id); VertexArrayInfo *vai; diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index abe37a98ac..6e848708a5 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -15,11 +15,10 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. -#include "math/lin/matrix4x4.h" - #include "Core/Config.h" #include "Core/MemMap.h" #include "GPU/ge_constants.h" +#include "GPU/Math3D.h" #include "VertexDecoder.h" #include "VertexShaderGenerator.h" @@ -36,6 +35,10 @@ static const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4}; static const u8 possize[4] = {0,3,6,12}, posalign[4] = {0,1,2,4}; static const u8 wtsize[4] = {0,1,2,4}, wtalign[4] = {0,1,2,4}; +// When software skinning. This should be stored in registers instead of memory +// when jitting. 
+float skinMatrix[12]; + inline int align(int n, int align) { return (n + (align - 1)) & ~(align - 1); } @@ -112,6 +115,57 @@ void VertexDecoder::Step_WeightsFloat() const wt[j++] = 0.0f; } +void VertexDecoder::Step_WeightsU8Skin() const +{ + memset(skinMatrix, 0, sizeof(skinMatrix)); + u8 *wt = (u8 *)(decoded_ + decFmt.w0off); + const u8 *wdata = (const u8*)(ptr_); + for (int j = 0; j < nweights; j++) { + const float *bone = &gstate.boneMatrix[j * 12]; + if (wdata[j] != 0) { + float weight = wdata[j] / 128.0f; + for (int i = 0; i < 12; i++) { + skinMatrix[i] += weight * bone[i]; + } + } + } +} + +void VertexDecoder::Step_WeightsU16Skin() const +{ + memset(skinMatrix, 0, sizeof(skinMatrix)); + u16 *wt = (u16 *)(decoded_ + decFmt.w0off); + const u16 *wdata = (const u16*)(ptr_); + for (int j = 0; j < nweights; j++) { + const float *bone = &gstate.boneMatrix[j * 12]; + if (wdata[j] != 0) { + float weight = wdata[j] / 32768.0f; + for (int i = 0; i < 12; i++) { + skinMatrix[i] += weight * bone[i]; + } + } + } +} + +// Float weights should be uncommon, we can live with having to multiply these by 2.0 +// to avoid special checks in the vertex shader generator. +// (PSP uses 0.0-2.0 fixed point numbers for weights) +void VertexDecoder::Step_WeightsFloatSkin() const +{ + memset(skinMatrix, 0, sizeof(skinMatrix)); + float *wt = (float *)(decoded_ + decFmt.w0off); + const float *wdata = (const float*)(ptr_); + for (int j = 0; j < nweights; j++) { + const float *bone = &gstate.boneMatrix[j * 12]; + float weight = wdata[j]; + if (weight > 0.0) { + for (int i = 0; i < 12; i++) { + skinMatrix[i] += weight * bone[i]; + } + } + } +} + void VertexDecoder::Step_TcU8() const { // u32 to write two bytes of zeroes for free. @@ -318,6 +372,29 @@ void VertexDecoder::Step_NormalFloat() const normal[j] = fv[j]; } +void VertexDecoder::Step_NormalS8Skin() const +{ + float *normal = (float *)(decoded_ + decFmt.nrmoff); + const s8 *sv = (const s8*)(ptr_ + nrmoff); + const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f }; + Norm3ByMatrix43(normal, fn, skinMatrix); +} + +void VertexDecoder::Step_NormalS16Skin() const +{ + float *normal = (float *)(decoded_ + decFmt.nrmoff); + const s16 *sv = (const s16*)(ptr_ + nrmoff); + const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f }; + Norm3ByMatrix43(normal, fn, skinMatrix); +} + +void VertexDecoder::Step_NormalFloatSkin() const +{ + float *normal = (float *)(decoded_ + decFmt.nrmoff); + const float *fn = (const float *)(ptr_ + nrmoff); + Norm3ByMatrix43(normal, fn, skinMatrix); +} + void VertexDecoder::Step_NormalS8Morph() const { float *normal = (float *)(decoded_ + decFmt.nrmoff); @@ -382,6 +459,29 @@ void VertexDecoder::Step_PosFloat() const memcpy(v, fv, 12); } +void VertexDecoder::Step_PosS8Skin() const +{ + float *pos = (float *)(decoded_ + decFmt.posoff); + const s8 *sv = (const s8*)(ptr_ + posoff); + const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f }; + Vec3ByMatrix43(pos, fn, skinMatrix); +} + +void VertexDecoder::Step_PosS16Skin() const +{ + float *pos = (float *)(decoded_ + decFmt.posoff); + const s16 *sv = (const s16*)(ptr_ + posoff); + const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f }; + Vec3ByMatrix43(pos, fn, skinMatrix); +} + +void VertexDecoder::Step_PosFloatSkin() const +{ + float *pos = (float *)(decoded_ + decFmt.posoff); + const float *fn = (const float *)(ptr_ + posoff); + Vec3ByMatrix43(pos, fn, skinMatrix); +} + void VertexDecoder::Step_PosS8Through() const { float *v 
= (float *)(decoded_ + decFmt.posoff); @@ -449,6 +549,13 @@ static const StepFunction wtstep[4] = { &VertexDecoder::Step_WeightsFloat, }; +static const StepFunction wtstep_skin[4] = { + 0, + &VertexDecoder::Step_WeightsU8Skin, + &VertexDecoder::Step_WeightsU16Skin, + &VertexDecoder::Step_WeightsFloatSkin, +}; + static const StepFunction tcstep[4] = { 0, &VertexDecoder::Step_TcU8, @@ -510,6 +617,13 @@ static const StepFunction nrmstep[4] = { &VertexDecoder::Step_NormalFloat, }; +static const StepFunction nrmstep_skin[4] = { + 0, + &VertexDecoder::Step_NormalS8Skin, + &VertexDecoder::Step_NormalS16Skin, + &VertexDecoder::Step_NormalFloatSkin, +}; + static const StepFunction nrmstep_morph[4] = { 0, &VertexDecoder::Step_NormalS8Morph, @@ -524,6 +638,13 @@ static const StepFunction posstep[4] = { &VertexDecoder::Step_PosFloat, }; +static const StepFunction posstep_skin[4] = { + 0, + &VertexDecoder::Step_PosS8Skin, + &VertexDecoder::Step_PosS16Skin, + &VertexDecoder::Step_PosFloatSkin, +}; + static const StepFunction posstep_morph[4] = { 0, &VertexDecoder::Step_PosS8Morph, @@ -564,6 +685,8 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { DEBUG_LOG(G3D,"VTYPE: THRU=%i TC=%i COL=%i POS=%i NRM=%i WT=%i NW=%i IDX=%i MC=%i", (int)throughmode, tc,col,pos,nrm,weighttype,nweights,idx,morphcount); } + bool skinInDecode = weighttype != 0 && g_Config.bSoftwareSkinning && morphcount == 1; + if (weighttype) { // && nweights? weightoff = size; //size = align(size, wtalign[weighttype]); unnecessary @@ -571,30 +694,35 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { if (wtalign[weighttype] > biggest) biggest = wtalign[weighttype]; - steps_[numSteps_++] = wtstep[weighttype]; - - int fmtBase = DEC_FLOAT_1; - if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_U8_1; - } else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_U16_1; - } else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_FLOAT_1; - } - - int numWeights = TranslateNumBones(nweights); - - if (numWeights <= 4) { - decFmt.w0off = decOff; - decFmt.w0fmt = fmtBase + numWeights - 1; - decOff += DecFmtSize(decFmt.w0fmt); + if (skinInDecode) { + steps_[numSteps_++] = wtstep_skin[weighttype]; + // No visible output } else { - decFmt.w0off = decOff; - decFmt.w0fmt = fmtBase + 3; - decOff += DecFmtSize(decFmt.w0fmt); - decFmt.w1off = decOff; - decFmt.w1fmt = fmtBase + numWeights - 5; - decOff += DecFmtSize(decFmt.w1fmt); + steps_[numSteps_++] = wtstep[weighttype]; + + int fmtBase = DEC_FLOAT_1; + if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_U8_1; + } else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_U16_1; + } else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_FLOAT_1; + } + + int numWeights = TranslateNumBones(nweights); + + if (numWeights <= 4) { + decFmt.w0off = decOff; + decFmt.w0fmt = fmtBase + numWeights - 1; + decOff += DecFmtSize(decFmt.w0fmt); + } else { + decFmt.w0off = decOff; + decFmt.w0fmt = fmtBase + 3; + decOff += DecFmtSize(decFmt.w0fmt); + decFmt.w1off = decOff; + decFmt.w1fmt = fmtBase + numWeights - 5; + decOff += DecFmtSize(decFmt.w1fmt); + } } } @@ -656,26 +784,29 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { if (nrmalign[nrm] > biggest) biggest = nrmalign[nrm]; - steps_[numSteps_++] = morphcount == 1 ? 
nrmstep[nrm] : nrmstep_morph[nrm]; - - if (morphcount == 1) { - // The normal formats match the gl formats perfectly, let's use 'em. - switch (nrm) { - case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S8_3; break; - case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break; - case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break; - } - } else { + if (skinInDecode) { + steps_[numSteps_++] = nrmstep_skin[nrm]; + // After skinning, we always have three floats. decFmt.nrmfmt = DEC_FLOAT_3; - } + } else { + steps_[numSteps_++] = morphcount == 1 ? nrmstep[nrm] : nrmstep_morph[nrm]; - // Actually, temporarily let's not. + if (morphcount == 1) { + // The normal formats match the gl formats perfectly, let's use 'em. + switch (nrm) { + case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S8_3; break; + case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break; + case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break; + } + } else { + decFmt.nrmfmt = DEC_FLOAT_3; + } + } decFmt.nrmoff = decOff; decOff += DecFmtSize(decFmt.nrmfmt); } - if (pos) // there's always a position - { + if (pos) { // there's always a position size = align(size, posalign[pos]); posoff = size; size += possize[pos]; @@ -686,18 +817,23 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) { steps_[numSteps_++] = posstep_through[pos]; decFmt.posfmt = DEC_FLOAT_3; } else { - steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos]; - - if (morphcount == 1) { - // The non-through-mode position formats match the gl formats perfectly, let's use 'em. - switch (pos) { - case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break; - case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break; - case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break; - } - } else { - // Actually, temporarily let's not. + if (skinInDecode) { + steps_[numSteps_++] = posstep_skin[pos]; decFmt.posfmt = DEC_FLOAT_3; + } else { + steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos]; + + if (morphcount == 1) { + // The non-through-mode position formats match the gl formats perfectly, let's use 'em. + switch (pos) { + case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break; + case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break; + case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break; + } + } else { + // Actually, temporarily let's not. 
+ decFmt.posfmt = DEC_FLOAT_3; + } } } decFmt.posoff = decOff; diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index dfa248140c..1ba24af027 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -64,6 +64,10 @@ public: void Step_WeightsU16() const; void Step_WeightsFloat() const; + void Step_WeightsU8Skin() const; + void Step_WeightsU16Skin() const; + void Step_WeightsFloatSkin() const; + void Step_TcU8() const; void Step_TcU16() const; void Step_TcFloat() const; @@ -91,6 +95,10 @@ public: void Step_NormalS16() const; void Step_NormalFloat() const; + void Step_NormalS8Skin() const; + void Step_NormalS16Skin() const; + void Step_NormalFloatSkin() const; + void Step_NormalS8Morph() const; void Step_NormalS16Morph() const; void Step_NormalFloatMorph() const; @@ -99,6 +107,10 @@ public: void Step_PosS16() const; void Step_PosFloat() const; + void Step_PosS8Skin() const; + void Step_PosS16Skin() const; + void Step_PosFloatSkin() const; + void Step_PosS8Morph() const; void Step_PosS16Morph() const; void Step_PosFloatMorph() const; diff --git a/GPU/GPUState.cpp b/GPU/GPUState.cpp index fad43058c7..1a089db3ab 100644 --- a/GPU/GPUState.cpp +++ b/GPU/GPUState.cpp @@ -28,10 +28,14 @@ #include "GPU/Directx9/GPU_DX9.h" #endif #include "Core/CoreParameter.h" +#include "Core/Config.h" #include "Core/System.h" -GPUgstate gstate; -GPUStateCache gstate_c; +// This must be aligned so that the matrices within are aligned. +GPUgstate MEMORY_ALIGNED16(gstate); +// Let's align this one too for good measure. +GPUStateCache MEMORY_ALIGNED16(gstate_c); + GPUInterface *gpu; GPUDebugInterface *gpuDebug; GPUStatistics gpuStats; @@ -200,3 +204,10 @@ void GPUgstate::Restore(u32_le *ptr) { memcpy(projMatrix, matrices, sizeof(projMatrix)); matrices += sizeof(projMatrix); memcpy(tgenMatrix, matrices, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix); } + +bool vertTypeIsSkinningEnabled(u32 vertType) { + if (g_Config.bSoftwareSkinning && ((vertType & GE_VTYPE_MORPHCOUNT_MASK) == 0)) + return false; + else + return ((vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE); +} diff --git a/GPU/GPUState.h b/GPU/GPUState.h index 5e00234693..f1c8aca408 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -18,6 +18,7 @@ #pragma once #include + #include "../Globals.h" #include "ge_constants.h" #include "Common/Common.h" @@ -406,7 +407,8 @@ enum SkipDrawReasonFlags { SKIPDRAW_BAD_FB_TEXTURE = 4, }; -inline bool vertTypeIsSkinningEnabled(u32 vertType) { return ((vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE); } +bool vertTypeIsSkinningEnabled(u32 vertType); + inline int vertTypeGetNumBoneWeights(u32 vertType) { return 1 + ((vertType & GE_VTYPE_WEIGHTCOUNT_MASK) >> GE_VTYPE_WEIGHTCOUNT_SHIFT); } inline int vertTypeGetWeightMask(u32 vertType) { return vertType & GE_VTYPE_WEIGHT_MASK; } inline int vertTypeGetTexCoordMask(u32 vertType) { return vertType & GE_VTYPE_TC_MASK; } diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp index be8044c839..22ce9bf5b1 100644 --- a/UI/GameSettingsScreen.cpp +++ b/UI/GameSettingsScreen.cpp @@ -143,10 +143,13 @@ void GameSettingsScreen::CreateViews() { graphicsSettings->Add(new CheckBox(&g_Config.bVSync, gs->T("VSync"))); #endif graphicsSettings->Add(new CheckBox(&g_Config.bHardwareTransform, gs->T("Hardware Transform"))); + CheckBox *swSkin = graphicsSettings->Add(new CheckBox(&g_Config.bSoftwareSkinning, gs->T("Software Skinning"))); graphicsSettings->Add(new CheckBox(&g_Config.bVertexCache, gs->T("Vertex Cache"))); CheckBox *vtxJit = 
graphicsSettings->Add(new CheckBox(&g_Config.bVertexDecoderJit, gs->T("Vertex Decoder JIT"))); - if (PSP_IsInited()) + if (PSP_IsInited()) { + swSkin->SetEnabled(false); vtxJit->SetEnabled(false); + } graphicsSettings->Add(new CheckBox(&g_Config.bLowQualitySplineBezier, gs->T("LowCurves", "Low quality spline/bezier curves"))); From 4f78eda23b60ed7e947d2ed7a4c0d52dc6ac664d Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 13:30:57 +0100 Subject: [PATCH 03/12] Save a couple of registers in the x86 vertex decoder jit by SIMD-ing prescale UV --- GPU/GLES/VertexDecoder.cpp | 73 ++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 6e848708a5..8844112ac1 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -927,11 +927,11 @@ struct JitLookup { JitStepFunction jitFunc; }; +#ifdef ARM + static const float by128 = 1.0f / 128.0f; static const float by32768 = 1.0f / 32768.0f; -#ifdef ARM - using namespace ArmGen; static const ARMReg tempReg1 = R3; @@ -1373,6 +1373,9 @@ void VertexDecoderJitCache::Jit_PosFloat() { using namespace Gen; +static const float MEMORY_ALIGNED16( by128[4] ) = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f}; +static const float MEMORY_ALIGNED16( by32768[4] ) = {1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f}; + #ifdef _M_X64 #ifdef _WIN32 static const X64Reg tempReg1 = RAX; @@ -1400,12 +1403,13 @@ static const X64Reg counterReg = ECX; // XMM0-XMM5 are volatile on Windows X64 // XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms) -static const X64Reg fpUscaleReg = XMM0; -static const X64Reg fpVscaleReg = XMM1; -static const X64Reg fpUoffsetReg = XMM2; -static const X64Reg fpVoffsetReg = XMM3; -static const X64Reg fpScratchReg = XMM4; -static const X64Reg fpScratchReg2 = XMM5; +static const X64Reg fpScaleReg = XMM0; +static const X64Reg fpOffsetReg = XMM1; +static const X64Reg fpScratchReg = XMM2; +static const X64Reg fpScratchReg2 = XMM3; +// We're gonna keep the current skinning matrix in 3 or 4 XMM regs. Fortunately we easily +// have space for that now. + // To debug, just comment them out one at a time until it works. We fall back // on the interpreter if the compiler fails. 
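
For reference, the packed layout above turns the four scalar multiply/adds per texcoord into one MULPS and one ADDPS. A scalar C++ sketch of what the jitted prescale path computes per vertex (names here are illustrative, not from the codebase; for 8-bit and 16-bit texcoords the scale pair has already been pre-multiplied by 1/128 or 1/32768 in the prologue):

	// scale = {uscale, vscale}, offset = {uoffset, voffset}
	static void PrescaleUV(const float in[2], const float scale[2],
	                       const float offset[2], float out[2]) {
		for (int i = 0; i < 2; i++)
			out[i] = in[i] * scale[i] + offset[i];  // one MULPS + one ADDPS
	}
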
@@ -1495,16 +1499,16 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { #else MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv))); #endif - MOVSS(fpUscaleReg, MDisp(tempReg1, 0)); - MOVSS(fpVscaleReg, MDisp(tempReg1, 4)); - MOVSS(fpUoffsetReg, MDisp(tempReg1, 8)); - MOVSS(fpVoffsetReg, MDisp(tempReg1, 12)); + MOVSS(fpScaleReg, MDisp(tempReg1, 0)); + MOVSS(fpScratchReg, MDisp(tempReg1, 4)); + UNPCKLPS(fpScaleReg, R(fpScratchReg)); + MOVSS(fpOffsetReg, MDisp(tempReg1, 8)); + MOVSS(fpScratchReg, MDisp(tempReg1, 12)); + UNPCKLPS(fpOffsetReg, R(fpScratchReg)); if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - MULSS(fpUscaleReg, M((void *)&by128)); - MULSS(fpVscaleReg, M((void *)&by128)); + MULPS(fpScaleReg, M((void *)&by128)); } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - MULSS(fpUscaleReg, M((void *)&by32768)); - MULSS(fpVscaleReg, M((void *)&by32768)); + MULPS(fpScaleReg, M((void *)&by32768)); } } @@ -1611,43 +1615,34 @@ void VertexDecoderJitCache::Jit_TcFloat() { } void VertexDecoderJitCache::Jit_TcU8Prescale() { - // TODO: SIMD + // TODO: The first five instructions could be done in 1 or 2 in SSE4 MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff)); MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1)); CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); - MULSS(fpScratchReg, R(fpUscaleReg)); - MULSS(fpScratchReg2, R(fpVscaleReg)); - ADDSS(fpScratchReg, R(fpUoffsetReg)); - ADDSS(fpScratchReg2, R(fpVoffsetReg)); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + MULPS(fpScratchReg, R(fpScaleReg)); + ADDPS(fpScratchReg, R(fpOffsetReg)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcU16Prescale() { - // TODO: SIMD + // TODO: The first five instructions could be done in 1 or 2 in SSE4 and probably in 3 in SSE2 MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); - MULSS(fpScratchReg, R(fpUscaleReg)); - MULSS(fpScratchReg2, R(fpVscaleReg)); - ADDSS(fpScratchReg, R(fpUoffsetReg)); - ADDSS(fpScratchReg2, R(fpVoffsetReg)); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + MULPS(fpScratchReg, R(fpScaleReg)); + ADDPS(fpScratchReg, R(fpOffsetReg)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcFloatPrescale() { - // TODO: SIMD - MOVSS(fpScratchReg, MDisp(srcReg, dec_->tcoff)); - MOVSS(fpScratchReg2, MDisp(srcReg, dec_->tcoff + 4)); - MULSS(fpScratchReg, R(fpUscaleReg)); - MULSS(fpScratchReg2, R(fpVscaleReg)); - ADDSS(fpScratchReg, R(fpUoffsetReg)); - ADDSS(fpScratchReg2, R(fpVoffsetReg)); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); - MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); + MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); + MULPS(fpScratchReg, R(fpScaleReg)); + ADDPS(fpScratchReg, R(fpOffsetReg)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcU16Through() { From 46313ced55e5153f90b02e65136de1d81f0b64e5 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 15:22:24 +0100 Subject: [PATCH 04/12] Prepare transform pipeline for step by step decoding 
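
This moves the body of the DecodeVerts() loop into a new DecodeVertsStep() without changing behavior; the next patch then calls a single step directly from SubmitPrim() when software skinning. After the split, the outer loop reduces to the following (quoted from the diff below):

	for (; decodeCounter_ < numDrawCalls; decodeCounter_++) {
		if (uvScale)
			gstate_c.uv = uvScale[decodeCounter_];
		DecodeVertsStep();  // may advance decodeCounter_ past merged draw calls
	}
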
--- GPU/GLES/TransformPipeline.cpp | 133 +++++++++++++++++---------------- GPU/GLES/TransformPipeline.h | 4 +- 2 files changed, 72 insertions(+), 65 deletions(-) diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index d1f70a69f7..c88502fed4 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -119,7 +119,8 @@ TransformDrawEngine::TransformDrawEngine() framebufferManager_(0), numDrawCalls(0), vertexCountInDrawCalls(0), - uvScale(0) { + uvScale(0), + decodeCounter_(0) { decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; // Allocate nicely aligned memory. Maybe graphics drivers will // appreciate it. @@ -332,70 +333,11 @@ void TransformDrawEngine::DecodeVerts() { UVScale origUV; if (uvScale) origUV = gstate_c.uv; - for (int i = 0; i < numDrawCalls; i++) { - const DeferredDrawCall &dc = drawCalls[i]; - - indexGen.SetIndex(collectedVerts); - int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; - - u32 indexType = dc.indexType; - void *inds = dc.inds; - if (indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { - // Decode the verts and apply morphing. Simple. - if (uvScale) - gstate_c.uv = uvScale[i]; - dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, - dc.verts, indexLowerBound, indexUpperBound); - collectedVerts += indexUpperBound - indexLowerBound + 1; - indexGen.AddPrim(dc.prim, dc.vertexCount); - } else { - // It's fairly common that games issue long sequences of PRIM calls, with differing - // inds pointer but the same base vertex pointer. We'd like to reuse vertices between - // these as much as possible, so we make sure here to combine as many as possible - // into one nice big drawcall, sharing data. - - // 1. Look ahead to find the max index, only looking as "matching" drawcalls. - // Expand the lower and upper bounds as we go. - int j = i + 1; - int lastMatch = i; - while (j < numDrawCalls) { - if (drawCalls[j].verts != dc.verts) - break; - if (uvScale && memcmp(&uvScale[j], &uvScale[i], sizeof(uvScale[0])) != 0) - break; - - indexLowerBound = std::min(indexLowerBound, (int)drawCalls[j].indexLowerBound); - indexUpperBound = std::max(indexUpperBound, (int)drawCalls[j].indexUpperBound); - lastMatch = j; - j++; - } - - // 2. Loop through the drawcalls, translating indices as we go. - for (j = i; j <= lastMatch; j++) { - switch (indexType) { - case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: - indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u8 *)drawCalls[j].inds, indexLowerBound); - break; - case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: - indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u16 *)drawCalls[j].inds, indexLowerBound); - break; - } - } - - int vertexCount = indexUpperBound - indexLowerBound + 1; - // 3. Decode that range of vertex data. - if (uvScale) - gstate_c.uv = uvScale[i]; - dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, - dc.verts, indexLowerBound, indexUpperBound); - collectedVerts += vertexCount; - - // 4. Advance indexgen vertex counter. 
- indexGen.Advance(vertexCount); - i = lastMatch; - } + for (; decodeCounter_ < numDrawCalls; decodeCounter_++) { + if (uvScale) + gstate_c.uv = uvScale[decodeCounter_]; + DecodeVertsStep(); } - // Sanity check if (indexGen.Prim() < 0) { ERROR_LOG_REPORT(G3D, "DecodeVerts: Failed to deduce prim: %i", indexGen.Prim()); @@ -406,6 +348,68 @@ void TransformDrawEngine::DecodeVerts() { gstate_c.uv = origUV; } +void TransformDrawEngine::DecodeVertsStep() { + const int i = decodeCounter_; + + const DeferredDrawCall &dc = drawCalls[i]; + + indexGen.SetIndex(collectedVerts); + int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; + + u32 indexType = dc.indexType; + void *inds = dc.inds; + if (indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { + // Decode the verts and apply morphing. Simple. + dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, + dc.verts, indexLowerBound, indexUpperBound); + collectedVerts += indexUpperBound - indexLowerBound + 1; + indexGen.AddPrim(dc.prim, dc.vertexCount); + } else { + // It's fairly common that games issue long sequences of PRIM calls, with differing + // inds pointer but the same base vertex pointer. We'd like to reuse vertices between + // these as much as possible, so we make sure here to combine as many as possible + // into one nice big drawcall, sharing data. + + // 1. Look ahead to find the max index, only looking as "matching" drawcalls. + // Expand the lower and upper bounds as we go. + int j = i + 1; + int lastMatch = i; + while (j < numDrawCalls) { + if (drawCalls[j].verts != dc.verts) + break; + if (uvScale && memcmp(&uvScale[j], &uvScale[i], sizeof(uvScale[0])) != 0) + break; + + indexLowerBound = std::min(indexLowerBound, (int)drawCalls[j].indexLowerBound); + indexUpperBound = std::max(indexUpperBound, (int)drawCalls[j].indexUpperBound); + lastMatch = j; + j++; + } + + // 2. Loop through the drawcalls, translating indices as we go. + for (j = i; j <= lastMatch; j++) { + switch (indexType) { + case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u8 *)drawCalls[j].inds, indexLowerBound); + break; + case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u16 *)drawCalls[j].inds, indexLowerBound); + break; + } + } + + int vertexCount = indexUpperBound - indexLowerBound + 1; + // 3. Decode that range of vertex data. + dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, + dc.verts, indexLowerBound, indexUpperBound); + collectedVerts += vertexCount; + + // 4. Advance indexgen vertex counter. 
+ indexGen.Advance(vertexCount); + decodeCounter_ = lastMatch; + } +} + u32 TransformDrawEngine::ComputeHash() { u32 fullhash = 0; int vertexSize = dec_->GetDecVtxFmt().stride; @@ -720,6 +724,7 @@ rotateVBO: collectedVerts = 0; numDrawCalls = 0; vertexCountInDrawCalls = 0; + decodeCounter_ = 0; prevPrim_ = GE_PRIM_INVALID; #ifndef USING_GLES2 diff --git a/GPU/GLES/TransformPipeline.h b/GPU/GLES/TransformPipeline.h index 543a8a2f8c..273cf66982 100644 --- a/GPU/GLES/TransformPipeline.h +++ b/GPU/GLES/TransformPipeline.h @@ -97,7 +97,6 @@ public: void SubmitBezier(void* control_points, void* indices, int count_u, int count_v, GEPatchPrimType prim_type, u32 vertType); bool TestBoundingBox(void* control_points, int vertexCount, u32 vertType); - void DecodeVerts(); void SetShaderManager(ShaderManager *shaderManager) { shaderManager_ = shaderManager; } @@ -127,6 +126,8 @@ public: } private: + void DecodeVerts(); + void DecodeVertsStep(); void DoFlush(); void SoftwareTransformAndDraw(int prim, u8 *decoded, LinkedShader *program, int vertexCount, u32 vertexType, void *inds, int indexType, const DecVtxFormat &decVtxFormat, int maxIndex); void ApplyDrawState(int prim); @@ -195,6 +196,7 @@ private: int vertexCountInDrawCalls; int decimationCounter_; + int decodeCounter_; UVScale *uvScale; }; From 179934ec9fc942fa4a65a664ddb2d199325cca7f Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 15:31:56 +0100 Subject: [PATCH 05/12] Decode step by step when sw skinning --- GPU/GLES/TransformPipeline.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index c88502fed4..9bab41bfca 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -325,8 +325,14 @@ void TransformDrawEngine::SubmitPrim(void *verts, void *inds, GEPrimitiveType pr if (uvScale) { uvScale[numDrawCalls] = gstate_c.uv; } + numDrawCalls++; vertexCountInDrawCalls += vertexCount; + + if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) { + DecodeVertsStep(); + decodeCounter_++; + } } void TransformDrawEngine::DecodeVerts() { From 6976d6a3a07500b11afde1f4f0c5911c4370c0f2 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 16:05:34 +0100 Subject: [PATCH 06/12] Enable the softskinning optimizations that let us merge drawcalls --- GPU/GLES/GLES_GPU.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index 19343dc1db..6009ed75e4 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -430,10 +430,8 @@ GLES_GPU::GLES_GPU() commandFlags_[GE_CMD_TEXOFFSETV] &= ~FLAG_FLUSHBEFOREONCHANGE; } - // TODO: Can't turn this optimization on until we don't decode everything in one go - // but instead decode for every draw call when sw skinning if (g_Config.bSoftwareSkinning) { - // commandFlags_[GE_CMD_VERTEXTYPE] &= ~FLAG_FLUSHBEFOREONCHANGE; + commandFlags_[GE_CMD_VERTEXTYPE] &= ~FLAG_FLUSHBEFOREONCHANGE; } BuildReportingInfo(); @@ -876,16 +874,16 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_VERTEXTYPE: if (diff) { if (!g_Config.bSoftwareSkinning) { - shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); - } else if (false) { - // TODO: Can't turn this optimization on until we don't decode everything in one go - // but instead decode for every draw call when sw skinning + if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK)) + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + } else { if (diff & 
~GE_VTYPE_WEIGHTCOUNT_MASK) { // Restore and flush gstate.vertType ^= diff; Flush(); gstate.vertType ^= diff; - shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); + if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK)) + shaderManager_->DirtyUniform(DIRTY_UVSCALEOFFSET); } } } @@ -1392,16 +1390,16 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_BONEMATRIXDATA: { int num = gstate.boneMatrixNumber & 0x7F; - float newVal = getFloat24(data); - if (num < 96 && newVal != gstate.boneMatrix[num]) { + u32 newVal = data << 8; + if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) { // Bone matrices should NOT flush when software skinning is enabled! // TODO: Also check for morph... // TODO: Can't turn this optimizatoin on until we decode per drawcall when sw skinning. - if (true || !g_Config.bSoftwareSkinning) { + if (!g_Config.bSoftwareSkinning) { Flush(); shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); } - gstate.boneMatrix[num] = newVal; + ((u32 *)gstate.boneMatrix)[num] = newVal; } num++; gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F); From f0cacf46d080a6b604852fba6d574e2d2d0c215c Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 16:05:57 +0100 Subject: [PATCH 07/12] No reason to involve the FPU when loading matrices --- GPU/GLES/GLES_GPU.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index 6009ed75e4..b296f91e2e 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -1187,9 +1187,9 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_LDC0:case GE_CMD_LDC1:case GE_CMD_LDC2:case GE_CMD_LDC3: case GE_CMD_LSC0:case GE_CMD_LSC1:case GE_CMD_LSC2:case GE_CMD_LSC3: if (diff) { - float r = (float)(data & 0xff)/255.0f; - float g = (float)((data>>8) & 0xff)/255.0f; - float b = (float)(data>>16)/255.0f; + float r = (float)(data & 0xff) * (1.0f / 255.0f); + float g = (float)((data >> 8) & 0xff) * (1.0f / 255.0f); + float b = (float)(data >> 16) * (1.0f / 255.0f); int l = (cmd - GE_CMD_LAC0) / 3; int t = (cmd - GE_CMD_LAC0) % 3; @@ -1318,10 +1318,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_WORLDMATRIXDATA: { int num = gstate.worldmtxnum & 0xF; - float newVal = getFloat24(data); - if (num < 12 && newVal != gstate.worldMatrix[num]) { + u32 newVal = data << 8; + if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) { Flush(); - gstate.worldMatrix[num] = newVal; + ((u32 *)gstate.worldMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_WORLDMATRIX); } num++; @@ -1336,10 +1336,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_VIEWMATRIXDATA: { int num = gstate.viewmtxnum & 0xF; - float newVal = getFloat24(data); - if (num < 12 && newVal != gstate.viewMatrix[num]) { + u32 newVal = data << 8; + if (num < 12 && newVal != ((const u32 *)gstate.viewMatrix)[num]) { Flush(); - gstate.viewMatrix[num] = newVal; + ((u32 *)gstate.viewMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_VIEWMATRIX); } num++; @@ -1354,10 +1354,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_PROJMATRIXDATA: { int num = gstate.projmtxnum & 0xF; - float newVal = getFloat24(data); - if (newVal != gstate.projMatrix[num]) { + u32 newVal = data << 8; + if (newVal != ((const u32 *)gstate.projMatrix)[num]) { Flush(); - gstate.projMatrix[num] = newVal; + ((u32 *)gstate.projMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_PROJMATRIX); } 
num++; @@ -1372,10 +1372,10 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { case GE_CMD_TGENMATRIXDATA: { int num = gstate.texmtxnum & 0xF; - float newVal = getFloat24(data); - if (num < 12 && newVal != gstate.tgenMatrix[num]) { + u32 newVal = data << 8; + if (num < 12 && newVal != ((const u32 *)gstate.tgenMatrix)[num]) { Flush(); - gstate.tgenMatrix[num] = newVal; + ((u32 *)gstate.tgenMatrix)[num] = newVal; shaderManager_->DirtyUniform(DIRTY_TEXMATRIX); } num++; From 9333d3ea769b75541e60323f6627c328f8cacfcb Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 17:40:38 +0100 Subject: [PATCH 08/12] Vtx dec jit: Combine the scale and offset registers to save 1 more xmm register. --- GPU/GLES/VertexDecoder.cpp | 46 ++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 8844112ac1..d0a67abbaf 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -1403,13 +1403,14 @@ static const X64Reg counterReg = ECX; // XMM0-XMM5 are volatile on Windows X64 // XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms) -static const X64Reg fpScaleReg = XMM0; -static const X64Reg fpOffsetReg = XMM1; -static const X64Reg fpScratchReg = XMM2; -static const X64Reg fpScratchReg2 = XMM3; -// We're gonna keep the current skinning matrix in 3 or 4 XMM regs. Fortunately we easily -// have space for that now. +static const X64Reg fpScaleOffsetReg = XMM0; +static const X64Reg fpScratchReg = XMM1; +static const X64Reg fpScratchReg2 = XMM2; +static const X64Reg fpScratchReg3 = XMM3; + +// We're gonna keep the current skinning matrix in 4 XMM regs. Fortunately we easily +// have space for that now. // To debug, just comment them out one at a time until it works. We fall back // on the interpreter if the compiler fails. @@ -1499,17 +1500,18 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { #else MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv))); #endif - MOVSS(fpScaleReg, MDisp(tempReg1, 0)); + MOVSS(fpScaleOffsetReg, MDisp(tempReg1, 0)); MOVSS(fpScratchReg, MDisp(tempReg1, 4)); - UNPCKLPS(fpScaleReg, R(fpScratchReg)); - MOVSS(fpOffsetReg, MDisp(tempReg1, 8)); - MOVSS(fpScratchReg, MDisp(tempReg1, 12)); - UNPCKLPS(fpOffsetReg, R(fpScratchReg)); + UNPCKLPS(fpScaleOffsetReg, R(fpScratchReg)); if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - MULPS(fpScaleReg, M((void *)&by128)); + MULPS(fpScaleOffsetReg, M((void *)&by128)); } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - MULPS(fpScaleReg, M((void *)&by32768)); + MULPS(fpScaleOffsetReg, M((void *)&by32768)); } + MOVSS(fpScratchReg, MDisp(tempReg1, 8)); + MOVSS(fpScratchReg2, MDisp(tempReg1, 12)); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg)); } // Let's not bother with a proper stack frame. We just grab the arguments and go. 
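
With UNPCKLPD the single register ends up as {uscale, vscale, uoffset, voffset}. The prescale steps below then multiply against the low half, swap the 64-bit halves with SHUFPS to bring the offsets down, add, and swap back so the register is intact for the next vertex. A lane-level sketch of that sequence (illustrative C++, not from the patch):

	#include <utility>

	// so[] models fpScaleOffsetReg = {uscale, vscale, uoffset, voffset}.
	static void PrescaleUV(float uv[2], float so[4]) {
		uv[0] *= so[0]; uv[1] *= so[1];  // MULPS against the scale half
		std::swap(so[0], so[2]);         // SHUFPS _MM_SHUFFLE(1, 0, 3, 2):
		std::swap(so[1], so[3]);         //   {us, vs, uo, vo} -> {uo, vo, us, vs}
		uv[0] += so[0]; uv[1] += so[1];  // ADDPS against the offset half
		std::swap(so[0], so[2]);         // second SHUFPS restores the layout
		std::swap(so[1], so[3]);
	}
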
@@ -1621,8 +1623,10 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() { CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); UNPCKLPS(fpScratchReg, R(fpScratchReg2)); - MULPS(fpScratchReg, R(fpScaleReg)); - ADDPS(fpScratchReg, R(fpOffsetReg)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } @@ -1633,15 +1637,19 @@ void VertexDecoderJitCache::Jit_TcU16Prescale() { CVTSI2SS(fpScratchReg, R(tempReg1)); CVTSI2SS(fpScratchReg2, R(tempReg2)); UNPCKLPS(fpScratchReg, R(fpScratchReg2)); - MULPS(fpScratchReg, R(fpScaleReg)); - ADDPS(fpScratchReg, R(fpOffsetReg)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } void VertexDecoderJitCache::Jit_TcFloatPrescale() { MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); - MULPS(fpScratchReg, R(fpScaleReg)); - ADDPS(fpScratchReg, R(fpOffsetReg)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } From 6221dbaf5d83ea6049e8d47f52c92b841a174bec Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 10 Nov 2013 20:08:46 +0100 Subject: [PATCH 09/12] Optimize software skinning for x86. Can't seem to get a win on Windows vs hardware skinning though, even though draw calls drop by 2/3rd... --- Common/x64Emitter.cpp | 6 + Common/x64Emitter.h | 2 + GPU/GLES/GLES_GPU.cpp | 1 - GPU/GLES/VertexDecoder.cpp | 331 +++++++++++++++++++++++++++++++++++-- GPU/GLES/VertexDecoder.h | 16 +- 5 files changed, 342 insertions(+), 14 deletions(-) diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp index 9abc3eff2c..a4e6db829b 100644 --- a/Common/x64Emitter.cpp +++ b/Common/x64Emitter.cpp @@ -1371,6 +1371,12 @@ void XEmitter::PSLLQ(X64Reg reg, int shift) { Write8(shift); } +void XEmitter::PSLLDQ(X64Reg reg, int shift) { + WriteSSEOp(64, 0x73, true, (X64Reg)7, R(reg)); + Write8(shift); +} + + // WARNING not REX compatible void XEmitter::PSRAW(X64Reg reg, int shift) { if (reg > 7) diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h index ba7c8d0fa1..ccff753935 100644 --- a/Common/x64Emitter.h +++ b/Common/x64Emitter.h @@ -658,6 +658,8 @@ public: void PSLLD(X64Reg reg, int shift); void PSLLQ(X64Reg reg, int shift); + void PSLLDQ(X64Reg reg, int shift); + void PSRAW(X64Reg reg, int shift); void PSRAD(X64Reg reg, int shift); diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index b296f91e2e..79226106e2 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -1394,7 +1394,6 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) { if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) { // Bone matrices should NOT flush when software skinning is enabled! // TODO: Also check for morph... - // TODO: Can't turn this optimizatoin on until we decode per drawcall when sw skinning. 
if (!g_Config.bSoftwareSkinning) { Flush(); shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12)); diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index d0a67abbaf..b58a97d8f8 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -15,6 +15,8 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include "base/basictypes.h" + #include "Core/Config.h" #include "Core/MemMap.h" #include "GPU/ge_constants.h" @@ -35,9 +37,13 @@ static const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4}; static const u8 possize[4] = {0,3,6,12}, posalign[4] = {0,1,2,4}; static const u8 wtsize[4] = {0,1,2,4}, wtalign[4] = {0,1,2,4}; -// When software skinning. This should be stored in registers instead of memory -// when jitting. -float skinMatrix[12]; +// When software skinning. This array is only used when non-jitted - when jitted, the matrix +// is kept in registers. +static float skinMatrix[12]; + +// We start out by converting the active matrices into 4x4 which are easier to multiply with +// using SSE / NEON and store them here. +static float bones[16 * 8]; inline int align(int n, int align) { return (n + (align - 1)) & ~(align - 1); @@ -930,6 +936,7 @@ struct JitLookup { #ifdef ARM static const float by128 = 1.0f / 128.0f; +static const float by256 = 1.0f / 256.0f; static const float by32768 = 1.0f / 32768.0f; using namespace ArmGen; @@ -1373,8 +1380,18 @@ void VertexDecoderJitCache::Jit_PosFloat() { using namespace Gen; -static const float MEMORY_ALIGNED16( by128[4] ) = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f}; -static const float MEMORY_ALIGNED16( by32768[4] ) = {1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f}; +static const float MEMORY_ALIGNED16( by128[4] ) = { + 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f +}; +static const float MEMORY_ALIGNED16( by256[4] ) = { + 1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256 +}; +static const float MEMORY_ALIGNED16( by32768[4] ) = { + 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, +}; + +static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; +static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000}; #ifdef _M_X64 #ifdef _WIN32 @@ -1420,6 +1437,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, + {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, + {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, + {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, + {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, @@ -1437,6 +1458,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat}, + {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin}, + {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin}, + {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin}, + {&VertexDecoder::Step_Color8888, 
&VertexDecoderJitCache::Jit_Color8888}, {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444}, {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565}, @@ -1449,6 +1474,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8}, {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16}, {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat}, + + {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin}, + {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin}, + {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin}, }; // TODO: This should probably be global... @@ -1479,9 +1508,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { // Save XMM4/XMM5 which apparently can be problematic? // Actually, if they are, it must be a compiler bug because they SHOULD be ok. // So I won't bother. - // SUB(PTRBITS, R(ESP), Imm8(32)); - // MOVUPS(MDisp(ESP, 0), XMM4); - // MOVUPS(MDisp(ESP, 16), XMM5); + SUB(PTRBITS, R(ESP), Imm8(64)); + MOVUPS(MDisp(ESP, 0), XMM4); + MOVUPS(MDisp(ESP, 16), XMM5); + MOVUPS(MDisp(ESP, 32), XMM6); + MOVUPS(MDisp(ESP, 48), XMM7); bool prescaleStep = false; // Look for prescaled texcoord steps @@ -1493,6 +1524,28 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { } } + // Add code to convert matrices to 4x4. + // Later we might want to do this when the matrices are loaded instead. + // This is mostly proof of concept. + int boneCount = 0; + if (dec.weighttype) { + for (int i = 0; i < 8; i++) { + MOVUPS(XMM0, M((void *)(gstate.boneMatrix + 12 * i))); + MOVUPS(XMM1, M((void *)(gstate.boneMatrix + 12 * i + 3))); + MOVUPS(XMM2, M((void *)(gstate.boneMatrix + 12 * i + 3 * 2))); + MOVUPS(XMM3, M((void *)(gstate.boneMatrix + 12 * i + 3 * 3))); + ANDPS(XMM0, M((void *)&threeMasks)); + ANDPS(XMM1, M((void *)&threeMasks)); + ANDPS(XMM2, M((void *)&threeMasks)); + ANDPS(XMM3, M((void *)&threeMasks)); + ORPS(XMM3, M((void *)&aOne)); + MOVAPS(M((void *)(bones + 16 * i)), XMM0); + MOVAPS(M((void *)(bones + 16 * i + 4)), XMM1); + MOVAPS(M((void *)(bones + 16 * i + 8)), XMM2); + MOVAPS(M((void *)(bones + 16 * i + 12)), XMM3); + } + } + // Keep the scale/offset in a few fp registers if we need it. 
if (prescaleStep) { #ifdef _M_X64 @@ -1529,9 +1582,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUB(32, R(counterReg), Imm8(1)); J_CC(CC_NZ, loopStart, true); - // MOVUPS(XMM4, MDisp(ESP, 0)); - // MOVUPS(XMM5, MDisp(ESP, 16)); - // ADD(PTRBITS, R(ESP), Imm8(32)); + MOVUPS(XMM4, MDisp(ESP, 0)); + MOVUPS(XMM5, MDisp(ESP, 16)); + MOVUPS(XMM6, MDisp(ESP, 32)); + MOVUPS(XMM7, MDisp(ESP, 48)); + ADD(PTRBITS, R(ESP), Imm8(64)); #ifdef _M_IX86 // Restore register values @@ -1584,6 +1639,118 @@ void VertexDecoderJitCache::Jit_WeightsFloat() { } } +void VertexDecoderJitCache::Jit_WeightsU8Skin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j)); + CVTSI2SS(XMM1, R(tempReg1)); + MULSS(XMM1, M((void *)&by128)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + +void VertexDecoderJitCache::Jit_WeightsU16Skin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2)); + CVTSI2SS(XMM1, R(tempReg1)); + MULSS(XMM1, M((void *)&by32768)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + +void VertexDecoderJitCache::Jit_WeightsFloatSkin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, 
MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + // Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy. void VertexDecoderJitCache::Jit_TcU8() { MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); @@ -1816,6 +1983,77 @@ void VertexDecoderJitCache::Jit_NormalFloat() { MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3)); } +void VertexDecoderJitCache::Jit_NormalS8Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->nrmoff + (2 - i))); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by128)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1); +} + +// Copy 6 bytes and then 2 zeroes. +void VertexDecoderJitCache::Jit_NormalS16Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->nrmoff + (2 - i) * 2)); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by32768)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1); +} + +void VertexDecoderJitCache::Jit_NormalFloatSkin() { + MOVUPS(XMM3, MDisp(srcReg, dec_->nrmoff)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + + MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1); +} + // Through expands into floats, always. Might want to look at changing this. 
void VertexDecoderJitCache::Jit_PosS8Through() { // TODO: SIMD @@ -1861,6 +2099,77 @@ void VertexDecoderJitCache::Jit_PosFloat() { MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3)); } +void VertexDecoderJitCache::Jit_PosS8Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + (2 - i))); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by128)); + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + ADDPS(XMM1, R(XMM7)); + MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1); +} + +void VertexDecoderJitCache::Jit_PosS16Skin() { + XORPS(XMM3, R(XMM3)); + for (int i = 0; i < 3; i++) { + MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + (2 - i) * 2)); + CVTSI2SS(XMM3, R(tempReg1)); + if (i != 2) { + PSLLDQ(XMM3, 4); + } + } + MULPS(XMM3, M((void *)&by32768)); + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + ADDPS(XMM1, R(XMM7)); + MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1); +} + +// Just copy 12 bytes. +void VertexDecoderJitCache::Jit_PosFloatSkin() { + MOVUPS(XMM3, MDisp(srcReg, dec_->posoff)); + + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + ADDPS(XMM1, R(XMM7)); + MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1); +} + #elif defined(PPC) #error This should not be built for PowerPC, at least not yet. 
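
A note on the data flow in the skin path above: the prologue widens each 12-float PSP bone matrix into a 4x4 row layout in bones[] (w = 0 on the first three rows, w = 1 on the translation row), the Jit_Weights*Skin steps accumulate the weighted sum of those rows into XMM4-XMM7, and the position/normal steps consume the result without touching memory. In scalar C++ the transform they compute is roughly (illustrative names, not from the codebase):

	// m points at four float4 rows of the accumulated skin matrix:
	// x-row, y-row, z-row, translation row (matching XMM4..XMM7).
	static void SkinVec3(const float m[16], const float v[3],
	                     bool translate, float out[4]) {
		for (int i = 0; i < 4; i++) {
			out[i] = v[0] * m[0 + i]      // broadcast v.x, MULPS with row 0
			       + v[1] * m[4 + i]      // v.y times row 1, ADDPS
			       + v[2] * m[8 + i];     // v.z times row 2, ADDPS
			if (translate)
				out[i] += m[12 + i];      // positions add the w row (XMM7)
		}
	}

Positions correspond to translate = true (the trailing ADDPS against XMM7); normals leave the translation row out.
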
diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h
index 1ba24af027..b6586756f6 100644
--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@@ -199,6 +199,10 @@ public:
 	void Jit_WeightsU16();
 	void Jit_WeightsFloat();
 
+	void Jit_WeightsU8Skin();
+	void Jit_WeightsU16Skin();
+	void Jit_WeightsFloatSkin();
+
 	void Jit_TcU8();
 	void Jit_TcU16();
 	void Jit_TcFloat();
@@ -222,11 +226,19 @@ public:
 	void Jit_NormalS16();
 	void Jit_NormalFloat();
 
+	void Jit_NormalS8Skin();
+	void Jit_NormalS16Skin();
+	void Jit_NormalFloatSkin();
+
 	void Jit_PosS8();
-	void Jit_PosS8Through();
 	void Jit_PosS16();
-	void Jit_PosS16Through();
 	void Jit_PosFloat();
+	void Jit_PosS8Through();
+	void Jit_PosS16Through();
+
+	void Jit_PosS8Skin();
+	void Jit_PosS16Skin();
+	void Jit_PosFloatSkin();
 
 private:
 	bool CompileStep(const VertexDecoder &dec, int i);

From 821a2f10f8766b2f29ba911cc40642757de390d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Wed, 13 Nov 2013 10:35:22 +0100
Subject: [PATCH 10/12] Delete obsolete code

---
 GPU/GLES/VertexDecoder.cpp | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index b58a97d8f8..24edc39773 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -49,37 +49,6 @@ inline int align(int n, int align) {
 	return (n + (align - 1)) & ~(align - 1);
 }
 
-#if 0
-// This is what the software transform spits out, and thus w
-DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
-	DecVtxFormat tfm = {0};
-	int size = 0;
-	int offset = 0;
-	// Weights disappear during transform.
-	if (fmt.uvfmt) {
-		// UV always becomes float2.
-		tfm.uvfmt = DEC_FLOAT_2;
-		tfm.uvoff = offset;
-		offset += DecFmtSize(tfm.uvfmt);
-	}
-	// We always (?) get two colors out, they're floats (although we'd probably be fine with less precision).
-	tfm.c0fmt = DEC_FLOAT_4;
-	tfm.c0off = offset;
-	offset += DecFmtSize(tfm.c0fmt);
-	tfm.c1fmt = DEC_FLOAT_3;  // color1 (specular) doesn't have alpha.
-	tfm.c1off = offset;
-	offset += DecFmtSize(tfm.c1fmt);
-	// We never get a normal, it's gone.
-	// But we do get a position, and it's always float3.
-	tfm.posfmt = DEC_FLOAT_3;
-	tfm.posoff = offset;
-	offset += DecFmtSize(tfm.posfmt);
-	// Update stride.
-	tfm.stride = offset;
-	return tfm;
-}
-#endif
-
 VertexDecoder::VertexDecoder() : coloff(0), nrmoff(0), posoff(0), jitted_(0) {
 	memset(stats_, 0, sizeof(stats_));
 }

From 9bbdd1907d0bcad04e893ede9a476bdee2cff47e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Wed, 13 Nov 2013 17:17:14 +0100
Subject: [PATCH 11/12] Kind-of optimized ARM software skinning (non-NEON)

---
 GPU/GLES/VertexDecoder.cpp | 228 ++++++++++++++++++++++++++++++++++++-
 GPU/GLES/VertexDecoder.h   |   1 +
 2 files changed, 226 insertions(+), 3 deletions(-)
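Since non-NEON VFP can't keep the whole 4x3 skin matrix in registers, the Weights*Skin steps below accumulate it into the skinMatrix buffer in memory once per vertex, and Jit_WriteMatrixMul reads it back to transform the position and normal. In scalar terms (a sketch of what the jitted steps compute, assuming the weights have already been converted to float and scaled; AccumulateSkinMatrix itself is a hypothetical name):

	// skinMatrix = sum over j of weight[j] * boneMatrix[j], 12 floats per matrix.
	static void AccumulateSkinMatrix(const float *weights, int nweights,
	                                 const float *boneMatrix, float skinMatrix[12]) {
		for (int i = 0; i < 12; i++)
			skinMatrix[i] = weights[0] * boneMatrix[i];  // first lap: plain store
		for (int j = 1; j < nweights; j++)
			for (int i = 0; i < 12; i++)
				skinMatrix[i] += weights[j] * boneMatrix[j * 12 + i];  // later laps: multiply-accumulate
	}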
diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index 24edc39773..f81514078b 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -16,6 +16,7 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include "base/basictypes.h"
+#include "base/logging.h"
 
 #include "Core/Config.h"
 #include "Core/MemMap.h"
@@ -878,7 +879,7 @@ int VertexDecoder::ToString(char *output) const {
 
 VertexDecoderJitCache::VertexDecoderJitCache() {
 	// 64k should be enough.
-	AllocCodeSpace(1024 * 64);
+	AllocCodeSpace(1024 * 64 * 4);
 
 	// Add some random code to "help" MSVC's buggy disassembler :(
 #if defined(_WIN32)
@@ -914,21 +915,32 @@ static const ARMReg tempReg1 = R3;
 static const ARMReg tempReg2 = R4;
 static const ARMReg tempReg3 = R5;
 static const ARMReg scratchReg = R6;
+static const ARMReg scratchReg2 = R7;
+static const ARMReg scratchReg3 = R12;
 static const ARMReg srcReg = R0;
 static const ARMReg dstReg = R1;
 static const ARMReg counterReg = R2;
 static const ARMReg fpScratchReg = S4;
 static const ARMReg fpScratchReg2 = S5;
+static const ARMReg fpScratchReg3 = S6;
+
 static const ARMReg fpUscaleReg = S0;
 static const ARMReg fpVscaleReg = S1;
 static const ARMReg fpUoffsetReg = S2;
 static const ARMReg fpVoffsetReg = S3;
+// Everything above S6 is fair game for skinning
+static const ARMReg src[3] = {S8, S9, S10};  // skin source
+static const ARMReg acc[3] = {S11, S12, S13};  // skin accumulator
 
 static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
 	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
 	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 
+	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
+	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
+	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
+
 	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
 	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
@@ -946,6 +958,10 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
 	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
 
+	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
+	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
+	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
+
 	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
 	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
 	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
@@ -958,6 +974,10 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
 	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
 	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
+
+	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
+	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
+	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
 };
 
 JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
@@ -965,6 +985,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 	const u8 *start = this->GetCodePtr();
 
 	bool prescaleStep = false;
+	bool skinning = false;
+
 	// Look for prescaled texcoord steps
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
@@ -972,6 +994,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
 			prescaleStep = true;
 		}
+		if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
+			skinning = true;
+		}
 	}
 
 	SetCC(CC_AL);
@@ -996,10 +1023,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 		}
 	}
 
-	// NEON skinning register mapping
+	// TODO: NEON skinning register mapping
 	// The matrix will be built in Q12-Q15.
 	// The temporary matrix to be added to the built matrix will be in Q8-Q11.
 
+	if (skinning) {
+		// TODO: Preload scale factors
+	}
+
 	JumpTarget loopStart = GetCodePtr();
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
@@ -1079,6 +1110,91 @@ void VertexDecoderJitCache::Jit_WeightsFloat() {
 	}
 }
 
+void VertexDecoderJitCache::Jit_WeightsU8Skin() {
+	// No need to zero skinMatrix: the first lap stores with VMUL/VSTR,
+	// then subsequent laps accumulate with VLDR/VMLA/VSTR.
+	MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
+	for (int j = 0; j < dec_->nweights; j++) {
+		const float *bone = &gstate.boneMatrix[j * 12];
+		LDRB(tempReg1, srcReg, dec_->weightoff + j);
+		VMOV(fpScratchReg, tempReg1);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+		MOVI2F(fpScratchReg2, by128, scratchReg);
+		VMUL(fpScratchReg, fpScratchReg, fpScratchReg2);
+		MOVI2R(tempReg1, (u32)bone, scratchReg);
+		// Okay, we have the weight.
+		if (j == 0) {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VMUL(fpScratchReg2, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg2, tempReg2, i * 4);
+			}
+		} else {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VLDR(fpScratchReg3, tempReg2, i * 4);
+				VMLA(fpScratchReg3, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg3, tempReg2, i * 4);
+			}
+		}
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16Skin() {
+	// No need to zero skinMatrix: the first lap stores with VMUL/VSTR,
+	// then subsequent laps accumulate with VLDR/VMLA/VSTR.
+	MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
+	for (int j = 0; j < dec_->nweights; j++) {
+		const float *bone = &gstate.boneMatrix[j * 12];
+		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
+		VMOV(fpScratchReg, tempReg1);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+		MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg);
+		VMUL(fpScratchReg, fpScratchReg, fpScratchReg2);
+		MOVI2R(tempReg1, (u32)bone, scratchReg);
+		// Okay, we have the weight.
+		if (j == 0) {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VMUL(fpScratchReg2, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg2, tempReg2, i * 4);
+			}
+		} else {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VLDR(fpScratchReg3, tempReg2, i * 4);
+				VMLA(fpScratchReg3, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg3, tempReg2, i * 4);
+			}
+		}
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
+	// No need to zero skinMatrix: the first lap stores with VMUL/VSTR,
+	// then subsequent laps accumulate with VLDR/VMLA/VSTR.
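+	// After the last lap, skinMatrix holds the weighted sum of the bone matrices;
+	// Jit_WriteMatrixMul then applies it to each position and normal below.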
+	MOVI2R(tempReg2, (u32)skinMatrix, scratchReg);
+	for (int j = 0; j < dec_->nweights; j++) {
+		const float *bone = &gstate.boneMatrix[j * 12];
+		VLDR(fpScratchReg, srcReg, dec_->weightoff + j * 4);
+		MOVI2R(tempReg1, (u32)bone, scratchReg);
+		if (j == 0) {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VMUL(fpScratchReg2, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg2, tempReg2, i * 4);
+			}
+		} else {
+			for (int i = 0; i < 12; i++) {
+				VLDR(fpScratchReg2, tempReg1, i * 4);
+				VLDR(fpScratchReg3, tempReg2, i * 4);
+				VMLA(fpScratchReg3, fpScratchReg2, fpScratchReg);
+				VSTR(fpScratchReg3, tempReg2, i * 4);
+			}
+		}
+	}
+}
+
 // Fill last two bytes with zeroes to align to 4 bytes. LDRH does it for us, handy.
 void VertexDecoderJitCache::Jit_TcU8() {
 	LDRB(tempReg1, srcReg, dec_->tcoff);
@@ -1336,7 +1452,6 @@ void VertexDecoderJitCache::Jit_PosS16() {
 
 // Just copy 12 bytes.
 void VertexDecoderJitCache::Jit_PosFloat() {
-	// Might not be aligned to 4, so we can't use LDMIA.
 	LDR(tempReg1, srcReg, dec_->posoff);
 	LDR(tempReg2, srcReg, dec_->posoff + 4);
 	LDR(tempReg3, srcReg, dec_->posoff + 8);
@@ -1345,6 +1460,113 @@
 	STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
 }
 
+void VertexDecoderJitCache::Jit_NormalS8Skin() {
+	LDRSB(tempReg1, srcReg, dec_->nrmoff);
+	LDRSB(tempReg2, srcReg, dec_->nrmoff + 1);
+	LDRSB(tempReg3, srcReg, dec_->nrmoff + 2);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/128.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
+}
+
+void VertexDecoderJitCache::Jit_NormalS16Skin() {
+	LDRSH(tempReg1, srcReg, dec_->nrmoff);
+	LDRSH(tempReg2, srcReg, dec_->nrmoff + 2);
+	LDRSH(tempReg3, srcReg, dec_->nrmoff + 4);
+	VMOV(fpScratchReg, tempReg1);
+	VMOV(fpScratchReg2, tempReg2);
+	VMOV(fpScratchReg3, tempReg3);
+	MOVI2F(S15, 1.0f/32768.0f, scratchReg);
+	VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
+	VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
+	VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], fpScratchReg, S15);
+	VMUL(src[1], fpScratchReg2, S15);
+	VMUL(src[2], fpScratchReg3, S15);
+	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
+}
+
+void VertexDecoderJitCache::Jit_NormalFloatSkin() {
+	VLDR(src[0], srcReg, dec_->nrmoff);
+	VLDR(src[1], srcReg, dec_->nrmoff + 4);
+	VLDR(src[2], srcReg, dec_->nrmoff + 8);
+	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
+}
+
+// Multiply src[0..2] by skinMatrix: floats 0-8 are the three basis columns,
+// and for positions the translation in floats 9-11 is added on top.
+void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
+	MOVI2R(tempReg1, (u32)skinMatrix, scratchReg);
+	for (int i = 0; i < 3; i++) {
+		VLDR(fpScratchReg, tempReg1, 4 * i);
+		VMUL(acc[i], fpScratchReg, src[0]);
+	}
+	for (int i = 0; i < 3; i++) {
+		VLDR(fpScratchReg, tempReg1, 12 + 4 * i);
+		VMLA(acc[i], fpScratchReg, src[1]);
+	}
+	for (int i = 0; i < 3; i++) {
+		VLDR(fpScratchReg, tempReg1, 24 + 4 * i);
+		VMLA(acc[i], fpScratchReg, src[2]);
+	}
+	if (pos) {
+		for (int i = 0; i < 3; i++) {
+			VLDR(fpScratchReg, tempReg1, 36 + 4 * i);
+			VADD(acc[i], acc[i], fpScratchReg);
+		}
+	}
+	for (int i = 0; i < 3; i++) {
+		VSTR(acc[i], dstReg, outOff + i * 4);
+	}
+}
+
+void VertexDecoderJitCache::Jit_PosS8Skin() {
+	LDRSB(tempReg1, srcReg, dec_->posoff);
+	LDRSB(tempReg2, srcReg, dec_->posoff + 1);
+	LDRSB(tempReg3, srcReg, dec_->posoff + 2);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/128.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
+void VertexDecoderJitCache::Jit_PosS16Skin() {
+	LDRSH(tempReg1, srcReg, dec_->posoff);
+	LDRSH(tempReg2, srcReg, dec_->posoff + 2);
+	LDRSH(tempReg3, srcReg, dec_->posoff + 4);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/32768.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
+void VertexDecoderJitCache::Jit_PosFloatSkin() {
+	VLDR(src[0], srcReg, dec_->posoff);
+	VLDR(src[1], srcReg, dec_->posoff + 4);
+	VLDR(src[2], srcReg, dec_->posoff + 8);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
 #elif defined(_M_X64) || defined(_M_IX86)
 
 using namespace Gen;
diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h
index b6586756f6..2c4ce40a47 100644
--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@@ -242,5 +242,6 @@ public:
 
 private:
 	bool CompileStep(const VertexDecoder &dec, int i);
+	void Jit_WriteMatrixMul(int outOff, bool pos);
 	const VertexDecoder *dec_;
 };

From da380478f6e8ec14f0901229708557d184f9a256 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Wed, 13 Nov 2013 19:55:20 +0100
Subject: [PATCH 12/12] Enable software skinning by default

---
 Core/Config.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Core/Config.cpp b/Core/Config.cpp
index bc15c6c1f2..39d35cbec1 100644
--- a/Core/Config.cpp
+++ b/Core/Config.cpp
@@ -144,7 +144,7 @@ void Config::Load(const char *iniFileName, const char *controllerIniFilename) {
 	graphics->Get("RenderingMode", &iRenderingMode, renderingModeDefault);
 	graphics->Get("SoftwareRendering", &bSoftwareRendering, false);
 	graphics->Get("HardwareTransform", &bHardwareTransform, true);
-	graphics->Get("SoftwareSkinning", &bSoftwareSkinning, false);
+	graphics->Get("SoftwareSkinning", &bSoftwareSkinning, true);
 	graphics->Get("TextureFiltering", &iTexFiltering, 1);
 	// Auto on Windows, 1x elsewhere. Maybe change to 2x on large screens?
 #ifdef _WIN32