From 186b0f105cbd91193725a0f82d9077aa4ea10c40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 5 Jun 2023 09:40:08 +0200 Subject: [PATCH 1/5] Simplify the vertex cache ID handling --- GPU/Common/DrawEngineCommon.cpp | 9 --------- GPU/Common/DrawEngineCommon.h | 1 - GPU/D3D11/DrawEngineD3D11.cpp | 8 ++++---- GPU/Directx9/DrawEngineDX9.cpp | 8 ++++---- GPU/GLES/DrawEngineGLES.cpp | 2 -- GPU/Vulkan/DrawEngineVulkan.cpp | 9 +++++---- 6 files changed, 13 insertions(+), 24 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 234d2af223..554dd72bb3 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -821,15 +821,6 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti if ((vertexCount < 2 && prim > 0) || (vertexCount < 3 && prim > GE_PRIM_LINE_STRIP && prim != GE_PRIM_RECTANGLES)) return; - if (g_Config.bVertexCache) { - u32 dhash = dcid_; - dhash = __rotl(dhash ^ (u32)(uintptr_t)verts, 13); - dhash = __rotl(dhash ^ (u32)(uintptr_t)inds, 19); - dhash = __rotl(dhash ^ (u32)vertTypeID, 7); - dhash = __rotl(dhash ^ (u32)vertexCount, 11); - dcid_ = lowbias32_r(dhash ^ (u32)prim); - } - DeferredDrawCall &dc = drawCalls_[numDrawCalls_]; dc.verts = verts; dc.inds = inds; diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index c9b51ff020..0798c454b5 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -218,7 +218,6 @@ protected: int decimationCounter_ = 0; int decodeCounter_ = 0; - u32 dcid_ = 0; // Vertex collector state IndexGenerator indexGen; diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp index aab67928bd..119607d76d 100644 --- a/GPU/D3D11/DrawEngineD3D11.cpp +++ b/GPU/D3D11/DrawEngineD3D11.cpp @@ -363,12 +363,13 @@ void DrawEngineD3D11::DoFlush() { useCache = false; if (useCache) { - u32 id = dcid_ ^ gstate.getUVGenMode(); // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 + // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 + u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); - VertexArrayInfoD3D11 *vai = vai_.Get(id); + VertexArrayInfoD3D11 *vai = vai_.Get(dcid); if (!vai) { vai = new VertexArrayInfoD3D11(); - vai_.Insert(id, vai); + vai_.Insert(dcid, vai); } switch (vai->status) { @@ -724,7 +725,6 @@ rotateVBO: numDrawCalls_ = 0; vertexCountInDrawCalls_ = 0; decodeCounter_ = 0; - dcid_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); diff --git a/GPU/Directx9/DrawEngineDX9.cpp b/GPU/Directx9/DrawEngineDX9.cpp index b098898d21..95767740a4 100644 --- a/GPU/Directx9/DrawEngineDX9.cpp +++ b/GPU/Directx9/DrawEngineDX9.cpp @@ -345,11 +345,12 @@ void DrawEngineDX9::DoFlush() { useCache = false; if (useCache) { - u32 id = dcid_ ^ gstate.getUVGenMode(); // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 - VertexArrayInfoDX9 *vai = vai_.Get(id); + // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 + u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); + VertexArrayInfoDX9 *vai = vai_.Get(dcid); if (!vai) { vai = new VertexArrayInfoDX9(); - vai_.Insert(id, vai); + vai_.Insert(dcid, vai); } switch (vai->status) { @@ -666,7 +667,6 @@ rotateVBO: numDrawCalls_ = 0; vertexCountInDrawCalls_ = 0; decodeCounter_ = 0; - dcid_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp index 1f55ad16d3..e77728eb08 100644 --- a/GPU/GLES/DrawEngineGLES.cpp +++ b/GPU/GLES/DrawEngineGLES.cpp @@ -248,7 +248,6 @@ void DrawEngineGLES::DoFlush() { numDrawCalls_ = 0; vertexCountInDrawCalls_ = 0; decodeCounter_ = 0; - dcid_ = 0; return; } @@ -483,7 +482,6 @@ bail: numDrawCalls_ = 0; vertexCountInDrawCalls_ = 0; decodeCounter_ = 0; - dcid_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 62bf89de8d..8676ba9e87 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -597,12 +597,14 @@ void DrawEngineVulkan::DoFlush() { } if (useCache) { + // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 + u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); + PROFILE_THIS_SCOPE("vcache"); - u32 id = dcid_ ^ gstate.getUVGenMode(); // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 - VertexArrayInfoVulkan *vai = vai_.Get(id); + VertexArrayInfoVulkan *vai = vai_.Get(dcid); if (!vai) { vai = new VertexArrayInfoVulkan(); - vai_.Insert(id, vai); + vai_.Insert(dcid, vai); } switch (vai->status) { @@ -1011,7 +1013,6 @@ void DrawEngineVulkan::DoFlush() { numDrawCalls_ = 0; vertexCountInDrawCalls_ = 0; decodeCounter_ = 0; - dcid_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); From d90671e87761e5bd527d9c12485a06edf3486adb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 5 Jun 2023 09:49:58 +0200 Subject: [PATCH 2/5] Add some comments. --- Common/GPU/Vulkan/VulkanMemory.h | 2 ++ GPU/Vulkan/DrawEngineVulkan.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/Common/GPU/Vulkan/VulkanMemory.h b/Common/GPU/Vulkan/VulkanMemory.h index 53f96ae689..9f51990691 100644 --- a/Common/GPU/Vulkan/VulkanMemory.h +++ b/Common/GPU/Vulkan/VulkanMemory.h @@ -135,6 +135,8 @@ public: return blocks_[curBlockIndex_].writePtr; } + // NOTE: If you can avoid this by writing the data directly into memory returned from Allocate, + // do so. Savings from avoiding memcpy can be significant. VkDeviceSize Push(const void *data, VkDeviceSize numBytes, int alignment, VkBuffer *vkbuf) { uint32_t bindOffset; uint8_t *ptr = Allocate(numBytes, alignment, vkbuf, &bindOffset); diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 8676ba9e87..254e6932ed 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -825,6 +825,8 @@ void DrawEngineVulkan::DoFlush() { VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf, tess); + // TODO: Can we avoid binding all three when not needed? Same below for hardware transform. + // Think this will require different descriptor set layouts. const uint32_t dynamicUBOOffsets[3] = { baseUBOOffset, lightUBOOffset, boneUBOOffset, }; From 468757b93a428e0141678786762e76c5eca99510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 5 Jun 2023 10:07:36 +0200 Subject: [PATCH 3/5] Add comment about possible UV scale/offset bug. Move loop-max to local. --- GPU/Common/DrawEngineCommon.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 554dd72bb3..eaf7b2ce52 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -74,17 +74,18 @@ VertexDecoder *DrawEngineCommon::GetVertexDecoder(u32 vtype) { int DrawEngineCommon::ComputeNumVertsToDecode() const { int vertsToDecode = 0; + int numDrawCalls = numDrawCalls_; if (drawCalls_[0].indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { - for (int i = 0; i < numDrawCalls_; i++) { + for (int i = 0; i < numDrawCalls; i++) { const DeferredDrawCall &dc = drawCalls_[i]; vertsToDecode += dc.vertexCount; } } else { // TODO: Share this computation with DecodeVertsStep? - for (int i = 0; i < numDrawCalls_; i++) { + for (int i = 0; i < numDrawCalls; i++) { const DeferredDrawCall &dc = drawCalls_[i]; int lastMatch = i; - const int total = numDrawCalls_; + const int total = numDrawCalls; int indexLowerBound = dc.indexLowerBound; int indexUpperBound = dc.indexUpperBound; for (int j = i + 1; j < total; ++j) { @@ -645,7 +646,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) { for (int j = i + 1; j < total; ++j) { if (drawCalls_[j].verts != dc.verts) break; - + // TODO: What if UV scale/offset changes between drawcalls here? indexLowerBound = std::min(indexLowerBound, (int)drawCalls_[j].indexLowerBound); indexUpperBound = std::max(indexUpperBound, (int)drawCalls_[j].indexUpperBound); lastMatch = j; From 2f90ec609391b6edd6c397a089fa82cabd42ba4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 5 Jun 2023 10:47:20 +0200 Subject: [PATCH 4/5] Breakout the vertex caching (just code cleanup) --- GPU/Vulkan/DrawEngineVulkan.cpp | 295 ++++++++++++++++---------------- GPU/Vulkan/DrawEngineVulkan.h | 2 + 2 files changed, 152 insertions(+), 145 deletions(-) diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 254e6932ed..86a6dd1d8e 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -552,6 +552,152 @@ void DrawEngineVulkan::Invalidate(InvalidationCallbackFlags flags) { } } +bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim, VkBuffer &vbuf, uint32_t &vbOffset, VkBuffer &ibuf, uint32_t &ibOffset, bool &useElements, bool forceIndexed) { + // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 + u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); + + PROFILE_THIS_SCOPE("vcache"); + VertexArrayInfoVulkan *vai = vai_.Get(dcid); + if (!vai) { + vai = new VertexArrayInfoVulkan(); + vai_.Insert(dcid, vai); + } + + switch (vai->status) { + case VertexArrayInfoVulkan::VAI_NEW: + { + // Haven't seen this one before. We don't actually upload the vertex data yet. + uint64_t dataHash = ComputeHash(); + vai->hash = dataHash; + vai->minihash = ComputeMiniHash(); + vai->status = VertexArrayInfoVulkan::VAI_HASHING; + vai->drawsUntilNextFullHash = 0; + DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); // writes to indexGen + vai->numVerts = indexGen.VertexCount(); + vai->prim = indexGen.Prim(); + vai->maxIndex = indexGen.MaxIndex(); + vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; + return true; + } + + // Hashing - still gaining confidence about the buffer. + // But if we get this far it's likely to be worth uploading the data. + case VertexArrayInfoVulkan::VAI_HASHING: + { + PROFILE_THIS_SCOPE("vcachehash"); + vai->numDraws++; + if (vai->lastFrame != gpuStats.numFlips) { + vai->numFrames++; + } + if (vai->drawsUntilNextFullHash == 0) { + // Let's try to skip a full hash if mini would fail. + const u32 newMiniHash = ComputeMiniHash(); + uint64_t newHash = vai->hash; + if (newMiniHash == vai->minihash) { + newHash = ComputeHash(); + } + if (newMiniHash != vai->minihash || newHash != vai->hash) { + MarkUnreliable(vai); + DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + return true; + } + if (vai->numVerts > 64) { + // exponential backoff up to 16 draws, then every 24 + vai->drawsUntilNextFullHash = std::min(24, vai->numFrames); + } else { + // Lower numbers seem much more likely to change. + vai->drawsUntilNextFullHash = 0; + } + // TODO: tweak + //if (vai->numFrames > 1000) { + // vai->status = VertexArrayInfo::VAI_RELIABLE; + //} + } else { + vai->drawsUntilNextFullHash--; + u32 newMiniHash = ComputeMiniHash(); + if (newMiniHash != vai->minihash) { + MarkUnreliable(vai); + DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + return true; + } + } + + if (!vai->vb) { + // Directly push to the vertex cache. + DecodeVertsToPushBuffer(vertexCache_, &vai->vbOffset, &vai->vb); + _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); + vai->numVerts = indexGen.VertexCount(); + vai->maxIndex = indexGen.MaxIndex(); + vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; + if (forceIndexed) { + vai->prim = indexGen.GeneralPrim(); + useElements = true; + } else { + vai->prim = indexGen.Prim(); + useElements = !indexGen.SeenOnlyPurePrims(); + if (!useElements && indexGen.PureCount()) { + vai->numVerts = indexGen.PureCount(); + } + } + + if (useElements) { + u32 size = sizeof(uint16_t) * indexGen.VertexCount(); + void *dest = vertexCache_->Allocate(size, 4, &vai->ib, &vai->ibOffset); + memcpy(dest, decIndex_, size); + } else { + vai->ib = VK_NULL_HANDLE; + vai->ibOffset = 0; + } + } else { + gpuStats.numCachedDrawCalls++; + useElements = vai->ib ? true : false; + gpuStats.numCachedVertsDrawn += vai->numVerts; + gstate_c.vertexFullAlpha = vai->flags & VAIVULKAN_FLAG_VERTEXFULLALPHA; + } + vbuf = vai->vb; + ibuf = vai->ib; + vbOffset = vai->vbOffset; + ibOffset = vai->ibOffset; + vertexCount = vai->numVerts; + prim = static_cast(vai->prim); + break; + } + + // Reliable - we don't even bother hashing anymore. Right now we don't go here until after a very long time. + case VertexArrayInfoVulkan::VAI_RELIABLE: + { + vai->numDraws++; + if (vai->lastFrame != gpuStats.numFlips) { + vai->numFrames++; + } + gpuStats.numCachedDrawCalls++; + gpuStats.numCachedVertsDrawn += vai->numVerts; + vbuf = vai->vb; + ibuf = vai->ib; + vbOffset = vai->vbOffset; + ibOffset = vai->ibOffset; + vertexCount = vai->numVerts; + prim = static_cast(vai->prim); + + gstate_c.vertexFullAlpha = vai->flags & VAIVULKAN_FLAG_VERTEXFULLALPHA; + break; + } + + case VertexArrayInfoVulkan::VAI_UNRELIABLE: + { + vai->numDraws++; + if (vai->lastFrame != gpuStats.numFlips) { + vai->numFrames++; + } + DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + return true; + } + default: + break; + } + return false; +} + // The inline wrapper in the header checks for numDrawCalls_ == 0 void DrawEngineVulkan::DoFlush() { VulkanRenderManager *renderManager = (VulkanRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER); @@ -595,150 +741,9 @@ void DrawEngineVulkan::DoFlush() { if (decOptions_.applySkinInDecode && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) { useCache = false; } - + bool useIndexGen = true; if (useCache) { - // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 - u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); - - PROFILE_THIS_SCOPE("vcache"); - VertexArrayInfoVulkan *vai = vai_.Get(dcid); - if (!vai) { - vai = new VertexArrayInfoVulkan(); - vai_.Insert(dcid, vai); - } - - switch (vai->status) { - case VertexArrayInfoVulkan::VAI_NEW: - { - // Haven't seen this one before. We don't actually upload the vertex data yet. - uint64_t dataHash = ComputeHash(); - vai->hash = dataHash; - vai->minihash = ComputeMiniHash(); - vai->status = VertexArrayInfoVulkan::VAI_HASHING; - vai->drawsUntilNextFullHash = 0; - DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); // writes to indexGen - vai->numVerts = indexGen.VertexCount(); - vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); - vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; - goto rotateVBO; - } - - // Hashing - still gaining confidence about the buffer. - // But if we get this far it's likely to be worth uploading the data. - case VertexArrayInfoVulkan::VAI_HASHING: - { - PROFILE_THIS_SCOPE("vcachehash"); - vai->numDraws++; - if (vai->lastFrame != gpuStats.numFlips) { - vai->numFrames++; - } - if (vai->drawsUntilNextFullHash == 0) { - // Let's try to skip a full hash if mini would fail. - const u32 newMiniHash = ComputeMiniHash(); - uint64_t newHash = vai->hash; - if (newMiniHash == vai->minihash) { - newHash = ComputeHash(); - } - if (newMiniHash != vai->minihash || newHash != vai->hash) { - MarkUnreliable(vai); - DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); - goto rotateVBO; - } - if (vai->numVerts > 64) { - // exponential backoff up to 16 draws, then every 24 - vai->drawsUntilNextFullHash = std::min(24, vai->numFrames); - } else { - // Lower numbers seem much more likely to change. - vai->drawsUntilNextFullHash = 0; - } - // TODO: tweak - //if (vai->numFrames > 1000) { - // vai->status = VertexArrayInfo::VAI_RELIABLE; - //} - } else { - vai->drawsUntilNextFullHash--; - u32 newMiniHash = ComputeMiniHash(); - if (newMiniHash != vai->minihash) { - MarkUnreliable(vai); - DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); - goto rotateVBO; - } - } - - if (!vai->vb) { - // Directly push to the vertex cache. - DecodeVertsToPushBuffer(vertexCache_, &vai->vbOffset, &vai->vb); - _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); - vai->numVerts = indexGen.VertexCount(); - vai->maxIndex = indexGen.MaxIndex(); - vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; - if (forceIndexed) { - vai->prim = indexGen.GeneralPrim(); - useElements = true; - } else { - vai->prim = indexGen.Prim(); - useElements = !indexGen.SeenOnlyPurePrims(); - if (!useElements && indexGen.PureCount()) { - vai->numVerts = indexGen.PureCount(); - } - } - - if (useElements) { - u32 size = sizeof(uint16_t) * indexGen.VertexCount(); - void *dest = vertexCache_->Allocate(size, 4, &vai->ib, &vai->ibOffset); - memcpy(dest, decIndex_, size); - } else { - vai->ib = VK_NULL_HANDLE; - vai->ibOffset = 0; - } - } else { - gpuStats.numCachedDrawCalls++; - useElements = vai->ib ? true : false; - gpuStats.numCachedVertsDrawn += vai->numVerts; - gstate_c.vertexFullAlpha = vai->flags & VAIVULKAN_FLAG_VERTEXFULLALPHA; - } - vbuf = vai->vb; - ibuf = vai->ib; - vbOffset = vai->vbOffset; - ibOffset = vai->ibOffset; - vertexCount = vai->numVerts; - prim = static_cast(vai->prim); - break; - } - - // Reliable - we don't even bother hashing anymore. Right now we don't go here until after a very long time. - case VertexArrayInfoVulkan::VAI_RELIABLE: - { - vai->numDraws++; - if (vai->lastFrame != gpuStats.numFlips) { - vai->numFrames++; - } - gpuStats.numCachedDrawCalls++; - gpuStats.numCachedVertsDrawn += vai->numVerts; - vbuf = vai->vb; - ibuf = vai->ib; - vbOffset = vai->vbOffset; - ibOffset = vai->ibOffset; - vertexCount = vai->numVerts; - prim = static_cast(vai->prim); - - gstate_c.vertexFullAlpha = vai->flags & VAIVULKAN_FLAG_VERTEXFULLALPHA; - break; - } - - case VertexArrayInfoVulkan::VAI_UNRELIABLE: - { - vai->numDraws++; - if (vai->lastFrame != gpuStats.numFlips) { - vai->numFrames++; - } - DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); - goto rotateVBO; - } - default: - break; - } + useIndexGen = VertexCacheLookup(vertexCount, prim, vbuf, vbOffset, ibuf, ibOffset, useElements, forceIndexed); } else { if (decOptions_.applySkinInDecode && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) { // If software skinning, we've already predecoded into "decoded". So push that content. @@ -749,10 +754,10 @@ void DrawEngineVulkan::DoFlush() { // Decode directly into the pushbuffer DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); } - - rotateVBO: gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); + } + if (useIndexGen) { vertexCount = indexGen.VertexCount(); if (forceIndexed) { useElements = true; diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h index 4f169a65ca..541263ab8a 100644 --- a/GPU/Vulkan/DrawEngineVulkan.h +++ b/GPU/Vulkan/DrawEngineVulkan.h @@ -222,6 +222,8 @@ private: void DestroyDeviceObjects(); + bool VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim, VkBuffer &vbuf, uint32_t &vbOffset, VkBuffer &ibuf, uint32_t &ibOffset, bool &useElements, bool forceIndexed); + void DecodeVertsToPushPool(VulkanPushPool *push, uint32_t *bindOffset, VkBuffer *vkbuf); void DecodeVertsToPushBuffer(VulkanPushBuffer *push, uint32_t *bindOffset, VkBuffer *vkbuf); From f5516d3248ed62ae96081c90519a70c683560b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 12 Jun 2023 14:24:20 +0200 Subject: [PATCH 5/5] Actually switch away from XXH to a custom hash, to de-risk --- GPU/Common/DrawEngineCommon.cpp | 33 +++++++++++++++++++++++---------- GPU/Common/DrawEngineCommon.h | 2 ++ GPU/Vulkan/DrawEngineVulkan.cpp | 3 ++- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index eaf7b2ce52..6421108401 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -783,16 +783,6 @@ uint64_t DrawEngineCommon::ComputeHash() { return fullhash; } -// Cheap bit scrambler from https://nullprogram.com/blog/2018/07/31/ -inline uint32_t lowbias32_r(uint32_t x) { - x ^= x >> 16; - x *= 0x43021123U; - x ^= x >> 15 ^ x >> 30; - x *= 0x1d69e2a5U; - x ^= x >> 16; - return x; -} - // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits. void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) { if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls_ >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { @@ -865,6 +855,29 @@ bool DrawEngineCommon::CanUseHardwareTessellation(GEPatchPrimType prim) { return false; } +// Cheap bit scrambler from https://nullprogram.com/blog/2018/07/31/ +inline uint32_t lowbias32_r(uint32_t x) { + x ^= x >> 16; + x *= 0x43021123U; + x ^= x >> 15 ^ x >> 30; + x *= 0x1d69e2a5U; + x ^= x >> 16; + return x; +} + +uint32_t DrawEngineCommon::ComputeDrawcallsHash() const { + uint32_t dcid = 0; + for (int i = 0; i < numDrawCalls_; i++) { + u32 dhash = dcid; + dhash = __rotl(dhash ^ (u32)(uintptr_t)drawCalls_[i].verts, 13); + dhash = __rotl(dhash ^ (u32)(uintptr_t)drawCalls_[i].inds, 19); + dhash = __rotl(dhash ^ (u32)drawCalls_[i].indexType, 7); + dhash = __rotl(dhash ^ (u32)drawCalls_[i].vertexCount, 11); + dcid = lowbias32_r(dhash ^ (u32)drawCalls_[i].prim); + } + return dcid; +} + void TessellationDataTransfer::CopyControlPoints(float *pos, float *tex, float *col, int posStride, int texStride, int colStride, const SimpleVertex *const *points, int size, u32 vertType) { bool hasColor = (vertType & GE_VTYPE_COL_MASK) != 0; bool hasTexCoord = (vertType & GE_VTYPE_TC_MASK) != 0; diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 0798c454b5..f20c411081 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -175,6 +175,8 @@ protected: } } + uint32_t ComputeDrawcallsHash() const; + bool useHWTransform_ = false; bool useHWTessellation_ = false; // Used to prevent unnecessary flushing in softgpu. diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 86a6dd1d8e..88a722cbcd 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -554,7 +554,8 @@ void DrawEngineVulkan::Invalidate(InvalidationCallbackFlags flags) { bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim, VkBuffer &vbuf, uint32_t &vbOffset, VkBuffer &ibuf, uint32_t &ibOffset, bool &useElements, bool forceIndexed) { // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 - u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); + // u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); + u32 dcid = ComputeDrawcallsHash() ^ gstate.getUVGenMode(); PROFILE_THIS_SCOPE("vcache"); VertexArrayInfoVulkan *vai = vai_.Get(dcid);