From d95475e021b45d3aa0a09b5510668e5bff5e580a Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 16 Jan 2022 09:07:33 -0800 Subject: [PATCH 1/3] softgpu: Expose flush reasons/times in debug stats. --- GPU/Software/BinManager.cpp | 51 ++++++++++++++++++++++++++++++---- GPU/Software/BinManager.h | 10 ++++++- GPU/Software/SoftGpu.cpp | 2 +- GPU/Software/TransformUnit.cpp | 8 +++++- GPU/Software/TransformUnit.h | 2 ++ 5 files changed, 65 insertions(+), 8 deletions(-) diff --git a/GPU/Software/BinManager.cpp b/GPU/Software/BinManager.cpp index 4d1233a617..5947bba6b7 100644 --- a/GPU/Software/BinManager.cpp +++ b/GPU/Software/BinManager.cpp @@ -20,6 +20,8 @@ #include #include "Common/Profiler/Profiler.h" #include "Common/Thread/ThreadManager.h" +#include "Common/TimeUtil.h" +#include "Core/System.h" #include "GPU/Software/BinManager.h" #include "GPU/Software/Rasterizer.h" #include "GPU/Software/RasterizerRectangle.h" @@ -136,7 +138,7 @@ BinManager::~BinManager() { void BinManager::UpdateState() { PROFILE_THIS_SCOPE("bin_state"); if (states_.Full()) - Flush(); + Flush("states"); stateIndex_ = (int)states_.Push(RasterizerState()); ComputeRasterizerState(&states_[stateIndex_]); states_[stateIndex_].samplerID.cached.clut = cluts_[clutIndex_].readable; @@ -165,12 +167,12 @@ void BinManager::UpdateState() { // We don't want to overlap wrong, so flush any pending. if (maxTasks_ != newMaxTasks) { maxTasks_ = newMaxTasks; - Flush(); + Flush("selfrender"); } // Our bin sizes are based on offset, so if that changes we have to flush. if (queueOffsetX_ != gstate.getOffsetX16() || queueOffsetY_ != gstate.getOffsetY16()) { - Flush(); + Flush("offset"); queueOffsetX_ = gstate.getOffsetX16(); queueOffsetY_ = gstate.getOffsetY16(); } @@ -179,7 +181,7 @@ void BinManager::UpdateState() { void BinManager::UpdateClut(const void *src) { PROFILE_THIS_SCOPE("bin_clut"); if (cluts_.Full()) - Flush(); + Flush("cluts"); clutIndex_ = (int)cluts_.Push(BinClut()); memcpy(cluts_[clutIndex_].readable, src, sizeof(BinClut)); } @@ -323,7 +325,10 @@ void BinManager::Drain() { } } -void BinManager::Flush() { +void BinManager::Flush(const char *reason) { + double st; + if (coreCollectDebugStats) + st = time_now_d(); Drain(); waitable_->Wait(); taskRanges_.clear(); @@ -341,6 +346,42 @@ void BinManager::Flush() { queueRange_.y2 = 0; queueOffsetX_ = -1; queueOffsetY_ = -1; + + if (coreCollectDebugStats) { + double et = time_now_d(); + flushReasonTimes_[reason] += et - st; + if (et - st > slowestFlushTime_) { + slowestFlushTime_ = et - st; + slowestFlushReason_ = reason; + } + } +} + +void BinManager::GetStats(char *buffer, size_t bufsize) { + double allTotal = 0.0; + double slowestTotalTime = 0.0; + const char *slowestTotalReason = nullptr; + for (auto &it : flushReasonTimes_) { + if (it.second > slowestTotalTime) { + slowestTotalTime = it.second; + slowestTotalReason = it.first; + } + allTotal += it.second; + } + + snprintf(buffer, bufsize, + "Slowest individual flush: %s (%0.4f)\n" + "Slowest total flush: %s (%0.4f)\n" + "Total flush time: %0.4f\n", + slowestFlushReason_, slowestFlushTime_, + slowestTotalReason, slowestTotalTime, + allTotal); +} + +void BinManager::ResetStats() { + flushReasonTimes_.clear(); + slowestFlushReason_ = nullptr; + slowestFlushTime_ = 0.0; } inline BinCoords BinCoords::Intersect(const BinCoords &range) const { diff --git a/GPU/Software/BinManager.h b/GPU/Software/BinManager.h index eff0c4e6c9..253084dda7 100644 --- a/GPU/Software/BinManager.h +++ b/GPU/Software/BinManager.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include "Common/Log.h" #include "GPU/Software/Rasterizer.h" @@ -166,7 +167,10 @@ public: void AddPoint(const VertexData &v0); void Drain(); - void Flush(); + void Flush(const char *reason); + + void GetStats(char *buffer, size_t bufsize); + void ResetStats(); private: static constexpr int MAX_POSSIBLE_TASKS = 64; @@ -188,6 +192,10 @@ private: std::atomic taskStatus_[MAX_POSSIBLE_TASKS]; BinWaitable *waitable_ = nullptr; + std::unordered_map flushReasonTimes_; + const char *slowestFlushReason_ = nullptr; + double slowestFlushTime_ = 0.0; + BinCoords Scissor(BinCoords range); BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2); BinCoords Range(const VertexData &v0, const VertexData &v1); diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index 7e786916ee..f1c1d7409c 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -900,7 +900,7 @@ void SoftGPU::FinishDeferred() { } void SoftGPU::GetStats(char *buffer, size_t bufsize) { - snprintf(buffer, bufsize, "SoftGPU: (N/A)"); + drawEngine_->transformUnit.GetStats(buffer, bufsize); } void SoftGPU::InvalidateCache(u32 addr, int size, GPUInvalidationType type) diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 5736f37ddd..e020f0a7f4 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -609,10 +609,16 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy } void TransformUnit::Flush(const char *reason) { - binner_->Flush(); + binner_->Flush(reason); GPUDebug::NotifyDraw(); } +void TransformUnit::GetStats(char *buffer, size_t bufsize) { + // TODO: More stats? + binner_->GetStats(buffer, bufsize); + binner_->ResetStats(); +} + void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz) { if (!Memory::IsVRAMAddress(addr)) return; diff --git a/GPU/Software/TransformUnit.h b/GPU/Software/TransformUnit.h index 79bd8bfbd5..0c277b21bc 100644 --- a/GPU/Software/TransformUnit.h +++ b/GPU/Software/TransformUnit.h @@ -123,6 +123,8 @@ public: void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz); void NotifyClutUpdate(const void *src); + void GetStats(char *buffer, size_t bufsize); + private: VertexData ReadVertex(VertexReader &vreader, bool &outside_range_flag); From cb5ac04d1691e1002150c1750deca786a9405e7e Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 16 Jan 2022 09:30:02 -0800 Subject: [PATCH 2/3] softgpu: Tune some queue sizes for perf. Using a chunk of RAM for this, but mostly with many threads. --- GPU/Software/BinManager.cpp | 29 +++++++++++++++++++++++------ GPU/Software/BinManager.h | 25 ++++++++++++++++++++----- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/GPU/Software/BinManager.cpp b/GPU/Software/BinManager.cpp index 5947bba6b7..16805578fb 100644 --- a/GPU/Software/BinManager.cpp +++ b/GPU/Software/BinManager.cpp @@ -89,7 +89,7 @@ static inline void DrawBinItem(const BinItem &item, const RasterizerState &state class DrawBinItemsTask : public Task { public: - DrawBinItemsTask(BinWaitable *notify, BinQueue &items, std::atomic &status, const BinQueue &states) + DrawBinItemsTask(BinWaitable *notify, BinManager::BinItemQueue &items, std::atomic &status, const BinManager::BinStateQueue &states) : notify_(notify), items_(items), status_(status), states_(states) { } @@ -115,9 +115,9 @@ private: } BinWaitable *notify_; - BinQueue &items_; + BinManager::BinItemQueue &items_; std::atomic &status_; - const BinQueue &states_; + const BinManager::BinStateQueue &states_; }; BinManager::BinManager() { @@ -369,16 +369,33 @@ void BinManager::GetStats(char *buffer, size_t bufsize) { allTotal += it.second; } + // Many games are 30 FPS, so check last frame too for better stats. + double recentTotal = allTotal; + double slowestRecentTime = slowestTotalTime; + const char *slowestRecentReason = slowestTotalReason; + for (auto &it : lastFlushReasonTimes_) { + if (it.second > slowestRecentTime) { + slowestRecentTime = it.second; + slowestRecentReason = it.first; + } + recentTotal += it.second; + } + snprintf(buffer, bufsize, "Slowest individual flush: %s (%0.4f)\n" - "Slowest total flush: %s (%0.4f)\n" - "Total flush time: %0.4f\n", + "Slowest frame flush: %s (%0.4f)\n" + "Slowest recent flush: %s (%0.4f)\n" + "Total flush time: %0.4f (%05.2f%%, last 2: %05.2f%%)\n", slowestFlushReason_, slowestFlushTime_, slowestTotalReason, slowestTotalTime, - allTotal); + slowestRecentReason, slowestRecentTime, + allTotal, allTotal * (6000.0 / 1.001), recentTotal * (3000.0 / 1.001)); + + constexpr int foo = sizeof(BinItem); } void BinManager::ResetStats() { + lastFlushReasonTimes_ = std::move(flushReasonTimes_); flushReasonTimes_.clear(); slowestFlushReason_ = nullptr; slowestFlushTime_ = 0.0; diff --git a/GPU/Software/BinManager.h b/GPU/Software/BinManager.h index 253084dda7..bcb674bc2a 100644 --- a/GPU/Software/BinManager.h +++ b/GPU/Software/BinManager.h @@ -23,6 +23,7 @@ #include "GPU/Software/Rasterizer.h" struct BinWaitable; +class DrawBinItemsTask; enum class BinItemType { TRIANGLE, @@ -172,15 +173,26 @@ public: void GetStats(char *buffer, size_t bufsize); void ResetStats(); -private: +protected: static constexpr int MAX_POSSIBLE_TASKS = 64; + // This is about 1MB of state data. + static constexpr int QUEUED_STATES = 4096; + // These are 1KB each, so half an MB. + static constexpr int QUEUED_CLUTS = 512; + // About 320 KB, but we have 64 of them, so 20 MB (most not likely active.) + static constexpr int QUEUED_PRIMS = 1024; - BinQueue states_; + typedef BinQueue BinStateQueue; + typedef BinQueue BinClutQueue; + typedef BinQueue BinItemQueue; + +private: + BinStateQueue states_; int stateIndex_; - BinQueue cluts_; + BinClutQueue cluts_; int clutIndex_; BinCoords scissor_; - BinQueue queue_; + BinItemQueue queue_; BinCoords queueRange_; int queueOffsetX_ = -1; int queueOffsetY_ = -1; @@ -188,11 +200,12 @@ private: int maxTasks_ = 1; bool tasksSplit_ = false; std::vector taskRanges_; - BinQueue taskQueues_[MAX_POSSIBLE_TASKS]; + BinItemQueue taskQueues_[MAX_POSSIBLE_TASKS]; std::atomic taskStatus_[MAX_POSSIBLE_TASKS]; BinWaitable *waitable_ = nullptr; std::unordered_map flushReasonTimes_; + std::unordered_map lastFlushReasonTimes_; const char *slowestFlushReason_ = nullptr; double slowestFlushTime_ = 0.0; @@ -201,4 +214,6 @@ private: BinCoords Range(const VertexData &v0, const VertexData &v1); BinCoords Range(const VertexData &v0); void Expand(const BinCoords &range); + + friend class DrawBinItemsTask; }; From 1764111a4bc248b0ab1a5ceb8f8b40fb64267b37 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 16 Jan 2022 11:49:41 -0800 Subject: [PATCH 3/3] softgpu: Reduce wasted memory. --- GPU/Software/BinManager.cpp | 7 +++++++ GPU/Software/BinManager.h | 7 +++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/GPU/Software/BinManager.cpp b/GPU/Software/BinManager.cpp index 16805578fb..faa1f648c0 100644 --- a/GPU/Software/BinManager.cpp +++ b/GPU/Software/BinManager.cpp @@ -129,6 +129,13 @@ BinManager::BinManager() { waitable_ = new BinWaitable(); for (auto &s : taskStatus_) s = false; + + int maxInitTasks = std::min(g_threadManager.GetNumLooperThreads(), MAX_POSSIBLE_TASKS); + for (int i = 0; i < maxInitTasks; ++i) + taskQueues_[i].Setup(); + states_.Setup(); + cluts_.Setup(); + queue_.Setup(); } BinManager::~BinManager() { diff --git a/GPU/Software/BinManager.h b/GPU/Software/BinManager.h index bcb674bc2a..883997ad10 100644 --- a/GPU/Software/BinManager.h +++ b/GPU/Software/BinManager.h @@ -58,13 +58,16 @@ struct BinItem { template struct BinQueue { BinQueue() { - items_ = new T[N]; Reset(); } ~BinQueue() { delete [] items_; } + void Setup() { + items_ = new T[N]; + } + void Reset() { head_ = 0; tail_ = 0; @@ -179,7 +182,7 @@ protected: static constexpr int QUEUED_STATES = 4096; // These are 1KB each, so half an MB. static constexpr int QUEUED_CLUTS = 512; - // About 320 KB, but we have 64 of them, so 20 MB (most not likely active.) + // About 320 KB, but we have usually 16 or less of them, so 5 MB - 20 MB. static constexpr int QUEUED_PRIMS = 1024; typedef BinQueue BinStateQueue;