diff --git a/GPU/Software/BinManager.cpp b/GPU/Software/BinManager.cpp index 4d1233a617..faa1f648c0 100644 --- a/GPU/Software/BinManager.cpp +++ b/GPU/Software/BinManager.cpp @@ -20,6 +20,8 @@ #include #include "Common/Profiler/Profiler.h" #include "Common/Thread/ThreadManager.h" +#include "Common/TimeUtil.h" +#include "Core/System.h" #include "GPU/Software/BinManager.h" #include "GPU/Software/Rasterizer.h" #include "GPU/Software/RasterizerRectangle.h" @@ -87,7 +89,7 @@ static inline void DrawBinItem(const BinItem &item, const RasterizerState &state class DrawBinItemsTask : public Task { public: - DrawBinItemsTask(BinWaitable *notify, BinQueue &items, std::atomic &status, const BinQueue &states) + DrawBinItemsTask(BinWaitable *notify, BinManager::BinItemQueue &items, std::atomic &status, const BinManager::BinStateQueue &states) : notify_(notify), items_(items), status_(status), states_(states) { } @@ -113,9 +115,9 @@ private: } BinWaitable *notify_; - BinQueue &items_; + BinManager::BinItemQueue &items_; std::atomic &status_; - const BinQueue &states_; + const BinManager::BinStateQueue &states_; }; BinManager::BinManager() { @@ -127,6 +129,13 @@ BinManager::BinManager() { waitable_ = new BinWaitable(); for (auto &s : taskStatus_) s = false; + + int maxInitTasks = std::min(g_threadManager.GetNumLooperThreads(), MAX_POSSIBLE_TASKS); + for (int i = 0; i < maxInitTasks; ++i) + taskQueues_[i].Setup(); + states_.Setup(); + cluts_.Setup(); + queue_.Setup(); } BinManager::~BinManager() { @@ -136,7 +145,7 @@ BinManager::~BinManager() { void BinManager::UpdateState() { PROFILE_THIS_SCOPE("bin_state"); if (states_.Full()) - Flush(); + Flush("states"); stateIndex_ = (int)states_.Push(RasterizerState()); ComputeRasterizerState(&states_[stateIndex_]); states_[stateIndex_].samplerID.cached.clut = cluts_[clutIndex_].readable; @@ -165,12 +174,12 @@ void BinManager::UpdateState() { // We don't want to overlap wrong, so flush any pending. if (maxTasks_ != newMaxTasks) { maxTasks_ = newMaxTasks; - Flush(); + Flush("selfrender"); } // Our bin sizes are based on offset, so if that changes we have to flush. if (queueOffsetX_ != gstate.getOffsetX16() || queueOffsetY_ != gstate.getOffsetY16()) { - Flush(); + Flush("offset"); queueOffsetX_ = gstate.getOffsetX16(); queueOffsetY_ = gstate.getOffsetY16(); } @@ -179,7 +188,7 @@ void BinManager::UpdateState() { void BinManager::UpdateClut(const void *src) { PROFILE_THIS_SCOPE("bin_clut"); if (cluts_.Full()) - Flush(); + Flush("cluts"); clutIndex_ = (int)cluts_.Push(BinClut()); memcpy(cluts_[clutIndex_].readable, src, sizeof(BinClut)); } @@ -323,7 +332,10 @@ void BinManager::Drain() { } } -void BinManager::Flush() { +void BinManager::Flush(const char *reason) { + double st; + if (coreCollectDebugStats) + st = time_now_d(); Drain(); waitable_->Wait(); taskRanges_.clear(); @@ -341,6 +353,59 @@ void BinManager::Flush() { queueRange_.y2 = 0; queueOffsetX_ = -1; queueOffsetY_ = -1; + + if (coreCollectDebugStats) { + double et = time_now_d(); + flushReasonTimes_[reason] += et - st; + if (et - st > slowestFlushTime_) { + slowestFlushTime_ = et - st; + slowestFlushReason_ = reason; + } + } +} + +void BinManager::GetStats(char *buffer, size_t bufsize) { + double allTotal = 0.0; + double slowestTotalTime = 0.0; + const char *slowestTotalReason = nullptr; + for (auto &it : flushReasonTimes_) { + if (it.second > slowestTotalTime) { + slowestTotalTime = it.second; + slowestTotalReason = it.first; + } + allTotal += it.second; + } + + // Many games are 30 FPS, so check last frame too for better stats. + double recentTotal = allTotal; + double slowestRecentTime = slowestTotalTime; + const char *slowestRecentReason = slowestTotalReason; + for (auto &it : lastFlushReasonTimes_) { + if (it.second > slowestRecentTime) { + slowestRecentTime = it.second; + slowestRecentReason = it.first; + } + recentTotal += it.second; + } + + snprintf(buffer, bufsize, + "Slowest individual flush: %s (%0.4f)\n" + "Slowest frame flush: %s (%0.4f)\n" + "Slowest recent flush: %s (%0.4f)\n" + "Total flush time: %0.4f (%05.2f%%, last 2: %05.2f%%)\n", + slowestFlushReason_, slowestFlushTime_, + slowestTotalReason, slowestTotalTime, + slowestRecentReason, slowestRecentTime, + allTotal, allTotal * (6000.0 / 1.001), recentTotal * (3000.0 / 1.001)); + + constexpr int foo = sizeof(BinItem); +} + +void BinManager::ResetStats() { + lastFlushReasonTimes_ = std::move(flushReasonTimes_); + flushReasonTimes_.clear(); + slowestFlushReason_ = nullptr; + slowestFlushTime_ = 0.0; } inline BinCoords BinCoords::Intersect(const BinCoords &range) const { diff --git a/GPU/Software/BinManager.h b/GPU/Software/BinManager.h index eff0c4e6c9..883997ad10 100644 --- a/GPU/Software/BinManager.h +++ b/GPU/Software/BinManager.h @@ -18,10 +18,12 @@ #pragma once #include +#include #include "Common/Log.h" #include "GPU/Software/Rasterizer.h" struct BinWaitable; +class DrawBinItemsTask; enum class BinItemType { TRIANGLE, @@ -56,13 +58,16 @@ struct BinItem { template struct BinQueue { BinQueue() { - items_ = new T[N]; Reset(); } ~BinQueue() { delete [] items_; } + void Setup() { + items_ = new T[N]; + } + void Reset() { head_ = 0; tail_ = 0; @@ -166,17 +171,31 @@ public: void AddPoint(const VertexData &v0); void Drain(); - void Flush(); + void Flush(const char *reason); + + void GetStats(char *buffer, size_t bufsize); + void ResetStats(); + +protected: + static constexpr int MAX_POSSIBLE_TASKS = 64; + // This is about 1MB of state data. + static constexpr int QUEUED_STATES = 4096; + // These are 1KB each, so half an MB. + static constexpr int QUEUED_CLUTS = 512; + // About 320 KB, but we have usually 16 or less of them, so 5 MB - 20 MB. + static constexpr int QUEUED_PRIMS = 1024; + + typedef BinQueue BinStateQueue; + typedef BinQueue BinClutQueue; + typedef BinQueue BinItemQueue; private: - static constexpr int MAX_POSSIBLE_TASKS = 64; - - BinQueue states_; + BinStateQueue states_; int stateIndex_; - BinQueue cluts_; + BinClutQueue cluts_; int clutIndex_; BinCoords scissor_; - BinQueue queue_; + BinItemQueue queue_; BinCoords queueRange_; int queueOffsetX_ = -1; int queueOffsetY_ = -1; @@ -184,13 +203,20 @@ private: int maxTasks_ = 1; bool tasksSplit_ = false; std::vector taskRanges_; - BinQueue taskQueues_[MAX_POSSIBLE_TASKS]; + BinItemQueue taskQueues_[MAX_POSSIBLE_TASKS]; std::atomic taskStatus_[MAX_POSSIBLE_TASKS]; BinWaitable *waitable_ = nullptr; + std::unordered_map flushReasonTimes_; + std::unordered_map lastFlushReasonTimes_; + const char *slowestFlushReason_ = nullptr; + double slowestFlushTime_ = 0.0; + BinCoords Scissor(BinCoords range); BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2); BinCoords Range(const VertexData &v0, const VertexData &v1); BinCoords Range(const VertexData &v0); void Expand(const BinCoords &range); + + friend class DrawBinItemsTask; }; diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index 7e786916ee..f1c1d7409c 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -900,7 +900,7 @@ void SoftGPU::FinishDeferred() { } void SoftGPU::GetStats(char *buffer, size_t bufsize) { - snprintf(buffer, bufsize, "SoftGPU: (N/A)"); + drawEngine_->transformUnit.GetStats(buffer, bufsize); } void SoftGPU::InvalidateCache(u32 addr, int size, GPUInvalidationType type) diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 5736f37ddd..e020f0a7f4 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -609,10 +609,16 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy } void TransformUnit::Flush(const char *reason) { - binner_->Flush(); + binner_->Flush(reason); GPUDebug::NotifyDraw(); } +void TransformUnit::GetStats(char *buffer, size_t bufsize) { + // TODO: More stats? + binner_->GetStats(buffer, bufsize); + binner_->ResetStats(); +} + void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz) { if (!Memory::IsVRAMAddress(addr)) return; diff --git a/GPU/Software/TransformUnit.h b/GPU/Software/TransformUnit.h index 79bd8bfbd5..0c277b21bc 100644 --- a/GPU/Software/TransformUnit.h +++ b/GPU/Software/TransformUnit.h @@ -123,6 +123,8 @@ public: void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz); void NotifyClutUpdate(const void *src); + void GetStats(char *buffer, size_t bufsize); + private: VertexData ReadVertex(VertexReader &vreader, bool &outside_range_flag);