From 88e8f952931542bc3b9db0228ea7021b77be64b8 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 20 Aug 2022 23:22:21 -0700 Subject: [PATCH] softgpu: Flush on transfer to pending tex read. Potentially could use these for self-render, but so far we should be detecting that so leaving it alone. --- GPU/Software/BinManager.cpp | 60 ++++++++++++++++++++++++++++++++-- GPU/Software/BinManager.h | 5 +++ GPU/Software/SoftGpu.cpp | 6 ++-- GPU/Software/TransformUnit.cpp | 4 ++- GPU/Software/TransformUnit.h | 2 +- 5 files changed, 69 insertions(+), 8 deletions(-) diff --git a/GPU/Software/BinManager.cpp b/GPU/Software/BinManager.cpp index 4005e42945..7f8f1ad453 100644 --- a/GPU/Software/BinManager.cpp +++ b/GPU/Software/BinManager.cpp @@ -210,6 +210,9 @@ void BinManager::UpdateState() { } if (HasDirty(SoftDirty::BINNER_OVERLAP)) { + // This is a good place to record any dependencies for block transfer overlap. + MarkPendingReads(state); + // Disallow threads when rendering to the target, even offset. bool selfRender = HasTextureWrite(state); int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads(); @@ -251,6 +254,34 @@ bool BinManager::HasTextureWrite(const RasterizerState &state) { return false; } +void BinManager::MarkPendingReads(const Rasterizer::RasterizerState &state) { + if (!state.enableTextures) + return; + + const uint8_t textureBits = textureBitsPerPixel[state.samplerID.texfmt]; + for (int i = 0; i <= state.maxTexLevel; ++i) { + uint32_t byteStride = (state.texbufw[i] * textureBits) / 8; + uint32_t byteWidth = (state.samplerID.cached.sizes[i].w * textureBits) / 8; + uint32_t h = state.samplerID.cached.sizes[i].h; + auto it = pendingReads_.find(state.texaddr[i]); + if (it != pendingReads_.end()) { + uint32_t total = byteStride * (h - 1) + byteWidth; + uint32_t existing = it->second.strideBytes * (it->second.height - 1) + it->second.widthBytes; + if (existing < total) { + it->second.strideBytes = std::max(it->second.strideBytes, byteStride); + it->second.widthBytes = std::max(it->second.widthBytes, byteWidth); + it->second.height = std::max(it->second.height, h); + } + } else { + auto &range = pendingReads_[state.texaddr[i]]; + range.base = state.texaddr[i]; + range.strideBytes = byteStride; + range.widthBytes = byteWidth; + range.height = h; + } + } +} + inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) { const uint32_t w = br.x - tl.x + 1; const uint32_t h = br.y - tl.y + 1; @@ -465,9 +496,10 @@ void BinManager::Flush(const char *reason) { for (auto &pending : pendingWrites_) pending.base = 0; pendingOverlap_ = false; + pendingReads_.clear(); - // We'll need to set the pending writes again, since we just flushed it. - dirty_ |= SoftDirty::BINNER_RANGE; + // We'll need to set the pending writes and reads again, since we just flushed it. + dirty_ |= SoftDirty::BINNER_RANGE | SoftDirty::BINNER_OVERLAP; if (coreCollectDebugStats) { double et = time_now_d(); @@ -486,7 +518,7 @@ bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, ui // Ignore mirrors for overlap detection. start &= 0x0FFFFFFF & ~0x00600000; - uint32_t size = stride * h; + uint32_t size = stride * (h - 1) + w; for (const auto &range : pendingWrites_) { if (range.base == 0 || range.strideBytes == 0) continue; @@ -512,6 +544,28 @@ bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, ui return false; } +bool BinManager::HasPendingRead(uint32_t start, uint32_t stride, uint32_t w, uint32_t h) { + if (Memory::IsVRAMAddress(start)) { + // Ignore VRAM mirrors. + start &= 0x0FFFFFFF & ~0x00600000; + } else { + // Ignore only regular RAM mirrors. + start &= 0x3FFFFFFF; + } + + uint32_t size = stride * (h - 1) + w; + for (const auto &pair : pendingReads_) { + const auto &range = pair.second; + if (start >= range.base + range.height * range.strideBytes || start + size <= range.base) + continue; + + // Stride gaps are uncommon with reads, so don't bother. + return true; + } + + return false; +} + void BinManager::GetStats(char *buffer, size_t bufsize) { double allTotal = 0.0; double slowestTotalTime = 0.0; diff --git a/GPU/Software/BinManager.h b/GPU/Software/BinManager.h index 44ae81e2b2..01b6837d20 100644 --- a/GPU/Software/BinManager.h +++ b/GPU/Software/BinManager.h @@ -198,6 +198,8 @@ public: void Drain(); void Flush(const char *reason); bool HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h); + // Assumes you've also checked for a write (writes are partial so are automatically reads.) + bool HasPendingRead(uint32_t start, uint32_t stride, uint32_t w, uint32_t h); void GetStats(char *buffer, size_t bufsize); void ResetStats(); @@ -252,6 +254,8 @@ private: BinWaitable *waitable_ = nullptr; BinDirtyRange pendingWrites_[2]{}; + std::unordered_map pendingReads_; + bool pendingOverlap_ = false; std::unordered_map flushReasonTimes_; @@ -262,6 +266,7 @@ private: int enqueues_ = 0; int mostThreads_ = 0; + void MarkPendingReads(const Rasterizer::RasterizerState &state); bool HasTextureWrite(const Rasterizer::RasterizerState &state); BinCoords Scissor(BinCoords range); BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2); diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index 8015a9db05..b677a5ec9a 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -787,8 +787,8 @@ void SoftGPU::Execute_BlockTransferStart(u32 op, u32 diff) { const uint32_t dstSize = height * dstStride * bpp; // Need to flush both source and target, so we overwrite properly. - drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcStride, width * bpp, height); - drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstStride, width * bpp, height); + drawEngine_->transformUnit.FlushIfOverlap("blockxfer", false, src, srcStride, width * bpp, height); + drawEngine_->transformUnit.FlushIfOverlap("blockxfer", true, dst, dstStride, width * bpp, height); DEBUG_LOG(G3D, "Block transfer: %08x/%x -> %08x/%x, %ix%ix%i (%i,%i)->(%i,%i)", srcBasePtr, srcStride, dstBasePtr, dstStride, width, height, bpp, srcX, srcY, dstX, dstY); @@ -974,7 +974,7 @@ void SoftGPU::Execute_LoadClut(u32 op, u32 diff) { u32 clutTotalBytes = gstate.getClutLoadBytes(); // Might be copying drawing into the CLUT, so flush. - drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes, clutTotalBytes, 1); + drawEngine_->transformUnit.FlushIfOverlap("loadclut", false, clutAddr, clutTotalBytes, clutTotalBytes, 1); bool changed = false; if (Memory::IsValidAddress(clutAddr)) { diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index b765df8404..db50f48bb6 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -797,9 +797,11 @@ void TransformUnit::GetStats(char *buffer, size_t bufsize) { binner_->GetStats(buffer, bufsize); } -void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) { +void TransformUnit::FlushIfOverlap(const char *reason, bool modifying, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) { if (binner_->HasPendingWrite(addr, stride, w, h)) Flush(reason); + if (modifying && binner_->HasPendingRead(addr, stride, w, h)) + Flush(reason); } void TransformUnit::NotifyClutUpdate(const void *src) { diff --git a/GPU/Software/TransformUnit.h b/GPU/Software/TransformUnit.h index 9f17ed1197..54f9fb5f5e 100644 --- a/GPU/Software/TransformUnit.h +++ b/GPU/Software/TransformUnit.h @@ -123,7 +123,7 @@ public: bool GetCurrentSimpleVertices(int count, std::vector &vertices, std::vector &indices); void Flush(const char *reason); - void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h); + void FlushIfOverlap(const char *reason, bool modifying, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h); void NotifyClutUpdate(const void *src); void GetStats(char *buffer, size_t bufsize);