Merge pull request #15873 from unknownbrackets/softgpu-xfer-hazard

softgpu: Flush on transfer to pending tex read
2025-04-02 11:01:50 -04:00 · 2022-08-21 10:04:27 +02:00 · 2022-08-21 10:04:27 +02:00 · 5f30c88e38
commit 5f30c88e38
parent 5ad1301ee8 88e8f95293
5 changed files with 69 additions and 8 deletions
--- a/GPU/Software/BinManager.cpp
+++ b/GPU/Software/BinManager.cpp
@ -210,6 +210,9 @@ void BinManager::UpdateState() {
 	}

 	if (HasDirty(SoftDirty::BINNER_OVERLAP)) {
+		// This is a good place to record any dependencies for block transfer overlap.
+		MarkPendingReads(state);
+
 		// Disallow threads when rendering to the target, even offset.
 		bool selfRender = HasTextureWrite(state);
 		int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads();
@ -251,6 +254,34 @@ bool BinManager::HasTextureWrite(const RasterizerState &state) {
 	return false;
 }

+// Records the memory range of each texture level the given state samples from into
+// pendingReads_, so that a later overwrite of that memory (such as a block transfer)
+// can be detected by HasPendingRead() and force a flush first.
+//
+// Entries are keyed by the level's base address with mirror bits stripped, using the
+// same masks HasPendingRead() applies to its query address.  Storing the raw address
+// would make a texture bound through a mirror invisible to that comparison.
+void BinManager::MarkPendingReads(const Rasterizer::RasterizerState &state) {
+	// Nothing is sampled when texturing is off, so there is nothing to record.
+	if (!state.enableTextures)
+		return;
+
+	const uint8_t textureBits = textureBitsPerPixel[state.samplerID.texfmt];
+	for (int i = 0; i <= state.maxTexLevel; ++i) {
+		// Normalize exactly like HasPendingRead() does for the address it is given.
+		uint32_t base = state.texaddr[i];
+		if (Memory::IsVRAMAddress(base))
+			base &= 0x0FFFFFFF & ~0x00600000;
+		else
+			base &= 0x3FFFFFFF;
+
+		const uint32_t byteStride = (state.texbufw[i] * textureBits) / 8;
+		const uint32_t byteWidth = (state.samplerID.cached.sizes[i].w * textureBits) / 8;
+		const uint32_t h = state.samplerID.cached.sizes[i].h;
+
+		// One lookup: creates a blank entry when this base has not been recorded yet.
+		auto inserted = pendingReads_.emplace(base, BinDirtyRange{});
+		BinDirtyRange &range = inserted.first->second;
+		if (inserted.second) {
+			range.base = base;
+			range.strideBytes = byteStride;
+			range.widthBytes = byteWidth;
+			range.height = h;
+			continue;
+		}
+
+		// Already tracked: only grow the entry when the new usage reaches further in memory.
+		const uint32_t total = byteStride * (h - 1) + byteWidth;
+		const uint32_t existing = range.strideBytes * (range.height - 1) + range.widthBytes;
+		if (existing < total) {
+			range.strideBytes = std::max(range.strideBytes, byteStride);
+			range.widthBytes = std::max(range.widthBytes, byteWidth);
+			range.height = std::max(range.height, h);
+		}
+	}
+}
+
 inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) {
 	const uint32_t w = br.x - tl.x + 1;
 	const uint32_t h = br.y - tl.y + 1;
@ -465,9 +496,10 @@ void BinManager::Flush(const char *reason) {
 	for (auto &pending : pendingWrites_)
 		pending.base = 0;
 	pendingOverlap_ = false;
+	pendingReads_.clear();

-	// We'll need to set the pending writes again, since we just flushed it.
-	dirty_ |= SoftDirty::BINNER_RANGE;
+	// We'll need to set the pending writes and reads again, since we just flushed them.
+	dirty_ |= SoftDirty::BINNER_RANGE | SoftDirty::BINNER_OVERLAP;

 	if (coreCollectDebugStats) {
 		double et = time_now_d();
@ -486,7 +518,7 @@ bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, ui
 	// Ignore mirrors for overlap detection.
 	start &= 0x0FFFFFFF & ~0x00600000;

-	uint32_t size = stride * h;
+	uint32_t size = stride * (h - 1) + w;
 	for (const auto &range : pendingWrites_) {
 		if (range.base == 0 || range.strideBytes == 0)
 			continue;
@ -512,6 +544,28 @@ bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, ui
 	return false;
 }

+// Returns true when the region described by start/stride/w/h may overlap memory that is
+// recorded in pendingReads_ (texture data queued draws still need to read).
+//
+// start is the region's base address, h its row count.  The region's extent is computed as
+// stride * (h - 1) + w, so stride and w must share a unit with the recorded byte ranges.
+// NOTE(review): confirm callers pass stride in bytes rather than pixels.
+bool BinManager::HasPendingRead(uint32_t start, uint32_t stride, uint32_t w, uint32_t h) {
+	// A region with no rows touches nothing; also avoids unsigned underflow in (h - 1) below.
+	if (h == 0)
+		return false;
+
+	if (Memory::IsVRAMAddress(start)) {
+		// Ignore VRAM mirrors.
+		start &= 0x0FFFFFFF & ~0x00600000;
+	} else {
+		// Ignore only regular RAM mirrors.
+		start &= 0x3FFFFFFF;
+	}
+
+	// The final row only spans w, not a whole stride.
+	const uint32_t size = stride * (h - 1) + w;
+	for (const auto &pair : pendingReads_) {
+		const auto &range = pair.second;
+
+		// Treat every row as at least a full stride (gaps included), but also cover the tail of
+		// the last row when a row is wider than the stride or the stride is zero.  Using only
+		// height * strideBytes would end the range too early in those cases.
+		const uint32_t lastRowBytes = std::max(range.strideBytes, range.widthBytes);
+		const uint32_t rangeSize = range.height == 0 ? 0 : range.strideBytes * (range.height - 1) + lastRowBytes;
+		if (start >= range.base + rangeSize || start + size <= range.base)
+			continue;
+
+		// Stride gaps are uncommon with reads, so don't bother.
+		return true;
+	}
+
+	return false;
+}
+
 void BinManager::GetStats(char *buffer, size_t bufsize) {
 	double allTotal = 0.0;
 	double slowestTotalTime = 0.0;
--- a/GPU/Software/BinManager.h
+++ b/GPU/Software/BinManager.h
@ -198,6 +198,8 @@ public:
 	void Drain();
 	void Flush(const char *reason);
 	bool HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h);
+	// Assumes you've also checked for a write (writes are partial, so they are automatically reads).
+	bool HasPendingRead(uint32_t start, uint32_t stride, uint32_t w, uint32_t h);

 	void GetStats(char *buffer, size_t bufsize);
 	void ResetStats();
@ -252,6 +254,8 @@ private:
 	BinWaitable *waitable_ = nullptr;

 	BinDirtyRange pendingWrites_[2]{};
+	std::unordered_map<uint32_t, BinDirtyRange> pendingReads_;
+
 	bool pendingOverlap_ = false;

 	std::unordered_map<const char *, double> flushReasonTimes_;
@ -262,6 +266,7 @@ private:
 	int enqueues_ = 0;
 	int mostThreads_ = 0;

+	void MarkPendingReads(const Rasterizer::RasterizerState &state);
 	bool HasTextureWrite(const Rasterizer::RasterizerState &state);
 	BinCoords Scissor(BinCoords range);
 	BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2);
--- a/GPU/Software/SoftGpu.cpp
+++ b/GPU/Software/SoftGpu.cpp
@ -787,8 +787,8 @@ void SoftGPU::Execute_BlockTransferStart(u32 op, u32 diff) {
 	const uint32_t dstSize = height * dstStride * bpp;

 	// Need to flush both source and target, so we overwrite properly.
-	drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcStride, width * bpp, height);
-	drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstStride, width * bpp, height);
+	drawEngine_->transformUnit.FlushIfOverlap("blockxfer", false, src, srcStride, width * bpp, height);
+	drawEngine_->transformUnit.FlushIfOverlap("blockxfer", true, dst, dstStride, width * bpp, height);

 	DEBUG_LOG(G3D, "Block transfer: %08x/%x -> %08x/%x, %ix%ix%i (%i,%i)->(%i,%i)", srcBasePtr, srcStride, dstBasePtr, dstStride, width, height, bpp, srcX, srcY, dstX, dstY);

@ -977,7 +977,7 @@ void SoftGPU::Execute_LoadClut(u32 op, u32 diff) {
 		clutTotalBytes = 1024;

 	// Might be copying drawing into the CLUT, so flush.
-	drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes, clutTotalBytes, 1);
+	drawEngine_->transformUnit.FlushIfOverlap("loadclut", false, clutAddr, clutTotalBytes, clutTotalBytes, 1);

 	bool changed = false;
 	if (Memory::IsValidAddress(clutAddr)) {
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@ -797,9 +797,11 @@ void TransformUnit::GetStats(char *buffer, size_t bufsize) {
 	binner_->GetStats(buffer, bufsize);
 }

-void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) {
+void TransformUnit::FlushIfOverlap(const char *reason, bool modifying, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) {  // Flushes (tagged with reason) when the region addr/stride/w/h conflicts with binned work; modifying is true when the caller is about to write the region.
 	if (binner_->HasPendingWrite(addr, stride, w, h))  // Checked on every call, whether the caller will read or write the region.
 		Flush(reason);
+	if (modifying && binner_->HasPendingRead(addr, stride, w, h))  // A pending read is only checked when the caller will modify the region.
+		Flush(reason);
 }

 void TransformUnit::NotifyClutUpdate(const void *src) {
--- a/GPU/Software/TransformUnit.h
+++ b/GPU/Software/TransformUnit.h
@ -123,7 +123,7 @@ public:
 	bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);

 	void Flush(const char *reason);
-	void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h);
+	void FlushIfOverlap(const char *reason, bool modifying, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h);
 	void NotifyClutUpdate(const void *src);

 	void GetStats(char *buffer, size_t bufsize);