Frustum-cull small draws

Some games do a poor job of culling stuff, and some transparent sprites can be very expensive if they cause a copy. Skipping them if outside the viewport makes sense in that case. One example are the flame sprites in #17797 . Additionally, we should be able to cull through-mode draws easily, this one doesn't even try.
2025-04-02 11:01:50 -04:00 · 2023-11-26 11:14:13 +01:00 · 2023-11-26 11:14:13 +01:00 · 0905b6a5ad
commit 0905b6a5ad
parent 3e20fab387
5 changed files with 79 additions and 5 deletions
--- a/Common/UI/View.cpp
+++ b/Common/UI/View.cpp
@ -620,7 +620,6 @@ CollapsibleHeader::CollapsibleHeader(bool *toggle, const std::string &text, Layo

 void CollapsibleHeader::Draw(UIContext &dc) {
 	Style style = dc.theme->itemStyle;
-	style.background.color = 0;
 	if (HasFocus()) style = dc.theme->itemFocusedStyle;
 	if (down_) style = dc.theme->itemDownStyle;
 	if (!IsEnabled()) style = dc.theme->itemDisabledStyle;
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -670,6 +670,31 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
 	return cmd - start;
 }

+void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
+	if (!indexGen.PrimCompatible(prevPrim_, prim)) {
+		DispatchFlush();
+	}
+
+	// This isn't exactly right, if we flushed, since prims can straddle previous calls.
+	// But it generally works for common usage.
+	if (prim == GE_PRIM_KEEP_PREVIOUS) {
+		// Has to be set to something, let's assume POINTS (0) if no previous.
+		if (prevPrim_ == GE_PRIM_INVALID)
+			prevPrim_ = GE_PRIM_POINTS;
+		prim = prevPrim_;
+	} else {
+		prevPrim_ = prim;
+	}
+
+	// If vtype has changed, setup the vertex decoder.
+	if (vertTypeID != lastVType_ || !dec_) {
+		dec_ = GetVertexDecoder(vertTypeID);
+		lastVType_ = vertTypeID;
+	}
+
+	*bytesRead = vertexCount * dec_->VertexSize();
+}
+
 // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
 bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead) {
 	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@ -113,6 +113,8 @@ public:

 	int ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *stall, u32 vertTypeID, bool clockwise, int *bytesRead, bool isTriangle);
 	bool SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead);
+	void SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
+
 	template<class Surface>
 	void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
 	void ClearSplineBezierWeights();
--- a/GPU/GPU.h
+++ b/GPU/GPU.h
@ -76,6 +76,7 @@ struct GPUStatistics {
 	void ResetFrame() {
 		numDrawCalls = 0;
 		numVertexDecodes = 0;
+		numCulledDraws = 0;
 		numDrawSyncs = 0;
 		numListSyncs = 0;
 		numVertsSubmitted = 0;
@ -111,6 +112,7 @@ struct GPUStatistics {
 	// Per frame statistics
 	int numDrawCalls;
 	int numVertexDecodes;
+	int numCulledDraws;
 	int numDrawSyncs;
 	int numListSyncs;
 	int numFlushes;
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@ -989,9 +989,36 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 	int cullMode = gstate.getCullMode();

 	uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning);
-	if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+
+#define MAX_CULL_CHECK_COUNT 6
+
+#define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT)
+
+	// If certain conditions are true, do frustum culling.
+	bool passCulling = PASSES_CULLING;
+	if (!passCulling) {
+		// Do software culling.
+		if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
+			passCulling = true;
+		} else {
+			gpuStats.numCulledDraws++;
+		}
+	}
+
+	// If the first one in a batch passes, let's assume the whole batch passes.
+	// Cuts down on checking, while not losing that much efficiency.
+	bool onePassed = false;
+	if (passCulling) {
+		if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+			canExtend = false;
+		}
+		onePassed = true;
+	} else {
+		// Still need to advance bytesRead.
+		drawEngineCommon_->SkipPrim(prim, count, vertTypeID, &bytesRead);
 		canExtend = false;
 	}
+
 	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
 	// Some games rely on this, they don't bother reloading VADDR and IADDR.
 	// The VADDR/IADDR registers are NOT updated.
@ -1027,7 +1054,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 			bool clockwise = !gstate.isCullEnabled() || gstate.getCullMode() == cullMode;
 			if (canExtend) {
 				// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices
-				// are consecutive in memory.
+				// are consecutive in memory. We also ignore culling here.
 				_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
 				int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, stall, vertTypeID, clockwise, &bytesRead, isTriangle);
 				if (!commandsExecuted) {
@ -1047,7 +1074,25 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 				// We can extend again after submitting a normal draw.
 				canExtend = isTriangle;
 			}
-			if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+
+			bool passCulling = onePassed || PASSES_CULLING;
+			if (!passCulling) {
+				// Do software culling.
+				if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
+					passCulling = true;
+				} else {
+					gpuStats.numCulledDraws++;
+				}
+			}
+			if (passCulling) {
+				if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+					canExtend = false;
+				}
+				// As soon as one passes, assume we don't need to check the rest of this batch.
+				onePassed = true;
+			} else {
+				// Still need to advance bytesRead.
+				drawEngineCommon_->SkipPrim(newPrim, count, vertTypeID, &bytesRead);
 				canExtend = false;
 			}
 			AdvanceVerts(vertexType, count, bytesRead);
@ -1691,7 +1736,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 	float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
 	return snprintf(buffer, size,
 		"DL processing time: %0.2f ms, %d drawsync, %d listsync\n"
-		"Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
+		"Draw: %d (%d dec, %d culled), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
 		"Vertices: %d drawn: %d\n"
 		"FBOs active: %d (evaluations: %d)\n"
 		"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
@ -1705,6 +1750,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 		gpuStats.numListSyncs,
 		gpuStats.numDrawCalls,
 		gpuStats.numVertexDecodes,
+		gpuStats.numCulledDraws,
 		gpuStats.numFlushes,
 		gpuStats.numClears,
 		gpuStats.numBBOXJumps,