Frustum-cull small draws

Some games do a poor job of culling stuff, and some transparent
sprites can be very expensive if they cause a copy.
Skipping them if outside the viewport makes sense in that case.

One example are the flame sprites in #17797 .

Additionally, we should be able to cull through-mode draws easily, this
one doesn't even try.
This commit is contained in:
Henrik Rydgård 2023-11-26 11:14:13 +01:00
parent 3e20fab387
commit 0905b6a5ad
5 changed files with 79 additions and 5 deletions

View file

@ -620,7 +620,6 @@ CollapsibleHeader::CollapsibleHeader(bool *toggle, const std::string &text, Layo
void CollapsibleHeader::Draw(UIContext &dc) {
Style style = dc.theme->itemStyle;
style.background.color = 0;
if (HasFocus()) style = dc.theme->itemFocusedStyle;
if (down_) style = dc.theme->itemDownStyle;
if (!IsEnabled()) style = dc.theme->itemDisabledStyle;

View file

@ -670,6 +670,31 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
return cmd - start;
}
void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim)) {
DispatchFlush();
}
// This isn't exactly right, if we flushed, since prims can straddle previous calls.
// But it generally works for common usage.
if (prim == GE_PRIM_KEEP_PREVIOUS) {
// Has to be set to something, let's assume POINTS (0) if no previous.
if (prevPrim_ == GE_PRIM_INVALID)
prevPrim_ = GE_PRIM_POINTS;
prim = prevPrim_;
} else {
prevPrim_ = prim;
}
// If vtype has changed, setup the vertex decoder.
if (vertTypeID != lastVType_ || !dec_) {
dec_ = GetVertexDecoder(vertTypeID);
lastVType_ = vertTypeID;
}
*bytesRead = vertexCount * dec_->VertexSize();
}
// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {

View file

@ -113,6 +113,8 @@ public:
int ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *stall, u32 vertTypeID, bool clockwise, int *bytesRead, bool isTriangle);
bool SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead);
void SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
template<class Surface>
void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
void ClearSplineBezierWeights();

View file

@ -76,6 +76,7 @@ struct GPUStatistics {
void ResetFrame() {
numDrawCalls = 0;
numVertexDecodes = 0;
numCulledDraws = 0;
numDrawSyncs = 0;
numListSyncs = 0;
numVertsSubmitted = 0;
@ -111,6 +112,7 @@ struct GPUStatistics {
// Per frame statistics
int numDrawCalls;
int numVertexDecodes;
int numCulledDraws;
int numDrawSyncs;
int numListSyncs;
int numFlushes;

View file

@ -989,9 +989,36 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
int cullMode = gstate.getCullMode();
uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning);
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
#define MAX_CULL_CHECK_COUNT 6
#define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT)
// If certain conditions are true, do frustum culling.
bool passCulling = PASSES_CULLING;
if (!passCulling) {
// Do software culling.
if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
passCulling = true;
} else {
gpuStats.numCulledDraws++;
}
}
// If the first one in a batch passes, let's assume the whole batch passes.
// Cuts down on checking, while not losing that much efficiency.
bool onePassed = false;
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
canExtend = false;
}
onePassed = true;
} else {
// Still need to advance bytesRead.
drawEngineCommon_->SkipPrim(prim, count, vertTypeID, &bytesRead);
canExtend = false;
}
// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
// Some games rely on this, they don't bother reloading VADDR and IADDR.
// The VADDR/IADDR registers are NOT updated.
@ -1027,7 +1054,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
bool clockwise = !gstate.isCullEnabled() || gstate.getCullMode() == cullMode;
if (canExtend) {
// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices
// are consecutive in memory.
// are consecutive in memory. We also ignore culling here.
_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, stall, vertTypeID, clockwise, &bytesRead, isTriangle);
if (!commandsExecuted) {
@ -1047,7 +1074,25 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
// We can extend again after submitting a normal draw.
canExtend = isTriangle;
}
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
bool passCulling = onePassed || PASSES_CULLING;
if (!passCulling) {
// Do software culling.
if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
passCulling = true;
} else {
gpuStats.numCulledDraws++;
}
}
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
canExtend = false;
}
// As soon as one passes, assume we don't need to check the rest of this batch.
onePassed = true;
} else {
// Still need to advance bytesRead.
drawEngineCommon_->SkipPrim(newPrim, count, vertTypeID, &bytesRead);
canExtend = false;
}
AdvanceVerts(vertexType, count, bytesRead);
@ -1691,7 +1736,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
return snprintf(buffer, size,
"DL processing time: %0.2f ms, %d drawsync, %d listsync\n"
"Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
"Draw: %d (%d dec, %d culled), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
"Vertices: %d drawn: %d\n"
"FBOs active: %d (evaluations: %d)\n"
"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
@ -1705,6 +1750,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numListSyncs,
gpuStats.numDrawCalls,
gpuStats.numVertexDecodes,
gpuStats.numCulledDraws,
gpuStats.numFlushes,
gpuStats.numClears,
gpuStats.numBBOXJumps,