Merge pull request #19776 from hrydgard/depth-draw-queue

Implement queueing of depth raster draws
This commit is contained in:
Henrik Rydgård 2024-12-29 01:13:19 +01:00 committed by GitHub
commit 0b82405889
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 147 additions and 79 deletions

View file

@ -103,25 +103,18 @@ enum class TriangleResult {
TooSmall,
};
constexpr int MIN_TRI_AREA = 10;
constexpr int MIN_TWICE_TRI_AREA = 10;
// Adapted from Intel's depth rasterizer example.
// Started with the scalar version, will SIMD-ify later.
// x1/y1 etc are the scissor rect.
template<ZCompareMode compareMode>
TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
const int tileStartX = scissor.x1;
const int tileEndX = scissor.x2;
const int tileStartY = scissor.y1;
const int tileEndY = scissor.y2;
// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
// are slow on SSE2.
// NOTE: Triangles are stored in groups of 4.
int v0x = tx[0];
int v0y = ty[0];
int v1x = tx[4];
@ -131,10 +124,10 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
// use fixed-point only for X and Y. Avoid work for Z and W.
// We use 4x1 tiles for simplicity.
int minX = std::max(std::min(std::min(v0x, v1x), v2x), tileStartX) & ~3;
int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, tileEndX) & ~3;
int minY = std::max(std::min(std::min(v0y, v1y), v2y), tileStartY);
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY);
int minX = std::max(std::min(std::min(v0x, v1x), v2x), (int)scissor.x1) & ~3;
int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, (int)scissor.x2) & ~3;
int minY = std::max(std::min(std::min(v0y, v1y), v2y), (int)scissor.y1);
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
if (maxX == minX || maxY == minY) {
// No pixels, or outside screen.
return TriangleResult::NoPixels;
@ -145,7 +138,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
if (triArea < 0) {
return TriangleResult::Backface;
}
if (triArea < MIN_TRI_AREA) {
if (triArea < MIN_TWICE_TRI_AREA) {
return TriangleResult::TooSmall; // Or zero area.
}

View file

@ -15,7 +15,7 @@ struct DepthScreenVertex {
#endif
// We only need to support these three modes.
enum class ZCompareMode {
enum class ZCompareMode : u8 {
Greater, // Most common
Less, // Less common
Always, // Mostly used for clears
@ -29,13 +29,14 @@ struct DepthScissor {
};
struct DepthDraw {
u32 depthAddr;
u16 depthStride;
u8 cullMode;
GEPrimitiveType prim;
ZCompareMode compareMode;
bool cullEnabled;
int cullMode;
DepthScissor scissor;
bool through;
int transformedStartIndex;
int vertexOffset;
int indexOffset;
int vertexCount;
};

View file

@ -39,7 +39,7 @@
enum {
TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex),
DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4,
DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4 * sizeof(float),
DEPTH_SCREENVERTS_COMPONENT_COUNT = VERTEX_BUFFER_MAX,
DEPTH_SCREENVERTS_COMPONENT_SIZE = DEPTH_SCREENVERTS_COMPONENT_COUNT * sizeof(int) + 384,
DEPTH_SCREENVERTS_SIZE = DEPTH_SCREENVERTS_COMPONENT_SIZE * 3,
@ -67,6 +67,9 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
case DepthRasterMode::OFF:
useDepthRaster_ = false;
}
if (useDepthRaster_) {
depthDraws_.reserve(256);
}
}
DrawEngineCommon::~DrawEngineCommon() {
@ -803,7 +806,7 @@ void DrawEngineCommon::DecodeVerts(VertexDecoder *dec, u8 *dest) {
int i = decodeVertsCounter_;
int stride = (int)dec->GetDecVtxFmt().stride;
for (; i < numDrawVerts_; i++) {
DeferredVerts &dv = drawVerts_[i];
const DeferredVerts &dv = drawVerts_[i];
int indexLowerBound = dv.indexLowerBound;
drawVertexOffsets_[i] = numDecodedVerts_ - indexLowerBound;
@ -932,7 +935,7 @@ Mat4F32 ComputeFinalProjMatrix() {
return m;
}
static bool CalculateDepthDraw(DepthDraw *draw, GEPrimitiveType prim, int vertexCount) {
bool DrawEngineCommon::CalculateDepthDraw(DepthDraw *draw, GEPrimitiveType prim, int vertexCount) {
switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
@ -978,8 +981,15 @@ static bool CalculateDepthDraw(DepthDraw *draw, GEPrimitiveType prim, int vertex
_dbg_assert_(gstate.isDepthWriteEnabled());
}
draw->transformedStartIndex = 0;
draw->indexOffset = 0;
if (depthVertexCount_ + vertexCount >= DEPTH_INDEXBUFFER_SIZE) {
// Can't add more.
return false;
}
draw->depthAddr = gstate.getDepthBufRawAddress() | 0x04000000;
draw->depthStride = gstate.DepthBufStride();
draw->vertexOffset = depthVertexCount_;
draw->indexOffset = depthIndexCount_;
draw->vertexCount = vertexCount;
draw->cullEnabled = gstate.isCullEnabled();
draw->cullMode = gstate.getCullMode();
@ -1010,32 +1020,31 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
return;
}
TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);
TimeCollector collectStat(&gpuStats.msPrepareDepth, coreCollectDebugStats);
// Decode.
int numDec = 0;
int numDecoded = 0;
for (int i = 0; i < numDrawVerts_; i++) {
DeferredVerts &dv = drawVerts_[i];
int indexLowerBound = dv.indexLowerBound;
drawVertexOffsets_[i] = numDec - indexLowerBound;
int indexUpperBound = dv.indexUpperBound;
if (indexUpperBound + 1 - indexLowerBound + numDec >= VERTEX_BUFFER_MAX) {
const DeferredVerts &dv = drawVerts_[i];
if (dv.indexUpperBound + 1 - dv.indexLowerBound + numDecoded >= VERTEX_BUFFER_MAX) {
// Hit our limit! Stop decoding in this draw.
break;
}
// Decode the verts (and at the same time apply morphing/skinning). Simple.
DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
numDec += indexUpperBound - indexLowerBound + 1;
DecodeAndTransformForDepthRaster(depthTransformed_ + (draw.vertexOffset + numDecoded) * 4, worldviewproj, dv.verts, dv.indexLowerBound, dv.indexUpperBound, dec, vertTypeID);
numDecoded += dv.indexUpperBound - dv.indexLowerBound + 1;
}
// Copy indices.
memcpy(depthIndices_, decIndex_, sizeof(uint16_t) * vertexCount);
memcpy(depthIndices_ + draw.indexOffset, decIndex_, sizeof(uint16_t) * vertexCount);
// FUTURE SPLIT --- The above will always run on the main thread. The below can be split across workers.
FlushDepthDraw(draw);
// Commit
depthIndexCount_ += vertexCount;
depthVertexCount_ += numDecoded;
depthDraws_.push_back(draw);
// FlushQueuedDepth();
}
void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount) {
@ -1048,45 +1057,63 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
return;
}
TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);
TimeCollector collectStat(&gpuStats.msPrepareDepth, coreCollectDebugStats);
_dbg_assert_(prim != GE_PRIM_TRIANGLE_STRIP && prim != GE_PRIM_TRIANGLE_FAN);
if (dec->throughmode) {
ConvertPredecodedThroughForDepthRaster(depthTransformed_, decoded_, dec, numDecoded);
ConvertPredecodedThroughForDepthRaster(depthTransformed_ + 4 * draw.vertexOffset, decoded_, dec, numDecoded);
} else {
if (dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) {
return;
}
float worldviewproj[16];
ComputeFinalProjMatrix().Store(worldviewproj);
TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, decoded_, dec, numDecoded);
TransformPredecodedForDepthRaster(depthTransformed_ + 4 * draw.vertexOffset, worldviewproj, decoded_, dec, numDecoded);
}
// Copy indices.
memcpy(depthIndices_, decIndex_, sizeof(uint16_t) * vertexCount);
memcpy(depthIndices_ + draw.indexOffset, decIndex_, sizeof(uint16_t) * vertexCount);
// FUTURE SPLIT --- The above will always run on the main thread. The below can be split across workers.
FlushDepthDraw(draw);
// Commit
depthIndexCount_ += vertexCount;
depthVertexCount_ += numDecoded;
depthDraws_.push_back(draw);
// FlushQueuedDepth();
}
void DrawEngineCommon::FlushDepthDraw(const DepthDraw &draw) {
int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);
void DrawEngineCommon::FlushQueuedDepth() {
TimeCollector collectStat(&gpuStats.msRasterizeDepth, coreCollectDebugStats);
int outVertCount = 0;
switch (draw.prim) {
case GE_PRIM_RECTANGLES:
outVertCount = DepthRasterClipIndexedRectangles(tx, ty, tz, depthTransformed_, depthIndices_ + draw.indexOffset, draw);
break;
case GE_PRIM_TRIANGLES:
outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, depthIndices_ + draw.indexOffset, draw);
break;
default:
_dbg_assert_(false);
break;
for (const auto &draw : depthDraws_) {
int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);
int outVertCount = 0;
const float *vertices = depthTransformed_ + 4 * draw.vertexOffset;
const uint16_t *indices = depthIndices_ + draw.indexOffset;
switch (draw.prim) {
case GE_PRIM_RECTANGLES:
outVertCount = DepthRasterClipIndexedRectangles(tx, ty, tz, vertices, indices, draw);
break;
case GE_PRIM_TRIANGLES:
outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, vertices, indices, draw);
break;
default:
_dbg_assert_(false);
break;
}
// TODO: Could potentially split into tasks here!
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(draw.depthAddr), draw.depthStride, tx, ty, tz, outVertCount, draw);
}
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
tx, ty, tz, outVertCount, draw);
// Reset queue
depthIndexCount_ = 0;
depthVertexCount_ = 0;
depthDraws_.clear();
}

View file

@ -164,6 +164,8 @@ public:
return decoded_ + 12 * 65536;
}
void FlushQueuedDepth();
protected:
virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
void UpdatePlanes();
@ -177,7 +179,7 @@ protected:
void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount);
void DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount);
void FlushDepthDraw(const DepthDraw &draw);
bool CalculateDepthDraw(DepthDraw *draw, GEPrimitiveType prim, int vertexCount);
static inline int IndexSize(u32 vtype) {
const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);
@ -360,4 +362,9 @@ protected:
float *depthTransformed_ = nullptr;
int *depthScreenVerts_ = nullptr;
uint16_t *depthIndices_ = nullptr;
// Queue
int depthVertexCount_ = 0;
int depthIndexCount_ = 0;
std::vector<DepthDraw> depthDraws_;
};

View file

@ -3240,6 +3240,7 @@ void FramebufferManagerCommon::ReadFramebufferToMemory(VirtualFramebuffer *vfb,
}
void FramebufferManagerCommon::FlushBeforeCopy() {
drawEngine_->FlushQueuedDepth();
// Flush anything not yet drawn before blitting, downloading, or uploading.
// This might be a stalled list, or unflushed before a block transfer, etc.
// Only bother if any draws are pending.
@ -3247,7 +3248,8 @@ void FramebufferManagerCommon::FlushBeforeCopy() {
// TODO: It's really bad that we are calling SetRenderFramebuffer here with
// all the irrelevant state checking it'll use to decide what to do. Should
// do something more focused here.
SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
bool changed;
SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason, &changed);
drawEngine_->Flush();
}
}

View file

@ -306,13 +306,14 @@ public:
void DestroyFramebuf(VirtualFramebuffer *v);
VirtualFramebuffer *DoSetRenderFrameBuffer(FramebufferHeuristicParams &params, u32 skipDrawReason);
VirtualFramebuffer *SetRenderFrameBuffer(bool framebufChanged, int skipDrawReason) {
VirtualFramebuffer *SetRenderFrameBuffer(bool framebufChanged, int skipDrawReason, bool *changed) {
// Inlining this part since it's so frequent.
if (!framebufChanged && currentRenderVfb_) {
currentRenderVfb_->last_frame_render = gpuStats.numFlips;
currentRenderVfb_->dirtyAfterDisplay = true;
if (!skipDrawReason)
currentRenderVfb_->reallyDirtyAfterDisplay = true;
*changed = false;
return currentRenderVfb_;
} else {
// This is so that we will be able to drive DoSetRenderFramebuffer with inputs
@ -322,6 +323,7 @@ public:
VirtualFramebuffer *vfb = DoSetRenderFrameBuffer(inputs, skipDrawReason);
_dbg_assert_msg_(vfb, "DoSetRenderFramebuffer must return a valid framebuffer.");
_dbg_assert_msg_(currentRenderVfb_, "DoSetRenderFramebuffer must set a valid framebuffer.");
*changed = true;
return vfb;
}
}

View file

@ -108,7 +108,8 @@ struct GPUStatistics {
numCachedReplacedTextures = 0;
numClutTextures = 0;
msProcessingDisplayLists = 0;
msRasterizingDepth = 0.0f;
msPrepareDepth = 0.0f;
msRasterizeDepth = 0.0f;
numDepthRasterPrims = 0;
numDepthRasterBackface = 0;
numDepthRasterNoPixels = 0;
@ -153,7 +154,8 @@ struct GPUStatistics {
int numCachedReplacedTextures;
int numClutTextures;
double msProcessingDisplayLists;
double msRasterizingDepth;
double msPrepareDepth;
double msRasterizeDepth;
int vertexGPUCycles;
int otherGPUCycles;
int numDepthRasterPrims;

View file

@ -987,8 +987,10 @@ void GPUCommon::Execute_Ret(u32 op, u32 diff) {
}
void GPUCommon::Execute_End(u32 op, u32 diff) {
if (flushOnParams_)
if (flushOnParams_) {
drawEngineCommon_->FlushQueuedDepth();
Flush();
}
const u32 prev = Memory::ReadUnchecked_U32(currentList->pc - 4);
UpdatePC(currentList->pc, currentList->pc);
@ -1311,8 +1313,10 @@ void GPUCommon::FlushImm() {
gstate_c.UpdateUVScaleOffset();
VirtualFramebuffer *vfb = nullptr;
if (framebufferManager_)
vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
if (framebufferManager_) {
bool changed;
vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason, &changed);
}
if (vfb) {
CheckDepthUsage(vfb);
}
@ -1378,8 +1382,9 @@ void GPUCommon::FastLoadBoneMatrix(u32 target) {
}
if (!g_Config.bSoftwareSkinning) {
if (flushOnParams_)
if (flushOnParams_) {
Flush();
}
gstate_c.Dirty(uniformsToDirty);
} else {
gstate_c.deferredVertTypeDirty |= uniformsToDirty;

View file

@ -531,6 +531,7 @@ void GPUCommonHW::PreExecuteOp(u32 op, u32 diff) {
}
void GPUCommonHW::CopyDisplayToOutput(bool reallyDirty) {
drawEngineCommon_->FlushQueuedDepth();
// Flush anything left over.
drawEngineCommon_->Flush();
@ -949,11 +950,16 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
}
// This also makes skipping drawing very effective.
VirtualFramebuffer *vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
bool changed;
VirtualFramebuffer *vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason, &changed);
if (blueToAlpha) {
vfb->usageFlags |= FB_USAGE_BLUE_TO_ALPHA;
}
if (changed) {
drawEngineCommon_->FlushQueuedDepth();
}
if (gstate_c.dirty & DIRTY_VERTEXSHADER_STATE) {
vertexCost_ = EstimatePerVertexCost();
}
@ -1273,7 +1279,11 @@ void GPUCommonHW::Execute_Bezier(u32 op, u32 diff) {
gstate_c.framebufFormat = gstate.FrameBufFormat();
// This also makes skipping drawing very effective.
VirtualFramebuffer *vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
bool changed;
VirtualFramebuffer *vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason, &changed);
if (changed) {
drawEngineCommon_->FlushQueuedDepth();
}
if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) {
// TODO: Should this eat some cycles? Probably yes. Not sure if important.
return;
@ -1282,7 +1292,7 @@ void GPUCommonHW::Execute_Bezier(u32 op, u32 diff) {
CheckDepthUsage(vfb);
if (!Memory::IsValidAddress(gstate_c.vertexAddr)) {
ERROR_LOG_REPORT(Log::G3D, "Bad vertex address %08x!", gstate_c.vertexAddr);
ERROR_LOG(Log::G3D, "Bad vertex address %08x!", gstate_c.vertexAddr);
return;
}
@ -1290,7 +1300,7 @@ void GPUCommonHW::Execute_Bezier(u32 op, u32 diff) {
const void *indices = NULL;
if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
if (!Memory::IsValidAddress(gstate_c.indexAddr)) {
ERROR_LOG_REPORT(Log::G3D, "Bad index address %08x!", gstate_c.indexAddr);
ERROR_LOG(Log::G3D, "Bad index address %08x!", gstate_c.indexAddr);
return;
}
indices = Memory::GetPointerUnchecked(gstate_c.indexAddr);
@ -1345,7 +1355,11 @@ void GPUCommonHW::Execute_Spline(u32 op, u32 diff) {
gstate_c.framebufFormat = gstate.FrameBufFormat();
// This also makes skipping drawing very effective.
VirtualFramebuffer *vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
bool changed;
VirtualFramebuffer *vfb = framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason, &changed);
if (changed) {
drawEngineCommon_->FlushQueuedDepth();
}
if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) {
// TODO: Should this eat some cycles? Probably yes. Not sure if important.
return;
@ -1354,7 +1368,7 @@ void GPUCommonHW::Execute_Spline(u32 op, u32 diff) {
CheckDepthUsage(vfb);
if (!Memory::IsValidAddress(gstate_c.vertexAddr)) {
ERROR_LOG_REPORT(Log::G3D, "Bad vertex address %08x!", gstate_c.vertexAddr);
ERROR_LOG(Log::G3D, "Bad vertex address %08x!", gstate_c.vertexAddr);
return;
}
@ -1362,14 +1376,14 @@ void GPUCommonHW::Execute_Spline(u32 op, u32 diff) {
const void *indices = NULL;
if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
if (!Memory::IsValidAddress(gstate_c.indexAddr)) {
ERROR_LOG_REPORT(Log::G3D, "Bad index address %08x!", gstate_c.indexAddr);
ERROR_LOG(Log::G3D, "Bad index address %08x!", gstate_c.indexAddr);
return;
}
indices = Memory::GetPointerUnchecked(gstate_c.indexAddr);
}
if (vertTypeIsSkinningEnabled(gstate.vertType)) {
DEBUG_LOG_REPORT(Log::G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType));
WARN_LOG_ONCE(unusualcurve, Log::G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType));
}
// Can't flush after setting gstate_c.submitType below since it'll be a mess - it must be done already.
@ -1415,6 +1429,7 @@ void GPUCommonHW::Execute_Spline(u32 op, u32 diff) {
}
void GPUCommonHW::Execute_BlockTransferStart(u32 op, u32 diff) {
drawEngineCommon_->FlushQueuedDepth();
Flush();
PROFILE_THIS_SCOPE("block"); // don't include the flush in the profile, would be misleading.
@ -1763,6 +1778,16 @@ void GPUCommonHW::Execute_TexFlush(u32 op, u32 diff) {
framebufferManager_->DiscardFramebufferCopy();
}
// Hardware-GPU variant of DrawSync.
// Processes every depth draw still waiting in the draw engine's queue, then
// hands the request to the GPUCommon implementation and returns its result
// unchanged. `mode` is passed through untouched.
u32 GPUCommonHW::DrawSync(int mode) {
// Flush first so no queued depth rasterization is left pending across the sync.
// NOTE(review): presumably this keeps the depth buffer in emulated memory
// current for whoever waits on the sync - confirm against callers.
drawEngineCommon_->FlushQueuedDepth();
return GPUCommon::DrawSync(mode);
}
// Hardware-GPU variant of ListSync.
// Processes every depth draw still waiting in the draw engine's queue, then
// hands the request to the GPUCommon implementation and returns its result
// unchanged. `listid` and `mode` are passed through untouched.
int GPUCommonHW::ListSync(int listid, int mode) {
// Flush regardless of which list is being synced; the depth queue is not
// tracked per list in this function.
drawEngineCommon_->FlushQueuedDepth();
return GPUCommon::ListSync(listid, mode);
}
size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
return snprintf(buffer, size,
@ -1776,7 +1801,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
"replacer: tracks %d references, %d unique textures\n"
"Cpy: depth %d, color %d, reint %d, blend %d, self %d\n"
"GPU cycles: %d (%0.1f per vertex)\n"
"Depth raster: %0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull\n%s",
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull\n%s",
gpuStats.msProcessingDisplayLists * 1000.0f,
gpuStats.numDrawSyncs,
gpuStats.numListSyncs,
@ -1813,7 +1838,8 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numCopiesForSelfTex,
gpuStats.vertexGPUCycles + gpuStats.otherGPUCycles,
vertexAverageCycles,
gpuStats.msRasterizingDepth * 1000.0,
gpuStats.msPrepareDepth * 1000.0,
gpuStats.msRasterizeDepth * 1000.0,
gpuStats.numDepthRasterPrims,
gpuStats.numDepthRasterNoPixels,
gpuStats.numDepthRasterTooSmall,

View file

@ -42,6 +42,9 @@ public:
void SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) override;
void InvalidateCache(u32 addr, int size, GPUInvalidationType type) override;
u32 DrawSync(int mode) override;
int ListSync(int listid, int mode) override;
bool FramebufferDirty() override;
bool FramebufferReallyDirty() override;