Compute triangle area early and cull by it before writing out groups of 4 triangles

This commit is contained in:
Henrik Rydgård 2024-12-29 01:10:20 +01:00
parent e8786fc401
commit ef934df0f2
4 changed files with 29 additions and 13 deletions

View file

@ -266,6 +266,10 @@ struct Vec4F32 {
// Converts the four float lanes of f to signed 32-bit integers using _mm_cvttps_epi32.
inline Vec4S32 Vec4S32FromF32(Vec4F32 f) {
	Vec4S32 converted{ _mm_cvttps_epi32(f.v) };
	return converted;
}
// Converts the four signed 32-bit integer lanes of f to floats using _mm_cvtepi32_ps.
inline Vec4F32 Vec4F32FromS32(Vec4S32 f) {
	Vec4F32 converted{ _mm_cvtepi32_ps(f.v) };
	return converted;
}
// Returns true if at least one of the four lanes of value has its sign bit clear.
inline bool AnyZeroSignBit(Vec4F32 value) {
	// The movemask gathers the four lane sign bits into the low nibble,
	// so 0xF means every lane has its sign bit set.
	const int signBits = _mm_movemask_ps(value.v);
	return signBits != 0xF;
}
// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
m.col0 = _mm_mul_ps(m.col0, scale.v);
@ -609,6 +613,14 @@ inline bool AnyZeroSignBit(Vec4S32 value) {
return (mask & 0x80000000) == 0;
}
// Returns true if at least one of the four lanes of value has its sign bit clear.
inline bool AnyZeroSignBit(Vec4F32 value) {
	// Reinterpret the float lanes as integers so they can be combined with bitwise AND.
	const int32x4_t laneBits = vreinterpretq_s32_f32(value.v);
	// Fold four lanes down to two, then two down to a single scalar.
	const int32x2_t folded = vand_s32(vget_low_s32(laneBits), vget_high_s32(laneBits));
	const int combined = vget_lane_s32(folded, 0) & vget_lane_s32(folded, 1);
	// Bit 31 survives the AND only if it was set in all four lanes.
	return (combined & 0x80000000) == 0;
}
struct Vec4U16 {
uint16x4_t v; // 64 bits.

View file

@ -99,8 +99,7 @@ struct Edge {
// Result code returned by DepthRasterTriangle for a single triangle.
// The values are cast to int and used as indices into the local stats array
// in DepthRasterScreenVerts, which are then added to the gpuStats counters.
enum class TriangleResult {
// Counted into gpuStats.numDepthRasterPrims.
OK,
// The scissor-clamped bounding box is empty (maxX == minX or maxY == minY).
NoPixels,
// triArea was negative.
Backface,
// triArea was below MIN_TWICE_TRI_AREA (includes zero area).
TooSmall,
// triArea was below MIN_TWICE_TRI_AREA; counted into gpuStats.numDepthRasterTooSmall.
SmallOrBackface,
};
// Threshold compared against triArea in DepthRasterTriangle: triangles whose computed
// triArea is below this value are rejected instead of rasterized. The same value is
// subtracted from the floating point area in DepthRasterClipIndexedTriangles for early culling.
constexpr int MIN_TWICE_TRI_AREA = 10;
@ -130,16 +129,14 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
if (maxX == minX || maxY == minY) {
// No pixels, or outside screen.
// Most of these are now gone in the initial pass.
return TriangleResult::NoPixels;
}
// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
if (triArea < 0) {
return TriangleResult::Backface;
}
if (triArea < MIN_TWICE_TRI_AREA) {
return TriangleResult::TooSmall; // Or zero area.
return TriangleResult::SmallOrBackface; // Or zero area.
}
float oneOverTriArea = 1.0f / (float)triArea;
@ -420,6 +417,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
continue;
}
// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
// Still good for culling early and pretty cheap to compute.
Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
if (!AnyZeroSignBit(triArea)) {
gpuStats.numDepthRasterEarlySize += 4;
continue;
}
Vec4S32FromF32(x0).Store(tx + outCount);
Vec4S32FromF32(x1).Store(tx + outCount + 4);
Vec4S32FromF32(x2).Store(tx + outCount + 8);
@ -470,7 +475,7 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
break;
case GE_PRIM_TRIANGLES:
{
int stats[4]{};
int stats[3]{};
// Batches of 4 triangles, as output by the clip function.
for (int i = 0; i < count; i += 12) {
switch (draw.compareMode) {
@ -491,9 +496,8 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
}
}
}
gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
break;
}

View file

@ -111,7 +111,7 @@ struct GPUStatistics {
msPrepareDepth = 0.0f;
msRasterizeDepth = 0.0f;
numDepthRasterPrims = 0;
numDepthRasterBackface = 0;
numDepthRasterEarlySize = 0;
numDepthRasterNoPixels = 0;
numDepthRasterTooSmall = 0;
numDepthRasterZCulled = 0;
@ -160,7 +160,7 @@ struct GPUStatistics {
int vertexGPUCycles;
int otherGPUCycles;
int numDepthRasterPrims;
int numDepthRasterBackface;
int numDepthRasterEarlySize;
int numDepthRasterNoPixels;
int numDepthRasterTooSmall;
int numDepthRasterZCulled;

View file

@ -1801,7 +1801,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
"replacer: tracks %d references, %d unique textures\n"
"Cpy: depth %d, color %d, reint %d, blend %d, self %d\n"
"GPU cycles: %d (%0.1f per vertex)\n"
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull, %d box\n%s",
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d earlysize, %d zcull, %d box\n%s",
gpuStats.msProcessingDisplayLists * 1000.0f,
gpuStats.numDrawSyncs,
gpuStats.numListSyncs,
@ -1843,7 +1843,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numDepthRasterPrims,
gpuStats.numDepthRasterNoPixels,
gpuStats.numDepthRasterTooSmall,
gpuStats.numDepthRasterBackface,
gpuStats.numDepthRasterEarlySize,
gpuStats.numDepthRasterZCulled,
gpuStats.numDepthEarlyBoxCulled,
debugRecording_ ? "(debug-recording)" : ""