Depth raster: cull small/backfacing triangles early with a float area test.

- CrossSIMD.h: add AnyZeroSignBit(Vec4F32) for both the SSE and the NEON path.
- DepthRaster.cpp: merge TriangleResult::Backface and ::TooSmall into
  ::SmallOrBackface, and reject batches of four triangles whose floating
  point area is below MIN_TWICE_TRI_AREA before they are stored.
- GPU.h / GPUCommonHW.cpp: replace the numDepthRasterBackface statistic with
  numDepthRasterEarlySize and relabel it in the stats overlay.

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 1a0bebedc4..a8b68ba0c1 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -266,6 +266,10 @@ struct Vec4F32 {
 inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
 inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }
 
+inline bool AnyZeroSignBit(Vec4F32 value) {
+	return _mm_movemask_ps(value.v) != 0xF;
+}
+
 // Make sure the W component of scale is 1.0f.
 inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
 	m.col0 = _mm_mul_ps(m.col0, scale.v);
@@ -609,6 +613,14 @@ inline bool AnyZeroSignBit(Vec4S32 value) {
 	return (mask & 0x80000000) == 0;
 }
 
+inline bool AnyZeroSignBit(Vec4F32 value) {
+	int32x4_t ival = vreinterpretq_s32_f32(value.v);
+	int32x2_t prod = vand_s32(vget_low_s32(ival), vget_high_s32(ival));
+	int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);
+	return (mask & 0x80000000) == 0;
+}
+
+
 struct Vec4U16 {
 	uint16x4_t v; // 64 bits.
 
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 67eeb1e957..fa706807ed 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -99,8 +99,7 @@ struct Edge {
 enum class TriangleResult {
 	OK,
 	NoPixels,
-	Backface,
-	TooSmall,
+	SmallOrBackface,
 };
 
 constexpr int MIN_TWICE_TRI_AREA = 10;
@@ -130,16 +129,14 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
 	if (maxX == minX || maxY == minY) {
 		// No pixels, or outside screen.
+		// Most of these are now gone in the initial pass.
 		return TriangleResult::NoPixels;
 	}
 
 	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
 	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
-	if (triArea < 0) {
-		return TriangleResult::Backface;
-	}
 	if (triArea < MIN_TWICE_TRI_AREA) {
-		return TriangleResult::TooSmall;  // Or zero area.
+		return TriangleResult::SmallOrBackface;  // Or zero area.
 	}
 
 	float oneOverTriArea = 1.0f / (float)triArea;
@@ -420,6 +417,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 			continue;
 		}
 
+		// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
+		// Still good for culling early and pretty cheap to compute.
+		Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
+		if (!AnyZeroSignBit(triArea)) {
+			gpuStats.numDepthRasterEarlySize += 4;
+			continue;
+		}
+
 		Vec4S32FromF32(x0).Store(tx + outCount);
 		Vec4S32FromF32(x1).Store(tx + outCount + 4);
 		Vec4S32FromF32(x2).Store(tx + outCount + 8);
@@ -470,7 +475,7 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 		break;
 	case GE_PRIM_TRIANGLES:
 	{
-		int stats[4]{};
+		int stats[3]{};
 		// Batches of 4 triangles, as output by the clip function.
 		for (int i = 0; i < count; i += 12) {
 			switch (draw.compareMode) {
@@ -491,9 +496,8 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 				}
 			}
 		}
-		gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
 		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
-		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
+		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
 		gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
 		break;
 	}
diff --git a/GPU/GPU.h b/GPU/GPU.h
index ad5d7916e1..cbf5bc93a1 100644
--- a/GPU/GPU.h
+++ b/GPU/GPU.h
@@ -111,7 +111,7 @@ struct GPUStatistics {
 		msPrepareDepth = 0.0f;
 		msRasterizeDepth = 0.0f;
 		numDepthRasterPrims = 0;
-		numDepthRasterBackface = 0;
+		numDepthRasterEarlySize = 0;
 		numDepthRasterNoPixels = 0;
 		numDepthRasterTooSmall = 0;
 		numDepthRasterZCulled = 0;
@@ -160,7 +160,7 @@ struct GPUStatistics {
 	int vertexGPUCycles;
 	int otherGPUCycles;
 	int numDepthRasterPrims;
-	int numDepthRasterBackface;
+	int numDepthRasterEarlySize;
 	int numDepthRasterNoPixels;
 	int numDepthRasterTooSmall;
 	int numDepthRasterZCulled;
diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp
index 9d411926a0..c665f379a2 100644
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@@ -1801,7 +1801,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 		"replacer: tracks %d references, %d unique textures\n"
 		"Cpy: depth %d, color %d, reint %d, blend %d, self %d\n"
 		"GPU cycles: %d (%0.1f per vertex)\n"
-		"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull, %d box\n%s",
+		"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d earlysize, %d zcull, %d box\n%s",
 		gpuStats.msProcessingDisplayLists * 1000.0f,
 		gpuStats.numDrawSyncs,
 		gpuStats.numListSyncs,
@@ -1843,7 +1843,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 		gpuStats.numDepthRasterPrims,
 		gpuStats.numDepthRasterNoPixels,
 		gpuStats.numDepthRasterTooSmall,
-		gpuStats.numDepthRasterBackface,
+		gpuStats.numDepthRasterEarlySize,
 		gpuStats.numDepthRasterZCulled,
 		gpuStats.numDepthEarlyBoxCulled,
 		debugRecording_ ? "(debug-recording)" : ""