mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Compute and cull by triangle area early before writing 4-groups of triangles
This commit is contained in:
parent
e8786fc401
commit
ef934df0f2
4 changed files with 29 additions and 13 deletions
|
@ -266,6 +266,10 @@ struct Vec4F32 {
|
|||
// Converts the four float lanes of f to signed 32-bit integers using
// _mm_cvttps_epi32 (the truncating SSE2 conversion).
inline Vec4S32 Vec4S32FromF32(Vec4F32 f) {
	Vec4S32 converted{ _mm_cvttps_epi32(f.v) };
	return converted;
}
|
||||
// Converts the four signed 32-bit integer lanes of f to floats using
// _mm_cvtepi32_ps.
inline Vec4F32 Vec4F32FromS32(Vec4S32 f) {
	Vec4F32 converted{ _mm_cvtepi32_ps(f.v) };
	return converted;
}
|
||||
|
||||
// Returns true when at least one of the four float lanes has its sign bit clear.
// Returns false only when every lane has its sign bit set.
inline bool AnyZeroSignBit(Vec4F32 value) {
	// _mm_movemask_ps packs the sign bit of each lane into bits 0..3.
	const int signBits = _mm_movemask_ps(value.v);
	const bool allSignBitsSet = (signBits == 0xF);
	return !allSignBitsSet;
}
|
||||
|
||||
// Make sure the W component of scale is 1.0f.
|
||||
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
|
||||
m.col0 = _mm_mul_ps(m.col0, scale.v);
|
||||
|
@ -609,6 +613,14 @@ inline bool AnyZeroSignBit(Vec4S32 value) {
|
|||
return (mask & 0x80000000) == 0;
|
||||
}
|
||||
|
||||
// Returns true when at least one of the four float lanes has its sign bit clear.
// Returns false only when every lane has its sign bit set.
inline bool AnyZeroSignBit(Vec4F32 value) {
	// Work on the raw bit patterns so the sign bits can be combined with integer ANDs.
	const int32x4_t rawBits = vreinterpretq_s32_f32(value.v);
	// Fold 4 lanes -> 2 lanes -> 1 scalar; a bit survives only if it is set in every lane.
	const int32x2_t foldedPair = vand_s32(vget_low_s32(rawBits), vget_high_s32(rawBits));
	const int foldedAll = vget_lane_s32(foldedPair, 0) & vget_lane_s32(foldedPair, 1);
	const bool everySignBitSet = (foldedAll & 0x80000000) != 0;
	return !everySignBitSet;
}
|
||||
|
||||
|
||||
struct Vec4U16 {
|
||||
uint16x4_t v; // 64 bits.
|
||||
|
||||
|
|
|
@ -99,8 +99,7 @@ struct Edge {
|
|||
// Result code returned by DepthRasterTriangle for a single triangle.
// The enumerator values are also cast to int and used as indices into a
// per-batch stats array, so their order and count matter to that caller.
// NOTE(review): this listing appears to interleave the pre- and post-change
// lines of the commit (hunk header is -99,8 +99,7). Backface and TooSmall
// look like the removed enumerators and SmallOrBackface their replacement;
// confirm against the actual file.
enum class TriangleResult {
|
||||
// Triangle was accepted; counted in numDepthRasterPrims.
OK,
|
||||
// Returned when the clamped bounding box has zero width or zero height.
NoPixels,
|
||||
// Returned when the integer area expression is negative.
Backface,
|
||||
// Returned when the area expression is below MIN_TWICE_TRI_AREA (includes zero area).
TooSmall,
|
||||
// Returned when the area expression is below MIN_TWICE_TRI_AREA; with no
// separate negative check this also covers back-facing triangles.
SmallOrBackface,
|
||||
};
|
||||
|
||||
// Rejection threshold for the triangle area expression (the edge-function
// value, i.e. twice the signed area). Triangles whose value is below this are
// not rasterized; negative values (back faces) fall below it as well.
constexpr int MIN_TWICE_TRI_AREA{ 10 };
|
||||
|
@ -130,16 +129,14 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
|
|||
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
|
||||
if (maxX == minX || maxY == minY) {
|
||||
// No pixels, or outside screen.
|
||||
// Most of these are now gone in the initial pass.
|
||||
return TriangleResult::NoPixels;
|
||||
}
|
||||
|
||||
// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
|
||||
int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
|
||||
if (triArea < 0) {
|
||||
return TriangleResult::Backface;
|
||||
}
|
||||
if (triArea < MIN_TWICE_TRI_AREA) {
|
||||
return TriangleResult::TooSmall; // Or zero area.
|
||||
return TriangleResult::SmallOrBackface; // Or zero area.
|
||||
}
|
||||
|
||||
float oneOverTriArea = 1.0f / (float)triArea;
|
||||
|
@ -420,6 +417,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
|
|||
continue;
|
||||
}
|
||||
|
||||
// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
|
||||
// Still good for culling early and pretty cheap to compute.
|
||||
Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
|
||||
if (!AnyZeroSignBit(triArea)) {
|
||||
gpuStats.numDepthRasterEarlySize += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
Vec4S32FromF32(x0).Store(tx + outCount);
|
||||
Vec4S32FromF32(x1).Store(tx + outCount + 4);
|
||||
Vec4S32FromF32(x2).Store(tx + outCount + 8);
|
||||
|
@ -470,7 +475,7 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
|
|||
break;
|
||||
case GE_PRIM_TRIANGLES:
|
||||
{
|
||||
int stats[4]{};
|
||||
int stats[3]{};
|
||||
// Batches of 4 triangles, as output by the clip function.
|
||||
for (int i = 0; i < count; i += 12) {
|
||||
switch (draw.compareMode) {
|
||||
|
@ -491,9 +496,8 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
|
|||
}
|
||||
}
|
||||
}
|
||||
gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
|
||||
gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
|
||||
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
|
||||
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
|
||||
gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -111,7 +111,7 @@ struct GPUStatistics {
|
|||
msPrepareDepth = 0.0f;
|
||||
msRasterizeDepth = 0.0f;
|
||||
numDepthRasterPrims = 0;
|
||||
numDepthRasterBackface = 0;
|
||||
numDepthRasterEarlySize = 0;
|
||||
numDepthRasterNoPixels = 0;
|
||||
numDepthRasterTooSmall = 0;
|
||||
numDepthRasterZCulled = 0;
|
||||
|
@ -160,7 +160,7 @@ struct GPUStatistics {
|
|||
int vertexGPUCycles;
|
||||
int otherGPUCycles;
|
||||
int numDepthRasterPrims;
|
||||
int numDepthRasterBackface;
|
||||
int numDepthRasterEarlySize;
|
||||
int numDepthRasterNoPixels;
|
||||
int numDepthRasterTooSmall;
|
||||
int numDepthRasterZCulled;
|
||||
|
|
|
@ -1801,7 +1801,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
|
|||
"replacer: tracks %d references, %d unique textures\n"
|
||||
"Cpy: depth %d, color %d, reint %d, blend %d, self %d\n"
|
||||
"GPU cycles: %d (%0.1f per vertex)\n"
|
||||
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull, %d box\n%s",
|
||||
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d earlysize, %d zcull, %d box\n%s",
|
||||
gpuStats.msProcessingDisplayLists * 1000.0f,
|
||||
gpuStats.numDrawSyncs,
|
||||
gpuStats.numListSyncs,
|
||||
|
@ -1843,7 +1843,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
|
|||
gpuStats.numDepthRasterPrims,
|
||||
gpuStats.numDepthRasterNoPixels,
|
||||
gpuStats.numDepthRasterTooSmall,
|
||||
gpuStats.numDepthRasterBackface,
|
||||
gpuStats.numDepthRasterEarlySize,
|
||||
gpuStats.numDepthRasterZCulled,
|
||||
gpuStats.numDepthEarlyBoxCulled,
|
||||
debugRecording_ ? "(debug-recording)" : ""
|
||||
|
|
Loading…
Add table
Reference in a new issue