From 373569bf64e33977404e8b48402a0ef271f66f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 29 Dec 2024 17:51:44 +0100 Subject: [PATCH] More prep. Add triangle loop. --- GPU/Common/DepthRaster.cpp | 130 ++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 61 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 52bfb30353..2eb4da83b9 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -104,8 +104,9 @@ constexpr int MIN_TWICE_TRI_AREA = 10; template TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { // BEGIN triangle setup. This should be done SIMD, four triangles at a time. - // Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls - // are slow on SSE2. + // 16x16->32 multiplications are doable on SSE2, which should be all we need. + + // We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying. // NOTE: Triangles are stored in groups of 4. int x0 = tx[0]; @@ -115,12 +116,11 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor int x2 = tx[8]; int y2 = ty[8]; - // use fixed-point only for X and Y. Avoid work for Z and W. - // We use 4x1 tiles for simplicity. int minX = std::max(std::min(std::min(x0, x1), x2), (int)scissor.x1) & ~3; int maxX = std::min(std::max(std::max(x0, x1), x2) + 3, (int)scissor.x2) & ~3; int minY = std::max(std::min(std::min(y0, y1), y2), (int)scissor.y1); int maxY = std::min(std::max(std::max(y0, y1), y2), (int)scissor.y2); + if (maxX == minX || maxY == minY) { // No pixels, or outside screen. // Most of these are now gone in the initial pass. @@ -150,11 +150,6 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor int B01 = x1 - x0; int C01 = x0 * y1 - y0 * x1; - // Prepare to interpolate Z - float zbase = tz[0]; - float z_20 = (tz[4] - tz[0]) * oneOverTriArea; - float z_01 = (tz[8] - tz[0]) * oneOverTriArea; - // Step deltas int stepX12 = A12 * stepXSize; int stepY12 = B12 * stepYSize; @@ -163,67 +158,80 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor int stepX01 = A01 * stepXSize; int stepY01 = B01 * stepYSize; - // x/y values for initial pixel block. Add horizontal offsets. - Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); - int initialY = minY; - - // Convert per-triangle values to wide registers. + // Prepare to interpolate Z + float zbase = tz[0]; + float z_20 = (tz[4] - tz[0]) * oneOverTriArea; + float z_01 = (tz[8] - tz[0]) * oneOverTriArea; + float zdx = z_20 * (float)stepX20 + z_01 * (float)stepX01; + float zdy = z_20 * (float)stepY20 + z_01 * (float)stepY01; // Edge function values at origin - Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12); - Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20); - Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01); + // TODO: We could SIMD the second part here. + for (int t = 0; t < 1; t++) { + // Check for bad triangle. + if (triArea[t] == 0) { + continue; + } - Vec4F32 zdeltaX = Vec4F32::Splat(z_20 * (float)stepX20 + z_01 * (float)stepX01); - Vec4F32 zdeltaY = Vec4F32::Splat(z_20 * (float)stepY20 + z_01 * (float)stepY01); - Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01; + // Convert per-triangle values to wide registers. + Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); + int initialY = minY; - Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12); - Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12); - Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20); - Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20); - Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01); - Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01); - // Rasterize - for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { - // Barycentric coordinates at start of row - Vec4S32 w0 = w0_row; - Vec4S32 w1 = w1_row; - Vec4S32 w2 = w2_row; - Vec4F32 zs = zrow; + Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12); + Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20); + Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01); - uint16_t *rowPtr = depthBuf + stride * y; + Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01; + Vec4F32 zdeltaX = Vec4F32::Splat(zdx); + Vec4F32 zdeltaY = Vec4F32::Splat(zdy); - for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) { - // If p is on or inside all edges for any pixels, - // render those pixels. - Vec4S32 signCalc = w0 | w1 | w2; - if (!AnyZeroSignBit(signCalc)) { - continue; - } + Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12); + Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12); + Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20); + Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20); + Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01); + Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01); + // Rasterize + for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { + // Barycentric coordinates at start of row + Vec4S32 w0 = w0_row; + Vec4S32 w1 = w1_row; + Vec4S32 w2 = w2_row; + Vec4F32 zs = zrow; - Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x); - Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc); - // Now, the mask has 1111111 where we should preserve the contents of the depth buffer. + uint16_t *rowPtr = depthBuf + stride * y; - Vec4U16 shortZ = Vec4U16::FromVec4F32(zs); + for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) { + // If p is on or inside all edges for any pixels, + // render those pixels. + Vec4S32 signCalc = w0 | w1 | w2; + if (!AnyZeroSignBit(signCalc)) { + continue; + } - // This switch is on a templated constant, so should collapse away. - switch (compareMode) { - case ZCompareMode::Greater: - // To implement the greater/greater-than comparison, we can combine mask and max. - // Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output. - // We use AndNot to zero out Z results, before doing Max with the buffer. - AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x); - break; - case ZCompareMode::Less: // UNTESTED - // This time, we OR the mask and use .Min. - (shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x); - break; - case ZCompareMode::Always: // UNTESTED - // This could be replaced with a vblend operation. - ((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x); - break; + Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x); + Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc); + // Now, the mask has 1111111 where we should preserve the contents of the depth buffer. + + Vec4U16 shortZ = Vec4U16::FromVec4F32(zs); + + // This switch is on a templated constant, so should collapse away. + switch (compareMode) { + case ZCompareMode::Greater: + // To implement the greater/greater-than comparison, we can combine mask and max. + // Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output. + // We use AndNot to zero out Z results, before doing Max with the buffer. + AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x); + break; + case ZCompareMode::Less: // UNTESTED + // This time, we OR the mask and use .Min. + (shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x); + break; + case ZCompareMode::Always: // UNTESTED + // This could be replaced with a vblend operation. + ((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x); + break; + } } } }