Reimplement the depth rasterizer with SIMD.

2025-04-02 11:01:50 -04:00 · 2024-12-21 11:28:29 +01:00 · 2024-12-21 11:28:29 +01:00 · 73ae6da757
commit 73ae6da757
parent 399570e411
1 changed files with 139 additions and 112 deletions
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@ -8,7 +8,14 @@
 #include "Common/Math/math_util.h"
 #include "GPU/Common/VertexDecoderCommon.h"

-void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
+// We only need to support these three modes.
+enum class ZCompareMode {
+	Greater,  // Most common
+	Less,  // Less common
+	Always,  // Fairly common
+};
+
+void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, ZCompareMode compareMode) {
 	// Swap coordinates if needed, we don't back-face-cull rects.
 	// We also ignore the UV rotation here.
 	if (x1 > x2) {
@ -26,8 +33,8 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 	for (int y = y1; y < y2; y++) {
 		__m128i *ptr = (__m128i *)(dest + stride * y + x1);
 		int w = x2 - x1;
-		switch (depthCompare) {
-		case GE_COMP_ALWAYS:
+		switch (compareMode) {
+		case ZCompareMode::Always:
 			if (depthValue == 0) {
 				memset(ptr, 0, w * 2);
 			} else {
@ -39,8 +46,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 			}
 			break;
 			// TODO: Trailer
-		case GE_COMP_NEVER:
-			break;
 		default:
 			// TODO
 			break;
@ -53,8 +58,8 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 		uint16_t *ptr = (uint16_t *)(dest + stride * y + x1);
 		int w = x2 - x1;

-		switch (depthCompare) {
-		case GE_COMP_ALWAYS:
+		switch (compareMode) {
+		case ZCompareMode::Always:
 			if (depthValue == 0) {
 				memset(ptr, 0, w * 2);
 			} else {
@ -66,8 +71,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 			}
 			break;
 			// TODO: Trailer
-		case GE_COMP_NEVER:
-			break;
 		default:
 			// TODO
 			break;
@ -78,10 +81,39 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 #endif
 }

+alignas(16) static const int zero123[4]  = {0, 1, 2, 3};
+
+struct Edge {
+	// Dimensions of our pixel group
+	static const int stepXSize = 4;
+	static const int stepYSize = 1;
+
+	Vec4S32 oneStepX;
+	Vec4S32 oneStepY;
+
+	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {
+		// Edge setup
+		int A = v0y - v1y;
+		int B = v1x - v0x;
+		int C = v0x * v1y - v0y * v1x;
+
+		// Step deltas
+		oneStepX = Vec4S32::Splat(A * stepXSize);
+		oneStepY = Vec4S32::Splat(B * stepYSize);
+
+		// x/y values for initial pixel block. Add horizontal offsets.
+		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
+		Vec4S32 y = Vec4S32::Splat(p0y);
+
+		// Edge function values at origin
+		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
+	}
+};
+
 // Adapted from Intel's depth rasterizer example.
 // Started with the scalar version, will SIMD-ify later.
 // x1/y1 etc are the scissor rect.
-void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, GEComparison compareMode) {
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, ZCompareMode compareMode) {
 	int tileStartX = x1;
 	int tileEndX = x2;

@ -93,124 +125,90 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	// are slow on SSE2.

 	// Convert to whole pixels for now. Later subpixel precision.
-	DepthScreenVertex verts[3];
-	verts[0].x = tx[0];
-	verts[0].y = ty[0];
-	verts[0].z = tz[0];
-	verts[1].x = tx[1];
-	verts[1].y = ty[1];
-	verts[1].z = tz[1];
-	verts[2].x = tx[2];
-	verts[2].y = ty[2];
-	verts[2].z = tz[2];
+	int v0x = tx[0];
+	int v0y = ty[0];
+	int v0z = tz[0];
+	int v1x = tx[1];
+	int v1y = ty[1];
+	int v1z = tz[1];
+	int v2x = tx[2];
+	int v2y = ty[2];
+	int v2z = tz[2];

 	// use fixed-point only for X and Y.  Avoid work for Z and W.
-	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX);
-	int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX);
-
-	int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY);
-	int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY);
-	if (endX == startX || endY == startY) {
+	// We use 4x1 tiles for simplicity.
+	int minX = std::max(std::min(std::min(v0x, v1x), v2x), tileStartX) & ~3;
+	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, tileEndX) & ~3;
+	int minY = std::max(std::min(std::min(v0y, v1y), v2y), tileStartY);
+	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY);
+	if (maxX == minX || maxY == minY) {
 		// No pixels, or outside screen.
 		return;
 	}
+
 	// TODO: Cull really small triangles here.

-	// Fab(x, y) =     Ax       +       By     +      C              = 0
-	// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
-	// Compute A = (ya - yb) for the 3 line segments that make up each triangle
-	int A0 = verts[1].y - verts[2].y;
-	int A1 = verts[2].y - verts[0].y;
-	int A2 = verts[0].y - verts[1].y;
+	Edge e01, e12, e20;

-	// Compute B = (xb - xa) for the 3 line segments that make up each triangle
-	int B0 = verts[2].x - verts[1].x;
-	int B1 = verts[0].x - verts[2].x;
-	int B2 = verts[1].x - verts[0].x;
+	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
+	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
+	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);

-	// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
-	int C0 = verts[1].x * verts[2].y - verts[2].x * verts[1].y;
-	int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
-	int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;
-
-	// Compute triangle area.
-	// TODO: Cull really small triangles here - we can just raise the comparison value below.
-	int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
+	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
 	if (triArea <= 0) {
-		// Too small to rasterize or backface culled
-		// NOTE: Just disabling this check won't enable two-sided rendering.
-		// Since it's not that common, let's just queue the triangles with both windings.
 		return;
 	}
+	float oneOverTriArea = 1.0f / (float)triArea;

-	int rowIdx = (startY * stride + startX);
-	int col = startX;
-	int row = startY;
+	// Prepare to interpolate Z
+	Vec4F32 zz0 = Vec4F32::Splat((float)v0z);
+	Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea);
+	Vec4F32 zz2 = Vec4F32::Splat((float)(v2z - v0z) * oneOverTriArea);

-	// Calculate slopes at starting corner.
-	int alpha0 = (A0 * col) + (B0 * row) + C0;
-	int beta0 = (A1 * col) + (B1 * row) + C1;
-	int gamma0 = (A2 * col) + (B2 * row) + C2;
+	// Rasterize
+	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY) {
+		// Barycentric coordinates at start of row
+		Vec4S32 w0 = w0_row;
+		Vec4S32 w1 = w1_row;
+		Vec4S32 w2 = w2_row;

-	float oneOverTriArea = (1.0f / float(triArea));
+		uint16_t *rowPtr = depthBuf + stride * y;

-	float zz[3];
-	zz[0] = (float)verts[0].z;
-	zz[1] = (float)(verts[1].z - verts[0].z) * oneOverTriArea;
-	zz[2] = (float)(verts[2].z - verts[0].z) * oneOverTriArea;
-
-	// END triangle setup.
-
-	// Here we should draw four triangles in a sequence.
-
-	// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
-	for (int r = startY; r < endY; r++,
-		row++,
-		rowIdx += stride,
-		alpha0 += B0,
-		beta0 += B1,
-		gamma0 += B2)
-	{
-		int idx = rowIdx;
-
-		// Restore row steppers.
-		int alpha = alpha0;
-		int beta = beta0;
-		int gamma = gamma0;
-
-		for (int c = startX; c < endX; c++,
-			idx++,
-			alpha += A0,
-			beta += A1,
-			gamma += A2)
-		{
-			int mask = (alpha | beta | gamma) >= 0;
-			// Early out if all of this quad's pixels are outside the triangle.
-			if (!mask) {
+		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX) {
+			// If p is on or inside all edges for any pixels,
+			// render those pixels.
+			Vec4S32 signCalc = w0 | w1 | w2;
+			if (!AnyZeroSignBit(signCalc)) {
 				continue;
 			}
-			// Compute barycentric-interpolated depth. Could also compute it incrementally.
-			float depth = zz[0] + beta * zz[1] + gamma * zz[2];
-			float previousDepthValue = (float)depthBuf[idx];

-			int depthMask;
+			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+			// Compute the Z value for all four pixels.
+			// float depth = zz[0] + beta * zz[1] + gamma * zz[2];
+			Vec4U16 shortZ = Vec4U16::FromVec4F32(zz0 + Vec4F32FromS32(w1) * zz1 + Vec4F32FromS32(w2) * zz2);
+
+			// TODO: Lift this switch out of the inner loop, or even out of the function with templating.
 			switch (compareMode) {
-			case GE_COMP_EQUAL:  depthMask = depth == previousDepthValue; break;
-			case GE_COMP_LESS: depthMask = depth < previousDepthValue; break;
-			case GE_COMP_LEQUAL: depthMask = depth <= previousDepthValue; break;
-			case GE_COMP_GEQUAL: depthMask = depth >= previousDepthValue; break;
-			case GE_COMP_GREATER: depthMask = depth > previousDepthValue; break;
-			case GE_COMP_NOTEQUAL: depthMask = depth != previousDepthValue; break;
-			case GE_COMP_ALWAYS:
-			default:
-				depthMask = 1;
+			case ZCompareMode::Greater:
+				// To implement the greater/greater-than comparison, we can combine mask and max.
+				// It might be better to do the math in float space on x86 due to SSE2 deficiencies.
+				// We use AndNot to zero out Z results, before doing Max with the buffer.
+				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Less:  // UNTESTED
+				// This time, we OR the mask and use .Min.
+				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Always:  // UNTESTED
+				// This could be replaced with a vblend operation.
+				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
 				break;
 			}
-			int finalMask = mask & depthMask;
-			depth = finalMask == 1 ? depth : previousDepthValue;
-			depthBuf[idx] = (u16)depth;
-		} //for each column
-	} // for each row
+		}
+	}
 }

 void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
@ -299,9 +297,9 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 		z *= recipW;

 		Vec4S32 screen[3];
-		screen[0] = VecS32FromF32((x * viewportScaleX + viewportX) - offsetX);
-		screen[1] = VecS32FromF32((y * viewportScaleY + viewportY) - offsetY);
-		screen[2] = VecS32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
+		screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
+		screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
+		screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));

 		screen[0].Store(tx + outCount);
 		screen[1].Store(ty + outCount);
@ -341,12 +339,41 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
 	// Prim should now be either TRIANGLES or RECTs.
 	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);

+	// Ignore draws where stencil operations are active?
+	if (gstate.isStencilTestEnabled()) {
+		// return;
+	}
+
 	GEComparison compareMode = gstate.getDepthTestFunction();
+
+	ZCompareMode comp;
+	// Ignore some useless compare modes.
+	switch (compareMode) {
+	case GE_COMP_NEVER:
+	case GE_COMP_EQUAL:
+		// These will never have a useful effect in Z-only raster.
+		return;
+	case GE_COMP_ALWAYS:
+		comp = ZCompareMode::Always;
+		break;
+	case GE_COMP_LEQUAL:
+	case GE_COMP_LESS:
+		comp = ZCompareMode::Less;
+		break;
+	case GE_COMP_GEQUAL:
+	case GE_COMP_GREATER:
+		comp = ZCompareMode::Greater;  // Most common
+		break;
+	case GE_COMP_NOTEQUAL:
+		// This is highly unusual, let's just ignore it.
+		return;
+	}
+
 	if (gstate.isModeClear()) {
 		if (!gstate.isClearModeDepthMask()) {
 			return;
 		}
-		compareMode = GE_COMP_ALWAYS;
+		comp = ZCompareMode::Always;
 	} else {
 		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
 			return;
@ -358,12 +385,12 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
 			uint16_t z = tz[i + 1];  // depth from second vertex
 			// TODO: Should clip coordinates to the scissor rectangle.
 			// We remove the subpixel information here.
-			DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, compareMode);
+			DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, comp);
 		}
 		break;
 	case GE_PRIM_TRIANGLES:
 		for (int i = 0; i < count; i += 3) {
-			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], compareMode);
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
 		}
 		break;
 	default: