Add convenient wrappers

2025-04-02 11:01:50 -04:00 · 2024-12-19 10:38:56 +01:00 · 2024-12-19 10:38:56 +01:00 · 72c954d8c3
commit 72c954d8c3
parent 09afe363ca
1 changed file with 6 additions and 13 deletions
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -1,5 +1,6 @@
 #include <algorithm>
 #include <cstring>
+#include <cstdint>

 #include "Common/Math/CrossSIMD.h"
 #include "GPU/Common/DepthRaster.h"
@@ -18,18 +19,9 @@ struct Vec4S32 {
 	Vec4S32 operator -(Vec4S32 other) const {
 		return Vec4S32{ _mm_sub_epi32(v, other.v) };
 	}
-	// This is really bad if we restrict ourselves to SSE2 only.
-	// If we have SSE4, we can do _mm_mullo_epi32.
-	// Let's avoid using it as much as possible.
-	// https://stackoverflow.com/questions/17264399/fastest-way-to-multiply-two-vectors-of-32bit-integers-in-c-with-sse
+	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const {
-		__m128i a13 = _mm_shuffle_epi32(v, 0xF5);          // (-,a3,-,a1)
-		__m128i b13 = _mm_shuffle_epi32(other.v, 0xF5);          // (-,b3,-,b1)
-		__m128i prod02 = _mm_mul_epu32(v, other.v);                 // (-,a2*b2,-,a0*b0)
-		__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
-		__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);   // (-,-,a1*b1,a0*b0) 
-		__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);   // (-,-,a3*b3,a2*b2) 
-		return Vec4S32{ _mm_unpacklo_epi64(prod01, prod23) };   // (ab3,ab2,ab1,ab0)
+		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };   // (ab3,ab2,ab1,ab0)
 	}
 };

@@ -234,7 +226,8 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
 	int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;

-	// Compute triangle area
+	// Compute triangle area.
+	// TODO: Cull really small triangles here - we can just raise the comparison value below.
 	int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
 	if (triArea <= 0) {
 		// Too small to rasterize or backface culled
@@ -287,7 +280,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 			if (!mask) {
 				continue;
 			}
-			// Compute barycentric-interpolated depth
+			// Compute barycentric-interpolated depth. Could also compute it incrementally.
 			float depth = zz[0] + beta * zz[1] + gamma * zz[2];
 			float previousDepthValue = (float)depthBuf[idx];