From 72c954d8c31176ca2502ba74af4c13d57f8a8c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 19 Dec 2024 10:38:56 +0100 Subject: [PATCH] Add convenient wrappers --- GPU/Common/DepthRaster.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index af20c9c89a..2bb83831bb 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "Common/Math/CrossSIMD.h" #include "GPU/Common/DepthRaster.h" @@ -18,18 +19,9 @@ struct Vec4S32 { Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; } - // This is really bad if we restrict ourselves to SSE2 only. - // If we have SSE4, we can do _mm_mullo_epi32. - // Let's avoid using it as much as possible. - // https://stackoverflow.com/questions/17264399/fastest-way-to-multiply-two-vectors-of-32bit-integers-in-c-with-sse + // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. Vec4S32 operator *(Vec4S32 other) const { - __m128i a13 = _mm_shuffle_epi32(v, 0xF5); // (-,a3,-,a1) - __m128i b13 = _mm_shuffle_epi32(other.v, 0xF5); // (-,b3,-,b1) - __m128i prod02 = _mm_mul_epu32(v, other.v); // (-,a2*b2,-,a0*b0) - __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) - __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0) - __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2) - return Vec4S32{ _mm_unpacklo_epi64(prod01, prod23) }; // (ab3,ab2,ab1,ab0) + return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0) } }; @@ -234,7 +226,8 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y; int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y; - // Compute triangle area + // Compute triangle area. + // TODO: Cull really small triangles here - we can just raise the comparison value below. int triArea = A0 * verts[0].x + B0 * verts[0].y + C0; if (triArea <= 0) { // Too small to rasterize or backface culled @@ -287,7 +280,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, if (!mask) { continue; } - // Compute barycentric-interpolated depth + // Compute barycentric-interpolated depth. Could also compute it incrementally. float depth = zz[0] + beta * zz[1] + gamma * zz[2]; float previousDepthValue = (float)depthBuf[idx];