Add convenient wrappers

This commit is contained in:
Henrik Rydgård 2024-12-19 10:38:56 +01:00
parent 09afe363ca
commit 72c954d8c3

View file

@ -1,5 +1,6 @@
#include <algorithm>
#include <cstring>
#include <cstdint>
#include "Common/Math/CrossSIMD.h"
#include "GPU/Common/DepthRaster.h"
@ -18,18 +19,9 @@ struct Vec4S32 {
// Lane-wise subtraction: each 32-bit lane of the result is this lane minus other's lane.
Vec4S32 operator -(Vec4S32 other) const {
	const __m128i difference = _mm_sub_epi32(v, other.v);
	return Vec4S32{ difference };
}
// This is really bad if we restrict ourselves to SSE2 only.
// If we have SSE4, we can do _mm_mullo_epi32.
// Let's avoid using it as much as possible.
// https://stackoverflow.com/questions/17264399/fastest-way-to-multiply-two-vectors-of-32bit-integers-in-c-with-sse
// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
// Lane-wise multiply of four 32-bit integers, keeping the low 32 bits of each product.
Vec4S32 operator *(Vec4S32 other) const {
// Manual SSE2 sequence: _mm_mul_epu32 only multiplies lanes 0 and 2 (producing 64-bit results),
// so the odd lanes are first shuffled down into the even positions and multiplied separately.
__m128i a13 = _mm_shuffle_epi32(v, 0xF5); // (-,a3,-,a1)
__m128i b13 = _mm_shuffle_epi32(other.v, 0xF5); // (-,b3,-,b1)
__m128i prod02 = _mm_mul_epu32(v, other.v); // (-,a2*b2,-,a0*b0)
__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
// Interleave the low 32-bit halves of the four products back into lane order.
__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0)
__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2)
return Vec4S32{ _mm_unpacklo_epi64(prod01, prod23) }; // (ab3,ab2,ab1,ab0)
// NOTE(review): as written this statement is unreachable, because the return above always executes.
// Presumably the manual sequence above is the pre-change code and this wrapper call is its
// replacement (the two appear together in this diff view) - confirm against the actual file.
return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0)
}
};
@ -234,7 +226,8 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;
// Compute triangle area
// Compute triangle area.
// TODO: Cull really small triangles here - we can just raise the comparison value below.
int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
if (triArea <= 0) {
// Too small to rasterize or backface culled
@ -287,7 +280,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
if (!mask) {
continue;
}
// Compute barycentric-interpolated depth
// Compute barycentric-interpolated depth. Could also compute it incrementally.
float depth = zz[0] + beta * zz[1] + gamma * zz[2];
float previousDepthValue = (float)depthBuf[idx];