Merge pull request #19786 from hrydgard/even-more-depth-work
DepthRaster: Better guardband rejection, parallelize triangle setup
This commit is contained in: commit bbc933b1b1
4 changed files with 327 additions and 198 deletions
@@ -121,12 +121,6 @@ struct Vec4S32 {
 	void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
 	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v); }

-	// Swaps the two lower elements. Useful for reversing triangles.
-	Vec4S32 SwapLowerElements() {
-		return Vec4S32{
-			_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
 	Vec4S32 SignBits32ToMask() {
 		return Vec4S32{
 			_mm_srai_epi32(v, 31)
@@ -137,13 +131,19 @@ struct Vec4S32 {
 	// On SSE2, much faster than _mm_mullo_epi32_SSE2.
 	// On NEON though, it'll read the full 32 bits, so beware.
 	// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
-	Vec4S32 MulAsS16(Vec4S32 other) const {
+	Vec4S32 Mul16(Vec4S32 other) const {
 		// Note that we only need to mask one of the inputs, so we get zeroes - multiplying
 		// by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is
 		// in the other register.
 		return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
 	}

+	Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }
+
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
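A side note on Mul16 above, since the new triangle setup relies on it heavily: per 32-bit lane, the madd-based SSE2 path multiplies only the two low halfwords. A scalar model of one lane (a standalone sketch, not part of this diff):

#include <cassert>
#include <cstdint>

// Scalar model of one 32-bit lane of Mul16 on SSE2: the second operand is
// masked to its low 16 bits, so madd's pairwise sum is lo(a)*lo(b) + hi(a)*0,
// i.e. the signed product of the two low halfwords.
int32_t Mul16Lane(int32_t a, int32_t b) {
	int16_t aLo = (int16_t)(a & 0xFFFF);  // low halfword, sign-interpreted
	int16_t bLo = (int16_t)(b & 0xFFFF);
	return (int32_t)aLo * (int32_t)bLo;   // hi(a) * 0 contributes nothing
}

int main() {
	// Matches a full 32-bit multiply while both inputs are in [-32768, 32767]:
	assert(Mul16Lane(-4096, -4096) == -4096 * -4096);
	// But silently differs once an input needs more than 16 bits:
	assert(Mul16Lane(0x12345, 3) != 0x12345 * 3);
	return 0;
}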
@@ -152,6 +152,18 @@ struct Vec4S32 {
 	// TODO: andnot
+	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
+	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
+	void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
+	void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }
+
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; }  // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
+	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }
+
 	// NOTE: May be slow.
 	int operator[](size_t index) const { return ((int *)&v)[index]; }

 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
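Similarly, Min16/Max16 added above reinterpret each 32-bit lane as two independent 16-bit halves on SSE2, which is why FixupAfterMinMax (a 16->32 sign extension) is needed before the results are used as plain 32-bit integers. A scalar model of one lane (a standalone sketch, not part of this diff):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one 32-bit lane of Min16 on SSE2: _mm_min_epi16 sees the
// lane as two signed 16-bit halves, min'd separately, so the halves of the
// result can come from different inputs.
int32_t Min16Lane(int32_t a, int32_t b) {
	int16_t lo = std::min((int16_t)(a & 0xFFFF), (int16_t)(b & 0xFFFF));
	int16_t hi = std::min((int16_t)((uint32_t)a >> 16), (int16_t)((uint32_t)b >> 16));
	return ((int32_t)(uint16_t)lo) | ((int32_t)hi << 16);
}

// FixupAfterMinMax: re-sign-extend the low half, discarding the high half.
int32_t Fixup(int32_t v) { return (int32_t)(int16_t)(v & 0xFFFF); }

int main() {
	// For sign-extended 16-bit inputs the halves agree and the result is exact:
	assert(Min16Lane(-3, 5) == -3);
	// An input with a nonzero high halfword mixes halves (0x0001FFFF vs 5 gives
	// 0x0000FFFF, which is neither input); the fixup collapses it back into a
	// well-formed sign-extended 16-bit value:
	assert(Fixup(Min16Lane(0x0001FFFF, 5)) == -1);
	return 0;
}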
@@ -216,10 +228,14 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
 	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
 	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }

 	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
-	Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }

 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
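The Recip/RecipApprox split above trades accuracy for speed: _mm_rcp_ps is only a roughly 12-bit estimate, while the division-based Recip is correctly rounded. A quick standalone comparison (a sketch, not part of this diff):

#include <cstdio>
#include <xmmintrin.h>

int main() {
	float in[4] = { 3.0f, 7.0f, 100.0f, 0.1f };
	float approx[4], exact[4];
	// RecipApprox path: the hardware estimate.
	_mm_storeu_ps(approx, _mm_rcp_ps(_mm_loadu_ps(in)));
	// Recip path: full-precision division.
	_mm_storeu_ps(exact, _mm_div_ps(_mm_set1_ps(1.0f), _mm_loadu_ps(in)));
	for (int i = 0; i < 4; i++) {
		printf("1/%g: approx=%.9g exact=%.9g relerr=%.3g\n",
		       in[i], approx[i], exact[i], (approx[i] - exact[i]) / exact[i]);
	}
	return 0;
}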
@@ -238,13 +254,6 @@ struct Vec4F32 {
 		return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
 	}

-	// Swaps the two lower elements. Useful for reversing triangles.
-	Vec4F32 SwapLowerElements() {
-		return Vec4F32{
-			_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
-
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 		return Vec4F32{ _mm_add_ps(
 			_mm_add_ps(
@@ -261,6 +270,19 @@ struct Vec4F32 {
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
 		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
 	}

+	// This is here because ARM64 can do this very efficiently.
+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		col0.v = _mm_loadu_ps(src);
+		col1.v = _mm_loadu_ps(src + 4);
+		col2.v = _mm_loadu_ps(src + 8);
+		col3.v = _mm_loadu_ps(src + 12);
+		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
+	}
+
+	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
+	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
+	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }
 };

 inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
@@ -309,6 +331,12 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }
+
+	inline Vec4U16 AndNot(Vec4U16 inverted) {
+		return Vec4U16{
+			_mm_andnot_si128(inverted.v, v)  // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
+		};
+	}
 };

 struct Vec8U16 {
@@ -328,12 +356,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	};
 }

-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{
-		_mm_andnot_si128(inverted.v, a.v)  // NOTE: with andnot, the first parameter is inverted, and then and is performed.
-	};
-}
-
 #elif PPSSPP_ARCH(ARM_NEON)

 struct Mat4F32 {
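Both the SSE2 and NEON wrappers carry the same note because _mm_andnot_si128(a, b) computes ~a & b: the first operand is the one that gets inverted. The member form flips this into the more readable v.AndNot(inverted). A scalar model of the convention (a standalone sketch, not part of this diff):

#include <cassert>
#include <cstdint>

// What Vec4S32::AndNot / Vec4U16::AndNot compute per lane: keep bits of v
// wherever the mask is 0, clear them wherever the mask is 1.
uint32_t AndNotLane(uint32_t v, uint32_t inverted) {
	return v & ~inverted;
}

int main() {
	assert(AndNotLane(0xDEADBEEF, 0x00000000) == 0xDEADBEEF);  // mask clear: pass-through
	assert(AndNotLane(0xDEADBEEF, 0xFFFF0000) == 0x0000BEEF);  // mask set: bits zeroed
	return 0;
}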
@@ -443,16 +465,17 @@ struct Vec4S32 {
 	void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
 	void StoreAligned(int *dst) { vst1q_s32(dst, v); }

-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4S32 SwapLowerElements() {
-		int32x2_t upper = vget_high_s32(v);
-		int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
-		return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
-	};
-
 	// Warning: Unlike on x86, this is a full 32-bit multiplication.
-	Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+
+	Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }
+
+	// NOTE: May be slow.
+	int operator[](size_t index) const { return ((int *)&v)[index]; }

 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
@@ -460,6 +483,12 @@ struct Vec4S32 {
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
 	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
 	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v)) }; }
+	Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }

 	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
@@ -508,6 +537,9 @@ struct Vec4F32 {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}

+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }
+
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
@@ -517,19 +549,27 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
 	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
 	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
+	void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

 	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

-	Vec4F32 Recip() {
+	Vec4F32 Recip() const {
 		float32x4_t recip = vrecpeq_f32(v);
 		// Use a couple Newton-Raphson steps to refine the estimate.
-		// May be able to get away with only one refinement, not sure!
+		// To save one iteration at the expense of accuracy, use RecipApprox().
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		return Vec4F32{ recip };
 	}

+	Vec4F32 RecipApprox() const {
+		float32x4_t recip = vrecpeq_f32(v);
+		// To approximately match the precision of x86-64's rcpps, do a single iteration.
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		return Vec4F32{ recip };
+	}
+
 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
 			vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
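The vrecpsq_f32/vmulq_f32 pairs above are Newton-Raphson steps: vrecpsq_f32(v, r) yields 2 - v*r, so each pair computes r' = r * (2 - v*r), roughly doubling the number of correct bits per step. A scalar model (a standalone sketch; the initial estimate here is made up, standing in for vrecpeq_f32):

#include <cstdio>

int main() {
	float v = 3.0f;
	float r = 0.333f;  // stand-in for the coarse vrecpeq_f32 estimate
	for (int step = 0; step < 2; step++) {
		r = r * (2.0f - v * r);  // one vrecpsq_f32 + vmulq_f32 per step
		printf("after step %d: r = %.9g (true 1/v = %.9g)\n", step + 1, r, 1.0f / v);
	}
	return 0;
}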
@@ -544,12 +584,11 @@ struct Vec4F32 {
 		return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
 	}

-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4F32 SwapLowerElements() {
-		float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
-		return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) };
-	};
 	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
 	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
 	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
 	Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
 	Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }

 	// One of many possible solutions. Sometimes we could also use vld4q_f32, probably.
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
@@ -573,6 +612,15 @@ struct Vec4F32 {
 #endif
 	}

+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		// The optimizer hopefully gets rid of the copies below.
+		float32x4x4_t r = vld4q_f32(src);
+		col0.v = r.val[0];
+		col1.v = r.val[1];
+		col2.v = r.val[2];
+		col3.v = r.val[3];
+	}
+
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 #if PPSSPP_ARCH(ARM64_NEON)
 		float32x4_t sum = vaddq_f32(
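vld4q_f32 is a 4-way deinterleaving load: element j of column i comes from src[j * 4 + i], so loading a row-major 4x4 matrix this way yields its columns directly, which is why ARM64 gets LoadTranspose nearly for free. A portable scalar model of the access pattern (a standalone sketch, not part of this diff):

#include <cassert>

int main() {
	float src[16];
	for (int i = 0; i < 16; i++) src[i] = (float)i;  // row-major 0..15

	float col[4][4];
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			col[i][j] = src[j * 4 + i];  // the vld4q_f32 access pattern

	// Column 1 of the matrix {0..15} is {1, 5, 9, 13}:
	assert(col[1][0] == 1.0f && col[1][1] == 5.0f && col[1][2] == 9.0f && col[1][3] == 13.0f);
	return 0;
}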
@@ -644,6 +692,8 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }
+	Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
 };

 inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
@@ -652,10 +702,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	return Vec4U16{ result };
 }

-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
-}
-
 struct Vec8U16 {
 	uint16x8_t v;
@@ -88,29 +88,29 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) {
 // May later figure out how to use the appropriate ones depending on compile flags.

 inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
 	__m128i a13 = _mm_shuffle_epi32(v0, 0xF5);            // (-,a3,-,a1)
 	__m128i b13 = _mm_shuffle_epi32(v1, 0xF5);            // (-,b3,-,b1)
 	__m128i prod02 = _mm_mul_epu32(v0, v1);               // (-,a2*b2,-,a0*b0)
 	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
 	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
 	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
 	return _mm_unpacklo_epi64(prod01, prod23);
 }

 inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_xor_si128(
 		_mm_max_epi16(
 			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
 			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
 		_mm_set1_epi16((int16_t)0x8000));
 }

 inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_xor_si128(
 		_mm_min_epi16(
 			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
 			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
 		_mm_set1_epi16((int16_t)0x8000));
 }

 // SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.
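The 0x8000 xor in these helpers is the classic bias trick: SSE2 only has signed 16-bit min/max, but flipping the top bit maps unsigned order onto signed order (0 becomes -32768, 65535 becomes 32767), so the operation can run in the biased domain and be unbiased afterwards. A scalar model of the max variant (a standalone sketch, not part of this diff):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Unsigned 16-bit max built from a signed max, mirroring _mm_max_epu16_SSE2.
uint16_t MaxU16ViaS16(uint16_t a, uint16_t b) {
	int16_t sa = (int16_t)(a ^ 0x8000);  // bias into signed order
	int16_t sb = (int16_t)(b ^ 0x8000);
	return (uint16_t)(std::max(sa, sb) ^ 0x8000);  // unbias the result
}

int main() {
	// A plain signed max would get this wrong, since 0xFFFF reads as -1:
	assert(MaxU16ViaS16(0xFFFF, 1) == 0xFFFF);
	assert(MaxU16ViaS16(0x7FFF, 0x8000) == 0x8000);
	return 0;
}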
@@ -85,36 +85,15 @@ static void DepthRasterRect(uint16_t *dest, int stride, const DepthScissor sciss
 	}
 }

 alignas(16) static const int zero123[4] = {0, 1, 2, 3};

-struct Edge {
-	// Dimensions of our pixel group
-	static const int stepXSize = 4;
-	static const int stepYSize = 1;
-
-	Vec4S32 oneStepX;
-	Vec4S32 oneStepY;
-
-	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {
-		// Edge setup
-		int A = v0y - v1y;
-		int B = v1x - v0x;
-		int C = v0x * v1y - v0y * v1x;
-
-		// Step deltas
-		oneStepX = Vec4S32::Splat(A * stepXSize);
-		oneStepY = Vec4S32::Splat(B * stepYSize);
-
-		// x/y values for initial pixel block. Add horizontal offsets.
-		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
-		Vec4S32 y = Vec4S32::Splat(p0y);
-
-		// Edge function values at origin
-		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
-	}
-};
+// Dimensions of our pixel group
+constexpr int stepXSize = 4;
+constexpr int stepYSize = 1;
+
+constexpr int stepXShift = 2;
+constexpr int stepYShift = 0;

-enum class TriangleResult {
+enum class TriangleStat {
 	OK,
 	NoPixels,
 	SmallOrBackface,
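For background on the rasterizer below: it is built on edge functions, following the ryg blog series the code itself references. For an edge from (ax, ay) to (bx, by), E(x, y) = (ay - by) * x + (bx - ax) * y + (ax * by - ay * bx) is zero on the edge and has opposite signs on its two sides; a point is inside the triangle when all three edge functions agree in sign, and stepping one pixel in x or y just adds the A or B coefficient, which is exactly what the step deltas precompute. A standalone sketch (not part of this diff):

#include <cstdio>

int main() {
	int ax = 0, ay = 0, bx = 10, by = 0;  // edge along the x axis
	int A = ay - by, B = bx - ax, C = ax * by - ay * bx;
	auto E = [&](int x, int y) { return A * x + B * y + C; };
	printf("E(5, 3)  = %d (one side of the edge)\n", E(5, 3));    // 30
	printf("E(5, -3) = %d (the other side)\n", E(5, -3));         // -30
	printf("E(7, 0)  = %d (on the edge)\n", E(7, 0));             // 0
	return 0;
}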
@@ -122,109 +101,154 @@ enum class TriangleResult {

 constexpr int MIN_TWICE_TRI_AREA = 10;

-// Adapted from Intel's depth rasterizer example.
-// Started with the scalar version, will SIMD-ify later.
-// x1/y1 etc are the scissor rect.
+// A mix of ideas from Intel's sample and ryg's rasterizer blog series.
 template<ZCompareMode compareMode>
-TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
-	// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
-	// are slow on SSE2.
+void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
+	// Triangle setup. This is done using SIMD, four triangles at a time.
+	// 16x16->32 multiplications are doable on SSE2, which should be all we need.
+
+	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.
+
+	// NOTE: Triangles are stored in groups of 4.
-	int v0x = tx[0];
-	int v0y = ty[0];
-	int v1x = tx[4];
-	int v1y = ty[4];
-	int v2x = tx[8];
-	int v2y = ty[8];
+	Vec4S32 x0 = Vec4S32::LoadAligned(tx);
+	Vec4S32 y0 = Vec4S32::LoadAligned(ty);
+	Vec4S32 x1 = Vec4S32::LoadAligned(tx + 4);
+	Vec4S32 y1 = Vec4S32::LoadAligned(ty + 4);
+	Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8);
+	Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8);

-	// use fixed-point only for X and Y. Avoid work for Z and W.
-	// We use 4x1 tiles for simplicity.
-	int minX = std::max(std::min(std::min(v0x, v1x), v2x), (int)scissor.x1) & ~3;
-	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, (int)scissor.x2) & ~3;
-	int minY = std::max(std::min(std::min(v0y, v1y), v2y), (int)scissor.y1);
-	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
-	if (maxX == minX || maxY == minY) {
-		// No pixels, or outside screen.
-		// Most of these are now gone in the initial pass.
-		return TriangleResult::NoPixels;
-	}
+	// FixupAfterMinMax is just 16->32 sign extension, in case the current platform (like SSE2) just has 16-bit min/max operations.
+	Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax();
+	Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax();
+	Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax();
+	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();

-	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
-	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
-	if (triArea < MIN_TWICE_TRI_AREA) {
-		return TriangleResult::SmallOrBackface;  // Or zero area.
-	}
+	Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0);

-	float oneOverTriArea = 1.0f / (float)triArea;
+	// Edge setup
+	Vec4S32 A12 = y1 - y2;
+	Vec4S32 B12 = x2 - x1;
+	Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2);

-	Edge e01, e12, e20;
+	Vec4S32 A20 = y2 - y0;
+	Vec4S32 B20 = x0 - x2;
+	Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0);

-	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
-	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
-	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
+	Vec4S32 A01 = y0 - y1;
+	Vec4S32 B01 = x1 - x0;
+	Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1);
+
+	// Step deltas
+	Vec4S32 stepX12 = A12.Shl<stepXShift>();
+	Vec4S32 stepY12 = B12.Shl<stepYShift>();
+	Vec4S32 stepX20 = A20.Shl<stepXShift>();
+	Vec4S32 stepY20 = B20.Shl<stepYShift>();
+	Vec4S32 stepX01 = A01.Shl<stepXShift>();
+	Vec4S32 stepY01 = B01.Shl<stepYShift>();

-	// Prepare to interpolate Z
-	Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
-	Vec4F32 zz1 = Vec4F32::Splat((tz[4] - tz[0]) * oneOverTriArea);
-	Vec4F32 zz2 = Vec4F32::Splat((tz[8] - tz[0]) * oneOverTriArea);
+	Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip();
+	Vec4F32 zbase = Vec4F32::LoadAligned(tz);
+	Vec4F32 z_20 = (Vec4F32::LoadAligned(tz + 4) - zbase) * oneOverTriArea;
+	Vec4F32 z_01 = (Vec4F32::LoadAligned(tz + 8) - zbase) * oneOverTriArea;
+	Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01);
+	Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01);

-	Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(e20.oneStepX) + zz2 * Vec4F32FromS32(e01.oneStepX);
-	Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(e20.oneStepY) + zz2 * Vec4F32FromS32(e01.oneStepY);
-	Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;
+	// Shared setup is done, now loop per-triangle in the group of four.
+	for (int t = 0; t < 4; t++) {
+		// Check for bad triangle.
+		// Using operator[] on the vectors actually seems to result in pretty good code.
+		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
+			// No pixels, or outside screen.
+			// Most of these are now gone in the initial pass, but not all since we cull
+			// in 4-groups there.
+			stats[(int)TriangleStat::NoPixels]++;
+			continue;
+		}

-	// Rasterize
-	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
-		// Barycentric coordinates at start of row
-		Vec4S32 w0 = w0_row;
-		Vec4S32 w1 = w1_row;
-		Vec4S32 w2 = w2_row;
-		Vec4F32 zs = zrow;
+		if (triArea[t] < MIN_TWICE_TRI_AREA) {
+			stats[(int)TriangleStat::SmallOrBackface]++;  // Or zero area.
+			continue;
+		}

-		uint16_t *rowPtr = depthBuf + stride * y;
+		const int minXT = minX[t] & ~3;
+		const int maxXT = maxX[t] & ~3;

-		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
-			// If p is on or inside all edges for any pixels,
-			// render those pixels.
-			Vec4S32 signCalc = w0 | w1 | w2;
-			if (!AnyZeroSignBit(signCalc)) {
-				continue;
-			}
+		const int minYT = minY[t];
+		const int maxYT = maxY[t];

-			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
-			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
-			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+		// Convert to wide registers.
+		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
+		int initialY = minY[t];
+		_dbg_assert_(A12[t] < 32767);
+		_dbg_assert_(A12[t] > -32767);
+		_dbg_assert_(A20[t] < 32767);
+		_dbg_assert_(A20[t] > -32767);
+		_dbg_assert_(A01[t] < 32767);
+		_dbg_assert_(A01[t] > -32767);

-			Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+		// TODO: The latter subexpression can be broken out of this loop, but reduces block size flexibility.
+		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
+		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
+		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);

-			// This switch is on a templated constant, so should collapse away.
-			switch (compareMode) {
-			case ZCompareMode::Greater:
-				// To implement the greater/greater-than comparison, we can combine mask and max.
-				// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
-				// We use AndNot to zero out Z results, before doing Max with the buffer.
-				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Less:  // UNTESTED
-				// This time, we OR the mask and use .Min.
-				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Always:  // UNTESTED
-				// This could be replaced with a vblend operation.
-				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
-				break;
-			}
-		}
-	}
-	return TriangleResult::OK;
-}
+		Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t];
+		Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]);
+		Vec4F32 zdeltaY = Vec4F32::Splat(zdy[t]);
+
+		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12[t]);
+		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12[t]);
+		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20[t]);
+		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20[t]);
+		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01[t]);
+		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01[t]);

-template<ZCompareMode compareMode>
-inline void DepthRaster4Triangles(int stats[4], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	for (int i = 0; i < 4; i++) {
-		TriangleResult result = DepthRasterTriangle<compareMode>(depthBuf, stride, scissor, tx + i, ty + i, tz + i);
-		stats[(int)result]++;
+		// Rasterize
+		for (int y = minYT; y <= maxYT; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
+			// Barycentric coordinates at start of row
+			Vec4S32 w0 = w0_row;
+			Vec4S32 w1 = w1_row;
+			Vec4S32 w2 = w2_row;
+			Vec4F32 zs = zrow;
+
+			uint16_t *rowPtr = depthBuf + stride * y;
+
+			for (int x = minXT; x <= maxXT; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
+				// If p is on or inside all edges for any pixels,
+				// render those pixels.
+				Vec4S32 signCalc = w0 | w1 | w2;
+
+				// TODO: Check if this check is profitable. Maybe only for big triangles?
+				if (!AnyZeroSignBit(signCalc)) {
+					continue;
+				}
+
+				Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+				Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+				// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+				Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+
+				// This switch is on a templated constant, so should collapse away.
+				switch (compareMode) {
+				case ZCompareMode::Greater:
+					// To implement the greater/greater-than comparison, we can combine mask and max.
+					// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
+					// We use AndNot to zero out Z results, before doing Max with the buffer.
+					shortZ.AndNot(shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Less:
+					// This time, we OR the mask and use .Min.
+					(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Always:  // UNTESTED
+					// This could be replaced with a vblend operation.
+					((bufferValues & shortMaskInv) | shortZ.AndNot(shortMaskInv)).Store(rowPtr + x);
+					break;
+				}
+			}
+		}
+
+		stats[(int)TriangleStat::OK]++;
 	}
 }
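A scalar model of the branchless Greater-mode store above, to make the mask logic concrete (a standalone sketch, not part of this diff): shortMaskInv is all-ones where a pixel is outside the triangle, so zeroing Z through AndNot and then taking an unsigned max applies coverage mask and depth test in one operation:

#include <cassert>
#include <cstdint>

// One lane of the ZCompareMode::Greater store path.
uint16_t DepthUpdateGreater(uint16_t bufferValue, uint16_t z, uint16_t maskInv) {
	uint16_t zMasked = z & ~maskInv;                       // AndNot: 0 outside the triangle
	return zMasked > bufferValue ? zMasked : bufferValue;  // unsigned Max against the buffer
}

int main() {
	assert(DepthUpdateGreater(100, 200, 0x0000) == 200);  // inside, depth test passes
	assert(DepthUpdateGreater(300, 200, 0x0000) == 300);  // inside, depth test fails
	assert(DepthUpdateGreater(300, 999, 0xFFFF) == 300);  // outside: buffer preserved
	return 0;
}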
@@ -343,7 +367,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	}
 	const bool cullEnabled = draw.cullEnabled;

-	static const float zerovec[4] = {};
+	static const float zerovec[4] = {0.0f, 0.0f, 0.0f, 1.0f};

 	int collected = 0;
 	int planeCulled = 0;
@@ -351,18 +375,36 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	const float *verts[12];  // four triangles at a time!
 	const int count = draw.vertexCount;

-	Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1);
-	Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1);
-	Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2);
-	Vec4F32 scissorY2 = Vec4F32::Splat((float)scissor.y2);
+	// Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster.
+	// This is slightly off-center since we are already in screen space, but whatever.
+	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096);
+	Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096);
+
+	Vec4S32 scissorX1 = Vec4S32::Splat((float)scissor.x1);
+	Vec4S32 scissorY1 = Vec4S32::Splat((float)scissor.y1);
+	Vec4S32 scissorX2 = Vec4S32::Splat((float)scissor.x2);
+	Vec4S32 scissorY2 = Vec4S32::Splat((float)scissor.y2);
+
+	// Add cheap pre-projection pre-checks for bad triangles here. Not much we can do safely other than checking W.
+	auto validVert = [](const float *v) -> bool {
+		if (v[3] <= 0.0f || v[2] <= 0.0f) {
+			return false;
+		}
+		/*
+		if (v[2] >= 65535.0f * v[3]) {
+			return false;
+		}*/
+		return true;
+	};

 	for (int i = 0; i < count; i += 3) {
 		// Collect valid triangles into buffer.
 		const float *v0 = transformed + indexBuffer[i] * 4;
 		const float *v1 = transformed + indexBuffer[i + (1 ^ flipCull)] * 4;
 		const float *v2 = transformed + indexBuffer[i + (2 ^ flipCull)] * 4;
-		// Don't collect triangle if any vertex is behind the 0 plane.
-		if (v0[3] > 0.0f && v1[3] > 0.0f && v2[3] > 0.0f) {
+		// Don't collect triangle if any vertex is beyond the planes.
+		// TODO: Optimize this somehow.
+		if (validVert(v0) && validVert(v1) && validVert(v2)) {
 			verts[collected] = v0;
 			verts[collected + 1] = v1;
 			verts[collected + 2] = v2;
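A quick sanity check on the +/-4096 guardband chosen above (a sketch of the arithmetic, not part of this diff), since it is what keeps the later 16-bit edge setup and the _dbg_assert_ bounds on A12/A20/A01 valid:

#include <cstdint>

// With every guardbanded coordinate in [-4096, 4096], the edge coefficients
// A = y1 - y2 and B = x2 - x1 stay within [-8192, 8192], comfortably inside
// the signed 16-bit range that Mul16 (the madd-based multiply) requires.
// The C terms (x*y products) can reach about 2^25, but they are only ever
// added, never fed into a 16-bit multiply, so 32-bit lanes hold them fine.
static_assert(4096 - (-4096) == 8192, "largest possible |A| or |B|");
static_assert(8192 <= 32767, "fits the 16-bit multiplies used in edge setup");

int main() { return 0; }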
@@ -380,6 +422,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		}

 		if (collected != 12) {
+			// Fetch more!
 			continue;
 		}
@@ -411,38 +454,53 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	Vec4F32 recipW2 = w2.Recip();
 	x0 *= recipW0;
 	y0 *= recipW0;
-	z0 = (z0 * recipW0).Clamp(0.0f, 65535.0f);
+	z0 *= recipW0;
 	x1 *= recipW1;
 	y1 *= recipW1;
-	z1 = (z1 * recipW1).Clamp(0.0f, 65535.0f);
+	z1 *= recipW1;
 	x2 *= recipW2;
 	y2 *= recipW2;
-	z2 = (z2 * recipW2).Clamp(0.0f, 65535.0f);
+	z2 *= recipW2;

-	// Check bounding box size (clamped to screen edges). Cast to integer for crude rounding (and to match the rasterizer).
-	Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)).Max(scissorX1));
-	Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)).Max(scissorY1));
-	Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)).Min(scissorX2));
-	Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)).Min(scissorY2));
+	// Check bounding box size. Cast to integer for crude rounding (and to approximately match the rasterizer).
+	Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)));
+	Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)));
+	Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)));
+	Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)));

-	// If all are equal in any dimension, all four triangles are tiny nonsense (or outside the scissor) and can be skipped early.
+	// If all are equal in any dimension, all four triangles are tiny nonsense and can be skipped early.
 	Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);
-	// Otherwise we just proceed to triangle setup with all four for now. Later might want to
-	// compact the remaining triangles... Or do more checking here.
+	// Otherwise we just proceed to triangle setup with all four for now.
+	// We could also save the computed boxes for later.
+	// TODO: Merge into below checks? Though nice with an early out.
 	if (!AnyZeroSignBit(eqMask)) {
 		boxCulled += 4;
 		continue;
 	}

+	// Create a mask to kill coordinates of triangles that poke outside the guardband (or are just empty).
+	Vec4S32 inGuardBand =
+		((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
+		 (minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask);
+
+	// Create another mask to kill off-screen triangles. Not perfectly accurate.
+	inGuardBand &= (maxX.CompareGt(scissorX1) & minX.CompareLt(scissorX2)) & (maxY.CompareGt(scissorY1) & minY.CompareLt(scissorY2));
+
+	// It's enough to smash one coordinate to make future checks (like the tri area check) fail.
+	x0 &= inGuardBand;
+	x1 &= inGuardBand;
+	x2 &= inGuardBand;

-	// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
+	// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
 	// Still good for culling early and pretty cheap to compute.
-	Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
-	if (!AnyZeroSignBit(triArea)) {
+	Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA));
+	if (!AnyZeroSignBit(doubleTriArea)) {
 		gpuStats.numDepthRasterEarlySize += 4;
 		continue;
 	}

+	// Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected.
 	Vec4S32FromF32(x0).Store(tx + outCount);
 	Vec4S32FromF32(x1).Store(tx + outCount + 4);
 	Vec4S32FromF32(x2).Store(tx + outCount + 8);
@@ -453,13 +511,22 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	z1.Store(tz + outCount + 4);
 	z2.Store(tz + outCount + 8);

+#ifdef _DEBUG
+	for (int i = 0; i < 12; i++) {
+		_dbg_assert_(tx[outCount + i] < 32767);
+		_dbg_assert_(tx[outCount + i] >= -32768);
+		_dbg_assert_(ty[outCount + i] < 32767);
+		_dbg_assert_(ty[outCount + i] >= -32768);
+	}
+#endif
+
 	outCount += 12;

 	if (!cullEnabled) {
-		// If culling is off, store the triangles again, in the opposite order.
-		Vec4S32FromF32(x0).Store(tx + outCount);
-		Vec4S32FromF32(x2).Store(tx + outCount + 4);
-		Vec4S32FromF32(x1).Store(tx + outCount + 8);
+		// If culling is off, store the triangles again, with the first two vertices swapped.
+		(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
+		(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4);
+		(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8);
 		Vec4S32FromF32(y0).Store(ty + outCount);
 		Vec4S32FromF32(y2).Store(ty + outCount + 4);
 		Vec4S32FromF32(y1).Store(ty + outCount + 8);
@@ -514,9 +581,9 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 			}
 		}
 	}
-	gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
-	gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
-	gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
+	gpuStats.numDepthRasterNoPixels += stats[(int)TriangleStat::NoPixels];
+	gpuStats.numDepthRasterTooSmall += stats[(int)TriangleStat::SmallOrBackface];
+	gpuStats.numDepthRasterPrims += stats[(int)TriangleStat::OK];
 	break;
 }
 default:
@@ -56,6 +56,7 @@
 #include "Common/Buffer.h"
 #include "Common/File/Path.h"
 #include "Common/Math/SIMDHeaders.h"
+#include "Common/Math/CrossSIMD.h"
 // Get some more instructions for testing
 #if PPSSPP_ARCH(SSE2)
 #include <immintrin.h>
@@ -1048,7 +1049,7 @@ CharQueue GetQueue() {

 bool TestCharQueue() {
 	// We use a tiny block size for testing.
-	CharQueue queue = std::move(GetQueue());
+	CharQueue queue = GetQueue();

 	// Add 16 chars.
 	queue.push_back("abcdefghijkl");
@@ -1124,6 +1125,21 @@ bool TestSIMD() {
 	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 #endif

+	const int testval[2][4] = {
+		{ 0x1000, 0x2000, 0x3000, 0x7000 },
+		{ -0x1000, -0x2000, -0x3000, -0x7000 }
+	};
+
+	for (int i = 0; i < 2; i++) {
+		Vec4S32 s = Vec4S32::Load(testval[i]);
+		Vec4S32 square = s * s;
+		Vec4S32 square16 = s.Mul16(s);
+		EXPECT_EQ_INT(square[0], square16[0]);
+		EXPECT_EQ_INT(square[1], square16[1]);
+		EXPECT_EQ_INT(square[2], square16[2]);
+		EXPECT_EQ_INT(square[3], square16[3]);
+	}
 	return true;
 }