diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index a8b68ba0c1..556d9e3b1a 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -121,12 +121,6 @@ struct Vec4S32 {
 	void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
 	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}
 
-	// Swaps the two lower elements. Useful for reversing triangles..
-	Vec4S32 SwapLowerElements() {
-		return Vec4S32{
-			_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
 	Vec4S32 SignBits32ToMask() {
 		return Vec4S32{
 			_mm_srai_epi32(v, 31)
@@ -137,13 +131,19 @@ struct Vec4S32 {
 	// On SSE2, much faster than _mm_mullo_epi32_SSE2.
 	// On NEON though, it'll read the full 32 bits, so beware.
 	// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
-	Vec4S32 MulAsS16(Vec4S32 other) const {
+	Vec4S32 Mul16(Vec4S32 other) const {
 		// Note that we only need to mask one of the inputs, so we get zeroes - multiplying
 		// by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is
 		// in the other register.
 		return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
 	}
 
+	Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }
+
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
@@ -152,6 +152,18 @@ struct Vec4S32 {
 	// TODO: andnot
 	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
+	void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
+	void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }
+
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; }  // NOTE: with _mm_andnot, the first parameter is inverted, and then AND is performed.
+	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }
+
+	// NOTE: May be slow.
+	int operator[](size_t index) const { return ((int *)&v)[index]; }
 
 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
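The Mul16 rename (from MulAsS16) makes the 16-bit contract easier to see, but the _mm_madd_epi16 trick stays subtle. A scalar model of what each 32-bit lane computes on the SSE2 path (the helper name is illustrative, not part of the patch):

    // madd multiplies pairs of signed 16-bit halves and sums them per 32-bit lane.
    // With the second operand's high halfwords masked to zero, the hi*hi term
    // vanishes, leaving the product of the sign-extended low halfwords.
    static int32_t Mul16Reference(int32_t a, int32_t b) {
        int32_t loA = (int16_t)(a & 0xFFFF);  // sign-extended low halfword
        int32_t loB = (int16_t)(b & 0xFFFF);
        return loA * loB;                     // equals lo(a)*lo(b) + hi(a)*0
    }

This only agrees with the NEON implementation (a plain 32-bit vmulq_s32) when both inputs already fit in int16, which is exactly what the new unit test at the bottom of this patch verifies.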
@@ -216,10 +228,14 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
 	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
 	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }
 
 	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
-	Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }
 
 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
@@ -238,13 +254,6 @@ struct Vec4F32 {
 		return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
 	}
 
-	// Swaps the two lower elements. Useful for reversing triangles..
-	Vec4F32 SwapLowerElements() {
-		return Vec4F32{
-			_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
-
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 		return Vec4F32{ _mm_add_ps(
 			_mm_add_ps(
@@ -261,6 +270,19 @@ struct Vec4F32 {
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
 		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
 	}
+
+	// This is here because ARM64 can do this very efficiently.
+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		col0.v = _mm_loadu_ps(src);
+		col1.v = _mm_loadu_ps(src + 4);
+		col2.v = _mm_loadu_ps(src + 8);
+		col3.v = _mm_loadu_ps(src + 12);
+		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
+	}
+
+	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
+	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
+	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }
 };
 
 inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
@@ -309,6 +331,12 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }
+
+	inline Vec4U16 AndNot(Vec4U16 inverted) {
+		return Vec4U16{
+			_mm_andnot_si128(inverted.v, v)  // NOTE: with _mm_andnot, the first parameter is inverted, and then AND is performed.
+		};
+	}
 };
 
 struct Vec8U16 {
@@ -328,12 +356,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	};
 }
 
-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{
-		_mm_andnot_si128(inverted.v, a.v)  // NOTE: with andnot, the first parameter is inverted, and then and is performed.
-	};
-}
-
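Splitting Recip into an exact Recip (a real divide) and RecipApprox (raw rcpps) makes the precision tradeoff explicit on x86; the NEON versions below refine a vrecpeq_f32 estimate instead. The refinement both paths rely on is plain Newton-Raphson, sketched here in scalar form (assuming the usual ~8-bit initial hardware estimate):

    // One Newton-Raphson step for 1/v: the error roughly squares per iteration.
    // On NEON, vrecpsq_f32(v, x) computes exactly this (2 - v*x) factor.
    static float RecipStep(float v, float x) {
        return x * (2.0f - v * x);
    }

Two steps take the ~8-bit estimate to near-full float precision, which is why the NEON Recip() below iterates twice while RecipApprox() iterates once to land roughly at rcpps precision.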
 #elif PPSSPP_ARCH(ARM_NEON)
 
 struct Mat4F32 {
@@ -443,16 +465,17 @@ struct Vec4S32 {
 	void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
 	void StoreAligned(int *dst) { vst1q_s32(dst, v); }
 
-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4S32 SwapLowerElements() {
-		int32x2_t upper = vget_high_s32(v);
-		int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
-		return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
-	};
-
 	// Warning: Unlike on x86, this is a full 32-bit multiplication.
-	Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+
+	Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }
+
+	// NOTE: May be slow.
+	int operator[](size_t index) const { return ((int *)&v)[index]; }
 
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
@@ -460,6 +483,12 @@ struct Vec4S32 {
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
 	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
 	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v)) }; }
+	Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }
 
 	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
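The 16-bit min/max trio has a cross-platform contract worth spelling out: on SSE2 the underlying ops really are 16-bit, so a chain of Min16/Max16 must end with FixupAfterMinMax() (a 16-to-32 sign extension) before the lanes can be read as full 32-bit values, while on NEON the ops are true 32-bit and the fixup compiles away. A usage sketch matching how DepthRaster.cpp below uses it (hypothetical helper name):

    // Clamp four 16-bit-range coordinates to [lo, hi], then restore valid s32 lanes.
    static Vec4S32 ClampCoords(Vec4S32 x, int lo, int hi) {
        return x.Max16(Vec4S32::Splat(lo)).Min16(Vec4S32::Splat(hi)).FixupAfterMinMax();
    }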
@@ -508,6 +537,9 @@ struct Vec4F32 {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}
 
+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }
+
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
@@ -517,19 +549,27 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
 	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
 	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
+	void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
 
 	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
 
-	Vec4F32 Recip() {
+	Vec4F32 Recip() const {
 		float32x4_t recip = vrecpeq_f32(v);
 		// Use a couple Newton-Raphson steps to refine the estimate.
-		// May be able to get away with only one refinement, not sure!
+		// To save one iteration at the expense of accuracy, use RecipApprox().
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		return Vec4F32{ recip };
 	}
 
+	Vec4F32 RecipApprox() const {
+		float32x4_t recip = vrecpeq_f32(v);
+		// To approximately match the precision of x86-64's rcpps, do a single iteration.
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		return Vec4F32{ recip };
+	}
+
 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
 			vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
@@ -544,12 +584,11 @@ struct Vec4F32 {
 		return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
 	}
 
-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4F32 SwapLowerElements() {
-		float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
-		return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) };
-	};
+	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
+	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
+	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
+	Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
+	Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }
 
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
@@ -573,6 +612,15 @@ struct Vec4F32 {
 #endif
 	}
 
+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		// The optimizer hopefully gets rid of the copies below.
+		float32x4x4_t r = vld4q_f32(src);
+		col0.v = r.val[0];
+		col1.v = r.val[1];
+		col2.v = r.val[2];
+		col3.v = r.val[3];
+	}
+
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 #if PPSSPP_ARCH(ARM64_NEON)
 		float32x4_t sum = vaddq_f32(
@@ -644,6 +692,8 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }
+
+	Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
 };
 
 inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
@@ -652,10 +702,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	return Vec4U16{ result };
 }
 
-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
-}
-
 struct Vec8U16 {
 	uint16x8_t v;
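The new LoadTranspose deserves a usage note before moving on: on ARM64 it maps to a single vld4q_f32 (load plus deinterleave in one instruction), while the SSE version above falls back to four loads plus _MM_TRANSPOSE4_PS. A sketch of the intended call pattern (variable names are illustrative):

    // Load a row-major 4x4 matrix and receive its columns as vectors.
    float m[16];  // filled in elsewhere
    Vec4F32 c0, c1, c2, c3;
    Vec4F32::LoadTranspose(m, c0, c1, c2, c3);
    // c0 now holds {m[0], m[4], m[8], m[12]}, and so on for c1..c3.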
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index cb63b89eef..82b18e558c 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -88,29 +88,29 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) {
 // May later figure out how to use the appropriate ones depending on compile flags.
 
 inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
-    __m128i a13 = _mm_shuffle_epi32(v0, 0xF5);            // (-,a3,-,a1)
-    __m128i b13 = _mm_shuffle_epi32(v1, 0xF5);            // (-,b3,-,b1)
-    __m128i prod02 = _mm_mul_epu32(v0, v1);               // (-,a2*b2,-,a0*b0)
-    __m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
-    __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
-    __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
-    return _mm_unpacklo_epi64(prod01, prod23);
+	__m128i a13 = _mm_shuffle_epi32(v0, 0xF5);            // (-,a3,-,a1)
+	__m128i b13 = _mm_shuffle_epi32(v1, 0xF5);            // (-,b3,-,b1)
+	__m128i prod02 = _mm_mul_epu32(v0, v1);               // (-,a2*b2,-,a0*b0)
+	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
+	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
+	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
+	return _mm_unpacklo_epi64(prod01, prod23);
 }
 
 inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
-    return _mm_xor_si128(
-        _mm_max_epi16(
-            _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
-            _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
-        _mm_set1_epi16((int16_t)0x8000));
+	return _mm_xor_si128(
+		_mm_max_epi16(
+			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
+			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
+		_mm_set1_epi16((int16_t)0x8000));
 }
 
 inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
-    return _mm_xor_si128(
-        _mm_min_epi16(
-            _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
-            _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
-        _mm_set1_epi16((int16_t)0x8000));
+	return _mm_xor_si128(
+		_mm_min_epi16(
+			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
+			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
+		_mm_set1_epi16((int16_t)0x8000));
 }
 
 // SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.
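Before the rewritten rasterizer below, it helps to restate the scalar math it vectorizes. This mirrors the deleted Edge struct and follows ryg's rasterizer series, which the new comment cites (the function here is a reference sketch, not part of the patch):

    // Edge function for the directed edge v0->v1, evaluated at pixel (px, py):
    //   E(px, py) = A*px + B*py + C
    // with A = v0y - v1y, B = v1x - v0x, C = v0x*v1y - v0y*v1x.
    // For the winding convention used here, E >= 0 on the interior side, and
    // stepping one pixel right/down adds A/B, so the inner loop is all adds.
    static int EdgeFunction(int v0x, int v0y, int v1x, int v1y, int px, int py) {
        return (v0y - v1y) * px + (v1x - v0x) * py + (v0x * v1y - v0y * v1x);
    }

The new code evaluates three of these (the A12/B12/C12 families) for four triangles at once, then splats per-triangle values back out to 4x1 pixel tiles.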
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index b464e3012b..8bf6e2586c 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -85,36 +85,15 @@ static void DepthRasterRect(uint16_t *dest, int stride, const DepthScissor sciss
 	}
 }
 
-alignas(16) static const int zero123[4] = {0, 1, 2, 3};
+alignas(16) static const int zero123[4] = { 0, 1, 2, 3 };
 
-struct Edge {
-	// Dimensions of our pixel group
-	static const int stepXSize = 4;
-	static const int stepYSize = 1;
+constexpr int stepXSize = 4;
+constexpr int stepYSize = 1;
 
-	Vec4S32 oneStepX;
-	Vec4S32 oneStepY;
+constexpr int stepXShift = 2;
+constexpr int stepYShift = 0;
 
-	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {
-		// Edge setup
-		int A = v0y - v1y;
-		int B = v1x - v0x;
-		int C = v0x * v1y - v0y * v1x;
-
-		// Step deltas
-		oneStepX = Vec4S32::Splat(A * stepXSize);
-		oneStepY = Vec4S32::Splat(B * stepYSize);
-
-		// x/y values for initial pixel block. Add horizontal offsets.
-		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
-		Vec4S32 y = Vec4S32::Splat(p0y);
-
-		// Edge function values at origin
-		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
-	}
-};
-
-enum class TriangleResult {
+enum class TriangleStat {
 	OK,
 	NoPixels,
 	SmallOrBackface,
@@ -122,109 +101,154 @@
 constexpr int MIN_TWICE_TRI_AREA = 10;
 
-// Adapted from Intel's depth rasterizer example.
-// Started with the scalar version, will SIMD-ify later.
-// x1/y1 etc are the scissor rect.
+// A mix of ideas from Intel's sample and ryg's rasterizer blog series.
 template <ZCompareMode compareMode>
-TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
-	// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
-	// are slow on SSE2.
+void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
+	// Triangle setup. This is done using SIMD, four triangles at a time.
+	// 16x16->32 multiplications are doable on SSE2, which should be all we need.
+
+	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.
 	// NOTE: Triangles are stored in groups of 4.
-	int v0x = tx[0];
-	int v0y = ty[0];
-	int v1x = tx[4];
-	int v1y = ty[4];
-	int v2x = tx[8];
-	int v2y = ty[8];
+	Vec4S32 x0 = Vec4S32::LoadAligned(tx);
+	Vec4S32 y0 = Vec4S32::LoadAligned(ty);
+	Vec4S32 x1 = Vec4S32::LoadAligned(tx + 4);
+	Vec4S32 y1 = Vec4S32::LoadAligned(ty + 4);
+	Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8);
+	Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8);
 
-	// use fixed-point only for X and Y. Avoid work for Z and W.
-	// We use 4x1 tiles for simplicity.
-	int minX = std::max(std::min(std::min(v0x, v1x), v2x), (int)scissor.x1) & ~3;
-	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, (int)scissor.x2) & ~3;
-	int minY = std::max(std::min(std::min(v0y, v1y), v2y), (int)scissor.y1);
-	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
-	if (maxX == minX || maxY == minY) {
-		// No pixels, or outside screen.
-		// Most of these are now gone in the initial pass.
-		return TriangleResult::NoPixels;
-	}
+	// FixupAfterMinMax is just 16->32 sign extension, in case the current platform (like SSE2) just has 16-bit min/max operations.
+	Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax();
+	Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax();
+	Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax();
+	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();
 
-	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
-	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
-	if (triArea < MIN_TWICE_TRI_AREA) {
-		return TriangleResult::SmallOrBackface;  // Or zero area.
-	}
+	Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0);
 
-	float oneOverTriArea = 1.0f / (float)triArea;
+	// Edge setup
+	Vec4S32 A12 = y1 - y2;
+	Vec4S32 B12 = x2 - x1;
+	Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2);
 
-	Edge e01, e12, e20;
+	Vec4S32 A20 = y2 - y0;
+	Vec4S32 B20 = x0 - x2;
+	Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0);
 
-	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
-	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
-	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
+	Vec4S32 A01 = y0 - y1;
+	Vec4S32 B01 = x1 - x0;
+	Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1);
+
+	// Step deltas
+	Vec4S32 stepX12 = A12.Shl<stepXShift>();
+	Vec4S32 stepY12 = B12.Shl<stepYShift>();
+	Vec4S32 stepX20 = A20.Shl<stepXShift>();
+	Vec4S32 stepY20 = B20.Shl<stepYShift>();
+	Vec4S32 stepX01 = A01.Shl<stepXShift>();
+	Vec4S32 stepY01 = B01.Shl<stepYShift>();
 
 	// Prepare to interpolate Z
-	Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
-	Vec4F32 zz1 = Vec4F32::Splat((tz[4] - tz[0]) * oneOverTriArea);
-	Vec4F32 zz2 = Vec4F32::Splat((tz[8] - tz[0]) * oneOverTriArea);
+	Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip();
+	Vec4F32 zbase = Vec4F32::LoadAligned(tz);
+	Vec4F32 z_20 = (Vec4F32::LoadAligned(tz + 4) - zbase) * oneOverTriArea;
+	Vec4F32 z_01 = (Vec4F32::LoadAligned(tz + 8) - zbase) * oneOverTriArea;
+	Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01);
+	Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01);
 
-	Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(e20.oneStepX) + zz2 * Vec4F32FromS32(e01.oneStepX);
-	Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(e20.oneStepY) + zz2 * Vec4F32FromS32(e01.oneStepY);
-	Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;
+	// Shared setup is done, now loop per-triangle in the group of four.
+	for (int t = 0; t < 4; t++) {
+		// Check for bad triangle.
+		// Using operator[] on the vectors actually seems to result in pretty good code.
+		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
+			// No pixels, or outside screen.
+			// Most of these are now gone in the initial pass, but not all since we cull
+			// in 4-groups there.
+			stats[(int)TriangleStat::NoPixels]++;
+			continue;
+		}
 
-	// Rasterize
-	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
-		// Barycentric coordinates at start of row
-		Vec4S32 w0 = w0_row;
-		Vec4S32 w1 = w1_row;
-		Vec4S32 w2 = w2_row;
-		Vec4F32 zs = zrow;
+		if (triArea[t] < MIN_TWICE_TRI_AREA) {
+			stats[(int)TriangleStat::SmallOrBackface]++;  // Or zero area.
+			continue;
+		}
 
-		uint16_t *rowPtr = depthBuf + stride * y;
+		const int minXT = minX[t] & ~3;
+		const int maxXT = maxX[t] & ~3;
 
-		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
-			// If p is on or inside all edges for any pixels,
-			// render those pixels.
-			Vec4S32 signCalc = w0 | w1 | w2;
-			if (!AnyZeroSignBit(signCalc)) {
-				continue;
-			}
+		const int minYT = minY[t];
+		const int maxYT = maxY[t];
 
-			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
-			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
-			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+		// Convert to wide registers.
+		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
+		int initialY = minY[t];
+		_dbg_assert_(A12[t] < 32767);
+		_dbg_assert_(A12[t] > -32767);
+		_dbg_assert_(A20[t] < 32767);
+		_dbg_assert_(A20[t] > -32767);
+		_dbg_assert_(A01[t] < 32767);
+		_dbg_assert_(A01[t] > -32767);
 
-			Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+		// TODO: The latter subexpression can be broken out of this loop, but reduces block size flexibility.
+		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
+		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
+		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);
 
-			// This switch is on a templated constant, so should collapse away.
-			switch (compareMode) {
-			case ZCompareMode::Greater:
-				// To implement the greater/greater-than comparison, we can combine mask and max.
-				// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
-				// We use AndNot to zero out Z results, before doing Max with the buffer.
-				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Less:  // UNTESTED
-				// This time, we OR the mask and use .Min.
-				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Always:  // UNTESTED
-				// This could be replaced with a vblend operation.
-				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
-				break;
+		Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t];
+		Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]);
+		Vec4F32 zdeltaY = Vec4F32::Splat(zdy[t]);
+
+		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12[t]);
+		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12[t]);
+		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20[t]);
+		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20[t]);
+		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01[t]);
+		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01[t]);
+
+		// Rasterize
+		for (int y = minYT; y <= maxYT; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
+			// Barycentric coordinates at start of row
+			Vec4S32 w0 = w0_row;
+			Vec4S32 w1 = w1_row;
+			Vec4S32 w2 = w2_row;
+			Vec4F32 zs = zrow;
+
+			uint16_t *rowPtr = depthBuf + stride * y;
+
+			for (int x = minXT; x <= maxXT; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
+				// If p is on or inside all edges for any pixels,
+				// render those pixels.
+				Vec4S32 signCalc = w0 | w1 | w2;
+
+				// TODO: Check if this check is profitable. Maybe only for big triangles?
+				if (!AnyZeroSignBit(signCalc)) {
+					continue;
+				}
+
+				Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+				Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+				// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+				Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+
+				// This switch is on a templated constant, so should collapse away.
+				switch (compareMode) {
+				case ZCompareMode::Greater:
+					// To implement the greater/greater-than comparison, we can combine mask and max.
+					// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
+					// We use AndNot to zero out Z results, before doing Max with the buffer.
+					shortZ.AndNot(shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Less:
+					// This time, we OR the mask and use .Min.
+					(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Always:  // UNTESTED
+					// This could be replaced with a vblend operation.
+					((bufferValues & shortMaskInv) | shortZ.AndNot(shortMaskInv)).Store(rowPtr + x);
+					break;
+				}
 			}
 		}
-	}
-	return TriangleResult::OK;
-}
 
-template <ZCompareMode compareMode>
-inline void DepthRaster4Triangles(int stats[4], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	for (int i = 0; i < 4; i++) {
-		TriangleResult result = DepthRasterTriangle<compareMode>(depthBuf, stride, scissor, tx + i, ty + i, tz + i);
-		stats[(int)result]++;
+		stats[(int)TriangleStat::OK]++;
 	}
 }
@@ -343,7 +367,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	}
 
 	const bool cullEnabled = draw.cullEnabled;
-	static const float zerovec[4] = {};
+	static const float zerovec[4] = {0.0f, 0.0f, 0.0f, 1.0f};
 
 	int collected = 0;
 	int planeCulled = 0;
@@ -351,18 +375,36 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	const float *verts[12];  // four triangles at a time!
 	const int count = draw.vertexCount;
 
-	Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1);
-	Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1);
-	Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2);
-	Vec4F32 scissorY2 = Vec4F32::Splat((float)scissor.y2);
+	// Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster.
+	// This is slightly off-center since we are already in screen space, but whatever.
+	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096);
+	Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096);
+
+	Vec4S32 scissorX1 = Vec4S32::Splat(scissor.x1);
+	Vec4S32 scissorY1 = Vec4S32::Splat(scissor.y1);
+	Vec4S32 scissorX2 = Vec4S32::Splat(scissor.x2);
+	Vec4S32 scissorY2 = Vec4S32::Splat(scissor.y2);
+
+	// Add cheap pre-projection pre-checks for bad triangle here. Not much we can do safely other than checking W.
+	auto validVert = [](const float *v) -> bool {
+		if (v[3] <= 0.0f || v[2] <= 0.0f) {
+			return false;
+		}
+		/*
+		if (v[2] >= 65535.0f * v[3]) {
+			return false;
+		}*/
+		return true;
+	};
 
 	for (int i = 0; i < count; i += 3) {
 		// Collect valid triangles into buffer.
 		const float *v0 = transformed + indexBuffer[i] * 4;
 		const float *v1 = transformed + indexBuffer[i + (1 ^ flipCull)] * 4;
 		const float *v2 = transformed + indexBuffer[i + (2 ^ flipCull)] * 4;
-		// Don't collect triangle if any vertex is behind the 0 plane.
-		if (v0[3] > 0.0f && v1[3] > 0.0f && v2[3] > 0.0f) {
+		// Don't collect triangle if any vertex is beyond the planes.
+		// TODO: Optimize this somehow.
+		if (validVert(v0) && validVert(v1) && validVert(v2)) {
 			verts[collected] = v0;
 			verts[collected + 1] = v1;
 			verts[collected + 2] = v2;
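A quick worked bound on why the +/-4096 guardband keeps the fixed-point setup safe (my arithmetic, not from the patch): with |x|, |y| <= 4096, the edge coefficients satisfy |A|, |B| <= 8192, so they fit in int16 as the _dbg_assert_s in the raster loop require, and the per-pixel edge accumulator stays well inside int32:

    // |A*x| and |B*y| are each at most 8192 * 4096 = 2^25, and
    // |C| = |x1*y2 - y1*x2| is at most 2 * 4096 * 4096 = 2^25.
    constexpr long long kMaxEdgeValue = (1LL << 25) + (1LL << 25) + (1LL << 25);
    static_assert(kMaxEdgeValue < (1LL << 31), "edge accumulator fits in int32");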
@@ -380,6 +422,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		}
 
 		if (collected != 12) {
+			// Fetch more!
 			continue;
 		}
@@ -411,38 +454,53 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		Vec4F32 recipW2 = w2.Recip();
 
 		x0 *= recipW0;
 		y0 *= recipW0;
-		z0 = (z0 * recipW0).Clamp(0.0f, 65535.0f);
+		z0 *= recipW0;
 		x1 *= recipW1;
 		y1 *= recipW1;
-		z1 = (z1 * recipW1).Clamp(0.0f, 65535.0f);
+		z1 *= recipW1;
 		x2 *= recipW2;
 		y2 *= recipW2;
-		z2 = (z2 * recipW2).Clamp(0.0f, 65535.0f);
+		z2 *= recipW2;
 
-		// Check bounding box size (clamped to screen edges). Cast to integer for crude rounding (and to match the rasterizer).
-		Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)).Max(scissorX1));
-		Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)).Max(scissorY1));
-		Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)).Min(scissorX2));
-		Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)).Min(scissorY2));
+		// Check bounding box size. Cast to integer for crude rounding (and to approximately match the rasterizer).
+		Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)));
+		Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)));
+		Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)));
+		Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)));
 
-		// If all are equal in any dimension, all four triangles are tiny nonsense (or outside the scissor) and can be skipped early.
+		// If all are equal in any dimension, all four triangles are tiny nonsense and can be skipped early.
 		Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);
-		// Otherwise we just proceed to triangle setup with all four for now. Later might want to
-		// compact the remaining triangles... Or do more checking here.
+
+		// Otherwise we just proceed to triangle setup with all four for now.
 		// We could also save the computed boxes for later..
+		// TODO: Merge into below checks? Though nice with an early out.
 		if (!AnyZeroSignBit(eqMask)) {
 			boxCulled += 4;
 			continue;
 		}
 
-		// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
+		// Create a mask to kill coordinates of triangles that poke outside the guardband (or are just empty).
+		Vec4S32 inGuardBand =
+			((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
+			 (minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask);
+
+		// Create another mask to kill off-screen triangles. Not perfectly accurate.
+		inGuardBand &= (maxX.CompareGt(scissorX1) & minX.CompareLt(scissorX2)) & (maxY.CompareGt(scissorY1) & minY.CompareLt(scissorY2));
+
+		// It's enough to smash one coordinate to make future checks (like the tri area check) fail.
+		x0 &= inGuardBand;
+		x1 &= inGuardBand;
+		x2 &= inGuardBand;
+
+		// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
 		// Still good for culling early and pretty cheap to compute.
-		Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
-		if (!AnyZeroSignBit(triArea)) {
+		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA));
+		if (!AnyZeroSignBit(doubleTriArea)) {
 			gpuStats.numDepthRasterEarlySize += 4;
 			continue;
 		}
 
+		// Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected.
 		Vec4S32FromF32(x0).Store(tx + outCount);
 		Vec4S32FromF32(x1).Store(tx + outCount + 4);
 		Vec4S32FromF32(x2).Store(tx + outCount + 8);
@@ -453,13 +511,22 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		z1.Store(tz + outCount + 4);
 		z2.Store(tz + outCount + 8);
 
+#ifdef _DEBUG
+		for (int i = 0; i < 12; i++) {
+			_dbg_assert_(tx[outCount + i] < 32767);
+			_dbg_assert_(tx[outCount + i] >= -32768);
+			_dbg_assert_(ty[outCount + i] < 32767);
+			_dbg_assert_(ty[outCount + i] >= -32768);
+		}
+#endif
+
 		outCount += 12;
 
 		if (!cullEnabled) {
-			// If culling is off, store the triangles again, in the opposite order.
-			Vec4S32FromF32(x0).Store(tx + outCount);
-			Vec4S32FromF32(x2).Store(tx + outCount + 4);
-			Vec4S32FromF32(x1).Store(tx + outCount + 8);
+			// If culling is off, store the triangles again, with the first two vertices swapped.
+			(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
+			(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4);
+			(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8);
 			Vec4S32FromF32(y0).Store(ty + outCount);
 			Vec4S32FromF32(y2).Store(ty + outCount + 4);
 			Vec4S32FromF32(y1).Store(ty + outCount + 8);
@@ -514,9 +581,9 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 				}
 			}
 		}
-		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
-		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
-		gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
+		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleStat::NoPixels];
+		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleStat::SmallOrBackface];
+		gpuStats.numDepthRasterPrims += stats[(int)TriangleStat::OK];
 		break;
 	}
 	default:
diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp
index a087d205b9..475b785abd 100644
--- a/unittest/UnitTest.cpp
+++ b/unittest/UnitTest.cpp
@@ -56,6 +56,7 @@
 #include "Common/Buffer.h"
 #include "Common/File/Path.h"
 #include "Common/Math/SIMDHeaders.h"
+#include "Common/Math/CrossSIMD.h"
 
 // Get some more instructions for testing
 #if PPSSPP_ARCH(SSE2)
 #include <smmintrin.h>
@@ -1048,7 +1049,7 @@ CharQueue GetQueue() {
 
 bool TestCharQueue() {
 	// We use a tiny block size for testing.
-	CharQueue queue = std::move(GetQueue());
+	CharQueue queue = GetQueue();
 
 	// Add 16 chars.
 	queue.push_back("abcdefghijkl");
@@ -1124,6 +1125,21 @@ bool TestSIMD() {
 	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 #endif
+
+	const int testval[2][4] = {
+		{ 0x1000, 0x2000, 0x3000, 0x7000 },
+		{ -0x1000, -0x2000, -0x3000, -0x7000 }
+	};
+
+	for (int i = 0; i < 2; i++) {
+		Vec4S32 s = Vec4S32::Load(testval[i]);
+		Vec4S32 square = s * s;
+		Vec4S32 square16 = s.Mul16(s);
+		EXPECT_EQ_INT(square[0], square16[0]);
+		EXPECT_EQ_INT(square[1], square16[1]);
+		EXPECT_EQ_INT(square[2], square16[2]);
+		EXPECT_EQ_INT(square[3], square16[3]);
+	}
 	return true;
 }
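One note on the new TestSIMD coverage: the values are chosen so that both multiply paths must agree. Each lane fits in int16 (|v| <= 0x7000 < 0x8000), so Mul16's product-of-low-halfwords equals the full 32-bit multiply, and even the largest square stays inside a signed 32-bit lane. A quick check of that headroom (my verification, not part of the patch):

    static_assert(0x7000 * 0x7000 == 0x31000000, "squares of the test values stay within int32");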