Merge pull request #19786 from hrydgard/even-more-depth-work
DepthRaster: Better guardband rejection, parallelize triangle setup
This commit is contained in: commit bbc933b1b1
4 changed files with 327 additions and 198 deletions
@@ -121,12 +121,6 @@ struct Vec4S32 {
 	void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
 	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v); }

-	// Swaps the two lower elements. Useful for reversing triangles.
-	Vec4S32 SwapLowerElements() {
-		return Vec4S32{
-			_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
 	Vec4S32 SignBits32ToMask() {
 		return Vec4S32{
 			_mm_srai_epi32(v, 31)
@@ -137,13 +131,19 @@ struct Vec4S32 {
 	// On SSE2, much faster than _mm_mullo_epi32_SSE2.
 	// On NEON though, it'll read the full 32 bits, so beware.
 	// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
-	Vec4S32 MulAsS16(Vec4S32 other) const {
+	Vec4S32 Mul16(Vec4S32 other) const {
 		// Note that we only need to mask one of the inputs, so we get zeroes - multiplying
 		// by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is
 		// in the other register.
 		return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
 	}

+	Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }
+
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
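A side note on Mul16 above, since the new triangle setup relies on it heavily: per 32-bit lane, the madd-based SSE2 path multiplies only the two low halfwords. A scalar model of one lane (a standalone sketch, not part of this diff):

#include <cassert>
#include <cstdint>

// Scalar model of one 32-bit lane of Mul16 on SSE2: the second operand is
// masked to its low 16 bits, so madd's pairwise sum is lo(a)*lo(b) + hi(a)*0,
// i.e. the signed product of the two low halfwords.
int32_t Mul16Lane(int32_t a, int32_t b) {
	int16_t aLo = (int16_t)(a & 0xFFFF);  // low halfword, sign-interpreted
	int16_t bLo = (int16_t)(b & 0xFFFF);
	return (int32_t)aLo * (int32_t)bLo;   // hi(a) * 0 contributes nothing
}

int main() {
	// Matches a full 32-bit multiply while both inputs are in [-32768, 32767]:
	assert(Mul16Lane(-4096, -4096) == -4096 * -4096);
	// But silently differs once an input needs more than 16 bits:
	assert(Mul16Lane(0x12345, 3) != 0x12345 * 3);
	return 0;
}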
@@ -152,6 +152,18 @@ struct Vec4S32 {
 	// TODO: andnot
+	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
+	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
+	void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
+	void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }
+
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; }  // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
+	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }
+
 	// NOTE: May be slow.
 	int operator[](size_t index) const { return ((int *)&v)[index]; }

 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
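Similarly, Min16/Max16 added above reinterpret each 32-bit lane as two independent 16-bit halves on SSE2, which is why FixupAfterMinMax (a 16->32 sign extension) is needed before the results are used as plain 32-bit integers. A scalar model of one lane (a standalone sketch, not part of this diff):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one 32-bit lane of Min16 on SSE2: _mm_min_epi16 sees the
// lane as two signed 16-bit halves, min'd separately, so the halves of the
// result can come from different inputs.
int32_t Min16Lane(int32_t a, int32_t b) {
	int16_t lo = std::min((int16_t)(a & 0xFFFF), (int16_t)(b & 0xFFFF));
	int16_t hi = std::min((int16_t)((uint32_t)a >> 16), (int16_t)((uint32_t)b >> 16));
	return ((int32_t)(uint16_t)lo) | ((int32_t)hi << 16);
}

// FixupAfterMinMax: re-sign-extend the low half, discarding the high half.
int32_t Fixup(int32_t v) { return (int32_t)(int16_t)(v & 0xFFFF); }

int main() {
	// For sign-extended 16-bit inputs the halves agree and the result is exact:
	assert(Min16Lane(-3, 5) == -3);
	// An input with a nonzero high halfword mixes halves (0x0001FFFF vs 5 gives
	// 0x0000FFFF, which is neither input); the fixup collapses it back into a
	// well-formed sign-extended 16-bit value:
	assert(Fixup(Min16Lane(0x0001FFFF, 5)) == -1);
	return 0;
}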
@@ -216,10 +228,14 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
 	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
 	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }

 	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
-	Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }

 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
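The Recip/RecipApprox split above trades accuracy for speed: _mm_rcp_ps is only a roughly 12-bit estimate, while the division-based Recip is correctly rounded. A quick standalone comparison (a sketch, not part of this diff):

#include <cstdio>
#include <xmmintrin.h>

int main() {
	float in[4] = { 3.0f, 7.0f, 100.0f, 0.1f };
	float approx[4], exact[4];
	// RecipApprox path: the hardware estimate.
	_mm_storeu_ps(approx, _mm_rcp_ps(_mm_loadu_ps(in)));
	// Recip path: full-precision division.
	_mm_storeu_ps(exact, _mm_div_ps(_mm_set1_ps(1.0f), _mm_loadu_ps(in)));
	for (int i = 0; i < 4; i++) {
		printf("1/%g: approx=%.9g exact=%.9g relerr=%.3g\n",
		       in[i], approx[i], exact[i], (approx[i] - exact[i]) / exact[i]);
	}
	return 0;
}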
@@ -238,13 +254,6 @@ struct Vec4F32 {
 		return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
 	}

-	// Swaps the two lower elements. Useful for reversing triangles.
-	Vec4F32 SwapLowerElements() {
-		return Vec4F32{
-			_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
-
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 		return Vec4F32{ _mm_add_ps(
 			_mm_add_ps(
@@ -261,6 +270,19 @@ struct Vec4F32 {
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
 		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
 	}

+	// This is here because ARM64 can do this very efficiently.
+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		col0.v = _mm_loadu_ps(src);
+		col1.v = _mm_loadu_ps(src + 4);
+		col2.v = _mm_loadu_ps(src + 8);
+		col3.v = _mm_loadu_ps(src + 12);
+		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
+	}
+
+	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
+	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
+	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }
 };

 inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
@@ -309,6 +331,12 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }
+
+	inline Vec4U16 AndNot(Vec4U16 inverted) {
+		return Vec4U16{
+			_mm_andnot_si128(inverted.v, v)  // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
+		};
+	}
 };

 struct Vec8U16 {
@@ -328,12 +356,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	};
 }

-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{
-		_mm_andnot_si128(inverted.v, a.v)  // NOTE: with andnot, the first parameter is inverted, and then and is performed.
-	};
-}
-
 #elif PPSSPP_ARCH(ARM_NEON)

 struct Mat4F32 {
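Both the SSE2 and NEON wrappers carry the same note because _mm_andnot_si128(a, b) computes ~a & b: the first operand is the one that gets inverted. The member form flips this into the more readable v.AndNot(inverted). A scalar model of the convention (a standalone sketch, not part of this diff):

#include <cassert>
#include <cstdint>

// What Vec4S32::AndNot / Vec4U16::AndNot compute per lane: keep bits of v
// wherever the mask is 0, clear them wherever the mask is 1.
uint32_t AndNotLane(uint32_t v, uint32_t inverted) {
	return v & ~inverted;
}

int main() {
	assert(AndNotLane(0xDEADBEEF, 0x00000000) == 0xDEADBEEF);  // mask clear: pass-through
	assert(AndNotLane(0xDEADBEEF, 0xFFFF0000) == 0x0000BEEF);  // mask set: bits zeroed
	return 0;
}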
@@ -443,16 +465,17 @@ struct Vec4S32 {
 	void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
 	void StoreAligned(int *dst) { vst1q_s32(dst, v); }

-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4S32 SwapLowerElements() {
-		int32x2_t upper = vget_high_s32(v);
-		int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
-		return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
-	};
-
 	// Warning: Unlike on x86, this is a full 32-bit multiplication.
-	Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+
+	Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }
+
+	// NOTE: May be slow.
+	int operator[](size_t index) const { return ((int *)&v)[index]; }

 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
@@ -460,6 +483,12 @@ struct Vec4S32 {
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
 	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
 	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v)) }; }
+	Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }

 	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
@@ -508,6 +537,9 @@ struct Vec4F32 {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}

+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }
+
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
@@ -517,19 +549,27 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
 	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
 	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
+	void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

 	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

-	Vec4F32 Recip() {
+	Vec4F32 Recip() const {
 		float32x4_t recip = vrecpeq_f32(v);
 		// Use a couple Newton-Raphson steps to refine the estimate.
-		// May be able to get away with only one refinement, not sure!
+		// To save one iteration at the expense of accuracy, use RecipApprox().
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		return Vec4F32{ recip };
 	}

+	Vec4F32 RecipApprox() const {
+		float32x4_t recip = vrecpeq_f32(v);
+		// To approximately match the precision of x86-64's rcpps, do a single iteration.
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		return Vec4F32{ recip };
+	}
+
 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
 			vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
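The vrecpsq_f32/vmulq_f32 pairs above are Newton-Raphson steps: vrecpsq_f32(v, r) yields 2 - v*r, so each pair computes r' = r * (2 - v*r), roughly doubling the number of correct bits per step. A scalar model (a standalone sketch; the initial estimate here is made up, standing in for vrecpeq_f32):

#include <cstdio>

int main() {
	float v = 3.0f;
	float r = 0.333f;  // stand-in for the coarse vrecpeq_f32 estimate
	for (int step = 0; step < 2; step++) {
		r = r * (2.0f - v * r);  // one vrecpsq_f32 + vmulq_f32 per step
		printf("after step %d: r = %.9g (true 1/v = %.9g)\n", step + 1, r, 1.0f / v);
	}
	return 0;
}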
@@ -544,12 +584,11 @@ struct Vec4F32 {
 		return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
 	}

-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4F32 SwapLowerElements() {
-		float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
-		return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) };
-	};
 	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
 	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
 	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
 	Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
 	Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }

 	// One of many possible solutions. Sometimes we could also use vld4q_f32, probably.
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
@@ -573,6 +612,15 @@ struct Vec4F32 {
 #endif
 	}

+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		// The optimizer hopefully gets rid of the copies below.
+		float32x4x4_t r = vld4q_f32(src);
+		col0.v = r.val[0];
+		col1.v = r.val[1];
+		col2.v = r.val[2];
+		col3.v = r.val[3];
+	}
+
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 #if PPSSPP_ARCH(ARM64_NEON)
 		float32x4_t sum = vaddq_f32(
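vld4q_f32 is a 4-way deinterleaving load: element j of column i comes from src[j * 4 + i], so loading a row-major 4x4 matrix this way yields its columns directly, which is why ARM64 gets LoadTranspose nearly for free. A portable scalar model of the access pattern (a standalone sketch, not part of this diff):

#include <cassert>

int main() {
	float src[16];
	for (int i = 0; i < 16; i++) src[i] = (float)i;  // row-major 0..15

	float col[4][4];
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			col[i][j] = src[j * 4 + i];  // the vld4q_f32 access pattern

	// Column 1 of the matrix {0..15} is {1, 5, 9, 13}:
	assert(col[1][0] == 1.0f && col[1][1] == 5.0f && col[1][2] == 9.0f && col[1][3] == 13.0f);
	return 0;
}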
@@ -644,6 +692,8 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }
+	Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
 };

 inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
@@ -652,10 +702,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	return Vec4U16{ result };
 }

-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
-}
-
 struct Vec8U16 {
 	uint16x8_t v;
@@ -88,29 +88,29 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) {
 // May later figure out how to use the appropriate ones depending on compile flags.

 inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
 	__m128i a13 = _mm_shuffle_epi32(v0, 0xF5);            // (-,a3,-,a1)
 	__m128i b13 = _mm_shuffle_epi32(v1, 0xF5);            // (-,b3,-,b1)
 	__m128i prod02 = _mm_mul_epu32(v0, v1);               // (-,a2*b2,-,a0*b0)
 	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
 	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
 	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
 	return _mm_unpacklo_epi64(prod01, prod23);
 }

 inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_xor_si128(
 		_mm_max_epi16(
 			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
 			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
 		_mm_set1_epi16((int16_t)0x8000));
 }

 inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_xor_si128(
 		_mm_min_epi16(
 			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
 			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
 		_mm_set1_epi16((int16_t)0x8000));
 }

 // SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.
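The 0x8000 xor in these helpers is the classic bias trick: SSE2 only has signed 16-bit min/max, but flipping the top bit maps unsigned order onto signed order (0 becomes -32768, 65535 becomes 32767), so the operation can run in the biased domain and be unbiased afterwards. A scalar model of the max variant (a standalone sketch, not part of this diff):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Unsigned 16-bit max built from a signed max, mirroring _mm_max_epu16_SSE2.
uint16_t MaxU16ViaS16(uint16_t a, uint16_t b) {
	int16_t sa = (int16_t)(a ^ 0x8000);  // bias into signed order
	int16_t sb = (int16_t)(b ^ 0x8000);
	return (uint16_t)(std::max(sa, sb) ^ 0x8000);  // unbias the result
}

int main() {
	// A plain signed max would get this wrong, since 0xFFFF reads as -1:
	assert(MaxU16ViaS16(0xFFFF, 1) == 0xFFFF);
	assert(MaxU16ViaS16(0x7FFF, 0x8000) == 0x8000);
	return 0;
}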
@@ -85,36 +85,15 @@ static void DepthRasterRect(uint16_t *dest, int stride, const DepthScissor sciss
 	}
 }

 alignas(16) static const int zero123[4] = {0, 1, 2, 3};

-struct Edge {
-	// Dimensions of our pixel group
-	static const int stepXSize = 4;
-	static const int stepYSize = 1;
-
-	Vec4S32 oneStepX;
-	Vec4S32 oneStepY;
-
-	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {
-		// Edge setup
-		int A = v0y - v1y;
-		int B = v1x - v0x;
-		int C = v0x * v1y - v0y * v1x;
-
-		// Step deltas
-		oneStepX = Vec4S32::Splat(A * stepXSize);
-		oneStepY = Vec4S32::Splat(B * stepYSize);
-
-		// x/y values for initial pixel block. Add horizontal offsets.
-		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
-		Vec4S32 y = Vec4S32::Splat(p0y);
-
-		// Edge function values at origin
-		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
-	}
-};
+// Dimensions of our pixel group
+constexpr int stepXSize = 4;
+constexpr int stepYSize = 1;
+
+constexpr int stepXShift = 2;
+constexpr int stepYShift = 0;

-enum class TriangleResult {
+enum class TriangleStat {
 	OK,
 	NoPixels,
 	SmallOrBackface,
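For background on the rasterizer below: it is built on edge functions, following the ryg blog series the code itself references. For an edge from (ax, ay) to (bx, by), E(x, y) = (ay - by) * x + (bx - ax) * y + (ax * by - ay * bx) is zero on the edge and has opposite signs on its two sides; a point is inside the triangle when all three edge functions agree in sign, and stepping one pixel in x or y just adds the A or B coefficient, which is exactly what the step deltas precompute. A standalone sketch (not part of this diff):

#include <cstdio>

int main() {
	int ax = 0, ay = 0, bx = 10, by = 0;  // edge along the x axis
	int A = ay - by, B = bx - ax, C = ax * by - ay * bx;
	auto E = [&](int x, int y) { return A * x + B * y + C; };
	printf("E(5, 3)  = %d (one side of the edge)\n", E(5, 3));    // 30
	printf("E(5, -3) = %d (the other side)\n", E(5, -3));         // -30
	printf("E(7, 0)  = %d (on the edge)\n", E(7, 0));             // 0
	return 0;
}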
@@ -122,109 +101,154 @@ enum class TriangleResult {

 constexpr int MIN_TWICE_TRI_AREA = 10;

-// Adapted from Intel's depth rasterizer example.
-// Started with the scalar version, will SIMD-ify later.
-// x1/y1 etc are the scissor rect.
+// A mix of ideas from Intel's sample and ryg's rasterizer blog series.
 template<ZCompareMode compareMode>
-TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
-	// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
-	// are slow on SSE2.
+void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
+	// Triangle setup. This is done using SIMD, four triangles at a time.
+	// 16x16->32 multiplications are doable on SSE2, which should be all we need.
+
+	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.
+
+	// NOTE: Triangles are stored in groups of 4.
-	int v0x = tx[0];
-	int v0y = ty[0];
-	int v1x = tx[4];
-	int v1y = ty[4];
-	int v2x = tx[8];
-	int v2y = ty[8];
+	Vec4S32 x0 = Vec4S32::LoadAligned(tx);
+	Vec4S32 y0 = Vec4S32::LoadAligned(ty);
+	Vec4S32 x1 = Vec4S32::LoadAligned(tx + 4);
+	Vec4S32 y1 = Vec4S32::LoadAligned(ty + 4);
+	Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8);
+	Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8);

-	// use fixed-point only for X and Y. Avoid work for Z and W.
-	// We use 4x1 tiles for simplicity.
-	int minX = std::max(std::min(std::min(v0x, v1x), v2x), (int)scissor.x1) & ~3;
-	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, (int)scissor.x2) & ~3;
-	int minY = std::max(std::min(std::min(v0y, v1y), v2y), (int)scissor.y1);
-	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
-	if (maxX == minX || maxY == minY) {
-		// No pixels, or outside screen.
-		// Most of these are now gone in the initial pass.
-		return TriangleResult::NoPixels;
-	}
+	// FixupAfterMinMax is just 16->32 sign extension, in case the current platform (like SSE2) just has 16-bit min/max operations.
+	Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax();
+	Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax();
+	Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax();
+	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();

-	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
-	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
-	if (triArea < MIN_TWICE_TRI_AREA) {
-		return TriangleResult::SmallOrBackface;  // Or zero area.
-	}
+	Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0);

-	float oneOverTriArea = 1.0f / (float)triArea;
+	// Edge setup
+	Vec4S32 A12 = y1 - y2;
+	Vec4S32 B12 = x2 - x1;
+	Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2);

-	Edge e01, e12, e20;
+	Vec4S32 A20 = y2 - y0;
+	Vec4S32 B20 = x0 - x2;
+	Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0);

-	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
-	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
-	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
+	Vec4S32 A01 = y0 - y1;
+	Vec4S32 B01 = x1 - x0;
+	Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1);
+
+	// Step deltas
+	Vec4S32 stepX12 = A12.Shl<stepXShift>();
+	Vec4S32 stepY12 = B12.Shl<stepYShift>();
+	Vec4S32 stepX20 = A20.Shl<stepXShift>();
+	Vec4S32 stepY20 = B20.Shl<stepYShift>();
+	Vec4S32 stepX01 = A01.Shl<stepXShift>();
+	Vec4S32 stepY01 = B01.Shl<stepYShift>();

-	// Prepare to interpolate Z
-	Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
-	Vec4F32 zz1 = Vec4F32::Splat((tz[4] - tz[0]) * oneOverTriArea);
-	Vec4F32 zz2 = Vec4F32::Splat((tz[8] - tz[0]) * oneOverTriArea);
+	Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip();
+	Vec4F32 zbase = Vec4F32::LoadAligned(tz);
+	Vec4F32 z_20 = (Vec4F32::LoadAligned(tz + 4) - zbase) * oneOverTriArea;
+	Vec4F32 z_01 = (Vec4F32::LoadAligned(tz + 8) - zbase) * oneOverTriArea;
+	Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01);
+	Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01);

-	Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(e20.oneStepX) + zz2 * Vec4F32FromS32(e01.oneStepX);
-	Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(e20.oneStepY) + zz2 * Vec4F32FromS32(e01.oneStepY);
-	Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;
+	// Shared setup is done, now loop per-triangle in the group of four.
+	for (int t = 0; t < 4; t++) {
+		// Check for bad triangle.
+		// Using operator[] on the vectors actually seems to result in pretty good code.
+		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
+			// No pixels, or outside screen.
+			// Most of these are now gone in the initial pass, but not all since we cull
+			// in 4-groups there.
+			stats[(int)TriangleStat::NoPixels]++;
+			continue;
+		}

-	// Rasterize
-	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
-		// Barycentric coordinates at start of row
-		Vec4S32 w0 = w0_row;
-		Vec4S32 w1 = w1_row;
-		Vec4S32 w2 = w2_row;
-		Vec4F32 zs = zrow;
+		if (triArea[t] < MIN_TWICE_TRI_AREA) {
+			stats[(int)TriangleStat::SmallOrBackface]++;  // Or zero area.
+			continue;
+		}

-		uint16_t *rowPtr = depthBuf + stride * y;
+		const int minXT = minX[t] & ~3;
+		const int maxXT = maxX[t] & ~3;

-		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
-			// If p is on or inside all edges for any pixels,
-			// render those pixels.
-			Vec4S32 signCalc = w0 | w1 | w2;
-			if (!AnyZeroSignBit(signCalc)) {
-				continue;
-			}
+		const int minYT = minY[t];
+		const int maxYT = maxY[t];

-			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
-			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
-			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+		// Convert to wide registers.
+		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
+		int initialY = minY[t];
+		_dbg_assert_(A12[t] < 32767);
+		_dbg_assert_(A12[t] > -32767);
+		_dbg_assert_(A20[t] < 32767);
+		_dbg_assert_(A20[t] > -32767);
+		_dbg_assert_(A01[t] < 32767);
+		_dbg_assert_(A01[t] > -32767);

-			Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+		// TODO: The latter subexpression can be broken out of this loop, but reduces block size flexibility.
+		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
+		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
+		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);

-			// This switch is on a templated constant, so should collapse away.
-			switch (compareMode) {
-			case ZCompareMode::Greater:
-				// To implement the greater/greater-than comparison, we can combine mask and max.
-				// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
-				// We use AndNot to zero out Z results, before doing Max with the buffer.
-				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Less:  // UNTESTED
-				// This time, we OR the mask and use .Min.
-				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Always:  // UNTESTED
-				// This could be replaced with a vblend operation.
-				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
-				break;
-			}
-		}
-	}
-	return TriangleResult::OK;
-}
+		Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t];
+		Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]);
+		Vec4F32 zdeltaY = Vec4F32::Splat(zdy[t]);
+
+		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12[t]);
+		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12[t]);
+		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20[t]);
+		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20[t]);
+		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01[t]);
+		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01[t]);

-template<ZCompareMode compareMode>
-inline void DepthRaster4Triangles(int stats[4], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	for (int i = 0; i < 4; i++) {
-		TriangleResult result = DepthRasterTriangle<compareMode>(depthBuf, stride, scissor, tx + i, ty + i, tz + i);
-		stats[(int)result]++;
+		// Rasterize
+		for (int y = minYT; y <= maxYT; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
+			// Barycentric coordinates at start of row
+			Vec4S32 w0 = w0_row;
+			Vec4S32 w1 = w1_row;
+			Vec4S32 w2 = w2_row;
+			Vec4F32 zs = zrow;
+
+			uint16_t *rowPtr = depthBuf + stride * y;
+
+			for (int x = minXT; x <= maxXT; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
+				// If p is on or inside all edges for any pixels,
+				// render those pixels.
+				Vec4S32 signCalc = w0 | w1 | w2;
+
+				// TODO: Check if this check is profitable. Maybe only for big triangles?
+				if (!AnyZeroSignBit(signCalc)) {
+					continue;
+				}
+
+				Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+				Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+				// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+				Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+
+				// This switch is on a templated constant, so should collapse away.
+				switch (compareMode) {
+				case ZCompareMode::Greater:
+					// To implement the greater/greater-than comparison, we can combine mask and max.
+					// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
+					// We use AndNot to zero out Z results, before doing Max with the buffer.
+					shortZ.AndNot(shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Less:
+					// This time, we OR the mask and use .Min.
+					(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Always:  // UNTESTED
+					// This could be replaced with a vblend operation.
+					((bufferValues & shortMaskInv) | shortZ.AndNot(shortMaskInv)).Store(rowPtr + x);
+					break;
+				}
+			}
+		}
+
+		stats[(int)TriangleStat::OK]++;
 	}
 }
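A scalar model of the branchless Greater-mode store above, to make the mask logic concrete (a standalone sketch, not part of this diff): shortMaskInv is all-ones where a pixel is outside the triangle, so zeroing Z through AndNot and then taking an unsigned max applies coverage mask and depth test in one operation:

#include <cassert>
#include <cstdint>

// One lane of the ZCompareMode::Greater store path.
uint16_t DepthUpdateGreater(uint16_t bufferValue, uint16_t z, uint16_t maskInv) {
	uint16_t zMasked = z & ~maskInv;                       // AndNot: 0 outside the triangle
	return zMasked > bufferValue ? zMasked : bufferValue;  // unsigned Max against the buffer
}

int main() {
	assert(DepthUpdateGreater(100, 200, 0x0000) == 200);  // inside, depth test passes
	assert(DepthUpdateGreater(300, 200, 0x0000) == 300);  // inside, depth test fails
	assert(DepthUpdateGreater(300, 999, 0xFFFF) == 300);  // outside: buffer preserved
	return 0;
}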
@@ -343,7 +367,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	}
 	const bool cullEnabled = draw.cullEnabled;

-	static const float zerovec[4] = {};
+	static const float zerovec[4] = {0.0f, 0.0f, 0.0f, 1.0f};

 	int collected = 0;
 	int planeCulled = 0;
@@ -351,18 +375,36 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	const float *verts[12];  // four triangles at a time!
 	const int count = draw.vertexCount;

-	Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1);
-	Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1);
-	Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2);
-	Vec4F32 scissorY2 = Vec4F32::Splat((float)scissor.y2);
+	// Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster.
+	// This is slightly off-center since we are already in screen space, but whatever.
+	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096);
+	Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096);
+
+	Vec4S32 scissorX1 = Vec4S32::Splat((float)scissor.x1);
+	Vec4S32 scissorY1 = Vec4S32::Splat((float)scissor.y1);
+	Vec4S32 scissorX2 = Vec4S32::Splat((float)scissor.x2);
+	Vec4S32 scissorY2 = Vec4S32::Splat((float)scissor.y2);
+
+	// Add cheap pre-projection pre-checks for bad triangles here. Not much we can do safely other than checking W.
+	auto validVert = [](const float *v) -> bool {
+		if (v[3] <= 0.0f || v[2] <= 0.0f) {
+			return false;
+		}
+		/*
+		if (v[2] >= 65535.0f * v[3]) {
+			return false;
+		}*/
+		return true;
+	};

 	for (int i = 0; i < count; i += 3) {
 		// Collect valid triangles into buffer.
 		const float *v0 = transformed + indexBuffer[i] * 4;
 		const float *v1 = transformed + indexBuffer[i + (1 ^ flipCull)] * 4;
 		const float *v2 = transformed + indexBuffer[i + (2 ^ flipCull)] * 4;
-		// Don't collect triangle if any vertex is behind the 0 plane.
-		if (v0[3] > 0.0f && v1[3] > 0.0f && v2[3] > 0.0f) {
+		// Don't collect triangle if any vertex is beyond the planes.
+		// TODO: Optimize this somehow.
+		if (validVert(v0) && validVert(v1) && validVert(v2)) {
 			verts[collected] = v0;
 			verts[collected + 1] = v1;
 			verts[collected + 2] = v2;
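A quick sanity check on the +/-4096 guardband chosen above (a sketch of the arithmetic, not part of this diff), since it is what keeps the later 16-bit edge setup and the _dbg_assert_ bounds on A12/A20/A01 valid:

#include <cstdint>

// With every guardbanded coordinate in [-4096, 4096], the edge coefficients
// A = y1 - y2 and B = x2 - x1 stay within [-8192, 8192], comfortably inside
// the signed 16-bit range that Mul16 (the madd-based multiply) requires.
// The C terms (x*y products) can reach about 2^25, but they are only ever
// added, never fed into a 16-bit multiply, so 32-bit lanes hold them fine.
static_assert(4096 - (-4096) == 8192, "largest possible |A| or |B|");
static_assert(8192 <= 32767, "fits the 16-bit multiplies used in edge setup");

int main() { return 0; }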
@@ -380,6 +422,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		}

 		if (collected != 12) {
+			// Fetch more!
 			continue;
 		}
@@ -411,38 +454,53 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	Vec4F32 recipW2 = w2.Recip();
 	x0 *= recipW0;
 	y0 *= recipW0;
-	z0 = (z0 * recipW0).Clamp(0.0f, 65535.0f);
+	z0 *= recipW0;
 	x1 *= recipW1;
 	y1 *= recipW1;
-	z1 = (z1 * recipW1).Clamp(0.0f, 65535.0f);
+	z1 *= recipW1;
 	x2 *= recipW2;
 	y2 *= recipW2;
-	z2 = (z2 * recipW2).Clamp(0.0f, 65535.0f);
+	z2 *= recipW2;

-	// Check bounding box size (clamped to screen edges). Cast to integer for crude rounding (and to match the rasterizer).
-	Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)).Max(scissorX1));
-	Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)).Max(scissorY1));
-	Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)).Min(scissorX2));
-	Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)).Min(scissorY2));
+	// Check bounding box size. Cast to integer for crude rounding (and to approximately match the rasterizer).
+	Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)));
+	Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)));
+	Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)));
+	Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)));

-	// If all are equal in any dimension, all four triangles are tiny nonsense (or outside the scissor) and can be skipped early.
+	// If all are equal in any dimension, all four triangles are tiny nonsense and can be skipped early.
 	Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);
-	// Otherwise we just proceed to triangle setup with all four for now. Later might want to
-	// compact the remaining triangles... Or do more checking here.
+	// Otherwise we just proceed to triangle setup with all four for now.
+	// We could also save the computed boxes for later.
+	// TODO: Merge into below checks? Though nice with an early out.
 	if (!AnyZeroSignBit(eqMask)) {
 		boxCulled += 4;
 		continue;
 	}

+	// Create a mask to kill coordinates of triangles that poke outside the guardband (or are just empty).
+	Vec4S32 inGuardBand =
+		((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
+		 (minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask);
+
+	// Create another mask to kill off-screen triangles. Not perfectly accurate.
+	inGuardBand &= (maxX.CompareGt(scissorX1) & minX.CompareLt(scissorX2)) & (maxY.CompareGt(scissorY1) & minY.CompareLt(scissorY2));
+
+	// It's enough to smash one coordinate to make future checks (like the tri area check) fail.
+	x0 &= inGuardBand;
+	x1 &= inGuardBand;
+	x2 &= inGuardBand;

-	// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
+	// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
 	// Still good for culling early and pretty cheap to compute.
-	Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
-	if (!AnyZeroSignBit(triArea)) {
+	Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA));
+	if (!AnyZeroSignBit(doubleTriArea)) {
 		gpuStats.numDepthRasterEarlySize += 4;
 		continue;
 	}

+	// Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected.
 	Vec4S32FromF32(x0).Store(tx + outCount);
 	Vec4S32FromF32(x1).Store(tx + outCount + 4);
 	Vec4S32FromF32(x2).Store(tx + outCount + 8);
@@ -453,13 +511,22 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	z1.Store(tz + outCount + 4);
 	z2.Store(tz + outCount + 8);

+#ifdef _DEBUG
+	for (int i = 0; i < 12; i++) {
+		_dbg_assert_(tx[outCount + i] < 32767);
+		_dbg_assert_(tx[outCount + i] >= -32768);
+		_dbg_assert_(ty[outCount + i] < 32767);
+		_dbg_assert_(ty[outCount + i] >= -32768);
+	}
+#endif
+
 	outCount += 12;

 	if (!cullEnabled) {
-		// If culling is off, store the triangles again, in the opposite order.
-		Vec4S32FromF32(x0).Store(tx + outCount);
-		Vec4S32FromF32(x2).Store(tx + outCount + 4);
-		Vec4S32FromF32(x1).Store(tx + outCount + 8);
+		// If culling is off, store the triangles again, with the first two vertices swapped.
+		(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
+		(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4);
+		(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8);
 		Vec4S32FromF32(y0).Store(ty + outCount);
 		Vec4S32FromF32(y2).Store(ty + outCount + 4);
 		Vec4S32FromF32(y1).Store(ty + outCount + 8);
@@ -514,9 +581,9 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 			}
 		}
 	}
-	gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
-	gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
-	gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
+	gpuStats.numDepthRasterNoPixels += stats[(int)TriangleStat::NoPixels];
+	gpuStats.numDepthRasterTooSmall += stats[(int)TriangleStat::SmallOrBackface];
+	gpuStats.numDepthRasterPrims += stats[(int)TriangleStat::OK];
 	break;
 }
 default:
@@ -56,6 +56,7 @@
 #include "Common/Buffer.h"
 #include "Common/File/Path.h"
 #include "Common/Math/SIMDHeaders.h"
+#include "Common/Math/CrossSIMD.h"
 // Get some more instructions for testing
 #if PPSSPP_ARCH(SSE2)
 #include <immintrin.h>
@@ -1048,7 +1049,7 @@ CharQueue GetQueue() {

 bool TestCharQueue() {
 	// We use a tiny block size for testing.
-	CharQueue queue = std::move(GetQueue());
+	CharQueue queue = GetQueue();

 	// Add 16 chars.
 	queue.push_back("abcdefghijkl");
@@ -1124,6 +1125,21 @@ bool TestSIMD() {
 	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 #endif

+	const int testval[2][4] = {
+		{ 0x1000, 0x2000, 0x3000, 0x7000 },
+		{ -0x1000, -0x2000, -0x3000, -0x7000 }
+	};
+
+	for (int i = 0; i < 2; i++) {
+		Vec4S32 s = Vec4S32::Load(testval[i]);
+		Vec4S32 square = s * s;
+		Vec4S32 square16 = s.Mul16(s);
+		EXPECT_EQ_INT(square[0], square16[0]);
+		EXPECT_EQ_INT(square[1], square16[1]);
+		EXPECT_EQ_INT(square[2], square16[2]);
+		EXPECT_EQ_INT(square[3], square16[3]);
+	}
 	return true;
 }