From ca248e1201df93d13ec11f1902caff2ed31f7fcd Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 18 Sep 2022 07:15:30 -0700 Subject: [PATCH] softgpu: Fix s8 primitives in throughmode. Also always cull no-position verts, hardware too. Matches tests. --- GPU/Common/VertexDecoderArm.cpp | 14 +++---------- GPU/Common/VertexDecoderArm64.cpp | 12 ++++------- GPU/Common/VertexDecoderCommon.cpp | 33 +++++++++++++++++------------- GPU/Common/VertexDecoderCommon.h | 1 + GPU/Common/VertexDecoderX86.cpp | 7 +------ GPU/Software/TransformUnit.cpp | 4 +--- test.py | 2 +- 7 files changed, 30 insertions(+), 43 deletions(-) diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp index e3d5b11f6b..2b93b563f8 100644 --- a/GPU/Common/VertexDecoderArm.cpp +++ b/GPU/Common/VertexDecoderArm.cpp @@ -872,22 +872,14 @@ void VertexDecoderJitCache::Jit_NormalFloat() { STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3); } -// Through expands into floats, always. Might want to look at changing this. void VertexDecoderJitCache::Jit_PosS8Through() { - DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode"); _dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order."); _dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order."); - // TODO: SIMD - LDRSB(tempReg1, srcReg, dec_->posoff); - LDRSB(tempReg2, srcReg, dec_->posoff + 1); - LDRB(tempReg3, srcReg, dec_->posoff + 2); - static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 }; - static const ARMReg fr[3] = { fpScratchReg, fpScratchReg2, fpScratchReg3 }; + // 8-bit positions in throughmode always decode to 0, depth included. 
+ VEOR(neonScratchReg, neonScratchReg, neonScratchReg); + VEOR(neonScratchReg2, neonScratchReg, neonScratchReg); ADD(scratchReg, dstReg, dec_->decFmt.posoff); - VMOV(neonScratchReg, tempReg1, tempReg2); - VMOV(neonScratchReg2, tempReg3, tempReg3); - VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ); VST1(F_32, neonScratchReg, scratchReg, 2, ALIGN_NONE); } diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 5ff1d605f3..0ad04dbe5c 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -668,15 +668,11 @@ void VertexDecoderJitCache::Jit_PosFloat() { } void VertexDecoderJitCache::Jit_PosS8Through() { - LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff); - LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 1); - LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 2); - fp.SCVTF(fpScratchReg, tempReg1); - fp.SCVTF(fpScratchReg2, tempReg2); - fp.SCVTF(fpScratchReg3, tempReg3); + // 8-bit positions in throughmode always decode to 0, depth included. + fp.EOR(fpScratchReg, fpScratchReg, fpScratchReg); STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8); + STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff + 4); + STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff + 8); } void VertexDecoderJitCache::Jit_PosS16Through() { diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index a9b4a1039a..62eb20ef84 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -773,14 +773,20 @@ void VertexDecoder::Step_PosFloatSkin() const Vec3ByMatrix43(pos, fn, skinMatrix); } -void VertexDecoder::Step_PosS8Through() const -{ +void VertexDecoder::Step_PosInvalid() const { + // Invalid positions are just culled. Simulate by forcing invalid values. 
float *v = (float *)(decoded_ + decFmt.posoff); - const s8 *sv = (const s8 *)(ptr_ + posoff); - const u8 *uv = (const u8 *)(ptr_ + posoff); - v[0] = sv[0]; - v[1] = sv[1]; - v[2] = uv[2]; + v[0] = std::numeric_limits<float>::infinity(); + v[1] = std::numeric_limits<float>::infinity(); + v[2] = std::numeric_limits<float>::infinity(); +} + +void VertexDecoder::Step_PosS8Through() const { + // 8-bit positions in throughmode always decode to 0, depth included. + float *v = (float *)(decoded_ + decFmt.posoff); + v[0] = 0; + v[1] = 0; + v[2] = 0; } void VertexDecoder::Step_PosS16Through() const @@ -1023,35 +1029,35 @@ static const StepFunction nrmstep_morphskin[4] = { }; static const StepFunction posstep[4] = { - &VertexDecoder::Step_PosS8, + &VertexDecoder::Step_PosInvalid, &VertexDecoder::Step_PosS8, &VertexDecoder::Step_PosS16, &VertexDecoder::Step_PosFloat, }; static const StepFunction posstep_skin[4] = { - &VertexDecoder::Step_PosS8Skin, + &VertexDecoder::Step_PosInvalid, &VertexDecoder::Step_PosS8Skin, &VertexDecoder::Step_PosS16Skin, &VertexDecoder::Step_PosFloatSkin, }; static const StepFunction posstep_morph[4] = { - &VertexDecoder::Step_PosS8Morph, + &VertexDecoder::Step_PosInvalid, &VertexDecoder::Step_PosS8Morph, &VertexDecoder::Step_PosS16Morph, &VertexDecoder::Step_PosFloatMorph, }; static const StepFunction posstep_morph_skin[4] = { - &VertexDecoder::Step_PosS8MorphSkin, + &VertexDecoder::Step_PosInvalid, &VertexDecoder::Step_PosS8MorphSkin, &VertexDecoder::Step_PosS16MorphSkin, &VertexDecoder::Step_PosFloatMorphSkin, }; static const StepFunction posstep_through[4] = { - &VertexDecoder::Step_PosS8Through, + &VertexDecoder::Step_PosInvalid, &VertexDecoder::Step_PosS8Through, &VertexDecoder::Step_PosS16Through, &VertexDecoder::Step_PosFloatThrough, @@ -1224,9 +1230,8 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, bool reportNoPos = false; if (!pos) { reportNoPos = true; - pos = 1; } - if (pos) { // there's always a position + if (pos >= 0) { 
// there's always a position size = align(size, posalign[pos]); posoff = size; size += possize[pos]; diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 983f76f9b2..6a06093902 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -433,6 +433,7 @@ public: void Step_PosS16MorphSkin() const; void Step_PosFloatMorphSkin() const; + void Step_PosInvalid() const; void Step_PosS8Through() const; void Step_PosS16Through() const; void Step_PosFloatThrough() const; diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index d684078989..13aabe2df3 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -1345,14 +1345,9 @@ void VertexDecoderJitCache::Jit_NormalFloatSkin() { // Through expands into floats, always. Might want to look at changing this. void VertexDecoderJitCache::Jit_PosS8Through() { - DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode"); // SIMD doesn't really matter since this isn't useful on hardware. + XORPS(fpScratchReg, R(fpScratchReg)); for (int i = 0; i < 3; i++) { - if (i == 2) - MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i)); - else - MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i)); - CVTSI2SS(fpScratchReg, R(tempReg1)); MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg); } } diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 4822a5a215..c1471f7172 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -503,10 +503,8 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G if (gstate_c.skipDrawReason & SKIPDRAW_SKIPFRAME) { return; } - // Throughmode never draws 8-bit primitives, maybe because they can't fully specify the screen? 
- if ((vertex_type & GE_VTYPE_THROUGH_MASK) != 0 && (vertex_type & GE_VTYPE_POS_MASK) == GE_VTYPE_POS_8BIT) - return; // Vertices without position are just entirely culled. + // Note: Throughmode does draw 8-bit primitives, but positions are always zero - handled in decode. if ((vertex_type & GE_VTYPE_POS_MASK) == 0) return; diff --git a/test.py b/test.py index a805fef82d..4597804a51 100755 --- a/test.py +++ b/test.py @@ -158,6 +158,7 @@ tests_good = [ "gpu/ge/enqueueparam", "gpu/ge/queue", "gpu/primitives/indices", + "gpu/primitives/invalidprim", "gpu/primitives/trianglefan", "gpu/primitives/trianglestrip", "gpu/primitives/triangles", @@ -400,7 +401,6 @@ tests_next = [ "gpu/primitives/bezier", "gpu/primitives/continue", "gpu/primitives/immediate", - "gpu/primitives/invalidprim", "gpu/primitives/lines", "gpu/primitives/linestrip", "gpu/primitives/points",