From 7699fa55de16f4ed14512dd702fbded2ff0daa70 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 6 May 2017 18:58:15 -0700
Subject: [PATCH] arm: Jit throughmode 16-bit texcoords.

It's popular, and this makes decoding such verts much faster.
---
 GPU/Common/VertexDecoderArm.cpp   | 39 ++++++++++++++++++++++++++++++-
 GPU/Common/VertexDecoderArm64.cpp | 22 ++++++++++++++++++++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index 3f9267cab3..ce87e02117 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -127,7 +127,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
@@ -568,6 +568,43 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 	STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
 }
 
+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	LDRH(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
+
+	MOVP2R(scratchReg, &gstate_c.vertBounds.minU);
+
+	auto updateSide = [&](ARMReg r, CCFlags cc, u32 off) {
+		LDRH(tempReg3, scratchReg, off);
+		CMP(r, tempReg3);
+		SetCC(cc);
+		STRH(r, scratchReg, off);
+		SetCC(CC_AL);
+	};
+
+	// TODO: Can this actually be fast?  Hmm, floats aren't better.
+	updateSide(tempReg1, CC_LT, offsetof(KnownVertexBounds, minU));
+	updateSide(tempReg1, CC_GT, offsetof(KnownVertexBounds, maxU));
+	updateSide(tempReg2, CC_LT, offsetof(KnownVertexBounds, minV));
+	updateSide(tempReg2, CC_GT, offsetof(KnownVertexBounds, maxV));
+
+	if (cpu_info.bNEON) {
+		ADD(scratchReg, srcReg, dec_->tcoff);
+		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
+		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
+		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
+		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
+		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
+	} else {
+		VMOV(fpScratchReg, tempReg1);
+		VMOV(fpScratchReg2, tempReg2);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
+		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
+		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
+	}
+}
+
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDR(tempReg1, srcReg, dec_->tcoff);
 	LDR(tempReg2, srcReg, dec_->tcoff + 4);
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index f68f100457..12f58e829b 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -101,7 +101,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
@@ -579,6 +579,26 @@ void VertexDecoderJitCache::Jit_Color5551() {
 	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
 }
 
+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
+
+	auto updateSide = [&](ARM64Reg src, CCFlags cc, ARM64Reg dst) {
+		CMP(src, dst);
+		CSEL(dst, src, dst, cc);
+	};
+
+	updateSide(tempReg1, CC_LT, boundsMinUReg);
+	updateSide(tempReg1, CC_GT, boundsMaxUReg);
+	updateSide(tempReg2, CC_LT, boundsMinVReg);
+	updateSide(tempReg2, CC_GT, boundsMaxVReg);
+
+	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
+	fp.UXTL(16, neonScratchRegQ, neonScratchRegD);  // Widen to 32-bit
+	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
+	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
+}
+
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
 	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
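
For reference, here is a rough C++ sketch of what one Jit_TcU16ThroughToFloat step does per vertex. It is not code from this patch and not the project's actual Step_TcU16ThroughToFloat implementation; KnownVertexBounds mirrors the minU/maxU/minV/maxV fields the patch updates, while TcU16ThroughToFloatRef, srcPtr, dstPtr, tcoff, and uvoff are hypothetical stand-ins for srcReg, dstReg, dec_->tcoff, and dec_->decFmt.uvoff.

#include <algorithm>
#include <cstdint>
#include <cstring>

struct KnownVertexBounds {
	uint16_t minU, maxU;
	uint16_t minV, maxV;
};

// Illustrative scalar equivalent of the emitted code, one vertex at a time.
static void TcU16ThroughToFloatRef(const uint8_t *srcPtr, uint8_t *dstPtr,
                                   int tcoff, int uvoff, KnownVertexBounds &bounds) {
	// Two 16-bit through-mode texcoords, as in the paired LDRH loads.
	uint16_t u, v;
	std::memcpy(&u, srcPtr + tcoff, sizeof(u));
	std::memcpy(&v, srcPtr + tcoff + 2, sizeof(v));

	// Track the UV bounding box, like the conditional STRH (ARM) / CSEL (ARM64) updates.
	bounds.minU = std::min(bounds.minU, u);
	bounds.maxU = std::max(bounds.maxU, u);
	bounds.minV = std::min(bounds.minV, v);
	bounds.maxV = std::max(bounds.maxV, v);

	// Widen to float and store, like VMOVL/VCVT/VST1 (NEON) or UXTL/UCVTF/STUR (ARM64).
	float uv[2] = { (float)u, (float)v };
	std::memcpy(dstPtr + uvoff, uv, sizeof(uv));
}

On 32-bit ARM the vector path (VLD1_lane/VMOVL/VCVT/VST1) is only emitted when cpu_info.bNEON is set, with a scalar VFP VMOV/VCVT/VSTR fallback otherwise; ARM64 always has ASIMD, so only the LDUR/UXTL/UCVTF/STUR sequence is emitted there.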