From 0727df6f0a620b2dcc97ddb220db7ab3142b2503 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Wed, 10 Sep 2014 10:52:59 +0200 Subject: [PATCH] Jit the most common of the "ToFloat" texcoord conversions --- GPU/Common/VertexDecoderArm.cpp | 1 + GPU/Common/VertexDecoderX86.cpp | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp index 82ce8cd3bb..f41d0b2d0d 100644 --- a/GPU/Common/VertexDecoderArm.cpp +++ b/GPU/Common/VertexDecoderArm.cpp @@ -51,6 +51,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f}; static const float by128 = 1.0f / 128.0f; +static const float by16384 = 1.0f / 16384.0f; static const float by32768 = 1.0f / 32768.0f; using namespace ArmGen; diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index 1666eb10e3..28454c061e 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -39,6 +39,10 @@ static const float MEMORY_ALIGNED16( by32768[4] ) = { static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000}; +static const float MEMORY_ALIGNED16(by16384[4]) = { + 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, +}; + #ifdef _M_X64 #ifdef _WIN32 static const X64Reg tempReg1 = RAX; @@ -90,6 +94,8 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, + {&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat}, + {&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat}, {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, {&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double}, @@ -98,6 +104,7 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, + {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat}, {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, {&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble}, @@ -464,6 +471,26 @@ void VertexDecoderJitCache::Jit_TcU16() { MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); } +void VertexDecoderJitCache::Jit_TcU8ToFloat() { + // TODO: The first five instructions could be done in 1 or 2 in SSE4 + MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff)); + MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1)); + CVTSI2SS(fpScratchReg, R(tempReg1)); + CVTSI2SS(fpScratchReg2, R(tempReg2)); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + MULPS(fpScratchReg, M(&by128)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcU16ToFloat() { + PXOR(fpScratchReg2, R(fpScratchReg2)); + MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); + PUNPCKLWD(fpScratchReg, R(fpScratchReg2)); + CVTDQ2PS(fpScratchReg, R(fpScratchReg)); + MULPS(fpScratchReg, M(&by32768)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + void VertexDecoderJitCache::Jit_TcU16Double() { MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); @@ -525,6 +552,14 @@ void VertexDecoderJitCache::Jit_TcU16Through() { MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); } +void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() { + PXOR(fpScratchReg2, R(fpScratchReg2)); + MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); + PUNPCKLWD(fpScratchReg, R(fpScratchReg2)); + CVTDQ2PS(fpScratchReg, R(fpScratchReg)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));