diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp index c05e6ac407..f24e920991 100644 --- a/Common/x64Emitter.cpp +++ b/Common/x64Emitter.cpp @@ -1701,16 +1701,15 @@ void XEmitter::PSRLQ(X64Reg reg, int shift) Write8(shift); } -void XEmitter::PSRLQ(X64Reg reg, OpArg arg) -{ - WriteSSEOp(0x66, 0xd3, reg, arg); -} - void XEmitter::PSRLDQ(X64Reg reg, int shift) { WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); Write8(shift); } +void XEmitter::PSRLW(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD1, reg, arg); } +void XEmitter::PSRLD(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD2, reg, arg); } +void XEmitter::PSRLQ(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD3, reg, arg); } + void XEmitter::PSLLW(X64Reg reg, int shift) { WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); @@ -1734,6 +1733,10 @@ void XEmitter::PSLLDQ(X64Reg reg, int shift) { Write8(shift); } +void XEmitter::PSLLW(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF1, reg, arg); } +void XEmitter::PSLLD(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF2, reg, arg); } +void XEmitter::PSLLQ(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF3, reg, arg); } + void XEmitter::PSRAW(X64Reg reg, int shift) { WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg)); @@ -1746,6 +1749,9 @@ void XEmitter::PSRAD(X64Reg reg, int shift) Write8(shift); } +void XEmitter::PSRAW(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xE1, reg, arg); } +void XEmitter::PSRAD(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xE2, reg, arg); } + void XEmitter::PMULLW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0xD5, dest, arg);} void XEmitter::PMULHW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0xE5, dest, arg);} void XEmitter::PMULHUW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0xE4, dest, arg);} diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h index dbdf921871..ae885397dc 100644 --- a/Common/x64Emitter.h +++ b/Common/x64Emitter.h @@ -847,16 +847,31 @@ public: void PSRLW(X64Reg reg, int shift); void PSRLD(X64Reg reg, int shift); void PSRLQ(X64Reg reg, int shift); 
- void PSRLQ(X64Reg reg, OpArg arg); void PSRLDQ(X64Reg reg, int shift); + // Note: all values shifted by the low 64 bits of the XMM arg. + void PSRLW(X64Reg reg, OpArg arg); + // Note: all values shifted by the low 64 bits of the XMM arg. + void PSRLD(X64Reg reg, OpArg arg); + // Note: both values shifted by the low 64 bits of the XMM arg. + void PSRLQ(X64Reg reg, OpArg arg); void PSLLW(X64Reg reg, int shift); void PSLLD(X64Reg reg, int shift); void PSLLQ(X64Reg reg, int shift); void PSLLDQ(X64Reg reg, int shift); + // Note: all values shifted by the low 64 bits of the XMM arg. + void PSLLW(X64Reg reg, OpArg arg); + // Note: all values shifted by the low 64 bits of the XMM arg. + void PSLLD(X64Reg reg, OpArg arg); + // Note: both values shifted by the low 64 bits of the XMM arg. + void PSLLQ(X64Reg reg, OpArg arg); void PSRAW(X64Reg reg, int shift); void PSRAD(X64Reg reg, int shift); + // Note: all values shifted by the low 64 bits of the XMM arg. + void PSRAW(X64Reg reg, OpArg arg); + // Note: all values shifted by the low 64 bits of the XMM arg. 
+ void PSRAD(X64Reg reg, OpArg arg); void PMULLW(X64Reg dest, const OpArg &arg); void PMULHW(X64Reg dest, const OpArg &arg); diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h index b1530ee883..2b844aa739 100644 --- a/GPU/Software/Sampler.h +++ b/GPU/Software/Sampler.h @@ -102,6 +102,7 @@ private: bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel); bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel); bool Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFallback); + bool Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex); bool Jit_ReadClutQuad(const SamplerID &id, bool level1); bool Jit_BlendQuad(const SamplerID &id, bool level1); bool Jit_DecodeQuad(const SamplerID &id, bool level1); diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp index da734c2fa3..a6a1aeb83a 100644 --- a/GPU/Software/SamplerX86.cpp +++ b/GPU/Software/SamplerX86.cpp @@ -944,7 +944,7 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal bool success = true; // TODO: Limit less. - if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4 && id.ClutFmt() == GE_CMODE_32BIT_ABGR8888 && !id.hasClutMask && !id.hasClutOffset && !id.hasClutShift) { + if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4) { Describe("ReadQuad"); X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR); @@ -970,13 +970,12 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal VPSRLVD(128, vecIndexReg, vecIndexReg, R(uReg)); regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U); - // Okay, now we need to mask out just the low four bits. 
- PCMPEQD(maskReg, R(maskReg)); - PSRLD(maskReg, 28); - PAND(vecIndexReg, R(maskReg)); regCache_.Release(maskReg, RegCache::VEC_TEMP0); regCache_.Unlock(vecIndexReg, RegCache::VEC_INDEX); + // Apply mask and any other CLUT transformations. + success = success && Jit_TransformClutIndexQuad(id, 4); + // Great, now we can use our CLUT indices to gather again. success = success && Jit_ReadClutQuad(id, level1); } else { @@ -986,6 +985,96 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal return success; } +bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex) { + Describe("TrCLUTQuad"); + GEPaletteFormat fmt = id.ClutFmt(); + if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) { + // This is simple - just mask. + X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX); + // Mask to 8 bits for CLUT8/16/32, 4 bits for CLUT4. + PSLLD(indexReg, bitsPerIndex >= 8 ? 24 : 28); + PSRLD(indexReg, bitsPerIndex >= 8 ? 24 : 28); + regCache_.Unlock(indexReg, RegCache::VEC_INDEX); + + return true; + } + + X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX); + bool maskedIndex = false; + + // Okay, first load the actual gstate clutformat bits we'll use. + X64Reg formatReg = regCache_.Alloc(RegCache::VEC_TEMP0); + X64Reg gstateReg = GetGState(); + if (cpu_info.bAVX2 && !id.hasClutShift) + VPBROADCASTD(128, formatReg, MDisp(gstateReg, offsetof(GPUgstate, clutformat))); + else + MOVD_xmm(formatReg, MDisp(gstateReg, offsetof(GPUgstate, clutformat))); + regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE); + + // Shift = (clutformat >> 2) & 0x1F + if (id.hasClutShift) { + // Before shifting, let's mask if needed (we always read 32 bits.) + // We have to do this here, because the bits should be zero even if F is used as a mask. 
+ if (bitsPerIndex < 32) { + PSLLD(indexReg, 32 - bitsPerIndex); + PSRLD(indexReg, 32 - bitsPerIndex); + maskedIndex = true; + } + + X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1); + // Shift against walls to get 5 bits after the rightmost 2. + if (cpu_info.bAVX) { + VPSLLD(128, shiftReg, formatReg, 32 - 7); + } else { + MOVDQA(shiftReg, R(formatReg)); + PSLLD(shiftReg, 32 - 7); + } + PSRLD(shiftReg, 32 - 5); + // The other lanes are zero, so we can use PSRLD. + PSRLD(indexReg, R(shiftReg)); + regCache_.Release(shiftReg, RegCache::VEC_TEMP1); + } + + // With shifting done, we need the format in each lane. + if (!cpu_info.bAVX2 || id.hasClutShift) + PSHUFD(formatReg, R(formatReg), _MM_SHUFFLE(0, 0, 0, 0)); + + // Mask = (clutformat >> 8) & 0xFF + if (id.hasClutMask) { + X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1); + // If it was CLUT4, grab only 4 bits of the mask. + if (cpu_info.bAVX) { + VPSLLD(128, maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16); + } else { + MOVDQA(maskReg, R(formatReg)); + PSLLD(maskReg, bitsPerIndex == 4 ? 20 : 16); + } + PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24); + + PAND(indexReg, R(maskReg)); + regCache_.Release(maskReg, RegCache::VEC_TEMP1); + } else if (!maskedIndex || bitsPerIndex > 8) { + // Apply the fixed 8 bit mask (or the CLUT4 mask if we didn't shift.) + PSLLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28); + PSRLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28); + } + + // Offset = (clutformat >> 12) & 0x01F0 + if (id.hasClutOffset) { + // Use walls to extract the 5 bits at 16, and then put them shifted left by 4. + int offsetBits = fmt == GE_CMODE_32BIT_ABGR8888 ? 
4 : 5; + PSRLD(formatReg, 16); + PSLLD(formatReg, 32 - offsetBits); + PSRLD(formatReg, 32 - offsetBits - 4); + + POR(indexReg, R(formatReg)); + } + + regCache_.Release(formatReg, RegCache::VEC_TEMP0); + regCache_.Unlock(indexReg, RegCache::VEC_INDEX); + return true; +} + bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) { Describe("ReadCLUTQuad"); X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);