Merge pull request #15372 from unknownbrackets/bmi2

Optimize jits with a bit of BMI2
Henrik Rydgård 2022-02-01 09:43:35 +01:00 committed by GitHub
commit f58d4dfcfe
6 changed files with 484 additions and 221 deletions


@ -255,6 +255,8 @@ void CPUInfo::Detect() {
if ((cpu_id[1] >> 11) & 1)
bRTM = true;
}
bBMI2_fast = bBMI2 && (vendor != VENDOR_AMD || family >= 0x19);
}
if (max_ex_fn >= 0x80000004) {
// Extract brand string

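For reference, PDEP and PEXT are the BMI2 bit scatter/gather instructions this new flag gates: they execute in a single uop on Intel and on AMD from Zen 3 (family 0x19) up, but are microcoded and far slower on earlier AMD CPUs, which is exactly what the vendor/family check encodes. A portable sketch of their semantics (reference code, not part of the commit):

#include <cstdint>

// PEXT: gather the bits of val selected by mask into the low bits of the result.
uint32_t pext32(uint32_t val, uint32_t mask) {
	uint32_t out = 0;
	for (uint32_t bit = 1; mask != 0; mask &= mask - 1, bit <<= 1) {
		if (val & (mask & -mask))  // test the lowest remaining mask bit
			out |= bit;
	}
	return out;
}

// PDEP: the inverse, scatter the low bits of val to the positions set in mask.
uint32_t pdep32(uint32_t val, uint32_t mask) {
	uint32_t out = 0;
	for (uint32_t bit = 1; mask != 0; mask &= mask - 1, bit <<= 1) {
		if (val & bit)
			out |= mask & -mask;  // set the lowest remaining mask bit
	}
	return out;
}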

@ -54,6 +54,7 @@ struct CPUInfo {
bool bLZCNT;
bool bBMI1;
bool bBMI2;
bool bBMI2_fast;
bool bXOP;
bool bRTM;


@ -1458,6 +1458,7 @@ void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg
{
CheckFlags();
_assert_msg_(cpu_info.bBMI1, "Trying to use BMI1 on a system that doesn't support it.");
_assert_msg_(!arg.IsImm(), "Imm arg unsupported for this BMI1 instruction");
WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}
@ -1465,6 +1466,7 @@ void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg
{
CheckFlags();
_assert_msg_(cpu_info.bBMI2, "Trying to use BMI2 on a system that doesn't support it.");
_assert_msg_(!arg.IsImm(), "Imm arg unsupported for this BMI2 instruction");
WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}

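The BMI shifts these helpers emit differ from the legacy SHL/SHR/SAR in two ways the jits below exploit: the shift count can live in any general register rather than being pinned to CL, and EFLAGS are left untouched. Their semantics, as a reference sketch (the hardware masks the count to the operand width, just like the legacy shifts):

#include <cstdint>

uint32_t shlx32(uint32_t src, uint32_t count) { return src << (count & 31); }
uint32_t shrx32(uint32_t src, uint32_t count) { return src >> (count & 31); }
int32_t sarx32(int32_t src, uint32_t count) { return src >> (count & 31); }  // arithmetic shift

// RORX is the odd one out: its rotate count is an immediate only.
uint32_t rorx32(uint32_t src, uint32_t imm) {
	imm &= 31;
	return imm == 0 ? src : (src >> imm) | (src << (32 - imm));
}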

@ -22,6 +22,7 @@
#include "Common/BitSet.h"
#include "Common/CommonTypes.h"
#include "Common/CPUDetect.h"
#include "Core/MIPS/MIPSCodeUtils.h"
#include "Core/MIPS/x86/Jit.h"
#include "Core/MIPS/x86/RegCache.h"
@ -647,16 +648,36 @@ namespace MIPSComp
}
gpr.Lock(rd, rt, rs);
if (gpr.IsImm(rs)) {
int sa = gpr.GetImm(rs);
gpr.MapReg(rd, rd == rt, true);
if (cpu_info.bBMI2 && shift == &XEmitter::ROR) {
_assert_(!gpr.IsImm(rt));
RORX(32, gpr.RX(rd), gpr.R(rt), sa & 0x1F);
} else {
if (rd != rt)
MOV(32, gpr.R(rd), gpr.R(rt));
(this->*shift)(32, gpr.R(rd), Imm8(sa & 0x1F));
}
} else if (cpu_info.bBMI2 && shift != &XEmitter::ROR) {
gpr.MapReg(rd, rd == rt || rd == rs, true);
gpr.MapReg(rs, true, false);
MIPSGPReg src = rt;
if (gpr.IsImm(rt) && rd == rs) {
gpr.MapReg(rt, true, false);
} else if (gpr.IsImm(rt)) {
MOV(32, gpr.R(rd), gpr.R(rt));
src = rd;
}
if (shift == &XEmitter::SHL)
SHLX(32, gpr.RX(rd), gpr.R(src), gpr.RX(rs));
else if (shift == &XEmitter::SHR)
SHRX(32, gpr.RX(rd), gpr.R(src), gpr.RX(rs));
else if (shift == &XEmitter::SAR)
SARX(32, gpr.RX(rd), gpr.R(src), gpr.RX(rs));
else
_assert_msg_(false, "Unexpected shift type");
} else {
gpr.FlushLockX(ECX);
gpr.MapReg(rd, rd == rt || rd == rs, true);
MOV(32, R(ECX), gpr.R(rs)); // Only ECX can be used for variable shifts.

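The BMI2 paths work out so neatly here because SHLX/SHRX/SARX mask the count to 5 bits exactly as the MIPS variable shifts require, so no extra AND is emitted, and because they take the count in any register, sparing the jit the gpr.FlushLockX(ECX) the fallback still pays. ROR is excluded from the variable case because RORX only accepts an immediate count, which is also why the immediate case above can use it. Reference semantics for the MIPS ops being compiled (descriptive names, not PPSSPP's):

#include <cstdint>

uint32_t mips_sllv(uint32_t rt, uint32_t rs) { return rt << (rs & 31); }
uint32_t mips_srlv(uint32_t rt, uint32_t rs) { return rt >> (rs & 31); }
uint32_t mips_rotrv(uint32_t rt, uint32_t rs) {
	uint32_t sa = rs & 31;
	return sa == 0 ? rt : (rt >> sa) | (rt << (32 - sa));
}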

@ -1631,8 +1631,12 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
if (maskReg != INVALID_REG) {
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
AND(16, MatR(colorOff), R(maskReg));
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
OR(16, MatR(colorOff), R(colorReg));
} else if (fixedKeepMask == 0) {
MOV(16, MatR(colorOff), R(colorReg));
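BMI1's ANDN computes dest = ~src1 & src2 in one non-destructive instruction, so the NOT+AND pair collapses and, just as importantly, maskReg keeps its original value; that is why the fallback paths further down need a second NOT to restore it while the ANDN paths don't. In scalar terms, a sketch:

#include <cstdint>

// What ANDN(32, colorReg, maskReg, R(colorReg)) leaves in colorReg:
uint32_t MaskedColor(uint32_t mask, uint32_t color) {
	return ~mask & color;
}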
@ -1647,8 +1651,12 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
if (maskReg != INVALID_REG) {
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
AND(32, MatR(colorOff), R(maskReg));
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
OR(32, MatR(colorOff), R(colorReg));
} else if (fixedKeepMask == 0) {
MOV(32, MatR(colorOff), R(colorReg));
@ -1774,8 +1782,12 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
tableValues[GE_LOGIC_AND_REVERSE] = GetCodePointer();
// Reverse memory in a temp reg so we can apply the write mask easily.
MOV(bits, R(temp1Reg), MatR(colorOff));
if (cpu_info.bBMI1) {
ANDN(32, colorReg, temp1Reg, R(colorReg));
} else {
NOT(32, R(temp1Reg));
AND(32, R(colorReg), R(temp1Reg));
}
// Now add in the stencil bits (must be zero before, since we used AND.)
if (stencilReg != INVALID_REG) {
OR(32, R(colorReg), R(stencilReg));
@ -1825,9 +1837,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
tableValues[GE_LOGIC_NOOP] = GetCodePointer();
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
// Start by clearing masked bits from stencilReg.
if (cpu_info.bBMI1) {
ANDN(32, stencilReg, maskReg, R(stencilReg));
} else {
NOT(32, R(maskReg));
AND(32, R(stencilReg), R(maskReg));
NOT(32, R(maskReg));
}
// Now mask out the stencil bits we're writing from memory.
OR(bits, R(maskReg), notStencilMask);
@ -1862,9 +1878,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
OR(32, R(colorReg), R(stencilReg));
// Clear the bits we should be masking out.
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
NOT(32, R(maskReg));
}
// Clear all the unmasked stencil bits, so we can set our own.
OR(bits, R(maskReg), notStencilMask);
@ -1875,8 +1895,12 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
AND(bits, MatR(colorOff), notStencilMask);
} else if (maskReg != INVALID_REG) {
// Clear the bits we should be masking out.
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
} else if (id.FBFormat() == GE_FORMAT_8888) {
// We only need to do this for 8888, the others already have 0 stencil.
AND(bits, R(colorReg), notStencilMask);
@ -1954,9 +1978,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
OR(32, R(colorReg), R(stencilReg));
// Clear the bits we should be masking out.
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
NOT(32, R(maskReg));
}
// Clear all the unmasked stencil bits, so we can set our own.
OR(bits, R(maskReg), notStencilMask);
@ -2032,6 +2060,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
Describe("ConvertTo565");
if (cpu_info.bBMI2_fast) {
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
PEXT(32, colorReg, colorReg, R(temp1Reg));
return true;
}
// Assemble the 565 color, starting with R...
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(3));
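A single PEXT against 0x00F8FCF8 pulls the top 5/6/5 bits of R, G, and B (color is laid out as 0xAABBGGRR) directly into 565 order. A worked example using the pext32 sketch from earlier (the input value is hypothetical):

#include <cassert>

void Convert565Example() {
	// color = 0xFF334455: A=0xFF, B=0x33, G=0x44, R=0x55
	// R top 5: 0x55 >> 3 = 0x0A -> bits 0-4
	// G top 6: 0x44 >> 2 = 0x11 -> bits 5-10
	// B top 5: 0x33 >> 3 = 0x06 -> bits 11-15
	assert(pext32(0xFF334455, 0x00F8FCF8) == 0x322A);
}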
@ -2053,6 +2088,13 @@ bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorR
bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertTo5551");
if (cpu_info.bBMI2_fast) {
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x80F8F8F8 : 0x00F8F8F8));
PEXT(32, colorReg, colorReg, R(temp1Reg));
return true;
}
// This is R, pretty simple.
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(3));
@ -2084,6 +2126,13 @@ bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg color
bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertTo4444");
if (cpu_info.bBMI2_fast) {
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
PEXT(32, colorReg, colorReg, R(temp1Reg));
return true;
}
// Shift and mask out R.
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(4));
@ -2115,6 +2164,24 @@ bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg color
bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
Describe("ConvertFrom565");
if (cpu_info.bBMI2_fast) {
// Start off with the high bits.
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
PDEP(32, temp1Reg, colorReg, R(temp1Reg));
// Now grab the low bits (they end up packed.)
MOV(32, R(temp2Reg), Imm32(0x0000E61C));
PEXT(32, colorReg, colorReg, R(temp2Reg));
// And spread them back out.
MOV(32, R(temp2Reg), Imm32(0x00070307));
PDEP(32, colorReg, colorReg, R(temp2Reg));
// Finally put the high bits in, we're done.
OR(32, R(colorReg), R(temp1Reg));
return true;
}
// Filter out red only into temp1.
MOV(32, R(temp1Reg), R(colorReg));
AND(16, R(temp1Reg), Imm16(0x1F << 0));
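The three constants in the BMI2 path implement the standard bit-replication expansion, (v << 3) | (v >> 2) for the 5-bit channels and (v << 2) | (v >> 4) for green: the first PDEP parks each channel's bits at the top of its byte (the left shift), the PEXT collects the top 3/2/3 bits of R, G, and B, and the second PDEP drops those into each byte's low bits (the right shift). Per channel, as a sketch:

#include <cstdint>

uint8_t Expand5To8(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }
uint8_t Expand6To8(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }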
@ -2150,6 +2217,27 @@ bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colo
bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertFrom5551");
if (cpu_info.bBMI2_fast) {
// First, grab the top bits.
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
PDEP(32, colorReg, colorReg, R(temp1Reg));
// Now make the swizzle bits.
MOV(32, R(temp2Reg), R(colorReg));
SHR(32, R(temp2Reg), Imm8(5));
AND(32, R(temp2Reg), Imm32(0x00070707));
if (keepAlpha) {
// Sign extend the alpha bit to 8 bits.
SHL(32, R(colorReg), Imm8(7));
SAR(32, R(colorReg), Imm8(7));
}
OR(32, R(colorReg), R(temp2Reg));
return true;
}
// Filter out red only into temp1.
MOV(32, R(temp1Reg), R(colorReg));
AND(16, R(temp1Reg), Imm16(0x1F << 0));
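The SHL 7 / SAR 7 pair above is a sign-extension trick: the PDEP leaves the alpha bit at bit 24 with nothing but zeros above it, so shifting left to bit 31 and arithmetic-shifting back smears it across the whole top byte (0x00 or 0xFF) while the RGB bits below return to their places. As a sketch:

#include <cstdint>

uint32_t SpreadAlphaBit(uint32_t c) {  // c as produced by PDEP with 0x01F8F8F8
	return (uint32_t)((int32_t)(c << 7) >> 7);
}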
@ -2187,6 +2275,19 @@ bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg col
bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertFrom4444");
if (cpu_info.bBMI2_fast) {
// First, spread the bits out with spaces.
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
PDEP(32, colorReg, colorReg, R(temp1Reg));
// Now swizzle the low bits in.
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(4));
OR(32, R(colorReg), R(temp1Reg));
return true;
}
// Move red into position within temp1.
MOV(32, R(temp1Reg), R(colorReg));
AND(16, R(temp1Reg), Imm16(0xF << 0));


@ -184,12 +184,14 @@ NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
if (regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
// On Linux, RCX is currently bufwptr, but we'll need it for other things.
if (!cpu_info.bBMI2) {
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
MOV(64, R(R15), R(bufwReg));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
regCache_.ChangeReg(R15, RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
}
} else {
// Let's load bufwptr/texptrptr into regs. Match Linux just for consistency - RDX is free.
MOV(64, R(RDX), MDisp(RSP, stackArgPos_ + 0));
@ -200,8 +202,10 @@ NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
}
// Okay, now lock RCX as a shifting reg.
if (!cpu_info.bBMI2) {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
regCache_.ForceRetain(RegCache::GEN_SHIFTVAL);
}
bool success = true;
@ -345,7 +349,8 @@ NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
if (regCache_.Has(RegCache::GEN_SHIFTVAL))
regCache_.ForceRelease(RegCache::GEN_SHIFTVAL);
regCache_.ForceRelease(RegCache::GEN_RESULT);
if (id.hasAnyMips) {
@ -1820,7 +1825,7 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
regCache_.Release(srcOffsetReg, RegCache::GEN_TEMP1);
// Make sure we don't grab this as colorIndexReg.
if (uReg != ECX && !cpu_info.bBMI2)
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
// The colorIndex is simply the 2 bits at blockPos + (v & 3), shifted right by (u & 3) twice.
@ -1835,6 +1840,9 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
if (uReg == ECX) {
SHR(32, R(colorIndexReg), R(CL));
SHR(32, R(colorIndexReg), R(CL));
} else if (cpu_info.bBMI2) {
SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
} else {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(hasRCX);
@ -1869,43 +1877,58 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
CMP(32, R(colorIndexReg), Imm32(3));
FixupBranch finishZero = J_CC(CC_E, true);
// At this point, resultReg, colorIndexReg, and maybe R12/R13 can be used as temps.
// We'll add, then shift from 565 a bit less to "divide" by 2 for a 50/50 mix.
if (cpu_info.bBMI2_fast) {
// Expand everything out to 0BGR at 8888, but halved.
MOV(32, R(colorIndexReg), Imm32(0x007C7E7C));
PDEP(32, color1Reg, color1Reg, R(colorIndexReg));
PDEP(32, color2Reg, color2Reg, R(colorIndexReg));
// Now let's sum them together (this undoes our halving.)
LEA(32, resultReg, MRegSum(color1Reg, color2Reg));
// Time to swap into order. Luckily we can ignore alpha.
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
// We'll need more regs. Grab two more.
PUSH(R12);
PUSH(R13);
// Start with summing R, then shift into position.
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x0000F800));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x0000F800));
LEA(32, R12, MRegSum(resultReg, colorIndexReg));
// The position is 9, instead of 8, due to doubling.
SHR(32, R(R12), Imm8(9));
// For G, summing leaves it 4 right (doubling made it not need more.)
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x000007E0));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x000007E0));
LEA(32, resultReg, MRegSum(resultReg, colorIndexReg));
SHL(32, R(resultReg), Imm8(5 - 1));
// Now add G and R together.
OR(32, R(resultReg), R(R12));
// At B, we're free to modify the regs in place, finally.
AND(32, R(color1Reg), Imm32(0x0000001F));
AND(32, R(color2Reg), Imm32(0x0000001F));
LEA(32, colorIndexReg, MRegSum(color1Reg, color2Reg));
// We shift left 2 into position (not 3 due to doubling), then 16 more into the B slot.
SHL(32, R(colorIndexReg), Imm8(16 + 2));
// And combine into the result.
OR(32, R(resultReg), R(colorIndexReg));
POP(R13);
POP(R12);
}
FixupBranch finishMix50 = J(true);
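The halving constant is the heart of the 50/50 mix: 0x007C7E7C positions every channel one bit below its final 8-bit slot, so the PDEP output is the expanded color already divided by two, and a plain LEA add averages the endpoints with no carry from one channel into the next. A scalar sketch using pdep32 from earlier (the last line mirrors the BSWAP + SHR 8, which moves R back to the low byte):

#include <cstdint>

uint32_t Dxt1Mix50(uint32_t c1_565, uint32_t c2_565) {
	uint32_t half1 = pdep32(c1_565, 0x007C7E7C);  // halved 00RRGGBB (DXT keeps R high)
	uint32_t half2 = pdep32(c2_565, 0x007C7E7C);
	uint32_t sum = half1 + half2;                 // full-scale 00RRGGBB
	return ((sum & 0xFF) << 16) | (sum & 0xFF00) | ((sum >> 16) & 0xFF);  // -> 00BBGGRR
}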
// Simply load the 565 color, and convert to 0888.
@ -1917,29 +1940,34 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
if (id.TexFmt() == GE_TFMT_DXT1)
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
if (cpu_info.bBMI2_fast) {
// We're only grabbing the high bits, no swizzle here.
MOV(32, R(resultReg), Imm32(0x00F8FCF8));
PDEP(32, resultReg, colorIndexReg, R(resultReg));
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
// Start with R, shifting it into place.
MOV(32, R(resultReg), R(colorIndexReg));
AND(32, R(resultReg), Imm32(0x0000F800));
SHR(32, R(resultReg), Imm8(8));
// Then take G and shift it too.
MOV(32, R(color2Reg), R(colorIndexReg));
AND(32, R(color2Reg), Imm32(0x000007E0));
SHL(32, R(color2Reg), Imm8(5));
// And now combine with R, shifting that in the process.
OR(32, R(resultReg), R(color2Reg));
// Modify B in place and OR in.
AND(32, R(colorIndexReg), Imm32(0x0000001F));
SHL(32, R(colorIndexReg), Imm8(16 + 3));
OR(32, R(resultReg), R(colorIndexReg));
}
FixupBranch finish565 = J(true);
// Here we'll mix color1 and color2 by 2/3 (which gets the 2 depends on colorIndexReg.)
SetJumpTarget(handleMix23);
// If colorIndexReg is 2, it's color1Reg * 2 + color2Reg, but if colorIndexReg is 3, it's reversed.
// Let's swap the regs in that case.
@ -1948,43 +1976,68 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
XCHG(32, R(color2Reg), R(color1Reg));
SetJumpTarget(skipSwap23);
if (cpu_info.bBMI2_fast) {
// Gather B, G, and R and space them apart by 14 or 15 bits.
MOV(64, R(colorIndexReg), Imm64(0x00001F0003F0001FULL));
PDEP(64, color1Reg, color1Reg, R(colorIndexReg));
PDEP(64, color2Reg, color2Reg, R(colorIndexReg));
LEA(64, resultReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
// Now multiply all of them by a special constant to divide by 3.
// This constant is (1 << 13) / 3, which is importantly less than 14 or 15.
IMUL(64, resultReg, R(resultReg), Imm32(0x00000AAB));
// Now extract the BGR values to 8 bits each.
// We subtract 3 from 13 to get 8 from 5 bits, then 2 from 20 + 13, and 3 from 40 + 13.
MOV(64, R(colorIndexReg), Imm64((0xFFULL << 10) | (0xFFULL << 31) | (0xFFULL << 50)));
PEXT(64, resultReg, resultReg, R(colorIndexReg));
// Finally swap B and R.
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
// We'll need more regs. Grab two more to keep the stack aligned.
PUSH(R12);
PUSH(R13);
// Start off with R, adding together first...
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x0000F800));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x0000F800));
LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
// We'll overflow if we divide here, so shift into place already.
SHR(32, R(resultReg), Imm8(8));
// Now we divide that by 3, by actually multiplying by AAAB and shifting off.
IMUL(32, R12, R(resultReg), Imm32(0x0000AAAB));
// Now we SHR off the extra bits we added on.
SHR(32, R(R12), Imm8(17));
// Now add up G. We leave this in place and shift right more.
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x000007E0));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x000007E0));
LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
// Again, multiply and now we use AAAB, this time masking.
IMUL(32, resultReg, R(resultReg), Imm32(0x0000AAAB));
SHR(32, R(resultReg), Imm8(17 - 5));
AND(32, R(resultReg), Imm32(0x0000FF00));
// Let's combine R in already.
OR(32, R(resultReg), R(R12));
// Now for B, it starts in the lowest place so we'll need to mask.
AND(32, R(color1Reg), Imm32(0x0000001F));
AND(32, R(color2Reg), Imm32(0x0000001F));
LEA(32, colorIndexReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
// Instead of shifting left, though, we multiply by a bit more.
IMUL(32, colorIndexReg, R(colorIndexReg), Imm32(0x0002AAAB));
AND(32, R(colorIndexReg), Imm32(0x00FF0000));
OR(32, R(resultReg), R(colorIndexReg));
POP(R13);
POP(R12);
}
regCache_.Release(colorIndexReg, RegCache::GEN_TEMP0);
regCache_.Release(color1Reg, RegCache::GEN_TEMP1);
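The 2/3 mix stretches the same idea across a 64-bit register: 0x00001F0003F0001FULL spaces B, G, and R about 20 bits apart, leaving room for the weighted sum 2*c1 + c2 (at most 3*63) times the reciprocal 0xAAB = (1 << 13) / 3 rounded up, so a single IMUL divides all three channels by 3 at once. The PEXT mask then reads each quotient 3 bits (2 for green) above the usual spot, performing the 5/6-to-8 bit scaling in the same step. Per channel, a sketch:

#include <cstdint>

// 2:1 blend of two 5-bit channel values, scaled to 8 bits (no low-bit
// replication, matching the non-BMI2 fallback).
uint8_t Dxt1Mix23(uint8_t c1, uint8_t c2) {
	uint32_t sum = 2u * c1 + c2;              // 0..93
	return (uint8_t)((sum * 0x0AAB) >> 10);   // = ((sum / 3) << 3) via 2^13 / 3
}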
@ -2017,16 +2070,21 @@ bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
if (uReg != RCX && !cpu_info.bBMI2) {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
}
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOVZX(32, 16, temp1Reg, MComplex(srcReg, vReg, SCALE_2, 8));
if (cpu_info.bBMI2) {
LEA(32, uReg, MScaled(uReg, SCALE_4, 0));
SHRX(32, temp1Reg, R(temp1Reg), uReg);
} else {
// Still depending on it being GEN_SHIFTVAL or GEN_ARG_U above.
LEA(32, RCX, MScaled(uReg, SCALE_4, 0));
SHR(32, R(temp1Reg), R(CL));
}
SHL(32, R(temp1Reg), Imm8(28));
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
OR(32, R(resultReg), R(temp1Reg));
@ -2046,7 +2104,7 @@ bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
if (uReg != RCX && !cpu_info.bBMI2)
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
// Let's figure out the alphaIndex bit offset so we can read the right byte.
@ -2067,13 +2125,17 @@ bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
// Load 16 bits and mask, in case it straddles bytes.
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
if (cpu_info.bBMI2) {
SHRX(32, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8), uReg);
} else {
MOVZX(32, 16, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8));
// If not, it's in what was bufwReg.
if (uReg != RCX) {
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
MOV(32, R(RCX), R(uReg));
}
SHR(32, R(alphaIndexReg), R(CL));
}
AND(32, R(alphaIndexReg), Imm32(7));
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
@ -2190,11 +2252,17 @@ bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
break;
case 4: {
if (cpu_info.bBMI2_fast)
MOV(32, R(temp2Reg), Imm32(0x0F));
else
XOR(32, R(temp2Reg), R(temp2Reg));
SHR(32, R(uReg), Imm8(1));
FixupBranch skip = J_CC(CC_NC);
// Track whether we shifted a 1 off or not.
if (cpu_info.bBMI2_fast)
SHL(32, R(temp2Reg), Imm8(4));
else
MOV(32, R(temp2Reg), Imm32(4));
SetJumpTarget(skip);
LEA(64, temp1Reg, MRegSum(srcReg, uReg));
break;
@ -2222,7 +2290,7 @@ bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
// We can throw bufw away, now.
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
if (bitsPerTexel == 4 && !cpu_info.bBMI2) {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(hasRCX);
}
@ -2236,12 +2304,20 @@ bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
case 4: {
SHR(32, R(resultReg), Imm8(1));
if (cpu_info.bBMI2_fast) {
MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
PEXT(32, resultReg, resultReg, R(temp2Reg));
} else if (cpu_info.bBMI2) {
SHRX(32, resultReg, MRegSum(temp1Reg, resultReg), temp2Reg);
AND(32, R(resultReg), Imm8(0x0F));
} else {
MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
// RCX is now free.
MOV(8, R(RCX), R(temp2Reg));
SHR(8, R(resultReg), R(RCX));
// Zero out any bits not shifted off.
AND(32, R(resultReg), Imm8(0x0F));
}
break;
}
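With bBMI2_fast, the even/odd nibble select for 4bpp textures turns into data: temp2Reg holds 0x0F or 0xF0, and one PEXT performs both the shift and the mask that the fallback spells out with RCX. Scalar sketch (pext32 from earlier):

#include <cstdint>

uint8_t GetNibble(uint8_t texByte, bool oddTexel) {
	return (uint8_t)pext32(texByte, oddTexel ? 0xF0u : 0x0Fu);
}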
@ -3076,36 +3152,54 @@ bool SamplerJitCache::Jit_Decode5650(const SamplerID &id) {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bBMI2_fast) {
// Start off with the high bits.
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
PDEP(32, temp1Reg, resultReg, R(temp1Reg));
if (id.useTextureAlpha || id.fetch)
OR(32, R(temp1Reg), Imm32(0xFF000000));
// Now grab the low bits (they end up packed.)
MOV(32, R(temp2Reg), Imm32(0x0000E61C));
PEXT(32, resultReg, resultReg, R(temp2Reg));
// And spread them back out.
MOV(32, R(temp2Reg), Imm32(0x00070307));
PDEP(32, resultReg, resultReg, R(temp2Reg));
// Finally put the high bits in, we're done.
OR(32, R(resultReg), R(temp1Reg));
} else {
MOV(32, R(temp2Reg), R(resultReg));
AND(32, R(temp2Reg), Imm32(0x0000001F));
// B (we do R and B at the same time, they're both 5.)
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp1Reg), Imm32(0x0000F800));
SHL(32, R(temp1Reg), Imm8(5));
OR(32, R(temp2Reg), R(temp1Reg));
// Expand 5 -> 8. At this point we have 00BB00RR.
MOV(32, R(temp1Reg), R(temp2Reg));
SHL(32, R(temp2Reg), Imm8(3));
SHR(32, R(temp1Reg), Imm8(2));
OR(32, R(temp2Reg), R(temp1Reg));
AND(32, R(temp2Reg), Imm32(0x00FF00FF));
// Now's as good a time to put in A as any.
if (id.useTextureAlpha || id.fetch)
OR(32, R(temp2Reg), Imm32(0xFF000000));
// Last, we need to align, extract, and expand G.
// 3 to align to G, and then 2 to expand to 8.
SHL(32, R(resultReg), Imm8(3 + 2));
AND(32, R(resultReg), Imm32(0x0000FC00));
MOV(32, R(temp1Reg), R(resultReg));
// 2 to account for resultReg being preshifted, 4 for expansion.
SHR(32, R(temp1Reg), Imm8(2 + 4));
OR(32, R(resultReg), R(temp1Reg));
AND(32, R(resultReg), Imm32(0x0000FF00));
OR(32, R(resultReg), R(temp2Reg));
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
@ -3154,34 +3248,54 @@ bool SamplerJitCache::Jit_Decode5551(const SamplerID &id) {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bBMI2_fast) {
// First, grab the top bits.
bool keepAlpha = id.useTextureAlpha || id.fetch;
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
PDEP(32, resultReg, resultReg, R(temp1Reg));
// Now make the swizzle bits.
MOV(32, R(temp2Reg), R(resultReg));
SHR(32, R(temp2Reg), Imm8(5));
AND(32, R(temp2Reg), Imm32(0x00070707));
if (keepAlpha) {
// Sign extend the alpha bit to 8 bits.
SHL(32, R(resultReg), Imm8(7));
SAR(32, R(resultReg), Imm8(7));
}
OR(32, R(resultReg), R(temp2Reg));
} else {
MOV(32, R(temp2Reg), R(resultReg));
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp2Reg), Imm32(0x0000001F));
AND(32, R(temp1Reg), Imm32(0x000003E0));
SHL(32, R(temp1Reg), Imm8(3));
OR(32, R(temp2Reg), R(temp1Reg));
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp1Reg), Imm32(0x00007C00));
SHL(32, R(temp1Reg), Imm8(6));
OR(32, R(temp2Reg), R(temp1Reg));
// Expand 5 -> 8. After this is just A.
MOV(32, R(temp1Reg), R(temp2Reg));
SHL(32, R(temp2Reg), Imm8(3));
SHR(32, R(temp1Reg), Imm8(2));
// Chop off the bits that were shifted out.
AND(32, R(temp1Reg), Imm32(0x00070707));
OR(32, R(temp2Reg), R(temp1Reg));
if (id.useTextureAlpha || id.fetch) {
// For A, we sign extend to get either 16 1s or 0s of alpha.
SAR(16, R(resultReg), Imm8(15));
// Now, shift left by 24 to get the lowest 8 of those at the top.
SHL(32, R(resultReg), Imm8(24));
OR(32, R(resultReg), R(temp2Reg));
} else {
MOV(32, R(resultReg), R(temp2Reg));
}
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
@ -3235,31 +3349,46 @@ alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00f
bool SamplerJitCache::Jit_Decode4444(const SamplerID &id) {
Describe("4444");
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
if (cpu_info.bBMI2_fast) {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
// First, spread the bits out with spaces.
MOV(32, R(temp1Reg), Imm32(0xF0F0F0F0));
PDEP(32, resultReg, resultReg, R(temp1Reg));
// Now swizzle the low bits in.
MOV(32, R(temp1Reg), R(resultReg));
SHR(32, R(temp1Reg), Imm8(4));
OR(32, R(resultReg), R(temp1Reg));
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
} else {
X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);
MOVD_xmm(vecTemp1Reg, R(resultReg));
PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));
if (RipAccessible(color4444mask)) {
PAND(vecTemp1Reg, M(color4444mask));
} else {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));
PAND(vecTemp1Reg, MatR(temp1Reg));
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
}
MOVSS(vecTemp2Reg, R(vecTemp1Reg));
MOVSS(vecTemp3Reg, R(vecTemp1Reg));
PSRLW(vecTemp2Reg, 4);
PSLLW(vecTemp3Reg, 4);
POR(vecTemp1Reg, R(vecTemp2Reg));
POR(vecTemp1Reg, R(vecTemp3Reg));
MOVD_xmm(R(resultReg), vecTemp1Reg);
regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);
regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);
regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);
}
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return true;
}
@ -3277,8 +3406,10 @@ bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerInd
return true;
}
if (!cpu_info.bBMI2) {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_msg_(hasRCX, "Could not obtain RCX, locked?");
}
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg idReg = GetSamplerID();
@ -3286,23 +3417,28 @@ bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerInd
UnlockSamplerID(idReg);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
int shiftedToSoFar = 0;
// Shift = (clutformat >> 2) & 0x1F
if (id.hasClutShift) {
SHR(32, R(temp1Reg), Imm8(2 - shiftedToSoFar));
shiftedToSoFar = 2;
if (cpu_info.bBMI2) {
SHRX(32, resultReg, R(resultReg), temp1Reg);
} else {
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
MOV(32, R(RCX), R(temp1Reg));
SHR(32, R(resultReg), R(RCX));
}
}
// Mask = (clutformat >> 8) & 0xFF
if (id.hasClutMask) {
SHR(32, R(temp1Reg), Imm8(8 - shiftedToSoFar));
shiftedToSoFar = 8;
AND(32, R(resultReg), R(temp1Reg));
}
// We need to wrap any entries beyond the first 1024 bytes.
@ -3316,7 +3452,7 @@ bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerInd
// Offset = (clutformat >> 12) & 0x01F0
if (id.hasClutOffset) {
SHR(32, R(temp1Reg), Imm8(16 - shiftedToSoFar));
SHL(32, R(temp1Reg), Imm8(4));
OR(32, R(resultReg), R(temp1Reg));
AND(32, R(resultReg), Imm32(offsetMask));
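The new shiftedToSoFar bookkeeping lets the CLUT format word stay in temp1Reg and be shifted right incrementally (by 2, then 6 more, then 8 more) instead of being reloaded for each field. The fields it walks through, as a sketch:

#include <cstdint>

struct ClutFields {
	uint32_t shift;   // (fmt >> 2) & 0x1F
	uint32_t mask;    // (fmt >> 8) & 0xFF
	uint32_t offset;  // (fmt >> 12) & 0x01F0
};

ClutFields DecodeClutFormat(uint32_t fmt) {
	return { (fmt >> 2) & 0x1F, (fmt >> 8) & 0xFF, (fmt >> 12) & 0x01F0 };
}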