Merge pull request #15372 from unknownbrackets/bmi2

Optimize jits with a bit of BMI2
Henrik Rydgård 2022-02-01 09:43:35 +01:00 committed by GitHub
commit f58d4dfcfe
6 changed files with 484 additions and 221 deletions


@ -255,6 +255,8 @@ void CPUInfo::Detect() {
if ((cpu_id[1] >> 11) & 1)
bRTM = true;
}
bBMI2_fast = bBMI2 && (vendor != VENDOR_AMD || family >= 0x19);
}
if (max_ex_fn >= 0x80000004) {
// Extract brand string

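For reference, PDEP and PEXT are the BMI2 bit scatter/gather instructions this new flag gates: they execute in a single uop on Intel and on AMD from Zen 3 (family 0x19) up, but are microcoded and far slower on earlier AMD CPUs, which is exactly what the vendor/family check encodes. A portable sketch of their semantics (reference code, not part of the commit):

#include <cstdint>

// PEXT: gather the bits of val selected by mask into the low bits of the result.
uint32_t pext32(uint32_t val, uint32_t mask) {
	uint32_t out = 0;
	for (uint32_t bit = 1; mask != 0; mask &= mask - 1, bit <<= 1) {
		if (val & (mask & -mask))  // test the lowest remaining mask bit
			out |= bit;
	}
	return out;
}

// PDEP: the inverse, scatter the low bits of val to the positions set in mask.
uint32_t pdep32(uint32_t val, uint32_t mask) {
	uint32_t out = 0;
	for (uint32_t bit = 1; mask != 0; mask &= mask - 1, bit <<= 1) {
		if (val & bit)
			out |= mask & -mask;  // set the lowest remaining mask bit
	}
	return out;
}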

@ -54,6 +54,7 @@ struct CPUInfo {
bool bLZCNT;
bool bBMI1;
bool bBMI2;
bool bBMI2_fast;
bool bXOP;
bool bRTM;


@ -1458,6 +1458,7 @@ void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg
{
CheckFlags();
_assert_msg_(cpu_info.bBMI1, "Trying to use BMI1 on a system that doesn't support it.");
_assert_msg_(!arg.IsImm(), "Imm arg unsupported for this BMI1 instruction");
WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}
@ -1465,6 +1466,7 @@ void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg
{
CheckFlags();
_assert_msg_(cpu_info.bBMI2, "Trying to use BMI2 on a system that doesn't support it.");
_assert_msg_(!arg.IsImm(), "Imm arg unsupported for this BMI2 instruction");
WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
}

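The BMI shifts these helpers emit differ from the legacy SHL/SHR/SAR in two ways the jits below exploit: the shift count can live in any general register rather than being pinned to CL, and EFLAGS are left untouched. Their semantics, as a reference sketch (the hardware masks the count to the operand width, just like the legacy shifts):

#include <cstdint>

uint32_t shlx32(uint32_t src, uint32_t count) { return src << (count & 31); }
uint32_t shrx32(uint32_t src, uint32_t count) { return src >> (count & 31); }
int32_t sarx32(int32_t src, uint32_t count) { return src >> (count & 31); }  // arithmetic shift

// RORX is the odd one out: its rotate count is an immediate only.
uint32_t rorx32(uint32_t src, uint32_t imm) {
	imm &= 31;
	return imm == 0 ? src : (src >> imm) | (src << (32 - imm));
}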

@ -22,6 +22,7 @@
#include "Common/BitSet.h"
#include "Common/CommonTypes.h"
#include "Common/CPUDetect.h"
#include "Core/MIPS/MIPSCodeUtils.h"
#include "Core/MIPS/x86/Jit.h"
#include "Core/MIPS/x86/RegCache.h"
@ -647,16 +648,36 @@ namespace MIPSComp
}
gpr.Lock(rd, rt, rs);
if (gpr.IsImm(rs)) {
int sa = gpr.GetImm(rs);
gpr.MapReg(rd, rd == rt, true);
if (cpu_info.bBMI2 && shift == &XEmitter::ROR) {
_assert_(!gpr.IsImm(rt));
RORX(32, gpr.RX(rd), gpr.R(rt), sa & 0x1F);
} else {
if (rd != rt)
MOV(32, gpr.R(rd), gpr.R(rt));
(this->*shift)(32, gpr.R(rd), Imm8(sa & 0x1F));
}
} else if (cpu_info.bBMI2 && shift != &XEmitter::ROR) {
gpr.MapReg(rd, rd == rt || rd == rs, true);
gpr.MapReg(rs, true, false);
MIPSGPReg src = rt;
if (gpr.IsImm(rt) && rd == rs) {
gpr.MapReg(rt, true, false);
} else if (gpr.IsImm(rt)) {
MOV(32, gpr.R(rd), gpr.R(rt));
src = rd;
}
if (shift == &XEmitter::SHL)
SHLX(32, gpr.RX(rd), gpr.R(src), gpr.RX(rs));
else if (shift == &XEmitter::SHR)
SHRX(32, gpr.RX(rd), gpr.R(src), gpr.RX(rs));
else if (shift == &XEmitter::SAR)
SARX(32, gpr.RX(rd), gpr.R(src), gpr.RX(rs));
else
_assert_msg_(false, "Unexpected shift type");
} else {
gpr.FlushLockX(ECX);
gpr.MapReg(rd, rd == rt || rd == rs, true);
MOV(32, R(ECX), gpr.R(rs)); // Only ECX can be used for variable shifts.

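The BMI2 paths work out so neatly here because SHLX/SHRX/SARX mask the count to 5 bits exactly as the MIPS variable shifts require, so no extra AND is emitted, and because they take the count in any register, sparing the jit the gpr.FlushLockX(ECX) the fallback still pays. ROR is excluded from the variable case because RORX only accepts an immediate count, which is also why the immediate case above can use it. Reference semantics for the MIPS ops being compiled (descriptive names, not PPSSPP's):

#include <cstdint>

uint32_t mips_sllv(uint32_t rt, uint32_t rs) { return rt << (rs & 31); }
uint32_t mips_srlv(uint32_t rt, uint32_t rs) { return rt >> (rs & 31); }
uint32_t mips_rotrv(uint32_t rt, uint32_t rs) {
	uint32_t sa = rs & 31;
	return sa == 0 ? rt : (rt >> sa) | (rt << (32 - sa));
}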

@ -1631,8 +1631,12 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
if (maskReg != INVALID_REG) {
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
AND(16, MatR(colorOff), R(maskReg));
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
OR(16, MatR(colorOff), R(colorReg));
} else if (fixedKeepMask == 0) {
MOV(16, MatR(colorOff), R(colorReg));
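BMI1's ANDN computes dest = ~src1 & src2 in one non-destructive instruction, so the NOT+AND pair collapses and, just as importantly, maskReg keeps its original value; that is why the fallback paths further down need a second NOT to restore it while the ANDN paths don't. In scalar terms, a sketch:

#include <cstdint>

// What ANDN(32, colorReg, maskReg, R(colorReg)) leaves in colorReg:
uint32_t MaskedColor(uint32_t mask, uint32_t color) {
	return ~mask & color;
}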
@ -1647,8 +1651,12 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
if (maskReg != INVALID_REG) {
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
AND(32, MatR(colorOff), R(maskReg));
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
OR(32, MatR(colorOff), R(colorReg));
} else if (fixedKeepMask == 0) {
MOV(32, MatR(colorOff), R(colorReg));
@ -1774,8 +1782,12 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
tableValues[GE_LOGIC_AND_REVERSE] = GetCodePointer();
// Reverse memory in a temp reg so we can apply the write mask easily.
MOV(bits, R(temp1Reg), MatR(colorOff));
if (cpu_info.bBMI1) {
ANDN(32, colorReg, temp1Reg, R(colorReg));
} else {
NOT(32, R(temp1Reg));
AND(32, R(colorReg), R(temp1Reg));
}
// Now add in the stencil bits (must be zero before, since we used AND.)
if (stencilReg != INVALID_REG) {
OR(32, R(colorReg), R(stencilReg));
@ -1825,9 +1837,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
tableValues[GE_LOGIC_NOOP] = GetCodePointer();
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
// Start by clearing masked bits from stencilReg.
if (cpu_info.bBMI1) {
ANDN(32, stencilReg, maskReg, R(stencilReg));
} else {
NOT(32, R(maskReg));
AND(32, R(stencilReg), R(maskReg));
NOT(32, R(maskReg));
}
// Now mask out the stencil bits we're writing from memory.
OR(bits, R(maskReg), notStencilMask);
@ -1862,9 +1878,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
OR(32, R(colorReg), R(stencilReg));
// Clear the bits we should be masking out.
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
NOT(32, R(maskReg));
}
// Clear all the unmasked stencil bits, so we can set our own.
OR(bits, R(maskReg), notStencilMask);
@ -1875,8 +1895,12 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
AND(bits, MatR(colorOff), notStencilMask);
} else if (maskReg != INVALID_REG) {
// Clear the bits we should be masking out.
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
} else if (id.FBFormat() == GE_FORMAT_8888) {
// We only need to do this for 8888, the others already have 0 stencil.
AND(bits, R(colorReg), notStencilMask);
@ -1954,9 +1978,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
OR(32, R(colorReg), R(stencilReg));
// Clear the bits we should be masking out.
if (cpu_info.bBMI1) {
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
NOT(32, R(maskReg));
}
// Clear all the unmasked stencil bits, so we can set our own.
OR(bits, R(maskReg), notStencilMask);
@ -2032,6 +2060,13 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorR
bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
Describe("ConvertTo565");
if (cpu_info.bBMI2_fast) {
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
PEXT(32, colorReg, colorReg, R(temp1Reg));
return true;
}
// Assemble the 565 color, starting with R...
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(3));
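A single PEXT against 0x00F8FCF8 pulls the top 5/6/5 bits of R, G, and B (color is laid out as 0xAABBGGRR) directly into 565 order. A worked example using the pext32 sketch from earlier (the input value is hypothetical):

#include <cassert>

void Convert565Example() {
	// color = 0xFF334455: A=0xFF, B=0x33, G=0x44, R=0x55
	// R top 5: 0x55 >> 3 = 0x0A -> bits 0-4
	// G top 6: 0x44 >> 2 = 0x11 -> bits 5-10
	// B top 5: 0x33 >> 3 = 0x06 -> bits 11-15
	assert(pext32(0xFF334455, 0x00F8FCF8) == 0x322A);
}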
@ -2053,6 +2088,13 @@ bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorR
bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertTo5551");
if (cpu_info.bBMI2_fast) {
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x80F8F8F8 : 0x00F8F8F8));
PEXT(32, colorReg, colorReg, R(temp1Reg));
return true;
}
// This is R, pretty simple.
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(3));
@ -2084,6 +2126,13 @@ bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg color
bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertTo4444");
if (cpu_info.bBMI2_fast) {
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
PEXT(32, colorReg, colorReg, R(temp1Reg));
return true;
}
// Shift and mask out R.
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(4));
@ -2115,6 +2164,24 @@ bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg color
bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
Describe("ConvertFrom565");
if (cpu_info.bBMI2_fast) {
// Start off with the high bits.
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
PDEP(32, temp1Reg, colorReg, R(temp1Reg));
// Now grab the low bits (they end up packed.)
MOV(32, R(temp2Reg), Imm32(0x0000E61C));
PEXT(32, colorReg, colorReg, R(temp2Reg));
// And spread them back out.
MOV(32, R(temp2Reg), Imm32(0x00070307));
PDEP(32, colorReg, colorReg, R(temp2Reg));
// Finally put the high bits in, we're done.
OR(32, R(colorReg), R(temp1Reg));
return true;
}
// Filter out red only into temp1.
MOV(32, R(temp1Reg), R(colorReg));
AND(16, R(temp1Reg), Imm16(0x1F << 0));
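The three constants in the BMI2 path implement the standard bit-replication expansion, (v << 3) | (v >> 2) for the 5-bit channels and (v << 2) | (v >> 4) for green: the first PDEP parks each channel's bits at the top of its byte (the left shift), the PEXT collects the top 3/2/3 bits of R, G, and B, and the second PDEP drops those into each byte's low bits (the right shift). Per channel, as a sketch:

#include <cstdint>

uint8_t Expand5To8(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }
uint8_t Expand6To8(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }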
@ -2150,6 +2217,27 @@ bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colo
bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertFrom5551");
if (cpu_info.bBMI2_fast) {
// First, grab the top bits.
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
PDEP(32, colorReg, colorReg, R(temp1Reg));
// Now make the swizzle bits.
MOV(32, R(temp2Reg), R(colorReg));
SHR(32, R(temp2Reg), Imm8(5));
AND(32, R(temp2Reg), Imm32(0x00070707));
if (keepAlpha) {
// Sign extend the alpha bit to 8 bits.
SHL(32, R(colorReg), Imm8(7));
SAR(32, R(colorReg), Imm8(7));
}
OR(32, R(colorReg), R(temp2Reg));
return true;
}
// Filter out red only into temp1.
MOV(32, R(temp1Reg), R(colorReg));
AND(16, R(temp1Reg), Imm16(0x1F << 0));
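The SHL 7 / SAR 7 pair above is a sign-extension trick: the PDEP leaves the alpha bit at bit 24 with nothing but zeros above it, so shifting left to bit 31 and arithmetic-shifting back smears it across the whole top byte (0x00 or 0xFF) while the RGB bits below return to their places. As a sketch:

#include <cstdint>

uint32_t SpreadAlphaBit(uint32_t c) {  // c as produced by PDEP with 0x01F8F8F8
	return (uint32_t)((int32_t)(c << 7) >> 7);
}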
@ -2187,6 +2275,19 @@ bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg col
bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
Describe("ConvertFrom4444");
if (cpu_info.bBMI2_fast) {
// First, spread the bits out with spaces.
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
PDEP(32, colorReg, colorReg, R(temp1Reg));
// Now swizzle the low bits in.
MOV(32, R(temp1Reg), R(colorReg));
SHR(32, R(temp1Reg), Imm8(4));
OR(32, R(colorReg), R(temp1Reg));
return true;
}
// Move red into position within temp1.
MOV(32, R(temp1Reg), R(colorReg));
AND(16, R(temp1Reg), Imm16(0xF << 0));


@ -184,12 +184,14 @@ NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
if (regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
// On Linux, RCX is currently bufwptr, but we'll need it for other things.
if (!cpu_info.bBMI2) {
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
MOV(64, R(R15), R(bufwReg));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
regCache_.ChangeReg(R15, RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
}
} else {
// Let's load bufwptr/texptrptr into regs. Match Linux just for consistency - RDX is free.
MOV(64, R(RDX), MDisp(RSP, stackArgPos_ + 0));
@ -200,8 +202,10 @@ NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
}
// Okay, now lock RCX as a shifting reg.
if (!cpu_info.bBMI2) {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
regCache_.ForceRetain(RegCache::GEN_SHIFTVAL);
}
bool success = true;
@ -345,7 +349,8 @@ NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
if (regCache_.Has(RegCache::GEN_SHIFTVAL))
regCache_.ForceRelease(RegCache::GEN_SHIFTVAL);
regCache_.ForceRelease(RegCache::GEN_RESULT);
if (id.hasAnyMips) {
@ -1820,7 +1825,7 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
regCache_.Release(srcOffsetReg, RegCache::GEN_TEMP1);
// Make sure we don't grab this as colorIndexReg.
if (uReg != ECX && !cpu_info.bBMI2)
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
// The colorIndex is simply the 2 bits at blockPos + (v & 3), shifted right by (u & 3) twice.
@ -1835,6 +1840,9 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
if (uReg == ECX) {
SHR(32, R(colorIndexReg), R(CL));
SHR(32, R(colorIndexReg), R(CL));
} else if (cpu_info.bBMI2) {
SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
} else {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(hasRCX);
@ -1869,43 +1877,58 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
CMP(32, R(colorIndexReg), Imm32(3));
FixupBranch finishZero = J_CC(CC_E, true);
// At this point, resultReg, colorIndexReg, and maybe R12/R13 can be used as temps.
// We'll add, then shift from 565 a bit less to "divide" by 2 for a 50/50 mix.
if (cpu_info.bBMI2_fast) {
// Expand everything out to 0BGR at 8888, but halved.
MOV(32, R(colorIndexReg), Imm32(0x007C7E7C));
PDEP(32, color1Reg, color1Reg, R(colorIndexReg));
PDEP(32, color2Reg, color2Reg, R(colorIndexReg));
// Now let's sum them together (this undoes our halving.)
LEA(32, resultReg, MRegSum(color1Reg, color2Reg));
// Time to swap into order. Luckily we can ignore alpha.
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
// We'll need more regs. Grab two more.
PUSH(R12);
PUSH(R13);
// Start with summing R, then shift into position.
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x0000F800));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x0000F800));
LEA(32, R12, MRegSum(resultReg, colorIndexReg));
// The position is 9, instead of 8, due to doubling.
SHR(32, R(R12), Imm8(9));
// For G, summing leaves it 4 right (doubling made it not need more.)
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x000007E0));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x000007E0));
LEA(32, resultReg, MRegSum(resultReg, colorIndexReg));
SHL(32, R(resultReg), Imm8(5 - 1));
// Now add G and R together.
OR(32, R(resultReg), R(R12));
// At B, we're free to modify the regs in place, finally.
AND(32, R(color1Reg), Imm32(0x0000001F));
AND(32, R(color2Reg), Imm32(0x0000001F));
LEA(32, colorIndexReg, MRegSum(color1Reg, color2Reg));
// We shift left 2 into position (not 3 due to doubling), then 16 more into the B slot.
SHL(32, R(colorIndexReg), Imm8(16 + 2));
// And combine into the result.
OR(32, R(resultReg), R(colorIndexReg));
POP(R13);
POP(R12);
}
FixupBranch finishMix50 = J(true);
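The halving constant is the heart of the 50/50 mix: 0x007C7E7C positions every channel one bit below its final 8-bit slot, so the PDEP output is the expanded color already divided by two, and a plain LEA add averages the endpoints with no carry from one channel into the next. A scalar sketch using pdep32 from earlier (the last line mirrors the BSWAP + SHR 8, which moves R back to the low byte):

#include <cstdint>

uint32_t Dxt1Mix50(uint32_t c1_565, uint32_t c2_565) {
	uint32_t half1 = pdep32(c1_565, 0x007C7E7C);  // halved 00RRGGBB (DXT keeps R high)
	uint32_t half2 = pdep32(c2_565, 0x007C7E7C);
	uint32_t sum = half1 + half2;                 // full-scale 00RRGGBB
	return ((sum & 0xFF) << 16) | (sum & 0xFF00) | ((sum >> 16) & 0xFF);  // -> 00BBGGRR
}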
// Simply load the 565 color, and convert to 0888.
@ -1917,29 +1940,34 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
if (id.TexFmt() == GE_TFMT_DXT1)
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
if (cpu_info.bBMI2_fast) {
// We're only grabbing the high bits, no swizzle here.
MOV(32, R(resultReg), Imm32(0x00F8FCF8));
PDEP(32, resultReg, colorIndexReg, R(resultReg));
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
// Start with R, shifting it into place.
MOV(32, R(resultReg), R(colorIndexReg));
AND(32, R(resultReg), Imm32(0x0000F800));
SHR(32, R(resultReg), Imm8(8));
// Then take G and shift it too.
MOV(32, R(color2Reg), R(colorIndexReg));
AND(32, R(color2Reg), Imm32(0x000007E0));
SHL(32, R(color2Reg), Imm8(5));
// And now combine with R, shifting that in the process.
OR(32, R(resultReg), R(color2Reg));
// Modify B in place and OR in.
AND(32, R(colorIndexReg), Imm32(0x0000001F));
SHL(32, R(colorIndexReg), Imm8(16 + 3));
OR(32, R(resultReg), R(colorIndexReg));
}
FixupBranch finish565 = J(true);
// Here we'll mix color1 and color2 by 2/3 (which gets the 2 depends on colorIndexReg.)
SetJumpTarget(handleMix23);
// If colorIndexReg is 2, it's color1Reg * 2 + color2Reg, but if colorIndexReg is 3, it's reversed.
// Let's swap the regs in that case.
@ -1948,43 +1976,68 @@ bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int a
XCHG(32, R(color2Reg), R(color1Reg));
SetJumpTarget(skipSwap23);
if (cpu_info.bBMI2_fast) {
// Gather B, G, and R and space them apart by 14 or 15 bits.
MOV(64, R(colorIndexReg), Imm64(0x00001F0003F0001FULL));
PDEP(64, color1Reg, color1Reg, R(colorIndexReg));
PDEP(64, color2Reg, color2Reg, R(colorIndexReg));
LEA(64, resultReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
// Now multiply all of them by a special constant to divide by 3.
// This constant is (1 << 13) / 3, which is importantly less than 14 or 15.
IMUL(64, resultReg, R(resultReg), Imm32(0x00000AAB));
// Now extract the BGR values to 8 bits each.
// We subtract 3 from 13 to get 8 from 5 bits, then 2 from 20 + 13, and 3 from 40 + 13.
MOV(64, R(colorIndexReg), Imm64((0xFFULL << 10) | (0xFFULL << 31) | (0xFFULL << 50)));
PEXT(64, resultReg, resultReg, R(colorIndexReg));
// Finally swap B and R.
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
// We'll need more regs. Grab two more to keep the stack aligned.
PUSH(R12);
PUSH(R13);
// Start off with R, adding together first...
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x0000F800));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x0000F800));
LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
// We'll overflow if we divide here, so shift into place already.
SHR(32, R(resultReg), Imm8(8));
// Now we divide that by 3, by actually multiplying by AAAB and shifting off.
IMUL(32, R12, R(resultReg), Imm32(0x0000AAAB));
// Now we SHR off the extra bits we added on.
SHR(32, R(R12), Imm8(17));
// Now add up G. We leave this in place and shift right more.
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x000007E0));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x000007E0));
LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
// Again, multiply and now we use AAAB, this time masking.
IMUL(32, resultReg, R(resultReg), Imm32(0x0000AAAB));
SHR(32, R(resultReg), Imm8(17 - 5));
AND(32, R(resultReg), Imm32(0x0000FF00));
// Let's combine R in already.
OR(32, R(resultReg), R(R12));
// Now for B, it starts in the lowest place so we'll need to mask.
AND(32, R(color1Reg), Imm32(0x0000001F));
AND(32, R(color2Reg), Imm32(0x0000001F));
LEA(32, colorIndexReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
// Instead of shifting left, though, we multiply by a bit more.
IMUL(32, colorIndexReg, R(colorIndexReg), Imm32(0x0002AAAB));
AND(32, R(colorIndexReg), Imm32(0x00FF0000));
OR(32, R(resultReg), R(colorIndexReg));
POP(R13);
POP(R12);
}
regCache_.Release(colorIndexReg, RegCache::GEN_TEMP0);
regCache_.Release(color1Reg, RegCache::GEN_TEMP1);
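The 2/3 mix stretches the same idea across a 64-bit register: 0x00001F0003F0001FULL spaces B, G, and R about 20 bits apart, leaving room for the weighted sum 2*c1 + c2 (at most 3*63) times the reciprocal 0xAAB = (1 << 13) / 3 rounded up, so a single IMUL divides all three channels by 3 at once. The PEXT mask then reads each quotient 3 bits (2 for green) above the usual spot, performing the 5/6-to-8 bit scaling in the same step. Per channel, a sketch:

#include <cstdint>

// 2:1 blend of two 5-bit channel values, scaled to 8 bits (no low-bit
// replication, matching the non-BMI2 fallback).
uint8_t Dxt1Mix23(uint8_t c1, uint8_t c2) {
	uint32_t sum = 2u * c1 + c2;              // 0..93
	return (uint8_t)((sum * 0x0AAB) >> 10);   // = ((sum / 3) << 3) via 2^13 / 3
}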
@ -2017,16 +2070,21 @@ bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
if (uReg != RCX && !cpu_info.bBMI2) {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
}
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOVZX(32, 16, temp1Reg, MComplex(srcReg, vReg, SCALE_2, 8));
if (cpu_info.bBMI2) {
LEA(32, uReg, MScaled(uReg, SCALE_4, 0));
SHRX(32, temp1Reg, R(temp1Reg), uReg);
} else {
// Still depending on it being GEN_SHIFTVAL or GEN_ARG_U above.
LEA(32, RCX, MScaled(uReg, SCALE_4, 0));
SHR(32, R(temp1Reg), R(CL));
}
SHL(32, R(temp1Reg), Imm8(28));
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
OR(32, R(resultReg), R(temp1Reg));
@ -2046,7 +2104,7 @@ bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
if (uReg != RCX && !cpu_info.bBMI2)
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
// Let's figure out the alphaIndex bit offset so we can read the right byte.
@ -2067,13 +2125,17 @@ bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
// Load 16 bits and mask, in case it straddles bytes.
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
if (cpu_info.bBMI2) {
SHRX(32, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8), uReg);
} else {
MOVZX(32, 16, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8));
// If not, it's in what was bufwReg.
if (uReg != RCX) {
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
MOV(32, R(RCX), R(uReg));
}
SHR(32, R(alphaIndexReg), R(CL));
}
AND(32, R(alphaIndexReg), Imm32(7));
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
@ -2190,11 +2252,17 @@ bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
break;
case 4: {
if (cpu_info.bBMI2_fast)
MOV(32, R(temp2Reg), Imm32(0x0F));
else
XOR(32, R(temp2Reg), R(temp2Reg));
SHR(32, R(uReg), Imm8(1));
FixupBranch skip = J_CC(CC_NC);
// Track whether we shifted a 1 off or not.
if (cpu_info.bBMI2_fast)
SHL(32, R(temp2Reg), Imm8(4));
else
MOV(32, R(temp2Reg), Imm32(4));
SetJumpTarget(skip);
LEA(64, temp1Reg, MRegSum(srcReg, uReg));
break;
@ -2222,7 +2290,7 @@ bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
// We can throw bufw away, now.
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
if (bitsPerTexel == 4 && !cpu_info.bBMI2) {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(hasRCX);
}
@ -2236,12 +2304,20 @@ bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
case 4: {
SHR(32, R(resultReg), Imm8(1));
if (cpu_info.bBMI2_fast) {
MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
PEXT(32, resultReg, resultReg, R(temp2Reg));
} else if (cpu_info.bBMI2) {
SHRX(32, resultReg, MRegSum(temp1Reg, resultReg), temp2Reg);
AND(32, R(resultReg), Imm8(0x0F));
} else {
MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
// RCX is now free.
MOV(8, R(RCX), R(temp2Reg));
SHR(8, R(resultReg), R(RCX));
// Zero out any bits not shifted off.
AND(32, R(resultReg), Imm8(0x0F));
}
break;
}
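With bBMI2_fast, the even/odd nibble select for 4bpp textures turns into data: temp2Reg holds 0x0F or 0xF0, and one PEXT performs both the shift and the mask that the fallback spells out with RCX. Scalar sketch (pext32 from earlier):

#include <cstdint>

uint8_t GetNibble(uint8_t texByte, bool oddTexel) {
	return (uint8_t)pext32(texByte, oddTexel ? 0xF0u : 0x0Fu);
}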
@ -3076,36 +3152,54 @@ bool SamplerJitCache::Jit_Decode5650(const SamplerID &id) {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bBMI2_fast) {
// Start off with the high bits.
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
PDEP(32, temp1Reg, resultReg, R(temp1Reg));
if (id.useTextureAlpha || id.fetch)
OR(32, R(temp1Reg), Imm32(0xFF000000));
// Now grab the low bits (they end up packed.)
MOV(32, R(temp2Reg), Imm32(0x0000E61C));
PEXT(32, resultReg, resultReg, R(temp2Reg));
// And spread them back out.
MOV(32, R(temp2Reg), Imm32(0x00070307));
PDEP(32, resultReg, resultReg, R(temp2Reg));
// Finally put the high bits in, we're done.
OR(32, R(resultReg), R(temp1Reg));
} else {
MOV(32, R(temp2Reg), R(resultReg));
AND(32, R(temp2Reg), Imm32(0x0000001F));
// B (we do R and B at the same time, they're both 5.)
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp1Reg), Imm32(0x0000F800));
SHL(32, R(temp1Reg), Imm8(5));
OR(32, R(temp2Reg), R(temp1Reg));
// Expand 5 -> 8. At this point we have 00BB00RR.
MOV(32, R(temp1Reg), R(temp2Reg));
SHL(32, R(temp2Reg), Imm8(3));
SHR(32, R(temp1Reg), Imm8(2));
OR(32, R(temp2Reg), R(temp1Reg));
AND(32, R(temp2Reg), Imm32(0x00FF00FF));
// Now's as good a time to put in A as any.
if (id.useTextureAlpha || id.fetch)
OR(32, R(temp2Reg), Imm32(0xFF000000));
// Last, we need to align, extract, and expand G.
// 3 to align to G, and then 2 to expand to 8.
SHL(32, R(resultReg), Imm8(3 + 2));
AND(32, R(resultReg), Imm32(0x0000FC00));
MOV(32, R(temp1Reg), R(resultReg));
// 2 to account for resultReg being preshifted, 4 for expansion.
SHR(32, R(temp1Reg), Imm8(2 + 4));
OR(32, R(resultReg), R(temp1Reg));
AND(32, R(resultReg), Imm32(0x0000FF00));
OR(32, R(resultReg), R(temp2Reg));
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
@ -3154,34 +3248,54 @@ bool SamplerJitCache::Jit_Decode5551(const SamplerID &id) {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bBMI2_fast) {
// First, grab the top bits.
bool keepAlpha = id.useTextureAlpha || id.fetch;
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
PDEP(32, resultReg, resultReg, R(temp1Reg));
// Now make the swizzle bits.
MOV(32, R(temp2Reg), R(resultReg));
SHR(32, R(temp2Reg), Imm8(5));
AND(32, R(temp2Reg), Imm32(0x00070707));
if (keepAlpha) {
// Sign extend the alpha bit to 8 bits.
SHL(32, R(resultReg), Imm8(7));
SAR(32, R(resultReg), Imm8(7));
}
OR(32, R(resultReg), R(temp2Reg));
} else {
MOV(32, R(temp2Reg), R(resultReg));
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp2Reg), Imm32(0x0000001F));
AND(32, R(temp1Reg), Imm32(0x000003E0));
SHL(32, R(temp1Reg), Imm8(3));
OR(32, R(temp2Reg), R(temp1Reg));
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp1Reg), Imm32(0x00007C00));
SHL(32, R(temp1Reg), Imm8(6));
OR(32, R(temp2Reg), R(temp1Reg));
// Expand 5 -> 8. After this is just A.
MOV(32, R(temp1Reg), R(temp2Reg));
SHL(32, R(temp2Reg), Imm8(3));
SHR(32, R(temp1Reg), Imm8(2));
// Chop off the bits that were shifted out.
AND(32, R(temp1Reg), Imm32(0x00070707));
OR(32, R(temp2Reg), R(temp1Reg));
if (id.useTextureAlpha || id.fetch) {
// For A, we sign extend to get either 16 1s or 0s of alpha.
SAR(16, R(resultReg), Imm8(15));
// Now, shift left by 24 to get the lowest 8 of those at the top.
SHL(32, R(resultReg), Imm8(24));
OR(32, R(resultReg), R(temp2Reg));
} else {
MOV(32, R(resultReg), R(temp2Reg));
}
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
@ -3235,31 +3349,46 @@ alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00f
bool SamplerJitCache::Jit_Decode4444(const SamplerID &id) {
Describe("4444");
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
if (cpu_info.bBMI2_fast) {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
// First, spread the bits out with spaces.
MOV(32, R(temp1Reg), Imm32(0xF0F0F0F0));
PDEP(32, resultReg, resultReg, R(temp1Reg));
// Now swizzle the low bits in.
MOV(32, R(temp1Reg), R(resultReg));
SHR(32, R(temp1Reg), Imm8(4));
OR(32, R(resultReg), R(temp1Reg));
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
} else {
X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);
MOVD_xmm(vecTemp1Reg, R(resultReg));
PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));
if (RipAccessible(color4444mask)) {
PAND(vecTemp1Reg, M(color4444mask));
} else {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));
PAND(vecTemp1Reg, MatR(temp1Reg));
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
}
MOVSS(vecTemp2Reg, R(vecTemp1Reg));
MOVSS(vecTemp3Reg, R(vecTemp1Reg));
PSRLW(vecTemp2Reg, 4);
PSLLW(vecTemp3Reg, 4);
POR(vecTemp1Reg, R(vecTemp2Reg));
POR(vecTemp1Reg, R(vecTemp3Reg));
MOVD_xmm(R(resultReg), vecTemp1Reg);
regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);
regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);
regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);
}
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return true;
}
@ -3277,8 +3406,10 @@ bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerInd
return true;
}
if (!cpu_info.bBMI2) {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_msg_(hasRCX, "Could not obtain RCX, locked?");
}
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg idReg = GetSamplerID();
@ -3286,23 +3417,28 @@ bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerInd
UnlockSamplerID(idReg);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
int shiftedToSoFar = 0;
// Shift = (clutformat >> 2) & 0x1F
if (id.hasClutShift) {
SHR(32, R(temp1Reg), Imm8(2 - shiftedToSoFar));
shiftedToSoFar = 2;
if (cpu_info.bBMI2) {
SHRX(32, resultReg, R(resultReg), temp1Reg);
} else {
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
MOV(32, R(RCX), R(temp1Reg));
SHR(32, R(resultReg), R(RCX));
}
}
// Mask = (clutformat >> 8) & 0xFF
if (id.hasClutMask) {
SHR(32, R(temp1Reg), Imm8(8 - shiftedToSoFar));
shiftedToSoFar = 8;
AND(32, R(resultReg), R(temp1Reg));
}
// We need to wrap any entries beyond the first 1024 bytes.
@ -3316,7 +3452,7 @@ bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerInd
// Offset = (clutformat >> 12) & 0x01F0
if (id.hasClutOffset) {
SHR(32, R(temp1Reg), Imm8(16 - shiftedToSoFar));
SHL(32, R(temp1Reg), Imm8(4));
OR(32, R(resultReg), R(temp1Reg));
AND(32, R(resultReg), Imm32(offsetMask));
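The new shiftedToSoFar bookkeeping lets the CLUT format word stay in temp1Reg and be shifted right incrementally (by 2, then 6 more, then 8 more) instead of being reloaded for each field. The fields it walks through, as a sketch:

#include <cstdint>

struct ClutFields {
	uint32_t shift;   // (fmt >> 2) & 0x1F
	uint32_t mask;    // (fmt >> 8) & 0xFF
	uint32_t offset;  // (fmt >> 12) & 0x01F0
};

ClutFields DecodeClutFormat(uint32_t fmt) {
	return { (fmt >> 2) & 0x1F, (fmt >> 8) & 0xFF, (fmt >> 12) & 0x01F0 };
}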