samplerjit: Decode colors with BMI2.

This only happens with nearest filtering, though, so the benefit is very small.
2025-04-02 11:01:50 -04:00 · 2022-01-31 22:05:34 -08:00 · 2022-01-31 22:05:34 -08:00 · 4cadcea6da
commit 4cadcea6da
parent 367525f875
1 changed files with 124 additions and 71 deletions
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@@ -3076,36 +3076,54 @@ bool SamplerJitCache::Jit_Decode5650(const SamplerID &id) {
 	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

-	MOV(32, R(temp2Reg), R(resultReg));
-	AND(32, R(temp2Reg), Imm32(0x0000001F));
+	if (cpu_info.bBMI2_fast) {
+		// Start off with the high bits.
+		MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
+		PDEP(32, temp1Reg, resultReg, R(temp1Reg));
+		if (id.useTextureAlpha || id.fetch)
+			OR(32, R(temp1Reg), Imm32(0xFF000000));

-	// B (we do R and B at the same time, they're both 5.)
-	MOV(32, R(temp1Reg), R(resultReg));
-	AND(32, R(temp1Reg), Imm32(0x0000F800));
-	SHL(32, R(temp1Reg), Imm8(5));
-	OR(32, R(temp2Reg), R(temp1Reg));
+		// Now grab the low bits (they end up packed.)
+		MOV(32, R(temp2Reg), Imm32(0x0000E61C));
+		PEXT(32, resultReg, resultReg, R(temp2Reg));
+		// And spread them back out.
+		MOV(32, R(temp2Reg), Imm32(0x00070307));
+		PDEP(32, resultReg, resultReg, R(temp2Reg));

-	// Expand 5 -> 8.  At this point we have 00BB00RR.
-	MOV(32, R(temp1Reg), R(temp2Reg));
-	SHL(32, R(temp2Reg), Imm8(3));
-	SHR(32, R(temp1Reg), Imm8(2));
-	OR(32, R(temp2Reg), R(temp1Reg));
-	AND(32, R(temp2Reg), Imm32(0x00FF00FF));
+		// Finally put the high bits in, we're done.
+		OR(32, R(resultReg), R(temp1Reg));
+	} else {
+		MOV(32, R(temp2Reg), R(resultReg));
+		AND(32, R(temp2Reg), Imm32(0x0000001F));

-	// Now's as good a time to put in A as any.
-	if (id.useTextureAlpha || id.fetch)
-		OR(32, R(temp2Reg), Imm32(0xFF000000));
+		// B (we do R and B at the same time, they're both 5.)
+		MOV(32, R(temp1Reg), R(resultReg));
+		AND(32, R(temp1Reg), Imm32(0x0000F800));
+		SHL(32, R(temp1Reg), Imm8(5));
+		OR(32, R(temp2Reg), R(temp1Reg));

-	// Last, we need to align, extract, and expand G.
-	// 3 to align to G, and then 2 to expand to 8.
-	SHL(32, R(resultReg), Imm8(3 + 2));
-	AND(32, R(resultReg), Imm32(0x0000FC00));
-	MOV(32, R(temp1Reg), R(resultReg));
-	// 2 to account for resultReg being preshifted, 4 for expansion.
-	SHR(32, R(temp1Reg), Imm8(2 + 4));
-	OR(32, R(resultReg), R(temp1Reg));
-	AND(32, R(resultReg), Imm32(0x0000FF00));
-	OR(32, R(resultReg), R(temp2Reg));
+		// Expand 5 -> 8.  At this point we have 00BB00RR.
+		MOV(32, R(temp1Reg), R(temp2Reg));
+		SHL(32, R(temp2Reg), Imm8(3));
+		SHR(32, R(temp1Reg), Imm8(2));
+		OR(32, R(temp2Reg), R(temp1Reg));
+		AND(32, R(temp2Reg), Imm32(0x00FF00FF));
+
+		// Now's as good a time to put in A as any.
+		if (id.useTextureAlpha || id.fetch)
+			OR(32, R(temp2Reg), Imm32(0xFF000000));
+
+		// Last, we need to align, extract, and expand G.
+		// 3 to align to G, and then 2 to expand to 8.
+		SHL(32, R(resultReg), Imm8(3 + 2));
+		AND(32, R(resultReg), Imm32(0x0000FC00));
+		MOV(32, R(temp1Reg), R(resultReg));
+		// 2 to account for resultReg being preshifted, 4 for expansion.
+		SHR(32, R(temp1Reg), Imm8(2 + 4));
+		OR(32, R(resultReg), R(temp1Reg));
+		AND(32, R(resultReg), Imm32(0x0000FF00));
+		OR(32, R(resultReg), R(temp2Reg));
+	}

 	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
 	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
@@ -3154,34 +3172,54 @@ bool SamplerJitCache::Jit_Decode5551(const SamplerID &id) {
 	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

-	MOV(32, R(temp2Reg), R(resultReg));
-	MOV(32, R(temp1Reg), R(resultReg));
-	AND(32, R(temp2Reg), Imm32(0x0000001F));
-	AND(32, R(temp1Reg), Imm32(0x000003E0));
-	SHL(32, R(temp1Reg), Imm8(3));
-	OR(32, R(temp2Reg), R(temp1Reg));
+	if (cpu_info.bBMI2_fast) {
+		// First, grab the top bits.
+		bool keepAlpha = id.useTextureAlpha || id.fetch;
+		MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
+		PDEP(32, resultReg, resultReg, R(temp1Reg));

-	MOV(32, R(temp1Reg), R(resultReg));
-	AND(32, R(temp1Reg), Imm32(0x00007C00));
-	SHL(32, R(temp1Reg), Imm8(6));
-	OR(32, R(temp2Reg), R(temp1Reg));
+		// Now make the swizzle bits.
+		MOV(32, R(temp2Reg), R(resultReg));
+		SHR(32, R(temp2Reg), Imm8(5));
+		AND(32, R(temp2Reg), Imm32(0x00070707));

-	// Expand 5 -> 8.  After this is just A.
-	MOV(32, R(temp1Reg), R(temp2Reg));
-	SHL(32, R(temp2Reg), Imm8(3));
-	SHR(32, R(temp1Reg), Imm8(2));
-	// Chop off the bits that were shifted out.
-	AND(32, R(temp1Reg), Imm32(0x00070707));
-	OR(32, R(temp2Reg), R(temp1Reg));
+		if (keepAlpha) {
+			// Sign extend the alpha bit to 8 bits.
+			SHL(32, R(resultReg), Imm8(7));
+			SAR(32, R(resultReg), Imm8(7));
+		}

-	if (id.useTextureAlpha || id.fetch) {
-		// For A, we sign extend to get either 16 1s or 0s of alpha.
-		SAR(16, R(resultReg), Imm8(15));
-		// Now, shift left by 24 to get the lowest 8 of those at the top.
-		SHL(32, R(resultReg), Imm8(24));
 		OR(32, R(resultReg), R(temp2Reg));
 	} else {
-		MOV(32, R(resultReg), R(temp2Reg));
+		MOV(32, R(temp2Reg), R(resultReg));
+		MOV(32, R(temp1Reg), R(resultReg));
+		AND(32, R(temp2Reg), Imm32(0x0000001F));
+		AND(32, R(temp1Reg), Imm32(0x000003E0));
+		SHL(32, R(temp1Reg), Imm8(3));
+		OR(32, R(temp2Reg), R(temp1Reg));
+
+		MOV(32, R(temp1Reg), R(resultReg));
+		AND(32, R(temp1Reg), Imm32(0x00007C00));
+		SHL(32, R(temp1Reg), Imm8(6));
+		OR(32, R(temp2Reg), R(temp1Reg));
+
+		// Expand 5 -> 8.  After this is just A.
+		MOV(32, R(temp1Reg), R(temp2Reg));
+		SHL(32, R(temp2Reg), Imm8(3));
+		SHR(32, R(temp1Reg), Imm8(2));
+		// Chop off the bits that were shifted out.
+		AND(32, R(temp1Reg), Imm32(0x00070707));
+		OR(32, R(temp2Reg), R(temp1Reg));
+
+		if (id.useTextureAlpha || id.fetch) {
+			// For A, we sign extend to get either 16 1s or 0s of alpha.
+			SAR(16, R(resultReg), Imm8(15));
+			// Now, shift left by 24 to get the lowest 8 of those at the top.
+			SHL(32, R(resultReg), Imm8(24));
+			OR(32, R(resultReg), R(temp2Reg));
+		} else {
+			MOV(32, R(resultReg), R(temp2Reg));
+		}
 	}

 	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
@@ -3235,31 +3273,46 @@ alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00f
 bool SamplerJitCache::Jit_Decode4444(const SamplerID &id) {
 	Describe("4444");
 	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
-	X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
-	X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
-	X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);

-	MOVD_xmm(vecTemp1Reg, R(resultReg));
-	PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));
-	if (RipAccessible(color4444mask)) {
-		PAND(vecTemp1Reg, M(color4444mask));
-	} else {
+	if (cpu_info.bBMI2_fast) {  // BMI2 path: expands the 4444 color using only general purpose registers.
 		X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
-		MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));
-		PAND(vecTemp1Reg, MatR(temp1Reg));
-		regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
-	}
-	MOVSS(vecTemp2Reg, R(vecTemp1Reg));
-	MOVSS(vecTemp3Reg, R(vecTemp1Reg));
-	PSRLW(vecTemp2Reg, 4);
-	PSLLW(vecTemp3Reg, 4);
-	POR(vecTemp1Reg, R(vecTemp2Reg));
-	POR(vecTemp1Reg, R(vecTemp3Reg));
-	MOVD_xmm(R(resultReg), vecTemp1Reg);
+		// First, spread the bits out with spaces: PDEP puts each nibble in the high half of its own byte (e.g. 0xABCD -> 0xA0B0C0D0.)
+		MOV(32, R(temp1Reg), Imm32(0xF0F0F0F0));
+		PDEP(32, resultReg, resultReg, R(temp1Reg));

-	regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);
-	regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);
-	regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);
+		// Now swizzle the low bits in: ORing with a copy shifted right by 4 duplicates each nibble (e.g. 0xA0B0C0D0 -> 0xAABBCCDD.)
+		MOV(32, R(temp1Reg), R(resultReg));
+		SHR(32, R(temp1Reg), Imm8(4));
+		OR(32, R(resultReg), R(temp1Reg));
+
+		regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
+	} else {  // Otherwise, do the same expansion using vector registers.
+		X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
+		X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
+		X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);
+
+		MOVD_xmm(vecTemp1Reg, R(resultReg));
+		PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));  // Interleaves the register with itself, so each byte is doubled up.
+		if (RipAccessible(color4444mask)) {
+			PAND(vecTemp1Reg, M(color4444mask));
+		} else {  // RipAccessible() said no, so put the mask's address in a temp register and read through that.
+			X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
+			MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));
+			PAND(vecTemp1Reg, MatR(temp1Reg));
+			regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
+		}
+		MOVSS(vecTemp2Reg, R(vecTemp1Reg));
+		MOVSS(vecTemp3Reg, R(vecTemp1Reg));
+		PSRLW(vecTemp2Reg, 4);  // One copy of the masked value shifted 4 right...
+		PSLLW(vecTemp3Reg, 4);  // ...and another shifted 4 left.
+		POR(vecTemp1Reg, R(vecTemp2Reg));  // ORing both copies in fills the nibbles the mask cleared.
+		POR(vecTemp1Reg, R(vecTemp3Reg));
+		MOVD_xmm(R(resultReg), vecTemp1Reg);
+
+		regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);
+		regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);
+		regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);
+	}
 	regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
 	return true;
 }