samplerjit: Simplify AVX shift-copies.

These have been the most common cases, and the non-AVX fallback (MOVDQA plus an in-place shift) is safe. Let's just add
a helper.
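
To illustrate, this is the shape of the change at a typical call site (a sketch only; `dstReg`, `srcReg`, and the shift amount are placeholders, not names from the diff):

    // Before: every call site open-coded the AVX-vs-SSE2 dispatch.
    if (cpu_info.bAVX) {
        VPSLLD(128, dstReg, srcReg, 8);
    } else {
        MOVDQA(dstReg, R(srcReg));
        PSLLD(dstReg, 8);
    }

    // After: the overload performs the same dispatch internally, using
    // AVX's non-destructive form when available and MOVDQA otherwise.
    PSLLD(dstReg, srcReg, 8);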
Author: Unknown W. Brackets
Date: 2022-01-17 15:15:36 -08:00
parent 4ea1c08551
commit 0ba2d05da5
3 changed files with 120 additions and 137 deletions

@@ -1683,25 +1683,50 @@ void XEmitter::PUNPCKHWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x69,
 void XEmitter::PUNPCKHDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6A, dest, arg);}
 void XEmitter::PUNPCKHQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6D, dest, arg);}
-void XEmitter::PSRLW(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
+void XEmitter::PSRLW(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLW(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x71, (X64Reg)2, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRLD(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
+void XEmitter::PSRLD(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLD(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x72, (X64Reg)2, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRLQ(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
+void XEmitter::PSRLQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)2, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRLDQ(X64Reg reg, int shift) {
-	WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
+void XEmitter::PSRLDQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLDQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)3, R(dest));
 	Write8(shift);
 }
@@ -1710,25 +1735,50 @@ void XEmitter::PSRLW(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD1, reg, arg);
 void XEmitter::PSRLD(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD2, reg, arg); }
 void XEmitter::PSRLQ(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD3, reg, arg); }
-void XEmitter::PSLLW(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
+void XEmitter::PSLLW(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLW(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x71, (X64Reg)6, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSLLD(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
+void XEmitter::PSLLD(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLD(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x72, (X64Reg)6, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSLLQ(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
+void XEmitter::PSLLQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)6, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSLLDQ(X64Reg reg, int shift) {
-	WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
+void XEmitter::PSLLDQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLDQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)7, R(dest));
 	Write8(shift);
 }
@@ -1737,14 +1787,26 @@ void XEmitter::PSLLW(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF1, reg, arg);
 void XEmitter::PSLLD(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF2, reg, arg); }
 void XEmitter::PSLLQ(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF3, reg, arg); }
-void XEmitter::PSRAW(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
+void XEmitter::PSRAW(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRAW(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x71, (X64Reg)4, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRAD(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg));
+void XEmitter::PSRAD(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRAD(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x72, (X64Reg)4, R(dest));
 	Write8(shift);
 }

@@ -844,10 +844,14 @@ public:
 	void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle);
 	void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle);
-	void PSRLW(X64Reg reg, int shift);
-	void PSRLD(X64Reg reg, int shift);
-	void PSRLQ(X64Reg reg, int shift);
-	void PSRLDQ(X64Reg reg, int shift);
+	void PSRLW(X64Reg dest, X64Reg reg, int shift);
+	void PSRLW(X64Reg reg, int shift) { PSRLW(reg, reg, shift); }
+	void PSRLD(X64Reg dest, X64Reg reg, int shift);
+	void PSRLD(X64Reg reg, int shift) { PSRLD(reg, reg, shift); }
+	void PSRLQ(X64Reg dest, X64Reg reg, int shift);
+	void PSRLQ(X64Reg reg, int shift) { PSRLQ(reg, reg, shift); }
+	void PSRLDQ(X64Reg dest, X64Reg reg, int shift);
+	void PSRLDQ(X64Reg reg, int shift) { PSRLDQ(reg, reg, shift); }
 	// Note: all values shifted by lowest 64-bit in XMM arg.
 	void PSRLW(X64Reg reg, OpArg arg);
 	// Note: all values shifted by lowest 64-bit in XMM arg.
@@ -855,10 +859,14 @@ public:
 	// Note: both values shifted by lowest 64-bit in XMM arg.
 	void PSRLQ(X64Reg reg, OpArg arg);
-	void PSLLW(X64Reg reg, int shift);
-	void PSLLD(X64Reg reg, int shift);
-	void PSLLQ(X64Reg reg, int shift);
-	void PSLLDQ(X64Reg reg, int shift);
+	void PSLLW(X64Reg dest, X64Reg reg, int shift);
+	void PSLLW(X64Reg reg, int shift) { PSLLW(reg, reg, shift); }
+	void PSLLD(X64Reg dest, X64Reg reg, int shift);
+	void PSLLD(X64Reg reg, int shift) { PSLLD(reg, reg, shift); }
+	void PSLLQ(X64Reg dest, X64Reg reg, int shift);
+	void PSLLQ(X64Reg reg, int shift) { PSLLQ(reg, reg, shift); }
+	void PSLLDQ(X64Reg dest, X64Reg reg, int shift);
+	void PSLLDQ(X64Reg reg, int shift) { PSLLDQ(reg, reg, shift); }
 	// Note: all values shifted by lowest 64-bit in XMM arg.
 	void PSLLW(X64Reg reg, OpArg arg);
 	// Note: all values shifted by lowest 64-bit in XMM arg.
@@ -866,8 +874,10 @@ public:
 	// Note: both values shifted by lowest 64-bit in XMM arg.
 	void PSLLQ(X64Reg reg, OpArg arg);
-	void PSRAW(X64Reg reg, int shift);
-	void PSRAD(X64Reg reg, int shift);
+	void PSRAW(X64Reg dest, X64Reg reg, int shift);
+	void PSRAW(X64Reg reg, int shift) { PSRAW(reg, reg, shift); }
+	void PSRAD(X64Reg dest, X64Reg reg, int shift);
+	void PSRAD(X64Reg reg, int shift) { PSRAD(reg, reg, shift); }
 	// Note: all values shifted by lowest 64-bit in XMM arg.
 	void PSRAW(X64Reg reg, OpArg arg);
 	// Note: all values shifted by lowest 64-bit in XMM arg.

@@ -1169,12 +1169,7 @@ bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPe
 	X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);
 	// Shift against walls to get 5 bits after the rightmost 2.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, shiftReg, formatReg, 32 - 7);
-	} else {
-		MOVDQA(shiftReg, R(formatReg));
-		PSLLD(shiftReg, 32 - 7);
-	}
+	PSLLD(shiftReg, formatReg, 32 - 7);
 	PSRLD(shiftReg, 32 - 5);
 	// The other lanes are zero, so we can use PSRLD.
 	PSRLD(indexReg, R(shiftReg));
@@ -1189,12 +1184,7 @@ bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPe
 	if (id.hasClutMask) {
 		X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);
 		// If it was CLUT4, grab only 4 bits of the mask.
-		if (cpu_info.bAVX) {
-			VPSLLD(128, maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
-		} else {
-			MOVDQA(maskReg, R(formatReg));
-			PSLLD(maskReg, bitsPerIndex == 4 ? 20 : 16);
-		}
+		PSLLD(maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
 		PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24);
 		PAND(indexReg, R(maskReg));
@@ -1569,12 +1559,7 @@ bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {
 			useAlphaFrom(primColorReg);
 		} else if (id.useColorDoubling) {
 			// We still need to finish dividing alpha, it's currently doubled (from the 7 above.)
-			if (cpu_info.bAVX) {
-				VPSRLW(128, primColorReg, resultReg, 1);
-			} else {
-				MOVDQA(primColorReg, R(resultReg));
-				PSRLW(primColorReg, 1);
-			}
+			PSRLW(primColorReg, resultReg, 1);
 			useAlphaFrom(primColorReg);
 		}
 		break;
@@ -2606,12 +2591,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
 		PSHUFD(size, R(size), _MM_SHUFFLE(0, 0, 0, 0));
 	}
-	if (cpu_info.bAVX) {
-		VPSLLD(128, tempVecReg, size, 8);
-	} else {
-		MOVDQA(tempVecReg, R(size));
-		PSLLD(tempVecReg, 8);
-	}
+	PSLLD(tempVecReg, size, 8);
 	CVTDQ2PS(tempVecReg, R(tempVecReg));
 	// And then multiply.
 	MULPS(dest, R(tempVecReg));
@@ -2948,12 +2928,7 @@ bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCac
 	// Divide vvec by 8 in a temp.
 	X64Reg vMultReg = regCache_.Alloc(RegCache::VEC_TEMP1);
-	if (cpu_info.bAVX) {
-		VPSRLD(128, vMultReg, vReg, 3);
-	} else {
-		MOVDQA(vMultReg, R(vReg));
-		PSRLD(vMultReg, 3);
-	}
+	PSRLD(vMultReg, vReg, 3);
 	// And now multiply by bufw. May be able to use a shift in a common case.
 	int shiftAmount = 32 - clz32_nonzero(bitsPerTexel - 1);
@@ -2995,24 +2970,14 @@ bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCac
 	// Now get ((uvec / texels_per_tile) / 4) * 32 * 4 aka (uvec / (128 / bitsPerTexel)) << 7.
 	X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
-	if (cpu_info.bAVX) {
-		VPSRLD(128, uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
-	} else {
-		MOVDQA(uCopyReg, R(uReg));
-		PSRLD(uCopyReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
-	}
+	PSRLD(uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
 	PSLLD(uCopyReg, 7);
 	// Add it in to our running total.
 	PADDD(vReg, R(uCopyReg));
 	if (bitsPerTexel == 4) {
 		// Finally, we want (uvec & 31) / 2. Use a 16-bit wall.
-		if (cpu_info.bAVX) {
-			VPSLLW(128, uCopyReg, uReg, 11);
-		} else {
-			MOVDQA(uCopyReg, R(uReg));
-			PSLLW(uCopyReg, 11);
-		}
+		PSLLW(uCopyReg, uReg, 11);
 		PSRLD(uCopyReg, 12);
 		// With that, this is our byte offset. uvec & 1 has which half.
 		PADDD(vReg, R(uCopyReg));
@@ -3075,34 +3040,19 @@ bool SamplerJitCache::Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCac
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
 	// Filter out red only into temp1. We do this by shifting into a wall.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp1Reg, quadReg, 32 - 5);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSLLD(temp1Reg, 32 - 5);
-	}
+	PSLLD(temp1Reg, quadReg, 32 - 5);
 	// Move it right to the top of the 8 bits.
 	PSRLD(temp1Reg, 24);
 	// Now we bring in blue, since it's also 5 like red.
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 11);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		// Luckily, we know the top 16 bits are zero. Shift right into a wall.
-		PSRLD(temp2Reg, 11);
-	}
+	// Luckily, we know the top 16 bits are zero. Shift right into a wall.
+	PSRLD(temp2Reg, quadReg, 11);
 	// Shift blue into place at 19, and merge back to temp1.
 	PSLLD(temp2Reg, 19);
 	POR(temp1Reg, R(temp2Reg));
 	// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp2Reg, temp1Reg, 1);
-	} else {
-		MOVDQA(temp2Reg, R(temp1Reg));
-		PSLLD(temp2Reg, 1);
-	}
+	PSLLD(temp2Reg, temp1Reg, 1);
 	// We go to green last because it's the different one. Shift off red and blue.
 	PSRLD(quadReg, 5);
@@ -3179,22 +3129,12 @@ bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCac
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
 	// Filter out red only into temp1. We do this by shifting into a wall.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp1Reg, quadReg, 32 - 5);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSLLD(temp1Reg, 32 - 5);
-	}
+	PSLLD(temp1Reg, quadReg, 32 - 5);
 	// Move it right to the top of the 8 bits.
 	PSRLD(temp1Reg, 24);
 	// Add in green and shift into place (top 5 bits of byte 2.)
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 5);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		PSRLD(temp2Reg, 5);
-	}
+	PSRLD(temp2Reg, quadReg, 5);
 	PSLLW(temp2Reg, 11);
 	POR(temp1Reg, R(temp2Reg));
@@ -3206,12 +3146,7 @@ bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCac
 	// Combine both together, we still need to swizzle.
 	POR(quadReg, R(temp1Reg));
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp1Reg, quadReg, 5);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSRLD(temp1Reg, 5);
-	}
+	PSRLD(temp1Reg, quadReg, 5);
 	// Now for swizzle, we'll mask carefully to avoid overflow.
 	PAND(temp1Reg, M(const5551Swizzle_));
@@ -3271,31 +3206,16 @@ bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCac
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
 	// Mask and move red into position within temp1.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp1Reg, quadReg, 28);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSLLD(temp1Reg, 28);
-	}
+	PSLLD(temp1Reg, quadReg, 28);
 	PSRLD(temp1Reg, 24);
 	// Green is easy too, we use a word shift to get a free wall.
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 4);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		PSRLD(temp2Reg, 4);
-	}
+	PSRLD(temp2Reg, quadReg, 4);
 	PSLLW(temp2Reg, 12);
 	POR(temp1Reg, R(temp2Reg));
 	// Blue isn't last this time, but it's next.
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 8);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		PSRLD(temp2Reg, 8);
-	}
+	PSRLD(temp2Reg, quadReg, 8);
 	PSLLD(temp2Reg, 28);
 	PSRLD(temp2Reg, 8);
 	POR(temp1Reg, R(temp2Reg));
@@ -3307,20 +3227,11 @@ bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCac
 		POR(quadReg, R(temp1Reg));
 		// Masking isn't necessary here since everything is 4 wide.
-		if (cpu_info.bAVX) {
-			VPSRLD(128, temp1Reg, quadReg, 4);
-		} else {
-			MOVDQA(temp1Reg, R(quadReg));
-			PSRLD(temp1Reg, 4);
-		}
-		POR(quadReg, R(temp1Reg));
-	} else if (cpu_info.bAVX) {
-		VPSRLD(128, quadReg, temp1Reg, 4);
+		PSRLD(temp1Reg, quadReg, 4);
 		POR(quadReg, R(temp1Reg));
 	} else {
-		// Overwrite colorReg (we need temp1 as a copy anyway.)
-		MOVDQA(quadReg, R(temp1Reg));
-		PSRLD(temp1Reg, 4);
+		// Overwrite quadReg (we need temp1 as a copy anyway.)
+		PSRLD(quadReg, temp1Reg, 4);
 		POR(quadReg, R(temp1Reg));
 	}