x86jit: Micro optimize vi2x a bit with ssse3/sse4.

Both are small wins.
2025-04-02 11:01:50 -04:00 · 2014-11-08 11:58:59 -08:00 · 2014-11-08 11:58:59 -08:00 · bc7497857a
commit bc7497857a
parent 0e646f748a
3 changed files with 49 additions and 13 deletions
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@ -1742,6 +1742,15 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg)   {WriteSSSE3Op(0x66, 0x3800, dest
 void XEmitter::PTEST(X64Reg dest, OpArg arg)    {WriteSSE41Op(0x66, 0x3817, dest, arg);}
 void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}

+void XEmitter::PMINSB(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x3838, dest, arg);}
+void XEmitter::PMINSD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x3839, dest, arg);}
+void XEmitter::PMINUW(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383a, dest, arg);}
+void XEmitter::PMINUD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383b, dest, arg);}
+void XEmitter::PMAXSB(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383c, dest, arg);}
+void XEmitter::PMAXSD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383d, dest, arg);}
+void XEmitter::PMAXUW(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383e, dest, arg);}
+void XEmitter::PMAXUD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383f, dest, arg);}
+
 void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);}
 void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
 void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@ -763,7 +763,15 @@ public:
 	void PMAXUB(X64Reg dest, OpArg arg);
 	void PMINSW(X64Reg dest, OpArg arg);
 	void PMINUB(X64Reg dest, OpArg arg);
-	// SSE4 has PMAXSB and PMINSB and PMAXUW and PMINUW too if we need them.
+	// SSE4: More MAX/MIN instructions.
+	void PMINSB(X64Reg dest, OpArg arg);
+	void PMINSD(X64Reg dest, OpArg arg);
+	void PMINUW(X64Reg dest, OpArg arg);
+	void PMINUD(X64Reg dest, OpArg arg);
+	void PMAXSB(X64Reg dest, OpArg arg);
+	void PMAXSD(X64Reg dest, OpArg arg);
+	void PMAXUW(X64Reg dest, OpArg arg);
+	void PMAXUD(X64Reg dest, OpArg arg);

 	void PMOVMSKB(X64Reg dest, OpArg arg);
 	void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle);
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@ -2222,6 +2222,12 @@ void Jit::Comp_VDet(MIPSOpcode op) {
 	DISABLE;
 }

+// The goal is to map (reversed byte order for clarity):
+// 000000AA 000000BB 000000CC 000000DD -> AABBCCDD
+static s8 MEMORY_ALIGNED16( vi2xc_shuffle[16] ) = { 3, 7, 11, 15,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1 };
+// 0000AAAA 0000BBBB 0000CCCC 0000DDDD -> AAAABBBB CCCCDDDD
+static s8 MEMORY_ALIGNED16( vi2xs_shuffle[16] ) = { 2, 3, 6, 7,  10, 11, 14, 15,  -1, -1, -1, -1,  -1, -1, -1, -1 };
+
 void Jit::Comp_Vi2x(MIPSOpcode op) {
 	CONDITIONAL_DISABLE;
 	if (js.HasUnknownPrefix())
@ -2293,27 +2299,40 @@ void Jit::Comp_Vi2x(MIPSOpcode op) {
 		PUNPCKLQDQ(dst0, R(XMM0));
 	} else {
 		// Otherwise, we need to zero out the top 2.
-		// TODO: Maybe PAND would be better?
+		// We expect XMM1 to be zero below.
 		PXOR(XMM1, R(XMM1));
 		PUNPCKLQDQ(dst0, R(XMM1));
 	}

 	// For "u" type ops, we clamp to zero and shift off the sign bit first.
 	if (unsignedOp) {
-		// Get a mask of the sign bit in dst0, then and in the values.  This clamps to 0.
-		MOVDQA(XMM1, R(dst0));
-		PSRAD(dst0, 31);
-		PSLLD(XMM1, 1);
-		PANDN(dst0, R(XMM1));
+		if (cpu_info.bSSE4_1) {
+			if (sz == V_Quad) {
+				// Zeroed in the other case above.
+				PXOR(XMM1, R(XMM1));
+			}
+			PMAXSD(dst0, R(XMM1));
+			PSLLD(dst0, 1);
+		} else {
+			// Get a mask of the sign bit in dst0, then and in the values.  This clamps to 0.
+			MOVDQA(XMM1, R(dst0));
+			PSRAD(dst0, 31);
+			PSLLD(XMM1, 1);
+			PANDN(dst0, R(XMM1));
+		}
 	}

 	// At this point, everything is aligned in the high bits of our lanes.
-	// Let's *arithmetically* shift in the sign so we can use saturating packs.
-	PSRAD(dst0, 32 - bits);
-	// XMM1 used for the high part just so there's no dependency.  It contains garbage or 0.
-	PACKSSDW(dst0, R(XMM1));
-	if (bits == 8) {
-		PACKSSWB(dst0, R(XMM1));
+	if (cpu_info.bSSSE3) {
+		PSHUFB(dst0, bits == 8 ? M(vi2xc_shuffle) : M(vi2xs_shuffle));
+	} else {
+		// Let's *arithmetically* shift in the sign so we can use saturating packs.
+		PSRAD(dst0, 32 - bits);
+		// XMM1 used for the high part just so there's no dependency.  It contains garbage or 0.
+		PACKSSDW(dst0, R(XMM1));
+		if (bits == 8) {
+			PACKSSWB(dst0, R(XMM1));
+		}
 	}

 	if (!fpr.V(dregs[0]).IsSimpleReg(dst0)) {