VFPU: Some micro-optimizations. Don't fall back to the interpreter path for vexp2/vlog2/vrexp2.

2025-04-02 11:01:50 -04:00 · 2023-06-04 10:09:20 +02:00 · 2023-06-04 10:09:20 +02:00 · 9db9fec898
commit 9db9fec898
parent 27b8d27efc
6 changed files with 65 additions and 59 deletions
--- a/Core/MIPS/JitCommon/JitBlockCache.cpp
+++ b/Core/MIPS/JitCommon/JitBlockCache.cpp
@ -83,7 +83,7 @@ JitBlockCache::~JitBlockCache() {
 	Shutdown();
 }

-bool JitBlock::ContainsAddress(u32 em_address) {
+bool JitBlock::ContainsAddress(u32 em_address) const {
 	// WARNING - THIS DOES NOT WORK WITH JIT INLINING ENABLED.
 	// However, that doesn't exist yet so meh.
 	return (em_address >= originalAddress && em_address < originalAddress + 4 * originalSize);
--- a/Core/MIPS/JitCommon/JitBlockCache.h
+++ b/Core/MIPS/JitCommon/JitBlockCache.h
@ -59,7 +59,7 @@ enum class DestroyType {
 // We should be careful not to access these block structures during runtime as they are large.
 // Fine to mess with them at block compile time though.
 struct JitBlock {
-	bool ContainsAddress(u32 em_address);
+	bool ContainsAddress(u32 em_address) const;

 	const u8 *checkedEntry;  // const, we have to translate to writable.
 	const u8 *normalEntry;
--- a/Core/MIPS/MIPSVFPUUtils.cpp
+++ b/Core/MIPS/MIPSVFPUUtils.cpp
@ -165,68 +165,58 @@ void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]) {
 }

 void ReadVector(float *rd, VectorSize size, int reg) {  // Copies the elements of vector register 'reg', decoded according to 'size', into rd[].
-	int row = 0;
-	int length = 0;
-
+	int row;  // left unset in the default case, where length is 0 and row is never read
+	int length;
 	switch (size) {
 	case V_Single: rd[0] = V(reg); return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
 	case V_Pair:   row=(reg>>5)&2; length = 2; break;
 	case V_Triple: row=(reg>>6)&1; length = 3; break;
 	case V_Quad:   row=(reg>>5)&2; length = 4; break;
-	default: _assert_msg_(false, "%s: Bad vector size", __FUNCTION__);
+	default: length = 0; break;  // unrecognized size: the loops below run zero times, so rd[] is untouched
 	}
-	int transpose = (reg>>5) & 1;
-	const int mtx = (reg >> 2) & 7;
+	int transpose = (reg >> 5) & 1;
+	const int mtx = reg & (7 << 2);  // equals the old ((reg >> 2) & 7) * 4, so the explicit "* 4" below is dropped
 	const int col = reg & 3;
-
 	if (transpose) {
-		const int base = mtx * 4 + col * 32;
+		const int base = mtx + col * 32;
 		for (int i = 0; i < length; i++)
 			rd[i] = V(base + ((row+i)&3));
 	} else {
-		const int base = mtx * 4 + col;
+		const int base = mtx + col;
 		for (int i = 0; i < length; i++)
 			rd[i] = V(base + ((row+i)&3)*32);
 	}
 }

 void WriteVector(const float *rd, VectorSize size, int reg) {
-	if (size == V_Single) {
-		// Optimize the common case.
-		if (!currentMIPS->VfpuWriteMask(0)) {
-			V(reg) = rd[0];
-		}
-		return;
-	}
-
-	const int mtx = (reg>>2)&7;
-	const int col = reg & 3;
-	int transpose = (reg>>5)&1;
-	int row = 0;
-	int length = 0;
+	int row;
+	int length;

 	switch (size) {
-	case V_Single: _dbg_assert_(false); return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
+	case V_Single: if (!currentMIPS->VfpuWriteMask(0)) V(reg) = rd[0]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
 	case V_Pair:   row=(reg>>5)&2; length = 2; break;
 	case V_Triple: row=(reg>>6)&1; length = 3; break;
 	case V_Quad:   row=(reg>>5)&2; length = 4; break;
-	default: _assert_msg_(false, "%s: Bad vector size", __FUNCTION__);
+	default: length = 0; break;
 	}

+	const int mtx = reg & (7 << 2);
+	const int col = reg & 3;
+	bool transpose = (reg >> 5) & 1;
 	if (currentMIPS->VfpuWriteMask() == 0) {
 		if (transpose) {
-			const int base = mtx * 4 + col * 32;
+			const int base = mtx + col * 32;
 			for (int i = 0; i < length; i++)
 				V(base + ((row+i)&3)) = rd[i];
 		} else {
-			const int base = mtx * 4 + col;
+			const int base = mtx + col;
 			for (int i = 0; i < length; i++)
 				V(base + ((row+i)&3)*32) = rd[i];
 		}
 	} else {
 		for (int i = 0; i < length; i++) {
 			if (!currentMIPS->VfpuWriteMask(i)) {
-				int index = mtx * 4;
+				int index = mtx;
 				if (transpose)
 					index += ((row+i)&3) + col*32;
 				else
@ -243,9 +233,6 @@ u32 VFPURewritePrefix(int ctrl, u32 remove, u32 add) {
 }

 void ReadMatrix(float *rd, MatrixSize size, int reg) {
-	int mtx = (reg >> 2) & 7;
-	int col = reg & 3;
-
 	int row = 0;
 	int side = 0;
 	int transpose = (reg >> 5) & 1;
@ -255,9 +242,12 @@ void ReadMatrix(float *rd, MatrixSize size, int reg) {
 	case M_2x2: row = (reg >> 5) & 2; side = 2; break;
 	case M_3x3: row = (reg >> 6) & 1; side = 3; break;
 	case M_4x4: row = (reg >> 5) & 2; side = 4; break;
-	default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
+	default: side = 0; break;
 	}

+	int mtx = (reg >> 2) & 7;
+	int col = reg & 3;
+
 	// The voffset ordering is now integrated in these formulas,
 	// eliminating a table lookup.
 	const float *v = currentMIPS->v + (size_t)mtx * 16;
@ -296,8 +286,8 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
 	int mtx = (reg>>2)&7;
 	int col = reg&3;

-	int row = 0;
-	int side = 0;
+	int row;
+	int side;
 	int transpose = (reg >> 5) & 1;

 	switch (size) {
@ -305,7 +295,7 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
 	case M_2x2: row = (reg >> 5) & 2; side = 2; break;
 	case M_3x3: row = (reg >> 6) & 1; side = 3; break;
 	case M_4x4: row = (reg >> 5) & 2; side = 4; break;
-	default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
+	default: side = 0;
 	}

 	if (currentMIPS->VfpuWriteMask() != 0) {
@ -370,16 +360,6 @@ int GetVectorOverlap(int vec1, VectorSize size1, int vec2, VectorSize size2) {
 	return count;
 }

-int GetNumVectorElements(VectorSize sz) {
-	switch (sz) {
-		case V_Single: return 1;
-		case V_Pair:   return 2;
-		case V_Triple: return 3;
-		case V_Quad:   return 4;
-		default:       return 0;
-	}
-}
-
 VectorSize GetHalfVectorSizeSafe(VectorSize sz) {
 	switch (sz) {
 	case V_Pair: return V_Single;
--- a/Core/MIPS/MIPSVFPUUtils.h
+++ b/Core/MIPS/MIPSVFPUUtils.h
@ -218,7 +218,17 @@ VectorSize GetDoubleVectorSizeSafe(VectorSize sz);
 VectorSize GetDoubleVectorSize(VectorSize sz);
 VectorSize MatrixVectorSizeSafe(MatrixSize sz);
 VectorSize MatrixVectorSize(MatrixSize sz);
-int GetNumVectorElements(VectorSize sz);
+
+inline int GetNumVectorElements(VectorSize sz) {  // Returns the element count (1-4) for a vector size; defined inline in the header.
+	switch (sz) {
+	case V_Single: return 1;
+	case V_Pair:   return 2;
+	case V_Triple: return 3;
+	case V_Quad:   return 4;
+	default:       return 0;  // any other value is reported as zero elements
+	}
+}
+
 int GetMatrixSideSafe(MatrixSize sz);
 int GetMatrixSide(MatrixSize sz);
 std::string GetVectorNotation(int reg, VectorSize size);
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@ -2208,8 +2208,8 @@ void CosOnly(SinCosArg angle, float *output) {
 	output[1] = vfpu_cos(angle);
 }

-void ASinScaled(SinCosArg angle, float *output) {
-	output[0] = vfpu_asin(angle);
+void ASinScaled(SinCosArg sine, float *output) {  // Stores vfpu_asin(sine) in output[0]; only the parameter name changes here.
+	output[0] = vfpu_asin(sine);
 }

 void SinCosNegSin(SinCosArg angle, float *output) {
@ -2217,13 +2217,25 @@ void SinCosNegSin(SinCosArg angle, float *output) {
 	output[0] = -output[0];
 }

+void Exp2(SinCosArg arg, float *output) {  // Stores vfpu_exp2(arg) in output[0]; same signature as the other helper callbacks.
+	output[0] = vfpu_exp2(arg);
+}
+
+void Log2(SinCosArg arg, float *output) {  // Stores vfpu_log2(arg) in output[0]; same signature as the other helper callbacks.
+	output[0] = vfpu_log2(arg);
+}
+
+void RExp2(SinCosArg arg, float *output) {  // Stores vfpu_rexp2(arg) in output[0]; same signature as the other helper callbacks.
+	output[0] = vfpu_rexp2(arg);
+}
+
 void Jit::Comp_VV2Op(MIPSOpcode op) {
 	CONDITIONAL_DISABLE(VFPU_VEC);

 	if (js.HasUnknownPrefix())
 		DISABLE;

-	auto trigCallHelper = [this](void (*sinCosFunc)(SinCosArg, float *output), u8 sreg) {
+	auto specialFuncCallHelper = [this](void (*specialFunc)(SinCosArg, float *output), u8 sreg) {
 #if PPSSPP_ARCH(AMD64)
 		MOVSS(XMM0, fpr.V(sreg));
 		// TODO: This reg might be different on Linux...
@ -2232,7 +2244,7 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
 #else
 		LEA(64, RDI, MIPSSTATE_VAR(sincostemp[0]));
 #endif
-		ABI_CallFunction(thunks.ProtectFunction((const void *)sinCosFunc, 0));
+		ABI_CallFunction(thunks.ProtectFunction((const void *)specialFunc, 0));
 #else
 		// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
 		if (fpr.V(sreg).IsSimpleReg()) {
@ -2240,7 +2252,7 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
 		} else {
 			MOV(32, R(EAX), fpr.V(sreg));
 		}
-		CallProtectedFunction((const void *)sinCosFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
+		CallProtectedFunction((const void *)specialFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
 #endif
 	};

@ -2406,18 +2418,20 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
 			DIVSS(tempxregs[i], R(XMM0));
 			break;
 		case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
-			trigCallHelper(&SinOnly, sregs[i]);
+			specialFuncCallHelper(&SinOnly, sregs[i]);
 			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
 			break;
 		case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
-			trigCallHelper(&CosOnly, sregs[i]);
+			specialFuncCallHelper(&CosOnly, sregs[i]);
 			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[1]));
 			break;
 		case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
-			DISABLE;
+			specialFuncCallHelper(&Exp2, sregs[i]);
+			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
 			break;
 		case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
-			DISABLE;
+			specialFuncCallHelper(&Log2, sregs[i]);
+			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
 			break;
 		case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
 			SQRTSS(tempxregs[i], fpr.V(sregs[i]));
@ -2425,7 +2439,7 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
 			ANDPS(tempxregs[i], MatR(TEMPREG));
 			break;
 		case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
-			trigCallHelper(&ASinScaled, sregs[i]);
+			specialFuncCallHelper(&ASinScaled, sregs[i]);
 			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
 			break;
 		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
@ -2436,11 +2450,12 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
 			MOVSS(tempxregs[i], R(XMM0));
 			break;
 		case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
-			trigCallHelper(&NegSinOnly, sregs[i]);
+			specialFuncCallHelper(&NegSinOnly, sregs[i]);
 			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
 			break;
 		case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
-			DISABLE;
+			specialFuncCallHelper(&RExp2, sregs[i]);
+			MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
 			break;
 		}
 	}
--- a/Windows/MainWindow.cpp
+++ b/Windows/MainWindow.cpp
@ -87,6 +87,7 @@
 #define MOUSEEVENTF_FROMTOUCH_NOPEN 0xFF515780 //http://msdn.microsoft.com/en-us/library/windows/desktop/ms703320(v=vs.85).aspx
 #define MOUSEEVENTF_MASK_PLUS_PENTOUCH 0xFFFFFF80

+// See https://github.com/unknownbrackets/verysleepy/commit/fc1b1b3bd6081fae3566cdb542d896e413238b71
 int verysleepy__useSendMessage = 1;

 const UINT WM_VERYSLEEPY_MSG = WM_APP + 0x3117;