Merge pull request #7117 from unknownbrackets/jit-simd

x86jit: Implement vmmov using SIMD
2025-04-02 11:01:50 -04:00 · 2014-11-30 09:24:22 +01:00 · 2014-11-30 09:24:22 +01:00 · 7deb8055ee
commit 7deb8055ee
parent 7f65e81fa9 bb26e4f7d0
4 changed files with 88 additions and 2 deletions
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@ -2389,6 +2389,48 @@ void Jit::Comp_Vmmov(MIPSOpcode op) {
 	MatrixSize sz = GetMtxSize(op);
 	int n = GetMatrixSide(sz);

+	if (jo.enableVFPUSIMD) {
+		VectorSize vsz = GetVectorSize(sz);
+		u8 dest[4][4];
+		MatrixOverlapType overlap = GetMatrixOverlap(_VD, _VS, sz);
+
+		u8 vecs[4];
+		if (overlap == OVERLAP_NONE) {
+			GetMatrixColumns(_VD, sz, vecs);
+			for (int i = 0; i < n; ++i) {
+				GetVectorRegs(dest[i], vsz, vecs[i]);
+			}
+		} else {
+			for (int i = 0; i < n; ++i) {
+				fpr.GetTempVS(dest[i], vsz);
+			}
+		}
+
+		GetMatrixColumns(_VS, sz, vecs);
+		for (int i = 0; i < n; i++) {
+			u8 vec[4];
+			GetVectorRegs(vec, vsz, vecs[i]);
+			fpr.MapRegsVS(vec, vsz, 0);
+			fpr.MapRegsVS(dest[i], vsz, MAP_NOINIT);
+			MOVAPS(fpr.VSX(dest[i]), fpr.VS(vec));
+		}
+
+		if (overlap != OVERLAP_NONE) {
+			// Okay, move from the temps to VD now.
+			GetMatrixColumns(_VD, sz, vecs);
+			for (int i = 0; i < n; i++) {
+				u8 vec[4];
+				GetVectorRegs(vec, vsz, vecs[i]);
+				fpr.MapRegsVS(vec, vsz, MAP_NOINIT);
+				fpr.MapRegsVS(dest[i], vsz, 0);
+				MOVAPS(fpr.VSX(vec), fpr.VS(dest[i]));
+			}
+		}
+
+		fpr.ReleaseSpillLocks();
+		return;
+	}
+
 	u8 sregs[16], dregs[16];
 	GetMatrixRegs(sregs, sz, _VS);
 	GetMatrixRegs(dregs, sz, _VD);
--- a/Core/MIPS/x86/RegCacheFPU.cpp
+++ b/Core/MIPS/x86/RegCacheFPU.cpp
@ -716,6 +716,50 @@ int FPURegCache::GetTempR() {
 	return -1;
 }

+// Reserves one temp register per element of a vsz-sized vector and temp-locks them.
+//
+// v:   output array with room for at least GetNumVectorElements(vsz) entries.
+//      Each entry is the chosen regs[] index minus 32, the same bias GetTempV()
+//      applies to GetTempR().
+// vsz: vector size; determines how many temps are needed.
+//
+// Returns 0 on success.  If not enough temps are free, asserts and returns -1;
+// in that case nothing is locked and the contents of v are unspecified.
+int FPURegCache::GetTempVS(u8 *v, VectorSize vsz) {
+	pendingFlush = true;
+	const int n = GetNumVectorElements(vsz);
+	const int tempEnd = TEMP0 + NUM_TEMPS;
+
+	// Let's collect regs as we go, but try for n free in a row.
+	// Every temp slot is visited so the one-at-a-time fallback can also pick up
+	// free temps near the end of the range; the sibling probe is bounded instead.
+	int found = 0;
+	for (int r = TEMP0; r < tempEnd; ++r) {
+		if (regs[r].away || regs[r].tempLocked) {
+			continue;
+		}
+
+		// How many free siblings does this have?
+		int seq = 1;
+		for (int i = 1; i < n && r + i < tempEnd; ++i) {
+			if (regs[r + i].away || regs[r + i].tempLocked) {
+				break;
+			}
+			++seq;
+		}
+
+		if (seq == n) {
+			// Got 'em.  Exactly as many as we need.
+			// A full run replaces any scattered picks made so far.
+			for (int i = 0; i < n; ++i) {
+				v[i] = r + i - 32;
+			}
+			found = n;
+			break;
+		}
+
+		// No full run starting here: keep this one as a fallback pick.
+		if (found < n) {
+			v[found++] = r - 32;
+		}
+	}
+
+	if (found != n) {
+		_assert_msg_(JIT, 0, "Regcache ran out of temp regs, might need to DiscardR() some.");
+		return -1;
+	}
+
+	// Undo the -32 bias to get back to the regs[] index when locking.
+	for (int i = 0; i < n; ++i) {
+		regs[v[i] + 32].tempLocked = true;
+	}
+
+	// The function is declared int, so the success path must return a value.
+	return 0;
+}
+
 void FPURegCache::Flush() {
 	if (!pendingFlush) {
 		return;
--- a/Core/MIPS/x86/RegCacheFPU.h
+++ b/Core/MIPS/x86/RegCacheFPU.h
@ -114,7 +114,7 @@ public:
 	int GetTempV() {
 		return GetTempR() - 32;
 	}
-	// TODO: GetTempVS?
+	int GetTempVS(u8 *v, VectorSize vsz);

 	void SetEmitter(XEmitter *emitter) {emit = emitter;}
 	void SetOptions(MIPSComp::JitOptions *jo) {jo_ = jo;}
--- a/unittest/JitHarness.cpp
+++ b/unittest/JitHarness.cpp
@ -173,7 +173,6 @@ bool TestJit() {
 #else
 		std::vector<std::string> lines = DisassembleX86(block->normalEntry, block->codeSize);
 #endif
-		printf("Jit was %fx faster than interp.\n\n", jit_speed / interp_speed);
 		// Cut off at 25 due to the repetition above. Might need tweaking for large instructions.
 		const int cutoff = 25;
 		for (int i = 0; i < std::min((int)lines.size(), cutoff); i++) {
@ -181,6 +180,7 @@ bool TestJit() {
 		}
 		if (lines.size() > cutoff)
 			printf("...\n");
+		printf("Jit was %fx faster than interp.\n\n", jit_speed / interp_speed);
 	}

 	printf("\n");