From e8527714809fa07427fbdf4560d78077aba35b71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 5 Oct 2023 18:52:50 +0200
Subject: [PATCH] Integrate the voffset shuffle in ReadVector

---
 Core/MIPS/MIPS.cpp          |  2 +-
 Core/MIPS/MIPSVFPUUtils.cpp | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/Core/MIPS/MIPS.cpp b/Core/MIPS/MIPS.cpp
index b68aebddf5..91223bd637 100644
--- a/Core/MIPS/MIPS.cpp
+++ b/Core/MIPS/MIPS.cpp
@@ -122,7 +122,7 @@ MIPSState::MIPSState() {
 	// * 4x4 Matrices are contiguous in RAM, making them, too, fast-loadable in NEON
 
 	// Disadvantages:
-	// * Extra indirection, can be confusing and slower (interpreter only)
+	// * Extra indirection, can be confusing and slower (interpreter only; however, we can often skip the table by rearranging formulas)
 	// * Flushing and reloading row registers is now slower
 
 	int i = 0;
diff --git a/Core/MIPS/MIPSVFPUUtils.cpp b/Core/MIPS/MIPSVFPUUtils.cpp
index 87d9875399..ebb91d863b 100644
--- a/Core/MIPS/MIPSVFPUUtils.cpp
+++ b/Core/MIPS/MIPSVFPUUtils.cpp
@@ -175,16 +175,17 @@ void ReadVector(float *rd, VectorSize size, int reg) {
 	default: length = 0; break;
 	}
 	int transpose = (reg >> 5) & 1;
-	const int mtx = reg & (7 << 2);
+	const int mtx = ((reg << 2) & 0x70);
 	const int col = reg & 3;
+	// NOTE: We now skip the voffset lookups.
 	if (transpose) {
-		const int base = mtx + col * 32;
-		for (int i = 0; i < length; i++)
-			rd[i] = V(base + ((row+i)&3));
-	} else {
 		const int base = mtx + col;
 		for (int i = 0; i < length; i++)
-			rd[i] = V(base + ((row+i)&3)*32);
+			rd[i] = currentMIPS->v[base + ((row+i)&3) * 4];
+	} else {
+		const int base = mtx + col * 4;
+		for (int i = 0; i < length; i++)
+			rd[i] = currentMIPS->v[base + ((row+i)&3)];
 	}
 }