From 217a1837eddc85d0148fade3144a14777e326d6c Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 13 Aug 2023 13:07:35 -0700 Subject: [PATCH 1/5] irjit: Allow typical prefixes in vdiv/vasin/etc. Some of these behave strangely, but there are some common usages that work fine. --- Core/MIPS/IR/IRCompVFPU.cpp | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 374ebf8f3a..9ab5e6d258 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -741,6 +741,8 @@ namespace MIPSComp { VSLT, }; VecDo3Op type = VecDo3Op::INVALID; + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); // Check that we can support the ops, and prepare temporary values for ops that need it. switch (op >> 26) { @@ -778,9 +780,11 @@ namespace MIPSComp { case VecDo3Op::VMUL: break; case VecDo3Op::VDIV: - if (!js.HasNoPrefix()) { + if (js.HasUnknownPrefix() || (sz != V_Single && !js.HasNoPrefix())) + DISABLE; + // If it's single, we just need to check the prefixes are within the size. + if (!IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) DISABLE; - } break; case VecDo3Op::VMIN: case VecDo3Op::VMAX: @@ -790,9 +794,6 @@ namespace MIPSComp { break; } - VectorSize sz = GetVecSize(op); - int n = GetNumVectorElements(sz); - u8 sregs[4], tregs[4], dregs[4]; GetVectorRegsPrefixS(sregs, sz, _VS); GetVectorRegsPrefixT(tregs, sz, _VT); @@ -901,10 +902,8 @@ namespace MIPSComp { // D prefix is fine for these, and used sometimes. if (js.HasUnknownPrefix() || js.HasSPrefix()) DISABLE; - } else { - // Many of these apply the D prefix strangely or override parts of the S prefix. - if (!js.HasNoPrefix()) - DISABLE; + } else if (optype == 5 && js.HasDPrefix()) { + DISABLE; } // Vector unary operation @@ -912,13 +911,19 @@ namespace MIPSComp { int vs = _VS; int vd = _VD; + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); if (optype >= 16 && !js.HasNoPrefix()) { - DISABLE; - } else if ((optype == 1 || optype == 2) && js.HasSPrefix()) { - DISABLE; - } else if (optype == 5 && js.HasDPrefix()) { - DISABLE; + // Many of these apply the D prefix strangely or override parts of the S prefix. + if (js.HasUnknownPrefix() || sz != V_Single) + DISABLE; + // If it's single, we just need to check the prefixes are within the size. + if (!IsPrefixWithinSize(js.prefixS, op)) + DISABLE; + // The negative ones seem to use negate flags as a prefix hack. + if (optype >= 24 && (js.prefixS & 0x000F0000) != 0) + DISABLE; } // Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure @@ -926,9 +931,6 @@ namespace MIPSComp { return; } - VectorSize sz = GetVecSize(op); - int n = GetNumVectorElements(sz); - u8 sregs[4]{}, dregs[4]{}; GetVectorRegsPrefixS(sregs, sz, vs); GetVectorRegsPrefixD(dregs, sz, vd); From e0be6858b84221cdf6099c866c575415f9a06eaf Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 13 Aug 2023 13:32:50 -0700 Subject: [PATCH 2/5] irjit: Implement vcrs.t. As used in Jeanne d'Arc. 
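
For reference, a rough scalar sketch of what vcrs.t computes, based on the
existing comment in Comp_VCrs (the helper below is purely illustrative and is
not part of this change):

    // d gets the "first half" of a cross product; pairing it with a swapped
    // vcrs and a vsub yields the full cross product (or just use vcrsp.t).
    static void VcrsTripleSketch(const float s[3], const float t[3], float d[3]) {
        d[0] = s[1] * t[2];  // s.y * t.z
        d[1] = s[2] * t[0];  // s.z * t.x
        d[2] = s[0] * t[1];  // s.x * t.y
    }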
--- Core/MIPS/IR/IRCompVFPU.cpp | 52 ++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 9ab5e6d258..ef836a8321 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -66,10 +66,12 @@ namespace MIPSComp { return regs[1] == regs[0] + 1; } + static bool IsConsecutive3(const u8 regs[3]) { + return IsConsecutive2(regs) && regs[2] == regs[1] + 1; + } + static bool IsConsecutive4(const u8 regs[4]) { - return regs[1] == regs[0] + 1 && - regs[2] == regs[1] + 1 && - regs[3] == regs[2] + 1; + return IsConsecutive3(regs) && regs[3] == regs[2] + 1; } static bool IsVec2(VectorSize sz, const u8 regs[2]) { @@ -80,6 +82,10 @@ namespace MIPSComp { return sz == V_Quad && IsConsecutive4(regs) && (regs[0] & 3) == 0; } + static bool IsVec3of4(VectorSize sz, const u8 regs[4]) { + return sz == V_Triple && IsConsecutive3(regs) && (regs[0] & 3) == 0; + } + static bool IsMatrixVec4(MatrixSize sz, const u8 regs[16]) { if (sz != M_4x4) return false; @@ -1629,8 +1635,46 @@ namespace MIPSComp { // d[0] = s[y]*t[z], d[1] = s[z]*t[x], d[2] = s[x]*t[y] // To do a full cross product: vcrs tmp1, s, t; vcrs tmp2 t, s; vsub d, tmp1, tmp2; // (or just use vcrsp.) + // Note: this is possibly just a swizzle prefix hack for vmul. - DISABLE; + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + if (sz != V_Triple) + DISABLE; + + u8 sregs[4], dregs[4], tregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixT(tregs, sz, _VT); + GetVectorRegsPrefixD(dregs, sz, _VD); + + if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs)) { + // Use Vec4 where we can. First, apply shuffles. + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], VFPU_SWIZZLE(1, 2, 0, 3)); + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, tregs[0], VFPU_SWIZZLE(2, 0, 1, 3)); + ir.Write(IROp::Vec4Mul, IRVTEMP_0, IRVTEMP_PFX_S, IRVTEMP_PFX_T); + // Now just retain w and blend in our values. + ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 }); + } else { + u8 tempregs[4]{}; + if (!IsOverlapSafe(n, dregs, n, sregs, n, tregs)) { + for (int i = 0; i < n; ++i) + tempregs[i] = IRVTEMP_0 + i; + } else { + for (int i = 0; i < n; ++i) + tempregs[i] = dregs[i]; + } + + ir.Write(IROp::FMul, tempregs[0], sregs[1], tregs[2]); + ir.Write(IROp::FMul, tempregs[1], sregs[2], tregs[0]); + ir.Write(IROp::FMul, tempregs[2], sregs[0], tregs[1]); + + for (int i = 0; i < n; i++) { + if (tempregs[i] != dregs[i]) + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + } + + ApplyPrefixD(dregs, sz, _VD); } void IRFrontend::Comp_VDet(MIPSOpcode op) { From 2e6dbab5fa256c7e7c4be6eaae6c2f94449970b3 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 13 Aug 2023 13:52:45 -0700 Subject: [PATCH 3/5] irjit: Add flag to prefer Vec4, use for add/sub. This will improve things when using SIMD. 
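
The Vec3-of-4 paths added here compute the full 4-wide operation into a temp
and then blend lanes x/y/z into the destination. A minimal scalar model of the
assumed Vec4Blend semantics (mask bit i selecting src2's lane i; the helper
name is made up for illustration, not part of the IR):

    static void Vec4BlendSketch(float d[4], const float src1[4], const float src2[4], unsigned mask) {
        for (int i = 0; i < 4; ++i)
            d[i] = (mask & (1u << i)) ? src2[i] : src1[i];
    }
    // With mask 0x7, d.xyz takes the new result while d.w keeps its old value.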
--- Core/MIPS/IR/IRCompVFPU.cpp | 31 +++++++++++++++++++++---------- Core/MIPS/IR/IRInst.h | 1 + Core/MIPS/IR/IRJit.cpp | 11 +++++++++-- Core/MIPS/JitCommon/JitState.h | 1 + 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index ef836a8321..8ca94d0260 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -336,7 +336,7 @@ namespace MIPSComp { if (js.prefixD == 0) return; - if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0) { + if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0 && opts.preferVec4) { // Use temps for all, we'll blend in the end (keeping in Vec4.) for (int i = 0; i < 4; ++i) regs[i] = IRVTEMP_PFX_D + i; @@ -378,7 +378,7 @@ namespace MIPSComp { } void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) { - if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0) { + if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0 && opts.preferVec4) { u8 origV[4]; GetVectorRegs(origV, sz, vectorReg); @@ -815,7 +815,7 @@ namespace MIPSComp { } // If all three are consecutive 4, we're safe regardless of if we use temps so we should not check that here. - if (allowSIMD && IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) { + if (allowSIMD) { IROp opFunc = IROp::Nop; switch (type) { case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd @@ -835,13 +835,24 @@ namespace MIPSComp { break; } - if (opFunc != IROp::Nop) { - ir.Write(opFunc, dregs[0], sregs[0], tregs[0]); - } else { - DISABLE; + if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) { + if (opFunc != IROp::Nop) { + ir.Write(opFunc, dregs[0], sregs[0], tregs[0]); + } else { + DISABLE; + } + ApplyPrefixD(dregs, sz, _VD); + return; + } else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) { + // This is actually pretty common. Use a temp + blend. + // We could post-process this, but it's easier to do it here. + if (opFunc == IROp::Nop) + DISABLE; + ir.Write(opFunc, IRVTEMP_0, sregs[0], tregs[0]); + ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 }); + ApplyPrefixD(dregs, sz, _VD); + return; } - ApplyPrefixD(dregs, sz, _VD); - return; } if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) { @@ -1647,7 +1658,7 @@ namespace MIPSComp { GetVectorRegsPrefixT(tregs, sz, _VT); GetVectorRegsPrefixD(dregs, sz, _VD); - if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs)) { + if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) { // Use Vec4 where we can. First, apply shuffles. ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], VFPU_SWIZZLE(1, 2, 0, 3)); ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, tregs[0], VFPU_SWIZZLE(2, 0, 1, 3)); diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index dd93ed823b..85914d3e20 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -385,6 +385,7 @@ private: struct IROptions { uint32_t disableFlags; bool unalignedLoadStore; + bool preferVec4; }; const IRMeta *GetIRMeta(IROp op); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 3f33f52fec..0016904b68 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -50,9 +50,16 @@ IRJit::IRJit(MIPSState *mipsState) : frontend_(mipsState->HasDefaultPrefix()), m IROptions opts{}; opts.disableFlags = g_Config.uJitDisableFlags; - // Assume that RISC-V always has very slow unaligned memory accesses. 
-#if !PPSSPP_ARCH(RISCV64) +#if PPSSPP_ARCH(RISCV64) + // Assume RISC-V always has very slow unaligned memory accesses. + opts.unalignedLoadStore = false; + opts.preferVec4 = cpu_info.RiscV_V; +#elif PPSSPP_ARCH(ARM) opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0; + opts.preferVec4 = cpu_info.bASIMD || cpu_info.bNEON; +#else + opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0; + opts.preferVec4 = true; #endif frontend_.SetOptions(opts); } diff --git a/Core/MIPS/JitCommon/JitState.h b/Core/MIPS/JitCommon/JitState.h index 48d8f7540b..e453a33d8c 100644 --- a/Core/MIPS/JitCommon/JitState.h +++ b/Core/MIPS/JitCommon/JitState.h @@ -233,6 +233,7 @@ namespace MIPSComp { bool downcountInRegister; // ARM64 only bool useASIMDVFPU; + // ARM64 and RV64 bool useStaticAlloc; bool enablePointerify; From 5729de90d27b554b256440ea1fbf45f8a2cf5eb1 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 13 Aug 2023 17:52:00 -0700 Subject: [PATCH 4/5] irjit: Use more partial Vec4s / Vec4Blend. --- Core/MIPS/IR/IRCompVFPU.cpp | 61 +++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 8ca94d0260..9057f17647 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -710,10 +710,21 @@ namespace MIPSComp { GetVectorRegsPrefixT(tregs, sz, vt); GetVectorRegsPrefixD(dregs, V_Single, vd); - if (IsVec4(sz, sregs) && IsVec4(sz, tregs) && IsOverlapSafe(dregs[0], n, sregs, n, tregs)) { - ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]); - ApplyPrefixD(dregs, V_Single, vd); - return; + if (IsOverlapSafe(dregs[0], n, sregs, n, tregs)) { + if (IsVec4(sz, sregs) && IsVec4(sz, tregs)) { + ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]); + ApplyPrefixD(dregs, V_Single, vd); + return; + } else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) { + // Nice example of this in Fat Princess (US) in block 088181A0 (hot.) + // Create a temporary copy of S with the last element zeroed. + ir.Write(IROp::Vec4Init, IRVTEMP_0, (int)Vec4Init::AllZERO); + ir.Write({ IROp::Vec4Blend, IRVTEMP_0, IRVTEMP_0, sregs[0], 0x7 }); + // Now we can just dot like normal, with the last element effectively masked. + ir.Write(IROp::Vec4Dot, dregs[0], IRVTEMP_0, sregs[0] == tregs[0] ? IRVTEMP_0 : tregs[0]); + ApplyPrefixD(dregs, V_Single, vd); + return; + } } int temp0 = IRVTEMP_0; @@ -973,20 +984,34 @@ namespace MIPSComp { break; } - if (canSIMD && !usingTemps && IsVec4(sz, sregs) && IsVec4(sz, dregs)) { + if (canSIMD && !usingTemps) { + IROp irop = IROp::Nop; switch (optype) { case 0: // vmov - ir.Write(IROp::Vec4Mov, dregs[0], sregs[0]); + irop = IROp::Vec4Mov; break; case 1: // vabs - ir.Write(IROp::Vec4Abs, dregs[0], sregs[0]); + irop = IROp::Vec4Abs; break; case 2: // vneg - ir.Write(IROp::Vec4Neg, dregs[0], sregs[0]); + irop = IROp::Vec4Neg; break; } - ApplyPrefixD(dregs, sz, vd); - return; + if (IsVec4(sz, sregs) && IsVec4(sz, dregs) && irop != IROp::Nop) { + ir.Write(irop, dregs[0], sregs[0]); + ApplyPrefixD(dregs, sz, vd); + return; + } else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && irop != IROp::Nop && opts.preferVec4) { + // This is a simple case of vmov.t, just blend. 
+ if (irop == IROp::Vec4Mov) { + ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], sregs[0], 0x7 }); + } else { + ir.Write(irop, IRVTEMP_0, sregs[0]); + ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 }); + } + ApplyPrefixD(dregs, sz, vd); + return; + } } for (int i = 0; i < n; ++i) { @@ -1397,11 +1422,16 @@ namespace MIPSComp { } } - if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) { - if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) { + if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) { + if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) { ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg); ApplyPrefixD(dregs, sz, vd); return; + } else if (IsVec3of4(sz, sregs) && IsVec3of4(sz, dregs) && opts.preferVec4) { + ir.Write(IROp::Vec4Scale, IRVTEMP_0, sregs[0], treg); + ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 }); + ApplyPrefixD(dregs, sz, vd); + return; } } @@ -2097,6 +2127,10 @@ namespace MIPSComp { if (IsVec4(sz, dregs)) { ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum])); ir.Write(IROp::Vec4Shuffle, dregs[0], IRVTEMP_0, 0); + } else if (IsVec3of4(sz, dregs) && opts.preferVec4) { + ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum])); + ir.Write(IROp::Vec4Shuffle, IRVTEMP_0, IRVTEMP_0, 0); + ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 }); } else { for (int i = 0; i < n; i++) { // Most of the time, materializing a float is slower than copying from another float. @@ -2247,6 +2281,9 @@ namespace MIPSComp { if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) { ir.Write(IROp::Vec4Add, dregs[0], tregs[0], sregs[0]); + } else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) { + ir.Write(IROp::Vec4Add, IRVTEMP_0, tregs[0], sregs[0]); + ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 }); } else { u8 tempregs[4]; for (int i = 0; i < n; ++i) { From 159b41a0fa1e8084dea9c895ddc3cfb906b0e336 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 13 Aug 2023 17:56:17 -0700 Subject: [PATCH 5/5] irjit: Fuse unaligned svl.q/svr.q together. They're almost never used outside paired, which we can do on most platforms easily. --- Core/MIPS/IR/IRCompVFPU.cpp | 45 +++++++++++++++++++++++++++++++++---- Core/MIPS/IR/IRInst.h | 1 + Core/MIPS/IR/IRJit.cpp | 4 ++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 9057f17647..e021ccbe12 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -424,8 +424,42 @@ namespace MIPSComp { CheckMemoryBreakpoint(rs, imm); + enum class LSVType { + INVALID, + LVQ, + SVQ, + LVLQ, + LVRQ, + SVLQ, + SVRQ, + }; + + LSVType optype = LSVType::INVALID; switch (op >> 26) { - case 54: //lv.q + case 54: optype = LSVType::LVQ; break; // lv.q + case 62: optype = LSVType::SVQ; break; // sv.q + case 53: // lvl/lvr.q - highly unusual + optype = (op & 2) == 0 ? LSVType::LVLQ : LSVType::LVRQ; + break; + case 61: // svl/svr.q - highly unusual + optype = (op & 2) == 0 ? LSVType::SVLQ : LSVType::SVRQ; + break; + } + if (optype == LSVType::INVALID) + INVALIDOP; + + if ((optype == LSVType::LVRQ || optype == LSVType::SVRQ) && opts.unalignedLoadStoreVec4) { + // We don't bother with an op for this, but we do fuse unaligned stores which happen. + MIPSOpcode nextOp = GetOffsetInstruction(1); + if ((nextOp.encoding ^ op.encoding) == 0x0000000E) { + // Okay, it's an svr.q/svl.q pair, same registers. 
Treat as lv.q/sv.q. + EatInstruction(nextOp); + optype = optype == LSVType::LVRQ ? LSVType::LVQ : LSVType::SVQ; + } + } + + switch (optype) { + case LSVType::LVQ: if (IsVec4(V_Quad, vregs)) { ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm)); } else { @@ -439,7 +473,7 @@ namespace MIPSComp { } break; - case 62: //sv.q + case LSVType::SVQ: if (IsVec4(V_Quad, vregs)) { ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm)); } else { @@ -453,8 +487,11 @@ namespace MIPSComp { } break; - case 53: // lvl/lvr.q - highly unusual - case 61: // svl/svr.q - highly unusual + case LSVType::LVLQ: + case LSVType::LVRQ: + case LSVType::SVLQ: + case LSVType::SVRQ: + // These are pretty uncommon unless paired. DISABLE; break; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 85914d3e20..d8935b9e35 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -385,6 +385,7 @@ private: struct IROptions { uint32_t disableFlags; bool unalignedLoadStore; + bool unalignedLoadStoreVec4; bool preferVec4; }; diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 0016904b68..44621c2dd5 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -53,12 +53,16 @@ IRJit::IRJit(MIPSState *mipsState) : frontend_(mipsState->HasDefaultPrefix()), m #if PPSSPP_ARCH(RISCV64) // Assume RISC-V always has very slow unaligned memory accesses. opts.unalignedLoadStore = false; + opts.unalignedLoadStoreVec4 = true; opts.preferVec4 = cpu_info.RiscV_V; #elif PPSSPP_ARCH(ARM) opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0; + opts.unalignedLoadStoreVec4 = true; opts.preferVec4 = cpu_info.bASIMD || cpu_info.bNEON; #else opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0; + // TODO: Could allow on x86 pretty easily... + opts.unalignedLoadStoreVec4 = false; opts.preferVec4 = true; #endif frontend_.SetOptions(opts);