diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp
index 2bb96e7549..e67b93cddd 100644
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@@ -16,6 +16,7 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include <cmath>
+
 #include "math/math_util.h"
 
 #include "Core/MemMap.h"
@@ -57,7 +58,9 @@ namespace MIPSComp {
 	}
 
 	static bool IsConsecutive4(const u8 regs[4]) {
-		return (regs[1] == regs[0] + 1 && regs[2] == regs[1] + 1 && regs[3] == regs[2] + 1);
+		// Each index must be exactly one above its predecessor.
+		const int first = regs[0];
+		return regs[1] == first + 1 && regs[2] == first + 2 && regs[3] == first + 3;
 	}
 
 	void IRFrontend::Comp_VPFX(MIPSOpcode op)	{
@@ -244,15 +247,79 @@ namespace MIPSComp {
 	}
 
 	void IRFrontend::Comp_VVectorInit(MIPSOpcode op) {
-		DISABLE;
+		// Fills the destination with a constant: 0.0f when type is 6, 1.0f otherwise.
+		if (!js.HasNoPrefix())
+			DISABLE;
+		VectorSize sz = GetVecSize(op);
+		int type = (op >> 16) & 0xF;
+		int vd = _VD;
+		// Compare against the named sizes (as Comp_VIdt does), not bare 4 and 1.
+		if (sz == V_Quad && IsVectorColumn(vd)) {
+			u8 dregs[4];
+			GetVectorRegs(dregs, sz, vd);
+			ir.Write(IROp::InitVec4, voffset[dregs[0]], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE));
+		} else if (sz == V_Single) {
+			ir.Write(IROp::SetConstV, voffset[vd], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f));
+		} else {
+			DISABLE;
+		}
 	}
 
 	void IRFrontend::Comp_VIdt(MIPSOpcode op) {
-		DISABLE;
+		// Only the unprefixed column-quad form is emitted as IR; anything else
+		// takes the DISABLE path.
+		const int vd = _VD;
+		const VectorSize sz = GetVecSize(op);
+		if (!js.HasNoPrefix() || sz != V_Quad || !IsVectorColumn(vd))
+			DISABLE;
+
+		// Resolve the destination registers; only the first one is passed on,
+		// as the base of the InitVec4 write.
+		u8 dregs[4];
+		GetVectorRegs(dregs, sz, vd);
+
+		// The low two bits of vd select which of the Set_1000..Set_0001
+		// patterns is written.
+		const int lane = vd & 3;
+		ir.Write(IROp::InitVec4, voffset[dregs[0]], (int)Vec4Init::Set_1000 + lane);
 	}
 
 	void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) {
-		DISABLE;
+		// Writes one InitVec4 per column of a 4x4 matrix (3 = identity, 6 = zeros, 7 = ones).
+		// NOTE(review): no js.HasNoPrefix() guard here, unlike the vector init ops - confirm.
+		MatrixSize sz = GetMtxSize(op);
+		if (sz != M_4x4)
+			DISABLE;
+		// The init kind is the same for every column, so decode it once up front.
+		int type = (op >> 16) & 0xF;
+		Vec4Init init;
+		switch (type) {
+		case 3:
+			init = Vec4Init::Set_1000;  // Column i uses Set_1000 + i, see the loop.
+			break;
+		case 6:
+			init = Vec4Init::AllZERO;
+			break;
+		case 7:
+			init = Vec4Init::AllONE;
+			break;
+		default:  // Unknown kind: fall back rather than silently emitting nothing.
+			DISABLE;
+		}
+		VectorSize vsz = GetVectorSize(sz);
+		u8 vecs[4];
+		int vd = _VD;
+		if (IsMatrixTransposed(vd)) {
+			// All outputs are transpositionally symmetric, so should be fine.
+			vd = TransposeMatrixReg(vd);
+		}
+		GetMatrixColumns(vd, M_4x4, vecs);
+		for (int i = 0; i < 4; i++) {
+			u8 vec[4];
+			GetVectorRegs(vec, vsz, vecs[i]);
+			// As they are columns, they will be nicely consecutive.
+			ir.Write(IROp::InitVec4, voffset[vec[0]], (int)init + (type == 3 ? i : 0));
+		}
 	}
 
 	void IRFrontend::Comp_VHdp(MIPSOpcode op) {
@@ -275,7 +342,7 @@ namespace MIPSComp {
 
 	void IRFrontend::Comp_VV2Op(MIPSOpcode op) {
 		CONDITIONAL_DISABLE;
-		// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
+		// Eliminate silly no-op VMOVs, common in Wipeout Pure
 		if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
 			return;
 		}
@@ -379,7 +446,12 @@ namespace MIPSComp {
 	}
 
 	void IRFrontend::Comp_Viim(MIPSOpcode op) {
-		DISABLE;
+		// Stores the sign-extended low 16 bits of op, converted to float, at voffset[_VT].
+		if (!js.HasNoPrefix())
+			DISABLE;
+		const s16 rawImm = (s16)(u16)(op & 0xFFFF);
+		const u8 vt = _VT;
+		ir.Write(IROp::SetConstV, voffset[vt], ir.AddConstantFloat((float)(s32)rawImm));
 	}
 
 	void IRFrontend::Comp_Vfim(MIPSOpcode op) {
diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp
index d82e72ccdb..e9bc55ab78 100644
--- a/Core/MIPS/IR/IRInst.cpp
+++ b/Core/MIPS/IR/IRInst.cpp
@@ -85,9 +85,18 @@ static const IRMeta irMeta[] = {
 	{ IROp::FMovToGPR, "FMovToGPR", "GF" },
 	{ IROp::VMovFromGPR, "VMovFromGPR", "VG" },
 	{ IROp::VMovToGPR, "VMovToGPR", "GV" },
+	{ IROp::InitVec4, "InitVec4", "Vv"},
 	{ IROp::FpCondToReg, "FpCondToReg", "G" },
 	{ IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" },
 	{ IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" },
+
+	{ IROp::VSin, "VSin", "VV" },
+	{ IROp::VCos, "VCos", "VV" },
+	{ IROp::VSqrt, "VSqrt", "VV" },
+	{ IROp::VRSqrt, "VRSqrt", "VV" },
+	{ IROp::VRecip, "VRecip", "VV" },
+	{ IROp::VAsin, "VAsin", "VV" },
+
 	{ IROp::Interpret, "Interpret", "_C" },
 	{ IROp::Downcount, "Downcount", "_II" },
 	{ IROp::ExitToConst, "Exit", "C" },
@@ -177,6 +186,15 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co
 		"RCX6",
 		"RCX7",
 	};
+	static const char *initVec4Names[8] = {  // Indexed by the 'v' param; only 7 names are listed, so slot 7 is null.
+		"[0 0 0 0]",  // Vec4Init::AllZERO
+		"[1 1 1 1]",  // Vec4Init::AllONE
+		"[-1 -1 -1 -1]",  // Vec4Init::AllMinusONE
+		"[1 0 0 0]",  // Vec4Init::Set_1000
+		"[0 1 0 0]",  // Vec4Init::Set_0100
+		"[0 0 1 0]",  // Vec4Init::Set_0010
+		"[0 0 0 1]",  // Vec4Init::Set_0001
+	};
 
 	switch (type) {
 	case 'G':
@@ -197,6 +215,9 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co
 	case 'T':
 		snprintf(buf, bufSize, "%s", vfpuCtrlNames[param]);
 		break;
+	case 'v':  // Vec4Init selector; slot 7 of the table is null and a u8 can exceed the table size.
+		snprintf(buf, bufSize, "%s", (param < 8 && initVec4Names[param]) ? initVec4Names[param] : "[invalid]");
+		break;
 	case '_':
 	case '\0':
 		buf[0] = 0;
diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h
index 70f0e0ff6e..e2c0f6644a 100644
--- a/Core/MIPS/IR/IRInst.h
+++ b/Core/MIPS/IR/IRInst.h
@@ -144,6 +144,16 @@ enum class IROp : u8 {
 	VMovFromGPR,
 	VMovToGPR,
 
+	InitVec4,
+
+	// Slow special functions. Used on singles.
+	VSin,
+	VCos,
+	VSqrt,
+	VRSqrt,
+	VRecip,
+	VAsin,
+
 	// Fake/System instructions
 	Interpret,
 
@@ -181,6 +191,17 @@ enum IRComparison {
 	Bad,
 };
 
+// Some common vec4 constants. The order must match the vec4InitValues and initVec4Names tables, which are indexed by this value.
+enum class Vec4Init {
+	AllZERO,  // { 0, 0, 0, 0 }
+	AllONE,  // { 1, 1, 1, 1 }
+	AllMinusONE,  // { -1, -1, -1, -1 }
+	Set_1000,  // { 1, 0, 0, 0 }
+	Set_0100,  // { 0, 1, 0, 0 }
+	Set_0010,  // { 0, 0, 1, 0 }
+	Set_0001,  // { 0, 0, 0, 1 }
+};
+
 // Hm, unused
 inline IRComparison Invert(IRComparison comp) {
 	switch (comp) {
diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index 63e0bd533e..2a601bb8f8 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -6,6 +6,7 @@
 #include "Core/HLE/HLE.h"
 #include "Core/HLE/ReplaceTables.h"
 #include "Core/MIPS/MIPSTables.h"
+#include "Core/MIPS/MIPSVFPUUtils.h"
 
 #include "math/math_util.h"
 #include "Common/CommonTypes.h"
@@ -14,6 +15,16 @@
 #include "Core/MIPS/IR/IRInst.h"
 #include "Core/MIPS/IR/IRInterpreter.h"
 
+alignas(16) float vec4InitValues[8][4] = {  // Row index is the Vec4Init value carried in InitVec4's src1; row 7 is not listed and stays zero-filled.
+	{ 0.0f, 0.0f, 0.0f, 0.0f },  // Vec4Init::AllZERO
+	{ 1.0f, 1.0f, 1.0f, 1.0f },  // Vec4Init::AllONE
+	{ -1.0f, -1.0f, -1.0f, -1.0f },  // Vec4Init::AllMinusONE
+	{ 1.0f, 0.0f, 0.0f, 0.0f },  // Vec4Init::Set_1000
+	{ 0.0f, 1.0f, 0.0f, 0.0f },  // Vec4Init::Set_0100
+	{ 0.0f, 0.0f, 1.0f, 0.0f },  // Vec4Init::Set_0010
+	{ 0.0f, 0.0f, 0.0f, 1.0f },  // Vec4Init::Set_0001
+};
+
 u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count) {
 	const IRInst *end = inst + count;
 	while (inst != end) {
@@ -134,6 +145,33 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c
 			break;
 		}
 
+		case IROp::InitVec4:  // Copy the 4-float pattern selected by src1 into v[dest..dest+3].
+#if defined(_M_SSE)
+			_mm_store_ps(&mips->v[inst->dest], _mm_load_ps(vec4InitValues[inst->src1]));  // NOTE(review): aligned store - assumes &v[dest] is 16-byte aligned; confirm.
+#else
+			memcpy(&mips->v[inst->dest], vec4InitValues[inst->src1], 4 * sizeof(float));  // No loop index exists here; write at dest itself.
+#endif
+			break;
+
+		case IROp::VSin:
+			mips->v[inst->dest] = vfpu_sin(mips->v[inst->src1]);
+			break;
+		case IROp::VCos:
+			mips->v[inst->dest] = vfpu_cos(mips->v[inst->src1]);
+			break;
+		case IROp::VSqrt:
+			mips->v[inst->dest] = sqrtf(mips->v[inst->src1]);
+			break;
+		case IROp::VRSqrt:
+			mips->v[inst->dest] = 1.0f / sqrtf(mips->v[inst->src1]);
+			break;
+		case IROp::VRecip:
+			mips->v[inst->dest] = 1.0f / mips->v[inst->src1];
+			break;
+		case IROp::VAsin:
+			mips->v[inst->dest] = vfpu_asin(mips->v[inst->src1]);
+			break;
+
 		case IROp::ShlImm:
 			mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2;
 			break;
diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp
index 661543a9d0..fb49026855 100644
--- a/Core/MIPS/IR/IRJit.cpp
+++ b/Core/MIPS/IR/IRJit.cpp
@@ -114,9 +114,9 @@ void IRJit::RunLoopUntil(u64 globalticks) {
 		}
 		while (mips_->downcount >= 0) {
 			u32 inst = Memory::ReadUnchecked_U32(mips_->pc);
-			u32 opcode = inst >> 24;
-			u32 data = inst & 0xFFFFFF;
-			if (opcode == (MIPS_EMUHACK_OPCODE >> 24)) {
+			u32 opcode = inst & 0xFF000000;
+			if (opcode == MIPS_EMUHACK_OPCODE) {
+				u32 data = inst & 0xFFFFFF;
 				IRBlock *block = blocks_.GetBlock(data);
 				mips_->pc = IRInterpret(mips_, block->GetInstructions(), block->GetConstants(), block->GetNumInstructions());
 			} else {
diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp
index 50bfca8903..d7c93593f9 100644
--- a/Core/MIPS/IR/IRPassSimplify.cpp
+++ b/Core/MIPS/IR/IRPassSimplify.cpp
@@ -340,8 +340,13 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) {
 		// FP-only instructions don't need to flush immediates.
 		case IROp::FAdd:
 		case IROp::FMul:
-		case IROp::FDiv:
+			// Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1)
+			if (inst.src2 == inst.dest && inst.src1 != inst.src2)
+				std::swap(inst.src1, inst.src2);
+			out.Write(inst);
+			break;
 		case IROp::FSub:
+		case IROp::FDiv:
 		case IROp::FNeg:
 		case IROp::FAbs:
 		case IROp::FSqrt:
@@ -373,6 +378,19 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) {
 			}
 			break;
 
+		case IROp::InitVec4:
+			out.Write(inst);
+			break;
+
+		case IROp::VSin:
+		case IROp::VCos:
+		case IROp::VSqrt:
+		case IROp::VRSqrt:
+		case IROp::VRecip:
+		case IROp::VAsin:
+			out.Write(inst);
+			break;
+
 		case IROp::ZeroFpCond:
 		case IROp::FCmpUnordered:
 		case IROp::FCmpEqual:
diff --git a/Core/MIPS/MIPSVFPUUtils.h b/Core/MIPS/MIPSVFPUUtils.h
index bb8403217f..7f6ada0fa2 100644
--- a/Core/MIPS/MIPSVFPUUtils.h
+++ b/Core/MIPS/MIPSVFPUUtils.h
@@ -45,6 +45,10 @@ inline float vfpu_cos(float angle) {
 	return cosf(angle);
 }
 
+inline float vfpu_asin(float angle) {
+	return (float)(asinf(angle) / M_PI_2);  // asinf() result divided by M_PI_2; the narrowing to float is spelled out.
+}
+
 inline void vfpu_sincos(float angle, float &sine, float &cosine) {
 	angle -= floorf(angle * 0.25f) * 4.f;
 	angle *= (float)M_PI_2;
@@ -127,7 +131,15 @@ int GetNumVectorElements(VectorSize sz);
 int GetMatrixSide(MatrixSize sz);
 const char *GetVectorNotation(int reg, VectorSize size);
 const char *GetMatrixNotation(int reg, MatrixSize size);
-
+inline bool IsMatrixTransposed(int matrixReg) {
+	return (matrixReg & 0x20) != 0;  // True when bit 5 of the register number is set.
+}
+inline bool IsVectorColumn(int vectorReg) {
+	return (vectorReg & 0x20) == 0;  // True when bit 5 of the register number is clear.
+}
+inline int TransposeMatrixReg(int matrixReg) {
+	return matrixReg ^ (1 << 5);  // Toggle bit 5, the bit IsMatrixTransposed() tests.
+}
 int GetVectorOverlap(int reg1, VectorSize size1, int reg2, VectorSize size2);
 
 float Float16ToFloat32(unsigned short l);
diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index fcc51c3646..749967f53a 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -101,8 +101,7 @@ void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
 	for (int i = 0; i < n; i++)
 		origV[i] = vregs[i];
 
-	for (int i = 0; i < n; i++)
-	{
+	for (int i = 0; i < n; i++) {
 		int regnum = (prefix >> (i*2)) & 3;
 		int abs    = (prefix >> (8+i)) & 1;
 		int negate = (prefix >> (16+i)) & 1;
@@ -2142,7 +2141,7 @@ void CosOnly(SinCosArg angle) {
 }
 
 void ASinScaled(SinCosArg angle) {
-	sincostemp[0] = asinf(angle) / M_PI_2;
+	sincostemp[0] = vfpu_asin(angle);  // Shared helper; it evaluates the same asinf(x) / M_PI_2 expression as the removed line.
 }
 
 void SinCosNegSin(SinCosArg angle) {