More VFPU

2025-04-02 11:01:50 -04:00 · 2016-05-09 23:47:56 +02:00 · 2016-05-09 23:47:56 +02:00 · 558bb197c7
commit 558bb197c7
parent a5d5c5ce2b
11 changed files with 105 additions and 40 deletions
--- a/Core/MIPS/IR/IRCompALU.cpp
+++ b/Core/MIPS/IR/IRCompALU.cpp
@ -19,11 +19,9 @@

 #include "Core/MIPS/MIPS.h"
 #include "Core/MIPS/MIPSCodeUtils.h"
-#include "Core/MIPS/IR/IRJit.h"
+#include "Core/MIPS/IR/IRFrontend.h"
 #include "Common/CPUDetect.h"

-using namespace MIPSAnalyst;
-
 #define _RS MIPS_GET_RS(op)
 #define _RT MIPS_GET_RT(op)
 #define _RD MIPS_GET_RD(op)
--- a/Core/MIPS/IR/IRCompBranch.cpp
+++ b/Core/MIPS/IR/IRCompBranch.cpp
@ -28,7 +28,7 @@
 #include "Core/MIPS/MIPSAnalyst.h"
 #include "Core/MIPS/MIPSTables.h"

-#include "Core/MIPS/IR/IRJit.h"
+#include "Core/MIPS/IR/IRFrontend.h"
 #include "Core/MIPS/JitCommon/JitBlockCache.h"

 #include "Common/Arm64Emitter.h"
--- a/Core/MIPS/IR/IRCompFPU.cpp
+++ b/Core/MIPS/IR/IRCompFPU.cpp
@ -21,7 +21,7 @@
 #include "Core/MIPS/MIPSCodeUtils.h"
 #include "Core/MIPS/MIPSTables.h"

-#include "Core/MIPS/IR/IRJit.h"
+#include "Core/MIPS/IR/IRFrontend.h"
 #include "Core/MIPS/IR/IRRegCache.h"
 #include "Common/CPUDetect.h"

--- a/Core/MIPS/IR/IRCompLoadStore.cpp
+++ b/Core/MIPS/IR/IRCompLoadStore.cpp
@ -42,7 +42,7 @@
 #include "Core/MIPS/MIPS.h"
 #include "Core/MIPS/MIPSAnalyst.h"
 #include "Core/MIPS/MIPSCodeUtils.h"
-#include "Core/MIPS/IR/IRJit.h"
+#include "Core/MIPS/IR/IRFrontend.h"
 #include "Core/MIPS/IR/IRRegCache.h"

 #define _RS MIPS_GET_RS(op)
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@ -27,7 +27,7 @@
 #include "Core/Config.h"
 #include "Core/Reporting.h"

-#include "Core/MIPS/IR/IRJit.h"
+#include "Core/MIPS/IR/IRFrontend.h"
 #include "Core/MIPS/IR/IRRegCache.h"

 // All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
@ -50,6 +50,15 @@
 #define _IMM26 (op & 0x03FFFFFF)

 namespace MIPSComp {
+	static void ApplyVoffset(u8 regs[4], int count) {
+		for (int i = 0; i < count; i++) {
+			regs[i] = voffset[regs[i]];
+		}
+	}
+
+	static bool IsConsecutive4(const u8 regs[4]) {
+		return (regs[1] == regs[0] + 1 && regs[2] == regs[1] + 1 && regs[3] == regs[2] + 1);
+	}

 	void IRFrontend::Comp_VPFX(MIPSOpcode op)	{
 		CONDITIONAL_DISABLE;
@ -177,7 +186,21 @@ namespace MIPSComp {
 	}

 	void IRFrontend::Comp_SV(MIPSOpcode op) {
-		DISABLE;
+		s32 offset = (signed short)(op & 0xFFFC);
+		int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
+		MIPSGPReg rs = _RS;
+		switch (op >> 26) {
+		case 50: //lv.s
+			ir.Write(IROp::LoadFloatV, voffset[vt], rs, ir.AddConstant(offset));
+			break;
+
+		case 58: //sv.s
+			ir.Write(IROp::StoreFloatV, voffset[vt], rs, ir.AddConstant(offset));
+			break;
+
+		default:
+			DISABLE;
+		}
 	}

 	void IRFrontend::Comp_SVQ(MIPSOpcode op) {
@ -187,27 +210,32 @@ namespace MIPSComp {

 		u8 vregs[4];
 		GetVectorRegs(vregs, V_Quad, vt);
+		ApplyVoffset(vregs, 4);  // Translate to memory order

 		switch (op >> 26) {
 		case 54: //lv.q
-		{
-			// TODO: Add vector load/store instruction to the IR
-			ir.Write(IROp::LoadFloatV, voffset[vregs[0]], rs, ir.AddConstant(imm));
-			ir.Write(IROp::LoadFloatV, voffset[vregs[1]], rs, ir.AddConstant(imm + 4));
-			ir.Write(IROp::LoadFloatV, voffset[vregs[2]], rs, ir.AddConstant(imm + 8));
-			ir.Write(IROp::LoadFloatV, voffset[vregs[3]], rs, ir.AddConstant(imm + 12));
-		}
-		break;
+			if (IsConsecutive4(vregs)) {
+				ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm));
+			} else {
+				// Registers aren't consecutive in memory order ("vertical" access): just do four scalar loads for now.
+				ir.Write(IROp::LoadFloatV, vregs[0], rs, ir.AddConstant(imm));
+				ir.Write(IROp::LoadFloatV, vregs[1], rs, ir.AddConstant(imm + 4));
+				ir.Write(IROp::LoadFloatV, vregs[2], rs, ir.AddConstant(imm + 8));
+				ir.Write(IROp::LoadFloatV, vregs[3], rs, ir.AddConstant(imm + 12));
+			}
+			break;

 		case 62: //sv.q
-		{
-			// CC might be set by slow path below, so load regs first.
-			ir.Write(IROp::StoreFloatV, voffset[vregs[0]], rs, ir.AddConstant(imm));
-			ir.Write(IROp::StoreFloatV, voffset[vregs[1]], rs, ir.AddConstant(imm + 4));
-			ir.Write(IROp::StoreFloatV, voffset[vregs[2]], rs, ir.AddConstant(imm + 8));
-			ir.Write(IROp::StoreFloatV, voffset[vregs[3]], rs, ir.AddConstant(imm + 12));
-		}
-		break;
+			if (IsConsecutive4(vregs)) {
+				ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm));
+			} else {
+				// Registers aren't consecutive in memory order ("vertical" access): just do four scalar stores for now.
+				ir.Write(IROp::StoreFloatV, vregs[0], rs, ir.AddConstant(imm));
+				ir.Write(IROp::StoreFloatV, vregs[1], rs, ir.AddConstant(imm + 4));
+				ir.Write(IROp::StoreFloatV, vregs[2], rs, ir.AddConstant(imm + 8));
+				ir.Write(IROp::StoreFloatV, vregs[3], rs, ir.AddConstant(imm + 12));
+			}
+			break;

 		default:
 			DISABLE;
--- a/Core/MIPS/IR/IRFrontend.cpp
+++ b/Core/MIPS/IR/IRFrontend.cpp
@ -236,8 +236,8 @@ void IRFrontend::DoJit(u32 em_address, std::vector<IRInst> &instructions, std::v
 		if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified))
 			logBlocks = 1;
 		code = &simplified;
-		if (ir.GetInstructions().size() >= 24)
-			logBlocks = 1;
+		//if (ir.GetInstructions().size() >= 24)
+		//	logBlocks = 1;
 	}

 	instructions = code->GetInstructions();
--- a/Core/MIPS/IR/IRInst.cpp
+++ b/Core/MIPS/IR/IRInst.cpp
@ -60,11 +60,13 @@ static const IRMeta irMeta[] = {
 	{ IROp::Load32, "Load32", "GGC" },
 	{ IROp::LoadFloat, "LoadFloat", "FGC" },
 	{ IROp::LoadFloatV, "LoadFloatV", "VGC" },
+	{ IROp::LoadVec4, "LoadVec4", "VGC" },
 	{ IROp::Store8, "Store8", "GGC" },
 	{ IROp::Store16, "Store16", "GGC" },
 	{ IROp::Store32, "Store32", "GGC" },
 	{ IROp::StoreFloat, "StoreFloat", "FGC" },
 	{ IROp::StoreFloatV, "StoreFloatV", "VGC" },
+	{ IROp::StoreVec4, "StoreVec4", "VGC" },
 	{ IROp::FAdd, "FAdd", "FFF" },
 	{ IROp::FSub, "FSub", "FFF" },
 	{ IROp::FMul, "FMul", "FFF" },
--- a/Core/MIPS/IR/IRInst.h
+++ b/Core/MIPS/IR/IRInst.h
@ -90,12 +90,14 @@ enum class IROp : u8 {
 	Load32,
 	LoadFloat,
 	LoadFloatV,
+	LoadVec4,

 	Store8,
 	Store16,
 	Store32,
 	StoreFloat,
 	StoreFloatV,
+	StoreVec4,

 	Ext8to32,
 	Ext16to32,
@ -212,13 +214,16 @@ enum {
 	IRTEMP_LHS,  // Reserved for use in branches
 	IRTEMP_RHS,  // Reserved for use in branches

+	// 16 float temps for vector S and T prefixes and things like that.
+	// IRVTEMP_0 = 208 - 64,  // -64 to be relative to v[0]
+
 	// Hacky way to get to other state
-	IRREG_VPFU_CTRL_BASE = 208,
-	IRREG_VPFU_CC = 211,
+	IRREG_VFPU_CTRL_BASE = 208,
+	IRREG_VFPU_CC = 211,
 	IRREG_LO = 226,  // offset of lo in MIPSState / 4
 	IRREG_HI = 227,
 	IRREG_FCR31 = 228,
-	IRREG_FPCOND = 229
+	IRREG_FPCOND = 229,
 };

 struct IRMeta {
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@ -1,3 +1,7 @@
+#ifdef _M_SSE
+#include <smmintrin.h>
+#endif
+
 #include "Core/MemMap.h"
 #include "Core/HLE/HLE.h"
 #include "Core/HLE/ReplaceTables.h"
@ -107,6 +111,29 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c
 			Memory::WriteUnchecked_Float(mips->v[inst->src3], mips->r[inst->src1] + constPool[inst->src2]);
 			break;

+		case IROp::LoadVec4:
+		{
+			u32 base = mips->r[inst->src1] + constPool[inst->src2];
+#if defined(_M_SSE)
+			_mm_store_ps(&mips->v[inst->dest], _mm_load_ps((const float *)Memory::GetPointerUnchecked(base)));
+#else
+			for (int i = 0; i < 4; i++)
+				mips->v[inst->dest + i] = Memory::ReadUnchecked_Float(base + 4 * i);
+#endif
+			break;
+		}
+		case IROp::StoreVec4:
+		{
+			u32 base = mips->r[inst->src1] + constPool[inst->src2];
+#if defined(_M_SSE)
+			_mm_store_ps((float *)Memory::GetPointerUnchecked(base), _mm_load_ps(&mips->v[inst->dest]));
+#else
+			for (int i = 0; i < 4; i++)
+				Memory::WriteUnchecked_Float(mips->v[inst->dest + i], base + 4 * i);
+#endif
+			break;
+		}
+
 		case IROp::ShlImm:
 			mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2;
 			break;
--- a/Core/MIPS/IR/IRPassSimplify.cpp
+++ b/Core/MIPS/IR/IRPassSimplify.cpp
@ -291,6 +291,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) {
 			break;
 		case IROp::StoreFloat:
 		case IROp::StoreFloatV:
+		case IROp::StoreVec4:
 			if (gpr.IsImm(inst.src1)) {
 				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2]));
 			} else {
@ -314,6 +315,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) {
 			break;
 		case IROp::LoadFloat:
 		case IROp::LoadFloatV:
+		case IROp::LoadVec4:
 			if (gpr.IsImm(inst.src1)) {
 				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2]));
 			} else {
@ -388,7 +390,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) {
 			goto doDefault;

 		case IROp::VfpuCtrlToReg:
-			gpr.MapDirtyIn(inst.dest, IRREG_VPFU_CTRL_BASE + inst.src1);
+			gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
 			goto doDefault;

 		case IROp::Syscall:
--- a/Core/MIPS/MIPS.h
+++ b/Core/MIPS/MIPS.h
@ -86,6 +86,7 @@ enum MIPSGPReg {
 	MIPS_REG_RA=31,

 	// Not real regs, just for convenience/jit mapping.
+	// NOTE: These are not the same as the register offsets the IR has to use (see the IRREG_* values in IRInst.h)!
 	MIPS_REG_HI = 32,
 	MIPS_REG_LO = 33,
 	MIPS_REG_FPCOND = 34,
@ -155,7 +156,7 @@ public:

 	void DoState(PointerWrap &p);

-	// MUST start with r and be followed by f!
+	// MUST start with r and be followed by f, v, and t!
 	u32 r[32];
 	union {
 		float f[32];
@ -166,23 +167,25 @@ public:
 		float v[128];
 		u32 vi[128];
 	};
-	// Used for temporary variables by IR Interpreter.
-	// Can be indexed through r[] using indices 192+.
-	u32 t[16];

-	// Temps don't get flushed so we don't reserve space for them.
+	// Register-allocated JIT temps don't get flushed, so we don't reserve space for them.
+	// However, the IR interpreter needs some temps that can stick around between ops.
+	// Those live in t[], which can be indexed through r[] using indices 192+.
+	u32 t[16];     //192
+	// float vt[16];  //208  TODO: VFPU temp
+
 	// If vfpuCtrl (prefixes) get mysterious values, check the VFPU regcache code.
-	u32 vfpuCtrl[16];
+	u32 vfpuCtrl[16]; // 208

 	// ARM64 wants lo/hi to be aligned to 64 bits from the base of this struct.
-	u32 padLoHi;
+	u32 padLoHi;    // 224

 	union {
 		struct {
-			u32 pc;
+			u32 pc;   //225

-			u32 lo;  // offset 192 + 16 + 16 + 1 + 1
-			u32 hi;
+			u32 lo;   //226
+			u32 hi;   //227

 			u32 fcr31; //fpu control register
 			u32 fpcond;  // cache the cond flag of fcr31  (& 1 << 23)