Merge some matrix utils and stuff from the NEON branch

2025-04-02 11:01:50 -04:00 · 2014-11-29 11:37:45 +01:00 · 2014-11-29 11:37:45 +01:00 · 8f016d3e48
commit 8f016d3e48
parent 66d74981b5
12 changed files with 552 additions and 176 deletions
--- a/Core/MIPS/JitCommon/JitBlockCache.h
+++ b/Core/MIPS/JitCommon/JitBlockCache.h
@ -37,28 +37,23 @@ namespace std {
 #if defined(ARM)
 #include "Common/ArmEmitter.h"
 namespace ArmGen { class ARMXEmitter; }
-using namespace ArmGen;
 typedef ArmGen::ARMXCodeBlock CodeBlock;
 #elif defined(_M_IX86) || defined(_M_X64)
 #include "Common/x64Emitter.h"
 namespace Gen { class XEmitter; }
-using namespace Gen;
 typedef Gen::XCodeBlock CodeBlock;
 #elif defined(PPC)
 #include "Common/ppcEmitter.h"
 namespace PpcGen { class PPCXEmitter; }
-using namespace PpcGen;
 typedef PpcGen::PPCXCodeBlock CodeBlock;
 #elif defined(MIPS)
 #include "Common/MipsEmitter.h"
 namespace MIPSGen { class MIPSEmitter; }
-using namespace MIPSGen;
 typedef MIPSGen::MIPSCodeBlock CodeBlock;
 #else
 #warning "Unsupported arch!"
 #include "Common/FakeEmitter.h"
 namespace FakeGen { class FakeXEmitter; }
-using namespace FakeGen;
 typedef FakeGen::FakeXCodeBlock CodeBlock;
 #endif

--- a/Core/MIPS/MIPSVFPUUtils.cpp
+++ b/Core/MIPS/MIPSVFPUUtils.cpp
@ -15,13 +15,13 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

+#include <limits>
+#include <stdio.h>
+
 #include "Core/Reporting.h"
 #include "Core/MIPS/MIPS.h"
 #include "Core/MIPS/MIPSVFPUUtils.h"

-#include <limits>
-#include <stdio.h>
-
 #define V(i)   (currentMIPS->v[voffset[i]])
 #define VI(i)  (currentMIPS->vi[voffset[i]])

@ -76,19 +76,87 @@ void GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {
 	}
 }

+int GetMatrixName(int matrix, MatrixSize msize, int column, int row, bool transposed) {
+	// Builds a matrix register name from a matrix index, a row/column offset and a transpose flag. TODO: Fix (?)
+	int name = (matrix * 4) | (transposed << 5);  // matrix index starts at bit 2, transpose flag is bit 5
+	switch (msize) {
+	case M_4x4:
+		if (row || column) {  // a 4x4 matrix only accepts row 0 and column 0
+			ERROR_LOG(JIT, "GetMatrixName: Invalid row %i or column %i for size %i", row, column, msize);
+		}
+		break;
+
+	case M_3x3:
+		if (row & ~2) {  // accepts 0 or 2. NOTE(review): 2 << 6 below is 0x80, outside a 7-bit name - confirm the intended range
+			ERROR_LOG(JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);
+		}
+		if (column & ~2) {  // accepts 0 or 2
+			ERROR_LOG(JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);
+		}
+		name |= (row << 6) | column;
+		break;
+
+	case M_2x2:
+		if (row & ~2) {  // accepts 0 or 2
+			ERROR_LOG(JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);
+		}
+		if (column & ~2) {  // accepts 0 or 2
+			ERROR_LOG(JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);
+		}
+		name |= (row << 5) | column;  // row 2 sets bit 6
+		break;
+	}
+
+	return name;  // invalid inputs are only logged, never rejected
+}
+
+int GetColumnName(int matrix, MatrixSize msize, int column, int offset) {  // msize is accepted but not used
+	return (matrix * 4 + column) + (offset * 32);
+}
+
+int GetRowName(int matrix, MatrixSize msize, int row, int offset) {  // GetColumnName's formula with bit 5 (0x20) set; msize is not used
+	return 0x20 | (matrix * 4 + row + offset * 32);
+}
+
+// Fills vecs[0..side) with one vector register name per column of matrixReg.
+void GetMatrixColumns(int matrixReg, MatrixSize msize, u8 vecs[4]) {
+	const int side = GetMatrixSide(msize);
+	const int firstCol = matrixReg & 3;
+	// Bits 2-4 (matrix), bit 5 (transpose) and bit 6 (row) carry over unchanged.
+	const int fixedBits = matrixReg & 0x7C;
+
+	for (int idx = 0; idx < side; idx++) {
+		vecs[idx] = fixedBits | (idx + firstCol);
+	}
+}
+
+void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]) {  // Fills vecs[0..n) with one vector register name per row of matrixReg.
+	int n = GetMatrixSide(msize);
+	int col = matrixReg & 3;
+	int row = (matrixReg >> 5) & 2;  // 2 when bit 6 of matrixReg is set, otherwise 0
+	// The row names are built by swapping the roles of row and column and flipping the transpose bit.
+	int swappedCol = row ? (msize == M_3x3 ? 1 : 2) : 0;  // a set row bit becomes start column 1 for 3x3, else 2
+	int swappedRow = col ? 2 : 0;  // any nonzero column becomes row value 2 (bit 6 once shifted below)
+	int transpose = ((matrixReg >> 5) & 1) ^ 1;  // inverted copy of bit 5
+
+	for (int i = 0; i < n; i++) {
+		vecs[i] = (transpose << 5) | (swappedRow << 5) | (matrixReg & 0x1C) | (i + swappedCol);
+	}
+}
+
 void ReadVector(float *rd, VectorSize size, int reg) {
-	const int mtx = (reg >> 2) & 7;
-	const int col = reg & 3;
 	int row = 0;
 	int length = 0;
-	int transpose = (reg>>5) & 1;

 	switch (size) {
-	case V_Single: transpose = 0; row=(reg>>5)&3; length = 1; break;
+	case V_Single: rd[0] = V(reg); return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
 	case V_Pair:   row=(reg>>5)&2; length = 2; break;
 	case V_Triple: row=(reg>>6)&1; length = 3; break;
 	case V_Quad:   row=(reg>>5)&2; length = 4; break;
 	}
+	int transpose = (reg>>5) & 1;
+	const int mtx = (reg >> 2) & 7;
+	const int col = reg & 3;

 	u32 *rdu = (u32 *)rd;
 	if (transpose) {
@ -103,18 +171,18 @@ void ReadVector(float *rd, VectorSize size, int reg) {
 }

 void WriteVector(const float *rd, VectorSize size, int reg) {
-	const int mtx = (reg>>2)&7;
-	const int col = reg & 3;
 	int row = 0;
 	int length = 0;
-	int transpose = (reg>>5)&1;

 	switch (size) {
-	case V_Single: transpose = 0; row=(reg>>5)&3; length = 1; break;
+	case V_Single: V(reg) = rd[0]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
 	case V_Pair:   row=(reg>>5)&2; length = 2; break;
 	case V_Triple: row=(reg>>6)&1; length = 3; break;
 	case V_Quad:   row=(reg>>5)&2; length = 4; break;
 	}
+	const int mtx = (reg>>2)&7;
+	const int col = reg & 3;
+	int transpose = (reg>>5)&1;

 	u32 *rdu = (u32 *)rd;
 	if (currentMIPS->VfpuWriteMask() == 0) {
@ -186,11 +254,10 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
 		ERROR_LOG_REPORT(CPU, "Write mask used with vfpu matrix instruction.");
 	}

-	for (int i=0; i<side; i++) {
-		for (int j=0; j<side; j++) {
+	for (int i = 0; i < side; i++) {
+		for (int j = 0; j < side; j++) {
 			// Hm, I wonder if this should affect matrices at all.
-			if (j != side -1 || !currentMIPS->VfpuWriteMask(i))
-			{
+			if (j != side -1 || !currentMIPS->VfpuWriteMask(i))	{
 				int index = mtx * 4;
 				if (transpose)
 					index += ((row+i)&3) + ((col+j)&3)*32;
@ -202,6 +269,22 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
 	}
 }

+// Counts matching register pairs between two vectors; each vector is expanded using its own size.
+int GetVectorOverlap(int vec1, VectorSize size1, int vec2, VectorSize size2) {
+	int n1 = GetNumVectorElements(size1);
+	int n2 = GetNumVectorElements(size2);
+	u8 regs1[4], regs2[4];
+	GetVectorRegs(regs1, size1, vec1);
+	GetVectorRegs(regs2, size2, vec2);  // vec2 must be decoded with size2, not size1
+	int count = 0;
+	for (int i = 0; i < n1; i++) {
+		for (int j = 0; j < n2; j++) {
+			if (regs1[i] == regs2[j])
+				count++;
+		}
+	}
+	return count;
+}

 int GetNumVectorElements(VectorSize sz)
 {
@ -252,6 +335,25 @@ VectorSize GetVecSize(MIPSOpcode op)
 	}
 }

+// Maps a matrix size to the vector size with the matching enumerator (2x2 -> pair, and so on).
+VectorSize GetVectorSize(MatrixSize sz) {
+	if (sz == M_2x2) return V_Pair;
+	if (sz == M_3x3) return V_Triple;
+	if (sz == M_4x4) return V_Quad;
+	// Anything else has no vector equivalent.
+	return V_Invalid;
+}
+
+// Maps a vector size to the matching square matrix size (pair -> 2x2, and so on).
+MatrixSize GetMatrixSize(VectorSize sz) {
+	switch (sz) {
+	case V_Quad:   return M_4x4;
+	case V_Triple: return M_3x3;
+	case V_Pair:   return M_2x2;
+	default:       return M_Invalid;  // also covers V_Single, which has no matrix counterpart
+	}
+}
+
 MatrixSize GetMtxSize(MIPSOpcode op)
 {
 	int a = (op>>7)&1;
@ -267,10 +369,17 @@ MatrixSize GetMtxSize(MIPSOpcode op)
 	}
 }

-int GetMatrixSide(MatrixSize sz)
-{
-	switch (sz)
-	{
+// Like GetVectorSize(), except that unrecognized matrix sizes fall back to V_Quad rather than V_Invalid.
+VectorSize MatrixVectorSize(MatrixSize sz) {
+	if (sz == M_2x2)
+		return V_Pair;
+	if (sz == M_3x3)
+		return V_Triple;
+	return V_Quad;  // M_4x4 and every other value
+}
+
+int GetMatrixSide(MatrixSize sz) {
+	switch (sz) {
 	case M_2x2: return 2;
 	case M_3x3: return 3;
 	case M_4x4: return 4;
@ -278,10 +387,40 @@ int GetMatrixSide(MatrixSize sz)
 	}
 }

+// TODO: Optimize. Classifies how two matrix registers of the same size overlap.
+MatrixOverlapType GetMatrixOverlap(int mtx1, int mtx2, MatrixSize msize) {
+	// Identical register names refer to the very same matrix.
+	if (mtx1 == mtx2)
+		return OVERLAP_EQUAL;
+
+	u8 regsA[16];
+	u8 regsB[16];
+	GetMatrixRegs(regsA, msize, mtx1);
+	GetMatrixRegs(regsB, msize, mtx2);
+
+	// Brute force: look for any register that both matrices use.
+	// Only the side x side entries at index (i * 4 + j) are examined.
+	const int side = GetMatrixSide(msize);
+	for (int i = 0; i < side; i++) {
+		for (int j = 0; j < side; j++) {
+			const u8 wanted = regsA[i * 4 + j];
+			for (int a = 0; a < side; a++) {
+				for (int b = 0; b < side; b++) {
+					if (regsB[a * 4 + b] == wanted)
+						return OVERLAP_PARTIAL;
+				}
+			}
+		}
+	}
+
+	return OVERLAP_NONE;
+}
+
 const char *GetVectorNotation(int reg, VectorSize size)
 {
 	static char hej[4][16];
-	static int yo=0;yo++;yo&=3;
+	static int yo = 0; yo++; yo &= 3;
+
 	int mtx = (reg>>2)&7;
 	int col = reg&3;
 	int row = 0;
--- a/Core/MIPS/MIPSVFPUUtils.h
+++ b/Core/MIPS/MIPSVFPUUtils.h
@ -16,9 +16,9 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

 #pragma once
-
 #include <cmath>

+#include "Common/CommonTypes.h"
 #include "Core/MIPS/MIPS.h"

 #define _VD (op & 0x7F)
@ -64,19 +64,19 @@ inline void vfpu_sincos(float angle, float &sine, float &cosine) {
 #define VFPU_SH_FLOAT16_FRAC    0
 #define VFPU_MASK_FLOAT16_FRAC  0x3ff

-enum VectorSize
-{
-	V_Single,
-	V_Pair,
-	V_Triple,
-	V_Quad,
+enum VectorSize {  // explicit values 1-4; presumably the element count - confirm against GetNumVectorElements()
+	V_Single = 1,
+	V_Pair = 2,
+	V_Triple = 3,
+	V_Quad = 4,
+	V_Invalid = -1,  // returned by GetVectorSize() for unrecognized matrix sizes
+};

-enum MatrixSize
-{
-	M_2x2,
-	M_3x3,
-	M_4x4,
+enum MatrixSize {  // explicit values; they equal what GetMatrixSide() returns for 2x2, 3x3 and 4x4
+	M_2x2 = 2,
+	M_3x3 = 3,
+	M_4x4 = 4,
+	M_Invalid = -1  // returned by GetMatrixSize() for vector sizes without a matrix counterpart
+};

 void ReadMatrix(float *rd, MatrixSize size, int reg);
@ -87,6 +87,31 @@ void ReadVector(float *rd, VectorSize N, int reg);

 void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg);
 void GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg);
+ 
+// Translate between vector and matrix size. Possibly we should simply
+// join the two enums, but the type safety is kind of nice.
+VectorSize GetVectorSize(MatrixSize sz);
+MatrixSize GetMatrixSize(VectorSize sz);
+
+// Note that if matrix is a transposed matrix (E format), GetColumnName will actually return rows,
+// and GetRowName will return columns.
+int GetColumnName(int matrix, MatrixSize msize, int column, int offset);
+int GetRowName(int matrix, MatrixSize msize, int row, int offset);
+
+int GetMatrixName(int matrix, MatrixSize msize, int column, int row, bool transposed);
+
+void GetMatrixColumns(int matrixReg, MatrixSize msize, u8 vecs[4]);
+void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]);
+
+enum MatrixOverlapType {  // result of GetMatrixOverlap()
+	OVERLAP_NONE = 0,     // no register is shared
+	OVERLAP_PARTIAL = 1,  // the names differ but at least one register is shared
+	OVERLAP_EQUAL = 2,    // both matrix register names are identical
+	// Transposed too?  (same space but transposed)
+};
+
+MatrixOverlapType GetMatrixOverlap(int m1, int m2, MatrixSize msize);
+

 // Returns a number from 0-7, good for checking overlap for 4x4 matrices.
 inline int GetMtx(int matrixReg) {
@ -97,9 +122,12 @@ VectorSize GetVecSize(MIPSOpcode op);
 MatrixSize GetMtxSize(MIPSOpcode op);
 VectorSize GetHalfVectorSize(VectorSize sz);
 VectorSize GetDoubleVectorSize(VectorSize sz);
+VectorSize MatrixVectorSize(MatrixSize sz);
 int GetNumVectorElements(VectorSize sz);
 int GetMatrixSide(MatrixSize sz);
 const char *GetVectorNotation(int reg, VectorSize size);
 const char *GetMatrixNotation(int reg, MatrixSize size);

+int GetVectorOverlap(int reg1, VectorSize size1, int reg2, VectorSize size2);
+
 float Float16ToFloat32(unsigned short l);
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@ -2788,7 +2788,7 @@ void Jit::Comp_Vi2x(MIPSOpcode op) {
 	fpr.ReleaseSpillLocks();
 }

-static const float MEMORY_ALIGNED16( vavg_table[4] ) = {1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f};
+static const float MEMORY_ALIGNED16(vavg_table[4]) = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };

 void Jit::Comp_Vhoriz(MIPSOpcode op) {
 	CONDITIONAL_DISABLE;
@ -2802,6 +2802,42 @@ void Jit::Comp_Vhoriz(MIPSOpcode op) {
 	u8 sregs[4], dregs[1];
 	GetVectorRegsPrefixS(sregs, sz, _VS);
 	GetVectorRegsPrefixD(dregs, V_Single, _VD);
+	if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {  // SIMD fast path; the scalar code below remains the fallback
+		switch (sz) {
+		case V_Pair:
+			MOVAPS(XMM0, fpr.VS(sregs));
+			MOVAPS(XMM1, R(XMM0));
+			SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,1));  // lane 0 <- element 1
+			ADDPS(XMM0, R(XMM1));  // lane 0 = e0 + e1
+			MOVAPS(fpr.VSX(dregs), R(XMM0));
+			break;
+		case V_Triple:
+			MOVAPS(XMM0, fpr.VS(sregs));
+			MOVAPS(XMM1, R(XMM0));
+			SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,1));  // lane 0 <- element 1
+			ADDPS(XMM0, R(XMM1));  // lane 0 = e0 + e1
+			SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,2));  // lane 0 <- element 2 (lane 2 was left untouched)
+			ADDPS(XMM0, R(XMM1));  // lane 0 = e0 + e1 + e2
+			MOVAPS(fpr.VSX(dregs), R(XMM0));
+			break;
+		case V_Quad:
+			MOVAPS(XMM0, fpr.VS(sregs));
+			MOVHLPS(XMM1, XMM0);  // low half of XMM1 <- e2, e3
+			ADDPS(XMM0, R(XMM1));  // lanes 0/1 = e0 + e2, e1 + e3
+			MOVAPS(XMM1, R(XMM0));
+			SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1,1,1,1));  // broadcast lane 1
+			ADDPS(XMM0, R(XMM1));  // lane 0 = e0 + e1 + e2 + e3
+			MOVAPS(fpr.VSX(dregs), R(XMM0));
+			break;
+		}
+		if (((op >> 16) & 31) == 7) { // vavg
+			MULSS(fpr.VSX(dregs), M(&vavg_table[n - 1]));  // vavg_table[k] holds 1/(k+1); assumes n is the element count of sz - confirm
+		}
+		ApplyPrefixD(dregs, V_Single);
+		fpr.ReleaseSpillLocks();
+		NOTICE_LOG(JIT, "Horiz %08x", js.blockStart);
+		return;
+	}

 	// Flush SIMD.
 	fpr.SimpleRegsV(sregs, sz, 0);
--- a/Core/MIPS/x86/Jit.cpp
+++ b/Core/MIPS/x86/Jit.cpp
@ -120,7 +120,7 @@ JitOptions::JitOptions()
 	continueBranches = false;
 	continueJumps = false;
 	continueMaxInstructions = 300;
-	enableVFPUSIMD = false;
+	enableVFPUSIMD = true;
 }

 #ifdef _MSC_VER
--- a/Core/MIPS/x86/JitSafeMem.h
+++ b/Core/MIPS/x86/JitSafeMem.h
@ -24,6 +24,8 @@ class ThunkManager;

 namespace MIPSComp {

+using namespace Gen;
+
 class JitSafeMem {
 public:
 	JitSafeMem(Jit *jit, MIPSGPReg raddr, s32 offset, u32 alignMask = 0xFFFFFFFF);
--- a/unittest/JitHarness.cpp
+++ b/unittest/JitHarness.cpp
@ -108,12 +108,17 @@ bool TestJit() {

 	// TODO: Smarter way of seeding in the code sequence.
 	static const char *lines[] = {
+		//"vcrsp.t C000, C100, C200",
+		"vdot.q C000, C100, C200",
+		//"vmmul.q M000, M100, M200",
+		/*
 		"abs.s f1, f1",
 		"cvt.w.s f1, f1",
 		"cvt.w.s f3, f1",
 		"cvt.w.s f0, f2",
 		"cvt.w.s f5, f1",
 		"cvt.w.s f6, f5",
+		*/
 	};

 	bool compileSuccess = true;
--- a/unittest/TestArmEmitter.cpp
+++ b/unittest/TestArmEmitter.cpp
@ -0,0 +1,240 @@
+#include "Common/ArmEmitter.h"
+#include "Core/MIPS/ARM/ArmRegCacheFPU.h"
+#include "Core/MIPS/ARM/ArmJit.h"
+#include "Core/MIPS/JitCommon/JitState.h"
+#include "Core/MIPS/MIPSVFPUUtils.h"
+#include "ext/disarm.h"
+
+#include "UnitTest.h"
+
+bool CheckLast(ArmGen::ARMXEmitter &emit, const char *comp) {  // Disassembles the last emitted 4-byte instruction and compares the text with comp.
+	char text[512];
+	u32 opcode;
+	memcpy(&opcode, emit.GetCodePtr() - 4, sizeof(opcode));
+	ArmDis(0, opcode, text, sizeof(text), true);
+	EXPECT_EQ_STR(std::string(text), std::string(comp));
+	return true;
+}
+
+// Prints the disassembly of each 32-bit instruction in [start, end), one per line.
+void DisassembleARMBetween(const u8 *start, const u8 *end) {
+	for (const u8 *pc = start; pc < end; pc += 4) {
+		uint32_t opcode;
+		memcpy(&opcode, pc, 4);
+		char line[512];
+		ArmDis(0, opcode, line, sizeof(line), true);
+		printf("%s\n", line);
+	}
+}
+
+bool TestArmEmitter() {  // Emits instructions one by one and compares each disassembly with the expected text.
+	using namespace ArmGen;
+
+	u32 code[512];  // scratch buffer that receives the emitted instructions
+	ARMXEmitter emitter((u8 *)code);
+	emitter.VNEG(S1, S2);
+	RET(CheckLast(emitter, "eef10a41 VNEG s1, s2"));
+	emitter.LDR(R3, R7);
+	RET(CheckLast(emitter, "e5973000 LDR r3, [r7, #0]"));
+	emitter.VLDR(S3, R8, 48);
+	RET(CheckLast(emitter, "edd81a0c VLDR s3, [r8, #48]"));
+	emitter.VSTR(S5, R12, -36);
+	RET(CheckLast(emitter, "ed4c2a09 VSTR s5, [r12, #-36]"));
+	emitter.VADD(S1, S2, S3);
+	RET(CheckLast(emitter, "ee710a21 VADD s1, s2, s3"));
+	emitter.VADD(D1, D2, D3);
+	RET(CheckLast(emitter, "ee321b03 VADD d1, d2, d3"));
+	emitter.VSUB(S1, S2, S3);
+	RET(CheckLast(emitter, "ee710a61 VSUB s1, s2, s3"));
+	emitter.VMUL(S7, S8, S9);
+	RET(CheckLast(emitter, "ee643a24 VMUL s7, s8, s9"));
+	emitter.VMUL(S0, S5, S10);
+	RET(CheckLast(emitter, "ee220a85 VMUL s0, s5, s10"));
+	emitter.VNMUL(S7, S8, S9);
+	RET(CheckLast(emitter, "ee643a64 VNMUL s7, s8, s9"));
+	emitter.VMLA(S7, S8, S9);
+	RET(CheckLast(emitter, "ee443a24 VMLA s7, s8, s9"));
+	emitter.VNMLA(S7, S8, S9);
+	RET(CheckLast(emitter, "ee543a64 VNMLA s7, s8, s9"));
+	emitter.VNMLS(S7, S8, S9);
+	RET(CheckLast(emitter, "ee543a24 VNMLS s7, s8, s9"));
+	emitter.VABS(S1, S2);
+	RET(CheckLast(emitter, "eef00ac1 VABS s1, s2"));
+	emitter.VMOV(S1, S2);
+	RET(CheckLast(emitter, "eef00a41 VMOV s1, s2"));
+	emitter.VMOV(D1, D2);
+	RET(CheckLast(emitter, "eeb01b42 VMOV d1, d2"));
+	emitter.VCMP(S1, S2);
+	RET(CheckLast(emitter, "eef40a41 VCMP s1, s2"));
+	emitter.VCMPE(S1, S2);
+	RET(CheckLast(emitter, "eef40ac1 VCMPE s1, s2"));
+	emitter.VSQRT(S1, S2);
+	RET(CheckLast(emitter, "eef10ac1 VSQRT s1, s2"));
+	emitter.VDIV(S1, S2, S3);
+	RET(CheckLast(emitter, "eec10a21 VDIV s1, s2, s3"));
+	emitter.VMRS(R1);
+	RET(CheckLast(emitter, "eef11a10 VMRS r1"));
+	emitter.VMSR(R7);
+	RET(CheckLast(emitter, "eee17a10 VMSR r7"));
+	emitter.VMRS_APSR();
+	RET(CheckLast(emitter, "eef1fa10 VMRS APSR"));
+	emitter.VCVT(S0, S1, TO_INT | IS_SIGNED);
+	RET(CheckLast(emitter, "eebd0a60 VCVT ..."));
+	emitter.VMOV_imm(I_32, R0, VIMM___x___x, 0xF3);  // this run of emits (through VMOV(Q14, Q2)) has no CheckLast of its own
+	emitter.VMOV_imm(I_8, R0, VIMMxxxxxxxx, 0xF3);
+	emitter.VMOV_immf(Q0, 1.0f);
+	emitter.VMOV_immf(Q0, -1.0f);
+	emitter.VBIC_imm(I_32, R0, VIMM___x___x, 0xF3);
+	emitter.VMVN_imm(I_32, R0, VIMM___x___x, 0xF3);
+	emitter.VPADD(F_32, D0, D0, D0);
+	emitter.VMOV(Q14, Q2);
+
+	emitter.VMOV(S3, S6);
+	RET(CheckLast(emitter, "eef01a43 VMOV s3, s6"));
+
+	emitter.VMOV_imm(I_32, R0, VIMM___x___x, 0xF3);
+	emitter.VMOV_imm(I_8, R0, VIMMxxxxxxxx, 0xF3);
+	emitter.VMOV_immf(Q0, 1.0f);
+	RET(CheckLast(emitter, "f2870f50 VMOV q0, 1.0"));
+	emitter.VMOV_immf(Q0, -1.0f);
+	emitter.VBIC_imm(I_32, R0, VIMM___x___x, 0xF3);
+	emitter.VMVN_imm(I_32, R0, VIMM___x___x, 0xF3);
+	emitter.VPADD(F_32, D0, D0, D0);
+	emitter.VMOV(Q14, Q2);
+
+	emitter.VMOV(S9, R3);
+	RET(CheckLast(emitter, "ee043a90 VMOV s9, r3"));
+	emitter.VMOV(R9, S3);
+	RET(CheckLast(emitter, "ee119a90 VMOV r9, s3"));
+
+	emitter.VMVN(Q1, Q13);
+	RET(CheckLast(emitter, "f3b025ea VMVN q1, q3"));  // NOTE(review): emits Q13 but expects "q3" - confirm against the disassembler output
+
+	emitter.VMOV(S3, S6);
+	RET(CheckLast(emitter, "eef01a43 VMOV s3, s6"));
+	emitter.VMOV(S25, S21);
+	RET(CheckLast(emitter, "eef0ca6a VMOV s25, s21"));
+	emitter.VLD1(I_32, D19, R3, 2, ALIGN_NONE, R_PC);
+	RET(CheckLast(emitter, "f4633a8f VLD1.32 {d19-d20}, [r3]"));
+	emitter.VST1(I_32, D23, R9, 1, ALIGN_NONE, R_PC);
+	RET(CheckLast(emitter, "f449778f VST1.32 {d23}, [r9]"));
+	emitter.VLD1_lane(F_32, D8, R3, 0, ALIGN_NONE, R_PC);
+	RET(CheckLast(emitter, "f4a3880f VLD1.32 {d8[0]}, [r3]"));
+	emitter.VLD1_lane(I_8, D8, R3, 2, ALIGN_NONE, R_PC);
+	RET(CheckLast(emitter, "f4a3804f VLD1.i8 {d8[2]}, [r3]"));
+
+	emitter.VADD(I_8, D3, D4, D19);
+	RET(CheckLast(emitter, "f2043823 VADD.i8 d3, d4, d19"));
+	emitter.VADD(I_32, D3, D4, D19);
+	RET(CheckLast(emitter, "f2243823 VADD.i32 d3, d4, d19"));
+	emitter.VADD(F_32, D3, D4, D19);
+	RET(CheckLast(emitter, "f2043d23 VADD.f32 d3, d4, d19"));
+	emitter.VSUB(I_16, Q5, Q6, Q15);
+	RET(CheckLast(emitter, "f31ca86e VSUB.i16 q5, q6, q15"));
+	emitter.VMUL(F_32, Q1, Q2, Q3);
+	RET(CheckLast(emitter, "f3042d56 VMUL.f32 q1, q2, q3"));
+	emitter.VMUL(F_32, Q13, Q15, Q14);
+	RET(CheckLast(emitter, "f34eadfc VMUL.f32 q13, q15, q14"));
+	emitter.VADD(F_32, Q1, Q2, Q3);
+	RET(CheckLast(emitter, "f2042d46 VADD.f32 q1, q2, q3"));
+	emitter.VADD(F_32, Q11, Q11, Q10);
+	RET(CheckLast(emitter, "f2466de4 VADD.f32, Q11, Q11, Q10"));  // NOTE(review): format differs from the other VADD.f32 check (extra comma, upper-case Q) - confirm
+	emitter.VMLA(F_32, Q1, Q2, Q3);
+	RET(CheckLast(emitter, "f2042d56 VMLA.f32 q1, q2, q3"));
+	emitter.VMLS(F_32, Q1, Q2, Q3);
+	RET(CheckLast(emitter, "f2242d56 VMLS.f32 q1, q2, q3"));
+	emitter.VMLS(I_16, Q1, Q2, Q3);
+	RET(CheckLast(emitter, "f3142946 VMLS.i16 q1, q2, q3"));
+
+	emitter.VEOR(Q0, Q1, Q2);
+	RET(CheckLast(emitter, "f3020154 VEOR q0, q1, q2"));
+	emitter.VORR(Q1, Q2, Q3);
+	RET(CheckLast(emitter, "f2242156 VORR q1, q2, q3"));
+	emitter.VORR(D1, D2, D3);
+	RET(CheckLast(emitter, "f2221113 VORR d1, d2, d3"));
+	emitter.VAND(Q1, Q2, Q3);
+	RET(CheckLast(emitter, "f2042156 VAND q1, q2, q3"));
+	emitter.VDUP(F_32, Q14, D30, 1);
+	RET(CheckLast(emitter, "f3fccc6e VDUP.32 q14, d30[1]"));
+	
+	// TODO: This is broken.
+	// emitter.VDUP(F_32, D14, D30, 1);
+	// RET(CheckLast(emitter, "f3bcec2e VDUP.32 d14, d30[1]"));
+
+	//emitter.VNEG(S1, S2);
+	//RET(CheckLast(emitter, "eef10a60 VNEG.f32 s1, s1"));
+	emitter.VNEG(F_32, Q1, Q2);
+	RET(CheckLast(emitter, "f3b927c4 VNEG.f32 q1, q2"));
+	emitter.VABS(F_32, Q1, Q2);
+	RET(CheckLast(emitter, "f3b92744 VABS.f32 q1, q2"));
+	emitter.VMOV(D26, D30);
+	RET(CheckLast(emitter, "eef0ab6e VMOV d26, d30"));
+
+	emitter.VMUL_scalar(F_32, Q12, Q8, DScalar(D0, 0));
+	RET(CheckLast(emitter, "f3e089c0 VMUL.f32 q12, q8, d0[0]"));
+
+	emitter.VMUL_scalar(F_32, Q1, Q2, DScalar(D7, 0));
+	RET(CheckLast(emitter, "f3a42947 VMUL.f32 q1, q2, d7[0]"));
+
+	emitter.VMUL_scalar(F_32, D1, D2, QScalar(Q7, 0));
+	RET(CheckLast(emitter, "f2a2194e VMUL.f32 d1, d2, d14[0]"));
+
+	emitter.VMLA_scalar(F_32, Q1, Q2, DScalar(D7, 0));
+	RET(CheckLast(emitter, "f3a42147 VMLA.f32 q1, q2, d7[0]"));
+
+	emitter.VMIN(F_32, D3, D4, D19);
+	RET(CheckLast(emitter, "f2243f23 VMIN.f32 d3, d4, d19"));
+	emitter.VMAX(F_32, Q3, Q4, Q9);
+	RET(CheckLast(emitter, "f2086f62 VMAX.f32 q3, q4, q9"));
+
+	//emitter.VMOV(S1, 112);
+	//RET(CheckLast(emitter, "eef70a00 VMOV.f32 s1, #112"));
+
+
+	const u8 *codeStart = emitter.GetCodePtr();  // only referenced by the disabled block below
+
+	/*
+	MIPSState mips;
+	MIPSComp::JitState js;
+	MIPSComp::ArmJitOptions jo;
+	ArmRegCacheFPU fpr(&mips, &js, &jo);
+	fpr.SetEmitter(&emitter);
+	int C000 = GetColumnName(0, M_4x4, 0, 0);
+	int C010 = GetColumnName(0, M_4x4, 1, 0);
+	int C020 = GetColumnName(0, M_4x4, 2, 0);
+	int C030 = GetColumnName(0, M_4x4, 3, 0);
+	int R000 = GetRowName(0, M_4x4, 0, 0);
+	int R001 = GetRowName(0, M_4x4, 1, 0);
+	int R002 = GetRowName(0, M_4x4, 2, 0);
+	int R003 = GetRowName(0, M_4x4, 3, 0);
+	printf("Col 000: %s\n", GetVectorNotation(C000, V_Quad));
+	printf("Row 000: %s\n", GetVectorNotation(R000, V_Quad));
+	
+	MIPSAnalyst::AnalysisResults results;
+	memset(&results, 0, sizeof(results));
+
+	fpr.Start(results);
+	fpr.QMapReg(C000, V_Quad, MAP_DIRTY);
+	fpr.QMapReg(C010, V_Quad, MAP_DIRTY);
+	fpr.QMapReg(C020, V_Quad, MAP_DIRTY);
+	fpr.QMapReg(C030, V_Quad, MAP_DIRTY);
+	emitter.ORR(R0, R0, R0);
+	fpr.QMapReg(R000, V_Quad, MAP_DIRTY);
+	fpr.FlushAll();
+
+	fpr.Start(results);
+	emitter.ORR(R0, R0, R0);
+	fpr.QMapReg(R000, V_Quad, MAP_DIRTY);
+	fpr.QMapReg(R001, V_Quad, MAP_DIRTY);
+	fpr.QMapReg(R002, V_Quad, MAP_DIRTY);
+	fpr.QMapReg(R003, V_Quad, MAP_DIRTY);
+	emitter.ORR(R0, R0, R0);
+	fpr.QMapReg(C000, V_Quad, MAP_DIRTY);
+	fpr.FlushAll();
+
+	const u8 *codeEnd = emitter.GetCodePtr();
+
+	DisassembleARMBetween(codeStart, codeEnd);
+	*/
+	return true;
+}
--- a/unittest/UnitTest.cpp
+++ b/unittest/UnitTest.cpp
@ -29,8 +29,10 @@
 #include <cstdlib>
 #include <cmath>
 #include <string>
+#include <sstream>

 #include "base/NativeApp.h"
+#include "base/logging.h"
 #include "Common/CPUDetect.h"
 #include "Common/ArmEmitter.h"
 #include "ext/disarm.h"
@ -40,14 +42,7 @@
 #include "Core/MIPS/MIPSVFPUUtils.h"

 #include "unittest/JitHarness.h"
-
-#define EXPECT_TRUE(a) if (!(a)) { printf("%s:%i: Test Fail\n", __FUNCTION__, __LINE__); return false; }
-#define EXPECT_FALSE(a) if ((a)) { printf("%s:%i: Test Fail\n", __FUNCTION__, __LINE__); return false; }
-#define EXPECT_EQ_FLOAT(a, b) if ((a) != (b)) { printf("%s:%i: Test Fail\n%f\nvs\n%f\n", __FUNCTION__, __LINE__, a, b); return false; }
-#define EXPECT_APPROX_EQ_FLOAT(a, b) if (fabsf((a)-(b))>0.00001f) { printf("%s:%i: Test Fail\n%f\nvs\n%f\n", __FUNCTION__, __LINE__, a, b); /*return false;*/ }
-#define EXPECT_EQ_STR(a, b) if (a != b) { printf("%s: Test Fail\n%s\nvs\n%s\n", __FUNCTION__, a.c_str(), b.c_str()); return false; }
-
-#define RET(a) if (!(a)) { return false; }
+#include "unittest/UnitTest.h"

 std::string System_GetProperty(SystemProperty prop) { return ""; }
 int System_GetPropertyInt(SystemProperty prop) { return -1; }
@ -242,133 +237,6 @@ bool TestAsin() {
 	return true;
 }

-
-
-bool CheckLast(ArmGen::ARMXEmitter &emit, const char *comp) {
-	u32 instr;
-	memcpy(&instr, emit.GetCodePtr() - 4, 4);
-	char disasm[512];
-	ArmDis(0, instr, disasm, sizeof(disasm), true);
-	EXPECT_EQ_STR(std::string(disasm), std::string(comp));
-	return true;
-}
-
-
-bool TestArmEmitter() {
-	using namespace ArmGen;
-
-	u32 code[512];
-	ARMXEmitter emitter((u8 *)code);
-	emitter.LDR(R3, R7);
-	RET(CheckLast(emitter, "e5973000 LDR r3, [r7, #0]"));
-	emitter.BFI(R3, R7, 5, 9);
-	RET(CheckLast(emitter, "e7cd3297 BFI r3, r7, #5, #9"));
-	emitter.BFC(R4, 5, 9);
-	RET(CheckLast(emitter, "e7cd429f BFC r4, #5, #9"));
-	emitter.UBFX(R4, R9, 5, 9);
-	RET(CheckLast(emitter, "e7e842d9 UBFX r4, r9, #5, #9"));
-	emitter.SBFX(R0, R8, 5, 9);
-	RET(CheckLast(emitter, "e7a802d8 SBFX r0, r8, #5, #9"));
-
-	emitter.B_CC(CC_NEQ, code + 128);
-	RET(CheckLast(emitter, "1a000079 BNE &000001EC"));
-	emitter.SetJumpTarget(emitter.B_CC(CC_NEQ));
-	RET(CheckLast(emitter, "1affffff BNE &00000004"));
-	emitter.SetJumpTarget(emitter.BL_CC(CC_NEQ));
-	RET(CheckLast(emitter, "1bffffff BLNE &00000004"));
-
-	emitter.VLDR(S3, R8, 48);
-	RET(CheckLast(emitter, "edd81a0c VLDR s3, [r8, #48]"));
-	emitter.VSTR(S5, R12, -36);
-	RET(CheckLast(emitter, "ed4c2a09 VSTR s5, [r12, #-36]"));
-	emitter.VADD(S1, S2, S3);
-	RET(CheckLast(emitter, "ee710a21 VADD s1, s2, s3"));
-	emitter.VADD(D1, D2, D3);
-	RET(CheckLast(emitter, "ee321b03 VADD d1, d2, d3"));
-	emitter.VSUB(S1, S2, S3);
-	RET(CheckLast(emitter, "ee710a61 VSUB s1, s2, s3"));
-	emitter.VMUL(S7, S8, S9);
-	RET(CheckLast(emitter, "ee643a24 VMUL s7, s8, s9"));
-	emitter.VMUL(S0, S5, S10);
-	RET(CheckLast(emitter, "ee220a85 VMUL s0, s5, s10"));
-	emitter.VNMUL(S7, S8, S9);
-	RET(CheckLast(emitter, "ee643a64 VNMUL s7, s8, s9"));
-	emitter.VMLA(S7, S8, S9);
-	RET(CheckLast(emitter, "ee443a24 VMLA s7, s8, s9"));
-	emitter.VNMLA(S7, S8, S9);
-	RET(CheckLast(emitter, "ee543a64 VNMLA s7, s8, s9"));
-	emitter.VNMLS(S7, S8, S9);
-	RET(CheckLast(emitter, "ee543a24 VNMLS s7, s8, s9"));
-	emitter.VABS(S1, S2);
-	RET(CheckLast(emitter, "eef00ac1 VABS s1, s2"));
-	emitter.VMOV(S1, S2);
-	RET(CheckLast(emitter, "eef00a41 VMOV s1, s2"));
-	emitter.VCMP(S1, S2);
-	RET(CheckLast(emitter, "eef40a41 VCMP s1, s2"));
-	emitter.VCMPE(S1, S2);
-	RET(CheckLast(emitter, "eef40ac1 VCMPE s1, s2"));
-	emitter.VSQRT(S1, S2);
-	RET(CheckLast(emitter, "eef10ac1 VSQRT s1, s2"));
-	emitter.VDIV(S1, S2, S3);
-	RET(CheckLast(emitter, "eec10a21 VDIV s1, s2, s3"));
-	emitter.VMRS(R1);
-	RET(CheckLast(emitter, "eef11a10 VMRS r1"));
-	emitter.VMSR(R7);
-	RET(CheckLast(emitter, "eee17a10 VMSR r7"));
-	emitter.VMRS_APSR();
-	RET(CheckLast(emitter, "eef1fa10 VMRS APSR"));
-	emitter.VCVT(S0, S1, TO_INT | IS_SIGNED);
-	RET(CheckLast(emitter, "eebd0a60 VCVT ..."));
-
-
-	// WTF?
-	//emitter.VSUB(S4, S5, S6);
-	//RET(CheckLast(emitter, "ee322ac3 VSUB s4, s5, s6"));
-
-
-	emitter.VMOV(S3, S6);
-	RET(CheckLast(emitter, "eef01a43 VMOV s3, s6"));
-
-	/*
-	// These are only implemented in the neon-vfpu branch. will cherrypick later.
-	emitter.VMOV_imm(I_32, R0, VIMM___x___x, 0xF3);
-	emitter.VMOV_imm(I_8, R0, VIMMxxxxxxxx, 0xF3);
-	emitter.VMOV_immf(Q0, 1.0f);
-	RET(CheckLast(emitter, "eebd0a60 VMOV Q0, 1.0"));
-	emitter.VMOV_immf(Q0, -1.0f);
-	emitter.VBIC_imm(I_32, R0, VIMM___x___x, 0xF3);
-	emitter.VMVN_imm(I_32, R0, VIMM___x___x, 0xF3);
-	emitter.VPADD(F_32, D0, D0, D0);
-	emitter.VMOV(Q14, Q2);
-	*/
-
-	emitter.VMOV(S3, S6);
-	RET(CheckLast(emitter, "eef01a43 VMOV s3, s6"));
-	emitter.VLD1(I_32, D19, R3, 2, ALIGN_NONE, R_PC);
-	RET(CheckLast(emitter, "f4633a8f VLD1.32 {d19-d20}, [r3]"));
-	emitter.VST1(I_32, D23, R9, 1, ALIGN_NONE, R_PC);
-	RET(CheckLast(emitter, "f449778f VST1.32 {d23}, [r9]"));
-	emitter.VADD(I_8, D3, D4, D19);
-	RET(CheckLast(emitter, "f2043823 VADD.i8 d3, d4, d19"));
-	emitter.VADD(I_32, D3, D4, D19);
-	RET(CheckLast(emitter, "f2243823 VADD.i32 d3, d4, d19"));
-	emitter.VADD(F_32, D3, D4, D19);
-	RET(CheckLast(emitter, "f2043d23 VADD.f32 d3, d4, d19"));
-	emitter.VSUB(I_16, Q5, Q6, Q15);
-	RET(CheckLast(emitter, "f31ca86e VSUB.i16 q5, q6, q15"));
-	emitter.VMUL(F_32, Q1, Q2, Q3);
-	RET(CheckLast(emitter, "f3042d56 VMUL.f32 q1, q2, q3"));
-	emitter.VADD(F_32, Q1, Q2, Q3);
-	RET(CheckLast(emitter, "f2042d46 VADD.f32 q1, q2, q3"));
-	emitter.VMLA(F_32, Q1, Q2, Q3);
-	RET(CheckLast(emitter, "f2042d56 VMLA.f32 q1, q2, q3"));
-	emitter.VMLS(F_32, Q1, Q2, Q3);
-	RET(CheckLast(emitter, "f2242d56 VMLS.f32 q1, q2, q3"));
-	emitter.VMLS(I_16, Q1, Q2, Q3);
-	RET(CheckLast(emitter, "f3142946 VMLS.i16 q1, q2, q3"));
-	return true;
-}
-
 bool TestMathUtil() {
 	EXPECT_FALSE(my_isinf(1.0));
 	volatile float zero = 0.0f;
@ -419,6 +287,54 @@ bool TestVFPUSinCos() {
 	return true;
 }

+void TestGetMatrix(int matrix, MatrixSize sz) {  // Logs how GetMatrixColumns/GetMatrixRows compare with GetMatrixRegs for one matrix.
+	ILOG("Testing matrix %s", GetMatrixNotation(matrix, sz));
+	u8 fullMatrix[16];
+
+	u8 cols[4];
+	u8 rows[4];
+	GetMatrixColumns(matrix, sz, cols);
+	GetMatrixRows(matrix, sz, rows);
+
+	GetMatrixRegs(fullMatrix, sz, matrix);
+
+	int n = GetMatrixSide(sz);
+	VectorSize vsz = GetVectorSize(sz);
+	for (int i = 0; i < n; i++) {
+		// int colName = GetColumnName(matrix, sz, i, 0);
+		// int rowName = GetRowName(matrix, sz, i, 0);
+		int colName = cols[i];
+		int rowName = rows[i];
+		ILOG("Column %i: %s", i, GetVectorNotation(colName, vsz));
+		ILOG("Row %i: %s", i, GetVectorNotation(rowName, vsz));
+
+		u8 colRegs[4];
+		u8 rowRegs[4];
+		GetVectorRegs(colRegs, vsz, colName);
+		GetVectorRegs(rowRegs, vsz, rowName);
+
+		// Check that the individual regs are the expected ones.
+		// Each stream collects all n register numbers of this column/row.
+		// The streams are created fresh on every outer iteration, so no reset is needed;
+		// std::stringstream::clear() would only reset error flags, never the text.
+		std::stringstream a, b, c, d;
+		for (int j = 0; j < n; j++) {
+			// Column i: expected (from GetMatrixRegs) vs actual (from GetMatrixColumns).
+			a << (int)fullMatrix[i * 4 + j] << " ";
+			b << (int)colRegs[j] << " ";
+
+			// Row i: expected (from GetMatrixRegs) vs actual (from GetMatrixRows).
+			c << (int)fullMatrix[j * 4 + i] << " ";
+			d << (int)rowRegs[j] << " ";
+		}
+		ILOG("Col: %s vs %s", a.str().c_str(), b.str().c_str());
+		if (a.str() != b.str())
+			ILOG("WRONG!");
+		ILOG("Row: %s vs %s", c.str().c_str(), d.str().c_str());
+		if (c.str() != d.str())
+			ILOG("WRONG!");
+	}
+}
 typedef bool (*TestFunc)();
 struct TestItem {
 	const char *name;
@ -427,6 +343,8 @@ struct TestItem {

 #define TEST_ITEM(name) { #name, &Test ##name, }

+bool TestArmEmitter();
+	
 TestItem availableTests[] = {
 	TEST_ITEM(Asin),
 	TEST_ITEM(SinCos),
--- a/unittest/UnitTest.h
+++ b/unittest/UnitTest.h
@ -0,0 +1,9 @@
+#pragma once
+// Minimal test macros: a failed check prints a message and, except for EXPECT_APPROX_EQ_FLOAT, returns false from the enclosing function. Users need printf and fabsf in scope.
+#define EXPECT_TRUE(a) if (!(a)) { printf("%s:%i: Test Fail\n", __FUNCTION__, __LINE__); return false; }
+#define EXPECT_FALSE(a) if ((a)) { printf("%s:%i: Test Fail\n", __FUNCTION__, __LINE__); return false; }
+#define EXPECT_EQ_FLOAT(a, b) if ((a) != (b)) { printf("%s:%i: Test Fail\n%f\nvs\n%f\n", __FUNCTION__, __LINE__, a, b); return false; }
+#define EXPECT_APPROX_EQ_FLOAT(a, b) if (fabsf((a)-(b))>0.00001f) { printf("%s:%i: Test Fail\n%f\nvs\n%f\n", __FUNCTION__, __LINE__, a, b); /*return false;*/ }
+#define EXPECT_EQ_STR(a, b) if ((a) != (b)) { printf("%s: Test Fail\n%s\nvs\n%s\n", __FUNCTION__, (a).c_str(), (b).c_str()); return false; }
+
+#define RET(a) if (!(a)) { return false; }
--- a/unittest/UnitTests.vcxproj
+++ b/unittest/UnitTests.vcxproj
@ -176,6 +176,7 @@
    <ClCompile Include="..\native\ext\glew\glew.c" />
    <ClCompile Include="JitHarness.cpp" />
    <ClCompile Include="UnitTest.cpp" />
+    <ClCompile Include="TestArmEmitter.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ProjectReference Include="..\Common\Common.vcxproj">
@ -202,8 +203,9 @@
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="JitHarness.h" />
+    <ClInclude Include="UnitTest.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/unittest/UnitTests.vcxproj.filters
+++ b/unittest/UnitTests.vcxproj.filters
@ -4,8 +4,10 @@
    <ClCompile Include="UnitTest.cpp" />
    <ClCompile Include="..\native\ext\glew\glew.c" />
    <ClCompile Include="JitHarness.cpp" />
+    <ClCompile Include="TestArmEmitter.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="JitHarness.h" />
+    <ClInclude Include="UnitTest.h" />
  </ItemGroup>
-</Project>
+</Project>