From e0cc126d09390522ae30b5b8413e3490f00a4258 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 4 Jan 2018 11:37:32 +0100
Subject: [PATCH 1/5] Add some more SIMD support to IR interpreter. Mostly just
 because, but also serves as implementation reference for later code
 generation backends.

---
 Core/MIPS/IR/IRInterpreter.cpp | 50 +++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 4 deletions(-)
diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index d5ad7be82b..8d1b67287f 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -1,11 +1,16 @@
 #include <algorithm>
 #include <cmath>
 
+#include "ppsspp_config.h"
 #include "math/math_util.h"
 #include "Common/Common.h"
 
 #ifdef _M_SSE
-#include <emmintrin.h>
+#include <nmmintrin.h>
+#endif
+
+#if PPSSPP_ARCH(ARM_NEON)
+#include <arm_neon.h>
 #endif
 
 #include "Core/Core.h"
@@ -40,6 +45,10 @@ alignas(16) static const uint32_t noSignMask[4] = {
 	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
 };
 
+alignas(16) static const uint32_t lowBytesMask[4] = {
+	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
+};
+
 u32 RunBreakpoint(u32 pc) {
 	// Should we skip this breakpoint?
 	if (CBreakPoints::CheckSkipFirst() == pc)
@@ -58,6 +67,7 @@ u32 RunMemCheck(u32 pc, u32 addr) {
 	return coreState != CORE_RUNNING ? 1 : 0;
 }
 
+// We cannot use NEON on ARM32 here until we make it a hard dependency. We can, however, on ARM64.
 u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 	const IRInst *end = inst + count;
 	while (inst != end) {
@@ -185,8 +195,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 
 		case IROp::Vec4Shuffle:
 		{
-			// Can't use the SSE shuffle here because it takes an immediate.
-			// Backends with SSE support could use that though.
+			// Can't use the SSE shuffle here because it takes an immediate. pshufb with a table would work though,
+			// or a big switch - there are only 256 shuffles possible (4^4)
 			for (int i = 0; i < 4; i++)
 				mips->f[inst->dest + i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)];
 			break;
@@ -195,6 +205,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		case IROp::Vec4Mov:
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
+#elif PPSSPP_CONFIG(ARM64)
+			float32x4_t c = vld1q_f32(&mips->f[inst->src1]);
+			vst1q_f32(&mips->f[inst->dest], c);
 #else
 			memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
 #endif
@@ -274,10 +287,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 			break;
 
 		case IROp::Vec4Unpack8To32:
+#if defined(_M_SSE)
+			__m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]);
+			src = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+			src = _mm_unpacklo_epi32(src, _mm_setzero_si128());
+			_mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24));
+#else
 			mips->fi[inst->dest] = (mips->fi[inst->src1] << 24);
 			mips->fi[inst->dest + 1] = (mips->fi[inst->src1] << 16) & 0xFF000000;
 			mips->fi[inst->dest + 2] = (mips->fi[inst->src1] << 8) & 0xFF000000;
 			mips->fi[inst->dest + 3] = (mips->fi[inst->src1]) & 0xFF000000;
+#endif
 			break;
 
 		case IROp::Vec2Pack32To16:
@@ -297,21 +317,36 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 
 		case IROp::Vec4Pack32To8:
 		{
+#if defined(_M_SSE)
+			// Packs the upper bits, so we need to shift down. Then we can just use SSE packing.
+			__m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 24);
+			val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128());
+			mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
+#else
 			u32 val = mips->fi[inst->src1] >> 24;
 			val |= (mips->fi[inst->src1 + 1] >> 16) & 0xFF00;
 			val |= (mips->fi[inst->src1 + 2] >> 8) & 0xFF0000;
 			val |= (mips->fi[inst->src1 + 3]) & 0xFF000000;
 			mips->fi[inst->dest] = val;
 			break;
+#endif
 		}
 
 		case IROp::Vec4Pack31To8:
 		{
+#if defined(_M_SSE)
+			// Packs the upper bits (offset by 1), so we need to shift down and mask. Then we can just use SSE packing.
+			__m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 23);
+			val = _mm_and_si128(val, _mm_load_si128((const __m128i *)&lowBytesMask));
+			val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128());
+			mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
+#else
 			u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;
 			val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00;
 			val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000;
 			val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000;
 			mips->fi[inst->dest] = val;
+#endif
 			break;
 		}
 
@@ -326,14 +361,21 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 
 		case IROp::Vec4ClampToZero:
 		{
+#if 0 && defined(_M_SSE)
+			// This is SSE4 only unfortunately, so only suitable for JIT, hence disabled above.
+			__m128i val = _mm_load_si128((const __m128i *)&mips->fi[inst->src1]);
+			val = _mm_max_epi32(val, _mm_setzero_si128());
+			mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
+#else
 			for (int i = 0; i < 4; i++) {
 				u32 val = mips->fi[inst->src1 + i];
 				mips->fi[inst->dest + i] = (int)val >= 0 ? val : 0;
 			}
 			break;
+#endif
 		}
 
-		case IROp::Vec4DuplicateUpperBitsAndShift1:
+		case IROp::Vec4DuplicateUpperBitsAndShift1:  // For vuc2i, the weird one.
 			for (int i = 0; i < 4; i++) {
 				u32 val = mips->fi[inst->src1 + i];
 				val = val | (val >> 8);

From fe88d12055ef2c8f3a0f953dc3573cf5860c6dae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 4 Jan 2018 13:40:39 +0100
Subject: [PATCH 2/5] IR interpreter: Add some braces to allow variable
 declaration in the switch cases.

---
 Core/MIPS/IR/IRInterpreter.cpp | 41 +++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index 8d1b67287f..e19be9027a 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -186,12 +186,14 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		}
 
 		case IROp::Vec4Init:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(vec4InitValues[inst->src1]));
 #else
 			memcpy(&mips->f[inst->dest], vec4InitValues[inst->src1], 4 * sizeof(float));
 #endif
 			break;
+		}
 
 		case IROp::Vec4Shuffle:
 		{
@@ -203,44 +205,58 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		}
 
 		case IROp::Vec4Mov:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
-#elif PPSSPP_CONFIG(ARM64)
-			float32x4_t c = vld1q_f32(&mips->f[inst->src1]);
-			vst1q_f32(&mips->f[inst->dest], c);
+#elif PPSSPP_ARCH(ARM64)
+			vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
 #else
 			memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
 #endif
 			break;
+		}
 
 		case IROp::Vec4Add:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
+#elif PPSSPP_ARCH(ARM64)
+			vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
 				mips->f[inst->dest + i] = mips->f[inst->src1 + i] + mips->f[inst->src2 + i];
 #endif
 			break;
+		}
 
 		case IROp::Vec4Sub:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
+#elif PPSSPP_ARCH(ARM64)
+			vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
 				mips->f[inst->dest + i] = mips->f[inst->src1 + i] - mips->f[inst->src2 + i];
 #endif
 			break;
+		}
 
 		case IROp::Vec4Mul:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
+#elif PPSSPP_ARCH(ARM64)
+			vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
 				mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i];
 #endif
 			break;
+		}
 
 		case IROp::Vec4Div:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
 #else
@@ -248,8 +264,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 				mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i];
 #endif
 			break;
+		}
 
 		case IROp::Vec4Scale:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2])));
 #else
@@ -257,36 +275,50 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 				mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2];
 #endif
 			break;
+		}
 
 		case IROp::Vec4Neg:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
+#elif PPSSPP_ARCH(ARM64)
+			vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
 #else
 			for (int i = 0; i < 4; i++)
 				mips->f[inst->dest + i] = -mips->f[inst->src1 + i];
 #endif
 			break;
+		}
 
 		case IROp::Vec4Abs:
+		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
+#elif PPSSPP_ARCH(ARM64)
+			vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
 #else
 			for (int i = 0; i < 4; i++)
 				mips->f[inst->dest + i] = fabsf(mips->f[inst->src1 + i]);
 #endif
 			break;
+		}
 
 		case IROp::Vec2Unpack16To31:
+		{
 			mips->fi[inst->dest] = (mips->fi[inst->src1] << 16) >> 1;
 			mips->fi[inst->dest + 1] = (mips->fi[inst->src1] & 0xFFFF0000) >> 1;
 			break;
+		}
 
 		case IROp::Vec2Unpack16To32:
+		{
 			mips->fi[inst->dest] = (mips->fi[inst->src1] << 16);
 			mips->fi[inst->dest + 1] = (mips->fi[inst->src1] & 0xFFFF0000);
 			break;
+		}
 
 		case IROp::Vec4Unpack8To32:
+		{
 #if defined(_M_SSE)
 			__m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]);
 			src = _mm_unpacklo_epi16(src, _mm_setzero_si128());
@@ -299,6 +331,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 			mips->fi[inst->dest + 3] = (mips->fi[inst->src1]) & 0xFF000000;
 #endif
 			break;
+		}
 
 		case IROp::Vec2Pack32To16:
 		{
@@ -376,6 +409,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		}
 
 		case IROp::Vec4DuplicateUpperBitsAndShift1:  // For vuc2i, the weird one.
+		{
 			for (int i = 0; i < 4; i++) {
 				u32 val = mips->fi[inst->src1 + i];
 				val = val | (val >> 8);
@@ -384,6 +418,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 				mips->fi[inst->dest + i] = val;
 			}
 			break;
+		}
 
 		case IROp::FCmpVfpuBit:
 		{

From ca9050b84c655ca0e3c5e6b5038201a8bc6c2feb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 4 Jan 2018 13:54:58 +0100
Subject: [PATCH 3/5] On Linux, can't even include nmmintrin without explicitly
 enabling SSE 4.2 support.

---
 Core/MIPS/IR/IRInterpreter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index e19be9027a..3c50fecdb7 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -6,7 +6,7 @@
 #include "Common/Common.h"
 
 #ifdef _M_SSE
-#include <nmmintrin.h>
+#include <emmintrin.h>
 #endif
 
 #if PPSSPP_ARCH(ARM_NEON)

From 18be23ecccc8410377790b02de9a1d625758a6da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 4 Jan 2018 19:38:36 +0100
Subject: [PATCH 4/5] IR: More fixes. Still something wrong with VFPU compares
 (not caused by this PR).

---
 Core/MIPS/IR/IRCompVFPU.cpp    |  2 +-
 Core/MIPS/IR/IRInterpreter.cpp | 45 ++++++++++++++--------------------
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp
index af89d6cb47..82ecffc1da 100644
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@@ -1642,7 +1642,7 @@ namespace MIPSComp {
 		GetVectorRegsPrefixS(sregs, sz, _VS);
 		GetVectorRegsPrefixT(tregs, sz, _VT);
 
-		VCondition cond = (VCondition)(op & 0xF);
+		int cond = op & 0xF;
 		int mask = 0;
 		for (int i = 0; i < n; i++) {
 			ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]);
diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index 3c50fecdb7..f9dc4a2785 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -321,8 +321,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			__m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]);
+			src = _mm_unpacklo_epi8(src, _mm_setzero_si128());
 			src = _mm_unpacklo_epi16(src, _mm_setzero_si128());
-			src = _mm_unpacklo_epi32(src, _mm_setzero_si128());
 			_mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24));
 #else
 			mips->fi[inst->dest] = (mips->fi[inst->src1] << 24);
@@ -350,36 +350,25 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 
 		case IROp::Vec4Pack32To8:
 		{
-#if defined(_M_SSE)
-			// Packs the upper bits, so we need to shift down. Then we can just use SSE packing.
-			__m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 24);
-			val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128());
-			mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
-#else
+			// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.
+			// pshufb or SSE4 instructions can be used instead.
 			u32 val = mips->fi[inst->src1] >> 24;
 			val |= (mips->fi[inst->src1 + 1] >> 16) & 0xFF00;
 			val |= (mips->fi[inst->src1 + 2] >> 8) & 0xFF0000;
 			val |= (mips->fi[inst->src1 + 3]) & 0xFF000000;
 			mips->fi[inst->dest] = val;
 			break;
-#endif
 		}
 
 		case IROp::Vec4Pack31To8:
 		{
-#if defined(_M_SSE)
-			// Packs the upper bits (offset by 1), so we need to shift down and mask. Then we can just use SSE packing.
-			__m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 23);
-			val = _mm_and_si128(val, _mm_load_si128((const __m128i *)&lowBytesMask));
-			val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128());
-			mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
-#else
+			// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.
+			// pshufb or SSE4 instructions can be used instead.
 			u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;
 			val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00;
 			val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000;
 			val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000;
 			mips->fi[inst->dest] = val;
-#endif
 			break;
 		}
 
@@ -394,18 +383,19 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 
 		case IROp::Vec4ClampToZero:
 		{
-#if 0 && defined(_M_SSE)
-			// This is SSE4 only unfortunately, so only suitable for JIT, hence disabled above.
+#if defined(_M_SSE)
+			// Trickery: Expand the sign bit, and use andnot to zero negative values.
 			__m128i val = _mm_load_si128((const __m128i *)&mips->fi[inst->src1]);
-			val = _mm_max_epi32(val, _mm_setzero_si128());
-			mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
+			__m128i mask = _mm_srai_epi32(val, 31);
+			val = _mm_andnot_si128(mask, val);
+			_mm_store_si128((__m128i *)&mips->fi[inst->dest], val);
 #else
 			for (int i = 0; i < 4; i++) {
 				u32 val = mips->fi[inst->src1 + i];
 				mips->fi[inst->dest + i] = (int)val >= 0 ? val : 0;
 			}
-			break;
 #endif
+			break;
 		}
 
 		case IROp::Vec4DuplicateUpperBitsAndShift1:  // For vuc2i, the weird one.
@@ -450,18 +440,18 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 			} else {
 				mips->vfpuCtrl[VFPU_CTRL_CC] &= ~(1 << bit);
 			}
-		}
 			break;
+		}
 
 		case IROp::FCmpVfpuAggregate:
 		{
 			u32 mask = inst->dest;
 			u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC];
-			int a = (cc & mask) ? 0x10 : 0x00;
-			int b = (cc & mask) == mask ? 0x20 : 0x00;
-			mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | a | b;
-		}
+			int anyBit = (cc & mask) ? 0x10 : 0x00;
+			int allBit = (cc & mask) == mask ? 0x20 : 0x00;
+			mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | anyBit | allBit;
 			break;
+		}
 
 		case IROp::FCmovVfpuCC:
 			if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0xf)) & 1) == ((u32)inst->src2 >> 7)) {
@@ -792,6 +782,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 			case IRFpCompareMode::False:
 				mips->fpcond = 0;
 				break;
+			case IRFpCompareMode::NotEqualUnordered:
+				mips->fpcond = mips->f[inst->src1] != mips->f[inst->src2];
+				break;
 			case IRFpCompareMode::EqualOrdered:
 			case IRFpCompareMode::EqualUnordered:
 				mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2];

From 331a8f91e8b31d43dbf59dcefd705752f991cfe5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 4 Jan 2018 20:06:26 +0100
Subject: [PATCH 5/5] Fix that weird unordered compare mode, hopefully

---
 Core/MIPS/IR/IRCompFPU.cpp     | 2 +-
 Core/MIPS/IR/IRInst.h          | 2 +-
 Core/MIPS/IR/IRInterpreter.cpp | 8 ++++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp
index 55fb033804..ce01c7ff75 100644
--- a/Core/MIPS/IR/IRCompFPU.cpp
+++ b/Core/MIPS/IR/IRCompFPU.cpp
@@ -111,7 +111,7 @@ void IRFrontend::Comp_FPUComp(MIPSOpcode op) {
 	IRFpCompareMode mode;
 	switch (opc) {
 	case 1:      // un,  ngle (unordered)
-		mode = IRFpCompareMode::NotEqualUnordered;
+		mode = IRFpCompareMode::EitherUnordered;
 		break;
 	case 2:      // eq,  seq (equal, ordered)
 		mode = IRFpCompareMode::EqualOrdered;
diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h
index 30257c43e7..ba1712b954 100644
--- a/Core/MIPS/IR/IRInst.h
+++ b/Core/MIPS/IR/IRInst.h
@@ -263,7 +263,7 @@ inline IROp ComparisonToExit(IRComparison comp) {
 
 enum IRFpCompareMode {
 	False = 0,
-	NotEqualUnordered,
+	EitherUnordered,
 	EqualOrdered, // eq,  seq (equal, ordered)
 	EqualUnordered, // ueq, ngl (equal, unordered)
 	LessOrdered, // olt, lt (less than, ordered)
diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index f9dc4a2785..e86f88ab7a 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -782,9 +782,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 			case IRFpCompareMode::False:
 				mips->fpcond = 0;
 				break;
-			case IRFpCompareMode::NotEqualUnordered:
-				mips->fpcond = mips->f[inst->src1] != mips->f[inst->src2];
+			case IRFpCompareMode::EitherUnordered:
+			{
+				float a = mips->f[inst->src1];
+				float b = mips->f[inst->src2];
+				mips->fpcond = !(a > b || a < b || a == b);
 				break;
+			}
 			case IRFpCompareMode::EqualOrdered:
 			case IRFpCompareMode::EqualUnordered:
 				mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2];