From e0cc126d09390522ae30b5b8413e3490f00a4258 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 4 Jan 2018 11:37:32 +0100 Subject: [PATCH 1/5] Add some more SIMD support to IR interpreter. Mostly just because, but also serves as implementation reference for later code generation backends. --- Core/MIPS/IR/IRInterpreter.cpp | 50 +++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index d5ad7be82b..8d1b67287f 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -1,11 +1,16 @@ #include #include +#include "ppsspp_config.h" #include "math/math_util.h" #include "Common/Common.h" #ifdef _M_SSE -#include +#include +#endif + +#if PPSSPP_ARCH(ARM_NEON) +#include #endif #include "Core/Core.h" @@ -40,6 +45,10 @@ alignas(16) static const uint32_t noSignMask[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, }; +alignas(16) static const uint32_t lowBytesMask[4] = { + 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, +}; + u32 RunBreakpoint(u32 pc) { // Should we skip this breakpoint? if (CBreakPoints::CheckSkipFirst() == pc) @@ -58,6 +67,7 @@ u32 RunMemCheck(u32 pc, u32 addr) { return coreState != CORE_RUNNING ? 1 : 0; } +// We cannot use NEON on ARM32 here until we make it a hard dependency. We can, however, on ARM64. u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { const IRInst *end = inst + count; while (inst != end) { @@ -185,8 +195,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IROp::Vec4Shuffle: { - // Can't use the SSE shuffle here because it takes an immediate. - // Backends with SSE support could use that though. + // Can't use the SSE shuffle here because it takes an immediate. pshufb with a table would work though, + // or a big switch - there are only 256 shuffles possible (4^4) for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)]; break; @@ -195,6 +205,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IROp::Vec4Mov: #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1])); +#elif PPSSPP_CONFIG(ARM64) + float32x4_t c = vld1q_f32(&mips->f[inst->src1]); + vst1q_f32(&mips->f[inst->dest], c); #else memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float)); #endif @@ -274,10 +287,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { break; case IROp::Vec4Unpack8To32: +#if defined(_M_SSE) + __m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]); + src = _mm_unpacklo_epi16(src, _mm_setzero_si128()); + src = _mm_unpacklo_epi32(src, _mm_setzero_si128()); + _mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24)); +#else mips->fi[inst->dest] = (mips->fi[inst->src1] << 24); mips->fi[inst->dest + 1] = (mips->fi[inst->src1] << 16) & 0xFF000000; mips->fi[inst->dest + 2] = (mips->fi[inst->src1] << 8) & 0xFF000000; mips->fi[inst->dest + 3] = (mips->fi[inst->src1]) & 0xFF000000; +#endif break; case IROp::Vec2Pack32To16: @@ -297,21 +317,36 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IROp::Vec4Pack32To8: { +#if defined(_M_SSE) + // Packs the upper bits, so we need to shift down. Then we can just use SSE packing. + __m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 24); + val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128()); + mips->fi[inst->dest] = _mm_cvtsi128_si32(val); +#else u32 val = mips->fi[inst->src1] >> 24; val |= (mips->fi[inst->src1 + 1] >> 16) & 0xFF00; val |= (mips->fi[inst->src1 + 2] >> 8) & 0xFF0000; val |= (mips->fi[inst->src1 + 3]) & 0xFF000000; mips->fi[inst->dest] = val; break; +#endif } case IROp::Vec4Pack31To8: { +#if defined(_M_SSE) + // Packs the upper bits (offset by 1), so we need to shift down and mask. Then we can just use SSE packing. + __m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 23); + val = _mm_and_si128(val, _mm_load_si128((const __m128i *)&lowBytesMask)); + val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128()); + mips->fi[inst->dest] = _mm_cvtsi128_si32(val); +#else u32 val = (mips->fi[inst->src1] >> 23) & 0xFF; val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00; val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000; val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000; mips->fi[inst->dest] = val; +#endif break; } @@ -326,14 +361,21 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IROp::Vec4ClampToZero: { +#if 0 && defined(_M_SSE) + // This is SSE4 only unfortunately, so only suitable for JIT, hence disabled above. + __m128i val = _mm_load_si128((const __m128i *)&mips->fi[inst->src1]); + val = _mm_max_epi32(val, _mm_setzero_si128()); + mips->fi[inst->dest] = _mm_cvtsi128_si32(val); +#else for (int i = 0; i < 4; i++) { u32 val = mips->fi[inst->src1 + i]; mips->fi[inst->dest + i] = (int)val >= 0 ? val : 0; } break; +#endif } - case IROp::Vec4DuplicateUpperBitsAndShift1: + case IROp::Vec4DuplicateUpperBitsAndShift1: // For vuc2i, the weird one. for (int i = 0; i < 4; i++) { u32 val = mips->fi[inst->src1 + i]; val = val | (val >> 8); From fe88d12055ef2c8f3a0f953dc3573cf5860c6dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 4 Jan 2018 13:40:39 +0100 Subject: [PATCH 2/5] IR interpreter: Add some braces to allow variable declaration in the switch cases. --- Core/MIPS/IR/IRInterpreter.cpp | 41 +++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 8d1b67287f..e19be9027a 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -186,12 +186,14 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { } case IROp::Vec4Init: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(vec4InitValues[inst->src1])); #else memcpy(&mips->f[inst->dest], vec4InitValues[inst->src1], 4 * sizeof(float)); #endif break; + } case IROp::Vec4Shuffle: { @@ -203,44 +205,58 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { } case IROp::Vec4Mov: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1])); -#elif PPSSPP_CONFIG(ARM64) - float32x4_t c = vld1q_f32(&mips->f[inst->src1]); - vst1q_f32(&mips->f[inst->dest], c); +#elif PPSSPP_ARCH(ARM64) + vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1])); #else memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float)); #endif break; + } case IROp::Vec4Add: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#elif PPSSPP_ARCH(ARM64) + vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2]))); #else for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = mips->f[inst->src1 + i] + mips->f[inst->src2 + i]; #endif break; + } case IROp::Vec4Sub: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#elif PPSSPP_ARCH(ARM64) + vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2]))); #else for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = mips->f[inst->src1 + i] - mips->f[inst->src2 + i]; #endif break; + } case IROp::Vec4Mul: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#elif PPSSPP_ARCH(ARM64) + vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2]))); #else for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; #endif break; + } case IROp::Vec4Div: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); #else @@ -248,8 +264,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i]; #endif break; + } case IROp::Vec4Scale: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2]))); #else @@ -257,36 +275,50 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2]; #endif break; + } case IROp::Vec4Neg: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits))); +#elif PPSSPP_ARCH(ARM64) + vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1]))); #else for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = -mips->f[inst->src1 + i]; #endif break; + } case IROp::Vec4Abs: + { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask))); +#elif PPSSPP_ARCH(ARM64) + vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1]))); #else for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = fabsf(mips->f[inst->src1 + i]); #endif break; + } case IROp::Vec2Unpack16To31: + { mips->fi[inst->dest] = (mips->fi[inst->src1] << 16) >> 1; mips->fi[inst->dest + 1] = (mips->fi[inst->src1] & 0xFFFF0000) >> 1; break; + } case IROp::Vec2Unpack16To32: + { mips->fi[inst->dest] = (mips->fi[inst->src1] << 16); mips->fi[inst->dest + 1] = (mips->fi[inst->src1] & 0xFFFF0000); break; + } case IROp::Vec4Unpack8To32: + { #if defined(_M_SSE) __m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]); src = _mm_unpacklo_epi16(src, _mm_setzero_si128()); @@ -299,6 +331,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { mips->fi[inst->dest + 3] = (mips->fi[inst->src1]) & 0xFF000000; #endif break; + } case IROp::Vec2Pack32To16: { @@ -376,6 +409,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { } case IROp::Vec4DuplicateUpperBitsAndShift1: // For vuc2i, the weird one. + { for (int i = 0; i < 4; i++) { u32 val = mips->fi[inst->src1 + i]; val = val | (val >> 8); @@ -384,6 +418,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { mips->fi[inst->dest + i] = val; } break; + } case IROp::FCmpVfpuBit: { From ca9050b84c655ca0e3c5e6b5038201a8bc6c2feb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 4 Jan 2018 13:54:58 +0100 Subject: [PATCH 3/5] On Linux, can't even include nmmintrin without explicitly enabling SSE 4.2 support. --- Core/MIPS/IR/IRInterpreter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index e19be9027a..3c50fecdb7 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -6,7 +6,7 @@ #include "Common/Common.h" #ifdef _M_SSE -#include +#include #endif #if PPSSPP_ARCH(ARM_NEON) From 18be23ecccc8410377790b02de9a1d625758a6da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 4 Jan 2018 19:38:36 +0100 Subject: [PATCH 4/5] IR: More fixes. Still something wrong with VFPU compares (not caused by this PR). --- Core/MIPS/IR/IRCompVFPU.cpp | 2 +- Core/MIPS/IR/IRInterpreter.cpp | 45 ++++++++++++++-------------------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index af89d6cb47..82ecffc1da 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -1642,7 +1642,7 @@ namespace MIPSComp { GetVectorRegsPrefixS(sregs, sz, _VS); GetVectorRegsPrefixT(tregs, sz, _VT); - VCondition cond = (VCondition)(op & 0xF); + int cond = op & 0xF; int mask = 0; for (int i = 0; i < n; i++) { ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]); diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 3c50fecdb7..f9dc4a2785 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -321,8 +321,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { { #if defined(_M_SSE) __m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]); + src = _mm_unpacklo_epi8(src, _mm_setzero_si128()); src = _mm_unpacklo_epi16(src, _mm_setzero_si128()); - src = _mm_unpacklo_epi32(src, _mm_setzero_si128()); _mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24)); #else mips->fi[inst->dest] = (mips->fi[inst->src1] << 24); @@ -350,36 +350,25 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IROp::Vec4Pack32To8: { -#if defined(_M_SSE) - // Packs the upper bits, so we need to shift down. Then we can just use SSE packing. - __m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 24); - val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128()); - mips->fi[inst->dest] = _mm_cvtsi128_si32(val); -#else + // Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2. + // pshufb or SSE4 instructions can be used instead. u32 val = mips->fi[inst->src1] >> 24; val |= (mips->fi[inst->src1 + 1] >> 16) & 0xFF00; val |= (mips->fi[inst->src1 + 2] >> 8) & 0xFF0000; val |= (mips->fi[inst->src1 + 3]) & 0xFF000000; mips->fi[inst->dest] = val; break; -#endif } case IROp::Vec4Pack31To8: { -#if defined(_M_SSE) - // Packs the upper bits (offset by 1), so we need to shift down and mask. Then we can just use SSE packing. - __m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 23); - val = _mm_and_si128(val, _mm_load_si128((const __m128i *)&lowBytesMask)); - val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128()); - mips->fi[inst->dest] = _mm_cvtsi128_si32(val); -#else + // Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2. + // pshufb or SSE4 instructions can be used instead. u32 val = (mips->fi[inst->src1] >> 23) & 0xFF; val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00; val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000; val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000; mips->fi[inst->dest] = val; -#endif break; } @@ -394,18 +383,19 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IROp::Vec4ClampToZero: { -#if 0 && defined(_M_SSE) - // This is SSE4 only unfortunately, so only suitable for JIT, hence disabled above. +#if defined(_M_SSE) + // Trickery: Expand the sign bit, and use andnot to zero negative values. __m128i val = _mm_load_si128((const __m128i *)&mips->fi[inst->src1]); - val = _mm_max_epi32(val, _mm_setzero_si128()); - mips->fi[inst->dest] = _mm_cvtsi128_si32(val); + __m128i mask = _mm_srai_epi32(val, 31); + val = _mm_andnot_si128(mask, val); + _mm_store_si128((__m128i *)&mips->fi[inst->dest], val); #else for (int i = 0; i < 4; i++) { u32 val = mips->fi[inst->src1 + i]; mips->fi[inst->dest + i] = (int)val >= 0 ? val : 0; } - break; #endif + break; } case IROp::Vec4DuplicateUpperBitsAndShift1: // For vuc2i, the weird one. @@ -450,18 +440,18 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { } else { mips->vfpuCtrl[VFPU_CTRL_CC] &= ~(1 << bit); } - } break; + } case IROp::FCmpVfpuAggregate: { u32 mask = inst->dest; u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC]; - int a = (cc & mask) ? 0x10 : 0x00; - int b = (cc & mask) == mask ? 0x20 : 0x00; - mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | a | b; - } + int anyBit = (cc & mask) ? 0x10 : 0x00; + int allBit = (cc & mask) == mask ? 0x20 : 0x00; + mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | anyBit | allBit; break; + } case IROp::FCmovVfpuCC: if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0xf)) & 1) == ((u32)inst->src2 >> 7)) { @@ -792,6 +782,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IRFpCompareMode::False: mips->fpcond = 0; break; + case IRFpCompareMode::NotEqualUnordered: + mips->fpcond = mips->f[inst->src1] != mips->f[inst->src2]; + break; case IRFpCompareMode::EqualOrdered: case IRFpCompareMode::EqualUnordered: mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2]; From 331a8f91e8b31d43dbf59dcefd705752f991cfe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 4 Jan 2018 20:06:26 +0100 Subject: [PATCH 5/5] Fix that weird unordered compare mode, hopefully --- Core/MIPS/IR/IRCompFPU.cpp | 2 +- Core/MIPS/IR/IRInst.h | 2 +- Core/MIPS/IR/IRInterpreter.cpp | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index 55fb033804..ce01c7ff75 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -111,7 +111,7 @@ void IRFrontend::Comp_FPUComp(MIPSOpcode op) { IRFpCompareMode mode; switch (opc) { case 1: // un, ngle (unordered) - mode = IRFpCompareMode::NotEqualUnordered; + mode = IRFpCompareMode::EitherUnordered; break; case 2: // eq, seq (equal, ordered) mode = IRFpCompareMode::EqualOrdered; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 30257c43e7..ba1712b954 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -263,7 +263,7 @@ inline IROp ComparisonToExit(IRComparison comp) { enum IRFpCompareMode { False = 0, - NotEqualUnordered, + EitherUnordered, EqualOrdered, // eq, seq (equal, ordered) EqualUnordered, // ueq, ngl (equal, unordered) LessOrdered, // olt, lt (less than, ordered) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index f9dc4a2785..e86f88ab7a 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -782,9 +782,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IRFpCompareMode::False: mips->fpcond = 0; break; - case IRFpCompareMode::NotEqualUnordered: - mips->fpcond = mips->f[inst->src1] != mips->f[inst->src2]; + case IRFpCompareMode::EitherUnordered: + { + float a = mips->f[inst->src1]; + float b = mips->f[inst->src2]; + mips->fpcond = !(a > b || a < b || a == b); break; + } case IRFpCompareMode::EqualOrdered: case IRFpCompareMode::EqualUnordered: mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2];