From c3578bd8ebbf014698fb2b1a7881cd3f14687b91 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 29 Nov 2014 22:12:40 -0800 Subject: [PATCH] x86jit: Find the correct available regs. If there aren't n available regs, they may not of course be in slots 0 and 1. Have to go and find them. --- Core/MIPS/x86/RegCacheFPU.cpp | 72 +++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/Core/MIPS/x86/RegCacheFPU.cpp b/Core/MIPS/x86/RegCacheFPU.cpp index 36c95451c4..91dab6965d 100644 --- a/Core/MIPS/x86/RegCacheFPU.cpp +++ b/Core/MIPS/x86/RegCacheFPU.cpp @@ -317,19 +317,28 @@ X64Reg FPURegCache::LoadRegsVS(const u8 *v, int n) { // If they're sequential, and we wouldn't need to store them all, use a single load. // But if they're already loaded, we'd have to store, not worth it. + X64Reg res = INVALID_REG; if (sequential == n && regsLoaded < n) { // TODO: What should we do if some are in regs? Better to assemble? for (int i = 0; i < n; ++i) { StoreFromRegisterV(v[i]); } + + // Grab any available reg. + for (int i = 0; i < n; ++i) { + if (xrs[i] != INVALID_REG) { + res = xrs[i]; + break; + } + } const float *f = &mips->v[voffset[v[0]]]; if (((intptr_t)f & 0x7) == 0 && n == 2) { - emit->MOVQ_xmm(xrs[0], vregs[v[0]].location); + emit->MOVQ_xmm(res, vregs[v[0]].location); } else if (((intptr_t)f & 0xf) == 0) { // On modern processors, MOVUPS on aligned is fast, but maybe not on older ones. - emit->MOVAPS(xrs[0], vregs[v[0]].location); + emit->MOVAPS(res, vregs[v[0]].location); } else { - emit->MOVUPS(xrs[0], vregs[v[0]].location); + emit->MOVUPS(res, vregs[v[0]].location); } } else if (regsAvail >= n) { // Have enough regs, potentially all in regs. @@ -354,33 +363,54 @@ X64Reg FPURegCache::LoadRegsVS(const u8 *v, int n) { if (n >= 2) { emit->UNPCKLPS(xrs[0], Gen::R(xrs[1])); } + res = xrs[0]; } else { _dbg_assert_msg_(JIT, n > 2, "2 should not be possible here."); - // TODO: More optimal. - if (xrsLoaded[0]) { - StoreFromRegisterV(v[0]); + + // Available regs are less than n, and some may be loaded. + // Let's grab the most optimal unloaded ones. + X64Reg xr1 = n == 3 ? xrs[1] : xrs[3]; + X64Reg xr2 = xrs[2]; + if (xr1 == INVALID_REG) { + // Not one of the available ones. Grab another. + for (int i = n - 1; i >= 0; --i) { + if (xrs[i] != INVALID_REG && xrs[i] != xr2) { + StoreFromRegisterV(v[i]); + xr1 = xrs[i]; + break; + } + } } - if (xrsLoaded[1]) { - StoreFromRegisterV(v[1]); + if (xr2 == INVALID_REG) { + // Not one of the available ones. Grab another. + for (int i = n - 1; i >= 0; --i) { + if (xrs[i] != INVALID_REG && xrs[i] != xr1) { + StoreFromRegisterV(v[i]); + xr2 = xrs[i]; + break; + } + } } + if (n == 3) { - emit->MOVSS(xrs[1], vregs[v[2]].location); - emit->MOVSS(xrs[0], vregs[v[1]].location); - emit->SHUFPS(xrs[0], Gen::R(xrs[1]), _MM_SHUFFLE(3, 0, 0, 0)); - emit->MOVSS(xrs[1], vregs[v[0]].location); - emit->MOVSS(xrs[0], Gen::R(xrs[1])); + emit->MOVSS(xr2, vregs[v[2]].location); + emit->MOVSS(xr1, vregs[v[1]].location); + emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(3, 0, 0, 0)); + emit->MOVSS(xr2, vregs[v[0]].location); + emit->MOVSS(xr1, Gen::R(xr2)); } else if (n == 4) { - emit->MOVSS(xrs[1], vregs[v[2]].location); - emit->MOVSS(xrs[0], vregs[v[3]].location); - emit->UNPCKLPS(xrs[1], Gen::R(xrs[0])); - emit->MOVSS(xrs[0], vregs[v[1]].location); - emit->SHUFPS(xrs[0], Gen::R(xrs[1]), _MM_SHUFFLE(1, 0, 0, 3)); - emit->MOVSS(xrs[1], vregs[v[0]].location); - emit->MOVSS(xrs[0], Gen::R(xrs[1])); + emit->MOVSS(xr2, vregs[v[2]].location); + emit->MOVSS(xr1, vregs[v[3]].location); + emit->UNPCKLPS(xr2, Gen::R(xr1)); + emit->MOVSS(xr1, vregs[v[1]].location); + emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(1, 0, 0, 3)); + emit->MOVSS(xr2, vregs[v[0]].location); + emit->MOVSS(xr1, Gen::R(xr2)); } + res = xr1; } - return xrs[0]; + return res; } bool FPURegCache::TryMapDirtyInVS(const u8 *vd, VectorSize vdsz, const u8 *vs, VectorSize vssz, bool avoidLoad) {