diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index 743b2a5c1f..9e37d238b4 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -2389,6 +2389,49 @@ void Jit::Comp_Vmmov(MIPSOpcode op) {
 	MatrixSize sz = GetMtxSize(op);
 	int n = GetMatrixSide(sz);
 
+	if (jo.enableVFPUSIMD) {
+		VectorSize vsz = GetVectorSize(sz);
+		u8 dest[4][4];
+		MatrixOverlapType overlap = GetMatrixOverlap(_VD, _VS, sz);
+
+		// When VD and VS overlap, stage the columns in temps first and copy them to VD afterwards.
+		u8 vecs[4];
+		if (overlap == OVERLAP_NONE) {
+			GetMatrixColumns(_VD, sz, vecs);
+			for (int i = 0; i < n; ++i) {
+				GetVectorRegs(dest[i], vsz, vecs[i]);
+			}
+		} else {
+			for (int i = 0; i < n; ++i) {
+				fpr.GetTempVS(dest[i], vsz);
+			}
+		}
+
+		GetMatrixColumns(_VS, sz, vecs);
+		for (int i = 0; i < n; i++) {
+			u8 vec[4];
+			GetVectorRegs(vec, vsz, vecs[i]);
+			fpr.MapRegsVS(vec, vsz, 0);
+			fpr.MapRegsVS(dest[i], vsz, MAP_NOINIT);
+			MOVAPS(fpr.VSX(dest[i]), fpr.VS(vec));
+		}
+
+		if (overlap != OVERLAP_NONE) {
+			// Okay, move from the temps to VD now.
+			GetMatrixColumns(_VD, sz, vecs);
+			for (int i = 0; i < n; i++) {
+				u8 vec[4];
+				GetVectorRegs(vec, vsz, vecs[i]);
+				fpr.MapRegsVS(vec, vsz, MAP_NOINIT);
+				fpr.MapRegsVS(dest[i], vsz, 0);
+				MOVAPS(fpr.VSX(vec), fpr.VS(dest[i]));
+			}
+		}
+
+		fpr.ReleaseSpillLocks();
+		return;
+	}
+
 	u8 sregs[16], dregs[16];
 	GetMatrixRegs(sregs, sz, _VS);
 	GetMatrixRegs(dregs, sz, _VD);
diff --git a/Core/MIPS/x86/RegCacheFPU.cpp b/Core/MIPS/x86/RegCacheFPU.cpp
index 011bead2a7..74da185fb8 100644
--- a/Core/MIPS/x86/RegCacheFPU.cpp
+++ b/Core/MIPS/x86/RegCacheFPU.cpp
@@ -716,6 +716,55 @@ int FPURegCache::GetTempR() {
 	return -1;
 }
 
+// Picks one temp register per element of vsz and stores each index, minus 32, into v.
+// Prefers n free temps in a row; otherwise uses the individually free temps collected along the way.
+// The chosen temps are marked tempLocked.  Returns 0 on success, -1 if fewer than n were found.
+int FPURegCache::GetTempVS(u8 *v, VectorSize vsz) {
+	pendingFlush = true;
+	const int n = GetNumVectorElements(vsz);
+
+	// Let's collect regs as we go, but try for n free in a row.
+	int found = 0;
+	for (int r = TEMP0; r <= TEMP0 + NUM_TEMPS - n; ++r) {
+		if (regs[r].away || regs[r].tempLocked) {
+			continue;
+		}
+
+		// How many free siblings does this have?
+		int seq = 1;
+		for (int i = 1; i < n; ++i) {
+			if (regs[r + i].away || regs[r + i].tempLocked) {
+				break;
+			}
+			++seq;
+		}
+
+		if (seq == n) {
+			// Got 'em.  Exactly as many as we need.
+			for (int i = 0; i < n; ++i) {
+				v[i] = r + i - 32;
+			}
+			found = n;
+			break;
+		}
+
+		if (found < n) {
+			v[found++] = r - 32;
+		}
+	}
+
+	if (found != n) {
+		_assert_msg_(JIT, 0, "Regcache ran out of temp regs, might need to DiscardR() some.");
+		return -1;
+	}
+
+	for (int i = 0; i < n; ++i) {
+		regs[v[i] + 32].tempLocked = true;
+	}
+
+	return 0;
+}
+
 void FPURegCache::Flush() {
 	if (!pendingFlush) {
 		return;
diff --git a/Core/MIPS/x86/RegCacheFPU.h b/Core/MIPS/x86/RegCacheFPU.h
index e29a725f15..195a3188b0 100644
--- a/Core/MIPS/x86/RegCacheFPU.h
+++ b/Core/MIPS/x86/RegCacheFPU.h
@@ -114,7 +114,7 @@ public:
 	int GetTempV() {
 		return GetTempR() - 32;
 	}
-	// TODO: GetTempVS?
+	int GetTempVS(u8 *v, VectorSize vsz);
 
 	void SetEmitter(XEmitter *emitter) {emit = emitter;}
 	void SetOptions(MIPSComp::JitOptions *jo) {jo_ = jo;}
diff --git a/unittest/JitHarness.cpp b/unittest/JitHarness.cpp
index 3884c4b0c1..ea6713a100 100644
--- a/unittest/JitHarness.cpp
+++ b/unittest/JitHarness.cpp
@@ -173,7 +173,6 @@ bool TestJit() {
 #else
 			std::vector lines = DisassembleX86(block->normalEntry, block->codeSize);
 #endif
-			printf("Jit was %fx faster than interp.\n\n", jit_speed / interp_speed);
 			// Cut off at 25 due to the repetition above. Might need tweaking for large instructions.
 			const int cutoff = 25;
 			for (int i = 0; i < std::min((int)lines.size(), cutoff); i++) {
@@ -181,6 +180,7 @@ bool TestJit() {
 			}
 			if (lines.size() > cutoff)
 				printf("...\n");
+			printf("Jit was %fx faster than interp.\n\n", jit_speed / interp_speed);
 		}
 
 		printf("\n");