From 5290ffd9296a030a0abb05a46aca3aafd97801ca Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Wed, 3 Dec 2014 22:42:33 +0100 Subject: [PATCH] Minor cleanup in vtfm. Re-enable vrot combination. Optimize vfad/vavg when dpps is available. Also fixes bug in emitter of dpps. --- Common/x64Emitter.cpp | 2 +- Core/MIPS/x86/CompVFPU.cpp | 84 +++++++++++++++++++++++--------------- 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp index 64c1263a8b..3e8f3e958b 100644 --- a/Common/x64Emitter.cpp +++ b/Common/x64Emitter.cpp @@ -1743,7 +1743,7 @@ void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int ex void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);} void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);} void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);} -void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg); Write8(mask);} +void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);} void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);} void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);} diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 84f85c4294..217d3b4bf8 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -2754,11 +2754,9 @@ void Jit::Comp_Vtfm(MIPSOpcode op) { // The T matrix we will address individually. GetVectorRegs(dcol, sz, vd); GetMatrixRows(vs, msz, scols); - memset(tregs, 255, sizeof(tregs)); GetVectorRegs(tregs, sz, vt); - for (int i = 0; i < ARRAY_SIZE(tregs); i++) { - if (tregs[i] != 255) - fpr.StoreFromRegisterV(tregs[i]); + for (int i = 0; i < n; i++) { + fpr.StoreFromRegisterV(tregs[i]); } u8 scol[4][4]; @@ -2767,7 +2765,6 @@ void Jit::Comp_Vtfm(MIPSOpcode op) { for (int i = 0; i < n; i++) { GetVectorRegs(scol[i], sz, scols[i]); fpr.MapRegsVS(scol[i], sz, 0); - fpr.SpillLockV(scols[i], sz); } // Now, work our way through the matrix, loading things as we go. @@ -2792,7 +2789,6 @@ void Jit::Comp_Vtfm(MIPSOpcode op) { return; } - u8 sregs[16], dregs[4], tregs[4]; GetMatrixRegs(sregs, msz, _VS); GetVectorRegs(tregs, sz, _VT); @@ -2985,32 +2981,52 @@ void Jit::Comp_Vhoriz(MIPSOpcode op) { GetVectorRegsPrefixS(sregs, sz, _VS); GetVectorRegsPrefixD(dregs, V_Single, _VD); if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) { - switch (sz) { - case V_Pair: - MOVAPS(XMM0, fpr.VS(sregs)); - MOVAPS(XMM1, R(XMM0)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,1)); - ADDPS(XMM0, R(XMM1)); - MOVAPS(fpr.VSX(dregs), R(XMM0)); - break; - case V_Triple: - MOVAPS(XMM0, fpr.VS(sregs)); - MOVAPS(XMM1, R(XMM0)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,1)); - ADDPS(XMM0, R(XMM1)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,2)); - ADDPS(XMM0, R(XMM1)); - MOVAPS(fpr.VSX(dregs), R(XMM0)); - break; - case V_Quad: - MOVAPS(XMM0, fpr.VS(sregs)); - MOVHLPS(XMM1, XMM0); - ADDPS(XMM0, R(XMM1)); - MOVAPS(XMM1, R(XMM0)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1,1,1,1)); - ADDPS(XMM0, R(XMM1)); - MOVAPS(fpr.VSX(dregs), R(XMM0)); - break; + if (cpu_info.bSSE4_1) { + switch (sz) { + case V_Pair: + MOVAPS(XMM0, fpr.VS(sregs)); + DPPS(XMM0, M(&oneOneOneOne), 0x31); + MOVAPS(fpr.VSX(dregs), R(XMM0)); + break; + case V_Triple: + MOVAPS(XMM0, fpr.VS(sregs)); + DPPS(XMM0, M(&oneOneOneOne), 0x71); + MOVAPS(fpr.VSX(dregs), R(XMM0)); + break; + case V_Quad: + MOVAPS(XMM0, fpr.VS(sregs)); + DPPS(XMM0, M(&oneOneOneOne), 0xF1); + MOVAPS(fpr.VSX(dregs), R(XMM0)); + break; + } + } else { + switch (sz) { + case V_Pair: + MOVAPS(XMM0, fpr.VS(sregs)); + MOVAPS(XMM1, R(XMM0)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1)); + ADDPS(XMM0, R(XMM1)); + MOVAPS(fpr.VSX(dregs), R(XMM0)); + break; + case V_Triple: + MOVAPS(XMM0, fpr.VS(sregs)); + MOVAPS(XMM1, R(XMM0)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1)); + ADDPS(XMM0, R(XMM1)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 2)); + ADDPS(XMM0, R(XMM1)); + MOVAPS(fpr.VSX(dregs), R(XMM0)); + break; + case V_Quad: + MOVAPS(XMM0, fpr.VS(sregs)); + MOVHLPS(XMM1, XMM0); + ADDPS(XMM0, R(XMM1)); + MOVAPS(XMM1, R(XMM0)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1, 1, 1, 1)); + ADDPS(XMM0, R(XMM1)); + MOVAPS(fpr.VSX(dregs), R(XMM0)); + break; + } } if (((op >> 16) & 31) == 7) { // vavg MULSS(fpr.VSX(dregs), M(&vavg_table[n])); @@ -3150,11 +3166,11 @@ void Jit::Comp_VRot(MIPSOpcode op) { u32 nextOp = Memory::Read_Opcode_JIT(js.compilerPC + 4).encoding; int vd2 = -1; int imm2 = -1; - if (false && (nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) { + if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) { // Pair of vrot. Let's join them. vd2 = MIPS_GET_VD(nextOp); imm2 = (nextOp >> 16) & 0x1f; - NOTICE_LOG(JIT, "Joint VFPU at %08x", js.blockStart); + // NOTICE_LOG(JIT, "Joint VFPU at %08x", js.blockStart); } u8 sreg;