Armjit: Combine mul.s + neg.s to VNMUL. Implement VNMUL, VNMLA, VNMLS.

I had implemented mul.s + add/sub.s + add/sub.s -> VADD/VSUB + V(N)ML(A/S). Turns out it doesn't happen enough though (once or twice per game).
This commit is contained in:
Sacha 2013-06-08 16:39:24 +10:00
parent c2dcebf36d
commit f21218c3f9
3 changed files with 53 additions and 24 deletions

View file

@ -906,24 +906,30 @@ struct VFPEnc
// Double/single, Neon
const VFPEnc VFPOps[][2] = {
{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
{{0xE0, 0xA4}, {0x22, 0xD1}}, // 1: VMLS
{{0xE3, 0xA0}, {0x20, 0xD0}}, // 2: VADD
{{0xE3, 0xA4}, {0x22, 0xD0}}, // 3: VSUB
{{0xE2, 0xA0}, {0x30, 0xD1}}, // 4: VMUL
{{0xEB, 0xAC}, { -1 /* 0x3B */, -1 /* 0x70 */}}, // 5: VABS(Vn(0x0) used for encoding)
{{0xE8, 0xA0}, { -1, -1}}, // 6: VDIV
{{0xEB, 0xA4}, { -1 /* 0x3B */, -1 /* 0x78 */}}, // 7: VNEG(Vn(0x1) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 8: VSQRT (Vn(0x1) used for encoding)
{{0xEB, 0xA4}, { -1, -1}}, // 9: VCMP (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 10: VCMPE (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{ -1, -1}, {0x3B, 0x30}}, // 11: VABSi
{{0xE1, 0xA4}, { -1, -1}}, // 1: VNMLA
{{0xE0, 0xA4}, {0x22, 0xD1}}, // 2: VMLS
{{0xE1, 0xA0}, { -1, -1}}, // 3: VNMLS
{{0xE3, 0xA0}, {0x20, 0xD0}}, // 4: VADD
{{0xE3, 0xA4}, {0x22, 0xD0}}, // 5: VSUB
{{0xE2, 0xA0}, {0x30, 0xD1}}, // 6: VMUL
{{0xE2, 0xA4}, { -1, -1}}, // 7: VNMUL
{{0xEB, 0xAC}, { -1 /* 0x3B */, -1 /* 0x70 */}}, // 8: VABS(Vn(0x0) used for encoding)
{{0xE8, 0xA0}, { -1, -1}}, // 9: VDIV
{{0xEB, 0xA4}, { -1 /* 0x3B */, -1 /* 0x78 */}}, // 10: VNEG(Vn(0x1) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 11: VSQRT (Vn(0x1) used for encoding)
{{0xEB, 0xA4}, { -1, -1}}, // 12: VCMP (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 13: VCMPE (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{ -1, -1}, {0x3B, 0x30}}, // 14: VABSi
};
const char *VFPOpNames[] = {
"VMLA",
"VNMLA",
"VMLS",
"VNMLS",
"VADD",
"VSUB",
"VMUL",
"VNMUL",
"VABS",
"VDIV",
"VNEG",
@ -993,18 +999,21 @@ void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)
Write32(cond | (enc.opc1 << 20) | VnEnc | VdEnc | (enc.opc2 << 4) | (quad_reg << 6) | (double_reg << 8) | VmEnc);
}
void ARMXEmitter::VMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(0, Vd, Vn, Vm); }
void ARMXEmitter::VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(1, Vd, Vn, Vm); }
void ARMXEmitter::VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(2, Vd, Vn, Vm); }
void ARMXEmitter::VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(3, Vd, Vn, Vm); }
void ARMXEmitter::VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(4, Vd, Vn, Vm); }
void ARMXEmitter::VABS(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(5, Vd, D0, Vm); }
void ARMXEmitter::VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(6, Vd, Vn, Vm); }
void ARMXEmitter::VNEG(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(7, Vd, D1, Vm); }
void ARMXEmitter::VSQRT(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(8, Vd, D1, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(9, Vd, D4, Vm); }
void ARMXEmitter::VCMPE(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(10, Vd, D4, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd){ WriteVFPDataOp(9, Vd, D5, D0); }
void ARMXEmitter::VCMPE(ARMReg Vd){ WriteVFPDataOp(10, Vd, D5, D0); }
void ARMXEmitter::VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(1, Vd, Vn, Vm); }
void ARMXEmitter::VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(2, Vd, Vn, Vm); }
void ARMXEmitter::VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(3, Vd, Vn, Vm); }
void ARMXEmitter::VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(4, Vd, Vn, Vm); }
void ARMXEmitter::VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(5, Vd, Vn, Vm); }
void ARMXEmitter::VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(6, Vd, Vn, Vm); }
void ARMXEmitter::VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(7, Vd, Vn, Vm); }
void ARMXEmitter::VABS(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(8, Vd, D0, Vm); }
void ARMXEmitter::VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(9, Vd, Vn, Vm); }
void ARMXEmitter::VNEG(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(10, Vd, D1, Vm); }
void ARMXEmitter::VSQRT(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(11, Vd, D1, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(12, Vd, D4, Vm); }
void ARMXEmitter::VCMPE(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(13, Vd, D4, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd){ WriteVFPDataOp(12, Vd, D5, D0); }
void ARMXEmitter::VCMPE(ARMReg Vd){ WriteVFPDataOp(13, Vd, D5, D0); }
void ARMXEmitter::VLDR(ARMReg Dest, ARMReg Base, s16 offset)
{

View file

@ -542,6 +542,10 @@ public:
// Compares against zero
void VCMP(ARMReg Vd);
void VCMPE(ARMReg Vd);
void VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VSQRT(ARMReg Vd, ARMReg Vm);

View file

@ -16,6 +16,7 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "Core/Config.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"
#include "ArmJit.h"
#include "ArmRegCache.h"
@ -52,7 +53,22 @@ void Jit::Comp_FPU3op(u32 op)
{
case 0: VADD(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) + F(ft); //add
case 1: VSUB(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) - F(ft); //sub
case 2: VMUL(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) * F(ft); //mul
case 2: { //F(fd) = F(fs) * F(ft); //mul
u32 nextOp = Memory::Read_Instruction(js.compilerPC + 4);
// Optimise possible if destination is the same
if (fd == ((nextOp>>6) & 0x1F)) {
// VMUL + VNEG -> VNMUL
if (!strcmp(MIPSGetName(nextOp), "neg.s")) {
if (fd == ((nextOp>>11) & 0x1F)) {
VNMUL(fpr.R(fd), fpr.R(fs), fpr.R(ft));
EatInstruction(nextOp);
}
return;
}
}
VMUL(fpr.R(fd), fpr.R(fs), fpr.R(ft));
break;
}
case 3: VDIV(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) / F(ft); //div
default:
DISABLE;