From 8dfadf7b8eb0614e5e987f6d486089863d5bb430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 22 Mar 2014 16:27:23 +0100 Subject: [PATCH] ArmEmitter: Add VMOV_neon and a Size parameter to VFMA for consistency. --- Common/ArmEmitter.cpp | 6 ++++-- Common/ArmEmitter.h | 17 +++++++++++------ GPU/GLES/VertexDecoderArm.cpp | 8 ++++---- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/Common/ArmEmitter.cpp b/Common/ArmEmitter.cpp index 5a24a3dba6..ce224e9848 100644 --- a/Common/ArmEmitter.cpp +++ b/Common/ArmEmitter.cpp @@ -1984,8 +1984,9 @@ void ARMXEmitter::VEXT(ARMReg Vd, ARMReg Vn, ARMReg Vm, u8 index) Write32((0xF2 << 24) | (0xB << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (index & 0xF) \ | (register_quad << 6) | EncodeVm(Vm)); } -void ARMXEmitter::VFMA(ARMReg Vd, ARMReg Vn, ARMReg Vm) +void ARMXEmitter::VFMA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm) { + _dbg_assert_msg_(JIT, Size == F_32, "Passed invalid size to FP-only NEON instruction"); _dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__); _dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it"); _dbg_assert_msg_(JIT, cpu_info.bVFPv4, "Can't use " __FUNCTION__ " when CPU doesn't support it"); @@ -1993,8 +1994,9 @@ void ARMXEmitter::VFMA(ARMReg Vd, ARMReg Vn, ARMReg Vm) Write32((0xF2 << 24) | EncodeVn(Vn) | EncodeVd(Vd) | (0xC1 << 4) | (register_quad << 6) | EncodeVm(Vm)); } -void ARMXEmitter::VFMS(ARMReg Vd, ARMReg Vn, ARMReg Vm) +void ARMXEmitter::VFMS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm) { + _dbg_assert_msg_(JIT, Size == F_32, "Passed invalid size to FP-only NEON instruction"); _dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__); _dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it"); _dbg_assert_msg_(JIT, cpu_info.bVFPv4, "Can't use " __FUNCTION__ " when CPU doesn't support it"); diff --git a/Common/ArmEmitter.h b/Common/ArmEmitter.h index bd57612716..87bd4bca99 100644 --- a/Common/ArmEmitter.h +++ b/Common/ArmEmitter.h @@ -635,8 +635,6 @@ public: void VADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); void VADDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); void VADDW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); - void VAND(ARMReg Vd, ARMReg Vn, ARMReg Vm); - void VBIC(ARMReg Vd, ARMReg Vn, ARMReg Vm); void VBIF(ARMReg Vd, ARMReg Vn, ARMReg Vm); void VBIT(ARMReg Vd, ARMReg Vn, ARMReg Vm); void VBSL(ARMReg Vd, ARMReg Vn, ARMReg Vm); @@ -655,10 +653,9 @@ public: void VCNT(u32 Size, ARMReg Vd, ARMReg Vm); void VDUP(u32 Size, ARMReg Vd, ARMReg Vm, u8 index); void VDUP(u32 Size, ARMReg Vd, ARMReg Rt); - void VEOR(ARMReg Vd, ARMReg Vn, ARMReg Vm); void VEXT(ARMReg Vd, ARMReg Vn, ARMReg Vm, u8 index); - void VFMA(ARMReg Vd, ARMReg Vn, ARMReg Vm); - void VFMS(ARMReg Vd, ARMReg Vn, ARMReg Vm); + void VFMA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); + void VFMS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); void VHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); void VHSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); void VMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); @@ -695,9 +692,17 @@ public: void VQRDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); */ - void VNEG(u32 Size, ARMReg Vd, ARMReg Vm); + // Vector bitwise. These don't have an element size for obvious reasons. + void VAND(ARMReg Vd, ARMReg Vn, ARMReg Vm); + void VBIC(ARMReg Vd, ARMReg Vn, ARMReg Vm); + void VEOR(ARMReg Vd, ARMReg Vn, ARMReg Vm); void VORN(ARMReg Vd, ARMReg Vn, ARMReg Vm); void VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm); + inline void VMOV_neon(ARMReg Dest, ARMReg Src) { + VORR(Dest, Src, Src); + } + + void VNEG(u32 Size, ARMReg Vd, ARMReg Vm); void VPADAL(u32 Size, ARMReg Vd, ARMReg Vm); void VPADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm); void VPADDL(u32 Size, ARMReg Vd, ARMReg Vm); diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp index e95ae21db8..1517817e78 100644 --- a/GPU/GLES/VertexDecoderArm.cpp +++ b/GPU/GLES/VertexDecoderArm.cpp @@ -330,10 +330,10 @@ void VertexDecoderJitCache::Jit_ApplyWeights() { // Krait likes VDUP + VFMA better than VMLA, and it's easy to do here. if (cpu_info.bVFPv4) { VDUP(F_32, Q1, neonWeightRegs[i >> 2], i & 1); - VFMA(Q4, Q12, Q1); - VFMA(Q5, Q13, Q1); - VFMA(Q6, Q14, Q1); - VFMA(Q7, Q15, Q1); + VFMA(F_32, Q4, Q12, Q1); + VFMA(F_32, Q5, Q13, Q1); + VFMA(F_32, Q6, Q14, Q1); + VFMA(F_32, Q7, Q15, Q1); } else { VMLA_scalar(F_32, Q4, Q12, QScalar(neonWeightRegs[0], 1)); VMLA_scalar(F_32, Q5, Q13, QScalar(neonWeightRegs[0], 1));