ARM32: Remove a lot of non-NEON fallback paths

This commit is contained in:
Henrik Rydgård 2022-04-13 10:36:37 +02:00
parent 19261c6c49
commit 584e94f01e
10 changed files with 339 additions and 910 deletions

View file

@ -1,14 +1,12 @@
#include "ppsspp_config.h" #include "ppsspp_config.h"
#include "fast_math.h" #include "fast_math.h"
#include "fast_matrix.h" #include "fast_matrix.h"
void InitFastMath(int enableNEON) { void InitFastMath() {
// Every architecture has its own define. This needs to be added to.
if (enableNEON) {
#ifndef _MSC_VER #ifndef _MSC_VER
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64) #if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon; fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon;
#endif #endif
#endif #endif
}
} }

View file

@ -14,8 +14,8 @@ extern "C" {
// See fast_matrix.h for the first set of functions. // See fast_matrix.h for the first set of functions.
void InitFastMath(int enableNEON); void InitFastMath();
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View file

@ -138,9 +138,7 @@ void ArmJit::GenerateFixedCode() {
// consumed by CALL. // consumed by CALL.
SUB(R_SP, R_SP, 4); SUB(R_SP, R_SP, 4);
// Now we are correctly aligned and plan to stay that way. // Now we are correctly aligned and plan to stay that way.
if (cpu_info.bNEON) { VPUSH(D8, 8);
VPUSH(D8, 8);
}
// Fixed registers, these are always kept when in Jit context. // Fixed registers, these are always kept when in Jit context.
// R8 is used to hold flags during delay slots. Not always needed. // R8 is used to hold flags during delay slots. Not always needed.
@ -244,10 +242,7 @@ void ArmJit::GenerateFixedCode() {
SaveDowncount(); SaveDowncount();
RestoreRoundingMode(true); RestoreRoundingMode(true);
// Doing this above the downcount for better pipelining (slightly.) VPOP(D8, 8);
if (cpu_info.bNEON) {
VPOP(D8, 8);
}
ADD(R_SP, R_SP, 4); ADD(R_SP, R_SP, 4);

View file

@ -1132,10 +1132,6 @@ namespace MIPSComp
DISABLE; DISABLE;
} }
if (!cpu_info.bNEON) {
DISABLE;
}
// This multi-VCVT.F32.F16 is only available in the VFPv4 extension. // This multi-VCVT.F32.F16 is only available in the VFPv4 extension.
// The VFPv3 one is VCVTB, VCVTT which we don't yet have support for. // The VFPv3 one is VCVTB, VCVTT which we don't yet have support for.
if (!(cpu_info.bHalf && cpu_info.bVFPv4)) { if (!(cpu_info.bHalf && cpu_info.bVFPv4)) {
@ -1599,10 +1595,6 @@ namespace MIPSComp
DISABLE; DISABLE;
} }
if (!cpu_info.bNEON) {
DISABLE;
}
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3) int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2) bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

View file

@ -27,13 +27,7 @@
using namespace ArmGen; using namespace ArmGen;
using namespace ArmJitConstants; using namespace ArmJitConstants;
ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) { ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {}
if (cpu_info.bNEON) {
numARMFpuReg_ = 32;
} else {
numARMFpuReg_ = 16;
}
}
void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) { void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
if (!initialReady) { if (!initialReady) {
@ -47,7 +41,7 @@ void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
} }
void ArmRegCacheFPU::SetupInitialRegs() { void ArmRegCacheFPU::SetupInitialRegs() {
for (int i = 0; i < numARMFpuReg_; i++) { for (int i = 0; i < NUM_ARMFPUREG; i++) {
arInitial[i].mipsReg = -1; arInitial[i].mipsReg = -1;
arInitial[i].isDirty = false; arInitial[i].isDirty = false;
} }
@ -57,7 +51,7 @@ void ArmRegCacheFPU::SetupInitialRegs() {
mrInitial[i].spillLock = false; mrInitial[i].spillLock = false;
mrInitial[i].tempLock = false; mrInitial[i].tempLock = false;
} }
for (int i = 0; i < MAX_ARMQUADS; i++) { for (int i = 0; i < NUM_ARMQUADS; i++) {
qr[i].isDirty = false; qr[i].isDirty = false;
qr[i].mipsVec = -1; qr[i].mipsVec = -1;
qr[i].sz = V_Invalid; qr[i].sz = V_Invalid;
@ -68,14 +62,6 @@ void ArmRegCacheFPU::SetupInitialRegs() {
} }
const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) { const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
// We reserve S0-S1 as scratch. Can afford two registers. Maybe even four, which could simplify some things.
static const ARMReg allocationOrder[] = {
S2, S3,
S4, S5, S6, S7,
S8, S9, S10, S11,
S12, S13, S14, S15
};
// VFP mapping // VFP mapping
// VFPU registers and regular FP registers are mapped interchangeably on top of the standard // VFPU registers and regular FP registers are mapped interchangeably on top of the standard
// 16 FPU registers. // 16 FPU registers.
@ -116,12 +102,9 @@ const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
if (jo_->useNEONVFPU) { if (jo_->useNEONVFPU) {
count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARMReg); count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARMReg);
return allocationOrderNEONVFPU; return allocationOrderNEONVFPU;
} else if (cpu_info.bNEON) { } else {
count = sizeof(allocationOrderNEON) / sizeof(const ARMReg); count = sizeof(allocationOrderNEON) / sizeof(const ARMReg);
return allocationOrderNEON; return allocationOrderNEON;
} else {
count = sizeof(allocationOrder) / sizeof(const ARMReg);
return allocationOrder;
} }
} }
@ -404,19 +387,12 @@ void ArmRegCacheFPU::FlushR(MIPSReg r) {
mr[r].reg = (int)INVALID_REG; mr[r].reg = (int)INVALID_REG;
} }
int ArmRegCacheFPU::GetNumARMFPURegs() {
if (cpu_info.bNEON)
return 32;
else
return 16;
}
// Scalar only. Need a similar one for sequential Q vectors. // Scalar only. Need a similar one for sequential Q vectors.
int ArmRegCacheFPU::FlushGetSequential(int a, int maxArmReg) { int ArmRegCacheFPU::FlushGetSequential(int a) {
int c = 1; int c = 1;
int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg); int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg);
a++; a++;
while (a < maxArmReg) { while (a < 32) {
if (!ar[a].isDirty || ar[a].mipsReg == -1) if (!ar[a].isDirty || ar[a].mipsReg == -1)
break; break;
int mipsOffset = GetMipsRegOffset(ar[a].mipsReg); int mipsOffset = GetMipsRegOffset(ar[a].mipsReg);
@ -444,7 +420,7 @@ void ArmRegCacheFPU::FlushAll() {
// Flush quads! // Flush quads!
// These could also use sequential detection. // These could also use sequential detection.
for (int i = 4; i < MAX_ARMQUADS; i++) { for (int i = 4; i < NUM_ARMQUADS; i++) {
QFlush(i); QFlush(i);
} }
@ -466,7 +442,7 @@ void ArmRegCacheFPU::FlushAll() {
continue; continue;
} }
int c = FlushGetSequential(a, GetNumARMFPURegs()); int c = FlushGetSequential(a);
if (c == 1) { if (c == 1) {
// INFO_LOG(JIT, "Got single register: %i (%i)", a, m); // INFO_LOG(JIT, "Got single register: %i (%i)", a, m);
emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m)); emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m));
@ -502,7 +478,7 @@ void ArmRegCacheFPU::FlushAll() {
} }
// Sanity check // Sanity check
for (int i = 0; i < numARMFpuReg_; i++) { for (int i = 0; i < NUM_ARMFPUREG; i++) {
if (ar[i].mipsReg != -1) { if (ar[i].mipsReg != -1) {
ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg); ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg);
} }
@ -594,7 +570,7 @@ void ArmRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {
for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) { for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
DiscardR(i); DiscardR(i);
} }
for (int i = 0; i < MAX_ARMQUADS; i++) { for (int i = 0; i < NUM_ARMQUADS; i++) {
qr[i].spillLock = false; qr[i].spillLock = false;
if (qr[i].isTemp) { if (qr[i].isTemp) {
qr[i].isTemp = false; qr[i].isTemp = false;

View file

@ -127,7 +127,7 @@ public:
// VFPU registers as single VFP registers. // VFPU registers as single VFP registers.
ArmGen::ARMReg V(int vreg) { return R(vreg + 32); } ArmGen::ARMReg V(int vreg) { return R(vreg + 32); }
int FlushGetSequential(int a, int maxArmReg); int FlushGetSequential(int a);
void FlushAll(); void FlushAll();
// This one is allowed at any point. // This one is allowed at any point.
@ -180,7 +180,6 @@ private:
} }
// This one WILL get a free quad as long as you haven't spill-locked them all. // This one WILL get a free quad as long as you haven't spill-locked them all.
int QGetFreeQuad(int start, int count, const char *reason); int QGetFreeQuad(int start, int count, const char *reason);
int GetNumARMFPURegs();
void SetupInitialRegs(); void SetupInitialRegs();
@ -189,24 +188,23 @@ private:
MIPSComp::JitState *js_; MIPSComp::JitState *js_;
MIPSComp::JitOptions *jo_; MIPSComp::JitOptions *jo_;
int numARMFpuReg_;
int qTime_; int qTime_;
enum { enum {
// With NEON, we have 64 S = 32 D = 16 Q registers. Only the first 32 S registers // With NEON, we have 64 S = 32 D = 16 Q registers. Only the first 32 S registers
// are individually mappable though. // are individually mappable though.
MAX_ARMFPUREG = 32, NUM_ARMFPUREG = 32,
MAX_ARMQUADS = 16, NUM_ARMQUADS = 16,
NUM_MIPSFPUREG = ArmJitConstants::TOTAL_MAPPABLE_MIPSFPUREGS, NUM_MIPSFPUREG = ArmJitConstants::TOTAL_MAPPABLE_MIPSFPUREGS,
}; };
FPURegARM ar[MAX_ARMFPUREG]; FPURegARM ar[NUM_ARMFPUREG];
FPURegMIPS mr[NUM_MIPSFPUREG]; FPURegMIPS mr[NUM_MIPSFPUREG];
FPURegQuad qr[MAX_ARMQUADS]; FPURegQuad qr[NUM_ARMQUADS];
FPURegMIPS *vr; FPURegMIPS *vr;
bool pendingFlush; bool pendingFlush;
bool initialReady = false; bool initialReady = false;
FPURegARM arInitial[MAX_ARMFPUREG]; FPURegARM arInitial[NUM_ARMFPUREG];
FPURegMIPS mrInitial[NUM_MIPSFPUREG]; FPURegMIPS mrInitial[NUM_MIPSFPUREG];
}; };

View file

@ -38,7 +38,7 @@ namespace MIPSComp {
// ARM only // ARM only
downcountInRegister = true; downcountInRegister = true;
useNEONVFPU = false; // true useNEONVFPU = false; // true
if (!cpu_info.bNEON || Disabled(JitDisable::SIMD)) if (Disabled(JitDisable::SIMD))
useNEONVFPU = false; useNEONVFPU = false;
//ARM64 //ARM64

View file

@ -901,9 +901,7 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
#ifdef _M_SSE #ifdef _M_SSE
return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h); return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON) #elif PPSSPP_ARCH(ARM_NEON)
if (cpu_info.bNEON) { return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
}
#endif #endif
} }
@ -931,9 +929,7 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
#ifdef _M_SSE #ifdef _M_SSE
return CheckAlphaABGR4444SSE2(pixelData, stride, w, h); return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON) #elif PPSSPP_ARCH(ARM_NEON)
if (cpu_info.bNEON) { return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
}
#endif #endif
} }
@ -964,9 +960,7 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
#ifdef _M_SSE #ifdef _M_SSE
return CheckAlphaABGR1555SSE2(pixelData, stride, w, h); return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON) #elif PPSSPP_ARCH(ARM_NEON)
if (cpu_info.bNEON) { return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
}
#endif #endif
} }
@ -996,9 +990,7 @@ CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w
#ifdef _M_SSE #ifdef _M_SSE
return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h); return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON) #elif PPSSPP_ARCH(ARM_NEON)
if (cpu_info.bNEON) { return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
}
#endif #endif
} }
@ -1029,9 +1021,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
#ifdef _M_SSE #ifdef _M_SSE
return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h); return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON) #elif PPSSPP_ARCH(ARM_NEON)
if (cpu_info.bNEON) { return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
}
#endif #endif
} }

File diff suppressed because it is too large Load diff

View file

@ -464,7 +464,7 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch
ShaderTranslationInit(); ShaderTranslationInit();
InitFastMath(cpu_info.bNEON); InitFastMath();
g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count); g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);
g_Discord.SetPresenceMenu(); g_Discord.SetPresenceMenu();