ARM32: Remove a lot of non-NEON fallback paths

2025-04-02 11:01:50 -04:00 · 2022-04-13 10:36:37 +02:00 · 2022-04-13 10:36:37 +02:00 · 584e94f01e
commit 584e94f01e
parent 19261c6c49
10 changed files with 339 additions and 910 deletions
--- a/Common/Math/fast/fast_math.c
+++ b/Common/Math/fast/fast_math.c
@ -1,14 +1,12 @@
 #include "ppsspp_config.h"
 #include "fast_math.h"
 #include "fast_matrix.h"
-void InitFastMath(int enableNEON) {
+void InitFastMath() {
 	// Every architecture has its own define. This needs to be added to.
-	if (enableNEON) {
 #ifndef _MSC_VER
 #if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
 		fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon;
 #endif
 #endif
-	}
 }
--- a/Common/Math/fast/fast_math.h
+++ b/Common/Math/fast/fast_math.h
@ -14,8 +14,8 @@ extern "C" {
 // See fast_matrix.h for the first set of functions.
-void InitFastMath(int enableNEON);
+void InitFastMath();
 #ifdef __cplusplus
 }
-#endif
+#endif
--- a/Core/MIPS/ARM/ArmAsm.cpp
+++ b/Core/MIPS/ARM/ArmAsm.cpp
@ -138,9 +138,7 @@ void ArmJit::GenerateFixedCode() {
 	// consumed by CALL.
 	SUB(R_SP, R_SP, 4);
 	// Now we are correctly aligned and plan to stay that way.
-	if (cpu_info.bNEON) {
+	VPUSH(D8, 8);
-		VPUSH(D8, 8);
-	}
 	// Fixed registers, these are always kept when in Jit context.
 	// R8 is used to hold flags during delay slots. Not always needed.
@ -244,10 +242,7 @@ void ArmJit::GenerateFixedCode() {
 	SaveDowncount();
 	RestoreRoundingMode(true);
-	// Doing this above the downcount for better pipelining (slightly.)
+	VPOP(D8, 8);
-	if (cpu_info.bNEON) {
-		VPOP(D8, 8);
-	}
 	ADD(R_SP, R_SP, 4);
--- a/Core/MIPS/ARM/ArmCompVFPU.cpp
+++ b/Core/MIPS/ARM/ArmCompVFPU.cpp
@ -1132,10 +1132,6 @@ namespace MIPSComp
 			DISABLE;
 		}
-		if (!cpu_info.bNEON) {
-			DISABLE;
-		}
 		// This multi-VCVT.F32.F16 is only available in the VFPv4 extension.
 		// The VFPv3 one is VCVTB, VCVTT which we don't yet have support for.
 		if (!(cpu_info.bHalf && cpu_info.bVFPv4)) {
@ -1599,10 +1595,6 @@ namespace MIPSComp
 			DISABLE;
 		}
-		if (!cpu_info.bNEON) {
-			DISABLE;
-		}
 		int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
 		bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)
--- a/Core/MIPS/ARM/ArmRegCacheFPU.cpp
+++ b/Core/MIPS/ARM/ArmRegCacheFPU.cpp
@ -27,13 +27,7 @@
 using namespace ArmGen;
 using namespace ArmJitConstants;
-ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {
+ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {}
-	if (cpu_info.bNEON) {
-		numARMFpuReg_ = 32;
-	} else {
-		numARMFpuReg_ = 16;
-	}
-}
 void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
 	if (!initialReady) {
@ -47,7 +41,7 @@ void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
 }
 void ArmRegCacheFPU::SetupInitialRegs() {
-	for (int i = 0; i < numARMFpuReg_; i++) {
+	for (int i = 0; i < NUM_ARMFPUREG; i++) {
 		arInitial[i].mipsReg = -1;
 		arInitial[i].isDirty = false;
 	}
@ -57,7 +51,7 @@ void ArmRegCacheFPU::SetupInitialRegs() {
 		mrInitial[i].spillLock = false;
 		mrInitial[i].tempLock = false;
 	}
-	for (int i = 0; i < MAX_ARMQUADS; i++) {
+	for (int i = 0; i < NUM_ARMQUADS; i++) {
 		qr[i].isDirty = false;
 		qr[i].mipsVec = -1;
 		qr[i].sz = V_Invalid;
@ -68,14 +62,6 @@ void ArmRegCacheFPU::SetupInitialRegs() {
 }
 const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
-	// We reserve S0-S1 as scratch. Can afford two registers. Maybe even four, which could simplify some things.
-	static const ARMReg allocationOrder[] = {
-		          S2,  S3,
-		S4,  S5,  S6,  S7,
-		S8,  S9,  S10, S11,
-		S12, S13, S14, S15
-	};
 	// VFP mapping
 	// VFPU registers and regular FP registers are mapped interchangably on top of the standard
 	// 16 FPU registers.
@ -116,12 +102,9 @@ const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
 	if (jo_->useNEONVFPU) {
 		count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARMReg);
 		return allocationOrderNEONVFPU;
-	} else if (cpu_info.bNEON) {
+	} else {
 		count = sizeof(allocationOrderNEON) / sizeof(const ARMReg);
 		return allocationOrderNEON;
-	} else {
-		count = sizeof(allocationOrder) / sizeof(const ARMReg);
-		return allocationOrder;
 	}
 }
@ -404,19 +387,12 @@ void ArmRegCacheFPU::FlushR(MIPSReg r) {
 	mr[r].reg = (int)INVALID_REG;
 }
-int ArmRegCacheFPU::GetNumARMFPURegs() {
-	if (cpu_info.bNEON)
-		return 32;
-	else
-		return 16;
-}
 // Scalar only. Need a similar one for sequential Q vectors.
-int ArmRegCacheFPU::FlushGetSequential(int a, int maxArmReg) {
+int ArmRegCacheFPU::FlushGetSequential(int a) {
 	int c = 1;
 	int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg);
 	a++;
-	while (a < maxArmReg) {
+	while (a < 32) {
 		if (!ar[a].isDirty || ar[a].mipsReg == -1)
 			break;
 		int mipsOffset = GetMipsRegOffset(ar[a].mipsReg);
@ -444,7 +420,7 @@ void ArmRegCacheFPU::FlushAll() {
 	// Flush quads!
 	// These could also use sequential detection.
-	for (int i = 4; i < MAX_ARMQUADS; i++) {
+	for (int i = 4; i < NUM_ARMQUADS; i++) {
 		QFlush(i);
 	}
@ -466,7 +442,7 @@ void ArmRegCacheFPU::FlushAll() {
 				continue;
 			}
-			int c = FlushGetSequential(a, GetNumARMFPURegs());
+			int c = FlushGetSequential(a);
 			if (c == 1) {
 				// INFO_LOG(JIT, "Got single register: %i (%i)", a, m);
 				emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m));
@ -502,7 +478,7 @@ void ArmRegCacheFPU::FlushAll() {
 	}
 	// Sanity check
-	for (int i = 0; i < numARMFpuReg_; i++) {
+	for (int i = 0; i < NUM_ARMFPUREG; i++) {
 		if (ar[i].mipsReg != -1) {
 			ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg);
 		}
@ -594,7 +570,7 @@ void ArmRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {
 	for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
 		DiscardR(i);
 	}
-	for (int i = 0; i < MAX_ARMQUADS; i++) {
+	for (int i = 0; i < NUM_ARMQUADS; i++) {
 		qr[i].spillLock = false;
 		if (qr[i].isTemp) {
 			qr[i].isTemp = false;
--- a/Core/MIPS/ARM/ArmRegCacheFPU.h
+++ b/Core/MIPS/ARM/ArmRegCacheFPU.h
@ -127,7 +127,7 @@ public:
 	// VFPU registers as single VFP registers.
 	ArmGen::ARMReg V(int vreg) { return R(vreg + 32); }
-	int FlushGetSequential(int a, int maxArmReg);
+	int FlushGetSequential(int a);
 	void FlushAll();
 	// This one is allowed at any point.
@ -180,7 +180,6 @@ private:
 	}
 	// This one WILL get a free quad as long as you haven't spill-locked them all.
 	int QGetFreeQuad(int start, int count, const char *reason);
-	int GetNumARMFPURegs();
 	void SetupInitialRegs();
@ -189,24 +188,23 @@ private:
 	MIPSComp::JitState *js_;
 	MIPSComp::JitOptions *jo_;
-	int numARMFpuReg_;
 	int qTime_;
 	enum {
 		// With NEON, we have 64 S = 32 D = 16 Q registers. Only the first 32 S registers
 		// are individually mappable though.
-		MAX_ARMFPUREG = 32,
+		NUM_ARMFPUREG = 32,
-		MAX_ARMQUADS = 16,
+		NUM_ARMQUADS = 16,
 		NUM_MIPSFPUREG = ArmJitConstants::TOTAL_MAPPABLE_MIPSFPUREGS,
 	};
-	FPURegARM ar[MAX_ARMFPUREG];
+	FPURegARM ar[NUM_ARMFPUREG];
 	FPURegMIPS mr[NUM_MIPSFPUREG];
-	FPURegQuad qr[MAX_ARMQUADS];
+	FPURegQuad qr[NUM_ARMQUADS];
 	FPURegMIPS *vr;
 	bool pendingFlush;
 	bool initialReady = false;
-	FPURegARM arInitial[MAX_ARMFPUREG];
+	FPURegARM arInitial[NUM_ARMFPUREG];
 	FPURegMIPS mrInitial[NUM_MIPSFPUREG];
 };
--- a/Core/MIPS/JitCommon/JitState.cpp
+++ b/Core/MIPS/JitCommon/JitState.cpp
@ -38,7 +38,7 @@ namespace MIPSComp {
 		// ARM only
 		downcountInRegister = true;
 		useNEONVFPU = false;  // true
-		if (!cpu_info.bNEON || Disabled(JitDisable::SIMD))
+		if (Disabled(JitDisable::SIMD))
 			useNEONVFPU = false;
 		//ARM64
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@ -901,9 +901,7 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
 #ifdef _M_SSE
 		return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
 #elif PPSSPP_ARCH(ARM_NEON)
-		if (cpu_info.bNEON) {
+		return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
-			return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
-		}
 #endif
 	}
@ -931,9 +929,7 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
 #ifdef _M_SSE
 		return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
 #elif PPSSPP_ARCH(ARM_NEON)
-		if (cpu_info.bNEON) {
+		return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
-			return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
-		}
 #endif
 	}
@ -964,9 +960,7 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
 #ifdef _M_SSE
 		return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
 #elif PPSSPP_ARCH(ARM_NEON)
-		if (cpu_info.bNEON) {
+		return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
-			return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
-		}
 #endif
 	}
@ -996,9 +990,7 @@ CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w
 #ifdef _M_SSE
 		return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
 #elif PPSSPP_ARCH(ARM_NEON)
-		if (cpu_info.bNEON) {
+		return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
-			return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
-		}
 #endif
 	}
@ -1029,9 +1021,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
 #ifdef _M_SSE
 		return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
 #elif PPSSPP_ARCH(ARM_NEON)
-		if (cpu_info.bNEON) {
+		return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
-			return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
-		}
 #endif
 	}
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
--- a/UI/NativeApp.cpp
+++ b/UI/NativeApp.cpp
@ -464,7 +464,7 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch
 	ShaderTranslationInit();
-	InitFastMath(cpu_info.bNEON);
+	InitFastMath();
 	g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);
 	g_Discord.SetPresenceMenu();