mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
ARM32: Remove a lot of non-NEON fallback paths
This commit is contained in:
parent
19261c6c49
commit
584e94f01e
10 changed files with 339 additions and 910 deletions
|
@ -1,14 +1,12 @@
|
||||||
#include "ppsspp_config.h"
|
#include "ppsspp_config.h"
|
||||||
|
|
||||||
#include "fast_math.h"
|
#include "fast_math.h"
|
||||||
#include "fast_matrix.h"
|
#include "fast_matrix.h"
|
||||||
|
|
||||||
void InitFastMath(int enableNEON) {
|
void InitFastMath() {
|
||||||
// Every architecture has its own define. This needs to be added to.
|
|
||||||
if (enableNEON) {
|
|
||||||
#ifndef _MSC_VER
|
#ifndef _MSC_VER
|
||||||
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
|
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
|
||||||
fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon;
|
fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,8 +14,8 @@ extern "C" {
|
||||||
|
|
||||||
// See fast_matrix.h for the first set of functions.
|
// See fast_matrix.h for the first set of functions.
|
||||||
|
|
||||||
void InitFastMath(int enableNEON);
|
void InitFastMath();
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -138,9 +138,7 @@ void ArmJit::GenerateFixedCode() {
|
||||||
// consumed by CALL.
|
// consumed by CALL.
|
||||||
SUB(R_SP, R_SP, 4);
|
SUB(R_SP, R_SP, 4);
|
||||||
// Now we are correctly aligned and plan to stay that way.
|
// Now we are correctly aligned and plan to stay that way.
|
||||||
if (cpu_info.bNEON) {
|
VPUSH(D8, 8);
|
||||||
VPUSH(D8, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fixed registers, these are always kept when in Jit context.
|
// Fixed registers, these are always kept when in Jit context.
|
||||||
// R8 is used to hold flags during delay slots. Not always needed.
|
// R8 is used to hold flags during delay slots. Not always needed.
|
||||||
|
@ -244,10 +242,7 @@ void ArmJit::GenerateFixedCode() {
|
||||||
SaveDowncount();
|
SaveDowncount();
|
||||||
RestoreRoundingMode(true);
|
RestoreRoundingMode(true);
|
||||||
|
|
||||||
// Doing this above the downcount for better pipelining (slightly.)
|
VPOP(D8, 8);
|
||||||
if (cpu_info.bNEON) {
|
|
||||||
VPOP(D8, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
ADD(R_SP, R_SP, 4);
|
ADD(R_SP, R_SP, 4);
|
||||||
|
|
||||||
|
|
|
@ -1132,10 +1132,6 @@ namespace MIPSComp
|
||||||
DISABLE;
|
DISABLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!cpu_info.bNEON) {
|
|
||||||
DISABLE;
|
|
||||||
}
|
|
||||||
|
|
||||||
// This multi-VCVT.F32.F16 is only available in the VFPv4 extension.
|
// This multi-VCVT.F32.F16 is only available in the VFPv4 extension.
|
||||||
// The VFPv3 one is VCVTB, VCVTT which we don't yet have support for.
|
// The VFPv3 one is VCVTB, VCVTT which we don't yet have support for.
|
||||||
if (!(cpu_info.bHalf && cpu_info.bVFPv4)) {
|
if (!(cpu_info.bHalf && cpu_info.bVFPv4)) {
|
||||||
|
@ -1599,10 +1595,6 @@ namespace MIPSComp
|
||||||
DISABLE;
|
DISABLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!cpu_info.bNEON) {
|
|
||||||
DISABLE;
|
|
||||||
}
|
|
||||||
|
|
||||||
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
|
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
|
||||||
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)
|
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)
|
||||||
|
|
||||||
|
|
|
@ -27,13 +27,7 @@
|
||||||
using namespace ArmGen;
|
using namespace ArmGen;
|
||||||
using namespace ArmJitConstants;
|
using namespace ArmJitConstants;
|
||||||
|
|
||||||
ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {
|
ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo) : mips_(mipsState), js_(js), jo_(jo), vr(mr + 32) {}
|
||||||
if (cpu_info.bNEON) {
|
|
||||||
numARMFpuReg_ = 32;
|
|
||||||
} else {
|
|
||||||
numARMFpuReg_ = 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
|
void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
|
||||||
if (!initialReady) {
|
if (!initialReady) {
|
||||||
|
@ -47,7 +41,7 @@ void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ArmRegCacheFPU::SetupInitialRegs() {
|
void ArmRegCacheFPU::SetupInitialRegs() {
|
||||||
for (int i = 0; i < numARMFpuReg_; i++) {
|
for (int i = 0; i < NUM_ARMFPUREG; i++) {
|
||||||
arInitial[i].mipsReg = -1;
|
arInitial[i].mipsReg = -1;
|
||||||
arInitial[i].isDirty = false;
|
arInitial[i].isDirty = false;
|
||||||
}
|
}
|
||||||
|
@ -57,7 +51,7 @@ void ArmRegCacheFPU::SetupInitialRegs() {
|
||||||
mrInitial[i].spillLock = false;
|
mrInitial[i].spillLock = false;
|
||||||
mrInitial[i].tempLock = false;
|
mrInitial[i].tempLock = false;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < MAX_ARMQUADS; i++) {
|
for (int i = 0; i < NUM_ARMQUADS; i++) {
|
||||||
qr[i].isDirty = false;
|
qr[i].isDirty = false;
|
||||||
qr[i].mipsVec = -1;
|
qr[i].mipsVec = -1;
|
||||||
qr[i].sz = V_Invalid;
|
qr[i].sz = V_Invalid;
|
||||||
|
@ -68,14 +62,6 @@ void ArmRegCacheFPU::SetupInitialRegs() {
|
||||||
}
|
}
|
||||||
|
|
||||||
const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
|
const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
|
||||||
// We reserve S0-S1 as scratch. Can afford two registers. Maybe even four, which could simplify some things.
|
|
||||||
static const ARMReg allocationOrder[] = {
|
|
||||||
S2, S3,
|
|
||||||
S4, S5, S6, S7,
|
|
||||||
S8, S9, S10, S11,
|
|
||||||
S12, S13, S14, S15
|
|
||||||
};
|
|
||||||
|
|
||||||
// VFP mapping
|
// VFP mapping
|
||||||
// VFPU registers and regular FP registers are mapped interchangably on top of the standard
|
// VFPU registers and regular FP registers are mapped interchangably on top of the standard
|
||||||
// 16 FPU registers.
|
// 16 FPU registers.
|
||||||
|
@ -116,12 +102,9 @@ const ARMReg *ArmRegCacheFPU::GetMIPSAllocationOrder(int &count) {
|
||||||
if (jo_->useNEONVFPU) {
|
if (jo_->useNEONVFPU) {
|
||||||
count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARMReg);
|
count = sizeof(allocationOrderNEONVFPU) / sizeof(const ARMReg);
|
||||||
return allocationOrderNEONVFPU;
|
return allocationOrderNEONVFPU;
|
||||||
} else if (cpu_info.bNEON) {
|
} else {
|
||||||
count = sizeof(allocationOrderNEON) / sizeof(const ARMReg);
|
count = sizeof(allocationOrderNEON) / sizeof(const ARMReg);
|
||||||
return allocationOrderNEON;
|
return allocationOrderNEON;
|
||||||
} else {
|
|
||||||
count = sizeof(allocationOrder) / sizeof(const ARMReg);
|
|
||||||
return allocationOrder;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -404,19 +387,12 @@ void ArmRegCacheFPU::FlushR(MIPSReg r) {
|
||||||
mr[r].reg = (int)INVALID_REG;
|
mr[r].reg = (int)INVALID_REG;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ArmRegCacheFPU::GetNumARMFPURegs() {
|
|
||||||
if (cpu_info.bNEON)
|
|
||||||
return 32;
|
|
||||||
else
|
|
||||||
return 16;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Scalar only. Need a similar one for sequential Q vectors.
|
// Scalar only. Need a similar one for sequential Q vectors.
|
||||||
int ArmRegCacheFPU::FlushGetSequential(int a, int maxArmReg) {
|
int ArmRegCacheFPU::FlushGetSequential(int a) {
|
||||||
int c = 1;
|
int c = 1;
|
||||||
int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg);
|
int lastMipsOffset = GetMipsRegOffset(ar[a].mipsReg);
|
||||||
a++;
|
a++;
|
||||||
while (a < maxArmReg) {
|
while (a < 32) {
|
||||||
if (!ar[a].isDirty || ar[a].mipsReg == -1)
|
if (!ar[a].isDirty || ar[a].mipsReg == -1)
|
||||||
break;
|
break;
|
||||||
int mipsOffset = GetMipsRegOffset(ar[a].mipsReg);
|
int mipsOffset = GetMipsRegOffset(ar[a].mipsReg);
|
||||||
|
@ -444,7 +420,7 @@ void ArmRegCacheFPU::FlushAll() {
|
||||||
|
|
||||||
// Flush quads!
|
// Flush quads!
|
||||||
// These could also use sequential detection.
|
// These could also use sequential detection.
|
||||||
for (int i = 4; i < MAX_ARMQUADS; i++) {
|
for (int i = 4; i < NUM_ARMQUADS; i++) {
|
||||||
QFlush(i);
|
QFlush(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -466,7 +442,7 @@ void ArmRegCacheFPU::FlushAll() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int c = FlushGetSequential(a, GetNumARMFPURegs());
|
int c = FlushGetSequential(a);
|
||||||
if (c == 1) {
|
if (c == 1) {
|
||||||
// INFO_LOG(JIT, "Got single register: %i (%i)", a, m);
|
// INFO_LOG(JIT, "Got single register: %i (%i)", a, m);
|
||||||
emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m));
|
emit_->VSTR((ARMReg)(a + S0), CTXREG, GetMipsRegOffset(m));
|
||||||
|
@ -502,7 +478,7 @@ void ArmRegCacheFPU::FlushAll() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sanity check
|
// Sanity check
|
||||||
for (int i = 0; i < numARMFpuReg_; i++) {
|
for (int i = 0; i < NUM_ARMFPUREG; i++) {
|
||||||
if (ar[i].mipsReg != -1) {
|
if (ar[i].mipsReg != -1) {
|
||||||
ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg);
|
ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg);
|
||||||
}
|
}
|
||||||
|
@ -594,7 +570,7 @@ void ArmRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {
|
||||||
for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
|
for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
|
||||||
DiscardR(i);
|
DiscardR(i);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < MAX_ARMQUADS; i++) {
|
for (int i = 0; i < NUM_ARMQUADS; i++) {
|
||||||
qr[i].spillLock = false;
|
qr[i].spillLock = false;
|
||||||
if (qr[i].isTemp) {
|
if (qr[i].isTemp) {
|
||||||
qr[i].isTemp = false;
|
qr[i].isTemp = false;
|
||||||
|
|
|
@ -127,7 +127,7 @@ public:
|
||||||
// VFPU registers as single VFP registers.
|
// VFPU registers as single VFP registers.
|
||||||
ArmGen::ARMReg V(int vreg) { return R(vreg + 32); }
|
ArmGen::ARMReg V(int vreg) { return R(vreg + 32); }
|
||||||
|
|
||||||
int FlushGetSequential(int a, int maxArmReg);
|
int FlushGetSequential(int a);
|
||||||
void FlushAll();
|
void FlushAll();
|
||||||
|
|
||||||
// This one is allowed at any point.
|
// This one is allowed at any point.
|
||||||
|
@ -180,7 +180,6 @@ private:
|
||||||
}
|
}
|
||||||
// This one WILL get a free quad as long as you haven't spill-locked them all.
|
// This one WILL get a free quad as long as you haven't spill-locked them all.
|
||||||
int QGetFreeQuad(int start, int count, const char *reason);
|
int QGetFreeQuad(int start, int count, const char *reason);
|
||||||
int GetNumARMFPURegs();
|
|
||||||
|
|
||||||
void SetupInitialRegs();
|
void SetupInitialRegs();
|
||||||
|
|
||||||
|
@ -189,24 +188,23 @@ private:
|
||||||
MIPSComp::JitState *js_;
|
MIPSComp::JitState *js_;
|
||||||
MIPSComp::JitOptions *jo_;
|
MIPSComp::JitOptions *jo_;
|
||||||
|
|
||||||
int numARMFpuReg_;
|
|
||||||
int qTime_;
|
int qTime_;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
// With NEON, we have 64 S = 32 D = 16 Q registers. Only the first 32 S registers
|
// With NEON, we have 64 S = 32 D = 16 Q registers. Only the first 32 S registers
|
||||||
// are individually mappable though.
|
// are individually mappable though.
|
||||||
MAX_ARMFPUREG = 32,
|
NUM_ARMFPUREG = 32,
|
||||||
MAX_ARMQUADS = 16,
|
NUM_ARMQUADS = 16,
|
||||||
NUM_MIPSFPUREG = ArmJitConstants::TOTAL_MAPPABLE_MIPSFPUREGS,
|
NUM_MIPSFPUREG = ArmJitConstants::TOTAL_MAPPABLE_MIPSFPUREGS,
|
||||||
};
|
};
|
||||||
|
|
||||||
FPURegARM ar[MAX_ARMFPUREG];
|
FPURegARM ar[NUM_ARMFPUREG];
|
||||||
FPURegMIPS mr[NUM_MIPSFPUREG];
|
FPURegMIPS mr[NUM_MIPSFPUREG];
|
||||||
FPURegQuad qr[MAX_ARMQUADS];
|
FPURegQuad qr[NUM_ARMQUADS];
|
||||||
FPURegMIPS *vr;
|
FPURegMIPS *vr;
|
||||||
|
|
||||||
bool pendingFlush;
|
bool pendingFlush;
|
||||||
bool initialReady = false;
|
bool initialReady = false;
|
||||||
FPURegARM arInitial[MAX_ARMFPUREG];
|
FPURegARM arInitial[NUM_ARMFPUREG];
|
||||||
FPURegMIPS mrInitial[NUM_MIPSFPUREG];
|
FPURegMIPS mrInitial[NUM_MIPSFPUREG];
|
||||||
};
|
};
|
||||||
|
|
|
@ -38,7 +38,7 @@ namespace MIPSComp {
|
||||||
// ARM only
|
// ARM only
|
||||||
downcountInRegister = true;
|
downcountInRegister = true;
|
||||||
useNEONVFPU = false; // true
|
useNEONVFPU = false; // true
|
||||||
if (!cpu_info.bNEON || Disabled(JitDisable::SIMD))
|
if (Disabled(JitDisable::SIMD))
|
||||||
useNEONVFPU = false;
|
useNEONVFPU = false;
|
||||||
|
|
||||||
//ARM64
|
//ARM64
|
||||||
|
|
|
@ -901,9 +901,7 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
|
||||||
#ifdef _M_SSE
|
#ifdef _M_SSE
|
||||||
return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
|
return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
|
||||||
#elif PPSSPP_ARCH(ARM_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
if (cpu_info.bNEON) {
|
return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
|
||||||
return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -931,9 +929,7 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
|
||||||
#ifdef _M_SSE
|
#ifdef _M_SSE
|
||||||
return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
|
return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
|
||||||
#elif PPSSPP_ARCH(ARM_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
if (cpu_info.bNEON) {
|
return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
|
||||||
return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -964,9 +960,7 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
|
||||||
#ifdef _M_SSE
|
#ifdef _M_SSE
|
||||||
return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
|
return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
|
||||||
#elif PPSSPP_ARCH(ARM_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
if (cpu_info.bNEON) {
|
return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
|
||||||
return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -996,9 +990,7 @@ CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w
|
||||||
#ifdef _M_SSE
|
#ifdef _M_SSE
|
||||||
return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
|
return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
|
||||||
#elif PPSSPP_ARCH(ARM_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
if (cpu_info.bNEON) {
|
return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
|
||||||
return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1029,9 +1021,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
|
||||||
#ifdef _M_SSE
|
#ifdef _M_SSE
|
||||||
return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
|
return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
|
||||||
#elif PPSSPP_ARCH(ARM_NEON)
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
if (cpu_info.bNEON) {
|
return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
|
||||||
return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -464,7 +464,7 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch
|
||||||
|
|
||||||
ShaderTranslationInit();
|
ShaderTranslationInit();
|
||||||
|
|
||||||
InitFastMath(cpu_info.bNEON);
|
InitFastMath();
|
||||||
g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);
|
g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);
|
||||||
|
|
||||||
g_Discord.SetPresenceMenu();
|
g_Discord.SetPresenceMenu();
|
||||||
|
|
Loading…
Add table
Reference in a new issue