VFPU: Some micro-optimizations. Don't fall back to the interpreter path for vexp2/vlog2/vrexp2.

This commit is contained in:
Henrik Rydgård 2023-06-04 10:09:20 +02:00
parent 27b8d27efc
commit 9db9fec898
6 changed files with 65 additions and 59 deletions

View file

@ -83,7 +83,7 @@ JitBlockCache::~JitBlockCache() {
Shutdown();
}
bool JitBlock::ContainsAddress(u32 em_address) {
bool JitBlock::ContainsAddress(u32 em_address) const {
// WARNING - THIS DOES NOT WORK WITH JIT INLINING ENABLED.
// However, that doesn't exist yet so meh.
return (em_address >= originalAddress && em_address < originalAddress + 4 * originalSize);

View file

@ -59,7 +59,7 @@ enum class DestroyType {
// We should be careful not to access these block structures during runtime as they are large.
// Fine to mess with them at block compile time though.
struct JitBlock {
bool ContainsAddress(u32 em_address);
bool ContainsAddress(u32 em_address) const;
const u8 *checkedEntry; // const, we have to translate to writable.
const u8 *normalEntry;

View file

@ -165,68 +165,58 @@ void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]) {
}
// Copies the elements of VFPU vector register `reg` into rd[0..length-1],
// where length is 1-4 depending on `size`.
//
// Fields of `reg` as decoded below:
//   bits 0-1 : column
//   bits 2-4 : matrix
//   bit  5   : transpose flag
//   bit  6   : starting row (0 or 2 for pairs/quads, 0 or 1 for triples)
//
// NOTE: this listing is a diff view, so every changed statement appears twice:
// the pre-change line first, immediately followed by its replacement.
void ReadVector(float *rd, VectorSize size, int reg) {
int row = 0;
int length = 0;
int row;
int length;
switch (size) {
// Singles are read straight through V(reg); none of the decoding below applies.
case V_Single: rd[0] = V(reg); return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
case V_Pair: row=(reg>>5)&2; length = 2; break;
case V_Triple: row=(reg>>6)&1; length = 3; break;
case V_Quad: row=(reg>>5)&2; length = 4; break;
// Unknown size: the older line asserts; its replacement sets length to 0 so the
// copy loops below do not execute (and the unset row is never read).
default: _assert_msg_(false, "%s: Bad vector size", __FUNCTION__);
default: length = 0; break;
}
int transpose = (reg>>5) & 1;
const int mtx = (reg >> 2) & 7;
int transpose = (reg >> 5) & 1;
// Masking the matrix bits in place (reg & 0x1C) yields ((reg >> 2) & 7) * 4,
// which is why the replacement `base` lines below drop the "* 4".
const int mtx = reg & (7 << 2);
const int col = reg & 3;
if (transpose) {
// Transposed: elements sit at consecutive V() indices; (row+i)&3 wraps within 0-3.
const int base = mtx * 4 + col * 32;
const int base = mtx + col * 32;
for (int i = 0; i < length; i++)
rd[i] = V(base + ((row+i)&3));
} else {
// Not transposed: successive elements are 32 V() indices apart.
const int base = mtx * 4 + col;
const int base = mtx + col;
for (int i = 0; i < length; i++)
rd[i] = V(base + ((row+i)&3)*32);
}
}
void WriteVector(const float *rd, VectorSize size, int reg) {
if (size == V_Single) {
// Optimize the common case.
if (!currentMIPS->VfpuWriteMask(0)) {
V(reg) = rd[0];
}
return;
}
const int mtx = (reg>>2)&7;
const int col = reg & 3;
int transpose = (reg>>5)&1;
int row = 0;
int length = 0;
int row;
int length;
switch (size) {
case V_Single: _dbg_assert_(false); return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
case V_Single: if (!currentMIPS->VfpuWriteMask(0)) V(reg) = rd[0]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
case V_Pair: row=(reg>>5)&2; length = 2; break;
case V_Triple: row=(reg>>6)&1; length = 3; break;
case V_Quad: row=(reg>>5)&2; length = 4; break;
default: _assert_msg_(false, "%s: Bad vector size", __FUNCTION__);
default: length = 0; break;
}
const int mtx = reg & (7 << 2);
const int col = reg & 3;
bool transpose = (reg >> 5) & 1;
if (currentMIPS->VfpuWriteMask() == 0) {
if (transpose) {
const int base = mtx * 4 + col * 32;
const int base = mtx + col * 32;
for (int i = 0; i < length; i++)
V(base + ((row+i)&3)) = rd[i];
} else {
const int base = mtx * 4 + col;
const int base = mtx + col;
for (int i = 0; i < length; i++)
V(base + ((row+i)&3)*32) = rd[i];
}
} else {
for (int i = 0; i < length; i++) {
if (!currentMIPS->VfpuWriteMask(i)) {
int index = mtx * 4;
int index = mtx;
if (transpose)
index += ((row+i)&3) + col*32;
else
@ -243,9 +233,6 @@ u32 VFPURewritePrefix(int ctrl, u32 remove, u32 add) {
}
void ReadMatrix(float *rd, MatrixSize size, int reg) {
int mtx = (reg >> 2) & 7;
int col = reg & 3;
int row = 0;
int side = 0;
int transpose = (reg >> 5) & 1;
@ -255,9 +242,12 @@ void ReadMatrix(float *rd, MatrixSize size, int reg) {
case M_2x2: row = (reg >> 5) & 2; side = 2; break;
case M_3x3: row = (reg >> 6) & 1; side = 3; break;
case M_4x4: row = (reg >> 5) & 2; side = 4; break;
default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
default: side = 0; break;
}
int mtx = (reg >> 2) & 7;
int col = reg & 3;
// The voffset ordering is now integrated in these formulas,
// eliminating a table lookup.
const float *v = currentMIPS->v + (size_t)mtx * 16;
@ -296,8 +286,8 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
int mtx = (reg>>2)&7;
int col = reg&3;
int row = 0;
int side = 0;
int row;
int side;
int transpose = (reg >> 5) & 1;
switch (size) {
@ -305,7 +295,7 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
case M_2x2: row = (reg >> 5) & 2; side = 2; break;
case M_3x3: row = (reg >> 6) & 1; side = 3; break;
case M_4x4: row = (reg >> 5) & 2; side = 4; break;
default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
default: side = 0;
}
if (currentMIPS->VfpuWriteMask() != 0) {
@ -370,16 +360,6 @@ int GetVectorOverlap(int vec1, VectorSize size1, int vec2, VectorSize size2) {
return count;
}
// Returns how many elements a vector of size `sz` holds (1-4), or 0 when `sz`
// is not one of the four known sizes.
int GetNumVectorElements(VectorSize sz) {
	int count = 0;
	switch (sz) {
	case V_Single:
		count = 1;
		break;
	case V_Pair:
		count = 2;
		break;
	case V_Triple:
		count = 3;
		break;
	case V_Quad:
		count = 4;
		break;
	default:
		break;
	}
	return count;
}
VectorSize GetHalfVectorSizeSafe(VectorSize sz) {
switch (sz) {
case V_Pair: return V_Single;

View file

@ -218,7 +218,17 @@ VectorSize GetDoubleVectorSizeSafe(VectorSize sz);
VectorSize GetDoubleVectorSize(VectorSize sz);
VectorSize MatrixVectorSizeSafe(MatrixSize sz);
VectorSize MatrixVectorSize(MatrixSize sz);
int GetNumVectorElements(VectorSize sz);
// Returns how many elements a vector of size `sz` holds (1-4), or 0 when `sz`
// is not one of the four known sizes. Defined inline in the header.
inline int GetNumVectorElements(VectorSize sz) {
	int count = 0;
	switch (sz) {
	case V_Single:
		count = 1;
		break;
	case V_Pair:
		count = 2;
		break;
	case V_Triple:
		count = 3;
		break;
	case V_Quad:
		count = 4;
		break;
	default:
		break;
	}
	return count;
}
int GetMatrixSideSafe(MatrixSize sz);
int GetMatrixSide(MatrixSize sz);
std::string GetVectorNotation(int reg, VectorSize size);

View file

@ -2208,8 +2208,8 @@ void CosOnly(SinCosArg angle, float *output) {
output[1] = vfpu_cos(angle);
}
void ASinScaled(SinCosArg angle, float *output) {
output[0] = vfpu_asin(angle);
// Stores vfpu_asin(sine) in the first slot of `output`. Has the
// void(SinCosArg, float *) shape that the call helper in Jit::Comp_VV2Op takes.
void ASinScaled(SinCosArg sine, float *output) {
	*output = vfpu_asin(sine);
}
void SinCosNegSin(SinCosArg angle, float *output) {
@ -2217,13 +2217,25 @@ void SinCosNegSin(SinCosArg angle, float *output) {
output[0] = -output[0];
}
// Stores vfpu_exp2(arg) in the first slot of `output`. Has the
// void(SinCosArg, float *) shape that the call helper in Jit::Comp_VV2Op takes,
// so the JIT can call it for vexp2.
void Exp2(SinCosArg arg, float *output) {
	*output = vfpu_exp2(arg);
}
// Stores vfpu_log2(arg) in the first slot of `output`. Has the
// void(SinCosArg, float *) shape that the call helper in Jit::Comp_VV2Op takes,
// so the JIT can call it for vlog2.
void Log2(SinCosArg arg, float *output) {
	*output = vfpu_log2(arg);
}
// Stores vfpu_rexp2(arg) in the first slot of `output`. Has the
// void(SinCosArg, float *) shape that the call helper in Jit::Comp_VV2Op takes,
// so the JIT can call it for vrexp2.
void RExp2(SinCosArg arg, float *output) {
	*output = vfpu_rexp2(arg);
}
void Jit::Comp_VV2Op(MIPSOpcode op) {
CONDITIONAL_DISABLE(VFPU_VEC);
if (js.HasUnknownPrefix())
DISABLE;
auto trigCallHelper = [this](void (*sinCosFunc)(SinCosArg, float *output), u8 sreg) {
auto specialFuncCallHelper = [this](void (*specialFunc)(SinCosArg, float *output), u8 sreg) {
#if PPSSPP_ARCH(AMD64)
MOVSS(XMM0, fpr.V(sreg));
// TODO: This reg might be different on Linux...
@ -2232,7 +2244,7 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
#else
LEA(64, RDI, MIPSSTATE_VAR(sincostemp[0]));
#endif
ABI_CallFunction(thunks.ProtectFunction((const void *)sinCosFunc, 0));
ABI_CallFunction(thunks.ProtectFunction((const void *)specialFunc, 0));
#else
// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
if (fpr.V(sreg).IsSimpleReg()) {
@ -2240,7 +2252,7 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
} else {
MOV(32, R(EAX), fpr.V(sreg));
}
CallProtectedFunction((const void *)sinCosFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
CallProtectedFunction((const void *)specialFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
#endif
};
@ -2406,18 +2418,20 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
DIVSS(tempxregs[i], R(XMM0));
break;
case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
trigCallHelper(&SinOnly, sregs[i]);
specialFuncCallHelper(&SinOnly, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
trigCallHelper(&CosOnly, sregs[i]);
specialFuncCallHelper(&CosOnly, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[1]));
break;
case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
DISABLE;
specialFuncCallHelper(&Exp2, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
DISABLE;
specialFuncCallHelper(&Log2, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
SQRTSS(tempxregs[i], fpr.V(sregs[i]));
@ -2425,7 +2439,7 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
ANDPS(tempxregs[i], MatR(TEMPREG));
break;
case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
trigCallHelper(&ASinScaled, sregs[i]);
specialFuncCallHelper(&ASinScaled, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
@ -2436,11 +2450,12 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
MOVSS(tempxregs[i], R(XMM0));
break;
case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
trigCallHelper(&NegSinOnly, sregs[i]);
specialFuncCallHelper(&NegSinOnly, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
DISABLE;
specialFuncCallHelper(&RExp2, sregs[i]);
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
break;
}
}

View file

@ -87,6 +87,7 @@
#define MOUSEEVENTF_FROMTOUCH_NOPEN 0xFF515780 //http://msdn.microsoft.com/en-us/library/windows/desktop/ms703320(v=vs.85).aspx
#define MOUSEEVENTF_MASK_PLUS_PENTOUCH 0xFFFFFF80
// See https://github.com/unknownbrackets/verysleepy/commit/fc1b1b3bd6081fae3566cdb542d896e413238b71
// NOTE(review): presumably this flag is looked up by the Very Sleepy profiler
// (per the linked commit) rather than read by this program - confirm.
int verysleepy__useSendMessage = 1;
// Message ID at offset 0x3117 from WM_APP.
const UINT WM_VERYSLEEPY_MSG = WM_APP + 0x3117;