From a12e448fb49906977c6494e1eef0ad3b5a929175 Mon Sep 17 00:00:00 2001
From: Henrik Rydgard
Date: Wed, 18 Mar 2015 22:44:38 +0100
Subject: [PATCH] ARM64: Stub vertex decoder jit, implementing just enough for
 the cube.elf cube.

---
 Core/Config.cpp                   |   2 +-
 Core/MIPS/ARM/ArmJit.cpp          |   2 +-
 Core/MIPS/ARM64/Arm64Asm.cpp      |   2 -
 Core/MIPS/ARM64/Arm64CompVFPU.cpp |   7 +-
 Core/Util/DisArm64.cpp            |   2 +-
 GPU/Common/VertexDecoderArm.cpp   |   1 -
 GPU/Common/VertexDecoderArm64.cpp | 229 +++++++++++++++++++++++++++++-
 7 files changed, 232 insertions(+), 13 deletions(-)

diff --git a/Core/Config.cpp b/Core/Config.cpp
index b91421a938..d5d6ae8dfa 100644
--- a/Core/Config.cpp
+++ b/Core/Config.cpp
@@ -265,7 +265,7 @@ static int DefaultNumWorkers() {
 static bool DefaultJit() {
 #ifdef IOS
 	return iosCanUseJit;
-#elif defined(ARM) || defined(_M_IX86) || defined(_M_X64)
+#elif defined(ARM) || defined(ARM64) || defined(_M_IX86) || defined(_M_X64)
 	return true;
 #else
 	return false;
diff --git a/Core/MIPS/ARM/ArmJit.cpp b/Core/MIPS/ARM/ArmJit.cpp
index dce95d9720..e0f9fc97e9 100644
--- a/Core/MIPS/ARM/ArmJit.cpp
+++ b/Core/MIPS/ARM/ArmJit.cpp
@@ -600,7 +600,7 @@ void ArmJit::RestoreRoundingMode(bool force) {
 }
 
 void ArmJit::ApplyRoundingMode(bool force) {
-	// NOTE: Must not destory R0.
+	// NOTE: Must not destroy R0.
 	// If the game has never set an interesting rounding mode, we can safely skip this.
 	if (g_Config.bSetRoundingMode && (force || !g_Config.bForceFlushToZero || js.hasSetRounding)) {
 		LDR(SCRATCHREG2, CTXREG, offsetof(MIPSState, fcr31));
diff --git a/Core/MIPS/ARM64/Arm64Asm.cpp b/Core/MIPS/ARM64/Arm64Asm.cpp
index 01394b177f..2d9af0ec03 100644
--- a/Core/MIPS/ARM64/Arm64Asm.cpp
+++ b/Core/MIPS/ARM64/Arm64Asm.cpp
@@ -205,12 +205,10 @@ void Arm64Jit::GenerateFixedCode() {
 	FlushIcache();
 
 	if (false) {
-		INFO_LOG(JIT, "THE DISASM : %p ========================", enterCode);
 		std::vector<std::string> lines = DisassembleArm64(enterCode, GetCodePtr() - enterCode);
 		for (auto s : lines) {
 			INFO_LOG(JIT, "%s", s.c_str());
 		}
-		INFO_LOG(JIT, "END OF THE DISASM : %p ========================", GetCodePtr());
 	}
 }
 
diff --git a/Core/MIPS/ARM64/Arm64CompVFPU.cpp b/Core/MIPS/ARM64/Arm64CompVFPU.cpp
index 4ab8953240..843cd95a7c 100644
--- a/Core/MIPS/ARM64/Arm64CompVFPU.cpp
+++ b/Core/MIPS/ARM64/Arm64CompVFPU.cpp
@@ -86,8 +86,7 @@ namespace MIPSComp
 		DISABLE;
 	}
 
-	void Arm64Jit::Comp_VVectorInit(MIPSOpcode op)
-	{
+	void Arm64Jit::Comp_VVectorInit(MIPSOpcode op) {
 		DISABLE;
 	}
 
@@ -95,8 +94,7 @@ namespace MIPSComp
 		DISABLE;
 	}
 
-	void Arm64Jit::Comp_VMatrixInit(MIPSOpcode op)
-	{
+	void Arm64Jit::Comp_VMatrixInit(MIPSOpcode op) {
 		DISABLE;
 	}
 
@@ -208,7 +206,6 @@ namespace MIPSComp
 
 	// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
 	// calling the math library.
-	// Apparently this may not work on hardfp. I don't think we have any platforms using this though.
 	void Arm64Jit::Comp_VRot(MIPSOpcode op) {
 		DISABLE;
 	}
diff --git a/Core/Util/DisArm64.cpp b/Core/Util/DisArm64.cpp
index 9838b3871a..6809701ad1 100644
--- a/Core/Util/DisArm64.cpp
+++ b/Core/Util/DisArm64.cpp
@@ -185,7 +185,7 @@ static void BranchExceptionAndSystem(uint32_t w, uint64_t addr, Instruction *ins
 		const char *opname[2] = { "cbz", "cbnz" };
 		char r = ((w >> 31) & 1) ? 'x' : 'w';
 		int offset = SignExtend19(w >> 5);
-		snprintf(instr->text, sizeof(instr->text), "%s %c%d", op, r, Rt);
+		snprintf(instr->text, sizeof(instr->text), "%s %c%d", opname[op], r, Rt);
 	} else if (((w >> 25) & 0x3F) == 0x1B) {
 		// Test and branch
 		snprintf(instr->text, sizeof(instr->text), "(test & branch %08x)", w);
diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index f41d0b2d0d..261c500b45 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -1174,7 +1174,6 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
 	_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
 	_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
 
-	// TODO: SIMD
 	LDRSH(tempReg1, srcReg, dec_->posoff);
 	LDRSH(tempReg2, srcReg, dec_->posoff + 2);
 	LDRH(tempReg3, srcReg, dec_->posoff + 4);
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index ee47742a85..438369b6bf 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -20,14 +20,239 @@
 #include "Core/Config.h"
 #include "Core/Reporting.h"
 #include "Common/Arm64Emitter.h"
+#include "Core/MIPS/JitCommon/JitCommon.h"
 #include "GPU/GPUState.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
+static const float by128 = 1.0f / 128.0f;
+static const float by16384 = 1.0f / 16384.0f;
+static const float by32768 = 1.0f / 32768.0f;
+
+using namespace Arm64Gen;
+
+// Pointers, X regs
+static const ARM64Reg srcReg = X0;
+static const ARM64Reg dstReg = X1;
+
+static const ARM64Reg counterReg = W2;
+static const ARM64Reg tempReg1 = W3;
+static const ARM64Reg tempReg2 = W4;
+static const ARM64Reg tempReg3 = W5;
+static const ARM64Reg scratchReg = W6;
+static const ARM64Reg scratchReg2 = W7;
+static const ARM64Reg scratchReg3 = W8;
+static const ARM64Reg fullAlphaReg = W12;
+
+static const ARM64Reg fpScratchReg = S4;
+static const ARM64Reg fpScratchReg2 = S5;
+static const ARM64Reg fpScratchReg3 = S6;
+static const ARM64Reg fpScratchReg4 = S7;
+static const ARM64Reg fpUVscaleReg = D0;
+static const ARM64Reg fpUVoffsetReg = D1;
+
+static const ARM64Reg neonScratchReg = D2;
+static const ARM64Reg neonScratchReg2 = D3;
+
+static const ARM64Reg neonScratchRegQ = Q1;  // Overlaps with all the scratch regs
+
+// Everything above S6 is fair game for skinning
+
+// S8-S15 are used during matrix generation
+
+// These only live through the matrix multiplication
+static const ARM64Reg src[3] = { S8, S9, S10 };  // skin source
+static const ARM64Reg acc[3] = { S11, S12, S13 };  // skin accumulator
+
+static const ARM64Reg srcNEON = Q2;
+static const ARM64Reg accNEON = Q3;
+
+static const JitLookup jitLookup[] = {
+	/*
+	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
+	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
+	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
+
+	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
+	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
+	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
+	*/
+	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
+	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
+	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
+	/*
+	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
+
+	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
+	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
+	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
+
+	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
+	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
+	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
+
+	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
+	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
+	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
+
+	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
+	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
+	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
+	*/
+	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
+	/*
+	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
+	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
+	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},
+
+	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
+	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
+	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
+	*/
+	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
+	/*
+	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
+	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
+	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
+
+	{&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
+	{&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
+	{&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},
+
+	{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
+	{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
+	{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},
+
+	{&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
+	{&VertexDecoder::Step_Color4444Morph, &VertexDecoderJitCache::Jit_Color4444Morph},
+	{&VertexDecoder::Step_Color565Morph, &VertexDecoderJitCache::Jit_Color565Morph},
+	{&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
+	*/
+};
+
 JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
-	// TODO ARM64
-	return NULL;
+	dec_ = &dec;
+	const u8 *start = AlignCode16();
+
+	WARN_LOG(HLE, "VertexDecoderJitCache::Compile");
+
+	bool prescaleStep = false;
+	bool skinning = false;
+
+	// Look for prescaled texcoord steps
+	for (int i = 0; i < dec.numSteps_; i++) {
+		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
+			prescaleStep = true;
+		}
+		if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
+			skinning = true;
+		}
+	}
+
+	if (dec.weighttype && g_Config.bSoftwareSkinning && dec.morphcount == 1) {
+		WARN_LOG(HLE, "vtxdec-arm64 does not support sw skinning");
+		return NULL;
+	}
+
+	if (dec.col) {
+		// Or LDB and skip the conditional? This is probably cheaper.
+		MOVI2R(fullAlphaReg, 0xFF);
+	}
+
+	const u8 *loopStart = GetCodePtr();
+	for (int i = 0; i < dec.numSteps_; i++) {
+		if (!CompileStep(dec, i)) {
+			// Reset the code ptr (effectively undoing what we generated) and return zero to indicate that we failed.
+			SetCodePtr(const_cast<u8 *>(start));
+			char temp[1024] = {0};
+			dec.ToString(temp);
+			WARN_LOG(HLE, "Could not compile vertex decoder, failed at step %d: %s", i, temp);
+			return 0;
+		}
+	}
+
+	ADDI2R(srcReg, srcReg, dec.VertexSize(), scratchReg);
+	ADDI2R(dstReg, dstReg, dec.decFmt.stride, scratchReg);
+	SUBS(counterReg, counterReg, 1);
+	B(CC_NEQ, loopStart);
+
+	if (dec.col) {
+		MOVP2R(tempReg1, &gstate_c.vertexFullAlpha);
+		CMP(fullAlphaReg, 0);
+		FixupBranch skip = B(CC_NEQ);
+		STRB(INDEX_UNSIGNED, fullAlphaReg, tempReg1, 0);
+		SetJumpTarget(skip);
+	}
+
+	// POP(6, R4, R5, R6, R7, R8, R_PC);
+	RET();
+
+	FlushIcache();
+
+	char temp[1024] = { 0 };
+	dec.ToString(temp);
+	INFO_LOG(HLE, "=== %s (%d bytes) ===", temp, (int)(GetCodePtr() - start));
+	std::vector<std::string> lines = DisassembleArm64(start, GetCodePtr() - start);
+	for (auto line : lines) {
+		INFO_LOG(HLE, "%s", line.c_str());
+	}
+	INFO_LOG(HLE, "==========");
+
+	return (JittedVertexDecoder)start;
 }
 
 bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
+	// See if we find a matching JIT function
+	for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
+		if (dec.steps_[step] == jitLookup[i].func) {
+			((*this).*jitLookup[i].jitFunc)();
+			return true;
+		}
+	}
 	return false;
 }
+
+void VertexDecoderJitCache::Jit_Color8888() {
+	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);
+	// TODO: Set flags to determine if alpha != 0xFF.
+	// ANDSI2R(tempReg2, tempReg1, 0xFF000000);
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
+	// FixupBranch skip = B(CC_NZ);
+	MOVI2R(fullAlphaReg, 0);
+	// SetJumpTarget(skip);
+}
+
+void VertexDecoderJitCache::Jit_TcU8() {
+	LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
+	LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 1);
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8));
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
+}
+
+void VertexDecoderJitCache::Jit_TcU16() {
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16));
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
+}
+
+void VertexDecoderJitCache::Jit_TcFloat() {
+	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
+	LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 4);
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
+	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.uvoff + 4);
+}
+
+// Just copy 12 bytes.
+void VertexDecoderJitCache::Jit_PosFloat() {
+	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
+	LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 4);
+	LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8);
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.posoff);
+	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.posoff + 4);
+	STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8);
+}
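
Reviewer note, not part of the patch: for a cube.elf-style vertex format that hits the three enabled fast paths (Step_TcU8, Step_Color8888, Step_PosFloat), the loop emitted by Compile() behaves roughly like the C++ sketch below. The struct layouts and the DecodeSimpleVerts name are hypothetical stand-ins for illustration only; the real source offsets come from dec_->tcoff/coloff/posoff, and the destination offsets and stride come from dec_->decFmt.

#include <cstdint>
#include <cstring>

struct SrcVertex {              // hypothetical GE vertex layout: u8 UVs, 8888 color, float position
	uint8_t uv[2];
	uint8_t pad[2];
	uint32_t color;
	float pos[3];
};

struct DstVertex {              // hypothetical decoded layout, standing in for dec.decFmt
	uint8_t uv[2];              // Jit_TcU8 packs both bytes into a single 32-bit store
	uint8_t uvPad[2];
	uint32_t color;             // Jit_Color8888 copies the raw 32-bit word
	float pos[3];               // Jit_PosFloat copies 12 bytes as three words
};

static void DecodeSimpleVerts(const SrcVertex *src, DstVertex *dst, int count) {
	uint32_t fullAlpha = 0xFF;               // fullAlphaReg, initialized because dec.col != 0
	for (int i = 0; i < count; i++) {        // SUBS counterReg / B.NE back to loopStart
		dst[i].uv[0] = src[i].uv[0];         // Jit_TcU8: LDRB, LDRB, ORR (LSL 8), STR
		dst[i].uv[1] = src[i].uv[1];
		dst[i].color = src[i].color;         // Jit_Color8888: LDR + STR
		fullAlpha = 0;                       // the stub clears the flag unconditionally for now
		memcpy(dst[i].pos, src[i].pos, 12);  // Jit_PosFloat: three LDR/STR pairs
	}
	if (fullAlpha == 0) {
		// Epilogue: STRB of fullAlphaReg into gstate_c.vertexFullAlpha.
	}
}

Pointer advancement by dec.VertexSize() and dec.decFmt.stride is modeled here by array indexing. Because Jit_Color8888 clears fullAlphaReg unconditionally (the ANDSI2R alpha check is still commented out), the epilogue always stores 0 to gstate_c.vertexFullAlpha for any format with color, so full-alpha optimizations stay disabled until that TODO is resolved.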