From 1b491fe15612e281ac3a7f923540938be8fb5ccf Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 10 May 2017 17:31:34 -0700 Subject: [PATCH] SoftGPU: Stub a jit for texel fetch. --- CMakeLists.txt | 1 + GPU/GPU.vcxproj | 1 + GPU/GPU.vcxproj.filters | 3 + GPU/Software/Rasterizer.cpp | 30 ++++++---- GPU/Software/Sampler.cpp | 116 ++++++++++++++++++++++++++++++++---- GPU/Software/Sampler.h | 87 ++++++++++++++++++++++++++- GPU/Software/SamplerX86.cpp | 96 +++++++++++++++++++++++++++++ GPU/Software/SoftGpu.cpp | 6 +- UWP/GPU_UWP/GPU_UWP.vcxproj | 2 + android/jni/Android.mk | 6 +- 10 files changed, 321 insertions(+), 27 deletions(-) create mode 100644 GPU/Software/SamplerX86.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 81d1d0707f..98d4f79846 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1089,6 +1089,7 @@ list(APPEND CoreExtra Core/MIPS/x86/RegCacheFPU.cpp Core/MIPS/x86/RegCacheFPU.h GPU/Common/VertexDecoderX86.cpp + GPU/Software/SamplerX86.cpp ) list(APPEND CoreExtra diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj index 54785e545d..dcf471a173 100644 --- a/GPU/GPU.vcxproj +++ b/GPU/GPU.vcxproj @@ -352,6 +352,7 @@ + diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters index 15353f1ab7..2499cfab12 100644 --- a/GPU/GPU.vcxproj.filters +++ b/GPU/GPU.vcxproj.filters @@ -516,5 +516,8 @@ Software + + Software + \ No newline at end of file diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 250dc28379..1fd9268271 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -1003,7 +1003,7 @@ inline void DrawSinglePixel(const DrawingCoords &p, u16 z, u8 fog, const Vec4 &prim_color, float s, float t, int texlevel, int frac_texlevel, bool bilinear, u8 *texptr[], int texbufw[]) { +static inline void ApplyTexturing(Sampler::NearestFunc sampler, Vec4 &prim_color, float s, float t, int texlevel, int frac_texlevel, bool bilinear, u8 *texptr[], int texbufw[]) { int u[8] = {0}, v[8] = {0}; // 1.23.8 fixed 
point int frac_u[2], frac_v[2]; @@ -1021,9 +1021,9 @@ static inline void ApplyTexturing(Vec4 &prim_color, float s, float t, int t GetTexelCoordinates(texlevel + 1, s, t, u[1], v[1]); } - texcolor0 = Sampler::SampleNearest(texlevel, u[0], v[0], tptr0, bufw0); + texcolor0 = Vec4::FromRGBA(sampler(u[0], v[0], tptr0, bufw0, texlevel)); if (frac_texlevel) { - texcolor1 = Sampler::SampleNearest(texlevel + 1, u[1], v[1], tptr1, bufw1); + texcolor1 = Vec4::FromRGBA(sampler(u[1], v[1], tptr1, bufw1, texlevel + 1)); } } else { GetTexelCoordinatesQuad(texlevel, s, t, u, v, frac_u[0], frac_v[0]); @@ -1031,9 +1031,9 @@ static inline void ApplyTexturing(Vec4 &prim_color, float s, float t, int t GetTexelCoordinatesQuad(texlevel + 1, s, t, u + 4, v + 4, frac_u[1], frac_v[1]); } - texcolor0 = Sampler::SampleLinear(texlevel, u, v, frac_u[0], frac_v[0], tptr0, bufw0); + texcolor0 = Sampler::SampleLinear(sampler, u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel); if (frac_texlevel) { - texcolor1 = Sampler::SampleLinear(texlevel + 1, u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1); + texcolor1 = Sampler::SampleLinear(sampler, u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1); } } @@ -1106,7 +1106,7 @@ static inline void CalculateSamplingParams(const float ds, const float dt, const } } -static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, const Vec4 &t, int maxTexLevel, u8 *texptr[], int texbufw[]) { +static inline void ApplyTexturing(Sampler::NearestFunc sampler, Vec4 *prim_color, const Vec4 &s, const Vec4 &t, int maxTexLevel, u8 *texptr[], int texbufw[]) { float ds = s[1] - s[0]; float dt = t[2] - t[0]; @@ -1116,7 +1116,7 @@ static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, c CalculateSamplingParams(ds, dt, maxTexLevel, level, levelFrac, bilinear); for (int i = 0; i < 4; ++i) { - ApplyTexturing(prim_color[i], s[i], t[i], level, levelFrac, bilinear, texptr, texbufw); + ApplyTexturing(sampler, prim_color[i], s[i], t[i], level, levelFrac, 
bilinear, texptr, texbufw); } } @@ -1245,6 +1245,8 @@ void DrawTriangleSlice( // This is common, and when we interpolate, we lose accuracy. const bool flatZ = v0.screenpos.z == v1.screenpos.z && v0.screenpos.z == v2.screenpos.z; + Sampler::NearestFunc sampler = Sampler::GetNearestFunc(); + for (pprime.y = minY + hy1 * 32; pprime.y < minY + hy2 * 32; pprime.y += 32, w0_base = e0.StepY(w0_base), w1_base = e1.StepY(w1_base), @@ -1302,7 +1304,7 @@ void DrawTriangleSlice( GetTextureCoordinates(v0, v1, v2, w0, w1, w2, wsum_recip, s, t); } - ApplyTexturing(prim_color, s, t, maxTexLevel, texptr, texbufw); + ApplyTexturing(sampler, prim_color, s, t, maxTexLevel, texptr, texbufw); } if (!clearMode) { @@ -1412,6 +1414,8 @@ void DrawPoint(const VertexData &v0) bool clearMode = gstate.isModeClear(); + Sampler::NearestFunc sampler = Sampler::GetNearestFunc(); + if (gstate.isTextureMapEnabled() && !clearMode) { int texbufw[8] = {0}; @@ -1449,7 +1453,7 @@ void DrawPoint(const VertexData &v0) int texLevelFrac; bool bilinear; CalculateSamplingParams(0.0f, 0.0f, maxTexLevel, texLevel, texLevelFrac, bilinear); - ApplyTexturing(prim_color, s, t, texLevel, texLevelFrac, bilinear, texptr, texbufw); + ApplyTexturing(sampler, prim_color, s, t, texLevel, texLevelFrac, bilinear, texptr, texbufw); } if (!clearMode) @@ -1515,6 +1519,8 @@ void DrawLine(const VertexData &v0, const VertexData &v1) } } + Sampler::NearestFunc sampler = Sampler::GetNearestFunc(); + float x = a.x > b.x ? a.x - 1 : a.x; float y = a.y > b.y ? 
a.y - 1 : a.y; float z = a.z; @@ -1579,7 +1585,7 @@ void DrawLine(const VertexData &v0, const VertexData &v1) texBilinear = true; } - ApplyTexturing(prim_color, s, t, texLevel, texLevelFrac, texBilinear, texptr, texbufw); + ApplyTexturing(sampler, prim_color, s, t, texLevel, texLevelFrac, texBilinear, texptr, texbufw); } if (!clearMode) @@ -1632,10 +1638,12 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level) int texbufw = GetTextureBufw(level, texaddr, texfmt); u8 *texptr = Memory::GetPointer(texaddr); + Sampler::NearestFunc sampler = Sampler::GetNearestFunc(); + u32 *row = (u32 *)buffer.GetData(); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { - row[x] = Sampler::SampleNearest(level, x, y, texptr, texbufw).ToRGBA(); + row[x] = sampler(x, y, texptr, texbufw, level); } row += w; } diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp index 7f1ffc001d..2907802dcc 100644 --- a/GPU/Software/Sampler.cpp +++ b/GPU/Software/Sampler.cpp @@ -15,6 +15,8 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include +#include #include "Common/ColorConv.h" #include "Core/Reporting.h" #include "GPU/Common/TextureDecoder.h" @@ -31,6 +33,95 @@ extern u32 clut[4096]; namespace Sampler { +static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level); + +std::mutex jitCacheLock; +SamplerJitCache *jitCache = nullptr; + +void Init() { + jitCache = new SamplerJitCache(); +} + +void Shutdown() { + delete jitCache; + jitCache = nullptr; +} + +NearestFunc GetNearestFunc() { + SamplerID id; + jitCache->ComputeSamplerID(&id); + NearestFunc jitted = jitCache->GetSampler(id); + if (jitted) { + return jitted; + } + + return &SampleNearest; +} + +SamplerJitCache::SamplerJitCache() +#if PPSSPP_ARCH(ARM64) + : fp(this) +#endif +{ + // 256k should be enough. 
+ AllocCodeSpace(1024 * 64 * 4); + + // Add some random code to "help" MSVC's buggy disassembler :( +#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) + using namespace Gen; + for (int i = 0; i < 100; i++) { + MOV(32, R(EAX), R(EBX)); + RET(); + } +#elif defined(ARM) + BKPT(0); + BKPT(0); +#endif +} + +void SamplerJitCache::Clear() { + ClearCodeSpace(0); + cache_.clear(); +} + +void SamplerJitCache::ComputeSamplerID(SamplerID *id_out) { + SamplerID id; + + id.texfmt = gstate.getTextureFormat(); + id.clutfmt = gstate.getClutPaletteFormat(); + id.swizzle = gstate.isTextureSwizzled(); + // Only CLUT4 can use separate CLUTs per mipmap. + id.useSharedClut = gstate.isClutSharedForMipmaps() || gstate.getTextureFormat() != GE_TFMT_CLUT4; + id.hasClutMask = gstate.getClutIndexMask() != 0xFF; + id.hasClutShift = gstate.getClutIndexShift() != 0; + id.hasClutOffset = gstate.getClutIndexStartPos() != 0; + + *id_out = id; +} + +NearestFunc SamplerJitCache::GetSampler(const SamplerID &id) { + std::lock_guard guard(jitCacheLock); + + auto it = cache_.find(id); + if (it != cache_.end()) { + return it->second; + } + + // TODO: What should be the min size? Can we even hit this? 
+ if (GetSpaceLeft() < 16384) { + Clear(); + } + + // TODO +#ifdef _M_X64 + NearestFunc func = Compile(id); + cache_[id] = func; + return func; +#else + return nullptr; +#endif +} + template static inline int GetPixelDataOffset(unsigned int row_pitch_bytes, unsigned int u, unsigned int v) { @@ -85,7 +176,7 @@ struct Nearest4 { }; template -inline static Nearest4 SampleNearest(int level, int u[N], int v[N], const u8 *srcptr, int texbufw) +inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level) { Nearest4 res; if (!srcptr) { @@ -193,14 +284,18 @@ inline static Nearest4 SampleNearest(int level, int u[N], int v[N], const u8 *sr } } -Vec4 SampleNearest(int level, int u, int v, const u8 *tptr, int bufw) { - return Vec4::FromRGBA(SampleNearest<1>(level, &u, &v, tptr, bufw)); +static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) { + return SampleNearest<1>(&u, &v, tptr, bufw, level); } -Vec4 SampleLinear(int texlevel, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw) { -#if defined(_M_SSE) - Nearest4 c = SampleNearest<4>(texlevel, u, v, tptr, bufw); +Vec4 SampleLinear(NearestFunc sampler, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) { + Nearest4 c; + c.v[0] = sampler(u[0], v[0], tptr, bufw, texlevel); + c.v[1] = sampler(u[1], v[1], tptr, bufw, texlevel); + c.v[2] = sampler(u[2], v[2], tptr, bufw, texlevel); + c.v[3] = sampler(u[3], v[3], tptr, bufw, texlevel); +#if defined(_M_SSE) const __m128i z = _mm_setzero_si128(); __m128i cvec = _mm_load_si128((const __m128i *)c.v); @@ -217,11 +312,10 @@ Vec4 SampleLinear(int texlevel, int u[4], int v[4], int frac_u, int frac_v, __m128i res = _mm_add_epi16(tmp, _mm_shuffle_epi32(tmp, _MM_SHUFFLE(3, 2, 3, 2))); return Vec4(_mm_unpacklo_epi16(res, z)); #else - Nearest4 nearest = SampleNearest<4>(texlevel, u, v, tptr, bufw); - Vec4 texcolor_tl = Vec4::FromRGBA(nearest.v[0]); - Vec4 texcolor_tr = 
Vec4::FromRGBA(nearest.v[1]); - Vec4 texcolor_bl = Vec4::FromRGBA(nearest.v[2]); - Vec4 texcolor_br = Vec4::FromRGBA(nearest.v[3]); + Vec4 texcolor_tl = Vec4::FromRGBA(c.v[0]); + Vec4 texcolor_tr = Vec4::FromRGBA(c.v[1]); + Vec4 texcolor_bl = Vec4::FromRGBA(c.v[2]); + Vec4 texcolor_br = Vec4::FromRGBA(c.v[3]); // 0x100 causes a slight bias to tl, but without it we'd have to divide by 255 * 255. Vec4 t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u; Vec4 b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u; diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h index 768f27a0ae..9549b08719 100644 --- a/GPU/Software/Sampler.h +++ b/GPU/Software/Sampler.h @@ -17,11 +17,94 @@ #pragma once +#include "ppsspp_config.h" + +#include +#if PPSSPP_ARCH(ARM) +#include "Common/ArmEmitter.h" +#elif PPSSPP_ARCH(ARM64) +#include "Common/Arm64Emitter.h" +#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) +#include "Common/x64Emitter.h" +#elif PPSSPP_ARCH(MIPS) +#include "Common/MipsEmitter.h" +#else +#include "Common/FakeEmitter.h" +#endif #include "GPU/Math3D.h" +struct SamplerID { + SamplerID() : fullKey(0) { + } + + union { + u32 fullKey; + struct { + uint8_t texfmt : 4; // Unsigned: a signed 4-bit field would only hold -8..7. + uint8_t clutfmt : 2; // Unsigned: a signed 2-bit field would only hold -2..1. + uint8_t : 2; + bool swizzle : 1; + bool useSharedClut : 1; + bool hasClutMask : 1; + bool hasClutShift : 1; + bool hasClutOffset : 1; + }; + }; + + bool operator == (const SamplerID &other) const { + return fullKey == other.fullKey; + } +}; + +namespace std { + +template <> +struct hash { + std::size_t operator()(const SamplerID &k) const { + return hash()(k.fullKey); + } +}; + +}; + namespace Sampler { -Math3D::Vec4 SampleNearest(int level, int u, int v, const u8 *tptr, int bufwbytes); -Math3D::Vec4 SampleLinear(int level, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufwbytes); +typedef u32 (*NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level); +NearestFunc GetNearestFunc(); + +void Init(); +void Shutdown(); + +Math3D::Vec4 
SampleLinear(NearestFunc sampler, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level); + +#if PPSSPP_ARCH(ARM) +class SamplerJitCache : public ArmGen::ARMXCodeBlock { +#elif PPSSPP_ARCH(ARM64) +class SamplerJitCache : public Arm64Gen::ARM64CodeBlock { +#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) +class SamplerJitCache : public Gen::XCodeBlock { +#elif PPSSPP_ARCH(MIPS) +class SamplerJitCache : public MIPSGen::MIPSCodeBlock { +#else +class SamplerJitCache : public FakeGen::FakeXCodeBlock { +#endif +public: + SamplerJitCache(); + + void ComputeSamplerID(SamplerID *id_out); + + // Returns a pointer to the code to run. + NearestFunc GetSampler(const SamplerID &id); + void Clear(); + +private: + NearestFunc Compile(const SamplerID &id); + +#if PPSSPP_ARCH(ARM64) + Arm64Gen::ARM64FloatEmitter fp; +#endif + + std::unordered_map cache_; +}; }; diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp new file mode 100644 index 0000000000..b736c16ec4 --- /dev/null +++ b/GPU/Software/SamplerX86.cpp @@ -0,0 +1,96 @@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) + +#include +#include "Common/x64Emitter.h" +#include "GPU/Software/Sampler.h" +#include "GPU/ge_constants.h" + +using namespace Gen; + +namespace Sampler { + +#ifdef _WIN32 +static const X64Reg resultReg = RAX; +static const X64Reg tempReg1 = R10; +static const X64Reg tempReg2 = R11; +static const X64Reg uReg = RCX; +static const X64Reg vReg = RDX; +static const X64Reg srcReg = R8; +static const X64Reg bufwReg = R9; +// TODO: levelReg on stack +#else +static const X64Reg resultReg = RAX; +static const X64Reg tempReg1 = R9; +static const X64Reg tempReg2 = R10; +static const X64Reg uReg = RDI; +static const X64Reg vReg = RSI; +static const X64Reg srcReg = RDX; +static const X64Reg bufwReg = RCX; +static const X64Reg levelReg = R8; +#endif + +NearestFunc SamplerJitCache::Compile(const SamplerID &id) { + BeginWrite(); + const u8 *start = this->AlignCode16(); + + SUB(PTRBITS, R(ESP), Imm8(64)); + MOVUPS(MDisp(ESP, 0), XMM4); + MOVUPS(MDisp(ESP, 16), XMM5); + MOVUPS(MDisp(ESP, 32), XMM6); + MOVUPS(MDisp(ESP, 48), XMM7); + + // Early exit on !srcPtr. 
+ CMP(PTRBITS, R(srcReg), Imm32(0)); + FixupBranch nonZeroSrc = J_CC(CC_NZ); + XOR(32, R(RAX), R(RAX)); + FixupBranch zeroSrc = J(true); + SetJumpTarget(nonZeroSrc); + + GETextureFormat fmt = (GETextureFormat)id.texfmt; + bool success = true; + switch (fmt) { + default: + success = false; + } + + if (!success) { + EndWrite(); + SetCodePtr(const_cast(start)); + return nullptr; + } + + SetJumpTarget(zeroSrc); + + MOVUPS(XMM4, MDisp(ESP, 0)); + MOVUPS(XMM5, MDisp(ESP, 16)); + MOVUPS(XMM6, MDisp(ESP, 32)); + MOVUPS(XMM7, MDisp(ESP, 48)); + ADD(PTRBITS, R(ESP), Imm8(64)); + + RET(); + + EndWrite(); + return (NearestFunc)start; +} + +}; + +#endif diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index 081d20f9fb..2c05938d5f 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -31,9 +31,10 @@ #include "profiler/profiler.h" #include "thin3d/thin3d.h" +#include "GPU/Software/Rasterizer.h" +#include "GPU/Software/Sampler.h" #include "GPU/Software/SoftGpu.h" #include "GPU/Software/TransformUnit.h" -#include "GPU/Software/Rasterizer.h" #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/FramebufferCommon.h" @@ -99,6 +100,7 @@ SoftGPU::SoftGPU(GraphicsContext *gfxCtx, Draw::DrawContext *draw) displayStride_ = 512; displayFormat_ = GE_FORMAT_8888; + Sampler::Init(); drawEngine_ = new SoftwareDrawEngine(); drawEngineCommon_ = drawEngine_; } @@ -127,6 +129,8 @@ SoftGPU::~SoftGPU() { samplerNearest = nullptr; samplerLinear->Release(); samplerLinear = nullptr; + + Sampler::Shutdown(); } void SoftGPU::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) { diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj index 0200d09d61..7e797889b9 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj @@ -344,6 +344,7 @@ + @@ -397,6 +398,7 @@ + diff --git a/android/jni/Android.mk b/android/jni/Android.mk index a67b94d110..7b3dd28a06 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk 
@@ -28,7 +28,8 @@ ARCH_FILES := \ $(SRC)/Core/MIPS/x86/JitSafeMem.cpp \ $(SRC)/Core/MIPS/x86/RegCache.cpp \ $(SRC)/Core/MIPS/x86/RegCacheFPU.cpp \ - $(SRC)/GPU/Common/VertexDecoderX86.cpp + $(SRC)/GPU/Common/VertexDecoderX86.cpp \ + $(SRC)/GPU/Software/SamplerX86.cpp endif ifeq ($(TARGET_ARCH_ABI),x86_64) @@ -48,7 +49,8 @@ ARCH_FILES := \ $(SRC)/Core/MIPS/x86/JitSafeMem.cpp \ $(SRC)/Core/MIPS/x86/RegCache.cpp \ $(SRC)/Core/MIPS/x86/RegCacheFPU.cpp \ - $(SRC)/GPU/Common/VertexDecoderX86.cpp + $(SRC)/GPU/Common/VertexDecoderX86.cpp \ + $(SRC)/GPU/Software/SamplerX86.cpp endif ifeq ($(findstring armeabi-v7a,$(TARGET_ARCH_ABI)),armeabi-v7a)