diff --git a/CMakeLists.txt b/CMakeLists.txt
index 81d1d0707f..98d4f79846 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1089,6 +1089,7 @@ list(APPEND CoreExtra
Core/MIPS/x86/RegCacheFPU.cpp
Core/MIPS/x86/RegCacheFPU.h
GPU/Common/VertexDecoderX86.cpp
+ GPU/Software/SamplerX86.cpp
)
list(APPEND CoreExtra
diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj
index 54785e545d..dcf471a173 100644
--- a/GPU/GPU.vcxproj
+++ b/GPU/GPU.vcxproj
@@ -352,6 +352,7 @@
+
diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters
index 15353f1ab7..2499cfab12 100644
--- a/GPU/GPU.vcxproj.filters
+++ b/GPU/GPU.vcxproj.filters
@@ -516,5 +516,8 @@
Software
+
+ Software
+
\ No newline at end of file
diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index 250dc28379..1fd9268271 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -1003,7 +1003,7 @@ inline void DrawSinglePixel(const DrawingCoords &p, u16 z, u8 fog, const Vec4 &prim_color, float s, float t, int texlevel, int frac_texlevel, bool bilinear, u8 *texptr[], int texbufw[]) {
+static inline void ApplyTexturing(Sampler::NearestFunc sampler, Vec4 &prim_color, float s, float t, int texlevel, int frac_texlevel, bool bilinear, u8 *texptr[], int texbufw[]) {
int u[8] = {0}, v[8] = {0}; // 1.23.8 fixed point
int frac_u[2], frac_v[2];
@@ -1021,9 +1021,9 @@ static inline void ApplyTexturing(Vec4 &prim_color, float s, float t, int t
GetTexelCoordinates(texlevel + 1, s, t, u[1], v[1]);
}
- texcolor0 = Sampler::SampleNearest(texlevel, u[0], v[0], tptr0, bufw0);
+ texcolor0 = Vec4::FromRGBA(sampler(u[0], v[0], tptr0, bufw0, texlevel));
if (frac_texlevel) {
- texcolor1 = Sampler::SampleNearest(texlevel + 1, u[1], v[1], tptr1, bufw1);
+ texcolor1 = Vec4::FromRGBA(sampler(u[1], v[1], tptr1, bufw1, texlevel + 1));
}
} else {
GetTexelCoordinatesQuad(texlevel, s, t, u, v, frac_u[0], frac_v[0]);
@@ -1031,9 +1031,9 @@ static inline void ApplyTexturing(Vec4 &prim_color, float s, float t, int t
GetTexelCoordinatesQuad(texlevel + 1, s, t, u + 4, v + 4, frac_u[1], frac_v[1]);
}
- texcolor0 = Sampler::SampleLinear(texlevel, u, v, frac_u[0], frac_v[0], tptr0, bufw0);
+ texcolor0 = Sampler::SampleLinear(sampler, u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel);
if (frac_texlevel) {
- texcolor1 = Sampler::SampleLinear(texlevel + 1, u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1);
+ texcolor1 = Sampler::SampleLinear(sampler, u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1);
}
}
@@ -1106,7 +1106,7 @@ static inline void CalculateSamplingParams(const float ds, const float dt, const
}
}
-static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, const Vec4 &t, int maxTexLevel, u8 *texptr[], int texbufw[]) {
+static inline void ApplyTexturing(Sampler::NearestFunc sampler, Vec4 *prim_color, const Vec4 &s, const Vec4 &t, int maxTexLevel, u8 *texptr[], int texbufw[]) {
float ds = s[1] - s[0];
float dt = t[2] - t[0];
@@ -1116,7 +1116,7 @@ static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, c
CalculateSamplingParams(ds, dt, maxTexLevel, level, levelFrac, bilinear);
for (int i = 0; i < 4; ++i) {
- ApplyTexturing(prim_color[i], s[i], t[i], level, levelFrac, bilinear, texptr, texbufw);
+ ApplyTexturing(sampler, prim_color[i], s[i], t[i], level, levelFrac, bilinear, texptr, texbufw);
}
}
@@ -1245,6 +1245,8 @@ void DrawTriangleSlice(
// This is common, and when we interpolate, we lose accuracy.
const bool flatZ = v0.screenpos.z == v1.screenpos.z && v0.screenpos.z == v2.screenpos.z;
+ Sampler::NearestFunc sampler = Sampler::GetNearestFunc();
+
for (pprime.y = minY + hy1 * 32; pprime.y < minY + hy2 * 32; pprime.y += 32,
w0_base = e0.StepY(w0_base),
w1_base = e1.StepY(w1_base),
@@ -1302,7 +1304,7 @@ void DrawTriangleSlice(
GetTextureCoordinates(v0, v1, v2, w0, w1, w2, wsum_recip, s, t);
}
- ApplyTexturing(prim_color, s, t, maxTexLevel, texptr, texbufw);
+ ApplyTexturing(sampler, prim_color, s, t, maxTexLevel, texptr, texbufw);
}
if (!clearMode) {
@@ -1412,6 +1414,8 @@ void DrawPoint(const VertexData &v0)
bool clearMode = gstate.isModeClear();
+ Sampler::NearestFunc sampler = Sampler::GetNearestFunc();
+
if (gstate.isTextureMapEnabled() && !clearMode) {
int texbufw[8] = {0};
@@ -1449,7 +1453,7 @@ void DrawPoint(const VertexData &v0)
int texLevelFrac;
bool bilinear;
CalculateSamplingParams(0.0f, 0.0f, maxTexLevel, texLevel, texLevelFrac, bilinear);
- ApplyTexturing(prim_color, s, t, texLevel, texLevelFrac, bilinear, texptr, texbufw);
+ ApplyTexturing(sampler, prim_color, s, t, texLevel, texLevelFrac, bilinear, texptr, texbufw);
}
if (!clearMode)
@@ -1515,6 +1519,8 @@ void DrawLine(const VertexData &v0, const VertexData &v1)
}
}
+ Sampler::NearestFunc sampler = Sampler::GetNearestFunc();
+
float x = a.x > b.x ? a.x - 1 : a.x;
float y = a.y > b.y ? a.y - 1 : a.y;
float z = a.z;
@@ -1579,7 +1585,7 @@ void DrawLine(const VertexData &v0, const VertexData &v1)
texBilinear = true;
}
- ApplyTexturing(prim_color, s, t, texLevel, texLevelFrac, texBilinear, texptr, texbufw);
+ ApplyTexturing(sampler, prim_color, s, t, texLevel, texLevelFrac, texBilinear, texptr, texbufw);
}
if (!clearMode)
@@ -1632,10 +1638,12 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level)
int texbufw = GetTextureBufw(level, texaddr, texfmt);
u8 *texptr = Memory::GetPointer(texaddr);
+ Sampler::NearestFunc sampler = Sampler::GetNearestFunc();
+
u32 *row = (u32 *)buffer.GetData();
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
- row[x] = Sampler::SampleNearest(level, x, y, texptr, texbufw).ToRGBA();
+ row[x] = sampler(x, y, texptr, texbufw, level);
}
row += w;
}
diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp
index 7f1ffc001d..2907802dcc 100644
--- a/GPU/Software/Sampler.cpp
+++ b/GPU/Software/Sampler.cpp
@@ -15,6 +15,8 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+#include
+#include
#include "Common/ColorConv.h"
#include "Core/Reporting.h"
#include "GPU/Common/TextureDecoder.h"
@@ -31,6 +33,95 @@ extern u32 clut[4096];
namespace Sampler {
+static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
+
+std::mutex jitCacheLock;
+SamplerJitCache *jitCache = nullptr;
+
+void Init() {  // Creates the global sampler JIT cache; SoftGPU's constructor calls this.
+ jitCache = new SamplerJitCache();
+}
+
+void Shutdown() {  // Destroys the global sampler JIT cache; SoftGPU's destructor calls this.
+ delete jitCache;
+ jitCache = nullptr;  // Reset so a stale pointer is never observed after shutdown.
+}
+
+NearestFunc GetNearestFunc() {  // Picks the nearest-texel sampler matching the current gstate.
+ SamplerID id;
+ jitCache->ComputeSamplerID(&id);  // jitCache is not null-checked: Init() must have run first.
+ NearestFunc jitted = jitCache->GetSampler(id);
+ if (jitted) {
+ return jitted;
+ }
+
+ return &SampleNearest;  // Nothing jitted for this ID: fall back to the generic C++ sampler.
+}
+
+SamplerJitCache::SamplerJitCache()  // Allocates the code space that jitted samplers are emitted into.
+#if PPSSPP_ARCH(ARM64)
+ : fp(this)
+#endif
+{
+ // 256k should be enough.
+ AllocCodeSpace(1024 * 64 * 4);  // 1024 * 64 * 4 bytes = 256 KB.
+
+ // Add some random code to "help" MSVC's buggy disassembler :(
+#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+ using namespace Gen;
+ for (int i = 0; i < 100; i++) {  // Emits 100 MOV/RET pairs at the start of the code space.
+ MOV(32, R(EAX), R(EBX));
+ RET();
+ }
+#elif defined(ARM)
+ BKPT(0);  // On ARM, two breakpoint instructions are emitted instead.
+ BKPT(0);
+#endif
+}
+
+void SamplerJitCache::Clear() {  // Discards every jitted sampler: the code space and the lookup table.
+ ClearCodeSpace(0);
+ cache_.clear();  // Entries point into the code space that was just cleared.
+}
+
+void SamplerJitCache::ComputeSamplerID(SamplerID *id_out) {  // Fills *id_out from the current gstate.
+ SamplerID id;  // The constructor zeroes fullKey, so bits not assigned below stay 0.
+
+ id.texfmt = gstate.getTextureFormat();
+ id.clutfmt = gstate.getClutPaletteFormat();
+ id.swizzle = gstate.isTextureSwizzled();
+ // Only CLUT4 can use separate CLUTs per mipmap.
+ id.useSharedClut = gstate.isClutSharedForMipmaps() || gstate.getTextureFormat() != GE_TFMT_CLUT4;
+ id.hasClutMask = gstate.getClutIndexMask() != 0xFF;  // True unless the mask is 0xFF.
+ id.hasClutShift = gstate.getClutIndexShift() != 0;  // True unless the shift is 0.
+ id.hasClutOffset = gstate.getClutIndexStartPos() != 0;  // True unless the start position is 0.
+
+ *id_out = id;
+}
+
+NearestFunc SamplerJitCache::GetSampler(const SamplerID &id) {  // Returns jitted code for id, or nullptr.
+ std::lock_guard<std::mutex> guard(jitCacheLock);  // Serializes cache lookup and compilation.
+
+ auto it = cache_.find(id);
+ if (it != cache_.end()) {
+ return it->second;  // May be nullptr when an earlier Compile() failed for this id.
+ }
+
+ // TODO: What should be the min size? Can we even hit this?
+ if (GetSpaceLeft() < 16384) {
+ Clear();  // NOTE(review): invalidates pointers returned earlier - confirm no caller still holds one.
+ }
+
+ // TODO
+#ifdef _M_X64
+ NearestFunc func = Compile(id);
+ cache_[id] = func;  // Failures (nullptr) are cached too, so an id is compiled at most once per Clear().
+ return func;
+#else
+ return nullptr;
+#endif
+}
+
template
static inline int GetPixelDataOffset(unsigned int row_pitch_bytes, unsigned int u, unsigned int v)
{
@@ -85,7 +176,7 @@ struct Nearest4 {
};
template
-inline static Nearest4 SampleNearest(int level, int u[N], int v[N], const u8 *srcptr, int texbufw)
+inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level)
{
Nearest4 res;
if (!srcptr) {
@@ -193,14 +284,18 @@ inline static Nearest4 SampleNearest(int level, int u[N], int v[N], const u8 *sr
}
}
-Vec4 SampleNearest(int level, int u, int v, const u8 *tptr, int bufw) {
- return Vec4::FromRGBA(SampleNearest<1>(level, &u, &v, tptr, bufw));
+static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
+ return SampleNearest<1>(&u, &v, tptr, bufw, level);
}
-Vec4 SampleLinear(int texlevel, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw) {
-#if defined(_M_SSE)
- Nearest4 c = SampleNearest<4>(texlevel, u, v, tptr, bufw);
+Vec4 SampleLinear(NearestFunc sampler, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
+ Nearest4 c;
+ c.v[0] = sampler(u[0], v[0], tptr, bufw, texlevel);
+ c.v[1] = sampler(u[1], v[1], tptr, bufw, texlevel);
+ c.v[2] = sampler(u[2], v[2], tptr, bufw, texlevel);
+ c.v[3] = sampler(u[3], v[3], tptr, bufw, texlevel);
+#if defined(_M_SSE)
const __m128i z = _mm_setzero_si128();
__m128i cvec = _mm_load_si128((const __m128i *)c.v);
@@ -217,11 +312,10 @@ Vec4 SampleLinear(int texlevel, int u[4], int v[4], int frac_u, int frac_v,
__m128i res = _mm_add_epi16(tmp, _mm_shuffle_epi32(tmp, _MM_SHUFFLE(3, 2, 3, 2)));
return Vec4(_mm_unpacklo_epi16(res, z));
#else
- Nearest4 nearest = SampleNearest<4>(texlevel, u, v, tptr, bufw);
- Vec4 texcolor_tl = Vec4::FromRGBA(nearest.v[0]);
- Vec4 texcolor_tr = Vec4::FromRGBA(nearest.v[1]);
- Vec4 texcolor_bl = Vec4::FromRGBA(nearest.v[2]);
- Vec4 texcolor_br = Vec4::FromRGBA(nearest.v[3]);
+ Vec4 texcolor_tl = Vec4::FromRGBA(c.v[0]);
+ Vec4 texcolor_tr = Vec4::FromRGBA(c.v[1]);
+ Vec4 texcolor_bl = Vec4::FromRGBA(c.v[2]);
+ Vec4 texcolor_br = Vec4::FromRGBA(c.v[3]);
// 0x100 causes a slight bias to tl, but without it we'd have to divide by 255 * 255.
Vec4 t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u;
Vec4 b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u;
diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h
index 768f27a0ae..9549b08719 100644
--- a/GPU/Software/Sampler.h
+++ b/GPU/Software/Sampler.h
@@ -17,11 +17,94 @@
#pragma once
+#include "ppsspp_config.h"
+
+#include <unordered_map>
+#if PPSSPP_ARCH(ARM)
+#include "Common/ArmEmitter.h"
+#elif PPSSPP_ARCH(ARM64)
+#include "Common/Arm64Emitter.h"
+#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+#include "Common/x64Emitter.h"
+#elif PPSSPP_ARCH(MIPS)
+#include "Common/MipsEmitter.h"
+#else
+#include "Common/FakeEmitter.h"
+#endif
#include "GPU/Math3D.h"
+struct SamplerID {  // Compact key describing the texture state a sampler is specialized for.
+ SamplerID() : fullKey(0) {
+ }
+
+ union {
+ u32 fullKey;  // All fields viewed as one integer; used for equality and hashing.
+ struct {
+ uint8_t texfmt : 4;  // Unsigned: a signed 4-bit field would read formats >= 8 back as negative.
+ uint8_t clutfmt : 2;  // Unsigned: a signed 2-bit field would read formats 2 and 3 back as negative.
+ uint8_t : 2;  // Padding up to the next byte.
+ bool swizzle : 1;
+ bool useSharedClut : 1;
+ bool hasClutMask : 1;
+ bool hasClutShift : 1;
+ bool hasClutOffset : 1;
+ };
+ };
+
+ bool operator == (const SamplerID &other) const {
+ return fullKey == other.fullKey;
+ }
+};
+
+namespace std {
+
+template <>
+struct hash<SamplerID> {  // Lets SamplerID be used as the key of an unordered container.
+ std::size_t operator()(const SamplerID &k) const {
+ return hash<u32>()(k.fullKey);  // fullKey aliases all fields, so it identifies the ID.
+ }
+};
+
+};
+
namespace Sampler {
-Math3D::Vec4 SampleNearest(int level, int u, int v, const u8 *tptr, int bufwbytes);
-Math3D::Vec4 SampleLinear(int level, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufwbytes);
+typedef u32 (*NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level);  // Samples one texel; callers unpack the result with FromRGBA.
+NearestFunc GetNearestFunc();  // Returns a jitted sampler when available, else the generic one.
+
+void Init();  // Creates the sampler JIT cache; call before GetNearestFunc().
+void Shutdown();  // Destroys the sampler JIT cache.
+
+Math3D::Vec4 SampleLinear(NearestFunc sampler, int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
+
+#if PPSSPP_ARCH(ARM)
+class SamplerJitCache : public ArmGen::ARMXCodeBlock {
+#elif PPSSPP_ARCH(ARM64)
+class SamplerJitCache : public Arm64Gen::ARM64CodeBlock {
+#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+class SamplerJitCache : public Gen::XCodeBlock {
+#elif PPSSPP_ARCH(MIPS)
+class SamplerJitCache : public MIPSGen::MIPSCodeBlock {
+#else
+class SamplerJitCache : public FakeGen::FakeXCodeBlock {
+#endif
+public:
+ SamplerJitCache();
+
+ void ComputeSamplerID(SamplerID *id_out);  // Derives the cache key from the current gstate.
+
+ // Returns a pointer to the code to run.
+ NearestFunc GetSampler(const SamplerID &id);  // nullptr means "use the non-jitted sampler".
+ void Clear();  // Drops all generated code and cached entries.
+
+private:
+ NearestFunc Compile(const SamplerID &id);  // Emits code for id; nullptr when unsupported.
+
+#if PPSSPP_ARCH(ARM64)
+ Arm64Gen::ARM64FloatEmitter fp;
+#endif
+
+ std::unordered_map<SamplerID, NearestFunc> cache_;  // Compiled samplers keyed by SamplerID.
+};
};
diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp
new file mode 100644
index 0000000000..b736c16ec4
--- /dev/null
+++ b/GPU/Software/SamplerX86.cpp
@@ -0,0 +1,96 @@
+// Copyright (c) 2017- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "ppsspp_config.h"
+#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+
+#include
+#include "Common/x64Emitter.h"
+#include "GPU/Software/Sampler.h"
+#include "GPU/ge_constants.h"
+
+using namespace Gen;
+
+namespace Sampler {
+
+#ifdef _WIN32
+static const X64Reg resultReg = RAX;  // Return value register.
+static const X64Reg tempReg1 = R10;  // R10/R11 are caller-saved scratch in the Windows x64 ABI.
+static const X64Reg tempReg2 = R11;
+static const X64Reg uReg = RCX;  // Windows x64 passes the first four integer args in RCX, RDX, R8, R9.
+static const X64Reg vReg = RDX;
+static const X64Reg srcReg = R8;
+static const X64Reg bufwReg = R9;
+// TODO: levelReg on stack
+#else
+static const X64Reg resultReg = RAX;  // Return value register.
+static const X64Reg tempReg1 = R9;  // R9/R10 are caller-saved and not used by the five arguments.
+static const X64Reg tempReg2 = R10;
+static const X64Reg uReg = RDI;  // System V AMD64 passes integer args in RDI, RSI, RDX, RCX, R8.
+static const X64Reg vReg = RSI;
+static const X64Reg srcReg = RDX;
+static const X64Reg bufwReg = RCX;
+static const X64Reg levelReg = R8;
+#endif
+
+NearestFunc SamplerJitCache::Compile(const SamplerID &id) {  // Emits a sampler for id; nullptr if unsupported.
+ BeginWrite();
+ const u8 *start = this->AlignCode16();  // Entry point of the function being emitted.
+
+ SUB(PTRBITS, R(ESP), Imm8(64));  // Prologue: reserve 64 bytes of stack for XMM4-XMM7.
+ MOVUPS(MDisp(ESP, 0), XMM4);
+ MOVUPS(MDisp(ESP, 16), XMM5);
+ MOVUPS(MDisp(ESP, 32), XMM6);
+ MOVUPS(MDisp(ESP, 48), XMM7);
+
+ // Early exit on !srcPtr.
+ CMP(PTRBITS, R(srcReg), Imm32(0));
+ FixupBranch nonZeroSrc = J_CC(CC_NZ);
+ XOR(32, R(RAX), R(RAX));  // Null texture pointer: the sampler returns 0.
+ FixupBranch zeroSrc = J(true);  // Jump straight to the epilogue.
+ SetJumpTarget(nonZeroSrc);
+
+ GETextureFormat fmt = (GETextureFormat)id.texfmt;
+ bool success = true;
+ switch (fmt) {
+ default:  // No texture format is handled yet, so every id currently fails here.
+ success = false;
+ }
+
+ if (!success) {
+ EndWrite();
+ SetCodePtr(const_cast<u8 *>(start));  // Rewind the emit pointer to discard the partial function.
+ return nullptr;
+ }
+
+ SetJumpTarget(zeroSrc);
+
+ MOVUPS(XMM4, MDisp(ESP, 0));  // Epilogue: restore XMM4-XMM7 and release the stack space.
+ MOVUPS(XMM5, MDisp(ESP, 16));
+ MOVUPS(XMM6, MDisp(ESP, 32));
+ MOVUPS(XMM7, MDisp(ESP, 48));
+ ADD(PTRBITS, R(ESP), Imm8(64));
+
+ RET();
+
+ EndWrite();
+ return (NearestFunc)start;
+}
+
+};
+
+#endif
diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp
index 081d20f9fb..2c05938d5f 100644
--- a/GPU/Software/SoftGpu.cpp
+++ b/GPU/Software/SoftGpu.cpp
@@ -31,9 +31,10 @@
#include "profiler/profiler.h"
#include "thin3d/thin3d.h"
+#include "GPU/Software/Rasterizer.h"
+#include "GPU/Software/Sampler.h"
#include "GPU/Software/SoftGpu.h"
#include "GPU/Software/TransformUnit.h"
-#include "GPU/Software/Rasterizer.h"
#include "GPU/Common/DrawEngineCommon.h"
#include "GPU/Common/FramebufferCommon.h"
@@ -99,6 +100,7 @@ SoftGPU::SoftGPU(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
displayStride_ = 512;
displayFormat_ = GE_FORMAT_8888;
+ Sampler::Init();
drawEngine_ = new SoftwareDrawEngine();
drawEngineCommon_ = drawEngine_;
}
@@ -127,6 +129,8 @@ SoftGPU::~SoftGPU() {
samplerNearest = nullptr;
samplerLinear->Release();
samplerLinear = nullptr;
+
+ Sampler::Shutdown();
}
void SoftGPU::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) {
diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj
index 0200d09d61..7e797889b9 100644
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj
@@ -344,6 +344,7 @@
+
@@ -397,6 +398,7 @@
+
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index a67b94d110..7b3dd28a06 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -28,7 +28,8 @@ ARCH_FILES := \
$(SRC)/Core/MIPS/x86/JitSafeMem.cpp \
$(SRC)/Core/MIPS/x86/RegCache.cpp \
$(SRC)/Core/MIPS/x86/RegCacheFPU.cpp \
- $(SRC)/GPU/Common/VertexDecoderX86.cpp
+ $(SRC)/GPU/Common/VertexDecoderX86.cpp \
+ $(SRC)/GPU/Software/SamplerX86.cpp
endif
ifeq ($(TARGET_ARCH_ABI),x86_64)
@@ -48,7 +49,8 @@ ARCH_FILES := \
$(SRC)/Core/MIPS/x86/JitSafeMem.cpp \
$(SRC)/Core/MIPS/x86/RegCache.cpp \
$(SRC)/Core/MIPS/x86/RegCacheFPU.cpp \
- $(SRC)/GPU/Common/VertexDecoderX86.cpp
+ $(SRC)/GPU/Common/VertexDecoderX86.cpp \
+ $(SRC)/GPU/Software/SamplerX86.cpp
endif
ifeq ($(findstring armeabi-v7a,$(TARGET_ARCH_ABI)),armeabi-v7a)