diff --git a/CMakeLists.txt b/CMakeLists.txt index c1860a2b53..fcf74005e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1906,6 +1906,8 @@ set(GPU_SOURCES GPU/Common/Draw2D.cpp GPU/Common/Draw2D.h GPU/Common/DepthBufferCommon.cpp + GPU/Common/DepthRaster.cpp + GPU/Common/DepthRaster.h GPU/Common/TextureShaderCommon.cpp GPU/Common/TextureShaderCommon.h GPU/Common/DepalettizeShaderCommon.cpp diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 6ad03b8322..94b2d3933b 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -5,4 +5,3 @@ #pragma once #include "Common/Math/SIMDHeaders.h" - diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp index 3790421980..cd2c50c488 100644 --- a/Core/Compatibility.cpp +++ b/Core/Compatibility.cpp @@ -149,6 +149,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) { CheckSetting(iniFile, gameID, "DisableMemcpySlicing", &flags_.DisableMemcpySlicing); CheckSetting(iniFile, gameID, "ForceEnableGPUReadback", &flags_.ForceEnableGPUReadback); CheckSetting(iniFile, gameID, "UseFFMPEGFindStreamInfo", &flags_.UseFFMPEGFindStreamInfo); + CheckSetting(iniFile, gameID, "SoftwareRasterDepth", &flags_.SoftwareRasterDepth); } void Compatibility::CheckVRSettings(IniFile &iniFile, const std::string &gameID) { diff --git a/Core/Compatibility.h b/Core/Compatibility.h index 8a0e33af4d..4688df37c0 100644 --- a/Core/Compatibility.h +++ b/Core/Compatibility.h @@ -112,6 +112,7 @@ struct CompatFlags { bool DisableMemcpySlicing; bool ForceEnableGPUReadback; bool UseFFMPEGFindStreamInfo; + bool SoftwareRasterDepth; }; struct VRCompat { diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp new file mode 100644 index 0000000000..20ac069f82 --- /dev/null +++ b/GPU/Common/DepthRaster.cpp @@ -0,0 +1,367 @@ +#include + +#include "Common/Math/CrossSIMD.h" +#include "GPU/Common/DepthRaster.h" +#include "GPU/Math3D.h" +#include "Common/Math/math_util.h" +#include 
"GPU/Common/VertexDecoderCommon.h"
+
+// TODO: Should respect the scissor rect.
+
+// A vertex in screen space, as consumed by the depth rasterizers below.
+// x/y carry 4 fractional (subpixel) bits, z is a 16-bit depth value.
+// behind is nonzero when the vertex was found to have w <= 0 during projection.
+struct ScreenVert {
+	int x;
+	int y;
+	uint16_t z;
+	uint16_t behind;
+};
+
+// Fills the rectangle [x1, x2) x [y1, y2) (whole pixels) of a 16-bit depth buffer with depthValue.
+// dest points at pixel (0, 0) and stride is the row pitch in pixels.
+// Only GE_COMP_ALWAYS is implemented; every other compare mode currently writes nothing.
+// NOTE: No clipping happens here, the caller must pass coordinates that are inside the buffer.
+void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
+	// Swap coordinates if needed, we don't back-face-cull rects.
+	// We also ignore the UV rotation here.
+	if (x1 > x2) {
+		std::swap(x1, x2);
+	}
+	if (y1 > y2) {
+		std::swap(y1, y2);
+	}
+	if (x1 == x2 || y1 == y2) {
+		return;
+	}
+
+	if (depthCompare != GE_COMP_ALWAYS) {
+		// GE_COMP_NEVER never writes. TODO: Implement the remaining compare modes.
+		return;
+	}
+
+	const uint16_t value = (uint16_t)depthValue;
+#if PPSSPP_ARCH(SSE2)
+	// Keep the intrinsics inside the guard so that non-x86 targets still compile.
+	const __m128i valueX8 = _mm_set1_epi16(depthValue);
+#endif
+
+	for (int y = y1; y < y2; y++) {
+		uint16_t *ptr = dest + stride * y + x1;
+		int w = x2 - x1;
+#if PPSSPP_ARCH(SSE2)
+		// Bulk of the row, 8 pixels per store.
+		while (w >= 8) {
+			_mm_storeu_si128((__m128i *)ptr, valueX8);
+			ptr += 8;
+			w -= 8;
+		}
+#endif
+		// Scalar trailer for the last (w % 8) pixels. On targets without SSE2 this
+		// loop fills the whole row.
+		while (w > 0) {
+			*ptr++ = value;
+			w--;
+		}
+	}
+}
+
+using namespace Math3D;
+struct int2 {
+	int x, y;
+	int2(float a, float b) {
+		x = (int)(a + 0.5f);
+		y = (int)(b + 0.5f);
+	}
+};
+
+// Adapted from Intel's depth rasterizer example.
+// Started with the scalar version, will SIMD-ify later.
+//
+// Rasterizes the depth of one triangle into depthBuf (16-bit, stride in pixels).
+// x1/y1 are the inclusive start and x2/y2 the exclusive end of the area that may be written.
+// vertsSub holds the three vertices with 4 subpixel bits in x/y.
+// Pixels are written only where compareMode passes against the existing depth.
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) {
+	if (compareMode == GE_COMP_NEVER) {
+		// The depth test can never pass, so there is nothing to write.
+		return;
+	}
+
+	int tileStartX = x1;
+	int tileEndX = x2;
+
+	int tileStartY = y1;
+	int tileEndY = y2;
+
+	// Convert to whole pixels for now. Later subpixel precision.
+	// Note that vertices 1 and 2 trade places here, which flips the winding.
+	ScreenVert verts[3];
+	verts[0].x = vertsSub[0].x >> 4;
+	verts[0].y = vertsSub[0].y >> 4;
+	verts[0].z = vertsSub[0].z;
+	verts[1].x = vertsSub[2].x >> 4;
+	verts[1].y = vertsSub[2].y >> 4;
+	verts[1].z = vertsSub[2].z;
+	verts[2].x = vertsSub[1].x >> 4;
+	verts[2].y = vertsSub[1].y >> 4;
+	verts[2].z = vertsSub[1].z;
+
+	// use fixed-point only for X and Y. Avoid work for Z and W.
+	// The bounding box is clamped to the tile. It's not rounded down to even coordinates:
+	// the scalar loop has no alignment requirement, and rounding could step outside the tile.
+	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX);
+	int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX);
+
+	int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY);
+	int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY);
+
+	if (endX <= startX || endY <= startY) {
+		// No pixels
+		return;
+	}
+
+	// Fab(x, y) = Ax + By + C = 0
+	// Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
+	// Compute A = (ya - yb) for the 3 line segments that make up each triangle
+	int A0 = verts[1].y - verts[2].y;
+	int A1 = verts[2].y - verts[0].y;
+	int A2 = verts[0].y - verts[1].y;
+
+	// Compute B = (xb - xa) for the 3 line segments that make up each triangle
+	int B0 = verts[2].x - verts[1].x;
+	int B1 = verts[0].x - verts[2].x;
+	int B2 = verts[1].x - verts[0].x;
+
+	// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
+	int C0 = verts[1].x * verts[2].y - verts[2].x * verts[1].y;
+	int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
+	int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;
+
+	// Compute triangle area
+	int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
+	if (triArea <= 0) {
+		// Too small to rasterize or backface culled
+		// NOTE: Just disabling this check won't enable two-sided rendering.
+		// Since it's not that common, let's just queue the triangles with both windings.
+		return;
+	}
+
+	float oneOverTriArea = (1.0f / float(triArea));
+
+	// Pre-divide the vertex depths by the area, so that the raw edge function values
+	// can be used directly as interpolation weights.
+	float zz[3];
+	for (int vv = 0; vv < 3; vv++) {
+		zz[vv] = (float)verts[vv].z * oneOverTriArea;
+	}
+
+	int rowIdx = (startY * stride + startX);
+
+	// Evaluate the edge functions at the starting corner.
+	int alpha0 = (A0 * startX) + (B0 * startY) + C0;
+	int beta0 = (A1 * startX) + (B1 * startY) + C1;
+	int gama0 = (A2 * startX) + (B2 * startY) + C2;
+
+	// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
+	for (int r = startY; r < endY; r++,
+		rowIdx += stride,
+		alpha0 += B0,
+		beta0 += B1,
+		gama0 += B2)
+	{
+		// Compute barycentric coordinates
+		int idx = rowIdx;
+		int alpha = alpha0;
+		int beta = beta0;
+		int gama = gama0;
+
+		for (int c = startX; c < endX; c++,
+			idx++,
+			alpha += A0,
+			beta += A1,
+			gama += A2)
+		{
+			// Skip pixels outside the triangle.
+			if (alpha < 0 || beta < 0 || gama < 0) {
+				continue;
+			}
+			// Compute barycentric-interpolated depth
+			float depth = alpha * zz[0] + beta * zz[1] + gama * zz[2];
+			float previousDepthValue = (float)depthBuf[idx];
+
+			bool pass;
+			switch (compareMode) {
+			case GE_COMP_EQUAL:  pass = depth == previousDepthValue; break;
+			case GE_COMP_LESS: pass = depth < previousDepthValue; break;
+			case GE_COMP_LEQUAL: pass = depth <= previousDepthValue; break;
+			case GE_COMP_GEQUAL: pass = depth >= previousDepthValue; break;
+			case GE_COMP_GREATER: pass = depth > previousDepthValue; break;
+			case GE_COMP_NOTEQUAL: pass = depth != previousDepthValue; break;
+			case GE_COMP_ALWAYS:
+			default:
+				pass = true;
+				break;
+			}
+			if (!pass) {
+				// Keep the previous depth value.
+				continue;
+			}
+			// Float rounding can land slightly outside the 16-bit range, clamp before converting.
+			depth = std::min(std::max(depth, 0.0f), 65535.0f);
+			depthBuf[idx] = (u16)depth;
+		} //for each column
+	} // for each row
+}
+
+// We ignore lots of primitive types for now.
+//
+// Reads the vertex positions of one draw, brings them to screen space and rasterizes
+// depth only, into the 16-bit buffer at depth (depthStride is the row pitch in pixels).
+//
+// x1/y1/x2/y2 bound the area that may be written (x2/y2 are treated as exclusive, matching
+// DepthRasterTriangle). bufferData is scratch space: decoded float positions are stored at
+// its start, and the ScreenVert array at a fixed byte offset behind them.
+// Only non-indexed, non-skinned rectangles, triangles and triangle strips are handled,
+// everything else returns without drawing. indexData and clockwise are currently unused.
+void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData,
+	const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) {
+
+	// Byte offset of the ScreenVert array inside bufferData.
+	const int screenVertsOffset = 65536 * 8;
+
+	if (!depth || count <= 0) {
+		// Nowhere to draw to, or nothing to draw.
+		return;
+	}
+	if (count > screenVertsOffset / (int)(3 * sizeof(float))) {
+		// The float positions would run into the ScreenVert array and get overwritten
+		// while we are still reading them. Skip such huge draws.
+		return;
+	}
+
+	GEComparison compareMode = gstate.getDepthTestFunction();
+	if (gstate.isModeClear()) {
+		if (!gstate.isClearModeDepthMask()) {
+			return;
+		}
+		compareMode = GE_COMP_ALWAYS;
+	} else {
+		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
+			return;
+	}
+
+	switch (prim) {
+	case GE_PRIM_INVALID:
+	case GE_PRIM_KEEP_PREVIOUS:
+	case GE_PRIM_LINES:
+	case GE_PRIM_LINE_STRIP:
+	case GE_PRIM_POINTS:
+		return;
+	default:
+		break;
+	}
+
+	// TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without
+	// running the full decoder.
+	if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) {
+		return;
+	}
+
+	bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0;
+
+	// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
+	float *verts = (float *)bufferData;
+	ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + screenVertsOffset);
+
+	// Simple, most common case.
+	int vertexStride = dec->VertexSize();
+	int offset = dec->posoff;
+	float factor = 1.0f;
+	switch (vertTypeID & GE_VTYPE_POS_MASK) {
+	case GE_VTYPE_POS_8BIT:
+		if (!isThroughMode) {
+			factor = 1.0f / 128.0f;
+		}
+		for (int i = 0; i < count; i++) {
+			const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * factor;
+			}
+		}
+		break;
+	case GE_VTYPE_POS_16BIT:
+		if (!isThroughMode) {
+			factor = 1.0f / 32768.0f;
+		}
+		for (int i = 0; i < count; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * factor;
+			}
+		}
+		break;
+	case GE_VTYPE_POS_FLOAT:
+		for (int i = 0; i < count; i++)
+			memcpy(&verts[i * 3], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
+		break;
+	default:
+		// No usable position data, verts would be left uninitialized.
+		return;
+	}
+
+	// OK, we now have the coordinates. Let's transform them into the ScreenVert array.
+	if (!isThroughMode) {
+		// TODO: This is very suboptimal. This should be one matrix multiplication per vertex.
+
+		float viewportX = gstate.getViewportXCenter();
+		float viewportY = gstate.getViewportYCenter();
+		float viewportZ = gstate.getViewportZCenter();
+		float viewportScaleX = gstate.getViewportXScale();
+		float viewportScaleY = gstate.getViewportYScale();
+		float viewportScaleZ = gstate.getViewportZScale();
+
+		bool allBehind = true;
+
+		for (int i = 0; i < count; i++) {
+			float world[3];
+			float view[3];
+			float proj[4];
+			Vec3ByMatrix43(world, verts + i * 3, gstate.worldMatrix);
+			Vec3ByMatrix43(view, world, gstate.viewMatrix);
+			Vec3ByMatrix44(proj, view, gstate.projMatrix);  // TODO: Include adjustments to the proj matrix?
+
+			float w = proj[3];
+
+			bool inFront = w > 0.0f;
+			screenVerts[i].behind = !inFront;
+			if (!inFront) {
+				// Dividing by w <= 0 gives inf/NaN or mirrored coordinates. Primitives touching
+				// such a vertex are skipped below, so just store defined values.
+				screenVerts[i].x = 0;
+				screenVerts[i].y = 0;
+				screenVerts[i].z = 0;
+				continue;
+			}
+			allBehind = false;
+
+			// Perspective divide. There's no real clipping against the w=0 plane.
+			proj[0] /= w;
+			proj[1] /= w;
+			proj[2] /= w;
+
+			// Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport
+			// and offset params.
+			float screen[3];
+			screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
+			screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
+			screen[2] = (proj[2] * viewportScaleZ + viewportZ);
+			if (screen[2] < 0.0f) {
+				screen[2] = 0.0f;
+			}
+			if (screen[2] >= 65535.0f) {
+				screen[2] = 65535.0f;
+			}
+			screenVerts[i].x = (int)screen[0];
+			screenVerts[i].y = (int)screen[1];
+			screenVerts[i].z = (uint16_t)screen[2];
+		}
+		if (allBehind) {
+			// Cull the whole draw.
+			return;
+		}
+	} else {
+		for (int i = 0; i < count; i++) {
+			screenVerts[i].x = (int)verts[i * 3 + 0] << 4;
+			screenVerts[i].y = (int)verts[i * 3 + 1] << 4;
+			screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f);
+			// The scratch memory is not cleared, so this must be written: it's read below.
+			screenVerts[i].behind = 0;
+		}
+	}
+
+	// Then we need to stitch primitives from strips, etc etc...
+	// For now we'll just do it tri by tri. Later let's be more efficient.
+
+	switch (prim) {
+	case GE_PRIM_RECTANGLES:
+		// Each rectangle is made from two consecutive vertices.
+		for (int i = 0; i + 1 < count; i += 2) {
+			const ScreenVert &v0 = screenVerts[i];
+			const ScreenVert &v1 = screenVerts[i + 1];
+			if (v0.behind || v1.behind) {
+				continue;
+			}
+			uint16_t z = v1.z;  // depth from second vertex
+			// We remove the subpixel information here, and clamp to the allowed area since
+			// DepthRasterRect does no clipping of its own.
+			int rx1 = std::min(std::max(v0.x >> 4, x1), x2);
+			int ry1 = std::min(std::max(v0.y >> 4, y1), y2);
+			int rx2 = std::min(std::max(v1.x >> 4, x1), x2);
+			int ry2 = std::min(std::max(v1.y >> 4, y1), y2);
+			DepthRasterRect(depth, depthStride, rx1, ry1, rx2, ry2, z, compareMode);
+		}
+		break;
+	case GE_PRIM_TRIANGLES:
+		for (int i = 0; i < count / 3; i++) {
+			if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) {
+				continue;
+			}
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode);
+		}
+		break;
+	case GE_PRIM_TRIANGLE_STRIP:
+	{
+		// wind alternates between 2 and 1, so that the vertex order flips for every other triangle.
+		int wind = 2;
+		for (int i = 0; i < count - 2; i++) {
+			int i0 = i;
+			int i1 = i + wind;
+			wind ^= 3;
+			int i2 = i + wind;
+			if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) {
+				continue;
+			}
+			ScreenVert v[3];
+			v[0] = screenVerts[i0];
+			v[1] = screenVerts[i1];
+			v[2] = screenVerts[i2];
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode);
+		}
+		break;
+	}
+	}
+}
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
new file mode 100644
index 0000000000..01fa60e257
--- /dev/null
+++ b/GPU/Common/DepthRaster.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "Common/CommonTypes.h"
+#include "GPU/ge_constants.h"
+
+// Specialized, very limited depth-only rasterizer.
+// Meant to run in parallel with hardware rendering, in games that read back the depth buffer
+// for effects like lens flare.
+// So, we can be quite inaccurate without any issues, and skip a lot of functionality.
+
+class VertexDecoder;
+void DepthRasterPrim(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise);
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 0f8ab8a751..595ab929aa 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -158,6 +158,11 @@ public:
 		_dbg_assert_(numDrawVerts_ == 0 && numDrawInds_ == 0);
 	}
 
+	// temporary hack
+	uint8_t *GetTempSpace() {
+		return decoded_ + 12 * 65536;
+	}
+
 protected:
 	virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
 	void UpdatePlanes();
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 2793ad80ee..4b905f03c9 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -122,7 +122,7 @@ public:
 // Reads decoded vertex formats in a convenient way. For software transform and debugging.
 class VertexReader {
 public:
-	VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
+	VertexReader(const u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
 
 	void ReadPos(float pos[3]) const {
 		// Only DEC_FLOAT_3 is supported.
@@ -297,8 +297,8 @@ public: } private: - u8 *base_; - u8 *data_; + const u8 *base_; + const u8 *data_; DecVtxFormat decFmt_; int vtype_; }; diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj index 5cb3ea9e62..c27d083549 100644 --- a/GPU/GPU.vcxproj +++ b/GPU/GPU.vcxproj @@ -346,6 +346,7 @@ + @@ -468,6 +469,7 @@ + diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters index 610ba94cbe..1529b974c1 100644 --- a/GPU/GPU.vcxproj.filters +++ b/GPU/GPU.vcxproj.filters @@ -279,6 +279,9 @@ Debugger + + Common + @@ -554,6 +557,9 @@ Debugger + + Common + diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index 9b5389c875..f5383cf6be 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -13,6 +13,7 @@ #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/TextureCacheCommon.h" #include "GPU/Common/FramebufferManagerCommon.h" +#include "GPU/Common/DepthRaster.h" struct CommonCommandTableEntry { uint8_t cmd; @@ -1039,6 +1040,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) { canExtend = false; + } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { + DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), + verts, inds, prim, count, decoder, vertTypeID, false); } onePassed = true; } else { @@ -1117,6 +1122,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) { canExtend = false; + } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { + DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + 
gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), + verts, inds, newPrim, count, decoder, vertTypeID, clockwise); } // As soon as one passes, assume we don't need to check the rest of this batch. onePassed = true; diff --git a/UI/ImDebugger/ImGe.h b/UI/ImDebugger/ImGe.h index 227b0f0c1e..41dc6a8b1f 100644 --- a/UI/ImDebugger/ImGe.h +++ b/UI/ImDebugger/ImGe.h @@ -71,14 +71,14 @@ struct ImGePixelViewer : public PixelLookup { } bool FormatValueAt(char *buf, size_t bufSize, int x, int y) const override; - uint32_t addr = 0x04000000; + uint32_t addr = 0x04110000; uint16_t stride = 512; uint16_t width = 480; uint16_t height = 272; - GEBufferFormat format = GE_FORMAT_565; + GEBufferFormat format = GE_FORMAT_DEPTH16; bool useAlpha = false; bool showAlpha = false; - float scale = 1.0f; + float scale = 20.0f; private: void UpdateTexture(Draw::DrawContext *draw); diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj index 7bb4b346bd..a7ba27a140 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj @@ -109,6 +109,7 @@ + @@ -177,6 +178,7 @@ + @@ -261,4 +263,4 @@ - + \ No newline at end of file diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters index 84b4c5d396..31d14b549f 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters @@ -80,6 +80,7 @@ Debugger + @@ -163,10 +164,11 @@ Debugger + {49bcf7f6-518a-4ecd-af55-bda3a344efe7} - + \ No newline at end of file diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 10ab9a5f77..dbd8809788 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -530,6 +530,7 @@ EXEC_AND_LIB_FILES := \ $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \ $(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \ $(SRC)/GPU/Common/DepthBufferCommon.cpp \ + $(SRC)/GPU/Common/DepthRaster.cpp \ $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \ 
$(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \ $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \ diff --git a/assets/compat.ini b/assets/compat.ini index 8a374753ef..0452c5b854 100644 --- a/assets/compat.ini +++ b/assets/compat.ini @@ -1228,8 +1228,10 @@ ULJS19067 = true ULAS42247 = true ULAS42318 = true +[SoftwareRasterDepth] + [DisableFirstFrameReadback] -# Wipeout Pure: Temporary workaround for lens flare flicker. See #13344 +# Wipeout Pure UCUS98612 = true UCJS10007 = true UCES00001 = true diff --git a/libretro/Makefile.common b/libretro/Makefile.common index 804a1d7219..c1cb5a454f 100644 --- a/libretro/Makefile.common +++ b/libretro/Makefile.common @@ -543,6 +543,7 @@ SOURCES_CXX += \ $(GPUDIR)/Common/TextureScalerCommon.cpp \ $(GPUDIR)/Common/SoftwareTransformCommon.cpp \ $(GPUDIR)/Common/DepthBufferCommon.cpp \ + $(GPUDIR)/Common/DepthRaster.cpp \ $(GPUDIR)/Common/StencilCommon.cpp \ $(GPUDIR)/Software/TransformUnit.cpp \ $(GPUDIR)/Software/SoftGpu.cpp \