Add DepthRaster.cpp/h. Rasterize depth rectangles, some triangles

This commit is contained in:
Henrik Rydgård 2024-12-17 12:58:33 +01:00
parent b442183259
commit c5ad81e3d5
17 changed files with 422 additions and 10 deletions

View file

@ -1906,6 +1906,8 @@ set(GPU_SOURCES
GPU/Common/Draw2D.cpp
GPU/Common/Draw2D.h
GPU/Common/DepthBufferCommon.cpp
GPU/Common/DepthRaster.cpp
GPU/Common/DepthRaster.h
GPU/Common/TextureShaderCommon.cpp
GPU/Common/TextureShaderCommon.h
GPU/Common/DepalettizeShaderCommon.cpp

View file

@ -5,4 +5,3 @@
#pragma once
#include "Common/Math/SIMDHeaders.h"

View file

@ -149,6 +149,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
CheckSetting(iniFile, gameID, "DisableMemcpySlicing", &flags_.DisableMemcpySlicing);
CheckSetting(iniFile, gameID, "ForceEnableGPUReadback", &flags_.ForceEnableGPUReadback);
CheckSetting(iniFile, gameID, "UseFFMPEGFindStreamInfo", &flags_.UseFFMPEGFindStreamInfo);
CheckSetting(iniFile, gameID, "SoftwareRasterDepth", &flags_.SoftwareRasterDepth);
}
void Compatibility::CheckVRSettings(IniFile &iniFile, const std::string &gameID) {

View file

@ -112,6 +112,7 @@ struct CompatFlags {
bool DisableMemcpySlicing;
bool ForceEnableGPUReadback;
bool UseFFMPEGFindStreamInfo;
bool SoftwareRasterDepth;
};
struct VRCompat {

367
GPU/Common/DepthRaster.cpp Normal file
View file

@ -0,0 +1,367 @@
#include <algorithm>
#include "Common/Math/CrossSIMD.h"
#include "GPU/Common/DepthRaster.h"
#include "GPU/Math3D.h"
#include "Common/Math/math_util.h"
#include "GPU/Common/VertexDecoderCommon.h"
// TODO: Should respect the scissor rect.
// A vertex position after conversion to screen space, as produced by DepthRasterPrim
// and consumed by the rectangle/triangle rasterizers below.
struct ScreenVert {
	// Screen position. DepthRasterPrim stores these in 1/16th-pixel units.
	int x, y;
	// Depth, already clamped to the 16-bit range of the depth buffer.
	uint16_t z;
	// Non-zero when the vertex was found to have w <= 0 during projection.
	uint16_t behind;
};
void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
// Swap coordinates if needed, we don't back-face-cull rects.
// We also ignore the UV rotation here.
if (x1 > x2) {
std::swap(x1, x2);
}
if (y1 > y2) {
std::swap(y1, y2);
}
if (x1 == x2 || y1 == y2) {
return;
}
__m128i valueX8 = _mm_set1_epi16(depthValue);
#if PPSSPP_ARCH(SSE2)
for (int y = y1; y < y2; y++) {
__m128i *ptr = (__m128i *)(dest + stride * y + x1);
int w = x2 - x1;
switch (depthCompare) {
case GE_COMP_ALWAYS:
while (w >= 8) {
_mm_storeu_si128(ptr, valueX8);
ptr++;
w -= 8;
}
break;
// TODO: Trailer
case GE_COMP_NEVER:
break;
default:
// TODO
break;
}
}
#elif PPSSPP_ARCH(ARM64_NEON)
#else
// Do nothing for now
#endif
}
using namespace Math3D;
// Integer 2D point, built from floats by adding 0.5 and truncating toward zero.
struct int2 {
	int x, y;
	int2(float a, float b)
		: x((int)(a + 0.5f)), y((int)(b + 0.5f)) {
	}
};
// Adapted from Intel's depth rasterizer example.
// Started with the scalar version, will SIMD-ify later.
void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) {
int tileStartX = x1;
int tileEndX = x2;
int tileStartY = y1;
int tileEndY = y2;
// Convert to whole pixels for now. Later subpixel precision.
ScreenVert verts[3];
verts[0].x = vertsSub[0].x >> 4;
verts[0].y = vertsSub[0].y >> 4;
verts[0].z = vertsSub[0].z;
verts[1].x = vertsSub[2].x >> 4;
verts[1].y = vertsSub[2].y >> 4;
verts[1].z = vertsSub[2].z;
verts[2].x = vertsSub[1].x >> 4;
verts[2].y = vertsSub[1].y >> 4;
verts[2].z = vertsSub[1].z;
// use fixed-point only for X and Y. Avoid work for Z and W.
int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX) & int(0xFFFFFFFE);
int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX);
int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY) & int(0xFFFFFFFE);
int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY);
if (endX == startX || endY == startY) {
// No pixels
return;
}
// Fab(x, y) = Ax + By + C = 0
// Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
// Compute A = (ya - yb) for the 3 line segments that make up each triangle
int A0 = verts[1].y - verts[2].y;
int A1 = verts[2].y - verts[0].y;
int A2 = verts[0].y - verts[1].y;
// Compute B = (xb - xa) for the 3 line segments that make up each triangle
int B0 = verts[2].x - verts[1].x;
int B1 = verts[0].x - verts[2].x;
int B2 = verts[1].x - verts[0].x;
// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
int C0 = verts[1].x * verts[2].y - verts[2].x * verts[1].y;
int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;
// Compute triangle area
int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
if (triArea <= 0) {
// Too small to rasterize or backface culled
// NOTE: Just disabling this check won't enable two-sided rendering.
// Since it's not that common, let's just queue the triangles with both windings.
return;
}
float oneOverTriArea = (1.0f / float(triArea));
float zz[3];
for (int vv = 0; vv < 3; vv++) {
zz[vv] = (float)verts[vv].z * oneOverTriArea;
}
int rowIdx = (startY * stride + startX);
int col = startX;
int row = startY;
// Calculate slopes at starting corner.
int alpha0 = (A0 * col) + (B0 * row) + C0;
int beta0 = (A1 * col) + (B1 * row) + C1;
int gama0 = (A2 * col) + (B2 * row) + C2;
// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
for (int r = startY; r < endY; r++,
row++,
rowIdx += stride,
alpha0 += B0,
beta0 += B1,
gama0 += B2)
{
// Compute barycentric coordinates
int idx = rowIdx;
int alpha = alpha0;
int beta = beta0;
int gama = gama0;
for (int c = startX; c < endX; c++,
idx++,
alpha += A0,
beta += A1,
gama += A2)
{
int mask = alpha >= 0 && beta >= 0 && gama >= 0;
// Early out if all of this quad's pixels are outside the triangle.
if (!mask) {
continue;
}
// Compute barycentric-interpolated depth
float depth = alpha * zz[0] + beta * zz[1] + gama * zz[2];
float previousDepthValue = (float)depthBuf[idx];
int depthMask;
switch (compareMode) {
case GE_COMP_EQUAL: depthMask = depth == previousDepthValue; break;
case GE_COMP_LESS: depthMask = depth < previousDepthValue; break;
case GE_COMP_LEQUAL: depthMask = depth <= previousDepthValue; break;
case GE_COMP_GEQUAL: depthMask = depth >= previousDepthValue; break;
case GE_COMP_GREATER: depthMask = depth > previousDepthValue; break;
case GE_COMP_NOTEQUAL: depthMask = depth != previousDepthValue; break;
case GE_COMP_ALWAYS:
default:
depthMask = 1;
break;
}
int finalMask = mask & depthMask;
depth = finalMask == 1 ? depth : previousDepthValue;
depthBuf[idx] = (u16)depth;
} //for each column
} // for each row
}
// We ignore lots of primitive types for now.
void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData,
const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) {
GEComparison compareMode = gstate.getDepthTestFunction();
if (gstate.isModeClear()) {
if (!gstate.isClearModeDepthMask()) {
return;
}
compareMode = GE_COMP_ALWAYS;
} else {
if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
return;
}
switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
case GE_PRIM_LINES:
case GE_PRIM_LINE_STRIP:
case GE_PRIM_POINTS:
return;
default:
break;
}
// TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without
// running the full decoder.
if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) {
return;
}
bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0;
// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
float *verts = (float *)bufferData;
ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8);
// Simple, most common case.
int vertexStride = dec->VertexSize();
int offset = dec->posoff;
float factor = 1.0f;
switch (vertTypeID & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
if (!isThroughMode) {
factor = 1.0f / 128.0f;
}
for (int i = 0; i < count; i++) {
const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
for (int j = 0; j < 3; j++) {
verts[i * 3 + j] = data[j] * factor;
}
}
break;
case GE_VTYPE_POS_16BIT:
if (!isThroughMode) {
factor = 1.0f / 32768.0f;
}
for (int i = 0; i < count; i++) {
const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
for (int j = 0; j < 3; j++) {
verts[i * 3 + j] = data[j] * factor;
}
}
break;
case GE_VTYPE_POS_FLOAT:
for (int i = 0; i < count; i++)
memcpy(&verts[i * 3], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
break;
}
// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
// TODO: This is very suboptimal. This should be one matrix multiplication per vertex.
float viewportX = gstate.getViewportXCenter();
float viewportY = gstate.getViewportYCenter();
float viewportZ = gstate.getViewportZCenter();
float viewportScaleX = gstate.getViewportXScale();
float viewportScaleY = gstate.getViewportYScale();
float viewportScaleZ = gstate.getViewportZScale();
bool allBehind = true;
for (int i = 0; i < count; i++) {
float world[3];
float view[3];
float proj[4];
Vec3ByMatrix43(world, verts + i * 3, gstate.worldMatrix);
Vec3ByMatrix43(view, world, gstate.viewMatrix);
Vec3ByMatrix44(proj, view, gstate.projMatrix); // TODO: Include adjustments to the proj matrix?
float w = proj[3];
bool inFront = w > 0.0f;
screenVerts[i].behind = !inFront;
if (inFront) {
allBehind = false;
}
// Clip to the w=0 plane.
proj[0] /= w;
proj[1] /= w;
proj[2] /= w;
// Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport
// and offset params.
float screen[3];
screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
screen[2] = (proj[2] * viewportScaleZ + viewportZ);
if (screen[2] < 0.0f) {
screen[2] = 0.0f;
}
if (screen[2] >= 65535.0f) {
screen[2] = 65535.0f;
}
screenVerts[i].x = screen[0];
screenVerts[i].y = screen[1];
screenVerts[i].z = screen[2];
}
if (allBehind) {
// Cull the whole draw.
return;
}
} else {
for (int i = 0; i < count; i++) {
screenVerts[i].x = (int)verts[i * 3 + 0] << 4;
screenVerts[i].y = (int)verts[i * 3 + 1] << 4;
screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f);
}
}
// Then we need to stitch primitives from strips, etc etc...
// For now we'll just do it tri by tri. Later let's be more efficient.
switch (prim) {
case GE_PRIM_RECTANGLES:
for (int i = 0; i < count / 2; i++) {
uint16_t z = screenVerts[i + 1].z; // depth from second vertex
// We remove the subpixel information here.
DepthRasterRect(depth, depthStride, screenVerts[i].x >> 4, screenVerts[i].y >> 4, screenVerts[i + 1].x >> 4, screenVerts[i + 1].y >> 4,
z, compareMode);
}
break;
case GE_PRIM_TRIANGLES:
for (int i = 0; i < count / 3; i++) {
if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) {
continue;
}
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode);
}
break;
case GE_PRIM_TRIANGLE_STRIP:
{
int wind = 2;
for (int i = 0; i < count - 2; i++) {
int i0 = i;
int i1 = i + wind;
wind ^= 3;
int i2 = i + wind;
if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) {
continue;
}
ScreenVert v[3];
v[0] = screenVerts[i0];
v[1] = screenVerts[i1];
v[2] = screenVerts[i2];
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode);
}
break;
}
}
}

12
GPU/Common/DepthRaster.h Normal file
View file

@ -0,0 +1,12 @@
#pragma once
#include "Common/CommonTypes.h"
#include "GPU/ge_constants.h"
// Specialized, very limited depth-only rasterizer.
// Meant to run in parallel with hardware rendering, in games that read back the depth buffer
// for effects like lens flare.
// So, we can be quite inaccurate without any issues, and skip a lot of functionality.
class VertexDecoder;
// Rasterizes the depth of a single draw call into the 16-bit depth buffer at dest.
// (x1, y1) and (x2, y2) are the corners of the clip (scissor) rectangle, in that order.
// bufferData is scratch space for the converted vertices.
void DepthRasterPrim(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise);

View file

@ -158,6 +158,11 @@ public:
_dbg_assert_(numDrawVerts_ == 0 && numDrawInds_ == 0);
}
// temporary hack
uint8_t *GetTempSpace() {
	// Scratch area starting 12 * 65536 bytes into decoded_.
	// NOTE(review): presumably this skips the part of decoded_ that is in use — confirm
	// against the allocation size of decoded_.
	uint8_t *tempSpace = decoded_;
	tempSpace += 12 * 65536;
	return tempSpace;
}
protected:
virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
void UpdatePlanes();

View file

@ -122,7 +122,7 @@ public:
// Reads decoded vertex formats in a convenient way. For software transform and debugging.
class VertexReader {
public:
VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
VertexReader(const u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
void ReadPos(float pos[3]) const {
// Only DEC_FLOAT_3 is supported.
@ -297,8 +297,8 @@ public:
}
private:
u8 *base_;
u8 *data_;
const u8 *base_;
const u8 *data_;
DecVtxFormat decFmt_;
int vtype_;
};

View file

@ -346,6 +346,7 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\ext\xbrz\xbrz.h" />
<ClInclude Include="Common\DepthRaster.h" />
<ClInclude Include="Common\ReplacedTexture.h" />
<ClInclude Include="Common\TextureReplacer.h" />
<ClInclude Include="Common\TextureShaderCommon.h" />
@ -468,6 +469,7 @@
<ItemGroup>
<ClCompile Include="..\ext\xbrz\xbrz.cpp" />
<ClCompile Include="Common\DepthBufferCommon.cpp" />
<ClCompile Include="Common\DepthRaster.cpp" />
<ClCompile Include="Common\ReplacedTexture.cpp" />
<ClCompile Include="Common\TextureReplacer.cpp" />
<ClCompile Include="Common\TextureShaderCommon.cpp" />

View file

@ -279,6 +279,9 @@
<ClInclude Include="Debugger\State.h">
<Filter>Debugger</Filter>
</ClInclude>
<ClInclude Include="Common\DepthRaster.h">
<Filter>Common</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="Math3D.cpp">
@ -554,6 +557,9 @@
<ClCompile Include="Debugger\State.cpp">
<Filter>Debugger</Filter>
</ClCompile>
<ClCompile Include="Common\DepthRaster.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<FxCompile Include="..\assets\shaders\tex_4xbrz.csh">

View file

@ -13,6 +13,7 @@
#include "GPU/Common/DrawEngineCommon.h"
#include "GPU/Common/TextureCacheCommon.h"
#include "GPU/Common/FramebufferManagerCommon.h"
#include "GPU/Common/DepthRaster.h"
struct CommonCommandTableEntry {
uint8_t cmd;
@ -1039,6 +1040,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) {
canExtend = false;
} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
verts, inds, prim, count, decoder, vertTypeID, false);
}
onePassed = true;
} else {
@ -1117,6 +1122,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) {
canExtend = false;
} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
verts, inds, newPrim, count, decoder, vertTypeID, clockwise);
}
// As soon as one passes, assume we don't need to check the rest of this batch.
onePassed = true;

View file

@ -71,14 +71,14 @@ struct ImGePixelViewer : public PixelLookup {
}
bool FormatValueAt(char *buf, size_t bufSize, int x, int y) const override;
uint32_t addr = 0x04000000;
uint32_t addr = 0x04110000;
uint16_t stride = 512;
uint16_t width = 480;
uint16_t height = 272;
GEBufferFormat format = GE_FORMAT_565;
GEBufferFormat format = GE_FORMAT_DEPTH16;
bool useAlpha = false;
bool showAlpha = false;
float scale = 1.0f;
float scale = 20.0f;
private:
void UpdateTexture(Draw::DrawContext *draw);

View file

@ -109,6 +109,7 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\GPU\Common\DepthRaster.h" />
<ClInclude Include="..\..\GPU\Common\ReplacedTexture.h" />
<ClInclude Include="..\..\GPU\Common\TextureReplacer.h" />
<ClInclude Include="..\..\GPU\Common\TextureShaderCommon.h" />
@ -177,6 +178,7 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\GPU\Common\DepthBufferCommon.cpp" />
<ClCompile Include="..\..\GPU\Common\DepthRaster.cpp" />
<ClCompile Include="..\..\GPU\Common\ReplacedTexture.cpp" />
<ClCompile Include="..\..\GPU\Common\TextureReplacer.cpp" />
<ClCompile Include="..\..\GPU\Common\TextureShaderCommon.cpp" />

View file

@ -80,6 +80,7 @@
<ClCompile Include="..\..\GPU\Debugger\GECommandTable.cpp">
<Filter>Debugger</Filter>
</ClCompile>
<ClCompile Include="..\..\GPU\Common\DepthRaster.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\GPU\Common\DepalettizeShaderCommon.h" />
@ -163,6 +164,7 @@
<ClInclude Include="..\..\GPU\Debugger\GECommandTable.h">
<Filter>Debugger</Filter>
</ClInclude>
<ClInclude Include="..\..\GPU\Common\DepthRaster.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="Debugger">

View file

@ -530,6 +530,7 @@ EXEC_AND_LIB_FILES := \
$(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \
$(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \
$(SRC)/GPU/Common/DepthBufferCommon.cpp \
$(SRC)/GPU/Common/DepthRaster.cpp \
$(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \
$(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \
$(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \

View file

@ -1228,8 +1228,10 @@ ULJS19067 = true
ULAS42247 = true
ULAS42318 = true
[SoftwareRasterDepth]
[DisableFirstFrameReadback]
# Wipeout Pure: Temporary workaround for lens flare flicker. See #13344
# Wipeout Pure
UCUS98612 = true
UCJS10007 = true
UCES00001 = true

View file

@ -543,6 +543,7 @@ SOURCES_CXX += \
$(GPUDIR)/Common/TextureScalerCommon.cpp \
$(GPUDIR)/Common/SoftwareTransformCommon.cpp \
$(GPUDIR)/Common/DepthBufferCommon.cpp \
$(GPUDIR)/Common/DepthRaster.cpp \
$(GPUDIR)/Common/StencilCommon.cpp \
$(GPUDIR)/Software/TransformUnit.cpp \
$(GPUDIR)/Software/SoftGpu.cpp \