Reorganize the depth vertex pipeline for future optimizations

This commit is contained in:
Henrik Rydgård 2024-12-20 10:30:23 +01:00
parent dd31518272
commit bdb5f3a91b
6 changed files with 218 additions and 203 deletions

View file

@ -8,13 +8,6 @@
#include "Common/Math/math_util.h"
#include "GPU/Common/VertexDecoderCommon.h"
struct ScreenVert {
int x;
int y;
uint16_t z;
uint16_t behind;
};
void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
// Swap coordinates if needed, we don't back-face-cull rects.
// We also ignore the UV rotation here.
@ -88,7 +81,7 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
// Adapted from Intel's depth rasterizer example.
// Started with the scalar version, will SIMD-ify later.
// x1/y1 etc are the scissor rect.
void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) {
void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const DepthScreenVertex vertsSub[3], GEComparison compareMode) {
int tileStartX = x1;
int tileEndX = x2;
@ -100,7 +93,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
// are slow on SSE2.
// Convert to whole pixels for now. Later subpixel precision.
ScreenVert verts[3];
DepthScreenVertex verts[3];
verts[0].x = vertsSub[0].x;
verts[0].y = vertsSub[0].y;
verts[0].z = vertsSub[0].z;
@ -218,9 +211,111 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
} // for each row
}
// We ignore lots of primitive types for now.
void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData,
const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) {
void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID) {
// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
int vertexStride = dec->VertexSize();
int offset = dec->posoff;
float temp[3];
switch (vertTypeID & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < count; i++) {
const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_16BIT:
for (int i = 0; i < count; i++) {
const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_FLOAT:
for (int i = 0; i < count; i++) {
const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
Vec3ByMatrix44(dest + i * 4, data, worldviewproj);
}
break;
}
}
void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count) {
for (int i = 0; i < count; i++) {
screenVerts[i].x = (int)transformed[i].pos[0];
screenVerts[i].y = (int)transformed[i].pos[1];
screenVerts[i].z = (u16)transformed[i].pos[2];
}
}
int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) {
bool cullEnabled = gstate.isCullEnabled();
const float viewportX = gstate.getViewportXCenter();
const float viewportY = gstate.getViewportYCenter();
const float viewportZ = gstate.getViewportZCenter();
const float viewportScaleX = gstate.getViewportXScale();
const float viewportScaleY = gstate.getViewportYScale();
const float viewportScaleZ = gstate.getViewportZScale();
bool cullCCW = false;
// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
int outCount = 0;
for (int i = 0; i < count; i += 3) {
const float *verts[3] = {
transformed + indexBuffer[i] * 4,
transformed + indexBuffer[i + 1] * 4,
transformed + indexBuffer[i + 2] * 4,
};
// Check if any vertex is behind the 0 plane.
if (verts[0][3] < 0.0f || verts[1][3] < 0.0f || verts[2][3] < 0.0f) {
// Ditch this triangle. Later we should clip here.
continue;
}
for (int c = 0; c < 3; c++) {
const float *src = verts[c];
float invW = 1.0f / src[3];
float x = src[0] * invW;
float y = src[1] * invW;
float z = src[2] * invW;
float screen[3];
screen[0] = (x * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
screen[1] = (y * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
screen[2] = (z * viewportScaleZ + viewportZ);
if (screen[2] < 0.0f) {
screen[2] = 0.0f;
}
if (screen[2] >= 65535.0f) {
screen[2] = 65535.0f;
}
screenVerts[outCount].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here.
screenVerts[outCount].y = screen[1] * (1.0f / 16.0f);
screenVerts[outCount].z = screen[2];
outCount++;
}
}
return outCount;
}
// Rasterizes screen-space vertices.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) {
// Prim should now be either TRIANGLES or RECTs.
_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
GEComparison compareMode = gstate.getDepthTestFunction();
if (gstate.isModeClear()) {
@ -233,164 +328,6 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
return;
}
switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
case GE_PRIM_LINES:
case GE_PRIM_LINE_STRIP:
case GE_PRIM_POINTS:
return;
default:
break;
}
// TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without
// running the full decoder.
if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) {
return;
}
bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0;
bool cullEnabled = false;
bool cullCCW = false;
// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
float *transformed = (float *)bufferData;
ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8);
// Simple, most common case.
int vertexStride = dec->VertexSize();
int offset = dec->posoff;
// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
float world[16];
float view[16];
float worldview[16];
float worldviewproj[16];
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
Matrix4ByMatrix4(worldview, world, view);
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix?
cullEnabled = gstate.isCullEnabled();
float viewportX = gstate.getViewportXCenter();
float viewportY = gstate.getViewportYCenter();
float viewportZ = gstate.getViewportZCenter();
float viewportScaleX = gstate.getViewportXScale();
float viewportScaleY = gstate.getViewportYScale();
float viewportScaleZ = gstate.getViewportZScale();
bool allBehind = true;
float temp[3];
for (int i = 0; i < count; i++) {
switch (vertTypeID & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < count; i++) {
const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_16BIT:
for (int i = 0; i < count; i++) {
const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_FLOAT:
for (int i = 0; i < count; i++) {
const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
Vec3ByMatrix44(transformed + i * 4, data, worldviewproj);
}
break;
}
}
for (int i = 0; i < count; i++) {
float proj[4];
memcpy(proj, transformed + i * 4, 4 * sizeof(float));
float w = proj[3];
bool inFront = w > 0.0f;
screenVerts[i].behind = !inFront;
if (inFront) {
allBehind = false;
}
// Clip to the w=0 plane.
proj[0] /= w;
proj[1] /= w;
proj[2] /= w;
// Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport
// and offset params.
float screen[3];
screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
screen[2] = (proj[2] * viewportScaleZ + viewportZ);
if (screen[2] < 0.0f) {
screen[2] = 0.0f;
}
if (screen[2] >= 65535.0f) {
screen[2] = 65535.0f;
}
screenVerts[i].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here.
screenVerts[i].y = screen[1] * (1.0f / 16.0f);
screenVerts[i].z = screen[2];
}
if (allBehind) {
// Cull the whole draw.
return;
}
} else {
float factor = 1.0f;
switch (vertTypeID & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < count; i++) {
const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
for (int j = 0; j < 3; j++) {
transformed[i * 4 + j] = data[j] * factor;
}
transformed[i * 4 + 3] = 1.0f;
}
break;
case GE_VTYPE_POS_16BIT:
for (int i = 0; i < count; i++) {
const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
for (int j = 0; j < 3; j++) {
transformed[i * 4 + j] = data[j] * factor;
}
transformed[i * 4 + 3] = 1.0f;
}
break;
case GE_VTYPE_POS_FLOAT:
for (int i = 0; i < count; i++) {
memcpy(&transformed[i * 4], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
transformed[i * 4 + 3] = 1.0f;
}
break;
}
for (int i = 0; i < count; i++) {
screenVerts[i].x = (int)transformed[i * 4 + 0];
screenVerts[i].y = (int)transformed[i * 4 + 1];
screenVerts[i].z = (u16)clamp_value(transformed[i * 4 + 2], 0.0f, 65535.0f);
}
}
// Then we need to stitch primitives from strips, etc etc...
// For now we'll just do it tri by tri. Later let's be more efficient.
switch (prim) {
case GE_PRIM_RECTANGLES:
for (int i = 0; i < count / 2; i++) {
@ -403,30 +340,10 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
break;
case GE_PRIM_TRIANGLES:
for (int i = 0; i < count / 3; i++) {
if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) {
continue;
}
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode);
}
break;
case GE_PRIM_TRIANGLE_STRIP:
{
int wind = 2;
for (int i = 0; i < count - 2; i++) {
int i0 = i;
int i1 = i + wind;
wind ^= 3;
int i2 = i + wind;
if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) {
continue;
}
ScreenVert v[3];
v[0] = screenVerts[i0];
v[1] = screenVerts[i1];
v[2] = screenVerts[i2];
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode);
}
break;
}
default:
_dbg_assert_(false);
}
}

View file

@ -3,10 +3,21 @@
#include "Common/CommonTypes.h"
#include "GPU/ge_constants.h"
struct DepthScreenVertex {
int x;
int y;
uint16_t z;
};
// Specialized, very limited depth-only rasterizer.
// Meant to run in parallel with hardware rendering, in games that read back the depth buffer
// for effects like lens flare.
// So, we can be quite inaccurate without any issues, and skip a lot of functionality.
class VertexDecoder;
void DepthRasterPrim(uint16_t *dest, int stride, int x1, int x2, int y1, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise);
struct TransformedVertex;
int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count);
void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID);
void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count);
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count);

View file

@ -23,9 +23,11 @@
#include "Common/LogReporting.h"
#include "Common/Math/SIMDHeaders.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Core/System.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
#include "GPU/Common/SplineCommon.h"
#include "GPU/Common/DepthRaster.h"
#include "GPU/Common/VertexDecoderCommon.h"
#include "GPU/Common/SoftwareTransformCommon.h"
#include "GPU/ge_constants.h"
@ -34,7 +36,9 @@
#define QUAD_INDICES_MAX 65536
enum {
TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex)
TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex),
DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4,
DEPTH_SCREENVERTS_SIZE = VERTEX_BUFFER_MAX * sizeof(DepthScreenVertex),
};
DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
@ -46,6 +50,12 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
decoded_ = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
indexGen.Setup(decIndex_);
useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth;
if (useDepthRaster_) {
depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
depthScreenVerts_ = (DepthScreenVertex *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
}
}
DrawEngineCommon::~DrawEngineCommon() {
@ -53,6 +63,10 @@ DrawEngineCommon::~DrawEngineCommon() {
FreeMemoryPages(decIndex_, DECODED_INDEX_BUFFER_SIZE);
FreeMemoryPages(transformed_, TRANSFORMED_VERTEX_BUFFER_SIZE);
FreeMemoryPages(transformedExpanded_, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE);
if (depthTransformed_) {
FreeMemoryPages(depthTransformed_, DEPTH_TRANSFORMED_SIZE);
FreeMemoryPages(depthScreenVerts_, DEPTH_SCREENVERTS_SIZE);
}
delete decJitCache_;
decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) {
delete decoder;
@ -886,3 +900,61 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
return false;
}
}
void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID) {
switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
case GE_PRIM_LINES:
case GE_PRIM_LINE_STRIP:
case GE_PRIM_POINTS:
return;
default:
break;
}
if (vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) {
return;
}
float world[16];
float view[16];
float worldview[16];
float worldviewproj[16];
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
Matrix4ByMatrix4(worldview, world, view);
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix?
// Decode.
int numDec = 0;
for (int i = 0; i < numDrawVerts_; i++) {
DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, drawVerts_[i].verts, drawVerts_[i].vertexCount, dec, vertTypeID);
numDec += drawVerts_[i].vertexCount;
}
// Clip and triangulate using the index buffer.
int outVertCount = DepthRasterClipIndexedTriangles(depthScreenVerts_, depthTransformed_, decIndex_, numDec);
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
depthScreenVerts_, outVertCount);
}
void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) {
switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
case GE_PRIM_LINES:
case GE_PRIM_LINE_STRIP:
case GE_PRIM_POINTS:
return;
default:
break;
}
DepthRasterConvertTransformed(depthScreenVerts_, inVerts, count);
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
depthScreenVerts_, count);
}

View file

@ -27,6 +27,7 @@
#include "GPU/Common/GPUStateUtils.h"
#include "GPU/Common/IndexGenerator.h"
#include "GPU/Common/VertexDecoderCommon.h"
#include "GPU/Common/DepthRaster.h"
class VertexDecoder;
@ -174,6 +175,9 @@ protected:
void ApplyFramebufferRead(FBOTexState *fboTexState);
void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID);
void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count);
static inline int IndexSize(u32 vtype) {
const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);
if (indexType == GE_VTYPE_IDX_16BIT) {
@ -228,6 +232,11 @@ protected:
}
inline bool CollectedPureDraw() const {
// TODO: Do something faster.
if (useDepthRaster_) {
return false;
}
switch (seenPrims_) {
case 1 << GE_PRIM_TRIANGLE_STRIP:
return !anyCCWOrIndexed_ && numDrawInds_ == 1;
@ -343,4 +352,10 @@ protected:
bool offsetOutsideEdge_;
GPUCommon *gpuCommon_;
// Software depth raster
bool useDepthRaster_ = false;
float *depthTransformed_ = nullptr;
DepthScreenVertex *depthScreenVerts_ = nullptr;
};

View file

@ -13,7 +13,6 @@
#include "GPU/Common/DrawEngineCommon.h"
#include "GPU/Common/TextureCacheCommon.h"
#include "GPU/Common/FramebufferManagerCommon.h"
#include "GPU/Common/DepthRaster.h"
struct CommonCommandTableEntry {
uint8_t cmd;
@ -1040,10 +1039,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) {
canExtend = false;
} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
verts, inds, prim, count, decoder, vertTypeID, false);
}
onePassed = true;
} else {
@ -1122,10 +1117,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) {
canExtend = false;
} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
verts, inds, newPrim, count, decoder, vertTypeID, clockwise);
}
// As soon as one passes, assume we don't need to check the rest of this batch.
onePassed = true;

View file

@ -370,6 +370,9 @@ void DrawEngineVulkan::Flush() {
} else {
renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount);
}
if (useDepthRaster_) {
DepthRasterTransform(prim, dec_, dec_->VertexType());
}
} else {
PROFILE_THIS_SCOPE("soft");
VertexDecoder *swDec = dec_;
@ -438,6 +441,12 @@ void DrawEngineVulkan::Flush() {
swTransform.SetProjMatrix(gstate.projMatrix, gstate_c.vpWidth < 0, gstate_c.vpHeight < 0, trans, scale);
swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result);
// At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster.
if (useDepthRaster_) {
DepthRasterPretransformed(prim, transformed_, numDecodedVerts_);
}
// Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values.
// Games sometimes expect exact matches (see #12626, for example) for equal comparisons.
if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f)