mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Merge pull request #19756 from hrydgard/depth-raster-improvements
Depth raster: Fix when software transform is enabled, support non-through mode rectangles
This commit is contained in:
commit
ad2714ad67
6 changed files with 157 additions and 30 deletions
|
@ -106,6 +106,11 @@ struct Vec4F32 {
|
|||
};
|
||||
}
|
||||
|
||||
// Returns a copy of this vector with lane 3 (the fourth float) cleared to zero.
// Lanes 0-2 are passed through unchanged.
Vec4F32 WithLane3Zeroed() const {
	// All-ones in lanes 0-2 keeps those bits under AND; the zero word clears lane 3.
	// The table is only ever read, so it is declared const.
	alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
	return Vec4F32{ _mm_and_ps(v, _mm_load_ps(reinterpret_cast<const float *>(mask))) };
}
|
||||
|
||||
inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
|
||||
return Vec4F32{ _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
|
@ -272,6 +277,10 @@ struct Vec4F32 {
|
|||
};
|
||||
}
|
||||
|
||||
// Produces a copy of this vector in which lane 3 holds 0.0f.
// Lanes 0-2 keep their current values.
Vec4F32 WithLane3Zeroed() const {
	const auto lane3Cleared = vsetq_lane_f32(0.0f, v, 3);
	return Vec4F32{ lane3Cleared };
}
|
||||
|
||||
// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
|
||||
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
|
||||
#if PPSSPP_ARCH(ARM64_NEON)
|
||||
|
|
|
@ -182,7 +182,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
|
|||
}
|
||||
}
|
||||
|
||||
void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
|
||||
void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
|
||||
// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
|
||||
_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
|
||||
|
||||
|
@ -216,6 +216,92 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
|
|||
}
|
||||
}
|
||||
|
||||
void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, VertexDecoder *dec, int count) {
|
||||
// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
|
||||
_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
|
||||
|
||||
int vertexStride = dec->GetDecVtxFmt().stride;
|
||||
int offset = dec->GetDecVtxFmt().posoff;
|
||||
|
||||
Mat4F32 mat(worldviewproj);
|
||||
|
||||
const u8 *startPtr = (const u8 *)decodedVertexData;
|
||||
// Decoded position format is always float3.
|
||||
for (int i = 0; i < count; i++) {
|
||||
const float *data = (const float *)(startPtr + i * vertexStride + offset);
|
||||
Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
|
||||
}
|
||||
}
|
||||
|
||||
void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, VertexDecoder *dec, int count) {
|
||||
// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
|
||||
_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
|
||||
|
||||
int vertexStride = dec->GetDecVtxFmt().stride;
|
||||
int offset = dec->GetDecVtxFmt().posoff;
|
||||
|
||||
const u8 *startPtr = (const u8 *)decodedVertexData;
|
||||
// Decoded position format is always float3.
|
||||
for (int i = 0; i < count; i++) {
|
||||
const float *data = (const float *)(startPtr + i * vertexStride + offset);
|
||||
// Just pass the position straight through - this is through mode!
|
||||
Vec4F32::Load(data).WithLane3Zeroed().Store(dest + i * 4);
|
||||
}
|
||||
}
|
||||
|
||||
int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
|
||||
// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
|
||||
// However, hard to find a common abstraction.
|
||||
const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter());
|
||||
const Vec4F32 viewportY = Vec4F32::Splat(gstate.getViewportYCenter());
|
||||
const Vec4F32 viewportZ = Vec4F32::Splat(gstate.getViewportZCenter());
|
||||
const Vec4F32 viewportScaleX = Vec4F32::Splat(gstate.getViewportXScale());
|
||||
const Vec4F32 viewportScaleY = Vec4F32::Splat(gstate.getViewportYScale());
|
||||
const Vec4F32 viewportScaleZ = Vec4F32::Splat(gstate.getViewportZScale());
|
||||
|
||||
const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX()); // We remove the 16 scale here
|
||||
const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY());
|
||||
|
||||
int outCount = 0;
|
||||
for (int i = 0; i < count; i += 2) {
|
||||
const float *verts[2] = {
|
||||
transformed + indexBuffer[i] * 4,
|
||||
transformed + indexBuffer[i + 1] * 4,
|
||||
};
|
||||
|
||||
// Check if any vertex is behind the 0 plane.
|
||||
if (verts[0][3] < 0.0f || verts[1][3] < 0.0f) {
|
||||
// Ditch this rectangle.
|
||||
continue;
|
||||
}
|
||||
|
||||
// These names are wrong .. until we transpose.
|
||||
Vec4F32 x = Vec4F32::Load(verts[0]);
|
||||
Vec4F32 y = Vec4F32::Load(verts[1]);
|
||||
Vec4F32 z = Vec4F32::Zero();
|
||||
Vec4F32 w = Vec4F32::Zero();
|
||||
Vec4F32::Transpose(x, y, z, w);
|
||||
// Now the names are accurate! Since we only have two vertices, the third and fourth member of each vector is zero
|
||||
// and will not be stored (well it will be stored, but it'll be overwritten by the next vertex).
|
||||
Vec4F32 recipW = w.Recip();
|
||||
|
||||
x *= recipW;
|
||||
y *= recipW;
|
||||
z *= recipW;
|
||||
|
||||
Vec4S32 screen[3];
|
||||
screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
|
||||
screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
|
||||
screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
|
||||
|
||||
screen[0].Store(tx + outCount);
|
||||
screen[1].Store(ty + outCount);
|
||||
screen[2].Store(tz + outCount);
|
||||
outCount += 2;
|
||||
}
|
||||
return outCount;
|
||||
}
|
||||
|
||||
int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
|
||||
bool cullEnabled = gstate.isCullEnabled();
|
||||
GECullMode cullMode = gstate.getCullMode();
|
||||
|
@ -232,8 +318,6 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
|
|||
const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX()); // We remove the 16 scale here
|
||||
const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY());
|
||||
|
||||
bool cullCCW = false;
|
||||
|
||||
int outCount = 0;
|
||||
|
||||
int flipCull = 0;
|
||||
|
@ -294,14 +378,13 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
|
|||
return outCount;
|
||||
}
|
||||
|
||||
void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count) {
|
||||
_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
|
||||
|
||||
void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
|
||||
// TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways.
|
||||
for (int i = 0; i < count; i++) {
|
||||
tx[i] = (int)transformed[i].pos[0];
|
||||
ty[i] = (int)transformed[i].pos[1];
|
||||
tz[i] = (u16)transformed[i].pos[2];
|
||||
const float *pos = transformed + indexBuffer[i] * 4;
|
||||
tx[i] = (int)pos[0];
|
||||
ty[i] = (int)pos[1];
|
||||
tz[i] = (u16)pos[2];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,12 @@ class VertexDecoder;
|
|||
struct TransformedVertex;
|
||||
|
||||
int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
|
||||
void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
|
||||
void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
|
||||
int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
|
||||
void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
|
||||
void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, VertexDecoder *dec, int count);
|
||||
void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, VertexDecoder *dec, int count);
|
||||
void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
|
||||
|
||||
// void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
|
||||
|
||||
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count);
|
||||
|
|
|
@ -903,6 +903,16 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
|
|||
}
|
||||
}
|
||||
|
||||
// Combines the current world, view and projection matrices from gstate into a
// single 4x4 matrix written to worldviewproj (16 floats).
inline void ComputeFinalProjMatrix(float *worldviewproj) {
	// World and view are expanded to 4x4 before multiplying.
	float world4x4[16];
	float view4x4[16];
	ConvertMatrix4x3To4x4(world4x4, gstate.worldMatrix);
	ConvertMatrix4x3To4x4(view4x4, gstate.viewMatrix);

	// First world with view, then the result with the projection matrix.
	float worldView4x4[16];
	Matrix4ByMatrix4(worldView4x4, world4x4, view4x4);
	Matrix4ByMatrix4(worldviewproj, worldView4x4, gstate.projMatrix);
}
|
||||
|
||||
void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
|
||||
switch (prim) {
|
||||
case GE_PRIM_INVALID:
|
||||
|
@ -919,14 +929,8 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
|
|||
return;
|
||||
}
|
||||
|
||||
float world[16];
|
||||
float view[16];
|
||||
float worldview[16];
|
||||
float worldviewproj[16];
|
||||
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
|
||||
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
|
||||
Matrix4ByMatrix4(worldview, world, view);
|
||||
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix?
|
||||
ComputeFinalProjMatrix(worldviewproj);
|
||||
|
||||
// Decode.
|
||||
int numDec = 0;
|
||||
|
@ -943,7 +947,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
|
|||
}
|
||||
|
||||
// Decode the verts (and at the same time apply morphing/skinning). Simple.
|
||||
DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
|
||||
DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
|
||||
numDec += indexUpperBound - indexLowerBound + 1;
|
||||
}
|
||||
|
||||
|
@ -967,7 +971,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
|
|||
tx, ty, tz, outVertCount);
|
||||
}
|
||||
|
||||
void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) {
|
||||
void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount) {
|
||||
switch (prim) {
|
||||
case GE_PRIM_INVALID:
|
||||
case GE_PRIM_KEEP_PREVIOUS:
|
||||
|
@ -985,10 +989,33 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra
|
|||
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
|
||||
int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
|
||||
|
||||
DepthRasterConvertTransformed(tx, ty, tz, prim, inVerts, count);
|
||||
if (count & 15) {
|
||||
int outVertCount = 0;
|
||||
|
||||
if (dec->throughmode) {
|
||||
ConvertPredecodedThroughForDepthRaster(depthTransformed_, decoded_, dec, numDecoded);
|
||||
DepthRasterConvertTransformed(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
|
||||
outVertCount = vertexCount;
|
||||
} else {
|
||||
float worldviewproj[16];
|
||||
ComputeFinalProjMatrix(worldviewproj);
|
||||
TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, decoded_, dec, numDecoded);
|
||||
|
||||
switch (prim) {
|
||||
case GE_PRIM_RECTANGLES:
|
||||
outVertCount = DepthRasterClipIndexedRectangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
|
||||
break;
|
||||
case GE_PRIM_TRIANGLES:
|
||||
outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
|
||||
break;
|
||||
default:
|
||||
_dbg_assert_(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (prim == GE_PRIM_TRIANGLES && (outVertCount & 15) != 0) {
|
||||
// Zero padding
|
||||
for (int i = count; i < ((count + 16) & ~15); i++) {
|
||||
for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
|
||||
tx[i] = 0;
|
||||
ty[i] = 0;
|
||||
tz[i] = 0;
|
||||
|
@ -996,5 +1023,5 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra
|
|||
}
|
||||
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
|
||||
prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
|
||||
tx, ty, tz, count);
|
||||
tx, ty, tz, outVertCount);
|
||||
}
|
||||
|
|
|
@ -176,7 +176,7 @@ protected:
|
|||
void ApplyFramebufferRead(FBOTexState *fboTexState);
|
||||
|
||||
void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount);
|
||||
void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count);
|
||||
void DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount);
|
||||
|
||||
static inline int IndexSize(u32 vtype) {
|
||||
const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);
|
||||
|
|
|
@ -434,6 +434,13 @@ void DrawEngineVulkan::Flush() {
|
|||
UpdateCachedViewportState(vpAndScissor);
|
||||
}
|
||||
|
||||
// At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster.
|
||||
// We could piggyback on the viewport transform below, but it gets complicated since it's different per-backend. Which we really
|
||||
// should clean up one day...
|
||||
if (useDepthRaster_) {
|
||||
DepthRasterPredecoded(prim, decoded_, numDecodedVerts_, swDec, vertexCount);
|
||||
}
|
||||
|
||||
SoftwareTransform swTransform(params);
|
||||
|
||||
const Lin::Vec3 trans(gstate_c.vpXOffset, gstate_c.vpYOffset, gstate_c.vpZOffset * 0.5f + 0.5f);
|
||||
|
@ -442,11 +449,6 @@ void DrawEngineVulkan::Flush() {
|
|||
|
||||
swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result);
|
||||
|
||||
// At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster.
|
||||
if (useDepthRaster_) {
|
||||
DepthRasterPretransformed(prim, transformed_, numDecodedVerts_);
|
||||
}
|
||||
|
||||
// Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values.
|
||||
// Games sometimes expect exact matches (see #12626, for example) for equal comparisons.
|
||||
if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f)
|
||||
|
|
Loading…
Add table
Reference in a new issue