diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 55e7b86fc7..0824ee0e10 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -106,6 +106,13 @@ struct Vec4F32 {
 		};
 	}
 
+	// Returns a copy of this vector with lane 3 cleared to zero; lanes 0-2 are unchanged.
+	Vec4F32 WithLane3Zeroed() const {
+		// All-ones words keep lanes 0-2 through the AND; the zero word clears lane 3.
+		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
+		return Vec4F32{ _mm_and_ps(v, _mm_load_ps((const float *)mask)) };
+	}
+
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 		return Vec4F32{ _mm_add_ps(
 			_mm_add_ps(
@@ -272,6 +279,11 @@ struct Vec4F32 {
 		};
 	}
 
+	// Returns a copy of this vector with lane 3 set to 0.0f; lanes 0-2 are unchanged.
+	Vec4F32 WithLane3Zeroed() const {
+		return Vec4F32{ vsetq_lane_f32(0.0f, v, 3) };
+	}
+
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
 #if PPSSPP_ARCH(ARM64_NEON)
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index ca8f81cedf..920dd93e39 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -182,7 +182,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	}
 }
 
-void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
+void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
 	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
 
@@ -216,6 +216,101 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
 	}
 }
 
+// Runs each predecoded position through worldviewproj (via AsVec3ByMatrix44) and stores four floats per vertex in dest.
+// decodedVertexData is walked using the stride and position offset from dec's decoded vertex format.
+void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, VertexDecoder *dec, int count) {
+	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
+	_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
+
+	int vertexStride = dec->GetDecVtxFmt().stride;
+	int offset = dec->GetDecVtxFmt().posoff;
+
+	Mat4F32 mat(worldviewproj);
+
+	const u8 *startPtr = (const u8 *)decodedVertexData;
+	// Decoded position format is always float3.
+	// NOTE(review): Load presumably reads four floats, i.e. one past the float3 - confirm the decoded buffer has slack.
+	for (int i = 0; i < count; i++) {
+		const float *data = (const float *)(startPtr + i * vertexStride + offset);
+		Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
+	}
+}
+
+// Through-mode counterpart of the above: copies each predecoded position into dest as four floats per vertex,
+// with lane 3 forced to zero and no matrix applied.
+void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, VertexDecoder *dec, int count) {
+	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
+	_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
+
+	int vertexStride = dec->GetDecVtxFmt().stride;
+	int offset = dec->GetDecVtxFmt().posoff;
+
+	const u8 *startPtr = (const u8 *)decodedVertexData;
+	// Decoded position format is always float3.
+	for (int i = 0; i < count; i++) {
+		const float *data = (const float *)(startPtr + i * vertexStride + offset);
+		// Just pass the position straight through - this is through mode!
+		Vec4F32::Load(data).WithLane3Zeroed().Store(dest + i * 4);
+	}
+}
+
+// Converts indexed rectangles (two vertices each) from transformed into integer x/y/z in tx/ty/tz.
+// Rectangles with a vertex whose w is negative are dropped. Returns the number of vertices written.
+// Each Store below writes a full vector at outCount, so tx/ty/tz need slack past the returned count.
+int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
+	// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
+	// However, hard to find a common abstraction.
+	const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter());
+	const Vec4F32 viewportY = Vec4F32::Splat(gstate.getViewportYCenter());
+	const Vec4F32 viewportZ = Vec4F32::Splat(gstate.getViewportZCenter());
+	const Vec4F32 viewportScaleX = Vec4F32::Splat(gstate.getViewportXScale());
+	const Vec4F32 viewportScaleY = Vec4F32::Splat(gstate.getViewportYScale());
+	const Vec4F32 viewportScaleZ = Vec4F32::Splat(gstate.getViewportZScale());
+
+	const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX());  // We remove the 16 scale here
+	const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY());
+
+	int outCount = 0;
+	// Rectangles are index pairs; requiring i + 1 < count keeps a stray odd index from reading past indexBuffer.
+	for (int i = 0; i + 1 < count; i += 2) {
+		const float *verts[2] = {
+			transformed + indexBuffer[i] * 4,
+			transformed + indexBuffer[i + 1] * 4,
+		};
+
+		// Check if any vertex is behind the 0 plane.
+		if (verts[0][3] < 0.0f || verts[1][3] < 0.0f) {
+			// Ditch this rectangle.
+			continue;
+		}
+
+		// These names are wrong .. until we transpose.
+		Vec4F32 x = Vec4F32::Load(verts[0]);
+		Vec4F32 y = Vec4F32::Load(verts[1]);
+		Vec4F32 z = Vec4F32::Zero();
+		Vec4F32 w = Vec4F32::Zero();
+		Vec4F32::Transpose(x, y, z, w);
+		// Now the names are accurate! Since we only have two vertices, the third and fourth member of each vector is zero
+		// and will not be stored (well it will be stored, but it'll be overwritten by the next vertex).
+		Vec4F32 recipW = w.Recip();
+
+		x *= recipW;
+		y *= recipW;
+		z *= recipW;
+
+		Vec4S32 screen[3];
+		screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
+		screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
+		screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
+
+		screen[0].Store(tx + outCount);
+		screen[1].Store(ty + outCount);
+		screen[2].Store(tz + outCount);
+		outCount += 2;
+	}
+	return outCount;
+}
+
 int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
 	bool cullEnabled = gstate.isCullEnabled();
 	GECullMode cullMode = gstate.getCullMode();
@@ -232,8 +327,6 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 	const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX());  // We remove the 16 scale here
 	const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY());
 
-	bool cullCCW = false;
-
 	int outCount = 0;
 
 	int flipCull = 0;
@@ -294,14 +387,15 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 	return outCount;
 }
 
-void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count) {
-	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
-
+// Gathers count indexed positions (four floats per vertex in transformed) into separate integer x/y/z arrays.
+// Components are converted with plain casts; no clipping or viewport math happens here.
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
 	// TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways.
 	for (int i = 0; i < count; i++) {
-		tx[i] = (int)transformed[i].pos[0];
-		ty[i] = (int)transformed[i].pos[1];
-		tz[i] = (u16)transformed[i].pos[2];
+		const float *pos = transformed + indexBuffer[i] * 4;
+		tx[i] = (int)pos[0];
+		ty[i] = (int)pos[1];
+		tz[i] = (u16)pos[2];
 	}
 }
 
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
index e92c1a1348..ba9788eebe 100644
--- a/GPU/Common/DepthRaster.h
+++ b/GPU/Common/DepthRaster.h
@@ -18,6 +18,10 @@ class VertexDecoder;
 struct TransformedVertex;
 
 int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
-void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
-void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
+int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
+void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
+void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, VertexDecoder *dec, int count);
+void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, VertexDecoder *dec, int count);
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
+
 void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count);
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index bbe36fb479..088efd052e 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -903,6 +903,18 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
 	}
 }
 
+// Combines gstate's world, view and projection matrices into one 4x4 matrix written to worldviewproj.
+// File-local helper shared by the depth raster paths below; callers pass a float[16].
+static inline void ComputeFinalProjMatrix(float *worldviewproj) {
+	float world[16];
+	float view[16];
+	float worldview[16];
+	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
+	Matrix4ByMatrix4(worldview, world, view);
+	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);  // TODO: Include adjustments to the proj matrix?
+}
+
 void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
 	switch (prim) {
 	case GE_PRIM_INVALID:
@@ -919,14 +931,8 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 		return;
 	}
 
-	float world[16];
-	float view[16];
-	float worldview[16];
 	float worldviewproj[16];
-	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
-	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-	Matrix4ByMatrix4(worldview, world, view);
-	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);  // TODO: Include adjustments to the proj matrix?
+	ComputeFinalProjMatrix(worldviewproj);
 
 	// Decode.
 	int numDec = 0;
@@ -943,7 +949,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 		}
 
 		// Decode the verts (and at the same time apply morphing/skinning). Simple.
-		DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
+		DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
 		numDec += indexUpperBound - indexLowerBound + 1;
 	}
 
@@ -967,7 +973,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 		tx, ty, tz, outVertCount);
 }
 
-void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) {
+void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount) {
 	switch (prim) {
 	case GE_PRIM_INVALID:
 	case GE_PRIM_KEEP_PREVIOUS:
@@ -985,10 +991,33 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra
 	int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
 	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
 
-	DepthRasterConvertTransformed(tx, ty, tz, prim, inVerts, count);
-	if (count & 15) {
+	int outVertCount = 0;
+
+	if (dec->throughmode) {
+		ConvertPredecodedThroughForDepthRaster(depthTransformed_, inVerts, dec, numDecoded);
+		DepthRasterConvertTransformed(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
+		outVertCount = vertexCount;
+	} else {
+		float worldviewproj[16];
+		ComputeFinalProjMatrix(worldviewproj);
+		TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, inVerts, dec, numDecoded);
+
+		switch (prim) {
+		case GE_PRIM_RECTANGLES:
+			outVertCount = DepthRasterClipIndexedRectangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
+			break;
+		case GE_PRIM_TRIANGLES:
+			outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
+			break;
+		default:
+			_dbg_assert_(false);
+			break;
+		}
+	}
+
+	if (prim == GE_PRIM_TRIANGLES && (outVertCount & 15) != 0) {
 		// Zero padding
-		for (int i = count; i < ((count + 16) & ~15); i++) {
+		for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
 			tx[i] = 0;
 			ty[i] = 0;
 			tz[i] = 0;
@@ -996,5 +1025,5 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra
 	}
 	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
 		prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
-		tx, ty, tz, count);
+		tx, ty, tz, outVertCount);
 }
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 053c4c31f5..be35751550 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -176,7 +176,7 @@ protected:
 	void ApplyFramebufferRead(FBOTexState *fboTexState);
 
 	void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount);
-	void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count);
+	void DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount);
 
 	static inline int IndexSize(u32 vtype) {
 		const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);
diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp
index ebe3d022df..a456b5d72c 100644
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@@ -434,6 +434,13 @@ void DrawEngineVulkan::Flush() {
 			UpdateCachedViewportState(vpAndScissor);
 		}
 
+		// At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster.
+		// We could piggyback on the viewport transform below, but it gets complicated since it's different per-backend. Which we really
+		// should clean up one day...
+		if (useDepthRaster_) {
+			DepthRasterPredecoded(prim, decoded_, numDecodedVerts_, swDec, vertexCount);
+		}
+
 		SoftwareTransform swTransform(params);
 
 		const Lin::Vec3 trans(gstate_c.vpXOffset, gstate_c.vpYOffset, gstate_c.vpZOffset * 0.5f + 0.5f);
@@ -442,11 +449,6 @@ void DrawEngineVulkan::Flush() {
 
 		swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result);
 
-		// At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster.
-		if (useDepthRaster_) {
-			DepthRasterPretransformed(prim, transformed_, numDecodedVerts_);
-		}
-
 		// Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values.
 		// Games sometimes expect exact matches (see #12626, for example) for equal comparisons.
 		if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f)