diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 7aabfa6364..d8220720bc 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -72,29 +72,6 @@ VertexDecoder *DrawEngineCommon::GetVertexDecoder(u32 vtype) { return dec; } -void DrawEngineCommon::DecodeVerts(u8 *dest) { - int decodeVertsCounter = decodeVertsCounter_; - for (; decodeVertsCounter < numDrawVerts_; decodeVertsCounter++) { - DecodeVertsStep(dest, decodeVertsCounter, decodedVerts_, &drawVerts_[decodeVertsCounter].uvScale); - } - decodeVertsCounter_ = decodeVertsCounter; -} - -void DrawEngineCommon::DecodeInds() { - int decodeIndsCounter = decodeIndsCounter_; - for (; decodeIndsCounter < numDrawInds_; decodeIndsCounter++) { - DecodeIndsStep(decodeIndsCounter); - } - decodeIndsCounter_ = decodeIndsCounter; - - // Sanity check - if (indexGen.Prim() < 0) { - ERROR_LOG_REPORT(G3D, "DecodeVerts: Failed to deduce prim: %i", indexGen.Prim()); - // Force to points (0) - indexGen.AddPrim(GE_PRIM_POINTS, 0, true); - } -} - std::vector DrawEngineCommon::DebugGetVertexLoaderIDs() { std::vector ids; decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) { @@ -596,45 +573,6 @@ void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) { gstate_c.Dirty(DIRTY_SHADERBLEND); } -void DrawEngineCommon::DecodeVertsStep(u8 *dest, int i, int &decodedVerts, const UVScale *uvScale) { - PROFILE_THIS_SCOPE("vertdec"); - - const DeferredVerts &dv = drawVerts_[i]; - - int indexLowerBound = dv.indexLowerBound; - int indexUpperBound = dv.indexUpperBound; - - // Decode the verts (and at the same time apply morphing/skinning). Simple. - dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride, dv.verts, uvScale, dv.indexLowerBound, dv.indexUpperBound); - decodedVerts += indexUpperBound - indexLowerBound + 1; -} - -void DrawEngineCommon::DecodeIndsStep(int i) { - const DeferredInds &di = drawInds_[i]; - bool clockwise = true; - if (gstate.isCullEnabled() && gstate.getCullMode() != di.cullMode) { - clockwise = false; - } - // We've already collapsed subsequent draws with the same vertex pointer, so no tricky logic here anymore. - // 2. Loop through the drawcalls, translating indices as we go. - switch (di.indexType) { - case GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT: - indexGen.AddPrim(di.prim, di.vertexCount, clockwise); - break; - case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: - indexGen.TranslatePrim(di.prim, di.vertexCount, (const u8 *)di.inds, di.indexOffset, clockwise); - break; - case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: - indexGen.TranslatePrim(di.prim, di.vertexCount, (const u16_le *)di.inds, di.indexOffset, clockwise); - break; - case GE_VTYPE_IDX_32BIT >> GE_VTYPE_IDX_SHIFT: - indexGen.TranslatePrim(di.prim, di.vertexCount, (const u32_le *)di.inds, di.indexOffset, clockwise); - break; - } - // 4. Advance indexgen vertex counter. - indexGen.Advance(di.vertexCount); -} - inline u32 ComputeMiniHashRange(const void *ptr, size_t sz) { // Switch to u32 units, and round up to avoid unaligned accesses. // Probably doesn't matter if we skip the first few bytes in some cases. @@ -672,7 +610,9 @@ u32 DrawEngineCommon::ComputeMiniHash() { } for (int i = 0; i < numDrawInds_; i += step) { const DeferredInds &di = drawInds_[i]; - fullhash += ComputeMiniHashRange(di.inds, indexSize * di.vertexCount); + if (di.inds) { + fullhash += ComputeMiniHashRange(di.inds, indexSize * di.vertexCount); + } } return fullhash; @@ -715,7 +655,6 @@ int DrawEngineCommon::ComputeNumVertsToDecode() const { uint64_t DrawEngineCommon::ComputeHash() { uint64_t fullhash = 0; const int vertexSize = dec_->GetDecVtxFmt().stride; - const int indexSize = IndexSize(dec_->VertexType()); // TODO: Add some caps both for numDrawCalls_ and num verts to check? // It is really very expensive to check all the vertex data so often. @@ -727,8 +666,11 @@ uint64_t DrawEngineCommon::ComputeHash() { for (int i = 0; i < numDrawInds_; i++) { const DeferredInds &di = drawInds_[i]; - // Hm, we will miss some indices when combining above, but meh, it should be fine. - fullhash += XXH3_64bits((const char *)di.inds, indexSize * di.vertexCount); + if (di.indexType != 0) { + int indexSize = IndexSize(di.indexType << GE_VTYPE_IDX_SHIFT); + // Hm, we will miss some indices when combining above, but meh, it should be fine. + fullhash += XXH3_64bits((const char *)di.inds, indexSize * di.vertexCount); + } } // this looks utterly broken?? @@ -738,9 +680,11 @@ uint64_t DrawEngineCommon::ComputeHash() { // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits. void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) { - if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { + if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { DispatchFlush(); } + _dbg_assert_(numDrawVerts_ < MAX_DEFERRED_DRAW_VERTS); + _dbg_assert_(numDrawInds_ < MAX_DEFERRED_DRAW_INDS); // This isn't exactly right, if we flushed, since prims can straddle previous calls. // But it generally works for common usage. @@ -772,11 +716,15 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti di.indexType = (vertTypeID & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT; di.prim = prim; di.cullMode = cullMode; - di.indexOffset = 0; di.vertexCount = vertexCount; + di.vertDecodeIndex = numDrawVerts_; + + _dbg_assert_(numDrawVerts_ <= MAX_DEFERRED_DRAW_VERTS); + _dbg_assert_(numDrawInds_ <= MAX_DEFERRED_DRAW_INDS); if (inds && numDrawVerts_ > decodeVertsCounter_ && drawVerts_[numDrawVerts_ - 1].verts == verts && !applySkin) { // Same vertex pointer as a previous un-decoded draw call - let's just extend the decode! + di.vertDecodeIndex = numDrawVerts_ - 1; DeferredVerts &dv = drawVerts_[numDrawVerts_ - 1]; u16 lb; u16 ub; @@ -785,8 +733,6 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti dv.indexLowerBound = lb; if (ub > dv.indexUpperBound) dv.indexUpperBound = ub; - di.indexOffset = indexOffset_; - // indexOffset_ += vertexCount; } else { // Record a new draw, and a new index gen. DeferredVerts &dv = drawVerts_[numDrawVerts_++]; @@ -799,14 +745,12 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti dv.indexLowerBound = 0; dv.indexUpperBound = vertexCount - 1; } - indexOffset_ = 0; // vertexCount; } vertexCountInDrawCalls_ += vertexCount; if (applySkin) { - DecodeVertsStep(decoded_, decodeVertsCounter_, decodedVerts_, &drawVerts_[numDrawVerts_ - 1].uvScale); - DecodeIndsStep(decodeIndsCounter_); + DecodeVerts(decoded_); } if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) { @@ -816,6 +760,60 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti } } +void DrawEngineCommon::DecodeVerts(u8 *dest) { + int i = decodeVertsCounter_; + int stride = (int)dec_->GetDecVtxFmt().stride; + for (; i < numDrawVerts_; i++) { + DeferredVerts &dv = drawVerts_[i]; + + int indexLowerBound = dv.indexLowerBound; + drawVertexOffsets_[i] = decodedVerts_ - indexLowerBound; + + int indexUpperBound = dv.indexUpperBound; + // Decode the verts (and at the same time apply morphing/skinning). Simple. + dec_->DecodeVerts(dest + decodedVerts_ * stride, dv.verts, &dv.uvScale, indexLowerBound, indexUpperBound); + decodedVerts_ += indexUpperBound - indexLowerBound + 1; + } + decodeVertsCounter_ = i; +} + +void DrawEngineCommon::DecodeInds() { + int i = decodeIndsCounter_; + for (; i < numDrawInds_; i++) { + const DeferredInds &di = drawInds_[i]; + + int indexOffset = drawVertexOffsets_[di.vertDecodeIndex]; + bool clockwise = true; + if (gstate.isCullEnabled() && gstate.getCullMode() != di.cullMode) { + clockwise = false; + } + // We've already collapsed subsequent draws with the same vertex pointer, so no tricky logic here anymore. + // 2. Loop through the drawcalls, translating indices as we go. + switch (di.indexType) { + case GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT: + indexGen.AddPrim(di.prim, di.vertexCount, indexOffset, clockwise); + break; + case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(di.prim, di.vertexCount, (const u8 *)di.inds, indexOffset, clockwise); + break; + case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(di.prim, di.vertexCount, (const u16_le *)di.inds, indexOffset, clockwise); + break; + case GE_VTYPE_IDX_32BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(di.prim, di.vertexCount, (const u32_le *)di.inds, indexOffset, clockwise); + break; + } + } + decodeIndsCounter_ = i; + + // Sanity check + if (indexGen.Prim() < 0) { + ERROR_LOG_REPORT(G3D, "DecodeVerts: Failed to deduce prim: %i", indexGen.Prim()); + // Force to points (0) + indexGen.AddPrim(GE_PRIM_POINTS, 0, 0, true); + } +} + bool DrawEngineCommon::CanUseHardwareTransform(int prim) { if (!useHWTransform_) return false; diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index b67dbefd75..d299c3d7de 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -144,6 +144,10 @@ protected: void DecodeVerts(u8 *dest); void DecodeInds(); + int MaxIndex() const { + return decodedVerts_; + } + // Preprocessing for spline/bezier u32 NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, int lowerBound, int upperBound, u32 vertType, int *vertexSize = nullptr); @@ -151,10 +155,6 @@ protected: u32 ComputeMiniHash(); uint64_t ComputeHash(); - // Vertex decoding - void DecodeVertsStep(u8 *dest, int i, int &decodedVerts, const UVScale *uvScale); - void DecodeIndsStep(int i); - int ComputeNumVertsToDecode() const; void ApplyFramebufferRead(FBOTexState *fboTexState); @@ -224,15 +224,17 @@ protected: struct DeferredInds { const void *inds; u32 vertexCount; + u8 vertDecodeIndex; // index into the drawVerts_ array to look up the vertexOffset. u8 indexType; s8 prim; u8 cullMode; - u16 indexOffset; }; - enum { MAX_DEFERRED_DRAW_CALLS = 128 }; - DeferredVerts drawVerts_[MAX_DEFERRED_DRAW_CALLS]; - DeferredInds drawInds_[MAX_DEFERRED_DRAW_CALLS]; + enum { MAX_DEFERRED_DRAW_VERTS = 128 }; // If you change this to more than 256, change type of DeferredInds::vertDecodeIndex. + enum { MAX_DEFERRED_DRAW_INDS = 512 }; // Monster Hunter spams indexed calls that we end up merging. + DeferredVerts drawVerts_[MAX_DEFERRED_DRAW_VERTS]; + uint32_t drawVertexOffsets_[MAX_DEFERRED_DRAW_VERTS]; + DeferredInds drawInds_[MAX_DEFERRED_DRAW_INDS]; int numDrawVerts_ = 0; int numDrawInds_ = 0; @@ -242,8 +244,6 @@ protected: int decodeVertsCounter_ = 0; int decodeIndsCounter_ = 0; - int indexOffset_ = 0; - // Vertex collector state IndexGenerator indexGen; int decodedVerts_ = 0; diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index d1635e807d..b361cbcbce 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -50,44 +50,40 @@ void IndexGenerator::Setup(u16 *inds) { Reset(); } -void IndexGenerator::AddPrim(int prim, int vertexCount, bool clockwise) { +void IndexGenerator::AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise) { switch (prim) { - case GE_PRIM_POINTS: AddPoints(vertexCount); break; - case GE_PRIM_LINES: AddLineList(vertexCount); break; - case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount); break; - case GE_PRIM_TRIANGLES: AddList(vertexCount, clockwise); break; - case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, clockwise); break; - case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, clockwise); break; - case GE_PRIM_RECTANGLES: AddRectangles(vertexCount); break; // Same + case GE_PRIM_POINTS: AddPoints(vertexCount, indexOffset); break; + case GE_PRIM_LINES: AddLineList(vertexCount, indexOffset); break; + case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount, indexOffset); break; + case GE_PRIM_TRIANGLES: AddList(vertexCount, indexOffset, clockwise); break; + case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, indexOffset, clockwise); break; + case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, indexOffset, clockwise); break; + case GE_PRIM_RECTANGLES: AddRectangles(vertexCount, indexOffset); break; // Same } } -void IndexGenerator::AddPoints(int numVerts) { +void IndexGenerator::AddPoints(int numVerts, int indexOffset) { u16 *outInds = inds_; - const int startIndex = index_; for (int i = 0; i < numVerts; i++) - *outInds++ = startIndex + i; + *outInds++ = indexOffset + i; inds_ = outInds; // ignore overflow verts - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_POINTS; seenPrims_ |= 1 << GE_PRIM_POINTS; } -void IndexGenerator::AddList(int numVerts, bool clockwise) { +void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) { u16 *outInds = inds_; - const int startIndex = index_; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; for (int i = 0; i < numVerts; i += 3) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + v1; - *outInds++ = startIndex + i + v2; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + v1; + *outInds++ = indexOffset + i + v2; } inds_ = outInds; // ignore overflow verts - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= 1 << GE_PRIM_TRIANGLES; @@ -119,7 +115,7 @@ alignas(16) static const uint16_t offsets_counter_clockwise[24] = { 7, (u16)(7 + 1), (u16)(7 + 2), }; -void IndexGenerator::AddStrip(int numVerts, bool clockwise) { +void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) { int numTris = numVerts - 2; #ifdef _M_SSE // In an SSE2 register we can fit 8 16-bit integers. @@ -130,7 +126,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { // We allow ourselves to write some extra indices to avoid the fallback loop. // That's alright as we're appending to a buffer - they will get overwritten anyway. int numChunks = (numTris + 7) >> 3; - __m128i ibase8 = _mm_set1_epi16(index_); + __m128i ibase8 = _mm_set1_epi16(indexOffset); const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise); __m128i *dst = (__m128i *)inds_; __m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets)); @@ -158,7 +154,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { // wind doesn't need to be updated, an even number of triangles have been drawn. #elif PPSSPP_ARCH(ARM_NEON) int numChunks = (numTris + 7) >> 3; - uint16x8_t ibase8 = vdupq_n_u16(index_); + uint16x8_t ibase8 = vdupq_n_u16(indexOffset); const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise; u16 *dst = inds_; uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets)); @@ -185,7 +181,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { #else // Slow fallback loop. int wind = clockwise ? 1 : 2; - int ibase = index_; + int ibase = indexOffset; size_t numPairs = numTris / 2; u16 *outInds = inds_; while (numPairs > 0) { @@ -207,7 +203,6 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { inds_ = outInds; #endif - index_ += numVerts; if (numTris > 0) count_ += numTris * 3; // This is so we can detect one single strip by just looking at seenPrims_. @@ -222,19 +217,17 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { } } -void IndexGenerator::AddFan(int numVerts, bool clockwise) { +void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) { const int numTris = numVerts - 2; u16 *outInds = inds_; - const int startIndex = index_; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; for (int i = 0; i < numTris; i++) { - *outInds++ = startIndex; - *outInds++ = startIndex + i + v1; - *outInds++ = startIndex + i + v2; + *outInds++ = indexOffset; + *outInds++ = indexOffset + i + v1; + *outInds++ = indexOffset + i + v2; } inds_ = outInds; - index_ += numVerts; count_ += numTris * 3; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN; @@ -245,46 +238,40 @@ void IndexGenerator::AddFan(int numVerts, bool clockwise) { } //Lines -void IndexGenerator::AddLineList(int numVerts) { +void IndexGenerator::AddLineList(int numVerts, int indexOffset) { u16 *outInds = inds_; - const int startIndex = index_; for (int i = 0; i < numVerts; i += 2) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + 1; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + 1; } inds_ = outInds; - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_LINES; seenPrims_ |= 1 << prim_; } -void IndexGenerator::AddLineStrip(int numVerts) { +void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) { const int numLines = numVerts - 1; u16 *outInds = inds_; - const int startIndex = index_; for (int i = 0; i < numLines; i++) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + 1; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + 1; } inds_ = outInds; - index_ += numVerts; count_ += numLines * 2; prim_ = GE_PRIM_LINES; seenPrims_ |= 1 << GE_PRIM_LINE_STRIP; } -void IndexGenerator::AddRectangles(int numVerts) { +void IndexGenerator::AddRectangles(int numVerts, int indexOffset) { u16 *outInds = inds_; - const int startIndex = index_; //rectangles always need 2 vertices, disregard the last one if there's an odd number numVerts = numVerts & ~1; for (int i = 0; i < numVerts; i += 2) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + 1; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + 1; } inds_ = outInds; - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_RECTANGLES; seenPrims_ |= 1 << GE_PRIM_RECTANGLES; @@ -292,7 +279,6 @@ void IndexGenerator::AddRectangles(int numVerts) { template void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; u16 *outInds = inds_; for (int i = 0; i < numInds; i++) *outInds++ = indexOffset + inds[i]; @@ -304,7 +290,6 @@ void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int index template void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; u16 *outInds = inds_; numInds = numInds & ~1; for (int i = 0; i < numInds; i += 2) { @@ -319,7 +304,6 @@ void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int ind template void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; int numLines = numInds - 1; u16 *outInds = inds_; for (int i = 0; i < numLines; i++) { @@ -334,7 +318,6 @@ void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int in template void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { - indexOffset = index_ - indexOffset; // We only bother doing this minor optimization in triangle list, since it's by far the most // common operation that can benefit. if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) { @@ -347,6 +330,7 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf numInds = numTris * 3; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; + // TODO: This can actually be SIMD-d, although will need complex shuffles if clockwise. for (int i = 0; i < numInds; i += 3) { *outInds++ = indexOffset + inds[i]; *outInds++ = indexOffset + inds[i + v1]; @@ -362,7 +346,6 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf template void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { int wind = clockwise ? 1 : 2; - indexOffset = index_ - indexOffset; int numTris = numInds - 2; u16 *outInds = inds_; for (int i = 0; i < numTris; i++) { @@ -380,7 +363,6 @@ void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexO template void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { if (numInds <= 0) return; - indexOffset = index_ - indexOffset; int numTris = numInds - 2; u16 *outInds = inds_; const int v1 = clockwise ? 1 : 2; @@ -398,7 +380,6 @@ void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOff template inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; u16 *outInds = inds_; //rectangles always need 2 vertices, disregard the last one if there's an odd number numInds = numInds & ~1; diff --git a/GPU/Common/IndexGenerator.h b/GPU/Common/IndexGenerator.h index e8c2578409..b5df11aab8 100644 --- a/GPU/Common/IndexGenerator.h +++ b/GPU/Common/IndexGenerator.h @@ -28,7 +28,6 @@ public: void Reset() { prim_ = GE_PRIM_INVALID; count_ = 0; - index_ = 0; seenPrims_ = 0; pureCount_ = 0; this->inds_ = indsBase_; @@ -57,19 +56,12 @@ public: } } - void AddPrim(int prim, int vertexCount, bool clockwise); + void AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise); void TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset, bool clockwise); void TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise); void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise); - void Advance(int numVerts) { - index_ += numVerts; - } - - void SetIndex(int ind) { index_ = ind; } - int MaxIndex() const { return index_; } // Really NextIndex rather than MaxIndex, it's one more than the highest index generated int VertexCount() const { return count_; } - bool Empty() const { return index_ == 0; } int SeenPrims() const { return seenPrims_; } int PureCount() const { return pureCount_; } bool SeenOnlyPurePrims() const { @@ -81,16 +73,16 @@ public: private: // Points (why index these? code simplicity) - void AddPoints(int numVerts); + void AddPoints(int numVerts, int indexOffset); // Triangles - void AddList(int numVerts, bool clockwise); - void AddStrip(int numVerts, bool clockwise); - void AddFan(int numVerts, bool clockwise); + void AddList(int numVerts, int indexOffset, bool clockwise); + void AddStrip(int numVerts, int indexOffset, bool clockwise); + void AddFan(int numVerts, int indexOffset, bool clockwise); // Lines - void AddLineList(int numVerts); - void AddLineStrip(int numVerts); + void AddLineList(int numVerts, int indexOffset); + void AddLineStrip(int numVerts, int indexOffset); // Rectangles - void AddRectangles(int numVerts); + void AddRectangles(int numVerts, int indexOffset); // These translate already indexed lists template @@ -118,7 +110,6 @@ private: u16 *indsBase_; u16 *inds_; - int index_; int count_; int pureCount_; GEPrimitiveType prim_; diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index c31a5f1d58..ec1af7c80e 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -1293,6 +1293,9 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, } void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const { + // A single 0 is acceptable for point lists. + _dbg_assert_(indexLowerBound <= indexUpperBound); + // Decode the vertices within the found bounds, once each // decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed. const u8 *startPtr = (const u8*)verts + indexLowerBound * size; diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp index 3163349ffb..3711fee648 100644 --- a/GPU/D3D11/DrawEngineD3D11.cpp +++ b/GPU/D3D11/DrawEngineD3D11.cpp @@ -384,9 +384,10 @@ void DrawEngineD3D11::DoFlush() { vai->status = VertexArrayInfoD3D11::VAI_HASHING; vai->drawsUntilNextFullHash = 0; DecodeVerts(decoded_); // writes to indexGen + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0; goto rotateVBO; } @@ -409,6 +410,7 @@ void DrawEngineD3D11::DoFlush() { if (newMiniHash != vai->minihash || newHash != vai->hash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } if (vai->numVerts > 64) { @@ -428,15 +430,17 @@ void DrawEngineD3D11::DoFlush() { if (newMiniHash != vai->minihash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } if (vai->vbo == 0) { DecodeVerts(decoded_); + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0; useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN; if (!useElements && indexGen.PureCount()) { @@ -446,7 +450,7 @@ void DrawEngineD3D11::DoFlush() { _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); // TODO: Combine these two into one buffer? - u32 size = dec_->GetDecVtxFmt().stride * indexGen.MaxIndex(); + u32 size = dec_->GetDecVtxFmt().stride * MaxIndex(); D3D11_BUFFER_DESC desc{ size, D3D11_USAGE_IMMUTABLE, D3D11_BIND_VERTEX_BUFFER, 0 }; D3D11_SUBRESOURCE_DATA data{ decoded_ }; ASSERT_SUCCESS(device_->CreateBuffer(&desc, &data, &vai->vbo)); @@ -500,6 +504,7 @@ void DrawEngineD3D11::DoFlush() { vai->numFrames++; } DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } @@ -507,11 +512,12 @@ void DrawEngineD3D11::DoFlush() { vai->lastFrame = gpuStats.numFlips; } else { DecodeVerts(decoded_); + DecodeInds(); rotateVBO: gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN; vertexCount = indexGen.VertexCount(); - maxIndex = indexGen.MaxIndex(); + maxIndex = MaxIndex(); if (!useElements && indexGen.PureCount()) { vertexCount = indexGen.PureCount(); } @@ -584,6 +590,7 @@ rotateVBO: dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); + DecodeInds(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); @@ -622,7 +629,7 @@ rotateVBO: UpdateCachedViewportState(vpAndScissor); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); SoftwareTransform swTransform(params); const Lin::Vec3 trans(gstate_c.vpXOffset, -gstate_c.vpYOffset, gstate_c.vpZOffset * 0.5f + 0.5f); diff --git a/GPU/Directx9/DrawEngineDX9.cpp b/GPU/Directx9/DrawEngineDX9.cpp index d5f4e4f299..890daa4cc2 100644 --- a/GPU/Directx9/DrawEngineDX9.cpp +++ b/GPU/Directx9/DrawEngineDX9.cpp @@ -362,9 +362,10 @@ void DrawEngineDX9::DoFlush() { vai->status = VertexArrayInfoDX9::VAI_HASHING; vai->drawsUntilNextFullHash = 0; DecodeVerts(decoded_); // writes to indexGen + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI_FLAG_VERTEXFULLALPHA : 0; goto rotateVBO; @@ -388,6 +389,7 @@ void DrawEngineDX9::DoFlush() { if (newMiniHash != vai->minihash || newHash != vai->hash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } if (vai->numVerts > 64) { @@ -407,15 +409,17 @@ void DrawEngineDX9::DoFlush() { if (newMiniHash != vai->minihash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } if (vai->vbo == 0) { DecodeVerts(decoded_); + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI_FLAG_VERTEXFULLALPHA : 0; useElements = !indexGen.SeenOnlyPurePrims(); if (!useElements && indexGen.PureCount()) { @@ -425,7 +429,7 @@ void DrawEngineDX9::DoFlush() { _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); void * pVb; - u32 size = dec_->GetDecVtxFmt().stride * indexGen.MaxIndex(); + u32 size = dec_->GetDecVtxFmt().stride * MaxIndex(); device_->CreateVertexBuffer(size, D3DUSAGE_WRITEONLY, 0, D3DPOOL_DEFAULT, &vai->vbo, NULL); vai->vbo->Lock(0, size, &pVb, 0); memcpy(pVb, decoded_, size); @@ -482,6 +486,7 @@ void DrawEngineDX9::DoFlush() { vai->numFrames++; } DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } @@ -489,17 +494,20 @@ void DrawEngineDX9::DoFlush() { vai->lastFrame = gpuStats.numFlips; } else { DecodeVerts(decoded_); + DecodeInds(); rotateVBO: gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); useElements = !indexGen.SeenOnlyPurePrims(); vertexCount = indexGen.VertexCount(); - maxIndex = indexGen.MaxIndex(); + maxIndex = MaxIndex(); if (!useElements && indexGen.PureCount()) { vertexCount = indexGen.PureCount(); } prim = indexGen.Prim(); } + _dbg_assert_((int)prim > 0); + bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); @@ -544,6 +552,7 @@ rotateVBO: dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); + DecodeInds(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); @@ -582,7 +591,7 @@ rotateVBO: UpdateCachedViewportState(vpAndScissor); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); SoftwareTransform swTransform(params); // Half pixel offset hack. diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp index f8fc3cbf34..c202eb3634 100644 --- a/GPU/GLES/DrawEngineGLES.cpp +++ b/GPU/GLES/DrawEngineGLES.cpp @@ -286,9 +286,9 @@ void DrawEngineGLES::DoFlush() { // Figure out how much pushbuffer space we need to allocate. int vertsToDecode = ComputeNumVertsToDecode(); u8 *dest = (u8 *)frameData.pushVertex->Allocate(vertsToDecode * dec_->GetDecVtxFmt().stride, 4, &vertexBuffer, &vertexBufferOffset); - // Indices are decoded in here. DecodeVerts(dest); } + DecodeInds(); gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); @@ -345,6 +345,7 @@ void DrawEngineGLES::DoFlush() { dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); + DecodeInds(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { @@ -383,7 +384,7 @@ void DrawEngineGLES::DoFlush() { UpdateCachedViewportState(vpAndScissor_); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); int vertexCount = indexGen.VertexCount(); // TODO: Split up into multiple draw calls for GLES 2.0 where you can't guarantee support for more than 0x10000 verts. diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 70891211b9..2a463fb9e0 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -566,10 +566,11 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim vai->minihash = ComputeMiniHash(); vai->status = VertexArrayInfoVulkan::VAI_HASHING; vai->drawsUntilNextFullHash = 0; - DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); // writes to indexGen + DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; return true; } @@ -593,6 +594,7 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim if (newMiniHash != vai->minihash || newHash != vai->hash) { MarkUnreliable(vai); DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); return true; } if (vai->numVerts > 64) { @@ -612,6 +614,7 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim if (newMiniHash != vai->minihash) { MarkUnreliable(vai); DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); return true; } } @@ -619,9 +622,10 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim if (!vai->vb) { // Directly push to the vertex cache. DecodeVertsToPushBuffer(vertexCache_, &vai->vbOffset, &vai->vb); + DecodeInds(); _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); vai->numVerts = indexGen.VertexCount(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; if (forceIndexed) { vai->prim = indexGen.GeneralPrim(); @@ -684,6 +688,7 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim vai->numFrames++; } DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); return true; } default: @@ -889,7 +894,7 @@ void DrawEngineVulkan::DoFlush() { UpdateCachedViewportState(vpAndScissor); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); SoftwareTransform swTransform(params); const Lin::Vec3 trans(gstate_c.vpXOffset, gstate_c.vpYOffset, gstate_c.vpZOffset * 0.5f + 0.5f); @@ -1037,6 +1042,7 @@ void DrawEngineVulkan::ResetAfterDraw() { decodedVerts_ = 0; numDrawVerts_ = 0; numDrawInds_ = 0; + vertexCountInDrawCalls_ = 0; decodeIndsCounter_ = 0; decodeVertsCounter_ = 0; decOptions_.applySkinInDecode = g_Config.bSoftwareSkinning; diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h index cfc03a7b6e..ae6124a04c 100644 --- a/GPU/Vulkan/DrawEngineVulkan.h +++ b/GPU/Vulkan/DrawEngineVulkan.h @@ -170,13 +170,13 @@ public: // So that this can be inlined void Flush() { - if (!numDrawVerts_) + if (!numDrawInds_) return; DoFlush(); } void FinishDeferred() { - if (!numDrawVerts_) + if (!numDrawInds_) return; // Decode any pending vertices. And also flush while we're at it, for simplicity. // It might be possible to only decode like in the other backends, but meh, it can't matter. @@ -185,9 +185,9 @@ public: } void DispatchFlush() override { - if (!numDrawVerts_) + if (!numDrawInds_) return; - Flush(); + DoFlush(); } VkPipelineLayout GetPipelineLayout() const {