diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 95d6d5ad2e..60c250533a 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -34,7 +34,6 @@ enum { }; DrawEngineCommon::DrawEngineCommon() : decoderMap_(16) { - quadIndices_ = new u16[6 * QUAD_INDICES_MAX]; decJitCache_ = new VertexDecoderJitCache(); transformed = (TransformedVertex *)AllocateMemoryPages(TRANSFORMED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); transformedExpanded = (TransformedVertex *)AllocateMemoryPages(3 * TRANSFORMED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); @@ -43,11 +42,11 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(16) { DrawEngineCommon::~DrawEngineCommon() { FreeMemoryPages(transformed, TRANSFORMED_VERTEX_BUFFER_SIZE); FreeMemoryPages(transformedExpanded, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE); - delete[] quadIndices_; delete decJitCache_; decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) { delete decoder; }); + ClearSplineBezierWeights(); } VertexDecoder *DrawEngineCommon::GetVertexDecoder(u32 vtype) { @@ -739,3 +738,25 @@ void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, } } } + +void TessellationDataTransfer::CopyControlPoints(float *pos, float *tex, float *col, int posStride, int texStride, int colStride, const SimpleVertex *const *points, int size, u32 vertType) { + bool hasColor = (vertType & GE_VTYPE_COL_MASK) != 0; + bool hasTexCoord = (vertType & GE_VTYPE_TC_MASK) != 0; + + for (int i = 0; i < size; ++i) { + memcpy(pos, points[i]->pos.AsArray(), 3 * sizeof(float)); + pos += posStride; + } + if (hasTexCoord) { + for (int i = 0; i < size; ++i) { + memcpy(tex, points[i]->uv, 2 * sizeof(float)); + tex += texStride; + } + } + if (hasColor) { + for (int i = 0; i < size; ++i) { + memcpy(col, Vec4f::FromRGBA(points[i]->color_32).AsArray(), 4 * sizeof(float)); + col += colStride; + } + } +} diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 03e9e50cf3..353b3dd7d0 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -34,7 +34,6 @@ enum { VERTEX_BUFFER_MAX = 65536, DECODED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * 64, DECODED_INDEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * 16, - SPLINE_BUFFER_SIZE = VERTEX_BUFFER_MAX * 26, // At least, this buffer needs greater than 1679616 bytes for Mist Dragon morphing in FF4CC. }; // Avoiding the full include of TextureDecoder.h. @@ -50,6 +49,15 @@ inline uint32_t GetVertTypeID(uint32_t vertType, int uvGenMode) { return (vertType & 0xFFFFFF) | (uvGenMode << 24); } +struct SimpleVertex; +namespace Spline { struct Weight2D; } + +class TessellationDataTransfer { +public: + void CopyControlPoints(float *pos, float *tex, float *col, int posStride, int texStride, int colStride, const SimpleVertex *const *points, int size, u32 vertType); + virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0; +}; + class DrawEngineCommon { public: DrawEngineCommon(); @@ -75,6 +83,7 @@ public: void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead); void SubmitSpline(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, int type_u, int type_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead); void SubmitBezier(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead); + void ClearSplineBezierWeights(); std::vector DebugGetVertexLoaderIDs(); std::string DebugGetVertexLoaderString(std::string id, DebugShaderStringType stringType); @@ -160,31 +169,10 @@ protected: int decodedVerts_ = 0; GEPrimitiveType prevPrim_ = GE_PRIM_INVALID; - // Fixed index buffer for easy quad generation from spline/bezier - u16 *quadIndices_ = nullptr; - // Shader blending state bool fboTexNeedBind_ = false; bool fboTexBound_ = false; // Hardware tessellation - int numPatches; - class TessellationDataTransfer { - protected: - // TODO: These aren't used by all backends. - int prevSize; - int prevSizeTex; - int prevSizeCol; - public: - virtual ~TessellationDataTransfer() {} - // Send spline/bezier's control points to vertex shader through floating point texture. - virtual void PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) { - posStride = 4; - texStride = 4; - colStride = 4; - } - virtual void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) = 0; - virtual void EndFrame() {} - }; TessellationDataTransfer *tessDataTransfer; }; diff --git a/GPU/Common/ShaderId.cpp b/GPU/Common/ShaderId.cpp index 83bbd08e3e..e7ed0b4e87 100644 --- a/GPU/Common/ShaderId.cpp +++ b/GPU/Common/ShaderId.cpp @@ -53,6 +53,7 @@ std::string VertexShaderDesc(const ShaderID &id) { if (id.Bit(VS_BIT_SPLINE)) desc << "Spline "; if (id.Bit(VS_BIT_HAS_COLOR_TESS)) desc << "TessC "; if (id.Bit(VS_BIT_HAS_TEXCOORD_TESS)) desc << "TessT "; + if (id.Bit(VS_BIT_HAS_NORMAL_TESS)) desc << "TessN "; if (id.Bit(VS_BIT_NORM_REVERSE_TESS)) desc << "TessRevN "; return desc.str(); @@ -73,6 +74,7 @@ void ComputeVertexShaderID(ShaderID *id_out, u32 vertType, bool useHWTransform) bool doSpline = gstate_c.spline; bool hasColorTess = (gstate.vertType & GE_VTYPE_COL_MASK) != 0 && (doBezier || doSpline); bool hasTexcoordTess = (gstate.vertType & GE_VTYPE_TC_MASK) != 0 && (doBezier || doSpline); + bool hasNormalTess = (gstate.vertType & GE_VTYPE_NRM_MASK) != 0 && (doBezier || doSpline); bool enableFog = gstate.isFogEnabled() && !isModeThrough && !gstate.isModeClear(); bool lmode = gstate.isUsingSecondaryColor() && gstate.isLightingEnabled() && !isModeThrough; @@ -139,6 +141,7 @@ void ComputeVertexShaderID(ShaderID *id_out, u32 vertType, bool useHWTransform) id.SetBit(VS_BIT_SPLINE, doSpline); id.SetBit(VS_BIT_HAS_COLOR_TESS, hasColorTess); id.SetBit(VS_BIT_HAS_TEXCOORD_TESS, hasTexcoordTess); + id.SetBit(VS_BIT_HAS_NORMAL_TESS, hasNormalTess); id.SetBit(VS_BIT_NORM_REVERSE_TESS, gstate.isPatchNormalsReversed()); } } diff --git a/GPU/Common/ShaderId.h b/GPU/Common/ShaderId.h index 740e321fe8..573da9c3f3 100644 --- a/GPU/Common/ShaderId.h +++ b/GPU/Common/ShaderId.h @@ -24,7 +24,7 @@ enum { VS_BIT_HAS_COLOR_TESS = 12, // 1 bit VS_BIT_HAS_TEXCOORD_TESS = 13, // 1 bit VS_BIT_NORM_REVERSE_TESS = 14, // 1 bit - // 15 is free. + VS_BIT_HAS_NORMAL_TESS = 15, // 1 bit VS_BIT_UVGEN_MODE = 16, VS_BIT_UVPROJ_MODE = 18, // 2, can overlap with LS0 VS_BIT_LS0 = 18, // 2 diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp index e0f6719502..26bdbd7c17 100644 --- a/GPU/Common/ShaderUniforms.cpp +++ b/GPU/Common/ShaderUniforms.cpp @@ -240,7 +240,7 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView } if (dirtyUniforms & DIRTY_BEZIERSPLINE) { - ub->spline_counts = BytesToUint32(gstate_c.spline_count_u, gstate_c.spline_count_v, gstate_c.spline_type_u, gstate_c.spline_type_v); + ub->spline_counts = gstate_c.spline_num_points_u; } if (dirtyUniforms & DIRTY_DEPAL) { diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp index 7af7926dde..77427b99ee 100644 --- a/GPU/Common/SplineCommon.cpp +++ b/GPU/Common/SplineCommon.cpp @@ -21,8 +21,6 @@ #include "profiler/profiler.h" #include "Common/CPUDetect.h" -#include "Common/MemoryUtil.h" -#include "Core/Config.h" #include "GPU/Common/GPUStateUtils.h" #include "GPU/Common/SplineCommon.h" @@ -30,67 +28,34 @@ #include "GPU/ge_constants.h" #include "GPU/GPUState.h" // only needed for UVScale stuff -#if defined(_M_SSE) -#include - -inline __m128 SSECrossProduct(__m128 a, __m128 b) -{ - const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2))); - const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))); - return _mm_sub_ps(left, right); +bool CanUseHardwareTessellation(GEPatchPrimType prim) { + if (g_Config.bHardwareTessellation && !g_Config.bSoftwareRendering) { + return CanUseHardwareTransform(PatchPrimToPrim(prim)); + } + return false; } -inline __m128 SSENormalizeMultiplierSSE2(__m128 v) -{ - const __m128 sq = _mm_mul_ps(v, v); - const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2)); - const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq)); +class SimpleBufferManager { +private: + u8 *buf_; + size_t totalSize, maxSize_; +public: + SimpleBufferManager(u8 *buf, size_t maxSize) + : buf_(buf), totalSize(0), maxSize_(maxSize) {} - const __m128 rt = _mm_rsqrt_ss(res); - return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0)); -} + u8 *Allocate(size_t size) { + size = (size + 15) & ~15; // Align for 16 bytes -#if _M_SSE >= 0x401 -#include + if ((totalSize + size) > maxSize_) + return nullptr; // No more memory -inline __m128 SSENormalizeMultiplierSSE4(__m128 v) -{ - return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF)); -} + size_t tmp = totalSize; + totalSize += size; + return buf_ + tmp; + } +}; -inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v) -{ - if (useSSE4) - return SSENormalizeMultiplierSSE4(v); - return SSENormalizeMultiplierSSE2(v); -} -#else -inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v) -{ - return SSENormalizeMultiplierSSE2(v); -} -#endif - -#endif - - -#define START_OPEN 1 -#define END_OPEN 2 - - - -static void CopyQuad(u8 *&dest, const SimpleVertex *v1, const SimpleVertex *v2, const SimpleVertex *v3, const SimpleVertex *v4) { - int vertexSize = sizeof(SimpleVertex); - memcpy(dest, v1, vertexSize); - dest += vertexSize; - memcpy(dest, v2, vertexSize); - dest += vertexSize; - memcpy(dest, v3, vertexSize); - dest += vertexSize; - memcpy(dest, v4, vertexSize); - dest += vertexSize; -} +namespace Spline { static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, const int idx1, const int idx2, const int idx3) { if (type == GE_PATCHPRIM_LINES) { @@ -100,8 +65,7 @@ static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, c *(indices++) = idx3; *(indices++) = idx1; *(indices++) = idx2; - } - else { + } else { *(indices++) = idx0; *(indices++) = idx2; *(indices++) = idx1; @@ -111,765 +75,442 @@ static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, c } } -#undef b2 +void BuildIndex(u16 *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total) { + for (int v = 0; v < num_v; ++v) { + for (int u = 0; u < num_u; ++u) { + int idx0 = v * (num_u + 1) + u + total; // Top left + int idx2 = (v + 1) * (num_u + 1) + u + total; // Bottom left -// Bernstein basis functions -inline float bern0(float x) { return (1 - x) * (1 - x) * (1 - x); } -inline float bern1(float x) { return 3 * x * (1 - x) * (1 - x); } -inline float bern2(float x) { return 3 * x * x * (1 - x); } -inline float bern3(float x) { return x * x * x; } - -inline float bern0deriv(float x) { return -3 * (x - 1) * (x - 1); } -inline float bern1deriv(float x) { return 9 * x * x - 12 * x + 3; } -inline float bern2deriv(float x) { return 3 * (2 - 3 * x) * x; } -inline float bern3deriv(float x) { return 3 * x * x; } - -// http://en.wikipedia.org/wiki/Bernstein_polynomial -static Math3D::Vec2f Bernstein3D(const Math3D::Vec2f& p0, const Math3D::Vec2f& p1, const Math3D::Vec2f& p2, const Math3D::Vec2f& p3, float x) { - if (x == 0) return p0; - else if (x == 1) return p3; - return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x); -} - -static Vec3f Bernstein3D(const Vec3f& p0, const Vec3f& p1, const Vec3f& p2, const Vec3f& p3, float x) { - if (x == 0) return p0; - else if (x == 1) return p3; - return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x); -} - -static Vec4f Bernstein3D(const Vec4f& p0, const Vec4f& p1, const Vec4f& p2, const Vec4f& p3, float x) { - if (x == 0) return p0; - else if (x == 1) return p3; - return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x); -} - -static Vec4f Bernstein3D(const u32& p0, const u32& p1, const u32& p2, const u32& p3, float x) { - return Bernstein3D(Vec4f::FromRGBA(p0), Vec4f::FromRGBA(p1), Vec4f::FromRGBA(p2), Vec4f::FromRGBA(p3), x); -} - -static Vec3f Bernstein3DDerivative(const Vec3f& p0, const Vec3f& p1, const Vec3f& p2, const Vec3f& p3, float x) { - return p0 * bern0deriv(x) + p1 * bern1deriv(x) + p2 * bern2deriv(x) + p3 * bern3deriv(x); -} - -static void spline_n_4(int i, float t, float *knot, float *splineVal) { - knot += i + 1; - -#ifdef _M_SSE - const __m128 knot012 = _mm_loadu_ps(&knot[0]); - const __m128 knot345 = _mm_loadu_ps(&knot[3]); - const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012); - const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012)); - - const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0)); - const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1)); - const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1)); - const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122)); - - // It's still faster to use SSE, even with this. - alignas(16) float ff30_41_52[4]; - alignas(16) float ff31_42_32[4]; - _mm_store_ps(ff30_41_52, f30_41_52); - _mm_store_ps(ff31_42_32, f31_42_32); - - const float &f30 = ff30_41_52[0]; - const float &f41 = ff30_41_52[1]; - const float &f52 = ff30_41_52[2]; - const float &f31 = ff31_42_32[0]; - const float &f42 = ff31_42_32[1]; - const float &f32 = ff31_42_32[2]; -#else - // TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly... - float t0 = (t - knot[0]); - float t1 = (t - knot[1]); - float t2 = (t - knot[2]); - // TODO: All our knots are integers so we should be able to get rid of these divisions (How?) - float f30 = t0/(knot[3]-knot[0]); - float f41 = t1/(knot[4]-knot[1]); - float f52 = t2/(knot[5]-knot[2]); - float f31 = t1/(knot[3]-knot[1]); - float f42 = t2/(knot[4]-knot[2]); - float f32 = t2/(knot[3]-knot[2]); -#endif - - float a = (1-f30)*(1-f31); - float b = (f31*f41); - float c = (1-f41)*(1-f42); - float d = (f42*f52); - - splineVal[0] = a-(a*f32); - splineVal[1] = 1-a-b+((a+b+c-1)*f32); - splineVal[2] = b+((1-b-c-d)*f32); - splineVal[3] = d*f32; -} - -// knot should be an array sized n + 5 (n + 1 + 1 + degree (cubic)) -static void spline_knot(int n, int type, float *knot) { - memset(knot, 0, sizeof(float) * (n + 5)); - for (int i = 0; i < n - 1; ++i) - knot[i + 3] = (float)i; - - if ((type & 1) == 0) { - knot[0] = -3; - knot[1] = -2; - knot[2] = -1; - } - if ((type & 2) == 0) { - knot[n + 2] = (float)(n - 1); - knot[n + 3] = (float)(n); - knot[n + 4] = (float)(n + 1); - } else { - knot[n + 2] = (float)(n - 2); - knot[n + 3] = (float)(n - 2); - knot[n + 4] = (float)(n - 2); - } -} - -bool CanUseHardwareTessellation(GEPatchPrimType prim) { - if (g_Config.bHardwareTessellation && !g_Config.bSoftwareRendering) { - return CanUseHardwareTransform(PatchPrimToPrim(prim)); - } - return false; -} - -// Prepare mesh of one patch for "Instanced Tessellation". -static void TessellateSplinePatchHardware(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch) { - SimpleVertex *&vertices = (SimpleVertex*&)dest; - - float inv_u = 1.0f / (float)spatch.tess_u; - float inv_v = 1.0f / (float)spatch.tess_v; - - // Generating simple input vertices for the spline-computing vertex shader. - for (int tile_v = 0; tile_v < spatch.tess_v + 1; ++tile_v) { - for (int tile_u = 0; tile_u < spatch.tess_u + 1; ++tile_u) { - SimpleVertex &vert = vertices[tile_v * (spatch.tess_u + 1) + tile_u]; - vert.pos.x = (float)tile_u * inv_u; - vert.pos.y = (float)tile_v * inv_v; - - // TODO: Move to shader uniform and unify this method spline and bezier if necessary. - // For compute normal - vert.nrm.x = inv_u; - vert.nrm.y = inv_v; - } - } - - // Combine the vertices into triangles. - for (int tile_v = 0; tile_v < spatch.tess_v; ++tile_v) { - for (int tile_u = 0; tile_u < spatch.tess_u; ++tile_u) { - int idx0 = tile_v * (spatch.tess_u + 1) + tile_u; - int idx1 = tile_v * (spatch.tess_u + 1) + tile_u + 1; - int idx2 = (tile_v + 1) * (spatch.tess_u + 1) + tile_u; - int idx3 = (tile_v + 1) * (spatch.tess_u + 1) + tile_u + 1; - - CopyQuadIndex(indices, spatch.primType, idx0, idx1, idx2, idx3); + CopyQuadIndex(indices, prim_type, idx0, idx0 + 1, idx2, idx2 + 1); count += 6; } } } -static void _SplinePatchLowQuality(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType) { - // Fast and easy way - just draw the control points, generate some very basic normal vector substitutes. - // Very inaccurate but okay for Loco Roco. Maybe should keep it as an option because it's fast. +class Bezier3DWeight { +private: + void CalcWeights(float t, Weight &w) { + // Bernstein 3D basis polynomial + w.basis[0] = (1 - t) * (1 - t) * (1 - t); + w.basis[1] = 3 * t * (1 - t) * (1 - t); + w.basis[2] = 3 * t * t * (1 - t); + w.basis[3] = t * t * t; - const int tile_min_u = (spatch.type_u & START_OPEN) ? 0 : 1; - const int tile_min_v = (spatch.type_v & START_OPEN) ? 0 : 1; - const int tile_max_u = (spatch.type_u & END_OPEN) ? spatch.count_u - 1 : spatch.count_u - 2; - const int tile_max_v = (spatch.type_v & END_OPEN) ? spatch.count_v - 1 : spatch.count_v - 2; + // Derivative + w.deriv[0] = -3 * (1 - t) * (1 - t); + w.deriv[1] = 9 * t * t - 12 * t + 3; + w.deriv[2] = 3 * (2 - 3 * t) * t; + w.deriv[3] = 3 * t * t; + } +public: + Weight *CalcWeightsAll(u32 key) { + int tess = (int)key; + Weight *weights = new Weight[tess + 1]; + const float inv_tess = 1.0f / (float)tess; + for (int i = 0; i < tess + 1; ++i) { + const float t = (float)i * inv_tess; + CalcWeights(t, weights[i]); + } + return weights; + } - float tu_width = (float)spatch.count_u - 3.0f; - float tv_height = (float)spatch.count_v - 3.0f; - tu_width /= (float)(tile_max_u - tile_min_u); - tv_height /= (float)(tile_max_v - tile_min_v); + static u32 ToKey(int tess, int count, int type) { + return tess; + } - GEPatchPrimType prim_type = spatch.primType; - bool computeNormals = spatch.computeNormals; - bool patchFacing = spatch.patchFacing; + static int CalcSize(int tess, int count) { + return tess + 1; + } - int i = 0; - for (int tile_v = tile_min_v; tile_v < tile_max_v; ++tile_v) { - for (int tile_u = tile_min_u; tile_u < tile_max_u; ++tile_u) { - int point_index = tile_u + tile_v * spatch.count_u; + static WeightCache weightsCache; +}; - SimpleVertex v0 = *spatch.points[point_index]; - SimpleVertex v1 = *spatch.points[point_index + 1]; - SimpleVertex v2 = *spatch.points[point_index + spatch.count_u]; - SimpleVertex v3 = *spatch.points[point_index + spatch.count_u + 1]; +class Spline3DWeight { +private: + struct KnotDiv { + float _3_0 = 1.0f / 3.0f; + float _4_1 = 1.0f / 3.0f; + float _5_2 = 1.0f / 3.0f; + float _3_1 = 1.0f / 2.0f; + float _4_2 = 1.0f / 2.0f; + float _3_2 = 1.0f; // Always 1 + }; - // Generate UV. TODO: Do this even if UV specified in control points? - if ((origVertType & GE_VTYPE_TC_MASK) == 0) { - float u = (tile_u - tile_min_u) * tu_width; - float v = (tile_v - tile_min_v) * tv_height; + // knot should be an array sized n + 5 (n + 1 + 1 + degree (cubic)) + void CalcKnots(int n, int type, float *knots, KnotDiv *divs) { + // Basic theory (-2 to +3), optimized with KnotDiv (-2 to +0) + // for (int i = 0; i < n + 5; ++i) { + for (int i = 0; i < n + 2; ++i) { + knots[i] = (float)i - 2; + } - v0.uv[0] = u; - v0.uv[1] = v; - v1.uv[0] = u + tu_width; - v1.uv[1] = v; - v2.uv[0] = u; - v2.uv[1] = v + tv_height; - v3.uv[0] = u + tu_width; - v3.uv[1] = v + tv_height; - } + // The first edge is open + if ((type & 1) != 0) { + knots[0] = 0; + knots[1] = 0; - // Generate normal if lighting is enabled (otherwise there's no point). - // This is a really poor quality algorithm, we get facet normals. - if (computeNormals) { - Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos); - norm.Normalize(); - if (patchFacing) - norm *= -1.0f; - v0.nrm = norm; - v1.nrm = norm; - v2.nrm = norm; - v3.nrm = norm; - } - - int idx0 = i * 4 + 0; - int idx1 = i * 4 + 1; - int idx2 = i * 4 + 2; - int idx3 = i * 4 + 3; - i++; - - CopyQuad(dest, &v0, &v1, &v2, &v3); - CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3); - count += 6; + divs[0]._3_0 = 1.0f; + divs[0]._4_1 = 1.0f / 2.0f; + divs[0]._3_1 = 1.0f; + if (n > 1) + divs[1]._3_0 = 1.0f / 2.0f; + } + // The last edge is open + if ((type & 2) != 0) { + // knots[n + 2] = (float)n; // Got rid of this line optimized with KnotDiv + // knots[n + 3] = (float)n; // Got rid of this line optimized with KnotDiv + // knots[n + 4] = (float)n; // Got rid of this line optimized with KnotDiv + divs[n - 1]._4_1 = 1.0f / 2.0f; + divs[n - 1]._5_2 = 1.0f; + divs[n - 1]._4_2 = 1.0f; + if (n > 1) + divs[n - 2]._5_2 = 1.0f / 2.0f; } } -} - -static inline void AccumulateWeighted(Vec3f &out, const Vec3Packedf &in, const Vec4f &w) { + void CalcWeights(float t, const float *knots, const KnotDiv &div, Weight &w) { #ifdef _M_SSE - out.vec = _mm_add_ps(out.vec, _mm_mul_ps(_mm_loadu_ps(in.AsArray()), w.vec)); + const __m128 knot012 = _mm_loadu_ps(knots); + const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012); + const __m128 f30_41_52 = _mm_mul_ps(t012, _mm_loadu_ps(&div._3_0)); + const __m128 f52_31_42 = _mm_mul_ps(t012, _mm_loadu_ps(&div._5_2)); + + // Following comments are for explains order of the multiply. + // float a = (1-f30)*(1-f31); + // float c = (1-f41)*(1-f42); + // float b = ( f31 * f41); + // float d = ( f42 * f52); + const __m128 f30_41_31_42 = _mm_shuffle_ps(f30_41_52, f52_31_42, _MM_SHUFFLE(2, 1, 1, 0)); + const __m128 f31_42_41_52 = _mm_shuffle_ps(f52_31_42, f30_41_52, _MM_SHUFFLE(2, 1, 2, 1)); + const __m128 c1_1_0_0 = { 1, 1, 0, 0 }; + const __m128 acbd = _mm_mul_ps(_mm_sub_ps(c1_1_0_0, f30_41_31_42), _mm_sub_ps(c1_1_0_0, f31_42_41_52)); + + alignas(16) float f_t012[4]; + alignas(16) float f_acbd[4]; + alignas(16) float f_f30_41_31_42[4]; + _mm_store_ps(f_t012, t012); + _mm_store_ps(f_acbd, acbd); + _mm_store_ps(f_f30_41_31_42, f30_41_31_42); + + const float &f32 = f_t012[2]; + + const float &a = f_acbd[0]; + const float &b = f_acbd[2]; + const float &c = f_acbd[1]; + const float &d = f_acbd[3]; + + // For derivative + const float &f31 = f_f30_41_31_42[2]; + const float &f42 = f_f30_41_31_42[3]; #else - out += in * w.x; + // TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly... + float t0 = (t - knots[0]); + float t1 = (t - knots[1]); + float t2 = (t - knots[2]); + + float f30 = t0 * div._3_0; + float f41 = t1 * div._4_1; + float f52 = t2 * div._5_2; + float f31 = t1 * div._3_1; + float f42 = t2 * div._4_2; + float f32 = t2 * div._3_2; + + float a = (1 - f30) * (1 - f31); + float b = (f31 * f41); + float c = (1 - f41) * (1 - f42); + float d = (f42 * f52); #endif -} + w.basis[0] = a * (1 - f32); // (1-f30)*(1-f31)*(1-f32) + w.basis[1] = 1 - a - b + ((a + b + c - 1) * f32); + w.basis[2] = b + ((1 - b - c - d) * f32); + w.basis[3] = d * f32; // f32*f42*f52 -static inline void AccumulateWeighted(Vec4f &out, const Vec4f &in, const Vec4f &w) { -#ifdef _M_SSE - out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec)); -#else - out += in * w; -#endif -} + // Derivative + float i1 = (1 - f31) * (1 - f32); + float i2 = f31 * (1 - f32) + (1 - f42) * f32; + float i3 = f42 * f32; -template -static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) { - // Full (mostly) correct tessellation of spline patches. - // Not very fast. + float f130 = i1 * div._3_0; + float f241 = i2 * div._4_1; + float f352 = i3 * div._5_2; - float *knot_u = new float[spatch.count_u + 4]; - float *knot_v = new float[spatch.count_v + 4]; - spline_knot(spatch.count_u - 1, spatch.type_u, knot_u); - spline_knot(spatch.count_v - 1, spatch.type_v, knot_v); + w.deriv[0] = 3 * (0 - f130); + w.deriv[1] = 3 * (f130 - f241); + w.deriv[2] = 3 * (f241 - f352); + w.deriv[3] = 3 * (f352 - 0); + } +public: + Weight *CalcWeightsAll(u32 key) { + int tess, count, type; + FromKey(key, tess, count, type); + const int num_patches = count - 3; + Weight *weights = new Weight[tess * num_patches + 1]; - // Increase tessellation based on the size. Should be approximately right? - int patch_div_s = (spatch.count_u - 3) * spatch.tess_u; - int patch_div_t = (spatch.count_v - 3) * spatch.tess_v; - if (quality > 1) { - // Don't cut below 2, though. - if (patch_div_s > 2) { - patch_div_s /= quality; - } - if (patch_div_t > 2) { - patch_div_t /= quality; + // float *knots = new float[num_patches + 5]; + float *knots = new float[num_patches + 2]; // Optimized with KnotDiv, must use +5 in theory + KnotDiv *divs = new KnotDiv[num_patches]; + CalcKnots(num_patches, type, knots, divs); + + const float inv_tess = 1.0f / (float)tess; + for (int i = 0; i < num_patches; ++i) { + const int start = (i == 0) ? 0 : 1; + for (int j = start; j <= tess; ++j) { + const int index = i * tess + j; + const float t = (float)index * inv_tess; + CalcWeights(t, knots + i, divs[i], weights[index]); + } } + + delete[] knots; + delete[] divs; + + return weights; } - // Downsample until it fits, in case crazy tessellation factors are sent. - while ((patch_div_s + 1) * (patch_div_t + 1) > maxVertices) { - patch_div_s /= 2; - patch_div_t /= 2; + static u32 ToKey(int tess, int count, int type) { + return tess | (count << 8) | (type << 16); } - if (patch_div_s < 1) patch_div_s = 1; - if (patch_div_t < 1) patch_div_t = 1; + static void FromKey(u32 key, int &tess, int &count, int &type) { + tess = key & 0xFF; count = (key >> 8) & 0xFF; type = (key >> 16) & 0xFF; + } - // First compute all the vertices and put them in an array - SimpleVertex *&vertices = (SimpleVertex*&)dest; + static int CalcSize(int tess, int count) { + return (count - 3) * tess + 1; + } - float tu_width = (float)spatch.count_u - 3.0f; - float tv_height = (float)spatch.count_v - 3.0f; + static WeightCache weightsCache; +}; - // int max_idx = spatch.count_u * spatch.count_v; +WeightCache Bezier3DWeight::weightsCache; +WeightCache Spline3DWeight::weightsCache; - bool computeNormals = spatch.computeNormals; +// Tessellate single patch (4x4 control points) +template +class Tessellator { +private: + const T *const p[4]; // T p[v][u]; 4x4 control points + T u[4]; // Pre-tessellated U lines +public: + Tessellator(const T *p, const int idx[4]) : p{ p + idx[0], p + idx[1], p + idx[2], p + idx[3] } {} - float one_over_patch_div_s = 1.0f / (float)(patch_div_s); - float one_over_patch_div_t = 1.0f / (float)(patch_div_t); + // Linear combination + T Sample(const T p[4], const float w[4]) { + return p[0] * w[0] + p[1] * w[1] + p[2] * w[2] + p[3] * w[3]; + } - for (int tile_v = 0; tile_v < patch_div_t + 1; tile_v++) { - float v = (float)tile_v * (float)(spatch.count_v - 3) * one_over_patch_div_t; - if (v < 0.0f) - v = 0.0f; - for (int tile_u = 0; tile_u < patch_div_s + 1; tile_u++) { - float u = (float)tile_u * (float)(spatch.count_u - 3) * one_over_patch_div_s; - if (u < 0.0f) - u = 0.0f; - SimpleVertex *vert = &vertices[tile_v * (patch_div_s + 1) + tile_u]; - Vec4f vert_color(0, 0, 0, 0); - Vec3f vert_pos; - vert_pos.SetZero(); - Vec3f vert_nrm; - if (origNrm) { - vert_nrm.SetZero(); - } - if (origCol) { - vert_color.SetZero(); - } else { - memcpy(vert->color, spatch.points[0]->color, 4); - } - if (origTc) { - vert->uv[0] = 0.0f; - vert->uv[1] = 0.0f; - } else { - vert->uv[0] = tu_width * ((float)tile_u * one_over_patch_div_s); - vert->uv[1] = tv_height * ((float)tile_v * one_over_patch_div_t); - } + void SampleEdgeU(int idx) { + u[0] = p[0][idx]; + u[1] = p[1][idx]; + u[2] = p[2][idx]; + u[3] = p[3][idx]; + } + void SampleU(const float weights[4]) { + if (weights[0] == 1.0f) { SampleEdgeU(0); return; } // weights = {1,0,0,0}, first edge is open. + if (weights[3] == 1.0f) { SampleEdgeU(3); return; } // weights = {0,0,0,1}, last edge is open. - // Collect influences from surrounding control points. - float u_weights[4]; - float v_weights[4]; + u[0] = Sample(p[0], weights); + u[1] = Sample(p[1], weights); + u[2] = Sample(p[2], weights); + u[3] = Sample(p[3], weights); + } - int iu = (int)u; - int iv = (int)v; + T SampleV(const float weights[4]) { + if (weights[0] == 1.0f) return u[0]; // weights = {1,0,0,0}, first edge is open. + if (weights[3] == 1.0f) return u[3]; // weights = {0,0,0,1}, last edge is open. - // TODO: Would really like to fix the surrounding logic somehow to get rid of these but I can't quite get it right.. - // Without the previous epsilons and with large count_u, we will end up doing an out of bounds access later without these. - if (iu >= spatch.count_u - 3) iu = spatch.count_u - 4; - if (iv >= spatch.count_v - 3) iv = spatch.count_v - 4; + return Sample(u, weights); + } +}; - spline_n_4(iu, u, knot_u, u_weights); - spline_n_4(iv, v, knot_v, v_weights); +ControlPoints::ControlPoints(const SimpleVertex *const *points, int size, SimpleBufferManager &managedBuf) { + pos = (Vec3f *)managedBuf.Allocate(sizeof(Vec3f) * size); + tex = (Vec2f *)managedBuf.Allocate(sizeof(Vec2f) * size); + col = (Vec4f *)managedBuf.Allocate(sizeof(Vec4f) * size); + Convert(points, size); +} - // Handle degenerate patches. without this, spatch.points[] may read outside the number of initialized points. - int patch_w = std::min(spatch.count_u - iu, 4); - int patch_h = std::min(spatch.count_v - iv, 4); +void ControlPoints::Convert(const SimpleVertex *const *points, int size) { + for (int i = 0; i < size; ++i) { + pos[i] = Vec3f(points[i]->pos); + tex[i] = Vec2f(points[i]->uv); + col[i] = Vec4f::FromRGBA(points[i]->color_32); + } + defcolor = points[0]->color_32; +} - for (int ii = 0; ii < patch_w; ++ii) { - for (int jj = 0; jj < patch_h; ++jj) { - float u_spline = u_weights[ii]; - float v_spline = v_weights[jj]; - float f = u_spline * v_spline; +template +class SubdivisionSurface { +public: + template + static void Tessellate(OutputBuffers &output, const Surface &surface, const ControlPoints &points, const Weight2D &weights) { + const float inv_u = 1.0f / (float)surface.tess_u; + const float inv_v = 1.0f / (float)surface.tess_v; - if (f > 0.0f) { -#ifdef _M_SSE - Vec4f fv(_mm_set_ps1(f)); -#else - Vec4f fv = Vec4f::AssignToAll(f); -#endif - int idx = spatch.count_u * (iv + jj) + (iu + ii); - /* - if (idx >= max_idx) { - char temp[512]; - snprintf(temp, sizeof(temp), "count_u: %d count_v: %d patch_w: %d patch_h: %d ii: %d jj: %d iu: %d iv: %d patch_div_s: %d patch_div_t: %d\n", spatch.count_u, spatch.count_v, patch_w, patch_h, ii, jj, iu, iv, patch_div_s, patch_div_t); - OutputDebugStringA(temp); - Crash(); - }*/ - const SimpleVertex *a = spatch.points[idx]; - AccumulateWeighted(vert_pos, a->pos, fv); - if (origTc) { - vert->uv[0] += a->uv[0] * f; - vert->uv[1] += a->uv[1] * f; + for (int patch_u = 0; patch_u < surface.num_patches_u; ++patch_u) { + const int start_u = surface.GetTessStart(patch_u); + for (int patch_v = 0; patch_v < surface.num_patches_v; ++patch_v) { + const int start_v = surface.GetTessStart(patch_v); + + // Prepare 4x4 control points to tessellate + const int idx = surface.GetPointIndex(patch_u, patch_v); + const int idx_v[4] = { idx, idx + surface.num_points_u, idx + surface.num_points_u * 2, idx + surface.num_points_u * 3 }; + Tessellator tess_pos(points.pos, idx_v); + Tessellator tess_col(points.col, idx_v); + Tessellator tess_tex(points.tex, idx_v); + Tessellator tess_nrm(points.pos, idx_v); + + for (int tile_u = start_u; tile_u <= surface.tess_u; ++tile_u) { + const int index_u = surface.GetIndexU(patch_u, tile_u); + const Weight &wu = weights.u[index_u]; + + // Pre-tessellate U lines + tess_pos.SampleU(wu.basis); + if (sampleCol) + tess_col.SampleU(wu.basis); + if (sampleTex) + tess_tex.SampleU(wu.basis); + if (sampleNrm) + tess_nrm.SampleU(wu.deriv); + + for (int tile_v = start_v; tile_v <= surface.tess_v; ++tile_v) { + const int index_v = surface.GetIndexV(patch_v, tile_v); + const Weight &wv = weights.v[index_v]; + + SimpleVertex &vert = output.vertices[surface.GetIndex(index_u, index_v, patch_u, patch_v)]; + + // Tessellate + vert.pos = tess_pos.SampleV(wv.basis); + if (sampleCol) { + vert.color_32 = tess_col.SampleV(wv.basis).ToRGBA(); + } else { + vert.color_32 = points.defcolor; } - if (origCol) { - Vec4f a_color = Vec4f::FromRGBA(a->color_32); - AccumulateWeighted(vert_color, a_color, fv); + if (sampleTex) { + tess_tex.SampleV(wv.basis).Write(vert.uv); + } else { + // Generate texcoord + vert.uv[0] = patch_u + tile_u * inv_u; + vert.uv[1] = patch_v + tile_v * inv_v; } - if (origNrm) { - AccumulateWeighted(vert_nrm, a->nrm, fv); + if (sampleNrm) { + const Vec3f derivU = tess_nrm.SampleV(wv.basis); + const Vec3f derivV = tess_pos.SampleV(wv.deriv); + + vert.nrm = Cross(derivU, derivV).Normalized(useSSE4); + if (patchFacing) + vert.nrm *= -1.0f; + } else { + vert.nrm.SetZero(); } } } } - vert->pos = vert_pos; - if (origNrm) { -#ifdef _M_SSE - const __m128 normalize = SSENormalizeMultiplier(useSSE4, vert_nrm.vec); - vert_nrm.vec = _mm_mul_ps(vert_nrm.vec, normalize); -#else - vert_nrm.Normalize(); -#endif - vert->nrm = vert_nrm; - } else { - vert->nrm.SetZero(); - vert->nrm.z = 1.0f; - } - if (origCol) { - vert->color_32 = vert_color.ToRGBA(); - } } + + surface.BuildIndex(output.indices, output.count); } - delete[] knot_u; - delete[] knot_v; + using TessFunc = void(*)(OutputBuffers &, const Surface &, const ControlPoints &, const Weight2D &); + TEMPLATE_PARAMETER_DISPATCHER_FUNCTION(Tess, SubdivisionSurface::Tessellate, TessFunc); - // Hacky normal generation through central difference. - if (computeNormals && !origNrm) { -#ifdef _M_SSE - const __m128 facing = spatch.patchFacing ? _mm_set_ps1(-1.0f) : _mm_set_ps1(1.0f); -#endif + static void Tessellate(OutputBuffers &output, const Surface &surface, const ControlPoints &points, const Weight2D &weights, u32 origVertType) { + const bool params[] = { + (origVertType & GE_VTYPE_NRM_MASK) != 0, + (origVertType & GE_VTYPE_COL_MASK) != 0, + (origVertType & GE_VTYPE_TC_MASK) != 0, + cpu_info.bSSE4_1, + surface.patchFacing, + }; + static TemplateParameterDispatcher dispatcher; // Initialize only once - for (int v = 0; v < patch_div_t + 1; v++) { - Vec3f vl_pos = vertices[v * (patch_div_s + 1)].pos; - Vec3f vc_pos = vertices[v * (patch_div_s + 1)].pos; - - for (int u = 0; u < patch_div_s + 1; u++) { - const int t = std::max(0, v - 1); - const int r = std::min(patch_div_s, u + 1); - const int b = std::min(patch_div_t, v + 1); - - const Vec3f vr_pos = vertices[v * (patch_div_s + 1) + r].pos; - -#ifdef _M_SSE - const __m128 right = _mm_sub_ps(vr_pos.vec, vl_pos.vec); - - const Vec3f vb_pos = vertices[b * (patch_div_s + 1) + u].pos; - const Vec3f vt_pos = vertices[t * (patch_div_s + 1) + u].pos; - const __m128 down = _mm_sub_ps(vb_pos.vec, vt_pos.vec); - - const __m128 crossed = SSECrossProduct(right, down); - const __m128 normalize = SSENormalizeMultiplier(useSSE4, crossed); - - Vec3f finalNrm = _mm_mul_ps(normalize, _mm_mul_ps(crossed, facing)); - vertices[v * (patch_div_s + 1) + u].nrm = finalNrm; -#else - const Vec3Packedf &right = vr_pos - vl_pos; - const Vec3Packedf &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos; - - vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized(); - if (spatch.patchFacing) { - vertices[v * (patch_div_s + 1) + u].nrm *= -1.0f; - } -#endif - - // Rotate for the next one to the right. - vl_pos = vc_pos; - vc_pos = vr_pos; - } - } + TessFunc func = dispatcher.GetFunc(params); + func(output, surface, points, weights); } - - GEPatchPrimType prim_type = spatch.primType; - // Tessellate. - for (int tile_v = 0; tile_v < patch_div_t; ++tile_v) { - for (int tile_u = 0; tile_u < patch_div_s; ++tile_u) { - int idx0 = tile_v * (patch_div_s + 1) + tile_u; - int idx1 = tile_v * (patch_div_s + 1) + tile_u + 1; - int idx2 = (tile_v + 1) * (patch_div_s + 1) + tile_u; - int idx3 = (tile_v + 1) * (patch_div_s + 1) + tile_u + 1; - - CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3); - count += 6; - } - } -} - -template -static inline void SplinePatchFullQualityDispatch4(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) { - if (cpu_info.bSSE4_1) - SplinePatchFullQuality(dest, indices, count, spatch, origVertType, quality, maxVertices); - else - SplinePatchFullQuality(dest, indices, count, spatch, origVertType, quality, maxVertices); -} - -template -static inline void SplinePatchFullQualityDispatch3(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) { - bool origTc = (origVertType & GE_VTYPE_TC_MASK) != 0; - - if (origTc) - SplinePatchFullQualityDispatch4(dest, indices, count, spatch, origVertType, quality, maxVertices); - else - SplinePatchFullQualityDispatch4(dest, indices, count, spatch, origVertType, quality, maxVertices); -} - -template -static inline void SplinePatchFullQualityDispatch2(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) { - bool origCol = (origVertType & GE_VTYPE_COL_MASK) != 0; - - if (origCol) - SplinePatchFullQualityDispatch3(dest, indices, count, spatch, origVertType, quality, maxVertices); - else - SplinePatchFullQualityDispatch3(dest, indices, count, spatch, origVertType, quality, maxVertices); -} - -static void SplinePatchFullQualityDispatch(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) { - bool origNrm = (origVertType & GE_VTYPE_NRM_MASK) != 0; - - if (origNrm) - SplinePatchFullQualityDispatch2(dest, indices, count, spatch, origVertType, quality, maxVertices); - else - SplinePatchFullQualityDispatch2(dest, indices, count, spatch, origVertType, quality, maxVertices); -} - -void TessellateSplinePatch(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int maxVertexCount) { - switch (g_Config.iSplineBezierQuality) { - case LOW_QUALITY: - _SplinePatchLowQuality(dest, indices, count, spatch, origVertType); - break; - case MEDIUM_QUALITY: - SplinePatchFullQualityDispatch(dest, indices, count, spatch, origVertType, 2, maxVertexCount); - break; - case HIGH_QUALITY: - SplinePatchFullQualityDispatch(dest, indices, count, spatch, origVertType, 1, maxVertexCount); - break; - } -} - -static void _BezierPatchLowQuality(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType) { - const float third = 1.0f / 3.0f; - // Fast and easy way - just draw the control points, generate some very basic normal vector subsitutes. - // Very inaccurate though but okay for Loco Roco. Maybe should keep it as an option. - - float u_base = patch.u_index / 3.0f; - float v_base = patch.v_index / 3.0f; - - GEPatchPrimType prim_type = patch.primType; - - for (int tile_v = 0; tile_v < 3; tile_v++) { - for (int tile_u = 0; tile_u < 3; tile_u++) { - int point_index = tile_u + tile_v * 4; - - SimpleVertex v0 = *patch.points[point_index]; - SimpleVertex v1 = *patch.points[point_index + 1]; - SimpleVertex v2 = *patch.points[point_index + 4]; - SimpleVertex v3 = *patch.points[point_index + 5]; - - // Generate UV. TODO: Do this even if UV specified in control points? - if ((origVertType & GE_VTYPE_TC_MASK) == 0) { - float u = u_base + tile_u * third; - float v = v_base + tile_v * third; - v0.uv[0] = u; - v0.uv[1] = v; - v1.uv[0] = u + third; - v1.uv[1] = v; - v2.uv[0] = u; - v2.uv[1] = v + third; - v3.uv[0] = u + third; - v3.uv[1] = v + third; - } - - // Generate normal if lighting is enabled (otherwise there's no point). - // This is a really poor quality algorithm, we get facet normals. - if (patch.computeNormals) { - Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos); - norm.Normalize(); - if (patch.patchFacing) - norm *= -1.0f; - v0.nrm = norm; - v1.nrm = norm; - v2.nrm = norm; - v3.nrm = norm; - } - - int total = patch.index * 3 * 3 * 4; // A patch has 3x3 tiles, and each tiles have 4 vertices. - int tile_index = tile_u + tile_v * 3; - int idx0 = total + tile_index * 4 + 0; - int idx1 = total + tile_index * 4 + 1; - int idx2 = total + tile_index * 4 + 2; - int idx3 = total + tile_index * 4 + 3; - - CopyQuad(dest, &v0, &v1, &v2, &v3); - CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3); - count += 6; - } - } -} - -template -struct PrecomputedCurves { - PrecomputedCurves(int count) { - horiz1 = (T *)AllocateAlignedMemory(count * 4 * sizeof(T), 16); - horiz2 = horiz1 + count * 1; - horiz3 = horiz1 + count * 2; - horiz4 = horiz1 + count * 3; - } - ~PrecomputedCurves() { - FreeAlignedMemory(horiz1); - } - - T Bernstein3D(int u, float bv) { - return ::Bernstein3D(horiz1[u], horiz2[u], horiz3[u], horiz4[u], bv); - } - - T Bernstein3DDerivative(int u, float bv) { - return ::Bernstein3DDerivative(horiz1[u], horiz2[u], horiz3[u], horiz4[u], bv); - } - - T *horiz1; - T *horiz2; - T *horiz3; - T *horiz4; }; -static void _BezierPatchHighQuality(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType) { - const float third = 1.0f / 3.0f; +template +void SoftwareTessellation(OutputBuffers &output, const Surface &surface, u32 origVertType, const ControlPoints &points) { + using WeightType = typename Surface::WeightType; + u32 key_u = WeightType::ToKey(surface.tess_u, surface.num_points_u, surface.type_u); + u32 key_v = WeightType::ToKey(surface.tess_v, surface.num_points_v, surface.type_v); + Weight2D weights(WeightType::weightsCache, key_u, key_v); - // First compute all the vertices and put them in an array - SimpleVertex *&vertices = (SimpleVertex*&)dest; + SubdivisionSurface::Tessellate(output, surface, points, weights, origVertType); +} - PrecomputedCurves prepos(tess_u + 1); - PrecomputedCurves precol(tess_u + 1); - PrecomputedCurves pretex(tess_u + 1); - PrecomputedCurves prederivU(tess_u + 1); +template +static void HardwareTessellation(OutputBuffers &output, const Surface &surface, u32 origVertType, + const SimpleVertex *const *points, TessellationDataTransfer *tessDataTransfer) { + using WeightType = typename Surface::WeightType; + u32 key_u = WeightType::ToKey(surface.tess_u, surface.num_points_u, surface.type_u); + u32 key_v = WeightType::ToKey(surface.tess_v, surface.num_points_v, surface.type_v); + Weight2D weights(WeightType::weightsCache, key_u, key_v); + weights.size_u = WeightType::CalcSize(surface.tess_u, surface.num_points_u); + weights.size_v = WeightType::CalcSize(surface.tess_v, surface.num_points_v); + tessDataTransfer->SendDataToShader(points, surface.num_points_u, surface.num_points_v, origVertType, weights); - const bool computeNormals = patch.computeNormals; - const bool sampleColors = (origVertType & GE_VTYPE_COL_MASK) != 0; - const bool sampleTexcoords = (origVertType & GE_VTYPE_TC_MASK) != 0; - - // Precompute the horizontal curves to we only have to evaluate the vertical ones. - for (int i = 0; i < tess_u + 1; i++) { - float u = ((float)i / (float)tess_u); - prepos.horiz1[i] = Bernstein3D(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, u); - prepos.horiz2[i] = Bernstein3D(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, u); - prepos.horiz3[i] = Bernstein3D(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, u); - prepos.horiz4[i] = Bernstein3D(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, u); - - if (sampleColors) { - precol.horiz1[i] = Bernstein3D(patch.points[0]->color_32, patch.points[1]->color_32, patch.points[2]->color_32, patch.points[3]->color_32, u); - precol.horiz2[i] = Bernstein3D(patch.points[4]->color_32, patch.points[5]->color_32, patch.points[6]->color_32, patch.points[7]->color_32, u); - precol.horiz3[i] = Bernstein3D(patch.points[8]->color_32, patch.points[9]->color_32, patch.points[10]->color_32, patch.points[11]->color_32, u); - precol.horiz4[i] = Bernstein3D(patch.points[12]->color_32, patch.points[13]->color_32, patch.points[14]->color_32, patch.points[15]->color_32, u); - } - if (sampleTexcoords) { - pretex.horiz1[i] = Bernstein3D(Math3D::Vec2f(patch.points[0]->uv), Math3D::Vec2f(patch.points[1]->uv), Math3D::Vec2f(patch.points[2]->uv), Math3D::Vec2f(patch.points[3]->uv), u); - pretex.horiz2[i] = Bernstein3D(Math3D::Vec2f(patch.points[4]->uv), Math3D::Vec2f(patch.points[5]->uv), Math3D::Vec2f(patch.points[6]->uv), Math3D::Vec2f(patch.points[7]->uv), u); - pretex.horiz3[i] = Bernstein3D(Math3D::Vec2f(patch.points[8]->uv), Math3D::Vec2f(patch.points[9]->uv), Math3D::Vec2f(patch.points[10]->uv), Math3D::Vec2f(patch.points[11]->uv), u); - pretex.horiz4[i] = Bernstein3D(Math3D::Vec2f(patch.points[12]->uv), Math3D::Vec2f(patch.points[13]->uv), Math3D::Vec2f(patch.points[14]->uv), Math3D::Vec2f(patch.points[15]->uv), u); - } - - if (computeNormals) { - prederivU.horiz1[i] = Bernstein3DDerivative(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, u); - prederivU.horiz2[i] = Bernstein3DDerivative(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, u); - prederivU.horiz3[i] = Bernstein3DDerivative(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, u); - prederivU.horiz4[i] = Bernstein3DDerivative(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, u); - } - } - - - for (int tile_v = 0; tile_v < tess_v + 1; ++tile_v) { - for (int tile_u = 0; tile_u < tess_u + 1; ++tile_u) { - float u = ((float)tile_u / (float)tess_u); - float v = ((float)tile_v / (float)tess_v); - float bv = v; - - SimpleVertex &vert = vertices[tile_v * (tess_u + 1) + tile_u]; - - if (computeNormals) { - const Vec3f derivU = prederivU.Bernstein3D(tile_u, bv); - const Vec3f derivV = prepos.Bernstein3DDerivative(tile_u, bv); - - vert.nrm = Cross(derivU, derivV).Normalized(); - if (patch.patchFacing) - vert.nrm *= -1.0f; - } else { - vert.nrm.SetZero(); - } - - vert.pos = prepos.Bernstein3D(tile_u, bv); - - if (!sampleTexcoords) { - // Generate texcoord - vert.uv[0] = u + patch.u_index * third; - vert.uv[1] = v + patch.v_index * third; - } else { - // Sample UV from control points - const Math3D::Vec2f res = pretex.Bernstein3D(tile_u, bv); - vert.uv[0] = res.x; - vert.uv[1] = res.y; - } - - if (sampleColors) { - vert.color_32 = precol.Bernstein3D(tile_u, bv).ToRGBA(); - } else { - memcpy(vert.color, patch.points[0]->color, 4); + // Generating simple input vertices for the spline-computing vertex shader. + float inv_u = 1.0f / (float)surface.tess_u; + float inv_v = 1.0f / (float)surface.tess_v; + for (int patch_u = 0; patch_u < surface.num_patches_u; ++patch_u) { + const int start_u = surface.GetTessStart(patch_u); + for (int patch_v = 0; patch_v < surface.num_patches_v; ++patch_v) { + const int start_v = surface.GetTessStart(patch_v); + for (int tile_u = start_u; tile_u <= surface.tess_u; ++tile_u) { + const int index_u = surface.GetIndexU(patch_u, tile_u); + for (int tile_v = start_v; tile_v <= surface.tess_v; ++tile_v) { + const int index_v = surface.GetIndexV(patch_v, tile_v); + SimpleVertex &vert = output.vertices[surface.GetIndex(index_u, index_v, patch_u, patch_v)]; + // Index for the weights + vert.pos.x = index_u; + vert.pos.y = index_v; + // For texcoord generation + vert.nrm.x = patch_u + (float)tile_u * inv_u; + vert.nrm.y = patch_v + (float)tile_v * inv_v; + // Patch position + vert.pos.z = patch_u; + vert.nrm.z = patch_v; + } } } } - - GEPatchPrimType prim_type = patch.primType; - // Combine the vertices into triangles. - for (int tile_v = 0; tile_v < tess_v; ++tile_v) { - for (int tile_u = 0; tile_u < tess_u; ++tile_u) { - int total = patch.index * (tess_u + 1) * (tess_v + 1); - int idx0 = total + tile_v * (tess_u + 1) + tile_u; - int idx1 = total + tile_v * (tess_u + 1) + tile_u + 1; - int idx2 = total + (tile_v + 1) * (tess_u + 1) + tile_u; - int idx3 = total + (tile_v + 1) * (tess_u + 1) + tile_u + 1; - - CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3); - count += 6; - } - } - dest += (tess_u + 1) * (tess_v + 1) * sizeof(SimpleVertex); + surface.BuildIndex(output.indices, output.count); } -// Prepare mesh of one patch for "Instanced Tessellation". -static void TessellateBezierPatchHardware(u8 *&dest, u16 *indices, int &count, int tess_u, int tess_v, GEPatchPrimType primType) { - SimpleVertex *&vertices = (SimpleVertex*&)dest; +} // namespace Spline - float inv_u = 1.0f / (float)tess_u; - float inv_v = 1.0f / (float)tess_v; +using namespace Spline; - // Generating simple input vertices for the bezier-computing vertex shader. - for (int tile_v = 0; tile_v < tess_v + 1; ++tile_v) { - for (int tile_u = 0; tile_u < tess_u + 1; ++tile_u) { - SimpleVertex &vert = vertices[tile_v * (tess_u + 1) + tile_u]; - - vert.pos.x = (float)tile_u * inv_u; - vert.pos.y = (float)tile_v * inv_v; - } - } - - // Combine the vertices into triangles. - for (int tile_v = 0; tile_v < tess_v; ++tile_v) { - for (int tile_u = 0; tile_u < tess_u; ++tile_u) { - int idx0 = tile_v * (tess_u + 1) + tile_u; - int idx1 = tile_v * (tess_u + 1) + tile_u + 1; - int idx2 = (tile_v + 1) * (tess_u + 1) + tile_u; - int idx3 = (tile_v + 1) * (tess_u + 1) + tile_u + 1; - - CopyQuadIndex(indices, primType, idx0, idx1, idx2, idx3); - count += 6; - } - } -} - -void TessellateBezierPatch(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType) { - switch (g_Config.iSplineBezierQuality) { - case LOW_QUALITY: - _BezierPatchLowQuality(dest, indices, count, tess_u, tess_v, patch, origVertType); - break; - case MEDIUM_QUALITY: - _BezierPatchHighQuality(dest, indices, count, std::max(tess_u / 2, 1), std::max(tess_v / 2, 1), patch, origVertType); - break; - case HIGH_QUALITY: - _BezierPatchHighQuality(dest, indices, count, tess_u, tess_v, patch, origVertType); - break; - } +void DrawEngineCommon::ClearSplineBezierWeights() { + Bezier3DWeight::weightsCache.Clear(); + Spline3DWeight::weightsCache.Clear(); } void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, int type_u, int type_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead) { PROFILE_THIS_SCOPE("spline"); DispatchFlush(); + // Real hardware seems to draw nothing when given < 4 either U or V. + if (count_u < 4 || count_v < 4) + return; + + SimpleBufferManager managedBuf(decoded, DECODED_VERTEX_BUFFER_SIZE / 2); + + int num_points = count_u * count_v; u16 index_lower_bound = 0; - u16 index_upper_bound = count_u * count_v - 1; - IndexConverter idxConv(vertType, indices); + u16 index_upper_bound = num_points - 1; + IndexConverter ConvertIndex(vertType, indices); if (indices) - GetIndexBounds(indices, count_u * count_v, vertType, &index_lower_bound, &index_upper_bound); + GetIndexBounds(indices, num_points, vertType, &index_lower_bound, &index_upper_bound); VertexDecoder *origVDecoder = GetVertexDecoder((vertType & 0xFFFFFF) | (gstate.getUVGenMode() << 24)); - *bytesRead = count_u * count_v * origVDecoder->VertexSize(); - - // Real hardware seems to draw nothing when given < 4 either U or V. - if (count_u < 4 || count_v < 4) { - return; - } + *bytesRead = num_points * origVDecoder->VertexSize(); // Simplify away bones and morph before proceeding - SimpleVertex *simplified_control_points = (SimpleVertex *)(decoded + 65536 * 12); - u8 *temp_buffer = decoded + 65536 * 18; + SimpleVertex *simplified_control_points = (SimpleVertex *)managedBuf.Allocate(sizeof(SimpleVertex) * (index_upper_bound + 1)); + u8 *temp_buffer = managedBuf.Allocate(sizeof(SimpleVertex) * num_points); u32 origVertType = vertType; vertType = NormalizeVertices((u8 *)simplified_control_points, temp_buffer, (u8 *)control_points, index_lower_bound, index_upper_bound, vertType); @@ -881,65 +522,35 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi ERROR_LOG(G3D, "Something went really wrong, vertex size: %i vs %i", vertexSize, (int)sizeof(SimpleVertex)); } - // TODO: Do something less idiotic to manage this buffer - auto points = new const SimpleVertex *[count_u * count_v]; - // Make an array of pointers to the control points, to get rid of indices. - for (int idx = 0; idx < count_u * count_v; idx++) { - points[idx] = simplified_control_points + (indices ? idxConv.convert(idx) : idx); - } + const SimpleVertex **points = (const SimpleVertex **)managedBuf.Allocate(sizeof(SimpleVertex *) * num_points); + for (int idx = 0; idx < num_points; idx++) + points[idx] = simplified_control_points + (indices ? ConvertIndex(idx) : idx); - int count = 0; + OutputBuffers output; + output.vertices = (SimpleVertex *)(decoded + DECODED_VERTEX_BUFFER_SIZE / 2); + output.indices = decIndex; + output.count = 0; - u8 *dest = splineBuffer; - - SplinePatchLocal patch; - patch.tess_u = tess_u; - patch.tess_v = tess_v; - patch.type_u = type_u; - patch.type_v = type_v; - patch.count_u = count_u; - patch.count_v = count_v; - patch.points = points; - patch.computeNormals = computeNormals; - patch.primType = prim_type; - patch.patchFacing = patchFacing; + SplineSurface surface; + surface.tess_u = tess_u; + surface.tess_v = tess_v; + surface.type_u = type_u; + surface.type_v = type_v; + surface.num_points_u = count_u; + surface.num_points_v = count_v; + surface.num_patches_u = count_u - 3; + surface.num_patches_v = count_v - 3; + surface.primType = prim_type; + surface.patchFacing = patchFacing; + surface.Init(DECODED_VERTEX_BUFFER_SIZE / 2 / vertexSize); if (CanUseHardwareTessellation(prim_type)) { - float *pos = (float*)(decoded + 65536 * 18); // Size 4 float - float *tex = pos + count_u * count_v * 4; // Size 4 float - float *col = tex + count_u * count_v * 4; // Size 4 float - const bool hasColor = (origVertType & GE_VTYPE_COL_MASK) != 0; - const bool hasTexCoords = (origVertType & GE_VTYPE_TC_MASK) != 0; - - int posStride, texStride, colStride; - tessDataTransfer->PrepareBuffers(pos, tex, col, posStride, texStride, colStride, count_u * count_v, hasColor, hasTexCoords); - float *p = pos; - float *t = tex; - float *c = col; - for (int idx = 0; idx < count_u * count_v; idx++) { - memcpy(p, points[idx]->pos.AsArray(), 3 * sizeof(float)); - p += posStride; - if (hasTexCoords) { - memcpy(t, points[idx]->uv, 2 * sizeof(float)); - t += texStride; - } - if (hasColor) { - memcpy(c, Vec4f::FromRGBA(points[idx]->color_32).AsArray(), 4 * sizeof(float)); - c += colStride; - } - } - if (!hasColor) - memcpy(col, Vec4f::FromRGBA(points[0]->color_32).AsArray(), 4 * sizeof(float)); - - tessDataTransfer->SendDataToShader(pos, tex, col, count_u * count_v, hasColor, hasTexCoords); - TessellateSplinePatchHardware(dest, quadIndices_, count, patch); - numPatches = (count_u - 3) * (count_v - 3); + HardwareTessellation(output, surface, origVertType, points, tessDataTransfer); } else { - int maxVertexCount = SPLINE_BUFFER_SIZE / vertexSize; - TessellateSplinePatch(dest, quadIndices_, count, patch, origVertType, maxVertexCount); + ControlPoints cpoints(points, num_points, managedBuf); + SoftwareTessellation(output, surface, origVertType, cpoints); } - delete[] points; u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT; @@ -956,7 +567,7 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi uint32_t vertTypeID = GetVertTypeID(vertTypeWithIndex16, gstate.getUVGenMode()); int generatedBytesRead; - DispatchSubmitPrim(splineBuffer, quadIndices_, PatchPrimToPrim(prim_type), count, vertTypeID, &generatedBytesRead); + DispatchSubmitPrim(output.vertices, output.indices, PatchPrimToPrim(prim_type), output.count, vertTypeID, &generatedBytesRead); DispatchFlush(); @@ -967,28 +578,29 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead) { PROFILE_THIS_SCOPE("bezier"); - DispatchFlush(); - u16 index_lower_bound = 0; - u16 index_upper_bound = count_u * count_v - 1; - IndexConverter idxConv(vertType, indices); - if (indices) - GetIndexBounds(indices, count_u*count_v, vertType, &index_lower_bound, &index_upper_bound); - - VertexDecoder *origVDecoder = GetVertexDecoder((vertType & 0xFFFFFF) | (gstate.getUVGenMode() << 24)); - *bytesRead = count_u * count_v * origVDecoder->VertexSize(); - // Real hardware seems to draw nothing when given < 4 either U or V. // This would result in num_patches_u / num_patches_v being 0. - if (count_u < 4 || count_v < 4) { + if (count_u < 4 || count_v < 4) return; - } + + SimpleBufferManager managedBuf(decoded, DECODED_VERTEX_BUFFER_SIZE / 2); + + int num_points = count_u * count_v; + u16 index_lower_bound = 0; + u16 index_upper_bound = num_points - 1; + IndexConverter ConvertIndex(vertType, indices); + if (indices) + GetIndexBounds(indices, num_points, vertType, &index_lower_bound, &index_upper_bound); + + VertexDecoder *origVDecoder = GetVertexDecoder((vertType & 0xFFFFFF) | (gstate.getUVGenMode() << 24)); + *bytesRead = num_points * origVDecoder->VertexSize(); // Simplify away bones and morph before proceeding // There are normally not a lot of control points so just splitting decoded should be reasonably safe, although not great. - SimpleVertex *simplified_control_points = (SimpleVertex *)(decoded + 65536 * 12); - u8 *temp_buffer = decoded + 65536 * 18; + SimpleVertex *simplified_control_points = (SimpleVertex *)managedBuf.Allocate(sizeof(SimpleVertex) * (index_upper_bound + 1)); + u8 *temp_buffer = managedBuf.Allocate(sizeof(SimpleVertex) * num_points); u32 origVertType = vertType; vertType = NormalizeVertices((u8 *)simplified_control_points, temp_buffer, (u8 *)control_points, index_lower_bound, index_upper_bound, vertType); @@ -1000,89 +612,32 @@ void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indi ERROR_LOG(G3D, "Something went really wrong, vertex size: %i vs %i", vertexSize, (int)sizeof(SimpleVertex)); } - float *pos = (float*)(decoded + 65536 * 18); // Size 4 float - float *tex = pos + count_u * count_v * 4; // Size 4 float - float *col = tex + count_u * count_v * 4; // Size 4 float - const bool hasColor = (origVertType & GE_VTYPE_COL_MASK) != 0; - const bool hasTexCoords = (origVertType & GE_VTYPE_TC_MASK) != 0; + // Make an array of pointers to the control points, to get rid of indices. + const SimpleVertex **points = (const SimpleVertex **)managedBuf.Allocate(sizeof(SimpleVertex *) * num_points); + for (int idx = 0; idx < num_points; idx++) + points[idx] = simplified_control_points + (indices ? ConvertIndex(idx) : idx); + + OutputBuffers output; + output.vertices = (SimpleVertex *)(decoded + DECODED_VERTEX_BUFFER_SIZE / 2); + output.indices = decIndex; + output.count = 0; + + BezierSurface surface; + surface.tess_u = tess_u; + surface.tess_v = tess_v; + surface.num_points_u = count_u; + surface.num_points_v = count_v; + surface.num_patches_u = (count_u - 1) / 3; + surface.num_patches_v = (count_v - 1) / 3; + surface.primType = prim_type; + surface.patchFacing = patchFacing; + surface.Init(DECODED_VERTEX_BUFFER_SIZE / 2 / vertexSize); - // Bezier patches share less control points than spline patches. Otherwise they are pretty much the same (except bezier don't support the open/close thing) - int num_patches_u = (count_u - 1) / 3; - int num_patches_v = (count_v - 1) / 3; - BezierPatch *patches = nullptr; if (CanUseHardwareTessellation(prim_type)) { - int posStride, texStride, colStride; - tessDataTransfer->PrepareBuffers(pos, tex, col, posStride, texStride, colStride, count_u * count_v, hasColor, hasTexCoords); - float *p = pos; - float *t = tex; - float *c = col; - for (int idx = 0; idx < count_u * count_v; idx++) { - const SimpleVertex *point = simplified_control_points + (indices ? idxConv.convert(idx) : idx); - memcpy(p, point->pos.AsArray(), 3 * sizeof(float)); - p += posStride; - if (hasTexCoords) { - memcpy(t, point->uv, 2 * sizeof(float)); - t += texStride; - } - if (hasColor) { - memcpy(c, Vec4f::FromRGBA(point->color_32).AsArray(), 4 * sizeof(float)); - c += colStride; - } - } - if (!hasColor) { - const SimpleVertex *point = simplified_control_points + (indices ? idxConv.convert(0) : 0); - memcpy(col, Vec4f::FromRGBA(point->color_32).AsArray(), 4 * sizeof(float)); - } + HardwareTessellation(output, surface, origVertType, points, tessDataTransfer); } else { - patches = new BezierPatch[num_patches_u * num_patches_v]; - for (int patch_u = 0; patch_u < num_patches_u; patch_u++) { - for (int patch_v = 0; patch_v < num_patches_v; patch_v++) { - BezierPatch& patch = patches[patch_u + patch_v * num_patches_u]; - for (int point = 0; point < 16; ++point) { - int idx = (patch_u * 3 + point % 4) + (patch_v * 3 + point / 4) * count_u; - patch.points[point] = simplified_control_points + (indices ? idxConv.convert(idx) : idx); - } - patch.u_index = patch_u * 3; - patch.v_index = patch_v * 3; - patch.index = patch_v * num_patches_u + patch_u; - patch.primType = prim_type; - patch.computeNormals = computeNormals; - patch.patchFacing = patchFacing; - } - } - } - - int count = 0; - u8 *dest = splineBuffer; - - // We shouldn't really split up into separate 4x4 patches, instead we should do something that works - // like the splines, so we subdivide across the whole "mega-patch". - - // If specified as 0, uses 1. - if (tess_u < 1) { - tess_u = 1; - } - if (tess_v < 1) { - tess_v = 1; - } - - u16 *inds = quadIndices_; - if (CanUseHardwareTessellation(prim_type)) { - tessDataTransfer->SendDataToShader(pos, tex, col, count_u * count_v, hasColor, hasTexCoords); - TessellateBezierPatchHardware(dest, inds, count, tess_u, tess_v, prim_type); - numPatches = num_patches_u * num_patches_v; - } else { - int maxVertices = SPLINE_BUFFER_SIZE / vertexSize; - // Downsample until it fits, in case crazy tessellation factors are sent. - while ((tess_u + 1) * (tess_v + 1) * num_patches_u * num_patches_v > maxVertices) { - tess_u /= 2; - tess_v /= 2; - } - for (int patch_idx = 0; patch_idx < num_patches_u*num_patches_v; ++patch_idx) { - const BezierPatch &patch = patches[patch_idx]; - TessellateBezierPatch(dest, inds, count, tess_u, tess_v, patch, origVertType); - } - delete[] patches; + ControlPoints cpoints(points, num_points, managedBuf); + SoftwareTessellation(output, surface, origVertType, cpoints); } u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT; @@ -1099,7 +654,7 @@ void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indi uint32_t vertTypeID = GetVertTypeID(vertTypeWithIndex16, gstate.getUVGenMode()); int generatedBytesRead; - DispatchSubmitPrim(splineBuffer, quadIndices_, PatchPrimToPrim(prim_type), count, vertTypeID, &generatedBytesRead); + DispatchSubmitPrim(output.vertices, output.indices, PatchPrimToPrim(prim_type), output.count, vertTypeID, &generatedBytesRead); DispatchFlush(); diff --git a/GPU/Common/SplineCommon.h b/GPU/Common/SplineCommon.h index 52bf75b69b..a6d82def4b 100644 --- a/GPU/Common/SplineCommon.h +++ b/GPU/Common/SplineCommon.h @@ -16,11 +16,15 @@ // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #pragma once +#include #include "Common/CommonTypes.h" #include "Common/Swap.h" #include "GPU/Math3D.h" #include "GPU/ge_constants.h" +#include "Core/Config.h" + +#define HALF_CEIL(x) (x + 1) / 2 // Integer ceil = (int)ceil((float)x / 2.0f) // PSP compatible format so we can use the end of the pipeline in beziers etc struct SimpleVertex { @@ -33,32 +37,11 @@ struct SimpleVertex { Vec3Packedf pos; }; -// We decode all vertices into a common format for easy interpolation and stuff. -// Not fast but can be optimized later. -struct BezierPatch { - const SimpleVertex *points[16]; +class SimpleBufferManager; - // These are used to generate UVs. - int u_index, v_index; +namespace Spline { - int index; - GEPatchPrimType primType; - bool computeNormals; - bool patchFacing; -}; - -struct SplinePatchLocal { - const SimpleVertex **points; - int tess_u; - int tess_v; - int count_u; - int count_v; - int type_u; - int type_v; - bool computeNormals; - bool patchFacing; - GEPatchPrimType primType; -}; +void BuildIndex(u16 *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total = 0); enum SplineQuality { LOW_QUALITY = 0, @@ -66,6 +49,207 @@ enum SplineQuality { HIGH_QUALITY = 2, }; +class Bezier3DWeight; +class Spline3DWeight; + +// We decode all vertices into a common format for easy interpolation and stuff. +// Not fast but can be optimized later. + +struct SurfaceInfo { + int tess_u, tess_v; + int num_points_u, num_points_v; + int num_patches_u, num_patches_v; + int type_u, type_v; + GEPatchPrimType primType; + bool patchFacing; + + void Init() { + // If specified as 0, uses 1. + if (tess_u < 1) tess_u = 1; + if (tess_v < 1) tess_v = 1; + + switch (g_Config.iSplineBezierQuality) { + case LOW_QUALITY: + tess_u = 2; + tess_v = 2; + break; + case MEDIUM_QUALITY: + // Don't cut below 2, though. + if (tess_u > 2) tess_u = HALF_CEIL(tess_u); + if (tess_v > 2) tess_v = HALF_CEIL(tess_v); + break; + } + } +}; + +struct BezierSurface : public SurfaceInfo { + using WeightType = Bezier3DWeight; + + int num_verts_per_patch; + + void Init(int maxVertices) { + SurfaceInfo::Init(); + // Downsample until it fits, in case crazy tessellation factors are sent. + while ((tess_u + 1) * (tess_v + 1) * num_patches_u * num_patches_v > maxVertices) { + tess_u--; + tess_v--; + } + num_verts_per_patch = (tess_u + 1) * (tess_v + 1); + } + + int GetTessStart(int patch) const { return 0; } + + int GetPointIndex(int patch_u, int patch_v) const { return patch_v * 3 * num_points_u + patch_u * 3; } + + int GetIndexU(int patch_u, int tile_u) const { return tile_u; } + int GetIndexV(int patch_v, int tile_v) const { return tile_v; } + + int GetIndex(int index_u, int index_v, int patch_u, int patch_v) const { + int patch_index = patch_v * num_patches_u + patch_u; + return index_v * (tess_u + 1) + index_u + num_verts_per_patch * patch_index; + } + + void BuildIndex(u16 *indices, int &count) const { + for (int patch_u = 0; patch_u < num_patches_u; ++patch_u) { + for (int patch_v = 0; patch_v < num_patches_v; ++patch_v) { + int patch_index = patch_v * num_patches_u + patch_u; + int total = patch_index * num_verts_per_patch; + Spline::BuildIndex(indices + count, count, tess_u, tess_v, primType, total); + } + } + } +}; + +struct SplineSurface : public SurfaceInfo { + using WeightType = Spline3DWeight; + + int num_vertices_u; + + void Init(int maxVertices) { + SurfaceInfo::Init(); + // Downsample until it fits, in case crazy tessellation factors are sent. + while ((num_patches_u * tess_u + 1) * (num_patches_v * tess_v + 1) > maxVertices) { + tess_u--; + tess_v--; + } + num_vertices_u = num_patches_u * tess_u + 1; + } + + int GetTessStart(int patch) const { return (patch == 0) ? 0 : 1; } + + int GetPointIndex(int patch_u, int patch_v) const { return patch_v * num_points_u + patch_u; } + + int GetIndexU(int patch_u, int tile_u) const { return patch_u * tess_u + tile_u; } + int GetIndexV(int patch_v, int tile_v) const { return patch_v * tess_v + tile_v; } + + int GetIndex(int index_u, int index_v, int patch_u, int patch_v) const { + return index_v * num_vertices_u + index_u; + } + + void BuildIndex(u16 *indices, int &count) const { + Spline::BuildIndex(indices, count, num_patches_u * tess_u, num_patches_v * tess_v, primType); + } +}; + +struct Weight { + float basis[4], deriv[4]; +}; + +template +class WeightCache : public T { +private: + std::unordered_map weightsCache; +public: + Weight* operator [] (u32 key) { + Weight *&weights = weightsCache[key]; + if (!weights) + weights = T::CalcWeightsAll(key); + return weights; + } + + void Clear() { + for (auto it : weightsCache) + delete[] it.second; + weightsCache.clear(); + } +}; + +struct Weight2D { + const Weight *u, *v; + int size_u, size_v; + + template + Weight2D(WeightCache &cache, u32 key_u, u32 key_v) { + u = cache[key_u]; + v = (key_u != key_v) ? cache[key_v] : u; // Use same weights if u == v + } +}; + +struct ControlPoints { + Vec3f *pos; + Vec2f *tex; + Vec4f *col; + u32_le defcolor; + + ControlPoints() {} + ControlPoints(const SimpleVertex *const *points, int size, SimpleBufferManager &managedBuf); + void Convert(const SimpleVertex *const *points, int size); +}; + +struct OutputBuffers { + SimpleVertex *vertices; + u16 *indices; + int count; +}; + +template +void SoftwareTessellation(OutputBuffers &output, const Surface &surface, u32 origVertType, const ControlPoints &points); + +} // namespace Spline + bool CanUseHardwareTessellation(GEPatchPrimType prim); -void TessellateSplinePatch(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int maxVertices); -void TessellateBezierPatch(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType); + +// Define function object for TemplateParameterDispatcher +#define TEMPLATE_PARAMETER_DISPATCHER_FUNCTION(NAME, FUNCNAME, FUNCTYPE) \ +struct NAME { \ + template \ + static FUNCTYPE GetFunc() { \ + return &FUNCNAME; \ + } \ +}; + +template +class TemplateParameterDispatcher { + + /* Store all combinations of template functions into an array */ + template + struct Initializer { + static void Init(Func funcs[]) { + Initializer::Init(funcs); // true + Initializer::Init(funcs); // false + } + }; + /* Specialized for terminates the recursive loop */ + template + struct Initializer<0, Index, Params...> { + static void Init(Func funcs[]) { + funcs[Index] = Dispatcher::template GetFunc(); // Resolve the nested dependent name as template function. + } + }; + +private: + Func funcs[1 << NumParams]; /* Function pointers array */ +public: + TemplateParameterDispatcher() { + Initializer::Init(funcs); + } + + Func GetFunc(const bool params[]) const { + /* Convert bool parameters to index of the array */ + int index = 0; + for (int i = 0; i < NumParams; ++i) + index |= params[i] << i; + + return funcs[index]; + } +}; diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 97ea633c1e..209186fd96 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -102,7 +102,7 @@ public: : indices(indices), indexType(vertType & GE_VTYPE_IDX_MASK) { } - inline u32 convert(u32 index) const { + u32 operator() (u32 index) const { switch (indexType) { case GE_VTYPE_IDX_8BIT: return indices8[index]; diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp index eac5cf078f..e7752017dc 100644 --- a/GPU/D3D11/DrawEngineD3D11.cpp +++ b/GPU/D3D11/DrawEngineD3D11.cpp @@ -89,7 +89,6 @@ DrawEngineD3D11::DrawEngineD3D11(Draw::DrawContext *draw, ID3D11Device *device, // All this is a LOT of memory, need to see if we can cut down somehow. decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); - splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex); @@ -104,14 +103,14 @@ DrawEngineD3D11::~DrawEngineD3D11() { DestroyDeviceObjects(); FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE); FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE); - FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE); } void DrawEngineD3D11::InitDeviceObjects() { pushVerts_ = new PushBufferD3D11(device_, VERTEX_PUSH_SIZE, D3D11_BIND_VERTEX_BUFFER); pushInds_ = new PushBufferD3D11(device_, INDEX_PUSH_SIZE, D3D11_BIND_INDEX_BUFFER); - tessDataTransfer = new TessellationDataTransferD3D11(context_, device_); + tessDataTransferD3D11 = new TessellationDataTransferD3D11(context_, device_); + tessDataTransfer = tessDataTransferD3D11; } void DrawEngineD3D11::ClearTrackedVertexArrays() { @@ -137,7 +136,7 @@ void DrawEngineD3D11::Resized() { void DrawEngineD3D11::DestroyDeviceObjects() { ClearTrackedVertexArrays(); ClearInputLayoutMap(); - delete tessDataTransfer; + delete tessDataTransferD3D11; delete pushVerts_; delete pushInds_; depthStencilCache_.Iterate([&](const uint64_t &key, ID3D11DepthStencilState *ds) { @@ -539,10 +538,7 @@ rotateVBO: memcpy(iptr, decIndex, iSize); pushInds_->EndPush(context_); context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset); - if (tess) - context_->DrawIndexedInstanced(vertexCount, numPatches, 0, 0, 0); - else - context_->DrawIndexed(vertexCount, 0, 0); + context_->DrawIndexed(vertexCount, 0, 0); } else { context_->Draw(vertexCount, 0); } @@ -551,10 +547,7 @@ rotateVBO: context_->IASetVertexBuffers(0, 1, &vb_, &stride, &offset); if (useElements) { context_->IASetIndexBuffer(ib_, DXGI_FORMAT_R16_UINT, 0); - if (tess) - context_->DrawIndexedInstanced(vertexCount, numPatches, 0, 0, 0); - else - context_->DrawIndexed(vertexCount, 0, 0); + context_->DrawIndexed(vertexCount, 0, 0); } else { context_->Draw(vertexCount, 0); } @@ -692,38 +685,85 @@ rotateVBO: GPUDebug::NotifyDraw(); } -void DrawEngineD3D11::TessellationDataTransferD3D11::PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) { +TessellationDataTransferD3D11::TessellationDataTransferD3D11(ID3D11DeviceContext *context, ID3D11Device *device) + : context_(context), device_(device) { + desc.Usage = D3D11_USAGE_DYNAMIC; + desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; +} + +TessellationDataTransferD3D11::~TessellationDataTransferD3D11() { + for (int i = 0; i < 3; ++i) { + if (buf[i]) buf[i]->Release(); + if (view[i]) view[i]->Release(); + } +} + +void TessellationDataTransferD3D11::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) { struct TessData { float pos[3]; float pad1; float uv[2]; float pad2[2]; float color[4]; }; + int size = size_u * size_v; + if (prevSize < size) { prevSize = size; - if (buf) { - buf->Release(); - view->Release(); - } + if (buf[0]) buf[0]->Release(); + if (view[0]) view[0]->Release(); + desc.ByteWidth = size * sizeof(TessData); desc.StructureByteStride = sizeof(TessData); - - device_->CreateBuffer(&desc, nullptr, &buf); - device_->CreateShaderResourceView(buf, 0, &view); - context_->VSSetShaderResources(0, 1, &view); + device_->CreateBuffer(&desc, nullptr, &buf[0]); + device_->CreateShaderResourceView(buf[0], nullptr, &view[0]); + context_->VSSetShaderResources(0, 1, &view[0]); } D3D11_MAPPED_SUBRESOURCE map; - context_->Map(buf, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); + context_->Map(buf[0], 0, D3D11_MAP_WRITE_DISCARD, 0, &map); uint8_t *data = (uint8_t *)map.pData; - pos = (float *)(data); - tex = (float *)(data + offsetof(TessData, uv)); - col = (float *)(data + offsetof(TessData, color)); - posStride = sizeof(TessData) / sizeof(float); - colStride = hasColor ? (sizeof(TessData) / sizeof(float)) : 0; - texStride = sizeof(TessData) / sizeof(float); -} + float *pos = (float *)(data); + float *tex = (float *)(data + offsetof(TessData, uv)); + float *col = (float *)(data + offsetof(TessData, color)); + int stride = sizeof(TessData) / sizeof(float); -void DrawEngineD3D11::TessellationDataTransferD3D11::SendDataToShader(const float * pos, const float * tex, const float * col, int size, bool hasColor, bool hasTexCoords) { - context_->Unmap(buf, 0); + CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType); + + context_->Unmap(buf[0], 0); + + using Spline::Weight; + + // Weights U + if (prevSizeWU < weights.size_u) { + prevSizeWU = weights.size_u; + if (buf[1]) buf[1]->Release(); + if (view[1]) view[1]->Release(); + + desc.ByteWidth = weights.size_u * sizeof(Weight); + desc.StructureByteStride = sizeof(Weight); + device_->CreateBuffer(&desc, nullptr, &buf[1]); + device_->CreateShaderResourceView(buf[1], nullptr, &view[1]); + context_->VSSetShaderResources(1, 1, &view[1]); + } + context_->Map(buf[1], 0, D3D11_MAP_WRITE_DISCARD, 0, &map); + memcpy(map.pData, weights.u, weights.size_u * sizeof(Weight)); + context_->Unmap(buf[1], 0); + + // Weights V + if (prevSizeWV < weights.size_v) { + prevSizeWV = weights.size_v; + if (buf[2]) buf[2]->Release(); + if (view[2]) view[2]->Release(); + + desc.ByteWidth = weights.size_v * sizeof(Weight); + desc.StructureByteStride = sizeof(Weight); + device_->CreateBuffer(&desc, nullptr, &buf[2]); + device_->CreateShaderResourceView(buf[2], nullptr, &view[2]); + context_->VSSetShaderResources(2, 1, &view[2]); + } + context_->Map(buf[2], 0, D3D11_MAP_WRITE_DISCARD, 0, &map); + memcpy(map.pData, weights.v, weights.size_v * sizeof(Weight)); + context_->Unmap(buf[2], 0); } diff --git a/GPU/D3D11/DrawEngineD3D11.h b/GPU/D3D11/DrawEngineD3D11.h index 8b797a3d2d..11ed3e41e9 100644 --- a/GPU/D3D11/DrawEngineD3D11.h +++ b/GPU/D3D11/DrawEngineD3D11.h @@ -99,6 +99,22 @@ public: u8 flags; }; +class TessellationDataTransferD3D11 : public TessellationDataTransfer { +private: + ID3D11DeviceContext *context_; + ID3D11Device *device_; + ID3D11Buffer *buf[3]{}; + ID3D11ShaderResourceView *view[3]{}; + D3D11_BUFFER_DESC desc{}; + int prevSize = 0; + int prevSizeWU = 0, prevSizeWV = 0; +public: + TessellationDataTransferD3D11(ID3D11DeviceContext *context, ID3D11Device *device); + ~TessellationDataTransferD3D11(); + // Send spline/bezier's control points and weights to vertex shader through structured shader buffer. + void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override; +}; + // Handles transform, lighting and drawing. class DrawEngineD3D11 : public DrawEngineCommon { public: @@ -199,29 +215,5 @@ private: D3D11DynamicState dynState_{}; // Hardware tessellation - class TessellationDataTransferD3D11 : public TessellationDataTransfer { - private: - ID3D11DeviceContext *context_; - ID3D11Device *device_; - ID3D11Buffer *buf; - ID3D11ShaderResourceView *view; - D3D11_BUFFER_DESC desc; - public: - TessellationDataTransferD3D11(ID3D11DeviceContext *context, ID3D11Device *device) - : TessellationDataTransfer(), context_(context), device_(device), buf(), view(), desc() { - desc.Usage = D3D11_USAGE_DYNAMIC; - desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; - } - ~TessellationDataTransferD3D11() { - if (buf) { - buf->Release(); - view->Release(); - } - } - - void PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) override; - void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override; - }; + TessellationDataTransferD3D11 *tessDataTransferD3D11; }; diff --git a/GPU/Directx9/DrawEngineDX9.cpp b/GPU/Directx9/DrawEngineDX9.cpp index e4f24703ee..0f1c0b62d3 100644 --- a/GPU/Directx9/DrawEngineDX9.cpp +++ b/GPU/Directx9/DrawEngineDX9.cpp @@ -95,13 +95,13 @@ DrawEngineDX9::DrawEngineDX9(Draw::DrawContext *draw) : vai_(256), vertexDeclMap // All this is a LOT of memory, need to see if we can cut down somehow. decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); - splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex); InitDeviceObjects(); - tessDataTransfer = new TessellationDataTransferDX9(); + tessDataTransferDX9 = new TessellationDataTransferDX9(); + tessDataTransfer = tessDataTransferDX9; device_->CreateVertexDeclaration(TransformedVertexElements, &transformedVertexDecl_); } @@ -114,14 +114,13 @@ DrawEngineDX9::~DrawEngineDX9() { DestroyDeviceObjects(); FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE); FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE); - FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE); vertexDeclMap_.Iterate([&](const uint32_t &key, IDirect3DVertexDeclaration9 *decl) { if (decl) { decl->Release(); } }); vertexDeclMap_.Clear(); - delete tessDataTransfer; + delete tessDataTransferDX9; } void DrawEngineDX9::InitDeviceObjects() { @@ -624,8 +623,8 @@ rotateVBO: GPUDebug::NotifyDraw(); } -void DrawEngineDX9::TessellationDataTransferDX9::SendDataToShader(const float * pos, const float * tex, const float * col, int size, bool hasColor, bool hasTexCoords) -{ +void TessellationDataTransferDX9::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) { + // TODO } } // namespace diff --git a/GPU/Directx9/DrawEngineDX9.h b/GPU/Directx9/DrawEngineDX9.h index ef015b02b5..f1c80eea51 100644 --- a/GPU/Directx9/DrawEngineDX9.h +++ b/GPU/Directx9/DrawEngineDX9.h @@ -97,6 +97,13 @@ public: u8 flags; }; +class TessellationDataTransferDX9 : public TessellationDataTransfer { +public: + TessellationDataTransferDX9() {} + ~TessellationDataTransferDX9() {} + void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override; +}; + // Handles transform, lighting and drawing. class DrawEngineDX9 : public DrawEngineCommon { public: @@ -158,16 +165,7 @@ private: FramebufferManagerDX9 *framebufferManager_ = nullptr; // Hardware tessellation - class TessellationDataTransferDX9 : public TessellationDataTransfer { - private: - int data_tex[3]; - public: - TessellationDataTransferDX9() : TessellationDataTransfer(), data_tex() { - } - ~TessellationDataTransferDX9() { - } - void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override; - }; + TessellationDataTransferDX9 *tessDataTransferDX9; }; } // namespace diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp index 51fe036f59..53ef880c17 100644 --- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp +++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp @@ -86,6 +86,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage bool doSpline = id.Bit(VS_BIT_SPLINE); bool hasColorTess = id.Bit(VS_BIT_HAS_COLOR_TESS); bool hasTexcoordTess = id.Bit(VS_BIT_HAS_TEXCOORD_TESS); + bool hasNormalTess = id.Bit(VS_BIT_HAS_NORMAL_TESS); bool flipNormalTess = id.Bit(VS_BIT_NORM_REVERSE_TESS); DoLightComputation doLight[4] = { LIGHT_OFF, LIGHT_OFF, LIGHT_OFF, LIGHT_OFF }; @@ -271,82 +272,90 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage WRITE(p, " float3 pos; float pad1;\n"); WRITE(p, " float2 tex; float2 pad2;\n"); WRITE(p, " float4 col;\n"); - WRITE(p, "};"); + WRITE(p, "};\n"); WRITE(p, "StructuredBuffer tess_data : register(t0);\n"); + + WRITE(p, "struct TessWeight {\n"); + WRITE(p, " float4 basis;\n"); + WRITE(p, " float4 deriv;\n"); + WRITE(p, "};\n"); + WRITE(p, "StructuredBuffer tess_weights_u : register(t1);\n"); + WRITE(p, "StructuredBuffer tess_weights_v : register(t2);\n"); } const char *init[3] = { "0.0, 0.0", "0.0, 0.0, 0.0", "0.0, 0.0, 0.0, 0.0" }; for (int i = 2; i <= 4; i++) { // Define 3 types float2, float3, float4 - WRITE(p, "float%d tess_sample(in float%d points[16], in float2 weights[4]) {\n", i, i); + WRITE(p, "float%d tess_sample(in float%d points[16], float4x4 weights) {\n", i, i); WRITE(p, " float%d pos = float%d(%s);\n", i, i, init[i - 2]); - WRITE(p, " for (int i = 0; i < 4; ++i) {\n"); - WRITE(p, " for (int j = 0; j < 4; ++j) {\n"); - WRITE(p, " float f = weights[j].x * weights[i].y;\n"); - WRITE(p, " if (f != 0.0)\n"); - WRITE(p, " pos = pos + f * points[i * 4 + j];\n"); - WRITE(p, " }\n"); - WRITE(p, " }\n"); + for (int v = 0; v < 4; ++v) { + for (int u = 0; u < 4; ++u) { + WRITE(p, " pos += weights[%i][%i] * points[%i];\n", v, u, v * 4 + u); + } + } WRITE(p, " return pos;\n"); WRITE(p, "}\n"); } - if (doSpline) { - WRITE(p, "void spline_knot(int2 num_patches, int2 type, out float2 knot[6], int2 patch_pos) {\n"); - WRITE(p, " for (int i = 0; i < 6; ++i) {\n"); - WRITE(p, " knot[i] = float2(i + patch_pos.x - 2, i + patch_pos.y - 2);\n"); - WRITE(p, " }\n"); - // WRITE(p, " if ((type.x & 1) != 0) {\n"); - WRITE(p, " if ((type.x == 1) || (type.x == 3)) {\n"); - WRITE(p, " if (patch_pos.x <= 2)\n"); - WRITE(p, " knot[0].x = 0.0;\n"); - WRITE(p, " if (patch_pos.x <= 1)\n"); - WRITE(p, " knot[1].x = 0.0;\n"); - WRITE(p, " }\n"); - // WRITE(p, " if ((type.x & 2) != 0) {\n"); - WRITE(p, " if ((type.x == 2) || (type.x == 3)) {\n"); - WRITE(p, " if (patch_pos.x >= (num_patches.x - 2))\n"); - WRITE(p, " knot[5].x = num_patches.x;\n"); - WRITE(p, " if (patch_pos.x == (num_patches.x - 1))\n"); - WRITE(p, " knot[4].x = num_patches.x;\n"); - WRITE(p, " }\n"); - // WRITE(p, " if ((type.y & 1) != 0) {\n"); - WRITE(p, " if ((type.y == 1) || (type.y == 3)) {\n"); - WRITE(p, " if (patch_pos.y <= 2)\n"); - WRITE(p, " knot[0].y = 0.0;\n"); - WRITE(p, " if (patch_pos.y <= 1)\n"); - WRITE(p, " knot[1].y = 0.0;\n"); - WRITE(p, " }\n"); - // WRITE(p, " if ((type.y & 2) != 0) {\n"); - WRITE(p, " if ((type.y == 2) || (type.y == 3)) {\n"); - WRITE(p, " if (patch_pos.y >= (num_patches.y - 2))\n"); - WRITE(p, " knot[5].y = num_patches.y;\n"); - WRITE(p, " if (patch_pos.y == (num_patches.y - 1))\n"); - WRITE(p, " knot[4].y = num_patches.y;\n"); - WRITE(p, " }\n"); - WRITE(p, "}\n"); - WRITE(p, "void spline_weight(float2 t, in float2 knot[6], out float2 weights[4]) {\n"); - // TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly... - WRITE(p, " float2 t0 = (t - knot[0]);\n"); - WRITE(p, " float2 t1 = (t - knot[1]);\n"); - WRITE(p, " float2 t2 = (t - knot[2]);\n"); - // TODO: All our knots are integers so we should be able to get rid of these divisions (How?) - WRITE(p, " float2 f30 = t0 / (knot[3] - knot[0]);\n"); - WRITE(p, " float2 f41 = t1 / (knot[4] - knot[1]);\n"); - WRITE(p, " float2 f52 = t2 / (knot[5] - knot[2]);\n"); - WRITE(p, " float2 f31 = t1 / (knot[3] - knot[1]);\n"); - WRITE(p, " float2 f42 = t2 / (knot[4] - knot[2]);\n"); - WRITE(p, " float2 f32 = t2 / (knot[3] - knot[2]);\n"); - WRITE(p, " float2 a = (1.0 - f30)*(1.0 - f31);\n"); - WRITE(p, " float2 b = (f31*f41);\n"); - WRITE(p, " float2 c = (1.0 - f41)*(1.0 - f42);\n"); - WRITE(p, " float2 d = (f42*f52);\n"); - WRITE(p, " weights[0] = a - (a*f32);\n"); - WRITE(p, " weights[1] = 1.0 - a - b + ((a + b + c - 1.0)*f32);\n"); - WRITE(p, " weights[2] = b + ((1.0 - b - c - d)*f32);\n"); - WRITE(p, " weights[3] = d*f32;\n"); - WRITE(p, "}\n"); + WRITE(p, "float4x4 outerProduct(float4 u, float4 v) {\n"); + WRITE(p, " return mul((float4x1)v, (float1x4)u);\n"); + WRITE(p, "}\n"); + + WRITE(p, "struct Tess {\n"); + WRITE(p, " float3 pos;\n"); + if (doTexture) + WRITE(p, " float2 tex;\n"); + WRITE(p, " float4 col;\n"); + if (hasNormalTess) + WRITE(p, " float3 nrm;\n"); + WRITE(p, "};\n"); + + WRITE(p, "void tessellate(in VS_IN In, out Tess tess) {\n"); + WRITE(p, " int2 point_pos = int2(In.position.z, In.normal.z)%s;\n", doBezier ? " * 3" : ""); + WRITE(p, " int2 weight_idx = int2(In.position.xy);\n"); + // Load 4x4 control points + WRITE(p, " float3 _pos[16];\n"); + WRITE(p, " float2 _tex[16];\n"); + WRITE(p, " float4 _col[16];\n"); + WRITE(p, " int index;\n"); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + WRITE(p, " index = (%i + point_pos.y) * u_spline_counts + (%i + point_pos.x);\n", i, j); + WRITE(p, " _pos[%i] = tess_data[index].pos;\n", i * 4 + j); + if (doTexture && hasTexcoordTess) + WRITE(p, " _tex[%i] = tess_data[index].tex;\n", i * 4 + j); + if (hasColorTess) + WRITE(p, " _col[%i] = tess_data[index].col;\n", i * 4 + j); + } } + + // Basis polynomials as weight coefficients + WRITE(p, " float4 basis_u = tess_weights_u[weight_idx.x].basis;\n"); + WRITE(p, " float4 basis_v = tess_weights_v[weight_idx.y].basis;\n"); + WRITE(p, " float4x4 basis = outerProduct(basis_u, basis_v);\n"); + + // Tessellate + WRITE(p, " tess.pos = tess_sample(_pos, basis);\n"); + if (doTexture) { + if (hasTexcoordTess) + WRITE(p, " tess.tex = tess_sample(_tex, basis);\n"); + else + WRITE(p, " tess.tex = In.normal.xy;\n"); + } + if (hasColorTess) + WRITE(p, " tess.col = tess_sample(_col, basis);\n"); + else + WRITE(p, " tess.col = u_matambientalpha;\n"); + if (hasNormalTess) { + // Derivatives as weight coefficients + WRITE(p, " float4 deriv_u = tess_weights_u[weight_idx.x].deriv;\n"); + WRITE(p, " float4 deriv_v = tess_weights_v[weight_idx.y].deriv;\n"); + + WRITE(p, " float3 du = tess_sample(_pos, outerProduct(deriv_u, basis_v));\n"); + WRITE(p, " float3 dv = tess_sample(_pos, outerProduct(basis_u, deriv_v));\n"); + WRITE(p, " tess.nrm = normalize(cross(du, dv));\n"); + } + WRITE(p, "}\n"); } WRITE(p, "VS_OUT main(VS_IN In) {\n"); @@ -396,106 +405,14 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage } else { // Step 1: World Transform / Skinning if (!enableBones) { - // Hardware tessellation if (doSpline || doBezier) { - WRITE(p, " uint u_spline_count_u = u_spline_counts & 0xFF;\n"); - WRITE(p, " uint u_spline_count_v = (u_spline_counts >> 8) & 0xFF;\n"); - WRITE(p, " uint num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3u" : "u_spline_count_u - 3"); - WRITE(p, " float2 tess_pos = In.position.xy;\n"); - WRITE(p, " int u = In.instanceId %% num_patches_u;\n"); - WRITE(p, " int v = In.instanceId / num_patches_u;\n"); - WRITE(p, " int2 patch_pos = int2(u, v);\n"); - WRITE(p, " float3 _pos[16];\n"); - WRITE(p, " float2 _tex[16];\n"); - WRITE(p, " float4 _col[16];\n"); - WRITE(p, " int index;\n"); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - WRITE(p, " index = (%i + v%s) * u_spline_count_u + (%i + u%s);\n", i, doBezier ? " * 3" : "", j, doBezier ? " * 3" : ""); - WRITE(p, " _pos[%i] = tess_data[index].pos;\n", i * 4 + j); - if (doTexture && hasTexcoord && hasTexcoordTess) - WRITE(p, " _tex[%i] = tess_data[index].tex;\n", i * 4 + j); - if (hasColor && hasColorTess) - WRITE(p, " _col[%i] = tess_data[index].col;\n", i * 4 + j); - } - } - WRITE(p, " float2 weights[4];\n"); - if (doBezier) { - // Bernstein 3D - WRITE(p, " weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n"); - WRITE(p, " weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n"); - WRITE(p, " weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n"); - WRITE(p, " weights[3] = tess_pos * tess_pos * tess_pos;\n"); - } else if (doSpline) { - WRITE(p, " int2 spline_num_patches = int2(u_spline_count_u - 3, u_spline_count_v - 3);\n"); - WRITE(p, " int u_spline_type_u = (u_spline_counts >> 16) & 0xFF;\n"); - WRITE(p, " int u_spline_type_v = (u_spline_counts >> 24) & 0xFF;\n"); - WRITE(p, " int2 spline_type = int2(u_spline_type_u, u_spline_type_v);\n"); - WRITE(p, " float2 knots[6];\n"); - WRITE(p, " spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n"); - WRITE(p, " spline_weight(tess_pos + patch_pos, knots, weights);\n"); - } - WRITE(p, " float3 pos = tess_sample(_pos, weights);\n"); - if (doTexture && hasTexcoord) { - if (hasTexcoordTess) - WRITE(p, " float2 tex = tess_sample(_tex, weights);\n"); - else - WRITE(p, " float2 tex = tess_pos + patch_pos;\n"); - } - if (hasColor) { - if (hasColorTess) - WRITE(p, " float4 col = tess_sample(_col, weights);\n"); - else - WRITE(p, " float4 col = tess_data[0].col;\n"); - } - if (hasNormal) { - // Curved surface is probably always need to compute normal(not sampling from control points) - if (doBezier) { - // Bernstein derivative - WRITE(p, " float2 bernderiv[4];\n"); - WRITE(p, " bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n"); - WRITE(p, " bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n"); - WRITE(p, " bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n"); - WRITE(p, " bernderiv[3] = 3.0 * tess_pos * tess_pos; \n"); + // Hardware tessellation + WRITE(p, " Tess tess;\n"); + WRITE(p, " tessellate(In, tess);\n"); - WRITE(p, " float2 bernderiv_u[4];\n"); - WRITE(p, " float2 bernderiv_v[4];\n"); - WRITE(p, " for (int i = 0; i < 4; i++) {\n"); - WRITE(p, " bernderiv_u[i] = float2(bernderiv[i].x, weights[i].y);\n"); - WRITE(p, " bernderiv_v[i] = float2(weights[i].x, bernderiv[i].y);\n"); - WRITE(p, " }\n"); - - WRITE(p, " float3 du = tess_sample(_pos, bernderiv_u);\n"); - WRITE(p, " float3 dv = tess_sample(_pos, bernderiv_v);\n"); - } else if (doSpline) { - WRITE(p, " float2 tess_next_u = float2(In.normal.x, 0.0);\n"); - WRITE(p, " float2 tess_next_v = float2(0.0, In.normal.y);\n"); - // Right - WRITE(p, " float2 tess_pos_r = tess_pos + tess_next_u;\n"); - WRITE(p, " spline_weight(tess_pos_r + patch_pos, knots, weights);\n"); - WRITE(p, " float3 pos_r = tess_sample(_pos, weights);\n"); - // Left - WRITE(p, " float2 tess_pos_l = tess_pos - tess_next_u;\n"); - WRITE(p, " spline_weight(tess_pos_l + patch_pos, knots, weights);\n"); - WRITE(p, " float3 pos_l = tess_sample(_pos, weights);\n"); - // Down - WRITE(p, " float2 tess_pos_d = tess_pos + tess_next_v;\n"); - WRITE(p, " spline_weight(tess_pos_d + patch_pos, knots, weights);\n"); - WRITE(p, " float3 pos_d = tess_sample(_pos, weights);\n"); - // Up - WRITE(p, " float2 tess_pos_u = tess_pos - tess_next_v;\n"); - WRITE(p, " spline_weight(tess_pos_u + patch_pos, knots, weights);\n"); - WRITE(p, " float3 pos_u = tess_sample(_pos, weights);\n"); - - WRITE(p, " float3 du = pos_r - pos_l;\n"); - WRITE(p, " float3 dv = pos_d - pos_u;\n"); - } - WRITE(p, " float3 nrm = cross(du, dv);\n"); - WRITE(p, " nrm = normalize(nrm);\n"); - } - WRITE(p, " float3 worldpos = mul(float4(pos.xyz, 1.0), u_world);\n"); - if (hasNormal) - WRITE(p, " float3 worldnormal = normalize(mul(float4(%snrm, 0.0), u_world));\n", flipNormalTess ? "-" : ""); + WRITE(p, " float3 worldpos = mul(float4(tess.pos.xyz, 1.0), u_world);\n"); + if (hasNormalTess) + WRITE(p, " float3 worldnormal = normalize(mul(float4(%stess.nrm, 0.0), u_world));\n", flipNormalTess ? "-" : ""); else WRITE(p, " float3 worldnormal = float3(0.0, 0.0, 1.0);\n"); } else { @@ -600,9 +517,10 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage const char *diffuseStr = (matUpdate & 2) && hasColor ? "In.color0.rgb" : "u_matdiffuse"; const char *specularStr = (matUpdate & 4) && hasColor ? "In.color0.rgb" : "u_matspecular.rgb"; if (doBezier || doSpline) { - ambientStr = (matUpdate & 1) && hasColor ? "col" : "u_matambientalpha"; - diffuseStr = (matUpdate & 2) && hasColor ? "col.rgb" : "u_matdiffuse"; - specularStr = (matUpdate & 4) && hasColor ? "col.rgb" : "u_matspecular.rgb"; + // TODO: Probably, should use hasColorTess but FF4 has a problem with drawing the background. + ambientStr = (matUpdate & 1) && hasColor ? "tess.col" : "u_matambientalpha"; + diffuseStr = (matUpdate & 2) && hasColor ? "tess.col.rgb" : "u_matdiffuse"; + specularStr = (matUpdate & 4) && hasColor ? "tess.col.rgb" : "u_matspecular.rgb"; } bool diffuseIsZero = true; @@ -729,7 +647,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage // Lighting doesn't affect color. if (hasColor) { if (doBezier || doSpline) - WRITE(p, " Out.v_color0 = col;\n"); + WRITE(p, " Out.v_color0 = tess.col;\n"); else WRITE(p, " Out.v_color0 = In.color0;\n"); } else { @@ -747,7 +665,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage if (scaleUV) { if (hasTexcoord) { if (doBezier || doSpline) - WRITE(p, " Out.v_texcoord = float3(tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); + WRITE(p, " Out.v_texcoord = float3(tess.tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); else WRITE(p, " Out.v_texcoord = float3(In.texcoord.xy * u_uvscaleoffset.xy, 0.0);\n"); } else { @@ -755,10 +673,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage } } else { if (hasTexcoord) { - if (doBezier || doSpline) - WRITE(p, " Out.v_texcoord = float3(tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); - else - WRITE(p, " Out.v_texcoord = float3(In.texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); + WRITE(p, " Out.v_texcoord = float3(In.texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); } else { WRITE(p, " Out.v_texcoord = float3(u_uvscaleoffset.zw, 0.0);\n"); } diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp index a8857b17bc..a2bb108860 100644 --- a/GPU/GLES/DrawEngineGLES.cpp +++ b/GPU/GLES/DrawEngineGLES.cpp @@ -81,22 +81,21 @@ DrawEngineGLES::DrawEngineGLES(Draw::DrawContext *draw) : vai_(256), draw_(draw) // All this is a LOT of memory, need to see if we can cut down somehow. decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); - splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex); InitDeviceObjects(); - tessDataTransfer = new TessellationDataTransferGLES(render_); + tessDataTransferGLES = new TessellationDataTransferGLES(render_); + tessDataTransfer = tessDataTransferGLES; } DrawEngineGLES::~DrawEngineGLES() { DestroyDeviceObjects(); FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE); FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE); - FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE); - delete tessDataTransfer; + delete tessDataTransferGLES; } void DrawEngineGLES::DeviceLost() { @@ -166,7 +165,7 @@ void DrawEngineGLES::EndFrame() { FrameData &frameData = frameData_[render_->GetCurFrame()]; render_->EndPushBuffer(frameData.pushIndex); render_->EndPushBuffer(frameData.pushVertex); - tessDataTransfer->EndFrame(); + tessDataTransferGLES->EndFrame(); } struct GlTypeInfo { @@ -520,10 +519,7 @@ rotateVBO: indexBufferOffset = (uint32_t)frameData.pushIndex->Push(decIndex, sizeof(uint16_t) * indexGen.VertexCount(), &indexBuffer); render_->BindIndexBuffer(indexBuffer); } - if (gstate_c.bezier || gstate_c.spline) - render_->DrawIndexed(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset, numPatches); - else - render_->DrawIndexed(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset); + render_->DrawIndexed(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset); } else { render_->Draw(glprim[prim], 0, vertexCount); } @@ -655,46 +651,66 @@ bool DrawEngineGLES::IsCodePtrVertexDecoder(const u8 *ptr) const { return decJitCache_->IsInSpace(ptr); } -void DrawEngineGLES::TessellationDataTransferGLES::SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) { +void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) { + bool hasColor = (vertType & GE_VTYPE_COL_MASK) != 0; + bool hasTexCoord = (vertType & GE_VTYPE_TC_MASK) != 0; + + int size = size_u * size_v; + float *pos = new float[size * 4]; + float *tex = hasTexCoord ? new float[size * 4] : nullptr; + float *col = hasColor ? new float[size * 4] : nullptr; + int stride = 4; + + CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType); // Removed the 1D texture support, it's unlikely to be relevant for performance. - if (data_tex[0]) - renderManager_->DeleteTexture(data_tex[0]); - uint8_t *pos_data = new uint8_t[size * sizeof(float) * 4]; - memcpy(pos_data, pos, size * sizeof(float) * 4); - data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D); - renderManager_->TextureImage(data_tex[0], 0, size, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, pos_data, GLRAllocType::NEW, false); - renderManager_->FinalizeTexture(data_tex[0], 0, false); - renderManager_->BindTexture(TEX_SLOT_SPLINE_POS, data_tex[0]); - - // Texcoords - if (hasTexCoords) { - if (data_tex[1]) - renderManager_->DeleteTexture(data_tex[1]); - uint8_t *tex_data = new uint8_t[size * sizeof(float) * 4]; - memcpy(tex_data, tex, size * sizeof(float) * 4); - data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D); - renderManager_->TextureImage(data_tex[1], 0, size, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, tex_data, GLRAllocType::NEW, false); - renderManager_->FinalizeTexture(data_tex[1], 0, false); - renderManager_->BindTexture(TEX_SLOT_SPLINE_NRM, data_tex[1]); + // Control Points + if (prevSizeU < size_u || prevSizeV < size_v) { + prevSizeU = size_u; + prevSizeV = size_v; + if (!data_tex[0]) + data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D); + renderManager_->TextureImage(data_tex[0], 0, size_u * 3, size_v, GL_RGBA32F, GL_RGBA, GL_FLOAT, nullptr, GLRAllocType::NONE, false); + renderManager_->FinalizeTexture(data_tex[0], 0, false); } + renderManager_->BindTexture(TEX_SLOT_SPLINE_POINTS, data_tex[0]); + // Position + renderManager_->TextureSubImage(data_tex[0], 0, 0, 0, size_u, size_v, GL_RGBA, GL_FLOAT, (u8 *)pos, GLRAllocType::NEW); + // Texcoord + if (hasTexCoord) + renderManager_->TextureSubImage(data_tex[0], 0, size_u, 0, size_u, size_v, GL_RGBA, GL_FLOAT, (u8 *)tex, GLRAllocType::NEW); + // Color + if (hasColor) + renderManager_->TextureSubImage(data_tex[0], 0, size_u * 2, 0, size_u, size_v, GL_RGBA, GL_FLOAT, (u8 *)col, GLRAllocType::NEW); - if (data_tex[2]) - renderManager_->DeleteTexture(data_tex[2]); - data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D); - int sizeColor = hasColor ? size : 1; - uint8_t *col_data = new uint8_t[sizeColor * sizeof(float) * 4]; - memcpy(col_data, col, sizeColor * sizeof(float) * 4); + // Weight U + if (prevSizeWU < weights.size_u) { + prevSizeWU = weights.size_u; + if (!data_tex[1]) + data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D); + renderManager_->TextureImage(data_tex[1], 0, weights.size_u * 2, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, nullptr, GLRAllocType::NONE, false); + renderManager_->FinalizeTexture(data_tex[1], 0, false); + } + renderManager_->BindTexture(TEX_SLOT_SPLINE_WEIGHTS_U, data_tex[1]); + renderManager_->TextureSubImage(data_tex[1], 0, 0, 0, weights.size_u * 2, 1, GL_RGBA, GL_FLOAT, (u8 *)weights.u, GLRAllocType::NONE); - renderManager_->TextureImage(data_tex[2], 0, sizeColor, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, col_data, GLRAllocType::NEW, false); - renderManager_->FinalizeTexture(data_tex[2], 0, false); - renderManager_->BindTexture(TEX_SLOT_SPLINE_COL, data_tex[2]); + // Weight V + if (prevSizeWV < weights.size_v) { + prevSizeWV = weights.size_v; + if (!data_tex[2]) + data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D); + renderManager_->TextureImage(data_tex[2], 0, weights.size_v * 2, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, nullptr, GLRAllocType::NONE, false); + renderManager_->FinalizeTexture(data_tex[2], 0, false); + } + renderManager_->BindTexture(TEX_SLOT_SPLINE_WEIGHTS_V, data_tex[2]); + renderManager_->TextureSubImage(data_tex[2], 0, 0, 0, weights.size_v * 2, 1, GL_RGBA, GL_FLOAT, (u8 *)weights.v, GLRAllocType::NONE); } -void DrawEngineGLES::TessellationDataTransferGLES::EndFrame() { +void TessellationDataTransferGLES::EndFrame() { for (int i = 0; i < 3; i++) { if (data_tex[i]) { renderManager_->DeleteTexture(data_tex[i]); data_tex[i] = nullptr; } } + prevSizeU = prevSizeV = prevSizeWU = prevSizeWV = 0; } diff --git a/GPU/GLES/DrawEngineGLES.h b/GPU/GLES/DrawEngineGLES.h index 2901a1c384..8b29700f83 100644 --- a/GPU/GLES/DrawEngineGLES.h +++ b/GPU/GLES/DrawEngineGLES.h @@ -46,9 +46,9 @@ enum { TEX_SLOT_SHADERBLEND_SRC = 1, TEX_SLOT_ALPHATEST = 2, TEX_SLOT_CLUT = 3, - TEX_SLOT_SPLINE_POS = 4, - TEX_SLOT_SPLINE_NRM = 5, - TEX_SLOT_SPLINE_COL = 6, + TEX_SLOT_SPLINE_POINTS = 4, + TEX_SLOT_SPLINE_WEIGHTS_U = 5, + TEX_SLOT_SPLINE_WEIGHTS_V = 6, }; @@ -110,6 +110,23 @@ public: u8 flags; }; +class TessellationDataTransferGLES : public TessellationDataTransfer { +private: + GLRTexture *data_tex[3]{}; + int prevSizeU = 0, prevSizeV = 0; + int prevSizeWU = 0, prevSizeWV = 0; + GLRenderManager *renderManager_; +public: + TessellationDataTransferGLES(GLRenderManager *renderManager) + : renderManager_(renderManager) { } + ~TessellationDataTransferGLES() { + EndFrame(); + } + // Send spline/bezier's control points and weights to vertex shader through floating point texture. + void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override; + void EndFrame(); // Queues textures for deletion. +}; + // Handles transform, lighting and drawing. class DrawEngineGLES : public DrawEngineCommon { public: @@ -208,17 +225,5 @@ private: int bufferDecimationCounter_ = 0; // Hardware tessellation - class TessellationDataTransferGLES : public TessellationDataTransfer { - private: - GLRTexture *data_tex[3]{}; - GLRenderManager *renderManager_; - public: - TessellationDataTransferGLES(GLRenderManager *renderManager) - : renderManager_(renderManager) { } - ~TessellationDataTransferGLES() { - EndFrame(); - } - void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override; - void EndFrame() override; // Queues textures for deletion. - }; + TessellationDataTransferGLES *tessDataTransferGLES; }; diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp index 47e04f12a0..cde3291767 100644 --- a/GPU/GLES/GPU_GLES.cpp +++ b/GPU/GLES/GPU_GLES.cpp @@ -109,8 +109,7 @@ GPU_GLES::GPU_GLES(GraphicsContext *gfxCtx, Draw::DrawContext *draw) if (g_Config.bHardwareTessellation) { // Disable hardware tessellation if device is unsupported. bool hasTexelFetch = gl_extensions.GLES3 || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 3, 0)) || gl_extensions.EXT_gpu_shader4; - if (!gstate_c.SupportsAll(GPU_SUPPORTS_INSTANCE_RENDERING | GPU_SUPPORTS_VERTEX_TEXTURE_FETCH | GPU_SUPPORTS_TEXTURE_FLOAT) || !hasTexelFetch) { - // TODO: Check unsupported device name list.(Above gpu features are supported but it has issues with weak gpu, memory, shader compiler etc...) + if (!gstate_c.SupportsAll(GPU_SUPPORTS_VERTEX_TEXTURE_FETCH | GPU_SUPPORTS_TEXTURE_FLOAT) || !hasTexelFetch) { g_Config.bHardwareTessellation = false; ERROR_LOG(G3D, "Hardware Tessellation is unsupported, falling back to software tessellation"); I18NCategory *gr = GetI18NCategory("Graphics"); diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp index 5a7aa94dbd..2a20129766 100644 --- a/GPU/GLES/ShaderManagerGLES.cpp +++ b/GPU/GLES/ShaderManagerGLES.cpp @@ -159,13 +159,10 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs, // We need to fetch these unconditionally, gstate_c.spline or bezier will not be set if we // create this shader at load time from the shader cache. - queries.push_back({ &u_tess_pos_tex, "u_tess_pos_tex" }); - queries.push_back({ &u_tess_tex_tex, "u_tess_tex_tex" }); - queries.push_back({ &u_tess_col_tex, "u_tess_col_tex" }); - queries.push_back({ &u_spline_count_u, "u_spline_count_u" }); - queries.push_back({ &u_spline_count_v, "u_spline_count_v" }); - queries.push_back({ &u_spline_type_u, "u_spline_type_u" }); - queries.push_back({ &u_spline_type_v, "u_spline_type_v" }); + queries.push_back({ &u_tess_points, "u_tess_points" }); + queries.push_back({ &u_tess_weights_u, "u_tess_weights_u" }); + queries.push_back({ &u_tess_weights_v, "u_tess_weights_v" }); + queries.push_back({ &u_spline_counts, "u_spline_counts" }); queries.push_back({ &u_depal, "u_depal" }); attrMask = vs->GetAttrMask(); @@ -176,9 +173,9 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs, initialize.push_back({ &u_fbotex, 0, 1 }); initialize.push_back({ &u_testtex, 0, 2 }); initialize.push_back({ &u_pal, 0, 3 }); // CLUT - initialize.push_back({ &u_tess_pos_tex, 0, 4 }); // Texture unit 4 - initialize.push_back({ &u_tess_tex_tex, 0, 5 }); // Texture unit 5 - initialize.push_back({ &u_tess_col_tex, 0, 6 }); // Texture unit 6 + initialize.push_back({ &u_tess_points, 0, 4 }); // Control Points + initialize.push_back({ &u_tess_weights_u, 0, 5 }); + initialize.push_back({ &u_tess_weights_v, 0, 6 }); program = render->CreateProgram(shaders, semantics, queries, initialize, gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND); @@ -567,13 +564,9 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) { } if (dirty & DIRTY_BEZIERSPLINE) { - render_->SetUniformI1(&u_spline_count_u, gstate_c.spline_count_u); - if (u_spline_count_v != -1) - render_->SetUniformI1(&u_spline_count_v, gstate_c.spline_count_v); - if (u_spline_type_u != -1) - render_->SetUniformI1(&u_spline_type_u, gstate_c.spline_type_u); - if (u_spline_type_v != -1) - render_->SetUniformI1(&u_spline_type_v, gstate_c.spline_type_v); + if (u_spline_counts != -1) { + render_->SetUniformI1(&u_spline_counts, gstate_c.spline_num_points_u); + } } } diff --git a/GPU/GLES/ShaderManagerGLES.h b/GPU/GLES/ShaderManagerGLES.h index f5b9fa640b..c676639833 100644 --- a/GPU/GLES/ShaderManagerGLES.h +++ b/GPU/GLES/ShaderManagerGLES.h @@ -117,13 +117,11 @@ public: int u_lightspecular[4]; // attenuation int u_lightambient[4]; // attenuation - int u_tess_pos_tex; - int u_tess_tex_tex; - int u_tess_col_tex; - int u_spline_count_u; - int u_spline_count_v; - int u_spline_type_u; - int u_spline_type_v; + // Spline Tessellation + int u_tess_points; // Control Points + int u_tess_weights_u; + int u_tess_weights_v; + int u_spline_counts; }; // Real public interface diff --git a/GPU/GLES/VertexShaderGeneratorGLES.cpp b/GPU/GLES/VertexShaderGeneratorGLES.cpp index 7aa92fca4c..7efc8e6341 100644 --- a/GPU/GLES/VertexShaderGeneratorGLES.cpp +++ b/GPU/GLES/VertexShaderGeneratorGLES.cpp @@ -193,6 +193,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, bool doSpline = id.Bit(VS_BIT_SPLINE); bool hasColorTess = id.Bit(VS_BIT_HAS_COLOR_TESS); bool hasTexcoordTess = id.Bit(VS_BIT_HAS_TEXCOORD_TESS); + bool hasNormalTess = id.Bit(VS_BIT_HAS_NORMAL_TESS); bool flipNormalTess = id.Bit(VS_BIT_NORM_REVERSE_TESS); const char *shading = ""; @@ -379,83 +380,88 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, if (doBezier || doSpline) { *uniformMask |= DIRTY_BEZIERSPLINE; - WRITE(p, "uniform sampler2D u_tess_pos_tex;\n"); - WRITE(p, "uniform sampler2D u_tess_tex_tex;\n"); - WRITE(p, "uniform sampler2D u_tess_col_tex;\n"); + WRITE(p, "uniform sampler2D u_tess_points;\n"); // Control Points + WRITE(p, "uniform sampler2D u_tess_weights_u;\n"); + WRITE(p, "uniform sampler2D u_tess_weights_v;\n"); - WRITE(p, "uniform int u_spline_count_u;\n"); + WRITE(p, "uniform int u_spline_counts;\n"); for (int i = 2; i <= 4; i++) { // Define 3 types vec2, vec3, vec4 - WRITE(p, "vec%d tess_sample(in vec%d points[16], in vec2 weights[4]) {\n", i, i); + WRITE(p, "vec%d tess_sample(in vec%d points[16], mat4 weights) {\n", i, i); WRITE(p, " vec%d pos = vec%d(0.0);\n", i, i); - WRITE(p, " for (int i = 0; i < 4; ++i) {\n"); - WRITE(p, " for (int j = 0; j < 4; ++j) {\n"); - WRITE(p, " float f = weights[j].x * weights[i].y;\n"); - WRITE(p, " if (f != 0.0)\n"); - WRITE(p, " pos = pos + f * points[i * 4 + j];\n"); - WRITE(p, " }\n"); - WRITE(p, " }\n"); + for (int v = 0; v < 4; ++v) { + for (int u = 0; u < 4; ++u) { + WRITE(p, " pos += weights[%i][%i] * points[%i];\n", v, u, v * 4 + u); + } + } WRITE(p, " return pos;\n"); WRITE(p, "}\n"); } - if (doSpline) { - WRITE(p, "uniform int u_spline_count_v;\n"); - WRITE(p, "uniform int u_spline_type_u;\n"); - WRITE(p, "uniform int u_spline_type_v;\n"); - WRITE(p, "void spline_knot(ivec2 num_patches, ivec2 type, out vec2 knot[6], ivec2 patch_pos) {\n"); - WRITE(p, " for (int i = 0; i < 6; ++i) {\n"); - WRITE(p, " knot[i] = vec2(float(i + patch_pos.x - 2), float(i + patch_pos.y - 2));\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.x & 1) != 0) {\n"); - WRITE(p, " if (patch_pos.x <= 2)\n"); - WRITE(p, " knot[0].x = 0.0;\n"); - WRITE(p, " if (patch_pos.x <= 1)\n"); - WRITE(p, " knot[1].x = 0.0;\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.x & 2) != 0) {\n"); - WRITE(p, " if (patch_pos.x >= (num_patches.x - 2))\n"); - WRITE(p, " knot[5].x = float(num_patches.x);\n"); - WRITE(p, " if (patch_pos.x == (num_patches.x - 1))\n"); - WRITE(p, " knot[4].x = float(num_patches.x);\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.y & 1) != 0) {\n"); - WRITE(p, " if (patch_pos.y <= 2)\n"); - WRITE(p, " knot[0].y = 0.0;\n"); - WRITE(p, " if (patch_pos.y <= 1)\n"); - WRITE(p, " knot[1].y = 0.0;\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.y & 2) != 0) {\n"); - WRITE(p, " if (patch_pos.y >= (num_patches.y - 2))\n"); - WRITE(p, " knot[5].y = float(num_patches.y);\n"); - WRITE(p, " if (patch_pos.y == (num_patches.y - 1))\n"); - WRITE(p, " knot[4].y = float(num_patches.y);\n"); - WRITE(p, " }\n"); - WRITE(p, "}\n"); - - WRITE(p, "void spline_weight(vec2 t, in vec2 knot[6], out vec2 weights[4]) {\n"); - // TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly... - WRITE(p, " vec2 t0 = (t - knot[0]);\n"); - WRITE(p, " vec2 t1 = (t - knot[1]);\n"); - WRITE(p, " vec2 t2 = (t - knot[2]);\n"); - // TODO: All our knots are integers so we should be able to get rid of these divisions (How?) - WRITE(p, " vec2 f30 = t0 / (knot[3] - knot[0]);\n"); - WRITE(p, " vec2 f41 = t1 / (knot[4] - knot[1]);\n"); - WRITE(p, " vec2 f52 = t2 / (knot[5] - knot[2]);\n"); - WRITE(p, " vec2 f31 = t1 / (knot[3] - knot[1]);\n"); - WRITE(p, " vec2 f42 = t2 / (knot[4] - knot[2]);\n"); - WRITE(p, " vec2 f32 = t2 / (knot[3] - knot[2]);\n"); - WRITE(p, " vec2 a = (1.0 - f30)*(1.0 - f31);\n"); - WRITE(p, " vec2 b = (f31*f41);\n"); - WRITE(p, " vec2 c = (1.0 - f41)*(1.0 - f42);\n"); - WRITE(p, " vec2 d = (f42*f52);\n"); - WRITE(p, " weights[0] = a - (a*f32);\n"); - WRITE(p, " weights[1] = vec2(1.0) - a - b + ((a + b + c - vec2(1.0))*f32);\n"); - WRITE(p, " weights[2] = b + ((vec2(1.0) - b - c - d)*f32);\n"); - WRITE(p, " weights[3] = d*f32;\n"); + if (!gl_extensions.VersionGEThan(3, 0, 0)) { // For glsl version 1.10 + WRITE(p, "mat4 outerProduct(vec4 u, vec4 v) {\n"); + WRITE(p, " return mat4(u * v[0], u * v[1], u * v[2], u * v[3]);\n"); WRITE(p, "}\n"); } + + WRITE(p, "struct Tess {\n"); + WRITE(p, " vec3 pos;\n"); + if (doTexture) + WRITE(p, " vec2 tex;\n"); + WRITE(p, " vec4 col;\n"); + if (hasNormalTess) + WRITE(p, " vec3 nrm;\n"); + WRITE(p, "};\n"); + + WRITE(p, "void tessellate(out Tess tess) {\n"); + WRITE(p, " ivec2 point_pos = ivec2(position.z, normal.z)%s;\n", doBezier ? " * 3" : ""); + WRITE(p, " ivec2 weight_idx = ivec2(position.xy);\n"); + + // Load 4x4 control points + WRITE(p, " vec3 _pos[16];\n"); + WRITE(p, " vec2 _tex[16];\n"); + WRITE(p, " vec4 _col[16];\n"); + WRITE(p, " int index_u, index_v;\n"); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + WRITE(p, " index_u = (%i + point_pos.x);\n", j); + WRITE(p, " index_v = (%i + point_pos.y);\n", i); + WRITE(p, " _pos[%i] = %s(u_tess_points, ivec2(index_u, index_v), 0).xyz;\n", i * 4 + j, texelFetch); + if (doTexture && hasTexcoordTess) + WRITE(p, " _tex[%i] = %s(u_tess_points, ivec2(index_u + u_spline_counts, index_v), 0).xy;\n", i * 4 + j, texelFetch); + if (hasColorTess) + WRITE(p, " _col[%i] = %s(u_tess_points, ivec2(index_u + u_spline_counts * 2, index_v), 0).rgba;\n", i * 4 + j, texelFetch); + } + } + + // Basis polynomials as weight coefficients + WRITE(p, " vec4 basis_u = %s(u_tess_weights_u, %s, 0);\n", texelFetch, "ivec2(weight_idx.x * 2, 0)"); + WRITE(p, " vec4 basis_v = %s(u_tess_weights_v, %s, 0);\n", texelFetch, "ivec2(weight_idx.y * 2, 0)"); + WRITE(p, " mat4 basis = outerProduct(basis_u, basis_v);\n"); + + // Tessellate + WRITE(p, " tess.pos = tess_sample(_pos, basis);\n"); + if (doTexture) { + if (hasTexcoordTess) + WRITE(p, " tess.tex = tess_sample(_tex, basis);\n"); + else + WRITE(p, " tess.tex = normal.xy;\n"); + } + if (hasColorTess) + WRITE(p, " tess.col = tess_sample(_col, basis);\n"); + else + WRITE(p, " tess.col = u_matambientalpha;\n"); + if (hasNormalTess) { + // Derivatives as weight coefficients + WRITE(p, " vec4 deriv_u = %s(u_tess_weights_u, %s, 0);\n", texelFetch, "ivec2(weight_idx.x * 2 + 1, 0)"); + WRITE(p, " vec4 deriv_v = %s(u_tess_weights_v, %s, 0);\n", texelFetch, "ivec2(weight_idx.y * 2 + 1, 0)"); + + WRITE(p, " vec3 du = tess_sample(_pos, outerProduct(deriv_u, basis_v));\n"); + WRITE(p, " vec3 dv = tess_sample(_pos, outerProduct(basis_u, deriv_v));\n"); + WRITE(p, " tess.nrm = normalize(cross(du, dv));\n"); + } + WRITE(p, "}\n"); } WRITE(p, "void main() {\n"); @@ -494,101 +500,14 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, } else { // Step 1: World Transform / Skinning if (!enableBones) { - // Hardware tessellation if (doBezier || doSpline) { - WRITE(p, " vec3 _pos[16];\n"); - WRITE(p, " vec2 _tex[16];\n"); - WRITE(p, " vec4 _col[16];\n"); - WRITE(p, " int num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3" : "u_spline_count_u - 3"); - WRITE(p, " int u = int(mod(float(gl_InstanceID), float(num_patches_u)));\n"); - WRITE(p, " int v = gl_InstanceID / num_patches_u;\n"); - WRITE(p, " ivec2 patch_pos = ivec2(u, v);\n"); - WRITE(p, " for (int i = 0; i < 4; i++) {\n"); - WRITE(p, " for (int j = 0; j < 4; j++) {\n"); - WRITE(p, " int index = (i + v%s) * u_spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : ""); - WRITE(p, " _pos[i * 4 + j] = %s(u_tess_pos_tex, ivec2(index, 0), 0).xyz;\n", texelFetch); - if (doTexture && hasTexcoord && hasTexcoordTess) - WRITE(p, " _tex[i * 4 + j] = %s(u_tess_tex_tex, ivec2(index, 0), 0).xy;\n", texelFetch); - if (hasColor && hasColorTess) - WRITE(p, " _col[i * 4 + j] = %s(u_tess_col_tex, ivec2(index, 0), 0).rgba;\n", texelFetch); - WRITE(p, " }\n"); - WRITE(p, " }\n"); - WRITE(p, " vec2 tess_pos = position.xy;\n"); - WRITE(p, " vec2 weights[4];\n"); - if (doBezier) { - // Bernstein 3D - WRITE(p, " weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n"); - WRITE(p, " weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n"); - WRITE(p, " weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n"); - WRITE(p, " weights[3] = tess_pos * tess_pos * tess_pos;\n"); - } else { // Spline - WRITE(p, " ivec2 spline_num_patches = ivec2(u_spline_count_u - 3, u_spline_count_v - 3);\n"); - WRITE(p, " ivec2 spline_type = ivec2(u_spline_type_u, u_spline_type_v);\n"); - WRITE(p, " vec2 knots[6];\n"); - WRITE(p, " spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n"); - WRITE(p, " spline_weight(tess_pos + vec2(patch_pos), knots, weights);\n"); - } - WRITE(p, " vec3 pos = tess_sample(_pos, weights);\n"); - if (doTexture && hasTexcoord) { - if (hasTexcoordTess) - WRITE(p, " vec2 tex = tess_sample(_tex, weights);\n"); - else - WRITE(p, " vec2 tex = tess_pos + vec2(patch_pos);\n"); - } - if (hasColor) { - if (hasColorTess) - WRITE(p, " vec4 col = tess_sample(_col, weights);\n"); - else - WRITE(p, " vec4 col = %s(u_tess_col_tex, ivec2(0, 0), 0).rgba;\n", texelFetch); - } - if (hasNormal) { - // Curved surface is probably always need to compute normal(not sampling from control points) - if (doBezier) { - // Bernstein derivative - WRITE(p, " vec2 bernderiv[4];\n"); - WRITE(p, " bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n"); - WRITE(p, " bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n"); - WRITE(p, " bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n"); - WRITE(p, " bernderiv[3] = 3.0 * tess_pos * tess_pos; \n"); + // Hardware tessellation + WRITE(p, " Tess tess;\n"); + WRITE(p, " tessellate(tess);\n"); - WRITE(p, " vec2 bernderiv_u[4];\n"); - WRITE(p, " vec2 bernderiv_v[4];\n"); - WRITE(p, " for (int i = 0; i < 4; i++) {\n"); - WRITE(p, " bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n"); - WRITE(p, " bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n"); - WRITE(p, " }\n"); - - WRITE(p, " vec3 du = tess_sample(_pos, bernderiv_u);\n"); - WRITE(p, " vec3 dv = tess_sample(_pos, bernderiv_v);\n"); - } else { // Spline - WRITE(p, " vec2 tess_next_u = vec2(normal.x, 0.0);\n"); - WRITE(p, " vec2 tess_next_v = vec2(0.0, normal.y);\n"); - // Right - WRITE(p, " vec2 tess_pos_r = tess_pos + tess_next_u;\n"); - WRITE(p, " spline_weight(tess_pos_r + vec2(patch_pos), knots, weights);\n"); - WRITE(p, " vec3 pos_r = tess_sample(_pos, weights);\n"); - // Left - WRITE(p, " vec2 tess_pos_l = tess_pos - tess_next_u;\n"); - WRITE(p, " spline_weight(tess_pos_l + vec2(patch_pos), knots, weights);\n"); - WRITE(p, " vec3 pos_l = tess_sample(_pos, weights);\n"); - // Down - WRITE(p, " vec2 tess_pos_d = tess_pos + tess_next_v;\n"); - WRITE(p, " spline_weight(tess_pos_d + vec2(patch_pos), knots, weights);\n"); - WRITE(p, " vec3 pos_d = tess_sample(_pos, weights);\n"); - // Up - WRITE(p, " vec2 tess_pos_u = tess_pos - tess_next_v;\n"); - WRITE(p, " spline_weight(tess_pos_u + vec2(patch_pos), knots, weights);\n"); - WRITE(p, " vec3 pos_u = tess_sample(_pos, weights);\n"); - - WRITE(p, " vec3 du = pos_r - pos_l;\n"); - WRITE(p, " vec3 dv = pos_d - pos_u;\n"); - } - WRITE(p, " vec3 nrm = cross(du, dv);\n"); - WRITE(p, " nrm = normalize(nrm);\n"); - } - WRITE(p, " vec3 worldpos = (u_world * vec4(pos.xyz, 1.0)).xyz;\n"); - if (hasNormal) { - WRITE(p, " mediump vec3 worldnormal = normalize((u_world * vec4(%snrm, 0.0)).xyz);\n", flipNormalTess ? "-" : ""); + WRITE(p, " vec3 worldpos = (u_world * vec4(tess.pos.xyz, 1.0)).xyz;\n"); + if (hasNormalTess) { + WRITE(p, " mediump vec3 worldnormal = normalize((u_world * vec4(%stess.nrm, 0.0)).xyz);\n", flipNormalTess ? "-" : ""); } else { WRITE(p, " mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n"); } @@ -692,9 +611,10 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, const char *diffuseStr = (matUpdate & 2) && hasColor ? "color0.rgb" : "u_matdiffuse"; const char *specularStr = (matUpdate & 4) && hasColor ? "color0.rgb" : "u_matspecular.rgb"; if (doBezier || doSpline) { - ambientStr = (matUpdate & 1) && hasColor ? "col" : "u_matambientalpha"; - diffuseStr = (matUpdate & 2) && hasColor ? "col.rgb" : "u_matdiffuse"; - specularStr = (matUpdate & 4) && hasColor ? "col.rgb" : "u_matspecular.rgb"; + // TODO: Probably, should use hasColorTess but FF4 has a problem with drawing the background. + ambientStr = (matUpdate & 1) && hasColor ? "tess.col" : "u_matambientalpha"; + diffuseStr = (matUpdate & 2) && hasColor ? "tess.col.rgb" : "u_matdiffuse"; + specularStr = (matUpdate & 4) && hasColor ? "tess.col.rgb" : "u_matspecular.rgb"; } bool diffuseIsZero = true; @@ -821,7 +741,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, // Lighting doesn't affect color. if (hasColor) { if (doBezier || doSpline) - WRITE(p, " v_color0 = col;\n"); + WRITE(p, " v_color0 = tess.col;\n"); else WRITE(p, " v_color0 = color0;\n"); } else { @@ -839,9 +759,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, if (scaleUV) { if (hasTexcoord) { if (doBezier || doSpline) - // TODO: Need fix? - // Fix to avoid temporarily texture animation bug with hardware tessellation. - WRITE(p, " v_texcoord = vec3(tex * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); + WRITE(p, " v_texcoord = vec3(tess.tex * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); else WRITE(p, " v_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy, 0.0);\n"); } else { @@ -849,10 +767,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, } } else { if (hasTexcoord) { - if (doBezier || doSpline) - WRITE(p, " v_texcoord = vec3(tex * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); - else - WRITE(p, " v_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); + WRITE(p, " v_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n"); } else { WRITE(p, " v_texcoord = vec3(u_uvscaleoffset.zw, 0.0);\n"); } diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index a0ecf7dc4d..1d8266d500 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -1718,8 +1718,6 @@ bail: } void GPUCommon::Execute_Bezier(u32 op, u32 diff) { - drawEngineCommon_->DispatchFlush(); - // We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier. gstate_c.Dirty(DIRTY_UVSCALEOFFSET); @@ -1760,9 +1758,9 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) { if (CanUseHardwareTessellation(patchPrim)) { gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE); gstate_c.bezier = true; - if (gstate_c.spline_count_u != bz_ucount) { + if (gstate_c.spline_num_points_u != bz_ucount) { gstate_c.Dirty(DIRTY_BEZIERSPLINE); - gstate_c.spline_count_u = bz_ucount; + gstate_c.spline_num_points_u = bz_ucount; } } @@ -1780,8 +1778,6 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) { } void GPUCommon::Execute_Spline(u32 op, u32 diff) { - drawEngineCommon_->DispatchFlush(); - // We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier. gstate_c.Dirty(DIRTY_UVSCALEOFFSET); @@ -1824,14 +1820,9 @@ void GPUCommon::Execute_Spline(u32 op, u32 diff) { if (CanUseHardwareTessellation(patchPrim)) { gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE); gstate_c.spline = true; - bool countsChanged = gstate_c.spline_count_u != sp_ucount || gstate_c.spline_count_v != sp_vcount; - bool typesChanged = gstate_c.spline_type_u != sp_utype || gstate_c.spline_type_v != sp_vtype; - if (countsChanged || typesChanged) { + if (gstate_c.spline_num_points_u != sp_ucount) { gstate_c.Dirty(DIRTY_BEZIERSPLINE); - gstate_c.spline_count_u = sp_ucount; - gstate_c.spline_count_v = sp_vcount; - gstate_c.spline_type_u = sp_utype; - gstate_c.spline_type_v = sp_vtype; + gstate_c.spline_num_points_u = sp_ucount; } } diff --git a/GPU/GPUState.h b/GPU/GPUState.h index a7d4f719a3..25e81c6477 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -603,10 +603,7 @@ struct GPUStateCache { bool bezier; bool spline; - int spline_count_u; - int spline_count_v; - int spline_type_u; - int spline_type_v; + int spline_num_points_u; bool useShaderDepal; GEBufferFormat depalFramebufferFormat; diff --git a/GPU/Math3D.cpp b/GPU/Math3D.cpp index f43f41ade6..c46a4e520d 100644 --- a/GPU/Math3D.cpp +++ b/GPU/Math3D.cpp @@ -102,11 +102,49 @@ float Vec3::Distance2To(Vec3 &other) return Vec3(other-(*this)).Length2(); } +#if defined(_M_SSE) +__m128 SSENormalizeMultiplierSSE2(__m128 v) +{ + const __m128 sq = _mm_mul_ps(v, v); + const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2)); + const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq)); + + const __m128 rt = _mm_rsqrt_ss(res); + return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0)); +} + +#if _M_SSE >= 0x401 +__m128 SSENormalizeMultiplierSSE4(__m128 v) +{ + return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF)); +} + +__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v) +{ + if (useSSE4) + return SSENormalizeMultiplierSSE4(v); + return SSENormalizeMultiplierSSE2(v); +} +#else +__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v) +{ + return SSENormalizeMultiplierSSE2(v); +} +#endif template<> -Vec3 Vec3::Normalized() const +Vec3 Vec3::Normalized(bool useSSE4) const +{ + const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec); + return _mm_mul_ps(normalize, vec); +} +#else +template<> +Vec3 Vec3::Normalized(bool useSSE4) const { return (*this) / Length(); } +#endif template<> float Vec3::Normalize() diff --git a/GPU/Math3D.h b/GPU/Math3D.h index ba01d31600..292c63be05 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -25,6 +25,9 @@ #if defined(_M_SSE) #include +#if _M_SSE >= 0x401 +#include +#endif #endif namespace Math3D { @@ -177,8 +180,6 @@ public: const Vec2 ts() const { return Vec2(y, x); } }; -typedef Vec2 Vec2f; - template class Vec3Packed; @@ -295,7 +296,7 @@ public: void SetLength(const float l); Vec3 WithLength(const float l) const; float Distance2To(Vec3 &other); - Vec3 Normalized() const; + Vec3 Normalized(bool useSSE4 = false) const; float Normalize(); // returns the previous length, which is often useful T& operator [] (int i) //allow vector[2] = 3 (vector.z=3) @@ -817,6 +818,7 @@ private: }; // namespace Math3D +typedef Math3D::Vec2 Vec2f; typedef Math3D::Vec3 Vec3f; typedef Math3D::Vec3Packed Vec3Packedf; typedef Math3D::Vec4 Vec4f; @@ -1082,6 +1084,69 @@ __forceinline void Vec4::ToRGBA(u8 *rgba) const *(u32 *)rgba = ToRGBA(); } +#if defined(_M_SSE) +// Specialized for SIMD optimization + +// Vec3 operation +template<> +inline void Vec3::operator += (const Vec3 &other) +{ + vec = _mm_add_ps(vec, other.vec); +} + +template<> +inline Vec3 Vec3::operator + (const Vec3 &other) const +{ + return Vec3(_mm_add_ps(vec, other.vec)); +} + +template<> +inline Vec3 Vec3::operator * (const Vec3 &other) const +{ + return Vec3(_mm_mul_ps(vec, other.vec)); +} + +template<> template<> +inline Vec3 Vec3::operator * (const float &other) const +{ + return Vec3(_mm_mul_ps(vec, _mm_set_ps1(other))); +} + +// Vec4 operation +template<> +inline void Vec4::operator += (const Vec4 &other) +{ + vec = _mm_add_ps(vec, other.vec); +} + +template<> +inline Vec4 Vec4::operator + (const Vec4 &other) const +{ + return Vec4(_mm_add_ps(vec, other.vec)); +} + +template<> +inline Vec4 Vec4::operator * (const Vec4 &other) const +{ + return Vec4(_mm_mul_ps(vec, other.vec)); +} + +template<> template<> +inline Vec4 Vec4::operator * (const float &other) const +{ + return Vec4(_mm_mul_ps(vec, _mm_set_ps1(other))); +} + +// Vec3 cross product +template<> +inline Vec3 Cross(const Vec3 &a, const Vec3 &b) +{ + const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 1, 0, 2))); + const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 0, 2, 1))); + return _mm_sub_ps(left, right); +} +#endif + }; // namespace Math3D // linear interpolation via float: 0.0=begin, 1.0=end diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index e31d062de5..e72e9a85cb 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -42,13 +42,11 @@ SoftwareDrawEngine::SoftwareDrawEngine() { // All this is a LOT of memory, need to see if we can cut down somehow. Used for splines. decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); - splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); } SoftwareDrawEngine::~SoftwareDrawEngine() { FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE); FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE); - FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE); } void SoftwareDrawEngine::DispatchFlush() { @@ -280,7 +278,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy u16 index_lower_bound = 0; u16 index_upper_bound = vertex_count - 1; - IndexConverter idxConv(vertex_type, indices); + IndexConverter ConvertIndex(vertex_type, indices); if (indices) GetIndexBounds(indices, vertex_count, vertex_type, &index_lower_bound, &index_upper_bound); @@ -321,7 +319,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy { for (int vtx = 0; vtx < vertex_count; ++vtx) { if (indices) { - vreader.Goto(idxConv.convert(vtx) - index_lower_bound); + vreader.Goto(ConvertIndex(vtx) - index_lower_bound); } else { vreader.Goto(vtx); } @@ -380,7 +378,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy int skip_count = data_index == 0 ? 1 : 0; for (int vtx = 0; vtx < vertex_count; ++vtx) { if (indices) { - vreader.Goto(idxConv.convert(vtx) - index_lower_bound); + vreader.Goto(ConvertIndex(vtx) - index_lower_bound); } else { vreader.Goto(vtx); } @@ -410,7 +408,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy for (int vtx = 0; vtx < vertex_count; ++vtx) { if (indices) { - vreader.Goto(idxConv.convert(vtx) - index_lower_bound); + vreader.Goto(ConvertIndex(vtx) - index_lower_bound); } else { vreader.Goto(vtx); } @@ -452,7 +450,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy // Only read the central vertex if we're not continuing. if (data_index == 0) { if (indices) { - vreader.Goto(idxConv.convert(0) - index_lower_bound); + vreader.Goto(ConvertIndex(0) - index_lower_bound); } else { vreader.Goto(0); } @@ -463,7 +461,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy for (int vtx = start_vtx; vtx < vertex_count; ++vtx) { if (indices) { - vreader.Goto(idxConv.convert(vtx) - index_lower_bound); + vreader.Goto(ConvertIndex(vtx) - index_lower_bound); } else { vreader.Goto(vtx); } diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 16f0c495fb..654b8a1a41 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -69,6 +69,8 @@ enum { DRAW_BINDING_DYNUBO_LIGHT = 4, DRAW_BINDING_DYNUBO_BONE = 5, DRAW_BINDING_TESS_STORAGE_BUF = 6, + DRAW_BINDING_TESS_STORAGE_BUF_WU = 7, + DRAW_BINDING_TESS_STORAGE_BUF_WV = 8, }; enum { @@ -87,7 +89,6 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra // All this is a LOT of memory, need to see if we can cut down somehow. decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); - splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex); @@ -96,7 +97,7 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra void DrawEngineVulkan::InitDeviceObjects() { // All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated. - VkDescriptorSetLayoutBinding bindings[7]{}; + VkDescriptorSetLayoutBinding bindings[9]{}; bindings[0].descriptorCount = 1; bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; @@ -126,6 +127,14 @@ void DrawEngineVulkan::InitDeviceObjects() { bindings[6].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; bindings[6].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; bindings[6].binding = DRAW_BINDING_TESS_STORAGE_BUF; + bindings[7].descriptorCount = 1; + bindings[7].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[7].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + bindings[7].binding = DRAW_BINDING_TESS_STORAGE_BUF_WU; + bindings[8].descriptorCount = 1; + bindings[8].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[8].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + bindings[8].binding = DRAW_BINDING_TESS_STORAGE_BUF_WV; VkDevice device = vulkan_->GetDevice(); @@ -167,13 +176,13 @@ void DrawEngineVulkan::InitDeviceObjects() { vertexCache_ = new VulkanPushBuffer(vulkan_, VERTEX_CACHE_SIZE); - tessDataTransfer = new TessellationDataTransferVulkan(vulkan_); + tessDataTransferVulkan = new TessellationDataTransferVulkan(vulkan_); + tessDataTransfer = tessDataTransferVulkan; } DrawEngineVulkan::~DrawEngineVulkan() { FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE); FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE); - FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE); DestroyDeviceObjects(); } @@ -201,8 +210,8 @@ void DrawEngineVulkan::FrameData::Destroy(VulkanContext *vulkan) { } void DrawEngineVulkan::DestroyDeviceObjects() { - delete tessDataTransfer; - tessDataTransfer = nullptr; + delete tessDataTransferVulkan; + tessDataTransfer = tessDataTransferVulkan = nullptr; for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) { frame_[i].Destroy(vulkan_); @@ -258,7 +267,7 @@ void DrawEngineVulkan::BeginFrame() { frame->pushIndex->Begin(vulkan_); // TODO: How can we make this nicer... - ((TessellationDataTransferVulkan *)tessDataTransfer)->SetPushBuffer(frame->pushUBO); + tessDataTransferVulkan->SetPushBuffer(frame->pushUBO); DirtyAllUBOs(); @@ -470,23 +479,32 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView n++; } - // Tessellation data buffer. Make sure this is declared outside the if to avoid optimizer - // shenanigans. - VkDescriptorBufferInfo tess_buf{}; + // Tessellation data buffer. if (tess) { - VkBuffer buf; - VkDeviceSize offset; - VkDeviceSize range; - ((TessellationDataTransferVulkan *)tessDataTransfer)->GetBufferAndOffset(&buf, &offset, &range); - assert(buf); - tess_buf.buffer = buf; - tess_buf.offset = offset; - tess_buf.range = range; - tessOffset_ = offset; + const VkDescriptorBufferInfo *bufInfo = tessDataTransferVulkan->GetBufferInfo(); + // Control Points writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writes[n].pNext = nullptr; writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF; - writes[n].pBufferInfo = &tess_buf; + writes[n].pBufferInfo = &bufInfo[0]; + writes[n].descriptorCount = 1; + writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writes[n].dstSet = desc; + n++; + // Weights U + writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[n].pNext = nullptr; + writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF_WU; + writes[n].pBufferInfo = &bufInfo[1]; + writes[n].descriptorCount = 1; + writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writes[n].dstSet = desc; + n++; + // Weights V + writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[n].pNext = nullptr; + writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF_WV; + writes[n].pBufferInfo = &bufInfo[2]; writes[n].descriptorCount = 1; writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; writes[n].dstSet = desc; @@ -825,8 +843,7 @@ void DrawEngineVulkan::DoFlush() { if (useElements) { if (!ibuf) ibOffset = (uint32_t)frame->pushIndex->Push(decIndex, sizeof(uint16_t) * indexGen.VertexCount(), &ibuf); - int numInstances = tess ? numPatches : 1; - renderManager->DrawIndexed(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, vertexCount, numInstances, VK_INDEX_TYPE_UINT16); + renderManager->DrawIndexed(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, vertexCount, 1, VK_INDEX_TYPE_UINT16); } else { renderManager->Draw(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount); } @@ -994,16 +1011,7 @@ void DrawEngineVulkan::UpdateUBOs(FrameData *frame) { } } -DrawEngineVulkan::TessellationDataTransferVulkan::TessellationDataTransferVulkan(VulkanContext *vulkan) - : TessellationDataTransfer(), vulkan_(vulkan) { -} - -DrawEngineVulkan::TessellationDataTransferVulkan::~TessellationDataTransferVulkan() { -} - -void DrawEngineVulkan::TessellationDataTransferVulkan::PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) { - colStride = 4; - +void TessellationDataTransferVulkan::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) { // SSBOs that are not simply float1 or float2 need to be padded up to a float4 size. vec3 members // also need to be 16-byte aligned, hence the padding. struct TessData { @@ -1012,18 +1020,28 @@ void DrawEngineVulkan::TessellationDataTransferVulkan::PrepareBuffers(float *&po float color[4]; }; + int size = size_u * size_v; + int ssboAlignment = vulkan_->GetPhysicalDeviceProperties(vulkan_->GetCurrentPhysicalDevice()).limits.minStorageBufferOffsetAlignment; - uint8_t *data = (uint8_t *)push_->PushAligned(size * sizeof(TessData), &offset_, &buf_, ssboAlignment); - range_ = size * sizeof(TessData); + uint8_t *data = (uint8_t *)push_->PushAligned(size * sizeof(TessData), (uint32_t *)&bufInfo_[0].offset, &bufInfo_[0].buffer, ssboAlignment); + bufInfo_[0].range = size * sizeof(TessData); - pos = (float *)(data); - tex = (float *)(data + offsetof(TessData, uv)); - col = (float *)(data + offsetof(TessData, color)); - posStride = sizeof(TessData) / sizeof(float); - colStride = hasColor ? (sizeof(TessData) / sizeof(float)) : 0; - texStride = sizeof(TessData) / sizeof(float); -} + float *pos = (float *)(data); + float *tex = (float *)(data + offsetof(TessData, uv)); + float *col = (float *)(data + offsetof(TessData, color)); + int stride = sizeof(TessData) / sizeof(float); -void DrawEngineVulkan::TessellationDataTransferVulkan::SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) { - // Nothing to do here! The caller will write directly to the pushbuffer through the pointers it got through PrepareBuffers. + CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType); + + using Spline::Weight; + + // Weights U + data = (uint8_t *)push_->PushAligned(weights.size_u * sizeof(Weight), (uint32_t *)&bufInfo_[1].offset, &bufInfo_[1].buffer, ssboAlignment); + memcpy(data, weights.u, weights.size_u * sizeof(Weight)); + bufInfo_[1].range = weights.size_u * sizeof(Weight); + + // Weights V + data = (uint8_t *)push_->PushAligned(weights.size_v * sizeof(Weight), (uint32_t *)&bufInfo_[2].offset, &bufInfo_[2].buffer, ssboAlignment); + memcpy(data, weights.v, weights.size_v * sizeof(Weight)); + bufInfo_[2].range = weights.size_v * sizeof(Weight); } diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h index 494a4ccd4b..cece2c08c6 100644 --- a/GPU/Vulkan/DrawEngineVulkan.h +++ b/GPU/Vulkan/DrawEngineVulkan.h @@ -117,6 +117,20 @@ public: class VulkanRenderManager; +class TessellationDataTransferVulkan : public TessellationDataTransfer { +public: + TessellationDataTransferVulkan(VulkanContext *vulkan) : vulkan_(vulkan) {} + + void SetPushBuffer(VulkanPushBuffer *push) { push_ = push; } + // Send spline/bezier's control points and weights to vertex shader through structured shader buffer. + void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override; + const VkDescriptorBufferInfo *GetBufferInfo() { return bufInfo_; } +private: + VulkanContext *vulkan_; + VulkanPushBuffer *push_; // Updated each frame. + VkDescriptorBufferInfo bufInfo_[3]{}; +}; + // Handles transform, lighting and drawing. class DrawEngineVulkan : public DrawEngineCommon { public: @@ -278,31 +292,5 @@ private: int tessOffset_ = 0; // Hardware tessellation - class TessellationDataTransferVulkan : public TessellationDataTransfer { - public: - TessellationDataTransferVulkan(VulkanContext *vulkan); - ~TessellationDataTransferVulkan(); - - void SetPushBuffer(VulkanPushBuffer *push) { push_ = push; } - void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override; - void PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) override; - - void GetBufferAndOffset(VkBuffer *buf, VkDeviceSize *offset, VkDeviceSize *range) { - *buf = buf_; - *offset = (VkDeviceSize)offset_; - *range = (VkDeviceSize)range_; - - buf_ = 0; - offset_ = 0; - range_ = 0; - } - - private: - VulkanContext *vulkan_; - VulkanPushBuffer *push_; // Updated each frame. - - uint32_t offset_ = 0; - uint32_t range_ = 0; - VkBuffer buf_ = VK_NULL_HANDLE; - }; + TessellationDataTransferVulkan *tessDataTransferVulkan; }; diff --git a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp index 1885524639..6d2051828e 100644 --- a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp +++ b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp @@ -133,6 +133,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { bool doSpline = id.Bit(VS_BIT_SPLINE); bool hasColorTess = id.Bit(VS_BIT_HAS_COLOR_TESS); bool hasTexcoordTess = id.Bit(VS_BIT_HAS_TEXCOORD_TESS); + bool hasNormalTess = id.Bit(VS_BIT_HAS_NORMAL_TESS); bool flipNormalTess = id.Bit(VS_BIT_NORM_REVERSE_TESS); WRITE(p, "\n"); @@ -219,78 +220,90 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { WRITE(p, " vec4 pos;\n"); WRITE(p, " vec4 uv;\n"); WRITE(p, " vec4 color;\n"); - WRITE(p, "};"); + WRITE(p, "};\n"); WRITE(p, "layout (std430, set = 0, binding = 6) readonly buffer s_tess_data {\n"); - WRITE(p, " TessData data[];"); + WRITE(p, " TessData data[];\n"); WRITE(p, "} tess_data;\n"); + WRITE(p, "layout (std430) struct TessWeight {\n"); + WRITE(p, " vec4 basis;\n"); + WRITE(p, " vec4 deriv;\n"); + WRITE(p, "};\n"); + WRITE(p, "layout (std430, set = 0, binding = 7) readonly buffer s_tess_weights_u {\n"); + WRITE(p, " TessWeight data[];\n"); + WRITE(p, "} tess_weights_u;\n"); + WRITE(p, "layout (std430, set = 0, binding = 8) readonly buffer s_tess_weights_v {\n"); + WRITE(p, " TessWeight data[];\n"); + WRITE(p, "} tess_weights_v;\n"); + for (int i = 2; i <= 4; i++) { // Define 3 types vec2, vec3, vec4 - WRITE(p, "vec%d tess_sample(in vec%d points[16], in vec2 weights[4]) {\n", i, i); - WRITE(p, " vec%d pos = vec%d(0);\n", i, i); - WRITE(p, " for (int i = 0; i < 4; ++i) {\n"); - WRITE(p, " for (int j = 0; j < 4; ++j) {\n"); - WRITE(p, " float f = weights[j].x * weights[i].y;\n"); - WRITE(p, " if (f != 0)\n"); - WRITE(p, " pos = pos + f * points[i * 4 + j];\n"); - WRITE(p, " }\n"); - WRITE(p, " }\n"); + WRITE(p, "vec%d tess_sample(in vec%d points[16], mat4 weights) {\n", i, i); + WRITE(p, " vec%d pos = vec%d(0.0);\n", i, i); + for (int v = 0; v < 4; ++v) { + for (int u = 0; u < 4; ++u) { + WRITE(p, " pos += weights[%i][%i] * points[%i];\n", v, u, v * 4 + u); + } + } WRITE(p, " return pos;\n"); WRITE(p, "}\n"); } - if (doSpline) { - WRITE(p, "void spline_knot(ivec2 num_patches, ivec2 type, out vec2 knot[6], ivec2 patch_pos) {\n"); - WRITE(p, " for (int i = 0; i < 6; ++i) {\n"); - WRITE(p, " knot[i] = vec2(i + patch_pos.x - 2, i + patch_pos.y - 2);\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.x & 1) != 0) {\n"); - WRITE(p, " if (patch_pos.x <= 2)\n"); - WRITE(p, " knot[0].x = 0;\n"); - WRITE(p, " if (patch_pos.x <= 1)\n"); - WRITE(p, " knot[1].x = 0;\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.x & 2) != 0) {\n"); - WRITE(p, " if (patch_pos.x >= (num_patches.x - 2))\n"); - WRITE(p, " knot[5].x = num_patches.x;\n"); - WRITE(p, " if (patch_pos.x == (num_patches.x - 1))\n"); - WRITE(p, " knot[4].x = num_patches.x;\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.y & 1) != 0) {\n"); - WRITE(p, " if (patch_pos.y <= 2)\n"); - WRITE(p, " knot[0].y = 0;\n"); - WRITE(p, " if (patch_pos.y <= 1)\n"); - WRITE(p, " knot[1].y = 0;\n"); - WRITE(p, " }\n"); - WRITE(p, " if ((type.y & 2) != 0) {\n"); - WRITE(p, " if (patch_pos.y >= (num_patches.y - 2))\n"); - WRITE(p, " knot[5].y = num_patches.y;\n"); - WRITE(p, " if (patch_pos.y == (num_patches.y - 1))\n"); - WRITE(p, " knot[4].y = num_patches.y;\n"); - WRITE(p, " }\n"); - WRITE(p, "}\n"); - WRITE(p, "void spline_weight(vec2 t, in vec2 knot[6], out vec2 weights[4]) {\n"); - // TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly... - WRITE(p, " vec2 t0 = (t - knot[0]);\n"); - WRITE(p, " vec2 t1 = (t - knot[1]);\n"); - WRITE(p, " vec2 t2 = (t - knot[2]);\n"); - // TODO: All our knots are integers so we should be able to get rid of these divisions (How?) - WRITE(p, " vec2 f30 = t0 / (knot[3] - knot[0]);\n"); - WRITE(p, " vec2 f41 = t1 / (knot[4] - knot[1]);\n"); - WRITE(p, " vec2 f52 = t2 / (knot[5] - knot[2]);\n"); - WRITE(p, " vec2 f31 = t1 / (knot[3] - knot[1]);\n"); - WRITE(p, " vec2 f42 = t2 / (knot[4] - knot[2]);\n"); - WRITE(p, " vec2 f32 = t2 / (knot[3] - knot[2]);\n"); - WRITE(p, " vec2 a = (1 - f30)*(1 - f31);\n"); - WRITE(p, " vec2 b = (f31*f41);\n"); - WRITE(p, " vec2 c = (1 - f41)*(1 - f42);\n"); - WRITE(p, " vec2 d = (f42*f52);\n"); - WRITE(p, " weights[0] = a - (a*f32);\n"); - WRITE(p, " weights[1] = 1 - a - b + ((a + b + c - 1)*f32);\n"); - WRITE(p, " weights[2] = b + ((1 - b - c - d)*f32);\n"); - WRITE(p, " weights[3] = d*f32;\n"); - WRITE(p, "}\n"); + WRITE(p, "struct Tess {\n"); + WRITE(p, " vec3 pos;\n"); + if (doTexture) + WRITE(p, " vec2 tex;\n"); + WRITE(p, " vec4 col;\n"); + if (hasNormalTess) + WRITE(p, " vec3 nrm;\n"); + WRITE(p, "};\n"); + + WRITE(p, "void tessellate(out Tess tess) {\n"); + WRITE(p, " ivec2 point_pos = ivec2(position.z, normal.z)%s;\n", doBezier ? " * 3" : ""); + WRITE(p, " ivec2 weight_idx = ivec2(position.xy);\n"); + // Load 4x4 control points + WRITE(p, " vec3 _pos[16];\n"); + WRITE(p, " vec2 _tex[16];\n"); + WRITE(p, " vec4 _col[16];\n"); + WRITE(p, " int index;\n"); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + WRITE(p, " index = (%i + point_pos.y) * int(base.spline_counts) + (%i + point_pos.x);\n", i, j); + WRITE(p, " _pos[%i] = tess_data.data[index].pos.xyz;\n", i * 4 + j); + if (doTexture && hasTexcoordTess) + WRITE(p, " _tex[%i] = tess_data.data[index].uv.xy;\n", i * 4 + j); + if (hasColorTess) + WRITE(p, " _col[%i] = tess_data.data[index].color;\n", i * 4 + j); + } } + + // Basis polynomials as weight coefficients + WRITE(p, " vec4 basis_u = tess_weights_u.data[weight_idx.x].basis;\n"); + WRITE(p, " vec4 basis_v = tess_weights_v.data[weight_idx.y].basis;\n"); + WRITE(p, " mat4 basis = outerProduct(basis_u, basis_v);\n"); + + // Tessellate + WRITE(p, " tess.pos = tess_sample(_pos, basis);\n"); + if (doTexture) { + if (hasTexcoordTess) + WRITE(p, " tess.tex = tess_sample(_tex, basis);\n"); + else + WRITE(p, " tess.tex = normal.xy;\n"); + } + if (hasColorTess) + WRITE(p, " tess.col = tess_sample(_col, basis);\n"); + else + WRITE(p, " tess.col = base.matambientalpha;\n"); + if (hasNormalTess) { + // Derivatives as weight coefficients + WRITE(p, " vec4 deriv_u = tess_weights_u.data[weight_idx.x].deriv;\n"); + WRITE(p, " vec4 deriv_v = tess_weights_v.data[weight_idx.y].deriv;\n"); + + WRITE(p, " vec3 du = tess_sample(_pos, outerProduct(deriv_u, basis_v));\n"); + WRITE(p, " vec3 dv = tess_sample(_pos, outerProduct(basis_u, deriv_v));\n"); + WRITE(p, " tess.nrm = normalize(cross(du, dv));\n"); + } + WRITE(p, "}\n"); } WRITE(p, "void main() {\n"); @@ -330,103 +343,13 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { // Step 1: World Transform / Skinning if (!enableBones) { if (doBezier || doSpline) { - WRITE(p, " vec3 _pos[16];\n"); - WRITE(p, " vec2 _tex[16];\n"); - WRITE(p, " vec4 _col[16];\n"); - WRITE(p, " int spline_count_u = int(base.spline_counts & 0xff);\n"); - WRITE(p, " int spline_count_v = int((base.spline_counts >> 8) & 0xff);\n"); - WRITE(p, " int num_patches_u = %s;\n", doBezier ? "(spline_count_u - 1) / 3" : "spline_count_u - 3"); - WRITE(p, " int u = int(mod(gl_InstanceIndex, num_patches_u));\n"); - WRITE(p, " int v = gl_InstanceIndex / num_patches_u;\n"); - WRITE(p, " ivec2 patch_pos = ivec2(u, v);\n"); - WRITE(p, " for (int i = 0; i < 4; i++) {\n"); - WRITE(p, " for (int j = 0; j < 4; j++) {\n"); - WRITE(p, " int idx = (i + v%s) * spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : ""); - WRITE(p, " _pos[i * 4 + j] = tess_data.data[idx].pos.xyz;\n"); - if (doTexture && hasTexcoord && hasTexcoordTess) - WRITE(p, " _tex[i * 4 + j] = tess_data.data[idx].uv.xy;\n"); - if (hasColor && hasColorTess) - WRITE(p, " _col[i * 4 + j] = tess_data.data[idx].color;\n"); - WRITE(p, " }\n"); - WRITE(p, " }\n"); - WRITE(p, " vec2 tess_pos = position.xy;\n"); - WRITE(p, " vec2 weights[4];\n"); - if (doBezier) { - // Bernstein 3D - WRITE(p, " weights[0] = (1 - tess_pos) * (1 - tess_pos) * (1 - tess_pos);\n"); - WRITE(p, " weights[1] = 3 * tess_pos * (1 - tess_pos) * (1 - tess_pos);\n"); - WRITE(p, " weights[2] = 3 * tess_pos * tess_pos * (1 - tess_pos);\n"); - WRITE(p, " weights[3] = tess_pos * tess_pos * tess_pos;\n"); - } else { // Spline - WRITE(p, " ivec2 spline_num_patches = ivec2(spline_count_u - 3, spline_count_v - 3);\n"); - WRITE(p, " int spline_type_u = int((base.spline_counts >> 16) & 0xff);\n"); - WRITE(p, " int spline_type_v = int((base.spline_counts >> 24) & 0xff);\n"); - WRITE(p, " ivec2 spline_type = ivec2(spline_type_u, spline_type_v);\n"); - WRITE(p, " vec2 knots[6];\n"); - WRITE(p, " spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n"); - WRITE(p, " spline_weight(tess_pos + patch_pos, knots, weights);\n"); - } - WRITE(p, " vec3 pos = tess_sample(_pos, weights);\n"); - if (doTexture && hasTexcoord) { - if (hasTexcoordTess) - WRITE(p, " vec2 tex = tess_sample(_tex, weights);\n"); - else - WRITE(p, " vec2 tex = tess_pos + patch_pos;\n"); - } - if (hasColor) { - if (hasColorTess) - WRITE(p, " vec4 col = tess_sample(_col, weights);\n"); - else - WRITE(p, " vec4 col = tess_data.data[0].color;\n"); - } - if (hasNormal) { - // Curved surface is probably always need to compute normal(not sampling from control points) - if (doBezier) { - // Bernstein derivative - WRITE(p, " vec2 bernderiv[4];\n"); - WRITE(p, " bernderiv[0] = -3 * (tess_pos - 1) * (tess_pos - 1); \n"); - WRITE(p, " bernderiv[1] = 9 * tess_pos * tess_pos - 12 * tess_pos + 3; \n"); - WRITE(p, " bernderiv[2] = 3 * (2 - 3 * tess_pos) * tess_pos; \n"); - WRITE(p, " bernderiv[3] = 3 * tess_pos * tess_pos; \n"); + // Hardware tessellation + WRITE(p, " Tess tess;\n"); + WRITE(p, " tessellate(tess);\n"); - WRITE(p, " vec2 bernderiv_u[4];\n"); - WRITE(p, " vec2 bernderiv_v[4];\n"); - WRITE(p, " for (int i = 0; i < 4; i++) {\n"); - WRITE(p, " bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n"); - WRITE(p, " bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n"); - WRITE(p, " }\n"); - - WRITE(p, " vec3 du = tess_sample(_pos, bernderiv_u);\n"); - WRITE(p, " vec3 dv = tess_sample(_pos, bernderiv_v);\n"); - } else { // Spline - WRITE(p, " vec2 tess_next_u = vec2(normal.x, 0);\n"); - WRITE(p, " vec2 tess_next_v = vec2(0, normal.y);\n"); - // Right - WRITE(p, " vec2 tess_pos_r = tess_pos + tess_next_u;\n"); - WRITE(p, " spline_weight(tess_pos_r + patch_pos, knots, weights);\n"); - WRITE(p, " vec3 pos_r = tess_sample(_pos, weights);\n"); - // Left - WRITE(p, " vec2 tess_pos_l = tess_pos - tess_next_u;\n"); - WRITE(p, " spline_weight(tess_pos_l + patch_pos, knots, weights);\n"); - WRITE(p, " vec3 pos_l = tess_sample(_pos, weights);\n"); - // Down - WRITE(p, " vec2 tess_pos_d = tess_pos + tess_next_v;\n"); - WRITE(p, " spline_weight(tess_pos_d + patch_pos, knots, weights);\n"); - WRITE(p, " vec3 pos_d = tess_sample(_pos, weights);\n"); - // Up - WRITE(p, " vec2 tess_pos_u = tess_pos - tess_next_v;\n"); - WRITE(p, " spline_weight(tess_pos_u + patch_pos, knots, weights);\n"); - WRITE(p, " vec3 pos_u = tess_sample(_pos, weights);\n"); - - WRITE(p, " vec3 du = pos_r - pos_l;\n"); - WRITE(p, " vec3 dv = pos_d - pos_u;\n"); - } - WRITE(p, " vec3 nrm = cross(du, dv);\n"); - WRITE(p, " nrm = normalize(nrm);\n"); - } - WRITE(p, " vec3 worldpos = vec4(pos.xyz, 1.0) * base.world_mtx;\n"); - if (hasNormal) { - WRITE(p, " mediump vec3 worldnormal = normalize(vec4(%snrm, 0.0) * base.world_mtx);\n", flipNormalTess ? "-" : ""); + WRITE(p, " vec3 worldpos = vec4(tess.pos.xyz, 1.0) * base.world_mtx;\n"); + if (hasNormalTess) { + WRITE(p, " mediump vec3 worldnormal = normalize(vec4(%stess.nrm, 0.0) * base.world_mtx);\n", flipNormalTess ? "-" : ""); } else { WRITE(p, " mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n"); } @@ -483,9 +406,10 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { const char *diffuseStr = ((matUpdate & 2) && hasColor) ? "color0.rgb" : "light.matdiffuse"; const char *specularStr = ((matUpdate & 4) && hasColor) ? "color0.rgb" : "light.matspecular.rgb"; if (doBezier || doSpline) { - ambientStr = (matUpdate & 1) && hasColor ? "col" : "base.matambientalpha"; - diffuseStr = (matUpdate & 2) && hasColor ? "col.rgb" : "light.matdiffuse"; - specularStr = (matUpdate & 4) && hasColor ? "col.rgb" : "light.matspecular.rgb"; + // TODO: Probably, should use hasColorTess but FF4 has a problem with drawing the background. + ambientStr = (matUpdate & 1) && hasColor ? "tess.col" : "base.matambientalpha"; + diffuseStr = (matUpdate & 2) && hasColor ? "tess.col.rgb" : "light.matdiffuse"; + specularStr = (matUpdate & 4) && hasColor ? "tess.col.rgb" : "light.matspecular.rgb"; } bool diffuseIsZero = true; @@ -606,7 +530,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { // Lighting doesn't affect color. if (hasColor) { if (doBezier || doSpline) - WRITE(p, " v_color0 = col;\n"); + WRITE(p, " v_color0 = tess.col;\n"); else WRITE(p, " v_color0 = color0;\n"); } else { @@ -627,7 +551,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { if (scaleUV) { if (hasTexcoord) { if (doBezier || doSpline) - WRITE(p, " v_texcoord = vec3(tex.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n"); + WRITE(p, " v_texcoord = vec3(tess.tex.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n"); else WRITE(p, " v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy, 0.0);\n"); } else { @@ -635,10 +559,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { } } else { if (hasTexcoord) { - if (doBezier || doSpline) - WRITE(p, " v_texcoord = vec3(tex.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n"); - else - WRITE(p, " v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n"); + WRITE(p, " v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n"); } else { WRITE(p, " v_texcoord = vec3(base.uvscaleoffset.zw, 0.0);\n"); } diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp index 43bb8dd58a..5dc93d0436 100644 --- a/UI/GameSettingsScreen.cpp +++ b/UI/GameSettingsScreen.cpp @@ -80,7 +80,7 @@ bool GameSettingsScreen::UseVerticalLayout() const { // This needs before run CheckGPUFeatures() // TODO: Remove this if fix the issue -bool CheckSupportInstancedTessellationGLES() { +bool CheckSupportShaderTessellationGLES() { #if PPSSPP_PLATFORM(UWP) return true; #else @@ -88,21 +88,17 @@ bool CheckSupportInstancedTessellationGLES() { int maxVertexTextureImageUnits = gl_extensions.maxVertexTextureUnits; bool vertexTexture = maxVertexTextureImageUnits >= 3; // At least 3 for hardware tessellation - bool canUseInstanceID = gl_extensions.EXT_draw_instanced || gl_extensions.ARB_draw_instanced; - bool canDefInstanceID = gl_extensions.IsGLES || gl_extensions.EXT_gpu_shader4 || gl_extensions.VersionGEThan(3, 1); - bool instanceRendering = gl_extensions.GLES3 || (canUseInstanceID && canDefInstanceID); - bool textureFloat = gl_extensions.ARB_texture_float || gl_extensions.OES_texture_float; bool hasTexelFetch = gl_extensions.GLES3 || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 3, 0)) || gl_extensions.EXT_gpu_shader4; - return instanceRendering && vertexTexture && textureFloat && hasTexelFetch; + return vertexTexture && textureFloat && hasTexelFetch; #endif } bool DoesBackendSupportHWTess() { switch (GetGPUBackend()) { case GPUBackend::OPENGL: - return CheckSupportInstancedTessellationGLES(); + return CheckSupportShaderTessellationGLES(); case GPUBackend::VULKAN: case GPUBackend::DIRECT3D11: return true; @@ -392,11 +388,10 @@ void GameSettingsScreen::CreateViews() { } return UI::EVENT_CONTINUE; }); - beziersChoice->SetDisabledPtr(&g_Config.bHardwareTessellation); CheckBox *tessellationHW = graphicsSettings->Add(new CheckBox(&g_Config.bHardwareTessellation, gr->T("Hardware Tessellation"))); tessellationHW->OnClick.Add([=](EventParams &e) { - settingInfo_->Show(gr->T("HardwareTessellation Tip", "Uses hardware to make curves, always uses a fixed quality"), e.v); + settingInfo_->Show(gr->T("HardwareTessellation Tip", "Uses hardware to make curves"), e.v); return UI::EVENT_CONTINUE; }); tessHWEnable_ = DoesBackendSupportHWTess() && !g_Config.bSoftwareRendering && g_Config.bHardwareTransform; diff --git a/Windows/GEDebugger/VertexPreview.cpp b/Windows/GEDebugger/VertexPreview.cpp index 8fa7daa050..79183d236f 100644 --- a/Windows/GEDebugger/VertexPreview.cpp +++ b/Windows/GEDebugger/VertexPreview.cpp @@ -26,6 +26,7 @@ #include "GPU/Common/GPUDebugInterface.h" #include "GPU/Common/SplineCommon.h" #include "GPU/GPUState.h" +#include "Common/MemoryUtil.h" static const char preview_fs[] = "#ifdef GL_ES\n" @@ -164,96 +165,104 @@ u32 CGEDebugger::PrimPreviewOp() { } static void ExpandBezier(int &count, int op, const std::vector &simpleVerts, const std::vector &indices, std::vector &generatedVerts, std::vector &generatedInds) { - int count_u = (op & 0x00FF) >> 0; - int count_v = (op & 0xFF00) >> 8; + using namespace Spline; - int tess_u = gstate.getPatchDivisionU(); - int tess_v = gstate.getPatchDivisionV(); - if (tess_u < 1) { - tess_u = 1; - } - if (tess_v < 1) { - tess_v = 1; - } + int count_u = (op >> 0) & 0xFF; + int count_v = (op >> 8) & 0xFF; + // Real hardware seems to draw nothing when given < 4 either U or V. + if (count_u < 4 || count_v < 4) + return; - // Bezier patches share less control points than spline patches. Otherwise they are pretty much the same (except bezier don't support the open/close thing) - int num_patches_u = (count_u - 1) / 3; - int num_patches_v = (count_v - 1) / 3; - int total_patches = num_patches_u * num_patches_v; - std::vector patches; - patches.resize(total_patches); - for (int patch_u = 0; patch_u < num_patches_u; patch_u++) { - for (int patch_v = 0; patch_v < num_patches_v; patch_v++) { - BezierPatch &patch = patches[patch_u + patch_v * num_patches_u]; - for (int point = 0; point < 16; ++point) { - int idx = (patch_u * 3 + point % 4) + (patch_v * 3 + point / 4) * count_u; - patch.points[point] = &simpleVerts[0] + (!indices.empty() ? indices[idx] : idx); - } - patch.u_index = patch_u * 3; - patch.v_index = patch_v * 3; - patch.index = patch_v * num_patches_u + patch_u; - patch.primType = gstate.getPatchPrimitiveType(); - patch.computeNormals = false; - patch.patchFacing = false; - } - } + BezierSurface surface; + surface.num_points_u = count_u; + surface.num_points_v = count_v; + surface.tess_u = gstate.getPatchDivisionU(); + surface.tess_v = gstate.getPatchDivisionV(); + surface.num_patches_u = (count_u - 1) / 3; + surface.num_patches_v = (count_v - 1) / 3; + surface.primType = gstate.getPatchPrimitiveType(); + surface.patchFacing = false; - generatedVerts.resize((tess_u + 1) * (tess_v + 1) * total_patches); - generatedInds.resize(tess_u * tess_v * 6 * total_patches); + int num_points = count_u * count_v; + // Make an array of pointers to the control points, to get rid of indices. + std::vector points(num_points); + for (int idx = 0; idx < num_points; idx++) + points[idx] = simpleVerts.data() + (!indices.empty() ? indices[idx] : idx); - count = 0; - u8 *dest = (u8 *)&generatedVerts[0]; - u16 *inds = &generatedInds[0]; - for (int patch_idx = 0; patch_idx < total_patches; ++patch_idx) { - const BezierPatch &patch = patches[patch_idx]; - TessellateBezierPatch(dest, inds, count, tess_u, tess_v, patch, gstate.vertType); - } + int total_patches = surface.num_patches_u * surface.num_patches_v; + generatedVerts.resize((surface.tess_u + 1) * (surface.tess_v + 1) * total_patches); + generatedInds.resize(surface.tess_u * surface.tess_v * 6 * total_patches); + + OutputBuffers output; + output.vertices = generatedVerts.data(); + output.indices = generatedInds.data(); + output.count = 0; + + ControlPoints cpoints; + cpoints.pos = (Vec3f *)AllocateAlignedMemory(sizeof(Vec3f) * num_points, 16); + cpoints.tex = (Vec2f *)AllocateAlignedMemory(sizeof(Vec2f) * num_points, 16); + cpoints.col = (Vec4f *)AllocateAlignedMemory(sizeof(Vec4f) * num_points, 16); + cpoints.Convert(points.data(), num_points); + + surface.Init(generatedVerts.size()); + SoftwareTessellation(output, surface, gstate.vertType, cpoints); + count = output.count; + + FreeAlignedMemory(cpoints.pos); + FreeAlignedMemory(cpoints.tex); + FreeAlignedMemory(cpoints.col); } static void ExpandSpline(int &count, int op, const std::vector &simpleVerts, const std::vector &indices, std::vector &generatedVerts, std::vector &generatedInds) { - SplinePatchLocal patch; - patch.computeNormals = false; - patch.primType = gstate.getPatchPrimitiveType(); - patch.patchFacing = false; - - patch.count_u = (op & 0x00FF) >> 0; - patch.count_v = (op & 0xFF00) >> 8; - patch.type_u = (op >> 16) & 0x3; - patch.type_v = (op >> 18) & 0x3; - - patch.tess_u = gstate.getPatchDivisionU(); - patch.tess_v = gstate.getPatchDivisionV(); - if (patch.tess_u < 1) { - patch.tess_u = 1; - } - if (patch.tess_v < 1) { - patch.tess_v = 1; - } + using namespace Spline; + int count_u = (op >> 0) & 0xFF; + int count_v = (op >> 8) & 0xFF; // Real hardware seems to draw nothing when given < 4 either U or V. - if (patch.count_u < 4 || patch.count_v < 4) { + if (count_u < 4 || count_v < 4) return; - } - std::vector points; - points.resize(patch.count_u * patch.count_v); + SplineSurface surface; + surface.num_points_u = count_u; + surface.num_points_v = count_v; + surface.tess_u = gstate.getPatchDivisionU(); + surface.tess_v = gstate.getPatchDivisionV(); + surface.type_u = (op >> 16) & 0x3; + surface.type_v = (op >> 18) & 0x3; + surface.num_patches_u = count_u - 3; + surface.num_patches_v = count_v - 3; + surface.primType = gstate.getPatchPrimitiveType(); + surface.patchFacing = false; + int num_points = count_u * count_v; // Make an array of pointers to the control points, to get rid of indices. - for (int idx = 0; idx < patch.count_u * patch.count_v; idx++) { - points[idx] = &simpleVerts[0] + (!indices.empty() ? indices[idx] : idx); - } - patch.points = &points[0]; + std::vector points(num_points); + for (int idx = 0; idx < num_points; idx++) + points[idx] = simpleVerts.data() + (!indices.empty() ? indices[idx] : idx); - int patch_div_s = (patch.count_u - 3) * patch.tess_u; - int patch_div_t = (patch.count_v - 3) * patch.tess_v; - int maxVertexCount = (patch_div_s + 1) * (patch_div_t + 1); - - generatedVerts.resize(maxVertexCount); + int patch_div_s = surface.num_patches_u * surface.tess_u; + int patch_div_t = surface.num_patches_v * surface.tess_v; + generatedVerts.resize((patch_div_s + 1) * (patch_div_t + 1)); generatedInds.resize(patch_div_s * patch_div_t * 6); - count = 0; - u8 *dest = (u8 *)&generatedVerts[0]; - TessellateSplinePatch(dest, &generatedInds[0], count, patch, gstate.vertType, maxVertexCount); + OutputBuffers output; + output.vertices = generatedVerts.data(); + output.indices = generatedInds.data(); + output.count = 0; + + ControlPoints cpoints; + cpoints.pos = (Vec3f *)AllocateAlignedMemory(sizeof(Vec3f) * num_points, 16); + cpoints.tex = (Vec2f *)AllocateAlignedMemory(sizeof(Vec2f) * num_points, 16); + cpoints.col = (Vec4f *)AllocateAlignedMemory(sizeof(Vec4f) * num_points, 16); + cpoints.Convert(points.data(), num_points); + + surface.Init(generatedVerts.size()); + SoftwareTessellation(output, surface, gstate.vertType, cpoints); + count = output.count; + + FreeAlignedMemory(cpoints.pos); + FreeAlignedMemory(cpoints.tex); + FreeAlignedMemory(cpoints.col); } void CGEDebugger::UpdatePrimPreview(u32 op, int which) { diff --git a/ext/native/thin3d/GLQueueRunner.cpp b/ext/native/thin3d/GLQueueRunner.cpp index 36696bacd7..49031eafca 100644 --- a/ext/native/thin3d/GLQueueRunner.cpp +++ b/ext/native/thin3d/GLQueueRunner.cpp @@ -306,14 +306,14 @@ void GLQueueRunner::RunInitSteps(const std::vector &steps, bool ski glBindTexture(tex->target, tex->texture); boundTexture = tex->texture; } - if (!step.texture_image.data) + if (!step.texture_image.data && step.texture_image.allocType != GLRAllocType::NONE) Crash(); // For things to show in RenderDoc, need to split into glTexImage2D(..., nullptr) and glTexSubImage. glTexImage2D(tex->target, step.texture_image.level, step.texture_image.internalFormat, step.texture_image.width, step.texture_image.height, 0, step.texture_image.format, step.texture_image.type, step.texture_image.data); allocatedTextures = true; if (step.texture_image.allocType == GLRAllocType::ALIGNED) { FreeAlignedMemory(step.texture_image.data); - } else { + } else if (step.texture_image.allocType == GLRAllocType::NEW) { delete[] step.texture_image.data; } CHECK_GL_ERROR_IF_DEBUG(); @@ -490,7 +490,19 @@ void GLQueueRunner::RunSteps(const std::vector &steps, bool skipGLCal const GLRStep &step = *steps[i]; switch (step.stepType) { case GLRStepType::RENDER: - // TODO: With #11425 there'll be a case where we should really free spline data here. + for (const auto &c : step.commands) { + switch (c.cmd) { + case GLRRenderCommand::TEXTURE_SUBIMAGE: + if (c.texture_subimage.data) { + if (c.texture_subimage.allocType == GLRAllocType::ALIGNED) { + FreeAlignedMemory(c.texture_subimage.data); + } else if (c.texture_subimage.allocType == GLRAllocType::NEW) { + delete[] c.texture_subimage.data; + } + } + break; + } + } break; } delete steps[i]; @@ -1024,6 +1036,22 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) { } break; } + case GLRRenderCommand::TEXTURE_SUBIMAGE: + { + GLRTexture *tex = c.texture_subimage.texture; + // TODO: Need bind? + if (!c.texture_subimage.data) + Crash(); + // For things to show in RenderDoc, need to split into glTexImage2D(..., nullptr) and glTexSubImage. + glTexSubImage2D(tex->target, c.texture_subimage.level, c.texture_subimage.x, c.texture_subimage.y, c.texture_subimage.width, c.texture_subimage.height, c.texture_subimage.format, c.texture_subimage.type, c.texture_subimage.data); + if (c.texture_subimage.allocType == GLRAllocType::ALIGNED) { + FreeAlignedMemory(c.texture_subimage.data); + } else if (c.texture_subimage.allocType == GLRAllocType::NEW) { + delete[] c.texture_subimage.data; + } + CHECK_GL_ERROR_IF_DEBUG(); + break; + } case GLRRenderCommand::RASTER: if (c.raster.cullEnable) { if (!cullEnabled) { diff --git a/ext/native/thin3d/GLQueueRunner.h b/ext/native/thin3d/GLQueueRunner.h index 5f1432e52d..b7a1586d59 100644 --- a/ext/native/thin3d/GLQueueRunner.h +++ b/ext/native/thin3d/GLQueueRunner.h @@ -20,6 +20,7 @@ struct GLOffset2D { }; enum class GLRAllocType { + NONE, NEW, ALIGNED, }; @@ -57,6 +58,7 @@ enum class GLRRenderCommand : uint8_t { DRAW, DRAW_INDEXED, PUSH_CONSTANTS, + TEXTURE_SUBIMAGE, }; // TODO: Bloated since the biggest struct decides the size. Will need something more efficient (separate structs with shared @@ -138,6 +140,18 @@ struct GLRRenderData { int slot; GLRTexture *texture; } texture; + struct { + GLRTexture *texture; + GLenum format; + GLenum type; + int level; + int x; + int y; + int width; + int height; + GLRAllocType allocType; + uint8_t *data; // owned, delete[]-d + } texture_subimage; struct { int slot; GLRFramebuffer *framebuffer; diff --git a/ext/native/thin3d/GLRenderManager.h b/ext/native/thin3d/GLRenderManager.h index 4c3c32b13d..9cc4ca4f7d 100644 --- a/ext/native/thin3d/GLRenderManager.h +++ b/ext/native/thin3d/GLRenderManager.h @@ -530,6 +530,22 @@ public: initSteps_.push_back(step); } + void TextureSubImage(GLRTexture *texture, int level, int x, int y, int width, int height, GLenum format, GLenum type, uint8_t *data, GLRAllocType allocType = GLRAllocType::NEW) { + _dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER); + GLRRenderData _data{ GLRRenderCommand::TEXTURE_SUBIMAGE }; + _data.texture_subimage.texture = texture; + _data.texture_subimage.data = data; + _data.texture_subimage.format = format; + _data.texture_subimage.type = type; + _data.texture_subimage.level = level; + _data.texture_subimage.x = x; + _data.texture_subimage.y = y; + _data.texture_subimage.width = width; + _data.texture_subimage.height = height; + _data.texture_subimage.allocType = allocType; + curRenderStep_->commands.push_back(_data); + } + void FinalizeTexture(GLRTexture *texture, int maxLevels, bool genMips) { GLRInitStep step{ GLRInitStepType::TEXTURE_FINALIZE }; step.texture_finalize.texture = texture;