diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 95d6d5ad2e..60c250533a 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -34,7 +34,6 @@ enum {
 };
 
 DrawEngineCommon::DrawEngineCommon() : decoderMap_(16) {
-	quadIndices_ = new u16[6 * QUAD_INDICES_MAX];
 	decJitCache_ = new VertexDecoderJitCache();
 	transformed = (TransformedVertex *)AllocateMemoryPages(TRANSFORMED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	transformedExpanded = (TransformedVertex *)AllocateMemoryPages(3 * TRANSFORMED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
@@ -43,11 +42,11 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(16) {
 DrawEngineCommon::~DrawEngineCommon() {
 	FreeMemoryPages(transformed, TRANSFORMED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(transformedExpanded, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE);
-	delete[] quadIndices_;
 	delete decJitCache_;
 	decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) {
 		delete decoder;
 	});
+	ClearSplineBezierWeights();
 }
 
 VertexDecoder *DrawEngineCommon::GetVertexDecoder(u32 vtype) {
@@ -739,3 +738,25 @@ void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
 		}
 	}
 }
+
+// Copies the control points' attributes into separate float arrays, stepping each destination by its stride (counted in floats).
+void TessellationDataTransfer::CopyControlPoints(float *pos, float *tex, float *col, int posStride, int texStride, int colStride, const SimpleVertex *const *points, int size, u32 vertType) {
+	const bool copyTex = (vertType & GE_VTYPE_TC_MASK) != 0;
+	const bool copyCol = (vertType & GE_VTYPE_COL_MASK) != 0;
+
+	// Positions (three floats each) are written unconditionally.
+	for (int n = 0; n < size; ++n, pos += posStride) {
+		memcpy(pos, points[n]->pos.AsArray(), sizeof(float) * 3);
+	}
+
+	// Texture coordinates (two floats each), only if the vertex type has them.
+	for (int n = 0; copyTex && n < size; ++n, tex += texStride) {
+		memcpy(tex, points[n]->uv, sizeof(float) * 2);
+	}
+
+	// Colors, unpacked from color_32 to four floats, only if the vertex type has them.
+	for (int n = 0; copyCol && n < size; ++n, col += colStride) {
+		Vec4f rgba = Vec4f::FromRGBA(points[n]->color_32);
+		memcpy(col, rgba.AsArray(), sizeof(float) * 4);
+	}
+}
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 03e9e50cf3..353b3dd7d0 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -34,7 +34,6 @@ enum {
 	VERTEX_BUFFER_MAX = 65536,
 	DECODED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * 64,
 	DECODED_INDEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * 16,
-	SPLINE_BUFFER_SIZE = VERTEX_BUFFER_MAX * 26, // At least, this buffer needs greater than 1679616 bytes for Mist Dragon morphing in FF4CC.
 };
 
 // Avoiding the full include of TextureDecoder.h.
@@ -50,6 +49,15 @@ inline uint32_t GetVertTypeID(uint32_t vertType, int uvGenMode) {
 	return (vertType & 0xFFFFFF) | (uvGenMode << 24);
 }
 
+struct SimpleVertex;
+namespace Spline { struct Weight2D; }
+
+class TessellationDataTransfer { // Abstract interface; implementations must provide SendDataToShader().
+public:
+	virtual ~TessellationDataTransfer() {} // Held via a base pointer (DrawEngineCommon::tessDataTransfer); keeps delete-through-base well-defined, as the nested class this replaces did.
+	void CopyControlPoints(float *pos, float *tex, float *col, int posStride, int texStride, int colStride, const SimpleVertex *const *points, int size, u32 vertType); // Shared helper: splits control points into strided float arrays.
+	virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0;
+};
 class DrawEngineCommon {
 public:
 	DrawEngineCommon();
@@ -75,6 +83,7 @@ public:
 	void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead);
 	void SubmitSpline(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, int type_u, int type_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead);
 	void SubmitBezier(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead);
+	void ClearSplineBezierWeights();
 
 	std::vector<std::string> DebugGetVertexLoaderIDs();
 	std::string DebugGetVertexLoaderString(std::string id, DebugShaderStringType stringType);
@@ -160,31 +169,10 @@ protected:
 	int decodedVerts_ = 0;
 	GEPrimitiveType prevPrim_ = GE_PRIM_INVALID;
 
-	// Fixed index buffer for easy quad generation from spline/bezier
-	u16 *quadIndices_ = nullptr;
-
 	// Shader blending state
 	bool fboTexNeedBind_ = false;
 	bool fboTexBound_ = false;
 
 	// Hardware tessellation
-	int numPatches;
-	class TessellationDataTransfer {
-	protected:
-		// TODO: These aren't used by all backends.
-		int prevSize;
-		int prevSizeTex;
-		int prevSizeCol;
-	public:
-		virtual ~TessellationDataTransfer() {}
-		// Send spline/bezier's control points to vertex shader through floating point texture.
-		virtual void PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) {
-			posStride = 4;
-			texStride = 4;
-			colStride = 4;
-		}
-		virtual void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) = 0;
-		virtual void EndFrame() {}
-	};
 	TessellationDataTransfer *tessDataTransfer;
 };
diff --git a/GPU/Common/ShaderId.cpp b/GPU/Common/ShaderId.cpp
index 83bbd08e3e..e7ed0b4e87 100644
--- a/GPU/Common/ShaderId.cpp
+++ b/GPU/Common/ShaderId.cpp
@@ -53,6 +53,7 @@ std::string VertexShaderDesc(const ShaderID &id) {
 	if (id.Bit(VS_BIT_SPLINE)) desc << "Spline ";
 	if (id.Bit(VS_BIT_HAS_COLOR_TESS)) desc << "TessC ";
 	if (id.Bit(VS_BIT_HAS_TEXCOORD_TESS)) desc << "TessT ";
+	if (id.Bit(VS_BIT_HAS_NORMAL_TESS)) desc << "TessN ";
 	if (id.Bit(VS_BIT_NORM_REVERSE_TESS)) desc << "TessRevN ";
 
 	return desc.str();
@@ -73,6 +74,7 @@ void ComputeVertexShaderID(ShaderID *id_out, u32 vertType, bool useHWTransform)
 	bool doSpline = gstate_c.spline;
 	bool hasColorTess = (gstate.vertType & GE_VTYPE_COL_MASK) != 0 && (doBezier || doSpline);
 	bool hasTexcoordTess = (gstate.vertType & GE_VTYPE_TC_MASK) != 0 && (doBezier || doSpline);
+	bool hasNormalTess = (gstate.vertType & GE_VTYPE_NRM_MASK) != 0 && (doBezier || doSpline);
 
 	bool enableFog = gstate.isFogEnabled() && !isModeThrough && !gstate.isModeClear();
 	bool lmode = gstate.isUsingSecondaryColor() && gstate.isLightingEnabled() && !isModeThrough;
@@ -139,6 +141,7 @@ void ComputeVertexShaderID(ShaderID *id_out, u32 vertType, bool useHWTransform)
 			id.SetBit(VS_BIT_SPLINE, doSpline);
 			id.SetBit(VS_BIT_HAS_COLOR_TESS, hasColorTess);
 			id.SetBit(VS_BIT_HAS_TEXCOORD_TESS, hasTexcoordTess);
+			id.SetBit(VS_BIT_HAS_NORMAL_TESS, hasNormalTess);
 			id.SetBit(VS_BIT_NORM_REVERSE_TESS, gstate.isPatchNormalsReversed());
 		}
 	}
diff --git a/GPU/Common/ShaderId.h b/GPU/Common/ShaderId.h
index 740e321fe8..573da9c3f3 100644
--- a/GPU/Common/ShaderId.h
+++ b/GPU/Common/ShaderId.h
@@ -24,7 +24,7 @@ enum {
 	VS_BIT_HAS_COLOR_TESS = 12,  // 1 bit
 	VS_BIT_HAS_TEXCOORD_TESS = 13,  // 1 bit
 	VS_BIT_NORM_REVERSE_TESS = 14, // 1 bit
-	// 15 is free.
+	VS_BIT_HAS_NORMAL_TESS = 15, // 1 bit
 	VS_BIT_UVGEN_MODE = 16,
 	VS_BIT_UVPROJ_MODE = 18,  // 2, can overlap with LS0
 	VS_BIT_LS0 = 18,  // 2
diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp
index e0f6719502..26bdbd7c17 100644
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@@ -240,7 +240,7 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
 	}
 
 	if (dirtyUniforms & DIRTY_BEZIERSPLINE) {
-		ub->spline_counts = BytesToUint32(gstate_c.spline_count_u, gstate_c.spline_count_v, gstate_c.spline_type_u, gstate_c.spline_type_v);
+		ub->spline_counts = gstate_c.spline_num_points_u;
 	}
 
 	if (dirtyUniforms & DIRTY_DEPAL) {
diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp
index 7af7926dde..77427b99ee 100644
--- a/GPU/Common/SplineCommon.cpp
+++ b/GPU/Common/SplineCommon.cpp
@@ -21,8 +21,6 @@
 #include "profiler/profiler.h"
 
 #include "Common/CPUDetect.h"
-#include "Common/MemoryUtil.h"
-#include "Core/Config.h"
 
 #include "GPU/Common/GPUStateUtils.h"
 #include "GPU/Common/SplineCommon.h"
@@ -30,67 +28,34 @@
 #include "GPU/ge_constants.h"
 #include "GPU/GPUState.h"  // only needed for UVScale stuff
 
-#if defined(_M_SSE)
-#include <emmintrin.h>
-
-inline __m128 SSECrossProduct(__m128 a, __m128 b)
-{
-	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));
-	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
-	return _mm_sub_ps(left, right);
+bool CanUseHardwareTessellation(GEPatchPrimType prim) {
+	// Requires bHardwareTessellation on, bSoftwareRendering off, and hardware transform for the patch's equivalent primitive.
+	if (!g_Config.bHardwareTessellation || g_Config.bSoftwareRendering)
+		return false;
+	return CanUseHardwareTransform(PatchPrimToPrim(prim));
 }
 
-inline __m128 SSENormalizeMultiplierSSE2(__m128 v)
-{
-	const __m128 sq = _mm_mul_ps(v, v);
-	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
-	const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
-	const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));
+class SimpleBufferManager {
+private:
+	u8 *buf_;
+	size_t totalSize, maxSize_;
+public:
+	SimpleBufferManager(u8 *buf, size_t maxSize)
+		: buf_(buf), totalSize(0), maxSize_(maxSize) {}
 
-	const __m128 rt = _mm_rsqrt_ss(res);
-	return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
-}
+	u8 *Allocate(size_t size) {
+		size = (size + 15) & ~15; // Align for 16 bytes
 
-#if _M_SSE >= 0x401
-#include <smmintrin.h>
+		if ((totalSize + size) > maxSize_)
+			return nullptr; // No more memory
 
-inline __m128 SSENormalizeMultiplierSSE4(__m128 v)
-{
-	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF));
-}
+		size_t tmp = totalSize;
+		totalSize += size;
+		return buf_ + tmp;
+	}
+};
 
-inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
-{
-	if (useSSE4)
-		return SSENormalizeMultiplierSSE4(v);
-	return SSENormalizeMultiplierSSE2(v);
-}
-#else
-inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
-{
-	return SSENormalizeMultiplierSSE2(v);
-}
-#endif
-
-#endif
-
-
-#define START_OPEN 1
-#define END_OPEN 2
-
-
-
-static void CopyQuad(u8 *&dest, const SimpleVertex *v1, const SimpleVertex *v2, const SimpleVertex *v3, const SimpleVertex *v4) {
-	int vertexSize = sizeof(SimpleVertex);
-	memcpy(dest, v1, vertexSize);
-	dest += vertexSize;
-	memcpy(dest, v2, vertexSize);
-	dest += vertexSize;
-	memcpy(dest, v3, vertexSize);
-	dest += vertexSize;
-	memcpy(dest, v4, vertexSize);
-	dest += vertexSize;
-}
+namespace Spline {
 
 static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, const int idx1, const int idx2, const int idx3) {
 	if (type == GE_PATCHPRIM_LINES) {
@@ -100,8 +65,7 @@ static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, c
 		*(indices++) = idx3;
 		*(indices++) = idx1;
 		*(indices++) = idx2;
-	}
-	else {
+	} else {
 		*(indices++) = idx0;
 		*(indices++) = idx2;
 		*(indices++) = idx1;
@@ -111,765 +75,442 @@ static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, c
 	}
 }
 
-#undef b2
+void BuildIndex(u16 *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total) {
+	for (int v = 0; v < num_v; ++v) {
+		for (int u = 0; u < num_u; ++u) {
+			int idx0 = v * (num_u + 1) + u + total; // Top left
+			int idx2 = (v + 1) * (num_u + 1) + u + total; // Bottom left
 
-// Bernstein basis functions
-inline float bern0(float x) { return (1 - x) * (1 - x) * (1 - x); }
-inline float bern1(float x) { return 3 * x * (1 - x) * (1 - x); }
-inline float bern2(float x) { return 3 * x * x * (1 - x); }
-inline float bern3(float x) { return x * x * x; }
-
-inline float bern0deriv(float x) { return -3 * (x - 1) * (x - 1); }
-inline float bern1deriv(float x) { return 9 * x * x - 12 * x + 3; }
-inline float bern2deriv(float x) { return 3 * (2 - 3 * x) * x; }
-inline float bern3deriv(float x) { return 3 * x * x; }
-
-// http://en.wikipedia.org/wiki/Bernstein_polynomial
-static Math3D::Vec2f Bernstein3D(const Math3D::Vec2f& p0, const Math3D::Vec2f& p1, const Math3D::Vec2f& p2, const Math3D::Vec2f& p3, float x) {
-	if (x == 0) return p0;
-	else if (x == 1) return p3;
-	return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x);
-}
-
-static Vec3f Bernstein3D(const Vec3f& p0, const Vec3f& p1, const Vec3f& p2, const Vec3f& p3, float x) {
-	if (x == 0) return p0;
-	else if (x == 1) return p3;
-	return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x);
-}
-
-static Vec4f Bernstein3D(const Vec4f& p0, const Vec4f& p1, const Vec4f& p2, const Vec4f& p3, float x) {
-	if (x == 0) return p0;
-	else if (x == 1) return p3;
-	return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x);
-}
-
-static Vec4f Bernstein3D(const u32& p0, const u32& p1, const u32& p2, const u32& p3, float x) {
-	return Bernstein3D(Vec4f::FromRGBA(p0), Vec4f::FromRGBA(p1), Vec4f::FromRGBA(p2), Vec4f::FromRGBA(p3), x);
-}
-
-static Vec3f Bernstein3DDerivative(const Vec3f& p0, const Vec3f& p1, const Vec3f& p2, const Vec3f& p3, float x) {
-	return p0 * bern0deriv(x) + p1 * bern1deriv(x) + p2 * bern2deriv(x) + p3 * bern3deriv(x);
-}
-
-static void spline_n_4(int i, float t, float *knot, float *splineVal) {
-	knot += i + 1;
-
-#ifdef _M_SSE
-	const __m128 knot012 = _mm_loadu_ps(&knot[0]);
-	const __m128 knot345 = _mm_loadu_ps(&knot[3]);
-	const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
-	const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));
-
-	const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
-	const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
-	const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
-	const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));
-
-	// It's still faster to use SSE, even with this.
-	alignas(16) float ff30_41_52[4];
-	alignas(16) float ff31_42_32[4];
-	_mm_store_ps(ff30_41_52, f30_41_52);
-	_mm_store_ps(ff31_42_32, f31_42_32);
-
-	const float &f30 = ff30_41_52[0];
-	const float &f41 = ff30_41_52[1];
-	const float &f52 = ff30_41_52[2];
-	const float &f31 = ff31_42_32[0];
-	const float &f42 = ff31_42_32[1];
-	const float &f32 = ff31_42_32[2];
-#else
-	// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
-	float t0 = (t - knot[0]);
-	float t1 = (t - knot[1]);
-	float t2 = (t - knot[2]);
-	// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
-	float f30 = t0/(knot[3]-knot[0]);
-	float f41 = t1/(knot[4]-knot[1]);
-	float f52 = t2/(knot[5]-knot[2]);
-	float f31 = t1/(knot[3]-knot[1]);
-	float f42 = t2/(knot[4]-knot[2]);
-	float f32 = t2/(knot[3]-knot[2]);
-#endif
-
-	float a = (1-f30)*(1-f31);
-	float b = (f31*f41);
-	float c = (1-f41)*(1-f42);
-	float d = (f42*f52);
-
-	splineVal[0] = a-(a*f32);
-	splineVal[1] = 1-a-b+((a+b+c-1)*f32);
-	splineVal[2] = b+((1-b-c-d)*f32);
-	splineVal[3] = d*f32;
-}
-
-// knot should be an array sized n + 5  (n + 1 + 1 + degree (cubic))
-static void spline_knot(int n, int type, float *knot) {
-	memset(knot, 0, sizeof(float) * (n + 5));
-	for (int i = 0; i < n - 1; ++i)
-		knot[i + 3] = (float)i;
-
-	if ((type & 1) == 0) {
-		knot[0] = -3;
-		knot[1] = -2;
-		knot[2] = -1;
-	}
-	if ((type & 2) == 0) {
-		knot[n + 2] = (float)(n - 1);
-		knot[n + 3] = (float)(n);
-		knot[n + 4] = (float)(n + 1);
-	} else {
-		knot[n + 2] = (float)(n - 2);
-		knot[n + 3] = (float)(n - 2);
-		knot[n + 4] = (float)(n - 2);
-	}
-}
-
-bool CanUseHardwareTessellation(GEPatchPrimType prim) {
-	if (g_Config.bHardwareTessellation && !g_Config.bSoftwareRendering) {
-		return CanUseHardwareTransform(PatchPrimToPrim(prim));
-	}
-	return false;
-}
-
-// Prepare mesh of one patch for "Instanced Tessellation".
-static void TessellateSplinePatchHardware(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch) {
-	SimpleVertex *&vertices = (SimpleVertex*&)dest;
-
-	float inv_u = 1.0f / (float)spatch.tess_u;
-	float inv_v = 1.0f / (float)spatch.tess_v;
-
-	// Generating simple input vertices for the spline-computing vertex shader.
-	for (int tile_v = 0; tile_v < spatch.tess_v + 1; ++tile_v) {
-		for (int tile_u = 0; tile_u < spatch.tess_u + 1; ++tile_u) {
-			SimpleVertex &vert = vertices[tile_v * (spatch.tess_u + 1) + tile_u];
-			vert.pos.x = (float)tile_u * inv_u;
-			vert.pos.y = (float)tile_v * inv_v;
-
-			// TODO: Move to shader uniform and unify this method spline and bezier if necessary.
-			// For compute normal
-			vert.nrm.x = inv_u;
-			vert.nrm.y = inv_v;
-		}
-	}
-
-	// Combine the vertices into triangles.
-	for (int tile_v = 0; tile_v < spatch.tess_v; ++tile_v) {
-		for (int tile_u = 0; tile_u < spatch.tess_u; ++tile_u) {
-			int idx0 = tile_v * (spatch.tess_u + 1) + tile_u;
-			int idx1 = tile_v * (spatch.tess_u + 1) + tile_u + 1;
-			int idx2 = (tile_v + 1) * (spatch.tess_u + 1) + tile_u;
-			int idx3 = (tile_v + 1) * (spatch.tess_u + 1) + tile_u + 1;
-
-			CopyQuadIndex(indices, spatch.primType, idx0, idx1, idx2, idx3);
+			CopyQuadIndex(indices, prim_type, idx0, idx0 + 1, idx2, idx2 + 1);
 			count += 6;
 		}
 	}
 }
 
-static void _SplinePatchLowQuality(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType) {
-	// Fast and easy way - just draw the control points, generate some very basic normal vector substitutes.
-	// Very inaccurate but okay for Loco Roco. Maybe should keep it as an option because it's fast.
+class Bezier3DWeight {
+private:
+	void CalcWeights(float t, Weight &w) {
+		// Bernstein 3D basis polynomial
+		w.basis[0] = (1 - t) * (1 - t) * (1 - t);
+		w.basis[1] = 3 * t * (1 - t) * (1 - t);
+		w.basis[2] = 3 * t * t * (1 - t);
+		w.basis[3] = t * t * t;
 
-	const int tile_min_u = (spatch.type_u & START_OPEN) ? 0 : 1;
-	const int tile_min_v = (spatch.type_v & START_OPEN) ? 0 : 1;
-	const int tile_max_u = (spatch.type_u & END_OPEN) ? spatch.count_u - 1 : spatch.count_u - 2;
-	const int tile_max_v = (spatch.type_v & END_OPEN) ? spatch.count_v - 1 : spatch.count_v - 2;
+		// Derivative
+		w.deriv[0] = -3 * (1 - t) * (1 - t);
+		w.deriv[1] = 9 * t * t - 12 * t + 3;
+		w.deriv[2] = 3 * (2 - 3 * t) * t;
+		w.deriv[3] = 3 * t * t;
+	}
+public:
+	Weight *CalcWeightsAll(u32 key) {
+		int tess = (int)key;
+		Weight *weights = new Weight[tess + 1];
+		const float inv_tess = 1.0f / (float)tess;
+		for (int i = 0; i < tess + 1; ++i) {
+			const float t = (float)i * inv_tess;
+			CalcWeights(t, weights[i]);
+		}
+		return weights;
+	}
 
-	float tu_width = (float)spatch.count_u - 3.0f;
-	float tv_height = (float)spatch.count_v - 3.0f;
-	tu_width /= (float)(tile_max_u - tile_min_u);
-	tv_height /= (float)(tile_max_v - tile_min_v);
+	static u32 ToKey(int tess, int count, int type) {
+		return tess;
+	}
 
-	GEPatchPrimType prim_type = spatch.primType;
-	bool computeNormals = spatch.computeNormals;
-	bool patchFacing = spatch.patchFacing;
+	static int CalcSize(int tess, int count) {
+		return tess + 1;
+	}
 
-	int i = 0;
-	for (int tile_v = tile_min_v; tile_v < tile_max_v; ++tile_v) {
-		for (int tile_u = tile_min_u; tile_u < tile_max_u; ++tile_u) {
-			int point_index = tile_u + tile_v * spatch.count_u;
+	static WeightCache<Bezier3DWeight> weightsCache;
+};
 
-			SimpleVertex v0 = *spatch.points[point_index];
-			SimpleVertex v1 = *spatch.points[point_index + 1];
-			SimpleVertex v2 = *spatch.points[point_index + spatch.count_u];
-			SimpleVertex v3 = *spatch.points[point_index + spatch.count_u + 1];
+class Spline3DWeight {
+private:
+	struct KnotDiv {
+		float _3_0 = 1.0f / 3.0f;
+		float _4_1 = 1.0f / 3.0f;
+		float _5_2 = 1.0f / 3.0f;
+		float _3_1 = 1.0f / 2.0f;
+		float _4_2 = 1.0f / 2.0f;
+		float _3_2 = 1.0f; // Always 1
+	};
 
-			// Generate UV. TODO: Do this even if UV specified in control points?
-			if ((origVertType & GE_VTYPE_TC_MASK) == 0) {
-				float u = (tile_u - tile_min_u) * tu_width;
-				float v = (tile_v - tile_min_v) * tv_height;
+	// knots must hold at least n + 2 floats (the unoptimized formulation would need n + 5: n + 1 + 1 + cubic degree); divs must hold n entries.
+	void CalcKnots(int n, int type, float *knots, KnotDiv *divs) {
+		// Basic theory (-2 to +3), optimized with KnotDiv (-2 to +0) 
+	//	for (int i = 0; i < n + 5; ++i) {
+		for (int i = 0; i < n + 2; ++i) {
+			knots[i] = (float)i - 2;
+		}
 
-				v0.uv[0] = u;
-				v0.uv[1] = v;
-				v1.uv[0] = u + tu_width;
-				v1.uv[1] = v;
-				v2.uv[0] = u;
-				v2.uv[1] = v + tv_height;
-				v3.uv[0] = u + tu_width;
-				v3.uv[1] = v + tv_height;
-			}
+		// The first edge is open
+		if ((type & 1) != 0) {
+			knots[0] = 0;
+			knots[1] = 0;
 
-			// Generate normal if lighting is enabled (otherwise there's no point).
-			// This is a really poor quality algorithm, we get facet normals.
-			if (computeNormals) {
-				Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
-				norm.Normalize();
-				if (patchFacing)
-					norm *= -1.0f;
-				v0.nrm = norm;
-				v1.nrm = norm;
-				v2.nrm = norm;
-				v3.nrm = norm;
-			}
-
-			int idx0 = i * 4 + 0;
-			int idx1 = i * 4 + 1;
-			int idx2 = i * 4 + 2;
-			int idx3 = i * 4 + 3;
-			i++;
-
-			CopyQuad(dest, &v0, &v1, &v2, &v3);
-			CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3);
-			count += 6;
+			divs[0]._3_0 = 1.0f;
+			divs[0]._4_1 = 1.0f / 2.0f;
+			divs[0]._3_1 = 1.0f;
+			if (n > 1)
+				divs[1]._3_0 = 1.0f / 2.0f;
+		}
+		// The last edge is open
+		if ((type & 2) != 0) {
+			//	knots[n + 2] = (float)n; // Got rid of this line optimized with KnotDiv
+			//	knots[n + 3] = (float)n; // Got rid of this line optimized with KnotDiv
+			//	knots[n + 4] = (float)n; // Got rid of this line optimized with KnotDiv
+			divs[n - 1]._4_1 = 1.0f / 2.0f;
+			divs[n - 1]._5_2 = 1.0f;
+			divs[n - 1]._4_2 = 1.0f;
+			if (n > 1)
+				divs[n - 2]._5_2 = 1.0f / 2.0f;
 		}
 	}
 
-}
-
-static inline void AccumulateWeighted(Vec3f &out, const Vec3Packedf &in, const Vec4f &w) {
+	void CalcWeights(float t, const float *knots, const KnotDiv &div, Weight &w) {
 #ifdef _M_SSE
-	out.vec = _mm_add_ps(out.vec, _mm_mul_ps(_mm_loadu_ps(in.AsArray()), w.vec));
+		const __m128 knot012 = _mm_loadu_ps(knots);
+		const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
+		const __m128 f30_41_52 = _mm_mul_ps(t012, _mm_loadu_ps(&div._3_0));
+		const __m128 f52_31_42 = _mm_mul_ps(t012, _mm_loadu_ps(&div._5_2));
+
+		// The following comments explain the order of the multiplications.
+	//	float a = (1-f30)*(1-f31);
+	//	float c = (1-f41)*(1-f42);
+	//	float b = (  f31 *   f41);
+	//	float d = (  f42 *   f52);
+		const __m128 f30_41_31_42 = _mm_shuffle_ps(f30_41_52, f52_31_42, _MM_SHUFFLE(2, 1, 1, 0));
+		const __m128 f31_42_41_52 = _mm_shuffle_ps(f52_31_42, f30_41_52, _MM_SHUFFLE(2, 1, 2, 1));
+		const __m128 c1_1_0_0 = { 1, 1, 0, 0 };
+		const __m128 acbd = _mm_mul_ps(_mm_sub_ps(c1_1_0_0, f30_41_31_42), _mm_sub_ps(c1_1_0_0, f31_42_41_52));
+
+		alignas(16) float f_t012[4];
+		alignas(16) float f_acbd[4];
+		alignas(16) float f_f30_41_31_42[4];
+		_mm_store_ps(f_t012, t012);
+		_mm_store_ps(f_acbd, acbd);
+		_mm_store_ps(f_f30_41_31_42, f30_41_31_42);
+
+		const float &f32 = f_t012[2];
+
+		const float &a = f_acbd[0];
+		const float &b = f_acbd[2];
+		const float &c = f_acbd[1];
+		const float &d = f_acbd[3];
+
+		// For derivative
+		const float &f31 = f_f30_41_31_42[2];
+		const float &f42 = f_f30_41_31_42[3];
 #else
-	out += in * w.x;
+		// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
+		float t0 = (t - knots[0]);
+		float t1 = (t - knots[1]);
+		float t2 = (t - knots[2]);
+
+		float f30 = t0 * div._3_0;
+		float f41 = t1 * div._4_1;
+		float f52 = t2 * div._5_2;
+		float f31 = t1 * div._3_1;
+		float f42 = t2 * div._4_2;
+		float f32 = t2 * div._3_2;
+
+		float a = (1 - f30) * (1 - f31);
+		float b = (f31 * f41);
+		float c = (1 - f41) * (1 - f42);
+		float d = (f42 * f52);
 #endif
-}
+		w.basis[0] = a * (1 - f32); // (1-f30)*(1-f31)*(1-f32)
+		w.basis[1] = 1 - a - b + ((a + b + c - 1) * f32);
+		w.basis[2] = b + ((1 - b - c - d) * f32);
+		w.basis[3] = d * f32; // f32*f42*f52
 
-static inline void AccumulateWeighted(Vec4f &out, const Vec4f &in, const Vec4f &w) {
-#ifdef _M_SSE
-	out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec));
-#else
-	out += in * w;
-#endif
-}
+		// Derivative
+		float i1 = (1 - f31) * (1 - f32);
+		float i2 = f31 * (1 - f32) + (1 - f42) * f32;
+		float i3 = f42 * f32;
 
-template <bool origNrm, bool origCol, bool origTc, bool useSSE4>
-static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
-	// Full (mostly) correct tessellation of spline patches.
-	// Not very fast.
+		float f130 = i1 * div._3_0;
+		float f241 = i2 * div._4_1;
+		float f352 = i3 * div._5_2;
 
-	float *knot_u = new float[spatch.count_u + 4];
-	float *knot_v = new float[spatch.count_v + 4];
-	spline_knot(spatch.count_u - 1, spatch.type_u, knot_u);
-	spline_knot(spatch.count_v - 1, spatch.type_v, knot_v);
+		w.deriv[0] = 3 * (0 - f130);
+		w.deriv[1] = 3 * (f130 - f241);
+		w.deriv[2] = 3 * (f241 - f352);
+		w.deriv[3] = 3 * (f352 - 0);
+	}
+public:
+	Weight *CalcWeightsAll(u32 key) {
+		int tess, count, type;
+		FromKey(key, tess, count, type);
+		const int num_patches = count - 3;
+		Weight *weights = new Weight[tess * num_patches + 1];
 
-	// Increase tessellation based on the size. Should be approximately right?
-	int patch_div_s = (spatch.count_u - 3) * spatch.tess_u;
-	int patch_div_t = (spatch.count_v - 3) * spatch.tess_v;
-	if (quality > 1) {
-		// Don't cut below 2, though.
-		if (patch_div_s > 2) {
-			patch_div_s /= quality;
-		}
-		if (patch_div_t > 2) {
-			patch_div_t /= quality;
+	//	float *knots = new float[num_patches + 5]; // Size the unoptimized formulation would need.
+		float *knots = new float[num_patches + 3](); // KnotDiv needs only +2, but the SSE CalcWeights loads 4 floats from knots + i; one zeroed pad element keeps the last patch's load in bounds.
+		KnotDiv *divs = new KnotDiv[num_patches];
+		CalcKnots(num_patches, type, knots, divs);
+
+		const float inv_tess = 1.0f / (float)tess;
+		for (int i = 0; i < num_patches; ++i) {
+			const int start = (i == 0) ? 0 : 1;
+			for (int j = start; j <= tess; ++j) {
+				const int index = i * tess + j;
+				const float t = (float)index * inv_tess;
+				CalcWeights(t, knots + i, divs[i], weights[index]);
+			}
 		}
+
+		delete[] knots;
+		delete[] divs;
+
+		return weights;
 	}
 
-	// Downsample until it fits, in case crazy tessellation factors are sent.
-	while ((patch_div_s + 1) * (patch_div_t + 1) > maxVertices) {
-		patch_div_s /= 2;
-		patch_div_t /= 2;
+	static u32 ToKey(int tess, int count, int type) {
+		return tess | (count << 8) | (type << 16);
 	}
 
-	if (patch_div_s < 1) patch_div_s = 1;
-	if (patch_div_t < 1) patch_div_t = 1;
+	static void FromKey(u32 key, int &tess, int &count, int &type) {
+		tess = key & 0xFF; count = (key >> 8) & 0xFF; type = (key >> 16) & 0xFF;
+	}
 
-	// First compute all the vertices and put them in an array
-	SimpleVertex *&vertices = (SimpleVertex*&)dest;
+	static int CalcSize(int tess, int count) {
+		return (count - 3) * tess + 1;
+	}
 
-	float tu_width = (float)spatch.count_u - 3.0f;
-	float tv_height = (float)spatch.count_v - 3.0f;
+	static WeightCache<Spline3DWeight> weightsCache;
+};
 
-	// int max_idx = spatch.count_u * spatch.count_v;
+WeightCache<Bezier3DWeight> Bezier3DWeight::weightsCache;
+WeightCache<Spline3DWeight> Spline3DWeight::weightsCache;
 
-	bool computeNormals = spatch.computeNormals;
+// Tessellate single patch (4x4 control points)
+template<typename T>
+class Tessellator {
+private:
+	const T *const p[4]; // T p[v][u]; 4x4 control points
+	T u[4]; // Pre-tessellated U lines
+public:
+	Tessellator(const T *p, const int idx[4]) : p{ p + idx[0], p + idx[1], p + idx[2], p + idx[3] } {}
 
-	float one_over_patch_div_s = 1.0f / (float)(patch_div_s);
-	float one_over_patch_div_t = 1.0f / (float)(patch_div_t);
+	// Linear combination
+	T Sample(const T p[4], const float w[4]) {
+		return p[0] * w[0] + p[1] * w[1] + p[2] * w[2] + p[3] * w[3];
+	}
 
-	for (int tile_v = 0; tile_v < patch_div_t + 1; tile_v++) {
-		float v = (float)tile_v * (float)(spatch.count_v - 3) * one_over_patch_div_t;
-		if (v < 0.0f)
-			v = 0.0f;
-		for (int tile_u = 0; tile_u < patch_div_s + 1; tile_u++) {
-			float u = (float)tile_u * (float)(spatch.count_u - 3) * one_over_patch_div_s;
-			if (u < 0.0f)
-				u = 0.0f;
-			SimpleVertex *vert = &vertices[tile_v * (patch_div_s + 1) + tile_u];
-			Vec4f vert_color(0, 0, 0, 0);
-			Vec3f vert_pos;
-			vert_pos.SetZero();
-			Vec3f vert_nrm;
-			if (origNrm) {
-				vert_nrm.SetZero();
-			}
-			if (origCol) {
-				vert_color.SetZero();
-			} else {
-				memcpy(vert->color, spatch.points[0]->color, 4);
-			}
-			if (origTc) {
-				vert->uv[0] = 0.0f;
-				vert->uv[1] = 0.0f;
-			} else {
-				vert->uv[0] = tu_width * ((float)tile_u * one_over_patch_div_s);
-				vert->uv[1] = tv_height * ((float)tile_v * one_over_patch_div_t);
-			}
+	void SampleEdgeU(int idx) {
+		u[0] = p[0][idx];
+		u[1] = p[1][idx];
+		u[2] = p[2][idx];
+		u[3] = p[3][idx];
+	}
 
+	void SampleU(const float weights[4]) {
+		if (weights[0] == 1.0f) { SampleEdgeU(0); return; } // weights = {1,0,0,0}, first edge is open.
+		if (weights[3] == 1.0f) { SampleEdgeU(3); return; } // weights = {0,0,0,1}, last edge is open.
 
-			// Collect influences from surrounding control points.
-			float u_weights[4];
-			float v_weights[4];
+		u[0] = Sample(p[0], weights);
+		u[1] = Sample(p[1], weights);
+		u[2] = Sample(p[2], weights);
+		u[3] = Sample(p[3], weights);
+	}
 
-			int iu = (int)u;
-			int iv = (int)v;
+	T SampleV(const float weights[4]) {
+		if (weights[0] == 1.0f) return u[0]; // weights = {1,0,0,0}, first edge is open.
+		if (weights[3] == 1.0f) return u[3]; // weights = {0,0,0,1}, last edge is open.
 
-			// TODO: Would really like to fix the surrounding logic somehow to get rid of these but I can't quite get it right..
-			// Without the previous epsilons and with large count_u, we will end up doing an out of bounds access later without these.
-			if (iu >= spatch.count_u - 3) iu = spatch.count_u - 4;
-			if (iv >= spatch.count_v - 3) iv = spatch.count_v - 4;
+		return Sample(u, weights);
+	}
+};
 
-			spline_n_4(iu, u, knot_u, u_weights);
-			spline_n_4(iv, v, knot_v, v_weights);
+ControlPoints::ControlPoints(const SimpleVertex *const *points, int size, SimpleBufferManager &managedBuf) {
+	pos = (Vec3f *)managedBuf.Allocate(sizeof(Vec3f) * size); // Three parallel arrays carved out of the caller's managed buffer.
+	tex = (Vec2f *)managedBuf.Allocate(sizeof(Vec2f) * size);
+	col = (Vec4f *)managedBuf.Allocate(sizeof(Vec4f) * size);
+	if (pos && tex && col) Convert(points, size); // Allocate() returns nullptr once the buffer is exhausted; skip the fill rather than write through null.
+}
 
-			// Handle degenerate patches. without this, spatch.points[] may read outside the number of initialized points.
-			int patch_w = std::min(spatch.count_u - iu, 4);
-			int patch_h = std::min(spatch.count_v - iv, 4);
+void ControlPoints::Convert(const SimpleVertex *const *points, int size) {
+	defcolor = points[0]->color_32; // The first control point's packed color is kept as defcolor.
+	for (int n = 0; n < size; ++n) {
+		pos[n] = Vec3f(points[n]->pos);
+		tex[n] = Vec2f(points[n]->uv);
+		col[n] = Vec4f::FromRGBA(points[n]->color_32); // Packed 32-bit color converted to a Vec4f.
+	}
+}
 
-			for (int ii = 0; ii < patch_w; ++ii) {
-				for (int jj = 0; jj < patch_h; ++jj) {
-					float u_spline = u_weights[ii];
-					float v_spline = v_weights[jj];
-					float f = u_spline * v_spline;
+template<class Surface>
+class SubdivisionSurface {
+public:
+	template <bool sampleNrm, bool sampleCol, bool sampleTex, bool useSSE4, bool patchFacing> // Compile-time switches; chosen at runtime by the non-template Tessellate overload.
+	static void Tessellate(OutputBuffers &output, const Surface &surface, const ControlPoints &points, const Weight2D &weights) { // Writes tessellated vertices into output.vertices, then builds the indices.
+		const float inv_u = 1.0f / (float)surface.tess_u; // NOTE(review): divides by tess_u / tess_v - confirm both are clamped to >= 1 before this point.
+		const float inv_v = 1.0f / (float)surface.tess_v;
 
-					if (f > 0.0f) {
-#ifdef _M_SSE
-						Vec4f fv(_mm_set_ps1(f));
-#else
-						Vec4f fv = Vec4f::AssignToAll(f);
-#endif
-						int idx = spatch.count_u * (iv + jj) + (iu + ii);
-						/*
-						if (idx >= max_idx) {
-							char temp[512];
-							snprintf(temp, sizeof(temp), "count_u: %d count_v: %d patch_w: %d patch_h: %d  ii: %d  jj: %d  iu: %d  iv: %d  patch_div_s: %d  patch_div_t: %d\n", spatch.count_u, spatch.count_v, patch_w, patch_h, ii, jj, iu, iv, patch_div_s, patch_div_t);
-							OutputDebugStringA(temp);
-							Crash();
-						}*/
-						const SimpleVertex *a = spatch.points[idx];
-						AccumulateWeighted(vert_pos, a->pos, fv);
-						if (origTc) {
-							vert->uv[0] += a->uv[0] * f;
-							vert->uv[1] += a->uv[1] * f;
+		for (int patch_u = 0; patch_u < surface.num_patches_u; ++patch_u) {
+			const int start_u = surface.GetTessStart(patch_u); // First tile_u emitted for this patch; earlier tiles are skipped.
+			for (int patch_v = 0; patch_v < surface.num_patches_v; ++patch_v) {
+				const int start_v = surface.GetTessStart(patch_v); // Same for tile_v.
+
+				// Prepare 4x4 control points to tessellate: idx is the patch's first point, idx_v holds four start indices spaced num_points_u apart.
+				const int idx = surface.GetPointIndex(patch_u, patch_v);
+				const int idx_v[4] = { idx, idx + surface.num_points_u, idx + surface.num_points_u * 2, idx + surface.num_points_u * 3 };
+				Tessellator<Vec3f> tess_pos(points.pos, idx_v);
+				Tessellator<Vec4f> tess_col(points.col, idx_v);
+				Tessellator<Vec2f> tess_tex(points.tex, idx_v);
+				Tessellator<Vec3f> tess_nrm(points.pos, idx_v); // Same positions as tess_pos, but sampled in U with the derivative weights (see below).
+
+				for (int tile_u = start_u; tile_u <= surface.tess_u; ++tile_u) {
+					const int index_u = surface.GetIndexU(patch_u, tile_u);
+					const Weight &wu = weights.u[index_u];
+
+					// Pre-tessellate U lines once per tile_u; the inner tile_v loop only samples along V.
+					tess_pos.SampleU(wu.basis);
+					if (sampleCol)
+						tess_col.SampleU(wu.basis);
+					if (sampleTex)
+						tess_tex.SampleU(wu.basis);
+					if (sampleNrm)
+						tess_nrm.SampleU(wu.deriv);
+
+					for (int tile_v = start_v; tile_v <= surface.tess_v; ++tile_v) {
+						const int index_v = surface.GetIndexV(patch_v, tile_v);
+						const Weight &wv = weights.v[index_v];
+
+						SimpleVertex &vert = output.vertices[surface.GetIndex(index_u, index_v, patch_u, patch_v)];
+
+						// Tessellate: combine the pre-sampled U lines with the V weights.
+						vert.pos = tess_pos.SampleV(wv.basis);
+						if (sampleCol) {
+							vert.color_32 = tess_col.SampleV(wv.basis).ToRGBA();
+						} else {
+							vert.color_32 = points.defcolor; // No per-vertex color requested: use the control points' default color.
 						}
-						if (origCol) {
-							Vec4f a_color = Vec4f::FromRGBA(a->color_32);
-							AccumulateWeighted(vert_color, a_color, fv);
+						if (sampleTex) {
+							tess_tex.SampleV(wv.basis).Write(vert.uv);
+						} else {
+							// Generate texcoord from the patch number plus the fractional tile position.
+							vert.uv[0] = patch_u + tile_u * inv_u;
+							vert.uv[1] = patch_v + tile_v * inv_v;
 						}
-						if (origNrm) {
-							AccumulateWeighted(vert_nrm, a->nrm, fv);
+						if (sampleNrm) {
+							const Vec3f derivU = tess_nrm.SampleV(wv.basis); // Derivative weights in U, basis weights in V.
+							const Vec3f derivV = tess_pos.SampleV(wv.deriv); // Basis weights in U, derivative weights in V.
+
+							vert.nrm = Cross(derivU, derivV).Normalized(useSSE4);
+							if (patchFacing)
+								vert.nrm *= -1.0f;
+						} else {
+							vert.nrm.SetZero(); // Normals not requested: leave them zeroed.
 						}
 					}
 				}
 			}
-			vert->pos = vert_pos;
-			if (origNrm) {
-#ifdef _M_SSE
-				const __m128 normalize = SSENormalizeMultiplier(useSSE4, vert_nrm.vec);
-				vert_nrm.vec = _mm_mul_ps(vert_nrm.vec, normalize);
-#else
-				vert_nrm.Normalize();
-#endif
-				vert->nrm = vert_nrm;
-			} else {
-				vert->nrm.SetZero();
-				vert->nrm.z = 1.0f;
-			}
-			if (origCol) {
-				vert->color_32 = vert_color.ToRGBA();
-			}
 		}
+
+		surface.BuildIndex(output.indices, output.count); // Presumably fills output.indices and updates output.count - confirm in the Surface type.
 	}
 
-	delete[] knot_u;
-	delete[] knot_v;
+	using TessFunc = void(*)(OutputBuffers &, const Surface &, const ControlPoints &, const Weight2D &); // Signature shared by every Tessellate<...> instantiation.
+	TEMPLATE_PARAMETER_DISPATCHER_FUNCTION(Tess, SubdivisionSurface::Tessellate, TessFunc); // NOTE(review): presumably declares Tess, the instantiation table used by the dispatcher - confirm against the macro definition.
 
-	// Hacky normal generation through central difference.
-	if (computeNormals && !origNrm) {
-#ifdef _M_SSE
-		const __m128 facing = spatch.patchFacing ? _mm_set_ps1(-1.0f) : _mm_set_ps1(1.0f);
-#endif
+	static void Tessellate(OutputBuffers &output, const Surface &surface, const ControlPoints &points, const Weight2D &weights, u32 origVertType) { // Runtime entry point: picks a Tessellate<...> instantiation and calls it.
+		const bool params[] = { // Listed in the same order as the template parameters of Tessellate<...> (presumably required by the dispatcher).
+			(origVertType & GE_VTYPE_NRM_MASK) != 0,
+			(origVertType & GE_VTYPE_COL_MASK) != 0,
+			(origVertType & GE_VTYPE_TC_MASK) != 0,
+			cpu_info.bSSE4_1,
+			surface.patchFacing,
+		};
+		static TemplateParameterDispatcher<TessFunc, ARRAY_SIZE(params), Tess> dispatcher; // Initialize only once (function-local static)
 
-		for (int v = 0; v < patch_div_t + 1; v++) {
-			Vec3f vl_pos = vertices[v * (patch_div_s + 1)].pos;
-			Vec3f vc_pos = vertices[v * (patch_div_s + 1)].pos;
-
-			for (int u = 0; u < patch_div_s + 1; u++) {
-				const int t = std::max(0, v - 1);
-				const int r = std::min(patch_div_s, u + 1);
-				const int b = std::min(patch_div_t, v + 1);
-
-				const Vec3f vr_pos = vertices[v * (patch_div_s + 1) + r].pos;
-
-#ifdef _M_SSE
-				const __m128 right = _mm_sub_ps(vr_pos.vec, vl_pos.vec);
-
-				const Vec3f vb_pos = vertices[b * (patch_div_s + 1) + u].pos;
-				const Vec3f vt_pos = vertices[t * (patch_div_s + 1) + u].pos;
-				const __m128 down = _mm_sub_ps(vb_pos.vec, vt_pos.vec);
-
-				const __m128 crossed = SSECrossProduct(right, down);
-				const __m128 normalize = SSENormalizeMultiplier(useSSE4, crossed);
-
-				Vec3f finalNrm = _mm_mul_ps(normalize, _mm_mul_ps(crossed, facing));
-				vertices[v * (patch_div_s + 1) + u].nrm = finalNrm;
-#else
-				const Vec3Packedf &right = vr_pos - vl_pos;
-				const Vec3Packedf &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;
-
-				vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized();
-				if (spatch.patchFacing) {
-					vertices[v * (patch_div_s + 1) + u].nrm *= -1.0f;
-				}
-#endif
-
-				// Rotate for the next one to the right.
-				vl_pos = vc_pos;
-				vc_pos = vr_pos;
-			}
-		}
+		TessFunc func = dispatcher.GetFunc(params); // Instantiation matching the five flags above.
+		func(output, surface, points, weights);
 	}
-
-	GEPatchPrimType prim_type = spatch.primType;
-	// Tessellate.
-	for (int tile_v = 0; tile_v < patch_div_t; ++tile_v) {
-		for (int tile_u = 0; tile_u < patch_div_s; ++tile_u) {
-			int idx0 = tile_v * (patch_div_s + 1) + tile_u;
-			int idx1 = tile_v * (patch_div_s + 1) + tile_u + 1;
-			int idx2 = (tile_v + 1) * (patch_div_s + 1) + tile_u;
-			int idx3 = (tile_v + 1) * (patch_div_s + 1) + tile_u + 1;
-
-			CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3);
-			count += 6;
-		}
-	}
-}
-
-template <bool origNrm, bool origCol, bool origTc>
-static inline void SplinePatchFullQualityDispatch4(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
-	if (cpu_info.bSSE4_1)
-		SplinePatchFullQuality<origNrm, origCol, origTc, true>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-	else
-		SplinePatchFullQuality<origNrm, origCol, origTc, false>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-}
-
-template <bool origNrm, bool origCol>
-static inline void SplinePatchFullQualityDispatch3(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
-	bool origTc = (origVertType & GE_VTYPE_TC_MASK) != 0;
-
-	if (origTc)
-		SplinePatchFullQualityDispatch4<origNrm, origCol, true>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-	else
-		SplinePatchFullQualityDispatch4<origNrm, origCol, false>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-}
-
-template <bool origNrm>
-static inline void SplinePatchFullQualityDispatch2(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
-	bool origCol = (origVertType & GE_VTYPE_COL_MASK) != 0;
-
-	if (origCol)
-		SplinePatchFullQualityDispatch3<origNrm, true>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-	else
-		SplinePatchFullQualityDispatch3<origNrm, false>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-}
-
-static void SplinePatchFullQualityDispatch(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
-	bool origNrm = (origVertType & GE_VTYPE_NRM_MASK) != 0;
-
-	if (origNrm)
-		SplinePatchFullQualityDispatch2<true>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-	else
-		SplinePatchFullQualityDispatch2<false>(dest, indices, count, spatch, origVertType, quality, maxVertices);
-}
-
-void TessellateSplinePatch(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int maxVertexCount) {
-	switch (g_Config.iSplineBezierQuality) {
-	case LOW_QUALITY:
-		_SplinePatchLowQuality(dest, indices, count, spatch, origVertType);
-		break;
-	case MEDIUM_QUALITY:
-		SplinePatchFullQualityDispatch(dest, indices, count, spatch, origVertType, 2, maxVertexCount);
-		break;
-	case HIGH_QUALITY:
-		SplinePatchFullQualityDispatch(dest, indices, count, spatch, origVertType, 1, maxVertexCount);
-		break;
-	}
-}
-
-static void _BezierPatchLowQuality(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType) {
-	const float third = 1.0f / 3.0f;
-	// Fast and easy way - just draw the control points, generate some very basic normal vector subsitutes.
-	// Very inaccurate though but okay for Loco Roco. Maybe should keep it as an option.
-
-	float u_base = patch.u_index / 3.0f;
-	float v_base = patch.v_index / 3.0f;
-
-	GEPatchPrimType prim_type = patch.primType;
-
-	for (int tile_v = 0; tile_v < 3; tile_v++) {
-		for (int tile_u = 0; tile_u < 3; tile_u++) {
-			int point_index = tile_u + tile_v * 4;
-
-			SimpleVertex v0 = *patch.points[point_index];
-			SimpleVertex v1 = *patch.points[point_index + 1];
-			SimpleVertex v2 = *patch.points[point_index + 4];
-			SimpleVertex v3 = *patch.points[point_index + 5];
-
-			// Generate UV. TODO: Do this even if UV specified in control points?
-			if ((origVertType & GE_VTYPE_TC_MASK) == 0) {
-				float u = u_base + tile_u * third;
-				float v = v_base + tile_v * third;
-				v0.uv[0] = u;
-				v0.uv[1] = v;
-				v1.uv[0] = u + third;
-				v1.uv[1] = v;
-				v2.uv[0] = u;
-				v2.uv[1] = v + third;
-				v3.uv[0] = u + third;
-				v3.uv[1] = v + third;
-			}
-
-			// Generate normal if lighting is enabled (otherwise there's no point).
-			// This is a really poor quality algorithm, we get facet normals.
-			if (patch.computeNormals) {
-				Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
-				norm.Normalize();
-				if (patch.patchFacing)
-					norm *= -1.0f;
-				v0.nrm = norm;
-				v1.nrm = norm;
-				v2.nrm = norm;
-				v3.nrm = norm;
-			}
-
-			int total = patch.index * 3 * 3 * 4; // A patch has 3x3 tiles, and each tiles have 4 vertices.
-			int tile_index = tile_u + tile_v * 3;
-			int idx0 = total + tile_index * 4 + 0;
-			int idx1 = total + tile_index * 4 + 1;
-			int idx2 = total + tile_index * 4 + 2;
-			int idx3 = total + tile_index * 4 + 3;
-
-			CopyQuad(dest, &v0, &v1, &v2, &v3);
-			CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3);
-			count += 6;
-		}
-	}
-}
-
-template <typename T>
-struct PrecomputedCurves {
-	PrecomputedCurves(int count) {
-		horiz1 = (T *)AllocateAlignedMemory(count * 4 * sizeof(T), 16);
-		horiz2 = horiz1 + count * 1;
-		horiz3 = horiz1 + count * 2;
-		horiz4 = horiz1 + count * 3;
-	}
-	~PrecomputedCurves() {
-		FreeAlignedMemory(horiz1);
-	}
-
-	T Bernstein3D(int u, float bv) {
-		return ::Bernstein3D(horiz1[u], horiz2[u], horiz3[u], horiz4[u], bv);
-	}
-
-	T Bernstein3DDerivative(int u, float bv) {
-		return ::Bernstein3DDerivative(horiz1[u], horiz2[u], horiz3[u], horiz4[u], bv);
-	}
-
-	T *horiz1;
-	T *horiz2;
-	T *horiz3;
-	T *horiz4;
 };
 
-static void _BezierPatchHighQuality(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType) {
-	const float third = 1.0f / 3.0f;
+template<class Surface>
+void SoftwareTessellation(OutputBuffers &output, const Surface &surface, u32 origVertType, const ControlPoints &points) { // CPU path: obtains the U/V weights and runs SubdivisionSurface<Surface>::Tessellate.
+	using WeightType = typename Surface::WeightType;
+	u32 key_u = WeightType::ToKey(surface.tess_u, surface.num_points_u, surface.type_u); // Key built from tessellation level, point count and type along U.
+	u32 key_v = WeightType::ToKey(surface.tess_v, surface.num_points_v, surface.type_v); // Same along V.
+	Weight2D weights(WeightType::weightsCache, key_u, key_v); // Weights come via the shared weightsCache - presumably computed on first use; confirm in Weight2D.
 
-	// First compute all the vertices and put them in an array
-	SimpleVertex *&vertices = (SimpleVertex*&)dest;
+	SubdivisionSurface<Surface>::Tessellate(output, surface, points, weights, origVertType);
+}
 
-	PrecomputedCurves<Vec3f> prepos(tess_u + 1);
-	PrecomputedCurves<Vec4f> precol(tess_u + 1);
-	PrecomputedCurves<Math3D::Vec2f> pretex(tess_u + 1);
-	PrecomputedCurves<Vec3f> prederivU(tess_u + 1);
+template<class Surface>
+static void HardwareTessellation(OutputBuffers &output, const Surface &surface, u32 origVertType,
+	const SimpleVertex *const *points, TessellationDataTransfer *tessDataTransfer) { // Shader path: sends control points and weights to tessDataTransfer, then emits index-carrying vertices.
+	using WeightType = typename Surface::WeightType;
+	u32 key_u = WeightType::ToKey(surface.tess_u, surface.num_points_u, surface.type_u);
+	u32 key_v = WeightType::ToKey(surface.tess_v, surface.num_points_v, surface.type_v);
+	Weight2D weights(WeightType::weightsCache, key_u, key_v);
+	weights.size_u = WeightType::CalcSize(surface.tess_u, surface.num_points_u); // Sizes are filled in on this path only; they travel to SendDataToShader inside weights.
+	weights.size_v = WeightType::CalcSize(surface.tess_v, surface.num_points_v);
+	tessDataTransfer->SendDataToShader(points, surface.num_points_u, surface.num_points_v, origVertType, weights);
 
-	const bool computeNormals = patch.computeNormals;
-	const bool sampleColors = (origVertType & GE_VTYPE_COL_MASK) != 0;
-	const bool sampleTexcoords = (origVertType & GE_VTYPE_TC_MASK) != 0;
-
-	// Precompute the horizontal curves to we only have to evaluate the vertical ones.
-	for (int i = 0; i < tess_u + 1; i++) {
-		float u = ((float)i / (float)tess_u);
-		prepos.horiz1[i] = Bernstein3D(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, u);
-		prepos.horiz2[i] = Bernstein3D(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, u);
-		prepos.horiz3[i] = Bernstein3D(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, u);
-		prepos.horiz4[i] = Bernstein3D(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, u);
-
-		if (sampleColors) {
-			precol.horiz1[i] = Bernstein3D(patch.points[0]->color_32, patch.points[1]->color_32, patch.points[2]->color_32, patch.points[3]->color_32, u);
-			precol.horiz2[i] = Bernstein3D(patch.points[4]->color_32, patch.points[5]->color_32, patch.points[6]->color_32, patch.points[7]->color_32, u);
-			precol.horiz3[i] = Bernstein3D(patch.points[8]->color_32, patch.points[9]->color_32, patch.points[10]->color_32, patch.points[11]->color_32, u);
-			precol.horiz4[i] = Bernstein3D(patch.points[12]->color_32, patch.points[13]->color_32, patch.points[14]->color_32, patch.points[15]->color_32, u);
-		}
-		if (sampleTexcoords) {
-			pretex.horiz1[i] = Bernstein3D(Math3D::Vec2f(patch.points[0]->uv), Math3D::Vec2f(patch.points[1]->uv), Math3D::Vec2f(patch.points[2]->uv), Math3D::Vec2f(patch.points[3]->uv), u);
-			pretex.horiz2[i] = Bernstein3D(Math3D::Vec2f(patch.points[4]->uv), Math3D::Vec2f(patch.points[5]->uv), Math3D::Vec2f(patch.points[6]->uv), Math3D::Vec2f(patch.points[7]->uv), u);
-			pretex.horiz3[i] = Bernstein3D(Math3D::Vec2f(patch.points[8]->uv), Math3D::Vec2f(patch.points[9]->uv), Math3D::Vec2f(patch.points[10]->uv), Math3D::Vec2f(patch.points[11]->uv), u);
-			pretex.horiz4[i] = Bernstein3D(Math3D::Vec2f(patch.points[12]->uv), Math3D::Vec2f(patch.points[13]->uv), Math3D::Vec2f(patch.points[14]->uv), Math3D::Vec2f(patch.points[15]->uv), u);
-		}
-
-		if (computeNormals) {
-			prederivU.horiz1[i] = Bernstein3DDerivative(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, u);
-			prederivU.horiz2[i] = Bernstein3DDerivative(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, u);
-			prederivU.horiz3[i] = Bernstein3DDerivative(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, u);
-			prederivU.horiz4[i] = Bernstein3DDerivative(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, u);
-		}
-	}
-
-
-	for (int tile_v = 0; tile_v < tess_v + 1; ++tile_v) {
-		for (int tile_u = 0; tile_u < tess_u + 1; ++tile_u) {
-			float u = ((float)tile_u / (float)tess_u);
-			float v = ((float)tile_v / (float)tess_v);
-			float bv = v;
-
-			SimpleVertex &vert = vertices[tile_v * (tess_u + 1) + tile_u];
-
-			if (computeNormals) {
-				const Vec3f derivU = prederivU.Bernstein3D(tile_u, bv);
-				const Vec3f derivV = prepos.Bernstein3DDerivative(tile_u, bv);
-
-				vert.nrm = Cross(derivU, derivV).Normalized();
-				if (patch.patchFacing)
-					vert.nrm *= -1.0f;
-			} else {
-				vert.nrm.SetZero();
-			}
-
-			vert.pos = prepos.Bernstein3D(tile_u, bv);
-
-			if (!sampleTexcoords) {
-				// Generate texcoord
-				vert.uv[0] = u + patch.u_index * third;
-				vert.uv[1] = v + patch.v_index * third;
-			} else {
-				// Sample UV from control points
-				const Math3D::Vec2f res = pretex.Bernstein3D(tile_u, bv);
-				vert.uv[0] = res.x;
-				vert.uv[1] = res.y;
-			} 
-
-			if (sampleColors) {
-				vert.color_32 = precol.Bernstein3D(tile_u, bv).ToRGBA();
-			} else {
-				memcpy(vert.color, patch.points[0]->color, 4);
+	// Generating simple input vertices for the spline-computing vertex shader: they carry lookup indices, not final positions.
+	float inv_u = 1.0f / (float)surface.tess_u; // NOTE(review): divides by tess_u / tess_v - confirm both are clamped to >= 1 before this point.
+	float inv_v = 1.0f / (float)surface.tess_v;
+	for (int patch_u = 0; patch_u < surface.num_patches_u; ++patch_u) {
+		const int start_u = surface.GetTessStart(patch_u);
+		for (int patch_v = 0; patch_v < surface.num_patches_v; ++patch_v) {
+			const int start_v = surface.GetTessStart(patch_v);
+			for (int tile_u = start_u; tile_u <= surface.tess_u; ++tile_u) {
+				const int index_u = surface.GetIndexU(patch_u, tile_u);
+				for (int tile_v = start_v; tile_v <= surface.tess_v; ++tile_v) {
+					const int index_v = surface.GetIndexV(patch_v, tile_v);
+					SimpleVertex &vert = output.vertices[surface.GetIndex(index_u, index_v, patch_u, patch_v)];
+					// Index for the weights (integer indices stored in float fields)
+					vert.pos.x = index_u;
+					vert.pos.y = index_v;
+					// For texcoord generation: patch number plus fractional tile position
+					vert.nrm.x = patch_u + (float)tile_u * inv_u;
+					vert.nrm.y = patch_v + (float)tile_v * inv_v;
+					// Patch position (patch_u in pos.z, patch_v in nrm.z)
+					vert.pos.z = patch_u;
+					vert.nrm.z = patch_v;
+				}
 			}
 		}
 	}
-
-	GEPatchPrimType prim_type = patch.primType;
-	// Combine the vertices into triangles.
-	for (int tile_v = 0; tile_v < tess_v; ++tile_v) {
-		for (int tile_u = 0; tile_u < tess_u; ++tile_u) {
-			int total = patch.index * (tess_u + 1) * (tess_v + 1);
-			int idx0 = total + tile_v * (tess_u + 1) + tile_u;
-			int idx1 = total + tile_v * (tess_u + 1) + tile_u + 1;
-			int idx2 = total + (tile_v + 1) * (tess_u + 1) + tile_u;
-			int idx3 = total + (tile_v + 1) * (tess_u + 1) + tile_u + 1;
-
-			CopyQuadIndex(indices, prim_type, idx0, idx1, idx2, idx3);
-			count += 6;
-		}
-	}
-	dest += (tess_u + 1) * (tess_v + 1) * sizeof(SimpleVertex);
+	surface.BuildIndex(output.indices, output.count); // Presumably fills output.indices and updates output.count - confirm in the Surface type.
 }
 
-// Prepare mesh of one patch for "Instanced Tessellation".
-static void TessellateBezierPatchHardware(u8 *&dest, u16 *indices, int &count, int tess_u, int tess_v, GEPatchPrimType primType) {
-	SimpleVertex *&vertices = (SimpleVertex*&)dest;
+} // namespace Spline
 
-	float inv_u = 1.0f / (float)tess_u;
-	float inv_v = 1.0f / (float)tess_v;
+using namespace Spline;
 
-	// Generating simple input vertices for the bezier-computing vertex shader.
-	for (int tile_v = 0; tile_v < tess_v + 1; ++tile_v) {
-		for (int tile_u = 0; tile_u < tess_u + 1; ++tile_u) {
-			SimpleVertex &vert = vertices[tile_v * (tess_u + 1) + tile_u];
-
-			vert.pos.x = (float)tile_u * inv_u;
-			vert.pos.y = (float)tile_v * inv_v;
-		}
-	}
-
-	// Combine the vertices into triangles.
-	for (int tile_v = 0; tile_v < tess_v; ++tile_v) {
-		for (int tile_u = 0; tile_u < tess_u; ++tile_u) {
-			int idx0 = tile_v * (tess_u + 1) + tile_u;
-			int idx1 = tile_v * (tess_u + 1) + tile_u + 1;
-			int idx2 = (tile_v + 1) * (tess_u + 1) + tile_u;
-			int idx3 = (tile_v + 1) * (tess_u + 1) + tile_u + 1;
-
-			CopyQuadIndex(indices, primType, idx0, idx1, idx2, idx3);
-			count += 6;
-		}
-	}
-}
-
-void TessellateBezierPatch(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType) {
-	switch (g_Config.iSplineBezierQuality) {
-	case LOW_QUALITY:
-		_BezierPatchLowQuality(dest, indices, count, tess_u, tess_v, patch, origVertType);
-		break;
-	case MEDIUM_QUALITY:
-		_BezierPatchHighQuality(dest, indices, count, std::max(tess_u / 2, 1), std::max(tess_v / 2, 1), patch, origVertType);
-		break;
-	case HIGH_QUALITY:
-		_BezierPatchHighQuality(dest, indices, count, tess_u, tess_v, patch, origVertType);
-		break;
-	}
+void DrawEngineCommon::ClearSplineBezierWeights() { // Empties both weight caches; also invoked from ~DrawEngineCommon.
+	Bezier3DWeight::weightsCache.Clear(); // Static cache on Bezier3DWeight, not per-instance state.
+	Spline3DWeight::weightsCache.Clear(); // Static cache on Spline3DWeight, not per-instance state.
 }
 
 void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, int type_u, int type_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead) {
 	PROFILE_THIS_SCOPE("spline");
 	DispatchFlush();
 
+	// Real hardware seems to draw nothing when given < 4 either U or V.
+	if (count_u < 4 || count_v < 4)
+		return;
+
+	SimpleBufferManager managedBuf(decoded, DECODED_VERTEX_BUFFER_SIZE / 2);
+
+	int num_points = count_u * count_v;
 	u16 index_lower_bound = 0;
-	u16 index_upper_bound = count_u * count_v - 1;
-	IndexConverter idxConv(vertType, indices);
+	u16 index_upper_bound = num_points - 1;
+	IndexConverter ConvertIndex(vertType, indices);
 	if (indices)
-		GetIndexBounds(indices, count_u * count_v, vertType, &index_lower_bound, &index_upper_bound);
+		GetIndexBounds(indices, num_points, vertType, &index_lower_bound, &index_upper_bound);
 
 	VertexDecoder *origVDecoder = GetVertexDecoder((vertType & 0xFFFFFF) | (gstate.getUVGenMode() << 24));
-	*bytesRead = count_u * count_v * origVDecoder->VertexSize();
-
-	// Real hardware seems to draw nothing when given < 4 either U or V.
-	if (count_u < 4 || count_v < 4) {
-		return;
-	}
+	*bytesRead = num_points * origVDecoder->VertexSize();
 
 	// Simplify away bones and morph before proceeding
-	SimpleVertex *simplified_control_points = (SimpleVertex *)(decoded + 65536 * 12);
-	u8 *temp_buffer = decoded + 65536 * 18;
+	SimpleVertex *simplified_control_points = (SimpleVertex *)managedBuf.Allocate(sizeof(SimpleVertex) * (index_upper_bound + 1));
+	u8 *temp_buffer = managedBuf.Allocate(sizeof(SimpleVertex) * num_points);
 
 	u32 origVertType = vertType;
 	vertType = NormalizeVertices((u8 *)simplified_control_points, temp_buffer, (u8 *)control_points, index_lower_bound, index_upper_bound, vertType);
@@ -881,65 +522,35 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi
 		ERROR_LOG(G3D, "Something went really wrong, vertex size: %i vs %i", vertexSize, (int)sizeof(SimpleVertex));
 	}
 
-	// TODO: Do something less idiotic to manage this buffer
-	auto points = new const SimpleVertex *[count_u * count_v];
-
 	// Make an array of pointers to the control points, to get rid of indices.
-	for (int idx = 0; idx < count_u * count_v; idx++) {
-		points[idx] = simplified_control_points + (indices ? idxConv.convert(idx) : idx);
-	}
+	const SimpleVertex **points = (const SimpleVertex **)managedBuf.Allocate(sizeof(SimpleVertex *) * num_points);
+	for (int idx = 0; idx < num_points; idx++)
+		points[idx] = simplified_control_points + (indices ? ConvertIndex(idx) : idx);
 
-	int count = 0;
+	OutputBuffers output;
+	output.vertices = (SimpleVertex *)(decoded + DECODED_VERTEX_BUFFER_SIZE / 2);
+	output.indices = decIndex;
+	output.count = 0;
 
-	u8 *dest = splineBuffer;
-
-	SplinePatchLocal patch;
-	patch.tess_u = tess_u;
-	patch.tess_v = tess_v;
-	patch.type_u = type_u;
-	patch.type_v = type_v;
-	patch.count_u = count_u;
-	patch.count_v = count_v;
-	patch.points = points;
-	patch.computeNormals = computeNormals;
-	patch.primType = prim_type;
-	patch.patchFacing = patchFacing;
+	SplineSurface surface;
+	surface.tess_u = tess_u;
+	surface.tess_v = tess_v;
+	surface.type_u = type_u;
+	surface.type_v = type_v;
+	surface.num_points_u = count_u;
+	surface.num_points_v = count_v;
+	surface.num_patches_u = count_u - 3;
+	surface.num_patches_v = count_v - 3;
+	surface.primType = prim_type;
+	surface.patchFacing = patchFacing;
+	surface.Init(DECODED_VERTEX_BUFFER_SIZE / 2 / vertexSize);
 
 	if (CanUseHardwareTessellation(prim_type)) {
-		float *pos = (float*)(decoded + 65536 * 18); // Size 4 float
-		float *tex = pos + count_u * count_v * 4; // Size 4 float
-		float *col = tex + count_u * count_v * 4; // Size 4 float
-		const bool hasColor = (origVertType & GE_VTYPE_COL_MASK) != 0;
-		const bool hasTexCoords = (origVertType & GE_VTYPE_TC_MASK) != 0;
-
-		int posStride, texStride, colStride;
-		tessDataTransfer->PrepareBuffers(pos, tex, col, posStride, texStride, colStride, count_u * count_v, hasColor, hasTexCoords);
-		float *p = pos;
-		float *t = tex;
-		float *c = col;
-		for (int idx = 0; idx < count_u * count_v; idx++) {
-			memcpy(p, points[idx]->pos.AsArray(), 3 * sizeof(float));
-			p += posStride;
-			if (hasTexCoords) {
-				memcpy(t, points[idx]->uv, 2 * sizeof(float));
-				t += texStride;
-			}
-			if (hasColor) {
-				memcpy(c, Vec4f::FromRGBA(points[idx]->color_32).AsArray(), 4 * sizeof(float));
-				c += colStride;
-			}
-		}
-		if (!hasColor)
-			memcpy(col, Vec4f::FromRGBA(points[0]->color_32).AsArray(), 4 * sizeof(float));
-
-		tessDataTransfer->SendDataToShader(pos, tex, col, count_u * count_v, hasColor, hasTexCoords);
-		TessellateSplinePatchHardware(dest, quadIndices_, count, patch);
-		numPatches = (count_u - 3) * (count_v - 3);
+		HardwareTessellation(output, surface, origVertType, points, tessDataTransfer);
 	} else {
-		int maxVertexCount = SPLINE_BUFFER_SIZE / vertexSize;
-		TessellateSplinePatch(dest, quadIndices_, count, patch, origVertType, maxVertexCount);
+		ControlPoints cpoints(points, num_points, managedBuf);
+		SoftwareTessellation(output, surface, origVertType, cpoints);
 	}
-	delete[] points;
 
 	u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT;
 
@@ -956,7 +567,7 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi
 	uint32_t vertTypeID = GetVertTypeID(vertTypeWithIndex16, gstate.getUVGenMode());
 
 	int generatedBytesRead;
-	DispatchSubmitPrim(splineBuffer, quadIndices_, PatchPrimToPrim(prim_type), count, vertTypeID, &generatedBytesRead);
+	DispatchSubmitPrim(output.vertices, output.indices, PatchPrimToPrim(prim_type), output.count, vertTypeID, &generatedBytesRead);
 
 	DispatchFlush();
 
@@ -967,28 +578,29 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi
 
 void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead) {
 	PROFILE_THIS_SCOPE("bezier");
-
 	DispatchFlush();
 
-	u16 index_lower_bound = 0;
-	u16 index_upper_bound = count_u * count_v - 1;
-	IndexConverter idxConv(vertType, indices);
-	if (indices)
-		GetIndexBounds(indices, count_u*count_v, vertType, &index_lower_bound, &index_upper_bound);
-
-	VertexDecoder *origVDecoder = GetVertexDecoder((vertType & 0xFFFFFF) | (gstate.getUVGenMode() << 24));
-	*bytesRead = count_u * count_v * origVDecoder->VertexSize();
-
 	// Real hardware seems to draw nothing when given < 4 either U or V.
 	// This would result in num_patches_u / num_patches_v being 0.
-	if (count_u < 4 || count_v < 4) {
+	if (count_u < 4 || count_v < 4)
 		return;
-	}
+
+	SimpleBufferManager managedBuf(decoded, DECODED_VERTEX_BUFFER_SIZE / 2);
+
+	int num_points = count_u * count_v;
+	u16 index_lower_bound = 0;
+	u16 index_upper_bound = num_points - 1;
+	IndexConverter ConvertIndex(vertType, indices);
+	if (indices)
+		GetIndexBounds(indices, num_points, vertType, &index_lower_bound, &index_upper_bound);
+
+	VertexDecoder *origVDecoder = GetVertexDecoder((vertType & 0xFFFFFF) | (gstate.getUVGenMode() << 24));
+	*bytesRead = num_points * origVDecoder->VertexSize();
 
 	// Simplify away bones and morph before proceeding
 	// There are normally not a lot of control points so just splitting decoded should be reasonably safe, although not great.
-	SimpleVertex *simplified_control_points = (SimpleVertex *)(decoded + 65536 * 12);
-	u8 *temp_buffer = decoded + 65536 * 18;
+	SimpleVertex *simplified_control_points = (SimpleVertex *)managedBuf.Allocate(sizeof(SimpleVertex) * (index_upper_bound + 1));
+	u8 *temp_buffer = managedBuf.Allocate(sizeof(SimpleVertex) * num_points);
 
 	u32 origVertType = vertType;
 	vertType = NormalizeVertices((u8 *)simplified_control_points, temp_buffer, (u8 *)control_points, index_lower_bound, index_upper_bound, vertType);
@@ -1000,89 +612,32 @@ void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indi
 		ERROR_LOG(G3D, "Something went really wrong, vertex size: %i vs %i", vertexSize, (int)sizeof(SimpleVertex));
 	}
 
-	float *pos = (float*)(decoded + 65536 * 18); // Size 4 float
-	float *tex = pos + count_u * count_v * 4; // Size 4 float
-	float *col = tex + count_u * count_v * 4; // Size 4 float
-	const bool hasColor = (origVertType & GE_VTYPE_COL_MASK) != 0;
-	const bool hasTexCoords = (origVertType & GE_VTYPE_TC_MASK) != 0;
+	// Make an array of pointers to the control points, to get rid of indices.
+	const SimpleVertex **points = (const SimpleVertex **)managedBuf.Allocate(sizeof(SimpleVertex *) * num_points);
+	for (int idx = 0; idx < num_points; idx++)
+		points[idx] = simplified_control_points + (indices ? ConvertIndex(idx) : idx);
+
+	OutputBuffers output;
+	output.vertices = (SimpleVertex *)(decoded + DECODED_VERTEX_BUFFER_SIZE / 2);
+	output.indices = decIndex;
+	output.count = 0;
+
+	BezierSurface surface;
+	surface.tess_u = tess_u;
+	surface.tess_v = tess_v;
+	surface.num_points_u = count_u;
+	surface.num_points_v = count_v;
+	surface.num_patches_u = (count_u - 1) / 3;
+	surface.num_patches_v = (count_v - 1) / 3;
+	surface.primType = prim_type;
+	surface.patchFacing = patchFacing;
+	surface.Init(DECODED_VERTEX_BUFFER_SIZE / 2 / vertexSize);
 
-	// Bezier patches share less control points than spline patches. Otherwise they are pretty much the same (except bezier don't support the open/close thing)
-	int num_patches_u = (count_u - 1) / 3;
-	int num_patches_v = (count_v - 1) / 3;
-	BezierPatch *patches = nullptr;
 	if (CanUseHardwareTessellation(prim_type)) {
-		int posStride, texStride, colStride;
-		tessDataTransfer->PrepareBuffers(pos, tex, col, posStride, texStride, colStride, count_u * count_v, hasColor, hasTexCoords);
-		float *p = pos;
-		float *t = tex;
-		float *c = col;
-		for (int idx = 0; idx < count_u * count_v; idx++) {
-			const SimpleVertex *point = simplified_control_points + (indices ? idxConv.convert(idx) : idx);
-			memcpy(p, point->pos.AsArray(), 3 * sizeof(float));
-			p += posStride;
-			if (hasTexCoords) {
-				memcpy(t, point->uv, 2 * sizeof(float));
-				t += texStride;
-			}
-			if (hasColor) {
-				memcpy(c, Vec4f::FromRGBA(point->color_32).AsArray(), 4 * sizeof(float));
-				c += colStride;
-			}
-		}
-		if (!hasColor) {
-			const SimpleVertex *point = simplified_control_points + (indices ? idxConv.convert(0) : 0);
-			memcpy(col, Vec4f::FromRGBA(point->color_32).AsArray(), 4 * sizeof(float));
-		}
+		HardwareTessellation(output, surface, origVertType, points, tessDataTransfer);
 	} else {
-		patches = new BezierPatch[num_patches_u * num_patches_v];
-		for (int patch_u = 0; patch_u < num_patches_u; patch_u++) {
-			for (int patch_v = 0; patch_v < num_patches_v; patch_v++) {
-				BezierPatch& patch = patches[patch_u + patch_v * num_patches_u];
-				for (int point = 0; point < 16; ++point) {
-					int idx = (patch_u * 3 + point % 4) + (patch_v * 3 + point / 4) * count_u;
-					patch.points[point] = simplified_control_points + (indices ? idxConv.convert(idx) : idx);
-				}
-				patch.u_index = patch_u * 3;
-				patch.v_index = patch_v * 3;
-				patch.index = patch_v * num_patches_u + patch_u;
-				patch.primType = prim_type;
-				patch.computeNormals = computeNormals;
-				patch.patchFacing = patchFacing;
-			}
-		}
-	}
-
-	int count = 0;
-	u8 *dest = splineBuffer;
-
-	// We shouldn't really split up into separate 4x4 patches, instead we should do something that works
-	// like the splines, so we subdivide across the whole "mega-patch".
-
-	// If specified as 0, uses 1.
-	if (tess_u < 1) {
-		tess_u = 1;
-	}
-	if (tess_v < 1) {
-		tess_v = 1;
-	}
-
-	u16 *inds = quadIndices_;
-	if (CanUseHardwareTessellation(prim_type)) {
-		tessDataTransfer->SendDataToShader(pos, tex, col, count_u * count_v, hasColor, hasTexCoords);
-		TessellateBezierPatchHardware(dest, inds, count, tess_u, tess_v, prim_type);
-		numPatches = num_patches_u * num_patches_v;
-	} else {
-		int maxVertices = SPLINE_BUFFER_SIZE / vertexSize;
-		// Downsample until it fits, in case crazy tessellation factors are sent.
-		while ((tess_u + 1) * (tess_v + 1) * num_patches_u * num_patches_v > maxVertices) {
-			tess_u /= 2;
-			tess_v /= 2;
-		}
-		for (int patch_idx = 0; patch_idx < num_patches_u*num_patches_v; ++patch_idx) {
-			const BezierPatch &patch = patches[patch_idx];
-			TessellateBezierPatch(dest, inds, count, tess_u, tess_v, patch, origVertType);
-		}
-		delete[] patches;
+		ControlPoints cpoints(points, num_points, managedBuf);
+		SoftwareTessellation(output, surface, origVertType, cpoints);
 	}
 
 	u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT;
@@ -1099,7 +654,7 @@ void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indi
 
 	uint32_t vertTypeID = GetVertTypeID(vertTypeWithIndex16, gstate.getUVGenMode());
 	int generatedBytesRead;
-	DispatchSubmitPrim(splineBuffer, quadIndices_, PatchPrimToPrim(prim_type), count, vertTypeID, &generatedBytesRead);
+	DispatchSubmitPrim(output.vertices, output.indices, PatchPrimToPrim(prim_type), output.count, vertTypeID, &generatedBytesRead);
 
 	DispatchFlush();
 
diff --git a/GPU/Common/SplineCommon.h b/GPU/Common/SplineCommon.h
index 52bf75b69b..a6d82def4b 100644
--- a/GPU/Common/SplineCommon.h
+++ b/GPU/Common/SplineCommon.h
@@ -16,11 +16,15 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #pragma once
+#include <unordered_map>
 
 #include "Common/CommonTypes.h"
 #include "Common/Swap.h"
 #include "GPU/Math3D.h"
 #include "GPU/ge_constants.h"
+#include "Core/Config.h"
+
+#define HALF_CEIL(x) (((x) + 1) / 2) // Integer ceil = (int)ceil((float)x / 2.0f); argument and result are parenthesized so the macro is safe inside larger expressions.
 
 // PSP compatible format so we can use the end of the pipeline in beziers etc
 struct SimpleVertex {
@@ -33,32 +37,11 @@ struct SimpleVertex {
 	Vec3Packedf pos;
 };
 
-// We decode all vertices into a common format for easy interpolation and stuff.
-// Not fast but can be optimized later.
-struct BezierPatch {
-	const SimpleVertex *points[16];
+class SimpleBufferManager;
 
-	// These are used to generate UVs.
-	int u_index, v_index;
+namespace Spline {
 
-	int index;
-	GEPatchPrimType primType;
-	bool computeNormals;
-	bool patchFacing;
-};
-
-struct SplinePatchLocal {
-	const SimpleVertex **points;
-	int tess_u;
-	int tess_v;
-	int count_u;
-	int count_v;
-	int type_u;
-	int type_v;
-	bool computeNormals;
-	bool patchFacing;
-	GEPatchPrimType primType;
-};
+void BuildIndex(u16 *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total = 0);
 
 enum SplineQuality {
 	LOW_QUALITY = 0,
@@ -66,6 +49,207 @@ enum SplineQuality {
 	HIGH_QUALITY = 2,
 };
 
+class Bezier3DWeight;
+class Spline3DWeight;
+
+// We decode all vertices into a common format for easy interpolation and stuff.
+// Not fast but can be optimized later.
+
+struct SurfaceInfo { // Parameters shared by the Bezier and spline surface descriptions (base of BezierSurface / SplineSurface).
+	int tess_u, tess_v; // Tessellation factors; Init() forces them to at least 1 and then applies the quality setting.
+	int num_points_u, num_points_v; // presumably the control point grid dimensions — verify against caller
+	int num_patches_u, num_patches_v;
+	int type_u, type_v;
+	GEPatchPrimType primType;
+	bool patchFacing;
+
+	void Init() { // Normalizes tess_u / tess_v according to g_Config.iSplineBezierQuality.
+		// If specified as 0, uses 1.
+		if (tess_u < 1) tess_u = 1;
+		if (tess_v < 1) tess_v = 1;
+
+		switch (g_Config.iSplineBezierQuality) {
+		case LOW_QUALITY: // Fixed 2x2 tessellation, regardless of the requested factors.
+			tess_u = 2;
+			tess_v = 2;
+			break;
+		case MEDIUM_QUALITY: // Halve (rounding up) each factor that is above 2.
+			// Don't cut below 2, though.
+			if (tess_u > 2) tess_u = HALF_CEIL(tess_u);
+			if (tess_v > 2) tess_v = HALF_CEIL(tess_v);
+			break;
+		} // Any other quality value has no case here, so the clamped factors are kept as they are.
+	}
+};
+
+struct BezierSurface : public SurfaceInfo { // Bezier flavour of SurfaceInfo: every patch gets its own block of (tess_u + 1) * (tess_v + 1) vertices.
+	using WeightType = Bezier3DWeight;
+
+	int num_verts_per_patch; // (tess_u + 1) * (tess_v + 1); only valid after Init().
+
+	void Init(int maxVertices) { // Applies the quality setting, then reduces the factors until all patches fit in maxVertices.
+		SurfaceInfo::Init();
+		// Downsample until it fits, in case crazy tessellation factors are sent. Only the larger factor shrinks (both when equal), so neither ever goes negative.
+		while ((tess_u > 0 || tess_v > 0) && (tess_u + 1) * (tess_v + 1) * num_patches_u * num_patches_v > maxVertices) {
+			if (tess_u >= tess_v) tess_u--;
+			if (tess_v > tess_u) tess_v--; // Compared against the already-updated tess_u, so this also fires when the two started out equal.
+		}
+		num_verts_per_patch = (tess_u + 1) * (tess_v + 1);
+	}
+
+	int GetTessStart(int patch) const { return 0; } // Every patch starts at tile 0 (each patch has its own vertex range, see GetIndex).
+
+	int GetPointIndex(int patch_u, int patch_v) const { return patch_v * 3 * num_points_u + patch_u * 3; } // Patches step 3 points at a time through a row-major grid that is num_points_u wide.
+
+	int GetIndexU(int patch_u, int tile_u) const { return tile_u; }
+	int GetIndexV(int patch_v, int tile_v) const { return tile_v; }
+
+	int GetIndex(int index_u, int index_v, int patch_u, int patch_v) const { // Row-major index inside the patch, offset by the vertices of all preceding patches.
+		int patch_index = patch_v * num_patches_u + patch_u;
+		return index_v * (tess_u + 1) + index_u + num_verts_per_patch * patch_index;
+	}
+
+	void BuildIndex(u16 *indices, int &count) const { // Calls Spline::BuildIndex once per patch, writing at indices + count.
+		for (int patch_u = 0; patch_u < num_patches_u; ++patch_u) {
+			for (int patch_v = 0; patch_v < num_patches_v; ++patch_v) {
+				int patch_index = patch_v * num_patches_u + patch_u;
+				int total = patch_index * num_verts_per_patch; // First vertex of this patch; presumably used by Spline::BuildIndex as the base vertex offset — confirm.
+				Spline::BuildIndex(indices + count, count, tess_u, tess_v, primType, total);
+			}
+		}
+	}
+};
+
+struct SplineSurface : public SurfaceInfo { // Spline flavour of SurfaceInfo: all patches share one vertex grid that is num_vertices_u wide.
+	using WeightType = Spline3DWeight;
+
+	int num_vertices_u; // num_patches_u * tess_u + 1; only valid after Init().
+
+	void Init(int maxVertices) { // Applies the quality setting, then reduces the factors until the whole grid fits in maxVertices.
+		SurfaceInfo::Init();
+		// Downsample until it fits, in case crazy tessellation factors are sent. Only the larger factor shrinks (both when equal), so neither ever goes negative.
+		while ((tess_u > 0 || tess_v > 0) && (num_patches_u * tess_u + 1) * (num_patches_v * tess_v + 1) > maxVertices) {
+			if (tess_u >= tess_v) tess_u--;
+			if (tess_v > tess_u) tess_v--; // Compared against the already-updated tess_u, so this also fires when the two started out equal.
+		}
+		num_vertices_u = num_patches_u * tess_u + 1;
+	}
+
+	int GetTessStart(int patch) const { return (patch == 0) ? 0 : 1; } // Later patches skip tile 0: by GetIndexU/V it is the same vertex as the previous patch's last tile.
+
+	int GetPointIndex(int patch_u, int patch_v) const { return patch_v * num_points_u + patch_u; } // Patches step 1 point at a time through a row-major grid that is num_points_u wide.
+
+	int GetIndexU(int patch_u, int tile_u) const { return patch_u * tess_u + tile_u; }
+	int GetIndexV(int patch_v, int tile_v) const { return patch_v * tess_v + tile_v; }
+
+	int GetIndex(int index_u, int index_v, int patch_u, int patch_v) const { // Row-major index into the shared grid; the patch coordinates are not used.
+		return index_v * num_vertices_u + index_u;
+	}
+
+	void BuildIndex(u16 *indices, int &count) const { // One Spline::BuildIndex call covering the whole grid of num_patches * tess tiles per axis.
+		Spline::BuildIndex(indices, count, num_patches_u * tess_u, num_patches_v * tess_v, primType);
+	}
+};
+
+struct Weight {
+	float basis[4], deriv[4];
+};
+
+template<class T>
+class WeightCache : public T { // Memoizes the Weight arrays produced by T::CalcWeightsAll, keyed by a u32.
+private:
+	std::unordered_map<u32, Weight*> weightsCache; // Raw pointers; this class frees them only in Clear().
+public:
+	Weight* operator [] (u32 key) { // Returns the cached array for key, computing and storing it on first use.
+		Weight *&weights = weightsCache[key]; // The map's operator[] inserts a null entry for an unseen key.
+		if (!weights)
+			weights = T::CalcWeightsAll(key);
+		return weights;
+	}
+
+	void Clear() { // Frees every cached array and empties the map.
+		for (auto it : weightsCache)
+			delete[] it.second; // NOTE(review): assumes CalcWeightsAll allocates with new[] — confirm.
+		weightsCache.clear();
+	}
+};
+
+struct Weight2D { // Pair of weight arrays (one per axis) looked up from a WeightCache.
+	const Weight *u, *v;
+	int size_u = 0, size_v = 0; // Not set by the constructor; default to 0 so consumers that use them as buffer/copy sizes never read an indeterminate value.
+
+	template<class T>
+	Weight2D(WeightCache<T> &cache, u32 key_u, u32 key_v) { // Fetches (or computes) the weights for both keys through the cache.
+		u = cache[key_u];
+		v = (key_u != key_v) ? cache[key_v] : u; // Use same weights if u == v
+	}
+};
+
+struct ControlPoints {
+	Vec3f *pos;
+	Vec2f *tex;
+	Vec4f *col;
+	u32_le defcolor;
+
+	ControlPoints() {}
+	ControlPoints(const SimpleVertex *const *points, int size, SimpleBufferManager &managedBuf);
+	void Convert(const SimpleVertex *const *points, int size);
+};
+
+struct OutputBuffers {
+	SimpleVertex *vertices;
+	u16 *indices;
+	int count;
+};
+
+template<class Surface>
+void SoftwareTessellation(OutputBuffers &output, const Surface &surface, u32 origVertType, const ControlPoints &points);
+
+} // namespace Spline
+
 bool CanUseHardwareTessellation(GEPatchPrimType prim);
-void TessellateSplinePatch(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int maxVertices);
-void TessellateBezierPatch(u8 *&dest, u16 *&indices, int &count, int tess_u, int tess_v, const BezierPatch &patch, u32 origVertType);
+
+// Define function object for TemplateParameterDispatcher
+#define TEMPLATE_PARAMETER_DISPATCHER_FUNCTION(NAME, FUNCNAME, FUNCTYPE) \
+struct NAME { \
+	template<bool ...Params> \
+	static FUNCTYPE GetFunc() { \
+		return &FUNCNAME<Params...>; \
+	} \
+};
+
+template<typename Func, int NumParams, class Dispatcher> 
+class TemplateParameterDispatcher {
+
+	/* Store all combinations of template functions into an array */
+	template<int LoopCount, int Index = 0, bool ...Params>
+	struct Initializer {
+		static void Init(Func funcs[]) {
+			Initializer<LoopCount - 1, (Index << 1) + 1, true, Params...>::Init(funcs); // true
+			Initializer<LoopCount - 1, (Index << 1) + 0, false, Params...>::Init(funcs); // false
+		}
+	};
+ 	/* Specialization that terminates the recursive loop */
+	template<int Index, bool ...Params>
+	struct Initializer<0, Index, Params...> {
+		static void Init(Func funcs[]) {
+			funcs[Index] = Dispatcher::template GetFunc<Params...>(); // Resolve the nested dependent name as template function.
+		}
+	};
+
+private: 
+	Func funcs[1 << NumParams]; /* Function pointers array */ 
+public: 
+	TemplateParameterDispatcher() { 
+		Initializer<NumParams>::Init(funcs); 
+	} 
+ 
+	Func GetFunc(const bool params[]) const { 
+ 		/* Convert bool parameters to index of the array */ 
+		int index = 0; 
+		for (int i = 0; i < NumParams; ++i) 
+			index |= params[i] << i; 
+ 
+		return funcs[index]; 
+	} 
+};
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 97ea633c1e..209186fd96 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -102,7 +102,7 @@ public:
 		: indices(indices), indexType(vertType & GE_VTYPE_IDX_MASK) {
 	}
 
-	inline u32 convert(u32 index) const {
+	u32 operator() (u32 index) const {
 		switch (indexType) {
 		case GE_VTYPE_IDX_8BIT:
 			return indices8[index];
diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp
index eac5cf078f..e7752017dc 100644
--- a/GPU/D3D11/DrawEngineD3D11.cpp
+++ b/GPU/D3D11/DrawEngineD3D11.cpp
@@ -89,7 +89,6 @@ DrawEngineD3D11::DrawEngineD3D11(Draw::DrawContext *draw, ID3D11Device *device,
 	// All this is a LOT of memory, need to see if we can cut down somehow.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
-	splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 
 	indexGen.Setup(decIndex);
 
@@ -104,14 +103,14 @@ DrawEngineD3D11::~DrawEngineD3D11() {
 	DestroyDeviceObjects();
 	FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE);
-	FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE);
 }
 
 void DrawEngineD3D11::InitDeviceObjects() {
 	pushVerts_ = new PushBufferD3D11(device_, VERTEX_PUSH_SIZE, D3D11_BIND_VERTEX_BUFFER);
 	pushInds_ = new PushBufferD3D11(device_, INDEX_PUSH_SIZE, D3D11_BIND_INDEX_BUFFER);
 
-	tessDataTransfer = new TessellationDataTransferD3D11(context_, device_);
+	tessDataTransferD3D11 = new TessellationDataTransferD3D11(context_, device_);
+	tessDataTransfer = tessDataTransferD3D11;
 }
 
 void DrawEngineD3D11::ClearTrackedVertexArrays() {
@@ -137,7 +136,7 @@ void DrawEngineD3D11::Resized() {
 void DrawEngineD3D11::DestroyDeviceObjects() {
 	ClearTrackedVertexArrays();
 	ClearInputLayoutMap();
-	delete tessDataTransfer;
+	delete tessDataTransferD3D11;
 	delete pushVerts_;
 	delete pushInds_;
 	depthStencilCache_.Iterate([&](const uint64_t &key, ID3D11DepthStencilState *ds) {
@@ -539,10 +538,7 @@ rotateVBO:
 				memcpy(iptr, decIndex, iSize);
 				pushInds_->EndPush(context_);
 				context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset);
-				if (tess)
-					context_->DrawIndexedInstanced(vertexCount, numPatches, 0, 0, 0);
-				else
-					context_->DrawIndexed(vertexCount, 0, 0);
+				context_->DrawIndexed(vertexCount, 0, 0);
 			} else {
 				context_->Draw(vertexCount, 0);
 			}
@@ -551,10 +547,7 @@ rotateVBO:
 			context_->IASetVertexBuffers(0, 1, &vb_, &stride, &offset);
 			if (useElements) {
 				context_->IASetIndexBuffer(ib_, DXGI_FORMAT_R16_UINT, 0);
-				if (tess)
-					context_->DrawIndexedInstanced(vertexCount, numPatches, 0, 0, 0);
-				else
-					context_->DrawIndexed(vertexCount, 0, 0);
+				context_->DrawIndexed(vertexCount, 0, 0);
 			} else {
 				context_->Draw(vertexCount, 0);
 			}
@@ -692,38 +685,85 @@ rotateVBO:
 	GPUDebug::NotifyDraw();
 }
 
-void DrawEngineD3D11::TessellationDataTransferD3D11::PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) {
+TessellationDataTransferD3D11::TessellationDataTransferD3D11(ID3D11DeviceContext *context, ID3D11Device *device)
+	: context_(context), device_(device) {
+	desc.Usage = D3D11_USAGE_DYNAMIC;
+	desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+	desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+	desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
+}
+
+TessellationDataTransferD3D11::~TessellationDataTransferD3D11() {
+	for (int i = 0; i < 3; ++i) {
+		if (buf[i]) buf[i]->Release();
+		if (view[i]) view[i]->Release();
+	}
+}
+
+void TessellationDataTransferD3D11::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) {
 	struct TessData {
 		float pos[3]; float pad1;
 		float uv[2]; float pad2[2];
 		float color[4];
 	};
 
+	int size = size_u * size_v;
+
 	if (prevSize < size) {
 		prevSize = size;
-		if (buf) {
-			buf->Release();
-			view->Release();
-		}
+		if (buf[0]) buf[0]->Release();
+		if (view[0]) view[0]->Release();
+
 		desc.ByteWidth = size * sizeof(TessData);
 		desc.StructureByteStride = sizeof(TessData);
-
-		device_->CreateBuffer(&desc, nullptr, &buf);
-		device_->CreateShaderResourceView(buf, 0, &view);
-		context_->VSSetShaderResources(0, 1, &view);
+		device_->CreateBuffer(&desc, nullptr, &buf[0]);
+		device_->CreateShaderResourceView(buf[0], nullptr, &view[0]);
+		context_->VSSetShaderResources(0, 1, &view[0]);
 	}
 	D3D11_MAPPED_SUBRESOURCE map;
-	context_->Map(buf, 0, D3D11_MAP_WRITE_DISCARD, 0, &map);
+	context_->Map(buf[0], 0, D3D11_MAP_WRITE_DISCARD, 0, &map);
 	uint8_t *data = (uint8_t *)map.pData;
 
-	pos = (float *)(data);
-	tex = (float *)(data + offsetof(TessData, uv));
-	col = (float *)(data + offsetof(TessData, color));
-	posStride = sizeof(TessData) / sizeof(float);
-	colStride = hasColor ? (sizeof(TessData) / sizeof(float)) : 0;
-	texStride = sizeof(TessData) / sizeof(float);
-}
+	float *pos = (float *)(data);
+	float *tex = (float *)(data + offsetof(TessData, uv));
+	float *col = (float *)(data + offsetof(TessData, color));
+	int stride = sizeof(TessData) / sizeof(float);
 
-void DrawEngineD3D11::TessellationDataTransferD3D11::SendDataToShader(const float * pos, const float * tex, const float * col, int size, bool hasColor, bool hasTexCoords) {
-	context_->Unmap(buf, 0);
+	CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType);
+
+	context_->Unmap(buf[0], 0);
+
+	using Spline::Weight;
+
+	// Weights U
+	if (prevSizeWU < weights.size_u) {
+		prevSizeWU = weights.size_u;
+		if (buf[1]) buf[1]->Release();
+		if (view[1]) view[1]->Release();
+
+		desc.ByteWidth = weights.size_u * sizeof(Weight);
+		desc.StructureByteStride = sizeof(Weight);
+		device_->CreateBuffer(&desc, nullptr, &buf[1]);
+		device_->CreateShaderResourceView(buf[1], nullptr, &view[1]);
+		context_->VSSetShaderResources(1, 1, &view[1]);
+	}
+	context_->Map(buf[1], 0, D3D11_MAP_WRITE_DISCARD, 0, &map);
+	memcpy(map.pData, weights.u, weights.size_u * sizeof(Weight));
+	context_->Unmap(buf[1], 0);
+
+	// Weights V
+	if (prevSizeWV < weights.size_v) {
+		prevSizeWV = weights.size_v;
+		if (buf[2]) buf[2]->Release();
+		if (view[2]) view[2]->Release();
+
+		desc.ByteWidth = weights.size_v * sizeof(Weight);
+		desc.StructureByteStride = sizeof(Weight);
+		device_->CreateBuffer(&desc, nullptr, &buf[2]);
+		device_->CreateShaderResourceView(buf[2], nullptr, &view[2]);
+		context_->VSSetShaderResources(2, 1, &view[2]);
+	}
+	context_->Map(buf[2], 0, D3D11_MAP_WRITE_DISCARD, 0, &map);
+	memcpy(map.pData, weights.v, weights.size_v * sizeof(Weight));
+	context_->Unmap(buf[2], 0);
 }
diff --git a/GPU/D3D11/DrawEngineD3D11.h b/GPU/D3D11/DrawEngineD3D11.h
index 8b797a3d2d..11ed3e41e9 100644
--- a/GPU/D3D11/DrawEngineD3D11.h
+++ b/GPU/D3D11/DrawEngineD3D11.h
@@ -99,6 +99,22 @@ public:
 	u8 flags;
 };
 
+class TessellationDataTransferD3D11 : public TessellationDataTransfer {
+private:
+	ID3D11DeviceContext *context_;
+	ID3D11Device *device_;
+	ID3D11Buffer *buf[3]{};
+	ID3D11ShaderResourceView *view[3]{};
+	D3D11_BUFFER_DESC desc{};
+	int prevSize = 0;
+	int prevSizeWU = 0, prevSizeWV = 0;
+public:
+	TessellationDataTransferD3D11(ID3D11DeviceContext *context, ID3D11Device *device);
+	~TessellationDataTransferD3D11();
+	// Send spline/bezier's control points and weights to vertex shader through structured shader buffer.
+	void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override;
+};
+
 // Handles transform, lighting and drawing.
 class DrawEngineD3D11 : public DrawEngineCommon {
 public:
@@ -199,29 +215,5 @@ private:
 	D3D11DynamicState dynState_{};
 
 	// Hardware tessellation
-	class TessellationDataTransferD3D11 : public TessellationDataTransfer {
-	private:
-		ID3D11DeviceContext *context_;
-		ID3D11Device *device_;
-		ID3D11Buffer *buf;
-		ID3D11ShaderResourceView *view;
-		D3D11_BUFFER_DESC desc;
-	public:
-		TessellationDataTransferD3D11(ID3D11DeviceContext *context, ID3D11Device *device)
-			: TessellationDataTransfer(), context_(context), device_(device), buf(), view(), desc() {
-			desc.Usage = D3D11_USAGE_DYNAMIC;
-			desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
-			desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
-			desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
-		}
-		~TessellationDataTransferD3D11() {
-			if (buf) {
-				buf->Release();
-				view->Release();
-			}
-		}
-
-		void PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) override;
-		void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override;
-	};
+	TessellationDataTransferD3D11 *tessDataTransferD3D11;
 };
diff --git a/GPU/Directx9/DrawEngineDX9.cpp b/GPU/Directx9/DrawEngineDX9.cpp
index e4f24703ee..0f1c0b62d3 100644
--- a/GPU/Directx9/DrawEngineDX9.cpp
+++ b/GPU/Directx9/DrawEngineDX9.cpp
@@ -95,13 +95,13 @@ DrawEngineDX9::DrawEngineDX9(Draw::DrawContext *draw) : vai_(256), vertexDeclMap
 	// All this is a LOT of memory, need to see if we can cut down somehow.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
-	splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 
 	indexGen.Setup(decIndex);
 
 	InitDeviceObjects();
 
-	tessDataTransfer = new TessellationDataTransferDX9();
+	tessDataTransferDX9 = new TessellationDataTransferDX9();
+	tessDataTransfer = tessDataTransferDX9;
 
 	device_->CreateVertexDeclaration(TransformedVertexElements, &transformedVertexDecl_);
 }
@@ -114,14 +114,13 @@ DrawEngineDX9::~DrawEngineDX9() {
 	DestroyDeviceObjects();
 	FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE);
-	FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE);
 	vertexDeclMap_.Iterate([&](const uint32_t &key, IDirect3DVertexDeclaration9 *decl) {
 		if (decl) {
 			decl->Release();
 		}
 	});
 	vertexDeclMap_.Clear();
-	delete tessDataTransfer;
+	delete tessDataTransferDX9;
 }
 
 void DrawEngineDX9::InitDeviceObjects() {
@@ -624,8 +623,8 @@ rotateVBO:
 	GPUDebug::NotifyDraw();
 }
 
-void DrawEngineDX9::TessellationDataTransferDX9::SendDataToShader(const float * pos, const float * tex, const float * col, int size, bool hasColor, bool hasTexCoords)
-{
+void TessellationDataTransferDX9::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) {
+	// TODO
 }
 
 }  // namespace
diff --git a/GPU/Directx9/DrawEngineDX9.h b/GPU/Directx9/DrawEngineDX9.h
index ef015b02b5..f1c80eea51 100644
--- a/GPU/Directx9/DrawEngineDX9.h
+++ b/GPU/Directx9/DrawEngineDX9.h
@@ -97,6 +97,13 @@ public:
 	u8 flags;
 };
 
+class TessellationDataTransferDX9 : public TessellationDataTransfer {
+public:
+	TessellationDataTransferDX9() {}
+	~TessellationDataTransferDX9() {}
+	void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override;
+};
+
 // Handles transform, lighting and drawing.
 class DrawEngineDX9 : public DrawEngineCommon {
 public:
@@ -158,16 +165,7 @@ private:
 	FramebufferManagerDX9 *framebufferManager_ = nullptr;
 
 	// Hardware tessellation
-	class TessellationDataTransferDX9 : public TessellationDataTransfer {
-	private:
-		int data_tex[3];
-	public:
-		TessellationDataTransferDX9() : TessellationDataTransfer(), data_tex() {
-		}
-		~TessellationDataTransferDX9() {
-		}
-		void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override;
-	};
+	TessellationDataTransferDX9 *tessDataTransferDX9;
 };
 
 }  // namespace
diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
index 51fe036f59..53ef880c17 100644
--- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
@@ -86,6 +86,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 	bool doSpline = id.Bit(VS_BIT_SPLINE);
 	bool hasColorTess = id.Bit(VS_BIT_HAS_COLOR_TESS);
 	bool hasTexcoordTess = id.Bit(VS_BIT_HAS_TEXCOORD_TESS);
+	bool hasNormalTess = id.Bit(VS_BIT_HAS_NORMAL_TESS);
 	bool flipNormalTess = id.Bit(VS_BIT_NORM_REVERSE_TESS);
 
 	DoLightComputation doLight[4] = { LIGHT_OFF, LIGHT_OFF, LIGHT_OFF, LIGHT_OFF };
@@ -271,82 +272,90 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 			WRITE(p, "  float3 pos; float pad1;\n");
 			WRITE(p, "  float2 tex; float2 pad2;\n");
 			WRITE(p, "  float4 col;\n");
-			WRITE(p, "};");
+			WRITE(p, "};\n");
 			WRITE(p, "StructuredBuffer<TessData> tess_data : register(t0);\n");
+
+			WRITE(p, "struct TessWeight {\n");
+			WRITE(p, "  float4 basis;\n");
+			WRITE(p, "  float4 deriv;\n");
+			WRITE(p, "};\n");
+			WRITE(p, "StructuredBuffer<TessWeight> tess_weights_u : register(t1);\n");
+			WRITE(p, "StructuredBuffer<TessWeight> tess_weights_v : register(t2);\n");
 		}
 
 		const char *init[3] = { "0.0, 0.0", "0.0, 0.0, 0.0", "0.0, 0.0, 0.0, 0.0" };
 		for (int i = 2; i <= 4; i++) {
 			// Define 3 types float2, float3, float4
-			WRITE(p, "float%d tess_sample(in float%d points[16], in float2 weights[4]) {\n", i, i);
+			WRITE(p, "float%d tess_sample(in float%d points[16], float4x4 weights) {\n", i, i);
 			WRITE(p, "  float%d pos = float%d(%s);\n", i, i, init[i - 2]);
-			WRITE(p, "  for (int i = 0; i < 4; ++i) {\n");
-			WRITE(p, "    for (int j = 0; j < 4; ++j) {\n");
-			WRITE(p, "      float f = weights[j].x * weights[i].y;\n");
-			WRITE(p, "      if (f != 0.0)\n");
-			WRITE(p, "        pos = pos + f * points[i * 4 + j];\n");
-			WRITE(p, "    }\n");
-			WRITE(p, "  }\n");
+			for (int v = 0; v < 4; ++v) {
+				for (int u = 0; u < 4; ++u) {
+					WRITE(p, "  pos += weights[%i][%i] * points[%i];\n", v, u, v * 4 + u);
+				}
+			}
 			WRITE(p, "  return pos;\n");
 			WRITE(p, "}\n");
 		}
-		if (doSpline) {
-			WRITE(p, "void spline_knot(int2 num_patches, int2 type, out float2 knot[6], int2 patch_pos) {\n");
-			WRITE(p, "  for (int i = 0; i < 6; ++i) {\n");
-			WRITE(p, "    knot[i] = float2(i + patch_pos.x - 2, i + patch_pos.y - 2);\n");
-			WRITE(p, "  }\n");
-		//	WRITE(p, "  if ((type.x & 1) != 0) {\n");
-			WRITE(p, "  if ((type.x == 1) || (type.x == 3)) {\n");
-			WRITE(p, "    if (patch_pos.x <= 2)\n");
-			WRITE(p, "      knot[0].x = 0.0;\n");
-			WRITE(p, "    if (patch_pos.x <= 1)\n");
-			WRITE(p, "      knot[1].x = 0.0;\n");
-			WRITE(p, "  }\n");
-		//	WRITE(p, "  if ((type.x & 2) != 0) {\n");
-			WRITE(p, "  if ((type.x == 2) || (type.x == 3)) {\n");
-			WRITE(p, "    if (patch_pos.x >= (num_patches.x - 2))\n");
-			WRITE(p, "      knot[5].x = num_patches.x;\n");
-			WRITE(p, "    if (patch_pos.x == (num_patches.x - 1))\n");
-			WRITE(p, "      knot[4].x = num_patches.x;\n");
-			WRITE(p, "  }\n");
-		//	WRITE(p, "  if ((type.y & 1) != 0) {\n");
-			WRITE(p, "  if ((type.y == 1) || (type.y == 3)) {\n");
-			WRITE(p, "    if (patch_pos.y <= 2)\n");
-			WRITE(p, "      knot[0].y = 0.0;\n");
-			WRITE(p, "    if (patch_pos.y <= 1)\n");
-			WRITE(p, "      knot[1].y = 0.0;\n");
-			WRITE(p, "  }\n");
-		//	WRITE(p, "  if ((type.y & 2) != 0) {\n");
-			WRITE(p, "  if ((type.y == 2) || (type.y == 3)) {\n");
-			WRITE(p, "    if (patch_pos.y >= (num_patches.y - 2))\n");
-			WRITE(p, "      knot[5].y = num_patches.y;\n");
-			WRITE(p, "    if (patch_pos.y == (num_patches.y - 1))\n");
-			WRITE(p, "      knot[4].y = num_patches.y;\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "}\n");
 
-			WRITE(p, "void spline_weight(float2 t, in float2 knot[6], out float2 weights[4]) {\n");
-			// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
-			WRITE(p, "  float2 t0 = (t - knot[0]);\n");
-			WRITE(p, "  float2 t1 = (t - knot[1]);\n");
-			WRITE(p, "  float2 t2 = (t - knot[2]);\n");
-			// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
-			WRITE(p, "  float2 f30 = t0 / (knot[3] - knot[0]);\n");
-			WRITE(p, "  float2 f41 = t1 / (knot[4] - knot[1]);\n");
-			WRITE(p, "  float2 f52 = t2 / (knot[5] - knot[2]);\n");
-			WRITE(p, "  float2 f31 = t1 / (knot[3] - knot[1]);\n");
-			WRITE(p, "  float2 f42 = t2 / (knot[4] - knot[2]);\n");
-			WRITE(p, "  float2 f32 = t2 / (knot[3] - knot[2]);\n");
-			WRITE(p, "  float2 a = (1.0 - f30)*(1.0 - f31);\n");
-			WRITE(p, "  float2 b = (f31*f41);\n");
-			WRITE(p, "  float2 c = (1.0 - f41)*(1.0 - f42);\n");
-			WRITE(p, "  float2 d = (f42*f52);\n");
-			WRITE(p, "  weights[0] = a - (a*f32);\n");
-			WRITE(p, "  weights[1] = 1.0 - a - b + ((a + b + c - 1.0)*f32);\n");
-			WRITE(p, "  weights[2] = b + ((1.0 - b - c - d)*f32);\n");
-			WRITE(p, "  weights[3] = d*f32;\n");
-			WRITE(p, "}\n");
+		WRITE(p, "float4x4 outerProduct(float4 u, float4 v) {\n");
+		WRITE(p, "  return mul((float4x1)v, (float1x4)u);\n");
+		WRITE(p, "}\n");
+
+		WRITE(p, "struct Tess {\n");
+		WRITE(p, "  float3 pos;\n");
+		if (doTexture)
+			WRITE(p, "  float2 tex;\n");
+		WRITE(p, "  float4 col;\n");
+		if (hasNormalTess)
+			WRITE(p, "  float3 nrm;\n");
+		WRITE(p, "};\n");
+
+		WRITE(p, "void tessellate(in VS_IN In, out Tess tess) {\n");
+		WRITE(p, "  int2 point_pos = int2(In.position.z, In.normal.z)%s;\n", doBezier ? " * 3" : "");
+		WRITE(p, "  int2 weight_idx = int2(In.position.xy);\n");
+		// Load 4x4 control points
+		WRITE(p, "  float3 _pos[16];\n");
+		WRITE(p, "  float2 _tex[16];\n");
+		WRITE(p, "  float4 _col[16];\n");
+		WRITE(p, "  int index;\n");
+		for (int i = 0; i < 4; i++) {
+			for (int j = 0; j < 4; j++) {
+				WRITE(p, "  index = (%i + point_pos.y) * u_spline_counts + (%i + point_pos.x);\n", i, j);
+				WRITE(p, "  _pos[%i] = tess_data[index].pos;\n", i * 4 + j);
+				if (doTexture && hasTexcoordTess)
+					WRITE(p, "  _tex[%i] = tess_data[index].tex;\n", i * 4 + j);
+				if (hasColorTess)
+					WRITE(p, "  _col[%i] = tess_data[index].col;\n", i * 4 + j);
+			}
 		}
+
+		// Basis polynomials as weight coefficients
+		WRITE(p, "  float4 basis_u = tess_weights_u[weight_idx.x].basis;\n");
+		WRITE(p, "  float4 basis_v = tess_weights_v[weight_idx.y].basis;\n");
+		WRITE(p, "  float4x4 basis = outerProduct(basis_u, basis_v);\n");
+
+		// Tessellate
+		WRITE(p, "  tess.pos = tess_sample(_pos, basis);\n");
+		if (doTexture) {
+			if (hasTexcoordTess)
+				WRITE(p, "  tess.tex = tess_sample(_tex, basis);\n");
+			else
+				WRITE(p, "  tess.tex = In.normal.xy;\n");
+		}
+		if (hasColorTess)
+			WRITE(p, "  tess.col = tess_sample(_col, basis);\n");
+		else
+			WRITE(p, "  tess.col = u_matambientalpha;\n");
+		if (hasNormalTess) {
+			// Derivatives as weight coefficients
+			WRITE(p, "  float4 deriv_u = tess_weights_u[weight_idx.x].deriv;\n");
+			WRITE(p, "  float4 deriv_v = tess_weights_v[weight_idx.y].deriv;\n");
+
+			WRITE(p, "  float3 du = tess_sample(_pos, outerProduct(deriv_u, basis_v));\n");
+			WRITE(p, "  float3 dv = tess_sample(_pos, outerProduct(basis_u, deriv_v));\n");
+			WRITE(p, "  tess.nrm = normalize(cross(du, dv));\n");
+		}
+		WRITE(p, "}\n");
 	}
 
 	WRITE(p, "VS_OUT main(VS_IN In) {\n");
@@ -396,106 +405,14 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 	}  else {
 		// Step 1: World Transform / Skinning
 		if (!enableBones) {
-			// Hardware tessellation
 			if (doSpline || doBezier) {
-				WRITE(p, "  uint u_spline_count_u = u_spline_counts & 0xFF;\n");
-				WRITE(p, "  uint u_spline_count_v = (u_spline_counts >> 8) & 0xFF;\n");
-				WRITE(p, "  uint num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3u" : "u_spline_count_u - 3");
-				WRITE(p, "  float2 tess_pos = In.position.xy;\n");
-				WRITE(p, "  int u = In.instanceId %% num_patches_u;\n");
-				WRITE(p, "  int v = In.instanceId / num_patches_u;\n");
-				WRITE(p, "  int2 patch_pos = int2(u, v);\n");
-				WRITE(p, "  float3 _pos[16];\n");
-				WRITE(p, "  float2 _tex[16];\n");
-				WRITE(p, "  float4 _col[16];\n");
-				WRITE(p, "  int index;\n");
-				for (int i = 0; i < 4; i++) {
-					for (int j = 0; j < 4; j++) {
-						WRITE(p, "  index = (%i + v%s) * u_spline_count_u + (%i + u%s);\n", i, doBezier ? " * 3" : "", j, doBezier ? " * 3" : "");
-						WRITE(p, "  _pos[%i] = tess_data[index].pos;\n", i * 4 + j);
-						if (doTexture && hasTexcoord && hasTexcoordTess)
-							WRITE(p, "  _tex[%i] = tess_data[index].tex;\n", i * 4 + j);
-						if (hasColor && hasColorTess)
-							WRITE(p, "  _col[%i] = tess_data[index].col;\n", i * 4 + j);
-					}
-				}
-				WRITE(p, "  float2 weights[4];\n");
-				if (doBezier) {
-					// Bernstein 3D
-					WRITE(p, "  weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-					WRITE(p, "  weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-					WRITE(p, "  weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n");
-					WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
-				} else if (doSpline) {
-					WRITE(p, "  int2 spline_num_patches = int2(u_spline_count_u - 3, u_spline_count_v - 3);\n");
-					WRITE(p, "  int u_spline_type_u = (u_spline_counts >> 16) & 0xFF;\n");
-					WRITE(p, "  int u_spline_type_v = (u_spline_counts >> 24) & 0xFF;\n");
-					WRITE(p, "  int2 spline_type = int2(u_spline_type_u, u_spline_type_v);\n");
-					WRITE(p, "  float2 knots[6];\n");
-					WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
-					WRITE(p, "  spline_weight(tess_pos + patch_pos, knots, weights);\n");
-				}
-				WRITE(p, "  float3 pos = tess_sample(_pos, weights);\n");
-				if (doTexture && hasTexcoord) {
-					if (hasTexcoordTess)
-						WRITE(p, "  float2 tex = tess_sample(_tex, weights);\n");
-					else
-						WRITE(p, "  float2 tex = tess_pos + patch_pos;\n");
-				}
-				if (hasColor) {
-					if (hasColorTess)
-						WRITE(p, "  float4 col = tess_sample(_col, weights);\n");
-					else
-						WRITE(p, "  float4 col = tess_data[0].col;\n");
-				}
-				if (hasNormal) {
-					// Curved surface is probably always need to compute normal(not sampling from control points)
-					if (doBezier) {
-						// Bernstein derivative
-						WRITE(p, "  float2 bernderiv[4];\n");
-						WRITE(p, "  bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n");
-						WRITE(p, "  bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n");
-						WRITE(p, "  bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n");
-						WRITE(p, "  bernderiv[3] = 3.0 * tess_pos * tess_pos; \n");
+				// Hardware tessellation
+				WRITE(p, "  Tess tess;\n");
+				WRITE(p, "  tessellate(In, tess);\n");
 
-						WRITE(p, "  float2 bernderiv_u[4];\n");
-						WRITE(p, "  float2 bernderiv_v[4];\n");
-						WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-						WRITE(p, "    bernderiv_u[i] = float2(bernderiv[i].x, weights[i].y);\n");
-						WRITE(p, "    bernderiv_v[i] = float2(weights[i].x, bernderiv[i].y);\n");
-						WRITE(p, "  }\n");
-
-						WRITE(p, "  float3 du = tess_sample(_pos, bernderiv_u);\n");
-						WRITE(p, "  float3 dv = tess_sample(_pos, bernderiv_v);\n");
-					} else if (doSpline) {
-						WRITE(p, "  float2 tess_next_u = float2(In.normal.x, 0.0);\n");
-						WRITE(p, "  float2 tess_next_v = float2(0.0, In.normal.y);\n");
-						// Right
-						WRITE(p, "  float2 tess_pos_r = tess_pos + tess_next_u;\n");
-						WRITE(p, "  spline_weight(tess_pos_r + patch_pos, knots, weights);\n");
-						WRITE(p, "  float3 pos_r = tess_sample(_pos, weights);\n");
-						// Left
-						WRITE(p, "  float2 tess_pos_l = tess_pos - tess_next_u;\n");
-						WRITE(p, "  spline_weight(tess_pos_l + patch_pos, knots, weights);\n");
-						WRITE(p, "  float3 pos_l = tess_sample(_pos, weights);\n");
-						// Down
-						WRITE(p, "  float2 tess_pos_d = tess_pos + tess_next_v;\n");
-						WRITE(p, "  spline_weight(tess_pos_d + patch_pos, knots, weights);\n");
-						WRITE(p, "  float3 pos_d = tess_sample(_pos, weights);\n");
-						// Up
-						WRITE(p, "  float2 tess_pos_u = tess_pos - tess_next_v;\n");
-						WRITE(p, "  spline_weight(tess_pos_u + patch_pos, knots, weights);\n");
-						WRITE(p, "  float3 pos_u = tess_sample(_pos, weights);\n");
-
-						WRITE(p, "  float3 du = pos_r - pos_l;\n");
-						WRITE(p, "  float3 dv = pos_d - pos_u;\n");
-					}
-					WRITE(p, "  float3 nrm = cross(du, dv);\n");
-					WRITE(p, "  nrm = normalize(nrm);\n");
-				}
-				WRITE(p, "  float3 worldpos = mul(float4(pos.xyz, 1.0), u_world);\n");
-				if (hasNormal)
-					WRITE(p, "  float3 worldnormal = normalize(mul(float4(%snrm, 0.0), u_world));\n", flipNormalTess ? "-" : "");
+				WRITE(p, "  float3 worldpos = mul(float4(tess.pos.xyz, 1.0), u_world);\n");
+				if (hasNormalTess)
+					WRITE(p, "  float3 worldnormal = normalize(mul(float4(%stess.nrm, 0.0), u_world));\n", flipNormalTess ? "-" : "");
 				else
 					WRITE(p, "  float3 worldnormal = float3(0.0, 0.0, 1.0);\n");
 			} else {
@@ -600,9 +517,10 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		const char *diffuseStr = (matUpdate & 2) && hasColor ? "In.color0.rgb" : "u_matdiffuse";
 		const char *specularStr = (matUpdate & 4) && hasColor ? "In.color0.rgb" : "u_matspecular.rgb";
 		if (doBezier || doSpline) {
-			ambientStr = (matUpdate & 1) && hasColor ? "col" : "u_matambientalpha";
-			diffuseStr = (matUpdate & 2) && hasColor ? "col.rgb" : "u_matdiffuse";
-			specularStr = (matUpdate & 4) && hasColor ? "col.rgb" : "u_matspecular.rgb";
+			// TODO: Should probably use hasColorTess instead of hasColor, but FF4 has a problem with drawing the background.
+			ambientStr = (matUpdate & 1) && hasColor ? "tess.col" : "u_matambientalpha";
+			diffuseStr = (matUpdate & 2) && hasColor ? "tess.col.rgb" : "u_matdiffuse";
+			specularStr = (matUpdate & 4) && hasColor ? "tess.col.rgb" : "u_matspecular.rgb";
 		}
 
 		bool diffuseIsZero = true;
@@ -729,7 +647,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 			// Lighting doesn't affect color.
 			if (hasColor) {
 				if (doBezier || doSpline)
-					WRITE(p, "  Out.v_color0 = col;\n");
+					WRITE(p, "  Out.v_color0 = tess.col;\n");
 				else
 					WRITE(p, "  Out.v_color0 = In.color0;\n");
 			} else {
@@ -747,7 +665,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 				if (scaleUV) {
 					if (hasTexcoord) {
 						if (doBezier || doSpline)
-							WRITE(p, "  Out.v_texcoord = float3(tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
+							WRITE(p, "  Out.v_texcoord = float3(tess.tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
 						else
 							WRITE(p, "  Out.v_texcoord = float3(In.texcoord.xy * u_uvscaleoffset.xy, 0.0);\n");
 					} else {
@@ -755,10 +673,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 					}
 				} else {
 					if (hasTexcoord) {
-						if (doBezier || doSpline)
-							WRITE(p, "  Out.v_texcoord = float3(tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
-						else
-							WRITE(p, "  Out.v_texcoord = float3(In.texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
+						WRITE(p, "  Out.v_texcoord = float3(In.texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
 					} else {
 						WRITE(p, "  Out.v_texcoord = float3(u_uvscaleoffset.zw, 0.0);\n");
 					}
diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp
index a8857b17bc..a2bb108860 100644
--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@@ -81,22 +81,21 @@ DrawEngineGLES::DrawEngineGLES(Draw::DrawContext *draw) : vai_(256), draw_(draw)
 	// All this is a LOT of memory, need to see if we can cut down somehow.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
-	splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 
 	indexGen.Setup(decIndex);
 
 	InitDeviceObjects();
 
-	tessDataTransfer = new TessellationDataTransferGLES(render_);
+	tessDataTransferGLES = new TessellationDataTransferGLES(render_);
+	tessDataTransfer = tessDataTransferGLES;
 }
 
 DrawEngineGLES::~DrawEngineGLES() {
 	DestroyDeviceObjects();
 	FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE);
-	FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE);
 
-	delete tessDataTransfer;
+	delete tessDataTransferGLES;
 }
 
 void DrawEngineGLES::DeviceLost() {
@@ -166,7 +165,7 @@ void DrawEngineGLES::EndFrame() {
 	FrameData &frameData = frameData_[render_->GetCurFrame()];
 	render_->EndPushBuffer(frameData.pushIndex);
 	render_->EndPushBuffer(frameData.pushVertex);
-	tessDataTransfer->EndFrame();
+	tessDataTransferGLES->EndFrame();
 }
 
 struct GlTypeInfo {
@@ -520,10 +519,7 @@ rotateVBO:
 				indexBufferOffset = (uint32_t)frameData.pushIndex->Push(decIndex, sizeof(uint16_t) * indexGen.VertexCount(), &indexBuffer);
 				render_->BindIndexBuffer(indexBuffer);
 			}
-			if (gstate_c.bezier || gstate_c.spline)
-				render_->DrawIndexed(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset, numPatches);
-			else
-				render_->DrawIndexed(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset);
+			render_->DrawIndexed(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset);
 		} else {
 			render_->Draw(glprim[prim], 0, vertexCount);
 		}
@@ -655,46 +651,66 @@ bool DrawEngineGLES::IsCodePtrVertexDecoder(const u8 *ptr) const {
 	return decJitCache_->IsInSpace(ptr);
 }
 
-void DrawEngineGLES::TessellationDataTransferGLES::SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) {
+void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) {
+	bool hasColor = (vertType & GE_VTYPE_COL_MASK) != 0;
+	bool hasTexCoord = (vertType & GE_VTYPE_TC_MASK) != 0;
+
+	int size = size_u * size_v;
+	float *pos = new float[size * 4];
+	float *tex = hasTexCoord ? new float[size * 4] : nullptr;
+	float *col = hasColor ? new float[size * 4] : nullptr;
+	int stride = 4;
+
+	CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType);
 	// Removed the 1D texture support, it's unlikely to be relevant for performance.
-	if (data_tex[0])
-		renderManager_->DeleteTexture(data_tex[0]);
-	uint8_t *pos_data = new uint8_t[size * sizeof(float) * 4];
-	memcpy(pos_data, pos, size * sizeof(float) * 4);
-	data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D);
-	renderManager_->TextureImage(data_tex[0], 0, size, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, pos_data, GLRAllocType::NEW, false);
-	renderManager_->FinalizeTexture(data_tex[0], 0, false);
-	renderManager_->BindTexture(TEX_SLOT_SPLINE_POS, data_tex[0]);
-
-	// Texcoords
-	if (hasTexCoords) {
-		if (data_tex[1])
-			renderManager_->DeleteTexture(data_tex[1]);
-		uint8_t *tex_data = new uint8_t[size * sizeof(float) * 4];
-		memcpy(tex_data, tex, size * sizeof(float) * 4);
-		data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D);
-		renderManager_->TextureImage(data_tex[1], 0, size, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, tex_data, GLRAllocType::NEW, false);
-		renderManager_->FinalizeTexture(data_tex[1], 0, false);
-		renderManager_->BindTexture(TEX_SLOT_SPLINE_NRM, data_tex[1]);
+	// Control Points
+	if (prevSizeU < size_u || prevSizeV < size_v) {
+		prevSizeU = size_u;
+		prevSizeV = size_v;
+		if (!data_tex[0])
+			data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D);
+		renderManager_->TextureImage(data_tex[0], 0, size_u * 3, size_v, GL_RGBA32F, GL_RGBA, GL_FLOAT, nullptr, GLRAllocType::NONE, false);
+		renderManager_->FinalizeTexture(data_tex[0], 0, false);
 	}
+	renderManager_->BindTexture(TEX_SLOT_SPLINE_POINTS, data_tex[0]);
+	// Position
+	renderManager_->TextureSubImage(data_tex[0], 0, 0, 0, size_u, size_v, GL_RGBA, GL_FLOAT, (u8 *)pos, GLRAllocType::NEW);
+	// Texcoord
+	if (hasTexCoord)
+		renderManager_->TextureSubImage(data_tex[0], 0, size_u, 0, size_u, size_v, GL_RGBA, GL_FLOAT, (u8 *)tex, GLRAllocType::NEW);
+	// Color
+	if (hasColor)
+		renderManager_->TextureSubImage(data_tex[0], 0, size_u * 2, 0, size_u, size_v, GL_RGBA, GL_FLOAT, (u8 *)col, GLRAllocType::NEW);
 
-	if (data_tex[2])
-		renderManager_->DeleteTexture(data_tex[2]);
-	data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D);
-	int sizeColor = hasColor ? size : 1;
-	uint8_t *col_data = new uint8_t[sizeColor * sizeof(float) * 4];
-	memcpy(col_data, col, sizeColor * sizeof(float) * 4);
+	// Weight U
+	if (prevSizeWU < weights.size_u) {
+		prevSizeWU = weights.size_u;
+		if (!data_tex[1])
+			data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D);
+		renderManager_->TextureImage(data_tex[1], 0, weights.size_u * 2, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, nullptr, GLRAllocType::NONE, false);
+		renderManager_->FinalizeTexture(data_tex[1], 0, false);
+	}
+	renderManager_->BindTexture(TEX_SLOT_SPLINE_WEIGHTS_U, data_tex[1]);
+	renderManager_->TextureSubImage(data_tex[1], 0, 0, 0, weights.size_u * 2, 1, GL_RGBA, GL_FLOAT, (u8 *)weights.u, GLRAllocType::NONE);
 
-	renderManager_->TextureImage(data_tex[2], 0, sizeColor, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, col_data, GLRAllocType::NEW, false);
-	renderManager_->FinalizeTexture(data_tex[2], 0, false);
-	renderManager_->BindTexture(TEX_SLOT_SPLINE_COL, data_tex[2]);
+	// Weight V
+	if (prevSizeWV < weights.size_v) {
+		prevSizeWV = weights.size_v;
+		if (!data_tex[2])
+			data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D);
+		renderManager_->TextureImage(data_tex[2], 0, weights.size_v * 2, 1, GL_RGBA32F, GL_RGBA, GL_FLOAT, nullptr, GLRAllocType::NONE, false);
+		renderManager_->FinalizeTexture(data_tex[2], 0, false);
+	}
+	renderManager_->BindTexture(TEX_SLOT_SPLINE_WEIGHTS_V, data_tex[2]);
+	renderManager_->TextureSubImage(data_tex[2], 0, 0, 0, weights.size_v * 2, 1, GL_RGBA, GL_FLOAT, (u8 *)weights.v, GLRAllocType::NONE);
 }
 
-void DrawEngineGLES::TessellationDataTransferGLES::EndFrame() {
+void TessellationDataTransferGLES::EndFrame() {
 	for (int i = 0; i < 3; i++) {
 		if (data_tex[i]) {
 			renderManager_->DeleteTexture(data_tex[i]);
 			data_tex[i] = nullptr;
 		}
 	}
+	prevSizeU = prevSizeV = prevSizeWU = prevSizeWV = 0;
 }
diff --git a/GPU/GLES/DrawEngineGLES.h b/GPU/GLES/DrawEngineGLES.h
index 2901a1c384..8b29700f83 100644
--- a/GPU/GLES/DrawEngineGLES.h
+++ b/GPU/GLES/DrawEngineGLES.h
@@ -46,9 +46,9 @@ enum {
 	TEX_SLOT_SHADERBLEND_SRC = 1,
 	TEX_SLOT_ALPHATEST = 2,
 	TEX_SLOT_CLUT = 3,
-	TEX_SLOT_SPLINE_POS = 4,
-	TEX_SLOT_SPLINE_NRM = 5,
-	TEX_SLOT_SPLINE_COL = 6,
+	TEX_SLOT_SPLINE_POINTS = 4,
+	TEX_SLOT_SPLINE_WEIGHTS_U = 5,
+	TEX_SLOT_SPLINE_WEIGHTS_V = 6,
 };
 
 
@@ -110,6 +110,23 @@ public:
 	u8 flags;
 };
 
+class TessellationDataTransferGLES : public TessellationDataTransfer {
+private:
+	GLRTexture *data_tex[3]{};
+	int prevSizeU = 0, prevSizeV = 0;
+	int prevSizeWU = 0, prevSizeWV = 0;
+	GLRenderManager *renderManager_;
+public:
+	TessellationDataTransferGLES(GLRenderManager *renderManager)
+			: renderManager_(renderManager) { }
+	~TessellationDataTransferGLES() {
+		EndFrame();
+	}
+	// Sends the spline/bezier control points and weights to the vertex shader through floating-point textures.
+	void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override;
+	void EndFrame();  // Queues textures for deletion.
+};
+
 // Handles transform, lighting and drawing.
 class DrawEngineGLES : public DrawEngineCommon {
 public:
@@ -208,17 +225,5 @@ private:
 	int bufferDecimationCounter_ = 0;
 
 	// Hardware tessellation
-	class TessellationDataTransferGLES : public TessellationDataTransfer {
-	private:
-		GLRTexture *data_tex[3]{};
-		GLRenderManager *renderManager_;
-	public:
-		TessellationDataTransferGLES(GLRenderManager *renderManager)
-			  : renderManager_(renderManager) { }
-		~TessellationDataTransferGLES() {
-			EndFrame();
-		}
-		void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override;
-		void EndFrame() override;  // Queues textures for deletion.
-	};
+	TessellationDataTransferGLES *tessDataTransferGLES;
 };
diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp
index 47e04f12a0..cde3291767 100644
--- a/GPU/GLES/GPU_GLES.cpp
+++ b/GPU/GLES/GPU_GLES.cpp
@@ -109,8 +109,7 @@ GPU_GLES::GPU_GLES(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 	if (g_Config.bHardwareTessellation) {
 		// Disable hardware tessellation if device is unsupported.
 		bool hasTexelFetch = gl_extensions.GLES3 || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 3, 0)) || gl_extensions.EXT_gpu_shader4;
-		if (!gstate_c.SupportsAll(GPU_SUPPORTS_INSTANCE_RENDERING | GPU_SUPPORTS_VERTEX_TEXTURE_FETCH | GPU_SUPPORTS_TEXTURE_FLOAT) || !hasTexelFetch) {
-			// TODO: Check unsupported device name list.(Above gpu features are supported but it has issues with weak gpu, memory, shader compiler etc...)
+		if (!gstate_c.SupportsAll(GPU_SUPPORTS_VERTEX_TEXTURE_FETCH | GPU_SUPPORTS_TEXTURE_FLOAT) || !hasTexelFetch) {
 			g_Config.bHardwareTessellation = false;
 			ERROR_LOG(G3D, "Hardware Tessellation is unsupported, falling back to software tessellation");
 			I18NCategory *gr = GetI18NCategory("Graphics");
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index 5a7aa94dbd..2a20129766 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -159,13 +159,10 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
 
 	// We need to fetch these unconditionally, gstate_c.spline or bezier will not be set if we
 	// create this shader at load time from the shader cache.
-	queries.push_back({ &u_tess_pos_tex, "u_tess_pos_tex" });
-	queries.push_back({ &u_tess_tex_tex, "u_tess_tex_tex" });
-	queries.push_back({ &u_tess_col_tex, "u_tess_col_tex" });
-	queries.push_back({ &u_spline_count_u, "u_spline_count_u" });
-	queries.push_back({ &u_spline_count_v, "u_spline_count_v" });
-	queries.push_back({ &u_spline_type_u, "u_spline_type_u" });
-	queries.push_back({ &u_spline_type_v, "u_spline_type_v" });
+	queries.push_back({ &u_tess_points, "u_tess_points" });
+	queries.push_back({ &u_tess_weights_u, "u_tess_weights_u" });
+	queries.push_back({ &u_tess_weights_v, "u_tess_weights_v" });
+	queries.push_back({ &u_spline_counts, "u_spline_counts" });
 	queries.push_back({ &u_depal, "u_depal" });
 
 	attrMask = vs->GetAttrMask();
@@ -176,9 +173,9 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
 	initialize.push_back({ &u_fbotex,       0, 1 });
 	initialize.push_back({ &u_testtex,      0, 2 });
 	initialize.push_back({ &u_pal,          0, 3 }); // CLUT
-	initialize.push_back({ &u_tess_pos_tex, 0, 4 }); // Texture unit 4
-	initialize.push_back({ &u_tess_tex_tex, 0, 5 }); // Texture unit 5
-	initialize.push_back({ &u_tess_col_tex, 0, 6 }); // Texture unit 6
+	initialize.push_back({ &u_tess_points,  0, 4 }); // Control Points
+	initialize.push_back({ &u_tess_weights_u, 0, 5 });
+	initialize.push_back({ &u_tess_weights_v, 0, 6 });
 
 	program = render->CreateProgram(shaders, semantics, queries, initialize, gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND);
 
@@ -567,13 +564,9 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
 	}
 
 	if (dirty & DIRTY_BEZIERSPLINE) {
-		render_->SetUniformI1(&u_spline_count_u, gstate_c.spline_count_u);
-		if (u_spline_count_v != -1)
-			render_->SetUniformI1(&u_spline_count_v, gstate_c.spline_count_v);
-		if (u_spline_type_u != -1)
-			render_->SetUniformI1(&u_spline_type_u, gstate_c.spline_type_u);
-		if (u_spline_type_v != -1)
-			render_->SetUniformI1(&u_spline_type_v, gstate_c.spline_type_v);
+		if (u_spline_counts != -1) {
+			render_->SetUniformI1(&u_spline_counts, gstate_c.spline_num_points_u);
+		}
 	}
 }
 
diff --git a/GPU/GLES/ShaderManagerGLES.h b/GPU/GLES/ShaderManagerGLES.h
index f5b9fa640b..c676639833 100644
--- a/GPU/GLES/ShaderManagerGLES.h
+++ b/GPU/GLES/ShaderManagerGLES.h
@@ -117,13 +117,11 @@ public:
 	int u_lightspecular[4];  // attenuation
 	int u_lightambient[4];  // attenuation
 
-	int u_tess_pos_tex;
-	int u_tess_tex_tex;
-	int u_tess_col_tex;
-	int u_spline_count_u;
-	int u_spline_count_v;
-	int u_spline_type_u;
-	int u_spline_type_v;
+	// Spline Tessellation
+	int u_tess_points; // Control Points
+	int u_tess_weights_u;
+	int u_tess_weights_v;
+	int u_spline_counts;
 };
 
 // Real public interface
diff --git a/GPU/GLES/VertexShaderGeneratorGLES.cpp b/GPU/GLES/VertexShaderGeneratorGLES.cpp
index 7aa92fca4c..7efc8e6341 100644
--- a/GPU/GLES/VertexShaderGeneratorGLES.cpp
+++ b/GPU/GLES/VertexShaderGeneratorGLES.cpp
@@ -193,6 +193,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 	bool doSpline = id.Bit(VS_BIT_SPLINE);
 	bool hasColorTess = id.Bit(VS_BIT_HAS_COLOR_TESS);
 	bool hasTexcoordTess = id.Bit(VS_BIT_HAS_TEXCOORD_TESS);
+	bool hasNormalTess = id.Bit(VS_BIT_HAS_NORMAL_TESS);
 	bool flipNormalTess = id.Bit(VS_BIT_NORM_REVERSE_TESS);
 
 	const char *shading = "";
@@ -379,83 +380,88 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 	if (doBezier || doSpline) {
 		*uniformMask |= DIRTY_BEZIERSPLINE;
 
-		WRITE(p, "uniform sampler2D u_tess_pos_tex;\n");
-		WRITE(p, "uniform sampler2D u_tess_tex_tex;\n");
-		WRITE(p, "uniform sampler2D u_tess_col_tex;\n");
+		WRITE(p, "uniform sampler2D u_tess_points;\n"); // Control Points
+		WRITE(p, "uniform sampler2D u_tess_weights_u;\n");
+		WRITE(p, "uniform sampler2D u_tess_weights_v;\n");
 
-		WRITE(p, "uniform int u_spline_count_u;\n");
+		WRITE(p, "uniform int u_spline_counts;\n");
 
 		for (int i = 2; i <= 4; i++) {
 			// Define 3 types vec2, vec3, vec4
-			WRITE(p, "vec%d tess_sample(in vec%d points[16], in vec2 weights[4]) {\n", i, i);
+			WRITE(p, "vec%d tess_sample(in vec%d points[16], mat4 weights) {\n", i, i);
 			WRITE(p, "  vec%d pos = vec%d(0.0);\n", i, i);
-			WRITE(p, "  for (int i = 0; i < 4; ++i) {\n");
-			WRITE(p, "    for (int j = 0; j < 4; ++j) {\n");
-			WRITE(p, "      float f = weights[j].x * weights[i].y;\n");
-			WRITE(p, "      if (f != 0.0)\n");
-			WRITE(p, "        pos = pos + f * points[i * 4 + j];\n");
-			WRITE(p, "    }\n");
-			WRITE(p, "  }\n");
+			for (int v = 0; v < 4; ++v) {
+				for (int u = 0; u < 4; ++u) {
+					WRITE(p, "  pos += weights[%i][%i] * points[%i];\n", v, u, v * 4 + u);
+				}
+			}
 			WRITE(p, "  return pos;\n");
 			WRITE(p, "}\n");
 		}
-		if (doSpline) {
-			WRITE(p, "uniform int u_spline_count_v;\n");
-			WRITE(p, "uniform int u_spline_type_u;\n");
-			WRITE(p, "uniform int u_spline_type_v;\n");
 
-			WRITE(p, "void spline_knot(ivec2 num_patches, ivec2 type, out vec2 knot[6], ivec2 patch_pos) {\n");
-			WRITE(p, "  for (int i = 0; i < 6; ++i) {\n");
-			WRITE(p, "    knot[i] = vec2(float(i + patch_pos.x - 2), float(i + patch_pos.y - 2));\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.x & 1) != 0) {\n");
-			WRITE(p, "    if (patch_pos.x <= 2)\n");
-			WRITE(p, "      knot[0].x = 0.0;\n");
-			WRITE(p, "    if (patch_pos.x <= 1)\n");
-			WRITE(p, "      knot[1].x = 0.0;\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.x & 2) != 0) {\n");
-			WRITE(p, "    if (patch_pos.x >= (num_patches.x - 2))\n");
-			WRITE(p, "      knot[5].x = float(num_patches.x);\n");
-			WRITE(p, "    if (patch_pos.x == (num_patches.x - 1))\n");
-			WRITE(p, "      knot[4].x = float(num_patches.x);\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.y & 1) != 0) {\n");
-			WRITE(p, "    if (patch_pos.y <= 2)\n");
-			WRITE(p, "      knot[0].y = 0.0;\n");
-			WRITE(p, "    if (patch_pos.y <= 1)\n");
-			WRITE(p, "      knot[1].y = 0.0;\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.y & 2) != 0) {\n");
-			WRITE(p, "    if (patch_pos.y >= (num_patches.y - 2))\n");
-			WRITE(p, "      knot[5].y = float(num_patches.y);\n");
-			WRITE(p, "    if (patch_pos.y == (num_patches.y - 1))\n");
-			WRITE(p, "      knot[4].y = float(num_patches.y);\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "}\n");
-
-			WRITE(p, "void spline_weight(vec2 t, in vec2 knot[6], out vec2 weights[4]) {\n");
-			// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
-			WRITE(p, "  vec2 t0 = (t - knot[0]);\n");
-			WRITE(p, "  vec2 t1 = (t - knot[1]);\n");
-			WRITE(p, "  vec2 t2 = (t - knot[2]);\n");
-			// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
-			WRITE(p, "  vec2 f30 = t0 / (knot[3] - knot[0]);\n");
-			WRITE(p, "  vec2 f41 = t1 / (knot[4] - knot[1]);\n");
-			WRITE(p, "  vec2 f52 = t2 / (knot[5] - knot[2]);\n");
-			WRITE(p, "  vec2 f31 = t1 / (knot[3] - knot[1]);\n");
-			WRITE(p, "  vec2 f42 = t2 / (knot[4] - knot[2]);\n");
-			WRITE(p, "  vec2 f32 = t2 / (knot[3] - knot[2]);\n");
-			WRITE(p, "  vec2 a = (1.0 - f30)*(1.0 - f31);\n");
-			WRITE(p, "  vec2 b = (f31*f41);\n");
-			WRITE(p, "  vec2 c = (1.0 - f41)*(1.0 - f42);\n");
-			WRITE(p, "  vec2 d = (f42*f52);\n");
-			WRITE(p, "  weights[0] = a - (a*f32);\n");
-			WRITE(p, "  weights[1] = vec2(1.0) - a - b + ((a + b + c - vec2(1.0))*f32);\n");
-			WRITE(p, "  weights[2] = b + ((vec2(1.0) - b - c - d)*f32);\n");
-			WRITE(p, "  weights[3] = d*f32;\n");
+		if (!gl_extensions.VersionGEThan(3, 0, 0)) { // For glsl version 1.10
+			WRITE(p, "mat4 outerProduct(vec4 u, vec4 v) {\n");
+			WRITE(p, "  return mat4(u * v[0], u * v[1], u * v[2], u * v[3]);\n");
 			WRITE(p, "}\n");
 		}
+
+		WRITE(p, "struct Tess {\n");
+		WRITE(p, "  vec3 pos;\n");
+		if (doTexture)
+			WRITE(p, "  vec2 tex;\n");
+		WRITE(p, "  vec4 col;\n");
+		if (hasNormalTess)
+			WRITE(p, "  vec3 nrm;\n");
+		WRITE(p, "};\n");
+
+		WRITE(p, "void tessellate(out Tess tess) {\n");
+		WRITE(p, "  ivec2 point_pos = ivec2(position.z, normal.z)%s;\n", doBezier ? " * 3" : "");
+		WRITE(p, "  ivec2 weight_idx = ivec2(position.xy);\n");
+
+		// Load 4x4 control points
+		WRITE(p, "  vec3 _pos[16];\n");
+		WRITE(p, "  vec2 _tex[16];\n");
+		WRITE(p, "  vec4 _col[16];\n");
+		WRITE(p, "  int index_u, index_v;\n");
+		for (int i = 0; i < 4; i++) {
+			for (int j = 0; j < 4; j++) {
+				WRITE(p, "  index_u = (%i + point_pos.x);\n", j);
+				WRITE(p, "  index_v = (%i + point_pos.y);\n", i);
+				WRITE(p, "  _pos[%i] = %s(u_tess_points, ivec2(index_u, index_v), 0).xyz;\n", i * 4 + j, texelFetch);
+				if (doTexture && hasTexcoordTess)
+					WRITE(p, "  _tex[%i] = %s(u_tess_points, ivec2(index_u + u_spline_counts, index_v), 0).xy;\n", i * 4 + j, texelFetch);
+				if (hasColorTess)
+					WRITE(p, "  _col[%i] = %s(u_tess_points, ivec2(index_u + u_spline_counts * 2, index_v), 0).rgba;\n", i * 4 + j, texelFetch);
+			}
+		}
+
+		// Basis polynomials as weight coefficients
+		WRITE(p, "  vec4 basis_u = %s(u_tess_weights_u, %s, 0);\n", texelFetch, "ivec2(weight_idx.x * 2, 0)");
+		WRITE(p, "  vec4 basis_v = %s(u_tess_weights_v, %s, 0);\n", texelFetch, "ivec2(weight_idx.y * 2, 0)");
+		WRITE(p, "  mat4 basis = outerProduct(basis_u, basis_v);\n");
+
+		// Tessellate
+		WRITE(p, "  tess.pos = tess_sample(_pos, basis);\n");
+		if (doTexture) {
+			if (hasTexcoordTess)
+				WRITE(p, "  tess.tex = tess_sample(_tex, basis);\n");
+			else
+				WRITE(p, "  tess.tex = normal.xy;\n");
+		}
+		if (hasColorTess)
+			WRITE(p, "  tess.col = tess_sample(_col, basis);\n");
+		else
+			WRITE(p, "  tess.col = u_matambientalpha;\n");
+		if (hasNormalTess) {
+			// Derivatives as weight coefficients
+			WRITE(p, "  vec4 deriv_u = %s(u_tess_weights_u, %s, 0);\n", texelFetch, "ivec2(weight_idx.x * 2 + 1, 0)");
+			WRITE(p, "  vec4 deriv_v = %s(u_tess_weights_v, %s, 0);\n", texelFetch, "ivec2(weight_idx.y * 2 + 1, 0)");
+
+			WRITE(p, "  vec3 du = tess_sample(_pos, outerProduct(deriv_u, basis_v));\n");
+			WRITE(p, "  vec3 dv = tess_sample(_pos, outerProduct(basis_u, deriv_v));\n");
+			WRITE(p, "  tess.nrm = normalize(cross(du, dv));\n");
+		}
+		WRITE(p, "}\n");
 	}
 
 	WRITE(p, "void main() {\n");
@@ -494,101 +500,14 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 	} else {
 		// Step 1: World Transform / Skinning
 		if (!enableBones) {
-			// Hardware tessellation
 			if (doBezier || doSpline) {
-				WRITE(p, "  vec3 _pos[16];\n");
-				WRITE(p, "  vec2 _tex[16];\n");
-				WRITE(p, "  vec4 _col[16];\n");
-				WRITE(p, "  int num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3" : "u_spline_count_u - 3");
-				WRITE(p, "  int u = int(mod(float(gl_InstanceID), float(num_patches_u)));\n");
-				WRITE(p, "  int v = gl_InstanceID / num_patches_u;\n");
-				WRITE(p, "  ivec2 patch_pos = ivec2(u, v);\n");
-				WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-				WRITE(p, "    for (int j = 0; j < 4; j++) {\n");
-				WRITE(p, "      int index = (i + v%s) * u_spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : "");
-				WRITE(p, "      _pos[i * 4 + j] = %s(u_tess_pos_tex, ivec2(index, 0), 0).xyz;\n", texelFetch);
-				if (doTexture && hasTexcoord && hasTexcoordTess)
-					WRITE(p, "      _tex[i * 4 + j] = %s(u_tess_tex_tex, ivec2(index, 0), 0).xy;\n", texelFetch);
-				if (hasColor && hasColorTess)
-					WRITE(p, "      _col[i * 4 + j] = %s(u_tess_col_tex, ivec2(index, 0), 0).rgba;\n", texelFetch);
-				WRITE(p, "    }\n");
-				WRITE(p, "  }\n");
-				WRITE(p, "  vec2 tess_pos = position.xy;\n");
-				WRITE(p, "  vec2 weights[4];\n");
-				if (doBezier) {
-					// Bernstein 3D
-					WRITE(p, "  weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-					WRITE(p, "  weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-					WRITE(p, "  weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n");
-					WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
-				} else { // Spline
-					WRITE(p, "  ivec2 spline_num_patches = ivec2(u_spline_count_u - 3, u_spline_count_v - 3);\n");
-					WRITE(p, "  ivec2 spline_type = ivec2(u_spline_type_u, u_spline_type_v);\n");
-					WRITE(p, "  vec2 knots[6];\n");
-					WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
-					WRITE(p, "  spline_weight(tess_pos + vec2(patch_pos), knots, weights);\n");
-				}
-				WRITE(p, "  vec3 pos = tess_sample(_pos, weights);\n");
-				if (doTexture && hasTexcoord) {
-					if (hasTexcoordTess)
-						WRITE(p, "  vec2 tex = tess_sample(_tex, weights);\n");
-					else
-						WRITE(p, "  vec2 tex = tess_pos + vec2(patch_pos);\n");
-				}
-				if (hasColor) {
-					if (hasColorTess)
-						WRITE(p, "  vec4 col = tess_sample(_col, weights);\n");
-					else
-						WRITE(p, "  vec4 col = %s(u_tess_col_tex, ivec2(0, 0), 0).rgba;\n", texelFetch);
-				}
-				if (hasNormal) {
-					// Curved surface is probably always need to compute normal(not sampling from control points)
-					if (doBezier) {
-						// Bernstein derivative
-						WRITE(p, "  vec2 bernderiv[4];\n");
-						WRITE(p, "  bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n");
-						WRITE(p, "  bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n");
-						WRITE(p, "  bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n");
-						WRITE(p, "  bernderiv[3] = 3.0 * tess_pos * tess_pos; \n");
+				// Hardware tessellation
+				WRITE(p, "  Tess tess;\n");
+				WRITE(p, "  tessellate(tess);\n");
 
-						WRITE(p, "  vec2 bernderiv_u[4];\n");
-						WRITE(p, "  vec2 bernderiv_v[4];\n");
-						WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-						WRITE(p, "    bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n");
-						WRITE(p, "    bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n");
-						WRITE(p, "  }\n");
-
-						WRITE(p, "  vec3 du = tess_sample(_pos, bernderiv_u);\n");
-						WRITE(p, "  vec3 dv = tess_sample(_pos, bernderiv_v);\n");
-					} else { // Spline
-						WRITE(p, "  vec2 tess_next_u = vec2(normal.x, 0.0);\n");
-						WRITE(p, "  vec2 tess_next_v = vec2(0.0, normal.y);\n");
-						// Right
-						WRITE(p, "  vec2 tess_pos_r = tess_pos + tess_next_u;\n");
-						WRITE(p, "  spline_weight(tess_pos_r + vec2(patch_pos), knots, weights);\n");
-						WRITE(p, "  vec3 pos_r = tess_sample(_pos, weights);\n");
-						// Left
-						WRITE(p, "  vec2 tess_pos_l = tess_pos - tess_next_u;\n");
-						WRITE(p, "  spline_weight(tess_pos_l + vec2(patch_pos), knots, weights);\n");
-						WRITE(p, "  vec3 pos_l = tess_sample(_pos, weights);\n");
-						// Down
-						WRITE(p, "  vec2 tess_pos_d = tess_pos + tess_next_v;\n");
-						WRITE(p, "  spline_weight(tess_pos_d + vec2(patch_pos), knots, weights);\n");
-						WRITE(p, "  vec3 pos_d = tess_sample(_pos, weights);\n");
-						// Up
-						WRITE(p, "  vec2 tess_pos_u = tess_pos - tess_next_v;\n");
-						WRITE(p, "  spline_weight(tess_pos_u + vec2(patch_pos), knots, weights);\n");
-						WRITE(p, "  vec3 pos_u = tess_sample(_pos, weights);\n");
-
-						WRITE(p, "  vec3 du = pos_r - pos_l;\n");
-						WRITE(p, "  vec3 dv = pos_d - pos_u;\n");
-					}
-					WRITE(p, "  vec3 nrm = cross(du, dv);\n");
-					WRITE(p, "  nrm = normalize(nrm);\n");
-				}
-				WRITE(p, "  vec3 worldpos = (u_world * vec4(pos.xyz, 1.0)).xyz;\n");
-				if (hasNormal) {
-					WRITE(p, "  mediump vec3 worldnormal = normalize((u_world * vec4(%snrm, 0.0)).xyz);\n", flipNormalTess ? "-" : "");
+				WRITE(p, "  vec3 worldpos = (u_world * vec4(tess.pos.xyz, 1.0)).xyz;\n");
+				if (hasNormalTess) {
+					WRITE(p, "  mediump vec3 worldnormal = normalize((u_world * vec4(%stess.nrm, 0.0)).xyz);\n", flipNormalTess ? "-" : "");
 				} else {
 					WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
 				}
@@ -692,9 +611,10 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 		const char *diffuseStr = (matUpdate & 2) && hasColor ? "color0.rgb" : "u_matdiffuse";
 		const char *specularStr = (matUpdate & 4) && hasColor ? "color0.rgb" : "u_matspecular.rgb";
 		if (doBezier || doSpline) {
-			ambientStr = (matUpdate & 1) && hasColor ? "col" : "u_matambientalpha";
-			diffuseStr = (matUpdate & 2) && hasColor ? "col.rgb" : "u_matdiffuse";
-			specularStr = (matUpdate & 4) && hasColor ? "col.rgb" : "u_matspecular.rgb";
+			// TODO: Should probably use hasColorTess instead of hasColor, but FF4 has a problem with drawing the background.
+			ambientStr = (matUpdate & 1) && hasColor ? "tess.col" : "u_matambientalpha";
+			diffuseStr = (matUpdate & 2) && hasColor ? "tess.col.rgb" : "u_matdiffuse";
+			specularStr = (matUpdate & 4) && hasColor ? "tess.col.rgb" : "u_matspecular.rgb";
 		}
 
 		bool diffuseIsZero = true;
@@ -821,7 +741,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 			// Lighting doesn't affect color.
 			if (hasColor) {
 				if (doBezier || doSpline)
-					WRITE(p, "  v_color0 = col;\n");
+					WRITE(p, "  v_color0 = tess.col;\n");
 				else
 					WRITE(p, "  v_color0 = color0;\n");
 			} else {
@@ -839,9 +759,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 				if (scaleUV) {
 					if (hasTexcoord) {
 						if (doBezier || doSpline)
-							// TODO: Need fix?
-							// Fix to avoid temporarily texture animation bug with hardware tessellation.
-							WRITE(p, "  v_texcoord = vec3(tex * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
+							WRITE(p, "  v_texcoord = vec3(tess.tex * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
 						else
 							WRITE(p, "  v_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy, 0.0);\n");
 					} else {
@@ -849,10 +767,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 					}
 				} else {
 					if (hasTexcoord) {
-						if (doBezier || doSpline)
-							WRITE(p, "  v_texcoord = vec3(tex * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
-						else
-							WRITE(p, "  v_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
+						WRITE(p, "  v_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n");
 					} else {
 						WRITE(p, "  v_texcoord = vec3(u_uvscaleoffset.zw, 0.0);\n");
 					}
diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp
index a0ecf7dc4d..1d8266d500 100644
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@@ -1718,8 +1718,6 @@ bail:
 }
 
 void GPUCommon::Execute_Bezier(u32 op, u32 diff) {
-	drawEngineCommon_->DispatchFlush();
-
 	// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
 	gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
 
@@ -1760,9 +1758,9 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) {
 	if (CanUseHardwareTessellation(patchPrim)) {
 		gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE);
 		gstate_c.bezier = true;
-		if (gstate_c.spline_count_u != bz_ucount) {
+		if (gstate_c.spline_num_points_u != bz_ucount) {
 			gstate_c.Dirty(DIRTY_BEZIERSPLINE);
-			gstate_c.spline_count_u = bz_ucount;
+			gstate_c.spline_num_points_u = bz_ucount;
 		}
 	}
 
@@ -1780,8 +1778,6 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) {
 }
 
 void GPUCommon::Execute_Spline(u32 op, u32 diff) {
-	drawEngineCommon_->DispatchFlush();
-
 	// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
 	gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
 
@@ -1824,14 +1820,9 @@ void GPUCommon::Execute_Spline(u32 op, u32 diff) {
 	if (CanUseHardwareTessellation(patchPrim)) {
 		gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE);
 		gstate_c.spline = true;
-		bool countsChanged = gstate_c.spline_count_u != sp_ucount || gstate_c.spline_count_v != sp_vcount;
-		bool typesChanged = gstate_c.spline_type_u != sp_utype || gstate_c.spline_type_v != sp_vtype;
-		if (countsChanged || typesChanged) {
+		if (gstate_c.spline_num_points_u != sp_ucount) {
 			gstate_c.Dirty(DIRTY_BEZIERSPLINE);
-			gstate_c.spline_count_u = sp_ucount;
-			gstate_c.spline_count_v = sp_vcount;
-			gstate_c.spline_type_u = sp_utype;
-			gstate_c.spline_type_v = sp_vtype;
+			gstate_c.spline_num_points_u = sp_ucount;
 		}
 	}
 
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index a7d4f719a3..25e81c6477 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -603,10 +603,7 @@ struct GPUStateCache {
 
 	bool bezier;
 	bool spline;
-	int spline_count_u;
-	int spline_count_v;
-	int spline_type_u;
-	int spline_type_v;
+	int spline_num_points_u;
 
 	bool useShaderDepal;
 	GEBufferFormat depalFramebufferFormat;
diff --git a/GPU/Math3D.cpp b/GPU/Math3D.cpp
index f43f41ade6..c46a4e520d 100644
--- a/GPU/Math3D.cpp
+++ b/GPU/Math3D.cpp
@@ -102,11 +102,49 @@ float Vec3<float>::Distance2To(Vec3<float> &other)
 	return Vec3<float>(other-(*this)).Length2();
 }
 
+#if defined(_M_SSE)
+// Returns an approximate 1 / sqrt(x*x + y*y + z*z) for v, replicated into all four lanes.
+__m128 SSENormalizeMultiplierSSE2(__m128 v)
+{
+	const __m128 squares = _mm_mul_ps(v, v);
+	// Bring y*y and z*z down into lane 0 so two scalar adds can sum the three squares there.
+	const __m128 yy = _mm_shuffle_ps(squares, squares, _MM_SHUFFLE(0, 0, 0, 1));
+	const __m128 zz = _mm_shuffle_ps(squares, squares, _MM_SHUFFLE(0, 0, 0, 2));
+	const __m128 invLen = _mm_rsqrt_ss(_mm_add_ss(zz, _mm_add_ss(yy, squares)));
+	return _mm_shuffle_ps(invLen, invLen, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#if _M_SSE >= 0x401
+__m128 SSENormalizeMultiplierSSE4(__m128 v)	// Approximate 1/length of v's x, y, z, replicated into all four lanes.
+{
+	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0x7F));	// 0x7F: multiply lanes 0-2 only (lane 3 ignored, as in the SSE2 path), write the sum to every lane.
+}
+
+// Dispatches to the SSE4.1 or the SSE2 implementation according to useSSE4.
+__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
+{
+	// Either helper yields the reciprocal-length multiplier replicated into every lane.
+	return useSSE4 ? SSENormalizeMultiplierSSE4(v) : SSENormalizeMultiplierSSE2(v);
+}
+#else
+__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
+{
+	return SSENormalizeMultiplierSSE2(v);	// useSSE4 is ignored here: this branch is built when _M_SSE < 0x401, so only the SSE2 helper exists.
+}
+#endif
 template<>
-Vec3<float> Vec3<float>::Normalized() const
+Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
+{
+	// Scale every lane by the broadcast approximate reciprocal length.
+	return _mm_mul_ps(SSENormalizeMultiplier(useSSE4, vec), vec);
+}
+#else
+template<>
+Vec3<float> Vec3<float>::Normalized(bool useSSE4) const	// Scalar build (no _M_SSE): useSSE4 is not consulted.
 {
 	return (*this) / Length();
 }
+#endif
 
 template<>
 float Vec3<float>::Normalize()
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index ba01d31600..292c63be05 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -25,6 +25,9 @@
 
 #if defined(_M_SSE)
 #include <emmintrin.h>
+#if _M_SSE >= 0x401
+#include <smmintrin.h>
+#endif
 #endif
 
 namespace Math3D {
@@ -177,8 +180,6 @@ public:
 	const Vec2 ts() const { return Vec2(y, x); }
 };
 
-typedef Vec2<float> Vec2f;
-
 template<typename T>
 class Vec3Packed;
 
@@ -295,7 +296,7 @@ public:
 	void SetLength(const float l);
 	Vec3 WithLength(const float l) const;
 	float Distance2To(Vec3 &other);
-	Vec3 Normalized() const;
+	Vec3 Normalized(bool useSSE4 = false) const;
 	float Normalize(); // returns the previous length, which is often useful
 
 	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
@@ -817,6 +818,7 @@ private:
 
 }; // namespace Math3D
 
+typedef Math3D::Vec2<float> Vec2f;
 typedef Math3D::Vec3<float> Vec3f;
 typedef Math3D::Vec3Packed<float> Vec3Packedf;
 typedef Math3D::Vec4<float> Vec4f;
@@ -1082,6 +1084,69 @@ __forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
 	*(u32 *)rgba = ToRGBA();
 }
 
+#if defined(_M_SSE)
+// Specialized for SIMD optimization
+
+// Vec3<float> operations
+template<>
+inline void Vec3<float>::operator += (const Vec3<float> &other)
+{
+	vec = _mm_add_ps(vec, other.vec);	// One packed add over all four lanes of the __m128 storage.
+}
+
+template<>
+inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const
+{
+	return Vec3<float>(_mm_add_ps(vec, other.vec));	// Lane-wise sum, wrapped back into a Vec3 from the raw __m128.
+}
+
+template<>
+inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const
+{
+	return Vec3<float>(_mm_mul_ps(vec, other.vec));	// Lane-wise (component-by-component) product, not a dot or cross product.
+}
+
+template<> template<>
+inline Vec3<float> Vec3<float>::operator * (const float &other) const
+{
+	return Vec3<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));	// _mm_set_ps1 replicates the scalar into every lane before the multiply.
+}
+
+// Vec4<float> operations
+template<>
+inline void Vec4<float>::operator += (const Vec4<float> &other)
+{
+	vec = _mm_add_ps(vec, other.vec);	// One packed add covering all four components.
+}
+
+template<>
+inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const
+{
+	return Vec4<float>(_mm_add_ps(vec, other.vec));	// Lane-wise sum, wrapped back into a Vec4 from the raw __m128.
+}
+
+template<>
+inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const
+{
+	return Vec4<float>(_mm_mul_ps(vec, other.vec));	// Lane-wise (component-by-component) product.
+}
+
+template<> template<>
+inline Vec4<float> Vec4<float>::operator * (const float &other) const
+{
+	return Vec4<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));	// _mm_set_ps1 replicates the scalar into every lane before the multiply.
+}
+
+// Vec3<float> cross product, computed lane-wise as a.yzx * b.zxy - a.zxy * b.yzx.
+template<>
+inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
+{
+	const __m128 yzxTimesZxy = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 1, 0, 2)));
+	const __m128 zxyTimesYzx = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 0, 2, 1)));
+	return _mm_sub_ps(yzxTimesZxy, zxyTimesYzx);
+}
+#endif
+
 }; // namespace Math3D
 
 // linear interpolation via float: 0.0=begin, 1.0=end
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index e31d062de5..e72e9a85cb 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -42,13 +42,11 @@ SoftwareDrawEngine::SoftwareDrawEngine() {
 	// All this is a LOT of memory, need to see if we can cut down somehow.  Used for splines.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
-	splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 }
 
 SoftwareDrawEngine::~SoftwareDrawEngine() {
 	FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE);
-	FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE);
 }
 
 void SoftwareDrawEngine::DispatchFlush() {
@@ -280,7 +278,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy
 
 	u16 index_lower_bound = 0;
 	u16 index_upper_bound = vertex_count - 1;
-	IndexConverter idxConv(vertex_type, indices);
+	IndexConverter ConvertIndex(vertex_type, indices);
 
 	if (indices)
 		GetIndexBounds(indices, vertex_count, vertex_type, &index_lower_bound, &index_upper_bound);
@@ -321,7 +319,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy
 		{
 			for (int vtx = 0; vtx < vertex_count; ++vtx) {
 				if (indices) {
-					vreader.Goto(idxConv.convert(vtx) - index_lower_bound);
+					vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
 				} else {
 					vreader.Goto(vtx);
 				}
@@ -380,7 +378,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy
 			int skip_count = data_index == 0 ? 1 : 0;
 			for (int vtx = 0; vtx < vertex_count; ++vtx) {
 				if (indices) {
-					vreader.Goto(idxConv.convert(vtx) - index_lower_bound);
+					vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
 				} else {
 					vreader.Goto(vtx);
 				}
@@ -410,7 +408,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy
 
 			for (int vtx = 0; vtx < vertex_count; ++vtx) {
 				if (indices) {
-					vreader.Goto(idxConv.convert(vtx) - index_lower_bound);
+					vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
 				} else {
 					vreader.Goto(vtx);
 				}
@@ -452,7 +450,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy
 			// Only read the central vertex if we're not continuing.
 			if (data_index == 0) {
 				if (indices) {
-					vreader.Goto(idxConv.convert(0) - index_lower_bound);
+					vreader.Goto(ConvertIndex(0) - index_lower_bound);
 				} else {
 					vreader.Goto(0);
 				}
@@ -463,7 +461,7 @@ void TransformUnit::SubmitPrimitive(void* vertices, void* indices, GEPrimitiveTy
 
 			for (int vtx = start_vtx; vtx < vertex_count; ++vtx) {
 				if (indices) {
-					vreader.Goto(idxConv.convert(vtx) - index_lower_bound);
+					vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
 				} else {
 					vreader.Goto(vtx);
 				}
diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp
index 16f0c495fb..654b8a1a41 100644
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@@ -69,6 +69,8 @@ enum {
 	DRAW_BINDING_DYNUBO_LIGHT = 4,
 	DRAW_BINDING_DYNUBO_BONE = 5,
 	DRAW_BINDING_TESS_STORAGE_BUF = 6,
+	DRAW_BINDING_TESS_STORAGE_BUF_WU = 7,
+	DRAW_BINDING_TESS_STORAGE_BUF_WV = 8,
 };
 
 enum {
@@ -87,7 +89,6 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra
 	// All this is a LOT of memory, need to see if we can cut down somehow.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
-	splineBuffer = (u8 *)AllocateMemoryPages(SPLINE_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 
 	indexGen.Setup(decIndex);
 
@@ -96,7 +97,7 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra
 
 void DrawEngineVulkan::InitDeviceObjects() {
 	// All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated.
-	VkDescriptorSetLayoutBinding bindings[7]{};
+	VkDescriptorSetLayoutBinding bindings[9]{};
 	bindings[0].descriptorCount = 1;
 	bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
 	bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
@@ -126,6 +127,14 @@ void DrawEngineVulkan::InitDeviceObjects() {
 	bindings[6].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 	bindings[6].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
 	bindings[6].binding = DRAW_BINDING_TESS_STORAGE_BUF;
+	bindings[7].descriptorCount = 1;
+	bindings[7].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+	bindings[7].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
+	bindings[7].binding = DRAW_BINDING_TESS_STORAGE_BUF_WU;
+	bindings[8].descriptorCount = 1;
+	bindings[8].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+	bindings[8].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
+	bindings[8].binding = DRAW_BINDING_TESS_STORAGE_BUF_WV;
 
 	VkDevice device = vulkan_->GetDevice();
 
@@ -167,13 +176,13 @@ void DrawEngineVulkan::InitDeviceObjects() {
 
 	vertexCache_ = new VulkanPushBuffer(vulkan_, VERTEX_CACHE_SIZE);
 
-	tessDataTransfer = new TessellationDataTransferVulkan(vulkan_);
+	tessDataTransferVulkan = new TessellationDataTransferVulkan(vulkan_);
+	tessDataTransfer = tessDataTransferVulkan;
 }
 
 DrawEngineVulkan::~DrawEngineVulkan() {
 	FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE);
-	FreeMemoryPages(splineBuffer, SPLINE_BUFFER_SIZE);
 
 	DestroyDeviceObjects();
 }
@@ -201,8 +210,8 @@ void DrawEngineVulkan::FrameData::Destroy(VulkanContext *vulkan) {
 }
 
 void DrawEngineVulkan::DestroyDeviceObjects() {
-	delete tessDataTransfer;
-	tessDataTransfer = nullptr;
+	delete tessDataTransferVulkan;
+	tessDataTransfer = tessDataTransferVulkan = nullptr;
 
 	for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
 		frame_[i].Destroy(vulkan_);
@@ -258,7 +267,7 @@ void DrawEngineVulkan::BeginFrame() {
 	frame->pushIndex->Begin(vulkan_);
 
 	// TODO: How can we make this nicer...
-	((TessellationDataTransferVulkan *)tessDataTransfer)->SetPushBuffer(frame->pushUBO);
+	tessDataTransferVulkan->SetPushBuffer(frame->pushUBO);
 
 	DirtyAllUBOs();
 
@@ -470,23 +479,32 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView
 		n++;
 	}
 
-	// Tessellation data buffer. Make sure this is declared outside the if to avoid optimizer
-	// shenanigans.
-	VkDescriptorBufferInfo tess_buf{};
+	// Tessellation data buffer.
 	if (tess) {
-		VkBuffer buf;
-		VkDeviceSize offset;
-		VkDeviceSize range;
-		((TessellationDataTransferVulkan *)tessDataTransfer)->GetBufferAndOffset(&buf, &offset, &range);
-		assert(buf);
-		tess_buf.buffer = buf;
-		tess_buf.offset = offset;
-		tess_buf.range = range;
-		tessOffset_ = offset;
+		const VkDescriptorBufferInfo *bufInfo = tessDataTransferVulkan->GetBufferInfo();
+		// Control Points
 		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
 		writes[n].pNext = nullptr;
 		writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF;
-		writes[n].pBufferInfo = &tess_buf;
+		writes[n].pBufferInfo = &bufInfo[0];
+		writes[n].descriptorCount = 1;
+		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+		writes[n].dstSet = desc;
+		n++;
+		// Weights U
+		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+		writes[n].pNext = nullptr;
+		writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF_WU;
+		writes[n].pBufferInfo = &bufInfo[1];
+		writes[n].descriptorCount = 1;
+		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+		writes[n].dstSet = desc;
+		n++;
+		// Weights V
+		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+		writes[n].pNext = nullptr;
+		writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF_WV;
+		writes[n].pBufferInfo = &bufInfo[2];
 		writes[n].descriptorCount = 1;
 		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 		writes[n].dstSet = desc;
@@ -825,8 +843,7 @@ void DrawEngineVulkan::DoFlush() {
 		if (useElements) {
 			if (!ibuf)
 				ibOffset = (uint32_t)frame->pushIndex->Push(decIndex, sizeof(uint16_t) * indexGen.VertexCount(), &ibuf);
-			int numInstances = tess ? numPatches : 1;
-			renderManager->DrawIndexed(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, vertexCount, numInstances, VK_INDEX_TYPE_UINT16);
+			renderManager->DrawIndexed(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, vertexCount, 1, VK_INDEX_TYPE_UINT16);
 		} else {
 			renderManager->Draw(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount);
 		}
@@ -994,16 +1011,7 @@ void DrawEngineVulkan::UpdateUBOs(FrameData *frame) {
 	}
 }
 
-DrawEngineVulkan::TessellationDataTransferVulkan::TessellationDataTransferVulkan(VulkanContext *vulkan)
-	: TessellationDataTransfer(), vulkan_(vulkan) {
-}
-
-DrawEngineVulkan::TessellationDataTransferVulkan::~TessellationDataTransferVulkan() {
-}
-
-void DrawEngineVulkan::TessellationDataTransferVulkan::PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) {
-	colStride = 4;
-
+void TessellationDataTransferVulkan::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) {
 	// SSBOs that are not simply float1 or float2 need to be padded up to a float4 size. vec3 members
 	// also need to be 16-byte aligned, hence the padding.
 	struct TessData {
@@ -1012,18 +1020,36 @@ void DrawEngineVulkan::TessellationDataTransferVulkan::PrepareBuffers(float *&po
 		float color[4];
 	};
 
+	// Uploads the control points plus the U and V weight tables through push_, recording each location in bufInfo_[0..2].
+	// One TessData record is uploaded per control point of the size_u x size_v grid.
+	const int size = size_u * size_v;
+
 	int ssboAlignment = vulkan_->GetPhysicalDeviceProperties(vulkan_->GetCurrentPhysicalDevice()).limits.minStorageBufferOffsetAlignment;
-	uint8_t *data = (uint8_t *)push_->PushAligned(size * sizeof(TessData), &offset_, &buf_, ssboAlignment);
-	range_ = size * sizeof(TessData);
+	// PushAligned reports its bind offset through a uint32_t, so take it in a local of that type and let the
+	// assignment widen it, instead of writing through a reinterpreted pointer into the VkDeviceSize field.
+	uint32_t bindOffset = 0;
+	uint8_t *data = (uint8_t *)push_->PushAligned(size * sizeof(TessData), &bindOffset, &bufInfo_[0].buffer, ssboAlignment);
+	bufInfo_[0].offset = bindOffset;
+	bufInfo_[0].range = size * sizeof(TessData);
 
-	pos = (float *)(data);
-	tex = (float *)(data + offsetof(TessData, uv));
-	col = (float *)(data + offsetof(TessData, color));
-	posStride = sizeof(TessData) / sizeof(float);
-	colStride = hasColor ? (sizeof(TessData) / sizeof(float)) : 0;
-	texStride = sizeof(TessData) / sizeof(float);
-}
+	float *pos = (float *)(data);
+	float *tex = (float *)(data + offsetof(TessData, uv));
+	float *col = (float *)(data + offsetof(TessData, color));
+	int stride = sizeof(TessData) / sizeof(float);
 
-void DrawEngineVulkan::TessellationDataTransferVulkan::SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) {
-	// Nothing to do here! The caller will write directly to the pushbuffer through the pointers it got through PrepareBuffers.
+	CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType);
+
+	using Spline::Weight;
+
+	// Weights U
+	data = (uint8_t *)push_->PushAligned(weights.size_u * sizeof(Weight), &bindOffset, &bufInfo_[1].buffer, ssboAlignment);
+	memcpy(data, weights.u, weights.size_u * sizeof(Weight));
+	bufInfo_[1].offset = bindOffset;
+	bufInfo_[1].range = weights.size_u * sizeof(Weight);
+
+	// Weights V
+	data = (uint8_t *)push_->PushAligned(weights.size_v * sizeof(Weight), &bindOffset, &bufInfo_[2].buffer, ssboAlignment);
+	memcpy(data, weights.v, weights.size_v * sizeof(Weight));
+	bufInfo_[2].offset = bindOffset;
+	bufInfo_[2].range = weights.size_v * sizeof(Weight);
 }
diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h
index 494a4ccd4b..cece2c08c6 100644
--- a/GPU/Vulkan/DrawEngineVulkan.h
+++ b/GPU/Vulkan/DrawEngineVulkan.h
@@ -117,6 +117,20 @@ public:
 
 class VulkanRenderManager;
 
+class TessellationDataTransferVulkan : public TessellationDataTransfer  {
+public:
+	TessellationDataTransferVulkan(VulkanContext *vulkan) : vulkan_(vulkan) {}
+
+	void SetPushBuffer(VulkanPushBuffer *push) { push_ = push; }  // Must be called before SendDataToShader, which pushes into this buffer.
+	// Send spline/bezier's control points and weights to vertex shader through structured shader buffer.
+	void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override;
+	const VkDescriptorBufferInfo *GetBufferInfo() { return bufInfo_; }  // Three entries: [0] control points, [1] weights U, [2] weights V.
+private:
+	VulkanContext *vulkan_;
+	VulkanPushBuffer *push_ = nullptr;  // Updated each frame; null until SetPushBuffer is called.
+	VkDescriptorBufferInfo bufInfo_[3]{};
+};
+
 // Handles transform, lighting and drawing.
 class DrawEngineVulkan : public DrawEngineCommon {
 public:
@@ -278,31 +292,5 @@ private:
 	int tessOffset_ = 0;
 
 	// Hardware tessellation
-	class TessellationDataTransferVulkan : public TessellationDataTransfer {
-	public:
-		TessellationDataTransferVulkan(VulkanContext *vulkan);
-		~TessellationDataTransferVulkan();
-
-		void SetPushBuffer(VulkanPushBuffer *push) { push_ = push; }
-		void SendDataToShader(const float *pos, const float *tex, const float *col, int size, bool hasColor, bool hasTexCoords) override;
-		void PrepareBuffers(float *&pos, float *&tex, float *&col, int &posStride, int &texStride, int &colStride, int size, bool hasColor, bool hasTexCoords) override;
-
-		void GetBufferAndOffset(VkBuffer *buf, VkDeviceSize *offset, VkDeviceSize *range) {
-			*buf = buf_;
-			*offset = (VkDeviceSize)offset_;
-			*range = (VkDeviceSize)range_;
-
-			buf_ = 0;
-			offset_ = 0;
-			range_ = 0;
-		}
-
-	private:
-		VulkanContext *vulkan_;
-		VulkanPushBuffer *push_;  // Updated each frame.
-
-		uint32_t offset_ = 0;
-		uint32_t range_ = 0;
-		VkBuffer buf_ = VK_NULL_HANDLE;
-	};
+	TessellationDataTransferVulkan *tessDataTransferVulkan = nullptr;  // Same object as tessDataTransfer, kept with its concrete type so Vulkan-only calls need no cast.
 };
diff --git a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
index 1885524639..6d2051828e 100644
--- a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
+++ b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
@@ -133,6 +133,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 	bool doSpline = id.Bit(VS_BIT_SPLINE);
 	bool hasColorTess = id.Bit(VS_BIT_HAS_COLOR_TESS);
 	bool hasTexcoordTess = id.Bit(VS_BIT_HAS_TEXCOORD_TESS);
+	bool hasNormalTess = id.Bit(VS_BIT_HAS_NORMAL_TESS);
 	bool flipNormalTess = id.Bit(VS_BIT_NORM_REVERSE_TESS);
 
 	WRITE(p, "\n");
@@ -219,78 +220,90 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		WRITE(p, "  vec4 pos;\n");
 		WRITE(p, "  vec4 uv;\n");
 		WRITE(p, "  vec4 color;\n");
-		WRITE(p, "};");
+		WRITE(p, "};\n");
 		WRITE(p, "layout (std430, set = 0, binding = 6) readonly buffer s_tess_data {\n");
-		WRITE(p, "  TessData data[];");
+		WRITE(p, "  TessData data[];\n");
 		WRITE(p, "} tess_data;\n");
 
+		WRITE(p, "layout (std430) struct TessWeight {\n");
+		WRITE(p, "  vec4 basis;\n");
+		WRITE(p, "  vec4 deriv;\n");
+		WRITE(p, "};\n");
+		WRITE(p, "layout (std430, set = 0, binding = 7) readonly buffer s_tess_weights_u {\n");
+		WRITE(p, "  TessWeight data[];\n");
+		WRITE(p, "} tess_weights_u;\n");
+		WRITE(p, "layout (std430, set = 0, binding = 8) readonly buffer s_tess_weights_v {\n");
+		WRITE(p, "  TessWeight data[];\n");
+		WRITE(p, "} tess_weights_v;\n");
+
 		for (int i = 2; i <= 4; i++) {
 			// Define 3 types vec2, vec3, vec4
-			WRITE(p, "vec%d tess_sample(in vec%d points[16], in vec2 weights[4]) {\n", i, i);
-			WRITE(p, "  vec%d pos = vec%d(0);\n", i, i);
-			WRITE(p, "  for (int i = 0; i < 4; ++i) {\n");
-			WRITE(p, "    for (int j = 0; j < 4; ++j) {\n");
-			WRITE(p, "      float f = weights[j].x * weights[i].y;\n");
-			WRITE(p, "      if (f != 0)\n");
-			WRITE(p, "        pos = pos + f * points[i * 4 + j];\n");
-			WRITE(p, "    }\n");
-			WRITE(p, "  }\n");
+			WRITE(p, "vec%d tess_sample(in vec%d points[16], mat4 weights) {\n", i, i);
+			WRITE(p, "  vec%d pos = vec%d(0.0);\n", i, i);
+			for (int v = 0; v < 4; ++v) {
+				for (int u = 0; u < 4; ++u) {
+					WRITE(p, "  pos += weights[%i][%i] * points[%i];\n", v, u, v * 4 + u);
+				}
+			}
 			WRITE(p, "  return pos;\n");
 			WRITE(p, "}\n");
 		}
-		if (doSpline) {
-			WRITE(p, "void spline_knot(ivec2 num_patches, ivec2 type, out vec2 knot[6], ivec2 patch_pos) {\n");
-			WRITE(p, "  for (int i = 0; i < 6; ++i) {\n");
-			WRITE(p, "    knot[i] = vec2(i + patch_pos.x - 2, i + patch_pos.y - 2);\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.x & 1) != 0) {\n");
-			WRITE(p, "    if (patch_pos.x <= 2)\n");
-			WRITE(p, "      knot[0].x = 0;\n");
-			WRITE(p, "    if (patch_pos.x <= 1)\n");
-			WRITE(p, "      knot[1].x = 0;\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.x & 2) != 0) {\n");
-			WRITE(p, "    if (patch_pos.x >= (num_patches.x - 2))\n");
-			WRITE(p, "      knot[5].x = num_patches.x;\n");
-			WRITE(p, "    if (patch_pos.x == (num_patches.x - 1))\n");
-			WRITE(p, "      knot[4].x = num_patches.x;\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.y & 1) != 0) {\n");
-			WRITE(p, "    if (patch_pos.y <= 2)\n");
-			WRITE(p, "      knot[0].y = 0;\n");
-			WRITE(p, "    if (patch_pos.y <= 1)\n");
-			WRITE(p, "      knot[1].y = 0;\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  if ((type.y & 2) != 0) {\n");
-			WRITE(p, "    if (patch_pos.y >= (num_patches.y - 2))\n");
-			WRITE(p, "      knot[5].y = num_patches.y;\n");
-			WRITE(p, "    if (patch_pos.y == (num_patches.y - 1))\n");
-			WRITE(p, "      knot[4].y = num_patches.y;\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "}\n");
 
-			WRITE(p, "void spline_weight(vec2 t, in vec2 knot[6], out vec2 weights[4]) {\n");
-			// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
-			WRITE(p, "  vec2 t0 = (t - knot[0]);\n");
-			WRITE(p, "  vec2 t1 = (t - knot[1]);\n");
-			WRITE(p, "  vec2 t2 = (t - knot[2]);\n");
-			// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
-			WRITE(p, "  vec2 f30 = t0 / (knot[3] - knot[0]);\n");
-			WRITE(p, "  vec2 f41 = t1 / (knot[4] - knot[1]);\n");
-			WRITE(p, "  vec2 f52 = t2 / (knot[5] - knot[2]);\n");
-			WRITE(p, "  vec2 f31 = t1 / (knot[3] - knot[1]);\n");
-			WRITE(p, "  vec2 f42 = t2 / (knot[4] - knot[2]);\n");
-			WRITE(p, "  vec2 f32 = t2 / (knot[3] - knot[2]);\n");
-			WRITE(p, "  vec2 a = (1 - f30)*(1 - f31);\n");
-			WRITE(p, "  vec2 b = (f31*f41);\n");
-			WRITE(p, "  vec2 c = (1 - f41)*(1 - f42);\n");
-			WRITE(p, "  vec2 d = (f42*f52);\n");
-			WRITE(p, "  weights[0] = a - (a*f32);\n");
-			WRITE(p, "  weights[1] = 1 - a - b + ((a + b + c - 1)*f32);\n");
-			WRITE(p, "  weights[2] = b + ((1 - b - c - d)*f32);\n");
-			WRITE(p, "  weights[3] = d*f32;\n");
-			WRITE(p, "}\n");
+		WRITE(p, "struct Tess {\n");
+		WRITE(p, "  vec3 pos;\n");
+		if (doTexture)
+			WRITE(p, "  vec2 tex;\n");
+		WRITE(p, "  vec4 col;\n");
+		if (hasNormalTess)
+			WRITE(p, "  vec3 nrm;\n");
+		WRITE(p, "};\n");
+
+		WRITE(p, "void tessellate(out Tess tess) {\n");
+		WRITE(p, "  ivec2 point_pos = ivec2(position.z, normal.z)%s;\n", doBezier ? " * 3" : "");
+		WRITE(p, "  ivec2 weight_idx = ivec2(position.xy);\n");
+		// Load 4x4 control points
+		WRITE(p, "  vec3 _pos[16];\n");
+		WRITE(p, "  vec2 _tex[16];\n");
+		WRITE(p, "  vec4 _col[16];\n");
+		WRITE(p, "  int index;\n");
+		for (int i = 0; i < 4; i++) {
+			for (int j = 0; j < 4; j++) {
+				WRITE(p, "  index = (%i + point_pos.y) * int(base.spline_counts) + (%i + point_pos.x);\n", i, j);
+				WRITE(p, "  _pos[%i] = tess_data.data[index].pos.xyz;\n", i * 4 + j);
+				if (doTexture && hasTexcoordTess)
+					WRITE(p, "  _tex[%i] = tess_data.data[index].uv.xy;\n", i * 4 + j);
+				if (hasColorTess)
+					WRITE(p, "  _col[%i] = tess_data.data[index].color;\n", i * 4 + j);
+			}
 		}
+
+		// Basis polynomials as weight coefficients
+		WRITE(p, "  vec4 basis_u = tess_weights_u.data[weight_idx.x].basis;\n");
+		WRITE(p, "  vec4 basis_v = tess_weights_v.data[weight_idx.y].basis;\n");
+		WRITE(p, "  mat4 basis = outerProduct(basis_u, basis_v);\n");
+
+		// Tessellate
+		WRITE(p, "  tess.pos = tess_sample(_pos, basis);\n");
+		if (doTexture) {
+			if (hasTexcoordTess)
+				WRITE(p, "  tess.tex = tess_sample(_tex, basis);\n");
+			else
+				WRITE(p, "  tess.tex = normal.xy;\n");
+		}
+		if (hasColorTess)
+			WRITE(p, "  tess.col = tess_sample(_col, basis);\n");
+		else
+			WRITE(p, "  tess.col = base.matambientalpha;\n");
+		if (hasNormalTess) {
+			// Derivatives as weight coefficients
+			WRITE(p, "  vec4 deriv_u = tess_weights_u.data[weight_idx.x].deriv;\n");
+			WRITE(p, "  vec4 deriv_v = tess_weights_v.data[weight_idx.y].deriv;\n");
+
+			WRITE(p, "  vec3 du = tess_sample(_pos, outerProduct(deriv_u, basis_v));\n");
+			WRITE(p, "  vec3 dv = tess_sample(_pos, outerProduct(basis_u, deriv_v));\n");
+			WRITE(p, "  tess.nrm = normalize(cross(du, dv));\n");
+		}
+		WRITE(p, "}\n");
 	}
 
 	WRITE(p, "void main() {\n");
@@ -330,103 +343,13 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		// Step 1: World Transform / Skinning
 		if (!enableBones) {
 			if (doBezier || doSpline) {
-				WRITE(p, "  vec3 _pos[16];\n");
-				WRITE(p, "  vec2 _tex[16];\n");
-				WRITE(p, "  vec4 _col[16];\n");
-				WRITE(p, "  int spline_count_u = int(base.spline_counts & 0xff);\n");
-				WRITE(p, "  int spline_count_v = int((base.spline_counts >> 8) & 0xff);\n");
-				WRITE(p, "  int num_patches_u = %s;\n", doBezier ? "(spline_count_u - 1) / 3" : "spline_count_u - 3");
-				WRITE(p, "  int u = int(mod(gl_InstanceIndex, num_patches_u));\n");
-				WRITE(p, "  int v = gl_InstanceIndex / num_patches_u;\n");
-				WRITE(p, "  ivec2 patch_pos = ivec2(u, v);\n");
-				WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-				WRITE(p, "    for (int j = 0; j < 4; j++) {\n");
-				WRITE(p, "      int idx = (i + v%s) * spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : "");
-				WRITE(p, "      _pos[i * 4 + j] = tess_data.data[idx].pos.xyz;\n");
-				if (doTexture && hasTexcoord && hasTexcoordTess)
-					WRITE(p, "      _tex[i * 4 + j] = tess_data.data[idx].uv.xy;\n");
-				if (hasColor && hasColorTess)
-					WRITE(p, "      _col[i * 4 + j] = tess_data.data[idx].color;\n");
-				WRITE(p, "    }\n");
-				WRITE(p, "  }\n");
-				WRITE(p, "  vec2 tess_pos = position.xy;\n");
-				WRITE(p, "  vec2 weights[4];\n");
-				if (doBezier) {
-					// Bernstein 3D
-					WRITE(p, "  weights[0] = (1 - tess_pos) * (1 - tess_pos) * (1 - tess_pos);\n");
-					WRITE(p, "  weights[1] = 3 * tess_pos * (1 - tess_pos) * (1 - tess_pos);\n");
-					WRITE(p, "  weights[2] = 3 * tess_pos * tess_pos * (1 - tess_pos);\n");
-					WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
-				} else { // Spline
-					WRITE(p, "  ivec2 spline_num_patches = ivec2(spline_count_u - 3, spline_count_v - 3);\n");
-					WRITE(p, "  int spline_type_u = int((base.spline_counts >> 16) & 0xff);\n");
-					WRITE(p, "  int spline_type_v = int((base.spline_counts >> 24) & 0xff);\n");
-					WRITE(p, "  ivec2 spline_type = ivec2(spline_type_u, spline_type_v);\n");
-					WRITE(p, "  vec2 knots[6];\n");
-					WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
-					WRITE(p, "  spline_weight(tess_pos + patch_pos, knots, weights);\n");
-				}
-				WRITE(p, "  vec3 pos = tess_sample(_pos, weights);\n");
-				if (doTexture && hasTexcoord) {
-					if (hasTexcoordTess)
-						WRITE(p, "  vec2 tex = tess_sample(_tex, weights);\n");
-					else
-						WRITE(p, "  vec2 tex = tess_pos + patch_pos;\n");
-				}
-				if (hasColor) {
-					if (hasColorTess)
-						WRITE(p, "  vec4 col = tess_sample(_col, weights);\n");
-					else
-						WRITE(p, "  vec4 col = tess_data.data[0].color;\n");
-				}
-				if (hasNormal) {
-					// Curved surface is probably always need to compute normal(not sampling from control points)
-					if (doBezier) {
-						// Bernstein derivative
-						WRITE(p, "  vec2 bernderiv[4];\n");
-						WRITE(p, "  bernderiv[0] = -3 * (tess_pos - 1) * (tess_pos - 1); \n");
-						WRITE(p, "  bernderiv[1] = 9 * tess_pos * tess_pos - 12 * tess_pos + 3; \n");
-						WRITE(p, "  bernderiv[2] = 3 * (2 - 3 * tess_pos) * tess_pos; \n");
-						WRITE(p, "  bernderiv[3] = 3 * tess_pos * tess_pos; \n");
+				// Hardware tessellation
+				WRITE(p, "  Tess tess;\n");
+				WRITE(p, "  tessellate(tess);\n");
 
-						WRITE(p, "  vec2 bernderiv_u[4];\n");
-						WRITE(p, "  vec2 bernderiv_v[4];\n");
-						WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-						WRITE(p, "    bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n");
-						WRITE(p, "    bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n");
-						WRITE(p, "  }\n");
-
-						WRITE(p, "  vec3 du = tess_sample(_pos, bernderiv_u);\n");
-						WRITE(p, "  vec3 dv = tess_sample(_pos, bernderiv_v);\n");
-					} else { // Spline
-						WRITE(p, "  vec2 tess_next_u = vec2(normal.x, 0);\n");
-						WRITE(p, "  vec2 tess_next_v = vec2(0, normal.y);\n");
-						// Right
-						WRITE(p, "  vec2 tess_pos_r = tess_pos + tess_next_u;\n");
-						WRITE(p, "  spline_weight(tess_pos_r + patch_pos, knots, weights);\n");
-						WRITE(p, "  vec3 pos_r = tess_sample(_pos, weights);\n");
-						// Left
-						WRITE(p, "  vec2 tess_pos_l = tess_pos - tess_next_u;\n");
-						WRITE(p, "  spline_weight(tess_pos_l + patch_pos, knots, weights);\n");
-						WRITE(p, "  vec3 pos_l = tess_sample(_pos, weights);\n");
-						// Down
-						WRITE(p, "  vec2 tess_pos_d = tess_pos + tess_next_v;\n");
-						WRITE(p, "  spline_weight(tess_pos_d + patch_pos, knots, weights);\n");
-						WRITE(p, "  vec3 pos_d = tess_sample(_pos, weights);\n");
-						// Up
-						WRITE(p, "  vec2 tess_pos_u = tess_pos - tess_next_v;\n");
-						WRITE(p, "  spline_weight(tess_pos_u + patch_pos, knots, weights);\n");
-						WRITE(p, "  vec3 pos_u = tess_sample(_pos, weights);\n");
-
-						WRITE(p, "  vec3 du = pos_r - pos_l;\n");
-						WRITE(p, "  vec3 dv = pos_d - pos_u;\n");
-					}
-					WRITE(p, "  vec3 nrm = cross(du, dv);\n");
-					WRITE(p, "  nrm = normalize(nrm);\n");
-				}
-				WRITE(p, "  vec3 worldpos = vec4(pos.xyz, 1.0) * base.world_mtx;\n");
-				if (hasNormal) {
-					WRITE(p, "  mediump vec3 worldnormal = normalize(vec4(%snrm, 0.0) * base.world_mtx);\n", flipNormalTess ? "-" : "");
+				WRITE(p, "  vec3 worldpos = vec4(tess.pos.xyz, 1.0) * base.world_mtx;\n");
+				if (hasNormalTess) {
+					WRITE(p, "  mediump vec3 worldnormal = normalize(vec4(%stess.nrm, 0.0) * base.world_mtx);\n", flipNormalTess ? "-" : "");
 				} else {
 					WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
 				}
@@ -483,9 +406,10 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		const char *diffuseStr = ((matUpdate & 2) && hasColor) ? "color0.rgb" : "light.matdiffuse";
 		const char *specularStr = ((matUpdate & 4) && hasColor) ? "color0.rgb" : "light.matspecular.rgb";
 		if (doBezier || doSpline) {
-			ambientStr = (matUpdate & 1) && hasColor ? "col" : "base.matambientalpha";
-			diffuseStr = (matUpdate & 2) && hasColor ? "col.rgb" : "light.matdiffuse";
-			specularStr = (matUpdate & 4) && hasColor ? "col.rgb" : "light.matspecular.rgb";
+			// TODO: Should probably use hasColorTess, but FF4 has a problem with drawing the background.
+			ambientStr = (matUpdate & 1) && hasColor ? "tess.col" : "base.matambientalpha";
+			diffuseStr = (matUpdate & 2) && hasColor ? "tess.col.rgb" : "light.matdiffuse";
+			specularStr = (matUpdate & 4) && hasColor ? "tess.col.rgb" : "light.matspecular.rgb";
 		}
 
 		bool diffuseIsZero = true;
@@ -606,7 +530,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 			// Lighting doesn't affect color.
 			if (hasColor) {
 				if (doBezier || doSpline)
-					WRITE(p, "  v_color0 = col;\n");
+					WRITE(p, "  v_color0 = tess.col;\n");
 				else
 					WRITE(p, "  v_color0 = color0;\n");
 			} else {
@@ -627,7 +551,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 				if (scaleUV) {
 					if (hasTexcoord) {
 						if (doBezier || doSpline)
-							WRITE(p, "  v_texcoord = vec3(tex.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n");
+							WRITE(p, "  v_texcoord = vec3(tess.tex.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n");
 						else
 							WRITE(p, "  v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy, 0.0);\n");
 					} else {
@@ -635,10 +559,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 					}
 				} else {
 					if (hasTexcoord) {
-						if (doBezier || doSpline)
-							WRITE(p, "  v_texcoord = vec3(tex.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n");
-						else
-							WRITE(p, "  v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n");
+						WRITE(p, "  v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0);\n");
 					} else {
 						WRITE(p, "  v_texcoord = vec3(base.uvscaleoffset.zw, 0.0);\n");
 					}
diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp
index 43bb8dd58a..5dc93d0436 100644
--- a/UI/GameSettingsScreen.cpp
+++ b/UI/GameSettingsScreen.cpp
@@ -80,7 +80,7 @@ bool GameSettingsScreen::UseVerticalLayout() const {
 
 // This needs before run CheckGPUFeatures()
 // TODO: Remove this if fix the issue
-bool CheckSupportInstancedTessellationGLES() {
+bool CheckSupportShaderTessellationGLES() {
 #if PPSSPP_PLATFORM(UWP)
 	return true;
 #else
@@ -88,21 +88,17 @@ bool CheckSupportInstancedTessellationGLES() {
 	int maxVertexTextureImageUnits = gl_extensions.maxVertexTextureUnits;
 	bool vertexTexture = maxVertexTextureImageUnits >= 3; // At least 3 for hardware tessellation
 
-	bool canUseInstanceID = gl_extensions.EXT_draw_instanced || gl_extensions.ARB_draw_instanced;
-	bool canDefInstanceID = gl_extensions.IsGLES || gl_extensions.EXT_gpu_shader4 || gl_extensions.VersionGEThan(3, 1);
-	bool instanceRendering = gl_extensions.GLES3 || (canUseInstanceID && canDefInstanceID);
-
 	bool textureFloat = gl_extensions.ARB_texture_float || gl_extensions.OES_texture_float;
 	bool hasTexelFetch = gl_extensions.GLES3 || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 3, 0)) || gl_extensions.EXT_gpu_shader4;
 
-	return instanceRendering && vertexTexture && textureFloat && hasTexelFetch;
+	return vertexTexture && textureFloat && hasTexelFetch;
 #endif
 }
 
 bool DoesBackendSupportHWTess() {
 	switch (GetGPUBackend()) {
 	case GPUBackend::OPENGL:
-		return CheckSupportInstancedTessellationGLES();
+		return CheckSupportShaderTessellationGLES();
 	case GPUBackend::VULKAN:
 	case GPUBackend::DIRECT3D11:
 		return true;
@@ -392,11 +388,10 @@ void GameSettingsScreen::CreateViews() {
 		}
 		return UI::EVENT_CONTINUE;
 	});
-	beziersChoice->SetDisabledPtr(&g_Config.bHardwareTessellation);
 
 	CheckBox *tessellationHW = graphicsSettings->Add(new CheckBox(&g_Config.bHardwareTessellation, gr->T("Hardware Tessellation")));
 	tessellationHW->OnClick.Add([=](EventParams &e) {
-		settingInfo_->Show(gr->T("HardwareTessellation Tip", "Uses hardware to make curves, always uses a fixed quality"), e.v);
+		settingInfo_->Show(gr->T("HardwareTessellation Tip", "Uses hardware to make curves"), e.v);
 		return UI::EVENT_CONTINUE;
 	});
 	tessHWEnable_ = DoesBackendSupportHWTess() && !g_Config.bSoftwareRendering && g_Config.bHardwareTransform;
diff --git a/Windows/GEDebugger/VertexPreview.cpp b/Windows/GEDebugger/VertexPreview.cpp
index 8fa7daa050..79183d236f 100644
--- a/Windows/GEDebugger/VertexPreview.cpp
+++ b/Windows/GEDebugger/VertexPreview.cpp
@@ -26,6 +26,7 @@
 #include "GPU/Common/GPUDebugInterface.h"
 #include "GPU/Common/SplineCommon.h"
 #include "GPU/GPUState.h"
+#include "Common/MemoryUtil.h"
 
 static const char preview_fs[] =
 	"#ifdef GL_ES\n"
@@ -164,96 +165,104 @@ u32 CGEDebugger::PrimPreviewOp() {
 }
 
 static void ExpandBezier(int &count, int op, const std::vector<SimpleVertex> &simpleVerts, const std::vector<u16> &indices, std::vector<SimpleVertex> &generatedVerts, std::vector<u16> &generatedInds) {
-	int count_u = (op & 0x00FF) >> 0;
-	int count_v = (op & 0xFF00) >> 8;
+	using namespace Spline;
 
-	int tess_u = gstate.getPatchDivisionU();
-	int tess_v = gstate.getPatchDivisionV();
-	if (tess_u < 1) {
-		tess_u = 1;
-	}
-	if (tess_v < 1) {
-		tess_v = 1;
-	}
+	int count_u = (op >> 0) & 0xFF;
+	int count_v = (op >> 8) & 0xFF;
+	// Real hardware seems to draw nothing when given < 4 either U or V.
+	if (count_u < 4 || count_v < 4)
+		return;
 
-	// Bezier patches share less control points than spline patches. Otherwise they are pretty much the same (except bezier don't support the open/close thing)
-	int num_patches_u = (count_u - 1) / 3;
-	int num_patches_v = (count_v - 1) / 3;
-	int total_patches = num_patches_u * num_patches_v;
-	std::vector<BezierPatch> patches;
-	patches.resize(total_patches);
-	for (int patch_u = 0; patch_u < num_patches_u; patch_u++) {
-		for (int patch_v = 0; patch_v < num_patches_v; patch_v++) {
-			BezierPatch &patch = patches[patch_u + patch_v * num_patches_u];
-			for (int point = 0; point < 16; ++point) {
-				int idx = (patch_u * 3 + point % 4) + (patch_v * 3 + point / 4) * count_u;
-				patch.points[point] = &simpleVerts[0] + (!indices.empty() ? indices[idx] : idx);
-			}
-			patch.u_index = patch_u * 3;
-			patch.v_index = patch_v * 3;
-			patch.index = patch_v * num_patches_u + patch_u;
-			patch.primType = gstate.getPatchPrimitiveType();
-			patch.computeNormals = false;
-			patch.patchFacing = false;
-		}
-	}
+	BezierSurface surface;
+	surface.num_points_u = count_u;
+	surface.num_points_v = count_v;
+	surface.tess_u = gstate.getPatchDivisionU();
+	surface.tess_v = gstate.getPatchDivisionV();
+	surface.num_patches_u = (count_u - 1) / 3;
+	surface.num_patches_v = (count_v - 1) / 3;
+	surface.primType = gstate.getPatchPrimitiveType();
+	surface.patchFacing = false;
 
-	generatedVerts.resize((tess_u + 1) * (tess_v + 1) * total_patches);
-	generatedInds.resize(tess_u * tess_v * 6 * total_patches);
+	int num_points = count_u * count_v;
+	// Make an array of pointers to the control points, to get rid of indices.
+	std::vector<const SimpleVertex *> points(num_points);
+	for (int idx = 0; idx < num_points; idx++)
+		points[idx] = simpleVerts.data() + (!indices.empty() ? indices[idx] : idx);
 
-	count = 0;
-	u8 *dest = (u8 *)&generatedVerts[0];
-	u16 *inds = &generatedInds[0];
-	for (int patch_idx = 0; patch_idx < total_patches; ++patch_idx) {
-		const BezierPatch &patch = patches[patch_idx];
-		TessellateBezierPatch(dest, inds, count, tess_u, tess_v, patch, gstate.vertType);
-	}
+	int total_patches = surface.num_patches_u * surface.num_patches_v;
+	generatedVerts.resize((surface.tess_u + 1) * (surface.tess_v + 1) * total_patches);
+	generatedInds.resize(surface.tess_u * surface.tess_v * 6 * total_patches);
+
+	OutputBuffers output;
+	output.vertices = generatedVerts.data();
+	output.indices = generatedInds.data();
+	output.count = 0;
+
+	ControlPoints cpoints;
+	cpoints.pos = (Vec3f *)AllocateAlignedMemory(sizeof(Vec3f) * num_points, 16);
+	cpoints.tex = (Vec2f *)AllocateAlignedMemory(sizeof(Vec2f) * num_points, 16);
+	cpoints.col = (Vec4f *)AllocateAlignedMemory(sizeof(Vec4f) * num_points, 16);
+	cpoints.Convert(points.data(), num_points);
+
+	surface.Init(generatedVerts.size());
+	SoftwareTessellation(output, surface, gstate.vertType, cpoints);
+	count = output.count;
+
+	FreeAlignedMemory(cpoints.pos);
+	FreeAlignedMemory(cpoints.tex);
+	FreeAlignedMemory(cpoints.col);
 }
 
 static void ExpandSpline(int &count, int op, const std::vector<SimpleVertex> &simpleVerts, const std::vector<u16> &indices, std::vector<SimpleVertex> &generatedVerts, std::vector<u16> &generatedInds) {
-	SplinePatchLocal patch;
-	patch.computeNormals = false;
-	patch.primType = gstate.getPatchPrimitiveType();
-	patch.patchFacing = false;
-
-	patch.count_u = (op & 0x00FF) >> 0;
-	patch.count_v = (op & 0xFF00) >> 8;
-	patch.type_u = (op >> 16) & 0x3;
-	patch.type_v = (op >> 18) & 0x3;
-
-	patch.tess_u = gstate.getPatchDivisionU();
-	patch.tess_v = gstate.getPatchDivisionV();
-	if (patch.tess_u < 1) {
-		patch.tess_u = 1;
-	}
-	if (patch.tess_v < 1) {
-		patch.tess_v = 1;
-	}
+	using namespace Spline;
 
+	int count_u = (op >> 0) & 0xFF;
+	int count_v = (op >> 8) & 0xFF;
 	// Real hardware seems to draw nothing when given < 4 either U or V.
-	if (patch.count_u < 4 || patch.count_v < 4) {
+	if (count_u < 4 || count_v < 4)
 		return;
-	}
 
-	std::vector<const SimpleVertex *> points;
-	points.resize(patch.count_u * patch.count_v);
+	SplineSurface surface;
+	surface.num_points_u = count_u;
+	surface.num_points_v = count_v;
+	surface.tess_u = gstate.getPatchDivisionU();
+	surface.tess_v = gstate.getPatchDivisionV();
+	surface.type_u = (op >> 16) & 0x3;
+	surface.type_v = (op >> 18) & 0x3;
+	surface.num_patches_u = count_u - 3;
+	surface.num_patches_v = count_v - 3;
+	surface.primType = gstate.getPatchPrimitiveType();
+	surface.patchFacing = false;
 
+	int num_points = count_u * count_v;
 	// Make an array of pointers to the control points, to get rid of indices.
-	for (int idx = 0; idx < patch.count_u * patch.count_v; idx++) {
-		points[idx] = &simpleVerts[0] + (!indices.empty() ? indices[idx] : idx);
-	}
-	patch.points = &points[0];
+	std::vector<const SimpleVertex *> points(num_points);
+	for (int idx = 0; idx < num_points; idx++)
+		points[idx] = simpleVerts.data() + (!indices.empty() ? indices[idx] : idx);
 
-	int patch_div_s = (patch.count_u - 3) * patch.tess_u;
-	int patch_div_t = (patch.count_v - 3) * patch.tess_v;
-	int maxVertexCount = (patch_div_s + 1) * (patch_div_t + 1);
-
-	generatedVerts.resize(maxVertexCount);
+	int patch_div_s = surface.num_patches_u * surface.tess_u;
+	int patch_div_t = surface.num_patches_v * surface.tess_v;
+	generatedVerts.resize((patch_div_s + 1) * (patch_div_t + 1));
 	generatedInds.resize(patch_div_s * patch_div_t * 6);
 
-	count = 0;
-	u8 *dest = (u8 *)&generatedVerts[0];
-	TessellateSplinePatch(dest, &generatedInds[0], count, patch, gstate.vertType, maxVertexCount);
+	OutputBuffers output;
+	output.vertices = generatedVerts.data();
+	output.indices = generatedInds.data();
+	output.count = 0;
+
+	ControlPoints cpoints;
+	cpoints.pos = (Vec3f *)AllocateAlignedMemory(sizeof(Vec3f) * num_points, 16);
+	cpoints.tex = (Vec2f *)AllocateAlignedMemory(sizeof(Vec2f) * num_points, 16);
+	cpoints.col = (Vec4f *)AllocateAlignedMemory(sizeof(Vec4f) * num_points, 16);
+	cpoints.Convert(points.data(), num_points);
+
+	surface.Init(generatedVerts.size());
+	SoftwareTessellation(output, surface, gstate.vertType, cpoints);
+	count = output.count;
+
+	FreeAlignedMemory(cpoints.pos);
+	FreeAlignedMemory(cpoints.tex);
+	FreeAlignedMemory(cpoints.col);
 }
 
 void CGEDebugger::UpdatePrimPreview(u32 op, int which) {
diff --git a/ext/native/thin3d/GLQueueRunner.cpp b/ext/native/thin3d/GLQueueRunner.cpp
index 36696bacd7..49031eafca 100644
--- a/ext/native/thin3d/GLQueueRunner.cpp
+++ b/ext/native/thin3d/GLQueueRunner.cpp
@@ -306,14 +306,14 @@ void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps, bool ski
 				glBindTexture(tex->target, tex->texture);
 				boundTexture = tex->texture;
 			}
-			if (!step.texture_image.data)
+			if (!step.texture_image.data && step.texture_image.allocType != GLRAllocType::NONE)
 				Crash();
 			// For things to show in RenderDoc, need to split into glTexImage2D(..., nullptr) and glTexSubImage.
 			glTexImage2D(tex->target, step.texture_image.level, step.texture_image.internalFormat, step.texture_image.width, step.texture_image.height, 0, step.texture_image.format, step.texture_image.type, step.texture_image.data);
 			allocatedTextures = true;
 			if (step.texture_image.allocType == GLRAllocType::ALIGNED) {
 				FreeAlignedMemory(step.texture_image.data);
-			} else {
+			} else if (step.texture_image.allocType == GLRAllocType::NEW) {
 				delete[] step.texture_image.data;
 			}
 			CHECK_GL_ERROR_IF_DEBUG();
@@ -490,7 +490,19 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCal
 			const GLRStep &step = *steps[i];
 			switch (step.stepType) {
 			case GLRStepType::RENDER:
-				// TODO: With #11425 there'll be a case where we should really free spline data here.
+				for (const auto &c : step.commands) {
+					switch (c.cmd) {
+					case GLRRenderCommand::TEXTURE_SUBIMAGE:
+						if (c.texture_subimage.data) {
+							if (c.texture_subimage.allocType == GLRAllocType::ALIGNED) {
+								FreeAlignedMemory(c.texture_subimage.data);
+							} else if (c.texture_subimage.allocType == GLRAllocType::NEW) {
+								delete[] c.texture_subimage.data;
+							}
+						}
+						break;
+					}
+				}
 				break;
 			}
 			delete steps[i];
@@ -1024,6 +1036,22 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 			}
 			break;
 		}
+		case GLRRenderCommand::TEXTURE_SUBIMAGE:
+		{
+			GLRTexture *tex = c.texture_subimage.texture;
+			// TODO: Need bind?
+			if (!c.texture_subimage.data)
+				Crash();
+			// For things to show in RenderDoc, need to split into glTexImage2D(..., nullptr) and glTexSubImage.
+			glTexSubImage2D(tex->target, c.texture_subimage.level, c.texture_subimage.x, c.texture_subimage.y, c.texture_subimage.width, c.texture_subimage.height, c.texture_subimage.format, c.texture_subimage.type, c.texture_subimage.data);
+			if (c.texture_subimage.allocType == GLRAllocType::ALIGNED) {
+				FreeAlignedMemory(c.texture_subimage.data);
+			} else if (c.texture_subimage.allocType == GLRAllocType::NEW) {
+				delete[] c.texture_subimage.data;
+			}
+			CHECK_GL_ERROR_IF_DEBUG();
+			break;
+		}
 		case GLRRenderCommand::RASTER:
 			if (c.raster.cullEnable) {
 				if (!cullEnabled) {
diff --git a/ext/native/thin3d/GLQueueRunner.h b/ext/native/thin3d/GLQueueRunner.h
index 5f1432e52d..b7a1586d59 100644
--- a/ext/native/thin3d/GLQueueRunner.h
+++ b/ext/native/thin3d/GLQueueRunner.h
@@ -20,6 +20,7 @@ struct GLOffset2D {
 };
 
 enum class GLRAllocType {
+	NONE,
 	NEW,
 	ALIGNED,
 };
@@ -57,6 +58,7 @@ enum class GLRRenderCommand : uint8_t {
 	DRAW,
 	DRAW_INDEXED,
 	PUSH_CONSTANTS,
+	TEXTURE_SUBIMAGE,
 };
 
 // TODO: Bloated since the biggest struct decides the size. Will need something more efficient (separate structs with shared
@@ -138,6 +140,18 @@ struct GLRRenderData {
 			int slot;
 			GLRTexture *texture;
 		} texture;
+		struct {
+			GLRTexture *texture;
+			GLenum format;
+			GLenum type;
+			int level;
+			int x;
+			int y;
+			int width;
+			int height;
+			GLRAllocType allocType;
+			uint8_t *data;  // owned; freed per allocType (FreeAlignedMemory or delete[])
+		} texture_subimage;
 		struct {
 			int slot;
 			GLRFramebuffer *framebuffer;
diff --git a/ext/native/thin3d/GLRenderManager.h b/ext/native/thin3d/GLRenderManager.h
index 4c3c32b13d..9cc4ca4f7d 100644
--- a/ext/native/thin3d/GLRenderManager.h
+++ b/ext/native/thin3d/GLRenderManager.h
@@ -530,6 +530,22 @@ public:
 		initSteps_.push_back(step);
 	}
 
+	void TextureSubImage(GLRTexture *texture, int level, int x, int y, int width, int height, GLenum format, GLenum type, uint8_t *data, GLRAllocType allocType = GLRAllocType::NEW) {
+		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
+		GLRRenderData _data{ GLRRenderCommand::TEXTURE_SUBIMAGE };
+		_data.texture_subimage.texture = texture;
+		_data.texture_subimage.data = data;
+		_data.texture_subimage.format = format;
+		_data.texture_subimage.type = type;
+		_data.texture_subimage.level = level;
+		_data.texture_subimage.x = x;
+		_data.texture_subimage.y = y;
+		_data.texture_subimage.width = width;
+		_data.texture_subimage.height = height;
+		_data.texture_subimage.allocType = allocType;
+		curRenderStep_->commands.push_back(_data);
+	}
+
 	void FinalizeTexture(GLRTexture *texture, int maxLevels, bool genMips) {
 		GLRInitStep step{ GLRInitStepType::TEXTURE_FINALIZE };
 		step.texture_finalize.texture = texture;