softgpu: Split clippos out of rasterization vert.

Rasterization never uses clippos apart from its w component, so there is no need to keep the rest of it in the bin queue.
2025-04-02 11:01:50 -04:00 · 2022-09-26 16:50:40 -07:00 · 2022-09-26 16:50:40 -07:00 · 8376176b2f
commit 8376176b2f
parent 97ae4ae712
7 changed files with 174 additions and 163 deletions
--- a/GPU/Software/Clipper.cpp
+++ b/GPU/Software/Clipper.cpp
@ -45,7 +45,7 @@ inline bool different_signs(float x, float y) {
 	return ((x <= 0 && y > 0) || (x > 0 && y <= 0));
 }

-inline float clip_dotprod(const VertexData &vert, float A, float B, float C, float D) {
+inline float clip_dotprod(const ClipVertexData &vert, float A, float B, float C, float D) {
 	return (vert.clippos.x * A + vert.clippos.y * B + vert.clippos.z * C + vert.clippos.w * D);
 }

@ -131,7 +131,7 @@ static inline bool CheckOutsideZ(ClipCoords p, int &pos, int &neg) {
 	return false;
 }

-void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner) {
+void ProcessRect(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner) {
 	if (!binner.State().throughMode) {
 		// If any verts were outside range, throw the entire prim away.
 		if (v0.OutsideRange() || v1.OutsideRange())
@ -149,37 +149,37 @@ void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner)
 		else if (outsidePos >= 2 || outsideNeg >= 2)
 			return;

-		if (v0.fogdepth != v1.fogdepth) {
+		if (v0.v.fogdepth != v1.v.fogdepth) {
 			// Rectangles seem to always use nearest along X for fog depth, but reversed.
 			// TODO: Check exactness of middle.
-			VertexData vhalf0 = v1;
-			vhalf0.screenpos.x = v0.screenpos.x + (v1.screenpos.x - v0.screenpos.x) / 2;
+			VertexData vhalf0 = v1.v;
+			vhalf0.screenpos.x = v0.v.screenpos.x + (v1.v.screenpos.x - v0.v.screenpos.x) / 2;

-			VertexData vhalf1 = v1;
-			vhalf1.screenpos.x = v0.screenpos.x + (v1.screenpos.x - v0.screenpos.x) / 2;
-			vhalf1.screenpos.y = v0.screenpos.y;
+			VertexData vhalf1 = v1.v;
+			vhalf1.screenpos.x = v0.v.screenpos.x + (v1.v.screenpos.x - v0.v.screenpos.x) / 2;
+			vhalf1.screenpos.y = v0.v.screenpos.y;

-			VertexData vrev1 = v1;
-			vrev1.fogdepth = v0.fogdepth;
+			VertexData vrev1 = v1.v;
+			vrev1.fogdepth = v0.v.fogdepth;

-			binner.AddRect(v0, vhalf0);
+			binner.AddRect(v0.v, vhalf0);
 			binner.AddRect(vhalf1, vrev1);
 		} else {
-			binner.AddRect(v0, v1);
+			binner.AddRect(v0.v, v1.v);
 		}
 	} else {
 		// through mode handling
-		if (Rasterizer::RectangleFastPath(v0, v1, binner)) {
+		if (Rasterizer::RectangleFastPath(v0.v, v1.v, binner)) {
 			return;
 		} else if (gstate.isModeClear() && !gstate.isDitherEnabled()) {
-			binner.AddClearRect(v0, v1);
+			binner.AddClearRect(v0.v, v1.v);
 		} else {
-			binner.AddRect(v0, v1);
+			binner.AddRect(v0.v, v1.v);
 		}
 	}
 }

-void ProcessPoint(const VertexData &v0, BinManager &binner) {
+void ProcessPoint(const ClipVertexData &v0, BinManager &binner) {
 	// If any verts were outside range, throw the entire prim away.
 	if (!binner.State().throughMode) {
 		if (v0.OutsideRange())
@ -187,13 +187,13 @@ void ProcessPoint(const VertexData &v0, BinManager &binner) {
 	}

 	// Points need no clipping. Will be bounds checked in the rasterizer (which seems backwards?)
-	binner.AddPoint(v0);
+	binner.AddPoint(v0.v);
 }

-void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner) {
+void ProcessLine(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner) {
 	if (binner.State().throughMode) {
 		// Actually, should clip this one too so we don't need to do bounds checks in the rasterizer.
-		binner.AddLine(v0, v1);
+		binner.AddLine(v0.v, v1.v);
 		return;
 	}

@ -216,24 +216,26 @@ void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner)
 	int mask1 = CalcClipMask(v1.clippos);
 	int mask = mask0 | mask1;
 	if ((mask & CLIP_NEG_Z_BIT) == 0) {
-		binner.AddLine(v0, v1);
+		binner.AddLine(v0.v, v1.v);
 		return;
 	}

-	VertexData ClippedVertices[2] = { v0, v1 };
-	VertexData *Vertices[2] = { &ClippedVertices[0], &ClippedVertices[1] };
+	ClipVertexData ClippedVertices[2] = { v0, v1 };
+	ClipVertexData *Vertices[2] = { &ClippedVertices[0], &ClippedVertices[1] };
 	bool clipped = false;
 	CLIP_LINE(CLIP_NEG_Z_BIT,  0,  0,  1, 1);

-	VertexData data[2] = { *Vertices[0], *Vertices[1] };
+	ClipVertexData data[2] = { *Vertices[0], *Vertices[1] };
 	if (clipped) {
-		data[0].screenpos = TransformUnit::ClipToScreen(data[0].clippos);
-		data[1].screenpos = TransformUnit::ClipToScreen(data[1].clippos);
+		data[0].v.screenpos = TransformUnit::ClipToScreen(data[0].clippos);
+		data[1].v.screenpos = TransformUnit::ClipToScreen(data[1].clippos);
+		data[0].v.clipw = data[0].clippos.w;
+		data[1].v.clipw = data[1].clippos.w;
 	}
-	binner.AddLine(data[0], data[1]);
+	binner.AddLine(data[0].v, data[1].v);
 }

-void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const VertexData &provoking, BinManager &binner) {
+void ProcessTriangle(const ClipVertexData &v0, const ClipVertexData &v1, const ClipVertexData &v2, const ClipVertexData &provoking, BinManager &binner) {
 	int mask = 0;
 	if (!binner.State().throughMode) {
 		// If any verts were outside range, throw the entire prim away.
@ -262,20 +264,20 @@ void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexDat
 	if ((mask & CLIP_NEG_Z_BIT) == 0) {
 		if (gstate.getShadeMode() == GE_SHADE_FLAT) {
 			// So that the order of clipping doesn't matter...
-			VertexData corrected2 = v2;
-			corrected2.color0 = provoking.color0;
-			corrected2.color1 = provoking.color1;
-			binner.AddTriangle(v0, v1, corrected2);
+			VertexData corrected2 = v2.v;
+			corrected2.color0 = provoking.v.color0;
+			corrected2.color1 = provoking.v.color1;
+			binner.AddTriangle(v0.v, v1.v, corrected2);
 		} else {
-			binner.AddTriangle(v0, v1, v2);
+			binner.AddTriangle(v0.v, v1.v, v2.v);
 		}
 		return;
 	}

 	enum { NUM_CLIPPED_VERTICES = 3, NUM_INDICES = NUM_CLIPPED_VERTICES + 3 };

-	VertexData* Vertices[NUM_INDICES];
-	VertexData ClippedVertices[NUM_INDICES];
+	ClipVertexData* Vertices[NUM_INDICES];
+	ClipVertexData ClippedVertices[NUM_INDICES];
 	for (int i = 0; i < NUM_INDICES; ++i)
 		Vertices[i] = &ClippedVertices[i];

@ -319,22 +321,25 @@ void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexDat

 	for (int i = 0; i + 3 <= numIndices; i += 3) {
 		if (indices[i] != SKIP_FLAG) {
-			VertexData &subv0 = *Vertices[indices[i + 0]];
-			VertexData &subv1 = *Vertices[indices[i + 1]];
-			VertexData &subv2 = *Vertices[indices[i + 2]];
+			ClipVertexData &subv0 = *Vertices[indices[i + 0]];
+			ClipVertexData &subv1 = *Vertices[indices[i + 1]];
+			ClipVertexData &subv2 = *Vertices[indices[i + 2]];
 			if (clipped) {
-				subv0.screenpos = TransformUnit::ClipToScreen(subv0.clippos);
-				subv1.screenpos = TransformUnit::ClipToScreen(subv1.clippos);
-				subv2.screenpos = TransformUnit::ClipToScreen(subv2.clippos);
+				subv0.v.screenpos = TransformUnit::ClipToScreen(subv0.clippos);
+				subv1.v.screenpos = TransformUnit::ClipToScreen(subv1.clippos);
+				subv2.v.screenpos = TransformUnit::ClipToScreen(subv2.clippos);
+				subv0.v.clipw = subv0.clippos.w;
+				subv1.v.clipw = subv1.clippos.w;
+				subv2.v.clipw = subv2.clippos.w;
 			}

 			if (gstate.getShadeMode() == GE_SHADE_FLAT) {
 				// So that the order of clipping doesn't matter...
-				subv2.color0 = provoking.color0;
-				subv2.color1 = provoking.color1;
+				subv2.v.color0 = provoking.v.color0;
+				subv2.v.color1 = provoking.v.color1;
 			}

-			binner.AddTriangle(subv0, subv1, subv2);
+			binner.AddTriangle(subv0.v, subv1.v, subv2.v);
 		}
 	}
 }
--- a/GPU/Software/Clipper.h
+++ b/GPU/Software/Clipper.h
@ -26,9 +26,9 @@ class BinManager;

 namespace Clipper {

-void ProcessPoint(const VertexData &v0, BinManager &binner);
-void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner);
-void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const VertexData &provoking, BinManager &binner);
-void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner);
+void ProcessPoint(const ClipVertexData &v0, BinManager &binner);
+void ProcessLine(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner);
+void ProcessTriangle(const ClipVertexData &v0, const ClipVertexData &v1, const ClipVertexData &v2, const ClipVertexData &provoking, BinManager &binner);
+void ProcessRect(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner);

 }
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@ -227,8 +227,8 @@ static inline void GetTextureCoordinates(const VertexData& v0, const VertexData&
 	// All UV gen modes, by the time they get here, behave the same.

 	// Note that for environment mapping, texture coordinates have been calculated during lighting
-	float q0 = 1.f / v0.clippos.w;
-	float q1 = 1.f / v1.clippos.w;
+	float q0 = 1.f / v0.clipw;
+	float q1 = 1.f / v1.clipw;
 	float wq0 = p * q0;
 	float wq1 = (1.0f - p) * q1;

@ -241,9 +241,9 @@ static inline void GetTextureCoordinates(const VertexData &v0, const VertexData
 	// All UV gen modes, by the time they get here, behave the same.

 	// Note that for environment mapping, texture coordinates have been calculated during lighting.
-	float q0 = 1.f / v0.clippos.w;
-	float q1 = 1.f / v1.clippos.w;
-	float q2 = 1.f / v2.clippos.w;
+	float q0 = 1.f / v0.clipw;
+	float q1 = 1.f / v1.clipw;
+	float q2 = 1.f / v2.clipw;
 	Vec4<float> wq0 = w0.Cast<float>() * q0;
 	Vec4<float> wq1 = w1.Cast<float>() * q1;
 	Vec4<float> wq2 = w2.Cast<float>() * q2;
--- a/GPU/Software/RasterizerRectangle.cpp
+++ b/GPU/Software/RasterizerRectangle.cpp
@ -393,16 +393,16 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b
 	return false;
 }

-static bool AreCoordsRectangleCompatible(const RasterizerState &state, const VertexData &data0, const VertexData &data1) {
-	if (data1.color0 != data0.color0)
+static bool AreCoordsRectangleCompatible(const RasterizerState &state, const ClipVertexData &data0, const ClipVertexData &data1) {
+	if (data1.v.color0 != data0.v.color0)
 		return false;
-	if (data1.screenpos.z != data0.screenpos.z) {
+	if (data1.v.screenpos.z != data0.v.screenpos.z) {
 		// Sometimes, we don't actually care about z.
 		if (state.pixelID.depthWrite || state.pixelID.DepthTestFunc() != GE_COMP_ALWAYS)
 			return false;
 	}
 	if (!state.throughMode) {
-		if (data1.color1 != data0.color1)
+		if (data1.v.color1 != data0.v.color1)
 			return false;
 		// This means it should be culled, outside range.
 		if (data1.OutsideRange() || data0.OutsideRange())
@ -414,26 +414,26 @@ static bool AreCoordsRectangleCompatible(const RasterizerState &state, const Ver
 			if (data1.clippos.w - halftexel > data0.clippos.w || data1.clippos.w + halftexel < data0.clippos.w)
 				return false;
 		}
-		if (state.pixelID.applyFog && data1.fogdepth != data0.fogdepth) {
+		if (state.pixelID.applyFog && data1.v.fogdepth != data0.v.fogdepth) {
 			// Similar to w, this only matters if they're farther apart than 1/255.
 			static constexpr float foghalfstep = 0.5f / 255.0f;
-			if (data1.fogdepth - foghalfstep > data0.fogdepth || data1.fogdepth + foghalfstep < data0.fogdepth)
+			if (data1.v.fogdepth - foghalfstep > data0.v.fogdepth || data1.v.fogdepth + foghalfstep < data0.v.fogdepth)
 				return false;
 		}
 	}
 	return true;
 }

-bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData data[4], int *tlIndex, int *brIndex) {
+bool DetectRectangleFromStrip(const RasterizerState &state, const ClipVertexData data[4], int *tlIndex, int *brIndex) {
 	// Color and Z must be flat.  Also find the TL and BR meanwhile.
 	int tl = 0, br = 0;
 	for (int i = 1; i < 4; ++i) {
 		if (!AreCoordsRectangleCompatible(state, data[i], data[0]))
 			return false;

-		if (data[i].screenpos.x <= data[tl].screenpos.x && data[i].screenpos.y <= data[tl].screenpos.y)
+		if (data[i].v.screenpos.x <= data[tl].v.screenpos.x && data[i].v.screenpos.y <= data[tl].v.screenpos.y)
 			tl = i;
-		if (data[i].screenpos.x >= data[br].screenpos.x && data[i].screenpos.y >= data[br].screenpos.y)
+		if (data[i].v.screenpos.x >= data[br].v.screenpos.x && data[i].v.screenpos.y >= data[br].v.screenpos.y)
 			br = i;
 	}

@ -442,36 +442,36 @@ bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData dat

 	// OK, now let's look at data to detect rectangles. There are a few possibilities
 	// but we focus on Darkstalkers for now.
-	if (data[0].screenpos.x == data[1].screenpos.x &&
-		data[0].screenpos.y == data[2].screenpos.y &&
-		data[2].screenpos.x == data[3].screenpos.x &&
-		data[1].screenpos.y == data[3].screenpos.y) {
+	if (data[0].v.screenpos.x == data[1].v.screenpos.x &&
+		data[0].v.screenpos.y == data[2].v.screenpos.y &&
+		data[2].v.screenpos.x == data[3].v.screenpos.x &&
+		data[1].v.screenpos.y == data[3].v.screenpos.y) {
 		// Okay, this is in the shape of a rectangle, but what about texture?
 		if (!state.enableTextures)
 			return true;

-		if (data[0].texturecoords.x == data[1].texturecoords.x &&
-			data[0].texturecoords.y == data[2].texturecoords.y &&
-			data[2].texturecoords.x == data[3].texturecoords.x &&
-			data[1].texturecoords.y == data[3].texturecoords.y) {
+		if (data[0].v.texturecoords.x == data[1].v.texturecoords.x &&
+			data[0].v.texturecoords.y == data[2].v.texturecoords.y &&
+			data[2].v.texturecoords.x == data[3].v.texturecoords.x &&
+			data[1].v.texturecoords.y == data[3].v.texturecoords.y) {
 			// It's a rectangle!
 			return true;
 		}
 		return false;
 	}
 	// There's the other vertex order too...
-	if (data[0].screenpos.x == data[2].screenpos.x &&
-		data[0].screenpos.y == data[1].screenpos.y &&
-		data[1].screenpos.x == data[3].screenpos.x &&
-		data[2].screenpos.y == data[3].screenpos.y) {
+	if (data[0].v.screenpos.x == data[2].v.screenpos.x &&
+		data[0].v.screenpos.y == data[1].v.screenpos.y &&
+		data[1].v.screenpos.x == data[3].v.screenpos.x &&
+		data[2].v.screenpos.y == data[3].v.screenpos.y) {
 		// Okay, this is in the shape of a rectangle, but what about texture?
 		if (!state.enableTextures)
 			return true;

-		if (data[0].texturecoords.x == data[2].texturecoords.x &&
-			data[0].texturecoords.y == data[1].texturecoords.y &&
-			data[1].texturecoords.x == data[3].texturecoords.x &&
-			data[2].texturecoords.y == data[3].texturecoords.y) {
+		if (data[0].v.texturecoords.x == data[2].v.texturecoords.x &&
+			data[0].v.texturecoords.y == data[1].v.texturecoords.y &&
+			data[1].v.texturecoords.x == data[3].v.texturecoords.x &&
+			data[2].v.texturecoords.y == data[3].v.texturecoords.y) {
 			// It's a rectangle!
 			return true;
 		}
@ -480,7 +480,7 @@ bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData dat
 	return false;
 }

-bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data, int c, int *tlIndex, int *brIndex) {
+bool DetectRectangleFromFan(const RasterizerState &state, const ClipVertexData *data, int c, int *tlIndex, int *brIndex) {
 	// Color and Z must be flat.
 	for (int i = 1; i < c; ++i) {
 		if (!AreCoordsRectangleCompatible(state, data[i], data[0]))
@ -489,8 +489,8 @@ bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data

 	// Check for the common case: a single TL-TR-BR-BL.
 	if (c == 4) {
-		const auto &pos0 = data[0].screenpos, &pos1 = data[1].screenpos;
-		const auto &pos2 = data[2].screenpos, &pos3 = data[3].screenpos;
+		const auto &pos0 = data[0].v.screenpos, &pos1 = data[1].v.screenpos;
+		const auto &pos2 = data[2].v.screenpos, &pos3 = data[3].v.screenpos;
 		if (pos0.x == pos3.x && pos1.x == pos2.x && pos0.y == pos1.y && pos3.y == pos2.y) {
 			// Looking like yes.  Set TL/BR based on y order first...
 			*tlIndex = pos0.y > pos3.y ? 2 : 0;
@ -505,13 +505,13 @@ bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data
 			if (!state.enableTextures)
 				return true;

-			const auto &textl = data[*tlIndex].texturecoords, &textr = data[*tlIndex ^ 1].texturecoords;
-			const auto &texbl = data[*brIndex ^ 1].texturecoords, &texbr = data[*brIndex].texturecoords;
+			const auto &textl = data[*tlIndex].v.texturecoords, &textr = data[*tlIndex ^ 1].v.texturecoords;
+			const auto &texbl = data[*brIndex ^ 1].v.texturecoords, &texbr = data[*brIndex].v.texturecoords;

 			if (textl.x == texbl.x && textr.x == texbr.x && textl.y == textr.y && texbl.y == texbr.y) {
 				// Okay, the texture is also good, but let's avoid rotation issues.
-				const auto &postl = data[*tlIndex].screenpos;
-				const auto &posbr = data[*brIndex].screenpos;
+				const auto &postl = data[*tlIndex].v.screenpos;
+				const auto &posbr = data[*brIndex].v.screenpos;
 				return textl.y < texbr.y && postl.y < posbr.y && textl.x < texbr.x && postl.x < posbr.x;
 			}
 		}
@ -520,26 +520,26 @@ bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data
 	return false;
 }

-bool DetectRectangleFromPair(const RasterizerState &state, const VertexData data[6], int *tlIndex, int *brIndex) {
+bool DetectRectangleFromPair(const RasterizerState &state, const ClipVertexData data[6], int *tlIndex, int *brIndex) {
 	// Color and Z must be flat.  Also find the TL and BR meanwhile.
 	int tl = 0, br = 0;
 	for (int i = 1; i < 6; ++i) {
 		if (!AreCoordsRectangleCompatible(state, data[i], data[0]))
 			return false;

-		if (data[i].screenpos.x <= data[tl].screenpos.x && data[i].screenpos.y <= data[tl].screenpos.y)
+		if (data[i].v.screenpos.x <= data[tl].v.screenpos.x && data[i].v.screenpos.y <= data[tl].v.screenpos.y)
 			tl = i;
-		if (data[i].screenpos.x >= data[br].screenpos.x && data[i].screenpos.y >= data[br].screenpos.y)
+		if (data[i].v.screenpos.x >= data[br].v.screenpos.x && data[i].v.screenpos.y >= data[br].v.screenpos.y)
 			br = i;
 	}

 	*tlIndex = tl;
 	*brIndex = br;

-	auto xat = [&](int i) { return data[i].screenpos.x; };
-	auto yat = [&](int i) { return data[i].screenpos.y; };
-	auto uat = [&](int i) { return data[i].texturecoords.x; };
-	auto vat = [&](int i) { return data[i].texturecoords.y; };
+	auto xat = [&](int i) { return data[i].v.screenpos.x; };
+	auto yat = [&](int i) { return data[i].v.screenpos.y; };
+	auto uat = [&](int i) { return data[i].v.texturecoords.x; };
+	auto vat = [&](int i) { return data[i].v.texturecoords.y; };

 	// A likely order would be: TL, TR, BR, TL, BR, BL.  We'd have the last index of each.
 	// TODO: Make more generic.
@ -567,12 +567,12 @@ bool DetectRectangleFromPair(const RasterizerState &state, const VertexData data
 	return false;
 }

-bool DetectRectangleThroughModeSlices(const RasterizerState &state, const VertexData data[4]) {
+bool DetectRectangleThroughModeSlices(const RasterizerState &state, const ClipVertexData data[4]) {
 	// Color and Z must be flat.
 	for (int i = 1; i < 4; ++i) {
-		if (!(data[i].color0 == data[0].color0))
+		if (!(data[i].v.color0 == data[0].v.color0))
 			return false;
-		if (!(data[i].screenpos.z == data[0].screenpos.z)) {
+		if (!(data[i].v.screenpos.z == data[0].v.screenpos.z)) {
 			// Sometimes, we don't actually care about z.
 			if (state.pixelID.depthWrite || state.pixelID.DepthTestFunc() != GE_COMP_ALWAYS)
 				return false;
@ -580,15 +580,15 @@ bool DetectRectangleThroughModeSlices(const RasterizerState &state, const Vertex
 	}

 	// Games very commonly use vertical strips of rectangles.  Detect and combine.
-	const auto &tl1 = data[0].screenpos, &br1 = data[1].screenpos;
-	const auto &tl2 = data[2].screenpos, &br2 = data[3].screenpos;
+	const auto &tl1 = data[0].v.screenpos, &br1 = data[1].v.screenpos;
+	const auto &tl2 = data[2].v.screenpos, &br2 = data[3].v.screenpos;
 	if (tl1.y == tl2.y && br1.y == br2.y && br1.y > tl1.y) {
 		if (br1.x == tl2.x && tl1.x < br1.x && tl2.x < br2.x) {
 			if (!state.enableTextures)
 				return true;

-			const auto &textl1 = data[0].texturecoords, &texbr1 = data[1].texturecoords;
-			const auto &textl2 = data[2].texturecoords, &texbr2 = data[3].texturecoords;
+			const auto &textl1 = data[0].v.texturecoords, &texbr1 = data[1].v.texturecoords;
+			const auto &textl2 = data[2].v.texturecoords, &texbr2 = data[3].v.texturecoords;
 			if (textl1.y != textl2.y || texbr1.y != texbr2.y || textl1.y > texbr1.y)
 				return false;
 			if (texbr1.x != textl2.x || textl1.x > texbr1.x || textl2.x > texbr2.x)
--- a/GPU/Software/RasterizerRectangle.h
+++ b/GPU/Software/RasterizerRectangle.h
@ -20,8 +20,8 @@ namespace Rasterizer {
 	bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &binner);
 	void DrawSprite(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state);

-	bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData data[4], int *tlIndex, int *brIndex);
-	bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data, int c, int *tlIndex, int *brIndex);
-	bool DetectRectangleFromPair(const RasterizerState &state, const VertexData data[6], int *tlIndex, int *brIndex);
-	bool DetectRectangleThroughModeSlices(const RasterizerState &state, const VertexData data[4]);
+	bool DetectRectangleFromStrip(const RasterizerState &state, const ClipVertexData data[4], int *tlIndex, int *brIndex);
+	bool DetectRectangleFromFan(const RasterizerState &state, const ClipVertexData *data, int c, int *tlIndex, int *brIndex);
+	bool DetectRectangleFromPair(const RasterizerState &state, const ClipVertexData data[6], int *tlIndex, int *brIndex);
+	bool DetectRectangleThroughModeSlices(const RasterizerState &state, const ClipVertexData data[4]);
 }
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@ -102,22 +102,22 @@ void SoftwareDrawEngine::DispatchSubmitImm(GEPrimitiveType prim, TransformedVert
 		transformUnit.SubmitPrimitive(nullptr, nullptr, prim, 0, vertTypeID, nullptr, this);

 	for (int i = 0; i < vertexCount; i++) {
-		VertexData vert;
+		ClipVertexData vert;
 		vert.clippos = ClipCoords(buffer[i].pos);
-		vert.texturecoords.x = buffer[i].u;
-		vert.texturecoords.y = buffer[i].v;
+		vert.v.texturecoords.x = buffer[i].u;
+		vert.v.texturecoords.y = buffer[i].v;
 		if (gstate.isModeThrough()) {
-			vert.texturecoords.x *= gstate.getTextureWidth(0);
-			vert.texturecoords.y *= gstate.getTextureHeight(0);
+			vert.v.texturecoords.x *= gstate.getTextureWidth(0);
+			vert.v.texturecoords.y *= gstate.getTextureHeight(0);
 		} else {
 			vert.clippos.z *= 1.0f / 65535.0f;
 		}
-		vert.color0 = buffer[i].color0_32;
-		vert.color1 = gstate.isUsingSecondaryColor() && !gstate.isModeThrough() ? buffer[i].color1_32 : 0;
-		vert.fogdepth = buffer[i].fog;
-		vert.screenpos.x = (int)(buffer[i].x * 16.0f);
-		vert.screenpos.y = (int)(buffer[i].y * 16.0f);
-		vert.screenpos.z = (u16)(u32)buffer[i].z;
+		vert.v.color0 = buffer[i].color0_32;
+		vert.v.color1 = gstate.isUsingSecondaryColor() && !gstate.isModeThrough() ? buffer[i].color1_32 : 0;
+		vert.v.fogdepth = buffer[i].fog;
+		vert.v.screenpos.x = (int)(buffer[i].x * 16.0f);
+		vert.v.screenpos.y = (int)(buffer[i].y * 16.0f);
+		vert.v.screenpos.z = (u16)(u32)buffer[i].z;

 		transformUnit.SubmitImmVertex(vert, this);
 	}
@ -315,10 +315,10 @@ void ComputeTransformState(TransformState *state, const VertexReader &vreader) {
 		state->roundToScreen = &ClipToScreenInternal<false, true>;
 }

-VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState &state) {
+ClipVertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState &state) {
 	PROFILE_THIS_SCOPE("read_vert");
 	// If we ever thread this, we'll have to change this.
-	VertexData vertex;
+	ClipVertexData vertex;

 	ModelCoords pos;
 	// VertexDecoder normally scales z, but we want it unscaled.
@ -326,10 +326,10 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState

 	static Vec2f lastTC;
 	if (state.readUV) {
-		vreader.ReadUV(vertex.texturecoords.AsArray());
-		lastTC = vertex.texturecoords;
+		vreader.ReadUV(vertex.v.texturecoords.AsArray());
+		lastTC = vertex.v.texturecoords;
 	} else {
-		vertex.texturecoords = lastTC;
+		vertex.v.texturecoords = lastTC;
 	}

 	Vec3f normal;
@ -366,12 +366,12 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
 	}

 	if (vreader.hasColor0()) {
-		vreader.ReadColor0_8888((u8 *)&vertex.color0);
+		vreader.ReadColor0_8888((u8 *)&vertex.v.color0);
 	} else {
-		vertex.color0 = gstate.getMaterialAmbientRGBA();
+		vertex.v.color0 = gstate.getMaterialAmbientRGBA();
 	}

-	vertex.color1 = 0;
+	vertex.v.color1 = 0;

 	if (state.enableTransform) {
 		WorldCoords worldpos;
@ -396,18 +396,19 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
 		screenScaled = vertex.clippos.xyz() * state.screenScale / vertex.clippos.w + state.screenAdd;
 #endif
 		bool outside_range_flag = false;
-		vertex.screenpos = state.roundToScreen(screenScaled, vertex.clippos, &outside_range_flag);
+		vertex.v.screenpos = state.roundToScreen(screenScaled, vertex.clippos, &outside_range_flag);
 		if (outside_range_flag) {
 			// We use this, essentially, as the flag.
-			vertex.screenpos.x = 0x7FFFFFFF;
+			vertex.v.screenpos.x = 0x7FFFFFFF;
 			return vertex;
 		}

 		if (state.enableFog) {
-			vertex.fogdepth = Dot(state.posToFog, Vec4f(pos, 1.0f));
+			vertex.v.fogdepth = Dot(state.posToFog, Vec4f(pos, 1.0f));
 		} else {
-			vertex.fogdepth = 1.0f;
+			vertex.v.fogdepth = 1.0f;
 		}
+		vertex.v.clipw = vertex.clippos.w;

 		Vec3<float> worldnormal;
 		if (vreader.hasNormal()) {
@ -426,7 +427,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
 				break;

 			case GE_PROJMAP_UV:
-				source = Vec3f(vertex.texturecoords, 0.0f);
+				source = Vec3f(vertex.v.texturecoords, 0.0f);
 				break;

 			case GE_PROJMAP_NORMALIZED_NORMAL:
@ -444,23 +445,23 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
 				break;
 			}

-			// TODO: What about uv scale and offset?
+			// Note that UV scale/offset are not used in this mode.
 			Vec3<float> stq = Vec3ByMatrix43(source, gstate.tgenMatrix);
 			float z_recip = 1.0f / stq.z;
-			vertex.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip);
+			vertex.v.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip);
 		} else if (state.uvGenMode == GE_TEXMAP_ENVIRONMENT_MAP) {
-			Lighting::GenerateLightST(vertex, worldnormal);
+			Lighting::GenerateLightST(vertex.v, worldnormal);
 		}

 		PROFILE_THIS_SCOPE("light");
 		if (state.enableLighting)
-			Lighting::Process(vertex, worldpos, worldnormal, state.lightingState);
+			Lighting::Process(vertex.v, worldpos, worldnormal, state.lightingState);
 	} else {
-		vertex.screenpos.x = (int)(pos[0] * SCREEN_SCALE_FACTOR);
-		vertex.screenpos.y = (int)(pos[1] * SCREEN_SCALE_FACTOR);
-		vertex.screenpos.z = pos[2];
-		vertex.clippos.w = 1.f;
-		vertex.fogdepth = 1.f;
+		vertex.v.screenpos.x = (int)(pos[0] * SCREEN_SCALE_FACTOR);
+		vertex.v.screenpos.y = (int)(pos[1] * SCREEN_SCALE_FACTOR);
+		vertex.v.screenpos.z = pos[2];
+		vertex.v.clipw = 1.0f;
+		vertex.v.fogdepth = 1.0f;
 	}

 	return vertex;
@ -511,7 +512,7 @@ public:
 		}
 	}

-	inline VertexData Read(int vtx) {
+	inline ClipVertexData Read(int vtx) {
 		if (useIndices_) {
 			if (useCache_) {
 				return cached_[conv_(vtx) - lowerBound_];
@ -531,13 +532,13 @@ protected:
 	TransformUnit &transform_;
 	uint16_t lowerBound_;
 	uint16_t upperBound_;
-	static std::vector<VertexData> cached_;
+	static std::vector<ClipVertexData> cached_;
 	bool useIndices_ = false;
 	bool useCache_ = false;
 };

 // Static to reduce allocations mid-frame.
-std::vector<VertexData> SoftwareVertexReader::cached_;
+std::vector<ClipVertexData> SoftwareVertexReader::cached_;

 void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, GEPrimitiveType prim_type, int vertex_count, u32 vertex_type, int *bytesRead, SoftwareDrawEngine *drawEngine)
 {
@ -580,7 +581,7 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
 	if (vreader.IsThrough() && cullType == CullType::OFF && prim_type == GE_PRIM_TRIANGLES && data_index_ == 0 && vertex_count >= 6 && ((vertex_count) % 6) == 0) {
 		// Some games send rectangles as a series of regular triangles.
 		// We look for this, but only in throughmode.
-		VertexData buf[6];
+		ClipVertexData buf[6];
 		int buf_index = data_index_;
 		for (int i = 0; i < data_index_; ++i) {
 			buf[i] = data_[i];
@ -831,7 +832,7 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
 	}
 }

-void TransformUnit::SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *drawEngine) {
+void TransformUnit::SubmitImmVertex(const ClipVertexData &vert, SoftwareDrawEngine *drawEngine) {
 	// Where we put it is different for STRIP/FAN types.
 	switch (prev_prim_) {
 	case GE_PRIM_POINTS:
@ -872,7 +873,7 @@ void TransformUnit::SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *
 	isImmDraw_ = false;
 }

-void TransformUnit::SendTriangle(CullType cullType, const VertexData *verts, int provoking) {
+void TransformUnit::SendTriangle(CullType cullType, const ClipVertexData *verts, int provoking) {
 	if (cullType == CullType::OFF) {
 		Clipper::ProcessTriangle(verts[0], verts[1], verts[2], verts[provoking], *binner_);
 		Clipper::ProcessTriangle(verts[2], verts[1], verts[0], verts[provoking], *binner_);
--- a/GPU/Software/TransformUnit.h
+++ b/GPU/Software/TransformUnit.h
@ -78,28 +78,33 @@ struct DrawingCoords {
 	s16 y;
 };

-struct VertexData {
-	void Lerp(float t, const VertexData &a, const VertexData &b) {
-		clippos = ::Lerp(a.clippos, b.clippos, t);
-		// Ignore screenpos because Lerp() is only used pre-calculation of screenpos.
-		texturecoords = ::Lerp(a.texturecoords, b.texturecoords, t);
-		fogdepth = ::Lerp(a.fogdepth, b.fogdepth, t);
-
-		u16 t_int = (u16)(t * 256);
-		color0 = LerpInt<Vec4<int>, 256>(Vec4<int>::FromRGBA(a.color0), Vec4<int>::FromRGBA(b.color0), t_int).ToRGBA();
-		color1 = LerpInt<Vec3<int>, 256>(Vec3<int>::FromRGB(a.color1), Vec3<int>::FromRGB(b.color1), t_int).ToRGB();
-	}
-
-	bool OutsideRange() const {
-		return screenpos.x == 0x7FFFFFFF;
-	}
-
-	ClipCoords clippos;
+struct alignas(16) VertexData {
 	Vec2<float> texturecoords;
 	uint32_t color0;
 	uint32_t color1;
-	ScreenCoords screenpos; // TODO: Shouldn't store this ?
+	ScreenCoords screenpos;
 	float fogdepth;
+	float clipw;
+};
+
+struct ClipVertexData {
+	void Lerp(float t, const ClipVertexData &a, const ClipVertexData &b) {
+		clippos = ::Lerp(a.clippos, b.clippos, t);
+		// Ignore screenpos because Lerp() is only used pre-calculation of screenpos.
+		v.texturecoords = ::Lerp(a.v.texturecoords, b.v.texturecoords, t);
+		v.fogdepth = ::Lerp(a.v.fogdepth, b.v.fogdepth, t);
+
+		u16 t_int = (u16)(t * 256);
+		v.color0 = LerpInt<Vec4<int>, 256>(Vec4<int>::FromRGBA(a.v.color0), Vec4<int>::FromRGBA(b.v.color0), t_int).ToRGBA();
+		v.color1 = LerpInt<Vec3<int>, 256>(Vec3<int>::FromRGB(a.v.color1), Vec3<int>::FromRGB(b.v.color1), t_int).ToRGB();
+	}
+
+	bool OutsideRange() const {
+		return v.screenpos.x == 0x7FFFFFFF;
+	}
+
+	ClipCoords clippos;
+	VertexData v;
 };

 class VertexReader;
@ -130,7 +135,7 @@ public:
 	static ScreenCoords DrawingToScreen(const DrawingCoords &coords, u16 z);

 	void SubmitPrimitive(const void* vertices, const void* indices, GEPrimitiveType prim_type, int vertex_count, u32 vertex_type, int *bytesRead, SoftwareDrawEngine *drawEngine);
-	void SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *drawEngine);
+	void SubmitImmVertex(const ClipVertexData &vert, SoftwareDrawEngine *drawEngine);

 	bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);

@ -144,14 +149,14 @@ public:
 	SoftDirty GetDirty();

 private:
-	VertexData ReadVertex(VertexReader &vreader, const TransformState &state);
-	void SendTriangle(CullType cullType, const VertexData *verts, int provoking = 2);
+	ClipVertexData ReadVertex(VertexReader &vreader, const TransformState &state);
+	void SendTriangle(CullType cullType, const ClipVertexData *verts, int provoking = 2);

 	u8 *decoded_ = nullptr;
 	BinManager *binner_ = nullptr;

 	// Normally max verts per prim is 3, but we temporarily need 4 to detect rectangles from strips.
-	VertexData data_[4];
+	ClipVertexData data_[4];
 	// This is the index of the next vert in data (or higher, may need modulus.)
 	int data_index_ = 0;
 	GEPrimitiveType prev_prim_ = GE_PRIM_POINTS;