From cba2374abd35e77055bff817c282d8975be48b9d Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Mon, 3 Jan 2022 22:56:26 -0800
Subject: [PATCH 01/10] softgpu: Separate calculation of S/T.

We could probably reuse, but we're not right now and it complicates the
logic.
---
 GPU/Software/Lighting.cpp      | 36 +++++++++++++++-------------------
 GPU/Software/Lighting.h        |  3 ++-
 GPU/Software/TransformUnit.cpp |  5 ++++-
 3 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index d4d7a020ee..35a1dde62a 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -43,6 +43,22 @@ static inline float pspLightPow(float v, float e) {
 	return v;
 }
 
+static inline float GenerateLightCoord(VertexData &vertex, int light) {
+	// TODO: Should specular lighting should affect this, too?  Doesn't in GLES.
+	Vec3<float> L = GetLightVec(gstate.lpos, light);
+	// In other words, L.Length2() == 0.0f means Dot({0, 0, 1}, worldnormal).
+	float diffuse_factor = Dot(L.NormalizedOr001(cpu_info.bSSE4_1), vertex.worldnormal);
+
+	return (diffuse_factor + 1.0f) / 2.0f;
+}
+
+void GenerateLightST(VertexData &vertex) {
+	// Always calculate texture coords from lighting results if environment mapping is active
+	// This should be done even if lighting is disabled altogether.
+	vertex.texturecoords.s() = GenerateLightCoord(vertex, gstate.getUVLS0());
+	vertex.texturecoords.t() = GenerateLightCoord(vertex, gstate.getUVLS1());
+}
+
 void Process(VertexData& vertex, bool hasColor) {
 	const int materialupdate = gstate.materialupdate & (hasColor ? 7 : 0);
 
@@ -53,26 +69,6 @@ void Process(VertexData& vertex, bool hasColor) {
 	Vec3<float> final_color = mec + mac * Vec3<float>::FromRGB(gstate.getAmbientRGBA());
 	Vec3<float> specular_color(0.0f, 0.0f, 0.0f);
 
-	for (unsigned int light = 0; light < 4; ++light) {
-		// Always calculate texture coords from lighting results if environment mapping is active
-		// TODO: Should specular lighting should affect this, too?  Doesn't in GLES.
-		// This should be done even if lighting is disabled altogether.
-		if (gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP) {
-			Vec3<float> L = GetLightVec(gstate.lpos, light);
-			// In other words, L.Length2() == 0.0f means Dot({0, 0, 1}, worldnormal).
-			float diffuse_factor = Dot(L.NormalizedOr001(cpu_info.bSSE4_1), vertex.worldnormal);
-
-			if (gstate.getUVLS0() == (int)light)
-				vertex.texturecoords.s() = (diffuse_factor + 1.f) / 2.f;
-
-			if (gstate.getUVLS1() == (int)light)
-				vertex.texturecoords.t() = (diffuse_factor + 1.f) / 2.f;
-		}
-	}
-
-	if (!gstate.isLightingEnabled())
-		return;
-
 	for (unsigned int light = 0; light < 4; ++light) {
 		if (!gstate.isLightChanEnabled(light))
 			continue;
diff --git a/GPU/Software/Lighting.h b/GPU/Software/Lighting.h
index 9e08eacade..e5915e6aac 100644
--- a/GPU/Software/Lighting.h
+++ b/GPU/Software/Lighting.h
@@ -21,6 +21,7 @@
 
 namespace Lighting {
 
+void GenerateLightST(VertexData &vertex);
 void Process(VertexData& vertex, bool hasColor);
 
-}
\ No newline at end of file
+}
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index bdecb90372..b15bb0115c 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -277,10 +277,13 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 			Vec3<float> stq = tgen * source + Vec3<float>(gstate.tgenMatrix[9], gstate.tgenMatrix[10], gstate.tgenMatrix[11]);
 			float z_recip = 1.0f / stq.z;
 			vertex.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip);
+		} else if (gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP) {
+			Lighting::GenerateLightST(vertex);
 		}
 
 		PROFILE_THIS_SCOPE("light");
-		Lighting::Process(vertex, vreader.hasColor0());
+		if (gstate.isLightingEnabled())
+			Lighting::Process(vertex, vreader.hasColor0());
 	} else {
 		vertex.screenpos.x = (int)(pos[0] * 16) + gstate.getOffsetX16();
 		vertex.screenpos.y = (int)(pos[1] * 16) + gstate.getOffsetY16();

From 079b67e7ed2203f9fc3bb58cd6a1005b5db16118 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 4 Jan 2022 09:00:50 -0800
Subject: [PATCH 02/10] softgpu: Use common SIMD matrix multiplies.

---
 GPU/Math3D.h                   | 101 +++++++++++++++++++++++++++++++++
 GPU/Software/TransformUnit.cpp |  61 ++++++++------------
 2 files changed, 124 insertions(+), 38 deletions(-)

diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index b8ede12cb8..02ab650811 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -38,6 +38,12 @@
 #endif
 #endif
 
+#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
+#define MATH3D_CALL __vectorcall
+#else
+#define MATH3D_CALL
+#endif
+
 namespace Math3D {
 
 // Helper for Vec classes to clamp values.
@@ -913,6 +919,38 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 #endif
 }
 
+inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
+#if defined(_M_SSE)
+	__m128 col0 = _mm_loadu_ps(m);
+	__m128 col1 = _mm_loadu_ps(m + 3);
+	__m128 col2 = _mm_loadu_ps(m + 6);
+	__m128 col3 = _mm_loadu_ps(m + 9);
+	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+	__m128 sum = _mm_add_ps(
+		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
+		_mm_add_ps(_mm_mul_ps(col2, z), col3));
+	return sum;
+#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+	float32x4_t col0 = vld1q_f32(m);
+	float32x4_t col1 = vld1q_f32(m + 3);
+	float32x4_t col2 = vld1q_f32(m + 6);
+	float32x4_t col3 = vld1q_f32(m + 9);
+	float32x4_t vec = v.vec;
+	float32x4_t sum = vaddq_f32(
+		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
+		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
+	return sum;
+#else
+	Vec3f vecOut;
+	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
+	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
+	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
+	return vecOut;
+#endif
+}
+
 inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 {
 #if defined(_M_SSE)
@@ -945,6 +983,39 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 #endif
 }
 
+inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
+#if defined(_M_SSE)
+	__m128 col0 = _mm_loadu_ps(m);
+	__m128 col1 = _mm_loadu_ps(m + 4);
+	__m128 col2 = _mm_loadu_ps(m + 8);
+	__m128 col3 = _mm_loadu_ps(m + 12);
+	__m128 x = _mm_set1_ps(v[0]);
+	__m128 y = _mm_set1_ps(v[1]);
+	__m128 z = _mm_set1_ps(v[2]);
+	__m128 sum = _mm_add_ps(
+		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
+		_mm_add_ps(_mm_mul_ps(col2, z), col3));
+	return sum;
+#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+	float32x4_t col0 = vld1q_f32(m);
+	float32x4_t col1 = vld1q_f32(m + 4);
+	float32x4_t col2 = vld1q_f32(m + 8);
+	float32x4_t col3 = vld1q_f32(m + 12);
+	float32x4_t vec = v.vec;
+	float32x4_t sum = vaddq_f32(
+		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
+		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
+	return sum;
+#else
+	Vec4f vecOut;
+	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
+	vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13];
+	vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];
+	vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];
+	return vecOut;
+#endif
+}
+
 inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 {
 	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
@@ -952,6 +1023,36 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
 	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
 }
 
+inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
+#if defined(_M_SSE)
+	__m128 col0 = _mm_loadu_ps(m);
+	__m128 col1 = _mm_loadu_ps(m + 3);
+	__m128 col2 = _mm_loadu_ps(m + 6);
+	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+	__m128 sum = _mm_add_ps(
+		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
+		_mm_mul_ps(col2, z));
+	return sum;
+#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+	float32x4_t col0 = vld1q_f32(m);
+	float32x4_t col1 = vld1q_f32(m + 3);
+	float32x4_t col2 = vld1q_f32(m + 6);
+	float32x4_t vec = v.vec;
+	float32x4_t sum = vaddq_f32(
+		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
+		vmulq_laneq_f32(col2, vec, 2));
+	return sum;
+#else
+	Vec3f vecOut;
+	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
+	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
+	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
+	return vecOut;
+#endif
+}
+
 inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {
 	fast_matrix_mul_4x4(out, b, a);
 }
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index b15bb0115c..7902e95aff 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -67,29 +67,20 @@ VertexDecoder *SoftwareDrawEngine::FindVertexDecoder(u32 vtype) {
 	return DrawEngineCommon::GetVertexDecoder(vertTypeID);
 }
 
-WorldCoords TransformUnit::ModelToWorld(const ModelCoords& coords)
-{
-	Mat3x3<float> world_matrix(gstate.worldMatrix);
-	return WorldCoords(world_matrix * coords) + Vec3<float>(gstate.worldMatrix[9], gstate.worldMatrix[10], gstate.worldMatrix[11]);
+WorldCoords TransformUnit::ModelToWorld(const ModelCoords &coords) {
+	return Vec3ByMatrix43(coords, gstate.worldMatrix);
 }
 
-WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords& coords)
-{
-	Mat3x3<float> world_matrix(gstate.worldMatrix);
-	return WorldCoords(world_matrix * coords);
+WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords &coords) {
+	return Norm3ByMatrix43(coords, gstate.worldMatrix);
 }
 
-ViewCoords TransformUnit::WorldToView(const WorldCoords& coords)
-{
-	Mat3x3<float> view_matrix(gstate.viewMatrix);
-	return ViewCoords(view_matrix * coords) + Vec3<float>(gstate.viewMatrix[9], gstate.viewMatrix[10], gstate.viewMatrix[11]);
+ViewCoords TransformUnit::WorldToView(const WorldCoords &coords) {
+	return Vec3ByMatrix43(coords, gstate.viewMatrix);
 }
 
-ClipCoords TransformUnit::ViewToClip(const ViewCoords& coords)
-{
-	Vec4<float> coords4(coords.x, coords.y, coords.z, 1.0f);
-	Mat4x4<float> projection_matrix(gstate.projMatrix);
-	return ClipCoords(projection_matrix * coords4);
+ClipCoords TransformUnit::ViewToClip(const ViewCoords &coords) {
+	return Vec3ByMatrix44(coords, gstate.projMatrix);
 }
 
 static inline ScreenCoords ClipToScreenInternal(const ClipCoords& coords, bool *outside_range_flag) {
@@ -161,20 +152,16 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 	PROFILE_THIS_SCOPE("read_vert");
 	VertexData vertex;
 
-	float pos[3];
+	ModelCoords pos;
 	// VertexDecoder normally scales z, but we want it unscaled.
-	vreader.ReadPosThroughZ16(pos);
+	vreader.ReadPosThroughZ16(pos.AsArray());
 
 	if (!gstate.isModeClear() && gstate.isTextureMapEnabled() && vreader.hasUV()) {
-		float uv[2];
-		vreader.ReadUV(uv);
-		vertex.texturecoords = Vec2<float>(uv[0], uv[1]);
+		vreader.ReadUV(vertex.texturecoords.AsArray());
 	}
 
 	if (vreader.hasNormal()) {
-		float normal[3];
-		vreader.ReadNrm(normal);
-		vertex.normal = Vec3<float>(normal[0], normal[1], normal[2]);
+		vreader.ReadNrm(vertex.normal.AsArray());
 
 		if (gstate.areNormalsReversed())
 			vertex.normal = -vertex.normal;
@@ -188,15 +175,15 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 		Vec3<float> tmpnrm(0.f, 0.f, 0.f);
 
 		for (int i = 0; i < vertTypeGetNumBoneWeights(gstate.vertType); ++i) {
-			Mat3x3<float> bone(&gstate.boneMatrix[12*i]);
-			tmppos += (bone * ModelCoords(pos[0], pos[1], pos[2]) + Vec3<float>(gstate.boneMatrix[12*i+9], gstate.boneMatrix[12*i+10], gstate.boneMatrix[12*i+11])) * W[i];
-			if (vreader.hasNormal())
-				tmpnrm += (bone * vertex.normal) * W[i];
+			Vec3<float> step = Vec3ByMatrix43(pos, gstate.boneMatrix + i * 12);
+			tmppos += step * W[i];
+			if (vreader.hasNormal()) {
+				step = Norm3ByMatrix43(vertex.normal, gstate.boneMatrix + i * 12);
+				tmpnrm += step * W[i];
+			}
 		}
 
-		pos[0] = tmppos.x;
-		pos[1] = tmppos.y;
-		pos[2] = tmppos.z;
+		pos = tmppos;
 		if (vreader.hasNormal())
 			vertex.normal = tmpnrm;
 	}
@@ -206,7 +193,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 		vreader.ReadColor0(col);
 		vertex.color0 = Vec4<int>(col[0]*255, col[1]*255, col[2]*255, col[3]*255);
 	} else {
-		vertex.color0 = Vec4<int>(gstate.getMaterialAmbientR(), gstate.getMaterialAmbientG(), gstate.getMaterialAmbientB(), gstate.getMaterialAmbientA());
+		vertex.color0 = Vec4<int>::FromRGBA(gstate.getMaterialAmbientRGBA());
 	}
 
 	if (vreader.hasColor1()) {
@@ -218,7 +205,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 	}
 
 	if (!gstate.isModeThrough()) {
-		vertex.modelpos = ModelCoords(pos[0], pos[1], pos[2]);
+		vertex.modelpos = pos;
 		vertex.worldpos = WorldCoords(TransformUnit::ModelToWorld(vertex.modelpos));
 		ModelCoords viewpos = TransformUnit::WorldToView(vertex.worldpos);
 		vertex.clippos = ClipCoords(TransformUnit::ViewToClip(viewpos));
@@ -240,8 +227,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 		vertex.screenpos = ClipToScreenInternal(vertex.clippos, &outside_range_flag);
 
 		if (vreader.hasNormal()) {
-			vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal);
-			vertex.worldnormal /= vertex.worldnormal.Length();
+			vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).Normalized(cpu_info.bSSE4_1);
 		} else {
 			vertex.worldnormal = Vec3<float>(0.0f, 0.0f, 1.0f);
 		}
@@ -273,8 +259,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 			}
 
 			// TODO: What about uv scale and offset?
-			Mat3x3<float> tgen(gstate.tgenMatrix);
-			Vec3<float> stq = tgen * source + Vec3<float>(gstate.tgenMatrix[9], gstate.tgenMatrix[10], gstate.tgenMatrix[11]);
+			Vec3<float> stq = Vec3ByMatrix43(source, gstate.tgenMatrix);
 			float z_recip = 1.0f / stq.z;
 			vertex.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip);
 		} else if (gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP) {

From e7d66f202992ab246f06441f7f66d7789cf63394 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 4 Jan 2022 09:21:39 -0800
Subject: [PATCH 03/10] softgpu: Reuse SSE/NEON matrix code.

---
 GPU/Math3D.h | 161 +++++++++++++++++++++++++++------------------------
 1 file changed, 84 insertions(+), 77 deletions(-)

diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 02ab650811..21f8f99cb8 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -882,33 +882,44 @@ float vectorGetByIndex(__m128 v) {
 }
 #endif
 
-// v and vecOut must point to different memory.
-inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
 #if defined(_M_SSE)
+inline __m128 MATH3D_CALL Vec3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) {
 	__m128 col0 = _mm_loadu_ps(m);
 	__m128 col1 = _mm_loadu_ps(m + 3);
 	__m128 col2 = _mm_loadu_ps(m + 6);
 	__m128 col3 = _mm_loadu_ps(m + 9);
-	__m128 x = _mm_set1_ps(v[0]);
-	__m128 y = _mm_set1_ps(v[1]);
-	__m128 z = _mm_set1_ps(v[2]);
 	__m128 sum = _mm_add_ps(
 		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
 		_mm_add_ps(_mm_mul_ps(col2, z), col3));
+	return sum;
+}
+#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+inline float32x4_t Vec3ByMatrix43(float32x4_t vec, const float m[16]) {
+	float32x4_t col0 = vld1q_f32(m);
+	float32x4_t col1 = vld1q_f32(m + 3);
+	float32x4_t col2 = vld1q_f32(m + 6);
+	float32x4_t col3 = vld1q_f32(m + 9);
+	float32x4_t sum = vaddq_f32(
+		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
+		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
+	return sum;
+}
+#endif
+
+// v and vecOut must point to different memory.
+inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
+#if defined(_M_SSE)
+	__m128 x = _mm_set1_ps(v[0]);
+	__m128 y = _mm_set1_ps(v[1]);
+	__m128 z = _mm_set1_ps(v[2]);
+	__m128 sum = Vec3ByMatrix43(x, y, z, m);
 	// Not sure what the best way to store 3 elements is. Ideally, we should
 	// probably store all four.
 	vecOut[0] = _mm_cvtss_f32(sum);
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	float32x4_t col0 = vld1q_f32(m);
-	float32x4_t col1 = vld1q_f32(m + 3);
-	float32x4_t col2 = vld1q_f32(m + 6);
-	float32x4_t col3 = vld1q_f32(m + 9);
-	float32x4_t vec = vld1q_f32(v);
-	float32x4_t sum = vaddq_f32(
-		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
-		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
+	float32x4_t sum = Vec3ByMatrix43(vld1q_f32(v), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
 	vecOut[1] = vgetq_lane_f32(sum, 1);
 	vecOut[2] = vgetq_lane_f32(sum, 2);
@@ -921,59 +932,52 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 
 inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
 #if defined(_M_SSE)
-	__m128 col0 = _mm_loadu_ps(m);
-	__m128 col1 = _mm_loadu_ps(m + 3);
-	__m128 col2 = _mm_loadu_ps(m + 6);
-	__m128 col3 = _mm_loadu_ps(m + 9);
 	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
 	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
-	__m128 sum = _mm_add_ps(
-		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
-		_mm_add_ps(_mm_mul_ps(col2, z), col3));
-	return sum;
+	return Vec3ByMatrix43(x, y, z, m);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	float32x4_t col0 = vld1q_f32(m);
-	float32x4_t col1 = vld1q_f32(m + 3);
-	float32x4_t col2 = vld1q_f32(m + 6);
-	float32x4_t col3 = vld1q_f32(m + 9);
-	float32x4_t vec = v.vec;
-	float32x4_t sum = vaddq_f32(
-		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
-		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
-	return sum;
+	return Vec3ByMatrix43(v.vec, m);
 #else
 	Vec3f vecOut;
-	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
-	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
-	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
+	Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
 	return vecOut;
 #endif
 }
 
-inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
-{
 #if defined(_M_SSE)
+inline __m128 MATH3D_CALL Vec3ByMatrix44(__m128 x, __m128 y, __m128 z, const float m[16]) {
 	__m128 col0 = _mm_loadu_ps(m);
 	__m128 col1 = _mm_loadu_ps(m + 4);
 	__m128 col2 = _mm_loadu_ps(m + 8);
 	__m128 col3 = _mm_loadu_ps(m + 12);
-	__m128 x = _mm_set1_ps(v[0]);
-	__m128 y = _mm_set1_ps(v[1]);
-	__m128 z = _mm_set1_ps(v[2]);
 	__m128 sum = _mm_add_ps(
 		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
 		_mm_add_ps(_mm_mul_ps(col2, z), col3));
-	_mm_storeu_ps(vecOut, sum);
+	return sum;
+}
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+inline float32x4_t Vec3ByMatrix44(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 4);
 	float32x4_t col2 = vld1q_f32(m + 8);
 	float32x4_t col3 = vld1q_f32(m + 12);
-	float32x4_t vec = vld1q_f32(v);
 	float32x4_t sum = vaddq_f32(
 		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
 		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
+	return sum;
+}
+#endif
+
+inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
+#if defined(_M_SSE)
+	__m128 x = _mm_set1_ps(v[0]);
+	__m128 y = _mm_set1_ps(v[1]);
+	__m128 z = _mm_set1_ps(v[2]);
+	__m128 sum = Vec3ByMatrix44(x, y, z, m);
+	_mm_storeu_ps(vecOut, sum);
+#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+	float32x4_t sum = Vec3ByMatrix44(vld1q_f32(v), m);
 	vst1q_f32(vecOut, sum);
 #else
 	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
@@ -985,70 +989,73 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 
 inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
 #if defined(_M_SSE)
-	__m128 col0 = _mm_loadu_ps(m);
-	__m128 col1 = _mm_loadu_ps(m + 4);
-	__m128 col2 = _mm_loadu_ps(m + 8);
-	__m128 col3 = _mm_loadu_ps(m + 12);
-	__m128 x = _mm_set1_ps(v[0]);
-	__m128 y = _mm_set1_ps(v[1]);
-	__m128 z = _mm_set1_ps(v[2]);
-	__m128 sum = _mm_add_ps(
-		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
-		_mm_add_ps(_mm_mul_ps(col2, z), col3));
-	return sum;
+	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+	return Vec3ByMatrix44(x, y, z, m);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	float32x4_t col0 = vld1q_f32(m);
-	float32x4_t col1 = vld1q_f32(m + 4);
-	float32x4_t col2 = vld1q_f32(m + 8);
-	float32x4_t col3 = vld1q_f32(m + 12);
-	float32x4_t vec = v.vec;
-	float32x4_t sum = vaddq_f32(
-		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
-		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
-	return sum;
+	return Vec3ByMatrix44(v.vec, m);
 #else
 	Vec4f vecOut;
-	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
-	vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13];
-	vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];
-	vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];
+	Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m);
 	return vecOut;
 #endif
 }
 
-inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
-{
-	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
-	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
-	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
-}
-
-inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
 #if defined(_M_SSE)
+inline __m128 MATH3D_CALL Norm3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) {
 	__m128 col0 = _mm_loadu_ps(m);
 	__m128 col1 = _mm_loadu_ps(m + 3);
 	__m128 col2 = _mm_loadu_ps(m + 6);
-	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
-	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
-	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
 	__m128 sum = _mm_add_ps(
 		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
 		_mm_mul_ps(col2, z));
 	return sum;
+}
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+inline float32x4_t Norm3ByMatrix43(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 3);
 	float32x4_t col2 = vld1q_f32(m + 6);
-	float32x4_t vec = v.vec;
 	float32x4_t sum = vaddq_f32(
 		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
 		vmulq_laneq_f32(col2, vec, 2));
 	return sum;
+}
+#endif
+
+inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
+#if defined(_M_SSE)
+	__m128 x = _mm_set1_ps(v[0]);
+	__m128 y = _mm_set1_ps(v[1]);
+	__m128 z = _mm_set1_ps(v[2]);
+	__m128 sum = Norm3ByMatrix43(x, y, z, m);
+	vecOut[0] = _mm_cvtss_f32(sum);
+	vecOut[1] = vectorGetByIndex<1>(sum);
+	vecOut[2] = vectorGetByIndex<2>(sum);
+#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+	float32x4_t sum = Norm3ByMatrix43(vld1q_f32(v), m);
+	vecOut[0] = vgetq_lane_f32(sum, 0);
+	vecOut[1] = vgetq_lane_f32(sum, 1);
+	vecOut[2] = vgetq_lane_f32(sum, 2);
 #else
-	Vec3f vecOut;
 	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
 	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
 	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
+#endif
+}
+
+inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
+#if defined(_M_SSE)
+	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+	return Norm3ByMatrix43(x, y, z, m);
+#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+	return Norm3ByMatrix43(v.vec, m);
+#else
+	Vec3f vecOut;
+	Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
 	return vecOut;
 #endif
 }

From fa80c448ee47a001ee5499811150f4fda5fa41ef Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 4 Jan 2022 23:42:01 -0800
Subject: [PATCH 04/10] softgpu: More closely match PSP light rounding.

---
 GPU/Software/Lighting.cpp | 55 +++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 35a1dde62a..0f11c2c24a 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -62,12 +62,16 @@ void GenerateLightST(VertexData &vertex) {
 void Process(VertexData& vertex, bool hasColor) {
 	const int materialupdate = gstate.materialupdate & (hasColor ? 7 : 0);
 
-	Vec3<float> vcol0 = vertex.color0.rgb().Cast<float>() * Vec3<float>::AssignToAll(1.0f / 255.0f);
-	Vec3<float> mec = Vec3<float>::FromRGB(gstate.getMaterialEmissive());
+	Vec4<int> mec = Vec4<int>::FromRGBA(gstate.getMaterialEmissive());
 
-	Vec3<float> mac = (materialupdate & 1) ? vcol0 : Vec3<float>::FromRGB(gstate.getMaterialAmbientRGBA());
-	Vec3<float> final_color = mec + mac * Vec3<float>::FromRGB(gstate.getAmbientRGBA());
-	Vec3<float> specular_color(0.0f, 0.0f, 0.0f);
+	Vec4<int> mac = (materialupdate & 1) ? vertex.color0 : Vec4<int>::FromRGBA(gstate.getMaterialAmbientRGBA());
+	Vec4<int> ac = Vec4<int>::FromRGBA(gstate.getAmbientRGBA());
+	// Ambient (whether vertex or material) rounds using the half offset method (like alpha blend.)
+	const Vec4<int> ones = Vec4<int>::AssignToAll(1);
+	Vec4<int> ambient = ((mac * 2 + ones) * (ac * 2 + ones)) / 1024;
+
+	Vec4<int> final_color = mec + ambient;
+	Vec4<int> specular_color = Vec4<int>::AssignToAll(0);
 
 	for (unsigned int light = 0; light < 4; ++light) {
 		if (!gstate.isLightChanEnabled(light))
@@ -103,13 +107,14 @@ void Process(VertexData& vertex, bool hasColor) {
 		}
 
 		// ambient lighting
-		Vec3<float> lac = Vec3<float>::FromRGB(gstate.getLightAmbientColor(light));
-		final_color += lac * mac * att * spot;
+		int attspot = (int)ceilf(256 * 2 * att * spot + 1);
+		if (attspot > 512)
+			attspot = 512;
+		Vec4<int> lac = Vec4<int>::FromRGBA(gstate.getLightAmbientColor(light));
+		Vec4<int> lambient = ((mac * 2 + ones) * (lac * 2 + ones) * attspot) / (1024 * 512);
+		final_color += lambient;
 
 		// diffuse lighting
-		Vec3<float> ldc = Vec3<float>::FromRGB(gstate.getDiffuseColor(light));
-		Vec3<float> mdc = (materialupdate & 2) ? vcol0 : Vec3<float>::FromRGB(gstate.getMaterialDiffuse());
-
 		float diffuse_factor = Dot(L, vertex.worldnormal);
 		if (gstate.isUsingPoweredDiffuseLight(light)) {
 			float k = gstate.getMaterialSpecularCoef();
@@ -117,35 +122,39 @@ void Process(VertexData& vertex, bool hasColor) {
 		}
 
 		if (diffuse_factor > 0.f) {
-			final_color += ldc * mdc * diffuse_factor * att * spot;
+			int diffuse_attspot = (int)ceilf(attspot * diffuse_factor + 1);
+			if (diffuse_attspot > 512)
+				diffuse_attspot = 512;
+			Vec4<int> ldc = Vec4<int>::FromRGBA(gstate.getDiffuseColor(light));
+			Vec4<int> mdc = (materialupdate & 2) ? vertex.color0 : Vec4<int>::FromRGBA(gstate.getMaterialDiffuse());
+			Vec4<int> ldiffuse = ((ldc * 2 + ones) * (mdc * 2 + ones) * diffuse_attspot) / (1024 * 512);
+			final_color += ldiffuse;
 		}
 
 		if (gstate.isUsingSpecularLight(light) && diffuse_factor >= 0.0f) {
 			Vec3<float> H = L + Vec3<float>(0.f, 0.f, 1.f);
 
-			Vec3<float> lsc = Vec3<float>::FromRGB(gstate.getSpecularColor(light));
-			Vec3<float> msc = (materialupdate & 4) ? vcol0 : Vec3<float>::FromRGB(gstate.getMaterialSpecular());
-
 			float specular_factor = Dot(H.NormalizedOr001(cpu_info.bSSE4_1), vertex.worldnormal);
 			float k = gstate.getMaterialSpecularCoef();
 			specular_factor = pspLightPow(specular_factor, k);
 
 			if (specular_factor > 0.f) {
-				specular_color += lsc * msc * specular_factor * att * spot;
+				int specular_attspot = (int)ceilf(attspot * specular_factor + 1);
+				if (specular_attspot > 512)
+					specular_attspot = 512;
+				Vec4<int> lsc = Vec4<int>::FromRGBA(gstate.getSpecularColor(light));
+				Vec4<int> msc = (materialupdate & 4) ? vertex.color0 : Vec4<int>::FromRGBA(gstate.getMaterialSpecular());
+				Vec4<int> lspecular = ((lsc * 2 + ones) * (msc * 2 + ones) * specular_attspot) / (1024 * 512);
+				specular_color += lspecular;
 			}
 		}
 	}
 
-	int maa = (materialupdate & 1) ? vertex.color0.a() : gstate.getMaterialAmbientA();
-	int final_alpha = (gstate.getAmbientA() * maa) / 255;
-
 	if (gstate.isUsingSecondaryColor()) {
-		Vec3<int> final_color_int = (final_color.Clamp(0.0f, 1.0f) * 255.0f).Cast<int>();
-		vertex.color0 = Vec4<int>(final_color_int, final_alpha);
-		vertex.color1 = (specular_color.Clamp(0.0f, 1.0f) * 255.0f).Cast<int>();
+		vertex.color0 = final_color.Clamp(0, 255);
+		vertex.color1 = specular_color.Clamp(0, 255).rgb();
 	} else {
-		Vec3<int> final_color_int = ((final_color + specular_color).Clamp(0.0f, 1.0f) * 255.0f).Cast<int>();
-		vertex.color0 = Vec4<int>(final_color_int, final_alpha);
+		vertex.color0 = (final_color + specular_color).Clamp(0, 255);
 	}
 }
 

From b86bdc9456d1e5e50ae3b0520ac71a4721c54356 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 5 Jan 2022 07:04:49 -0800
Subject: [PATCH 05/10] softgpu: Correct handling of NAN attenuation.

---
 GPU/Software/Lighting.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 0f11c2c24a..534edb9b66 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -86,11 +86,13 @@ void Process(VertexData& vertex, bool hasColor) {
 		// TODO: Should this normalize (0, 0, 0) to (0, 0, 1)?
 		float d = L.NormalizeOr001();
 
-		float att = 1.f;
+		float att = 1.0f;
 		if (!gstate.isDirectionalLight(light)) {
-			att = 1.f / Dot(GetLightVec(gstate.latt, light), Vec3f(1.0f, d, d * d));
-			if (att > 1.f) att = 1.f;
-			if (att < 0.f) att = 0.f;
+			att = 1.0f / Dot(GetLightVec(gstate.latt, light), Vec3f(1.0f, d, d * d));
+			if (!(att > 0.0f))
+				att = 0.0f;
+			else if (att > 1.0f)
+				att = 1.0f;
 		}
 
 		float spot = 1.f;

From 537e3577416ef9a2952e9fbfca1c67b1be1941b0 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 5 Jan 2022 20:25:22 -0800
Subject: [PATCH 06/10] softgpu: Correct NAN spotlight exponent/direction.

---
 GPU/Software/Lighting.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 534edb9b66..90dd5f5851 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -16,6 +16,7 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include "ppsspp_config.h"
+#include <cmath>
 #include "Common/CPUDetect.h"
 #include "GPU/GPUState.h"
 #include "GPU/Software/Lighting.h"
@@ -95,16 +96,20 @@ void Process(VertexData& vertex, bool hasColor) {
 				att = 1.0f;
 		}
 
-		float spot = 1.f;
+		float spot = 1.0f;
 		if (gstate.isSpotLight(light)) {
 			Vec3<float> dir = GetLightVec(gstate.ldir, light);
-			float rawSpot = Dot(dir.NormalizedOr001(cpu_info.bSSE4_1), L);
+			float rawSpot = Dot(dir.Normalized(cpu_info.bSSE4_1), L);
+			if (isnan(rawSpot))
+				rawSpot = 1.0f;
 			float cutoff = getFloat24(gstate.lcutoff[light]);
 			if (rawSpot >= cutoff) {
 				float conv = getFloat24(gstate.lconv[light]);
 				spot = pspLightPow(rawSpot, conv);
+				if (isnan(spot))
+					spot = 0.0f;
 			} else {
-				spot = 0.f;
+				spot = 0.0f;
 			}
 		}
 

From bd354164bcdeea2c89f651a033672c5b611e3716 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 5 Jan 2022 23:10:47 -0800
Subject: [PATCH 07/10] softgpu: Cleanup -NAN and diffuse factor.

---
 GPU/Software/Lighting.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 90dd5f5851..302d1a7055 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -34,7 +34,7 @@ static inline Vec3f GetLightVec(u32 lparams[12], int light) {
 }
 
 static inline float pspLightPow(float v, float e) {
-	if (e <= 0.0f) {
+	if (e <= 0.0f || (std::isnan(e) && std::signbit(e))) {
 		return 1.0f;
 	}
 	if (v > 0.0f) {
@@ -100,13 +100,15 @@ void Process(VertexData& vertex, bool hasColor) {
 		if (gstate.isSpotLight(light)) {
 			Vec3<float> dir = GetLightVec(gstate.ldir, light);
 			float rawSpot = Dot(dir.Normalized(cpu_info.bSSE4_1), L);
-			if (isnan(rawSpot))
-				rawSpot = 1.0f;
+			if (std::isnan(rawSpot))
+				rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f;
 			float cutoff = getFloat24(gstate.lcutoff[light]);
+			if (std::isnan(cutoff) && std::signbit(cutoff))
+				cutoff = 0.0f;
 			if (rawSpot >= cutoff) {
 				float conv = getFloat24(gstate.lconv[light]);
 				spot = pspLightPow(rawSpot, conv);
-				if (isnan(spot))
+				if (std::isnan(spot))
 					spot = 0.0f;
 			} else {
 				spot = 0.0f;
@@ -128,10 +130,8 @@ void Process(VertexData& vertex, bool hasColor) {
 			diffuse_factor = pspLightPow(diffuse_factor, k);
 		}
 
-		if (diffuse_factor > 0.f) {
+		if (diffuse_factor > 0.0f) {
 			int diffuse_attspot = (int)ceilf(attspot * diffuse_factor + 1);
-			if (diffuse_attspot > 512)
-				diffuse_attspot = 512;
 			Vec4<int> ldc = Vec4<int>::FromRGBA(gstate.getDiffuseColor(light));
 			Vec4<int> mdc = (materialupdate & 2) ? vertex.color0 : Vec4<int>::FromRGBA(gstate.getMaterialDiffuse());
 			Vec4<int> ldiffuse = ((ldc * 2 + ones) * (mdc * 2 + ones) * diffuse_attspot) / (1024 * 512);

From ce8a49b1c12abfcfc3ff41b54f41f1ae89cd5af3 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Thu, 6 Jan 2022 20:10:47 -0800
Subject: [PATCH 08/10] softgpu: Retain floats in diffuse/specular.

This seems to be a bit more accurate.  Color blending seems correct now,
but the factors and especially pow results are off.

Also, normalize normal to 0, 0, 1, which seems to match results better.
---
 GPU/Software/Lighting.cpp      | 8 +++++---
 GPU/Software/TransformUnit.cpp | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 302d1a7055..67e21b4028 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -131,7 +131,9 @@ void Process(VertexData& vertex, bool hasColor) {
 		}
 
 		if (diffuse_factor > 0.0f) {
-			int diffuse_attspot = (int)ceilf(attspot * diffuse_factor + 1);
+			int diffuse_attspot = (int)ceilf(256 * 2 * att * spot * diffuse_factor + 1);
+			if (diffuse_attspot > 512)
+				diffuse_attspot = 512;
 			Vec4<int> ldc = Vec4<int>::FromRGBA(gstate.getDiffuseColor(light));
 			Vec4<int> mdc = (materialupdate & 2) ? vertex.color0 : Vec4<int>::FromRGBA(gstate.getMaterialDiffuse());
 			Vec4<int> ldiffuse = ((ldc * 2 + ones) * (mdc * 2 + ones) * diffuse_attspot) / (1024 * 512);
@@ -145,8 +147,8 @@ void Process(VertexData& vertex, bool hasColor) {
 			float k = gstate.getMaterialSpecularCoef();
 			specular_factor = pspLightPow(specular_factor, k);
 
-			if (specular_factor > 0.f) {
-				int specular_attspot = (int)ceilf(attspot * specular_factor + 1);
+			if (specular_factor > 0.0f) {
+				int specular_attspot = (int)ceilf(256 * 2 * att * spot * specular_factor + 1);
 				if (specular_attspot > 512)
 					specular_attspot = 512;
 				Vec4<int> lsc = Vec4<int>::FromRGBA(gstate.getSpecularColor(light));
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index 7902e95aff..11eeda3f68 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -227,7 +227,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 		vertex.screenpos = ClipToScreenInternal(vertex.clippos, &outside_range_flag);
 
 		if (vreader.hasNormal()) {
-			vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).Normalized(cpu_info.bSSE4_1);
+			vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).NormalizedOr001(cpu_info.bSSE4_1);
 		} else {
 			vertex.worldnormal = Vec3<float>(0.0f, 0.0f, 1.0f);
 		}

From 43f71884ee2634e8eee12ffebb3c3dfdbae76cac Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Fri, 7 Jan 2022 17:53:24 -0800
Subject: [PATCH 09/10] softgpu: Clarify internal matrix multiply usage.

---
 GPU/Math3D.h | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 21f8f99cb8..a62905fa13 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -883,7 +883,8 @@ float vectorGetByIndex(__m128 v) {
 #endif
 
 #if defined(_M_SSE)
-inline __m128 MATH3D_CALL Vec3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) {
+// x, y, and z should be broadcast.  Should only be used through Vec3f version.
+inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
 	__m128 col0 = _mm_loadu_ps(m);
 	__m128 col1 = _mm_loadu_ps(m + 3);
 	__m128 col2 = _mm_loadu_ps(m + 6);
@@ -894,7 +895,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43(__m128 x, __m128 y, __m128 z, const flo
 	return sum;
 }
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-inline float32x4_t Vec3ByMatrix43(float32x4_t vec, const float m[16]) {
+inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 3);
 	float32x4_t col2 = vld1q_f32(m + 6);
@@ -912,14 +913,14 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 	__m128 x = _mm_set1_ps(v[0]);
 	__m128 y = _mm_set1_ps(v[1]);
 	__m128 z = _mm_set1_ps(v[2]);
-	__m128 sum = Vec3ByMatrix43(x, y, z, m);
+	__m128 sum = Vec3ByMatrix43Internal(x, y, z, m);
 	// Not sure what the best way to store 3 elements is. Ideally, we should
 	// probably store all four.
 	vecOut[0] = _mm_cvtss_f32(sum);
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	float32x4_t sum = Vec3ByMatrix43(vld1q_f32(v), m);
+	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(v), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
 	vecOut[1] = vgetq_lane_f32(sum, 1);
 	vecOut[2] = vgetq_lane_f32(sum, 2);
@@ -935,9 +936,9 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
 	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
 	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
-	return Vec3ByMatrix43(x, y, z, m);
+	return Vec3ByMatrix43Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	return Vec3ByMatrix43(v.vec, m);
+	return Vec3ByMatrix43Internal(v.vec, m);
 #else
 	Vec3f vecOut;
 	Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
@@ -946,7 +947,8 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
 }
 
 #if defined(_M_SSE)
-inline __m128 MATH3D_CALL Vec3ByMatrix44(__m128 x, __m128 y, __m128 z, const float m[16]) {
+// x, y, and z should be broadcast.  Should only be used through Vec3f version.
+inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) {
 	__m128 col0 = _mm_loadu_ps(m);
 	__m128 col1 = _mm_loadu_ps(m + 4);
 	__m128 col2 = _mm_loadu_ps(m + 8);
@@ -957,7 +959,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix44(__m128 x, __m128 y, __m128 z, const flo
 	return sum;
 }
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-inline float32x4_t Vec3ByMatrix44(float32x4_t vec, const float m[16]) {
+inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 4);
 	float32x4_t col2 = vld1q_f32(m + 8);
@@ -974,10 +976,10 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 	__m128 x = _mm_set1_ps(v[0]);
 	__m128 y = _mm_set1_ps(v[1]);
 	__m128 z = _mm_set1_ps(v[2]);
-	__m128 sum = Vec3ByMatrix44(x, y, z, m);
+	__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
 	_mm_storeu_ps(vecOut, sum);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	float32x4_t sum = Vec3ByMatrix44(vld1q_f32(v), m);
+	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(v), m);
 	vst1q_f32(vecOut, sum);
 #else
 	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
@@ -992,9 +994,9 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
 	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
 	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
-	return Vec3ByMatrix44(x, y, z, m);
+	return Vec3ByMatrix44Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	return Vec3ByMatrix44(v.vec, m);
+	return Vec3ByMatrix44Internal(v.vec, m);
 #else
 	Vec4f vecOut;
 	Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m);
@@ -1003,7 +1005,8 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
 }
 
 #if defined(_M_SSE)
-inline __m128 MATH3D_CALL Norm3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) {
+// x, y, and z should be broadcast.  Should only be used through Vec3f version.
+inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
 	__m128 col0 = _mm_loadu_ps(m);
 	__m128 col1 = _mm_loadu_ps(m + 3);
 	__m128 col2 = _mm_loadu_ps(m + 6);
@@ -1013,7 +1016,7 @@ inline __m128 MATH3D_CALL Norm3ByMatrix43(__m128 x, __m128 y, __m128 z, const fl
 	return sum;
 }
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-inline float32x4_t Norm3ByMatrix43(float32x4_t vec, const float m[16]) {
+inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 3);
 	float32x4_t col2 = vld1q_f32(m + 6);
@@ -1029,12 +1032,12 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
 	__m128 x = _mm_set1_ps(v[0]);
 	__m128 y = _mm_set1_ps(v[1]);
 	__m128 z = _mm_set1_ps(v[2]);
-	__m128 sum = Norm3ByMatrix43(x, y, z, m);
+	__m128 sum = Norm3ByMatrix43Internal(x, y, z, m);
 	vecOut[0] = _mm_cvtss_f32(sum);
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	float32x4_t sum = Norm3ByMatrix43(vld1q_f32(v), m);
+	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
 	vecOut[1] = vgetq_lane_f32(sum, 1);
 	vecOut[2] = vgetq_lane_f32(sum, 2);
@@ -1050,9 +1053,9 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
 	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
 	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
-	return Norm3ByMatrix43(x, y, z, m);
+	return Norm3ByMatrix43Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
-	return Norm3ByMatrix43(v.vec, m);
+	return Norm3ByMatrix43Internal(v.vec, m);
 #else
 	Vec3f vecOut;
 	Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);

From 9458610d96c4949c6d1a15c989c44874da775e6c Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Fri, 7 Jan 2022 23:22:57 -0800
Subject: [PATCH 10/10] softgpu: Avoid rsqrt path for normals.

In LittleBigPlanet, it's noticeable that the lighting is very off due to
the slight loss of accuracy - possibly due to cutoff or similar.
---
 GPU/Software/TransformUnit.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index 11eeda3f68..021e0427f5 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -227,7 +227,8 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
 		vertex.screenpos = ClipToScreenInternal(vertex.clippos, &outside_range_flag);
 
 		if (vreader.hasNormal()) {
-			vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).NormalizedOr001(cpu_info.bSSE4_1);
+			vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal);
+			vertex.worldnormal.NormalizeOr001();
 		} else {
 			vertex.worldnormal = Vec3<float>(0.0f, 0.0f, 1.0f);
 		}