softgpu: Use more SSE in lighting.

2025-04-02 11:01:50 -04:00 · 2023-04-16 11:58:30 -07:00 · 2023-04-16 11:58:30 -07:00 · 860fc176d8
commit 860fc176d8
parent 2868495cf8
1 changed files with 41 additions and 10 deletions
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@ -206,6 +206,17 @@ static inline int LightCeilSSE4(float f) {
 	// This isn't terribly fast, but seems to be better than calling ceilf().
 	return _mm_cvt_ss2si(_mm_ceil_ss(v, v));
 }
+
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline __m128i LightColorScaleBy512SSE4(__m128i factor, __m128i color, __m128i scale) {
+	// We can use 16-bit multiply here (faster than 32-bit multiply) since our top bits are zero.
+	__m128i result18 = _mm_madd_epi16(factor, color);
+	// But now with 18 bits, we need a full multiply.
+	__m128i multiplied = _mm_mullo_epi32(result18, scale);
+	return _mm_srai_epi32(multiplied, 19);
+}
 #endif

 static inline int LightCeil(float f) {
@ -216,6 +227,25 @@ static inline int LightCeil(float f) {
 	return (int)ceilf(f);
 }

+static Vec4<int> LightColorScaleBy512(const Vec4<int> &factor, const Vec4<int> &color, int scale) {
+	// We multiply s9 * s9 * s9, resulting in s27, then shift off 19 to get 8-bit.
+	// The reason all factors are s9 is to account for rounding.
+	// Also note that all values are positive, so can be treated as unsigned.
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+	if (cpu_info.bSSE4_1)
+		return LightColorScaleBy512SSE4(factor.ivec, color.ivec, _mm_set1_epi32(scale));
+#endif
+	return (factor * color * scale) / (1024 * 512);
+}
+
+static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+	sum.ivec = _mm_add_epi32(sum.ivec, src.ivec);
+#else
+	sum += src;
+#endif
+}
+
 void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {
 	// Lighting blending rounds using the half offset method (like alpha blend.)
 	const Vec4<int> ones = Vec4<int>::AssignToAll(1);
@ -272,8 +302,8 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 			int attspot = (int)LightCeil(256 * 2 * att * spot + 1);
 			if (attspot > 512)
 				attspot = 512;
-			Vec4<int> lambient = (mac * lstate.ambientColorFactor * attspot) / (1024 * 512);
-			final_color += lambient;
+			Vec4<int> lambient = LightColorScaleBy512(lstate.ambientColorFactor, mac, attspot);
+			LightColorSum(final_color, lambient);
 		}

 		// diffuse lighting
@ -290,8 +320,8 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 			if (diffuse_attspot > 512)
 				diffuse_attspot = 512;
 			Vec4<int> mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor;
-			Vec4<int> ldiffuse = (lstate.diffuseColorFactor * mdc * diffuse_attspot) / (1024 * 512);
-			final_color += ldiffuse;
+			Vec4<int> ldiffuse = LightColorScaleBy512(lstate.diffuseColorFactor, mdc, diffuse_attspot);
+			LightColorSum(final_color, ldiffuse);
 		}

 		if (lstate.specular && diffuse_factor >= 0.0f) {
@ -306,19 +336,20 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 					specular_attspot = 512;

 				Vec4<int> msc = state.colorForSpecular ? colorFactor : state.material.specularColorFactor;
-				Vec4<int> lspecular = (lstate.specularColorFactor * msc * specular_attspot) / (1024 * 512);
-				specular_color += lspecular;
+				Vec4<int> lspecular = LightColorScaleBy512(lstate.specularColorFactor, msc, specular_attspot);
+				LightColorSum(specular_color, lspecular);
 			}
 		}
 	}

+	// Note: these are all naturally clamped by ToRGBA/toRGB.
 	if (state.setColor1) {
-		vertex.color0 = final_color.Clamp(0, 255).ToRGBA();
-		vertex.color1 = specular_color.Clamp(0, 255).rgb().ToRGB();
+		vertex.color0 = final_color.ToRGBA();
+		vertex.color1 = specular_color.rgb().ToRGB();
 	} else if (state.addColor1) {
-		vertex.color0 = (final_color + specular_color).Clamp(0, 255).ToRGBA();
+		vertex.color0 = (final_color + specular_color).ToRGBA();
 	} else {
-		vertex.color0 = final_color.Clamp(0, 255).ToRGBA();
+		vertex.color0 = final_color.ToRGBA();
 	}
 }