From 2868495cf89e09a148a1562b711e8a719fa250db Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sun, 16 Apr 2023 10:54:25 -0700
Subject: [PATCH] softgpu: Use SSE for lighting ceil if available.

Tiny optimization, helps only a little.
---
Reviewer notes (free text after the separator; not part of the commit
message or of the diff):

  * LightCeilSSE4(float) exists only when _M_SSE is defined.  It places f
    in the low lane with _mm_set_ss, rounds it up with _mm_ceil_ss and
    converts the result to int with _mm_cvt_ss2si.  On GCC, Clang and the
    Intel compiler the function carries [[gnu::target("sse4.1")]].
  * LightCeil(float) is the entry point used by Process().  When _M_SSE is
    defined and cpu_info.bSSE4_1 is set it forwards to LightCeilSSE4();
    otherwise it returns (int)ceilf(f).
  * Process() swaps ceilf() for LightCeil() at three places: the ambient,
    diffuse and specular "attspot" factors.  The clamp to 512 that follows
    each of them is untouched.
  * LightCeil() already returns int, so the (int) cast kept at the three
    call sites is redundant.  Harmless; could be dropped in a follow-up.
  * NOTE(review): _mm_cvt_ss2si converts using the current rounding mode,
    whereas the (int) cast truncates.  The operand is already integral
    after _mm_ceil_ss, so in-range results should match - confirm.
  * NOTE(review): line breaks and tab indentation in this copy were
    reconstructed from a whitespace-collapsed original.  Check the context
    lines against the tree, or apply with --ignore-whitespace.

 GPU/Software/Lighting.cpp | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 64a906844a..0ae32d9994 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -197,6 +197,25 @@ void GenerateLightST(VertexData &vertex, const WorldCoords &worldnormal) {
 	vertex.texturecoords.t() = GenerateLightCoord(vertex, worldnormal, gstate.getUVLS1());
 }
 
+#if defined(_M_SSE)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline int LightCeilSSE4(float f) {
+	__m128 v = _mm_set_ss(f);
+	// This isn't terribly fast, but seems to be better than calling ceilf().
+	return _mm_cvt_ss2si(_mm_ceil_ss(v, v));
+}
+#endif
+
+static inline int LightCeil(float f) {
+#if defined(_M_SSE)
+	if (cpu_info.bSSE4_1)
+		return LightCeilSSE4(f);
+#endif
+	return (int)ceilf(f);
+}
+
 void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {
 	// Lighting blending rounds using the half offset method (like alpha blend.)
 	const Vec4 ones = Vec4::AssignToAll(1);
@@ -250,7 +269,7 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 
 		// ambient lighting
 		if (lstate.ambient) {
-			int attspot = (int)ceilf(256 * 2 * att * spot + 1);
+			int attspot = (int)LightCeil(256 * 2 * att * spot + 1);
 			if (attspot > 512)
 				attspot = 512;
 			Vec4 lambient = (mac * lstate.ambientColorFactor * attspot) / (1024 * 512);
@@ -267,7 +286,7 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 		}
 
 		if (lstate.diffuse && diffuse_factor > 0.0f) {
-			int diffuse_attspot = (int)ceilf(256 * 2 * att * spot * diffuse_factor + 1);
+			int diffuse_attspot = (int)LightCeil(256 * 2 * att * spot * diffuse_factor + 1);
 			if (diffuse_attspot > 512)
 				diffuse_attspot = 512;
 			Vec4 mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor;
@@ -282,7 +301,7 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 			specular_factor = pspLightPow(specular_factor, state.specularExp);
 
 			if (specular_factor > 0.0f) {
-				int specular_attspot = (int)ceilf(256 * 2 * att * spot * specular_factor + 1);
+				int specular_attspot = (int)LightCeil(256 * 2 * att * spot * specular_factor + 1);
 				if (specular_attspot > 512)
 					specular_attspot = 512;
 