mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
softgpu: Use more SSE in lighting.
This commit is contained in:
parent
2868495cf8
commit
860fc176d8
1 changed files with 41 additions and 10 deletions
|
@ -206,6 +206,17 @@ static inline int LightCeilSSE4(float f) {
|
|||
// This isn't terribly fast, but seems to be better than calling ceilf().
|
||||
return _mm_cvt_ss2si(_mm_ceil_ss(v, v));
|
||||
}
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
// Per 32-bit lane: computes (factor * color * scale) >> 19 using SSE4.1 integer math.
// Callers are expected to pass factor/color lanes whose upper 16 bits are zero.
static inline __m128i LightColorScaleBy512SSE4(__m128i factor, __m128i color, __m128i scale) {
	// With the upper halves zero, the 16-bit multiply-add yields just low * low for each
	// lane, which is cheaper than a 32-bit multiply.  The product needs up to 18 bits.
	const __m128i product = _mm_madd_epi16(factor, color);
	// An 18-bit value no longer fits a 16-bit multiply, so the scale step uses the full
	// 32-bit low multiply before shifting the 19 extra bits back off.
	return _mm_srai_epi32(_mm_mullo_epi32(product, scale), 19);
}
|
||||
#endif
|
||||
|
||||
static inline int LightCeil(float f) {
|
||||
|
@ -216,6 +227,25 @@ static inline int LightCeil(float f) {
|
|||
return (int)ceilf(f);
|
||||
}
|
||||
|
||||
// Scales a product of two color factors by a scalar, per component:
// (factor * color * scale) / 2^19.
static Vec4<int> LightColorScaleBy512(const Vec4<int> &factor, const Vec4<int> &color, int scale) {
	// Three s9 values multiplied together give an s27 result; dropping 19 bits leaves an
	// 8-bit color.  Every input is kept at s9 so that rounding is accounted for.
	// All inputs are positive, so they may equally be treated as unsigned.
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
	if (cpu_info.bSSE4_1) {
		// Broadcast the scalar so each lane gets the same scale.
		const __m128i scale4 = _mm_set1_epi32(scale);
		return LightColorScaleBy512SSE4(factor.ivec, color.ivec, scale4);
	}
#endif
	// Generic path: 1 << 19 == 1024 * 512.
	return (factor * color * scale) / (1 << 19);
}
|
||||
|
||||
// Adds src into sum, component by component.
static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
#if !defined(_M_SSE) || PPSSPP_ARCH(X86)
	// Generic path: rely on the vector type's own compound addition.
	sum += src;
#else
	// SSE path: a single packed 32-bit add over the raw vector members.
	sum.ivec = _mm_add_epi32(sum.ivec, src.ivec);
#endif
}
|
||||
|
||||
void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {
|
||||
// Lighting blending rounds using the half offset method (like alpha blend.)
|
||||
const Vec4<int> ones = Vec4<int>::AssignToAll(1);
|
||||
|
@ -272,8 +302,8 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
|
|||
int attspot = (int)LightCeil(256 * 2 * att * spot + 1);
|
||||
if (attspot > 512)
|
||||
attspot = 512;
|
||||
Vec4<int> lambient = (mac * lstate.ambientColorFactor * attspot) / (1024 * 512);
|
||||
final_color += lambient;
|
||||
Vec4<int> lambient = LightColorScaleBy512(lstate.ambientColorFactor, mac, attspot);
|
||||
LightColorSum(final_color, lambient);
|
||||
}
|
||||
|
||||
// diffuse lighting
|
||||
|
@ -290,8 +320,8 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
|
|||
if (diffuse_attspot > 512)
|
||||
diffuse_attspot = 512;
|
||||
Vec4<int> mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor;
|
||||
Vec4<int> ldiffuse = (lstate.diffuseColorFactor * mdc * diffuse_attspot) / (1024 * 512);
|
||||
final_color += ldiffuse;
|
||||
Vec4<int> ldiffuse = LightColorScaleBy512(lstate.diffuseColorFactor, mdc, diffuse_attspot);
|
||||
LightColorSum(final_color, ldiffuse);
|
||||
}
|
||||
|
||||
if (lstate.specular && diffuse_factor >= 0.0f) {
|
||||
|
@ -306,19 +336,20 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
|
|||
specular_attspot = 512;
|
||||
|
||||
Vec4<int> msc = state.colorForSpecular ? colorFactor : state.material.specularColorFactor;
|
||||
Vec4<int> lspecular = (lstate.specularColorFactor * msc * specular_attspot) / (1024 * 512);
|
||||
specular_color += lspecular;
|
||||
Vec4<int> lspecular = LightColorScaleBy512(lstate.specularColorFactor, msc, specular_attspot);
|
||||
LightColorSum(specular_color, lspecular);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Note: these are all naturally clamped by ToRGBA/toRGB.
|
||||
if (state.setColor1) {
|
||||
vertex.color0 = final_color.Clamp(0, 255).ToRGBA();
|
||||
vertex.color1 = specular_color.Clamp(0, 255).rgb().ToRGB();
|
||||
vertex.color0 = final_color.ToRGBA();
|
||||
vertex.color1 = specular_color.rgb().ToRGB();
|
||||
} else if (state.addColor1) {
|
||||
vertex.color0 = (final_color + specular_color).Clamp(0, 255).ToRGBA();
|
||||
vertex.color0 = (final_color + specular_color).ToRGBA();
|
||||
} else {
|
||||
vertex.color0 = final_color.Clamp(0, 255).ToRGBA();
|
||||
vertex.color0 = final_color.ToRGBA();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue