From cba2374abd35e77055bff817c282d8975be48b9d Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 3 Jan 2022 22:56:26 -0800 Subject: [PATCH 01/10] softgpu: Separate calculation of S/T. We could probably reuse, but we're not right now and it complicates the logic. --- GPU/Software/Lighting.cpp | 36 +++++++++++++++------------------- GPU/Software/Lighting.h | 3 ++- GPU/Software/TransformUnit.cpp | 5 ++++- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp index d4d7a020ee..35a1dde62a 100644 --- a/GPU/Software/Lighting.cpp +++ b/GPU/Software/Lighting.cpp @@ -43,6 +43,22 @@ static inline float pspLightPow(float v, float e) { return v; } +static inline float GenerateLightCoord(VertexData &vertex, int light) { + // TODO: Should specular lighting should affect this, too? Doesn't in GLES. + Vec3 L = GetLightVec(gstate.lpos, light); + // In other words, L.Length2() == 0.0f means Dot({0, 0, 1}, worldnormal). + float diffuse_factor = Dot(L.NormalizedOr001(cpu_info.bSSE4_1), vertex.worldnormal); + + return (diffuse_factor + 1.0f) / 2.0f; +} + +void GenerateLightST(VertexData &vertex) { + // Always calculate texture coords from lighting results if environment mapping is active + // This should be done even if lighting is disabled altogether. + vertex.texturecoords.s() = GenerateLightCoord(vertex, gstate.getUVLS0()); + vertex.texturecoords.t() = GenerateLightCoord(vertex, gstate.getUVLS1()); +} + void Process(VertexData& vertex, bool hasColor) { const int materialupdate = gstate.materialupdate & (hasColor ? 7 : 0); @@ -53,26 +69,6 @@ void Process(VertexData& vertex, bool hasColor) { Vec3 final_color = mec + mac * Vec3::FromRGB(gstate.getAmbientRGBA()); Vec3 specular_color(0.0f, 0.0f, 0.0f); - for (unsigned int light = 0; light < 4; ++light) { - // Always calculate texture coords from lighting results if environment mapping is active - // TODO: Should specular lighting should affect this, too? Doesn't in GLES. - // This should be done even if lighting is disabled altogether. - if (gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP) { - Vec3 L = GetLightVec(gstate.lpos, light); - // In other words, L.Length2() == 0.0f means Dot({0, 0, 1}, worldnormal). - float diffuse_factor = Dot(L.NormalizedOr001(cpu_info.bSSE4_1), vertex.worldnormal); - - if (gstate.getUVLS0() == (int)light) - vertex.texturecoords.s() = (diffuse_factor + 1.f) / 2.f; - - if (gstate.getUVLS1() == (int)light) - vertex.texturecoords.t() = (diffuse_factor + 1.f) / 2.f; - } - } - - if (!gstate.isLightingEnabled()) - return; - for (unsigned int light = 0; light < 4; ++light) { if (!gstate.isLightChanEnabled(light)) continue; diff --git a/GPU/Software/Lighting.h b/GPU/Software/Lighting.h index 9e08eacade..e5915e6aac 100644 --- a/GPU/Software/Lighting.h +++ b/GPU/Software/Lighting.h @@ -21,6 +21,7 @@ namespace Lighting { +void GenerateLightST(VertexData &vertex); void Process(VertexData& vertex, bool hasColor); -} \ No newline at end of file +} diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index bdecb90372..b15bb0115c 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -277,10 +277,13 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ Vec3 stq = tgen * source + Vec3(gstate.tgenMatrix[9], gstate.tgenMatrix[10], gstate.tgenMatrix[11]); float z_recip = 1.0f / stq.z; vertex.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip); + } else if (gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP) { + Lighting::GenerateLightST(vertex); } PROFILE_THIS_SCOPE("light"); - Lighting::Process(vertex, vreader.hasColor0()); + if (gstate.isLightingEnabled()) + Lighting::Process(vertex, vreader.hasColor0()); } else { vertex.screenpos.x = (int)(pos[0] * 16) + gstate.getOffsetX16(); vertex.screenpos.y = (int)(pos[1] * 16) + gstate.getOffsetY16(); From 079b67e7ed2203f9fc3bb58cd6a1005b5db16118 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 4 Jan 2022 09:00:50 -0800 Subject: [PATCH 02/10] softgpu: Use common SIMD matrix multiplies. --- GPU/Math3D.h | 101 +++++++++++++++++++++++++++++++++ GPU/Software/TransformUnit.cpp | 61 ++++++++------------ 2 files changed, 124 insertions(+), 38 deletions(-) diff --git a/GPU/Math3D.h b/GPU/Math3D.h index b8ede12cb8..02ab650811 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -38,6 +38,12 @@ #endif #endif +#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)) +#define MATH3D_CALL __vectorcall +#else +#define MATH3D_CALL +#endif + namespace Math3D { // Helper for Vec classes to clamp values. @@ -913,6 +919,38 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) #endif } +inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) { +#if defined(_M_SSE) + __m128 col0 = _mm_loadu_ps(m); + __m128 col1 = _mm_loadu_ps(m + 3); + __m128 col2 = _mm_loadu_ps(m + 6); + __m128 col3 = _mm_loadu_ps(m + 9); + __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 sum = _mm_add_ps( + _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), + _mm_add_ps(_mm_mul_ps(col2, z), col3)); + return sum; +#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) + float32x4_t col0 = vld1q_f32(m); + float32x4_t col1 = vld1q_f32(m + 3); + float32x4_t col2 = vld1q_f32(m + 6); + float32x4_t col3 = vld1q_f32(m + 9); + float32x4_t vec = v.vec; + float32x4_t sum = vaddq_f32( + vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), + vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); + return sum; +#else + Vec3f vecOut; + vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9]; + vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10]; + vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11]; + return vecOut; +#endif +} + inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) { #if defined(_M_SSE) @@ -945,6 +983,39 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) #endif } +inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) { +#if defined(_M_SSE) + __m128 col0 = _mm_loadu_ps(m); + __m128 col1 = _mm_loadu_ps(m + 4); + __m128 col2 = _mm_loadu_ps(m + 8); + __m128 col3 = _mm_loadu_ps(m + 12); + __m128 x = _mm_set1_ps(v[0]); + __m128 y = _mm_set1_ps(v[1]); + __m128 z = _mm_set1_ps(v[2]); + __m128 sum = _mm_add_ps( + _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), + _mm_add_ps(_mm_mul_ps(col2, z), col3)); + return sum; +#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) + float32x4_t col0 = vld1q_f32(m); + float32x4_t col1 = vld1q_f32(m + 4); + float32x4_t col2 = vld1q_f32(m + 8); + float32x4_t col3 = vld1q_f32(m + 12); + float32x4_t vec = v.vec; + float32x4_t sum = vaddq_f32( + vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), + vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); + return sum; +#else + Vec4f vecOut; + vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12]; + vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13]; + vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14]; + vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15]; + return vecOut; +#endif +} + inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6]; @@ -952,6 +1023,36 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12] vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8]; } +inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) { +#if defined(_M_SSE) + __m128 col0 = _mm_loadu_ps(m); + __m128 col1 = _mm_loadu_ps(m + 3); + __m128 col2 = _mm_loadu_ps(m + 6); + __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 sum = _mm_add_ps( + _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), + _mm_mul_ps(col2, z)); + return sum; +#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) + float32x4_t col0 = vld1q_f32(m); + float32x4_t col1 = vld1q_f32(m + 3); + float32x4_t col2 = vld1q_f32(m + 6); + float32x4_t vec = v.vec; + float32x4_t sum = vaddq_f32( + vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), + vmulq_laneq_f32(col2, vec, 2)); + return sum; +#else + Vec3f vecOut; + vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6]; + vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7]; + vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8]; + return vecOut; +#endif +} + inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) { fast_matrix_mul_4x4(out, b, a); } diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index b15bb0115c..7902e95aff 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -67,29 +67,20 @@ VertexDecoder *SoftwareDrawEngine::FindVertexDecoder(u32 vtype) { return DrawEngineCommon::GetVertexDecoder(vertTypeID); } -WorldCoords TransformUnit::ModelToWorld(const ModelCoords& coords) -{ - Mat3x3 world_matrix(gstate.worldMatrix); - return WorldCoords(world_matrix * coords) + Vec3(gstate.worldMatrix[9], gstate.worldMatrix[10], gstate.worldMatrix[11]); +WorldCoords TransformUnit::ModelToWorld(const ModelCoords &coords) { + return Vec3ByMatrix43(coords, gstate.worldMatrix); } -WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords& coords) -{ - Mat3x3 world_matrix(gstate.worldMatrix); - return WorldCoords(world_matrix * coords); +WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords &coords) { + return Norm3ByMatrix43(coords, gstate.worldMatrix); } -ViewCoords TransformUnit::WorldToView(const WorldCoords& coords) -{ - Mat3x3 view_matrix(gstate.viewMatrix); - return ViewCoords(view_matrix * coords) + Vec3(gstate.viewMatrix[9], gstate.viewMatrix[10], gstate.viewMatrix[11]); +ViewCoords TransformUnit::WorldToView(const WorldCoords &coords) { + return Vec3ByMatrix43(coords, gstate.viewMatrix); } -ClipCoords TransformUnit::ViewToClip(const ViewCoords& coords) -{ - Vec4 coords4(coords.x, coords.y, coords.z, 1.0f); - Mat4x4 projection_matrix(gstate.projMatrix); - return ClipCoords(projection_matrix * coords4); +ClipCoords TransformUnit::ViewToClip(const ViewCoords &coords) { + return Vec3ByMatrix44(coords, gstate.projMatrix); } static inline ScreenCoords ClipToScreenInternal(const ClipCoords& coords, bool *outside_range_flag) { @@ -161,20 +152,16 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ PROFILE_THIS_SCOPE("read_vert"); VertexData vertex; - float pos[3]; + ModelCoords pos; // VertexDecoder normally scales z, but we want it unscaled. - vreader.ReadPosThroughZ16(pos); + vreader.ReadPosThroughZ16(pos.AsArray()); if (!gstate.isModeClear() && gstate.isTextureMapEnabled() && vreader.hasUV()) { - float uv[2]; - vreader.ReadUV(uv); - vertex.texturecoords = Vec2(uv[0], uv[1]); + vreader.ReadUV(vertex.texturecoords.AsArray()); } if (vreader.hasNormal()) { - float normal[3]; - vreader.ReadNrm(normal); - vertex.normal = Vec3(normal[0], normal[1], normal[2]); + vreader.ReadNrm(vertex.normal.AsArray()); if (gstate.areNormalsReversed()) vertex.normal = -vertex.normal; @@ -188,15 +175,15 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ Vec3 tmpnrm(0.f, 0.f, 0.f); for (int i = 0; i < vertTypeGetNumBoneWeights(gstate.vertType); ++i) { - Mat3x3 bone(&gstate.boneMatrix[12*i]); - tmppos += (bone * ModelCoords(pos[0], pos[1], pos[2]) + Vec3(gstate.boneMatrix[12*i+9], gstate.boneMatrix[12*i+10], gstate.boneMatrix[12*i+11])) * W[i]; - if (vreader.hasNormal()) - tmpnrm += (bone * vertex.normal) * W[i]; + Vec3 step = Vec3ByMatrix43(pos, gstate.boneMatrix + i * 12); + tmppos += step * W[i]; + if (vreader.hasNormal()) { + step = Norm3ByMatrix43(vertex.normal, gstate.boneMatrix + i * 12); + tmpnrm += step * W[i]; + } } - pos[0] = tmppos.x; - pos[1] = tmppos.y; - pos[2] = tmppos.z; + pos = tmppos; if (vreader.hasNormal()) vertex.normal = tmpnrm; } @@ -206,7 +193,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ vreader.ReadColor0(col); vertex.color0 = Vec4(col[0]*255, col[1]*255, col[2]*255, col[3]*255); } else { - vertex.color0 = Vec4(gstate.getMaterialAmbientR(), gstate.getMaterialAmbientG(), gstate.getMaterialAmbientB(), gstate.getMaterialAmbientA()); + vertex.color0 = Vec4::FromRGBA(gstate.getMaterialAmbientRGBA()); } if (vreader.hasColor1()) { @@ -218,7 +205,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ } if (!gstate.isModeThrough()) { - vertex.modelpos = ModelCoords(pos[0], pos[1], pos[2]); + vertex.modelpos = pos; vertex.worldpos = WorldCoords(TransformUnit::ModelToWorld(vertex.modelpos)); ModelCoords viewpos = TransformUnit::WorldToView(vertex.worldpos); vertex.clippos = ClipCoords(TransformUnit::ViewToClip(viewpos)); @@ -240,8 +227,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ vertex.screenpos = ClipToScreenInternal(vertex.clippos, &outside_range_flag); if (vreader.hasNormal()) { - vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal); - vertex.worldnormal /= vertex.worldnormal.Length(); + vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).Normalized(cpu_info.bSSE4_1); } else { vertex.worldnormal = Vec3(0.0f, 0.0f, 1.0f); } @@ -273,8 +259,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ } // TODO: What about uv scale and offset? - Mat3x3 tgen(gstate.tgenMatrix); - Vec3 stq = tgen * source + Vec3(gstate.tgenMatrix[9], gstate.tgenMatrix[10], gstate.tgenMatrix[11]); + Vec3 stq = Vec3ByMatrix43(source, gstate.tgenMatrix); float z_recip = 1.0f / stq.z; vertex.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip); } else if (gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP) { From e7d66f202992ab246f06441f7f66d7789cf63394 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 4 Jan 2022 09:21:39 -0800 Subject: [PATCH 03/10] softgpu: Reuse SSE/NEON matrix code. --- GPU/Math3D.h | 161 +++++++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 77 deletions(-) diff --git a/GPU/Math3D.h b/GPU/Math3D.h index 02ab650811..21f8f99cb8 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -882,33 +882,44 @@ float vectorGetByIndex(__m128 v) { } #endif -// v and vecOut must point to different memory. -inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { #if defined(_M_SSE) +inline __m128 MATH3D_CALL Vec3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 3); __m128 col2 = _mm_loadu_ps(m + 6); __m128 col3 = _mm_loadu_ps(m + 9); - __m128 x = _mm_set1_ps(v[0]); - __m128 y = _mm_set1_ps(v[1]); - __m128 z = _mm_set1_ps(v[2]); __m128 sum = _mm_add_ps( _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), _mm_add_ps(_mm_mul_ps(col2, z), col3)); + return sum; +} +#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) +inline float32x4_t Vec3ByMatrix43(float32x4_t vec, const float m[16]) { + float32x4_t col0 = vld1q_f32(m); + float32x4_t col1 = vld1q_f32(m + 3); + float32x4_t col2 = vld1q_f32(m + 6); + float32x4_t col3 = vld1q_f32(m + 9); + float32x4_t sum = vaddq_f32( + vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), + vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); + return sum; +} +#endif + +// v and vecOut must point to different memory. +inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { +#if defined(_M_SSE) + __m128 x = _mm_set1_ps(v[0]); + __m128 y = _mm_set1_ps(v[1]); + __m128 z = _mm_set1_ps(v[2]); + __m128 sum = Vec3ByMatrix43(x, y, z, m); // Not sure what the best way to store 3 elements is. Ideally, we should // probably store all four. vecOut[0] = _mm_cvtss_f32(sum); vecOut[1] = vectorGetByIndex<1>(sum); vecOut[2] = vectorGetByIndex<2>(sum); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - float32x4_t col0 = vld1q_f32(m); - float32x4_t col1 = vld1q_f32(m + 3); - float32x4_t col2 = vld1q_f32(m + 6); - float32x4_t col3 = vld1q_f32(m + 9); - float32x4_t vec = vld1q_f32(v); - float32x4_t sum = vaddq_f32( - vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), - vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); + float32x4_t sum = Vec3ByMatrix43(vld1q_f32(v), m); vecOut[0] = vgetq_lane_f32(sum, 0); vecOut[1] = vgetq_lane_f32(sum, 1); vecOut[2] = vgetq_lane_f32(sum, 2); @@ -921,59 +932,52 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) { #if defined(_M_SSE) - __m128 col0 = _mm_loadu_ps(m); - __m128 col1 = _mm_loadu_ps(m + 3); - __m128 col2 = _mm_loadu_ps(m + 6); - __m128 col3 = _mm_loadu_ps(m + 9); __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); - __m128 sum = _mm_add_ps( - _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), - _mm_add_ps(_mm_mul_ps(col2, z), col3)); - return sum; + return Vec3ByMatrix43(x, y, z, m); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - float32x4_t col0 = vld1q_f32(m); - float32x4_t col1 = vld1q_f32(m + 3); - float32x4_t col2 = vld1q_f32(m + 6); - float32x4_t col3 = vld1q_f32(m + 9); - float32x4_t vec = v.vec; - float32x4_t sum = vaddq_f32( - vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), - vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); - return sum; + return Vec3ByMatrix43(v.vec, m); #else Vec3f vecOut; - vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9]; - vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10]; - vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11]; + Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m); return vecOut; #endif } -inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) -{ #if defined(_M_SSE) +inline __m128 MATH3D_CALL Vec3ByMatrix44(__m128 x, __m128 y, __m128 z, const float m[16]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 4); __m128 col2 = _mm_loadu_ps(m + 8); __m128 col3 = _mm_loadu_ps(m + 12); - __m128 x = _mm_set1_ps(v[0]); - __m128 y = _mm_set1_ps(v[1]); - __m128 z = _mm_set1_ps(v[2]); __m128 sum = _mm_add_ps( _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), _mm_add_ps(_mm_mul_ps(col2, z), col3)); - _mm_storeu_ps(vecOut, sum); + return sum; +} #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) +inline float32x4_t Vec3ByMatrix44(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 4); float32x4_t col2 = vld1q_f32(m + 8); float32x4_t col3 = vld1q_f32(m + 12); - float32x4_t vec = vld1q_f32(v); float32x4_t sum = vaddq_f32( vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); + return sum; +} +#endif + +inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) { +#if defined(_M_SSE) + __m128 x = _mm_set1_ps(v[0]); + __m128 y = _mm_set1_ps(v[1]); + __m128 z = _mm_set1_ps(v[2]); + __m128 sum = Vec3ByMatrix44(x, y, z, m); + _mm_storeu_ps(vecOut, sum); +#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) + float32x4_t sum = Vec3ByMatrix44(vld1q_f32(v), m); vst1q_f32(vecOut, sum); #else vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12]; @@ -985,70 +989,73 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) { #if defined(_M_SSE) - __m128 col0 = _mm_loadu_ps(m); - __m128 col1 = _mm_loadu_ps(m + 4); - __m128 col2 = _mm_loadu_ps(m + 8); - __m128 col3 = _mm_loadu_ps(m + 12); - __m128 x = _mm_set1_ps(v[0]); - __m128 y = _mm_set1_ps(v[1]); - __m128 z = _mm_set1_ps(v[2]); - __m128 sum = _mm_add_ps( - _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), - _mm_add_ps(_mm_mul_ps(col2, z), col3)); - return sum; + __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); + return Vec3ByMatrix44(x, y, z, m); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - float32x4_t col0 = vld1q_f32(m); - float32x4_t col1 = vld1q_f32(m + 4); - float32x4_t col2 = vld1q_f32(m + 8); - float32x4_t col3 = vld1q_f32(m + 12); - float32x4_t vec = v.vec; - float32x4_t sum = vaddq_f32( - vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), - vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); - return sum; + return Vec3ByMatrix44(v.vec, m); #else Vec4f vecOut; - vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12]; - vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13]; - vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14]; - vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15]; + Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m); return vecOut; #endif } -inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) -{ - vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6]; - vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7]; - vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8]; -} - -inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) { #if defined(_M_SSE) +inline __m128 MATH3D_CALL Norm3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 3); __m128 col2 = _mm_loadu_ps(m + 6); - __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); - __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); - __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); __m128 sum = _mm_add_ps( _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), _mm_mul_ps(col2, z)); return sum; +} #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) +inline float32x4_t Norm3ByMatrix43(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 3); float32x4_t col2 = vld1q_f32(m + 6); - float32x4_t vec = v.vec; float32x4_t sum = vaddq_f32( vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), vmulq_laneq_f32(col2, vec, 2)); return sum; +} +#endif + +inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { +#if defined(_M_SSE) + __m128 x = _mm_set1_ps(v[0]); + __m128 y = _mm_set1_ps(v[1]); + __m128 z = _mm_set1_ps(v[2]); + __m128 sum = Norm3ByMatrix43(x, y, z, m); + vecOut[0] = _mm_cvtss_f32(sum); + vecOut[1] = vectorGetByIndex<1>(sum); + vecOut[2] = vectorGetByIndex<2>(sum); +#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) + float32x4_t sum = Norm3ByMatrix43(vld1q_f32(v), m); + vecOut[0] = vgetq_lane_f32(sum, 0); + vecOut[1] = vgetq_lane_f32(sum, 1); + vecOut[2] = vgetq_lane_f32(sum, 2); #else - Vec3f vecOut; vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6]; vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7]; vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8]; +#endif +} + +inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) { +#if defined(_M_SSE) + __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); + return Norm3ByMatrix43(x, y, z, m); +#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) + return Norm3ByMatrix43(v.vec, m); +#else + Vec3f vecOut; + Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m); return vecOut; #endif } From fa80c448ee47a001ee5499811150f4fda5fa41ef Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 4 Jan 2022 23:42:01 -0800 Subject: [PATCH 04/10] softgpu: More closely match PSP light rounding. --- GPU/Software/Lighting.cpp | 55 +++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp index 35a1dde62a..0f11c2c24a 100644 --- a/GPU/Software/Lighting.cpp +++ b/GPU/Software/Lighting.cpp @@ -62,12 +62,16 @@ void GenerateLightST(VertexData &vertex) { void Process(VertexData& vertex, bool hasColor) { const int materialupdate = gstate.materialupdate & (hasColor ? 7 : 0); - Vec3 vcol0 = vertex.color0.rgb().Cast() * Vec3::AssignToAll(1.0f / 255.0f); - Vec3 mec = Vec3::FromRGB(gstate.getMaterialEmissive()); + Vec4 mec = Vec4::FromRGBA(gstate.getMaterialEmissive()); - Vec3 mac = (materialupdate & 1) ? vcol0 : Vec3::FromRGB(gstate.getMaterialAmbientRGBA()); - Vec3 final_color = mec + mac * Vec3::FromRGB(gstate.getAmbientRGBA()); - Vec3 specular_color(0.0f, 0.0f, 0.0f); + Vec4 mac = (materialupdate & 1) ? vertex.color0 : Vec4::FromRGBA(gstate.getMaterialAmbientRGBA()); + Vec4 ac = Vec4::FromRGBA(gstate.getAmbientRGBA()); + // Ambient (whether vertex or material) rounds using the half offset method (like alpha blend.) + const Vec4 ones = Vec4::AssignToAll(1); + Vec4 ambient = ((mac * 2 + ones) * (ac * 2 + ones)) / 1024; + + Vec4 final_color = mec + ambient; + Vec4 specular_color = Vec4::AssignToAll(0); for (unsigned int light = 0; light < 4; ++light) { if (!gstate.isLightChanEnabled(light)) @@ -103,13 +107,14 @@ void Process(VertexData& vertex, bool hasColor) { } // ambient lighting - Vec3 lac = Vec3::FromRGB(gstate.getLightAmbientColor(light)); - final_color += lac * mac * att * spot; + int attspot = (int)ceilf(256 * 2 * att * spot + 1); + if (attspot > 512) + attspot = 512; + Vec4 lac = Vec4::FromRGBA(gstate.getLightAmbientColor(light)); + Vec4 lambient = ((mac * 2 + ones) * (lac * 2 + ones) * attspot) / (1024 * 512); + final_color += lambient; // diffuse lighting - Vec3 ldc = Vec3::FromRGB(gstate.getDiffuseColor(light)); - Vec3 mdc = (materialupdate & 2) ? vcol0 : Vec3::FromRGB(gstate.getMaterialDiffuse()); - float diffuse_factor = Dot(L, vertex.worldnormal); if (gstate.isUsingPoweredDiffuseLight(light)) { float k = gstate.getMaterialSpecularCoef(); @@ -117,35 +122,39 @@ void Process(VertexData& vertex, bool hasColor) { } if (diffuse_factor > 0.f) { - final_color += ldc * mdc * diffuse_factor * att * spot; + int diffuse_attspot = (int)ceilf(attspot * diffuse_factor + 1); + if (diffuse_attspot > 512) + diffuse_attspot = 512; + Vec4 ldc = Vec4::FromRGBA(gstate.getDiffuseColor(light)); + Vec4 mdc = (materialupdate & 2) ? vertex.color0 : Vec4::FromRGBA(gstate.getMaterialDiffuse()); + Vec4 ldiffuse = ((ldc * 2 + ones) * (mdc * 2 + ones) * diffuse_attspot) / (1024 * 512); + final_color += ldiffuse; } if (gstate.isUsingSpecularLight(light) && diffuse_factor >= 0.0f) { Vec3 H = L + Vec3(0.f, 0.f, 1.f); - Vec3 lsc = Vec3::FromRGB(gstate.getSpecularColor(light)); - Vec3 msc = (materialupdate & 4) ? vcol0 : Vec3::FromRGB(gstate.getMaterialSpecular()); - float specular_factor = Dot(H.NormalizedOr001(cpu_info.bSSE4_1), vertex.worldnormal); float k = gstate.getMaterialSpecularCoef(); specular_factor = pspLightPow(specular_factor, k); if (specular_factor > 0.f) { - specular_color += lsc * msc * specular_factor * att * spot; + int specular_attspot = (int)ceilf(attspot * specular_factor + 1); + if (specular_attspot > 512) + specular_attspot = 512; + Vec4 lsc = Vec4::FromRGBA(gstate.getSpecularColor(light)); + Vec4 msc = (materialupdate & 4) ? vertex.color0 : Vec4::FromRGBA(gstate.getMaterialSpecular()); + Vec4 lspecular = ((lsc * 2 + ones) * (msc * 2 + ones) * specular_attspot) / (1024 * 512); + specular_color += lspecular; } } } - int maa = (materialupdate & 1) ? vertex.color0.a() : gstate.getMaterialAmbientA(); - int final_alpha = (gstate.getAmbientA() * maa) / 255; - if (gstate.isUsingSecondaryColor()) { - Vec3 final_color_int = (final_color.Clamp(0.0f, 1.0f) * 255.0f).Cast(); - vertex.color0 = Vec4(final_color_int, final_alpha); - vertex.color1 = (specular_color.Clamp(0.0f, 1.0f) * 255.0f).Cast(); + vertex.color0 = final_color.Clamp(0, 255); + vertex.color1 = specular_color.Clamp(0, 255).rgb(); } else { - Vec3 final_color_int = ((final_color + specular_color).Clamp(0.0f, 1.0f) * 255.0f).Cast(); - vertex.color0 = Vec4(final_color_int, final_alpha); + vertex.color0 = (final_color + specular_color).Clamp(0, 255); } } From b86bdc9456d1e5e50ae3b0520ac71a4721c54356 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 5 Jan 2022 07:04:49 -0800 Subject: [PATCH 05/10] softgpu: Correct handling of NAN attenuation. --- GPU/Software/Lighting.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp index 0f11c2c24a..534edb9b66 100644 --- a/GPU/Software/Lighting.cpp +++ b/GPU/Software/Lighting.cpp @@ -86,11 +86,13 @@ void Process(VertexData& vertex, bool hasColor) { // TODO: Should this normalize (0, 0, 0) to (0, 0, 1)? float d = L.NormalizeOr001(); - float att = 1.f; + float att = 1.0f; if (!gstate.isDirectionalLight(light)) { - att = 1.f / Dot(GetLightVec(gstate.latt, light), Vec3f(1.0f, d, d * d)); - if (att > 1.f) att = 1.f; - if (att < 0.f) att = 0.f; + att = 1.0f / Dot(GetLightVec(gstate.latt, light), Vec3f(1.0f, d, d * d)); + if (!(att > 0.0f)) + att = 0.0f; + else if (att > 1.0f) + att = 1.0f; } float spot = 1.f; From 537e3577416ef9a2952e9fbfca1c67b1be1941b0 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 5 Jan 2022 20:25:22 -0800 Subject: [PATCH 06/10] softgpu: Correct NAN spotlight exponent/direction. --- GPU/Software/Lighting.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp index 534edb9b66..90dd5f5851 100644 --- a/GPU/Software/Lighting.cpp +++ b/GPU/Software/Lighting.cpp @@ -16,6 +16,7 @@ // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include "ppsspp_config.h" +#include #include "Common/CPUDetect.h" #include "GPU/GPUState.h" #include "GPU/Software/Lighting.h" @@ -95,16 +96,20 @@ void Process(VertexData& vertex, bool hasColor) { att = 1.0f; } - float spot = 1.f; + float spot = 1.0f; if (gstate.isSpotLight(light)) { Vec3 dir = GetLightVec(gstate.ldir, light); - float rawSpot = Dot(dir.NormalizedOr001(cpu_info.bSSE4_1), L); + float rawSpot = Dot(dir.Normalized(cpu_info.bSSE4_1), L); + if (isnan(rawSpot)) + rawSpot = 1.0f; float cutoff = getFloat24(gstate.lcutoff[light]); if (rawSpot >= cutoff) { float conv = getFloat24(gstate.lconv[light]); spot = pspLightPow(rawSpot, conv); + if (isnan(spot)) + spot = 0.0f; } else { - spot = 0.f; + spot = 0.0f; } } From bd354164bcdeea2c89f651a033672c5b611e3716 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 5 Jan 2022 23:10:47 -0800 Subject: [PATCH 07/10] softgpu: Cleanup -NAN and diffuse factor. --- GPU/Software/Lighting.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp index 90dd5f5851..302d1a7055 100644 --- a/GPU/Software/Lighting.cpp +++ b/GPU/Software/Lighting.cpp @@ -34,7 +34,7 @@ static inline Vec3f GetLightVec(u32 lparams[12], int light) { } static inline float pspLightPow(float v, float e) { - if (e <= 0.0f) { + if (e <= 0.0f || (std::isnan(e) && std::signbit(e))) { return 1.0f; } if (v > 0.0f) { @@ -100,13 +100,15 @@ void Process(VertexData& vertex, bool hasColor) { if (gstate.isSpotLight(light)) { Vec3 dir = GetLightVec(gstate.ldir, light); float rawSpot = Dot(dir.Normalized(cpu_info.bSSE4_1), L); - if (isnan(rawSpot)) - rawSpot = 1.0f; + if (std::isnan(rawSpot)) + rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f; float cutoff = getFloat24(gstate.lcutoff[light]); + if (std::isnan(cutoff) && std::signbit(cutoff)) + cutoff = 0.0f; if (rawSpot >= cutoff) { float conv = getFloat24(gstate.lconv[light]); spot = pspLightPow(rawSpot, conv); - if (isnan(spot)) + if (std::isnan(spot)) spot = 0.0f; } else { spot = 0.0f; @@ -128,10 +130,8 @@ void Process(VertexData& vertex, bool hasColor) { diffuse_factor = pspLightPow(diffuse_factor, k); } - if (diffuse_factor > 0.f) { + if (diffuse_factor > 0.0f) { int diffuse_attspot = (int)ceilf(attspot * diffuse_factor + 1); - if (diffuse_attspot > 512) - diffuse_attspot = 512; Vec4 ldc = Vec4::FromRGBA(gstate.getDiffuseColor(light)); Vec4 mdc = (materialupdate & 2) ? vertex.color0 : Vec4::FromRGBA(gstate.getMaterialDiffuse()); Vec4 ldiffuse = ((ldc * 2 + ones) * (mdc * 2 + ones) * diffuse_attspot) / (1024 * 512); From ce8a49b1c12abfcfc3ff41b54f41f1ae89cd5af3 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 6 Jan 2022 20:10:47 -0800 Subject: [PATCH 08/10] softgpu: Retain floats in diffuse/specular. This seems to be a bit more accurate. Color blending seems correct now, but the factors and especially pow results are off. Also, normalize normal to 0, 0, 1, which seems to match results better. --- GPU/Software/Lighting.cpp | 8 +++++--- GPU/Software/TransformUnit.cpp | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp index 302d1a7055..67e21b4028 100644 --- a/GPU/Software/Lighting.cpp +++ b/GPU/Software/Lighting.cpp @@ -131,7 +131,9 @@ void Process(VertexData& vertex, bool hasColor) { } if (diffuse_factor > 0.0f) { - int diffuse_attspot = (int)ceilf(attspot * diffuse_factor + 1); + int diffuse_attspot = (int)ceilf(256 * 2 * att * spot * diffuse_factor + 1); + if (diffuse_attspot > 512) + diffuse_attspot = 512; Vec4 ldc = Vec4::FromRGBA(gstate.getDiffuseColor(light)); Vec4 mdc = (materialupdate & 2) ? vertex.color0 : Vec4::FromRGBA(gstate.getMaterialDiffuse()); Vec4 ldiffuse = ((ldc * 2 + ones) * (mdc * 2 + ones) * diffuse_attspot) / (1024 * 512); @@ -145,8 +147,8 @@ void Process(VertexData& vertex, bool hasColor) { float k = gstate.getMaterialSpecularCoef(); specular_factor = pspLightPow(specular_factor, k); - if (specular_factor > 0.f) { - int specular_attspot = (int)ceilf(attspot * specular_factor + 1); + if (specular_factor > 0.0f) { + int specular_attspot = (int)ceilf(256 * 2 * att * spot * specular_factor + 1); if (specular_attspot > 512) specular_attspot = 512; Vec4 lsc = Vec4::FromRGBA(gstate.getSpecularColor(light)); diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 7902e95aff..11eeda3f68 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -227,7 +227,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ vertex.screenpos = ClipToScreenInternal(vertex.clippos, &outside_range_flag); if (vreader.hasNormal()) { - vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).Normalized(cpu_info.bSSE4_1); + vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).NormalizedOr001(cpu_info.bSSE4_1); } else { vertex.worldnormal = Vec3(0.0f, 0.0f, 1.0f); } From 43f71884ee2634e8eee12ffebb3c3dfdbae76cac Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 7 Jan 2022 17:53:24 -0800 Subject: [PATCH 09/10] softgpu: Clarify internal matrix multiply usage. --- GPU/Math3D.h | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/GPU/Math3D.h b/GPU/Math3D.h index 21f8f99cb8..a62905fa13 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -883,7 +883,8 @@ float vectorGetByIndex(__m128 v) { #endif #if defined(_M_SSE) -inline __m128 MATH3D_CALL Vec3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) { +// x, y, and z should be broadcast. Should only be used through Vec3f version. +inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 3); __m128 col2 = _mm_loadu_ps(m + 6); @@ -894,7 +895,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43(__m128 x, __m128 y, __m128 z, const flo return sum; } #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) -inline float32x4_t Vec3ByMatrix43(float32x4_t vec, const float m[16]) { +inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 3); float32x4_t col2 = vld1q_f32(m + 6); @@ -912,14 +913,14 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) __m128 x = _mm_set1_ps(v[0]); __m128 y = _mm_set1_ps(v[1]); __m128 z = _mm_set1_ps(v[2]); - __m128 sum = Vec3ByMatrix43(x, y, z, m); + __m128 sum = Vec3ByMatrix43Internal(x, y, z, m); // Not sure what the best way to store 3 elements is. Ideally, we should // probably store all four. vecOut[0] = _mm_cvtss_f32(sum); vecOut[1] = vectorGetByIndex<1>(sum); vecOut[2] = vectorGetByIndex<2>(sum); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - float32x4_t sum = Vec3ByMatrix43(vld1q_f32(v), m); + float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(v), m); vecOut[0] = vgetq_lane_f32(sum, 0); vecOut[1] = vgetq_lane_f32(sum, 1); vecOut[2] = vgetq_lane_f32(sum, 2); @@ -935,9 +936,9 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) { __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); - return Vec3ByMatrix43(x, y, z, m); + return Vec3ByMatrix43Internal(x, y, z, m); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - return Vec3ByMatrix43(v.vec, m); + return Vec3ByMatrix43Internal(v.vec, m); #else Vec3f vecOut; Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m); @@ -946,7 +947,8 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) { } #if defined(_M_SSE) -inline __m128 MATH3D_CALL Vec3ByMatrix44(__m128 x, __m128 y, __m128 z, const float m[16]) { +// x, y, and z should be broadcast. Should only be used through Vec3f version. +inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 4); __m128 col2 = _mm_loadu_ps(m + 8); @@ -957,7 +959,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix44(__m128 x, __m128 y, __m128 z, const flo return sum; } #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) -inline float32x4_t Vec3ByMatrix44(float32x4_t vec, const float m[16]) { +inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 4); float32x4_t col2 = vld1q_f32(m + 8); @@ -974,10 +976,10 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) __m128 x = _mm_set1_ps(v[0]); __m128 y = _mm_set1_ps(v[1]); __m128 z = _mm_set1_ps(v[2]); - __m128 sum = Vec3ByMatrix44(x, y, z, m); + __m128 sum = Vec3ByMatrix44Internal(x, y, z, m); _mm_storeu_ps(vecOut, sum); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - float32x4_t sum = Vec3ByMatrix44(vld1q_f32(v), m); + float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(v), m); vst1q_f32(vecOut, sum); #else vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12]; @@ -992,9 +994,9 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) { __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); - return Vec3ByMatrix44(x, y, z, m); + return Vec3ByMatrix44Internal(x, y, z, m); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - return Vec3ByMatrix44(v.vec, m); + return Vec3ByMatrix44Internal(v.vec, m); #else Vec4f vecOut; Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m); @@ -1003,7 +1005,8 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) { } #if defined(_M_SSE) -inline __m128 MATH3D_CALL Norm3ByMatrix43(__m128 x, __m128 y, __m128 z, const float m[12]) { +// x, y, and z should be broadcast. Should only be used through Vec3f version. +inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 3); __m128 col2 = _mm_loadu_ps(m + 6); @@ -1013,7 +1016,7 @@ inline __m128 MATH3D_CALL Norm3ByMatrix43(__m128 x, __m128 y, __m128 z, const fl return sum; } #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) -inline float32x4_t Norm3ByMatrix43(float32x4_t vec, const float m[16]) { +inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 3); float32x4_t col2 = vld1q_f32(m + 6); @@ -1029,12 +1032,12 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12] __m128 x = _mm_set1_ps(v[0]); __m128 y = _mm_set1_ps(v[1]); __m128 z = _mm_set1_ps(v[2]); - __m128 sum = Norm3ByMatrix43(x, y, z, m); + __m128 sum = Norm3ByMatrix43Internal(x, y, z, m); vecOut[0] = _mm_cvtss_f32(sum); vecOut[1] = vectorGetByIndex<1>(sum); vecOut[2] = vectorGetByIndex<2>(sum); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - float32x4_t sum = Norm3ByMatrix43(vld1q_f32(v), m); + float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m); vecOut[0] = vgetq_lane_f32(sum, 0); vecOut[1] = vgetq_lane_f32(sum, 1); vecOut[2] = vgetq_lane_f32(sum, 2); @@ -1050,9 +1053,9 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) { __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); - return Norm3ByMatrix43(x, y, z, m); + return Norm3ByMatrix43Internal(x, y, z, m); #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) - return Norm3ByMatrix43(v.vec, m); + return Norm3ByMatrix43Internal(v.vec, m); #else Vec3f vecOut; Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m); From 9458610d96c4949c6d1a15c989c44874da775e6c Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 7 Jan 2022 23:22:57 -0800 Subject: [PATCH 10/10] softgpu: Avoid rsqrt path for normals. In LittleBigPlanet, it's noticeable that the lighting is very off due to the slight loss of accuracy - possibly due to cutoff or similar. --- GPU/Software/TransformUnit.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 11eeda3f68..021e0427f5 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -227,7 +227,8 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_ vertex.screenpos = ClipToScreenInternal(vertex.clippos, &outside_range_flag); if (vreader.hasNormal()) { - vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).NormalizedOr001(cpu_info.bSSE4_1); + vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal); + vertex.worldnormal.NormalizeOr001(); } else { vertex.worldnormal = Vec3(0.0f, 0.0f, 1.0f); }