From 7c526acc69f789f295e528dd26968622070cff99 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 13 May 2017 06:49:27 -0700 Subject: [PATCH 1/4] SoftGPU: Multiply S/T early via SIMD. --- GPU/Software/Rasterizer.cpp | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index d1b1ee3b40..72ecf814ef 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -1145,12 +1145,6 @@ static inline void ApplyTexturing(Vec4 &prim_color, float s, float t, int t int u[8] = {0}, v[8] = {0}; // 1.23.8 fixed point int frac_u[2], frac_v[2]; - if (gstate.isModeThrough()) { - // For levels > 0, these are always based on level 0. Simpler to round initially. - s /= (float)gstate.getTextureWidth(0); - t /= (float)gstate.getTextureHeight(0); - } - Vec4 texcolor0; Vec4 texcolor1; const u8 *tptr0 = texptr[texlevel]; @@ -1207,12 +1201,8 @@ static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, c int width = gstate.getTextureWidth(0); int height = gstate.getTextureHeight(0); - float ds = s[1] - s[0]; - float dt = t[2] - t[0]; - if (!gstate.isModeThrough()) { - ds *= width; - dt *= height; - } + float ds = (s[1] - s[0]) * width; + float dt = (t[2] - t[0]) * height; // With 8 bits of fraction (because texslope can be fairly precise.) int detail; @@ -1418,6 +1408,10 @@ void DrawTriangleSlice( if (gstate.isModeThrough()) { s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), w0, w1, w2, wsum_recip); t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), w0, w1, w2, wsum_recip); + + // For levels > 0, mipmapping is always based on level 0. Simpler to scale first. + s *= 1.0f / (float)gstate.getTextureWidth(0); + t *= 1.0f / (float)gstate.getTextureHeight(0); } else { // Texture coordinate interpolation must definitely be perspective-correct. GetTextureCoordinates(v0, v1, v2, w0, w1, w2, wsum_recip, s, t); @@ -1566,6 +1560,11 @@ void DrawPoint(const VertexData &v0) } } + if (gstate.isModeThrough()) { + s *= 1.0f / (float)gstate.getTextureWidth(0); + t *= 1.0f / (float)gstate.getTextureHeight(0); + } + ApplyTexturing(prim_color, s, t, 0, 0, magFilt, texptr, texbufwidthbytes); } @@ -1665,6 +1664,10 @@ void DrawLine(const VertexData &v0, const VertexData &v1) float t = tc.t(); if (gstate.isTextureMapEnabled() && !clearMode) { + if (gstate.isModeThrough()) { + s *= 1.0f / (float)gstate.getTextureWidth(0); + t *= 1.0f / (float)gstate.getTextureHeight(0); + } ApplyTexturing(prim_color, s, t, 0, 0, magFilt, texptr, texbufwidthbytes); } From 6b0c9a5531bb99c9f9d8e9f8ddbd5b50dc7f0ee5 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 13 May 2017 06:56:33 -0700 Subject: [PATCH 2/4] SoftGPU: Improve wsum recip SIMD. A bit faster. --- GPU/Software/Rasterizer.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 72ecf814ef..c568c0cef8 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -1291,7 +1291,7 @@ inline Vec4 TriangleEdge::StepY(const Vec4 &w) { #endif } -inline Vec4 MakeMask(const Vec4 &w0, const Vec4 &w1, const Vec4 &w2, const Vec4 &bias0, const Vec4 &bias1, const Vec4 &bias2) { +static inline Vec4 MakeMask(const Vec4 &w0, const Vec4 &w1, const Vec4 &w2, const Vec4 &bias0, const Vec4 &bias1, const Vec4 &bias2) { #if defined(_M_SSE) && !defined(_M_IX86) __m128i biased0 = _mm_add_epi32(w0.ivec, bias0.ivec); __m128i biased1 = _mm_add_epi32(w1.ivec, bias1.ivec); @@ -1303,7 +1303,7 @@ inline Vec4 MakeMask(const Vec4 &w0, const Vec4 &w1, const Vec4 &mask) { +static inline bool AnyMask(const Vec4 &mask) { #if defined(_M_SSE) && !defined(_M_IX86) // In other words: !(mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0) __m128i low2 = _mm_and_si128(mask.ivec, _mm_shuffle_epi32(mask.ivec, _MM_SHUFFLE(3, 2, 3, 2))); @@ -1315,6 +1315,15 @@ inline bool AnyMask(const Vec4 &mask) { #endif } +static inline Vec4 EdgeRecip(const Vec4 &w0, const Vec4 &w1, const Vec4 &w2) { +#if defined(_M_SSE) && !defined(_M_IX86) + __m128i wsum = _mm_add_epi32(w0.ivec, _mm_add_epi32(w1.ivec, w2.ivec)); + return _mm_rcp_ps(_mm_cvtepi32_ps(wsum)); +#else + return (w0 + w1 + w2).Cast().Reciprocal(); +#endif +} + template void DrawTriangleSlice( const VertexData& v0, const VertexData& v1, const VertexData& v2, @@ -1386,7 +1395,7 @@ void DrawTriangleSlice( // If p is on or inside all edges, render pixel Vec4 mask = MakeMask(w0, w1, w2, bias0, bias1, bias2); if (AnyMask(mask)) { - Vec4 wsum_recip = (w0 + w1 + w2).Cast().Reciprocal(); + Vec4 wsum_recip = EdgeRecip(w0, w1, w2); Vec4 prim_color[4]; Vec3 sec_color[4]; From 9e34601be276383b093232f02c57f87157621413 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 13 May 2017 07:53:39 -0700 Subject: [PATCH 3/4] SoftGPU: Correct texturing for pixel centers. --- GPU/Software/Rasterizer.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index c568c0cef8..1e5431d1aa 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -1201,14 +1201,17 @@ static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, c int width = gstate.getTextureWidth(0); int height = gstate.getTextureHeight(0); - float ds = (s[1] - s[0]) * width; - float dt = (t[2] - t[0]) * height; + float ds = s[1] - s[0]; + float dt = t[2] - t[0]; + + Vec4 os = s + Vec4::AssignToAll(ds * 0.5f); + Vec4 ot = t + Vec4::AssignToAll(dt * 0.5f); // With 8 bits of fraction (because texslope can be fairly precise.) int detail; switch (gstate.getTexLevelMode()) { case GE_TEXLEVEL_MODE_AUTO: - detail = TexLog2(std::max(ds, dt)); + detail = TexLog2(std::max(ds * width, dt * height)); break; case GE_TEXLEVEL_MODE_SLOPE: // This is always offset by an extra texlevel. @@ -1247,7 +1250,7 @@ static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, c } for (int i = 0; i < 4; ++i) { - ApplyTexturing(prim_color[i], s[i], t[i], level, levelFrac, filt, texptr, texbufwidthbytes); + ApplyTexturing(prim_color[i], os[i], ot[i], level, levelFrac, filt, texptr, texbufwidthbytes); } } @@ -1677,6 +1680,7 @@ void DrawLine(const VertexData &v0, const VertexData &v1) s *= 1.0f / (float)gstate.getTextureWidth(0); t *= 1.0f / (float)gstate.getTextureHeight(0); } + // TODO: ds/dt. ApplyTexturing(prim_color, s, t, 0, 0, magFilt, texptr, texbufwidthbytes); } From 86a189873258934066ba32e670139798d8cb4d37 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 13 May 2017 08:03:08 -0700 Subject: [PATCH 4/4] SoftGPU: Correct rendering for pixel centers. This should theoretically calculate everything more correctly. --- GPU/Software/Rasterizer.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 1e5431d1aa..fba5290170 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -1204,9 +1204,6 @@ static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, c float ds = s[1] - s[0]; float dt = t[2] - t[0]; - Vec4 os = s + Vec4::AssignToAll(ds * 0.5f); - Vec4 ot = t + Vec4::AssignToAll(dt * 0.5f); - // With 8 bits of fraction (because texslope can be fairly precise.) int detail; switch (gstate.getTexLevelMode()) { @@ -1250,7 +1247,7 @@ static inline void ApplyTexturing(Vec4 *prim_color, const Vec4 &s, c } for (int i = 0; i < 4; ++i) { - ApplyTexturing(prim_color[i], os[i], ot[i], level, levelFrac, filt, texptr, texbufwidthbytes); + ApplyTexturing(prim_color[i], s[i], t[i], level, levelFrac, filt, texptr, texbufwidthbytes); } } @@ -1264,8 +1261,9 @@ struct TriangleEdge { }; Vec4 TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) { - Vec4 initX = Vec4::AssignToAll(origin.x) + Vec4(0, 16, 0, 16); - Vec4 initY = Vec4::AssignToAll(origin.y) + Vec4(0, 0, 16, 16); + // Start at pixel centers. + Vec4 initX = Vec4::AssignToAll(origin.x) + Vec4(7, 23, 7, 23); + Vec4 initY = Vec4::AssignToAll(origin.y) + Vec4(7, 7, 23, 23); // orient2d refactored. int xf = v0.y - v1.y;