From a0eeb52444339dfef5f2e9dc4dad10b2368c9966 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 12 Sep 2021 09:37:34 -0700 Subject: [PATCH] softgpu: Decode DXT texels directly. This improves performance a lot compared to decoding the whole block. Eventually we may implement a cache, but threading makes that complex to make properly fast. --- GPU/Common/TextureDecoder.cpp | 64 +++++++++++++++++++++++++++++++++++ GPU/Common/TextureDecoder.h | 4 +++ GPU/Software/Sampler.cpp | 12 ++----- 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index 72da3fd761..adb00e9ad9 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -19,6 +19,7 @@ #include "ext/xxhash.h" #include "Common/Data/Convert/ColorConv.h" #include "Common/CPUDetect.h" +#include "Common/Log.h" #include "GPU/GPU.h" #include "GPU/GPUState.h" @@ -446,6 +447,69 @@ void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int } } +uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) { /* Returns the color of the single texel at (x, y) inside one block; x and y must each be in 0..3. The alpha argument is handed to makecol for every palette entry except the all-zero one. */ + _dbg_assert_(x >= 0 && x < 4); + _dbg_assert_(y >= 0 && y < 4); + + uint16_t c1 = src->color1; + uint16_t c2 = src->color2; /* Expand the packed endpoints to 8-bit channels: red from bits 0-4, green from bits 5-10, blue from bits 11-15. */ + int red1 = (c1 << 3) & 0xF8; + int red2 = (c2 << 3) & 0xF8; + int green1 = (c1 >> 3) & 0xFC; + int green2 = (c2 >> 3) & 0xFC; + int blue1 = (c1 >> 8) & 0xF8; + int blue2 = (c2 >> 8) & 0xF8; + + int colorIndex = (src->lines[y] >> (x * 2)) & 3; /* Each row packs four 2-bit palette indices, with x = 0 in the lowest bits. */ + if (colorIndex == 0) { + return makecol(red1, green1, blue1, alpha); + } else if (colorIndex == 1) { + return makecol(red2, green2, blue2, alpha); + } else if (c1 > c2) { /* color1 > color2: indices 2 and 3 are both mix_2_3 blends, with the endpoint order swapped between them. */ + if (colorIndex == 2) { + return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha); + } + return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha); + } else if (colorIndex == 3) { /* color1 <= color2: index 3 ignores the alpha argument and passes all zeros to makecol. */ + return makecol(0, 0, 0, 0); + } + + // Average - these are always left shifted, so no need to worry 
about ties. /* Only index 2 with color1 <= color2 reaches this point. */ + int red3 = (red1 + red2) / 2; + int green3 = (green1 + green2) / 2; + int blue3 = (blue1 + blue2) / 2; + return makecol(red3, green3, blue3, alpha); +} + +uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) { /* Decodes one texel of a DXT1 block, passing 255 as the alpha for its color. */ + return GetDXTTexelColor(src, x, y, 255); +} + +uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) { /* Decodes one texel of a DXT3 block: the color is decoded with alpha 0, then the 4-bit alpha stored for this texel is ORed in. */ + uint32_t color = GetDXTTexelColor(&src->color, x, y, 0); + u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF; /* Four bits per texel within the row; the value is shifted into bits 28-31 of the result below. */ + return color | (alpha << 28); +} + +uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) { /* Decodes one texel of a DXT5 block: the color is decoded with alpha 0, then an alpha selected by a 3-bit index is ORed into bits 24-31. */ + uint32_t color = GetDXTTexelColor(&src->color, x, y, 0); + uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2; /* 48 bits of indices: alphadata1 is the upper 16 bits, alphadata2 the lower 32; 3 bits per texel, 12 per row. */ + int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7; + + if (alphaIndex == 0) { + return color | (src->alpha1 << 24); + } else if (alphaIndex == 1) { + return color | (src->alpha2 << 24); + } else if (src->alpha1 > src->alpha2) { /* alpha1 > alpha2: indices 2-7 all go through lerp8. */ + return color | (lerp8(src, alphaIndex - 1) << 24); + } else if (alphaIndex == 6) { /* alpha1 <= alpha2: index 6 adds no alpha bits, index 7 sets all eight, and indices 2-5 go through lerp6. */ + return color; + } else if (alphaIndex == 7) { + return color | 0xFF000000; + } + return color | (lerp6(src, alphaIndex - 1) << 24); +} + // This could probably be done faster by decoding two or four blocks at a time with SSE/NEON. 
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) { DXTDecoder dxt; diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index 863e509cba..2b03bc4102 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -92,6 +92,10 @@ void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height); void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height); +uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y); /* GetDXT1Texel, GetDXT3Texel and GetDXT5Texel each take one block plus the x/y coordinates of a texel inside it and return that texel as a uint32_t. */ +uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y); +uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y); + static const u8 textureBitsPerPixel[16] = { 16, //GE_TFMT_5650, 16, //GE_TFMT_5551, diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp index 607aafd3dd..7bc9a7a43c 100644 --- a/GPU/Software/Sampler.cpp +++ b/GPU/Software/Sampler.cpp @@ -379,27 +379,21 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t case GE_TFMT_DXT1: for (int i = 0; i < N; ++i) { const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4); - u32 data[4 * 4]; - DecodeDXT1Block(data, block, 4, 4, false); - res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)]; + res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4); /* u[i] / 4 and v[i] / 4 picked the block above; the remainders pick the texel inside it. */ } return res; case GE_TFMT_DXT3: for (int i = 0; i < N; ++i) { const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4); - u32 data[4 * 4]; - DecodeDXT3Block(data, block, 4, 4); - res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)]; + res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4); } return res; case GE_TFMT_DXT5: for (int i = 0; i < N; ++i) { const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4); - u32 data[4 * 4]; - DecodeDXT5Block(data, block, 4, 4); - res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)]; + res.v[i] = GetDXT5Texel(block, 
u[i] % 4, v[i] % 4); } return res;