softgpu: Decode DXT texels directly.

This improves performance a lot compared to decoding the whole block for every sample. Eventually we may implement a cache, but threading makes it complex to make one properly fast.
2025-04-02 11:01:50 -04:00 · 2021-09-12 09:37:34 -07:00 · 2021-09-12 09:37:34 -07:00 · a0eeb52444
commit a0eeb52444
parent 1ee5352d3e
3 changed files with 71 additions and 9 deletions
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@ -19,6 +19,7 @@
 #include "ext/xxhash.h"
 #include "Common/Data/Convert/ColorConv.h"
 #include "Common/CPUDetect.h"
+#include "Common/Log.h"

 #include "GPU/GPU.h"
 #include "GPU/GPUState.h"
@ -446,6 +447,69 @@ void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int
 	}
 }

+// Decodes the color of one texel from the color part of a DXT block, without
+// decoding the other 15 texels.
+//
+// src:   color block (two 16-bit endpoint colors plus four rows of selectors).
+// x, y:  texel position inside the block; both must be in 0..3.
+// alpha: alpha value handed to makecol() for every result except the
+//        "transparent black" selector, which always uses 0.
+uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {
+	_dbg_assert_(x >= 0 && x < 4);
+	_dbg_assert_(y >= 0 && y < 4);
+
+	const uint16_t endpoint0 = src->color1;
+	const uint16_t endpoint1 = src->color2;
+
+	// Red sits in the low 5 bits, green in the middle 6, blue in the top 5.
+	// Each channel is moved to the top of an 8-bit value (low bits stay zero).
+	const int r0 = (endpoint0 << 3) & 0xF8;
+	const int g0 = (endpoint0 >> 3) & 0xFC;
+	const int b0 = (endpoint0 >> 8) & 0xF8;
+	const int r1 = (endpoint1 << 3) & 0xF8;
+	const int g1 = (endpoint1 >> 3) & 0xFC;
+	const int b1 = (endpoint1 >> 8) & 0xF8;
+
+	// The ordering of the raw endpoints decides how selectors 2 and 3 are read.
+	const bool fourColor = endpoint0 > endpoint1;
+
+	// Every row holds four 2-bit selectors, with x == 0 in the lowest bits.
+	const int selector = (src->lines[y] >> (x * 2)) & 3;
+	switch (selector) {
+	case 0:
+		return makecol(r0, g0, b0, alpha);
+
+	case 1:
+		return makecol(r1, g1, b1, alpha);
+
+	case 2:
+		if (fourColor) {
+			return makecol(mix_2_3(r0, r1), mix_2_3(g0, g1), mix_2_3(b0, b1), alpha);
+		} else {
+			// Plain average.  The channels were left shifted above, so their
+			// low bits are zero and the sum never produces a rounding tie.
+			const int rMid = (r0 + r1) / 2;
+			const int gMid = (g0 + g1) / 2;
+			const int bMid = (b0 + b1) / 2;
+			return makecol(rMid, gMid, bMid, alpha);
+		}
+
+	default:
+		// selector == 3
+		if (fourColor) {
+			return makecol(mix_2_3(r1, r0), mix_2_3(g1, g0), mix_2_3(b1, b0), alpha);
+		}
+		// Transparent black, regardless of the requested alpha.
+		return makecol(0, 0, 0, 0);
+	}
+}
+
+// Returns the texel at (x, y) of a DXT1 block.  The color is decoded with an
+// alpha of 255; x and y must be in 0..3.
+uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {
+	const int opaqueAlpha = 255;
+	return GetDXTTexelColor(src, x, y, opaqueAlpha);
+}
+
+// Returns the texel at (x, y) of a DXT3 block; x and y must be in 0..3.
+// The color is decoded with zero alpha, then the texel's explicit 4-bit alpha
+// is placed in the top four bits of the result.
+uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {
+	const uint32_t rgb = GetDXTTexelColor(&src->color, x, y, 0);
+
+	// Each alpha row packs four 4-bit values, with x == 0 in the lowest nibble.
+	const u32 alphaNibble = (src->alphaLines[y] >> (x * 4)) & 0xF;
+	return (alphaNibble << 28) | rgb;
+}
+
+// Returns the texel at (x, y) of a DXT5 block; x and y must be in 0..3.
+// The color is decoded with zero alpha, then the alpha chosen by the texel's
+// 3-bit alpha selector is placed in the top byte.
+uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {
+	const uint32_t rgb = GetDXTTexelColor(&src->color, x, y, 0);
+
+	// 48 selector bits: alphadata1 supplies the upper 16, alphadata2 the lower 32.
+	// Each row uses 12 bits, each texel 3.
+	const uint64_t selectorBits = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;
+	const int selector = (selectorBits >> (y * 12 + x * 3)) & 7;
+
+	// Selectors 0 and 1 always pick the stored endpoint alphas directly.
+	switch (selector) {
+	case 0:
+		return rgb | (src->alpha1 << 24);
+	case 1:
+		return rgb | (src->alpha2 << 24);
+	default:
+		break;
+	}
+
+	// alpha1 > alpha2: every remaining selector goes through lerp8().
+	if (src->alpha1 > src->alpha2) {
+		return rgb | (lerp8(src, selector - 1) << 24);
+	}
+
+	// Otherwise selectors 2..5 go through lerp6(), while 6 and 7 are fixed
+	// to alpha 0 and alpha 0xFF respectively.
+	switch (selector) {
+	case 6:
+		return rgb;
+	case 7:
+		return rgb | 0xFF000000;
+	default:
+		return rgb | (lerp6(src, selector - 1) << 24);
+	}
+}
+
 // This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
 void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) {
 	DXTDecoder dxt;
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@ -92,6 +92,10 @@ void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool
 void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height);
 void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height);

+uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y);  // One texel of a DXT1 block; x and y are 0..3 within the block.
+uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y);  // One texel of a DXT3 block; x and y are 0..3 within the block.
+uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y);  // One texel of a DXT5 block; x and y are 0..3 within the block.
+
 static const u8 textureBitsPerPixel[16] = {
 	16,  //GE_TFMT_5650,
 	16,  //GE_TFMT_5551,
--- a/GPU/Software/Sampler.cpp
+++ b/GPU/Software/Sampler.cpp
@ -379,27 +379,21 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t
 	case GE_TFMT_DXT1:
 		for (int i = 0; i < N; ++i) {
 			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			u32 data[4 * 4];
-			DecodeDXT1Block(data, block, 4, 4, false);
-			res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)];
+			res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4);
 		}
 		return res;

 	case GE_TFMT_DXT3:
 		for (int i = 0; i < N; ++i) {
 			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			u32 data[4 * 4];
-			DecodeDXT3Block(data, block, 4, 4);
-			res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)];
+			res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4);
 		}
 		return res;

 	case GE_TFMT_DXT5:
 		for (int i = 0; i < N; ++i) {
 			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			u32 data[4 * 4];
-			DecodeDXT5Block(data, block, 4, 4);
-			res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)];
+			res.v[i] = GetDXT5Texel(block, u[i] % 4, v[i] % 4);
 		}
 		return res;