From 3593a7963e2def821a00159ff044af88515e9c13 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 26 Mar 2016 21:29:48 -0700 Subject: [PATCH] Cleanup and clarify texture swizzling funcs. --- Core/HW/MediaEngine.cpp | 6 +-- GPU/Common/TextureCacheCommon.cpp | 61 +++++++------------------------ GPU/Common/TextureDecoder.cpp | 42 +++++++++++++-------- GPU/Common/TextureDecoder.h | 8 ++-- 4 files changed, 47 insertions(+), 70 deletions(-) diff --git a/Core/HW/MediaEngine.cpp b/Core/HW/MediaEngine.cpp index ebd35bde92..da4140b88b 100644 --- a/Core/HW/MediaEngine.cpp +++ b/Core/HW/MediaEngine.cpp @@ -673,13 +673,12 @@ int MediaEngine::writeVideoImage(u32 bufferPtr, int frameWidth, int videoPixelMo } if (swizzle) { - const u32 pitch = videoLineSize / 4; const int bxc = videoLineSize / 16; int byc = (height + 7) / 8; if (byc == 0) byc = 1; - DoSwizzleTex16((const u32 *)imgbuf, buffer, bxc, byc, pitch, videoLineSize); + DoSwizzleTex16((const u32 *)imgbuf, buffer, bxc, byc, videoLineSize); delete [] imgbuf; } @@ -789,13 +788,12 @@ int MediaEngine::writeVideoImageWithRange(u32 bufferPtr, int frameWidth, int vid if (swizzle) { WARN_LOG_REPORT_ONCE(vidswizzle, ME, "Swizzling Video with range"); - const u32 pitch = videoLineSize / 4; const int bxc = videoLineSize / 16; int byc = (height + 7) / 8; if (byc == 0) byc = 1; - DoSwizzleTex16((const u32 *)imgbuf, buffer, bxc, byc, pitch, videoLineSize); + DoSwizzleTex16((const u32 *)imgbuf, buffer, bxc, byc, videoLineSize); delete [] imgbuf; } diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index c28b896743..c68dcff668 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -365,57 +365,24 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) { } void TextureCacheCommon::UnswizzleFromMem(u32 *dest, const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel) { + // Note: bufw is always aligned to 16 bytes, so rowWidth is always >= 16. 
const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2); - const u32 pitch = rowWidth / 4; + // A visual mapping of unswizzling, where each letter is 16-byte and 8 letters is a block: + // + // ABCDEFGH IJKLMNOP + // -> + // AI + // BJ + // CK + // ... + // + // bxc is the number of blocks in the x direction, and byc the number in the y direction. const int bxc = rowWidth / 16; + // The height is not always aligned to 8, but rounds up. int byc = (height + 7) / 8; - if (byc == 0) - byc = 1; - u32 ydest = 0; - if (rowWidth >= 16) { - // The most common one, so it gets an optimized implementation. - DoUnswizzleTex16(texptr, dest, bxc, byc, pitch, rowWidth); - } else if (rowWidth == 8) { - const u32 *src = (const u32 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 8; n++, ydest += 2) { - dest[ydest + 0] = *src++; - dest[ydest + 1] = *src++; - src += 2; // skip two u32 - } - } - } else if (rowWidth == 4) { - const u32 *src = (const u32 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 8; n++, ydest++) { - dest[ydest] = *src++; - src += 3; - } - } - } else if (rowWidth == 2) { - const u16 *src = (const u16 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 4; n++, ydest++) { - u16 n1 = src[0]; - u16 n2 = src[8]; - dest[ydest] = (u32)n1 | ((u32)n2 << 16); - src += 16; - } - } - } else if (rowWidth == 1) { - const u8 *src = (const u8 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 2; n++, ydest++) { - u8 n1 = src[ 0]; - u8 n2 = src[16]; - u8 n3 = src[32]; - u8 n4 = src[48]; - dest[ydest] = (u32)n1 | ((u32)n2 << 8) | ((u32)n3 << 16) | ((u32)n4 << 24); - src += 64; - } - } - } + // TODO: Can change rowWidth param below (leave above) to adjust dest pitch. 
+ DoUnswizzleTex16(texptr, dest, bxc, byc, rowWidth); } void *TextureCacheCommon::RearrangeBuf(void *inBuf, u32 inRowBytes, u32 outRowBytes, int h, bool allowInPlace) { diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index d70b3b31f3..961dca9f36 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -197,9 +197,14 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) { return check; } -void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch, u32 rowWidth) { +void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) { + // ysrcp is in 32-bits, so this is convenient. + const u32 pitchBy32 = pitch >> 2; #ifdef _M_SSE __m128i *dest = (__m128i *)texptr; + // The pitch parameter is in bytes, so shift down for 128-bit. + // Note: it's always aligned to 16 bytes, so this is safe. + const u32 pitchBy128 = pitch >> 4; for (int by = 0; by < byc; by++) { const __m128i *xsrc = (const __m128i *)ysrcp; for (int bx = 0; bx < bxc; bx++) { @@ -207,13 +212,13 @@ void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch, u for (int n = 0; n < 2; n++) { // Textures are always 16-byte aligned so this is fine. 
__m128i temp1 = _mm_load_si128(src); - src += pitch >> 2; + src += pitchBy128; __m128i temp2 = _mm_load_si128(src); - src += pitch >> 2; + src += pitchBy128; __m128i temp3 = _mm_load_si128(src); - src += pitch >> 2; + src += pitchBy128; __m128i temp4 = _mm_load_si128(src); - src += pitch >> 2; + src += pitchBy128; _mm_store_si128(dest, temp1); _mm_store_si128(dest + 1, temp2); @@ -223,7 +228,7 @@ void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch, u } xsrc++; } - ysrcp += (rowWidth * 8) / 4; + ysrcp += pitchBy32 * 8; } #else u32 *dest = (u32 *)texptr; @@ -233,19 +238,24 @@ void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch, u const u32 *src = xsrc; for (int n = 0; n < 8; n++) { memcpy(dest, src, 16); - src += pitch; + src += pitchBy32; dest += 4; } xsrc += 4; } - ysrcp += (rowWidth * 8) / 4; + ysrcp += pitchBy32 * 8; } #endif } -void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) { +void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) { + // ydestp is in 32-bits, so this is convenient. + const u32 pitchBy32 = pitch >> 2; #ifdef _M_SSE const __m128i *src = (const __m128i *)texptr; + // The pitch parameter is in bytes, so shift down for 128-bit. + // Note: it's always aligned to 16 bytes, so this is safe. 
+ const u32 pitchBy128 = pitch >> 4; for (int by = 0; by < byc; by++) { __m128i *xdest = (__m128i *)ydestp; for (int bx = 0; bx < bxc; bx++) { @@ -257,18 +267,18 @@ void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 __m128i temp3 = _mm_load_si128(src + 2); __m128i temp4 = _mm_load_si128(src + 3); _mm_store_si128(dest, temp1); - dest += pitch >> 2; + dest += pitchBy128; _mm_store_si128(dest, temp2); - dest += pitch >> 2; + dest += pitchBy128; _mm_store_si128(dest, temp3); - dest += pitch >> 2; + dest += pitchBy128; _mm_store_si128(dest, temp4); - dest += pitch >> 2; + dest += pitchBy128; src += 4; } xdest++; } - ydestp += (rowWidth * 8) / 4; + ydestp += pitchBy32 * 8; } #else const u32 *src = (const u32 *)texptr; @@ -278,12 +288,12 @@ void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 u32 *dest = xdest; for (int n = 0; n < 8; n++) { memcpy(dest, src, 16); - dest += pitch; + dest += pitchBy32; src += 4; } xdest += 4; } - ydestp += (rowWidth * 8) / 4; + ydestp += pitchBy32 * 8; } #endif } diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index 87efe8ef33..d3ddca9475 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -32,14 +32,16 @@ enum CheckAlphaResult { void SetupTextureDecoder(); -void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch, u32 rowWidth); +// Pitch must be aligned to 16 bytes (as is the case on a PSP) +void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch); // For SSE, we statically link the SSE2 algorithms. 
#if defined(_M_SSE) u32 QuickTexHashSSE2(const void *checkp, u32 size); #define DoQuickTexHash QuickTexHashSSE2 -void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth); +// Pitch must be aligned to 16 bytes (as is the case on a PSP) +void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch); #define DoUnswizzleTex16 DoUnswizzleTex16Basic #include "ext/xxhash.h" @@ -70,7 +72,7 @@ typedef u64 ReliableHashType; typedef u32 (*QuickTexHashFunc)(const void *checkp, u32 size); extern QuickTexHashFunc DoQuickTexHash; -typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth); +typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch); extern UnswizzleTex16Func DoUnswizzleTex16; typedef u32 (*ReliableHash32Func)(const void *input, size_t len, u32 seed);