diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index 80523aa554..ad8388f5a2 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -330,3 +330,339 @@ void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch) {
 		dst += pitch;
 	}
 }
+
+#ifdef _M_SSE
+static inline u32 CombineSSEBits(const __m128i &v) {
+	__m128i temp;
+	temp = _mm_or_si128(v, _mm_srli_si128(v, 8));  // OR the upper 64 bits into the lower 64.
+	temp = _mm_or_si128(temp, _mm_srli_si128(temp, 4));  // OR bits 32-63 into the low 32.
+	return _mm_cvtsi128_si32(temp);  // Non-zero if any bit of v was set.
+}
+
+CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w, int h) {
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i full = _mm_set1_epi32(0xFF);
+
+	const __m128i *p = (const __m128i *)pixelData;
+	const int w4 = w / 4;
+	const int stride4 = stride / 4;
+
+	__m128i hasZeroCursor = _mm_setzero_si128();
+	for (int y = 0; y < h; ++y) {
+		__m128i hasAnyCursor = _mm_setzero_si128();
+
+		for (int i = 0; i < w4; ++i) {
+			const __m128i a = _mm_srli_epi32(_mm_load_si128(&p[i]), 24);
+
+			const __m128i isZero = _mm_cmpeq_epi32(a, zero);
+			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+
+			// If a = FF, isNotFull will be 0 -> hasAny will be 0.
+			// If a = 00, a & isNotFull will be 0 -> hasAny will be 0.
+			// In any other case, hasAny will have some bits set.
+			const __m128i isNotFull = _mm_cmplt_epi32(a, full);
+			hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
+		}
+		p += stride4;
+
+		// We check any early, in case we can skip the rest of the rows.
+		if (CombineSSEBits(hasAnyCursor) != 0) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// No partial alpha was found: the result depends only on whether any pixel had zero alpha.
+	if (CombineSSEBits(hasZeroCursor) != 0) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w, int h) {
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i full = _mm_set1_epi16(0x000F);
+
+	const __m128i *p = (const __m128i *)pixelData;
+	const int w8 = w / 8;
+	const int stride8 = stride / 8;
+
+	__m128i hasZeroCursor = _mm_setzero_si128();
+	for (int y = 0; y < h; ++y) {
+		__m128i hasAnyCursor = _mm_setzero_si128();
+
+		for (int i = 0; i < w8; ++i) {
+			const __m128i a = _mm_and_si128(_mm_load_si128(&p[i]), full);
+
+			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
+			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+
+			// Per 16-bit pixel: a = F -> isNotFull is 0; a = 0 -> a & isNotFull is 0.
+			// In any other case, hasAny will have some bits set.
+			// The compare must be epi16: an epi32 compare treats a 0/F pixel pair as "not full".
+			const __m128i isNotFull = _mm_cmplt_epi16(a, full);
+			hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
+		}
+		p += stride8;
+
+		// We check any early, in case we can skip the rest of the rows.
+		if (CombineSSEBits(hasAnyCursor) != 0) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// No partial alpha was found: the result depends only on whether any pixel had zero alpha.
+	if (CombineSSEBits(hasZeroCursor) != 0) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
+	const __m128i zero = _mm_setzero_si128();
+
+	const __m128i *p = (const __m128i *)pixelData;
+	const int w8 = w / 8;
+	const int stride8 = stride / 8;
+
+	__m128i hasZeroCursor = _mm_setzero_si128();
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w8; ++i) {
+			const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 15);  // Keep only bit 0 (alpha) of each pixel.
+
+			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
+			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+		}
+		p += stride8;
+	}
+
+	// 1-bit alpha is never partial: the result depends only on whether any pixel had zero alpha.
+	if (CombineSSEBits(hasZeroCursor) != 0) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i full = _mm_set1_epi16(0x000F);
+
+	const __m128i *p = (const __m128i *)pixelData;
+	const int w8 = w / 8;
+	const int stride8 = stride / 8;
+
+	__m128i hasZeroCursor = _mm_setzero_si128();
+	for (int y = 0; y < h; ++y) {
+		__m128i hasAnyCursor = _mm_setzero_si128();
+
+		for (int i = 0; i < w8; ++i) {
+			const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 12);
+
+			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
+			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+
+			// Per 16-bit pixel: a = F -> isNotFull is 0; a = 0 -> a & isNotFull is 0.
+			// In any other case, hasAny will have some bits set.
+			// The compare must be epi16: an epi32 compare treats a 0/F pixel pair as "not full".
+			const __m128i isNotFull = _mm_cmplt_epi16(a, full);
+			hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
+		}
+		p += stride8;
+
+		// We check any early, in case we can skip the rest of the rows.
+		if (CombineSSEBits(hasAnyCursor) != 0) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// No partial alpha was found: the result depends only on whether any pixel had zero alpha.
+	if (CombineSSEBits(hasZeroCursor) != 0) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
+	const __m128i zero = _mm_setzero_si128();
+	// No "full" constant is needed: 1-bit alpha is either zero or full, never partial.
+
+	const __m128i *p = (const __m128i *)pixelData;
+	const int w8 = w / 8;
+	const int stride8 = stride / 8;
+
+	__m128i hasZeroCursor = _mm_setzero_si128();
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w8; ++i) {
+			const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 15);  // Keep only bit 15 (alpha) of each pixel.
+
+			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
+			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+		}
+		p += stride8;
+	}
+
+	// 1-bit alpha is never partial: the result depends only on whether any pixel had zero alpha.
+	if (CombineSSEBits(hasZeroCursor) != 0) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+#endif
+
+CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
+#ifdef _M_SSE
+	// Use SSE if aligned to 16 bytes / 4 pixels (almost always the case.)
+	if ((w & 3) == 0 && (stride & 3) == 0) {
+		return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
+	}
+#endif
+
+	u32 hitZeroAlpha = 0;
+
+	const u32 *p = pixelData;
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w; ++i) {
+			u32 a = p[i] & 0xFF000000;
+			hitZeroAlpha |= a ^ 0xFF000000;
+			if (a != 0xFF000000 && a != 0) {
+				// We're done, we hit non-zero, non-full alpha.
+				return CHECKALPHA_ANY;
+			}
+		}
+		p += stride;  // One u32 per pixel here, so the pixel stride is also the u32 stride.
+	}
+
+	if (hitZeroAlpha) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
+#ifdef _M_SSE
+	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	if ((w & 7) == 0 && (stride & 7) == 0) {
+		return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
+	}
+#endif
+
+	u32 hitZeroAlpha = 0;
+
+	const u32 *p = pixelData;
+	const int w2 = (w + 1) / 2;
+	const int stride2 = (stride + 1) / 2;
+
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w2; ++i) {
+			u32 a = p[i] & 0x000F000F;
+			hitZeroAlpha |= a ^ 0x000F000F;
+			if (a != 0x000F000F && a != 0x0000000F && a != 0x000F0000 && a != 0) {
+				// We're done, we hit non-zero, non-full alpha.
+				return CHECKALPHA_ANY;
+			}
+		}
+		p += stride2;  // p walks u32s (two 16-bit pixels each), so step by the halved stride.
+	}
+
+	if (hitZeroAlpha) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
+#ifdef _M_SSE
+	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	if ((w & 7) == 0 && (stride & 7) == 0) {
+		return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
+	}
+#endif
+
+	u32 hitZeroAlpha = 0;
+
+	const u32 *p = pixelData;
+	const int w2 = (w + 1) / 2;
+	const int stride2 = (stride + 1) / 2;
+
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w2; ++i) {
+			u32 a = p[i] & 0x00010001;
+			hitZeroAlpha |= a ^ 0x00010001;
+		}
+		p += stride2;  // p walks u32s (two 16-bit pixels each), so step by the halved stride.
+	}
+
+	if (hitZeroAlpha) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
+#ifdef _M_SSE
+	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	if ((w & 7) == 0 && (stride & 7) == 0) {
+		return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
+	}
+#endif
+
+	u32 hitZeroAlpha = 0;
+
+	const u32 *p = pixelData;
+	const int w2 = (w + 1) / 2;
+	const int stride2 = (stride + 1) / 2;
+
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w2; ++i) {
+			u32 a = p[i] & 0xF000F000;
+			hitZeroAlpha |= a ^ 0xF000F000;
+			if (a != 0xF000F000 && a != 0xF0000000 && a != 0x0000F000 && a != 0) {
+				// We're done, we hit non-zero, non-full alpha.
+				return CHECKALPHA_ANY;
+			}
+		}
+		p += stride2;  // p walks u32s (two 16-bit pixels each), so step by the halved stride.
+	}
+
+	if (hitZeroAlpha) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) {
+#ifdef _M_SSE
+	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	if ((w & 7) == 0 && (stride & 7) == 0) {
+		return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
+	}
+#endif
+
+	u32 hitZeroAlpha = 0;
+
+	const u32 *p = pixelData;
+	const int w2 = (w + 1) / 2;
+	const int stride2 = (stride + 1) / 2;
+
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w2; ++i) {
+			u32 a = p[i] & 0x80008000;
+			hitZeroAlpha |= a ^ 0x80008000;
+		}
+		p += stride2;  // p walks u32s (two 16-bit pixels each), so step by the halved stride.
+	}
+
+	if (hitZeroAlpha) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h
index 0d07cb598c..d6ff74e52c 100644
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@@ -75,6 +75,19 @@ extern ReliableHash64Func DoReliableHash64;
 typedef u32 ReliableHashType;
 #endif
 
+enum CheckAlphaResult {
+	// Intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.; CheckAlpha() casts the result directly.
+	CHECKALPHA_FULL = 0,
+	CHECKALPHA_ANY = 4,
+	CHECKALPHA_ZERO = 8,
+};
+
+CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h);
+
 // All these DXT structs are in the reverse order, as compared to PC.
 // On PC, alpha comes before color, and interpolants are before the tile data.
 
diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp
index 5b14e75aab..15dcc55f59 100644
--- a/GPU/Directx9/TextureCacheDX9.cpp
+++ b/GPU/Directx9/TextureCacheDX9.cpp
@@ -1691,68 +1691,24 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
 }
 
 TextureCacheDX9::TexCacheEntry::Status TextureCacheDX9::CheckAlpha(const u32 *pixelData, u32 dstFmt, int stride, int w, int h) {
-	// TODO: Could probably be optimized more.
-	u32 hitZeroAlpha = 0;
-	u32 hitSomeAlpha = 0;
-
+	CheckAlphaResult res;
 	switch (dstFmt) {
 	case D3DFMT_A4R4G4B4:
-		{
-			const u32 *p = pixelData;
-			for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
-				for (int i = 0; i < (w + 1) / 2; ++i) {
-					u32 a = p[i] & 0xF000F000;
-					hitZeroAlpha |= a ^ 0xF000F000;
-					if (a != 0xF000F000 && a != 0xF0000000 && a != 0x0000F000 && a != 0) {
-						hitSomeAlpha = 1;
-						break;
-					}
-				}
-				p += stride/2;
-			}
-		}
+		res = CheckAlphaRGBA4444Basic(pixelData, stride, w, h);
 		break;
 	case D3DFMT_A1R5G5B5:
-		{
-			const u32 *p = pixelData;
-			for (int y = 0; y < h; ++y) {
-				for (int i = 0; i < (w + 1) / 2; ++i) {
-					u32 a = p[i] & 0x80008000;
-					hitZeroAlpha |= a ^ 0x80008000;
-				}
-				p += stride/2;
-			}
-		}
+		res = CheckAlphaRGBA5551Basic(pixelData, stride, w, h);
 		break;
 	case D3DFMT_R5G6B5:
-		{
-			// Never has any alpha.
-		}
+		// 565 has no alpha bits to inspect, so report it as full alpha without scanning.
+		res = CHECKALPHA_FULL;
 		break;
 	default:
-		{
-			const u32 *p = pixelData;
-			for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
-				for (int i = 0; i < w; ++i) {
-					u32 a = p[i] & 0xFF000000;
-					hitZeroAlpha |= a ^ 0xFF000000;
-					if (a != 0xFF000000 && a != 0) {
-						hitSomeAlpha = 1;
-						break;
-					}
-				}
-				p += stride;
-			}
-		}
+		res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h);
 		break;
 	}
 
-	if (hitSomeAlpha != 0)
-		return TexCacheEntry::STATUS_ALPHA_UNKNOWN;
-	else if (hitZeroAlpha != 0)
-		return TexCacheEntry::STATUS_ALPHA_SIMPLE;
-	else
-		return TexCacheEntry::STATUS_ALPHA_FULL;
+	return (TexCacheEntry::Status)res;
 }
 
 static inline void copyTexture(int xoffset, int yoffset, int w, int h, int pitch, int srcfmt, int fmt, void * pSrc, void * pDst) {
diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp
index a66bd8b84f..6364850c34 100644
--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@@ -1826,68 +1826,24 @@ void *TextureCache::DecodeTextureLevel(GETextureFormat format, GEPaletteFormat c
 }
 
 TextureCache::TexCacheEntry::Status TextureCache::CheckAlpha(const u32 *pixelData, GLenum dstFmt, int stride, int w, int h) {
-	// TODO: Could probably be optimized more.
-	u32 hitZeroAlpha = 0;
-	u32 hitSomeAlpha = 0;
-
+	CheckAlphaResult res;
 	switch (dstFmt) {
 	case GL_UNSIGNED_SHORT_4_4_4_4:
-		{
-			const u32 *p = pixelData;
-			for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
-				for (int i = 0; i < (w + 1) / 2; ++i) {
-					u32 a = p[i] & 0x000F000F;
-					hitZeroAlpha |= a ^ 0x000F000F;
-					if (a != 0x000F000F && a != 0x0000000F && a != 0x000F0000 && a != 0) {
-						hitSomeAlpha = 1;
-						break;
-					}
-				}
-				p += stride/2;
-			}
-		}
+		res = CheckAlphaABGR4444Basic(pixelData, stride, w, h);
 		break;
 	case GL_UNSIGNED_SHORT_5_5_5_1:
-		{
-			const u32 *p = pixelData;
-			for (int y = 0; y < h; ++y) {
-				for (int i = 0; i < (w + 1) / 2; ++i) {
-					u32 a = p[i] & 0x00010001;
-					hitZeroAlpha |= a ^ 0x00010001;
-				}
-				p += stride/2;
-			}
-		}
+		res = CheckAlphaABGR1555Basic(pixelData, stride, w, h);
 		break;
 	case GL_UNSIGNED_SHORT_5_6_5:
-		{
-			// Never has any alpha.
-		}
+		// 565 has no alpha bits to inspect, so report it as full alpha without scanning.
+		res = CHECKALPHA_FULL;
 		break;
 	default:
-		{
-			const u32 *p = pixelData;
-			for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
-				for (int i = 0; i < w; ++i) {
-					u32 a = p[i] & 0xFF000000;
-					hitZeroAlpha |= a ^ 0xFF000000;
-					if (a != 0xFF000000 && a != 0) {
-						hitSomeAlpha = 1;
-						break;
-					}
-				}
-				p += stride;
-			}
-		}
+		res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h);
 		break;
 	}
 
-	if (hitSomeAlpha != 0)
-		return TexCacheEntry::STATUS_ALPHA_UNKNOWN;
-	else if (hitZeroAlpha != 0)
-		return TexCacheEntry::STATUS_ALPHA_SIMPLE;
-	else
-		return TexCacheEntry::STATUS_ALPHA_FULL;
+	return (TexCacheEntry::Status)res;
 }
 
 void TextureCache::LoadTextureLevel(TexCacheEntry &entry, int level, bool replaceImages, int scaleFactor, GLenum dstFmt) {