mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Use SSE in CheckAlpha scanning.
This commit is contained in:
parent
a82fc0e5f7
commit
c75010931c
4 changed files with 363 additions and 102 deletions
|
@ -330,3 +330,339 @@ void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch) {
|
|||
dst += pitch;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _M_SSE
|
||||
static inline u32 CombineSSEBits(const __m128i &v) {
|
||||
__m128i temp;
|
||||
temp = _mm_or_si128(v, _mm_srli_si128(v, 8));
|
||||
temp = _mm_or_si128(temp, _mm_srli_si128(temp, 4));
|
||||
return _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i full = _mm_set1_epi32(0xFF);
|
||||
|
||||
const __m128i *p = (const __m128i *)pixelData;
|
||||
const int w4 = w / 4;
|
||||
const int stride4 = stride / 4;
|
||||
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
for (int y = 0; y < h; ++y) {
|
||||
__m128i hasAnyCursor = _mm_setzero_si128();
|
||||
|
||||
for (int i = 0; i < w4; ++i) {
|
||||
const __m128i a = _mm_srli_epi32(_mm_load_si128(&p[i]), 24);
|
||||
|
||||
const __m128i isZero = _mm_cmpeq_epi32(a, zero);
|
||||
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
|
||||
|
||||
// If a = FF, isNotFull will be 0 -> hasAny will be 0.
|
||||
// If a = 00, a & isNotFull will be 0 -> hasAny will be 0.
|
||||
// In any other case, hasAny will have some bits set.
|
||||
const __m128i isNotFull = _mm_cmplt_epi32(a, full);
|
||||
hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
|
||||
}
|
||||
p += stride4;
|
||||
|
||||
// We check any early, in case we can skip the rest of the rows.
|
||||
if (CombineSSEBits(hasAnyCursor) != 0) {
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (CombineSSEBits(hasZeroCursor) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i full = _mm_set1_epi16(0x000F);
|
||||
|
||||
const __m128i *p = (const __m128i *)pixelData;
|
||||
const int w8 = w / 8;
|
||||
const int stride8 = stride / 8;
|
||||
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
for (int y = 0; y < h; ++y) {
|
||||
__m128i hasAnyCursor = _mm_setzero_si128();
|
||||
|
||||
for (int i = 0; i < w8; ++i) {
|
||||
const __m128i a = _mm_and_si128(_mm_load_si128(&p[i]), full);
|
||||
|
||||
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
|
||||
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
|
||||
|
||||
// If a = F, isNotFull will be 0 -> hasAny will be 0.
|
||||
// If a = 0, a & isNotFull will be 0 -> hasAny will be 0.
|
||||
// In any other case, hasAny will have some bits set.
|
||||
const __m128i isNotFull = _mm_cmplt_epi32(a, full);
|
||||
hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
|
||||
}
|
||||
p += stride8;
|
||||
|
||||
// We check any early, in case we can skip the rest of the rows.
|
||||
if (CombineSSEBits(hasAnyCursor) != 0) {
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (CombineSSEBits(hasZeroCursor) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
const __m128i *p = (const __m128i *)pixelData;
|
||||
const int w8 = w / 8;
|
||||
const int stride8 = stride / 8;
|
||||
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w8; ++i) {
|
||||
const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 15);
|
||||
|
||||
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
|
||||
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
|
||||
}
|
||||
p += stride8;
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (CombineSSEBits(hasZeroCursor) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i full = _mm_set1_epi16(0x000F);
|
||||
|
||||
const __m128i *p = (const __m128i *)pixelData;
|
||||
const int w8 = w / 8;
|
||||
const int stride8 = stride / 8;
|
||||
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
for (int y = 0; y < h; ++y) {
|
||||
__m128i hasAnyCursor = _mm_setzero_si128();
|
||||
|
||||
for (int i = 0; i < w8; ++i) {
|
||||
const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 12);
|
||||
|
||||
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
|
||||
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
|
||||
|
||||
// If a = F, isNotFull will be 0 -> hasAny will be 0.
|
||||
// If a = 0, a & isNotFull will be 0 -> hasAny will be 0.
|
||||
// In any other case, hasAny will have some bits set.
|
||||
const __m128i isNotFull = _mm_cmplt_epi32(a, full);
|
||||
hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
|
||||
}
|
||||
p += stride8;
|
||||
|
||||
// We check any early, in case we can skip the rest of the rows.
|
||||
if (CombineSSEBits(hasAnyCursor) != 0) {
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (CombineSSEBits(hasZeroCursor) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i full = _mm_set1_epi16(0x0001);
|
||||
|
||||
const __m128i *p = (const __m128i *)pixelData;
|
||||
const int w8 = w / 8;
|
||||
const int stride8 = stride / 8;
|
||||
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w8; ++i) {
|
||||
const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 15);
|
||||
|
||||
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
|
||||
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
|
||||
}
|
||||
p += stride8;
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (CombineSSEBits(hasZeroCursor) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 4 pixels (almost always the case.)
|
||||
if ((w & 3) == 0 && (stride & 3) == 0) {
|
||||
return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
const u32 *p = pixelData;
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w; ++i) {
|
||||
u32 a = p[i] & 0xFF000000;
|
||||
hitZeroAlpha |= a ^ 0xFF000000;
|
||||
if (a != 0xFF000000 && a != 0) {
|
||||
// We're done, we hit non-zero, non-full alpha.
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
p += stride;
|
||||
}
|
||||
|
||||
if (hitZeroAlpha) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
if ((w & 7) == 0 && (stride & 7) == 0) {
|
||||
return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
const u32 *p = pixelData;
|
||||
const int w2 = (w + 1) / 2;
|
||||
const int stride2 = (stride + 1) / 2;
|
||||
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w2; ++i) {
|
||||
u32 a = p[i] & 0x000F000F;
|
||||
hitZeroAlpha |= a ^ 0x000F000F;
|
||||
if (a != 0x000F000F && a != 0x0000000F && a != 0x000F0000 && a != 0) {
|
||||
// We're done, we hit non-zero, non-full alpha.
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
p += stride;
|
||||
}
|
||||
|
||||
if (hitZeroAlpha) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
if ((w & 7) == 0 && (stride & 7) == 0) {
|
||||
return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
const u32 *p = pixelData;
|
||||
const int w2 = (w + 1) / 2;
|
||||
const int stride2 = (stride + 1) / 2;
|
||||
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w2; ++i) {
|
||||
u32 a = p[i] & 0x00010001;
|
||||
hitZeroAlpha |= a ^ 0x00010001;
|
||||
}
|
||||
p += stride;
|
||||
}
|
||||
|
||||
if (hitZeroAlpha) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
if ((w & 7) == 0 && (stride & 7) == 0) {
|
||||
return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
const u32 *p = pixelData;
|
||||
const int w2 = (w + 1) / 2;
|
||||
const int stride2 = (stride + 1) / 2;
|
||||
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w2; ++i) {
|
||||
u32 a = p[i] & 0xF000F000;
|
||||
hitZeroAlpha |= a ^ 0xF000F000;
|
||||
if (a != 0xF000F000 && a != 0xF0000000 && a != 0x0000F000 && a != 0) {
|
||||
// We're done, we hit non-zero, non-full alpha.
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
p += stride;
|
||||
}
|
||||
|
||||
if (hitZeroAlpha) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
if ((w & 7) == 0 && (stride & 7) == 0) {
|
||||
return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
const u32 *p = pixelData;
|
||||
const int w2 = (w + 1) / 2;
|
||||
const int stride2 = (stride + 1) / 2;
|
||||
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w2; ++i) {
|
||||
u32 a = p[i] & 0x80008000;
|
||||
hitZeroAlpha |= a ^ 0x80008000;
|
||||
}
|
||||
p += stride;
|
||||
}
|
||||
|
||||
if (hitZeroAlpha) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -75,6 +75,19 @@ extern ReliableHash64Func DoReliableHash64;
|
|||
typedef u32 ReliableHashType;
|
||||
#endif
|
||||
|
||||
enum CheckAlphaResult {
|
||||
// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
|
||||
CHECKALPHA_FULL = 0,
|
||||
CHECKALPHA_ANY = 4,
|
||||
CHECKALPHA_ZERO = 8,
|
||||
};
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
|
||||
// All these DXT structs are in the reverse order, as compared to PC.
|
||||
// On PC, alpha comes before color, and interpolants are before the tile data.
|
||||
|
||||
|
|
|
@ -1691,68 +1691,24 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
|
|||
}
|
||||
|
||||
TextureCacheDX9::TexCacheEntry::Status TextureCacheDX9::CheckAlpha(const u32 *pixelData, u32 dstFmt, int stride, int w, int h) {
|
||||
// TODO: Could probably be optimized more.
|
||||
u32 hitZeroAlpha = 0;
|
||||
u32 hitSomeAlpha = 0;
|
||||
|
||||
CheckAlphaResult res;
|
||||
switch (dstFmt) {
|
||||
case D3DFMT_A4R4G4B4:
|
||||
{
|
||||
const u32 *p = pixelData;
|
||||
for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
|
||||
for (int i = 0; i < (w + 1) / 2; ++i) {
|
||||
u32 a = p[i] & 0xF000F000;
|
||||
hitZeroAlpha |= a ^ 0xF000F000;
|
||||
if (a != 0xF000F000 && a != 0xF0000000 && a != 0x0000F000 && a != 0) {
|
||||
hitSomeAlpha = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
p += stride/2;
|
||||
}
|
||||
}
|
||||
res = CheckAlphaRGBA4444Basic(pixelData, stride, w, h);
|
||||
break;
|
||||
case D3DFMT_A1R5G5B5:
|
||||
{
|
||||
const u32 *p = pixelData;
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < (w + 1) / 2; ++i) {
|
||||
u32 a = p[i] & 0x80008000;
|
||||
hitZeroAlpha |= a ^ 0x80008000;
|
||||
}
|
||||
p += stride/2;
|
||||
}
|
||||
}
|
||||
res = CheckAlphaRGBA5551Basic(pixelData, stride, w, h);
|
||||
break;
|
||||
case D3DFMT_R5G6B5:
|
||||
{
|
||||
// Never has any alpha.
|
||||
}
|
||||
// Never has any alpha.
|
||||
res = CHECKALPHA_FULL;
|
||||
break;
|
||||
default:
|
||||
{
|
||||
const u32 *p = pixelData;
|
||||
for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
|
||||
for (int i = 0; i < w; ++i) {
|
||||
u32 a = p[i] & 0xFF000000;
|
||||
hitZeroAlpha |= a ^ 0xFF000000;
|
||||
if (a != 0xFF000000 && a != 0) {
|
||||
hitSomeAlpha = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
p += stride;
|
||||
}
|
||||
}
|
||||
res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h);
|
||||
break;
|
||||
}
|
||||
|
||||
if (hitSomeAlpha != 0)
|
||||
return TexCacheEntry::STATUS_ALPHA_UNKNOWN;
|
||||
else if (hitZeroAlpha != 0)
|
||||
return TexCacheEntry::STATUS_ALPHA_SIMPLE;
|
||||
else
|
||||
return TexCacheEntry::STATUS_ALPHA_FULL;
|
||||
return (TexCacheEntry::Status)res;
|
||||
}
|
||||
|
||||
static inline void copyTexture(int xoffset, int yoffset, int w, int h, int pitch, int srcfmt, int fmt, void * pSrc, void * pDst) {
|
||||
|
|
|
@ -1826,68 +1826,24 @@ void *TextureCache::DecodeTextureLevel(GETextureFormat format, GEPaletteFormat c
|
|||
}
|
||||
|
||||
TextureCache::TexCacheEntry::Status TextureCache::CheckAlpha(const u32 *pixelData, GLenum dstFmt, int stride, int w, int h) {
|
||||
// TODO: Could probably be optimized more.
|
||||
u32 hitZeroAlpha = 0;
|
||||
u32 hitSomeAlpha = 0;
|
||||
|
||||
CheckAlphaResult res;
|
||||
switch (dstFmt) {
|
||||
case GL_UNSIGNED_SHORT_4_4_4_4:
|
||||
{
|
||||
const u32 *p = pixelData;
|
||||
for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
|
||||
for (int i = 0; i < (w + 1) / 2; ++i) {
|
||||
u32 a = p[i] & 0x000F000F;
|
||||
hitZeroAlpha |= a ^ 0x000F000F;
|
||||
if (a != 0x000F000F && a != 0x0000000F && a != 0x000F0000 && a != 0) {
|
||||
hitSomeAlpha = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
p += stride/2;
|
||||
}
|
||||
}
|
||||
res = CheckAlphaABGR4444Basic(pixelData, stride, w, h);
|
||||
break;
|
||||
case GL_UNSIGNED_SHORT_5_5_5_1:
|
||||
{
|
||||
const u32 *p = pixelData;
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < (w + 1) / 2; ++i) {
|
||||
u32 a = p[i] & 0x00010001;
|
||||
hitZeroAlpha |= a ^ 0x00010001;
|
||||
}
|
||||
p += stride/2;
|
||||
}
|
||||
}
|
||||
res = CheckAlphaABGR1555Basic(pixelData, stride, w, h);
|
||||
break;
|
||||
case GL_UNSIGNED_SHORT_5_6_5:
|
||||
{
|
||||
// Never has any alpha.
|
||||
}
|
||||
// Never has any alpha.
|
||||
res = CHECKALPHA_FULL;
|
||||
break;
|
||||
default:
|
||||
{
|
||||
const u32 *p = pixelData;
|
||||
for (int y = 0; y < h && hitSomeAlpha == 0; ++y) {
|
||||
for (int i = 0; i < w; ++i) {
|
||||
u32 a = p[i] & 0xFF000000;
|
||||
hitZeroAlpha |= a ^ 0xFF000000;
|
||||
if (a != 0xFF000000 && a != 0) {
|
||||
hitSomeAlpha = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
p += stride;
|
||||
}
|
||||
}
|
||||
res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h);
|
||||
break;
|
||||
}
|
||||
|
||||
if (hitSomeAlpha != 0)
|
||||
return TexCacheEntry::STATUS_ALPHA_UNKNOWN;
|
||||
else if (hitZeroAlpha != 0)
|
||||
return TexCacheEntry::STATUS_ALPHA_SIMPLE;
|
||||
else
|
||||
return TexCacheEntry::STATUS_ALPHA_FULL;
|
||||
return (TexCacheEntry::Status)res;
|
||||
}
|
||||
|
||||
void TextureCache::LoadTextureLevel(TexCacheEntry &entry, int level, bool replaceImages, int scaleFactor, GLenum dstFmt) {
|
||||
|
|
Loading…
Add table
Reference in a new issue