From 7faddd6100a7e5da116712341f294c733396c7d9 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Tue, 20 Jan 2015 17:08:10 +0100 Subject: [PATCH] Consolidate the two TextureScaler copies into one. Also lets us clean up ColorConv slightly. --- CMakeLists.txt | 4 +- Common/ColorConv.cpp | 22 +- Common/ColorConv.h | 9 - GPU/{GLES => Common}/TextureScaler.cpp | 46 +- GPU/{GLES => Common}/TextureScaler.h | 6 +- GPU/Directx9/TextureCacheDX9.cpp | 21 +- GPU/Directx9/TextureCacheDX9.h | 4 +- GPU/Directx9/TextureScalerDX9.cpp | 651 ------------------------- GPU/Directx9/TextureScalerDX9.h | 56 --- GPU/GLES/TextureCache.cpp | 18 +- GPU/GLES/TextureCache.h | 2 +- GPU/GPU.vcxproj | 6 +- GPU/GPU.vcxproj.filters | 16 +- Qt/GPU.pro | 2 +- Windows/WndMainWindow.cpp | 2 +- android/jni/Android.mk | 2 +- 16 files changed, 88 insertions(+), 779 deletions(-) rename GPU/{GLES => Common}/TextureScaler.cpp (95%) rename GPU/{GLES => Common}/TextureScaler.h (89%) delete mode 100644 GPU/Directx9/TextureScalerDX9.cpp delete mode 100644 GPU/Directx9/TextureScalerDX9.h diff --git a/CMakeLists.txt b/CMakeLists.txt index bf2c12bac8..ba1afb2e81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1380,6 +1380,8 @@ add_library(GPU OBJECT GPU/Common/IndexGenerator.h GPU/Common/TextureDecoder.cpp GPU/Common/TextureDecoder.h + GPU/Common/TextureScaler.cpp + GPU/Common/TextureScaler.h GPU/Common/TextureCacheCommon.cpp GPU/Common/TextureCacheCommon.h ${GPU_NEON} @@ -1408,8 +1410,6 @@ add_library(GPU OBJECT GPU/GLES/StencilBuffer.cpp GPU/GLES/TextureCache.cpp GPU/GLES/TextureCache.h - GPU/GLES/TextureScaler.cpp - GPU/GLES/TextureScaler.h GPU/GLES/TransformPipeline.cpp GPU/GLES/TransformPipeline.h GPU/GLES/VertexShaderGenerator.cpp diff --git a/Common/ColorConv.cpp b/Common/ColorConv.cpp index 4ba4a3c9c4..02b7247c54 100644 --- a/Common/ColorConv.cpp +++ b/Common/ColorConv.cpp @@ -54,28 +54,10 @@ inline u32 RGBA2BGRA(u32 src) { return r | ga | b; } -void convert4444(u16* data, u32* out, int width, int l, int u) { - for (int y = l; y < u; ++y) { - ConvertRGBA4444ToRGBA8888(out + y * width, data + y * width, width); - } -} - -void convert565(u16* data, u32* out, int width, int l, int u) { - for (int y = l; y < u; ++y) { - ConvertRGB565ToRGBA888F(out + y * width, data + y * width, width); - } -} - -void convert5551(u16* data, u32* out, int width, int l, int u) { - for (int y = l; y < u; ++y) { - ConvertRGBA5551ToRGBA8888(out + y * width, data + y * width, width); - } -} - -// Used heavily in Test Drive Unlimited +// Used heavily in Test Drive Unlimited (for no good reason...) void ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, int numPixels) { #if _M_SSE >= 0x401 - const __m128i maskG = _mm_set1_epi32(0x8000FC00); + const __m128i maskG = _mm_set1_epi32(0x0000FC00); const __m128i maskRB = _mm_set1_epi32(0x00F800F8); const __m128i mask = _mm_set1_epi32(0x0000FFFF); diff --git a/Common/ColorConv.h b/Common/ColorConv.h index 16e56b50a9..f8f8c2f45b 100644 --- a/Common/ColorConv.h +++ b/Common/ColorConv.h @@ -35,15 +35,6 @@ inline u8 Convert6To8(u8 v) { return (v << 2) | (v >> 4); } -// convert 4444 image to 8888 -void convert4444(u16* data, u32* out, int width, int l, int u); - -// convert 565 image to 8888 -void convert565(u16* data, u32* out, int width, int l, int u); - -// convert 5551 image to 8888 -void convert5551(u16* data, u32* out, int width, int l, int u); - inline u32 DecodeRGBA4444(u16 src) { const u32 r = (src & 0x000F) << 0; const u32 g = (src & 0x00F0) << 4; diff --git a/GPU/GLES/TextureScaler.cpp b/GPU/Common/TextureScaler.cpp similarity index 95% rename from GPU/GLES/TextureScaler.cpp rename to GPU/Common/TextureScaler.cpp index aaefcaee4b..72f1cb3624 100644 --- a/GPU/GLES/TextureScaler.cpp +++ b/GPU/Common/TextureScaler.cpp @@ -21,7 +21,10 @@ #endif #include -#include "GPU/GLES/TextureScaler.h" +#include +#include + +#include "GPU/Common/TextureScaler.h" #include "Core/Config.h" #include "Common/Common.h" @@ -32,8 +35,6 @@ #include "Common/ThreadPools.h" #include "Common/CPUDetect.h" #include "ext/xbrz/xbrz.h" -#include -#include #if _M_SSE >= 0x402 #include @@ -46,11 +47,11 @@ #include "native/base/timeutil.h" #endif -/////////////////////////////////////// Helper Functions (mostly math for parallelization) +// Helper Functions (mostly math for parallelization) namespace { - //////////////////////////////////////////////////////////////////// Various image processing + // Various image processing #define R(_col) ((_col>> 0)&0xFF) #define G(_col) ((_col>> 8)&0xFF) @@ -173,7 +174,8 @@ namespace { out[y*width + x] += 400; // assume distance at borders, usually makes for better result continue; } - out[y*width + x] += DISTANCE(data[yy*width + xx], center); + u32 d = data[yy*width + xx]; + out[y*width + x] += DISTANCE(d, center); } } } @@ -498,7 +500,7 @@ bool TextureScaler::IsEmptyOrFlat(u32* data, int pixels, GLenum fmt) { return true; } -void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height, int factor) { +void TextureScaler::Scale(u32* &data, GEBufferFormat &dstFmt, int &width, int &height, int factor) { // prevent processing empty or flat textures (this happens a lot in some games) // doesn't hurt the standard case, will be very quick for textures with actual texture if(IsEmptyOrFlat(data, width*height, dstFmt)) { @@ -545,7 +547,7 @@ void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height, i // update values accordingly data = outputBuf; - dstFmt = GL_UNSIGNED_BYTE; + dstFmt = GE_FORMAT_8888; width *= factor; height *= factor; @@ -616,21 +618,39 @@ void TextureScaler::DePosterize(u32* source, u32* dest, int width, int height) { GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height); } -void TextureScaler::ConvertTo8888(GLenum format, u32* source, u32* &dest, int width, int height) { +static void convert4444(u16* data, u32* out, int width, int l, int u) { + for (int y = l; y < u; ++y) { + ConvertRGBA4444ToRGBA8888(out + y * width, data + y * width, width); + } +} + +static void convert565(u16* data, u32* out, int width, int l, int u) { + for (int y = l; y < u; ++y) { + ConvertRGB565ToRGBA888F(out + y * width, data + y * width, width); + } +} + +static void convert5551(u16* data, u32* out, int width, int l, int u) { + for (int y = l; y < u; ++y) { + ConvertRGBA5551ToRGBA8888(out + y * width, data + y * width, width); + } +} + +void TextureScaler::ConvertTo8888(GEBufferFormat format, u32* source, u32* &dest, int width, int height) { switch(format) { - case GL_UNSIGNED_BYTE: + case GE_FORMAT_8888: dest = source; // already fine break; - case GL_UNSIGNED_SHORT_4_4_4_4: + case GE_FORMAT_4444: GlobalThreadPool::Loop(std::bind(&convert4444, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height); break; - case GL_UNSIGNED_SHORT_5_6_5: + case GE_FORMAT_565: GlobalThreadPool::Loop(std::bind(&convert565, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height); break; - case GL_UNSIGNED_SHORT_5_5_5_1: + case GE_FORMAT_5551: GlobalThreadPool::Loop(std::bind(&convert5551, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height); break; diff --git a/GPU/GLES/TextureScaler.h b/GPU/Common/TextureScaler.h similarity index 89% rename from GPU/GLES/TextureScaler.h rename to GPU/Common/TextureScaler.h index 6ca82764ef..b90201713f 100644 --- a/GPU/GLES/TextureScaler.h +++ b/GPU/Common/TextureScaler.h @@ -20,15 +20,15 @@ #include "Common/MemoryUtil.h" #include "../Globals.h" #include "gfx/gl_common.h" +#include "GPU/ge_constants.h" #include - class TextureScaler { public: TextureScaler(); - void Scale(u32* &data, GLenum &dstfmt, int &width, int &height, int factor); + void Scale(u32* &data, GEBufferFormat &dstfmt, int &width, int &height, int factor); enum { XBRZ= 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3 }; @@ -38,7 +38,7 @@ private: void ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height); void ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height); void ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic = false); - void ConvertTo8888(GLenum format, u32* source, u32* &dest, int width, int height); + void ConvertTo8888(GEBufferFormat format, u32* source, u32* &dest, int width, int height); void DePosterize(u32* source, u32* dest, int width, int height); diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp index cced61afa7..70ddeb6685 100644 --- a/GPU/Directx9/TextureCacheDX9.cpp +++ b/GPU/Directx9/TextureCacheDX9.cpp @@ -1703,8 +1703,25 @@ void TextureCacheDX9::LoadTextureLevel(TexCacheEntry &entry, int level, int maxL gpuStats.numTexturesDecoded++; u32 *pixelData = (u32 *)finalBuf; - if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) - scaler.Scale(pixelData, dstFmt, w, h, scaleFactor); + if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { + GEBufferFormat dstFormat; + switch (dstFmt) { + case D3DFMT_A8R8G8B8: dstFormat = GE_FORMAT_8888; break; + case D3DFMT_R5G6B5: dstFormat = GE_FORMAT_565; break; + case D3DFMT_A4R4G4B4: dstFormat = GE_FORMAT_4444; break; + case D3DFMT_A1R5G5B5: dstFormat = GE_FORMAT_5551; break; + default: goto dontScale; + } + scaler.Scale(pixelData, dstFormat, w, h, scaleFactor); + switch (dstFormat) { + case GE_FORMAT_8888: dstFmt = D3DFMT_A8R8G8B8; break; + case GE_FORMAT_565: dstFmt = D3DFMT_R5G6B5; break; + case GE_FORMAT_4444: dstFmt = D3DFMT_A4R4G4B4; break; + case GE_FORMAT_5551: dstFmt = D3DFMT_A1R5G5B5; break; + } + dontScale: + ; + } if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { TexCacheEntry::Status alphaStatus = CheckAlpha(pixelData, dstFmt, w, w, h); diff --git a/GPU/Directx9/TextureCacheDX9.h b/GPU/Directx9/TextureCacheDX9.h index cac576b0f3..2684874224 100644 --- a/GPU/Directx9/TextureCacheDX9.h +++ b/GPU/Directx9/TextureCacheDX9.h @@ -24,7 +24,7 @@ #include "helper/fbo.h" #include "GPU/GPUInterface.h" #include "GPU/GPUState.h" -#include "GPU/Directx9/TextureScalerDX9.h" +#include "GPU/Common/TextureScaler.h" #include "GPU/Common/TextureCacheCommon.h" struct VirtualFramebuffer; @@ -197,7 +197,7 @@ private: bool clearCacheNextFrame_; bool lowMemoryMode_; - TextureScalerDX9 scaler; + TextureScaler scaler; SimpleBuf tmpTexBuf32; SimpleBuf tmpTexBuf16; diff --git a/GPU/Directx9/TextureScalerDX9.cpp b/GPU/Directx9/TextureScalerDX9.cpp deleted file mode 100644 index a1aa386251..0000000000 --- a/GPU/Directx9/TextureScalerDX9.cpp +++ /dev/null @@ -1,651 +0,0 @@ -// Copyright (c) 2012- PPSSPP Project. - -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, version 2.0 or later versions. - -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License 2.0 for more details. - -// A copy of the GPL 2.0 should have been included with the program. -// If not, see http://www.gnu.org/licenses/ - -// Official git repository and contact information can be found at -// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. - -// On Visual Studio 2012, include this before anything else so -// _VARIADIC_MAX gets set to 10, to avoid std::bind compile errors. -// See header file for reasons why. -#if defined(_WIN32) && _MSC_VER == 1700 -#include "../native/base/basictypes.h" -#endif - -#include -#include "GPU/Directx9/TextureScalerDX9.h" - -#include "Core/Config.h" -#include "Common/Common.h" -#include "Common/ColorConv.h" -#include "Common/Log.h" -#include "Common/MsgHandler.h" -#include "Common/CommonFuncs.h" -#include "Common/ThreadPools.h" -#include "Common/CPUDetect.h" -#include "ext/xbrz/xbrz.h" -#include -#include -#include - -#undef min -#undef max - -#if _M_SSE >= 0x402 -#include -#endif - -// Report the time and throughput for each larger scaling operation in the log -//#define SCALING_MEASURE_TIME - -#ifdef SCALING_MEASURE_TIME -#include "native/base/timeutil.h" -#endif - -/////////////////////////////////////// Helper Functions (mostly math for parallelization) - -namespace { - - //////////////////////////////////////////////////////////////////// Various image processing - - #define R(_col) ((_col>> 0)&0xFF) - #define G(_col) ((_col>> 8)&0xFF) - #define B(_col) ((_col>>16)&0xFF) - #define A(_col) ((_col>>24)&0xFF) - - #define DISTANCE(_p1,_p2) ( abs(static_cast(static_cast(R(_p1))-R(_p2))) + abs(static_cast(static_cast(G(_p1))-G(_p2))) \ - + abs(static_cast(static_cast(B(_p1))-B(_p2))) + abs(static_cast(static_cast(A(_p1))-A(_p2))) ) - - // this is sadly much faster than an inline function with a loop, at least in VC10 - #define MIX_PIXELS(_p0, _p1, _factors) \ - ( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 << 0 ) | \ - ( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 << 8 ) | \ - ( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \ - ( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 ) - - #define BLOCK_SIZE 32 - - // 3x3 convolution with Neumann boundary conditions, parallelizable - // quite slow, could be sped up a lot - // especially handling of separable kernels - void convolve3x3(u32* data, u32* out, const int kernel[3][3], int width, int height, int l, int u) { - for(int yb = 0; yb < (u-l)/BLOCK_SIZE+1; ++yb) { - for(int xb = 0; xb < width/BLOCK_SIZE+1; ++xb) { - for(int y = l+yb*BLOCK_SIZE; y < l+(yb+1)*BLOCK_SIZE && y < u; ++y) { - for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < width; ++x) { - int val = 0; - for(int yoff = -1; yoff <= 1; ++yoff) { - int yy = std::max(std::min(y+yoff, height-1), 0); - for(int xoff = -1; xoff <= 1; ++xoff) { - int xx = std::max(std::min(x+xoff, width-1), 0); - val += data[yy*width + xx] * kernel[yoff+1][xoff+1]; - } - } - out[y*width + x] = abs(val); - } - } - } - } - } - - // deposterization: smoothes posterized gradients from low-color-depth (e.g. 444, 565, compressed) sources - void deposterizeH(u32* data, u32* out, int w, int l, int u) { - static const int T = 8; - for(int y = l; y < u; ++y) { - for(int x = 0; x < w; ++x) { - int inpos = y*w + x; - u32 center = data[inpos]; - if(x==0 || x==w-1) { - out[y*w + x] = center; - continue; - } - u32 left = data[inpos - 1]; - u32 right = data[inpos + 1]; - out[y*w + x] = 0; - for(int c=0; c<4; ++c) { - u8 lc = (( left>>c*8)&0xFF); - u8 cc = ((center>>c*8)&0xFF); - u8 rc = (( right>>c*8)&0xFF); - if((lc != rc) && ((lc == cc && abs((int)((int)rc)-cc) <= T) || (rc == cc && abs((int)((int)lc)-cc) <= T))) { - // blend this component - out[y*w + x] |= ((rc+lc)/2) << (c*8); - } else { - // no change for this component - out[y*w + x] |= cc << (c*8); - } - } - } - } - } - void deposterizeV(u32* data, u32* out, int w, int h, int l, int u) { - static const int T = 8; - for(int xb = 0; xb < w/BLOCK_SIZE+1; ++xb) { - for(int y = l; y < u; ++y) { - for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w; ++x) { - u32 center = data[ y * w + x]; - if(y==0 || y==h-1) { - out[y*w + x] = center; - continue; - } - u32 upper = data[(y-1) * w + x]; - u32 lower = data[(y+1) * w + x]; - out[y*w + x] = 0; - for(int c=0; c<4; ++c) { - u8 uc = (( upper>>c*8)&0xFF); - u8 cc = ((center>>c*8)&0xFF); - u8 lc = (( lower>>c*8)&0xFF); - if((uc != lc) && ((uc == cc && abs((int)((int)lc)-cc) <= T) || (lc == cc && abs((int)((int)uc)-cc) <= T))) { - // blend this component - out[y*w + x] |= ((lc+uc)/2) << (c*8); - } else { - // no change for this component - out[y*w + x] |= cc << (c*8); - } - } - } - } - } - } - - // generates a distance mask value for each pixel in data - // higher values -> larger distance to the surrounding pixels - void generateDistanceMask(u32* data, u32* out, int width, int height, int l, int u) { - for(int yb = 0; yb < (u-l)/BLOCK_SIZE+1; ++yb) { - for(int xb = 0; xb < width/BLOCK_SIZE+1; ++xb) { - for(int y = l+yb*BLOCK_SIZE; y < l+(yb+1)*BLOCK_SIZE && y < u; ++y) { - for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < width; ++x) { - out[y*width + x] = 0; - u32 center = data[y*width + x]; - for(int yoff = -1; yoff <= 1; ++yoff) { - int yy = y+yoff; - if(yy == height || yy == -1) { - out[y*width + x] += 1200; // assume distance at borders, usually makes for better result - continue; - } - for(int xoff = -1; xoff <= 1; ++xoff) { - if(yoff == 0 && xoff == 0) continue; - int xx = x+xoff; - if(xx == width || xx == -1) { - out[y*width + x] += 400; // assume distance at borders, usually makes for better result - continue; - } - out[y*width + x] += DISTANCE(data[yy*width + xx], center); - } - } - } - } - } - } - } - - // mix two images based on a mask - void mix(u32* data, u32* source, u32* mask, u32 maskmax, int width, int l, int u) { - for(int y = l; y < u; ++y) { - for(int x = 0; x < width; ++x) { - int pos = y*width + x; - u8 mixFactors[2] = { 0, static_cast((std::min(mask[pos], maskmax)*255)/maskmax) }; - mixFactors[0] = 255-mixFactors[1]; - data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors); - if(A(source[pos]) == 0) data[pos] = data[pos] & 0x00FFFFFF; // xBRZ always does a better job with hard alpha - } - } - } - - //////////////////////////////////////////////////////////////////// Bicubic scaling - - // generate the value of a Mitchell-Netravali scaling spline at distance d, with parameters A and B - // B=1 C=0 : cubic B spline (very smooth) - // B=C=1/3 : recommended for general upscaling - // B=0 C=1/2 : Catmull-Rom spline (sharp, ringing) - // see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics" - inline float mitchell(float x, float B, float C) { - float ax = fabs(x); - if(ax>=2.0f) return 0.0f; - if(ax>=1.0f) return ((-B-6*C)*(x*x*x) + (6*B+30*C)*(x*x) + (-12*B-48*C)*x + (8*B+24*C))/6.0f; - return ((12-9*B-6*C)*(x*x*x) + (-18+12*B+6*C)*(x*x) + (6-2*B))/6.0f; - } - - // arrays for pre-calculating weights and sums (~20KB) - // Dimensions: - // 0: 0 = BSpline, 1 = mitchell - // 2: 2-5x scaling - // 2,3: 5x5 generated pixels - // 4,5: 5x5 pixels sampled from - float bicubicWeights[2][4][5][5][5][5]; - float bicubicInvSums[2][4][5][5]; - - // initialize pre-computed weights array - void initBicubicWeights() { - float B[2] = { 1.0f, 0.334f }; - float C[2] = { 0.0f, 0.334f }; - for(int type=0; type<2; ++type) { - for(int factor=2; factor<=5; ++factor) { - for(int x=0; x - void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) { - int outw = w*f; - for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) { - for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) { - for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) { - for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) { - float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f; - int cx = x/f, cy = y/f; - // sample supporting pixels in original image - for(int sx = -2; sx <= 2; ++sx) { - for(int sy = -2; sy <= 2; ++sy) { - float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2]; - if(weight != 0.0f) { - // clamp pixel locations - int csy = std::max(std::min(sy+cy,h-1),0); - int csx = std::max(std::min(sx+cx,w-1),0); - // sample & add weighted components - u32 sample = data[csy*w+csx]; - r += weight*R(sample); - g += weight*G(sample); - b += weight*B(sample); - a += weight*A(sample); - } - } - } - // generate and write result - float invSum = bicubicInvSums[T][f-2][x%f][y%f]; - int ri = std::min(std::max(static_cast(ceilf(r*invSum)),0),255); - int gi = std::min(std::max(static_cast(ceilf(g*invSum)),0),255); - int bi = std::min(std::max(static_cast(ceilf(b*invSum)),0),255); - int ai = std::min(std::max(static_cast(ceilf(a*invSum)),0),255); - out[y*outw + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri; - } - } - } - } - } - #if _M_SSE >= 0x401 - template - void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) { - int outw = w*f; - for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) { - for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) { - for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) { - for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) { - __m128 result = _mm_set1_ps(0.0f); - int cx = x/f, cy = y/f; - // sample supporting pixels in original image - for(int sx = -2; sx <= 2; ++sx) { - for(int sy = -2; sy <= 2; ++sy) { - float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2]; - if(weight != 0.0f) { - // clamp pixel locations - int csy = std::max(std::min(sy+cy,h-1),0); - int csx = std::max(std::min(sx+cx,w-1),0); - // sample & add weighted components - __m128i sample = _mm_cvtsi32_si128(data[csy*w+csx]); - sample = _mm_cvtepu8_epi32(sample); - __m128 col = _mm_cvtepi32_ps(sample); - col = _mm_mul_ps(col, _mm_set1_ps(weight)); - result = _mm_add_ps(result, col); - } - } - } - // generate and write result - __m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(bicubicInvSums[T][f-2][x%f][y%f]))); - pixel = _mm_packs_epi32(pixel, pixel); - pixel = _mm_packus_epi16(pixel, pixel); - out[y*outw + x] = _mm_cvtsi128_si32(pixel); - } - } - } - } - } - #endif - - void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) { - #if _M_SSE >= 0x401 - if(cpu_info.bSSE4_1) { - switch(factor) { - case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this, - case 3: scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected - case 4: scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u); break; // turns out I had not included - case 5: scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u); break; // any of these break statements - default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); - } - } else { - #endif - switch(factor) { - case 2: scaleBicubicT<2, 0>(data, out, w, h, l, u); break; // when I first tested this, - case 3: scaleBicubicT<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected - case 4: scaleBicubicT<4, 0>(data, out, w, h, l, u); break; // turns out I had not included - case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements - default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); - } - #if _M_SSE >= 0x401 - } - #endif - } - - void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) { - #if _M_SSE >= 0x401 - if(cpu_info.bSSE4_1) { - switch(factor) { - case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break; - case 3: scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u); break; - case 4: scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u); break; - case 5: scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u); break; - default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); - } - } else { - #endif - switch(factor) { - case 2: scaleBicubicT<2, 1>(data, out, w, h, l, u); break; - case 3: scaleBicubicT<3, 1>(data, out, w, h, l, u); break; - case 4: scaleBicubicT<4, 1>(data, out, w, h, l, u); break; - case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break; - default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); - } - #if _M_SSE >= 0x401 - } - #endif - } - - //////////////////////////////////////////////////////////////////// Bilinear scaling - - const static u8 BILINEAR_FACTORS[4][3][2] = { - { { 44,211}, { 0, 0}, { 0, 0} }, // x2 - { { 64,191}, { 0,255}, { 0, 0} }, // x3 - { { 77,178}, { 26,229}, { 0, 0} }, // x4 - { {102,153}, { 51,204}, { 0,255} }, // x5 - }; - // integral bilinear upscaling by factor f, horizontal part - template - void bilinearHt(u32* data, u32* out, int w, int l, int u) { - static_assert(f>1 && f<=5, "Bilinear scaling only implemented for factors 2 to 5"); - int outw = w*f; - for(int y = l; y < u; ++y) { - for(int x = 0; x < w; ++x) { - int inpos = y*w + x; - u32 left = data[inpos - (x==0 ?0:1)]; - u32 center = data[inpos]; - u32 right = data[inpos + (x==w-1?0:1)]; - int i=0; - for(; i(data, out, w, l, u); break; - case 3: bilinearHt<3>(data, out, w, l, u); break; - case 4: bilinearHt<4>(data, out, w, l, u); break; - case 5: bilinearHt<5>(data, out, w, l, u); break; - default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5"); - } - } - // integral bilinear upscaling by factor f, vertical part - // gl/gu == global lower and upper bound - template - void bilinearVt(u32* data, u32* out, int w, int gl, int gu, int l, int u) { - static_assert(f>1 && f<=5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x"); - int outw = w*f; - for(int xb = 0; xb < outw/BLOCK_SIZE+1; ++xb) { - for(int y = l; y < u; ++y) { - u32 uy = y - (y==gl ?0:1); - u32 ly = y + (y==gu-1?0:1); - for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < outw; ++x) { - u32 upper = data[uy * outw + x]; - u32 center = data[y * outw + x]; - u32 lower = data[ly * outw + x]; - int i=0; - for(; i(data, out, w, gl, gu, l, u); break; - case 3: bilinearVt<3>(data, out, w, gl, gu, l, u); break; - case 4: bilinearVt<4>(data, out, w, gl, gu, l, u); break; - case 5: bilinearVt<5>(data, out, w, gl, gu, l, u); break; - default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5"); - } - } - - #undef BLOCK_SIZE - #undef MIX_PIXELS - #undef DISTANCE - #undef R - #undef G - #undef B - #undef A - - // used for debugging texture scaling (writing textures to files) - static int g_imgCount = 0; - void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB - char fn[32]; - snprintf(fn, 32, "%s%04d.ppm", prefix, g_imgCount++); - FILE *fp = fopen(fn, "wb"); - fprintf(fp, "P6\n%d %d\n255\n", w, h); - for(int j = 0; j < h; ++j) { - for(int i = 0; i < w; ++i) { - static unsigned char color[3]; - color[0] = pixels[(j*w+i)*4+0]; /* red */ - color[1] = pixels[(j*w+i)*4+1]; /* green */ - color[2] = pixels[(j*w+i)*4+2]; /* blue */ - fwrite(color, 1, 3, fp); - } - } - fclose(fp); - } - void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component - char fn[32]; - snprintf(fn, 32, "%s%04d.pgm", prefix, g_imgCount++); - FILE *fp = fopen(fn, "wb"); - fprintf(fp, "P5\n%d %d\n65536\n", w, h); - for(int j = 0; j < h; ++j) { - for(int i = 0; i < w; ++i) { - fwrite((pixels+(j*w+i)), 1, 2, fp); - } - } - fclose(fp); - } -} - -/////////////////////////////////////// Texture Scaler - -namespace DX9 { - -TextureScalerDX9::TextureScalerDX9() { - initBicubicWeights(); -} - -bool TextureScalerDX9::IsEmptyOrFlat(u32* data, int pixels, u32 fmt) { - int pixelsPerWord = (fmt == D3DFMT_A8R8G8B8) ? 1 : 2; - u32 ref = data[0]; - for(int i=0; i 64*64*factor*factor) { - double t = real_time_now() - t_start; - NOTICE_LOG(MASTER_LOG, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)", - width*height, t, (width*height)/(t*1000*1000)); - } - #endif -} - -void TextureScalerDX9::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) { - xbrz::ScalerCfg cfg; - GlobalThreadPool::Loop(std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, placeholder::_1, placeholder::_2), 0, height); -} - -void TextureScalerDX9::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) { - bufTmp1.resize(width*height*factor); - u32 *tmpBuf = bufTmp1.data(); - GlobalThreadPool::Loop(std::bind(&bilinearH, factor, source, tmpBuf, width, placeholder::_1, placeholder::_2), 0, height); - GlobalThreadPool::Loop(std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, placeholder::_1, placeholder::_2), 0, height); -} - -void TextureScalerDX9::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) { - GlobalThreadPool::Loop(std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, placeholder::_1, placeholder::_2), 0, height); -} - -void TextureScalerDX9::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) { - GlobalThreadPool::Loop(std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, placeholder::_1, placeholder::_2), 0, height); -} - -void TextureScalerDX9::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) { - // Basic algorithm: - // 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly - // 2) generate 2 scaled images: A - using Bilinear filtering, B - using xBRZ - // 3) output = A*C + B*(1-C) - - const static int KERNEL_SPLAT[3][3] = { - { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } - }; - - bufTmp1.resize(width*height); - bufTmp2.resize(width*height*factor*factor); - bufTmp3.resize(width*height*factor*factor); - GlobalThreadPool::Loop(std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, placeholder::_1, placeholder::_2), 0, height); - GlobalThreadPool::Loop(std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, placeholder::_1, placeholder::_2), 0, height); - ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height); - // mask C is now in bufTmp3 - - ScaleXBRZ(factor, source, bufTmp2.data(), width, height); - // xBRZ upscaled source is in bufTmp2 - - if(bicubic) ScaleBicubicBSpline(factor, source, dest, width, height); - else ScaleBilinear(factor, source, dest, width, height); - // Upscaled source is in dest - - // Now we can mix it all together - // The factor 8192 was found through practical testing on a variety of textures - GlobalThreadPool::Loop(std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, placeholder::_1, placeholder::_2), 0, height*factor); -} - -void TextureScalerDX9::DePosterize(u32* source, u32* dest, int width, int height) { - bufTmp3.resize(width*height); - GlobalThreadPool::Loop(std::bind(&deposterizeH, source, bufTmp3.data(), width, placeholder::_1, placeholder::_2), 0, height); - GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height); - GlobalThreadPool::Loop(std::bind(&deposterizeH, dest, bufTmp3.data(), width, placeholder::_1, placeholder::_2), 0, height); - GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height); -} - -void TextureScalerDX9::ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height) { - switch(format) { - case D3DFMT_A8R8G8B8: - dest = source; // already fine - break; - - case D3DFMT_A4R4G4B4: - GlobalThreadPool::Loop(std::bind(&convert4444, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height); - break; - - case D3DFMT_R5G6B5: - GlobalThreadPool::Loop(std::bind(&convert565, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height); - break; - - case D3DFMT_A1R5G5B5: - GlobalThreadPool::Loop(std::bind(&convert5551, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height); - break; - - default: - dest = source; - ERROR_LOG(G3D, "iXBRZTexScaling: unsupported texture format"); - } -} - -} diff --git a/GPU/Directx9/TextureScalerDX9.h b/GPU/Directx9/TextureScalerDX9.h deleted file mode 100644 index 2da5484645..0000000000 --- a/GPU/Directx9/TextureScalerDX9.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2012- PPSSPP Project. - -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, version 2.0 or later versions. - -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License 2.0 for more details. - -// A copy of the GPL 2.0 should have been included with the program. -// If not, see http://www.gnu.org/licenses/ - -// Official git repository and contact information can be found at -// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. - -#pragma once - -#include "Common/MemoryUtil.h" -#include "../Globals.h" -#include "helper/global.h" -//#include "gfx/gl_common.h" - -#include - -namespace DX9 { - - -class TextureScalerDX9 { -public: - TextureScalerDX9(); - - void Scale(u32* &data, u32 &dstfmt, int &width, int &height, int factor); - - enum { XBRZ= 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3 }; - -private: - void ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height); - void ScaleBilinear(int factor, u32* source, u32* dest, int width, int height); - void ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height); - void ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height); - void ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic = false); - void ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height); - - void DePosterize(u32* source, u32* dest, int width, int height); - - bool IsEmptyOrFlat(u32* data, int pixels, u32 fmt); - - // depending on the factor and texture sizes, these can get pretty large - // maximum is (100 MB total for a 512 by 512 texture with scaling factor 5 and hybrid scaling) - // of course, scaling factor 5 is totally silly anyway - SimpleBuf bufInput, bufDeposter, bufOutput, bufTmp1, bufTmp2, bufTmp3; -}; - -}; diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp index 4d2c67d19b..f1489b713b 100644 --- a/GPU/GLES/TextureCache.cpp +++ b/GPU/GLES/TextureCache.cpp @@ -2002,8 +2002,22 @@ void TextureCache::LoadTextureLevel(TexCacheEntry &entry, int level, bool replac bool useBGRA = UseBGRA8888() && dstFmt == GL_UNSIGNED_BYTE; u32 *pixelData = (u32 *)finalBuf; - if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) - scaler.Scale(pixelData, dstFmt, w, h, scaleFactor); + if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { + GEBufferFormat dstFormat; + switch (dstFmt) { + case GL_UNSIGNED_BYTE: dstFormat = GE_FORMAT_8888; break; + case GL_UNSIGNED_SHORT_4_4_4_4: dstFormat = GE_FORMAT_4444; break; + case GL_UNSIGNED_SHORT_5_6_5: dstFormat = GE_FORMAT_565; break; + case GL_UNSIGNED_SHORT_5_5_5_1: dstFormat = GE_FORMAT_5551; break; + } + scaler.Scale(pixelData, dstFormat, w, h, scaleFactor); + switch (dstFormat) { + case GE_FORMAT_565: dstFmt = GL_UNSIGNED_SHORT_5_6_5; break; + case GE_FORMAT_5551: dstFmt = GL_UNSIGNED_SHORT_5_5_5_1; break; + case GE_FORMAT_4444: dstFmt = GL_UNSIGNED_SHORT_4_4_4_4; break; + case GE_FORMAT_8888: dstFmt = GL_UNSIGNED_BYTE; break; + } + } if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { TexCacheEntry::Status alphaStatus = CheckAlpha(pixelData, dstFmt, useUnpack ? bufw : w, w, h); diff --git a/GPU/GLES/TextureCache.h b/GPU/GLES/TextureCache.h index c4bb20781b..cc186a3f98 100644 --- a/GPU/GLES/TextureCache.h +++ b/GPU/GLES/TextureCache.h @@ -25,7 +25,7 @@ #include "Globals.h" #include "GPU/GPUInterface.h" #include "GPU/GPUState.h" -#include "GPU/GLES/TextureScaler.h" +#include "GPU/Common/TextureScaler.h" #include "GPU/Common/TextureCacheCommon.h" struct VirtualFramebuffer; diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj index 79e07383b2..08511d2930 100644 --- a/GPU/GPU.vcxproj +++ b/GPU/GPU.vcxproj @@ -191,6 +191,7 @@ true + @@ -204,7 +205,6 @@ - @@ -217,7 +217,6 @@ - @@ -246,6 +245,7 @@ true + @@ -269,7 +269,6 @@ - @@ -283,7 +282,6 @@ - diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters index d5dc894551..c516ce7ec2 100644 --- a/GPU/GPU.vcxproj.filters +++ b/GPU/GPU.vcxproj.filters @@ -42,6 +42,9 @@ Common + + Common + Software @@ -69,9 +72,6 @@ GLES - - GLES - GLES @@ -93,9 +93,6 @@ DirectX9 - - DirectX9 - DirectX9 @@ -254,11 +251,8 @@ DirectX9 - - DirectX9 - - - GLES + + Common Common diff --git a/Qt/GPU.pro b/Qt/GPU.pro index f14c017d2e..5d68df4851 100644 --- a/Qt/GPU.pro +++ b/Qt/GPU.pro @@ -36,7 +36,6 @@ SOURCES += $$P/GPU/GeDisasm.cpp \ # GPU $$P/GPU/GLES/StateMapping.cpp \ $$P/GPU/GLES/StencilBuffer.cpp \ $$P/GPU/GLES/TextureCache.cpp \ - $$P/GPU/GLES/TextureScaler.cpp \ $$P/GPU/GLES/TransformPipeline.cpp \ $$P/GPU/GLES/VertexShaderGenerator.cpp \ $$P/GPU/Software/*.cpp \ @@ -44,6 +43,7 @@ SOURCES += $$P/GPU/GeDisasm.cpp \ # GPU $$P/GPU/Common/IndexGenerator.cpp \ $$P/GPU/Common/TextureDecoder.cpp \ $$P/GPU/Common/VertexDecoderCommon.cpp \ + $$P/GPU/Common/TextureScaler.cpp \ $$P/GPU/Common/TextureCacheCommon.cpp \ $$P/GPU/Common/TransformCommon.cpp \ $$P/GPU/Common/SoftwareTransformCommon.cpp \ diff --git a/Windows/WndMainWindow.cpp b/Windows/WndMainWindow.cpp index bfccea360a..417e37f5cb 100644 --- a/Windows/WndMainWindow.cpp +++ b/Windows/WndMainWindow.cpp @@ -68,7 +68,7 @@ #include "GPU/GPUInterface.h" #include "GPU/GPUState.h" #include "gfx_es2/gpu_features.h" -#include "GPU/GLES/TextureScaler.h" +#include "GPU/Common/TextureScaler.h" #include "GPU/GLES/TextureCache.h" #include "GPU/GLES/Framebuffer.h" #include "ControlMapping.h" diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 79c5499e65..0e40a5a576 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -153,6 +153,7 @@ EXEC_AND_LIB_FILES := \ $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \ $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \ $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \ + $(SRC)/GPU/Common/TextureScaler.cpp.arm \ $(SRC)/GPU/Common/SplineCommon.cpp.arm \ $(SRC)/GPU/Common/DrawEngineCommon.cpp.arm \ $(SRC)/GPU/Common/TransformCommon.cpp.arm \ @@ -171,7 +172,6 @@ EXEC_AND_LIB_FILES := \ $(SRC)/GPU/GLES/VertexShaderGenerator.cpp.arm \ $(SRC)/GPU/GLES/FragmentShaderGenerator.cpp.arm \ $(SRC)/GPU/GLES/FragmentTestCache.cpp.arm \ - $(SRC)/GPU/GLES/TextureScaler.cpp \ $(SRC)/GPU/GLES/Spline.cpp \ $(SRC)/GPU/Null/NullGpu.cpp \ $(SRC)/GPU/Software/Clipper.cpp \