From fa8df5439144e794fe3e76c47620054f20481a82 Mon Sep 17 00:00:00 2001 From: Peter Thoman Date: Sun, 5 May 2013 21:54:50 +0200 Subject: [PATCH] Pre-compute all weights and sums for bicubic scaling --- GPU/GLES/TextureScaler.cpp | 127 ++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 52 deletions(-) diff --git a/GPU/GLES/TextureScaler.cpp b/GPU/GLES/TextureScaler.cpp index ba1590e865..d8a6564432 100644 --- a/GPU/GLES/TextureScaler.cpp +++ b/GPU/GLES/TextureScaler.cpp @@ -245,40 +245,67 @@ namespace { // B=1 C=0 : cubic B spline (very smooth) // B=C=1/3 : recommended for general upscaling // B=0 C=1/2 : Catmull-Rom spline (sharp, ringing) - // B = Bi/100 and C = Ci/100 (template parameters to allow compiler to specialize weighting function for each spline type) // see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics" - template - inline float mitchell(float x) { - const float B = Bi/100.0f, C = Ci/100.0f; + inline float mitchell(float x, float B, float C) { float ax = fabs(x); if(ax>=2.0f) return 0.0f; if(ax>=1.0f) return ((-B-6*C)*(x*x*x) + (6*B+30*C)*(x*x) + (-12*B-48*C)*x + (8*B+24*C))/6.0f; return ((12-9*B-6*C)*(x*x*x) + (-18+12*B+6*C)*(x*x) + (6-2*B))/6.0f; } - // perform bicubic scaling by factor f, with a Mitchell-Netravali spline specified by Bi and Ci - template + // arrays for pre-calculating weights and sums (~20KB) + // Dimensions: + // 0: 0 = BSpline, 1 = mitchell + // 2: 2-5x scaling + // 2,3: 5x5 generated pixels + // 4,5: 5x5 pixels sampled from + float bicubicWeights[2][4][5][5][5][5]; + float bicubicInvSums[2][4][5][5]; + + // initialize pre-computed weights array + void initBicubicWeights() { + float B[2] = { 1.0f, 0.334f }; + float C[2] = { 0.0f, 0.334f }; + for(int type=0; type<2; ++type) { + for(int factor=2; factor<=5; ++factor) { + for(int x=0; x void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) { - const float ff = static_cast(f); int outw = w*f; for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) { for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) { for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) { for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) { - float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f, sum = 0.0f; + float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f; int cx = x/f, cy = y/f; // sample supporting pixels in original image - for(int sx = cx-2; sx <= cx+2; ++sx) { - for(int sy = cy-2; sy <= cy+2; ++sy) { - float dx = (x+0.5f)/ff - (sx+0.5f); - float dy = (y+0.5f)/ff - (sy+0.5f); - float dist = dx*dx + dy*dy; // do sqrt only after check - if(dist < 4.0f) { - float weight = mitchell(sqrt(dist)); - sum += weight; + for(int sx = -2; sx <= 2; ++sx) { + for(int sy = -2; sy <= 2; ++sy) { + float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2]; + if(weight != 0.0f) { // clamp pixel locations - int csy = std::max(std::min(sy,h-1),0); - int csx = std::max(std::min(sx,w-1),0); + int csy = std::max(std::min(sy+cy,h-1),0); + int csx = std::max(std::min(sx+cx,w-1),0); // sample & add weighted components u32 sample = data[csy*w+csx]; r += weight*R(sample); @@ -289,10 +316,11 @@ namespace { } } // generate and write result - int ri = std::min(std::max(static_cast(ceilf(r/sum)),0),255); - int gi = std::min(std::max(static_cast(ceilf(g/sum)),0),255); - int bi = std::min(std::max(static_cast(ceilf(b/sum)),0),255); - int ai = std::min(std::max(static_cast(ceilf(a/sum)),0),255); + float invSum = bicubicInvSums[T][f-2][x%f][y%f]; + int ri = std::min(std::max(static_cast(ceilf(r*invSum)),0),255); + int gi = std::min(std::max(static_cast(ceilf(g*invSum)),0),255); + int bi = std::min(std::max(static_cast(ceilf(b*invSum)),0),255); + int ai = std::min(std::max(static_cast(ceilf(a*invSum)),0),255); out[y*outw + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri; } } @@ -300,29 +328,23 @@ namespace { } } #if _M_SSE >= 0x401 - template + template void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) { - const float ff = static_cast(f); int outw = w*f; for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) { for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) { for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) { for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) { __m128 result = _mm_set1_ps(0.0f); - float sum = 0.0f; int cx = x/f, cy = y/f; // sample supporting pixels in original image - for(int sx = cx-2; sx <= cx+2; ++sx) { - for(int sy = cy-2; sy <= cy+2; ++sy) { - float dx = (x+0.5f)/ff - (sx+0.5f); - float dy = (y+0.5f)/ff - (sy+0.5f); - float dist = dx*dx + dy*dy; // do sqrt only after check - if(dist < 4.0f) { - float weight = mitchell(sqrt(dist)); - sum += weight; + for(int sx = -2; sx <= 2; ++sx) { + for(int sy = -2; sy <= 2; ++sy) { + float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2]; + if(weight != 0.0f) { // clamp pixel locations - int csy = std::max(std::min(sy,h-1),0); - int csx = std::max(std::min(sx,w-1),0); + int csy = std::max(std::min(sy+cy,h-1),0); + int csx = std::max(std::min(sx+cx,w-1),0); // sample & add weighted components __m128i sample = _mm_cvtsi32_si128(data[csy*w+csx]); sample = _mm_cvtepu8_epi32(sample); @@ -333,7 +355,7 @@ namespace { } } // generate and write result - __m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(1.0f/sum))); + __m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(bicubicInvSums[T][f-2][x%f][y%f]))); pixel = _mm_packs_epi32(pixel, pixel); pixel = _mm_packus_epi16(pixel, pixel); out[y*outw + x] = _mm_cvtsi128_si32(pixel); @@ -348,19 +370,19 @@ namespace { #if _M_SSE >= 0x401 if(cpu_info.bSSE4_1) { switch(factor) { - case 2: scaleBicubicTSSE41<2, 100, 0>(data, out, w, h, l, u); break; // when I first tested this, - case 3: scaleBicubicTSSE41<3, 100, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected - case 4: scaleBicubicTSSE41<4, 100, 0>(data, out, w, h, l, u); break; // turns out I had not included - case 5: scaleBicubicTSSE41<5, 100, 0>(data, out, w, h, l, u); break; // any of these break statements + case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this, + case 3: scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected + case 4: scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u); break; // turns out I had not included + case 5: scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u); break; // any of these break statements default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); } } else { #endif switch(factor) { - case 2: scaleBicubicT<2, 100, 0>(data, out, w, h, l, u); break; // when I first tested this, - case 3: scaleBicubicT<3, 100, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected - case 4: scaleBicubicT<4, 100, 0>(data, out, w, h, l, u); break; // turns out I had not included - case 5: scaleBicubicT<5, 100, 0>(data, out, w, h, l, u); break; // any of these break statements + case 2: scaleBicubicT<2, 0>(data, out, w, h, l, u); break; // when I first tested this, + case 3: scaleBicubicT<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected + case 4: scaleBicubicT<4, 0>(data, out, w, h, l, u); break; // turns out I had not included + case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); } #if _M_SSE >= 0x401 @@ -372,19 +394,19 @@ namespace { #if _M_SSE >= 0x401 if(cpu_info.bSSE4_1) { switch(factor) { - case 2: scaleBicubicTSSE41<2, 34, 34>(data, out, w, h, l, u); break; - case 3: scaleBicubicTSSE41<3, 34, 34>(data, out, w, h, l, u); break; - case 4: scaleBicubicTSSE41<4, 34, 34>(data, out, w, h, l, u); break; - case 5: scaleBicubicTSSE41<5, 34, 34>(data, out, w, h, l, u); break; + case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break; + case 3: scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u); break; + case 4: scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u); break; + case 5: scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u); break; default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); } } else { #endif switch(factor) { - case 2: scaleBicubicT<2, 34, 34>(data, out, w, h, l, u); break; - case 3: scaleBicubicT<3, 34, 34>(data, out, w, h, l, u); break; - case 4: scaleBicubicT<4, 34, 34>(data, out, w, h, l, u); break; - case 5: scaleBicubicT<5, 34, 34>(data, out, w, h, l, u); break; + case 2: scaleBicubicT<2, 1>(data, out, w, h, l, u); break; + case 3: scaleBicubicT<3, 1>(data, out, w, h, l, u); break; + case 4: scaleBicubicT<4, 1>(data, out, w, h, l, u); break; + case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break; default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5"); } #if _M_SSE >= 0x401 @@ -508,6 +530,7 @@ namespace { /////////////////////////////////////// Texture Scaler TextureScaler::TextureScaler() { + initBicubicWeights(); } bool TextureScaler::IsEmptyOrFlat(u32* data, int pixels, GLenum fmt) {