From fa8df5439144e794fe3e76c47620054f20481a82 Mon Sep 17 00:00:00 2001
From: Peter Thoman <peter@metaclassofnil.com>
Date: Sun, 5 May 2013 21:54:50 +0200
Subject: [PATCH] Pre-compute all weights and sums for bicubic scaling

---
 GPU/GLES/TextureScaler.cpp | 127 ++++++++++++++++++++++---------------
 1 file changed, 75 insertions(+), 52 deletions(-)
diff --git a/GPU/GLES/TextureScaler.cpp b/GPU/GLES/TextureScaler.cpp
index ba1590e865..d8a6564432 100644
--- a/GPU/GLES/TextureScaler.cpp
+++ b/GPU/GLES/TextureScaler.cpp
@@ -245,40 +245,67 @@ namespace {
 	// B=1 C=0   : cubic B spline (very smooth)
 	// B=C=1/3   : recommended for general upscaling
 	// B=0 C=1/2 : Catmull-Rom spline (sharp, ringing)
-	// B = Bi/100 and C = Ci/100 (template parameters to allow compiler to specialize weighting function for each spline type)
 	// see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics"
-	template<int Bi, int Ci>
-	inline float mitchell(float x) {
-		const float B = Bi/100.0f, C = Ci/100.0f;
+	inline float mitchell(float x, float B, float C) {
 		float ax = fabs(x);
 		if(ax>=2.0f) return 0.0f;
 		if(ax>=1.0f) return ((-B-6*C)*(x*x*x) + (6*B+30*C)*(x*x) + (-12*B-48*C)*x + (8*B+24*C))/6.0f;
 		return ((12-9*B-6*C)*(x*x*x) + (-18+12*B+6*C)*(x*x) + (6-2*B))/6.0f;
 	}
 
-	// perform bicubic scaling by factor f, with a Mitchell-Netravali spline specified by Bi and Ci
-	template<int f, int Bi, int Ci>
+	// arrays for pre-calculating weights and sums (~20KB)
+	// Dimensions:
+	//   0: 0 = BSpline, 1 = mitchell
+	//   2: 2-5x scaling
+	// 2,3: 5x5 generated pixels 
+	// 4,5: 5x5 pixels sampled from
+	float bicubicWeights[2][4][5][5][5][5];
+	float bicubicInvSums[2][4][5][5];
+
+	// initialize pre-computed weights array
+	void initBicubicWeights() {
+		float B[2] = { 1.0f, 0.334f };
+		float C[2] = { 0.0f, 0.334f };
+		for(int type=0; type<2; ++type) {
+			for(int factor=2; factor<=5; ++factor) {
+				for(int x=0; x<factor; ++x) {
+					for(int y=0; y<factor; ++y) {
+						float sum = 0.0f;
+						for(int sx = -2; sx <= 2; ++sx) { 
+							for(int sy = -2; sy <= 2; ++sy) {
+								float dx = (x+0.5f)/factor - (sx+0.5f);
+								float dy = (y+0.5f)/factor - (sy+0.5f);
+								float dist = sqrt(dx*dx + dy*dy);
+								float weight = mitchell(dist, B[type], C[type]);
+								bicubicWeights[type][factor-2][x][y][sx+2][sy+2] = weight;
+								sum += weight;
+							}
+						}
+						bicubicInvSums[type][factor-2][x][y] = 1.0f/sum;
+					}
+				}
+			}
+		}
+	}
+
+	// perform bicubic scaling by factor f, with precomputed spline type T
+	template<int f, int T>
 	void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
-		const float ff = static_cast<float>(f);
 		int outw = w*f;
 		for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) {
 			for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) {
 				for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) {
 					for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) {
-						float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f, sum = 0.0f;
+						float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f;
 						int cx = x/f, cy = y/f;
 						// sample supporting pixels in original image
-						for(int sx = cx-2; sx <= cx+2; ++sx) { 
-							for(int sy = cy-2; sy <= cy+2; ++sy) {
-								float dx = (x+0.5f)/ff - (sx+0.5f);
-								float dy = (y+0.5f)/ff - (sy+0.5f);
-								float dist = dx*dx + dy*dy; // do sqrt only after check
-								if(dist < 4.0f) {
-									float weight = mitchell<Bi, Ci>(sqrt(dist));
-									sum += weight;
+						for(int sx = -2; sx <= 2; ++sx) { 
+							for(int sy = -2; sy <= 2; ++sy) {
+								float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2];
+								if(weight != 0.0f) {
 									// clamp pixel locations
-									int csy = std::max(std::min(sy,h-1),0);
-									int csx = std::max(std::min(sx,w-1),0);
+									int csy = std::max(std::min(sy+cy,h-1),0);
+									int csx = std::max(std::min(sx+cx,w-1),0);
 									// sample & add weighted components
 									u32 sample = data[csy*w+csx];
 									r += weight*R(sample);
@@ -289,10 +316,11 @@ namespace {
 							}
 						}
 						// generate and write result
-						int ri = std::min(std::max(static_cast<int>(ceilf(r/sum)),0),255);
-						int gi = std::min(std::max(static_cast<int>(ceilf(g/sum)),0),255);
-						int bi = std::min(std::max(static_cast<int>(ceilf(b/sum)),0),255);
-						int ai = std::min(std::max(static_cast<int>(ceilf(a/sum)),0),255);
+						float invSum = bicubicInvSums[T][f-2][x%f][y%f];
+						int ri = std::min(std::max(static_cast<int>(ceilf(r*invSum)),0),255);
+						int gi = std::min(std::max(static_cast<int>(ceilf(g*invSum)),0),255);
+						int bi = std::min(std::max(static_cast<int>(ceilf(b*invSum)),0),255);
+						int ai = std::min(std::max(static_cast<int>(ceilf(a*invSum)),0),255);
 						out[y*outw + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri;
 					}
 				}
@@ -300,29 +328,23 @@ namespace {
 		}
 	}
 	#if _M_SSE >= 0x401
-	template<int f, int Bi, int Ci>
+	template<int f, int T>
 	void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
-		const float ff = static_cast<float>(f);
 		int outw = w*f;
 		for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) {
 			for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) {
 				for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) {
 					for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) {
 						__m128 result = _mm_set1_ps(0.0f);
-						float sum = 0.0f;
 						int cx = x/f, cy = y/f;
 						// sample supporting pixels in original image
-						for(int sx = cx-2; sx <= cx+2; ++sx) { 
-							for(int sy = cy-2; sy <= cy+2; ++sy) {
-								float dx = (x+0.5f)/ff - (sx+0.5f);
-								float dy = (y+0.5f)/ff - (sy+0.5f);
-								float dist = dx*dx + dy*dy; // do sqrt only after check
-								if(dist < 4.0f) {
-									float weight = mitchell<Bi, Ci>(sqrt(dist));
-									sum += weight;
+						for(int sx = -2; sx <= 2; ++sx) { 
+							for(int sy = -2; sy <= 2; ++sy) {
+								float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2];
+								if(weight != 0.0f) {
 									// clamp pixel locations
-									int csy = std::max(std::min(sy,h-1),0);
-									int csx = std::max(std::min(sx,w-1),0);
+									int csy = std::max(std::min(sy+cy,h-1),0);
+									int csx = std::max(std::min(sx+cx,w-1),0);
 									// sample & add weighted components
 									__m128i sample = _mm_cvtsi32_si128(data[csy*w+csx]);
 									sample = _mm_cvtepu8_epi32(sample);
@@ -333,7 +355,7 @@ namespace {
 							}
 						}
 						// generate and write result
-						__m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(1.0f/sum)));
+						__m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(bicubicInvSums[T][f-2][x%f][y%f])));
 						pixel = _mm_packs_epi32(pixel, pixel);
 						pixel = _mm_packus_epi16(pixel, pixel);
 						out[y*outw + x] = _mm_cvtsi128_si32(pixel);
@@ -348,19 +370,19 @@ namespace {
 		#if _M_SSE >= 0x401
 		if(cpu_info.bSSE4_1) {
 			switch(factor) {
-			case 2: scaleBicubicTSSE41<2, 100, 0>(data, out, w, h, l, u); break; // when I first tested this, 
-			case 3: scaleBicubicTSSE41<3, 100, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
-			case 4: scaleBicubicTSSE41<4, 100, 0>(data, out, w, h, l, u); break; // turns out I had not included
-			case 5: scaleBicubicTSSE41<5, 100, 0>(data, out, w, h, l, u); break; // any of these break statements
+			case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this, 
+			case 3: scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
+			case 4: scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
+			case 5: scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u); break; // any of these break statements
 			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 			}
 		} else {
 		#endif
 			switch(factor) {
-			case 2: scaleBicubicT<2, 100, 0>(data, out, w, h, l, u); break; // when I first tested this, 
-			case 3: scaleBicubicT<3, 100, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
-			case 4: scaleBicubicT<4, 100, 0>(data, out, w, h, l, u); break; // turns out I had not included
-			case 5: scaleBicubicT<5, 100, 0>(data, out, w, h, l, u); break; // any of these break statements
+			case 2: scaleBicubicT<2, 0>(data, out, w, h, l, u); break; // when I first tested this, 
+			case 3: scaleBicubicT<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
+			case 4: scaleBicubicT<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
+			case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements
 			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 			}
 		#if _M_SSE >= 0x401
@@ -372,19 +394,19 @@ namespace {
 		#if _M_SSE >= 0x401
 		if(cpu_info.bSSE4_1) {
 			switch(factor) {
-			case 2: scaleBicubicTSSE41<2, 34, 34>(data, out, w, h, l, u); break;
-			case 3: scaleBicubicTSSE41<3, 34, 34>(data, out, w, h, l, u); break;
-			case 4: scaleBicubicTSSE41<4, 34, 34>(data, out, w, h, l, u); break;
-			case 5: scaleBicubicTSSE41<5, 34, 34>(data, out, w, h, l, u); break;
+			case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break;
+			case 3: scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u); break;
+			case 4: scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u); break;
+			case 5: scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u); break;
 			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 			}
 		} else {
 		#endif
 			switch(factor) {
-			case 2: scaleBicubicT<2, 34, 34>(data, out, w, h, l, u); break;
-			case 3: scaleBicubicT<3, 34, 34>(data, out, w, h, l, u); break;
-			case 4: scaleBicubicT<4, 34, 34>(data, out, w, h, l, u); break;
-			case 5: scaleBicubicT<5, 34, 34>(data, out, w, h, l, u); break;
+			case 2: scaleBicubicT<2, 1>(data, out, w, h, l, u); break;
+			case 3: scaleBicubicT<3, 1>(data, out, w, h, l, u); break;
+			case 4: scaleBicubicT<4, 1>(data, out, w, h, l, u); break;
+			case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break;
 			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 			}
 		#if _M_SSE >= 0x401
@@ -508,6 +530,7 @@ namespace {
 /////////////////////////////////////// Texture Scaler
 
 TextureScaler::TextureScaler() {
+	initBicubicWeights();
 }
 
 bool TextureScaler::IsEmptyOrFlat(u32* data, int pixels, GLenum fmt) {