From 7faddd6100a7e5da116712341f294c733396c7d9 Mon Sep 17 00:00:00 2001
From: Henrik Rydgard <hrydgard@gmail.com>
Date: Tue, 20 Jan 2015 17:08:10 +0100
Subject: [PATCH] Consolidate the two TextureScaler copies into one.

Also lets us clean up ColorConv slightly.
---
 CMakeLists.txt                         |   4 +-
 Common/ColorConv.cpp                   |  22 +-
 Common/ColorConv.h                     |   9 -
 GPU/{GLES => Common}/TextureScaler.cpp |  46 +-
 GPU/{GLES => Common}/TextureScaler.h   |   6 +-
 GPU/Directx9/TextureCacheDX9.cpp       |  21 +-
 GPU/Directx9/TextureCacheDX9.h         |   4 +-
 GPU/Directx9/TextureScalerDX9.cpp      | 651 -------------------------
 GPU/Directx9/TextureScalerDX9.h        |  56 ---
 GPU/GLES/TextureCache.cpp              |  18 +-
 GPU/GLES/TextureCache.h                |   2 +-
 GPU/GPU.vcxproj                        |   6 +-
 GPU/GPU.vcxproj.filters                |  16 +-
 Qt/GPU.pro                             |   2 +-
 Windows/WndMainWindow.cpp              |   2 +-
 android/jni/Android.mk                 |   2 +-
 16 files changed, 88 insertions(+), 779 deletions(-)
 rename GPU/{GLES => Common}/TextureScaler.cpp (95%)
 rename GPU/{GLES => Common}/TextureScaler.h (89%)
 delete mode 100644 GPU/Directx9/TextureScalerDX9.cpp
 delete mode 100644 GPU/Directx9/TextureScalerDX9.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf2c12bac8..ba1afb2e81 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1380,6 +1380,8 @@ add_library(GPU OBJECT
 	GPU/Common/IndexGenerator.h
 	GPU/Common/TextureDecoder.cpp
 	GPU/Common/TextureDecoder.h
+	GPU/Common/TextureScaler.cpp
+	GPU/Common/TextureScaler.h
 	GPU/Common/TextureCacheCommon.cpp
 	GPU/Common/TextureCacheCommon.h
 	${GPU_NEON}
@@ -1408,8 +1410,6 @@ add_library(GPU OBJECT
 	GPU/GLES/StencilBuffer.cpp
 	GPU/GLES/TextureCache.cpp
 	GPU/GLES/TextureCache.h
-	GPU/GLES/TextureScaler.cpp
-	GPU/GLES/TextureScaler.h
 	GPU/GLES/TransformPipeline.cpp
 	GPU/GLES/TransformPipeline.h
 	GPU/GLES/VertexShaderGenerator.cpp
diff --git a/Common/ColorConv.cpp b/Common/ColorConv.cpp
index 4ba4a3c9c4..02b7247c54 100644
--- a/Common/ColorConv.cpp
+++ b/Common/ColorConv.cpp
@@ -54,28 +54,10 @@ inline u32 RGBA2BGRA(u32 src) {
 	return r | ga | b;
 }
 
-void convert4444(u16* data, u32* out, int width, int l, int u) {
-	for (int y = l; y < u; ++y) {
-		ConvertRGBA4444ToRGBA8888(out + y * width, data + y * width, width);
-	}
-}
-
-void convert565(u16* data, u32* out, int width, int l, int u) {
-	for (int y = l; y < u; ++y) {
-		ConvertRGB565ToRGBA888F(out + y * width, data + y * width, width);
-	}
-}
-
-void convert5551(u16* data, u32* out, int width, int l, int u) {
-	for (int y = l; y < u; ++y) {
-		ConvertRGBA5551ToRGBA8888(out + y * width, data + y * width, width);
-	}
-}
-
-// Used heavily in Test Drive Unlimited
+// Used heavily in Test Drive Unlimited (for no good reason...)
 void ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, int numPixels) {
 #if _M_SSE >= 0x401
-	const __m128i maskG = _mm_set1_epi32(0x8000FC00);
+	const __m128i maskG = _mm_set1_epi32(0x0000FC00);
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i mask = _mm_set1_epi32(0x0000FFFF);
 
diff --git a/Common/ColorConv.h b/Common/ColorConv.h
index 16e56b50a9..f8f8c2f45b 100644
--- a/Common/ColorConv.h
+++ b/Common/ColorConv.h
@@ -35,15 +35,6 @@ inline u8 Convert6To8(u8 v) {
 	return (v << 2) | (v >> 4);
 }
 
-// convert 4444 image to 8888
-void convert4444(u16* data, u32* out, int width, int l, int u);
-
-// convert 565 image to 8888
-void convert565(u16* data, u32* out, int width, int l, int u);
-
-// convert 5551 image to 8888
-void convert5551(u16* data, u32* out, int width, int l, int u);
-
 inline u32 DecodeRGBA4444(u16 src) {
 	const u32 r = (src & 0x000F) << 0;
 	const u32 g = (src & 0x00F0) << 4;
diff --git a/GPU/GLES/TextureScaler.cpp b/GPU/Common/TextureScaler.cpp
similarity index 95%
rename from GPU/GLES/TextureScaler.cpp
rename to GPU/Common/TextureScaler.cpp
index aaefcaee4b..72f1cb3624 100644
--- a/GPU/GLES/TextureScaler.cpp
+++ b/GPU/Common/TextureScaler.cpp
@@ -21,7 +21,10 @@
 #endif
 
 #include <algorithm>
-#include "GPU/GLES/TextureScaler.h"
+#include <cstdlib>
+#include <cmath>
+
+#include "GPU/Common/TextureScaler.h"
 
 #include "Core/Config.h"
 #include "Common/Common.h"
@@ -32,8 +35,6 @@
 #include "Common/ThreadPools.h"
 #include "Common/CPUDetect.h"
 #include "ext/xbrz/xbrz.h"
-#include <stdlib.h>
-#include <math.h>
 
 #if _M_SSE >= 0x402
 #include <nmmintrin.h>
@@ -46,11 +47,11 @@
 #include "native/base/timeutil.h"
 #endif
 
-/////////////////////////////////////// Helper Functions (mostly math for parallelization)
+// Helper Functions (mostly math for parallelization)
 
 namespace {
 
-	//////////////////////////////////////////////////////////////////// Various image processing
+	// Various image processing
 
 	#define R(_col) ((_col>> 0)&0xFF)
 	#define G(_col) ((_col>> 8)&0xFF)
@@ -173,7 +174,8 @@ namespace {
 									out[y*width + x] += 400; // assume distance at borders, usually makes for better result
 									continue;
 								}
-								out[y*width + x] += DISTANCE(data[yy*width + xx], center);
+								u32 d = data[yy*width + xx];
+								out[y*width + x] += DISTANCE(d, center);
 							}
 						}
 					}
@@ -498,7 +500,7 @@ bool TextureScaler::IsEmptyOrFlat(u32* data, int pixels, GLenum fmt) {
 	return true;
 }
 
-void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height, int factor) {
+void TextureScaler::Scale(u32* &data, GEBufferFormat &dstFmt, int &width, int &height, int factor) {
 	// prevent processing empty or flat textures (this happens a lot in some games)
 	// doesn't hurt the standard case, will be very quick for textures with actual texture
 	if(IsEmptyOrFlat(data, width*height, dstFmt)) {
@@ -545,7 +547,7 @@ void TextureScaler::Scale(u32* &data, GLenum &dstFmt, int &width, int &height, i
 
 	// update values accordingly
 	data = outputBuf;
-	dstFmt = GL_UNSIGNED_BYTE;
+	dstFmt = GE_FORMAT_8888;
 	width *= factor;
 	height *= factor;
 
@@ -616,21 +618,39 @@ void TextureScaler::DePosterize(u32* source, u32* dest, int width, int height) {
 	GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height);
 }
 
-void TextureScaler::ConvertTo8888(GLenum format, u32* source, u32* &dest, int width, int height) {
+static void convert4444(u16* data, u32* out, int width, int l, int u) {
+	for (int y = l; y < u; ++y) {
+		ConvertRGBA4444ToRGBA8888(out + y * width, data + y * width, width);
+	}
+}
+
+static void convert565(u16* data, u32* out, int width, int l, int u) {
+	for (int y = l; y < u; ++y) {
+		ConvertRGB565ToRGBA888F(out + y * width, data + y * width, width);
+	}
+}
+
+static void convert5551(u16* data, u32* out, int width, int l, int u) {
+	for (int y = l; y < u; ++y) {
+		ConvertRGBA5551ToRGBA8888(out + y * width, data + y * width, width);
+	}
+}
+
+void TextureScaler::ConvertTo8888(GEBufferFormat format, u32* source, u32* &dest, int width, int height) {
 	switch(format) {
-	case GL_UNSIGNED_BYTE:
+	case GE_FORMAT_8888:
 		dest = source; // already fine
 		break;
 
-	case GL_UNSIGNED_SHORT_4_4_4_4:
+	case GE_FORMAT_4444:
 		GlobalThreadPool::Loop(std::bind(&convert4444, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
 		break;
 
-	case GL_UNSIGNED_SHORT_5_6_5:
+	case GE_FORMAT_565:
 		GlobalThreadPool::Loop(std::bind(&convert565, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
 		break;
 
-	case GL_UNSIGNED_SHORT_5_5_5_1:
+	case GE_FORMAT_5551:
 		GlobalThreadPool::Loop(std::bind(&convert5551, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
 		break;
 
diff --git a/GPU/GLES/TextureScaler.h b/GPU/Common/TextureScaler.h
similarity index 89%
rename from GPU/GLES/TextureScaler.h
rename to GPU/Common/TextureScaler.h
index 6ca82764ef..b90201713f 100644
--- a/GPU/GLES/TextureScaler.h
+++ b/GPU/Common/TextureScaler.h
@@ -20,15 +20,15 @@
 #include "Common/MemoryUtil.h"
 #include "../Globals.h"
 #include "gfx/gl_common.h"
+#include "GPU/ge_constants.h"
 
 #include <vector>
 
-
 class TextureScaler {
 public:
 	TextureScaler();
 
-	void Scale(u32* &data, GLenum &dstfmt, int &width, int &height, int factor);
+	void Scale(u32* &data, GEBufferFormat &dstfmt, int &width, int &height, int factor);
 
 	enum { XBRZ= 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3 };
 
@@ -38,7 +38,7 @@ private:
 	void ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height);
 	void ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height);
 	void ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic = false);
-	void ConvertTo8888(GLenum format, u32* source, u32* &dest, int width, int height);
+	void ConvertTo8888(GEBufferFormat format, u32* source, u32* &dest, int width, int height);
 
 	void DePosterize(u32* source, u32* dest, int width, int height);
 
diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp
index cced61afa7..70ddeb6685 100644
--- a/GPU/Directx9/TextureCacheDX9.cpp
+++ b/GPU/Directx9/TextureCacheDX9.cpp
@@ -1703,8 +1703,25 @@ void TextureCacheDX9::LoadTextureLevel(TexCacheEntry &entry, int level, int maxL
 	gpuStats.numTexturesDecoded++;
 
 	u32 *pixelData = (u32 *)finalBuf;
-	if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0)
-		scaler.Scale(pixelData, dstFmt, w, h, scaleFactor);
+	if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) {
+		GEBufferFormat dstFormat;
+		switch (dstFmt) {
+		case D3DFMT_A8R8G8B8: dstFormat = GE_FORMAT_8888; break;
+		case D3DFMT_R5G6B5: dstFormat = GE_FORMAT_565; break;
+		case D3DFMT_A4R4G4B4: dstFormat = GE_FORMAT_4444; break;
+		case D3DFMT_A1R5G5B5: dstFormat = GE_FORMAT_5551; break;
+		default: goto dontScale;
+		}
+		scaler.Scale(pixelData, dstFormat, w, h, scaleFactor);
+		switch (dstFormat) {
+		case GE_FORMAT_8888: dstFmt = D3DFMT_A8R8G8B8; break;
+		case GE_FORMAT_565: dstFmt = D3DFMT_R5G6B5; break;
+		case GE_FORMAT_4444: dstFmt = D3DFMT_A4R4G4B4; break;
+		case GE_FORMAT_5551: dstFmt = D3DFMT_A1R5G5B5; break;
+		}
+	dontScale:
+		;
+	}
 
 	if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) {
 		TexCacheEntry::Status alphaStatus = CheckAlpha(pixelData, dstFmt, w, w, h);
diff --git a/GPU/Directx9/TextureCacheDX9.h b/GPU/Directx9/TextureCacheDX9.h
index cac576b0f3..2684874224 100644
--- a/GPU/Directx9/TextureCacheDX9.h
+++ b/GPU/Directx9/TextureCacheDX9.h
@@ -24,7 +24,7 @@
 #include "helper/fbo.h"
 #include "GPU/GPUInterface.h"
 #include "GPU/GPUState.h"
-#include "GPU/Directx9/TextureScalerDX9.h"
+#include "GPU/Common/TextureScaler.h"
 #include "GPU/Common/TextureCacheCommon.h"
 
 struct VirtualFramebuffer;
@@ -197,7 +197,7 @@ private:
 
 	bool clearCacheNextFrame_;
 	bool lowMemoryMode_;
-	TextureScalerDX9 scaler;
+	TextureScaler scaler;
 
 	SimpleBuf<u32> tmpTexBuf32;
 	SimpleBuf<u16> tmpTexBuf16;
diff --git a/GPU/Directx9/TextureScalerDX9.cpp b/GPU/Directx9/TextureScalerDX9.cpp
deleted file mode 100644
index a1aa386251..0000000000
--- a/GPU/Directx9/TextureScalerDX9.cpp
+++ /dev/null
@@ -1,651 +0,0 @@
-// Copyright (c) 2012- PPSSPP Project.
-
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, version 2.0 or later versions.
-
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License 2.0 for more details.
-
-// A copy of the GPL 2.0 should have been included with the program.
-// If not, see http://www.gnu.org/licenses/
-
-// Official git repository and contact information can be found at
-// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
-
-// On Visual Studio 2012, include this before anything else so
-// _VARIADIC_MAX gets set to 10, to avoid std::bind compile errors.
-// See header file for reasons why.
-#if defined(_WIN32) && _MSC_VER == 1700
-#include "../native/base/basictypes.h"
-#endif
-
-#include <algorithm>
-#include "GPU/Directx9/TextureScalerDX9.h"
-
-#include "Core/Config.h"
-#include "Common/Common.h"
-#include "Common/ColorConv.h"
-#include "Common/Log.h"
-#include "Common/MsgHandler.h"
-#include "Common/CommonFuncs.h"
-#include "Common/ThreadPools.h"
-#include "Common/CPUDetect.h"
-#include "ext/xbrz/xbrz.h"
-#include <stdlib.h>
-#include <math.h>
-#include <D3D9Types.h>
-
-#undef min
-#undef max
-
-#if _M_SSE >= 0x402
-#include <nmmintrin.h>
-#endif
-
-// Report the time and throughput for each larger scaling operation in the log
-//#define SCALING_MEASURE_TIME
-
-#ifdef SCALING_MEASURE_TIME
-#include "native/base/timeutil.h"
-#endif
-
-/////////////////////////////////////// Helper Functions (mostly math for parallelization)
-
-namespace {
-
-	//////////////////////////////////////////////////////////////////// Various image processing
-
-	#define R(_col) ((_col>> 0)&0xFF)
-	#define G(_col) ((_col>> 8)&0xFF)
-	#define B(_col) ((_col>>16)&0xFF)
-	#define A(_col) ((_col>>24)&0xFF)
-
-	#define DISTANCE(_p1,_p2) ( abs(static_cast<int>(static_cast<int>(R(_p1))-R(_p2))) + abs(static_cast<int>(static_cast<int>(G(_p1))-G(_p2))) \
-							  + abs(static_cast<int>(static_cast<int>(B(_p1))-B(_p2))) + abs(static_cast<int>(static_cast<int>(A(_p1))-A(_p2))) )
-	
-	// this is sadly much faster than an inline function with a loop, at least in VC10
-	#define MIX_PIXELS(_p0, _p1, _factors) \
-		( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 <<  0 ) | \
-		( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 <<  8 ) | \
-		( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
-		( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 )
-
-	#define BLOCK_SIZE 32
-	
-	// 3x3 convolution with Neumann boundary conditions, parallelizable
-	// quite slow, could be sped up a lot
-	// especially handling of separable kernels
-	void convolve3x3(u32* data, u32* out, const int kernel[3][3], int width, int height, int l, int u) {
-		for(int yb = 0; yb < (u-l)/BLOCK_SIZE+1; ++yb) {
-			for(int xb = 0; xb < width/BLOCK_SIZE+1; ++xb) {
-				for(int y = l+yb*BLOCK_SIZE; y < l+(yb+1)*BLOCK_SIZE && y < u; ++y) {
-					for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < width; ++x) {
-						int val = 0;
-						for(int yoff = -1; yoff <= 1; ++yoff) {
-							int yy = std::max(std::min(y+yoff, height-1), 0);
-							for(int xoff = -1; xoff <= 1; ++xoff) {
-								int xx = std::max(std::min(x+xoff, width-1), 0);
-								val += data[yy*width + xx] * kernel[yoff+1][xoff+1];
-							}
-						}
-						out[y*width + x] = abs(val);
-					}
-				}
-			}
-		}
-	}
-
-	// deposterization: smoothes posterized gradients from low-color-depth (e.g. 444, 565, compressed) sources
-	void deposterizeH(u32* data, u32* out, int w, int l, int u) {
-		static const int T = 8;
-		for(int y = l; y < u; ++y) {
-			for(int x = 0; x < w; ++x) {
-				int inpos = y*w + x;
-				u32 center = data[inpos];
-				if(x==0 || x==w-1) {
-					out[y*w + x] = center;
-					continue;
-				}
-				u32 left   = data[inpos - 1];
-				u32 right  = data[inpos + 1];
-				out[y*w + x] = 0;
-				for(int c=0; c<4; ++c) {
-					u8 lc = ((  left>>c*8)&0xFF);
-					u8 cc = ((center>>c*8)&0xFF);
-					u8 rc = (( right>>c*8)&0xFF);
-					if((lc != rc) && ((lc == cc && abs((int)((int)rc)-cc) <= T) || (rc == cc && abs((int)((int)lc)-cc) <= T))) {
-						// blend this component
-						out[y*w + x] |= ((rc+lc)/2) << (c*8);
-					} else {
-						// no change for this component
-						out[y*w + x] |= cc << (c*8);
-					}
-				}
-			}
-		}
-	}
-	void deposterizeV(u32* data, u32* out, int w, int h, int l, int u) {
-		static const int T = 8;
-		for(int xb = 0; xb < w/BLOCK_SIZE+1; ++xb) {
-			for(int y = l; y < u; ++y) {
-				for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w; ++x) {
-					u32 center = data[ y    * w + x];
-					if(y==0 || y==h-1) {
-						out[y*w + x] = center;
-						continue;
-					}
-					u32 upper  = data[(y-1) * w + x];
-					u32 lower  = data[(y+1) * w + x];
-					out[y*w + x] = 0;
-					for(int c=0; c<4; ++c) {
-						u8 uc = (( upper>>c*8)&0xFF);
-						u8 cc = ((center>>c*8)&0xFF);
-						u8 lc = (( lower>>c*8)&0xFF);
-						if((uc != lc) && ((uc == cc && abs((int)((int)lc)-cc) <= T) || (lc == cc && abs((int)((int)uc)-cc) <= T))) {
-							// blend this component
-							out[y*w + x] |= ((lc+uc)/2) << (c*8);
-						} else {
-							// no change for this component
-							out[y*w + x] |= cc << (c*8);
-						}
-					}
-				}
-			}
-		}
-	}
-
-	// generates a distance mask value for each pixel in data
-	// higher values -> larger distance to the surrounding pixels
-	void generateDistanceMask(u32* data, u32* out, int width, int height, int l, int u) {
-		for(int yb = 0; yb < (u-l)/BLOCK_SIZE+1; ++yb) {
-			for(int xb = 0; xb < width/BLOCK_SIZE+1; ++xb) {
-				for(int y = l+yb*BLOCK_SIZE; y < l+(yb+1)*BLOCK_SIZE && y < u; ++y) {
-					for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < width; ++x) {
-						out[y*width + x] = 0;
-						u32 center = data[y*width + x];
-						for(int yoff = -1; yoff <= 1; ++yoff) {
-							int yy = y+yoff;
-							if(yy == height || yy == -1) {
-								out[y*width + x] += 1200; // assume distance at borders, usually makes for better result
-								continue;
-							}
-							for(int xoff = -1; xoff <= 1; ++xoff) {
-								if(yoff == 0 && xoff == 0) continue;
-								int xx = x+xoff;
-								if(xx == width || xx == -1) {
-									out[y*width + x] += 400; // assume distance at borders, usually makes for better result
-									continue;
-								}
-								out[y*width + x] += DISTANCE(data[yy*width + xx], center);
-							}
-						}
-					}
-				}
-			}
-		}
-	}
-
-	// mix two images based on a mask
-	void mix(u32* data, u32* source, u32* mask, u32 maskmax, int width, int l, int u) {
-		for(int y = l; y < u; ++y) {
-			for(int x = 0; x < width; ++x) {
-				int pos = y*width + x;
-				u8 mixFactors[2] = { 0, static_cast<u8>((std::min(mask[pos], maskmax)*255)/maskmax) };
-				mixFactors[0] = 255-mixFactors[1];
-				data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors);
-				if(A(source[pos]) == 0) data[pos] = data[pos] & 0x00FFFFFF; // xBRZ always does a better job with hard alpha
-			}
-		}
-	}
-
-	//////////////////////////////////////////////////////////////////// Bicubic scaling
-	
-	// generate the value of a Mitchell-Netravali scaling spline at distance d, with parameters A and B
-	// B=1 C=0   : cubic B spline (very smooth)
-	// B=C=1/3   : recommended for general upscaling
-	// B=0 C=1/2 : Catmull-Rom spline (sharp, ringing)
-	// see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics"
-	inline float mitchell(float x, float B, float C) {
-		float ax = fabs(x);
-		if(ax>=2.0f) return 0.0f;
-		if(ax>=1.0f) return ((-B-6*C)*(x*x*x) + (6*B+30*C)*(x*x) + (-12*B-48*C)*x + (8*B+24*C))/6.0f;
-		return ((12-9*B-6*C)*(x*x*x) + (-18+12*B+6*C)*(x*x) + (6-2*B))/6.0f;
-	}
-
-	// arrays for pre-calculating weights and sums (~20KB)
-	// Dimensions:
-	//   0: 0 = BSpline, 1 = mitchell
-	//   2: 2-5x scaling
-	// 2,3: 5x5 generated pixels 
-	// 4,5: 5x5 pixels sampled from
-	float bicubicWeights[2][4][5][5][5][5];
-	float bicubicInvSums[2][4][5][5];
-
-	// initialize pre-computed weights array
-	void initBicubicWeights() {
-		float B[2] = { 1.0f, 0.334f };
-		float C[2] = { 0.0f, 0.334f };
-		for(int type=0; type<2; ++type) {
-			for(int factor=2; factor<=5; ++factor) {
-				for(int x=0; x<factor; ++x) {
-					for(int y=0; y<factor; ++y) {
-						float sum = 0.0f;
-						for(int sx = -2; sx <= 2; ++sx) { 
-							for(int sy = -2; sy <= 2; ++sy) {
-								float dx = (x+0.5f)/factor - (sx+0.5f);
-								float dy = (y+0.5f)/factor - (sy+0.5f);
-								float dist = sqrt(dx*dx + dy*dy);
-								float weight = mitchell(dist, B[type], C[type]);
-								bicubicWeights[type][factor-2][x][y][sx+2][sy+2] = weight;
-								sum += weight;
-							}
-						}
-						bicubicInvSums[type][factor-2][x][y] = 1.0f/sum;
-					}
-				}
-			}
-		}
-	}
-
-	// perform bicubic scaling by factor f, with precomputed spline type T
-	template<int f, int T>
-	void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
-		int outw = w*f;
-		for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) {
-			for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) {
-				for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) {
-					for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) {
-						float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f;
-						int cx = x/f, cy = y/f;
-						// sample supporting pixels in original image
-						for(int sx = -2; sx <= 2; ++sx) { 
-							for(int sy = -2; sy <= 2; ++sy) {
-								float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2];
-								if(weight != 0.0f) {
-									// clamp pixel locations
-									int csy = std::max(std::min(sy+cy,h-1),0);
-									int csx = std::max(std::min(sx+cx,w-1),0);
-									// sample & add weighted components
-									u32 sample = data[csy*w+csx];
-									r += weight*R(sample);
-									g += weight*G(sample);
-									b += weight*B(sample);
-									a += weight*A(sample);
-								}
-							}
-						}
-						// generate and write result
-						float invSum = bicubicInvSums[T][f-2][x%f][y%f];
-						int ri = std::min(std::max(static_cast<int>(ceilf(r*invSum)),0),255);
-						int gi = std::min(std::max(static_cast<int>(ceilf(g*invSum)),0),255);
-						int bi = std::min(std::max(static_cast<int>(ceilf(b*invSum)),0),255);
-						int ai = std::min(std::max(static_cast<int>(ceilf(a*invSum)),0),255);
-						out[y*outw + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri;
-					}
-				}
-			}
-		}
-	}
-	#if _M_SSE >= 0x401
-	template<int f, int T>
-	void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
-		int outw = w*f;
-		for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) {
-			for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) {
-				for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) {
-					for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) {
-						__m128 result = _mm_set1_ps(0.0f);
-						int cx = x/f, cy = y/f;
-						// sample supporting pixels in original image
-						for(int sx = -2; sx <= 2; ++sx) { 
-							for(int sy = -2; sy <= 2; ++sy) {
-								float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2];
-								if(weight != 0.0f) {
-									// clamp pixel locations
-									int csy = std::max(std::min(sy+cy,h-1),0);
-									int csx = std::max(std::min(sx+cx,w-1),0);
-									// sample & add weighted components
-									__m128i sample = _mm_cvtsi32_si128(data[csy*w+csx]);
-									sample = _mm_cvtepu8_epi32(sample);
-									__m128 col = _mm_cvtepi32_ps(sample);
-									col = _mm_mul_ps(col, _mm_set1_ps(weight));
-									result = _mm_add_ps(result, col);
-								}
-							}
-						}
-						// generate and write result
-						__m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(bicubicInvSums[T][f-2][x%f][y%f])));
-						pixel = _mm_packs_epi32(pixel, pixel);
-						pixel = _mm_packus_epi16(pixel, pixel);
-						out[y*outw + x] = _mm_cvtsi128_si32(pixel);
-					}
-				}
-			}
-		}
-	}
-	#endif
-
-	void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) {
-		#if _M_SSE >= 0x401
-		if(cpu_info.bSSE4_1) {
-			switch(factor) {
-			case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this, 
-			case 3: scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
-			case 4: scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
-			case 5: scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u); break; // any of these break statements
-			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
-			}
-		} else {
-		#endif
-			switch(factor) {
-			case 2: scaleBicubicT<2, 0>(data, out, w, h, l, u); break; // when I first tested this, 
-			case 3: scaleBicubicT<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
-			case 4: scaleBicubicT<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
-			case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements
-			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
-			}
-		#if _M_SSE >= 0x401
-		}
-		#endif
-	}
-
-	void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) {
-		#if _M_SSE >= 0x401
-		if(cpu_info.bSSE4_1) {
-			switch(factor) {
-			case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break;
-			case 3: scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u); break;
-			case 4: scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u); break;
-			case 5: scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u); break;
-			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
-			}
-		} else {
-		#endif
-			switch(factor) {
-			case 2: scaleBicubicT<2, 1>(data, out, w, h, l, u); break;
-			case 3: scaleBicubicT<3, 1>(data, out, w, h, l, u); break;
-			case 4: scaleBicubicT<4, 1>(data, out, w, h, l, u); break;
-			case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break;
-			default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
-			}
-		#if _M_SSE >= 0x401
-		}
-		#endif
-	}
-
-	//////////////////////////////////////////////////////////////////// Bilinear scaling
-
-	const static u8 BILINEAR_FACTORS[4][3][2] = {
-		{ { 44,211}, {  0,  0}, {  0,  0} }, // x2
-		{ { 64,191}, {  0,255}, {  0,  0} }, // x3
-		{ { 77,178}, { 26,229}, {  0,  0} }, // x4
-		{ {102,153}, { 51,204}, {  0,255} }, // x5
-	};
-	// integral bilinear upscaling by factor f, horizontal part
-	template<int f>
-	void bilinearHt(u32* data, u32* out, int w, int l, int u) {
-		static_assert(f>1 && f<=5, "Bilinear scaling only implemented for factors 2 to 5");
-		int outw = w*f;
-		for(int y = l; y < u; ++y) {
-			for(int x = 0; x < w; ++x) {
-				int inpos = y*w + x;
-				u32 left   = data[inpos - (x==0  ?0:1)];
-				u32 center = data[inpos];
-				u32 right  = data[inpos + (x==w-1?0:1)];
-				int i=0;
-				for(; i<f/2+f%2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
-					out[y*outw + x*f + i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f-2][i]);
-				}
-				for(; i<f      ; ++i) { // second half of the new pixels, hope the compiler unrolls this
-					out[y*outw + x*f + i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f-2][f-1-i]);
-				}
-			}
-		}
-	}
-	void bilinearH(int factor, u32* data, u32* out, int w, int l, int u) {
-		switch(factor) {
-		case 2: bilinearHt<2>(data, out, w, l, u); break;
-		case 3: bilinearHt<3>(data, out, w, l, u); break;
-		case 4: bilinearHt<4>(data, out, w, l, u); break;
-		case 5: bilinearHt<5>(data, out, w, l, u); break;
-		default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
-		}
-	}
-	// integral bilinear upscaling by factor f, vertical part
-	// gl/gu == global lower and upper bound
-	template<int f>
-	void bilinearVt(u32* data, u32* out, int w, int gl, int gu, int l, int u) {
-		static_assert(f>1 && f<=5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
-		int outw = w*f;
-		for(int xb = 0; xb < outw/BLOCK_SIZE+1; ++xb) {
-			for(int y = l; y < u; ++y) {
-				u32 uy = y - (y==gl  ?0:1);
-				u32 ly = y + (y==gu-1?0:1);
-				for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < outw; ++x) {
-					u32 upper  = data[uy * outw + x];
-					u32 center = data[y * outw + x];
-					u32 lower  = data[ly * outw + x];
-					int i=0;
-					for(; i<f/2+f%2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
-						out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f-2][i]);
-					}
-					for(; i<f      ; ++i) { // second half of the new pixels, hope the compiler unrolls this
-						out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f-2][f-1-i]);
-					}
-				}
-			}
-		}
-	}
-	void bilinearV(int factor, u32* data, u32* out, int w, int gl, int gu, int l, int u) {
-		switch(factor) {
-		case 2: bilinearVt<2>(data, out, w, gl, gu, l, u); break;
-		case 3: bilinearVt<3>(data, out, w, gl, gu, l, u); break;
-		case 4: bilinearVt<4>(data, out, w, gl, gu, l, u); break;
-		case 5: bilinearVt<5>(data, out, w, gl, gu, l, u); break;
-		default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
-		}
-	}
-
-	#undef BLOCK_SIZE
-	#undef MIX_PIXELS
-	#undef DISTANCE
-	#undef R
-	#undef G
-	#undef B
-	#undef A
-
-	// used for debugging texture scaling (writing textures to files)
-	static int g_imgCount = 0;
-	void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB
-		char fn[32];
-		snprintf(fn, 32, "%s%04d.ppm", prefix, g_imgCount++);
-		FILE *fp = fopen(fn, "wb");
-		fprintf(fp, "P6\n%d %d\n255\n", w, h);
-		for(int j = 0; j < h; ++j) {
-			for(int i = 0; i < w; ++i) {
-				static unsigned char color[3];
-				color[0] = pixels[(j*w+i)*4+0];  /* red */
-				color[1] = pixels[(j*w+i)*4+1];  /* green */
-				color[2] = pixels[(j*w+i)*4+2];  /* blue */
-				fwrite(color, 1, 3, fp);
-			}
-		}
-		fclose(fp);
-	}
-	void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component
-		char fn[32];
-		snprintf(fn, 32, "%s%04d.pgm", prefix, g_imgCount++);
-		FILE *fp = fopen(fn, "wb");
-		fprintf(fp, "P5\n%d %d\n65536\n", w, h);
-		for(int j = 0; j < h; ++j) {
-			for(int i = 0; i < w; ++i) {
-				fwrite((pixels+(j*w+i)), 1, 2, fp);
-			}
-		}
-		fclose(fp);
-	}
-}
-
-/////////////////////////////////////// Texture Scaler
-
-namespace DX9 {
-
-TextureScalerDX9::TextureScalerDX9() {
-	initBicubicWeights();
-}
-
-bool TextureScalerDX9::IsEmptyOrFlat(u32* data, int pixels, u32 fmt) {
-	int pixelsPerWord = (fmt == D3DFMT_A8R8G8B8) ? 1 : 2;
-	u32 ref = data[0];
-	for(int i=0; i<pixels/pixelsPerWord; ++i) {
-		if(data[i]!=ref) return false;
-	}
-	return true;
-}
-
-void TextureScalerDX9::Scale(u32* &data, u32 &dstFmt, int &width, int &height, int factor) {
-	// prevent processing empty or flat textures (this happens a lot in some games)
-	// doesn't hurt the standard case, will be very quick for textures with actual texture
-	if(IsEmptyOrFlat(data, width*height, dstFmt)) {
-		INFO_LOG(G3D, "TextureScaler: early exit -- empty/flat texture");
-		return;
-	}
-
-	#ifdef SCALING_MEASURE_TIME
-	double t_start = real_time_now();
-	#endif
-
-	bufInput.resize(width*height); // used to store the input image image if it needs to be reformatted
-	bufOutput.resize(width*height*factor*factor); // used to store the upscaled image
-	u32 *inputBuf = bufInput.data();
-	u32 *outputBuf = bufOutput.data();
-
-	// convert texture to correct format for scaling
-	ConvertTo8888(dstFmt, data, inputBuf, width, height);
-	
-	// deposterize
-	if(g_Config.bTexDeposterize) {
-		bufDeposter.resize(width*height);
-		DePosterize(inputBuf, bufDeposter.data(), width, height);
-		inputBuf = bufDeposter.data();
-	}
-	
-	// scale 
-	switch(g_Config.iTexScalingType) {
-	case XBRZ:
-		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
-		break;
-	case HYBRID:
-		ScaleHybrid(factor, inputBuf, outputBuf, width, height);
-		break;
-	case BICUBIC:
-		ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
-		break;
-	case HYBRID_BICUBIC:
-		ScaleHybrid(factor, inputBuf, outputBuf, width, height, true);
-		break;
-	default:
-		ERROR_LOG(G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
-	}
-
-	// update values accordingly
-	data = outputBuf;
-	dstFmt = D3DFMT_A8R8G8B8;
-	width *= factor;
-	height *= factor;
-
-	#ifdef SCALING_MEASURE_TIME
-	if(width*height > 64*64*factor*factor) {
-		double t = real_time_now() - t_start;
-		NOTICE_LOG(MASTER_LOG, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)", 
-			width*height, t, (width*height)/(t*1000*1000));
-	}
-	#endif
-}
-
-void TextureScalerDX9::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
-	xbrz::ScalerCfg cfg;
-	GlobalThreadPool::Loop(std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, placeholder::_1, placeholder::_2), 0, height);
-}
-
-void TextureScalerDX9::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
-	bufTmp1.resize(width*height*factor);
-	u32 *tmpBuf = bufTmp1.data();
-	GlobalThreadPool::Loop(std::bind(&bilinearH, factor, source, tmpBuf, width, placeholder::_1, placeholder::_2), 0, height);
-	GlobalThreadPool::Loop(std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, placeholder::_1, placeholder::_2), 0, height);
-}
-
-void TextureScalerDX9::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
-	GlobalThreadPool::Loop(std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, placeholder::_1, placeholder::_2), 0, height);
-}
-
-void TextureScalerDX9::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
-	GlobalThreadPool::Loop(std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, placeholder::_1, placeholder::_2), 0, height);
-}
-
-void TextureScalerDX9::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
-	// Basic algorithm:
-	// 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly
-	// 2) generate 2 scaled images: A - using Bilinear filtering, B - using xBRZ
-	// 3) output = A*C + B*(1-C)
-	
-	const static int KERNEL_SPLAT[3][3] = {
-		{ 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
-	};
-	
-	bufTmp1.resize(width*height);
-	bufTmp2.resize(width*height*factor*factor);
-	bufTmp3.resize(width*height*factor*factor);
-	GlobalThreadPool::Loop(std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, placeholder::_1, placeholder::_2), 0, height);
-	GlobalThreadPool::Loop(std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, placeholder::_1, placeholder::_2), 0, height);
-	ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
-	// mask C is now in bufTmp3
-
-	ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
-	// xBRZ upscaled source is in bufTmp2
-
-	if(bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
-	else ScaleBilinear(factor, source, dest, width, height);
-	// Upscaled source is in dest
-
-	// Now we can mix it all together
-	// The factor 8192 was found through practical testing on a variety of textures
-	GlobalThreadPool::Loop(std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, placeholder::_1, placeholder::_2), 0, height*factor);
-}
-
-void TextureScalerDX9::DePosterize(u32* source, u32* dest, int width, int height) {
-	bufTmp3.resize(width*height);
-	GlobalThreadPool::Loop(std::bind(&deposterizeH, source, bufTmp3.data(), width, placeholder::_1, placeholder::_2), 0, height);
-	GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height);
-	GlobalThreadPool::Loop(std::bind(&deposterizeH, dest, bufTmp3.data(), width, placeholder::_1, placeholder::_2), 0, height);
-	GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height);
-}
-
-void TextureScalerDX9::ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height) {
-	switch(format) {
-	case D3DFMT_A8R8G8B8:
-		dest = source; // already fine
-		break;
-
-	case D3DFMT_A4R4G4B4:
-		GlobalThreadPool::Loop(std::bind(&convert4444, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
-		break;
-
-	case D3DFMT_R5G6B5:
-		GlobalThreadPool::Loop(std::bind(&convert565, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
-		break;
-
-	case D3DFMT_A1R5G5B5:
-		GlobalThreadPool::Loop(std::bind(&convert5551, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
-		break;
-
-	default:
-		dest = source;
-		ERROR_LOG(G3D, "iXBRZTexScaling: unsupported texture format");
-	}
-}
-
-}
diff --git a/GPU/Directx9/TextureScalerDX9.h b/GPU/Directx9/TextureScalerDX9.h
deleted file mode 100644
index 2da5484645..0000000000
--- a/GPU/Directx9/TextureScalerDX9.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) 2012- PPSSPP Project.
-
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, version 2.0 or later versions.
-
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License 2.0 for more details.
-
-// A copy of the GPL 2.0 should have been included with the program.
-// If not, see http://www.gnu.org/licenses/
-
-// Official git repository and contact information can be found at
-// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
-
-#pragma once
-
-#include "Common/MemoryUtil.h"
-#include "../Globals.h"
-#include "helper/global.h"
-//#include "gfx/gl_common.h"
-
-#include <vector>
-
-namespace DX9 {
-
-
-class TextureScalerDX9 {
-public:
-	TextureScalerDX9();
-
-	void Scale(u32* &data, u32 &dstfmt, int &width, int &height, int factor);
-
-	enum { XBRZ= 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3 };
-
-private:
-	void ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height);
-	void ScaleBilinear(int factor, u32* source, u32* dest, int width, int height);
-	void ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height);
-	void ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height);
-	void ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic = false);
-	void ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height);
-
-	void DePosterize(u32* source, u32* dest, int width, int height);
-
-	bool IsEmptyOrFlat(u32* data, int pixels, u32 fmt);
-
-	// depending on the factor and texture sizes, these can get pretty large 
-	// maximum is (100 MB total for a 512 by 512 texture with scaling factor 5 and hybrid scaling)
-	// of course, scaling factor 5 is totally silly anyway
-	SimpleBuf<u32> bufInput, bufDeposter, bufOutput, bufTmp1, bufTmp2, bufTmp3;
-};
-
-};
diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp
index 4d2c67d19b..f1489b713b 100644
--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@@ -2002,8 +2002,22 @@ void TextureCache::LoadTextureLevel(TexCacheEntry &entry, int level, bool replac
 	bool useBGRA = UseBGRA8888() && dstFmt == GL_UNSIGNED_BYTE;
 
 	u32 *pixelData = (u32 *)finalBuf;
-	if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0)
-		scaler.Scale(pixelData, dstFmt, w, h, scaleFactor);
+	if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) {
+		GEBufferFormat dstFormat;
+		switch (dstFmt) {
+		case GL_UNSIGNED_BYTE: dstFormat = GE_FORMAT_8888; break;
+		case GL_UNSIGNED_SHORT_4_4_4_4: dstFormat = GE_FORMAT_4444; break;
+		case GL_UNSIGNED_SHORT_5_6_5: dstFormat = GE_FORMAT_565; break;
+		case GL_UNSIGNED_SHORT_5_5_5_1: dstFormat = GE_FORMAT_5551; break;
+		}
+		scaler.Scale(pixelData, dstFormat, w, h, scaleFactor);
+		switch (dstFormat) {
+		case GE_FORMAT_565: dstFmt = GL_UNSIGNED_SHORT_5_6_5; break;
+		case GE_FORMAT_5551: dstFmt = GL_UNSIGNED_SHORT_5_5_5_1; break;
+		case GE_FORMAT_4444: dstFmt = GL_UNSIGNED_SHORT_4_4_4_4; break;
+		case GE_FORMAT_8888: dstFmt = GL_UNSIGNED_BYTE; break;
+		}
+	}
 
 	if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) {
 		TexCacheEntry::Status alphaStatus = CheckAlpha(pixelData, dstFmt, useUnpack ? bufw : w, w, h);
diff --git a/GPU/GLES/TextureCache.h b/GPU/GLES/TextureCache.h
index c4bb20781b..cc186a3f98 100644
--- a/GPU/GLES/TextureCache.h
+++ b/GPU/GLES/TextureCache.h
@@ -25,7 +25,7 @@
 #include "Globals.h"
 #include "GPU/GPUInterface.h"
 #include "GPU/GPUState.h"
-#include "GPU/GLES/TextureScaler.h"
+#include "GPU/Common/TextureScaler.h"
 #include "GPU/Common/TextureCacheCommon.h"
 
 struct VirtualFramebuffer;
diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj
index 79e07383b2..08511d2930 100644
--- a/GPU/GPU.vcxproj
+++ b/GPU/GPU.vcxproj
@@ -191,6 +191,7 @@
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </ClInclude>
     <ClInclude Include="Common\TextureCacheCommon.h" />
+    <ClInclude Include="Common\TextureScaler.h" />
     <ClInclude Include="Common\TransformCommon.h" />
     <ClInclude Include="Common\VertexDecoderCommon.h" />
     <ClInclude Include="Debugger\Breakpoints.h" />
@@ -204,7 +205,6 @@
     <ClInclude Include="Directx9\ShaderManagerDX9.h" />
     <ClInclude Include="Directx9\StateMappingDX9.h" />
     <ClInclude Include="Directx9\TextureCacheDX9.h" />
-    <ClInclude Include="Directx9\TextureScalerDX9.h" />
     <ClInclude Include="Directx9\TransformPipelineDX9.h" />
     <ClInclude Include="Directx9\VertexShaderGeneratorDX9.h" />
     <ClInclude Include="ge_constants.h" />
@@ -217,7 +217,6 @@
     <ClInclude Include="GLES\ShaderManager.h" />
     <ClInclude Include="GLES\StateMapping.h" />
     <ClInclude Include="GLES\TextureCache.h" />
-    <ClInclude Include="GLES\TextureScaler.h" />
     <ClInclude Include="GLES\TransformPipeline.h" />
     <ClInclude Include="GLES\VertexShaderGenerator.h" />
     <ClInclude Include="GPUCommon.h" />
@@ -246,6 +245,7 @@
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </ClCompile>
     <ClCompile Include="Common\TextureCacheCommon.cpp" />
+    <ClCompile Include="Common\TextureScaler.cpp" />
     <ClCompile Include="Common\TransformCommon.cpp" />
     <ClCompile Include="Common\SoftwareTransformCommon.cpp" />
     <ClCompile Include="Common\VertexDecoderArm.cpp">
@@ -269,7 +269,6 @@
     <ClCompile Include="Directx9\StateMappingDX9.cpp" />
     <ClCompile Include="Directx9\StencilBufferDX9.cpp" />
     <ClCompile Include="Directx9\TextureCacheDX9.cpp" />
-    <ClCompile Include="Directx9\TextureScalerDX9.cpp" />
     <ClCompile Include="Directx9\TransformPipelineDX9.cpp" />
     <ClCompile Include="Directx9\VertexShaderGeneratorDX9.cpp" />
     <ClCompile Include="GeDisasm.cpp" />
@@ -283,7 +282,6 @@
     <ClCompile Include="GLES\StateMapping.cpp" />
     <ClCompile Include="GLES\StencilBuffer.cpp" />
     <ClCompile Include="GLES\TextureCache.cpp" />
-    <ClCompile Include="GLES\TextureScaler.cpp" />
     <ClCompile Include="GLES\TransformPipeline.cpp" />
     <ClCompile Include="GLES\VertexShaderGenerator.cpp" />
     <ClCompile Include="GPUCommon.cpp" />
diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters
index d5dc894551..c516ce7ec2 100644
--- a/GPU/GPU.vcxproj.filters
+++ b/GPU/GPU.vcxproj.filters
@@ -42,6 +42,9 @@
     <ClInclude Include="GPUCommon.h">
       <Filter>Common</Filter>
     </ClInclude>
+    <ClInclude Include="Common\TextureScaler.h">
+      <Filter>Common</Filter>
+    </ClInclude>
     <ClInclude Include="Software\Clipper.h">
       <Filter>Software</Filter>
     </ClInclude>
@@ -69,9 +72,6 @@
     <ClInclude Include="GLES\TextureCache.h">
       <Filter>GLES</Filter>
     </ClInclude>
-    <ClInclude Include="GLES\TextureScaler.h">
-      <Filter>GLES</Filter>
-    </ClInclude>
     <ClInclude Include="GLES\TransformPipeline.h">
       <Filter>GLES</Filter>
     </ClInclude>
@@ -93,9 +93,6 @@
     <ClInclude Include="Directx9\TransformPipelineDX9.h">
       <Filter>DirectX9</Filter>
     </ClInclude>
-    <ClInclude Include="Directx9\TextureScalerDX9.h">
-      <Filter>DirectX9</Filter>
-    </ClInclude>
     <ClInclude Include="Directx9\TextureCacheDX9.h">
       <Filter>DirectX9</Filter>
     </ClInclude>
@@ -254,11 +251,8 @@
     <ClCompile Include="Directx9\PixelShaderGeneratorDX9.cpp">
       <Filter>DirectX9</Filter>
     </ClCompile>
-    <ClCompile Include="Directx9\TextureScalerDX9.cpp">
-      <Filter>DirectX9</Filter>
-    </ClCompile>
-    <ClCompile Include="GLES\TextureScaler.cpp">
-      <Filter>GLES</Filter>
+    <ClCompile Include="Common\TextureScaler.cpp">
+      <Filter>Common</Filter>
     </ClCompile>
     <ClCompile Include="Common\IndexGenerator.cpp">
       <Filter>Common</Filter>
diff --git a/Qt/GPU.pro b/Qt/GPU.pro
index f14c017d2e..5d68df4851 100644
--- a/Qt/GPU.pro
+++ b/Qt/GPU.pro
@@ -36,7 +36,6 @@ SOURCES += $$P/GPU/GeDisasm.cpp \ # GPU
 	$$P/GPU/GLES/StateMapping.cpp \
 	$$P/GPU/GLES/StencilBuffer.cpp \
 	$$P/GPU/GLES/TextureCache.cpp \
-	$$P/GPU/GLES/TextureScaler.cpp \
 	$$P/GPU/GLES/TransformPipeline.cpp \
 	$$P/GPU/GLES/VertexShaderGenerator.cpp \
 	$$P/GPU/Software/*.cpp \
@@ -44,6 +43,7 @@ SOURCES += $$P/GPU/GeDisasm.cpp \ # GPU
 	$$P/GPU/Common/IndexGenerator.cpp \
 	$$P/GPU/Common/TextureDecoder.cpp \
 	$$P/GPU/Common/VertexDecoderCommon.cpp \
+	$$P/GPU/Common/TextureScaler.cpp \
 	$$P/GPU/Common/TextureCacheCommon.cpp \
 	$$P/GPU/Common/TransformCommon.cpp \
 	$$P/GPU/Common/SoftwareTransformCommon.cpp \
diff --git a/Windows/WndMainWindow.cpp b/Windows/WndMainWindow.cpp
index bfccea360a..417e37f5cb 100644
--- a/Windows/WndMainWindow.cpp
+++ b/Windows/WndMainWindow.cpp
@@ -68,7 +68,7 @@
 #include "GPU/GPUInterface.h"
 #include "GPU/GPUState.h"
 #include "gfx_es2/gpu_features.h"
-#include "GPU/GLES/TextureScaler.h"
+#include "GPU/Common/TextureScaler.h"
 #include "GPU/GLES/TextureCache.h"
 #include "GPU/GLES/Framebuffer.h"
 #include "ControlMapping.h"
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index 79c5499e65..0e40a5a576 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -153,6 +153,7 @@ EXEC_AND_LIB_FILES := \
   $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \
   $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \
   $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \
+  $(SRC)/GPU/Common/TextureScaler.cpp.arm \
   $(SRC)/GPU/Common/SplineCommon.cpp.arm \
   $(SRC)/GPU/Common/DrawEngineCommon.cpp.arm \
   $(SRC)/GPU/Common/TransformCommon.cpp.arm \
@@ -171,7 +172,6 @@ EXEC_AND_LIB_FILES := \
   $(SRC)/GPU/GLES/VertexShaderGenerator.cpp.arm \
   $(SRC)/GPU/GLES/FragmentShaderGenerator.cpp.arm \
   $(SRC)/GPU/GLES/FragmentTestCache.cpp.arm \
-  $(SRC)/GPU/GLES/TextureScaler.cpp \
   $(SRC)/GPU/GLES/Spline.cpp \
   $(SRC)/GPU/Null/NullGpu.cpp \
   $(SRC)/GPU/Software/Clipper.cpp \