mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Optimize the xBRZ texture scaling shaders by 30% on Adreno, less on Mali
This commit is contained in:
parent
dc9816ceac
commit
fb3b7f8e23
2 changed files with 72 additions and 63 deletions
|
@ -10,10 +10,7 @@
|
|||
#define STEEP_DIRECTION_THRESHOLD 2.2
|
||||
#define DOMINANT_DIRECTION_THRESHOLD 3.6
|
||||
|
||||
// Collapses a color into a single float: r * 65536 + g * 256 + b.
// The alpha channel is not used.
float reduce(vec4 color) {
|
||||
return dot(color.rgb, vec3(65536.0, 256.0, 1.0));
|
||||
}
|
||||
|
||||
// TODO: Replace this with something cheaper.
|
||||
float DistYCbCr(vec4 pixA, vec4 pixB) {
|
||||
// https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion
|
||||
const vec3 K = vec3(0.2627, 0.6780, 0.0593);
|
||||
|
@ -23,8 +20,8 @@ float DistYCbCr(vec4 pixA, vec4 pixB) {
|
|||
vec4 diff = pixA - pixB;
|
||||
vec3 YCbCr = diff.rgb * MATRIX;
|
||||
YCbCr.x *= LUMINANCE_WEIGHT;
|
||||
float d = length(YCbCr);
|
||||
return sqrt(pixA.a * pixB.a * d * d + diff.a * diff.a);
|
||||
float d = dot(YCbCr, YCbCr);
|
||||
return sqrt(pixA.a * pixB.a * d + diff.a * diff.a);
|
||||
}
|
||||
|
||||
bool IsPixEqual(const vec4 pixA, const vec4 pixB) {
|
||||
|
@ -36,6 +33,10 @@ bool IsBlendingNeeded(const ivec4 blend) {
|
|||
return diff.x != 0 || diff.y != 0 || diff.z != 0 || diff.w != 0;
|
||||
}
|
||||
|
||||
// Returns readColoru() for the given coordinate, as a uint, after clamping
// x to [0, params.width - 1] and y to [0, params.height - 1].
// NOTE(review): coord is unsigned, so a value that wrapped below zero would
// presumably clamp to the far edge rather than to 0 - confirm callers never underflow.
uint readInputu(uvec2 coord) {
|
||||
return readColoru(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
|
||||
}
|
||||
|
||||
// Returns readColorf() for the given coordinate, as a vec4, after clamping
// x to [0, params.width - 1] and y to [0, params.height - 1].
vec4 readInput(uvec2 coord) {
|
||||
return readColorf(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
|
||||
}
|
||||
|
@ -62,6 +63,17 @@ void applyScaling(uvec2 origxy) {
|
|||
// 17|04|03|02|11
|
||||
// |15|14|13|
|
||||
|
||||
uint v[9];
|
||||
v[0] = readInputu(t3.yw);
|
||||
v[1] = readInputu(t3.zw);
|
||||
v[2] = readInputu(t4.zw);
|
||||
v[3] = readInputu(t4.yw);
|
||||
v[4] = readInputu(t4.xw);
|
||||
v[5] = readInputu(t3.xw);
|
||||
v[6] = readInputu(t2.xw);
|
||||
v[7] = readInputu(t2.yw);
|
||||
v[8] = readInputu(t2.zw);
|
||||
|
||||
vec4 src[25];
|
||||
|
||||
src[21] = readInput(t1.xw);
|
||||
|
@ -86,17 +98,6 @@ void applyScaling(uvec2 origxy) {
|
|||
src[10] = readInput(t7.xz);
|
||||
src[11] = readInput(t7.xw);
|
||||
|
||||
float v[9];
|
||||
v[0] = reduce(src[0]);
|
||||
v[1] = reduce(src[1]);
|
||||
v[2] = reduce(src[2]);
|
||||
v[3] = reduce(src[3]);
|
||||
v[4] = reduce(src[4]);
|
||||
v[5] = reduce(src[5]);
|
||||
v[6] = reduce(src[6]);
|
||||
v[7] = reduce(src[7]);
|
||||
v[8] = reduce(src[8]);
|
||||
|
||||
ivec4 blendResult = ivec4(BLEND_NONE);
|
||||
|
||||
// Preprocess corners
|
||||
|
@ -253,25 +254,22 @@ void applyScaling(uvec2 origxy) {
|
|||
dst[ 6] = mix(dst[ 6], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
|
||||
}
|
||||
|
||||
// This is the only difference from tex_4xbrz.csh:
|
||||
|
||||
// Output Pixel Mapping:
|
||||
// 06|07|08|09
|
||||
// 05|00|01|10
|
||||
// 04|03|02|11
|
||||
// 15|14|13|12
|
||||
const int order[16] = int[16](6, 7, 8, 9, 5, 0, 1, 10, 4, 3, 2, 11, 15, 14, 13, 12);
|
||||
// Write the 4 output pixels, each the average of a 2x2 group of the 16 computed pixels.
|
||||
ivec2 destXY = ivec2(origxy) * 2;
|
||||
for (int y = 0; y < 2; y++) {
|
||||
for (int x = 0; x < 2; x++) {
|
||||
vec4 sum = vec4(0.0);
|
||||
int index = y * 4 + x * 2;
|
||||
for (int iy = 0; iy < 2; iy++) {
|
||||
for (int ix = 0; ix < 2; ix++) {
|
||||
sum += dst[order[index + iy * 4 + ix]];
|
||||
}
|
||||
}
|
||||
sum *= 0.25;
|
||||
writeColorf(destXY + ivec2(x, y), sum);
|
||||
}
|
||||
}
|
||||
|
||||
vec4 topLeft = dst[6] + dst[7] + dst[5] + dst[0];
|
||||
vec4 topRight = dst[8] + dst[9] + dst[1] + dst[10];
|
||||
vec4 bottomLeft = dst[4] + dst[3] + dst[15] + dst[14];
|
||||
vec4 bottomRight = dst[2] + dst[11] + dst[13] + dst[12];
|
||||
writeColorf(destXY, topLeft * 0.25);
|
||||
writeColorf(destXY + ivec2(1, 0), topRight * 0.25);
|
||||
writeColorf(destXY + ivec2(0, 1), bottomLeft * 0.25);
|
||||
writeColorf(destXY + ivec2(1, 1), bottomRight * 0.25);
|
||||
}
|
||||
|
|
|
@ -10,10 +10,7 @@
|
|||
#define STEEP_DIRECTION_THRESHOLD 2.2
|
||||
#define DOMINANT_DIRECTION_THRESHOLD 3.6
|
||||
|
||||
// Collapses a color into a single float: r * 65536 + g * 256 + b.
// The alpha channel is not used.
float reduce(vec4 color) {
|
||||
return dot(color.rgb, vec3(65536.0, 256.0, 1.0));
|
||||
}
|
||||
|
||||
// TODO: Replace this with something cheaper.
|
||||
float DistYCbCr(vec4 pixA, vec4 pixB) {
|
||||
// https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion
|
||||
const vec3 K = vec3(0.2627, 0.6780, 0.0593);
|
||||
|
@ -23,8 +20,8 @@ float DistYCbCr(vec4 pixA, vec4 pixB) {
|
|||
vec4 diff = pixA - pixB;
|
||||
vec3 YCbCr = diff.rgb * MATRIX;
|
||||
YCbCr.x *= LUMINANCE_WEIGHT;
|
||||
float d = length(YCbCr);
|
||||
return sqrt(pixA.a * pixB.a * d * d + diff.a * diff.a);
|
||||
float d = dot(YCbCr, YCbCr);
|
||||
return sqrt(pixA.a * pixB.a * d + diff.a * diff.a);
|
||||
}
|
||||
|
||||
bool IsPixEqual(const vec4 pixA, const vec4 pixB) {
|
||||
|
@ -36,6 +33,10 @@ bool IsBlendingNeeded(const ivec4 blend) {
|
|||
return diff.x != 0 || diff.y != 0 || diff.z != 0 || diff.w != 0;
|
||||
}
|
||||
|
||||
// Returns readColoru() for the given coordinate, as a uint, after clamping
// x to [0, params.width - 1] and y to [0, params.height - 1].
// NOTE(review): coord is unsigned, so a value that wrapped below zero would
// presumably clamp to the far edge rather than to 0 - confirm callers never underflow.
uint readInputu(uvec2 coord) {
|
||||
return readColoru(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
|
||||
}
|
||||
|
||||
// Returns readColorf() for the given coordinate, as a vec4, after clamping
// x to [0, params.width - 1] and y to [0, params.height - 1].
vec4 readInput(uvec2 coord) {
|
||||
return readColorf(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
|
||||
}
|
||||
|
@ -62,20 +63,31 @@ void applyScaling(uvec2 origxy) {
|
|||
// 17|04|03|02|11
|
||||
// |15|14|13|
|
||||
|
||||
uint v[9];
|
||||
v[0] = readInputu(t3.yw);
|
||||
v[1] = readInputu(t3.zw);
|
||||
v[2] = readInputu(t4.zw);
|
||||
v[3] = readInputu(t4.yw);
|
||||
v[4] = readInputu(t4.xw);
|
||||
v[5] = readInputu(t3.xw);
|
||||
v[6] = readInputu(t2.xw);
|
||||
v[7] = readInputu(t2.yw);
|
||||
v[8] = readInputu(t2.zw);
|
||||
|
||||
vec4 src[25];
|
||||
|
||||
src[21] = readInput(t1.xw);
|
||||
src[22] = readInput(t1.yw);
|
||||
src[23] = readInput(t1.zw);
|
||||
src[ 6] = readInput(t2.xw);
|
||||
src[ 7] = readInput(t2.yw);
|
||||
src[ 8] = readInput(t2.zw);
|
||||
src[ 5] = readInput(t3.xw);
|
||||
src[ 0] = readInput(t3.yw);
|
||||
src[ 1] = readInput(t3.zw);
|
||||
src[ 4] = readInput(t4.xw);
|
||||
src[ 3] = readInput(t4.yw);
|
||||
src[ 2] = readInput(t4.zw);
|
||||
src[ 6] = unpackUnorm4x8(v[6]);
|
||||
src[ 7] = unpackUnorm4x8(v[7]);
|
||||
src[ 8] = unpackUnorm4x8(v[8]);
|
||||
src[ 5] = unpackUnorm4x8(v[5]);
|
||||
src[ 0] = unpackUnorm4x8(v[0]);
|
||||
src[ 1] = unpackUnorm4x8(v[1]);
|
||||
src[ 4] = unpackUnorm4x8(v[4]);
|
||||
src[ 3] = unpackUnorm4x8(v[3]);
|
||||
src[ 2] = unpackUnorm4x8(v[2]);
|
||||
src[15] = readInput(t5.xw);
|
||||
src[14] = readInput(t5.yw);
|
||||
src[13] = readInput(t5.zw);
|
||||
|
@ -86,17 +98,6 @@ void applyScaling(uvec2 origxy) {
|
|||
src[10] = readInput(t7.xz);
|
||||
src[11] = readInput(t7.xw);
|
||||
|
||||
float v[9];
|
||||
v[0] = reduce(src[0]);
|
||||
v[1] = reduce(src[1]);
|
||||
v[2] = reduce(src[2]);
|
||||
v[3] = reduce(src[3]);
|
||||
v[4] = reduce(src[4]);
|
||||
v[5] = reduce(src[5]);
|
||||
v[6] = reduce(src[6]);
|
||||
v[7] = reduce(src[7]);
|
||||
v[8] = reduce(src[8]);
|
||||
|
||||
ivec4 blendResult = ivec4(BLEND_NONE);
|
||||
|
||||
// Preprocess corners
|
||||
|
@ -258,12 +259,22 @@ void applyScaling(uvec2 origxy) {
|
|||
// 05|00|01|10
|
||||
// 04|03|02|11
|
||||
// 15|14|13|12
|
||||
const int order[16] = int[16](6, 7, 8, 9, 5, 0, 1, 10, 4, 3, 2, 11, 15, 14, 13, 12);
|
||||
// Write all 16 output pixels.
|
||||
ivec2 destXY = ivec2(origxy) * 4;
|
||||
for (int y = 0; y < 4; y++) {
|
||||
for (int x = 0; x < 4; x++) {
|
||||
writeColorf(destXY + ivec2(x, y), dst[order[y * 4 + x]]);
|
||||
}
|
||||
}
|
||||
writeColorf(destXY, dst[6]);
|
||||
writeColorf(destXY + ivec2(1, 0), dst[7]);
|
||||
writeColorf(destXY + ivec2(2, 0), dst[8]);
|
||||
writeColorf(destXY + ivec2(3, 0), dst[9]);
|
||||
writeColorf(destXY + ivec2(0, 1), dst[5]);
|
||||
writeColorf(destXY + ivec2(1, 1), dst[0]);
|
||||
writeColorf(destXY + ivec2(2, 1), dst[1]);
|
||||
writeColorf(destXY + ivec2(3, 1), dst[10]);
|
||||
writeColorf(destXY + ivec2(0, 2), dst[4]);
|
||||
writeColorf(destXY + ivec2(1, 2), dst[3]);
|
||||
writeColorf(destXY + ivec2(2, 2), dst[2]);
|
||||
writeColorf(destXY + ivec2(3, 2), dst[11]);
|
||||
writeColorf(destXY + ivec2(0, 3), dst[15]);
|
||||
writeColorf(destXY + ivec2(1, 3), dst[14]);
|
||||
writeColorf(destXY + ivec2(2, 3), dst[13]);
|
||||
writeColorf(destXY + ivec2(3, 3), dst[12]);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue