Optimize the xBRZ texture scaling shaders by 30% on Adreno, less on Mali

This commit is contained in:
Henrik Rydgård 2021-12-12 18:48:42 +01:00
parent dc9816ceac
commit fb3b7f8e23
2 changed files with 72 additions and 63 deletions

View file

@ -10,10 +10,7 @@
#define STEEP_DIRECTION_THRESHOLD 2.2
#define DOMINANT_DIRECTION_THRESHOLD 3.6
// Collapses a color into a single float: r is weighted by 65536, g by 256
// and b by 1, and the three are summed. The alpha channel is ignored.
float reduce(vec4 color) {
return dot(color.rgb, vec3(65536.0, 256.0, 1.0));
}
// TODO: Replace this with something cheaper.
float DistYCbCr(vec4 pixA, vec4 pixB) {
// https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion
const vec3 K = vec3(0.2627, 0.6780, 0.0593);
@ -23,8 +20,8 @@ float DistYCbCr(vec4 pixA, vec4 pixB) {
vec4 diff = pixA - pixB;
vec3 YCbCr = diff.rgb * MATRIX;
YCbCr.x *= LUMINANCE_WEIGHT;
float d = length(YCbCr);
return sqrt(pixA.a * pixB.a * d * d + diff.a * diff.a);
float d = dot(YCbCr, YCbCr);
return sqrt(pixA.a * pixB.a * d + diff.a * diff.a);
}
bool IsPixEqual(const vec4 pixA, const vec4 pixB) {
@ -36,6 +33,10 @@ bool IsBlendingNeeded(const ivec4 blend) {
return diff.x != 0 || diff.y != 0 || diff.z != 0 || diff.w != 0;
}
// Reads the source pixel at coord through readColoru and returns its uint
// result unchanged. x is clamped to [0, params.width - 1] and y to
// [0, params.height - 1], so out-of-range coordinates read an edge pixel.
// NOTE(review): coord is unsigned; if the clamp is evaluated as unsigned, a
// coordinate that wrapped below zero would land on the far edge rather than
// on 0 -- confirm callers never pass one.
uint readInputu(uvec2 coord) {
return readColoru(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
}
// Reads the source pixel at coord through readColorf and returns its vec4
// result unchanged. x is clamped to [0, params.width - 1] and y to
// [0, params.height - 1], so out-of-range coordinates read an edge pixel.
vec4 readInput(uvec2 coord) {
return readColorf(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
}
@ -62,6 +63,17 @@ void applyScaling(uvec2 origxy) {
// 17|04|03|02|11
// |15|14|13|
uint v[9];
v[0] = readInputu(t3.yw);
v[1] = readInputu(t3.zw);
v[2] = readInputu(t4.zw);
v[3] = readInputu(t4.yw);
v[4] = readInputu(t4.xw);
v[5] = readInputu(t3.xw);
v[6] = readInputu(t2.xw);
v[7] = readInputu(t2.yw);
v[8] = readInputu(t2.zw);
vec4 src[25];
src[21] = readInput(t1.xw);
@ -86,17 +98,6 @@ void applyScaling(uvec2 origxy) {
src[10] = readInput(t7.xz);
src[11] = readInput(t7.xw);
float v[9];
v[0] = reduce(src[0]);
v[1] = reduce(src[1]);
v[2] = reduce(src[2]);
v[3] = reduce(src[3]);
v[4] = reduce(src[4]);
v[5] = reduce(src[5]);
v[6] = reduce(src[6]);
v[7] = reduce(src[7]);
v[8] = reduce(src[8]);
ivec4 blendResult = ivec4(BLEND_NONE);
// Preprocess corners
@ -253,25 +254,22 @@ void applyScaling(uvec2 origxy) {
dst[ 6] = mix(dst[ 6], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
}
// This is the only difference from tex_4xbrz.csh:
// Output Pixel Mapping:
// 06|07|08|09
// 05|00|01|10
// 04|03|02|11
// 15|14|13|12
const int order[16] = int[16](6, 7, 8, 9, 5, 0, 1, 10, 4, 3, 2, 11, 15, 14, 13, 12);
// Write the 2x2 output pixels, each the average of a 2x2 quad of the 16 computed pixels.
ivec2 destXY = ivec2(origxy) * 2;
for (int y = 0; y < 2; y++) {
for (int x = 0; x < 2; x++) {
vec4 sum = vec4(0.0);
int index = y * 4 + x * 2;
for (int iy = 0; iy < 2; iy++) {
for (int ix = 0; ix < 2; ix++) {
sum += dst[order[index + iy * 4 + ix]];
}
}
sum *= 0.25;
writeColorf(destXY + ivec2(x, y), sum);
}
}
vec4 topLeft = dst[6] + dst[7] + dst[5] + dst[0];
vec4 topRight = dst[8] + dst[9] + dst[1] + dst[10];
vec4 bottomLeft = dst[4] + dst[3] + dst[15] + dst[14];
vec4 bottomRight = dst[2] + dst[11] + dst[13] + dst[12];
writeColorf(destXY, topLeft * 0.25);
writeColorf(destXY + ivec2(1, 0), topRight * 0.25);
writeColorf(destXY + ivec2(0, 1), bottomLeft * 0.25);
writeColorf(destXY + ivec2(1, 1), bottomRight * 0.25);
}

View file

@ -10,10 +10,7 @@
#define STEEP_DIRECTION_THRESHOLD 2.2
#define DOMINANT_DIRECTION_THRESHOLD 3.6
// Collapses a color into a single float: r is weighted by 65536, g by 256
// and b by 1, and the three are summed. The alpha channel is ignored.
float reduce(vec4 color) {
return dot(color.rgb, vec3(65536.0, 256.0, 1.0));
}
// TODO: Replace this with something cheaper.
float DistYCbCr(vec4 pixA, vec4 pixB) {
// https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion
const vec3 K = vec3(0.2627, 0.6780, 0.0593);
@ -23,8 +20,8 @@ float DistYCbCr(vec4 pixA, vec4 pixB) {
vec4 diff = pixA - pixB;
vec3 YCbCr = diff.rgb * MATRIX;
YCbCr.x *= LUMINANCE_WEIGHT;
float d = length(YCbCr);
return sqrt(pixA.a * pixB.a * d * d + diff.a * diff.a);
float d = dot(YCbCr, YCbCr);
return sqrt(pixA.a * pixB.a * d + diff.a * diff.a);
}
bool IsPixEqual(const vec4 pixA, const vec4 pixB) {
@ -36,6 +33,10 @@ bool IsBlendingNeeded(const ivec4 blend) {
return diff.x != 0 || diff.y != 0 || diff.z != 0 || diff.w != 0;
}
// Reads the source pixel at coord through readColoru and returns its uint
// result unchanged. x is clamped to [0, params.width - 1] and y to
// [0, params.height - 1], so out-of-range coordinates read an edge pixel.
// The returned value is later passed to unpackUnorm4x8 by applyScaling, so
// it is presumably a packed 8-bit-per-channel color -- confirm against
// readColoru.
// NOTE(review): coord is unsigned; if the clamp is evaluated as unsigned, a
// coordinate that wrapped below zero would land on the far edge rather than
// on 0 -- confirm callers never pass one.
uint readInputu(uvec2 coord) {
return readColoru(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
}
// Reads the source pixel at coord through readColorf and returns its vec4
// result unchanged. x is clamped to [0, params.width - 1] and y to
// [0, params.height - 1], so out-of-range coordinates read an edge pixel.
vec4 readInput(uvec2 coord) {
return readColorf(uvec2(clamp(coord.x, 0, params.width - 1), clamp(coord.y, 0, params.height - 1)));
}
@ -62,20 +63,31 @@ void applyScaling(uvec2 origxy) {
// 17|04|03|02|11
// |15|14|13|
uint v[9];
v[0] = readInputu(t3.yw);
v[1] = readInputu(t3.zw);
v[2] = readInputu(t4.zw);
v[3] = readInputu(t4.yw);
v[4] = readInputu(t4.xw);
v[5] = readInputu(t3.xw);
v[6] = readInputu(t2.xw);
v[7] = readInputu(t2.yw);
v[8] = readInputu(t2.zw);
vec4 src[25];
src[21] = readInput(t1.xw);
src[22] = readInput(t1.yw);
src[23] = readInput(t1.zw);
src[ 6] = readInput(t2.xw);
src[ 7] = readInput(t2.yw);
src[ 8] = readInput(t2.zw);
src[ 5] = readInput(t3.xw);
src[ 0] = readInput(t3.yw);
src[ 1] = readInput(t3.zw);
src[ 4] = readInput(t4.xw);
src[ 3] = readInput(t4.yw);
src[ 2] = readInput(t4.zw);
src[ 6] = unpackUnorm4x8(v[6]);
src[ 7] = unpackUnorm4x8(v[7]);
src[ 8] = unpackUnorm4x8(v[8]);
src[ 5] = unpackUnorm4x8(v[5]);
src[ 0] = unpackUnorm4x8(v[0]);
src[ 1] = unpackUnorm4x8(v[1]);
src[ 4] = unpackUnorm4x8(v[4]);
src[ 3] = unpackUnorm4x8(v[3]);
src[ 2] = unpackUnorm4x8(v[2]);
src[15] = readInput(t5.xw);
src[14] = readInput(t5.yw);
src[13] = readInput(t5.zw);
@ -86,17 +98,6 @@ void applyScaling(uvec2 origxy) {
src[10] = readInput(t7.xz);
src[11] = readInput(t7.xw);
float v[9];
v[0] = reduce(src[0]);
v[1] = reduce(src[1]);
v[2] = reduce(src[2]);
v[3] = reduce(src[3]);
v[4] = reduce(src[4]);
v[5] = reduce(src[5]);
v[6] = reduce(src[6]);
v[7] = reduce(src[7]);
v[8] = reduce(src[8]);
ivec4 blendResult = ivec4(BLEND_NONE);
// Preprocess corners
@ -258,12 +259,22 @@ void applyScaling(uvec2 origxy) {
// 05|00|01|10
// 04|03|02|11
// 15|14|13|12
const int order[16] = int[16](6, 7, 8, 9, 5, 0, 1, 10, 4, 3, 2, 11, 15, 14, 13, 12);
// Write all 16 output pixels.
ivec2 destXY = ivec2(origxy) * 4;
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
writeColorf(destXY + ivec2(x, y), dst[order[y * 4 + x]]);
}
}
writeColorf(destXY, dst[6]);
writeColorf(destXY + ivec2(1, 0), dst[7]);
writeColorf(destXY + ivec2(2, 0), dst[8]);
writeColorf(destXY + ivec2(3, 0), dst[9]);
writeColorf(destXY + ivec2(0, 1), dst[5]);
writeColorf(destXY + ivec2(1, 1), dst[0]);
writeColorf(destXY + ivec2(2, 1), dst[1]);
writeColorf(destXY + ivec2(3, 1), dst[10]);
writeColorf(destXY + ivec2(0, 2), dst[4]);
writeColorf(destXY + ivec2(1, 2), dst[3]);
writeColorf(destXY + ivec2(2, 2), dst[2]);
writeColorf(destXY + ivec2(3, 2), dst[11]);
writeColorf(destXY + ivec2(0, 3), dst[15]);
writeColorf(destXY + ivec2(1, 3), dst[14]);
writeColorf(destXY + ivec2(2, 3), dst[13]);
writeColorf(destXY + ivec2(3, 3), dst[12]);
}