n64-emu/third_party/parallel-rdp-standalone/parallel-rdp/shaders/span_setup.comp
2023-08-06 14:03:29 +09:00

227 lines
8 KiB
Text

#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
#include "debug.h"
// Workgroup size in X is supplied through specialization constant 0.
layout(local_size_x_id = 0) in;
// Upscaling factor expressed as a power of two, supplied through specialization constant 1.
// The default of 0 gives SCALING_FACTOR == 1.
layout(constant_id = 1) const int SCALING_LOG2 = 0;
const int SCALING_FACTOR = 1 << SCALING_LOG2;
#include "data_structures.h"
// Read-only input (set 0, binding 0): triangle setup records.
// main() obtains one via load_triangle_setup(primitive_index); presumably that helper
// (from the header included right after) indexes this buffer -- TODO confirm.
layout(std430, set = 0, binding = 0) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
// Read-only input (set 0, binding 1): attribute setup records,
// obtained in main() via load_attribute_setup(primitive_index).
layout(std430, set = 0, binding = 1) readonly buffer AttributeSetupBuffer
{
AttributeSetupMem elems[];
} attribute_setup;
#include "load_attribute_setup.h"
// Read-only input (set 0, binding 2): scissor state records,
// obtained in main() via load_scissor_state(primitive_index).
layout(set = 0, binding = 2, std430) readonly buffer ScissorStateBuffer
{
ScissorStateMem elems[];
} scissor_state;
#include "load_scissor_state.h"
// Write-only output (set 0, binding 3): span setup records.
// main() emits one via store_span_setup(gl_GlobalInvocationID.x, ...).
layout(std430, set = 0, binding = 3) writeonly buffer SpanSetups
{
SpanSetupMem elems[];
} span_setups;
#include "store_span_setup.h"
// Job list (set 1, binding 0). main() fetches the texel at gl_WorkGroupID.x and uses
// .x as the primitive index and .y / .z as the first / last scanline before upscaling.
layout(set = 1, binding = 0) uniform utextureBuffer uInterpolationJobs;
// Number of Y-subpixel rows evaluated per scanline in main().
const int SUBPIXELS = 4;
// log2(SUBPIXELS).
const int SUBPIXELS_LOG2 = 2;
// Snap fixed-point X coordinates by discarding the low 12 fractional bits.
// The discarded bits are not simply truncated: if any of them was non-zero, the
// least significant bit of the result is forced to 1 (a "sticky" bit).
// main() treats the result as having 3 fractional bits (8 X-subpixels), see its ">> 3".
ivec4 quantize_x(ivec4 x)
{
    ivec4 discarded_bits = x & 0xfff;
    ivec4 sticky_lsb = ivec4(notEqual(discarded_bits, ivec4(0)));
    return (x >> 12) | sticky_lsb;
}
// Horizontal minimum: returns the smallest of the four components of v.
int min4(ivec4 v)
{
    int lo_xz = min(v.x, v.z);
    int lo_yw = min(v.y, v.w);
    return min(lo_xz, lo_yw);
}
// Horizontal maximum: returns the largest of the four components of v.
int max4(ivec4 v)
{
    int hi_xz = max(v.x, v.z);
    int hi_yw = max(v.y, v.w);
    return max(hi_xz, hi_yw);
}
// Accumulate a derivative over dy steps, where dy is measured in upscaled units.
// dy is split into whole steps (dy >> SCALING_LOG2) and a sub-step remainder
// (low SCALING_LOG2 bits). Whole steps use the full derivative; each remaining
// sub-step uses the derivative pre-shifted right by SCALING_LOG2, i.e. quantized.
// With SCALING_LOG2 == 0 this reduces to dy * dvalue.
ivec4 interpolate_snapped(ivec4 dvalue, int dy)
{
    int whole_steps = dy >> SCALING_LOG2;
    int sub_steps = dy & (SCALING_FACTOR - 1);

    ivec4 whole_contribution = whole_steps * dvalue;
    ivec4 sub_contribution = sub_steps * (dvalue >> SCALING_LOG2);

    return whole_contribution + sub_contribution;
}
// Entry point. Each workgroup handles one interpolation job and each invocation one
// (upscaled) scanline of it. For that scanline the shader computes:
//  - snapped RGBA / STZW start values and the X they are anchored at,
//  - left/right X extents at the four Y-subpixels, clamped to the scissor,
//  - the integer X range, a validity flag and lodlength,
// and stores the result at index gl_GlobalInvocationID.x via store_span_setup().
void main()
{
// Job texel: x = primitive index, y = first scanline, z = last scanline (both before upscaling).
ivec3 job_indices = ivec3(texelFetch(uInterpolationJobs, int(gl_WorkGroupID.x)).xyz);
int primitive_index = job_indices.x;
int base_y = job_indices.y * SCALING_FACTOR;
// Last upscaled scanline of the job, inclusive.
int max_y = job_indices.z * SCALING_FACTOR + (SCALING_FACTOR - 1);
int y = base_y + int(gl_LocalInvocationIndex);
// Invocations past the end of the job have nothing to do.
if (y > max_y)
return;
TriangleSetup setup = load_triangle_setup(primitive_index);
AttributeSetup attr = load_attribute_setup(primitive_index);
ScissorState scissor = load_scissor_state(primitive_index);
// Decode the per-primitive flags used below.
bool flip = (setup.flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
bool interlace_en = (setup.flags & TRIANGLE_SETUP_INTERLACE_FIELD_BIT) != 0;
bool keep_odd_field = (setup.flags & TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT) != 0;
SpanSetup span_setup;
// Interpolate RGBA, STZW to their scanline.
{
bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
bool skip_xfrac = (setup.flags & TRIANGLE_SETUP_SKIP_XFRAC_BIT) != 0;
// setup.yh is in Y-subpixel units (it is compared against y_subs further down),
// so dropping the low 2 bits gives the whole scanline the interpolation starts from.
int y_interpolation_base = int(setup.yh) >> 2;
y_interpolation_base *= SCALING_FACTOR;
// For high-resolution interpolation, make sure we snap interpolation correctly at whole pixels,
// and quantize derivatives in-between pixels.
int dy = y - y_interpolation_base;
// dxhdy is applied per Y-subpixel elsewhere in this function; << 2 turns it into a per-scanline step.
int xh = setup.xh * SCALING_FACTOR + dy * (setup.dxhdy << 2);
ivec4 drgba_diff = ivec4(0);
ivec4 dstzw_diff = ivec4(0);
// In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
if (do_offset)
{
// Move the major edge X three subpixel steps further (scaled for upscaling).
xh += (SCALING_FACTOR * 3) * setup.dxhdy;
// Correction term built from the edge and Y derivatives with their low 9 bits cleared:
// (de - de/4) - (dy - dy/4), using arithmetic shifts for the division.
ivec4 drgba_deh = attr.drgba_de & ~0x1ff;
ivec4 drgba_dyh = attr.drgba_dy & ~0x1ff;
drgba_diff = drgba_deh - (drgba_deh >> 2) - drgba_dyh + (drgba_dyh >> 2);
ivec4 dstzw_deh = attr.dstzw_de & ~0x1ff;
ivec4 dstzw_dyh = attr.dstzw_dy & ~0x1ff;
dstzw_diff = dstzw_deh - (dstzw_deh >> 2) - dstzw_dyh + (dstzw_dyh >> 2);
}
// base_x is everything above bit 15 of xh; xfrac is the 8 bits directly below (bits 7..14),
// or zero when the primitive asks to skip the fractional correction.
int base_x = xh >> 15;
int xfrac = skip_xfrac ? 0 : ((xh >> 7) & 0xff);
// Step the attribute along the edge to this scanline, clear the low 9 bits, apply the
// do_offset correction, subtract the X-derivative contribution for the fractional X,
// and finally clear the low 10 bits.
ivec4 rgba = attr.rgba + interpolate_snapped(attr.drgba_de, dy);
rgba = ((rgba & ~0x1ff) + drgba_diff - interpolate_snapped((attr.drgba_dx >> 8) & ~1, xfrac)) & ~0x3ff;
ivec4 stzw = attr.stzw + interpolate_snapped(attr.dstzw_de, dy);
stzw = ((stzw & ~0x1ff) + dstzw_diff - interpolate_snapped((attr.dstzw_dx >> 8) & ~1, xfrac)) & ~0x3ff;
span_setup.rgba = rgba;
span_setup.stzw = stzw;
span_setup.interpolation_base_x = base_x;
}
// Check Y dimension.
// yh rounded down to a multiple of SUBPIXELS; ym is used as-is. Both are then upscaled.
int yh_interpolation_base = int(setup.yh) & ~(SUBPIXELS - 1);
int ym_interpolation_base = int(setup.ym);
yh_interpolation_base *= SCALING_FACTOR;
ym_interpolation_base *= SCALING_FACTOR;
// Coordinates of the four Y-subpixel rows of this scanline.
int y_sub = int(y * SUBPIXELS);
ivec4 y_subs = y_sub + ivec4(0, 1, 2, 3);
// Vertical range covered by the primitive, intersected with the scissor: [ylo, yhi).
int ylo = max(setup.yh, scissor.ylo) * SCALING_FACTOR;
int yhi = min(setup.yl, scissor.yhi) * SCALING_FACTOR;
bvec4 clip_lo_y = lessThan(y_subs, ivec4(ylo));
bvec4 clip_hi_y = greaterThanEqual(y_subs, ivec4(yhi));
// Non-zero for every subpixel row that falls outside [ylo, yhi).
uvec4 clip_y = uvec4(clip_lo_y) | uvec4(clip_hi_y);
// Interpolate X at all 4 Y-subpixels.
ivec4 xh = setup.xh * SCALING_FACTOR + (y_subs - yh_interpolation_base) * setup.dxhdy;
ivec4 xm = setup.xm * SCALING_FACTOR + (y_subs - yh_interpolation_base) * setup.dxmdy;
ivec4 xl = setup.xl * SCALING_FACTOR + (y_subs - ym_interpolation_base) * setup.dxldy;
// Subpixel rows above ym use the xm edge, rows at or below ym use the xl edge.
xl = mix(xl, xm, lessThan(y_subs, ivec4(SCALING_FACTOR * setup.ym)));
// If we have overflows, we can become sensitive to this in invalid_line check, where
// checks that should pass fail, and vice versa.
// Note that we shaved off one bit in triangle setup for upscaling purposes,
// so this should be 28 bits normally.
// bitfieldExtract on signed values sign-extends from the extracted width.
xl = bitfieldExtract(xl, 0, 27 + SCALING_LOG2);
xh = bitfieldExtract(xh, 0, 27 + SCALING_LOG2);
ivec4 xh_shifted = quantize_x(xh);
ivec4 xl_shifted = quantize_x(xl);
// The flip flag decides which edge is the left one.
ivec4 xleft, xright;
if (flip)
{
xleft = xh_shifted;
xright = xl_shifted;
}
else
{
xleft = xl_shifted;
xright = xh_shifted;
}
// A subpixel row is invalid when left ends up beyond right (compared with the lowest bit dropped).
bvec4 invalid_line = greaterThan(xleft >> 1, xright >> 1);
// Scissor X bounds brought to the same scale as the quantized edges.
ivec4 lo_scissor = ivec4(SCALING_FACTOR * (scissor.xlo << 1));
ivec4 hi_scissor = ivec4(SCALING_FACTOR * (scissor.xhi << 1));
// True when all four subpixel rows lie entirely right of / left of the scissor rectangle.
bool all_over = all(greaterThanEqual(min(xleft, xright), hi_scissor));
bool all_under = all(lessThan(max(xleft, xright), lo_scissor));
// Clamp both edges into [lo_scissor, hi_scissor].
xleft = max(xleft, lo_scissor);
xleft = min(xleft, hi_scissor);
xright = max(xright, lo_scissor);
xright = min(xright, hi_scissor);
// Rows clipped in Y are invalid as well.
invalid_line = bvec4(uvec4(invalid_line) | clip_y);
// Invalid rows get xleft = 0xffff and xright = 0, presumably so they do not
// influence the min4/max4 reductions below -- NOTE(review): confirm value ranges.
xleft = mix(xleft, ivec4(0xffff), invalid_line);
xright = mix(xright, ivec4(0), invalid_line);
// Integer X range over the four rows: drop the 3 fractional bits.
int start_x = min4(xleft) >> 3;
int end_x = max4(xright) >> 3;
span_setup.xleft = xleft;
span_setup.xright = xright;
span_setup.start_x = start_x;
span_setup.end_x = end_x;
// The scanline is valid if at least one subpixel row is valid and it is not fully outside the scissor in X.
span_setup.valid_line = int(!all(invalid_line) && !all_over && !all_under);
// With interlacing enabled, drop scanlines whose parity (of the non-upscaled line index)
// does not match the field selected by keep_odd_field.
if (interlace_en)
if (((y >> SCALING_LOG2) & 1) != int(keep_odd_field))
span_setup.valid_line = U16_C(0);
// Distance from the interpolation base X to the far end of the span, direction depending on flip.
span_setup.lodlength = int(flip ? (end_x - span_setup.interpolation_base_x) : (span_setup.interpolation_base_x - start_x));
store_span_setup(gl_GlobalInvocationID.x, span_setup);
}