n64-emu/third_party/parallel-rdp-standalone/parallel-rdp/shaders/interpolation.h

/* Copyright (c) 2020 Themaister
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef INTERPOLATION_H_
#define INTERPOLATION_H_

#include "data_structures.h"
#include "clamping.h"
#include "perspective.h"

// Interpolates the shade color for one pixel and returns it clamped by clamp_9bit().
//
// rgba      - color value at the span's interpolation origin (fixed point).
// drgba_dx  - per-step color derivative along X.
// drgba_dy  - per-step color derivative along Y; only used for the sub-pixel (centroid) offset.
// dx        - number of X steps from the interpolation origin.
// coverage  - 8-bit sub-sample coverage mask; its lowest set bit picks the sample position.
//             NOTE(review): findLSB() yields -1 for a zero mask, so callers presumably
//             never pass coverage == 0 - confirm.
u8x4 interpolate_rgba(ivec4 rgba, ivec4 drgba_dx, ivec4 drgba_dy, int dx, int coverage)
{
	// The low 5 bits of the X derivative are dropped before stepping along the span.
	ivec4 step_x = (drgba_dx & ~0x1f) >> SCALING_LOG2;
	ivec4 color = rgba + step_x * dx;

	// The color is interpolated to 9 bits; the extra bit exists so that a slight
	// underflow below 0 clamps to 0 and a slight overflow above 0xff clamps to 0xff.
	// Two sign bits of headroom are kept until the centroid offset has been applied.
	i16x4 color16 = i16x4(color >> 14);
	i16x4 grad_x = i16x4(drgba_dx >> 14);
	i16x4 grad_y = i16x4(drgba_dy >> 14);

	// Centroid clipping uses the first covered sub-sample in scanline order.
	// With the coverage bits laid out as below, findLSB gives that sample directly:
	// 0x01        0x02
	//       0x04        0x08
	// 0x10        0x20
	//       0x40        0x80
	int lowest_bit = findLSB(coverage);
	// Row index of the sample (two samples per row).
	i16 sub_y = i16(lowest_bit >> 1);
	// Column index: even/odd sample within the row, shifted by one on odd rows.
	i16 sub_x = i16((lowest_bit & 1) << 1) + (sub_y & I16_C(1));

	color16 <<= I16_C(2 + SCALING_LOG2);
	color16 += sub_x * grad_x + sub_y * grad_y;
	color16 >>= I16_C(4 + SCALING_LOG2);

	return clamp_9bit(color16);
}

// Computes the texture coordinate for one pixel of a span in the copy pipe.
//
// span        - span setup; start_x, end_x and the base stzw value are read.
// dstzw_dx    - X derivative of (s, t, z, w); only the .xyw components are used.
// x           - pixel X coordinate.
// perspective - true selects perspective_divide(), false no_perspective_divide().
// flip        - true measures the distance from span.start_x, false from span.end_x.
// st          - out: resulting texture coordinate.
// s_offset    - out: the part of the pixel distance that is removed by fb_info.dx_mask.
void interpolate_st_copy(SpanSetup span, ivec4 dstzw_dx, int x, bool perspective, bool flip,
                         out ivec2 st, out int s_offset)
{
	// Distance from the span edge that interpolation starts from.
	int dist;
	if (flip)
		dist = x - span.start_x;
	else
		dist = span.end_x - x;

	// The copy pipe duplicates pixels when upscaling; there is no filtering
	// we can (or should) do, so work in non-scaled pixel units.
	dist >>= SCALING_LOG2;

	// Interpolation only happens once per N output pixels. Whatever the mask
	// strips from the distance is handed back to the caller as an S offset.
	s_offset = dist - (dist & global_constants.fb_info.dx_mask);

	// Signed number of interpolation steps; the sign follows the span direction.
	int direction = flip ? 1 : -1;
	int steps = (dist >> global_constants.fb_info.dx_shift) * direction;

	ivec3 step_x = dstzw_dx.xyw & ~0x1f;
	ivec3 stw_hi = (span.stzw.xyw + step_x * steps) >> 16;

	if (perspective)
	{
		// The overflow flag is required by the callee but not reported from here.
		bool st_overflow;
		st = perspective_divide(stw_hi, st_overflow);
	}
	else
	{
		st = no_perspective_divide(stw_hi);
	}
}

// Interpolates a single texture coordinate at an offset of dx steps along X.
//
// stzw        - (s, t, z, w) at the interpolation origin; only .xyw is used.
// dstzw_dx    - X derivative of (s, t, z, w); only .xyw is used, low 5 bits dropped.
// dx          - number of X steps from the interpolation origin.
// perspective - true selects perspective_divide(), false no_perspective_divide().
//
// Returns the resulting (s, t) pair. No derivatives or overflow flag are produced.
ivec2 interpolate_st_single(ivec4 stzw, ivec4 dstzw_dx, int dx, bool perspective)
{
	ivec3 step_x = (dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2;

	// Only the upper bits (above bit 15) are passed on to the divide helpers.
	ivec3 stw_hi = (stzw.xyw + step_x * dx) >> 16;

	if (!perspective)
		return no_perspective_divide(stw_hi);

	// The overflow flag is required by the callee but discarded here.
	bool st_overflow;
	return perspective_divide(stw_hi, st_overflow);
}

// Interpolates the texture coordinate, its neighbors (when requested) and depth for one pixel.
//
// stzw           - (s, t, z, w) at the interpolation origin.
// dstzw_dx       - X derivative of (s, t, z, w).
// dstzw_dy       - Y derivative of (s, t, z, w).
// dx             - number of X steps from the interpolation origin.
// coverage       - sub-sample coverage mask; its lowest set bit selects where Z is sampled.
// perspective    - true selects perspective_divide(), false no_perspective_divide().
// uses_lod       - when true, the neighboring coordinates st_dx / st_dy are also computed.
// flip_direction - signed multiplier applied to the X neighbor step.
// st             - out: texture coordinate at this pixel.
// st_dx, st_dy   - out: texture coordinates one step away in X and Y.
//                  NOTE(review): these are only written when uses_lod is true;
//                  callers must not read them otherwise.
// z              - out: depth after centroid adjustment and clamp_z().
// st_overflow    - inout: forwarded to every perspective_divide() call; untouched in the
//                  non-perspective path.
void interpolate_stz(ivec4 stzw, ivec4 dstzw_dx, ivec4 dstzw_dy, int dx, int coverage, bool perspective, bool uses_lod,
                     int flip_direction, out ivec2 st, out ivec2 st_dx, out ivec2 st_dy, out int z, inout bool st_overflow)
{
	// (s, t, w) at this pixel. The low 5 bits of the X derivative are dropped before stepping.
	ivec3 stw = stzw.xyw + ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2) * dx;
	// Intentionally left uninitialized; they are only assigned and read when uses_lod is true.
	ivec3 stw_dx, stw_dy;

	if (uses_lod)
	{
		// Neighbor one X step away, in the direction given by flip_direction.
		stw_dx = stw + flip_direction * ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2);
		// Neighbor one Y step away. The Y derivative has its low 15 bits dropped.
		// When upscaling, the step is additionally multiplied by |flip_direction|.
		if (SCALING_FACTOR > 1)
			stw_dy = stw + abs(flip_direction) * ((dstzw_dy.xyw & ~0x7fff) >> SCALING_LOG2);
		else
			stw_dy = stw + ((dstzw_dy.xyw & ~0x7fff) >> SCALING_LOG2);
	}

	// Only the bits above bit 15 are passed on to the divide helpers.
	if (perspective)
	{
		st = perspective_divide(stw >> 16, st_overflow);
		if (uses_lod)
		{
			st_dx = perspective_divide(stw_dx >> 16, st_overflow);
			st_dy = perspective_divide(stw_dy >> 16, st_overflow);
		}
	}
	else
	{
		st = no_perspective_divide(stw >> 16);
		if (uses_lod)
		{
			st_dx = no_perspective_divide(stw_dx >> 16);
			st_dy = no_perspective_divide(stw_dy >> 16);
		}
	}

	// Ensure that interpolation snaps as we expect on every "main" pixel,
	// for subpixels, interpolate with quantized step factor.
	// I.e. whole pixels advance by the full derivative, and the remaining
	// (dx mod SCALING_FACTOR) sub-steps advance by the derivative shifted down by SCALING_LOG2.
	z = stzw.z + dstzw_dx.z * (dx >> SCALING_LOG2) + (dstzw_dx.z >> SCALING_LOG2) * (dx & (SCALING_FACTOR - 1));

	// Centroid adjustment: move Z to the first covered sub-sample.
	// first_coverage is the index of the lowest set coverage bit; yoff is its row
	// (two samples per row) and xoff its column, offset by one on odd rows.
	int snapped_z = z >> 10;
	int first_coverage = findLSB(coverage);
	int yoff = first_coverage >> 1;
	// NOTE(review): yoff is a plain int here, yet it is masked with I16_C(1);
	// a plain 1 would be the matching type - confirm this is intentional.
	int xoff = ((first_coverage & 1) << 1) + (yoff & I16_C(1));
	snapped_z <<= 2 + SCALING_LOG2;
	snapped_z += xoff * (dstzw_dx.z >> 10) + yoff * (dstzw_dy.z >> 10);
	snapped_z >>= 5 + SCALING_LOG2;

	z = clamp_z(snapped_z);
}

#if 0
// Variant of interpolate_rgba() that works directly from the triangle and attribute
// setup for a pixel at (x, y). It is compiled out by the enclosing #if 0.
//
// setup    - triangle setup; flags, yh, xh and dxhdy are read.
// attr     - attribute setup; rgba and its de / dx / dy derivatives are read.
// x, y     - pixel coordinate.
// coverage - sub-sample coverage mask; its lowest set bit selects the sample position.
//
// Returns the interpolated color after clamp_9bit().
u8x4 interpolate_rgba(TriangleSetup setup, AttributeSetup attr, int x, int y, int coverage)
{
	bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
	// Scanline that the attribute base values refer to (yh with its low 2 bits dropped).
	int y_interpolation_base = int(setup.yh) >> 2;
	// X position of the edge on scanline y; dxhdy is applied four times per scanline.
	int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);

	ivec4 drgba_diff = ivec4(0);

	// In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
	if (do_offset)
	{
		// Advance the edge by three more sub-scanline steps.
		xh += 3 * setup.dxhdy;
		ivec4 drgba_deh = attr.drgba_de & ~0x1ff;
		ivec4 drgba_dyh = attr.drgba_dy & ~0x1ff;
		// Three quarters of (edge derivative - Y derivative), each with its low 9 bits dropped.
		drgba_diff = drgba_deh - (drgba_deh >> 2) - drgba_dyh + (drgba_dyh >> 2);
	}

	// Integer part of the edge position, and the 8 bits directly below it.
	int base_x = xh >> 16;
	int xfrac = (xh >> 8) & 0xff;

	ivec4 rgba = attr.rgba;
	// Walk along the edge to scanline y.
	rgba += attr.drgba_de * (y - y_interpolation_base);
	// Drop low bits, apply the offset term and step back by the fractional edge position.
	rgba = ((rgba & ~0x1ff) + drgba_diff - xfrac * ((attr.drgba_dx >> 8) & ~1)) & ~0x3ff;
	// Step along X from the edge to the pixel.
	rgba += (attr.drgba_dx & ~0x1f) * (x - base_x);

	// RGBA is interpolated to 9-bit. The last bit is used to deal with clamping.
	// Slight underflow below 0 is clamped to 0 and slight overflow above 0xff is clamped to 0xff.

	// Keep 2 sign bits of precision before we complete the centroid interpolation.
	i16x4 snapped_rgba = i16x4(rgba >> 14);

	// Centroid clipping is based on the first coverage bit, and we interpolate at the first subpixel in scanline order.
	// FWIW, Angrylion has a very different coverage bit assignment, but we need this layout to avoid an awkward LUT.
	// With this layout we can just use findLSB instead.
	// 0x01        0x02
	//       0x04        0x08
	// 0x10        0x20
	//       0x40        0x80
	int first_coverage = findLSB(coverage);
	// Row of the sample (two samples per row) and its column (offset by one on odd rows).
	i16 yoff = i16(first_coverage >> 1);
	i16 xoff = i16((first_coverage & 1) << 1) + (yoff & I16_C(1));
	snapped_rgba <<= I16_C(2);
	snapped_rgba += xoff * i16x4(attr.drgba_dx >> 14) + yoff * i16x4(attr.drgba_dy >> 14);
	snapped_rgba >>= I16_C(4);
	return clamp_9bit(snapped_rgba);
}

// Interpolates (s, t, w) for the pixel at (x, y) directly from the triangle and
// attribute setup. It is compiled out by the enclosing #if 0.
//
// setup - triangle setup; flags, yh, xh and dxhdy are read.
// attr  - attribute setup; the .xyw components of stzw and its de / dx / dy derivatives are read.
// x, y  - pixel coordinate.
//
// Returns the interpolated (s, t, w) shifted right by 16; no divide is applied here.
ivec3 interpolate_stw(TriangleSetup setup, AttributeSetup attr, int x, int y)
{
	bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
	// Scanline that the attribute base values refer to (yh with its low 2 bits dropped).
	int y_interpolation_base = int(setup.yh) >> 2;
	// X position of the edge on scanline y; dxhdy is applied four times per scanline.
	int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);

	ivec3 dstw_diff = ivec3(0);

	// In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
	if (do_offset)
	{
		// Advance the edge by three more sub-scanline steps.
		xh += 3 * setup.dxhdy;
		ivec3 dstw_deh = attr.dstzw_de.xyw & ~0x1ff;
		ivec3 dstw_dyh = attr.dstzw_dy.xyw & ~0x1ff;
		// Three quarters of (edge derivative - Y derivative), each with its low 9 bits dropped.
		dstw_diff = dstw_deh - (dstw_deh >> 2) - dstw_dyh + (dstw_dyh >> 2);
	}

	// Integer part of the edge position, and the 8 bits directly below it.
	int base_x = xh >> 16;
	int xfrac = (xh >> 8) & 0xff;

	ivec3 stw = attr.stzw.xyw;
	// Walk along the edge to scanline y.
	stw += attr.dstzw_de.xyw * (y - y_interpolation_base);
	// Drop low bits, apply the offset term and step back by the fractional edge position.
	stw = ((stw & ~0x1ff) + dstw_diff - xfrac * ((attr.dstzw_dx.xyw >> 8) & ~1)) & ~0x3ff;
	// Step along X from the edge to the pixel.
	stw += (attr.dstzw_dx.xyw & ~0x1f) * (x - base_x);

	ivec3 snapped_stw = stw >> 16;
	return snapped_stw;
}

// Interpolates depth for the pixel at (x, y) directly from the triangle and
// attribute setup. It is compiled out by the enclosing #if 0.
//
// setup    - triangle setup; flags, yh, xh and dxhdy are read.
// attr     - attribute setup; the .z component of stzw and its de / dx / dy derivatives are read.
// x, y     - pixel coordinate.
// coverage - sub-sample coverage mask; its lowest set bit selects where Z is sampled.
//
// Returns the depth after centroid adjustment and clamp_z().
int interpolate_z(TriangleSetup setup, AttributeSetup attr, int x, int y, int coverage)
{
	bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
	// Scanline that the attribute base values refer to (yh with its low 2 bits dropped).
	int y_interpolation_base = int(setup.yh) >> 2;
	// X position of the edge on scanline y; dxhdy is applied four times per scanline.
	int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);

	int dzdiff = 0;
	// In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
	if (do_offset)
	{
		// Advance the edge by three more sub-scanline steps.
		xh += 3 * setup.dxhdy;
		int dzdeh = attr.dstzw_de.z & ~0x1ff;
		int dzdyh = attr.dstzw_dy.z & ~0x1ff;
		// Three quarters of (edge derivative - Y derivative), each with its low 9 bits dropped.
		dzdiff = dzdeh - (dzdeh >> 2) - dzdyh + (dzdyh >> 2);
	}

	// Integer part of the edge position, and the 8 bits directly below it.
	int base_x = xh >> 16;
	int xfrac = (xh >> 8) & 0xff;
	int z = attr.stzw.z;
	// Walk along the edge to scanline y.
	z += attr.dstzw_de.z * (y - y_interpolation_base);
	// Drop low bits, apply the offset term and step back by the fractional edge position.
	z = ((z & ~0x1ff) + dzdiff - xfrac * ((attr.dstzw_dx.z >> 8) & ~1)) & ~0x3ff;
	// Step along X from the edge to the pixel.
	z += attr.dstzw_dx.z * (x - base_x);

	// Centroid adjustment: move Z to the first covered sub-sample.
	// first_coverage is the index of the lowest set coverage bit; yoff is its row
	// (two samples per row) and xoff its column, offset by one on odd rows.
	int snapped_z = z >> 10;
	int first_coverage = findLSB(coverage);
	int yoff = first_coverage >> 1;
	// All operands are 32-bit ints, so a plain literal is used for the mask rather
	// than a 16-bit suffixed one that needs the int16 literal extension to parse.
	int xoff = ((first_coverage & 1) << 1) + (yoff & 1);
	snapped_z <<= 2;
	snapped_z += xoff * (attr.dstzw_dx.z >> 10) + yoff * (attr.dstzw_dy.z >> 10);
	snapped_z >>= 5;
	return clamp_z(snapped_z);
}
#endif

#endif