#version 450
/* Copyright (c) 2020 Themaister
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

// Resolve pass: for every pixel flagged in the upscaled write mask, averages
// the NUM_SAMPLES upscaled samples and writes the result to both the
// single-sampled RDRAM buffer and the upscaling reference buffer.
// Depth is not filtered; only the first sample is copied.

#include "small_types.h"
#include "fb_formats.h"

layout(local_size_x_id = 3) in;

// Size of one RDRAM plane in bytes (specialization constant, default 8 MiB).
layout(constant_id = 0) const int RDRAM_SIZE = 8 * 1024 * 1024;
// Index wrap masks for 8-, 16- and 32-bit element views of one plane.
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_8 >> 2;

// log2 of the color element size in bytes: 0 -> 8-bit, 1 -> 16-bit, 2 -> 32-bit.
layout(constant_id = 1) const int FB_SIZE_LOG2 = 0;
// When true, the depth resolve in main() is skipped entirely.
layout(constant_id = 2) const bool COLOR_DEPTH_ALIAS = false;
// Number of upscaled samples averaged per output pixel.
layout(constant_id = 4) const int NUM_SAMPLES = 1;
// Enables the 4x4 ordered dither in the 16-bit color resolve.
layout(constant_id = 5) const bool DITHER = false;
// When true, an all-ones marker is additionally written one plane past each resolved element.
layout(constant_id = 6) const bool RDRAM_UNSCALED_WRITE_MASK = false;

layout(push_constant) uniform Registers
{
	uint num_pixels, fb_addr, fb_depth_addr, width, height;
} registers;

// Bindings 0, 2 and 3 are each declared three times so the same buffer can be
// addressed with 8-, 16- or 32-bit elements.
layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled8
{
	uint8_t elems[];
} vram8;

layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled16
{
	uint16_t elems[];
} vram16;

layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled32
{
	uint elems[];
} vram32;

layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference8
{
	uint8_t elems[];
} vram_reference8;

layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference16
{
	uint16_t elems[];
} vram_reference16;

layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference32
{
	uint elems[];
} vram_reference32;

layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling8
{
	uint8_t elems[];
} vram_upscaled8;

layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling16
{
	uint16_t elems[];
} vram_upscaled16;

layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling32
{
	uint elems[];
} vram_upscaled32;

layout(set = 0, binding = 4) readonly buffer RDRAMHiddenUpscaling
{
	uint8_t elems[];
} hidden_vram_upscaled;

// Resolves one 8-bit element: averages (round to nearest) the NUM_SAMPLES
// upscaled values for `index` and stores the result in both the single-sampled
// and the reference buffer.
void copy_rdram_8(uint index)
{
	uint r = 0u;
	for (int i = 0; i < NUM_SAMPLES; i++)
	{
		// Each sample lives in its own plane of RDRAM_SIZE 8-bit elements,
		// mirroring the (RDRAM_SIZE >> 1) / (RDRAM_SIZE >> 2) strides of the
		// 16-bit and 32-bit paths. Without the per-sample offset the loop
		// would only ever read sample 0.
		uint real_word = uint(vram_upscaled8.elems[index + i * RDRAM_SIZE]);
		r += real_word;
	}
	r = (r + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
	vram_reference8.elems[index] = uint8_t(r);
	vram8.elems[index] = uint8_t(r);
}

// Unpacks a 16-bit word into (r, g, b, a): r = bits 15..11, g = bits 10..6,
// b = bits 5..1, a = bit 0.
uvec4 decode_rgba5551(uint word)
{
	return (uvec4(word) >> uvec4(11, 6, 1, 0)) & uvec4(0x1f, 0x1f, 0x1f, 1);
}

// Inverse of decode_rgba5551. Components are not masked here, so callers must
// pass values that already fit their bit fields.
uint encode_rgba5551(uvec4 color)
{
	return (color.r << 11u) | (color.g << 6u) | (color.b << 1u) | color.a;
}

// 4x4 ordered-dither thresholds in the range 0..7, indexed by (y & 3) * 4 + (x & 3).
const uint bayer_dither_lut[16] = uint[](
	0, 4, 1, 5,
	4, 0, 5, 1,
	3, 7, 2, 6,
	7, 3, 6, 2);

// Resolves one 16-bit RGBA5551 element at `index`. Each channel is summed over
// NUM_SAMPLES planes and then averaged, either with round-to-nearest or, when
// DITHER is set, with a position-dependent threshold taken from the pixel
// coordinate (x, y).
void copy_rdram_16(uint index, uint x, uint y)
{
	uvec4 rgba = uvec4(0u);
	for (int i = 0; i < NUM_SAMPLES; i++)
	{
		uint real_word = uint(vram_upscaled16.elems[index + i * (RDRAM_SIZE >> 1)]);
		rgba += decode_rgba5551(real_word);
	}

	if (DITHER)
	{
		// Work in 1/8 units: the LUT value (0..7, scaled by the sample count)
		// replaces the fixed rounding bias before the truncating divide.
		uint dither_value = bayer_dither_lut[(y & 3u) * 4u + (x & 3u)] * NUM_SAMPLES;
		rgba = (8u * rgba + dither_value) / (8 * NUM_SAMPLES);
	}
	else
	{
		rgba = (rgba + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
	}

	uint encoded = encode_rgba5551(rgba);
	vram16.elems[index] = uint16_t(encoded);
	vram_reference16.elems[index] = uint16_t(encoded);
}

void copy_rdram_16_single_sample(uint index)
{
	// Copies the first sample. We cannot meaningfully filter depth samples.
	// The first sample should overlap exactly with the single-sampled version.
	// Coverage clipping might slightly change the result, but shouldn't be different enough to break things.
	uint upscaled_word = uint(vram_upscaled16.elems[index]);
	vram16.elems[index] = uint16_t(upscaled_word);
	vram_reference16.elems[index] = uint16_t(upscaled_word);
}

// Unpacks a 32-bit word into (r, g, b, a) with r in the top byte and a in the bottom byte.
uvec4 decode_rgba8(uint word)
{
	return (uvec4(word) >> uvec4(24, 16, 8, 0)) & uvec4(0xff);
}

// Inverse of decode_rgba8. Components are not masked here.
uint encode_rgba8(uvec4 color)
{
	return (color.r << 24u) | (color.g << 16u) | (color.b << 8u) | (color.a << 0u);
}

// Resolves one 32-bit RGBA8 element: per-channel round-to-nearest average over
// NUM_SAMPLES planes, stored to both the single-sampled and reference buffers.
void copy_rdram_32(uint index)
{
	uvec4 rgba = uvec4(0u);
	for (int i = 0; i < NUM_SAMPLES; i++)
	{
		uint real_word = vram_upscaled32.elems[index + i * (RDRAM_SIZE >> 2)];
		rgba += decode_rgba8(real_word);
	}

	rgba = (rgba + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
	uint encoded = encode_rgba8(rgba);
	vram32.elems[index] = encoded;
	vram_reference32.elems[index] = encoded;
}

// One invocation per pixel (x, y) = gl_GlobalInvocationID.xy. Reads the
// pixel's 2-bit entry from the write mask and resolves color and/or depth
// accordingly.
void main()
{
	uvec2 coord = gl_GlobalInvocationID.xy;
	uint index = coord.y * registers.width + coord.x;
	uint depth_index = index + registers.fb_depth_addr;
	uint color_index = index + registers.fb_addr;

	// The write mask is stored as 32-bit words placed after the NUM_SAMPLES
	// sample planes. Each word covers a 4x4 pixel tile, 2 bits per pixel.
	uvec2 mask_coord = coord >> 2u;
	uint mask_index = mask_coord.x + mask_coord.y * ((registers.width + 3) >> 2u);

	// Invocations past the right edge get an empty mask and therefore write nothing.
	// NOTE(review): coord.y is not bounded here; presumably the dispatch covers
	// exactly the framebuffer height. Confirm against the host-side dispatch.
	uint write_mask;
	if (coord.x < registers.width)
		write_mask = vram_upscaled32.elems[NUM_SAMPLES * (RDRAM_SIZE >> 2) + mask_index];
	else
		write_mask = 0u;

	uint shamt = 2u * ((coord.x & 3u) + 4u * (coord.y & 3u));
	write_mask = write_mask >> shamt;
	// Bit 0 of the pixel's entry selects the color resolve, bit 1 the depth resolve.
	bool color_write_mask = (write_mask & 1u) != 0u;
	bool depth_write_mask = (write_mask & 2u) != 0u;

	if (color_write_mask)
	{
		// Wrap the index into one RDRAM plane, then flip the low index bits for
		// sub-word element sizes.
		// NOTE(review): the XOR presumably accounts for byte order within
		// 32-bit words. Confirm.
		switch (FB_SIZE_LOG2)
		{
		case 0:
			color_index &= RDRAM_MASK_8;
			color_index ^= 3u;
			copy_rdram_8(color_index);
			break;

		case 1:
			color_index &= RDRAM_MASK_16;
			color_index ^= 1u;
			copy_rdram_16(color_index, coord.x, coord.y);
			break;

		case 2:
			color_index &= RDRAM_MASK_32;
			copy_rdram_32(color_index);
			break;
		}
	}

	// Metal portability: Memory barriers must happen in uniform control flow.
	if (RDRAM_UNSCALED_WRITE_MASK)
	{
		// Need this memory barrier to ensure the mask readback does not read
		// an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
		// also coherent.
		memoryBarrierBuffer();

		if (color_write_mask)
		{
			// color_index was already wrapped and swizzled above. The marker goes
			// one full plane past the resolved element in the single-sampled buffer.
			switch (FB_SIZE_LOG2)
			{
			case 0:
				vram8.elems[color_index + RDRAM_SIZE] = mem_u8(0xff);
				break;

			case 1:
				vram16.elems[color_index + (RDRAM_SIZE >> 1)] = mem_u16(0xffff);
				break;

			case 2:
				vram32.elems[color_index + (RDRAM_SIZE >> 2)] = ~0u;
				break;
			}
		}
	}

	// Don't bother writing back hidden VRAM. It is not visible to host anyways, and coverage is meaningless when it's filtered.
	// If host later decides to modify the CPU memory, then the hidden VRAM values become complete bogus either way.

	if (!COLOR_DEPTH_ALIAS)
	{
		if (depth_write_mask)
		{
			depth_index &= RDRAM_MASK_16;
			depth_index ^= 1u;
			copy_rdram_16_single_sample(depth_index);
		}

		// Metal portability: Memory barriers must happen in uniform control flow.
		if (RDRAM_UNSCALED_WRITE_MASK)
		{
			// Need this memory barrier to ensure the mask readback does not read
			// an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
			// also coherent.
			memoryBarrierBuffer();

			if (depth_write_mask)
				vram16.elems[depth_index + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
		}
	}
}