#include #include #include #include #include #define SCE_GNM_SET_ALIGN(n) __attribute__((__aligned__(n))) #define SCE_GNM_ASSERT(x) ((void)(x)) typedef unsigned int uint; typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned long ulong; typedef char int8; typedef signed char sint8; typedef unsigned char uint8; typedef short int16; typedef signed short sint16; typedef unsigned short uint16; typedef int int32; typedef signed int sint32; typedef unsigned int uint32; #define LAST_IND(x,part_type) (sizeof(x)/sizeof(part_type) - 1) #define HIGH_IND(x,part_type) LAST_IND(x,part_type) #define LOW_IND(x,part_type) 0 // Partially defined types. They are used when the decompiler does not know // anything about the type except its size. #define _BYTE uint8 #define _WORD uint16 #define _DWORD uint32 #define _QWORD uint64 typedef int32_t _BOOL4; #define BYTEn(x, n) (*((_BYTE*)&(x)+n)) #define WORDn(x, n) (*((_WORD*)&(x)+n)) #define DWORDn(x, n) (*((_DWORD*)&(x)+n)) #define LOBYTE(x) BYTEn(x,LOW_IND(x,_BYTE)) #define LOWORD(x) WORDn(x,LOW_IND(x,_WORD)) #define LODWORD(x) DWORDn(x,LOW_IND(x,_DWORD)) #define HIBYTE(x) BYTEn(x,HIGH_IND(x,_BYTE)) #define HIWORD(x) WORDn(x,HIGH_IND(x,_WORD)) #define HIDWORD(x) DWORDn(x,HIGH_IND(x,_DWORD)) static const unsigned s_vex_vv[] = { //#include "vex_vv.h" //#include "test_vv.h" #include "test.h" }; //static const unsigned s_pix_p[] = { // #include "pix_p.h" // #include "test_p.h" //}; struct ShaderBinaryInfo { uint8_t m_signature[7]; // 'OrbShdr' uint8_t m_version; // ShaderBinaryInfoVersion unsigned int m_pssl_or_cg : 1; // 1 = PSSL / Cg, 0 = IL / shtb unsigned int m_cached : 1; // 1 = when compile, debugging source was cached. May only make sense for PSSL=1 uint32_t m_type : 4; // See enum ShaderBinaryType uint32_t m_source_type : 2; // See enum ShaderSourceType unsigned int m_length : 24; // Binary code length (does not include this structure or any of its preceding associated tables) uint8_t m_chunkUsageBaseOffsetInDW; // in DW, which starts at ((uint32_t*)&ShaderBinaryInfo) - m_chunkUsageBaseOffsetInDW; max is currently 7 dwords (128 T# + 32 V# + 20 CB V# + 16 UAV T#/V#) uint8_t m_numInputUsageSlots; // Up to 16 user data reg slots + 128 extended user data dwords supported by CUE; up to 16 user data reg slots + 240 extended user data dwords supported by InputUsageSlot uint8_t m_isSrt : 1; // 1 if this shader uses shader resource tables and has an SrtDef table embedded below the input usage table and any extended usage info uint8_t m_isSrtUsedInfoValid : 1; // 1 if SrtDef::m_isUsed=0 indicates an element is definitely unused; 0 if SrtDef::m_isUsed=0 indicates only that the element is not known to be used (m_isUsed=1 always indicates a resource is known to be used) uint8_t m_isExtendedUsageInfo : 1; // 1 if this shader has extended usage info for the InputUsage table embedded below the input usage table uint8_t m_reserved2 : 5; // For future use uint8_t m_reserved3; // For future use uint32_t m_shaderHash0; // Association hash first 4 bytes uint32_t m_shaderHash1; // Association hash second 4 bytes uint32_t m_crc32; // crc32 of shader + this struct, just up till this field }; typedef enum ShaderStage { kShaderStageCs = 0x00000000, ///< Compute shader stage. kShaderStagePs = 0x00000001, ///< Pixel shader stage. kShaderStageVs = 0x00000002, ///< Vertex shader stage. kShaderStageGs = 0x00000003, ///< Geometry shader stage. kShaderStageEs = 0x00000004, ///< Export shader stage. kShaderStageHs = 0x00000005, ///< Hull shader stage. kShaderStageLs = 0x00000006, ///< LDS shader stage. kShaderStageCount ///< The number of shader stages. } ShaderStage; typedef enum ShaderConstantDwordSize { kDwordSizeResource = 8, ///< T# or padded V#. kDwordSizeRwResource = 8, ///< T# or padded V#. kDwordSizeSampler = 4, ///< S#. kDwordSizeVertexBuffer = 4, ///< V#. kDwordSizeConstantBuffer = 4, ///< V#. kDwordSizeBoolConstant = 1, ///< 32 packed bits. kDwordSizeFloatConstant = 1, ///< IEEE single-precision float. kDwordSizeAppendConsumeCounterRange = 1, ///< Two 16-bit integers packed together. kDwordSizeStreamoutBuffer = 4, ///< V#. kDwordSizeExtendedUserData = 128, ///< Space for extra shader user data. kDwordSizeDispatchDrawData = 32, ///< Space for 32 DWORDs of DispatchDraw data. kDwordSizeGdsMemoryRange = 1, ///< Two 16-bit integers packed together. } ShaderConstantDwordSize; typedef enum FetchShaderInstancingMode { kFetchShaderUseVertexIndex = 0x0, ///< No instancing; use Vertex Index. Default. kFetchShaderUseInstanceId = 0x1, ///< Use the instance ID to index the data. kFetchShaderUseInstanceIdOverStepRate0 = 0x2, ///< Use the (instance ID / step rate 0) to index the data. kFetchShaderUseInstanceIdOverStepRate1 = 0x3, ///< Use the (instance ID / step rate 1) to index the data. } FetchShaderInstancingMode; const int32_t kMaxResourceCount = 16; const int32_t kMaxRwResourceCount = 16; const int32_t kMaxSamplerCount = 16; const int32_t kMaxVertexBufferCount = 16; const int32_t kMaxConstantBufferCount = 20; const int32_t kMaxStreamOutBufferCount = 4; const int32_t kMaxUserDataCount = 16; ///< PSSL compiler limit is 16, count not tracked by the InputResourceOffsets table const int32_t kMaxSrtUserDataCount = 16; ///< PSSL compiler limit is 16, count not tracked by the InputResourceOffsets table const int32_t kMaxResourceBufferCount = 4; ///< Maximum number for supported splits for the resource buffer per LCUE instance const int32_t kMaxPsInputUsageCount = 32; ///< Maximum number of interpolants a PS Stage can receive const int32_t kDefaultFetchShaderPtrSgpr = 0; ///< Default SGPR in PSSL const int32_t kDefaultVertexBufferTablePtrSgpr = 2; ///< Default SGPR in PSSL const int32_t kDefaultGlobalInternalTablePtrSgpr = 0; ///< Default SGPR in PSSL, Note: it has lower priority than FetchPtr (sgpr would be s[4:5], after FetchPtr and VbPtr) const int32_t kDefaultStreamOutTablePtrSgpr = 2; ///< Default SGPR in PSSL, only used by VS copy shader in GS active stage const int32_t kDefaultVertexldsEsGsSizeSgpr = 0; ///< Default SGPR in PSSL, only used by VS copy shader in GS active stage const int32_t kResourceInUserDataSgpr = 0x8000; ///< In User data resource Mask const int32_t kResourceIsVSharp = 0x4000; ///< VSharp resource Mask Note: only used/available for immediate resources const int32_t kResourceValueMask = 0x3FFF; ///< Resource memory offset is stored in the lower 14-bits // On-chip GS constants const uint32_t kOnChipGsInvalidSignature = 0xFFFFFFFFU; // Tessellation distribution constants (Neo only) const uint32_t kTesselationDistrbutionMask = 0x7FFFFFFFU; ///< Tessellation Distribution mask HS shader stage const uint32_t kTesselationDistrbutionEnabledMask = ~kTesselationDistrbutionMask; ///< Tessellation Distribution enabled for HS shader stage // Shader stage constants const uint32_t kShaderStageAsynchronousCompute = kShaderStageCount; ///< Additional stage for CsVs dispatch Draw shader const uint32_t kNumShaderStages = kShaderStageCount + 1; ///< Number of unique shader stages for resource binding // 6KB is enough to store anything you can bind to a GPU shader stage, all counted in DWORDs const int32_t kGpuStageBufferSizeInDwords = (6 * 1024) / sizeof(uint32_t); ///< Size of Single buffer Stage const int32_t kComputeScratchBufferSizeInDwords = kGpuStageBufferSizeInDwords; ///< Size of the Compute Scratch buffer const int32_t kGraphicsScratchBufferSizeInDwords = kNumShaderStages * kGpuStageBufferSizeInDwords; ///< Size of the Graphics Scratch buffer (encompasses all graphics shader stages) //const int32_t kGlobalInternalTableSizeInDwords = sce::kShaderGlobalResourceCount * sizeof(sce::Buffer) / sizeof(uint32_t); ///< Size of a global resource table // Internal constant buffers that are expected at fixed API-slots const int32_t kConstantBufferInternalApiSlotForEmbeddedData = 15; ///< Immediate/Embedded constant buffer fixed API-slot (any GPU stage). const int32_t kConstantBufferInternalApiSlotReserved0 = 16; ///< Slot 16 is reserved by compiler const int32_t kConstantBufferInternalApiSlotReserved1 = 17; ///< Slot 17 is reserved by compiler const int32_t kConstantBufferInternalApiSlotReserved2 = 18; ///< Slot 18 is reserved by compiler const int32_t kConstantBufferInternalApiSlotForTessellation = 19; ///< Tessellation constant buffer (with strides for LDS data) fixed API-slot (HS,VS/ES GPU stages). // Internal constants for ShaderBinaryInfo const uint64_t kShaderBinaryInfoSignatureMask = 0x00ffffffffffffffLL; const uint64_t kShaderBinaryInfoSignatureU64 = 0x007264685362724fLL; // DispatchDraw constants const uint32_t kNumDispatchDrawRingBuffersRolls = 16; ///< Number of times the dispatch ring buffers can be rolled in a LCUE instance, see setupDispatchDrawRingBuffers() struct InputResourceOffsets { uint16_t requiredBufferSizeInDwords; ///< Specifies how much memory needs to be reserved to store all memory-based resources. These are things not set through PM4. bool isSrtShader; ///< A flag that specifies whether the shader makes use of SRTs. uint8_t shaderStage; ///< The shader stage (LS/HS/ES/GS/VS/PS) for the shader resources offsets. // For each available shader-resource-ptr, store the starting SGPR s[0:254] where it'll be set (0xFF means not used). Pointers take 2 SGPRs (64b) and must be 2DW aligned uint8_t fetchShaderPtrSgpr; ///< The SGPR containing the fetch shader pointer. If this exists, s[0:1] is always used. uint8_t vertexBufferPtrSgpr; ///< The SGPR containing the vertex buffer table pointer. If this exists, s[2:3] is always used, but only in the vertex pipeline. uint8_t streamOutPtrSgpr; ///< The SGPR containing the stream out buffer pointer. If this exists, s[2:3] is always used, but only in the Geometry pipeline. uint8_t userExtendedData1PtrSgpr; ///< The SGPR containing the user extended data table pointer. // uint8_t userInternalSrtDataPtrSgpr; ///< *Note: Not supported for now*. uint8_t constBufferPtrSgpr; ///< The SGPR containing the constant buffer table pointer. uint8_t resourcePtrSgpr; ///< The SGPR containing the resource buffer table pointer. uint8_t rwResourcePtrSgpr; ///< The SGPR containing the read/write resource buffer table pointer. uint8_t samplerPtrSgpr; ///< The SGPR containing the sampler buffer table pointer. uint8_t globalInternalPtrSgpr; ///< The SGPR containing the global internal pointer, which is either stored in s[0:1] or s[4:5]. uint8_t appendConsumeCounterSgpr; ///< The SGPR containing the 32bit value address and size used from GDS. uint8_t gdsMemoryRangeSgpr; ///< The SGPR containing the GDS address range for storage. uint8_t ldsEsGsSizeSgpr; ///< The SGPR containing the GWS resource base offset. uint8_t userSrtDataSgpr; ///< The SGPR containing the start offset of the SRT Data Buffer. uint8_t userSrtDataCount; ///< The number of DWORDs in use by the SRT Data Buffer. The size will be between 1-8. uint8_t gdsKickRingBufferOffsetSgpr; ///< The SGPR containing the GDS kick ring buffer offset for DispatchDraw. uint8_t vertexRingBufferOffsetSgpr; ///< The SGPR containing the Vertex kick ring buffer offset for DispatchDraw. uint8_t dispatchDrawPtrSgpr; ///< The SGPR containing the DispatchDraw data pointer to DispatchDrawTriangleCullData structure. uint8_t dispatchDrawInstancesSgpr; ///< The SGPR containing the number of instances for DispatchDraw. // For each available shader-resource-flat-table (aka array), store the memory offset (from the start of the buffer) to the beginning of its flat-table (0xFFFF means it's not used). // Note: arrays are 0 indexed but the user can skip/set any index inside the range, allowing gaps at any place. This accelerates setting the pointer to the beginning of flat-tables. uint16_t constBufferArrayDwOffset; ///< The constant buffer table offset into the main buffer. uint16_t vertexBufferArrayDwOffset; ///< The vertex buffer table offset into the main buffer. uint16_t resourceArrayDwOffset; ///< The resource buffer table offset into the main buffer. uint16_t rwResourceArrayDwOffset; ///< The read/write resource buffer table offset into the main buffer. uint16_t samplerArrayDwOffset; ///< The sampler buffer table offset into the main buffer. uint16_t streamOutArrayDwOffset; ///< The stream out buffer table offset into the main buffer. This is only for the Geometry pipeline. // For each logical shader API slot, store either: an offset to a memory location, or a User Data (UD) SGPR where the resource should be set. // Note: if (item[i]&kResourceInUserDataSgpr) it's set directly into s[0:15] using PM4 packets, otherwise it's copied into the scratch buffer using the offset. uint16_t resourceDwOffset[kMaxResourceCount]; ///< The start offset of a resource in the resource buffer table or user data. uint16_t rwResourceDwOffset[kMaxRwResourceCount]; ///< The start offset of a resource in the read/write resource buffer table or user data. uint16_t samplerDwOffset[kMaxSamplerCount]; ///< The start offset of a sampler in the sampler buffer table or user data. uint16_t constBufferDwOffset[kMaxConstantBufferCount]; ///< The start offset of a constant buffer in the constant buffer table or user data. uint16_t vertexBufferDwOffset[kMaxVertexBufferCount]; ///< The start offset of a vertex array in the vertex buffer table or user data. uint16_t streamOutDwOffset[kMaxStreamOutBufferCount];///< The start offset of a stream out buffer in the stream out buffer table or user data. This is only for the Geometry pipeline. uint8_t resourceSlotCount; ///< The number of resource slots used by the shader. uint8_t rwResourceSlotCount; ///< The number of rw resource slots used by the shader. uint8_t samplerSlotCount; ///< The number of sampler slots used by the shader. uint8_t constBufferSlotCount; ///< The number of constant buffer slots used by the shader. uint8_t vertexBufferSlotCount; ///< The number of vertex buffer slots used by the shader. uint8_t pad[1]; /** @brief Initializes several resource slots that the shader uses. */ void initSupportedResourceCounts() { resourceSlotCount = kMaxResourceCount; rwResourceSlotCount = kMaxRwResourceCount; samplerSlotCount = kMaxSamplerCount; constBufferSlotCount = kMaxConstantBufferCount; vertexBufferSlotCount = kMaxVertexBufferCount; } // kShaderInputUsageImmAluFloatConst // Immediate float const (scalar or vector). *Not Supported* // kShaderInputUsageImmAluBool32Const // 32 immediate Booleans packed into one UINT. *Not Supported* }; typedef enum ShaderInputUsageType { kShaderInputUsageImmResource = 0x00, ///< Immediate read-only buffer/texture descriptor. kShaderInputUsageImmSampler = 0x01, ///< Immediate sampler descriptor. kShaderInputUsageImmConstBuffer = 0x02, ///< Immediate constant buffer descriptor. kShaderInputUsageImmVertexBuffer = 0x03, ///< Immediate vertex buffer descriptor. kShaderInputUsageImmRwResource = 0x04, ///< Immediate read/write buffer/texture descriptor. kShaderInputUsageImmAluFloatConst = 0x05, ///< Immediate float const (scalar or vector). kShaderInputUsageImmAluBool32Const = 0x06, ///< 32 immediate Booleans packed into one UINT. kShaderInputUsageImmGdsCounterRange = 0x07, ///< Immediate UINT with GDS address range for counters (used for append/consume buffers). kShaderInputUsageImmGdsMemoryRange = 0x08, ///< Immediate UINT with GDS address range for storage. kShaderInputUsageImmGwsBase = 0x09, ///< Immediate UINT with GWS resource base offset. kShaderInputUsageImmShaderResourceTable = 0x0A, ///< Pointer to read/write resource indirection table. kShaderInputUsageImmLdsEsGsSize = 0x0D, ///< Immediate LDS ESGS size used in on-chip GS // Skipped several items here... kShaderInputUsageSubPtrFetchShader = 0x12, ///< Immediate fetch shader subroutine pointer. kShaderInputUsagePtrResourceTable = 0x13, ///< Flat resource table pointer. kShaderInputUsagePtrInternalResourceTable = 0x14, ///< Flat internal resource table pointer. kShaderInputUsagePtrSamplerTable = 0x15, ///< Flat sampler table pointer. kShaderInputUsagePtrConstBufferTable = 0x16, ///< Flat const buffer table pointer. kShaderInputUsagePtrVertexBufferTable = 0x17, ///< Flat vertex buffer table pointer. kShaderInputUsagePtrSoBufferTable = 0x18, ///< Flat stream-out buffer table pointer. kShaderInputUsagePtrRwResourceTable = 0x19, ///< Flat read/write resource table pointer. kShaderInputUsagePtrInternalGlobalTable = 0x1A, ///< Internal driver table pointer. kShaderInputUsagePtrExtendedUserData = 0x1B, ///< Extended user data pointer. kShaderInputUsagePtrIndirectResourceTable = 0x1C, ///< Pointer to resource indirection table. kShaderInputUsagePtrIndirectInternalResourceTable = 0x1D, ///< Pointer to internal resource indirection table. kShaderInputUsagePtrIndirectRwResourceTable = 0x1E, ///< Pointer to read/write resource indirection table. // Skipped several items here... kShaderInputUsageImmGdsKickRingBufferOffset = 0x22, ///< Immediate UINT offset into GDS kick ring buffer for DispatchDraw. This must not be in extended user data. kShaderInputUsageImmVertexRingBufferOffset = 0x23, ///< Immediate UINT offset into vertex ring buffer for DispatchDraw. This must not be in extended user data. kShaderInputUsagePtrDispatchDraw = 0x24, ///< Pointer to DispatchDraw data. This must not be in extended user data. kShaderInputUsageImmDispatchDrawInstances = 0x25, ///< Immediate UINT ((firstInstance<<16)|(numInstances-1)). This must not be in extended user data. } ShaderInputUsageType; class VsShader; class PsShader; class ShaderInfo { public: union { const void *m_shaderStruct; ///< A pointer to the shader struct -- typeless. const VsShader* m_vsShader; const PsShader* m_psShader; }; const uint32_t *m_gpuShaderCode; ///< A pointer to the GPU Shader Code which will need to be copied into GPU visible memory. uint32_t m_gpuShaderCodeSize; ///< The size of the GPU Shader Code in bytes. uint32_t m_reserved; }; class PipelineStage { public: /// Represents vertex shader information. class VsInfo { public: uint8_t m_vertexShaderVariant; ///< The PsslVertexVariant such as kVertexVariantVertex, kVertexVariantExport, kVertexVariantLocal etc. uint8_t m_paddingTo32[3]; ///< Padding. }; /// Represents domain shader information. class DsInfo { public: uint8_t m_domainShaderVariant; ///< The PsslDomainVariant such as kDomainVariantVertex, kDomainVariantExport etc. uint8_t m_paddingTo32[3]; ///< Padding. }; /// Represents geometry shader information. class GsInfo { public: uint8_t m_geometryShaderVariant; ///< The PsslGeometryVariant such as kGeometryVariantOnBuffer, kGeometryVariantOnChip etc. uint8_t m_paddingTo32[3]; ///< Padding. }; /// Represents geometry shader information. class HsInfo { public: uint8_t m_hullShaderVariant; ///< The PsslHullVariant such as kHullVariantOnBuffer, kHullVariantOnChip etc. uint8_t m_paddingTo32[3]; ///< Padding. }; /// Stores data as different class types, depending on the type of shader. union { uint32_t m_u32; ///< An unsigned 32 bit integer. VsInfo m_vsInfo; ///< The vertex shader information. DsInfo m_dsInfo; ///< The domain shader information. GsInfo m_gsInfo; ///< The geometry shader information. HsInfo m_hsInfo; ///< The hull shader information. }; }; class SystemAttributes { public: /// Represents CS pipeline stage information. class CsInfo { public: uint16_t m_numThreads[3]; ///< The number of threads. }; /// Represents GS pipeline stage information. class GsInfo { public: uint16_t m_instance; ///< The instance of the GS Shader. uint16_t m_maxVertexCount; ///< The maximum number of vertices count. uint8_t m_inputType; ///< The GS Input Type (PsslGsIoType) such as triangle, line, point, adjacent tri + line, or patch. uint8_t m_outputType; ///< The GS Output Type (PsslGsIoType) such as triangles, lines, or points. uint8_t m_patchSize; ///< The patch size in case of patch topology. }; /// Represents DS pipeline stage information. class DsInfo { public: uint8_t m_domainPatchType; ///< The PsslHsDsPatchType: triangle, quad, or isoline. uint8_t m_inputControlPoints; ///< The number of points in the input patch. }; /// Represents HS pipeline stage information. class HsInfo { public: uint8_t m_domainPatchType; ///< The PsslHsDsPatchType: triangle, quad, or isoline. uint8_t m_inputControlPoints; ///< The number of points in the input patch. uint8_t m_outputTopologyType; ///< The PsslHsTopologyType: point, line, cwtri, or ccwtri. uint8_t m_partitioningType; ///< The PsslHsPartitioningType: integer, powof2, odd_fractional, or even_fractional. uint8_t m_outputControlPoints; ///< The number of points in the output patch. uint8_t m_patchSize; ///< The size of patch. uint8_t m_paddingTo32[2]; ///< Padding. float m_maxTessFactor; ///< The maximum tessellation factor. }; /// Stores data as different class types, depending on the type of shader. union { uint32_t m_u32[3]; ///< 12 bytes. CsInfo m_csInfo; ///< The compute shader information. GsInfo m_gsInfo; ///< The geometry shader information. DsInfo m_dsInfo; ///< The domain shader information. HsInfo m_hsInfo; ///< The hull shader information. }; }; class Header { public: uint8_t m_formatVersionMajor; ///< The version of shader binary format: major numbering. uint8_t m_formatVersionMinor; ///< The version of shader binary format: minor numbering. uint16_t m_compilerRevision; ///< The compiler type specific version of shader compiler: this is the svn revision for m_compilerType==kCompilerTypeOrbisPsslc or kCompilerTypeOrbisEsslc or for kCompilerTypeUnspecified (pre-SDK 2.500 versions of these compilers) uint32_t m_associationHash0; ///< The shader association hash 1. uint32_t m_associationHash1; ///< The shader association hash 2. uint8_t m_shaderType; ///< The PsslShaderType: VS, PS, GS, CS, GS, HS, or DS. uint8_t m_codeType; ///< The PsslCodeType: IL, ISA, or SCU. uint8_t m_usesShaderResourceTable; ///< The shader uses resource table. uint8_t m_compilerType : 4; ///< The PsslCompilerType; 0 uint8_t m_paddingTo32 : 4; // 0; reserved for future use uint32_t m_codeSize; ///< The size of code section. PipelineStage m_shaderTypeInfo; ///< The shader pipeline stage info. SystemAttributes m_shaderSystemAttributeInfo; ///< The shader system attribute info. }; class ShaderFileHeader { public: uint32_t m_fileHeader; ///< File identifier. Should be equal to kShaderFileHeaderId uint16_t m_majorVersion; ///< Major version of the shader binary. uint16_t m_minorVersion; ///< Minor version of the shader binary. uint8_t m_type; ///< Type of shader. Comes from ShaderType. uint8_t m_shaderHeaderSizeInDW; ///< \Shader.computeSize()/4. For example, see CsShader::computeSize(). uint8_t m_shaderAuxData; ///< A flag that indicates whether shader auxiliary data is present after end of the shader data ( sizeof(ShaderFileHeader) + ///< m_shaderHeaderSizeInDW * 4 + ShaderCommonData::m_shaderSize + ///< ShaderCommonData::m_embeddedConstantBufferSizeInDQW * 16). Set to 1 to indicate it is uint8_t m_targetGpuModes; ///< Union of all TargetGpuMode values for which this shader binary is valid. uint32_t m_reserved1; ///< Must be 0. }; class ShaderCommonData { public: // Memory Layout: // - Shader setup data (starting with ShaderCommonData) // - n InputUsage (4 bytes each) // - immediateConstants uint32_t m_shaderSize : 23; ///< The size of the shader binary code block in bytes. uint32_t m_shaderIsUsingSrt : 1; ///< A bitflag that indicates if the shader is using a Shader Resource Table. uint32_t m_numInputUsageSlots : 8; ///< The number of InputUsageSlot entries following the main shader structure. uint16_t m_embeddedConstantBufferSizeInDQW; ///< The size of the embedded constant buffer in 16-byte DWORDs. uint16_t m_scratchSizeInDWPerThread; ///< The scratch size required by each thread in 4-byte DWORDs. /** @brief Calculates and returns the size of the shader code including its embedded CB size in bytes */ uint32_t computeShaderCodeSizeInBytes() const { return m_shaderSize + m_embeddedConstantBufferSizeInDQW * 16; } }; #ifdef __cplusplus class InputUsageSlot #else // __cplusplus typedef struct InputUsageSlot #endif // __cplusplus { #ifdef __cplusplus public: #endif // __cplusplus uint8_t m_usageType; ///< From ShaderInputUsageType. uint8_t m_apiSlot; ///< API slot or chunk ID. uint8_t m_startRegister; ///< User data slot. union { struct { uint8_t m_registerCount : 1; ///< If 0, count is 4DW; if 1, count is 8DW. Other sizes are defined by the usage type. uint8_t m_resourceType : 1; ///< If 0, resource type V#; if 1, resource type T#, in case of a kShaderInputUsageImmResource. uint8_t m_reserved : 2; ///< Unused; must be set to zero. uint8_t m_chunkMask : 4; ///< Internal usage data. }; uint8_t m_srtSizeInDWordMinusOne; ///< Size of the SRT data; used for kShaderInputUsageImmShaderResourceTable. }; #ifdef __cplusplus } SCE_GNM_SET_ALIGN(4); #else // __cplusplus } InputUsageSlot; #endif // __cplusplus #ifdef __cplusplus class VertexInputSemantic #else // __cplusplus typedef struct VertexInputSemantic #endif // __cplusplus { #ifdef __cplusplus public: #endif // __cplusplus uint8_t m_semantic; uint8_t m_vgpr; uint8_t m_sizeInElements; uint8_t m_reserved; ///< Unused; must be set to zero. #ifdef __cplusplus }; #else // __cplusplus } VertexInputSemantic; #endif // __cplusplus #ifdef __cplusplus class VertexExportSemantic #else // __cplusplus typedef struct VertexExportSemantic #endif // __cplusplus { #ifdef __cplusplus public: #endif // __cplusplus uint8_t m_semantic; ///< Description to be specified. uint8_t m_outIndex : 5; ///< Description to be specified. uint8_t m_reserved : 1; uint8_t m_exportF16 : 2; ///< if (m_exportF16 == 0) this shader exports a 32-bit value to this parameter; if (m_exportF16 & 1) this shader exports a 16-bit float value to the low 16-bits of each channel; if (m_exportF16 & 2) this shader exports a 16-bit float value to the high 16-bits of each channel #ifdef __cplusplus }; #else // __cplusplus } VertexExportSemantic; #endif // __cplusplus #ifdef __cplusplus class VsStageRegisters #else // __cplusplus typedef struct VsStageRegisters #endif // __cplusplus { #ifdef __cplusplus public: #endif // __cplusplus uint32_t m_spiShaderPgmLoVs; ///< The pointer to shader program (bits 39:8). uint32_t m_spiShaderPgmHiVs; ///< The pointer to shader program (bits 47:40). This must be set to zero. uint32_t m_spiShaderPgmRsrc1Vs; uint32_t m_spiShaderPgmRsrc2Vs; uint32_t m_spiVsOutConfig; uint32_t m_spiShaderPosFormat; uint32_t m_paClVsOutCntl; #ifdef __cplusplus /** @brief Patches the GPU address of the shader code. @param[in] gpuAddress This address to patch. This must be aligned to a 256-byte boundary. */ void patchShaderGpuAddress(void *gpuAddress) { m_spiShaderPgmLoVs = static_cast(uintptr_t(gpuAddress) >> 8); m_spiShaderPgmHiVs = static_cast(uintptr_t(gpuAddress) >> 40); } bool isSharingContext(const VsStageRegisters shader) const { return !((m_spiVsOutConfig - shader.m_spiVsOutConfig) | (m_spiShaderPosFormat - shader.m_spiShaderPosFormat) | (m_paClVsOutCntl - shader.m_paClVsOutCntl)); } #endif // __cplusplus #ifdef __cplusplus }; #else // __cplusplus } VsStageRegisters; #endif // __cplusplus #ifdef __cplusplus class PsStageRegisters #else // __cplusplus typedef struct PsStageRegisters #endif // __cplusplus { #ifdef __cplusplus public: #endif // __cplusplus uint32_t m_spiShaderPgmLoPs; ///< A pointer to shader program (bits 39:8). uint32_t m_spiShaderPgmHiPs; ///< A pointer to shader program (bits 47:40). This must be set to zero. uint32_t m_spiShaderPgmRsrc1Ps; uint32_t m_spiShaderPgmRsrc2Ps; uint32_t m_spiShaderZFormat; uint32_t m_spiShaderColFormat; uint32_t m_spiPsInputEna; uint32_t m_spiPsInputAddr; uint32_t m_spiPsInControl; uint32_t m_spiBarycCntl; uint32_t m_dbShaderControl; uint32_t m_cbShaderMask; #ifdef __cplusplus void patchShaderGpuAddress(void *gpuAddress) { m_spiShaderPgmLoPs = static_cast(uintptr_t(gpuAddress) >> 8); m_spiShaderPgmHiPs = static_cast(uintptr_t(gpuAddress) >> 40); } #endif // __cplusplus #ifdef __cplusplus }; #else // __cplusplus } PsStageRegisters; #endif // __cplusplus #ifdef __cplusplus class FetchShaderBuildState #else // __cplusplus typedef struct FetchShaderBuildState #endif // __cplusplus { #ifdef __cplusplus public: #endif // __cplusplus // Filled up by: Generate[xx]FSBuildState functions uint16_t m_fetchShaderBufferSize; uint16_t m_fetchShaderFlags; ///< Description to be specified. uint8_t m_firstFreeSgpr; ///< Description to be specified. uint8_t m_vsharpSlotCount; uint16_t m_numElementsInInstancingData; ///< Description to be specified. uint32_t m_shaderModifier; // to be passed to set[x]sShader functions. uint32_t m_reserved0; ///< Unused; must be set to zero. const FetchShaderInstancingMode *m_fetchShaderInstancingData; ///< Description to be specified. // From the SC: uint8_t m_numInputSemantics; uint8_t m_vertexBaseUsgpr; uint8_t m_instanceBaseUsgpr; uint8_t m_reserved1; ///< Unused; must be set to zero. uint32_t m_numInputUsageSlots; const VertexInputSemantic *m_inputSemantics; const InputUsageSlot *m_inputUsageSlots; // Vertex Buffer Semantic Remap Table: uint32_t m_numElementsInRemapTable; uint32_t m_reserved2; const uint32_t *m_semanticsRemapTable; // vertex buffer index -> vertex buffer semantic (e.g. vbSemantic = m_semanticsRemapTable[vbIndex]) #ifdef __cplusplus }; #else // __cplusplus } FetchShaderBuildState; #endif // __cplusplus #ifdef __cplusplus class PixelInputSemantic #else // __cplusplus typedef struct PixelInputSemantic #endif // __cplusplus { #ifdef __cplusplus public: #endif // __cplusplus union { struct { uint16_t m_semantic : 8; ///< The semantic, matched against the semantic value in the VertexExportSemantic table in the VS shader. uint16_t m_defaultValue : 2; ///< The default value supplied to the shader, if m_semantic is not matched in the VS shader. 0={0,0,0,0}, 1={0,0,0,1.0}, 2={1.0,1.0,1.0,0}, 3={1.0,1.0,1.0,1.0} uint16_t m_isFlatShaded : 1; ///< if (m_interpF16 == 0) A bitflag that specifies whether the value interpolation is constant in the shader. It is ignored if m_isCustom is set; otherwise, it indicates that a shader reads only { P0 } and that some handling of infinite values in the calculation of P1-P0 and P2-P0 can be disabled. uint16_t m_isLinear : 1; ///< A bitflag that specifies whether the value interpolation is linear in the shader. It is unused by the Gnm runtime. uint16_t m_isCustom : 1; ///< if (m_interpF16 == 0) A bitflag that specifies whether the value interpolation is custom in the shader. It determines whether hardware subtraction should be disabled, supplying { P0, P1, P2 } to the shader instead of { P0, P1-P0, P2-P0 }. uint16_t m_reserved : 3; ///< Unused; set to zero. }; // NEO mode only: struct { uint16_t : 12; ///< Description to be specified. uint16_t m_defaultValueHi : 2; ///< if (m_interpF16 != 0) indicates the default value supplied to the shader for the upper 16-bits if m_semantic is not matched in the VS shader, and m_defaultValue indicates the default value for the lower 16-bits. uint16_t m_interpF16 : 2; ///< if (m_interpF16 == 0) this is a 32-bit float or custom value; if (m_interpF16 & 1) the low 16-bits of this parameter expect 16-bit float interpolation and/or default value; if (m_interpF16 & 2) the high 16-bits of this parameter expect 16-bit float interpolation and/or default value }; }; #ifdef __cplusplus }; #else // __cplusplus } PixelInputSemantic; #endif // __cplusplus class VsShader { public: ShaderCommonData m_common; ///< The common data for all shader stages. VsStageRegisters m_vsStageRegisters; ///< The data to be loaded into the VS shader stage registers. Please see DrawCommandBuffer::setVsShader() for more information. // not used if domain shader => vertex shader uint8_t m_numInputSemantics; ///< The number of entries in the input semantic table. uint8_t m_numExportSemantics; ///< The number of entries in the export semantic table. uint8_t m_gsModeOrNumInputSemanticsCs; ///< Stores a union of VsShaderGsMode values for a VsShader or GsShader::getCopyShader(), which are translated into a GsMode constant. For CsVsShader::getVertexShader() with CsVsShader::getComputeShader()->m_version==0, the number of input semantic table entries to use for the CsVsShader::getComputeShader() fetch shader is stored. uint8_t m_fetchControl; ///< The user registers that receive vertex and instance offsets for use in the fetch shader. /** @brief Patches the GPU address of the shader code. @param[in] gpuAddress This address to patch. This must be aligned to a 256-byte boundary. */ void patchShaderGpuAddress(void *gpuAddress) { m_vsStageRegisters.patchShaderGpuAddress(gpuAddress); } void *getBaseAddress() const { return (void *)((((uintptr_t)m_vsStageRegisters.m_spiShaderPgmHiVs) << 40) | (((uintptr_t)m_vsStageRegisters.m_spiShaderPgmLoVs) << 8)); } /** @brief Gets a pointer to this shader's input usage slot table that immediately follows this shader's structure in memory. @return A pointer to this shader's input usage slot table. */ const InputUsageSlot *getInputUsageSlotTable() const { return (const InputUsageSlot *)(this + 1); } /** @brief Gets a pointer to this shader's input semantic table that immediately follows the input usage table in memory. @return A pointer to this shader's input semantic table. */ const VertexInputSemantic *getInputSemanticTable() const { return (const VertexInputSemantic *)(getInputUsageSlotTable() + m_common.m_numInputUsageSlots); } /** @brief Gets a pointer to this shader's export semantic table that immediately follows the input semantic table in memory. @return A pointer to this shader's export semantic table. */ const VertexExportSemantic *getExportSemanticTable() const { return (const VertexExportSemantic *)(getInputSemanticTable() + m_numInputSemantics); } /** @brief Computes the total size (in bytes) of the shader binary including this structure, the input usage table, and the input and export semantic tables. @return The total size in bytes of this shader binary and its associated tables. */ uint32_t computeSize() const { const uint32_t size = sizeof(VsShader) + sizeof(InputUsageSlot) * m_common.m_numInputUsageSlots + sizeof(VertexInputSemantic) * m_numInputSemantics + sizeof(VertexExportSemantic) * m_numExportSemantics; return (size + 3) & ~3U; } /** @brief Gets the user register that contains the vertex offset. @return The index of the register containing the vertex offset. A value of 0 indicates no register contains the vertex offset. */ uint8_t getVertexOffsetUserRegister() const { return m_fetchControl & 0xf; } /** @brief Gets the user register that contains the instance offset. @return The index of the register containing the instance offset. A value of 0 indicates no register contains the instance offset. */ uint8_t getInstanceOffsetUserRegister() const { return (m_fetchControl >> 4) & 0xf; } }; class PsShader { public: ShaderCommonData m_common; ///< The common data for all shader stages. PsStageRegisters m_psStageRegisters; ///< The data to be loaded into the PS shader stage registers. Please see Gnm::DrawCommandBuffer::setPsShader() for more details. uint8_t m_numInputSemantics; ///< The number of entries in the input semantic table. uint8_t m_reserved[3]; ///< Unused /** @brief Patches the GPU address of the shader code. @param[in] gpuAddress The address to patch. This must be aligned to a 256-byte boundary. */ void patchShaderGpuAddress(void *gpuAddress) { m_psStageRegisters.patchShaderGpuAddress(gpuAddress); } /** @brief Retrieves the GPU address of the shader code. @return The address of the shader code. */ void *getBaseAddress() const { return (void *)((((uintptr_t)m_psStageRegisters.m_spiShaderPgmHiPs) << 40) | (((uintptr_t)m_psStageRegisters.m_spiShaderPgmLoPs) << 8)); } /** @brief Gets a pointer to this shader's input usage slot table that immediately follows this shader's structure in memory. @return A pointer to this shader's input usage slot table. */ const InputUsageSlot *getInputUsageSlotTable() const { return (const InputUsageSlot *)(this + 1); } /** @brief Gets a pointer to this shader's input semantic table that immediately follows the input usage table in memory. @return A pointer to this shader's input semantic table. */ const PixelInputSemantic *getPixelInputSemanticTable() const { return (const PixelInputSemantic *)(getInputUsageSlotTable() + m_common.m_numInputUsageSlots); } /** @brief Computes the total size (in bytes) of the shader binary including this structure, the input usage table and the input semantic table. @return The total size in bytes of this shader binary and its associated tables. */ uint32_t computeSize() const { const uint32_t size = sizeof(PsShader) + sizeof(InputUsageSlot) * m_common.m_numInputUsageSlots + sizeof(PixelInputSemantic) * m_numInputSemantics; return (size + 3) & ~3U; } }; void parseShader(ShaderInfo *shaderInfo, const void* data) { const Header *binaryHeader = (const Header*)(data); const ShaderFileHeader *header = (const ShaderFileHeader*)(binaryHeader + 1); const ShaderCommonData *shaderCommon = (const ShaderCommonData*)(header + 1); const uint32_t *sbAddress = (const uint32_t*)(shaderCommon + 1); const uint32_t sbOffsetInDW = sbAddress[0] >> 2; shaderInfo->m_shaderStruct = (void*)shaderCommon; shaderInfo->m_gpuShaderCode = (uint32_t*)shaderCommon + sbOffsetInDW; shaderInfo->m_gpuShaderCodeSize = shaderCommon->computeShaderCodeSizeInBytes(); } ShaderBinaryInfo* findShaderBinInfo(uint8_t* code) { for (size_t i = 0; i != 0xFFFFFFFF; i++) { if (!std::memcmp(&code[i], "OrbShdr", 7)) { return (ShaderBinaryInfo*)&code[i]; } } return nullptr; } void generateInputResourceOffsetTable(InputResourceOffsets* outTable, ShaderBinaryInfo* sb) { SCE_GNM_ASSERT(outTable != NULL); //SCE_GNM_ASSERT(shaderStage <= kShaderStageCount); // Get resource info to populate ShaderResourceOffsets //ShaderBinaryInfo const *shaderBinaryInfo = (ShaderBinaryInfo const*)((uintptr_t)shaderCode + shaderCodeSizeInBytes - sizeof(ShaderBinaryInfo)); ShaderBinaryInfo const *shaderBinaryInfo = sb; //SCE_GNM_ASSERT((*(reinterpret_cast(shaderBinaryInfo->m_signature)) & kShaderBinaryInfoSignatureMask) == kShaderBinaryInfoSignatureU64); // Get usage masks and input usage slots uint32_t const* usageMasks = reinterpret_cast((unsigned char const*)shaderBinaryInfo - shaderBinaryInfo->m_chunkUsageBaseOffsetInDW * 4); int32_t inputUsageSlotsCount = shaderBinaryInfo->m_numInputUsageSlots; InputUsageSlot const* inputUsageSlots = (InputUsageSlot const*)usageMasks - inputUsageSlotsCount; // Cache shader input information into the ShaderResource Offsets table __builtin_memset(outTable, 0xFF, sizeof(InputResourceOffsets)); outTable->initSupportedResourceCounts(); //outTable->shaderStage = isDispatchDraw && shaderStage == kShaderStageCs ? kShaderStageAsynchronousCompute : shaderStage; //outTable->isSrtShader = isSrtUsed; int32_t lastUserDataResourceSizeInDwords = 0; uint16_t requiredMemorySizeInDwords = 0; // Here we handle all immediate resources s[1:16] plus s[16:48] (extended user data) // resources that go into the extended user data also have "immediate" usage type, although they are stored in a table (not loaded by the SPI) for (int32_t i = 0; i < inputUsageSlotsCount; ++i) { uint8_t apiSlot = inputUsageSlots[i].m_apiSlot; uint8_t startRegister = inputUsageSlots[i].m_startRegister; bool isVSharp = (inputUsageSlots[i].m_resourceType == 0); uint16_t vsharpFlag = (isVSharp) ? kResourceIsVSharp : 0; uint16_t extendedRegisterOffsetInDwords = (startRegister >= kMaxUserDataCount) ? (startRegister - kMaxUserDataCount) : 0; requiredMemorySizeInDwords = (requiredMemorySizeInDwords > extendedRegisterOffsetInDwords) ? requiredMemorySizeInDwords : extendedRegisterOffsetInDwords; // Handle immediate resources, including some pointer types switch (inputUsageSlots[i].m_usageType) { case kShaderInputUsageImmGdsCounterRange: outTable->appendConsumeCounterSgpr = startRegister; break; case kShaderInputUsageImmGdsMemoryRange: outTable->gdsMemoryRangeSgpr = startRegister; break; case kShaderInputUsageImmLdsEsGsSize: outTable->ldsEsGsSizeSgpr = startRegister; break; case kShaderInputUsageSubPtrFetchShader: SCE_GNM_ASSERT(apiSlot == 0); outTable->fetchShaderPtrSgpr = startRegister; break; case kShaderInputUsagePtrInternalGlobalTable: SCE_GNM_ASSERT(apiSlot == 0); outTable->globalInternalPtrSgpr = startRegister; break; case kShaderInputUsagePtrExtendedUserData: SCE_GNM_ASSERT(apiSlot == 1); outTable->userExtendedData1PtrSgpr = startRegister; break; case kShaderInputUsageImmGdsKickRingBufferOffset: SCE_GNM_ASSERT(apiSlot == 0); SCE_GNM_ASSERT(startRegister < kMaxUserDataCount); outTable->gdsKickRingBufferOffsetSgpr = startRegister; break; case kShaderInputUsageImmVertexRingBufferOffset: SCE_GNM_ASSERT(apiSlot == 0); SCE_GNM_ASSERT(startRegister < kMaxUserDataCount); outTable->vertexRingBufferOffsetSgpr = startRegister; break; case kShaderInputUsagePtrDispatchDraw: SCE_GNM_ASSERT(apiSlot == 0); outTable->dispatchDrawPtrSgpr = startRegister; break; case kShaderInputUsageImmDispatchDrawInstances: SCE_GNM_ASSERT(apiSlot == 0); SCE_GNM_ASSERT(startRegister < kMaxUserDataCount); outTable->dispatchDrawInstancesSgpr = startRegister; break; // below resources can either be inside UserData or the EUD case kShaderInputUsageImmResource: SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->resourceSlotCount); outTable->resourceDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ? (kResourceInUserDataSgpr | vsharpFlag | startRegister) : (vsharpFlag | extendedRegisterOffsetInDwords); lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeResource; break; case kShaderInputUsageImmRwResource: SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->rwResourceSlotCount); outTable->rwResourceDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ? (kResourceInUserDataSgpr | vsharpFlag | startRegister) : (vsharpFlag | extendedRegisterOffsetInDwords); lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeRwResource; break; case kShaderInputUsageImmSampler: SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->samplerSlotCount); outTable->samplerDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ? (kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords; lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeSampler; break; case kShaderInputUsageImmConstBuffer: SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->constBufferSlotCount); outTable->constBufferDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ? (kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords; lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeConstantBuffer; break; case kShaderInputUsageImmVertexBuffer: SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->vertexBufferSlotCount); outTable->vertexBufferDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ? (kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords; lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeVertexBuffer; break; // SRTs will always reside inside the Imm UserData (dwords 0-15), as opposed to the // above resources which can exist in the EUD case kShaderInputUsageImmShaderResourceTable: SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < kMaxUserDataCount); outTable->userSrtDataSgpr = inputUsageSlots[i].m_startRegister; outTable->userSrtDataCount = inputUsageSlots[i].m_srtSizeInDWordMinusOne + 1; break; // case kShaderInputUsagePtrSoBufferTable: // Only present in the VS copy-shader that doesn't have a footer // outTable->streamOutPtrSgpr = startRegister; // break; } } /* // Make sure we can fit a T# (if required) in the last userOffset requiredMemorySizeInDwords += lastUserDataResourceSizeInDwords; // Now handle only pointers to resource-tables. Items handled below cannot be found more than once #if SCE_GNM_LCUE_USE_VERTEX_BUFFER_TABLE_MASK_IF_AVAILABLE // Note: in order to maintain binary compatibility, we can only put a new chunk mask for kShaderInputUsagePtrVertexBufferTable at the end of all other chunk masks bool bUseVertexBufferTableChunkMask = false; #endif for (int32_t i = 0; i < inputUsageSlotsCount; ++i) { uint8_t maskChunks = inputUsageSlots[i].m_chunkMask; const uint64_t kNibbleToCount = 0x4332322132212110ull; uint8_t chunksCount = (kNibbleToCount >> ((maskChunks & 0xF) * 4)) & 0xF; SCE_GNM_ASSERT(usageMasks + chunksCount <= (uint32_t const*)shaderBinaryInfo); // Lets fill the resource indices first uint32_t usedApiSlots[kSlotCountResource]; // Use the size of the biggest resource table uint32_t usedApiSlotCount; // This thing will break if there's more than 1 table for any resource type uint8_t startRegister = inputUsageSlots[i].m_startRegister; switch (inputUsageSlots[i].m_usageType) { case kShaderInputUsagePtrResourceTable: { SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0); outTable->resourcePtrSgpr = startRegister; outTable->resourceArrayDwOffset = requiredMemorySizeInDwords; if (!(maskChunks & 0xF)) break; SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo); uint32_t maskArray[4] = { 0, 0, 0, 0 }; // Max 128 slots are supported in the kShaderInputUsagePtrResourceTable if (maskChunks & 1) maskArray[0] = *usageMasks++; // get slots 0-31 which are set in Chunk 0 if (maskChunks & 2) maskArray[1] = *usageMasks++; // get slots 32-63 which are set in Chunk 1 if (maskChunks & 4) maskArray[2] = *usageMasks++; // get slots 64-95 which are set in Chunk 2 if (maskChunks & 8) maskArray[3] = *usageMasks++; // get slots 96-127 which are set in Chunk 3 SCE_GNM_ASSERT(usageMasks <= (uint32_t const*)shaderBinaryInfo); usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->resourceSlotCount, maskArray, kSlotCountResource); SCE_GNM_ASSERT(usedApiSlotCount > 0); uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; for (uint8_t j = 0; j < usedApiSlotCount; j++) { uint16_t currentApiSlot = static_cast(usedApiSlots[j]); outTable->resourceDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeResource; } requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeResource; } break; case kShaderInputUsagePtrRwResourceTable: { SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0); outTable->rwResourcePtrSgpr = startRegister; outTable->rwResourceArrayDwOffset = requiredMemorySizeInDwords; if (!(maskChunks & 1)) break; SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo); usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->rwResourceSlotCount, *usageMasks++, kSlotCountRwResource); SCE_GNM_ASSERT(usedApiSlotCount > 0); uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; for (uint8_t j = 0; j < usedApiSlotCount; j++) { uint16_t currentApiSlot = static_cast(usedApiSlots[j]); outTable->rwResourceDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeRwResource; } requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeRwResource; } break; case kShaderInputUsagePtrConstBufferTable: { SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0); outTable->constBufferPtrSgpr = startRegister; outTable->constBufferArrayDwOffset = requiredMemorySizeInDwords; if (!(maskChunks & 1)) break; SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo); usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->constBufferSlotCount, *usageMasks++, kSlotCountConstantBuffer); SCE_GNM_ASSERT(usedApiSlotCount > 0); uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; for (uint8_t j = 0; j < usedApiSlotCount; j++) { uint16_t currentApiSlot = static_cast(usedApiSlots[j]); outTable->constBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeConstantBuffer; } requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeConstantBuffer; } break; case kShaderInputUsagePtrSamplerTable: { SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0); outTable->samplerPtrSgpr = startRegister; outTable->samplerArrayDwOffset = requiredMemorySizeInDwords; if (!(maskChunks & 1)) break; SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo); usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->samplerSlotCount, *usageMasks++, kSlotCountSampler); SCE_GNM_ASSERT(usedApiSlotCount > 0); uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; for (uint8_t j = 0; j < usedApiSlotCount; j++) { uint16_t currentApiSlot = static_cast(usedApiSlots[j]); outTable->samplerDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeSampler; } requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeSampler; } break; case kShaderInputUsagePtrVertexBufferTable: { SCE_GNM_ASSERT(shaderStage == kShaderStageLs || shaderStage == kShaderStageEs || shaderStage == kShaderStageVs || shaderStage == kShaderStageCs); SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0); outTable->vertexBufferPtrSgpr = startRegister; outTable->vertexBufferArrayDwOffset = requiredMemorySizeInDwords; #if SCE_GNM_LCUE_USE_VERTEX_BUFFER_TABLE_MASK_IF_AVAILABLE if (maskChunks & 1) { // Skip updating for the vertex buffer table below, since we are using the chunk mask at the end // we'll update it after everything else. bUseVertexBufferTableChunkMask = true; continue; } #endif SCE_GNM_ASSERT(gnmxShaderStruct != NULL); const VertexInputSemantic* semanticTable = NULL; usedApiSlotCount = 0; if (shaderStage == kShaderStageVs && isDispatchDraw) { VsShader const* pVsShader = ((CsVsShader*)gnmxShaderStruct)->getVertexShader(); usedApiSlotCount = pVsShader->m_numInputSemantics; semanticTable = pVsShader->getInputSemanticTable(); } else if (shaderStage == kShaderStageVs) { usedApiSlotCount = ((VsShader*)gnmxShaderStruct)->m_numInputSemantics; semanticTable = ((VsShader*)gnmxShaderStruct)->getInputSemanticTable(); } else if (shaderStage == kShaderStageLs) { usedApiSlotCount = ((LsShader*)gnmxShaderStruct)->m_numInputSemantics; semanticTable = ((LsShader*)gnmxShaderStruct)->getInputSemanticTable(); } else if (shaderStage == kShaderStageEs) { usedApiSlotCount = ((EsShader*)gnmxShaderStruct)->m_numInputSemantics; semanticTable = ((EsShader*)gnmxShaderStruct)->getInputSemanticTable(); } if (shaderStage == kShaderStageCs && isDispatchDraw) { VsShader const* pVsShader = ((CsVsShader*)gnmxShaderStruct)->getVertexShader(); CsShader const* pCsShader = ((CsVsShader*)gnmxShaderStruct)->getComputeShader(); if (pCsShader->m_version >= kDdCsShaderVersion_IndependentCsFetchShader) { usedApiSlotCount = pCsShader->m_numInputSemantics; semanticTable = pCsShader->getInputSemanticTable(); } else { usedApiSlotCount = pVsShader->m_gsModeOrNumInputSemanticsCs; semanticTable = pVsShader->getInputSemanticTable(); } } else if (shaderStage == kShaderStageCs) { CsShader const* pCsShader = ((CsShader*)gnmxShaderStruct); if (pCsShader->m_version >= kDdCsShaderVersion_IndependentCsFetchShader) { usedApiSlotCount = pCsShader->m_numInputSemantics; semanticTable = pCsShader->getInputSemanticTable(); } else { usedApiSlotCount = 0; semanticTable = NULL; } } // Check if the shader uses any API-slot over the maximum count configured for the InputResourceOffset table SCE_GNM_ASSERT(usedApiSlotCount > 0 && usedApiSlotCount <= outTable->vertexBufferSlotCount); uint32_t usedApiSlots[kSlotCountVertexBuffer]; // First use what the shader generated for (uint8_t i = 0; i < usedApiSlotCount; i++) { uint8_t semanticIndex = semanticTable[i].m_semantic; SCE_GNM_ASSERT(semanticIndex >= 0 && semanticIndex < outTable->vertexBufferSlotCount); usedApiSlots[i] = semanticIndex; } uint32_t firstUsedApiSlot = usedApiSlots[0]; uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; // If a semanticRemapTable has been provided, override the shaders defined usage slots to conform with the remapped layout if (semanticRemapTable && numElementsInSemanticRemapTable != 0) { // Override values defined in the shader binary header above SCE_GNM_ASSERT(usedApiSlotCount <= numElementsInSemanticRemapTable); usedApiSlotCount = remapVertexBufferOffsetsWithSemanticTable(&usedApiSlots[0], firstUsedApiSlot, lastUsedApiSlot, outTable->vertexBufferSlotCount, semanticRemapTable, numElementsInSemanticRemapTable); } // Generate the final dword offsets for the vertex buffer table lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; for (uint8_t j = 0; j < usedApiSlotCount; j++) { uint16_t currentApiSlot = static_cast(usedApiSlots[j]); outTable->vertexBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeVertexBuffer; } requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeVertexBuffer; } break; } } */ // Note: this must be called after all other tables are processed above, as the vertex buffer table chunk mask (*usageMasks) // is always stored at the end of the chunk mask table /* if (bUseVertexBufferTableChunkMask) { uint32_t usedApiSlots[kSlotCountVertexBuffer]; uint32_t usedApiSlotCount; SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo); usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->vertexBufferSlotCount, *usageMasks++, kSlotCountVertexBuffer); SCE_GNM_ASSERT(usedApiSlotCount > 0); uint32_t firstUsedApiSlot = usedApiSlots[0]; uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; // If a semanticRemapTable has been provided, override the shaders defined usage slots to conform with the remapped layout if (semanticRemapTable && numElementsInSemanticRemapTable != 0) { // Override values defined in the shader binary header above SCE_GNM_ASSERT((uint32_t)usedApiSlotCount <= numElementsInSemanticRemapTable); usedApiSlotCount = remapVertexBufferOffsetsWithSemanticTable(&usedApiSlots[0], firstUsedApiSlot, lastUsedApiSlot, outTable->vertexBufferSlotCount, semanticRemapTable, numElementsInSemanticRemapTable); } // Generate the final dword offsets for the vertex buffer table lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1]; for (uint8_t j = 0; j < usedApiSlotCount; j++) { int32_t currentApiSlot = usedApiSlots[j]; outTable->vertexBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeVertexBuffer; } requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeVertexBuffer; } */ // Final amount of memory the shader will use from the scratch and resource buffer outTable->requiredBufferSizeInDwords = requiredMemorySizeInDwords; // Checking for non handled input data for (int32_t i = 0; i < inputUsageSlotsCount; ++i) { switch (inputUsageSlots[i].m_usageType) { case kShaderInputUsageImmResource: case kShaderInputUsageImmRwResource: case kShaderInputUsageImmSampler: case kShaderInputUsageImmConstBuffer: case kShaderInputUsageImmVertexBuffer: case kShaderInputUsageImmShaderResourceTable: case kShaderInputUsageSubPtrFetchShader: case kShaderInputUsagePtrExtendedUserData: case kShaderInputUsagePtrResourceTable: case kShaderInputUsagePtrRwResourceTable: case kShaderInputUsagePtrConstBufferTable: case kShaderInputUsagePtrVertexBufferTable: case kShaderInputUsagePtrSamplerTable: case kShaderInputUsagePtrInternalGlobalTable: case kShaderInputUsageImmGdsCounterRange: case kShaderInputUsageImmGdsMemoryRange: case kShaderInputUsageImmLdsEsGsSize: case kShaderInputUsageImmGdsKickRingBufferOffset: case kShaderInputUsageImmVertexRingBufferOffset: case kShaderInputUsagePtrDispatchDraw: case kShaderInputUsageImmDispatchDrawInstances: // case kShaderInputUsagePtrSoBufferTable: // Only present in the VS copy-shader that doesn't have a footer break; default: // Not handled yet (false, "Input Usage Slot type %d is not supported by LCUE sce::generateInputResourceOffsetTable()", inputUsageSlots[i].m_usageType); break; } } } int generateVsFetchShaderBuildState(FetchShaderBuildState *fsbs, const VsStageRegisters *pregs, uint32_t numInputs, const FetchShaderInstancingMode *instancingData, const uint32_t numElementsInInstancingData, uint8_t vertexBaseUserSgpr, uint8_t instanceBaseUsgpr) { uint32_t spiShaderPgmRsrc2Vs; // ebp uint32_t spiShaderPgmRsrc1Vs; // er14 uint32_t firstFreeSgpr; // ebp _BOOL4 v10; // ebx unsigned int vsharpSlotCount; // er15 int v12; // er14 int v13; // esi __int16 result; // ax bool noInstance; // zf signed int v16; // ecx spiShaderPgmRsrc1Vs = pregs->m_spiShaderPgmRsrc1Vs; spiShaderPgmRsrc2Vs = pregs->m_spiShaderPgmRsrc2Vs; firstFreeSgpr = ((((unsigned __int8)(spiShaderPgmRsrc2Vs >> 7) | (unsigned __int8)(spiShaderPgmRsrc2Vs >> 12)) & 1) + ((spiShaderPgmRsrc2Vs >> 24) & 1) + ((spiShaderPgmRsrc2Vs >> 12) & 1) + ((spiShaderPgmRsrc2Vs >> 11) & 1) + ((spiShaderPgmRsrc2Vs >> 10) & 1) + ((spiShaderPgmRsrc2Vs >> 9) & 1) + ((unsigned __int8)spiShaderPgmRsrc2Vs >> 8) + ((spiShaderPgmRsrc2Vs >> 7) & 1) + (spiShaderPgmRsrc2Vs & 1) + ((spiShaderPgmRsrc2Vs >> 1) & 0x1F) + 3) & 0xFFFFFFFC; v10 = (((spiShaderPgmRsrc1Vs >> 3) & 0x78) + 8 - firstFreeSgpr) >> 2 == 0; vsharpSlotCount = ((((spiShaderPgmRsrc1Vs >> 3) & 0x78) + 8 - firstFreeSgpr) >> 2) + 2 * v10; v12 = spiShaderPgmRsrc1Vs & 0x3000000; v13 = (((_BYTE)v10 + (unsigned __int8)(pregs->m_spiShaderPgmRsrc1Vs >> 6)) & 0xF) << 6; fsbs->m_fetchShaderBufferSize = 4 * ((numInputs + vsharpSlotCount - 1) / vsharpSlotCount + (instanceBaseUsgpr != 0) + numInputs - ((vertexBaseUserSgpr < 1u) - 1)) + 8 * numInputs + 12; fsbs->m_fetchShaderFlags = 0; fsbs->m_firstFreeSgpr = firstFreeSgpr; fsbs->m_vsharpSlotCount = vsharpSlotCount; fsbs->m_fetchShaderInstancingData = instancingData; result = 0; noInstance = instancingData == 0LL; if (instancingData) result = numElementsInInstancingData; v16 = 0x3000000; if (noInstance) v16 = v12; fsbs->m_numElementsInInstancingData = result; fsbs->m_vertexBaseUsgpr = vertexBaseUserSgpr; fsbs->m_instanceBaseUsgpr = instanceBaseUsgpr; fsbs->m_reserved1 = 0; fsbs->m_reserved2 = 0; fsbs->m_reserved0 = 0; fsbs->m_shaderModifier = v13 | v16; return result; } int generateFetchShader(uint32_t *fs, FetchShaderBuildState *fsbs) { FetchShaderBuildState *fsbs_; // r11 __int64 *v3; // rsi char vbtStartRegister; // r14 uint16_t v5; // ax const InputUsageSlot *inputUsageSlot; // rcx unsigned __int64 i; // rdx signed __int64 idx; // r9 unsigned __int64 inputSemaCount; // rdi uint8_t vsharpSlotCount; // bl _DWORD *pEleNumInRemapTable; // r15 __int64 v12; // rdx unsigned int k; // er13 int shiftVbtStartRegister; // er14 unsigned __int64 m; // r10 unsigned int n; // er8 const uint32_t *semanticsRemapTable; // rbx int v18; // er12 int semanticIdx; // edx __int64 j; // rax unsigned int firstFreeSgprAndVsharp; // eax char v22; // cl int v23; // edx __int64 v24; // rcx __int64 v25; // rax unsigned int v26; // esi __int64 v27; // rcx const FetchShaderInstancingMode *fetchShaderInstancingData; // r12 const VertexInputSemantic *inputSemantics; // rdx __int64 t; // r14 int v31; // er11 signed __int64 pSizeInElements; // r8 unsigned int v33; // edx unsigned int v34; // er13 unsigned int v35; // er15 int vgpr; // edi int sizeInElmt; // esi unsigned int v38; // ebx unsigned int v39; // ecx __int64 v40; // rbx __int64 v41; // rdi __int64 v42; // rcx __int64 v43; // rax __int64 v44; // r9 unsigned __int64 numElementsInInstancingData; // [rsp+8h] [rbp-98h] __int64 v47; // [rsp+10h] [rbp-90h] signed __int64 v48; // [rsp+18h] [rbp-88h] __int64 v49; // [rsp+20h] [rbp-80h] FetchShaderBuildState *fsbs_bak; // [rsp+28h] [rbp-78h] unsigned __int64 numInputSemantics; // [rsp+30h] [rbp-70h] signed __int64 pEleNumInRemapTable_; // [rsp+38h] [rbp-68h] int v53; // [rsp+44h] [rbp-5Ch] int v54; // [rsp+48h] [rbp-58h] int shiftVbtStartRegister_; // [rsp+4Ch] [rbp-54h] uint32_t *fs_dst; // [rsp+50h] [rbp-50h] uint8_t vsharpSlotCount_; // [rsp+5Fh] [rbp-41h] __int64 v58; // [rsp+60h] [rbp-40h] int v59; // [rsp+68h] [rbp-38h] int v60; // [rsp+6Ch] [rbp-34h] __int64 v61; // [rsp+70h] [rbp-30h] fsbs_ = fsbs; //v3 = (__int64 *)loc_96B; fs_dst = fs; vbtStartRegister = -1; //v61 = *loc_96B; v5 = fsbs_->m_fetchShaderFlags; v58 = 0x300000000LL; v59 = (v5 & 1) + 1; v60 = 2; if (fsbs_->m_numInputUsageSlots) { inputUsageSlot = fsbs_->m_inputUsageSlots; vbtStartRegister = -1; i = 0LL; do { if (inputUsageSlot[i].m_usageType == kShaderInputUsagePtrVertexBufferTable) vbtStartRegister = inputUsageSlot[i].m_startRegister; ++i; } while (i < fsbs_->m_numInputUsageSlots); } idx = 0LL; if (fsbs_->m_vertexBaseUsgpr) { fs[idx] = fsbs_->m_vertexBaseUsgpr | 0x4A000000; ++idx; } if (fsbs_->m_instanceBaseUsgpr) { fs[idx] = fsbs_->m_instanceBaseUsgpr | 0x4A060600; ++idx; } inputSemaCount = fsbs_->m_numInputSemantics; if (fsbs_->m_numInputSemantics) { vsharpSlotCount = fsbs_->m_vsharpSlotCount; pEleNumInRemapTable = &fsbs_->m_numElementsInRemapTable; v12 = 0LL; k = 0; fsbs_bak = fsbs_; numInputSemantics = fsbs_->m_numInputSemantics; shiftVbtStartRegister = (vbtStartRegister & 0x7E) << 8; pEleNumInRemapTable_ = (signed __int64)&fsbs_->m_numElementsInRemapTable; shiftVbtStartRegister_ = shiftVbtStartRegister; vsharpSlotCount_ = fsbs_->m_vsharpSlotCount; do { if (vsharpSlotCount) { m = (unsigned int)v12; n = 0; while (1) { v12 = (unsigned int)m; if (m >= inputSemaCount) break; semanticsRemapTable = fsbs_->m_semanticsRemapTable; v18 = m; semanticIdx = fsbs_->m_inputSemantics[m].m_semantic; if (semanticsRemapTable && *pEleNumInRemapTable) { j = 0LL; do { if (semanticsRemapTable[j] == semanticIdx) break; ++j; } while ((unsigned int)j < *pEleNumInRemapTable); LOBYTE(semanticIdx) = j; } firstFreeSgprAndVsharp = *(unsigned __int16 *)&fsbs_->m_firstFreeSgpr; ++m; v22 = firstFreeSgprAndVsharp + 4 * n++; v23 = ((v22 & 0x7F) << 15) | shiftVbtStartRegister | 4 * (semanticIdx & 0x3F); v24 = (unsigned int)idx; ++idx; fs_dst[v24] = v23 | 0xC0800100; if (n >= firstFreeSgprAndVsharp >> 8) { v12 = (unsigned int)(v18 + 1); break; } } vsharpSlotCount = vsharpSlotCount_; } v25 = (unsigned int)idx; ++idx; fs_dst[v25] = 0xBF8C007F; if ((unsigned int)v12 > k) { v26 = *(_DWORD *)&fsbs_->m_firstFreeSgpr; v27 = v12; fetchShaderInstancingData = fsbs_->m_fetchShaderInstancingData; inputSemantics = fsbs_->m_inputSemantics; t = 0LL; v49 = v27; v53 = 2 * k; v54 = 2 * v27; v47 = k; v31 = k - v27; pSizeInElements = (signed __int64)&inputSemantics[k].m_sizeInElements; v33 = idx; numElementsInInstancingData = v26 >> 16; v48 = (signed __int64)&fetchShaderInstancingData[k]; v34 = (v26 & 0xFFFFFFFC) << 14; do { v35 = 0; if (fetchShaderInstancingData && v47 + t < numElementsInInstancingData) v35 = *(_DWORD *)(v48 + 4 * t); vgpr = *(unsigned __int8 *)(pSizeInElements + 4 * t - 1); sizeInElmt = *(unsigned __int8 *)(pSizeInElements + 4 * t); v38 = v34; ++t; v34 += 0x10000; v39 = (v38 & 0x1F0000) + (*((unsigned __int8 *)&v58 + 4 * v35) | (vgpr << 8)) + 0x80000000; v40 = v33; v41 = v33 + 1; v33 += 2; fs_dst[v40] = ((sizeInElmt << 18) + 0x1FC0000) & 0x1FC0000 | 0xE0002000; fs_dst[v41] = v39; } while (v31 + (_DWORD)t); v12 = v49; fsbs_ = fsbs_bak; shiftVbtStartRegister = shiftVbtStartRegister_; inputSemaCount = numInputSemantics; pEleNumInRemapTable = (_DWORD *)pEleNumInRemapTable_; vsharpSlotCount = vsharpSlotCount_; LODWORD(idx) = v54 + idx - v53; k = v49; } } while (k < (unsigned int)inputSemaCount); //v3 = (__int64 *)loc_C1E; } else { pEleNumInRemapTable = &fsbs_->m_numElementsInRemapTable; } v42 = (unsigned int)idx; v43 = (unsigned int)(idx + 1); v44 = (unsigned int)(idx + 2); fs_dst[v42] = 0xBF8C0000; fs_dst[v43] = 0xBE802000; if (*pEleNumInRemapTable) LODWORD(inputSemaCount) = *pEleNumInRemapTable; fs_dst[v44] = inputSemaCount; return 0; } int getGpuMode() { return 1; } void generatePsShaderUsageTable(uint32_t *inputTable, const VertexExportSemantic *vsTable, uint32_t vsTableNumItems, const PixelInputSemantic *psTable, uint32_t psTableNumItem) { const PixelInputSemantic *psTab; // r12 uint32_t numVsExportSemantics; // ebx __int64 idx; // r14 int v8; // ecx char v9; // r8 __int64 v10; // rax signed __int64 v11; // rax unsigned __int64 v12; // rax unsigned __int64 v13; // rcx unsigned __int64 v14; // rdx unsigned __int64 v15; // rax __int64 matchVsIdx; // r15 char hasMatchedVsSema; // r13 const PixelInputSemantic *psTab_; // r8 unsigned int v19; // edx unsigned int v20; // ecx uint32_t numPsInputSemantics; // [rsp+4h] [rbp-3Ch] const VertexExportSemantic *vsTab; // [rsp+10h] [rbp-30h] psTab = psTable; numVsExportSemantics = vsTableNumItems; numPsInputSemantics = psTableNumItem; if (psTableNumItem) { idx = 0LL; vsTab = vsTable; do { LODWORD(matchVsIdx) = 0; hasMatchedVsSema = 0; if (numVsExportSemantics) { matchVsIdx = 0LL; while (vsTable[matchVsIdx].m_semantic != psTab[idx].m_semantic) { if ((unsigned int)++matchVsIdx >= numVsExportSemantics) { hasMatchedVsSema = 0; goto LABEL_14; } } hasMatchedVsSema = 1; } LABEL_14: if (getGpuMode() == 1 && psTab[idx].m_interpF16) { if (hasMatchedVsSema) { vsTable = vsTab; v9 = *((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) & 0x1F; v8 = (unsigned __int16)(psTab[idx].m_interpF16) & ~(unsigned __int8)(*((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) >> 6); } else { vsTable = vsTab; v8 = 0; v9 = 0x20; } inputTable[idx] = 0x80000; v10 = (*(_WORD *)&psTab[idx] << 10) & 0x1000000; inputTable[idx] = v10 + 0x80000; v11 = ((*(_WORD *)&psTab[idx] << 10) & 0x2000000) + v10 + 0x80000; inputTable[idx] = v11; if (*(_WORD *)&psTab[idx] < 0xC000u) { v15 = ((unsigned __int8)v9 | (unsigned __int8)(32 * (v8 != 0))) & 0x3F | (unsigned __int64)v11; inputTable[idx] = v15; v14 = v15 | (((*(_WORD *)&psTab[idx] >> (4 * v8 & 4 ^ 0xC)) & 3LL) << 8); } else { v12 = ((unsigned __int8)v9 | (unsigned __int8)(32 * v8)) & 0x3F | (unsigned __int64)v11; inputTable[idx] = v12; v13 = v12 | ((unsigned __int64)(*((_BYTE *)&psTab[idx] + 1) & 3) << 8) | (v8 << 19) & 0x100000; inputTable[idx] = v13; v14 = v13 & 0xFFFFFFFFFF9FFFFFLL | (*(_WORD *)&psTab[idx] << 9) & 0x600000; } inputTable[idx] = v14; } else { inputTable[idx] = 0; if (hasMatchedVsSema) { vsTable = vsTab; psTab_ = &psTab[idx]; v19 = *((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) & 0x1F | ((unsigned int)*(_WORD *)&psTab[idx] >> 7) & 0x20; inputTable[idx] = v19; v20 = v19 | (((((unsigned int)*(_WORD *)&psTab[idx] >> 10) | ((unsigned int)*(_WORD *)&psTab[idx] >> 12)) & 1) << 10); } else { vsTable = vsTab; psTab_ = &psTab[idx]; v20 = 32; } inputTable[idx] = v20; inputTable[idx] = v20 & 0xFFFFFCFF | ((*((_BYTE *)psTab_ + 1) & 3) << 8); } ++idx; } while ((_DWORD)idx != numPsInputSemantics); } } // 128 bits struct VSharpBuffer { uint64_t base : 44; // base byte address (only 40 bits supported) uint64_t mtype_L1s : 2; // mtype for scalar L1 uint64_t mtype_L2 : 2; // mtype for L2 uint64_t stride : 14; // bytes: 0..16383 uint64_t cache_swizzle : 1; // buffer access. optionally swizzle TC L1 cache banks uint64_t swizzle_en : 1; // swizzle AOS according to stride, index_stride, and element_size, else linear (stride * index + offset) uint32_t num_records; // in units of 'stride' // Destination channel select: // 0=0, 1=1, 4=R, 5=G, 6=B, 7=A uint32_t dst_sel_x : 3; uint32_t dst_sel_y : 3; uint32_t dst_sel_z : 3; uint32_t dst_sel_w : 3; uint32_t nfmt : 3; // numeric data type (float, int, бн) uint32_t dfmt : 4; // # of fields, size of each field. Note: dfmt=0 (invalid) is a special case that will disable buffer access via vector memory ops. uint32_t element_size : 2; // 2, 4, 8, or 16 bytes. Used for swizzled buffer addressing uint32_t index_stride : 2; // 8, 16, 32, or 64. Used for swizzled buffer addressing uint32_t addtid_en : 1; // add thread id to the index for addr calc uint32_t reserved0 : 1; uint32_t hash_en : 1; // 1 = buffer addresses are hashed for better cache perf uint32_t reserved1 : 1; uint32_t mtype : 3; // mtype for L1 uint32_t type : 2; // value == 0 for buf. Overlaps upper 2 bits of 4-bit TYPE field in 128-bit T# resource }; typedef VSharpBuffer GnmBuffer; // T# Texture Descriptor Buffer // 256 bits struct TSharpBuffer { uint64_t baseaddr256 : 38; // base 256-byte aligned address bits [39:8] (top 6 bits are not used) uint64_t mtype_L2 : 2; // mtype for L2 uint64_t min_lod : 12; // fixed point 4.8 minimum LOD (0.0..15.0) uint64_t dfmt : 6; // texture data format; num components, num bits uint64_t nfmt : 4; // texture numeric format; value conversion uint64_t mtype_lsbs : 2; // mtype for L1 (LSBs) uint64_t width : 14; // texture width (0..16383) uint64_t height : 14; // texture height (0..16383) // Specifies the scale factor applied to the perf_z, perf_mip, // aniso_bias, aniso_threshold, lod_bias_sec settings // specified in the associated S#: // 0=0/16, 1=2/16, 2=5/16, 3=7/16, 4=9/16, 5=11/16, 6=14/16, 7=16/16 // The result after scaling is rounded down to the nearest // representable value for the given S# field. // (Note that perf_mod=0 effectively disables these S# // settings, while perf_mod=7 essentially eliminates the // dependency between the T# and S#.) uint64_t perf_mod : 3; uint64_t interlaced : 1; // texture is interlaced // Destination channel select: // 0=0, 1=1, 4=R, 5=G, 6=B, 7=A uint64_t dst_sel_x : 3; uint64_t dst_sel_y : 3; uint64_t dst_sel_z : 3; uint64_t dst_sel_w : 3; uint64_t base_level : 4; uint64_t last_level : 4; uint64_t tiling_idx : 5; uint64_t pow2pad : 1; uint64_t mtype_msb : 1; uint64_t reserved0 : 1; uint64_t type : 4; uint64_t depth : 13; uint64_t pitch : 14; // texture pitch in texels (0..16383); defaults to width uint64_t reserved1 : 5; uint64_t base_array : 13; // first array index (0..16383) uint64_t last_array : 13; // texture height (0..16383) uint64_t reserved2 : 6; uint64_t min_lod_warn : 12; uint64_t counter_bank_id : 8; uint64_t LOD_hdw_cnt_en : 1; uint64_t reserved3 : 43; }; typedef TSharpBuffer GnmTexture; // S# Sampler Descriptor Buffer // 128 bits struct SSharpBuffer { uint64_t clamp_x : 3; uint64_t clamp_y : 3; uint64_t clamp_z : 3; uint64_t max_aniso_ratio : 3; uint64_t depth_compare_func : 3; uint64_t force_unorm_coords : 1; uint64_t aniso_threshold : 3; uint64_t mc_coord_trunc : 1; uint64_t force_degamma : 1; uint64_t aniso_bias : 6; uint64_t trunc_coord : 1; uint64_t disable_cube_wrap : 1; uint64_t filter_mode : 2; uint64_t reserved0 : 1; uint64_t min_lod : 12; uint64_t max_lod : 12; uint64_t perf_mip : 4; uint64_t perf_z : 4; uint64_t lod_bias : 14; uint64_t lod_bias_sec : 6; uint64_t xy_mag_filter : 2; uint64_t xy_min_filter : 2; uint64_t z_filter : 2; uint64_t mip_filter : 2; uint64_t reserved1 : 4; uint64_t border_color_ptr : 12; uint64_t reserved2 : 18; uint64_t border_color_type : 2; }; int main(void) { ShaderInfo shaderInfo; parseShader(&shaderInfo, s_vex_vv); void *shaderBinary = malloc(shaderInfo.m_gpuShaderCodeSize); void *shaderHeader = malloc(shaderInfo.m_vsShader->computeSize()); memcpy(shaderBinary, shaderInfo.m_gpuShaderCode, shaderInfo.m_gpuShaderCodeSize); memcpy(shaderHeader, shaderInfo.m_vsShader, shaderInfo.m_vsShader->computeSize()); ShaderBinaryInfo* binInfo = findShaderBinInfo((uint8_t*)shaderBinary); InputResourceOffsets table; generateInputResourceOffsetTable(&table, binInfo); size_t codeLength = binInfo->m_length + sizeof(ShaderBinaryInfo); VsShader* m_shader = static_cast(shaderHeader); m_shader->patchShaderGpuAddress(shaderBinary); VsShader* vertexShader = m_shader; FetchShaderBuildState fb = { 0 }; generateVsFetchShaderBuildState(&fb, (const VsStageRegisters*)&vertexShader->m_vsStageRegisters, vertexShader->m_numInputSemantics, nullptr, 0, fb.m_vertexBaseUsgpr, fb.m_instanceBaseUsgpr); const InputUsageSlot *inputUsageSlots = vertexShader->getInputUsageSlotTable(); fb.m_numInputSemantics = vertexShader->m_numInputSemantics; fb.m_inputSemantics = vertexShader->getInputSemanticTable(); fb.m_numInputUsageSlots = vertexShader->m_common.m_numInputUsageSlots; fb.m_inputUsageSlots = inputUsageSlots; fb.m_numElementsInRemapTable = 0; fb.m_semanticsRemapTable = 0; uint32_t *fs = (uint32_t *)malloc(fb.m_fetchShaderBufferSize); generateFetchShader(fs, &fb); std::ofstream fout("fetch_shader.bin"); fout.write((char*)fs, fb.m_fetchShaderBufferSize); fout.close(); const InputUsageSlot* slot = m_shader->getInputUsageSlotTable(); const VertexInputSemantic* inputSema = m_shader->getInputSemanticTable(); const VertexExportSemantic* expSema = m_shader->getExportSemanticTable(); // //ShaderInfo psShaderInfo; //parseShader(&psShaderInfo, s_pix_p); //void *psshaderBinary = malloc(psShaderInfo.m_gpuShaderCodeSize); //void *psshaderHeader = malloc(psShaderInfo.m_psShader->computeSize()); //memcpy(psshaderBinary, psShaderInfo.m_gpuS haderCode, psShaderInfo.m_gpuShaderCodeSize); //memcpy(psshaderHeader, psShaderInfo.m_psShader, psShaderInfo.m_psShader->computeSize()); //ShaderBinaryInfo* psbinInfo = findShaderBinInfo((uint8_t*)psshaderBinary); //InputResourceOffsets pstable; //generateInputResourceOffsetTable(&pstable, psbinInfo); //size_t pscodeLength = psbinInfo->m_length + sizeof(ShaderBinaryInfo); //PsShader* m_psshader = static_cast(psshaderHeader); //m_psshader->patchShaderGpuAddress(psshaderBinary); //uint32_t psInputs[32] = {0}; //generatePsShaderUsageTable(psInputs, // vertexShader->getExportSemanticTable(), vertexShader->m_numExportSemantics, // m_psshader->getPixelInputSemanticTable(), m_psshader->m_numInputSemantics); // return binInfo->m_isSrt; }