More SIMD: Add some matrix operations to CrossSIMD (#19773)
* More CrossSIMD functionality
* Use the new SIMD API for the matrix multiplies
This commit is contained in:
parent 8c069917b5
commit eec7853efe
4 changed files with 219 additions and 32 deletions
@@ -11,6 +11,7 @@
// The point of this, as opposed to a float4 array, is to almost force the compiler
// to keep the matrix in registers, rather than loading on every access.
struct Mat4F32 {
    Mat4F32() {}
    Mat4F32(const float *matrix) {
        col0 = _mm_loadu_ps(matrix);
        col1 = _mm_loadu_ps(matrix + 4);
@@ -23,12 +24,118 @@ struct Mat4F32 {
        _mm_storeu_ps(m + 8, col2);
        _mm_storeu_ps(m + 12, col3);
    }

    // Unlike the old one, this one is careful about not loading out-of-range data.
    // The last two loads overlap.
    static Mat4F32 Load4x3(const float *m) {
        Mat4F32 result;
        alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
        alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
        __m128 mask1110 = _mm_loadu_ps((const float *)mask);
        result.col0 = _mm_and_ps(_mm_loadu_ps(m), mask1110);
        result.col1 = _mm_and_ps(_mm_loadu_ps(m + 3), mask1110);
        result.col2 = _mm_and_ps(_mm_loadu_ps(m + 6), mask1110);
        __m128 lastCol = _mm_loadu_ps(m + 8);
        result.col3 = _mm_or_ps(_mm_and_ps(_mm_shuffle_ps(lastCol, lastCol, _MM_SHUFFLE(3, 3, 2, 1)), mask1110), _mm_load_ps(onelane3));
        return result;
    }

    __m128 col0;
    __m128 col1;
    __m128 col2;
    __m128 col3;
};
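A note on Load4x3 above: to stay inside the 12-float source, the last two loads overlap (m + 6 and m + 8), and the mask/shuffle rebuild the final column with an implied 1.0f. A scalar sketch of the intended result (illustrative only, not part of the commit):

// m points at 12 floats, four columns of three floats each.
// col0 = (m[0], m[1],  m[2],  0.0f)
// col1 = (m[3], m[4],  m[5],  0.0f)
// col2 = (m[6], m[7],  m[8],  0.0f)
// col3 = (m[9], m[10], m[11], 1.0f)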
// The columns are spread out between the data*. This is just intermediate storage for multiplication.
struct Mat4x3F32 {
    Mat4x3F32(const float *matrix) {
        data0 = _mm_loadu_ps(matrix);
        data1 = _mm_loadu_ps(matrix + 4);
        data2 = _mm_loadu_ps(matrix + 8);
    }

    __m128 data0;
    __m128 data1;
    __m128 data2;
};
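Because each of the four source columns holds only three floats, the columns straddle the three registers, which is why Mul4x3By4x4 below reads what look like odd lane indices. A sketch of the layout (derived from the loads above; c0..c3 are the source columns):

// data0 = (c0.x, c0.y, c0.z, c1.x)
// data1 = (c1.y, c1.z, c2.x, c2.y)
// data2 = (c2.z, c3.x, c3.y, c3.z)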
// TODO: Check if loading b by 4s and shuffling is cheaper.
inline Mat4F32 MulMem4x4By4x4(const float *a, Mat4F32 b) {
    Mat4F32 result;

    __m128 r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[0]));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[1])));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[2])));
    result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[3])));

    r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[4]));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[5])));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[6])));
    result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[7])));

    r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[8]));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[9])));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[10])));
    result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[11])));

    r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[12]));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[13])));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[14])));
    result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[15])));

    return result;
}
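For reference, the convention implemented here: each group of four floats in a weights the columns of b. A plain scalar sketch of the same computation (the function name is illustrative, not committed code):

// out and a are 16 floats; b is the four Mat4F32 columns stored back-to-back.
static void MulMem4x4By4x4_Reference(float *out, const float *a, const float *b) {
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            out[i * 4 + j] = a[i * 4 + 0] * b[0 * 4 + j]
                           + a[i * 4 + 1] * b[1 * 4 + j]
                           + a[i * 4 + 2] * b[2 * 4 + j]
                           + a[i * 4 + 3] * b[3 * 4 + j];
        }
    }
}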
inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
    Mat4F32 result;

    __m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col0, 0));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col0, 1)));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col0, 2)));
    result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col0, 3)));

    r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col1, 0));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col1, 1)));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col1, 2)));
    result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col1, 3)));

    r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col2, 0));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col2, 1)));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col2, 2)));
    result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col2, 3)));

    r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col3, 0));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col3, 1)));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col3, 2)));
    result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col3, 3)));

    return result;
}

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
    Mat4F32 result;

    __m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 0));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data0, 1)));
    result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data0, 2)));

    r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 3));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 0)));
    result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data1, 1)));

    r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data1, 2));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 3)));
    result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 0)));

    r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data2, 1));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data2, 2)));
    r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 3)));

    // The last entry has an implied 1.0f.
    result.col3 = _mm_add_ps(r_col, b.col3);
    return result;
}
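The shorter sums in Mul4x3By4x4 work because a 4x3 matrix has an implicit fourth component per column: 0 for the first three columns and 1 for the last. The missing term therefore vanishes everywhere except the final column, where b.col3 is added unscaled. In scalar terms (illustrative):

// result.col3 = b.col0 * c3.x + b.col1 * c3.y + b.col2 * c3.z + b.col3 * 1.0f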
struct Vec4S32 {
    __m128i v;
@@ -90,6 +197,13 @@ struct Vec4F32 {
    static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; }
    void Store(float *dst) { _mm_storeu_ps(dst, v); }
    void StoreAligned(float *dst) { _mm_store_ps(dst, v); }
    void Store3(float *dst) {
        // TODO: There might be better ways.
        // Store only the low two floats here; the third lane is written separately below.
        _mm_storel_pd((double *)dst, _mm_castps_pd(v));
        _mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
    }
    static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)src)) }; }

    static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
        __m128i value = _mm_loadl_epi64((const __m128i *)src);
@@ -104,6 +218,14 @@ struct Vec4F32 {
        return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) };
    }

    static Vec4F32 LoadF24x3_One(const uint32_t *src) {
        alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
        alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };

        __m128 value = _mm_castsi128_ps(_mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8));
        return Vec4F32{ _mm_or_ps(_mm_and_ps(value, _mm_load_ps((const float *)mask)), _mm_load_ps(onelane3)) };
    }
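LoadF24x3_One exists because the PSP's GE registers keep values such as the viewport scale as 24-bit floats: the low 24 bits of the register are the top 24 bits of an IEEE-754 float, so shifting left by 8 restores the full value (and lane 3 is set to 1.0f). A scalar sketch of the conversion for a single register (the helper name is illustrative):

static float Float24ToFloat(uint32_t reg) {
    uint32_t bits = reg << 8;       // restore the 32-bit float bit pattern
    float f;
    memcpy(&f, &bits, sizeof(f));   // requires <cstring>
    return f;
}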
    static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }

    Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }

@@ -230,6 +352,7 @@ inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
#elif PPSSPP_ARCH(ARM_NEON)
struct Mat4F32 {
    Mat4F32() {}
    Mat4F32(const float *matrix) {
        col0 = vld1q_f32(matrix);
        col1 = vld1q_f32(matrix + 4);

@@ -242,12 +365,86 @@ struct Mat4F32 {
        vst1q_f32(m + 8, col2);
        vst1q_f32(m + 12, col3);
    }

    // Unlike the old one, this one is careful about not loading out-of-range data.
    // The last two loads overlap.
    static Mat4F32 Load4x3(const float *m) {
        Mat4F32 result;
        result.col0 = vsetq_lane_f32(0.0f, vld1q_f32(m), 3);
        result.col1 = vsetq_lane_f32(0.0f, vld1q_f32(m + 3), 3);
        result.col2 = vsetq_lane_f32(0.0f, vld1q_f32(m + 6), 3);
        result.col3 = vsetq_lane_f32(1.0f, vld1q_f32(m + 9), 3);  // TODO: Fix this out of bounds read
        return result;
    }

    float32x4_t col0;
    float32x4_t col1;
    float32x4_t col2;
    float32x4_t col3;
};
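Note that, despite the comment carried over from the SSE version, vld1q_f32(m + 9) above does read one float past the 12-float source, hence the TODO. One possible fix mirroring the SSE path would be to load from m + 8 and rotate (an illustrative sketch, not the committed code):

float32x4_t last = vld1q_f32(m + 8);                               // reads m[8..11] only
result.col3 = vsetq_lane_f32(1.0f, vextq_f32(last, last, 1), 3);   // (m[9], m[10], m[11], 1.0f)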
// The columns are spread out between the data*. This is just intermediate storage for multiplication.
struct Mat4x3F32 {
    Mat4x3F32(const float *matrix) {
        data0 = vld1q_f32(matrix);
        data1 = vld1q_f32(matrix + 4);
        data2 = vld1q_f32(matrix + 8);
    }

    float32x4_t data0;
    float32x4_t data1;
    float32x4_t data2;
};
inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
    Mat4F32 result;

    float32x4_t r_col = vmulq_laneq_f32(b.col0, a.col0, 0);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.col0, 1);
    r_col = vfmaq_laneq_f32(r_col, b.col2, a.col0, 2);
    result.col0 = vfmaq_laneq_f32(r_col, b.col3, a.col0, 3);

    r_col = vmulq_laneq_f32(b.col0, a.col1, 0);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.col1, 1);
    r_col = vfmaq_laneq_f32(r_col, b.col2, a.col1, 2);
    result.col1 = vfmaq_laneq_f32(r_col, b.col3, a.col1, 3);

    r_col = vmulq_laneq_f32(b.col0, a.col2, 0);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.col2, 1);
    r_col = vfmaq_laneq_f32(r_col, b.col2, a.col2, 2);
    result.col2 = vfmaq_laneq_f32(r_col, b.col3, a.col2, 3);

    r_col = vmulq_laneq_f32(b.col0, a.col3, 0);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.col3, 1);
    r_col = vfmaq_laneq_f32(r_col, b.col2, a.col3, 2);
    result.col3 = vfmaq_laneq_f32(r_col, b.col3, a.col3, 3);

    return result;
}

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
    Mat4F32 result;

    float32x4_t r_col = vmulq_laneq_f32(b.col0, a.data0, 0);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.data0, 1);
    result.col0 = vfmaq_laneq_f32(r_col, b.col2, a.data0, 2);

    r_col = vmulq_laneq_f32(b.col0, a.data0, 3);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 0);
    result.col1 = vfmaq_laneq_f32(r_col, b.col2, a.data1, 1);

    r_col = vmulq_laneq_f32(b.col0, a.data1, 2);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 3);
    result.col2 = vfmaq_laneq_f32(r_col, b.col2, a.data2, 0);

    r_col = vmulq_laneq_f32(b.col0, a.data2, 1);
    r_col = vfmaq_laneq_f32(r_col, b.col1, a.data2, 2);
    r_col = vfmaq_laneq_f32(r_col, b.col2, a.data2, 3);

    // The last entry has an implied 1.0f.
    result.col3 = vaddq_f32(r_col, b.col3);
    return result;
}

struct Vec4S32 {
    int32x4_t v;
@@ -292,6 +489,13 @@ struct Vec4F32 {
    static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
    void Store(float *dst) { vst1q_f32(dst, v); }
    void StoreAligned(float *dst) { vst1q_f32(dst, v); }
    void Store3(float *dst) {
        // TODO: There might be better ways. Try to avoid this when possible.
        vst1_f32(dst, vget_low_f32(v));
        dst[2] = vgetq_lane_f32(v, 2);
    }

    static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ vcombine_f32(vld1_f32(src), vdup_n_f32(0.0f)) }; }  // TODO: Feels like there should be a better way.

    static Vec4F32 LoadConvertS16(const int16_t *src) {
        int16x4_t value = vld1_s16(src);

@@ -304,6 +508,10 @@ struct Vec4F32 {
        return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value16)) };
    }

    static Vec4F32 LoadF24x3_One(const uint32_t *src) {
        return Vec4F32{ vsetq_lane_f32(1.0f, vreinterpretq_f32_u32(vshlq_n_u32(vld1q_u32(src), 8)), 3) };
    }

    static Vec4F32 FromVec4S32(Vec4S32 other) {
        return Vec4F32{ vcvtq_f32_s32(other.v) };
    }
@@ -59,6 +59,8 @@ static inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x
    }
}

#define vfmaq_laneq_f32 vmlaq_laneq_f32

static inline uint32x4_t vcgezq_f32(float32x4_t v) {
    return vcgeq_f32(v, vdupq_n_f32(0.0f));
}
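The define above routes vfmaq_laneq_f32 to the vmlaq_laneq_f32 fallback on targets whose compiler does not provide the lane intrinsic directly. Both names compute the same per-lane multiply-accumulate (the vfma form fuses the rounding); in scalar terms:

// r[i] = a[i] + b[i] * c[lane], for i = 0..3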
@@ -118,6 +120,8 @@ inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
    return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
}

#define _mm_splat_lane_ps(v, l) _mm_shuffle_ps((v), (v), _MM_SHUFFLE(l, l, l, l))

#ifdef __cplusplus

alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };
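The _mm_splat_lane_ps macro above is the SSE counterpart of the NEON by-lane operations used in the matrix multiplies: it broadcasts one lane of a vector to all four lanes with a single shuffle. A minimal usage sketch:

__m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
__m128 s = _mm_splat_lane_ps(v, 2);   // s = (3.0f, 3.0f, 3.0f, 3.0f)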
@@ -24,16 +24,6 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {

#elif PPSSPP_ARCH(ARM_NEON)

-#if PPSSPP_ARCH(ARM)
-static inline float32x4_t vfmaq_laneq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b, int lane) {
-    if (lane == 0) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 0);
-    else if (lane == 1) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 1);
-    else if (lane == 2) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 0);
-    else if (lane == 3) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 1);
-    else return vdupq_n_f32(0.f);
-}
-#endif

// From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) {
    // these are the columns A
@@ -915,33 +915,18 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
    }
}

-inline void ComputeFinalProjMatrix(float *worldviewproj) {
-    float world[16];
-    float view[16];
-    float worldview[16];
-    ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
-    ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-    Matrix4ByMatrix4(worldview, world, view);
-    Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
-
-    // Heh, a bit ugly to mix two different matrix APIs here, but it works.
-    const float viewportScale[4] = {
-        gstate.getViewportXScale(),
-        gstate.getViewportYScale(),
-        gstate.getViewportZScale(),
-        1.0f
-    };
+Mat4F32 ComputeFinalProjMatrix() {
    const float viewportTranslate[4] = {
        gstate.getViewportXCenter() - gstate.getOffsetX(),
        gstate.getViewportYCenter() - gstate.getOffsetY(),
        gstate.getViewportZCenter(),
    };
+
+    Mat4F32 wv = Mul4x3By4x4(Mat4x3F32(gstate.worldMatrix), Mat4F32::Load4x3(gstate.viewMatrix));
+    Mat4F32 m = Mul4x4By4x4(wv, Mat4F32(gstate.projMatrix));
    // NOTE: Applying the translation actually works pre-divide, since W is also affected.
-    Mat4F32 m(worldviewproj);
-    TranslateAndScaleInplace(m, Vec4F32::Load(viewportScale), Vec4F32::Load(viewportTranslate));
-    m.Store(worldviewproj);
+    TranslateAndScaleInplace(m, Vec4F32::LoadF24x3_One(&gstate.viewportxscale), Vec4F32::Load(viewportTranslate));
+    return m;
}
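The NOTE above holds because the translation is folded into the matrix and therefore gets multiplied by the output W, just like X, Y and Z: after the perspective divide, (x + tx * w) / w = x / w + tx, which is exactly the viewport offset that would otherwise be applied after the divide.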
void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {

@@ -967,7 +952,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
    TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);

    float worldviewproj[16];
-    ComputeFinalProjMatrix(worldviewproj);
+    ComputeFinalProjMatrix().Store(worldviewproj);

    // Decode.
    int numDec = 0;
@@ -1035,7 +1020,7 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
        return;
    }
    float worldviewproj[16];
-    ComputeFinalProjMatrix(worldviewproj);
+    ComputeFinalProjMatrix().Store(worldviewproj);
    TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, decoded_, dec, numDecoded);

    switch (prim) {