diff --git a/CMakeLists.txt b/CMakeLists.txt index 83f56c3aa1..a794d97f18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -892,6 +892,8 @@ add_library(native STATIC ext/native/math/fast/fast_matrix.c ext/native/math/fast/fast_matrix_neon.S ext/native/math/fast/fast_matrix_sse.c + ext/native/math/dataconv.cpp + ext/native/math/dataconv.h ext/native/math/curves.cpp ext/native/math/curves.h ext/native/math/expression_parser.cpp diff --git a/Core/HLE/HLE.cpp b/Core/HLE/HLE.cpp index 5c6610a780..e36a8c4109 100644 --- a/Core/HLE/HLE.cpp +++ b/Core/HLE/HLE.cpp @@ -519,8 +519,7 @@ void CallSyscall(MIPSOpcode op) { PROFILE_THIS_SCOPE("syscall"); double start = 0.0; // need to initialize to fix the race condition where coreCollectDebugStats is enabled in the middle of this func. - if (coreCollectDebugStats) - { + if (coreCollectDebugStats) { time_update(); start = time_now_d(); } @@ -544,8 +543,7 @@ void CallSyscall(MIPSOpcode op) ERROR_LOG_REPORT(HLE, "Unimplemented HLE function %s", info->name ? info->name : "(\?\?\?)"); } - if (coreCollectDebugStats) - { + if (coreCollectDebugStats) { time_update(); u32 callno = (op >> 6) & 0xFFFFF; //20 bits int funcnum = callno & 0xFFF; diff --git a/GPU/Common/ShaderCommon.h b/GPU/Common/ShaderCommon.h index ac44251b05..5f17e236dc 100644 --- a/GPU/Common/ShaderCommon.h +++ b/GPU/Common/ShaderCommon.h @@ -94,6 +94,7 @@ enum : uint64_t { DIRTY_BONE_UNIFORMS = 0xFF000000ULL, DIRTY_ALL_UNIFORMS = 0x1FFFFFFFFULL, + DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3, // Other dirty elements that aren't uniforms! DIRTY_FRAMEBUF = 1ULL << 40, diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp index e92448c7ed..4e3a3deab4 100644 --- a/GPU/Common/ShaderUniforms.cpp +++ b/GPU/Common/ShaderUniforms.cpp @@ -219,7 +219,6 @@ void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms) { if (dirtyUniforms & DIRTY_MATEMISSIVE) { Uint8x3ToFloat4(ub->materialEmissive, gstate.materialemissive); } - for (int i = 0; i < 4; i++) { if (dirtyUniforms & (DIRTY_LIGHT0 << i)) { if (gstate.isDirectionalLight(i)) { diff --git a/GPU/Math3D.h b/GPU/Math3D.h index 53c7a1e8a0..8d24f0b437 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -822,8 +822,7 @@ typedef Math3D::Vec3Packed Vec3Packedf; typedef Math3D::Vec4 Vec4f; -inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) -{ +inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9]; vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10]; vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11]; @@ -895,6 +894,14 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) { m4x4[15] = 1.0f; } +// 0369 +// 147A +// 258B +// ->>- +// 0123 +// 4567 +// 89AB +// Don't see a way to SIMD that. Should be pretty fast anyway. inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) { m4x4[0] = m4x3[0]; m4x4[1] = m4x3[3]; diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index ce76ae0057..d973b4228e 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -670,8 +670,8 @@ void DrawEngineVulkan::DoFlush() { VulkanVertexShader *vshader = nullptr; VulkanFragmentShader *fshader = nullptr; - uint32_t ibOffset = 0; - uint32_t vbOffset = 0; + uint32_t ibOffset; + uint32_t vbOffset; if (useHWTransform) { // We don't detect clears in this path, so here we can switch framebuffers if necessary. @@ -880,7 +880,7 @@ void DrawEngineVulkan::DoFlush() { return; } if (pipeline != lastPipeline_) { - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); // TODO: Avoid if same as last draw. + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); lastPipeline_ = pipeline; } ApplyDrawStateLate(cmd, false, 0); diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp index e79b39e87b..bed17ed1d6 100644 --- a/GPU/Vulkan/GPU_Vulkan.cpp +++ b/GPU/Vulkan/GPU_Vulkan.cpp @@ -482,8 +482,6 @@ void GPU_Vulkan::Execute_Prim(u32 op, u32 diff) { // This also makes skipping drawing very effective. framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason); - if (!draw_->GetNativeObject(Draw::NativeObject::CURRENT_RENDERPASS)) - Crash(); if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) { drawEngine_.SetupVertexDecoder(gstate.vertType); // Do we still need to do this? diff --git a/ext/native/Android.mk b/ext/native/Android.mk index 089f49d192..4ff5c8005d 100644 --- a/ext/native/Android.mk +++ b/ext/native/Android.mk @@ -51,6 +51,7 @@ LOCAL_SRC_FILES :=\ input/input_state.cpp \ math/fast/fast_math.c \ math/fast/fast_matrix.c \ + math/dataconv.cpp \ math/math_util.cpp \ math/curves.cpp \ math/expression_parser.cpp \ diff --git a/ext/native/math/dataconv.cpp b/ext/native/math/dataconv.cpp new file mode 100644 index 0000000000..2fcc339c90 --- /dev/null +++ b/ext/native/math/dataconv.cpp @@ -0,0 +1,3 @@ +#include "dataconv.h" + +alignas(16) const float one_over_255_x4[4] = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, }; diff --git a/ext/native/math/dataconv.h b/ext/native/math/dataconv.h index 3940293e59..8c93fffed0 100644 --- a/ext/native/math/dataconv.h +++ b/ext/native/math/dataconv.h @@ -3,26 +3,65 @@ #include #include +#include "Common/Common.h" +#include "ppsspp_config.h" + #ifdef _M_SSE -#include > +#include +#endif +#if PPSSPP_PLATFORM(ARM_NEON) +#include #endif +extern const float one_over_255_x4[4]; + // Utilities useful for filling in std140-layout uniform buffers, and similar. +// NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html // LSBs in f[0], etc. // Could be SSE optimized. inline void Uint8x4ToFloat4(float f[4], uint32_t u) { +#ifdef _M_SSE + __m128i zero = _mm_setzero_si128(); + __m128i value = _mm_set1_epi32(u); + __m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero); + __m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4)); + _mm_storeu_ps(f, fvalues); +#elif PPSSPP_PLATFORM(ARM_NEON) + const float32x4_t one_over = vdupq_n_f32(1.0f/255.0f); + const uint8x8_t value = vld1_lane_u32(u); + const uint16x8_t value16 = vmovl_s8(value); + const uint32x4_t value32 = vmovl_s16(vget_low_s16(value16)); + const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), one_over); + vst1q_u32((uint32_t *)dest, valueFloat); +#else f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f); f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f); f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f); +#endif } -inline void Uint8x3ToFloat4(float f[4], uint32_t u) { +inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) { +#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON) + Uint8x4ToFloat4(f, (u & 0xFFFFFF) | (alpha << 24)); +#else f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f); f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f); - f[3] = 0.0f; + f[3] = alpha * (1.0f / 255.0f); +#endif +} + +inline void Uint8x3ToFloat4(float f[4], uint32_t u) { +#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON) + Uint8x4ToFloat4(f, u & 0xFFFFFF); +#else + f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); + f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f); + f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f); + f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f); +#endif } inline void Uint8x3ToInt4(int i[4], uint32_t u) { @@ -46,13 +85,6 @@ inline void Uint8x3ToFloat4_Alpha(float f[4], uint32_t u, float alpha) { f[3] = alpha; } -inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) { - f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); - f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f); - f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f); - f[3] = alpha * (1.0f / 255.0f); -} - inline void Uint8x1ToFloat4(float f[4], uint32_t u) { f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); f[1] = 0.0f; @@ -63,40 +95,54 @@ inline void Uint8x1ToFloat4(float f[4], uint32_t u) { // These are just for readability. inline void CopyFloat2(float dest[2], const float src[2]) { - memcpy(dest, src, sizeof(float) * 2); + dest[0] = src[0]; + dest[1] = src[1]; } inline void CopyFloat3(float dest[3], const float src[3]) { - memcpy(dest, src, sizeof(float) * 3); + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; } inline void CopyFloat1To4(float dest[4], const float src) { +#ifdef _M_SSE + _mm_storeu_ps(dest, _mm_set_ss(src)); +#else dest[0] = src; dest[1] = 0.0f; dest[2] = 0.0f; dest[3] = 0.0f; +#endif } inline void CopyFloat2To4(float dest[4], const float src[2]) { - memcpy(dest, src, sizeof(float) * 2); + dest[0] = src[0]; + dest[1] = src[1]; dest[2] = 0.0f; dest[3] = 0.0f; } inline void CopyFloat3To4(float dest[4], const float src[3]) { - memcpy(dest, src, sizeof(float) * 3); + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; dest[3] = 0.0f; } -inline void CopyFloat4(float dest[4], const float src[4]) { - memcpy(dest, src, sizeof(float) * 4); -} - inline void CopyMatrix4x4(float dest[16], const float src[16]) { memcpy(dest, src, sizeof(float) * 16); } inline void ExpandFloat24x3ToFloat4(float dest[4], uint32_t src[3]) { +#ifdef _M_SSE + __m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)src), 8); + _mm_storeu_si128((__m128i *)dest, values); +#elif PPSSPP_PLATFORM(ARM_NEON) + const uint32x4_t values = vshlq_n_u32(vld1q_u32(&gstate.texscaleu), 8); + vst1q_u32((uint32_t *)dest, values); +#else uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 }; memcpy(dest, temp, sizeof(float) * 4); +#endif } diff --git a/ext/native/native.vcxproj b/ext/native/native.vcxproj index 34f5e2aeb8..cbee2ad055 100644 --- a/ext/native/native.vcxproj +++ b/ext/native/native.vcxproj @@ -696,6 +696,7 @@ + diff --git a/ext/native/native.vcxproj.filters b/ext/native/native.vcxproj.filters index ee8f08f807..417a918213 100644 --- a/ext/native/native.vcxproj.filters +++ b/ext/native/native.vcxproj.filters @@ -784,6 +784,9 @@ gfx + + math +