NEON: vcvtq_n can scale directly during conversion, no need for a mul by const.

2025-04-02 11:01:50 -04:00 · 2023-12-09 16:40:37 +01:00 · 2023-12-09 16:40:37 +01:00 · 4e2a1bf81c
commit 4e2a1bf81c
parent 99548be8a3
2 changed files with 7 additions and 4 deletions
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -38,7 +38,7 @@ inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
 	case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
 	case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
 	case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
-	case 3: return vmulq_lane_f32(a, vget_high_f32(b), 1);
+	default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
 	}
 }

@@ -47,8 +47,12 @@ inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c,
 	case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
 	case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
 	case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
-	case 3: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
+	default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
 	}
 }

+inline uint32x4_t vcgezq_f32(float32x4_t v) {  // Per-lane "v >= 0.0f" test, expressed as vcgeq_f32 against a zero vector.
+	return vcgeq_f32(v, vdupq_n_f32(0.0f));  // NOTE(review): presumably a fallback for targets without a native vcgezq_f32; the enclosing #if is not visible here - confirm.
+}
+
 #endif
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -444,11 +444,10 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
 			_mm_storeu_ps(verts + i * 3, pos);  // TODO: use stride 4 to avoid clashing writes?
 		}
 #elif PPSSPP_ARCH(ARM_NEON)
-		float32x4_t scaleFactor = vdupq_n_f32(1.0f / 32768.0f);
 		for (int i = 0; i < vertexCount; i++) {
 			const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
 			int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
-			float32x4_t pos = vmulq_f32(vcvtq_f32_s32(data), scaleFactor);
+			float32x4_t pos = vcvtq_n_f32_s32(data, 15);  // >> 15 = division by 32768.0f
 			vst1q_f32(verts + i * 3, pos);
 		}
 #else