mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
NEON: vcvtq can scale directly, no need for a mul by const.
This commit is contained in:
parent
99548be8a3
commit
4e2a1bf81c
2 changed files with 7 additions and 4 deletions
|
@ -38,7 +38,7 @@ inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
|
|||
case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
|
||||
case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
|
||||
case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
|
||||
case 3: return vmulq_lane_f32(a, vget_high_f32(b), 1);
|
||||
default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -47,8 +47,12 @@ inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c,
|
|||
case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
|
||||
case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
|
||||
case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
|
||||
case 3: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
|
||||
default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Lane-wise "v >= 0.0f" test, built by comparing v against an all-zero vector
// with vcgeq_f32. The result is the per-lane mask that vcgeq_f32 produces.
inline uint32x4_t vcgezq_f32(float32x4_t v) {
	const float32x4_t zeros = vdupq_n_f32(0.0f);
	return vcgeq_f32(v, zeros);
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -444,11 +444,10 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
|
|||
_mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes?
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
float32x4_t scaleFactor = vdupq_n_f32(1.0f / 32768.0f);
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
|
||||
int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
|
||||
float32x4_t pos = vmulq_f32(vcvtq_f32_s32(data), scaleFactor);
|
||||
float32x4_t pos = vcvtq_n_f32_s32(data, 15); // >> 15 = division by 32768.0f
|
||||
vst1q_f32(verts + i * 3, pos);
|
||||
}
|
||||
#else
|
||||
|
|
Loading…
Add table
Reference in a new issue