diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index 8fa713d809..95ea05581b 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -335,7 +335,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
 #else
 			memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
@@ -347,7 +347,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
@@ -360,7 +360,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
@@ -373,7 +373,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
@@ -408,7 +408,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
 #else
 			for (int i = 0; i < 4; i++)
@@ -421,7 +421,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
 #else
 			for (int i = 0; i < 4; i++)
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 16e8eb7629..46a3edafc7 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -219,7 +219,7 @@ public:
 #if defined(_M_SSE)
 		__m128i ivec;
 		__m128 vec;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 		int32x4_t ivec;
 		float32x4_t vec;
 #endif
@@ -238,7 +238,7 @@ public:
 	Vec3(const Vec3Packed &_xyz) {
 		vec = _mm_loadu_ps(_xyz.AsArray());
 	}
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	Vec3(const float32x4_t &_vec) : vec(_vec) {}
 #if !defined(_MSC_VER)
 	Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
@@ -578,7 +578,7 @@ public:
 #if defined(_M_SSE)
 		__m128i ivec;
 		__m128 vec;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 		int32x4_t ivec;
 		float32x4_t vec;
 #endif
@@ -595,7 +595,7 @@ public:
 #if defined(_M_SSE)
 	Vec4(const __m128 &_vec) : vec(_vec) {}
 	Vec4(const __m128i &_ivec) : ivec(_ivec) {}
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	Vec4(const float32x4_t &_vec) : vec(_vec) {}
 #if !defined(_MSC_VER)
 	Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
@@ -607,14 +607,14 @@ public:
 		if constexpr (std::is_same::value && std::is_same::value) {
 #if defined(_M_SSE)
 			return _mm_cvtps_epi32(SAFE_M128(vec));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			return vcvtq_s32_f32(vec);
 #endif
 		}
 		if constexpr (std::is_same::value && std::is_same::value) {
 #if defined(_M_SSE)
 			return _mm_cvtepi32_ps(SAFE_M128I(ivec));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 			return vcvtq_f32_s32(ivec);
 #endif
 		}
@@ -922,7 +922,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, c
 		_mm_add_ps(_mm_mul_ps(col2, z), col3));
 	return sum;
 }
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 3);
@@ -933,6 +933,17 @@ inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
 		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
 	return sum;
 }
+#elif PPSSPP_ARCH(ARM_NEON)
+inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
+	float32x4_t col0 = vld1q_f32(m);
+	float32x4_t col1 = vld1q_f32(m + 3);
+	float32x4_t col2 = vld1q_f32(m + 6);
+	float32x4_t col3 = vld1q_f32(m + 9);
+	float32x4_t sum = vaddq_f32(
+		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
+		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
+	return sum;
+}
 #endif
 
 // v and vecOut must point to different memory.
@@ -947,7 +958,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 	vecOut[0] = _mm_cvtss_f32(sum);
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
 	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
@@ -967,7 +978,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
 	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix43Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	return Vec3ByMatrix43Internal(v.vec, m);
 #else
 	Vec3f vecOut;
@@ -999,6 +1010,17 @@ inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
 		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
 	return sum;
 }
+#elif PPSSPP_ARCH(ARM_NEON)
+inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
+	float32x4_t col0 = vld1q_f32(m);
+	float32x4_t col1 = vld1q_f32(m + 4);
+	float32x4_t col2 = vld1q_f32(m + 8);
+	float32x4_t col3 = vld1q_f32(m + 12);
+	float32x4_t sum = vaddq_f32(
+		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
+		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
+	return sum;
+}
 #endif
 
 inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
@@ -1008,7 +1030,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 	__m128 z = _mm_set1_ps(v[2]);
 	__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
 	_mm_storeu_ps(vecOut, sum);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
 	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
 	vst1q_f32(vecOut, sum);
@@ -1027,7 +1049,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
 	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix44Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	return Vec3ByMatrix44Internal(v.vec, m);
 #else
 	Vec4f vecOut;
@@ -1057,6 +1079,16 @@ inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
 		vmulq_laneq_f32(col2, vec, 2));
 	return sum;
 }
+#elif PPSSPP_ARCH(ARM_NEON)
+inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
+	float32x4_t col0 = vld1q_f32(m);
+	float32x4_t col1 = vld1q_f32(m + 3);
+	float32x4_t col2 = vld1q_f32(m + 6);
+	float32x4_t sum = vaddq_f32(
+		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
+		vmulq_lane_f32(col2, vget_high_f32(vec), 0));
+	return sum;
+}
 #endif
 
 inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
@@ -1068,7 +1100,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
 	vecOut[0] = _mm_cvtss_f32(sum);
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
 	vecOut[1] = vgetq_lane_f32(sum, 1);
@@ -1087,7 +1119,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
 	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Norm3ByMatrix43Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	return Norm3ByMatrix43Internal(v.vec, m);
 #else
 	Vec3f vecOut;
@@ -1209,7 +1241,7 @@ inline Vec3 Vec3::FromRGB(unsigned int rgb)
 	__m128i c = _mm_cvtsi32_si128(rgb);
 	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
 	return Vec3(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
 	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
 	return Vec3(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
@@ -1228,7 +1260,7 @@ inline Vec3 Vec3::FromRGB(unsigned int rgb)
 	__m128i c = _mm_cvtsi32_si128(rgb);
 	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
 	return Vec3(c);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
 	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
 	return Vec3(vreinterpretq_s32_u32(u));
@@ -1244,7 +1276,7 @@ __forceinline unsigned int Vec3::ToRGB() const
 	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
 	__m128i c16 = _mm_packs_epi32(c, c);
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
 	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
 	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@@ -1261,7 +1293,7 @@ __forceinline unsigned int Vec3::ToRGB() const
 #if defined(_M_SSE)
 	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
 	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
 	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@@ -1278,7 +1310,7 @@ inline Vec4 Vec4::FromRGBA(unsigned int rgba)
 	__m128i c = _mm_cvtsi32_si128(rgba);
 	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
 	return Vec4(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
 	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
 	return Vec4(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
@@ -1304,7 +1336,7 @@ inline Vec4 Vec4::FromRGBA(unsigned int rgba)
 	__m128i c = _mm_cvtsi32_si128(rgba);
 	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
 	return Vec4(c);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
 	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
 	return Vec4(vreinterpretq_s32_u32(u));
@@ -1320,7 +1352,7 @@ __forceinline unsigned int Vec4::ToRGBA() const
 	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
 	__m128i c16 = _mm_packs_epi32(c, c);
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
 	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
 	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@@ -1338,7 +1370,7 @@ __forceinline unsigned int Vec4::ToRGBA() const
 #if defined(_M_SSE)
 	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
 	uint16x4_t c16 = vqmovun_s32(ivec);
 	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
 	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
diff --git a/ppsspp_config.h b/ppsspp_config.h
index 6228d2b556..2861b621b3 100644
--- a/ppsspp_config.h
+++ b/ppsspp_config.h
@@ -57,7 +57,7 @@
 #if defined(__aarch64__) || defined(_M_ARM64)
 	#define PPSSPP_ARCH_ARM64 1
 	#define PPSSPP_ARCH_64BIT 1
-	#define PPSSPP_ARCH_ARM_NEON 1
+	#define PPSSPP_ARCH_ARM_NEON 1 // Applies to both ARM32 and ARM64
 	#define PPSSPP_ARCH_ARM64_NEON 1
 #endif
 
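
Note, not part of the patch above: the new PPSSPP_ARCH(ARM_NEON) fallbacks go through vget_low_f32/vget_high_f32 because vmulq_laneq_f32, used on the ARM64-only paths, can broadcast any lane of a 128-bit vector but exists only on AArch64; ARMv7 NEON offers only vmulq_lane_f32, which indexes a 64-bit half. A minimal sketch of that equivalence follows; MulByLane2 is a hypothetical helper, not a function from the codebase.

#include <arm_neon.h>

// Multiply every element of 'col' by element 2 of 'vec'.
static inline float32x4_t MulByLane2(float32x4_t col, float32x4_t vec) {
#if defined(__aarch64__)
	return vmulq_laneq_f32(col, vec, 2);                // A64: select lane 2 of the 128-bit vector directly.
#else
	return vmulq_lane_f32(col, vget_high_f32(vec), 0);  // ARMv7: element 2 is lane 0 of the high 64-bit half.
#endif
}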