mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Add NEON impl too, mostly for exercise purposes
This commit is contained in:
parent
e6f0f84a45
commit
d0e8cfa365
2 changed files with 25 additions and 0 deletions
|
@ -52,6 +52,13 @@ inline uint32_t Float4ToUint8x4(const float f[4]) {
|
||||||
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
|
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
|
||||||
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
|
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
|
||||||
return _mm_cvtsi128_si32(ivalue);
|
return _mm_cvtsi128_si32(ivalue);
|
||||||
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
|
const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
|
||||||
|
uint32x4_t ivalue32 = vcvtq_u32_f32(value);
|
||||||
|
uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
|
||||||
|
uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16)); // Is there no way to avoid the combine here?
|
||||||
|
// NEON reinterprets are named vreinterpret_<dst>_<src>: view the 8 packed bytes as two 32-bit lanes.
uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
|
||||||
|
return vget_lane_u32(outValue32, 0);
|
||||||
#else
|
#else
|
||||||
int i4[4];
|
int i4[4];
|
||||||
for (int i = 0; i < 4; i++) {
|
for (int i = 0; i < 4; i++) {
|
||||||
|
@ -74,6 +81,13 @@ inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
|
||||||
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
|
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
|
||||||
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
|
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
|
||||||
return _mm_cvtsi128_si32(ivalue);
|
return _mm_cvtsi128_si32(ivalue);
|
||||||
|
#elif PPSSPP_ARCH(ARM_NEON)
|
||||||
|
const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
|
||||||
|
uint32x4_t ivalue32 = vcvtq_u32_f32(value);
|
||||||
|
uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
|
||||||
|
uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16)); // Is there no way to avoid the combine here?
|
||||||
|
// NEON reinterprets are named vreinterpret_<dst>_<src>: view the 8 packed bytes as two 32-bit lanes.
uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
|
||||||
|
return vget_lane_u32(outValue32, 0);
|
||||||
#else
|
#else
|
||||||
u32 i4[4];
|
u32 i4[4];
|
||||||
for (int i = 0; i < 4; i++) {
|
for (int i = 0; i < 4; i++) {
|
||||||
|
|
|
@ -41,6 +41,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "Common/Data/Collections/TinySet.h"
|
#include "Common/Data/Collections/TinySet.h"
|
||||||
|
#include "Common/Data/Convert/SmallDataConvert.h"
|
||||||
#include "Common/Data/Text/Parsers.h"
|
#include "Common/Data/Text/Parsers.h"
|
||||||
#include "Common/Data/Text/WrapText.h"
|
#include "Common/Data/Text/WrapText.h"
|
||||||
#include "Common/Data/Encoding/Utf8.h"
|
#include "Common/Data/Encoding/Utf8.h"
|
||||||
|
@ -782,6 +783,15 @@ static bool TestWrapText() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool TestSmallDataConvert() {
|
||||||
|
float f[4] = { 1.0f / 255.0f, 2.0f / 255.0f, 3.0f / 255.0f, 4.0f / 255.f };
|
||||||
|
uint32_t result = Float4ToUint8x4_NoClamp(f);
|
||||||
|
EXPECT_EQ_HEX(result, 0x04030201);
|
||||||
|
result = Float4ToUint8x4(f);
|
||||||
|
EXPECT_EQ_HEX(result, 0x04030201);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
typedef bool (*TestFunc)();
|
typedef bool (*TestFunc)();
|
||||||
struct TestItem {
|
struct TestItem {
|
||||||
const char *name;
|
const char *name;
|
||||||
|
@ -832,6 +842,7 @@ TestItem availableTests[] = {
|
||||||
TEST_ITEM(ThreadManager),
|
TEST_ITEM(ThreadManager),
|
||||||
TEST_ITEM(WrapText),
|
TEST_ITEM(WrapText),
|
||||||
TEST_ITEM(TinySet),
|
TEST_ITEM(TinySet),
|
||||||
|
TEST_ITEM(SmallDataConvert),
|
||||||
};
|
};
|
||||||
|
|
||||||
int main(int argc, const char *argv[]) {
|
int main(int argc, const char *argv[]) {
|
||||||
|
|
Loading…
Add table
Reference in a new issue