Switch from shift to multiplier for volume in StereoResampler

This commit is contained in:
Henrik Rydgård 2025-02-11 15:45:10 -06:00
parent 1a41a36093
commit 25bedc44b8
2 changed files with 66 additions and 52 deletions

View file

@ -25,11 +25,11 @@
#include "Common/Common.h" #include "Common/Common.h"
#include "Common/CommonFuncs.h" #include "Common/CommonFuncs.h"
const int PSP_MODEL_FAT = 0; constexpr int PSP_MODEL_FAT = 0;
const int PSP_MODEL_SLIM = 1; constexpr int PSP_MODEL_SLIM = 1;
const int PSP_DEFAULT_FIRMWARE = 660; constexpr int PSP_DEFAULT_FIRMWARE = 660;
static const int8_t VOLUME_OFF = 0; constexpr int VOLUME_OFF = 0;
static const int8_t VOLUME_FULL = 10; constexpr int VOLUME_FULL = 10;
struct ConfigTouchPos { struct ConfigTouchPos {
float x; float x;

View file

@ -41,6 +41,7 @@
#include "Common/System/System.h" #include "Common/System/System.h"
#include "Common/Log.h" #include "Common/Log.h"
#include "Common/Math/SIMDHeaders.h" #include "Common/Math/SIMDHeaders.h"
#include "Common/Math/CrossSIMD.h"
#include "Common/TimeUtil.h" #include "Common/TimeUtil.h"
#include "Core/Config.h" #include "Core/Config.h"
#include "Core/ConfigValues.h" #include "Core/ConfigValues.h"
@ -90,61 +91,57 @@ void StereoResampler::UpdateBufferSize() {
} }
} }
template<bool useShift> // factor is a 0.12-bit fixed point number.
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) { template<bool multiply>
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, int factor) {
if (multiply) {
// Let's SIMD later. Unfortunately for s16 operations, SSE2 is very different and odd
// so CrossSIMD won't be very useful.
for (size_t i = 0; i < size; i++) {
out[i] = clamp_s16((in[i] * factor) >> 12);
}
} else {
#ifdef _M_SSE #ifdef _M_SSE
// Size will always be 16-byte aligned as the hwBlockSize is. // Size will always be 16-byte aligned as the hwBlockSize is.
while (size >= 8) { while (size >= 8) {
__m128i in1 = _mm_loadu_si128((__m128i *)in); __m128i in1 = _mm_loadu_si128((__m128i *)in);
__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4)); __m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
__m128i packed = _mm_packs_epi32(in1, in2); __m128i packed = _mm_packs_epi32(in1, in2); // pack with signed saturation, perfect.
if (useShift) { _mm_storeu_si128((__m128i *)out, packed);
packed = _mm_srai_epi16(packed, volShift); out += 8;
in += 8;
size -= 8;
} }
_mm_storeu_si128((__m128i *)out, packed);
out += 8;
in += 8;
size -= 8;
}
#elif PPSSPP_ARCH(ARM_NEON) #elif PPSSPP_ARCH(ARM_NEON)
// Dynamic shifts can only be left, but it's signed - negate to shift right. // Dynamic shifts can only be left, but it's signed - negate to shift right.
int16x4_t signedVolShift = vdup_n_s16(-volShift); int16x4_t signedVolShift = vdup_n_s16(-volShift);
while (size >= 8) { while (size >= 8) {
int32x4_t in1 = vld1q_s32(in); int32x4_t in1 = vld1q_s32(in);
int32x4_t in2 = vld1q_s32(in + 4); int32x4_t in2 = vld1q_s32(in + 4);
int16x4_t packed1 = vqmovn_s32(in1); int16x4_t packed1 = vqmovn_s32(in1);
int16x4_t packed2 = vqmovn_s32(in2); int16x4_t packed2 = vqmovn_s32(in2);
if (useShift) { vst1_s16(out, packed1);
packed1 = vshl_s16(packed1, signedVolShift); vst1_s16(out + 4, packed2);
packed2 = vshl_s16(packed2, signedVolShift); out += 8;
in += 8;
size -= 8;
} }
vst1_s16(out, packed1);
vst1_s16(out + 4, packed2);
out += 8;
in += 8;
size -= 8;
}
#endif #endif
// This does the remainder if SIMD was used, otherwise it does it all. // This does the remainder if SIMD was used, otherwise it does it all.
for (size_t i = 0; i < size; i++) { for (size_t i = 0; i < size; i++) {
out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]); out[i] = clamp_s16(in[i]);
}
} }
} }
inline void ClampBufferToS16WithVolume(s16 *out, const s32 *in, size_t size) { inline void ClampBufferToS16WithVolume(s16 *out, const s32 *in, size_t size, int volume) {
int volume = g_Config.iGlobalVolume; // The last parameter to ClampBufferToS16 is no longer a shift, now it's a 12-bit multiplier.
if (PSP_CoreParameter().fpsLimit != FPSLimit::NORMAL || PSP_CoreParameter().fastForward) { if (volume >= 4096) {
if (g_Config.iAltSpeedVolume != -1) {
volume = g_Config.iAltSpeedVolume;
}
}
if (volume >= VOLUME_FULL) {
ClampBufferToS16<false>(out, in, size, 0); ClampBufferToS16<false>(out, in, size, 0);
} else if (volume <= VOLUME_OFF) { } else if (volume <= 0) {
memset(out, 0, size * sizeof(s16)); memset(out, 0, size * sizeof(s16));
} else { } else {
ClampBufferToS16<true>(out, in, size, VOLUME_FULL - (s8)volume); ClampBufferToS16<true>(out, in, size, volume);
} }
} }
@ -277,13 +274,30 @@ void StereoResampler::PushSamples(const s32 *samples, unsigned int numSamples) {
return; return;
} }
int vol = g_Config.iGlobalVolume;
if (PSP_CoreParameter().fpsLimit != FPSLimit::NORMAL || PSP_CoreParameter().fastForward) {
if (g_Config.iAltSpeedVolume != -1) {
vol = g_Config.iAltSpeedVolume;
}
}
vol = std::clamp(vol, 0, VOLUME_FULL);
// 12-bit volume. So far this isn't any better than the shift, but stay tuned.
int volume;
if (vol != 0) {
volume = 4096 >> (VOLUME_FULL - vol);
} else {
volume = 0;
}
// Check if we need to roll over to the start of the buffer during the copy. // Check if we need to roll over to the start of the buffer during the copy.
unsigned int indexW_left_samples = m_maxBufsize * 2 - (indexW & INDEX_MASK); unsigned int indexW_left_samples = m_maxBufsize * 2 - (indexW & INDEX_MASK);
if (numSamples * 2 > indexW_left_samples) { if (numSamples * 2 > indexW_left_samples) {
ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, indexW_left_samples); ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, indexW_left_samples, volume);
ClampBufferToS16WithVolume(&m_buffer[0], samples + indexW_left_samples, numSamples * 2 - indexW_left_samples); ClampBufferToS16WithVolume(&m_buffer[0], samples + indexW_left_samples, numSamples * 2 - indexW_left_samples, volume);
} else { } else {
ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, numSamples * 2); ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, numSamples * 2, volume);
} }
m_indexW += numSamples * 2; m_indexW += numSamples * 2;