Switch from shift to multiplier for volume in StereoResampler

2025-04-02 11:01:50 -04:00 · 2025-02-11 15:45:10 -06:00 · 2025-02-11 15:45:10 -06:00 · 25bedc44b8
commit 25bedc44b8
parent 1a41a36093
2 changed files with 66 additions and 52 deletions
--- a/Core/ConfigValues.h
+++ b/Core/ConfigValues.h
@ -25,11 +25,11 @@
 #include "Common/Common.h"
 #include "Common/CommonFuncs.h"

-const int PSP_MODEL_FAT = 0;
-const int PSP_MODEL_SLIM = 1;
-const int PSP_DEFAULT_FIRMWARE = 660;
-static const int8_t VOLUME_OFF = 0;
-static const int8_t VOLUME_FULL = 10;
+constexpr int PSP_MODEL_FAT = 0;
+constexpr int PSP_MODEL_SLIM = 1;
+constexpr int PSP_DEFAULT_FIRMWARE = 660;
+constexpr int VOLUME_OFF = 0;
+constexpr int VOLUME_FULL = 10;

 struct ConfigTouchPos {
 	float x;
--- a/Core/HW/StereoResampler.cpp
+++ b/Core/HW/StereoResampler.cpp
@ -41,6 +41,7 @@
 #include "Common/System/System.h"
 #include "Common/Log.h"
 #include "Common/Math/SIMDHeaders.h"
+#include "Common/Math/CrossSIMD.h"
 #include "Common/TimeUtil.h"
 #include "Core/Config.h"
 #include "Core/ConfigValues.h"
@ -90,61 +91,57 @@ void StereoResampler::UpdateBufferSize() {
 	}
 }

-template<bool useShift>
-inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) {
+// factor is a 0.12-bit fixed point number.
+template<bool multiply>
+inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, int factor) {
+	if (multiply) {
+		// Let's SIMD later. Unfortunately for s16 operations, SSE2 is very different and odd
+		// so CrossSIMD won't be very useful.
+		for (size_t i = 0; i < size; i++) {
+			out[i] = clamp_s16((in[i] * factor) >> 12);
+		}
+	} else {
 #ifdef _M_SSE
-	// Size will always be 16-byte aligned as the hwBlockSize is.
-	while (size >= 8) {
-		__m128i in1 = _mm_loadu_si128((__m128i *)in);
-		__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
-		__m128i packed = _mm_packs_epi32(in1, in2);
-		if (useShift) {
-			packed = _mm_srai_epi16(packed, volShift);
+		// Size will always be 16-byte aligned as the hwBlockSize is.
+		while (size >= 8) {
+			__m128i in1 = _mm_loadu_si128((__m128i *)in);
+			__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
+			__m128i packed = _mm_packs_epi32(in1, in2);  // pack with signed saturation, perfect.
+			_mm_storeu_si128((__m128i *)out, packed);
+			out += 8;
+			in += 8;
+			size -= 8;
 		}
-		_mm_storeu_si128((__m128i *)out, packed);
-		out += 8;
-		in += 8;
-		size -= 8;
-	}
 #elif PPSSPP_ARCH(ARM_NEON)
-	// Dynamic shifts can only be left, but it's signed - negate to shift right.
-	int16x4_t signedVolShift = vdup_n_s16(-volShift);
-	while (size >= 8) {
-		int32x4_t in1 = vld1q_s32(in);
-		int32x4_t in2 = vld1q_s32(in + 4);
-		int16x4_t packed1 = vqmovn_s32(in1);
-		int16x4_t packed2 = vqmovn_s32(in2);
-		if (useShift) {
-			packed1 = vshl_s16(packed1, signedVolShift);
-			packed2 = vshl_s16(packed2, signedVolShift);
+		// Dynamic shifts can only be left, but it's signed - negate to shift right.
+		int16x4_t signedVolShift = vdup_n_s16(-volShift);
+		while (size >= 8) {
+			int32x4_t in1 = vld1q_s32(in);
+			int32x4_t in2 = vld1q_s32(in + 4);
+			int16x4_t packed1 = vqmovn_s32(in1);
+			int16x4_t packed2 = vqmovn_s32(in2);
+			vst1_s16(out, packed1);
+			vst1_s16(out + 4, packed2);
+			out += 8;
+			in += 8;
+			size -= 8;
 		}
-		vst1_s16(out, packed1);
-		vst1_s16(out + 4, packed2);
-		out += 8;
-		in += 8;
-		size -= 8;
-	}
 #endif
-	// This does the remainder if SIMD was used, otherwise it does it all.
-	for (size_t i = 0; i < size; i++) {
-		out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]);
+		// This does the remainder if SIMD was used, otherwise it does it all.
+		for (size_t i = 0; i < size; i++) {
+			out[i] = clamp_s16(in[i]);
+		}
 	}
 }

-inline void ClampBufferToS16WithVolume(s16 *out, const s32 *in, size_t size) {
-	int volume = g_Config.iGlobalVolume;
-	if (PSP_CoreParameter().fpsLimit != FPSLimit::NORMAL || PSP_CoreParameter().fastForward) {
-		if (g_Config.iAltSpeedVolume != -1) {
-			volume = g_Config.iAltSpeedVolume;
-		}
-	}
-
-	if (volume >= VOLUME_FULL) {
+inline void ClampBufferToS16WithVolume(s16 *out, const s32 *in, size_t size, int volume) {
+	// The last parameter to ClampBufferToS16 is no longer a shift, now it's a 12-bit multiplier.
+	if (volume >= 4096) {
 		ClampBufferToS16<false>(out, in, size, 0);
-	} else if (volume <= VOLUME_OFF) {
+	} else if (volume <= 0) {
 		memset(out, 0, size * sizeof(s16));
 	} else {
-		ClampBufferToS16<true>(out, in, size, VOLUME_FULL - (s8)volume);
+		ClampBufferToS16<true>(out, in, size, volume);
 	}
 }

@ -277,13 +274,30 @@ void StereoResampler::PushSamples(const s32 *samples, unsigned int numSamples) {
 		return;
 	}

+	int vol = g_Config.iGlobalVolume;
+	if (PSP_CoreParameter().fpsLimit != FPSLimit::NORMAL || PSP_CoreParameter().fastForward) {
+		if (g_Config.iAltSpeedVolume != -1) {
+			vol = g_Config.iAltSpeedVolume;
+		}
+	}
+
+	vol = std::clamp(vol, 0, VOLUME_FULL);
+
+	// 12-bit volume. So far this isn't any better than the shift, but stay tuned.
+	int volume;
+	if (vol != 0) {
+		volume = 4096 >> (VOLUME_FULL - vol);
+	} else {
+		volume = 0;
+	}
+
 	// Check if we need to roll over to the start of the buffer during the copy.
 	unsigned int indexW_left_samples = m_maxBufsize * 2 - (indexW & INDEX_MASK);
 	if (numSamples * 2 > indexW_left_samples) {
-		ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, indexW_left_samples);
-		ClampBufferToS16WithVolume(&m_buffer[0], samples + indexW_left_samples, numSamples * 2 - indexW_left_samples);
+		ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, indexW_left_samples, volume);
+		ClampBufferToS16WithVolume(&m_buffer[0], samples + indexW_left_samples, numSamples * 2 - indexW_left_samples, volume);
 	} else {
-		ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, numSamples * 2);
+		ClampBufferToS16WithVolume(&m_buffer[indexW & INDEX_MASK], samples, numSamples * 2, volume);
 	}

 	m_indexW += numSamples * 2;