From 39034586a4ce4d909d3a97fc70bff87b89b6688f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 30 May 2023 00:37:01 +0200 Subject: [PATCH 1/5] SSE: Refactor AddStrip to prepare for early out --- GPU/Common/IndexGenerator.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index 446d7a680e..15b5c76148 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -121,7 +121,6 @@ alignas(16) static const uint16_t offsets_counter_clockwise[24] = { void IndexGenerator::AddStrip(int numVerts, bool clockwise) { int numTris = numVerts - 2; - #ifdef _M_SSE // In an SSE2 register we can fit 8 16-bit integers. // However, we need to output a multiple of 3 indices. @@ -134,16 +133,21 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { __m128i ibase8 = _mm_set1_epi16(index_); __m128i increment = _mm_set1_epi16(8); const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise); - __m128i offsets0 = _mm_load_si128(offsets); - __m128i offsets1 = _mm_load_si128(offsets + 1); - __m128i offsets2 = _mm_load_si128(offsets + 2); + __m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets)); + __m128i offsets1 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 1)); + __m128i offsets2 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 2)); __m128i *dst = (__m128i *)inds_; - for (int i = 0; i < numChunks; i++) { - _mm_storeu_si128(dst, _mm_add_epi16(ibase8, offsets0)); - _mm_storeu_si128(dst + 1, _mm_add_epi16(ibase8, offsets1)); - _mm_storeu_si128(dst + 2, _mm_add_epi16(ibase8, offsets2)); - ibase8 = _mm_add_epi16(ibase8, increment); + _mm_storeu_si128(dst, offsets0); + _mm_storeu_si128(dst + 1, offsets1); + _mm_storeu_si128(dst + 2, offsets2); + for (int i = 1; i < numChunks; i++) { + offsets0 = _mm_add_epi16(offsets0, increment); + offsets1 = _mm_add_epi16(offsets1, increment); + offsets2 = _mm_add_epi16(offsets2, increment); dst += 3; + _mm_storeu_si128(dst, offsets0); + _mm_storeu_si128(dst + 1, offsets1); + _mm_storeu_si128(dst + 2, offsets2); } inds_ += numTris * 3; // wind doesn't need to be updated, an even number of triangles have been drawn. From 77da36c03f575946616896418e6acc229be6e220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 13 Jun 2023 11:35:26 +0200 Subject: [PATCH 2/5] SSE addstrip: Add the early-outs. --- GPU/Common/DrawEngineCommon.cpp | 6 ++++-- GPU/Common/IndexGenerator.cpp | 31 ++++++++++++++++++------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index c1d41e6b94..bd04fa13f1 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -104,9 +104,11 @@ int DrawEngineCommon::ComputeNumVertsToDecode() const { } void DrawEngineCommon::DecodeVerts(u8 *dest) { - for (; decodeCounter_ < numDrawCalls_; decodeCounter_++) { - DecodeVertsStep(dest, decodeCounter_, decodedVerts_, &drawCalls_[decodeCounter_].uvScale); // NOTE! DecodeVertsStep can modify decodeCounter_! + int decodeCounter = decodeCounter_; + for (; decodeCounter < numDrawCalls_; decodeCounter++) { + DecodeVertsStep(dest, decodeCounter, decodedVerts_, &drawCalls_[decodeCounter].uvScale); // NOTE! DecodeVertsStep can modify decodeCounter_! } + decodeCounter_ = decodeCounter; // Sanity check if (indexGen.Prim() < 0) { diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index 15b5c76148..fc48ece6c0 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -131,23 +131,28 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { // That's alright as we're appending to a buffer - they will get overwritten anyway. int numChunks = (numTris + 7) / 8; __m128i ibase8 = _mm_set1_epi16(index_); - __m128i increment = _mm_set1_epi16(8); const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise); - __m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets)); - __m128i offsets1 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 1)); - __m128i offsets2 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 2)); __m128i *dst = (__m128i *)inds_; + __m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets)); + // A single store is always enough for two triangles, which is a very common case. _mm_storeu_si128(dst, offsets0); - _mm_storeu_si128(dst + 1, offsets1); - _mm_storeu_si128(dst + 2, offsets2); - for (int i = 1; i < numChunks; i++) { - offsets0 = _mm_add_epi16(offsets0, increment); - offsets1 = _mm_add_epi16(offsets1, increment); - offsets2 = _mm_add_epi16(offsets2, increment); - dst += 3; - _mm_storeu_si128(dst, offsets0); + if (numTris > 2) { + __m128i offsets1 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 1)); _mm_storeu_si128(dst + 1, offsets1); - _mm_storeu_si128(dst + 2, offsets2); + if (numTris > 5) { + __m128i offsets2 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 2)); + _mm_storeu_si128(dst + 2, offsets2); + __m128i increment = _mm_set1_epi16(8); + for (int i = 1; i < numChunks; i++) { + dst += 3; + offsets0 = _mm_add_epi16(offsets0, increment); + offsets1 = _mm_add_epi16(offsets1, increment); + offsets2 = _mm_add_epi16(offsets2, increment); + _mm_storeu_si128(dst, offsets0); + _mm_storeu_si128(dst + 1, offsets1); + _mm_storeu_si128(dst + 2, offsets2); + } + } } inds_ += numTris * 3; // wind doesn't need to be updated, an even number of triangles have been drawn. From 9647872a094f51283b29daf1776c9ec0ceadd596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 13 Jun 2023 11:38:45 +0200 Subject: [PATCH 3/5] Same for NEON, first the refactor... --- GPU/Common/IndexGenerator.cpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index fc48ece6c0..436ddd6b5a 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -159,18 +159,23 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { #elif PPSSPP_ARCH(ARM_NEON) int numChunks = (numTris + 7) / 8; uint16x8_t ibase8 = vdupq_n_u16(index_); - uint16x8_t increment = vdupq_n_u16(8); const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise; - uint16x8_t offsets0 = vld1q_u16(offsets); - uint16x8_t offsets1 = vld1q_u16(offsets + 8); - uint16x8_t offsets2 = vld1q_u16(offsets + 16); u16 *dst = inds_; - for (int i = 0; i < numChunks; i++) { - vst1q_u16(dst, vaddq_u16(ibase8, offsets0)); - vst1q_u16(dst + 8, vaddq_u16(ibase8, offsets1)); - vst1q_u16(dst + 16, vaddq_u16(ibase8, offsets2)); - ibase8 = vaddq_u16(ibase8, increment); + uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets)); + vst1q_u16(dst, offsets0); + uint16x8_t offsets1 = vaddq_u16(ibase8, vld1q_u16(offsets + 8)); + vst1q_u16(dst + 8, offsets1); + uint16x8_t offsets2 = vaddq_u16(ibase8, vld1q_u16(offsets + 16)); + vst1q_u16(dst + 16, offsets2); + uint16x8_t increment = vdupq_n_u16(8); + for (int i = 1; i < numChunks; i++) { dst += 3 * 8; + offsets0 = vaddq_u16(offsets0, increment); + offsets1 = vaddq_u16(offsets1, increment); + offsets2 = vaddq_u16(offsets2, increment); + vst1q_u16(dst, offsets0); + vst1q_u16(dst + 8, offsets1); + vst1q_u16(dst + 16, offsets2); } inds_ += numTris * 3; #else From 0eb3702ecbab5655d5fe7ecc25d8149c7663993c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 13 Jun 2023 11:47:31 +0200 Subject: [PATCH 4/5] Then add the early-outs for NEON too. --- GPU/Common/IndexGenerator.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index 436ddd6b5a..7eb4c80079 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -163,19 +163,23 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { u16 *dst = inds_; uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets)); vst1q_u16(dst, offsets0); - uint16x8_t offsets1 = vaddq_u16(ibase8, vld1q_u16(offsets + 8)); - vst1q_u16(dst + 8, offsets1); - uint16x8_t offsets2 = vaddq_u16(ibase8, vld1q_u16(offsets + 16)); - vst1q_u16(dst + 16, offsets2); - uint16x8_t increment = vdupq_n_u16(8); - for (int i = 1; i < numChunks; i++) { - dst += 3 * 8; - offsets0 = vaddq_u16(offsets0, increment); - offsets1 = vaddq_u16(offsets1, increment); - offsets2 = vaddq_u16(offsets2, increment); - vst1q_u16(dst, offsets0); + if (numTris > 2) { + uint16x8_t offsets1 = vaddq_u16(ibase8, vld1q_u16(offsets + 8)); vst1q_u16(dst + 8, offsets1); - vst1q_u16(dst + 16, offsets2); + if (numTris > 5) { + uint16x8_t offsets2 = vaddq_u16(ibase8, vld1q_u16(offsets + 16)); + vst1q_u16(dst + 16, offsets2); + uint16x8_t increment = vdupq_n_u16(8); + for (int i = 1; i < numChunks; i++) { + dst += 3 * 8; + offsets0 = vaddq_u16(offsets0, increment); + offsets1 = vaddq_u16(offsets1, increment); + offsets2 = vaddq_u16(offsets2, increment); + vst1q_u16(dst, offsets0); + vst1q_u16(dst + 8, offsets1); + vst1q_u16(dst + 16, offsets2); + } + } } inds_ += numTris * 3; #else From df7bd89b7d773bb7f24d25cb04c726cefa0ca73b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 13 Jun 2023 11:57:28 +0200 Subject: [PATCH 5/5] Division->shift. since it's a signed integer, gets rid of a cdq instruction. --- GPU/Common/IndexGenerator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index 7eb4c80079..d1635e807d 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -129,7 +129,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { // We allow ourselves to write some extra indices to avoid the fallback loop. // That's alright as we're appending to a buffer - they will get overwritten anyway. - int numChunks = (numTris + 7) / 8; + int numChunks = (numTris + 7) >> 3; __m128i ibase8 = _mm_set1_epi16(index_); const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise); __m128i *dst = (__m128i *)inds_; @@ -157,7 +157,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { inds_ += numTris * 3; // wind doesn't need to be updated, an even number of triangles have been drawn. #elif PPSSPP_ARCH(ARM_NEON) - int numChunks = (numTris + 7) / 8; + int numChunks = (numTris + 7) >> 3; uint16x8_t ibase8 = vdupq_n_u16(index_); const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise; u16 *dst = inds_;