From 5802529daa3779c38f0b5315290911ed93fc8c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 20 Mar 2021 11:18:54 +0100 Subject: [PATCH] UTF-8 PPGe: Remove overlong encodings. Work around a weird issue in Ratchet & Clank. See issue #14297 --- Common/Data/Encoding/Utf8.cpp | 15 +++++++++++++++ Common/Data/Encoding/Utf8.h | 7 ++++++- Core/Util/PPGeDraw.cpp | 16 ++++++++++++++-- unittest/UnitTest.cpp | 1 + 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Common/Data/Encoding/Utf8.cpp b/Common/Data/Encoding/Utf8.cpp index f3c2b1e223..0a2a7355cc 100644 --- a/Common/Data/Encoding/Utf8.cpp +++ b/Common/Data/Encoding/Utf8.cpp @@ -483,6 +483,21 @@ std::string ConvertUCS2ToUTF8(const std::u16string &wstr) { return s; } +std::string SanitizeUTF8(const std::string &utf8string) { + UTF8 utf(utf8string.c_str()); + std::string s; + // Worst case: reserve 4 output bytes per input byte (assumes UTF8::encode() writes at most 4 bytes per code point; TODO confirm). + s.resize(utf8string.size() * 4); + + size_t pos = 0; + while (!utf.end_or_overlong_end()) { + int c = utf.next(); + pos += UTF8::encode(&s[pos], c); + } + s.resize(pos); + return s; +} + static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, const std::string &source) { const char16_t *const orig = dest; const char16_t *const destEnd = dest + destSize; diff --git a/Common/Data/Encoding/Utf8.h b/Common/Data/Encoding/Utf8.h index b954ef5371..f36cb3e75a 100644 --- a/Common/Data/Encoding/Utf8.h +++ b/Common/Data/Encoding/Utf8.h @@ -31,10 +31,11 @@ public: UTF8(const char *c) : c_(c), index_(0) {} UTF8(const char *c, int index) : c_(c), index_(index) {} bool end() const { return c_[index_] == 0; } + bool end_or_overlong_end() const { return peek() == 0; } uint32_t next() { return u8_nextchar(c_, &index_); } - uint32_t peek() { + uint32_t peek() const { int tempIndex = index_; return u8_nextchar(c_, &tempIndex); } @@ -76,6 +77,10 @@ int UTF8StringNonASCIICount(const char *utf8string); bool UTF8StringHasNonASCII(const char *utf8string); +// Returns a copy of utf8string with each code point decoded and re-encoded, which removes overlong encodings and similar. Stops at the first code point that decodes to 0, so an overlong-encoded NUL also ends the string. 
+std::string SanitizeUTF8(const std::string &utf8string); + + // UTF8 to Win32 UTF-16 // Should be used when calling Win32 api calls #ifdef _WIN32 diff --git a/Core/Util/PPGeDraw.cpp b/Core/Util/PPGeDraw.cpp index 9674f69469..bc529efe3b 100644 --- a/Core/Util/PPGeDraw.cpp +++ b/Core/Util/PPGeDraw.cpp @@ -955,12 +955,24 @@ static void PPGeDecimateTextImages(int age) { } void PPGeDrawText(const char *text, float x, float y, const PPGeStyle &style) { - if (!text || !strlen(text)) { + if (!text) { + return; + } + // Seen in Ratchet & Clank - Secret Agent. To match the output of the real thing, we have to remove + // both the overlong encoding and "ENTR", whatever that is. If we just let SanitizeUTF8 remove + // the overlong null, the rest of the string is missing in the bottom left corner (save size, etc). + // It doesn't seem to be using sceCcc. + // Note how the double "" is required in the middle of the string to end the \x80 constant (otherwise it takes E). + // TODO: Potentially if the string is only ended by a C080, ReplaceAll might overshoot :( + std::string str = ReplaceAll(text, "\xC0\x80""ENTR", ""); + // Then SanitizeUTF8 is needed to get rid of various other overlong encodings. + str = SanitizeUTF8(str); + if (str.empty()) { return; } if (HasTextDrawer()) { - PPGeTextDrawerImage im = PPGeGetTextImage(text, style, 480.0f - x, false); + PPGeTextDrawerImage im = PPGeGetTextImage(str.c_str(), style, 480.0f - x, false); if (im.ptr) { PPGeDrawTextImage(im, x, y, style); return; diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index 2733287e55..65447c27f0 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -44,6 +44,7 @@ #include "ext/disarm.h" #include "Common/Math/math_util.h" #include "Common/Data/Text/Parsers.h" +#include "Common/Data/Encoding/Utf8.h" #include "Common/ArmEmitter.h" #include "Common/BitScan.h"