UTF-8 PPGe: Remove overlong encodings. Work around a weird issue in Ratchet & Clank. See issue #14297

This commit is contained in:
Henrik Rydgård 2021-03-20 11:18:54 +01:00
parent be6e975029
commit 5802529daa
4 changed files with 36 additions and 3 deletions

View file

@ -483,6 +483,21 @@ std::string ConvertUCS2ToUTF8(const std::u16string &wstr) {
return s;
}
std::string SanitizeUTF8(const std::string &utf8string) {
UTF8 utf(utf8string.c_str());
std::string s;
// Worst case.
s.resize(utf8string.size() * 4);
size_t pos = 0;
while (!utf.end_or_overlong_end()) {
int c = utf.next();
pos += UTF8::encode(&s[pos], c);
}
s.resize(pos);
return s;
}
static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, const std::string &source) {
const char16_t *const orig = dest;
const char16_t *const destEnd = dest + destSize;

View file

@ -31,10 +31,11 @@ public:
// Wraps the string c and starts reading at index 0. Only the pointer is stored.
UTF8(const char *c) : c_(c), index_(0) {}
// Wraps the string c and starts reading at the given index. Only the pointer is stored.
UTF8(const char *c, int index) : c_(c), index_(index) {}
// True when the char at the current index is the 0 terminator.
bool end() const {
	return c_[index_] == '\0';
}
// True when the next code point, as decoded by peek(), is 0. Unlike end(), this
// looks at the decoded value rather than the raw char, so (per the name) it is
// meant to also catch an overlong-encoded terminator.
bool end_or_overlong_end() const {
	const uint32_t upcoming = peek();
	return upcoming == 0;
}
// Decodes via u8_nextchar starting at the current index and returns the result.
// index_ is passed by pointer so u8_nextchar can move it forward (presumably
// past the sequence it consumed - confirm against u8_nextchar).
uint32_t next() {
	const uint32_t codePoint = u8_nextchar(c_, &index_);
	return codePoint;
}
uint32_t peek() {
// Decodes via u8_nextchar from the current index without moving it: the decode
// runs on a scratch copy of the index, so index_ itself is left untouched.
uint32_t peek() const {
	int lookahead = index_;
	const uint32_t codePoint = u8_nextchar(c_, &lookahead);
	return codePoint;
}
@ -76,6 +77,10 @@ int UTF8StringNonASCIICount(const char *utf8string);
bool UTF8StringHasNonASCII(const char *utf8string);
// Removes overlong encodings and similar.
std::string SanitizeUTF8(const std::string &utf8string);
// UTF8 to Win32 UTF-16
// Should be used when calling Win32 api calls
#ifdef _WIN32

View file

@ -955,12 +955,24 @@ static void PPGeDecimateTextImages(int age) {
}
void PPGeDrawText(const char *text, float x, float y, const PPGeStyle &style) {
if (!text || !strlen(text)) {
if (!text) {
return;
}
// Seen in Ratchet & Clank - Secret Agent. To match the output of the real thing, we have to remove
// both the overlong encoding and "ENTR", whatever that is. If we just let SanitizeUTF8 remove
// the overlong null, the rest of the string is missing in the bottom left corner (save size, etc).
// It doesn't seem to be using sceCcc.
// Note how the double "" is required in the middle of the string to end the \x80 constant (otherwise it takes E).
// TODO: Potentially if the string is only ended by a C080, ReplaceAll might overshoot :(
std::string str = ReplaceAll(text, "\xC0\x80""ENTR", "");
// Then SanitizeUTF8 is needed to get rid of various other overlong encodings.
str = SanitizeUTF8(str);
if (str.empty()) {
return;
}
if (HasTextDrawer()) {
PPGeTextDrawerImage im = PPGeGetTextImage(text, style, 480.0f - x, false);
PPGeTextDrawerImage im = PPGeGetTextImage(str.c_str(), style, 480.0f - x, false);
if (im.ptr) {
PPGeDrawTextImage(im, x, y, style);
return;

View file

@ -44,6 +44,7 @@
#include "ext/disarm.h"
#include "Common/Math/math_util.h"
#include "Common/Data/Text/Parsers.h"
#include "Common/Data/Encoding/Utf8.h"
#include "Common/ArmEmitter.h"
#include "Common/BitScan.h"