diff --git a/Common/Data/Encoding/Utf8.cpp b/Common/Data/Encoding/Utf8.cpp index fb3e829d16..4eea26adc0 100644 --- a/Common/Data/Encoding/Utf8.cpp +++ b/Common/Data/Encoding/Utf8.cpp @@ -206,27 +206,15 @@ int u8_charnum(const char *s, int offset) return charnum; } -/* number of characters */ -int u8_strlen(const char *s) -{ - int count = 0; - int i = 0; - - while (u8_nextchar(s, &i) != 0) - count++; - - return count; -} - /* reads the next utf-8 sequence out of a string, updating an index */ -uint32_t u8_nextchar(const char *s, int *index) { +uint32_t u8_nextchar(const char *s, int *index, size_t size) { uint32_t ch = 0; int sz = 0; int i = *index; do { ch = (ch << 6) + (unsigned char)s[i++]; sz++; - } while (s[i] && ((s[i]) & 0xC0) == 0x80); + } while (i < size && ((s[i]) & 0xC0) == 0x80); *index = i; return ch - offsetsFromUTF8[sz - 1]; } @@ -234,7 +222,6 @@ uint32_t u8_nextchar(const char *s, int *index) { uint32_t u8_nextchar_unsafe(const char *s, int *i) { uint32_t ch = (unsigned char)s[(*i)++]; int sz = 1; - if (ch >= 0xF0) { sz++; ch &= ~0x10; @@ -253,7 +240,6 @@ uint32_t u8_nextchar_unsafe(const char *s, int *i) { ch <<= 6; ch += ((unsigned char)s[(*i)++]) & 0x3F; } - return ch; } @@ -367,48 +353,6 @@ int u8_unescape(char *buf, int sz, char *src) return c; } -const char *u8_strchr(const char *s, uint32_t ch, int *charn) -{ - int i = 0, lasti=0; - uint32_t c; - - *charn = 0; - while (s[i]) { - c = u8_nextchar(s, &i); - if (c == ch) { - return &s[lasti]; - } - lasti = i; - (*charn)++; - } - return NULL; -} - -const char *u8_memchr(const char *s, uint32_t ch, size_t sz, int *charn) -{ - size_t i = 0, lasti=0; - uint32_t c; - int csz; - - *charn = 0; - while (i < sz) { - c = csz = 0; - do { - c <<= 6; - c += (unsigned char)s[i++]; - csz++; - } while (i < sz && !isutf(s[i])); - c -= offsetsFromUTF8[csz-1]; - - if (c == ch) { - return &s[lasti]; - } - lasti = i; - (*charn)++; - } - return NULL; -} - int u8_is_locale_utf8(const char *locale) { /* this code based on libutf8 */ @@ -428,10 +372,10 @@ int u8_is_locale_utf8(const char *locale) return 0; } -bool AnyEmojiInString(const char *s, size_t byteCount) { +bool AnyEmojiInString(std::string_view str, size_t byteCount) { int i = 0; while (i < byteCount) { - uint32_t c = u8_nextchar(s, &i); + uint32_t c = u8_nextchar(str.data(), &i, str.size()); if (CodepointIsProbablyEmoji(c)) { return true; } @@ -517,8 +461,8 @@ std::string ConvertUCS2ToUTF8(const std::u16string &wstr) { return s; } -std::string SanitizeUTF8(const std::string &utf8string) { - UTF8 utf(utf8string.c_str()); +std::string SanitizeUTF8(std::string_view utf8string) { + UTF8 utf(utf8string); std::string s; // Worst case. s.resize(utf8string.size() * 4); diff --git a/Common/Data/Encoding/Utf8.h b/Common/Data/Encoding/Utf8.h index 5123ef8a4b..5882ec1a99 100644 --- a/Common/Data/Encoding/Utf8.h +++ b/Common/Data/Encoding/Utf8.h @@ -18,11 +18,11 @@ #include #include +#include -uint32_t u8_nextchar(const char *s, int *i); +uint32_t u8_nextchar(const char *s, int *i, size_t size); uint32_t u8_nextchar_unsafe(const char *s, int *i); int u8_wc_toutf8(char *dest, uint32_t ch); -int u8_strlen(const char *s); void u8_inc(const char *s, int *i); void u8_dec(const char *s, int *i); @@ -33,21 +33,23 @@ inline bool CodepointIsProbablyEmoji(uint32_t c) { return c > 0xFFFF; } -bool AnyEmojiInString(const char *s, size_t byteCount); +bool AnyEmojiInString(std::string_view str, size_t byteCount); class UTF8 { public: static const uint32_t INVALID = (uint32_t)-1; - UTF8(const char *c) : c_(c), index_(0) {} - UTF8(const char *c, int index) : c_(c), index_(index) {} - bool end() const { return c_[index_] == 0; } + // TODO: Try to get rid of this constructor. + explicit UTF8(const char *c) : c_(c), size_((int)strlen(c)), index_(0) {} + explicit UTF8(std::string_view view) : c_(view.data()), size_((int)view.size()), index_(0) {} + explicit UTF8(std::string_view view, int index) : c_(view.data()), size_((int)view.size()), index_(index) {} + bool end() const { return index_ == size_; } // Returns true if the next character is outside BMP and Planes 1 - 16. bool invalid() const { unsigned char c = (unsigned char)c_[index_]; return (c >= 0x80 && c <= 0xC1) || c >= 0xF5; } uint32_t next() { - return u8_nextchar(c_, &index_); + return u8_nextchar(c_, &index_, size_); } // Allow invalid continuation bytes. uint32_t next_unsafe() { @@ -55,7 +57,7 @@ public: } uint32_t peek() const { int tempIndex = index_; - return u8_nextchar(c_, &tempIndex); + return u8_nextchar(c_, &tempIndex, size_); } void fwd() { u8_inc(c_, &index_); @@ -64,7 +66,7 @@ public: u8_dec(c_, &index_); } int length() const { - return u8_strlen(c_); + return size_; } int byteIndex() const { return index_; @@ -88,6 +90,7 @@ public: private: const char *c_; int index_; + int size_; }; int UTF8StringNonASCIICount(const char *utf8string); @@ -96,8 +99,7 @@ bool UTF8StringHasNonASCII(const char *utf8string); // Removes overlong encodings and similar. -std::string SanitizeUTF8(const std::string &utf8string); - +std::string SanitizeUTF8(std::string_view utf8string); std::string CodepointToUTF8(uint32_t codePoint); diff --git a/Qt/QtMain.cpp b/Qt/QtMain.cpp index 0ba01e48d1..b406305a6b 100644 --- a/Qt/QtMain.cpp +++ b/Qt/QtMain.cpp @@ -658,7 +658,7 @@ bool MainUI::event(QEvent *e) { default: if (str.size()) { int pos = 0; - int unicode = u8_nextchar(str.c_str(), &pos); + int unicode = u8_nextchar(str.c_str(), &pos, str.size()); NativeKey(KeyInput(DEVICE_ID_KEYBOARD, unicode)); } break; diff --git a/SDL/SDLMain.cpp b/SDL/SDLMain.cpp index a5d44460fe..cb3c51eee8 100644 --- a/SDL/SDLMain.cpp +++ b/SDL/SDLMain.cpp @@ -867,7 +867,7 @@ static void ProcessSDLEvent(SDL_Window *window, const SDL_Event &event, InputSta case SDL_TEXTINPUT: { int pos = 0; - int c = u8_nextchar(event.text.text, &pos); + int c = u8_nextchar(event.text.text, &pos, strlen(event.text.text)); KeyInput key; key.flags = KEY_CHAR; key.unicodeChar = c;