Add std::string_view support to the core UTF-8 parsing code

2025-04-02 11:01:50 -04:00 · 2024-01-12 12:10:08 +01:00 · 2024-01-12 12:10:08 +01:00 · 9c9fe8bf8f
commit 9c9fe8bf8f
parent 26db1cb05d
4 changed files with 21 additions and 75 deletions
--- a/Common/Data/Encoding/Utf8.cpp
+++ b/Common/Data/Encoding/Utf8.cpp
@ -206,27 +206,15 @@ int u8_charnum(const char *s, int offset)
  return charnum;
 }

-/* number of characters */
-int u8_strlen(const char *s)
-{
-  int count = 0;
-  int i = 0;
-
-  while (u8_nextchar(s, &i) != 0)
-    count++;
-
-  return count;
-}
-
 /* reads the next utf-8 sequence out of a string, updating an index */
-uint32_t u8_nextchar(const char *s, int *index) {
+uint32_t u8_nextchar(const char *s, int *index, size_t size) {
 	uint32_t ch = 0;
 	int sz = 0;
 	int i = *index;
 	do {
 		ch = (ch << 6) + (unsigned char)s[i++];
 		sz++;
-	} while (s[i] && ((s[i]) & 0xC0) == 0x80);
+	} while (i < size && ((s[i]) & 0xC0) == 0x80);
 	*index = i;
 	return ch - offsetsFromUTF8[sz - 1];
 }
@ -234,7 +222,6 @@ uint32_t u8_nextchar(const char *s, int *index) {
 uint32_t u8_nextchar_unsafe(const char *s, int *i) {
 	uint32_t ch = (unsigned char)s[(*i)++];
 	int sz = 1;
-
 	if (ch >= 0xF0) {
 		sz++;
 		ch &= ~0x10;
@ -253,7 +240,6 @@ uint32_t u8_nextchar_unsafe(const char *s, int *i) {
 		ch <<= 6;
 		ch += ((unsigned char)s[(*i)++]) & 0x3F;
 	}
-
 	return ch;
 }

@ -367,48 +353,6 @@ int u8_unescape(char *buf, int sz, char *src)
  return c;
 }

-const char *u8_strchr(const char *s, uint32_t ch, int *charn)
-{
-  int i = 0, lasti=0;
-  uint32_t c;
-
-  *charn = 0;
-  while (s[i]) {
-    c = u8_nextchar(s, &i);
-    if (c == ch) {
-      return &s[lasti];
-    }
-    lasti = i;
-    (*charn)++;
-  }
-  return NULL;
-}
-
-const char *u8_memchr(const char *s, uint32_t ch, size_t sz, int *charn)
-{
-  size_t i = 0, lasti=0;
-  uint32_t c;
-  int csz;
-
-  *charn = 0;
-  while (i < sz) {
-    c = csz = 0;
-    do {
-      c <<= 6;
-      c += (unsigned char)s[i++];
-      csz++;
-    } while (i < sz && !isutf(s[i]));
-    c -= offsetsFromUTF8[csz-1];
-
-    if (c == ch) {
-      return &s[lasti];
-    }
-    lasti = i;
-    (*charn)++;
-  }
-  return NULL;
-}
-
 int u8_is_locale_utf8(const char *locale)
 {
  /* this code based on libutf8 */
@ -428,10 +372,10 @@ int u8_is_locale_utf8(const char *locale)
  return 0;
 }

-bool AnyEmojiInString(const char *s, size_t byteCount) {
+bool AnyEmojiInString(std::string_view str, size_t byteCount) {
 	int i = 0;
 	while (i < byteCount) {
-		uint32_t c = u8_nextchar(s, &i);
+		uint32_t c = u8_nextchar(str.data(), &i, str.size());
 		if (CodepointIsProbablyEmoji(c)) {
 			return true;
 		}
@ -517,8 +461,8 @@ std::string ConvertUCS2ToUTF8(const std::u16string &wstr) {
 	return s;
 }

-std::string SanitizeUTF8(const std::string &utf8string) {
-	UTF8 utf(utf8string.c_str());
+std::string SanitizeUTF8(std::string_view utf8string) {
+	UTF8 utf(utf8string);
 	std::string s;
 	// Worst case.
 	s.resize(utf8string.size() * 4);
--- a/Common/Data/Encoding/Utf8.h
+++ b/Common/Data/Encoding/Utf8.h
@ -18,11 +18,11 @@

 #include <cstdint>
 #include <string>
+#include <string_view>

-uint32_t u8_nextchar(const char *s, int *i);
+uint32_t u8_nextchar(const char *s, int *i, size_t size);
 uint32_t u8_nextchar_unsafe(const char *s, int *i);
 int u8_wc_toutf8(char *dest, uint32_t ch);
-int u8_strlen(const char *s);
 void u8_inc(const char *s, int *i);
 void u8_dec(const char *s, int *i);

@ -33,21 +33,23 @@ inline bool CodepointIsProbablyEmoji(uint32_t c) {
 	return c > 0xFFFF;
 }

-bool AnyEmojiInString(const char *s, size_t byteCount);
+bool AnyEmojiInString(std::string_view str, size_t byteCount);

 class UTF8 {
 public:
 	static const uint32_t INVALID = (uint32_t)-1;
-	UTF8(const char *c) : c_(c), index_(0) {}
-	UTF8(const char *c, int index) : c_(c), index_(index) {}
-	bool end() const { return c_[index_] == 0; }
+	// TODO: Try to get rid of this constructor.
+	explicit UTF8(const char *c) : c_(c), size_((int)strlen(c)), index_(0) {}
+	explicit UTF8(std::string_view view) : c_(view.data()), size_((int)view.size()), index_(0) {}
+	explicit UTF8(std::string_view view, int index) : c_(view.data()), size_((int)view.size()), index_(index) {}
+	bool end() const { return index_ == size_; }
 	// Returns true if the next character is outside BMP and Planes 1 - 16.
 	bool invalid() const {
 		unsigned char c = (unsigned char)c_[index_];
 		return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
 	}
 	uint32_t next() {
-		return u8_nextchar(c_, &index_);
+		return u8_nextchar(c_, &index_, size_);
 	}
 	// Allow invalid continuation bytes.
 	uint32_t next_unsafe() {
@ -55,7 +57,7 @@ public:
 	}
 	uint32_t peek() const {
 		int tempIndex = index_;
-		return u8_nextchar(c_, &tempIndex);
+		return u8_nextchar(c_, &tempIndex, size_);
 	}
 	void fwd() {
 		u8_inc(c_, &index_);
@ -64,7 +66,7 @@ public:
 		u8_dec(c_, &index_);
 	}
 	int length() const {
-		return u8_strlen(c_);
+		return size_;
 	}
 	int byteIndex() const {
 		return index_;
@ -88,6 +90,7 @@ public:
 private:
 	const char *c_;
 	int index_;
+	int size_;
 };

 int UTF8StringNonASCIICount(const char *utf8string);
@ -96,8 +99,7 @@ bool UTF8StringHasNonASCII(const char *utf8string);


 // Removes overlong encodings and similar.
-std::string SanitizeUTF8(const std::string &utf8string);
-
+std::string SanitizeUTF8(std::string_view utf8string);
 std::string CodepointToUTF8(uint32_t codePoint);


--- a/Qt/QtMain.cpp
+++ b/Qt/QtMain.cpp
@ -658,7 +658,7 @@ bool MainUI::event(QEvent *e) {
 			default:
 				if (str.size()) {
 					int pos = 0;
-					int unicode = u8_nextchar(str.c_str(), &pos);
+					int unicode = u8_nextchar(str.c_str(), &pos, str.size());
 					NativeKey(KeyInput(DEVICE_ID_KEYBOARD, unicode));
 				}
 				break;
--- a/SDL/SDLMain.cpp
+++ b/SDL/SDLMain.cpp
@ -867,7 +867,7 @@ static void ProcessSDLEvent(SDL_Window *window, const SDL_Event &event, InputSta
 	case SDL_TEXTINPUT:
 		{
 			int pos = 0;
-			int c = u8_nextchar(event.text.text, &pos);
+			int c = u8_nextchar(event.text.text, &pos, strlen(event.text.text));
 			KeyInput key;
 			key.flags = KEY_CHAR;
 			key.unicodeChar = c;