Add string_view functionality to the core utf-8 parsing code

This commit is contained in:
Henrik Rydgård 2024-01-12 12:10:08 +01:00
parent 26db1cb05d
commit 9c9fe8bf8f
4 changed files with 21 additions and 75 deletions

View file

@ -206,27 +206,15 @@ int u8_charnum(const char *s, int offset)
return charnum;
}
/*
 * Number of characters (decoded sequences) in the NUL-terminated string s.
 *
 * Counting stops at the terminating NUL byte, or earlier if u8_nextchar
 * decodes a sequence to the value 0.
 */
int u8_strlen(const char *s)
{
	int count = 0;
	int i = 0;
	// Test for the terminator here instead of letting u8_nextchar consume it:
	// the decoder advances past the byte it reads and then inspects the
	// following byte, which for the terminator lies outside the string.
	while (s[i] != 0 && u8_nextchar(s, &i) != 0)
		count++;
	return count;
}
/* reads the next utf-8 sequence out of a string, updating an index */
/*
 * s     - byte buffer to decode from.
 * index - in/out byte offset into s; on return it points just past the
 *         bytes that were consumed.
 * size  - (three-argument form) number of valid bytes in s; the scan for
 *         continuation bytes stops once the offset reaches it.
 *
 * Returns the accumulated byte value minus offsetsFromUTF8[sz - 1], where sz
 * is the number of bytes consumed.
 *
 * NOTE(review): two signature lines and two loop-condition lines appear
 * back to back here. They look like the before/after variants of the same
 * function (NUL-terminated scan vs. size-bounded scan) - confirm which one
 * is live before editing.
 */
uint32_t u8_nextchar(const char *s, int *index) {
uint32_t u8_nextchar(const char *s, int *index, size_t size) {
uint32_t ch = 0;
int sz = 0;
int i = *index;
do {
// The first byte is consumed unconditionally: neither loop condition is
// evaluated before this read, so the caller must ensure *index refers to
// a readable byte.
ch = (ch << 6) + (unsigned char)s[i++];
sz++;
// Keep going while the next byte has the 10xxxxxx (continuation) bit pattern.
} while (s[i] && ((s[i]) & 0xC0) == 0x80);
// NOTE(review): `i` is an int and `size` a size_t, so this comparison is
// done in unsigned arithmetic; a negative i would compare as a huge value.
} while (i < size && ((s[i]) & 0xC0) == 0x80);
*index = i;
// NOTE(review): sz grows with every continuation byte and is not clamped
// before indexing offsetsFromUTF8 (table defined elsewhere) - verify that a
// long run of continuation bytes cannot index past the end of the table.
return ch - offsetsFromUTF8[sz - 1];
}
@ -234,7 +222,6 @@ uint32_t u8_nextchar(const char *s, int *index) {
uint32_t u8_nextchar_unsafe(const char *s, int *i) {
uint32_t ch = (unsigned char)s[(*i)++];
int sz = 1;
if (ch >= 0xF0) {
sz++;
ch &= ~0x10;
@ -253,7 +240,6 @@ uint32_t u8_nextchar_unsafe(const char *s, int *i) {
ch <<= 6;
ch += ((unsigned char)s[(*i)++]) & 0x3F;
}
return ch;
}
@ -367,48 +353,6 @@ int u8_unescape(char *buf, int sz, char *src)
return c;
}
/*
 * Finds the first occurrence of code point ch in the NUL-terminated string s.
 *
 * Returns a pointer to the first byte of the matching sequence, or a null
 * pointer when the terminator is reached without a match. *charn receives
 * the number of characters decoded before the match (or the total number
 * decoded when nothing matched).
 */
const char *u8_strchr(const char *s, uint32_t ch, int *charn)
{
	int cursor = 0;
	for (*charn = 0; s[cursor] != 0; ++*charn) {
		// Remember where this sequence begins; u8_nextchar moves cursor past it.
		const int start = cursor;
		if (u8_nextchar(s, &cursor) == ch)
			return s + start;
	}
	return nullptr;
}
/*
 * Finds the first occurrence of code point ch within the first sz bytes of s.
 *
 * Unlike u8_strchr this does not stop at NUL bytes; only sz bounds the scan.
 * Returns a pointer to the first byte of the matching sequence, or a null
 * pointer when no sequence in the range decodes to ch. *charn receives the
 * number of sequences decoded before the match (or the total number decoded
 * when nothing matched).
 */
const char *u8_memchr(const char *s, uint32_t ch, size_t sz, int *charn)
{
	size_t pos = 0;
	*charn = 0;
	while (pos < sz) {
		const size_t start = pos;
		uint32_t decoded = 0;
		int seqLen = 0;
		// Fold in the first byte, then every following byte for which
		// isutf() is false, never reading at or beyond sz.
		for (;;) {
			decoded = (decoded << 6) + (unsigned char)s[pos];
			++pos;
			++seqLen;
			if (pos >= sz || isutf(s[pos]))
				break;
		}
		// Subtract the per-length offset from the accumulated raw bytes.
		decoded -= offsetsFromUTF8[seqLen - 1];
		if (decoded == ch)
			return s + start;
		++*charn;
	}
	return nullptr;
}
int u8_is_locale_utf8(const char *locale)
{
/* this code based on libutf8 */
@ -428,10 +372,10 @@ int u8_is_locale_utf8(const char *locale)
return 0;
}
bool AnyEmojiInString(const char *s, size_t byteCount) {
bool AnyEmojiInString(std::string_view str, size_t byteCount) {
int i = 0;
while (i < byteCount) {
uint32_t c = u8_nextchar(s, &i);
uint32_t c = u8_nextchar(str.data(), &i, str.size());
if (CodepointIsProbablyEmoji(c)) {
return true;
}
@ -517,8 +461,8 @@ std::string ConvertUCS2ToUTF8(const std::u16string &wstr) {
return s;
}
std::string SanitizeUTF8(const std::string &utf8string) {
UTF8 utf(utf8string.c_str());
std::string SanitizeUTF8(std::string_view utf8string) {
UTF8 utf(utf8string);
std::string s;
// Worst case.
s.resize(utf8string.size() * 4);

View file

@ -18,11 +18,11 @@
#include <cstdint>
#include <string>
#include <string_view>
uint32_t u8_nextchar(const char *s, int *i);
uint32_t u8_nextchar(const char *s, int *i, size_t size);
uint32_t u8_nextchar_unsafe(const char *s, int *i);
int u8_wc_toutf8(char *dest, uint32_t ch);
int u8_strlen(const char *s);
void u8_inc(const char *s, int *i);
void u8_dec(const char *s, int *i);
@ -33,21 +33,23 @@ inline bool CodepointIsProbablyEmoji(uint32_t c) {
return c > 0xFFFF;
}
bool AnyEmojiInString(const char *s, size_t byteCount);
bool AnyEmojiInString(std::string_view str, size_t byteCount);
class UTF8 {
public:
static const uint32_t INVALID = (uint32_t)-1;
UTF8(const char *c) : c_(c), index_(0) {}
UTF8(const char *c, int index) : c_(c), index_(index) {}
bool end() const { return c_[index_] == 0; }
// TODO: Try to get rid of this constructor.
explicit UTF8(const char *c) : c_(c), size_((int)strlen(c)), index_(0) {}
explicit UTF8(std::string_view view) : c_(view.data()), size_((int)view.size()), index_(0) {}
explicit UTF8(std::string_view view, int index) : c_(view.data()), size_((int)view.size()), index_(index) {}
bool end() const { return index_ == size_; }
// Returns true if the next character is outside BMP and Planes 1 - 16.
bool invalid() const {
unsigned char c = (unsigned char)c_[index_];
return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
}
uint32_t next() {
return u8_nextchar(c_, &index_);
return u8_nextchar(c_, &index_, size_);
}
// Allow invalid continuation bytes.
uint32_t next_unsafe() {
@ -55,7 +57,7 @@ public:
}
uint32_t peek() const {
int tempIndex = index_;
return u8_nextchar(c_, &tempIndex);
return u8_nextchar(c_, &tempIndex, size_);
}
void fwd() {
u8_inc(c_, &index_);
@ -64,7 +66,7 @@ public:
u8_dec(c_, &index_);
}
int length() const {
return u8_strlen(c_);
return size_;
}
int byteIndex() const {
return index_;
@ -88,6 +90,7 @@ public:
private:
const char *c_;
int index_;
int size_;
};
int UTF8StringNonASCIICount(const char *utf8string);
@ -96,8 +99,7 @@ bool UTF8StringHasNonASCII(const char *utf8string);
// Removes overlong encodings and similar.
std::string SanitizeUTF8(const std::string &utf8string);
std::string SanitizeUTF8(std::string_view utf8string);
std::string CodepointToUTF8(uint32_t codePoint);

View file

@ -658,7 +658,7 @@ bool MainUI::event(QEvent *e) {
default:
if (str.size()) {
int pos = 0;
int unicode = u8_nextchar(str.c_str(), &pos);
int unicode = u8_nextchar(str.c_str(), &pos, str.size());
NativeKey(KeyInput(DEVICE_ID_KEYBOARD, unicode));
}
break;

View file

@ -867,7 +867,7 @@ static void ProcessSDLEvent(SDL_Window *window, const SDL_Event &event, InputSta
case SDL_TEXTINPUT:
{
int pos = 0;
int c = u8_nextchar(event.text.text, &pos);
int c = u8_nextchar(event.text.text, &pos, strlen(event.text.text));
KeyInput key;
key.flags = KEY_CHAR;
key.unicodeChar = c;