2025-12-03 22:00:29 +11:00
|
|
|
#include "Utf8.h"
|
|
|
|
|
|
|
|
|
|
// Returns the length in bytes of a UTF-8 sequence, judged solely from the bit
// pattern of its first byte.
//
//   c  - first byte of the sequence.
//
// Returns 2, 3 or 4 for the multi-byte lead patterns and 1 for ASCII. Any byte
// that matches no lead pattern (continuation bytes 0x80-0xBF, and 0xF8-0xFF)
// also yields 1 as the fallback for invalid input. Only the pattern is checked:
// lead bytes that can never begin a well-formed sequence (e.g. 0xC0, 0xF5) are
// still reported with the length their pattern implies.
int utf8CodepointLen(const unsigned char c) {
  if (c < 0x80) return 1;          // 0xxxxxxx
  if ((c >> 5) == 0x6) return 2;   // 110xxxxx
  if ((c >> 4) == 0xE) return 3;   // 1110xxxx
  if ((c >> 3) == 0x1E) return 4;  // 11110xxx
  return 1;                        // fallback for invalid
}
|
|
|
|
|
|
|
|
|
|
uint32_t utf8NextCodepoint(const unsigned char** string) {
|
|
|
|
|
if (**string == 0) {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-12 07:05:46 +11:00
|
|
|
const unsigned char lead = **string;
|
|
|
|
|
const int bytes = utf8CodepointLen(lead);
|
2025-12-03 22:00:29 +11:00
|
|
|
const uint8_t* chr = *string;
|
2026-03-12 07:05:46 +11:00
|
|
|
|
|
|
|
|
// Invalid lead byte (stray continuation byte 0x80-0xBF, or 0xFE/0xFF)
|
|
|
|
|
if (bytes == 1 && lead >= 0x80) {
|
|
|
|
|
(*string)++;
|
|
|
|
|
return REPLACEMENT_GLYPH;
|
|
|
|
|
}
|
2025-12-03 22:00:29 +11:00
|
|
|
|
|
|
|
|
if (bytes == 1) {
|
2026-03-12 07:05:46 +11:00
|
|
|
(*string)++;
|
2025-12-03 22:00:29 +11:00
|
|
|
return chr[0];
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-12 07:05:46 +11:00
|
|
|
// Validate continuation bytes before consuming them
|
|
|
|
|
for (int i = 1; i < bytes; i++) {
|
|
|
|
|
if ((chr[i] & 0xC0) != 0x80) {
|
|
|
|
|
// Missing or invalid continuation byte — skip all bytes consumed so far
|
|
|
|
|
*string += i;
|
|
|
|
|
return REPLACEMENT_GLYPH;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-03 22:00:29 +11:00
|
|
|
uint32_t cp = chr[0] & ((1 << (7 - bytes)) - 1); // mask header bits
|
|
|
|
|
|
|
|
|
|
for (int i = 1; i < bytes; i++) {
|
|
|
|
|
cp = (cp << 6) | (chr[i] & 0x3F);
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-12 07:05:46 +11:00
|
|
|
// Reject overlong encodings, surrogates, and out-of-range values
|
|
|
|
|
const bool overlong = (bytes == 2 && cp < 0x80) || (bytes == 3 && cp < 0x800) || (bytes == 4 && cp < 0x10000);
|
|
|
|
|
const bool surrogate = (cp >= 0xD800 && cp <= 0xDFFF);
|
|
|
|
|
if (overlong || surrogate || cp > 0x10FFFF) {
|
|
|
|
|
(*string)++;
|
|
|
|
|
return REPLACEMENT_GLYPH;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*string += bytes;
|
|
|
|
|
|
2025-12-03 22:00:29 +11:00
|
|
|
return cp;
|
|
|
|
|
}
|
2026-02-01 16:23:48 +05:00
|
|
|
|
2026-03-12 07:05:46 +11:00
|
|
|
int utf8SafeTruncateBuffer(const char* buf, int len) {
|
|
|
|
|
if (len <= 0) return 0;
|
|
|
|
|
|
|
|
|
|
// Walk back past continuation bytes (10xxxxxx) to find the lead byte
|
|
|
|
|
int leadPos = len - 1;
|
|
|
|
|
while (leadPos > 0 && (static_cast<uint8_t>(buf[leadPos]) & 0xC0) == 0x80) {
|
|
|
|
|
leadPos--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Determine expected length of the sequence starting at leadPos
|
|
|
|
|
int expectedLen = utf8CodepointLen(static_cast<unsigned char>(buf[leadPos]));
|
|
|
|
|
int actualLen = len - leadPos;
|
|
|
|
|
|
|
|
|
|
if (actualLen < expectedLen && leadPos > 0) {
|
|
|
|
|
// Incomplete UTF-8 sequence at the end — exclude it
|
|
|
|
|
return leadPos;
|
|
|
|
|
}
|
|
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-01 16:23:48 +05:00
|
|
|
// Removes the last UTF-8 character from str in place.
//
// Scans backwards over trailing continuation bytes (10xxxxxx) to the byte that
// starts the final character and cuts the string there. The scan stops at
// index 0, so a string consisting only of continuation bytes is emptied.
//
// Returns the new size of str (0 if str was already empty).
size_t utf8RemoveLastChar(std::string& str) {
  if (str.empty()) return 0;

  size_t pos = str.size() - 1;
  while (pos > 0 && (static_cast<unsigned char>(str[pos]) & 0xC0) == 0x80) {
    --pos;
  }

  str.resize(pos);
  return pos;
}
|
|
|
|
|
|
|
|
|
|
// Truncate string by removing N UTF-8 characters from the end
|
|
|
|
|
void utf8TruncateChars(std::string& str, const size_t numChars) {
|
|
|
|
|
for (size_t i = 0; i < numChars && !str.empty(); ++i) {
|
|
|
|
|
utf8RemoveLastChar(str);
|
|
|
|
|
}
|
|
|
|
|
}
|