#include "Utf8.h"

/**
 * Returns the length in bytes of the UTF-8 sequence announced by a lead byte.
 *
 * @param c First byte of a sequence.
 * @return 1 for 0xxxxxxx (ASCII), 2 for 110xxxxx, 3 for 1110xxxx, 4 for 11110xxx.
 *         Every other byte (continuation bytes 0x80-0xBF and 0xF8-0xFF) also
 *         yields 1 as a fallback.
 */
int utf8CodepointLen(const unsigned char c) {
  if (c < 0x80) return 1;          // 0xxxxxxx
  if ((c >> 5) == 0x6) return 2;   // 110xxxxx
  if ((c >> 4) == 0xE) return 3;   // 1110xxxx
  if ((c >> 3) == 0x1E) return 4;  // 11110xxx
  return 1;                        // fallback for invalid
}

/**
 * Decodes the codepoint at *string and advances *string past it.
 *
 * @param string In/out cursor into a NUL-terminated byte string.
 * @return The decoded codepoint; 0 when the cursor is on the NUL terminator
 *         (the cursor is not advanced in that case); REPLACEMENT_GLYPH
 *         (declared outside this file) for malformed input.
 *
 * Cursor movement on malformed input:
 *  - invalid lead byte: advances by one byte;
 *  - missing/invalid continuation byte at offset i: advances by i bytes, so the
 *    offending byte is examined again by the next call;
 *  - overlong encoding, surrogate, or value above 0x10FFFF: advances by one byte.
 */
uint32_t utf8NextCodepoint(const unsigned char** string) {
  if (**string == 0) {
    return 0;
  }

  const unsigned char lead = **string;
  const int bytes = utf8CodepointLen(lead);
  const uint8_t* chr = *string;

  // Invalid lead byte (stray continuation byte 0x80-0xBF, or 0xF8-0xFF)
  if (bytes == 1 && lead >= 0x80) {
    (*string)++;
    return REPLACEMENT_GLYPH;
  }

  if (bytes == 1) {
    (*string)++;
    return chr[0];
  }

  // Validate continuation bytes before consuming them. A NUL byte fails this
  // test, so the scan stops at the terminator of a truncated sequence.
  for (int i = 1; i < bytes; i++) {
    if ((chr[i] & 0xC0) != 0x80) {
      // Missing or invalid continuation byte - skip all bytes consumed so far
      *string += i;
      return REPLACEMENT_GLYPH;
    }
  }

  // Payload bits of the lead byte: 5 for 2-byte, 4 for 3-byte, 3 for 4-byte sequences
  uint32_t cp = chr[0] & ((1 << (7 - bytes)) - 1);
  for (int i = 1; i < bytes; i++) {
    cp = (cp << 6) | (chr[i] & 0x3F);
  }

  // Reject overlong encodings, surrogates, and out-of-range values
  const bool overlong = (bytes == 2 && cp < 0x80) || (bytes == 3 && cp < 0x800) || (bytes == 4 && cp < 0x10000);
  const bool surrogate = (cp >= 0xD800 && cp <= 0xDFFF);
  if (overlong || surrogate || cp > 0x10FFFF) {
    (*string)++;
    return REPLACEMENT_GLYPH;
  }

  *string += bytes;
  return cp;
}

/**
 * Computes a length <= len that does not end in the middle of a UTF-8 sequence.
 *
 * @param buf Buffer holding at least len bytes.
 * @param len Number of bytes under consideration.
 * @return 0 when len <= 0; the offset of the last lead byte when the sequence it
 *         starts is shorter than its lead byte announces; otherwise len.
 *         If the backwards scan reaches index 0, len is returned unchanged even
 *         when that sequence is incomplete.
 */
int utf8SafeTruncateBuffer(const char* buf, int len) {
  if (len <= 0) return 0;

  // Walk back past continuation bytes (10xxxxxx) to find the lead byte
  int leadPos = len - 1;
  while (leadPos > 0 && (static_cast<unsigned char>(buf[leadPos]) & 0xC0) == 0x80) {
    leadPos--;
  }

  // Determine expected length of the sequence starting at leadPos
  const int expectedLen = utf8CodepointLen(static_cast<unsigned char>(buf[leadPos]));
  const int actualLen = len - leadPos;

  if (actualLen < expectedLen && leadPos > 0) {
    // Incomplete UTF-8 sequence at the end - exclude it
    return leadPos;
  }
  return len;
}

/**
 * Removes the last UTF-8 character from str: the final byte plus any
 * continuation bytes (10xxxxxx) immediately preceding it back to the lead byte.
 *
 * @param str String modified in place.
 * @return The size of str in bytes after removal (0 for an empty string).
 */
size_t utf8RemoveLastChar(std::string& str) {
  if (str.empty()) return 0;

  size_t pos = str.size() - 1;
  // Step back over continuation bytes; stops at index 0 at the latest
  while (pos > 0 && (static_cast<unsigned char>(str[pos]) & 0xC0) == 0x80) {
    --pos;
  }
  str.resize(pos);
  return pos;
}

// Truncate string by removing N UTF-8 characters from the end.
// Stops early once the string becomes empty.
void utf8TruncateChars(std::string& str, const size_t numChars) {
  for (size_t i = 0; i < numChars && !str.empty(); ++i) {
    utf8RemoveLastChar(str);
  }
}