From 8ab2f227307dd43593803e0c7c524c155bff2753 Mon Sep 17 00:00:00 2001 From: Zach Nelson Date: Tue, 24 Feb 2026 17:43:58 -0600 Subject: [PATCH] fix: Handle non-ASCII characters in sanitizeFilename (#1132) ## Summary **What is the goal of this PR?** Probable fix for #1118. `sanitizeFilename` previously kept only printable ASCII characters and silently dropped everything else. It now preserves valid UTF-8 codepoints, including non-ASCII multibyte sequences. Truncation is now bounded by a maximum number of bytes, rather than characters, to prevent filenames with many multibyte sequences from unexpectedly exceeding FAT32 limits. --- ### AI Usage While CrossPoint doesn't have restrictions on using AI tools when contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**YES, I described #1118 to Claude and it suggested sanitizeFilename as the likely cause**_ --- src/util/StringUtils.cpp | 48 ++++++++++++++++++++++++---------------- src/util/StringUtils.h | 4 ++-- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/util/StringUtils.cpp b/src/util/StringUtils.cpp index 8e2ce58e..ed989d55 100644 --- a/src/util/StringUtils.cpp +++ b/src/util/StringUtils.cpp @@ -1,35 +1,45 @@ #include "StringUtils.h" +#include + #include namespace StringUtils { -std::string sanitizeFilename(const std::string& name, size_t maxLength) { +std::string sanitizeFilename(const std::string& name, size_t maxBytes) { std::string result; - result.reserve(name.size()); + result.reserve(std::min(name.size(), maxBytes)); - for (char c : name) { - // Replace invalid filename characters with underscore - if (c == '/' || c == '\\' || c == ':' || c == '*' || c == '?' 
|| c == '"' || c == '<' || c == '>' || c == '|') { + const auto* text = reinterpret_cast(name.c_str()); + + // Skip leading spaces and dots so they don't consume the byte budget + while (*text == ' ' || *text == '.') { + text++; + } + + // Process full UTF-8 codepoints to avoid trimming in the middle of a multibyte sequence + while (*text != 0) { + const auto* cpStart = text; + uint32_t cp = utf8NextCodepoint(&text); + + if (cp == '/' || cp == '\\' || cp == ':' || cp == '*' || cp == '?' || cp == '"' || cp == '<' || cp == '>' || + cp == '|') { + // Replace illegal filename characters with '_' (control characters match no branch and are dropped) + if (result.length() + 1 > maxBytes) break; result += '_'; - } else if (c >= 32 && c < 127) { - // Keep printable ASCII characters - result += c; + } else if (cp >= 128 || (cp >= 32 && cp < 127)) { + const size_t cpBytes = text - cpStart; + if (result.length() + cpBytes > maxBytes) break; + result.append(reinterpret_cast(cpStart), cpBytes); } - // Skip non-printable characters } - // Trim leading/trailing spaces and dots - size_t start = result.find_first_not_of(" ."); - if (start == std::string::npos) { - return "book"; // Fallback if name is all invalid characters - } + // Trim trailing spaces and dots size_t end = result.find_last_not_of(" ."); - result = result.substr(start, end - start + 1); - - // Limit filename length - if (result.length() > maxLength) { - result.resize(maxLength); + if (end != std::string::npos) { + result.resize(end + 1); + } else { + result.clear(); } return result.empty() ? "book" : result; diff --git a/src/util/StringUtils.h b/src/util/StringUtils.h index 4b93729b..7909fe44 100644 --- a/src/util/StringUtils.h +++ b/src/util/StringUtils.h @@ -9,9 +9,9 @@ namespace StringUtils { /** * Sanitize a string for use as a filename. * Replaces invalid characters with underscores, trims spaces/dots, - * and limits length to maxLength characters. + * and limits length to maxBytes bytes. 
*/ -std::string sanitizeFilename(const std::string& name, size_t maxLength = 100); +std::string sanitizeFilename(const std::string& name, size_t maxBytes = 100); /** * Check if the given filename ends with the specified extension (case-insensitive).