fix: Handle non-ASCII characters in sanitizeFilename (#1132)

## Summary

**What is the goal of this PR?**

Probable fix for #1118. `sanitizeFilename` previously kept only
printable ASCII characters and dropped every non-ASCII byte. It now
preserves valid UTF-8 codepoints, including non-ASCII multibyte
sequences. Truncation is applied at a maximum number of bytes, rather
than characters, so that filenames with many multibyte sequences cannot
unexpectedly exceed FAT32 limits.

---

### AI Usage

While CrossPoint places no restrictions on the use of AI tools when
contributing, please be transparent about their usage, as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES, I described #1118
to Claude and it suggested sanitizeFilename as the likely cause**_
This commit is contained in:
Zach Nelson
2026-02-24 17:43:58 -06:00
committed by GitHub
parent c0cd7c13a3
commit 8ab2f22730
2 changed files with 31 additions and 21 deletions

View File

@@ -1,35 +1,45 @@
#include "StringUtils.h" #include "StringUtils.h"
#include <Utf8.h>
#include <cstring> #include <cstring>
namespace StringUtils { namespace StringUtils {
std::string sanitizeFilename(const std::string& name, size_t maxLength) { std::string sanitizeFilename(const std::string& name, size_t maxBytes) {
std::string result; std::string result;
result.reserve(name.size()); result.reserve(std::min(name.size(), maxBytes));
for (char c : name) { const auto* text = reinterpret_cast<const unsigned char*>(name.c_str());
// Replace invalid filename characters with underscore
if (c == '/' || c == '\\' || c == ':' || c == '*' || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') { // Skip leading spaces and dots so they don't consume the byte budget
while (*text == ' ' || *text == '.') {
text++;
}
// Process full UTF-8 codepoints to avoid trimming in the middle of a multibyte sequence
while (*text != 0) {
const auto* cpStart = text;
uint32_t cp = utf8NextCodepoint(&text);
if (cp == '/' || cp == '\\' || cp == ':' || cp == '*' || cp == '?' || cp == '"' || cp == '<' || cp == '>' ||
cp == '|') {
// Replace characters that are illegal in filenames with '_' (control characters are dropped, not replaced)
if (result.length() + 1 > maxBytes) break;
result += '_'; result += '_';
} else if (c >= 32 && c < 127) { } else if (cp >= 128 || (cp >= 32 && cp < 127)) {
// Keep printable ASCII characters const size_t cpBytes = text - cpStart;
result += c; if (result.length() + cpBytes > maxBytes) break;
result.append(reinterpret_cast<const char*>(cpStart), cpBytes);
} }
// Skip non-printable characters
} }
// Trim leading/trailing spaces and dots // Trim trailing spaces and dots
size_t start = result.find_first_not_of(" .");
if (start == std::string::npos) {
return "book"; // Fallback if name is all invalid characters
}
size_t end = result.find_last_not_of(" ."); size_t end = result.find_last_not_of(" .");
result = result.substr(start, end - start + 1); if (end != std::string::npos) {
result.resize(end + 1);
// Limit filename length } else {
if (result.length() > maxLength) { result.clear();
result.resize(maxLength);
} }
return result.empty() ? "book" : result; return result.empty() ? "book" : result;

View File

@@ -9,9 +9,9 @@ namespace StringUtils {
/** /**
* Sanitize a string for use as a filename. * Sanitize a string for use as a filename.
* Replaces invalid characters with underscores, trims spaces/dots, * Replaces invalid characters with underscores, trims spaces/dots,
* and limits length to maxLength characters. * and limits length to maxBytes bytes.
*/ */
std::string sanitizeFilename(const std::string& name, size_t maxLength = 100); std::string sanitizeFilename(const std::string& name, size_t maxBytes = 100);
/** /**
* Check if the given filename ends with the specified extension (case-insensitive). * Check if the given filename ends with the specified extension (case-insensitive).