fix: Handle non-ASCII characters in sanitizeFilename (#1132)

## Summary

**What is the goal of this PR?**

Probable fix for #1118. `sanitizeFilename` was only passing through
ASCII characters from filenames. It now preserves valid UTF-8
codepoints, including non-ASCII multibyte sequences. Truncation is now
capped at a maximum number of bytes, rather than characters, to prevent
filenames with many multibyte sequences from unexpectedly exceeding
FAT32 limits.

---

### AI Usage

While CrossPoint doesn't restrict the use of AI tools when contributing,
please be transparent about their usage, as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES, I described #1118
to Claude and it suggested sanitizeFilename as the likely cause**_
This commit is contained in:
Zach Nelson
2026-02-24 17:43:58 -06:00
committed by GitHub
parent c0cd7c13a3
commit 8ab2f22730
2 changed files with 31 additions and 21 deletions

View File

@@ -1,35 +1,45 @@
#include "StringUtils.h"
#include <Utf8.h>
#include <cstring>
namespace StringUtils {
std::string sanitizeFilename(const std::string& name, size_t maxLength) {
std::string sanitizeFilename(const std::string& name, size_t maxBytes) {
std::string result;
result.reserve(name.size());
result.reserve(std::min(name.size(), maxBytes));
for (char c : name) {
// Replace invalid filename characters with underscore
if (c == '/' || c == '\\' || c == ':' || c == '*' || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
const auto* text = reinterpret_cast<const unsigned char*>(name.c_str());
// Skip leading spaces and dots so they don't consume the byte budget
while (*text == ' ' || *text == '.') {
text++;
}
// Process full UTF-8 codepoints to avoid trimming in the middle of a multibyte sequence
while (*text != 0) {
const auto* cpStart = text;
uint32_t cp = utf8NextCodepoint(&text);
if (cp == '/' || cp == '\\' || cp == ':' || cp == '*' || cp == '?' || cp == '"' || cp == '<' || cp == '>' ||
cp == '|') {
// Replace illegal and control characters with '_'
if (result.length() + 1 > maxBytes) break;
result += '_';
} else if (c >= 32 && c < 127) {
// Keep printable ASCII characters
result += c;
} else if (cp >= 128 || (cp >= 32 && cp < 127)) {
const size_t cpBytes = text - cpStart;
if (result.length() + cpBytes > maxBytes) break;
result.append(reinterpret_cast<const char*>(cpStart), cpBytes);
}
// Skip non-printable characters
}
// Trim leading/trailing spaces and dots
size_t start = result.find_first_not_of(" .");
if (start == std::string::npos) {
return "book"; // Fallback if name is all invalid characters
}
// Trim trailing spaces and dots
size_t end = result.find_last_not_of(" .");
result = result.substr(start, end - start + 1);
// Limit filename length
if (result.length() > maxLength) {
result.resize(maxLength);
if (end != std::string::npos) {
result.resize(end + 1);
} else {
result.clear();
}
return result.empty() ? "book" : result;

View File

@@ -9,9 +9,9 @@ namespace StringUtils {
/**
* Sanitize a string for use as a filename.
* Replaces invalid characters with underscores, trims spaces/dots,
* and limits length to maxLength characters.
* and limits length to maxBytes bytes.
*/
std::string sanitizeFilename(const std::string& name, size_t maxLength = 100);
std::string sanitizeFilename(const std::string& name, size_t maxBytes = 100);
/**
* Check if the given filename ends with the specified extension (case-insensitive).