fix: Handle non-ASCII characters in sanitizeFilename (#1132)

## Summary **What is the goal of this PR?** Probable fix for #1118. `sanitizeFilename` previously passed through only printable ASCII characters from filenames, so non-ASCII characters were silently dropped. It now preserves valid UTF-8 codepoints, including non-ASCII multibyte sequences. Truncation is now limited by a maximum number of bytes rather than characters, and never splits a multibyte sequence, to prevent filenames with many multibyte sequences from unexpectedly exceeding FAT32 limits. --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**YES, I described #1118 to Claude and it suggested sanitizeFilename as the likely cause**_
2026-02-24 17:43:58 -06:00
parent c0cd7c13a3
commit 8ab2f22730
2 changed files with 31 additions and 21 deletions
--- a/src/util/StringUtils.cpp
+++ b/src/util/StringUtils.cpp
@@ -1,35 +1,45 @@
 #include "StringUtils.h"

+#include <Utf8.h>
+
 #include <cstring>

 namespace StringUtils {

-std::string sanitizeFilename(const std::string& name, size_t maxLength) {
+std::string sanitizeFilename(const std::string& name, size_t maxBytes) {
  std::string result;
-  result.reserve(name.size());
+  result.reserve(std::min(name.size(), maxBytes));

-  for (char c : name) {
-    // Replace invalid filename characters with underscore
-    if (c == '/' || c == '\\' || c == ':' || c == '*' || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
+  const auto* text = reinterpret_cast<const unsigned char*>(name.c_str());
+
+  // Skip leading spaces and dots so they don't consume the byte budget
+  while (*text == ' ' || *text == '.') {
+    text++;
+  }
+
+  // Process full UTF-8 codepoints to avoid trimming in the middle of a multibyte sequence
+  while (*text != 0) {
+    const auto* cpStart = text;
+    uint32_t cp = utf8NextCodepoint(&text);
+
+    if (cp == '/' || cp == '\\' || cp == ':' || cp == '*' || cp == '?' || cp == '"' || cp == '<' || cp == '>' ||
+        cp == '|') {
+      // Replace characters that are illegal in filenames with '_'
+      if (result.length() + 1 > maxBytes) break;
      result += '_';
-    } else if (c >= 32 && c < 127) {
-      // Keep printable ASCII characters
-      result += c;
+    } else if (cp >= 128 || (cp >= 32 && cp < 127)) {
+      const size_t cpBytes = text - cpStart;
+      if (result.length() + cpBytes > maxBytes) break;
+      result.append(reinterpret_cast<const char*>(cpStart), cpBytes);
    }
-    // Skip non-printable characters
  }

-  // Trim leading/trailing spaces and dots
-  size_t start = result.find_first_not_of(" .");
-  if (start == std::string::npos) {
-    return "book";  // Fallback if name is all invalid characters
-  }
+  // Trim trailing spaces and dots
  size_t end = result.find_last_not_of(" .");
-  result = result.substr(start, end - start + 1);
-
-  // Limit filename length
-  if (result.length() > maxLength) {
-    result.resize(maxLength);
+  if (end != std::string::npos) {
+    result.resize(end + 1);
+  } else {
+    result.clear();
  }

  return result.empty() ? "book" : result;
--- a/src/util/StringUtils.h
+++ b/src/util/StringUtils.h
@@ -9,9 +9,9 @@ namespace StringUtils {
 /**
 * Sanitize a string for use as a filename.
 * Replaces invalid characters with underscores, trims spaces/dots,
- * and limits length to maxLength characters.
+ * and limits length to maxBytes bytes.
 */
-std::string sanitizeFilename(const std::string& name, size_t maxLength = 100);
+std::string sanitizeFilename(const std::string& name, size_t maxBytes = 100);

 /**
 * Check if the given filename ends with the specified extension (case-insensitive).