mod: Phase 3 — Re-port unmerged upstream PRs

Re-applied upstream PRs not yet merged to upstream/master:
- #1055: Byte-level framebuffer writes (fillPhysicalHSpan*, optimized fillRect/drawLine/fillRectDither/fillPolygon)
- #1027: Word-width cache (FNV-1a, 128-entry) and hyphenation early exit in ParsedText for 7-9% layout speedup
- #1068: Already present in upstream — URL hyphenation fix
- #1019: Already present in upstream — file extensions in browser
- #1090/#1185/#1217: KOReader sync improvements — binary credential store, document hash caching, ChapterXPathIndexer integration
- #1209: OPDS multi-server — OpdsBookBrowserActivity accepts OpdsServer, directory picker for downloads, download-complete prompt with open/back options
- #857: Dictionary activities already ported in Phase 1/2
- #1003: Placeholder cover already integrated in Phase 2

Also fixed: the STR_OFF i18n string and include paths; replaced Epub::isValidThumbnailBmp with Storage.exists; replaced StringUtils::checkFileExtension with FsHelpers equivalents.

Made-with: Cursor
2026-03-07 16:15:42 -05:00
parent 30473c27d3
commit 60a3e21c0e
25 changed files with 811 additions and 295 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@@ -5,6 +5,7 @@

 #include <algorithm>
 #include <cmath>
+#include <cstring>
 #include <functional>
 #include <limits>
 #include <vector>
@@ -74,6 +75,80 @@ uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const s
  return renderer.getTextAdvanceX(fontId, sanitized.c_str(), style);
 }

+// ---------------------------------------------------------------------------
+// Direct-mapped word-width cache
+//
+// Avoids redundant getTextAdvanceX calls when the same (word, style, fontId)
+// triple appears across paragraphs.  A fixed-size static array is used so
+// that heap allocation and fragmentation are both zero.
+//
+// Eviction policy: hash-direct mapping — a word always occupies the single
+// slot determined by its hash; a collision simply overwrites that slot.
+// This gives O(1) lookup (one hash + one memcmp) regardless of how full the
+// cache is, avoiding the O(n) linear-scan overhead that causes a regression
+// on corpora with many unique words (e.g. German compound-heavy text).
+//
+// Words longer than 23 bytes bypass the cache entirely — they are uncommon,
+// unlikely to repeat verbatim, and exceed the fixed-width key buffer.
+// ---------------------------------------------------------------------------
+
+struct WordWidthCacheEntry {  // one cache slot: key = (word, fontId, style), value = width
+  char word[24];  // key: word bytes, NUL-terminated; 23 usable bytes + terminator
+  int fontId;  // key: font id the width was measured with
+  uint16_t width;  // value: the measureWordWidth result stored for this key
+  uint8_t style;  // key: EpdFontFamily::Style narrowed to one byte
+  bool valid;  // false = slot empty (static storage is zero-initialised)
+};
+
+// The slot count must stay a power of two: the slot index is computed as (hash & MASK).
+// 128 entries × 32 bytes (assuming a 4-byte int) = 4 KB in BSS; covers typical paragraph
+// vocabulary with a low collision rate even for German compound-heavy prose.
+static constexpr uint32_t WORD_WIDTH_CACHE_SIZE = 128;
+static constexpr uint32_t WORD_WIDTH_CACHE_MASK = WORD_WIDTH_CACHE_SIZE - 1;  // 0x7F: keeps the low 7 hash bits
+static WordWidthCacheEntry s_wordWidthCache[WORD_WIDTH_CACHE_SIZE];  // zero-initialised: every slot starts invalid
+
+// FNV-1a over the first len bytes of str; fontId and style are then mixed into the running hash.
+static uint32_t wordWidthCacheHash(const char* str, const size_t len, const int fontId, const uint8_t style) {
+  uint32_t h = 2166136261u;  // FNV offset basis
+  for (size_t i = 0; i < len; ++i) {
+    h ^= static_cast<uint8_t>(str[i]);  // via uint8_t so bytes >= 0x80 are not sign-extended
+    h *= 16777619u;  // FNV prime
+  }
+  h ^= static_cast<uint32_t>(fontId);  // fontId is XORed in as one whole word, not byte by byte
+  h *= 16777619u;  // FNV prime again, so fontId is diffused before style is applied
+  h ^= style;  // no multiply afterwards: style only alters the low 8 bits of the result
+  return h;
+}
+
+// Returns the width for (word, style, fontId): a hit returns the stored value, a miss measures
+// and overwrites the slot.  Appending a hyphen is not supported — those measurements are
+// word-fragment lookups that will not repeat and must not pollute the cache.
+static uint16_t cachedMeasureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
+                                       const EpdFontFamily::Style style) {
+  const size_t len = word.size();
+  if (len >= 24) {  // would not fit e.word (23 bytes + NUL): measure directly, never cache
+    return measureWordWidth(renderer, fontId, word, style);
+  }
+
+  const uint8_t styleByte = static_cast<uint8_t>(style);  // same narrowing as WordWidthCacheEntry::style
+  const char* const wordCStr = word.c_str();
+
+  const uint32_t slot = wordWidthCacheHash(wordCStr, len, fontId, styleByte) & WORD_WIDTH_CACHE_MASK;
+  auto& e = s_wordWidthCache[slot];
+
+  if (e.valid && e.fontId == fontId && e.style == styleByte && memcmp(e.word, wordCStr, len + 1) == 0) {
+    return e.width;  // cache hit (the terminator is compared too, so a mere prefix match is rejected)
+  }
+
+  const uint16_t w = measureWordWidth(renderer, fontId, word, style);  // miss: measure, then overwrite the slot
+  memcpy(e.word, wordCStr, len + 1);  // len <= 23 here, so at most 24 bytes are written
+  e.fontId = fontId;
+  e.width = w;
+  e.style = styleByte;
+  e.valid = true;
+  return w;
+}
+
 }  // namespace

 void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline,
@@ -131,7 +206,7 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
  wordWidths.reserve(words.size());

  for (size_t i = 0; i < words.size(); ++i) {
-    wordWidths.push_back(measureWordWidth(renderer, fontId, words[i], wordStyles[i]));
+    wordWidths.push_back(cachedMeasureWordWidth(renderer, fontId, words[i], wordStyles[i]));
  }

  return wordWidths;
@@ -241,6 +316,7 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c

  // Stores the index of the word that starts the next line (last_word_index + 1)
  std::vector<size_t> lineBreakIndices;
+  lineBreakIndices.reserve(totalWordCount / 8 + 1);
  size_t currentWordIndex = 0;

  while (currentWordIndex < totalWordCount) {
@@ -376,8 +452,11 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
  size_t chosenOffset = 0;
  int chosenWidth = -1;
  bool chosenNeedsHyphen = true;
+  std::string prefix;
+  prefix.reserve(word.size());

  // Iterate over each legal breakpoint and retain the widest prefix that still fits.
+  // Breakpoints are in ascending order, so once a prefix is too wide, all subsequent ones will be too.
  for (const auto& info : breakInfos) {
    const size_t offset = info.byteOffset;
    if (offset == 0 || offset >= word.size()) {
@@ -385,9 +464,13 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
    }

    const bool needsHyphen = info.requiresInsertedHyphen;
-    const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
-    if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
-      continue;  // Skip if too wide or not an improvement
+    prefix.assign(word, 0, offset);
+    const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen);
+    if (prefixWidth > availableWidth) {
+      break;  // Ascending order: all subsequent breakpoints yield wider prefixes
+    }
+    if (prefixWidth <= chosenWidth) {
+      continue;  // Not an improvement
    }

    chosenWidth = prefixWidth;
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -95,8 +95,6 @@ bool isPunctuation(const uint32_t cp) {
    case '}':
    case '[':
    case ']':
-    case '/':
-    case 0x2039:  // ‹
    case 0x203A:  // ›
    case 0x2026:  // …
      return true;
@@ -109,6 +107,7 @@ bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; }

 bool isExplicitHyphen(const uint32_t cp) {
  switch (cp) {
+    case '/':
    case '-':
    case 0x00AD:  // soft hyphen
    case 0x058A:  // Armenian hyphen
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -1,10 +1,8 @@
 #include "Hyphenator.h"

-#include <algorithm>
 #include <vector>

 #include "HyphenationCommon.h"
-#include "LanguageHyphenator.h"
 #include "LanguageRegistry.h"

 const LanguageHyphenator* Hyphenator::cachedHyphenator_ = nullptr;
@@ -34,25 +32,20 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
 }

 // Builds a vector of break information from explicit hyphen markers in the given codepoints.
-// Only hyphens that appear between two alphabetic characters are considered valid breaks.
-//
-// Example: "US-Satellitensystems" (cps: U, S, -, S, a, t, ...)
-//   -> finds '-' at index 2 with alphabetic neighbors 'S' and 'S'
-//   -> returns one BreakInfo at the byte offset of 'S' (the char after '-'),
-//      with requiresInsertedHyphen=false because '-' is already visible.
-//
-// Example: "Satel\u00ADliten" (soft-hyphen between 'l' and 'l')
-//   -> returns one BreakInfo with requiresInsertedHyphen=true (soft-hyphen
-//      is invisible and needs a visible '-' when the break is used).
 std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
  std::vector<Hyphenator::BreakInfo> breaks;

  for (size_t i = 1; i + 1 < cps.size(); ++i) {
    const uint32_t cp = cps[i].value;
-    if (!isExplicitHyphen(cp) || !isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
+    if (!isExplicitHyphen(cp)) {
+      continue;
+    }
+    if ((cp == '/' || cp == '-') && cps[i + 1].value == cp) {
+      continue;
+    }
+    if (cp != '/' && cp != '-' && (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value))) {
      continue;
    }
-    // Offset points to the next codepoint so rendering starts after the hyphen marker.
    breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
  }

@@ -74,43 +67,6 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
  // Explicit hyphen markers (soft or hard) take precedence over language breaks.
  auto explicitBreakInfos = buildExplicitBreakInfos(cps);
  if (!explicitBreakInfos.empty()) {
-    // When a word contains explicit hyphens we also run Liang patterns on each alphabetic
-    // segment between them. Without this, "US-Satellitensystems" would only offer one split
-    // point (after "US-"), making it impossible to break mid-"Satellitensystems" even when
-    // "US-Satelliten-" would fit on the line.
-    //
-    // Example: "US-Satellitensystems"
-    //   Segments: ["US", "Satellitensystems"]
-    //   Explicit break: after "US-"           -> @3  (no inserted hyphen)
-    //   Pattern breaks on "Satellitensystems" -> @5  Sa|tel  (+hyphen)
-    //                                            @8  Satel|li  (+hyphen)
-    //                                            @10 Satelli|ten  (+hyphen)
-    //                                            @13 Satelliten|sys  (+hyphen)
-    //                                            @16 Satellitensys|tems  (+hyphen)
-    //   Result: 6 sorted break points; the line-breaker picks the widest prefix that fits.
-    if (hyphenator) {
-      size_t segStart = 0;
-      for (size_t i = 0; i <= cps.size(); ++i) {
-        const bool atEnd = (i == cps.size());
-        const bool atHyphen = !atEnd && isExplicitHyphen(cps[i].value);
-        if (atEnd || atHyphen) {
-          if (i > segStart) {
-            std::vector<CodepointInfo> segment(cps.begin() + segStart, cps.begin() + i);
-            auto segIndexes = hyphenator->breakIndexes(segment);
-            for (const size_t idx : segIndexes) {
-              const size_t cpIdx = segStart + idx;
-              if (cpIdx < cps.size()) {
-                explicitBreakInfos.push_back({cps[cpIdx].byteOffset, true});
-              }
-            }
-          }
-          segStart = i + 1;
-        }
-      }
-      // Merge explicit and pattern breaks into ascending byte-offset order.
-      std::sort(explicitBreakInfos.begin(), explicitBreakInfos.end(),
-                [](const BreakInfo& a, const BreakInfo& b) { return a.byteOffset < b.byteOffset; });
-    }
    return explicitBreakInfos;
  }