mod: Phase 3 — Re-port unmerged upstream PRs
Re-applied upstream PRs not yet merged to upstream/master:
- #1055: Byte-level framebuffer writes (fillPhysicalHSpan*, optimized fillRect/drawLine/fillRectDither/fillPolygon)
- #1027: Word-width cache (FNV-1a, 128-entry) and hyphenation early exit in ParsedText for 7-9% layout speedup
- #1068: Already present in upstream — URL hyphenation fix
- #1019: Already present in upstream — file extensions in browser
- #1090/#1185/#1217: KOReader sync improvements — binary credential store, document hash caching, ChapterXPathIndexer integration
- #1209: OPDS multi-server — OpdsBookBrowserActivity accepts OpdsServer, directory picker for downloads, download-complete prompt with open/back options
- #857: Dictionary activities already ported in Phase 1/2
- #1003: Placeholder cover already integrated in Phase 2

Also fixed: STR_OFF i18n string, include paths, replaced Epub::isValidThumbnailBmp with Storage.exists, replaced StringUtils::checkFileExtension with FsHelpers equivalents.

Made-with: Cursor
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
@@ -74,6 +75,80 @@ uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const s
|
||||
return renderer.getTextAdvanceX(fontId, sanitized.c_str(), style);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Direct-mapped word-width cache
|
||||
//
|
||||
// Avoids redundant getTextAdvanceX calls when the same (word, style, fontId)
|
||||
// triple appears across paragraphs. A fixed-size static array is used so
|
||||
// that heap allocation and fragmentation are both zero.
|
||||
//
|
||||
// Eviction policy: hash-direct mapping — a word always occupies the single
|
||||
// slot determined by its hash; a collision simply overwrites that slot.
|
||||
// This gives O(1) lookup (one hash + one memcmp) regardless of how full the
|
||||
// cache is, avoiding the O(n) linear-scan overhead that causes a regression
|
||||
// on corpora with many unique words (e.g. German compound-heavy text).
|
||||
//
|
||||
// Words longer than 23 bytes bypass the cache entirely — they are uncommon,
|
||||
// unlikely to repeat verbatim, and exceed the fixed-width key buffer.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// One direct-mapped cache slot: the (word, fontId, style) key plus the
// measured width. Sized to 32 bytes so the 128-entry table stays at 4 KB.
struct WordWidthCacheEntry {
  char word[24];   // NUL-terminated key; 23 usable bytes + terminator (longer words bypass the cache)
  int fontId;      // font the width was measured with
  uint16_t width;  // cached result of measureWordWidth for this key
  uint8_t style;   // EpdFontFamily::Style narrowed to one byte
  bool valid;      // false = slot empty (BSS-initialised to 0)
};
|
||||
|
||||
// Power-of-two size → slot selection via fast bitmask AND.
// 128 entries × 32 bytes = 4 KB in BSS; covers typical paragraph vocabulary
// with a low collision rate even for German compound-heavy prose.
static constexpr uint32_t WORD_WIDTH_CACHE_SIZE = 128;
static constexpr uint32_t WORD_WIDTH_CACHE_MASK = WORD_WIDTH_CACHE_SIZE - 1;
// Zero-initialised static storage: every slot starts out with valid == false.
static WordWidthCacheEntry s_wordWidthCache[WORD_WIDTH_CACHE_SIZE];
|
||||
|
||||
// FNV-1a over the word bytes, with fontId and style folded in afterwards so
// that the same word measured under a different font or style lands in a
// different slot.
static uint32_t wordWidthCacheHash(const char* str, const size_t len, const int fontId, const uint8_t style) {
  constexpr uint32_t kFnvOffsetBasis = 2166136261u;
  constexpr uint32_t kFnvPrime = 16777619u;

  uint32_t hash = kFnvOffsetBasis;
  const auto* byte = reinterpret_cast<const uint8_t*>(str);
  for (const uint8_t* const end = byte + len; byte != end; ++byte) {
    hash = (hash ^ *byte) * kFnvPrime;
  }
  hash = (hash ^ static_cast<uint32_t>(fontId)) * kFnvPrime;
  return hash ^ style;
}
|
||||
|
||||
// Returns the cached width for (word, style, fontId), measuring and caching
|
||||
// on a miss. Appending a hyphen is not supported — those measurements are
|
||||
// word-fragment lookups that will not repeat and must not pollute the cache.
|
||||
static uint16_t cachedMeasureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||||
const EpdFontFamily::Style style) {
|
||||
const size_t len = word.size();
|
||||
if (len >= 24) {
|
||||
return measureWordWidth(renderer, fontId, word, style);
|
||||
}
|
||||
|
||||
const uint8_t styleByte = static_cast<uint8_t>(style);
|
||||
const char* const wordCStr = word.c_str();
|
||||
|
||||
const uint32_t slot = wordWidthCacheHash(wordCStr, len, fontId, styleByte) & WORD_WIDTH_CACHE_MASK;
|
||||
auto& e = s_wordWidthCache[slot];
|
||||
|
||||
if (e.valid && e.fontId == fontId && e.style == styleByte && memcmp(e.word, wordCStr, len + 1) == 0) {
|
||||
return e.width; // O(1) cache hit
|
||||
}
|
||||
|
||||
const uint16_t w = measureWordWidth(renderer, fontId, word, style);
|
||||
memcpy(e.word, wordCStr, len + 1);
|
||||
e.fontId = fontId;
|
||||
e.width = w;
|
||||
e.style = styleByte;
|
||||
e.valid = true;
|
||||
return w;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline,
|
||||
@@ -131,7 +206,7 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
|
||||
wordWidths.reserve(words.size());
|
||||
|
||||
for (size_t i = 0; i < words.size(); ++i) {
|
||||
wordWidths.push_back(measureWordWidth(renderer, fontId, words[i], wordStyles[i]));
|
||||
wordWidths.push_back(cachedMeasureWordWidth(renderer, fontId, words[i], wordStyles[i]));
|
||||
}
|
||||
|
||||
return wordWidths;
|
||||
@@ -241,6 +316,7 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
|
||||
|
||||
// Stores the index of the word that starts the next line (last_word_index + 1)
|
||||
std::vector<size_t> lineBreakIndices;
|
||||
lineBreakIndices.reserve(totalWordCount / 8 + 1);
|
||||
size_t currentWordIndex = 0;
|
||||
|
||||
while (currentWordIndex < totalWordCount) {
|
||||
@@ -376,8 +452,11 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
|
||||
size_t chosenOffset = 0;
|
||||
int chosenWidth = -1;
|
||||
bool chosenNeedsHyphen = true;
|
||||
std::string prefix;
|
||||
prefix.reserve(word.size());
|
||||
|
||||
// Iterate over each legal breakpoint and retain the widest prefix that still fits.
|
||||
// Breakpoints are in ascending order, so once a prefix is too wide, all subsequent ones will be too.
|
||||
for (const auto& info : breakInfos) {
|
||||
const size_t offset = info.byteOffset;
|
||||
if (offset == 0 || offset >= word.size()) {
|
||||
@@ -385,9 +464,13 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
|
||||
}
|
||||
|
||||
const bool needsHyphen = info.requiresInsertedHyphen;
|
||||
const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
|
||||
if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
|
||||
continue; // Skip if too wide or not an improvement
|
||||
prefix.assign(word, 0, offset);
|
||||
const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen);
|
||||
if (prefixWidth > availableWidth) {
|
||||
break; // Ascending order: all subsequent breakpoints yield wider prefixes
|
||||
}
|
||||
if (prefixWidth <= chosenWidth) {
|
||||
continue; // Not an improvement
|
||||
}
|
||||
|
||||
chosenWidth = prefixWidth;
|
||||
|
||||
@@ -95,8 +95,6 @@ bool isPunctuation(const uint32_t cp) {
|
||||
case '}':
|
||||
case '[':
|
||||
case ']':
|
||||
case '/':
|
||||
case 0x2039: // ‹
|
||||
case 0x203A: // ›
|
||||
case 0x2026: // …
|
||||
return true;
|
||||
@@ -109,6 +107,7 @@ bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; }
|
||||
|
||||
bool isExplicitHyphen(const uint32_t cp) {
|
||||
switch (cp) {
|
||||
case '/':
|
||||
case '-':
|
||||
case 0x00AD: // soft hyphen
|
||||
case 0x058A: // Armenian hyphen
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
#include "Hyphenator.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#include "HyphenationCommon.h"
|
||||
#include "LanguageHyphenator.h"
|
||||
#include "LanguageRegistry.h"
|
||||
|
||||
const LanguageHyphenator* Hyphenator::cachedHyphenator_ = nullptr;
|
||||
@@ -34,25 +32,20 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
|
||||
}
|
||||
|
||||
// Builds a vector of break information from explicit hyphen markers in the given codepoints.
|
||||
// Only hyphens that appear between two alphabetic characters are considered valid breaks.
|
||||
//
|
||||
// Example: "US-Satellitensystems" (cps: U, S, -, S, a, t, ...)
|
||||
// -> finds '-' at index 2 with alphabetic neighbors 'S' and 'S'
|
||||
// -> returns one BreakInfo at the byte offset of 'S' (the char after '-'),
|
||||
// with requiresInsertedHyphen=false because '-' is already visible.
|
||||
//
|
||||
// Example: "Satel\u00ADliten" (soft-hyphen between 'l' and 'l')
|
||||
// -> returns one BreakInfo with requiresInsertedHyphen=true (soft-hyphen
|
||||
// is invisible and needs a visible '-' when the break is used).
|
||||
std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
|
||||
std::vector<Hyphenator::BreakInfo> breaks;
|
||||
|
||||
for (size_t i = 1; i + 1 < cps.size(); ++i) {
|
||||
const uint32_t cp = cps[i].value;
|
||||
if (!isExplicitHyphen(cp) || !isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
|
||||
if (!isExplicitHyphen(cp)) {
|
||||
continue;
|
||||
}
|
||||
if ((cp == '/' || cp == '-') && cps[i + 1].value == cp) {
|
||||
continue;
|
||||
}
|
||||
if (cp != '/' && cp != '-' && (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value))) {
|
||||
continue;
|
||||
}
|
||||
// Offset points to the next codepoint so rendering starts after the hyphen marker.
|
||||
breaks.push_back({cps[i + 1].byteOffset, isSoftHyphen(cp)});
|
||||
}
|
||||
|
||||
@@ -74,43 +67,6 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
||||
// Explicit hyphen markers (soft or hard) take precedence over language breaks.
|
||||
auto explicitBreakInfos = buildExplicitBreakInfos(cps);
|
||||
if (!explicitBreakInfos.empty()) {
|
||||
// When a word contains explicit hyphens we also run Liang patterns on each alphabetic
|
||||
// segment between them. Without this, "US-Satellitensystems" would only offer one split
|
||||
// point (after "US-"), making it impossible to break mid-"Satellitensystems" even when
|
||||
// "US-Satelliten-" would fit on the line.
|
||||
//
|
||||
// Example: "US-Satellitensystems"
|
||||
// Segments: ["US", "Satellitensystems"]
|
||||
// Explicit break: after "US-" -> @3 (no inserted hyphen)
|
||||
// Pattern breaks on "Satellitensystems" -> @5 Sa|tel (+hyphen)
|
||||
// @8 Satel|li (+hyphen)
|
||||
// @10 Satelli|ten (+hyphen)
|
||||
// @13 Satelliten|sys (+hyphen)
|
||||
// @16 Satellitensys|tems (+hyphen)
|
||||
// Result: 6 sorted break points; the line-breaker picks the widest prefix that fits.
|
||||
if (hyphenator) {
|
||||
size_t segStart = 0;
|
||||
for (size_t i = 0; i <= cps.size(); ++i) {
|
||||
const bool atEnd = (i == cps.size());
|
||||
const bool atHyphen = !atEnd && isExplicitHyphen(cps[i].value);
|
||||
if (atEnd || atHyphen) {
|
||||
if (i > segStart) {
|
||||
std::vector<CodepointInfo> segment(cps.begin() + segStart, cps.begin() + i);
|
||||
auto segIndexes = hyphenator->breakIndexes(segment);
|
||||
for (const size_t idx : segIndexes) {
|
||||
const size_t cpIdx = segStart + idx;
|
||||
if (cpIdx < cps.size()) {
|
||||
explicitBreakInfos.push_back({cps[cpIdx].byteOffset, true});
|
||||
}
|
||||
}
|
||||
}
|
||||
segStart = i + 1;
|
||||
}
|
||||
}
|
||||
// Merge explicit and pattern breaks into ascending byte-offset order.
|
||||
std::sort(explicitBreakInfos.begin(), explicitBreakInfos.end(),
|
||||
[](const BreakInfo& a, const BreakInfo& b) { return a.byteOffset < b.byteOffset; });
|
||||
}
|
||||
return explicitBreakInfos;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user