Add comments to clarify hyphenation logic and structure in Epub processing
This commit is contained in:
parent
c813a2f075
commit
63668708bc
@ -33,6 +33,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
|||||||
}
|
}
|
||||||
|
|
||||||
const int spaceWidth = renderer.getSpaceWidth(fontId);
|
const int spaceWidth = renderer.getSpaceWidth(fontId);
|
||||||
|
// Maintain classic prose indenting when extra paragraph spacing is disabled.
|
||||||
const bool allowIndent = !extraParagraphSpacing && (style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN);
|
const bool allowIndent = !extraParagraphSpacing && (style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN);
|
||||||
const int indentWidth = allowIndent ? renderer.getTextWidth(fontId, "m", REGULAR) : 0;
|
const int indentWidth = allowIndent ? renderer.getTextWidth(fontId, "m", REGULAR) : 0;
|
||||||
const int firstLinePageWidth = allowIndent ? std::max(pageWidth - indentWidth, 0) : pageWidth;
|
const int firstLinePageWidth = allowIndent ? std::max(pageWidth - indentWidth, 0) : pageWidth;
|
||||||
@ -52,6 +53,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
|||||||
size_t producedLines = 0;
|
size_t producedLines = 0;
|
||||||
constexpr size_t MAX_LINES = 1000;
|
constexpr size_t MAX_LINES = 1000;
|
||||||
|
|
||||||
|
// commitLine moves buffered words/styles into a TextBlock and delivers it upstream.
|
||||||
auto commitLine = [&](const bool isLastLine) {
|
auto commitLine = [&](const bool isLastLine) {
|
||||||
if (lineWordCount == 0) {
|
if (lineWordCount == 0) {
|
||||||
return;
|
return;
|
||||||
@ -75,6 +77,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
|||||||
int spacing = spaceWidth;
|
int spacing = spaceWidth;
|
||||||
int spacingRemainder = 0;
|
int spacingRemainder = 0;
|
||||||
if (style == TextBlock::JUSTIFIED && !isLastLine && gaps > 0) {
|
if (style == TextBlock::JUSTIFIED && !isLastLine && gaps > 0) {
|
||||||
|
// Spread the remaining width evenly across the gaps for justification.
|
||||||
const int additional = std::max(0, spaceBudget - baseSpaceTotal);
|
const int additional = std::max(0, spaceBudget - baseSpaceTotal);
|
||||||
spacing = spaceWidth + (gaps > 0 ? additional / gaps : 0);
|
spacing = spaceWidth + (gaps > 0 ? additional / gaps : 0);
|
||||||
spacingRemainder = (gaps > 0) ? additional % gaps : 0;
|
spacingRemainder = (gaps > 0) ? additional % gaps : 0;
|
||||||
@ -94,6 +97,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
|||||||
xpos = indentWidth;
|
xpos = indentWidth;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cache the x positions for each word so TextBlock can render without recomputing layout.
|
||||||
std::list<uint16_t> lineXPos;
|
std::list<uint16_t> lineXPos;
|
||||||
for (size_t idx = 0; idx < lineWordWidths.size(); ++idx) {
|
for (size_t idx = 0; idx < lineWordWidths.size(); ++idx) {
|
||||||
lineXPos.push_back(xpos);
|
lineXPos.push_back(xpos);
|
||||||
@ -148,6 +152,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (lineWordCount > 0 && availableWidth > 0) {
|
if (lineWordCount > 0 && availableWidth > 0) {
|
||||||
|
// Try hyphenating the next word so the current line stays compact.
|
||||||
HyphenationResult split;
|
HyphenationResult split;
|
||||||
if (Hyphenator::splitWord(renderer, fontId, *wordIt, *styleIt, availableWidth, &split, false)) {
|
if (Hyphenator::splitWord(renderer, fontId, *wordIt, *styleIt, availableWidth, &split, false)) {
|
||||||
*wordIt = std::move(split.head);
|
*wordIt = std::move(split.head);
|
||||||
@ -161,6 +166,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
|||||||
|
|
||||||
if (lineWordCount == 0) {
|
if (lineWordCount == 0) {
|
||||||
HyphenationResult split;
|
HyphenationResult split;
|
||||||
|
// Single overlong words get force-split so they can be displayed within the margins.
|
||||||
if (Hyphenator::splitWord(renderer, fontId, *wordIt, *styleIt, currentLinePageWidth, &split, true)) {
|
if (Hyphenator::splitWord(renderer, fontId, *wordIt, *styleIt, currentLinePageWidth, &split, true)) {
|
||||||
*wordIt = std::move(split.head);
|
*wordIt = std::move(split.head);
|
||||||
auto nextWordIt = std::next(wordIt);
|
auto nextWordIt = std::next(wordIt);
|
||||||
|
|||||||
@ -160,6 +160,7 @@ bool isValidEnglishOnsetTrigram(const uint32_t firstCp, const uint32_t secondCp,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Verifies that the consonant cluster could begin an English syllable.
|
||||||
bool englishClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
|
bool englishClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
|
||||||
if (start >= end) {
|
if (start >= end) {
|
||||||
return false;
|
return false;
|
||||||
@ -189,6 +190,7 @@ bool englishClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const siz
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Picks the longest legal onset inside the consonant cluster between vowels.
|
||||||
size_t englishOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
|
size_t englishOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
|
||||||
const size_t clusterLen = clusterEnd - clusterStart;
|
const size_t clusterLen = clusterEnd - clusterStart;
|
||||||
if (clusterLen == 0) {
|
if (clusterLen == 0) {
|
||||||
@ -206,6 +208,7 @@ size_t englishOnsetLength(const std::vector<CodepointInfo>& cps, const size_t cl
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Avoids creating hyphen positions adjacent to apostrophes (e.g., contractions).
|
||||||
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index) {
|
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||||
if (index == 0 || index >= cps.size()) {
|
if (index == 0 || index >= cps.size()) {
|
||||||
return false;
|
return false;
|
||||||
@ -215,6 +218,7 @@ bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index)
|
|||||||
return left == '\'' || right == '\'';
|
return left == '\'' || right == '\'';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns codepoint indexes (positions in cps, not byte offsets) where the word may break according to English syllable rules.
|
||||||
std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<size_t> indexes;
|
std::vector<size_t> indexes;
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include "LanguageHyphenator.h"
|
#include "LanguageHyphenator.h"
|
||||||
|
|
||||||
|
// Implements syllable-aware break calculation for Latin-script (English) words.
|
||||||
class EnglishHyphenator final : public LanguageHyphenator {
|
class EnglishHyphenator final : public LanguageHyphenator {
|
||||||
public:
|
public:
|
||||||
static const EnglishHyphenator& instance();
|
static const EnglishHyphenator& instance();
|
||||||
|
|||||||
@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
// Central registry for language-specific hyphenators supported on device.
|
||||||
const std::array<const LanguageHyphenator*, 2>& registeredHyphenators() {
|
const std::array<const LanguageHyphenator*, 2>& registeredHyphenators() {
|
||||||
static const std::array<const LanguageHyphenator*, 2> hyphenators = {
|
static const std::array<const LanguageHyphenator*, 2> hyphenators = {
|
||||||
&EnglishHyphenator::instance(),
|
&EnglishHyphenator::instance(),
|
||||||
@ -23,6 +24,7 @@ const std::array<const LanguageHyphenator*, 2>& registeredHyphenators() {
|
|||||||
return hyphenators;
|
return hyphenators;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Finds the hyphenator matching the detected script.
|
||||||
const LanguageHyphenator* hyphenatorForScript(const Script script) {
|
const LanguageHyphenator* hyphenatorForScript(const Script script) {
|
||||||
for (const auto* hyphenator : registeredHyphenators()) {
|
for (const auto* hyphenator : registeredHyphenators()) {
|
||||||
if (hyphenator->script() == script) {
|
if (hyphenator->script() == script) {
|
||||||
@ -32,6 +34,7 @@ const LanguageHyphenator* hyphenatorForScript(const Script script) {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Converts the UTF-8 word into codepoint metadata for downstream rules.
|
||||||
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
||||||
std::vector<CodepointInfo> cps;
|
std::vector<CodepointInfo> cps;
|
||||||
cps.reserve(word.size());
|
cps.reserve(word.size());
|
||||||
@ -47,6 +50,7 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
|||||||
return cps;
|
return cps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rejects words containing punctuation or digits unless forced.
|
||||||
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
||||||
if (cps.empty()) {
|
if (cps.empty()) {
|
||||||
return false;
|
return false;
|
||||||
@ -60,6 +64,7 @@ bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Asks the language hyphenator for legal break positions inside the word.
|
||||||
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||||
return {};
|
return {};
|
||||||
@ -74,6 +79,7 @@ std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Maps a codepoint index back to its byte offset inside the source word.
|
||||||
size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
|
size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||||
if (index >= cps.size()) {
|
if (index >= cps.size()) {
|
||||||
return cps.empty() ? 0 : cps.back().byteOffset;
|
return cps.empty() ? 0 : cps.back().byteOffset;
|
||||||
@ -81,6 +87,7 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
|
|||||||
return cps[index].byteOffset;
|
return cps[index].byteOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Safely slices a UTF-8 string without splitting multibyte sequences.
|
||||||
std::string slice(const std::string& word, const size_t startByte, const size_t endByte) {
|
std::string slice(const std::string& word, const size_t startByte, const size_t endByte) {
|
||||||
if (startByte >= endByte || startByte >= word.size()) {
|
if (startByte >= endByte || startByte >= word.size()) {
|
||||||
return std::string();
|
return std::string();
|
||||||
@ -127,6 +134,7 @@ bool Hyphenator::splitWord(const GfxRenderer& renderer, const int fontId, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (chosenIndex == std::numeric_limits<size_t>::max() && force) {
|
if (chosenIndex == std::numeric_limits<size_t>::max() && force) {
|
||||||
|
// Emergency fallback: when no legal break fits, brute-force through codepoint boundaries so the word does not overflow the line.
|
||||||
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
||||||
const size_t byteOffset = byteOffsetForIndex(cps, idx);
|
const size_t byteOffset = byteOffsetForIndex(cps, idx);
|
||||||
const std::string prefix = word.substr(0, byteOffset);
|
const std::string prefix = word.substr(0, byteOffset);
|
||||||
|
|||||||
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
class GfxRenderer;
|
class GfxRenderer;
|
||||||
|
|
||||||
|
// Holds the split portions of a hyphenated word.
|
||||||
struct HyphenationResult {
|
struct HyphenationResult {
|
||||||
std::string head;
|
std::string head;
|
||||||
std::string tail;
|
std::string tail;
|
||||||
@ -13,6 +14,7 @@ struct HyphenationResult {
|
|||||||
|
|
||||||
class Hyphenator {
|
class Hyphenator {
|
||||||
public:
|
public:
|
||||||
|
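// Splits a word so its head fits within availableWidth, appending a hyphen to the head when needed. Returns true and fills *result (head/tail) on success; with force set, falls back to arbitrary codepoint boundaries when no legal break fits.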
|
||||||
static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style,
|
static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style,
|
||||||
int availableWidth, HyphenationResult* result, bool force);
|
int availableWidth, HyphenationResult* result, bool force);
|
||||||
};
|
};
|
||||||
@ -77,6 +77,7 @@ int russianSonority(uint32_t cp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Applies Russian sonority sequencing to ensure the consonant cluster can start a syllable.
|
||||||
bool russianClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
|
bool russianClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
|
||||||
if (start >= end) {
|
if (start >= end) {
|
||||||
return false;
|
return false;
|
||||||
@ -111,6 +112,7 @@ bool russianClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const siz
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Chooses the longest valid onset contained within the inter-vowel cluster.
|
||||||
size_t russianOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
|
size_t russianOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
|
||||||
const size_t clusterLen = clusterEnd - clusterStart;
|
const size_t clusterLen = clusterEnd - clusterStart;
|
||||||
if (clusterLen == 0) {
|
if (clusterLen == 0) {
|
||||||
@ -128,6 +130,7 @@ size_t russianOnsetLength(const std::vector<CodepointInfo>& cps, const size_t cl
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prevents hyphenation splits immediately beside ь/ъ characters.
|
||||||
bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
|
bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||||
if (index == 0 || index >= cps.size()) {
|
if (index == 0 || index >= cps.size()) {
|
||||||
return false;
|
return false;
|
||||||
@ -137,6 +140,7 @@ bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
|
|||||||
return isSoftOrHardSign(left) || isSoftOrHardSign(right);
|
return isSoftOrHardSign(left) || isSoftOrHardSign(right);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Produces syllable break indexes tailored to Russian phonotactics.
|
||||||
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<size_t> indexes;
|
std::vector<size_t> indexes;
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include "LanguageHyphenator.h"
|
#include "LanguageHyphenator.h"
|
||||||
|
|
||||||
|
// Handles Cyrillic-specific hyphenation heuristics (Russian syllable rules).
|
||||||
class RussianHyphenator final : public LanguageHyphenator {
|
class RussianHyphenator final : public LanguageHyphenator {
|
||||||
public:
|
public:
|
||||||
static const RussianHyphenator& instance();
|
static const RussianHyphenator& instance();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user