Add comments to clarify hyphenation logic and structure in Epub processing

2025-12-18 20:08:31 +05:00
parent c813a2f075
commit 63668708bc
7 changed files with 26 additions and 0 deletions
--- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
@@ -160,6 +160,7 @@ bool isValidEnglishOnsetTrigram(const uint32_t firstCp, const uint32_t secondCp,
  return false;
 }

+// Verifies that the consonant cluster could begin an English syllable.
 bool englishClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  if (start >= end) {
    return false;
@@ -189,6 +190,7 @@ bool englishClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const siz
  return false;
 }

+// Picks the longest legal onset inside the consonant cluster between vowels.
 size_t englishOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
  const size_t clusterLen = clusterEnd - clusterStart;
  if (clusterLen == 0) {
@@ -206,6 +208,7 @@ size_t englishOnsetLength(const std::vector<CodepointInfo>& cps, const size_t cl
  return 1;
 }

+// Reports whether an apostrophe sits on either side of index, so callers can skip that break (e.g., contractions).
 bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (index == 0 || index >= cps.size()) {
    return false;
@@ -215,6 +218,7 @@ bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index)
  return left == '\'' || right == '\'';
 }

+// Returns codepoint indexes (positions in cps) where the word may break according to English syllable rules.
 std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
--- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.h
+++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.h
@@ -2,6 +2,7 @@

 #include "LanguageHyphenator.h"

+// Implements syllable-aware break calculation for Latin-script (English) words.
 class EnglishHyphenator final : public LanguageHyphenator {
 public:
  static const EnglishHyphenator& instance();
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -15,6 +15,7 @@

 namespace {

+// Central registry for language-specific hyphenators supported on device.
 const std::array<const LanguageHyphenator*, 2>& registeredHyphenators() {
  static const std::array<const LanguageHyphenator*, 2> hyphenators = {
      &EnglishHyphenator::instance(),
@@ -23,6 +24,7 @@ const std::array<const LanguageHyphenator*, 2>& registeredHyphenators() {
  return hyphenators;
 }

+// Finds the hyphenator matching the detected script.
 const LanguageHyphenator* hyphenatorForScript(const Script script) {
  for (const auto* hyphenator : registeredHyphenators()) {
    if (hyphenator->script() == script) {
@@ -32,6 +34,7 @@ const LanguageHyphenator* hyphenatorForScript(const Script script) {
  return nullptr;
 }

+// Converts the UTF-8 word into codepoint metadata for downstream rules.
 std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  std::vector<CodepointInfo> cps;
  cps.reserve(word.size());
@@ -47,6 +50,7 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  return cps;
 }

+// True only for a non-empty, purely alphabetic word (no punctuation/digits); any force override is the caller's job.
 bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  if (cps.empty()) {
    return false;
@@ -60,6 +64,7 @@ bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  return true;
 }

+// Asks the language hyphenator for legal break positions inside the word.
 std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return {};
@@ -74,6 +79,7 @@ std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
  return {};
 }

+// Maps a codepoint index to its byte offset in the source word; out-of-range indexes clamp to the last codepoint.
 size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (index >= cps.size()) {
    return cps.empty() ? 0 : cps.back().byteOffset;
@@ -81,6 +87,7 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
  return cps[index].byteOffset;
 }

+// Safely slices a UTF-8 string without splitting multibyte sequences.
 std::string slice(const std::string& word, const size_t startByte, const size_t endByte) {
  if (startByte >= endByte || startByte >= word.size()) {
    return std::string();
@@ -127,6 +134,7 @@ bool Hyphenator::splitWord(const GfxRenderer& renderer, const int fontId, const
  }

  if (chosenIndex == std::numeric_limits<size_t>::max() && force) {
+    // Emergency fallback: brute-force through codepoints to avoid overflow when no legal breaks fit.
    for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
      const size_t byteOffset = byteOffsetForIndex(cps, idx);
      const std::string prefix = word.substr(0, byteOffset);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@@ -6,6 +6,7 @@

 class GfxRenderer;

+// Holds the split portions of a hyphenated word.
 struct HyphenationResult {
  std::string head;
  std::string tail;
@@ -13,6 +14,7 @@ struct HyphenationResult {

 class Hyphenator {
 public:
+  // Splits a word so it fits within availableWidth, appending a hyphen to the head when needed.
  static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style,
                        int availableWidth, HyphenationResult* result, bool force);
 };
--- a/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp
@@ -77,6 +77,7 @@ int russianSonority(uint32_t cp) {
  }
 }

+// Applies Russian sonority sequencing to ensure the consonant cluster can start a syllable.
 bool russianClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  if (start >= end) {
    return false;
@@ -111,6 +112,7 @@ bool russianClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const siz
  return true;
 }

+// Chooses the longest valid onset contained within the inter-vowel cluster.
 size_t russianOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
  const size_t clusterLen = clusterEnd - clusterStart;
  if (clusterLen == 0) {
@@ -128,6 +130,7 @@ size_t russianOnsetLength(const std::vector<CodepointInfo>& cps, const size_t cl
  return 1;
 }

+// Reports whether ь/ъ sits on either side of index, so callers can skip hyphenation splits beside those signs.
 bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (index == 0 || index >= cps.size()) {
    return false;
@@ -137,6 +140,7 @@ bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
  return isSoftOrHardSign(left) || isSoftOrHardSign(right);
 }

+// Produces syllable break indexes tailored to Russian phonotactics.
 std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
--- a/lib/Epub/Epub/hyphenation/RussianHyphenator.h
+++ b/lib/Epub/Epub/hyphenation/RussianHyphenator.h
@@ -2,6 +2,7 @@

 #include "LanguageHyphenator.h"

+// Handles Cyrillic-specific hyphenation heuristics (Russian syllable rules).
 class RussianHyphenator final : public LanguageHyphenator {
 public:
  static const RussianHyphenator& instance();