Add language support to Epub metadata and hyphenation logic

2026-01-08 02:32:52 +05:00
parent 4f94cf2c36
commit 61d0e1cadf
11 changed files with 86 additions and 25 deletions
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -143,22 +143,3 @@ bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  return true;
 }

-Script detectScript(const std::vector<CodepointInfo>& cps) {
-  bool hasLatin = false;
-  bool hasCyrillic = false;
-  for (const auto& info : cps) {
-    if (isLatinLetter(info.value)) {
-      hasLatin = true;
-    } else if (isCyrillicLetter(info.value)) {
-      hasCyrillic = true;
-    }
-  }
-
-  if (hasLatin && !hasCyrillic) {
-    return Script::Latin;
-  }
-  if (!hasLatin && hasCyrillic) {
-    return Script::Cyrillic;
-  }
-  return Script::Mixed;
-}
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@@ -35,4 +35,3 @@ bool isSoftHyphen(uint32_t cp);
 void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
 bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps);

-Script detectScript(const std::vector<CodepointInfo>& cps);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -32,6 +32,37 @@ const LanguageHyphenator* hyphenatorForScript(const Script script) {
  return nullptr;
 }

+// Maps a BCP-47-style language tag to a hyphenator; returns nullptr for empty or unsupported tags.
+const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
+  if (langTag.empty()) return nullptr;

+  // Take the characters before the first '-' or '_' as the primary subtag, ASCII-lowercased ("en-US" -> "en").
+  std::string primary;
+  primary.reserve(langTag.size());  // Upper bound: the primary subtag is never longer than the whole tag.
+  for (char c : langTag) {
+    if (c == '-' || c == '_') break;  // Subtag separator: the primary subtag ends here.
+    if (c >= 'A' && c <= 'Z') c = static_cast<char>(c - 'A' + 'a');  // ASCII-only case folding.
+    primary.push_back(c);
+  }
+  if (primary.empty()) return nullptr;  // The tag began with a separator, e.g. "-US".

+  if (primary == "en") return &EnglishHyphenator::instance();
+  if (primary == "ru") return &RussianHyphenator::instance();
+  return nullptr;  // Only "en" and "ru" are mapped here.
+}
+
+// Holds the language tag last passed to Hyphenator::setPreferredLanguage(); empty until one is set.
+std::string& preferredLanguage() {
+  static std::string lang;  // NOTE(review): no visible reader; confirm what an empty hint should mean.
+  return lang;  // Returned by mutable reference so the setter can assign through it.
+}
+
+// Returns a mutable reference to the hyphenator pointer resolved by Hyphenator::setPreferredLanguage().
+const LanguageHyphenator*& cachedHyphenator() {
+  static const LanguageHyphenator* hyphenator = nullptr;  // nullptr until a supported language tag is set.
+  return hyphenator;  // Callers must null-check: nullptr means no language-specific hyphenator is selected.
+}
+
 // Converts the UTF-8 word into codepoint metadata for downstream rules.
 std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  std::vector<CodepointInfo> cps;
@@ -78,8 +109,8 @@ std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
    return {};
  }

-  const Script script = detectScript(cps);
-  if (const auto* hyphenator = hyphenatorForScript(script)) {
+  // Use cached hyphenator to avoid repeated language lookups.
+  if (const auto* hyphenator = cachedHyphenator()) {
    auto indexes = hyphenator->breakIndexes(cps);
    return indexes;
  }
@@ -95,6 +126,7 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
  return cps[index].byteOffset;
 }

+// Builds a vector of break information from explicit hyphen markers in the given codepoints.
 std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
  std::vector<Hyphenator::BreakInfo> breaks;
  breaks.reserve(cps.size());
@@ -182,3 +214,8 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w

  return breaks;
 }
+
+void Hyphenator::setPreferredLanguage(const std::string& lang) {  // Stores the hint and resolves its hyphenator.
+  preferredLanguage() = lang;  // Keep the raw tag exactly as supplied.
+  cachedHyphenator() = hyphenatorForLanguage(lang);  // Resolved once here; nullptr for empty or unsupported tags.
+}
--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@@ -13,4 +13,7 @@ class Hyphenator {
  // Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the
  // minimum prefix/suffix constraints are returned even if no language-specific rule matches.
  static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);
+
+  // Sets a publication-level language hint (e.g. "en", "en-US", "ru"); only the primary subtag selects the rules.
+  static void setPreferredLanguage(const std::string& lang);
 };