#include "Hyphenator.h"

// NOTE(review): the four angle-bracket include targets were missing from this
// file's text. The standard headers below are the ones this file demonstrably
// uses. utf8NextCodepoint() is declared elsewhere; re-add its header here if
// none of the project headers below provides it.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

#include "EnglishHyphenator.h"
#include "HyphenationCommon.h"
#include "LanguageHyphenator.h"
#include "RussianHyphenator.h"

namespace {

// NOTE(review): template arguments were missing from this file's text and have
// been restored from usage. The element type must expose `value` and
// `byteOffset`; it is assumed to be CodepointInfo from HyphenationCommon.h.
// Confirm the name against that header.

// Central registry for language-specific hyphenators supported on device.
// Add a new language by appending its singleton here and bumping the size.
const std::array<const LanguageHyphenator*, 2>& registeredHyphenators() {
  static const std::array<const LanguageHyphenator*, 2> hyphenators = {
      &EnglishHyphenator::instance(),
      &RussianHyphenator::instance(),
  };
  return hyphenators;
}

// Returns the first registered hyphenator whose script() equals `script`,
// or nullptr when no registered hyphenator handles it.
const LanguageHyphenator* hyphenatorForScript(const Script script) {
  for (const auto* hyphenator : registeredHyphenators()) {
    if (hyphenator->script() == script) {
      return hyphenator;
    }
  }
  return nullptr;
}

// Decodes the UTF-8 word into a list of {codepoint, byte offset of its first
// byte} entries for downstream rules. Decoding stops at the first NUL byte.
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  std::vector<CodepointInfo> cps;
  // Upper bound: a word never has more codepoints than bytes.
  cps.reserve(word.size());

  const unsigned char* const base = reinterpret_cast<const unsigned char*>(word.c_str());
  const unsigned char* const end = base + word.size();
  const unsigned char* ptr = base;

  // Bound the walk by the string length as well as the terminator, so the loop
  // cannot keep reading outside the string even if utf8NextCodepoint() steps
  // past the terminating NUL (e.g. on a truncated trailing sequence).
  while (ptr < end && *ptr != 0) {
    const unsigned char* const current = ptr;
    const uint32_t cp = utf8NextCodepoint(&ptr);
    cps.push_back({cp, static_cast<size_t>(current - base)});
  }
  return cps;
}

// Returns true only when `cps` is non-empty and every codepoint satisfies
// isAlphabetic(); an empty list is reported as not alphabetic.
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  if (cps.empty()) {
    return false;
  }
  return std::all_of(cps.begin(), cps.end(),
                     [](const CodepointInfo& info) { return isAlphabetic(info.value); });
}

// Asks the language hyphenator for legal break positions inside the word.
// Returns an empty list when the word is shorter than
// MIN_PREFIX_CP + MIN_SUFFIX_CP codepoints or when no registered hyphenator
// matches the detected script.
std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return {};
  }

  const LanguageHyphenator* const hyphenator = hyphenatorForScript(detectScript(cps));
  if (hyphenator == nullptr) {
    return {};
  }
  return hyphenator->breakIndexes(cps);
}

// Maps a codepoint index back to its byte offset inside the source word.
// An out-of-range index is clamped to the last codepoint's byte offset
// (or 0 when `cps` is empty) instead of reading past the end.
size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (index >= cps.size()) {
    return cps.empty() ? 0 : cps.back().byteOffset;
  }
  return cps[index].byteOffset;
}

}  // namespace

// Computes the byte offsets inside `word` at which a hyphenation break may be
// placed.
//
// word            - UTF-8 encoded word; trailing punctuation is trimmed from
//                   the decoded codepoints before analysis.
// includeFallback - when true, every codepoint index that leaves at least
//                   MIN_PREFIX_CP codepoints before and MIN_SUFFIX_CP after
//                   the break is added, regardless of language rules.
//
// Returns the break positions as ascending, de-duplicated byte offsets.
// The result is empty when the word is empty, too short after trimming, or
// yields no candidate positions.
std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
  if (word.empty()) {
    return {};
  }

  auto cps = collectCodepoints(word);
  trimTrailingPunctuation(cps);
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return {};
  }

  // Language rules only apply to purely alphabetic words; anything else can
  // still receive fallback positions below.
  std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();

  if (includeFallback) {
    for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
      indexes.push_back(idx);
    }
  }

  if (indexes.empty()) {
    return {};
  }

  // Language and fallback positions may overlap; normalize to a sorted set.
  std::sort(indexes.begin(), indexes.end());
  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());

  std::vector<size_t> byteOffsets;
  byteOffsets.reserve(indexes.size());
  for (const size_t idx : indexes) {
    byteOffsets.push_back(byteOffsetForIndex(cps, idx));
  }
  return byteOffsets;
}