crosspoint-reader/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
Arthur Tazhitdinov 247463a4b9 format fix
2025-12-26 04:37:15 +05:00

384 lines
11 KiB
C++

#include "EnglishHyphenator.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <string>
#include <vector>
namespace {
// Maps a code point to the char produced by toLowerLatin(), or to 0 when
// isLatinLetter() rejects it. Callers use the 0 result as a "not a letter" marker.
char lowerLatinChar(const uint32_t cp) {
  return isLatinLetter(cp) ? static_cast<char>(toLowerLatin(cp)) : static_cast<char>(0);
}
// True for the four letters this file groups as approximants: l, r, w and y.
bool isEnglishApproximantChar(const char c) {
  switch (c) {
    case 'l':
    case 'r':
    case 'w':
    case 'y':
      return true;
    default:
      return false;
  }
}
// True for the letters this file groups as stops: p, b, t, d, k, g, c and q.
bool isEnglishStopChar(const char c) {
  static constexpr char kStopLetters[] = {'p', 'b', 't', 'd', 'k', 'g', 'c', 'q'};
  for (const char stop : kStopLetters) {
    if (stop == c) {
      return true;
    }
  }
  return false;
}
// True for the letters this file groups as fricatives: f, v, s, z, h and x.
bool isEnglishFricativeChar(const char c) {
  return c == 'f' || c == 'v' || c == 's' || c == 'z' || c == 'h' || c == 'x';
}
// A character pattern paired with its explicit length. Used for the prefix and
// suffix tables in appendMorphologyBreaks() and compared by matchesPatternAt().
struct LatinLiteral {
const char* text;  // pattern characters; matchesPatternAt() rejects a null pointer
size_t length;  // number of characters of `text` to compare; 0 never matches
};
// Forward declaration: the definition appears later in this file, but
// appendMorphologyBreaks() calls it before that point.
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
std::string lower;
lower.reserve(cps.size());
for (const auto& info : cps) {
lower.push_back(lowerLatinChar(info.value));
}
return lower;
}
// Reports whether `pattern` occurs in `lowerWord` beginning exactly at `start`.
// A null or empty pattern, or one that would run past the end of the word,
// never matches.
bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {
  const bool patternUsable = pattern.text != nullptr && pattern.length != 0;
  if (!patternUsable || start + pattern.length > lowerWord.size()) {
    return false;
  }
  // The range check above guarantees `pattern.length` readable chars from `start`.
  return std::equal(pattern.text, pattern.text + pattern.length, lowerWord.data() + start);
}
// Reports whether any code point in cps[start, end) satisfies isLatinVowel().
// `end` is clamped to cps.size(); an empty or out-of-range window yields false.
bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  const size_t stop = std::min(end, cps.size());
  size_t pos = start;
  while (pos < stop) {
    if (isLatinVowel(cps[pos].value)) {
      return true;
    }
    ++pos;
  }
  return false;
}
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
std::vector<size_t>& indexes) {
static constexpr std::array<LatinLiteral, 20> PREFIXES = {
{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, {"dis", 3}, {"hyper", 5}, {"inter", 5},
{"micro", 5}, {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, {"over", 4}, {"post", 4},
{"pre", 3}, {"pro", 3}, {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
static constexpr std::array<LatinLiteral, 24> SUFFIXES = {
{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4}, {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
const size_t length = cps.size();
if (length < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
return;
}
const auto tryPush = [&](const size_t breakIndex) {
if (breakIndex < MIN_PREFIX_CP || length - breakIndex < MIN_SUFFIX_CP) {
return;
}
if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, length)) {
return;
}
if (nextToApostrophe(cps, breakIndex)) {
return;
}
indexes.push_back(breakIndex);
};
for (const auto& prefix : PREFIXES) {
if (prefix.length == 0 || prefix.length >= length) {
continue;
}
if (!matchesPatternAt(lowerWord, 0, prefix)) {
continue;
}
tryPush(prefix.length);
}
for (const auto& suffix : SUFFIXES) {
if (suffix.length == 0 || suffix.length >= length) {
continue;
}
const size_t breakIndex = length - suffix.length;
if (!matchesPatternAt(lowerWord, breakIndex, suffix)) {
continue;
}
tryPush(breakIndex);
}
}
// Two-letter sequence; matchesDigraph() compares a candidate pair against a list of these.
struct CharPair {
char first;  // leading letter of the pair
char second;  // trailing letter of the pair
};
bool matchesDigraph(const char first, const char second, const std::initializer_list<CharPair>& pairs) {
for (const auto& pair : pairs) {
if (pair.first == first && pair.second == second) {
return true;
}
}
return false;
}
// Reports whether two adjacent letters form one of the vowel pairs this file
// keeps together (no break is placed between them). Both code points must pass
// isLatinLetter(); the comparison is done on their lowercase forms.
bool isEnglishDiphthong(const uint32_t first, const uint32_t second) {
  if (!isLatinLetter(first) || !isLatinLetter(second)) {
    return false;
  }
  const char lead = static_cast<char>(toLowerLatin(first));
  const char follow = static_cast<char>(toLowerLatin(second));

  // True when `follow` is one of the letters in the NUL-terminated `allowed` list.
  const auto followIsOneOf = [follow](const char* allowed) {
    for (; *allowed != '\0'; ++allowed) {
      if (*allowed == follow) {
        return true;
      }
    }
    return false;
  };

  if (lead == 'a') {
    return followIsOneOf("iyu");
  }
  if (lead == 'e' || lead == 'o') {
    return followIsOneOf("aeiouy");
  }
  if (lead == 'i') {
    return followIsOneOf("eua");
  }
  if (lead == 'u') {
    return followIsOneOf("iae");
  }
  return false;
}
// Reports whether two letters are accepted as a two-consonant syllable onset.
// Accepted pairs are: a fixed digraph list, any stop or fricative followed by
// an approximant, 's' followed by one of a fixed set, and a fixed set of
// consonants followed by 'y'. Non-Latin input is rejected.
bool isValidEnglishOnsetBigram(const uint32_t firstCp, const uint32_t secondCp) {
  const char lead = lowerLatinChar(firstCp);
  const char follow = lowerLatinChar(secondCp);
  if (lead == 0 || follow == 0) {
    return false;
  }

  const bool isListedDigraph = matchesDigraph(lead, follow,
                                              {{'c', 'h'},
                                               {'s', 'h'},
                                               {'t', 'h'},
                                               {'p', 'h'},
                                               {'w', 'h'},
                                               {'w', 'r'},
                                               {'k', 'n'},
                                               {'g', 'n'},
                                               {'p', 's'},
                                               {'p', 't'},
                                               {'p', 'n'},
                                               {'r', 'h'}});
  if (isListedDigraph) {
    return true;
  }

  // Obstruent (stop or fricative) + approximant, e.g. "pl", "tr", "fl".
  if (isEnglishApproximantChar(follow) && (isEnglishStopChar(lead) || isEnglishFricativeChar(lead))) {
    return true;
  }

  // 's' + consonant clusters, e.g. "sp", "st", "sm".
  if (lead == 's') {
    switch (follow) {
      case 'p':
      case 't':
      case 'k':
      case 'm':
      case 'n':
      case 'f':
      case 'l':
      case 'w':
      case 'c':
        return true;
      default:
        break;
    }
  }

  // Consonant + 'y'.
  if (follow == 'y') {
    switch (lead) {
      case 'p':
      case 'b':
      case 't':
      case 'd':
      case 'f':
      case 'k':
      case 'g':
      case 'h':
      case 'm':
      case 'n':
      case 'l':
      case 's':
        return true;
      default:
        break;
    }
  }
  return false;
}
// Reports whether three letters are accepted as a three-consonant syllable
// onset: a fixed set of 's'-initial clusters plus "thr". Non-Latin input is rejected.
bool isValidEnglishOnsetTrigram(const uint32_t firstCp, const uint32_t secondCp, const uint32_t thirdCp) {
  const char a = lowerLatinChar(firstCp);
  const char b = lowerLatinChar(secondCp);
  const char c = lowerLatinChar(thirdCp);
  if (a == 0 || b == 0 || c == 0) {
    return false;
  }

  if (a == 's') {
    switch (b) {
      case 'p':  // spl, spr, spw
      case 'k':  // skl, skr, skw
        return c == 'l' || c == 'r' || c == 'w';
      case 't':  // str, stw, sty
        return c == 'r' || c == 'w' || c == 'y';
      case 'c':  // scl, scr
        return c == 'l' || c == 'r';
      case 'f':  // sfr
      case 'h':  // shr
        return c == 'r';
      default:
        return false;
    }
  }
  return a == 't' && b == 'h' && c == 'r';
}
// Checks whether cps[start, end) is accepted as the start of a syllable.
// Every element must be a Latin consonant (or 'y'). One letter is always
// accepted, two and three letters are checked against the bigram/trigram
// rules, and anything longer is rejected.
bool englishClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  if (start >= end) {
    return false;
  }

  for (size_t pos = start; pos != end; ++pos) {
    const auto cp = cps[pos].value;
    const char letter = lowerLatinChar(cp);
    const bool consonantLike = letter != 0 && (isLatinConsonant(cp) || letter == 'y');
    if (!consonantLike) {
      return false;
    }
  }

  switch (end - start) {
    case 1:
      return true;
    case 2:
      return isValidEnglishOnsetBigram(cps[start].value, cps[start + 1].value);
    case 3:
      return isValidEnglishOnsetTrigram(cps[start].value, cps[start + 1].value, cps[start + 2].value);
    default:
      return false;
  }
}
// Chooses how many trailing elements of the cluster cps[clusterStart, clusterEnd)
// go with the following vowel. Tries the last 3, then 2, then 1 elements and
// returns the first length accepted by englishClusterIsValidOnset(). Falls back
// to 1 when nothing is accepted, and returns 0 for an empty cluster.
size_t englishOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
  const size_t clusterLen = clusterEnd - clusterStart;
  if (clusterLen == 0) {
    return 0;
  }

  size_t candidate = std::min<size_t>(3, clusterLen);  // always >= 1 here
  do {
    if (englishClusterIsValidOnset(cps, clusterEnd - candidate, clusterEnd)) {
      return candidate;
    }
  } while (--candidate != 0);

  return 1;
}
// Avoids creating hyphen positions adjacent to apostrophes (e.g., contractions).
//
// `index` is a break position inside `cps`: the break would fall between
// cps[index - 1] and cps[index]. Returns true when either neighbour is an
// apostrophe. Positions at the very start (0) or at/after the end have no two
// neighbours and yield false.
//
// Besides the ASCII apostrophe this also recognises U+2019 (right single
// quotation mark, the typographic apostrophe commonly found in typeset book
// text) and U+02BC (modifier letter apostrophe), so that words such as
// "they’re" are protected the same way as "they're".
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (index == 0 || index >= cps.size()) {
    return false;
  }

  const auto isApostrophe = [](const uint32_t cp) {
    return cp == static_cast<uint32_t>('\'') || cp == 0x2019u || cp == 0x02BCu;
  };

  return isApostrophe(cps[index - 1].value) || isApostrophe(cps[index].value);
}
// Computes candidate hyphenation points for one word using English syllable heuristics.
//
// The returned values are positions within `cps` (a break at i falls between
// cps[i - 1] and cps[i]), sorted ascending with duplicates removed. Every
// position leaves at least MIN_PREFIX_CP elements before it and MIN_SUFFIX_CP
// after it and is never adjacent to an apostrophe. Words that are too short or
// contain fewer than two vowels produce no breaks.
std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> breaks;
  const size_t total = cps.size();
  if (total < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return breaks;
  }

  const auto lowerWord = lowercaseLatinWord(cps);

  // Collect the position of every vowel; breaks are placed between neighbours.
  std::vector<size_t> vowelAt;
  vowelAt.reserve(total);
  for (size_t pos = 0; pos < total; ++pos) {
    if (isLatinVowel(cps[pos].value)) {
      vowelAt.push_back(pos);
    }
  }
  if (vowelAt.size() < 2) {
    return breaks;
  }

  // Shared acceptance rule for syllable-based candidates.
  const auto breakAllowedAt = [&](const size_t at) {
    return at >= MIN_PREFIX_CP && total - at >= MIN_SUFFIX_CP && !nextToApostrophe(cps, at);
  };

  for (size_t next = 1; next < vowelAt.size(); ++next) {
    const size_t prevVowel = vowelAt[next - 1];
    const size_t nextVowel = vowelAt[next];

    size_t candidate = nextVowel;
    if (nextVowel - prevVowel == 1) {
      // Adjacent vowels: split between them unless they form a diphthong.
      if (isEnglishDiphthong(cps[prevVowel].value, cps[nextVowel].value)) {
        continue;
      }
    } else {
      // Consonant cluster between the vowels: hand the longest accepted onset
      // to the following syllable and break just before it.
      candidate = nextVowel - englishOnsetLength(cps, prevVowel + 1, nextVowel);
    }

    if (breakAllowedAt(candidate)) {
      breaks.push_back(candidate);
    }
  }

  // Prefix/suffix boundaries may repeat syllable breaks, so normalise afterwards.
  appendMorphologyBreaks(cps, lowerWord, breaks);
  std::sort(breaks.begin(), breaks.end());
  breaks.erase(std::unique(breaks.begin(), breaks.end()), breaks.end());
  return breaks;
}
} // namespace
// Returns the shared hyphenator object, held as a function-local static and
// therefore constructed the first time this function is called.
const EnglishHyphenator& EnglishHyphenator::instance() {
  static EnglishHyphenator shared;
  return shared;
}
// Identifies the script this hyphenator reports for itself: Script::Latin.
Script EnglishHyphenator::script() const {
  return Script::Latin;
}
// Public entry point: delegates to the file-local englishBreakIndexes() and
// returns its result unchanged.
std::vector<size_t> EnglishHyphenator::breakIndexes(const std::vector<CodepointInfo>& cps) const {
  std::vector<size_t> positions = englishBreakIndexes(cps);
  return positions;
}