Disable hyphenation feature in CrossPointSettings
This commit is contained in:
parent
3806f1883a
commit
3cf52d8bd1
@ -1,4 +1,5 @@
|
||||
#include "EnglishHyphenator.h"
|
||||
#include "HyphenationLiterals.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@ -47,10 +48,20 @@ bool isEnglishFricativeChar(const char c) {
|
||||
}
|
||||
}
|
||||
|
||||
// A fixed ASCII pattern together with its explicit character count.
// The struct only stores the pointer; it does not manage the pointed-to text.
struct LatinLiteral {
  const char* text;    // first character of the pattern
  std::size_t length;  // number of characters in the pattern
};
|
||||
using LatinLiteral = HyphenLiteralT<char>;
|
||||
|
||||
constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2},
|
||||
{"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5},
|
||||
{"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3},
|
||||
{"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3},
|
||||
{"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
|
||||
|
||||
constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4},
|
||||
{"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
|
||||
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6},
|
||||
{"ly", 2}, {"ment", 4}, {"ments", 5},{"ness", 4},
|
||||
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4},
|
||||
{"wards", 5},{"ship", 4}, {"ships", 5},{"y", 1}}};
|
||||
|
||||
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
|
||||
|
||||
@ -63,21 +74,6 @@ std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
|
||||
return lower;
|
||||
}
|
||||
|
||||
bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {
|
||||
if (!pattern.text || pattern.length == 0) {
|
||||
return false;
|
||||
}
|
||||
if (start + pattern.length > lowerWord.size()) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < pattern.length; ++i) {
|
||||
if (lowerWord[start + i] != pattern.text[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
|
||||
if (start >= end || start >= cps.size()) {
|
||||
return false;
|
||||
@ -91,56 +87,32 @@ bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t
|
||||
return false;
|
||||
}
|
||||
|
||||
bool englishBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t breakIndex) {
|
||||
if (breakIndex == 0 || breakIndex >= cps.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t prefixLen = breakIndex;
|
||||
const size_t suffixLen = cps.size() - breakIndex;
|
||||
if (prefixLen < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, cps.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nextToApostrophe(cps, breakIndex)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Appends break candidates located at the boundary of a known English prefix
// or suffix of the word.
//
// Parameters:
//   cps       - code points of the word; used to validate each candidate.
//   lowerWord - lowercase form of the same word; the literals are matched against it.
//               NOTE(review): assumed to have one char per entry of `cps` - confirm
//               against lowercaseLatinWord.
//   indexes   - output; accepted break positions are appended in table order
//               (prefixes first, then suffixes).
//
// The matching is delegated to appendLiteralBreaks over ENGLISH_PREFIXES and
// ENGLISH_SUFFIXES, with englishBreakAllowed as the acceptance test. That test
// already enforces the minimum prefix/suffix lengths, the vowel requirement and
// the apostrophe rule, so no separate scan or length pre-check is needed here and
// each accepted position is appended once.
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
                            std::vector<size_t>& indexes) {
  const auto breakAllowed = [&cps](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); };
  appendLiteralBreaks(lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES, breakAllowed, indexes);
}
|
||||
|
||||
struct CharPair {
|
||||
@ -341,8 +313,8 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||
const size_t rightVowel = vowelPositions[v + 1];
|
||||
|
||||
if (rightVowel - leftVowel == 1) {
|
||||
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && rightVowel >= MIN_PREFIX_CP &&
|
||||
cps.size() - rightVowel >= MIN_SUFFIX_CP && !nextToApostrophe(cps, rightVowel)) {
|
||||
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) &&
|
||||
englishBreakAllowed(cps, rightVowel)) {
|
||||
indexes.push_back(rightVowel);
|
||||
}
|
||||
continue;
|
||||
@ -353,10 +325,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||
const size_t onsetLen = englishOnsetLength(cps, clusterStart, clusterEnd);
|
||||
size_t breakIndex = clusterEnd - onsetLen;
|
||||
|
||||
if (breakIndex < MIN_PREFIX_CP || cps.size() - breakIndex < MIN_SUFFIX_CP) {
|
||||
continue;
|
||||
}
|
||||
if (nextToApostrophe(cps, breakIndex)) {
|
||||
if (!englishBreakAllowed(cps, breakIndex)) {
|
||||
continue;
|
||||
}
|
||||
indexes.push_back(breakIndex);
|
||||
|
||||
64
lib/Epub/Epub/hyphenation/HyphenationLiterals.h
Normal file
64
lib/Epub/Epub/hyphenation/HyphenationLiterals.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
// A fixed pattern of `length` elements of type CharT starting at `data`.
// The struct only stores the pointer; it does not manage the pointed-to storage.
template <class CharT>
struct HyphenLiteral {
  const CharT* data;  // first element of the pattern
  size_t length;      // number of elements in the pattern
};

// Alias for HyphenLiteral<CharT>; both names refer to the same type.
template <class CharT>
using HyphenLiteralT = HyphenLiteral<CharT>;
|
||||
|
||||
// Reports whether `literal` occurs in `word` beginning exactly at offset `start`.
//
// WordContainer must provide size() and operator[]; Literal must expose a `data`
// pointer and a `length` count. Elements are compared with operator!=.
//
// Returns false when the literal has no data, has zero length, or would extend
// past the end of `word`; otherwise returns true only if every element matches.
template <typename WordContainer, typename Literal>
bool matchesLiteralAt(const WordContainer& word, const size_t start, const Literal& literal) {
  if (!literal.data || literal.length == 0) {
    return false;
  }

  // Compare against the remaining room instead of computing `start + literal.length`:
  // the sum can wrap around for a very large `start`, pass the check, and then
  // index outside the container.
  const size_t wordLen = word.size();
  if (start > wordLen || literal.length > wordLen - start) {
    return false;
  }

  for (size_t i = 0; i < literal.length; ++i) {
    if (word[start + i] != literal.data[i]) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
// Appends break positions found at the edge of any matching prefix or suffix literal.
//
// Parameters:
//   lowerWord    - word to match against (needs size() and operator[]).
//   prefixes     - iterable of literals tested at the start of the word; a match
//                  proposes a break right after the prefix.
//   suffixes     - iterable of literals tested at the end of the word; a match
//                  proposes a break right before the suffix.
//   breakAllowed - callable taking a break index and returning whether it is acceptable.
//   indexes      - output; accepted positions are appended (prefixes first, then
//                  suffixes, each in container order). No de-duplication is done here.
//
// Literals that are empty or at least as long as the whole word are skipped.
template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn>
void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes,
                         const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed,
                         std::vector<size_t>& indexes) {
  const size_t wordLen = lowerWord.size();

  for (const auto& prefix : prefixes) {
    const size_t cut = prefix.length;
    const bool fitsInside = cut != 0 && cut < wordLen;
    if (fitsInside && matchesLiteralAt(lowerWord, 0, prefix) && breakAllowed(cut)) {
      indexes.push_back(cut);
    }
  }

  for (const auto& suffix : suffixes) {
    const size_t tailLen = suffix.length;
    if (tailLen == 0 || tailLen >= wordLen) {
      continue;
    }
    // Safe subtraction: tailLen < wordLen was established above.
    const size_t cut = wordLen - tailLen;
    if (matchesLiteralAt(lowerWord, cut, suffix) && breakAllowed(cut)) {
      indexes.push_back(cut);
    }
  }
}
|
||||
@ -1,11 +1,111 @@
|
||||
#include "RussianHyphenator.h"
|
||||
#include "HyphenationLiterals.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
namespace {
|
||||
|
||||
using CyrillicLiteral = HyphenLiteralT<uint32_t>;
|
||||
|
||||
constexpr uint32_t PFX_BEZ[3] = {0x0431, 0x0435, 0x0437};
|
||||
constexpr uint32_t PFX_RAZ[3] = {0x0440, 0x0430, 0x0437};
|
||||
constexpr uint32_t PFX_POD[3] = {0x043F, 0x043E, 0x0434};
|
||||
constexpr uint32_t PFX_NAD[3] = {0x043D, 0x0430, 0x0434};
|
||||
constexpr uint32_t PFX_PERE[4] = {0x043F, 0x0435, 0x0440, 0x0435};
|
||||
constexpr uint32_t PFX_SVERH[5] = {0x0441, 0x0432, 0x0435, 0x0440, 0x0445};
|
||||
constexpr uint32_t PFX_MEZH[3] = {0x043C, 0x0435, 0x0436};
|
||||
constexpr uint32_t PFX_SUPER[5] = {0x0441, 0x0443, 0x043F, 0x0435, 0x0440};
|
||||
constexpr uint32_t PFX_PRED[4] = {0x043F, 0x0440, 0x0435, 0x0434};
|
||||
constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E};
|
||||
constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};
|
||||
constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};
|
||||
|
||||
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3}, {PFX_RAZ, 3}, {PFX_POD, 3},
|
||||
{PFX_NAD, 3}, {PFX_PERE, 4}, {PFX_SVERH, 5},
|
||||
{PFX_MEZH, 3}, {PFX_SUPER, 5},{PFX_PRED, 4},
|
||||
{PFX_SAMO, 4}, {PFX_OBO, 3}, {PFX_PROTIV, 6}}};
|
||||
|
||||
constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};
|
||||
constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};
|
||||
constexpr uint32_t SFX_ENIE[4] = {0x0435, 0x043D, 0x0438, 0x0435};
|
||||
constexpr uint32_t SFX_ATION[4] = {0x0430, 0x0446, 0x0438, 0x044F};
|
||||
constexpr uint32_t SFX_CHIK[3] = {0x0447, 0x0438, 0x043A};
|
||||
constexpr uint32_t SFX_NIK[3] = {0x043D, 0x0438, 0x043A};
|
||||
constexpr uint32_t SFX_TEL[4] = {0x0442, 0x0435, 0x043B, 0x044C};
|
||||
constexpr uint32_t SFX_SKII[4] = {0x0441, 0x043A, 0x0438, 0x0439};
|
||||
constexpr uint32_t SFX_AL[6] = {0x0430, 0x043B, 0x044C, 0x043D, 0x044B, 0x0439};
|
||||
constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C};
|
||||
constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};
|
||||
constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};
|
||||
|
||||
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4}, {SFX_STVO, 4}, {SFX_ENIE, 4},
|
||||
{SFX_ATION, 4}, {SFX_CHIK, 3}, {SFX_NIK, 3},
|
||||
{SFX_TEL, 4}, {SFX_SKII, 4}, {SFX_AL, 6},
|
||||
{SFX_ISM, 3}, {SFX_LIV, 5}, {SFX_OST, 4}}};
|
||||
|
||||
std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
|
||||
std::vector<uint32_t> lower;
|
||||
lower.reserve(cps.size());
|
||||
for (const auto& info : cps) {
|
||||
lower.push_back(isCyrillicLetter(info.value) ? toLowerCyrillic(info.value) : info.value);
|
||||
}
|
||||
return lower;
|
||||
}
|
||||
|
||||
bool russianSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
|
||||
if (start >= cps.size()) {
|
||||
return false;
|
||||
}
|
||||
const size_t clampedEnd = std::min(end, cps.size());
|
||||
for (size_t i = start; i < clampedEnd; ++i) {
|
||||
if (isCyrillicVowel(cps[i].value)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool exposesLeadingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||
if (index + 1 >= cps.size()) {
|
||||
return false;
|
||||
}
|
||||
const auto first = cps[index].value;
|
||||
const auto second = cps[index + 1].value;
|
||||
if (!isCyrillicConsonant(first) || !isCyrillicConsonant(second)) {
|
||||
return false;
|
||||
}
|
||||
if (toLowerCyrillic(first) != toLowerCyrillic(second)) {
|
||||
return false;
|
||||
}
|
||||
const bool hasLeftVowel = index > 0 && isCyrillicVowel(cps[index - 1].value);
|
||||
const bool hasRightVowel = (index + 2 < cps.size()) && isCyrillicVowel(cps[index + 2].value);
|
||||
return hasLeftVowel && hasRightVowel;
|
||||
}
|
||||
|
||||
bool exposesTrailingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||
if (index < 2) {
|
||||
return false;
|
||||
}
|
||||
const auto last = cps[index - 1].value;
|
||||
const auto prev = cps[index - 2].value;
|
||||
if (!isCyrillicConsonant(last) || !isCyrillicConsonant(prev)) {
|
||||
return false;
|
||||
}
|
||||
if (toLowerCyrillic(last) != toLowerCyrillic(prev)) {
|
||||
return false;
|
||||
}
|
||||
const bool hasLeftVowel = (index >= 3) && isCyrillicVowel(cps[index - 3].value);
|
||||
const bool hasRightVowel = (index < cps.size()) && isCyrillicVowel(cps[index].value);
|
||||
return hasLeftVowel && hasRightVowel;
|
||||
}
|
||||
|
||||
bool violatesDoubleConsonantRule(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||
return exposesLeadingDoubleConsonant(cps, index) || exposesTrailingDoubleConsonant(cps, index);
|
||||
}
|
||||
|
||||
// Checks if the codepoint is the Cyrillic soft sign (ь).
|
||||
bool isSoftSign(uint32_t cp) { return toLowerCyrillic(cp) == 0x044C; }
|
||||
|
||||
@ -163,10 +263,18 @@ bool russianBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t bre
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!russianSegmentHasVowel(cps, 0, breakIndex) || !russianSegmentHasVowel(cps, breakIndex, cps.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (beginsWithForbiddenSuffix(cps, breakIndex)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (violatesDoubleConsonantRule(cps, breakIndex)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -198,6 +306,12 @@ bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||
return isSoftOrHardSign(left) || isSoftOrHardSign(right);
|
||||
}
|
||||
|
||||
// Appends break candidates located at the boundary of a known Russian prefix or
// suffix of the word.
//
// `lowerWord` is matched against RUSSIAN_PREFIXES and RUSSIAN_SUFFIXES by
// appendLiteralBreaks; each proposed position is kept only if
// russianBreakAllowed(cps, position) accepts it. Accepted positions are appended
// to `indexes`.
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
                            std::vector<size_t>& indexes) {
  const auto breakAllowed = [&cps](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); };
  appendLiteralBreaks(lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES, breakAllowed, indexes);
}
|
||||
|
||||
// Produces syllable break indexes tailored to Russian phonotactics.
|
||||
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||
std::vector<size_t> indexes;
|
||||
@ -205,6 +319,8 @@ std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||
return indexes;
|
||||
}
|
||||
|
||||
const auto lowerWord = lowercaseCyrillicWord(cps);
|
||||
|
||||
std::vector<size_t> vowelPositions;
|
||||
vowelPositions.reserve(cps.size());
|
||||
for (size_t i = 0; i < cps.size(); ++i) {
|
||||
@ -233,8 +349,8 @@ std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||
const size_t clusterEnd = rightVowel;
|
||||
|
||||
size_t breakIndex = std::numeric_limits<size_t>::max();
|
||||
if (const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd);
|
||||
split != std::numeric_limits<size_t>::max()) {
|
||||
const auto split = doubleConsonantSplit(cps, clusterStart, clusterEnd);
|
||||
if (split != std::numeric_limits<size_t>::max()) {
|
||||
breakIndex = split;
|
||||
} else {
|
||||
const size_t onsetLen = russianOnsetLength(cps, clusterStart, clusterEnd);
|
||||
@ -257,6 +373,8 @@ std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||
indexes.push_back(breakIndex);
|
||||
}
|
||||
|
||||
appendMorphologyBreaks(cps, lowerWord, indexes);
|
||||
|
||||
std::sort(indexes.begin(), indexes.end());
|
||||
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
||||
return indexes;
|
||||
|
||||
@ -24,7 +24,7 @@ class CrossPointSettings {
|
||||
uint8_t extraParagraphSpacing = 1;
|
||||
// Duration of the power button press
|
||||
uint8_t shortPwrBtn = 0;
|
||||
uint8_t hyphenationEnabled = 1;
|
||||
uint8_t hyphenationEnabled = 0;
|
||||
|
||||
~CrossPointSettings() = default;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user