format fix
This commit is contained in:
parent
3cf52d8bd1
commit
23183a6270
@ -1,5 +1,4 @@
|
|||||||
#include "EnglishHyphenator.h"
|
#include "EnglishHyphenator.h"
|
||||||
#include "HyphenationLiterals.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
@ -7,6 +6,8 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "HyphenationLiterals.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
char lowerLatinChar(const uint32_t cp) {
|
char lowerLatinChar(const uint32_t cp) {
|
||||||
@ -50,18 +51,15 @@ bool isEnglishFricativeChar(const char c) {
|
|||||||
|
|
||||||
using LatinLiteral = HyphenLiteralT<char>;
|
using LatinLiteral = HyphenLiteralT<char>;
|
||||||
|
|
||||||
constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2},
|
constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {
|
||||||
{"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5},
|
{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, {"dis", 3}, {"hyper", 5}, {"inter", 5},
|
||||||
{"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3},
|
{"micro", 5}, {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, {"over", 4}, {"post", 4},
|
||||||
{"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3},
|
{"pre", 3}, {"pro", 3}, {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
|
||||||
{"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
|
|
||||||
|
|
||||||
constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4},
|
constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {
|
||||||
{"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
|
{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
|
||||||
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6},
|
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
|
||||||
{"ly", 2}, {"ment", 4}, {"ments", 5},{"ness", 4},
|
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4}, {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
|
||||||
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4},
|
|
||||||
{"wards", 5},{"ship", 4}, {"ships", 5},{"y", 1}}};
|
|
||||||
|
|
||||||
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
|
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
|
||||||
|
|
||||||
@ -111,8 +109,9 @@ bool englishBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t bre
|
|||||||
|
|
||||||
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
|
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
|
||||||
std::vector<size_t>& indexes) {
|
std::vector<size_t>& indexes) {
|
||||||
appendLiteralBreaks(lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES,
|
appendLiteralBreaks(
|
||||||
[&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes);
|
lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES,
|
||||||
|
[&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct CharPair {
|
struct CharPair {
|
||||||
@ -313,8 +312,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
|||||||
const size_t rightVowel = vowelPositions[v + 1];
|
const size_t rightVowel = vowelPositions[v + 1];
|
||||||
|
|
||||||
if (rightVowel - leftVowel == 1) {
|
if (rightVowel - leftVowel == 1) {
|
||||||
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) &&
|
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && englishBreakAllowed(cps, rightVowel)) {
|
||||||
englishBreakAllowed(cps, rightVowel)) {
|
|
||||||
indexes.push_back(rightVowel);
|
indexes.push_back(rightVowel);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
|||||||
@ -30,8 +30,7 @@ bool matchesLiteralAt(const WordContainer& word, const size_t start, const Liter
|
|||||||
|
|
||||||
template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn>
|
template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn>
|
||||||
void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes,
|
void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes,
|
||||||
const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed,
|
const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed, std::vector<size_t>& indexes) {
|
||||||
std::vector<size_t>& indexes) {
|
|
||||||
const size_t length = lowerWord.size();
|
const size_t length = lowerWord.size();
|
||||||
|
|
||||||
const auto tryPush = [&](const size_t breakIndex) {
|
const auto tryPush = [&](const size_t breakIndex) {
|
||||||
|
|||||||
@ -1,11 +1,12 @@
|
|||||||
#include "RussianHyphenator.h"
|
#include "RussianHyphenator.h"
|
||||||
#include "HyphenationLiterals.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "HyphenationLiterals.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
using CyrillicLiteral = HyphenLiteralT<uint32_t>;
|
using CyrillicLiteral = HyphenLiteralT<uint32_t>;
|
||||||
@ -23,10 +24,18 @@ constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E};
|
|||||||
constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};
|
constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};
|
||||||
constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};
|
constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};
|
||||||
|
|
||||||
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3}, {PFX_RAZ, 3}, {PFX_POD, 3},
|
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3},
|
||||||
{PFX_NAD, 3}, {PFX_PERE, 4}, {PFX_SVERH, 5},
|
{PFX_RAZ, 3},
|
||||||
{PFX_MEZH, 3}, {PFX_SUPER, 5},{PFX_PRED, 4},
|
{PFX_POD, 3},
|
||||||
{PFX_SAMO, 4}, {PFX_OBO, 3}, {PFX_PROTIV, 6}}};
|
{PFX_NAD, 3},
|
||||||
|
{PFX_PERE, 4},
|
||||||
|
{PFX_SVERH, 5},
|
||||||
|
{PFX_MEZH, 3},
|
||||||
|
{PFX_SUPER, 5},
|
||||||
|
{PFX_PRED, 4},
|
||||||
|
{PFX_SAMO, 4},
|
||||||
|
{PFX_OBO, 3},
|
||||||
|
{PFX_PROTIV, 6}}};
|
||||||
|
|
||||||
constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};
|
constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};
|
||||||
constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};
|
constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};
|
||||||
@ -41,10 +50,18 @@ constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C};
|
|||||||
constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};
|
constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};
|
||||||
constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};
|
constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};
|
||||||
|
|
||||||
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4}, {SFX_STVO, 4}, {SFX_ENIE, 4},
|
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4},
|
||||||
{SFX_ATION, 4}, {SFX_CHIK, 3}, {SFX_NIK, 3},
|
{SFX_STVO, 4},
|
||||||
{SFX_TEL, 4}, {SFX_SKII, 4}, {SFX_AL, 6},
|
{SFX_ENIE, 4},
|
||||||
{SFX_ISM, 3}, {SFX_LIV, 5}, {SFX_OST, 4}}};
|
{SFX_ATION, 4},
|
||||||
|
{SFX_CHIK, 3},
|
||||||
|
{SFX_NIK, 3},
|
||||||
|
{SFX_TEL, 4},
|
||||||
|
{SFX_SKII, 4},
|
||||||
|
{SFX_AL, 6},
|
||||||
|
{SFX_ISM, 3},
|
||||||
|
{SFX_LIV, 5},
|
||||||
|
{SFX_OST, 4}}};
|
||||||
|
|
||||||
std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
|
std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<uint32_t> lower;
|
std::vector<uint32_t> lower;
|
||||||
@ -308,8 +325,9 @@ bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
|
|||||||
|
|
||||||
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
|
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
|
||||||
std::vector<size_t>& indexes) {
|
std::vector<size_t>& indexes) {
|
||||||
appendLiteralBreaks(lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES,
|
appendLiteralBreaks(
|
||||||
[&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes);
|
lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES,
|
||||||
|
[&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Produces syllable break indexes tailored to Russian phonotactics.
|
// Produces syllable break indexes tailored to Russian phonotactics.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user