fix: Support hyphenation for EPUBs using ISO 639-2 language codes (#1461)
## Summary

EPUBs that use ISO 639-2 three-letter language codes in their `dc:language` metadata (e.g. `<dc:language>eng</dc:language>`) got no hyphenation. The hyphenator registry only matched ISO 639-1 two-letter codes (`"en"`, `"fr"`, etc.), so `"eng"` produced a null hyphenator and every word in the book was treated as unhyphenatable.

Added a normalization step in `hyphenatorForLanguage` that maps ISO 639-2 codes (both bibliographic and terminological variants) to their two-letter equivalents before the registry lookup.

Discovered via *Project Hail Mary* (Random House), which uses `<dc:language>eng</dc:language>`.

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers.

Did you use AI tools to help write this code? _**PARTIALLY**_
This commit is contained in:
@@ -10,7 +10,7 @@
|
|||||||
#include "parsers/ChapterHtmlSlimParser.h"
|
#include "parsers/ChapterHtmlSlimParser.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
constexpr uint8_t SECTION_FILE_VERSION = 18;
|
constexpr uint8_t SECTION_FILE_VERSION = 19;
|
||||||
constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) +
|
constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) +
|
||||||
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(bool) + sizeof(bool) +
|
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(bool) + sizeof(bool) +
|
||||||
sizeof(uint8_t) + sizeof(uint32_t) + sizeof(uint32_t);
|
sizeof(uint8_t) + sizeof(uint32_t) + sizeof(uint32_t);
|
||||||
|
|||||||
@@ -12,11 +12,24 @@ const LanguageHyphenator* Hyphenator::cachedHyphenator_ = nullptr;
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
// Maps a BCP-47 language tag to a language-specific hyphenator.
|
// Normalize ISO 639-2 (three-letter) codes to ISO 639-1 (two-letter) codes used by the
|
||||||
|
// hyphenation registry. EPUBs may use either form in their dc:language metadata (e.g.
|
||||||
|
// "eng" instead of "en"). Both the bibliographic ("fre"/"ger") and terminological
|
||||||
|
// ("fra"/"deu") ISO 639-2 variants are mapped.
|
||||||
|
struct Iso639Mapping {  // One row of the ISO 639-2 -> ISO 639-1 normalization table.
|
||||||
|
const char* iso639_2;  // Three-letter code compared against the lowercased primary subtag (e.g. "eng", "fre").
|
||||||
|
const char* iso639_1;  // Two-letter code that replaces it before the registry lookup (e.g. "en", "fr").
|
||||||
|
};
|
||||||
|
static constexpr Iso639Mapping kIso639Mappings[] = {  // Scanned linearly by hyphenatorForLanguage; the first matching row wins.
|
||||||
|
{"eng", "en"}, {"fra", "fr"}, {"fre", "fr"}, {"deu", "de"}, {"ger", "de"},  // English; French and German in both terminological and bibliographic forms.
|
||||||
|
{"rus", "ru"}, {"spa", "es"}, {"ita", "it"}, {"ukr", "uk"},  // Russian, Spanish, Italian, Ukrainian.
|
||||||
|
};
|
||||||
|
|
||||||
|
// Maps a BCP-47 or ISO 639-2 language tag to a language-specific hyphenator.
|
||||||
const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
|
const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
|
||||||
if (langTag.empty()) return nullptr;
|
if (langTag.empty()) return nullptr;
|
||||||
|
|
||||||
// Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en").
|
// Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en", "ENG" -> "eng");
// three-letter ISO 639-2 codes are mapped to their two-letter form further below.
|
||||||
std::string primary;
|
std::string primary;
|
||||||
primary.reserve(langTag.size());
|
primary.reserve(langTag.size());
|
||||||
for (char c : langTag) {
|
for (char c : langTag) {
|
||||||
@@ -26,6 +39,14 @@ const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
|
|||||||
}
|
}
|
||||||
if (primary.empty()) return nullptr;
|
if (primary.empty()) return nullptr;
|
||||||
|
|
||||||
|
// Normalize ISO 639-2 three-letter codes to two-letter equivalents.
|
||||||
|
for (const auto& mapping : kIso639Mappings) {
|
||||||
|
if (primary == mapping.iso639_2) {
|
||||||
|
primary = mapping.iso639_1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return getLanguageHyphenatorForPrimaryTag(primary);
|
return getLanguageHyphenatorForPrimaryTag(primary);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user