From 1c1333118962f456ff56a3c599f52520d2d204d9 Mon Sep 17 00:00:00 2001 From: Zach Nelson Date: Mon, 6 Apr 2026 18:30:24 -0500 Subject: [PATCH] fix: Support hyphenation for EPUBs using ISO 639-2 language codes (#1461) ## Summary EPUBs that use ISO 639-2 three-letter language codes in their `dc:language` metadata (e.g. `eng`) got no hyphenation. The hyphenator registry only matched ISO 639-1 two-letter codes (`"en"`, `"fr"`, etc.), so `"eng"` produced a null hyphenator and every word in the book was treated as unhyphenatable. Added a normalization step in `hyphenatorForLanguage` that maps ISO 639-2 codes (both bibliographic and terminological variants) to their two-letter equivalents before the registry lookup. Discovered via *Project Hail Mary* (Random House), which uses `eng`. --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**PARTIALLY**_ --- lib/Epub/Epub/Section.cpp | 2 +- lib/Epub/Epub/hyphenation/Hyphenator.cpp | 25 ++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/lib/Epub/Epub/Section.cpp b/lib/Epub/Epub/Section.cpp index 9365df20..7c0b3c12 100644 --- a/lib/Epub/Epub/Section.cpp +++ b/lib/Epub/Epub/Section.cpp @@ -10,7 +10,7 @@ #include "parsers/ChapterHtmlSlimParser.h" namespace { -constexpr uint8_t SECTION_FILE_VERSION = 18; +constexpr uint8_t SECTION_FILE_VERSION = 19; constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(bool) + sizeof(bool) + sizeof(uint8_t) + sizeof(uint32_t) + sizeof(uint32_t); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index 4ae5307e..ad5de454 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -12,11 +12,24 @@ const LanguageHyphenator* Hyphenator::cachedHyphenator_ = nullptr; namespace { -// Maps a BCP-47 language tag to a language-specific hyphenator. +// Normalize ISO 639-2 (three-letter) codes to ISO 639-1 (two-letter) codes used by the +// hyphenation registry. EPUBs may use either form in their dc:language metadata (e.g. +// "eng" instead of "en"). Both the bibliographic ("fre"/"ger") and terminological +// ("fra"/"deu") ISO 639-2 variants are mapped. +struct Iso639Mapping { + const char* iso639_2; + const char* iso639_1; +}; +static constexpr Iso639Mapping kIso639Mappings[] = { + {"eng", "en"}, {"fra", "fr"}, {"fre", "fr"}, {"deu", "de"}, {"ger", "de"}, + {"rus", "ru"}, {"spa", "es"}, {"ita", "it"}, {"ukr", "uk"}, +}; + +// Maps a BCP-47 or ISO 639-2 language tag to a language-specific hyphenator. const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) { if (langTag.empty()) return nullptr; - // Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en"). + // Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en", "ENG" -> "en"). std::string primary; primary.reserve(langTag.size()); for (char c : langTag) { @@ -26,6 +39,14 @@ const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) { } if (primary.empty()) return nullptr; + // Normalize ISO 639-2 three-letter codes to two-letter equivalents. + for (const auto& mapping : kIso639Mappings) { + if (primary == mapping.iso639_2) { + primary = mapping.iso639_1; + break; + } + } + return getLanguageHyphenatorForPrimaryTag(primary); }