fix: Support hyphenation for EPUBs using ISO 639-2 language codes (#1461)

## Summary

EPUBs that use ISO 639-2 three-letter language codes in their
`dc:language` metadata (e.g. `<dc:language>eng</dc:language>`) got no
hyphenation. The hyphenator registry only matched ISO 639-1 two-letter
codes (`"en"`, `"fr"`, etc.), so `"eng"` produced a null hyphenator and
every word in the book was treated as unhyphenatable.
This change adds a normalization step in `hyphenatorForLanguage` that maps
ISO 639-2 codes (both bibliographic and terminological variants) to their
two-letter ISO 639-1 equivalents before the registry lookup.

Discovered via *Project Hail Mary* (Random House), which uses
`<dc:language>eng</dc:language>`.

---

### AI Usage

While CrossPoint doesn't restrict the use of AI tools when contributing,
please be transparent about their usage, as doing so
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**PARTIALLY**_
This commit is contained in:
Zach Nelson
2026-04-06 18:30:24 -05:00
committed by GitHub
parent 9b3885135f
commit 1c13331189
2 changed files with 24 additions and 3 deletions

View File

@@ -10,7 +10,7 @@
#include "parsers/ChapterHtmlSlimParser.h" #include "parsers/ChapterHtmlSlimParser.h"
namespace { namespace {
constexpr uint8_t SECTION_FILE_VERSION = 18; constexpr uint8_t SECTION_FILE_VERSION = 19;
constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) + constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) +
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(bool) + sizeof(bool) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(bool) + sizeof(bool) +
sizeof(uint8_t) + sizeof(uint32_t) + sizeof(uint32_t); sizeof(uint8_t) + sizeof(uint32_t) + sizeof(uint32_t);

View File

@@ -12,11 +12,24 @@ const LanguageHyphenator* Hyphenator::cachedHyphenator_ = nullptr;
namespace { namespace {
// Maps a BCP-47 language tag to a language-specific hyphenator. // Normalize ISO 639-2 (three-letter) codes to ISO 639-1 (two-letter) codes used by the
// hyphenation registry. EPUBs may use either form in their dc:language metadata (e.g.
// "eng" instead of "en"). Both the bibliographic ("fre"/"ger") and terminological
// ("fra"/"deu") ISO 639-2 variants are mapped.
// One row of the ISO 639-2 -> ISO 639-1 translation table.
struct Iso639Mapping {
  const char* iso639_2;  // three-letter code that may appear in dc:language
  const char* iso639_1;  // two-letter code substituted before the registry lookup
};

// Translation table consulted when normalizing a primary language subtag.
// French and German each appear twice because ISO 639-2 defines both a
// bibliographic ("fre"/"ger") and a terminological ("fra"/"deu") code for them.
static constexpr Iso639Mapping kIso639Mappings[] = {
    {"eng", "en"},  // English
    {"fra", "fr"},  // French (terminological)
    {"fre", "fr"},  // French (bibliographic)
    {"deu", "de"},  // German (terminological)
    {"ger", "de"},  // German (bibliographic)
    {"rus", "ru"},  // Russian
    {"spa", "es"},  // Spanish
    {"ita", "it"},  // Italian
    {"ukr", "uk"},  // Ukrainian
};
// Maps a BCP-47 or ISO 639-2 language tag to a language-specific hyphenator.
const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) { const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
if (langTag.empty()) return nullptr; if (langTag.empty()) return nullptr;
// Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en"). // Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en", "ENG" -> "en").
std::string primary; std::string primary;
primary.reserve(langTag.size()); primary.reserve(langTag.size());
for (char c : langTag) { for (char c : langTag) {
@@ -26,6 +39,14 @@ const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
} }
if (primary.empty()) return nullptr; if (primary.empty()) return nullptr;
// Normalize ISO 639-2 three-letter codes to two-letter equivalents.
for (const auto& mapping : kIso639Mappings) {
if (primary == mapping.iso639_2) {
primary = mapping.iso639_1;
break;
}
}
return getLanguageHyphenatorForPrimaryTag(primary); return getLanguageHyphenatorForPrimaryTag(primary);
} }