crosspoint-reader/scripts/generate_hyphenation_trie.py
Arthur Tazhitdinov 8824c87490
feat: dict based Hyphenation (#305)
## Summary

* Adds (optional) hyphenation for the English, French, German, and Russian
languages

## Additional Context

* The included hyphenation dictionaries add approximately 280 KB to flash
usage (German alone takes 200 KB)
* Trie-encoded dictionaries are adopted from the hypher project
(https://github.com/typst/hypher)
* Soft hyphens (and other explicit hyphens) take precedence over
dict-based hyphenation. Overall, the hyphenation rules are quite
aggressive, as I believe it makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
2026-01-19 12:56:26 +00:00

83 lines
2.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""Embed hypher-generated `.bin` tries into constexpr headers."""
from __future__ import annotations
import argparse
import pathlib
def _format_bytes(blob: bytes, per_line: int = 16) -> str:
    """Render *blob* as wrapped rows of comma-separated hex literals.

    Every row holds at most ``per_line`` bytes written as ``0xNN`` (upper-case
    hex, two digits) and ends with a trailing comma. Rows are joined with
    newlines. When no rows are produced, a single ``0x00`` row is returned
    instead of an empty string.
    """
    rows = [
        f" {', '.join(f'0x{byte:02X}' for byte in blob[start : start + per_line])},"
        for start in range(0, len(blob), per_line)
    ]
    # Fall back to one placeholder byte when there is nothing to emit.
    return '\n'.join(rows) if rows else " 0x00,"
def _symbol_from_output(path: pathlib.Path) -> str:
    """Derive a C identifier stem from the destination header name.

    Only the final path component is used. A ``.trie.h`` suffix and a
    ``hyph-`` prefix are stripped, then a leftover ``.trie`` suffix is
    removed. Finally every character that is not an ASCII letter, digit, or
    underscore is replaced with ``_`` so the stem cannot smuggle characters
    such as ``-``, ``.`` or spaces into the generated symbol names.

    Examples: ``hyph-en.trie.h`` -> ``en``, ``hyph-de-1996.trie.h`` ->
    ``de_1996``.

    Args:
        path: Destination header path.

    Returns:
        The sanitized symbol stem. It may still start with a digit if the
        file name does; no prefix is added.
    """
    name = path.name
    if name.endswith('.trie.h'):
        name = name[: -len('.trie.h')]
    if name.startswith('hyph-'):
        name = name[len('hyph-'):]
    if name.endswith('.trie'):
        name = name[: -len('.trie')]
    # Previously only '-' was mapped to '_'; anything else that is illegal in
    # a C identifier (dots, spaces, non-ASCII) leaked into the header.
    return ''.join(
        ch if ch.isascii() and (ch.isalnum() or ch == '_') else '_'
        for ch in name
    )
def write_header(path: pathlib.Path, blob: bytes, symbol: str) -> None:
    """Write a constexpr C++ header that embeds *blob* as a byte array.

    The header declares ``<symbol>_trie_data`` (the raw bytes, ``alignas(4)``)
    and ``<symbol>_patterns``, a ``SerializedHyphenationPatterns`` initialized
    with the array and its ``sizeof``. Missing parent directories of *path*
    are created, and an existing file at *path* is overwritten.

    Args:
        path: Destination header file.
        blob: Raw trie bytes to embed.
        symbol: Stem used to build the two generated symbol names.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    data_symbol = f"{symbol}_trie_data"
    patterns_symbol = f"{symbol}_patterns"
    # NOTE(review): for an empty blob _format_bytes emits a single 0x00
    # placeholder, so sizeof() in the descriptor reports 1 rather than 0 --
    # confirm that is acceptable to the consumer.
    bytes_literal = _format_bytes(blob)
    content = f"""#pragma once
#include <cstddef>
#include <cstdint>
#include "../SerializedHyphenationTrie.h"
// Auto-generated by generate_hyphenation_trie.py. Do not edit manually.
alignas(4) constexpr uint8_t {data_symbol}[] = {{
{bytes_literal}
}};
constexpr SerializedHyphenationPatterns {patterns_symbol} = {{
{data_symbol},
sizeof({data_symbol}),
}};
"""
    # Pin the encoding so the generated bytes do not depend on the locale's
    # default text encoding.
    path.write_text(content, encoding='utf-8')
def main() -> None:
    """Parse paired ``--input``/``--output`` options and write each header.

    Both options may be repeated; the N-th input is written to the N-th
    output. A one-line summary is printed for every header written.

    Raises:
        SystemExit: If the number of inputs and outputs differ.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--input',
        dest='inputs',
        action='append',
        required=True,
        help='Path to a hypher-generated .bin trie',
    )
    cli.add_argument(
        '--output',
        dest='outputs',
        action='append',
        required=True,
        help='Destination header path (hyph-*.trie.h)',
    )
    opts = cli.parse_args()

    sources, targets = opts.inputs, opts.outputs
    if len(sources) != len(targets):
        raise SystemExit('input/output counts must match')

    # Every pair is handled on its own, so headers for several languages can
    # be refreshed in a single invocation.
    for source, target in zip(sources, targets):
        payload = pathlib.Path(source).read_bytes()
        header_path = pathlib.Path(target)
        write_header(header_path, payload, _symbol_from_output(header_path))
        print(f'wrote {target} ({len(payload)} bytes payload)')
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()