Files
crosspoint-reader-mod/scripts/generate_hyphenation_trie.py
Xuan-Son Nguyen 0508bfc1f7 perf: apply (micro) optimization on SerializedHyphenationPatterns (#689)
## Summary

This PR applies a micro-optimization to `SerializedHyphenationPatterns`,
which allows reading `rootOffset` directly without having to parse and then
cache it.

It should not affect storage space since no new bytes are added.

This also gets rid of the linear cache search whenever
`liangBreakIndexes` is called. In theory, the performance should be
improved a bit, although it may be too small to be noticeable in
practice.

## Testing

master branch:

```
english: 99.1023%
french: 100%
german: 97.7289%
russian: 97.2167%
spanish: 99.0236%
```

This PR:

```
english: 99.1023%
french: 100%
german: 97.7289%
russian: 97.2167%
spanish: 99.0236%
```

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? PARTIALLY - mostly IDE
tab-autocompletions
2026-02-16 20:27:43 +11:00

102 lines
3.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""Embed hypher-generated `.bin` tries into constexpr headers."""
from __future__ import annotations
import argparse
import pathlib
def _format_bytes(blob: bytes, per_line: int = 16) -> str:
    """Format *blob* as wrapped rows of comma-terminated hex byte literals.

    Every row holds up to ``per_line`` bytes rendered as two-digit upper-case
    ``0xNN`` literals separated by ``", "``. Each row starts with one space and
    ends with a trailing comma; rows are joined with newlines.

    An empty blob yields a single `` 0x00,`` row instead of an empty string.
    """
    rows = [
        " " + ", ".join(f"0x{value:02X}" for value in blob[start : start + per_line]) + ","
        for start in range(0, len(blob), per_line)
    ]
    # Fall back to one zero byte so the result is never an empty literal list.
    return "\n".join(rows or [" 0x00,"])
def _symbol_from_output(path: pathlib.Path) -> str:
    """Derive the symbol prefix from the destination header's file name.

    Only the final path component is used. A trailing ``.trie.h`` is removed,
    then a leading ``hyph-``, then every ``-`` becomes ``_``, and finally a
    trailing ``.trie`` is removed (e.g. ``hyph-en.trie.h`` -> ``en``).

    No further validation is done, so names outside that pattern are returned
    with whatever other characters they contain.
    """
    header_suffix = '.trie.h'
    language_prefix = 'hyph-'
    trie_suffix = '.trie'

    stem = path.name
    if stem.endswith(header_suffix):
        stem = stem[: -len(header_suffix)]
    if stem.startswith(language_prefix):
        stem = stem[len(language_prefix):]
    # Dashes cannot appear in the generated identifiers, so map them to underscores.
    stem = stem.replace('-', '_')
    if stem.endswith(trie_suffix):
        stem = stem[: -len(trie_suffix)]
    return stem
def write_header(path: pathlib.Path, blob: bytes, symbol: str) -> None:
    """Write a constexpr C++ header embedding a serialized hyphenation trie.

    The input blob is laid out as:
      - 4 bytes: big-endian root address
      - levels tape: from byte 4 up to the root address
      - nodes data: from the root address onwards

    The 4-byte root address is not copied into the emitted byte array. It is
    written as the first field of the ``SerializedHyphenationPatterns``
    initializer instead, reduced by 4 so that it indexes the emitted bytes.

    Args:
        path: Destination header file; missing parent directories are created.
        blob: Raw contents of the trie file.
        symbol: Prefix for the generated ``<symbol>_trie_data`` and
            ``<symbol>_patterns`` identifiers.

    Raises:
        ValueError: If the blob is shorter than 4 bytes, or if the root
            address lies inside the 4-byte header or past the end of the blob.
    """
    header_size = 4
    if len(blob) < header_size:
        raise ValueError(f"Blob too small: {len(blob)} bytes")
    # Parse root address (big-endian uint32)
    root_addr = int.from_bytes(blob[:header_size], "big")
    if root_addr < header_size:
        # Rebasing such an address would go negative and be rendered as an
        # invalid literal such as `0x-2u` in the generated header.
        raise ValueError(
            f"Root address {root_addr} points inside the {header_size}-byte header"
        )
    if root_addr > len(blob):
        raise ValueError(f"Root address {root_addr} exceeds blob size {len(blob)}")
    # Remove the 4-byte root address and adjust the offset
    bytes_literal = _format_bytes(blob[header_size:])
    root_addr_new = root_addr - header_size
    path.parent.mkdir(parents=True, exist_ok=True)
    data_symbol = f"{symbol}_trie_data"
    patterns_symbol = f"{symbol}_patterns"
    content = f"""#pragma once
#include <cstddef>
#include <cstdint>
#include "../SerializedHyphenationTrie.h"
// Auto-generated by generate_hyphenation_trie.py. Do not edit manually.
alignas(4) constexpr uint8_t {data_symbol}[] = {{
{bytes_literal}
}};
constexpr SerializedHyphenationPatterns {patterns_symbol} = {{
0x{root_addr_new:02X}u,
{data_symbol},
sizeof({data_symbol}),
}};
"""
    # Pin the encoding so the generated header does not depend on the host locale.
    path.write_text(content, encoding="utf-8")
def main() -> None:
    """Command-line entry point: convert each ``--input`` trie into its ``--output`` header.

    ``--input`` and ``--output`` may each be given several times and are paired
    by position. The process exits with an error message when the two counts
    differ. One summary line is printed per header written.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--input',
        dest='inputs',
        action='append',
        required=True,
        help='Path to a hypher-generated .bin trie',
    )
    cli.add_argument(
        '--output',
        dest='outputs',
        action='append',
        required=True,
        help='Destination header path (hyph-*.trie.h)',
    )
    options = cli.parse_args()
    sources, destinations = options.inputs, options.outputs
    if len(sources) != len(destinations):
        raise SystemExit('input/output counts must match')
    # Handle every pair on its own so one invocation can refresh several languages.
    for source, destination in zip(sources, destinations):
        payload = pathlib.Path(source).read_bytes()
        header_path = pathlib.Path(destination)
        write_header(header_path, payload, _symbol_from_output(header_path))
        print(f'wrote {destination} ({len(payload)} bytes payload)')
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()