#!/usr/bin/env python3
"""Generate prefix jump tables for StarDict dictionary lookup optimization.
This script parses StarDict .idx and .syn files and generates a C++ header
with pre-computed byte offsets for two-letter prefixes (aa-zz). This enables
near-instant lookup by jumping directly to the relevant section of the index.
Usage:
./scripts/generate_dict_index.py --idx path/to/dict.idx --syn path/to/dict.syn --output lib/StarDict/DictPrefixIndex.generated.h
Or extract from a zip file:
./scripts/generate_dict_index.py --zip dict-en-en.zip --output lib/StarDict/DictPrefixIndex.generated.h
"""

from __future__ import annotations

import argparse
import pathlib
import zipfile
from typing import BinaryIO


def prefix_to_index(c1: str, c2: str) -> int:
    """Convert a two-letter prefix to an index (0-675).

    'aa' -> 0, 'ab' -> 1, ... 'zz' -> 675
    """
    return (ord(c1.lower()) - ord('a')) * 26 + (ord(c2.lower()) - ord('a'))


def index_to_prefix(idx: int) -> str:
    """Convert an index back to its two-letter prefix, for debugging."""
    c1 = chr(ord('a') + idx // 26)
    c2 = chr(ord('a') + idx % 26)
    return c1 + c2
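
# Illustrative sanity check (safe to delete): the two helpers above should
# round-trip any lowercase pair. These run once at import time and are cheap.
assert prefix_to_index('a', 'a') == 0 and prefix_to_index('z', 'z') == 675
assert index_to_prefix(prefix_to_index('t', 'h')) == 'th'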


def is_alpha(c: str) -> bool:
    """Check if a character is a-z or A-Z."""
    return ('a' <= c <= 'z') or ('A' <= c <= 'Z')


def read_null_terminated_string(f: BinaryIO) -> tuple[str, int]:
    """Read a null-terminated string from a file.

    Returns (string, bytes_read including the null terminator).
    """
    raw = bytearray()
    bytes_read = 0
    while True:
        b = f.read(1)
        if not b:
            break
        bytes_read += 1
        if b == b'\x00':
            break
        raw += b
    # Decode once at the end: decoding byte-by-byte would mangle multi-byte
    # UTF-8 sequences, replacing each byte of a character individually.
    return raw.decode('utf-8', errors='replace'), bytes_read
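
# Illustrative self-check (safe to delete): parsing an in-memory stream.
# b'cat\x00' yields ('cat', 4) -- three content bytes plus the terminator --
# and leaves the stream positioned at the next entry's first byte.
import io  # needed only for this demo

assert read_null_terminated_string(io.BytesIO(b'cat\x00rest')) == ('cat', 4)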


def parse_idx_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    """Parse a StarDict .idx file and build a prefix -> offset mapping.

    The .idx file format is:
        [word\\0][offset: 4 bytes BE][size: 4 bytes BE]
        ...repeated for each word...

    Returns a dict mapping prefix index (0-675) to the first byte offset
    for that prefix.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    words_processed = 0
    while current_position < file_size:
        entry_start = current_position
        # Read the headword
        word, word_bytes = read_null_terminated_string(f)
        if word_bytes == 0:  # EOF
            break
        current_position += word_bytes
        # Skip the 8-byte payload (data offset + size, both big-endian)
        data = f.read(8)
        if len(data) != 8:
            break
        current_position += 8
        # Record a prefix if the word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])
            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start
        words_processed += 1
        if words_processed % 100000 == 0:
            print(f"  Processed {words_processed} words...")
    print(f"  Total words processed: {words_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")
    return prefix_offsets
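
# Note (descriptive; not needed by the parser above): each entry's 8-byte
# payload holds (data_offset, data_size) as big-endian uint32s pointing into
# the .dict file. Only the entry's position within the .idx itself matters
# here, so the payload is skipped undecoded; struct.unpack('>II', data)
# would recover the values if they were ever needed (after `import struct`).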


def parse_syn_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    """Parse a StarDict .syn file and build a prefix -> offset mapping.

    The .syn file format is:
        [synonym_word\\0][main_entry_index: 4 bytes BE]
        ...repeated for each synonym...

    Returns a dict mapping prefix index (0-675) to the first byte offset
    for that prefix.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    synonyms_processed = 0
    while current_position < file_size:
        entry_start = current_position
        # Read the synonym word
        word, word_bytes = read_null_terminated_string(f)
        if word_bytes == 0:  # EOF
            break
        current_position += word_bytes
        # Skip the 4-byte payload (index of the main entry, big-endian)
        data = f.read(4)
        if len(data) != 4:
            break
        current_position += 4
        # Record a prefix if the word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])
            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start
        synonyms_processed += 1
        if synonyms_processed % 100000 == 0:
            print(f"  Processed {synonyms_processed} synonyms...")
    print(f"  Total synonyms processed: {synonyms_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")
    return prefix_offsets


def fill_missing_prefixes(prefix_offsets: dict[int, int], file_size: int) -> list[int]:
    """Fill in missing prefixes with the next available offset.

    If a prefix has no entries (e.g. no words start with 'qx'), its slot is
    set to the next prefix's offset, so a scan starting there immediately
    finds nothing and moves on.
    """
    result = [0] * 676
    # First pass: fill in the known offsets
    for idx, offset in prefix_offsets.items():
        result[idx] = offset
    # Second pass: fill missing slots with the next known offset (or
    # file_size). Work backwards so each missing entry picks up the nearest
    # following valid offset.
    next_valid = file_size
    for idx in range(675, -1, -1):
        if idx in prefix_offsets:
            next_valid = prefix_offsets[idx]
        else:
            result[idx] = next_valid
    return result
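
# Worked example (hypothetical offsets): with file_size=1000 and known
# offsets {0: 0, 2: 120} ('aa' at byte 0, 'ac' at byte 120), the missing
# 'ab' slot borrows the offset of 'ac' and everything from 'ad' onward gets
# file_size, so lookups for absent prefixes terminate immediately.
assert fill_missing_prefixes({0: 0, 2: 120}, 1000)[:4] == [0, 120, 120, 1000]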


def format_offset_array(offsets: list[int], name: str) -> str:
    """Format an offset array as a nicely laid-out C++ constexpr table."""
    lines = [f"// Two-letter prefix jump table: {name}[prefix_to_index(c1, c2)] = byte offset"]
    lines.append("// Prefixes: aa=0, ab=1, ... az=25, ba=26, ... zz=675")
    lines.append(f"constexpr uint32_t {name}[676] = {{")
    # Format 13 values per line (fits nicely with 10-digit numbers + commas)
    values_per_line = 13
    for i in range(0, 676, values_per_line):
        chunk = offsets[i:i + values_per_line]
        prefix_start = index_to_prefix(i)
        prefix_end = index_to_prefix(min(i + values_per_line - 1, 675))
        values_str = ', '.join(f'{v:>10}' for v in chunk)
        lines.append(f"    {values_str}, // {prefix_start}-{prefix_end}")
    lines.append("};")
    return '\n'.join(lines)
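
# The emitted table looks roughly like this (offset values are illustrative):
#
#   constexpr uint32_t dictPrefixOffsets[676] = {
#            0,       1042,       2310, ...        9984, // aa-am
#        10240,      11710, ...
#   };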


def generate_header(idx_offsets: list[int], syn_offsets: list[int] | None, output_path: pathlib.Path) -> None:
    """Generate the C++ header file with the prefix offset tables."""
    content = '''#pragma once
// Auto-generated by generate_dict_index.py. Do not edit manually.
// This file contains pre-computed prefix jump tables for fast dictionary lookup.

#include <cstdint>

namespace DictPrefixIndex {

// Convert a two-letter prefix to an index (0-675)
// "aa" -> 0, "ab" -> 1, ... "az" -> 25, "ba" -> 26, ... "zz" -> 675
inline uint16_t prefixToIndex(char c1, char c2) {
    // Convert to lowercase and compute the index
    const int i1 = (c1 | 0x20) - 'a';  // tolower via OR with 0x20
    const int i2 = (c2 | 0x20) - 'a';
    // Bounds check (returns 0 for non-alpha characters)
    if (i1 < 0 || i1 > 25 || i2 < 0 || i2 > 25) return 0;
    return static_cast<uint16_t>(i1 * 26 + i2);
}

// Check if a character is alphabetic (a-z or A-Z)
inline bool isAlpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

'''
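
    # How the table is meant to be consumed on the C++ side (a sketch of the
    # intended use, not code verified against this repo): seek to
    # dictPrefixOffsets[prefixToIndex(w[0], w[1])] in the .idx file, then scan
    # entries linearly until the two-letter prefix stops matching.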
    content += format_offset_array(idx_offsets, "dictPrefixOffsets")
    content += '\n\n'
    if syn_offsets:
        content += format_offset_array(syn_offsets, "synPrefixOffsets")
    else:
        content += "// No synonym file processed - synPrefixOffsets not generated\n"
        content += "constexpr uint32_t synPrefixOffsets[676] = {0};\n"
    content += '\n} // namespace DictPrefixIndex\n'

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content)
    print(f"Generated: {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate StarDict prefix jump tables")
    parser.add_argument('--idx', type=str, help='Path to .idx file')
    parser.add_argument('--syn', type=str, help='Path to .syn file (optional)')
    parser.add_argument('--zip', type=str, help='Path to dictionary zip file (alternative to --idx/--syn)')
    parser.add_argument('--output', type=str, required=True, help='Output header path')
    args = parser.parse_args()

    idx_offsets: dict[int, int] = {}
    syn_offsets: dict[int, int] | None = None
    idx_file_size = 0
    syn_file_size = 0

    if args.zip:
        # Read directly out of a zip file
        zip_path = pathlib.Path(args.zip)
        print(f"Processing zip file: {zip_path}")
        with zipfile.ZipFile(zip_path, 'r') as zf:
            # Find the .idx (and optional .syn) members
            idx_name = None
            syn_name = None
            for name in zf.namelist():
                if name.endswith('.idx'):
                    idx_name = name
                elif name.endswith('.syn'):
                    syn_name = name
            if not idx_name:
                raise SystemExit("No .idx file found in zip")
            print(f"\nParsing index file: {idx_name}")
            with zf.open(idx_name) as f:
                idx_file_size = zf.getinfo(idx_name).file_size
                idx_offsets = parse_idx_file(f, idx_file_size)
            if syn_name:
                print(f"\nParsing synonym file: {syn_name}")
                with zf.open(syn_name) as f:
                    syn_file_size = zf.getinfo(syn_name).file_size
                    syn_offsets = parse_syn_file(f, syn_file_size)
    else:
        # Read from individual files
        if not args.idx:
            raise SystemExit("Either --zip or --idx must be provided")
        idx_path = pathlib.Path(args.idx)
        print(f"Processing index file: {idx_path}")
        idx_file_size = idx_path.stat().st_size
        with open(idx_path, 'rb') as f:
            idx_offsets = parse_idx_file(f, idx_file_size)
        if args.syn:
            syn_path = pathlib.Path(args.syn)
            print(f"\nProcessing synonym file: {syn_path}")
            syn_file_size = syn_path.stat().st_size
            with open(syn_path, 'rb') as f:
                syn_offsets = parse_syn_file(f, syn_file_size)

    # Fill in missing prefixes (None means no .syn file was given)
    print("\nFilling missing prefixes...")
    idx_offsets_filled = fill_missing_prefixes(idx_offsets, idx_file_size)
    syn_offsets_filled = fill_missing_prefixes(syn_offsets, syn_file_size) if syn_offsets is not None else None

    # Generate the header
    print("\nGenerating header file...")
    generate_header(idx_offsets_filled, syn_offsets_filled, pathlib.Path(args.output))

    # Print some statistics
    print("\n=== Statistics ===")
    print(f"Index file size: {idx_file_size:,} bytes")
    if syn_file_size:
        print(f"Synonym file size: {syn_file_size:,} bytes")

    # Show where a few common prefixes land in the index
    print("\nSample prefix offsets:")
    for prefix in ['aa', 'he', 'th', 'wo', 'zz']:
        idx = prefix_to_index(prefix[0], prefix[1])
        offset = idx_offsets_filled[idx]
        pct = (offset / idx_file_size) * 100 if idx_file_size else 0
        print(f"  '{prefix}' (index {idx}): offset {offset:,} ({pct:.1f}% into file)")


if __name__ == '__main__':
    main()