#!/usr/bin/env python3
"""Generate prefix jump tables for StarDict dictionary lookup optimization.

This script parses StarDict .idx and .syn files and generates a C++ header
with pre-computed byte offsets for two-letter prefixes (aa-zz). This enables
near-instant lookup by jumping directly to the relevant section of the index.

Usage:
    ./scripts/generate_dict_index.py --idx path/to/dict.idx --syn path/to/dict.syn --output lib/StarDict/DictPrefixIndex.generated.h

Or extract from a zip file:
    ./scripts/generate_dict_index.py --zip dict-en-en.zip --output lib/StarDict/DictPrefixIndex.generated.h
"""

from __future__ import annotations

import argparse
import pathlib
import zipfile
from typing import BinaryIO


def prefix_to_index(c1: str, c2: str) -> int:
    """Convert a two-letter prefix to an index (0-675).

    'aa' -> 0, 'ab' -> 1, ... 'zz' -> 675
    """
    return (ord(c1.lower()) - ord('a')) * 26 + (ord(c2.lower()) - ord('a'))
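
# A few illustrative values, computed directly from the formula above:
#   prefix_to_index('a', 'a') == 0
#   prefix_to_index('a', 'z') == 25
#   prefix_to_index('b', 'a') == 26
#   prefix_to_index('Z', 'Z') == 675   # case-insensitive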


def index_to_prefix(idx: int) -> str:
    """Convert an index back to its two-letter prefix, for debugging."""
    c1 = chr(ord('a') + idx // 26)
    c2 = chr(ord('a') + idx % 26)
    return c1 + c2


def is_alpha(c: str) -> bool:
    """Check whether a character is a-z or A-Z."""
    return ('a' <= c <= 'z') or ('A' <= c <= 'Z')


def read_null_terminated_string(f: BinaryIO) -> tuple[str, int]:
    """Read a null-terminated UTF-8 string from a binary stream.

    Returns (string, bytes_read including the null terminator).
    """
    raw = bytearray()
    bytes_read = 0
    while True:
        b = f.read(1)
        if not b:
            break
        bytes_read += 1
        if b == b'\x00':
            break
        raw.extend(b)
    # Decode once at the end so multi-byte UTF-8 sequences stay intact;
    # decoding byte-by-byte would mangle every non-ASCII headword.
    return raw.decode('utf-8', errors='replace'), bytes_read
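
# For example, a stream positioned at b'cat\x00' yields ('cat', 4): three
# word bytes plus the terminating null.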


def parse_idx_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    r"""Parse a StarDict .idx file and build a prefix -> offset mapping.

    The .idx file format is:
        [word\0][offset: 4 bytes BE][size: 4 bytes BE]
        ...repeated for each word...

    Returns a dict mapping prefix index (0-675) to the first byte offset
    at which that prefix appears.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    words_processed = 0

    while current_position < file_size:
        entry_start = current_position

        # Read the word
        word, word_bytes = read_null_terminated_string(f)
        if not word:
            break

        current_position += word_bytes

        # Read 8 bytes (offset + size, both big-endian)
        data = f.read(8)
        if len(data) != 8:
            break
        current_position += 8

        # Extract the prefix if the word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])

            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start

        words_processed += 1
        if words_processed % 100000 == 0:
            print(f"  Processed {words_processed} words...")

    print(f"  Total words processed: {words_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")

    return prefix_offsets
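
# Layout of a single (hypothetical) .idx entry: the word "cat" whose article
# lives at data offset 0x1234 with length 0x56 would be stored as
#   b'cat\x00' + b'\x00\x00\x12\x34' + b'\x00\x00\x00\x56'
# i.e. the null-terminated word followed by two big-endian uint32 fields.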


def parse_syn_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    r"""Parse a StarDict .syn file and build a prefix -> offset mapping.

    The .syn file format is:
        [synonym_word\0][main_entry_index: 4 bytes BE]
        ...repeated for each synonym...

    Returns a dict mapping prefix index (0-675) to the first byte offset
    at which that prefix appears.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    synonyms_processed = 0

    while current_position < file_size:
        entry_start = current_position

        # Read the synonym word
        word, word_bytes = read_null_terminated_string(f)
        if not word:
            break

        current_position += word_bytes

        # Read 4 bytes (index of the main entry, big-endian)
        data = f.read(4)
        if len(data) != 4:
            break
        current_position += 4

        # Extract the prefix if the word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])

            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start

        synonyms_processed += 1
        if synonyms_processed % 100000 == 0:
            print(f"  Processed {synonyms_processed} synonyms...")

    print(f"  Total synonyms processed: {synonyms_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")

    return prefix_offsets
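
# A single (hypothetical) .syn entry: the synonym "kitty" pointing at main
# entry index 42 would be stored as b'kitty\x00' + b'\x00\x00\x00\x2a'.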


def fill_missing_prefixes(prefix_offsets: dict[int, int], file_size: int) -> list[int]:
    """Fill in missing prefixes with the next available offset.

    If a prefix doesn't exist (e.g., no words starting with 'qx'), its offset
    is set to the next prefix's offset, so a lookup lands on an empty range,
    quickly finds nothing, and moves on.
    """
    result = [0] * 676

    # First pass: fill in known offsets
    for idx, offset in prefix_offsets.items():
        result[idx] = offset

    # Second pass: fill the gaps with the next known offset (or file_size).
    # Working backwards guarantees each missing entry sees the next valid offset.
    next_valid = file_size
    for idx in range(675, -1, -1):
        if idx in prefix_offsets:
            next_valid = prefix_offsets[idx]
        else:
            result[idx] = next_valid

    return result
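
# Worked example (hypothetical offsets): if only 'aa' -> 0 and 'ac' -> 900 are
# known, the missing 'ab' slot receives 900 (the next valid offset), and every
# prefix past the last known one receives file_size, so scans over absent
# prefixes terminate immediately.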


def format_offset_array(offsets: list[int], name: str) -> str:
    """Format an offset array as a nicely aligned C++ constexpr table."""
    lines = [f"// Two-letter prefix jump table: {name}[prefix_to_index(c1, c2)] = byte offset"]
    lines.append("// Prefixes: aa=0, ab=1, ... az=25, ba=26, ... zz=675")
    lines.append(f"constexpr uint32_t {name}[676] = {{")

    # Format 13 values per line (fits nicely with 10-digit numbers plus commas)
    values_per_line = 13
    for i in range(0, 676, values_per_line):
        chunk = offsets[i:i + values_per_line]
        prefix_start = index_to_prefix(i)
        prefix_end = index_to_prefix(min(i + values_per_line - 1, 675))
        values_str = ', '.join(f'{v:>10}' for v in chunk)
        lines.append(f"    {values_str}, // {prefix_start}-{prefix_end}")

    lines.append("};")
    return '\n'.join(lines)
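
# A generated table line looks roughly like this (placeholder offsets, shortened):
#            0,       1842,       3017, ...,       9210, // aa-am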


def generate_header(idx_offsets: list[int], syn_offsets: list[int] | None, output_path: pathlib.Path) -> None:
    """Generate the C++ header file with the prefix offset tables."""

    content = '''#pragma once

// Auto-generated by generate_dict_index.py. Do not edit manually.
// This file contains pre-computed prefix jump tables for fast dictionary lookup.

#include <cstdint>

namespace DictPrefixIndex {

// Convert two-letter prefix to index (0-675)
// "aa" -> 0, "ab" -> 1, ... "az" -> 25, "ba" -> 26, ... "zz" -> 675
inline uint16_t prefixToIndex(char c1, char c2) {
    // Convert to lowercase and compute the index
    const int i1 = (c1 | 0x20) - 'a';  // tolower via OR with 0x20
    const int i2 = (c2 | 0x20) - 'a';
    // Bounds check (returns 0 for non-alpha characters)
    if (i1 < 0 || i1 > 25 || i2 < 0 || i2 > 25) return 0;
    return static_cast<uint16_t>(i1 * 26 + i2);
}

// Check if a character is alphabetic (a-z or A-Z)
inline bool isAlpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

'''

    content += format_offset_array(idx_offsets, "dictPrefixOffsets")
    content += '\n\n'

    if syn_offsets:
        content += format_offset_array(syn_offsets, "synPrefixOffsets")
    else:
        content += "// No synonym file processed - synPrefixOffsets not generated\n"
        content += "constexpr uint32_t synPrefixOffsets[676] = {0};\n"

    content += '\n} // namespace DictPrefixIndex\n'

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content)
    print(f"Generated: {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate StarDict prefix jump tables")
    parser.add_argument('--idx', type=str, help='Path to .idx file')
    parser.add_argument('--syn', type=str, help='Path to .syn file (optional)')
    parser.add_argument('--zip', type=str, help='Path to dictionary zip file (alternative to --idx/--syn)')
    parser.add_argument('--output', type=str, required=True, help='Output header path')

    args = parser.parse_args()

    idx_offsets: dict[int, int] = {}
    syn_offsets: dict[int, int] | None = None
    idx_file_size = 0
    syn_file_size = 0

    if args.zip:
        # Extract from a zip file
        zip_path = pathlib.Path(args.zip)
        print(f"Processing zip file: {zip_path}")

        with zipfile.ZipFile(zip_path, 'r') as zf:
            # Find the .idx (and optional .syn) members
            idx_name = None
            syn_name = None
            for name in zf.namelist():
                if name.endswith('.idx'):
                    idx_name = name
                elif name.endswith('.syn'):
                    syn_name = name

            if not idx_name:
                raise SystemExit("No .idx file found in zip")

            print(f"\nParsing index file: {idx_name}")
            with zf.open(idx_name) as f:
                idx_file_size = zf.getinfo(idx_name).file_size
                idx_offsets = parse_idx_file(f, idx_file_size)

            if syn_name:
                print(f"\nParsing synonym file: {syn_name}")
                with zf.open(syn_name) as f:
                    syn_file_size = zf.getinfo(syn_name).file_size
                    syn_offsets = parse_syn_file(f, syn_file_size)
    else:
        # Read from individual files
        if not args.idx:
            raise SystemExit("Either --zip or --idx must be provided")

        idx_path = pathlib.Path(args.idx)
        print(f"Processing index file: {idx_path}")
        idx_file_size = idx_path.stat().st_size
        with open(idx_path, 'rb') as f:
            idx_offsets = parse_idx_file(f, idx_file_size)

        if args.syn:
            syn_path = pathlib.Path(args.syn)
            print(f"\nProcessing synonym file: {syn_path}")
            syn_file_size = syn_path.stat().st_size
            with open(syn_path, 'rb') as f:
                syn_offsets = parse_syn_file(f, syn_file_size)

    # Fill in missing prefixes
    print("\nFilling missing prefixes...")
    idx_offsets_filled = fill_missing_prefixes(idx_offsets, idx_file_size)
    syn_offsets_filled = fill_missing_prefixes(syn_offsets, syn_file_size) if syn_offsets else None

    # Generate the header
    print("\nGenerating header file...")
    generate_header(idx_offsets_filled, syn_offsets_filled, pathlib.Path(args.output))

    # Print some statistics
    print("\n=== Statistics ===")
    print(f"Index file size: {idx_file_size:,} bytes")
    if syn_file_size:
        print(f"Synonym file size: {syn_file_size:,} bytes")

    # Show where some common prefixes land in the index
    print("\nSample prefix offsets:")
    for prefix in ['aa', 'he', 'th', 'wo', 'zz']:
        idx = prefix_to_index(prefix[0], prefix[1])
        offset = idx_offsets_filled[idx]
        pct = (offset / idx_file_size) * 100 if idx_file_size else 0
        print(f"  '{prefix}' (index {idx}): offset {offset:,} ({pct:.1f}% into file)")


if __name__ == '__main__':
    main()