#!/usr/bin/env python3 """Generate prefix jump tables for StarDict dictionary lookup optimization. This script parses StarDict .idx and .syn files and generates a C++ header with pre-computed byte offsets for two-letter prefixes (aa-zz). This enables near-instant lookup by jumping directly to the relevant section of the index. Usage: ./scripts/generate_dict_index.py --idx path/to/dict.idx --syn path/to/dict.syn --output lib/StarDict/DictPrefixIndex.generated.h Or extract from a zip file: ./scripts/generate_dict_index.py --zip dict-en-en.zip --output lib/StarDict/DictPrefixIndex.generated.h """ from __future__ import annotations import argparse import pathlib import struct import zipfile from typing import BinaryIO def prefix_to_index(c1: str, c2: str) -> int: """Convert two-letter prefix to index (0-675). 'aa' -> 0, 'ab' -> 1, ... 'zz' -> 675 """ return (ord(c1.lower()) - ord('a')) * 26 + (ord(c2.lower()) - ord('a')) def index_to_prefix(idx: int) -> str: """Convert index back to two-letter prefix for debugging.""" c1 = chr(ord('a') + idx // 26) c2 = chr(ord('a') + idx % 26) return c1 + c2 def is_alpha(c: str) -> bool: """Check if character is a-z or A-Z.""" return ('a' <= c <= 'z') or ('A' <= c <= 'Z') def read_null_terminated_string(f: BinaryIO) -> tuple[str, int]: """Read a null-terminated string from file. Returns (string, bytes_read including null terminator). """ chars = [] bytes_read = 0 while True: b = f.read(1) if not b: break bytes_read += 1 if b == b'\x00': break chars.append(b.decode('utf-8', errors='replace')) return ''.join(chars), bytes_read def parse_idx_file(f: BinaryIO, file_size: int) -> dict[int, int]: """Parse StarDict .idx file and build prefix -> offset mapping. The .idx file format is: [word\0][offset:4 bytes BE][size:4 bytes BE] ...repeated for each word... Returns dict mapping prefix index (0-675) to first byte offset for that prefix. """ prefix_offsets: dict[int, int] = {} current_position = 0 words_processed = 0 while current_position < file_size: entry_start = current_position # Read the word word, word_bytes = read_null_terminated_string(f) if not word: break current_position += word_bytes # Read 8 bytes (offset + size, both big-endian) data = f.read(8) if len(data) != 8: break current_position += 8 # Extract prefix if word has at least 2 alphabetic characters if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]): prefix_idx = prefix_to_index(word[0], word[1]) # Only record the first occurrence of each prefix if prefix_idx not in prefix_offsets: prefix_offsets[prefix_idx] = entry_start words_processed += 1 if words_processed % 100000 == 0: print(f" Processed {words_processed} words...") print(f" Total words processed: {words_processed}") print(f" Unique prefixes found: {len(prefix_offsets)}") return prefix_offsets def parse_syn_file(f: BinaryIO, file_size: int) -> dict[int, int]: """Parse StarDict .syn file and build prefix -> offset mapping. The .syn file format is: [synonym_word\0][main_entry_index:4 bytes BE] ...repeated for each synonym... Returns dict mapping prefix index (0-675) to first byte offset for that prefix. """ prefix_offsets: dict[int, int] = {} current_position = 0 synonyms_processed = 0 while current_position < file_size: entry_start = current_position # Read the synonym word word, word_bytes = read_null_terminated_string(f) if not word: break current_position += word_bytes # Read 4 bytes (index to main entry, big-endian) data = f.read(4) if len(data) != 4: break current_position += 4 # Extract prefix if word has at least 2 alphabetic characters if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]): prefix_idx = prefix_to_index(word[0], word[1]) # Only record the first occurrence of each prefix if prefix_idx not in prefix_offsets: prefix_offsets[prefix_idx] = entry_start synonyms_processed += 1 if synonyms_processed % 100000 == 0: print(f" Processed {synonyms_processed} synonyms...") print(f" Total synonyms processed: {synonyms_processed}") print(f" Unique prefixes found: {len(prefix_offsets)}") return prefix_offsets def fill_missing_prefixes(prefix_offsets: dict[int, int], file_size: int) -> list[int]: """Fill in missing prefixes with the next available offset. If a prefix doesn't exist (e.g., no words starting with 'qx'), we set its offset to the next prefix's offset so the scan will quickly find nothing and move on. """ result = [0] * 676 # First pass: fill in known offsets for idx, offset in prefix_offsets.items(): result[idx] = offset # Second pass: fill missing with next known offset (or file_size) # Work backwards so each missing entry gets the next valid offset next_valid = file_size for idx in range(675, -1, -1): if idx in prefix_offsets: next_valid = prefix_offsets[idx] else: result[idx] = next_valid return result def format_offset_array(offsets: list[int], name: str) -> str: """Format offset array as C++ constexpr with nice formatting.""" lines = [f"// Two-letter prefix jump table: {name}[prefix_to_index(c1, c2)] = byte offset"] lines.append(f"// Prefixes: aa=0, ab=1, ... az=25, ba=26, ... zz=675") lines.append(f"constexpr uint32_t {name}[676] = {{") # Format 13 values per line (fits nicely with 10-digit numbers + commas) values_per_line = 13 for i in range(0, 676, values_per_line): chunk = offsets[i:i + values_per_line] prefix_start = index_to_prefix(i) prefix_end = index_to_prefix(min(i + values_per_line - 1, 675)) values_str = ', '.join(f'{v:>10}' for v in chunk) lines.append(f" {values_str}, // {prefix_start}-{prefix_end}") lines.append("};") return '\n'.join(lines) def generate_header(idx_offsets: list[int], syn_offsets: list[int] | None, output_path: pathlib.Path) -> None: """Generate the C++ header file with prefix offset tables.""" content = '''#pragma once // Auto-generated by generate_dict_index.py. Do not edit manually. // This file contains pre-computed prefix jump tables for fast dictionary lookup. #include namespace DictPrefixIndex { // Convert two-letter prefix to index (0-675) // "aa" -> 0, "ab" -> 1, ... "az" -> 25, "ba" -> 26, ... "zz" -> 675 inline uint16_t prefixToIndex(char c1, char c2) { // Convert to lowercase and compute index const int i1 = (c1 | 0x20) - 'a'; // tolower via OR with 0x20 const int i2 = (c2 | 0x20) - 'a'; // Bounds check (returns 0 for non-alpha characters) if (i1 < 0 || i1 > 25 || i2 < 0 || i2 > 25) return 0; return static_cast(i1 * 26 + i2); } // Check if character is alphabetic (a-z or A-Z) inline bool isAlpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } ''' content += format_offset_array(idx_offsets, "dictPrefixOffsets") content += '\n\n' if syn_offsets: content += format_offset_array(syn_offsets, "synPrefixOffsets") else: content += "// No synonym file processed - synPrefixOffsets not generated\n" content += "constexpr uint32_t synPrefixOffsets[676] = {0};\n" content += '\n} // namespace DictPrefixIndex\n' output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(content) print(f"Generated: {output_path}") def main() -> None: parser = argparse.ArgumentParser(description="Generate StarDict prefix jump tables") parser.add_argument('--idx', type=str, help='Path to .idx file') parser.add_argument('--syn', type=str, help='Path to .syn file (optional)') parser.add_argument('--zip', type=str, help='Path to dictionary zip file (alternative to --idx/--syn)') parser.add_argument('--output', type=str, required=True, help='Output header path') args = parser.parse_args() idx_offsets: dict[int, int] = {} syn_offsets: dict[int, int] | None = None idx_file_size = 0 syn_file_size = 0 if args.zip: # Extract from zip file zip_path = pathlib.Path(args.zip) print(f"Processing zip file: {zip_path}") with zipfile.ZipFile(zip_path, 'r') as zf: # Find .idx file idx_name = None syn_name = None for name in zf.namelist(): if name.endswith('.idx'): idx_name = name elif name.endswith('.syn'): syn_name = name if not idx_name: raise SystemExit("No .idx file found in zip") print(f"\nParsing index file: {idx_name}") with zf.open(idx_name) as f: idx_file_size = zf.getinfo(idx_name).file_size idx_offsets = parse_idx_file(f, idx_file_size) if syn_name: print(f"\nParsing synonym file: {syn_name}") with zf.open(syn_name) as f: syn_file_size = zf.getinfo(syn_name).file_size syn_offsets = parse_syn_file(f, syn_file_size) else: # Read from individual files if not args.idx: raise SystemExit("Either --zip or --idx must be provided") idx_path = pathlib.Path(args.idx) print(f"Processing index file: {idx_path}") idx_file_size = idx_path.stat().st_size with open(idx_path, 'rb') as f: idx_offsets = parse_idx_file(f, idx_file_size) if args.syn: syn_path = pathlib.Path(args.syn) print(f"\nProcessing synonym file: {syn_path}") syn_file_size = syn_path.stat().st_size with open(syn_path, 'rb') as f: syn_offsets = parse_syn_file(f, syn_file_size) # Fill in missing prefixes print("\nFilling missing prefixes...") idx_offsets_filled = fill_missing_prefixes(idx_offsets, idx_file_size) syn_offsets_filled = fill_missing_prefixes(syn_offsets, syn_file_size) if syn_offsets else None # Generate header print("\nGenerating header file...") generate_header(idx_offsets_filled, syn_offsets_filled, pathlib.Path(args.output)) # Print some statistics print("\n=== Statistics ===") print(f"Index file size: {idx_file_size:,} bytes") if syn_file_size: print(f"Synonym file size: {syn_file_size:,} bytes") # Show distribution of some common prefixes print("\nSample prefix offsets:") for prefix in ['aa', 'he', 'th', 'wo', 'zz']: idx = prefix_to_index(prefix[0], prefix[1]) offset = idx_offsets_filled[idx] pct = (offset / idx_file_size) * 100 if idx_file_size else 0 print(f" '{prefix}' (index {idx}): offset {offset:,} ({pct:.1f}% into file)") if __name__ == '__main__': main()