#!/usr/bin/env python3
"""Generate prefix jump tables for StarDict dictionary lookup optimization.
This script parses StarDict .idx and .syn files and generates a C++ header
with pre-computed byte offsets for two-letter prefixes (aa-zz). This enables
near-instant lookup by jumping directly to the relevant section of the index.
Usage:
./scripts/generate_dict_index.py --idx path/to/dict.idx --syn path/to/dict.syn --output lib/StarDict/DictPrefixIndex.generated.h
Or extract from a zip file:
./scripts/generate_dict_index.py --zip dict-en-en.zip --output lib/StarDict/DictPrefixIndex.generated.h
"""

from __future__ import annotations

import argparse
import pathlib
import zipfile
from typing import BinaryIO


def prefix_to_index(c1: str, c2: str) -> int:
    """Convert a two-letter prefix to an index (0-675).

    'aa' -> 0, 'ab' -> 1, ... 'zz' -> 675
    """
    return (ord(c1.lower()) - ord('a')) * 26 + (ord(c2.lower()) - ord('a'))


def index_to_prefix(idx: int) -> str:
    """Convert an index back to its two-letter prefix, for debugging."""
    c1 = chr(ord('a') + idx // 26)
    c2 = chr(ord('a') + idx % 26)
    return c1 + c2
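
# Illustrative sanity check (safe to delete): the two helpers above should
# round-trip any lowercase pair. These run once at import time and are cheap.
assert prefix_to_index('a', 'a') == 0 and prefix_to_index('z', 'z') == 675
assert index_to_prefix(prefix_to_index('t', 'h')) == 'th'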


def is_alpha(c: str) -> bool:
    """Check if a character is a-z or A-Z."""
    return ('a' <= c <= 'z') or ('A' <= c <= 'Z')


def read_null_terminated_string(f: BinaryIO) -> tuple[str, int]:
    """Read a null-terminated string from a file.

    Returns (string, bytes_read including the null terminator).
    """
    raw = bytearray()
    bytes_read = 0
    while True:
        b = f.read(1)
        if not b:
            break
        bytes_read += 1
        if b == b'\x00':
            break
        raw += b
    # Decode once at the end: decoding byte-by-byte would mangle multi-byte
    # UTF-8 sequences, replacing each byte of a character individually.
    return raw.decode('utf-8', errors='replace'), bytes_read
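
# Illustrative self-check (safe to delete): parsing an in-memory stream.
# b'cat\x00' yields ('cat', 4) -- three content bytes plus the terminator --
# and leaves the stream positioned at the next entry's first byte.
import io  # needed only for this demo

assert read_null_terminated_string(io.BytesIO(b'cat\x00rest')) == ('cat', 4)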


def parse_idx_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    """Parse a StarDict .idx file and build a prefix -> offset mapping.

    The .idx file format is:
        [word\\0][offset: 4 bytes BE][size: 4 bytes BE]
        ...repeated for each word...

    Returns a dict mapping prefix index (0-675) to the first byte offset
    for that prefix.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    words_processed = 0
    while current_position < file_size:
        entry_start = current_position
        # Read the headword
        word, word_bytes = read_null_terminated_string(f)
        if word_bytes == 0:  # EOF
            break
        current_position += word_bytes
        # Skip the 8-byte payload (data offset + size, both big-endian)
        data = f.read(8)
        if len(data) != 8:
            break
        current_position += 8
        # Record a prefix if the word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])
            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start
        words_processed += 1
        if words_processed % 100000 == 0:
            print(f"  Processed {words_processed} words...")
    print(f"  Total words processed: {words_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")
    return prefix_offsets
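
# Note (descriptive; not needed by the parser above): each entry's 8-byte
# payload holds (data_offset, data_size) as big-endian uint32s pointing into
# the .dict file. Only the entry's position within the .idx itself matters
# here, so the payload is skipped undecoded; struct.unpack('>II', data)
# would recover the values if they were ever needed (after `import struct`).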


def parse_syn_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    """Parse a StarDict .syn file and build a prefix -> offset mapping.

    The .syn file format is:
        [synonym_word\\0][main_entry_index: 4 bytes BE]
        ...repeated for each synonym...

    Returns a dict mapping prefix index (0-675) to the first byte offset
    for that prefix.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    synonyms_processed = 0
    while current_position < file_size:
        entry_start = current_position
        # Read the synonym word
        word, word_bytes = read_null_terminated_string(f)
        if word_bytes == 0:  # EOF
            break
        current_position += word_bytes
        # Skip the 4-byte payload (index of the main entry, big-endian)
        data = f.read(4)
        if len(data) != 4:
            break
        current_position += 4
        # Record a prefix if the word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])
            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start
        synonyms_processed += 1
        if synonyms_processed % 100000 == 0:
            print(f"  Processed {synonyms_processed} synonyms...")
    print(f"  Total synonyms processed: {synonyms_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")
    return prefix_offsets


def fill_missing_prefixes(prefix_offsets: dict[int, int], file_size: int) -> list[int]:
    """Fill in missing prefixes with the next available offset.

    If a prefix has no entries (e.g. no words start with 'qx'), its slot is
    set to the next prefix's offset, so a scan starting there immediately
    finds nothing and moves on.
    """
    result = [0] * 676
    # First pass: fill in the known offsets
    for idx, offset in prefix_offsets.items():
        result[idx] = offset
    # Second pass: fill missing slots with the next known offset (or
    # file_size). Work backwards so each missing entry picks up the nearest
    # following valid offset.
    next_valid = file_size
    for idx in range(675, -1, -1):
        if idx in prefix_offsets:
            next_valid = prefix_offsets[idx]
        else:
            result[idx] = next_valid
    return result
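
# Worked example (hypothetical offsets): with file_size=1000 and known
# offsets {0: 0, 2: 120} ('aa' at byte 0, 'ac' at byte 120), the missing
# 'ab' slot borrows the offset of 'ac' and everything from 'ad' onward gets
# file_size, so lookups for absent prefixes terminate immediately.
assert fill_missing_prefixes({0: 0, 2: 120}, 1000)[:4] == [0, 120, 120, 1000]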


def format_offset_array(offsets: list[int], name: str) -> str:
    """Format an offset array as a nicely laid-out C++ constexpr table."""
    lines = [f"// Two-letter prefix jump table: {name}[prefix_to_index(c1, c2)] = byte offset"]
    lines.append("// Prefixes: aa=0, ab=1, ... az=25, ba=26, ... zz=675")
    lines.append(f"constexpr uint32_t {name}[676] = {{")
    # Format 13 values per line (fits nicely with 10-digit numbers + commas)
    values_per_line = 13
    for i in range(0, 676, values_per_line):
        chunk = offsets[i:i + values_per_line]
        prefix_start = index_to_prefix(i)
        prefix_end = index_to_prefix(min(i + values_per_line - 1, 675))
        values_str = ', '.join(f'{v:>10}' for v in chunk)
        lines.append(f"    {values_str}, // {prefix_start}-{prefix_end}")
    lines.append("};")
    return '\n'.join(lines)
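
# The emitted table looks roughly like this (offset values are illustrative):
#
#   constexpr uint32_t dictPrefixOffsets[676] = {
#            0,       1042,       2310, ...        9984, // aa-am
#        10240,      11710, ...
#   };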


def generate_header(idx_offsets: list[int], syn_offsets: list[int] | None, output_path: pathlib.Path) -> None:
    """Generate the C++ header file with the prefix offset tables."""
    content = '''#pragma once
// Auto-generated by generate_dict_index.py. Do not edit manually.
// This file contains pre-computed prefix jump tables for fast dictionary lookup.

#include <cstdint>

namespace DictPrefixIndex {

// Convert a two-letter prefix to an index (0-675)
// "aa" -> 0, "ab" -> 1, ... "az" -> 25, "ba" -> 26, ... "zz" -> 675
inline uint16_t prefixToIndex(char c1, char c2) {
    // Convert to lowercase and compute the index
    const int i1 = (c1 | 0x20) - 'a';  // tolower via OR with 0x20
    const int i2 = (c2 | 0x20) - 'a';
    // Bounds check (returns 0 for non-alpha characters)
    if (i1 < 0 || i1 > 25 || i2 < 0 || i2 > 25) return 0;
    return static_cast<uint16_t>(i1 * 26 + i2);
}

// Check if a character is alphabetic (a-z or A-Z)
inline bool isAlpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

'''
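
    # How the table is meant to be consumed on the C++ side (a sketch of the
    # intended use, not code verified against this repo): seek to
    # dictPrefixOffsets[prefixToIndex(w[0], w[1])] in the .idx file, then scan
    # entries linearly until the two-letter prefix stops matching.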
    content += format_offset_array(idx_offsets, "dictPrefixOffsets")
    content += '\n\n'
    if syn_offsets:
        content += format_offset_array(syn_offsets, "synPrefixOffsets")
    else:
        content += "// No synonym file processed - synPrefixOffsets not generated\n"
        content += "constexpr uint32_t synPrefixOffsets[676] = {0};\n"
    content += '\n} // namespace DictPrefixIndex\n'

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content)
    print(f"Generated: {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate StarDict prefix jump tables")
    parser.add_argument('--idx', type=str, help='Path to .idx file')
    parser.add_argument('--syn', type=str, help='Path to .syn file (optional)')
    parser.add_argument('--zip', type=str, help='Path to dictionary zip file (alternative to --idx/--syn)')
    parser.add_argument('--output', type=str, required=True, help='Output header path')
    args = parser.parse_args()

    idx_offsets: dict[int, int] = {}
    syn_offsets: dict[int, int] | None = None
    idx_file_size = 0
    syn_file_size = 0

    if args.zip:
        # Read directly out of a zip file
        zip_path = pathlib.Path(args.zip)
        print(f"Processing zip file: {zip_path}")
        with zipfile.ZipFile(zip_path, 'r') as zf:
            # Find the .idx (and optional .syn) members
            idx_name = None
            syn_name = None
            for name in zf.namelist():
                if name.endswith('.idx'):
                    idx_name = name
                elif name.endswith('.syn'):
                    syn_name = name
            if not idx_name:
                raise SystemExit("No .idx file found in zip")
            print(f"\nParsing index file: {idx_name}")
            with zf.open(idx_name) as f:
                idx_file_size = zf.getinfo(idx_name).file_size
                idx_offsets = parse_idx_file(f, idx_file_size)
            if syn_name:
                print(f"\nParsing synonym file: {syn_name}")
                with zf.open(syn_name) as f:
                    syn_file_size = zf.getinfo(syn_name).file_size
                    syn_offsets = parse_syn_file(f, syn_file_size)
    else:
        # Read from individual files
        if not args.idx:
            raise SystemExit("Either --zip or --idx must be provided")
        idx_path = pathlib.Path(args.idx)
        print(f"Processing index file: {idx_path}")
        idx_file_size = idx_path.stat().st_size
        with open(idx_path, 'rb') as f:
            idx_offsets = parse_idx_file(f, idx_file_size)
        if args.syn:
            syn_path = pathlib.Path(args.syn)
            print(f"\nProcessing synonym file: {syn_path}")
            syn_file_size = syn_path.stat().st_size
            with open(syn_path, 'rb') as f:
                syn_offsets = parse_syn_file(f, syn_file_size)

    # Fill in missing prefixes (None means no .syn file was given)
    print("\nFilling missing prefixes...")
    idx_offsets_filled = fill_missing_prefixes(idx_offsets, idx_file_size)
    syn_offsets_filled = fill_missing_prefixes(syn_offsets, syn_file_size) if syn_offsets is not None else None

    # Generate the header
    print("\nGenerating header file...")
    generate_header(idx_offsets_filled, syn_offsets_filled, pathlib.Path(args.output))

    # Print some statistics
    print("\n=== Statistics ===")
    print(f"Index file size: {idx_file_size:,} bytes")
    if syn_file_size:
        print(f"Synonym file size: {syn_file_size:,} bytes")

    # Show where a few common prefixes land in the index
    print("\nSample prefix offsets:")
    for prefix in ['aa', 'he', 'th', 'wo', 'zz']:
        idx = prefix_to_index(prefix[0], prefix[1])
        offset = idx_offsets_filled[idx]
        pct = (offset / idx_file_size) * 100 if idx_file_size else 0
        print(f"  '{prefix}' (index {idx}): offset {offset:,} ({pct:.1f}% into file)")


if __name__ == '__main__':
    main()