Files
crosspoint-reader-mod/lib/EpdFont/scripts/fontconvert.py
danoob 5050992bd6 feat: Vietnamese glyphs support (#1147)
## Summary

* **What is the goal of this PR?**
Add Vietnamese glyphs support for the reader's built-in fonts, enabling
proper rendering of Vietnamese text in EPUB content.

* **What changes are included?**
- Added 3 new Unicode intervals to `fontconvert.py` covering Vietnamese
characters:
- **Latin Extended-B** (Vietnamese subset only): `U+01A0–U+01A1` (Ơ/ơ) and
`U+01AF–U+01B0` (Ư/ư), added as two separate intervals
- **Vietnamese Extended**: `U+1EA0–U+1EF9` — All precomposed Vietnamese
characters with tone marks (Ả, Ấ, Ầ, Ẩ, Ẫ, Ậ, Ắ, …, Ỹ)
- Re-generated all 54 built-in font header files (Bookerly, Noto Sans,
OpenDyslexic, Ubuntu across all sizes and styles) to include the new
Vietnamese glyphs.

## Additional Context

* **Scope**: This PR only covers the **reader** fonts. The outer UI
still uses the Ubuntu font which does not fully support Vietnamese — UI
and i18n will be addressed in a follow-up PR (per discussion in PR
#1124).
* **Memory impact**:

  | Metric | Before | After | Delta |
  |---|---|---|---|
  | Flash Data (`.rodata`) | 2,971,028 B | 3,290,748 B | **+319,720 B (+10.8%)** |
  | Total image size | 4,663,235 B | 4,982,955 B | **+319,720 B (+6.9%)** |
  | Flash usage | 69.1% | 74.0% | **+4.9 pp** |
  | RAM usage | 29.0% | 29.0% | **No change** |

* **Risk**: Low — this is a data-only change (font glyph tables in
`.rodata`). No logic changes, no RAM impact. Flash headroom remains
comfortable at 74%.

---

### AI Usage

Did you use AI tools to help write this code? _**PARTIALLY**_

AI was used to identify the minimal set of Unicode ranges needed for
Vietnamese support and to assist with the PR description.

---------

Co-authored-by: danoooob <danoooob@example.com>
2026-02-24 11:21:39 -06:00

856 lines
32 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!python3
import freetype
import zlib
import sys
import re
import math
import argparse
from collections import namedtuple
from fontTools.ttLib import TTFont
# Originally from https://github.com/vroland/epdiy
# --- Command-line interface ---
parser = argparse.ArgumentParser(
    description="Generate a header file from a font to be used with epdiy.",
)
parser.add_argument("name", action="store", help="name of the font.")
parser.add_argument("size", type=int, help="font size to use.")
parser.add_argument(
    "fontstack",
    action="store",
    nargs='+',
    help="list of font files, ordered by descending priority.",
)
parser.add_argument(
    "--2bit",
    dest="is2Bit",
    action="store_true",
    help="generate 2-bit greyscale bitmap instead of 1-bit black and white.",
)
parser.add_argument(
    "--additional-intervals",
    dest="additional_intervals",
    action="append",
    help="Additional code point intervals to export as min,max. This argument can be repeated.",
)
parser.add_argument(
    "--compress",
    dest="compress",
    action="store_true",
    help="Compress glyph bitmaps using DEFLATE with group-based compression.",
)
parser.add_argument(
    "--force-autohint",
    dest="force_autohint",
    action="store_true",
    help="Force FreeType auto-hinter instead of native font hinting. Improves stem width consistency for fonts with weak or no native TrueType hints.",
)
args = parser.parse_args()

# One record per generated glyph; all fields except code_point are printed
# into the generated EpdGlyph table.
GlyphProps = namedtuple(
    "GlyphProps",
    [
        "width",
        "height",
        "advance_x",
        "left",
        "top",
        "data_length",
        "data_offset",
        "code_point",
    ],
)

# Faces are opened in priority order: the first face that has a glyph wins.
font_stack = [freetype.Face(path) for path in args.fontstack]
is2Bit = args.is2Bit
size = args.size
font_name = args.name

# Glyphs are always rendered on load; auto-hinting is opt-in.
load_flags = freetype.FT_LOAD_RENDER
if args.force_autohint:
    load_flags = load_flags | freetype.FT_LOAD_FORCE_AUTOHINT
# Inclusive Unicode code point intervals to export.
# The list does not have to be sorted or free of overlaps: further down it is
# combined with --additional-intervals, sorted, overlapping entries are merged
# and code points that no font in the stack provides are dropped.
intervals = [
    ### Basic Latin ###
    # ASCII letters, digits, punctuation, control characters
    (0x0000, 0x007F),
    ### Latin-1 Supplement ###
    # Accented characters for Western European languages
    (0x0080, 0x00FF),
    ### Latin Extended-A ###
    # Eastern European and Baltic languages
    (0x0100, 0x017F),
    ### Latin Extended-B (Vietnamese subset only) ###
    # Only Ơ/ơ (U+01A0-01A1), Ư/ư (U+01AF-01B0) for Vietnamese
    (0x01A0, 0x01A1),
    (0x01AF, 0x01B0),
    ### Vietnamese Extended ###
    # All precomposed Vietnamese characters with tone marks
    # Ả Ấ Ầ Ẩ Ẫ Ậ Ắ Ằ Ẳ Ẵ Ặ Ẹ Ẻ Ẽ Ế Ề Ể Ễ Ệ Ỉ Ị Ọ Ỏ Ố Ồ Ổ Ỗ Ộ Ớ Ờ Ở Ỡ Ợ Ụ Ủ Ứ Ừ Ử Ữ Ự Ỳ Ỵ Ỷ Ỹ
    (0x1EA0, 0x1EF9),
    ### General Punctuation (entire block U+2000-U+206F) ###
    # Smart quotes, en dash, em dash, ellipsis, NO-BREAK SPACE
    (0x2000, 0x206F),
    ### Basic Symbols From "Latin-1 + Misc" ###
    # NOTE: the next two ranges lie inside (0x2000, 0x206F) above, so they are
    # absorbed when the intervals are merged.
    # dashes, quotes, prime marks
    (0x2010, 0x203A),
    # misc punctuation
    (0x2040, 0x205F),
    # common currency symbols
    (0x20A0, 0x20CF),
    ### Combining Diacritical Marks (U+0300-U+036F) ###
    # Needed for proper rendering of many extended Latin languages
    (0x0300, 0x036F),
    ### Greek & Coptic ###
    # Used in science, maths, philosophy, some academic texts
    # (currently disabled)
    # (0x0370, 0x03FF),
    ### Cyrillic ###
    # Russian, Ukrainian, Bulgarian, etc.
    (0x0400, 0x04FF),
    ### Math Symbols (common subset) ###
    # Superscripts and Subscripts
    (0x2070, 0x209F),
    # General math operators
    (0x2200, 0x22FF),
    # Arrows
    (0x2190, 0x21FF),
    ### CJK ###
    # (all CJK/Hangul/Kana ranges below are currently disabled)
    # Core Unified Ideographs
    # (0x4E00, 0x9FFF),
    # # Extension A
    # (0x3400, 0x4DBF),
    # # Extension B
    # (0x20000, 0x2A6DF),
    # # Extension CF
    # (0x2A700, 0x2EBEF),
    # # Extension G
    # (0x30000, 0x3134F),
    # # Hiragana
    # (0x3040, 0x309F),
    # # Katakana
    # (0x30A0, 0x30FF),
    # # Katakana Phonetic Extensions
    # (0x31F0, 0x31FF),
    # # Halfwidth Katakana
    # (0xFF60, 0xFF9F),
    # # Hangul Syllables
    # (0xAC00, 0xD7AF),
    # # Hangul Jamo
    # (0x1100, 0x11FF),
    # # Hangul Compatibility Jamo
    # (0x3130, 0x318F),
    # # Hangul Jamo Extended-A
    # (0xA960, 0xA97F),
    # # Hangul Jamo Extended-B
    # (0xD7B0, 0xD7FF),
    # # CJK Radicals Supplement
    # (0x2E80, 0x2EFF),
    # # Kangxi Radicals
    # (0x2F00, 0x2FDF),
    # # CJK Symbols and Punctuation
    # (0x3000, 0x303F),
    # # CJK Compatibility Forms
    # (0xFE30, 0xFE4F),
    # # CJK Compatibility Ideographs
    # (0xF900, 0xFAFF),
    ### Alphabetic Presentation Forms (Latin ligatures) ###
    # ff, fi, fl, ffi, ffl, long-st, st
    (0xFB00, 0xFB06),
    ### Specials
    # Replacement Character
    (0xFFFD, 0xFFFD),
]
# Extra intervals requested with --additional-intervals, as (min, max) tuples.
# int(..., base=0) accepts decimal as well as 0x/0o/0b prefixed literals.
# Each value is validated here so that a typo is reported as a usage error
# instead of surfacing later as an unpacking failure or, for reversed bounds,
# as a negative glyph count in the generated interval table.
add_ints = []
if args.additional_intervals:
    for spec in args.additional_intervals:
        try:
            bounds = tuple(int(n, base=0) for n in spec.split(","))
        except ValueError:
            parser.error(
                f"invalid --additional-intervals value {spec!r}: bounds must be integers"
            )
        if len(bounds) != 2 or not 0 <= bounds[0] <= bounds[1]:
            parser.error(
                f"invalid --additional-intervals value {spec!r}: "
                f"expected min,max with 0 <= min <= max"
            )
        add_ints.append(bounds)
def norm_floor(val):
    """Scale `val` from 1/64 units down to whole units, rounding down.

    Rounding is toward negative infinity, so negative inputs move away
    from zero. Always returns an int.
    """
    whole_units = val / (1 << 6)
    return int(math.floor(whole_units))
def norm_ceil(val):
    """Scale `val` from 1/64 units down to whole units, rounding up.

    Rounding is toward positive infinity. Always returns an int.
    """
    whole_units = val / (1 << 6)
    return int(math.ceil(whole_units))
def chunks(l, n):
    """Yield consecutive slices of `l` that are `n` items long.

    The final slice is shorter when len(l) is not a multiple of `n`;
    an empty sequence yields nothing.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))
def load_glyph(code_point):
    """Load `code_point` from the highest-priority face that contains it.

    Walks `font_stack` in order and loads the glyph, using the module-level
    `load_flags`, into the first face whose character map has an entry for
    the code point.

    Returns that face, or None when no face in the stack provides the
    code point.
    """
    for candidate in font_stack:
        glyph_index = candidate.get_char_index(code_point)
        if glyph_index > 0:
            candidate.load_glyph(glyph_index, load_flags)
            return candidate
    return None
# --- Normalise the requested intervals ---
# Step 1: sort built-in and user-supplied intervals and merge the ones that
# overlap, so every code point is exported at most once.
unmerged_intervals = sorted(intervals + add_ints)
intervals = []
unvalidated_intervals = []
for i_start, i_end in unmerged_intervals:
    # `<=` rather than `<`: two intervals that share only their boundary code
    # point still overlap; leaving them unmerged would emit that glyph twice
    # and produce overlapping entries in the interval table.
    if unvalidated_intervals and i_start <= unvalidated_intervals[-1][1]:
        prev_start, prev_end = unvalidated_intervals[-1]
        unvalidated_intervals[-1] = (prev_start, max(prev_end, i_end))
        continue
    unvalidated_intervals.append((i_start, i_end))

# Step 2: split each merged interval around code points that no face in the
# font stack provides, so `intervals` only covers glyphs that really exist.
for i_start, i_end in unvalidated_intervals:
    start = i_start
    for code_point in range(i_start, i_end + 1):
        face = load_glyph(code_point)
        if face is None:
            # Close the run that ended just before the missing code point.
            if start < code_point:
                intervals.append((start, code_point - 1))
            start = code_point + 1
    # Flush the trailing run unless the interval ended on a missing glyph.
    if start != i_end + 1:
        intervals.append((start, i_end))
# Rasterisation size: `size` points at 150 DPI, for every face in the stack.
for face in font_stack:
    face.set_char_size(size << 6, size << 6, 150, 150)

# --- Rasterise every glyph and pack its bitmap ---
# all_glyphs collects (GlyphProps, packed_bytes) in code point order;
# total_size is the running byte offset of the next glyph's bitmap.
total_size = 0
all_glyphs = []
for i_start, i_end in intervals:
    for code_point in range(i_start, i_end + 1):
        face = load_glyph(code_point)
        bitmap = face.glyph.bitmap

        # Build out 4-bit greyscale bitmap: two pixels per byte, even x in the
        # low nibble and odd x in the high nibble. Rows with an odd width are
        # padded so that every row starts on a byte boundary.
        pixels4g = []
        px = 0
        for i, v in enumerate(bitmap.buffer):
            x = i % bitmap.width
            if x % 2 == 0:
                px = (v >> 4)
            else:
                px = px | (v & 0xF0)
                pixels4g.append(px)
                px = 0
            # eol
            if x == bitmap.width - 1 and bitmap.width % 2 > 0:
                pixels4g.append(px)
                px = 0

        if is2Bit:
            # 0-3 white, 4-7 light grey, 8-11 dark grey, 12-15 black
            # Downsample to 2-bit bitmap: four pixels per byte, first pixel in
            # the most significant bits, with no per-row padding.
            pixels2b = []
            px = 0
            pitch = (bitmap.width // 2) + (bitmap.width % 2)
            for y in range(bitmap.rows):
                for x in range(bitmap.width):
                    px = px << 2
                    bm = pixels4g[y * pitch + (x // 2)]
                    bm = (bm >> ((x % 2) * 4)) & 0xF

                    if bm >= 12:
                        px += 3
                    elif bm >= 8:
                        px += 2
                    elif bm >= 4:
                        px += 1

                    if (y * bitmap.width + x) % 4 == 3:
                        pixels2b.append(px)
                        px = 0
            # Left-align the pixels of a partially filled last byte.
            if (bitmap.width * bitmap.rows) % 4 != 0:
                px = px << (4 - (bitmap.width * bitmap.rows) % 4) * 2
                pixels2b.append(px)

            # for y in range(bitmap.rows):
            #     line = ''
            #     for x in range(bitmap.width):
            #         pixelPosition = y * bitmap.width + x
            #         byte = pixels2b[pixelPosition // 4]
            #         bit_index = (3 - (pixelPosition % 4)) * 2
            #         line += '#' if ((byte >> bit_index) & 3) > 0 else '.'
            #     print(line)
            # print('')
        else:
            # Downsample to 1-bit bitmap - treat any 2+ as black.
            # Eight pixels per byte, first pixel in the most significant bit.
            pixelsbw = []
            px = 0
            pitch = (bitmap.width // 2) + (bitmap.width % 2)
            for y in range(bitmap.rows):
                for x in range(bitmap.width):
                    px = px << 1
                    bm = pixels4g[y * pitch + (x // 2)]
                    px += 1 if ((x & 1) == 0 and bm & 0xE > 0) or ((x & 1) == 1 and bm & 0xE0 > 0) else 0

                    if (y * bitmap.width + x) % 8 == 7:
                        pixelsbw.append(px)
                        px = 0
            # Left-align the pixels of a partially filled last byte.
            if (bitmap.width * bitmap.rows) % 8 != 0:
                px = px << (8 - (bitmap.width * bitmap.rows) % 8)
                pixelsbw.append(px)

            # for y in range(bitmap.rows):
            #     line = ''
            #     for x in range(bitmap.width):
            #         pixelPosition = y * bitmap.width + x
            #         byte = pixelsbw[pixelPosition // 8]
            #         bit_index = 7 - (pixelPosition % 8)
            #         line += '#' if (byte >> bit_index) & 1 else '.'
            #     print(line)
            # print('')

        pixels = pixels2b if is2Bit else pixelsbw

        # Build output data
        packed = bytes(pixels)
        glyph = GlyphProps(
            width = bitmap.width,
            height = bitmap.rows,
            advance_x = norm_floor(face.glyph.advance.x),
            left = face.glyph.bitmap_left,
            top = face.glyph.bitmap_top,
            data_length = len(packed),
            data_offset = total_size,
            code_point = code_point,
        )
        total_size += len(packed)
        all_glyphs.append((glyph, packed))
# pipe seems to be a good heuristic for the "real" descender
# The face that serves '|' supplies the height/ascender/descender values
# written into the EpdFontData struct at the end of the script, so fail here
# with a clear message rather than midway through the generated header.
face = load_glyph(ord('|'))
if face is None:
    sys.exit("error: no font in the font stack provides '|' (U+007C), "
             "which is required to derive the font metrics")

# Split all_glyphs into one flat list of bitmap bytes and a parallel list of
# glyph properties.
glyph_data = []
glyph_props = []
for props, packed in all_glyphs:
    glyph_data.extend(packed)
    glyph_props.append(props)
# --- Kerning pair extraction ---
# FreeType's get_kerning() does not read the OpenType GPOS table, which is
# where modern fonts keep their kerning. fonttools is used instead so that
# both the legacy kern table and the GPOS 'kern' feature (PairPos lookups,
# including Extension wrappers) are covered.
COMBINING_MARKS_START = 0x0300
COMBINING_MARKS_END = 0x036F

all_codepoints = [g.code_point for g in glyph_props]

# Combining marks are excluded from kerning.
kernable_codepoints = {
    cp for cp in all_codepoints
    if cp < COMBINING_MARKS_START or cp > COMBINING_MARKS_END
}

# For every kernable codepoint, record which entry of the font stack serves
# it (first face with a glyph wins, the same priority rule as load_glyph).
cp_to_face_idx = {}
for cp in kernable_codepoints:
    owner = next(
        (idx for idx, candidate in enumerate(font_stack)
         if candidate.get_char_index(cp) > 0),
        None,
    )
    if owner is not None:
        cp_to_face_idx[cp] = owner

# Invert the mapping: font-stack index -> set of codepoints it serves.
face_idx_cps = {}
for cp, owner in cp_to_face_idx.items():
    if owner not in face_idx_cps:
        face_idx_cps[owner] = set()
    face_idx_cps[owner].add(cp)
def _extract_pairpos_subtable(subtable, glyph_to_cp, raw_kern):
"""Extract kerning from a PairPos subtable (Format 1 or 2)."""
if subtable.Format == 1:
# Individual pairs
for i, coverage_glyph in enumerate(subtable.Coverage.glyphs):
if coverage_glyph not in glyph_to_cp:
continue
pair_set = subtable.PairSet[i]
for pvr in pair_set.PairValueRecord:
if pvr.SecondGlyph not in glyph_to_cp:
continue
xa = 0
if hasattr(pvr, 'Value1') and pvr.Value1:
xa = getattr(pvr.Value1, 'XAdvance', 0) or 0
if xa != 0:
key = (coverage_glyph, pvr.SecondGlyph)
raw_kern[key] = raw_kern.get(key, 0) + xa
elif subtable.Format == 2:
# Class-based pairs
class_def1 = subtable.ClassDef1.classDefs if subtable.ClassDef1 else {}
class_def2 = subtable.ClassDef2.classDefs if subtable.ClassDef2 else {}
coverage_set = set(subtable.Coverage.glyphs)
for left_glyph in glyph_to_cp:
if left_glyph not in coverage_set:
continue
c1 = class_def1.get(left_glyph, 0)
if c1 >= len(subtable.Class1Record):
continue
class1_rec = subtable.Class1Record[c1]
for right_glyph in glyph_to_cp:
c2 = class_def2.get(right_glyph, 0)
if c2 >= len(class1_rec.Class2Record):
continue
c2_rec = class1_rec.Class2Record[c2]
xa = 0
if hasattr(c2_rec, 'Value1') and c2_rec.Value1:
xa = getattr(c2_rec.Value1, 'XAdvance', 0) or 0
if xa != 0:
key = (left_glyph, right_glyph)
raw_kern[key] = raw_kern.get(key, 0) + xa
def extract_kerning_fonttools(font_path, codepoints, ppem):
    """Extract kerning pairs from a font file using fonttools.

    Reads both the legacy 'kern' table and the lookups referenced by the
    GPOS 'kern' feature, summing the values found for each glyph pair.

    Args:
        font_path: path of the font file to open.
        codepoints: iterable of codepoints to consider; pairs involving any
            other codepoint are ignored.
        ppem: pixels per em used to scale font design units to pixels.

    Returns:
        dict of {(leftCp, rightCp): pixel_adjust}. Values are floored to
        integer pixels and clamped to [-128, 127]; pairs that scale to 0
        are omitted.
    """
    glyph_to_cp = {}
    # Raw kerning values in font design units:
    # (left_glyph_name, right_glyph_name) -> design_units
    raw_kern = {}

    # The context manager closes the font file even if a table fails to parse.
    with TTFont(font_path) as font:
        units_per_em = font['head'].unitsPerEm
        cmap = font.getBestCmap() or {}

        # Build glyph_name -> codepoint map (only for requested codepoints)
        for cp in codepoints:
            gname = cmap.get(cp)
            if gname:
                glyph_to_cp[gname] = cp

        # 1. Legacy kern table
        if 'kern' in font:
            for subtable in font['kern'].kernTables:
                if hasattr(subtable, 'kernTable'):
                    for (lg, rg), val in subtable.kernTable.items():
                        if lg in glyph_to_cp and rg in glyph_to_cp:
                            raw_kern[(lg, rg)] = raw_kern.get((lg, rg), 0) + val

        # 2. GPOS 'kern' feature
        if 'GPOS' in font:
            gpos = font['GPOS'].table
            kern_lookup_indices = set()
            if gpos.FeatureList:
                for fr in gpos.FeatureList.FeatureRecord:
                    if fr.FeatureTag == 'kern':
                        kern_lookup_indices.update(fr.Feature.LookupListIndex)
            for li in kern_lookup_indices:
                lookup = gpos.LookupList.Lookup[li]
                for st in lookup.SubTable:
                    actual = st
                    # Unwrap Extension (lookup type 9) wrappers
                    if lookup.LookupType == 9 and hasattr(st, 'ExtSubTable'):
                        actual = st.ExtSubTable
                    if hasattr(actual, 'Format'):
                        _extract_pairpos_subtable(actual, glyph_to_cp, raw_kern)

    # Scale design-unit values to pixels
    scale = ppem / units_per_em
    result = {}  # (leftCp, rightCp) -> adjust
    for (lg, rg), du in raw_kern.items():
        # math.floor rounds toward negative infinity, so a small negative
        # adjustment becomes -1 while a small positive one becomes 0.
        adjust = int(math.floor(du * scale))
        if adjust != 0:
            # Clamp to the int8_t range of the generated kerning matrix.
            adjust = max(-128, min(127, adjust))
            result[(glyph_to_cp[lg], glyph_to_cp[rg])] = adjust
    return result
# Glyphs are rasterised with
#   face.set_char_size(size << 6, size << 6, 150, 150)
# i.e. `size` points at 150 DPI, which corresponds to ppem = size * 150 / 72.
ppem = size * 150.0 / 72.0

# (leftCp, rightCp) -> adjust, gathered from every font file that serves at
# least one kernable codepoint. Each font only sees its own codepoints.
kern_map = {}
for stack_pos, served_cps in face_idx_cps.items():
    kern_map.update(
        extract_kerning_fonttools(args.fontstack[stack_pos], served_cps, ppem)
    )

print(f"kerning: {len(kern_map)} pairs extracted", file=sys.stderr)
# --- Derive class-based kerning from pairs ---
# Left codepoints with identical adjustment rows share a left class, and right
# codepoints with identical adjustment columns share a right class. The pairs
# then collapse into a (left class x right class) matrix. Class ids start at 1.
kern_left_classes = []   # list of (codepoint, classId)
kern_right_classes = []  # list of (codepoint, classId)
kern_matrix = []         # flat list of int8_t values
kern_left_class_count = 0
kern_right_class_count = 0

if kern_map:
    sorted_left_cps = sorted({pair[0] for pair in kern_map})
    sorted_right_cps = sorted({pair[1] for pair in kern_map})

    # Group left codepoints by identical adjustment row. A row seen for the
    # first time receives the next free id.
    left_profile_to_class = {}
    left_class_map = {}
    for lcp in sorted_left_cps:
        row = tuple(kern_map.get((lcp, rcp), 0) for rcp in sorted_right_cps)
        left_class_map[lcp] = left_profile_to_class.setdefault(
            row, len(left_profile_to_class) + 1)

    # Group right codepoints by identical adjustment column
    right_profile_to_class = {}
    right_class_map = {}
    for rcp in sorted_right_cps:
        col = tuple(kern_map.get((lcp, rcp), 0) for lcp in sorted_left_cps)
        right_class_map[rcp] = right_profile_to_class.setdefault(
            col, len(right_profile_to_class) + 1)

    kern_left_class_count = len(left_profile_to_class)
    kern_right_class_count = len(right_profile_to_class)

    if kern_left_class_count > 255 or kern_right_class_count > 255:
        print(f"WARNING: kerning class count exceeds uint8_t range "
              f"(left={kern_left_class_count}, right={kern_right_class_count})",
              file=sys.stderr)

    # Build the class x class matrix (row-major, one row per left class)
    kern_matrix = [0] * (kern_left_class_count * kern_right_class_count)
    for (lcp, rcp), adjust in kern_map.items():
        row_idx = left_class_map[lcp] - 1
        col_idx = right_class_map[rcp] - 1
        kern_matrix[row_idx * kern_right_class_count + col_idx] = adjust

    # Build sorted class entry lists
    kern_left_classes = sorted(left_class_map.items())
    kern_right_classes = sorted(right_class_map.items())

    # NOTE(review): the reviewed source had lost its indentation; this summary
    # is assumed to belong inside the `if kern_map:` branch - confirm.
    # Size estimate: one byte per matrix cell plus 3 bytes per class entry.
    matrix_size = kern_left_class_count * kern_right_class_count
    entries_size = (len(kern_left_classes) + len(kern_right_classes)) * 3
    print(f"kerning: {kern_left_class_count} left classes, {kern_right_class_count} right classes, "
          f"{matrix_size + entries_size} bytes", file=sys.stderr)
# --- Ligature pair extraction ---
# Ligatures come from the LigatureSubst (type 4) lookups of the OpenType GSUB
# table. A ligature of 3+ codepoints is expressed as a chain of pairs when an
# intermediate ligature exists (e.g., ffi = ff + i where ff is itself a
# ligature). A pair is only kept when both input codepoints and the output
# codepoint are part of the generated glyph set.
all_codepoints_set = set(all_codepoints)

# Standard Unicode ligature codepoints for known input sequences.
# Used as a fallback when the GSUB substitute glyph has no cmap entry.
# Keys are tuples of input codepoints, e.g. (0x66, 0x69) for "fi".
STANDARD_LIGATURE_MAP = {
    tuple(ord(ch) for ch in sequence): ligature_cp
    for sequence, ligature_cp in (
        ("ff", 0xFB00),
        ("fi", 0xFB01),
        ("fl", 0xFB02),
        ("ffi", 0xFB03),
        ("ffl", 0xFB04),
        ("\u017ft", 0xFB05),  # long-s + t
        ("st", 0xFB06),
    )
}
def _collect_ligature_rules(font):
    """Read the 'liga' and 'rlig' ligature rules from an open TTFont.

    Returns a dict mapping each input sequence (tuple of codepoints) to the
    codepoint of the ligature that replaces it. Rules whose input glyphs have
    no cmap entry are skipped. Rules whose output glyph has no cmap entry fall
    back to STANDARD_LIGATURE_MAP and are dropped with a warning on stderr
    when the sequence is not listed there.
    """
    cmap = font.getBestCmap() or {}
    # glyph_name -> codepoint; if several codepoints share a glyph, the one
    # iterated last wins.
    glyph_to_cp = {gname: cp for cp, gname in cmap.items()}

    raw_ligatures = {}  # tuple of codepoints -> ligature codepoint
    if 'GSUB' not in font:
        return raw_ligatures
    gsub = font['GSUB'].table

    # Find lookup indices for ligature features.
    # Currently extracts 'liga' (standard) and 'rlig' (required) only.
    # To also extract discretionary or historical ligatures, add:
    #   'dlig' - Discretionary Ligatures (e.g., ft, st in Bookerly)
    #   'hlig' - Historical Ligatures (e.g., long-s+t in OpenDyslexic)
    # These are off by default in standard text renderers.
    LIGATURE_FEATURES = ('liga', 'rlig')
    liga_lookup_indices = set()
    if gsub.FeatureList:
        for fr in gsub.FeatureList.FeatureRecord:
            if fr.FeatureTag in LIGATURE_FEATURES:
                liga_lookup_indices.update(fr.Feature.LookupListIndex)

    for li in liga_lookup_indices:
        lookup = gsub.LookupList.Lookup[li]
        for st in lookup.SubTable:
            actual = st
            # Unwrap Extension (lookup type 7) wrappers
            if lookup.LookupType == 7 and hasattr(st, 'ExtSubTable'):
                actual = st.ExtSubTable
            # LigatureSubst is lookup type 4; only it has `ligatures`
            if not hasattr(actual, 'ligatures'):
                continue
            for first_glyph, ligature_list in actual.ligatures.items():
                if first_glyph not in glyph_to_cp:
                    continue
                first_cp = glyph_to_cp[first_glyph]
                for lig in ligature_list:
                    # lig.Component is a list of subsequent glyph names
                    # lig.LigGlyph is the substitute glyph name
                    if any(comp not in glyph_to_cp for comp in lig.Component):
                        continue
                    seq = (first_cp, *(glyph_to_cp[comp] for comp in lig.Component))
                    if lig.LigGlyph in glyph_to_cp:
                        lig_cp = glyph_to_cp[lig.LigGlyph]
                    elif seq in STANDARD_LIGATURE_MAP:
                        lig_cp = STANDARD_LIGATURE_MAP[seq]
                    else:
                        seq_str = ', '.join(f'U+{cp:04X}' for cp in seq)
                        print(f"ligatures: WARNING: dropping ligature ({seq_str}) -> "
                              f"glyph '{lig.LigGlyph}': output glyph has no cmap entry "
                              f"and input sequence is not in STANDARD_LIGATURE_MAP",
                              file=sys.stderr)
                        continue
                    raw_ligatures[seq] = lig_cp
    return raw_ligatures


def extract_ligatures_fonttools(font_path, codepoints):
    """Extract ligature substitution pairs from a font file using fonttools.

    Args:
        font_path: path of the font file to open.
        codepoints: container of codepoints served by this font; every input
            codepoint of a ligature must be in it.

    Returns:
        list of (packed_pair, ligature_codepoint), where packed_pair is
        (first_codepoint << 16) | second_codepoint. Multi-character ligatures
        are decomposed into chained pairs; a ligature whose prefix is not
        itself a ligature is skipped with a message on stderr.
    """
    # The context manager closes the font file even if GSUB parsing fails.
    with TTFont(font_path) as font:
        raw_ligatures = _collect_ligature_rules(font)

    # Filter: only keep ligatures where all input and output codepoints are
    # in our generated glyph set. The output may come from another font in
    # the stack, hence the additional check against all_codepoints_set.
    filtered = {}
    for seq, lig_cp in raw_ligatures.items():
        if lig_cp not in codepoints and lig_cp not in all_codepoints_set:
            continue
        if all(cp in codepoints for cp in seq):
            filtered[seq] = lig_cp

    # Decompose into chained pairs
    # For 2-codepoint sequences: direct pair (a, b) -> lig
    # For 3+ codepoint sequences: chain through intermediates
    #   e.g., (f, f, i) -> ffi requires (f, f) -> ff to exist,
    #   then we add (ff, i) -> ffi
    pairs = []

    # First pass: collect all 2-codepoint ligatures
    for seq, lig_cp in filtered.items():
        if len(seq) == 2:
            pairs.append(((seq[0] << 16) | seq[1], lig_cp))

    # Second pass: decompose 3+ codepoint ligatures into chained pairs
    for seq, lig_cp in filtered.items():
        if len(seq) < 3:
            continue
        # Try to find an intermediate: check if the first N-1 codepoints
        # form a known ligature, then chain (intermediate, last) -> lig
        prefix = seq[:-1]
        last_cp = seq[-1]
        if prefix in filtered:
            intermediate_cp = filtered[prefix]
            pairs.append(((intermediate_cp << 16) | last_cp, lig_cp))
        else:
            print(f"ligatures: skipping {len(seq)}-char ligature "
                  f"({', '.join(f'U+{cp:04X}' for cp in seq)}) -> U+{lig_cp:04X}: "
                  f"no intermediate ligature for prefix", file=sys.stderr)
    return pairs
# Combining marks never take part in ligatures.
ligature_codepoints = {
    cp for cp in all_codepoints
    if cp < COMBINING_MARKS_START or cp > COMBINING_MARKS_END
}

# Map ligature codepoints to the font-stack index that serves them
# (first face with a glyph wins).
lig_cp_to_face_idx = {}
for cp in ligature_codepoints:
    owner = next(
        (idx for idx, candidate in enumerate(font_stack)
         if candidate.get_char_index(cp) > 0),
        None,
    )
    if owner is not None:
        lig_cp_to_face_idx[cp] = owner

# Group by face index
lig_face_idx_cps = {}
for cp, owner in lig_cp_to_face_idx.items():
    if owner not in lig_face_idx_cps:
        lig_face_idx_cps[owner] = set()
    lig_face_idx_cps[owner].add(cp)

# Collect the pairs of every font file that serves at least one codepoint.
ligature_pairs = []
for stack_pos, served_cps in lig_face_idx_cps.items():
    ligature_pairs.extend(
        extract_ligatures_fonttools(args.fontstack[stack_pos], served_cps)
    )

# Deduplicate on the packed input pair (keep first occurrence) and sort.
first_ligature_for_pair = {}
for packed_key, ligature_cp in ligature_pairs:
    first_ligature_for_pair.setdefault(packed_key, ligature_cp)
ligature_pairs = sorted(first_ligature_for_pair.items(), key=lambda entry: entry[0])

print(f"ligatures: {len(ligature_pairs)} pairs extracted", file=sys.stderr)
compress = args.compress

# Build groups for compression
if compress:
    # Script-based grouping: glyphs that co-occur in typical text rendering
    # are grouped together for efficient LRU caching on the embedded target.
    # Since glyphs are in codepoint order, glyphs in the same Unicode block
    # are contiguous in the array and form natural groups.
    SCRIPT_GROUP_RANGES = [
        (0x0000, 0x007F),  # ASCII
        (0x0080, 0x00FF),  # Latin-1 Supplement
        (0x0100, 0x017F),  # Latin Extended-A
        (0x0300, 0x036F),  # Combining Diacritical Marks
        (0x0400, 0x04FF),  # Cyrillic
        (0x2000, 0x206F),  # General Punctuation
        (0x2070, 0x209F),  # Superscripts & Subscripts
        (0x20A0, 0x20CF),  # Currency Symbols
        (0x2190, 0x21FF),  # Arrows
        (0x2200, 0x22FF),  # Math Operators
        (0xFB00, 0xFB06),  # Alphabetic Presentation Forms (ligatures)
        (0xFFFD, 0xFFFD),  # Replacement Character
    ]

    def get_script_group(code_point):
        """Return the index of the SCRIPT_GROUP_RANGES entry containing
        `code_point`, or -1 when no entry contains it."""
        for i, (start, end) in enumerate(SCRIPT_GROUP_RANGES):
            if start <= code_point <= end:
                return i
        return -1

    # A new group starts whenever the script group id changes between
    # neighbouring glyphs. Code points outside every range (id -1) therefore
    # form one group per contiguous run.
    groups = []  # list of (first_glyph_index, glyph_count)
    current_group_id = None
    group_start = 0
    group_count = 0
    for i, (props, packed) in enumerate(all_glyphs):
        sg = get_script_group(props.code_point)
        if sg != current_group_id:
            if group_count > 0:
                groups.append((group_start, group_count))
            current_group_id = sg
            group_start = i
            group_count = 1
        else:
            group_count += 1
    if group_count > 0:
        groups.append((group_start, group_count))

    # Compress each group
    # compressed_groups: list of
    #   (compressed_bytes, uncompressed_size, glyph_count, first_glyph_index)
    compressed_groups = []
    compressed_bitmap_data = []

    # Also build modified glyph props with within-group offsets
    modified_glyph_props = list(glyph_props)

    for first_idx, count in groups:
        # Gather the bitmaps of this group and join them once at the end,
        # which avoids re-copying the growing buffer for every glyph.
        group_parts = []
        within_group_offset = 0
        for gi in range(first_idx, first_idx + count):
            props, packed = all_glyphs[gi]
            # Update glyph's dataOffset to be the within-group offset
            modified_glyph_props[gi] = modified_glyph_props[gi]._replace(
                data_offset=within_group_offset)
            group_parts.append(packed)
            within_group_offset += len(packed)
        group_data = b''.join(group_parts)

        # Compress with raw DEFLATE (no zlib/gzip header)
        compressor = zlib.compressobj(level=9, wbits=-15)
        compressed = compressor.compress(group_data) + compressor.flush()

        compressed_groups.append((compressed, len(group_data), count, first_idx))
        compressed_bitmap_data.extend(compressed)

    glyph_props = modified_glyph_props

    total_compressed = len(compressed_bitmap_data)
    total_uncompressed = len(glyph_data)
    # Guard the ratio: a glyph set whose bitmaps are all empty has no
    # uncompressed data and would otherwise raise ZeroDivisionError.
    compression_pct = 100*total_compressed/total_uncompressed if total_uncompressed else 0.0
    print(f"// Compression: {total_uncompressed} -> {total_compressed} bytes ({compression_pct:.1f}%), {len(groups)} groups", file=sys.stderr)
# --- Emit the generated C header on stdout ---
# NOTE(review): string literals are reproduced exactly as they appear in the
# reviewed source, whose whitespace may have been altered by the page scrape -
# confirm leading spaces inside the literals against the real file.
mode_label = '2-bit' if is2Bit else '1-bit'
compressed_label = ' compressed: true' if compress else ''
print(f"""/**
* generated by fontconvert.py
* name: {font_name}
* size: {size}
* mode: {mode_label}{compressed_label}
* Command used: {' '.join(sys.argv)}
*/
#pragma once
#include "EpdFontData.h"
""")

# Bitmap bytes: the DEFLATE-compressed groups when --compress is given,
# otherwise the raw packed glyph bitmaps. 16 bytes per output line.
bitmap_bytes = compressed_bitmap_data if compress else glyph_data
print(f"static const uint8_t {font_name}Bitmaps[{len(bitmap_bytes)}] = {{")
for line_bytes in chunks(bitmap_bytes, 16):
    print(" " + " ".join(f"0x{b:02X}," for b in line_bytes))
print("};\n")
def cp_label(cp):
    """Return a short label for codepoint `cp`, used in generated `//` comments.

    Printable ASCII above the space character is shown as the character
    itself, except the backslash, which is spelled out (presumably because a
    trailing backslash would continue the C++ line comment onto the next
    line). Everything else is rendered as U+XXXX.
    """
    if cp == 0x5C:
        return '<backslash>'
    if 0x20 < cp < 0x7F:
        return chr(cp)
    return f'U+{cp:04X}'
# Glyph table: every GlyphProps field except the trailing code_point, with the
# code point shown as a comment.
print(f"static const EpdGlyph {font_name}Glyphs[] = {{")
for g in glyph_props:
    glyph_fields = ", ".join(str(value) for value in g[:-1])
    print(f" {{ {glyph_fields} }}, // {cp_label(g.code_point)}")
print("};\n")

# Interval table: first code point, last code point, and the index of the
# interval's first glyph in the glyph table.
print(f"static const EpdUnicodeInterval {font_name}Intervals[] = {{")
offset = 0
for first_cp, last_cp in intervals:
    print(f" {{ 0x{first_cp:X}, 0x{last_cp:X}, 0x{offset:X} }},")
    offset += last_cp - first_cp + 1
print("};\n")

# Compression groups: offset and size inside the compressed bitmap array,
# uncompressed size, glyph count and index of the first glyph.
if compress:
    print(f"static const EpdFontGroup {font_name}Groups[] = {{")
    compressed_offset = 0
    for compressed, uncompressed_size, count, first_idx in compressed_groups:
        print(f" {{ {compressed_offset}, {len(compressed)}, {uncompressed_size}, {count}, {first_idx} }},")
        compressed_offset += len(compressed)
    print("};\n")

# Kerning: codepoint -> class tables for both sides, then the class matrix
# with one output line per left class.
if kern_map:
    for table_suffix, class_entries in (
        ("KernLeftClasses", kern_left_classes),
        ("KernRightClasses", kern_right_classes),
    ):
        print(f"static const EpdKernClassEntry {font_name}{table_suffix}[] = {{")
        for cp, cls in class_entries:
            print(f" {{ 0x{cp:04X}, {cls} }}, // {cp_label(cp)}")
        print("};\n")

    print(f"static const int8_t {font_name}KernMatrix[] = {{")
    matrix_cells = kern_left_class_count * kern_right_class_count
    for row_start in range(0, matrix_cells, kern_right_class_count):
        row_vals = kern_matrix[row_start:row_start + kern_right_class_count]
        print(" " + ", ".join(f"{v:4d}" for v in row_vals) + ",")
    print("};\n")

# Ligature pairs: packed (first << 16 | second) input pair -> ligature.
if ligature_pairs:
    print(f"static const EpdLigaturePair {font_name}LigaturePairs[] = {{")
    for packed_pair, lig_cp in ligature_pairs:
        print(f" {{ 0x{packed_pair:08X}, 0x{lig_cp:04X} }}, // {cp_label(packed_pair >> 16)} {cp_label(packed_pair & 0xFFFF)} -> {cp_label(lig_cp)}")
    print("};\n")

# Top-level font descriptor. Initialiser values are gathered in declaration
# order and printed one per line; `face` is the face that serves '|'.
print(f"static const EpdFontData {font_name} = {{")
font_fields = [
    f"{font_name}Bitmaps",
    f"{font_name}Glyphs",
    f"{font_name}Intervals",
    len(intervals),
    norm_ceil(face.size.height),
    norm_ceil(face.size.ascender),
    norm_floor(face.size.descender),
    'true' if is2Bit else 'false',
]
if compress:
    font_fields += [f"{font_name}Groups", len(compressed_groups)]
else:
    font_fields += ["nullptr", 0]
if kern_map:
    font_fields += [
        f"{font_name}KernLeftClasses",
        f"{font_name}KernRightClasses",
        f"{font_name}KernMatrix",
        len(kern_left_classes),
        len(kern_right_classes),
        kern_left_class_count,
        kern_right_class_count,
    ]
else:
    font_fields += ["nullptr", "nullptr", "nullptr", 0, 0, 0, 0]
if ligature_pairs:
    font_fields += [f"{font_name}LigaturePairs", len(ligature_pairs)]
else:
    font_fields += ["nullptr", 0]
for field in font_fields:
    print(f" {field},")
print("};")