## Summary * **What is the goal of this PR?** Add Vietnamese glyph support for the reader's built-in fonts, enabling proper rendering of Vietnamese text in EPUB content. * **What changes are included?** - Added 3 new Unicode intervals to `fontconvert.py` covering Vietnamese characters: - **Latin Extended-B** (Vietnamese subset only): `U+01A0–U+01A1`, `U+01AF–U+01B0` — Ơ/ơ, Ư/ư - **Vietnamese Extended**: `U+1EA0–U+1EF9` — All precomposed Vietnamese characters with tone marks (Ả, Ấ, Ầ, Ẩ, Ẫ, Ậ, Ắ, …, Ỹ) - Re-generated all 54 built-in font header files (Bookerly, Noto Sans, OpenDyslexic, Ubuntu across all sizes and styles) to include the new Vietnamese glyphs. ## Additional Context * **Scope**: This PR only covers the **reader** fonts. The outer UI still uses the Ubuntu font which does not fully support Vietnamese — UI and i18n will be addressed in a follow-up PR (per discussion in PR #1124). * **Memory impact**: | Metric | Before | After | Delta | |---|---|---|---| | Flash Data (`.rodata`) | 2,971,028 B | 3,290,748 B | **+319,720 B (+10.8%)** | | Total image size | 4,663,235 B | 4,982,955 B | **+319,720 B (+6.9%)** | | Flash usage | 69.1% | 74.0% | **+4.9 pp** | | RAM usage | 29.0% | 29.0% | **No change** | * **Risk**: Low — this is a data-only change (font glyph tables in `.rodata`). No logic changes, no RAM impact. Flash headroom remains comfortable at 74%. --- ### AI Usage Did you use AI tools to help write this code? _**PARTIALLY**_ AI was used to identify the minimal set of Unicode ranges needed for Vietnamese support and to assist with the PR description. --------- Co-authored-by: danoooob <danoooob@example.com>
856 lines
32 KiB
Python
Executable File
856 lines
32 KiB
Python
Executable File
#!python3
|
||
import freetype
|
||
import zlib
|
||
import sys
|
||
import re
|
||
import math
|
||
import argparse
|
||
from collections import namedtuple
|
||
from fontTools.ttLib import TTFont
|
||
|
||
# Originally from https://github.com/vroland/epdiy
|
||
|
||
# Command-line interface. `name`, `size` and `fontstack` are positional;
# the option flags select bitmap depth, extra code-point ranges,
# compression and hinting behaviour.
parser = argparse.ArgumentParser(description="Generate a header file from a font to be used with epdiy.")
parser.add_argument("name", action="store", help="name of the font.")
parser.add_argument("size", type=int, help="font size to use.")
parser.add_argument("fontstack", action="store", nargs='+', help="list of font files, ordered by descending priority.")
# "--2bit" is not a valid Python identifier, hence the explicit dest.
parser.add_argument("--2bit", dest="is2Bit", action="store_true", help="generate 2-bit greyscale bitmap instead of 1-bit black and white.")
parser.add_argument("--additional-intervals", dest="additional_intervals", action="append", help="Additional code point intervals to export as min,max. This argument can be repeated.")
parser.add_argument("--compress", dest="compress", action="store_true", help="Compress glyph bitmaps using DEFLATE with group-based compression.")
parser.add_argument("--force-autohint", dest="force_autohint", action="store_true", help="Force FreeType auto-hinter instead of native font hinting. Improves stem width consistency for fonts with weak or no native TrueType hints.")
args = parser.parse_args()
|
||
|
||
# Per-glyph metadata record. data_offset indexes into the packed bitmap
# byte array (or, when compressing, is rewritten to a within-group offset).
GlyphProps = namedtuple("GlyphProps", ["width", "height", "advance_x", "left", "top", "data_length", "data_offset", "code_point"])

# Faces are tried in command-line order: the first face containing a glyph
# for a code point wins (see load_glyph below).
font_stack = [freetype.Face(f) for f in args.fontstack]
is2Bit = args.is2Bit
size = args.size
font_name = args.name

# FT_LOAD_RENDER rasterizes the glyph at load time; optionally force the
# auto-hinter for fonts with weak native TrueType hints.
load_flags = freetype.FT_LOAD_RENDER
if args.force_autohint:
    load_flags |= freetype.FT_LOAD_FORCE_AUTOHINT
|
||
|
||
# Inclusive unicode code point intervals to export.
# NOTE: entries need not be sorted or disjoint here — the merge step
# below sorts them and coalesces overlapping ranges before validation.
intervals = [
    ### Basic Latin ###
    # ASCII letters, digits, punctuation, control characters
    (0x0000, 0x007F),
    ### Latin-1 Supplement ###
    # Accented characters for Western European languages
    (0x0080, 0x00FF),
    ### Latin Extended-A ###
    # Eastern European and Baltic languages
    (0x0100, 0x017F),
    ### Latin Extended-B (Vietnamese subset only) ###
    # Only Ơ/ơ (U+01A0-01A1), Ư/ư (U+01AF-01B0) for Vietnamese
    (0x01A0, 0x01A1),
    (0x01AF, 0x01B0),
    ### Vietnamese Extended ###
    # All precomposed Vietnamese characters with tone marks
    # Ả Ấ Ầ Ẩ Ẫ Ậ Ắ Ằ Ẳ Ẵ Ặ Ẹ Ẻ Ẽ Ế Ề Ể Ễ Ệ Ỉ Ị Ọ Ỏ Ố Ồ Ổ Ỗ Ộ Ớ Ờ Ở Ỡ Ợ Ụ Ủ Ứ Ừ Ử Ữ Ự Ỳ Ỵ Ỷ Ỹ
    (0x1EA0, 0x1EF9),
    ### General Punctuation (core subset) ###
    # Smart quotes, en dash, em dash, ellipsis, NO-BREAK SPACE
    (0x2000, 0x206F),
    ### Basic Symbols From "Latin-1 + Misc" ###
    # dashes, quotes, prime marks (subsumed by the range above; merged below)
    (0x2010, 0x203A),
    # misc punctuation
    (0x2040, 0x205F),
    # common currency symbols
    (0x20A0, 0x20CF),
    ### Combining Diacritical Marks (minimal subset) ###
    # Needed for proper rendering of many extended Latin languages
    (0x0300, 0x036F),
    ### Greek & Coptic ###
    # Used in science, maths, philosophy, some academic texts
    # (0x0370, 0x03FF),
    ### Cyrillic ###
    # Russian, Ukrainian, Bulgarian, etc.
    (0x0400, 0x04FF),
    ### Math Symbols (common subset) ###
    # Superscripts and Subscripts
    (0x2070, 0x209F),
    # General math operators
    (0x2200, 0x22FF),
    # Arrows
    (0x2190, 0x21FF),
    ### CJK ###
    # Core Unified Ideographs
    # (0x4E00, 0x9FFF),
    # # Extension A
    # (0x3400, 0x4DBF),
    # # Extension B
    # (0x20000, 0x2A6DF),
    # # Extension C–F
    # (0x2A700, 0x2EBEF),
    # # Extension G
    # (0x30000, 0x3134F),
    # # Hiragana
    # (0x3040, 0x309F),
    # # Katakana
    # (0x30A0, 0x30FF),
    # # Katakana Phonetic Extensions
    # (0x31F0, 0x31FF),
    # # Halfwidth Katakana
    # (0xFF60, 0xFF9F),
    # # Hangul Syllables
    # (0xAC00, 0xD7AF),
    # # Hangul Jamo
    # (0x1100, 0x11FF),
    # # Hangul Compatibility Jamo
    # (0x3130, 0x318F),
    # # Hangul Jamo Extended-A
    # (0xA960, 0xA97F),
    # # Hangul Jamo Extended-B
    # (0xD7B0, 0xD7FF),
    # # CJK Radicals Supplement
    # (0x2E80, 0x2EFF),
    # # Kangxi Radicals
    # (0x2F00, 0x2FDF),
    # # CJK Symbols and Punctuation
    # (0x3000, 0x303F),
    # # CJK Compatibility Forms
    # (0xFE30, 0xFE4F),
    # # CJK Compatibility Ideographs
    # (0xF900, 0xFAFF),
    ### Alphabetic Presentation Forms (Latin ligatures) ###
    # ff, fi, fl, ffi, ffl, long-st, st
    (0xFB00, 0xFB06),
    ### Specials
    # Replacement Character
    (0xFFFD, 0xFFFD),
]
|
||
|
||
# Extra intervals from --additional-intervals, each given as a "min,max"
# string; int(n, base=0) accepts decimal, 0x-hex, 0o-octal and 0b-binary.
add_ints = []
if args.additional_intervals:
    add_ints = [tuple([int(n, base=0) for n in i.split(",")]) for i in args.additional_intervals]
|
||
|
||
def norm_floor(val):
    """Convert a FreeType 26.6 fixed-point value to whole pixels, rounding down."""
    return math.floor(val / 64)
|
||
|
||
def norm_ceil(val):
    """Convert a FreeType 26.6 fixed-point value to whole pixels, rounding up."""
    return math.ceil(val / 64)
|
||
|
||
def chunks(l, n):
    """Yield successive n-sized slices of sequence l (last may be shorter)."""
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n
|
||
|
||
def load_glyph(code_point):
    """Load `code_point` from the highest-priority face that covers it.

    Walks the font stack in order, renders the glyph into that face's
    glyph slot using the global load_flags, and returns the face.
    Returns None when no face in the stack has the code point.
    """
    for candidate in font_stack:
        glyph_index = candidate.get_char_index(code_point)
        if glyph_index > 0:
            candidate.load_glyph(glyph_index, load_flags)
            return candidate
    return None
|
||
|
||
# Merge the built-in intervals with any user-supplied ones so the final
# table is strictly ascending and non-overlapping. A duplicated code
# point would otherwise emit its glyph twice and corrupt interval offsets.
unmerged_intervals = sorted(intervals + add_ints)
intervals = []
unvalidated_intervals = []
for i_start, i_end in unmerged_intervals:
    # BUGFIX: coalesce on overlap OR adjacency (i_start <= prev_end + 1).
    # The previous condition (i_start + 1 <= prev_end) left intervals that
    # touch exactly at the endpoint un-merged, producing an overlapping
    # pair (the shared code point emitted twice), and kept directly
    # adjacent intervals as separate table entries.
    if unvalidated_intervals and i_start <= unvalidated_intervals[-1][1] + 1:
        prev_start, prev_end = unvalidated_intervals[-1]
        unvalidated_intervals[-1] = (prev_start, max(prev_end, i_end))
        continue
    unvalidated_intervals.append((i_start, i_end))

# Validation pass: drop code points that no face in the stack can render,
# splitting the merged intervals around the resulting gaps.
for i_start, i_end in unvalidated_intervals:
    start = i_start
    for code_point in range(i_start, i_end + 1):
        face = load_glyph(code_point)
        if face is None:
            if start < code_point:
                intervals.append((start, code_point - 1))
            start = code_point + 1
    # Flush the trailing run (if the interval did not end on a gap).
    if start != i_end + 1:
        intervals.append((start, i_end))
|
||
|
||
# FreeType takes the character size in 26.6 fixed point (hence << 6);
# glyphs are rasterized at `size` points assuming a 150 DPI device.
for face in font_stack:
    face.set_char_size(size << 6, size << 6, 150, 150)

total_size = 0   # running byte offset into the packed bitmap array
all_glyphs = []  # list of (GlyphProps, packed bitmap bytes)
|
||
|
||
# Rasterize every glyph in the validated interval list and pack it into
# the target format (2-bit greyscale or 1-bit black & white).
for i_start, i_end in intervals:
    for code_point in range(i_start, i_end + 1):
        # load_glyph cannot return None here: validation above removed
        # code points that no face covers.
        face = load_glyph(code_point)
        bitmap = face.glyph.bitmap

        # Build out 4-bit greyscale bitmap: two horizontally adjacent
        # pixels share one byte (even column -> low nibble, odd column
        # -> high nibble); each row is padded to a whole byte.
        pixels4g = []
        px = 0
        for i, v in enumerate(bitmap.buffer):
            y = i / bitmap.width  # NOTE(review): unused (and a float); kept as-is
            x = i % bitmap.width
            if x % 2 == 0:
                px = (v >> 4)
            else:
                px = px | (v & 0xF0)
                pixels4g.append(px);
                px = 0
            # eol: odd-width rows end on an even column whose low nibble
            # is still pending — flush it as the row's final byte
            if x == bitmap.width - 1 and bitmap.width % 2 > 0:
                pixels4g.append(px)
                px = 0

        if is2Bit:
            # 0-3 white, 4-7 light grey, 8-11 dark grey, 12-15 black
            # Downsample to 2-bit bitmap
            pixels2b = []
            px = 0
            # bytes per row of the intermediate 4-bit bitmap
            pitch = (bitmap.width // 2) + (bitmap.width % 2)
            for y in range(bitmap.rows):
                for x in range(bitmap.width):
                    px = px << 2
                    # fetch the 4-bit sample for (x, y); even x sits in
                    # the low nibble, odd x in the high nibble
                    bm = pixels4g[y * pitch + (x // 2)]
                    bm = (bm >> ((x % 2) * 4)) & 0xF

                    if bm >= 12:
                        px += 3
                    elif bm >= 8:
                        px += 2
                    elif bm >= 4:
                        px += 1

                    # four 2-bit pixels per output byte
                    if (y * bitmap.width + x) % 4 == 3:
                        pixels2b.append(px)
                        px = 0
            # pad the trailing partial byte with zero bits
            if (bitmap.width * bitmap.rows) % 4 != 0:
                px = px << (4 - (bitmap.width * bitmap.rows) % 4) * 2
                pixels2b.append(px)

            # debug: ASCII-render the packed 2-bit bitmap
            # for y in range(bitmap.rows):
            #     line = ''
            #     for x in range(bitmap.width):
            #         pixelPosition = y * bitmap.width + x
            #         byte = pixels2b[pixelPosition // 4]
            #         bit_index = (3 - (pixelPosition % 4)) * 2
            #         line += '#' if ((byte >> bit_index) & 3) > 0 else '.'
            #     print(line)
            # print('')
        else:
            # Downsample to 1-bit bitmap - treat any 2+ as black
            pixelsbw = []
            px = 0
            pitch = (bitmap.width // 2) + (bitmap.width % 2)
            for y in range(bitmap.rows):
                for x in range(bitmap.width):
                    px = px << 1
                    bm = pixels4g[y * pitch + (x // 2)]
                    # mask 0xE / 0xE0 selects nibble values >= 2
                    px += 1 if ((x & 1) == 0 and bm & 0xE > 0) or ((x & 1) == 1 and bm & 0xE0 > 0) else 0

                    # eight 1-bit pixels per output byte
                    if (y * bitmap.width + x) % 8 == 7:
                        pixelsbw.append(px)
                        px = 0
            # pad the trailing partial byte with zero bits
            if (bitmap.width * bitmap.rows) % 8 != 0:
                px = px << (8 - (bitmap.width * bitmap.rows) % 8)
                pixelsbw.append(px)

            # debug: ASCII-render the packed 1-bit bitmap
            # for y in range(bitmap.rows):
            #     line = ''
            #     for x in range(bitmap.width):
            #         pixelPosition = y * bitmap.width + x
            #         byte = pixelsbw[pixelPosition // 8]
            #         bit_index = 7 - (pixelPosition % 8)
            #         line += '#' if (byte >> bit_index) & 1 else '.'
            #     print(line)
            # print('')

        pixels = pixels2b if is2Bit else pixelsbw

        # Build output data
        packed = bytes(pixels)
        glyph = GlyphProps(
            width = bitmap.width,
            height = bitmap.rows,
            advance_x = norm_floor(face.glyph.advance.x),  # 26.6 -> px
            left = face.glyph.bitmap_left,
            top = face.glyph.bitmap_top,
            data_length = len(packed),
            data_offset = total_size,
            code_point = code_point,
        )
        total_size += len(packed)
        all_glyphs.append((glyph, packed))
|
||
|
||
# Load '|' last so `face` (used for the global metrics emitted at the end
# of the file) reflects a glyph with a deep descender — pipe seems to be
# a good heuristic for the "real" descender.
face = load_glyph(ord('|'))

# Flatten the per-glyph records into one byte list plus a parallel
# metadata list, preserving glyph order.
glyph_data = []
glyph_props = []
for props, packed in all_glyphs:
    glyph_data.extend(packed)
    glyph_props.append(props)
|
||
|
||
# --- Kerning pair extraction ---
# Modern fonts store kerning in the OpenType GPOS table, which FreeType's
# get_kerning() does not read. We use fonttools to parse both the legacy
# kern table and the GPOS 'kern' feature (PairPos lookups, including
# Extension wrappers).

# Combining marks (U+0300-U+036F) are excluded from kerning.
COMBINING_MARKS_START = 0x0300
COMBINING_MARKS_END = 0x036F
all_codepoints = [g.code_point for g in glyph_props]
kernable_codepoints = set(cp for cp in all_codepoints
                          if not (COMBINING_MARKS_START <= cp <= COMBINING_MARKS_END))

# Map each kernable codepoint to the font-stack index that serves it
# (same priority logic as load_glyph).
cp_to_face_idx = {}
for cp in kernable_codepoints:
    for face_idx, f in enumerate(font_stack):
        if f.get_char_index(cp) > 0:
            cp_to_face_idx[cp] = face_idx
            break

# Group codepoints by face index so each font file is parsed once.
face_idx_cps = {}
for cp, fi in cp_to_face_idx.items():
    face_idx_cps.setdefault(fi, set()).add(cp)
|
||
|
||
def _extract_pairpos_subtable(subtable, glyph_to_cp, raw_kern):
    """Extract kerning from a PairPos subtable (Format 1 or 2).

    Accumulates XAdvance adjustments (in font design units) into
    `raw_kern`, keyed by (left_glyph_name, right_glyph_name). Only glyphs
    present in `glyph_to_cp` are considered; only Value1 (left-glyph
    advance) is read.
    """
    if subtable.Format == 1:
        # Individual pairs: coverage lists left glyphs, each with a
        # PairSet of explicit (SecondGlyph, value) records.
        for i, coverage_glyph in enumerate(subtable.Coverage.glyphs):
            if coverage_glyph not in glyph_to_cp:
                continue
            pair_set = subtable.PairSet[i]
            for pvr in pair_set.PairValueRecord:
                if pvr.SecondGlyph not in glyph_to_cp:
                    continue
                xa = 0
                # Value1 may be absent or lack XAdvance entirely.
                if hasattr(pvr, 'Value1') and pvr.Value1:
                    xa = getattr(pvr.Value1, 'XAdvance', 0) or 0
                if xa != 0:
                    key = (coverage_glyph, pvr.SecondGlyph)
                    raw_kern[key] = raw_kern.get(key, 0) + xa
    elif subtable.Format == 2:
        # Class-based pairs: glyphs map to classes; adjustments live in a
        # Class1Record x Class2Record matrix. Unlisted glyphs are class 0.
        class_def1 = subtable.ClassDef1.classDefs if subtable.ClassDef1 else {}
        class_def2 = subtable.ClassDef2.classDefs if subtable.ClassDef2 else {}
        coverage_set = set(subtable.Coverage.glyphs)
        for left_glyph in glyph_to_cp:
            if left_glyph not in coverage_set:
                continue
            c1 = class_def1.get(left_glyph, 0)
            if c1 >= len(subtable.Class1Record):
                continue
            class1_rec = subtable.Class1Record[c1]
            for right_glyph in glyph_to_cp:
                c2 = class_def2.get(right_glyph, 0)
                if c2 >= len(class1_rec.Class2Record):
                    continue
                c2_rec = class1_rec.Class2Record[c2]
                xa = 0
                if hasattr(c2_rec, 'Value1') and c2_rec.Value1:
                    xa = getattr(c2_rec.Value1, 'XAdvance', 0) or 0
                if xa != 0:
                    key = (left_glyph, right_glyph)
                    raw_kern[key] = raw_kern.get(key, 0) + xa
|
||
|
||
def extract_kerning_fonttools(font_path, codepoints, ppem):
    """Extract kerning pairs from a font file using fonttools.

    Returns dict of {(leftCp, rightCp): pixel_adjust} for the given
    codepoints. Values are scaled from font design units to integer
    pixels at ppem, floored, and clamped to the int8_t range used by the
    emitted kerning matrix. Pairs whose scaled adjustment rounds to 0
    are omitted.
    """
    font = TTFont(font_path)
    units_per_em = font['head'].unitsPerEm
    cmap = font.getBestCmap() or {}

    # Build glyph_name -> codepoint map (only for requested codepoints)
    glyph_to_cp = {}
    for cp in codepoints:
        gname = cmap.get(cp)
        if gname:
            glyph_to_cp[gname] = cp

    # Collect raw kerning values in font design units
    raw_kern = {}  # (left_glyph_name, right_glyph_name) -> design_units

    # 1. Legacy kern table
    if 'kern' in font:
        for subtable in font['kern'].kernTables:
            # Only format-0 subtables expose a kernTable dict.
            if hasattr(subtable, 'kernTable'):
                for (lg, rg), val in subtable.kernTable.items():
                    if lg in glyph_to_cp and rg in glyph_to_cp:
                        raw_kern[(lg, rg)] = raw_kern.get((lg, rg), 0) + val

    # 2. GPOS 'kern' feature
    if 'GPOS' in font:
        gpos = font['GPOS'].table
        kern_lookup_indices = set()
        if gpos.FeatureList:
            for fr in gpos.FeatureList.FeatureRecord:
                if fr.FeatureTag == 'kern':
                    kern_lookup_indices.update(fr.Feature.LookupListIndex)
        for li in kern_lookup_indices:
            lookup = gpos.LookupList.Lookup[li]
            for st in lookup.SubTable:
                actual = st
                # Unwrap Extension (lookup type 9) wrappers
                if lookup.LookupType == 9 and hasattr(st, 'ExtSubTable'):
                    actual = st.ExtSubTable
                # PairPos subtables carry a Format attribute.
                if hasattr(actual, 'Format'):
                    _extract_pairpos_subtable(actual, glyph_to_cp, raw_kern)

    font.close()

    # Scale design-unit values to pixels
    scale = ppem / units_per_em
    result = {}  # (leftCp, rightCp) -> adjust
    for (lg, rg), du in raw_kern.items():
        lcp = glyph_to_cp[lg]
        rcp = glyph_to_cp[rg]
        adjust = int(math.floor(du * scale))
        if adjust != 0:
            # Clamp to int8_t so the value fits the emitted matrix type.
            adjust = max(-128, min(127, adjust))
            result[(lcp, rcp)] = adjust
    return result
|
||
|
||
# The ppem used by the existing glyph rasterization:
# face.set_char_size(size << 6, size << 6, 150, 150)
# means size_pt at 150 DPI -> ppem = size * 150 / 72
ppem = size * 150.0 / 72.0

# Collect kerning per source font; each codepoint was assigned to exactly
# one face above, so later updates cannot clobber an earlier face's pairs.
kern_map = {}  # (leftCp, rightCp) -> adjust
for face_idx, cps in face_idx_cps.items():
    font_path = args.fontstack[face_idx]
    kern_map.update(extract_kerning_fonttools(font_path, cps, ppem))

print(f"kerning: {len(kern_map)} pairs extracted", file=sys.stderr)
|
||
|
||
# --- Derive class-based kerning from pairs ---
# Codepoints with identical kerning rows (resp. columns) share a class,
# so the output is a compact class x class matrix plus two per-codepoint
# class lists instead of a full pair table. Class ids start at 1; the
# matrix is indexed with (id - 1).
kern_left_classes = []   # list of (codepoint, classId)
kern_right_classes = []  # list of (codepoint, classId)
kern_matrix = []         # flat list of int8_t values
kern_left_class_count = 0
kern_right_class_count = 0

if kern_map:
    all_left_cps = {lcp for lcp, _ in kern_map}
    all_right_cps = {rcp for _, rcp in kern_map}

    sorted_right_cps = sorted(all_right_cps)
    sorted_left_cps = sorted(all_left_cps)

    # Group left codepoints by identical adjustment row
    left_profile_to_class = {}
    left_class_map = {}
    left_class_id = 1
    for lcp in sorted(all_left_cps):
        row = tuple(kern_map.get((lcp, rcp), 0) for rcp in sorted_right_cps)
        if row not in left_profile_to_class:
            left_profile_to_class[row] = left_class_id
            left_class_id += 1
        left_class_map[lcp] = left_profile_to_class[row]

    # Group right codepoints by identical adjustment column
    right_profile_to_class = {}
    right_class_map = {}
    right_class_id = 1
    for rcp in sorted(all_right_cps):
        col = tuple(kern_map.get((lcp, rcp), 0) for lcp in sorted_left_cps)
        if col not in right_profile_to_class:
            right_profile_to_class[col] = right_class_id
            right_class_id += 1
        right_class_map[rcp] = right_profile_to_class[col]

    kern_left_class_count = left_class_id - 1
    kern_right_class_count = right_class_id - 1

    # The emitted class entries are stored as uint8_t; warn if they
    # cannot represent the class ids.
    if kern_left_class_count > 255 or kern_right_class_count > 255:
        print(f"WARNING: kerning class count exceeds uint8_t range "
              f"(left={kern_left_class_count}, right={kern_right_class_count})",
              file=sys.stderr)

    # Build the class x class matrix
    kern_matrix = [0] * (kern_left_class_count * kern_right_class_count)
    for (lcp, rcp), adjust in kern_map.items():
        lc = left_class_map[lcp] - 1
        rc = right_class_map[rcp] - 1
        kern_matrix[lc * kern_right_class_count + rc] = adjust

    # Build sorted class entry lists
    kern_left_classes = sorted(left_class_map.items())
    kern_right_classes = sorted(right_class_map.items())

    matrix_size = kern_left_class_count * kern_right_class_count
    # 3 bytes per class entry (codepoint + class id) — size estimate only.
    entries_size = (len(kern_left_classes) + len(kern_right_classes)) * 3
    print(f"kerning: {kern_left_class_count} left classes, {kern_right_class_count} right classes, "
          f"{matrix_size + entries_size} bytes", file=sys.stderr)
|
||
|
||
# --- Ligature pair extraction ---
# Parse the OpenType GSUB table for LigatureSubst (type 4) lookups.
# Multi-character ligatures (3+ codepoints) are decomposed into chained
# pairs when an intermediate ligature exists (e.g., ffi = ff + i where ff
# is itself a ligature). Only pairs where both input codepoints and the
# output codepoint are in the generated glyph set are included.

all_codepoints_set = set(all_codepoints)

# Standard Unicode ligature codepoints for known input sequences.
# Used as a fallback when the GSUB substitute glyph has no cmap entry.
STANDARD_LIGATURE_MAP = {
    (0x66, 0x66): 0xFB00,        # ff
    (0x66, 0x69): 0xFB01,        # fi
    (0x66, 0x6C): 0xFB02,        # fl
    (0x66, 0x66, 0x69): 0xFB03,  # ffi
    (0x66, 0x66, 0x6C): 0xFB04,  # ffl
    (0x17F, 0x74): 0xFB05,       # long-s + t
    (0x73, 0x74): 0xFB06,        # st
}
|
||
|
||
def extract_ligatures_fonttools(font_path, codepoints):
    """Extract ligature substitution pairs from a font file using fonttools.

    Returns list of (packed_pair, ligature_codepoint) for the given codepoints.
    Multi-character ligatures are decomposed into chained pairs.
    packed_pair encodes (first_cp << 16) | second_cp.
    """
    font = TTFont(font_path)
    cmap = font.getBestCmap() or {}

    # Build glyph_name -> codepoint and codepoint -> glyph_name maps
    glyph_to_cp = {}
    cp_to_glyph = {}
    for cp, gname in cmap.items():
        glyph_to_cp[gname] = cp
        cp_to_glyph[cp] = gname

    # Collect raw ligature rules: (sequence_of_codepoints) -> ligature_codepoint
    raw_ligatures = {}  # tuple of codepoints -> ligature codepoint

    if 'GSUB' in font:
        gsub = font['GSUB'].table

        # Find lookup indices for ligature features.
        # Currently extracts 'liga' (standard) and 'rlig' (required) only.
        # To also extract discretionary or historical ligatures, add:
        #   'dlig' - Discretionary Ligatures (e.g., ft, st in Bookerly)
        #   'hlig' - Historical Ligatures (e.g., long-s+t in OpenDyslexic)
        # These are off by default in standard text renderers.
        LIGATURE_FEATURES = ('liga', 'rlig')
        liga_lookup_indices = set()
        if gsub.FeatureList:
            for fr in gsub.FeatureList.FeatureRecord:
                if fr.FeatureTag in LIGATURE_FEATURES:
                    liga_lookup_indices.update(fr.Feature.LookupListIndex)

        for li in liga_lookup_indices:
            lookup = gsub.LookupList.Lookup[li]
            for st in lookup.SubTable:
                actual = st
                # Unwrap Extension (lookup type 7) wrappers
                if lookup.LookupType == 7 and hasattr(st, 'ExtSubTable'):
                    actual = st.ExtSubTable
                # LigatureSubst is lookup type 4
                if not hasattr(actual, 'ligatures'):
                    continue
                for first_glyph, ligature_list in actual.ligatures.items():
                    if first_glyph not in glyph_to_cp:
                        continue
                    first_cp = glyph_to_cp[first_glyph]
                    for lig in ligature_list:
                        # lig.Component is a list of subsequent glyph names
                        # lig.LigGlyph is the substitute glyph name
                        component_cps = []
                        valid = True
                        for comp_glyph in lig.Component:
                            if comp_glyph not in glyph_to_cp:
                                valid = False
                                break
                            component_cps.append(glyph_to_cp[comp_glyph])
                        if not valid:
                            continue
                        seq = tuple([first_cp] + component_cps)
                        if lig.LigGlyph in glyph_to_cp:
                            lig_cp = glyph_to_cp[lig.LigGlyph]
                        elif seq in STANDARD_LIGATURE_MAP:
                            lig_cp = STANDARD_LIGATURE_MAP[seq]
                        else:
                            seq_str = ', '.join(f'U+{cp:04X}' for cp in seq)
                            print(f"ligatures: WARNING: dropping ligature ({seq_str}) -> "
                                  f"glyph '{lig.LigGlyph}': output glyph has no cmap entry "
                                  f"and input sequence is not in STANDARD_LIGATURE_MAP",
                                  file=sys.stderr)
                            continue
                        raw_ligatures[seq] = lig_cp

    font.close()

    # Filter: only keep ligatures where all input and output codepoints are
    # in our generated glyph set
    filtered = {}
    for seq, lig_cp in raw_ligatures.items():
        if lig_cp not in codepoints and lig_cp not in all_codepoints_set:
            continue
        if all(cp in codepoints for cp in seq):
            filtered[seq] = lig_cp

    # Decompose into chained pairs
    # For 2-codepoint sequences: direct pair (a, b) -> lig
    # For 3+ codepoint sequences: chain through intermediates
    #   e.g., (f, f, i) -> ffi requires (f, f) -> ff to exist,
    #   then we add (ff, i) -> ffi
    pairs = []
    # First pass: collect all 2-codepoint ligatures
    two_char = {seq: lig_cp for seq, lig_cp in filtered.items() if len(seq) == 2}
    for seq, lig_cp in two_char.items():
        packed = (seq[0] << 16) | seq[1]
        pairs.append((packed, lig_cp))

    # Second pass: decompose 3+ codepoint ligatures into chained pairs
    for seq, lig_cp in filtered.items():
        if len(seq) < 3:
            continue
        # Try to find an intermediate: check if the first N-1 codepoints
        # form a known ligature, then chain (intermediate, last) -> lig
        prefix = seq[:-1]
        last_cp = seq[-1]
        if prefix in filtered:
            intermediate_cp = filtered[prefix]
            packed = (intermediate_cp << 16) | last_cp
            pairs.append((packed, lig_cp))
        else:
            print(f"ligatures: skipping {len(seq)}-char ligature "
                  f"({', '.join(f'U+{cp:04X}' for cp in seq)}) -> U+{lig_cp:04X}: "
                  f"no intermediate ligature for prefix", file=sys.stderr)

    return pairs
|
||
|
||
# Combining marks are excluded from ligature lookup, mirroring the
# kerning codepoint selection above.
ligature_codepoints = set(cp for cp in all_codepoints
                          if not (COMBINING_MARKS_START <= cp <= COMBINING_MARKS_END))

# Map ligature codepoints to the font-stack index that serves them
lig_cp_to_face_idx = {}
for cp in ligature_codepoints:
    for face_idx, f in enumerate(font_stack):
        if f.get_char_index(cp) > 0:
            lig_cp_to_face_idx[cp] = face_idx
            break

# Group by face index
lig_face_idx_cps = {}
for cp, fi in lig_cp_to_face_idx.items():
    lig_face_idx_cps.setdefault(fi, set()).add(cp)

# Extract ligature pairs from each font file in the stack.
ligature_pairs = []
for face_idx, cps in lig_face_idx_cps.items():
    font_path = args.fontstack[face_idx]
    ligature_pairs.extend(extract_ligatures_fonttools(font_path, cps))

# Deduplicate (keep first occurrence) and sort by packed input pair so
# the emitted table can be binary-searched.
seen_lig_keys = set()
unique_ligature_pairs = []
for packed, lig_cp in ligature_pairs:
    if packed not in seen_lig_keys:
        seen_lig_keys.add(packed)
        unique_ligature_pairs.append((packed, lig_cp))
ligature_pairs = sorted(unique_ligature_pairs, key=lambda p: p[0])
print(f"ligatures: {len(ligature_pairs)} pairs extracted", file=sys.stderr)
|
||
|
||
compress = args.compress

# Build groups for compression
if compress:
    # Script-based grouping: glyphs that co-occur in typical text rendering
    # are grouped together for efficient LRU caching on the embedded target.
    # Since glyphs are in codepoint order, glyphs in the same Unicode block
    # are contiguous in the array and form natural groups.
    # BUGFIX: the Vietnamese ranges added to `intervals` were missing here,
    # so those glyphs fell into the catch-all -1 group; list them explicitly
    # to keep grouping consistent with the exported interval table.
    SCRIPT_GROUP_RANGES = [
        (0x0000, 0x007F),  # ASCII
        (0x0080, 0x00FF),  # Latin-1 Supplement
        (0x0100, 0x017F),  # Latin Extended-A
        (0x01A0, 0x01B0),  # Latin Extended-B (Vietnamese Ơ/ơ, Ư/ư)
        (0x0300, 0x036F),  # Combining Diacritical Marks
        (0x0400, 0x04FF),  # Cyrillic
        (0x1EA0, 0x1EF9),  # Vietnamese Extended (precomposed, tone marks)
        (0x2000, 0x206F),  # General Punctuation
        (0x2070, 0x209F),  # Superscripts & Subscripts
        (0x20A0, 0x20CF),  # Currency Symbols
        (0x2190, 0x21FF),  # Arrows
        (0x2200, 0x22FF),  # Math Operators
        (0xFB00, 0xFB06),  # Alphabetic Presentation Forms (ligatures)
        (0xFFFD, 0xFFFD),  # Replacement Character
    ]

    def get_script_group(code_point):
        """Return the SCRIPT_GROUP_RANGES index for code_point, or -1."""
        for i, (start, end) in enumerate(SCRIPT_GROUP_RANGES):
            if start <= code_point <= end:
                return i
        return -1

    # Partition the (codepoint-ordered) glyph array into runs that share
    # the same script group.
    groups = []  # list of (first_glyph_index, glyph_count)
    current_group_id = None
    group_start = 0
    group_count = 0

    for i, (props, packed) in enumerate(all_glyphs):
        sg = get_script_group(props.code_point)
        if sg != current_group_id:
            if group_count > 0:
                groups.append((group_start, group_count))
            current_group_id = sg
            group_start = i
            group_count = 1
        else:
            group_count += 1

    # Flush the trailing run.
    if group_count > 0:
        groups.append((group_start, group_count))

    # Compress each group
    compressed_groups = []  # list of (compressed_bytes, uncompressed_size, glyph_count, first_glyph_index)
    compressed_bitmap_data = []
    compressed_offset = 0

    # Also build modified glyph props with within-group offsets
    modified_glyph_props = list(glyph_props)

    for first_idx, count in groups:
        # Concatenate bitmap data for this group
        group_data = b''
        for gi in range(first_idx, first_idx + count):
            props, packed = all_glyphs[gi]
            # Update glyph's dataOffset to be within-group offset
            within_group_offset = len(group_data)
            old_props = modified_glyph_props[gi]
            modified_glyph_props[gi] = GlyphProps(
                width=old_props.width,
                height=old_props.height,
                advance_x=old_props.advance_x,
                left=old_props.left,
                top=old_props.top,
                data_length=old_props.data_length,
                data_offset=within_group_offset,
                code_point=old_props.code_point,
            )
            group_data += packed

        # Compress with raw DEFLATE (no zlib/gzip header); wbits=-15
        # selects a raw stream, level 9 maximises compression.
        compressor = zlib.compressobj(level=9, wbits=-15)
        compressed = compressor.compress(group_data) + compressor.flush()

        compressed_groups.append((compressed, len(group_data), count, first_idx))
        compressed_bitmap_data.extend(compressed)
        compressed_offset += len(compressed)

    glyph_props = modified_glyph_props
    total_compressed = len(compressed_bitmap_data)
    total_uncompressed = len(glyph_data)
    print(f"// Compression: {total_uncompressed} -> {total_compressed} bytes ({100*total_compressed/total_uncompressed:.1f}%), {len(groups)} groups", file=sys.stderr)
|
||
|
||
# Emit the C header preamble; everything from here on is written to
# stdout and is expected to be redirected into a .h file.
print(f"""/**
 * generated by fontconvert.py
 * name: {font_name}
 * size: {size}
 * mode: {'2-bit' if is2Bit else '1-bit'}{' compressed: true' if compress else ''}
 * Command used: {' '.join(sys.argv)}
 */
#pragma once
#include "EpdFontData.h"
""")

# Bitmap byte array: compressed stream concatenation or raw packed pixels.
if compress:
    print(f"static const uint8_t {font_name}Bitmaps[{len(compressed_bitmap_data)}] = {{")
    for c in chunks(compressed_bitmap_data, 16):
        print (" " + " ".join(f"0x{b:02X}," for b in c))
    print ("};\n");
else:
    print(f"static const uint8_t {font_name}Bitmaps[{len(glyph_data)}] = {{")
    for c in chunks(glyph_data, 16):
        print (" " + " ".join(f"0x{b:02X}," for b in c))
    print ("};\n");
|
||
def cp_label(cp):
    """Human-readable label for a code point, used in generated C comments.

    Printable ASCII (excluding space) is shown literally; backslash gets
    a word label so it cannot terminate or escape the comment; everything
    else is rendered as U+XXXX.
    """
    if cp == 0x5C:
        return '<backslash>'
    if 0x20 < cp < 0x7F:
        return chr(cp)
    return f'U+{cp:04X}'
|
||
|
||
# Glyph metadata table: every GlyphProps field except code_point (g[:-1]),
# with the code point echoed as a trailing comment.
print(f"static const EpdGlyph {font_name}Glyphs[] = {{")
for i, g in enumerate(glyph_props):
    print (" { " + ", ".join([f"{a}" for a in list(g[:-1])]),"},", f"// {cp_label(g.code_point)}")
print ("};\n");

# Interval table: each entry maps an inclusive codepoint range to its
# starting index in the glyph array (offset accumulates interval widths).
print(f"static const EpdUnicodeInterval {font_name}Intervals[] = {{")
offset = 0
for i_start, i_end in intervals:
    print (f" {{ 0x{i_start:X}, 0x{i_end:X}, 0x{offset:X} }},")
    offset += i_end - i_start + 1
print ("};\n");
|
||
|
||
# Compressed-group directory: offset/size of each DEFLATE stream plus the
# glyph range it covers.
if compress:
    print(f"static const EpdFontGroup {font_name}Groups[] = {{")
    compressed_offset = 0
    for compressed, uncompressed_size, count, first_idx in compressed_groups:
        print(f" {{ {compressed_offset}, {len(compressed)}, {uncompressed_size}, {count}, {first_idx} }},")
        compressed_offset += len(compressed)
    print("};\n")

# Kerning tables: per-codepoint class entries plus the class x class
# adjustment matrix (int8_t, row per left class).
if kern_map:
    print(f"static const EpdKernClassEntry {font_name}KernLeftClasses[] = {{")
    for cp, cls in kern_left_classes:
        print(f" {{ 0x{cp:04X}, {cls} }}, // {cp_label(cp)}")
    print("};\n")

    print(f"static const EpdKernClassEntry {font_name}KernRightClasses[] = {{")
    for cp, cls in kern_right_classes:
        print(f" {{ 0x{cp:04X}, {cls} }}, // {cp_label(cp)}")
    print("};\n")

    print(f"static const int8_t {font_name}KernMatrix[] = {{")
    for row in range(kern_left_class_count):
        row_start = row * kern_right_class_count
        row_vals = kern_matrix[row_start:row_start + kern_right_class_count]
        print(" " + ", ".join(f"{v:4d}" for v in row_vals) + ",")
    print("};\n")

# Ligature pairs: packed (first << 16 | second) input pair -> output cp.
if ligature_pairs:
    print(f"static const EpdLigaturePair {font_name}LigaturePairs[] = {{")
    for packed_pair, lig_cp in ligature_pairs:
        print(f" {{ 0x{packed_pair:08X}, 0x{lig_cp:04X} }}, // {cp_label(packed_pair >> 16)} {cp_label(packed_pair & 0xFFFF)} -> {cp_label(lig_cp)}")
    print("};\n")

# Top-level font descriptor. Global metrics come from `face`, the face
# last loaded via load_glyph(ord('|')) above. NOTE(review): field order
# assumed to match the EpdFontData struct — confirm against EpdFontData.h.
print(f"static const EpdFontData {font_name} = {{")
print(f" {font_name}Bitmaps,")
print(f" {font_name}Glyphs,")
print(f" {font_name}Intervals,")
print(f" {len(intervals)},")
print(f" {norm_ceil(face.size.height)},")
print(f" {norm_ceil(face.size.ascender)},")
print(f" {norm_floor(face.size.descender)},")
print(f" {'true' if is2Bit else 'false'},")
if compress:
    print(f" {font_name}Groups,")
    print(f" {len(compressed_groups)},")
else:
    print(f" nullptr,")
    print(f" 0,")
if kern_map:
    print(f" {font_name}KernLeftClasses,")
    print(f" {font_name}KernRightClasses,")
    print(f" {font_name}KernMatrix,")
    print(f" {len(kern_left_classes)},")
    print(f" {len(kern_right_classes)},")
    print(f" {kern_left_class_count},")
    print(f" {kern_right_class_count},")
else:
    print(f" nullptr,")
    print(f" nullptr,")
    print(f" nullptr,")
    print(f" 0,")
    print(f" 0,")
    print(f" 0,")
    print(f" 0,")
if ligature_pairs:
    print(f" {font_name}LigaturePairs,")
    print(f" {len(ligature_pairs)},")
else:
    print(f" nullptr,")
    print(f" 0,")
print("};")
|