fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers

Made-with: Cursor
This commit is contained in:
cottongin
2026-04-06 17:04:40 -04:00
parent 49acf09aa1
commit 807ab8610d
2 changed files with 81 additions and 3 deletions

View File

@@ -1,7 +1,9 @@
import json import json
import os import os
import re
from datetime import date from datetime import date
from bs4 import BeautifulSoup
from ebooklib import epub from ebooklib import epub
from src.models import Article, Image from src.models import Article, Image
@@ -11,11 +13,51 @@ body { font-family: serif; margin: 1em; line-height: 1.5; }
h1 { font-size: 1.4em; margin-bottom: 0.3em; } h1 { font-size: 1.4em; margin-bottom: 0.3em; }
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; } .byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; } .categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
img { max-width: 100%; display: block; margin: 0.5em auto; } img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
figcaption { font-size: 0.8em; text-align: center; color: #555; } figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
margin: 0; padding: 0;
}
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
""" """
def _adjacent_br(node, direction):
    """Return the ``<br>`` neighbouring *node* on one side, or ``None``.

    *direction* is ``"previous_sibling"`` or ``"next_sibling"``.
    Whitespace-only text nodes are stepped over: the parser keeps the
    newlines and indentation between tags as sibling strings, so a ``<br>``
    separated from the image by a line break would otherwise be missed.
    """
    sibling = getattr(node, direction)
    while (
        sibling is not None
        and getattr(sibling, "name", None) is None
        and not str(sibling).strip()
    ):
        sibling = getattr(sibling, direction)
    if sibling is not None and getattr(sibling, "name", None) == "br":
        return sibling
    return None


def _cleanup_html(html: str) -> str:
    """Clean up WordPress HTML for ePub and return the serialized markup.

    Three passes are applied to the parsed document:

    1. ``<p>`` elements with no text and no element children other than
       ``<br>`` are removed.
    2. The ``<br>`` closest to each ``<img>`` on either side is removed,
       ignoring whitespace-only text in between.
    3. ``<figure>`` and ``<div>`` wrappers are removed when they contain
       neither an image nor text, and unwrapped (children kept) when they
       contain an image but no ``<figcaption>`` and no other element with
       text.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: drop paragraphs that render as blank lines.
    for p in soup.find_all("p"):
        # get_text(strip=True) strips non-breaking spaces too, so an
        # "&nbsp;"-only paragraph already yields an empty string here.
        if p.get_text(strip=True):
            continue
        has_element = any(
            getattr(c, "name", None) and c.name != "br" for c in p.children
        )
        if not has_element:
            p.decompose()

    # Pass 2: remove the line break hugging each image on either side.
    for img in soup.find_all("img"):
        for direction in ("previous_sibling", "next_sibling"):
            br = _adjacent_br(img, direction)
            if br is not None:
                br.decompose()

    # Pass 3: figures first, then divs, so that a <div> wrapping only a
    # caption-less <figure> is seen after the figure has been unwrapped.
    for tag_name in ["figure", "div"]:
        for wrapper in soup.find_all(tag_name):
            if not wrapper.find_all("img"):
                # No image inside: delete the wrapper only if it is blank.
                if not wrapper.get_text(strip=True):
                    wrapper.decompose()
                continue
            figcaption = wrapper.find("figcaption")
            other_content = [
                c for c in wrapper.children
                if getattr(c, "name", None)
                and c.name not in ("img", "figcaption", "br")
                and c.get_text(strip=True)
            ]
            # Image-only wrapper: keep the children, drop the wrapper itself.
            if not other_content and not figcaption:
                wrapper.unwrap()

    return str(soup)
def build_epub( def build_epub(
week_start: date, week_start: date,
week_end: date, week_end: date,
@@ -68,6 +110,7 @@ def build_epub(
chapter_html += f'<p class="categories">{cat_str}</p>\n' chapter_html += f'<p class="categories">{cat_str}</p>\n'
content = article.content_html content = article.content_html
content = _cleanup_html(content)
article_images = Image.query.filter_by(article_id=article.id).all() article_images = Image.query.filter_by(article_id=article.id).all()
for img_record in article_images: for img_record in article_images:

View File

@@ -4,7 +4,7 @@ from datetime import datetime, date
from PIL import Image as PILImage from PIL import Image as PILImage
from src.models import Article, Image from src.models import Article, Image
from src.epub_builder import build_epub from src.epub_builder import _cleanup_html, build_epub
def _create_test_image(path): def _create_test_image(path):
@@ -100,3 +100,38 @@ def test_build_epub_respects_article_order(app, db, tmp_path):
assert titles[0] == "Earlier Article" assert titles[0] == "Earlier Article"
assert titles[1] == "Later Article" assert titles[1] == "Later Article"
def test_cleanup_removes_empty_paragraphs():
    """Blank, NBSP-only, space-only and <br>-only paragraphs are dropped."""
    markup = '<p>Real content.</p><p></p><p>&nbsp;</p><p> </p><p><br></p><p>More.</p>'
    cleaned = _cleanup_html(markup)
    # Only the two paragraphs carrying text may remain.
    assert cleaned.count("<p") == 2
    assert "<p>Real content.</p>" in cleaned
    assert "<p>More.</p>" in cleaned
def test_cleanup_removes_br_near_images():
    """Line breaks directly before and after an image are stripped."""
    markup = '<p>Text</p><br><img src="test.jpg"><br/><p>More</p>'
    cleaned = _cleanup_html(markup)
    # The image itself must survive the cleanup.
    assert '<img src="test.jpg"' in cleaned
    assert "<br" not in cleaned
def test_cleanup_collapses_image_wrappers():
    """An image wrapped only in caption-less <div>/<figure> is left bare."""
    html = '<div><figure><img src="test.jpg"></figure></div>'
    result = _cleanup_html(html)
    assert '<img src="test.jpg"' in result
    # The input has no <figcaption>, so both wrappers must be unwrapped;
    # assert each one directly rather than through a disjunction.
    assert "<figure" not in result
    assert "<div" not in result
def test_cleanup_preserves_figcaption():
    """A captioned figure keeps its wrapper, image and caption element."""
    html = '<figure><img src="test.jpg"><figcaption>Caption</figcaption></figure>'
    result = _cleanup_html(html)
    assert '<img src="test.jpg"' in result
    # Check the elements, not just the caption text: the text alone would
    # also survive if the figure or figcaption had been unwrapped.
    assert "<figcaption>Caption</figcaption>" in result
    assert "<figure" in result
def test_cleanup_removes_empty_divs():
    """Divs holding nothing, or only whitespace, are removed entirely."""
    markup = '<p>Content</p><div> </div><div></div><p>More</p>'
    cleaned = _cleanup_html(markup)
    div_count = cleaned.count("<div")
    assert div_count == 0