fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers

Made-with: Cursor
2026-04-06 17:04:40 -04:00
parent 49acf09aa1
commit 807ab8610d
2 changed files with 81 additions and 3 deletions
--- a/src/epub_builder.py
+++ b/src/epub_builder.py
@@ -1,7 +1,9 @@
 import json
 import os
+import re
 from datetime import date

+from bs4 import BeautifulSoup
 from ebooklib import epub

 from src.models import Article, Image
@@ -11,11 +13,51 @@ body { font-family: serif; margin: 1em; line-height: 1.5; }
 h1 { font-size: 1.4em; margin-bottom: 0.3em; }
 .byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
 .categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
-img { max-width: 100%; display: block; margin: 0.5em auto; }
-figcaption { font-size: 0.8em; text-align: center; color: #555; }
+img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
+figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
+    margin: 0; padding: 0;
+}
+figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
 """


+def _cleanup_html(html: str) -> str:
+    """Clean up WordPress HTML for ePub: remove empty elements, tighten image spacing.
+
+    Drops <p> with no text and no child tag other than <br>; drops a <br> next to an
+    <img> (whitespace-only text in between is skipped); deletes <figure>/<div> holding
+    neither text nor embedded media; unwraps text-less, caption-less image wrappers.
+    """
+    embedded = ["svg", "video", "audio", "iframe", "object", "embed", "canvas"]
+    soup = BeautifulSoup(html, "html.parser")
+
+    for p in soup.find_all("p"):
+        if not p.get_text(strip=True):  # strip=True also discards "\xa0" (&nbsp;)
+            if all(getattr(c, "name", None) in (None, "br") for c in p.children):
+                p.decompose()
+
+    for img in soup.find_all("img"):
+        for side in ("previous_sibling", "next_sibling"):
+            node = getattr(img, side)
+            # "<br />\n<img>" puts a whitespace text node in between: step over it.
+            while isinstance(node, str) and not node.strip():
+                node = getattr(node, side)
+            if getattr(node, "name", None) == "br":
+                node.decompose()
+
+    for tag_name in ["figure", "div"]:
+        for wrapper in soup.find_all(tag_name):
+            has_text = bool(wrapper.get_text(strip=True))
+            if not wrapper.find("img"):
+                # A text-less wrapper may still carry media (video, iframe, svg, ...).
+                if not has_text and not wrapper.find(embedded):
+                    wrapper.decompose()
+            elif not has_text and not wrapper.find("figcaption"):
+                wrapper.unwrap()  # only images inside: the wrapper just adds spacing
+
+    return str(soup)
+
+
 def build_epub(
    week_start: date,
    week_end: date,
@@ -68,6 +110,7 @@ def build_epub(
            chapter_html += f'<p class="categories">{cat_str}</p>\n'

        content = article.content_html
+        content = _cleanup_html(content)
        article_images = Image.query.filter_by(article_id=article.id).all()

        for img_record in article_images:
--- a/tests/test_epub_builder.py
+++ b/tests/test_epub_builder.py
@@ -4,7 +4,7 @@ from datetime import datetime, date
 from PIL import Image as PILImage

 from src.models import Article, Image
-from src.epub_builder import build_epub
+from src.epub_builder import _cleanup_html, build_epub


 def _create_test_image(path):
@@ -100,3 +100,38 @@ def test_build_epub_respects_article_order(app, db, tmp_path):

        assert titles[0] == "Earlier Article"
        assert titles[1] == "Later Article"
+
+
+def test_cleanup_removes_empty_paragraphs():
+    """Paragraphs holding nothing, whitespace, &nbsp; or a lone <br> are dropped."""
+    markup = '<p>Real content.</p><p></p><p>&nbsp;</p><p> </p><p><br></p><p>More.</p>'
+    cleaned = _cleanup_html(markup)
+    assert "<p>Real content.</p>" in cleaned and "<p>More.</p>" in cleaned
+    assert cleaned.count("<p") == 2
+
+
+def test_cleanup_removes_br_near_images():
+    """A <br> directly before or after an <img> is removed; the image stays."""
+    cleaned = _cleanup_html('<p>Text</p><br><img src="test.jpg"><br/><p>More</p>')
+    assert "<br" not in cleaned
+    assert '<img src="test.jpg"' in cleaned
+
+
+def test_cleanup_collapses_image_wrappers():
+    """Image-only <div>/<figure> wrappers are unwrapped, leaving just the <img>."""
+    result = _cleanup_html('<div><figure><img src="test.jpg"></figure></div>')
+    assert '<img src="test.jpg"' in result
+    assert "<figure" not in result and "<div" not in result
+
+
+def test_cleanup_preserves_figcaption():
+    """A <figure> with a <figcaption> keeps its wrapper, image and caption."""
+    html = '<figure><img src="test.jpg"><figcaption>Caption</figcaption></figure>'
+    result = _cleanup_html(html)
+    assert '<figure><img src="test.jpg"' in result and "<figcaption>Caption</figcaption>" in result
+
+
+def test_cleanup_removes_empty_divs():
+    """Empty and whitespace-only <div> elements vanish; real content is untouched."""
+    cleaned = _cleanup_html('<p>Content</p><div>  </div><div></div><p>More</p>')
+    assert cleaned == "<p>Content</p><p>More</p>"