def _cleanup_html(html: str) -> str:
    """Clean up WordPress HTML for ePub output.

    Three passes are run over the parsed document:

    1. Paragraphs that contain no text and no element other than ``<br>``
       are removed.
    2. A ``<br>`` directly before or after an ``<img>`` is removed so the
       image is not padded by blank lines. Whitespace-only text between
       the two tags is ignored when looking for the ``<br>``.
    3. ``<figure>`` and ``<div>`` wrappers are simplified: a wrapper with
       no image and no text is removed; a wrapper that holds an image but
       no text and no ``<figcaption>`` is unwrapped, leaving its children
       in place.

    Args:
        html: HTML fragment to clean.

    Returns:
        The cleaned markup, serialized back to a string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: drop empty paragraphs.
    for p in soup.find_all("p"):
        # get_text(strip=True) strips every string with str.strip(), which
        # already removes non-breaking spaces, so no separate "\xa0" check
        # is needed.
        if p.get_text(strip=True):
            continue
        # Keep paragraphs that carry an element (e.g. an image) even though
        # they have no text; a lone <br> does not count as content.
        if not any(c.name and c.name != "br" for c in p.children):
            p.decompose()

    # Pass 2: remove a <br> hugging either side of an image.
    for img in soup.find_all("img"):
        neighbours = []
        for direction in ("previous_sibling", "next_sibling"):
            sibling = getattr(img, direction)
            # The parser keeps the whitespace between tags as text nodes;
            # step over them so "<img>\n<br>" is handled like "<img><br>".
            while isinstance(sibling, str) and not sibling.strip():
                sibling = getattr(sibling, direction)
            neighbours.append(sibling)
        # Both neighbours are resolved before anything is removed so that
        # removing one cannot change the lookup of the other.
        for sibling in neighbours:
            if sibling is not None and getattr(sibling, "name", None) == "br":
                sibling.decompose()

    # Pass 3: simplify <figure>/<div> wrappers.
    # NOTE(review): a wrapper holding only non-text, non-image content
    # (e.g. <iframe>, <video>) is treated as empty and removed -- confirm
    # this is intended for the ePub output.
    for tag_name in ("figure", "div"):
        for wrapper in soup.find_all(tag_name):
            # find_all() returned a snapshot; a wrapper nested inside one
            # that was removed earlier in this loop is already destroyed.
            if wrapper.decomposed:
                continue

            has_text = bool(wrapper.get_text(strip=True))

            if wrapper.find("img") is None:
                if not has_text:
                    wrapper.decompose()
                continue

            # Only unwrap when the wrapper adds nothing besides the image.
            # Checking all text (not just child tags) keeps wrappers that
            # hold loose text next to the image, so that text is not
            # spilled into the parent element.
            if not has_text and wrapper.find("figcaption") is None:
                wrapper.unwrap()

    return str(soup)

{cat_str}

\n' content = article.content_html + content = _cleanup_html(content) article_images = Image.query.filter_by(article_id=article.id).all() for img_record in article_images: diff --git a/tests/test_epub_builder.py b/tests/test_epub_builder.py index 8a8b5e1..bc45bb2 100644 --- a/tests/test_epub_builder.py +++ b/tests/test_epub_builder.py @@ -4,7 +4,7 @@ from datetime import datetime, date from PIL import Image as PILImage from src.models import Article, Image -from src.epub_builder import build_epub +from src.epub_builder import _cleanup_html, build_epub def _create_test_image(path): @@ -100,3 +100,38 @@ def test_build_epub_respects_article_order(app, db, tmp_path): assert titles[0] == "Earlier Article" assert titles[1] == "Later Article" + + +def test_cleanup_removes_empty_paragraphs(): + html = '

Real content.

 


More.

' + result = _cleanup_html(html) + assert "

Real content.

" in result + assert "

More.

" in result + assert result.count("