fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers

Made-with: Cursor
This commit is contained in:
cottongin
2026-04-06 17:04:40 -04:00
parent 49acf09aa1
commit 807ab8610d
2 changed files with 81 additions and 3 deletions

View File

@@ -1,7 +1,9 @@
import json
import os
import re
from datetime import date
from bs4 import BeautifulSoup
from ebooklib import epub
from src.models import Article, Image
@@ -11,11 +13,51 @@ body { font-family: serif; margin: 1em; line-height: 1.5; }
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
img { max-width: 100%; display: block; margin: 0.5em auto; }
figcaption { font-size: 0.8em; text-align: center; color: #555; }
img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
margin: 0; padding: 0;
}
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
"""
def _cleanup_html(html: str) -> str:
    """Clean up WordPress HTML for ePub: remove empty elements, tighten image spacing.

    Three passes run, in order, over the parsed markup:

    1. ``<p>`` elements with no visible text and no element children other
       than ``<br>`` are deleted.
    2. A ``<br>`` that is the immediate previous or next sibling of an
       ``<img>`` is deleted.
    3. ``<figure>`` and ``<div>`` wrappers are deleted when they contain no
       image, no other embedded media and no text.  Wrappers that contain
       an image but no ``<figcaption>`` and no other text-bearing child
       element are unwrapped, leaving their children in the parent.

    Args:
        html: HTML fragment to clean.

    Returns:
        The cleaned markup serialized back to a string.
    """
    # Non-image embedded content.  A wrapper holding one of these has no
    # text, but it is not empty and must not be thrown away.
    embedded_tags = [
        "audio", "canvas", "embed", "iframe", "math", "object", "svg", "video",
    ]

    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: drop paragraphs that render as nothing.
    for p in soup.find_all("p"):
        # strip=True already removes non-breaking spaces (str.strip treats
        # U+00A0 as whitespace), so no separate "\xa0" comparison is needed.
        if p.get_text(strip=True):
            continue
        # Keep the paragraph if it carries any element other than <br>
        # (e.g. an <img>); text nodes have no tag name and are ignored.
        if not any(c.name and c.name != "br" for c in p.children):
            p.decompose()

    # Pass 2: remove line breaks sitting directly against an image.
    for img in soup.find_all("img"):
        for sibling in (img.previous_sibling, img.next_sibling):
            if sibling and getattr(sibling, "name", None) == "br":
                sibling.decompose()

    # Pass 3: delete empty wrappers and unwrap bare image wrappers.
    for tag_name in ("figure", "div"):
        for wrapper in soup.find_all(tag_name):
            # find_all() returned a snapshot list, so a nested wrapper may
            # already have been destroyed together with its ancestor.
            if getattr(wrapper, "decomposed", False):
                continue

            if wrapper.find("img") is None:
                has_media = wrapper.find(embedded_tags) is not None
                if not has_media and not wrapper.get_text(strip=True):
                    wrapper.decompose()
                continue

            # A captioned image keeps its wrapper so the caption stays
            # grouped with the picture.
            if wrapper.find("figcaption") is not None:
                continue

            has_other_content = any(
                c.name
                and c.name not in ("img", "figcaption", "br")
                and c.get_text(strip=True)
                for c in wrapper.children
            )
            if not has_other_content:
                wrapper.unwrap()

    return str(soup)
def build_epub(
week_start: date,
week_end: date,
@@ -68,6 +110,7 @@ def build_epub(
chapter_html += f'<p class="categories">{cat_str}</p>\n'
content = article.content_html
content = _cleanup_html(content)
article_images = Image.query.filter_by(article_id=article.id).all()
for img_record in article_images: