fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers
Made-with: Cursor
This commit is contained in:
@@ -1,7 +1,9 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
|
|
||||||
from src.models import Article, Image
|
from src.models import Article, Image
|
||||||
@@ -11,11 +13,51 @@ body { font-family: serif; margin: 1em; line-height: 1.5; }
|
|||||||
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
|
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
|
||||||
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
|
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
|
||||||
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
|
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
|
||||||
img { max-width: 100%; display: block; margin: 0.5em auto; }
|
img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
|
||||||
figcaption { font-size: 0.8em; text-align: center; color: #555; }
|
figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
|
||||||
|
margin: 0; padding: 0;
|
||||||
|
}
|
||||||
|
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _adjacent_br(img, direction: str):
    """Return the ``<br>`` next to *img* in *direction*, or ``None``.

    *direction* is ``"previous_sibling"`` or ``"next_sibling"``.
    Whitespace-only text nodes (for example the newline between two tags)
    are skipped; any other tag or non-blank text ends the search.
    """
    node = getattr(img, direction)
    while node is not None:
        name = getattr(node, "name", None)
        if name == "br":
            return node
        if name is not None or str(node).strip():
            # A different tag or real text sits between the image and any
            # further <br>, so that <br> is not adjacent to the image.
            return None
        node = getattr(node, direction)
    return None


def _cleanup_html(html: str) -> str:
    """Clean up WordPress HTML for ePub: remove empty elements, tighten image spacing.

    Three passes are applied, in order:

    1. ``<p>`` elements with no text and no child tags other than ``<br>``
       are removed.
    2. The ``<br>`` immediately before and after each ``<img>`` is removed,
       looking past whitespace-only text between the two tags.
    3. ``<figure>`` and ``<div>`` wrappers without any image and without
       text are removed; wrappers that do contain an image are unwrapped
       (children kept in place) unless they contain a ``<figcaption>`` or a
       direct child tag, other than ``img``/``figcaption``/``br``, that
       carries text.

    Args:
        html: The HTML fragment to clean.

    Returns:
        The cleaned fragment serialized back to a string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: drop paragraphs that render as nothing but vertical space.
    for p in soup.find_all("p"):
        text = p.get_text(strip=True)
        if text and text != "\xa0":
            continue
        # Keep paragraphs that hold a tag other than <br> (e.g. an <img>).
        has_real_child = any(c.name and c.name != "br" for c in p.children)
        if not has_real_child:
            p.decompose()

    # Pass 2: drop line breaks hugging an image.
    for img in soup.find_all("img"):
        # Resolve both neighbours before mutating the tree.
        breaks = [
            _adjacent_br(img, "previous_sibling"),
            _adjacent_br(img, "next_sibling"),
        ]
        for br in breaks:
            if br is not None:
                br.decompose()

    # Pass 3: figures first, then divs, so a <div> that only wrapped a
    # caption-less <figure> can itself be collapsed afterwards.
    for tag_name in ("figure", "div"):
        for wrapper in soup.find_all(tag_name):
            if wrapper.find("img") is None:
                text = wrapper.get_text(strip=True)
                if not text or text == "\xa0":
                    wrapper.decompose()
                continue

            if wrapper.find("figcaption") is not None:
                # A captioned image keeps its wrapper.
                continue

            other_content = [
                c for c in wrapper.children
                if c.name
                and c.name not in ("img", "figcaption", "br")
                and c.get_text(strip=True)
            ]
            if not other_content:
                wrapper.unwrap()

    return str(soup)
|
||||||
|
|
||||||
|
|
||||||
def build_epub(
|
def build_epub(
|
||||||
week_start: date,
|
week_start: date,
|
||||||
week_end: date,
|
week_end: date,
|
||||||
@@ -68,6 +110,7 @@ def build_epub(
|
|||||||
chapter_html += f'<p class="categories">{cat_str}</p>\n'
|
chapter_html += f'<p class="categories">{cat_str}</p>\n'
|
||||||
|
|
||||||
content = article.content_html
|
content = article.content_html
|
||||||
|
content = _cleanup_html(content)
|
||||||
article_images = Image.query.filter_by(article_id=article.id).all()
|
article_images = Image.query.filter_by(article_id=article.id).all()
|
||||||
|
|
||||||
for img_record in article_images:
|
for img_record in article_images:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from datetime import datetime, date
|
|||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
from src.models import Article, Image
|
from src.models import Article, Image
|
||||||
from src.epub_builder import build_epub
|
from src.epub_builder import _cleanup_html, build_epub
|
||||||
|
|
||||||
|
|
||||||
def _create_test_image(path):
|
def _create_test_image(path):
|
||||||
@@ -100,3 +100,38 @@ def test_build_epub_respects_article_order(app, db, tmp_path):
|
|||||||
|
|
||||||
assert titles[0] == "Earlier Article"
|
assert titles[0] == "Earlier Article"
|
||||||
assert titles[1] == "Later Article"
|
assert titles[1] == "Later Article"
|
||||||
|
|
||||||
|
|
||||||
|
def test_cleanup_removes_empty_paragraphs():
    """Empty, blank, nbsp-only and <br>-only paragraphs are removed."""
    # The non-breaking-space cases are written with an explicit escape and
    # an explicit entity so they cannot be confused with a plain space.
    html = (
        "<p>Real content.</p>"
        "<p></p>"
        "<p> </p>"
        "<p>\xa0</p>"
        "<p>&nbsp;</p>"
        "<p><br></p>"
        "<p>More.</p>"
    )
    result = _cleanup_html(html)
    assert "<p>Real content.</p>" in result
    assert "<p>More.</p>" in result
    # Only the two paragraphs with real text may survive.
    assert result.count("<p") == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_cleanup_removes_br_near_images():
    """Line breaks directly around an image are stripped; the image stays."""
    source = '<p>Text</p><br><img src="test.jpg"><br/><p>More</p>'
    cleaned = _cleanup_html(source)
    # The image itself must be kept ...
    assert '<img src="test.jpg"' in cleaned
    # ... while both neighbouring <br> tags are gone.
    assert "<br" not in cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def test_cleanup_collapses_image_wrappers():
    """A caption-less image loses both its <figure> and its <div> wrapper."""
    html = '<div><figure><img src="test.jpg"></figure></div>'
    result = _cleanup_html(html)
    assert '<img src="test.jpg"' in result
    # The input has no <figcaption>, so both wrappers must be unwrapped.
    assert "<figure" not in result
    assert "<div" not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_cleanup_preserves_figcaption():
    """A captioned image keeps its <figure> wrapper and its <figcaption>."""
    html = '<figure><img src="test.jpg"><figcaption>Caption</figcaption></figure>'
    result = _cleanup_html(html)
    assert '<img src="test.jpg"' in result
    # Check the elements, not just the caption text, so an accidental
    # unwrap of the figure or the figcaption is caught.
    assert "<figure" in result
    assert "<figcaption>Caption</figcaption>" in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_cleanup_removes_empty_divs():
    """Empty, blank and nbsp-only <div>s are removed; real content is kept."""
    html = "<p>Content</p><div> </div><div>\xa0</div><div></div><p>More</p>"
    result = _cleanup_html(html)
    assert result.count("<div") == 0
    # Removing the empty wrappers must not touch the neighbouring text.
    assert "<p>Content</p>" in result
    assert "<p>More</p>" in result
|
||||||
|
|||||||
Reference in New Issue
Block a user