# File: pi-weekly-newspaper/tests/test_epub_builder.py
# 141 lines, 4.8 KiB, Python

import json
import os
from datetime import datetime, date
from PIL import Image as PILImage
from src.models import Article, Image
from src.epub_builder import _cleanup_html, build_epub
def _create_test_image(path):
    """Write a small solid-green 800x450 JPEG to *path*, creating parent dirs."""
    parent_dir = os.path.dirname(path)
    os.makedirs(parent_dir, exist_ok=True)
    PILImage.new("RGB", (800, 450), color="green").save(path, format="JPEG")
def test_build_epub_creates_file(app, db, tmp_path):
    """build_epub should emit a non-empty .epub file for a two-article issue."""
    with app.app_context():
        image_file = str(tmp_path / "images" / "abc123.jpg")
        _create_test_image(image_file)

        first = Article(
            guid="g1", title="First Article", author="Author A",
            pub_date=datetime(2026, 4, 6, 10, 0),
            categories=json.dumps(["Government"]),
            link="http://example.com/1",
            content_html=f'<p>Content one.</p><img src="{image_file}" />',
        )
        second = Article(
            guid="g2", title="Second Article", author="Author B",
            pub_date=datetime(2026, 4, 7, 10, 0),
            categories=json.dumps(["Culture Calendar"]),
            link="http://example.com/2",
            content_html="<p>Content two.</p>",
        )
        db.session.add_all([first, second])
        # Flush so the articles get ids before the Image row references them.
        db.session.flush()
        db.session.add(Image(
            article_id=first.id, original_url="https://example.com/photo.jpg",
            local_path=image_file, width=800, height=450,
        ))
        db.session.commit()

        cover_file = str(tmp_path / "cover.jpg")
        PILImage.new("RGB", (800, 480), color="white").save(cover_file, format="JPEG")

        result_path = build_epub(
            week_start=date(2026, 4, 6),
            week_end=date(2026, 4, 12),
            article_ids=[first.id, second.id],
            cover_path=cover_file,
            output_dir=str(tmp_path / "issues"),
        )

        assert os.path.exists(result_path)
        assert result_path.endswith(".epub")
        assert os.path.getsize(result_path) > 0
def test_build_epub_respects_article_order(app, db, tmp_path):
    """Chapters must appear in chronological pub_date order, not insertion order."""
    with app.app_context():
        later = Article(
            guid="g1", title="Later Article", author="A",
            pub_date=datetime(2026, 4, 8, 10, 0),
            categories="[]", link="http://a", content_html="<p>Later</p>",
        )
        earlier = Article(
            guid="g2", title="Earlier Article", author="B",
            pub_date=datetime(2026, 4, 6, 10, 0),
            categories="[]", link="http://b", content_html="<p>Earlier</p>",
        )
        db.session.add_all([later, earlier])
        db.session.commit()

        cover_file = str(tmp_path / "cover.jpg")
        PILImage.new("RGB", (800, 480)).save(cover_file, format="JPEG")

        epub_path = build_epub(
            week_start=date(2026, 4, 6),
            week_end=date(2026, 4, 12),
            article_ids=[later.id, earlier.id],
            cover_path=cover_file,
            output_dir=str(tmp_path / "issues"),
        )

        from ebooklib import epub as epublib
        import warnings
        with warnings.catch_warnings():
            # ebooklib warns about NCX handling on newer versions; keep output quiet.
            warnings.filterwarnings("ignore", category=FutureWarning, module="ebooklib")
            book = epublib.read_epub(epub_path, options={'ignore_ncx': True})

        chapters = [
            book.get_item_with_id(spine_id)
            for spine_id, _ in book.spine
            if spine_id != "nav"
        ]
        found_titles = []
        for chapter in chapters:
            if chapter and b"<h1>" in chapter.get_content():
                markup = chapter.get_content().decode("utf-8")
                begin = markup.index("<h1>") + len("<h1>")
                found_titles.append(markup[begin:markup.index("</h1>")])

        assert found_titles[0] == "Earlier Article"
        assert found_titles[1] == "Later Article"
def test_cleanup_removes_empty_paragraphs():
    """Paragraphs with no real text (blank, &nbsp;, lone <br>) are stripped."""
    markup = '<p>Real content.</p><p></p><p>&nbsp;</p><p> </p><p><br></p><p>More.</p>'
    cleaned = _cleanup_html(markup)
    assert "<p>Real content.</p>" in cleaned
    assert "<p>More.</p>" in cleaned
    # Only the two non-empty paragraphs should survive.
    assert cleaned.count("<p") == 2
def test_cleanup_removes_br_near_images():
    """<br> tags adjacent to images are dropped while the image survives."""
    markup = '<p>Text</p><br><img src="test.jpg"><br/><p>More</p>'
    cleaned = _cleanup_html(markup)
    assert "<br" not in cleaned
    assert '<img src="test.jpg"' in cleaned
def test_cleanup_collapses_image_wrappers():
    """Redundant div/figure wrappers around a bare image are collapsed."""
    markup = '<div><figure><img src="test.jpg"></figure></div>'
    cleaned = _cleanup_html(markup)
    assert '<img src="test.jpg"' in cleaned
    # A <figure> may remain only if it actually carries a caption.
    assert "<figure" not in cleaned or "<figcaption" in cleaned
def test_cleanup_preserves_figcaption():
    """A figure with a caption keeps both the image and the caption text."""
    markup = '<figure><img src="test.jpg"><figcaption>Caption</figcaption></figure>'
    cleaned = _cleanup_html(markup)
    assert '<img src="test.jpg"' in cleaned
    assert "Caption" in cleaned
def test_cleanup_removes_empty_divs():
    """Divs containing only whitespace or nothing at all are removed entirely."""
    markup = '<p>Content</p><div> </div><div></div><p>More</p>'
    cleaned = _cleanup_html(markup)
    assert cleaned.count("<div") == 0