tests/test_epub_builder.py

import json
import os
from datetime import datetime, date
from PIL import Image as PILImage

from src.models import Article, Image
from src.epub_builder import _cleanup_html, build_epub


def _create_test_image(path):
    """Write a solid-green 800x450 RGB JPEG to ``path``.

    The parent directory is created when it does not exist yet. A bare
    filename (no directory component) is written as-is, without trying to
    create a directory.

    Args:
        path: Destination file path as a string.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    if parent:
        os.makedirs(parent, exist_ok=True)
    img = PILImage.new("RGB", (800, 450), color="green")
    img.save(path, format="JPEG")


def test_build_epub_creates_file(app, db, tmp_path):
    """build_epub returns the path of a non-empty ``.epub`` file on disk.

    Two articles are stored: the first embeds a JPEG saved under
    ``tmp_path`` (also registered as an Image row), the second is text only.
    """
    with app.app_context():
        # Files on disk: the article image and the cover.
        img_path = str(tmp_path / "images" / "abc123.jpg")
        _create_test_image(img_path)
        cover_path = str(tmp_path / "cover.jpg")
        PILImage.new("RGB", (800, 480), color="white").save(
            cover_path, format="JPEG"
        )

        first = Article(
            guid="g1",
            title="First Article",
            author="Author A",
            pub_date=datetime(2026, 4, 6, 10, 0),
            categories=json.dumps(["Government"]),
            link="http://example.com/1",
            content_html=f'<p>Content one.</p><img src="{img_path}" />',
        )
        second = Article(
            guid="g2",
            title="Second Article",
            author="Author B",
            pub_date=datetime(2026, 4, 7, 10, 0),
            categories=json.dumps(["Culture Calendar"]),
            link="http://example.com/2",
            content_html="<p>Content two.</p>",
        )
        db.session.add_all([first, second])
        # Flush so first.id is populated before the Image row references it.
        db.session.flush()

        db.session.add(
            Image(
                article_id=first.id,
                original_url="https://example.com/photo.jpg",
                local_path=img_path,
                width=800,
                height=450,
            )
        )
        db.session.commit()

        epub_path = build_epub(
            week_start=date(2026, 4, 6),
            week_end=date(2026, 4, 12),
            article_ids=[first.id, second.id],
            cover_path=cover_path,
            output_dir=str(tmp_path / "issues"),
        )

        assert os.path.exists(epub_path)
        assert epub_path.endswith(".epub")
        assert os.path.getsize(epub_path) > 0


def test_build_epub_respects_article_order(app, db, tmp_path):
    """The earlier-dated article's chapter comes first in the ePub spine.

    The article with the later ``pub_date`` is passed first in
    ``article_ids``; the ``<h1>`` headings read back from the generated
    book must still list "Earlier Article" before "Later Article".
    """
    with app.app_context():
        a1 = Article(
            guid="g1", title="Later Article", author="A",
            pub_date=datetime(2026, 4, 8, 10, 0),
            categories="[]", link="http://a", content_html="<p>Later</p>",
        )
        a2 = Article(
            guid="g2", title="Earlier Article", author="B",
            pub_date=datetime(2026, 4, 6, 10, 0),
            categories="[]", link="http://b", content_html="<p>Earlier</p>",
        )
        db.session.add_all([a1, a2])
        db.session.commit()

        cover_path = str(tmp_path / "cover.jpg")
        PILImage.new("RGB", (800, 480)).save(cover_path, format="JPEG")

        epub_path = build_epub(
            week_start=date(2026, 4, 6),
            week_end=date(2026, 4, 12),
            article_ids=[a1.id, a2.id],
            cover_path=cover_path,
            output_dir=str(tmp_path / "issues"),
        )

        from ebooklib import epub as epublib
        book = epublib.read_epub(epub_path)

        # Collect the first <h1> text of every spine document except the nav.
        titles = []
        for item_id, _ in book.spine:
            if item_id == "nav":
                continue
            item = book.get_item_with_id(item_id)
            if not item:
                continue
            # Fetch the content once instead of once per check.
            raw = item.get_content()
            if b"<h1>" not in raw:
                continue
            content = raw.decode("utf-8")
            start = content.index("<h1>") + 4
            end = content.index("</h1>")
            titles.append(content[start:end])

        # Slice comparison: a missing chapter fails with a readable assertion
        # diff rather than an IndexError; the pass condition is unchanged.
        assert titles[:2] == ["Earlier Article", "Later Article"]


def test_cleanup_removes_empty_paragraphs():
    """Only the two paragraphs carrying text remain after cleanup."""
    cleaned = _cleanup_html(
        '<p>Real content.</p><p></p><p>&nbsp;</p><p> </p><p><br></p><p>More.</p>'
    )
    for kept in ("<p>Real content.</p>", "<p>More.</p>"):
        assert kept in cleaned
    # Empty, whitespace-only, &nbsp;-only and <br>-only paragraphs are gone.
    assert cleaned.count("<p") == 2


def test_cleanup_removes_br_near_images():
    """Cleanup leaves no <br> in the output and keeps the image tag."""
    cleaned = _cleanup_html(
        '<p>Text</p><br><img src="test.jpg"><br/><p>More</p>'
    )
    assert "<br" not in cleaned
    assert '<img src="test.jpg"' in cleaned


def test_cleanup_collapses_image_wrappers():
    """The image survives; a <figure> may remain only with a <figcaption>."""
    cleaned = _cleanup_html('<div><figure><img src="test.jpg"></figure></div>')
    assert '<img src="test.jpg"' in cleaned
    figure_left = "<figure" in cleaned
    caption_present = "<figcaption" in cleaned
    # Fails only when a figure wrapper is left behind without any caption.
    assert not (figure_left and not caption_present)


def test_cleanup_preserves_figcaption():
    """Both the image tag and the caption text are still present."""
    cleaned = _cleanup_html(
        '<figure><img src="test.jpg"><figcaption>Caption</figcaption></figure>'
    )
    for expected in ('<img src="test.jpg"', "Caption"):
        assert expected in cleaned


def test_cleanup_removes_empty_divs():
    """No <div> opening tag is left once empty divs are cleaned up."""
    cleaned = _cleanup_html('<p>Content</p><div>  </div><div></div><p>More</p>')
    assert "<div" not in cleaned
feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00			`import json`
			`import os`
			`from datetime import datetime, date`
			`from PIL import Image as PILImage`

			`from src.models import Article, Image`
fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers Made-with: Cursor 2026-04-06 17:04:40 -04:00			`from src.epub_builder import _cleanup_html, build_epub`
feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00

			`def _create_test_image(path):`
			`os.makedirs(os.path.dirname(path), exist_ok=True)`
			`img = PILImage.new("RGB", (800, 450), color="green")`
			`img.save(path, format="JPEG")`


			`def test_build_epub_creates_file(app, db, tmp_path):`
			`with app.app_context():`
			`img_path = str(tmp_path / "images" / "abc123.jpg")`
			`_create_test_image(img_path)`

			`a1 = Article(`
			`guid="g1", title="First Article", author="Author A",`
			`pub_date=datetime(2026, 4, 6, 10, 0),`
			`categories=json.dumps(["Government"]),`
			`link="http://example.com/1",`
			`content_html=f'<p>Content one.</p><img src="{img_path}" />',`
			`)`
			`a2 = Article(`
			`guid="g2", title="Second Article", author="Author B",`
			`pub_date=datetime(2026, 4, 7, 10, 0),`
			`categories=json.dumps(["Culture Calendar"]),`
			`link="http://example.com/2",`
			`content_html="<p>Content two.</p>",`
			`)`
			`db.session.add_all([a1, a2])`
			`db.session.flush()`

			`img_record = Image(`
			`article_id=a1.id, original_url="https://example.com/photo.jpg",`
			`local_path=img_path, width=800, height=450,`
			`)`
			`db.session.add(img_record)`
			`db.session.commit()`

			`cover_img = PILImage.new("RGB", (800, 480), color="white")`
			`cover_path = str(tmp_path / "cover.jpg")`
			`cover_img.save(cover_path, format="JPEG")`

			`output_dir = str(tmp_path / "issues")`
			`epub_path = build_epub(`
			`week_start=date(2026, 4, 6),`
			`week_end=date(2026, 4, 12),`
			`article_ids=[a1.id, a2.id],`
			`cover_path=cover_path,`
			`output_dir=output_dir,`
			`)`

			`assert os.path.exists(epub_path)`
			`assert epub_path.endswith(".epub")`
			`assert os.path.getsize(epub_path) > 0`


			`def test_build_epub_respects_article_order(app, db, tmp_path):`
			`with app.app_context():`
			`a1 = Article(`
			`guid="g1", title="Later Article", author="A",`
			`pub_date=datetime(2026, 4, 8, 10, 0),`
			`categories="[]", link="http://a", content_html="<p>Later</p>",`
			`)`
			`a2 = Article(`
			`guid="g2", title="Earlier Article", author="B",`
			`pub_date=datetime(2026, 4, 6, 10, 0),`
			`categories="[]", link="http://b", content_html="<p>Earlier</p>",`
			`)`
			`db.session.add_all([a1, a2])`
			`db.session.commit()`

			`cover_path = str(tmp_path / "cover.jpg")`
			`PILImage.new("RGB", (800, 480)).save(cover_path, format="JPEG")`

			`epub_path = build_epub(`
			`week_start=date(2026, 4, 6),`
			`week_end=date(2026, 4, 12),`
			`article_ids=[a1.id, a2.id],`
			`cover_path=cover_path,`
			`output_dir=str(tmp_path / "issues"),`
			`)`

			`from ebooklib import epub as epublib`
			`book = epublib.read_epub(epub_path)`
			`spine_items = [book.get_item_with_id(item_id)`
			`for item_id, _ in book.spine if item_id != "nav"]`
			`titles = []`
			`for item in spine_items:`
			`if item and b"<h1>" in item.get_content():`
			`content = item.get_content().decode("utf-8")`
			`start = content.index("<h1>") + 4`
			`end = content.index("</h1>")`
			`titles.append(content[start:end])`

			`assert titles[0] == "Earlier Article"`
			`assert titles[1] == "Later Article"`
fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers Made-with: Cursor 2026-04-06 17:04:40 -04:00

			`def test_cleanup_removes_empty_paragraphs():`
			`html = '<p>Real content.</p><p></p><p>&nbsp;</p><p> </p><p><br></p><p>More.</p>'`
			`result = _cleanup_html(html)`
			`assert "<p>Real content.</p>" in result`
			`assert "<p>More.</p>" in result`
			`assert result.count("<p") == 2`


			`def test_cleanup_removes_br_near_images():`
			`html = '<p>Text</p><br><img src="test.jpg"><br/><p>More</p>'`
			`result = _cleanup_html(html)`
			`assert "<br" not in result`
			`assert '<img src="test.jpg"' in result`


			`def test_cleanup_collapses_image_wrappers():`
			`html = '<div><figure><img src="test.jpg"></figure></div>'`
			`result = _cleanup_html(html)`
			`assert '<img src="test.jpg"' in result`
			`assert "<figure" not in result or "<figcaption" in result`


			`def test_cleanup_preserves_figcaption():`
			`html = '<figure><img src="test.jpg"><figcaption>Caption</figcaption></figure>'`
			`result = _cleanup_html(html)`
			`assert '<img src="test.jpg"' in result`
			`assert "Caption" in result`


			`def test_cleanup_removes_empty_divs():`
			`html = '<p>Content</p><div> </div><div></div><p>More</p>'`
			`result = _cleanup_html(html)`
			`assert result.count("<div") == 0`