src/epub_builder.py

import json
import mimetypes
import os
import re
from datetime import date
from html import escape, unescape

from bs4 import BeautifulSoup
from ebooklib import epub

from src.models import Article, Image

# Stylesheet shipped inside every generated book; build_epub() packages it as
# style/default.css. Margins and padding on images and their usual wrappers
# (figure, .wp-block-*) are kept near zero so pictures sit close to the text.
EPUB_CSS = """
body { font-family: serif; margin: 1em; line-height: 1.5; }
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
    margin: 0; padding: 0;
}
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
"""


def _cleanup_html(html: str) -> str:
    """Return *html* with empty markup removed and image wrappers flattened.

    Three passes run over the parsed fragment, in this order:

    1. ``<p>`` elements with no text and no element children other than
       ``<br>`` are dropped.
    2. A ``<br>`` that is the immediate previous or next sibling of an
       ``<img>`` is dropped.
    3. For ``<figure>`` and then ``<div>``: a wrapper without any image and
       without text is dropped; a wrapper that holds an image but no
       ``<figcaption>`` and no other child element carrying text is unwrapped,
       leaving its children in place.
    """
    blank_texts = ("", "\xa0")
    tree = BeautifulSoup(html, "html.parser")

    # Pass 1: empty paragraphs.
    for paragraph in tree.find_all("p"):
        if paragraph.get_text(strip=True) not in blank_texts:
            continue
        keeps_an_element = any(
            child.name and child.name != "br" for child in paragraph.children
        )
        if not keeps_an_element:
            paragraph.decompose()

    # Pass 2: line breaks hugging an image. Both neighbours are looked up
    # before either one is removed.
    for image in tree.find_all("img"):
        for neighbour in (image.previous_sibling, image.next_sibling):
            if neighbour and getattr(neighbour, "name", None) == "br":
                neighbour.decompose()

    # Pass 3: wrappers, figures first and divs second.
    for wrapper_tag in ("figure", "div"):
        for box in tree.find_all(wrapper_tag):
            if not box.find_all("img"):
                # No picture inside: only worth keeping when it carries text.
                if box.get_text(strip=True) in blank_texts:
                    box.decompose()
                continue

            # A caption means the wrapper is meaningful; leave it alone.
            if box.find("figcaption"):
                continue

            carries_other_content = any(
                child.name
                and child.name not in ("img", "figcaption", "br")
                and child.get_text(strip=True)
                for child in box.children
            )
            if not carries_other_content:
                box.unwrap()

    return str(tree)


def _escape_text(value: str) -> str:
    """Return *value* safe to interpolate into chapter markup as text."""
    # Unescape first so a value that already carries entities is not escaped twice.
    # NOTE(review): assumes title/author/categories are text, not intentional
    # inline HTML -- confirm against whatever populates Article.
    return escape(unescape(value))


def _image_target(local_path: str, index: int) -> tuple[str, str]:
    """Return ``(file name inside the book, media type)`` for one image.

    The type is guessed from the extension of *local_path*. Anything that is
    not recognisably an image falls back to ``.jpg`` / ``image/jpeg``.
    """
    media_type, _ = mimetypes.guess_type(local_path)
    extension = os.path.splitext(local_path)[1].lower()
    if not extension or not media_type or not media_type.startswith("image/"):
        return f"images/img_{index}.jpg", "image/jpeg"
    return f"images/img_{index}{extension}", media_type


def _epub_filename(week_start: date) -> str:
    """Return the output file name for the week containing *week_start*."""
    # The ISO week number must be paired with the ISO year, not the calendar
    # year: 2024-12-30 belongs to ISO week 2025-W01, and naming it "2024-W01"
    # would collide with the week of 2024-01-01.
    iso_year, iso_week, _ = week_start.isocalendar()
    return f"plymouth-independent-{iso_year}-W{iso_week:02d}.epub"


def build_epub(
    week_start: date,
    week_end: date,
    article_ids: list[int],
    cover_path: str,
    output_dir: str,
) -> str:
    """Build the weekly ePub and return the path of the written file.

    Each selected article becomes one chapter (title, byline, optional
    categories, cleaned body), ordered by publication date ascending. Images
    recorded for an article are packaged into the book and references to their
    local path in the body are rewritten to the packaged name.

    Args:
        week_start: First day of the week; used in the title, the book
            identifier and the output file name.
        week_end: Last day of the week; used in the title.
        article_ids: Ids of the articles to include.
        cover_path: Path of the cover image, stored in the book as
            ``cover.jpg``.
        output_dir: Directory for the result; created if it does not exist.

    Returns:
        Path of the ``.epub`` file, named
        ``plymouth-independent-<ISO year>-W<ISO week>.epub``.

    Raises:
        OSError: If the cover or an image file cannot be read.
    """
    os.makedirs(output_dir, exist_ok=True)

    articles = (
        Article.query
        .filter(Article.id.in_(article_ids))
        .order_by(Article.pub_date.asc())
        .all()
    )

    title = (
        f"Plymouth Independent \u2014 "
        f"Week of {week_start.strftime('%b %d')}\u2013{week_end.strftime('%b %d, %Y')}"
    )

    book = epub.EpubBook()
    book.set_identifier(f"pi-{week_start.isoformat()}")
    book.set_title(title)
    book.set_language("en")
    book.add_author("Plymouth Independent")

    with open(cover_path, "rb") as f:
        book.set_cover("cover.jpg", f.read())

    style = epub.EpubItem(
        uid="style", file_name="style/default.css",
        media_type="text/css", content=EPUB_CSS.encode("utf-8"),
    )
    book.add_item(style)

    chapters = []
    # Counts across all articles so packaged image names are unique book-wide.
    image_counter = 0

    for article in articles:
        # A missing or empty categories value means "no categories".
        categories = json.loads(article.categories) if article.categories else []
        cat_str = ", ".join(categories) if categories else ""

        # Leave the author out of the byline when there is none, rather than
        # printing the text "None".
        byline_parts = []
        if article.author:
            byline_parts.append(_escape_text(article.author))
        byline_parts.append(article.pub_date.strftime("%B %d, %Y"))
        byline = " \u00b7 ".join(byline_parts)

        chapter_html = f"<h1>{_escape_text(article.title or '')}</h1>\n"
        chapter_html += f'<p class="byline">{byline}</p>\n'
        if cat_str:
            chapter_html += f'<p class="categories">{_escape_text(cat_str)}</p>\n'

        content = _cleanup_html(article.content_html or "")

        for img_record in Image.query.filter_by(article_id=article.id).all():
            local_path = img_record.local_path
            # Skip records with no path or whose file is not on disk.
            if not local_path or not os.path.exists(local_path):
                continue

            image_counter += 1
            epub_img_name, media_type = _image_target(local_path, image_counter)

            with open(local_path, "rb") as f:
                img_data = f.read()

            book.add_item(
                epub.EpubItem(
                    uid=f"img_{image_counter}",
                    file_name=epub_img_name,
                    media_type=media_type,
                    content=img_data,
                )
            )
            # Point references to the on-disk file at the packaged copy.
            content = content.replace(local_path, epub_img_name)

        chapter_html += content

        chapter = epub.EpubHtml(
            title=article.title,
            file_name=f"chapter_{article.id}.xhtml",
            lang="en",
        )
        chapter.set_content(
            f'<html><head><link rel="stylesheet" href="style/default.css"/>'
            f"</head><body>{chapter_html}</body></html>"
        )
        chapter.add_item(style)
        chapters.append(chapter)
        book.add_item(chapter)

    book.toc = [(c, []) for c in chapters]
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    book.spine = ["nav"] + chapters

    epub_path = os.path.join(output_dir, _epub_filename(week_start))
    epub.write_epub(epub_path, book)

    return epub_path
feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00			`import json`
			`import os`
fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers Made-with: Cursor 2026-04-06 17:04:40 -04:00			`import re`
feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00			`from datetime import date`

fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers Made-with: Cursor 2026-04-06 17:04:40 -04:00			`from bs4 import BeautifulSoup`
feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00			`from ebooklib import epub`

			`from src.models import Article, Image`

			`EPUB_CSS = """`
			`body { font-family: serif; margin: 1em; line-height: 1.5; }`
			`h1 { font-size: 1.4em; margin-bottom: 0.3em; }`
			`.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }`
			`.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }`
fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers Made-with: Cursor 2026-04-06 17:04:40 -04:00			`img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }`
			`figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {`
			`margin: 0; padding: 0;`
			`}`
			`figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }`
feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00			`"""`


fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers Made-with: Cursor 2026-04-06 17:04:40 -04:00			`def _cleanup_html(html: str) -> str:`
			`"""Clean up WordPress HTML for ePub: remove empty elements, tighten image spacing."""`
			`soup = BeautifulSoup(html, "html.parser")`

			`for p in soup.find_all("p"):`
			`text = p.get_text(strip=True)`
			`if not text or text == "\xa0":`
			`children = [c for c in p.children if c.name and c.name != "br"]`
			`if not children:`
			`p.decompose()`

			`for img in soup.find_all("img"):`
			`for sibling in [img.previous_sibling, img.next_sibling]:`
			`if sibling and getattr(sibling, "name", None) == "br":`
			`sibling.decompose()`

			`for tag_name in ["figure", "div"]:`
			`for wrapper in soup.find_all(tag_name):`
			`imgs = wrapper.find_all("img")`
			`if not imgs:`
			`text = wrapper.get_text(strip=True)`
			`if not text or text == "\xa0":`
			`wrapper.decompose()`
			`continue`

			`figcaption = wrapper.find("figcaption")`
			`other_content = [`
			`c for c in wrapper.children`
			`if c.name and c.name not in ("img", "figcaption", "br")`
			`and c.get_text(strip=True)`
			`]`
			`if not other_content and not figcaption:`
			`wrapper.unwrap()`

			`return str(soup)`


feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00			`def build_epub(`
			`week_start: date,`
			`week_end: date,`
			`article_ids: list[int],`
			`cover_path: str,`
			`output_dir: str,`
			`) -> str:`
			`os.makedirs(output_dir, exist_ok=True)`

			`articles = (`
			`Article.query`
			`.filter(Article.id.in_(article_ids))`
			`.order_by(Article.pub_date.asc())`
			`.all()`
			`)`

			`title = (`
			`f"Plymouth Independent \u2014 "`
			`f"Week of {week_start.strftime('%b %d')}\u2013{week_end.strftime('%b %d, %Y')}"`
			`)`

			`book = epub.EpubBook()`
			`book.set_identifier(f"pi-{week_start.isoformat()}")`
			`book.set_title(title)`
			`book.set_language("en")`
			`book.add_author("Plymouth Independent")`

			`with open(cover_path, "rb") as f:`
			`book.set_cover("cover.jpg", f.read())`

			`style = epub.EpubItem(`
			`uid="style", file_name="style/default.css",`
			`media_type="text/css", content=EPUB_CSS.encode("utf-8"),`
			`)`
			`book.add_item(style)`

			`chapters = []`
			`image_counter = 0`

			`for article in articles:`
			`categories = json.loads(article.categories)`
			`cat_str = ", ".join(categories) if categories else ""`

			`chapter_html = f"<h1>{article.title}</h1>\n"`
			`chapter_html += (`
			`f'<p class="byline">{article.author} \u00b7 '`
			`f'{article.pub_date.strftime("%B %d, %Y")}</p>\n'`
			`)`
			`if cat_str:`
			`chapter_html += f'<p class="categories">{cat_str}</p>\n'`

			`content = article.content_html`
fix: tighten ePub image spacing — CSS margins, HTML cleanup for empty tags and wrappers Made-with: Cursor 2026-04-06 17:04:40 -04:00			`content = _cleanup_html(content)`
feat: ePub builder with chapters, images, TOC, cover Made-with: Cursor 2026-04-06 15:17:21 -04:00			`article_images = Image.query.filter_by(article_id=article.id).all()`

			`for img_record in article_images:`
			`if not os.path.exists(img_record.local_path):`
			`continue`

			`image_counter += 1`
			`epub_img_name = f"images/img_{image_counter}.jpg"`

			`with open(img_record.local_path, "rb") as f:`
			`img_data = f.read()`

			`epub_img = epub.EpubItem(`
			`uid=f"img_{image_counter}",`
			`file_name=epub_img_name,`
			`media_type="image/jpeg",`
			`content=img_data,`
			`)`
			`book.add_item(epub_img)`
			`content = content.replace(img_record.local_path, epub_img_name)`

			`chapter_html += content`

			`chapter = epub.EpubHtml(`
			`title=article.title,`
			`file_name=f"chapter_{article.id}.xhtml",`
			`lang="en",`
			`)`
			`chapter.set_content(`
			`f'<html><head><link rel="stylesheet" href="style/default.css"/>'`
			`f"</head><body>{chapter_html}</body></html>"`
			`)`
			`chapter.add_item(style)`
			`chapters.append(chapter)`
			`book.add_item(chapter)`

			`book.toc = [(c, []) for c in chapters]`
			`book.add_item(epub.EpubNcx())`
			`book.add_item(epub.EpubNav())`

			`book.spine = ["nav"] + chapters`

			`iso_week = week_start.isocalendar()[1]`
			`filename = f"plymouth-independent-{week_start.year}-W{iso_week:02d}.epub"`
			`epub_path = os.path.join(output_dir, filename)`
			`epub.write_epub(epub_path, book)`

			`return epub_path`