# pi-weekly-newspaper/src/epub_builder.py

import json
import os
import re
from datetime import date
from bs4 import BeautifulSoup
from ebooklib import epub
from src.models import Article, Image
# Stylesheet text for the generated book. Body text is serif; headings,
# byline and category lines are scaled down; figure and WordPress block
# wrappers get zero margin/padding so images carry only their own
# small vertical margin.
EPUB_CSS = """
body { font-family: serif; margin: 1em; line-height: 1.5; }
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
margin: 0; padding: 0;
}
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
"""
def _cleanup_html(html: str) -> str:
    """Normalize WordPress markup before it is placed in an ePub chapter.

    Three passes run over the parsed document, in this order:

    1. Remove ``<p>`` elements that have no text and no child tag other
       than ``<br>``.
    2. Remove a ``<br>`` that is the immediate sibling (before or after)
       of an ``<img>``.
    3. For every ``<figure>`` and ``<div>``: delete it when it holds
       neither an image nor text; unwrap it (keep the children, drop the
       wrapper) when it holds images but no ``<figcaption>`` and no other
       child tag with text.

    Returns the modified document serialized back to a string.
    """

    def is_blank(text):
        # Empty after stripping, or a lone non-breaking space.
        return not text or text == "\xa0"

    doc = BeautifulSoup(html, "html.parser")

    # Pass 1: empty paragraphs.
    for para in doc.find_all("p"):
        if not is_blank(para.get_text(strip=True)):
            continue
        # Text nodes have no tag name, so only real child tags count here.
        holds_element = any(
            child.name and child.name != "br" for child in para.children
        )
        if not holds_element:
            para.decompose()

    # Pass 2: line breaks hugging an image.
    for image in doc.find_all("img"):
        # Both neighbours are captured before either one is removed.
        for neighbour in (image.previous_sibling, image.next_sibling):
            if neighbour and getattr(neighbour, "name", None) == "br":
                neighbour.decompose()

    # Pass 3: figure/div wrappers.
    for wrapper_tag in ("figure", "div"):
        for box in doc.find_all(wrapper_tag):
            if not box.find_all("img"):
                # No image inside: only discard the wrapper if it is also textless.
                if is_blank(box.get_text(strip=True)):
                    box.decompose()
                continue

            # A captioned wrapper is kept so the caption stays attached.
            if box.find("figcaption") is not None:
                continue

            holds_other_content = any(
                child.name
                and child.name not in ("img", "figcaption", "br")
                and child.get_text(strip=True)
                for child in box.children
            )
            if not holds_other_content:
                box.unwrap()

    return str(doc)
def build_epub(
    week_start: date,
    week_end: date,
    article_ids: list[int],
    cover_path: str,
    output_dir: str,
) -> str:
    """Assemble the weekly ePub edition and write it into *output_dir*.

    One chapter is produced per selected article, ordered by publication
    date. Each chapter consists of a title heading, a byline, an optional
    category line and the article body after ``_cleanup_html``. Images
    recorded for an article whose ``local_path`` exists on disk are
    embedded, and occurrences of that path in the body are rewritten to
    the embedded file name.

    Args:
        week_start: First day of the edition's week. Drives the book
            identifier, the title and the output file name.
        week_end: Last day of the week; used only in the title.
        article_ids: Primary keys of the ``Article`` rows to include.
        cover_path: Path of the cover image, embedded as ``cover.jpg``.
        output_dir: Destination directory; created when missing.

    Returns:
        Path of the written ``.epub`` file.

    Raises:
        OSError: If the output directory cannot be created, or the cover
            or an image file cannot be read.
    """
    os.makedirs(output_dir, exist_ok=True)

    articles = (
        Article.query
        .filter(Article.id.in_(article_ids))
        .order_by(Article.pub_date.asc())
        .all()
    )

    title = (
        f"Plymouth Independent \u2014 "
        f"Week of {week_start.strftime('%b %d')}\u2013{week_end.strftime('%b %d, %Y')}"
    )

    book = epub.EpubBook()
    book.set_identifier(f"pi-{week_start.isoformat()}")
    book.set_title(title)
    book.set_language("en")
    book.add_author("Plymouth Independent")

    with open(cover_path, "rb") as f:
        book.set_cover("cover.jpg", f.read())

    style = epub.EpubItem(
        uid="style", file_name="style/default.css",
        media_type="text/css", content=EPUB_CSS.encode("utf-8"),
    )
    book.add_item(style)

    chapters = []
    # Images are numbered across the whole book so names never collide
    # between chapters.
    image_counter = 0

    for article in articles:
        # A missing or empty categories column means "no categories"
        # rather than a JSON parse error.
        categories = json.loads(article.categories) if article.categories else []
        cat_str = ", ".join(categories) if categories else ""

        # NOTE(review): title and author are interpolated unescaped;
        # presumably they are stored as HTML-safe text - confirm against
        # the ingest code before adding escaping (risk of double-escaping).
        chapter_html = f"<h1>{article.title}</h1>\n"
        chapter_html += (
            f'<p class="byline">{article.author} \u00b7 '
            f'{article.pub_date.strftime("%B %d, %Y")}</p>\n'
        )
        if cat_str:
            chapter_html += f'<p class="categories">{cat_str}</p>\n'

        # An article without a stored body still gets its header-only chapter.
        content = _cleanup_html(article.content_html or "")

        article_images = Image.query.filter_by(article_id=article.id).all()
        for img_record in article_images:
            # Skip records whose file is not on disk.
            if not os.path.exists(img_record.local_path):
                continue

            image_counter += 1
            # NOTE(review): every image is registered as JPEG regardless of
            # the file's real format; presumably the downloader normalizes
            # to JPEG - confirm.
            epub_img_name = f"images/img_{image_counter}.jpg"
            with open(img_record.local_path, "rb") as f:
                img_data = f.read()

            epub_img = epub.EpubItem(
                uid=f"img_{image_counter}",
                file_name=epub_img_name,
                media_type="image/jpeg",
                content=img_data,
            )
            book.add_item(epub_img)
            # Point the body's references at the embedded copy.
            content = content.replace(img_record.local_path, epub_img_name)

        chapter_html += content

        chapter = epub.EpubHtml(
            title=article.title,
            file_name=f"chapter_{article.id}.xhtml",
            lang="en",
        )
        chapter.set_content(
            f'<html><head><link rel="stylesheet" href="style/default.css"/>'
            f"</head><body>{chapter_html}</body></html>"
        )
        chapter.add_item(style)
        chapters.append(chapter)
        book.add_item(chapter)

    book.toc = [(c, []) for c in chapters]
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    # The ISO week number must be paired with the ISO year, not the
    # calendar year: 2024-12-30 is ISO 2025-W01, and naming it 2024-W01
    # would collide with the real first week of 2024.
    iso_year, iso_week, _ = week_start.isocalendar()
    filename = f"plymouth-independent-{iso_year}-W{iso_week:02d}.epub"
    epub_path = os.path.join(output_dir, filename)
    epub.write_epub(epub_path, book)
    return epub_path