Files
pi-weekly-newspaper/src/epub_builder.py

185 lines
5.9 KiB
Python
Raw Normal View History

import json
import os
import re
from datetime import date, datetime
from bs4 import BeautifulSoup
from ebooklib import epub
from src.models import Article, Image
# Stylesheet embedded into every chapter as style/default.css (see build_epub):
# serif body text, muted byline/category lines, full-width images with tight
# margins, and WordPress block wrappers (figure / .wp-block-*) flattened to
# zero margin so pictures sit close to the surrounding text.
EPUB_CSS = """
body { font-family: serif; margin: 1em; line-height: 1.5; }
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
margin: 0; padding: 0;
}
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
"""
def _cleanup_html(html: str) -> str:
    """Tidy WordPress markup before embedding it in an ePub chapter.

    Three passes over the parsed document:
    1. delete paragraphs with no visible text (allowing only <br> children),
    2. strip <br> tags sitting directly before or after an <img>,
    3. delete empty <figure>/<div> wrappers, and unwrap wrappers whose only
       meaningful content is an image (no caption, no other text), so the
       image is not boxed in extra spacing.

    Returns the cleaned HTML as a string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: paragraphs that render as blank (empty or a lone &nbsp;).
    for paragraph in soup.find_all("p"):
        visible = paragraph.get_text(strip=True)
        if visible and visible != "\xa0":
            continue
        non_br_children = [
            child for child in paragraph.children
            if child.name and child.name != "br"
        ]
        if not non_br_children:
            paragraph.decompose()

    # Pass 2: line breaks hugging an image add vertical gaps — drop them.
    for image in soup.find_all("img"):
        for neighbour in (image.previous_sibling, image.next_sibling):
            if neighbour is not None and getattr(neighbour, "name", None) == "br":
                neighbour.decompose()

    # Pass 3: figure/div wrappers.
    for wrapper_tag in ("figure", "div"):
        for wrapper in soup.find_all(wrapper_tag):
            images = wrapper.find_all("img")
            if not images:
                # No image inside: remove the wrapper only if it is blank.
                inner_text = wrapper.get_text(strip=True)
                if not inner_text or inner_text == "\xa0":
                    wrapper.decompose()
                continue
            caption = wrapper.find("figcaption")
            extra_content = [
                child for child in wrapper.children
                if child.name
                and child.name not in ("img", "figcaption", "br")
                and child.get_text(strip=True)
            ]
            # Image-only wrapper (no caption, no text): lift contents out.
            if not caption and not extra_content:
                wrapper.unwrap()

    return str(soup)
def _issue_title(
    week_start: date,
    week_end: date,
    issue_type: str,
    articles: list,
) -> str:
    """Human-readable book title for the given issue type.

    Uses the single article's own title for "single_article" issues (when
    exactly one article is present), an ISO-week range for "multi_week",
    and a "Week of ..." form otherwise.
    """
    if issue_type == "single_article" and len(articles) == 1:
        return f"Plymouth Independent \u2014 {articles[0].title}"
    if issue_type == "multi_week":
        w1 = week_start.isocalendar()[1]
        w2 = week_end.isocalendar()[1]
        return (
            f"Plymouth Independent \u2014 "
            f"Weeks {w1}\u2013{w2}, {week_start.strftime('%b %d')}\u2013{week_end.strftime('%b %d, %Y')}"
        )
    return (
        f"Plymouth Independent \u2014 "
        f"Week of {week_start.strftime('%b %d')}\u2013{week_end.strftime('%b %d, %Y')}"
    )


def _issue_filename(week_start: date, week_end: date, issue_type: str, ts: str) -> str:
    """Output filename embedding the ISO week number(s) and a timestamp."""
    iso_week = week_start.isocalendar()[1]
    if issue_type == "multi_week":
        w2 = week_end.isocalendar()[1]
        return f"plymouth-independent-{week_start.year}-W{iso_week:02d}-W{w2:02d}-{ts}.epub"
    if issue_type == "single_article":
        return f"plymouth-independent-single-{ts}.epub"
    return f"plymouth-independent-{week_start.year}-W{iso_week:02d}-{ts}.epub"


def build_epub(
    week_start: date,
    week_end: date,
    article_ids: list[int],
    cover_path: str,
    output_dir: str,
    issue_type: str = "weekly",
) -> str:
    """Assemble an ePub issue from stored articles and return its file path.

    Args:
        week_start: First day of the issue's date range (titles/filenames).
        week_end: Last day of the issue's date range.
        article_ids: IDs of ``Article`` rows to include; chapters are
            ordered by publication date ascending regardless of input order.
        cover_path: Path to a JPEG cover image, embedded as ``cover.jpg``.
        output_dir: Directory to write the .epub into (created if missing).
        issue_type: "weekly" (default), "multi_week", or "single_article";
            controls the book title and output filename.

    Returns:
        Filesystem path of the written .epub file.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Timestamp makes identifiers/filenames unique across repeated builds.
    ts = datetime.now().strftime("%Y%m%d%H%M%S")

    articles = (
        Article.query
        .filter(Article.id.in_(article_ids))
        .order_by(Article.pub_date.asc())
        .all()
    )

    book = epub.EpubBook()
    book.set_identifier(f"pi-{week_start.isoformat()}-{issue_type}-{ts}")
    book.set_title(_issue_title(week_start, week_end, issue_type, articles))
    book.set_language("en")
    book.add_author("Plymouth Independent")

    with open(cover_path, "rb") as f:
        book.set_cover("cover.jpg", f.read())
    # Make the auto-generated cover page part of the linear reading order.
    for item in book.get_items():
        if item.get_name() == "cover.xhtml":
            item.is_linear = True
            break

    style = epub.EpubItem(
        uid="style", file_name="style/default.css",
        media_type="text/css", content=EPUB_CSS.encode("utf-8"),
    )
    book.add_item(style)

    chapters = []
    image_counter = 0
    for article in articles:
        # categories is stored as a JSON string; tolerate NULL/empty values
        # instead of letting json.loads raise on them.
        categories = json.loads(article.categories) if article.categories else []
        cat_str = ", ".join(categories) if categories else ""

        chapter_html = f"<h1>{article.title}</h1>\n"
        chapter_html += (
            f'<p class="byline">{article.author} \u00b7 '
            f'{article.pub_date.strftime("%B %d, %Y")}</p>\n'
        )
        if cat_str:
            chapter_html += f'<p class="categories">{cat_str}</p>\n'

        content = _cleanup_html(article.content_html)

        # Embed each downloaded image and rewrite content references to the
        # in-book path (content_html is expected to reference local_path).
        article_images = Image.query.filter_by(article_id=article.id).all()
        for img_record in article_images:
            if not os.path.exists(img_record.local_path):
                continue  # download failed or file pruned; skip this image
            image_counter += 1
            epub_img_name = f"images/img_{image_counter}.jpg"
            with open(img_record.local_path, "rb") as f:
                img_data = f.read()
            book.add_item(epub.EpubItem(
                uid=f"img_{image_counter}",
                file_name=epub_img_name,
                media_type="image/jpeg",
                content=img_data,
            ))
            content = content.replace(img_record.local_path, epub_img_name)
        chapter_html += content

        chapter = epub.EpubHtml(
            title=article.title,
            file_name=f"chapter_{article.id}.xhtml",
            lang="en",
        )
        chapter.set_content(
            f'<html><head><link rel="stylesheet" href="style/default.css"/>'
            f"</head><body>{chapter_html}</body></html>"
        )
        chapter.add_item(style)
        chapters.append(chapter)
        book.add_item(chapter)

    book.toc = [(c, []) for c in chapters]
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["cover", "nav"] + chapters

    epub_path = os.path.join(
        output_dir, _issue_filename(week_start, week_end, issue_type, ts)
    )
    epub.write_epub(epub_path, book)
    return epub_path