import json
import os
import re
from datetime import date
from bs4 import BeautifulSoup
from ebooklib import epub
from src.models import Article, Image
# Stylesheet embedded in every generated ePub chapter. Keeps body text in a
# readable serif, tightens the default WordPress spacing around images and
# their wrappers, and styles the byline/category metadata lines.
EPUB_CSS = """
body { font-family: serif; margin: 1em; line-height: 1.5; }
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
figure, .wp-block-image, .wp-block-cover, .wp-block-media-text {
    margin: 0; padding: 0;
}
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
"""
def _cleanup_html(html: str) -> str:
    """Clean up WordPress HTML for ePub: remove empty elements, tighten image spacing.

    Three passes over the parsed document:
      1. Delete paragraphs that render as blank space.
      2. Delete <br> tags directly adjacent to images.
      3. Delete empty image wrappers, and unwrap wrappers that contribute
         nothing beyond the image itself.

    Args:
        html: Raw article body HTML as captured from WordPress.

    Returns:
        The cleaned HTML serialized back to a string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: drop paragraphs with no visible text (empty, or a lone
    # non-breaking space) and no child elements other than <br>.
    for p in soup.find_all("p"):
        text = p.get_text(strip=True)
        if not text or text == "\xa0":
            children = [c for c in p.children if c.name and c.name != "br"]
            if not children:
                p.decompose()

    # Pass 2: strip <br> tags that pad images with vertical gaps.
    # NOTE(review): only the immediate siblings are inspected — a whitespace
    # text node between the <br> and the <img> would hide the <br>; confirm
    # whether the scraped markup ever has that shape.
    for img in soup.find_all("img"):
        for sibling in [img.previous_sibling, img.next_sibling]:
            if sibling and getattr(sibling, "name", None) == "br":
                sibling.decompose()

    # Pass 3: tidy common WordPress image wrappers (<figure>, <div>).
    for tag_name in ["figure", "div"]:
        for wrapper in soup.find_all(tag_name):
            imgs = wrapper.find_all("img")
            if not imgs:
                # Image-less wrapper: remove it entirely when it is blank.
                text = wrapper.get_text(strip=True)
                if not text or text == "\xa0":
                    wrapper.decompose()
                continue

            # Wrapper holds an image: if there is no caption and no other
            # text-bearing child, unwrap so the <img> sits directly in the
            # document flow (CSS then controls its spacing).
            figcaption = wrapper.find("figcaption")
            other_content = [
                c for c in wrapper.children
                if c.name and c.name not in ("img", "figcaption", "br")
                and c.get_text(strip=True)
            ]
            if not other_content and not figcaption:
                wrapper.unwrap()

    return str(soup)
def build_epub(
    week_start: date,
    week_end: date,
    article_ids: list[int],
    cover_path: str,
    output_dir: str,
) -> str:
    """Assemble the selected articles into a weekly ePub file.

    Args:
        week_start: First day of the issue's week; used in the title,
            identifier, and output filename.
        week_end: Last day of the issue's week; used in the title.
        article_ids: IDs of the Article rows to include. Articles are
            ordered by publication date, not by the order of this list.
        cover_path: Path to the cover image (embedded as cover.jpg).
        output_dir: Directory the .epub is written into (created if needed).

    Returns:
        Filesystem path of the written .epub file.
    """
    os.makedirs(output_dir, exist_ok=True)

    articles = (
        Article.query
        .filter(Article.id.in_(article_ids))
        .order_by(Article.pub_date.asc())
        .all()
    )

    # Em dash between masthead and date range, en dash within the range.
    title = (
        f"Plymouth Independent \u2014 "
        f"Week of {week_start.strftime('%b %d')}\u2013{week_end.strftime('%b %d, %Y')}"
    )

    book = epub.EpubBook()
    book.set_identifier(f"pi-{week_start.isoformat()}")
    book.set_title(title)
    book.set_language("en")
    book.add_author("Plymouth Independent")

    with open(cover_path, "rb") as f:
        book.set_cover("cover.jpg", f.read())

    style = epub.EpubItem(
        uid="style", file_name="style/default.css",
        media_type="text/css", content=EPUB_CSS.encode("utf-8"),
    )
    book.add_item(style)

    chapters = []
    image_counter = 0  # global counter so image filenames are unique book-wide

    for article in articles:
        # Article.categories is stored as a JSON-encoded list of strings.
        categories = json.loads(article.categories)
        cat_str = ", ".join(categories) if categories else ""

        # Header: title, byline (author · date), optional category line.
        chapter_html = f"<h1>{article.title}</h1>\n"
        chapter_html += (
            f'<p class="byline">{article.author} \u00b7 '
            f'{article.pub_date.strftime("%B %d, %Y")}</p>\n'
        )
        if cat_str:
            chapter_html += f'<p class="categories">{cat_str}</p>\n'

        content = article.content_html
        content = _cleanup_html(content)
        article_images = Image.query.filter_by(article_id=article.id).all()

        for img_record in article_images:
            # Skip images whose download failed or was cleaned up.
            if not os.path.exists(img_record.local_path):
                continue

            image_counter += 1
            epub_img_name = f"images/img_{image_counter}.jpg"

            with open(img_record.local_path, "rb") as f:
                img_data = f.read()

            # NOTE(review): media type is hard-coded to image/jpeg —
            # confirm the download pipeline normalizes all images to JPEG.
            epub_img = epub.EpubItem(
                uid=f"img_{image_counter}",
                file_name=epub_img_name,
                media_type="image/jpeg",
                content=img_data,
            )
            book.add_item(epub_img)
            # Repoint the <img src> from the local file path to the
            # in-book path. Assumes content_html references local_path
            # verbatim — TODO confirm against the scraper.
            content = content.replace(img_record.local_path, epub_img_name)

        chapter_html += content

        chapter = epub.EpubHtml(
            title=article.title,
            file_name=f"chapter_{article.id}.xhtml",
            lang="en",
        )
        chapter.set_content(
            f'<html><head><link rel="stylesheet" href="style/default.css"/>'
            f"</head><body>{chapter_html}</body></html>"
        )
        chapter.add_item(style)
        chapters.append(chapter)
        book.add_item(chapter)

    # One flat TOC entry per article (no sub-sections).
    book.toc = [(c, []) for c in chapters]
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Reading order: nav page first, then the articles.
    book.spine = ["nav"] + chapters

    # Name the file by ISO week number, e.g. plymouth-independent-2026-W15.epub.
    iso_week = week_start.isocalendar()[1]
    filename = f"plymouth-independent-{week_start.year}-W{iso_week:02d}.epub"
    epub_path = os.path.join(output_dir, filename)
    epub.write_epub(epub_path, book)

    return epub_path