import json
import os
import re
from datetime import date

from bs4 import BeautifulSoup
from ebooklib import epub

from src.models import Article, Image

EPUB_CSS = """
body { font-family: serif; margin: 1em; line-height: 1.5; }
h1 { font-size: 1.4em; margin-bottom: 0.3em; }
.byline { font-size: 0.85em; color: #555; margin-bottom: 0.5em; }
.categories { font-size: 0.8em; color: #777; margin-bottom: 1em; }
img { max-width: 100%; display: block; margin: 0.2em auto; padding: 0; }
figure, .wp-block-image, .wp-block-cover, .wp-block-media-text { margin: 0; padding: 0; }
figcaption { font-size: 0.8em; text-align: center; color: #555; margin: 0.2em 0 0.5em 0; }
"""


def _cleanup_html(html: str) -> str:
    """Clean up WordPress HTML for ePub: remove empty elements, tighten image spacing.

    Three passes are applied to the parsed document, in order:

    1. ``<p>`` elements with no text and no element children other than
       ``<br>`` are removed.
    2. A ``<br>`` immediately before or after an ``<img>`` is removed.
    3. ``<figure>`` / ``<div>`` wrappers: those without images and without
       text are removed; those that contain images but no ``<figcaption>``
       and no other text-bearing child element are unwrapped, leaving
       their children in place.

    Args:
        html: The HTML fragment to clean.

    Returns:
        The cleaned fragment serialized back to a string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: drop paragraphs that are visually empty.
    # get_text(strip=True) already strips non-breaking spaces, so an empty
    # result covers the "&nbsp;-only" paragraph as well.
    for paragraph in soup.find_all("p"):
        if paragraph.get_text(strip=True):
            continue
        has_real_child = any(
            child.name and child.name != "br" for child in paragraph.children
        )
        if not has_real_child:
            paragraph.decompose()

    # Pass 2: a <br> hugging an image only adds vertical whitespace.
    for img in soup.find_all("img"):
        for sibling in (img.previous_sibling, img.next_sibling):
            if getattr(sibling, "name", None) == "br":
                sibling.decompose()

    # Pass 3: remove empty wrappers and flatten image-only wrappers.
    for tag_name in ("figure", "div"):
        for wrapper in soup.find_all(tag_name):
            if not wrapper.find_all("img"):
                if not wrapper.get_text(strip=True):
                    wrapper.decompose()
                continue

            if wrapper.find("figcaption"):
                # Keep the wrapper so the caption stays attached to its image.
                continue

            other_content = [
                child
                for child in wrapper.children
                if child.name
                and child.name not in ("img", "figcaption", "br")
                and child.get_text(strip=True)
            ]
            if not other_content:
                wrapper.unwrap()

    return str(soup)


def _issue_title(
    week_start: date, week_end: date, issue_type: str, articles: list
) -> str:
    """Return the book title for an issue of the given type."""
    if issue_type == "single_article" and len(articles) == 1:
        return f"Plymouth Independent \u2014 {articles[0].title}"

    date_range = (
        f"{week_start.strftime('%b %d')}\u2013{week_end.strftime('%b %d, %Y')}"
    )
    if issue_type == "multi_week":
        w1 = week_start.isocalendar()[1]
        w2 = week_end.isocalendar()[1]
        return f"Plymouth Independent \u2014 Weeks {w1}\u2013{w2}, {date_range}"
    return f"Plymouth Independent \u2014 Week of {date_range}"


def _epub_filename(week_start: date) -> str:
    """Return the output file name, keyed by ISO year and ISO week.

    The ISO week number must be paired with the ISO year rather than the
    calendar year: 2024-12-30 belongs to ISO week 1 of 2025, and pairing it
    with ``week_start.year`` would yield the same name as the first week of
    January 2024.
    """
    iso_year, iso_week, _ = week_start.isocalendar()
    return f"plymouth-independent-{iso_year}-W{iso_week:02d}.epub"


def build_epub(
    week_start: date,
    week_end: date,
    article_ids: list[int],
    cover_path: str,
    output_dir: str,
    issue_type: str = "weekly",
) -> str:
    """Build an ePub issue from stored articles and return the file path.

    Articles are loaded by id and ordered by ``pub_date`` ascending; each
    becomes one chapter. Images recorded for an article whose ``local_path``
    exists on disk are embedded, and occurrences of that path in the article
    HTML are rewritten to the embedded file name.

    Args:
        week_start: First day of the issue; drives the identifier, the title
            and the output file name.
        week_end: Last day of the issue; used in the title.
        article_ids: Ids of the ``Article`` rows to include.
        cover_path: Path of the cover image file; read as bytes and stored
            in the book as ``cover.jpg``.
        output_dir: Directory for the generated file; created if missing.
        issue_type: ``"single_article"``, ``"multi_week"`` or anything else
            (treated as a weekly issue). Only affects the title.

    Returns:
        Path of the written ``.epub`` file inside ``output_dir``.

    Raises:
        OSError: If the cover or an image file cannot be read.
    """
    os.makedirs(output_dir, exist_ok=True)

    articles = (
        Article.query
        .filter(Article.id.in_(article_ids))
        .order_by(Article.pub_date.asc())
        .all()
    )

    book = epub.EpubBook()
    book.set_identifier(f"pi-{week_start.isoformat()}")
    book.set_title(_issue_title(week_start, week_end, issue_type, articles))
    book.set_language("en")
    book.add_author("Plymouth Independent")

    with open(cover_path, "rb") as f:
        book.set_cover("cover.jpg", f.read())

    style = epub.EpubItem(
        uid="style",
        file_name="style/default.css",
        media_type="text/css",
        content=EPUB_CSS.encode("utf-8"),
    )
    book.add_item(style)

    chapters = []
    image_counter = 0  # Running across all articles so embedded names stay unique.

    for article in articles:
        # A missing/empty categories column means "no categories", not an error.
        categories = json.loads(article.categories) if article.categories else []
        cat_str = ", ".join(str(c) for c in categories) if categories else ""

        # NOTE(review): the original header markup was lost from the reviewed
        # source (tag text stripped). The heading and the "categories"
        # paragraph below are reconstructed from the EPUB_CSS selectors and
        # the surviving "{cat_str}" placeholder - confirm against history.
        # NOTE(review): EPUB_CSS also styles ".byline", but no byline markup
        # or its source fields survived; restore it from version control.
        # NOTE(review): values are interpolated unescaped, as the surviving
        # placeholder was - confirm titles/categories are markup-safe.
        chapter_html = f"<h1>{article.title}</h1>\n"
        if cat_str:
            chapter_html += f'<p class="categories">{cat_str}</p>\n'

        content = _cleanup_html(article.content_html or "")

        article_images = Image.query.filter_by(article_id=article.id).all()
        for img_record in article_images:
            if not os.path.exists(img_record.local_path):
                continue
            image_counter += 1
            epub_img_name = f"images/img_{image_counter}.jpg"
            with open(img_record.local_path, "rb") as f:
                img_data = f.read()
            epub_img = epub.EpubItem(
                uid=f"img_{image_counter}",
                file_name=epub_img_name,
                media_type="image/jpeg",
                content=img_data,
            )
            book.add_item(epub_img)
            # Point the article markup at the embedded copy.
            content = content.replace(img_record.local_path, epub_img_name)

        chapter_html += content

        chapter = epub.EpubHtml(
            title=article.title,
            file_name=f"chapter_{article.id}.xhtml",
            lang="en",
        )
        # NOTE(review): the document wrapper was also stripped from the
        # reviewed source; this is a minimal reconstruction - confirm.
        chapter.set_content(
            '<html><head><link rel="stylesheet" href="style/default.css" '
            'type="text/css"/></head>'
            f"<body>{chapter_html}</body></html>"
        )
        chapter.add_item(style)
        chapters.append(chapter)
        book.add_item(chapter)

    book.toc = [(c, []) for c in chapters]
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    epub_path = os.path.join(output_dir, _epub_filename(week_start))
    epub.write_epub(epub_path, book)
    return epub_path