feat: RSS fetcher with dedup, image download, HTML rewriting

Made-with: Cursor
This commit is contained in:
cottongin
2026-04-06 15:03:55 -04:00
parent 58fe002c6f
commit 46796b8bf8
2 changed files with 174 additions and 0 deletions

91
src/fetcher.py Normal file
View File

@@ -0,0 +1,91 @@
import json
import logging
from datetime import datetime, timezone
import feedparser
import requests
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
import config
from app import db
from src.models import Article, Image
from src.images import process_image
logger = logging.getLogger(__name__)
def fetch_and_cache_articles() -> dict:
    """Fetch the configured RSS feed and cache any new articles locally.

    Downloads the feed at ``config.FEED_URL``, skips entries already cached
    (deduplicated by GUID), stores each new entry as an ``Article`` row, and
    rewrites the article's ``<img>`` tags to point at locally mirrored copies
    (one ``Image`` row per successfully downloaded image).

    Returns:
        dict: ``{"new": int, "skipped": int, "errors": int, "error": str|None}``
        where ``error`` is set only when the feed itself could not be fetched.
    """
    stats = {"new": 0, "skipped": 0, "errors": 0, "error": None}
    try:
        response = requests.get(config.FEED_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        # Network/HTTP failure: report via stats instead of raising so a
        # scheduling caller keeps running.
        logger.error("Failed to fetch RSS feed: %s", e)
        stats["error"] = str(e)
        return stats

    feed = feedparser.parse(response.text)
    for entry in feed.entries:
        guid = entry.get("id", entry.get("link", ""))
        if not guid:
            # No usable identifier -> cannot deduplicate, so skip entirely.
            continue
        if Article.query.filter_by(guid=guid).first():
            stats["skipped"] += 1
            continue
        try:
            _cache_entry(entry, guid, stats)
            stats["new"] += 1
        except Exception as e:
            # A failure on one entry (bad data, DB error on commit) must not
            # abort the whole run or leave the session dirty for later entries.
            logger.error("Failed to cache entry %s: %s", guid, e)
            db.session.rollback()
            stats["errors"] += 1
    return stats


def _parse_pub_date(entry) -> datetime:
    """Parse the entry's RFC 2822 publication date, defaulting to now (UTC)."""
    try:
        return parsedate_to_datetime(entry.get("published", ""))
    except Exception:
        # Missing or malformed date -> fall back to the fetch time.
        return datetime.now(timezone.utc)


def _extract_content_html(entry) -> str:
    """Return the entry's HTML body, preferring full content over summary."""
    if entry.get("content"):
        return entry.content[0].get("value", "")
    if entry.get("summary"):
        return entry.summary
    return ""


def _localize_images(article, content_html: str, stats: dict) -> str:
    """Mirror remote images referenced by *content_html* and rewrite their srcs.

    Each successfully downloaded image gets an ``Image`` row tied to
    *article*; failures are logged, counted in ``stats["errors"]``, and leave
    the original remote URL in place (best-effort behavior).

    Returns the (possibly rewritten) HTML as a string.
    """
    soup = BeautifulSoup(content_html, "html.parser")
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src")
        if not src or not src.startswith("http"):
            continue  # relative or inline images cannot be mirrored
        try:
            local_path, w, h = process_image(src, config.IMAGES_DIR)
            db.session.add(Image(
                article_id=article.id,
                original_url=src,
                local_path=local_path,
                width=w,
                height=h,
            ))
            img_tag["src"] = local_path
        except Exception as e:
            logger.warning("Failed to process image %s: %s", src, e)
            stats["errors"] += 1
    return str(soup)


def _cache_entry(entry, guid: str, stats: dict) -> None:
    """Persist one feed *entry* as an ``Article`` with localized images."""
    content_html = _extract_content_html(entry)
    article = Article(
        guid=guid,
        title=entry.get("title", "Untitled"),
        author=entry.get("author", "Unknown"),
        pub_date=_parse_pub_date(entry),
        categories=json.dumps([t.term for t in entry.get("tags", [])]),
        link=entry.get("link", ""),
        content_html=content_html,
    )
    db.session.add(article)
    db.session.flush()  # assign article.id before creating Image rows
    article.content_html = _localize_images(article, content_html, stats)
    db.session.commit()