feat: RSS fetcher with dedup, image download, HTML rewriting

Made-with: Cursor
This commit is contained in:
cottongin
2026-04-06 15:03:55 -04:00
parent 58fe002c6f
commit 46796b8bf8
2 changed files with 174 additions and 0 deletions

91
src/fetcher.py Normal file
View File

@@ -0,0 +1,91 @@
import json
import logging
from datetime import datetime, timezone
import feedparser
import requests
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
import config
from app import db
from src.models import Article, Image
from src.images import process_image
logger = logging.getLogger(__name__)
def _parse_entry_date(entry) -> datetime:
    """Parse an entry's RFC-2822 ``published`` date; fall back to now (UTC)."""
    try:
        return parsedate_to_datetime(entry.get("published", ""))
    except Exception:
        # Missing/unparseable date: better a fresh timestamp than a crash.
        return datetime.now(timezone.utc)


def _rewrite_images(article, content_html: str, stats: dict) -> str:
    """Localize remote images referenced by *content_html*.

    For every absolute-http ``<img>``: download/process it, record an
    ``Image`` row attached to *article*, and rewrite the tag's ``src`` to
    the local path. Image failures are logged, counted in
    ``stats["errors"]``, and skipped. Returns the rewritten HTML.
    """
    soup = BeautifulSoup(content_html, "html.parser")
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src")
        # Only absolute http(s) URLs can be fetched; skip data:/relative srcs.
        if not src or not src.startswith("http"):
            continue
        try:
            local_path, w, h = process_image(src, config.IMAGES_DIR)
            db.session.add(
                Image(
                    article_id=article.id,
                    original_url=src,
                    local_path=local_path,
                    width=w,
                    height=h,
                )
            )
            img_tag["src"] = local_path
        except Exception as e:
            # Best-effort: a broken image must not sink the article.
            logger.warning("Failed to process image %s: %s", src, e)
            stats["errors"] += 1
    return str(soup)


def fetch_and_cache_articles() -> dict:
    """Fetch the RSS feed and cache new articles. Returns a stats dict.

    Stats keys:
        new     -- articles inserted this run
        skipped -- entries already cached (matched by guid)
        errors  -- per-entry or per-image failures that were logged and skipped
        error   -- feed-level fetch error message, or None on success
    """
    stats = {"new": 0, "skipped": 0, "errors": 0, "error": None}
    try:
        response = requests.get(config.FEED_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        # Feed-level failure: nothing to process, report and bail out.
        logger.error("Failed to fetch RSS feed: %s", e)
        stats["error"] = str(e)
        return stats
    feed = feedparser.parse(response.text)
    for entry in feed.entries:
        guid = entry.get("id", entry.get("link", ""))
        if not guid:
            continue
        if Article.query.filter_by(guid=guid).first():
            stats["skipped"] += 1
            continue
        try:
            pub_date = _parse_entry_date(entry)
            # Tags may lack a ``term`` attribute; skip those rather than crash.
            categories = [
                t.term for t in entry.get("tags", []) if hasattr(t, "term")
            ]
            content_html = ""
            if entry.get("content"):
                content_html = entry.content[0].get("value", "")
            elif entry.get("summary"):
                content_html = entry.summary
            article = Article(
                guid=guid,
                title=entry.get("title", "Untitled"),
                author=entry.get("author", "Unknown"),
                pub_date=pub_date,
                categories=json.dumps(categories),
                link=entry.get("link", ""),
                content_html=content_html,
            )
            db.session.add(article)
            db.session.flush()  # assign article.id before creating Image rows
            article.content_html = _rewrite_images(article, content_html, stats)
            db.session.commit()
            stats["new"] += 1
        except Exception as e:
            # One malformed entry (or a guid race on commit) must not abort
            # the rest of the feed; roll back this entry and keep going.
            logger.warning("Failed to cache entry %s: %s", guid, e)
            db.session.rollback()
            stats["errors"] += 1
    return stats

83
tests/test_fetcher.py Normal file
View File

@@ -0,0 +1,83 @@
import json
from unittest.mock import patch, MagicMock
from src.fetcher import fetch_and_cache_articles
from src.models import Article, Image
from tests.conftest import SAMPLE_RSS_XML
def _mock_feed_response(xml_content):
mock = MagicMock()
mock.content = xml_content.encode("utf-8")
mock.text = xml_content
mock.status_code = 200
mock.raise_for_status = MagicMock()
return mock
def test_fetch_creates_articles(app, db):
    """Two feed entries should yield two cached Article rows with their titles."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as fake_get, \
                patch("src.fetcher.process_image") as fake_process:
            fake_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            fake_process.return_value = ("/fake/path.jpg", 800, 450)
            stats = fetch_and_cache_articles()
            assert stats["new"] == 2
            assert stats["skipped"] == 0
            cached = Article.query.order_by(Article.pub_date).all()
            assert len(cached) == 2
            assert [a.title for a in cached] == [
                "Test Article One",
                "Test Article Two",
            ]
def test_fetch_deduplicates(app, db):
    """Fetching the same feed twice must skip all previously cached entries."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as fake_get, \
                patch("src.fetcher.process_image") as fake_process:
            fake_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            fake_process.return_value = ("/fake/path.jpg", 800, 450)
            fetch_and_cache_articles()
            second_run = fetch_and_cache_articles()
            assert second_run["new"] == 0
            assert second_run["skipped"] == 2
            assert Article.query.count() == 2
def test_fetch_downloads_images(app, db):
    """The remote <img> in the sample feed should produce exactly one Image row."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as fake_get, \
                patch("src.fetcher.process_image") as fake_process:
            fake_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            fake_process.return_value = ("/fake/path.jpg", 800, 450)
            fetch_and_cache_articles()
            stored = Image.query.all()
            assert len(stored) == 1
            assert stored[0].original_url == "https://example.com/image1.jpg"
def test_fetch_rewrites_image_src(app, db):
    """Cached HTML should reference the local image path, not the remote URL."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as fake_get, \
                patch("src.fetcher.process_image") as fake_process:
            fake_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            fake_process.return_value = ("/fake/path.jpg", 800, 450)
            fetch_and_cache_articles()
            cached = Article.query.filter_by(
                guid="https://example.com/?p=1001"
            ).first()
            assert "https://example.com/image1.jpg" not in cached.content_html
            assert "/fake/path.jpg" in cached.content_html
def test_fetch_handles_feed_error(app, db):
    """A network failure should surface in stats["error"] and cache nothing."""
    with app.app_context():
        with patch("src.fetcher.requests.get", side_effect=Exception("Network error")):
            stats = fetch_and_cache_articles()
            assert stats["error"] is not None
            assert Article.query.count() == 0