import json
import logging
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

import feedparser
import requests
from bs4 import BeautifulSoup

import config
from app import db
from src.models import Article, Image
from src.images import process_image

logger = logging.getLogger(__name__)


def fetch_and_cache_articles() -> dict:
    """Fetch the configured RSS feed and cache any new articles.

    Downloads the feed at ``config.FEED_URL``, skips entries whose GUID is
    already cached, downloads referenced images and rewrites their ``<img>``
    ``src`` attributes to the local copies, and commits each new article
    individually so one bad entry cannot lose the rest of the batch.

    Returns:
        Stats dict with keys:
        - ``new``: number of articles cached this run.
        - ``skipped``: entries already present (deduplicated by GUID).
        - ``errors``: failed image downloads plus failed entries.
        - ``error``: feed-level failure message, or ``None`` on success.
    """
    stats = {"new": 0, "skipped": 0, "errors": 0, "error": None}

    try:
        response = requests.get(config.FEED_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        # Feed-level failure: report it and bail without touching the DB.
        logger.error("Failed to fetch RSS feed: %s", e)
        stats["error"] = str(e)
        return stats

    feed = feedparser.parse(response.text)

    for entry in feed.entries:
        # Prefer the entry's GUID; fall back to its link for feeds that
        # omit <guid>. Entries with neither cannot be deduplicated — skip.
        guid = entry.get("id", entry.get("link", ""))
        if not guid:
            continue

        if Article.query.filter_by(guid=guid).first():
            stats["skipped"] += 1
            continue

        try:
            _cache_entry(entry, guid, stats)
        except Exception as e:
            # Isolate per-entry failures (malformed entries, DB errors) so
            # the remaining entries still get cached; roll back the partial
            # transaction left behind by the failed entry.
            logger.warning("Failed to cache entry %s: %s", guid, e)
            db.session.rollback()
            stats["errors"] += 1

    return stats


def _cache_entry(entry, guid: str, stats: dict) -> None:
    """Persist one feed entry and its images; commits on success.

    Mutates *stats* in place (``new`` on success, ``errors`` per failed
    image). Raises on unrecoverable per-entry errors; the caller rolls back.
    """
    try:
        pub_date = parsedate_to_datetime(entry.get("published", ""))
    except Exception:
        # Missing or unparseable date: fall back to "now" so ordering works.
        pub_date = datetime.now(timezone.utc)

    # Some feeds emit tags without a ``term``; skip those rather than crash.
    categories = [t.get("term") for t in entry.get("tags", []) if t.get("term")]

    # Prefer full content (e.g. content:encoded); fall back to the summary.
    content_html = ""
    if entry.get("content"):
        content_html = entry.content[0].get("value", "")
    elif entry.get("summary"):
        content_html = entry.summary

    article = Article(
        guid=guid,
        title=entry.get("title", "Untitled"),
        author=entry.get("author", "Unknown"),
        pub_date=pub_date,
        categories=json.dumps(categories),
        link=entry.get("link", ""),
        content_html=content_html,
    )
    db.session.add(article)
    # Flush so article.id is assigned before the Image rows reference it.
    db.session.flush()

    soup = BeautifulSoup(content_html, "html.parser")
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src")
        # Only absolute http(s) URLs are fetchable; skip data:/relative refs.
        if not src or not src.startswith("http"):
            continue
        try:
            local_path, w, h = process_image(src, config.IMAGES_DIR)
            db.session.add(
                Image(
                    article_id=article.id,
                    original_url=src,
                    local_path=local_path,
                    width=w,
                    height=h,
                )
            )
            # Point the cached HTML at the local copy.
            img_tag["src"] = local_path
        except Exception as e:
            # Best-effort: keep the article even if an image fails.
            logger.warning("Failed to process image %s: %s", src, e)
            stats["errors"] += 1

    article.content_html = str(soup)
    db.session.commit()
    stats["new"] += 1
import json
from unittest.mock import patch, MagicMock

from src.fetcher import fetch_and_cache_articles
from src.models import Article, Image
from tests.conftest import SAMPLE_RSS_XML


def _mock_feed_response(xml_content):
    """Build a requests.Response stand-in wrapping *xml_content*."""
    mock = MagicMock()
    mock.content = xml_content.encode("utf-8")
    mock.text = xml_content
    mock.status_code = 200
    mock.raise_for_status = MagicMock()
    return mock


def _run_fetch(times=1):
    """Run fetch_and_cache_articles with the feed and image pipeline mocked.

    Patches ``requests.get`` to return SAMPLE_RSS_XML and ``process_image``
    to return a fixed local path/size. Runs the fetch *times* times and
    returns the stats dict from the last run.
    """
    result = None
    with patch("src.fetcher.requests.get") as mock_get, \
         patch("src.fetcher.process_image") as mock_img:
        mock_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
        mock_img.return_value = ("/fake/path.jpg", 800, 450)
        for _ in range(times):
            result = fetch_and_cache_articles()
    return result


def test_fetch_creates_articles(app, db):
    """A fresh fetch caches every entry in the sample feed."""
    with app.app_context():
        result = _run_fetch()

        assert result["new"] == 2
        assert result["skipped"] == 0
        articles = Article.query.order_by(Article.pub_date).all()
        assert len(articles) == 2
        assert articles[0].title == "Test Article One"
        assert articles[1].title == "Test Article Two"


def test_fetch_deduplicates(app, db):
    """A second fetch of the same feed skips all cached entries."""
    with app.app_context():
        result = _run_fetch(times=2)

        assert result["new"] == 0
        assert result["skipped"] == 2
        assert Article.query.count() == 2


def test_fetch_downloads_images(app, db):
    """Image URLs in article content get Image records."""
    with app.app_context():
        _run_fetch()

        images = Image.query.all()
        assert len(images) == 1
        assert images[0].original_url == "https://example.com/image1.jpg"


def test_fetch_rewrites_image_src(app, db):
    """Cached HTML points at the local image path, not the original URL."""
    with app.app_context():
        _run_fetch()

        article = Article.query.filter_by(
            guid="https://example.com/?p=1001"
        ).first()
        assert "https://example.com/image1.jpg" not in article.content_html
        assert "/fake/path.jpg" in article.content_html


def test_fetch_handles_feed_error(app, db):
    """A network failure is reported in stats and caches nothing."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as mock_get:
            mock_get.side_effect = Exception("Network error")
            result = fetch_and_cache_articles()

        assert result["error"] is not None
        assert Article.query.count() == 0