feat: RSS fetcher with dedup, image download, HTML rewriting
Made-with: Cursor
This commit is contained in:
91
src/fetcher.py
Normal file
91
src/fetcher.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import feedparser
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
import config
|
||||
from app import db
|
||||
from src.models import Article, Image
|
||||
from src.images import process_image
|
||||
|
||||
logger = logging.getLogger(__name__)


def fetch_and_cache_articles() -> dict:
    """Fetch the configured RSS feed and cache any new articles.

    Downloads the feed at ``config.FEED_URL``, skips entries already
    stored (deduplicated by GUID), persists the rest, downloads the
    images they reference, and rewrites ``<img src>`` attributes to the
    local copies.

    Returns:
        dict with keys:
            "new":     number of articles created this run
            "skipped": number of entries already present
            "errors":  number of per-entry / per-image failures
            "error":   feed-level error message, or None on success
    """
    stats = {"new": 0, "skipped": 0, "errors": 0, "error": None}

    try:
        response = requests.get(config.FEED_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:  # network/HTTP failure is fatal for this run
        logger.error("Failed to fetch RSS feed: %s", e)
        stats["error"] = str(e)
        return stats

    feed = feedparser.parse(response.text)

    for entry in feed.entries:
        guid = entry.get("id", entry.get("link", ""))
        if not guid:
            continue  # nothing stable to deduplicate on

        if Article.query.filter_by(guid=guid).first():
            stats["skipped"] += 1
            continue

        # Isolate each entry so one bad record cannot abort the whole
        # run or leave a half-flushed session behind.
        try:
            _cache_entry(entry, guid, stats)
        except Exception as e:
            logger.error("Failed to cache entry %s: %s", guid, e)
            db.session.rollback()
            stats["errors"] += 1

    return stats


def _cache_entry(entry, guid: str, stats: dict) -> None:
    """Persist one feed entry plus its images; commits on success."""
    try:
        pub_date = parsedate_to_datetime(entry.get("published", ""))
    except Exception:
        # Missing or malformed dates fall back to "now" (UTC, aware).
        pub_date = datetime.now(timezone.utc)

    # feedparser tags are dict-like; tolerate entries without a "term"
    # (plain attribute access would raise AttributeError).
    categories = [t.get("term", "") for t in entry.get("tags", [])]

    # Prefer full content; fall back to the summary when absent.
    content_html = ""
    if entry.get("content"):
        content_html = entry.content[0].get("value", "")
    elif entry.get("summary"):
        content_html = entry.summary

    article = Article(
        guid=guid,
        title=entry.get("title", "Untitled"),
        author=entry.get("author", "Unknown"),
        pub_date=pub_date,
        categories=json.dumps(categories),
        link=entry.get("link", ""),
        content_html=content_html,
    )
    db.session.add(article)
    db.session.flush()  # assigns article.id for the Image foreign key

    article.content_html = _localize_images(article, content_html, stats)
    db.session.commit()
    stats["new"] += 1


def _localize_images(article, content_html: str, stats: dict) -> str:
    """Download remote images in *content_html* and rewrite their src.

    Adds an Image row per successfully processed image; failures are
    logged, counted in ``stats["errors"]``, and the original src is
    kept. Returns the (possibly rewritten) HTML.
    """
    soup = BeautifulSoup(content_html, "html.parser")
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src")
        if not src or not src.startswith("http"):
            continue  # skip relative / data: / missing sources
        try:
            local_path, width, height = process_image(src, config.IMAGES_DIR)
        except Exception as e:
            logger.warning("Failed to process image %s: %s", src, e)
            stats["errors"] += 1
            continue
        db.session.add(
            Image(
                article_id=article.id,
                original_url=src,
                local_path=local_path,
                width=width,
                height=height,
            )
        )
        img_tag["src"] = local_path
    return str(soup)
|
||||
83
tests/test_fetcher.py
Normal file
83
tests/test_fetcher.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import json
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from src.fetcher import fetch_and_cache_articles
|
||||
from src.models import Article, Image
|
||||
from tests.conftest import SAMPLE_RSS_XML
|
||||
|
||||
|
||||
def _mock_feed_response(xml_content):
|
||||
mock = MagicMock()
|
||||
mock.content = xml_content.encode("utf-8")
|
||||
mock.text = xml_content
|
||||
mock.status_code = 200
|
||||
mock.raise_for_status = MagicMock()
|
||||
return mock
|
||||
|
||||
|
||||
def test_fetch_creates_articles(app, db):
    """Two fresh feed entries should both be cached on the first run."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as mock_get, \
                patch("src.fetcher.process_image") as mock_img:
            mock_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            mock_img.return_value = ("/fake/path.jpg", 800, 450)
            result = fetch_and_cache_articles()

        assert result["new"] == 2
        assert result["skipped"] == 0
        articles = Article.query.order_by(Article.pub_date).all()
        assert len(articles) == 2
        assert [a.title for a in articles] == [
            "Test Article One",
            "Test Article Two",
        ]
|
||||
|
||||
|
||||
def test_fetch_deduplicates(app, db):
    """Fetching the same feed twice must not create duplicate rows."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as mock_get, \
                patch("src.fetcher.process_image") as mock_img:
            mock_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            mock_img.return_value = ("/fake/path.jpg", 800, 450)
            fetch_and_cache_articles()
            result = fetch_and_cache_articles()

        # Second pass sees every entry as already cached.
        assert result["new"] == 0
        assert result["skipped"] == 2
        assert Article.query.count() == 2
|
||||
|
||||
|
||||
def test_fetch_downloads_images(app, db):
    """Each remote <img> in the feed yields exactly one Image row."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as mock_get, \
                patch("src.fetcher.process_image") as mock_img:
            mock_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            mock_img.return_value = ("/fake/path.jpg", 800, 450)
            fetch_and_cache_articles()

        images = Image.query.all()
        assert len(images) == 1
        assert images[0].original_url == "https://example.com/image1.jpg"
|
||||
|
||||
|
||||
def test_fetch_rewrites_image_src(app, db):
    """Cached HTML must reference the local copy, not the remote URL."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as mock_get, \
                patch("src.fetcher.process_image") as mock_img:
            mock_get.return_value = _mock_feed_response(SAMPLE_RSS_XML)
            mock_img.return_value = ("/fake/path.jpg", 800, 450)
            fetch_and_cache_articles()

        article = Article.query.filter_by(
            guid="https://example.com/?p=1001"
        ).first()
        assert "https://example.com/image1.jpg" not in article.content_html
        assert "/fake/path.jpg" in article.content_html
|
||||
|
||||
|
||||
def test_fetch_handles_feed_error(app, db):
    """A network failure reports an error and caches nothing."""
    with app.app_context():
        with patch("src.fetcher.requests.get") as mock_get:
            mock_get.side_effect = Exception("Network error")
            result = fetch_and_cache_articles()

        assert result["error"] is not None
        assert Article.query.count() == 0
|
||||
Reference in New Issue
Block a user