Files
NtR-soudcloud-fetcher/tests/test_backfill.py
cottongin cb3ae403cf feat: add historical backfill with --init CLI and episode numbering
Adds a --init mode that seeds the database with past shows from a given
anchor episode/date forward, batch-fetching likes from SoundCloud and
partitioning them into weekly buckets. Episode numbers are tracked in
the shows table and auto-incremented by the poller for new shows.

Includes full API documentation (docs/api.md) and updated README.

Made-with: Cursor
2026-03-12 02:09:15 -04:00

199 lines
5.5 KiB
Python

from datetime import date, datetime, timezone
from unittest.mock import AsyncMock
import pytest
from ntr_fetcher.backfill import _compute_show_weeks, run_backfill
from ntr_fetcher.db import Database
from ntr_fetcher.models import Track
@pytest.fixture
def db(tmp_path):
    """Return an initialized ``Database`` backed by ``test.db`` under ``tmp_path``."""
    db_file = tmp_path / "test.db"
    instance = Database(str(db_file))
    instance.initialize()
    return instance
def _make_track(id: int, liked_at: str) -> Track:
    """Build a ``Track`` with fixed filler fields for the backfill tests.

    Args:
        id: Track id; also interpolated into the title and permalink URL.
        liked_at: ISO-8601 timestamp string, parsed with
            ``datetime.fromisoformat``.

    Returns:
        A ``Track`` whose only varying fields are the id, the id-derived
        title/permalink, and the parsed ``liked_at`` timestamp.
    """
    liked_when = datetime.fromisoformat(liked_at)
    fields = {
        "id": id,
        "title": f"Track {id}",
        "artist": "Artist",
        "permalink_url": f"https://soundcloud.com/a/t-{id}",
        "artwork_url": None,
        "duration_ms": 180000,
        "license": "cc-by",
        "liked_at": liked_when,
        "raw_json": "{}",
    }
    return Track(**fields)
class TestComputeShowWeeks:
    """Checks for ``_compute_show_weeks``.

    Each returned entry is unpacked by these tests as a 3-tuple of
    ``(episode, start, end)``.
    """

    @staticmethod
    def _weeks_from_anchor():
        """Compute the weeks for the shared anchor (episode 521, 2026-01-07)."""
        return _compute_show_weeks(
            anchor_aired=date(2026, 1, 7),
            anchor_episode=521,
            show_day=2,
            show_hour=22,
        )

    def test_single_week_anchor_is_today(self):
        """Anchoring on today's date yields a first entry for the anchor episode."""
        result = _compute_show_weeks(date.today(), 100, show_day=2, show_hour=22)
        assert len(result) >= 1
        first_episode = result[0][0]
        assert first_episode == 100

    def test_multiple_weeks(self):
        """Episode numbers count up by one from the anchor; each end is after its start."""
        computed = self._weeks_from_anchor()
        assert computed[0][0] == 521
        assert computed[1][0] == 522
        for expected_episode, (episode, begins, ends) in enumerate(computed, start=521):
            assert episode == expected_episode
            assert ends > begins

    def test_week_boundaries_are_utc(self):
        """Both boundaries of every week carry ``timezone.utc`` as tzinfo."""
        for _, begins, ends in self._weeks_from_anchor():
            for boundary in (begins, ends):
                assert boundary.tzinfo == timezone.utc

    def test_consecutive_weeks_are_contiguous(self):
        """Each week's end equals the following week's start."""
        computed = self._weeks_from_anchor()
        # Pair every week with its successor; the last week has none.
        for i, (current, following) in enumerate(zip(computed, computed[1:])):
            assert current[2] == following[1], (
                f"Week {i} end != week {i+1} start"
            )

    def test_anchor_in_future_returns_empty(self):
        """An anchor date in 2099 produces an empty list."""
        far_future = date(2099, 1, 1)
        result = _compute_show_weeks(far_future, 999, show_day=2, show_hour=22)
        assert result == []
@pytest.mark.asyncio
async def test_run_backfill_populates_db(db):
    """Backfill stores episode 521 holding at least one of the mocked likes."""
    likes = [
        _make_track(track_id, stamp)
        for track_id, stamp in (
            (1, "2026-01-02T05:00:00+00:00"),
            (2, "2026-01-04T15:00:00+00:00"),
            (3, "2026-01-09T10:00:00+00:00"),
        )
    ]
    soundcloud = AsyncMock()
    soundcloud.resolve_user.return_value = 12345
    soundcloud.fetch_likes.return_value = likes

    await run_backfill(
        db=db,
        soundcloud=soundcloud,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        anchor_episode=521,
        anchor_aired=date(2026, 1, 7),
    )

    shows = db.list_shows(limit=100, offset=0)
    assert len(shows) >= 1

    # Locate the first stored show carrying the anchor episode number.
    anchor_show = None
    for show in shows:
        if show.episode_number == 521:
            anchor_show = show
            break
    assert anchor_show is not None

    stored_ids = [row["track_id"] for row in db.get_show_tracks(anchor_show.id)]
    # Only requires that some mocked like landed in the anchor episode.
    assert any(wanted in stored_ids for wanted in (1, 2, 3))
@pytest.mark.asyncio
async def test_run_backfill_partitions_tracks_by_week(db):
    """Episodes 521 and 522 must not share any track id after a backfill.

    NOTE(review): a missing episode contributes an empty id set, so this
    check passes trivially when either show was not stored.
    """
    soundcloud = AsyncMock()
    soundcloud.resolve_user.return_value = 12345
    soundcloud.fetch_likes.return_value = [
        _make_track(10, "2026-01-02T12:00:00+00:00"),
        _make_track(20, "2026-01-10T12:00:00+00:00"),
    ]

    await run_backfill(
        db=db,
        soundcloud=soundcloud,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        anchor_episode=521,
        anchor_aired=date(2026, 1, 7),
    )

    shows = db.list_shows(limit=100, offset=0)

    # Resolve both episodes first, then read their tracks in episode order.
    show_by_episode = {
        episode: next((s for s in shows if s.episode_number == episode), None)
        for episode in (521, 522)
    }
    ids_by_episode = {
        episode: (
            {row["track_id"] for row in db.get_show_tracks(show.id)}
            if show
            else set()
        )
        for episode, show in show_by_episode.items()
    }

    assert ids_by_episode[521].isdisjoint(ids_by_episode[522]), (
        "Tracks should not appear in multiple weeks"
    )
@pytest.mark.asyncio
async def test_run_backfill_no_tracks(db):
    """With an empty likes list, at least one show is stored and none has tracks."""
    soundcloud = AsyncMock()
    soundcloud.resolve_user.return_value = 12345
    soundcloud.fetch_likes.return_value = []

    backfill_args = {
        "db": db,
        "soundcloud": soundcloud,
        "soundcloud_user": "nicktherat",
        "show_day": 2,
        "show_hour": 22,
        "anchor_episode": 521,
        "anchor_aired": date(2026, 1, 7),
    }
    await run_backfill(**backfill_args)

    stored_shows = db.list_shows(limit=100, offset=0)
    assert len(stored_shows) >= 1
    for stored in stored_shows:
        assert len(db.get_show_tracks(stored.id)) == 0
@pytest.mark.asyncio
async def test_run_backfill_idempotent(db):
    """Two identical backfill runs must leave the same number of shows.

    Only the show count is compared; track rows are not inspected here.
    """
    soundcloud = AsyncMock()
    soundcloud.resolve_user.return_value = 12345
    soundcloud.fetch_likes.return_value = [
        _make_track(1, "2026-01-05T12:00:00+00:00"),
    ]
    anchor_day = date(2026, 1, 7)

    show_counts = []
    for _ in range(2):
        await run_backfill(
            db=db,
            soundcloud=soundcloud,
            soundcloud_user="nicktherat",
            show_day=2,
            show_hour=22,
            anchor_episode=521,
            anchor_aired=anchor_day,
        )
        show_counts.append(len(db.list_shows(limit=1000, offset=0)))

    after_first, after_second = show_counts
    assert after_first == after_second