feat: add historical backfill with --init CLI and episode numbering

Adds a --init mode that seeds the database with past shows from a given
anchor episode/date forward, batch-fetching likes from SoundCloud and
partitioning them into weekly buckets. Episode numbers are tracked in
the shows table and auto-incremented by the poller for new shows.

Includes full API documentation (docs/api.md) and updated README.

Made-with: Cursor
This commit is contained in:
cottongin
2026-03-12 02:09:15 -04:00
parent c88826ac4d
commit cb3ae403cf
14 changed files with 922 additions and 21 deletions

View File

@@ -62,6 +62,7 @@ def test_playlist(client, db):
resp = client.get("/playlist")
assert resp.status_code == 200
data = resp.json()
assert "episode_number" in data
assert len(data["tracks"]) == 2
assert data["tracks"][0]["position"] == 1
assert data["tracks"][0]["title"] == "Song A"
@@ -84,14 +85,18 @@ def test_shows_list(client, db):
_seed_show(db)
resp = client.get("/shows")
assert resp.status_code == 200
assert len(resp.json()) >= 1
data = resp.json()
assert len(data) >= 1
assert "episode_number" in data[0]
def test_shows_detail(client, db):
    """GET /shows/{id} returns the show detail with episode number and tracks."""
    show = _seed_show(db)
    resp = client.get(f"/shows/{show.id}")
    assert resp.status_code == 200
    # Parse the response body once; the old pre-refactor assert duplicated
    # the tracks-length check via a second resp.json() call.
    data = resp.json()
    assert "episode_number" in data
    assert len(data["tracks"]) == 2
def test_admin_refresh_requires_token(client):

198
tests/test_backfill.py Normal file
View File

@@ -0,0 +1,198 @@
from datetime import date, datetime, timezone
from unittest.mock import AsyncMock
import pytest
from ntr_fetcher.backfill import _compute_show_weeks, run_backfill
from ntr_fetcher.db import Database
from ntr_fetcher.models import Track
@pytest.fixture
def db(tmp_path):
    """Yield a fresh, schema-initialized Database backed by a temp file."""
    instance = Database(str(tmp_path / "test.db"))
    instance.initialize()
    return instance
def _make_track(id: int, liked_at: str) -> Track:
    """Build a minimal Track fixture liked at the given ISO-8601 timestamp."""
    liked = datetime.fromisoformat(liked_at)
    return Track(
        id=id,
        title=f"Track {id}",
        artist="Artist",
        permalink_url=f"https://soundcloud.com/a/t-{id}",
        artwork_url=None,
        duration_ms=180000,
        license="cc-by",
        liked_at=liked,
        raw_json="{}",
    )
class TestComputeShowWeeks:
    """Unit tests for the weekly-bucket computation in _compute_show_weeks."""

    @staticmethod
    def _weeks():
        # Shared fixture: anchor episode 521, aired 2026-01-07, Wednesday 22h.
        return _compute_show_weeks(
            anchor_aired=date(2026, 1, 7),
            anchor_episode=521,
            show_day=2,
            show_hour=22,
        )

    def test_single_week_anchor_is_today(self):
        weeks = _compute_show_weeks(date.today(), 100, show_day=2, show_hour=22)
        assert len(weeks) >= 1
        assert weeks[0][0] == 100

    def test_multiple_weeks(self):
        weeks = self._weeks()
        assert weeks[0][0] == 521
        assert weeks[1][0] == 522
        # Episode numbers increase by one per week and every window is non-empty.
        for offset, (ep, start, end) in enumerate(weeks):
            assert ep == 521 + offset
            assert end > start

    def test_week_boundaries_are_utc(self):
        for _, start, end in self._weeks():
            assert start.tzinfo == timezone.utc
            assert end.tzinfo == timezone.utc

    def test_consecutive_weeks_are_contiguous(self):
        weeks = self._weeks()
        # Each week's end must be exactly the next week's start.
        for i, (current, following) in enumerate(zip(weeks, weeks[1:])):
            assert current[2] == following[1], (
                f"Week {i} end != week {i+1} start"
            )

    def test_anchor_in_future_returns_empty(self):
        result = _compute_show_weeks(
            date(2099, 1, 1), 999, show_day=2, show_hour=22
        )
        assert result == []
@pytest.mark.asyncio
async def test_run_backfill_populates_db(db):
    """Backfill creates weekly shows and attaches the fetched liked tracks."""
    mock_sc = AsyncMock()
    mock_sc.resolve_user.return_value = 12345
    mock_sc.fetch_likes.return_value = [
        _make_track(1, "2026-01-02T05:00:00+00:00"),
        _make_track(2, "2026-01-04T15:00:00+00:00"),
        _make_track(3, "2026-01-09T10:00:00+00:00"),
    ]
    await run_backfill(
        db=db,
        soundcloud=mock_sc,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        anchor_episode=521,
        anchor_aired=date(2026, 1, 7),
    )
    shows = db.list_shows(limit=100, offset=0)
    assert len(shows) >= 1
    ep_521 = next((s for s in shows if s.episode_number == 521), None)
    assert ep_521 is not None
    # At least one of the mocked likes must have landed in episode 521.
    track_ids = {row["track_id"] for row in db.get_show_tracks(ep_521.id)}
    assert track_ids & {1, 2, 3}
@pytest.mark.asyncio
async def test_run_backfill_partitions_tracks_by_week(db):
    """A liked track must land in exactly one weekly show, never in two.

    The track-id collection for each episode was duplicated in two if/else
    branches; it is factored into a local helper so both episodes are
    handled identically.
    """
    mock_sc = AsyncMock()
    mock_sc.resolve_user.return_value = 12345
    t_week1 = _make_track(10, "2026-01-02T12:00:00+00:00")
    t_week2 = _make_track(20, "2026-01-10T12:00:00+00:00")
    mock_sc.fetch_likes.return_value = [t_week1, t_week2]
    await run_backfill(
        db=db,
        soundcloud=mock_sc,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        anchor_episode=521,
        anchor_aired=date(2026, 1, 7),
    )
    shows = db.list_shows(limit=100, offset=0)

    def ids_for(episode_number):
        # Track ids attached to the given episode; empty set if it was not created.
        show = next((s for s in shows if s.episode_number == episode_number), None)
        if show is None:
            return set()
        return {t["track_id"] for t in db.get_show_tracks(show.id)}

    ids_521 = ids_for(521)
    ids_522 = ids_for(522)
    assert ids_521 & ids_522 == set(), "Tracks should not appear in multiple weeks"
@pytest.mark.asyncio
async def test_run_backfill_no_tracks(db):
    """With zero likes returned, shows are still seeded but remain empty."""
    mock_sc = AsyncMock()
    mock_sc.resolve_user.return_value = 12345
    mock_sc.fetch_likes.return_value = []
    await run_backfill(
        db=db,
        soundcloud=mock_sc,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        anchor_episode=521,
        anchor_aired=date(2026, 1, 7),
    )
    shows = db.list_shows(limit=100, offset=0)
    assert len(shows) >= 1
    # Every created show must have an empty track list.
    assert all(len(db.get_show_tracks(show.id)) == 0 for show in shows)
@pytest.mark.asyncio
async def test_run_backfill_idempotent(db):
    """Running backfill twice with the same data shouldn't duplicate shows."""
    mock_sc = AsyncMock()
    mock_sc.resolve_user.return_value = 12345
    mock_sc.fetch_likes.return_value = [
        _make_track(1, "2026-01-05T12:00:00+00:00"),
    ]

    async def run():
        # Same arguments both times — the second run must be a no-op.
        await run_backfill(
            db=db,
            soundcloud=mock_sc,
            soundcloud_user="nicktherat",
            show_day=2,
            show_hour=22,
            anchor_episode=521,
            anchor_aired=date(2026, 1, 7),
        )

    await run()
    count_first = len(db.list_shows(limit=1000, offset=0))
    await run()
    count_second = len(db.list_shows(limit=1000, offset=0))
    assert count_first == count_second

View File

@@ -226,6 +226,52 @@ def test_add_track_to_show_at_position(db):
assert tracks[2]["track_id"] == 2
def test_get_or_create_show_with_episode_number(db):
    """A show created with an episode number keeps it on subsequent lookups."""
    start = datetime(2026, 1, 8, 3, 0, 0, tzinfo=timezone.utc)
    end = datetime(2026, 1, 15, 3, 0, 0, tzinfo=timezone.utc)
    created = db.get_or_create_show(start, end, episode_number=521)
    assert created.episode_number == 521
    # Fetching the same week without a number returns the same, numbered row.
    fetched = db.get_or_create_show(start, end)
    assert fetched.id == created.id
    assert fetched.episode_number == 521
def test_get_or_create_show_updates_episode_number(db):
    """Supplying an episode number later backfills an unnumbered show row."""
    start = datetime(2026, 1, 8, 3, 0, 0, tzinfo=timezone.utc)
    end = datetime(2026, 1, 15, 3, 0, 0, tzinfo=timezone.utc)
    unnumbered = db.get_or_create_show(start, end)
    assert unnumbered.episode_number is None
    numbered = db.get_or_create_show(start, end, episode_number=521)
    # Same row, now carrying the episode number.
    assert numbered.id == unnumbered.id
    assert numbered.episode_number == 521
def test_get_latest_episode_number(db):
    """Latest episode number is None on an empty DB and tracks each insert."""
    assert db.get_latest_episode_number() is None
    # Insert two consecutive weekly shows; the latest number follows along.
    for number, day in ((521, 8), (522, 15)):
        db.get_or_create_show(
            datetime(2026, 1, day, 3, 0, 0, tzinfo=timezone.utc),
            datetime(2026, 1, day + 7, 3, 0, 0, tzinfo=timezone.utc),
            episode_number=number,
        )
        assert db.get_latest_episode_number() == number
def test_update_show_episode_number(db):
    """update_show_episode_number assigns a number to an existing show."""
    start = datetime(2026, 1, 8, 3, 0, 0, tzinfo=timezone.utc)
    end = datetime(2026, 1, 15, 3, 0, 0, tzinfo=timezone.utc)
    created = db.get_or_create_show(start, end)
    assert created.episode_number is None
    db.update_show_episode_number(created.id, 521)
    # Re-fetching the same week reflects the updated number.
    refreshed = db.get_or_create_show(start, end)
    assert refreshed.episode_number == 521
def test_has_track_in_show(db):
week_start = datetime(2026, 3, 13, 2, 0, 0, tzinfo=timezone.utc)
week_end = datetime(2026, 3, 20, 2, 0, 0, tzinfo=timezone.utc)

View File

@@ -93,6 +93,89 @@ async def test_poll_once_removes_unliked_tracks():
assert call_args[0][1] == [1]
@pytest.mark.asyncio
async def test_poll_once_auto_assigns_episode_number():
    """An unnumbered current show receives latest episode number + 1."""
    mock_sc = AsyncMock()
    mock_sc.resolve_user.return_value = 206979918
    mock_sc.fetch_likes.return_value = [
        _make_track(1, "2026-03-14T01:00:00+00:00"),
    ]
    mock_db = MagicMock()
    current_show = MagicMock(id=5, episode_number=None)
    mock_db.get_or_create_show.return_value = current_show
    mock_db.get_latest_episode_number.return_value = 530
    poller = Poller(
        db=mock_db,
        soundcloud=mock_sc,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        poll_interval=3600,
    )
    await poller.poll_once()
    # 530 was the latest known episode, so this show becomes 531.
    mock_db.update_show_episode_number.assert_called_once_with(5, 531)
@pytest.mark.asyncio
async def test_poll_once_skips_numbering_when_no_history():
    """Without any previously numbered show, the poller assigns nothing."""
    mock_sc = AsyncMock()
    mock_sc.resolve_user.return_value = 206979918
    mock_sc.fetch_likes.return_value = []
    mock_db = MagicMock()
    current_show = MagicMock(id=1, episode_number=None)
    mock_db.get_or_create_show.return_value = current_show
    # No historical episode number exists to increment from.
    mock_db.get_latest_episode_number.return_value = None
    poller = Poller(
        db=mock_db,
        soundcloud=mock_sc,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        poll_interval=3600,
    )
    await poller.poll_once()
    mock_db.update_show_episode_number.assert_not_called()
@pytest.mark.asyncio
async def test_poll_once_skips_numbering_when_already_assigned():
    """A show that already has an episode number is left untouched."""
    mock_sc = AsyncMock()
    mock_sc.resolve_user.return_value = 206979918
    mock_sc.fetch_likes.return_value = []
    mock_db = MagicMock()
    current_show = MagicMock(id=1, episode_number=530)
    mock_db.get_or_create_show.return_value = current_show
    poller = Poller(
        db=mock_db,
        soundcloud=mock_sc,
        soundcloud_user="nicktherat",
        show_day=2,
        show_hour=22,
        poll_interval=3600,
    )
    await poller.poll_once()
    # Numbering logic must be skipped entirely — no lookup, no update.
    mock_db.get_latest_episode_number.assert_not_called()
    mock_db.update_show_episode_number.assert_not_called()
@pytest.mark.asyncio
async def test_poll_once_full_refresh():
mock_sc = AsyncMock()