Files
NtR-soudcloud-fetcher/src/ntr_fetcher/soundcloud.py

171 lines
5.8 KiB
Python
Raw Normal View History

import asyncio
import json
import logging
import re
from datetime import datetime
from urllib.parse import parse_qs, urlparse
import httpx
from ntr_fetcher.models import Track
logger = logging.getLogger(__name__)
# Web root: fetched by _extract_client_id to scrape the hydration payload.
SOUNDCLOUD_BASE = "https://soundcloud.com"
# Undocumented v2 API host used by the soundcloud.com web app.
API_BASE = "https://api-v2.soundcloud.com"
# Captures the JSON array assigned to `__sc_hydration = [...];` in the
# homepage HTML; DOTALL lets the array span multiple lines. Non-greedy, so
# it stops at the first `];` — assumes the array contains no nested `];`
# sequence (TODO confirm against live HTML).
HYDRATION_PATTERN = re.compile(r"__sc_hydration\s*=\s*(\[.*?\])\s*;", re.DOTALL)
def _build_cursor(until: datetime, user_id: int) -> str:
ts = until.strftime("%Y-%m-%dT%H:%M:%S.000Z")
padded_user = str(user_id).zfill(20)
return f"{ts},user-track-likes,000-{padded_user}-99999999999999999999"
class SoundCloudClient:
    """Async client for the undocumented SoundCloud v2 API.

    A ``client_id`` is scraped from the soundcloud.com homepage hydration
    payload, cached on the instance, and transparently refreshed when the
    API starts rejecting it (401) or erroring (5xx).
    """

    def __init__(self, http_client: httpx.AsyncClient | None = None):
        """Wrap an existing ``httpx.AsyncClient`` or create a private one."""
        self._http = http_client or httpx.AsyncClient(timeout=15.0)
        # Lazily-populated cache; None means "scrape on next use".
        self._client_id: str | None = None

    async def _extract_client_id(self) -> str:
        """Return the cached client_id, scraping the homepage on a miss.

        Raises:
            ValueError: if the hydration payload, or an ``apiClient`` entry
                inside it, cannot be found in the homepage HTML.
        """
        if self._client_id is not None:
            return self._client_id
        resp = await self._http.get(SOUNDCLOUD_BASE)
        resp.raise_for_status()
        match = HYDRATION_PATTERN.search(resp.text)
        if not match:
            raise ValueError("Could not find __sc_hydration in SoundCloud HTML — cannot extract client_id")
        hydration = json.loads(match.group(1))
        for entry in hydration:
            if entry.get("hydratable") == "apiClient":
                self._client_id = entry["data"]["id"]
                # The payload flags ids that are about to rotate; surface it
                # so operators can correlate later 401s with an expiring id.
                if entry["data"].get("isExpiring", False):
                    logger.warning("SoundCloud client_id is marked as expiring")
                return self._client_id
        raise ValueError("No apiClient entry in __sc_hydration — cannot extract client_id")

    def invalidate_client_id(self) -> None:
        """Drop the cached client_id; the next API call re-scrapes it."""
        self._client_id = None

    async def _api_get(self, url: str, params: dict | None = None) -> httpx.Response:
        """GET *url* with ``client_id`` auth, retrying 401/5xx responses.

        A 401 triggers an immediate client_id refresh and retry; a 5xx also
        sleeps ``2**attempt`` seconds before retrying. Any other 4xx raises
        at once via ``raise_for_status``.

        Raises:
            httpx.HTTPStatusError: on a non-retryable status, or once all
                retry attempts are exhausted.
        """
        client_id = await self._extract_client_id()
        params = dict(params or {})  # copy: never mutate the caller's dict
        params["client_id"] = client_id
        max_attempts = 3
        for attempt in range(max_attempts):
            resp = await self._http.get(url, params=params)
            if resp.status_code == 401 or resp.status_code >= 500:
                logger.warning(
                    "Got %d from SoundCloud API, refreshing client_id (attempt %d/%d)",
                    resp.status_code, attempt + 1, max_attempts,
                )
                self.invalidate_client_id()
                if resp.status_code >= 500:
                    # Back off only for server errors; a stale client_id
                    # (401) can be retried immediately with a fresh id.
                    await asyncio.sleep(2 ** attempt)
                client_id = await self._extract_client_id()
                params["client_id"] = client_id
                continue
            resp.raise_for_status()
            return resp
        raise httpx.HTTPStatusError(
            f"Failed after {max_attempts} attempts (last status: {resp.status_code})",
            request=resp.request,
            response=resp,
        )

    async def resolve_user(self, username: str) -> int:
        """Resolve a public profile *username* to its numeric user id."""
        resp = await self._api_get(
            f"{API_BASE}/resolve",
            params={"url": f"{SOUNDCLOUD_BASE}/{username}"},
        )
        return resp.json()["id"]

    async def fetch_likes(
        self,
        user_id: int,
        since: datetime,
        until: datetime,
        limit: int = 50,
    ) -> list[Track]:
        """Return the user's liked tracks with ``since <= liked_at <= until``.

        Pagination starts from a fabricated cursor positioned at *until* so
        the whole like history need not be walked; if the API rejects that
        cursor with a 5xx, we fall back once to cursor-less pagination from
        the newest like. Results are sorted oldest-first.

        NOTE(review): ``liked_at`` is parsed as timezone-aware (the ``Z``
        suffix becomes ``+00:00``), so *since*/*until* must also be aware or
        the comparisons raise ``TypeError`` — confirm callers pass aware
        datetimes. The early-stop below assumes the API returns likes
        newest-first.

        Args:
            user_id: Numeric SoundCloud user id (see :meth:`resolve_user`).
            since: Inclusive lower bound on the like timestamp.
            until: Inclusive upper bound on the like timestamp.
            limit: Page size requested from the API.
        """
        cursor: str | None = _build_cursor(until, user_id)
        collected: list[Track] = []
        # True only while the request in flight still carries our fabricated
        # cursor; after any success we hold a server-issued cursor instead.
        used_fabricated_cursor = True
        while True:
            params: dict = {"limit": limit}
            if cursor:
                params["offset"] = cursor
            try:
                resp = await self._api_get(f"{API_BASE}/users/{user_id}/likes", params=params)
            except httpx.HTTPStatusError as exc:
                if used_fabricated_cursor and cursor and exc.response.status_code >= 500:
                    logger.warning("Fabricated cursor rejected (HTTP %d), retrying without cursor", exc.response.status_code)
                    cursor = None
                    used_fabricated_cursor = False
                    continue
                raise
            # BUG FIX: the flag previously stayed True for the whole walk, so
            # a 5xx on a later page (already using a server-issued cursor from
            # next_href) silently restarted pagination from the newest like
            # and duplicated already-collected tracks.
            used_fabricated_cursor = False
            data = resp.json()
            collection = data.get("collection", [])
            if not collection:
                break
            stop = False
            for item in collection:
                liked_at_str = item.get("created_at")
                if not liked_at_str:
                    # Defensive: without a timestamp the item cannot be
                    # window-filtered (fromisoformat("") would raise).
                    continue
                liked_at = datetime.fromisoformat(liked_at_str.replace("Z", "+00:00"))
                if liked_at < since:
                    # Assuming newest-first ordering, every later item is
                    # older still — stop paginating entirely.
                    stop = True
                    break
                if liked_at > until:
                    continue
                track_data = item.get("track")
                if track_data is None:
                    # Non-track likes (e.g. playlists) carry no "track" key.
                    continue
                user_data = track_data.get("user", {})
                collected.append(
                    Track(
                        id=track_data["id"],
                        title=track_data["title"],
                        artist=user_data.get("username", "Unknown"),
                        permalink_url=track_data["permalink_url"],
                        artwork_url=track_data.get("artwork_url"),
                        duration_ms=track_data.get("full_duration", track_data.get("duration", 0)),
                        license=track_data.get("license", ""),
                        liked_at=liked_at,
                        raw_json=json.dumps(track_data),
                    )
                )
            if stop:
                break
            next_href = data.get("next_href")
            if not next_href:
                break
            parsed = urlparse(next_href)
            qs = parse_qs(parsed.query)
            cursor = qs.get("offset", [None])[0]
            if cursor is None:
                break
        collected.sort(key=lambda t: t.liked_at)
        return collected

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._http.aclose()