feat(main): commit
This commit is contained in:
244
core/services/youtube.py
Normal file
244
core/services/youtube.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""
|
||||
YouTube source sync service.
|
||||
|
||||
Two-phase design:
|
||||
Phase 1 — METADATA ONLY (sync_source):
|
||||
Crawls a YouTube channel or playlist and upserts MediaItem rows with
|
||||
title, duration, thumbnail etc. No video files are downloaded.
|
||||
A max_videos cap keeps this fast for large channels.
|
||||
|
||||
Phase 2 — DOWNLOAD ON DEMAND (download_for_airing):
|
||||
Called only by `python manage.py cache_upcoming` immediately before
|
||||
a scheduled Airing. Downloads only the specific video needed.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import yt_dlp
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from core.models import MediaItem, MediaSource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# The only MediaSource kinds this service knows how to crawl;
# sync_source raises ValueError for anything else.
YOUTUBE_SOURCE_TYPES = {
    MediaSource.SourceType.YOUTUBE_CHANNEL,
    MediaSource.SourceType.YOUTUBE_PLAYLIST,
}
|
||||
|
||||
|
||||
def _cache_dir() -> Path:
    """Return the directory used to cache downloaded videos, creating it on first use."""
    # Fall back to a tmp location when the project has no MEDIA_ROOT configured.
    configured_root = getattr(settings, "MEDIA_ROOT", "/tmp/pytv_cache")
    cache_path = Path(configured_root)
    cache_path.mkdir(parents=True, exist_ok=True)
    return cache_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# metadata extraction (no download)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _extract_playlist_info(url: str, max_videos: int | None = None) -> list[dict]:
    """
    Use yt-dlp to extract metadata for up to `max_videos` videos in a
    channel/playlist without downloading any files.

    `extract_flat=True` is crucial — it fetches only a lightweight index
    (title, id, duration) rather than resolving full stream URLs, which
    makes crawling large channels orders of magnitude faster.

    Args:
        url: Channel or playlist URL understood by yt-dlp.
        max_videos: Hard cap on the number of entries returned; None means
            no cap.

    Returns:
        A list of yt-dlp info dicts (most-recent first for channels).
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,  # metadata only — NO stream/download URLs
        "ignoreerrors": True,
    }
    if max_videos is not None:
        # yt-dlp uses 1-based playlist indices; playlistend limits how many
        # entries are fetched from the source before returning.
        ydl_opts["playlistend"] = max_videos

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    if info is None:
        return []

    # Both channels and playlists wrap entries in an "entries" key.
    entries = info.get("entries") or []
    # Flatten one extra level for channels (channel -> playlist -> entries)
    flat: list[dict] = []
    for entry in entries:
        if entry is None:
            continue
        if "entries" in entry:  # nested playlist
            flat.extend(e for e in entry["entries"] if e)
        else:
            flat.append(entry)

    # BUG FIX: for channels, `playlistend` limits the number of *top-level*
    # entries (tabs/nested playlists), not the videos inside them, so the
    # flattened list could exceed the requested cap. Enforce it here too.
    if max_videos is not None:
        flat = flat[:max_videos]
    return flat
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _upsert_video_entry(media_source: MediaSource, entry: dict) -> str:
    """
    Upsert a single yt-dlp entry as a MediaItem under `media_source`.

    Returns one of "created", "updated", or "skipped" (skipped when the
    entry has no video id).
    """
    video_id = entry.get("id")
    if not video_id:
        return "skipped"

    title = entry.get("title") or f"YouTube Video {video_id}"
    duration = entry.get("duration") or 0  # seconds, may be None for live
    thumbnail = entry.get("thumbnail") or ""
    description = entry.get("description") or ""

    # Derive release year from yt-dlp's "YYYYMMDD" upload_date when present.
    release_year = None
    upload_date = entry.get("upload_date")  # "YYYYMMDD"
    if upload_date and len(upload_date) >= 4:
        try:
            release_year = int(upload_date[:4])
        except ValueError:
            pass

    # Store the YouTube watch URL in file_path so the scheduler can
    # reference it. The ACTUAL video file will only be downloaded when
    # `cache_upcoming` runs before the airing.
    video_url = entry.get("url") or f"https://www.youtube.com/watch?v={video_id}"

    _, was_created = MediaItem.objects.update_or_create(
        media_source=media_source,
        youtube_video_id=video_id,
        defaults={
            "title": title,
            "item_kind": MediaItem.ItemKind.MOVIE,
            # Clamp to >= 1 second so scheduling math never sees 0-length items.
            "runtime_seconds": max(int(duration), 1),
            "file_path": video_url,
            "thumbnail_path": thumbnail,
            "description": description,
            "release_year": release_year,
            "metadata_json": {
                "yt_id": video_id,
                "yt_url": video_url,
                "uploader": entry.get("uploader", ""),
            },
            "is_active": True,
        },
    )
    return "created" if was_created else "updated"


def sync_source(media_source: MediaSource, max_videos: int | None = None) -> dict:
    """
    Phase 1: Metadata-only sync.

    Crawls a YouTube channel/playlist and upserts MediaItem rows for each
    discovered video. No video files are ever downloaded here.

    Args:
        media_source: The MediaSource to sync.
        max_videos: Maximum number of videos to import. When None the
            defaults are applied:
            - youtube_channel → 50 (channels can have 10k+ videos)
            - youtube_playlist → 200 (playlists are usually curated)

    Returns:
        {"created": int, "updated": int, "skipped": int}

    Raises:
        ValueError: If `media_source` is not a YouTube channel/playlist.
    """
    if media_source.source_type not in YOUTUBE_SOURCE_TYPES:
        raise ValueError(f"MediaSource {media_source.id} is not a YouTube source.")

    # Apply sensible defaults per source type
    if max_videos is None:
        if media_source.source_type == MediaSource.SourceType.YOUTUBE_CHANNEL:
            max_videos = 50
        else:
            max_videos = 200

    entries = _extract_playlist_info(media_source.uri, max_videos=max_videos)

    counts = {"created": 0, "updated": 0, "skipped": 0}
    for entry in entries:
        counts[_upsert_video_entry(media_source, entry)] += 1

    # Update last-scanned timestamp
    media_source.last_scanned_at = timezone.now()
    media_source.save(update_fields=["last_scanned_at"])

    logger.info(
        "sync_source(%s): created=%d updated=%d skipped=%d (limit=%s)",
        media_source.id,
        counts["created"],
        counts["updated"],
        counts["skipped"],
        max_videos,
    )
    return counts
|
||||
|
||||
|
||||
def download_for_airing(media_item: MediaItem) -> Path:
    """
    Download a YouTube video to the local cache so it can be served
    directly without network dependency at airing time.

    Args:
        media_item: Item whose `youtube_video_id` identifies the video and
            whose `file_path` holds the watch URL (set by sync_source).

    Returns:
        The local Path of the downloaded (or already-cached) file.

    Raises:
        ValueError: If the item has no youtube_video_id.
        RuntimeError: If the download fails.
    """
    video_id = media_item.youtube_video_id
    if not video_id:
        raise ValueError(f"MediaItem {media_item.id} has no youtube_video_id.")

    cache_dir = _cache_dir()
    # Use video_id so we can detect already-cached files quickly.
    output_template = str(cache_dir / f"{video_id}.%(ext)s")

    # Check if already cached and not expired
    if media_item.cached_file_path:
        existing = Path(media_item.cached_file_path)
        if existing.exists():
            logger.info("cache hit: %s already at %s", video_id, existing)
            return existing

    # BUG FIX: filenames are keyed by video_id precisely so earlier downloads
    # can be reused, but the original never looked on disk when
    # cached_file_path was unset or stale. Reclaim any existing cache file
    # instead of re-downloading it.
    for candidate in cache_dir.glob(f"{video_id}.*"):
        if candidate.is_file():
            media_item.cached_file_path = str(candidate)
            media_item.save(update_fields=["cached_file_path"])
            logger.info("cache hit: %s already at %s", video_id, candidate)
            return candidate

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "outtmpl": output_template,
        # Only request pre-muxed (progressive) formats — no separate video+audio
        # streams that would require ffmpeg to merge. Falls back through:
        # 1. Best pre-muxed mp4 up to 1080p
        # 2. Any pre-muxed mp4
        # 3. Any pre-muxed webm
        # 4. Anything pre-muxed (no merger needed)
        "format": "best[ext=mp4][height<=1080]/best[ext=mp4]/best[ext=webm]/best",
    }

    url = media_item.file_path  # URL stored here by sync_source
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)

        if info is None:
            raise RuntimeError(f"yt-dlp returned no info for {url}")

        # Resolve the final filename while the YoutubeDL instance (and its
        # options) are still in scope.
        downloaded_path = Path(ydl.prepare_filename(info))

    if not downloaded_path.exists():
        # yt-dlp may have merged to .mp4 even if the template said otherwise
        mp4_path = downloaded_path.with_suffix(".mp4")
        if mp4_path.exists():
            downloaded_path = mp4_path
        else:
            raise RuntimeError(f"Expected download at {downloaded_path} but file not found.")

    # Persist the cache location on the model
    media_item.cached_file_path = str(downloaded_path)
    media_item.save(update_fields=["cached_file_path"])

    logger.info("downloaded %s -> %s", video_id, downloaded_path)
    return downloaded_path
|
||||
Reference in New Issue
Block a user