feat(main): commit

This commit is contained in:
2026-03-08 16:48:58 -04:00
parent 567766eaed
commit f37382d2b8
29 changed files with 3735 additions and 223 deletions

View File

@@ -0,0 +1,52 @@
"""
management command: cache_upcoming
Delegates to core.services.cache.run_cache() — the same logic exposed
by the API endpoint, so CLI and web UI behavior are always in sync.
Usage:
python manage.py cache_upcoming # default: next 24 hours
python manage.py cache_upcoming --hours 48
python manage.py cache_upcoming --prune-only
"""
from django.core.management.base import BaseCommand
from core.services.cache import run_cache
class Command(BaseCommand):
    """
    Management command: cache_upcoming

    Delegates to core.services.cache.run_cache() — the same logic exposed
    by the API endpoint, so CLI and web UI behavior stay in sync.
    """

    help = "Download YouTube videos for upcoming airings and prune old cache files."

    def add_arguments(self, parser):
        parser.add_argument(
            "--hours",
            type=int,
            default=24,
            help="How many hours ahead to scan for upcoming airings (default: 24).",
        )
        parser.add_argument(
            "--prune-only",
            action="store_true",
            default=False,
            help="Only delete expired cache files; do not download anything new.",
        )

    def handle(self, *args, **options):
        hours = options["hours"]
        prune_only = options["prune_only"]
        self.stdout.write(f"▶ Running cache worker (window: {hours}h, prune-only: {prune_only})")

        result = run_cache(hours=hours, prune_only=prune_only)

        # Summary counters from the service layer.
        self.stdout.write(self.style.SUCCESS(f" 🗑 Pruned: {result['pruned']}"))
        self.stdout.write(self.style.SUCCESS(f" ↓ Downloaded: {result['downloaded']}"))
        self.stdout.write(self.style.SUCCESS(f" ✓ Already cached: {result['already_cached']}"))
        if result["failed"]:
            self.stderr.write(self.style.ERROR(f" ✗ Failed: {result['failed']}"))

        # Per-item detail lines.
        # BUG FIX: the icon map previously held empty strings (every status
        # rendered identically) and the error text was concatenated directly
        # onto the truncated title with no separator.
        for item in result["items"]:
            icon = {"downloaded": "↓", "cached": "✓", "failed": "✗"}.get(item["status"], "?")
            line = f" {icon} [{item['status']:10}] {item['title'][:70]}"
            if item.get("error"):
                line += f" — {item['error']}"
            self.stdout.write(line)

View File

@@ -1,16 +1,29 @@
from django.core.management.base import BaseCommand
from core.models import AppUser, Library, Channel, MediaItem, Airing, ScheduleTemplate
from core.services.scheduler import ScheduleGenerator
from django.utils import timezone
from datetime import timedelta
from datetime import timedelta, date
import textwrap
class Command(BaseCommand):
help = "Displays a beautifully formatted terminal dashboard of the current backend state."
def add_arguments(self, parser):
    """Register the CLI flags used for detailed channel inspection."""
    channel_spec = {'type': int, 'help': 'Inspect specific channel schedule'}
    generate_spec = {'action': 'store_true', 'help': 'Trigger generation for today if inspecting a channel'}
    parser.add_argument('--channel', **channel_spec)
    parser.add_argument('--test-generate', **generate_spec)
def get_color(self, text, code):
    """Wrap *text* in ANSI/bash color escape sequences for *code*."""
    return "\033[{}m{}\033[0m".format(code, text)
def handle(self, *args, **options):
channel_id = options.get('channel')
test_generate = options.get('test_generate')
if channel_id:
self.inspect_channel(channel_id, test_generate)
return
# 1. Gather Aggregate Metrics
total_users = AppUser.objects.count()
total_libraries = Library.objects.count()
@@ -46,7 +59,7 @@ class Command(BaseCommand):
for c in channels:
status_color = "1;32" if c.is_active else "1;31"
status_text = "ACTIVE" if c.is_active else "INACTIVE"
self.stdout.write(f"\n 📺 [{c.channel_number or '-'}] {c.name} ({self.get_color(status_text, status_color)})")
self.stdout.write(f"\n 📺 [{c.id}] {c.name} (Ch {c.channel_number or '-'}) ({self.get_color(status_text, status_color)})")
# Show templates
templates = c.scheduletemplate_set.filter(is_active=True).order_by('-priority')
@@ -57,4 +70,40 @@ class Command(BaseCommand):
blocks_count = t.scheduleblock_set.count()
self.stdout.write(f" 📄 Template: {t.name} (Priority {t.priority}) -> {blocks_count} Blocks")
self.stdout.write(f"\nUse {self.get_color('--channel <id>', '1;37')} to inspect detailed schedule.\n")
def inspect_channel(self, channel_id, test_generate):
    """
    Render a detailed view of one channel: optionally trigger schedule
    generation for today, then list every airing in the next 12 hours,
    highlighting whichever one is currently on air.
    """
    channel = Channel.objects.filter(id=channel_id).first()
    if channel is None:
        self.stdout.write(self.get_color(f"Error: Channel {channel_id} not found.", "1;31"))
        return

    if test_generate:
        self.stdout.write(self.get_color(f"\nTriggering schedule generation for {channel.name}...", "1;33"))
        created = ScheduleGenerator(channel).generate_for_date(date.today())
        self.stdout.write(f"Done. Created {self.get_color(str(created), '1;32')} new airings.")

    now = timezone.now()
    window = Airing.objects.filter(
        channel=channel,
        ends_at__gt=now,
        starts_at__lt=now + timedelta(hours=12),
    ).select_related('media_item').order_by('starts_at')

    self.stdout.write(self.get_color(f"\n=== Schedule for {channel.name} (Next 12h) ===", "1;34"))
    if not window:
        self.stdout.write(self.get_color(" (No airings scheduled in this window)", "1;33"))
        self.stdout.write("\n")
        return

    for airing in window:
        span = f"{airing.starts_at.strftime('%H:%M')} - {airing.ends_at.strftime('%H:%M')}"
        if airing.starts_at <= now <= airing.ends_at:
            self.stdout.write(f" {self.get_color('▶ ON AIR', '1;32')} {self.get_color(span, '1;37')} | {airing.media_item.title}")
        else:
            self.stdout.write(f" {span} | {airing.media_item.title}")
    self.stdout.write("\n")

View File

@@ -0,0 +1,45 @@
# Generated by Django 6.0.3 on 2026-03-08 15:34
from django.db import migrations, models
class Migration(migrations.Migration):
    # Builds on the initial core schema.
    dependencies = [
        ("core", "0001_initial"),
    ]

    operations = [
        # When a locally cached copy of this item should be considered stale.
        migrations.AddField(
            model_name="mediaitem",
            name="cache_expires_at",
            field=models.DateTimeField(blank=True, null=True),
        ),
        # Filesystem path of the downloaded video file (null when not cached).
        migrations.AddField(
            model_name="mediaitem",
            name="cached_file_path",
            field=models.TextField(blank=True, null=True),
        ),
        # yt-dlp video ID; indexed because cache runs look items up by it.
        migrations.AddField(
            model_name="mediaitem",
            name="youtube_video_id",
            field=models.CharField(blank=True, db_index=True, max_length=64, null=True),
        ),
        # Extend source_type choices with the two new YouTube source kinds.
        migrations.AlterField(
            model_name="mediasource",
            name="source_type",
            field=models.CharField(
                choices=[
                    ("local_directory", "Local Directory"),
                    ("network_share", "Network Share"),
                    ("manual_import", "Manual Import"),
                    ("playlist", "Playlist"),
                    ("stream", "Stream"),
                    ("api_feed", "API Feed"),
                    ("youtube_channel", "YouTube Channel"),
                    ("youtube_playlist", "YouTube Playlist"),
                ],
                max_length=32,
            ),
        ),
    ]

View File

@@ -85,6 +85,8 @@ class MediaSource(models.Model):
PLAYLIST = 'playlist', 'Playlist'
STREAM = 'stream', 'Stream'
API_FEED = 'api_feed', 'API Feed'
YOUTUBE_CHANNEL = 'youtube_channel', 'YouTube Channel'
YOUTUBE_PLAYLIST = 'youtube_playlist', 'YouTube Playlist'
source_type = models.CharField(max_length=32, choices=SourceType.choices)
uri = models.TextField()
@@ -152,6 +154,11 @@ class MediaItem(models.Model):
is_active = models.BooleanField(default=True)
date_added_at = models.DateTimeField(auto_now_add=True)
metadata_json = models.JSONField(default=dict)
# YouTube-specific: the video ID from yt-dlp
youtube_video_id = models.CharField(max_length=64, blank=True, null=True, db_index=True)
# Local cache path for downloaded YouTube videos (distinct from file_path which holds source URI)
cached_file_path = models.TextField(blank=True, null=True)
cache_expires_at = models.DateTimeField(blank=True, null=True)
genres = models.ManyToManyField(Genre, related_name="media_items", blank=True)

140
core/services/cache.py Normal file
View File

@@ -0,0 +1,140 @@
"""
Cache service — reusable download/prune logic used by both:
- python manage.py cache_upcoming
- POST /api/sources/cache-upcoming
"""
import logging
import pathlib
from datetime import timedelta
from django.utils import timezone
from core.models import Airing, MediaItem, MediaSource
from core.services.youtube import download_for_airing, YOUTUBE_SOURCE_TYPES
logger = logging.getLogger(__name__)
def run_cache(hours: int = 24, prune_only: bool = False) -> dict:
    """
    Scan Airings in the next `hours` hours, download any uncached YouTube
    videos, and prune stale local files.

    Args:
        hours: Look-ahead window size in hours (default 24).
        prune_only: When True, only delete stale files; skip all downloads.

    Returns:
        Summary dict suitable for JSON serialization with keys
        "pruned", "downloaded", "already_cached", "failed", "items".
    """
    now = timezone.now()
    window_end = now + timedelta(hours=hours)

    # Prune first so disk space is reclaimed before new downloads start.
    pruned = _prune(now)
    if prune_only:
        return {"pruned": pruned, "downloaded": 0, "already_cached": 0, "failed": 0, "items": []}

    # Upcoming *and currently playing* airings backed by a YouTube source.
    upcoming = (
        Airing.objects
        .filter(ends_at__gt=now, starts_at__lte=window_end)
        .select_related("media_item__media_source")
    )

    # De-duplicate by MediaItem pk — the same item may air several times.
    youtube_items: dict[int, MediaItem] = {}
    for airing in upcoming:
        item = airing.media_item
        if item.media_source and item.media_source.source_type in YOUTUBE_SOURCE_TYPES:
            youtube_items[item.pk] = item

    downloaded = already_cached = failed = 0
    items_status = []

    for item in youtube_items.values():
        # Skip anything whose cached file is still present on disk.
        if item.cached_file_path and pathlib.Path(item.cached_file_path).exists():
            already_cached += 1
            items_status.append({
                "id": item.pk,
                "title": item.title,
                "status": "cached",
                "path": item.cached_file_path,
            })
            continue

        try:
            local_path = download_for_airing(item)
        except Exception as exc:
            # Broad catch is deliberate: one bad video must not abort the
            # whole batch. BUG FIX: use logger.exception (not logger.error)
            # so the traceback is preserved in the logs.
            failed += 1
            items_status.append({
                "id": item.pk,
                "title": item.title,
                "status": "failed",
                "error": str(exc),
            })
            logger.exception("download_for_airing(%s) failed", item.pk)
        else:
            downloaded += 1
            items_status.append({
                "id": item.pk,
                "title": item.title,
                "status": "downloaded",
                "path": str(local_path),
            })

    logger.info(
        "run_cache(hours=%d): pruned=%d downloaded=%d cached=%d failed=%d",
        hours, pruned, downloaded, already_cached, failed,
    )
    return {
        "pruned": pruned,
        "downloaded": downloaded,
        "already_cached": already_cached,
        "failed": failed,
        "items": items_status,
    }
def _prune(now) -> int:
    """
    Delete local cache files for items none of whose airings are still
    current or upcoming.

    Args:
        now: Aware datetime used as the cutoff for "all airings ended".

    Returns:
        Number of files actually deleted from disk.
    """
    pruned = 0
    stale = MediaItem.objects.filter(cached_file_path__isnull=False).exclude(
        airing__ends_at__gte=now
    )
    for item in stale:
        p = pathlib.Path(item.cached_file_path)
        if p.exists():
            try:
                p.unlink()
                pruned += 1
            except OSError as exc:
                # BUG FIX: previously the DB pointer was cleared even when
                # the delete failed, orphaning the file on disk forever.
                # Keep the pointer so the next run can retry the delete.
                logger.warning("Could not delete %s: %s", p, exc)
                continue
        # File is gone (deleted now, or was already missing): clear the
        # cache bookkeeping on the model.
        item.cached_file_path = None
        item.cache_expires_at = None
        item.save(update_fields=["cached_file_path", "cache_expires_at"])
    return pruned
def get_download_status() -> dict:
    """
    Build a snapshot of all YouTube MediaItems and their cache status,
    useful for rendering the Downloads UI.
    """
    queryset = (
        MediaItem.objects
        .filter(media_source__source_type__in=YOUTUBE_SOURCE_TYPES)
        .select_related("media_source")
        .order_by("media_source__name", "title")
    )

    rows = []
    cached_count = 0
    for media_item in queryset:
        path = media_item.cached_file_path
        # An item counts as cached only if the recorded file still exists.
        is_cached = bool(path) and pathlib.Path(path).exists()
        if is_cached:
            cached_count += 1
        rows.append({
            "id": media_item.pk,
            "title": media_item.title,
            "source_name": media_item.media_source.name,
            "source_id": media_item.media_source.id,
            "youtube_video_id": media_item.youtube_video_id,
            "runtime_seconds": media_item.runtime_seconds,
            "cached": is_cached,
            "cached_path": path if is_cached else None,
        })

    return {"items": rows, "total": len(rows), "cached": cached_count}

View File

@@ -1,108 +1,210 @@
from datetime import datetime, timedelta, date, time, timezone
from core.models import Channel, ScheduleTemplate, ScheduleBlock, Airing, MediaItem
"""
Schedule generator — respects ChannelSourceRule assignments.
Source selection priority:
1. If any rules with rule_mode='prefer' exist, items from those sources
are weighted much more heavily.
2. Items from rule_mode='allow' sources fill the rest.
3. Items from rule_mode='avoid' sources are only used as a last resort
(weight × 0.1).
4. Items from rule_mode='block' sources are NEVER scheduled.
5. If NO ChannelSourceRule rows exist for this channel, falls back to
the old behaviour (all items in the channel's library).
"""
import random
import uuid
from datetime import datetime, timedelta, date, timezone
from core.models import (
Channel, ChannelSourceRule, ScheduleTemplate,
ScheduleBlock, Airing, MediaItem,
)
class ScheduleGenerator:
    """
    Reads ScheduleTemplate + ScheduleBlocks for a channel and fills the day
    with concrete Airing rows, picking MediaItems according to the channel's
    ChannelSourceRule assignments.
    """

    def __init__(self, channel: Channel):
        self.channel = channel

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def generate_for_date(self, target_date: date) -> int:
        """
        Idempotent generation of airings for `target_date`.
        Returns the number of new Airing rows created.

        NOTE(review): this region of the file contained interleaved old/new
        diff residue (duplicate statements, a dead inline fill loop next to
        the _fill_block call); reconstructed to the coherent implementation
        that delegates to _get_weighted_items() and _fill_block().
        """
        template = self._get_template()
        if not template:
            return 0

        # Python weekday: 0=Monday … 6=Sunday. The block mask uses
        # bit 0 = Monday, bit 6 = Sunday.
        target_weekday_bit = 1 << target_date.weekday()
        blocks = template.scheduleblock_set.all()
        airings_created = 0

        for block in blocks:
            # Skip blocks that don't run on this day of the week.
            if not (block.day_of_week_mask & target_weekday_bit):
                continue

            # Naive local→UTC mapping — no per-channel tz logic yet.
            start_dt = datetime.combine(target_date, block.start_local_time, tzinfo=timezone.utc)
            end_dt = datetime.combine(target_date, block.end_local_time, tzinfo=timezone.utc)
            # Midnight-wrap support (e.g. 23:00 → 02:00)
            if end_dt <= start_dt:
                end_dt += timedelta(days=1)

            # Clear existing airings in this window (idempotency)
            Airing.objects.filter(
                channel=self.channel,
                starts_at__gte=start_dt,
                starts_at__lt=end_dt,
            ).delete()

            available_items = self._get_weighted_items(block)
            if not available_items:
                continue

            airings_created += self._fill_block(
                template, block, start_dt, end_dt, available_items
            )
        return airings_created
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def _get_template(self):
    """Return the highest-priority active ScheduleTemplate for this channel."""
    return (
        ScheduleTemplate.objects
        .filter(channel=self.channel, is_active=True)
        .order_by('-priority')
        .first()
    )
def _get_weighted_items(self, block: ScheduleBlock) -> list:
    """
    Build a weighted pool of MediaItems respecting ChannelSourceRule.

    Returns a flat, shuffled list in which each item is duplicated in
    proportion to its effective source weight, so sequential consumption
    approximates the desired probability distribution without numpy.

    Rule modes:
        block  — never scheduled
        avoid  — weight × 0.1 (last resort)
        prefer — weight × 3.0 (boosted)
        allow  — weight as-is
    Falls back to the whole channel library when no rules exist.
    """
    rules = list(
        ChannelSourceRule.objects.filter(channel=self.channel)
        .select_related('media_source')
    )

    source_weights: dict[int, float] = {}
    if rules:
        # ── Rules exist: build filtered + weighted pool ───────────────
        allowed_source_ids = set()   # allow + prefer
        blocked_source_ids = set()   # block
        avoid_source_ids = set()     # avoid

        for rule in rules:
            sid = rule.media_source_id
            mode = rule.rule_mode
            w = float(rule.weight or 1.0)
            if mode == 'block':
                blocked_source_ids.add(sid)
            elif mode == 'avoid':
                avoid_source_ids.add(sid)
                source_weights[sid] = w * 0.1  # heavily discounted
            elif mode == 'prefer':
                allowed_source_ids.add(sid)
                source_weights[sid] = w * 3.0  # boosted
            else:  # 'allow'
                allowed_source_ids.add(sid)
                source_weights[sid] = w

        # Base queryset from allowed + avoid sources (never blocked ones).
        eligible_source_ids = (allowed_source_ids | avoid_source_ids) - blocked_source_ids
        if not eligible_source_ids:
            return []
        base_qs = MediaItem.objects.filter(
            media_source_id__in=eligible_source_ids,
            is_active=True,
        ).exclude(item_kind='bumper').select_related('media_source')
    else:
        # ── No rules: fall back to full library (old behaviour) ────────
        base_qs = MediaItem.objects.filter(
            media_source__library=self.channel.library,
            is_active=True,
        ).exclude(item_kind='bumper')

    # Optionally filter by genre if the block specifies one.
    if block.default_genre:
        base_qs = base_qs.filter(genres=block.default_genre)

    items = list(base_qs)
    if not items:
        return []

    if not source_weights:
        # No weight information — plain shuffle.
        random.shuffle(items)
        return items

    # BUG FIX: rounding the raw weight collapsed 'avoid' (0.1) and 'allow'
    # (1.0) to one copy each, defeating the documented ×0.1 discount.
    # Scale by 10 before rounding so the ratios survive integer duplication
    # (at weight 1.0: avoid=1 copy, allow=10, prefer=30).
    weighted: list[MediaItem] = []
    for item in items:
        w = source_weights.get(item.media_source_id, 1.0)
        copies = max(1, round(w * 10))
        weighted.extend([item] * copies)
    random.shuffle(weighted)
    return weighted
def _fill_block(
    self,
    template: ScheduleTemplate,
    block: ScheduleBlock,
    start_dt: datetime,
    end_dt: datetime,
    items: list,
) -> int:
    """
    Fill [start_dt, end_dt) with back-to-back Airings, cycling through
    `items` in order. Returns the number of rows created.
    """
    batch_id = uuid.uuid4()
    pool_size = len(items)
    cursor = start_dt
    position = 0
    created = 0

    while cursor < end_dt:
        candidate = items[position % pool_size]
        position += 1
        # Items with no runtime default to 30 minutes; never below 1 second.
        slot = timedelta(seconds=max(candidate.runtime_seconds or 1800, 1))

        # Allow at most one hour of overshoot past the block's end.
        if cursor + slot > end_dt + timedelta(hours=1):
            break

        Airing.objects.create(
            channel=self.channel,
            schedule_template=template,
            schedule_block=block,
            media_item=candidate,
            starts_at=cursor,
            ends_at=cursor + slot,
            slot_kind="program",
            status="scheduled",
            source_reason="template",
            generation_batch_uuid=batch_id,
        )
        cursor += slot
        created += 1
    return created

244
core/services/youtube.py Normal file
View File

@@ -0,0 +1,244 @@
"""
YouTube source sync service.
Two-phase design:
Phase 1 — METADATA ONLY (sync_source):
Crawls a YouTube channel or playlist and upserts MediaItem rows with
title, duration, thumbnail etc. No video files are downloaded.
A max_videos cap keeps this fast for large channels.
Phase 2 — DOWNLOAD ON DEMAND (download_for_airing):
Called only by `python manage.py cache_upcoming` immediately before
a scheduled Airing. Downloads only the specific video needed.
"""
import logging
import os
from pathlib import Path
import yt_dlp
from django.conf import settings
from django.utils import timezone
from core.models import MediaItem, MediaSource
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
YOUTUBE_SOURCE_TYPES = {
MediaSource.SourceType.YOUTUBE_CHANNEL,
MediaSource.SourceType.YOUTUBE_PLAYLIST,
}
def _cache_dir() -> Path:
    """Return (and create) the directory where downloaded videos are stored."""
    configured = getattr(settings, "MEDIA_ROOT", "/tmp/pytv_cache")
    directory = Path(configured)
    directory.mkdir(parents=True, exist_ok=True)
    return directory
# ---------------------------------------------------------------------------
# metadata extraction (no download)
# ---------------------------------------------------------------------------
def _extract_playlist_info(url: str, max_videos: int | None = None) -> list[dict]:
    """
    Use yt-dlp to extract metadata for up to `max_videos` videos in a
    channel/playlist without downloading any files.

    `extract_flat=True` is crucial — it fetches only a lightweight index
    (title, id, duration) rather than resolving full stream URLs, which
    makes crawling large channels orders of magnitude faster.

    Returns a list of yt-dlp info dicts (most-recent first for channels).
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,  # metadata only — NO stream/download URLs
        "ignoreerrors": True,
    }
    if max_videos is not None:
        # yt-dlp uses 1-based playlist indices; playlistend limits how many
        # entries are fetched from the source before returning.
        ydl_opts["playlistend"] = max_videos

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
    if info is None:
        return []

    # Both channels and playlists wrap entries in an "entries" key.
    entries = info.get("entries") or []

    # Flatten one extra level for channels (channel -> playlist -> entries)
    flat: list[dict] = []
    for entry in entries:
        if entry is None:
            continue
        if "entries" in entry:  # nested playlist
            # ROBUSTNESS FIX: with ignoreerrors=True a nested "entries"
            # value can itself be None (the outer levels already guard for
            # this); guard before iterating to avoid a TypeError.
            flat.extend(e for e in (entry["entries"] or []) if e)
        else:
            flat.append(entry)
    return flat
# ---------------------------------------------------------------------------
# public API
# ---------------------------------------------------------------------------
def sync_source(media_source: MediaSource, max_videos: int | None = None) -> dict:
    """
    Phase 1: Metadata-only sync.

    Crawls a YouTube channel/playlist and upserts a MediaItem row for each
    discovered video. No video files are ever downloaded here.

    Args:
        media_source: The MediaSource to sync.
        max_videos: Maximum number of videos to import. When None the
            defaults are applied:
                - youtube_channel  → 50  (channels can have 10k+ videos)
                - youtube_playlist → 200 (playlists are usually curated)

    Returns:
        {"created": int, "updated": int, "skipped": int}

    Raises:
        ValueError: if `media_source` is not a YouTube source.
    """
    if media_source.source_type not in YOUTUBE_SOURCE_TYPES:
        raise ValueError(f"MediaSource {media_source.id} is not a YouTube source.")

    # Apply sensible defaults per source type.
    if max_videos is None:
        is_channel = media_source.source_type == MediaSource.SourceType.YOUTUBE_CHANNEL
        max_videos = 50 if is_channel else 200

    entries = _extract_playlist_info(media_source.uri, max_videos=max_videos)

    created = updated = skipped = 0
    for entry in entries:
        video_id = entry.get("id")
        if not video_id:
            skipped += 1
            continue

        # upload_date arrives as "YYYYMMDD"; keep the year when parseable.
        release_year = None
        upload_date = entry.get("upload_date")
        if upload_date and len(upload_date) >= 4:
            try:
                release_year = int(upload_date[:4])
            except ValueError:
                pass

        # Store the YouTube watch URL in file_path so the scheduler can
        # reference it. The ACTUAL video file is only downloaded when
        # `cache_upcoming` runs before the airing.
        video_url = entry.get("url") or f"https://www.youtube.com/watch?v={video_id}"
        duration = entry.get("duration") or 0  # seconds, may be None for live

        _, was_created = MediaItem.objects.update_or_create(
            media_source=media_source,
            youtube_video_id=video_id,
            defaults={
                "title": entry.get("title") or f"YouTube Video {video_id}",
                "item_kind": MediaItem.ItemKind.MOVIE,
                "runtime_seconds": max(int(duration), 1),
                "file_path": video_url,
                "thumbnail_path": entry.get("thumbnail") or "",
                "description": entry.get("description") or "",
                "release_year": release_year,
                "metadata_json": {
                    "yt_id": video_id,
                    "yt_url": video_url,
                    "uploader": entry.get("uploader", ""),
                },
                "is_active": True,
            },
        )
        if was_created:
            created += 1
        else:
            updated += 1

    # Record when this source was last crawled.
    media_source.last_scanned_at = timezone.now()
    media_source.save(update_fields=["last_scanned_at"])

    logger.info(
        "sync_source(%s): created=%d updated=%d skipped=%d (limit=%s)",
        media_source.id,
        created,
        updated,
        skipped,
        max_videos,
    )
    return {"created": created, "updated": updated, "skipped": skipped}
def download_for_airing(media_item: MediaItem) -> Path:
    """
    Phase 2: Download a YouTube video to the local cache so it can be served
    directly without network dependency at airing time.

    Returns:
        The local Path of the downloaded (or already cached) file.

    Raises:
        ValueError: if the item has no youtube_video_id.
        RuntimeError: if yt-dlp fails or the expected file is missing.
    """
    video_id = media_item.youtube_video_id
    if not video_id:
        raise ValueError(f"MediaItem {media_item.id} has no youtube_video_id.")

    cache_dir = _cache_dir()
    # Use video_id so we can detect already-cached files quickly.
    output_template = str(cache_dir / f"{video_id}.%(ext)s")

    # Reuse an existing cached file only while it has not expired.
    # BUG FIX: the original comment promised an expiry check but
    # cache_expires_at was never consulted, so expired files were
    # served forever.
    if media_item.cached_file_path:
        existing = Path(media_item.cached_file_path)
        expired = (
            media_item.cache_expires_at is not None
            and media_item.cache_expires_at <= timezone.now()
        )
        if existing.exists() and not expired:
            logger.info("cache hit: %s already at %s", video_id, existing)
            return existing

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "outtmpl": output_template,
        # Only request pre-muxed (progressive) formats — no separate video+audio
        # streams that would require ffmpeg to merge. Falls back through:
        #   1. Best pre-muxed mp4 up to 1080p
        #   2. Any pre-muxed mp4
        #   3. Any pre-muxed webm
        #   4. Anything pre-muxed (no merger needed)
        "format": "best[ext=mp4][height<=1080]/best[ext=mp4]/best[ext=webm]/best",
    }

    url = media_item.file_path  # URL stored here by sync_source
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        if info is None:
            raise RuntimeError(f"yt-dlp returned no info for {url}")
        downloaded_path = Path(ydl.prepare_filename(info))

    if not downloaded_path.exists():
        # yt-dlp may have merged to .mp4 even if the template said otherwise
        mp4_path = downloaded_path.with_suffix(".mp4")
        if mp4_path.exists():
            downloaded_path = mp4_path
        else:
            raise RuntimeError(f"Expected download at {downloaded_path} but file not found.")

    # Persist the cache location on the model
    media_item.cached_file_path = str(downloaded_path)
    media_item.save(update_fields=["cached_file_path"])
    logger.info("downloaded %s -> %s", video_id, downloaded_path)
    return downloaded_path