feat(main): commit

This commit is contained in:
2026-03-08 16:48:58 -04:00
parent 567766eaed
commit f37382d2b8
29 changed files with 3735 additions and 223 deletions

View File

@@ -0,0 +1,52 @@
"""
management command: cache_upcoming
Delegates to core.services.cache.run_cache() — the same logic exposed
by the API endpoint, so CLI and web UI behavior are always in sync.
Usage:
python manage.py cache_upcoming # default: next 24 hours
python manage.py cache_upcoming --hours 48
python manage.py cache_upcoming --prune-only
"""
from django.core.management.base import BaseCommand
from core.services.cache import run_cache
class Command(BaseCommand):
    """
    Management command: cache_upcoming

    Delegates to core.services.cache.run_cache() — the same logic exposed
    by the API endpoint, so CLI and web UI behavior stay in sync.
    """

    help = "Download YouTube videos for upcoming airings and prune old cache files."

    def add_arguments(self, parser):
        parser.add_argument(
            "--hours",
            type=int,
            default=24,
            help="How many hours ahead to scan for upcoming airings (default: 24).",
        )
        parser.add_argument(
            "--prune-only",
            action="store_true",
            default=False,
            help="Only delete expired cache files; do not download anything new.",
        )

    def handle(self, *args, **options):
        hours = options["hours"]
        prune_only = options["prune_only"]
        self.stdout.write(f"▶ Running cache worker (window: {hours}h, prune-only: {prune_only})")

        result = run_cache(hours=hours, prune_only=prune_only)

        # Summary counters from the service layer.
        self.stdout.write(self.style.SUCCESS(f" 🗑 Pruned: {result['pruned']}"))
        self.stdout.write(self.style.SUCCESS(f" ↓ Downloaded: {result['downloaded']}"))
        self.stdout.write(self.style.SUCCESS(f" ✓ Already cached: {result['already_cached']}"))
        if result["failed"]:
            self.stderr.write(self.style.ERROR(f" ✗ Failed: {result['failed']}"))

        # Per-item detail lines.
        # BUG FIX: the icon map previously held empty strings (every status
        # rendered identically) and the error text was concatenated directly
        # onto the truncated title with no separator.
        for item in result["items"]:
            icon = {"downloaded": "↓", "cached": "✓", "failed": "✗"}.get(item["status"], "?")
            line = f" {icon} [{item['status']:10}] {item['title'][:70]}"
            if item.get("error"):
                line += f" — {item['error']}"
            self.stdout.write(line)

View File

@@ -1,16 +1,29 @@
from django.core.management.base import BaseCommand
from core.models import AppUser, Library, Channel, MediaItem, Airing, ScheduleTemplate
from core.services.scheduler import ScheduleGenerator
from django.utils import timezone
from datetime import timedelta
from datetime import timedelta, date
import textwrap
class Command(BaseCommand):
help = "Displays a beautifully formatted terminal dashboard of the current backend state."
def add_arguments(self, parser):
    """Register the CLI flags used for detailed channel inspection."""
    channel_spec = {'type': int, 'help': 'Inspect specific channel schedule'}
    generate_spec = {'action': 'store_true', 'help': 'Trigger generation for today if inspecting a channel'}
    parser.add_argument('--channel', **channel_spec)
    parser.add_argument('--test-generate', **generate_spec)
def get_color(self, text, code):
    """Wrap *text* in ANSI/bash color escape sequences for *code*."""
    return "\033[{}m{}\033[0m".format(code, text)
def handle(self, *args, **options):
channel_id = options.get('channel')
test_generate = options.get('test_generate')
if channel_id:
self.inspect_channel(channel_id, test_generate)
return
# 1. Gather Aggregate Metrics
total_users = AppUser.objects.count()
total_libraries = Library.objects.count()
@@ -46,7 +59,7 @@ class Command(BaseCommand):
for c in channels:
status_color = "1;32" if c.is_active else "1;31"
status_text = "ACTIVE" if c.is_active else "INACTIVE"
self.stdout.write(f"\n 📺 [{c.channel_number or '-'}] {c.name} ({self.get_color(status_text, status_color)})")
self.stdout.write(f"\n 📺 [{c.id}] {c.name} (Ch {c.channel_number or '-'}) ({self.get_color(status_text, status_color)})")
# Show templates
templates = c.scheduletemplate_set.filter(is_active=True).order_by('-priority')
@@ -57,4 +70,40 @@ class Command(BaseCommand):
blocks_count = t.scheduleblock_set.count()
self.stdout.write(f" 📄 Template: {t.name} (Priority {t.priority}) -> {blocks_count} Blocks")
self.stdout.write(f"\nUse {self.get_color('--channel <id>', '1;37')} to inspect detailed schedule.\n")
def inspect_channel(self, channel_id, test_generate):
    """
    Render a detailed view of one channel: optionally trigger schedule
    generation for today, then list every airing in the next 12 hours,
    highlighting whichever one is currently on air.
    """
    channel = Channel.objects.filter(id=channel_id).first()
    if channel is None:
        self.stdout.write(self.get_color(f"Error: Channel {channel_id} not found.", "1;31"))
        return

    if test_generate:
        self.stdout.write(self.get_color(f"\nTriggering schedule generation for {channel.name}...", "1;33"))
        created = ScheduleGenerator(channel).generate_for_date(date.today())
        self.stdout.write(f"Done. Created {self.get_color(str(created), '1;32')} new airings.")

    now = timezone.now()
    window = Airing.objects.filter(
        channel=channel,
        ends_at__gt=now,
        starts_at__lt=now + timedelta(hours=12),
    ).select_related('media_item').order_by('starts_at')

    self.stdout.write(self.get_color(f"\n=== Schedule for {channel.name} (Next 12h) ===", "1;34"))
    if not window:
        self.stdout.write(self.get_color(" (No airings scheduled in this window)", "1;33"))
        self.stdout.write("\n")
        return

    for airing in window:
        span = f"{airing.starts_at.strftime('%H:%M')} - {airing.ends_at.strftime('%H:%M')}"
        if airing.starts_at <= now <= airing.ends_at:
            self.stdout.write(f" {self.get_color('▶ ON AIR', '1;32')} {self.get_color(span, '1;37')} | {airing.media_item.title}")
        else:
            self.stdout.write(f" {span} | {airing.media_item.title}")
    self.stdout.write("\n")

View File

@@ -0,0 +1,45 @@
# Generated by Django 6.0.3 on 2026-03-08 15:34
from django.db import migrations, models
class Migration(migrations.Migration):
    # Builds on the initial core schema.
    dependencies = [
        ("core", "0001_initial"),
    ]

    operations = [
        # When a locally cached copy of this item should be considered stale.
        migrations.AddField(
            model_name="mediaitem",
            name="cache_expires_at",
            field=models.DateTimeField(blank=True, null=True),
        ),
        # Filesystem path of the downloaded video file (null when not cached).
        migrations.AddField(
            model_name="mediaitem",
            name="cached_file_path",
            field=models.TextField(blank=True, null=True),
        ),
        # yt-dlp video ID; indexed because cache runs look items up by it.
        migrations.AddField(
            model_name="mediaitem",
            name="youtube_video_id",
            field=models.CharField(blank=True, db_index=True, max_length=64, null=True),
        ),
        # Extend source_type choices with the two new YouTube source kinds.
        migrations.AlterField(
            model_name="mediasource",
            name="source_type",
            field=models.CharField(
                choices=[
                    ("local_directory", "Local Directory"),
                    ("network_share", "Network Share"),
                    ("manual_import", "Manual Import"),
                    ("playlist", "Playlist"),
                    ("stream", "Stream"),
                    ("api_feed", "API Feed"),
                    ("youtube_channel", "YouTube Channel"),
                    ("youtube_playlist", "YouTube Playlist"),
                ],
                max_length=32,
            ),
        ),
    ]

View File

@@ -85,6 +85,8 @@ class MediaSource(models.Model):
PLAYLIST = 'playlist', 'Playlist'
STREAM = 'stream', 'Stream'
API_FEED = 'api_feed', 'API Feed'
YOUTUBE_CHANNEL = 'youtube_channel', 'YouTube Channel'
YOUTUBE_PLAYLIST = 'youtube_playlist', 'YouTube Playlist'
source_type = models.CharField(max_length=32, choices=SourceType.choices)
uri = models.TextField()
@@ -152,6 +154,11 @@ class MediaItem(models.Model):
is_active = models.BooleanField(default=True)
date_added_at = models.DateTimeField(auto_now_add=True)
metadata_json = models.JSONField(default=dict)
# YouTube-specific: the video ID from yt-dlp
youtube_video_id = models.CharField(max_length=64, blank=True, null=True, db_index=True)
# Local cache path for downloaded YouTube videos (distinct from file_path which holds source URI)
cached_file_path = models.TextField(blank=True, null=True)
cache_expires_at = models.DateTimeField(blank=True, null=True)
genres = models.ManyToManyField(Genre, related_name="media_items", blank=True)

140
core/services/cache.py Normal file
View File

@@ -0,0 +1,140 @@
"""
Cache service — reusable download/prune logic used by both:
- python manage.py cache_upcoming
- POST /api/sources/cache-upcoming
"""
import logging
import pathlib
from datetime import timedelta
from django.utils import timezone
from core.models import Airing, MediaItem, MediaSource
from core.services.youtube import download_for_airing, YOUTUBE_SOURCE_TYPES
logger = logging.getLogger(__name__)
def run_cache(hours: int = 24, prune_only: bool = False) -> dict:
    """
    Scan Airings in the next `hours` hours, download any uncached YouTube
    videos, and prune stale local files.

    Args:
        hours: Look-ahead window size in hours (default 24).
        prune_only: When True, only delete stale files; skip all downloads.

    Returns:
        Summary dict suitable for JSON serialization with keys
        "pruned", "downloaded", "already_cached", "failed", "items".
    """
    now = timezone.now()
    window_end = now + timedelta(hours=hours)

    # Prune first so disk space is reclaimed before new downloads start.
    pruned = _prune(now)
    if prune_only:
        return {"pruned": pruned, "downloaded": 0, "already_cached": 0, "failed": 0, "items": []}

    # Upcoming *and currently playing* airings backed by a YouTube source.
    upcoming = (
        Airing.objects
        .filter(ends_at__gt=now, starts_at__lte=window_end)
        .select_related("media_item__media_source")
    )

    # De-duplicate by MediaItem pk — the same item may air several times.
    youtube_items: dict[int, MediaItem] = {}
    for airing in upcoming:
        item = airing.media_item
        if item.media_source and item.media_source.source_type in YOUTUBE_SOURCE_TYPES:
            youtube_items[item.pk] = item

    downloaded = already_cached = failed = 0
    items_status = []

    for item in youtube_items.values():
        # Skip anything whose cached file is still present on disk.
        if item.cached_file_path and pathlib.Path(item.cached_file_path).exists():
            already_cached += 1
            items_status.append({
                "id": item.pk,
                "title": item.title,
                "status": "cached",
                "path": item.cached_file_path,
            })
            continue

        try:
            local_path = download_for_airing(item)
        except Exception as exc:
            # Broad catch is deliberate: one bad video must not abort the
            # whole batch. BUG FIX: use logger.exception (not logger.error)
            # so the traceback is preserved in the logs.
            failed += 1
            items_status.append({
                "id": item.pk,
                "title": item.title,
                "status": "failed",
                "error": str(exc),
            })
            logger.exception("download_for_airing(%s) failed", item.pk)
        else:
            downloaded += 1
            items_status.append({
                "id": item.pk,
                "title": item.title,
                "status": "downloaded",
                "path": str(local_path),
            })

    logger.info(
        "run_cache(hours=%d): pruned=%d downloaded=%d cached=%d failed=%d",
        hours, pruned, downloaded, already_cached, failed,
    )
    return {
        "pruned": pruned,
        "downloaded": downloaded,
        "already_cached": already_cached,
        "failed": failed,
        "items": items_status,
    }
def _prune(now) -> int:
    """
    Delete local cache files for items none of whose airings are still
    current or upcoming.

    Args:
        now: Aware datetime used as the cutoff for "all airings ended".

    Returns:
        Number of files actually deleted from disk.
    """
    pruned = 0
    stale = MediaItem.objects.filter(cached_file_path__isnull=False).exclude(
        airing__ends_at__gte=now
    )
    for item in stale:
        p = pathlib.Path(item.cached_file_path)
        if p.exists():
            try:
                p.unlink()
                pruned += 1
            except OSError as exc:
                # BUG FIX: previously the DB pointer was cleared even when
                # the delete failed, orphaning the file on disk forever.
                # Keep the pointer so the next run can retry the delete.
                logger.warning("Could not delete %s: %s", p, exc)
                continue
        # File is gone (deleted now, or was already missing): clear the
        # cache bookkeeping on the model.
        item.cached_file_path = None
        item.cache_expires_at = None
        item.save(update_fields=["cached_file_path", "cache_expires_at"])
    return pruned
def get_download_status() -> dict:
    """
    Build a snapshot of all YouTube MediaItems and their cache status,
    useful for rendering the Downloads UI.
    """
    queryset = (
        MediaItem.objects
        .filter(media_source__source_type__in=YOUTUBE_SOURCE_TYPES)
        .select_related("media_source")
        .order_by("media_source__name", "title")
    )

    rows = []
    cached_count = 0
    for media_item in queryset:
        path = media_item.cached_file_path
        # An item counts as cached only if the recorded file still exists.
        is_cached = bool(path) and pathlib.Path(path).exists()
        if is_cached:
            cached_count += 1
        rows.append({
            "id": media_item.pk,
            "title": media_item.title,
            "source_name": media_item.media_source.name,
            "source_id": media_item.media_source.id,
            "youtube_video_id": media_item.youtube_video_id,
            "runtime_seconds": media_item.runtime_seconds,
            "cached": is_cached,
            "cached_path": path if is_cached else None,
        })

    return {"items": rows, "total": len(rows), "cached": cached_count}

View File

@@ -1,108 +1,210 @@
from datetime import datetime, timedelta, date, time, timezone
from core.models import Channel, ScheduleTemplate, ScheduleBlock, Airing, MediaItem
"""
Schedule generator — respects ChannelSourceRule assignments.
Source selection priority:
1. If any rules with rule_mode='prefer' exist, items from those sources
are weighted much more heavily.
2. Items from rule_mode='allow' sources fill the rest.
3. Items from rule_mode='avoid' sources are only used as a last resort
(weight × 0.1).
4. Items from rule_mode='block' sources are NEVER scheduled.
5. If NO ChannelSourceRule rows exist for this channel, falls back to
the old behaviour (all items in the channel's library).
"""
import random
import uuid
from datetime import datetime, timedelta, date, timezone
from core.models import (
Channel, ChannelSourceRule, ScheduleTemplate,
ScheduleBlock, Airing, MediaItem,
)
class ScheduleGenerator:
    """
    Reads ScheduleTemplate + ScheduleBlocks for a channel and fills the day
    with concrete Airing rows, picking MediaItems according to the channel's
    ChannelSourceRule assignments.
    """

    def __init__(self, channel: Channel):
        self.channel = channel

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def generate_for_date(self, target_date: date) -> int:
        """
        Idempotent generation of airings for `target_date`.
        Returns the number of new Airing rows created.

        NOTE(review): this region of the file contained interleaved old/new
        diff residue (duplicate statements, a dead inline fill loop next to
        the _fill_block call); reconstructed to the coherent implementation
        that delegates to _get_weighted_items() and _fill_block().
        """
        template = self._get_template()
        if not template:
            return 0

        # Python weekday: 0=Monday … 6=Sunday. The block mask uses
        # bit 0 = Monday, bit 6 = Sunday.
        target_weekday_bit = 1 << target_date.weekday()
        blocks = template.scheduleblock_set.all()
        airings_created = 0

        for block in blocks:
            # Skip blocks that don't run on this day of the week.
            if not (block.day_of_week_mask & target_weekday_bit):
                continue

            # Naive local→UTC mapping — no per-channel tz logic yet.
            start_dt = datetime.combine(target_date, block.start_local_time, tzinfo=timezone.utc)
            end_dt = datetime.combine(target_date, block.end_local_time, tzinfo=timezone.utc)
            # Midnight-wrap support (e.g. 23:00 → 02:00)
            if end_dt <= start_dt:
                end_dt += timedelta(days=1)

            # Clear existing airings in this window (idempotency)
            Airing.objects.filter(
                channel=self.channel,
                starts_at__gte=start_dt,
                starts_at__lt=end_dt,
            ).delete()

            available_items = self._get_weighted_items(block)
            if not available_items:
                continue

            airings_created += self._fill_block(
                template, block, start_dt, end_dt, available_items
            )
        return airings_created
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def _get_template(self):
    """Return the highest-priority active ScheduleTemplate for this channel."""
    return (
        ScheduleTemplate.objects
        .filter(channel=self.channel, is_active=True)
        .order_by('-priority')
        .first()
    )
def _get_weighted_items(self, block: ScheduleBlock) -> list:
    """
    Build a weighted pool of MediaItems respecting ChannelSourceRule.

    Returns a flat, shuffled list in which each item is duplicated in
    proportion to its effective source weight, so sequential consumption
    approximates the desired probability distribution without numpy.

    Rule modes:
        block  — never scheduled
        avoid  — weight × 0.1 (last resort)
        prefer — weight × 3.0 (boosted)
        allow  — weight as-is
    Falls back to the whole channel library when no rules exist.
    """
    rules = list(
        ChannelSourceRule.objects.filter(channel=self.channel)
        .select_related('media_source')
    )

    source_weights: dict[int, float] = {}
    if rules:
        # ── Rules exist: build filtered + weighted pool ───────────────
        allowed_source_ids = set()   # allow + prefer
        blocked_source_ids = set()   # block
        avoid_source_ids = set()     # avoid

        for rule in rules:
            sid = rule.media_source_id
            mode = rule.rule_mode
            w = float(rule.weight or 1.0)
            if mode == 'block':
                blocked_source_ids.add(sid)
            elif mode == 'avoid':
                avoid_source_ids.add(sid)
                source_weights[sid] = w * 0.1  # heavily discounted
            elif mode == 'prefer':
                allowed_source_ids.add(sid)
                source_weights[sid] = w * 3.0  # boosted
            else:  # 'allow'
                allowed_source_ids.add(sid)
                source_weights[sid] = w

        # Base queryset from allowed + avoid sources (never blocked ones).
        eligible_source_ids = (allowed_source_ids | avoid_source_ids) - blocked_source_ids
        if not eligible_source_ids:
            return []
        base_qs = MediaItem.objects.filter(
            media_source_id__in=eligible_source_ids,
            is_active=True,
        ).exclude(item_kind='bumper').select_related('media_source')
    else:
        # ── No rules: fall back to full library (old behaviour) ────────
        base_qs = MediaItem.objects.filter(
            media_source__library=self.channel.library,
            is_active=True,
        ).exclude(item_kind='bumper')

    # Optionally filter by genre if the block specifies one.
    if block.default_genre:
        base_qs = base_qs.filter(genres=block.default_genre)

    items = list(base_qs)
    if not items:
        return []

    if not source_weights:
        # No weight information — plain shuffle.
        random.shuffle(items)
        return items

    # BUG FIX: rounding the raw weight collapsed 'avoid' (0.1) and 'allow'
    # (1.0) to one copy each, defeating the documented ×0.1 discount.
    # Scale by 10 before rounding so the ratios survive integer duplication
    # (at weight 1.0: avoid=1 copy, allow=10, prefer=30).
    weighted: list[MediaItem] = []
    for item in items:
        w = source_weights.get(item.media_source_id, 1.0)
        copies = max(1, round(w * 10))
        weighted.extend([item] * copies)
    random.shuffle(weighted)
    return weighted
def _fill_block(
    self,
    template: ScheduleTemplate,
    block: ScheduleBlock,
    start_dt: datetime,
    end_dt: datetime,
    items: list,
) -> int:
    """
    Fill [start_dt, end_dt) with back-to-back Airings, cycling through
    `items` in order. Returns the number of rows created.
    """
    batch_id = uuid.uuid4()
    pool_size = len(items)
    cursor = start_dt
    position = 0
    created = 0

    while cursor < end_dt:
        candidate = items[position % pool_size]
        position += 1
        # Items with no runtime default to 30 minutes; never below 1 second.
        slot = timedelta(seconds=max(candidate.runtime_seconds or 1800, 1))

        # Allow at most one hour of overshoot past the block's end.
        if cursor + slot > end_dt + timedelta(hours=1):
            break

        Airing.objects.create(
            channel=self.channel,
            schedule_template=template,
            schedule_block=block,
            media_item=candidate,
            starts_at=cursor,
            ends_at=cursor + slot,
            slot_kind="program",
            status="scheduled",
            source_reason="template",
            generation_batch_uuid=batch_id,
        )
        cursor += slot
        created += 1
    return created

244
core/services/youtube.py Normal file
View File

@@ -0,0 +1,244 @@
"""
YouTube source sync service.
Two-phase design:
Phase 1 — METADATA ONLY (sync_source):
Crawls a YouTube channel or playlist and upserts MediaItem rows with
title, duration, thumbnail etc. No video files are downloaded.
A max_videos cap keeps this fast for large channels.
Phase 2 — DOWNLOAD ON DEMAND (download_for_airing):
Called only by `python manage.py cache_upcoming` immediately before
a scheduled Airing. Downloads only the specific video needed.
"""
import logging
import os
from pathlib import Path
import yt_dlp
from django.conf import settings
from django.utils import timezone
from core.models import MediaItem, MediaSource
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
YOUTUBE_SOURCE_TYPES = {
MediaSource.SourceType.YOUTUBE_CHANNEL,
MediaSource.SourceType.YOUTUBE_PLAYLIST,
}
def _cache_dir() -> Path:
    """Return (and create) the directory where downloaded videos are stored."""
    configured = getattr(settings, "MEDIA_ROOT", "/tmp/pytv_cache")
    directory = Path(configured)
    directory.mkdir(parents=True, exist_ok=True)
    return directory
# ---------------------------------------------------------------------------
# metadata extraction (no download)
# ---------------------------------------------------------------------------
def _extract_playlist_info(url: str, max_videos: int | None = None) -> list[dict]:
    """
    Use yt-dlp to extract metadata for up to `max_videos` videos in a
    channel/playlist without downloading any files.

    `extract_flat=True` is crucial — it fetches only a lightweight index
    (title, id, duration) rather than resolving full stream URLs, which
    makes crawling large channels orders of magnitude faster.

    Returns a list of yt-dlp info dicts (most-recent first for channels).
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,  # metadata only — NO stream/download URLs
        "ignoreerrors": True,
    }
    if max_videos is not None:
        # yt-dlp uses 1-based playlist indices; playlistend limits how many
        # entries are fetched from the source before returning.
        ydl_opts["playlistend"] = max_videos

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
    if info is None:
        return []

    # Both channels and playlists wrap entries in an "entries" key.
    entries = info.get("entries") or []

    # Flatten one extra level for channels (channel -> playlist -> entries)
    flat: list[dict] = []
    for entry in entries:
        if entry is None:
            continue
        if "entries" in entry:  # nested playlist
            # ROBUSTNESS FIX: with ignoreerrors=True a nested "entries"
            # value can itself be None (the outer levels already guard for
            # this); guard before iterating to avoid a TypeError.
            flat.extend(e for e in (entry["entries"] or []) if e)
        else:
            flat.append(entry)
    return flat
# ---------------------------------------------------------------------------
# public API
# ---------------------------------------------------------------------------
def sync_source(media_source: MediaSource, max_videos: int | None = None) -> dict:
    """
    Phase 1: Metadata-only sync.

    Crawls a YouTube channel/playlist and upserts a MediaItem row for each
    discovered video. No video files are ever downloaded here.

    Args:
        media_source: The MediaSource to sync.
        max_videos: Maximum number of videos to import. When None the
            defaults are applied:
                - youtube_channel  → 50  (channels can have 10k+ videos)
                - youtube_playlist → 200 (playlists are usually curated)

    Returns:
        {"created": int, "updated": int, "skipped": int}

    Raises:
        ValueError: if `media_source` is not a YouTube source.
    """
    if media_source.source_type not in YOUTUBE_SOURCE_TYPES:
        raise ValueError(f"MediaSource {media_source.id} is not a YouTube source.")

    # Apply sensible defaults per source type.
    if max_videos is None:
        is_channel = media_source.source_type == MediaSource.SourceType.YOUTUBE_CHANNEL
        max_videos = 50 if is_channel else 200

    entries = _extract_playlist_info(media_source.uri, max_videos=max_videos)

    created = updated = skipped = 0
    for entry in entries:
        video_id = entry.get("id")
        if not video_id:
            skipped += 1
            continue

        # upload_date arrives as "YYYYMMDD"; keep the year when parseable.
        release_year = None
        upload_date = entry.get("upload_date")
        if upload_date and len(upload_date) >= 4:
            try:
                release_year = int(upload_date[:4])
            except ValueError:
                pass

        # Store the YouTube watch URL in file_path so the scheduler can
        # reference it. The ACTUAL video file is only downloaded when
        # `cache_upcoming` runs before the airing.
        video_url = entry.get("url") or f"https://www.youtube.com/watch?v={video_id}"
        duration = entry.get("duration") or 0  # seconds, may be None for live

        _, was_created = MediaItem.objects.update_or_create(
            media_source=media_source,
            youtube_video_id=video_id,
            defaults={
                "title": entry.get("title") or f"YouTube Video {video_id}",
                "item_kind": MediaItem.ItemKind.MOVIE,
                "runtime_seconds": max(int(duration), 1),
                "file_path": video_url,
                "thumbnail_path": entry.get("thumbnail") or "",
                "description": entry.get("description") or "",
                "release_year": release_year,
                "metadata_json": {
                    "yt_id": video_id,
                    "yt_url": video_url,
                    "uploader": entry.get("uploader", ""),
                },
                "is_active": True,
            },
        )
        if was_created:
            created += 1
        else:
            updated += 1

    # Record when this source was last crawled.
    media_source.last_scanned_at = timezone.now()
    media_source.save(update_fields=["last_scanned_at"])

    logger.info(
        "sync_source(%s): created=%d updated=%d skipped=%d (limit=%s)",
        media_source.id,
        created,
        updated,
        skipped,
        max_videos,
    )
    return {"created": created, "updated": updated, "skipped": skipped}
def download_for_airing(media_item: MediaItem) -> Path:
    """
    Phase 2: Download a YouTube video to the local cache so it can be served
    directly without network dependency at airing time.

    Returns:
        The local Path of the downloaded (or already cached) file.

    Raises:
        ValueError: if the item has no youtube_video_id.
        RuntimeError: if yt-dlp fails or the expected file is missing.
    """
    video_id = media_item.youtube_video_id
    if not video_id:
        raise ValueError(f"MediaItem {media_item.id} has no youtube_video_id.")

    cache_dir = _cache_dir()
    # Use video_id so we can detect already-cached files quickly.
    output_template = str(cache_dir / f"{video_id}.%(ext)s")

    # Reuse an existing cached file only while it has not expired.
    # BUG FIX: the original comment promised an expiry check but
    # cache_expires_at was never consulted, so expired files were
    # served forever.
    if media_item.cached_file_path:
        existing = Path(media_item.cached_file_path)
        expired = (
            media_item.cache_expires_at is not None
            and media_item.cache_expires_at <= timezone.now()
        )
        if existing.exists() and not expired:
            logger.info("cache hit: %s already at %s", video_id, existing)
            return existing

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "outtmpl": output_template,
        # Only request pre-muxed (progressive) formats — no separate video+audio
        # streams that would require ffmpeg to merge. Falls back through:
        #   1. Best pre-muxed mp4 up to 1080p
        #   2. Any pre-muxed mp4
        #   3. Any pre-muxed webm
        #   4. Anything pre-muxed (no merger needed)
        "format": "best[ext=mp4][height<=1080]/best[ext=mp4]/best[ext=webm]/best",
    }

    url = media_item.file_path  # URL stored here by sync_source
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        if info is None:
            raise RuntimeError(f"yt-dlp returned no info for {url}")
        downloaded_path = Path(ydl.prepare_filename(info))

    if not downloaded_path.exists():
        # yt-dlp may have merged to .mp4 even if the template said otherwise
        mp4_path = downloaded_path.with_suffix(".mp4")
        if mp4_path.exists():
            downloaded_path = mp4_path
        else:
            raise RuntimeError(f"Expected download at {downloaded_path} but file not found.")

    # Persist the cache location on the model
    media_item.cached_file_path = str(downloaded_path)
    media_item.save(update_fields=["cached_file_path"])
    logger.info("downloaded %s -> %s", video_id, downloaded_path)
    return downloaded_path