feat(main): commit
This commit is contained in:
244
core/services/youtube.py
Normal file
244
core/services/youtube.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""
|
||||
YouTube source sync service.
|
||||
|
||||
Two-phase design:
|
||||
Phase 1 — METADATA ONLY (sync_source):
|
||||
Crawls a YouTube channel or playlist and upserts MediaItem rows with
|
||||
title, duration, thumbnail etc. No video files are downloaded.
|
||||
A max_videos cap keeps this fast for large channels.
|
||||
|
||||
Phase 2 — DOWNLOAD ON DEMAND (download_for_airing):
|
||||
Called only by `python manage.py cache_upcoming` immediately before
|
||||
a scheduled Airing. Downloads only the specific video needed.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import yt_dlp
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from core.models import MediaItem, MediaSource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# The only MediaSource kinds this service knows how to crawl;
# sync_source raises ValueError for anything else.
YOUTUBE_SOURCE_TYPES = {
    MediaSource.SourceType.YOUTUBE_CHANNEL,
    MediaSource.SourceType.YOUTUBE_PLAYLIST,
}
|
||||
|
||||
|
||||
def _cache_dir() -> Path:
    """Return the directory used to cache downloaded videos, creating it on first use."""
    # Fall back to a tmp location when the project has no MEDIA_ROOT configured.
    configured_root = getattr(settings, "MEDIA_ROOT", "/tmp/pytv_cache")
    cache_path = Path(configured_root)
    cache_path.mkdir(parents=True, exist_ok=True)
    return cache_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# metadata extraction (no download)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _extract_playlist_info(url: str, max_videos: int | None = None) -> list[dict]:
    """
    Use yt-dlp to extract metadata for up to `max_videos` videos in a
    channel/playlist without downloading any files.

    `extract_flat=True` is crucial — it fetches only a lightweight index
    (title, id, duration) rather than resolving full stream URLs, which
    makes crawling large channels orders of magnitude faster.

    Args:
        url: Channel or playlist URL understood by yt-dlp.
        max_videos: Hard cap on the number of entries returned; None means
            no cap.

    Returns:
        A list of yt-dlp info dicts (most-recent first for channels).
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,  # metadata only — NO stream/download URLs
        "ignoreerrors": True,
    }
    if max_videos is not None:
        # yt-dlp uses 1-based playlist indices; playlistend limits how many
        # entries are fetched from the source before returning.
        ydl_opts["playlistend"] = max_videos

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    if info is None:
        return []

    # Both channels and playlists wrap entries in an "entries" key.
    entries = info.get("entries") or []
    # Flatten one extra level for channels (channel -> playlist -> entries)
    flat: list[dict] = []
    for entry in entries:
        if entry is None:
            continue
        if "entries" in entry:  # nested playlist
            flat.extend(e for e in entry["entries"] if e)
        else:
            flat.append(entry)

    # BUG FIX: for channels, `playlistend` limits the number of *top-level*
    # entries (tabs/nested playlists), not the videos inside them, so the
    # flattened list could exceed the requested cap. Enforce it here too.
    if max_videos is not None:
        flat = flat[:max_videos]
    return flat
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _upsert_video_entry(media_source: MediaSource, entry: dict) -> str:
    """
    Upsert a single yt-dlp entry as a MediaItem under `media_source`.

    Returns one of "created", "updated", or "skipped" (skipped when the
    entry has no video id).
    """
    video_id = entry.get("id")
    if not video_id:
        return "skipped"

    title = entry.get("title") or f"YouTube Video {video_id}"
    duration = entry.get("duration") or 0  # seconds, may be None for live
    thumbnail = entry.get("thumbnail") or ""
    description = entry.get("description") or ""

    # Derive release year from yt-dlp's "YYYYMMDD" upload_date when present.
    release_year = None
    upload_date = entry.get("upload_date")  # "YYYYMMDD"
    if upload_date and len(upload_date) >= 4:
        try:
            release_year = int(upload_date[:4])
        except ValueError:
            pass

    # Store the YouTube watch URL in file_path so the scheduler can
    # reference it. The ACTUAL video file will only be downloaded when
    # `cache_upcoming` runs before the airing.
    video_url = entry.get("url") or f"https://www.youtube.com/watch?v={video_id}"

    _, was_created = MediaItem.objects.update_or_create(
        media_source=media_source,
        youtube_video_id=video_id,
        defaults={
            "title": title,
            "item_kind": MediaItem.ItemKind.MOVIE,
            # Clamp to >= 1 second so scheduling math never sees 0-length items.
            "runtime_seconds": max(int(duration), 1),
            "file_path": video_url,
            "thumbnail_path": thumbnail,
            "description": description,
            "release_year": release_year,
            "metadata_json": {
                "yt_id": video_id,
                "yt_url": video_url,
                "uploader": entry.get("uploader", ""),
            },
            "is_active": True,
        },
    )
    return "created" if was_created else "updated"


def sync_source(media_source: MediaSource, max_videos: int | None = None) -> dict:
    """
    Phase 1: Metadata-only sync.

    Crawls a YouTube channel/playlist and upserts MediaItem rows for each
    discovered video. No video files are ever downloaded here.

    Args:
        media_source: The MediaSource to sync.
        max_videos: Maximum number of videos to import. When None the
            defaults are applied:
            - youtube_channel → 50 (channels can have 10k+ videos)
            - youtube_playlist → 200 (playlists are usually curated)

    Returns:
        {"created": int, "updated": int, "skipped": int}

    Raises:
        ValueError: If `media_source` is not a YouTube channel/playlist.
    """
    if media_source.source_type not in YOUTUBE_SOURCE_TYPES:
        raise ValueError(f"MediaSource {media_source.id} is not a YouTube source.")

    # Apply sensible defaults per source type
    if max_videos is None:
        if media_source.source_type == MediaSource.SourceType.YOUTUBE_CHANNEL:
            max_videos = 50
        else:
            max_videos = 200

    entries = _extract_playlist_info(media_source.uri, max_videos=max_videos)

    counts = {"created": 0, "updated": 0, "skipped": 0}
    for entry in entries:
        counts[_upsert_video_entry(media_source, entry)] += 1

    # Update last-scanned timestamp
    media_source.last_scanned_at = timezone.now()
    media_source.save(update_fields=["last_scanned_at"])

    logger.info(
        "sync_source(%s): created=%d updated=%d skipped=%d (limit=%s)",
        media_source.id,
        counts["created"],
        counts["updated"],
        counts["skipped"],
        max_videos,
    )
    return counts
|
||||
|
||||
|
||||
def download_for_airing(media_item: MediaItem) -> Path:
    """
    Download a YouTube video to the local cache so it can be served
    directly without network dependency at airing time.

    Args:
        media_item: Item whose `youtube_video_id` identifies the video and
            whose `file_path` holds the watch URL (set by sync_source).

    Returns:
        The local Path of the downloaded (or already-cached) file.

    Raises:
        ValueError: If the item has no youtube_video_id.
        RuntimeError: If the download fails.
    """
    video_id = media_item.youtube_video_id
    if not video_id:
        raise ValueError(f"MediaItem {media_item.id} has no youtube_video_id.")

    cache_dir = _cache_dir()
    # Use video_id so we can detect already-cached files quickly.
    output_template = str(cache_dir / f"{video_id}.%(ext)s")

    # Check if already cached and not expired
    if media_item.cached_file_path:
        existing = Path(media_item.cached_file_path)
        if existing.exists():
            logger.info("cache hit: %s already at %s", video_id, existing)
            return existing

    # BUG FIX: filenames are keyed by video_id precisely so earlier downloads
    # can be reused, but the original never looked on disk when
    # cached_file_path was unset or stale. Reclaim any existing cache file
    # instead of re-downloading it.
    for candidate in cache_dir.glob(f"{video_id}.*"):
        if candidate.is_file():
            media_item.cached_file_path = str(candidate)
            media_item.save(update_fields=["cached_file_path"])
            logger.info("cache hit: %s already at %s", video_id, candidate)
            return candidate

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "outtmpl": output_template,
        # Only request pre-muxed (progressive) formats — no separate video+audio
        # streams that would require ffmpeg to merge. Falls back through:
        # 1. Best pre-muxed mp4 up to 1080p
        # 2. Any pre-muxed mp4
        # 3. Any pre-muxed webm
        # 4. Anything pre-muxed (no merger needed)
        "format": "best[ext=mp4][height<=1080]/best[ext=mp4]/best[ext=webm]/best",
    }

    url = media_item.file_path  # URL stored here by sync_source
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)

        if info is None:
            raise RuntimeError(f"yt-dlp returned no info for {url}")

        # Resolve the final filename while the YoutubeDL instance (and its
        # options) are still in scope.
        downloaded_path = Path(ydl.prepare_filename(info))

    if not downloaded_path.exists():
        # yt-dlp may have merged to .mp4 even if the template said otherwise
        mp4_path = downloaded_path.with_suffix(".mp4")
        if mp4_path.exists():
            downloaded_path = mp4_path
        else:
            raise RuntimeError(f"Expected download at {downloaded_path} but file not found.")

    # Persist the cache location on the model
    media_item.cached_file_path = str(downloaded_path)
    media_item.save(update_fields=["cached_file_path"])

    logger.info("downloaded %s -> %s", video_id, downloaded_path)
    return downloaded_path
|
||||
Reference in New Issue
Block a user