#!/usr/bin/env python3
"""Curator for sandman/youtube_ad_catalog.json.

Run weekly (cron) or manually. Reads source brand-channel handles from the
catalog, queries the YouTube Data API v3 for fresh uploads, filters to
ad-length videos (6-90s) while excluding Shorts, confirms dead IDs via
oembed, and atomically rewrites the catalog.

Usage:
    YOUTUBE_API_KEY=AIza... python3 curate_ad_catalog.py [--dry-run]
    # or with --catalog /share/.ha_cache/wd_ad_catalog.json on the Pi
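    # example weekly crontab entry (schedule and paths illustrative):
    #   17 4 * * 1  YOUTUBE_API_KEY=... python3 /opt/sandman/curate_ad_catalog.py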

Quota: ~3 units per source channel per run. With 36 SG sources and 4 runs/month,
~430 units/month — far below the 10k/day free-tier limit.
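
Catalog shape (minimal sketch of the keys this script reads and writes; any
other keys are preserved as-is):

    {
      "sources":  [{"handle": "@SomeBrandSG", "brand": "SomeBrand",
                    "lang": "en", "category": "misc"}],
      "ads":      [{"id": "<11-char-id>", "brand": "SomeBrand", "lang": "en",
                    "category": "misc", "length_s": 30,
                    "verified": "2025-01-01", "source": "@SomeBrandSG"}],
      "dead_ids": [],
      "ad_length_s": {"min": 6, "max": 90}
    }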
"""
from __future__ import annotations

import argparse
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime

API_BASE = "https://www.googleapis.com/youtube/v3"
OEMBED = "https://www.youtube.com/oembed?format=json&url="
CHANNEL_CACHE_KEY = "_channel_uploads_cache"
ISO_DUR = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?")
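# Matches time-only durations: "PT1M30S" -> 90 s, "PT2H" -> 7200 s. Live and
# upcoming videos typically report "P0D", which fails the match and parses to
# 0, so callers can treat them as unknown and skip them.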


def http_json(url: str, timeout: int = 15) -> tuple[int, dict | None]:
    """GET url and parse JSON. Returns (status, payload); payload is None on
    any failure, with status 0 when no HTTP response was received."""
    req = urllib.request.Request(url, headers={"User-Agent": "sandman-curator/1.0"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as r:
            return r.status, json.loads(r.read())
    except urllib.error.HTTPError as e:
        return e.code, None
    except (urllib.error.URLError, TimeoutError, json.JSONDecodeError):
        return 0, None


def iso_duration_to_seconds(s: str) -> int:
    """Parse an ISO-8601 "PT#H#M#S" duration into seconds; 0 if unparseable."""
    m = ISO_DUR.fullmatch(s or "")
    if not m:
        return 0
    h, mi, se = (int(x) if x else 0 for x in m.groups())
    return h * 3600 + mi * 60 + se


def resolve_channel(handle: str, api_key: str) -> str | None:
    """@handle -> uploads playlist ID (UU...). Returns None on failure."""
    url = f"{API_BASE}/channels?part=contentDetails&forHandle={urllib.parse.quote(handle)}&key={api_key}"
    code, data = http_json(url)
    if code != 200 or not data:
        return None
    items = data.get("items") or []
    if not items:
        return None
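    # In practice the uploads playlist ID is the channel ID with its "UC"
    # prefix swapped for "UU"; resolving it via the API avoids hard-coding
    # that convention.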
    return items[0]["contentDetails"]["relatedPlaylists"]["uploads"]


def fetch_recent_uploads(uploads_playlist_id: str, api_key: str, max_n: int = 50) -> list[str]:
    """Newest video IDs from an uploads playlist (first page only, max 50)."""
    url = (f"{API_BASE}/playlistItems?part=contentDetails&playlistId={uploads_playlist_id}"
           f"&maxResults={min(max_n, 50)}&key={api_key}")
    code, data = http_json(url)
    if code != 200 or not data:
        return []
    return [it["contentDetails"]["videoId"] for it in (data.get("items") or [])]


def fetch_metadata(video_ids: list[str], api_key: str) -> dict[str, dict]:
    """Batch lookup duration + shorts detection. Returns {video_id: {duration, is_short}}.

    Shorts detection: vertical thumbnail aspect ratio OR #shorts in title/description.
    Real YouTube ads are landscape full-screen videos, never shorts."""
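    # Caveat: API-reported thumbnail dimensions are nominal and do not always
    # reflect a video's true aspect ratio, so the ratio check below is a
    # best-effort hint rather than a guarantee.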
    out = {}
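    # Each videos.list call costs 1 quota unit, so batching up to 50 IDs per
    # request keeps even a full-catalog sweep to a handful of units.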
    for i in range(0, len(video_ids), 50):
        batch = video_ids[i:i + 50]
        url = (f"{API_BASE}/videos?part=contentDetails,snippet&id={','.join(batch)}"
               f"&key={api_key}")
        code, data = http_json(url)
        if code != 200 or not data:
            continue
        for it in data.get("items", []):
            duration = iso_duration_to_seconds(it["contentDetails"]["duration"])
            sn = it.get("snippet", {})
            title = (sn.get("title") or "").lower()
            desc = (sn.get("description") or "").lower()
            is_short = False
            # Tag-based hint
            if "#shorts" in title or "#shorts" in desc or "#short " in title:
                is_short = True
            # Aspect-ratio hint: shorts are vertical (height > width)
            if not is_short:
                thumbs = sn.get("thumbnails", {})
                for size in ("maxres", "standard", "high", "medium"):
                    t = thumbs.get(size)
                    if t and t.get("width") and t.get("height"):
                        if t["height"] > t["width"]:
                            is_short = True
                        break
            out[it["id"]] = {"duration": duration, "is_short": is_short}
    return out


def is_alive_oembed(video_id: str) -> bool:
    """True if oembed returns 200 (video is publicly playable)."""
    url = OEMBED + urllib.parse.quote(f"https://www.youtube.com/watch?v={video_id}")
    code, _ = http_json(url, timeout=8)
    return code == 200


def curate(catalog_path: str, api_key: str, dry_run: bool = False, target_inventory: int = 100) -> dict:
    """Top up, prune, and atomically rewrite the catalog; returns run stats."""
    with open(catalog_path) as f:
        cat = json.load(f)

    sources = cat.get("sources", [])
    ads = {a["id"]: a for a in cat.get("ads", []) if a.get("id")}
    dead_ids = set(cat.get("dead_ids", []))
    cache = cat.setdefault(CHANNEL_CACHE_KEY, {})

    length_cfg = cat.get("ad_length_s", {})
    floor = length_cfg.get("min", 6)
    ceil = length_cfg.get("max", 90)

    today = datetime.now().strftime("%Y-%m-%d")
    stats = {"sources": len(sources), "added": 0, "pruned_dead": 0,
             "skipped_too_long": 0, "skipped_too_short": 0, "skipped_shorts": 0,
             "pruned_shorts": 0,
             "channel_resolve_failed": 0, "starting": len(ads),
             "starting_dead": len(dead_ids)}

    # Prune any pre-existing shorts that slipped in before this filter existed.
    existing_ids = list(ads.keys())
    if existing_ids:
        meta = fetch_metadata(existing_ids, api_key)
        for vid, m in meta.items():
            if m.get("is_short"):
                ads.pop(vid, None)
                dead_ids.add(vid)
                stats["pruned_shorts"] += 1

    # ── Top-up: discover new ads from each source channel ──
    for src in sources:
        handle = src.get("handle")
        if not handle:
            continue
        uploads_id = cache.get(handle)
        if not uploads_id:
            uploads_id = resolve_channel(handle, api_key)
            if uploads_id:
                cache[handle] = uploads_id
            else:
                stats["channel_resolve_failed"] += 1
                continue

        recent = fetch_recent_uploads(uploads_id, api_key, max_n=50)
        new_ids = [v for v in recent if v not in ads and v not in dead_ids]
        if not new_ids:
            continue
        meta = fetch_metadata(new_ids, api_key)
        for vid in new_ids:
            m = meta.get(vid, {})
            d = m.get("duration", 0)
            if d == 0:
                continue  # unknown — skip
            if m.get("is_short"):
                stats["skipped_shorts"] += 1
                dead_ids.add(vid)  # avoid re-checking this ID in future runs
                continue
            if d < floor:
                stats["skipped_too_short"] += 1
                continue
            if d > ceil:
                stats["skipped_too_long"] += 1
                continue
            ads[vid] = {
                "id": vid,
                "brand": src.get("brand", "?"),
                "lang": src.get("lang", "en"),
                "category": src.get("category", "misc"),
                "length_s": d,
                "verified": today,
                "source": handle,
            }
            stats["added"] += 1

    # ── Prune: confirm pending dead IDs via oembed ──
    confirmed_dead = []
    for vid in list(dead_ids)[:50]:  # cap per-run to bound latency (oembed is quota-free)
        time.sleep(0.1)  # gentle pacing
        if is_alive_oembed(vid):
            # Was flagged dead at runtime but is fine now — leave in dead_ids
            # for one more run (avoids re-adding immediately) but don't act.
            continue
        # Confirmed dead — remove from ads if present
        if vid in ads:
            ads.pop(vid)
            stats["pruned_dead"] += 1
        confirmed_dead.append(vid)

    # ── Recheck a sample of live ads for stale entries (oembed, no API quota) ──
    sample_size = min(20, max(0, target_inventory - len(ads) + 5))
    if sample_size and ads:
        # Check the oldest-verified entries first
        oldest = sorted(ads.values(), key=lambda a: a.get("verified", ""))[:sample_size]
        for ad in oldest:
            time.sleep(0.1)
            if not is_alive_oembed(ad["id"]):
                ads.pop(ad["id"], None)
                dead_ids.add(ad["id"])
                stats["pruned_dead"] += 1
            else:
                ad["verified"] = today

    cat["ads"] = sorted(ads.values(), key=lambda a: (a.get("brand", ""), a["id"]))
    cat["dead_ids"] = sorted(dead_ids - set(confirmed_dead))
    cat[CHANNEL_CACHE_KEY] = cache
    cat.setdefault("_meta", {})["last_curator_run"] = today
    stats["ending"] = len(cat["ads"])

    if not dry_run:
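        # Write to a sibling temp file, then atomically swap it into place so
        # a concurrent reader never sees a half-written catalog.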
        tmp = catalog_path + ".tmp"
        with open(tmp, "w") as f:
            json.dump(cat, f, indent=2, ensure_ascii=False)
        os.replace(tmp, catalog_path)

    return stats


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--catalog", default=os.environ.get("AD_CATALOG_PATH",
                    "youtube_ad_catalog.json"))
    ap.add_argument("--api-key", default=os.environ.get("YOUTUBE_API_KEY"))
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--target", type=int, default=100,
                    help="target live ad count")
    args = ap.parse_args()

    if not args.api_key:
        print("ERROR: set YOUTUBE_API_KEY env var or pass --api-key", file=sys.stderr)
        return 2
    if not os.path.exists(args.catalog):
        print(f"ERROR: catalog not found: {args.catalog}", file=sys.stderr)
        return 2

    stats = curate(args.catalog, args.api_key,
                   dry_run=args.dry_run, target_inventory=args.target)
    print(json.dumps(stats, indent=2))
    if stats["ending"] < args.target * 0.5:
        print(f"WARNING: catalog below 50% of target ({stats['ending']}/{args.target})",
              file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())
