face-sets/work/video_target_pipeline.py
Peter 7960dec350 Make per-clip sidecar JSONs opt-in (default off)
Previously every video_target_pipeline cut wrote a <uuid>.json provenance
sidecar alongside each <uuid>.mp4. The same provenance is already in the
per-batch plan.json, so the per-clip sidecars are redundant unless a
downstream tool wants each clip self-describing in isolation.

- video_target_pipeline.py cut: new --write-sidecar flag, default off.
- run_video_pipeline.sh: new SIDECAR env var (default "no"), passes
  --write-sidecar when SIDECAR=yes.
- README + docs/analysis/video-target-preprocessing.md updated.

The 1,984 already-emitted sidecars in /mnt/x/src/vd/ct/ct_src_*/ have
been deleted (1.5 MB).
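
Example opt-in invocations (plan path illustrative):

    SIDECAR=yes ./run_video_pipeline.sh
    python video_target_pipeline.py cut --plan work/plan.json --write-sidecar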

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 12:44:27 +02:00

"""Video target preprocessing pipeline for roop-unleashed.
Discovers video files in an input folder, runs scene-cut detection, samples
frames within each scene, runs face detection + embedding via Windows DML
worker, stitches per-frame detections into face tracks, applies quality
gates, cuts approved segments out with ffmpeg stream-copy, and writes a
report. Output clips have generic UUID names + a sidecar JSON with full
provenance.
Subcommands:
scan list input videos, run ffprobe, write per-video index
scenes PySceneDetect AdaptiveDetector per video; write scenes_<basename>.json
stage write frame queue.json (sampled @ 2 fps within scenes)
merge ingest worker results.json into per-video frame_results
track IoU+embedding stitching of per-frame detections into tracks
score track-level quality gating + segment plan
cut ffmpeg -c copy each accepted segment to <out_dir>/<uuid>.mp4
report HTML preview with thumbnails + identity tags
"""
from __future__ import annotations

import argparse
import json
import math
import re
import shutil
import subprocess
import sys
import time
import uuid
from collections import defaultdict
from pathlib import Path

import numpy as np

DEFAULT_INPUT = Path("/mnt/x/src/vd")
DEFAULT_OUTPUT = Path("/mnt/x/src/vd/ct")
WORK_DIR = Path("/opt/face-sets/work/video_preprocess")
# defaults — first set was strict-portrait; second set loosened for side-profile + segment merging
SAMPLE_FPS = 2.0
QUALITY_YAW_MAX = 75.0 # was 25; allow full 3/4 + profile (face-sets handle it)
QUALITY_PITCH_MAX = 45.0 # was 30
QUALITY_FACE_MIN = 80 # was 96
QUALITY_BLUR_MIN = 50.0
QUALITY_DET_MIN = 0.5 # was 0.6
TRACK_GATE_FRAC = 0.7 # >=70% of frames in track must pass per-frame gates
SEGMENT_MIN_S = 1.0
SEGMENT_MAX_S = 30.0 # was 10
SEGMENT_BRIDGE_S = 3.0 # was 1.0 — within-track pose-failure bridging
SEGMENT_MERGE_GAP_S = 2.0 # NEW — across-track merge if same scene + within this gap
TRACK_IOU_MIN = 0.3
TRACK_EMB_MIN = 0.5
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
FACESETS_ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
IDENTITY_TAG_THRESHOLD = 0.6 # cosine sim to faceset centroid

def wsl_to_win(p: str) -> str:
    s = str(p)
    if s.startswith("/mnt/"):
        return f"{s[5].upper()}:\\{s[7:].replace('/', chr(92))}"
    return s

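# Example: wsl_to_win("/mnt/x/src/vd/a.mp4") -> "X:\src\vd\a.mp4"; non-/mnt/
# paths pass through unchanged.
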
# ----------------------------- ffprobe / scan -----------------------------
def ffprobe(video: Path) -> dict:
    cmd = [
        "ffprobe", "-v", "error", "-print_format", "json",
        "-show_format", "-show_streams", str(video),
    ]
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if r.returncode != 0:
        return {"error": r.stderr.strip()}
    return json.loads(r.stdout)

def parse_video_meta(probe: dict) -> dict:
    if "error" in probe:
        return {"error": probe["error"]}
    fmt = probe.get("format", {})
    duration = float(fmt.get("duration", 0))
    vstream = next((s for s in probe.get("streams", []) if s.get("codec_type") == "video"), None)
    if vstream is None:
        return {"error": "no video stream"}
    fps_str = vstream.get("avg_frame_rate", "0/1")
    try:
        num, den = (int(x) for x in fps_str.split("/"))
        fps = num / den if den else 0.0
    except Exception:
        fps = 0.0
    nb_frames = int(vstream.get("nb_frames", 0)) or int(round(duration * fps))
    return {
        "duration_s": duration,
        "fps": fps,
        "frames": nb_frames,
        "width": int(vstream.get("width", 0)),
        "height": int(vstream.get("height", 0)),
        "codec": vstream.get("codec_name"),
    }

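# Example: a stream with avg_frame_rate "30000/1001" parses to fps ~29.97; if
# nb_frames is absent (or 0), the frame count falls back to round(duration * fps).
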
def cmd_scan(args):
    in_dir = Path(args.input)
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    extensions = {".mp4", ".mov", ".mkv", ".m4v", ".avi", ".webm"}
    out_root = Path(args.output_dir).resolve()
    videos = []
    for p in sorted(in_dir.iterdir() if not args.recursive else in_dir.rglob("*")):
        if not p.is_file():
            continue
        if out_root in p.parents or p.resolve() == out_root:
            continue  # never include the output dir
        if p.suffix.lower() not in extensions:
            continue
        videos.append(p)
    print(f"[scan] {len(videos)} candidate videos", file=sys.stderr)
    inventory = []
    for p in videos:
        meta = parse_video_meta(ffprobe(p))
        meta["path"] = str(p)
        meta["win_path"] = wsl_to_win(str(p))
        meta["size"] = p.stat().st_size
        inventory.append(meta)
        if "error" not in meta:
            print(f" {p.name}: {meta['duration_s']:.1f}s @ {meta['fps']:.1f}fps "
                  f"{meta['width']}x{meta['height']} {meta['codec']}", file=sys.stderr)
        else:
            print(f" {p.name}: ERROR {meta['error']}", file=sys.stderr)
    out.write_text(json.dumps({"input": str(in_dir), "videos": inventory}, indent=2))
    print(f"[scan] inventory -> {out}", file=sys.stderr)

# ----------------------------- scenes -----------------------------
def cmd_scenes(args):
    from scenedetect import open_video, SceneManager
    from scenedetect.detectors import AdaptiveDetector
    inv = json.loads(Path(args.inventory).read_text())
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    only = set(args.only.split(",")) if args.only else None
    for v in inv["videos"]:
        if "error" in v:
            continue
        path = Path(v["path"])
        if only and path.name not in only:
            continue
        out_file = out_dir / (path.stem + ".scenes.json")
        if out_file.exists() and not args.force:
            continue
        print(f"[scenes] {path.name} ...", file=sys.stderr, flush=True)
        t0 = time.time()
        try:
            video = open_video(str(path))
            sm = SceneManager()
            sm.add_detector(AdaptiveDetector(min_scene_len=int(round(v.get("fps", 30) or 30) * 0.5)))
            sm.detect_scenes(video, show_progress=False)
            scenes = sm.get_scene_list()
            entries = []
            for s, e in scenes:
                entries.append({
                    "start_frame": s.frame_num, "end_frame": e.frame_num,
                    "start_s": s.get_seconds(), "end_s": e.get_seconds(),
                    "duration_s": e.get_seconds() - s.get_seconds(),
                })
            # if no cuts found, treat the whole video as one scene
            if not entries:
                entries = [{
                    "start_frame": 0, "end_frame": v["frames"],
                    "start_s": 0.0, "end_s": v["duration_s"],
                    "duration_s": v["duration_s"],
                }]
            out_file.write_text(json.dumps({"video": str(path), "scenes": entries}, indent=2))
            print(f" {len(entries)} scenes in {time.time()-t0:.1f}s -> {out_file.name}",
                  file=sys.stderr)
        except Exception as e:
            print(f" ERROR: {e}", file=sys.stderr)

# ----------------------------- stage -----------------------------
def cmd_stage(args):
    inv = json.loads(Path(args.inventory).read_text())
    scenes_dir = Path(args.scenes_dir)
    queue = []
    qid = 0
    sample_every = 1.0 / args.sample_fps
    for v in inv["videos"]:
        if "error" in v:
            continue
        p = Path(v["path"])
        sf = scenes_dir / (p.stem + ".scenes.json")
        if not sf.exists():
            print(f"[warn] no scenes file for {p.name}; skipping", file=sys.stderr)
            continue
        scenes = json.loads(sf.read_text()).get("scenes", [])
        fps = v.get("fps", 30) or 30
        for sc in scenes:
            t = sc["start_s"]
            while t < sc["end_s"] - 0.01:
                fidx = int(round(t * fps))
                if fidx >= v["frames"]:
                    break
                queue.append({
                    "queue_id": f"q{qid:08d}",
                    "video_path": str(p),
                    "win_video_path": v["win_path"],
                    "frame_idx": fidx,
                    "time_s": t,
                })
                qid += 1
                t += sample_every
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(queue, indent=2))
    print(f"[stage] {len(queue)} sampled frames @ {args.sample_fps} fps -> {out}",
          file=sys.stderr)
    print(f"[stage] win path for worker: {wsl_to_win(str(out))}", file=sys.stderr)

# ----------------------------- merge + track -----------------------------
def cmd_merge(args):
    """Read worker output and group by video_path. Supports either JSONL (one record
    per line, the new format) or legacy JSON (results.json with a `results` list)."""
    src_path = Path(args.results)
    records = []
    # try JSONL first (sister .jsonl file, or a .jsonl path passed directly)
    jsonl_candidate = src_path.with_suffix(".jsonl")
    if jsonl_candidate.exists():
        with open(jsonl_candidate) as f:
            for line in f:
                line = line.strip()
                if line:
                    records.append(json.loads(line))
    elif src_path.suffix == ".jsonl":
        with open(src_path) as f:
            for line in f:
                line = line.strip()
                if line:
                    records.append(json.loads(line))
    else:
        # legacy: monolithic JSON
        src = json.loads(src_path.read_text())
        records = src.get("results", [])
    by_video: dict[str, list] = {}
    for r in records:
        by_video.setdefault(r["video_path"], []).append(r)
    for v in by_video:
        by_video[v].sort(key=lambda x: x["frame_idx"])
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": by_video}, indent=2))
    print(f"[merge] {sum(len(v) for v in by_video.values())} frames across {len(by_video)} videos "
          f"-> {out}", file=sys.stderr)

def _iou(a, b):
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    ix1 = max(ax1, bx1); iy1 = max(ay1, by1)
    ix2 = min(ax2, bx2); iy2 = min(ay2, by2)
    iw = max(ix2 - ix1, 0); ih = max(iy2 - iy1, 0)
    inter = iw * ih
    ua = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / ua if ua > 0 else 0.0

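# Example: _iou((0, 0, 10, 10), (5, 0, 15, 10)) = 50 / 150 = 1/3.
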
def cmd_track(args):
    """Stitch per-frame face detections into tracks within each scene of each video.
    Track = list of (frame_idx, face_idx) where adjacent samples have IoU >= 0.3 OR
    cosine(emb) >= 0.5. Unmatched faces start new tracks. No cross-scene merging."""
    fr = json.loads(Path(args.frames).read_text())
    scenes_dir = Path(args.scenes_dir)
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}
    all_video_tracks: dict[str, list] = {}
    for video_path, frames in fr["by_video"].items():
        v = inv_by_path.get(video_path, {})
        sf = scenes_dir / (Path(video_path).stem + ".scenes.json")
        scenes = json.loads(sf.read_text()).get("scenes", []) if sf.exists() else []
        # group frames by scene
        scene_for_frame = {}
        for si, sc in enumerate(scenes):
            for f in frames:
                if sc["start_frame"] <= f["frame_idx"] < sc["end_frame"]:
                    scene_for_frame.setdefault(si, []).append(f)
        video_tracks = []
        for si, scene_frames in scene_for_frame.items():
            scene_frames.sort(key=lambda x: x["frame_idx"])
            # tracks: list of {"members": [(frame_idx, face_idx, face_dict, time_s)],
            #                  "last_bbox": ..., "last_emb": ...}
            tracks = []
            for f in scene_frames:
                claimed = set()
                for face_idx, face in enumerate(f.get("faces", [])):
                    bbox = face["bbox"]
                    emb = np.array(face.get("embedding", []), dtype=np.float32) if face.get("embedding") else None
                    best_track = None
                    best_score = 0.0
                    for ti, tr in enumerate(tracks):
                        if ti in claimed:
                            continue
                        # staleness in TIME (sample period independent of source fps)
                        last_time = tr["members"][-1][3]
                        if f["time_s"] - last_time > 1.5:  # stale if >1.5s gap (3 sample periods @ 2fps)
                            continue
                        # gate each signal against its own threshold, per the rule above:
                        # IoU >= TRACK_IOU_MIN or cosine >= TRACK_EMB_MIN
                        score = 0.0
                        iou = _iou(tr["last_bbox"], bbox)
                        if iou >= TRACK_IOU_MIN:
                            score = iou
                        if emb is not None and tr.get("last_emb") is not None:
                            cos = float(np.dot(tr["last_emb"], emb))
                            if cos >= TRACK_EMB_MIN:
                                score = max(score, cos)
                        if score > best_score:
                            best_score = score
                            best_track = ti
                    if best_track is not None:
                        tr = tracks[best_track]
                        tr["members"].append((f["frame_idx"], face_idx, face, f["time_s"]))
                        tr["last_bbox"] = bbox
                        if emb is not None:
                            tr["last_emb"] = emb
                        claimed.add(best_track)
                    else:
                        tracks.append({
                            "members": [(f["frame_idx"], face_idx, face, f["time_s"])],
                            "last_bbox": bbox,
                            "last_emb": emb,
                        })
            # keep only tracks with at least two samples
            for tr in tracks:
                if len(tr["members"]) < 2:
                    continue
                video_tracks.append({
                    "scene_idx": si,
                    "members": [
                        {"frame_idx": m[0], "face_idx": m[1], "time_s": m[3], "face": m[2]}
                        for m in tr["members"]
                    ],
                })
        all_video_tracks[video_path] = video_tracks
        print(f"[track] {Path(video_path).name}: {sum(len(s) for s in scene_for_frame.values())} frames "
              f"-> {len(video_tracks)} tracks across {len(scene_for_frame)} scenes",
              file=sys.stderr)
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": all_video_tracks}, indent=2))
    print(f"[track] -> {out}", file=sys.stderr)

# ----------------------------- score (quality gates) -----------------------------
def _track_passes(track, cfg):
    """Per-frame quality gating; return list of bool (does each member pass) +
    aggregate stats. cfg: dict with yaw_max, pitch_max, face_min, det_min."""
    passes = []
    yaws, pitches, sizes, dets = [], [], [], []
    for m in track["members"]:
        f = m["face"]
        yaw = abs(f.get("pose", [0, 0, 0])[1]) if f.get("pose") else 0
        pitch = abs(f.get("pose", [0, 0, 0])[0]) if f.get("pose") else 0
        size = f.get("face_short", 0)
        det = f.get("det_score", 0)
        ok = (yaw <= cfg["yaw_max"] and pitch <= cfg["pitch_max"]
              and size >= cfg["face_min"] and det >= cfg["det_min"])
        passes.append(ok)
        yaws.append(yaw); pitches.append(pitch); sizes.append(size); dets.append(det)
    return passes, {
        "n": len(passes), "n_pass": sum(passes), "frac_pass": sum(passes) / max(1, len(passes)),
        "yaw_med": float(np.median(yaws)) if yaws else None,
        "pitch_med": float(np.median(pitches)) if pitches else None,
        "size_med": float(np.median(sizes)) if sizes else None,
        "det_med": float(np.median(dets)) if dets else None,
    }

def _build_segments(track, cfg):
    """Return list of (start_s, end_s) accepted sub-segments of this track:
    contiguous runs of passing frames meeting min/max duration. Pose-failure
    spans <= cfg['bridge_s'] long get bridged across (handles momentary head
    turns / detection misses)."""
    passes, stats = _track_passes(track, cfg)
    members = track["members"]
    if not members:
        return [], stats
    # bridge gaps of failing frames (any width) up to cfg["bridge_s"] seconds
    bridged = list(passes)
    n = len(bridged)
    i = 0
    while i < n:
        if bridged[i]:
            i += 1
            continue
        # find run of consecutive False starting at i
        j = i
        while j < n and not bridged[j]:
            j += 1
        # bridge if surrounded by True on both sides AND time gap <= bridge_s
        if i > 0 and j < n and bridged[i - 1] and bridged[j]:
            t_left = members[i - 1]["time_s"]
            t_right = members[j]["time_s"]
            if t_right - t_left <= cfg["bridge_s"]:
                for k in range(i, j):
                    bridged[k] = True
        i = j
    # find runs of True
    runs = []
    i = 0
    while i < n:
        if not bridged[i]:
            i += 1; continue
        j = i
        while j + 1 < n and bridged[j + 1]:
            j += 1
        s = members[i]["time_s"]
        # end is the time of the last passing sample plus one sample-period
        e = members[j]["time_s"] + 1.0 / max(SAMPLE_FPS, 1e-3)
        runs.append((s, e))
        i = j + 1
    return runs, stats

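# Worked example at 2 fps with bridge_s = 3.0: members at t = 0.0, 0.5, 1.0,
# 1.5, 2.0 with pass pattern T T F F T bridge the F-run (2.0 - 0.5 = 1.5 s
# <= 3.0 s), giving a single run (0.0, 2.5); the end is the last passing time
# plus one sample period.
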
def _merge_close_segments(segs_with_meta, merge_gap_s: float):
    """Merge segments within the same scene that are within merge_gap_s of each other.
    segs_with_meta: list of dicts with start_s, end_s, scene_idx, track_idx, stats.
    Returns list of merged dicts (one per merged group). Identity-tag and stats
    aggregation happen later."""
    by_scene: dict[int, list] = {}
    for s in segs_with_meta:
        by_scene.setdefault(s["scene_idx"], []).append(s)
    merged_all = []
    for scene_idx, segs in by_scene.items():
        segs.sort(key=lambda x: x["start_s"])
        cur = None
        for s in segs:
            if cur is None:
                cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
                       "pass_count": s["stats"]["n_pass"]}
                continue
            gap = s["start_s"] - cur["end_s"]
            if gap <= merge_gap_s:
                # merge
                cur["end_s"] = max(cur["end_s"], s["end_s"])
                cur["track_idxs"].append(s["track_idx"])
                cur["member_count"] += s["stats"]["n"]
                cur["pass_count"] += s["stats"]["n_pass"]
                # take the better-quality stats for display
                if s["stats"]["n_pass"] > cur["stats"]["n_pass"]:
                    cur["stats"] = s["stats"]
            else:
                merged_all.append(cur)
                cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
                       "pass_count": s["stats"]["n_pass"]}
        if cur is not None:
            merged_all.append(cur)
    return merged_all

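# Example with merge_gap_s = 2.0: same-scene segments (1.0-4.0) and (5.5-8.0)
# merge into (1.0-8.0) since the 1.5 s gap fits; a segment starting at 6.5
# would stay separate (gap 2.5 s).
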
def _split_long_segments(segs_with_meta, min_s: float, max_s: float):
    """Apply min/max duration: drop too-short, split too-long evenly."""
    out = []
    for s in segs_with_meta:
        dur = s["end_s"] - s["start_s"]
        if dur < min_s:
            continue
        if dur <= max_s:
            out.append(s)
            continue
        n = int(math.ceil(dur / max_s))
        chunk = dur / n
        base_start = s["start_s"]
        for k in range(n):
            piece = dict(s)
            piece["start_s"] = base_start + k * chunk
            piece["end_s"] = base_start + (k + 1) * chunk
            out.append(piece)
    return out

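# Example with min_s = 1.0, max_s = 30.0: a 0.8 s segment is dropped; a 45 s
# segment splits into ceil(45 / 30) = 2 equal pieces of 22.5 s each.
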
# identity tagging via cached arcface centroids
def load_caches_index():
    rec_index = {}
    alias_map = {}
    for c in CACHES:
        if not c.exists():
            continue
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        meta = json.loads(str(d["meta"]))
        face_records = [m for m in meta if not m.get("noface")]
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n
            rec_index[(rec["path"], tuple(int(x) for x in rec["bbox"]))] = v
            alias_map.setdefault(rec["path"], rec["path"])
    return rec_index, alias_map

def load_faceset_centroids():
    """Return dict faceset_name -> normalized centroid embedding."""
    rec_index, alias_map = load_caches_index()
    centroids = {}
    for fs_dir in sorted(FACESETS_ROOT.iterdir()):
        if not fs_dir.is_dir() or fs_dir.name.startswith("_"):
            continue
        # exclude era splits to avoid double-tagging within a family
        if re.match(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)", fs_dir.name):
            continue
        mp = fs_dir / "manifest.json"
        if not mp.exists():
            continue
        m = json.loads(mp.read_text())
        vecs = []
        for f in m.get("faces", []):
            src = f.get("source"); bbox = f.get("bbox")
            if not src or not bbox:
                continue
            canon = alias_map.get(src, src)
            v = rec_index.get((canon, tuple(int(x) for x in bbox)))
            if v is None and canon != src:
                v = rec_index.get((src, tuple(int(x) for x in bbox)))
            if v is not None:
                vecs.append(v)
        if len(vecs) < 3:
            continue
        c = np.stack(vecs).mean(axis=0)
        n = float(np.linalg.norm(c))
        if n > 0:
            c = c / n
        centroids[fs_dir.name] = c
    return centroids

def _track_centroid(track):
    embs = [m["face"].get("embedding") for m in track["members"] if m["face"].get("embedding")]
    if not embs:
        return None
    arr = np.array(embs, dtype=np.float32)
    c = arr.mean(axis=0)
    n = float(np.linalg.norm(c))
    return c / n if n > 0 else c

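# How tagging uses this downstream (cmd_score): the unit-norm mean embedding of
# a segment's contributing tracks is dotted against each faceset centroid; both
# are unit-norm, so the dot product is cosine similarity, and the best faceset
# is tagged only when it reaches IDENTITY_TAG_THRESHOLD.
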
def cmd_score(args):
    tr = json.loads(Path(args.tracks).read_text())
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}
    cfg = {
        "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
        "face_min": args.min_face, "det_min": args.min_det,
        "bridge_s": args.bridge_gap,
    }
    centroids = {}
    if not args.no_identity:
        print("[score] loading faceset centroids ...", file=sys.stderr)
        t0 = time.time()
        centroids = load_faceset_centroids()
        print(f"[score] {len(centroids)} active faceset centroids loaded in {time.time()-t0:.1f}s",
              file=sys.stderr)
    n_total_tracks = 0
    n_accepted_tracks = 0
    # collect per-track candidate segments first; merging happens per-video below
    per_video_candidates: dict[str, list] = {}
    track_centroids_by_video: dict[str, dict] = {}
    for video_path, tracks in tr["by_video"].items():
        per_video_candidates.setdefault(video_path, [])
        track_centroids_by_video.setdefault(video_path, {})
        for ti, track in enumerate(tracks):
            n_total_tracks += 1
            runs, stats = _build_segments(track, cfg)
            if stats["frac_pass"] < args.track_gate_frac:
                continue
            if not runs:
                continue
            n_accepted_tracks += 1
            track_centroids_by_video[video_path][ti] = _track_centroid(track)
            for (s, e) in runs:
                per_video_candidates[video_path].append({
                    "video_path": video_path,
                    "track_idx": ti,
                    "scene_idx": track["scene_idx"],
                    "start_s": s,
                    "end_s": e,
                    "stats": stats,
                })
    plan = []
    for video_path, segs in per_video_candidates.items():
        if not segs:
            continue
        # merge across tracks within the same scene if gap <= merge_gap_s
        merged = _merge_close_segments(segs, args.merge_gap)
        # apply min/max duration (split long, drop short)
        merged = _split_long_segments(merged, args.min_dur, args.max_dur)
        for s in merged:
            tag = None
            tag_sim = None
            # identity from union of contributing tracks' centroids
            if centroids:
                track_centroid_list = [
                    track_centroids_by_video[video_path].get(ti)
                    for ti in s.get("track_idxs", [s.get("track_idx")])
                ]
                track_centroid_list = [c for c in track_centroid_list if c is not None]
                if track_centroid_list:
                    union = np.stack(track_centroid_list).mean(axis=0)
                    nm = float(np.linalg.norm(union))
                    if nm > 0:
                        union = union / nm
                    sims = {name: float(np.dot(c, union)) for name, c in centroids.items()}
                    best = max(sims, key=sims.get)
                    if sims[best] >= IDENTITY_TAG_THRESHOLD:
                        tag = best; tag_sim = round(sims[best], 4)
            plan.append({
                "video_path": video_path,
                "track_idxs": s.get("track_idxs", [s.get("track_idx")]),
                "scene_idx": s["scene_idx"],
                "start_s": round(s["start_s"], 3),
                "end_s": round(s["end_s"], 3),
                "duration_s": round(s["end_s"] - s["start_s"], 3),
                "member_count": s.get("member_count", s["stats"]["n"]),
                "pass_count": s.get("pass_count", s["stats"]["n_pass"]),
                "stats": s["stats"],
                "identity_tag": tag,
                "identity_sim": tag_sim,
                "uuid": uuid.uuid4().hex[:12],
            })
    plan.sort(key=lambda p: (p["video_path"], p["start_s"]))
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({
        "thresholds": {
            "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
            "face_min": args.min_face, "blur_min": QUALITY_BLUR_MIN,
            "det_min": args.min_det, "track_gate_frac": args.track_gate_frac,
            "bridge_s": args.bridge_gap, "merge_gap_s": args.merge_gap,
            "min_dur_s": args.min_dur, "max_dur_s": args.max_dur,
            "identity_tag_threshold": IDENTITY_TAG_THRESHOLD,
        },
        "totals": {
            "tracks_total": n_total_tracks, "tracks_accepted": n_accepted_tracks,
            "segments": len(plan),
        },
        "plan": plan,
    }, indent=2))
    print(f"[score] {n_accepted_tracks}/{n_total_tracks} tracks accepted -> {len(plan)} segments "
          f"-> {out}", file=sys.stderr)

# ----------------------------- cut -----------------------------
def cmd_cut(args):
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    if args.clean:
        # remove only existing UUID-named clips + sidecars (12-char hex), keeping any other files
        import re as _re
        uuid_pat = _re.compile(r"^[0-9a-f]{12}\.(mp4|json)$")
        n_removed = 0
        for child in out_dir.iterdir():
            if child.is_file() and uuid_pat.match(child.name):
                child.unlink()
                n_removed += 1
            elif child.is_dir() and _re.match(r"^[A-Za-z0-9_.-]+$", child.name):
                # subfolder of prior runs — clear UUID files inside, then remove if empty
                for inner in child.iterdir():
                    if inner.is_file() and uuid_pat.match(inner.name):
                        inner.unlink()
                        n_removed += 1
                try:
                    child.rmdir()
                except OSError:
                    pass
        if n_removed:
            print(f"[clean] removed {n_removed} prior UUID clips/sidecars", file=sys.stderr)
    n_done = 0
    n_err = 0
    sidecars = []
    for seg in plan["plan"]:
        sub = Path(seg["video_path"]).stem
        seg_dir = out_dir / sub
        seg_dir.mkdir(parents=True, exist_ok=True)
        out_video = seg_dir / f"{seg['uuid']}.mp4"
        if out_video.exists() and not args.force:
            continue
        s = seg["start_s"]; d = seg["duration_s"]
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-t", f"{d}",
            "-c", "copy",
            "-avoid_negative_ts", "make_zero",
            str(out_video),
        ]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        if r.returncode != 0 or not out_video.exists() or out_video.stat().st_size < 1024:
            print(f"[cut-err] {seg['uuid']} {seg['video_path']}@{s}+{d}: {r.stderr.strip()[:200]}",
                  file=sys.stderr)
            n_err += 1
            if out_video.exists() and out_video.stat().st_size < 1024:
                out_video.unlink()
            continue
        if args.write_sidecar:
            sidecar = seg_dir / f"{seg['uuid']}.json"
            sidecar.write_text(json.dumps({
                "uuid": seg["uuid"],
                "source_video": seg["video_path"],
                "source_basename": Path(seg["video_path"]).name,
                "start_s": s, "end_s": seg["end_s"], "duration_s": d,
                "scene_idx": seg["scene_idx"],
                "track_idxs": seg.get("track_idxs", [seg.get("track_idx")]),
                "member_count": seg.get("member_count"),
                "pass_count": seg.get("pass_count"),
                "stats": seg["stats"],
                "identity_tag": seg["identity_tag"],
                "identity_sim": seg["identity_sim"],
                "thresholds": plan["thresholds"],
            }, indent=2))
            sidecars.append(sidecar)
        n_done += 1
    print(f"[cut] {n_done} clips written, {n_err} errors -> {out_dir}", file=sys.stderr)

# ----------------------------- report -----------------------------
def cmd_report(args):
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(exist_ok=True)
    output_dir = Path(args.output_dir)
    # group by video
    by_video: dict[str, list] = {}
    for seg in plan["plan"]:
        by_video.setdefault(seg["video_path"], []).append(seg)
    # generate thumbs from each segment's first frame via ffmpeg
    print(f"[report] generating thumbs for {len(plan['plan'])} segments", file=sys.stderr)
    for seg in plan["plan"]:
        thumb = thumbs_dir / f"{seg['uuid']}.jpg"
        if thumb.exists():
            continue
        s = seg["start_s"] + 0.1
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-frames:v", "1",
            "-vf", "scale=240:-1",
            str(thumb),
        ]
        subprocess.run(cmd, capture_output=True, timeout=30)
    # render
    rows = []
    rows.append("<h1>Video target preprocessing &mdash; review</h1>")
    t = plan["totals"]
    th = plan["thresholds"]
    rows.append(f"<p>Tracks accepted: {t['tracks_accepted']}/{t['tracks_total']}; "
                f"segments emitted: {t['segments']}.<br>"
                f"Thresholds: pose &le;{th['yaw_max']}&deg;yaw / {th['pitch_max']}&deg;pitch, "
                f"face_short &ge;{th['face_min']}px, det &ge;{th['det_min']}, "
                f"track-gate &ge;{int(100*th['track_gate_frac'])}%, "
                f"duration {th['min_dur_s']}&ndash;{th['max_dur_s']}s. "
                f"Output dir: <code>{output_dir}</code></p>")
    nav = " · ".join(f"<a href='#v{i}'>{Path(v).name}</a>"
                     for i, v in enumerate(by_video.keys()))
    rows.append(f"<div class='nav'>{nav}</div>")
    for vi, (video_path, segs) in enumerate(by_video.items()):
        rows.append(f"<section id='v{vi}' class='vid'>")
        rows.append(f"<h2>{Path(video_path).name} <small>({len(segs)} segments)</small></h2>")
        rows.append("<div class='cells'>")
        for seg in sorted(segs, key=lambda x: x["start_s"]):
            stats = seg["stats"]
            tag = seg["identity_tag"] or ""
            tag_sim = seg["identity_sim"]
            tag_html = (f"<span class='tag'>{tag} ({tag_sim:.2f})</span>" if tag
                        else "<span class='tag none'>untagged</span>")
            sub_name = Path(seg['video_path']).stem
            rows.append(
                f"<div class='cell'>"
                f"<a href='{output_dir}/{sub_name}/{seg['uuid']}.mp4'><img src='thumbs/{seg['uuid']}.jpg' loading='lazy'></a>"
                f"<div class='meta'>"
                f"<code>{sub_name}/{seg['uuid']}.mp4</code><br>"
                f"{seg['start_s']:.1f}s &rarr; {seg['end_s']:.1f}s ({seg['duration_s']:.1f}s)<br>"
                f"yaw={stats['yaw_med']:.0f}&deg; size={stats['size_med']:.0f}px det={stats['det_med']:.2f}<br>"
                f"pass {stats['n_pass']}/{stats['n']}<br>"
                f"{tag_html}"
                f"</div></div>"
            )
        rows.append("</div></section>")
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Video targets review</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1, h2 {{ margin-top: 1em; }} h2 {{ border-bottom: 1px solid #333; padding-bottom: 4px; }}
small {{ color:#999; font-weight:normal; }}
section.vid {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
.cells {{ display:flex; flex-wrap:wrap; gap:8px; }}
.cell {{ background:#222; border-radius:4px; padding:6px; width:260px; font-size:11px; font-family:monospace; }}
.cell img {{ width:100%; height:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.4; }}
.tag {{ display:inline-block; padding:1px 6px; background:#5fa05f; color:#000; border-radius:2px; }}
.tag.none {{ background:#444; color:#aaa; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:12px; }}
a {{ color:#6cf; }}
code {{ background:#000; padding:1px 4px; border-radius:2px; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[report] -> {out_html}", file=sys.stderr)

# ----------------------------- main -----------------------------
def main():
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd", required=True)

    s = sub.add_parser("scan")
    s.add_argument("--input", default=str(DEFAULT_INPUT))
    s.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    s.add_argument("--recursive", action="store_true")
    s.add_argument("--out", required=True)
    s.set_defaults(func=cmd_scan)

    sc = sub.add_parser("scenes")
    sc.add_argument("--inventory", required=True)
    sc.add_argument("--out-dir", required=True)
    sc.add_argument("--only", default=None, help="comma-separated basenames to limit run")
    sc.add_argument("--force", action="store_true")
    sc.set_defaults(func=cmd_scenes)

    st = sub.add_parser("stage")
    st.add_argument("--inventory", required=True)
    st.add_argument("--scenes-dir", required=True)
    st.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    st.add_argument("--out", required=True)
    st.set_defaults(func=cmd_stage)

    m = sub.add_parser("merge")
    m.add_argument("--results", required=True)
    m.add_argument("--out", required=True)
    m.set_defaults(func=cmd_merge)

    tr = sub.add_parser("track")
    tr.add_argument("--frames", required=True)
    tr.add_argument("--scenes-dir", required=True)
    tr.add_argument("--inventory", required=True)
    tr.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    tr.add_argument("--out", required=True)
    tr.set_defaults(func=cmd_track)

    sc2 = sub.add_parser("score")
    sc2.add_argument("--tracks", required=True)
    sc2.add_argument("--inventory", required=True)
    sc2.add_argument("--out", required=True)
    sc2.add_argument("--no-identity", action="store_true")
    sc2.add_argument("--max-yaw", type=float, default=QUALITY_YAW_MAX)
    sc2.add_argument("--max-pitch", type=float, default=QUALITY_PITCH_MAX)
    sc2.add_argument("--min-face", type=int, default=QUALITY_FACE_MIN)
    sc2.add_argument("--min-det", type=float, default=QUALITY_DET_MIN)
    sc2.add_argument("--track-gate-frac", type=float, default=TRACK_GATE_FRAC)
    sc2.add_argument("--bridge-gap", type=float, default=SEGMENT_BRIDGE_S,
                     help="bridge within-track failure gaps up to this many seconds")
    sc2.add_argument("--merge-gap", type=float, default=SEGMENT_MERGE_GAP_S,
                     help="merge across-track segments in same scene if within this gap")
    sc2.add_argument("--min-dur", type=float, default=SEGMENT_MIN_S)
    sc2.add_argument("--max-dur", type=float, default=SEGMENT_MAX_S)
    sc2.set_defaults(func=cmd_score)

    cu = sub.add_parser("cut")
    cu.add_argument("--plan", required=True)
    cu.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    cu.add_argument("--force", action="store_true")
    cu.add_argument("--clean", action="store_true",
                    help="remove prior UUID-named clips before cutting (preserves non-UUID files)")
    cu.add_argument("--write-sidecar", action="store_true",
                    help="emit <uuid>.json provenance sidecar alongside each clip (default off)")
    cu.set_defaults(func=cmd_cut)

    rp = sub.add_parser("report")
    rp.add_argument("--plan", required=True)
    rp.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    rp.add_argument("--out", required=True)
    rp.set_defaults(func=cmd_report)

    args = ap.parse_args()
    args.func(args)

if __name__ == "__main__":
    main()