Add target-side video preprocessing pipeline
Preprocesses a folder of video files into UUID-named clips suitable as target inputs for roop-unleashed-style face-swap. Counterpart to the faceset (source-side) tooling. work/video_target_pipeline.py — orchestration with subcommands scan / scenes / stage / merge / track / score / cut / report. Quality gates default to face-sets-can-handle-side-profile values (yaw<=75°, pitch<=45°, face_short>=80px, det>=0.5). Cross-track segment merge fuses adjacent-in-time tracks within the same scene up to 2s gap. Output organized into <output_dir>/<source_stem>/<uuid>.mp4 + <uuid>.json sidecar with full provenance. work/video_face_worker.py — Windows DML face detect+embed worker. Uses JSONL append-only for results.jsonl: a critical perf fix (re- serializing the monolithic 245MB results.json on every flush was the dominant cost in the first attempt, dropping throughput to 0.5 fps). Append-only got it to 13+ fps, ~7.5 fps cumulative across the first 6.18h batch. Also uses seek-once-per-video + sequential cap.grab() between samples to dodge cv2 per-sample seek pathology on long H.264. Legacy results.json is auto-migrated to .jsonl on first load. work/run_video_pipeline.sh — generic chain driver, parameterized via WORK / INPUT_DIR / OUTPUT_DIR / FILTER_FROM / SKIP_PATTERN / MAX_DUR / IDENTITY env vars. work/status_video_pipeline.sh — generic status helper. First production batch (ct_src_00050..00062, 13 files, 6.18h input): 600 emitted segments, 239.5min accepted content (64.6% of input), 254 segments built from >=2 tracks (cross-track merge), 1h43m wall clock. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
917
work/video_target_pipeline.py
Normal file
917
work/video_target_pipeline.py
Normal file
@@ -0,0 +1,917 @@
|
||||
"""Video target preprocessing pipeline for roop-unleashed.
|
||||
|
||||
Discovers video files in an input folder, runs scene-cut detection, samples
|
||||
frames within each scene, runs face detection + embedding via Windows DML
|
||||
worker, stitches per-frame detections into face tracks, applies quality
|
||||
gates, cuts approved segments out with ffmpeg stream-copy, and writes a
|
||||
report. Output clips have generic UUID names + a sidecar JSON with full
|
||||
provenance.
|
||||
|
||||
Subcommands:
|
||||
scan list input videos, run ffprobe, write per-video index
|
||||
scenes PySceneDetect AdaptiveDetector per video; write scenes_<basename>.json
|
||||
stage write frame queue.json (sampled @ 2 fps within scenes)
|
||||
merge ingest worker results.json into per-video frame_results
|
||||
track IoU+embedding stitching of per-frame detections into tracks
|
||||
score track-level quality gating + segment plan
|
||||
cut ffmpeg -c copy each accepted segment to <out_dir>/<uuid>.mp4
|
||||
report HTML preview with thumbnails + identity tags
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
DEFAULT_INPUT = Path("/mnt/x/src/vd")
DEFAULT_OUTPUT = Path("/mnt/x/src/vd/ct")
WORK_DIR = Path("/opt/face-sets/work/video_preprocess")

# defaults — first set was strict-portrait; second set loosened for side-profile + segment merging
SAMPLE_FPS = 2.0  # frame sampling rate within each scene (samples/sec)
QUALITY_YAW_MAX = 75.0  # was 25; allow full 3/4 + profile (face-sets handle it)
QUALITY_PITCH_MAX = 45.0  # was 30
QUALITY_FACE_MIN = 80  # was 96; min short side of face bbox, px
QUALITY_BLUR_MIN = 50.0  # recorded in plan thresholds; not gated per-frame in _track_passes
QUALITY_DET_MIN = 0.5  # was 0.6; detector confidence floor
TRACK_GATE_FRAC = 0.7  # >=70% of frames in track must pass per-frame gates
SEGMENT_MIN_S = 1.0  # segments shorter than this are dropped
SEGMENT_MAX_S = 30.0  # was 10; longer segments are split evenly
SEGMENT_BRIDGE_S = 3.0  # was 1.0 — within-track pose-failure bridging
SEGMENT_MERGE_GAP_S = 2.0  # NEW — across-track merge if same scene + within this gap
TRACK_IOU_MIN = 0.3  # bbox IoU threshold for frame-to-track association
TRACK_EMB_MIN = 0.5  # embedding cosine threshold for frame-to-track association

# ArcFace embedding caches consumed by load_caches_index (missing files are skipped).
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
FACESETS_ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
IDENTITY_TAG_THRESHOLD = 0.6  # cosine sim to faceset centroid
|
||||
|
||||
|
||||
def wsl_to_win(p: str) -> str:
    """Convert a WSL ``/mnt/<drive>/...`` path to a Windows drive path.

    Anything not under ``/mnt/`` is returned unchanged.
    """
    text = str(p)
    if not text.startswith("/mnt/"):
        return text
    drive = text[5].upper()
    tail = text[7:].replace("/", "\\")
    return drive + ":\\" + tail
|
||||
|
||||
|
||||
# ----------------------------- ffprobe / scan -----------------------------
|
||||
|
||||
def ffprobe(video: Path) -> dict:
    """Run ffprobe on *video* and return its parsed JSON output.

    On a non-zero exit status, returns ``{"error": <stderr text>}`` instead of
    raising. A hung probe can still raise ``subprocess.TimeoutExpired`` (60s).
    """
    proc = subprocess.run(
        [
            "ffprobe", "-v", "error", "-print_format", "json",
            "-show_format", "-show_streams", str(video),
        ],
        capture_output=True,
        text=True,
        timeout=60,
    )
    if proc.returncode == 0:
        return json.loads(proc.stdout)
    return {"error": proc.stderr.strip()}
|
||||
|
||||
|
||||
def _to_float(value, default: float = 0.0) -> float:
    """Best-effort float conversion; ffprobe emits "N/A" for unknown fields."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _to_int(value, default: int = 0) -> int:
    """Best-effort int conversion (see _to_float)."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def parse_video_meta(probe: dict) -> dict:
    """Extract duration / fps / frame count / resolution / codec from ffprobe JSON.

    Returns ``{"error": ...}`` when the probe failed or there is no video
    stream. ffprobe reports unknown numeric fields as the string "N/A"; those
    are coerced to 0 instead of raising (the previous implementation crashed
    with ValueError on such inputs).
    """
    if "error" in probe:
        return {"error": probe["error"]}
    fmt = probe.get("format", {})
    duration = _to_float(fmt.get("duration", 0))
    vstream = next((s for s in probe.get("streams", []) if s.get("codec_type") == "video"), None)
    if vstream is None:
        return {"error": "no video stream"}
    # avg_frame_rate is a rational like "30000/1001"; a zero denominator means unknown.
    fps_str = vstream.get("avg_frame_rate", "0/1")
    try:
        num, den = (int(x) for x in fps_str.split("/"))
        fps = num / den if den else 0.0
    except Exception:
        fps = 0.0
    # Prefer the container's frame count; fall back to duration * fps.
    nb_frames = _to_int(vstream.get("nb_frames", 0)) or int(round(duration * fps))
    return {
        "duration_s": duration,
        "fps": fps,
        "frames": nb_frames,
        "width": _to_int(vstream.get("width", 0)),
        "height": _to_int(vstream.get("height", 0)),
        "codec": vstream.get("codec_name"),
    }
|
||||
|
||||
|
||||
def cmd_scan(args):
    """Discover input videos, ffprobe each, and write a JSON inventory.

    args: input (dir), out (inventory path), output_dir (excluded from the
    walk), recursive (bool). Writes {"input", "videos": [...]} to args.out;
    progress goes to stderr.
    """
    in_dir = Path(args.input)
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    extensions = {".mp4", ".mov", ".mkv", ".m4v", ".avi", ".webm"}
    out_root = Path(args.output_dir).resolve()
    videos = []
    for p in sorted(in_dir.iterdir() if not args.recursive else in_dir.rglob("*")):
        if not p.is_file():
            continue
        # NOTE(review): p.parents is not resolved, so a symlinked output dir
        # could slip past the first test; the == check covers the dir itself.
        if out_root in p.parents or p.resolve() == out_root:
            continue  # never include the output dir
        if p.suffix.lower() not in extensions:
            continue
        videos.append(p)
    print(f"[scan] {len(videos)} candidate videos", file=sys.stderr)
    inventory = []
    for p in videos:
        # Each entry is the parsed ffprobe metadata plus provenance fields;
        # probe failures are kept in the inventory as {"error": ...} entries.
        meta = parse_video_meta(ffprobe(p))
        meta["path"] = str(p)
        meta["win_path"] = wsl_to_win(str(p))  # for the Windows DML worker
        meta["size"] = p.stat().st_size
        inventory.append(meta)
        if "error" not in meta:
            print(f" {p.name}: {meta['duration_s']:.1f}s @ {meta['fps']:.1f}fps "
                  f"{meta['width']}x{meta['height']} {meta['codec']}", file=sys.stderr)
        else:
            print(f" {p.name}: ERROR {meta['error']}", file=sys.stderr)
    out.write_text(json.dumps({"input": str(in_dir), "videos": inventory}, indent=2))
    print(f"[scan] inventory -> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- scenes -----------------------------
|
||||
|
||||
def cmd_scenes(args):
    """Run PySceneDetect's AdaptiveDetector over each inventory video.

    Writes one <stem>.scenes.json per video into args.out_dir. Skips videos
    whose scenes file already exists unless args.force; args.only restricts
    processing to a comma-separated list of basenames. Errors are logged per
    video and do not abort the run.
    """
    # Imported lazily so the other subcommands work without scenedetect installed.
    from scenedetect import open_video, SceneManager
    from scenedetect.detectors import AdaptiveDetector
    inv = json.loads(Path(args.inventory).read_text())
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    only = set(args.only.split(",")) if args.only else None
    for v in inv["videos"]:
        if "error" in v:
            continue
        path = Path(v["path"])
        if only and path.name not in only:
            continue
        out_file = out_dir / (path.stem + ".scenes.json")
        if out_file.exists() and not args.force:
            continue
        print(f"[scenes] {path.name} ...", file=sys.stderr, flush=True)
        t0 = time.time()
        try:
            video = open_video(str(path))
            sm = SceneManager()
            # min_scene_len ~ half a second of frames at the source fps
            # (falls back to 30 fps when the probe reported 0/None).
            sm.add_detector(AdaptiveDetector(min_scene_len=int(round(v.get("fps", 30) or 30) * 0.5)))
            sm.detect_scenes(video, show_progress=False)
            scenes = sm.get_scene_list()
            entries = []
            for s, e in scenes:
                entries.append({
                    "start_frame": s.frame_num, "end_frame": e.frame_num,
                    "start_s": s.get_seconds(), "end_s": e.get_seconds(),
                    "duration_s": e.get_seconds() - s.get_seconds(),
                })
            # if no cuts found, treat the whole video as one scene
            if not entries:
                entries = [{
                    "start_frame": 0, "end_frame": v["frames"],
                    "start_s": 0.0, "end_s": v["duration_s"],
                    "duration_s": v["duration_s"],
                }]
            out_file.write_text(json.dumps({"video": str(path), "scenes": entries}, indent=2))
            print(f" {len(entries)} scenes in {time.time()-t0:.1f}s -> {out_file.name}",
                  file=sys.stderr)
        except Exception as e:
            # Best-effort per-video: log and move on to the next file.
            print(f" ERROR: {e}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- stage -----------------------------
|
||||
|
||||
def cmd_stage(args):
    """Write the frame-sampling queue consumed by the Windows worker.

    For every scene of every inventory video, samples times at args.sample_fps
    and emits one queue entry per sample with both WSL and Windows paths.
    Output is a JSON list at args.out.
    """
    inv = json.loads(Path(args.inventory).read_text())
    scenes_dir = Path(args.scenes_dir)
    queue = []
    qid = 0  # monotonically increasing id across all videos/scenes
    sample_every = 1.0 / args.sample_fps
    for v in inv["videos"]:
        if "error" in v:
            continue
        p = Path(v["path"])
        sf = scenes_dir / (p.stem + ".scenes.json")
        if not sf.exists():
            print(f"[warn] no scenes file for {p.name}; skipping", file=sys.stderr)
            continue
        scenes = json.loads(sf.read_text()).get("scenes", [])
        fps = v.get("fps", 30) or 30  # guard against probe fps of 0/None
        for sc in scenes:
            t = sc["start_s"]
            # 0.01s epsilon avoids emitting a sample exactly at the scene cut.
            while t < sc["end_s"] - 0.01:
                fidx = int(round(t * fps))
                if fidx >= v["frames"]:
                    break  # past the end of the container's frame count
                queue.append({
                    "queue_id": f"q{qid:08d}",
                    "video_path": str(p),
                    "win_video_path": v["win_path"],
                    "frame_idx": fidx,
                    "time_s": t,
                })
                qid += 1
                t += sample_every
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(queue, indent=2))
    print(f"[stage] {len(queue)} sampled frames @ {args.sample_fps} fps -> {out}",
          file=sys.stderr)
    print(f"[stage] win path for worker: {wsl_to_win(str(out))}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- merge + track -----------------------------
|
||||
|
||||
def _read_jsonl(path: Path) -> list:
    """Parse a JSON-Lines file into a list of records, skipping blank lines."""
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


def cmd_merge(args):
    """Read worker output and group by video_path. Supports either JSONL (one record
    per line, the new format) or legacy JSON (results.json with `results` list).

    Writes {"by_video": {path: [records sorted by frame_idx]}} to args.out.
    """
    src_path = Path(args.results)
    # Prefer the sister .jsonl next to the given path (the fast append-only
    # format); fall back to the path itself when it is already a .jsonl;
    # otherwise treat it as a legacy monolithic results.json.
    jsonl_candidate = src_path.with_suffix(".jsonl")
    if jsonl_candidate.exists():
        records = _read_jsonl(jsonl_candidate)
    elif src_path.suffix == ".jsonl":
        records = _read_jsonl(src_path)
    else:
        # legacy: monolithic JSON
        records = json.loads(src_path.read_text()).get("results", [])
    by_video: dict[str, list] = {}
    for r in records:
        by_video.setdefault(r["video_path"], []).append(r)
    for v in by_video:
        by_video[v].sort(key=lambda x: x["frame_idx"])
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": by_video}, indent=2))
    print(f"[merge] {sum(len(v) for v in by_video.values())} frames across {len(by_video)} videos "
          f"-> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
def _iou(a, b):
|
||||
ax1, ay1, ax2, ay2 = a
|
||||
bx1, by1, bx2, by2 = b
|
||||
ix1 = max(ax1, bx1); iy1 = max(ay1, by1)
|
||||
ix2 = min(ax2, bx2); iy2 = min(ay2, by2)
|
||||
iw = max(ix2 - ix1, 0); ih = max(iy2 - iy1, 0)
|
||||
inter = iw * ih
|
||||
ua = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
|
||||
return inter / ua if ua > 0 else 0.0
|
||||
|
||||
|
||||
def cmd_track(args):
    """Stitch per-frame face detections into tracks within each scene of each video.
    Track = list of (frame_idx, face_idx) where adjacent samples have IoU>=0.3 OR
    cosine(emb)>=0.5. New face → new track. No cross-scene merging.

    Reads args.frames (merge output), args.scenes_dir, args.inventory; writes
    {"by_video": {path: [track dicts]}} to args.out. Tracks with fewer than
    two members are discarded.
    """
    fr = json.loads(Path(args.frames).read_text())
    scenes_dir = Path(args.scenes_dir)
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}

    all_video_tracks: dict[str, list] = {}
    for video_path, frames in fr["by_video"].items():
        # NOTE(review): v is looked up but never used below — dead assignment?
        v = inv_by_path.get(video_path, {})
        sf = scenes_dir / (Path(video_path).stem + ".scenes.json")
        scenes = json.loads(sf.read_text()).get("scenes", []) if sf.exists() else []
        # group frames by scene
        scene_for_frame = {}
        for si, sc in enumerate(scenes):
            for f in frames:
                if f["frame_idx"] >= sc["start_frame"] and f["frame_idx"] < sc["end_frame"]:
                    scene_for_frame.setdefault(si, []).append(f)
        video_tracks = []
        for si, scene_frames in scene_for_frame.items():
            scene_frames.sort(key=lambda x: x["frame_idx"])
            # tracks = list of dict{ "members": [(frame_idx, face_idx, face_dict)], "last_bbox", "last_emb" }
            tracks = []
            for f in scene_frames:
                # each track may absorb at most one face per sampled frame
                claimed = set()
                for face_idx, face in enumerate(f.get("faces", [])):
                    bbox = face["bbox"]
                    emb = np.array(face.get("embedding", []), dtype=np.float32) if face.get("embedding") else None
                    best_track = None
                    best_score = 0.0
                    # greedy: pick the live track with the highest association score
                    for ti, tr in enumerate(tracks):
                        if ti in claimed:
                            continue
                        # staleness in TIME (sample period independent of source fps)
                        last_time = tr["members"][-1][3]
                        if f["time_s"] - last_time > 1.5: # stale if >1.5s gap (3 sample periods @ 2fps)
                            continue
                        score = _iou(tr["last_bbox"], bbox)
                        if emb is not None and tr.get("last_emb") is not None:
                            # embeddings are normalized upstream, so dot == cosine
                            score = max(score, float(np.dot(tr["last_emb"], emb)))
                        if score > best_score:
                            best_score = score
                            best_track = ti
                    # NOTE(review): min(TRACK_IOU_MIN, TRACK_EMB_MIN) = 0.3 is
                    # compared against max(IoU, cosine), so a cosine match in
                    # [0.3, 0.5) is accepted — looser than the docstring's
                    # "cosine>=0.5". Confirm whether this is intentional.
                    if best_track is not None and best_score >= min(TRACK_IOU_MIN, TRACK_EMB_MIN):
                        tr = tracks[best_track]
                        tr["members"].append((f["frame_idx"], face_idx, face, f["time_s"]))
                        tr["last_bbox"] = bbox
                        if emb is not None:
                            tr["last_emb"] = emb
                        claimed.add(best_track)
                    else:
                        # no acceptable match → start a new track for this face
                        tracks.append({
                            "members": [(f["frame_idx"], face_idx, face, f["time_s"])],
                            "last_bbox": bbox,
                            "last_emb": emb,
                        })
            for tr in tracks:
                if len(tr["members"]) < 2:
                    continue  # single-detection tracks are noise; drop them
                video_tracks.append({
                    "scene_idx": si,
                    "members": [
                        {"frame_idx": m[0], "face_idx": m[1], "time_s": m[3], "face": m[2]}
                        for m in tr["members"]
                    ],
                })
        all_video_tracks[video_path] = video_tracks
        print(f"[track] {Path(video_path).name}: {sum(len(s) for s in scene_for_frame.values())} frames "
              f"-> {len(video_tracks)} tracks across {len(scene_for_frame)} scenes",
              file=sys.stderr)

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": all_video_tracks}, indent=2))
    print(f"[track] -> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- score (quality gates) -----------------------------
|
||||
|
||||
def _track_passes(track, cfg):
|
||||
"""Per-frame quality gating; return list of bool (does each member pass) +
|
||||
aggregate stats. cfg: dict with yaw_max, pitch_max, face_min, det_min."""
|
||||
passes = []
|
||||
yaws, pitches, sizes, dets = [], [], [], []
|
||||
for m in track["members"]:
|
||||
f = m["face"]
|
||||
yaw = abs(f.get("pose", [0, 0, 0])[1]) if f.get("pose") else 0
|
||||
pitch = abs(f.get("pose", [0, 0, 0])[0]) if f.get("pose") else 0
|
||||
size = f.get("face_short", 0)
|
||||
det = f.get("det_score", 0)
|
||||
ok = (yaw <= cfg["yaw_max"] and pitch <= cfg["pitch_max"]
|
||||
and size >= cfg["face_min"] and det >= cfg["det_min"])
|
||||
passes.append(ok)
|
||||
yaws.append(yaw); pitches.append(pitch); sizes.append(size); dets.append(det)
|
||||
return passes, {
|
||||
"n": len(passes), "n_pass": sum(passes), "frac_pass": sum(passes) / max(1, len(passes)),
|
||||
"yaw_med": float(np.median(yaws)) if yaws else None,
|
||||
"pitch_med": float(np.median(pitches)) if pitches else None,
|
||||
"size_med": float(np.median(sizes)) if sizes else None,
|
||||
"det_med": float(np.median(dets)) if dets else None,
|
||||
}
|
||||
|
||||
|
||||
def _build_segments(track, cfg):
    """Return list of (start_s, end_s) accepted sub-segments of this track:
    contiguous runs of passing frames meeting min/max duration. Pose-failure
    spans <= cfg['bridge_s'] long get bridged across (handles momentary head
    turns / detection misses).

    Returns (runs, stats) where stats is the aggregate from _track_passes.
    Note: min/max duration is NOT enforced here — _split_long_segments does
    that later in cmd_score.
    """
    passes, stats = _track_passes(track, cfg)
    members = track["members"]
    if not members:
        return [], stats
    # bridge gaps of failing frames (any width) up to cfg["bridge_s"] seconds
    bridged = list(passes)
    n = len(bridged)
    i = 0
    while i < n:
        if bridged[i]:
            i += 1
            continue
        # find run of consecutive False starting at i
        j = i
        while j < n and not bridged[j]:
            j += 1
        # bridge if surrounded by True on both sides AND time gap <= bridge_s
        if i > 0 and j < n and bridged[i - 1] and bridged[j]:
            t_left = members[i - 1]["time_s"]
            t_right = members[j]["time_s"]
            if t_right - t_left <= cfg["bridge_s"]:
                for k in range(i, j):
                    bridged[k] = True
        i = j
    # find runs of True
    runs = []
    i = 0
    while i < n:
        if not bridged[i]:
            i += 1; continue
        j = i
        while j + 1 < n and bridged[j + 1]:
            j += 1
        s = members[i]["time_s"]
        # end is the time of the last passing sample plus one sample-period
        e = members[j]["time_s"] + 1.0 / max(SAMPLE_FPS, 1e-3)
        runs.append((s, e))
        i = j + 1
    return runs, stats
|
||||
|
||||
|
||||
def _merge_close_segments(segs_with_meta, merge_gap_s: float):
|
||||
"""Merge segments within the same scene that are within merge_gap_s of each other.
|
||||
segs_with_meta: list of dicts with start_s, end_s, scene_idx, track_idx, stats.
|
||||
Returns list of merged dicts (one per merged group). Identity-tag and stats
|
||||
aggregation happen later."""
|
||||
by_scene: dict[int, list] = {}
|
||||
for s in segs_with_meta:
|
||||
by_scene.setdefault(s["scene_idx"], []).append(s)
|
||||
merged_all = []
|
||||
for scene_idx, segs in by_scene.items():
|
||||
segs.sort(key=lambda x: x["start_s"])
|
||||
cur = None
|
||||
for s in segs:
|
||||
if cur is None:
|
||||
cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
|
||||
"pass_count": s["stats"]["n_pass"]}
|
||||
continue
|
||||
gap = s["start_s"] - cur["end_s"]
|
||||
if gap <= merge_gap_s:
|
||||
# merge
|
||||
cur["end_s"] = max(cur["end_s"], s["end_s"])
|
||||
cur["track_idxs"].append(s["track_idx"])
|
||||
cur["member_count"] += s["stats"]["n"]
|
||||
cur["pass_count"] += s["stats"]["n_pass"]
|
||||
# take the better-quality stats for display
|
||||
if s["stats"]["n_pass"] > cur["stats"]["n_pass"]:
|
||||
cur["stats"] = s["stats"]
|
||||
else:
|
||||
merged_all.append(cur)
|
||||
cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
|
||||
"pass_count": s["stats"]["n_pass"]}
|
||||
if cur is not None:
|
||||
merged_all.append(cur)
|
||||
return merged_all
|
||||
|
||||
|
||||
def _split_long_segments(segs_with_meta, min_s: float, max_s: float):
|
||||
"""Apply min/max duration: drop too-short, split too-long evenly."""
|
||||
out = []
|
||||
for s in segs_with_meta:
|
||||
dur = s["end_s"] - s["start_s"]
|
||||
if dur < min_s:
|
||||
continue
|
||||
if dur <= max_s:
|
||||
out.append(s)
|
||||
continue
|
||||
n = int(math.ceil(dur / max_s))
|
||||
chunk = dur / n
|
||||
base_start = s["start_s"]
|
||||
for k in range(n):
|
||||
piece = dict(s)
|
||||
piece["start_s"] = base_start + k * chunk
|
||||
piece["end_s"] = base_start + (k + 1) * chunk
|
||||
out.append(piece)
|
||||
return out
|
||||
|
||||
|
||||
# identity tagging via cached arcface centroids
|
||||
def load_caches_index():
    """Load the ArcFace embedding caches listed in CACHES.

    Returns (rec_index, alias_map):
      rec_index: {(path, bbox_tuple): L2-normalized float32 embedding}
      alias_map: {any known path spelling: canonical path}
    Missing cache files are skipped silently.
    """
    rec_index = {}
    alias_map = {}
    for c in CACHES:
        if not c.exists():
            continue
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        # meta is a JSON string stored inside the npz
        meta = json.loads(str(d["meta"]))
        # embeddings array rows align with the face-bearing meta records only
        face_records = [m for m in meta if not m.get("noface")]
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n  # normalize so dot products are cosine similarities
            rec_index[(rec["path"], tuple(int(x) for x in rec["bbox"]))] = v
            alias_map.setdefault(rec["path"], rec["path"])
    return rec_index, alias_map
|
||||
|
||||
|
||||
def load_faceset_centroids():
    """Return dict faceset_name -> normalized centroid embedding.

    Walks FACESETS_ROOT, resolves each manifest face back to a cached
    embedding via load_caches_index, and averages them. Facesets with fewer
    than 3 resolvable embeddings are skipped, as are era-split subsets.
    """
    rec_index, alias_map = load_caches_index()
    centroids = {}
    for fs_dir in sorted(FACESETS_ROOT.iterdir()):
        if not fs_dir.is_dir() or fs_dir.name.startswith("_"):
            continue
        # exclude era splits to avoid double-tagging within a family
        if re.match(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)", fs_dir.name):
            continue
        mp = fs_dir / "manifest.json"
        if not mp.exists():
            continue
        m = json.loads(mp.read_text())
        vecs = []
        for f in m.get("faces", []):
            src = f.get("source"); bbox = f.get("bbox")
            if not src or not bbox:
                continue
            # look up by canonical path first, then the raw manifest path
            canon = alias_map.get(src, src)
            v = rec_index.get((canon, tuple(int(x) for x in bbox)))
            if v is None and canon != src:
                v = rec_index.get((src, tuple(int(x) for x in bbox)))
            if v is not None:
                vecs.append(v)
        if len(vecs) < 3:
            continue  # too few embeddings for a reliable centroid
        c = np.stack(vecs).mean(axis=0)
        n = float(np.linalg.norm(c))
        if n > 0:
            c = c / n
        centroids[fs_dir.name] = c
    return centroids
|
||||
|
||||
|
||||
def _track_centroid(track):
|
||||
embs = [m["face"].get("embedding") for m in track["members"] if m["face"].get("embedding")]
|
||||
if not embs:
|
||||
return None
|
||||
arr = np.array(embs, dtype=np.float32)
|
||||
c = arr.mean(axis=0)
|
||||
n = float(np.linalg.norm(c))
|
||||
return c / n if n > 0 else c
|
||||
|
||||
|
||||
def cmd_score(args):
    """Quality-gate tracks and emit the segment cutting plan.

    Pipeline per video: _build_segments on every track (with the track-level
    frac_pass gate), cross-track merge within scenes (_merge_close_segments),
    duration enforcement (_split_long_segments), then optional identity
    tagging against faceset centroids. Writes {thresholds, totals, plan} to
    args.out; each plan entry gets a fresh 12-hex uuid.
    """
    tr = json.loads(Path(args.tracks).read_text())
    inv = json.loads(Path(args.inventory).read_text())
    # NOTE(review): inv_by_path is currently unused below — confirm it can go.
    inv_by_path = {v["path"]: v for v in inv["videos"]}

    cfg = {
        "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
        "face_min": args.min_face, "det_min": args.min_det,
        "bridge_s": args.bridge_gap,
    }

    centroids = {}
    if not args.no_identity:
        print("[score] loading faceset centroids ...", file=sys.stderr)
        t0 = time.time()
        centroids = load_faceset_centroids()
        print(f"[score] {len(centroids)} active faceset centroids loaded in {time.time()-t0:.1f}s",
              file=sys.stderr)

    n_total_tracks = 0
    n_accepted_tracks = 0
    # collect per-track candidate segments first; merging happens per-video below
    per_video_candidates: dict[str, list] = {}
    track_centroids_by_video: dict[str, dict] = {}
    for video_path, tracks in tr["by_video"].items():
        per_video_candidates.setdefault(video_path, [])
        track_centroids_by_video.setdefault(video_path, {})
        for ti, track in enumerate(tracks):
            n_total_tracks += 1
            runs, stats = _build_segments(track, cfg)
            # track-level gate: enough of the track's frames must pass
            if stats["frac_pass"] < args.track_gate_frac:
                continue
            if not runs:
                continue
            n_accepted_tracks += 1
            # cache the centroid now; used for identity tagging after merging
            track_centroids_by_video[video_path][ti] = _track_centroid(track)
            for (s, e) in runs:
                per_video_candidates[video_path].append({
                    "video_path": video_path,
                    "track_idx": ti,
                    "scene_idx": track["scene_idx"],
                    "start_s": s,
                    "end_s": e,
                    "stats": stats,
                })

    plan = []
    for video_path, segs in per_video_candidates.items():
        if not segs:
            continue
        # merge across tracks within the same scene if gap <= merge_gap_s
        merged = _merge_close_segments(segs, args.merge_gap)
        # apply min/max duration (split long, drop short)
        merged = _split_long_segments(merged, args.min_dur, args.max_dur)
        for s in merged:
            tag = None
            tag_sim = None
            # identity from union of contributing tracks' centroids
            if centroids:
                track_centroid_list = [
                    track_centroids_by_video[video_path].get(ti)
                    for ti in s.get("track_idxs", [s.get("track_idx")])
                ]
                track_centroid_list = [c for c in track_centroid_list if c is not None]
                if track_centroid_list:
                    union = np.stack(track_centroid_list).mean(axis=0)
                    nm = float(np.linalg.norm(union))
                    if nm > 0:
                        union = union / nm
                    # all vectors normalized → dot product is cosine similarity
                    sims = {name: float(np.dot(c, union)) for name, c in centroids.items()}
                    best = max(sims, key=sims.get)
                    if sims[best] >= IDENTITY_TAG_THRESHOLD:
                        tag = best; tag_sim = round(sims[best], 4)
            plan.append({
                "video_path": video_path,
                "track_idxs": s.get("track_idxs", [s.get("track_idx")]),
                "scene_idx": s["scene_idx"],
                "start_s": round(s["start_s"], 3),
                "end_s": round(s["end_s"], 3),
                "duration_s": round(s["end_s"] - s["start_s"], 3),
                "member_count": s.get("member_count", s["stats"]["n"]),
                "pass_count": s.get("pass_count", s["stats"]["n_pass"]),
                "stats": s["stats"],
                "identity_tag": tag,
                "identity_sim": tag_sim,
                "uuid": uuid.uuid4().hex[:12],
            })

    plan.sort(key=lambda p: (p["video_path"], p["start_s"]))
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({
        "thresholds": {
            "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
            "face_min": args.min_face, "blur_min": QUALITY_BLUR_MIN,
            "det_min": args.min_det, "track_gate_frac": args.track_gate_frac,
            "bridge_s": args.bridge_gap, "merge_gap_s": args.merge_gap,
            "min_dur_s": args.min_dur, "max_dur_s": args.max_dur,
            "identity_tag_threshold": IDENTITY_TAG_THRESHOLD,
        },
        "totals": {
            "tracks_total": n_total_tracks, "tracks_accepted": n_accepted_tracks,
            "segments": len(plan),
        },
        "plan": plan,
    }, indent=2))
    print(f"[score] {n_accepted_tracks}/{n_total_tracks} tracks accepted -> {len(plan)} segments "
          f"-> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- cut -----------------------------
|
||||
|
||||
def cmd_cut(args):
    """Cut each planned segment out of its source video with ffmpeg -c copy.

    Output layout: <output_dir>/<source_stem>/<uuid>.mp4 plus a <uuid>.json
    sidecar with full provenance. args.clean removes prior UUID-named outputs
    first; args.force re-cuts clips that already exist. Segments are skipped
    (not recut) when the target clip exists and --force is absent.
    """
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if args.clean:
        # remove only existing UUID-named clips + sidecars (12-char hex), keeping any other files
        import re as _re
        uuid_pat = _re.compile(r"^[0-9a-f]{12}\.(mp4|json)$")
        n_removed = 0
        for child in out_dir.iterdir():
            if child.is_file() and uuid_pat.match(child.name):
                child.unlink()
                n_removed += 1
            elif child.is_dir() and _re.match(r"^[A-Za-z0-9_.-]+$", child.name):
                # subfolder of prior runs — clear UUID files inside, then remove if empty
                for inner in child.iterdir():
                    if inner.is_file() and uuid_pat.match(inner.name):
                        inner.unlink()
                        n_removed += 1
                try:
                    child.rmdir()  # only succeeds when the dir is now empty
                except OSError:
                    pass
        if n_removed:
            print(f"[clean] removed {n_removed} prior UUID clips/sidecars", file=sys.stderr)

    n_done = 0
    n_err = 0
    sidecars = []
    for seg in plan["plan"]:
        # clips are grouped under a subfolder named after the source video
        sub = Path(seg["video_path"]).stem
        seg_dir = out_dir / sub
        seg_dir.mkdir(parents=True, exist_ok=True)
        out_video = seg_dir / f"{seg['uuid']}.mp4"
        if out_video.exists() and not args.force:
            continue
        s = seg["start_s"]; d = seg["duration_s"]
        # stream-copy cut: -ss before -i seeks to the nearest keyframe, so the
        # actual clip start may be slightly earlier than requested
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-t", f"{d}",
            "-c", "copy",
            "-avoid_negative_ts", "make_zero",
            str(out_video),
        ]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        # <1KiB output is treated as a failed cut (e.g. start past EOF)
        if r.returncode != 0 or not out_video.exists() or out_video.stat().st_size < 1024:
            print(f"[cut-err] {seg['uuid']} {seg['video_path']}@{s}+{d}: {r.stderr.strip()[:200]}",
                  file=sys.stderr)
            n_err += 1
            if out_video.exists() and out_video.stat().st_size < 1024:
                out_video.unlink()
            continue
        # sidecar (alongside the clip in the source-named subfolder)
        sidecar = seg_dir / f"{seg['uuid']}.json"
        sidecar.write_text(json.dumps({
            "uuid": seg["uuid"],
            "source_video": seg["video_path"],
            "source_basename": Path(seg["video_path"]).name,
            "start_s": s, "end_s": seg["end_s"], "duration_s": d,
            "scene_idx": seg["scene_idx"],
            "track_idxs": seg.get("track_idxs", [seg.get("track_idx")]),
            "member_count": seg.get("member_count"),
            "pass_count": seg.get("pass_count"),
            "stats": seg["stats"],
            "identity_tag": seg["identity_tag"],
            "identity_sim": seg["identity_sim"],
            "thresholds": plan["thresholds"],
        }, indent=2))
        sidecars.append(sidecar)
        n_done += 1
    print(f"[cut] {n_done} clips written, {n_err} errors -> {out_dir}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- report -----------------------------
|
||||
|
||||
def cmd_report(args):
    """Render an HTML review page for a cut plan.

    Writes ``<out>/index.html`` plus one 240px-wide JPEG thumbnail per
    segment under ``<out>/thumbs/``; each thumbnail links to the clip the
    ``cut`` step places at ``<output_dir>/<source_stem>/<uuid>.mp4``.

    Args:
        args: argparse namespace with ``plan`` (path to the score-plan
            JSON), ``out`` (report output directory) and ``output_dir``
            (root directory of the cut clips).
    """
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(exist_ok=True)
    output_dir = Path(args.output_dir)

    # group segments by source video: the report has one section per file
    by_video: dict[str, list] = {}
    for seg in plan["plan"]:
        by_video.setdefault(seg["video_path"], []).append(seg)

    # generate thumbs from each segment's first frame via ffmpeg
    print(f"[report] generating thumbs for {len(plan['plan'])} segments", file=sys.stderr)
    n_thumb_err = 0
    for seg in plan["plan"]:
        thumb = thumbs_dir / f"{seg['uuid']}.jpg"
        if thumb.exists():
            continue  # idempotent: re-runs only fill in missing thumbs
        # +0.1s avoids grabbing a frame straddling the scene-cut boundary
        s = seg["start_s"] + 0.1
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-frames:v", "1",
            "-vf", "scale=240:-1",
            str(thumb),
        ]
        # FIX: the subprocess result was previously discarded — failed or
        # timed-out thumbnails were silent; now they are counted and warned.
        try:
            r = subprocess.run(cmd, capture_output=True, timeout=30)
        except subprocess.TimeoutExpired:
            n_thumb_err += 1
            continue
        if r.returncode != 0 or not thumb.exists():
            n_thumb_err += 1  # report still renders; the cell shows a broken image
    if n_thumb_err:
        print(f"[report] warning: {n_thumb_err} thumbnails failed", file=sys.stderr)

    # render
    rows = []
    rows.append("<h1>Video target preprocessing — review</h1>")
    t = plan["totals"]
    th = plan["thresholds"]
    rows.append(f"<p>Tracks accepted: {t['tracks_accepted']}/{t['tracks_total']}; "
                f"segments emitted: {t['segments']}.<br>"
                f"Thresholds: pose ≤{th['yaw_max']}°yaw / {th['pitch_max']}°pitch, "
                f"face_short ≥{th['face_min']}px, det ≥{th['det_min']}, "
                f"track-gate ≥{int(100*th['track_gate_frac'])}%, "
                f"duration {th['min_dur_s']}–{th['max_dur_s']}s. "
                f"Output dir: <code>{output_dir}</code></p>")
    nav = " · ".join(f"<a href='#v{i}'>{Path(v).name}</a>"
                     for i, v in enumerate(by_video.keys()))
    rows.append(f"<div class='nav'>{nav}</div>")
    for vi, (video_path, segs) in enumerate(by_video.items()):
        rows.append(f"<section id='v{vi}' class='vid'>")
        rows.append(f"<h2>{Path(video_path).name} <small>({len(segs)} segments)</small></h2>")
        rows.append("<div class='cells'>")
        for seg in sorted(segs, key=lambda x: x["start_s"]):
            stats = seg["stats"]
            tag = seg["identity_tag"] or ""
            tag_sim = seg["identity_sim"]
            tag_html = (f"<span class='tag'>{tag} ({tag_sim:.2f})</span>" if tag else "<span class='tag none'>untagged</span>")
            sub_name = Path(seg['video_path']).stem
            rows.append(
                f"<div class='cell'>"
                f"<a href='{output_dir}/{sub_name}/{seg['uuid']}.mp4'><img src='thumbs/{seg['uuid']}.jpg' loading='lazy'></a>"
                f"<div class='meta'>"
                f"<code>{sub_name}/{seg['uuid']}.mp4</code><br>"
                f"{seg['start_s']:.1f}s → {seg['end_s']:.1f}s ({seg['duration_s']:.1f}s)<br>"
                f"yaw={stats['yaw_med']:.0f}° size={stats['size_med']:.0f}px det={stats['det_med']:.2f}<br>"
                f"pass {stats['n_pass']}/{stats['n']}<br>"
                f"{tag_html}"
                f"</div></div>"
            )
        rows.append("</div></section>")
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Video targets review</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1, h2 {{ margin-top: 1em; }} h2 {{ border-bottom: 1px solid #333; padding-bottom: 4px; }}
small {{ color:#999; font-weight:normal; }}
section.vid {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
.cells {{ display:flex; flex-wrap:wrap; gap:8px; }}
.cell {{ background:#222; border-radius:4px; padding:6px; width:260px; font-size:11px; font-family:monospace; }}
.cell img {{ width:100%; height:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.4; }}
.tag {{ display:inline-block; padding:1px 6px; background:#5fa05f; color:#000; border-radius:2px; }}
.tag.none {{ background:#444; color:#aaa; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:12px; }}
a {{ color:#6cf; }}
code {{ background:#000; padding:1px 4px; border-radius:2px; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    # FIX: the page declares charset=utf-8 and contains non-Latin-1 glyphs
    # (→, ≤, ·); the default locale encoding (cp1252 on the Windows workers)
    # would raise UnicodeEncodeError, so write explicitly as UTF-8.
    out_html.write_text(html, encoding="utf-8")
    print(f"[report] -> {out_html}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- main -----------------------------
|
||||
|
||||
def main():
    """CLI entry point: build the subcommand parser and dispatch.

    Each subcommand (scan / scenes / stage / merge / track / score /
    cut / report) binds its handler via ``set_defaults(func=...)``; the
    selected handler receives the parsed namespace.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    p = commands.add_parser("scan")
    p.add_argument("--input", default=str(DEFAULT_INPUT))
    p.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    p.add_argument("--recursive", action="store_true")
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_scan)

    p = commands.add_parser("scenes")
    p.add_argument("--inventory", required=True)
    p.add_argument("--out-dir", required=True)
    p.add_argument("--only", default=None, help="comma-separated basenames to limit run")
    p.add_argument("--force", action="store_true")
    p.set_defaults(func=cmd_scenes)

    p = commands.add_parser("stage")
    p.add_argument("--inventory", required=True)
    p.add_argument("--scenes-dir", required=True)
    p.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_stage)

    p = commands.add_parser("merge")
    p.add_argument("--results", required=True)
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_merge)

    p = commands.add_parser("track")
    p.add_argument("--frames", required=True)
    p.add_argument("--scenes-dir", required=True)
    p.add_argument("--inventory", required=True)
    p.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_track)

    p = commands.add_parser("score")
    p.add_argument("--tracks", required=True)
    p.add_argument("--inventory", required=True)
    p.add_argument("--out", required=True)
    p.add_argument("--no-identity", action="store_true")
    p.add_argument("--max-yaw", type=float, default=QUALITY_YAW_MAX)
    p.add_argument("--max-pitch", type=float, default=QUALITY_PITCH_MAX)
    p.add_argument("--min-face", type=int, default=QUALITY_FACE_MIN)
    p.add_argument("--min-det", type=float, default=QUALITY_DET_MIN)
    p.add_argument("--track-gate-frac", type=float, default=TRACK_GATE_FRAC)
    p.add_argument("--bridge-gap", type=float, default=SEGMENT_BRIDGE_S,
                   help="bridge within-track failure gaps up to this many seconds")
    p.add_argument("--merge-gap", type=float, default=SEGMENT_MERGE_GAP_S,
                   help="merge across-track segments in same scene if within this gap")
    p.add_argument("--min-dur", type=float, default=SEGMENT_MIN_S)
    p.add_argument("--max-dur", type=float, default=SEGMENT_MAX_S)
    p.set_defaults(func=cmd_score)

    p = commands.add_parser("cut")
    p.add_argument("--plan", required=True)
    p.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    p.add_argument("--force", action="store_true")
    p.add_argument("--clean", action="store_true",
                   help="remove prior UUID-named clips before cutting (preserves non-UUID files)")
    p.set_defaults(func=cmd_cut)

    p = commands.add_parser("report")
    p.add_argument("--plan", required=True)
    p.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_report)

    opts = parser.parse_args()
    opts.func(opts)
|
||||
|
||||
|
||||
# Script entry point: only dispatch the CLI when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user