face-sets/work/video_target_pipeline.py
Peter 7960dec350 Make per-clip sidecar JSONs opt-in (default off)
Previously every video_target_pipeline cut wrote a <uuid>.json provenance
sidecar alongside each <uuid>.mp4. The same provenance is already in the
per-batch plan.json, so the per-clip sidecars are redundant unless a
downstream tool wants each clip self-describing in isolation.

- video_target_pipeline.py cut: new --write-sidecar flag, default off.
- run_video_pipeline.sh: new SIDECAR env var (default "no"), passes
  --write-sidecar when SIDECAR=yes.
- README + docs/analysis/video-target-preprocessing.md updated.

The 1,984 already-emitted sidecars in /mnt/x/src/vd/ct/ct_src_*/ have
been deleted (1.5 MB).
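
Example opt-in invocations (plan path illustrative):

    SIDECAR=yes ./run_video_pipeline.sh
    python video_target_pipeline.py cut --plan work/plan.json --write-sidecar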

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 12:44:27 +02:00

"""Video target preprocessing pipeline for roop-unleashed.
Discovers video files in an input folder, runs scene-cut detection, samples
frames within each scene, runs face detection + embedding via Windows DML
worker, stitches per-frame detections into face tracks, applies quality
gates, cuts approved segments out with ffmpeg stream-copy, and writes a
report. Output clips have generic UUID names + a sidecar JSON with full
provenance.
Subcommands:
scan list input videos, run ffprobe, write per-video index
scenes PySceneDetect AdaptiveDetector per video; write scenes_<basename>.json
stage write frame queue.json (sampled @ 2 fps within scenes)
merge ingest worker results.json into per-video frame_results
track IoU+embedding stitching of per-frame detections into tracks
score track-level quality gating + segment plan
cut ffmpeg -c copy each accepted segment to <out_dir>/<uuid>.mp4
report HTML preview with thumbnails + identity tags
"""
from __future__ import annotations

import argparse
import json
import math
import re
import shutil
import subprocess
import sys
import time
import uuid
from collections import defaultdict
from pathlib import Path

import numpy as np

DEFAULT_INPUT = Path("/mnt/x/src/vd")
DEFAULT_OUTPUT = Path("/mnt/x/src/vd/ct")
WORK_DIR = Path("/opt/face-sets/work/video_preprocess")
# defaults — first set was strict-portrait; second set loosened for side-profile + segment merging
SAMPLE_FPS = 2.0
QUALITY_YAW_MAX = 75.0 # was 25; allow full 3/4 + profile (face-sets handle it)
QUALITY_PITCH_MAX = 45.0 # was 30
QUALITY_FACE_MIN = 80 # was 96
QUALITY_BLUR_MIN = 50.0
QUALITY_DET_MIN = 0.5 # was 0.6
TRACK_GATE_FRAC = 0.7 # >=70% of frames in track must pass per-frame gates
SEGMENT_MIN_S = 1.0
SEGMENT_MAX_S = 30.0 # was 10
SEGMENT_BRIDGE_S = 3.0 # was 1.0 — within-track pose-failure bridging
SEGMENT_MERGE_GAP_S = 2.0 # NEW — across-track merge if same scene + within this gap
TRACK_IOU_MIN = 0.3
TRACK_EMB_MIN = 0.5
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
FACESETS_ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
IDENTITY_TAG_THRESHOLD = 0.6 # cosine sim to faceset centroid

def wsl_to_win(p: str) -> str:
    s = str(p)
    if s.startswith("/mnt/"):
        return f"{s[5].upper()}:\\{s[7:].replace('/', chr(92))}"
    return s

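# Example: wsl_to_win("/mnt/x/src/vd/a.mp4") -> "X:\src\vd\a.mp4"; non-/mnt/
# paths pass through unchanged.
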
# ----------------------------- ffprobe / scan -----------------------------
def ffprobe(video: Path) -> dict:
    cmd = [
        "ffprobe", "-v", "error", "-print_format", "json",
        "-show_format", "-show_streams", str(video),
    ]
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if r.returncode != 0:
        return {"error": r.stderr.strip()}
    return json.loads(r.stdout)

def parse_video_meta(probe: dict) -> dict:
    if "error" in probe:
        return {"error": probe["error"]}
    fmt = probe.get("format", {})
    duration = float(fmt.get("duration", 0))
    vstream = next((s for s in probe.get("streams", []) if s.get("codec_type") == "video"), None)
    if vstream is None:
        return {"error": "no video stream"}
    fps_str = vstream.get("avg_frame_rate", "0/1")
    try:
        num, den = (int(x) for x in fps_str.split("/"))
        fps = num / den if den else 0.0
    except Exception:
        fps = 0.0
    nb_frames = int(vstream.get("nb_frames", 0)) or int(round(duration * fps))
    return {
        "duration_s": duration,
        "fps": fps,
        "frames": nb_frames,
        "width": int(vstream.get("width", 0)),
        "height": int(vstream.get("height", 0)),
        "codec": vstream.get("codec_name"),
    }

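# Example: a stream with avg_frame_rate "30000/1001" parses to fps ~29.97; if
# nb_frames is absent (or 0), the frame count falls back to round(duration * fps).
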
def cmd_scan(args):
    in_dir = Path(args.input)
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    extensions = {".mp4", ".mov", ".mkv", ".m4v", ".avi", ".webm"}
    out_root = Path(args.output_dir).resolve()
    videos = []
    for p in sorted(in_dir.iterdir() if not args.recursive else in_dir.rglob("*")):
        if not p.is_file():
            continue
        if out_root in p.parents or p.resolve() == out_root:
            continue  # never include the output dir
        if p.suffix.lower() not in extensions:
            continue
        videos.append(p)
    print(f"[scan] {len(videos)} candidate videos", file=sys.stderr)
    inventory = []
    for p in videos:
        meta = parse_video_meta(ffprobe(p))
        meta["path"] = str(p)
        meta["win_path"] = wsl_to_win(str(p))
        meta["size"] = p.stat().st_size
        inventory.append(meta)
        if "error" not in meta:
            print(f" {p.name}: {meta['duration_s']:.1f}s @ {meta['fps']:.1f}fps "
                  f"{meta['width']}x{meta['height']} {meta['codec']}", file=sys.stderr)
        else:
            print(f" {p.name}: ERROR {meta['error']}", file=sys.stderr)
    out.write_text(json.dumps({"input": str(in_dir), "videos": inventory}, indent=2))
    print(f"[scan] inventory -> {out}", file=sys.stderr)

# ----------------------------- scenes -----------------------------
def cmd_scenes(args):
    from scenedetect import open_video, SceneManager
    from scenedetect.detectors import AdaptiveDetector
    inv = json.loads(Path(args.inventory).read_text())
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    only = set(args.only.split(",")) if args.only else None
    for v in inv["videos"]:
        if "error" in v:
            continue
        path = Path(v["path"])
        if only and path.name not in only:
            continue
        out_file = out_dir / (path.stem + ".scenes.json")
        if out_file.exists() and not args.force:
            continue
        print(f"[scenes] {path.name} ...", file=sys.stderr, flush=True)
        t0 = time.time()
        try:
            video = open_video(str(path))
            sm = SceneManager()
            sm.add_detector(AdaptiveDetector(min_scene_len=int(round(v.get("fps", 30) or 30) * 0.5)))
            sm.detect_scenes(video, show_progress=False)
            scenes = sm.get_scene_list()
            entries = []
            for s, e in scenes:
                entries.append({
                    "start_frame": s.frame_num, "end_frame": e.frame_num,
                    "start_s": s.get_seconds(), "end_s": e.get_seconds(),
                    "duration_s": e.get_seconds() - s.get_seconds(),
                })
            # if no cuts found, treat the whole video as one scene
            if not entries:
                entries = [{
                    "start_frame": 0, "end_frame": v["frames"],
                    "start_s": 0.0, "end_s": v["duration_s"],
                    "duration_s": v["duration_s"],
                }]
            out_file.write_text(json.dumps({"video": str(path), "scenes": entries}, indent=2))
            print(f" {len(entries)} scenes in {time.time()-t0:.1f}s -> {out_file.name}",
                  file=sys.stderr)
        except Exception as e:
            print(f" ERROR: {e}", file=sys.stderr)

# ----------------------------- stage -----------------------------
def cmd_stage(args):
    inv = json.loads(Path(args.inventory).read_text())
    scenes_dir = Path(args.scenes_dir)
    queue = []
    qid = 0
    sample_every = 1.0 / args.sample_fps
    for v in inv["videos"]:
        if "error" in v:
            continue
        p = Path(v["path"])
        sf = scenes_dir / (p.stem + ".scenes.json")
        if not sf.exists():
            print(f"[warn] no scenes file for {p.name}; skipping", file=sys.stderr)
            continue
        scenes = json.loads(sf.read_text()).get("scenes", [])
        fps = v.get("fps", 30) or 30
        for sc in scenes:
            t = sc["start_s"]
            while t < sc["end_s"] - 0.01:
                fidx = int(round(t * fps))
                if fidx >= v["frames"]:
                    break
                queue.append({
                    "queue_id": f"q{qid:08d}",
                    "video_path": str(p),
                    "win_video_path": v["win_path"],
                    "frame_idx": fidx,
                    "time_s": t,
                })
                qid += 1
                t += sample_every
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(queue, indent=2))
    print(f"[stage] {len(queue)} sampled frames @ {args.sample_fps} fps -> {out}",
          file=sys.stderr)
    print(f"[stage] win path for worker: {wsl_to_win(str(out))}", file=sys.stderr)

# ----------------------------- merge + track -----------------------------
def cmd_merge(args):
    """Read worker output and group by video_path. Supports either JSONL (one record
    per line, the new format) or legacy JSON (results.json with a `results` list)."""
    src_path = Path(args.results)
    records = []
    # try JSONL first (sister .jsonl file, or a .jsonl path passed directly)
    jsonl_candidate = src_path.with_suffix(".jsonl")
    if jsonl_candidate.exists():
        with open(jsonl_candidate) as f:
            for line in f:
                line = line.strip()
                if line:
                    records.append(json.loads(line))
    elif src_path.suffix == ".jsonl":
        with open(src_path) as f:
            for line in f:
                line = line.strip()
                if line:
                    records.append(json.loads(line))
    else:
        # legacy: monolithic JSON
        src = json.loads(src_path.read_text())
        records = src.get("results", [])
    by_video: dict[str, list] = {}
    for r in records:
        by_video.setdefault(r["video_path"], []).append(r)
    for v in by_video:
        by_video[v].sort(key=lambda x: x["frame_idx"])
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": by_video}, indent=2))
    print(f"[merge] {sum(len(v) for v in by_video.values())} frames across {len(by_video)} videos "
          f"-> {out}", file=sys.stderr)

def _iou(a, b):
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    ix1 = max(ax1, bx1); iy1 = max(ay1, by1)
    ix2 = min(ax2, bx2); iy2 = min(ay2, by2)
    iw = max(ix2 - ix1, 0); ih = max(iy2 - iy1, 0)
    inter = iw * ih
    ua = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / ua if ua > 0 else 0.0

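# Example: _iou((0, 0, 10, 10), (5, 0, 15, 10)) = 50 / 150 = 1/3.
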
def cmd_track(args):
    """Stitch per-frame face detections into tracks within each scene of each video.
    Track = list of (frame_idx, face_idx) where adjacent samples have IoU >= 0.3 OR
    cosine(emb) >= 0.5. Unmatched faces start new tracks. No cross-scene merging."""
    fr = json.loads(Path(args.frames).read_text())
    scenes_dir = Path(args.scenes_dir)
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}
    all_video_tracks: dict[str, list] = {}
    for video_path, frames in fr["by_video"].items():
        v = inv_by_path.get(video_path, {})
        sf = scenes_dir / (Path(video_path).stem + ".scenes.json")
        scenes = json.loads(sf.read_text()).get("scenes", []) if sf.exists() else []
        # group frames by scene
        scene_for_frame = {}
        for si, sc in enumerate(scenes):
            for f in frames:
                if sc["start_frame"] <= f["frame_idx"] < sc["end_frame"]:
                    scene_for_frame.setdefault(si, []).append(f)
        video_tracks = []
        for si, scene_frames in scene_for_frame.items():
            scene_frames.sort(key=lambda x: x["frame_idx"])
            # tracks: list of {"members": [(frame_idx, face_idx, face_dict, time_s)],
            #                  "last_bbox": ..., "last_emb": ...}
            tracks = []
            for f in scene_frames:
                claimed = set()
                for face_idx, face in enumerate(f.get("faces", [])):
                    bbox = face["bbox"]
                    emb = np.array(face.get("embedding", []), dtype=np.float32) if face.get("embedding") else None
                    best_track = None
                    best_score = 0.0
                    for ti, tr in enumerate(tracks):
                        if ti in claimed:
                            continue
                        # staleness in TIME (sample period independent of source fps)
                        last_time = tr["members"][-1][3]
                        if f["time_s"] - last_time > 1.5:  # stale if >1.5s gap (3 sample periods @ 2fps)
                            continue
                        # gate each signal against its own threshold, per the rule above:
                        # IoU >= TRACK_IOU_MIN or cosine >= TRACK_EMB_MIN
                        score = 0.0
                        iou = _iou(tr["last_bbox"], bbox)
                        if iou >= TRACK_IOU_MIN:
                            score = iou
                        if emb is not None and tr.get("last_emb") is not None:
                            cos = float(np.dot(tr["last_emb"], emb))
                            if cos >= TRACK_EMB_MIN:
                                score = max(score, cos)
                        if score > best_score:
                            best_score = score
                            best_track = ti
                    if best_track is not None:
                        tr = tracks[best_track]
                        tr["members"].append((f["frame_idx"], face_idx, face, f["time_s"]))
                        tr["last_bbox"] = bbox
                        if emb is not None:
                            tr["last_emb"] = emb
                        claimed.add(best_track)
                    else:
                        tracks.append({
                            "members": [(f["frame_idx"], face_idx, face, f["time_s"])],
                            "last_bbox": bbox,
                            "last_emb": emb,
                        })
            # keep only tracks with at least two samples
            for tr in tracks:
                if len(tr["members"]) < 2:
                    continue
                video_tracks.append({
                    "scene_idx": si,
                    "members": [
                        {"frame_idx": m[0], "face_idx": m[1], "time_s": m[3], "face": m[2]}
                        for m in tr["members"]
                    ],
                })
        all_video_tracks[video_path] = video_tracks
        print(f"[track] {Path(video_path).name}: {sum(len(s) for s in scene_for_frame.values())} frames "
              f"-> {len(video_tracks)} tracks across {len(scene_for_frame)} scenes",
              file=sys.stderr)
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": all_video_tracks}, indent=2))
    print(f"[track] -> {out}", file=sys.stderr)

# ----------------------------- score (quality gates) -----------------------------
def _track_passes(track, cfg):
    """Per-frame quality gating; return list of bool (does each member pass) +
    aggregate stats. cfg: dict with yaw_max, pitch_max, face_min, det_min."""
    passes = []
    yaws, pitches, sizes, dets = [], [], [], []
    for m in track["members"]:
        f = m["face"]
        yaw = abs(f.get("pose", [0, 0, 0])[1]) if f.get("pose") else 0
        pitch = abs(f.get("pose", [0, 0, 0])[0]) if f.get("pose") else 0
        size = f.get("face_short", 0)
        det = f.get("det_score", 0)
        ok = (yaw <= cfg["yaw_max"] and pitch <= cfg["pitch_max"]
              and size >= cfg["face_min"] and det >= cfg["det_min"])
        passes.append(ok)
        yaws.append(yaw); pitches.append(pitch); sizes.append(size); dets.append(det)
    return passes, {
        "n": len(passes), "n_pass": sum(passes), "frac_pass": sum(passes) / max(1, len(passes)),
        "yaw_med": float(np.median(yaws)) if yaws else None,
        "pitch_med": float(np.median(pitches)) if pitches else None,
        "size_med": float(np.median(sizes)) if sizes else None,
        "det_med": float(np.median(dets)) if dets else None,
    }

def _build_segments(track, cfg):
    """Return list of (start_s, end_s) accepted sub-segments of this track:
    contiguous runs of passing frames meeting min/max duration. Pose-failure
    spans <= cfg['bridge_s'] long get bridged across (handles momentary head
    turns / detection misses)."""
    passes, stats = _track_passes(track, cfg)
    members = track["members"]
    if not members:
        return [], stats
    # bridge gaps of failing frames (any width) up to cfg["bridge_s"] seconds
    bridged = list(passes)
    n = len(bridged)
    i = 0
    while i < n:
        if bridged[i]:
            i += 1
            continue
        # find run of consecutive False starting at i
        j = i
        while j < n and not bridged[j]:
            j += 1
        # bridge if surrounded by True on both sides AND time gap <= bridge_s
        if i > 0 and j < n and bridged[i - 1] and bridged[j]:
            t_left = members[i - 1]["time_s"]
            t_right = members[j]["time_s"]
            if t_right - t_left <= cfg["bridge_s"]:
                for k in range(i, j):
                    bridged[k] = True
        i = j
    # find runs of True
    runs = []
    i = 0
    while i < n:
        if not bridged[i]:
            i += 1; continue
        j = i
        while j + 1 < n and bridged[j + 1]:
            j += 1
        s = members[i]["time_s"]
        # end is the time of the last passing sample plus one sample-period
        e = members[j]["time_s"] + 1.0 / max(SAMPLE_FPS, 1e-3)
        runs.append((s, e))
        i = j + 1
    return runs, stats

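# Worked example at 2 fps with bridge_s = 3.0: members at t = 0.0, 0.5, 1.0,
# 1.5, 2.0 with pass pattern T T F F T bridge the F-run (2.0 - 0.5 = 1.5 s
# <= 3.0 s), giving a single run (0.0, 2.5); the end is the last passing time
# plus one sample period.
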
def _merge_close_segments(segs_with_meta, merge_gap_s: float):
    """Merge segments within the same scene that are within merge_gap_s of each other.
    segs_with_meta: list of dicts with start_s, end_s, scene_idx, track_idx, stats.
    Returns list of merged dicts (one per merged group). Identity-tag and stats
    aggregation happen later."""
    by_scene: dict[int, list] = {}
    for s in segs_with_meta:
        by_scene.setdefault(s["scene_idx"], []).append(s)
    merged_all = []
    for scene_idx, segs in by_scene.items():
        segs.sort(key=lambda x: x["start_s"])
        cur = None
        for s in segs:
            if cur is None:
                cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
                       "pass_count": s["stats"]["n_pass"]}
                continue
            gap = s["start_s"] - cur["end_s"]
            if gap <= merge_gap_s:
                # merge
                cur["end_s"] = max(cur["end_s"], s["end_s"])
                cur["track_idxs"].append(s["track_idx"])
                cur["member_count"] += s["stats"]["n"]
                cur["pass_count"] += s["stats"]["n_pass"]
                # take the better-quality stats for display
                if s["stats"]["n_pass"] > cur["stats"]["n_pass"]:
                    cur["stats"] = s["stats"]
            else:
                merged_all.append(cur)
                cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
                       "pass_count": s["stats"]["n_pass"]}
        if cur is not None:
            merged_all.append(cur)
    return merged_all

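# Example with merge_gap_s = 2.0: same-scene segments (1.0-4.0) and (5.5-8.0)
# merge into (1.0-8.0) since the 1.5 s gap fits; a segment starting at 6.5
# would stay separate (gap 2.5 s).
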
def _split_long_segments(segs_with_meta, min_s: float, max_s: float):
    """Apply min/max duration: drop too-short, split too-long evenly."""
    out = []
    for s in segs_with_meta:
        dur = s["end_s"] - s["start_s"]
        if dur < min_s:
            continue
        if dur <= max_s:
            out.append(s)
            continue
        n = int(math.ceil(dur / max_s))
        chunk = dur / n
        base_start = s["start_s"]
        for k in range(n):
            piece = dict(s)
            piece["start_s"] = base_start + k * chunk
            piece["end_s"] = base_start + (k + 1) * chunk
            out.append(piece)
    return out

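# Example with min_s = 1.0, max_s = 30.0: a 0.8 s segment is dropped; a 45 s
# segment splits into ceil(45 / 30) = 2 equal pieces of 22.5 s each.
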
# identity tagging via cached arcface centroids
def load_caches_index():
    rec_index = {}
    alias_map = {}
    for c in CACHES:
        if not c.exists():
            continue
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        meta = json.loads(str(d["meta"]))
        face_records = [m for m in meta if not m.get("noface")]
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n
            rec_index[(rec["path"], tuple(int(x) for x in rec["bbox"]))] = v
            alias_map.setdefault(rec["path"], rec["path"])
    return rec_index, alias_map

def load_faceset_centroids():
    """Return dict faceset_name -> normalized centroid embedding."""
    rec_index, alias_map = load_caches_index()
    centroids = {}
    for fs_dir in sorted(FACESETS_ROOT.iterdir()):
        if not fs_dir.is_dir() or fs_dir.name.startswith("_"):
            continue
        # exclude era splits to avoid double-tagging within a family
        if re.match(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)", fs_dir.name):
            continue
        mp = fs_dir / "manifest.json"
        if not mp.exists():
            continue
        m = json.loads(mp.read_text())
        vecs = []
        for f in m.get("faces", []):
            src = f.get("source"); bbox = f.get("bbox")
            if not src or not bbox:
                continue
            canon = alias_map.get(src, src)
            v = rec_index.get((canon, tuple(int(x) for x in bbox)))
            if v is None and canon != src:
                v = rec_index.get((src, tuple(int(x) for x in bbox)))
            if v is not None:
                vecs.append(v)
        if len(vecs) < 3:
            continue
        c = np.stack(vecs).mean(axis=0)
        n = float(np.linalg.norm(c))
        if n > 0:
            c = c / n
        centroids[fs_dir.name] = c
    return centroids

def _track_centroid(track):
    embs = [m["face"].get("embedding") for m in track["members"] if m["face"].get("embedding")]
    if not embs:
        return None
    arr = np.array(embs, dtype=np.float32)
    c = arr.mean(axis=0)
    n = float(np.linalg.norm(c))
    return c / n if n > 0 else c

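# How tagging uses this downstream (cmd_score): the unit-norm mean embedding of
# a segment's contributing tracks is dotted against each faceset centroid; both
# are unit-norm, so the dot product is cosine similarity, and the best faceset
# is tagged only when it reaches IDENTITY_TAG_THRESHOLD.
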
def cmd_score(args):
    tr = json.loads(Path(args.tracks).read_text())
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}
    cfg = {
        "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
        "face_min": args.min_face, "det_min": args.min_det,
        "bridge_s": args.bridge_gap,
    }
    centroids = {}
    if not args.no_identity:
        print("[score] loading faceset centroids ...", file=sys.stderr)
        t0 = time.time()
        centroids = load_faceset_centroids()
        print(f"[score] {len(centroids)} active faceset centroids loaded in {time.time()-t0:.1f}s",
              file=sys.stderr)
    n_total_tracks = 0
    n_accepted_tracks = 0
    # collect per-track candidate segments first; merging happens per-video below
    per_video_candidates: dict[str, list] = {}
    track_centroids_by_video: dict[str, dict] = {}
    for video_path, tracks in tr["by_video"].items():
        per_video_candidates.setdefault(video_path, [])
        track_centroids_by_video.setdefault(video_path, {})
        for ti, track in enumerate(tracks):
            n_total_tracks += 1
            runs, stats = _build_segments(track, cfg)
            if stats["frac_pass"] < args.track_gate_frac:
                continue
            if not runs:
                continue
            n_accepted_tracks += 1
            track_centroids_by_video[video_path][ti] = _track_centroid(track)
            for (s, e) in runs:
                per_video_candidates[video_path].append({
                    "video_path": video_path,
                    "track_idx": ti,
                    "scene_idx": track["scene_idx"],
                    "start_s": s,
                    "end_s": e,
                    "stats": stats,
                })
    plan = []
    for video_path, segs in per_video_candidates.items():
        if not segs:
            continue
        # merge across tracks within the same scene if gap <= merge_gap_s
        merged = _merge_close_segments(segs, args.merge_gap)
        # apply min/max duration (split long, drop short)
        merged = _split_long_segments(merged, args.min_dur, args.max_dur)
        for s in merged:
            tag = None
            tag_sim = None
            # identity from union of contributing tracks' centroids
            if centroids:
                track_centroid_list = [
                    track_centroids_by_video[video_path].get(ti)
                    for ti in s.get("track_idxs", [s.get("track_idx")])
                ]
                track_centroid_list = [c for c in track_centroid_list if c is not None]
                if track_centroid_list:
                    union = np.stack(track_centroid_list).mean(axis=0)
                    nm = float(np.linalg.norm(union))
                    if nm > 0:
                        union = union / nm
                    sims = {name: float(np.dot(c, union)) for name, c in centroids.items()}
                    best = max(sims, key=sims.get)
                    if sims[best] >= IDENTITY_TAG_THRESHOLD:
                        tag = best; tag_sim = round(sims[best], 4)
            plan.append({
                "video_path": video_path,
                "track_idxs": s.get("track_idxs", [s.get("track_idx")]),
                "scene_idx": s["scene_idx"],
                "start_s": round(s["start_s"], 3),
                "end_s": round(s["end_s"], 3),
                "duration_s": round(s["end_s"] - s["start_s"], 3),
                "member_count": s.get("member_count", s["stats"]["n"]),
                "pass_count": s.get("pass_count", s["stats"]["n_pass"]),
                "stats": s["stats"],
                "identity_tag": tag,
                "identity_sim": tag_sim,
                "uuid": uuid.uuid4().hex[:12],
            })
    plan.sort(key=lambda p: (p["video_path"], p["start_s"]))
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({
        "thresholds": {
            "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
            "face_min": args.min_face, "blur_min": QUALITY_BLUR_MIN,
            "det_min": args.min_det, "track_gate_frac": args.track_gate_frac,
            "bridge_s": args.bridge_gap, "merge_gap_s": args.merge_gap,
            "min_dur_s": args.min_dur, "max_dur_s": args.max_dur,
            "identity_tag_threshold": IDENTITY_TAG_THRESHOLD,
        },
        "totals": {
            "tracks_total": n_total_tracks, "tracks_accepted": n_accepted_tracks,
            "segments": len(plan),
        },
        "plan": plan,
    }, indent=2))
    print(f"[score] {n_accepted_tracks}/{n_total_tracks} tracks accepted -> {len(plan)} segments "
          f"-> {out}", file=sys.stderr)

# ----------------------------- cut -----------------------------
def cmd_cut(args):
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    if args.clean:
        # remove only existing UUID-named clips + sidecars (12-char hex), keeping any other files
        import re as _re
        uuid_pat = _re.compile(r"^[0-9a-f]{12}\.(mp4|json)$")
        n_removed = 0
        for child in out_dir.iterdir():
            if child.is_file() and uuid_pat.match(child.name):
                child.unlink()
                n_removed += 1
            elif child.is_dir() and _re.match(r"^[A-Za-z0-9_.-]+$", child.name):
                # subfolder of prior runs — clear UUID files inside, then remove if empty
                for inner in child.iterdir():
                    if inner.is_file() and uuid_pat.match(inner.name):
                        inner.unlink()
                        n_removed += 1
                try:
                    child.rmdir()
                except OSError:
                    pass
        if n_removed:
            print(f"[clean] removed {n_removed} prior UUID clips/sidecars", file=sys.stderr)
    n_done = 0
    n_err = 0
    sidecars = []
    for seg in plan["plan"]:
        sub = Path(seg["video_path"]).stem
        seg_dir = out_dir / sub
        seg_dir.mkdir(parents=True, exist_ok=True)
        out_video = seg_dir / f"{seg['uuid']}.mp4"
        if out_video.exists() and not args.force:
            continue
        s = seg["start_s"]; d = seg["duration_s"]
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-t", f"{d}",
            "-c", "copy",
            "-avoid_negative_ts", "make_zero",
            str(out_video),
        ]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        if r.returncode != 0 or not out_video.exists() or out_video.stat().st_size < 1024:
            print(f"[cut-err] {seg['uuid']} {seg['video_path']}@{s}+{d}: {r.stderr.strip()[:200]}",
                  file=sys.stderr)
            n_err += 1
            if out_video.exists() and out_video.stat().st_size < 1024:
                out_video.unlink()
            continue
        if args.write_sidecar:
            sidecar = seg_dir / f"{seg['uuid']}.json"
            sidecar.write_text(json.dumps({
                "uuid": seg["uuid"],
                "source_video": seg["video_path"],
                "source_basename": Path(seg["video_path"]).name,
                "start_s": s, "end_s": seg["end_s"], "duration_s": d,
                "scene_idx": seg["scene_idx"],
                "track_idxs": seg.get("track_idxs", [seg.get("track_idx")]),
                "member_count": seg.get("member_count"),
                "pass_count": seg.get("pass_count"),
                "stats": seg["stats"],
                "identity_tag": seg["identity_tag"],
                "identity_sim": seg["identity_sim"],
                "thresholds": plan["thresholds"],
            }, indent=2))
            sidecars.append(sidecar)
        n_done += 1
    print(f"[cut] {n_done} clips written, {n_err} errors -> {out_dir}", file=sys.stderr)

# ----------------------------- report -----------------------------
def cmd_report(args):
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(exist_ok=True)
    output_dir = Path(args.output_dir)
    # group by video
    by_video: dict[str, list] = {}
    for seg in plan["plan"]:
        by_video.setdefault(seg["video_path"], []).append(seg)
    # generate thumbs from each segment's first frame via ffmpeg
    print(f"[report] generating thumbs for {len(plan['plan'])} segments", file=sys.stderr)
    for seg in plan["plan"]:
        thumb = thumbs_dir / f"{seg['uuid']}.jpg"
        if thumb.exists():
            continue
        s = seg["start_s"] + 0.1
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-frames:v", "1",
            "-vf", "scale=240:-1",
            str(thumb),
        ]
        subprocess.run(cmd, capture_output=True, timeout=30)
    # render
    rows = []
    rows.append("<h1>Video target preprocessing &mdash; review</h1>")
    t = plan["totals"]
    th = plan["thresholds"]
    rows.append(f"<p>Tracks accepted: {t['tracks_accepted']}/{t['tracks_total']}; "
                f"segments emitted: {t['segments']}.<br>"
                f"Thresholds: pose &le;{th['yaw_max']}&deg;yaw / {th['pitch_max']}&deg;pitch, "
                f"face_short &ge;{th['face_min']}px, det &ge;{th['det_min']}, "
                f"track-gate &ge;{int(100*th['track_gate_frac'])}%, "
                f"duration {th['min_dur_s']}&ndash;{th['max_dur_s']}s. "
                f"Output dir: <code>{output_dir}</code></p>")
    nav = " · ".join(f"<a href='#v{i}'>{Path(v).name}</a>"
                     for i, v in enumerate(by_video.keys()))
    rows.append(f"<div class='nav'>{nav}</div>")
    for vi, (video_path, segs) in enumerate(by_video.items()):
        rows.append(f"<section id='v{vi}' class='vid'>")
        rows.append(f"<h2>{Path(video_path).name} <small>({len(segs)} segments)</small></h2>")
        rows.append("<div class='cells'>")
        for seg in sorted(segs, key=lambda x: x["start_s"]):
            stats = seg["stats"]
            tag = seg["identity_tag"] or ""
            tag_sim = seg["identity_sim"]
            tag_html = (f"<span class='tag'>{tag} ({tag_sim:.2f})</span>" if tag
                        else "<span class='tag none'>untagged</span>")
            sub_name = Path(seg['video_path']).stem
            rows.append(
                f"<div class='cell'>"
                f"<a href='{output_dir}/{sub_name}/{seg['uuid']}.mp4'><img src='thumbs/{seg['uuid']}.jpg' loading='lazy'></a>"
                f"<div class='meta'>"
                f"<code>{sub_name}/{seg['uuid']}.mp4</code><br>"
                f"{seg['start_s']:.1f}s &rarr; {seg['end_s']:.1f}s ({seg['duration_s']:.1f}s)<br>"
                f"yaw={stats['yaw_med']:.0f}&deg; size={stats['size_med']:.0f}px det={stats['det_med']:.2f}<br>"
                f"pass {stats['n_pass']}/{stats['n']}<br>"
                f"{tag_html}"
                f"</div></div>"
            )
        rows.append("</div></section>")
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Video targets review</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1, h2 {{ margin-top: 1em; }} h2 {{ border-bottom: 1px solid #333; padding-bottom: 4px; }}
small {{ color:#999; font-weight:normal; }}
section.vid {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
.cells {{ display:flex; flex-wrap:wrap; gap:8px; }}
.cell {{ background:#222; border-radius:4px; padding:6px; width:260px; font-size:11px; font-family:monospace; }}
.cell img {{ width:100%; height:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.4; }}
.tag {{ display:inline-block; padding:1px 6px; background:#5fa05f; color:#000; border-radius:2px; }}
.tag.none {{ background:#444; color:#aaa; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:12px; }}
a {{ color:#6cf; }}
code {{ background:#000; padding:1px 4px; border-radius:2px; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[report] -> {out_html}", file=sys.stderr)

# ----------------------------- main -----------------------------
def main():
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd", required=True)

    s = sub.add_parser("scan")
    s.add_argument("--input", default=str(DEFAULT_INPUT))
    s.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    s.add_argument("--recursive", action="store_true")
    s.add_argument("--out", required=True)
    s.set_defaults(func=cmd_scan)

    sc = sub.add_parser("scenes")
    sc.add_argument("--inventory", required=True)
    sc.add_argument("--out-dir", required=True)
    sc.add_argument("--only", default=None, help="comma-separated basenames to limit run")
    sc.add_argument("--force", action="store_true")
    sc.set_defaults(func=cmd_scenes)

    st = sub.add_parser("stage")
    st.add_argument("--inventory", required=True)
    st.add_argument("--scenes-dir", required=True)
    st.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    st.add_argument("--out", required=True)
    st.set_defaults(func=cmd_stage)

    m = sub.add_parser("merge")
    m.add_argument("--results", required=True)
    m.add_argument("--out", required=True)
    m.set_defaults(func=cmd_merge)

    tr = sub.add_parser("track")
    tr.add_argument("--frames", required=True)
    tr.add_argument("--scenes-dir", required=True)
    tr.add_argument("--inventory", required=True)
    tr.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    tr.add_argument("--out", required=True)
    tr.set_defaults(func=cmd_track)

    sc2 = sub.add_parser("score")
    sc2.add_argument("--tracks", required=True)
    sc2.add_argument("--inventory", required=True)
    sc2.add_argument("--out", required=True)
    sc2.add_argument("--no-identity", action="store_true")
    sc2.add_argument("--max-yaw", type=float, default=QUALITY_YAW_MAX)
    sc2.add_argument("--max-pitch", type=float, default=QUALITY_PITCH_MAX)
    sc2.add_argument("--min-face", type=int, default=QUALITY_FACE_MIN)
    sc2.add_argument("--min-det", type=float, default=QUALITY_DET_MIN)
    sc2.add_argument("--track-gate-frac", type=float, default=TRACK_GATE_FRAC)
    sc2.add_argument("--bridge-gap", type=float, default=SEGMENT_BRIDGE_S,
                     help="bridge within-track failure gaps up to this many seconds")
    sc2.add_argument("--merge-gap", type=float, default=SEGMENT_MERGE_GAP_S,
                     help="merge across-track segments in same scene if within this gap")
    sc2.add_argument("--min-dur", type=float, default=SEGMENT_MIN_S)
    sc2.add_argument("--max-dur", type=float, default=SEGMENT_MAX_S)
    sc2.set_defaults(func=cmd_score)

    cu = sub.add_parser("cut")
    cu.add_argument("--plan", required=True)
    cu.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    cu.add_argument("--force", action="store_true")
    cu.add_argument("--clean", action="store_true",
                    help="remove prior UUID-named clips before cutting (preserves non-UUID files)")
    cu.add_argument("--write-sidecar", action="store_true",
                    help="emit <uuid>.json provenance sidecar alongside each clip (default off)")
    cu.set_defaults(func=cmd_cut)

    rp = sub.add_parser("report")
    rp.add_argument("--plan", required=True)
    rp.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    rp.add_argument("--out", required=True)
    rp.set_defaults(func=cmd_report)

    args = ap.parse_args()
    args.func(args)

if __name__ == "__main__":
    main()