Previously every video_target_pipeline cut wrote a <uuid>.json provenance sidecar alongside each <uuid>.mp4. The same provenance is already in the per-batch plan.json, so the per-clip sidecars are redundant unless a downstream tool wants each clip self-describing in isolation. - video_target_pipeline.py cut: new --write-sidecar flag, default off. - run_video_pipeline.sh: new SIDECAR env var (default "no"), passes --write-sidecar when SIDECAR=yes. - README + docs/analysis/video-target-preprocessing.md updated. The 1,984 already-emitted sidecars in /mnt/x/src/vd/ct/ct_src_*/ have been deleted (1.5 MB). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
920 lines
37 KiB
Python
920 lines
37 KiB
Python
"""Video target preprocessing pipeline for roop-unleashed.
|
||
|
||
Discovers video files in an input folder, runs scene-cut detection, samples
|
||
frames within each scene, runs face detection + embedding via Windows DML
|
||
worker, stitches per-frame detections into face tracks, applies quality
|
||
gates, cuts approved segments out with ffmpeg stream-copy, and writes a
|
||
report. Output clips have generic UUID names + a sidecar JSON with full
|
||
provenance.
|
||
|
||
Subcommands:
|
||
scan list input videos, run ffprobe, write per-video index
|
||
scenes PySceneDetect AdaptiveDetector per video; write scenes_<basename>.json
|
||
stage write frame queue.json (sampled @ 2 fps within scenes)
|
||
merge ingest worker results.json into per-video frame_results
|
||
track IoU+embedding stitching of per-frame detections into tracks
|
||
score track-level quality gating + segment plan
|
||
cut ffmpeg -c copy each accepted segment to <out_dir>/<uuid>.mp4
|
||
report HTML preview with thumbnails + identity tags
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import math
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
import uuid
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
|
||
import numpy as np
|
||
|
||
# I/O roots (WSL paths; the Windows DML worker receives wsl_to_win() translations).
DEFAULT_INPUT = Path("/mnt/x/src/vd")
DEFAULT_OUTPUT = Path("/mnt/x/src/vd/ct")
WORK_DIR = Path("/opt/face-sets/work/video_preprocess")

# defaults — first set was strict-portrait; second set loosened for side-profile + segment merging
SAMPLE_FPS = 2.0  # frame sampling rate within scenes (stage subcommand)
QUALITY_YAW_MAX = 75.0  # was 25; allow full 3/4 + profile (face-sets handle it)
QUALITY_PITCH_MAX = 45.0  # was 30
QUALITY_FACE_MIN = 80  # was 96
QUALITY_BLUR_MIN = 50.0
QUALITY_DET_MIN = 0.5  # was 0.6
TRACK_GATE_FRAC = 0.7  # >=70% of frames in track must pass per-frame gates
SEGMENT_MIN_S = 1.0
SEGMENT_MAX_S = 30.0  # was 10
SEGMENT_BRIDGE_S = 3.0  # was 1.0 — within-track pose-failure bridging
SEGMENT_MERGE_GAP_S = 2.0  # NEW — across-track merge if same scene + within this gap
TRACK_IOU_MIN = 0.3  # min IoU for adjacent-sample track continuation
TRACK_EMB_MIN = 0.5  # min embedding cosine for track continuation

# ArcFace embedding caches consumed by load_caches_index() for identity tagging.
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
FACESETS_ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
IDENTITY_TAG_THRESHOLD = 0.6  # cosine sim to faceset centroid
|
||
|
||
|
||
def wsl_to_win(p: str) -> str:
    """Translate a WSL ``/mnt/<drive>/...`` path into a Windows drive path.

    Anything not rooted under ``/mnt/`` is returned unchanged (as str).
    """
    text = str(p)
    if not text.startswith("/mnt/"):
        return text
    drive = text[5].upper()
    tail = text[7:].replace("/", "\\")
    return f"{drive}:\\{tail}"
|
||
|
||
|
||
# ----------------------------- ffprobe / scan -----------------------------
|
||
|
||
def ffprobe(video: Path) -> dict:
    """Run ffprobe on *video* and return its parsed JSON output.

    On a non-zero exit status, returns ``{"error": <stderr text>}`` instead
    of raising, so callers can record the failure per video.
    """
    proc = subprocess.run(
        [
            "ffprobe", "-v", "error", "-print_format", "json",
            "-show_format", "-show_streams", str(video),
        ],
        capture_output=True, text=True, timeout=60,
    )
    if proc.returncode == 0:
        return json.loads(proc.stdout)
    return {"error": proc.stderr.strip()}
|
||
|
||
|
||
def _probe_float(val, default: float = 0.0) -> float:
    """Best-effort float parse; ffprobe fields are strings and may be 'N/A'."""
    try:
        return float(val)
    except (TypeError, ValueError):
        return default


def _probe_int(val, default: int = 0) -> int:
    """Best-effort int parse for ffprobe string fields (may be 'N/A')."""
    try:
        return int(val)
    except (TypeError, ValueError):
        return default


def parse_video_meta(probe: dict) -> dict:
    """Flatten raw ffprobe JSON into the inventory schema used downstream.

    Returns ``{"error": ...}`` when the probe failed or no video stream is
    present; otherwise duration_s / fps / frames / width / height / codec.

    Robustness fix: ffprobe emits the literal string "N/A" for duration and
    nb_frames on some containers; previously ``float(...)``/``int(...)``
    raised ValueError and aborted the whole scan.
    """
    if "error" in probe:
        return {"error": probe["error"]}
    fmt = probe.get("format", {})
    duration = _probe_float(fmt.get("duration", 0))
    vstream = next((s for s in probe.get("streams", []) if s.get("codec_type") == "video"), None)
    if vstream is None:
        return {"error": "no video stream"}
    # avg_frame_rate is a rational string like "30000/1001"; "0/0" -> 0 fps
    fps_str = vstream.get("avg_frame_rate", "0/1")
    try:
        num, den = (int(x) for x in fps_str.split("/"))
        fps = num / den if den else 0.0
    except Exception:
        fps = 0.0
    # nb_frames is often missing or "N/A" (e.g. mkv); fall back to duration*fps
    nb_frames = _probe_int(vstream.get("nb_frames", 0)) or int(round(duration * fps))
    return {
        "duration_s": duration,
        "fps": fps,
        "frames": nb_frames,
        "width": _probe_int(vstream.get("width", 0)),
        "height": _probe_int(vstream.get("height", 0)),
        "codec": vstream.get("codec_name"),
    }
|
||
|
||
|
||
def cmd_scan(args):
    """Inventory candidate videos: ffprobe each file, write a JSON index.

    args: input (folder), output_dir (excluded from the walk), recursive,
    out (inventory JSON path). Progress and per-video metadata go to stderr.
    """
    in_dir = Path(args.input)
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    extensions = {".mp4", ".mov", ".mkv", ".m4v", ".avi", ".webm"}
    out_root = Path(args.output_dir).resolve()
    videos = []
    for p in sorted(in_dir.iterdir() if not args.recursive else in_dir.rglob("*")):
        if not p.is_file():
            continue
        # NOTE(review): p.parents is unresolved while out_root is resolved;
        # with a symlinked input dir the exclusion could miss — confirm.
        if out_root in p.parents or p.resolve() == out_root:
            continue  # never include the output dir
        if p.suffix.lower() not in extensions:
            continue
        videos.append(p)
    print(f"[scan] {len(videos)} candidate videos", file=sys.stderr)
    inventory = []
    for p in videos:
        meta = parse_video_meta(ffprobe(p))
        meta["path"] = str(p)
        meta["win_path"] = wsl_to_win(str(p))  # the detection worker runs on Windows
        meta["size"] = p.stat().st_size
        inventory.append(meta)
        if "error" not in meta:
            print(f" {p.name}: {meta['duration_s']:.1f}s @ {meta['fps']:.1f}fps "
                  f"{meta['width']}x{meta['height']} {meta['codec']}", file=sys.stderr)
        else:
            print(f" {p.name}: ERROR {meta['error']}", file=sys.stderr)
    out.write_text(json.dumps({"input": str(in_dir), "videos": inventory}, indent=2))
    print(f"[scan] inventory -> {out}", file=sys.stderr)
|
||
|
||
|
||
# ----------------------------- scenes -----------------------------
|
||
|
||
def cmd_scenes(args):
    """Run PySceneDetect's AdaptiveDetector on each inventory video.

    Writes one <stem>.scenes.json per video into args.out_dir; existing
    outputs are skipped unless --force. --only restricts to a comma-separated
    set of basenames. Failures are logged per video and do not abort the run.
    """
    # third-party imports kept local so other subcommands don't require scenedetect
    from scenedetect import open_video, SceneManager
    from scenedetect.detectors import AdaptiveDetector
    inv = json.loads(Path(args.inventory).read_text())
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    only = set(args.only.split(",")) if args.only else None
    for v in inv["videos"]:
        if "error" in v:
            continue
        path = Path(v["path"])
        if only and path.name not in only:
            continue
        out_file = out_dir / (path.stem + ".scenes.json")
        if out_file.exists() and not args.force:
            continue
        print(f"[scenes] {path.name} ...", file=sys.stderr, flush=True)
        t0 = time.time()
        try:
            video = open_video(str(path))
            sm = SceneManager()
            # min scene length = half a second at the source fps (fallback 30)
            sm.add_detector(AdaptiveDetector(min_scene_len=int(round(v.get("fps", 30) or 30) * 0.5)))
            sm.detect_scenes(video, show_progress=False)
            scenes = sm.get_scene_list()
            entries = []
            for s, e in scenes:
                entries.append({
                    "start_frame": s.frame_num, "end_frame": e.frame_num,
                    "start_s": s.get_seconds(), "end_s": e.get_seconds(),
                    "duration_s": e.get_seconds() - s.get_seconds(),
                })
            # if no cuts found, treat the whole video as one scene
            if not entries:
                entries = [{
                    "start_frame": 0, "end_frame": v["frames"],
                    "start_s": 0.0, "end_s": v["duration_s"],
                    "duration_s": v["duration_s"],
                }]
            out_file.write_text(json.dumps({"video": str(path), "scenes": entries}, indent=2))
            print(f" {len(entries)} scenes in {time.time()-t0:.1f}s -> {out_file.name}",
                  file=sys.stderr)
        except Exception as e:
            # best-effort per video: log and keep processing the rest
            print(f" ERROR: {e}", file=sys.stderr)
|
||
|
||
|
||
# ----------------------------- stage -----------------------------
|
||
|
||
def cmd_stage(args):
    """Build the frame-sampling queue for the Windows detection worker.

    For every scene of every inventory video, emits one queue entry per
    timestamp sampled at args.sample_fps. Writes the queue JSON to args.out
    and prints its Windows-side path for the worker.
    """
    inventory = json.loads(Path(args.inventory).read_text())
    scenes_root = Path(args.scenes_dir)
    step = 1.0 / args.sample_fps
    queue = []
    next_id = 0
    for video in inventory["videos"]:
        if "error" in video:
            continue
        vpath = Path(video["path"])
        scene_file = scenes_root / (vpath.stem + ".scenes.json")
        if not scene_file.exists():
            print(f"[warn] no scenes file for {vpath.name}; skipping", file=sys.stderr)
            continue
        fps = video.get("fps", 30) or 30
        for scene in json.loads(scene_file.read_text()).get("scenes", []):
            t = scene["start_s"]
            # small epsilon keeps a sample from landing exactly on the scene end
            while t < scene["end_s"] - 0.01:
                frame_no = int(round(t * fps))
                if frame_no >= video["frames"]:
                    break
                queue.append({
                    "queue_id": f"q{next_id:08d}",
                    "video_path": str(vpath),
                    "win_video_path": video["win_path"],
                    "frame_idx": frame_no,
                    "time_s": t,
                })
                next_id += 1
                t += step
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(queue, indent=2))
    print(f"[stage] {len(queue)} sampled frames @ {args.sample_fps} fps -> {out}",
          file=sys.stderr)
    print(f"[stage] win path for worker: {wsl_to_win(str(out))}", file=sys.stderr)
|
||
|
||
|
||
# ----------------------------- merge + track -----------------------------
|
||
|
||
def _read_jsonl(path: Path) -> list:
    """Read one JSON record per non-empty line from *path*."""
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


def cmd_merge(args):
    """Read worker output and group by video_path. Supports either JSONL (one record
    per line, the new format) or legacy JSON (results.json with `results` list).

    Writes ``{"by_video": {video_path: [frame records sorted by frame_idx]}}``
    to args.out. (Fix: the two identical JSONL-reading loops are now one
    helper, `_read_jsonl`.)
    """
    src_path = Path(args.results)
    # try JSONL first (sister .jsonl file, or a .jsonl passed directly)
    jsonl_candidate = src_path.with_suffix(".jsonl")
    if jsonl_candidate.exists():
        records = _read_jsonl(jsonl_candidate)
    elif src_path.suffix == ".jsonl":
        records = _read_jsonl(src_path)
    else:
        # legacy: monolithic JSON
        records = json.loads(src_path.read_text()).get("results", [])
    by_video: dict[str, list] = {}
    for r in records:
        by_video.setdefault(r["video_path"], []).append(r)
    for v in by_video:
        by_video[v].sort(key=lambda x: x["frame_idx"])
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": by_video}, indent=2))
    print(f"[merge] {sum(len(v) for v in by_video.values())} frames across {len(by_video)} videos "
          f"-> {out}", file=sys.stderr)
|
||
|
||
|
||
def _iou(a, b):
|
||
ax1, ay1, ax2, ay2 = a
|
||
bx1, by1, bx2, by2 = b
|
||
ix1 = max(ax1, bx1); iy1 = max(ay1, by1)
|
||
ix2 = min(ax2, bx2); iy2 = min(ay2, by2)
|
||
iw = max(ix2 - ix1, 0); ih = max(iy2 - iy1, 0)
|
||
inter = iw * ih
|
||
ua = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
|
||
return inter / ua if ua > 0 else 0.0
|
||
|
||
|
||
def cmd_track(args):
    """Stitch per-frame face detections into tracks within each scene of each video.

    Track = list of (frame_idx, face_idx) where adjacent samples have IoU>=0.3 OR
    cosine(emb)>=0.5. New face → new track. No cross-scene merging.

    Reads args.frames (merge output), args.scenes_dir, args.inventory; writes
    {"by_video": {video_path: [track dicts]}} to args.out. Tracks with fewer
    than 2 members are dropped as noise.

    BUGFIX: match criteria are now checked independently — IoU against
    TRACK_IOU_MIN and embedding cosine against TRACK_EMB_MIN. Previously
    max(iou, cos) was compared to min(TRACK_IOU_MIN, TRACK_EMB_MIN) = 0.3,
    so an embedding-only similarity in [0.3, 0.5) wrongly continued a track.
    """
    fr = json.loads(Path(args.frames).read_text())
    scenes_dir = Path(args.scenes_dir)
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}

    all_video_tracks: dict[str, list] = {}
    for video_path, frames in fr["by_video"].items():
        v = inv_by_path.get(video_path, {})
        sf = scenes_dir / (Path(video_path).stem + ".scenes.json")
        scenes = json.loads(sf.read_text()).get("scenes", []) if sf.exists() else []
        # group frames by scene (frames outside every scene are dropped)
        scene_for_frame: dict[int, list] = {}
        for si, sc in enumerate(scenes):
            for f in frames:
                if f["frame_idx"] >= sc["start_frame"] and f["frame_idx"] < sc["end_frame"]:
                    scene_for_frame.setdefault(si, []).append(f)
        video_tracks = []
        for si, scene_frames in scene_for_frame.items():
            scene_frames.sort(key=lambda x: x["frame_idx"])
            # tracks = list of dict{"members": [(frame_idx, face_idx, face_dict, time_s)],
            #                       "last_bbox", "last_emb"}
            tracks = []
            for f in scene_frames:
                claimed = set()  # tracks already extended by a face in this frame
                for face_idx, face in enumerate(f.get("faces", [])):
                    bbox = face["bbox"]
                    emb = np.array(face.get("embedding", []), dtype=np.float32) if face.get("embedding") else None
                    best_track = None
                    best_score = 0.0
                    for ti, tr in enumerate(tracks):
                        if ti in claimed:
                            continue
                        # staleness in TIME (sample period independent of source fps)
                        last_time = tr["members"][-1][3]
                        if f["time_s"] - last_time > 1.5:  # stale if >1.5s gap (3 sample periods @ 2fps)
                            continue
                        iou = _iou(tr["last_bbox"], bbox)
                        emb_sim = 0.0
                        if emb is not None and tr.get("last_emb") is not None:
                            emb_sim = float(np.dot(tr["last_emb"], emb))
                        # each criterion against its own threshold (see BUGFIX above);
                        # ineligible tracks no longer block a lower-scoring eligible one
                        if iou < TRACK_IOU_MIN and emb_sim < TRACK_EMB_MIN:
                            continue
                        score = max(iou, emb_sim)
                        if score > best_score:
                            best_score = score
                            best_track = ti
                    if best_track is not None:
                        tr = tracks[best_track]
                        tr["members"].append((f["frame_idx"], face_idx, face, f["time_s"]))
                        tr["last_bbox"] = bbox
                        if emb is not None:
                            tr["last_emb"] = emb
                        claimed.add(best_track)
                    else:
                        tracks.append({
                            "members": [(f["frame_idx"], face_idx, face, f["time_s"])],
                            "last_bbox": bbox,
                            "last_emb": emb,
                        })
            for tr in tracks:
                if len(tr["members"]) < 2:
                    continue  # singleton detections are noise; drop
                video_tracks.append({
                    "scene_idx": si,
                    "members": [
                        {"frame_idx": m[0], "face_idx": m[1], "time_s": m[3], "face": m[2]}
                        for m in tr["members"]
                    ],
                })
        all_video_tracks[video_path] = video_tracks
        print(f"[track] {Path(video_path).name}: {sum(len(s) for s in scene_for_frame.values())} frames "
              f"-> {len(video_tracks)} tracks across {len(scene_for_frame)} scenes",
              file=sys.stderr)

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": all_video_tracks}, indent=2))
    print(f"[track] -> {out}", file=sys.stderr)
|
||
|
||
|
||
# ----------------------------- score (quality gates) -----------------------------
|
||
|
||
def _track_passes(track, cfg):
|
||
"""Per-frame quality gating; return list of bool (does each member pass) +
|
||
aggregate stats. cfg: dict with yaw_max, pitch_max, face_min, det_min."""
|
||
passes = []
|
||
yaws, pitches, sizes, dets = [], [], [], []
|
||
for m in track["members"]:
|
||
f = m["face"]
|
||
yaw = abs(f.get("pose", [0, 0, 0])[1]) if f.get("pose") else 0
|
||
pitch = abs(f.get("pose", [0, 0, 0])[0]) if f.get("pose") else 0
|
||
size = f.get("face_short", 0)
|
||
det = f.get("det_score", 0)
|
||
ok = (yaw <= cfg["yaw_max"] and pitch <= cfg["pitch_max"]
|
||
and size >= cfg["face_min"] and det >= cfg["det_min"])
|
||
passes.append(ok)
|
||
yaws.append(yaw); pitches.append(pitch); sizes.append(size); dets.append(det)
|
||
return passes, {
|
||
"n": len(passes), "n_pass": sum(passes), "frac_pass": sum(passes) / max(1, len(passes)),
|
||
"yaw_med": float(np.median(yaws)) if yaws else None,
|
||
"pitch_med": float(np.median(pitches)) if pitches else None,
|
||
"size_med": float(np.median(sizes)) if sizes else None,
|
||
"det_med": float(np.median(dets)) if dets else None,
|
||
}
|
||
|
||
|
||
def _build_segments(track, cfg):
    """Return list of (start_s, end_s) accepted sub-segments of this track:
    contiguous runs of passing frames. Pose-failure spans <= cfg['bridge_s']
    long get bridged across (handles momentary head turns / detection misses).

    Returns (runs, stats) where stats comes from _track_passes. Min/max
    duration enforcement happens later (see _split_long_segments in cmd_score).
    """
    passes, stats = _track_passes(track, cfg)
    members = track["members"]
    if not members:
        return [], stats
    # bridge gaps of failing frames (any width) up to cfg["bridge_s"] seconds
    bridged = list(passes)
    n = len(bridged)
    i = 0
    while i < n:
        if bridged[i]:
            i += 1
            continue
        # find run of consecutive False starting at i
        j = i
        while j < n and not bridged[j]:
            j += 1
        # bridge if surrounded by True on both sides AND time gap <= bridge_s
        if i > 0 and j < n and bridged[i - 1] and bridged[j]:
            t_left = members[i - 1]["time_s"]
            t_right = members[j]["time_s"]
            if t_right - t_left <= cfg["bridge_s"]:
                for k in range(i, j):
                    bridged[k] = True
        i = j
    # find runs of True
    runs = []
    i = 0
    while i < n:
        if not bridged[i]:
            i += 1; continue
        j = i
        while j + 1 < n and bridged[j + 1]:
            j += 1
        s = members[i]["time_s"]
        # end is the time of the last passing sample plus one sample-period
        e = members[j]["time_s"] + 1.0 / max(SAMPLE_FPS, 1e-3)
        runs.append((s, e))
        i = j + 1
    return runs, stats
|
||
|
||
|
||
def _merge_close_segments(segs_with_meta, merge_gap_s: float):
|
||
"""Merge segments within the same scene that are within merge_gap_s of each other.
|
||
segs_with_meta: list of dicts with start_s, end_s, scene_idx, track_idx, stats.
|
||
Returns list of merged dicts (one per merged group). Identity-tag and stats
|
||
aggregation happen later."""
|
||
by_scene: dict[int, list] = {}
|
||
for s in segs_with_meta:
|
||
by_scene.setdefault(s["scene_idx"], []).append(s)
|
||
merged_all = []
|
||
for scene_idx, segs in by_scene.items():
|
||
segs.sort(key=lambda x: x["start_s"])
|
||
cur = None
|
||
for s in segs:
|
||
if cur is None:
|
||
cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
|
||
"pass_count": s["stats"]["n_pass"]}
|
||
continue
|
||
gap = s["start_s"] - cur["end_s"]
|
||
if gap <= merge_gap_s:
|
||
# merge
|
||
cur["end_s"] = max(cur["end_s"], s["end_s"])
|
||
cur["track_idxs"].append(s["track_idx"])
|
||
cur["member_count"] += s["stats"]["n"]
|
||
cur["pass_count"] += s["stats"]["n_pass"]
|
||
# take the better-quality stats for display
|
||
if s["stats"]["n_pass"] > cur["stats"]["n_pass"]:
|
||
cur["stats"] = s["stats"]
|
||
else:
|
||
merged_all.append(cur)
|
||
cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
|
||
"pass_count": s["stats"]["n_pass"]}
|
||
if cur is not None:
|
||
merged_all.append(cur)
|
||
return merged_all
|
||
|
||
|
||
def _split_long_segments(segs_with_meta, min_s: float, max_s: float):
|
||
"""Apply min/max duration: drop too-short, split too-long evenly."""
|
||
out = []
|
||
for s in segs_with_meta:
|
||
dur = s["end_s"] - s["start_s"]
|
||
if dur < min_s:
|
||
continue
|
||
if dur <= max_s:
|
||
out.append(s)
|
||
continue
|
||
n = int(math.ceil(dur / max_s))
|
||
chunk = dur / n
|
||
base_start = s["start_s"]
|
||
for k in range(n):
|
||
piece = dict(s)
|
||
piece["start_s"] = base_start + k * chunk
|
||
piece["end_s"] = base_start + (k + 1) * chunk
|
||
out.append(piece)
|
||
return out
|
||
|
||
|
||
# identity tagging via cached arcface centroids
|
||
def load_caches_index():
    """Build an embedding lookup from the cached ArcFace .npz files in CACHES.

    Returns (rec_index, alias_map): rec_index maps (image_path, bbox tuple)
    to an L2-normalized embedding; alias_map maps any known path alias to its
    canonical path. Missing cache files are skipped silently.
    """
    rec_index = {}
    alias_map = {}
    for cache_path in CACHES:
        if not cache_path.exists():
            continue
        data = np.load(cache_path, allow_pickle=True)
        embeddings = data["embeddings"]
        meta = json.loads(str(data["meta"]))
        # NOTE(review): assumes embedding rows align 1:1 with the non-"noface"
        # meta rows — mirrors how the cache writer emits them; confirm there.
        face_records = [m for m in meta if not m.get("noface")]
        if "path_aliases" in data.files:
            aliases = json.loads(str(data["path_aliases"]))
            for canonical, alias_list in aliases.items():
                alias_map.setdefault(canonical, canonical)
                for alias in alias_list:
                    alias_map[alias] = canonical
        for row, rec in enumerate(face_records):
            vec = embeddings[row].astype(np.float32)
            norm = float(np.linalg.norm(vec))
            if norm > 0:
                vec = vec / norm
            rec_index[(rec["path"], tuple(int(x) for x in rec["bbox"]))] = vec
            alias_map.setdefault(rec["path"], rec["path"])
    return rec_index, alias_map
|
||
|
||
|
||
def load_faceset_centroids():
    """Return dict faceset_name -> normalized centroid embedding.

    Scans FACESETS_ROOT for faceset dirs with a manifest.json, resolves each
    manifest face back to its cached embedding via (path, bbox) lookup, and
    averages. Facesets with fewer than 3 resolvable embeddings are skipped.
    """
    rec_index, alias_map = load_caches_index()
    centroids = {}
    for fs_dir in sorted(FACESETS_ROOT.iterdir()):
        if not fs_dir.is_dir() or fs_dir.name.startswith("_"):
            continue
        # exclude era splits to avoid double-tagging within a family
        if re.match(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)", fs_dir.name):
            continue
        mp = fs_dir / "manifest.json"
        if not mp.exists():
            continue
        m = json.loads(mp.read_text())
        vecs = []
        for f in m.get("faces", []):
            src = f.get("source"); bbox = f.get("bbox")
            if not src or not bbox:
                continue
            canon = alias_map.get(src, src)
            v = rec_index.get((canon, tuple(int(x) for x in bbox)))
            # fall back to the un-aliased path when the canonical lookup misses
            if v is None and canon != src:
                v = rec_index.get((src, tuple(int(x) for x in bbox)))
            if v is not None:
                vecs.append(v)
        if len(vecs) < 3:
            continue  # too few samples for a stable centroid
        c = np.stack(vecs).mean(axis=0)
        n = float(np.linalg.norm(c))
        if n > 0:
            c = c / n
        centroids[fs_dir.name] = c
    return centroids
|
||
|
||
|
||
def _track_centroid(track):
|
||
embs = [m["face"].get("embedding") for m in track["members"] if m["face"].get("embedding")]
|
||
if not embs:
|
||
return None
|
||
arr = np.array(embs, dtype=np.float32)
|
||
c = arr.mean(axis=0)
|
||
n = float(np.linalg.norm(c))
|
||
return c / n if n > 0 else c
|
||
|
||
|
||
def cmd_score(args):
    """Quality-gate tracks and emit the segment cut plan.

    Per track: per-frame gates plus a track-level pass-fraction gate;
    accepted tracks contribute candidate segments, which are then merged
    across tracks (same scene, small gap), duration-bounded, identity-tagged
    against faceset centroids, given fresh 12-hex uuids, and written to
    args.out together with the thresholds used.
    """
    tr = json.loads(Path(args.tracks).read_text())
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}

    cfg = {
        "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
        "face_min": args.min_face, "det_min": args.min_det,
        "bridge_s": args.bridge_gap,
    }

    centroids = {}
    if not args.no_identity:
        print("[score] loading faceset centroids ...", file=sys.stderr)
        t0 = time.time()
        centroids = load_faceset_centroids()
        print(f"[score] {len(centroids)} active faceset centroids loaded in {time.time()-t0:.1f}s",
              file=sys.stderr)

    n_total_tracks = 0
    n_accepted_tracks = 0
    # collect per-track candidate segments first; merging happens per-video below
    per_video_candidates: dict[str, list] = {}
    track_centroids_by_video: dict[str, dict] = {}
    for video_path, tracks in tr["by_video"].items():
        per_video_candidates.setdefault(video_path, [])
        track_centroids_by_video.setdefault(video_path, {})
        for ti, track in enumerate(tracks):
            n_total_tracks += 1
            runs, stats = _build_segments(track, cfg)
            # track-level gate: enough of the whole track must pass per-frame gates
            if stats["frac_pass"] < args.track_gate_frac:
                continue
            if not runs:
                continue
            n_accepted_tracks += 1
            track_centroids_by_video[video_path][ti] = _track_centroid(track)
            for (s, e) in runs:
                per_video_candidates[video_path].append({
                    "video_path": video_path,
                    "track_idx": ti,
                    "scene_idx": track["scene_idx"],
                    "start_s": s,
                    "end_s": e,
                    "stats": stats,
                })

    plan = []
    for video_path, segs in per_video_candidates.items():
        if not segs:
            continue
        # merge across tracks within the same scene if gap <= merge_gap_s
        merged = _merge_close_segments(segs, args.merge_gap)
        # apply min/max duration (split long, drop short)
        merged = _split_long_segments(merged, args.min_dur, args.max_dur)
        for s in merged:
            tag = None
            tag_sim = None
            # identity from union of contributing tracks' centroids
            if centroids:
                track_centroid_list = [
                    track_centroids_by_video[video_path].get(ti)
                    for ti in s.get("track_idxs", [s.get("track_idx")])
                ]
                track_centroid_list = [c for c in track_centroid_list if c is not None]
                if track_centroid_list:
                    union = np.stack(track_centroid_list).mean(axis=0)
                    nm = float(np.linalg.norm(union))
                    if nm > 0:
                        union = union / nm
                    sims = {name: float(np.dot(c, union)) for name, c in centroids.items()}
                    best = max(sims, key=sims.get)
                    # tag only when the best match clears the threshold
                    if sims[best] >= IDENTITY_TAG_THRESHOLD:
                        tag = best; tag_sim = round(sims[best], 4)
            plan.append({
                "video_path": video_path,
                "track_idxs": s.get("track_idxs", [s.get("track_idx")]),
                "scene_idx": s["scene_idx"],
                "start_s": round(s["start_s"], 3),
                "end_s": round(s["end_s"], 3),
                "duration_s": round(s["end_s"] - s["start_s"], 3),
                "member_count": s.get("member_count", s["stats"]["n"]),
                "pass_count": s.get("pass_count", s["stats"]["n_pass"]),
                "stats": s["stats"],
                "identity_tag": tag,
                "identity_sim": tag_sim,
                "uuid": uuid.uuid4().hex[:12],  # generic clip name; provenance stays in the plan
            })

    plan.sort(key=lambda p: (p["video_path"], p["start_s"]))
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({
        "thresholds": {
            "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
            "face_min": args.min_face, "blur_min": QUALITY_BLUR_MIN,
            "det_min": args.min_det, "track_gate_frac": args.track_gate_frac,
            "bridge_s": args.bridge_gap, "merge_gap_s": args.merge_gap,
            "min_dur_s": args.min_dur, "max_dur_s": args.max_dur,
            "identity_tag_threshold": IDENTITY_TAG_THRESHOLD,
        },
        "totals": {
            "tracks_total": n_total_tracks, "tracks_accepted": n_accepted_tracks,
            "segments": len(plan),
        },
        "plan": plan,
    }, indent=2))
    print(f"[score] {n_accepted_tracks}/{n_total_tracks} tracks accepted -> {len(plan)} segments "
          f"-> {out}", file=sys.stderr)
|
||
|
||
|
||
# ----------------------------- cut -----------------------------
|
||
|
||
def cmd_cut(args):
    """Stream-copy each planned segment to <output_dir>/<src_stem>/<uuid>.mp4.

    --clean removes prior UUID-named clips/sidecars first; --force re-cuts
    existing clips; --write-sidecar additionally emits a <uuid>.json
    provenance file per clip (the same data is already in the plan JSON).
    """
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if args.clean:
        # remove only existing UUID-named clips + sidecars (12-char hex), keeping any other files
        import re as _re
        uuid_pat = _re.compile(r"^[0-9a-f]{12}\.(mp4|json)$")
        n_removed = 0
        for child in out_dir.iterdir():
            if child.is_file() and uuid_pat.match(child.name):
                child.unlink()
                n_removed += 1
            elif child.is_dir() and _re.match(r"^[A-Za-z0-9_.-]+$", child.name):
                # subfolder of prior runs — clear UUID files inside, then remove if empty
                # NOTE(review): this dir pattern matches nearly any plain name, but only
                # UUID-named files inside are unlinked, so non-pipeline files survive.
                for inner in child.iterdir():
                    if inner.is_file() and uuid_pat.match(inner.name):
                        inner.unlink()
                        n_removed += 1
                try:
                    child.rmdir()  # only succeeds if the dir is now empty
                except OSError:
                    pass
        if n_removed:
            print(f"[clean] removed {n_removed} prior UUID clips/sidecars", file=sys.stderr)

    n_done = 0
    n_err = 0
    sidecars = []
    for seg in plan["plan"]:
        # clips are grouped into a subfolder per source-video stem
        sub = Path(seg["video_path"]).stem
        seg_dir = out_dir / sub
        seg_dir.mkdir(parents=True, exist_ok=True)
        out_video = seg_dir / f"{seg['uuid']}.mp4"
        if out_video.exists() and not args.force:
            continue
        s = seg["start_s"]; d = seg["duration_s"]
        # -c copy: no re-encode; -avoid_negative_ts make_zero keeps copied streams playable
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-t", f"{d}",
            "-c", "copy",
            "-avoid_negative_ts", "make_zero",
            str(out_video),
        ]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        # treat tiny outputs (<1 KiB) as failures too — stream-copy can emit headers only
        if r.returncode != 0 or not out_video.exists() or out_video.stat().st_size < 1024:
            print(f"[cut-err] {seg['uuid']} {seg['video_path']}@{s}+{d}: {r.stderr.strip()[:200]}",
                  file=sys.stderr)
            n_err += 1
            if out_video.exists() and out_video.stat().st_size < 1024:
                out_video.unlink()
            continue
        if args.write_sidecar:
            # opt-in per-clip provenance; redundant with the batch plan JSON
            sidecar = seg_dir / f"{seg['uuid']}.json"
            sidecar.write_text(json.dumps({
                "uuid": seg["uuid"],
                "source_video": seg["video_path"],
                "source_basename": Path(seg["video_path"]).name,
                "start_s": s, "end_s": seg["end_s"], "duration_s": d,
                "scene_idx": seg["scene_idx"],
                "track_idxs": seg.get("track_idxs", [seg.get("track_idx")]),
                "member_count": seg.get("member_count"),
                "pass_count": seg.get("pass_count"),
                "stats": seg["stats"],
                "identity_tag": seg["identity_tag"],
                "identity_sim": seg["identity_sim"],
                "thresholds": plan["thresholds"],
            }, indent=2))
            sidecars.append(sidecar)
        n_done += 1
    print(f"[cut] {n_done} clips written, {n_err} errors -> {out_dir}", file=sys.stderr)
|
||
|
||
|
||
# ----------------------------- report -----------------------------
|
||
|
||
def cmd_report(args):
    """Render an HTML review page: one section per source video, one thumbnail
    cell per planned segment, linking to the cut clip.

    Thumbnails are grabbed via ffmpeg from just past each segment's start
    into <out>/thumbs/ and cached across runs (skipped if the jpg exists).
    """
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(exist_ok=True)
    output_dir = Path(args.output_dir)

    # group by video
    by_video: dict[str, list] = {}
    for seg in plan["plan"]:
        by_video.setdefault(seg["video_path"], []).append(seg)

    # generate thumbs from each segment's first frame via ffmpeg
    print(f"[report] generating thumbs for {len(plan['plan'])} segments", file=sys.stderr)
    for seg in plan["plan"]:
        thumb = thumbs_dir / f"{seg['uuid']}.jpg"
        if thumb.exists():
            continue
        s = seg["start_s"] + 0.1  # nudge slightly past the exact cut point
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-frames:v", "1",
            "-vf", "scale=240:-1",
            str(thumb),
        ]
        # best-effort: a missing thumb just renders as a broken image
        subprocess.run(cmd, capture_output=True, timeout=30)

    # render
    rows = []
    rows.append("<h1>Video target preprocessing — review</h1>")
    t = plan["totals"]
    th = plan["thresholds"]
    rows.append(f"<p>Tracks accepted: {t['tracks_accepted']}/{t['tracks_total']}; "
                f"segments emitted: {t['segments']}.<br>"
                f"Thresholds: pose ≤{th['yaw_max']}°yaw / {th['pitch_max']}°pitch, "
                f"face_short ≥{th['face_min']}px, det ≥{th['det_min']}, "
                f"track-gate ≥{int(100*th['track_gate_frac'])}%, "
                f"duration {th['min_dur_s']}–{th['max_dur_s']}s. "
                f"Output dir: <code>{output_dir}</code></p>")
    nav = " · ".join(f"<a href='#v{i}'>{Path(v).name}</a>"
                     for i, v in enumerate(by_video.keys()))
    rows.append(f"<div class='nav'>{nav}</div>")
    for vi, (video_path, segs) in enumerate(by_video.items()):
        rows.append(f"<section id='v{vi}' class='vid'>")
        rows.append(f"<h2>{Path(video_path).name} <small>({len(segs)} segments)</small></h2>")
        rows.append("<div class='cells'>")
        for seg in sorted(segs, key=lambda x: x["start_s"]):
            stats = seg["stats"]
            tag = seg["identity_tag"] or ""
            tag_sim = seg["identity_sim"]
            tag_html = (f"<span class='tag'>{tag} ({tag_sim:.2f})</span>" if tag else "<span class='tag none'>untagged</span>")
            sub_name = Path(seg['video_path']).stem
            rows.append(
                f"<div class='cell'>"
                f"<a href='{output_dir}/{sub_name}/{seg['uuid']}.mp4'><img src='thumbs/{seg['uuid']}.jpg' loading='lazy'></a>"
                f"<div class='meta'>"
                f"<code>{sub_name}/{seg['uuid']}.mp4</code><br>"
                f"{seg['start_s']:.1f}s → {seg['end_s']:.1f}s ({seg['duration_s']:.1f}s)<br>"
                f"yaw={stats['yaw_med']:.0f}° size={stats['size_med']:.0f}px det={stats['det_med']:.2f}<br>"
                f"pass {stats['n_pass']}/{stats['n']}<br>"
                f"{tag_html}"
                f"</div></div>"
            )
        rows.append("</div></section>")
    # doubled braces below are literal CSS braces inside the f-string
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Video targets review</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1, h2 {{ margin-top: 1em; }} h2 {{ border-bottom: 1px solid #333; padding-bottom: 4px; }}
small {{ color:#999; font-weight:normal; }}
section.vid {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
.cells {{ display:flex; flex-wrap:wrap; gap:8px; }}
.cell {{ background:#222; border-radius:4px; padding:6px; width:260px; font-size:11px; font-family:monospace; }}
.cell img {{ width:100%; height:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.4; }}
.tag {{ display:inline-block; padding:1px 6px; background:#5fa05f; color:#000; border-radius:2px; }}
.tag.none {{ background:#444; color:#aaa; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:12px; }}
a {{ color:#6cf; }}
code {{ background:#000; padding:1px 4px; border-radius:2px; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[report] -> {out_html}", file=sys.stderr)
|
||
|
||
|
||
# ----------------------------- main -----------------------------
|
||
|
||
def main():
    """CLI entry point: one subparser per pipeline stage, dispatched via
    set_defaults(func=...)."""
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd", required=True)

    # scan: inventory input videos via ffprobe
    s = sub.add_parser("scan")
    s.add_argument("--input", default=str(DEFAULT_INPUT))
    s.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    s.add_argument("--recursive", action="store_true")
    s.add_argument("--out", required=True)
    s.set_defaults(func=cmd_scan)

    # scenes: PySceneDetect pass
    sc = sub.add_parser("scenes")
    sc.add_argument("--inventory", required=True)
    sc.add_argument("--out-dir", required=True)
    sc.add_argument("--only", default=None, help="comma-separated basenames to limit run")
    sc.add_argument("--force", action="store_true")
    sc.set_defaults(func=cmd_scenes)

    # stage: build the worker frame queue
    st = sub.add_parser("stage")
    st.add_argument("--inventory", required=True)
    st.add_argument("--scenes-dir", required=True)
    st.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    st.add_argument("--out", required=True)
    st.set_defaults(func=cmd_stage)

    # merge: ingest worker results (JSONL or legacy JSON)
    m = sub.add_parser("merge")
    m.add_argument("--results", required=True)
    m.add_argument("--out", required=True)
    m.set_defaults(func=cmd_merge)

    # track: stitch per-frame detections into tracks
    tr = sub.add_parser("track")
    tr.add_argument("--frames", required=True)
    tr.add_argument("--scenes-dir", required=True)
    tr.add_argument("--inventory", required=True)
    tr.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    tr.add_argument("--out", required=True)
    tr.set_defaults(func=cmd_track)

    # score: quality gates + segment plan (+ identity tagging)
    sc2 = sub.add_parser("score")
    sc2.add_argument("--tracks", required=True)
    sc2.add_argument("--inventory", required=True)
    sc2.add_argument("--out", required=True)
    sc2.add_argument("--no-identity", action="store_true")
    sc2.add_argument("--max-yaw", type=float, default=QUALITY_YAW_MAX)
    sc2.add_argument("--max-pitch", type=float, default=QUALITY_PITCH_MAX)
    sc2.add_argument("--min-face", type=int, default=QUALITY_FACE_MIN)
    sc2.add_argument("--min-det", type=float, default=QUALITY_DET_MIN)
    sc2.add_argument("--track-gate-frac", type=float, default=TRACK_GATE_FRAC)
    sc2.add_argument("--bridge-gap", type=float, default=SEGMENT_BRIDGE_S,
                     help="bridge within-track failure gaps up to this many seconds")
    sc2.add_argument("--merge-gap", type=float, default=SEGMENT_MERGE_GAP_S,
                     help="merge across-track segments in same scene if within this gap")
    sc2.add_argument("--min-dur", type=float, default=SEGMENT_MIN_S)
    sc2.add_argument("--max-dur", type=float, default=SEGMENT_MAX_S)
    sc2.set_defaults(func=cmd_score)

    # cut: ffmpeg stream-copy clips (sidecars opt-in)
    cu = sub.add_parser("cut")
    cu.add_argument("--plan", required=True)
    cu.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    cu.add_argument("--force", action="store_true")
    cu.add_argument("--clean", action="store_true",
                    help="remove prior UUID-named clips before cutting (preserves non-UUID files)")
    cu.add_argument("--write-sidecar", action="store_true",
                    help="emit <uuid>.json provenance sidecar alongside each clip (default off)")
    cu.set_defaults(func=cmd_cut)

    # report: HTML review page
    rp = sub.add_parser("report")
    rp.add_argument("--plan", required=True)
    rp.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    rp.add_argument("--out", required=True)
    rp.set_defaults(func=cmd_report)

    args = ap.parse_args()
    args.func(args)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|