Add target-side video preprocessing pipeline
Preprocesses a folder of video files into UUID-named clips suitable as target inputs for roop-unleashed-style face-swap. Counterpart to the faceset (source-side) tooling. work/video_target_pipeline.py — orchestration with subcommands scan / scenes / stage / merge / track / score / cut / report. Quality gates default to face-sets-can-handle-side-profile values (yaw<=75°, pitch<=45°, face_short>=80px, det>=0.5). Cross-track segment merge fuses adjacent-in-time tracks within the same scene up to 2s gap. Output organized into <output_dir>/<source_stem>/<uuid>.mp4 + <uuid>.json sidecar with full provenance. work/video_face_worker.py — Windows DML face detect+embed worker. Uses JSONL append-only for results.jsonl: a critical perf fix (re- serializing the monolithic 245MB results.json on every flush was the dominant cost in the first attempt, dropping throughput to 0.5 fps). Append-only got it to 13+ fps, ~7.5 fps cumulative across the first 6.18h batch. Also uses seek-once-per-video + sequential cap.grab() between samples to dodge cv2 per-sample seek pathology on long H.264. Legacy results.json is auto-migrated to .jsonl on first load. work/run_video_pipeline.sh — generic chain driver, parameterized via WORK / INPUT_DIR / OUTPUT_DIR / FILTER_FROM / SKIP_PATTERN / MAX_DUR / IDENTITY env vars. work/status_video_pipeline.sh — generic status helper. First production batch (ct_src_00050..00062, 13 files, 6.18h input): 600 emitted segments, 239.5min accepted content (64.6% of input), 254 segments built from >=2 tracks (cross-track merge), 1h43m wall clock. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
917
work/video_target_pipeline.py
Normal file
917
work/video_target_pipeline.py
Normal file
@@ -0,0 +1,917 @@
|
||||
"""Video target preprocessing pipeline for roop-unleashed.
|
||||
|
||||
Discovers video files in an input folder, runs scene-cut detection, samples
|
||||
frames within each scene, runs face detection + embedding via Windows DML
|
||||
worker, stitches per-frame detections into face tracks, applies quality
|
||||
gates, cuts approved segments out with ffmpeg stream-copy, and writes a
|
||||
report. Output clips have generic UUID names + a sidecar JSON with full
|
||||
provenance.
|
||||
|
||||
Subcommands:
|
||||
scan list input videos, run ffprobe, write per-video index
|
||||
scenes PySceneDetect AdaptiveDetector per video; write scenes_<basename>.json
|
||||
stage write frame queue.json (sampled @ 2 fps within scenes)
|
||||
merge ingest worker results.json into per-video frame_results
|
||||
track IoU+embedding stitching of per-frame detections into tracks
|
||||
score track-level quality gating + segment plan
|
||||
cut ffmpeg -c copy each accepted segment to <out_dir>/<uuid>.mp4
|
||||
report HTML preview with thumbnails + identity tags
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
DEFAULT_INPUT = Path("/mnt/x/src/vd")
DEFAULT_OUTPUT = Path("/mnt/x/src/vd/ct")
WORK_DIR = Path("/opt/face-sets/work/video_preprocess")

# defaults — first set was strict-portrait; second set loosened for side-profile + segment merging
SAMPLE_FPS = 2.0  # frame sampling rate within each scene (samples/sec)
QUALITY_YAW_MAX = 75.0  # was 25; allow full 3/4 + profile (face-sets handle it)
QUALITY_PITCH_MAX = 45.0  # was 30
QUALITY_FACE_MIN = 80  # was 96; min short side of face bbox, px
QUALITY_BLUR_MIN = 50.0  # recorded in plan thresholds; not gated per-frame in _track_passes
QUALITY_DET_MIN = 0.5  # was 0.6; detector confidence floor
TRACK_GATE_FRAC = 0.7  # >=70% of frames in track must pass per-frame gates
SEGMENT_MIN_S = 1.0  # segments shorter than this are dropped
SEGMENT_MAX_S = 30.0  # was 10; longer segments are split evenly
SEGMENT_BRIDGE_S = 3.0  # was 1.0 — within-track pose-failure bridging
SEGMENT_MERGE_GAP_S = 2.0  # NEW — across-track merge if same scene + within this gap
TRACK_IOU_MIN = 0.3  # bbox IoU threshold for frame-to-track association
TRACK_EMB_MIN = 0.5  # embedding cosine threshold for frame-to-track association

# ArcFace embedding caches consumed by load_caches_index (missing files are skipped).
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
FACESETS_ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
IDENTITY_TAG_THRESHOLD = 0.6  # cosine sim to faceset centroid
|
||||
|
||||
|
||||
def wsl_to_win(p: str) -> str:
    """Convert a WSL ``/mnt/<drive>/...`` path to a Windows drive path.

    Anything not under ``/mnt/`` is returned unchanged.
    """
    text = str(p)
    if not text.startswith("/mnt/"):
        return text
    drive = text[5].upper()
    tail = text[7:].replace("/", "\\")
    return drive + ":\\" + tail
|
||||
|
||||
|
||||
# ----------------------------- ffprobe / scan -----------------------------
|
||||
|
||||
def ffprobe(video: Path) -> dict:
    """Run ffprobe on *video* and return its parsed JSON output.

    On a non-zero exit status, returns ``{"error": <stderr text>}`` instead of
    raising. A hung probe can still raise ``subprocess.TimeoutExpired`` (60s).
    """
    proc = subprocess.run(
        [
            "ffprobe", "-v", "error", "-print_format", "json",
            "-show_format", "-show_streams", str(video),
        ],
        capture_output=True,
        text=True,
        timeout=60,
    )
    if proc.returncode == 0:
        return json.loads(proc.stdout)
    return {"error": proc.stderr.strip()}
|
||||
|
||||
|
||||
def _to_float(value, default: float = 0.0) -> float:
    """Best-effort float conversion; ffprobe emits "N/A" for unknown fields."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _to_int(value, default: int = 0) -> int:
    """Best-effort int conversion (see _to_float)."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def parse_video_meta(probe: dict) -> dict:
    """Extract duration / fps / frame count / resolution / codec from ffprobe JSON.

    Returns ``{"error": ...}`` when the probe failed or there is no video
    stream. ffprobe reports unknown numeric fields as the string "N/A"; those
    are coerced to 0 instead of raising (the previous implementation crashed
    with ValueError on such inputs).
    """
    if "error" in probe:
        return {"error": probe["error"]}
    fmt = probe.get("format", {})
    duration = _to_float(fmt.get("duration", 0))
    vstream = next((s for s in probe.get("streams", []) if s.get("codec_type") == "video"), None)
    if vstream is None:
        return {"error": "no video stream"}
    # avg_frame_rate is a rational like "30000/1001"; a zero denominator means unknown.
    fps_str = vstream.get("avg_frame_rate", "0/1")
    try:
        num, den = (int(x) for x in fps_str.split("/"))
        fps = num / den if den else 0.0
    except Exception:
        fps = 0.0
    # Prefer the container's frame count; fall back to duration * fps.
    nb_frames = _to_int(vstream.get("nb_frames", 0)) or int(round(duration * fps))
    return {
        "duration_s": duration,
        "fps": fps,
        "frames": nb_frames,
        "width": _to_int(vstream.get("width", 0)),
        "height": _to_int(vstream.get("height", 0)),
        "codec": vstream.get("codec_name"),
    }
|
||||
|
||||
|
||||
def cmd_scan(args):
    """Discover input videos, ffprobe each, and write a JSON inventory.

    args: input (dir), out (inventory path), output_dir (excluded from the
    walk), recursive (bool). Writes {"input", "videos": [...]} to args.out;
    progress goes to stderr.
    """
    in_dir = Path(args.input)
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    extensions = {".mp4", ".mov", ".mkv", ".m4v", ".avi", ".webm"}
    out_root = Path(args.output_dir).resolve()
    videos = []
    for p in sorted(in_dir.iterdir() if not args.recursive else in_dir.rglob("*")):
        if not p.is_file():
            continue
        # NOTE(review): p.parents is not resolved, so a symlinked output dir
        # could slip past the first test; the == check covers the dir itself.
        if out_root in p.parents or p.resolve() == out_root:
            continue  # never include the output dir
        if p.suffix.lower() not in extensions:
            continue
        videos.append(p)
    print(f"[scan] {len(videos)} candidate videos", file=sys.stderr)
    inventory = []
    for p in videos:
        # Each entry is the parsed ffprobe metadata plus provenance fields;
        # probe failures are kept in the inventory as {"error": ...} entries.
        meta = parse_video_meta(ffprobe(p))
        meta["path"] = str(p)
        meta["win_path"] = wsl_to_win(str(p))  # for the Windows DML worker
        meta["size"] = p.stat().st_size
        inventory.append(meta)
        if "error" not in meta:
            print(f" {p.name}: {meta['duration_s']:.1f}s @ {meta['fps']:.1f}fps "
                  f"{meta['width']}x{meta['height']} {meta['codec']}", file=sys.stderr)
        else:
            print(f" {p.name}: ERROR {meta['error']}", file=sys.stderr)
    out.write_text(json.dumps({"input": str(in_dir), "videos": inventory}, indent=2))
    print(f"[scan] inventory -> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- scenes -----------------------------
|
||||
|
||||
def cmd_scenes(args):
    """Run PySceneDetect's AdaptiveDetector over each inventory video.

    Writes one <stem>.scenes.json per video into args.out_dir. Skips videos
    whose scenes file already exists unless args.force; args.only restricts
    processing to a comma-separated list of basenames. Errors are logged per
    video and do not abort the run.
    """
    # Imported lazily so the other subcommands work without scenedetect installed.
    from scenedetect import open_video, SceneManager
    from scenedetect.detectors import AdaptiveDetector
    inv = json.loads(Path(args.inventory).read_text())
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    only = set(args.only.split(",")) if args.only else None
    for v in inv["videos"]:
        if "error" in v:
            continue
        path = Path(v["path"])
        if only and path.name not in only:
            continue
        out_file = out_dir / (path.stem + ".scenes.json")
        if out_file.exists() and not args.force:
            continue
        print(f"[scenes] {path.name} ...", file=sys.stderr, flush=True)
        t0 = time.time()
        try:
            video = open_video(str(path))
            sm = SceneManager()
            # min_scene_len ~ half a second of frames at the source fps
            # (falls back to 30 fps when the probe reported 0/None).
            sm.add_detector(AdaptiveDetector(min_scene_len=int(round(v.get("fps", 30) or 30) * 0.5)))
            sm.detect_scenes(video, show_progress=False)
            scenes = sm.get_scene_list()
            entries = []
            for s, e in scenes:
                entries.append({
                    "start_frame": s.frame_num, "end_frame": e.frame_num,
                    "start_s": s.get_seconds(), "end_s": e.get_seconds(),
                    "duration_s": e.get_seconds() - s.get_seconds(),
                })
            # if no cuts found, treat the whole video as one scene
            if not entries:
                entries = [{
                    "start_frame": 0, "end_frame": v["frames"],
                    "start_s": 0.0, "end_s": v["duration_s"],
                    "duration_s": v["duration_s"],
                }]
            out_file.write_text(json.dumps({"video": str(path), "scenes": entries}, indent=2))
            print(f" {len(entries)} scenes in {time.time()-t0:.1f}s -> {out_file.name}",
                  file=sys.stderr)
        except Exception as e:
            # Best-effort per-video: log and move on to the next file.
            print(f" ERROR: {e}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- stage -----------------------------
|
||||
|
||||
def cmd_stage(args):
    """Write the frame-sampling queue consumed by the Windows worker.

    For every scene of every inventory video, samples times at args.sample_fps
    and emits one queue entry per sample with both WSL and Windows paths.
    Output is a JSON list at args.out.
    """
    inv = json.loads(Path(args.inventory).read_text())
    scenes_dir = Path(args.scenes_dir)
    queue = []
    qid = 0  # monotonically increasing id across all videos/scenes
    sample_every = 1.0 / args.sample_fps
    for v in inv["videos"]:
        if "error" in v:
            continue
        p = Path(v["path"])
        sf = scenes_dir / (p.stem + ".scenes.json")
        if not sf.exists():
            print(f"[warn] no scenes file for {p.name}; skipping", file=sys.stderr)
            continue
        scenes = json.loads(sf.read_text()).get("scenes", [])
        fps = v.get("fps", 30) or 30  # guard against probe fps of 0/None
        for sc in scenes:
            t = sc["start_s"]
            # 0.01s epsilon avoids emitting a sample exactly at the scene cut.
            while t < sc["end_s"] - 0.01:
                fidx = int(round(t * fps))
                if fidx >= v["frames"]:
                    break  # past the end of the container's frame count
                queue.append({
                    "queue_id": f"q{qid:08d}",
                    "video_path": str(p),
                    "win_video_path": v["win_path"],
                    "frame_idx": fidx,
                    "time_s": t,
                })
                qid += 1
                t += sample_every
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(queue, indent=2))
    print(f"[stage] {len(queue)} sampled frames @ {args.sample_fps} fps -> {out}",
          file=sys.stderr)
    print(f"[stage] win path for worker: {wsl_to_win(str(out))}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- merge + track -----------------------------
|
||||
|
||||
def _read_jsonl(path: Path) -> list:
    """Parse a JSON-Lines file into a list of records, skipping blank lines."""
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


def cmd_merge(args):
    """Read worker output and group by video_path. Supports either JSONL (one record
    per line, the new format) or legacy JSON (results.json with `results` list).

    Writes {"by_video": {path: [records sorted by frame_idx]}} to args.out.
    """
    src_path = Path(args.results)
    # Prefer the sister .jsonl next to the given path (the fast append-only
    # format); fall back to the path itself when it is already a .jsonl;
    # otherwise treat it as a legacy monolithic results.json.
    jsonl_candidate = src_path.with_suffix(".jsonl")
    if jsonl_candidate.exists():
        records = _read_jsonl(jsonl_candidate)
    elif src_path.suffix == ".jsonl":
        records = _read_jsonl(src_path)
    else:
        # legacy: monolithic JSON
        records = json.loads(src_path.read_text()).get("results", [])
    by_video: dict[str, list] = {}
    for r in records:
        by_video.setdefault(r["video_path"], []).append(r)
    for v in by_video:
        by_video[v].sort(key=lambda x: x["frame_idx"])
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": by_video}, indent=2))
    print(f"[merge] {sum(len(v) for v in by_video.values())} frames across {len(by_video)} videos "
          f"-> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
def _iou(a, b):
|
||||
ax1, ay1, ax2, ay2 = a
|
||||
bx1, by1, bx2, by2 = b
|
||||
ix1 = max(ax1, bx1); iy1 = max(ay1, by1)
|
||||
ix2 = min(ax2, bx2); iy2 = min(ay2, by2)
|
||||
iw = max(ix2 - ix1, 0); ih = max(iy2 - iy1, 0)
|
||||
inter = iw * ih
|
||||
ua = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
|
||||
return inter / ua if ua > 0 else 0.0
|
||||
|
||||
|
||||
def cmd_track(args):
    """Stitch per-frame face detections into tracks within each scene of each video.
    Track = list of (frame_idx, face_idx) where adjacent samples have IoU>=0.3 OR
    cosine(emb)>=0.5. New face → new track. No cross-scene merging.

    Reads args.frames (merge output), args.scenes_dir, args.inventory; writes
    {"by_video": {path: [track dicts]}} to args.out. Tracks with fewer than
    two members are discarded.
    """
    fr = json.loads(Path(args.frames).read_text())
    scenes_dir = Path(args.scenes_dir)
    inv = json.loads(Path(args.inventory).read_text())
    inv_by_path = {v["path"]: v for v in inv["videos"]}

    all_video_tracks: dict[str, list] = {}
    for video_path, frames in fr["by_video"].items():
        # NOTE(review): v is looked up but never used below — dead assignment?
        v = inv_by_path.get(video_path, {})
        sf = scenes_dir / (Path(video_path).stem + ".scenes.json")
        scenes = json.loads(sf.read_text()).get("scenes", []) if sf.exists() else []
        # group frames by scene
        scene_for_frame = {}
        for si, sc in enumerate(scenes):
            for f in frames:
                if f["frame_idx"] >= sc["start_frame"] and f["frame_idx"] < sc["end_frame"]:
                    scene_for_frame.setdefault(si, []).append(f)
        video_tracks = []
        for si, scene_frames in scene_for_frame.items():
            scene_frames.sort(key=lambda x: x["frame_idx"])
            # tracks = list of dict{ "members": [(frame_idx, face_idx, face_dict)], "last_bbox", "last_emb" }
            tracks = []
            for f in scene_frames:
                # each track may absorb at most one face per sampled frame
                claimed = set()
                for face_idx, face in enumerate(f.get("faces", [])):
                    bbox = face["bbox"]
                    emb = np.array(face.get("embedding", []), dtype=np.float32) if face.get("embedding") else None
                    best_track = None
                    best_score = 0.0
                    # greedy: pick the live track with the highest association score
                    for ti, tr in enumerate(tracks):
                        if ti in claimed:
                            continue
                        # staleness in TIME (sample period independent of source fps)
                        last_time = tr["members"][-1][3]
                        if f["time_s"] - last_time > 1.5: # stale if >1.5s gap (3 sample periods @ 2fps)
                            continue
                        score = _iou(tr["last_bbox"], bbox)
                        if emb is not None and tr.get("last_emb") is not None:
                            # embeddings are normalized upstream, so dot == cosine
                            score = max(score, float(np.dot(tr["last_emb"], emb)))
                        if score > best_score:
                            best_score = score
                            best_track = ti
                    # NOTE(review): min(TRACK_IOU_MIN, TRACK_EMB_MIN) = 0.3 is
                    # compared against max(IoU, cosine), so a cosine match in
                    # [0.3, 0.5) is accepted — looser than the docstring's
                    # "cosine>=0.5". Confirm whether this is intentional.
                    if best_track is not None and best_score >= min(TRACK_IOU_MIN, TRACK_EMB_MIN):
                        tr = tracks[best_track]
                        tr["members"].append((f["frame_idx"], face_idx, face, f["time_s"]))
                        tr["last_bbox"] = bbox
                        if emb is not None:
                            tr["last_emb"] = emb
                        claimed.add(best_track)
                    else:
                        # no acceptable match → start a new track for this face
                        tracks.append({
                            "members": [(f["frame_idx"], face_idx, face, f["time_s"])],
                            "last_bbox": bbox,
                            "last_emb": emb,
                        })
            for tr in tracks:
                if len(tr["members"]) < 2:
                    continue  # single-detection tracks are noise; drop them
                video_tracks.append({
                    "scene_idx": si,
                    "members": [
                        {"frame_idx": m[0], "face_idx": m[1], "time_s": m[3], "face": m[2]}
                        for m in tr["members"]
                    ],
                })
        all_video_tracks[video_path] = video_tracks
        print(f"[track] {Path(video_path).name}: {sum(len(s) for s in scene_for_frame.values())} frames "
              f"-> {len(video_tracks)} tracks across {len(scene_for_frame)} scenes",
              file=sys.stderr)

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"by_video": all_video_tracks}, indent=2))
    print(f"[track] -> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- score (quality gates) -----------------------------
|
||||
|
||||
def _track_passes(track, cfg):
|
||||
"""Per-frame quality gating; return list of bool (does each member pass) +
|
||||
aggregate stats. cfg: dict with yaw_max, pitch_max, face_min, det_min."""
|
||||
passes = []
|
||||
yaws, pitches, sizes, dets = [], [], [], []
|
||||
for m in track["members"]:
|
||||
f = m["face"]
|
||||
yaw = abs(f.get("pose", [0, 0, 0])[1]) if f.get("pose") else 0
|
||||
pitch = abs(f.get("pose", [0, 0, 0])[0]) if f.get("pose") else 0
|
||||
size = f.get("face_short", 0)
|
||||
det = f.get("det_score", 0)
|
||||
ok = (yaw <= cfg["yaw_max"] and pitch <= cfg["pitch_max"]
|
||||
and size >= cfg["face_min"] and det >= cfg["det_min"])
|
||||
passes.append(ok)
|
||||
yaws.append(yaw); pitches.append(pitch); sizes.append(size); dets.append(det)
|
||||
return passes, {
|
||||
"n": len(passes), "n_pass": sum(passes), "frac_pass": sum(passes) / max(1, len(passes)),
|
||||
"yaw_med": float(np.median(yaws)) if yaws else None,
|
||||
"pitch_med": float(np.median(pitches)) if pitches else None,
|
||||
"size_med": float(np.median(sizes)) if sizes else None,
|
||||
"det_med": float(np.median(dets)) if dets else None,
|
||||
}
|
||||
|
||||
|
||||
def _build_segments(track, cfg):
    """Return list of (start_s, end_s) accepted sub-segments of this track:
    contiguous runs of passing frames meeting min/max duration. Pose-failure
    spans <= cfg['bridge_s'] long get bridged across (handles momentary head
    turns / detection misses).

    Returns (runs, stats) where stats is the aggregate from _track_passes.
    Note: min/max duration is NOT enforced here — _split_long_segments does
    that later in cmd_score.
    """
    passes, stats = _track_passes(track, cfg)
    members = track["members"]
    if not members:
        return [], stats
    # bridge gaps of failing frames (any width) up to cfg["bridge_s"] seconds
    bridged = list(passes)
    n = len(bridged)
    i = 0
    while i < n:
        if bridged[i]:
            i += 1
            continue
        # find run of consecutive False starting at i
        j = i
        while j < n and not bridged[j]:
            j += 1
        # bridge if surrounded by True on both sides AND time gap <= bridge_s
        if i > 0 and j < n and bridged[i - 1] and bridged[j]:
            t_left = members[i - 1]["time_s"]
            t_right = members[j]["time_s"]
            if t_right - t_left <= cfg["bridge_s"]:
                for k in range(i, j):
                    bridged[k] = True
        i = j
    # find runs of True
    runs = []
    i = 0
    while i < n:
        if not bridged[i]:
            i += 1; continue
        j = i
        while j + 1 < n and bridged[j + 1]:
            j += 1
        s = members[i]["time_s"]
        # end is the time of the last passing sample plus one sample-period
        e = members[j]["time_s"] + 1.0 / max(SAMPLE_FPS, 1e-3)
        runs.append((s, e))
        i = j + 1
    return runs, stats
|
||||
|
||||
|
||||
def _merge_close_segments(segs_with_meta, merge_gap_s: float):
|
||||
"""Merge segments within the same scene that are within merge_gap_s of each other.
|
||||
segs_with_meta: list of dicts with start_s, end_s, scene_idx, track_idx, stats.
|
||||
Returns list of merged dicts (one per merged group). Identity-tag and stats
|
||||
aggregation happen later."""
|
||||
by_scene: dict[int, list] = {}
|
||||
for s in segs_with_meta:
|
||||
by_scene.setdefault(s["scene_idx"], []).append(s)
|
||||
merged_all = []
|
||||
for scene_idx, segs in by_scene.items():
|
||||
segs.sort(key=lambda x: x["start_s"])
|
||||
cur = None
|
||||
for s in segs:
|
||||
if cur is None:
|
||||
cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
|
||||
"pass_count": s["stats"]["n_pass"]}
|
||||
continue
|
||||
gap = s["start_s"] - cur["end_s"]
|
||||
if gap <= merge_gap_s:
|
||||
# merge
|
||||
cur["end_s"] = max(cur["end_s"], s["end_s"])
|
||||
cur["track_idxs"].append(s["track_idx"])
|
||||
cur["member_count"] += s["stats"]["n"]
|
||||
cur["pass_count"] += s["stats"]["n_pass"]
|
||||
# take the better-quality stats for display
|
||||
if s["stats"]["n_pass"] > cur["stats"]["n_pass"]:
|
||||
cur["stats"] = s["stats"]
|
||||
else:
|
||||
merged_all.append(cur)
|
||||
cur = {**s, "track_idxs": [s["track_idx"]], "member_count": s["stats"]["n"],
|
||||
"pass_count": s["stats"]["n_pass"]}
|
||||
if cur is not None:
|
||||
merged_all.append(cur)
|
||||
return merged_all
|
||||
|
||||
|
||||
def _split_long_segments(segs_with_meta, min_s: float, max_s: float):
|
||||
"""Apply min/max duration: drop too-short, split too-long evenly."""
|
||||
out = []
|
||||
for s in segs_with_meta:
|
||||
dur = s["end_s"] - s["start_s"]
|
||||
if dur < min_s:
|
||||
continue
|
||||
if dur <= max_s:
|
||||
out.append(s)
|
||||
continue
|
||||
n = int(math.ceil(dur / max_s))
|
||||
chunk = dur / n
|
||||
base_start = s["start_s"]
|
||||
for k in range(n):
|
||||
piece = dict(s)
|
||||
piece["start_s"] = base_start + k * chunk
|
||||
piece["end_s"] = base_start + (k + 1) * chunk
|
||||
out.append(piece)
|
||||
return out
|
||||
|
||||
|
||||
# identity tagging via cached arcface centroids
|
||||
def load_caches_index():
    """Load the ArcFace embedding caches listed in CACHES.

    Returns (rec_index, alias_map):
      rec_index: {(path, bbox_tuple): L2-normalized float32 embedding}
      alias_map: {any known path spelling: canonical path}
    Missing cache files are skipped silently.
    """
    rec_index = {}
    alias_map = {}
    for c in CACHES:
        if not c.exists():
            continue
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        # meta is a JSON string stored inside the npz
        meta = json.loads(str(d["meta"]))
        # embeddings array rows align with the face-bearing meta records only
        face_records = [m for m in meta if not m.get("noface")]
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n  # normalize so dot products are cosine similarities
            rec_index[(rec["path"], tuple(int(x) for x in rec["bbox"]))] = v
            alias_map.setdefault(rec["path"], rec["path"])
    return rec_index, alias_map
|
||||
|
||||
|
||||
def load_faceset_centroids():
    """Return dict faceset_name -> normalized centroid embedding.

    Walks FACESETS_ROOT, resolves each manifest face back to a cached
    embedding via load_caches_index, and averages them. Facesets with fewer
    than 3 resolvable embeddings are skipped, as are era-split subsets.
    """
    rec_index, alias_map = load_caches_index()
    centroids = {}
    for fs_dir in sorted(FACESETS_ROOT.iterdir()):
        if not fs_dir.is_dir() or fs_dir.name.startswith("_"):
            continue
        # exclude era splits to avoid double-tagging within a family
        if re.match(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)", fs_dir.name):
            continue
        mp = fs_dir / "manifest.json"
        if not mp.exists():
            continue
        m = json.loads(mp.read_text())
        vecs = []
        for f in m.get("faces", []):
            src = f.get("source"); bbox = f.get("bbox")
            if not src or not bbox:
                continue
            # look up by canonical path first, then the raw manifest path
            canon = alias_map.get(src, src)
            v = rec_index.get((canon, tuple(int(x) for x in bbox)))
            if v is None and canon != src:
                v = rec_index.get((src, tuple(int(x) for x in bbox)))
            if v is not None:
                vecs.append(v)
        if len(vecs) < 3:
            continue  # too few embeddings for a reliable centroid
        c = np.stack(vecs).mean(axis=0)
        n = float(np.linalg.norm(c))
        if n > 0:
            c = c / n
        centroids[fs_dir.name] = c
    return centroids
|
||||
|
||||
|
||||
def _track_centroid(track):
|
||||
embs = [m["face"].get("embedding") for m in track["members"] if m["face"].get("embedding")]
|
||||
if not embs:
|
||||
return None
|
||||
arr = np.array(embs, dtype=np.float32)
|
||||
c = arr.mean(axis=0)
|
||||
n = float(np.linalg.norm(c))
|
||||
return c / n if n > 0 else c
|
||||
|
||||
|
||||
def cmd_score(args):
    """Quality-gate tracks and emit the segment cutting plan.

    Pipeline per video: _build_segments on every track (with the track-level
    frac_pass gate), cross-track merge within scenes (_merge_close_segments),
    duration enforcement (_split_long_segments), then optional identity
    tagging against faceset centroids. Writes {thresholds, totals, plan} to
    args.out; each plan entry gets a fresh 12-hex uuid.
    """
    tr = json.loads(Path(args.tracks).read_text())
    inv = json.loads(Path(args.inventory).read_text())
    # NOTE(review): inv_by_path is currently unused below — confirm it can go.
    inv_by_path = {v["path"]: v for v in inv["videos"]}

    cfg = {
        "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
        "face_min": args.min_face, "det_min": args.min_det,
        "bridge_s": args.bridge_gap,
    }

    centroids = {}
    if not args.no_identity:
        print("[score] loading faceset centroids ...", file=sys.stderr)
        t0 = time.time()
        centroids = load_faceset_centroids()
        print(f"[score] {len(centroids)} active faceset centroids loaded in {time.time()-t0:.1f}s",
              file=sys.stderr)

    n_total_tracks = 0
    n_accepted_tracks = 0
    # collect per-track candidate segments first; merging happens per-video below
    per_video_candidates: dict[str, list] = {}
    track_centroids_by_video: dict[str, dict] = {}
    for video_path, tracks in tr["by_video"].items():
        per_video_candidates.setdefault(video_path, [])
        track_centroids_by_video.setdefault(video_path, {})
        for ti, track in enumerate(tracks):
            n_total_tracks += 1
            runs, stats = _build_segments(track, cfg)
            # track-level gate: enough of the track's frames must pass
            if stats["frac_pass"] < args.track_gate_frac:
                continue
            if not runs:
                continue
            n_accepted_tracks += 1
            # cache the centroid now; used for identity tagging after merging
            track_centroids_by_video[video_path][ti] = _track_centroid(track)
            for (s, e) in runs:
                per_video_candidates[video_path].append({
                    "video_path": video_path,
                    "track_idx": ti,
                    "scene_idx": track["scene_idx"],
                    "start_s": s,
                    "end_s": e,
                    "stats": stats,
                })

    plan = []
    for video_path, segs in per_video_candidates.items():
        if not segs:
            continue
        # merge across tracks within the same scene if gap <= merge_gap_s
        merged = _merge_close_segments(segs, args.merge_gap)
        # apply min/max duration (split long, drop short)
        merged = _split_long_segments(merged, args.min_dur, args.max_dur)
        for s in merged:
            tag = None
            tag_sim = None
            # identity from union of contributing tracks' centroids
            if centroids:
                track_centroid_list = [
                    track_centroids_by_video[video_path].get(ti)
                    for ti in s.get("track_idxs", [s.get("track_idx")])
                ]
                track_centroid_list = [c for c in track_centroid_list if c is not None]
                if track_centroid_list:
                    union = np.stack(track_centroid_list).mean(axis=0)
                    nm = float(np.linalg.norm(union))
                    if nm > 0:
                        union = union / nm
                    # all vectors normalized → dot product is cosine similarity
                    sims = {name: float(np.dot(c, union)) for name, c in centroids.items()}
                    best = max(sims, key=sims.get)
                    if sims[best] >= IDENTITY_TAG_THRESHOLD:
                        tag = best; tag_sim = round(sims[best], 4)
            plan.append({
                "video_path": video_path,
                "track_idxs": s.get("track_idxs", [s.get("track_idx")]),
                "scene_idx": s["scene_idx"],
                "start_s": round(s["start_s"], 3),
                "end_s": round(s["end_s"], 3),
                "duration_s": round(s["end_s"] - s["start_s"], 3),
                "member_count": s.get("member_count", s["stats"]["n"]),
                "pass_count": s.get("pass_count", s["stats"]["n_pass"]),
                "stats": s["stats"],
                "identity_tag": tag,
                "identity_sim": tag_sim,
                "uuid": uuid.uuid4().hex[:12],
            })

    plan.sort(key=lambda p: (p["video_path"], p["start_s"]))
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({
        "thresholds": {
            "yaw_max": args.max_yaw, "pitch_max": args.max_pitch,
            "face_min": args.min_face, "blur_min": QUALITY_BLUR_MIN,
            "det_min": args.min_det, "track_gate_frac": args.track_gate_frac,
            "bridge_s": args.bridge_gap, "merge_gap_s": args.merge_gap,
            "min_dur_s": args.min_dur, "max_dur_s": args.max_dur,
            "identity_tag_threshold": IDENTITY_TAG_THRESHOLD,
        },
        "totals": {
            "tracks_total": n_total_tracks, "tracks_accepted": n_accepted_tracks,
            "segments": len(plan),
        },
        "plan": plan,
    }, indent=2))
    print(f"[score] {n_accepted_tracks}/{n_total_tracks} tracks accepted -> {len(plan)} segments "
          f"-> {out}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- cut -----------------------------
|
||||
|
||||
def cmd_cut(args):
    """Cut each planned segment out of its source video with ffmpeg -c copy.

    Output layout: <output_dir>/<source_stem>/<uuid>.mp4 plus a <uuid>.json
    sidecar with full provenance. args.clean removes prior UUID-named outputs
    first; args.force re-cuts clips that already exist. Segments are skipped
    (not recut) when the target clip exists and --force is absent.
    """
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if args.clean:
        # remove only existing UUID-named clips + sidecars (12-char hex), keeping any other files
        import re as _re
        uuid_pat = _re.compile(r"^[0-9a-f]{12}\.(mp4|json)$")
        n_removed = 0
        for child in out_dir.iterdir():
            if child.is_file() and uuid_pat.match(child.name):
                child.unlink()
                n_removed += 1
            elif child.is_dir() and _re.match(r"^[A-Za-z0-9_.-]+$", child.name):
                # subfolder of prior runs — clear UUID files inside, then remove if empty
                for inner in child.iterdir():
                    if inner.is_file() and uuid_pat.match(inner.name):
                        inner.unlink()
                        n_removed += 1
                try:
                    child.rmdir()  # only succeeds when the dir is now empty
                except OSError:
                    pass
        if n_removed:
            print(f"[clean] removed {n_removed} prior UUID clips/sidecars", file=sys.stderr)

    n_done = 0
    n_err = 0
    sidecars = []
    for seg in plan["plan"]:
        # clips are grouped under a subfolder named after the source video
        sub = Path(seg["video_path"]).stem
        seg_dir = out_dir / sub
        seg_dir.mkdir(parents=True, exist_ok=True)
        out_video = seg_dir / f"{seg['uuid']}.mp4"
        if out_video.exists() and not args.force:
            continue
        s = seg["start_s"]; d = seg["duration_s"]
        # stream-copy cut: -ss before -i seeks to the nearest keyframe, so the
        # actual clip start may be slightly earlier than requested
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-t", f"{d}",
            "-c", "copy",
            "-avoid_negative_ts", "make_zero",
            str(out_video),
        ]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        # <1KiB output is treated as a failed cut (e.g. start past EOF)
        if r.returncode != 0 or not out_video.exists() or out_video.stat().st_size < 1024:
            print(f"[cut-err] {seg['uuid']} {seg['video_path']}@{s}+{d}: {r.stderr.strip()[:200]}",
                  file=sys.stderr)
            n_err += 1
            if out_video.exists() and out_video.stat().st_size < 1024:
                out_video.unlink()
            continue
        # sidecar (alongside the clip in the source-named subfolder)
        sidecar = seg_dir / f"{seg['uuid']}.json"
        sidecar.write_text(json.dumps({
            "uuid": seg["uuid"],
            "source_video": seg["video_path"],
            "source_basename": Path(seg["video_path"]).name,
            "start_s": s, "end_s": seg["end_s"], "duration_s": d,
            "scene_idx": seg["scene_idx"],
            "track_idxs": seg.get("track_idxs", [seg.get("track_idx")]),
            "member_count": seg.get("member_count"),
            "pass_count": seg.get("pass_count"),
            "stats": seg["stats"],
            "identity_tag": seg["identity_tag"],
            "identity_sim": seg["identity_sim"],
            "thresholds": plan["thresholds"],
        }, indent=2))
        sidecars.append(sidecar)
        n_done += 1
    print(f"[cut] {n_done} clips written, {n_err} errors -> {out_dir}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- report -----------------------------
|
||||
|
||||
def cmd_report(args):
    """Render an HTML review page for a cut plan.

    Writes ``<out>/index.html`` plus one 240px-wide JPEG thumbnail per
    segment under ``<out>/thumbs/``; each thumbnail links to the clip the
    ``cut`` step places at ``<output_dir>/<source_stem>/<uuid>.mp4``.

    Args:
        args: argparse namespace with ``plan`` (path to the score-plan
            JSON), ``out`` (report output directory) and ``output_dir``
            (root directory of the cut clips).
    """
    plan = json.loads(Path(args.plan).read_text())
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(exist_ok=True)
    output_dir = Path(args.output_dir)

    # group segments by source video: the report has one section per file
    by_video: dict[str, list] = {}
    for seg in plan["plan"]:
        by_video.setdefault(seg["video_path"], []).append(seg)

    # generate thumbs from each segment's first frame via ffmpeg
    print(f"[report] generating thumbs for {len(plan['plan'])} segments", file=sys.stderr)
    n_thumb_err = 0
    for seg in plan["plan"]:
        thumb = thumbs_dir / f"{seg['uuid']}.jpg"
        if thumb.exists():
            continue  # idempotent: re-runs only fill in missing thumbs
        # +0.1s avoids grabbing a frame straddling the scene-cut boundary
        s = seg["start_s"] + 0.1
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-ss", f"{s}",
            "-i", seg["video_path"],
            "-frames:v", "1",
            "-vf", "scale=240:-1",
            str(thumb),
        ]
        # FIX: the subprocess result was previously discarded — failed or
        # timed-out thumbnails were silent; now they are counted and warned.
        try:
            r = subprocess.run(cmd, capture_output=True, timeout=30)
        except subprocess.TimeoutExpired:
            n_thumb_err += 1
            continue
        if r.returncode != 0 or not thumb.exists():
            n_thumb_err += 1  # report still renders; the cell shows a broken image
    if n_thumb_err:
        print(f"[report] warning: {n_thumb_err} thumbnails failed", file=sys.stderr)

    # render
    rows = []
    rows.append("<h1>Video target preprocessing — review</h1>")
    t = plan["totals"]
    th = plan["thresholds"]
    rows.append(f"<p>Tracks accepted: {t['tracks_accepted']}/{t['tracks_total']}; "
                f"segments emitted: {t['segments']}.<br>"
                f"Thresholds: pose ≤{th['yaw_max']}°yaw / {th['pitch_max']}°pitch, "
                f"face_short ≥{th['face_min']}px, det ≥{th['det_min']}, "
                f"track-gate ≥{int(100*th['track_gate_frac'])}%, "
                f"duration {th['min_dur_s']}–{th['max_dur_s']}s. "
                f"Output dir: <code>{output_dir}</code></p>")
    nav = " · ".join(f"<a href='#v{i}'>{Path(v).name}</a>"
                     for i, v in enumerate(by_video.keys()))
    rows.append(f"<div class='nav'>{nav}</div>")
    for vi, (video_path, segs) in enumerate(by_video.items()):
        rows.append(f"<section id='v{vi}' class='vid'>")
        rows.append(f"<h2>{Path(video_path).name} <small>({len(segs)} segments)</small></h2>")
        rows.append("<div class='cells'>")
        for seg in sorted(segs, key=lambda x: x["start_s"]):
            stats = seg["stats"]
            tag = seg["identity_tag"] or ""
            tag_sim = seg["identity_sim"]
            tag_html = (f"<span class='tag'>{tag} ({tag_sim:.2f})</span>" if tag else "<span class='tag none'>untagged</span>")
            sub_name = Path(seg['video_path']).stem
            rows.append(
                f"<div class='cell'>"
                f"<a href='{output_dir}/{sub_name}/{seg['uuid']}.mp4'><img src='thumbs/{seg['uuid']}.jpg' loading='lazy'></a>"
                f"<div class='meta'>"
                f"<code>{sub_name}/{seg['uuid']}.mp4</code><br>"
                f"{seg['start_s']:.1f}s → {seg['end_s']:.1f}s ({seg['duration_s']:.1f}s)<br>"
                f"yaw={stats['yaw_med']:.0f}° size={stats['size_med']:.0f}px det={stats['det_med']:.2f}<br>"
                f"pass {stats['n_pass']}/{stats['n']}<br>"
                f"{tag_html}"
                f"</div></div>"
            )
        rows.append("</div></section>")
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Video targets review</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1, h2 {{ margin-top: 1em; }} h2 {{ border-bottom: 1px solid #333; padding-bottom: 4px; }}
small {{ color:#999; font-weight:normal; }}
section.vid {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
.cells {{ display:flex; flex-wrap:wrap; gap:8px; }}
.cell {{ background:#222; border-radius:4px; padding:6px; width:260px; font-size:11px; font-family:monospace; }}
.cell img {{ width:100%; height:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.4; }}
.tag {{ display:inline-block; padding:1px 6px; background:#5fa05f; color:#000; border-radius:2px; }}
.tag.none {{ background:#444; color:#aaa; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:12px; }}
a {{ color:#6cf; }}
code {{ background:#000; padding:1px 4px; border-radius:2px; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    # FIX: the page declares charset=utf-8 and contains non-Latin-1 glyphs
    # (→, ≤, ·); the default locale encoding (cp1252 on the Windows workers)
    # would raise UnicodeEncodeError, so write explicitly as UTF-8.
    out_html.write_text(html, encoding="utf-8")
    print(f"[report] -> {out_html}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- main -----------------------------
|
||||
|
||||
def main():
    """CLI entry point: build the subcommand parser and dispatch.

    Each subcommand (scan / scenes / stage / merge / track / score /
    cut / report) binds its handler via ``set_defaults(func=...)``; the
    selected handler receives the parsed namespace.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    p = commands.add_parser("scan")
    p.add_argument("--input", default=str(DEFAULT_INPUT))
    p.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    p.add_argument("--recursive", action="store_true")
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_scan)

    p = commands.add_parser("scenes")
    p.add_argument("--inventory", required=True)
    p.add_argument("--out-dir", required=True)
    p.add_argument("--only", default=None, help="comma-separated basenames to limit run")
    p.add_argument("--force", action="store_true")
    p.set_defaults(func=cmd_scenes)

    p = commands.add_parser("stage")
    p.add_argument("--inventory", required=True)
    p.add_argument("--scenes-dir", required=True)
    p.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_stage)

    p = commands.add_parser("merge")
    p.add_argument("--results", required=True)
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_merge)

    p = commands.add_parser("track")
    p.add_argument("--frames", required=True)
    p.add_argument("--scenes-dir", required=True)
    p.add_argument("--inventory", required=True)
    p.add_argument("--sample-fps", type=float, default=SAMPLE_FPS)
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_track)

    p = commands.add_parser("score")
    p.add_argument("--tracks", required=True)
    p.add_argument("--inventory", required=True)
    p.add_argument("--out", required=True)
    p.add_argument("--no-identity", action="store_true")
    p.add_argument("--max-yaw", type=float, default=QUALITY_YAW_MAX)
    p.add_argument("--max-pitch", type=float, default=QUALITY_PITCH_MAX)
    p.add_argument("--min-face", type=int, default=QUALITY_FACE_MIN)
    p.add_argument("--min-det", type=float, default=QUALITY_DET_MIN)
    p.add_argument("--track-gate-frac", type=float, default=TRACK_GATE_FRAC)
    p.add_argument("--bridge-gap", type=float, default=SEGMENT_BRIDGE_S,
                   help="bridge within-track failure gaps up to this many seconds")
    p.add_argument("--merge-gap", type=float, default=SEGMENT_MERGE_GAP_S,
                   help="merge across-track segments in same scene if within this gap")
    p.add_argument("--min-dur", type=float, default=SEGMENT_MIN_S)
    p.add_argument("--max-dur", type=float, default=SEGMENT_MAX_S)
    p.set_defaults(func=cmd_score)

    p = commands.add_parser("cut")
    p.add_argument("--plan", required=True)
    p.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    p.add_argument("--force", action="store_true")
    p.add_argument("--clean", action="store_true",
                   help="remove prior UUID-named clips before cutting (preserves non-UUID files)")
    p.set_defaults(func=cmd_cut)

    p = commands.add_parser("report")
    p.add_argument("--plan", required=True)
    p.add_argument("--output-dir", default=str(DEFAULT_OUTPUT))
    p.add_argument("--out", required=True)
    p.set_defaults(func=cmd_report)

    opts = parser.parse_args()
    opts.func(opts)
|
||||
|
||||
|
||||
# Script entry point: only dispatch the CLI when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user