Add target-side video preprocessing pipeline
Preprocesses a folder of video files into UUID-named clips suitable as target inputs for roop-unleashed-style face-swap. Counterpart to the faceset (source-side) tooling. work/video_target_pipeline.py — orchestration with subcommands scan / scenes / stage / merge / track / score / cut / report. Quality gates default to face-sets-can-handle-side-profile values (yaw<=75°, pitch<=45°, face_short>=80px, det>=0.5). Cross-track segment merge fuses adjacent-in-time tracks within the same scene up to 2s gap. Output organized into <output_dir>/<source_stem>/<uuid>.mp4 + <uuid>.json sidecar with full provenance. work/video_face_worker.py — Windows DML face detect+embed worker. Uses JSONL append-only for results.jsonl: a critical perf fix (re- serializing the monolithic 245MB results.json on every flush was the dominant cost in the first attempt, dropping throughput to 0.5 fps). Append-only got it to 13+ fps, ~7.5 fps cumulative across the first 6.18h batch. Also uses seek-once-per-video + sequential cap.grab() between samples to dodge cv2 per-sample seek pathology on long H.264. Legacy results.json is auto-migrated to .jsonl on first load. work/run_video_pipeline.sh — generic chain driver, parameterized via WORK / INPUT_DIR / OUTPUT_DIR / FILTER_FROM / SKIP_PATTERN / MAX_DUR / IDENTITY env vars. work/status_video_pipeline.sh — generic status helper. First production batch (ct_src_00050..00062, 13 files, 6.18h input): 600 emitted segments, 239.5min accepted content (64.6% of input), 254 segments built from >=2 tracks (cross-track merge), 1h43m wall clock. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
123
work/run_video_pipeline.sh
Executable file
123
work/run_video_pipeline.sh
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/bin/bash
# Generic chain driver for the video target preprocessing pipeline.
#
# Usage:
#   WORK=/path/to/workdir SKIP_PATTERN='ct_src_(0001[015]|005[0-9]|006[0-9])\.mp4' \
#     bash run_video_pipeline.sh > /opt/face-sets/work/logs/<name>.log 2>&1
#
# Required env vars:
#   WORK          per-batch workdir (will hold scenes/, queue.json, results.jsonl, plan.json, review/)
#
# Optional env vars:
#   INPUT_DIR     default /mnt/x/src/vd
#   OUTPUT_DIR    default /mnt/x/src/vd/ct
#   FILTER_FROM   basename floor; only files with name >= this go in (e.g. ct_src_00050.mp4)
#   SKIP_PATTERN  regex of basenames to exclude (Python `re` syntax). Applied AFTER FILTER_FROM.
#   MAX_DUR       score --max-dur (default 120)
#   IDENTITY      "yes" to enable identity tagging; default "no"

# -e: abort on any unhandled failure; pipefail: a pipeline fails if any stage fails.
# (-u is deliberately omitted: FILTER_FROM / SKIP_PATTERN are legitimately unset.)
set -eo pipefail

# Assert required input; fill in defaults. Expansions quoted per shell hygiene.
: "${WORK:?WORK env var must point at a workdir}"
: "${INPUT_DIR:=/mnt/x/src/vd}"
: "${OUTPUT_DIR:=/mnt/x/src/vd/ct}"
: "${MAX_DUR:=120}"
: "${IDENTITY:=no}"

mkdir -p "$WORK" "$WORK/scenes"

# Tool locations — constants, so mark them readonly.
readonly PY_WSL=/home/peter/face_sort_env/bin/python
readonly PY_WIN="/mnt/c/face_embed_venv/Scripts/python.exe"
readonly PIPELINE=/opt/face-sets/work/video_target_pipeline.py
readonly WORKER=/opt/face-sets/work/video_face_worker.py
readonly INVENTORY_FULL=/opt/face-sets/work/video_preprocess/inventory_full.json
|
||||
|
||||
# ts: current timestamp for log lines (YYYY-mm-dd HH:MM:SS).
ts() { date +"%Y-%m-%d %H:%M:%S"; }
# log: timestamped, phase-tagged message to stdout. printf is used instead of
# echo so a message beginning with '-n'/'-e' or containing backslashes is
# printed literally rather than interpreted.
log() { printf '[%s] [%s] %s\n' "$(ts)" "$PHASE" "$*"; }
|
||||
|
||||
PHASE="setup"
log "STARTED — host=$(hostname) pid=$$ work=$WORK"
log "config: input=$INPUT_DIR output=$OUTPUT_DIR filter_from=${FILTER_FROM:-<none>} skip_pattern=${SKIP_PATTERN:-<none>} max_dur=$MAX_DUR identity=$IDENTITY"

PHASE="inventory"
log "building subset inventory"
T0=$(date +%s)
# Rebuild the (expensive) full-library inventory only when the cache is absent;
# otherwise reuse it and just filter below.
if [ ! -f "$INVENTORY_FULL" ]; then
  log "(no full inventory cached — running fresh scan)"
  "$PY_WSL" "$PIPELINE" scan --input "$INPUT_DIR" --output-dir "$OUTPUT_DIR" --out "$INVENTORY_FULL"
fi
|
||||
# Filter the cached full inventory down to this batch's subset and write
# $WORK/inventory.json. The heredoc delimiter is quoted ('EOF') so the Python
# source is literal; shell values are handed over via the environment instead
# of being interpolated into Python string literals — a quote or backslash in
# SKIP_PATTERN would otherwise break (or inject into) the generated code.
FILTER_FROM="${FILTER_FROM:-}" SKIP_PATTERN="${SKIP_PATTERN:-}" \
  INVENTORY_FULL="$INVENTORY_FULL" WORK="$WORK" "$PY_WSL" <<'EOF'
import json, os, re
from pathlib import Path

inv = json.load(open(os.environ['INVENTORY_FULL']))
subset = list(inv['videos'])
filter_from = os.environ['FILTER_FROM']
skip_pat = os.environ['SKIP_PATTERN']
if filter_from:
    # basename floor: keep only files whose name sorts >= FILTER_FROM
    subset = [v for v in subset if Path(v['path']).name >= filter_from]
if skip_pat:
    pat = re.compile(skip_pat)
    subset = [v for v in subset if not pat.search(Path(v['path']).name)]
subset.sort(key=lambda v: v['path'])
inv['videos'] = subset
with open(os.path.join(os.environ['WORK'], 'inventory.json'), 'w') as fh:
    json.dump(inv, fh, indent=2)
total_dur = sum(v.get('duration_s', 0) for v in inv['videos'] if 'error' not in v)
print(f' {len(inv["videos"])} videos, total {total_dur/3600:.2f}h input')
EOF
log "done in $(($(date +%s)-T0))s"
|
||||
|
||||
PHASE="scenes"
log "PySceneDetect AdaptiveDetector across all videos (cached entries skipped)"
scenes_t0=$(date +%s)
$PY_WSL $PIPELINE scenes --inventory "$WORK/inventory.json" --out-dir "$WORK/scenes"
scenes_t1=$(date +%s)
log "done in $(( scenes_t1 - scenes_t0 ))s"
|
||||
|
||||
PHASE="stage"
log "building frame queue @ 2 fps within scenes"
stage_t0=$(date +%s)
$PY_WSL $PIPELINE stage \
  --inventory "$WORK/inventory.json" \
  --scenes-dir "$WORK/scenes" \
  --out "$WORK/queue.json"
log "done in $(( $(date +%s) - stage_t0 ))s"
|
||||
|
||||
PHASE="worker"
# GPU-bound detect+embed pass on the Windows side; resumable, so re-running
# the script after an interruption continues where the worker stopped.
log "Windows DML face detect+embed (resumable; the slow one)"
T0=$(date +%s)
# NOTE(review): the worker is handed "$WORK/results.json", yet the surrounding
# tooling describes append-only results.jsonl output with auto-migration of a
# legacy results.json — presumably this argument is the legacy/base path;
# confirm against video_face_worker.py.
$PY_WIN $WORKER "$WORK/queue.json" "$WORK/results.json"
log "done in $(($(date +%s)-T0))s"
|
||||
|
||||
PHASE="merge"
log "ingesting worker output (jsonl)"
T0=$(date +%s)
# NOTE(review): --results points at results.json while the log line says
# jsonl; merge presumably resolves the sibling results.jsonl from this base
# path — verify in video_target_pipeline.py.
$PY_WSL $PIPELINE merge --results "$WORK/results.json" --out "$WORK/frames.json"
log "done in $(($(date +%s)-T0))s"
|
||||
|
||||
PHASE="track"
log "stitching detections into tracks"
track_t0=$(date +%s)
$PY_WSL $PIPELINE track \
  --frames "$WORK/frames.json" \
  --scenes-dir "$WORK/scenes" \
  --inventory "$WORK/inventory.json" \
  --out "$WORK/tracks.json"
log "done in $(( $(date +%s) - track_t0 ))s"
|
||||
|
||||
PHASE="score"
log "scoring with relaxed gates + max-dur=$MAX_DUR identity=$IDENTITY"
T0=$(date +%s)
# Collect optional flags in an array: "${id_args[@]}" expands to zero or more
# intact words, unlike the previous unquoted $ID_FLAG string, which depended
# on word-splitting and was exposed to globbing.
id_args=()
if [ "$IDENTITY" != "yes" ]; then id_args+=(--no-identity); fi
$PY_WSL $PIPELINE score --tracks "$WORK/tracks.json" --inventory "$WORK/inventory.json" \
  --out "$WORK/plan.json" --max-dur "$MAX_DUR" "${id_args[@]}"
log "done in $(($(date +%s)-T0))s"
|
||||
|
||||
PHASE="cut"
log "ffmpeg stream-copy into per-source subfolders (no --clean)"
cut_started=$(date +%s)
$PY_WSL $PIPELINE cut --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR"
cut_elapsed=$(( $(date +%s) - cut_started ))
log "done in ${cut_elapsed}s"
|
||||
|
||||
PHASE="report"
log "rendering HTML"
report_t0=$(date +%s)
$PY_WSL $PIPELINE report --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR" --out "$WORK/review"
log "done in $(( $(date +%s) - report_t0 ))s"

PHASE="done"
log "PIPELINE COMPLETE — review at file://$WORK/review/index.html"
|
||||
Reference in New Issue
Block a user