Previously every video_target_pipeline cut wrote a <uuid>.json provenance sidecar alongside each <uuid>.mp4. The same provenance is already in the per-batch plan.json, so the per-clip sidecars are redundant unless a downstream tool wants each clip self-describing in isolation. - video_target_pipeline.py cut: new --write-sidecar flag, default off. - run_video_pipeline.sh: new SIDECAR env var (default "no"), passes --write-sidecar when SIDECAR=yes. - README + docs/analysis/video-target-preprocessing.md updated. The 1,984 already-emitted sidecars in /mnt/x/src/vd/ct/ct_src_*/ have been deleted (1.5 MB). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
128 lines
4.5 KiB
Bash
Executable File
128 lines
4.5 KiB
Bash
Executable File
#!/bin/bash
|
|
# Generic chain driver for the video target preprocessing pipeline.
|
|
#
|
|
# Usage:
|
|
# WORK=/path/to/workdir SKIP_PATTERN='ct_src_(0001[015]|005[0-9]|006[0-9])\.mp4' \
|
|
# bash run_video_pipeline.sh > /opt/face-sets/work/logs/<name>.log 2>&1
|
|
#
|
|
# Required env vars:
|
|
# WORK per-batch workdir (will hold scenes/, queue.json, results.jsonl, plan.json, review/)
|
|
#
|
|
# Optional env vars:
|
|
# INPUT_DIR default /mnt/x/src/vd
|
|
# OUTPUT_DIR default /mnt/x/src/vd/ct
|
|
# FILTER_FROM basename floor; only files with name >= this go in (e.g. ct_src_00050.mp4)
|
|
# SKIP_PATTERN regex of basenames to exclude (Python `re` syntax). Applied AFTER FILTER_FROM.
|
|
# MAX_DUR score --max-dur (default 120)
|
|
# IDENTITY "yes" to enable identity tagging; default "no"
|
|
# SIDECAR "yes" to emit <uuid>.json provenance sidecars; default "no"
|
|
|
|
set -e
|
|
|
|
: ${WORK:?WORK env var must point at a workdir}
|
|
: ${INPUT_DIR:=/mnt/x/src/vd}
|
|
: ${OUTPUT_DIR:=/mnt/x/src/vd/ct}
|
|
: ${MAX_DUR:=120}
|
|
: ${IDENTITY:=no}
|
|
: ${SIDECAR:=no}
|
|
|
|
mkdir -p "$WORK" "$WORK/scenes"
|
|
|
|
PY_WSL=/home/peter/face_sort_env/bin/python
|
|
PY_WIN="/mnt/c/face_embed_venv/Scripts/python.exe"
|
|
PIPELINE=/opt/face-sets/work/video_target_pipeline.py
|
|
WORKER=/opt/face-sets/work/video_face_worker.py
|
|
INVENTORY_FULL=/opt/face-sets/work/video_preprocess/inventory_full.json
|
|
|
|
ts() { date +"%Y-%m-%d %H:%M:%S"; }
|
|
log() { echo "[$(ts)] [$PHASE] $*"; }
|
|
|
|
PHASE="setup"
|
|
log "STARTED — host=$(hostname) pid=$$ work=$WORK"
|
|
log "config: input=$INPUT_DIR output=$OUTPUT_DIR filter_from=${FILTER_FROM:-<none>} skip_pattern=${SKIP_PATTERN:-<none>} max_dur=$MAX_DUR identity=$IDENTITY sidecar=$SIDECAR"
|
|
|
|
PHASE="inventory"
|
|
log "building subset inventory"
|
|
T0=$(date +%s)
|
|
# rebuild full inventory if missing
|
|
if [ ! -f "$INVENTORY_FULL" ]; then
|
|
log "(no full inventory cached — running fresh scan)"
|
|
$PY_WSL $PIPELINE scan --input "$INPUT_DIR" --output-dir "$OUTPUT_DIR" --out "$INVENTORY_FULL"
|
|
fi
|
|
$PY_WSL <<EOF
|
|
import json, re
|
|
from pathlib import Path
|
|
inv = json.load(open('$INVENTORY_FULL'))
|
|
subset = list(inv['videos'])
|
|
filter_from = '${FILTER_FROM}'
|
|
skip_pat = '${SKIP_PATTERN}'
|
|
if filter_from:
|
|
subset = [v for v in subset if Path(v['path']).name >= filter_from]
|
|
if skip_pat:
|
|
pat = re.compile(skip_pat)
|
|
subset = [v for v in subset if not pat.search(Path(v['path']).name)]
|
|
subset.sort(key=lambda v: v['path'])
|
|
inv['videos'] = subset
|
|
json.dump(inv, open('$WORK/inventory.json','w'), indent=2)
|
|
total_dur = sum(v.get('duration_s', 0) for v in inv['videos'] if 'error' not in v)
|
|
print(f' {len(inv["videos"])} videos, total {total_dur/3600:.2f}h input')
|
|
EOF
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="scenes"
|
|
log "PySceneDetect AdaptiveDetector across all videos (cached entries skipped)"
|
|
T0=$(date +%s)
|
|
$PY_WSL $PIPELINE scenes --inventory "$WORK/inventory.json" --out-dir "$WORK/scenes"
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="stage"
|
|
log "building frame queue @ 2 fps within scenes"
|
|
T0=$(date +%s)
|
|
$PY_WSL $PIPELINE stage --inventory "$WORK/inventory.json" --scenes-dir "$WORK/scenes" --out "$WORK/queue.json"
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="worker"
|
|
log "Windows DML face detect+embed (resumable; the slow one)"
|
|
T0=$(date +%s)
|
|
$PY_WIN $WORKER "$WORK/queue.json" "$WORK/results.json"
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="merge"
|
|
log "ingesting worker output (jsonl)"
|
|
T0=$(date +%s)
|
|
$PY_WSL $PIPELINE merge --results "$WORK/results.json" --out "$WORK/frames.json"
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="track"
|
|
log "stitching detections into tracks"
|
|
T0=$(date +%s)
|
|
$PY_WSL $PIPELINE track --frames "$WORK/frames.json" --scenes-dir "$WORK/scenes" \
|
|
--inventory "$WORK/inventory.json" --out "$WORK/tracks.json"
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="score"
|
|
log "scoring with relaxed gates + max-dur=$MAX_DUR identity=$IDENTITY"
|
|
T0=$(date +%s)
|
|
ID_FLAG=""
|
|
if [ "$IDENTITY" != "yes" ]; then ID_FLAG="--no-identity"; fi
|
|
$PY_WSL $PIPELINE score --tracks "$WORK/tracks.json" --inventory "$WORK/inventory.json" \
|
|
--out "$WORK/plan.json" --max-dur "$MAX_DUR" $ID_FLAG
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="cut"
|
|
log "ffmpeg stream-copy into per-source subfolders (no --clean)"
|
|
T0=$(date +%s)
|
|
SIDECAR_FLAG=""
|
|
if [ "$SIDECAR" = "yes" ]; then SIDECAR_FLAG="--write-sidecar"; fi
|
|
$PY_WSL $PIPELINE cut --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR" $SIDECAR_FLAG
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="report"
|
|
log "rendering HTML"
|
|
T0=$(date +%s)
|
|
$PY_WSL $PIPELINE report --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR" --out "$WORK/review"
|
|
log "done in $(($(date +%s)-T0))s"
|
|
|
|
PHASE="done"
|
|
log "PIPELINE COMPLETE — review at file://$WORK/review/index.html"
|