#!/bin/bash
# Generic chain driver for the video target preprocessing pipeline.
#
# Usage:
#   WORK=/path/to/workdir SKIP_PATTERN='ct_src_(0001[015]|005[0-9]|006[0-9])\.mp4' \
#     bash run_video_pipeline.sh > /opt/face-sets/work/logs/<name>.log 2>&1
#
# Required env vars:
#   WORK          per-batch workdir (will hold scenes/, queue.json, results.json,
#                 plan.json, review/)  # NOTE(review): was documented as
#                 "results.jsonl" but every command below uses results.json
#
# Optional env vars:
#   INPUT_DIR     default /mnt/x/src/vd
#   OUTPUT_DIR    default /mnt/x/src/vd/ct
#   FILTER_FROM   basename floor; only files with name >= this go in
#                 (e.g. ct_src_00050.mp4)
#   SKIP_PATTERN  regex of basenames to exclude (Python `re` syntax).
#                 Applied AFTER FILTER_FROM.
#   MAX_DUR       score --max-dur (default 120)
#   IDENTITY      "yes" to enable identity tagging; default "no"
#   SIDECAR       "yes" to emit .json provenance sidecars; default "no"

# -e: abort on unhandled failure; -u: unset var is an error (all vars below are
# asserted/defaulted first); pipefail: a failing pipeline stage fails the script.
set -euo pipefail

: "${WORK:?WORK env var must point at a workdir}"
: "${INPUT_DIR:=/mnt/x/src/vd}"
: "${OUTPUT_DIR:=/mnt/x/src/vd/ct}"
: "${MAX_DUR:=120}"
: "${IDENTITY:=no}"
: "${SIDECAR:=no}"

mkdir -p "$WORK" "$WORK/scenes"

# Interpreters/scripts: WSL-side Python drives the pipeline; the Windows-side
# Python runs the DirectML face worker.
readonly PY_WSL=/home/peter/face_sort_env/bin/python
readonly PY_WIN="/mnt/c/face_embed_venv/Scripts/python.exe"
readonly PIPELINE=/opt/face-sets/work/video_target_pipeline.py
readonly WORKER=/opt/face-sets/work/video_face_worker.py
readonly INVENTORY_FULL=/opt/face-sets/work/video_preprocess/inventory_full.json

ts()  { date +"%Y-%m-%d %H:%M:%S"; }
# Every log line is tagged with the current PHASE so a tail of the log shows
# at a glance which stage is running.
log() { echo "[$(ts)] [$PHASE] $*"; }

PHASE="setup"
log "STARTED — host=$(hostname) pid=$$ work=$WORK"
log "config: input=$INPUT_DIR output=$OUTPUT_DIR filter_from=${FILTER_FROM:-} skip_pattern=${SKIP_PATTERN:-} max_dur=$MAX_DUR identity=$IDENTITY sidecar=$SIDECAR"

PHASE="inventory"
log "building subset inventory"
T0=$(date +%s)
# rebuild full inventory if missing
if [ ! -f "$INVENTORY_FULL" ]; then
  log "(no full inventory cached — running fresh scan)"
  "$PY_WSL" "$PIPELINE" scan --input "$INPUT_DIR" --output-dir "$OUTPUT_DIR" --out "$INVENTORY_FULL"
fi
# Filter the cached full inventory down to this batch's subset and write
# $WORK/inventory.json. NOTE(review): the heredoc header and its opening
# Python lines were garbled in the source paste; the load/filter_from
# preamble below is reconstructed from the surviving tail — confirm against
# version control before relying on it.
"$PY_WSL" - <<EOF
import json, re
from pathlib import Path

inv = json.load(open('$INVENTORY_FULL'))
filter_from = '${FILTER_FROM:-}'
skip_pat = r'''${SKIP_PATTERN:-}'''
subset = inv['videos']
if filter_from:
    # keep only basenames lexically >= the floor
    subset = [v for v in subset if Path(v['path']).name >= filter_from]
if skip_pat:
    pat = re.compile(skip_pat)
    subset = [v for v in subset if not pat.search(Path(v['path']).name)]
subset.sort(key=lambda v: v['path'])
inv['videos'] = subset
json.dump(inv, open('$WORK/inventory.json','w'), indent=2)
total_dur = sum(v.get('duration_s', 0) for v in inv['videos'] if 'error' not in v)
print(f' {len(inv["videos"])} videos, total {total_dur/3600:.2f}h input')
EOF
log "done in $(($(date +%s)-T0))s"

PHASE="scenes"
log "PySceneDetect AdaptiveDetector across all videos (cached entries skipped)"
T0=$(date +%s)
"$PY_WSL" "$PIPELINE" scenes --inventory "$WORK/inventory.json" --out-dir "$WORK/scenes"
log "done in $(($(date +%s)-T0))s"

PHASE="stage"
log "building frame queue @ 2 fps within scenes"
T0=$(date +%s)
"$PY_WSL" "$PIPELINE" stage --inventory "$WORK/inventory.json" --scenes-dir "$WORK/scenes" --out "$WORK/queue.json"
log "done in $(($(date +%s)-T0))s"

PHASE="worker"
log "Windows DML face detect+embed (resumable; the slow one)"
T0=$(date +%s)
"$PY_WIN" "$WORKER" "$WORK/queue.json" "$WORK/results.json"
log "done in $(($(date +%s)-T0))s"

PHASE="merge"
log "ingesting worker output (jsonl)"
T0=$(date +%s)
"$PY_WSL" "$PIPELINE" merge --results "$WORK/results.json" --out "$WORK/frames.json"
log "done in $(($(date +%s)-T0))s"

PHASE="track"
log "stitching detections into tracks"
T0=$(date +%s)
"$PY_WSL" "$PIPELINE" track --frames "$WORK/frames.json" --scenes-dir "$WORK/scenes" \
  --inventory "$WORK/inventory.json" --out "$WORK/tracks.json"
log "done in $(($(date +%s)-T0))s"

PHASE="score"
log "scoring with relaxed gates + max-dur=$MAX_DUR identity=$IDENTITY"
T0=$(date +%s)
ID_FLAG=""
if [ "$IDENTITY" != "yes" ]; then ID_FLAG="--no-identity"; fi
# $ID_FLAG intentionally unquoted: empty-or-single-flag, never contains spaces.
"$PY_WSL" "$PIPELINE" score --tracks "$WORK/tracks.json" --inventory "$WORK/inventory.json" \
  --out "$WORK/plan.json" --max-dur "$MAX_DUR" $ID_FLAG
log "done in $(($(date +%s)-T0))s"

PHASE="cut"
log "ffmpeg stream-copy into per-source subfolders (no --clean)"
T0=$(date +%s)
SIDECAR_FLAG=""
if [ "$SIDECAR" = "yes" ]; then SIDECAR_FLAG="--write-sidecar"; fi
# $SIDECAR_FLAG intentionally unquoted: empty-or-single-flag, never contains spaces.
"$PY_WSL" "$PIPELINE" cut --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR" $SIDECAR_FLAG
log "done in $(($(date +%s)-T0))s"

PHASE="report"
log "rendering HTML"
T0=$(date +%s)
"$PY_WSL" "$PIPELINE" report --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR" --out "$WORK/review"
log "done in $(($(date +%s)-T0))s"

PHASE="done"
log "PIPELINE COMPLETE — review at file://$WORK/review/index.html"