diff --git a/README.md b/README.md
index d77b6f5..67091df 100644
--- a/README.md
+++ b/README.md
@@ -343,7 +343,7 @@ clean it up over time:
 | `work/consolidate_facesets.py` | Merge duplicate identities (centroid cosine sim ≥ 0.55 with confident ≥ 0.65, **complete-linkage** to defeat single-link chaining). Pulls embeddings from cache, no GPU. See `docs/analysis/identity-consolidation-and-age-extend.md`. |
 | `work/age_extend_001.py` | Slot newly-added PNGs into existing era buckets of `faceset_001` (anchor cosine distance ≤ 0.40 AND `|year_delta|` ≤ 5). Same anchor-fragment rule as `age_split_001.py`. |
 | `work/dedup_optimize.py` (+ Windows `work/multiface_worker.py`) | (a) cross-family SHA256 byte-dedup, (b) within-faceset near-dup at cosine sim ≥ 0.95, (c) multi-face audit (re-detect via insightface, drop PNGs with face_count ≠ 1). Multi-face is the load-bearing roop invariant. See `docs/analysis/dedup-and-roop-optimization.md`. |
-| `work/video_target_pipeline.py` (+ Windows `work/video_face_worker.py` + `work/run_video_pipeline.sh` chain) | Target-side preprocessing: scan a folder of videos → PySceneDetect shot-cuts → 2 fps frame sampling → DML face detection + embedding → IoU+embedding tracking → quality-gated segments (yaw≤75°, face≥80px, det≥0.5, ≥70% pass-rate, 1–120s duration, 2s cross-track merge gap) → ffmpeg stream-copy into UUID-named clips with sidecar JSON. Output organized into per-source subfolders. See `docs/analysis/video-target-preprocessing.md`. |
+| `work/video_target_pipeline.py` (+ Windows `work/video_face_worker.py` + `work/run_video_pipeline.sh` chain) | Target-side preprocessing: scan a folder of videos → PySceneDetect shot-cuts → 2 fps frame sampling → DML face detection + embedding → IoU+embedding tracking → quality-gated segments (yaw≤75°, face≥80px, det≥0.5, ≥70% pass-rate, 1–120s duration, 2s cross-track merge gap) → ffmpeg stream-copy into UUID-named clips. Output organized into per-source subfolders. Provenance sidecars are opt-in (`cut --write-sidecar` or `SIDECAR=yes` env var); the full plan is always retained in the per-batch `plan.json`. See `docs/analysis/video-target-preprocessing.md`. |
 
 All four operate idempotently and reversibly: dropped PNGs go to `/faces/_dropped/`, quarantined whole facesets go to
diff --git a/docs/analysis/video-target-preprocessing.md b/docs/analysis/video-target-preprocessing.md
index ac914f5..5bc2652 100644
--- a/docs/analysis/video-target-preprocessing.md
+++ b/docs/analysis/video-target-preprocessing.md
@@ -38,10 +38,11 @@ run_video_pipeline.sh (chain driver)
   └─ report (HTML preview)
 
 Output: <source>/<uuid>.mp4
-        <source>/<uuid>.json (sidecar)
+        <source>/<uuid>.json (sidecar; opt-in via
+        --write-sidecar)
 ```
-`run_video_pipeline.sh` is parameterized via env vars (`WORK`, `INPUT_DIR`, `OUTPUT_DIR`, `FILTER_FROM`, `SKIP_PATTERN`, `MAX_DUR`, `IDENTITY`) so you can pin a particular batch without editing the script.
+`run_video_pipeline.sh` is parameterized via env vars (`WORK`, `INPUT_DIR`, `OUTPUT_DIR`, `FILTER_FROM`, `SKIP_PATTERN`, `MAX_DUR`, `IDENTITY`, `SIDECAR`) so you can pin a particular batch without editing the script. Sidecars are off by default — the per-batch `plan.json` always carries the full provenance for every clip; the `.json` files alongside the clips are redundant and only useful if you need each clip to be self-describing in isolation.
 
 ## 3. Quality signals (matched to inswapper_128's working envelope)
diff --git a/work/run_video_pipeline.sh b/work/run_video_pipeline.sh
index ec4cc86..91ebb48 100755
--- a/work/run_video_pipeline.sh
+++ b/work/run_video_pipeline.sh
@@ -15,6 +15,7 @@
 # SKIP_PATTERN  regex of basenames to exclude (Python `re` syntax). Applied AFTER FILTER_FROM.
 # MAX_DUR       score --max-dur (default 120)
 # IDENTITY      "yes" to enable identity tagging; default "no"
+# SIDECAR       "yes" to emit .json provenance sidecars; default "no"
 
 set -e
@@ -23,6 +24,7 @@ set -e
 : ${OUTPUT_DIR:=/mnt/x/src/vd/ct}
 : ${MAX_DUR:=120}
 : ${IDENTITY:=no}
+: ${SIDECAR:=no}
 
 mkdir -p "$WORK" "$WORK/scenes"
@@ -37,7 +39,7 @@ log() { echo "[$(ts)] [$PHASE] $*"; }
 PHASE="setup"
 log "STARTED — host=$(hostname) pid=$$ work=$WORK"
-log "config: input=$INPUT_DIR output=$OUTPUT_DIR filter_from=${FILTER_FROM:-} skip_pattern=${SKIP_PATTERN:-} max_dur=$MAX_DUR identity=$IDENTITY"
+log "config: input=$INPUT_DIR output=$OUTPUT_DIR filter_from=${FILTER_FROM:-} skip_pattern=${SKIP_PATTERN:-} max_dur=$MAX_DUR identity=$IDENTITY sidecar=$SIDECAR"
 
 PHASE="inventory"
 log "building subset inventory"
@@ -110,7 +112,9 @@ log "done in $(($(date +%s)-T0))s"
 PHASE="cut"
 log "ffmpeg stream-copy into per-source subfolders (no --clean)"
 T0=$(date +%s)
-$PY_WSL $PIPELINE cut --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR"
+SIDECAR_FLAG=""
+if [ "$SIDECAR" = "yes" ]; then SIDECAR_FLAG="--write-sidecar"; fi
+$PY_WSL $PIPELINE cut --plan "$WORK/plan.json" --output-dir "$OUTPUT_DIR" $SIDECAR_FLAG
 log "done in $(($(date +%s)-T0))s"
 
 PHASE="report"
diff --git a/work/video_target_pipeline.py b/work/video_target_pipeline.py
index 31cf864..b2614be 100644
--- a/work/video_target_pipeline.py
+++ b/work/video_target_pipeline.py
@@ -722,23 +722,23 @@ def cmd_cut(args):
             if out_video.exists() and out_video.stat().st_size < 1024:
                 out_video.unlink()
             continue
-        # sidecar (alongside the clip in the source-named subfolder)
-        sidecar = seg_dir / f"{seg['uuid']}.json"
-        sidecar.write_text(json.dumps({
-            "uuid": seg["uuid"],
-            "source_video": seg["video_path"],
-            "source_basename": Path(seg["video_path"]).name,
-            "start_s": s, "end_s": seg["end_s"], "duration_s": d,
-            "scene_idx": seg["scene_idx"],
-            "track_idxs": seg.get("track_idxs", [seg.get("track_idx")]),
-            "member_count": seg.get("member_count"),
-            "pass_count": seg.get("pass_count"),
-            "stats": seg["stats"],
-            "identity_tag": seg["identity_tag"],
-            "identity_sim": seg["identity_sim"],
-            "thresholds": plan["thresholds"],
-        }, indent=2))
-        sidecars.append(sidecar)
+        if args.write_sidecar:
+            sidecar = seg_dir / f"{seg['uuid']}.json"
+            sidecar.write_text(json.dumps({
+                "uuid": seg["uuid"],
+                "source_video": seg["video_path"],
+                "source_basename": Path(seg["video_path"]).name,
+                "start_s": s, "end_s": seg["end_s"], "duration_s": d,
+                "scene_idx": seg["scene_idx"],
+                "track_idxs": seg.get("track_idxs", [seg.get("track_idx")]),
+                "member_count": seg.get("member_count"),
+                "pass_count": seg.get("pass_count"),
+                "stats": seg["stats"],
+                "identity_tag": seg["identity_tag"],
+                "identity_sim": seg["identity_sim"],
+                "thresholds": plan["thresholds"],
+            }, indent=2))
+            sidecars.append(sidecar)
         n_done += 1
 
     print(f"[cut] {n_done} clips written, {n_err} errors -> {out_dir}", file=sys.stderr)
@@ -901,6 +901,8 @@ def main():
     cu.add_argument("--force", action="store_true")
     cu.add_argument("--clean", action="store_true",
                     help="remove prior UUID-named clips before cutting (preserves non-UUID files)")
+    cu.add_argument("--write-sidecar", action="store_true",
+                    help="emit .json provenance sidecar alongside each clip (default off)")
     cu.set_defaults(func=cmd_cut)
 
     rp = sub.add_parser("report")