Add post-export corpus maintenance pipeline
Adds four new orchestration scripts that operate on an already-built facesets_swap_ready/ to clean it up over time: - filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level quarantine at 40% domain dominance. - consolidate_facesets.py: duplicate-identity merger using complete-linkage centroid clustering on cached arcface embeddings. Single-linkage chains catastrophically (60-faceset clusters with min sim < 0); complete-linkage guarantees within-group sim >= edge. - age_extend_001.py: slots newly-added PNGs into existing era buckets of faceset_001 using the same anchor-fragment rule as age_split_001.py (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered. - dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three passes — cross-family SHA256 byte-dedup (preserves intra-family era duplication), within-faceset near-dup at sim >= 0.95, and a multi-face audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s on AMD Vega — ~7x embed_worker because input is 512x512 crops. Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged → 181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full reversibility. Master manifest gains masked[], merged[], plus per-run provenance blocks. Three new docs/analysis/ writeups cover model choice, threshold rationale, and per-pass run results. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
144
work/multiface_worker.py
Normal file
144
work/multiface_worker.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Windows / DirectML multi-face audit worker.
|
||||
|
||||
For every PNG in queue.json, run insightface FaceAnalysis and record how many
|
||||
faces were detected (filtering by det_score>=MIN_DET and face_short>=MIN_PIX).
|
||||
Surfaces the load-bearing roop invariant: each .fsz PNG must hold exactly one
|
||||
face, otherwise the loader's `extract_face_images` appends every detected face
|
||||
into the FaceSet and pollutes the averaged identity embedding.
|
||||
|
||||
CLI:
|
||||
py -3.12 multiface_worker.py <queue.json> <out_results.json> [--limit N]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image, ImageOps
|
||||
from insightface.app import FaceAnalysis
|
||||
|
||||
# Windows-side insightface model cache (shared with the embed worker's venv).
MODEL_ROOT = r"C:\face_embed_venv\models"
# Detections below this det_score are ignored when counting faces.
MIN_DET = 0.5
# Detections whose bbox short side is under this many pixels are ignored.
MIN_FACE_PIX = 40
# Checkpoint results to disk every N processed images (also time-based, see main()).
FLUSH_EVERY = 200
||||
def load_existing(out_path: Path):
    """Resume state from a prior results file.

    Returns ``(data, processed)`` where *data* is the parsed JSON dict and
    *processed* is the set of already-scored wsl paths. A missing or
    unparseable file yields ``(None, set())`` so the caller restarts from
    scratch rather than crashing on a corrupt checkpoint.
    """
    if out_path.exists():
        try:
            data = json.loads(out_path.read_text())
        except Exception as e:
            # Best-effort resume: warn and fall through to a fresh start.
            print(f"[warn] could not parse {out_path}: {e}; starting fresh", file=sys.stderr)
        else:
            return data, set(data.get("processed", []))
    return None, set()
|
||||
|
||||
|
||||
def save_atomic(out_path: Path, data: dict):
    """Persist *data* as pretty-printed JSON without risking a torn file.

    Writes to a sibling ``*.tmp.json`` first, then atomically renames it
    over *out_path* so a crash mid-write never leaves partial JSON behind.
    """
    staging = out_path.with_suffix(".tmp.json")
    staging.write_text(json.dumps(data, indent=2))
    os.replace(staging, out_path)
||||
|
||||
def main():
    """Score every queued PNG for face count, resumably and checkpointed.

    Reads a queue.json of entries (each with "wsl_path", "win_path",
    "faceset", "file" keys), runs insightface detection on each image, and
    appends {"face_count": N} records to the output JSON. Resumes from a
    prior output file by skipping entries whose wsl_path is already in its
    "processed" list.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("queue", type=Path)
    ap.add_argument("out", type=Path)
    ap.add_argument("--limit", type=int, default=None)
    args = ap.parse_args()

    queue = json.loads(args.queue.read_text())
    print(f"[queue] {len(queue)} entries from {args.queue}", flush=True)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    existing, processed = load_existing(args.out)
    if existing:
        print(f"[resume] {len(processed)} already scored", flush=True)
        # Keep prior results so the final file stays cumulative across runs.
        results = existing.get("results", [])
    else:
        results = []
    # wsl_path is the dedup key: it identifies an image across WSL/Windows views.
    pending = [e for e in queue if e["wsl_path"] not in processed]
    if args.limit is not None:
        pending = pending[: args.limit]
    print(f"[pending] {len(pending)} entries", flush=True)
    if not pending:
        print("[done] nothing to do")
        return

    # DML first, CPU fallback — insightface/onnxruntime picks the first
    # provider that loads successfully.
    print("[load] FaceAnalysis with DmlExecutionProvider", flush=True)
    app = FaceAnalysis(
        name="buffalo_l",
        root=MODEL_ROOT,
        providers=["DmlExecutionProvider", "CPUExecutionProvider"],
    )
    app.prepare(ctx_id=0, det_size=(640, 640))

    n_done = 0
    n_load_err = 0
    last_flush = time.time()
    t_start = time.time()

    def flush():
        # Atomic checkpoint: both results and the processed set, so a killed
        # run resumes exactly where it left off.
        save_atomic(args.out, {
            "results": results,
            "processed": sorted(processed),
        })

    for entry in pending:
        try:
            # Open via Windows path (this worker runs on the Windows side).
            with Image.open(entry["win_path"]) as im:
                # Honor EXIF orientation before detection.
                im = ImageOps.exif_transpose(im)
                im = im.convert("RGB")
                rgb = np.array(im)
                # insightface expects BGR (OpenCV convention); .copy() makes
                # the reversed view contiguous.
                # NOTE(review): ambiguous in the mangled diff whether this
                # line sat inside or just after the `with`; behavior is the
                # same either way since `rgb` is already materialized.
                bgr = rgb[:, :, ::-1].copy()
        except Exception as e:
            # Unreadable image: record it (face_count=-1) and move on rather
            # than aborting the whole run.
            n_load_err += 1
            results.append({
                "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"],
                "face_count": -1, "error": "load",
            })
            processed.add(entry["wsl_path"])
            n_done += 1
            continue

        faces = app.get(bgr)
        # Count only confident, non-tiny detections (thresholds at module top).
        kept = 0
        for f in faces:
            if float(f.det_score) < MIN_DET:
                continue
            x1, y1, x2, y2 = [int(round(v)) for v in f.bbox]
            # Short side of the bbox; max(...,0) guards degenerate boxes.
            short = min(max(x2 - x1, 0), max(y2 - y1, 0))
            if short < MIN_FACE_PIX:
                continue
            kept += 1

        results.append({
            "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"],
            "face_count": kept,
        })
        processed.add(entry["wsl_path"])
        n_done += 1

        # Checkpoint every FLUSH_EVERY images or at least every 30 seconds.
        if (n_done % FLUSH_EVERY == 0) or (time.time() - last_flush) > 30.0:
            flush()
            last_flush = time.time()
            elapsed = time.time() - t_start
            rate = n_done / max(0.1, elapsed)
            eta = (len(pending) - n_done) / max(0.1, rate) / 60.0
            print(f"[scan] {n_done}/{len(pending)} rate={rate:.2f} img/s eta={eta:.1f}min "
                  f"load_err={n_load_err}", flush=True)

    # Final checkpoint covers the tail that missed the periodic flush.
    flush()
    elapsed = time.time() - t_start
    print(f"[done] {n_done} scored, {n_load_err} load errors, {elapsed:.1f}s "
          f"({n_done/max(0.1,elapsed):.2f} img/s) -> {args.out}", flush=True)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user