Add post-export corpus maintenance pipeline
Adds four new orchestration scripts that operate on an already-built facesets_swap_ready/ to clean it up over time: - filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level quarantine at 40% domain dominance. - consolidate_facesets.py: duplicate-identity merger using complete-linkage centroid clustering on cached arcface embeddings. Single-linkage chains catastrophically (60-faceset clusters with min sim < 0); complete-linkage guarantees within-group sim >= edge. - age_extend_001.py: slots newly-added PNGs into existing era buckets of faceset_001 using the same anchor-fragment rule as age_split_001.py (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered. - dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three passes — cross-family SHA256 byte-dedup (preserves intra-family era duplication), within-faceset near-dup at sim >= 0.95, and a multi-face audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s on AMD Vega — ~7x embed_worker because input is 512x512 crops. Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged → 181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full reversibility. Master manifest gains masked[], merged[], plus per-run provenance blocks. Three new docs/analysis/ writeups cover model choice, threshold rationale, and per-pass run results. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
144
work/multiface_worker.py
Normal file
144
work/multiface_worker.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Windows / DirectML multi-face audit worker.
|
||||
|
||||
For every PNG in queue.json, run insightface FaceAnalysis and record how many
|
||||
faces were detected (filtering by det_score>=MIN_DET and face_short>=MIN_PIX).
|
||||
Surfaces the load-bearing roop invariant: each .fsz PNG must hold exactly one
|
||||
face, otherwise the loader's `extract_face_images` appends every detected face
|
||||
into the FaceSet and pollutes the averaged identity embedding.
|
||||
|
||||
CLI:
|
||||
py -3.12 multiface_worker.py <queue.json> <out_results.json> [--limit N]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image, ImageOps
|
||||
from insightface.app import FaceAnalysis
|
||||
|
||||
# Windows-side insightface model cache (shared with the embed worker's venv).
MODEL_ROOT = r"C:\face_embed_venv\models"
# Detections below this det_score are ignored when counting faces.
MIN_DET = 0.5
# Detections whose bbox short side is under this many pixels are ignored.
MIN_FACE_PIX = 40
# Checkpoint results to disk every N processed images (also time-based, see main()).
FLUSH_EVERY = 200
||||
def load_existing(out_path: Path):
    """Resume state from a prior results file.

    Returns ``(data, processed)`` where *data* is the parsed JSON dict and
    *processed* is the set of already-scored wsl paths. A missing or
    unparseable file yields ``(None, set())`` so the caller restarts from
    scratch rather than crashing on a corrupt checkpoint.
    """
    if out_path.exists():
        try:
            data = json.loads(out_path.read_text())
        except Exception as e:
            # Best-effort resume: warn and fall through to a fresh start.
            print(f"[warn] could not parse {out_path}: {e}; starting fresh", file=sys.stderr)
        else:
            return data, set(data.get("processed", []))
    return None, set()
|
||||
|
||||
|
||||
def save_atomic(out_path: Path, data: dict):
    """Persist *data* as pretty-printed JSON without risking a torn file.

    Writes to a sibling ``*.tmp.json`` first, then atomically renames it
    over *out_path* so a crash mid-write never leaves partial JSON behind.
    """
    staging = out_path.with_suffix(".tmp.json")
    staging.write_text(json.dumps(data, indent=2))
    os.replace(staging, out_path)
||||
|
||||
def main():
    """Score every queued PNG for face count, resumably and checkpointed.

    Reads a queue.json of entries (each with "wsl_path", "win_path",
    "faceset", "file" keys), runs insightface detection on each image, and
    appends {"face_count": N} records to the output JSON. Resumes from a
    prior output file by skipping entries whose wsl_path is already in its
    "processed" list.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("queue", type=Path)
    ap.add_argument("out", type=Path)
    ap.add_argument("--limit", type=int, default=None)
    args = ap.parse_args()

    queue = json.loads(args.queue.read_text())
    print(f"[queue] {len(queue)} entries from {args.queue}", flush=True)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    existing, processed = load_existing(args.out)
    if existing:
        print(f"[resume] {len(processed)} already scored", flush=True)
        # Keep prior results so the final file stays cumulative across runs.
        results = existing.get("results", [])
    else:
        results = []
    # wsl_path is the dedup key: it identifies an image across WSL/Windows views.
    pending = [e for e in queue if e["wsl_path"] not in processed]
    if args.limit is not None:
        pending = pending[: args.limit]
    print(f"[pending] {len(pending)} entries", flush=True)
    if not pending:
        print("[done] nothing to do")
        return

    # DML first, CPU fallback — insightface/onnxruntime picks the first
    # provider that loads successfully.
    print("[load] FaceAnalysis with DmlExecutionProvider", flush=True)
    app = FaceAnalysis(
        name="buffalo_l",
        root=MODEL_ROOT,
        providers=["DmlExecutionProvider", "CPUExecutionProvider"],
    )
    app.prepare(ctx_id=0, det_size=(640, 640))

    n_done = 0
    n_load_err = 0
    last_flush = time.time()
    t_start = time.time()

    def flush():
        # Atomic checkpoint: both results and the processed set, so a killed
        # run resumes exactly where it left off.
        save_atomic(args.out, {
            "results": results,
            "processed": sorted(processed),
        })

    for entry in pending:
        try:
            # Open via Windows path (this worker runs on the Windows side).
            with Image.open(entry["win_path"]) as im:
                # Honor EXIF orientation before detection.
                im = ImageOps.exif_transpose(im)
                im = im.convert("RGB")
                rgb = np.array(im)
                # insightface expects BGR (OpenCV convention); .copy() makes
                # the reversed view contiguous.
                # NOTE(review): ambiguous in the mangled diff whether this
                # line sat inside or just after the `with`; behavior is the
                # same either way since `rgb` is already materialized.
                bgr = rgb[:, :, ::-1].copy()
        except Exception as e:
            # Unreadable image: record it (face_count=-1) and move on rather
            # than aborting the whole run.
            n_load_err += 1
            results.append({
                "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"],
                "face_count": -1, "error": "load",
            })
            processed.add(entry["wsl_path"])
            n_done += 1
            continue

        faces = app.get(bgr)
        # Count only confident, non-tiny detections (thresholds at module top).
        kept = 0
        for f in faces:
            if float(f.det_score) < MIN_DET:
                continue
            x1, y1, x2, y2 = [int(round(v)) for v in f.bbox]
            # Short side of the bbox; max(...,0) guards degenerate boxes.
            short = min(max(x2 - x1, 0), max(y2 - y1, 0))
            if short < MIN_FACE_PIX:
                continue
            kept += 1

        results.append({
            "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"],
            "face_count": kept,
        })
        processed.add(entry["wsl_path"])
        n_done += 1

        # Checkpoint every FLUSH_EVERY images or at least every 30 seconds.
        if (n_done % FLUSH_EVERY == 0) or (time.time() - last_flush) > 30.0:
            flush()
            last_flush = time.time()
            elapsed = time.time() - t_start
            rate = n_done / max(0.1, elapsed)
            eta = (len(pending) - n_done) / max(0.1, rate) / 60.0
            print(f"[scan] {n_done}/{len(pending)} rate={rate:.2f} img/s eta={eta:.1f}min "
                  f"load_err={n_load_err}", flush=True)

    # Final checkpoint covers the tail that missed the periodic flush.
    flush()
    elapsed = time.time() - t_start
    print(f"[done] {n_done} scored, {n_load_err} load errors, {elapsed:.1f}s "
          f"({n_done/max(0.1,elapsed):.2f} img/s) -> {args.out}", flush=True)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user