Add post-export corpus maintenance pipeline

Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores
  via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level
  quarantine at 40% domain dominance.

- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= edge.

- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered.

- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three
  passes — cross-family SHA256 byte-dedup (preserves intra-family era
  duplication), within-faceset near-dup at sim >= 0.95, and a multi-face
  audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s
  on AMD Vega — ~7x embed_worker because input is 512x512 crops.

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks.

Three new docs/analysis/ writeups cover model choice, threshold rationale,
and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-27 15:41:18 +02:00
parent e66c97fd58
commit 49a43c7685
10 changed files with 3250 additions and 1 deletions

221
work/clip_worker.py Normal file
View File

@@ -0,0 +1,221 @@
"""Windows / DirectML CLIP worker for occlusion scoring.
Reads a queue.json staged by /opt/face-sets/work/filter_occlusions.py (WSL side),
runs open_clip ViT-L-14 (dfn2b_s39b) on each PNG via torch-directml on the AMD
Vega, and writes a scores.json with mask + sunglasses softmax probabilities.
CLI:
py -3.12 clip_worker.py <queue.json> <out_scores.json> [--limit N] [--batch 8]
queue.json shape: list of objects
{"wsl_path": "...", "win_path": "E:\\...\\faceset_NNN\\faces\\NNNN.png",
"faceset": "faceset_NNN", "file": "NNNN.png"}
scores.json shape:
{"model": "ViT-L-14/dfn2b_s39b",
"logit_scale": 100.0,
"prompts": {...},
"results": [{"wsl_path": "...", "faceset": "...", "file": "...",
"mask": float, "sunglasses": float}],
"processed": [wsl_path, ...]}
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
import warnings
from pathlib import Path
# DML emits a verbose UserWarning per attention call -- silence at import time
warnings.filterwarnings("ignore", category=UserWarning)
import torch
import torch_directml
import open_clip
from PIL import Image
# open_clip model identifier and pretrained tag (DFN-2B, 39B samples seen).
MODEL_NAME = "ViT-L-14"
PRETRAINED = "dfn2b_s39b"
# Zero-shot prompt ensembles. For each attribute, scoring is a two-way
# softmax over the averaged-and-renormalized "pos" vs "neg" prompt
# embeddings (see build_text_features / main).
# kept in sync with /opt/face-sets/work/filter_occlusions.py PROMPTS
PROMPTS = {
    "mask": {
        "pos": [
            "a photo of a person wearing a surgical face mask",
            "a photo of a person wearing an FFP2 respirator covering mouth and nose",
            "a photo of a person wearing a cloth face mask",
            "a face partially covered by a medical mask",
            "a person whose mouth and nose are hidden by a face mask",
        ],
        "neg": [
            "a photo of a person's face with mouth and nose clearly visible",
            "a clear, unobstructed photo of a face",
            "a photo of a face without any mask or covering",
            "a portrait of a person showing their full face",
            # beard explicitly in the negative set so facial hair is not
            # confused with a mask
            "a photo of a person with a beard and visible mouth",
        ],
    },
    "sunglasses": {
        "pos": [
            "a face with dark sunglasses covering the eyes",
            "a portrait with the eyes hidden behind opaque sunglasses",
            "a person wearing dark sunglasses over their eyes, eyes not visible",
            "a face where the eyes are completely concealed by tinted lenses",
            "a close-up portrait wearing aviator sunglasses on the eyes",
        ],
        "neg": [
            "a portrait with both eyes clearly visible and uncovered",
            # pushed-up / resting / hanging sunglasses are negatives: the
            # filter should only fire when the eyes themselves are hidden
            "a face with sunglasses pushed up on the forehead, eyes visible below",
            "a face with sunglasses resting on top of the head, eyes visible",
            "a person with sunglasses hanging from their shirt, eyes visible",
            "a face wearing clear prescription eyeglasses with visible eyes",
            "a portrait with no eyewear and visible eyes",
        ],
    },
}
# Checkpoint cadence: flush scores.json roughly every this-many scored images
# (main also flushes on a 30s wall-clock timer).
FLUSH_EVERY = 100
def load_existing(out_path: Path):
    """Load a previous scores file for resume support.

    Returns ``(parsed_dict, processed_set)`` where ``processed_set`` holds the
    wsl_paths already scored. A missing or unparseable file yields
    ``(None, set())`` so the caller starts fresh instead of crashing.
    """
    if out_path.exists():
        try:
            data = json.loads(out_path.read_text())
        except Exception as exc:
            # Corrupt checkpoint: warn and fall through to a fresh start.
            print(f"[warn] could not parse existing {out_path}: {exc}; starting fresh", file=sys.stderr)
        else:
            return data, set(data.get("processed", []))
    return None, set()
def save_atomic(out_path: Path, data: dict) -> None:
    """Serialize *data* to JSON and atomically replace *out_path*.

    Writes to a sibling ``.tmp.json`` first so a crash mid-write never
    leaves a truncated scores file behind.
    """
    payload = json.dumps(data, indent=2)
    scratch = out_path.with_suffix(".tmp.json")
    scratch.write_text(payload)
    os.replace(scratch, out_path)  # atomic on the same filesystem
@torch.no_grad()
def build_text_features(model, tokenizer, device):
    """Encode the PROMPTS ensembles into one unit text vector per side.

    Returns ``{attr: (pos_feat, neg_feat)}`` where each feature is the
    re-normalized mean of the per-prompt unit embeddings (prompt-ensemble
    centroid).
    """
    def _encode_side(prompts):
        # Embed the prompt list, L2-normalize each row, then average and
        # re-normalize so the ensemble centroid is itself a unit vector.
        emb = model.encode_text(tokenizer(prompts).to(device))
        emb = emb / emb.norm(dim=-1, keepdim=True)
        centroid = emb.mean(dim=0)
        return centroid / centroid.norm()

    return {
        attr: (_encode_side(sides["pos"]), _encode_side(sides["neg"]))
        for attr, sides in PROMPTS.items()
    }
def main() -> None:
    """CLI entry point: resume-aware batch occlusion scoring.

    Reads the staged queue, skips entries already present in an existing
    scores file, runs CLIP image encoding on the DirectML device in batches
    of --batch, and periodically flushes an atomic scores.json that doubles
    as the resume checkpoint.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("queue", type=Path)
    ap.add_argument("out", type=Path)
    ap.add_argument("--limit", type=int, default=None)  # cap this run's workload
    ap.add_argument("--batch", type=int, default=8)
    args = ap.parse_args()
    queue = json.loads(args.queue.read_text())
    print(f"[queue] {len(queue)} entries from {args.queue}")
    args.out.parent.mkdir(parents=True, exist_ok=True)
    # Resume: carry over prior results and the set of already-scored paths.
    existing, processed = load_existing(args.out)
    if existing:
        print(f"[resume] {len(processed)} entries already scored")
        results = existing.get("results", [])
    else:
        results = []
    pending = [e for e in queue if e["wsl_path"] not in processed]
    if args.limit is not None:
        pending = pending[: args.limit]
    print(f"[pending] {len(pending)} entries to score")
    if not pending:
        print("[done] nothing to do")
        return
    # Defer model load until we know there is actual work to do.
    device = torch_directml.device()
    print(f"[load] {MODEL_NAME}/{PRETRAINED} on {torch_directml.device_name(0)}")
    t0 = time.time()
    model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED)
    tokenizer = open_clip.get_tokenizer(MODEL_NAME)
    model = model.to(device).eval()
    # Snapshot the learned softmax temperature once; reused for every batch.
    logit_scale = float(model.logit_scale.exp().detach().cpu())
    print(f"[load] ready in {time.time()-t0:.1f}s logit_scale={logit_scale:.2f}")
    text_feats = build_text_features(model, tokenizer, device)
    def flush():
        # Atomic checkpoint in the full output shape: a crash loses at most
        # the entries scored since the previous flush.
        save_atomic(args.out, {
            "model": f"{MODEL_NAME}/{PRETRAINED}",
            "logit_scale": logit_scale,
            "prompts": PROMPTS,
            "results": results,
            "processed": sorted(processed),
        })
    n_done_this_run = 0
    n_load_err = 0
    last_flush = time.time()
    t_start = time.time()
    for i in range(0, len(pending), args.batch):
        chunk = pending[i:i + args.batch]
        imgs = []
        keep = []  # entries whose image actually loaded, index-aligned with imgs
        for entry in chunk:
            try:
                img = Image.open(entry["win_path"]).convert("RGB")
                imgs.append(preprocess(img))
                keep.append(entry)
            except Exception as e:
                # Unreadable file: mark processed so it is not retried on
                # resume, but emit no score entry for it.
                print(f"[skip] {entry['win_path']}: {e}", file=sys.stderr)
                n_load_err += 1
                processed.add(entry["wsl_path"])
        if not imgs:
            continue
        x = torch.stack(imgs).to(device)
        with torch.no_grad():
            feats = model.encode_image(x)
            feats = feats / feats.norm(dim=-1, keepdim=True)
        scores_per_attr = {}
        for attr, (pos, neg) in text_feats.items():
            # Two-way softmax over [pos, neg] ensemble similarities: the
            # probability that the "occluded" prompt set wins for each image.
            sims = torch.stack([feats @ pos, feats @ neg], dim=1) * logit_scale
            probs = sims.softmax(dim=1)[:, 0].detach().cpu().tolist()
            scores_per_attr[attr] = probs
        for j, entry in enumerate(keep):
            results.append({
                "wsl_path": entry["wsl_path"],
                "faceset": entry["faceset"],
                "file": entry["file"],
                "mask": round(scores_per_attr["mask"][j], 4),
                "sunglasses": round(scores_per_attr["sunglasses"][j], 4),
            })
            processed.add(entry["wsl_path"])
            n_done_this_run += 1
        # Flush on the first batch after crossing a multiple of FLUSH_EVERY,
        # or after 30s of wall time, whichever comes first.
        if (n_done_this_run % FLUSH_EVERY < args.batch) or (time.time() - last_flush) > 30.0:
            flush()
            last_flush = time.time()
        elapsed = time.time() - t_start
        rate = n_done_this_run / max(0.1, elapsed)
        eta_min = (len(pending) - n_done_this_run) / max(0.1, rate) / 60.0
        print(f"[score] {n_done_this_run}/{len(pending)} "
              f"rate={rate:.2f} img/s eta={eta_min:.1f}min "
              f"load_err={n_load_err}", flush=True)
    flush()
    elapsed = time.time() - t_start
    print(f"[done] {n_done_this_run} scored, {n_load_err} load errors, "
          f"{elapsed:.1f}s ({n_done_this_run/max(0.1,elapsed):.2f} img/s) -> {args.out}")
if __name__ == "__main__":
main()