Add post-export corpus maintenance pipeline

Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores
  via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level
  quarantine at 40% domain dominance.

- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= edge.

- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered.

- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three
  passes — cross-family SHA256 byte-dedup (preserves intra-family era
  duplication), within-faceset near-dup at sim >= 0.95, and a multi-face
  audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s
  on AMD Vega — ~7x embed_worker because input is 512x512 crops.

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks.

Three new docs/analysis/ writeups cover model choice, threshold rationale,
and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-27 15:41:18 +02:00
parent e66c97fd58
commit 49a43c7685
10 changed files with 3250 additions and 1 deletions

221
work/clip_worker.py Normal file
View File

@@ -0,0 +1,221 @@
"""Windows / DirectML CLIP worker for occlusion scoring.
Reads a queue.json staged by /opt/face-sets/work/filter_occlusions.py (WSL side),
runs open_clip ViT-L-14 (dfn2b_s39b) on each PNG via torch-directml on the AMD
Vega, and writes a scores.json with mask + sunglasses softmax probabilities.
CLI:
py -3.12 clip_worker.py <queue.json> <out_scores.json> [--limit N] [--batch 8]
queue.json shape: list of objects
{"wsl_path": "...", "win_path": "E:\\...\\faceset_NNN\\faces\\NNNN.png",
"faceset": "faceset_NNN", "file": "NNNN.png"}
scores.json shape:
{"model": "ViT-L-14/dfn2b_s39b",
"logit_scale": 100.0,
"prompts": {...},
"results": [{"wsl_path": "...", "faceset": "...", "file": "...",
"mask": float, "sunglasses": float}],
"processed": [wsl_path, ...]}
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
import warnings
from pathlib import Path
# DML emits a verbose UserWarning per attention call -- silence at import time
warnings.filterwarnings("ignore", category=UserWarning)
import torch
import torch_directml
import open_clip
from PIL import Image
# open_clip model identifier and pretrained tag (DFN-2B, 39B samples seen).
MODEL_NAME = "ViT-L-14"
PRETRAINED = "dfn2b_s39b"
# Zero-shot prompt ensembles. For each attribute, scoring is a two-way
# softmax over the averaged-and-renormalized "pos" vs "neg" prompt
# embeddings (see build_text_features / main).
# kept in sync with /opt/face-sets/work/filter_occlusions.py PROMPTS
PROMPTS = {
    "mask": {
        "pos": [
            "a photo of a person wearing a surgical face mask",
            "a photo of a person wearing an FFP2 respirator covering mouth and nose",
            "a photo of a person wearing a cloth face mask",
            "a face partially covered by a medical mask",
            "a person whose mouth and nose are hidden by a face mask",
        ],
        "neg": [
            "a photo of a person's face with mouth and nose clearly visible",
            "a clear, unobstructed photo of a face",
            "a photo of a face without any mask or covering",
            "a portrait of a person showing their full face",
            # beard explicitly in the negative set so facial hair is not
            # confused with a mask
            "a photo of a person with a beard and visible mouth",
        ],
    },
    "sunglasses": {
        "pos": [
            "a face with dark sunglasses covering the eyes",
            "a portrait with the eyes hidden behind opaque sunglasses",
            "a person wearing dark sunglasses over their eyes, eyes not visible",
            "a face where the eyes are completely concealed by tinted lenses",
            "a close-up portrait wearing aviator sunglasses on the eyes",
        ],
        "neg": [
            "a portrait with both eyes clearly visible and uncovered",
            # pushed-up / resting / hanging sunglasses are negatives: the
            # filter should only fire when the eyes themselves are hidden
            "a face with sunglasses pushed up on the forehead, eyes visible below",
            "a face with sunglasses resting on top of the head, eyes visible",
            "a person with sunglasses hanging from their shirt, eyes visible",
            "a face wearing clear prescription eyeglasses with visible eyes",
            "a portrait with no eyewear and visible eyes",
        ],
    },
}
# Checkpoint cadence: flush scores.json roughly every this-many scored images
# (main also flushes on a 30s wall-clock timer).
FLUSH_EVERY = 100
def load_existing(out_path: Path):
    """Load a previous scores file for resume support.

    Returns ``(parsed_dict, processed_set)`` where ``processed_set`` holds the
    wsl_paths already scored. A missing or unparseable file yields
    ``(None, set())`` so the caller starts fresh instead of crashing.
    """
    if out_path.exists():
        try:
            data = json.loads(out_path.read_text())
        except Exception as exc:
            # Corrupt checkpoint: warn and fall through to a fresh start.
            print(f"[warn] could not parse existing {out_path}: {exc}; starting fresh", file=sys.stderr)
        else:
            return data, set(data.get("processed", []))
    return None, set()
def save_atomic(out_path: Path, data: dict) -> None:
    """Serialize *data* to JSON and atomically replace *out_path*.

    Writes to a sibling ``.tmp.json`` first so a crash mid-write never
    leaves a truncated scores file behind.
    """
    payload = json.dumps(data, indent=2)
    scratch = out_path.with_suffix(".tmp.json")
    scratch.write_text(payload)
    os.replace(scratch, out_path)  # atomic on the same filesystem
@torch.no_grad()
def build_text_features(model, tokenizer, device):
    """Encode the PROMPTS ensembles into one unit text vector per side.

    Returns ``{attr: (pos_feat, neg_feat)}`` where each feature is the
    re-normalized mean of the per-prompt unit embeddings (prompt-ensemble
    centroid).
    """
    def _encode_side(prompts):
        # Embed the prompt list, L2-normalize each row, then average and
        # re-normalize so the ensemble centroid is itself a unit vector.
        emb = model.encode_text(tokenizer(prompts).to(device))
        emb = emb / emb.norm(dim=-1, keepdim=True)
        centroid = emb.mean(dim=0)
        return centroid / centroid.norm()

    return {
        attr: (_encode_side(sides["pos"]), _encode_side(sides["neg"]))
        for attr, sides in PROMPTS.items()
    }
def main() -> None:
    """CLI entry point: resume-aware batch occlusion scoring.

    Reads the staged queue, skips entries already present in an existing
    scores file, runs CLIP image encoding on the DirectML device in batches
    of --batch, and periodically flushes an atomic scores.json that doubles
    as the resume checkpoint.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("queue", type=Path)
    ap.add_argument("out", type=Path)
    ap.add_argument("--limit", type=int, default=None)  # cap this run's workload
    ap.add_argument("--batch", type=int, default=8)
    args = ap.parse_args()
    queue = json.loads(args.queue.read_text())
    print(f"[queue] {len(queue)} entries from {args.queue}")
    args.out.parent.mkdir(parents=True, exist_ok=True)
    # Resume: carry over prior results and the set of already-scored paths.
    existing, processed = load_existing(args.out)
    if existing:
        print(f"[resume] {len(processed)} entries already scored")
        results = existing.get("results", [])
    else:
        results = []
    pending = [e for e in queue if e["wsl_path"] not in processed]
    if args.limit is not None:
        pending = pending[: args.limit]
    print(f"[pending] {len(pending)} entries to score")
    if not pending:
        print("[done] nothing to do")
        return
    # Defer model load until we know there is actual work to do.
    device = torch_directml.device()
    print(f"[load] {MODEL_NAME}/{PRETRAINED} on {torch_directml.device_name(0)}")
    t0 = time.time()
    model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED)
    tokenizer = open_clip.get_tokenizer(MODEL_NAME)
    model = model.to(device).eval()
    # Snapshot the learned softmax temperature once; reused for every batch.
    logit_scale = float(model.logit_scale.exp().detach().cpu())
    print(f"[load] ready in {time.time()-t0:.1f}s logit_scale={logit_scale:.2f}")
    text_feats = build_text_features(model, tokenizer, device)
    def flush():
        # Atomic checkpoint in the full output shape: a crash loses at most
        # the entries scored since the previous flush.
        save_atomic(args.out, {
            "model": f"{MODEL_NAME}/{PRETRAINED}",
            "logit_scale": logit_scale,
            "prompts": PROMPTS,
            "results": results,
            "processed": sorted(processed),
        })
    n_done_this_run = 0
    n_load_err = 0
    last_flush = time.time()
    t_start = time.time()
    for i in range(0, len(pending), args.batch):
        chunk = pending[i:i + args.batch]
        imgs = []
        keep = []  # entries whose image actually loaded, index-aligned with imgs
        for entry in chunk:
            try:
                img = Image.open(entry["win_path"]).convert("RGB")
                imgs.append(preprocess(img))
                keep.append(entry)
            except Exception as e:
                # Unreadable file: mark processed so it is not retried on
                # resume, but emit no score entry for it.
                print(f"[skip] {entry['win_path']}: {e}", file=sys.stderr)
                n_load_err += 1
                processed.add(entry["wsl_path"])
        if not imgs:
            continue
        x = torch.stack(imgs).to(device)
        with torch.no_grad():
            feats = model.encode_image(x)
            feats = feats / feats.norm(dim=-1, keepdim=True)
        scores_per_attr = {}
        for attr, (pos, neg) in text_feats.items():
            # Two-way softmax over [pos, neg] ensemble similarities: the
            # probability that the "occluded" prompt set wins for each image.
            sims = torch.stack([feats @ pos, feats @ neg], dim=1) * logit_scale
            probs = sims.softmax(dim=1)[:, 0].detach().cpu().tolist()
            scores_per_attr[attr] = probs
        for j, entry in enumerate(keep):
            results.append({
                "wsl_path": entry["wsl_path"],
                "faceset": entry["faceset"],
                "file": entry["file"],
                "mask": round(scores_per_attr["mask"][j], 4),
                "sunglasses": round(scores_per_attr["sunglasses"][j], 4),
            })
            processed.add(entry["wsl_path"])
            n_done_this_run += 1
        # Flush on the first batch after crossing a multiple of FLUSH_EVERY,
        # or after 30s of wall time, whichever comes first.
        if (n_done_this_run % FLUSH_EVERY < args.batch) or (time.time() - last_flush) > 30.0:
            flush()
            last_flush = time.time()
        elapsed = time.time() - t_start
        rate = n_done_this_run / max(0.1, elapsed)
        eta_min = (len(pending) - n_done_this_run) / max(0.1, rate) / 60.0
        print(f"[score] {n_done_this_run}/{len(pending)} "
              f"rate={rate:.2f} img/s eta={eta_min:.1f}min "
              f"load_err={n_load_err}", flush=True)
    flush()
    elapsed = time.time() - t_start
    print(f"[done] {n_done_this_run} scored, {n_load_err} load errors, "
          f"{elapsed:.1f}s ({n_done_this_run/max(0.1,elapsed):.2f} img/s) -> {args.out}")
if __name__ == "__main__":
main()