Add post-export corpus maintenance pipeline

Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages the crops; Windows
  scores them via DirectML in the new C:\clip_dml_venv. Image-level
  threshold 0.7; faceset-level quarantine at 40% domain dominance.
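
  A minimal sketch of the per-image zero-shot scoring (hedged: the prompt
  strings and helper name are illustrative assumptions, not clip_worker.py's
  exact code; only the model/tag and the 0.7 threshold come from this
  commit):

      import torch, open_clip
      from PIL import Image

      PROMPTS = ["a clear unobstructed face",        # clean
                 "a face covered by a medical mask", # masked
                 "a face wearing dark sunglasses"]   # sunglasses
      model, _, preprocess = open_clip.create_model_and_transforms(
          "ViT-L-14", pretrained="dfn2b_s39b")
      tokenizer = open_clip.get_tokenizer("ViT-L-14")

      def domain_scores(path: str) -> list[float]:
          # softmax over PROMPTS; flag when a non-clean domain exceeds 0.7
          img = preprocess(Image.open(path).convert("RGB")).unsqueeze(0)
          with torch.no_grad():
              iv = model.encode_image(img)
              tv = model.encode_text(tokenizer(PROMPTS))
              iv = iv / iv.norm(dim=-1, keepdim=True)
              tv = tv / tv.norm(dim=-1, keepdim=True)
              return (100.0 * iv @ tv.T).softmax(dim=-1)[0].tolist()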

- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees every within-group pair sits at sim >= the edge threshold.
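
  The linkage choice in one sketch (assumes unit-normalized per-faceset
  centroids; merge_groups and sim_edge are illustrative names):

      import numpy as np
      from scipy.cluster.hierarchy import fcluster, linkage
      from scipy.spatial.distance import pdist

      def merge_groups(centroids: np.ndarray, sim_edge: float) -> np.ndarray:
          # complete linkage cuts on the MAX pairwise distance inside a
          # cluster, so every within-group pair keeps sim >= sim_edge;
          # single linkage cuts on the MIN and chains across identities
          d = pdist(centroids, metric="cosine")  # 1 - cosine similarity
          return fcluster(linkage(d, method="complete"),
                          t=1.0 - sim_edge, criterion="distance")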

- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered.

- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup in three
  passes: cross-family SHA256 byte-dedup (intra-family era duplication is
  deliberately preserved), within-faceset near-dup pruning at sim >= 0.95,
  and a multi-face audit (the load-bearing roop invariant). The multi-face
  worker hits ~19 img/s on AMD Vega, ~7x embed_worker's throughput, since
  its input is already-cropped 512x512 PNGs.
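
  The byte-dedup rule in sketch form (a "family" label derived from the
  faceset name is assumed; helper names are hypothetical):

      import hashlib
      from pathlib import Path

      def sha256_of(p: Path) -> str:
          h = hashlib.sha256()
          with p.open("rb") as f:
              for chunk in iter(lambda: f.read(1 << 20), b""):
                  h.update(chunk)
          return h.hexdigest()

      seen: dict[str, str] = {}  # digest -> family of first sighting
      def is_cross_family_dup(png: Path, family: str) -> bool:
          # intra-family repeats (era buckets) are deliberately kept
          return seen.setdefault(sha256_of(png), family) != family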

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks.
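
  Illustrative shape of the new master-manifest fields (the element
  schemas for masked[] and merged[] are assumptions; the age_extend_runs
  block matches what age_extend_001.py writes):

      master = {
          "facesets": [...],  # existing per-faceset entries, unchanged
          "masked": [{"name": "faceset_NNN", "reason": "sunglasses"}],
          "merged": [{"into": "faceset_NNN", "absorbed": ["faceset_MMM"]}],
          "age_extend_runs": [{"parent": "faceset_001",
                               "thresholds": {"dist_max": 0.4, "year_max": 5},
                               "anchors": ["faceset_001_2011"],
                               "added_total": 0}],
      }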

Three new docs/analysis/ writeups cover model choice, threshold rationale,
and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 15:41:18 +02:00
parent e66c97fd58
commit 49a43c7685
10 changed files with 3250 additions and 1 deletion

work/age_extend_001.py (new file, 576 lines)

@@ -0,0 +1,576 @@
"""Extend the existing 6 era buckets of faceset_001 by absorbing PNGs that
post-date the original age_split run (from consolidation merges, etc.).
Mirrors the anchor-fragment assignment logic in age_split_001.py:
- For each unbucketed face in faceset_001's manifest, find the nearest active
era anchor by cosine distance to the anchor's centroid.
- Accept the assignment iff dist <= 0.40 AND |year_delta| <= 5
(where year_delta = exif_year(face) - dom_year(anchor)).
- Undated PNGs are skipped (no assignment).
- Anchors are NOT re-centered after absorption (preserves the same drift
guarantees as the original age_split).
CLI:
python work/age_extend_001.py analyze --out work/age_extend/candidates.json
python work/age_extend_001.py report --candidates ... --out work/age_extend
python work/age_extend_001.py apply --candidates ... [--dry-run]
"""
from __future__ import annotations
import argparse
import json
import shutil
import sys
import time
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
PARENT = "faceset_001"
ACTIVE_ERAS = [
"faceset_001_2005-10",
"faceset_001_2010-13",
"faceset_001_2011",
"faceset_001_2014-17",
"faceset_001_2018-19",
"faceset_001_2018-20",
]
CACHES = [
Path("/opt/face-sets/work/cache/nl_full.npz"),
Path("/opt/face-sets/work/cache/immich_peter.npz"),
Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
EXIF_CACHE = Path("/opt/face-sets/work/cache/age_split_exif.json")
# anchor-fragment thresholds (mirror age_split_001.py)
DIST_MAX = 0.40
YEAR_MAX = 5
# ----------------------------- caches -----------------------------
def load_caches():
rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
alias_map: dict[str, str] = {}
for c in CACHES:
if not c.exists():
print(f"[warn] cache missing: {c}", file=sys.stderr)
continue
d = np.load(c, allow_pickle=True)
emb = d["embeddings"]
meta = json.loads(str(d["meta"]))
face_records = [m for m in meta if not m.get("noface")]
if len(face_records) != len(emb):
raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}")
if "path_aliases" in d.files:
paliases = json.loads(str(d["path_aliases"]))
for canon, alist in paliases.items():
alias_map.setdefault(canon, canon)
for a in alist:
alias_map[a] = canon
for i, rec in enumerate(face_records):
p = rec["path"]
bbox = tuple(int(x) for x in rec["bbox"])
v = emb[i].astype(np.float32)
n = float(np.linalg.norm(v))
if n > 0:
v = v / n
rec_index[(p, bbox)] = v
alias_map.setdefault(p, p)
print(f"[cache] indexed {len(rec_index)} face records, {len(alias_map)} aliases", file=sys.stderr)
return rec_index, alias_map
def lookup_emb(rec_index, alias_map, src: str, bbox):
bbox_t = tuple(int(x) for x in bbox)
canon = alias_map.get(src, src)
v = rec_index.get((canon, bbox_t))
if v is None and canon != src:
v = rec_index.get((src, bbox_t))
return v
# ----------------------------- exif -----------------------------
def load_exif_cache():
if not EXIF_CACHE.exists():
return {}
return json.loads(EXIF_CACHE.read_text())
def save_exif_cache(cache):
tmp = EXIF_CACHE.with_suffix(".tmp.json")
tmp.write_text(json.dumps(cache, indent=2))
tmp.replace(EXIF_CACHE)
def exif_year(path: Path) -> int | None:
try:
with Image.open(path) as im:
ex = im._getexif()
if not ex:
return None
for tag_id, val in ex.items():
tag = ExifTags.TAGS.get(tag_id, tag_id)
if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
return int(val[:4])
except Exception:
return None
return None
def get_year(src: str, exif_cache) -> int | None:
"""Return EXIF year for src, using cache. Mutates cache for new lookups."""
if src in exif_cache:
return exif_cache[src]
p = Path(src)
y = exif_year(p) if p.exists() else None
exif_cache[src] = y
return y
# ----------------------------- analyze -----------------------------
def cmd_analyze(args):
rec_index, alias_map = load_caches()
exif_cache = load_exif_cache()
exif_cache_dirty = False
parent_dir = ROOT / PARENT
parent_manifest = json.loads((parent_dir / "manifest.json").read_text())
parent_faces = parent_manifest.get("faces", [])
print(f"[parent] {PARENT}: {len(parent_faces)} face entries", file=sys.stderr)
# Build "in_bucket" set + each anchor's centroid + dom_year
anchors = []
in_bucket: set[tuple[str, tuple[int, int, int, int]]] = set()
for era in ACTIVE_ERAS:
ed = ROOT / era
if not ed.is_dir():
print(f"[warn] missing era bucket: {era}", file=sys.stderr)
continue
em = json.loads((ed / "manifest.json").read_text())
emb_list = []
years = []
n_missing_emb = 0
for f in em.get("faces", []):
src = f.get("source")
bbox = f.get("bbox")
if not src or not bbox:
continue
key = (alias_map.get(src, src), tuple(int(x) for x in bbox))
in_bucket.add(key)
in_bucket.add((src, tuple(int(x) for x in bbox))) # cover both alias and raw
v = lookup_emb(rec_index, alias_map, src, bbox)
if v is None:
n_missing_emb += 1
else:
emb_list.append(v)
            # check freshness BEFORE get_year: it caches the result, so the
            # old post-hoc `src not in exif_cache` test could never fire
            if src not in exif_cache:
                exif_cache_dirty = True
            y = get_year(src, exif_cache)
            if y is not None:
                years.append(y)
if not emb_list:
print(f"[warn] {era}: no embeddings found, skipping anchor", file=sys.stderr)
continue
arr = np.stack(emb_list).astype(np.float32)
c = arr.mean(axis=0)
n = float(np.linalg.norm(c))
if n > 0:
c = c / n
dom_year = Counter(years).most_common(1)[0][0] if years else None
anchors.append({
"name": era, "centroid": c, "n_faces": len(em.get("faces", [])),
"n_emb_used": len(emb_list), "n_emb_missing": n_missing_emb,
"dom_year": dom_year,
"year_min": min(years) if years else None,
"year_max": max(years) if years else None,
})
print(f"[anchor] {era}: n={len(em.get('faces', []))} emb_used={len(emb_list)} "
f"emb_miss={n_missing_emb} dom_year={dom_year} years=[{min(years) if years else '-'}..{max(years) if years else '-'}]",
file=sys.stderr)
# Find unbucketed faces in parent
unbucketed = []
for f in parent_faces:
src = f.get("source")
bbox = f.get("bbox")
if not src or not bbox:
continue
bbox_t = tuple(int(x) for x in bbox)
key1 = (alias_map.get(src, src), bbox_t)
key2 = (src, bbox_t)
if key1 in in_bucket or key2 in in_bucket:
continue
unbucketed.append(f)
print(f"[parent] {len(unbucketed)} unbucketed face entries (in {PARENT} but no era bucket)", file=sys.stderr)
# Score each unbucketed face against every anchor
proposals = []
skipped_no_emb = 0
skipped_no_year = 0
for f in unbucketed:
src = f["source"]
bbox = f["bbox"]
v = lookup_emb(rec_index, alias_map, src, bbox)
if v is None:
skipped_no_emb += 1
continue
        # as above: mark dirty before get_year caches the lookup
        if src not in exif_cache:
            exif_cache_dirty = True
        y = get_year(src, exif_cache)
        if y is None:
            skipped_no_year += 1
            continue
# nearest anchor
best = None # (dist, idx)
for i, a in enumerate(anchors):
d = 1.0 - float(np.dot(a["centroid"], v))
if best is None or d < best[0]:
best = (d, i)
if best is None:
continue
dist, bidx = best
anchor = anchors[bidx]
year_delta = abs(y - anchor["dom_year"]) if anchor["dom_year"] is not None else None
accept = (dist <= DIST_MAX and year_delta is not None and year_delta <= YEAR_MAX)
proposals.append({
"png": f["png"],
"source": src,
"bbox": [int(x) for x in bbox],
"year": y,
"rank_in_parent": f.get("rank"),
"quality_composite": f.get("quality", {}).get("composite"),
"quality": f.get("quality", {}),
"best_anchor": anchor["name"],
"best_anchor_dom_year": anchor["dom_year"],
"centroid_dist": round(dist, 4),
"year_delta": year_delta,
"accept": bool(accept),
"all_anchor_dists": {
a["name"]: round(1.0 - float(np.dot(a["centroid"], v)), 4) for a in anchors
},
})
if exif_cache_dirty:
save_exif_cache(exif_cache)
print(f"[exif] cache flushed ({len(exif_cache)} entries total)", file=sys.stderr)
# Summarize
accepted = [p for p in proposals if p["accept"]]
rejected = [p for p in proposals if not p["accept"]]
by_anchor = Counter(p["best_anchor"] for p in accepted)
print(f"[summary] unbucketed={len(unbucketed)} scored={len(proposals)} "
f"accepted={len(accepted)} rejected={len(rejected)} "
f"skipped(no_emb={skipped_no_emb}, no_year={skipped_no_year})", file=sys.stderr)
for k, v in by_anchor.most_common():
print(f" {k}: +{v}", file=sys.stderr)
out = {
"thresholds": {"dist_max": DIST_MAX, "year_max": YEAR_MAX},
"anchors": [
{k: v for k, v in a.items() if k != "centroid"}
for a in anchors
],
"n_unbucketed": len(unbucketed),
"skipped": {"no_emb": skipped_no_emb, "no_year": skipped_no_year},
"proposals": sorted(proposals, key=lambda p: (not p["accept"], p["best_anchor"], -1 * (p["quality_composite"] or 0))),
"by_anchor": dict(by_anchor),
}
op = Path(args.out)
op.parent.mkdir(parents=True, exist_ok=True)
op.write_text(json.dumps(out, indent=2))
print(f"[done] {len(proposals)} proposals -> {op}", file=sys.stderr)
# ----------------------------- report -----------------------------
def cmd_report(args):
cand = json.loads(Path(args.candidates).read_text())
out_dir = Path(args.out)
thumbs_dir = out_dir / "thumbs"
thumbs_dir.mkdir(parents=True, exist_ok=True)
THUMB = 140
def make_thumb(png_relpath: str) -> str:
# png_relpath looks like "faces/0042.png"
src = ROOT / PARENT / png_relpath
name = Path(png_relpath).stem
dst = thumbs_dir / f"{name}.jpg"
if not dst.exists():
try:
img = Image.open(src).convert("RGB")
img.thumbnail((THUMB, THUMB), Image.LANCZOS)
img.save(dst, "JPEG", quality=82)
except Exception as e:
print(f"[thumb-skip] {src}: {e}", file=sys.stderr)
return ""
return f"thumbs/{name}.jpg"
# group accepted proposals by target anchor
by_anchor: dict[str, list] = {}
rejected = []
for p in cand["proposals"]:
if p["accept"]:
by_anchor.setdefault(p["best_anchor"], []).append(p)
else:
rejected.append(p)
rows = []
rows.append("<h1>faceset_001 age extension &mdash; review</h1>")
rows.append(f"<p>{cand['n_unbucketed']} unbucketed faces in {PARENT}; "
f"{sum(len(v) for v in by_anchor.values())} accepted / {len(rejected)} rejected; "
f"thresholds dist&le;{cand['thresholds']['dist_max']} AND |year_delta|&le;{cand['thresholds']['year_max']}.</p>")
nav = " · ".join(f"<a href='#{a}'>{a} (+{len(by_anchor[a])})</a>" for a in by_anchor) + " · <a href='#rejected'>rejected</a>"
rows.append(f"<div class='nav'>{nav}</div>")
for anchor_name in ACTIVE_ERAS:
if anchor_name not in by_anchor:
continue
items = by_anchor[anchor_name]
anchor_meta = next((a for a in cand["anchors"] if a["name"] == anchor_name), {})
rows.append(f"<section id='{anchor_name}' class='grp'>")
rows.append(f"<h2>{anchor_name} <small>(dom_year={anchor_meta.get('dom_year')}; "
f"existing n={anchor_meta.get('n_faces')}; +{len(items)} new)</small></h2>")
rows.append("<div class='cells'>")
for p in sorted(items, key=lambda x: (x["centroid_dist"], -1 * (x["quality_composite"] or 0))):
thumb = make_thumb(p["png"])
cls = "hi" if p["centroid_dist"] <= 0.30 else "mid"
rows.append(
f"<div class='cell'>"
f"<img src='{thumb}' loading='lazy' title='{p['png']}'>"
f"<div class='meta'>{p['png']}<br>year {p['year']}{p['year_delta']})<br>"
f"<span class='{cls}'>dist {p['centroid_dist']:.3f}</span></div>"
f"</div>"
)
rows.append("</div></section>")
if rejected:
rows.append("<section id='rejected' class='grp rej'>")
rows.append(f"<h2>rejected <small>({len(rejected)} faces don't fit any anchor)</small></h2>")
rows.append("<div class='cells'>")
for p in sorted(rejected, key=lambda x: x["centroid_dist"])[:200]:
thumb = make_thumb(p["png"])
why = []
if p["centroid_dist"] > cand['thresholds']['dist_max']:
why.append(f"dist {p['centroid_dist']:.2f}>{cand['thresholds']['dist_max']}")
if p["year_delta"] is None or p["year_delta"] > cand['thresholds']['year_max']:
why.append(f"{p['year_delta']}>{cand['thresholds']['year_max']}")
rows.append(
f"<div class='cell'>"
f"<img src='{thumb}' loading='lazy'>"
f"<div class='meta'>{p['png']}<br>year {p['year']} → best {p['best_anchor']}<br>"
f"<span class='lo'>{'; '.join(why)}</span></div>"
f"</div>"
)
if len(rejected) > 200:
rows.append(f"<p>...{len(rejected)-200} more truncated.</p>")
rows.append("</div></section>")
html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>faceset_001 age extension</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1 {{ margin-top:0; }} h2 {{ margin:0; }}
small {{ color:#999; font-weight:normal; }}
section.grp {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
section.grp.rej {{ border-left:4px solid #ff5050; }}
.cells {{ display:flex; flex-wrap:wrap; gap:6px; }}
.cell {{ background:#222; border-radius:4px; padding:4px; width:160px; font-size:11px; font-family:monospace; text-align:center; }}
.cell img {{ height:140px; width:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.3; }}
.hi {{ color:#5fa05f; font-weight:bold; }}
.mid {{ color:#ffb050; }}
.lo {{ color:#ff5050; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:13px; }}
a {{ color:#6cf; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
out_html = out_dir / "index.html"
out_html.write_text(html)
print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- apply -----------------------------
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def cmd_apply(args):
cand = json.loads(Path(args.candidates).read_text())
accepted = [p for p in cand["proposals"] if p["accept"]]
    if args.dry_run:
        by = Counter(p["best_anchor"] for p in accepted)  # Counter already imported at top
print(f"=== dry-run: {len(accepted)} assignments across {len(by)} anchors ===")
for k, v in by.most_common():
print(f" {k}: +{v}")
return
parent_dir = ROOT / PARENT
master_path = ROOT / "manifest.json"
master = json.loads(master_path.read_text())
facesets_by_name = {f["name"]: f for f in master.get("facesets", [])}
by_anchor: dict[str, list] = {}
for p in accepted:
by_anchor.setdefault(p["best_anchor"], []).append(p)
total_added = 0
for anchor_name, props in by_anchor.items():
ed = ROOT / anchor_name
em_path = ed / "manifest.json"
em = json.loads(em_path.read_text())
existing = list(em.get("faces", []))
# gather new entries with their source PNG paths in faceset_001/faces/
new_with_src = []
for p in props:
src_png = parent_dir / p["png"]
if not src_png.exists():
print(f"[warn] missing parent PNG {src_png}; skip", file=sys.stderr)
continue
face_entry = {
"source": p["source"],
"bbox": p["bbox"],
"quality": p["quality"],
"exif_year": p["year"],
"centroid_dist_at_assign": p["centroid_dist"],
"year_delta_at_assign": p["year_delta"],
"extended_from_parent": True,
}
new_with_src.append((face_entry, src_png))
# combine; rank by quality.composite desc (existing entries already have rank,
# but we re-rank globally so new entries slot in by quality)
combined: list[tuple[dict, Path | None]] = []
for f in existing:
combined.append((f, None))
combined.extend(new_with_src)
        # composite may be present-but-None; treat that as 0 (matches cmd_analyze)
        combined.sort(key=lambda x: -(x[0].get("quality", {}).get("composite") or 0))
# stage fresh
staging = ed / "_faces_new"
if staging.exists():
shutil.rmtree(staging)
staging.mkdir()
new_face_entries = []
for new_rank, (face, src_png_or_none) in enumerate(combined, start=1):
new_name = f"{new_rank:04d}.png"
if src_png_or_none is None:
# existing entry: copy from current era bucket faces/
old_name = Path(face["png"]).name
src = ed / "faces" / old_name
if not src.exists():
print(f"[warn] {anchor_name}: missing existing PNG {src}; skip", file=sys.stderr)
continue
shutil.copy2(src, staging / new_name)
else:
shutil.copy2(src_png_or_none, staging / new_name)
face = dict(face)
face["rank"] = new_rank
face["png"] = f"faces/{new_name}"
new_face_entries.append(face)
# swap dirs
old_holding = ed / "_faces_old"
if old_holding.exists():
shutil.rmtree(old_holding)
(ed / "faces").rename(old_holding)
staging.rename(ed / "faces")
shutil.rmtree(old_holding)
# re-zip .fsz
survivor_pngs = sorted((ed / "faces").glob("*.png"))
top_n = em.get("top_n", 30)
top_n_eff = min(top_n, len(survivor_pngs))
for old in ed.glob("*.fsz"):
old.unlink()
top_fsz_name = f"{anchor_name}_top{top_n_eff}.fsz"
all_fsz_name = f"{anchor_name}_all.fsz"
_zip_png_list(survivor_pngs[:top_n_eff], ed / top_fsz_name)
if len(survivor_pngs) > top_n_eff:
_zip_png_list(survivor_pngs, ed / all_fsz_name)
all_fsz_used = all_fsz_name
else:
all_fsz_used = None
# update local + master manifests
em["faces"] = new_face_entries
em["exported"] = len(new_face_entries)
em["fsz_top"] = top_fsz_name
em["fsz_all"] = all_fsz_used
em["top_n"] = top_n_eff
em.setdefault("age_extend_history", []).append({
"added": len(new_with_src),
"thresholds": cand["thresholds"],
})
em_path.write_text(json.dumps(em, indent=2))
if anchor_name in facesets_by_name:
facesets_by_name[anchor_name]["exported"] = len(new_face_entries)
facesets_by_name[anchor_name]["fsz_top"] = top_fsz_name
facesets_by_name[anchor_name]["fsz_all"] = all_fsz_used
facesets_by_name[anchor_name]["top_n"] = top_n_eff
added_here = len(new_with_src)
total_added += added_here
print(f"[applied] {anchor_name}: +{added_here} (now {len(new_face_entries)} faces)", file=sys.stderr)
# rewrite master with ordering preserved
new_facesets = []
for entry in master.get("facesets", []):
new_facesets.append(facesets_by_name.get(entry["name"], entry))
master["facesets"] = new_facesets
master.setdefault("age_extend_runs", []).append({
"parent": PARENT,
"thresholds": cand["thresholds"],
"anchors": list(by_anchor.keys()),
"added_total": total_added,
})
tmp = master_path.with_suffix(".tmp.json")
tmp.write_text(json.dumps(master, indent=2))
tmp.replace(master_path)
print(f"[done] +{total_added} faces across {len(by_anchor)} anchors", file=sys.stderr)
# ----------------------------- main -----------------------------
def main():
ap = argparse.ArgumentParser()
sub = ap.add_subparsers(dest="cmd", required=True)
a = sub.add_parser("analyze")
a.add_argument("--out", required=True)
a.set_defaults(func=cmd_analyze)
r = sub.add_parser("report")
r.add_argument("--candidates", required=True)
r.add_argument("--out", required=True)
r.set_defaults(func=cmd_report)
p = sub.add_parser("apply")
p.add_argument("--candidates", required=True)
p.add_argument("--dry-run", action="store_true")
p.set_defaults(func=cmd_apply)
args = ap.parse_args()
args.func(args)
if __name__ == "__main__":
main()