Add post-export corpus maintenance pipeline

Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages inputs; Windows scores
  them via DirectML in the new C:\clip_dml_venv. Image-level threshold 0.7;
  faceset-level quarantine at 40% domain dominance. Scoring sketched after
  this list.

- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= edge (toy demo after this list).

- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5; sketched after this list). Anchors
  are not re-centered.

- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup in three
  passes: cross-family SHA256 byte-dedup (preserves intra-family era
  duplication; sketched after this list), within-faceset near-dup at
  sim >= 0.95, and a multi-face audit (the load-bearing roop invariant).
  Multi-face worker hits ~19 img/s on AMD Vega, about 7x embed_worker,
  because the input is 512x512 crops.
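
A minimal sketch of clip_worker.py's scoring step, assuming the open_clip
tag named above; the prompt strings are illustrative stand-ins and the
DirectML device plumbing is omitted:

    import open_clip
    import torch
    from PIL import Image

    # Illustrative occlusion domains; the shipped prompt list lives in
    # clip_worker.py.
    PROMPTS = ["a face wearing a medical mask",
               "a face wearing sunglasses",
               "an unobstructed human face"]

    model, _, preprocess = open_clip.create_model_and_transforms(
        "ViT-L-14", pretrained="dfn2b_s39b")
    model.eval()
    tokenizer = open_clip.get_tokenizer("ViT-L-14")
    with torch.no_grad():
        text_feats = model.encode_text(tokenizer(PROMPTS))
        text_feats /= text_feats.norm(dim=-1, keepdim=True)

    def domain_probs(png_path: str) -> list[float]:
        img = preprocess(Image.open(png_path).convert("RGB")).unsqueeze(0)
        with torch.no_grad():
            f = model.encode_image(img)
            f /= f.norm(dim=-1, keepdim=True)
            return (100.0 * f @ text_feats.T).softmax(dim=-1)[0].tolist()

    # image flagged when an occluded-domain prob >= 0.7; faceset quarantined
    # when >= 40% of its images land in one occluded domain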
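
Why complete-linkage, on a toy three-centroid case using the same scipy
calls as consolidate_facesets.py: A~B and B~C are similar but A~C is not,
so single linkage chains all three through B while complete linkage cuts
on the weakest pair:

    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform

    sim = np.array([[1.0, 0.5, 0.1],
                    [0.5, 1.0, 0.5],
                    [0.1, 0.5, 1.0]])      # cosine sims for A, B, C
    cond = squareform(1.0 - sim, checks=False)
    cut = 1.0 - 0.45                       # edge threshold from the script
    print(fcluster(linkage(cond, "single"), t=cut, criterion="distance"))
    # -> one cluster {A,B,C} even though sim(A,C) = 0.1
    print(fcluster(linkage(cond, "complete"), t=cut, criterion="distance"))
    # -> {A,B} and {C}: every within-group pair keeps sim >= 0.45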
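
The age_extend_001.py anchor check reduces to two comparisons per bucket;
a sketch with hypothetical names (assign_era, anchors), not the script's
actual internals:

    import numpy as np

    def assign_era(vec, year, anchors):
        """anchors: {bucket: (anchor_vec, anchor_year)}; vectors are
        unit-normalized and anchors stay frozen (never re-centered)."""
        best = None
        for bucket, (avec, ayear) in anchors.items():
            dist = 1.0 - float(np.dot(vec, avec))   # cosine distance
            if dist <= 0.40 and year is not None and abs(year - ayear) <= 5:
                if best is None or dist < best[1]:
                    best = (bucket, dist)
        return best[0] if best else None            # None -> leave unslotted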
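
The first dedup pass, sketched with a hypothetical family_of mapping: the
first family to claim a digest owns it, so byte-identical PNGs inside one
family (e.g. its era splits) survive while cross-family copies are pruned:

    import hashlib
    from pathlib import Path

    def cross_family_dupes(faceset_dirs: list[Path], family_of) -> list[Path]:
        seen: dict[str, str] = {}            # sha256 -> owning family
        drops: list[Path] = []
        for fs in faceset_dirs:
            fam = family_of(fs.name)
            for png in sorted((fs / "faces").glob("*.png")):
                digest = hashlib.sha256(png.read_bytes()).hexdigest()
                owner = seen.setdefault(digest, fam)
                if owner != fam:
                    drops.append(png)        # cross-family byte-duplicate
        return drops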

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks.

Three new docs/analysis/ writeups cover model choice, threshold rationale,
and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 15:41:18 +02:00
parent e66c97fd58
commit 49a43c7685
10 changed files with 3250 additions and 1 deletion

consolidate_facesets.py

@@ -0,0 +1,634 @@
"""Consolidate facesets_swap_ready/ — find duplicate identities and merge.
Pipeline:
1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every
active faceset (skipping _masked, _thin, era splits). Compute L2-normalized
centroid per faceset. Build similarity graph at sim>=0.45, extract components.
Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size.
2. report: HTML contact sheet at work/merge_review/index.html grouped by
candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and
"merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted.
3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite
descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries
to facesets_swap_ready/_merged/<name>/, update master manifest with
`merged[]` array + `merge_run` provenance block.
Embeddings come from caches (no GPU re-embed needed); the original clusterer used
exactly these vectors so they are the right yardstick. Era splits are excluded
entirely (intentional time-period segmentation, not a duplication).
"""
from __future__ import annotations
import argparse
import json
import re
import shutil
import sys
import time
from pathlib import Path
import numpy as np
from PIL import Image
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
CACHES = [
Path("/opt/face-sets/work/cache/nl_full.npz"),
Path("/opt/face-sets/work/cache/immich_peter.npz"),
Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$")
# ----------------------------- helpers -----------------------------
def load_caches():
"""Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple)
-> embedding (np.float32, shape (512,) L2-normalized).
alias_map maps every alias path -> canonical path."""
rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
alias_map: dict[str, str] = {}
n_total = 0
for c in CACHES:
if not c.exists():
print(f"[warn] cache missing: {c}", file=sys.stderr)
continue
d = np.load(c, allow_pickle=True)
emb = d["embeddings"]
meta = json.loads(str(d["meta"]))
face_records = [m for m in meta if not m.get("noface")]
if len(face_records) != len(emb):
raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}")
# path_aliases may be present
if "path_aliases" in d.files:
paliases = json.loads(str(d["path_aliases"]))
for canon, alist in paliases.items():
alias_map.setdefault(canon, canon)
for a in alist:
alias_map[a] = canon
for i, rec in enumerate(face_records):
p = rec["path"]
bbox = tuple(int(x) for x in rec["bbox"])
v = emb[i].astype(np.float32)
n = float(np.linalg.norm(v))
if n > 0:
v = v / n
rec_index[(p, bbox)] = v
alias_map.setdefault(p, p)
print(f"[cache] {c.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr)
n_total += len(face_records)
print(f"[cache] indexed {n_total} face records, {len(alias_map)} path aliases", file=sys.stderr)
return rec_index, alias_map
def faceset_tier(name: str) -> int:
"""Lower number = higher priority for primary selection."""
m = re.match(r"^faceset_0*(\d+)$", name)
if not m:
return 99 # unknown structure
n = int(m.group(1))
if 13 <= n <= 19:
return 0 # hand-sorted
if 1 <= n <= 12:
return 1 # auto-clustered
if 20 <= n <= 25:
return 2 # osrc
if 26 <= n <= 264:
return 3 # immich peter
if 265 <= n:
return 4 # immich nic and beyond
return 99
def is_era_split(name: str) -> bool:
return bool(ERA_SPLIT_RE.match(name))
def faceset_centroid(faceset_dir: Path, rec_index, alias_map):
"""Return (centroid, n_used, n_missing) where centroid is L2-normalized mean
of embeddings of the faces listed in the per-faceset manifest. Falls back to
None if too few embeddings found."""
manifest = faceset_dir / "manifest.json"
if not manifest.exists():
return None, 0, 0
m = json.loads(manifest.read_text())
vecs = []
n_missing = 0
for f in m.get("faces", []):
src = f.get("source")
bbox = f.get("bbox")
if src is None or bbox is None:
n_missing += 1
continue
bbox_t = tuple(int(x) for x in bbox)
canon = alias_map.get(src, src)
v = rec_index.get((canon, bbox_t))
if v is None and canon != src:
v = rec_index.get((src, bbox_t))
if v is None:
n_missing += 1
continue
vecs.append(v)
if len(vecs) < 3:
return None, len(vecs), n_missing
arr = np.stack(vecs).astype(np.float32)
c = arr.mean(axis=0)
n = float(np.linalg.norm(c))
if n > 0:
c = c / n
return c, len(vecs), n_missing
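# NOTE: retained from the earlier similarity-graph approach; cmd_analyze now
# clusters with complete-linkage fcluster and no longer calls this helper.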
def connected_components(adj: dict[int, set[int]]) -> list[list[int]]:
seen: set[int] = set()
comps = []
for node in adj:
if node in seen:
continue
stack = [node]
comp = []
while stack:
x = stack.pop()
if x in seen:
continue
seen.add(x)
comp.append(x)
for y in adj.get(x, set()):
if y not in seen:
stack.append(y)
comps.append(sorted(comp))
return comps
# ----------------------------- analyze -----------------------------
def cmd_analyze(args):
rec_index, alias_map = load_caches()
# collect active facesets
active = []
for d in sorted(ROOT.iterdir()):
if not d.is_dir() or d.name.startswith("_"):
continue
if is_era_split(d.name):
continue
active.append(d)
print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)", file=sys.stderr)
centroids: dict[str, np.ndarray] = {}
sizes: dict[str, int] = {}
skipped = []
t0 = time.time()
for fs in active:
c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map)
if c is None:
skipped.append((fs.name, n_used, n_miss))
continue
centroids[fs.name] = c
sizes[fs.name] = n_used
print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; "
f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr)
if skipped:
for n, u, m in skipped[:10]:
print(f" skip {n}: used={u} missing={m}", file=sys.stderr)
if len(skipped) > 10:
print(f" ... +{len(skipped)-10} more", file=sys.stderr)
names = sorted(centroids.keys())
if not names:
raise SystemExit("no centroids built")
# similarity matrix
M = np.stack([centroids[n] for n in names]).astype(np.float32) # (N, 512), normalized
sim = M @ M.T # (N, N) cosine since unit-normalized
np.clip(sim, -1.0, 1.0, out=sim)
edge_thr = args.edge
confident_thr = args.confident
# complete-linkage agglomerative clustering on cosine distance.
# Cut at edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr.
# This avoids the chaining problem of single-link / connected-components.
n = len(names)
dist = 1.0 - sim
np.fill_diagonal(dist, 0.0)
# symmetrize numerical noise
dist = (dist + dist.T) / 2.0
np.clip(dist, 0.0, 2.0, out=dist)
cond = squareform(dist, checks=False)
Z = linkage(cond, method="complete")
cut_dist = 1.0 - edge_thr # complete-link distance corresponds to (1 - min sim)
labels = fcluster(Z, t=cut_dist, criterion="distance") # 1-indexed cluster ids
cluster_members: dict[int, list[int]] = {}
for idx, lbl in enumerate(labels):
cluster_members.setdefault(int(lbl), []).append(idx)
comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1]
n_pairs_in_groups = 0
for c in comps:
n_pairs_in_groups += len(c) * (len(c) - 1) // 2
print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups "
f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr)
# pick primary per group: lowest tier number, then largest size
groups_out = []
for comp in comps:
members = [names[i] for i in comp]
members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x))
primary = members_sorted[0]
secondaries = members_sorted[1:]
# gather pairwise sims within group
pair_sims = []
idx_of = {names[i]: i for i in comp}
for a in members:
for b in members:
if a >= b:
continue
pair_sims.append({"a": a, "b": b, "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)})
# confidence: minimum within-group sim (the weakest link)
min_link = min(p["sim"] for p in pair_sims)
max_link = max(p["sim"] for p in pair_sims)
confidence = "confident" if min_link >= confident_thr else "uncertain"
groups_out.append({
"primary": primary,
"secondaries": secondaries,
"members": members_sorted,
"tiers": {n: faceset_tier(n) for n in members},
"sizes": {n: sizes.get(n, 0) for n in members},
"pair_sims": pair_sims,
"min_link": round(min_link, 4),
"max_link": round(max_link, 4),
"confidence": confidence,
})
# sort: confident first, then by max_link desc
groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"]))
out = {
"thresholds": {"edge": edge_thr, "confident": confident_thr},
"n_active": len(active),
"n_centroided": len(centroids),
"n_skipped": len(skipped),
"skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped],
"n_groups": len(groups_out),
"n_facesets_in_groups": sum(len(g["members"]) for g in groups_out),
"groups": groups_out,
}
op = Path(args.out)
op.parent.mkdir(parents=True, exist_ok=True)
op.write_text(json.dumps(out, indent=2))
confident = sum(1 for g in groups_out if g["confidence"] == "confident")
uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain")
print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}", file=sys.stderr)
# ----------------------------- report -----------------------------
def cmd_report(args):
candidates = json.loads(Path(args.candidates).read_text())
out_dir = Path(args.out)
thumbs_dir = out_dir / "thumbs"
thumbs_dir.mkdir(parents=True, exist_ok=True)
THUMB = 140
THUMBS_PER_FACESET = 4
def make_thumb(faceset: str, fname: str) -> str:
d = thumbs_dir / faceset
d.mkdir(parents=True, exist_ok=True)
dst = d / (Path(fname).stem + ".jpg")
if not dst.exists():
try:
src = ROOT / faceset / "faces" / fname
img = Image.open(src).convert("RGB")
img.thumbnail((THUMB, THUMB), Image.LANCZOS)
img.save(dst, "JPEG", quality=82)
except Exception as e:
print(f"[thumb-skip] {faceset}/{fname}: {e}", file=sys.stderr)
return ""
return f"thumbs/{faceset}/{Path(fname).stem}.jpg"
rows = []
for gi, g in enumerate(candidates["groups"]):
primary = g["primary"]
sec = g["secondaries"]
conf_cls = "confident" if g["confidence"] == "confident" else "uncertain"
rows.append(f"<section class='grp {conf_cls}' id='g{gi}'>")
rows.append(f"<h2>group #{gi+1} <small>({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})</small></h2>")
rows.append(f"<div class='plan'>merge <b>{', '.join(sec)}</b> &rarr; <b>{primary}</b></div>")
# member rows
for name in g["members"]:
tier = g["tiers"][name]
sz = g["sizes"][name]
tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)]
badge = "PRIMARY" if name == primary else "secondary"
rows.append(f"<div class='member'>")
rows.append(f"<div class='label'><span class='badge {badge.lower()}'>{badge}</span> "
f"<b>{name}</b> <small>tier={tier_label} · n={sz}</small></div>")
rows.append("<div class='thumbs'>")
faces_dir = ROOT / name / "faces"
files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET]
for f in files:
rel = make_thumb(name, f.name)
if rel:
rows.append(f"<img src='{rel}' loading='lazy' title='{f.name}'>")
rows.append("</div></div>")
# pairwise sims
rows.append("<table class='sims'><tr><th>a</th><th>b</th><th>sim</th></tr>")
for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]):
cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid"
rows.append(f"<tr><td>{ps['a']}</td><td>{ps['b']}</td><td class='{cls}'>{ps['sim']:.3f}</td></tr>")
rows.append("</table>")
rows.append("</section>")
nav = " · ".join(f"<a href='#g{i}'>#{i+1}</a>" for i in range(len(candidates["groups"])))
html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Faceset merge review</title>
<style>
body {{ font-family: system-ui, sans-serif; background: #111; color: #eee; padding: 1em; }}
h1 {{ margin-top: 0; }}
h2 {{ margin: 0; }}
small {{ color: #999; font-weight: normal; }}
section.grp {{ background: #1a1a1a; border-radius: 6px; padding: 12px; margin: 12px 0; }}
section.grp.confident {{ border-left: 4px solid #5fa05f; }}
section.grp.uncertain {{ border-left: 4px solid #ffb050; }}
.plan {{ margin: .5em 0; color: #6cf; }}
.member {{ margin: 8px 0; padding: 6px; background: #222; border-radius: 4px; }}
.label {{ font-family: monospace; font-size: 13px; }}
.badge {{ display: inline-block; padding: 0 6px; font-size: 10px; border-radius: 2px; }}
.badge.primary {{ background: #5fa05f; color: #000; font-weight: bold; }}
.badge.secondary {{ background: #444; color: #ccc; }}
.thumbs {{ display: flex; gap: 4px; margin-top: 4px; flex-wrap: wrap; }}
.thumbs img {{ height: 140px; width: auto; border-radius: 3px; }}
table.sims {{ font-family: monospace; font-size: 11px; margin-top: 6px; border-collapse: collapse; }}
table.sims td, table.sims th {{ padding: 1px 8px; border: 1px solid #333; text-align: left; }}
table.sims td.hi {{ color: #5fa05f; font-weight: bold; }}
table.sims td.mid {{ color: #ffb050; }}
.nav {{ position: sticky; top: 0; background: #111; padding: .5em 0; border-bottom: 1px solid #333; font-size: 12px; }}
a {{ color: #6cf; }}
</style></head>
<body>
<h1>Merge review &mdash; {len(candidates['groups'])} candidate groups
<small>(edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})</small></h1>
<p>{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided
(skipped {candidates['n_skipped']} for too few cached embeddings).
Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.</p>
<div class='nav'>{nav}</div>
{''.join(rows)}
</body></html>"""
out_html = out_dir / "index.html"
out_html.write_text(html)
print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- apply -----------------------------
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def cmd_apply(args):
candidates = json.loads(Path(args.candidates).read_text())
master_path = ROOT / "manifest.json"
master = json.loads(master_path.read_text())
by_name = {f["name"]: f for f in master.get("facesets", [])}
    # filter: skip "uncertain" groups unless --include-uncertain
    indexed = list(enumerate(candidates["groups"]))
    accepted = [(i, g) for i, g in indexed
                if g["confidence"] == "confident" or args.include_uncertain]
    skipped_unc = [g for _, g in indexed
                   if g["confidence"] == "uncertain" and not args.include_uncertain]
    # explicit --exclude / --only filters (group indices in the candidates file)
    if args.only:
        only = {int(s) for s in args.only.split(",")}
        accepted = [(i, g) for i, g in indexed if i in only]
    if args.exclude:
        excl = {int(s) for s in args.exclude.split(",")}
        # indices always refer to positions in the candidates file, even after
        # the uncertain/--only filtering above
        accepted = [(i, g) for i, g in accepted if i not in excl]
    accepted = [g for _, g in accepted]
print(f"[plan] {len(accepted)} groups will be merged "
f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr)
if args.dry_run:
for g in accepted:
print(f" merge {g['secondaries']} -> {g['primary']} "
f"({g['confidence']}, min_sim={g['min_link']:.3f})")
return
merged_dir = ROOT / "_merged"
merged_dir.mkdir(exist_ok=True)
new_facesets: list[dict] = []
new_merged: list[dict] = list(master.get("merged", []))
consumed_names: set[str] = set()
primary_updates: dict[str, dict] = {} # name -> new entry
for g in accepted:
primary = g["primary"]
if primary not in by_name:
print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr)
continue
primary_dir = ROOT / primary
if not primary_dir.is_dir():
print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr)
continue
primary_faces = primary_dir / "faces"
primary_manifest_path = primary_dir / "manifest.json"
primary_manifest = json.loads(primary_manifest_path.read_text())
# gather all face entries: primary + each secondary
combined_faces: list[dict] = list(primary_manifest.get("faces", []))
        # tag provenance: ensure every primary face carries origin_faceset
        for f in combined_faces:
            f.setdefault("origin_faceset", primary)
for sec in g["secondaries"]:
sec_dir = ROOT / sec
if not sec_dir.is_dir():
print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr)
continue
sec_manifest_path = sec_dir / "manifest.json"
sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []}
for f in sec_manifest.get("faces", []):
f = dict(f)
f["origin_faceset"] = sec
combined_faces.append(f)
# rank by quality.composite descending; ties broken by lower cosd_centroid
def sort_key(f):
q = f.get("quality", {}).get("composite", 0)
d = f.get("cosd_centroid", 1.0)
return (-q, d)
combined_faces.sort(key=sort_key)
# renumber and stage PNGs into a fresh staging dir, then atomically swap
staging = primary_dir / "_faces_new"
if staging.exists():
shutil.rmtree(staging)
staging.mkdir()
        new_face_entries = []
        new_rank = 0
        for f in combined_faces:
            origin = f.pop("origin_faceset")
            old_png_rel = f["png"]  # e.g. "faces/0042.png"
            old_png_name = Path(old_png_rel).name
            origin_png = ROOT / origin / "faces" / old_png_name
            if not origin_png.exists():
                # could be in _dropped if occlusion-pruned; skip without
                # consuming a rank so numbering stays contiguous 0001..NNNN
                continue
            new_rank += 1
            new_name = f"{new_rank:04d}.png"
            shutil.copy2(origin_png, staging / new_name)
            f = dict(f)
            f["rank"] = new_rank
            f["png"] = f"faces/{new_name}"
            f["origin_faceset"] = origin  # preserve provenance in manifest
            new_face_entries.append(f)
# swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces
old_faces_holding = primary_dir / "_faces_old"
if old_faces_holding.exists():
shutil.rmtree(old_faces_holding)
if primary_faces.exists():
primary_faces.rename(old_faces_holding)
staging.rename(primary_faces)
# migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible)
old_dropped = old_faces_holding / "_dropped"
if old_dropped.exists():
(primary_faces / "_dropped").mkdir(exist_ok=True)
for x in old_dropped.iterdir():
shutil.move(str(x), str(primary_faces / "_dropped" / x.name))
shutil.rmtree(old_faces_holding)
# re-zip .fsz
survivor_pngs = sorted(primary_faces.glob("*.png"))
top_n = primary_manifest.get("top_n", 30)
top_n_eff = min(top_n, len(survivor_pngs))
# remove old .fsz files
for old in primary_dir.glob("*.fsz"):
old.unlink()
top_fsz_name = f"{primary}_top{top_n_eff}.fsz"
all_fsz_name = f"{primary}_all.fsz"
_zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name)
if len(survivor_pngs) > top_n_eff:
_zip_png_list(survivor_pngs, primary_dir / all_fsz_name)
all_fsz_used = all_fsz_name
else:
all_fsz_used = None
# update primary's local manifest
primary_manifest["faces"] = new_face_entries
primary_manifest["exported"] = len(new_face_entries)
primary_manifest["fsz_top"] = top_fsz_name
primary_manifest["fsz_all"] = all_fsz_used
primary_manifest["top_n"] = top_n_eff
primary_manifest.setdefault("merge_history", []).append({
"absorbed": g["secondaries"],
"min_link": g["min_link"],
"max_link": g["max_link"],
"confidence": g["confidence"],
})
primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2))
# move secondary directories into _merged/
absorbed_master_entries: list[dict] = []
for sec in g["secondaries"]:
sec_dir = ROOT / sec
target = merged_dir / sec
if not sec_dir.is_dir():
continue
if target.exists():
shutil.rmtree(sec_dir) # already moved by previous run; clean stub
else:
shutil.move(str(sec_dir), str(target))
sec_master = dict(by_name.get(sec, {"name": sec}))
sec_master["merged_into"] = primary
sec_master["relpath"] = f"_merged/{sec}"
sec_master["fsz_top"] = None
sec_master["fsz_all"] = None
absorbed_master_entries.append(sec_master)
consumed_names.add(sec)
new_merged.extend(absorbed_master_entries)
# bump primary master entry
prim_master = dict(by_name[primary])
prim_master["exported"] = len(new_face_entries)
prim_master["top_n"] = top_n_eff
prim_master["fsz_top"] = top_fsz_name
prim_master["fsz_all"] = all_fsz_used
prim_master.setdefault("merge_history", []).append({
"absorbed": g["secondaries"],
"min_link": g["min_link"],
"max_link": g["max_link"],
})
primary_updates[primary] = prim_master
print(f"[merged] {g['secondaries']} -> {primary} "
f"now {len(new_face_entries)} png", file=sys.stderr)
# rebuild master facesets list
for entry in master.get("facesets", []):
nm = entry["name"]
if nm in consumed_names:
continue
if nm in primary_updates:
new_facesets.append(primary_updates[nm])
else:
new_facesets.append(entry)
new_master = dict(master)
new_master["facesets"] = new_facesets
new_master["merged"] = new_merged
new_master["merge_run"] = {
"thresholds": candidates["thresholds"],
"groups_applied": len(accepted),
"facesets_consumed": len(consumed_names),
"include_uncertain": bool(args.include_uncertain),
}
tmp = master_path.with_suffix(".tmp.json")
tmp.write_text(json.dumps(new_master, indent=2))
tmp.replace(master_path)
print(f"[done] master manifest updated: {len(new_facesets)} active, "
f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run",
file=sys.stderr)
# ----------------------------- main -----------------------------
def main():
ap = argparse.ArgumentParser()
sub = ap.add_subparsers(dest="cmd", required=True)
a = sub.add_parser("analyze")
a.add_argument("--out", required=True)
a.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)")
a.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)")
a.set_defaults(func=cmd_analyze)
r = sub.add_parser("report")
r.add_argument("--candidates", required=True)
r.add_argument("--out", required=True)
r.set_defaults(func=cmd_report)
p = sub.add_parser("apply")
p.add_argument("--candidates", required=True)
p.add_argument("--include-uncertain", action="store_true",
help="apply uncertain groups too (default: confident only)")
p.add_argument("--only", default=None, help="comma-separated group indices to apply")
p.add_argument("--exclude", default=None, help="comma-separated group indices to skip")
p.add_argument("--dry-run", action="store_true")
p.set_defaults(func=cmd_apply)
args = ap.parse_args()
args.func(args)
if __name__ == "__main__":
main()