Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via
  new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level quarantine at
  40% domain dominance.
- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= edge.
- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered.
- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three
  passes — cross-family SHA256 byte-dedup (preserves intra-family era
  duplication), within-faceset near-dup at sim >= 0.95, and a multi-face audit
  (the load-bearing roop invariant). Multi-face worker hits ~19 img/s on AMD
  Vega — ~7x embed_worker because input is 512x512 crops.

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes are
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks. Three new docs/analysis/ writeups cover model choice,
threshold rationale, and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
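A minimal toy check of the linkage claim (hypothetical similarities; the same scipy routines consolidate_facesets.py uses): with sims A-B = 0.5, B-C = 0.5, A-C = -0.1, a single-linkage cut at 0.45 chains all three through B, while complete-linkage keeps C out because the cut applies to the worst within-group pair.

    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform

    sim = np.array([[ 1.0, 0.5, -0.1],
                    [ 0.5, 1.0,  0.5],
                    [-0.1, 0.5,  1.0]])  # A-C is clearly a different identity
    dist = 1.0 - sim
    np.fill_diagonal(dist, 0.0)
    cond = squareform(dist, checks=False)
    cut = 1.0 - 0.45  # same edge threshold the analyze step uses
    print(fcluster(linkage(cond, "single"),   t=cut, criterion="distance"))  # one chained cluster: [1 1 1]
    print(fcluster(linkage(cond, "complete"), t=cut, criterion="distance"))  # A,B together; C alone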
635 lines
26 KiB
Python
"""Consolidate facesets_swap_ready/ — find duplicate identities and merge.
|
|
|
|
Pipeline:
|
|
1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every
|
|
active faceset (skipping _masked, _thin, era splits). Compute L2-normalized
|
|
centroid per faceset. Build similarity graph at sim>=0.45, extract components.
|
|
Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size.
|
|
2. report: HTML contact sheet at work/merge_review/index.html grouped by
|
|
candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and
|
|
"merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted.
|
|
3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite
|
|
descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries
|
|
to facesets_swap_ready/_merged/<name>/, update master manifest with
|
|
`merged[]` array + `merge_run` provenance block.
|
|
|
|
Embeddings come from caches (no GPU re-embed needed); the original clusterer used
|
|
exactly these vectors so they are the right yardstick. Era splits are excluded
|
|
entirely (intentional time-period segmentation, not a duplication).
|
|
"""
from __future__ import annotations

import argparse
import json
import re
import shutil
import sys
import time
from pathlib import Path

import numpy as np
from PIL import Image
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]

ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$")


# ----------------------------- helpers -----------------------------

def load_caches():
    """Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple)
    -> embedding (np.float32, shape (512,), L2-normalized).
    alias_map maps every alias path -> canonical path."""
    rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
    alias_map: dict[str, str] = {}
    n_total = 0
    for c in CACHES:
        if not c.exists():
            print(f"[warn] cache missing: {c}", file=sys.stderr)
            continue
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        meta = json.loads(str(d["meta"]))
        face_records = [m for m in meta if not m.get("noface")]
        if len(face_records) != len(emb):
            raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}")
        # path_aliases may be present
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            p = rec["path"]
            bbox = tuple(int(x) for x in rec["bbox"])
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n
            rec_index[(p, bbox)] = v
            alias_map.setdefault(p, p)
        print(f"[cache] {c.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr)
        n_total += len(face_records)
    print(f"[cache] indexed {n_total} face records, {len(alias_map)} path aliases", file=sys.stderr)
    return rec_index, alias_map

def faceset_tier(name: str) -> int:
    """Lower number = higher priority for primary selection."""
    m = re.match(r"^faceset_0*(\d+)$", name)
    if not m:
        return 99  # unknown structure
    n = int(m.group(1))
    if 13 <= n <= 19:
        return 0  # hand-sorted
    if 1 <= n <= 12:
        return 1  # auto-clustered
    if 20 <= n <= 25:
        return 2  # osrc
    if 26 <= n <= 264:
        return 3  # immich peter
    if n >= 265:
        return 4  # immich nic and beyond
    return 99
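
# Example of the primary-selection ordering this feeds (tier first, size second):
# faceset_014 (tier 0, hand-sorted) out-ranks faceset_003 (tier 1, auto) even if
# faceset_003 has far more faces; size only breaks ties within the same tier.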

def is_era_split(name: str) -> bool:
    return bool(ERA_SPLIT_RE.match(name))


def faceset_centroid(faceset_dir: Path, rec_index, alias_map):
    """Return (centroid, n_used, n_missing) where centroid is the L2-normalized
    mean of the embeddings of the faces listed in the per-faceset manifest.
    Falls back to None if too few embeddings are found."""
    manifest = faceset_dir / "manifest.json"
    if not manifest.exists():
        return None, 0, 0
    m = json.loads(manifest.read_text())
    vecs = []
    n_missing = 0
    for f in m.get("faces", []):
        src = f.get("source")
        bbox = f.get("bbox")
        if src is None or bbox is None:
            n_missing += 1
            continue
        bbox_t = tuple(int(x) for x in bbox)
        canon = alias_map.get(src, src)
        v = rec_index.get((canon, bbox_t))
        if v is None and canon != src:
            v = rec_index.get((src, bbox_t))
        if v is None:
            n_missing += 1
            continue
        vecs.append(v)
    if len(vecs) < 3:
        return None, len(vecs), n_missing
    arr = np.stack(vecs).astype(np.float32)
    c = arr.mean(axis=0)
    n = float(np.linalg.norm(c))
    if n > 0:
        c = c / n
    return c, len(vecs), n_missing
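
# NOTE: connected_components() below is the single-link-style grouping this
# script moved away from (cmd_analyze now cuts a complete-linkage dendrogram
# instead); it is retained for reference/debugging and is not called anywhere
# in the current flow.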
def connected_components(adj: dict[int, set[int]]) -> list[list[int]]:
    seen: set[int] = set()
    comps = []
    for node in adj:
        if node in seen:
            continue
        stack = [node]
        comp = []
        while stack:
            x = stack.pop()
            if x in seen:
                continue
            seen.add(x)
            comp.append(x)
            for y in adj.get(x, set()):
                if y not in seen:
                    stack.append(y)
        comps.append(sorted(comp))
    return comps


# ----------------------------- analyze -----------------------------

def cmd_analyze(args):
    rec_index, alias_map = load_caches()

    # collect active facesets
    active = []
    for d in sorted(ROOT.iterdir()):
        if not d.is_dir() or d.name.startswith("_"):
            continue
        if is_era_split(d.name):
            continue
        active.append(d)
    print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)", file=sys.stderr)

    centroids: dict[str, np.ndarray] = {}
    sizes: dict[str, int] = {}
    skipped = []
    t0 = time.time()
    for fs in active:
        c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map)
        if c is None:
            skipped.append((fs.name, n_used, n_miss))
            continue
        centroids[fs.name] = c
        sizes[fs.name] = n_used
    print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; "
          f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr)
    if skipped:
        for n, u, m in skipped[:10]:
            print(f"  skip {n}: used={u} missing={m}", file=sys.stderr)
        if len(skipped) > 10:
            print(f"  ... +{len(skipped)-10} more", file=sys.stderr)

    names = sorted(centroids.keys())
    if not names:
        raise SystemExit("no centroids built")

    # similarity matrix
    M = np.stack([centroids[n] for n in names]).astype(np.float32)  # (N, 512), normalized
    sim = M @ M.T  # (N, N) cosine since unit-normalized
    np.clip(sim, -1.0, 1.0, out=sim)

    edge_thr = args.edge
    confident_thr = args.confident

    # complete-linkage agglomerative clustering on cosine distance.
    # Cut at the edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr.
    # This avoids the chaining problem of single-link / connected-components.
    dist = 1.0 - sim
    np.fill_diagonal(dist, 0.0)
    # symmetrize numerical noise
    dist = (dist + dist.T) / 2.0
    np.clip(dist, 0.0, 2.0, out=dist)
    cond = squareform(dist, checks=False)
    Z = linkage(cond, method="complete")
    cut_dist = 1.0 - edge_thr  # complete-link distance corresponds to (1 - min sim)
    labels = fcluster(Z, t=cut_dist, criterion="distance")  # 1-indexed cluster ids

    cluster_members: dict[int, list[int]] = {}
    for idx, lbl in enumerate(labels):
        cluster_members.setdefault(int(lbl), []).append(idx)
    comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1]

    n_pairs_in_groups = 0
    for c in comps:
        n_pairs_in_groups += len(c) * (len(c) - 1) // 2
    print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups "
          f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr)

    # pick primary per group: lowest tier number, then largest size
    groups_out = []
    for comp in comps:
        members = [names[i] for i in comp]
        members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x))
        primary = members_sorted[0]
        secondaries = members_sorted[1:]
        # gather pairwise sims within group
        pair_sims = []
        idx_of = {names[i]: i for i in comp}
        for a in members:
            for b in members:
                if a >= b:
                    continue
                pair_sims.append({"a": a, "b": b, "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)})
        # confidence: minimum within-group sim (the weakest link)
        min_link = min(p["sim"] for p in pair_sims)
        max_link = max(p["sim"] for p in pair_sims)
        confidence = "confident" if min_link >= confident_thr else "uncertain"
        groups_out.append({
            "primary": primary,
            "secondaries": secondaries,
            "members": members_sorted,
            "tiers": {n: faceset_tier(n) for n in members},
            "sizes": {n: sizes.get(n, 0) for n in members},
            "pair_sims": pair_sims,
            "min_link": round(min_link, 4),
            "max_link": round(max_link, 4),
            "confidence": confidence,
        })
    # sort: confident first, then by max_link desc
    groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"]))

    out = {
        "thresholds": {"edge": edge_thr, "confident": confident_thr},
        "n_active": len(active),
        "n_centroided": len(centroids),
        "n_skipped": len(skipped),
        "skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped],
        "n_groups": len(groups_out),
        "n_facesets_in_groups": sum(len(g["members"]) for g in groups_out),
        "groups": groups_out,
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(out, indent=2))
    confident = sum(1 for g in groups_out if g["confidence"] == "confident")
    uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain")
    print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}", file=sys.stderr)


# ----------------------------- report -----------------------------

def cmd_report(args):
    candidates = json.loads(Path(args.candidates).read_text())
    out_dir = Path(args.out)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(parents=True, exist_ok=True)

    THUMB = 140
    THUMBS_PER_FACESET = 4

    def make_thumb(faceset: str, fname: str) -> str:
        d = thumbs_dir / faceset
        d.mkdir(parents=True, exist_ok=True)
        dst = d / (Path(fname).stem + ".jpg")
        if not dst.exists():
            try:
                src = ROOT / faceset / "faces" / fname
                img = Image.open(src).convert("RGB")
                img.thumbnail((THUMB, THUMB), Image.LANCZOS)
                img.save(dst, "JPEG", quality=82)
            except Exception as e:
                print(f"[thumb-skip] {faceset}/{fname}: {e}", file=sys.stderr)
                return ""
        return f"thumbs/{faceset}/{Path(fname).stem}.jpg"

    rows = []
    for gi, g in enumerate(candidates["groups"]):
        primary = g["primary"]
        sec = g["secondaries"]
        conf_cls = "confident" if g["confidence"] == "confident" else "uncertain"
        rows.append(f"<section class='grp {conf_cls}' id='g{gi}'>")
        rows.append(f"<h2>group #{gi+1} <small>({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})</small></h2>")
        rows.append(f"<div class='plan'>merge <b>{', '.join(sec)}</b> → <b>{primary}</b></div>")
        # member rows
        for name in g["members"]:
            tier = g["tiers"][name]
            sz = g["sizes"][name]
            tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)]
            badge = "PRIMARY" if name == primary else "secondary"
            rows.append("<div class='member'>")
            rows.append(f"<div class='label'><span class='badge {badge.lower()}'>{badge}</span> "
                        f"<b>{name}</b> <small>tier={tier_label} · n={sz}</small></div>")
            rows.append("<div class='thumbs'>")
            faces_dir = ROOT / name / "faces"
            files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET]
            for f in files:
                rel = make_thumb(name, f.name)
                if rel:
                    rows.append(f"<img src='{rel}' loading='lazy' title='{f.name}'>")
            rows.append("</div></div>")
        # pairwise sims
        rows.append("<table class='sims'><tr><th>a</th><th>b</th><th>sim</th></tr>")
        for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]):
            cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid"
            rows.append(f"<tr><td>{ps['a']}</td><td>{ps['b']}</td><td class='{cls}'>{ps['sim']:.3f}</td></tr>")
        rows.append("</table>")
        rows.append("</section>")

    nav = " · ".join(f"<a href='#g{i}'>#{i+1}</a>" for i in range(len(candidates["groups"])))

    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Faceset merge review</title>
<style>
body {{ font-family: system-ui, sans-serif; background: #111; color: #eee; padding: 1em; }}
h1 {{ margin-top: 0; }}
h2 {{ margin: 0; }}
small {{ color: #999; font-weight: normal; }}
section.grp {{ background: #1a1a1a; border-radius: 6px; padding: 12px; margin: 12px 0; }}
section.grp.confident {{ border-left: 4px solid #5fa05f; }}
section.grp.uncertain {{ border-left: 4px solid #ffb050; }}
.plan {{ margin: .5em 0; color: #6cf; }}
.member {{ margin: 8px 0; padding: 6px; background: #222; border-radius: 4px; }}
.label {{ font-family: monospace; font-size: 13px; }}
.badge {{ display: inline-block; padding: 0 6px; font-size: 10px; border-radius: 2px; }}
.badge.primary {{ background: #5fa05f; color: #000; font-weight: bold; }}
.badge.secondary {{ background: #444; color: #ccc; }}
.thumbs {{ display: flex; gap: 4px; margin-top: 4px; flex-wrap: wrap; }}
.thumbs img {{ height: 140px; width: auto; border-radius: 3px; }}
table.sims {{ font-family: monospace; font-size: 11px; margin-top: 6px; border-collapse: collapse; }}
table.sims td, table.sims th {{ padding: 1px 8px; border: 1px solid #333; text-align: left; }}
table.sims td.hi {{ color: #5fa05f; font-weight: bold; }}
table.sims td.mid {{ color: #ffb050; }}
.nav {{ position: sticky; top: 0; background: #111; padding: .5em 0; border-bottom: 1px solid #333; font-size: 12px; }}
a {{ color: #6cf; }}
</style></head>
<body>
<h1>Merge review — {len(candidates['groups'])} candidate groups
<small>(edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})</small></h1>
<p>{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided
(skipped {candidates['n_skipped']} for too few cached embeddings).
Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.</p>
<div class='nav'>{nav}</div>
{''.join(rows)}
</body></html>"""

    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)


# ----------------------------- apply -----------------------------
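
# NB: _zip_png_list names archive members 0000.png, 0001.png, ... in rank order
# (zero-based), independent of the 0001-based face filenames on disk.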
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
    import zipfile
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
        for i, p in enumerate(pngs):
            zf.write(p, arcname=f"{i:04d}.png")

def cmd_apply(args):
    candidates = json.loads(Path(args.candidates).read_text())
    master_path = ROOT / "manifest.json"
    master = json.loads(master_path.read_text())
    by_name = {f["name"]: f for f in master.get("facesets", [])}

    # filter: skip "uncertain" groups unless --include-uncertain
    groups_all = candidates["groups"]
    accepted_idx = [i for i, g in enumerate(groups_all)
                    if g["confidence"] == "confident" or args.include_uncertain]
    skipped_unc = [g for g in groups_all
                   if g["confidence"] == "uncertain" and not args.include_uncertain]
    # explicit --only / --exclude filters (both take group indices in the candidates file)
    if args.only:
        only = {int(s) for s in args.only.split(",")}
        accepted_idx = [i for i in range(len(groups_all)) if i in only]
    if args.exclude:
        excl = {int(s) for s in args.exclude.split(",")}
        accepted_idx = [i for i in accepted_idx if i not in excl]
    accepted = [groups_all[i] for i in accepted_idx]

    print(f"[plan] {len(accepted)} groups will be merged "
          f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr)

    if args.dry_run:
        for g in accepted:
            print(f"  merge {g['secondaries']} -> {g['primary']} "
                  f"({g['confidence']}, min_sim={g['min_link']:.3f})")
        return

    merged_dir = ROOT / "_merged"
    merged_dir.mkdir(exist_ok=True)
    new_facesets: list[dict] = []
    new_merged: list[dict] = list(master.get("merged", []))
    consumed_names: set[str] = set()
    primary_updates: dict[str, dict] = {}  # name -> new entry

    for g in accepted:
        primary = g["primary"]
        if primary not in by_name:
            print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr)
            continue
        primary_dir = ROOT / primary
        if not primary_dir.is_dir():
            print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr)
            continue
        primary_faces = primary_dir / "faces"
        primary_manifest_path = primary_dir / "manifest.json"
        primary_manifest = json.loads(primary_manifest_path.read_text())

        # gather all face entries: primary + each secondary
        combined_faces: list[dict] = list(primary_manifest.get("faces", []))
        # tag provenance; ensure the key exists before the renumber pass pops it
        for f in combined_faces:
            f.setdefault("origin_faceset", primary)

        for sec in g["secondaries"]:
            sec_dir = ROOT / sec
            if not sec_dir.is_dir():
                print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr)
                continue
            sec_manifest_path = sec_dir / "manifest.json"
            sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []}
            for f in sec_manifest.get("faces", []):
                f = dict(f)
                f["origin_faceset"] = sec
                combined_faces.append(f)

        # rank by quality.composite descending; ties broken by lower cosd_centroid
        def sort_key(f):
            q = f.get("quality", {}).get("composite", 0)
            d = f.get("cosd_centroid", 1.0)
            return (-q, d)
        combined_faces.sort(key=sort_key)

        # renumber and stage PNGs into a fresh staging dir, then atomically swap
        staging = primary_dir / "_faces_new"
        if staging.exists():
            shutil.rmtree(staging)
        staging.mkdir()
        new_face_entries = []
        new_rank = 0
        for f in combined_faces:
            origin = f.pop("origin_faceset")
            old_png_name = Path(f["png"]).name  # f["png"] is e.g. "faces/0042.png"
            origin_png = ROOT / origin / "faces" / old_png_name
            if not origin_png.exists():
                # could be in _dropped if occlusion-pruned; skip without burning a rank
                continue
            new_rank += 1
            new_name = f"{new_rank:04d}.png"
            shutil.copy2(origin_png, staging / new_name)
            f = dict(f)
            f["rank"] = new_rank
            f["png"] = f"faces/{new_name}"
            f["origin_faceset"] = origin  # preserve provenance in manifest
            new_face_entries.append(f)

        # swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces
        old_faces_holding = primary_dir / "_faces_old"
        if old_faces_holding.exists():
            shutil.rmtree(old_faces_holding)
        if primary_faces.exists():
            primary_faces.rename(old_faces_holding)
        staging.rename(primary_faces)
        # migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible)
        old_dropped = old_faces_holding / "_dropped"
        if old_dropped.exists():
            (primary_faces / "_dropped").mkdir(exist_ok=True)
            for x in old_dropped.iterdir():
                shutil.move(str(x), str(primary_faces / "_dropped" / x.name))
        if old_faces_holding.exists():
            shutil.rmtree(old_faces_holding)

        # re-zip .fsz
        survivor_pngs = sorted(primary_faces.glob("*.png"))
        top_n = primary_manifest.get("top_n", 30)
        top_n_eff = min(top_n, len(survivor_pngs))
        # remove old .fsz files
        for old in primary_dir.glob("*.fsz"):
            old.unlink()
        top_fsz_name = f"{primary}_top{top_n_eff}.fsz"
        all_fsz_name = f"{primary}_all.fsz"
        _zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name)
        if len(survivor_pngs) > top_n_eff:
            _zip_png_list(survivor_pngs, primary_dir / all_fsz_name)
            all_fsz_used = all_fsz_name
        else:
            all_fsz_used = None

        # update primary's local manifest
        primary_manifest["faces"] = new_face_entries
        primary_manifest["exported"] = len(new_face_entries)
        primary_manifest["fsz_top"] = top_fsz_name
        primary_manifest["fsz_all"] = all_fsz_used
        primary_manifest["top_n"] = top_n_eff
        primary_manifest.setdefault("merge_history", []).append({
            "absorbed": g["secondaries"],
            "min_link": g["min_link"],
            "max_link": g["max_link"],
            "confidence": g["confidence"],
        })
        primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2))

        # move secondary directories into _merged/
        absorbed_master_entries: list[dict] = []
        for sec in g["secondaries"]:
            sec_dir = ROOT / sec
            target = merged_dir / sec
            if not sec_dir.is_dir():
                continue
            if target.exists():
                shutil.rmtree(sec_dir)  # already moved by a previous run; clean stub
            else:
                shutil.move(str(sec_dir), str(target))
            sec_master = dict(by_name.get(sec, {"name": sec}))
            sec_master["merged_into"] = primary
            sec_master["relpath"] = f"_merged/{sec}"
            sec_master["fsz_top"] = None
            sec_master["fsz_all"] = None
            absorbed_master_entries.append(sec_master)
            consumed_names.add(sec)

        new_merged.extend(absorbed_master_entries)

        # bump primary master entry
        prim_master = dict(by_name[primary])
        prim_master["exported"] = len(new_face_entries)
        prim_master["top_n"] = top_n_eff
        prim_master["fsz_top"] = top_fsz_name
        prim_master["fsz_all"] = all_fsz_used
        prim_master.setdefault("merge_history", []).append({
            "absorbed": g["secondaries"],
            "min_link": g["min_link"],
            "max_link": g["max_link"],
        })
        primary_updates[primary] = prim_master

        print(f"[merged] {g['secondaries']} -> {primary} "
              f"now {len(new_face_entries)} png", file=sys.stderr)

    # rebuild master facesets list
    for entry in master.get("facesets", []):
        nm = entry["name"]
        if nm in consumed_names:
            continue
        if nm in primary_updates:
            new_facesets.append(primary_updates[nm])
        else:
            new_facesets.append(entry)

    new_master = dict(master)
    new_master["facesets"] = new_facesets
    new_master["merged"] = new_merged
    new_master["merge_run"] = {
        "thresholds": candidates["thresholds"],
        "groups_applied": len(accepted),
        "facesets_consumed": len(consumed_names),
        "include_uncertain": bool(args.include_uncertain),
    }
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(new_master, indent=2))
    tmp.replace(master_path)
    print(f"[done] master manifest updated: {len(new_facesets)} active, "
          f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run",
          file=sys.stderr)


# ----------------------------- main -----------------------------

def main():
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd", required=True)

    a = sub.add_parser("analyze")
    a.add_argument("--out", required=True)
    a.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)")
    a.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)")
    a.set_defaults(func=cmd_analyze)

    r = sub.add_parser("report")
    r.add_argument("--candidates", required=True)
    r.add_argument("--out", required=True)
    r.set_defaults(func=cmd_report)

    p = sub.add_parser("apply")
    p.add_argument("--candidates", required=True)
    p.add_argument("--include-uncertain", action="store_true",
                   help="apply uncertain groups too (default: confident only)")
    p.add_argument("--only", default=None, help="comma-separated group indices to apply")
    p.add_argument("--exclude", default=None, help="comma-separated group indices to skip")
    p.add_argument("--dry-run", action="store_true")
    p.set_defaults(func=cmd_apply)

    args = ap.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()