"""Consolidate facesets_swap_ready/ — find duplicate identities and merge. Pipeline: 1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every active faceset (skipping _masked, _thin, era splits). Compute L2-normalized centroid per faceset. Build similarity graph at sim>=0.45, extract components. Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size. 2. report: HTML contact sheet at work/merge_review/index.html grouped by candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and "merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted. 3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries to facesets_swap_ready/_merged//, update master manifest with `merged[]` array + `merge_run` provenance block. Embeddings come from caches (no GPU re-embed needed); the original clusterer used exactly these vectors so they are the right yardstick. Era splits are excluded entirely (intentional time-period segmentation, not a duplication). """ from __future__ import annotations import argparse import json import re import shutil import sys import time from pathlib import Path import numpy as np from PIL import Image from scipy.cluster.hierarchy import linkage, fcluster from scipy.spatial.distance import squareform ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") CACHES = [ Path("/opt/face-sets/work/cache/nl_full.npz"), Path("/opt/face-sets/work/cache/immich_peter.npz"), Path("/opt/face-sets/work/cache/immich_nic.npz"), ] ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$") # ----------------------------- helpers ----------------------------- def load_caches(): """Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple) -> embedding (np.float32, shape (512,) L2-normalized). 
alias_map maps every alias path -> canonical path.""" rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} alias_map: dict[str, str] = {} n_total = 0 for c in CACHES: if not c.exists(): print(f"[warn] cache missing: {c}", file=sys.stderr) continue d = np.load(c, allow_pickle=True) emb = d["embeddings"] meta = json.loads(str(d["meta"])) face_records = [m for m in meta if not m.get("noface")] if len(face_records) != len(emb): raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}") # path_aliases may be present if "path_aliases" in d.files: paliases = json.loads(str(d["path_aliases"])) for canon, alist in paliases.items(): alias_map.setdefault(canon, canon) for a in alist: alias_map[a] = canon for i, rec in enumerate(face_records): p = rec["path"] bbox = tuple(int(x) for x in rec["bbox"]) v = emb[i].astype(np.float32) n = float(np.linalg.norm(v)) if n > 0: v = v / n rec_index[(p, bbox)] = v alias_map.setdefault(p, p) print(f"[cache] {c.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr) n_total += len(face_records) print(f"[cache] indexed {n_total} face records, {len(alias_map)} path aliases", file=sys.stderr) return rec_index, alias_map def faceset_tier(name: str) -> int: """Lower number = higher priority for primary selection.""" m = re.match(r"^faceset_0*(\d+)$", name) if not m: return 99 # unknown structure n = int(m.group(1)) if 13 <= n <= 19: return 0 # hand-sorted if 1 <= n <= 12: return 1 # auto-clustered if 20 <= n <= 25: return 2 # osrc if 26 <= n <= 264: return 3 # immich peter if 265 <= n: return 4 # immich nic and beyond return 99 def is_era_split(name: str) -> bool: return bool(ERA_SPLIT_RE.match(name)) def faceset_centroid(faceset_dir: Path, rec_index, alias_map): """Return (centroid, n_used, n_missing) where centroid is L2-normalized mean of embeddings of the faces listed in the per-faceset manifest. 
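# Shape of a per-faceset manifest.json as this script reads and writes it.
# The field set is inferred from the accesses below; the values are
# illustrative, not from any real faceset:
#
#   {
#     "top_n": 30,
#     "exported": 412,
#     "fsz_top": "faceset_0013_top30.fsz",
#     "fsz_all": "faceset_0013_all.fsz",
#     "faces": [
#       {
#         "png": "faces/0001.png",
#         "source": "/path/into/source/library/IMG_1234.jpg",
#         "bbox": [412, 118, 622, 360],
#         "quality": {"composite": 0.83},
#         "cosd_centroid": 0.21
#       }
#     ]
#   }
#
# faceset_centroid() only needs "source" + "bbox" to look up cached embeddings;
# cmd_apply() additionally reads "png", "quality.composite" and "cosd_centroid",
# and writes back "rank", "origin_faceset" and a "merge_history" block.
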
def faceset_centroid(faceset_dir: Path, rec_index, alias_map):
    """Return (centroid, n_used, n_missing).

    centroid is the L2-normalized mean of the embeddings of the faces listed
    in the per-faceset manifest; it is None if too few embeddings were found.
    """
    manifest = faceset_dir / "manifest.json"
    if not manifest.exists():
        return None, 0, 0
    m = json.loads(manifest.read_text())
    vecs = []
    n_missing = 0
    for f in m.get("faces", []):
        src = f.get("source")
        bbox = f.get("bbox")
        if src is None or bbox is None:
            n_missing += 1
            continue
        bbox_t = tuple(int(x) for x in bbox)
        canon = alias_map.get(src, src)
        v = rec_index.get((canon, bbox_t))
        if v is None and canon != src:
            v = rec_index.get((src, bbox_t))
        if v is None:
            n_missing += 1
            continue
        vecs.append(v)
    if len(vecs) < 3:
        return None, len(vecs), n_missing
    arr = np.stack(vecs).astype(np.float32)
    c = arr.mean(axis=0)
    n = float(np.linalg.norm(c))
    if n > 0:
        c = c / n
    return c, len(vecs), n_missing


def connected_components(adj: dict[int, set[int]]) -> list[list[int]]:
    """Plain DFS components. Kept for reference; grouping now uses the
    complete-linkage cut in cmd_analyze, which avoids single-link chaining."""
    seen: set[int] = set()
    comps = []
    for node in adj:
        if node in seen:
            continue
        stack = [node]
        comp = []
        while stack:
            x = stack.pop()
            if x in seen:
                continue
            seen.add(x)
            comp.append(x)
            for y in adj.get(x, set()):
                if y not in seen:
                    stack.append(y)
        comps.append(sorted(comp))
    return comps


# ----------------------------- analyze -----------------------------

def cmd_analyze(args):
    rec_index, alias_map = load_caches()

    # collect active facesets
    active = []
    for d in sorted(ROOT.iterdir()):
        if not d.is_dir() or d.name.startswith("_"):
            continue
        if is_era_split(d.name):
            continue
        active.append(d)
    print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)",
          file=sys.stderr)

    centroids: dict[str, np.ndarray] = {}
    sizes: dict[str, int] = {}
    skipped = []
    t0 = time.time()
    for fs in active:
        c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map)
        if c is None:
            skipped.append((fs.name, n_used, n_miss))
            continue
        centroids[fs.name] = c
        sizes[fs.name] = n_used
    print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; "
          f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr)
    if skipped:
        for n, u, m in skipped[:10]:
            print(f"  skip {n}: used={u} missing={m}", file=sys.stderr)
        if len(skipped) > 10:
            print(f"  ... +{len(skipped)-10} more", file=sys.stderr)

    names = sorted(centroids.keys())
    if len(names) < 2:
        raise SystemExit("no centroids to compare (need at least two centroided facesets)")

    # similarity matrix
    M = np.stack([centroids[n] for n in names]).astype(np.float32)  # (N, 512), normalized
    sim = M @ M.T  # (N, N) cosine since unit-normalized
    np.clip(sim, -1.0, 1.0, out=sim)

    edge_thr = args.edge
    confident_thr = args.confident

    # complete-linkage agglomerative clustering on cosine distance.
    # Cut at edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr.
    # This avoids the chaining problem of single-link / connected-components.
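    # For intuition, with made-up numbers (not from any real run): suppose
    # sim(A,B)=0.55, sim(B,C)=0.55, sim(A,C)=0.20 and edge_thr=0.45. A graph of
    # edges at sim>=0.45 would chain A-B-C into one component even though A and
    # C barely resemble each other. The complete-link cut at distance
    # 1-0.45=0.55 instead yields {A, B} (their distance 0.45 <= 0.55) and
    # leaves C alone, because merging C would raise the cluster's max pairwise
    # distance to 1-0.20=0.80, above the cut.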
    n = len(names)
    dist = 1.0 - sim
    np.fill_diagonal(dist, 0.0)
    # symmetrize numerical noise
    dist = (dist + dist.T) / 2.0
    np.clip(dist, 0.0, 2.0, out=dist)
    cond = squareform(dist, checks=False)
    Z = linkage(cond, method="complete")
    cut_dist = 1.0 - edge_thr  # complete-link distance corresponds to (1 - min sim)
    labels = fcluster(Z, t=cut_dist, criterion="distance")  # 1-indexed cluster ids

    cluster_members: dict[int, list[int]] = {}
    for idx, lbl in enumerate(labels):
        cluster_members.setdefault(int(lbl), []).append(idx)
    comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1]

    n_pairs_in_groups = 0
    for c in comps:
        n_pairs_in_groups += len(c) * (len(c) - 1) // 2
    print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups "
          f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr)

    # pick primary per group: lowest tier number, then largest size
    groups_out = []
    for comp in comps:
        members = [names[i] for i in comp]
        members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x))
        primary = members_sorted[0]
        secondaries = members_sorted[1:]

        # gather pairwise sims within group
        pair_sims = []
        idx_of = {names[i]: i for i in comp}
        for a in members:
            for b in members:
                if a >= b:
                    continue
                pair_sims.append({"a": a, "b": b,
                                  "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)})

        # confidence: minimum within-group sim (the weakest link)
        min_link = min(p["sim"] for p in pair_sims)
        max_link = max(p["sim"] for p in pair_sims)
        confidence = "confident" if min_link >= confident_thr else "uncertain"

        groups_out.append({
            "primary": primary,
            "secondaries": secondaries,
            "members": members_sorted,
            "tiers": {n: faceset_tier(n) for n in members},
            "sizes": {n: sizes.get(n, 0) for n in members},
            "pair_sims": pair_sims,
            "min_link": round(min_link, 4),
            "max_link": round(max_link, 4),
            "confidence": confidence,
        })

    # sort: confident first, then by max_link desc
    groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"]))

    out = {
        "thresholds": {"edge": edge_thr, "confident": confident_thr},
        "n_active": len(active),
        "n_centroided": len(centroids),
        "n_skipped": len(skipped),
        "skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped],
        "n_groups": len(groups_out),
        "n_facesets_in_groups": sum(len(g["members"]) for g in groups_out),
        "groups": groups_out,
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(out, indent=2))

    confident = sum(1 for g in groups_out if g["confidence"] == "confident")
    uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain")
    print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}",
          file=sys.stderr)


# ----------------------------- report -----------------------------
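# Shape of the candidates JSON written by `analyze` and consumed by `report` /
# `apply` (field names taken from cmd_analyze above; values and faceset names
# are illustrative):
#
#   {
#     "thresholds": {"edge": 0.45, "confident": 0.65},
#     "n_active": 210, "n_centroided": 198, "n_skipped": 12,
#     "skipped_reasons": [{"name": "faceset_0042", "used": 1, "missing": 7}],
#     "n_groups": 9, "n_facesets_in_groups": 21,
#     "groups": [
#       {
#         "primary": "faceset_0013",
#         "secondaries": ["faceset_0147"],
#         "members": ["faceset_0013", "faceset_0147"],
#         "tiers": {"faceset_0013": 0, "faceset_0147": 3},
#         "sizes": {"faceset_0013": 412, "faceset_0147": 65},
#         "pair_sims": [{"a": "faceset_0013", "b": "faceset_0147", "sim": 0.71}],
#         "min_link": 0.71, "max_link": 0.71,
#         "confidence": "confident"
#       }
#     ]
#   }
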
primary = g["primary"] sec = g["secondaries"] conf_cls = "confident" if g["confidence"] == "confident" else "uncertain" rows.append(f"
") rows.append(f"

group #{gi+1} ({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})

") rows.append(f"
merge {', '.join(sec)}{primary}
") # member rows for name in g["members"]: tier = g["tiers"][name] sz = g["sizes"][name] tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)] badge = "PRIMARY" if name == primary else "secondary" rows.append(f"
") rows.append(f"
{badge} " f"{name} tier={tier_label} · n={sz}
") rows.append("
") faces_dir = ROOT / name / "faces" files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET] for f in files: rel = make_thumb(name, f.name) if rel: rows.append(f"") rows.append("
") # pairwise sims rows.append("") for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]): cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid" rows.append(f"") rows.append("
absim
{ps['a']}{ps['b']}{ps['sim']:.3f}
") rows.append("
") nav = " · ".join(f"#{i+1}" for i in range(len(candidates["groups"]))) html = f""" Faceset merge review

Merge review — {len(candidates['groups'])} candidate groups (edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})

{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided (skipped {candidates['n_skipped']} for too few cached embeddings). Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.

{''.join(rows)} """ out_html = out_dir / "index.html" out_html.write_text(html) print(f"[done] {out_html}", file=sys.stderr) # ----------------------------- apply ----------------------------- def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: import zipfile with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: for i, p in enumerate(pngs): zf.write(p, arcname=f"{i:04d}.png") def cmd_apply(args): candidates = json.loads(Path(args.candidates).read_text()) master_path = ROOT / "manifest.json" master = json.loads(master_path.read_text()) by_name = {f["name"]: f for f in master.get("facesets", [])} # filter: skip "uncertain" groups unless --include-uncertain accepted = [g for g in candidates["groups"] if g["confidence"] == "confident" or args.include_uncertain] skipped_unc = [g for g in candidates["groups"] if g["confidence"] == "uncertain" and not args.include_uncertain] # explicit --exclude / --only filters (group indices in the candidates file) if args.only: only = {int(s) for s in args.only.split(",")} accepted = [g for i, g in enumerate(candidates["groups"]) if i in only] if args.exclude: excl = {int(s) for s in args.exclude.split(",")} accepted = [g for i, g in enumerate(accepted) if i not in excl] print(f"[plan] {len(accepted)} groups will be merged " f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr) if args.dry_run: for g in accepted: print(f" merge {g['secondaries']} -> {g['primary']} " f"({g['confidence']}, min_sim={g['min_link']:.3f})") return merged_dir = ROOT / "_merged" merged_dir.mkdir(exist_ok=True) new_facesets: list[dict] = [] new_merged: list[dict] = list(master.get("merged", [])) consumed_names: set[str] = set() primary_updates: dict[str, dict] = {} # name -> new entry primary_absorbed: dict[str, list[dict]] = {} # primary_name -> [secondary entries] for g in accepted: primary = g["primary"] if primary not in by_name: print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr) continue primary_dir = ROOT / primary if not primary_dir.is_dir(): print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr) continue primary_faces = primary_dir / "faces" primary_manifest_path = primary_dir / "manifest.json" primary_manifest = json.loads(primary_manifest_path.read_text()) # gather all face entries: primary + each secondary combined_faces: list[dict] = list(primary_manifest.get("faces", [])) # adjust composite quality fall-back: ensure key exists for f in combined_faces: f.setdefault("origin_faceset", primary) for sec in g["secondaries"]: sec_dir = ROOT / sec if not sec_dir.is_dir(): print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr) continue sec_manifest_path = sec_dir / "manifest.json" sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []} for f in sec_manifest.get("faces", []): f = dict(f) f["origin_faceset"] = sec combined_faces.append(f) # rank by quality.composite descending; ties broken by lower cosd_centroid def sort_key(f): q = f.get("quality", {}).get("composite", 0) d = f.get("cosd_centroid", 1.0) return (-q, d) combined_faces.sort(key=sort_key) # renumber and stage PNGs into a fresh staging dir, then atomically swap staging = primary_dir / "_faces_new" if staging.exists(): shutil.rmtree(staging) staging.mkdir() new_face_entries = [] for new_rank, f in enumerate(combined_faces, start=1): origin = f.pop("origin_faceset") old_png_rel = f["png"] # e.g. 
"faces/0042.png" old_png_name = Path(old_png_rel).name origin_png = ROOT / origin / "faces" / old_png_name if not origin_png.exists(): # could be in _dropped if occlusion-pruned; skip continue new_name = f"{new_rank:04d}.png" shutil.copy2(origin_png, staging / new_name) f = dict(f) f["rank"] = new_rank f["png"] = f"faces/{new_name}" f["origin_faceset"] = origin # preserve provenance in manifest new_face_entries.append(f) # swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces old_faces_holding = primary_dir / "_faces_old" if old_faces_holding.exists(): shutil.rmtree(old_faces_holding) if primary_faces.exists(): primary_faces.rename(old_faces_holding) staging.rename(primary_faces) # migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible) old_dropped = old_faces_holding / "_dropped" if old_dropped.exists(): (primary_faces / "_dropped").mkdir(exist_ok=True) for x in old_dropped.iterdir(): shutil.move(str(x), str(primary_faces / "_dropped" / x.name)) shutil.rmtree(old_faces_holding) # re-zip .fsz survivor_pngs = sorted(primary_faces.glob("*.png")) top_n = primary_manifest.get("top_n", 30) top_n_eff = min(top_n, len(survivor_pngs)) # remove old .fsz files for old in primary_dir.glob("*.fsz"): old.unlink() top_fsz_name = f"{primary}_top{top_n_eff}.fsz" all_fsz_name = f"{primary}_all.fsz" _zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name) if len(survivor_pngs) > top_n_eff: _zip_png_list(survivor_pngs, primary_dir / all_fsz_name) all_fsz_used = all_fsz_name else: all_fsz_used = None # update primary's local manifest primary_manifest["faces"] = new_face_entries primary_manifest["exported"] = len(new_face_entries) primary_manifest["fsz_top"] = top_fsz_name primary_manifest["fsz_all"] = all_fsz_used primary_manifest["top_n"] = top_n_eff primary_manifest.setdefault("merge_history", []).append({ "absorbed": g["secondaries"], "min_link": g["min_link"], "max_link": g["max_link"], "confidence": g["confidence"], }) primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2)) # move secondary directories into _merged/ absorbed_master_entries: list[dict] = [] for sec in g["secondaries"]: sec_dir = ROOT / sec target = merged_dir / sec if not sec_dir.is_dir(): continue if target.exists(): shutil.rmtree(sec_dir) # already moved by previous run; clean stub else: shutil.move(str(sec_dir), str(target)) sec_master = dict(by_name.get(sec, {"name": sec})) sec_master["merged_into"] = primary sec_master["relpath"] = f"_merged/{sec}" sec_master["fsz_top"] = None sec_master["fsz_all"] = None absorbed_master_entries.append(sec_master) consumed_names.add(sec) new_merged.extend(absorbed_master_entries) # bump primary master entry prim_master = dict(by_name[primary]) prim_master["exported"] = len(new_face_entries) prim_master["top_n"] = top_n_eff prim_master["fsz_top"] = top_fsz_name prim_master["fsz_all"] = all_fsz_used prim_master.setdefault("merge_history", []).append({ "absorbed": g["secondaries"], "min_link": g["min_link"], "max_link": g["max_link"], }) primary_updates[primary] = prim_master print(f"[merged] {g['secondaries']} -> {primary} " f"now {len(new_face_entries)} png", file=sys.stderr) # rebuild master facesets list for entry in master.get("facesets", []): nm = entry["name"] if nm in consumed_names: continue if nm in primary_updates: new_facesets.append(primary_updates[nm]) else: new_facesets.append(entry) new_master = dict(master) new_master["facesets"] = new_facesets new_master["merged"] = new_merged new_master["merge_run"] = { 
"thresholds": candidates["thresholds"], "groups_applied": len(accepted), "facesets_consumed": len(consumed_names), "include_uncertain": bool(args.include_uncertain), } tmp = master_path.with_suffix(".tmp.json") tmp.write_text(json.dumps(new_master, indent=2)) tmp.replace(master_path) print(f"[done] master manifest updated: {len(new_facesets)} active, " f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run", file=sys.stderr) # ----------------------------- main ----------------------------- def main(): ap = argparse.ArgumentParser() sub = ap.add_subparsers(dest="cmd", required=True) a = sub.add_parser("analyze") a.add_argument("--out", required=True) a.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)") a.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)") a.set_defaults(func=cmd_analyze) r = sub.add_parser("report") r.add_argument("--candidates", required=True) r.add_argument("--out", required=True) r.set_defaults(func=cmd_report) p = sub.add_parser("apply") p.add_argument("--candidates", required=True) p.add_argument("--include-uncertain", action="store_true", help="apply uncertain groups too (default: confident only)") p.add_argument("--only", default=None, help="comma-separated group indices to apply") p.add_argument("--exclude", default=None, help="comma-separated group indices to skip") p.add_argument("--dry-run", action="store_true") p.set_defaults(func=cmd_apply) args = ap.parse_args() args.func(args) if __name__ == "__main__": main()