Add post-export corpus maintenance pipeline

Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages inputs; Windows scores
  them via DirectML in the new C:\clip_dml_venv. Image-level threshold 0.7;
  faceset-level quarantine at 40% domain dominance. Scoring sketched after
  this list.

- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= edge (toy demo after this list).

- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5; sketched after this list). Anchors
  are not re-centered.

- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup in three
  passes: cross-family SHA256 byte-dedup (preserves intra-family era
  duplication; sketched after this list), within-faceset near-dup at
  sim >= 0.95, and a multi-face audit (the load-bearing roop invariant).
  Multi-face worker hits ~19 img/s on AMD Vega, about 7x embed_worker,
  because the input is 512x512 crops.
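
A minimal sketch of clip_worker.py's scoring step, assuming the open_clip
tag named above; the prompt strings are illustrative stand-ins and the
DirectML device plumbing is omitted:

    import open_clip
    import torch
    from PIL import Image

    # Illustrative occlusion domains; the shipped prompt list lives in
    # clip_worker.py.
    PROMPTS = ["a face wearing a medical mask",
               "a face wearing sunglasses",
               "an unobstructed human face"]

    model, _, preprocess = open_clip.create_model_and_transforms(
        "ViT-L-14", pretrained="dfn2b_s39b")
    model.eval()
    tokenizer = open_clip.get_tokenizer("ViT-L-14")
    with torch.no_grad():
        text_feats = model.encode_text(tokenizer(PROMPTS))
        text_feats /= text_feats.norm(dim=-1, keepdim=True)

    def domain_probs(png_path: str) -> list[float]:
        img = preprocess(Image.open(png_path).convert("RGB")).unsqueeze(0)
        with torch.no_grad():
            f = model.encode_image(img)
            f /= f.norm(dim=-1, keepdim=True)
            return (100.0 * f @ text_feats.T).softmax(dim=-1)[0].tolist()

    # image flagged when an occluded-domain prob >= 0.7; faceset quarantined
    # when >= 40% of its images land in one occluded domain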
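
Why complete-linkage, on a toy three-centroid case using the same scipy
calls as consolidate_facesets.py: A~B and B~C are similar but A~C is not,
so single linkage chains all three through B while complete linkage cuts
on the weakest pair:

    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform

    sim = np.array([[1.0, 0.5, 0.1],
                    [0.5, 1.0, 0.5],
                    [0.1, 0.5, 1.0]])      # cosine sims for A, B, C
    cond = squareform(1.0 - sim, checks=False)
    cut = 1.0 - 0.45                       # edge threshold from the script
    print(fcluster(linkage(cond, "single"), t=cut, criterion="distance"))
    # -> one cluster {A,B,C} even though sim(A,C) = 0.1
    print(fcluster(linkage(cond, "complete"), t=cut, criterion="distance"))
    # -> {A,B} and {C}: every within-group pair keeps sim >= 0.45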
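
The age_extend_001.py anchor check reduces to two comparisons per bucket;
a sketch with hypothetical names (assign_era, anchors), not the script's
actual internals:

    import numpy as np

    def assign_era(vec, year, anchors):
        """anchors: {bucket: (anchor_vec, anchor_year)}; vectors are
        unit-normalized and anchors stay frozen (never re-centered)."""
        best = None
        for bucket, (avec, ayear) in anchors.items():
            dist = 1.0 - float(np.dot(vec, avec))   # cosine distance
            if dist <= 0.40 and year is not None and abs(year - ayear) <= 5:
                if best is None or dist < best[1]:
                    best = (bucket, dist)
        return best[0] if best else None            # None -> leave unslotted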
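
The first dedup pass, sketched with a hypothetical family_of mapping: the
first family to claim a digest owns it, so byte-identical PNGs inside one
family (e.g. its era splits) survive while cross-family copies are pruned:

    import hashlib
    from pathlib import Path

    def cross_family_dupes(faceset_dirs: list[Path], family_of) -> list[Path]:
        seen: dict[str, str] = {}            # sha256 -> owning family
        drops: list[Path] = []
        for fs in faceset_dirs:
            fam = family_of(fs.name)
            for png in sorted((fs / "faces").glob("*.png")):
                digest = hashlib.sha256(png.read_bytes()).hexdigest()
                owner = seen.setdefault(digest, fam)
                if owner != fam:
                    drops.append(png)        # cross-family byte-duplicate
        return drops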

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks.

Three new docs/analysis/ writeups cover model choice, threshold rationale,
and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 15:41:18 +02:00
parent e66c97fd58
commit 49a43c7685
10 changed files with 3250 additions and 1 deletion

consolidate_facesets.py

@@ -0,0 +1,634 @@
"""Consolidate facesets_swap_ready/ — find duplicate identities and merge.
Pipeline:
1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every
active faceset (skipping _masked, _thin, era splits). Compute L2-normalized
centroid per faceset. Build similarity graph at sim>=0.45, extract components.
Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size.
2. report: HTML contact sheet at work/merge_review/index.html grouped by
candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and
"merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted.
3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite
descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries
to facesets_swap_ready/_merged/<name>/, update master manifest with
`merged[]` array + `merge_run` provenance block.
Embeddings come from caches (no GPU re-embed needed); the original clusterer used
exactly these vectors so they are the right yardstick. Era splits are excluded
entirely (intentional time-period segmentation, not a duplication).
"""
from __future__ import annotations
import argparse
import json
import re
import shutil
import sys
import time
from pathlib import Path
import numpy as np
from PIL import Image
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
CACHES = [
Path("/opt/face-sets/work/cache/nl_full.npz"),
Path("/opt/face-sets/work/cache/immich_peter.npz"),
Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$")
# ----------------------------- helpers -----------------------------
def load_caches():
"""Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple)
-> embedding (np.float32, shape (512,) L2-normalized).
alias_map maps every alias path -> canonical path."""
rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
alias_map: dict[str, str] = {}
n_total = 0
for c in CACHES:
if not c.exists():
print(f"[warn] cache missing: {c}", file=sys.stderr)
continue
d = np.load(c, allow_pickle=True)
emb = d["embeddings"]
meta = json.loads(str(d["meta"]))
face_records = [m for m in meta if not m.get("noface")]
if len(face_records) != len(emb):
raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}")
# path_aliases may be present
if "path_aliases" in d.files:
paliases = json.loads(str(d["path_aliases"]))
for canon, alist in paliases.items():
alias_map.setdefault(canon, canon)
for a in alist:
alias_map[a] = canon
for i, rec in enumerate(face_records):
p = rec["path"]
bbox = tuple(int(x) for x in rec["bbox"])
v = emb[i].astype(np.float32)
n = float(np.linalg.norm(v))
if n > 0:
v = v / n
rec_index[(p, bbox)] = v
alias_map.setdefault(p, p)
print(f"[cache] {c.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr)
n_total += len(face_records)
print(f"[cache] indexed {n_total} face records, {len(alias_map)} path aliases", file=sys.stderr)
return rec_index, alias_map
def faceset_tier(name: str) -> int:
"""Lower number = higher priority for primary selection."""
m = re.match(r"^faceset_0*(\d+)$", name)
if not m:
return 99 # unknown structure
n = int(m.group(1))
if 13 <= n <= 19:
return 0 # hand-sorted
if 1 <= n <= 12:
return 1 # auto-clustered
if 20 <= n <= 25:
return 2 # osrc
if 26 <= n <= 264:
return 3 # immich peter
if 265 <= n:
return 4 # immich nic and beyond
return 99
def is_era_split(name: str) -> bool:
return bool(ERA_SPLIT_RE.match(name))
def faceset_centroid(faceset_dir: Path, rec_index, alias_map):
"""Return (centroid, n_used, n_missing) where centroid is L2-normalized mean
of embeddings of the faces listed in the per-faceset manifest. Falls back to
None if too few embeddings found."""
manifest = faceset_dir / "manifest.json"
if not manifest.exists():
return None, 0, 0
m = json.loads(manifest.read_text())
vecs = []
n_missing = 0
for f in m.get("faces", []):
src = f.get("source")
bbox = f.get("bbox")
if src is None or bbox is None:
n_missing += 1
continue
bbox_t = tuple(int(x) for x in bbox)
canon = alias_map.get(src, src)
v = rec_index.get((canon, bbox_t))
if v is None and canon != src:
v = rec_index.get((src, bbox_t))
if v is None:
n_missing += 1
continue
vecs.append(v)
if len(vecs) < 3:
return None, len(vecs), n_missing
arr = np.stack(vecs).astype(np.float32)
c = arr.mean(axis=0)
n = float(np.linalg.norm(c))
if n > 0:
c = c / n
return c, len(vecs), n_missing
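# NOTE: retained from the earlier similarity-graph approach; cmd_analyze now
# clusters with complete-linkage fcluster and no longer calls this helper.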
def connected_components(adj: dict[int, set[int]]) -> list[list[int]]:
seen: set[int] = set()
comps = []
for node in adj:
if node in seen:
continue
stack = [node]
comp = []
while stack:
x = stack.pop()
if x in seen:
continue
seen.add(x)
comp.append(x)
for y in adj.get(x, set()):
if y not in seen:
stack.append(y)
comps.append(sorted(comp))
return comps
# ----------------------------- analyze -----------------------------
def cmd_analyze(args):
rec_index, alias_map = load_caches()
# collect active facesets
active = []
for d in sorted(ROOT.iterdir()):
if not d.is_dir() or d.name.startswith("_"):
continue
if is_era_split(d.name):
continue
active.append(d)
print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)", file=sys.stderr)
centroids: dict[str, np.ndarray] = {}
sizes: dict[str, int] = {}
skipped = []
t0 = time.time()
for fs in active:
c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map)
if c is None:
skipped.append((fs.name, n_used, n_miss))
continue
centroids[fs.name] = c
sizes[fs.name] = n_used
print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; "
f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr)
if skipped:
for n, u, m in skipped[:10]:
print(f" skip {n}: used={u} missing={m}", file=sys.stderr)
if len(skipped) > 10:
print(f" ... +{len(skipped)-10} more", file=sys.stderr)
names = sorted(centroids.keys())
if not names:
raise SystemExit("no centroids built")
# similarity matrix
M = np.stack([centroids[n] for n in names]).astype(np.float32) # (N, 512), normalized
sim = M @ M.T # (N, N) cosine since unit-normalized
np.clip(sim, -1.0, 1.0, out=sim)
edge_thr = args.edge
confident_thr = args.confident
# complete-linkage agglomerative clustering on cosine distance.
# Cut at edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr.
# This avoids the chaining problem of single-link / connected-components.
n = len(names)
dist = 1.0 - sim
np.fill_diagonal(dist, 0.0)
# symmetrize numerical noise
dist = (dist + dist.T) / 2.0
np.clip(dist, 0.0, 2.0, out=dist)
cond = squareform(dist, checks=False)
Z = linkage(cond, method="complete")
cut_dist = 1.0 - edge_thr # complete-link distance corresponds to (1 - min sim)
labels = fcluster(Z, t=cut_dist, criterion="distance") # 1-indexed cluster ids
cluster_members: dict[int, list[int]] = {}
for idx, lbl in enumerate(labels):
cluster_members.setdefault(int(lbl), []).append(idx)
comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1]
n_pairs_in_groups = 0
for c in comps:
n_pairs_in_groups += len(c) * (len(c) - 1) // 2
print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups "
f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr)
# pick primary per group: lowest tier number, then largest size
groups_out = []
for comp in comps:
members = [names[i] for i in comp]
members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x))
primary = members_sorted[0]
secondaries = members_sorted[1:]
# gather pairwise sims within group
pair_sims = []
idx_of = {names[i]: i for i in comp}
for a in members:
for b in members:
if a >= b:
continue
pair_sims.append({"a": a, "b": b, "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)})
# confidence: minimum within-group sim (the weakest link)
min_link = min(p["sim"] for p in pair_sims)
max_link = max(p["sim"] for p in pair_sims)
confidence = "confident" if min_link >= confident_thr else "uncertain"
groups_out.append({
"primary": primary,
"secondaries": secondaries,
"members": members_sorted,
"tiers": {n: faceset_tier(n) for n in members},
"sizes": {n: sizes.get(n, 0) for n in members},
"pair_sims": pair_sims,
"min_link": round(min_link, 4),
"max_link": round(max_link, 4),
"confidence": confidence,
})
# sort: confident first, then by max_link desc
groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"]))
out = {
"thresholds": {"edge": edge_thr, "confident": confident_thr},
"n_active": len(active),
"n_centroided": len(centroids),
"n_skipped": len(skipped),
"skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped],
"n_groups": len(groups_out),
"n_facesets_in_groups": sum(len(g["members"]) for g in groups_out),
"groups": groups_out,
}
op = Path(args.out)
op.parent.mkdir(parents=True, exist_ok=True)
op.write_text(json.dumps(out, indent=2))
confident = sum(1 for g in groups_out if g["confidence"] == "confident")
uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain")
print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}", file=sys.stderr)
# ----------------------------- report -----------------------------
def cmd_report(args):
candidates = json.loads(Path(args.candidates).read_text())
out_dir = Path(args.out)
thumbs_dir = out_dir / "thumbs"
thumbs_dir.mkdir(parents=True, exist_ok=True)
THUMB = 140
THUMBS_PER_FACESET = 4
def make_thumb(faceset: str, fname: str) -> str:
d = thumbs_dir / faceset
d.mkdir(parents=True, exist_ok=True)
dst = d / (Path(fname).stem + ".jpg")
if not dst.exists():
try:
src = ROOT / faceset / "faces" / fname
img = Image.open(src).convert("RGB")
img.thumbnail((THUMB, THUMB), Image.LANCZOS)
img.save(dst, "JPEG", quality=82)
except Exception as e:
print(f"[thumb-skip] {faceset}/{fname}: {e}", file=sys.stderr)
return ""
return f"thumbs/{faceset}/{Path(fname).stem}.jpg"
rows = []
for gi, g in enumerate(candidates["groups"]):
primary = g["primary"]
sec = g["secondaries"]
conf_cls = "confident" if g["confidence"] == "confident" else "uncertain"
rows.append(f"<section class='grp {conf_cls}' id='g{gi}'>")
rows.append(f"<h2>group #{gi+1} <small>({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})</small></h2>")
rows.append(f"<div class='plan'>merge <b>{', '.join(sec)}</b> &rarr; <b>{primary}</b></div>")
# member rows
for name in g["members"]:
tier = g["tiers"][name]
sz = g["sizes"][name]
tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)]
badge = "PRIMARY" if name == primary else "secondary"
rows.append(f"<div class='member'>")
rows.append(f"<div class='label'><span class='badge {badge.lower()}'>{badge}</span> "
f"<b>{name}</b> <small>tier={tier_label} · n={sz}</small></div>")
rows.append("<div class='thumbs'>")
faces_dir = ROOT / name / "faces"
files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET]
for f in files:
rel = make_thumb(name, f.name)
if rel:
rows.append(f"<img src='{rel}' loading='lazy' title='{f.name}'>")
rows.append("</div></div>")
# pairwise sims
rows.append("<table class='sims'><tr><th>a</th><th>b</th><th>sim</th></tr>")
for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]):
cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid"
rows.append(f"<tr><td>{ps['a']}</td><td>{ps['b']}</td><td class='{cls}'>{ps['sim']:.3f}</td></tr>")
rows.append("</table>")
rows.append("</section>")
nav = " · ".join(f"<a href='#g{i}'>#{i+1}</a>" for i in range(len(candidates["groups"])))
html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Faceset merge review</title>
<style>
body {{ font-family: system-ui, sans-serif; background: #111; color: #eee; padding: 1em; }}
h1 {{ margin-top: 0; }}
h2 {{ margin: 0; }}
small {{ color: #999; font-weight: normal; }}
section.grp {{ background: #1a1a1a; border-radius: 6px; padding: 12px; margin: 12px 0; }}
section.grp.confident {{ border-left: 4px solid #5fa05f; }}
section.grp.uncertain {{ border-left: 4px solid #ffb050; }}
.plan {{ margin: .5em 0; color: #6cf; }}
.member {{ margin: 8px 0; padding: 6px; background: #222; border-radius: 4px; }}
.label {{ font-family: monospace; font-size: 13px; }}
.badge {{ display: inline-block; padding: 0 6px; font-size: 10px; border-radius: 2px; }}
.badge.primary {{ background: #5fa05f; color: #000; font-weight: bold; }}
.badge.secondary {{ background: #444; color: #ccc; }}
.thumbs {{ display: flex; gap: 4px; margin-top: 4px; flex-wrap: wrap; }}
.thumbs img {{ height: 140px; width: auto; border-radius: 3px; }}
table.sims {{ font-family: monospace; font-size: 11px; margin-top: 6px; border-collapse: collapse; }}
table.sims td, table.sims th {{ padding: 1px 8px; border: 1px solid #333; text-align: left; }}
table.sims td.hi {{ color: #5fa05f; font-weight: bold; }}
table.sims td.mid {{ color: #ffb050; }}
.nav {{ position: sticky; top: 0; background: #111; padding: .5em 0; border-bottom: 1px solid #333; font-size: 12px; }}
a {{ color: #6cf; }}
</style></head>
<body>
<h1>Merge review &mdash; {len(candidates['groups'])} candidate groups
<small>(edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})</small></h1>
<p>{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided
(skipped {candidates['n_skipped']} for too few cached embeddings).
Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.</p>
<div class='nav'>{nav}</div>
{''.join(rows)}
</body></html>"""
out_html = out_dir / "index.html"
out_html.write_text(html)
print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- apply -----------------------------
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def cmd_apply(args):
candidates = json.loads(Path(args.candidates).read_text())
master_path = ROOT / "manifest.json"
master = json.loads(master_path.read_text())
by_name = {f["name"]: f for f in master.get("facesets", [])}
    # filter: skip "uncertain" groups unless --include-uncertain
    indexed = list(enumerate(candidates["groups"]))
    accepted = [(i, g) for i, g in indexed
                if g["confidence"] == "confident" or args.include_uncertain]
    skipped_unc = [g for _, g in indexed
                   if g["confidence"] == "uncertain" and not args.include_uncertain]
    # explicit --exclude / --only filters (group indices in the candidates file)
    if args.only:
        only = {int(s) for s in args.only.split(",")}
        accepted = [(i, g) for i, g in indexed if i in only]
    if args.exclude:
        excl = {int(s) for s in args.exclude.split(",")}
        # indices always refer to positions in the candidates file, even after
        # the uncertain/--only filtering above
        accepted = [(i, g) for i, g in accepted if i not in excl]
    accepted = [g for _, g in accepted]
print(f"[plan] {len(accepted)} groups will be merged "
f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr)
if args.dry_run:
for g in accepted:
print(f" merge {g['secondaries']} -> {g['primary']} "
f"({g['confidence']}, min_sim={g['min_link']:.3f})")
return
merged_dir = ROOT / "_merged"
merged_dir.mkdir(exist_ok=True)
new_facesets: list[dict] = []
new_merged: list[dict] = list(master.get("merged", []))
consumed_names: set[str] = set()
primary_updates: dict[str, dict] = {} # name -> new entry
for g in accepted:
primary = g["primary"]
if primary not in by_name:
print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr)
continue
primary_dir = ROOT / primary
if not primary_dir.is_dir():
print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr)
continue
primary_faces = primary_dir / "faces"
primary_manifest_path = primary_dir / "manifest.json"
primary_manifest = json.loads(primary_manifest_path.read_text())
# gather all face entries: primary + each secondary
combined_faces: list[dict] = list(primary_manifest.get("faces", []))
        # tag provenance: ensure every primary face carries origin_faceset
        for f in combined_faces:
            f.setdefault("origin_faceset", primary)
for sec in g["secondaries"]:
sec_dir = ROOT / sec
if not sec_dir.is_dir():
print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr)
continue
sec_manifest_path = sec_dir / "manifest.json"
sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []}
for f in sec_manifest.get("faces", []):
f = dict(f)
f["origin_faceset"] = sec
combined_faces.append(f)
# rank by quality.composite descending; ties broken by lower cosd_centroid
def sort_key(f):
q = f.get("quality", {}).get("composite", 0)
d = f.get("cosd_centroid", 1.0)
return (-q, d)
combined_faces.sort(key=sort_key)
# renumber and stage PNGs into a fresh staging dir, then atomically swap
staging = primary_dir / "_faces_new"
if staging.exists():
shutil.rmtree(staging)
staging.mkdir()
        new_face_entries = []
        new_rank = 0
        for f in combined_faces:
            origin = f.pop("origin_faceset")
            old_png_rel = f["png"]  # e.g. "faces/0042.png"
            old_png_name = Path(old_png_rel).name
            origin_png = ROOT / origin / "faces" / old_png_name
            if not origin_png.exists():
                # could be in _dropped if occlusion-pruned; skip without
                # consuming a rank so numbering stays contiguous 0001..NNNN
                continue
            new_rank += 1
            new_name = f"{new_rank:04d}.png"
            shutil.copy2(origin_png, staging / new_name)
            f = dict(f)
            f["rank"] = new_rank
            f["png"] = f"faces/{new_name}"
            f["origin_faceset"] = origin  # preserve provenance in manifest
            new_face_entries.append(f)
# swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces
old_faces_holding = primary_dir / "_faces_old"
if old_faces_holding.exists():
shutil.rmtree(old_faces_holding)
if primary_faces.exists():
primary_faces.rename(old_faces_holding)
staging.rename(primary_faces)
# migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible)
old_dropped = old_faces_holding / "_dropped"
if old_dropped.exists():
(primary_faces / "_dropped").mkdir(exist_ok=True)
for x in old_dropped.iterdir():
shutil.move(str(x), str(primary_faces / "_dropped" / x.name))
shutil.rmtree(old_faces_holding)
# re-zip .fsz
survivor_pngs = sorted(primary_faces.glob("*.png"))
top_n = primary_manifest.get("top_n", 30)
top_n_eff = min(top_n, len(survivor_pngs))
# remove old .fsz files
for old in primary_dir.glob("*.fsz"):
old.unlink()
top_fsz_name = f"{primary}_top{top_n_eff}.fsz"
all_fsz_name = f"{primary}_all.fsz"
_zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name)
if len(survivor_pngs) > top_n_eff:
_zip_png_list(survivor_pngs, primary_dir / all_fsz_name)
all_fsz_used = all_fsz_name
else:
all_fsz_used = None
# update primary's local manifest
primary_manifest["faces"] = new_face_entries
primary_manifest["exported"] = len(new_face_entries)
primary_manifest["fsz_top"] = top_fsz_name
primary_manifest["fsz_all"] = all_fsz_used
primary_manifest["top_n"] = top_n_eff
primary_manifest.setdefault("merge_history", []).append({
"absorbed": g["secondaries"],
"min_link": g["min_link"],
"max_link": g["max_link"],
"confidence": g["confidence"],
})
primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2))
# move secondary directories into _merged/
absorbed_master_entries: list[dict] = []
for sec in g["secondaries"]:
sec_dir = ROOT / sec
target = merged_dir / sec
if not sec_dir.is_dir():
continue
if target.exists():
shutil.rmtree(sec_dir) # already moved by previous run; clean stub
else:
shutil.move(str(sec_dir), str(target))
sec_master = dict(by_name.get(sec, {"name": sec}))
sec_master["merged_into"] = primary
sec_master["relpath"] = f"_merged/{sec}"
sec_master["fsz_top"] = None
sec_master["fsz_all"] = None
absorbed_master_entries.append(sec_master)
consumed_names.add(sec)
new_merged.extend(absorbed_master_entries)
# bump primary master entry
prim_master = dict(by_name[primary])
prim_master["exported"] = len(new_face_entries)
prim_master["top_n"] = top_n_eff
prim_master["fsz_top"] = top_fsz_name
prim_master["fsz_all"] = all_fsz_used
prim_master.setdefault("merge_history", []).append({
"absorbed": g["secondaries"],
"min_link": g["min_link"],
"max_link": g["max_link"],
})
primary_updates[primary] = prim_master
print(f"[merged] {g['secondaries']} -> {primary} "
f"now {len(new_face_entries)} png", file=sys.stderr)
# rebuild master facesets list
for entry in master.get("facesets", []):
nm = entry["name"]
if nm in consumed_names:
continue
if nm in primary_updates:
new_facesets.append(primary_updates[nm])
else:
new_facesets.append(entry)
new_master = dict(master)
new_master["facesets"] = new_facesets
new_master["merged"] = new_merged
new_master["merge_run"] = {
"thresholds": candidates["thresholds"],
"groups_applied": len(accepted),
"facesets_consumed": len(consumed_names),
"include_uncertain": bool(args.include_uncertain),
}
tmp = master_path.with_suffix(".tmp.json")
tmp.write_text(json.dumps(new_master, indent=2))
tmp.replace(master_path)
print(f"[done] master manifest updated: {len(new_facesets)} active, "
f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run",
file=sys.stderr)
# ----------------------------- main -----------------------------
def main():
ap = argparse.ArgumentParser()
sub = ap.add_subparsers(dest="cmd", required=True)
a = sub.add_parser("analyze")
a.add_argument("--out", required=True)
a.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)")
a.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)")
a.set_defaults(func=cmd_analyze)
r = sub.add_parser("report")
r.add_argument("--candidates", required=True)
r.add_argument("--out", required=True)
r.set_defaults(func=cmd_report)
p = sub.add_parser("apply")
p.add_argument("--candidates", required=True)
p.add_argument("--include-uncertain", action="store_true",
help="apply uncertain groups too (default: confident only)")
p.add_argument("--only", default=None, help="comma-separated group indices to apply")
p.add_argument("--exclude", default=None, help="comma-separated group indices to skip")
p.add_argument("--dry-run", action="store_true")
p.set_defaults(func=cmd_apply)
args = ap.parse_args()
args.func(args)
if __name__ == "__main__":
main()