Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via
  new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level quarantine at
  40% domain dominance.
- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= edge.
- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered.
- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three
  passes — cross-family SHA256 byte-dedup (preserves intra-family era
  duplication), within-faceset near-dup at sim >= 0.95, and a multi-face audit
  (the load-bearing roop invariant). Multi-face worker hits ~19 img/s on AMD
  Vega — ~7x embed_worker because input is 512x512 crops.

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes are
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks. Three new docs/analysis/ writeups cover model choice,
threshold rationale, and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
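A minimal toy check of the linkage claim (hypothetical similarities; the same scipy routines consolidate_facesets.py uses): with sims A-B = 0.5, B-C = 0.5, A-C = -0.1, a single-linkage cut at 0.45 chains all three through B, while complete-linkage keeps C out because the cut applies to the worst within-group pair.

    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform

    sim = np.array([[ 1.0, 0.5, -0.1],
                    [ 0.5, 1.0,  0.5],
                    [-0.1, 0.5,  1.0]])  # A-C is clearly a different identity
    dist = 1.0 - sim
    np.fill_diagonal(dist, 0.0)
    cond = squareform(dist, checks=False)
    cut = 1.0 - 0.45  # same edge threshold the analyze step uses
    print(fcluster(linkage(cond, "single"),   t=cut, criterion="distance"))  # one chained cluster: [1 1 1]
    print(fcluster(linkage(cond, "complete"), t=cut, criterion="distance"))  # A,B together; C alone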
635 lines
26 KiB
Python
"""Consolidate facesets_swap_ready/ — find duplicate identities and merge.
|
|
|
|
Pipeline:
|
|
1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every
|
|
active faceset (skipping _masked, _thin, era splits). Compute L2-normalized
|
|
centroid per faceset. Build similarity graph at sim>=0.45, extract components.
|
|
Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size.
|
|
2. report: HTML contact sheet at work/merge_review/index.html grouped by
|
|
candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and
|
|
"merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted.
|
|
3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite
|
|
descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries
|
|
to facesets_swap_ready/_merged/<name>/, update master manifest with
|
|
`merged[]` array + `merge_run` provenance block.
|
|
|
|
Embeddings come from caches (no GPU re-embed needed); the original clusterer used
|
|
exactly these vectors so they are the right yardstick. Era splits are excluded
|
|
entirely (intentional time-period segmentation, not a duplication).
|
|
"""
from __future__ import annotations

import argparse
import json
import re
import shutil
import sys
import time
from pathlib import Path

import numpy as np
from PIL import Image
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]

ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$")


# ----------------------------- helpers -----------------------------

def load_caches():
    """Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple)
    -> embedding (np.float32, shape (512,), L2-normalized).
    alias_map maps every alias path -> canonical path."""
    rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
    alias_map: dict[str, str] = {}
    n_total = 0
    for c in CACHES:
        if not c.exists():
            print(f"[warn] cache missing: {c}", file=sys.stderr)
            continue
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        meta = json.loads(str(d["meta"]))
        face_records = [m for m in meta if not m.get("noface")]
        if len(face_records) != len(emb):
            raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}")
        # path_aliases may be present
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            p = rec["path"]
            bbox = tuple(int(x) for x in rec["bbox"])
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n
            rec_index[(p, bbox)] = v
            alias_map.setdefault(p, p)
        print(f"[cache] {c.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr)
        n_total += len(face_records)
    print(f"[cache] indexed {n_total} face records, {len(alias_map)} path aliases", file=sys.stderr)
    return rec_index, alias_map

def faceset_tier(name: str) -> int:
    """Lower number = higher priority for primary selection."""
    m = re.match(r"^faceset_0*(\d+)$", name)
    if not m:
        return 99  # unknown structure
    n = int(m.group(1))
    if 13 <= n <= 19:
        return 0  # hand-sorted
    if 1 <= n <= 12:
        return 1  # auto-clustered
    if 20 <= n <= 25:
        return 2  # osrc
    if 26 <= n <= 264:
        return 3  # immich peter
    if n >= 265:
        return 4  # immich nic and beyond
    return 99
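
# Example of the primary-selection ordering this feeds (tier first, size second):
# faceset_014 (tier 0, hand-sorted) out-ranks faceset_003 (tier 1, auto) even if
# faceset_003 has far more faces; size only breaks ties within the same tier.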

def is_era_split(name: str) -> bool:
    return bool(ERA_SPLIT_RE.match(name))


def faceset_centroid(faceset_dir: Path, rec_index, alias_map):
    """Return (centroid, n_used, n_missing) where centroid is the L2-normalized
    mean of the embeddings of the faces listed in the per-faceset manifest.
    Falls back to None if too few embeddings are found."""
    manifest = faceset_dir / "manifest.json"
    if not manifest.exists():
        return None, 0, 0
    m = json.loads(manifest.read_text())
    vecs = []
    n_missing = 0
    for f in m.get("faces", []):
        src = f.get("source")
        bbox = f.get("bbox")
        if src is None or bbox is None:
            n_missing += 1
            continue
        bbox_t = tuple(int(x) for x in bbox)
        canon = alias_map.get(src, src)
        v = rec_index.get((canon, bbox_t))
        if v is None and canon != src:
            v = rec_index.get((src, bbox_t))
        if v is None:
            n_missing += 1
            continue
        vecs.append(v)
    if len(vecs) < 3:
        return None, len(vecs), n_missing
    arr = np.stack(vecs).astype(np.float32)
    c = arr.mean(axis=0)
    n = float(np.linalg.norm(c))
    if n > 0:
        c = c / n
    return c, len(vecs), n_missing
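
# NOTE: connected_components() below is the single-link-style grouping this
# script moved away from (cmd_analyze now cuts a complete-linkage dendrogram
# instead); it is retained for reference/debugging and is not called anywhere
# in the current flow.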
def connected_components(adj: dict[int, set[int]]) -> list[list[int]]:
    seen: set[int] = set()
    comps = []
    for node in adj:
        if node in seen:
            continue
        stack = [node]
        comp = []
        while stack:
            x = stack.pop()
            if x in seen:
                continue
            seen.add(x)
            comp.append(x)
            for y in adj.get(x, set()):
                if y not in seen:
                    stack.append(y)
        comps.append(sorted(comp))
    return comps


# ----------------------------- analyze -----------------------------

def cmd_analyze(args):
    rec_index, alias_map = load_caches()

    # collect active facesets
    active = []
    for d in sorted(ROOT.iterdir()):
        if not d.is_dir() or d.name.startswith("_"):
            continue
        if is_era_split(d.name):
            continue
        active.append(d)
    print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)", file=sys.stderr)

    centroids: dict[str, np.ndarray] = {}
    sizes: dict[str, int] = {}
    skipped = []
    t0 = time.time()
    for fs in active:
        c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map)
        if c is None:
            skipped.append((fs.name, n_used, n_miss))
            continue
        centroids[fs.name] = c
        sizes[fs.name] = n_used
    print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; "
          f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr)
    if skipped:
        for n, u, m in skipped[:10]:
            print(f"  skip {n}: used={u} missing={m}", file=sys.stderr)
        if len(skipped) > 10:
            print(f"  ... +{len(skipped)-10} more", file=sys.stderr)

    names = sorted(centroids.keys())
    if not names:
        raise SystemExit("no centroids built")

    # similarity matrix
    M = np.stack([centroids[n] for n in names]).astype(np.float32)  # (N, 512), normalized
    sim = M @ M.T  # (N, N) cosine since unit-normalized
    np.clip(sim, -1.0, 1.0, out=sim)

    edge_thr = args.edge
    confident_thr = args.confident

    # complete-linkage agglomerative clustering on cosine distance.
    # Cut at the edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr.
    # This avoids the chaining problem of single-link / connected-components.
    dist = 1.0 - sim
    np.fill_diagonal(dist, 0.0)
    # symmetrize numerical noise
    dist = (dist + dist.T) / 2.0
    np.clip(dist, 0.0, 2.0, out=dist)
    cond = squareform(dist, checks=False)
    Z = linkage(cond, method="complete")
    cut_dist = 1.0 - edge_thr  # complete-link distance corresponds to (1 - min sim)
    labels = fcluster(Z, t=cut_dist, criterion="distance")  # 1-indexed cluster ids

    cluster_members: dict[int, list[int]] = {}
    for idx, lbl in enumerate(labels):
        cluster_members.setdefault(int(lbl), []).append(idx)
    comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1]

    n_pairs_in_groups = 0
    for c in comps:
        n_pairs_in_groups += len(c) * (len(c) - 1) // 2
    print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups "
          f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr)

    # pick primary per group: lowest tier number, then largest size
    groups_out = []
    for comp in comps:
        members = [names[i] for i in comp]
        members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x))
        primary = members_sorted[0]
        secondaries = members_sorted[1:]
        # gather pairwise sims within group
        pair_sims = []
        idx_of = {names[i]: i for i in comp}
        for a in members:
            for b in members:
                if a >= b:
                    continue
                pair_sims.append({"a": a, "b": b, "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)})
        # confidence: minimum within-group sim (the weakest link)
        min_link = min(p["sim"] for p in pair_sims)
        max_link = max(p["sim"] for p in pair_sims)
        confidence = "confident" if min_link >= confident_thr else "uncertain"
        groups_out.append({
            "primary": primary,
            "secondaries": secondaries,
            "members": members_sorted,
            "tiers": {n: faceset_tier(n) for n in members},
            "sizes": {n: sizes.get(n, 0) for n in members},
            "pair_sims": pair_sims,
            "min_link": round(min_link, 4),
            "max_link": round(max_link, 4),
            "confidence": confidence,
        })
    # sort: confident first, then by max_link desc
    groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"]))

    out = {
        "thresholds": {"edge": edge_thr, "confident": confident_thr},
        "n_active": len(active),
        "n_centroided": len(centroids),
        "n_skipped": len(skipped),
        "skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped],
        "n_groups": len(groups_out),
        "n_facesets_in_groups": sum(len(g["members"]) for g in groups_out),
        "groups": groups_out,
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(out, indent=2))
    confident = sum(1 for g in groups_out if g["confidence"] == "confident")
    uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain")
    print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}", file=sys.stderr)


# ----------------------------- report -----------------------------

def cmd_report(args):
    candidates = json.loads(Path(args.candidates).read_text())
    out_dir = Path(args.out)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(parents=True, exist_ok=True)

    THUMB = 140
    THUMBS_PER_FACESET = 4

    def make_thumb(faceset: str, fname: str) -> str:
        d = thumbs_dir / faceset
        d.mkdir(parents=True, exist_ok=True)
        dst = d / (Path(fname).stem + ".jpg")
        if not dst.exists():
            try:
                src = ROOT / faceset / "faces" / fname
                img = Image.open(src).convert("RGB")
                img.thumbnail((THUMB, THUMB), Image.LANCZOS)
                img.save(dst, "JPEG", quality=82)
            except Exception as e:
                print(f"[thumb-skip] {faceset}/{fname}: {e}", file=sys.stderr)
                return ""
        return f"thumbs/{faceset}/{Path(fname).stem}.jpg"

    rows = []
    for gi, g in enumerate(candidates["groups"]):
        primary = g["primary"]
        sec = g["secondaries"]
        conf_cls = "confident" if g["confidence"] == "confident" else "uncertain"
        rows.append(f"<section class='grp {conf_cls}' id='g{gi}'>")
        rows.append(f"<h2>group #{gi+1} <small>({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})</small></h2>")
        rows.append(f"<div class='plan'>merge <b>{', '.join(sec)}</b> → <b>{primary}</b></div>")
        # member rows
        for name in g["members"]:
            tier = g["tiers"][name]
            sz = g["sizes"][name]
            tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)]
            badge = "PRIMARY" if name == primary else "secondary"
            rows.append("<div class='member'>")
            rows.append(f"<div class='label'><span class='badge {badge.lower()}'>{badge}</span> "
                        f"<b>{name}</b> <small>tier={tier_label} · n={sz}</small></div>")
            rows.append("<div class='thumbs'>")
            faces_dir = ROOT / name / "faces"
            files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET]
            for f in files:
                rel = make_thumb(name, f.name)
                if rel:
                    rows.append(f"<img src='{rel}' loading='lazy' title='{f.name}'>")
            rows.append("</div></div>")
        # pairwise sims
        rows.append("<table class='sims'><tr><th>a</th><th>b</th><th>sim</th></tr>")
        for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]):
            cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid"
            rows.append(f"<tr><td>{ps['a']}</td><td>{ps['b']}</td><td class='{cls}'>{ps['sim']:.3f}</td></tr>")
        rows.append("</table>")
        rows.append("</section>")

    nav = " · ".join(f"<a href='#g{i}'>#{i+1}</a>" for i in range(len(candidates["groups"])))

    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Faceset merge review</title>
<style>
body {{ font-family: system-ui, sans-serif; background: #111; color: #eee; padding: 1em; }}
h1 {{ margin-top: 0; }}
h2 {{ margin: 0; }}
small {{ color: #999; font-weight: normal; }}
section.grp {{ background: #1a1a1a; border-radius: 6px; padding: 12px; margin: 12px 0; }}
section.grp.confident {{ border-left: 4px solid #5fa05f; }}
section.grp.uncertain {{ border-left: 4px solid #ffb050; }}
.plan {{ margin: .5em 0; color: #6cf; }}
.member {{ margin: 8px 0; padding: 6px; background: #222; border-radius: 4px; }}
.label {{ font-family: monospace; font-size: 13px; }}
.badge {{ display: inline-block; padding: 0 6px; font-size: 10px; border-radius: 2px; }}
.badge.primary {{ background: #5fa05f; color: #000; font-weight: bold; }}
.badge.secondary {{ background: #444; color: #ccc; }}
.thumbs {{ display: flex; gap: 4px; margin-top: 4px; flex-wrap: wrap; }}
.thumbs img {{ height: 140px; width: auto; border-radius: 3px; }}
table.sims {{ font-family: monospace; font-size: 11px; margin-top: 6px; border-collapse: collapse; }}
table.sims td, table.sims th {{ padding: 1px 8px; border: 1px solid #333; text-align: left; }}
table.sims td.hi {{ color: #5fa05f; font-weight: bold; }}
table.sims td.mid {{ color: #ffb050; }}
.nav {{ position: sticky; top: 0; background: #111; padding: .5em 0; border-bottom: 1px solid #333; font-size: 12px; }}
a {{ color: #6cf; }}
</style></head>
<body>
<h1>Merge review — {len(candidates['groups'])} candidate groups
<small>(edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})</small></h1>
<p>{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided
(skipped {candidates['n_skipped']} for too few cached embeddings).
Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.</p>
<div class='nav'>{nav}</div>
{''.join(rows)}
</body></html>"""

    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)


# ----------------------------- apply -----------------------------
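
# NB: _zip_png_list names archive members 0000.png, 0001.png, ... in rank order
# (zero-based), independent of the 0001-based face filenames on disk.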
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
    import zipfile
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
        for i, p in enumerate(pngs):
            zf.write(p, arcname=f"{i:04d}.png")

def cmd_apply(args):
    candidates = json.loads(Path(args.candidates).read_text())
    master_path = ROOT / "manifest.json"
    master = json.loads(master_path.read_text())
    by_name = {f["name"]: f for f in master.get("facesets", [])}

    # filter: skip "uncertain" groups unless --include-uncertain
    groups_all = candidates["groups"]
    accepted_idx = [i for i, g in enumerate(groups_all)
                    if g["confidence"] == "confident" or args.include_uncertain]
    skipped_unc = [g for g in groups_all
                   if g["confidence"] == "uncertain" and not args.include_uncertain]
    # explicit --only / --exclude filters (both take group indices in the candidates file)
    if args.only:
        only = {int(s) for s in args.only.split(",")}
        accepted_idx = [i for i in range(len(groups_all)) if i in only]
    if args.exclude:
        excl = {int(s) for s in args.exclude.split(",")}
        accepted_idx = [i for i in accepted_idx if i not in excl]
    accepted = [groups_all[i] for i in accepted_idx]

    print(f"[plan] {len(accepted)} groups will be merged "
          f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr)

    if args.dry_run:
        for g in accepted:
            print(f"  merge {g['secondaries']} -> {g['primary']} "
                  f"({g['confidence']}, min_sim={g['min_link']:.3f})")
        return

    merged_dir = ROOT / "_merged"
    merged_dir.mkdir(exist_ok=True)
    new_facesets: list[dict] = []
    new_merged: list[dict] = list(master.get("merged", []))
    consumed_names: set[str] = set()
    primary_updates: dict[str, dict] = {}  # name -> new entry

    for g in accepted:
        primary = g["primary"]
        if primary not in by_name:
            print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr)
            continue
        primary_dir = ROOT / primary
        if not primary_dir.is_dir():
            print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr)
            continue
        primary_faces = primary_dir / "faces"
        primary_manifest_path = primary_dir / "manifest.json"
        primary_manifest = json.loads(primary_manifest_path.read_text())

        # gather all face entries: primary + each secondary
        combined_faces: list[dict] = list(primary_manifest.get("faces", []))
        # tag provenance; ensure the key exists before the renumber pass pops it
        for f in combined_faces:
            f.setdefault("origin_faceset", primary)

        for sec in g["secondaries"]:
            sec_dir = ROOT / sec
            if not sec_dir.is_dir():
                print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr)
                continue
            sec_manifest_path = sec_dir / "manifest.json"
            sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []}
            for f in sec_manifest.get("faces", []):
                f = dict(f)
                f["origin_faceset"] = sec
                combined_faces.append(f)

        # rank by quality.composite descending; ties broken by lower cosd_centroid
        def sort_key(f):
            q = f.get("quality", {}).get("composite", 0)
            d = f.get("cosd_centroid", 1.0)
            return (-q, d)
        combined_faces.sort(key=sort_key)

        # renumber and stage PNGs into a fresh staging dir, then atomically swap
        staging = primary_dir / "_faces_new"
        if staging.exists():
            shutil.rmtree(staging)
        staging.mkdir()
        new_face_entries = []
        new_rank = 0
        for f in combined_faces:
            origin = f.pop("origin_faceset")
            old_png_name = Path(f["png"]).name  # f["png"] is e.g. "faces/0042.png"
            origin_png = ROOT / origin / "faces" / old_png_name
            if not origin_png.exists():
                # could be in _dropped if occlusion-pruned; skip without burning a rank
                continue
            new_rank += 1
            new_name = f"{new_rank:04d}.png"
            shutil.copy2(origin_png, staging / new_name)
            f = dict(f)
            f["rank"] = new_rank
            f["png"] = f"faces/{new_name}"
            f["origin_faceset"] = origin  # preserve provenance in manifest
            new_face_entries.append(f)

        # swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces
        old_faces_holding = primary_dir / "_faces_old"
        if old_faces_holding.exists():
            shutil.rmtree(old_faces_holding)
        if primary_faces.exists():
            primary_faces.rename(old_faces_holding)
        staging.rename(primary_faces)
        # migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible)
        old_dropped = old_faces_holding / "_dropped"
        if old_dropped.exists():
            (primary_faces / "_dropped").mkdir(exist_ok=True)
            for x in old_dropped.iterdir():
                shutil.move(str(x), str(primary_faces / "_dropped" / x.name))
        if old_faces_holding.exists():
            shutil.rmtree(old_faces_holding)

        # re-zip .fsz
        survivor_pngs = sorted(primary_faces.glob("*.png"))
        top_n = primary_manifest.get("top_n", 30)
        top_n_eff = min(top_n, len(survivor_pngs))
        # remove old .fsz files
        for old in primary_dir.glob("*.fsz"):
            old.unlink()
        top_fsz_name = f"{primary}_top{top_n_eff}.fsz"
        all_fsz_name = f"{primary}_all.fsz"
        _zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name)
        if len(survivor_pngs) > top_n_eff:
            _zip_png_list(survivor_pngs, primary_dir / all_fsz_name)
            all_fsz_used = all_fsz_name
        else:
            all_fsz_used = None

        # update primary's local manifest
        primary_manifest["faces"] = new_face_entries
        primary_manifest["exported"] = len(new_face_entries)
        primary_manifest["fsz_top"] = top_fsz_name
        primary_manifest["fsz_all"] = all_fsz_used
        primary_manifest["top_n"] = top_n_eff
        primary_manifest.setdefault("merge_history", []).append({
            "absorbed": g["secondaries"],
            "min_link": g["min_link"],
            "max_link": g["max_link"],
            "confidence": g["confidence"],
        })
        primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2))

        # move secondary directories into _merged/
        absorbed_master_entries: list[dict] = []
        for sec in g["secondaries"]:
            sec_dir = ROOT / sec
            target = merged_dir / sec
            if not sec_dir.is_dir():
                continue
            if target.exists():
                shutil.rmtree(sec_dir)  # already moved by a previous run; clean stub
            else:
                shutil.move(str(sec_dir), str(target))
            sec_master = dict(by_name.get(sec, {"name": sec}))
            sec_master["merged_into"] = primary
            sec_master["relpath"] = f"_merged/{sec}"
            sec_master["fsz_top"] = None
            sec_master["fsz_all"] = None
            absorbed_master_entries.append(sec_master)
            consumed_names.add(sec)

        new_merged.extend(absorbed_master_entries)

        # bump primary master entry
        prim_master = dict(by_name[primary])
        prim_master["exported"] = len(new_face_entries)
        prim_master["top_n"] = top_n_eff
        prim_master["fsz_top"] = top_fsz_name
        prim_master["fsz_all"] = all_fsz_used
        prim_master.setdefault("merge_history", []).append({
            "absorbed": g["secondaries"],
            "min_link": g["min_link"],
            "max_link": g["max_link"],
        })
        primary_updates[primary] = prim_master

        print(f"[merged] {g['secondaries']} -> {primary} "
              f"now {len(new_face_entries)} png", file=sys.stderr)

    # rebuild master facesets list
    for entry in master.get("facesets", []):
        nm = entry["name"]
        if nm in consumed_names:
            continue
        if nm in primary_updates:
            new_facesets.append(primary_updates[nm])
        else:
            new_facesets.append(entry)

    new_master = dict(master)
    new_master["facesets"] = new_facesets
    new_master["merged"] = new_merged
    new_master["merge_run"] = {
        "thresholds": candidates["thresholds"],
        "groups_applied": len(accepted),
        "facesets_consumed": len(consumed_names),
        "include_uncertain": bool(args.include_uncertain),
    }
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(new_master, indent=2))
    tmp.replace(master_path)
    print(f"[done] master manifest updated: {len(new_facesets)} active, "
          f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run",
          file=sys.stderr)


# ----------------------------- main -----------------------------

def main():
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd", required=True)

    a = sub.add_parser("analyze")
    a.add_argument("--out", required=True)
    a.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)")
    a.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)")
    a.set_defaults(func=cmd_analyze)

    r = sub.add_parser("report")
    r.add_argument("--candidates", required=True)
    r.add_argument("--out", required=True)
    r.set_defaults(func=cmd_report)

    p = sub.add_parser("apply")
    p.add_argument("--candidates", required=True)
    p.add_argument("--include-uncertain", action="store_true",
                   help="apply uncertain groups too (default: confident only)")
    p.add_argument("--only", default=None, help="comma-separated group indices to apply")
    p.add_argument("--exclude", default=None, help="comma-separated group indices to skip")
    p.add_argument("--dry-run", action="store_true")
    p.set_defaults(func=cmd_apply)

    args = ap.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()