#!/usr/bin/env python3 """Age-split person_001 into era-specific facesets. Workflow: 1. Seed a clean person_001 centroid from the existing curated 707-face `facesets_swap_ready/faceset_001/`. 2. Wide-recovery scan: pull every face record under /mnt/x/src/{nl, lzbkp_red} from `nl_full.npz` with cos-dist <= 0.55 from the seed centroid. 3. Apply export-swap-style per-face quality gates. 4. One re-centroid + 0.50 tighten pass to absorb the recovery without drift. 5. Agglomerative sub-clustering at cos-dist 0.35. 6. Post-merge sub-clusters whose centroids <0.30 AND whose dominant EXIF years are within 2 years. 7. Read EXIF DateTimeOriginal for each face's source path; era label = (p10 year, p90 year) over dated faces. 8. Undated faces are assigned to the nearest era by embedding distance. 9. For each era: composite-quality rank, single-face PNG crops, .fsz bundles (top-N and _all if era > top_n). `_.txt` marker file. Eras with <20 face records get a `THIN.txt` marker. 10. Append era entries into the canonical `facesets_swap_ready/manifest.json` next to the existing 19. """ from __future__ import annotations import json import shutil import sys from collections import Counter from pathlib import Path import numpy as np from PIL import Image, ExifTags, ImageOps REPO = Path(__file__).resolve().parent.parent sys.path.insert(0, str(REPO)) from sort_faces import ( # noqa: E402 QUALITY_WEIGHTS, _crop_face_square, _zip_png_list, compute_quality, load_cache, load_rgb_bgr, ) # ---- config -------------------------------------------------------------- # CACHE = REPO / "work" / "cache" / "nl_full.npz" SWAP_READY = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") FS001 = SWAP_READY / "faceset_001" SCAN_ROOTS = [ Path("/mnt/x/src/nl"), Path("/mnt/x/src/lzbkp_red"), ] # Recovery + identity refinement RECOVERY_THRESHOLD = 0.55 # initial centroid match TIGHTEN_THRESHOLD = 0.50 # post-recentroid drift trim # Quality gates (mirror export-swap defaults) MIN_FACE_SHORT = 100 # Sub-cluster SUBCLUSTER_THRESHOLD = 0.35 # Anchor-based fragment assignment (replaces transitive union-find merge): ANCHOR_MIN_SIZE = 20 # sub-cluster size to qualify as an era anchor FRAGMENT_CENTROID_MAX = 0.40 # small fragment may join an anchor only if cent_dist <= FRAGMENT_YEAR_MAX = 5 # AND |dom_year_anchor - dom_year_fragment| <= # Output TOP_N = 30 PAD_RATIO = 0.5 OUT_SIZE = 512 THIN_THRESHOLD = 20 # EXIF cache (so re-runs skip the 30-min Windows-mount EXIF read) EXIF_CACHE = REPO / "work" / "cache" / "age_split_exif.json" # ---- helpers ------------------------------------------------------------- # def _normalize(v: np.ndarray) -> np.ndarray: n = np.linalg.norm(v) return v / n if n > 0 else v def _under(roots: list[Path], p: str) -> bool: for r in roots: rs = str(r).rstrip("/") + "/" if p == str(r) or p.startswith(rs): return True return False def _record_in_roots(rec: dict, roots: list[Path], path_aliases: dict) -> bool: if _under(roots, rec["path"]): return True for alias in path_aliases.get(rec["path"], []): if _under(roots, alias): return True return False def exif_year(path: Path) -> int | None: try: with Image.open(path) as im: exif = im._getexif() if not exif: return None for tag_id, val in exif.items(): tag = ExifTags.TAGS.get(tag_id, tag_id) if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4: return int(val[:4]) except Exception: return None return None def label_for_era(years: list[int]) -> str: """Era label as a year-range string. Falls back to 'undated' if no years.""" if not years: return "undated" ys = sorted(years) lo = ys[len(ys) // 10] if len(ys) >= 10 else ys[0] hi = ys[-(len(ys) // 10) - 1] if len(ys) >= 10 else ys[-1] if lo == hi: return str(lo) # Compact year range like 2011-13 if same century, else 2009-2024. if (lo // 100) == (hi // 100): return f"{lo}-{hi % 100:02d}" return f"{lo}-{hi}" # ---- phase 1 + 2: seed centroid + recovery scan ------------------------- # def main() -> None: if not FS001.exists(): raise SystemExit(f"missing seed faceset: {FS001}") print("=== loading cache ===") emb, meta, _src, _proc, path_aliases = load_cache(CACHE) face_records = [m for m in meta if not m.get("noface")] if len(face_records) != len(emb): raise SystemExit(f"emb/meta mismatch: {len(face_records)} vs {len(emb)}") bbox_idx = {(m["path"], tuple(m.get("bbox") or ())): i for i, m in enumerate(face_records)} seed_manifest = json.loads((FS001 / "manifest.json").read_text()) seed_face_keys = [(f["source"], tuple(f.get("bbox") or ())) for f in seed_manifest["faces"]] seed_indices = [bbox_idx[k] for k in seed_face_keys if k in bbox_idx] print(f"seed faces from faceset_001: {len(seed_indices)} (manifest had {len(seed_face_keys)})") seed_centroid = _normalize(emb[seed_indices].mean(axis=0)) # Recovery: every face record under nl/ + lzbkp_red/ within RECOVERY_THRESHOLD. candidate_idxs = [ i for i, rec in enumerate(face_records) if _record_in_roots(rec, SCAN_ROOTS, path_aliases) ] print(f"\ncandidates under {[str(r) for r in SCAN_ROOTS]}: {len(candidate_idxs)}") cand_emb = emb[candidate_idxs] cand_dists = 1.0 - cand_emb @ seed_centroid recovered_local = [k for k, d in enumerate(cand_dists) if d <= RECOVERY_THRESHOLD] recovered = [candidate_idxs[k] for k in recovered_local] print(f"recovered at cos-dist <= {RECOVERY_THRESHOLD}: {len(recovered)}") # Quality gate. qualified = [] drop_size = drop_blur = drop_det = 0 for i in recovered: r = face_records[i] if r.get("face_short", 0) < MIN_FACE_SHORT: drop_size += 1 continue if r.get("blur", 0.0) < 40.0: drop_blur += 1 continue if r.get("det_score", 0.0) < 0.6: drop_det += 1 continue qualified.append(i) print(f"after quality gate: {len(qualified)} (drop size={drop_size} blur={drop_blur} det={drop_det})") # One tightening pass: re-centroid on qualified, drop anyone > TIGHTEN_THRESHOLD. qcent = _normalize(emb[qualified].mean(axis=0)) qd = 1.0 - emb[qualified] @ qcent tight = [qualified[k] for k, d in enumerate(qd) if d <= TIGHTEN_THRESHOLD] print(f"after re-centroid tighten ({TIGHTEN_THRESHOLD}): {len(tight)}") # ---- phase 5: sub-cluster -------------------------------------------- # print("\n=== sub-clustering ===") from sklearn.cluster import AgglomerativeClustering E = emb[tight] sims = E @ E.T dists = 1.0 - sims # Floor numerical noise. np.fill_diagonal(dists, 0.0) dists = np.maximum(dists, 0.0) ac = AgglomerativeClustering( n_clusters=None, metric="precomputed", linkage="average", distance_threshold=SUBCLUSTER_THRESHOLD, ) labels = ac.fit_predict(dists) sub_sizes = Counter(labels) print(f"raw sub-clusters: {len(sub_sizes)} (sizes: top10={sorted(sub_sizes.values(), reverse=True)[:10]})") # Per-cluster: indices, centroid, EXIF years. cluster_indices: dict[int, list[int]] = {} for k, lab in enumerate(labels): cluster_indices.setdefault(int(lab), []).append(tight[k]) cluster_centroids: dict[int, np.ndarray] = {} for lab, idxs in cluster_indices.items(): cluster_centroids[lab] = _normalize(emb[idxs].mean(axis=0)) print("\n=== EXIF years (one read per source path; cached) ===") unique_paths = sorted({face_records[i]["path"] for i in tight}) if EXIF_CACHE.exists(): cached = json.loads(EXIF_CACHE.read_text()) else: cached = {} path_year: dict[str, int | None] = {} new_reads = 0 for p in unique_paths: if p in cached: path_year[p] = cached[p] else: y = exif_year(Path(p)) path_year[p] = y cached[p] = y new_reads += 1 EXIF_CACHE.parent.mkdir(parents=True, exist_ok=True) EXIF_CACHE.write_text(json.dumps(cached, indent=0)) dated = sum(1 for v in path_year.values() if v is not None) print(f" EXIF cache: {len(cached)} entries, {new_reads} new reads, " f"{dated}/{len(unique_paths)} dated") cluster_years: dict[int, list[int]] = {} cluster_dom_year: dict[int, int | None] = {} for lab, idxs in cluster_indices.items(): ys = [] for i in idxs: y = path_year.get(face_records[i]["path"]) if y is not None: ys.append(y) cluster_years[lab] = ys cluster_dom_year[lab] = (Counter(ys).most_common(1)[0][0]) if ys else None # ---- phase 6: anchor-based fragment assignment ----------------------- # # Each sub-cluster of size >= ANCHOR_MIN_SIZE is an "era anchor". Smaller # fragments are assigned to the single nearest anchor IFF (centroid distance # <= FRAGMENT_CENTROID_MAX AND |dom_year delta| <= FRAGMENT_YEAR_MAX). # Anchors do NOT merge with each other — that prevented transitive year drift # observed when union-find was used. Standalone fragments stay as their own # (likely THIN) eras. print("\n=== anchor-based assignment ===") anchors = [lab for lab, idxs in cluster_indices.items() if len(idxs) >= ANCHOR_MIN_SIZE] fragments = [lab for lab in cluster_indices if lab not in anchors] anchors.sort(key=lambda l: -len(cluster_indices[l])) print(f"anchors (size>={ANCHOR_MIN_SIZE}): {len(anchors)}; fragments: {len(fragments)}") for a in anchors: print(f" anchor sub {a}: size={len(cluster_indices[a])} dom_year={cluster_dom_year[a]}") if anchors: a_cent = np.stack([cluster_centroids[a] for a in anchors]) assignments: dict[int, int] = {a: a for a in anchors} # anchor -> self unassigned: list[int] = [] for f in fragments: f_cent = cluster_centroids[f] f_year = cluster_dom_year[f] # cosine distances to each anchor cd = 1.0 - a_cent @ f_cent # year distance (inf if either dom-year unknown) yd = [] for a in anchors: ay = cluster_dom_year[a] if f_year is None or ay is None: yd.append(float("inf")) else: yd.append(abs(f_year - ay)) yd = np.array(yd) ok = (cd <= FRAGMENT_CENTROID_MAX) & (yd <= FRAGMENT_YEAR_MAX) if not ok.any(): unassigned.append(f) continue # nearest qualifying anchor by centroid distance. cd_masked = np.where(ok, cd, np.inf) best = int(np.argmin(cd_masked)) assignments[f] = anchors[best] print(f" assigned fragments: {sum(1 for k,v in assignments.items() if k!=v)}/{len(fragments)}; " f"unassigned (standalone): {len(unassigned)}") else: print(" no anchors; every sub-cluster stands alone") assignments = {lab: lab for lab in cluster_indices} unassigned = [] merged: dict[int, list[int]] = {} for lab, idxs in cluster_indices.items(): root = assignments.get(lab, lab) merged.setdefault(root, []).extend(idxs) merged_sizes = sorted(((r, len(v)) for r, v in merged.items()), key=lambda kv: -kv[1]) print(f"era buckets: {len(merged)} (top10 sizes: {[s for _, s in merged_sizes[:10]]})") # Recompute centroid + dom-year for merged eras. era_indices: dict[int, list[int]] = merged era_centroids: dict[int, np.ndarray] = {} era_year_label: dict[int, str] = {} era_years_full: dict[int, list[int]] = {} for root, idxs in era_indices.items(): era_centroids[root] = _normalize(emb[idxs].mean(axis=0)) ys = [] for i in idxs: y = path_year.get(face_records[i]["path"]) if y is not None: ys.append(y) era_years_full[root] = ys era_year_label[root] = label_for_era(ys) # ---- phase 8: assign undated faces (no-EXIF) to nearest era ---------- # # NB: undated = path's EXIF was None. For era assignment we use embedding, # but the year *label* is unaffected because labels come from dated faces only. # Actually undated face is already in some sub-cluster; here we just note count. n_undated = sum(1 for i in tight if path_year.get(face_records[i]["path"]) is None) print(f"undated face records (no EXIF): {n_undated}/{len(tight)} (placed by embedding only)") # ---- phase 9: per-era export ----------------------------------------- # import cv2 print("\n=== exporting era bundles ===") new_manifest_entries: list[dict] = [] eras_sorted = sorted(era_indices.items(), key=lambda kv: -len(kv[1])) for root, idxs in eras_sorted: size = len(idxs) label = era_year_label[root] era_name = f"faceset_001_{label}" out_dir = SWAP_READY / era_name # Disambiguate same-label collisions (e.g. two distinct embedding eras both 2019). collision = 2 while out_dir.exists(): era_name = f"faceset_001_{label}_v{collision}" out_dir = SWAP_READY / era_name collision += 1 faces_dir = out_dir / "faces" faces_dir.mkdir(parents=True, exist_ok=True) # Composite quality + rank. ranked = [] for ci in idxs: rec = face_records[ci] q = compute_quality(rec) ranked.append({"cache_idx": ci, "rec": rec, "quality": q}) # Dedup by source path within this era — keep highest-quality face per path. seen_path: dict[str, dict] = {} for r in ranked: p = r["rec"]["path"] prev = seen_path.get(p) if prev is None or r["quality"]["composite"] > prev["quality"]["composite"]: seen_path[p] = r unique = sorted(seen_path.values(), key=lambda r: -r["quality"]["composite"]) # Materialize crops. written: list[Path] = [] face_entries: list[dict] = [] for rank, r in enumerate(unique, start=1): rec = r["rec"] src = Path(rec["path"]) if not src.exists(): continue rgb, _ = load_rgb_bgr(src) if rgb is None: continue crop = _crop_face_square(rgb, rec["bbox"], PAD_RATIO, OUT_SIZE) png = faces_dir / f"{rank:04d}.png" cv2.imwrite(str(png), cv2.cvtColor(crop, cv2.COLOR_RGB2BGR)) written.append(png) face_entries.append({ "rank": rank, "png": f"faces/{rank:04d}.png", "source": rec["path"], "aliases": path_aliases.get(rec["path"], []), "bbox": rec["bbox"], "face_short": rec.get("face_short"), "det_score": rec.get("det_score"), "blur": rec.get("blur"), "pose": rec.get("pose"), "exif_year": path_year.get(rec["path"]), "quality": r["quality"], }) if not written: print(f"[{era_name}] empty after materialization; skipping") shutil.rmtree(out_dir) continue # Bundle. top_n_eff = min(TOP_N, len(written)) top_fsz = out_dir / f"{era_name}_top{top_n_eff}.fsz" _zip_png_list(written[:top_n_eff], top_fsz) all_fsz: Path | None = None if len(written) > top_n_eff: all_fsz = out_dir / f"{era_name}_all.fsz" _zip_png_list(written, all_fsz) # Per-era manifest. ys = era_years_full[root] year_summary = { "label": label, "year_count": len(ys), "year_min": min(ys) if ys else None, "year_max": max(ys) if ys else None, "year_dist": dict(Counter(ys).most_common()), } is_thin = size < THIN_THRESHOLD manifest = { "name": era_name, "parent_identity": "faceset_001", "era": year_summary, "input_face_records": size, "exported": len(written), "top_n": top_n_eff, "fsz_top": top_fsz.name, "fsz_all": all_fsz.name if all_fsz else None, "thin": is_thin, "quality_weights": QUALITY_WEIGHTS, "params": { "recovery_threshold": RECOVERY_THRESHOLD, "tighten_threshold": TIGHTEN_THRESHOLD, "subcluster_threshold": SUBCLUSTER_THRESHOLD, "anchor_min_size": ANCHOR_MIN_SIZE, "fragment_centroid_max": FRAGMENT_CENTROID_MAX, "fragment_year_max": FRAGMENT_YEAR_MAX, "min_face_short": MIN_FACE_SHORT, }, "faces": face_entries, } (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2)) # Per-era marker file (always: .txt for human reference). (out_dir / f"{label}.txt").write_text( f"{era_name}\n\nEra: {label}\n" f"Year span: {year_summary['year_min']}..{year_summary['year_max']} " f"({year_summary['year_count']} dated of {size} faces)\n" f"Sub-cluster size: {size} face records, {len(unique)} unique source paths, " f"{len(written)} exported PNGs.\n" ) if is_thin: (out_dir / "THIN.txt").write_text( f"This era has only {size} face records (<{THIN_THRESHOLD}). " f"Averaged embedding may be dominated by single-photo idiosyncrasies.\n" ) # Append to top-level manifest summary. new_manifest_entries.append({k: v for k, v in manifest.items() if k != "faces"}) thin_tag = " THIN" if is_thin else "" print( f"[{era_name}] size={size} unique_paths={len(unique)} exported={len(written)} " f"top{top_n_eff}{thin_tag}" ) # ---- merge into top-level manifest ----------------------------------- # top_path = SWAP_READY / "manifest.json" existing = json.loads(top_path.read_text()) if top_path.exists() else {"facesets": []} existing_names = {fs.get("name") for fs in existing.get("facesets", [])} appended = 0 for entry in new_manifest_entries: if entry["name"] in existing_names: continue existing["facesets"].append(entry) appended += 1 top_path.write_text(json.dumps(existing, indent=2)) print(f"\nAppended {appended} era entries to {top_path}") print(f"Done. {len(new_manifest_entries)} era buckets emitted (faceset_001/ left untouched).") if __name__ == "__main__": main()