Document hand-sorted-folder import + age-split workflow

- README: document work/build_folders.py (hand-sorted folder identities)
  and the new age-split workflow for splitting a long-running identity
  into era-specific facesets after clustering.
- Force-track work/age_split_001.py and work/check_faceset001_age.py;
  these are the worked example + readiness probe for faceset_001 and
  the template for splitting any other identity by EXIF era.
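
A minimal invocation sketch (both scripts are argument-free; paths and
thresholds live in the config block at the top of each file):

    python work/check_faceset001_age.py   # probe: does faceset_001 split by era?
    python work/age_split_001.py          # perform the split + manifest append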

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 12:08:25 +02:00
parent 4d7a8780de
commit 03a0c75531
3 changed files with 729 additions and 2 deletions

485
work/age_split_001.py Normal file

@@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""Age-split person_001 into era-specific facesets.
Workflow:
1. Seed a clean person_001 centroid from the existing curated 707-face
`facesets_swap_ready/faceset_001/`.
2. Wide-recovery scan: pull every face record under /mnt/x/src/{nl, lzbkp_red}
from `nl_full.npz` with cos-dist <= 0.55 from the seed centroid.
3. Apply export-swap-style per-face quality gates.
4. One re-centroid + 0.50 tighten pass to absorb the recovery without drift.
5. Agglomerative sub-clustering at cos-dist 0.35.
6. Post-merge sub-clusters whose centroids <0.30 AND whose dominant EXIF
years are within 2 years.
7. Read EXIF DateTimeOriginal for each face's source path; era label =
(p10 year, p90 year) over dated faces.
8. Undated faces are assigned to the nearest era by embedding distance.
9. For each era: composite-quality rank, single-face PNG crops, .fsz bundles
(top-N and _all if era > top_n). `<era>_<range>.txt` marker file. Eras
with <20 face records get a `THIN.txt` marker.
10. Append era entries into the canonical
`facesets_swap_ready/manifest.json` next to the existing 19.
"""
from __future__ import annotations
import json
import shutil
import sys
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags, ImageOps
REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))
from sort_faces import (  # noqa: E402
    QUALITY_WEIGHTS,
    _crop_face_square,
    _zip_png_list,
    compute_quality,
    load_cache,
    load_rgb_bgr,
)
# ---- config -------------------------------------------------------------- #
CACHE = REPO / "work" / "cache" / "nl_full.npz"
SWAP_READY = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
FS001 = SWAP_READY / "faceset_001"
SCAN_ROOTS = [
    Path("/mnt/x/src/nl"),
    Path("/mnt/x/src/lzbkp_red"),
]
# Recovery + identity refinement
RECOVERY_THRESHOLD = 0.55 # initial centroid match
TIGHTEN_THRESHOLD = 0.50 # post-recentroid drift trim
# Quality gates (mirror export-swap defaults)
MIN_FACE_SHORT = 100
MIN_BLUR = 40.0
MIN_DET_SCORE = 0.6
# Sub-cluster
SUBCLUSTER_THRESHOLD = 0.35
# Anchor-based fragment assignment (replaces transitive union-find merge):
ANCHOR_MIN_SIZE = 20  # sub-cluster size to qualify as an era anchor
FRAGMENT_CENTROID_MAX = 0.40  # a fragment may join an anchor only if cent_dist <= this
FRAGMENT_YEAR_MAX = 5  # ... AND |dom_year_anchor - dom_year_fragment| <= this
# Output
TOP_N = 30
PAD_RATIO = 0.5
OUT_SIZE = 512
THIN_THRESHOLD = 20
# EXIF cache (so re-runs skip the 30-min Windows-mount EXIF read)
EXIF_CACHE = REPO / "work" / "cache" / "age_split_exif.json"
# ---- helpers ------------------------------------------------------------- #
def _normalize(v: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(v)
    return v / n if n > 0 else v
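# NB: the trailing "/" appended below keeps sibling roots from matching by
# prefix, e.g. "/mnt/x/src/nl_other/img.jpg" must not fall under "/mnt/x/src/nl".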
def _under(roots: list[Path], p: str) -> bool:
    for r in roots:
        rs = str(r).rstrip("/") + "/"
        if p == str(r) or p.startswith(rs):
            return True
    return False
def _record_in_roots(rec: dict, roots: list[Path], path_aliases: dict) -> bool:
    if _under(roots, rec["path"]):
        return True
    for alias in path_aliases.get(rec["path"], []):
        if _under(roots, alias):
            return True
    return False
def exif_year(path: Path) -> int | None:
    try:
        with Image.open(path) as im:
            exif = im._getexif()
            if not exif:
                return None
            for tag_id, val in exif.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
                    return int(val[:4])
    except Exception:
        return None
    return None
def label_for_era(years: list[int]) -> str:
    """Era label as a year-range string. Falls back to 'undated' if no years."""
    if not years:
        return "undated"
    ys = sorted(years)
    lo = ys[len(ys) // 10] if len(ys) >= 10 else ys[0]
    hi = ys[-(len(ys) // 10) - 1] if len(ys) >= 10 else ys[-1]
    if lo == hi:
        return str(lo)
    # Compact year range like 2011-13 if same century, else 2009-2024.
    if (lo // 100) == (hi // 100):
        return f"{lo}-{hi % 100:02d}"
    return f"{lo}-{hi}"
# ---- phase 1 + 2: seed centroid + recovery scan ------------------------- #
def main() -> None:
    if not FS001.exists():
        raise SystemExit(f"missing seed faceset: {FS001}")
    print("=== loading cache ===")
    emb, meta, _src, _proc, path_aliases = load_cache(CACHE)
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"emb/meta mismatch: {len(face_records)} vs {len(emb)}")
    bbox_idx = {(m["path"], tuple(m.get("bbox") or ())): i for i, m in enumerate(face_records)}
    seed_manifest = json.loads((FS001 / "manifest.json").read_text())
    seed_face_keys = [(f["source"], tuple(f.get("bbox") or ())) for f in seed_manifest["faces"]]
    seed_indices = [bbox_idx[k] for k in seed_face_keys if k in bbox_idx]
    print(f"seed faces from faceset_001: {len(seed_indices)} (manifest had {len(seed_face_keys)})")
    seed_centroid = _normalize(emb[seed_indices].mean(axis=0))
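    # Embeddings in the cache are L2-normalized, so dot products are cosine
    # similarities; the mean of unit vectors is generally shorter than unit
    # length, hence the re-normalization of every centroid.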
    # Recovery: every face record under nl/ + lzbkp_red/ within RECOVERY_THRESHOLD.
    candidate_idxs = [
        i for i, rec in enumerate(face_records)
        if _record_in_roots(rec, SCAN_ROOTS, path_aliases)
    ]
    print(f"\ncandidates under {[str(r) for r in SCAN_ROOTS]}: {len(candidate_idxs)}")
    cand_emb = emb[candidate_idxs]
    cand_dists = 1.0 - cand_emb @ seed_centroid
    recovered_local = [k for k, d in enumerate(cand_dists) if d <= RECOVERY_THRESHOLD]
    recovered = [candidate_idxs[k] for k in recovered_local]
    print(f"recovered at cos-dist <= {RECOVERY_THRESHOLD}: {len(recovered)}")
    # Quality gate.
    qualified = []
    drop_size = drop_blur = drop_det = 0
    for i in recovered:
        r = face_records[i]
        if r.get("face_short", 0) < MIN_FACE_SHORT:
            drop_size += 1
            continue
        if r.get("blur", 0.0) < MIN_BLUR:
            drop_blur += 1
            continue
        if r.get("det_score", 0.0) < MIN_DET_SCORE:
            drop_det += 1
            continue
        qualified.append(i)
    print(f"after quality gate: {len(qualified)} (drop size={drop_size} blur={drop_blur} det={drop_det})")
    if not qualified:
        raise SystemExit("no faces survived the quality gate; nothing to split")
    # One tightening pass: re-centroid on qualified, drop anyone > TIGHTEN_THRESHOLD.
    qcent = _normalize(emb[qualified].mean(axis=0))
    qd = 1.0 - emb[qualified] @ qcent
    tight = [qualified[k] for k, d in enumerate(qd) if d <= TIGHTEN_THRESHOLD]
    print(f"after re-centroid tighten ({TIGHTEN_THRESHOLD}): {len(tight)}")
    # ---- phase 5: sub-cluster -------------------------------------------- #
    print("\n=== sub-clustering ===")
    from sklearn.cluster import AgglomerativeClustering
    E = emb[tight]
    sims = E @ E.T
    dists = 1.0 - sims
    # Floor numerical noise.
    np.fill_diagonal(dists, 0.0)
    dists = np.maximum(dists, 0.0)
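    # With average linkage and a precomputed matrix, two sub-clusters keep
    # merging while the *mean* pairwise cos-dist between them stays under
    # SUBCLUSTER_THRESHOLD; no cluster count is fixed in advance.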
    ac = AgglomerativeClustering(
        n_clusters=None,
        metric="precomputed",
        linkage="average",
        distance_threshold=SUBCLUSTER_THRESHOLD,
    )
    labels = ac.fit_predict(dists)
    sub_sizes = Counter(labels)
    print(f"raw sub-clusters: {len(sub_sizes)} (sizes: top10={sorted(sub_sizes.values(), reverse=True)[:10]})")
    # Per-cluster: indices, centroid, EXIF years.
    cluster_indices: dict[int, list[int]] = {}
    for k, lab in enumerate(labels):
        cluster_indices.setdefault(int(lab), []).append(tight[k])
    cluster_centroids: dict[int, np.ndarray] = {}
    for lab, idxs in cluster_indices.items():
        cluster_centroids[lab] = _normalize(emb[idxs].mean(axis=0))
    print("\n=== EXIF years (one read per source path; cached) ===")
    unique_paths = sorted({face_records[i]["path"] for i in tight})
    if EXIF_CACHE.exists():
        cached = json.loads(EXIF_CACHE.read_text())
    else:
        cached = {}
    path_year: dict[str, int | None] = {}
    new_reads = 0
    for p in unique_paths:
        if p in cached:
            path_year[p] = cached[p]
        else:
            y = exif_year(Path(p))
            path_year[p] = y
            cached[p] = y
            new_reads += 1
    EXIF_CACHE.parent.mkdir(parents=True, exist_ok=True)
    EXIF_CACHE.write_text(json.dumps(cached, indent=0))
    dated = sum(1 for v in path_year.values() if v is not None)
    print(f" EXIF cache: {len(cached)} entries, {new_reads} new reads, "
          f"{dated}/{len(unique_paths)} dated")
    cluster_years: dict[int, list[int]] = {}
    cluster_dom_year: dict[int, int | None] = {}
    for lab, idxs in cluster_indices.items():
        ys = []
        for i in idxs:
            y = path_year.get(face_records[i]["path"])
            if y is not None:
                ys.append(y)
        cluster_years[lab] = ys
        cluster_dom_year[lab] = (Counter(ys).most_common(1)[0][0]) if ys else None
    # ---- phase 6: anchor-based fragment assignment ----------------------- #
    # Each sub-cluster of size >= ANCHOR_MIN_SIZE is an "era anchor". Smaller
    # fragments are assigned to the single nearest anchor IFF (centroid distance
    # <= FRAGMENT_CENTROID_MAX AND |dom_year delta| <= FRAGMENT_YEAR_MAX).
    # Anchors do NOT merge with each other — this prevents the transitive year
    # drift observed when union-find merging was used. Standalone fragments
    # stay as their own (likely THIN) eras.
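    # Worked failure case (hypothetical years): under union-find, A(2008)~B(2012)
    # and B(2012)~C(2016) each pass a 5-year gate, so A..C chain into one "era"
    # spanning 8 years; anchor assignment binds each fragment to exactly one
    # anchor, so no transitive chain can form.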
print("\n=== anchor-based assignment ===")
anchors = [lab for lab, idxs in cluster_indices.items() if len(idxs) >= ANCHOR_MIN_SIZE]
fragments = [lab for lab in cluster_indices if lab not in anchors]
anchors.sort(key=lambda l: -len(cluster_indices[l]))
print(f"anchors (size>={ANCHOR_MIN_SIZE}): {len(anchors)}; fragments: {len(fragments)}")
for a in anchors:
print(f" anchor sub {a}: size={len(cluster_indices[a])} dom_year={cluster_dom_year[a]}")
if anchors:
a_cent = np.stack([cluster_centroids[a] for a in anchors])
assignments: dict[int, int] = {a: a for a in anchors} # anchor -> self
unassigned: list[int] = []
for f in fragments:
f_cent = cluster_centroids[f]
f_year = cluster_dom_year[f]
# cosine distances to each anchor
cd = 1.0 - a_cent @ f_cent
# year distance (inf if either dom-year unknown)
yd = []
for a in anchors:
ay = cluster_dom_year[a]
if f_year is None or ay is None:
yd.append(float("inf"))
else:
yd.append(abs(f_year - ay))
yd = np.array(yd)
ok = (cd <= FRAGMENT_CENTROID_MAX) & (yd <= FRAGMENT_YEAR_MAX)
if not ok.any():
unassigned.append(f)
continue
# nearest qualifying anchor by centroid distance.
cd_masked = np.where(ok, cd, np.inf)
best = int(np.argmin(cd_masked))
assignments[f] = anchors[best]
print(f" assigned fragments: {sum(1 for k,v in assignments.items() if k!=v)}/{len(fragments)}; "
f"unassigned (standalone): {len(unassigned)}")
else:
print(" no anchors; every sub-cluster stands alone")
assignments = {lab: lab for lab in cluster_indices}
unassigned = []
merged: dict[int, list[int]] = {}
for lab, idxs in cluster_indices.items():
root = assignments.get(lab, lab)
merged.setdefault(root, []).extend(idxs)
merged_sizes = sorted(((r, len(v)) for r, v in merged.items()), key=lambda kv: -kv[1])
print(f"era buckets: {len(merged)} (top10 sizes: {[s for _, s in merged_sizes[:10]]})")
# Recompute centroid + dom-year for merged eras.
era_indices: dict[int, list[int]] = merged
era_centroids: dict[int, np.ndarray] = {}
era_year_label: dict[int, str] = {}
era_years_full: dict[int, list[int]] = {}
for root, idxs in era_indices.items():
era_centroids[root] = _normalize(emb[idxs].mean(axis=0))
ys = []
for i in idxs:
y = path_year.get(face_records[i]["path"])
if y is not None:
ys.append(y)
era_years_full[root] = ys
era_year_label[root] = label_for_era(ys)
# ---- phase 8: assign undated faces (no-EXIF) to nearest era ---------- #
# NB: undated = path's EXIF was None. For era assignment we use embedding,
# but the year *label* is unaffected because labels come from dated faces only.
# Actually undated face is already in some sub-cluster; here we just note count.
n_undated = sum(1 for i in tight if path_year.get(face_records[i]["path"]) is None)
print(f"undated face records (no EXIF): {n_undated}/{len(tight)} (placed by embedding only)")
    # ---- phase 9: per-era export ----------------------------------------- #
    import cv2
    print("\n=== exporting era bundles ===")
    new_manifest_entries: list[dict] = []
    eras_sorted = sorted(era_indices.items(), key=lambda kv: -len(kv[1]))
    for root, idxs in eras_sorted:
        size = len(idxs)
        label = era_year_label[root]
        era_name = f"faceset_001_{label}"
        out_dir = SWAP_READY / era_name
        # Disambiguate same-label collisions (e.g. two distinct embedding eras both 2019).
        collision = 2
        while out_dir.exists():
            era_name = f"faceset_001_{label}_v{collision}"
            out_dir = SWAP_READY / era_name
            collision += 1
        faces_dir = out_dir / "faces"
        faces_dir.mkdir(parents=True, exist_ok=True)
        # Composite quality + rank.
        ranked = []
        for ci in idxs:
            rec = face_records[ci]
            q = compute_quality(rec)
            ranked.append({"cache_idx": ci, "rec": rec, "quality": q})
        # Dedup by source path within this era — keep highest-quality face per path.
        seen_path: dict[str, dict] = {}
        for r in ranked:
            p = r["rec"]["path"]
            prev = seen_path.get(p)
            if prev is None or r["quality"]["composite"] > prev["quality"]["composite"]:
                seen_path[p] = r
        unique = sorted(seen_path.values(), key=lambda r: -r["quality"]["composite"])
        # Materialize crops.
        written: list[Path] = []
        face_entries: list[dict] = []
        for rank, r in enumerate(unique, start=1):
            rec = r["rec"]
            src = Path(rec["path"])
            if not src.exists():
                continue
            rgb, _ = load_rgb_bgr(src)
            if rgb is None:
                continue
            crop = _crop_face_square(rgb, rec["bbox"], PAD_RATIO, OUT_SIZE)
            png = faces_dir / f"{rank:04d}.png"
            cv2.imwrite(str(png), cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
            written.append(png)
            face_entries.append({
                "rank": rank,
                "png": f"faces/{rank:04d}.png",
                "source": rec["path"],
                "aliases": path_aliases.get(rec["path"], []),
                "bbox": rec["bbox"],
                "face_short": rec.get("face_short"),
                "det_score": rec.get("det_score"),
                "blur": rec.get("blur"),
                "pose": rec.get("pose"),
                "exif_year": path_year.get(rec["path"]),
                "quality": r["quality"],
            })
        if not written:
            print(f"[{era_name}] empty after materialization; skipping")
            shutil.rmtree(out_dir)
            continue
        # Bundle.
        top_n_eff = min(TOP_N, len(written))
        top_fsz = out_dir / f"{era_name}_top{top_n_eff}.fsz"
        _zip_png_list(written[:top_n_eff], top_fsz)
        all_fsz: Path | None = None
        if len(written) > top_n_eff:
            all_fsz = out_dir / f"{era_name}_all.fsz"
            _zip_png_list(written, all_fsz)
        # Per-era manifest.
        ys = era_years_full[root]
        year_summary = {
            "label": label,
            "year_count": len(ys),
            "year_min": min(ys) if ys else None,
            "year_max": max(ys) if ys else None,
            "year_dist": dict(Counter(ys).most_common()),
        }
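        # NB: json.dumps will coerce the int year keys in "year_dist" to
        # strings when the manifest is written.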
        is_thin = size < THIN_THRESHOLD
        manifest = {
            "name": era_name,
            "parent_identity": "faceset_001",
            "era": year_summary,
            "input_face_records": size,
            "exported": len(written),
            "top_n": top_n_eff,
            "fsz_top": top_fsz.name,
            "fsz_all": all_fsz.name if all_fsz else None,
            "thin": is_thin,
            "quality_weights": QUALITY_WEIGHTS,
            "params": {
                "recovery_threshold": RECOVERY_THRESHOLD,
                "tighten_threshold": TIGHTEN_THRESHOLD,
                "subcluster_threshold": SUBCLUSTER_THRESHOLD,
                "anchor_min_size": ANCHOR_MIN_SIZE,
                "fragment_centroid_max": FRAGMENT_CENTROID_MAX,
                "fragment_year_max": FRAGMENT_YEAR_MAX,
                "min_face_short": MIN_FACE_SHORT,
            },
            "faces": face_entries,
        }
        (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
        # Per-era marker file (always: <label>.txt for human reference).
        (out_dir / f"{label}.txt").write_text(
            f"{era_name}\n\nEra: {label}\n"
            f"Year span: {year_summary['year_min']}..{year_summary['year_max']} "
            f"({year_summary['year_count']} dated of {size} faces)\n"
            f"Sub-cluster size: {size} face records, {len(unique)} unique source paths, "
            f"{len(written)} exported PNGs.\n"
        )
        if is_thin:
            (out_dir / "THIN.txt").write_text(
                f"This era has only {size} face records (<{THIN_THRESHOLD}). "
                f"Averaged embedding may be dominated by single-photo idiosyncrasies.\n"
            )
        # Append to top-level manifest summary.
        new_manifest_entries.append({k: v for k, v in manifest.items() if k != "faces"})
        thin_tag = " THIN" if is_thin else ""
        print(
            f"[{era_name}] size={size} unique_paths={len(unique)} exported={len(written)} "
            f"top{top_n_eff}{thin_tag}"
        )
    # ---- merge into top-level manifest ----------------------------------- #
    top_path = SWAP_READY / "manifest.json"
    existing = json.loads(top_path.read_text()) if top_path.exists() else {"facesets": []}
    existing_names = {fs.get("name") for fs in existing.get("facesets", [])}
    appended = 0
    for entry in new_manifest_entries:
        if entry["name"] in existing_names:
            continue
        existing["facesets"].append(entry)
        appended += 1
    top_path.write_text(json.dumps(existing, indent=2))
    print(f"\nAppended {appended} era entries to {top_path}")
    print(f"Done. {len(new_manifest_entries)} era buckets emitted (faceset_001/ left untouched).")
if __name__ == "__main__":
    main()

151
work/check_faceset001_age.py Normal file

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""Probe faceset_001 for age-sortable sub-structure.
Three questions:
1. How spread is the embedding cloud? (intra-cluster pairwise distance histogram)
2. Does it split naturally into sub-clusters at a tight threshold?
3. Do the sub-clusters correspond to distinct time periods (EXIF DateTimeOriginal)?
"""
from __future__ import annotations
import json
import sys
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags
REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))
from sort_faces import load_cache # noqa: E402
CACHE = REPO / "work" / "cache" / "nl_full.npz"
FS001 = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready/faceset_001")
def exif_year(path: Path) -> int | None:
    try:
        with Image.open(path) as im:
            exif = im._getexif()
            if not exif:
                return None
            for tag_id, val in exif.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
                    return int(val[:4])
    except Exception:
        return None
    return None
def main() -> None:
    manifest = json.loads((FS001 / "manifest.json").read_text())
    faces = manifest["faces"]
    paths = [Path(f["source"]) for f in faces]
    print(f"faceset_001 has {len(paths)} ranked faces in the swap-ready set")
    # Pull embeddings for these face records by (path, bbox).
    emb, meta, _src, _proc, _aliases = load_cache(CACHE)
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit("emb/meta mismatch")
    bbox_key = {}
    for i, m in enumerate(face_records):
        bbox_key[(m["path"], tuple(m.get("bbox") or ()))] = i
    selected = []
    missing = 0
    for f in faces:
        key = (f["source"], tuple(f.get("bbox") or ()))
        i = bbox_key.get(key)
        if i is None:
            missing += 1
            continue
        selected.append(i)
    print(f"matched {len(selected)} embeddings (missing {missing})")
    E = emb[selected]
    # All embeddings are L2-normalized -> cosine dist = 1 - dot.
    sims = E @ E.T
    dists = 1.0 - sims
    iu = np.triu_indices_from(dists, k=1)
    pw = dists[iu]
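    # Sanity check on scale: for the curated 707-face set this is at most
    # 707 * 706 / 2 = 249,571 pairs (fewer if some embeddings went unmatched).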
print("\n-- intra-cluster pairwise cosine distance --")
print(f" n_pairs = {len(pw):,}")
print(f" mean = {pw.mean():.3f}")
print(f" median = {np.median(pw):.3f}")
print(f" p10/p25/p75/p90 = {np.percentile(pw, [10,25,75,90])}")
print(f" max = {pw.max():.3f}")
# Histogram bins around interesting thresholds.
edges = [0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.4]
hist, _ = np.histogram(pw, bins=edges)
print("\n histogram (cos-dist bin -> pair count):")
for lo, hi, c in zip(edges[:-1], edges[1:], hist):
bar = "#" * int(60 * c / max(hist.max(), 1))
print(f" [{lo:.1f},{hi:.1f}) {c:7d} {bar}")
    # Sub-cluster at a sweep of thresholds via agglomerative clustering on the
    # distance matrix.
    from sklearn.cluster import AgglomerativeClustering
    print("\n-- sub-clustering --")
    for thr in (0.30, 0.35, 0.40, 0.45, 0.50):
        ac = AgglomerativeClustering(
            n_clusters=None,
            metric="precomputed",
            linkage="average",
            distance_threshold=thr,
        )
        labels = ac.fit_predict(dists)
        sizes = Counter(labels)
        n = len(sizes)
        big = sum(1 for s in sizes.values() if s >= 10)
        top5 = sorted(sizes.values(), reverse=True)[:5]
        print(f" threshold {thr:.2f}: {n} sub-clusters, {big} with >=10 images, top-5 sizes={top5}")
    # Pick the threshold that gives 2-5 substantial sub-clusters.
    target_thr = 0.35
    ac = AgglomerativeClustering(
        n_clusters=None, metric="precomputed", linkage="average",
        distance_threshold=target_thr,
    )
    labels = ac.fit_predict(dists)
    sizes = Counter(labels)
    big_labels = [lab for lab, s in sizes.most_common() if s >= 20]
    print(f"\n-- EXIF year analysis at threshold {target_thr} (sub-clusters with >=20 images) --")
    print(f" {len(big_labels)} substantial sub-clusters")
    # Build label -> list of source paths
    by_label: dict[int, list[Path]] = {}
    for ci, lab in zip(selected, labels):
        rec = face_records[ci]
        by_label.setdefault(int(lab), []).append(Path(rec["path"]))
    for lab in big_labels[:6]:
        paths_in = by_label[lab]
        years = []
        for p in paths_in:
            y = exif_year(p)
            if y is not None:
                years.append(y)
        n_paths = len(paths_in)
        n_years = len(years)
        if years:
            ys = np.array(years)
            ymin, ymax = int(ys.min()), int(ys.max())
            ymed = int(np.median(ys))
            yhist = Counter(years)
            top_years = ", ".join(f"{y}:{c}" for y, c in sorted(yhist.most_common(5)))
        else:
            ymin = ymax = ymed = None
            top_years = ""
        print(
            f" cluster {lab}: {n_paths} faces, EXIF on {n_years}/{n_paths}, "
            f"year range {ymin}..{ymax} (median {ymed})"
        )
        print(f" top years: {top_years}")
if __name__ == "__main__":
    main()