Document hand-sorted-folder import + age-split workflow

- README: document work/build_folders.py (hand-sorted folder identities)
  and the new age-split workflow for splitting a long-running identity
  into era-specific facesets after clustering.
- Force-track work/age_split_001.py and work/check_faceset001_age.py;
  these are the worked example + readiness probe for faceset_001 and
  the template for splitting any other identity by EXIF era.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-26 12:08:25 +02:00
parent 4d7a8780de
commit 03a0c75531
3 changed files with 729 additions and 2 deletions

View File

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""Probe faceset_001 for age-sortable sub-structure.
Three questions:
1. How spread is the embedding cloud? (intra-cluster pairwise distance histogram)
2. Does it split naturally into sub-clusters at a tight threshold?
3. Do the sub-clusters correspond to distinct time periods (EXIF DateTimeOriginal)?
"""
from __future__ import annotations
import json
import sys
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags
REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))
from sort_faces import load_cache # noqa: E402
CACHE = REPO / "work" / "cache" / "nl_full.npz"
FS001 = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready/faceset_001")
def exif_year(path: Path) -> int | None:
    """Return the EXIF DateTimeOriginal year of an image, or None.

    Best-effort probe: any failure (unreadable file, no EXIF, malformed
    date string) yields None rather than raising, since callers only use
    the year to bucket faces by era.

    Uses the public ``Image.getexif()`` API instead of the private,
    deprecated ``_getexif()``.  ``DateTimeOriginal`` lives in the Exif
    sub-IFD (pointer tag 0x8769), so that IFD is merged over the base
    tags before scanning.
    """
    try:
        with Image.open(path) as im:
            exif = im.getexif()
            if not exif:
                return None
            # Base IFD first, then the Exif sub-IFD on top -- the latter
            # is where DateTimeOriginal (0x9003) is normally stored.
            tags = dict(exif)
            tags.update(exif.get_ifd(0x8769))
            for tag_id, val in tags.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
                    # EXIF datetimes are "YYYY:MM:DD HH:MM:SS" -- the year
                    # is always the first four characters.
                    return int(val[:4])
    except Exception:
        # Deliberate broad catch: this is a bulk scan over thousands of
        # files; one corrupt image must not abort the report.
        return None
    return None
def main() -> None:
    """Probe faceset_001 for age-sortable sub-structure and print a report.

    Steps: load the swap-ready manifest, match each ranked face back to its
    cached embedding by (source path, bbox), report the pairwise cosine
    distance distribution, sweep agglomerative sub-clusterings over several
    thresholds, then summarize the EXIF-year spread of the larger
    sub-clusters at the chosen threshold.
    """
    manifest = json.loads((FS001 / "manifest.json").read_text())
    faces = manifest["faces"]
    paths = [Path(f["source"]) for f in faces]
    print(f"faceset_001 has {len(paths)} ranked faces in the swap-ready set")
    # Pull embeddings for these face records by (path, bbox).
    emb, meta, _src, _proc, _aliases = load_cache(CACHE)
    # Cache rows flagged "noface" carry no embedding; the rest must align
    # 1:1 with the embedding matrix or the cache is inconsistent.
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit("emb/meta mismatch")
    # Index embedding rows by (source path, bbox tuple) so each manifest
    # face can be matched back.  A missing bbox becomes the empty tuple.
    bbox_key = {}
    for i, m in enumerate(face_records):
        bbox_key[(m["path"], tuple(m.get("bbox") or ()))] = i
    selected = []  # embedding-row indices, in manifest order
    missing = 0    # manifest faces with no matching cache row
    for f in faces:
        key = (f["source"], tuple(f.get("bbox") or ()))
        i = bbox_key.get(key)
        if i is None:
            missing += 1
            continue
        selected.append(i)
    print(f"matched {len(selected)} embeddings (missing {missing})")
    E = emb[selected]
    # All embeddings are L2-normalized -> cosine dist = 1 - dot.
    sims = E @ E.T
    dists = 1.0 - sims
    # Upper triangle (k=1) counts each unordered pair exactly once.
    iu = np.triu_indices_from(dists, k=1)
    pw = dists[iu]
    print("\n-- intra-cluster pairwise cosine distance --")
    print(f" n_pairs = {len(pw):,}")
    print(f" mean = {pw.mean():.3f}")
    print(f" median = {np.median(pw):.3f}")
    print(f" p10/p25/p75/p90 = {np.percentile(pw, [10,25,75,90])}")
    print(f" max = {pw.max():.3f}")
    # Histogram bins around interesting thresholds.
    edges = [0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.4]
    hist, _ = np.histogram(pw, bins=edges)
    print("\n histogram (cos-dist bin -> pair count):")
    for lo, hi, c in zip(edges[:-1], edges[1:], hist):
        # ASCII bar scaled to the fullest bin (60 chars max); the max(..., 1)
        # guards against division by zero when every bin is empty.
        bar = "#" * int(60 * c / max(hist.max(), 1))
        print(f" [{lo:.1f},{hi:.1f}) {c:7d} {bar}")
    # Sweep several thresholds via agglomerative clustering on the
    # precomputed distance matrix (average linkage).
    from sklearn.cluster import AgglomerativeClustering
    print("\n-- sub-clustering --")
    for thr in (0.30, 0.35, 0.40, 0.45, 0.50):
        ac = AgglomerativeClustering(
            n_clusters=None,
            metric="precomputed",
            linkage="average",
            distance_threshold=thr,
        )
        labels = ac.fit_predict(dists)
        sizes = Counter(labels)
        n = len(sizes)
        big = sum(1 for s in sizes.values() if s >= 10)
        top5 = sorted(sizes.values(), reverse=True)[:5]
        print(f" threshold {thr:.2f}: {n} sub-clusters, {big} with >=10 images, top-5 sizes={top5}")
    # Pick the threshold that gives 2-5 substantial sub-clusters.
    # NOTE(review): hard-coded from a prior run of the sweep above --
    # re-check against the sweep output if the faceset changes.
    target_thr = 0.35
    ac = AgglomerativeClustering(
        n_clusters=None, metric="precomputed", linkage="average",
        distance_threshold=target_thr,
    )
    labels = ac.fit_predict(dists)
    sizes = Counter(labels)
    big_labels = [lab for lab, s in sizes.most_common() if s >= 20]
    print(f"\n-- EXIF year analysis at threshold {target_thr} (sub-clusters with >=20 images) --")
    print(f" {len(big_labels)} substantial sub-clusters")
    # Build label -> list of source paths (labels[i] pairs with selected[i],
    # both in the row order of E).
    by_label: dict[int, list[Path]] = {}
    for ci, lab in zip(selected, labels):
        rec = face_records[ci]
        by_label.setdefault(int(lab), []).append(Path(rec["path"]))
    for lab in big_labels[:6]:
        paths_in = by_label[lab]
        years = []
        for p in paths_in:
            y = exif_year(p)
            if y is not None:
                years.append(y)
        n_paths = len(paths_in)
        n_years = len(years)  # how many faces actually carried an EXIF year
        if years:
            ys = np.array(years)
            ymin, ymax = int(ys.min()), int(ys.max())
            ymed = int(np.median(ys))
            yhist = Counter(years)
            # Top-5 years by face count, printed in chronological order.
            top_years = ", ".join(f"{y}:{c}" for y, c in sorted(yhist.most_common(5)))
        else:
            ymin = ymax = ymed = None
            top_years = ""
        print(
            f" cluster {lab}: {n_paths} faces, EXIF on {n_years}/{n_paths}, "
            f"year range {ymin}..{ymax} (median {ymed})"
        )
        print(f" top years: {top_years}")
# Script entry point: run the probe when executed directly.
if __name__ == "__main__":
    main()