Document hand-sorted-folder import + age-split workflow

- README: document work/build_folders.py (hand-sorted folder identities)
  and the new age-split workflow for splitting a long-running identity
  into era-specific facesets after clustering.
- Force-track work/age_split_001.py and work/check_faceset001_age.py;
  these are the worked example + readiness probe for faceset_001 and
  the template for splitting any other identity by EXIF era.
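
A minimal invocation sketch (both scripts are argument-free; paths and
thresholds live in the config block at the top of each file):

    python work/check_faceset001_age.py   # probe: does faceset_001 split by era?
    python work/age_split_001.py          # perform the split + manifest append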

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 12:08:25 +02:00
parent 4d7a8780de
commit 03a0c75531
3 changed files with 729 additions and 2 deletions

485
work/age_split_001.py Normal file

@@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""Age-split person_001 into era-specific facesets.
Workflow:
1. Seed a clean person_001 centroid from the existing curated 707-face
`facesets_swap_ready/faceset_001/`.
2. Wide-recovery scan: pull every face record under /mnt/x/src/{nl, lzbkp_red}
from `nl_full.npz` with cos-dist <= 0.55 from the seed centroid.
3. Apply export-swap-style per-face quality gates.
4. One re-centroid + 0.50 tighten pass to absorb the recovery without drift.
5. Agglomerative sub-clustering at cos-dist 0.35.
6. Post-merge sub-clusters whose centroids <0.30 AND whose dominant EXIF
years are within 2 years.
7. Read EXIF DateTimeOriginal for each face's source path; era label =
(p10 year, p90 year) over dated faces.
8. Undated faces are assigned to the nearest era by embedding distance.
9. For each era: composite-quality rank, single-face PNG crops, .fsz bundles
(top-N and _all if era > top_n). `<era>_<range>.txt` marker file. Eras
with <20 face records get a `THIN.txt` marker.
10. Append era entries into the canonical
`facesets_swap_ready/manifest.json` next to the existing 19.
"""
from __future__ import annotations
import json
import shutil
import sys
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags, ImageOps
REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))
from sort_faces import (  # noqa: E402
    QUALITY_WEIGHTS,
    _crop_face_square,
    _zip_png_list,
    compute_quality,
    load_cache,
    load_rgb_bgr,
)
# ---- config -------------------------------------------------------------- #
CACHE = REPO / "work" / "cache" / "nl_full.npz"
SWAP_READY = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
FS001 = SWAP_READY / "faceset_001"
SCAN_ROOTS = [
    Path("/mnt/x/src/nl"),
    Path("/mnt/x/src/lzbkp_red"),
]
# Recovery + identity refinement
RECOVERY_THRESHOLD = 0.55 # initial centroid match
TIGHTEN_THRESHOLD = 0.50 # post-recentroid drift trim
# Quality gates (mirror export-swap defaults)
MIN_FACE_SHORT = 100
MIN_BLUR = 40.0
MIN_DET_SCORE = 0.6
# Sub-cluster
SUBCLUSTER_THRESHOLD = 0.35
# Anchor-based fragment assignment (replaces transitive union-find merge):
ANCHOR_MIN_SIZE = 20  # sub-cluster size to qualify as an era anchor
FRAGMENT_CENTROID_MAX = 0.40  # a fragment may join an anchor only if cent_dist <= this
FRAGMENT_YEAR_MAX = 5  # ... AND |dom_year_anchor - dom_year_fragment| <= this
# Output
TOP_N = 30
PAD_RATIO = 0.5
OUT_SIZE = 512
THIN_THRESHOLD = 20
# EXIF cache (so re-runs skip the 30-min Windows-mount EXIF read)
EXIF_CACHE = REPO / "work" / "cache" / "age_split_exif.json"
# ---- helpers ------------------------------------------------------------- #
def _normalize(v: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(v)
    return v / n if n > 0 else v
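# NB: the trailing "/" appended below keeps sibling roots from matching by
# prefix, e.g. "/mnt/x/src/nl_other/img.jpg" must not fall under "/mnt/x/src/nl".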
def _under(roots: list[Path], p: str) -> bool:
    for r in roots:
        rs = str(r).rstrip("/") + "/"
        if p == str(r) or p.startswith(rs):
            return True
    return False
def _record_in_roots(rec: dict, roots: list[Path], path_aliases: dict) -> bool:
    if _under(roots, rec["path"]):
        return True
    for alias in path_aliases.get(rec["path"], []):
        if _under(roots, alias):
            return True
    return False
def exif_year(path: Path) -> int | None:
    try:
        with Image.open(path) as im:
            exif = im._getexif()
            if not exif:
                return None
            for tag_id, val in exif.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
                    return int(val[:4])
    except Exception:
        return None
    return None
def label_for_era(years: list[int]) -> str:
    """Era label as a year-range string. Falls back to 'undated' if no years."""
    if not years:
        return "undated"
    ys = sorted(years)
    lo = ys[len(ys) // 10] if len(ys) >= 10 else ys[0]
    hi = ys[-(len(ys) // 10) - 1] if len(ys) >= 10 else ys[-1]
    if lo == hi:
        return str(lo)
    # Compact year range like 2011-13 if same century, else 2009-2024.
    if (lo // 100) == (hi // 100):
        return f"{lo}-{hi % 100:02d}"
    return f"{lo}-{hi}"
# ---- phase 1 + 2: seed centroid + recovery scan ------------------------- #
def main() -> None:
    if not FS001.exists():
        raise SystemExit(f"missing seed faceset: {FS001}")
    print("=== loading cache ===")
    emb, meta, _src, _proc, path_aliases = load_cache(CACHE)
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"emb/meta mismatch: {len(face_records)} vs {len(emb)}")
    bbox_idx = {(m["path"], tuple(m.get("bbox") or ())): i for i, m in enumerate(face_records)}
    seed_manifest = json.loads((FS001 / "manifest.json").read_text())
    seed_face_keys = [(f["source"], tuple(f.get("bbox") or ())) for f in seed_manifest["faces"]]
    seed_indices = [bbox_idx[k] for k in seed_face_keys if k in bbox_idx]
    print(f"seed faces from faceset_001: {len(seed_indices)} (manifest had {len(seed_face_keys)})")
    seed_centroid = _normalize(emb[seed_indices].mean(axis=0))
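    # Embeddings in the cache are L2-normalized, so dot products are cosine
    # similarities; the mean of unit vectors is generally shorter than unit
    # length, hence the re-normalization of every centroid.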
    # Recovery: every face record under nl/ + lzbkp_red/ within RECOVERY_THRESHOLD.
    candidate_idxs = [
        i for i, rec in enumerate(face_records)
        if _record_in_roots(rec, SCAN_ROOTS, path_aliases)
    ]
    print(f"\ncandidates under {[str(r) for r in SCAN_ROOTS]}: {len(candidate_idxs)}")
    cand_emb = emb[candidate_idxs]
    cand_dists = 1.0 - cand_emb @ seed_centroid
    recovered_local = [k for k, d in enumerate(cand_dists) if d <= RECOVERY_THRESHOLD]
    recovered = [candidate_idxs[k] for k in recovered_local]
    print(f"recovered at cos-dist <= {RECOVERY_THRESHOLD}: {len(recovered)}")
    # Quality gate.
    qualified = []
    drop_size = drop_blur = drop_det = 0
    for i in recovered:
        r = face_records[i]
        if r.get("face_short", 0) < MIN_FACE_SHORT:
            drop_size += 1
            continue
        if r.get("blur", 0.0) < MIN_BLUR:
            drop_blur += 1
            continue
        if r.get("det_score", 0.0) < MIN_DET_SCORE:
            drop_det += 1
            continue
        qualified.append(i)
    print(f"after quality gate: {len(qualified)} (drop size={drop_size} blur={drop_blur} det={drop_det})")
    if not qualified:
        raise SystemExit("no faces survived the quality gate; nothing to split")
    # One tightening pass: re-centroid on qualified, drop anyone > TIGHTEN_THRESHOLD.
    qcent = _normalize(emb[qualified].mean(axis=0))
    qd = 1.0 - emb[qualified] @ qcent
    tight = [qualified[k] for k, d in enumerate(qd) if d <= TIGHTEN_THRESHOLD]
    print(f"after re-centroid tighten ({TIGHTEN_THRESHOLD}): {len(tight)}")
    # ---- phase 5: sub-cluster -------------------------------------------- #
    print("\n=== sub-clustering ===")
    from sklearn.cluster import AgglomerativeClustering
    E = emb[tight]
    sims = E @ E.T
    dists = 1.0 - sims
    # Floor numerical noise.
    np.fill_diagonal(dists, 0.0)
    dists = np.maximum(dists, 0.0)
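    # With average linkage and a precomputed matrix, two sub-clusters keep
    # merging while the *mean* pairwise cos-dist between them stays under
    # SUBCLUSTER_THRESHOLD; no cluster count is fixed in advance.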
    ac = AgglomerativeClustering(
        n_clusters=None,
        metric="precomputed",
        linkage="average",
        distance_threshold=SUBCLUSTER_THRESHOLD,
    )
    labels = ac.fit_predict(dists)
    sub_sizes = Counter(labels)
    print(f"raw sub-clusters: {len(sub_sizes)} (sizes: top10={sorted(sub_sizes.values(), reverse=True)[:10]})")
    # Per-cluster: indices, centroid, EXIF years.
    cluster_indices: dict[int, list[int]] = {}
    for k, lab in enumerate(labels):
        cluster_indices.setdefault(int(lab), []).append(tight[k])
    cluster_centroids: dict[int, np.ndarray] = {}
    for lab, idxs in cluster_indices.items():
        cluster_centroids[lab] = _normalize(emb[idxs].mean(axis=0))
    print("\n=== EXIF years (one read per source path; cached) ===")
    unique_paths = sorted({face_records[i]["path"] for i in tight})
    if EXIF_CACHE.exists():
        cached = json.loads(EXIF_CACHE.read_text())
    else:
        cached = {}
    path_year: dict[str, int | None] = {}
    new_reads = 0
    for p in unique_paths:
        if p in cached:
            path_year[p] = cached[p]
        else:
            y = exif_year(Path(p))
            path_year[p] = y
            cached[p] = y
            new_reads += 1
    EXIF_CACHE.parent.mkdir(parents=True, exist_ok=True)
    EXIF_CACHE.write_text(json.dumps(cached, indent=0))
    dated = sum(1 for v in path_year.values() if v is not None)
    print(f" EXIF cache: {len(cached)} entries, {new_reads} new reads, "
          f"{dated}/{len(unique_paths)} dated")
    cluster_years: dict[int, list[int]] = {}
    cluster_dom_year: dict[int, int | None] = {}
    for lab, idxs in cluster_indices.items():
        ys = []
        for i in idxs:
            y = path_year.get(face_records[i]["path"])
            if y is not None:
                ys.append(y)
        cluster_years[lab] = ys
        cluster_dom_year[lab] = (Counter(ys).most_common(1)[0][0]) if ys else None
    # ---- phase 6: anchor-based fragment assignment ----------------------- #
    # Each sub-cluster of size >= ANCHOR_MIN_SIZE is an "era anchor". Smaller
    # fragments are assigned to the single nearest anchor IFF (centroid distance
    # <= FRAGMENT_CENTROID_MAX AND |dom_year delta| <= FRAGMENT_YEAR_MAX).
    # Anchors do NOT merge with each other — this prevents the transitive year
    # drift observed when union-find merging was used. Standalone fragments
    # stay as their own (likely THIN) eras.
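    # Worked failure case (hypothetical years): under union-find, A(2008)~B(2012)
    # and B(2012)~C(2016) each pass a 5-year gate, so A..C chain into one "era"
    # spanning 8 years; anchor assignment binds each fragment to exactly one
    # anchor, so no transitive chain can form.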
print("\n=== anchor-based assignment ===")
anchors = [lab for lab, idxs in cluster_indices.items() if len(idxs) >= ANCHOR_MIN_SIZE]
fragments = [lab for lab in cluster_indices if lab not in anchors]
anchors.sort(key=lambda l: -len(cluster_indices[l]))
print(f"anchors (size>={ANCHOR_MIN_SIZE}): {len(anchors)}; fragments: {len(fragments)}")
for a in anchors:
print(f" anchor sub {a}: size={len(cluster_indices[a])} dom_year={cluster_dom_year[a]}")
if anchors:
a_cent = np.stack([cluster_centroids[a] for a in anchors])
assignments: dict[int, int] = {a: a for a in anchors} # anchor -> self
unassigned: list[int] = []
for f in fragments:
f_cent = cluster_centroids[f]
f_year = cluster_dom_year[f]
# cosine distances to each anchor
cd = 1.0 - a_cent @ f_cent
# year distance (inf if either dom-year unknown)
yd = []
for a in anchors:
ay = cluster_dom_year[a]
if f_year is None or ay is None:
yd.append(float("inf"))
else:
yd.append(abs(f_year - ay))
yd = np.array(yd)
ok = (cd <= FRAGMENT_CENTROID_MAX) & (yd <= FRAGMENT_YEAR_MAX)
if not ok.any():
unassigned.append(f)
continue
# nearest qualifying anchor by centroid distance.
cd_masked = np.where(ok, cd, np.inf)
best = int(np.argmin(cd_masked))
assignments[f] = anchors[best]
print(f" assigned fragments: {sum(1 for k,v in assignments.items() if k!=v)}/{len(fragments)}; "
f"unassigned (standalone): {len(unassigned)}")
else:
print(" no anchors; every sub-cluster stands alone")
assignments = {lab: lab for lab in cluster_indices}
unassigned = []
merged: dict[int, list[int]] = {}
for lab, idxs in cluster_indices.items():
root = assignments.get(lab, lab)
merged.setdefault(root, []).extend(idxs)
merged_sizes = sorted(((r, len(v)) for r, v in merged.items()), key=lambda kv: -kv[1])
print(f"era buckets: {len(merged)} (top10 sizes: {[s for _, s in merged_sizes[:10]]})")
# Recompute centroid + dom-year for merged eras.
era_indices: dict[int, list[int]] = merged
era_centroids: dict[int, np.ndarray] = {}
era_year_label: dict[int, str] = {}
era_years_full: dict[int, list[int]] = {}
for root, idxs in era_indices.items():
era_centroids[root] = _normalize(emb[idxs].mean(axis=0))
ys = []
for i in idxs:
y = path_year.get(face_records[i]["path"])
if y is not None:
ys.append(y)
era_years_full[root] = ys
era_year_label[root] = label_for_era(ys)
# ---- phase 8: assign undated faces (no-EXIF) to nearest era ---------- #
# NB: undated = path's EXIF was None. For era assignment we use embedding,
# but the year *label* is unaffected because labels come from dated faces only.
# Actually undated face is already in some sub-cluster; here we just note count.
n_undated = sum(1 for i in tight if path_year.get(face_records[i]["path"]) is None)
print(f"undated face records (no EXIF): {n_undated}/{len(tight)} (placed by embedding only)")
    # ---- phase 9: per-era export ----------------------------------------- #
    import cv2
    print("\n=== exporting era bundles ===")
    new_manifest_entries: list[dict] = []
    eras_sorted = sorted(era_indices.items(), key=lambda kv: -len(kv[1]))
    for root, idxs in eras_sorted:
        size = len(idxs)
        label = era_year_label[root]
        era_name = f"faceset_001_{label}"
        out_dir = SWAP_READY / era_name
        # Disambiguate same-label collisions (e.g. two distinct embedding eras both 2019).
        collision = 2
        while out_dir.exists():
            era_name = f"faceset_001_{label}_v{collision}"
            out_dir = SWAP_READY / era_name
            collision += 1
        faces_dir = out_dir / "faces"
        faces_dir.mkdir(parents=True, exist_ok=True)
        # Composite quality + rank.
        ranked = []
        for ci in idxs:
            rec = face_records[ci]
            q = compute_quality(rec)
            ranked.append({"cache_idx": ci, "rec": rec, "quality": q})
        # Dedup by source path within this era — keep highest-quality face per path.
        seen_path: dict[str, dict] = {}
        for r in ranked:
            p = r["rec"]["path"]
            prev = seen_path.get(p)
            if prev is None or r["quality"]["composite"] > prev["quality"]["composite"]:
                seen_path[p] = r
        unique = sorted(seen_path.values(), key=lambda r: -r["quality"]["composite"])
        # Materialize crops.
        written: list[Path] = []
        face_entries: list[dict] = []
        for rank, r in enumerate(unique, start=1):
            rec = r["rec"]
            src = Path(rec["path"])
            if not src.exists():
                continue
            rgb, _ = load_rgb_bgr(src)
            if rgb is None:
                continue
            crop = _crop_face_square(rgb, rec["bbox"], PAD_RATIO, OUT_SIZE)
            png = faces_dir / f"{rank:04d}.png"
            cv2.imwrite(str(png), cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
            written.append(png)
            face_entries.append({
                "rank": rank,
                "png": f"faces/{rank:04d}.png",
                "source": rec["path"],
                "aliases": path_aliases.get(rec["path"], []),
                "bbox": rec["bbox"],
                "face_short": rec.get("face_short"),
                "det_score": rec.get("det_score"),
                "blur": rec.get("blur"),
                "pose": rec.get("pose"),
                "exif_year": path_year.get(rec["path"]),
                "quality": r["quality"],
            })
        if not written:
            print(f"[{era_name}] empty after materialization; skipping")
            shutil.rmtree(out_dir)
            continue
        # Bundle.
        top_n_eff = min(TOP_N, len(written))
        top_fsz = out_dir / f"{era_name}_top{top_n_eff}.fsz"
        _zip_png_list(written[:top_n_eff], top_fsz)
        all_fsz: Path | None = None
        if len(written) > top_n_eff:
            all_fsz = out_dir / f"{era_name}_all.fsz"
            _zip_png_list(written, all_fsz)
        # Per-era manifest.
        ys = era_years_full[root]
        year_summary = {
            "label": label,
            "year_count": len(ys),
            "year_min": min(ys) if ys else None,
            "year_max": max(ys) if ys else None,
            "year_dist": dict(Counter(ys).most_common()),
        }
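        # NB: json.dumps will coerce the int year keys in "year_dist" to
        # strings when the manifest is written.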
        is_thin = size < THIN_THRESHOLD
        manifest = {
            "name": era_name,
            "parent_identity": "faceset_001",
            "era": year_summary,
            "input_face_records": size,
            "exported": len(written),
            "top_n": top_n_eff,
            "fsz_top": top_fsz.name,
            "fsz_all": all_fsz.name if all_fsz else None,
            "thin": is_thin,
            "quality_weights": QUALITY_WEIGHTS,
            "params": {
                "recovery_threshold": RECOVERY_THRESHOLD,
                "tighten_threshold": TIGHTEN_THRESHOLD,
                "subcluster_threshold": SUBCLUSTER_THRESHOLD,
                "anchor_min_size": ANCHOR_MIN_SIZE,
                "fragment_centroid_max": FRAGMENT_CENTROID_MAX,
                "fragment_year_max": FRAGMENT_YEAR_MAX,
                "min_face_short": MIN_FACE_SHORT,
            },
            "faces": face_entries,
        }
        (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
        # Per-era marker file (always: <label>.txt for human reference).
        (out_dir / f"{label}.txt").write_text(
            f"{era_name}\n\nEra: {label}\n"
            f"Year span: {year_summary['year_min']}..{year_summary['year_max']} "
            f"({year_summary['year_count']} dated of {size} faces)\n"
            f"Sub-cluster size: {size} face records, {len(unique)} unique source paths, "
            f"{len(written)} exported PNGs.\n"
        )
        if is_thin:
            (out_dir / "THIN.txt").write_text(
                f"This era has only {size} face records (<{THIN_THRESHOLD}). "
                f"Averaged embedding may be dominated by single-photo idiosyncrasies.\n"
            )
        # Append to top-level manifest summary.
        new_manifest_entries.append({k: v for k, v in manifest.items() if k != "faces"})
        thin_tag = " THIN" if is_thin else ""
        print(
            f"[{era_name}] size={size} unique_paths={len(unique)} exported={len(written)} "
            f"top{top_n_eff}{thin_tag}"
        )
    # ---- merge into top-level manifest ----------------------------------- #
    top_path = SWAP_READY / "manifest.json"
    existing = json.loads(top_path.read_text()) if top_path.exists() else {"facesets": []}
    existing_names = {fs.get("name") for fs in existing.get("facesets", [])}
    appended = 0
    for entry in new_manifest_entries:
        if entry["name"] in existing_names:
            continue
        existing["facesets"].append(entry)
        appended += 1
    top_path.write_text(json.dumps(existing, indent=2))
    print(f"\nAppended {appended} era entries to {top_path}")
    print(f"Done. {len(new_manifest_entries)} era buckets emitted (faceset_001/ left untouched).")
if __name__ == "__main__":
    main()

151
work/check_faceset001_age.py Normal file

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""Probe faceset_001 for age-sortable sub-structure.
Three questions:
1. How spread is the embedding cloud? (intra-cluster pairwise distance histogram)
2. Does it split naturally into sub-clusters at a tight threshold?
3. Do the sub-clusters correspond to distinct time periods (EXIF DateTimeOriginal)?
"""
from __future__ import annotations
import json
import sys
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags
REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))
from sort_faces import load_cache # noqa: E402
CACHE = REPO / "work" / "cache" / "nl_full.npz"
FS001 = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready/faceset_001")
def exif_year(path: Path) -> int | None:
    try:
        with Image.open(path) as im:
            exif = im._getexif()
            if not exif:
                return None
            for tag_id, val in exif.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
                    return int(val[:4])
    except Exception:
        return None
    return None
def main() -> None:
    manifest = json.loads((FS001 / "manifest.json").read_text())
    faces = manifest["faces"]
    paths = [Path(f["source"]) for f in faces]
    print(f"faceset_001 has {len(paths)} ranked faces in the swap-ready set")
    # Pull embeddings for these face records by (path, bbox).
    emb, meta, _src, _proc, _aliases = load_cache(CACHE)
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit("emb/meta mismatch")
    bbox_key = {}
    for i, m in enumerate(face_records):
        bbox_key[(m["path"], tuple(m.get("bbox") or ()))] = i
    selected = []
    missing = 0
    for f in faces:
        key = (f["source"], tuple(f.get("bbox") or ()))
        i = bbox_key.get(key)
        if i is None:
            missing += 1
            continue
        selected.append(i)
    print(f"matched {len(selected)} embeddings (missing {missing})")
    E = emb[selected]
    # All embeddings are L2-normalized -> cosine dist = 1 - dot.
    sims = E @ E.T
    dists = 1.0 - sims
    iu = np.triu_indices_from(dists, k=1)
    pw = dists[iu]
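    # Sanity check on scale: for the curated 707-face set this is at most
    # 707 * 706 / 2 = 249,571 pairs (fewer if some embeddings went unmatched).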
print("\n-- intra-cluster pairwise cosine distance --")
print(f" n_pairs = {len(pw):,}")
print(f" mean = {pw.mean():.3f}")
print(f" median = {np.median(pw):.3f}")
print(f" p10/p25/p75/p90 = {np.percentile(pw, [10,25,75,90])}")
print(f" max = {pw.max():.3f}")
# Histogram bins around interesting thresholds.
edges = [0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.4]
hist, _ = np.histogram(pw, bins=edges)
print("\n histogram (cos-dist bin -> pair count):")
for lo, hi, c in zip(edges[:-1], edges[1:], hist):
bar = "#" * int(60 * c / max(hist.max(), 1))
print(f" [{lo:.1f},{hi:.1f}) {c:7d} {bar}")
    # Sub-cluster at a sweep of thresholds via agglomerative clustering on the
    # distance matrix.
    from sklearn.cluster import AgglomerativeClustering
    print("\n-- sub-clustering --")
    for thr in (0.30, 0.35, 0.40, 0.45, 0.50):
        ac = AgglomerativeClustering(
            n_clusters=None,
            metric="precomputed",
            linkage="average",
            distance_threshold=thr,
        )
        labels = ac.fit_predict(dists)
        sizes = Counter(labels)
        n = len(sizes)
        big = sum(1 for s in sizes.values() if s >= 10)
        top5 = sorted(sizes.values(), reverse=True)[:5]
        print(f" threshold {thr:.2f}: {n} sub-clusters, {big} with >=10 images, top-5 sizes={top5}")
    # Pick the threshold that gives 2-5 substantial sub-clusters.
    target_thr = 0.35
    ac = AgglomerativeClustering(
        n_clusters=None, metric="precomputed", linkage="average",
        distance_threshold=target_thr,
    )
    labels = ac.fit_predict(dists)
    sizes = Counter(labels)
    big_labels = [lab for lab, s in sizes.most_common() if s >= 20]
    print(f"\n-- EXIF year analysis at threshold {target_thr} (sub-clusters with >=20 images) --")
    print(f" {len(big_labels)} substantial sub-clusters")
    # Build label -> list of source paths
    by_label: dict[int, list[Path]] = {}
    for ci, lab in zip(selected, labels):
        rec = face_records[ci]
        by_label.setdefault(int(lab), []).append(Path(rec["path"]))
    for lab in big_labels[:6]:
        paths_in = by_label[lab]
        years = []
        for p in paths_in:
            y = exif_year(p)
            if y is not None:
                years.append(y)
        n_paths = len(paths_in)
        n_years = len(years)
        if years:
            ys = np.array(years)
            ymin, ymax = int(ys.min()), int(ys.max())
            ymed = int(np.median(ys))
            yhist = Counter(years)
            top_years = ", ".join(f"{y}:{c}" for y, c in sorted(yhist.most_common(5)))
        else:
            ymin = ymax = ymed = None
            top_years = ""
        print(
            f" cluster {lab}: {n_paths} faces, EXIF on {n_years}/{n_paths}, "
            f"year range {ymin}..{ymax} (median {ymed})"
        )
        print(f" top years: {top_years}")
if __name__ == "__main__":
    main()