"""Extend the existing 6 era buckets of faceset_001 by absorbing PNGs that post-date the original age_split run (from consolidation merges, etc.). Mirrors the anchor-fragment assignment logic in age_split_001.py: - For each unbucketed face in faceset_001's manifest, find the nearest active era anchor by cosine distance to the anchor's centroid. - Accept the assignment iff dist <= 0.40 AND |year_delta| <= 5 (where year_delta = exif_year(face) - dom_year(anchor)). - Undated PNGs are skipped (no assignment). - Anchors are NOT re-centered after absorption (preserves the same drift guarantees as the original age_split). CLI: python work/age_extend_001.py analyze --out work/age_extend/candidates.json python work/age_extend_001.py report --candidates ... --out work/age_extend python work/age_extend_001.py apply --candidates ... [--dry-run] """ from __future__ import annotations import argparse import json import shutil import sys import time from collections import Counter from pathlib import Path import numpy as np from PIL import Image, ExifTags ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") PARENT = "faceset_001" ACTIVE_ERAS = [ "faceset_001_2005-10", "faceset_001_2010-13", "faceset_001_2011", "faceset_001_2014-17", "faceset_001_2018-19", "faceset_001_2018-20", ] CACHES = [ Path("/opt/face-sets/work/cache/nl_full.npz"), Path("/opt/face-sets/work/cache/immich_peter.npz"), Path("/opt/face-sets/work/cache/immich_nic.npz"), ] EXIF_CACHE = Path("/opt/face-sets/work/cache/age_split_exif.json") # anchor-fragment thresholds (mirror age_split_001.py) DIST_MAX = 0.40 YEAR_MAX = 5 # ----------------------------- caches ----------------------------- def load_caches(): rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} alias_map: dict[str, str] = {} for c in CACHES: if not c.exists(): print(f"[warn] cache missing: {c}", file=sys.stderr) continue d = np.load(c, allow_pickle=True) emb = d["embeddings"] meta = json.loads(str(d["meta"])) face_records = [m for m in meta if not m.get("noface")] if len(face_records) != len(emb): raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}") if "path_aliases" in d.files: paliases = json.loads(str(d["path_aliases"])) for canon, alist in paliases.items(): alias_map.setdefault(canon, canon) for a in alist: alias_map[a] = canon for i, rec in enumerate(face_records): p = rec["path"] bbox = tuple(int(x) for x in rec["bbox"]) v = emb[i].astype(np.float32) n = float(np.linalg.norm(v)) if n > 0: v = v / n rec_index[(p, bbox)] = v alias_map.setdefault(p, p) print(f"[cache] indexed {len(rec_index)} face records, {len(alias_map)} aliases", file=sys.stderr) return rec_index, alias_map def lookup_emb(rec_index, alias_map, src: str, bbox): bbox_t = tuple(int(x) for x in bbox) canon = alias_map.get(src, src) v = rec_index.get((canon, bbox_t)) if v is None and canon != src: v = rec_index.get((src, bbox_t)) return v # ----------------------------- exif ----------------------------- def load_exif_cache(): if not EXIF_CACHE.exists(): return {} return json.loads(EXIF_CACHE.read_text()) def save_exif_cache(cache): tmp = EXIF_CACHE.with_suffix(".tmp.json") tmp.write_text(json.dumps(cache, indent=2)) tmp.replace(EXIF_CACHE) def exif_year(path: Path) -> int | None: try: with Image.open(path) as im: ex = im._getexif() if not ex: return None for tag_id, val in ex.items(): tag = ExifTags.TAGS.get(tag_id, tag_id) if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4: return int(val[:4]) except Exception: return None return None def get_year(src: str, exif_cache) -> int | None: """Return EXIF year for src, using cache. Mutates cache for new lookups.""" if src in exif_cache: return exif_cache[src] p = Path(src) y = exif_year(p) if p.exists() else None exif_cache[src] = y return y # ----------------------------- analyze ----------------------------- def cmd_analyze(args): rec_index, alias_map = load_caches() exif_cache = load_exif_cache() exif_cache_dirty = False parent_dir = ROOT / PARENT parent_manifest = json.loads((parent_dir / "manifest.json").read_text()) parent_faces = parent_manifest.get("faces", []) print(f"[parent] {PARENT}: {len(parent_faces)} face entries", file=sys.stderr) # Build "in_bucket" set + each anchor's centroid + dom_year anchors = [] in_bucket: set[tuple[str, tuple[int, int, int, int]]] = set() for era in ACTIVE_ERAS: ed = ROOT / era if not ed.is_dir(): print(f"[warn] missing era bucket: {era}", file=sys.stderr) continue em = json.loads((ed / "manifest.json").read_text()) emb_list = [] years = [] n_missing_emb = 0 for f in em.get("faces", []): src = f.get("source") bbox = f.get("bbox") if not src or not bbox: continue key = (alias_map.get(src, src), tuple(int(x) for x in bbox)) in_bucket.add(key) in_bucket.add((src, tuple(int(x) for x in bbox))) # cover both alias and raw v = lookup_emb(rec_index, alias_map, src, bbox) if v is None: n_missing_emb += 1 else: emb_list.append(v) y = get_year(src, exif_cache) if y is None: exif_cache_dirty = True else: years.append(y) if src not in exif_cache: exif_cache_dirty = True if not emb_list: print(f"[warn] {era}: no embeddings found, skipping anchor", file=sys.stderr) continue arr = np.stack(emb_list).astype(np.float32) c = arr.mean(axis=0) n = float(np.linalg.norm(c)) if n > 0: c = c / n dom_year = Counter(years).most_common(1)[0][0] if years else None anchors.append({ "name": era, "centroid": c, "n_faces": len(em.get("faces", [])), "n_emb_used": len(emb_list), "n_emb_missing": n_missing_emb, "dom_year": dom_year, "year_min": min(years) if years else None, "year_max": max(years) if years else None, }) print(f"[anchor] {era}: n={len(em.get('faces', []))} emb_used={len(emb_list)} " f"emb_miss={n_missing_emb} dom_year={dom_year} years=[{min(years) if years else '-'}..{max(years) if years else '-'}]", file=sys.stderr) # Find unbucketed faces in parent unbucketed = [] for f in parent_faces: src = f.get("source") bbox = f.get("bbox") if not src or not bbox: continue bbox_t = tuple(int(x) for x in bbox) key1 = (alias_map.get(src, src), bbox_t) key2 = (src, bbox_t) if key1 in in_bucket or key2 in in_bucket: continue unbucketed.append(f) print(f"[parent] {len(unbucketed)} unbucketed face entries (in {PARENT} but no era bucket)", file=sys.stderr) # Score each unbucketed face against every anchor proposals = [] skipped_no_emb = 0 skipped_no_year = 0 for f in unbucketed: src = f["source"] bbox = f["bbox"] v = lookup_emb(rec_index, alias_map, src, bbox) if v is None: skipped_no_emb += 1 continue y = get_year(src, exif_cache) if y is None: skipped_no_year += 1 exif_cache_dirty = True continue if src not in exif_cache: exif_cache_dirty = True # nearest anchor best = None # (dist, idx) for i, a in enumerate(anchors): d = 1.0 - float(np.dot(a["centroid"], v)) if best is None or d < best[0]: best = (d, i) if best is None: continue dist, bidx = best anchor = anchors[bidx] year_delta = abs(y - anchor["dom_year"]) if anchor["dom_year"] is not None else None accept = (dist <= DIST_MAX and year_delta is not None and year_delta <= YEAR_MAX) proposals.append({ "png": f["png"], "source": src, "bbox": [int(x) for x in bbox], "year": y, "rank_in_parent": f.get("rank"), "quality_composite": f.get("quality", {}).get("composite"), "quality": f.get("quality", {}), "best_anchor": anchor["name"], "best_anchor_dom_year": anchor["dom_year"], "centroid_dist": round(dist, 4), "year_delta": year_delta, "accept": bool(accept), "all_anchor_dists": { a["name"]: round(1.0 - float(np.dot(a["centroid"], v)), 4) for a in anchors }, }) if exif_cache_dirty: save_exif_cache(exif_cache) print(f"[exif] cache flushed ({len(exif_cache)} entries total)", file=sys.stderr) # Summarize accepted = [p for p in proposals if p["accept"]] rejected = [p for p in proposals if not p["accept"]] by_anchor = Counter(p["best_anchor"] for p in accepted) print(f"[summary] unbucketed={len(unbucketed)} scored={len(proposals)} " f"accepted={len(accepted)} rejected={len(rejected)} " f"skipped(no_emb={skipped_no_emb}, no_year={skipped_no_year})", file=sys.stderr) for k, v in by_anchor.most_common(): print(f" {k}: +{v}", file=sys.stderr) out = { "thresholds": {"dist_max": DIST_MAX, "year_max": YEAR_MAX}, "anchors": [ {k: v for k, v in a.items() if k != "centroid"} for a in anchors ], "n_unbucketed": len(unbucketed), "skipped": {"no_emb": skipped_no_emb, "no_year": skipped_no_year}, "proposals": sorted(proposals, key=lambda p: (not p["accept"], p["best_anchor"], -1 * (p["quality_composite"] or 0))), "by_anchor": dict(by_anchor), } op = Path(args.out) op.parent.mkdir(parents=True, exist_ok=True) op.write_text(json.dumps(out, indent=2)) print(f"[done] {len(proposals)} proposals -> {op}", file=sys.stderr) # ----------------------------- report ----------------------------- def cmd_report(args): cand = json.loads(Path(args.candidates).read_text()) out_dir = Path(args.out) thumbs_dir = out_dir / "thumbs" thumbs_dir.mkdir(parents=True, exist_ok=True) THUMB = 140 def make_thumb(png_relpath: str) -> str: # png_relpath looks like "faces/0042.png" src = ROOT / PARENT / png_relpath name = Path(png_relpath).stem dst = thumbs_dir / f"{name}.jpg" if not dst.exists(): try: img = Image.open(src).convert("RGB") img.thumbnail((THUMB, THUMB), Image.LANCZOS) img.save(dst, "JPEG", quality=82) except Exception as e: print(f"[thumb-skip] {src}: {e}", file=sys.stderr) return "" return f"thumbs/{name}.jpg" # group accepted proposals by target anchor by_anchor: dict[str, list] = {} rejected = [] for p in cand["proposals"]: if p["accept"]: by_anchor.setdefault(p["best_anchor"], []).append(p) else: rejected.append(p) rows = [] rows.append("
{cand['n_unbucketed']} unbucketed faces in {PARENT}; " f"{sum(len(v) for v in by_anchor.values())} accepted / {len(rejected)} rejected; " f"thresholds dist≤{cand['thresholds']['dist_max']} AND |year_delta|≤{cand['thresholds']['year_max']}.
") nav = " · ".join(f"{a} (+{len(by_anchor[a])})" for a in by_anchor) + " · rejected" rows.append(f"") for anchor_name in ACTIVE_ERAS: if anchor_name not in by_anchor: continue items = by_anchor[anchor_name] anchor_meta = next((a for a in cand["anchors"] if a["name"] == anchor_name), {}) rows.append(f"...{len(rejected)-200} more truncated.
") rows.append("