diff --git a/README.md b/README.md index f1fe9f7..b6a9cf1 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,57 @@ For the `faceset_001` run on 5260-face `nl_full.npz`, this produced 6 substantiv era buckets (2005–10, 2010–13, 2011, 2014–17, 2018–19, 2018–20; sizes 43–282) plus 68 thin/fragment buckets quarantined under `_thin/`. +### Discovering new identities in a mixed bucket + +A flat folder of mixed-identity photos (e.g. `osrc/`) is the opposite of the +hand-sorted case: identities have to be discovered, not asserted, but should +not collide with already-known identities or scramble their numbering. + +`work/cluster_osrc.py` is the worked example. The pipeline: + +- **Filter cache to the source root**, including any byte-aliased path that + resolves under it. +- **Drop already-covered faces** by comparing each candidate to the centroids + of the existing canonical facesets at the `EXISTING_MATCH_THRESHOLD` + (default 0.45 — same cutoff as `build_folders.py`'s osrc routing). These + faces are already routed by `extend` / `build_folders.py` and shouldn't + seed new facesets. +- **Cluster the unmatched** at cos-dist 0.55 (matches the `extend` default + for the new-cluster phase). +- **Apply `refine`-equivalent gates** per cluster: `face_short`, `blur`, + `det_score`, plus outlier rejection (cluster-centroid cos-dist > 0.55) for + clusters of size ≥ 4. Keep clusters whose surviving unique-source-path + count is ≥ `MIN_FACES`. +- **Number new facesets past the existing maximum** (`START_NNN`), so + `faceset_001..NNN` are never disturbed. +- **Synthesize a refine manifest** and run `cmd_export_swap` against it, + then move the resulting dirs into `facesets_swap_ready/` and append to the + top-level `manifest.json`. Each new dir gets an `osrc.txt` provenance + marker. 
+ +Always run `extend` first so `raw_full/` and `facesets_full/` reflect the new +source — the `cluster_osrc.py` step then operates against the canonical +cache and doesn't need `raw_full/` for input: + +```bash +# 1. Bring raw_full / facesets_full up to date (folds matches into existing +# person folders + facesets, creates new person_NNN+ for unmatched). +python sort_faces.py extend "$CACHE" "$OUT/raw_full" \ + --refine-out "$OUT/facesets_full" + +# 2. Optional dry-run: report cluster sizes and per-faceset survivor counts +# without touching facesets_swap_ready/. +python work/cluster_osrc.py --dry-run + +# 3. Real run: emits facesets_swap_ready/faceset_NNN+ and merges the manifest. +python work/cluster_osrc.py +``` + +For the 2026-04-26 run on 336 osrc face records (after dropping 18 covered by +existing identities), this produced 6 new facesets (`faceset_020..025`, +sizes 4–26 exported PNGs; the 7th candidate cluster lost all 6 faces to +export-swap's tighter `min_face_short=100` gate). + ## Key defaults `refine`: @@ -201,7 +252,9 @@ Highly recommended at swap time: enable **Select post-processing = GFPGAN** with ├─ build_folders.py (hand-sorted-folder orchestration) ├─ check_faceset001_age.py (age-split readiness probe) ├─ age_split_001.py (age-split orchestration; faceset_001) + ├─ cluster_osrc.py (mixed-bucket identity discovery) ├─ synthetic_refine_manifest.json (last build_folders.py output) + ├─ synthetic_osrc_manifest.json (last cluster_osrc.py output) ├─ cache/ │ ├─ nl_full.npz (canonical cache + duplicates.json) │ └─ age_split_exif.json (path → EXIF-year cache) diff --git a/docs/analysis/osrc-identity-discovery.md b/docs/analysis/osrc-identity-discovery.md new file mode 100644 index 0000000..e6118a6 --- /dev/null +++ b/docs/analysis/osrc-identity-discovery.md @@ -0,0 +1,119 @@ +# Identity discovery in `/mnt/x/src/osrc` + +_Run date: 2026-04-26. Cache: `work/cache/nl_full.npz` (5260 face records). +Driver script: `work/cluster_osrc.py`._ + +## 1. 
Source + +`/mnt/x/src/osrc/` is a flat mixed-identity bucket of 213 files: 171 in root + a +`psd/` subfolder with 41 PSD files + a single file in `[Originaldateien]/`. +File extensions are 171 jpg + 1 jpeg + 41 psd. PSDs are not embedded +(InsightFace's loader doesn't read PSD); the 41 PSDs were skipped, on the +working assumption that the same identities are also present in the +adjacent JPGs. + +`nl_full.npz` already covered 160 of the 213 files (the remaining 53: 41 +psd + 12 jpg). Of the 12 missing JPGs, 11 are byte-duplicates of `00843resc.jpg` +.. `00855resc.jpg` (same file sizes, paired by sha256) — already aliased +in the cache. Only 1 jpg (`19554226_..._n.jpg`) is genuinely uncovered. + +The 160 covered files yielded **336 face records / 10 noface**, with 64 +single-face / 35 two-face / 19 three-face / 24 four-face / 8 with 5–8 +faces. Quality is good: median `face_short=116px`, `det_score=0.85`, +`blur=244`. Min `face_short=40px` will fail the 90px refine gate. + +## 2. Coverage by existing identities + +Computed cos-dist from each osrc face to the centroids of the canonical +`faceset_001..019` (built from each manifest's `(source, bbox)` keys). +Median nearest-cos-dist was 0.875 — i.e. the bulk of osrc is **not** the +existing 19 identities. + +At cos-dist ≤ 0.45 (matching `build_folders.py`'s `OSRC_THRESHOLD`): + +| existing identity | osrc faces matched | +|------------------|------------------:| +| faceset_002 | 7 | +| faceset_008 | 4 | +| faceset_015 | 3 | +| faceset_019 | 4 | + +These 18 osrc faces are routed to existing identities by +`build_folders.py` and `extend`; they are excluded from the +identity-discovery step. + +## 3. Pipeline + +`work/cluster_osrc.py` mirrors `build_folders.py`'s structure (synthesize +a refine manifest, hand off to `cmd_export_swap`, relocate, merge +top-level manifest) but discovers identities by clustering rather than +asserting them by folder. + +1.
Filter cache to face records under `/mnt/x/src/osrc` (canonical or + byte-aliased path). +2. Drop the 18 already-covered faces (cos-dist ≤ 0.45 to any existing + identity centroid). +3. Cluster the remaining 318 faces among themselves at cos-dist 0.55 + (matches the `extend` default for new-cluster formation). +4. For each cluster, apply `refine`-equivalent per-face gates + (`face_short ≥ 90`, `blur ≥ 40`, `det_score ≥ 0.6`); for clusters ≥ 4 + faces apply outlier rejection at cluster-centroid cos-dist 0.55. Keep + clusters whose surviving unique-path count is ≥ 6 (the operator- + chosen `MIN_FACES`, lower than the canonical 15 because osrc is small + per-identity). +5. Number kept clusters `faceset_020+` (past the existing + `facesets_swap_ready/` max of 019) ordered by size descending. +6. Synthesize a refine manifest and call `cmd_export_swap` on it. Move + the emitted dirs into `facesets_swap_ready/`, drop an `osrc.txt` + provenance marker, and append the new entries to the top-level + `manifest.json` (without disturbing existing `facesets` / `thin_eras`). + +## 4. Result (2026-04-26) + +Phase 1 (clustering, before export-swap): + +- 137 raw clusters at cos-dist 0.55; top sizes [37, 20, 12, 9, 7, 7, 6, 6, 6, 5]. +- After quality gate: 124 faces dropped (mostly `face_short < 90` from + group-photo tertiary subjects). +- Outlier rejection: 0 dropped (clusters were tight). +- After `min_faces=6`: **7 candidate clusters kept** (sizes 6–28 unique + source paths). 
+ +Phase 2 (`cmd_export_swap` with `min_face_short=100`, +`outlier_threshold=0.45`): + +| name | input | outlier drop | exported PNGs | +|--------------|------:|-------------:|--------------:| +| faceset_020 | 71 | 42 | 26 | +| faceset_021 | 36 | 21 | 10 | +| faceset_022 | 15 | 7 | 8 | +| faceset_023 | 19 | 14 | 4 | +| faceset_024 | 6 | 0 | 6 | +| faceset_025 | 10 | 4 | 6 | +| faceset_026 | — | — | 0 (skipped: empty after filter) | + +`faceset_026`'s 6 cluster faces all failed export-swap's tighter +`min_face_short=100` gate (vs. cluster's 90); it is not emitted. +`faceset_023` is small (4 PNGs) but useful as an averaged identity at +that size. + +Top-level `facesets_swap_ready/manifest.json` now: **31 substantive +facesets** (12 auto-cluster nl/lzbkp + 7 hand-sorted + 6 era splits + 6 +osrc-discovered) + **68 thin_eras** under `_thin/`. + +## 5. Re-running and applying to other mixed buckets + +- The cache holds osrc embeddings; to re-run with different parameters, + edit `cluster_osrc.py`'s config block and re-execute. Cluster discovery + + export-swap is a few minutes total. +- For a different mixed-bucket source, copy `cluster_osrc.py` to + `cluster_<name>.py` and change `OSRC_DIR`, `OUT_TMP`, `SYNTH_MANIFEST`, + `START_NNN`. The exclusion step compares against the *current* contents + of `facesets_swap_ready/faceset_NNN/` so it picks up everything emitted + by previous discovery / split / hand-sorted runs. +- Lowering `MIN_FACES` from 6 to 4 would have admitted ~3 additional + marginal clusters at this corpus size; the trade-off is a noisier + identity average for small-N facesets. +- `extend` should be run before `cluster_osrc.py` so `raw_full/` and + `facesets_full/` stay in sync — `cluster_osrc.py` itself only writes + to `facesets_swap_ready/`.
diff --git a/work/cluster_osrc.py b/work/cluster_osrc.py new file mode 100644 index 0000000..b06f770 --- /dev/null +++ b/work/cluster_osrc.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python3 +"""Discover new identities in /mnt/x/src/osrc and emit them as facesets. + +Workflow (mirrors the shape of build_folders.py, but identities are +discovered by clustering rather than asserted by folder): + + 1. Load cache; restrict to face records whose canonical or alias path + lies under /mnt/x/src/osrc/. + 2. Build centroids of the existing 19 canonical identities in + facesets_swap_ready/faceset_001..019. Drop any osrc face whose + nearest-existing-identity cos-dist <= EXISTING_MATCH_THRESHOLD; + those are already covered by `extend` and shouldn't seed new + facesets. + 3. Cluster the remaining osrc faces among themselves at + INITIAL_THRESHOLD (matches `extend`'s new_cluster_threshold default). + 4. Per cluster, apply refine-equivalent gates: face_short >= MIN_SHORT, + blur >= MIN_BLUR, det_score >= MIN_DET_SCORE; for clusters >= 4, + drop faces with cos-dist > OUTLIER_THRESHOLD from the cluster + centroid. + 5. Keep clusters whose surviving unique source-path count is >= MIN_FACES. + 6. Number kept clusters faceset_020, 021, ... (past the highest existing + in facesets_swap_ready, which is 019). Order by descending size. + 7. Synthesize a refine_manifest.json and call cmd_export_swap on it, + emitting into a temp dir. Move new dirs into facesets_swap_ready/. + 8. Append new entries to the top-level facesets_swap_ready/manifest.json + (preserving existing facesets / thin_eras). 
+""" + +from __future__ import annotations + +import json +import shutil +import sys +from pathlib import Path + +import numpy as np + +REPO = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO)) + +from sort_faces import ( # noqa: E402 + _cluster_embeddings, + cmd_export_swap, + load_cache, +) + +# ---- config -------------------------------------------------------------- # + +CACHE = REPO / "work" / "cache" / "nl_full.npz" +SWAP_READY = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +OUT_TMP = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready_osrc_new") +SYNTH_MANIFEST = REPO / "work" / "synthetic_osrc_manifest.json" + +OSRC_DIR = Path("/mnt/x/src/osrc") +START_NNN = 20 # facesets_swap_ready max is 019; pick up here. + +# Existing-identity exclusion: drop osrc faces whose nearest existing +# identity centroid is within this cosine distance. 0.45 matches the +# build_folders.py OSRC_THRESHOLD: at this cutoff the face is already +# routed to an existing identity by extend / build_folders.py. +EXISTING_MATCH_THRESHOLD = 0.45 + +# Cluster the unmatched. +INITIAL_THRESHOLD = 0.55 + +# Refine-equivalent gates (per the user's request: drop min_faces to 6). +MIN_FACES = 6 +MIN_SHORT = 90 +MIN_BLUR = 40.0 +MIN_DET_SCORE = 0.6 +OUTLIER_THRESHOLD = 0.55 # only applied if cluster >= 4 + +# export-swap params (defaults from sort_faces.py). 
TOP_N = 30
EXPORT_OUTLIER_THRESHOLD = 0.45
PAD_RATIO = 0.5
OUT_SIZE = 512
EXPORT_MIN_FACE_SHORT = 100


# ---- helpers ------------------------------------------------------------- #

def _normalize(v: np.ndarray) -> np.ndarray:
    """Return *v* scaled to unit L2 norm (the zero vector is returned as-is)."""
    n = np.linalg.norm(v)
    return v / n if n > 0 else v


def _under(folder: Path, p: str) -> bool:
    """True if path string *p* is *folder* itself or lies under it.

    Pure string-prefix comparison (no filesystem access), so it also works
    for cache paths that no longer exist on disk.
    """
    fs = str(folder).rstrip("/") + "/"
    return p == str(folder) or p.startswith(fs)


def _record_in_folder(rec: dict, folder: Path, path_aliases: dict[str, list[str]]) -> bool:
    """True if a cache face record's canonical path — or any of its
    byte-duplicate alias paths — resolves under *folder*."""
    if _under(folder, rec["path"]):
        return True
    return any(_under(folder, alias) for alias in path_aliases.get(rec["path"], []))


def _existing_identity_centroids(
    emb: np.ndarray, face_records: list[dict]
) -> tuple[np.ndarray, list[str]]:
    """Build centroids of the existing canonical identities.

    Returns a (n_identities, D) matrix of L2-normalized centroids plus a
    parallel list of faceset names, drawn from the faceset_001..019
    manifests in facesets_swap_ready/.  Facesets with a missing manifest,
    or whose faces cannot be matched back to the cache, are skipped.

    FIX: previously `np.stack([])` raised ValueError when nothing matched;
    an empty (0, D) matrix is now returned instead.
    """
    # Map (source path, bbox tuple) -> row index in `emb`, mirroring the
    # (source, bbox) keys written into each faceset manifest.
    bbox_idx: dict[tuple[str, tuple], int] = {
        (m["path"], tuple(m.get("bbox") or ())): i for i, m in enumerate(face_records)
    }
    centroids: list[np.ndarray] = []
    names: list[str] = []
    for n in range(1, 20):
        d = SWAP_READY / f"faceset_{n:03d}"
        man_path = d / "manifest.json"
        if not man_path.exists():
            continue
        man = json.loads(man_path.read_text())
        keys = [(f["source"], tuple(f.get("bbox") or ())) for f in man.get("faces", [])]
        idxs = [bbox_idx[k] for k in keys if k in bbox_idx]
        if not idxs:
            continue
        centroids.append(_normalize(emb[idxs].mean(axis=0)))
        names.append(d.name)
    if not centroids:
        # No existing identities found: return an empty matrix so callers
        # can treat "nothing matched" uniformly instead of crashing.
        return np.zeros((0, emb.shape[1]), dtype=emb.dtype), []
    return np.stack(centroids), names


# ---- phase 1: identify new osrc clusters --------------------------------- #

def discover_new_clusters() -> tuple[dict, list[dict]]:
    """Cluster osrc faces not already covered by existing identities.

    Writes the synthetic refine manifest to SYNTH_MANIFEST and returns
    (manifest_dict, kept_clusters).  Read-only apart from that one file.

    Raises SystemExit if the cache's metadata and embedding rows disagree.
    """
    emb, meta, _src_root, _proc, path_aliases = load_cache(CACHE)
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    print(f"Cache: {len(face_records)} face records.")

    # Step 1: filter to osrc (canonical or byte-aliased path).
    osrc_idx = [
        i for i, m in enumerate(face_records)
        if _record_in_folder(m, OSRC_DIR, path_aliases)
    ]
    print(f"osrc face records: {len(osrc_idx)}")

    # Step 2: drop those already matching an existing identity.
    # NOTE(review): cosine similarity via plain dot product assumes the
    # cached embeddings are L2-normalized — confirm against load_cache.
    cents, cent_names = _existing_identity_centroids(emb, face_records)
    osrc_emb = emb[osrc_idx]
    if len(cents):
        sims = osrc_emb @ cents.T
        nearest_d = 1.0 - sims.max(axis=1)
        nearest_id = sims.argmax(axis=1)
    else:
        # FIX: with no existing identities nothing is "covered"; previously
        # this path crashed reducing over a zero-width similarity matrix.
        nearest_d = np.full(len(osrc_idx), np.inf)
        nearest_id = np.zeros(len(osrc_idx), dtype=int)
    covered_mask = nearest_d <= EXISTING_MATCH_THRESHOLD
    n_covered = int(covered_mask.sum())
    print(
        f"Already covered by existing 19 identities at cos-dist <= "
        f"{EXISTING_MATCH_THRESHOLD}: {n_covered}/{len(osrc_idx)}"
    )
    # Per-identity coverage breakdown (for logging only).
    for j, name in enumerate(cent_names):
        c = int(((nearest_id == j) & covered_mask).sum())
        if c:
            print(f"  -> {name}: {c}")

    new_idx = [osrc_idx[k] for k in range(len(osrc_idx)) if not covered_mask[k]]
    print(f"\nUnmatched osrc faces to cluster: {len(new_idx)}")

    # Step 3: cluster the unmatched among themselves.
    new_emb = emb[new_idx]
    if len(new_idx) <= 1:
        labels = np.zeros(len(new_idx), dtype=int)
    else:
        labels = _cluster_embeddings(new_emb, INITIAL_THRESHOLD)
    uniq_labels, label_counts = np.unique(labels, return_counts=True)
    n_clusters = len(uniq_labels)
    top_sizes = sorted((int(c) for c in label_counts), reverse=True)[:10]
    print(
        f"Initial clusters at threshold {INITIAL_THRESHOLD}: {n_clusters} "
        f"(top sizes: {top_sizes})"
    )

    # Step 4 + 5: per-cluster refine gates + min_faces.
    clusters: dict[int, list[int]] = {}
    for k, lab in enumerate(labels):
        clusters.setdefault(int(lab), []).append(new_idx[k])

    kept_clusters: list[dict] = []
    drop_quality_total = 0
    drop_outlier_total = 0
    for idxs in clusters.values():
        # Per-face quality gate (refine-equivalent thresholds).
        good: list[int] = []
        for i in idxs:
            r = face_records[i]
            if r.get("face_short", 0) < MIN_SHORT:
                drop_quality_total += 1
                continue
            if r.get("blur", 0.0) < MIN_BLUR:
                drop_quality_total += 1
                continue
            if r.get("det_score", 0.0) < MIN_DET_SCORE:
                drop_quality_total += 1
                continue
            good.append(i)
        if not good:
            continue

        # Outlier rejection against the cluster centroid (only applied to
        # clusters of >= 4 faces — smaller centroids are too noisy).
        if len(good) >= 4:
            cent = _normalize(emb[good].mean(axis=0))
            dists = 1.0 - emb[good] @ cent
            tight = [good[k] for k, dist in enumerate(dists) if dist <= OUTLIER_THRESHOLD]
            drop_outlier_total += len(good) - len(tight)
            good = tight
            if not good:
                continue

        # Cluster size is counted in unique source *images*, not face crops.
        unique_paths = sorted({face_records[i]["path"] for i in good})
        if len(unique_paths) < MIN_FACES:
            continue

        kept_clusters.append({
            "indices": good,
            "unique_paths": unique_paths,
            "size_face": len(good),
            "size_paths": len(unique_paths),
        })

    # Largest clusters get the lowest new faceset numbers.
    kept_clusters.sort(key=lambda c: -c["size_paths"])
    print(
        f"\nAfter quality gate ({drop_quality_total} dropped) + outlier "
        f"rejection ({drop_outlier_total} dropped) + min_faces={MIN_FACES}: "
        f"{len(kept_clusters)} clusters kept"
    )
    for rank, c in enumerate(kept_clusters, start=START_NNN):
        print(
            f"  faceset_{rank:03d}: faces={c['size_face']:3d} "
            f"unique_paths={c['size_paths']:3d}"
        )

    # Build synthetic refine_manifest.json compatible with cmd_export_swap.
    facesets = [
        {
            "name": f"faceset_{rank:03d}",
            "image_count": c["size_paths"],
            "face_count": c["size_face"],
            "images": c["unique_paths"],
        }
        for rank, c in enumerate(kept_clusters, start=START_NNN)
    ]
    manifest = {
        "params": {
            "existing_match_threshold": EXISTING_MATCH_THRESHOLD,
            "initial_threshold": INITIAL_THRESHOLD,
            "outlier_threshold": OUTLIER_THRESHOLD,
            "min_faces": MIN_FACES,
            "min_short": MIN_SHORT,
            "min_blur": MIN_BLUR,
            "min_det_score": MIN_DET_SCORE,
            "source_root": str(OSRC_DIR),
        },
        "facesets": facesets,
    }
    SYNTH_MANIFEST.write_text(json.dumps(manifest, indent=2))
    print(f"\nSynthetic manifest -> {SYNTH_MANIFEST}")
    return manifest, kept_clusters


# ---- phase 2: export + relocate + merge top-level manifest -------------- #

def export_and_relocate(manifest: dict) -> None:
    """Run cmd_export_swap on the synthetic manifest, relocate the emitted
    faceset dirs into SWAP_READY, and merge the top-level manifest.

    FIX: only facesets whose directory was actually relocated are appended
    to the top-level manifest.  Previously entries for missing or
    refused-to-overwrite dirs were appended too, leaving manifest entries
    with no backing directory under SWAP_READY.
    """
    # Fresh temp output dir for this run.
    if OUT_TMP.exists():
        shutil.rmtree(OUT_TMP)
    OUT_TMP.mkdir(parents=True)

    print(f"\nRunning cmd_export_swap -> {OUT_TMP}")
    cmd_export_swap(
        cache_path=CACHE,
        refine_manifest_path=SYNTH_MANIFEST,
        raw_manifest_path=None,
        out_dir=OUT_TMP,
        top_n=TOP_N,
        outlier_threshold=EXPORT_OUTLIER_THRESHOLD,
        pad_ratio=PAD_RATIO,
        out_size=OUT_SIZE,
        include_candidates=False,
        candidate_match_threshold=0.55,
        candidate_min_score=0.40,
        min_face_short=EXPORT_MIN_FACE_SHORT,
    )

    new_top = json.loads((OUT_TMP / "manifest.json").read_text())
    new_entries = new_top.get("facesets", [])

    moved = 0
    moved_names: set[str] = set()
    for fs_meta in new_entries:
        name = fs_meta["name"]
        src_dir = OUT_TMP / name
        if not src_dir.exists():
            # export-swap can skip a faceset entirely (e.g. empty after its
            # own gates); nothing to move then.
            print(f"[{name}] export dir missing; skipping")
            continue
        dst_dir = SWAP_READY / name
        if dst_dir.exists():
            print(f"[{name}] {dst_dir} already exists; refusing to overwrite")
            continue
        # Add a marker file so the source provenance is obvious.
        (src_dir / "osrc.txt").write_text(
            f"{name}\n\nSource: osrc cluster (auto-discovered, {OSRC_DIR}).\n"
        )
        shutil.move(str(src_dir), str(dst_dir))
        moved += 1
        moved_names.add(name)
        print(f"[{name}] -> {dst_dir}")

    # Merge top-level manifest, preserving facesets / thin_eras / etc.
    final_manifest_path = SWAP_READY / "manifest.json"
    if final_manifest_path.exists():
        existing = json.loads(final_manifest_path.read_text())
    else:
        existing = {"facesets": []}
    existing.setdefault("facesets", [])

    existing_names = {fs["name"] for fs in existing["facesets"]}
    appended = 0
    for entry in new_entries:
        if entry["name"] not in moved_names:
            # FIX: never record a faceset whose dir was not relocated.
            continue
        if entry["name"] in existing_names:
            print(f"[manifest] {entry['name']} already present; not duplicating")
            continue
        existing["facesets"].append(entry)
        appended += 1

    final_manifest_path.write_text(json.dumps(existing, indent=2))
    print(f"\nMerged manifest: appended {appended} entries -> {final_manifest_path}")
    print(f"Moved {moved} faceset directories into {SWAP_READY}")

    # Clean up temp dir if empty.
    # NOTE(review): export-swap's own manifest.json stays behind in
    # OUT_TMP, so this branch rarely (if ever) fires; kept deliberately so
    # leftovers remain inspectable after a partial run.
    if OUT_TMP.exists():
        leftover = list(OUT_TMP.iterdir())
        if not leftover:
            OUT_TMP.rmdir()


# ---- main ---------------------------------------------------------------- #

def main() -> None:
    """CLI entry point; `--dry-run` stops after cluster discovery."""
    dry = "--dry-run" in sys.argv
    manifest, _kept = discover_new_clusters()
    if dry:
        print("\n--dry-run: stopping after cluster discovery (no exports written).")
        return
    if not manifest.get("facesets"):
        print("No new facesets to build; nothing to do.")
        return
    export_and_relocate(manifest)
    print("\nDone.")


if __name__ == "__main__":
    main()