#!/usr/bin/env python3 """Build per-folder facesets from hand-sorted source directories. Phase B + C of the folder-import workflow: - Filter cache records into per-folder identity sets, run 2-pass centroid+outlier rejection so non-target faces in group photos drop out. - Route every osrc face record to every trusted-folder identity within a tight cosine cutoff (multi-identity osrc photos land in multiple facesets; cmd_export_swap then per-bbox-filters so each faceset crops only the matching face). - Synthesize a refine_manifest.json compatible with cmd_export_swap. - Invoke cmd_export_swap to emit faceset_NNN/ dirs into a temp output dir. - Rename .fsz bundles after the source folder, replace NAME.txt with foldername.txt, move dirs into the canonical facesets_swap_ready/, merge top-level manifest preserving existing faceset_001..012 entries. """ from __future__ import annotations import json import shutil import sys from pathlib import Path import numpy as np REPO = Path(__file__).resolve().parent.parent sys.path.insert(0, str(REPO)) from sort_faces import ( # noqa: E402 cmd_export_swap, load_cache, ) # ---- config -------------------------------------------------------------- # CACHE = REPO / "work" / "cache" / "nl_full.npz" OUT_FINAL = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") OUT_TMP = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready_new") SYNTH_MANIFEST = REPO / "work" / "synthetic_refine_manifest.json" # Trusted folders, in numbering order. faceset_NNN starts at 013. TRUSTED: list[tuple[str, Path]] = [ ("k", Path("/mnt/x/src/k")), ("m", Path("/mnt/x/src/m")), ("mi", Path("/mnt/x/src/mi")), ("mir", Path("/mnt/x/src/mir")), ("s", Path("/mnt/x/src/s")), ("sab", Path("/mnt/x/src/sab")), ("t", Path("/mnt/x/src/t")), ] START_NNN = 13 OSRC_DIR = Path("/mnt/x/src/osrc") # Centroid-build outlier passes (loose then tight). PASS1_THRESHOLD = 0.55 PASS2_THRESHOLD = 0.45 # osrc routing cutoff (tight). 
OSRC_THRESHOLD = 0.45

# export-swap params (defaults from sort_faces.py).
TOP_N = 30
EXPORT_OUTLIER_THRESHOLD = 0.45
PAD_RATIO = 0.5
OUT_SIZE = 512
MIN_FACE_SHORT = 100


# ---- helpers ------------------------------------------------------------- #
def _normalize_rows(mat: np.ndarray) -> np.ndarray:
    """Return *mat* with each row scaled to unit L2 norm.

    Zero rows are left unchanged (their norm is replaced by 1.0 to avoid
    division by zero). NOTE(review): not called anywhere in the visible
    portion of this file — possibly used by code past the end of this view,
    or dead; confirm before removing.
    """
    n = np.linalg.norm(mat, axis=1, keepdims=True)
    n[n == 0] = 1.0
    return mat / n


def _centroid(vecs: np.ndarray) -> np.ndarray:
    """Mean of *vecs* rows, L2-normalized (returned raw if the mean is 0)."""
    c = vecs.mean(axis=0)
    n = np.linalg.norm(c)
    return c / n if n > 0 else c


def _under(folder: Path, p: str) -> bool:
    """True iff path string p lies under folder."""
    # String-prefix test with a trailing "/" so "/a/bc" does not match "/a/b".
    fs = str(folder).rstrip("/") + "/"
    return p == str(folder) or p.startswith(fs)


def _record_in_folder(rec: dict, folder: Path, path_aliases: dict[str, list[str]]) -> bool:
    """True iff the record's path — or any of its cache aliases — is under *folder*.

    *path_aliases* maps a canonical cache path to alternate path strings the
    same image is known by (presumably duplicates found during caching —
    TODO confirm against load_cache).
    """
    if _under(folder, rec["path"]):
        return True
    for alias in path_aliases.get(rec["path"], []):
        if _under(folder, alias):
            return True
    return False


# ---- phase B: identity centroids + osrc routing ------------------------- #
def build_synthetic_manifest() -> tuple[dict, dict[str, np.ndarray], dict[str, dict]]:
    """Phase B: build per-folder identity sets and write the synthetic manifest.

    For each TRUSTED folder: collect its face records, build a centroid, and
    run two outlier-rejection passes (loose PASS1_THRESHOLD, then tight
    PASS2_THRESHOLD), re-deriving the centroid after each pass. Then route
    every osrc face to every centroid within OSRC_THRESHOLD (a face may land
    in several identities). Writes SYNTH_MANIFEST as JSON.

    Returns:
        (manifest dict, {label: centroid}, {label: per-folder stats dict}).

    Raises:
        SystemExit: if the cache's embedding matrix and face-record count
            disagree.
    """
    emb, meta, _src_root, _processed, path_aliases = load_cache(CACHE)
    # emb is aligned with the no-noface-filtered records (matching cmd_export_swap's
    # invariant). Use indices into face_records to access emb.
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    print(f"Loaded cache: {len(face_records)} face records.")

    # Per-folder identity centroids.
    centroids: dict[str, np.ndarray] = {}
    folder_paths: dict[str, set[str]] = {}
    folder_stats: dict[str, dict] = {}
    for label, folder in TRUSTED:
        idxs = [i for i, m in enumerate(face_records) if _record_in_folder(m, folder, path_aliases)]
        if not idxs:
            print(f"[{label}] no face records found under {folder}; skipping")
            continue
        vecs = emb[idxs]
        cent = _centroid(vecs)
        # Pass 1: drop loose outliers.
        # Cosine distance; assumes emb rows are unit-normalized (consistent
        # with the "already normalized" note in the osrc routing below).
        d1 = 1.0 - vecs @ cent
        keep1 = [idxs[k] for k, dist in enumerate(d1) if dist <= PASS1_THRESHOLD]
        if not keep1:
            print(f"[{label}] every face was a pass-1 outlier; using all faces as-is")
            keep1 = idxs
        cent = _centroid(emb[keep1])
        # Pass 2: tight outlier rejection.
        d2 = 1.0 - emb[keep1] @ cent
        keep2 = [keep1[k] for k, dist in enumerate(d2) if dist <= PASS2_THRESHOLD]
        if not keep2:
            print(f"[{label}] every face was a pass-2 outlier; falling back to pass-1")
            keep2 = keep1
        # Final centroid is rebuilt from the surviving faces only.
        cent = _centroid(emb[keep2])
        centroids[label] = cent
        # Use canonical path strings; export-swap will look up indices by path.
        folder_paths[label] = {face_records[i]["path"] for i in keep2}
        folder_stats[label] = {
            "folder": str(folder),
            "input_records": len(idxs),
            "after_pass1": len(keep1),
            "after_pass2": len(keep2),
            "unique_paths": len(folder_paths[label]),
        }
        print(
            f"[{label}] in={len(idxs)} pass1={len(keep1)} pass2={len(keep2)} "
            f"unique_paths={len(folder_paths[label])}"
        )

    # osrc routing: every osrc face -> every centroid within OSRC_THRESHOLD.
    osrc_idxs = [
        i for i, m in enumerate(face_records)
        if _record_in_folder(m, OSRC_DIR, path_aliases)
    ]
    print(f"\nosrc: {len(osrc_idxs)} face records to route")
    if osrc_idxs and centroids:
        labels = list(centroids.keys())
        cent_mat = np.stack([centroids[lab] for lab in labels])
        # Build sims: (n_osrc, n_labels)
        osrc_emb = emb[osrc_idxs]
        sims = osrc_emb @ cent_mat.T  # cosine similarity (vectors already normalized)
        dists = 1.0 - sims
        per_label_added: dict[str, int] = {lab: 0 for lab in labels}
        for row, ci in enumerate(osrc_idxs):
            p = face_records[ci]["path"]
            for col, lab in enumerate(labels):
                if dists[row, col] <= OSRC_THRESHOLD:
                    # Only count paths not already claimed by the trusted set.
                    if p not in folder_paths[lab]:
                        folder_paths[lab].add(p)
                        per_label_added[lab] += 1
        for lab in labels:
            folder_stats[lab]["osrc_paths_added"] = per_label_added[lab]
            print(f"[{lab}] osrc faces routed: +{per_label_added[lab]} unique paths")

    # Build synthetic refine_manifest.
    # Numbering follows TRUSTED order starting at START_NNN; folders with no
    # records (skipped above) are still consumed by enumerate, so each label
    # keeps a stable faceset number.
    facesets: list[dict] = []
    for n, (label, _folder) in enumerate(TRUSTED, start=START_NNN):
        if label not in folder_paths:
            continue
        facesets.append({
            "name": f"faceset_{n:03d}",
            "label": label,
            "image_count": len(folder_paths[label]),
            "images": sorted(folder_paths[label]),
        })
    manifest = {
        "params": {
            "pass1_threshold": PASS1_THRESHOLD,
            "pass2_threshold": PASS2_THRESHOLD,
            "osrc_threshold": OSRC_THRESHOLD,
            "min_face_short": MIN_FACE_SHORT,
        },
        "facesets": facesets,
        "_per_folder_stats": folder_stats,
    }
    SYNTH_MANIFEST.write_text(json.dumps(manifest, indent=2))
    print(f"\nSynthetic manifest -> {SYNTH_MANIFEST}")
    return manifest, centroids, folder_stats


# ---- phase C: export + rename + merge ----------------------------------- #
def export_and_relocate(manifest: dict) -> None:
    """Phase C: run cmd_export_swap on the synthetic manifest, then rename
    and relocate the exported faceset dirs.

    NOTE(review): this definition is truncated at the end of the visible
    source; the remainder of the body is not shown here.
    """
    # Fresh temp output dir every run.
    if OUT_TMP.exists():
        shutil.rmtree(OUT_TMP)
    OUT_TMP.mkdir(parents=True)
    print(f"\nRunning cmd_export_swap -> {OUT_TMP}")
    cmd_export_swap(
        cache_path=CACHE,
        refine_manifest_path=SYNTH_MANIFEST,
        raw_manifest_path=None,
        out_dir=OUT_TMP,
        top_n=TOP_N,
        outlier_threshold=EXPORT_OUTLIER_THRESHOLD,
        pad_ratio=PAD_RATIO,
        out_size=OUT_SIZE,
        include_candidates=False,
        candidate_match_threshold=0.55,
        candidate_min_score=0.40,
        min_face_short=MIN_FACE_SHORT,
    )
    # Map name -> label from the synthetic manifest.
    name_to_label = {fs["name"]: fs["label"] for fs in manifest["facesets"]}
    # Load the temp top-level manifest (export-swap just wrote it).
    new_top = json.loads((OUT_TMP / "manifest.json").read_text())
    new_entries = new_top.get("facesets", [])
    # Per-faceset rename + relocate.
    for fs_meta in new_entries:
        name = fs_meta["name"]
        label = name_to_label.get(name)
        src_dir = OUT_TMP / name
        if not src_dir.exists():
            print(f"[{name}] export dir missing; skipping")
            continue
        # Rename .fsz bundles to