diff --git a/work/build_folders.py b/work/build_folders.py new file mode 100644 index 0000000..34b2a03 --- /dev/null +++ b/work/build_folders.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 +"""Build per-folder facesets from hand-sorted source directories. + +Phase B + C of the folder-import workflow: + - Filter cache records into per-folder identity sets, run 2-pass centroid+outlier + rejection so non-target faces in group photos drop out. + - Route every osrc face record to every trusted-folder identity within a tight + cosine cutoff (multi-identity osrc photos land in multiple facesets; + cmd_export_swap then per-bbox-filters so each faceset crops only the matching face). + - Synthesize a refine_manifest.json compatible with cmd_export_swap. + - Invoke cmd_export_swap to emit faceset_NNN/ dirs into a temp output dir. + - Rename .fsz bundles after the source folder, replace NAME.txt with foldername.txt, + move dirs into the canonical facesets_swap_ready/, merge top-level manifest + preserving existing faceset_001..012 entries. +""" + +from __future__ import annotations + +import json +import shutil +import sys +from pathlib import Path + +import numpy as np + +REPO = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO)) + +from sort_faces import ( # noqa: E402 + cmd_export_swap, + load_cache, +) + +# ---- config -------------------------------------------------------------- # + +CACHE = REPO / "work" / "cache" / "nl_full.npz" +OUT_FINAL = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +OUT_TMP = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready_new") +SYNTH_MANIFEST = REPO / "work" / "synthetic_refine_manifest.json" + +# Trusted folders, in numbering order. faceset_NNN starts at 013. 
TRUSTED: list[tuple[str, Path]] = [
    ("k", Path("/mnt/x/src/k")),
    ("m", Path("/mnt/x/src/m")),
    ("mi", Path("/mnt/x/src/mi")),
    ("mir", Path("/mnt/x/src/mir")),
    ("s", Path("/mnt/x/src/s")),
    ("sab", Path("/mnt/x/src/sab")),
    ("t", Path("/mnt/x/src/t")),
]
START_NNN = 13
OSRC_DIR = Path("/mnt/x/src/osrc")

# Centroid-build outlier passes (loose then tight).
PASS1_THRESHOLD = 0.55
PASS2_THRESHOLD = 0.45
# osrc routing cutoff (tight).
OSRC_THRESHOLD = 0.45

# export-swap params (defaults from sort_faces.py).
TOP_N = 30
EXPORT_OUTLIER_THRESHOLD = 0.45
PAD_RATIO = 0.5
OUT_SIZE = 512
MIN_FACE_SHORT = 100


# ---- helpers ------------------------------------------------------------- #

def _normalize_rows(mat: np.ndarray) -> np.ndarray:
    """Return *mat* with every row scaled to unit L2 norm.

    All-zero rows are returned unchanged (their norm is treated as 1) so the
    division never produces NaN.
    """
    n = np.linalg.norm(mat, axis=1, keepdims=True)
    n[n == 0] = 1.0
    return mat / n


def _centroid(vecs: np.ndarray) -> np.ndarray:
    """Return the L2-normalized mean of the rows of *vecs*.

    A zero mean vector is returned as-is (it cannot be normalized).

    Raises:
        ValueError: if *vecs* has no rows. The mean of an empty set is NaN,
            and NaN distances would silently fail every threshold comparison
            downstream.
    """
    if len(vecs) == 0:
        raise ValueError("cannot compute the centroid of zero vectors")
    c = vecs.mean(axis=0)
    n = np.linalg.norm(c)
    return c / n if n > 0 else c


def _under(folder: Path, p: str) -> bool:
    """True iff path string p lies under folder.

    This is a pure string comparison with "/" as the separator; the trailing
    separator on the prefix keeps "/a/bc" from matching folder "/a/b".
    """
    fs = str(folder).rstrip("/") + "/"
    return p == str(folder) or p.startswith(fs)


def _record_in_folder(rec: dict, folder: Path, path_aliases: dict[str, list[str]]) -> bool:
    """True iff the record's path, or any alias registered for it, is under *folder*.

    *path_aliases* maps a record's canonical path to its alternative paths;
    a canonical path missing from the mapping simply has no aliases.
    """
    path = rec["path"]
    if _under(folder, path):
        return True
    return any(_under(folder, alias) for alias in path_aliases.get(path, []))


def _two_pass_inliers(
    emb: np.ndarray,
    idxs: list[int],
    label: str,
    pass1_threshold: float = PASS1_THRESHOLD,
    pass2_threshold: float = PASS2_THRESHOLD,
) -> tuple[np.ndarray, list[int], list[int]]:
    """Run loose-then-tight centroid outlier rejection over ``emb[idxs]``.

    Distances are ``1 - dot(vec, centroid)``, i.e. cosine distance provided the
    rows of *emb* are unit-normalized.
    NOTE(review): nothing here normalizes *emb*; confirm the cache stores
    normalized embeddings (``_normalize_rows`` is defined above but never called).

    Args:
        emb: embedding matrix, one row per face record.
        idxs: non-empty row indices belonging to one folder.
        label: folder label, used only in the fallback messages.
        pass1_threshold: maximum distance kept by the loose first pass.
        pass2_threshold: maximum distance kept by the tight second pass.

    Returns:
        ``(centroid, keep1, keep2)``: the final centroid and the row indices
        surviving pass 1 and pass 2. If a pass would reject every face, its
        input set is kept unchanged instead.
    """
    cent = _centroid(emb[idxs])

    # Pass 1: drop loose outliers.
    d1 = 1.0 - emb[idxs] @ cent
    keep1 = [i for i, dist in zip(idxs, d1) if dist <= pass1_threshold]
    if not keep1:
        print(f"[{label}] every face was a pass-1 outlier; using all faces as-is")
        keep1 = list(idxs)
    cent = _centroid(emb[keep1])

    # Pass 2: tight outlier rejection against the cleaned-up centroid.
    d2 = 1.0 - emb[keep1] @ cent
    keep2 = [i for i, dist in zip(keep1, d2) if dist <= pass2_threshold]
    if not keep2:
        print(f"[{label}] every face was a pass-2 outlier; falling back to pass-1")
        keep2 = keep1
    cent = _centroid(emb[keep2])

    return cent, keep1, keep2


def _route_osrc(
    emb: np.ndarray,
    face_records: list[dict],
    osrc_idxs: list[int],
    centroids: dict[str, np.ndarray],
    folder_paths: dict[str, set[str]],
    threshold: float = OSRC_THRESHOLD,
) -> dict[str, int]:
    """Add each osrc face's path to every identity whose centroid is within *threshold*.

    *folder_paths* is updated in place. One face may match several identities,
    so a path can land in more than one set.

    Returns:
        Per-label count of paths that were newly added (paths already present
        in a label's set are not counted again).
    """
    labels = list(centroids)
    cent_mat = np.stack([centroids[lab] for lab in labels])
    # (n_osrc, n_labels) distance matrix; see the normalization note on
    # _two_pass_inliers.
    dists = 1.0 - emb[osrc_idxs] @ cent_mat.T

    added = {lab: 0 for lab in labels}
    for row, rec_idx in enumerate(osrc_idxs):
        path = face_records[rec_idx]["path"]
        for col in np.flatnonzero(dists[row] <= threshold):
            lab = labels[col]
            if path not in folder_paths[lab]:
                folder_paths[lab].add(path)
                added[lab] += 1
    return added


# ---- phase B: identity centroids + osrc routing ------------------------- #

def build_synthetic_manifest() -> tuple[dict, dict[str, np.ndarray], dict[str, dict]]:
    """Build the synthetic refine manifest and write it to ``SYNTH_MANIFEST``.

    For each trusted folder with face records in the cache, an identity
    centroid is built by two-pass outlier rejection. Every osrc face is then
    routed to each identity within ``OSRC_THRESHOLD``.

    Faceset numbers follow a folder's position in ``TRUSTED`` starting at
    ``START_NNN``; a folder with no face records is skipped but still
    consumes its number.

    Returns:
        ``(manifest, centroids, folder_stats)``: the manifest dict that was
        written, the per-label centroids, and the per-label statistics.

    Raises:
        SystemExit: if the number of face records differs from the number of
            embedding rows.
    """
    emb, meta, _src_root, _processed, path_aliases = load_cache(CACHE)
    # emb is aligned with the no-noface-filtered records (matching cmd_export_swap's
    # invariant). Use indices into face_records to access emb.
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")

    print(f"Loaded cache: {len(face_records)} face records.")

    # Per-folder identity centroids.
    centroids: dict[str, np.ndarray] = {}
    folder_paths: dict[str, set[str]] = {}
    folder_stats: dict[str, dict] = {}

    for label, folder in TRUSTED:
        idxs = [
            i for i, m in enumerate(face_records)
            if _record_in_folder(m, folder, path_aliases)
        ]
        if not idxs:
            print(f"[{label}] no face records found under {folder}; skipping")
            continue

        cent, keep1, keep2 = _two_pass_inliers(emb, idxs, label)

        centroids[label] = cent
        # Use canonical path strings; export-swap will look up indices by path.
        folder_paths[label] = {face_records[i]["path"] for i in keep2}
        folder_stats[label] = {
            "folder": str(folder),
            "input_records": len(idxs),
            "after_pass1": len(keep1),
            "after_pass2": len(keep2),
            "unique_paths": len(folder_paths[label]),
            # Always present so the stats schema does not depend on whether
            # any osrc records exist; overwritten below when routing runs.
            "osrc_paths_added": 0,
        }
        print(
            f"[{label}] in={len(idxs)} pass1={len(keep1)} pass2={len(keep2)} "
            f"unique_paths={len(folder_paths[label])}"
        )

    # osrc routing: every osrc face -> every centroid within OSRC_THRESHOLD.
    osrc_idxs = [
        i for i, m in enumerate(face_records)
        if _record_in_folder(m, OSRC_DIR, path_aliases)
    ]
    print(f"\nosrc: {len(osrc_idxs)} face records to route")
    if osrc_idxs and centroids:
        added = _route_osrc(emb, face_records, osrc_idxs, centroids, folder_paths)
        for lab, count in added.items():
            folder_stats[lab]["osrc_paths_added"] = count
            print(f"[{lab}] osrc faces routed: +{count} unique paths")

    # Build synthetic refine_manifest.
    facesets: list[dict] = [
        {
            "name": f"faceset_{n:03d}",
            "label": label,
            "image_count": len(folder_paths[label]),
            "images": sorted(folder_paths[label]),
        }
        for n, (label, _folder) in enumerate(TRUSTED, start=START_NNN)
        if label in folder_paths
    ]

    manifest = {
        "params": {
            "pass1_threshold": PASS1_THRESHOLD,
            "pass2_threshold": PASS2_THRESHOLD,
            "osrc_threshold": OSRC_THRESHOLD,
            "min_face_short": MIN_FACE_SHORT,
        },
        "facesets": facesets,
        "_per_folder_stats": folder_stats,
    }
    SYNTH_MANIFEST.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    print(f"\nSynthetic manifest -> {SYNTH_MANIFEST}")
    return manifest, centroids, folder_stats
raw_manifest_path=None, + out_dir=OUT_TMP, + top_n=TOP_N, + outlier_threshold=EXPORT_OUTLIER_THRESHOLD, + pad_ratio=PAD_RATIO, + out_size=OUT_SIZE, + include_candidates=False, + candidate_match_threshold=0.55, + candidate_min_score=0.40, + min_face_short=MIN_FACE_SHORT, + ) + + # Map name -> label from the synthetic manifest. + name_to_label = {fs["name"]: fs["label"] for fs in manifest["facesets"]} + + # Load the temp top-level manifest (export-swap just wrote it). + new_top = json.loads((OUT_TMP / "manifest.json").read_text()) + new_entries = new_top.get("facesets", []) + + # Per-faceset rename + relocate. + for fs_meta in new_entries: + name = fs_meta["name"] + label = name_to_label.get(name) + src_dir = OUT_TMP / name + if not src_dir.exists(): + print(f"[{name}] export dir missing; skipping") + continue + + # Rename .fsz bundles to