Files
face-sets/work/build_folders.py
Peter 1d82d71e68 Force-track work/build_folders.py
The README documents work/build_folders.py as the orchestration script
for hand-sorted-folder identity import, but it was excluded by the
work/ gitignore. Force-track it for parity with the other orchestration
scripts (age_split_001.py, check_faceset001_age.py) so the documented
workflow points at code that exists in the repo.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 12:13:56 +02:00

324 lines
12 KiB
Python

#!/usr/bin/env python3
"""Build per-folder facesets from hand-sorted source directories.
Phase B + C of the folder-import workflow:
- Filter cache records into per-folder identity sets, run 2-pass centroid+outlier
rejection so non-target faces in group photos drop out.
- Route every osrc face record to every trusted-folder identity within a tight
cosine cutoff (multi-identity osrc photos land in multiple facesets;
cmd_export_swap then per-bbox-filters so each faceset crops only the matching face).
- Synthesize a refine_manifest.json compatible with cmd_export_swap.
- Invoke cmd_export_swap to emit faceset_NNN/ dirs into a temp output dir.
- Rename .fsz bundles after the source folder, replace NAME.txt with foldername.txt,
move dirs into the canonical facesets_swap_ready/, merge top-level manifest
preserving existing faceset_001..012 entries.
"""
from __future__ import annotations
import json
import shutil
import sys
from pathlib import Path
import numpy as np
REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))
from sort_faces import ( # noqa: E402
cmd_export_swap,
load_cache,
)
# ---- config -------------------------------------------------------------- #
# Embedding/metadata cache produced upstream; input to both phases below.
CACHE = REPO / "work" / "cache" / "nl_full.npz"
# Canonical destination for finished faceset_NNN dirs.
OUT_FINAL = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
# Scratch dir cmd_export_swap writes into before rename/relocate (phase C).
OUT_TMP = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready_new")
# Phase-B output; consumed by cmd_export_swap in phase C.
SYNTH_MANIFEST = REPO / "work" / "synthetic_refine_manifest.json"
# Trusted folders, in numbering order. faceset_NNN starts at 013.
TRUSTED: list[tuple[str, Path]] = [
    ("k", Path("/mnt/x/src/k")),
    ("m", Path("/mnt/x/src/m")),
    ("mi", Path("/mnt/x/src/mi")),
    ("mir", Path("/mnt/x/src/mir")),
    ("s", Path("/mnt/x/src/s")),
    ("sab", Path("/mnt/x/src/sab")),
    ("t", Path("/mnt/x/src/t")),
]
# First faceset number assigned to TRUSTED; 001..012 are pre-existing.
START_NNN = 13
# Unsorted source pool whose faces get routed to trusted-folder identities.
OSRC_DIR = Path("/mnt/x/src/osrc")
# Centroid-build outlier passes (loose then tight). Values are cosine distances.
PASS1_THRESHOLD = 0.55
PASS2_THRESHOLD = 0.45
# osrc routing cutoff (tight).
OSRC_THRESHOLD = 0.45
# export-swap params (defaults from sort_faces.py).
TOP_N = 30
EXPORT_OUTLIER_THRESHOLD = 0.45
PAD_RATIO = 0.5
OUT_SIZE = 512
MIN_FACE_SHORT = 100
# ---- helpers ------------------------------------------------------------- #
def _normalize_rows(mat: np.ndarray) -> np.ndarray:
n = np.linalg.norm(mat, axis=1, keepdims=True)
n[n == 0] = 1.0
return mat / n
def _centroid(vecs: np.ndarray) -> np.ndarray:
c = vecs.mean(axis=0)
n = np.linalg.norm(c)
return c / n if n > 0 else c
def _under(folder: Path, p: str) -> bool:
"""True iff path string p lies under folder."""
fs = str(folder).rstrip("/") + "/"
return p == str(folder) or p.startswith(fs)
def _record_in_folder(rec: dict, folder: Path, path_aliases: dict[str, list[str]]) -> bool:
if _under(folder, rec["path"]):
return True
for alias in path_aliases.get(rec["path"], []):
if _under(folder, alias):
return True
return False
# ---- phase B: identity centroids + osrc routing ------------------------- #
def build_synthetic_manifest() -> tuple[dict, dict[str, np.ndarray], dict[str, dict]]:
    """Phase B: build per-folder identity centroids and write the synthetic manifest.

    Steps:
      1. For each TRUSTED folder, collect its cached face embeddings and run a
         two-pass (loose PASS1_THRESHOLD, then tight PASS2_THRESHOLD)
         centroid/outlier rejection.
      2. Route every osrc face to every folder centroid within OSRC_THRESHOLD
         (a photo may land in multiple identities).
      3. Write SYNTH_MANIFEST in the faceset-list shape cmd_export_swap reads.

    Returns:
        (manifest dict, label -> centroid vector, label -> per-folder stats).

    Raises:
        SystemExit: when the non-noface record count disagrees with the
            embedding matrix row count (cache is inconsistent).
    """
    emb, meta, _src_root, _processed, path_aliases = load_cache(CACHE)
    # emb is aligned with the no-noface-filtered records (matching cmd_export_swap's
    # invariant). Use indices into face_records to access emb.
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    print(f"Loaded cache: {len(face_records)} face records.")
    # Per-folder identity centroids.
    centroids: dict[str, np.ndarray] = {}
    folder_paths: dict[str, set[str]] = {}   # label -> set of image paths kept
    folder_stats: dict[str, dict] = {}       # label -> counts for the manifest
    for label, folder in TRUSTED:
        idxs = [i for i, m in enumerate(face_records) if _record_in_folder(m, folder, path_aliases)]
        if not idxs:
            print(f"[{label}] no face records found under {folder}; skipping")
            continue
        vecs = emb[idxs]
        cent = _centroid(vecs)
        # Pass 1: drop loose outliers.
        # NOTE(review): 1 - dot is cosine distance only if cached embeddings are
        # unit-normalized — the L150 comment asserts they are; confirm in load_cache.
        d1 = 1.0 - vecs @ cent
        keep1 = [idxs[k] for k, dist in enumerate(d1) if dist <= PASS1_THRESHOLD]
        if not keep1:
            # Degenerate folder: keep everything rather than produce an empty set.
            print(f"[{label}] every face was a pass-1 outlier; using all faces as-is")
            keep1 = idxs
        cent = _centroid(emb[keep1])
        # Pass 2: tight outlier rejection.
        d2 = 1.0 - emb[keep1] @ cent
        keep2 = [keep1[k] for k, dist in enumerate(d2) if dist <= PASS2_THRESHOLD]
        if not keep2:
            print(f"[{label}] every face was a pass-2 outlier; falling back to pass-1")
            keep2 = keep1
        cent = _centroid(emb[keep2])
        centroids[label] = cent
        # Use canonical path strings; export-swap will look up indices by path.
        folder_paths[label] = {face_records[i]["path"] for i in keep2}
        folder_stats[label] = {
            "folder": str(folder),
            "input_records": len(idxs),
            "after_pass1": len(keep1),
            "after_pass2": len(keep2),
            "unique_paths": len(folder_paths[label]),
        }
        print(
            f"[{label}] in={len(idxs)} pass1={len(keep1)} pass2={len(keep2)} "
            f"unique_paths={len(folder_paths[label])}"
        )
    # osrc routing: every osrc face -> every centroid within OSRC_THRESHOLD.
    osrc_idxs = [
        i for i, m in enumerate(face_records)
        if _record_in_folder(m, OSRC_DIR, path_aliases)
    ]
    print(f"\nosrc: {len(osrc_idxs)} face records to route")
    if osrc_idxs and centroids:
        labels = list(centroids.keys())
        cent_mat = np.stack([centroids[lab] for lab in labels])
        # Build sims: (n_osrc, n_labels)
        osrc_emb = emb[osrc_idxs]
        sims = osrc_emb @ cent_mat.T  # cosine similarity (vectors already normalized)
        dists = 1.0 - sims
        per_label_added: dict[str, int] = {lab: 0 for lab in labels}
        for row, ci in enumerate(osrc_idxs):
            p = face_records[ci]["path"]
            for col, lab in enumerate(labels):
                if dists[row, col] <= OSRC_THRESHOLD:
                    # Count a path once per label; group photos can contribute
                    # several faces that share the same image path.
                    if p not in folder_paths[lab]:
                        folder_paths[lab].add(p)
                        per_label_added[lab] += 1
        for lab in labels:
            folder_stats[lab]["osrc_paths_added"] = per_label_added[lab]
            print(f"[{lab}] osrc faces routed: +{per_label_added[lab]} unique paths")
    # Build synthetic refine_manifest.
    facesets: list[dict] = []
    for n, (label, _folder) in enumerate(TRUSTED, start=START_NNN):
        if label not in folder_paths:
            # Skipped folder: numbering still advances, so faceset_NNN stays
            # aligned with TRUSTED order.
            continue
        facesets.append({
            "name": f"faceset_{n:03d}",
            "label": label,
            "image_count": len(folder_paths[label]),
            "images": sorted(folder_paths[label]),
        })
    manifest = {
        "params": {
            "pass1_threshold": PASS1_THRESHOLD,
            "pass2_threshold": PASS2_THRESHOLD,
            "osrc_threshold": OSRC_THRESHOLD,
            "min_face_short": MIN_FACE_SHORT,
        },
        "facesets": facesets,
        "_per_folder_stats": folder_stats,
    }
    SYNTH_MANIFEST.write_text(json.dumps(manifest, indent=2))
    print(f"\nSynthetic manifest -> {SYNTH_MANIFEST}")
    return manifest, centroids, folder_stats
# ---- phase C: export + rename + merge ----------------------------------- #
def export_and_relocate(manifest: dict) -> None:
    """Phase C: export facesets, rename to folder labels, merge into OUT_FINAL.

    Steps:
      - Run cmd_export_swap against SYNTH_MANIFEST into a fresh OUT_TMP.
      - Per faceset dir: rename .fsz bundles to <label>_*.fsz, swap the NAME.txt
        placeholder for <label>.txt, then move the dir into OUT_FINAL (existing
        dirs are never overwritten).
      - Append the new entries to OUT_FINAL/manifest.json, preserving the
        pre-existing faceset_001..012 entries and export-swap params.

    Args:
        manifest: the synthetic manifest returned by build_synthetic_manifest()
            (used only for the name -> label mapping).
    """
    if OUT_TMP.exists():
        shutil.rmtree(OUT_TMP)
    OUT_TMP.mkdir(parents=True)
    print(f"\nRunning cmd_export_swap -> {OUT_TMP}")
    cmd_export_swap(
        cache_path=CACHE,
        refine_manifest_path=SYNTH_MANIFEST,
        raw_manifest_path=None,
        out_dir=OUT_TMP,
        top_n=TOP_N,
        outlier_threshold=EXPORT_OUTLIER_THRESHOLD,
        pad_ratio=PAD_RATIO,
        out_size=OUT_SIZE,
        include_candidates=False,
        candidate_match_threshold=0.55,
        candidate_min_score=0.40,
        min_face_short=MIN_FACE_SHORT,
    )
    # Map name -> label from the synthetic manifest.
    name_to_label = {fs["name"]: fs["label"] for fs in manifest["facesets"]}
    # Load the temp top-level manifest (export-swap just wrote it).
    new_top = json.loads((OUT_TMP / "manifest.json").read_text())
    new_entries = new_top.get("facesets", [])
    # Per-faceset rename + relocate.
    for fs_meta in new_entries:
        name = fs_meta["name"]
        label = name_to_label.get(name)
        src_dir = OUT_TMP / name
        if not src_dir.exists():
            print(f"[{name}] export dir missing; skipping")
            continue
        # Rename .fsz bundles to <label>_*.fsz; record updated names.
        # Path objects are immutable, so fsz.name still holds the old name
        # after rename() — safe to use as the renames key.
        renames = {}
        for fsz in sorted(src_dir.glob(f"{name}_top*.fsz")):
            new = src_dir / fsz.name.replace(name + "_", label + "_", 1)
            fsz.rename(new)
            renames[fsz.name] = new.name
        for fsz in sorted(src_dir.glob(f"{name}_all.fsz")):
            new = src_dir / fsz.name.replace(name + "_", label + "_", 1)
            fsz.rename(new)
            renames[fsz.name] = new.name
        # Replace NAME.txt placeholder with <label>.txt.
        nametxt = src_dir / "NAME.txt"
        if nametxt.exists():
            nametxt.unlink()
        (src_dir / f"{label}.txt").write_text(
            f"{label}\n\nSource: /mnt/x/src/{label} (hand-sorted) + matched osrc faces.\n"
        )
        # Update fs_meta entry's fsz fields to point at the renamed files.
        for k in ("fsz_top", "fsz_all"):
            if fs_meta.get(k) and fs_meta[k] in renames:
                fs_meta[k] = renames[fs_meta[k]]
        fs_meta["label"] = label
        # Move the directory into the final output.
        dst_dir = OUT_FINAL / name
        if dst_dir.exists():
            print(f"[{name}] {dst_dir} already exists; refusing to overwrite")
            continue
        shutil.move(str(src_dir), str(dst_dir))
        print(f"[{name}] -> {dst_dir} (label={label})")
    # Merge top-level manifest, preserving existing faceset_001..012 entries.
    final_manifest_path = OUT_FINAL / "manifest.json"
    if final_manifest_path.exists():
        existing = json.loads(final_manifest_path.read_text())
    else:
        existing = {"facesets": []}
    existing_names = {fs["name"] for fs in existing.get("facesets", [])}
    appended = 0
    for entry in new_entries:
        if entry["name"] in existing_names:
            print(f"[manifest] {entry['name']} already in top-level manifest; not duplicating")
            continue
        existing["facesets"].append(entry)
        appended += 1
    # Carry over export-swap params if not already present.
    for k in ("quality_weights", "outlier_threshold", "top_n", "pad_ratio", "out_size"):
        if k not in existing and k in new_top:
            existing[k] = new_top[k]
    final_manifest_path.write_text(json.dumps(existing, indent=2))
    print(f"\nMerged manifest: appended {appended} entries -> {final_manifest_path}")
    # Clean up temp dir if empty. OUT_TMP usually still holds manifest.json,
    # in which case it is intentionally left behind for inspection.
    leftover = list(OUT_TMP.iterdir()) if OUT_TMP.exists() else []
    if not leftover:
        OUT_TMP.rmdir()
    else:
        # leave temp manifest.json for inspection
        pass
# ---- main ---------------------------------------------------------------- #
def main() -> None:
    """Run phase B (synthetic manifest build), then phase C (export + relocate)."""
    manifest, _centroids, _stats = build_synthetic_manifest()
    if manifest.get("facesets"):
        export_and_relocate(manifest)
        print("\nDone.")
    else:
        print("No facesets to build; nothing to do.")


if __name__ == "__main__":
    main()