The README documents work/build_folders.py as the orchestration script for
hand-sorted-folder identity import, but it was excluded by the work/ gitignore.
Force-track it for parity with the other orchestration scripts
(age_split_001.py, check_faceset001_age.py) so the documented workflow points
at code that exists in the repo.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
#!/usr/bin/env python3
"""Build per-folder facesets from hand-sorted source directories.

Phase B + C of the folder-import workflow:
- Filter cache records into per-folder identity sets, run 2-pass centroid+outlier
  rejection so non-target faces in group photos drop out.
- Route every osrc face record to every trusted-folder identity within a tight
  cosine cutoff (multi-identity osrc photos land in multiple facesets;
  cmd_export_swap then per-bbox-filters so each faceset crops only the matching face).
- Synthesize a refine_manifest.json compatible with cmd_export_swap.
- Invoke cmd_export_swap to emit faceset_NNN/ dirs into a temp output dir.
- Rename .fsz bundles after the source folder, replace NAME.txt with foldername.txt,
  move dirs into the canonical facesets_swap_ready/, merge top-level manifest
  preserving existing faceset_001..012 entries.
"""
from __future__ import annotations

import json
import shutil
import sys
from pathlib import Path

import numpy as np

REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))

from sort_faces import (  # noqa: E402
    cmd_export_swap,
    load_cache,
)

# ---- config -------------------------------------------------------------- #

CACHE = REPO / "work" / "cache" / "nl_full.npz"
OUT_FINAL = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
OUT_TMP = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready_new")
SYNTH_MANIFEST = REPO / "work" / "synthetic_refine_manifest.json"

# Trusted folders, in numbering order. faceset_NNN starts at 013.
TRUSTED: list[tuple[str, Path]] = [
    ("k", Path("/mnt/x/src/k")),
    ("m", Path("/mnt/x/src/m")),
    ("mi", Path("/mnt/x/src/mi")),
    ("mir", Path("/mnt/x/src/mir")),
    ("s", Path("/mnt/x/src/s")),
    ("sab", Path("/mnt/x/src/sab")),
    ("t", Path("/mnt/x/src/t")),
]
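# With START_NNN = 13, numbering follows TRUSTED order:
# k -> faceset_013, m -> 014, mi -> 015, mir -> 016, s -> 017, sab -> 018, t -> 019.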
START_NNN = 13
OSRC_DIR = Path("/mnt/x/src/osrc")

# Centroid-build outlier passes (loose then tight).
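# All thresholds below are cosine distances (1 - cosine similarity) between
# unit vectors: 0.55 admits faces within ~63 deg of a centroid, 0.45 within ~57 deg.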
PASS1_THRESHOLD = 0.55
PASS2_THRESHOLD = 0.45
# osrc routing cutoff (tight).
OSRC_THRESHOLD = 0.45

# export-swap params (defaults from sort_faces.py).
TOP_N = 30
EXPORT_OUTLIER_THRESHOLD = 0.45
PAD_RATIO = 0.5
OUT_SIZE = 512
MIN_FACE_SHORT = 100


# ---- helpers ------------------------------------------------------------- #

def _normalize_rows(mat: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(mat, axis=1, keepdims=True)
    n[n == 0] = 1.0
    return mat / n


def _centroid(vecs: np.ndarray) -> np.ndarray:
    c = vecs.mean(axis=0)
    n = np.linalg.norm(c)
    return c / n if n > 0 else c


def _under(folder: Path, p: str) -> bool:
    """True iff path string p lies under folder."""
    fs = str(folder).rstrip("/") + "/"
    return p == str(folder) or p.startswith(fs)


def _record_in_folder(rec: dict, folder: Path, path_aliases: dict[str, list[str]]) -> bool:
    if _under(folder, rec["path"]):
        return True
    for alias in path_aliases.get(rec["path"], []):
        if _under(folder, alias):
            return True
    return False


# ---- phase B: identity centroids + osrc routing ------------------------- #

def build_synthetic_manifest() -> tuple[dict, dict[str, np.ndarray], dict[str, dict]]:
    emb, meta, _src_root, _processed, path_aliases = load_cache(CACHE)
    # Defensive: the cosine math below assumes unit-norm rows; this is a no-op
    # if the cache already stores normalized embeddings.
    emb = _normalize_rows(emb)
    # emb is aligned with the records that survive the noface filter (matching
    # cmd_export_swap's invariant), so indices into face_records index emb too.
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")

    print(f"Loaded cache: {len(face_records)} face records.")

    # Per-folder identity centroids.
    centroids: dict[str, np.ndarray] = {}
    folder_paths: dict[str, set[str]] = {}
    folder_stats: dict[str, dict] = {}

    for label, folder in TRUSTED:
        idxs = [i for i, m in enumerate(face_records) if _record_in_folder(m, folder, path_aliases)]
        if not idxs:
            print(f"[{label}] no face records found under {folder}; skipping")
            continue

        vecs = emb[idxs]
        cent = _centroid(vecs)

        # Pass 1: drop loose outliers.
        d1 = 1.0 - vecs @ cent
        keep1 = [idxs[k] for k, dist in enumerate(d1) if dist <= PASS1_THRESHOLD]
        if not keep1:
            print(f"[{label}] every face was a pass-1 outlier; using all faces as-is")
            keep1 = idxs
        cent = _centroid(emb[keep1])

        # Pass 2: tight outlier rejection.
        d2 = 1.0 - emb[keep1] @ cent
        keep2 = [keep1[k] for k, dist in enumerate(d2) if dist <= PASS2_THRESHOLD]
        if not keep2:
            print(f"[{label}] every face was a pass-2 outlier; falling back to pass-1")
            keep2 = keep1
        cent = _centroid(emb[keep2])

        centroids[label] = cent
        # Use canonical path strings; export-swap will look up indices by path.
        folder_paths[label] = {face_records[i]["path"] for i in keep2}
        folder_stats[label] = {
            "folder": str(folder),
            "input_records": len(idxs),
            "after_pass1": len(keep1),
            "after_pass2": len(keep2),
            "unique_paths": len(folder_paths[label]),
        }
        print(
            f"[{label}] in={len(idxs)} pass1={len(keep1)} pass2={len(keep2)} "
            f"unique_paths={len(folder_paths[label])}"
        )

    # osrc routing: every osrc face -> every centroid within OSRC_THRESHOLD.
    osrc_idxs = [
        i for i, m in enumerate(face_records)
        if _record_in_folder(m, OSRC_DIR, path_aliases)
    ]
    print(f"\nosrc: {len(osrc_idxs)} face records to route")
    if osrc_idxs and centroids:
        labels = list(centroids.keys())
        cent_mat = np.stack([centroids[lab] for lab in labels])
        # Build sims: (n_osrc, n_labels).
        osrc_emb = emb[osrc_idxs]
        sims = osrc_emb @ cent_mat.T  # cosine similarity (vectors already normalized)
        dists = 1.0 - sims
        per_label_added: dict[str, int] = {lab: 0 for lab in labels}
        for row, ci in enumerate(osrc_idxs):
            p = face_records[ci]["path"]
            for col, lab in enumerate(labels):
                if dists[row, col] <= OSRC_THRESHOLD and p not in folder_paths[lab]:
                    folder_paths[lab].add(p)
                    per_label_added[lab] += 1
        for lab in labels:
            folder_stats[lab]["osrc_paths_added"] = per_label_added[lab]
            print(f"[{lab}] osrc faces routed: +{per_label_added[lab]} unique paths")

    # Build the synthetic refine_manifest.
    facesets: list[dict] = []
    for n, (label, _folder) in enumerate(TRUSTED, start=START_NNN):
        if label not in folder_paths:
            continue
        facesets.append({
            "name": f"faceset_{n:03d}",
            "label": label,
            "image_count": len(folder_paths[label]),
            "images": sorted(folder_paths[label]),
        })

    manifest = {
        "params": {
            "pass1_threshold": PASS1_THRESHOLD,
            "pass2_threshold": PASS2_THRESHOLD,
            "osrc_threshold": OSRC_THRESHOLD,
            "min_face_short": MIN_FACE_SHORT,
        },
        "facesets": facesets,
        "_per_folder_stats": folder_stats,
    }
    SYNTH_MANIFEST.write_text(json.dumps(manifest, indent=2))
    print(f"\nSynthetic manifest -> {SYNTH_MANIFEST}")
    return manifest, centroids, folder_stats
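# For reference, the synthetic manifest written above is shaped like:
# {
#   "params": {"pass1_threshold": 0.55, "pass2_threshold": 0.45, ...},
#   "facesets": [{"name": "faceset_013", "label": "k",
#                 "image_count": <int>, "images": ["<path>", ...]}, ...],
#   "_per_folder_stats": {"k": {"input_records": <int>, ...}, ...}
# }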
# ---- phase C: export + rename + merge ----------------------------------- #

def export_and_relocate(manifest: dict) -> None:
    if OUT_TMP.exists():
        shutil.rmtree(OUT_TMP)
    OUT_TMP.mkdir(parents=True)

    print(f"\nRunning cmd_export_swap -> {OUT_TMP}")
    cmd_export_swap(
        cache_path=CACHE,
        refine_manifest_path=SYNTH_MANIFEST,
        raw_manifest_path=None,
        out_dir=OUT_TMP,
        top_n=TOP_N,
        outlier_threshold=EXPORT_OUTLIER_THRESHOLD,
        pad_ratio=PAD_RATIO,
        out_size=OUT_SIZE,
        include_candidates=False,
        candidate_match_threshold=0.55,
        candidate_min_score=0.40,
        min_face_short=MIN_FACE_SHORT,
    )
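    # (candidate_match_threshold / candidate_min_score mirror the sort_faces.py
    # defaults; with include_candidates=False they presumably have no effect.)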
    # Map name -> label from the synthetic manifest.
    name_to_label = {fs["name"]: fs["label"] for fs in manifest["facesets"]}

    # Load the temp top-level manifest (export-swap just wrote it).
    new_top = json.loads((OUT_TMP / "manifest.json").read_text())
    new_entries = new_top.get("facesets", [])

    # Per-faceset rename + relocate.
    for fs_meta in new_entries:
        name = fs_meta["name"]
        label = name_to_label.get(name)
        src_dir = OUT_TMP / name
        if not src_dir.exists():
            print(f"[{name}] export dir missing; skipping")
            continue

        # Rename .fsz bundles to <label>_*.fsz; record updated names.
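        # e.g. "faceset_013_top30.fsz" -> "k_top30.fsz" (illustrative filenames;
        # actual suffixes come from cmd_export_swap's output naming).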
        renames = {}
        for fsz in sorted(src_dir.glob(f"{name}_top*.fsz")):
            new = src_dir / fsz.name.replace(name + "_", label + "_", 1)
            fsz.rename(new)
            renames[fsz.name] = new.name
        for fsz in sorted(src_dir.glob(f"{name}_all.fsz")):
            new = src_dir / fsz.name.replace(name + "_", label + "_", 1)
            fsz.rename(new)
            renames[fsz.name] = new.name

        # Replace the NAME.txt placeholder with <label>.txt.
        nametxt = src_dir / "NAME.txt"
        if nametxt.exists():
            nametxt.unlink()
        (src_dir / f"{label}.txt").write_text(
            f"{label}\n\nSource: /mnt/x/src/{label} (hand-sorted) + matched osrc faces.\n"
        )

        # Point this entry's fsz fields at the renamed files.
        for k in ("fsz_top", "fsz_all"):
            if fs_meta.get(k) and fs_meta[k] in renames:
                fs_meta[k] = renames[fs_meta[k]]
        fs_meta["label"] = label

        # Move the directory into the final output.
        dst_dir = OUT_FINAL / name
        if dst_dir.exists():
            print(f"[{name}] {dst_dir} already exists; refusing to overwrite")
            continue
        shutil.move(str(src_dir), str(dst_dir))
        print(f"[{name}] -> {dst_dir} (label={label})")

    # Merge the top-level manifest, preserving existing faceset_001..012 entries.
    final_manifest_path = OUT_FINAL / "manifest.json"
    if final_manifest_path.exists():
        existing = json.loads(final_manifest_path.read_text())
    else:
        existing = {"facesets": []}

    existing_names = {fs["name"] for fs in existing.get("facesets", [])}
    appended = 0
    for entry in new_entries:
        if entry["name"] in existing_names:
            print(f"[manifest] {entry['name']} already in top-level manifest; not duplicating")
            continue
        existing["facesets"].append(entry)
        appended += 1

    # Carry over export-swap params if not already present.
    for k in ("quality_weights", "outlier_threshold", "top_n", "pad_ratio", "out_size"):
        if k not in existing and k in new_top:
            existing[k] = new_top[k]

    final_manifest_path.write_text(json.dumps(existing, indent=2))
    print(f"\nMerged manifest: appended {appended} entries -> {final_manifest_path}")
    # Clean up the temp dir only if it is empty; otherwise leave it (and its
    # manifest.json) behind for inspection. Guarding on exists() avoids a
    # FileNotFoundError if the dir has already gone away.
    if OUT_TMP.exists() and not any(OUT_TMP.iterdir()):
        OUT_TMP.rmdir()
# ---- main ---------------------------------------------------------------- #

def main() -> None:
    manifest, _centroids, _stats = build_synthetic_manifest()
    if not manifest.get("facesets"):
        print("No facesets to build; nothing to do.")
        return
    export_and_relocate(manifest)
    print("\nDone.")


if __name__ == "__main__":
    main()