Add face-sort pipeline as the repo's base
Single-file CLI (embed / cluster / refine) using InsightFace buffalo_l embeddings and agglomerative clustering, migrated from the ad-hoc /home/peter/face_sort/ directory so that this repo becomes the canonical home for faceset preparation feeding roop-unleashed and similar tools.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

.gitignore (vendored, new file)
@@ -0,0 +1,5 @@
work/
__pycache__/
*.pyc
.venv/
.claude/

README.md (replaces the previous one-line "Dummy" stub)
@@ -1 +1,48 @@
# face-sets

Sort photos by similar faces using InsightFace embeddings + agglomerative clustering, then refine them into faceset-ready folders for downstream face-swap tooling (roop-unleashed, etc.).

## Pipeline

`sort_faces.py` is a single-file CLI with three subcommands:

| step    | what it does |
|---------|--------------|
| embed   | Recursively scan a source tree, detect + embed every face, write `.npz` cache |
| cluster | Raw agglomerative clustering of the cache into `person_NNN/` / `_singletons/` / `_noface/` |
| refine  | Initial cluster → centroid merge → quality gate → outlier rejection → size filter → `faceset_NNN/` |

Cache and outputs are kept out of the repo via `.gitignore`; defaults live under `work/`.
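
The cache is a plain `.npz` and can be inspected without the CLI; a minimal sketch, mirroring `load_cache` in `sort_faces.py` (the cache path is just an example):

```python
import json
import numpy as np

data = np.load("work/cache/nl_all.npz", allow_pickle=True)
emb = data["embeddings"]              # (n_faces, 512) float32, L2-normalized
meta = json.loads(str(data["meta"]))  # one dict per face, plus no-face records
print(emb.shape, len(meta), str(data["src_root"]))
```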

## Typical run

```bash
# 1. Embed (CPU; InsightFace buffalo_l). Caches face embeddings + metadata.
python sort_faces.py embed "/mnt/x/src/nl/Neuer Ordner (2)/New Folder" work/cache/nl_all.npz

# 2. Raw clusters (every multi-face cluster -> a person_NNN/ folder).
python sort_faces.py cluster work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/raw

# 3. Refined facesets (filters for faceset-ready quality).
python sort_faces.py refine work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/facesets
```

## Refine defaults

| flag | default | meaning |
|---|---|---|
| `--initial-threshold` | 0.55 | cosine distance for stage-1 clustering |
| `--merge-threshold` | 0.40 | centroid-level merge of over-split clusters |
| `--outlier-threshold` | 0.55 | drop a face if its cosine distance from the cluster centroid exceeds this (applied only when the cluster has ≥ 4 faces) |
| `--min-faces` | 15 | minimum unique images per faceset |
| `--min-short` | 90 | minimum short-edge pixels of the face bbox |
| `--min-blur` | 40.0 | Laplacian-variance blur gate |
| `--min-det-score` | 0.6 | InsightFace detector score gate |
| `--mode` | copy | copy / move / symlink |
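
For intuition, every threshold above is a cosine distance over L2-normalized embeddings, so the outlier gate reduces to `1 - v @ c` exactly as in the source; a toy sketch with made-up vectors:

```python
import numpy as np

rng = np.random.default_rng(0)
c = rng.normal(size=512).astype(np.float32)
c /= np.linalg.norm(c)  # cluster centroid, unit length

v = c + 0.02 * rng.normal(size=512).astype(np.float32)
v /= np.linalg.norm(v)  # a member embedding, unit length

cos_dist = 1.0 - float(v @ c)  # same formula as the refine outlier check
print(f"cos_dist={cos_dist:.3f} ->", "drop" if cos_dist > 0.55 else "keep")
```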

## Prior runs (as of 2026-04-22)

- `work/cache/kos11.npz` — 181 images, 333 faces from `Kos '11/` → `kos11_sorted/`
- `work/cache/nl_all.npz` — 916 images, 1396 faces from `Neuer Ordner (2)/New Folder/` → `nl_sorted/raw/`, refined to 6 facesets (197, 120, 91, 47, 23, 18 images)

Output lives outside the repo at `/mnt/e/temp_things/fcswp/`.

sort_faces.py (new file)
@@ -0,0 +1,443 @@
"""Sort photos by similar faces using InsightFace embeddings + agglomerative clustering.

Subcommands:
    embed   <src_dir> <cache.npz>           recursively scan, detect + embed faces
    cluster <cache.npz> <out_dir> [opts]    raw agglomerative clustering -> person_NNN/
    refine  <cache.npz> <out_dir> [opts]    merge + outlier + quality pass -> faceset-ready folders
"""
from __future__ import annotations

import argparse
import json
import shutil
import sys
import time
from pathlib import Path

import numpy as np
from PIL import Image, ImageOps
from tqdm import tqdm

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp", ".heic"}
MIN_DET_SCORE = 0.5
MIN_FACE_PIX = 40

def list_images(src: Path) -> list[Path]:
    out: list[Path] = []
    for p in src.rglob("*"):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            out.append(p)
    return sorted(out)

def load_rgb_bgr(path: Path):
    try:
        with Image.open(path) as im:
            im = ImageOps.exif_transpose(im)
            im = im.convert("RGB")
            rgb = np.array(im)
            bgr = rgb[:, :, ::-1].copy()  # InsightFace expects BGR (OpenCV channel order)
            return rgb, bgr
    except Exception as e:
        print(f"[warn] failed to load {path}: {e}", file=sys.stderr)
        return None, None

def laplacian_variance(gray: np.ndarray) -> float:
    """Simple blur metric without an OpenCV Laplacian call (pure numpy)."""
    # Discrete Laplacian kernel [[0,1,0],[1,-4,1],[0,1,0]], applied as a
    # valid-region convolution via slicing (output shrinks by 1 px per edge).
    g = gray.astype(np.float32)
    lap = (
        -4.0 * g[1:-1, 1:-1]
        + g[:-2, 1:-1] + g[2:, 1:-1]
        + g[1:-1, :-2] + g[1:-1, 2:]
    )
    return float(lap.var())

def make_rel(path: Path, root: Path) -> str:
    try:
        return str(path.relative_to(root))
    except ValueError:
        return path.name

def safe_dst_name(path: Path, root: Path) -> str:
    """Collision-safe filename built from the source-relative path."""
    rel = make_rel(path, root)
    # Flatten: replace path separators (and spaces) with safe substitutes
    flat = rel.replace("/", "__").replace("\\", "__").replace(" ", "_")
    return flat

def cmd_embed(src_dir: Path, cache_path: Path) -> None:
    from insightface.app import FaceAnalysis

    app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
    app.prepare(ctx_id=-1, det_size=(640, 640))

    images = list_images(src_dir)
    print(f"Found {len(images)} images under {src_dir}")

    embeddings: list[np.ndarray] = []
    meta: list[dict] = []

    t0 = time.time()
    for img_path in tqdm(images, desc="embedding"):
        rgb, bgr = load_rgb_bgr(img_path)
        if bgr is None:
            meta.append({"path": str(img_path), "face_idx": -1, "noface": True, "error": "load"})
            continue
        faces = app.get(bgr)
        kept = 0
        for i, f in enumerate(faces):
            if float(f.det_score) < MIN_DET_SCORE:
                continue
            # Clamp the detector bbox to the image bounds
            x1, y1, x2, y2 = [int(round(v)) for v in f.bbox]
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, rgb.shape[1]), min(y2, rgb.shape[0])
            w, h = x2 - x1, y2 - y1
            short = min(w, h)
            if short < MIN_FACE_PIX:
                continue
            # Blur metric on the face crop (grayscale)
            crop = rgb[y1:y2, x1:x2]
            if crop.size == 0:
                continue
            gray = crop.mean(axis=2)
            blur = laplacian_variance(gray) if min(gray.shape) > 3 else 0.0

            emb = f.normed_embedding.astype(np.float32)
            embeddings.append(emb)
            meta.append({
                "path": str(img_path),
                "face_idx": i,
                "det_score": float(f.det_score),
                "bbox": [x1, y1, x2, y2],
                "face_short": int(short),
                "face_area": int(w * h),
                "blur": blur,
                "noface": False,
            })
            kept += 1
        if kept == 0:
            meta.append({"path": str(img_path), "face_idx": -1, "noface": True})

    dt = time.time() - t0
    print(f"Detected {len(embeddings)} faces across {len(images)} images in {dt:.1f}s")

    emb_arr = np.stack(embeddings) if embeddings else np.zeros((0, 512), dtype=np.float32)
    cache_path.parent.mkdir(parents=True, exist_ok=True)  # ensure the cache directory exists
    np.savez(cache_path, embeddings=emb_arr, meta=json.dumps(meta), src_root=str(src_dir))
    print(f"Cache written to {cache_path}")

def load_cache(cache_path: Path):
    data = np.load(cache_path, allow_pickle=True)
    emb = data["embeddings"]
    meta = json.loads(str(data["meta"]))
    src_root = Path(str(data["src_root"])) if "src_root" in data.files else None
    return emb, meta, src_root

def _transfer(src: Path, dst: Path, mode: str) -> None:
    if dst.exists():
        return
    if mode == "copy":
        shutil.copy2(src, dst)
    elif mode == "move":
        shutil.move(str(src), str(dst))
    elif mode == "symlink":
        dst.symlink_to(src)

def _cluster_embeddings(emb: np.ndarray, threshold: float) -> np.ndarray:
    from sklearn.cluster import AgglomerativeClustering

    # n_clusters=None + distance_threshold: cut the dendrogram wherever the
    # average cosine distance between merged groups would exceed `threshold`.
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
        metric="cosine",
        linkage="average",
    )
    return clusterer.fit_predict(emb)

def cmd_cluster(cache_path: Path, out_dir: Path, threshold: float, mode: str, dry_run: bool) -> None:
    emb, meta, src_root = load_cache(cache_path)
    if src_root is None:
        src_root = Path("/")
    face_records = [m for m in meta if not m.get("noface")]
    noface_records = [m for m in meta if m.get("noface")]

    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    if len(emb) == 0:
        print("No faces detected; nothing to cluster.")
        return

    print(f"Clustering {len(emb)} face embeddings (threshold={threshold} cosine distance)")
    labels = _cluster_embeddings(emb, threshold)

    clusters: dict[int, list[dict]] = {}
    for rec, lbl in zip(face_records, labels):
        rec = dict(rec)
        rec["cluster"] = int(lbl)
        clusters.setdefault(int(lbl), []).append(rec)

    ordered = sorted(clusters.items(), key=lambda kv: (-len(kv[1]), kv[0]))
    sizes = [len(v) for _, v in ordered]
    singletons = sum(1 for s in sizes if s == 1)
    print(f"Clusters: {len(ordered)} | top sizes: {sizes[:15]}")
    print(f"Multi-face clusters: {len(sizes) - singletons} | singletons: {singletons}")
    print(f"No-face images: {len(noface_records)}")

    if dry_run:
        for cid, recs in ordered[:20]:
            imgs = {r["path"] for r in recs}
            print(f"  cluster {cid:3d}  faces={len(recs):3d}  imgs={len(imgs)}")
        return

    out_dir.mkdir(parents=True, exist_ok=True)
    rank = 0
    cluster_dir: dict[int, Path] = {}
    for cid, recs in ordered:
        if len(recs) == 1:
            cluster_dir[cid] = out_dir / "_singletons"
        else:
            rank += 1
            cluster_dir[cid] = out_dir / f"person_{rank:03d}"
        cluster_dir[cid].mkdir(parents=True, exist_ok=True)

    per_cluster_imgs: dict[int, set[str]] = {cid: set() for cid, _ in ordered}
    for cid, recs in ordered:
        for r in recs:
            per_cluster_imgs[cid].add(r["path"])

    total = sum(len(v) for v in per_cluster_imgs.values())
    unique = len({p for s in per_cluster_imgs.values() for p in s})
    print(f"Placing {total} file instances across {unique} unique images (mode={mode}) -> {out_dir}")

    for cid, paths in tqdm(per_cluster_imgs.items(), desc="transferring"):
        dst_dir = cluster_dir[cid]
        for p in sorted(paths):
            src = Path(p)
            dst = dst_dir / safe_dst_name(src, src_root)
            _transfer(src, dst, mode)

    if noface_records:
        noface_dir = out_dir / "_noface"
        noface_dir.mkdir(exist_ok=True)
        for r in noface_records:
            src = Path(r["path"])
            if not src.exists():
                continue
            _transfer(src, noface_dir / safe_dst_name(src, src_root), mode)
        print(f"{len(noface_records)} no-face images -> {noface_dir}")

    manifest = []
    for cid, recs in ordered:
        for r in recs:
            manifest.append({
                "image": Path(r["path"]).name,
                "source": r["path"],
                "cluster": cid,
                "folder": cluster_dir[cid].name,
                "bbox": r.get("bbox"),
                "det_score": r.get("det_score"),
                "face_short": r.get("face_short"),
                "blur": r.get("blur"),
            })
    (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
    print(f"Manifest -> {out_dir / 'manifest.json'}")

def _cluster_centroids(emb: np.ndarray, labels: np.ndarray) -> tuple[np.ndarray, list[int]]:
    ids = sorted(set(int(l) for l in labels))
    cents = []
    for cid in ids:
        mask = labels == cid
        v = emb[mask].mean(axis=0)
        n = np.linalg.norm(v)
        if n > 0:
            v = v / n  # re-normalize: the mean of unit vectors is not unit length
        cents.append(v)
    return np.stack(cents), ids

def cmd_refine(
    cache_path: Path,
    out_dir: Path,
    initial_threshold: float,
    merge_threshold: float,
    outlier_threshold: float,
    min_faces: int,
    min_short: int,
    min_blur: float,
    min_det_score: float,
    mode: str,
    dry_run: bool,
) -> None:
    emb, meta, src_root = load_cache(cache_path)
    if src_root is None:
        src_root = Path("/")
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    if len(emb) == 0:
        print("No faces detected; nothing to refine.")
        return

    print(f"Stage 1: initial clustering (threshold={initial_threshold})")
    labels = _cluster_embeddings(emb, initial_threshold)

    # Stage 2: merge similar clusters by centroid
    cents, cent_ids = _cluster_centroids(emb, labels)
    print(f"Stage 2: centroid merge on {len(cent_ids)} clusters (merge_threshold={merge_threshold})")
    cent_labels = _cluster_embeddings(cents, merge_threshold) if len(cents) > 1 else np.zeros(1, dtype=int)
    # remap original labels via the centroid merge
    label_map = {cid: int(ml) for cid, ml in zip(cent_ids, cent_labels)}
    merged = np.array([label_map[int(l)] for l in labels])

    # Build merged clusters
    clusters: dict[int, list[tuple[int, dict]]] = {}  # cluster -> list of (global_idx, rec)
    for idx, (rec, lbl) in enumerate(zip(face_records, merged)):
        clusters.setdefault(int(lbl), []).append((idx, dict(rec)))

    print(f"After merge: {len(clusters)} clusters")

    # Stage 3: outlier rejection + quality filter per cluster
    kept_by_cluster: dict[int, list[tuple[int, dict]]] = {}
    dropped_quality = 0
    dropped_outlier = 0
    for cid, items in clusters.items():
        idxs = [i for i, _ in items]
        cvecs = emb[idxs]
        # centroid from the in-cluster faces
        c = cvecs.mean(axis=0)
        n = np.linalg.norm(c)
        if n > 0:
            c = c / n

        kept: list[tuple[int, dict]] = []
        for (idx, rec), v in zip(items, cvecs):
            # Quality gate
            if rec.get("face_short", 0) < min_short:
                dropped_quality += 1
                continue
            if rec.get("blur", 0.0) < min_blur:
                dropped_quality += 1
                continue
            if rec.get("det_score", 0.0) < min_det_score:
                dropped_quality += 1
                continue
            # Outlier: only applied if the merged cluster has >= 4 members
            if len(items) >= 4:
                cos_dist = 1.0 - float(v @ c)
                if cos_dist > outlier_threshold:
                    dropped_outlier += 1
                    continue
            kept.append((idx, rec))
        if kept:
            kept_by_cluster[cid] = kept

    print(f"Dropped {dropped_quality} faces by quality gate, {dropped_outlier} as outliers")

    # Stage 4: enforce minimum cluster size (by unique images, not faces)
    final: list[tuple[int, list[tuple[int, dict]]]] = []
    for cid, items in kept_by_cluster.items():
        unique_imgs = {rec["path"] for _, rec in items}
        if len(unique_imgs) >= min_faces:
            final.append((cid, items))
    final.sort(key=lambda kv: -len(kv[1]))

    print(f"Facesets meeting min_faces={min_faces}: {len(final)}")
    for rank, (cid, items) in enumerate(final, 1):
        unique_imgs = {rec["path"] for _, rec in items}
        print(f"  faceset_{rank:03d}: faces={len(items):3d} imgs={len(unique_imgs):3d}")

    if dry_run:
        return

    out_dir.mkdir(parents=True, exist_ok=True)
    for rank, (cid, items) in enumerate(final, 1):
        dst_dir = out_dir / f"faceset_{rank:03d}"
        dst_dir.mkdir(exist_ok=True)
        seen_paths: set[str] = set()
        for _, rec in items:
            p = rec["path"]
            if p in seen_paths:
                continue
            seen_paths.add(p)
            src = Path(p)
            if not src.exists():
                continue
            _transfer(src, dst_dir / safe_dst_name(src, src_root), mode)

    # Write refinement manifest
    manifest = {
        "params": {
            "initial_threshold": initial_threshold,
            "merge_threshold": merge_threshold,
            "outlier_threshold": outlier_threshold,
            "min_faces": min_faces,
            "min_short": min_short,
            "min_blur": min_blur,
            "min_det_score": min_det_score,
        },
        "facesets": [
            {
                "name": f"faceset_{rank:03d}",
                "face_count": len(items),
                "image_count": len({rec["path"] for _, rec in items}),
                "images": sorted({rec["path"] for _, rec in items}),
            }
            for rank, (_, items) in enumerate(final, 1)
        ],
    }
    (out_dir / "refine_manifest.json").write_text(json.dumps(manifest, indent=2))
    print(f"Refine manifest -> {out_dir / 'refine_manifest.json'}")

def main() -> None:
    p = argparse.ArgumentParser(description=__doc__)
    sub = p.add_subparsers(dest="cmd", required=True)

    pe = sub.add_parser("embed")
    pe.add_argument("src_dir", type=Path)
    pe.add_argument("cache", type=Path)

    pc = sub.add_parser("cluster")
    pc.add_argument("cache", type=Path)
    pc.add_argument("out_dir", type=Path)
    pc.add_argument("--threshold", type=float, default=0.55)
    pc.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pc.add_argument("--dry-run", action="store_true")

    pr = sub.add_parser("refine")
    pr.add_argument("cache", type=Path)
    pr.add_argument("out_dir", type=Path)
    pr.add_argument("--initial-threshold", type=float, default=0.55)
    pr.add_argument("--merge-threshold", type=float, default=0.40)
    pr.add_argument("--outlier-threshold", type=float, default=0.55)
    pr.add_argument("--min-faces", type=int, default=15)
    pr.add_argument("--min-short", type=int, default=90)
    pr.add_argument("--min-blur", type=float, default=40.0)
    pr.add_argument("--min-det-score", type=float, default=0.6)
    pr.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pr.add_argument("--dry-run", action="store_true")

    args = p.parse_args()
    if args.cmd == "embed":
        cmd_embed(args.src_dir, args.cache)
    elif args.cmd == "cluster":
        cmd_cluster(args.cache, args.out_dir, args.threshold, args.mode, args.dry_run)
    elif args.cmd == "refine":
        cmd_refine(
            args.cache, args.out_dir,
            args.initial_threshold, args.merge_threshold, args.outlier_threshold,
            args.min_faces, args.min_short, args.min_blur, args.min_det_score,
            args.mode, args.dry_run,
        )


if __name__ == "__main__":
    main()