Add face-sort pipeline as the repo's base

Single-file CLI (embed / cluster / refine) using InsightFace buffalo_l
embeddings and agglomerative clustering, migrated in from the ad-hoc
/home/peter/face_sort/ directory so this repo is the canonical home for
faceset preparation feeding roop-unleashed and similar tools.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 11:20:00 +02:00
parent 01ae516b54
commit c5a4e2dfdb
3 changed files with 496 additions and 1 deletion

.gitignore (new file, 5 lines)

@@ -0,0 +1,5 @@
work/
__pycache__/
*.pyc
.venv/
.claude/

README.md

@@ -1 +1,48 @@
Dummy
# face-sets
Sort photos by face similarity using InsightFace embeddings + agglomerative clustering, then refine into faceset-ready folders for downstream face-swap tooling (roop-unleashed, etc.).
## Pipeline
`sort_faces.py` is a single-file CLI with three subcommands:
| step | what it does |
|---------|------------------------------------------------------------------------------|
| embed | Recursively scan a source tree, detect + embed every face, write `.npz` cache |
| cluster | Raw agglomerative clustering of the cache into `person_NNN/` / `_singletons/` / `_noface/` |
| refine | Initial cluster → centroid merge → quality gate → outlier rejection → size filter → `faceset_NNN/` |
Cache and outputs are kept out of the repo via `.gitignore`; defaults live under `work/`.
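The cache is a plain `.npz` (`embeddings`, `meta` as a JSON string, `src_root`), so it can be inspected without the CLI; a minimal sketch, assuming the cache path from the run below:
```python
import json
import numpy as np

# Keys match what `embed` writes via np.savez in sort_faces.py.
data = np.load("work/cache/nl_all.npz", allow_pickle=True)
emb = data["embeddings"]              # (n_faces, 512) float32, L2-normalised
meta = json.loads(str(data["meta"]))  # one dict per kept face or no-face image
faces = [m for m in meta if not m.get("noface")]
print(len(faces), "faces |", emb.shape, "| root:", data["src_root"])
```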
## Typical run
```bash
# 1. Embed (CPU; InsightFace buffalo_l). Caches faces + metadata.
python sort_faces.py embed "/mnt/x/src/nl/Neuer Ordner (2)/New Folder" work/cache/nl_all.npz
# 2. Raw clusters (every multi-face cluster -> a person_NNN/ folder).
python sort_faces.py cluster work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/raw
# 3. Refined facesets (filters for faceset-ready quality).
python sort_faces.py refine work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/facesets
```
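`cluster` also writes a `manifest.json` next to the sorted folders; a quick way to sanity-check the split before refining (path assumed from the run above):
```python
import json
from collections import Counter
from pathlib import Path

manifest = json.loads(Path("/mnt/e/temp_things/fcswp/nl_sorted/raw/manifest.json").read_text())
# Faces per output folder; a long tail of tiny clusters suggests raising --threshold.
print(Counter(rec["folder"] for rec in manifest).most_common(10))
```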
## Refine defaults
| flag | default | meaning |
|---|---|---|
| `--initial-threshold` | 0.55 | cosine distance for stage-1 clustering |
| `--merge-threshold` | 0.40 | centroid-level merge of over-split clusters |
| `--outlier-threshold` | 0.55 | drop face if cosine dist from cluster centroid exceeds this (only if cluster ≥ 4) |
| `--min-faces` | 15 | minimum unique images per faceset |
| `--min-short` | 90 | minimum short-edge pixels of face bbox |
| `--min-blur` | 40.0 | Laplacian-variance blur gate |
| `--min-det-score` | 0.6 | InsightFace detector score gate |
| `--mode` | copy | copy / move / symlink |
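All thresholds are cosine distances over L2-normalised embeddings (`1 - dot`). A toy sketch of the outlier gate with synthetic vectors (not real embeddings): ten noisy copies of one identity stay far below 0.55, while one unrelated vector sits near 1.0 and is dropped:
```python
import numpy as np

rng = np.random.default_rng(0)
base = rng.normal(size=512)
base /= np.linalg.norm(base)
# Ten noisy copies of one "identity" plus one unrelated face.
faces = np.vstack([base + 0.02 * rng.normal(size=512) for _ in range(10)]
                  + [rng.normal(size=512)]).astype(np.float32)
faces /= np.linalg.norm(faces, axis=1, keepdims=True)  # mimic normed_embedding
centroid = faces.mean(axis=0)
centroid /= np.linalg.norm(centroid)
dists = 1.0 - faces @ centroid  # cosine distance, as in cmd_refine
print(np.round(dists, 3), "| dropped:", int((dists > 0.55).sum()))
```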
## Prior runs (as of 2026-04-22)
- `work/cache/kos11.npz` — 181 images, 333 faces from `Kos '11/` → `kos11_sorted/`
- `work/cache/nl_all.npz` — 916 images, 1396 faces from `Neuer Ordner (2)/New Folder/` → `nl_sorted/raw/`, refined to 6 facesets (197, 120, 91, 47, 23, 18 images)
Output lives outside the repo at `/mnt/e/temp_things/fcswp/`.

sort_faces.py (new file, 443 lines)

@@ -0,0 +1,443 @@
"""Sort photos by similar faces using InsightFace embeddings + agglomerative clustering.
Subcommands:
embed <src_dir> <cache.npz> recursively scan, detect+embed faces
cluster <cache.npz> <out_dir> [opts] raw agglomerative clustering -> person_NNN/
refine <cache.npz> <out_dir> [opts] merge + outlier + quality pass -> faceset-ready folders
"""
from __future__ import annotations
import argparse
import json
import shutil
import sys
import time
from pathlib import Path
import numpy as np
from PIL import Image, ImageOps
from tqdm import tqdm
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp", ".heic"}
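# NOTE: .heic only decodes if a HEIF plugin (e.g. pillow-heif) is registered;
# otherwise those files just hit load_rgb_bgr's warning path.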
MIN_DET_SCORE = 0.5
MIN_FACE_PIX = 40
def list_images(src: Path) -> list[Path]:
out: list[Path] = []
for p in src.rglob("*"):
if p.is_file() and p.suffix.lower() in IMG_EXTS:
out.append(p)
return sorted(out)
def load_rgb_bgr(path: Path):
try:
with Image.open(path) as im:
im = ImageOps.exif_transpose(im)
im = im.convert("RGB")
rgb = np.array(im)
bgr = rgb[:, :, ::-1].copy()
return rgb, bgr
except Exception as e:
print(f"[warn] failed to load {path}: {e}", file=sys.stderr)
return None, None
def laplacian_variance(gray: np.ndarray) -> float:
"""Simple blur metric without an OpenCV Laplacian call (numpy only)."""
# 4-neighbour Laplacian kernel [[0,1,0],[1,-4,1],[0,1,0]] applied to the
# valid interior region via slicing; variance of the response gauges sharpness.
g = gray.astype(np.float32)
lap = (
-4.0 * g[1:-1, 1:-1]
+ g[:-2, 1:-1] + g[2:, 1:-1]
+ g[1:-1, :-2] + g[1:-1, 2:]
)
return float(lap.var())
def make_rel(path: Path, root: Path) -> str:
try:
return str(path.relative_to(root))
except ValueError:
return path.name
def safe_dst_name(path: Path, root: Path) -> str:
"""Collision-safe filename built from source-relative path."""
rel = make_rel(path, root)
# Flatten: replace separators with double underscore
flat = rel.replace("/", "__").replace("\\", "__").replace(" ", "_")
return flat
def cmd_embed(src_dir: Path, cache_path: Path) -> None:
from insightface.app import FaceAnalysis
app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
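# ctx_id=-1 selects CPU inference; det_size is the detector's input resolution.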
app.prepare(ctx_id=-1, det_size=(640, 640))
images = list_images(src_dir)
print(f"Found {len(images)} images under {src_dir}")
embeddings: list[np.ndarray] = []
meta: list[dict] = []
t0 = time.time()
for img_path in tqdm(images, desc="embedding"):
rgb, bgr = load_rgb_bgr(img_path)
if bgr is None:
meta.append({"path": str(img_path), "face_idx": -1, "noface": True, "error": "load"})
continue
faces = app.get(bgr)
kept = 0
for i, f in enumerate(faces):
if float(f.det_score) < MIN_DET_SCORE:
continue
x1, y1, x2, y2 = [int(round(v)) for v in f.bbox]
x1, y1 = max(x1, 0), max(y1, 0)
x2, y2 = min(x2, rgb.shape[1]), min(y2, rgb.shape[0])
w, h = x2 - x1, y2 - y1
short = min(w, h)
if short < MIN_FACE_PIX:
continue
# Blur metric on the face crop (grayscale)
crop = rgb[y1:y2, x1:x2]
if crop.size == 0:
continue
gray = crop.mean(axis=2)
blur = laplacian_variance(gray) if min(gray.shape) > 3 else 0.0
emb = f.normed_embedding.astype(np.float32)
embeddings.append(emb)
meta.append({
"path": str(img_path),
"face_idx": i,
"det_score": float(f.det_score),
"bbox": [x1, y1, x2, y2],
"face_short": int(short),
"face_area": int(w * h),
"blur": blur,
"noface": False,
})
kept += 1
if kept == 0:
meta.append({"path": str(img_path), "face_idx": -1, "noface": True})
dt = time.time() - t0
print(f"Detected {len(embeddings)} faces across {len(images)} images in {dt:.1f}s")
emb_arr = np.stack(embeddings) if embeddings else np.zeros((0, 512), dtype=np.float32)
np.savez(cache_path, embeddings=emb_arr, meta=json.dumps(meta), src_root=str(src_dir))
print(f"Cache written to {cache_path}")
def load_cache(cache_path: Path):
data = np.load(cache_path, allow_pickle=True)
emb = data["embeddings"]
meta = json.loads(str(data["meta"]))
src_root = Path(str(data["src_root"])) if "src_root" in data.files else None
return emb, meta, src_root
def _transfer(src: Path, dst: Path, mode: str) -> None:
if dst.exists():
return
if mode == "copy":
shutil.copy2(src, dst)
elif mode == "move":
shutil.move(str(src), str(dst))
elif mode == "symlink":
dst.symlink_to(src)
def _cluster_embeddings(emb: np.ndarray, threshold: float) -> np.ndarray:
from sklearn.cluster import AgglomerativeClustering
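# NOTE: metric= needs scikit-learn >= 1.2; earlier releases named this parameter affinity=.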
clusterer = AgglomerativeClustering(
n_clusters=None,
distance_threshold=threshold,
metric="cosine",
linkage="average",
)
return clusterer.fit_predict(emb)
def cmd_cluster(cache_path: Path, out_dir: Path, threshold: float, mode: str, dry_run: bool) -> None:
emb, meta, src_root = load_cache(cache_path)
if src_root is None:
src_root = Path("/")
face_records = [m for m in meta if not m.get("noface")]
noface_records = [m for m in meta if m.get("noface")]
if len(face_records) != len(emb):
raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
if len(emb) == 0:
print("No faces detected; nothing to cluster.")
return
print(f"Clustering {len(emb)} face embeddings (threshold={threshold} cosine distance)")
labels = _cluster_embeddings(emb, threshold)
clusters: dict[int, list[dict]] = {}
for rec, lbl in zip(face_records, labels):
rec = dict(rec)
rec["cluster"] = int(lbl)
clusters.setdefault(int(lbl), []).append(rec)
ordered = sorted(clusters.items(), key=lambda kv: (-len(kv[1]), kv[0]))
sizes = [len(v) for _, v in ordered]
singletons = sum(1 for s in sizes if s == 1)
print(f"Clusters: {len(ordered)} | top sizes: {sizes[:15]}")
print(f"Multi-face clusters: {len(sizes) - singletons} singletons: {singletons}")
print(f"No-face images: {len(noface_records)}")
if dry_run:
for cid, recs in ordered[:20]:
imgs = {r["path"] for r in recs}
print(f" cluster {cid:3d} faces={len(recs):3d} imgs={len(imgs)}")
return
out_dir.mkdir(parents=True, exist_ok=True)
rank = 0
cluster_dir: dict[int, Path] = {}
for cid, recs in ordered:
if len(recs) == 1:
cluster_dir[cid] = out_dir / "_singletons"
else:
rank += 1
cluster_dir[cid] = out_dir / f"person_{rank:03d}"
cluster_dir[cid].mkdir(parents=True, exist_ok=True)
per_cluster_imgs: dict[int, set[str]] = {cid: set() for cid, _ in ordered}
for cid, recs in ordered:
for r in recs:
per_cluster_imgs[cid].add(r["path"])
total = sum(len(v) for v in per_cluster_imgs.values())
unique = len({p for s in per_cluster_imgs.values() for p in s})
print(f"Placing {total} file instances across {unique} unique images (mode={mode}) -> {out_dir}")
for cid, paths in tqdm(per_cluster_imgs.items(), desc="transferring"):
dst_dir = cluster_dir[cid]
for p in sorted(paths):
src = Path(p)
dst = dst_dir / safe_dst_name(src, src_root)
_transfer(src, dst, mode)
if noface_records:
noface_dir = out_dir / "_noface"
noface_dir.mkdir(exist_ok=True)
for r in noface_records:
src = Path(r["path"])
if not src.exists():
continue
_transfer(src, noface_dir / safe_dst_name(src, src_root), mode)
print(f"{len(noface_records)} no-face images -> {noface_dir}")
manifest = []
for cid, recs in ordered:
for r in recs:
manifest.append({
"image": Path(r["path"]).name,
"source": r["path"],
"cluster": cid,
"folder": cluster_dir[cid].name,
"bbox": r.get("bbox"),
"det_score": r.get("det_score"),
"face_short": r.get("face_short"),
"blur": r.get("blur"),
})
(out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
print(f"Manifest -> {out_dir / 'manifest.json'}")
def _cluster_centroids(emb: np.ndarray, labels: np.ndarray) -> tuple[np.ndarray, list[int]]:
ids = sorted(set(int(l) for l in labels))
cents = []
for cid in ids:
mask = labels == cid
v = emb[mask].mean(axis=0)
n = np.linalg.norm(v)
if n > 0:
v = v / n
cents.append(v)
return np.stack(cents), ids
def cmd_refine(
cache_path: Path,
out_dir: Path,
initial_threshold: float,
merge_threshold: float,
outlier_threshold: float,
min_faces: int,
min_short: int,
min_blur: float,
min_det_score: float,
mode: str,
dry_run: bool,
) -> None:
emb, meta, src_root = load_cache(cache_path)
if src_root is None:
src_root = Path("/")
face_records = [m for m in meta if not m.get("noface")]
if len(face_records) != len(emb):
raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
print(f"Stage 1: initial clustering (threshold={initial_threshold})")
labels = _cluster_embeddings(emb, initial_threshold)
# Stage 2: merge similar clusters by centroid
cents, cent_ids = _cluster_centroids(emb, labels)
print(f"Stage 2: centroid merge on {len(cent_ids)} clusters (merge_threshold={merge_threshold})")
cent_labels = _cluster_embeddings(cents, merge_threshold) if len(cents) > 1 else np.zeros(1, dtype=int)
# remap original labels via centroid merge
label_map = {cid: int(ml) for cid, ml in zip(cent_ids, cent_labels)}
merged = np.array([label_map[int(l)] for l in labels])
# Build merged clusters
clusters: dict[int, list[tuple[int, dict]]] = {} # cluster -> list of (global_idx, rec)
for idx, (rec, lbl) in enumerate(zip(face_records, merged)):
clusters.setdefault(int(lbl), []).append((idx, dict(rec)))
print(f"After merge: {len(clusters)} clusters")
# Stage 3: outlier rejection + quality filter per cluster
kept_by_cluster: dict[int, list[tuple[int, dict]]] = {}
dropped_quality = 0
dropped_outlier = 0
for cid, items in clusters.items():
idxs = [i for i, _ in items]
cvecs = emb[idxs]
# centroid from the in-cluster faces
c = cvecs.mean(axis=0)
n = np.linalg.norm(c)
if n > 0:
c = c / n
kept: list[tuple[int, dict]] = []
for (idx, rec), v in zip(items, cvecs):
# Quality gate
if rec.get("face_short", 0) < min_short:
dropped_quality += 1
continue
if rec.get("blur", 0.0) < min_blur:
dropped_quality += 1
continue
if rec.get("det_score", 0.0) < min_det_score:
dropped_quality += 1
continue
# Outlier gate: only applied when the merged cluster has >= 4 members (counted before the quality gate)
if len(items) >= 4:
cos_dist = 1.0 - float(v @ c)
if cos_dist > outlier_threshold:
dropped_outlier += 1
continue
kept.append((idx, rec))
if kept:
kept_by_cluster[cid] = kept
print(f"Dropped {dropped_quality} faces by quality gate, {dropped_outlier} as outliers")
# Stage 4: enforce minimum cluster size (by unique images, not faces)
final: list[tuple[int, list[tuple[int, dict]]]] = []
for cid, items in kept_by_cluster.items():
unique_imgs = {rec["path"] for _, rec in items}
if len(unique_imgs) >= min_faces:
final.append((cid, items))
final.sort(key=lambda kv: -len(kv[1]))
print(f"Facesets meeting min_faces={min_faces}: {len(final)}")
for rank, (cid, items) in enumerate(final, 1):
unique_imgs = {rec["path"] for _, rec in items}
print(f" faceset_{rank:03d}: faces={len(items):3d} imgs={len(unique_imgs):3d}")
if dry_run:
return
out_dir.mkdir(parents=True, exist_ok=True)
for rank, (cid, items) in enumerate(final, 1):
dst_dir = out_dir / f"faceset_{rank:03d}"
dst_dir.mkdir(exist_ok=True)
seen_paths: set[str] = set()
for _, rec in items:
p = rec["path"]
if p in seen_paths:
continue
seen_paths.add(p)
src = Path(p)
if not src.exists():
continue
_transfer(src, dst_dir / safe_dst_name(src, src_root), mode)
# Write refinement manifest
manifest = {
"params": {
"initial_threshold": initial_threshold,
"merge_threshold": merge_threshold,
"outlier_threshold": outlier_threshold,
"min_faces": min_faces,
"min_short": min_short,
"min_blur": min_blur,
"min_det_score": min_det_score,
},
"facesets": [
{
"name": f"faceset_{rank:03d}",
"face_count": len(items),
"image_count": len({rec["path"] for _, rec in items}),
"images": sorted({rec["path"] for _, rec in items}),
}
for rank, (_, items) in enumerate(final, 1)
],
}
(out_dir / "refine_manifest.json").write_text(json.dumps(manifest, indent=2))
print(f"Refine manifest -> {out_dir / 'refine_manifest.json'}")
def main() -> None:
p = argparse.ArgumentParser()
sub = p.add_subparsers(dest="cmd", required=True)
pe = sub.add_parser("embed")
pe.add_argument("src_dir", type=Path)
pe.add_argument("cache", type=Path)
pc = sub.add_parser("cluster")
pc.add_argument("cache", type=Path)
pc.add_argument("out_dir", type=Path)
pc.add_argument("--threshold", type=float, default=0.55)
pc.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
pc.add_argument("--dry-run", action="store_true")
pr = sub.add_parser("refine")
pr.add_argument("cache", type=Path)
pr.add_argument("out_dir", type=Path)
pr.add_argument("--initial-threshold", type=float, default=0.55)
pr.add_argument("--merge-threshold", type=float, default=0.40)
pr.add_argument("--outlier-threshold", type=float, default=0.55)
pr.add_argument("--min-faces", type=int, default=15)
pr.add_argument("--min-short", type=int, default=90)
pr.add_argument("--min-blur", type=float, default=40.0)
pr.add_argument("--min-det-score", type=float, default=0.6)
pr.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
pr.add_argument("--dry-run", action="store_true")
args = p.parse_args()
if args.cmd == "embed":
cmd_embed(args.src_dir, args.cache)
elif args.cmd == "cluster":
cmd_cluster(args.cache, args.out_dir, args.threshold, args.mode, args.dry_run)
elif args.cmd == "refine":
cmd_refine(
args.cache, args.out_dir,
args.initial_threshold, args.merge_threshold, args.outlier_threshold,
args.min_faces, args.min_short, args.min_blur, args.min_det_score,
args.mode, args.dry_run,
)
if __name__ == "__main__":
main()