diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b8de722 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +work/ +__pycache__/ +*.pyc +.venv/ +.claude/ diff --git a/README.md b/README.md index 938179b..9a0a652 100644 --- a/README.md +++ b/README.md @@ -1 +1,48 @@ -Dummy \ No newline at end of file +# face-sets + +Sort photos by similar face using InsightFace embeddings + agglomerative clustering, then refine into faceset-ready folders for downstream face-swap tooling (roop-unleashed, etc.). + +## Pipeline + +`sort_faces.py` is a single-file CLI with three subcommands: + +| step | what it does | +|---------|------------------------------------------------------------------------------| +| embed | Recursively scan a source tree, detect + embed every face, write `.npz` cache | +| cluster | Raw agglomerative clustering of the cache into `person_NNN/` / `_singletons/` / `_noface/` | +| refine | Initial cluster → centroid merge → quality gate → outlier rejection → size filter → `faceset_NNN/` | + +Cache and outputs are kept out of the repo via `.gitignore`; defaults live under `work/`. + +## Typical run + +```bash +# 1. Embed (CPU; InsightFace buffalo_l). Caches faces + metadata. +python sort_faces.py embed "/mnt/x/src/nl/Neuer Ordner (2)/New Folder" work/cache/nl_all.npz + +# 2. Raw clusters (every multi-face cluster -> a person_NNN/ folder). +python sort_faces.py cluster work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/raw + +# 3. Refined facesets (filters for faceset-ready quality). 
+python sort_faces.py refine work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/facesets +``` + +## Refine defaults + +| flag | default | meaning | +|---|---|---| +| `--initial-threshold` | 0.55 | cosine distance for stage-1 clustering | +| `--merge-threshold` | 0.40 | centroid-level merge of over-split clusters | +| `--outlier-threshold` | 0.55 | drop face if cosine dist from cluster centroid exceeds this (only if cluster ≥ 4) | +| `--min-faces` | 15 | minimum unique images per faceset | +| `--min-short` | 90 | minimum short-edge pixels of face bbox | +| `--min-blur` | 40.0 | Laplacian-variance blur gate | +| `--min-det-score` | 0.6 | InsightFace detector score gate | +| `--mode` | copy | copy / move / symlink | + +## Prior runs (as of 2026-04-22) + +- `work/cache/kos11.npz` — 181 images, 333 faces from `Kos '11/` → `kos11_sorted/` +- `work/cache/nl_all.npz` — 916 images, 1396 faces from `Neuer Ordner (2)/New Folder/` → `nl_sorted/raw/`, refined to 6 facesets (197, 120, 91, 47, 23, 18 images) + +Output lives outside the repo at `/mnt/e/temp_things/fcswp/`. diff --git a/sort_faces.py b/sort_faces.py new file mode 100644 index 0000000..ca6a862 --- /dev/null +++ b/sort_faces.py @@ -0,0 +1,443 @@ +"""Sort photos by similar faces using InsightFace embeddings + agglomerative clustering. 
# File suffixes (lower-case, with dot) that list_images() treats as images.
# NOTE(review): Pillow normally needs an extra plugin to decode ".heic";
# without one those files presumably end up on the load-failure path of
# load_rgb_bgr() -- confirm against the deployment environment.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp", ".heic"}
# Faces whose detector score is below this are not cached at all.
MIN_DET_SCORE = 0.5
# Faces whose bounding-box short edge (pixels) is below this are not cached.
MIN_FACE_PIX = 40

# Single-pass character mapping used by safe_dst_name().
_FLATTEN_TABLE = str.maketrans({"/": "__", "\\": "__", " ": "_"})


def list_images(src: Path) -> list[Path]:
    """Return every file under *src* (recursive) whose suffix is in IMG_EXTS.

    The suffix comparison is case-insensitive and the result is sorted.
    """
    return sorted(
        p for p in src.rglob("*")
        if p.is_file() and p.suffix.lower() in IMG_EXTS
    )


def load_rgb_bgr(path: Path):
    """Load *path* and return ``(rgb, bgr)`` arrays, or ``(None, None)``.

    The image is rotated according to its EXIF orientation and converted to
    RGB before being turned into a NumPy array; ``bgr`` is a channel-reversed
    copy of ``rgb``.  Loading is best-effort: any failure is reported on
    stderr and signalled by returning ``(None, None)`` so that one bad file
    does not abort a long scan.
    """
    try:
        with Image.open(path) as im:
            upright = ImageOps.exif_transpose(im)
            rgb = np.array(upright.convert("RGB"))
        # .copy() materialises the reversed view as its own contiguous array.
        bgr = rgb[:, :, ::-1].copy()
        return rgb, bgr
    except Exception as e:  # deliberate catch-all: warn and skip the file
        print(f"[warn] failed to load {path}: {e}", file=sys.stderr)
        return None, None


def laplacian_variance(gray: np.ndarray) -> float:
    """Return the variance of the 4-neighbour Laplacian of *gray*.

    Used as a blur metric (the response is computed with NumPy slicing, no
    OpenCV call).  Only interior pixels are evaluated, so the response is two
    pixels smaller than the input along each axis.

    Returns 0.0 when the input has no interior pixels (fewer than three rows
    or columns) instead of NaN plus a NumPy RuntimeWarning.
    """
    g = gray.astype(np.float32)
    lap = (
        -4.0 * g[1:-1, 1:-1]
        + g[:-2, 1:-1] + g[2:, 1:-1]
        + g[1:-1, :-2] + g[1:-1, 2:]
    )
    if lap.size == 0:
        return 0.0
    return float(lap.var())


def make_rel(path: Path, root: Path) -> str:
    """Return *path* relative to *root* as a string.

    Falls back to the bare file name when *path* is not located under *root*.
    """
    try:
        return str(path.relative_to(root))
    except ValueError:
        return path.name


def safe_dst_name(path: Path, root: Path) -> str:
    """Build a flat destination file name from the source-relative path.

    Path separators become a double underscore and spaces a single
    underscore, so files with the same base name in different source folders
    get different destination names.

    The mapping is collision-resistant, not collision-proof: ``"a b.jpg"``
    and ``"a_b.jpg"`` flatten to the same name, and a path outside *root*
    contributes only its bare file name.
    """
    return make_rel(path, root).translate(_FLATTEN_TABLE)
def cmd_embed(src_dir: Path, cache_path: Path) -> None:
    """Detect and embed every face under *src_dir* and write an ``.npz`` cache.

    Runs InsightFace ``buffalo_l`` on the CPU provider over every image found
    by ``list_images``.  For each detected face that passes the
    ``MIN_DET_SCORE`` and ``MIN_FACE_PIX`` gates, the normalised embedding is
    stored together with a metadata record (path, face index, detector score,
    clamped bbox, short edge, area, blur metric).  Images that fail to load
    or yield no usable face get a single ``noface`` record instead.

    The cache holds three entries: ``embeddings`` (one row per kept face),
    ``meta`` (JSON string of all records) and ``src_root``.

    Args:
        src_dir: Root of the photo tree to scan.
        cache_path: Destination file.  ``.npz`` is appended when missing,
            matching what ``np.savez`` does on its own.

    Raises:
        SystemExit: If *src_dir* is not an existing directory.
    """
    if not src_dir.is_dir():
        raise SystemExit(f"source directory not found: {src_dir}")
    # Store absolute paths so the cache can be consumed from any working
    # directory (and so symlink mode gets usable link targets).
    src_dir = src_dir.resolve()

    # np.savez silently appends ".npz"; normalise up front so the path we
    # report (and the one the user passes to cluster/refine) is the real one.
    if not cache_path.name.endswith(".npz"):
        cache_path = cache_path.with_name(cache_path.name + ".npz")
    # Create the target directory *before* the expensive embedding pass so a
    # missing folder cannot throw the whole run away at the very end.
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    from insightface.app import FaceAnalysis

    app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
    app.prepare(ctx_id=-1, det_size=(640, 640))

    images = list_images(src_dir)
    print(f"Found {len(images)} images under {src_dir}")

    embeddings: list[np.ndarray] = []
    meta: list[dict] = []

    t0 = time.time()
    for img_path in tqdm(images, desc="embedding"):
        rgb, bgr = load_rgb_bgr(img_path)
        if bgr is None:
            meta.append({"path": str(img_path), "face_idx": -1, "noface": True, "error": "load"})
            continue
        kept = 0
        for i, f in enumerate(app.get(bgr)):
            if float(f.det_score) < MIN_DET_SCORE:
                continue
            # Clamp the detector's bbox to the image before measuring it.
            x1, y1, x2, y2 = [int(round(v)) for v in f.bbox]
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, rgb.shape[1]), min(y2, rgb.shape[0])
            w, h = x2 - x1, y2 - y1
            short = min(w, h)
            if short < MIN_FACE_PIX:
                continue
            # Blur metric on the face crop (grayscale = channel mean).
            crop = rgb[y1:y2, x1:x2]
            if crop.size == 0:
                continue
            gray = crop.mean(axis=2)
            blur = laplacian_variance(gray) if min(gray.shape) > 3 else 0.0

            embeddings.append(f.normed_embedding.astype(np.float32))
            meta.append({
                "path": str(img_path),
                "face_idx": i,
                "det_score": float(f.det_score),
                "bbox": [x1, y1, x2, y2],
                "face_short": int(short),
                "face_area": int(w * h),
                "blur": blur,
                "noface": False,
            })
            kept += 1
        if kept == 0:
            meta.append({"path": str(img_path), "face_idx": -1, "noface": True})

    dt = time.time() - t0
    print(f"Detected {len(embeddings)} faces across {len(images)} images in {dt:.1f}s")

    # Keep a well-formed (0, 512) array when nothing was detected so that
    # downstream length checks still work.
    emb_arr = np.stack(embeddings) if embeddings else np.zeros((0, 512), dtype=np.float32)
    np.savez(cache_path, embeddings=emb_arr, meta=json.dumps(meta), src_root=str(src_dir))
    print(f"Cache written to {cache_path}")


def load_cache(cache_path: Path) -> tuple[np.ndarray, list[dict], Path | None]:
    """Read a cache written by ``cmd_embed``.

    Returns:
        ``(embeddings, meta, src_root)`` where ``meta`` is the decoded JSON
        record list and ``src_root`` is ``None`` when the file has no
        ``src_root`` entry.

    Raises:
        ValueError: If the file contains pickled object arrays.  The cache
            format only uses plain numeric/string arrays, so pickle loading
            is disabled rather than executing code from an untrusted file.
    """
    # The context manager closes the underlying zip handle; arrays are fully
    # read on item access, so they remain valid afterwards.
    with np.load(cache_path, allow_pickle=False) as data:
        emb = data["embeddings"]
        meta = json.loads(str(data["meta"]))
        src_root = Path(str(data["src_root"])) if "src_root" in data.files else None
    return emb, meta, src_root
_transfer(src: Path, dst: Path, mode: str) -> None: + if dst.exists(): + return + if mode == "copy": + shutil.copy2(src, dst) + elif mode == "move": + shutil.move(str(src), str(dst)) + elif mode == "symlink": + dst.symlink_to(src) + + +def _cluster_embeddings(emb: np.ndarray, threshold: float) -> np.ndarray: + from sklearn.cluster import AgglomerativeClustering + + clusterer = AgglomerativeClustering( + n_clusters=None, + distance_threshold=threshold, + metric="cosine", + linkage="average", + ) + return clusterer.fit_predict(emb) + + +def cmd_cluster(cache_path: Path, out_dir: Path, threshold: float, mode: str, dry_run: bool) -> None: + emb, meta, src_root = load_cache(cache_path) + if src_root is None: + src_root = Path("/") + face_records = [m for m in meta if not m.get("noface")] + noface_records = [m for m in meta if m.get("noface")] + + if len(face_records) != len(emb): + raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}") + if len(emb) == 0: + print("No faces detected; nothing to cluster.") + return + + print(f"Clustering {len(emb)} face embeddings (threshold={threshold} cosine distance)") + labels = _cluster_embeddings(emb, threshold) + + clusters: dict[int, list[dict]] = {} + for rec, lbl in zip(face_records, labels): + rec = dict(rec) + rec["cluster"] = int(lbl) + clusters.setdefault(int(lbl), []).append(rec) + + ordered = sorted(clusters.items(), key=lambda kv: (-len(kv[1]), kv[0])) + sizes = [len(v) for _, v in ordered] + singletons = sum(1 for s in sizes if s == 1) + print(f"Clusters: {len(ordered)} | top sizes: {sizes[:15]}") + print(f"Multi-face clusters: {len(sizes) - singletons} singletons: {singletons}") + print(f"No-face images: {len(noface_records)}") + + if dry_run: + for cid, recs in ordered[:20]: + imgs = {r["path"] for r in recs} + print(f" cluster {cid:3d} faces={len(recs):3d} imgs={len(imgs)}") + return + + out_dir.mkdir(parents=True, exist_ok=True) + rank = 0 + cluster_dir: dict[int, Path] = {} + for cid, recs 
in ordered: + if len(recs) == 1: + cluster_dir[cid] = out_dir / "_singletons" + else: + rank += 1 + cluster_dir[cid] = out_dir / f"person_{rank:03d}" + cluster_dir[cid].mkdir(parents=True, exist_ok=True) + + per_cluster_imgs: dict[int, set[str]] = {cid: set() for cid, _ in ordered} + for cid, recs in ordered: + for r in recs: + per_cluster_imgs[cid].add(r["path"]) + + total = sum(len(v) for v in per_cluster_imgs.values()) + unique = len({p for s in per_cluster_imgs.values() for p in s}) + print(f"Placing {total} file instances across {unique} unique images (mode={mode}) -> {out_dir}") + + for cid, paths in tqdm(per_cluster_imgs.items(), desc="transferring"): + dst_dir = cluster_dir[cid] + for p in sorted(paths): + src = Path(p) + dst = dst_dir / safe_dst_name(src, src_root) + _transfer(src, dst, mode) + + if noface_records: + noface_dir = out_dir / "_noface" + noface_dir.mkdir(exist_ok=True) + for r in noface_records: + src = Path(r["path"]) + if not src.exists(): + continue + _transfer(src, noface_dir / safe_dst_name(src, src_root), mode) + print(f"{len(noface_records)} no-face images -> {noface_dir}") + + manifest = [] + for cid, recs in ordered: + for r in recs: + manifest.append({ + "image": Path(r["path"]).name, + "source": r["path"], + "cluster": cid, + "folder": cluster_dir[cid].name, + "bbox": r.get("bbox"), + "det_score": r.get("det_score"), + "face_short": r.get("face_short"), + "blur": r.get("blur"), + }) + (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2)) + print(f"Manifest -> {out_dir / 'manifest.json'}") + + +def _cluster_centroids(emb: np.ndarray, labels: np.ndarray) -> tuple[np.ndarray, list[int]]: + ids = sorted(set(int(l) for l in labels)) + cents = [] + for cid in ids: + mask = labels == cid + v = emb[mask].mean(axis=0) + n = np.linalg.norm(v) + if n > 0: + v = v / n + cents.append(v) + return np.stack(cents), ids + + +def cmd_refine( + cache_path: Path, + out_dir: Path, + initial_threshold: float, + merge_threshold: float, + 
def cmd_refine(
    cache_path: Path,
    out_dir: Path,
    initial_threshold: float,
    merge_threshold: float,
    outlier_threshold: float,
    min_faces: int,
    min_short: int,
    min_blur: float,
    min_det_score: float,
    mode: str,
    dry_run: bool,
) -> None:
    """Turn cached embeddings into quality-filtered ``faceset_NNN`` folders.

    Stages:
        1. Agglomerative clustering at *initial_threshold*.
        2. Clustering of the per-cluster centroids at *merge_threshold* to
           re-join over-split identities.
        3. Per face: quality gate (*min_short*, *min_blur*, *min_det_score*)
           followed by outlier rejection against the cluster centroid.  The
           centroid and the ">= 4 members" condition both use the merged
           cluster *before* any face is filtered out.
        4. Keep clusters with at least *min_faces* distinct images.

    Surviving clusters are ranked by face count, written as
    ``faceset_NNN`` folders (each image once per faceset) and described in
    ``refine_manifest.json``.

    Args:
        cache_path: ``.npz`` cache produced by the ``embed`` subcommand.
        out_dir: Output root, created if necessary.
        initial_threshold: Cosine-distance cut for stage 1.
        merge_threshold: Cosine-distance cut for the centroid merge.
        outlier_threshold: Maximum cosine distance from the centroid.
        min_faces: Minimum number of distinct images per faceset.
        min_short: Minimum short edge of the face bbox, in pixels.
        min_blur: Minimum Laplacian-variance blur score.
        min_det_score: Minimum detector score.
        mode: ``"copy"``, ``"move"`` or ``"symlink"``.
        dry_run: Only print the statistics; write nothing.

    Raises:
        SystemExit: If the number of face records and embeddings disagree.
    """
    from collections import Counter

    emb, meta, src_root = load_cache(cache_path)
    if src_root is None:
        src_root = Path("/")
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    if len(emb) == 0:
        # Nothing to cluster; the clustering and centroid code cannot handle
        # an empty matrix.
        print("No faces detected; nothing to refine.")
        return

    print(f"Stage 1: initial clustering (threshold={initial_threshold})")
    # Agglomerative clustering needs at least two samples.
    if len(emb) > 1:
        labels = _cluster_embeddings(emb, initial_threshold)
    else:
        labels = np.zeros(1, dtype=int)

    # Stage 2: merge similar clusters by centroid
    cents, cent_ids = _cluster_centroids(emb, labels)
    print(f"Stage 2: centroid merge on {len(cent_ids)} clusters (merge_threshold={merge_threshold})")
    if len(cents) > 1:
        cent_labels = _cluster_embeddings(cents, merge_threshold)
    else:
        cent_labels = np.zeros(1, dtype=int)
    # Remap every face's stage-1 label to the label of its merged centroid.
    label_map = {cid: int(ml) for cid, ml in zip(cent_ids, cent_labels)}
    merged = np.array([label_map[int(l)] for l in labels])

    # merged cluster -> list of (row index into emb, record copy)
    clusters: dict[int, list[tuple[int, dict]]] = {}
    for idx, (rec, lbl) in enumerate(zip(face_records, merged)):
        clusters.setdefault(int(lbl), []).append((idx, dict(rec)))

    print(f"After merge: {len(clusters)} clusters")

    # Stage 3: outlier rejection + quality filter per cluster
    kept_by_cluster: dict[int, list[tuple[int, dict]]] = {}
    dropped_quality = 0
    dropped_outlier = 0
    for cid, items in clusters.items():
        cvecs = emb[[i for i, _ in items]]
        # Unit-length centroid of all faces in the merged cluster.
        c = cvecs.mean(axis=0)
        n = np.linalg.norm(c)
        if n > 0:
            c = c / n
        # Small clusters have too unreliable a centroid to judge outliers.
        check_outliers = len(items) >= 4

        kept: list[tuple[int, dict]] = []
        for (idx, rec), v in zip(items, cvecs):
            if (
                rec.get("face_short", 0) < min_short
                or rec.get("blur", 0.0) < min_blur
                or rec.get("det_score", 0.0) < min_det_score
            ):
                dropped_quality += 1
                continue
            if check_outliers and 1.0 - float(v @ c) > outlier_threshold:
                dropped_outlier += 1
                continue
            kept.append((idx, rec))
        if kept:
            kept_by_cluster[cid] = kept

    print(f"Dropped {dropped_quality} faces by quality gate, {dropped_outlier} as outliers")

    # Stage 4: enforce minimum cluster size (by unique images, not faces)
    final: list[tuple[int, list[tuple[int, dict]]]] = [
        (cid, items)
        for cid, items in kept_by_cluster.items()
        if len({rec["path"] for _, rec in items}) >= min_faces
    ]
    final.sort(key=lambda kv: -len(kv[1]))

    print(f"Facesets meeting min_faces={min_faces}: {len(final)}")
    for rank, (cid, items) in enumerate(final, 1):
        unique_imgs = {rec["path"] for _, rec in items}
        print(f" faceset_{rank:03d}: faces={len(items):3d} imgs={len(unique_imgs):3d}")

    if dry_run:
        return

    out_dir.mkdir(parents=True, exist_ok=True)
    # Distinct image paths per faceset, in first-seen order.
    faceset_paths = [
        list(dict.fromkeys(rec["path"] for _, rec in items)) for _, items in final
    ]
    pending = Counter(p for paths in faceset_paths for p in paths)
    for rank, paths in enumerate(faceset_paths, 1):
        dst_dir = out_dir / f"faceset_{rank:03d}"
        dst_dir.mkdir(exist_ok=True)
        for p in paths:
            pending[p] -= 1
            src = Path(p)
            if not src.exists():
                continue
            # In move mode an image shared by several facesets is copied
            # until its last placement; moving it right away would leave it
            # missing from every later faceset although the manifest lists it.
            effective = "copy" if mode == "move" and pending[p] > 0 else mode
            _transfer(src, dst_dir / safe_dst_name(src, src_root), effective)

    # Write refinement manifest
    manifest = {
        "params": {
            "initial_threshold": initial_threshold,
            "merge_threshold": merge_threshold,
            "outlier_threshold": outlier_threshold,
            "min_faces": min_faces,
            "min_short": min_short,
            "min_blur": min_blur,
            "min_det_score": min_det_score,
        },
        "facesets": [
            {
                "name": f"faceset_{rank:03d}",
                "face_count": len(items),
                "image_count": len(paths),
                "images": sorted(paths),
            }
            for rank, ((_, items), paths) in enumerate(zip(final, faceset_paths), 1)
        ],
    }
    (out_dir / "refine_manifest.json").write_text(json.dumps(manifest, indent=2))
    print(f"Refine manifest -> {out_dir / 'refine_manifest.json'}")


def main() -> None:
    """Parse the command line and dispatch to embed / cluster / refine."""
    p = argparse.ArgumentParser()
    sub = p.add_subparsers(dest="cmd", required=True)

    pe = sub.add_parser("embed")
    pe.add_argument("src_dir", type=Path)
    pe.add_argument("cache", type=Path)

    pc = sub.add_parser("cluster")
    pc.add_argument("cache", type=Path)
    pc.add_argument("out_dir", type=Path)
    pc.add_argument("--threshold", type=float, default=0.55)
    pc.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pc.add_argument("--dry-run", action="store_true")

    pr = sub.add_parser("refine")
    pr.add_argument("cache", type=Path)
    pr.add_argument("out_dir", type=Path)
    pr.add_argument("--initial-threshold", type=float, default=0.55)
    pr.add_argument("--merge-threshold", type=float, default=0.40)
    pr.add_argument("--outlier-threshold", type=float, default=0.55)
    pr.add_argument("--min-faces", type=int, default=15)
    pr.add_argument("--min-short", type=int, default=90)
    pr.add_argument("--min-blur", type=float, default=40.0)
    pr.add_argument("--min-det-score", type=float, default=0.6)
    pr.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pr.add_argument("--dry-run", action="store_true")

    args = p.parse_args()
    if args.cmd == "embed":
        cmd_embed(args.src_dir, args.cache)
    elif args.cmd == "cluster":
        cmd_cluster(args.cache, args.out_dir, args.threshold, args.mode, args.dry_run)
    elif args.cmd == "refine":
        cmd_refine(
            args.cache, args.out_dir,
            args.initial_threshold, args.merge_threshold, args.outlier_threshold,
            args.min_faces, args.min_short, args.min_blur, args.min_det_score,
            args.mode, args.dry_run,
        )


if __name__ == "__main__":
    main()