"""Sort photos by similar faces using InsightFace embeddings + agglomerative clustering. Subcommands: embed recursively scan, detect+embed faces cluster [opts] raw agglomerative clustering -> person_NNN/ refine [opts] merge + outlier + quality pass -> faceset-ready folders """ from __future__ import annotations import argparse import json import shutil import sys import time from pathlib import Path import numpy as np from PIL import Image, ImageOps from tqdm import tqdm IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp", ".heic"} MIN_DET_SCORE = 0.5 MIN_FACE_PIX = 40 def list_images(src: Path) -> list[Path]: out: list[Path] = [] for p in src.rglob("*"): if p.is_file() and p.suffix.lower() in IMG_EXTS: out.append(p) return sorted(out) def load_rgb_bgr(path: Path): try: with Image.open(path) as im: im = ImageOps.exif_transpose(im) im = im.convert("RGB") rgb = np.array(im) bgr = rgb[:, :, ::-1].copy() return rgb, bgr except Exception as e: print(f"[warn] failed to load {path}: {e}", file=sys.stderr) return None, None def laplacian_variance(gray: np.ndarray) -> float: """Simple blur metric without OpenCV Laplacian call (uses numpy).""" k = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtype=np.float32) # same-size convolution via numpy slicing g = gray.astype(np.float32) lap = ( -4.0 * g[1:-1, 1:-1] + g[:-2, 1:-1] + g[2:, 1:-1] + g[1:-1, :-2] + g[1:-1, 2:] ) return float(lap.var()) def make_rel(path: Path, root: Path) -> str: try: return str(path.relative_to(root)) except ValueError: return path.name def safe_dst_name(path: Path, root: Path) -> str: """Collision-safe filename built from source-relative path.""" rel = make_rel(path, root) # Flatten: replace separators with double underscore flat = rel.replace("/", "__").replace("\\", "__").replace(" ", "_") return flat def cmd_embed(src_dir: Path, cache_path: Path) -> None: from insightface.app import FaceAnalysis app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"]) app.prepare(ctx_id=-1, det_size=(640, 640)) images = list_images(src_dir) print(f"Found {len(images)} images under {src_dir}") embeddings: list[np.ndarray] = [] meta: list[dict] = [] t0 = time.time() for img_path in tqdm(images, desc="embedding"): rgb, bgr = load_rgb_bgr(img_path) if bgr is None: meta.append({"path": str(img_path), "face_idx": -1, "noface": True, "error": "load"}) continue faces = app.get(bgr) kept = 0 for i, f in enumerate(faces): if float(f.det_score) < MIN_DET_SCORE: continue x1, y1, x2, y2 = [int(round(v)) for v in f.bbox] x1, y1 = max(x1, 0), max(y1, 0) x2, y2 = min(x2, rgb.shape[1]), min(y2, rgb.shape[0]) w, h = x2 - x1, y2 - y1 short = min(w, h) if short < MIN_FACE_PIX: continue # Blur metric on the face crop (grayscale) crop = rgb[y1:y2, x1:x2] if crop.size == 0: continue gray = crop.mean(axis=2) blur = laplacian_variance(gray) if min(gray.shape) > 3 else 0.0 emb = f.normed_embedding.astype(np.float32) embeddings.append(emb) meta.append({ "path": str(img_path), "face_idx": i, "det_score": float(f.det_score), "bbox": [x1, y1, x2, y2], "face_short": int(short), "face_area": int(w * h), "blur": blur, "noface": False, }) kept += 1 if kept == 0: meta.append({"path": str(img_path), "face_idx": -1, "noface": True}) dt = time.time() - t0 print(f"Detected {len(embeddings)} faces across {len(images)} images in {dt:.1f}s") emb_arr = np.stack(embeddings) if embeddings else np.zeros((0, 512), dtype=np.float32) np.savez(cache_path, embeddings=emb_arr, meta=json.dumps(meta), src_root=str(src_dir)) print(f"Cache written to 
{cache_path}") def load_cache(cache_path: Path): data = np.load(cache_path, allow_pickle=True) emb = data["embeddings"] meta = json.loads(str(data["meta"])) src_root = Path(str(data["src_root"])) if "src_root" in data.files else None return emb, meta, src_root def _transfer(src: Path, dst: Path, mode: str) -> None: if dst.exists(): return if mode == "copy": shutil.copy2(src, dst) elif mode == "move": shutil.move(str(src), str(dst)) elif mode == "symlink": dst.symlink_to(src) def _cluster_embeddings(emb: np.ndarray, threshold: float) -> np.ndarray: from sklearn.cluster import AgglomerativeClustering clusterer = AgglomerativeClustering( n_clusters=None, distance_threshold=threshold, metric="cosine", linkage="average", ) return clusterer.fit_predict(emb) def cmd_cluster(cache_path: Path, out_dir: Path, threshold: float, mode: str, dry_run: bool) -> None: emb, meta, src_root = load_cache(cache_path) if src_root is None: src_root = Path("/") face_records = [m for m in meta if not m.get("noface")] noface_records = [m for m in meta if m.get("noface")] if len(face_records) != len(emb): raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}") if len(emb) == 0: print("No faces detected; nothing to cluster.") return print(f"Clustering {len(emb)} face embeddings (threshold={threshold} cosine distance)") labels = _cluster_embeddings(emb, threshold) clusters: dict[int, list[dict]] = {} for rec, lbl in zip(face_records, labels): rec = dict(rec) rec["cluster"] = int(lbl) clusters.setdefault(int(lbl), []).append(rec) ordered = sorted(clusters.items(), key=lambda kv: (-len(kv[1]), kv[0])) sizes = [len(v) for _, v in ordered] singletons = sum(1 for s in sizes if s == 1) print(f"Clusters: {len(ordered)} | top sizes: {sizes[:15]}") print(f"Multi-face clusters: {len(sizes) - singletons} singletons: {singletons}") print(f"No-face images: {len(noface_records)}") if dry_run: for cid, recs in ordered[:20]: imgs = {r["path"] for r in recs} print(f" cluster {cid:3d} faces={len(recs):3d} imgs={len(imgs)}") return out_dir.mkdir(parents=True, exist_ok=True) rank = 0 cluster_dir: dict[int, Path] = {} for cid, recs in ordered: if len(recs) == 1: cluster_dir[cid] = out_dir / "_singletons" else: rank += 1 cluster_dir[cid] = out_dir / f"person_{rank:03d}" cluster_dir[cid].mkdir(parents=True, exist_ok=True) per_cluster_imgs: dict[int, set[str]] = {cid: set() for cid, _ in ordered} for cid, recs in ordered: for r in recs: per_cluster_imgs[cid].add(r["path"]) total = sum(len(v) for v in per_cluster_imgs.values()) unique = len({p for s in per_cluster_imgs.values() for p in s}) print(f"Placing {total} file instances across {unique} unique images (mode={mode}) -> {out_dir}") for cid, paths in tqdm(per_cluster_imgs.items(), desc="transferring"): dst_dir = cluster_dir[cid] for p in sorted(paths): src = Path(p) dst = dst_dir / safe_dst_name(src, src_root) _transfer(src, dst, mode) if noface_records: noface_dir = out_dir / "_noface" noface_dir.mkdir(exist_ok=True) for r in noface_records: src = Path(r["path"]) if not src.exists(): continue _transfer(src, noface_dir / safe_dst_name(src, src_root), mode) print(f"{len(noface_records)} no-face images -> {noface_dir}") manifest = [] for cid, recs in ordered: for r in recs: manifest.append({ "image": Path(r["path"]).name, "source": r["path"], "cluster": cid, "folder": cluster_dir[cid].name, "bbox": r.get("bbox"), "det_score": r.get("det_score"), "face_short": r.get("face_short"), "blur": r.get("blur"), }) (out_dir / "manifest.json").write_text(json.dumps(manifest, 
def _cluster_centroids(emb: np.ndarray, labels: np.ndarray) -> tuple[np.ndarray, list[int]]:
    ids = sorted(set(int(lbl) for lbl in labels))
    cents = []
    for cid in ids:
        mask = labels == cid
        v = emb[mask].mean(axis=0)
        n = np.linalg.norm(v)
        if n > 0:
            v = v / n
        cents.append(v)
    return np.stack(cents), ids


def cmd_refine(
    cache_path: Path,
    out_dir: Path,
    initial_threshold: float,
    merge_threshold: float,
    outlier_threshold: float,
    min_faces: int,
    min_short: int,
    min_blur: float,
    min_det_score: float,
    mode: str,
    dry_run: bool,
) -> None:
    emb, meta, src_root = load_cache(cache_path)
    if src_root is None:
        src_root = Path("/")
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")

    print(f"Stage 1: initial clustering (threshold={initial_threshold})")
    labels = _cluster_embeddings(emb, initial_threshold)

    # Stage 2: merge similar clusters by centroid
    cents, cent_ids = _cluster_centroids(emb, labels)
    print(f"Stage 2: centroid merge on {len(cent_ids)} clusters (merge_threshold={merge_threshold})")
    cent_labels = _cluster_embeddings(cents, merge_threshold) if len(cents) > 1 else np.zeros(1, dtype=int)

    # Remap original labels via the centroid merge
    label_map = {cid: int(ml) for cid, ml in zip(cent_ids, cent_labels)}
    merged = np.array([label_map[int(lbl)] for lbl in labels])

    # Build merged clusters
    clusters: dict[int, list[tuple[int, dict]]] = {}  # cluster -> list of (global_idx, rec)
    for idx, (rec, lbl) in enumerate(zip(face_records, merged)):
        clusters.setdefault(int(lbl), []).append((idx, dict(rec)))
    print(f"After merge: {len(clusters)} clusters")

    # Stage 3: outlier rejection + quality filter per cluster
    kept_by_cluster: dict[int, list[tuple[int, dict]]] = {}
    dropped_quality = 0
    dropped_outlier = 0
    for cid, items in clusters.items():
        idxs = [i for i, _ in items]
        cvecs = emb[idxs]
        # Centroid from the in-cluster faces
        c = cvecs.mean(axis=0)
        n = np.linalg.norm(c)
        if n > 0:
            c = c / n
        kept: list[tuple[int, dict]] = []
        for (idx, rec), v in zip(items, cvecs):
            # Quality gate
            if rec.get("face_short", 0) < min_short:
                dropped_quality += 1
                continue
            if rec.get("blur", 0.0) < min_blur:
                dropped_quality += 1
                continue
            if rec.get("det_score", 0.0) < min_det_score:
                dropped_quality += 1
                continue
            # Outlier gate: only applied when the merged cluster has >= 4 members
            # (counted before the quality filter, so a loose proxy for survivors)
            if len(items) >= 4:
                cos_dist = 1.0 - float(v @ c)
                if cos_dist > outlier_threshold:
                    dropped_outlier += 1
                    continue
            kept.append((idx, rec))
        if kept:
            kept_by_cluster[cid] = kept
    print(f"Dropped {dropped_quality} faces by quality gate, {dropped_outlier} as outliers")

    # Stage 4: enforce minimum cluster size (by unique images, not faces)
    final: list[tuple[int, list[tuple[int, dict]]]] = []
    for cid, items in kept_by_cluster.items():
        unique_imgs = {rec["path"] for _, rec in items}
        if len(unique_imgs) >= min_faces:
            final.append((cid, items))
    final.sort(key=lambda kv: -len(kv[1]))

    print(f"Facesets meeting min_faces={min_faces}: {len(final)}")
    for rank, (cid, items) in enumerate(final, 1):
        unique_imgs = {rec["path"] for _, rec in items}
        print(f"  faceset_{rank:03d}: faces={len(items):3d} imgs={len(unique_imgs):3d}")
    if dry_run:
        return

    out_dir.mkdir(parents=True, exist_ok=True)
    for rank, (cid, items) in enumerate(final, 1):
        dst_dir = out_dir / f"faceset_{rank:03d}"
        dst_dir.mkdir(exist_ok=True)
        seen_paths: set[str] = set()
        for _, rec in items:
            p = rec["path"]
            if p in seen_paths:
                continue
            seen_paths.add(p)
            src = Path(p)
            if not src.exists():
                continue
            _transfer(src, dst_dir / safe_dst_name(src, src_root), mode)
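    # An image containing faces from two different facesets is copied/linked
    # into both folders above; seen_paths only de-duplicates within one faceset.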
    # Write refinement manifest
    manifest = {
        "params": {
            "initial_threshold": initial_threshold,
            "merge_threshold": merge_threshold,
            "outlier_threshold": outlier_threshold,
            "min_faces": min_faces,
            "min_short": min_short,
            "min_blur": min_blur,
            "min_det_score": min_det_score,
        },
        "facesets": [
            {
                "name": f"faceset_{rank:03d}",
                "face_count": len(items),
                "image_count": len({rec["path"] for _, rec in items}),
                "images": sorted({rec["path"] for _, rec in items}),
            }
            for rank, (_, items) in enumerate(final, 1)
        ],
    }
    (out_dir / "refine_manifest.json").write_text(json.dumps(manifest, indent=2))
    print(f"Refine manifest -> {out_dir / 'refine_manifest.json'}")


def main() -> None:
    p = argparse.ArgumentParser()
    sub = p.add_subparsers(dest="cmd", required=True)

    pe = sub.add_parser("embed")
    pe.add_argument("src_dir", type=Path)
    pe.add_argument("cache", type=Path)

    pc = sub.add_parser("cluster")
    pc.add_argument("cache", type=Path)
    pc.add_argument("out_dir", type=Path)
    pc.add_argument("--threshold", type=float, default=0.55)
    pc.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pc.add_argument("--dry-run", action="store_true")

    pr = sub.add_parser("refine")
    pr.add_argument("cache", type=Path)
    pr.add_argument("out_dir", type=Path)
    pr.add_argument("--initial-threshold", type=float, default=0.55)
    pr.add_argument("--merge-threshold", type=float, default=0.40)
    pr.add_argument("--outlier-threshold", type=float, default=0.55)
    pr.add_argument("--min-faces", type=int, default=15)
    pr.add_argument("--min-short", type=int, default=90)
    pr.add_argument("--min-blur", type=float, default=40.0)
    pr.add_argument("--min-det-score", type=float, default=0.6)
    pr.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pr.add_argument("--dry-run", action="store_true")

    args = p.parse_args()
    if args.cmd == "embed":
        cmd_embed(args.src_dir, args.cache)
    elif args.cmd == "cluster":
        cmd_cluster(args.cache, args.out_dir, args.threshold, args.mode, args.dry_run)
    elif args.cmd == "refine":
        cmd_refine(
            args.cache,
            args.out_dir,
            args.initial_threshold,
            args.merge_threshold,
            args.outlier_threshold,
            args.min_faces,
            args.min_short,
            args.min_blur,
            args.min_det_score,
            args.mode,
            args.dry_run,
        )


if __name__ == "__main__":
    main()
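
# --- Example invocations (a sketch; "sort_faces.py" and all paths below are
# hypothetical, substitute your own) --------------------------------------------
#   python sort_faces.py embed ~/Pictures/raw faces.npz
#   python sort_faces.py cluster faces.npz sorted/ --threshold 0.55 --dry-run
#   python sort_faces.py cluster faces.npz sorted/ --mode symlink
#   python sort_faces.py refine faces.npz facesets/ --min-faces 15 --mode copy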