Add face-sort pipeline as the repo's base
Single-file CLI (embed / cluster / refine) using InsightFace buffalo_l embeddings and agglomerative clustering, migrated from the ad-hoc /home/peter/face_sort/ directory so that this repo becomes the canonical home for faceset preparation feeding roop-unleashed and similar tools.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

.gitignore (vendored, new file)
@@ -0,0 +1,5 @@
work/
__pycache__/
*.pyc
.venv/
.claude/

README.md (replaces the previous one-line "Dummy" stub)
@@ -1 +1,48 @@
# face-sets

Sort photos by similar faces using InsightFace embeddings + agglomerative clustering, then refine them into faceset-ready folders for downstream face-swap tooling (roop-unleashed, etc.).

## Pipeline

`sort_faces.py` is a single-file CLI with three subcommands:

| step    | what it does |
|---------|--------------|
| embed   | Recursively scan a source tree, detect + embed every face, write `.npz` cache |
| cluster | Raw agglomerative clustering of the cache into `person_NNN/` / `_singletons/` / `_noface/` |
| refine  | Initial cluster → centroid merge → quality gate → outlier rejection → size filter → `faceset_NNN/` |

Cache and outputs are kept out of the repo via `.gitignore`; defaults live under `work/`.
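
The cache is a plain `.npz` and can be inspected without the CLI; a minimal sketch, mirroring `load_cache` in `sort_faces.py` (the cache path is just an example):

```python
import json
import numpy as np

data = np.load("work/cache/nl_all.npz", allow_pickle=True)
emb = data["embeddings"]              # (n_faces, 512) float32, L2-normalized
meta = json.loads(str(data["meta"]))  # one dict per face, plus no-face records
print(emb.shape, len(meta), str(data["src_root"]))
```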

## Typical run

```bash
# 1. Embed (CPU; InsightFace buffalo_l). Caches face embeddings + metadata.
python sort_faces.py embed "/mnt/x/src/nl/Neuer Ordner (2)/New Folder" work/cache/nl_all.npz

# 2. Raw clusters (every multi-face cluster -> a person_NNN/ folder).
python sort_faces.py cluster work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/raw

# 3. Refined facesets (filters for faceset-ready quality).
python sort_faces.py refine work/cache/nl_all.npz /mnt/e/temp_things/fcswp/nl_sorted/facesets
```

## Refine defaults

| flag | default | meaning |
|---|---|---|
| `--initial-threshold` | 0.55 | cosine distance for stage-1 clustering |
| `--merge-threshold` | 0.40 | centroid-level merge of over-split clusters |
| `--outlier-threshold` | 0.55 | drop a face if its cosine distance from the cluster centroid exceeds this (applied only when the cluster has ≥ 4 faces) |
| `--min-faces` | 15 | minimum unique images per faceset |
| `--min-short` | 90 | minimum short-edge pixels of the face bbox |
| `--min-blur` | 40.0 | Laplacian-variance blur gate |
| `--min-det-score` | 0.6 | InsightFace detector score gate |
| `--mode` | copy | copy / move / symlink |
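
For intuition, every threshold above is a cosine distance over L2-normalized embeddings, so the outlier gate reduces to `1 - v @ c` exactly as in the source; a toy sketch with made-up vectors:

```python
import numpy as np

rng = np.random.default_rng(0)
c = rng.normal(size=512).astype(np.float32)
c /= np.linalg.norm(c)  # cluster centroid, unit length

v = c + 0.02 * rng.normal(size=512).astype(np.float32)
v /= np.linalg.norm(v)  # a member embedding, unit length

cos_dist = 1.0 - float(v @ c)  # same formula as the refine outlier check
print(f"cos_dist={cos_dist:.3f} ->", "drop" if cos_dist > 0.55 else "keep")
```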

## Prior runs (as of 2026-04-22)

- `work/cache/kos11.npz` — 181 images, 333 faces from `Kos '11/` → `kos11_sorted/`
- `work/cache/nl_all.npz` — 916 images, 1396 faces from `Neuer Ordner (2)/New Folder/` → `nl_sorted/raw/`, refined to 6 facesets (197, 120, 91, 47, 23, 18 images)

Output lives outside the repo at `/mnt/e/temp_things/fcswp/`.

sort_faces.py (new file)
@@ -0,0 +1,443 @@
"""Sort photos by similar faces using InsightFace embeddings + agglomerative clustering.

Subcommands:
    embed   <src_dir> <cache.npz>           recursively scan, detect + embed faces
    cluster <cache.npz> <out_dir> [opts]    raw agglomerative clustering -> person_NNN/
    refine  <cache.npz> <out_dir> [opts]    merge + outlier + quality pass -> faceset-ready folders
"""
from __future__ import annotations

import argparse
import json
import shutil
import sys
import time
from pathlib import Path

import numpy as np
from PIL import Image, ImageOps
from tqdm import tqdm

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp", ".heic"}
MIN_DET_SCORE = 0.5
MIN_FACE_PIX = 40

def list_images(src: Path) -> list[Path]:
    out: list[Path] = []
    for p in src.rglob("*"):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            out.append(p)
    return sorted(out)

def load_rgb_bgr(path: Path):
    try:
        with Image.open(path) as im:
            im = ImageOps.exif_transpose(im)
            im = im.convert("RGB")
            rgb = np.array(im)
            bgr = rgb[:, :, ::-1].copy()  # InsightFace expects BGR (OpenCV channel order)
            return rgb, bgr
    except Exception as e:
        print(f"[warn] failed to load {path}: {e}", file=sys.stderr)
        return None, None

def laplacian_variance(gray: np.ndarray) -> float:
    """Simple blur metric without an OpenCV Laplacian call (pure numpy)."""
    # Discrete Laplacian kernel [[0,1,0],[1,-4,1],[0,1,0]], applied as a
    # valid-region convolution via slicing (output shrinks by 1 px per edge).
    g = gray.astype(np.float32)
    lap = (
        -4.0 * g[1:-1, 1:-1]
        + g[:-2, 1:-1] + g[2:, 1:-1]
        + g[1:-1, :-2] + g[1:-1, 2:]
    )
    return float(lap.var())

def make_rel(path: Path, root: Path) -> str:
    try:
        return str(path.relative_to(root))
    except ValueError:
        return path.name

def safe_dst_name(path: Path, root: Path) -> str:
    """Collision-safe filename built from the source-relative path."""
    rel = make_rel(path, root)
    # Flatten: replace path separators (and spaces) with safe substitutes
    flat = rel.replace("/", "__").replace("\\", "__").replace(" ", "_")
    return flat

def cmd_embed(src_dir: Path, cache_path: Path) -> None:
    from insightface.app import FaceAnalysis

    app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
    app.prepare(ctx_id=-1, det_size=(640, 640))

    images = list_images(src_dir)
    print(f"Found {len(images)} images under {src_dir}")

    embeddings: list[np.ndarray] = []
    meta: list[dict] = []

    t0 = time.time()
    for img_path in tqdm(images, desc="embedding"):
        rgb, bgr = load_rgb_bgr(img_path)
        if bgr is None:
            meta.append({"path": str(img_path), "face_idx": -1, "noface": True, "error": "load"})
            continue
        faces = app.get(bgr)
        kept = 0
        for i, f in enumerate(faces):
            if float(f.det_score) < MIN_DET_SCORE:
                continue
            # Clamp the detector bbox to the image bounds
            x1, y1, x2, y2 = [int(round(v)) for v in f.bbox]
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, rgb.shape[1]), min(y2, rgb.shape[0])
            w, h = x2 - x1, y2 - y1
            short = min(w, h)
            if short < MIN_FACE_PIX:
                continue
            # Blur metric on the face crop (grayscale)
            crop = rgb[y1:y2, x1:x2]
            if crop.size == 0:
                continue
            gray = crop.mean(axis=2)
            blur = laplacian_variance(gray) if min(gray.shape) > 3 else 0.0

            emb = f.normed_embedding.astype(np.float32)
            embeddings.append(emb)
            meta.append({
                "path": str(img_path),
                "face_idx": i,
                "det_score": float(f.det_score),
                "bbox": [x1, y1, x2, y2],
                "face_short": int(short),
                "face_area": int(w * h),
                "blur": blur,
                "noface": False,
            })
            kept += 1
        if kept == 0:
            meta.append({"path": str(img_path), "face_idx": -1, "noface": True})

    dt = time.time() - t0
    print(f"Detected {len(embeddings)} faces across {len(images)} images in {dt:.1f}s")

    emb_arr = np.stack(embeddings) if embeddings else np.zeros((0, 512), dtype=np.float32)
    cache_path.parent.mkdir(parents=True, exist_ok=True)  # ensure the cache directory exists
    np.savez(cache_path, embeddings=emb_arr, meta=json.dumps(meta), src_root=str(src_dir))
    print(f"Cache written to {cache_path}")

def load_cache(cache_path: Path):
    data = np.load(cache_path, allow_pickle=True)
    emb = data["embeddings"]
    meta = json.loads(str(data["meta"]))
    src_root = Path(str(data["src_root"])) if "src_root" in data.files else None
    return emb, meta, src_root

def _transfer(src: Path, dst: Path, mode: str) -> None:
    if dst.exists():
        return
    if mode == "copy":
        shutil.copy2(src, dst)
    elif mode == "move":
        shutil.move(str(src), str(dst))
    elif mode == "symlink":
        dst.symlink_to(src)

def _cluster_embeddings(emb: np.ndarray, threshold: float) -> np.ndarray:
    from sklearn.cluster import AgglomerativeClustering

    # n_clusters=None + distance_threshold: cut the dendrogram wherever the
    # average cosine distance between merged groups would exceed `threshold`.
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
        metric="cosine",
        linkage="average",
    )
    return clusterer.fit_predict(emb)

def cmd_cluster(cache_path: Path, out_dir: Path, threshold: float, mode: str, dry_run: bool) -> None:
    emb, meta, src_root = load_cache(cache_path)
    if src_root is None:
        src_root = Path("/")
    face_records = [m for m in meta if not m.get("noface")]
    noface_records = [m for m in meta if m.get("noface")]

    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    if len(emb) == 0:
        print("No faces detected; nothing to cluster.")
        return

    print(f"Clustering {len(emb)} face embeddings (threshold={threshold} cosine distance)")
    labels = _cluster_embeddings(emb, threshold)

    clusters: dict[int, list[dict]] = {}
    for rec, lbl in zip(face_records, labels):
        rec = dict(rec)
        rec["cluster"] = int(lbl)
        clusters.setdefault(int(lbl), []).append(rec)

    ordered = sorted(clusters.items(), key=lambda kv: (-len(kv[1]), kv[0]))
    sizes = [len(v) for _, v in ordered]
    singletons = sum(1 for s in sizes if s == 1)
    print(f"Clusters: {len(ordered)} | top sizes: {sizes[:15]}")
    print(f"Multi-face clusters: {len(sizes) - singletons} | singletons: {singletons}")
    print(f"No-face images: {len(noface_records)}")

    if dry_run:
        for cid, recs in ordered[:20]:
            imgs = {r["path"] for r in recs}
            print(f"  cluster {cid:3d}  faces={len(recs):3d}  imgs={len(imgs)}")
        return

    out_dir.mkdir(parents=True, exist_ok=True)
    rank = 0
    cluster_dir: dict[int, Path] = {}
    for cid, recs in ordered:
        if len(recs) == 1:
            cluster_dir[cid] = out_dir / "_singletons"
        else:
            rank += 1
            cluster_dir[cid] = out_dir / f"person_{rank:03d}"
        cluster_dir[cid].mkdir(parents=True, exist_ok=True)

    per_cluster_imgs: dict[int, set[str]] = {cid: set() for cid, _ in ordered}
    for cid, recs in ordered:
        for r in recs:
            per_cluster_imgs[cid].add(r["path"])

    total = sum(len(v) for v in per_cluster_imgs.values())
    unique = len({p for s in per_cluster_imgs.values() for p in s})
    print(f"Placing {total} file instances across {unique} unique images (mode={mode}) -> {out_dir}")

    for cid, paths in tqdm(per_cluster_imgs.items(), desc="transferring"):
        dst_dir = cluster_dir[cid]
        for p in sorted(paths):
            src = Path(p)
            dst = dst_dir / safe_dst_name(src, src_root)
            _transfer(src, dst, mode)

    if noface_records:
        noface_dir = out_dir / "_noface"
        noface_dir.mkdir(exist_ok=True)
        for r in noface_records:
            src = Path(r["path"])
            if not src.exists():
                continue
            _transfer(src, noface_dir / safe_dst_name(src, src_root), mode)
        print(f"{len(noface_records)} no-face images -> {noface_dir}")

    manifest = []
    for cid, recs in ordered:
        for r in recs:
            manifest.append({
                "image": Path(r["path"]).name,
                "source": r["path"],
                "cluster": cid,
                "folder": cluster_dir[cid].name,
                "bbox": r.get("bbox"),
                "det_score": r.get("det_score"),
                "face_short": r.get("face_short"),
                "blur": r.get("blur"),
            })
    (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
    print(f"Manifest -> {out_dir / 'manifest.json'}")

def _cluster_centroids(emb: np.ndarray, labels: np.ndarray) -> tuple[np.ndarray, list[int]]:
    ids = sorted(set(int(l) for l in labels))
    cents = []
    for cid in ids:
        mask = labels == cid
        v = emb[mask].mean(axis=0)
        n = np.linalg.norm(v)
        if n > 0:
            v = v / n  # re-normalize: the mean of unit vectors is not unit length
        cents.append(v)
    return np.stack(cents), ids

def cmd_refine(
    cache_path: Path,
    out_dir: Path,
    initial_threshold: float,
    merge_threshold: float,
    outlier_threshold: float,
    min_faces: int,
    min_short: int,
    min_blur: float,
    min_det_score: float,
    mode: str,
    dry_run: bool,
) -> None:
    emb, meta, src_root = load_cache(cache_path)
    if src_root is None:
        src_root = Path("/")
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    if len(emb) == 0:
        print("No faces detected; nothing to refine.")
        return

    print(f"Stage 1: initial clustering (threshold={initial_threshold})")
    labels = _cluster_embeddings(emb, initial_threshold)

    # Stage 2: merge similar clusters by centroid
    cents, cent_ids = _cluster_centroids(emb, labels)
    print(f"Stage 2: centroid merge on {len(cent_ids)} clusters (merge_threshold={merge_threshold})")
    cent_labels = _cluster_embeddings(cents, merge_threshold) if len(cents) > 1 else np.zeros(1, dtype=int)
    # remap original labels via the centroid merge
    label_map = {cid: int(ml) for cid, ml in zip(cent_ids, cent_labels)}
    merged = np.array([label_map[int(l)] for l in labels])

    # Build merged clusters
    clusters: dict[int, list[tuple[int, dict]]] = {}  # cluster -> list of (global_idx, rec)
    for idx, (rec, lbl) in enumerate(zip(face_records, merged)):
        clusters.setdefault(int(lbl), []).append((idx, dict(rec)))

    print(f"After merge: {len(clusters)} clusters")

    # Stage 3: outlier rejection + quality filter per cluster
    kept_by_cluster: dict[int, list[tuple[int, dict]]] = {}
    dropped_quality = 0
    dropped_outlier = 0
    for cid, items in clusters.items():
        idxs = [i for i, _ in items]
        cvecs = emb[idxs]
        # centroid from the in-cluster faces
        c = cvecs.mean(axis=0)
        n = np.linalg.norm(c)
        if n > 0:
            c = c / n

        kept: list[tuple[int, dict]] = []
        for (idx, rec), v in zip(items, cvecs):
            # Quality gate
            if rec.get("face_short", 0) < min_short:
                dropped_quality += 1
                continue
            if rec.get("blur", 0.0) < min_blur:
                dropped_quality += 1
                continue
            if rec.get("det_score", 0.0) < min_det_score:
                dropped_quality += 1
                continue
            # Outlier: only applied if the merged cluster has >= 4 members
            if len(items) >= 4:
                cos_dist = 1.0 - float(v @ c)
                if cos_dist > outlier_threshold:
                    dropped_outlier += 1
                    continue
            kept.append((idx, rec))
        if kept:
            kept_by_cluster[cid] = kept

    print(f"Dropped {dropped_quality} faces by quality gate, {dropped_outlier} as outliers")

    # Stage 4: enforce minimum cluster size (by unique images, not faces)
    final: list[tuple[int, list[tuple[int, dict]]]] = []
    for cid, items in kept_by_cluster.items():
        unique_imgs = {rec["path"] for _, rec in items}
        if len(unique_imgs) >= min_faces:
            final.append((cid, items))
    final.sort(key=lambda kv: -len(kv[1]))

    print(f"Facesets meeting min_faces={min_faces}: {len(final)}")
    for rank, (cid, items) in enumerate(final, 1):
        unique_imgs = {rec["path"] for _, rec in items}
        print(f"  faceset_{rank:03d}: faces={len(items):3d} imgs={len(unique_imgs):3d}")

    if dry_run:
        return

    out_dir.mkdir(parents=True, exist_ok=True)
    for rank, (cid, items) in enumerate(final, 1):
        dst_dir = out_dir / f"faceset_{rank:03d}"
        dst_dir.mkdir(exist_ok=True)
        seen_paths: set[str] = set()
        for _, rec in items:
            p = rec["path"]
            if p in seen_paths:
                continue
            seen_paths.add(p)
            src = Path(p)
            if not src.exists():
                continue
            _transfer(src, dst_dir / safe_dst_name(src, src_root), mode)

    # Write refinement manifest
    manifest = {
        "params": {
            "initial_threshold": initial_threshold,
            "merge_threshold": merge_threshold,
            "outlier_threshold": outlier_threshold,
            "min_faces": min_faces,
            "min_short": min_short,
            "min_blur": min_blur,
            "min_det_score": min_det_score,
        },
        "facesets": [
            {
                "name": f"faceset_{rank:03d}",
                "face_count": len(items),
                "image_count": len({rec["path"] for _, rec in items}),
                "images": sorted({rec["path"] for _, rec in items}),
            }
            for rank, (_, items) in enumerate(final, 1)
        ],
    }
    (out_dir / "refine_manifest.json").write_text(json.dumps(manifest, indent=2))
    print(f"Refine manifest -> {out_dir / 'refine_manifest.json'}")

def main() -> None:
    p = argparse.ArgumentParser(description=__doc__)
    sub = p.add_subparsers(dest="cmd", required=True)

    pe = sub.add_parser("embed")
    pe.add_argument("src_dir", type=Path)
    pe.add_argument("cache", type=Path)

    pc = sub.add_parser("cluster")
    pc.add_argument("cache", type=Path)
    pc.add_argument("out_dir", type=Path)
    pc.add_argument("--threshold", type=float, default=0.55)
    pc.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pc.add_argument("--dry-run", action="store_true")

    pr = sub.add_parser("refine")
    pr.add_argument("cache", type=Path)
    pr.add_argument("out_dir", type=Path)
    pr.add_argument("--initial-threshold", type=float, default=0.55)
    pr.add_argument("--merge-threshold", type=float, default=0.40)
    pr.add_argument("--outlier-threshold", type=float, default=0.55)
    pr.add_argument("--min-faces", type=int, default=15)
    pr.add_argument("--min-short", type=int, default=90)
    pr.add_argument("--min-blur", type=float, default=40.0)
    pr.add_argument("--min-det-score", type=float, default=0.6)
    pr.add_argument("--mode", choices=["copy", "move", "symlink"], default="copy")
    pr.add_argument("--dry-run", action="store_true")

    args = p.parse_args()
    if args.cmd == "embed":
        cmd_embed(args.src_dir, args.cache)
    elif args.cmd == "cluster":
        cmd_cluster(args.cache, args.out_dir, args.threshold, args.mode, args.dry_run)
    elif args.cmd == "refine":
        cmd_refine(
            args.cache, args.out_dir,
            args.initial_threshold, args.merge_threshold, args.outlier_threshold,
            args.min_faces, args.min_short, args.min_blur, args.min_det_score,
            args.mode, args.dry_run,
        )


if __name__ == "__main__":
    main()