face-sets/work/cluster_immich.py

#!/usr/bin/env python3
"""Discover new identities in an Immich-sourced cache and emit them as facesets.

Mirrors `work/cluster_osrc.py`, but the source corpus is an arbitrary
Immich user's `immich_<user>.npz` cache produced by the Windows DML embed
worker. Existing identity centroids come from the union of every faceset
already in `facesets_swap_ready/` (faceset_001..NNN, both auto-clustered
and hand-sorted).

Pipeline:
 1. Load immich_<user>.npz; restrict to face records (drop noface).
 2. Build centroids of every existing canonical faceset in
    facesets_swap_ready/ (skip era splits and _thin/).
 3. Drop immich faces whose nearest existing centroid is within
    EXISTING_MATCH_THRESHOLD; those are already covered by the canonical set.
 4. Cluster the remaining among themselves at INITIAL_THRESHOLD.
 5. Per cluster: refine-equivalent gates (face_short, blur, det_score),
    plus outlier rejection at OUTLIER_THRESHOLD for clusters of size >= 4.
 6. Keep clusters whose surviving unique source-path count is >= MIN_FACES.
 7. Number kept clusters past the existing facesets_swap_ready/ max.
 8. Synthesize a refine_manifest, hand off to cmd_export_swap, move dirs into
    facesets_swap_ready/, drop a provenance marker, append to top-level
    manifest.json (preserving facesets / thin_eras).
"""

from __future__ import annotations

import argparse
import json
import shutil
import sys
from pathlib import Path

import numpy as np

REPO = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO))

from sort_faces import (  # noqa: E402
    _cluster_embeddings,
    cmd_export_swap,
    load_cache,
)

# ---- config -------------------------------------------------------------- #

REPO_WORK = REPO / "work"
SWAP_READY = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")

EXISTING_MATCH_THRESHOLD = 0.45
INITIAL_THRESHOLD = 0.55

MIN_FACES = 6
MIN_SHORT = 90
MIN_BLUR = 40.0
MIN_DET_SCORE = 0.6
OUTLIER_THRESHOLD = 0.55

TOP_N = 30
EXPORT_OUTLIER_THRESHOLD = 0.45
PAD_RATIO = 0.5
OUT_SIZE = 512
EXPORT_MIN_FACE_SHORT = 100


# ---- helpers ------------------------------------------------------------- #

def _normalize(v: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(v)
    return v / n if n > 0 else v


def _existing_identity_centroids(
    nl_cache: Path,
) -> tuple[np.ndarray, list[str]]:
    """Build identity centroids from every canonical faceset_NNN/ in
    facesets_swap_ready/. Era-split sub-dirs (faceset_001_<era>) and the
    _thin/ quarantine are skipped. Each faceset's manifest.json provides
    (source, bbox) keys we use to look up rows in nl_full.npz."""
    emb, meta, _src, _proc, _aliases = load_cache(nl_cache)
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch in {nl_cache}: {len(face_records)} vs {len(emb)}")
    bbox_idx = {(m["path"], tuple(m.get("bbox") or ())): i for i, m in enumerate(face_records)}

    centroids: list[np.ndarray] = []
    names: list[str] = []
    for d in sorted(SWAP_READY.iterdir()):
        if not d.is_dir():
            continue
        if d.name.startswith("_"):
            continue
        # Skip era-split sub-facesets (faceset_NNN_*).
        if d.name.startswith("faceset_") and "_" in d.name[len("faceset_"):]:
            continue
        man = d / "manifest.json"
        if not man.exists():
            continue
        try:
            entries = json.loads(man.read_text()).get("faces", [])
        except Exception:
            continue
        keys = [(f["source"], tuple(f.get("bbox") or ())) for f in entries]
        idxs = [bbox_idx[k] for k in keys if k in bbox_idx]
        if not idxs:
            continue
        centroids.append(_normalize(emb[idxs].mean(axis=0)))
        names.append(d.name)
    if not centroids:
        raise SystemExit("no canonical identity centroids could be built; check facesets_swap_ready/")
    return np.stack(centroids), names


def _next_faceset_number() -> int:
    nums = []
    for d in SWAP_READY.iterdir():
        if not d.is_dir() or not d.name.startswith("faceset_"):
            continue
        tail = d.name[len("faceset_"):]
        # Take only top-level numbered facesets (no era suffix).
        if "_" in tail:
            continue
        try:
            nums.append(int(tail))
        except ValueError:
            continue
    return (max(nums) + 1) if nums else 1


# ---- phase 1: discover --------------------------------------------------- #

def discover_new_clusters(
    immich_cache: Path, nl_cache: Path, start_nnn: int, source_label: str
) -> tuple[dict, list[dict]]:
    print(f"loading immich cache: {immich_cache}")
    emb, meta, _src, _proc, _aliases = load_cache(immich_cache)
    face_records = [m for m in meta if not m.get("noface")]
    if len(face_records) != len(emb):
        raise SystemExit(f"meta/embedding mismatch: {len(face_records)} vs {len(emb)}")
    print(f"  {len(face_records)} face records, {sum(1 for m in meta if m.get('noface'))} noface")

    print(f"building existing-identity centroids from {SWAP_READY}")
    cents, cent_names = _existing_identity_centroids(nl_cache)
    print(f"  {len(cent_names)} canonical centroids")

    sims = emb @ cents.T
    nearest_d = 1.0 - sims.max(axis=1)
    nearest_id = sims.argmax(axis=1)
    covered = nearest_d <= EXISTING_MATCH_THRESHOLD
    print(f"\nfaces already covered (cos-dist <= {EXISTING_MATCH_THRESHOLD}): "
          f"{int(covered.sum())}/{len(emb)}")
    for j, name in enumerate(cent_names):
        c = int(((nearest_id == j) & covered).sum())
        if c:
            print(f"  -> {name}: {c}")

    new_idx = [i for i in range(len(emb)) if not covered[i]]
    print(f"\nunmatched immich faces to cluster: {len(new_idx)}")
    if len(new_idx) <= 1:
        labels = np.zeros(len(new_idx), dtype=int)
    else:
        labels = _cluster_embeddings(emb[new_idx], INITIAL_THRESHOLD)
    n_clusters = len(set(int(l) for l in labels))
    sizes = sorted([int((labels == l).sum()) for l in set(labels)], reverse=True)
    print(f"clusters at threshold {INITIAL_THRESHOLD}: {n_clusters}  "
          f"top sizes: {sizes[:10]}")

    clusters: dict[int, list[int]] = {}
    for k, lab in enumerate(labels):
        clusters.setdefault(int(lab), []).append(new_idx[k])

    kept: list[dict] = []
    drop_quality_total = 0
    drop_outlier_total = 0
    for cid, idxs in clusters.items():
        good: list[int] = []
        for i in idxs:
            r = face_records[i]
            if r.get("face_short", 0) < MIN_SHORT:
                drop_quality_total += 1; continue
            if r.get("blur", 0.0) < MIN_BLUR:
                drop_quality_total += 1; continue
            if r.get("det_score", 0.0) < MIN_DET_SCORE:
                drop_quality_total += 1; continue
            good.append(i)
        if not good:
            continue
        if len(good) >= 4:
            cent = _normalize(emb[good].mean(axis=0))
            d = 1.0 - emb[good] @ cent
            tight = [good[k] for k, dist in enumerate(d) if dist <= OUTLIER_THRESHOLD]
            drop_outlier_total += len(good) - len(tight)
            good = tight
        if not good:
            continue
        unique_paths = sorted({face_records[i]["path"] for i in good})
        if len(unique_paths) < MIN_FACES:
            continue
        kept.append({
            "indices": good,
            "unique_paths": unique_paths,
            "size_face": len(good),
            "size_paths": len(unique_paths),
        })

    kept.sort(key=lambda c: -c["size_paths"])
    print(f"\nafter quality+outlier+min_faces: {len(kept)} clusters kept "
          f"(dropped: quality={drop_quality_total} outlier={drop_outlier_total})")
    for rank, c in enumerate(kept, start=start_nnn):
        print(f"  faceset_{rank:03d}: faces={c['size_face']:3d} "
              f"unique_paths={c['size_paths']:3d}")

    facesets = [
        {
            "name": f"faceset_{rank:03d}",
            "image_count": c["size_paths"],
            "face_count": c["size_face"],
            "images": c["unique_paths"],
        }
        for rank, c in enumerate(kept, start=start_nnn)
    ]
    manifest = {
        "params": {
            "existing_match_threshold": EXISTING_MATCH_THRESHOLD,
            "initial_threshold": INITIAL_THRESHOLD,
            "outlier_threshold": OUTLIER_THRESHOLD,
            "min_faces": MIN_FACES,
            "min_short": MIN_SHORT,
            "min_blur": MIN_BLUR,
            "min_det_score": MIN_DET_SCORE,
            "source_label": source_label,
            "source_cache": str(immich_cache),
        },
        "facesets": facesets,
    }
    return manifest, kept


# ---- phase 2: export + relocate ----------------------------------------- #

def export_and_relocate(manifest: dict, immich_cache: Path, source_label: str) -> None:
    synth_path = REPO_WORK / f"synthetic_{source_label}_manifest.json"
    synth_path.write_text(json.dumps(manifest, indent=2))
    print(f"\nsynthetic manifest -> {synth_path}")

    out_tmp = SWAP_READY.parent / f"facesets_swap_ready_{source_label}_new"
    if out_tmp.exists():
        shutil.rmtree(out_tmp)
    out_tmp.mkdir(parents=True)

    print(f"running cmd_export_swap -> {out_tmp}")
    cmd_export_swap(
        cache_path=immich_cache,
        refine_manifest_path=synth_path,
        raw_manifest_path=None,
        out_dir=out_tmp,
        top_n=TOP_N,
        outlier_threshold=EXPORT_OUTLIER_THRESHOLD,
        pad_ratio=PAD_RATIO,
        out_size=OUT_SIZE,
        include_candidates=False,
        candidate_match_threshold=0.55,
        candidate_min_score=0.40,
        min_face_short=EXPORT_MIN_FACE_SHORT,
    )

    new_top = json.loads((out_tmp / "manifest.json").read_text())
    new_entries = new_top.get("facesets", [])

    moved = 0
    for fs_meta in new_entries:
        name = fs_meta["name"]
        src_dir = out_tmp / name
        if not src_dir.exists():
            print(f"[{name}] export dir missing; skipping")
            continue
        dst_dir = SWAP_READY / name
        if dst_dir.exists():
            print(f"[{name}] {dst_dir} already exists; refusing to overwrite")
            continue
        (src_dir / f"immich_{source_label}.txt").write_text(
            f"{name}\n\nSource: Immich user {source_label} cluster (auto-discovered).\n"
        )
        shutil.move(str(src_dir), str(dst_dir))
        moved += 1
        print(f"[{name}] -> {dst_dir}")

    final_manifest_path = SWAP_READY / "manifest.json"
    if final_manifest_path.exists():
        existing = json.loads(final_manifest_path.read_text())
    else:
        existing = {"facesets": []}
    existing.setdefault("facesets", [])
    existing_names = {fs["name"] for fs in existing["facesets"]}
    appended = 0
    for entry in new_entries:
        if entry["name"] in existing_names:
            print(f"[manifest] {entry['name']} already present; not duplicating")
            continue
        existing["facesets"].append(entry)
        appended += 1
    final_manifest_path.write_text(json.dumps(existing, indent=2))
    print(f"\nmerged manifest: appended {appended} entries -> {final_manifest_path}")
    print(f"moved {moved} faceset directories into {SWAP_READY}")
    if out_tmp.exists() and not list(out_tmp.iterdir()):
        out_tmp.rmdir()


# ---- main ---------------------------------------------------------------- #

def main() -> None:
    p = argparse.ArgumentParser()
    p.add_argument("immich_cache", type=Path,
                   help="path to immich_<user>.npz produced by the embed worker")
    p.add_argument("--nl-cache", type=Path, default=REPO_WORK / "cache" / "nl_full.npz",
                   help="canonical cache for existing identity centroids")
    p.add_argument("--source-label", default=None,
                   help="short label used in marker filenames; default = stem of immich_cache")
    p.add_argument("--start-nnn", type=int, default=None,
                   help="first faceset number to assign; default = current max+1 in facesets_swap_ready/")
    p.add_argument("--dry-run", action="store_true")
    args = p.parse_args()

    label = args.source_label or args.immich_cache.stem.removeprefix("immich_") or args.immich_cache.stem
    start_nnn = args.start_nnn if args.start_nnn is not None else _next_faceset_number()
    print(f"source label: {label!r}; first faceset number: {start_nnn:03d}")

    manifest, kept = discover_new_clusters(args.immich_cache, args.nl_cache, start_nnn, label)
    if args.dry_run:
        print("\n--dry-run: stopping after cluster discovery (no exports written).")
        return
    if not manifest.get("facesets"):
        print("no new facesets to build.")
        return
    export_and_relocate(manifest, args.immich_cache, label)
    print("\nDone.")


if __name__ == "__main__":
    main()