Add post-export corpus maintenance pipeline

Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores
  via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level
  quarantine at 40% domain dominance.

- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= edge.

- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered.

- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three
  passes — cross-family SHA256 byte-dedup (preserves intra-family era
  duplication), within-faceset near-dup at sim >= 0.95, and a multi-face
  audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s
  on AMD Vega — ~7x embed_worker because input is 512x512 crops.

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks.

Three new docs/analysis/ writeups cover model choice, threshold rationale,
and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-27 15:41:18 +02:00
parent e66c97fd58
commit 49a43c7685
10 changed files with 3250 additions and 1 deletions

576
work/age_extend_001.py Normal file
View File

@@ -0,0 +1,576 @@
"""Extend the existing 6 era buckets of faceset_001 by absorbing PNGs that
post-date the original age_split run (from consolidation merges, etc.).
Mirrors the anchor-fragment assignment logic in age_split_001.py:
- For each unbucketed face in faceset_001's manifest, find the nearest active
era anchor by cosine distance to the anchor's centroid.
- Accept the assignment iff dist <= 0.40 AND |year_delta| <= 5
(where year_delta = exif_year(face) - dom_year(anchor)).
- Undated PNGs are skipped (no assignment).
- Anchors are NOT re-centered after absorption (preserves the same drift
guarantees as the original age_split).
CLI:
python work/age_extend_001.py analyze --out work/age_extend/candidates.json
python work/age_extend_001.py report --candidates ... --out work/age_extend
python work/age_extend_001.py apply --candidates ... [--dry-run]
"""
from __future__ import annotations
import argparse
import json
import shutil
import sys
import time
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags
# Root of the exported corpus; every faceset directory lives directly under it.
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
# The faceset whose era buckets this script extends.
PARENT = "faceset_001"
# Era buckets produced by the original age_split run; each is a directory under
# ROOT with its own manifest.json and faces/ subdirectory.
ACTIVE_ERAS = [
    "faceset_001_2005-10",
    "faceset_001_2010-13",
    "faceset_001_2011",
    "faceset_001_2014-17",
    "faceset_001_2018-19",
    "faceset_001_2018-20",
]
# arcface embedding caches (npz) produced by earlier embed runs.
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
# Persisted {source_path: exif_year_or_null} lookups, shared across runs.
EXIF_CACHE = Path("/opt/face-sets/work/cache/age_split_exif.json")
# anchor-fragment thresholds (mirror age_split_001.py)
DIST_MAX = 0.40  # max cosine distance to the nearest anchor centroid
YEAR_MAX = 5  # max |exif_year(face) - dom_year(anchor)|
# ----------------------------- caches -----------------------------
def load_caches():
    """Index cached arcface embeddings from every npz in CACHES.

    Returns (rec_index, alias_map): rec_index maps (path, bbox_tuple) to an
    L2-normalized float32 embedding; alias_map maps every known path alias
    to its canonical path.
    """
    rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
    alias_map: dict[str, str] = {}
    for cache_path in CACHES:
        if not cache_path.exists():
            print(f"[warn] cache missing: {cache_path}", file=sys.stderr)
            continue
        blob = np.load(cache_path, allow_pickle=True)
        embeddings = blob["embeddings"]
        meta = json.loads(str(blob["meta"]))
        face_records = [rec for rec in meta if not rec.get("noface")]
        if len(face_records) != len(embeddings):
            raise SystemExit(f"meta/emb mismatch in {cache_path}: {len(face_records)} vs {len(embeddings)}")
        if "path_aliases" in blob.files:
            for canon, alist in json.loads(str(blob["path_aliases"])).items():
                alias_map.setdefault(canon, canon)
                for alias in alist:
                    alias_map[alias] = canon
        for raw_vec, rec in zip(embeddings, face_records):
            path = rec["path"]
            bbox_key = tuple(int(b) for b in rec["bbox"])
            vec = raw_vec.astype(np.float32)
            norm = float(np.linalg.norm(vec))
            if norm > 0:
                vec = vec / norm
            rec_index[(path, bbox_key)] = vec
            alias_map.setdefault(path, path)
    print(f"[cache] indexed {len(rec_index)} face records, {len(alias_map)} aliases", file=sys.stderr)
    return rec_index, alias_map
def lookup_emb(rec_index, alias_map, src: str, bbox):
    """Fetch the cached embedding for (src, bbox).

    Tries the canonical alias of src first, then falls back to the raw path
    when the two differ.  Returns None when neither key is present.
    """
    bbox_key = tuple(int(b) for b in bbox)
    canonical = alias_map.get(src, src)
    emb = rec_index.get((canonical, bbox_key))
    if emb is None and canonical != src:
        emb = rec_index.get((src, bbox_key))
    return emb
# ----------------------------- exif -----------------------------
def load_exif_cache():
    """Load the persisted EXIF-year cache, or an empty dict when absent."""
    if EXIF_CACHE.exists():
        return json.loads(EXIF_CACHE.read_text())
    return {}
def save_exif_cache(cache):
    """Persist the EXIF-year cache atomically (write tmp, then rename over)."""
    scratch = EXIF_CACHE.with_suffix(".tmp.json")
    scratch.write_text(json.dumps(cache, indent=2))
    scratch.replace(EXIF_CACHE)
def exif_year(path: Path) -> int | None:
    """Best-effort EXIF DateTimeOriginal year for an image, else None.

    Any read/parse failure (corrupt file, no EXIF support, bad value) is
    swallowed and reported as None — callers treat that as "undated".
    """
    try:
        with Image.open(path) as im:
            raw = im._getexif()  # JPEG-only private API; absent -> AttributeError -> None
            if not raw:
                return None
            for tag_id, value in raw.items():
                if ExifTags.TAGS.get(tag_id, tag_id) != "DateTimeOriginal":
                    continue
                if isinstance(value, str) and len(value) >= 4:
                    return int(value[:4])
    except Exception:
        return None
    return None
def get_year(src: str, exif_cache) -> int | None:
"""Return EXIF year for src, using cache. Mutates cache for new lookups."""
if src in exif_cache:
return exif_cache[src]
p = Path(src)
y = exif_year(p) if p.exists() else None
exif_cache[src] = y
return y
# ----------------------------- analyze -----------------------------
def cmd_analyze(args):
    """Propose era-bucket assignments for unbucketed faceset_001 faces.

    Builds an L2-normalized centroid + dominant EXIF year per active era
    bucket, finds every parent face not already in a bucket, and accepts the
    nearest-anchor assignment iff dist <= DIST_MAX and |year_delta| <= YEAR_MAX.
    Writes the full proposal list (accepted and rejected) as JSON to --out.
    """
    rec_index, alias_map = load_caches()
    exif_cache = load_exif_cache()
    exif_cache_dirty = False
    parent_dir = ROOT / PARENT
    parent_manifest = json.loads((parent_dir / "manifest.json").read_text())
    parent_faces = parent_manifest.get("faces", [])
    print(f"[parent] {PARENT}: {len(parent_faces)} face entries", file=sys.stderr)
    # Build "in_bucket" set + each anchor's centroid + dom_year
    anchors = []
    in_bucket: set[tuple[str, tuple[int, int, int, int]]] = set()
    for era in ACTIVE_ERAS:
        ed = ROOT / era
        if not ed.is_dir():
            print(f"[warn] missing era bucket: {era}", file=sys.stderr)
            continue
        em = json.loads((ed / "manifest.json").read_text())
        emb_list = []
        years = []
        n_missing_emb = 0
        for f in em.get("faces", []):
            src = f.get("source")
            bbox = f.get("bbox")
            if not src or not bbox:
                continue
            key = (alias_map.get(src, src), tuple(int(x) for x in bbox))
            in_bucket.add(key)
            in_bucket.add((src, tuple(int(x) for x in bbox)))  # cover both alias and raw
            v = lookup_emb(rec_index, alias_map, src, bbox)
            if v is None:
                n_missing_emb += 1
            else:
                emb_list.append(v)
            # BUGFIX: the cache is dirty iff get_year() performed a NEW lookup.
            # The old code tested `src not in exif_cache` AFTER get_year()
            # (always False, since get_year always inserts) and instead set the
            # flag whenever y was None — so freshly discovered non-None years
            # were never flushed unless an undated face happened to be seen.
            is_new_lookup = src not in exif_cache
            y = get_year(src, exif_cache)
            if is_new_lookup:
                exif_cache_dirty = True
            if y is not None:
                years.append(y)
        if not emb_list:
            print(f"[warn] {era}: no embeddings found, skipping anchor", file=sys.stderr)
            continue
        arr = np.stack(emb_list).astype(np.float32)
        c = arr.mean(axis=0)
        n = float(np.linalg.norm(c))
        if n > 0:
            c = c / n  # unit-normalize so dot products below are cosine sims
        dom_year = Counter(years).most_common(1)[0][0] if years else None
        anchors.append({
            "name": era, "centroid": c, "n_faces": len(em.get("faces", [])),
            "n_emb_used": len(emb_list), "n_emb_missing": n_missing_emb,
            "dom_year": dom_year,
            "year_min": min(years) if years else None,
            "year_max": max(years) if years else None,
        })
        print(f"[anchor] {era}: n={len(em.get('faces', []))} emb_used={len(emb_list)} "
              f"emb_miss={n_missing_emb} dom_year={dom_year} years=[{min(years) if years else '-'}..{max(years) if years else '-'}]",
              file=sys.stderr)
    # Find unbucketed faces in parent
    unbucketed = []
    for f in parent_faces:
        src = f.get("source")
        bbox = f.get("bbox")
        if not src or not bbox:
            continue
        bbox_t = tuple(int(x) for x in bbox)
        key1 = (alias_map.get(src, src), bbox_t)
        key2 = (src, bbox_t)
        if key1 in in_bucket or key2 in in_bucket:
            continue
        unbucketed.append(f)
    print(f"[parent] {len(unbucketed)} unbucketed face entries (in {PARENT} but no era bucket)", file=sys.stderr)
    # Score each unbucketed face against every anchor
    proposals = []
    skipped_no_emb = 0
    skipped_no_year = 0
    for f in unbucketed:
        src = f["source"]
        bbox = f["bbox"]
        v = lookup_emb(rec_index, alias_map, src, bbox)
        if v is None:
            skipped_no_emb += 1
            continue
        # Same BUGFIX as above: record cache membership BEFORE the lookup.
        is_new_lookup = src not in exif_cache
        y = get_year(src, exif_cache)
        if is_new_lookup:
            exif_cache_dirty = True
        if y is None:
            skipped_no_year += 1
            continue
        # nearest anchor by cosine distance
        best = None  # (dist, idx)
        for i, a in enumerate(anchors):
            d = 1.0 - float(np.dot(a["centroid"], v))
            if best is None or d < best[0]:
                best = (d, i)
        if best is None:
            continue
        dist, bidx = best
        anchor = anchors[bidx]
        year_delta = abs(y - anchor["dom_year"]) if anchor["dom_year"] is not None else None
        accept = (dist <= DIST_MAX and year_delta is not None and year_delta <= YEAR_MAX)
        # ROBUSTNESS: manifests may carry "quality": null; `or {}` guards the
        # chained .get() (old code raised AttributeError on null).
        quality = f.get("quality") or {}
        proposals.append({
            "png": f["png"],
            "source": src,
            "bbox": [int(x) for x in bbox],
            "year": y,
            "rank_in_parent": f.get("rank"),
            "quality_composite": quality.get("composite"),
            "quality": quality,
            "best_anchor": anchor["name"],
            "best_anchor_dom_year": anchor["dom_year"],
            "centroid_dist": round(dist, 4),
            "year_delta": year_delta,
            "accept": bool(accept),
            "all_anchor_dists": {
                a["name"]: round(1.0 - float(np.dot(a["centroid"], v)), 4) for a in anchors
            },
        })
    if exif_cache_dirty:
        save_exif_cache(exif_cache)
        print(f"[exif] cache flushed ({len(exif_cache)} entries total)", file=sys.stderr)
    # Summarize
    accepted = [p for p in proposals if p["accept"]]
    rejected = [p for p in proposals if not p["accept"]]
    by_anchor = Counter(p["best_anchor"] for p in accepted)
    print(f"[summary] unbucketed={len(unbucketed)} scored={len(proposals)} "
          f"accepted={len(accepted)} rejected={len(rejected)} "
          f"skipped(no_emb={skipped_no_emb}, no_year={skipped_no_year})", file=sys.stderr)
    for k, v in by_anchor.most_common():
        print(f"  {k}: +{v}", file=sys.stderr)
    out = {
        "thresholds": {"dist_max": DIST_MAX, "year_max": YEAR_MAX},
        "anchors": [
            {k: v for k, v in a.items() if k != "centroid"}
            for a in anchors
        ],
        "n_unbucketed": len(unbucketed),
        "skipped": {"no_emb": skipped_no_emb, "no_year": skipped_no_year},
        # accepted first, then grouped by anchor, best quality first
        "proposals": sorted(proposals, key=lambda p: (not p["accept"], p["best_anchor"], -1 * (p["quality_composite"] or 0))),
        "by_anchor": dict(by_anchor),
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(out, indent=2))
    print(f"[done] {len(proposals)} proposals -> {op}", file=sys.stderr)
# ----------------------------- report -----------------------------
def cmd_report(args):
    """Render an HTML review page (out_dir/index.html) of the proposals.

    Accepted faces are grouped per target era bucket; rejected faces (capped
    at 200) carry a short reason string.  Thumbnails are cached under
    out_dir/thumbs and reused across runs.
    """
    cand = json.loads(Path(args.candidates).read_text())
    out_dir = Path(args.out)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(parents=True, exist_ok=True)
    THUMB = 140
    def make_thumb(png_relpath: str) -> str:
        # png_relpath looks like "faces/0042.png"; returns the page-relative
        # thumb URL, or "" when the thumbnail could not be generated.
        src = ROOT / PARENT / png_relpath
        name = Path(png_relpath).stem
        dst = thumbs_dir / f"{name}.jpg"
        if not dst.exists():
            try:
                img = Image.open(src).convert("RGB")
                img.thumbnail((THUMB, THUMB), Image.LANCZOS)
                img.save(dst, "JPEG", quality=82)
            except Exception as e:
                print(f"[thumb-skip] {src}: {e}", file=sys.stderr)
                return ""
        return f"thumbs/{name}.jpg"
    # group accepted proposals by target anchor
    by_anchor: dict[str, list] = {}
    rejected = []
    for p in cand["proposals"]:
        if p["accept"]:
            by_anchor.setdefault(p["best_anchor"], []).append(p)
        else:
            rejected.append(p)
    rows = []
    rows.append("<h1>faceset_001 age extension &mdash; review</h1>")
    rows.append(f"<p>{cand['n_unbucketed']} unbucketed faces in {PARENT}; "
                f"{sum(len(v) for v in by_anchor.values())} accepted / {len(rejected)} rejected; "
                f"thresholds dist&le;{cand['thresholds']['dist_max']} AND |year_delta|&le;{cand['thresholds']['year_max']}.</p>")
    nav = " · ".join(f"<a href='#{a}'>{a} (+{len(by_anchor[a])})</a>" for a in by_anchor) + " · <a href='#rejected'>rejected</a>"
    rows.append(f"<div class='nav'>{nav}</div>")
    for anchor_name in ACTIVE_ERAS:
        if anchor_name not in by_anchor:
            continue
        items = by_anchor[anchor_name]
        anchor_meta = next((a for a in cand["anchors"] if a["name"] == anchor_name), {})
        rows.append(f"<section id='{anchor_name}' class='grp'>")
        rows.append(f"<h2>{anchor_name} <small>(dom_year={anchor_meta.get('dom_year')}; "
                    f"existing n={anchor_meta.get('n_faces')}; +{len(items)} new)</small></h2>")
        rows.append("<div class='cells'>")
        for p in sorted(items, key=lambda x: (x["centroid_dist"], -1 * (x["quality_composite"] or 0))):
            thumb = make_thumb(p["png"])
            cls = "hi" if p["centroid_dist"] <= 0.30 else "mid"
            rows.append(
                f"<div class='cell'>"
                f"<img src='{thumb}' loading='lazy' title='{p['png']}'>"
                # BUGFIX: was "year {p['year']}{p['year_delta']})" which rendered
                # e.g. "year 20113)" — the "(Δ" opening was missing.
                f"<div class='meta'>{p['png']}<br>year {p['year']} (Δ{p['year_delta']})<br>"
                f"<span class='{cls}'>dist {p['centroid_dist']:.3f}</span></div>"
                f"</div>"
            )
        rows.append("</div></section>")
    if rejected:
        rows.append("<section id='rejected' class='grp rej'>")
        rows.append(f"<h2>rejected <small>({len(rejected)} faces don't fit any anchor)</small></h2>")
        rows.append("<div class='cells'>")
        for p in sorted(rejected, key=lambda x: x["centroid_dist"])[:200]:
            thumb = make_thumb(p["png"])
            why = []
            if p["centroid_dist"] > cand['thresholds']['dist_max']:
                why.append(f"dist {p['centroid_dist']:.2f}>{cand['thresholds']['dist_max']}")
            if p["year_delta"] is None or p["year_delta"] > cand['thresholds']['year_max']:
                # BUGFIX: label the year reason; the old string was a bare
                # "None>5" / "7>5" with no hint which threshold fired.
                why.append(f"Δyear {p['year_delta']}>{cand['thresholds']['year_max']}")
            rows.append(
                f"<div class='cell'>"
                f"<img src='{thumb}' loading='lazy'>"
                f"<div class='meta'>{p['png']}<br>year {p['year']} → best {p['best_anchor']}<br>"
                f"<span class='lo'>{'; '.join(why)}</span></div>"
                f"</div>"
            )
        if len(rejected) > 200:
            rows.append(f"<p>...{len(rejected)-200} more truncated.</p>")
        rows.append("</div></section>")
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>faceset_001 age extension</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1 {{ margin-top:0; }} h2 {{ margin:0; }}
small {{ color:#999; font-weight:normal; }}
section.grp {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
section.grp.rej {{ border-left:4px solid #ff5050; }}
.cells {{ display:flex; flex-wrap:wrap; gap:6px; }}
.cell {{ background:#222; border-radius:4px; padding:4px; width:160px; font-size:11px; font-family:monospace; text-align:center; }}
.cell img {{ height:140px; width:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.3; }}
.hi {{ color:#5fa05f; font-weight:bold; }}
.mid {{ color:#ffb050; }}
.lo {{ color:#ff5050; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:13px; }}
a {{ color:#6cf; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- apply -----------------------------
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def cmd_apply(args):
    """Absorb accepted proposals into their era buckets.

    For each target bucket: copy the accepted parent PNGs in, re-rank the
    combined face list by quality.composite (desc), renumber faces/0001.png..,
    rebuild the .fsz archives, and update both the per-bucket and master
    manifests (with an age_extend provenance block).  --dry-run only prints
    the per-anchor assignment counts.
    """
    cand = json.loads(Path(args.candidates).read_text())
    accepted = [p for p in cand["proposals"] if p["accept"]]
    if args.dry_run:
        from collections import Counter as C
        by = C(p["best_anchor"] for p in accepted)
        print(f"=== dry-run: {len(accepted)} assignments across {len(by)} anchors ===")
        for k, v in by.most_common():
            print(f"  {k}: +{v}")
        return
    parent_dir = ROOT / PARENT
    master_path = ROOT / "manifest.json"
    master = json.loads(master_path.read_text())
    facesets_by_name = {f["name"]: f for f in master.get("facesets", [])}
    by_anchor: dict[str, list] = {}
    for p in accepted:
        by_anchor.setdefault(p["best_anchor"], []).append(p)
    total_added = 0
    for anchor_name, props in by_anchor.items():
        ed = ROOT / anchor_name
        em_path = ed / "manifest.json"
        em = json.loads(em_path.read_text())
        existing = list(em.get("faces", []))
        # gather new entries with their source PNG paths in faceset_001/faces/
        new_with_src = []
        for p in props:
            src_png = parent_dir / p["png"]
            if not src_png.exists():
                print(f"[warn] missing parent PNG {src_png}; skip", file=sys.stderr)
                continue
            face_entry = {
                "source": p["source"],
                "bbox": p["bbox"],
                "quality": p["quality"],
                "exif_year": p["year"],
                "centroid_dist_at_assign": p["centroid_dist"],
                "year_delta_at_assign": p["year_delta"],
                "extended_from_parent": True,
            }
            new_with_src.append((face_entry, src_png))
        # combine; rank by quality.composite desc (existing entries already have rank,
        # but we re-rank globally so new entries slot in by quality)
        combined: list[tuple[dict, Path | None]] = []
        for f in existing:
            combined.append((f, None))
        combined.extend(new_with_src)
        # BUGFIX: manifests can carry quality=null or composite=null; the old
        # key `-x[0].get("quality", {}).get("composite", 0)` raised
        # AttributeError / TypeError on those.  `or` coalesces both to 0.
        combined.sort(key=lambda x: -((x[0].get("quality") or {}).get("composite") or 0))
        # stage fresh copies under _faces_new, then swap into place below
        staging = ed / "_faces_new"
        if staging.exists():
            shutil.rmtree(staging)
        staging.mkdir()
        new_face_entries = []
        for new_rank, (face, src_png_or_none) in enumerate(combined, start=1):
            new_name = f"{new_rank:04d}.png"
            if src_png_or_none is None:
                # existing entry: copy from current era bucket faces/
                old_name = Path(face["png"]).name
                src = ed / "faces" / old_name
                if not src.exists():
                    print(f"[warn] {anchor_name}: missing existing PNG {src}; skip", file=sys.stderr)
                    continue
                shutil.copy2(src, staging / new_name)
            else:
                shutil.copy2(src_png_or_none, staging / new_name)
            face = dict(face)
            face["rank"] = new_rank
            face["png"] = f"faces/{new_name}"
            new_face_entries.append(face)
        # swap dirs
        old_holding = ed / "_faces_old"
        if old_holding.exists():
            shutil.rmtree(old_holding)
        (ed / "faces").rename(old_holding)
        staging.rename(ed / "faces")
        shutil.rmtree(old_holding)
        # re-zip .fsz
        survivor_pngs = sorted((ed / "faces").glob("*.png"))
        top_n = em.get("top_n", 30)
        top_n_eff = min(top_n, len(survivor_pngs))
        for old in ed.glob("*.fsz"):
            old.unlink()
        top_fsz_name = f"{anchor_name}_top{top_n_eff}.fsz"
        all_fsz_name = f"{anchor_name}_all.fsz"
        _zip_png_list(survivor_pngs[:top_n_eff], ed / top_fsz_name)
        if len(survivor_pngs) > top_n_eff:
            _zip_png_list(survivor_pngs, ed / all_fsz_name)
            all_fsz_used = all_fsz_name
        else:
            # top archive already holds everything; no separate _all.fsz
            all_fsz_used = None
        # update local + master manifests
        em["faces"] = new_face_entries
        em["exported"] = len(new_face_entries)
        em["fsz_top"] = top_fsz_name
        em["fsz_all"] = all_fsz_used
        em["top_n"] = top_n_eff
        em.setdefault("age_extend_history", []).append({
            "added": len(new_with_src),
            "thresholds": cand["thresholds"],
        })
        em_path.write_text(json.dumps(em, indent=2))
        if anchor_name in facesets_by_name:
            facesets_by_name[anchor_name]["exported"] = len(new_face_entries)
            facesets_by_name[anchor_name]["fsz_top"] = top_fsz_name
            facesets_by_name[anchor_name]["fsz_all"] = all_fsz_used
            facesets_by_name[anchor_name]["top_n"] = top_n_eff
        added_here = len(new_with_src)
        total_added += added_here
        print(f"[applied] {anchor_name}: +{added_here} (now {len(new_face_entries)} faces)", file=sys.stderr)
    # rewrite master with ordering preserved
    new_facesets = []
    for entry in master.get("facesets", []):
        new_facesets.append(facesets_by_name.get(entry["name"], entry))
    master["facesets"] = new_facesets
    master.setdefault("age_extend_runs", []).append({
        "parent": PARENT,
        "thresholds": cand["thresholds"],
        "anchors": list(by_anchor.keys()),
        "added_total": total_added,
    })
    # atomic master rewrite: tmp file then rename over
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(master, indent=2))
    tmp.replace(master_path)
    print(f"[done] +{total_added} faces across {len(by_anchor)} anchors", file=sys.stderr)
# ----------------------------- main -----------------------------
def main():
    """CLI entry point: dispatch to analyze / report / apply subcommands."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)
    analyze = commands.add_parser("analyze")
    analyze.add_argument("--out", required=True)
    analyze.set_defaults(func=cmd_analyze)
    report = commands.add_parser("report")
    report.add_argument("--candidates", required=True)
    report.add_argument("--out", required=True)
    report.set_defaults(func=cmd_report)
    apply_cmd = commands.add_parser("apply")
    apply_cmd.add_argument("--candidates", required=True)
    apply_cmd.add_argument("--dry-run", action="store_true")
    apply_cmd.set_defaults(func=cmd_apply)
    ns = parser.parse_args()
    ns.func(ns)
if __name__ == "__main__":
    main()

221
work/clip_worker.py Normal file
View File

@@ -0,0 +1,221 @@
"""Windows / DirectML CLIP worker for occlusion scoring.
Reads a queue.json staged by /opt/face-sets/work/filter_occlusions.py (WSL side),
runs open_clip ViT-L-14 (dfn2b_s39b) on each PNG via torch-directml on the AMD
Vega, and writes a scores.json with mask + sunglasses softmax probabilities.
CLI:
py -3.12 clip_worker.py <queue.json> <out_scores.json> [--limit N] [--batch 8]
queue.json shape: list of objects
{"wsl_path": "...", "win_path": "E:\\...\\faceset_NNN\\faces\\NNNN.png",
"faceset": "faceset_NNN", "file": "NNNN.png"}
scores.json shape:
{"model": "ViT-L-14/dfn2b_s39b",
"logit_scale": 100.0,
"prompts": {...},
"results": [{"wsl_path": "...", "faceset": "...", "file": "...",
"mask": float, "sunglasses": float}],
"processed": [wsl_path, ...]}
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
import warnings
from pathlib import Path
# DML emits a verbose UserWarning per attention call -- silence at import time
warnings.filterwarnings("ignore", category=UserWarning)
import torch
import torch_directml
import open_clip
from PIL import Image
# open_clip model id + pretrained weights tag
MODEL_NAME = "ViT-L-14"
PRETRAINED = "dfn2b_s39b"
# kept in sync with /opt/face-sets/work/filter_occlusions.py PROMPTS
# Zero-shot prompt ensembles: per attribute, "pos" describes the occlusion and
# "neg" a clear face; each side is mean-pooled into one text vector downstream.
PROMPTS = {
    "mask": {
        "pos": [
            "a photo of a person wearing a surgical face mask",
            "a photo of a person wearing an FFP2 respirator covering mouth and nose",
            "a photo of a person wearing a cloth face mask",
            "a face partially covered by a medical mask",
            "a person whose mouth and nose are hidden by a face mask",
        ],
        "neg": [
            "a photo of a person's face with mouth and nose clearly visible",
            "a clear, unobstructed photo of a face",
            "a photo of a face without any mask or covering",
            "a portrait of a person showing their full face",
            "a photo of a person with a beard and visible mouth",
        ],
    },
    "sunglasses": {
        "pos": [
            "a face with dark sunglasses covering the eyes",
            "a portrait with the eyes hidden behind opaque sunglasses",
            "a person wearing dark sunglasses over their eyes, eyes not visible",
            "a face where the eyes are completely concealed by tinted lenses",
            "a close-up portrait wearing aviator sunglasses on the eyes",
        ],
        "neg": [
            "a portrait with both eyes clearly visible and uncovered",
            "a face with sunglasses pushed up on the forehead, eyes visible below",
            "a face with sunglasses resting on top of the head, eyes visible",
            "a person with sunglasses hanging from their shirt, eyes visible",
            "a face wearing clear prescription eyeglasses with visible eyes",
            "a portrait with no eyewear and visible eyes",
        ],
    },
}
# checkpoint scores.json roughly every N scored images (see flush logic in main)
FLUSH_EVERY = 100
def load_existing(out_path: Path):
    """Resume support: parse a previous scores.json if present.

    Returns (data_or_None, processed_wsl_paths).  A missing or unparseable
    file yields (None, empty set) so the run starts fresh.
    """
    if not out_path.exists():
        return None, set()
    try:
        data = json.loads(out_path.read_text())
        done = set(data.get("processed", []))
    except Exception as e:
        print(f"[warn] could not parse existing {out_path}: {e}; starting fresh", file=sys.stderr)
        return None, set()
    return data, done
def save_atomic(out_path: Path, data: dict):
    """Write JSON to a sibling .tmp.json, then atomically replace out_path."""
    scratch = out_path.with_suffix(".tmp.json")
    scratch.write_text(json.dumps(data, indent=2))
    os.replace(scratch, out_path)
@torch.no_grad()
def build_text_features(model, tokenizer, device):
    """Encode PROMPTS into one mean-pooled, L2-normalized text vector per
    (attribute, side); returns {attr: (pos_vec, neg_vec)}."""
    features = {}
    for attr, sides in PROMPTS.items():
        pooled = {}
        for side in ("pos", "neg"):
            tokens = tokenizer(sides[side]).to(device)
            encoded = model.encode_text(tokens)
            encoded = encoded / encoded.norm(dim=-1, keepdim=True)
            mean_vec = encoded.mean(dim=0)
            pooled[side] = mean_vec / mean_vec.norm()
        features[attr] = (pooled["pos"], pooled["neg"])
    return features
def main():
    """Score pending queue entries with CLIP and write scores.json (resumable)."""
    ap = argparse.ArgumentParser()
    ap.add_argument("queue", type=Path)
    ap.add_argument("out", type=Path)
    ap.add_argument("--limit", type=int, default=None)
    ap.add_argument("--batch", type=int, default=8)
    args = ap.parse_args()
    queue = json.loads(args.queue.read_text())
    print(f"[queue] {len(queue)} entries from {args.queue}")
    args.out.parent.mkdir(parents=True, exist_ok=True)
    # resume: wsl_paths already listed in a previous scores.json are skipped
    existing, processed = load_existing(args.out)
    if existing:
        print(f"[resume] {len(processed)} entries already scored")
        results = existing.get("results", [])
    else:
        results = []
    pending = [e for e in queue if e["wsl_path"] not in processed]
    if args.limit is not None:
        pending = pending[: args.limit]
    print(f"[pending] {len(pending)} entries to score")
    if not pending:
        print("[done] nothing to do")
        return
    device = torch_directml.device()
    print(f"[load] {MODEL_NAME}/{PRETRAINED} on {torch_directml.device_name(0)}")
    t0 = time.time()
    model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED)
    tokenizer = open_clip.get_tokenizer(MODEL_NAME)
    model = model.to(device).eval()
    # temperature applied to cosine sims before the 2-way softmax below
    logit_scale = float(model.logit_scale.exp().detach().cpu())
    print(f"[load] ready in {time.time()-t0:.1f}s logit_scale={logit_scale:.2f}")
    text_feats = build_text_features(model, tokenizer, device)
    def flush():
        # atomic checkpoint: a crash/kill loses at most ~FLUSH_EVERY images
        save_atomic(args.out, {
            "model": f"{MODEL_NAME}/{PRETRAINED}",
            "logit_scale": logit_scale,
            "prompts": PROMPTS,
            "results": results,
            "processed": sorted(processed),
        })
    n_done_this_run = 0
    n_load_err = 0
    last_flush = time.time()
    t_start = time.time()
    for i in range(0, len(pending), args.batch):
        chunk = pending[i:i + args.batch]
        imgs = []
        keep = []
        for entry in chunk:
            try:
                img = Image.open(entry["win_path"]).convert("RGB")
                imgs.append(preprocess(img))
                keep.append(entry)
            except Exception as e:
                # unreadable PNG: count it and mark processed so a resumed
                # run does not retry it forever
                print(f"[skip] {entry['win_path']}: {e}", file=sys.stderr)
                n_load_err += 1
                processed.add(entry["wsl_path"])
        if not imgs:
            continue
        x = torch.stack(imgs).to(device)
        with torch.no_grad():
            feats = model.encode_image(x)
            feats = feats / feats.norm(dim=-1, keepdim=True)
        scores_per_attr = {}
        for attr, (pos, neg) in text_feats.items():
            # softmax over [pos, neg] sims -> probability attribute is present
            sims = torch.stack([feats @ pos, feats @ neg], dim=1) * logit_scale
            probs = sims.softmax(dim=1)[:, 0].detach().cpu().tolist()
            scores_per_attr[attr] = probs
        for j, entry in enumerate(keep):
            results.append({
                "wsl_path": entry["wsl_path"],
                "faceset": entry["faceset"],
                "file": entry["file"],
                "mask": round(scores_per_attr["mask"][j], 4),
                "sunglasses": round(scores_per_attr["sunglasses"][j], 4),
            })
            processed.add(entry["wsl_path"])
            n_done_this_run += 1
        # checkpoint roughly every FLUSH_EVERY images or every 30s
        if (n_done_this_run % FLUSH_EVERY < args.batch) or (time.time() - last_flush) > 30.0:
            flush()
            last_flush = time.time()
            elapsed = time.time() - t_start
            rate = n_done_this_run / max(0.1, elapsed)
            eta_min = (len(pending) - n_done_this_run) / max(0.1, rate) / 60.0
            print(f"[score] {n_done_this_run}/{len(pending)} "
                  f"rate={rate:.2f} img/s eta={eta_min:.1f}min "
                  f"load_err={n_load_err}", flush=True)
    flush()
    elapsed = time.time() - t_start
    print(f"[done] {n_done_this_run} scored, {n_load_err} load errors, "
          f"{elapsed:.1f}s ({n_done_this_run/max(0.1,elapsed):.2f} img/s) -> {args.out}")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,634 @@
"""Consolidate facesets_swap_ready/ — find duplicate identities and merge.
Pipeline:
1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every
active faceset (skipping _masked, _thin, era splits). Compute L2-normalized
centroid per faceset. Build similarity graph at sim>=0.45, extract components.
Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size.
2. report: HTML contact sheet at work/merge_review/index.html grouped by
candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and
"merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted.
3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite
descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries
to facesets_swap_ready/_merged/<name>/, update master manifest with
`merged[]` array + `merge_run` provenance block.
Embeddings come from caches (no GPU re-embed needed); the original clusterer used
exactly these vectors so they are the right yardstick. Era splits are excluded
entirely (intentional time-period segmentation, not a duplication).
"""
from __future__ import annotations
import argparse
import json
import re
import shutil
import sys
import time
from pathlib import Path
import numpy as np
from PIL import Image
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
# Root of the exported corpus; every faceset directory lives directly under it.
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
# arcface embedding caches (npz) produced by earlier embed runs.
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
# Era-split bucket names, e.g. faceset_001_2010-13 / faceset_001_2011 /
# faceset_001_undated — excluded from consolidation entirely.
ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$")
# ----------------------------- helpers -----------------------------
def load_caches():
    """Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple)
    -> embedding (np.float32, shape (512,) L2-normalized).
    alias_map maps every alias path -> canonical path."""
    rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
    alias_map: dict[str, str] = {}
    total_records = 0
    for cache_path in CACHES:
        if not cache_path.exists():
            print(f"[warn] cache missing: {cache_path}", file=sys.stderr)
            continue
        blob = np.load(cache_path, allow_pickle=True)
        embeddings = blob["embeddings"]
        meta = json.loads(str(blob["meta"]))
        face_records = [rec for rec in meta if not rec.get("noface")]
        if len(face_records) != len(embeddings):
            raise SystemExit(f"meta/emb mismatch in {cache_path}: {len(face_records)} vs {len(embeddings)}")
        # path_aliases may be present
        if "path_aliases" in blob.files:
            for canon, alist in json.loads(str(blob["path_aliases"])).items():
                alias_map.setdefault(canon, canon)
                for alias in alist:
                    alias_map[alias] = canon
        for raw_vec, rec in zip(embeddings, face_records):
            path = rec["path"]
            bbox_key = tuple(int(b) for b in rec["bbox"])
            vec = raw_vec.astype(np.float32)
            norm = float(np.linalg.norm(vec))
            if norm > 0:
                vec = vec / norm
            rec_index[(path, bbox_key)] = vec
            alias_map.setdefault(path, path)
        print(f"[cache] {cache_path.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr)
        total_records += len(face_records)
    print(f"[cache] indexed {total_records} face records, {len(alias_map)} path aliases", file=sys.stderr)
    return rec_index, alias_map
def faceset_tier(name: str) -> int:
    """Lower number = higher priority for primary selection."""
    m = re.match(r"^faceset_0*(\d+)$", name)
    if not m:
        return 99  # unknown structure
    num = int(m.group(1))
    # disjoint (lo, hi, tier) ranges; first hit wins
    ranges = (
        (13, 19, 0),   # hand-sorted
        (1, 12, 1),    # auto-clustered
        (20, 25, 2),   # osrc
        (26, 264, 3),  # immich peter
    )
    for lo, hi, tier in ranges:
        if lo <= num <= hi:
            return tier
    if num >= 265:
        return 4  # immich nic and beyond
    return 99
def is_era_split(name: str) -> bool:
    """True when `name` is an era-split bucket (matches ERA_SPLIT_RE)."""
    return ERA_SPLIT_RE.match(name) is not None
def faceset_centroid(faceset_dir: Path, rec_index, alias_map):
    """Return (centroid, n_used, n_missing) for a faceset directory.

    centroid is the L2-normalized mean of the embeddings resolved from the
    per-faceset manifest (canonical alias first, raw path as fallback), or
    None when the manifest is absent or fewer than 3 embeddings resolve.
    """
    manifest_path = faceset_dir / "manifest.json"
    if not manifest_path.exists():
        return None, 0, 0
    manifest = json.loads(manifest_path.read_text())
    resolved = []
    n_missing = 0
    for face in manifest.get("faces", []):
        src = face.get("source")
        bbox = face.get("bbox")
        if src is None or bbox is None:
            n_missing += 1
            continue
        bbox_key = tuple(int(b) for b in bbox)
        canonical = alias_map.get(src, src)
        vec = rec_index.get((canonical, bbox_key))
        if vec is None and canonical != src:
            vec = rec_index.get((src, bbox_key))
        if vec is None:
            n_missing += 1
            continue
        resolved.append(vec)
    if len(resolved) < 3:
        return None, len(resolved), n_missing
    stacked = np.stack(resolved).astype(np.float32)
    centroid = stacked.mean(axis=0)
    norm = float(np.linalg.norm(centroid))
    if norm > 0:
        centroid = centroid / norm
    return centroid, len(resolved), n_missing
def connected_components(adj: dict[int, set[int]]) -> list[list[int]]:
    """Iterative DFS over an adjacency map.

    Returns one sorted node list per component, ordered by first encounter
    of a component member while iterating adj's keys.
    """
    visited: set[int] = set()
    components: list[list[int]] = []
    for start in adj:
        if start in visited:
            continue
        members: list[int] = []
        frontier = [start]
        while frontier:
            node = frontier.pop()
            if node in visited:
                continue
            visited.add(node)
            members.append(node)
            frontier.extend(nb for nb in adj.get(node, set()) if nb not in visited)
        components.append(sorted(members))
    return components
# ----------------------------- analyze -----------------------------
def cmd_analyze(args):
    """Cluster active facesets by arcface-centroid similarity and write a JSON
    merge plan to ``args.out``.

    Pipeline: build a unit-norm mean embedding per active faceset, form the
    full cosine-similarity matrix, run complete-linkage agglomerative
    clustering cut at ``--edge`` (guaranteeing every within-group pair has
    sim >= edge), then pick a primary per multi-member group (best tier,
    then largest) and label the group confident/uncertain by its weakest
    internal link vs ``--confident``.
    """
    rec_index, alias_map = load_caches()
    # collect active facesets (underscore-prefixed quarantine dirs and
    # era-split children share identity with a parent and are excluded)
    active = []
    for d in sorted(ROOT.iterdir()):
        if not d.is_dir() or d.name.startswith("_"):
            continue
        if is_era_split(d.name):
            continue
        active.append(d)
    print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)", file=sys.stderr)
    centroids: dict[str, np.ndarray] = {}
    sizes: dict[str, int] = {}  # faceset name -> embeddings used for its centroid
    skipped = []  # (name, n_used, n_missing) for facesets with < 3 embeddings
    t0 = time.time()
    for fs in active:
        c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map)
        if c is None:
            skipped.append((fs.name, n_used, n_miss))
            continue
        centroids[fs.name] = c
        sizes[fs.name] = n_used
    print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; "
          f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr)
    if skipped:
        for n, u, m in skipped[:10]:
            print(f" skip {n}: used={u} missing={m}", file=sys.stderr)
        if len(skipped) > 10:
            print(f" ... +{len(skipped)-10} more", file=sys.stderr)
    names = sorted(centroids.keys())
    if not names:
        raise SystemExit("no centroids built")
    # similarity matrix
    M = np.stack([centroids[n] for n in names]).astype(np.float32)  # (N, 512), normalized
    sim = M @ M.T  # (N, N) cosine since unit-normalized
    np.clip(sim, -1.0, 1.0, out=sim)
    edge_thr = args.edge
    confident_thr = args.confident
    # complete-linkage agglomerative clustering on cosine distance.
    # Cut at edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr.
    # This avoids the chaining problem of single-link / connected-components.
    n = len(names)
    dist = 1.0 - sim
    np.fill_diagonal(dist, 0.0)
    # symmetrize numerical noise
    dist = (dist + dist.T) / 2.0
    np.clip(dist, 0.0, 2.0, out=dist)
    cond = squareform(dist, checks=False)  # condensed vector form for linkage()
    Z = linkage(cond, method="complete")
    cut_dist = 1.0 - edge_thr  # complete-link distance corresponds to (1 - min sim)
    labels = fcluster(Z, t=cut_dist, criterion="distance")  # 1-indexed cluster ids
    cluster_members: dict[int, list[int]] = {}
    for idx, lbl in enumerate(labels):
        cluster_members.setdefault(int(lbl), []).append(idx)
    # only multi-member clusters are merge candidates
    comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1]
    n_pairs_in_groups = 0
    for c in comps:
        n_pairs_in_groups += len(c) * (len(c) - 1) // 2
    print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups "
          f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr)
    # pick primary per group: lowest tier number, then largest size
    groups_out = []
    for comp in comps:
        members = [names[i] for i in comp]
        members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x))
        primary = members_sorted[0]
        secondaries = members_sorted[1:]
        # gather pairwise sims within group
        pair_sims = []
        idx_of = {names[i]: i for i in comp}
        for a in members:
            for b in members:
                if a >= b:  # string order: visit each unordered pair exactly once
                    continue
                pair_sims.append({"a": a, "b": b, "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)})
        # confidence: minimum within-group sim (the weakest link);
        # pair_sims is non-empty because every comp has >= 2 members
        min_link = min(p["sim"] for p in pair_sims)
        max_link = max(p["sim"] for p in pair_sims)
        confidence = "confident" if min_link >= confident_thr else "uncertain"
        groups_out.append({
            "primary": primary,
            "secondaries": secondaries,
            "members": members_sorted,
            "tiers": {n: faceset_tier(n) for n in members},
            "sizes": {n: sizes.get(n, 0) for n in members},
            "pair_sims": pair_sims,
            "min_link": round(min_link, 4),
            "max_link": round(max_link, 4),
            "confidence": confidence,
        })
    # sort: confident first, then by max_link desc
    groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"]))
    out = {
        "thresholds": {"edge": edge_thr, "confident": confident_thr},
        "n_active": len(active),
        "n_centroided": len(centroids),
        "n_skipped": len(skipped),
        "skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped],
        "n_groups": len(groups_out),
        "n_facesets_in_groups": sum(len(g["members"]) for g in groups_out),
        "groups": groups_out,
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(out, indent=2))
    confident = sum(1 for g in groups_out if g["confidence"] == "confident")
    uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain")
    print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}", file=sys.stderr)
# ----------------------------- report -----------------------------
def cmd_report(args):
    """Render a static HTML review page (plus JPEG thumbnails) for the merge
    plan in ``args.candidates`` so a human can eyeball each candidate group
    before running ``apply``.  Output: <args.out>/index.html + thumbs/."""
    candidates = json.loads(Path(args.candidates).read_text())
    out_dir = Path(args.out)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(parents=True, exist_ok=True)
    THUMB = 140  # thumbnail bounding box in px
    THUMBS_PER_FACESET = 4  # sample PNGs shown per member faceset
    def make_thumb(faceset: str, fname: str) -> str:
        # Returns the html-relative thumbnail path, creating the JPEG on first
        # use; "" on failure (unreadable PNG) so the caller skips the <img>.
        d = thumbs_dir / faceset
        d.mkdir(parents=True, exist_ok=True)
        dst = d / (Path(fname).stem + ".jpg")
        if not dst.exists():
            try:
                src = ROOT / faceset / "faces" / fname
                img = Image.open(src).convert("RGB")
                img.thumbnail((THUMB, THUMB), Image.LANCZOS)
                img.save(dst, "JPEG", quality=82)
            except Exception as e:
                print(f"[thumb-skip] {faceset}/{fname}: {e}", file=sys.stderr)
                return ""
        return f"thumbs/{faceset}/{Path(fname).stem}.jpg"
    rows = []  # accumulated HTML fragments, joined once at the end
    for gi, g in enumerate(candidates["groups"]):
        primary = g["primary"]
        sec = g["secondaries"]
        conf_cls = "confident" if g["confidence"] == "confident" else "uncertain"
        rows.append(f"<section class='grp {conf_cls}' id='g{gi}'>")
        rows.append(f"<h2>group #{gi+1} <small>({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})</small></h2>")
        rows.append(f"<div class='plan'>merge <b>{', '.join(sec)}</b> &rarr; <b>{primary}</b></div>")
        # member rows
        for name in g["members"]:
            tier = g["tiers"][name]
            sz = g["sizes"][name]
            # index 5 ("?") covers tier 99 via min(tier, 5)
            tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)]
            badge = "PRIMARY" if name == primary else "secondary"
            rows.append(f"<div class='member'>")
            rows.append(f"<div class='label'><span class='badge {badge.lower()}'>{badge}</span> "
                        f"<b>{name}</b> <small>tier={tier_label} · n={sz}</small></div>")
            rows.append("<div class='thumbs'>")
            faces_dir = ROOT / name / "faces"
            files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET]
            for f in files:
                rel = make_thumb(name, f.name)
                if rel:
                    rows.append(f"<img src='{rel}' loading='lazy' title='{f.name}'>")
            rows.append("</div></div>")
        # pairwise sims, strongest first
        rows.append("<table class='sims'><tr><th>a</th><th>b</th><th>sim</th></tr>")
        for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]):
            cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid"
            rows.append(f"<tr><td>{ps['a']}</td><td>{ps['b']}</td><td class='{cls}'>{ps['sim']:.3f}</td></tr>")
        rows.append("</table>")
        rows.append("</section>")
    nav = " · ".join(f"<a href='#g{i}'>#{i+1}</a>" for i in range(len(candidates["groups"])))
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Faceset merge review</title>
<style>
body {{ font-family: system-ui, sans-serif; background: #111; color: #eee; padding: 1em; }}
h1 {{ margin-top: 0; }}
h2 {{ margin: 0; }}
small {{ color: #999; font-weight: normal; }}
section.grp {{ background: #1a1a1a; border-radius: 6px; padding: 12px; margin: 12px 0; }}
section.grp.confident {{ border-left: 4px solid #5fa05f; }}
section.grp.uncertain {{ border-left: 4px solid #ffb050; }}
.plan {{ margin: .5em 0; color: #6cf; }}
.member {{ margin: 8px 0; padding: 6px; background: #222; border-radius: 4px; }}
.label {{ font-family: monospace; font-size: 13px; }}
.badge {{ display: inline-block; padding: 0 6px; font-size: 10px; border-radius: 2px; }}
.badge.primary {{ background: #5fa05f; color: #000; font-weight: bold; }}
.badge.secondary {{ background: #444; color: #ccc; }}
.thumbs {{ display: flex; gap: 4px; margin-top: 4px; flex-wrap: wrap; }}
.thumbs img {{ height: 140px; width: auto; border-radius: 3px; }}
table.sims {{ font-family: monospace; font-size: 11px; margin-top: 6px; border-collapse: collapse; }}
table.sims td, table.sims th {{ padding: 1px 8px; border: 1px solid #333; text-align: left; }}
table.sims td.hi {{ color: #5fa05f; font-weight: bold; }}
table.sims td.mid {{ color: #ffb050; }}
.nav {{ position: sticky; top: 0; background: #111; padding: .5em 0; border-bottom: 1px solid #333; font-size: 12px; }}
a {{ color: #6cf; }}
</style></head>
<body>
<h1>Merge review &mdash; {len(candidates['groups'])} candidate groups
<small>(edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})</small></h1>
<p>{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided
(skipped {candidates['n_skipped']} for too few cached embeddings).
Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.</p>
<div class='nav'>{nav}</div>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- apply -----------------------------
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def cmd_apply(args):
    """Apply the merge plan produced by ``analyze``.

    For each accepted group: combine the primary's and secondaries' face
    entries, re-rank by quality, restage/renumber the PNGs with an atomic
    directory swap, rebuild the .fsz archives, move each consumed secondary
    into _merged/ (reversible), and update the per-faceset and master
    manifests.

    Flags: ``--dry-run`` previews without touching disk;
    ``--include-uncertain`` also applies uncertain groups; ``--only`` /
    ``--exclude`` select groups by their index in the candidates file.
    """
    candidates = json.loads(Path(args.candidates).read_text())
    # filter: skip "uncertain" groups unless --include-uncertain
    indexed = list(enumerate(candidates["groups"]))
    accepted_ix = [(i, g) for i, g in indexed
                   if g["confidence"] == "confident" or args.include_uncertain]
    skipped_unc = [g for g in candidates["groups"]
                   if g["confidence"] == "uncertain" and not args.include_uncertain]
    # explicit --exclude / --only filters (group indices in the candidates file).
    # BUGFIX: --exclude previously indexed into the already-filtered accepted
    # list, so its meaning silently shifted whenever uncertain groups were
    # dropped or --only was given; both flags now consistently address
    # candidates-file indices, as the --help text promises.
    if args.only:
        only = {int(s) for s in args.only.split(",")}
        accepted_ix = [(i, g) for i, g in indexed if i in only]
    if args.exclude:
        excl = {int(s) for s in args.exclude.split(",")}
        accepted_ix = [(i, g) for i, g in accepted_ix if i not in excl]
    accepted = [g for _, g in accepted_ix]
    print(f"[plan] {len(accepted)} groups will be merged "
          f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr)
    if args.dry_run:
        for g in accepted:
            print(f" merge {g['secondaries']} -> {g['primary']} "
                  f"({g['confidence']}, min_sim={g['min_link']:.3f})")
        return
    # Load the master manifest only when actually applying, so --dry-run is
    # side-effect free and works even when the master manifest is absent.
    master_path = ROOT / "manifest.json"
    master = json.loads(master_path.read_text())
    by_name = {f["name"]: f for f in master.get("facesets", [])}
    merged_dir = ROOT / "_merged"
    merged_dir.mkdir(exist_ok=True)
    new_facesets: list[dict] = []
    new_merged: list[dict] = list(master.get("merged", []))
    consumed_names: set[str] = set()
    primary_updates: dict[str, dict] = {}  # name -> refreshed master entry
    for g in accepted:
        primary = g["primary"]
        if primary not in by_name:
            print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr)
            continue
        primary_dir = ROOT / primary
        if not primary_dir.is_dir():
            print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr)
            continue
        primary_faces = primary_dir / "faces"
        primary_manifest_path = primary_dir / "manifest.json"
        primary_manifest = json.loads(primary_manifest_path.read_text())
        # gather all face entries: primary + each secondary
        combined_faces: list[dict] = list(primary_manifest.get("faces", []))
        # tag provenance so each PNG can be located in its origin dir below
        for f in combined_faces:
            f.setdefault("origin_faceset", primary)
        for sec in g["secondaries"]:
            sec_dir = ROOT / sec
            if not sec_dir.is_dir():
                print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr)
                continue
            sec_manifest_path = sec_dir / "manifest.json"
            sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []}
            for f in sec_manifest.get("faces", []):
                f = dict(f)
                f["origin_faceset"] = sec
                combined_faces.append(f)
        # rank by quality.composite descending; ties broken by lower cosd_centroid
        def sort_key(f):
            q = f.get("quality", {}).get("composite", 0)
            d = f.get("cosd_centroid", 1.0)
            return (-q, d)
        combined_faces.sort(key=sort_key)
        # renumber and stage PNGs into a fresh staging dir, then atomically swap.
        # Entries whose source PNG is gone (e.g. occlusion-pruned to _dropped/)
        # are skipped, which may leave gaps in the rank/filename sequence.
        staging = primary_dir / "_faces_new"
        if staging.exists():
            shutil.rmtree(staging)
        staging.mkdir()
        new_face_entries = []
        for new_rank, f in enumerate(combined_faces, start=1):
            origin = f.pop("origin_faceset")
            old_png_rel = f["png"]  # e.g. "faces/0042.png"
            old_png_name = Path(old_png_rel).name
            origin_png = ROOT / origin / "faces" / old_png_name
            if not origin_png.exists():
                # could be in _dropped if occlusion-pruned; skip
                continue
            new_name = f"{new_rank:04d}.png"
            shutil.copy2(origin_png, staging / new_name)
            f = dict(f)
            f["rank"] = new_rank
            f["png"] = f"faces/{new_name}"
            f["origin_faceset"] = origin  # preserve provenance in manifest
            new_face_entries.append(f)
        # swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces
        old_faces_holding = primary_dir / "_faces_old"
        if old_faces_holding.exists():
            shutil.rmtree(old_faces_holding)
        if primary_faces.exists():
            primary_faces.rename(old_faces_holding)
        staging.rename(primary_faces)
        # migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible)
        old_dropped = old_faces_holding / "_dropped"
        if old_dropped.exists():
            (primary_faces / "_dropped").mkdir(exist_ok=True)
            for x in old_dropped.iterdir():
                shutil.move(str(x), str(primary_faces / "_dropped" / x.name))
        # guard: holding dir only exists if the old faces dir existed
        if old_faces_holding.exists():
            shutil.rmtree(old_faces_holding)
        # re-zip .fsz
        survivor_pngs = sorted(primary_faces.glob("*.png"))
        top_n = primary_manifest.get("top_n", 30)
        top_n_eff = min(top_n, len(survivor_pngs))
        # remove old .fsz files
        for old in primary_dir.glob("*.fsz"):
            old.unlink()
        top_fsz_name = f"{primary}_top{top_n_eff}.fsz"
        all_fsz_name = f"{primary}_all.fsz"
        _zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name)
        if len(survivor_pngs) > top_n_eff:
            _zip_png_list(survivor_pngs, primary_dir / all_fsz_name)
            all_fsz_used = all_fsz_name
        else:
            all_fsz_used = None  # top archive already covers everything
        # update primary's local manifest
        primary_manifest["faces"] = new_face_entries
        primary_manifest["exported"] = len(new_face_entries)
        primary_manifest["fsz_top"] = top_fsz_name
        primary_manifest["fsz_all"] = all_fsz_used
        primary_manifest["top_n"] = top_n_eff
        primary_manifest.setdefault("merge_history", []).append({
            "absorbed": g["secondaries"],
            "min_link": g["min_link"],
            "max_link": g["max_link"],
            "confidence": g["confidence"],
        })
        primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2))
        # move secondary directories into _merged/
        absorbed_master_entries: list[dict] = []
        for sec in g["secondaries"]:
            sec_dir = ROOT / sec
            target = merged_dir / sec
            if not sec_dir.is_dir():
                continue
            if target.exists():
                shutil.rmtree(sec_dir)  # already moved by previous run; clean stub
            else:
                shutil.move(str(sec_dir), str(target))
            sec_master = dict(by_name.get(sec, {"name": sec}))
            sec_master["merged_into"] = primary
            sec_master["relpath"] = f"_merged/{sec}"
            sec_master["fsz_top"] = None
            sec_master["fsz_all"] = None
            absorbed_master_entries.append(sec_master)
            consumed_names.add(sec)
        new_merged.extend(absorbed_master_entries)
        # bump primary master entry
        prim_master = dict(by_name[primary])
        prim_master["exported"] = len(new_face_entries)
        prim_master["top_n"] = top_n_eff
        prim_master["fsz_top"] = top_fsz_name
        prim_master["fsz_all"] = all_fsz_used
        prim_master.setdefault("merge_history", []).append({
            "absorbed": g["secondaries"],
            "min_link": g["min_link"],
            "max_link": g["max_link"],
        })
        primary_updates[primary] = prim_master
        print(f"[merged] {g['secondaries']} -> {primary} "
              f"now {len(new_face_entries)} png", file=sys.stderr)
    # rebuild master facesets list, preserving original ordering
    for entry in master.get("facesets", []):
        nm = entry["name"]
        if nm in consumed_names:
            continue
        if nm in primary_updates:
            new_facesets.append(primary_updates[nm])
        else:
            new_facesets.append(entry)
    new_master = dict(master)
    new_master["facesets"] = new_facesets
    new_master["merged"] = new_merged
    new_master["merge_run"] = {
        "thresholds": candidates["thresholds"],
        "groups_applied": len(accepted),
        "facesets_consumed": len(consumed_names),
        "include_uncertain": bool(args.include_uncertain),
    }
    # write-then-replace keeps the master manifest atomic on POSIX
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(new_master, indent=2))
    tmp.replace(master_path)
    print(f"[done] master manifest updated: {len(new_facesets)} active, "
          f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run",
          file=sys.stderr)
# ----------------------------- main -----------------------------
def main():
    """CLI entry point: analyze / report / apply subcommands."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    analyze = commands.add_parser("analyze")
    analyze.add_argument("--out", required=True)
    analyze.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)")
    analyze.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)")
    analyze.set_defaults(func=cmd_analyze)

    report = commands.add_parser("report")
    report.add_argument("--candidates", required=True)
    report.add_argument("--out", required=True)
    report.set_defaults(func=cmd_report)

    apply_cmd = commands.add_parser("apply")
    apply_cmd.add_argument("--candidates", required=True)
    apply_cmd.add_argument("--include-uncertain", action="store_true",
                           help="apply uncertain groups too (default: confident only)")
    apply_cmd.add_argument("--only", default=None, help="comma-separated group indices to apply")
    apply_cmd.add_argument("--exclude", default=None, help="comma-separated group indices to skip")
    apply_cmd.add_argument("--dry-run", action="store_true")
    apply_cmd.set_defaults(func=cmd_apply)

    parsed = parser.parse_args()
    parsed.func(parsed)


if __name__ == "__main__":
    main()

594
work/dedup_optimize.py Normal file
View File

@@ -0,0 +1,594 @@
"""Corpus-wide dedup + roop-unleashed optimization.
Two passes:
1. Cross-family byte-identical PNG dedup (same SHA256 in two different identity
families) — keep the higher-tier family copy. Era splits of the same parent
identity (faceset_NNN_*) are intentional duplications and are NOT deduped
within their family.
2. Within-faceset near-duplicate dedup using cached arcface embeddings
(cosine sim >= 0.95). Keep highest quality.composite, drop the rest.
Plus a Windows-DML multi-face audit (separate phase via clip_worker-style split):
3. Re-detect each PNG with insightface; flag any with 0 or >1 detected faces.
The roop loader appends every detected face per PNG, so multi-face crops
pollute identity averaging.
All flagged PNGs are MOVED to <faceset>/faces/_dropped/ (reversible). Affected
.fsz files are re-zipped, manifests updated.
CLI:
analyze --out work/dedup_audit/dedup_plan.json
apply --plan ... [--dry-run]
stage_multiface --out work/dedup_audit/multiface_queue.json
merge_multiface --results <worker_out> --out work/dedup_audit/multiface_plan.json
apply_multiface --plan ... [--dry-run]
report --dedup ... --multiface ... --out work/dedup_audit
"""
from __future__ import annotations
import argparse
import hashlib
import json
import re
import shutil
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import numpy as np
# Corpus root as seen from WSL; all faceset directories live directly under it.
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
# Same corpus root as seen from Windows (for the DML worker side).
WIN_ROOT = r"E:\temp_things\fcswp\nl_sorted\facesets_swap_ready"
# Arcface embedding caches (.npz), probed in order; missing files are skipped.
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
# Cosine similarity at/above which two faces in one faceset count as near-dups.
NEAR_DUP_THRESHOLD = 0.95
# Thread count for parallel SHA256 hashing (I/O bound).
HASH_PARALLEL = 16
# ----------------------------- helpers -----------------------------
def faceset_tier(name: str) -> int:
    """Priority tier for a faceset name (lower wins); an era-split suffix
    such as ``faceset_001_2010-13`` maps to its parent's tier."""
    match = re.match(r"^faceset_0*(\d+)(?:_.+)?$", name)
    if match is None:
        return 99
    num = int(match.group(1))
    if num >= 265:
        return 4
    # (low, high, tier) ranges below 265
    for low, high, tier in ((13, 19, 0), (1, 12, 1), (20, 25, 2), (26, 264, 3)):
        if low <= num <= high:
            return tier
    return 99
def faceset_family(name: str) -> str:
    """faceset_001_2010-13 → faceset_001; faceset_001 → faceset_001.

    Names that do not look like faceset_NNN[...] are returned unchanged.
    """
    match = re.match(r"^(faceset_\d+)(?:_.+)?$", name)
    if match is None:
        return name
    return match.group(1)
def wsl_to_win(p: str) -> str:
    """Translate a WSL ``/mnt/<drive>/...`` path into its Windows drive form
    (``X:\\...``); non-/mnt/ paths are returned unchanged."""
    text = str(p)
    if not text.startswith("/mnt/"):
        return text
    drive = text[5].upper()
    tail = text[7:].replace("/", "\\")
    return f"{drive}:\\{tail}"
def iter_active_facesets() -> list[Path]:
    """All directories directly under ROOT whose names do not start with an
    underscore (i.e. excluding _dropped/_masked/_merged/_thin quarantines),
    sorted by name."""
    return [entry for entry in sorted(ROOT.iterdir())
            if entry.is_dir() and not entry.name.startswith("_")]
def sha256_file(p: Path) -> str:
    """Hex SHA-256 digest of the file at *p*, streamed in 1 MiB chunks so
    large files never load fully into memory."""
    digest = hashlib.sha256()
    with open(p, "rb") as fh:
        while chunk := fh.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest()
def load_caches():
    """Load every available arcface embedding cache (.npz) into one lookup.

    Returns (rec_index, alias_map):
      rec_index: (source_path, bbox_tuple) -> L2-normalized float32 embedding
      alias_map: alternate source path -> canonical source path (every
                 canonical path also maps to itself)

    NOTE(review): embeddings are matched positionally against the
    noface-filtered meta records (``emb[i]`` for the i-th non-noface record).
    This assumes each cache stores exactly one embedding row per non-noface
    record, in order — confirm against the embed worker that writes the .npz.
    """
    rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
    alias_map: dict[str, str] = {}
    for c in CACHES:
        if not c.exists():
            continue  # caches are optional; skip absent ones
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        meta = json.loads(str(d["meta"]))
        face_records = [m for m in meta if not m.get("noface")]
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            p = rec["path"]
            bbox = tuple(int(x) for x in rec["bbox"])
            v = emb[i].astype(np.float32)
            # L2-normalize so downstream dot products are cosine similarities
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n
            rec_index[(p, bbox)] = v
            alias_map.setdefault(p, p)
    return rec_index, alias_map
def lookup_emb(rec_index, alias_map, src: str, bbox):
    """Fetch the cached embedding for (source image, bbox), trying the
    canonical alias first and falling back to the raw path; None if absent."""
    key = tuple(int(v) for v in bbox)
    canon = alias_map.get(src, src)
    hit = rec_index.get((canon, key))
    if hit is None and canon != src:
        hit = rec_index.get((src, key))
    return hit
# ----------------------------- analyze -----------------------------
def cmd_analyze(args):
    """Walk every active faceset and write the dedup plan JSON to ``args.out``.

    Two drop passes are computed (nothing is moved here — see ``apply``):
    1. cross-family byte-identical dedup: same SHA256 appearing in two or more
       identity families; keep the best copy (lowest tier, then highest
       quality) and never drop within the keeper's own family, so the
       intentional era duplication inside a family survives;
    2. within-faceset near-dup: connected components of the cached-embedding
       similarity graph at sim >= NEAR_DUP_THRESHOLD; keep the highest
       quality.composite member of each component.
    """
    rec_index, alias_map = load_caches()
    facesets = iter_active_facesets()
    print(f"[scan] {len(facesets)} active facesets", file=sys.stderr)
    # Phase 1: walk every PNG, collect (faceset, file, src, bbox, quality, emb, sha256)
    all_pngs = []  # list of dicts; indices into this list are used as PNG ids
    t0 = time.time()
    for fs in facesets:
        manifest_path = fs / "manifest.json"
        if not manifest_path.exists():
            continue
        m = json.loads(manifest_path.read_text())
        for f in m.get("faces", []):
            png_rel = f.get("png")
            if not png_rel:
                continue
            disk_path = fs / png_rel
            if not disk_path.exists():
                continue  # already pruned (e.g. sitting in _dropped/)
            all_pngs.append({
                "faceset": fs.name,
                "family": faceset_family(fs.name),
                "tier": faceset_tier(fs.name),
                "file": Path(png_rel).name,
                "rank": f.get("rank"),
                "source": f.get("source"),
                "bbox": f.get("bbox"),
                "quality": f.get("quality", {}).get("composite", 0),
                "disk_path": str(disk_path),
            })
    print(f"[scan] {len(all_pngs)} PNGs walked in {time.time()-t0:.1f}s", file=sys.stderr)
    # Phase 2: SHA256 hash each PNG (parallel I/O)
    t0 = time.time()
    def _hash_one(idx):
        # mutates its own all_pngs slot in place; each index is touched once,
        # so no locking is needed across worker threads
        all_pngs[idx]["sha256"] = sha256_file(Path(all_pngs[idx]["disk_path"]))
    with ThreadPoolExecutor(max_workers=HASH_PARALLEL) as ex:
        # exhaust the iterator to actually run
        for _ in ex.map(_hash_one, range(len(all_pngs)), chunksize=16):
            pass
    print(f"[hash] {len(all_pngs)} PNGs hashed in {time.time()-t0:.1f}s", file=sys.stderr)
    # Phase 3: cross-family byte-dedup
    by_sha: dict[str, list[int]] = {}
    for i, p in enumerate(all_pngs):
        by_sha.setdefault(p["sha256"], []).append(i)
    cross_family_groups = []
    byte_drops: set[int] = set()  # indices of PNGs to drop
    for sha, idxs in by_sha.items():
        if len(idxs) < 2:
            continue
        families = {all_pngs[i]["family"] for i in idxs}
        if len(families) < 2:
            continue  # all in same family — intentional era duplication
        # multiple families share this content → dedup keeping the best one
        cross_family_groups.append({"sha256": sha, "members": [
            {"faceset": all_pngs[i]["faceset"], "file": all_pngs[i]["file"],
             "tier": all_pngs[i]["tier"], "quality": all_pngs[i]["quality"],
             "rank": all_pngs[i]["rank"]} for i in idxs
        ]})
        # keeper rule: lowest tier number, then highest quality
        best = sorted(idxs, key=lambda i: (all_pngs[i]["tier"], -all_pngs[i]["quality"]))[0]
        for i in idxs:
            # NEVER drop within-family copies (preserve era duplication intentionally)
            # We only drop indices whose family != best's family
            if i != best and all_pngs[i]["family"] != all_pngs[best]["family"]:
                byte_drops.add(i)
    print(f"[byte] {len(cross_family_groups)} cross-family hash groups; "
          f"{len(byte_drops)} PNGs marked for byte-dedup drop", file=sys.stderr)
    # Phase 4: within-faceset near-dup (embedding sim >= threshold)
    by_faceset: dict[str, list[int]] = {}
    for i, p in enumerate(all_pngs):
        by_faceset.setdefault(p["faceset"], []).append(i)
    near_dup_groups = []
    near_drops: set[int] = set()
    miss_emb_total = 0  # faces whose embedding was not found in any cache
    t0 = time.time()
    for fs_name, idxs in by_faceset.items():
        if len(idxs) < 2:
            continue
        # gather embeddings; faces without a cached vector are skipped
        embs = []
        kept_idxs = []
        for i in idxs:
            v = lookup_emb(rec_index, alias_map, all_pngs[i]["source"], all_pngs[i]["bbox"])
            if v is None:
                miss_emb_total += 1
                continue
            embs.append(v)
            kept_idxs.append(i)
        if len(kept_idxs) < 2:
            continue
        M = np.stack(embs).astype(np.float32)
        sim = M @ M.T  # cosine since vectors were normalized at cache load
        np.fill_diagonal(sim, -1)  # ignore self
        # find connected components in the (sim >= threshold) graph
        adj = {k: set() for k in range(len(kept_idxs))}
        for a in range(len(kept_idxs)):
            # only check a < b to avoid double work
            hi = np.where(sim[a, a+1:] >= NEAR_DUP_THRESHOLD)[0]
            for off in hi:
                b = a + 1 + int(off)
                adj[a].add(b)
                adj[b].add(a)
        seen = set()
        for k in adj:
            if k in seen or not adj[k]:
                continue  # isolated nodes are not near-dups of anything
            stack = [k]
            comp = []
            while stack:
                x = stack.pop()
                if x in seen:
                    continue
                seen.add(x)
                comp.append(x)
                for y in adj[x]:
                    if y not in seen:
                        stack.append(y)
            if len(comp) < 2:
                continue
            comp_idxs = [kept_idxs[c] for c in comp]
            # keeper: highest quality.composite, tie-break: lowest rank
            # (a falsy rank — None or 0 — sorts as 9999, i.e. last)
            best = sorted(comp_idxs, key=lambda i: (-all_pngs[i]["quality"], all_pngs[i]["rank"] or 9999))[0]
            sims_in_group = []
            for ci in range(len(comp)):
                for cj in range(ci+1, len(comp)):
                    sims_in_group.append(float(sim[comp[ci], comp[cj]]))
            near_dup_groups.append({
                "faceset": fs_name,
                "members": [{"file": all_pngs[i]["file"], "rank": all_pngs[i]["rank"],
                             "quality": all_pngs[i]["quality"]} for i in comp_idxs],
                "keeper": all_pngs[best]["file"],
                "min_sim": min(sims_in_group) if sims_in_group else None,
                "max_sim": max(sims_in_group) if sims_in_group else None,
            })
            for i in comp_idxs:
                if i != best:
                    near_drops.add(i)
    print(f"[near] {len(near_dup_groups)} near-dup groups; "
          f"{len(near_drops)} PNGs marked for near-dup drop "
          f"(miss_emb={miss_emb_total}); {time.time()-t0:.1f}s", file=sys.stderr)
    # Combined drop set; for output, group by faceset
    all_drops = byte_drops | near_drops
    drops_by_faceset: dict[str, list] = {}
    for i in all_drops:
        p = all_pngs[i]
        reason = []
        if i in byte_drops: reason.append("byte_dup")
        if i in near_drops: reason.append("near_dup")
        drops_by_faceset.setdefault(p["faceset"], []).append({
            "file": p["file"], "rank": p["rank"], "reason": "+".join(reason),
            "sha256": p["sha256"], "quality": p["quality"],
        })
    plan = {
        "thresholds": {"near_dup_sim": NEAR_DUP_THRESHOLD},
        "totals": {
            "active_facesets": len(facesets),
            "active_pngs": len(all_pngs),
            "byte_dup_groups": len(cross_family_groups),
            "byte_dup_drops": len(byte_drops),
            "near_dup_groups": len(near_dup_groups),
            "near_dup_drops": len(near_drops),
            "all_drops": len(all_drops),
            "facesets_affected": len(drops_by_faceset),
        },
        "byte_dup_groups": cross_family_groups,
        "near_dup_groups": near_dup_groups,
        "drops_by_faceset": drops_by_faceset,
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(plan, indent=2))
    print(f"[done] plan -> {op}", file=sys.stderr)
# ----------------------------- apply -----------------------------
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def _apply_drops_to_facesets(drops_by_faceset: dict[str, list], reason_label: str, master_path: Path):
    """Move flagged PNGs to <faceset>/faces/_dropped/, rebuild manifests + .fsz.

    drops_by_faceset values are lists of {"file": str, ...}.  *reason_label*
    (e.g. "dedup", "multiface") namespaces the history/counter keys written
    into the per-faceset and master manifests, so separate passes don't
    clobber each other's bookkeeping.  Moves are reversible (files land in
    _dropped/, never deleted).  Returns (total_moved, per-faceset counts).
    """
    master = json.loads(master_path.read_text())
    by_name = {f["name"]: f for f in master.get("facesets", [])}
    total_moved = 0
    per_faceset_counts = {}
    for fs_name, drops in drops_by_faceset.items():
        fs_dir = ROOT / fs_name
        if not fs_dir.is_dir():
            print(f"[warn] {fs_name}: dir missing, skip", file=sys.stderr)
            continue
        faces_dir = fs_dir / "faces"
        dropped_dir = faces_dir / "_dropped"
        dropped_dir.mkdir(exist_ok=True)
        drop_files = {d["file"] for d in drops}
        moved_here = 0
        for fname in sorted(drop_files):
            src = faces_dir / fname
            if not src.exists():
                continue  # already dropped by an earlier pass/run
            shutil.move(str(src), str(dropped_dir / fname))
            moved_here += 1
        # rebuild manifest by filtering out dropped files
        manifest_path = fs_dir / "manifest.json"
        if manifest_path.exists():
            mm = json.loads(manifest_path.read_text())
            new_faces = [f for f in mm.get("faces", []) if Path(f.get("png", "")).name not in drop_files]
            mm["faces"] = new_faces
            mm["exported"] = len(new_faces)
            mm.setdefault(f"{reason_label}_history", []).append({"dropped": moved_here})
            # re-zip from the surviving on-disk PNGs (source of truth)
            survivor_pngs = sorted(faces_dir.glob("*.png"))
            top_n = mm.get("top_n", 30)
            top_n_eff = min(top_n, len(survivor_pngs))
            for old in fs_dir.glob("*.fsz"):
                old.unlink()
            top_fsz_name = f"{fs_name}_top{top_n_eff}.fsz"
            all_fsz_name = f"{fs_name}_all.fsz"
            if top_n_eff > 0:
                _zip_png_list(survivor_pngs[:top_n_eff], fs_dir / top_fsz_name)
                mm["fsz_top"] = top_fsz_name
                mm["top_n"] = top_n_eff
            else:
                # every PNG was dropped; no archive to write
                mm["fsz_top"] = None
                mm["top_n"] = 0
            if len(survivor_pngs) > top_n_eff:
                _zip_png_list(survivor_pngs, fs_dir / all_fsz_name)
                mm["fsz_all"] = all_fsz_name
            else:
                mm["fsz_all"] = None  # the top archive already covers everything
            manifest_path.write_text(json.dumps(mm, indent=2))
            # mirror the refreshed stats into the master entry (mutated in
            # place; the master list is rewritten from by_name below)
            if fs_name in by_name:
                by_name[fs_name]["exported"] = len(new_faces)
                by_name[fs_name]["fsz_top"] = mm["fsz_top"]
                by_name[fs_name]["fsz_all"] = mm["fsz_all"]
                by_name[fs_name]["top_n"] = mm["top_n"]
                by_name[fs_name].setdefault(f"{reason_label}_dropped", 0)
                by_name[fs_name][f"{reason_label}_dropped"] += moved_here
        total_moved += moved_here
        per_faceset_counts[fs_name] = moved_here
    # rewrite master with same ordering
    new_facesets = [by_name.get(e["name"], e) for e in master.get("facesets", [])]
    master["facesets"] = new_facesets
    master.setdefault(f"{reason_label}_runs", []).append({
        "facesets_affected": len(per_faceset_counts),
        "pngs_moved": total_moved,
    })
    # write-then-replace keeps the master manifest update atomic
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(master, indent=2))
    tmp.replace(master_path)
    return total_moved, per_faceset_counts
def cmd_apply(args):
    """Execute the dedup plan (or just preview it with --dry-run)."""
    plan = json.loads(Path(args.plan).read_text())
    per_faceset = plan["drops_by_faceset"]
    if args.dry_run:
        for name, entries in sorted(per_faceset.items()):
            tally: dict[str, int] = {}
            for entry in entries:
                tally[entry["reason"]] = tally.get(entry["reason"], 0) + 1
            print(f" {name}: {len(entries)} dropped ({tally})")
        grand_total = sum(len(v) for v in per_faceset.values())
        print(f"=== total: {grand_total} PNGs across {len(per_faceset)} facesets ===")
        return
    master_path = ROOT / "manifest.json"
    moved, _ = _apply_drops_to_facesets(per_faceset, "dedup", master_path)
    print(f"[done] {moved} PNGs moved to faces/_dropped/ across {len(per_faceset)} facesets", file=sys.stderr)
# ----------------------------- multiface staging + apply -----------------------------
def cmd_stage_multiface(args):
    """Build queue.json of all currently-active PNGs in the corpus
    for the Windows DML multi-face audit worker (one entry per PNG with
    both its WSL and Windows paths)."""
    entries = []
    for fs_dir in iter_active_facesets():
        faces = fs_dir / "faces"
        if not faces.is_dir():
            continue
        for png in sorted(faces.glob("*.png")):
            entries.append({
                "wsl_path": str(png),
                "win_path": wsl_to_win(str(png)),
                "faceset": fs_dir.name,
                "file": png.name,
            })
    dest = Path(args.out)
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text(json.dumps(entries, indent=2))
    print(f"[stage] {len(entries)} PNGs -> {dest}", file=sys.stderr)
def cmd_merge_multiface(args):
    """Convert worker results.json into a drops_by_faceset plan.

    Any PNG whose detected face count is not exactly 1 (including a missing
    count, which defaults to -1) is flagged for dropping.
    """
    results = json.loads(Path(args.results).read_text()).get("results", [])
    flagged: dict[str, list] = {}
    n_bad = 0
    for rec in results:
        count = rec.get("face_count", -1)
        if count == 1:
            continue  # exactly one face: the roop invariant holds
        n_bad += 1
        flagged.setdefault(rec["faceset"], []).append({
            "file": rec["file"],
            "reason": f"multiface_{count}",
            "face_count": count,
        })
    plan = {
        "totals": {"bad_pngs": n_bad, "facesets_affected": len(flagged),
                   "scored": len(results)},
        "drops_by_faceset": flagged,
    }
    dest = Path(args.out)
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text(json.dumps(plan, indent=2))
    print(f"[merge] {n_bad} bad PNGs across {len(flagged)} facesets -> {dest}", file=sys.stderr)
def cmd_apply_multiface(args):
    """Execute a multiface drop plan, or just preview it with --dry-run."""
    drops = json.loads(Path(args.plan).read_text())["drops_by_faceset"]
    if args.dry_run:
        for name in sorted(drops):
            print(f" {name}: {len(drops[name])} bad PNG(s)")
        print(f"=== total: {sum(len(v) for v in drops.values())} ===")
        return
    moved, _ = _apply_drops_to_facesets(drops, "multiface", ROOT / "manifest.json")
    print(f"[done] {moved} PNGs moved to faces/_dropped/ across {len(drops)} facesets", file=sys.stderr)
# ----------------------------- report -----------------------------
def cmd_report(args):
    """Render a small static HTML summary for the dedup and/or multi-face plans.

    Args:
        args.dedup: optional path to a dedup plan json (``analyze`` output).
        args.multiface: optional path to a multiface plan json
            (``merge_multiface`` output).
        args.out: output directory; ``index.html`` is written inside it.
    """
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    sections = []
    if args.dedup:
        d = json.loads(Path(args.dedup).read_text())
        t = d["totals"]
        # plain literal — the previous f-string had no placeholders (lint F541)
        sections.append("<h2>Dedup</h2>")
        sections.append(
            f"<ul>"
            f"<li>Active facesets: {t['active_facesets']}, active PNGs: {t['active_pngs']}</li>"
            f"<li>Cross-family byte-dup groups: {t['byte_dup_groups']} — {t['byte_dup_drops']} PNGs dropped</li>"
            f"<li>Within-faceset near-dup groups (sim≥{d['thresholds']['near_dup_sim']}): {t['near_dup_groups']} — {t['near_dup_drops']} PNGs dropped</li>"
            f"<li><b>Total dedup drops: {t['all_drops']}</b> across {t['facesets_affected']} facesets</li>"
            f"</ul>"
        )
        # top-N affected facesets, most drops first
        rows = sorted(d["drops_by_faceset"].items(), key=lambda x: -len(x[1]))[:25]
        sections.append("<h3>Top 25 most-affected facesets</h3><table><tr><th>faceset</th><th>dropped</th><th>reasons</th></tr>")
        for fs, items in rows:
            # histogram of drop reasons within this faceset
            r = {}
            for it in items:
                r[it["reason"]] = r.get(it["reason"], 0) + 1
            sections.append(f"<tr><td>{fs}</td><td>{len(items)}</td><td>{r}</td></tr>")
        sections.append("</table>")
    if args.multiface:
        m = json.loads(Path(args.multiface).read_text())
        t = m["totals"]
        sections.append("<h2>Multi-face audit</h2>")
        sections.append(
            f"<ul>"
            f"<li>PNGs scored: {t['scored']}</li>"
            f"<li>Bad PNGs (0 or >1 face): {t['bad_pngs']} across {t['facesets_affected']} facesets</li>"
            f"</ul>"
        )
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Dedup + multi-face audit</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1, h2, h3 {{ margin-top:1em; }}
table {{ border-collapse: collapse; font-family: monospace; font-size: 12px; }}
table td, table th {{ padding: 2px 8px; border: 1px solid #333; }}
ul li {{ margin: 4px 0; }}
</style></head>
<body>
<h1>facesets_swap_ready dedup + roop optimization audit</h1>
{''.join(sections)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- main -----------------------------
def main():
    """CLI entry point: analyze / apply / stage_multiface / merge_multiface /
    apply_multiface / report."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    analyze = commands.add_parser("analyze")
    analyze.add_argument("--out", required=True)
    analyze.set_defaults(func=cmd_analyze)

    apply_cmd = commands.add_parser("apply")
    apply_cmd.add_argument("--plan", required=True)
    apply_cmd.add_argument("--dry-run", action="store_true")
    apply_cmd.set_defaults(func=cmd_apply)

    stage_mf = commands.add_parser("stage_multiface")
    stage_mf.add_argument("--out", required=True)
    stage_mf.set_defaults(func=cmd_stage_multiface)

    merge_mf = commands.add_parser("merge_multiface")
    merge_mf.add_argument("--results", required=True)
    merge_mf.add_argument("--out", required=True)
    merge_mf.set_defaults(func=cmd_merge_multiface)

    apply_mf = commands.add_parser("apply_multiface")
    apply_mf.add_argument("--plan", required=True)
    apply_mf.add_argument("--dry-run", action="store_true")
    apply_mf.set_defaults(func=cmd_apply_multiface)

    report = commands.add_parser("report")
    report.add_argument("--dedup", default=None)
    report.add_argument("--multiface", default=None)
    report.add_argument("--out", required=True)
    report.set_defaults(func=cmd_report)

    ns = parser.parse_args()
    ns.func(ns)


if __name__ == "__main__":
    main()

574
work/filter_occlusions.py Normal file
View File

@@ -0,0 +1,574 @@
"""CLIP zero-shot scoring for masks + sunglasses across facesets_swap_ready/.
Usage:
# score one or more specific facesets (test mode)
python work/filter_occlusions.py score --facesets faceset_001,faceset_050 \
--out work/test_batch_occlusion/scores.json
# score everything (full corpus)
python work/filter_occlusions.py score --out work/occlusion_scores.json
# render HTML contact sheet from a scores.json
python work/filter_occlusions.py report --scores work/test_batch_occlusion/scores.json \
--out work/test_batch_occlusion
Notes:
- This script never modifies facesets_swap_ready/. An --apply step lives elsewhere
(or will be added once thresholds are validated).
- Model: open_clip ViT-L-14 / dfn2b_s39b (best public zero-shot at this size).
"""
from __future__ import annotations
import argparse
import json
import sys
import time
from pathlib import Path
from typing import Iterable
import torch
from PIL import Image
import open_clip
# WSL-side root of the exported corpus; faceset dirs live directly under it.
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
# Same root as seen from Windows (for the DML worker's queue files).
WIN_ROOT = r"E:\temp_things\fcswp\nl_sorted\facesets_swap_ready"
# open_clip model spec used for zero-shot occlusion scoring.
MODEL_NAME = "ViT-L-14"
PRETRAINED = "dfn2b_s39b"
def wsl_to_win(wsl_path: str) -> str:
    """Map a /mnt/<drive>/... WSL path onto its <DRIVE>:\\... Windows form.

    Paths that do not start with "/mnt/" are returned unchanged.
    """
    text = str(wsl_path)
    if not text.startswith("/mnt/"):
        return text
    drive_letter = text[5].upper()
    tail = text[7:].replace("/", "\\")
    return f"{drive_letter}:\\{tail}"
# Prompt ensembles. Each pair (positive, negative) becomes one binary classifier.
# We average text embeddings within each list, then softmax across the two means.
# Consumed by build_text_features(); score_images() reports P(pos) per attribute.
PROMPTS = {
    "mask": {
        "pos": [
            "a photo of a person wearing a surgical face mask",
            "a photo of a person wearing an FFP2 respirator covering mouth and nose",
            "a photo of a person wearing a cloth face mask",
            "a face partially covered by a medical mask",
            "a person whose mouth and nose are hidden by a face mask",
        ],
        "neg": [
            "a photo of a person's face with mouth and nose clearly visible",
            "a clear, unobstructed photo of a face",
            "a photo of a face without any mask or covering",
            "a portrait of a person showing their full face",
            "a photo of a person with a beard and visible mouth",  # avoid beard false positives
        ],
    },
    "sunglasses": {
        # We want to flag ONLY images where sunglasses occlude the eyes.
        # False positives to defeat: sunglasses pushed up on the head/forehead, hanging on a shirt collar.
        "pos": [
            "a face with dark sunglasses covering the eyes",
            "a portrait with the eyes hidden behind opaque sunglasses",
            "a person wearing dark sunglasses over their eyes, eyes not visible",
            "a face where the eyes are completely concealed by tinted lenses",
            "a close-up portrait wearing aviator sunglasses on the eyes",
        ],
        "neg": [
            "a portrait with both eyes clearly visible and uncovered",
            "a face with sunglasses pushed up on the forehead, eyes visible below",
            "a face with sunglasses resting on top of the head, eyes visible",
            "a person with sunglasses hanging from their shirt, eyes visible",
            "a face wearing clear prescription eyeglasses with visible eyes",
            "a portrait with no eyewear and visible eyes",
        ],
    },
}
def load_model(device: str = "cpu"):
    """Instantiate the CLIP model, preprocess transform and tokenizer; return
    them together with the learned softmax temperature (exp of logit_scale)."""
    print(f"[clip] loading {MODEL_NAME} / {PRETRAINED} on {device} ...", file=sys.stderr)
    started = time.time()
    model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED)
    tokenizer = open_clip.get_tokenizer(MODEL_NAME)
    model = model.to(device).eval()
    logit_scale = float(model.logit_scale.exp().detach().cpu())
    print(f"[clip] ready in {time.time()-started:.1f}s, logit_scale={logit_scale:.2f}", file=sys.stderr)
    return model, preprocess, tokenizer, logit_scale
@torch.no_grad()
def build_text_features(model, tokenizer, device: str):
    """Return {attr: (pos_mean_emb, neg_mean_emb)} on device, both L2-normalized."""
    def _ensemble(prompt_list):
        # Encode each prompt, normalize per-prompt, then normalize the mean.
        tokens = tokenizer(prompt_list).to(device)
        emb = model.encode_text(tokens)
        emb = emb / emb.norm(dim=-1, keepdim=True)
        centroid = emb.mean(dim=0)
        return centroid / centroid.norm()

    return {
        attr: (_ensemble(sides["pos"]), _ensemble(sides["neg"]))
        for attr, sides in PROMPTS.items()
    }
@torch.no_grad()
def score_images(model, preprocess, text_feats, logit_scale: float, paths: list[Path], device: str, batch: int = 16):
    """Yield (path, {attr: pos_prob}) per image. logit_scale is CLIP's learned temperature (~100)."""
    for start in range(0, len(paths), batch):
        tensors = []
        loaded = []
        for path in paths[start:start + batch]:
            try:
                pil = Image.open(path).convert("RGB")
                tensors.append(preprocess(pil))
                loaded.append(path)
            except Exception as e:
                print(f"[skip] {path}: {e}", file=sys.stderr)
        if not tensors:
            continue
        image_feats = model.encode_image(torch.stack(tensors).to(device))
        image_feats = image_feats / image_feats.norm(dim=-1, keepdim=True)  # (B, D)
        prob_by_attr = {}
        for attr, (pos, neg) in text_feats.items():
            logits = torch.stack([image_feats @ pos, image_feats @ neg], dim=1) * logit_scale  # (B, 2)
            prob_by_attr[attr] = logits.softmax(dim=1)[:, 0].tolist()  # P(pos)
        for idx, path in enumerate(loaded):
            yield path, {attr: prob_by_attr[attr][idx] for attr in text_feats}
def iter_facesets(root: Path, only: list[str] | None) -> Iterable[Path]:
    """Yield faceset directories under *root*.

    With an explicit ``only`` list, yield exactly those names (warning on
    anything that is not a directory); otherwise yield every directory whose
    name does not start with "_" (quarantine dirs), in sorted order.
    """
    if only:
        for name in only:
            candidate = root / name
            if not candidate.is_dir():
                print(f"[warn] not a directory: {candidate}", file=sys.stderr)
                continue
            yield candidate
        return
    yield from (
        child for child in sorted(root.iterdir())
        if child.is_dir() and not child.name.startswith("_")
    )
def cmd_score(args):
    """Score facesets on the local (WSL) machine.

    Loads the CLIP model on CUDA when available (else CPU), scores every PNG
    of every selected faceset against the mask/sunglasses prompt ensembles,
    and writes a per-faceset report json to --out.

    Fix: removed the dead ``if args.sample_per_faceset: pass`` placeholder —
    sampling is already implemented inside the per-faceset loop below.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess, tokenizer, logit_scale = load_model(device)
    text_feats = build_text_features(model, tokenizer, device)
    only = [s.strip() for s in args.facesets.split(",")] if args.facesets else None
    facesets = list(iter_facesets(ROOT, only))
    report = {
        "model": f"{MODEL_NAME}/{PRETRAINED}",
        "root": str(ROOT),
        "prompts": PROMPTS,
        "facesets": {},
    }
    total_imgs = 0
    t0 = time.time()
    for fs in facesets:
        # facesets keep PNGs either under faces/ or directly in the faceset dir
        faces = sorted((fs / "faces").glob("*.png")) if (fs / "faces").is_dir() else sorted(fs.glob("*.png"))
        if args.sample_per_faceset:
            # first N PNGs per faceset: cheap deterministic sample for test batches
            faces = faces[: args.sample_per_faceset]
        if not faces:
            continue
        print(f"[scan] {fs.name}: {len(faces)} png", file=sys.stderr)
        per_image = []
        for p, scores in score_images(model, preprocess, text_feats, logit_scale, faces, device):
            per_image.append({"file": p.name, "mask": round(scores["mask"], 4), "sunglasses": round(scores["sunglasses"], 4)})
            total_imgs += 1
        report["facesets"][fs.name] = per_image
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(report, indent=2))
    dt = time.time() - t0
    print(f"[done] {total_imgs} images, {dt:.1f}s ({total_imgs/max(dt,1e-3):.2f} img/s) -> {out}", file=sys.stderr)
def cmd_report(args):
    """Render an HTML contact sheet from scores.json. Generates JPG thumbs.

    Reads the per-faceset scores json (args.scores), writes thumbs/<faceset>/
    JPEG thumbnails plus an index.html under args.out. Within each faceset,
    images are sorted by max(mask, sunglasses) descending so borderline cases
    cluster at the 0.70 / 0.40 color boundaries.
    """
    import io  # NOTE(review): appears unused in this function — confirm before removing
    scores = json.loads(Path(args.scores).read_text())
    out_dir = Path(args.out)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(parents=True, exist_ok=True)
    THUMB = 160  # max thumbnail edge, pixels
    rows_html = []
    def thumb_path(faceset: str, fname: str) -> Path:
        # Per-faceset thumb directory, created lazily; <stem>.png -> <stem>.jpg
        d = thumbs_dir / faceset
        d.mkdir(parents=True, exist_ok=True)
        return d / (Path(fname).stem + ".jpg")
    def make_thumb(src: Path, dst: Path):
        # Idempotent: existing thumbs are reused on report re-runs.
        if dst.exists():
            return
        try:
            img = Image.open(src).convert("RGB")
            img.thumbnail((THUMB, THUMB), Image.LANCZOS)
            img.save(dst, "JPEG", quality=82)
        except Exception as e:
            # best-effort: a missing/corrupt source only loses its thumbnail
            print(f"[thumb-skip] {src}: {e}", file=sys.stderr)
    facesets = scores["facesets"]
    for faceset, items in facesets.items():
        # sort: high score first so borderline cases group at the boundary
        items_sorted = sorted(items, key=lambda x: max(x["mask"], x["sunglasses"]), reverse=True)
        # faceset summary (hit counts at the 0.7 image threshold)
        n = len(items)
        n_mask = sum(1 for x in items if x["mask"] >= 0.7)
        n_sg = sum(1 for x in items if x["sunglasses"] >= 0.7)
        pct_mask = (100 * n_mask / n) if n else 0
        pct_sg = (100 * n_sg / n) if n else 0
        rows_html.append(f"<h2 id='{faceset}'>{faceset} <small>({n} imgs &middot; mask&ge;0.7: {n_mask} ({pct_mask:.0f}%) &middot; sunglasses&ge;0.7: {n_sg} ({pct_sg:.0f}%))</small></h2>")
        rows_html.append("<div class='grid'>")
        src_root = ROOT / faceset
        # PNGs live either under faces/ or directly in the faceset dir
        faces_root = (src_root / "faces") if (src_root / "faces").is_dir() else src_root
        for it in items_sorted:
            src = faces_root / it["file"]
            dst = thumb_path(faceset, it["file"])
            make_thumb(src, dst)
            rel = f"thumbs/{faceset}/{Path(it['file']).stem}.jpg"
            m, s = it["mask"], it["sunglasses"]
            # color classes: hi >= 0.7, mid >= 0.4, lo otherwise
            cls_m = "hi" if m >= 0.7 else ("mid" if m >= 0.4 else "lo")
            cls_s = "hi" if s >= 0.7 else ("mid" if s >= 0.4 else "lo")
            rows_html.append(
                f"<div class='cell'>"
                f"<img src='{rel}' loading='lazy' title='{it['file']}'>"
                f"<div class='scores'><span class='{cls_m}'>M {m:.2f}</span> <span class='{cls_s}'>S {s:.2f}</span></div>"
                f"</div>"
            )
        rows_html.append("</div>")
    nav = " · ".join(f"<a href='#{f}'>{f}</a>" for f in facesets)
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Occlusion test batch</title>
<style>
body {{ font-family: system-ui, sans-serif; background: #111; color: #eee; padding: 1em; }}
h1 {{ margin-top: 0; }}
h2 {{ margin-top: 1.5em; border-bottom: 1px solid #333; padding-bottom: .25em; }}
small {{ color: #999; font-weight: normal; }}
.grid {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(170px, 1fr)); gap: .5em; }}
.cell {{ background: #1c1c1c; padding: 4px; border-radius: 4px; text-align: center; }}
.cell img {{ max-width: 100%; height: auto; display: block; margin: 0 auto; }}
.scores {{ font-family: monospace; font-size: 11px; padding-top: 4px; }}
.hi {{ color: #ff5050; font-weight: bold; }}
.mid {{ color: #ffb050; }}
.lo {{ color: #5fa05f; }}
.nav {{ position: sticky; top: 0; background: #111; padding: .5em 0; border-bottom: 1px solid #333; }}
a {{ color: #6cf; }}
</style></head>
<body>
<h1>Occlusion scores &mdash; {scores['model']}</h1>
<p>Sorted within each faceset by max(mask, sunglasses) descending.
Color: <span class='hi'>&ge;0.70</span> &middot; <span class='mid'>0.40&ndash;0.70</span> &middot; <span class='lo'>&lt;0.40</span></p>
<div class='nav'>{nav}</div>
{''.join(rows_html)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
"""Mirror of sort_faces.py:_zip_png_list. Renames PNGs to 0000.png, 0001.png, ..."""
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def cmd_apply(args):
    """Prune mask/sunglasses PNGs, quarantine occlusion-dominated facesets,
    re-zip .fsz, update top-level manifest. --dry-run prints the plan only.

    Per-faceset decision, in priority order:
      * quarantine_masked -- mask or sunglasses hit-rate >= --domain-pct
      * quarantine_thin   -- pruning would leave fewer than --min-survivors
      * prune             -- some PNGs flagged, enough survivors remain
      * keep              -- nothing flagged
    The full plan is always written to --out-plan for audit, even on dry runs.
    """
    import shutil
    threshold = args.threshold
    domain_pct = args.domain_pct
    min_survivors = args.min_survivors
    top_n_target = args.top_n
    scores = json.loads(Path(args.scores).read_text())
    master_path = ROOT / "manifest.json"
    master = json.loads(master_path.read_text())
    by_name = {f["name"]: f for f in master.get("facesets", [])}
    masked_dir = ROOT / "_masked"
    thin_dir = ROOT / "_thin"
    # ---- planning pass: read-only, builds one record per scored faceset ----
    plan = []
    for faceset, items in scores["facesets"].items():
        if faceset not in by_name:
            print(f"[warn] {faceset} not in master manifest — skipping", file=sys.stderr)
            continue
        n = len(items)
        # a PNG is flagged if EITHER attribute clears the image threshold
        flagged_files = sorted(
            it["file"] for it in items
            if it["mask"] >= threshold or it["sunglasses"] >= threshold
        )
        # NOTE(review): set(flagged_files) is rebuilt per element — hoisting it
        # out of the comprehension would avoid O(n^2) work on large facesets
        survivors_items = [it for it in items if it["file"] not in set(flagged_files)]
        # preserve quality order from filename (0001.png is highest-rank)
        survivors_files = sorted(it["file"] for it in survivors_items)
        n_mask = sum(1 for it in items if it["mask"] >= threshold)
        n_sg = sum(1 for it in items if it["sunglasses"] >= threshold)
        pct_mask = n_mask / n if n else 0
        pct_sg = n_sg / n if n else 0
        if pct_mask >= domain_pct:
            action, reason = "quarantine_masked", f"mask_pct={pct_mask:.0%}"
        elif pct_sg >= domain_pct:
            action, reason = "quarantine_masked", f"sunglasses_pct={pct_sg:.0%}"
        elif flagged_files and len(survivors_files) < min_survivors:
            # only quarantine-as-thin if pruning is the cause of the drop below threshold;
            # pre-existing small facesets without occlusions are left alone
            action, reason = "quarantine_thin", f"survivors={len(survivors_files)}<{min_survivors}"
        elif flagged_files:
            action, reason = "prune", f"drop {len(flagged_files)}"
        else:
            action, reason = "keep", "clean"
        plan.append({
            "faceset": faceset, "action": action, "reason": reason,
            "n": n, "n_mask": n_mask, "n_sg": n_sg,
            "n_dropped": len(flagged_files), "n_survivors": len(survivors_files),
            "dropped_files": flagged_files,
        })
    # Summary
    counts = {a: 0 for a in ("keep", "prune", "quarantine_masked", "quarantine_thin")}
    for p in plan:
        counts[p["action"]] += 1
    total_dropped_pngs = sum(p["n_dropped"] for p in plan if p["action"] == "prune")
    total_quarantined_pngs = sum(p["n"] for p in plan if p["action"].startswith("quarantine"))
    print(f"=== plan summary (threshold={threshold} domain_pct={domain_pct} min_survivors={min_survivors}) ===")
    for a, c in counts.items():
        print(f" {a}: {c}")
    print(f" PNGs to drop (prune): {total_dropped_pngs}")
    print(f" PNGs to quarantine (whole): {total_quarantined_pngs}")
    print(f" facesets in master: {len(master['facesets'])}")
    print(f" facesets scored: {len(plan)}")
    # Write plan for audit
    plan_path = Path(args.out_plan)
    plan_path.parent.mkdir(parents=True, exist_ok=True)
    plan_path.write_text(json.dumps({
        "thresholds": {"image": threshold, "domain_pct": domain_pct, "min_survivors": min_survivors},
        "counts": counts,
        "totals": {"dropped_pngs": total_dropped_pngs, "quarantined_pngs": total_quarantined_pngs},
        "plan": plan,
    }, indent=2))
    print(f" plan written to {plan_path}")
    if args.dry_run:
        # pretty list of quarantines
        for p in plan:
            if p["action"].startswith("quarantine"):
                print(f" [{p['action']:>18s}] {p['faceset']} ({p['reason']}, n={p['n']})")
        return
    # ----- destructive section -----
    masked_dir.mkdir(parents=True, exist_ok=True)
    thin_dir.mkdir(parents=True, exist_ok=True)
    new_facesets = []
    new_masked = list(master.get("masked", []))  # preserve any prior runs
    new_thin = list(master.get("thin_eras", []))
    # build a name -> existing-thin/masked entry index, to update relpath if we re-quarantine
    # NOTE(review): by_name_thin / by_name_masked are built but never read below — confirm intent
    by_name_thin = {e["name"]: e for e in new_thin}
    by_name_masked = {e["name"]: e for e in new_masked}
    for p in plan:
        entry = dict(by_name[p["faceset"]])  # copy
        fs_dir = ROOT / p["faceset"]
        faces_dir = fs_dir / "faces"
        if p["action"] == "keep":
            new_facesets.append(entry)
            continue
        # prune dropped PNGs first (applies to both prune and quarantine_thin paths)
        if p["dropped_files"]:
            dropped_holding = faces_dir / "_dropped"
            dropped_holding.mkdir(exist_ok=True)
            for fname in p["dropped_files"]:
                src = faces_dir / fname
                if src.exists():
                    shutil.move(str(src), str(dropped_holding / fname))
        if p["action"].startswith("quarantine"):
            target_root = masked_dir if p["action"] == "quarantine_masked" else thin_dir
            target = target_root / p["faceset"]
            if target.exists():
                # idempotency: if a previous run already moved it, skip move
                pass
            else:
                shutil.move(str(fs_dir), str(target))
            entry["occlusion_filter"] = {
                "action": p["action"], "reason": p["reason"],
                "n_input": p["n"], "n_mask": p["n_mask"], "n_sg": p["n_sg"],
                "n_dropped": p["n_dropped"], "n_survivors": p["n_survivors"],
                "threshold": threshold, "domain_pct": domain_pct,
            }
            # quarantined facesets lose their .fsz entries; relpath points into
            # the quarantine dir so the move stays reversible
            entry["relpath"] = f"{'_masked' if p['action']=='quarantine_masked' else '_thin'}/{p['faceset']}"
            entry["fsz_top"] = None
            entry["fsz_all"] = None
            if p["action"] == "quarantine_masked":
                entry["masked"] = True
                new_masked.append(entry)
            else:
                entry["thin"] = True
                new_thin.append(entry)
            continue
        # action == prune
        survivor_pngs = sorted([pp for pp in faces_dir.glob("*.png")])
        if not survivor_pngs:
            print(f"[warn] {p['faceset']}: no survivor PNGs after prune", file=sys.stderr)
            new_facesets.append(entry)
            continue
        # re-zip .fsz from survivors in quality order
        top_n_eff = min(top_n_target, len(survivor_pngs))
        top_fsz = fs_dir / f"{p['faceset']}_top{top_n_eff}.fsz"
        all_fsz = fs_dir / f"{p['faceset']}_all.fsz"
        # remove old .fsz files (they may have different top_n in name)
        for old in fs_dir.glob("*.fsz"):
            old.unlink()
        _zip_png_list(survivor_pngs[:top_n_eff], top_fsz)
        if len(survivor_pngs) > top_n_eff:
            _zip_png_list(survivor_pngs, all_fsz)
            entry["fsz_all"] = all_fsz.name
        else:
            entry["fsz_all"] = None
        entry["fsz_top"] = top_fsz.name
        entry["top_n"] = top_n_eff
        entry["exported"] = len(survivor_pngs)
        entry["dropped_occlusion"] = p["n_dropped"]
        entry["occlusion_filter"] = {
            "action": "prune", "n_input": p["n"], "n_mask": p["n_mask"],
            "n_sg": p["n_sg"], "n_dropped": p["n_dropped"], "n_survivors": p["n_survivors"],
            "threshold": threshold,
        }
        new_facesets.append(entry)
    # write updated master manifest (atomic: tmp file then replace)
    new_master = dict(master)
    new_master["facesets"] = new_facesets
    new_master["masked"] = new_masked
    new_master["thin_eras"] = new_thin
    new_master["occlusion_filter_run"] = {
        "model": scores.get("model"),
        "threshold": threshold,
        "domain_pct": domain_pct,
        "min_survivors": min_survivors,
        "counts": counts,
        "totals": {"dropped_pngs": total_dropped_pngs, "quarantined_pngs": total_quarantined_pngs},
    }
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(new_master, indent=2))
    tmp.replace(master_path)
    print(f"[done] master manifest updated: {len(new_facesets)} active, "
          f"{len(new_masked)} masked, {len(new_thin)} thin")
def cmd_stage(args):
    """Walk facesets_swap_ready/ and write a queue.json for the Windows clip_worker."""
    selected = [s.strip() for s in args.facesets.split(",")] if args.facesets else None
    entries = []
    for fs_dir in iter_facesets(ROOT, selected):
        png_root = fs_dir / "faces"
        pngs = sorted(png_root.glob("*.png")) if png_root.is_dir() else sorted(fs_dir.glob("*.png"))
        entries.extend(
            {
                "wsl_path": str(png),
                "win_path": wsl_to_win(str(png)),
                "faceset": fs_dir.name,
                "file": png.name,
            }
            for png in pngs
        )
    dst = Path(args.out)
    dst.parent.mkdir(parents=True, exist_ok=True)
    dst.write_text(json.dumps(entries, indent=2))
    print(f"[stage] {len(entries)} png paths -> {dst}", file=sys.stderr)
    print(f"[stage] win queue file: {wsl_to_win(str(dst))}", file=sys.stderr)
def cmd_merge(args):
    """Ingest worker scores.json into the per-faceset shape that `report` reads."""
    worker = json.loads(Path(args.scores).read_text())
    grouped: dict[str, list] = {}
    for rec in worker.get("results", []):
        grouped.setdefault(rec["faceset"], []).append({
            "file": rec["file"],
            "mask": rec["mask"],
            "sunglasses": rec["sunglasses"],
        })
    # stable ordering: faceset by name, files by name
    merged = {
        "model": worker.get("model", f"{MODEL_NAME}/{PRETRAINED}"),
        "root": str(ROOT),
        "prompts": worker.get("prompts", PROMPTS),
        "facesets": {name: sorted(rows, key=lambda r: r["file"]) for name, rows in sorted(grouped.items())},
    }
    dst = Path(args.out)
    dst.parent.mkdir(parents=True, exist_ok=True)
    dst.write_text(json.dumps(merged, indent=2))
    n_scores = sum(len(rows) for rows in grouped.values())
    print(f"[merge] {n_scores} scores across {len(grouped)} facesets -> {dst}", file=sys.stderr)
def main():
    """CLI entry point: score / stage / merge / report / apply."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    score_p = commands.add_parser("score", help="WSL CPU scoring (slow but no GPU dependency)")
    score_p.add_argument("--facesets", default=None, help="comma-separated faceset names; default = all")
    score_p.add_argument("--sample-per-faceset", type=int, default=0, help="cap PNGs per faceset (0 = all)")
    score_p.add_argument("--out", required=True)
    score_p.set_defaults(func=cmd_score)

    stage_p = commands.add_parser("stage", help="Build queue.json for Windows clip_worker.py")
    stage_p.add_argument("--facesets", default=None, help="comma-separated faceset names; default = all")
    stage_p.add_argument("--out", required=True)
    stage_p.set_defaults(func=cmd_stage)

    merge_p = commands.add_parser("merge", help="Convert worker scores.json into per-faceset report format")
    merge_p.add_argument("--scores", required=True, help="worker output (flat list of results)")
    merge_p.add_argument("--out", required=True, help="output path for per-faceset format")
    merge_p.set_defaults(func=cmd_merge)

    report_p = commands.add_parser("report", help="Render HTML contact sheet from a per-faceset scores.json")
    report_p.add_argument("--scores", required=True)
    report_p.add_argument("--out", required=True)
    report_p.set_defaults(func=cmd_report)

    apply_p = commands.add_parser("apply", help="Prune flagged PNGs, quarantine dominated facesets, re-zip .fsz, update manifest")
    apply_p.add_argument("--scores", required=True, help="per-faceset scores.json (output of `merge` or `score`)")
    apply_p.add_argument("--out-plan", required=True, help="path to write the apply plan json (audit)")
    apply_p.add_argument("--threshold", type=float, default=0.7, help="image-level drop threshold for mask/sunglasses (default 0.7)")
    apply_p.add_argument("--domain-pct", type=float, default=0.40, help="faceset-level quarantine threshold (default 0.40)")
    apply_p.add_argument("--min-survivors", type=int, default=5, help="quarantine to _thin if survivors below this (default 5)")
    apply_p.add_argument("--top-n", type=int, default=30, help="top-N for re-zipped _topN.fsz (default 30)")
    apply_p.add_argument("--dry-run", action="store_true", help="print plan only, no filesystem changes")
    apply_p.set_defaults(func=cmd_apply)

    ns = parser.parse_args()
    ns.func(ns)


if __name__ == "__main__":
    main()

144
work/multiface_worker.py Normal file
View File

@@ -0,0 +1,144 @@
"""Windows / DirectML multi-face audit worker.
For every PNG in queue.json, run insightface FaceAnalysis and record how many
faces were detected (filtering by det_score>=MIN_DET and face_short>=MIN_PIX).
Surfaces the load-bearing roop invariant: each .fsz PNG must hold exactly one
face, otherwise the loader's `extract_face_images` appends every detected face
into the FaceSet and pollutes the averaged identity embedding.
CLI:
py -3.12 multiface_worker.py <queue.json> <out_results.json> [--limit N]
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
import numpy as np
from PIL import Image, ImageOps
from insightface.app import FaceAnalysis
# insightface model cache on the Windows side (buffalo_l bundle lives here).
MODEL_ROOT = r"C:\face_embed_venv\models"
# detections below this confidence are ignored
MIN_DET = 0.5
# detections whose shorter bbox side is under this many pixels are ignored
MIN_FACE_PIX = 40
# checkpoint results.json every N images (also time-based; see main loop)
FLUSH_EVERY = 200
def load_existing(out_path: Path):
    """Load a previous results file for resume support.

    Returns (data, processed_set); (None, empty set) when the file is absent
    or unparseable so the run starts fresh instead of crashing.
    """
    if not out_path.exists():
        return None, set()
    try:
        data = json.loads(out_path.read_text())
        return data, set(data.get("processed", []))
    except Exception as e:
        print(f"[warn] could not parse {out_path}: {e}; starting fresh", file=sys.stderr)
        return None, set()
def save_atomic(out_path: Path, data: dict):
    """Serialize *data* to a sibling .tmp.json, then atomically rename over
    *out_path* so a crash mid-write never leaves a truncated results file."""
    staging = out_path.with_suffix(".tmp.json")
    staging.write_text(json.dumps(data, indent=2))
    os.replace(staging, out_path)
def main():
    """Score every queued PNG with insightface and record its face count.

    Resumable: wsl_paths already present in an existing out file are skipped,
    and progress is checkpointed atomically every FLUSH_EVERY images or 30s.
    A face counts toward face_count only if det_score >= MIN_DET and the
    shorter bbox side is >= MIN_FACE_PIX; load failures record face_count -1.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("queue", type=Path)
    ap.add_argument("out", type=Path)
    ap.add_argument("--limit", type=int, default=None)
    args = ap.parse_args()
    queue = json.loads(args.queue.read_text())
    print(f"[queue] {len(queue)} entries from {args.queue}", flush=True)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    existing, processed = load_existing(args.out)
    if existing:
        print(f"[resume] {len(processed)} already scored", flush=True)
        results = existing.get("results", [])
    else:
        results = []
    # wsl_path is the dedup key: entries already scored are skipped on resume
    pending = [e for e in queue if e["wsl_path"] not in processed]
    if args.limit is not None:
        pending = pending[: args.limit]
    print(f"[pending] {len(pending)} entries", flush=True)
    if not pending:
        print("[done] nothing to do")
        return
    print("[load] FaceAnalysis with DmlExecutionProvider", flush=True)
    # DML first, CPU as fallback provider
    app = FaceAnalysis(
        name="buffalo_l",
        root=MODEL_ROOT,
        providers=["DmlExecutionProvider", "CPUExecutionProvider"],
    )
    app.prepare(ctx_id=0, det_size=(640, 640))
    n_done = 0
    n_load_err = 0
    last_flush = time.time()
    t_start = time.time()
    def flush():
        # atomic checkpoint: full results plus the processed-path set
        save_atomic(args.out, {
            "results": results,
            "processed": sorted(processed),
        })
    for entry in pending:
        try:
            with Image.open(entry["win_path"]) as im:
                im = ImageOps.exif_transpose(im)
                im = im.convert("RGB")
                rgb = np.array(im)
                # insightface expects BGR; reverse channels with a real copy
                bgr = rgb[:, :, ::-1].copy()
        except Exception as e:
            # unreadable image: record a sentinel and keep going
            # NOTE(review): exception detail `e` is not recorded — confirm intent
            n_load_err += 1
            results.append({
                "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"],
                "face_count": -1, "error": "load",
            })
            processed.add(entry["wsl_path"])
            n_done += 1
            continue
        faces = app.get(bgr)
        kept = 0
        for f in faces:
            if float(f.det_score) < MIN_DET:
                continue
            x1, y1, x2, y2 = [int(round(v)) for v in f.bbox]
            # shorter bbox side, clamped at 0 for degenerate boxes
            short = min(max(x2 - x1, 0), max(y2 - y1, 0))
            if short < MIN_FACE_PIX:
                continue
            kept += 1
        results.append({
            "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"],
            "face_count": kept,
        })
        processed.add(entry["wsl_path"])
        n_done += 1
        # periodic checkpoint + progress line (count- or time-triggered)
        if (n_done % FLUSH_EVERY == 0) or (time.time() - last_flush) > 30.0:
            flush()
            last_flush = time.time()
            elapsed = time.time() - t_start
            rate = n_done / max(0.1, elapsed)
            eta = (len(pending) - n_done) / max(0.1, rate) / 60.0
            print(f"[scan] {n_done}/{len(pending)} rate={rate:.2f} img/s eta={eta:.1f}min "
                  f"load_err={n_load_err}", flush=True)
    flush()
    elapsed = time.time() - t_start
    print(f"[done] {n_done} scored, {n_load_err} load errors, {elapsed:.1f}s "
          f"({n_done/max(0.1,elapsed):.2f} img/s) -> {args.out}", flush=True)


if __name__ == "__main__":
    main()