Add post-export corpus maintenance pipeline

Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages the crops; Windows
  scores them via DirectML in the new C:\clip_dml_venv. Image-level
  threshold 0.7; faceset-level quarantine at 40% domain dominance.
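
  A minimal sketch of the per-image zero-shot scoring (hedged: the prompt
  strings and helper name are illustrative assumptions, not clip_worker.py's
  exact code; only the model/tag and the 0.7 threshold come from this
  commit):

      import torch, open_clip
      from PIL import Image

      PROMPTS = ["a clear unobstructed face",        # clean
                 "a face covered by a medical mask", # masked
                 "a face wearing dark sunglasses"]   # sunglasses
      model, _, preprocess = open_clip.create_model_and_transforms(
          "ViT-L-14", pretrained="dfn2b_s39b")
      tokenizer = open_clip.get_tokenizer("ViT-L-14")

      def domain_scores(path: str) -> list[float]:
          # softmax over PROMPTS; flag when a non-clean domain exceeds 0.7
          img = preprocess(Image.open(path).convert("RGB")).unsqueeze(0)
          with torch.no_grad():
              iv = model.encode_image(img)
              tv = model.encode_text(tokenizer(PROMPTS))
              iv = iv / iv.norm(dim=-1, keepdim=True)
              tv = tv / tv.norm(dim=-1, keepdim=True)
              return (100.0 * iv @ tv.T).softmax(dim=-1)[0].tolist()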

- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees every within-group pair sits at sim >= the edge threshold.
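
  The linkage choice in one sketch (assumes unit-normalized per-faceset
  centroids; merge_groups and sim_edge are illustrative names):

      import numpy as np
      from scipy.cluster.hierarchy import fcluster, linkage
      from scipy.spatial.distance import pdist

      def merge_groups(centroids: np.ndarray, sim_edge: float) -> np.ndarray:
          # complete linkage cuts on the MAX pairwise distance inside a
          # cluster, so every within-group pair keeps sim >= sim_edge;
          # single linkage cuts on the MIN and chains across identities
          d = pdist(centroids, metric="cosine")  # 1 - cosine similarity
          return fcluster(linkage(d, method="complete"),
                          t=1.0 - sim_edge, criterion="distance")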

- age_extend_001.py: slots newly-added PNGs into existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered.

- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup in three
  passes: cross-family SHA256 byte-dedup (intra-family era duplication is
  deliberately preserved), within-faceset near-dup pruning at sim >= 0.95,
  and a multi-face audit (the load-bearing roop invariant). The multi-face
  worker hits ~19 img/s on AMD Vega, ~7x embed_worker's throughput, since
  its input is already-cropped 512x512 PNGs.
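
  The byte-dedup rule in sketch form (a "family" label derived from the
  faceset name is assumed; helper names are hypothetical):

      import hashlib
      from pathlib import Path

      def sha256_of(p: Path) -> str:
          h = hashlib.sha256()
          with p.open("rb") as f:
              for chunk in iter(lambda: f.read(1 << 20), b""):
                  h.update(chunk)
          return h.hexdigest()

      seen: dict[str, str] = {}  # digest -> family of first sighting
      def is_cross_family_dup(png: Path, family: str) -> bool:
          # intra-family repeats (era buckets) are deliberately kept
          return seen.setdefault(sha256_of(png), family) != family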

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. Master manifest gains masked[], merged[], plus per-run
provenance blocks.
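
  Illustrative shape of the new master-manifest fields (the element
  schemas for masked[] and merged[] are assumptions; the age_extend_runs
  block matches what age_extend_001.py writes):

      master = {
          "facesets": [...],  # existing per-faceset entries, unchanged
          "masked": [{"name": "faceset_NNN", "reason": "sunglasses"}],
          "merged": [{"into": "faceset_NNN", "absorbed": ["faceset_MMM"]}],
          "age_extend_runs": [{"parent": "faceset_001",
                               "thresholds": {"dist_max": 0.4, "year_max": 5},
                               "anchors": ["faceset_001_2011"],
                               "added_total": 0}],
      }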

Three new docs/analysis/ writeups cover model choice, threshold rationale,
and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 15:41:18 +02:00
parent e66c97fd58
commit 49a43c7685
10 changed files with 3250 additions and 1 deletion

work/age_extend_001.py (new file, 576 lines)

@@ -0,0 +1,576 @@
"""Extend the existing 6 era buckets of faceset_001 by absorbing PNGs that
post-date the original age_split run (from consolidation merges, etc.).
Mirrors the anchor-fragment assignment logic in age_split_001.py:
- For each unbucketed face in faceset_001's manifest, find the nearest active
era anchor by cosine distance to the anchor's centroid.
- Accept the assignment iff dist <= 0.40 AND |year_delta| <= 5
(where year_delta = exif_year(face) - dom_year(anchor)).
- Undated PNGs are skipped (no assignment).
- Anchors are NOT re-centered after absorption (preserves the same drift
guarantees as the original age_split).
CLI:
python work/age_extend_001.py analyze --out work/age_extend/candidates.json
python work/age_extend_001.py report --candidates ... --out work/age_extend
python work/age_extend_001.py apply --candidates ... [--dry-run]
"""
from __future__ import annotations
import argparse
import json
import shutil
import sys
import time
from collections import Counter
from pathlib import Path
import numpy as np
from PIL import Image, ExifTags
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
PARENT = "faceset_001"
ACTIVE_ERAS = [
"faceset_001_2005-10",
"faceset_001_2010-13",
"faceset_001_2011",
"faceset_001_2014-17",
"faceset_001_2018-19",
"faceset_001_2018-20",
]
CACHES = [
Path("/opt/face-sets/work/cache/nl_full.npz"),
Path("/opt/face-sets/work/cache/immich_peter.npz"),
Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
EXIF_CACHE = Path("/opt/face-sets/work/cache/age_split_exif.json")
# anchor-fragment thresholds (mirror age_split_001.py)
DIST_MAX = 0.40
YEAR_MAX = 5
# ----------------------------- caches -----------------------------
def load_caches():
rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
alias_map: dict[str, str] = {}
for c in CACHES:
if not c.exists():
print(f"[warn] cache missing: {c}", file=sys.stderr)
continue
d = np.load(c, allow_pickle=True)
emb = d["embeddings"]
meta = json.loads(str(d["meta"]))
face_records = [m for m in meta if not m.get("noface")]
if len(face_records) != len(emb):
raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}")
if "path_aliases" in d.files:
paliases = json.loads(str(d["path_aliases"]))
for canon, alist in paliases.items():
alias_map.setdefault(canon, canon)
for a in alist:
alias_map[a] = canon
for i, rec in enumerate(face_records):
p = rec["path"]
bbox = tuple(int(x) for x in rec["bbox"])
v = emb[i].astype(np.float32)
n = float(np.linalg.norm(v))
if n > 0:
v = v / n
rec_index[(p, bbox)] = v
alias_map.setdefault(p, p)
print(f"[cache] indexed {len(rec_index)} face records, {len(alias_map)} aliases", file=sys.stderr)
return rec_index, alias_map
def lookup_emb(rec_index, alias_map, src: str, bbox):
bbox_t = tuple(int(x) for x in bbox)
canon = alias_map.get(src, src)
v = rec_index.get((canon, bbox_t))
if v is None and canon != src:
v = rec_index.get((src, bbox_t))
return v
# ----------------------------- exif -----------------------------
def load_exif_cache():
if not EXIF_CACHE.exists():
return {}
return json.loads(EXIF_CACHE.read_text())
def save_exif_cache(cache):
tmp = EXIF_CACHE.with_suffix(".tmp.json")
tmp.write_text(json.dumps(cache, indent=2))
tmp.replace(EXIF_CACHE)
def exif_year(path: Path) -> int | None:
try:
with Image.open(path) as im:
ex = im._getexif()
if not ex:
return None
for tag_id, val in ex.items():
tag = ExifTags.TAGS.get(tag_id, tag_id)
if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
return int(val[:4])
except Exception:
return None
return None
def get_year(src: str, exif_cache) -> int | None:
"""Return EXIF year for src, using cache. Mutates cache for new lookups."""
if src in exif_cache:
return exif_cache[src]
p = Path(src)
y = exif_year(p) if p.exists() else None
exif_cache[src] = y
return y
# ----------------------------- analyze -----------------------------
def cmd_analyze(args):
rec_index, alias_map = load_caches()
exif_cache = load_exif_cache()
exif_cache_dirty = False
parent_dir = ROOT / PARENT
parent_manifest = json.loads((parent_dir / "manifest.json").read_text())
parent_faces = parent_manifest.get("faces", [])
print(f"[parent] {PARENT}: {len(parent_faces)} face entries", file=sys.stderr)
# Build "in_bucket" set + each anchor's centroid + dom_year
anchors = []
in_bucket: set[tuple[str, tuple[int, int, int, int]]] = set()
for era in ACTIVE_ERAS:
ed = ROOT / era
if not ed.is_dir():
print(f"[warn] missing era bucket: {era}", file=sys.stderr)
continue
em = json.loads((ed / "manifest.json").read_text())
emb_list = []
years = []
n_missing_emb = 0
for f in em.get("faces", []):
src = f.get("source")
bbox = f.get("bbox")
if not src or not bbox:
continue
key = (alias_map.get(src, src), tuple(int(x) for x in bbox))
in_bucket.add(key)
in_bucket.add((src, tuple(int(x) for x in bbox))) # cover both alias and raw
v = lookup_emb(rec_index, alias_map, src, bbox)
if v is None:
n_missing_emb += 1
else:
emb_list.append(v)
            # check freshness BEFORE get_year: it caches the result, so the
            # old post-hoc `src not in exif_cache` test could never fire
            if src not in exif_cache:
                exif_cache_dirty = True
            y = get_year(src, exif_cache)
            if y is not None:
                years.append(y)
if not emb_list:
print(f"[warn] {era}: no embeddings found, skipping anchor", file=sys.stderr)
continue
arr = np.stack(emb_list).astype(np.float32)
c = arr.mean(axis=0)
n = float(np.linalg.norm(c))
if n > 0:
c = c / n
dom_year = Counter(years).most_common(1)[0][0] if years else None
anchors.append({
"name": era, "centroid": c, "n_faces": len(em.get("faces", [])),
"n_emb_used": len(emb_list), "n_emb_missing": n_missing_emb,
"dom_year": dom_year,
"year_min": min(years) if years else None,
"year_max": max(years) if years else None,
})
print(f"[anchor] {era}: n={len(em.get('faces', []))} emb_used={len(emb_list)} "
f"emb_miss={n_missing_emb} dom_year={dom_year} years=[{min(years) if years else '-'}..{max(years) if years else '-'}]",
file=sys.stderr)
# Find unbucketed faces in parent
unbucketed = []
for f in parent_faces:
src = f.get("source")
bbox = f.get("bbox")
if not src or not bbox:
continue
bbox_t = tuple(int(x) for x in bbox)
key1 = (alias_map.get(src, src), bbox_t)
key2 = (src, bbox_t)
if key1 in in_bucket or key2 in in_bucket:
continue
unbucketed.append(f)
print(f"[parent] {len(unbucketed)} unbucketed face entries (in {PARENT} but no era bucket)", file=sys.stderr)
# Score each unbucketed face against every anchor
proposals = []
skipped_no_emb = 0
skipped_no_year = 0
for f in unbucketed:
src = f["source"]
bbox = f["bbox"]
v = lookup_emb(rec_index, alias_map, src, bbox)
if v is None:
skipped_no_emb += 1
continue
        # as above: mark dirty before get_year caches the lookup
        if src not in exif_cache:
            exif_cache_dirty = True
        y = get_year(src, exif_cache)
        if y is None:
            skipped_no_year += 1
            continue
# nearest anchor
best = None # (dist, idx)
for i, a in enumerate(anchors):
d = 1.0 - float(np.dot(a["centroid"], v))
if best is None or d < best[0]:
best = (d, i)
if best is None:
continue
dist, bidx = best
anchor = anchors[bidx]
year_delta = abs(y - anchor["dom_year"]) if anchor["dom_year"] is not None else None
accept = (dist <= DIST_MAX and year_delta is not None and year_delta <= YEAR_MAX)
proposals.append({
"png": f["png"],
"source": src,
"bbox": [int(x) for x in bbox],
"year": y,
"rank_in_parent": f.get("rank"),
"quality_composite": f.get("quality", {}).get("composite"),
"quality": f.get("quality", {}),
"best_anchor": anchor["name"],
"best_anchor_dom_year": anchor["dom_year"],
"centroid_dist": round(dist, 4),
"year_delta": year_delta,
"accept": bool(accept),
"all_anchor_dists": {
a["name"]: round(1.0 - float(np.dot(a["centroid"], v)), 4) for a in anchors
},
})
if exif_cache_dirty:
save_exif_cache(exif_cache)
print(f"[exif] cache flushed ({len(exif_cache)} entries total)", file=sys.stderr)
# Summarize
accepted = [p for p in proposals if p["accept"]]
rejected = [p for p in proposals if not p["accept"]]
by_anchor = Counter(p["best_anchor"] for p in accepted)
print(f"[summary] unbucketed={len(unbucketed)} scored={len(proposals)} "
f"accepted={len(accepted)} rejected={len(rejected)} "
f"skipped(no_emb={skipped_no_emb}, no_year={skipped_no_year})", file=sys.stderr)
for k, v in by_anchor.most_common():
print(f" {k}: +{v}", file=sys.stderr)
out = {
"thresholds": {"dist_max": DIST_MAX, "year_max": YEAR_MAX},
"anchors": [
{k: v for k, v in a.items() if k != "centroid"}
for a in anchors
],
"n_unbucketed": len(unbucketed),
"skipped": {"no_emb": skipped_no_emb, "no_year": skipped_no_year},
"proposals": sorted(proposals, key=lambda p: (not p["accept"], p["best_anchor"], -1 * (p["quality_composite"] or 0))),
"by_anchor": dict(by_anchor),
}
op = Path(args.out)
op.parent.mkdir(parents=True, exist_ok=True)
op.write_text(json.dumps(out, indent=2))
print(f"[done] {len(proposals)} proposals -> {op}", file=sys.stderr)
# ----------------------------- report -----------------------------
def cmd_report(args):
cand = json.loads(Path(args.candidates).read_text())
out_dir = Path(args.out)
thumbs_dir = out_dir / "thumbs"
thumbs_dir.mkdir(parents=True, exist_ok=True)
THUMB = 140
def make_thumb(png_relpath: str) -> str:
# png_relpath looks like "faces/0042.png"
src = ROOT / PARENT / png_relpath
name = Path(png_relpath).stem
dst = thumbs_dir / f"{name}.jpg"
if not dst.exists():
try:
img = Image.open(src).convert("RGB")
img.thumbnail((THUMB, THUMB), Image.LANCZOS)
img.save(dst, "JPEG", quality=82)
except Exception as e:
print(f"[thumb-skip] {src}: {e}", file=sys.stderr)
return ""
return f"thumbs/{name}.jpg"
# group accepted proposals by target anchor
by_anchor: dict[str, list] = {}
rejected = []
for p in cand["proposals"]:
if p["accept"]:
by_anchor.setdefault(p["best_anchor"], []).append(p)
else:
rejected.append(p)
rows = []
rows.append("<h1>faceset_001 age extension &mdash; review</h1>")
rows.append(f"<p>{cand['n_unbucketed']} unbucketed faces in {PARENT}; "
f"{sum(len(v) for v in by_anchor.values())} accepted / {len(rejected)} rejected; "
f"thresholds dist&le;{cand['thresholds']['dist_max']} AND |year_delta|&le;{cand['thresholds']['year_max']}.</p>")
nav = " · ".join(f"<a href='#{a}'>{a} (+{len(by_anchor[a])})</a>" for a in by_anchor) + " · <a href='#rejected'>rejected</a>"
rows.append(f"<div class='nav'>{nav}</div>")
for anchor_name in ACTIVE_ERAS:
if anchor_name not in by_anchor:
continue
items = by_anchor[anchor_name]
anchor_meta = next((a for a in cand["anchors"] if a["name"] == anchor_name), {})
rows.append(f"<section id='{anchor_name}' class='grp'>")
rows.append(f"<h2>{anchor_name} <small>(dom_year={anchor_meta.get('dom_year')}; "
f"existing n={anchor_meta.get('n_faces')}; +{len(items)} new)</small></h2>")
rows.append("<div class='cells'>")
for p in sorted(items, key=lambda x: (x["centroid_dist"], -1 * (x["quality_composite"] or 0))):
thumb = make_thumb(p["png"])
cls = "hi" if p["centroid_dist"] <= 0.30 else "mid"
rows.append(
f"<div class='cell'>"
f"<img src='{thumb}' loading='lazy' title='{p['png']}'>"
f"<div class='meta'>{p['png']}<br>year {p['year']}{p['year_delta']})<br>"
f"<span class='{cls}'>dist {p['centroid_dist']:.3f}</span></div>"
f"</div>"
)
rows.append("</div></section>")
if rejected:
rows.append("<section id='rejected' class='grp rej'>")
rows.append(f"<h2>rejected <small>({len(rejected)} faces don't fit any anchor)</small></h2>")
rows.append("<div class='cells'>")
for p in sorted(rejected, key=lambda x: x["centroid_dist"])[:200]:
thumb = make_thumb(p["png"])
why = []
if p["centroid_dist"] > cand['thresholds']['dist_max']:
why.append(f"dist {p['centroid_dist']:.2f}>{cand['thresholds']['dist_max']}")
if p["year_delta"] is None or p["year_delta"] > cand['thresholds']['year_max']:
why.append(f"{p['year_delta']}>{cand['thresholds']['year_max']}")
rows.append(
f"<div class='cell'>"
f"<img src='{thumb}' loading='lazy'>"
f"<div class='meta'>{p['png']}<br>year {p['year']} → best {p['best_anchor']}<br>"
f"<span class='lo'>{'; '.join(why)}</span></div>"
f"</div>"
)
if len(rejected) > 200:
rows.append(f"<p>...{len(rejected)-200} more truncated.</p>")
rows.append("</div></section>")
html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>faceset_001 age extension</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1 {{ margin-top:0; }} h2 {{ margin:0; }}
small {{ color:#999; font-weight:normal; }}
section.grp {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
section.grp.rej {{ border-left:4px solid #ff5050; }}
.cells {{ display:flex; flex-wrap:wrap; gap:6px; }}
.cell {{ background:#222; border-radius:4px; padding:4px; width:160px; font-size:11px; font-family:monospace; text-align:center; }}
.cell img {{ height:140px; width:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.3; }}
.hi {{ color:#5fa05f; font-weight:bold; }}
.mid {{ color:#ffb050; }}
.lo {{ color:#ff5050; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:13px; }}
a {{ color:#6cf; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
out_html = out_dir / "index.html"
out_html.write_text(html)
print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- apply -----------------------------
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
import zipfile
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
for i, p in enumerate(pngs):
zf.write(p, arcname=f"{i:04d}.png")
def cmd_apply(args):
cand = json.loads(Path(args.candidates).read_text())
accepted = [p for p in cand["proposals"] if p["accept"]]
    if args.dry_run:
        by = Counter(p["best_anchor"] for p in accepted)  # Counter already imported at top
print(f"=== dry-run: {len(accepted)} assignments across {len(by)} anchors ===")
for k, v in by.most_common():
print(f" {k}: +{v}")
return
parent_dir = ROOT / PARENT
master_path = ROOT / "manifest.json"
master = json.loads(master_path.read_text())
facesets_by_name = {f["name"]: f for f in master.get("facesets", [])}
by_anchor: dict[str, list] = {}
for p in accepted:
by_anchor.setdefault(p["best_anchor"], []).append(p)
total_added = 0
for anchor_name, props in by_anchor.items():
ed = ROOT / anchor_name
em_path = ed / "manifest.json"
em = json.loads(em_path.read_text())
existing = list(em.get("faces", []))
# gather new entries with their source PNG paths in faceset_001/faces/
new_with_src = []
for p in props:
src_png = parent_dir / p["png"]
if not src_png.exists():
print(f"[warn] missing parent PNG {src_png}; skip", file=sys.stderr)
continue
face_entry = {
"source": p["source"],
"bbox": p["bbox"],
"quality": p["quality"],
"exif_year": p["year"],
"centroid_dist_at_assign": p["centroid_dist"],
"year_delta_at_assign": p["year_delta"],
"extended_from_parent": True,
}
new_with_src.append((face_entry, src_png))
# combine; rank by quality.composite desc (existing entries already have rank,
# but we re-rank globally so new entries slot in by quality)
combined: list[tuple[dict, Path | None]] = []
for f in existing:
combined.append((f, None))
combined.extend(new_with_src)
        # composite may be present-but-None; treat that as 0 (matches cmd_analyze)
        combined.sort(key=lambda x: -(x[0].get("quality", {}).get("composite") or 0))
# stage fresh
staging = ed / "_faces_new"
if staging.exists():
shutil.rmtree(staging)
staging.mkdir()
new_face_entries = []
for new_rank, (face, src_png_or_none) in enumerate(combined, start=1):
new_name = f"{new_rank:04d}.png"
if src_png_or_none is None:
# existing entry: copy from current era bucket faces/
old_name = Path(face["png"]).name
src = ed / "faces" / old_name
if not src.exists():
print(f"[warn] {anchor_name}: missing existing PNG {src}; skip", file=sys.stderr)
continue
shutil.copy2(src, staging / new_name)
else:
shutil.copy2(src_png_or_none, staging / new_name)
face = dict(face)
face["rank"] = new_rank
face["png"] = f"faces/{new_name}"
new_face_entries.append(face)
# swap dirs
old_holding = ed / "_faces_old"
if old_holding.exists():
shutil.rmtree(old_holding)
(ed / "faces").rename(old_holding)
staging.rename(ed / "faces")
shutil.rmtree(old_holding)
# re-zip .fsz
survivor_pngs = sorted((ed / "faces").glob("*.png"))
top_n = em.get("top_n", 30)
top_n_eff = min(top_n, len(survivor_pngs))
for old in ed.glob("*.fsz"):
old.unlink()
top_fsz_name = f"{anchor_name}_top{top_n_eff}.fsz"
all_fsz_name = f"{anchor_name}_all.fsz"
_zip_png_list(survivor_pngs[:top_n_eff], ed / top_fsz_name)
if len(survivor_pngs) > top_n_eff:
_zip_png_list(survivor_pngs, ed / all_fsz_name)
all_fsz_used = all_fsz_name
else:
all_fsz_used = None
# update local + master manifests
em["faces"] = new_face_entries
em["exported"] = len(new_face_entries)
em["fsz_top"] = top_fsz_name
em["fsz_all"] = all_fsz_used
em["top_n"] = top_n_eff
em.setdefault("age_extend_history", []).append({
"added": len(new_with_src),
"thresholds": cand["thresholds"],
})
em_path.write_text(json.dumps(em, indent=2))
if anchor_name in facesets_by_name:
facesets_by_name[anchor_name]["exported"] = len(new_face_entries)
facesets_by_name[anchor_name]["fsz_top"] = top_fsz_name
facesets_by_name[anchor_name]["fsz_all"] = all_fsz_used
facesets_by_name[anchor_name]["top_n"] = top_n_eff
added_here = len(new_with_src)
total_added += added_here
print(f"[applied] {anchor_name}: +{added_here} (now {len(new_face_entries)} faces)", file=sys.stderr)
# rewrite master with ordering preserved
new_facesets = []
for entry in master.get("facesets", []):
new_facesets.append(facesets_by_name.get(entry["name"], entry))
master["facesets"] = new_facesets
master.setdefault("age_extend_runs", []).append({
"parent": PARENT,
"thresholds": cand["thresholds"],
"anchors": list(by_anchor.keys()),
"added_total": total_added,
})
tmp = master_path.with_suffix(".tmp.json")
tmp.write_text(json.dumps(master, indent=2))
tmp.replace(master_path)
print(f"[done] +{total_added} faces across {len(by_anchor)} anchors", file=sys.stderr)
# ----------------------------- main -----------------------------
def main():
ap = argparse.ArgumentParser()
sub = ap.add_subparsers(dest="cmd", required=True)
a = sub.add_parser("analyze")
a.add_argument("--out", required=True)
a.set_defaults(func=cmd_analyze)
r = sub.add_parser("report")
r.add_argument("--candidates", required=True)
r.add_argument("--out", required=True)
r.set_defaults(func=cmd_report)
p = sub.add_parser("apply")
p.add_argument("--candidates", required=True)
p.add_argument("--dry-run", action="store_true")
p.set_defaults(func=cmd_apply)
args = ap.parse_args()
args.func(args)
if __name__ == "__main__":
main()