Add post-export corpus maintenance pipeline
Adds four new orchestration scripts that operate on an already-built
facesets_swap_ready/ to clean it up over time:

- filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses
  filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via
  the new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level
  quarantine at 40% domain dominance. (Scoring sketched below.)
- consolidate_facesets.py: duplicate-identity merger using complete-linkage
  centroid clustering on cached arcface embeddings. Single-linkage chains
  catastrophically (60-faceset clusters with min sim < 0); complete-linkage
  guarantees within-group sim >= the merge edge. (See the linkage sketch
  below.)
- age_extend_001.py: slots newly added PNGs into the existing era buckets of
  faceset_001 using the same anchor-fragment rule as age_split_001.py
  (dist <= 0.40 AND |year_delta| <= 5). Anchors are not re-centered.
- dedup_optimize.py + multiface_worker.py: corpus-wide cleanup in three
  passes: cross-family SHA256 byte-dedup (preserves intra-family era
  duplication), within-faceset near-dup pruning at sim >= 0.95, and a
  multi-face audit (the load-bearing roop invariant). The multi-face worker
  hits ~19 img/s on AMD Vega, ~7x embed_worker, because its input is
  512x512 crops. (Dedup and audit sketched below.)

Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged →
181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes
are preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full
reversibility. The master manifest gains masked[] and merged[] plus per-run
provenance blocks. Three new docs/analysis/ writeups cover model choice,
threshold rationale, and per-pass run results.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
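For reviewers without the other scripts in front of them, a minimal sketch of the zero-shot occlusion scoring step, assuming the open_clip model and pretrained tag named above; clip_worker.py is not part of this diff, so the prompt strings and the device setup (the Windows DML venv) are illustrative only:

```python
import torch
import open_clip
from PIL import Image

# Model name and pretrained tag are taken from the commit message; the
# DirectML device plumbing used by clip_worker.py is omitted here.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14", pretrained="dfn2b_s39b")
tokenizer = open_clip.get_tokenizer("ViT-L-14")
model.eval()

# Hypothetical prompt set; the shipped prompts live in clip_worker.py.
PROMPTS = [
    "a face covered by a medical mask",
    "a face wearing dark sunglasses",
    "an unobstructed human face",
]
TEXT = tokenizer(PROMPTS)

@torch.no_grad()
def occlusion_prob(png_path: str) -> float:
    """Probability mass on the occlusion prompts; >= 0.7 flags the image."""
    img = preprocess(Image.open(png_path).convert("RGB")).unsqueeze(0)
    img_f = model.encode_image(img)
    txt_f = model.encode_text(TEXT)
    img_f = img_f / img_f.norm(dim=-1, keepdim=True)
    txt_f = txt_f / txt_f.norm(dim=-1, keepdim=True)
    probs = (100.0 * img_f @ txt_f.T).softmax(dim=-1)[0]
    return float(probs[0] + probs[1])
```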
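The single- vs complete-linkage difference is concrete in scipy; a sketch (the merge cutoff `sim_min` is an assumption, since consolidate_facesets.py's actual threshold is not shown in this diff):

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

def merge_groups(centroids: np.ndarray, sim_min: float = 0.5) -> np.ndarray:
    """Label facesets so every within-group pair has cosine sim >= sim_min.

    centroids: (n_facesets, d) L2-normalized arcface centroids.
    """
    dists = pdist(centroids, metric="cosine")  # condensed 1 - cos_sim matrix
    # method="complete" merges on the *max* pairwise distance, so cutting the
    # dendrogram at 1 - sim_min guarantees within-group sim >= sim_min.
    # method="single" merges on the *min* distance and happily chains
    # A~B, B~C, ... into one giant cluster even when sim(A, Z) < 0.
    Z = linkage(dists, method="complete")
    return fcluster(Z, t=1.0 - sim_min, criterion="distance")
```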
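The cross-family byte-dedup pass reduces to one hash table; a sketch assuming one directory per faceset family with crops under faces/ (dedup_optimize.py is not in this diff):

```python
import hashlib
from pathlib import Path

def cross_family_dedup(family_dirs: dict[str, Path]) -> list[Path]:
    """Return PNGs that byte-duplicate a file first seen in another family.

    Duplicates *within* one family are deliberately kept: era buckets
    re-export the same parent crops, and that duplication is intentional.
    """
    first_owner: dict[str, str] = {}  # sha256 hex digest -> owning family
    drops: list[Path] = []
    for family, root in family_dirs.items():
        for png in sorted(root.glob("faces/*.png")):
            digest = hashlib.sha256(png.read_bytes()).hexdigest()
            if first_owner.setdefault(digest, family) != family:
                drops.append(png)  # same bytes already owned elsewhere
    return drops
```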
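Finally, the multi-face audit: roop picks one face per source crop, so an exported PNG in which a detector finds more than one face breaks the swap (reading the invariant as "exactly one face per crop"). A sketch assuming an insightface detector; the buffalo_l pack and det_size are assumptions, and multiface_worker.py's actual backend is not shown here:

```python
import cv2
from insightface.app import FaceAnalysis

# Detector pack and det_size are illustrative; the commit only fixes the
# input at 512x512 crops.
app = FaceAnalysis(name="buffalo_l")
app.prepare(ctx_id=0, det_size=(512, 512))

def violates_single_face_invariant(png_path: str) -> bool:
    """Flag crops where the detector does not find exactly one face."""
    img = cv2.imread(png_path)  # BGR ndarray, or None if unreadable
    return img is None or len(app.get(img)) != 1
```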
work/age_extend_001.py (new file, +576)
@@ -0,0 +1,576 @@
"""Extend the existing 6 era buckets of faceset_001 by absorbing PNGs that
post-date the original age_split run (from consolidation merges, etc.).

Mirrors the anchor-fragment assignment logic in age_split_001.py:
- For each unbucketed face in faceset_001's manifest, find the nearest active
  era anchor by cosine distance to the anchor's centroid.
- Accept the assignment iff dist <= 0.40 AND |year_delta| <= 5
  (where year_delta = exif_year(face) - dom_year(anchor)).
- Undated PNGs are skipped (no assignment).
- Anchors are NOT re-centered after absorption (preserves the same drift
  guarantees as the original age_split).

CLI:
    python work/age_extend_001.py analyze --out work/age_extend/candidates.json
    python work/age_extend_001.py report --candidates ... --out work/age_extend
    python work/age_extend_001.py apply --candidates ... [--dry-run]
"""

from __future__ import annotations

import argparse
import json
import shutil
import sys
from collections import Counter
from pathlib import Path

import numpy as np
from PIL import Image, ExifTags

ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
PARENT = "faceset_001"
ACTIVE_ERAS = [
    "faceset_001_2005-10",
    "faceset_001_2010-13",
    "faceset_001_2011",
    "faceset_001_2014-17",
    "faceset_001_2018-19",
    "faceset_001_2018-20",
]
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]
EXIF_CACHE = Path("/opt/face-sets/work/cache/age_split_exif.json")

# anchor-fragment thresholds (mirror age_split_001.py)
DIST_MAX = 0.40
YEAR_MAX = 5
# ----------------------------- caches -----------------------------

def load_caches():
    rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
    alias_map: dict[str, str] = {}
    for c in CACHES:
        if not c.exists():
            print(f"[warn] cache missing: {c}", file=sys.stderr)
            continue
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        meta = json.loads(str(d["meta"]))
        face_records = [m for m in meta if not m.get("noface")]
        if len(face_records) != len(emb):
            raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}")
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            p = rec["path"]
            bbox = tuple(int(x) for x in rec["bbox"])
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                v = v / n
            rec_index[(p, bbox)] = v
            alias_map.setdefault(p, p)
    print(f"[cache] indexed {len(rec_index)} face records, {len(alias_map)} aliases", file=sys.stderr)
    return rec_index, alias_map


def lookup_emb(rec_index, alias_map, src: str, bbox):
    bbox_t = tuple(int(x) for x in bbox)
    canon = alias_map.get(src, src)
    v = rec_index.get((canon, bbox_t))
    if v is None and canon != src:
        v = rec_index.get((src, bbox_t))
    return v


# ----------------------------- exif -----------------------------

def load_exif_cache():
    if not EXIF_CACHE.exists():
        return {}
    return json.loads(EXIF_CACHE.read_text())


def save_exif_cache(cache):
    tmp = EXIF_CACHE.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(cache, indent=2))
    tmp.replace(EXIF_CACHE)


def exif_year(path: Path) -> int | None:
    try:
        with Image.open(path) as im:
            ex = im._getexif()
            if not ex:
                return None
            for tag_id, val in ex.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4:
                    return int(val[:4])
    except Exception:
        return None
    return None


def get_year(src: str, exif_cache) -> int | None:
    """Return EXIF year for src, using cache. Mutates cache for new lookups."""
    if src in exif_cache:
        return exif_cache[src]
    p = Path(src)
    y = exif_year(p) if p.exists() else None
    exif_cache[src] = y
    return y
# ----------------------------- analyze -----------------------------

def cmd_analyze(args):
    rec_index, alias_map = load_caches()
    exif_cache = load_exif_cache()
    exif_cache_dirty = False

    parent_dir = ROOT / PARENT
    parent_manifest = json.loads((parent_dir / "manifest.json").read_text())
    parent_faces = parent_manifest.get("faces", [])
    print(f"[parent] {PARENT}: {len(parent_faces)} face entries", file=sys.stderr)

    # Build "in_bucket" set + each anchor's centroid + dom_year
    anchors = []
    in_bucket: set[tuple[str, tuple[int, int, int, int]]] = set()
    for era in ACTIVE_ERAS:
        ed = ROOT / era
        if not ed.is_dir():
            print(f"[warn] missing era bucket: {era}", file=sys.stderr)
            continue
        em = json.loads((ed / "manifest.json").read_text())
        emb_list = []
        years = []
        n_missing_emb = 0
        for f in em.get("faces", []):
            src = f.get("source")
            bbox = f.get("bbox")
            if not src or not bbox:
                continue
            key = (alias_map.get(src, src), tuple(int(x) for x in bbox))
            in_bucket.add(key)
            in_bucket.add((src, tuple(int(x) for x in bbox)))  # cover both alias and raw
            v = lookup_emb(rec_index, alias_map, src, bbox)
            if v is None:
                n_missing_emb += 1
            else:
                emb_list.append(v)
            # get_year() caches every fresh lookup, so the dirty flag must be
            # set *before* the call (afterwards src is always in the cache).
            if src not in exif_cache:
                exif_cache_dirty = True
            y = get_year(src, exif_cache)
            if y is not None:
                years.append(y)
        if not emb_list:
            print(f"[warn] {era}: no embeddings found, skipping anchor", file=sys.stderr)
            continue
        arr = np.stack(emb_list).astype(np.float32)
        c = arr.mean(axis=0)
        n = float(np.linalg.norm(c))
        if n > 0:
            c = c / n
        dom_year = Counter(years).most_common(1)[0][0] if years else None
        anchors.append({
            "name": era, "centroid": c, "n_faces": len(em.get("faces", [])),
            "n_emb_used": len(emb_list), "n_emb_missing": n_missing_emb,
            "dom_year": dom_year,
            "year_min": min(years) if years else None,
            "year_max": max(years) if years else None,
        })
        print(f"[anchor] {era}: n={len(em.get('faces', []))} emb_used={len(emb_list)} "
              f"emb_miss={n_missing_emb} dom_year={dom_year} years=[{min(years) if years else '-'}..{max(years) if years else '-'}]",
              file=sys.stderr)

    # Find unbucketed faces in parent
    unbucketed = []
    for f in parent_faces:
        src = f.get("source")
        bbox = f.get("bbox")
        if not src or not bbox:
            continue
        bbox_t = tuple(int(x) for x in bbox)
        key1 = (alias_map.get(src, src), bbox_t)
        key2 = (src, bbox_t)
        if key1 in in_bucket or key2 in in_bucket:
            continue
        unbucketed.append(f)
    print(f"[parent] {len(unbucketed)} unbucketed face entries (in {PARENT} but no era bucket)", file=sys.stderr)

    # Score each unbucketed face against every anchor
    proposals = []
    skipped_no_emb = 0
    skipped_no_year = 0
    for f in unbucketed:
        src = f["source"]
        bbox = f["bbox"]
        v = lookup_emb(rec_index, alias_map, src, bbox)
        if v is None:
            skipped_no_emb += 1
            continue
        if src not in exif_cache:
            exif_cache_dirty = True  # flag before get_year() caches the result
        y = get_year(src, exif_cache)
        if y is None:
            skipped_no_year += 1
            continue
        # nearest anchor
        best = None  # (dist, idx)
        for i, a in enumerate(anchors):
            d = 1.0 - float(np.dot(a["centroid"], v))
            if best is None or d < best[0]:
                best = (d, i)
        if best is None:
            continue
        dist, bidx = best
        anchor = anchors[bidx]
        year_delta = abs(y - anchor["dom_year"]) if anchor["dom_year"] is not None else None
        accept = (dist <= DIST_MAX and year_delta is not None and year_delta <= YEAR_MAX)
        proposals.append({
            "png": f["png"],
            "source": src,
            "bbox": [int(x) for x in bbox],
            "year": y,
            "rank_in_parent": f.get("rank"),
            "quality_composite": f.get("quality", {}).get("composite"),
            "quality": f.get("quality", {}),
            "best_anchor": anchor["name"],
            "best_anchor_dom_year": anchor["dom_year"],
            "centroid_dist": round(dist, 4),
            "year_delta": year_delta,
            "accept": bool(accept),
            "all_anchor_dists": {
                a["name"]: round(1.0 - float(np.dot(a["centroid"], v)), 4) for a in anchors
            },
        })

    if exif_cache_dirty:
        save_exif_cache(exif_cache)
        print(f"[exif] cache flushed ({len(exif_cache)} entries total)", file=sys.stderr)

    # Summarize
    accepted = [p for p in proposals if p["accept"]]
    rejected = [p for p in proposals if not p["accept"]]
    by_anchor = Counter(p["best_anchor"] for p in accepted)
    print(f"[summary] unbucketed={len(unbucketed)} scored={len(proposals)} "
          f"accepted={len(accepted)} rejected={len(rejected)} "
          f"skipped(no_emb={skipped_no_emb}, no_year={skipped_no_year})", file=sys.stderr)
    for k, v in by_anchor.most_common():
        print(f"  {k}: +{v}", file=sys.stderr)

    out = {
        "thresholds": {"dist_max": DIST_MAX, "year_max": YEAR_MAX},
        "anchors": [
            {k: v for k, v in a.items() if k != "centroid"}
            for a in anchors
        ],
        "n_unbucketed": len(unbucketed),
        "skipped": {"no_emb": skipped_no_emb, "no_year": skipped_no_year},
        "proposals": sorted(proposals, key=lambda p: (not p["accept"], p["best_anchor"], -1 * (p["quality_composite"] or 0))),
        "by_anchor": dict(by_anchor),
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(out, indent=2))
    print(f"[done] {len(proposals)} proposals -> {op}", file=sys.stderr)
# ----------------------------- report -----------------------------

def cmd_report(args):
    cand = json.loads(Path(args.candidates).read_text())
    out_dir = Path(args.out)
    thumbs_dir = out_dir / "thumbs"
    thumbs_dir.mkdir(parents=True, exist_ok=True)
    THUMB = 140

    def make_thumb(png_relpath: str) -> str:
        # png_relpath looks like "faces/0042.png"
        src = ROOT / PARENT / png_relpath
        name = Path(png_relpath).stem
        dst = thumbs_dir / f"{name}.jpg"
        if not dst.exists():
            try:
                img = Image.open(src).convert("RGB")
                img.thumbnail((THUMB, THUMB), Image.LANCZOS)
                img.save(dst, "JPEG", quality=82)
            except Exception as e:
                print(f"[thumb-skip] {src}: {e}", file=sys.stderr)
                return ""
        return f"thumbs/{name}.jpg"

    # group accepted proposals by target anchor
    by_anchor: dict[str, list] = {}
    rejected = []
    for p in cand["proposals"]:
        if p["accept"]:
            by_anchor.setdefault(p["best_anchor"], []).append(p)
        else:
            rejected.append(p)

    rows = []
    rows.append("<h1>faceset_001 age extension — review</h1>")
    rows.append(f"<p>{cand['n_unbucketed']} unbucketed faces in {PARENT}; "
                f"{sum(len(v) for v in by_anchor.values())} accepted / {len(rejected)} rejected; "
                f"thresholds dist≤{cand['thresholds']['dist_max']} AND |year_delta|≤{cand['thresholds']['year_max']}.</p>")
    nav = " · ".join(f"<a href='#{a}'>{a} (+{len(by_anchor[a])})</a>" for a in by_anchor) + " · <a href='#rejected'>rejected</a>"
    rows.append(f"<div class='nav'>{nav}</div>")

    for anchor_name in ACTIVE_ERAS:
        if anchor_name not in by_anchor:
            continue
        items = by_anchor[anchor_name]
        anchor_meta = next((a for a in cand["anchors"] if a["name"] == anchor_name), {})
        rows.append(f"<section id='{anchor_name}' class='grp'>")
        rows.append(f"<h2>{anchor_name} <small>(dom_year={anchor_meta.get('dom_year')}; "
                    f"existing n={anchor_meta.get('n_faces')}; +{len(items)} new)</small></h2>")
        rows.append("<div class='cells'>")
        for p in sorted(items, key=lambda x: (x["centroid_dist"], -1 * (x["quality_composite"] or 0))):
            thumb = make_thumb(p["png"])
            cls = "hi" if p["centroid_dist"] <= 0.30 else "mid"
            rows.append(
                f"<div class='cell'>"
                f"<img src='{thumb}' loading='lazy' title='{p['png']}'>"
                f"<div class='meta'>{p['png']}<br>year {p['year']} (Δ{p['year_delta']})<br>"
                f"<span class='{cls}'>dist {p['centroid_dist']:.3f}</span></div>"
                f"</div>"
            )
        rows.append("</div></section>")

    if rejected:
        rows.append("<section id='rejected' class='grp rej'>")
        rows.append(f"<h2>rejected <small>({len(rejected)} faces don't fit any anchor)</small></h2>")
        rows.append("<div class='cells'>")
        for p in sorted(rejected, key=lambda x: x["centroid_dist"])[:200]:
            thumb = make_thumb(p["png"])
            why = []
            if p["centroid_dist"] > cand['thresholds']['dist_max']:
                why.append(f"dist {p['centroid_dist']:.2f}>{cand['thresholds']['dist_max']}")
            if p["year_delta"] is None or p["year_delta"] > cand['thresholds']['year_max']:
                why.append(f"yΔ{p['year_delta']}>{cand['thresholds']['year_max']}")
            rows.append(
                f"<div class='cell'>"
                f"<img src='{thumb}' loading='lazy'>"
                f"<div class='meta'>{p['png']}<br>year {p['year']} → best {p['best_anchor']}<br>"
                f"<span class='lo'>{'; '.join(why)}</span></div>"
                f"</div>"
            )
        if len(rejected) > 200:
            rows.append(f"<p>...{len(rejected)-200} more truncated.</p>")
        rows.append("</div></section>")

    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>faceset_001 age extension</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1 {{ margin-top:0; }} h2 {{ margin:0; }}
small {{ color:#999; font-weight:normal; }}
section.grp {{ background:#1a1a1a; border-radius:6px; padding:12px; margin:12px 0; }}
section.grp.rej {{ border-left:4px solid #ff5050; }}
.cells {{ display:flex; flex-wrap:wrap; gap:6px; }}
.cell {{ background:#222; border-radius:4px; padding:4px; width:160px; font-size:11px; font-family:monospace; text-align:center; }}
.cell img {{ height:140px; width:auto; border-radius:3px; }}
.meta {{ padding-top:4px; line-height:1.3; }}
.hi {{ color:#5fa05f; font-weight:bold; }}
.mid {{ color:#ffb050; }}
.lo {{ color:#ff5050; }}
.nav {{ position:sticky; top:0; background:#111; padding:.5em 0; border-bottom:1px solid #333; font-size:13px; }}
a {{ color:#6cf; }}
</style></head>
<body>
{''.join(rows)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)
# ----------------------------- apply -----------------------------

def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
    import zipfile
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf:
        for i, p in enumerate(pngs):
            zf.write(p, arcname=f"{i:04d}.png")


def cmd_apply(args):
    cand = json.loads(Path(args.candidates).read_text())
    accepted = [p for p in cand["proposals"] if p["accept"]]
    if args.dry_run:
        by = Counter(p["best_anchor"] for p in accepted)
        print(f"=== dry-run: {len(accepted)} assignments across {len(by)} anchors ===")
        for k, v in by.most_common():
            print(f"  {k}: +{v}")
        return

    parent_dir = ROOT / PARENT
    master_path = ROOT / "manifest.json"
    master = json.loads(master_path.read_text())
    facesets_by_name = {f["name"]: f for f in master.get("facesets", [])}

    by_anchor: dict[str, list] = {}
    for p in accepted:
        by_anchor.setdefault(p["best_anchor"], []).append(p)

    total_added = 0
    for anchor_name, props in by_anchor.items():
        ed = ROOT / anchor_name
        em_path = ed / "manifest.json"
        em = json.loads(em_path.read_text())
        existing = list(em.get("faces", []))

        # gather new entries with their source PNG paths in faceset_001/faces/
        new_with_src = []
        for p in props:
            src_png = parent_dir / p["png"]
            if not src_png.exists():
                print(f"[warn] missing parent PNG {src_png}; skip", file=sys.stderr)
                continue
            face_entry = {
                "source": p["source"],
                "bbox": p["bbox"],
                "quality": p["quality"],
                "exif_year": p["year"],
                "centroid_dist_at_assign": p["centroid_dist"],
                "year_delta_at_assign": p["year_delta"],
                "extended_from_parent": True,
            }
            new_with_src.append((face_entry, src_png))

        # combine; rank by quality.composite desc (existing entries already have rank,
        # but we re-rank globally so new entries slot in by quality)
        combined: list[tuple[dict, Path | None]] = []
        for f in existing:
            combined.append((f, None))
        combined.extend(new_with_src)
        # "or 0" guards entries whose quality.composite is present but None
        combined.sort(key=lambda x: -(x[0].get("quality", {}).get("composite") or 0))

        # stage fresh
        staging = ed / "_faces_new"
        if staging.exists():
            shutil.rmtree(staging)
        staging.mkdir()
        new_face_entries = []
        for new_rank, (face, src_png_or_none) in enumerate(combined, start=1):
            new_name = f"{new_rank:04d}.png"
            if src_png_or_none is None:
                # existing entry: copy from current era bucket faces/
                old_name = Path(face["png"]).name
                src = ed / "faces" / old_name
                if not src.exists():
                    print(f"[warn] {anchor_name}: missing existing PNG {src}; skip", file=sys.stderr)
                    continue
                shutil.copy2(src, staging / new_name)
            else:
                shutil.copy2(src_png_or_none, staging / new_name)
            face = dict(face)
            face["rank"] = new_rank
            face["png"] = f"faces/{new_name}"
            new_face_entries.append(face)

        # swap dirs
        old_holding = ed / "_faces_old"
        if old_holding.exists():
            shutil.rmtree(old_holding)
        (ed / "faces").rename(old_holding)
        staging.rename(ed / "faces")
        shutil.rmtree(old_holding)

        # re-zip .fsz
        survivor_pngs = sorted((ed / "faces").glob("*.png"))
        top_n = em.get("top_n", 30)
        top_n_eff = min(top_n, len(survivor_pngs))
        for old in ed.glob("*.fsz"):
            old.unlink()
        top_fsz_name = f"{anchor_name}_top{top_n_eff}.fsz"
        all_fsz_name = f"{anchor_name}_all.fsz"
        _zip_png_list(survivor_pngs[:top_n_eff], ed / top_fsz_name)
        if len(survivor_pngs) > top_n_eff:
            _zip_png_list(survivor_pngs, ed / all_fsz_name)
            all_fsz_used = all_fsz_name
        else:
            all_fsz_used = None

        # update local + master manifests
        em["faces"] = new_face_entries
        em["exported"] = len(new_face_entries)
        em["fsz_top"] = top_fsz_name
        em["fsz_all"] = all_fsz_used
        em["top_n"] = top_n_eff
        em.setdefault("age_extend_history", []).append({
            "added": len(new_with_src),
            "thresholds": cand["thresholds"],
        })
        em_path.write_text(json.dumps(em, indent=2))

        if anchor_name in facesets_by_name:
            facesets_by_name[anchor_name]["exported"] = len(new_face_entries)
            facesets_by_name[anchor_name]["fsz_top"] = top_fsz_name
            facesets_by_name[anchor_name]["fsz_all"] = all_fsz_used
            facesets_by_name[anchor_name]["top_n"] = top_n_eff

        added_here = len(new_with_src)
        total_added += added_here
        print(f"[applied] {anchor_name}: +{added_here} (now {len(new_face_entries)} faces)", file=sys.stderr)

    # rewrite master with ordering preserved
    new_facesets = []
    for entry in master.get("facesets", []):
        new_facesets.append(facesets_by_name.get(entry["name"], entry))
    master["facesets"] = new_facesets
    master.setdefault("age_extend_runs", []).append({
        "parent": PARENT,
        "thresholds": cand["thresholds"],
        "anchors": list(by_anchor.keys()),
        "added_total": total_added,
    })
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(master, indent=2))
    tmp.replace(master_path)
    print(f"[done] +{total_added} faces across {len(by_anchor)} anchors", file=sys.stderr)
# ----------------------------- main -----------------------------

def main():
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd", required=True)

    a = sub.add_parser("analyze")
    a.add_argument("--out", required=True)
    a.set_defaults(func=cmd_analyze)

    r = sub.add_parser("report")
    r.add_argument("--candidates", required=True)
    r.add_argument("--out", required=True)
    r.set_defaults(func=cmd_report)

    p = sub.add_parser("apply")
    p.add_argument("--candidates", required=True)
    p.add_argument("--dry-run", action="store_true")
    p.set_defaults(func=cmd_apply)

    args = ap.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()