Add post-export corpus maintenance pipeline
Adds four new orchestration scripts that operate on an already-built facesets_swap_ready/ to clean it up over time: - filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level quarantine at 40% domain dominance. - consolidate_facesets.py: duplicate-identity merger using complete-linkage centroid clustering on cached arcface embeddings. Single-linkage chains catastrophically (60-faceset clusters with min sim < 0); complete-linkage guarantees within-group sim >= edge. - age_extend_001.py: slots newly-added PNGs into existing era buckets of faceset_001 using the same anchor-fragment rule as age_split_001.py (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered. - dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three passes — cross-family SHA256 byte-dedup (preserves intra-family era duplication), within-faceset near-dup at sim >= 0.95, and a multi-face audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s on AMD Vega — ~7x embed_worker because input is 512x512 crops. Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged → 181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full reversibility. Master manifest gains masked[], merged[], plus per-run provenance blocks. Three new docs/analysis/ writeups cover model choice, threshold rationale, and per-pass run results. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
594
work/dedup_optimize.py
Normal file
594
work/dedup_optimize.py
Normal file
@@ -0,0 +1,594 @@
|
||||
"""Corpus-wide dedup + roop-unleashed optimization.
|
||||
|
||||
Two passes:
|
||||
1. Cross-family byte-identical PNG dedup (same SHA256 in two different identity
|
||||
families) — keep the higher-tier family copy. Era splits of the same parent
|
||||
identity (faceset_NNN_*) are intentional duplications and are NOT deduped
|
||||
within their family.
|
||||
2. Within-faceset near-duplicate dedup using cached arcface embeddings
|
||||
(cosine sim >= 0.95). Keep highest quality.composite, drop the rest.
|
||||
|
||||
Plus a Windows-DML multi-face audit (separate phase via clip_worker-style split):
|
||||
3. Re-detect each PNG with insightface; flag any with 0 or >1 detected faces.
|
||||
The roop loader appends every detected face per PNG, so multi-face crops
|
||||
pollute identity averaging.
|
||||
|
||||
All flagged PNGs are MOVED to <faceset>/faces/_dropped/ (reversible). Affected
|
||||
.fsz files are re-zipped, manifests updated.
|
||||
|
||||
CLI:
|
||||
analyze --out work/dedup_audit/dedup_plan.json
|
||||
apply --plan ... [--dry-run]
|
||||
stage_multiface --out work/dedup_audit/multiface_queue.json
|
||||
merge_multiface --results <worker_out> --out work/dedup_audit/multiface_plan.json
|
||||
apply_multiface --plan ... [--dry-run]
|
||||
report --dedup ... --multiface ... --out work/dedup_audit
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
# WSL-side root of the exported corpus; WIN_ROOT is the same directory as
# seen from the Windows (DML worker) side.
ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready")
WIN_ROOT = r"E:\temp_things\fcswp\nl_sorted\facesets_swap_ready"
# Cached arcface embedding archives (.npz) consulted by load_caches().
# Missing archives are silently skipped.
CACHES = [
    Path("/opt/face-sets/work/cache/nl_full.npz"),
    Path("/opt/face-sets/work/cache/immich_peter.npz"),
    Path("/opt/face-sets/work/cache/immich_nic.npz"),
]

# Cosine-similarity cutoff for within-faceset near-duplicate grouping.
NEAR_DUP_THRESHOLD = 0.95
# Worker threads for the parallel SHA256 hashing phase (I/O bound).
HASH_PARALLEL = 16
|
||||
|
||||
|
||||
# ----------------------------- helpers -----------------------------
|
||||
|
||||
def faceset_tier(name: str) -> int:
    """Map a faceset directory name to its priority tier (lower = keep first).

    Tier bands by numeric id: 13-19 -> 0, 1-12 -> 1, 20-25 -> 2,
    26-264 -> 3, 265+ -> 4.  Names that do not parse (and id 0) get 99.
    """
    parsed = re.match(r"^faceset_0*(\d+)(?:_.+)?$", name)
    if parsed is None:
        return 99
    num = int(parsed.group(1))
    # Inclusive (lo, hi, tier) bands; bands do not overlap, so order is free.
    bands = (
        (13, 19, 0),
        (1, 12, 1),
        (20, 25, 2),
        (26, 264, 3),
    )
    for lo, hi, tier in bands:
        if lo <= num <= hi:
            return tier
    return 4 if num >= 265 else 99
|
||||
|
||||
|
||||
def faceset_family(name: str) -> str:
    """Strip an era-split suffix: faceset_001_2010-13 -> faceset_001.

    Names that do not match the faceset_NNN pattern are returned unchanged.
    """
    stem = re.match(r"^(faceset_\d+)(?:_.+)?$", name)
    if stem is None:
        return name
    return stem.group(1)
|
||||
|
||||
|
||||
def wsl_to_win(p: str) -> str:
    """Translate a WSL /mnt/<drive>/... path into a Windows drive path.

    Paths not under /mnt/ are returned untouched.
    """
    text = str(p)
    if not text.startswith("/mnt/"):
        return text
    drive = text[5].upper()
    rest = text[7:].replace("/", "\\")
    return drive + ":\\" + rest
|
||||
|
||||
|
||||
def iter_active_facesets() -> list[Path]:
    """List active faceset directories under ROOT, sorted by name.

    Directories whose name starts with "_" (quarantine buckets such as
    _dropped / _masked / _merged / _thin) are excluded.
    """
    return [
        entry
        for entry in sorted(ROOT.iterdir())
        if entry.is_dir() and not entry.name.startswith("_")
    ]
|
||||
|
||||
|
||||
def sha256_file(p: Path) -> str:
    """Return the hex SHA256 digest of a file, streamed in 1 MiB chunks."""
    digest = hashlib.sha256()
    with open(p, "rb") as fh:
        # Chunked reads keep memory flat on multi-MB PNGs.
        while chunk := fh.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def load_caches():
    """Load cached arcface embeddings from every existing CACHES archive.

    Returns (rec_index, alias_map):
      rec_index: (source_path, bbox_4tuple) -> L2-normalized float32 embedding
      alias_map: alternate source path -> canonical source path; every known
                 path is present (identity mapping when it has no alias)

    NOTE(review): embedding rows are indexed by position within the
    noface-filtered meta list — this assumes the cache writer stored exactly
    one embedding row per non-noface record, in order. Confirm against the
    cache builder before changing either side.
    """
    rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {}
    alias_map: dict[str, str] = {}
    for c in CACHES:
        if not c.exists():
            continue  # caches are optional; skip missing archives
        d = np.load(c, allow_pickle=True)
        emb = d["embeddings"]
        # meta is stored as a JSON string inside the npz.
        meta = json.loads(str(d["meta"]))
        face_records = [m for m in meta if not m.get("noface")]
        if "path_aliases" in d.files:
            paliases = json.loads(str(d["path_aliases"]))
            for canon, alist in paliases.items():
                alias_map.setdefault(canon, canon)
                for a in alist:
                    alias_map[a] = canon
        for i, rec in enumerate(face_records):
            p = rec["path"]
            bbox = tuple(int(x) for x in rec["bbox"])
            v = emb[i].astype(np.float32)
            n = float(np.linalg.norm(v))
            if n > 0:
                # Unit-normalize so a plain dot product equals cosine similarity.
                v = v / n
            rec_index[(p, bbox)] = v
            alias_map.setdefault(p, p)
    return rec_index, alias_map
|
||||
|
||||
|
||||
def lookup_emb(rec_index, alias_map, src: str, bbox):
    """Fetch a cached embedding keyed by (source path, bbox).

    The source path is canonicalized through alias_map first; if the
    canonical key misses, the raw path is tried as a fallback.  Returns
    None when neither key is present.
    """
    key_bbox = tuple(int(coord) for coord in bbox)
    canonical = alias_map.get(src, src)
    hit = rec_index.get((canonical, key_bbox))
    if hit is not None or canonical == src:
        return hit
    return rec_index.get((src, key_bbox))
|
||||
|
||||
|
||||
# ----------------------------- analyze -----------------------------
|
||||
|
||||
def cmd_analyze(args):
    """Scan the corpus and write a JSON dedup plan (no files are touched).

    Four phases:
      1. Walk every active faceset manifest, collecting per-PNG metadata.
      2. SHA256-hash each PNG with a thread pool (I/O bound).
      3. Cross-family byte-dedup: identical hashes spanning different
         identity families keep only the best-tier/best-quality copy.
      4. Within-faceset near-dup: connected components of the cosine-sim
         >= NEAR_DUP_THRESHOLD graph keep only the best-quality member.
    The combined drop set is grouped by faceset and written to args.out.
    """
    rec_index, alias_map = load_caches()
    facesets = iter_active_facesets()
    print(f"[scan] {len(facesets)} active facesets", file=sys.stderr)

    # Phase 1: walk every PNG, collect (faceset, file, src, bbox, quality, emb, sha256)
    all_pngs = []  # list of dicts
    t0 = time.time()
    for fs in facesets:
        manifest_path = fs / "manifest.json"
        if not manifest_path.exists():
            continue
        m = json.loads(manifest_path.read_text())
        for f in m.get("faces", []):
            png_rel = f.get("png")
            if not png_rel:
                continue
            disk_path = fs / png_rel
            if not disk_path.exists():
                continue
            all_pngs.append({
                "faceset": fs.name,
                "family": faceset_family(fs.name),
                "tier": faceset_tier(fs.name),
                "file": Path(png_rel).name,
                "rank": f.get("rank"),
                "source": f.get("source"),
                "bbox": f.get("bbox"),
                "quality": f.get("quality", {}).get("composite", 0),
                "disk_path": str(disk_path),
            })
    print(f"[scan] {len(all_pngs)} PNGs walked in {time.time()-t0:.1f}s", file=sys.stderr)

    # Phase 2: SHA256 hash each PNG (parallel I/O)
    t0 = time.time()
    def _hash_one(idx):
        # Mutates all_pngs in place; safe because each index is touched once.
        all_pngs[idx]["sha256"] = sha256_file(Path(all_pngs[idx]["disk_path"]))
    with ThreadPoolExecutor(max_workers=HASH_PARALLEL) as ex:
        # exhaust the iterator to actually run
        # (chunksize has no effect on ThreadPoolExecutor — process pools only)
        for _ in ex.map(_hash_one, range(len(all_pngs)), chunksize=16):
            pass
    print(f"[hash] {len(all_pngs)} PNGs hashed in {time.time()-t0:.1f}s", file=sys.stderr)

    # Phase 3: cross-family byte-dedup
    by_sha: dict[str, list[int]] = {}
    for i, p in enumerate(all_pngs):
        by_sha.setdefault(p["sha256"], []).append(i)

    cross_family_groups = []
    byte_drops: set[int] = set()  # indices of PNGs to drop
    for sha, idxs in by_sha.items():
        if len(idxs) < 2:
            continue
        families = {all_pngs[i]["family"] for i in idxs}
        if len(families) < 2:
            continue  # all in same family — intentional era duplication
        # multiple families share this content → dedup keeping the best one
        cross_family_groups.append({"sha256": sha, "members": [
            {"faceset": all_pngs[i]["faceset"], "file": all_pngs[i]["file"],
             "tier": all_pngs[i]["tier"], "quality": all_pngs[i]["quality"],
             "rank": all_pngs[i]["rank"]} for i in idxs
        ]})
        # keeper rule: lowest tier number, then highest quality
        best = sorted(idxs, key=lambda i: (all_pngs[i]["tier"], -all_pngs[i]["quality"]))[0]
        for i in idxs:
            # NEVER drop within-family copies (preserve era duplication intentionally)
            # We only drop indices whose family != best's family
            if i != best and all_pngs[i]["family"] != all_pngs[best]["family"]:
                byte_drops.add(i)
    print(f"[byte] {len(cross_family_groups)} cross-family hash groups; "
          f"{len(byte_drops)} PNGs marked for byte-dedup drop", file=sys.stderr)

    # Phase 4: within-faceset near-dup (embedding sim >= threshold)
    by_faceset: dict[str, list[int]] = {}
    for i, p in enumerate(all_pngs):
        by_faceset.setdefault(p["faceset"], []).append(i)

    near_dup_groups = []
    near_drops: set[int] = set()
    miss_emb_total = 0
    t0 = time.time()
    for fs_name, idxs in by_faceset.items():
        if len(idxs) < 2:
            continue
        # gather embeddings
        embs = []
        kept_idxs = []
        for i in idxs:
            v = lookup_emb(rec_index, alias_map, all_pngs[i]["source"], all_pngs[i]["bbox"])
            if v is None:
                # No cached embedding: PNG cannot participate in near-dup checks.
                miss_emb_total += 1
                continue
            embs.append(v)
            kept_idxs.append(i)
        if len(kept_idxs) < 2:
            continue
        # Embeddings are unit-normalized in load_caches, so M @ M.T is
        # the pairwise cosine-similarity matrix.
        M = np.stack(embs).astype(np.float32)
        sim = M @ M.T
        np.fill_diagonal(sim, -1)  # ignore self
        # find connected components in the (sim >= threshold) graph
        adj = {k: set() for k in range(len(kept_idxs))}
        for a in range(len(kept_idxs)):
            # only check a < b to avoid double work
            hi = np.where(sim[a, a+1:] >= NEAR_DUP_THRESHOLD)[0]
            for off in hi:
                b = a + 1 + int(off)
                adj[a].add(b)
                adj[b].add(a)
        seen = set()
        for k in adj:
            if k in seen or not adj[k]:
                continue
            # Iterative DFS to collect one connected component.
            stack = [k]
            comp = []
            while stack:
                x = stack.pop()
                if x in seen:
                    continue
                seen.add(x)
                comp.append(x)
                for y in adj[x]:
                    if y not in seen:
                        stack.append(y)
            if len(comp) < 2:
                continue
            comp_idxs = [kept_idxs[c] for c in comp]
            # keeper: highest quality.composite, tie-break: lowest rank
            # NOTE(review): `rank or 9999` also treats a falsy rank of 0 as
            # missing — confirm ranks start at 1 (or None) in the manifests.
            best = sorted(comp_idxs, key=lambda i: (-all_pngs[i]["quality"], all_pngs[i]["rank"] or 9999))[0]
            sims_in_group = []
            for ci in range(len(comp)):
                for cj in range(ci+1, len(comp)):
                    sims_in_group.append(float(sim[comp[ci], comp[cj]]))
            near_dup_groups.append({
                "faceset": fs_name,
                "members": [{"file": all_pngs[i]["file"], "rank": all_pngs[i]["rank"],
                             "quality": all_pngs[i]["quality"]} for i in comp_idxs],
                "keeper": all_pngs[best]["file"],
                "min_sim": min(sims_in_group) if sims_in_group else None,
                "max_sim": max(sims_in_group) if sims_in_group else None,
            })
            for i in comp_idxs:
                if i != best:
                    near_drops.add(i)
    print(f"[near] {len(near_dup_groups)} near-dup groups; "
          f"{len(near_drops)} PNGs marked for near-dup drop "
          f"(miss_emb={miss_emb_total}); {time.time()-t0:.1f}s", file=sys.stderr)

    # Combined drop set; for output, group by faceset
    all_drops = byte_drops | near_drops
    drops_by_faceset: dict[str, list] = {}
    for i in all_drops:
        p = all_pngs[i]
        reason = []
        # A PNG can carry both reasons; join with "+" for the plan record.
        if i in byte_drops: reason.append("byte_dup")
        if i in near_drops: reason.append("near_dup")
        drops_by_faceset.setdefault(p["faceset"], []).append({
            "file": p["file"], "rank": p["rank"], "reason": "+".join(reason),
            "sha256": p["sha256"], "quality": p["quality"],
        })

    plan = {
        "thresholds": {"near_dup_sim": NEAR_DUP_THRESHOLD},
        "totals": {
            "active_facesets": len(facesets),
            "active_pngs": len(all_pngs),
            "byte_dup_groups": len(cross_family_groups),
            "byte_dup_drops": len(byte_drops),
            "near_dup_groups": len(near_dup_groups),
            "near_dup_drops": len(near_drops),
            "all_drops": len(all_drops),
            "facesets_affected": len(drops_by_faceset),
        },
        "byte_dup_groups": cross_family_groups,
        "near_dup_groups": near_dup_groups,
        "drops_by_faceset": drops_by_faceset,
    }
    op = Path(args.out)
    op.parent.mkdir(parents=True, exist_ok=True)
    op.write_text(json.dumps(plan, indent=2))
    print(f"[done] plan -> {op}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- apply -----------------------------
|
||||
|
||||
def _zip_png_list(pngs: list[Path], zip_path: Path) -> None:
    """Write *pngs* into *zip_path*, renamed to sequential 0000.png, 0001.png, ...

    Deflate at compresslevel 4: PNG payloads are already compressed, so a
    cheap level is sufficient.
    """
    import zipfile

    with zipfile.ZipFile(
        zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4
    ) as archive:
        for seq, png in enumerate(pngs):
            archive.write(png, arcname=f"{seq:04d}.png")
|
||||
|
||||
|
||||
def _apply_drops_to_facesets(drops_by_faceset: dict[str, list], reason_label: str, master_path: Path):
    """Move flagged PNGs to <faceset>/faces/_dropped/, rebuild manifests + .fsz.

    drops_by_faceset values are lists of {"file": str, ...}.
    reason_label ("dedup" / "multiface") namespaces the history keys written
    into each faceset manifest and the master manifest.
    Returns (total_moved, per_faceset_counts).

    Moves are reversible: nothing is deleted, only relocated under _dropped/.
    The master manifest is rewritten atomically via a .tmp.json + replace.
    """
    master = json.loads(master_path.read_text())
    by_name = {f["name"]: f for f in master.get("facesets", [])}
    total_moved = 0
    per_faceset_counts = {}

    for fs_name, drops in drops_by_faceset.items():
        fs_dir = ROOT / fs_name
        if not fs_dir.is_dir():
            print(f"[warn] {fs_name}: dir missing, skip", file=sys.stderr)
            continue
        faces_dir = fs_dir / "faces"
        dropped_dir = faces_dir / "_dropped"
        dropped_dir.mkdir(exist_ok=True)
        drop_files = {d["file"] for d in drops}

        moved_here = 0
        for fname in sorted(drop_files):
            src = faces_dir / fname
            if not src.exists():
                # Already moved by an earlier run, or plan is stale — skip.
                continue
            shutil.move(str(src), str(dropped_dir / fname))
            moved_here += 1

        # rebuild manifest by filtering out dropped files
        manifest_path = fs_dir / "manifest.json"
        if manifest_path.exists():
            mm = json.loads(manifest_path.read_text())
            new_faces = [f for f in mm.get("faces", []) if Path(f.get("png", "")).name not in drop_files]
            mm["faces"] = new_faces
            mm["exported"] = len(new_faces)
            mm.setdefault(f"{reason_label}_history", []).append({"dropped": moved_here})

            # re-zip
            # Archives are rebuilt from the PNGs actually on disk (survivors),
            # not from the manifest list, so disk is the source of truth.
            survivor_pngs = sorted(faces_dir.glob("*.png"))
            top_n = mm.get("top_n", 30)
            top_n_eff = min(top_n, len(survivor_pngs))
            for old in fs_dir.glob("*.fsz"):
                old.unlink()
            top_fsz_name = f"{fs_name}_top{top_n_eff}.fsz"
            all_fsz_name = f"{fs_name}_all.fsz"
            if top_n_eff > 0:
                _zip_png_list(survivor_pngs[:top_n_eff], fs_dir / top_fsz_name)
                mm["fsz_top"] = top_fsz_name
                mm["top_n"] = top_n_eff
            else:
                mm["fsz_top"] = None
                mm["top_n"] = 0
            # The "_all" archive is only written when it would differ from "_top".
            if len(survivor_pngs) > top_n_eff:
                _zip_png_list(survivor_pngs, fs_dir / all_fsz_name)
                mm["fsz_all"] = all_fsz_name
            else:
                mm["fsz_all"] = None
            manifest_path.write_text(json.dumps(mm, indent=2))

            # Mirror the per-faceset changes into the master manifest entry.
            if fs_name in by_name:
                by_name[fs_name]["exported"] = len(new_faces)
                by_name[fs_name]["fsz_top"] = mm["fsz_top"]
                by_name[fs_name]["fsz_all"] = mm["fsz_all"]
                by_name[fs_name]["top_n"] = mm["top_n"]
                by_name[fs_name].setdefault(f"{reason_label}_dropped", 0)
                by_name[fs_name][f"{reason_label}_dropped"] += moved_here

        total_moved += moved_here
        per_faceset_counts[fs_name] = moved_here

    # rewrite master with same ordering
    new_facesets = [by_name.get(e["name"], e) for e in master.get("facesets", [])]
    master["facesets"] = new_facesets
    master.setdefault(f"{reason_label}_runs", []).append({
        "facesets_affected": len(per_faceset_counts),
        "pngs_moved": total_moved,
    })
    tmp = master_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(master, indent=2))
    tmp.replace(master_path)
    return total_moved, per_faceset_counts
|
||||
|
||||
|
||||
def cmd_apply(args):
    """Apply (or with --dry-run, just summarize) an analyze-produced plan.

    The real apply delegates to _apply_drops_to_facesets against the master
    manifest at ROOT/manifest.json.
    """
    drops = json.loads(Path(args.plan).read_text())["drops_by_faceset"]
    if args.dry_run:
        for fs_name in sorted(drops):
            items = drops[fs_name]
            reason_counts = {}
            for entry in items:
                key = entry["reason"]
                reason_counts[key] = reason_counts.get(key, 0) + 1
            print(f" {fs_name}: {len(items)} dropped ({reason_counts})")
        grand = sum(len(v) for v in drops.values())
        print(f"=== total: {grand} PNGs across {len(drops)} facesets ===")
        return
    master_path = ROOT / "manifest.json"
    total, _ = _apply_drops_to_facesets(drops, "dedup", master_path)
    print(f"[done] {total} PNGs moved to faces/_dropped/ across {len(drops)} facesets", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- multiface staging + apply -----------------------------
|
||||
|
||||
def cmd_stage_multiface(args):
    """Write a queue.json of every active PNG for the Windows DML
    multi-face audit worker, with both WSL and Windows path spellings."""
    queue = []
    for fs in iter_active_facesets():
        faces_dir = fs / "faces"
        if not faces_dir.is_dir():
            continue
        for png in sorted(faces_dir.glob("*.png")):
            entry = {
                "wsl_path": str(png),
                "win_path": wsl_to_win(str(png)),
                "faceset": fs.name,
                "file": png.name,
            }
            queue.append(entry)
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(queue, indent=2))
    print(f"[stage] {len(queue)} PNGs -> {out_path}", file=sys.stderr)
|
||||
|
||||
|
||||
def cmd_merge_multiface(args):
    """Turn the DML worker's results.json into a drops_by_faceset plan.

    Every PNG whose detected face_count differs from exactly 1 (including a
    missing face_count, read as -1) is flagged for dropping.
    """
    results = json.loads(Path(args.results).read_text()).get("results", [])
    drops_by_faceset: dict[str, list] = {}
    bad_count = 0
    for rec in results:
        count = rec.get("face_count", -1)
        if count == 1:
            continue  # exactly one detected face — keep
        bad_count += 1
        bucket = drops_by_faceset.setdefault(rec["faceset"], [])
        bucket.append({
            "file": rec["file"],
            "reason": f"multiface_{count}",
            "face_count": count,
        })
    plan = {
        "totals": {"bad_pngs": bad_count, "facesets_affected": len(drops_by_faceset),
                   "scored": len(results)},
        "drops_by_faceset": drops_by_faceset,
    }
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(plan, indent=2))
    print(f"[merge] {bad_count} bad PNGs across {len(drops_by_faceset)} facesets -> {out_path}", file=sys.stderr)
|
||||
|
||||
|
||||
def cmd_apply_multiface(args):
    """Apply (or with --dry-run, just summarize) a merge_multiface plan."""
    drops = json.loads(Path(args.plan).read_text())["drops_by_faceset"]
    if args.dry_run:
        for fs_name in sorted(drops):
            print(f" {fs_name}: {len(drops[fs_name])} bad PNG(s)")
        print(f"=== total: {sum(len(v) for v in drops.values())} ===")
        return
    master_path = ROOT / "manifest.json"
    moved, _ = _apply_drops_to_facesets(drops, "multiface", master_path)
    print(f"[done] {moved} PNGs moved to faces/_dropped/ across {len(drops)} facesets", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- report -----------------------------
|
||||
|
||||
def cmd_report(args):
    """Render an HTML summary (index.html) from the dedup and/or multiface
    plan JSONs; either input is optional and its section is skipped if absent."""
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    sections = []
    if args.dedup:
        d = json.loads(Path(args.dedup).read_text())
        t = d["totals"]
        sections.append(f"<h2>Dedup</h2>")
        sections.append(
            f"<ul>"
            f"<li>Active facesets: {t['active_facesets']}, active PNGs: {t['active_pngs']}</li>"
            f"<li>Cross-family byte-dup groups: {t['byte_dup_groups']} → {t['byte_dup_drops']} PNGs dropped</li>"
            f"<li>Within-faceset near-dup groups (sim≥{d['thresholds']['near_dup_sim']}): {t['near_dup_groups']} → {t['near_dup_drops']} PNGs dropped</li>"
            f"<li><b>Total dedup drops: {t['all_drops']}</b> across {t['facesets_affected']} facesets</li>"
            f"</ul>"
        )
        # top-N affected facesets
        rows = sorted(d["drops_by_faceset"].items(), key=lambda x: -len(x[1]))[:25]
        sections.append("<h3>Top 25 most-affected facesets</h3><table><tr><th>faceset</th><th>dropped</th><th>reasons</th></tr>")
        for fs, items in rows:
            # Tally drops per reason string for the table's "reasons" column.
            r = {}
            for it in items:
                r[it["reason"]] = r.get(it["reason"], 0) + 1
            sections.append(f"<tr><td>{fs}</td><td>{len(items)}</td><td>{r}</td></tr>")
        sections.append("</table>")

    if args.multiface:
        m = json.loads(Path(args.multiface).read_text())
        t = m["totals"]
        sections.append("<h2>Multi-face audit</h2>")
        sections.append(
            f"<ul>"
            f"<li>PNGs scored: {t['scored']}</li>"
            f"<li>Bad PNGs (0 or >1 face): {t['bad_pngs']} across {t['facesets_affected']} facesets</li>"
            f"</ul>"
        )

    # Doubled braces ({{ }}) keep the CSS literal inside the f-string.
    html = f"""<!doctype html>
<html><head><meta charset='utf-8'><title>Dedup + multi-face audit</title>
<style>
body {{ font-family: system-ui, sans-serif; background:#111; color:#eee; padding:1em; }}
h1, h2, h3 {{ margin-top:1em; }}
table {{ border-collapse: collapse; font-family: monospace; font-size: 12px; }}
table td, table th {{ padding: 2px 8px; border: 1px solid #333; }}
ul li {{ margin: 4px 0; }}
</style></head>
<body>
<h1>facesets_swap_ready dedup + roop optimization audit</h1>
{''.join(sections)}
</body></html>"""
    out_html = out_dir / "index.html"
    out_html.write_text(html)
    print(f"[done] {out_html}", file=sys.stderr)
|
||||
|
||||
|
||||
# ----------------------------- main -----------------------------
|
||||
|
||||
def main():
    """Build the subcommand CLI and dispatch to the selected handler."""
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)

    analyze = sub.add_parser("analyze")
    analyze.add_argument("--out", required=True)
    analyze.set_defaults(func=cmd_analyze)

    apply_cmd = sub.add_parser("apply")
    apply_cmd.add_argument("--plan", required=True)
    apply_cmd.add_argument("--dry-run", action="store_true")
    apply_cmd.set_defaults(func=cmd_apply)

    stage = sub.add_parser("stage_multiface")
    stage.add_argument("--out", required=True)
    stage.set_defaults(func=cmd_stage_multiface)

    merge = sub.add_parser("merge_multiface")
    merge.add_argument("--results", required=True)
    merge.add_argument("--out", required=True)
    merge.set_defaults(func=cmd_merge_multiface)

    apply_mf = sub.add_parser("apply_multiface")
    apply_mf.add_argument("--plan", required=True)
    apply_mf.add_argument("--dry-run", action="store_true")
    apply_mf.set_defaults(func=cmd_apply_multiface)

    report = sub.add_parser("report")
    report.add_argument("--dedup", default=None)
    report.add_argument("--multiface", default=None)
    report.add_argument("--out", required=True)
    report.set_defaults(func=cmd_report)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user