From 49a43c7685b640897a4a757ec1c61cc0783ebe65 Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 27 Apr 2026 15:41:18 +0200 Subject: [PATCH] Add post-export corpus maintenance pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds four new orchestration scripts that operate on an already-built facesets_swap_ready/ to clean it up over time: - filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level quarantine at 40% domain dominance. - consolidate_facesets.py: duplicate-identity merger using complete-linkage centroid clustering on cached arcface embeddings. Single-linkage chains catastrophically (60-faceset clusters with min sim < 0); complete-linkage guarantees within-group sim >= edge. - age_extend_001.py: slots newly-added PNGs into existing era buckets of faceset_001 using the same anchor-fragment rule as age_split_001.py (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered. - dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three passes — cross-family SHA256 byte-dedup (preserves intra-family era duplication), within-faceset near-dup at sim >= 0.95, and a multi-face audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s on AMD Vega — ~7x embed_worker because input is 512x512 crops. Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged → 181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full reversibility. Master manifest gains masked[], merged[], plus per-run provenance blocks. Three new docs/analysis/ writeups cover model choice, threshold rationale, and per-pass run results. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 29 +- docs/analysis/clip-occlusion-filter.md | 154 +++++ docs/analysis/dedup-and-roop-optimization.md | 155 +++++ .../identity-consolidation-and-age-extend.md | 170 +++++ work/age_extend_001.py | 576 ++++++++++++++++ work/clip_worker.py | 221 ++++++ work/consolidate_facesets.py | 634 ++++++++++++++++++ work/dedup_optimize.py | 594 ++++++++++++++++ work/filter_occlusions.py | 574 ++++++++++++++++ work/multiface_worker.py | 144 ++++ 10 files changed, 3250 insertions(+), 1 deletion(-) create mode 100644 docs/analysis/clip-occlusion-filter.md create mode 100644 docs/analysis/dedup-and-roop-optimization.md create mode 100644 docs/analysis/identity-consolidation-and-age-extend.md create mode 100644 work/age_extend_001.py create mode 100644 work/clip_worker.py create mode 100644 work/consolidate_facesets.py create mode 100644 work/dedup_optimize.py create mode 100644 work/filter_occlusions.py create mode 100644 work/multiface_worker.py diff --git a/README.md b/README.md index ff2c943..2edcb2e 100644 --- a/README.md +++ b/README.md @@ -331,6 +331,27 @@ from the saved `state.json` without re-fetching what was already done. The composite quality score in `export-swap` is `0.30·frontality + 0.20·det_score + 0.20·landmark_symmetry + 0.15·face_size + 0.15·sharpness`, each normalized to `[0, 1]`. +## Post-export corpus maintenance + +The `sort_faces.py` pipeline above produces `facesets_swap_ready/`. 
Four +orchestration scripts under `work/` operate on that already-built corpus to +clean it up over time: + +| script | purpose | +|--------|---------| +| `work/filter_occlusions.py` (+ Windows `work/clip_worker.py`) | Drop PNGs of masked / sun-glassed faces using open_clip ViT-L-14/dfn2b_s39b zero-shot scoring. Image-level threshold 0.7; faceset-level quarantine at 40% domain dominance. WSL stages a queue, Windows DML scores, WSL applies. See `docs/analysis/clip-occlusion-filter.md`. | +| `work/consolidate_facesets.py` | Merge duplicate identities (centroid cosine sim ≥ 0.55 with confident ≥ 0.65, **complete-linkage** to defeat single-link chaining). Pulls embeddings from cache, no GPU. See `docs/analysis/identity-consolidation-and-age-extend.md`. | +| `work/age_extend_001.py` | Slot newly-added PNGs into existing era buckets of `faceset_001` (anchor cosine distance ≤ 0.40 AND `|year_delta|` ≤ 5). Same anchor-fragment rule as `age_split_001.py`. | +| `work/dedup_optimize.py` (+ Windows `work/multiface_worker.py`) | (a) cross-family SHA256 byte-dedup, (b) within-faceset near-dup at cosine sim ≥ 0.95, (c) multi-face audit (re-detect via insightface, drop PNGs with face_count ≠ 1). Multi-face is the load-bearing roop invariant. See `docs/analysis/dedup-and-roop-optimization.md`. | + +All four operate idempotently and reversibly: dropped PNGs go to +`/faces/_dropped/`, quarantined whole facesets go to +`facesets_swap_ready/_masked/` or `_merged/` (parallel to the existing +`_thin/`). The master `manifest.json` partitions entries across `facesets[]`, +`masked[]`, `thin_eras[]`, and `merged[]` arrays, plus per-run provenance +blocks (`occlusion_filter_run`, `merge_run`, `age_extend_runs`, `dedup_runs`, +`multiface_runs`). + ## Downstream: roop-unleashed The `.fsz` bundles emitted by `export-swap` drop straight into roop-unleashed's Face Swap tab. Each PNG inside is already a clean single-face crop — critical, because the roop-unleashed loader appends every face it re-detects in each PNG to the averaged identity embedding. @@ -350,11 +371,17 @@ Highly recommended at swap time: enable **Select post-processing = GFPGAN** with ├─ build_folders.py (hand-sorted-folder orchestration) ├─ check_faceset001_age.py (age-split readiness probe) ├─ age_split_001.py (age-split orchestration; faceset_001) + ├─ age_extend_001.py (extends existing era buckets with new PNGs) ├─ cluster_osrc.py (mixed-bucket identity discovery) ├─ immich_stage.py (Immich library staging, parallel) - ├─ embed_worker.py (Windows DML embed worker, runs from C:\face_embed_venv\) + ├─ embed_worker.py (Windows DML embed worker; C:\face_embed_venv\) ├─ cluster_immich.py (Immich identity discovery + export) ├─ finalize_immich.sh (chains queue → embed → cluster) + ├─ filter_occlusions.py (CLIP zero-shot mask + sunglasses filter) + ├─ clip_worker.py (Windows DML CLIP worker; C:\clip_dml_venv\) + ├─ consolidate_facesets.py (duplicate-identity merger; complete-linkage) + ├─ dedup_optimize.py (byte + near-dup + multi-face audit driver) + ├─ multiface_worker.py (Windows DML multi-face audit worker) ├─ synthetic_*_manifest.json (per-run synthetic refine manifests) ├─ immich/ │ ├─ users.json (label -> userId map; gitignored) diff --git a/docs/analysis/clip-occlusion-filter.md b/docs/analysis/clip-occlusion-filter.md new file mode 100644 index 0000000..a977736 --- /dev/null +++ b/docs/analysis/clip-occlusion-filter.md @@ -0,0 +1,154 @@ +# CLIP zero-shot occlusion filter (masks + sunglasses) + +_Run date: 2026-04-27. 
Driver scripts: `work/filter_occlusions.py`, `work/clip_worker.py`._ + +## 1. Why + +`facesets_swap_ready/` ended the Immich import day with 311 substantive +facesets and a long tail of identities whose clusters had latched onto +*eyewear or mask appearance* instead of identity (covid-era shots, vacation +photos with sunglasses dominating the frame). Two failure modes: + +1. **Pollution of averaged identity** — roop's `FaceSet.AverageEmbeddings()` + averages every face in the .fsz. A faceset where 40 % of images are + sunglassed gives a biased centroid; the swap reproduces sunglass-shaped + eye sockets. +2. **Whole-cluster identity drift** — clustering at the embedding level + sometimes anchors on the eyewear silhouette rather than the face, + producing clusters of "the same sunglasses across multiple people". + +A targeted attribute scorer was the cleanest fix. + +## 2. Model + prompts + +**Model**: `open_clip` `ViT-L-14` / `dfn2b_s39b` (Apple Data Filtering Networks). +Best public zero-shot at this size. Loads weights from HF Hub (~890 MB). +Bit-identical scores between WSL CPU and Windows DML. + +**Prompt design**: per-attribute ensembles of 5–6 positive + 5–6 negative +prompts. Positive ensembles are mean-pooled and L2-normalized before softmax. + +**Critical bug if forgotten**: CLIP cosine similarities are tiny (0.2–0.3 +range). Raw `softmax([sim_pos, sim_neg])` collapses to ~0.5/0.5 on every +image. **Multiply by `model.logit_scale.exp()` (~100) before softmax.** +Without that scale the entire scorer outputs a uniform 0.5. + +**Sunglasses prompt pitfall**: the first set caught faces with sunglasses +*pushed up on the forehead* with the same probability as faces with +sunglasses *covering the eyes* — CLIP detects "presence of sunglasses in +frame", not "eyes occluded". Fixed by putting the false positive into the +*negative* class explicitly: + +``` +positive: "a face with dark sunglasses covering the eyes" + "a portrait with the eyes hidden behind opaque sunglasses" + ... +negative: "a face with sunglasses pushed up on the forehead, eyes visible below" + "a face with sunglasses resting on top of the head, eyes visible" + "a face wearing clear prescription eyeglasses with visible eyes" + ... +``` + +Validation pair (faceset_005): sunglasses-on-eyes → 0.91, sunglasses-on-forehead +→ 0.39. Threshold 0.7 cleanly separates. + +## 3. Architecture + +``` + ┌─────────────────────────────────────────────┐ + │ WSL /opt/face-sets/work/filter_occlusions.py │ + │ • stage: walk facesets/, write queue.json │ + │ • merge: ingest worker results │ + │ • report: HTML contact sheet │ + │ • apply: prune + quarantine + re-zip │ + └────────────┬────────────────────────────────┘ + │ queue.json (paths) via \\wsl.localhost\ + ▼ + ┌─────────────────────────────────────────────┐ + │ Windows C:\clip_dml_venv\ │ + │ /opt/face-sets/work/clip_worker.py │ + │ Python 3.12 + torch 2.4.1 CPU │ + │ + torch-directml 0.2.5 + open_clip_torch │ + │ Reads PNGs from native E:\, writes scores │ + └─────────────────────────────────────────────┘ +``` + +A separate Windows venv (not the existing `C:\face_embed_venv\`) is needed +because `torch-directml` brings ~1.5 GB of wheels and version-pinned +numpy/pillow that risk breaking the embed_worker venv's +`onnxruntime-directml` + `insightface` stack. + +## 4. 
DML throughput surprise + +Measured on AMD Radeon RX Vega: + +| input | model | throughput | speedup vs WSL CPU | +|------|-------|-----------:|-------------------:| +| ViT-L-14 (CLIP, this filter) | open_clip | **1.43 img/s** | **2.4×** | +| buffalo_l (insightface, embed_worker) | onnxruntime | 2.6 img/s | 7.5× | + +Only 2.4× because `aten::_native_multi_head_attention` is not implemented in +the directml plugin and falls back to CPU. The vision encoder runs on GPU, +attention runs on CPU per layer, both alternating. A silenced UserWarning +makes this near-invisible. Workable for a one-shot 73-min corpus run, but +the embed_worker pattern (pure ONNX) remains the gold standard for DML. + +## 5. Thresholds (validated 2026-04-27 on 6,318 PNGs) + +| level | threshold | semantics | +|-------|----------:|-----------| +| image | P(positive) ≥ 0.7 | drop the PNG | +| faceset | ≥ 40 % of images flagged for either attr | quarantine whole faceset to `_masked/` | +| min-survivors | < 5 surviving AND something pruned | quarantine to `_thin/` | + +The `AND something pruned` guard is essential — without it, naturally-small +facesets (hand-sorted with ≤4 PNGs) get incorrectly quarantined for being +small even when they have zero occlusions. + +## 6. Run results + +| action | count | net effect | +|--------|------:|------------| +| keep | 209 | unchanged | +| prune | 46 | 183 PNGs dropped within survivors | +| quarantine_masked | 51 | whole faceset → `_masked/` (11 mask-driven, 40 sunglasses-driven) | +| quarantine_thin | 3 | survivors < 5 → `_thin/` | + +Net: 311 active → 255 active after the filter run. 763 PNGs quarantined +whole-faceset, 183 pruned within survivors. All dropped PNGs preserved at +`/faces/_dropped/` for reversibility. Master manifest gained a +`masked[]` array parallel to `thin_eras[]`, plus an `occlusion_filter_run` +provenance block. + +## 7. Known limitations + +- **Per-faceset manifests are NOT updated by `apply`** — only the master + manifest is. Each faceset's own `/manifest.json` retains stale + `faces[]` entries pointing at PNGs that moved into `_dropped/`. Harmless + for `.fsz` consumers (the .fsz is re-zipped from current disk state) but + downstream tools reading `faces[]` will see broken references. Discovered + later by `age_extend_001.py`'s rebuild loop, which generated 42 missing-PNG + warnings before being caught. + +## 8. Re-running + +```bash +# 1. Stage queue from current corpus state +python work/filter_occlusions.py stage --out work/clip_dml/queue.json + +# 2. Score on Windows DML (resumable) +"/mnt/c/clip_dml_venv/Scripts/python.exe" work/clip_worker.py \ + work/clip_dml/queue.json work/clip_dml/scores.json --batch 8 + +# 3. Reshape into per-faceset format, then HTML for visual approval +python work/filter_occlusions.py merge \ + --scores work/clip_dml/scores.json --out work/occlusion_scores.json +python work/filter_occlusions.py report \ + --scores work/occlusion_scores.json --out work/occlusion_review + +# 4. 
Apply (always dry-run first) +python work/filter_occlusions.py apply \ + --scores work/occlusion_scores.json --out-plan work/occlusion_apply_plan.json --dry-run +python work/filter_occlusions.py apply \ + --scores work/occlusion_scores.json --out-plan work/occlusion_apply_plan.json +``` diff --git a/docs/analysis/dedup-and-roop-optimization.md b/docs/analysis/dedup-and-roop-optimization.md new file mode 100644 index 0000000..a6b4373 --- /dev/null +++ b/docs/analysis/dedup-and-roop-optimization.md @@ -0,0 +1,155 @@ +# Corpus dedup + roop-unleashed optimization + +_Run date: 2026-04-27. Driver scripts: `work/dedup_optimize.py`, `work/multiface_worker.py`._ + +After consolidation collapsed duplicate identities and age-extend slotted +new PNGs into era buckets, the corpus still carried artifacts that hurt +roop's averaged-embedding quality: + +- **Burst-photo near-duplicates** within facesets, especially in + immich-discovered identities where source libraries had many similar + shots within seconds. +- **Cross-faceset byte-identical PNGs** that escaped consolidation's + centroid-similarity matching when individual PNGs matched exactly but + cluster centroids diverged. +- **Multi-face PNGs** that polluted identity averaging because the roop + loader appends every detected face per PNG to the FaceSet (load-bearing + invariant — see § 2). + +This pipeline runs three independent passes and an optional fourth, all +moving dropped PNGs to `/faces/_dropped/` for reversibility. + +## 1. Cross-family byte-dedup + +SHA256-hash every PNG in the active corpus (parallel I/O via +`ThreadPoolExecutor(max_workers=16)`, ~17 s for 5,386 PNGs over the +`/mnt/e/` Windows mount). Group by hash; for groups with members in +multiple identity families, keep the higher-tier copy. + +**Family detection**: regex `^(faceset_\d+)(?:_.+)?$` — captures the parent +identity. Same family includes parent + era splits (e.g. `faceset_001` + +`faceset_001_2010-13`); these are intentional duplications for the era +.fsz files and are preserved. + +Run results: 20 cross-family hash groups → 24 PNGs dropped. Most cases were +small immich identity-cluster errors that consolidation missed because +individual PNG embeddings matched but the cluster mean did not. + +## 2. Within-faceset near-dup at sim ≥ 0.95 + +Per-faceset pairwise cosine similarity on cached arcface embeddings. +Connected components in the `sim ≥ 0.95` graph. Keep highest +`quality.composite` per component, drop the rest. + +**Threshold rationale**: legitimate same-person-different-pose pairs land at +0.5–0.85; ≥ 0.95 means essentially the same shot (burst frames or +recompressed dupes). Roop's `FaceSet.AverageEmbeddings()` averages all faces +into `faces[0].embedding`; near-identical embeddings averaged ≈ averaging +once. Removing them does not lose identity information; it removes a bias +weight on the most-photographed moments. + +Run results: 851 groups → **1,225 PNGs dropped** (23 % of corpus). +Most-affected: `faceset_026` (-132 of 262), `faceset_027` (-107), +`faceset_028` (-92), `faceset_030` (-92). All immich-discovered identities +where the source library had burst sequences. + +## 3. Multi-face audit (load-bearing roop invariant) + +The roop loader at `roop/ui/tabs/faceswap_tab.py:661–691` runs +`extract_face_images(filename, (False, 0))` on every PNG and **appends every +detected face** to `face_set.faces`. A multi-face PNG therefore pollutes the +averaged identity. 
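+A minimal sketch of the effect (illustrative vectors only, not roop's code: just mean
+pooling of unit-normalized 512-d embeddings the way the loader averages appended faces):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def unit(v):
+    return v / np.linalg.norm(v)
+
+identity = unit(rng.normal(size=512))            # the target person
+bystander = unit(rng.normal(size=512))           # extra face hiding in a multi-face PNG
+crops = [unit(identity + 0.1 * rng.normal(size=512)) for _ in range(3)]
+
+clean = unit(np.mean(crops, axis=0))
+polluted = unit(np.mean(crops + [bystander], axis=0))
+
+print(round(float(clean @ identity), 3))     # ~0.995: centroid stays on-identity
+print(round(float(polluted @ identity), 3))  # ~0.95: one stray face already drags it off
+```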
The export-swap pipeline drops multi-face crops at +creation, but post-pipeline operations (consolidation, age-extend) move +PNGs across facesets without re-checking. + +**This audit re-detects every PNG** with insightface FaceAnalysis and flags +any with `face_count ≠ 1` (filtered by `det_score ≥ 0.5` and +`face_short ≥ 40`). Includes: +- ≥ 2 faces → loader will inject extra identities into averaging +- 0 faces → insightface can't find a face on the cropped PNG; useless for + roop, would silently fail + +Run results: 4,146 PNGs scored, 332 flagged (272 with 2 faces, 9 with 3, +2 with 4, **49 with 0**). 82 facesets affected. + +## 4. DML throughput jump for face crops + +The audit reuses the same insightface + onnxruntime-directml stack as +`embed_worker.py` but achieves **~19 img/s** on AMD Vega vs embed_worker's +2.6 img/s — same model, same hardware. The difference is input size: + +| stage | typical input | DML throughput | +|-------|--------------|---------------:| +| `embed_worker.py` (Immich import) | 1024–4000 px source | 2.6 img/s | +| `multiface_worker.py` (this audit) | 512×512 face crops | **19 img/s** | + +Detection on small inputs is fast; recognition on aligned 112×112 inputs is +the same cost either way. Implication: **any pipeline operating on +already-cropped face PNGs can rely on a roughly 7× higher DML throughput +ceiling than full-resolution embedding**. + +## 5. Architecture + +``` + ┌────────────────────────────────────────────┐ + │ WSL /opt/face-sets/work/dedup_optimize.py │ + │ • analyze: hashes + within-faceset sim │ + │ • apply: move + re-zip (no GPU) │ + │ • stage_multiface: write queue.json │ + │ • merge_multiface: ingest worker results │ + │ • apply_multiface: move + re-zip │ + │ • report: HTML audit │ + └────────────┬───────────────────────────────┘ + │ queue.json via \\wsl.localhost\ + ▼ + ┌────────────────────────────────────────────┐ + │ Windows C:\face_embed_venv\ │ + │ /opt/face-sets/work/multiface_worker.py │ + │ insightface FaceAnalysis on DmlExecutionProvider │ + │ Reads PNGs from native E:\, writes face_count │ + └────────────────────────────────────────────┘ +``` + +Reuses the existing `C:\face_embed_venv\` (no new venv needed — same +insightface stack as `embed_worker.py`). + +## 6. Final corpus state (2026-04-27 night) + +| metric | start of day | after occlusion filter | after consolidation | after age-extend | after this dedup + multiface | +|--------|-------------:|----------------------:|-------------------:|-----------------:|----------------------------:| +| active facesets | 311 | 255 | 181 | 181 | **181** | +| active PNGs | ~6,440 | 5,386 | 5,386 | 5,400 | **3,849** | +| `_masked/` | 0 | 51 | 51 | 51 | 51 | +| `_thin/` | 68 | 71 | 71 | 71 | 71 | +| `_merged/` | 0 | 0 | 74 | 74 | 74 | + +Net reduction at the end of the day: **2,591 PNGs and 130 facesets** removed +or quarantined from the active pool. All preserved on disk for +reversibility (`/faces/_dropped/` for prunes, `_masked/_merged/_thin/` +for quarantines). + +## 7. Re-running + +Run after any new import / consolidation / extend: + +```bash +# 1. Byte-dedup + within-faceset near-dup (CPU only) +python work/dedup_optimize.py analyze --out work/dedup_audit/dedup_plan.json +python work/dedup_optimize.py apply --plan work/dedup_audit/dedup_plan.json + +# 2. 
Multi-face audit on Windows DML (resumable) +python work/dedup_optimize.py stage_multiface --out work/dedup_audit/multiface_queue.json +"/mnt/c/face_embed_venv/Scripts/python.exe" work/multiface_worker.py \ + work/dedup_audit/multiface_queue.json work/dedup_audit/multiface_results.json +python work/dedup_optimize.py merge_multiface \ + --results work/dedup_audit/multiface_results.json \ + --out work/dedup_audit/multiface_plan.json +python work/dedup_optimize.py apply_multiface \ + --plan work/dedup_audit/multiface_plan.json + +# 3. HTML audit +python work/dedup_optimize.py report \ + --dedup work/dedup_audit/dedup_plan.json \ + --multiface work/dedup_audit/multiface_plan.json \ + --out work/dedup_audit +``` diff --git a/docs/analysis/identity-consolidation-and-age-extend.md b/docs/analysis/identity-consolidation-and-age-extend.md new file mode 100644 index 0000000..936975e --- /dev/null +++ b/docs/analysis/identity-consolidation-and-age-extend.md @@ -0,0 +1,170 @@ +# Identity consolidation + age-bucket extension + +_Run date: 2026-04-27. Driver scripts: `work/consolidate_facesets.py`, `work/age_extend_001.py`._ + +After the Immich peter + nic imports added 280 new facesets to a corpus that +had ~25 canonical identities, many "new" identities were duplicates of +existing household members at lower clustering confidence. Two cooperating +passes clean this up: identity consolidation merges duplicates, then +age-extend slots newly-merged PNGs into the existing era buckets of +`faceset_001`. + +## 1. Identity consolidation + +### 1.1 Approach + +For each active faceset, pull cached arcface embeddings from +`work/cache/{nl_full,immich_peter,immich_nic}.npz` keyed by +`(source, bbox)` from the per-faceset manifest's `faces[]`. Compute +L2-normalized centroid. Pairwise cosine similarity matrix. + +**Tier-based primary selection** (lowest tier number wins, size breaks ties): + +| tier | sources | rationale | +|-----:|---------|-----------| +| 0 | `faceset_013..019` (hand-sorted) | user's curated labels | +| 1 | `faceset_001..012` (auto-clustered) | well-established household | +| 2 | `faceset_020..025` (osrc) | mixed-bucket discovery | +| 3 | `faceset_026..264` (immich peter) | speculative | +| 4 | `faceset_265+` (immich nic) | speculative | + +**Era splits and quarantines excluded** — `faceset_NNN_`, `_masked/`, +`_thin/` are skipped during analysis. + +### 1.2 Single-linkage chains catastrophically — complete-linkage required + +First attempt used connected-components on edge ≥ 0.45 → produced a +**60-faceset cluster** around `faceset_001` with min within-group sim of +**−0.16** (definitely-different people bridged via chains +`A↔B↔C` where `A`, `C` are not similar). Bumping to edge ≥ 0.55 still +chained (group of 17 with min 0.20). + +Real fix: `scipy.cluster.hierarchy.linkage(method='complete')` then +`fcluster(Z, t=1-edge_threshold, criterion='distance')`. Complete-linkage +**guarantees** every within-group pair sim ≥ edge threshold. Without this +guarantee the report is unusable and the apply step would produce +identity-poisoned merges. + +### 1.3 Thresholds + run results + +`edge=0.55`, `confident=0.65` → 48 multi-faceset groups (29 confident, 19 +uncertain). Max group size 7, all bilateral or small triplets after +complete-linkage. + +After applying all 48 (with `--include-uncertain` after visual approval): + +- **74 facesets consumed** (some groups had multiple secondaries: + `[10, 45, 135] → faceset_002`; `[113, 96, 178, 109, 110, 286] → faceset_095`; + etc.) 
+- Active count 255 → 181 +- Notable absorptions: `faceset_001` (peter) 707 → 753 PNGs (+ 7, 132, 151); + `faceset_002` 209 → 247; `faceset_026` 60 → 262 (+ 168, 146, 325); + `faceset_028` → 207 +- Master manifest gained `merged[]` array (parallel to `thin_eras[]`); each + entry has `merged_into` field pointing at the primary + +### 1.4 Apply mechanics + +Combine all PNGs from primary + secondaries, re-rank by existing +`quality.composite` desc (no re-enrich), renumber `0001..NNNN`, copy into a +fresh staging dir, atomic swap. Move secondary directories to +`_merged//` (preserved in full for reversibility). Re-zip +`_topN.fsz` and `_all.fsz`. + +The primary's existing per-PNG quality scores are reused — re-ranking does +not require re-running `enrich`-equivalent landmarks/pose on the cropped +PNGs. The primary's `_dropped/` (from prior occlusion filter) is preserved +through the merge. + +## 2. Age extension of faceset_001 era buckets + +### 2.1 Why a follow-on pass + +Consolidation absorbed faceset_007/132/151 into faceset_001 (+46 PNGs). +The original `age_split_001.py` had bucketed peter into 6 era anchors +(`_2005-10`, `_2010-13`, `_2011`, `_2014-17`, `_2018-19`, `_2018-20`), but +those new PNGs had never been seen by age_split. They sat in faceset_001's +parent-only set, missing from every era .fsz. + +### 2.2 Era-label pitfall + +The 6 anchor era labels are NOT strict year ranges. They are +`Counter(years).most_common(1)`-derived dom-years from the original sub-cluster: + +| label | dom_year | actual span of members | +|-------|---------:|-----------------------:| +| `_2005-10` | 2010 | 2005–2010 | +| `_2010-13` | 2011 | **2007–2024** | +| `_2011` | 2011 | 2011 only | +| `_2014-17` | 2016 | 2005–2018 | +| `_2018-19` | 2018 | 2012–2020 | +| `_2018-20` | 2019 | 2014–2022 | + +The clusters are *appearance-anchored*, not year-bounded. Year is a +descriptive label. Assignment rule must use dom-year, not member span. + +### 2.3 Algorithm + +For each unbucketed face entry in `faceset_001`'s manifest (50 of 753): + +1. Look up embedding in cache by `(source, bbox)`. +2. Look up EXIF year via `work/cache/age_split_exif.json`; fetch on cache miss. +3. Find single nearest era anchor by cosine distance to its centroid. +4. Accept iff `dist ≤ 0.40` AND `|year − anchor.dom_year| ≤ 5`. + These thresholds match `age_split_001.py`'s anchor-fragment rule. +5. Anchors are NOT re-centered after absorption (preserves age_split's + drift-prevention guarantee). + +### 2.4 Run results + +50 unbucketed → 21 with EXIF year → **14 accepted**: + +| anchor | dom_year | added | +|--------|---------:|------:| +| `_2005-10` | 2010 | +2 | +| `_2010-13` | 2011 | +1 | +| `_2014-17` | 2016 | **+9** | +| `_2018-20` | 2019 | +2 | + +29 PNGs skipped for missing EXIF year (mostly immich-stripped +photos). 7 dist/year-rejected (e.g. two PNGs from 2025 want +`_2018-19` but year-delta 7 > 5). + +### 2.5 Reconciliation side effect + +The apply rebuilds each affected era bucket's `faces/` from staging. This +incidentally reconciled the per-bucket manifests with disk after the prior +occlusion filter run had left era manifests stale at 282/126/132 entries vs +~248/125/129 actual files (occlusion filter only updates the master +manifest, never per-faceset manifests — see +`docs/analysis/clip-occlusion-filter.md` §7). 42 occlusion-dropped era PNGs +inside the old `faces/_dropped/` were removed during rebuild. 
The +parent `faceset_001/faces/_dropped/` still has the corpus-level audit; all +source images are intact at `/mnt/x/src/`, so the era-level dropped PNGs +are regeneratable via `cmd_export_swap`. + +## 3. Re-running + +Always run both passes after any new identity import (Immich, osrc, +hand-sorted folder): + +```bash +# 1. Find duplicate identities +python work/consolidate_facesets.py analyze \ + --out work/merge_review/candidates.json [--edge 0.55 --confident 0.65] +python work/consolidate_facesets.py report \ + --candidates work/merge_review/candidates.json --out work/merge_review +# inspect work/merge_review/index.html +python work/consolidate_facesets.py apply \ + --candidates work/merge_review/candidates.json [--include-uncertain] + +# 2. Slot new faceset_001 PNGs into existing era buckets +python work/age_extend_001.py analyze --out work/age_extend/candidates.json +python work/age_extend_001.py report \ + --candidates work/age_extend/candidates.json --out work/age_extend +python work/age_extend_001.py apply --candidates work/age_extend/candidates.json +``` + +Both are idempotent. `consolidate_facesets` skips secondaries already in +`_merged/`; `age_extend_001` recomputes anchor centroids + dom-year fresh +on every run. diff --git a/work/age_extend_001.py b/work/age_extend_001.py new file mode 100644 index 0000000..d611d9e --- /dev/null +++ b/work/age_extend_001.py @@ -0,0 +1,576 @@ +"""Extend the existing 6 era buckets of faceset_001 by absorbing PNGs that +post-date the original age_split run (from consolidation merges, etc.). + +Mirrors the anchor-fragment assignment logic in age_split_001.py: + - For each unbucketed face in faceset_001's manifest, find the nearest active + era anchor by cosine distance to the anchor's centroid. + - Accept the assignment iff dist <= 0.40 AND |year_delta| <= 5 + (where year_delta = exif_year(face) - dom_year(anchor)). + - Undated PNGs are skipped (no assignment). + - Anchors are NOT re-centered after absorption (preserves the same drift + guarantees as the original age_split). + +CLI: + python work/age_extend_001.py analyze --out work/age_extend/candidates.json + python work/age_extend_001.py report --candidates ... --out work/age_extend + python work/age_extend_001.py apply --candidates ... 
[--dry-run] +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +import time +from collections import Counter +from pathlib import Path + +import numpy as np +from PIL import Image, ExifTags + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +PARENT = "faceset_001" +ACTIVE_ERAS = [ + "faceset_001_2005-10", + "faceset_001_2010-13", + "faceset_001_2011", + "faceset_001_2014-17", + "faceset_001_2018-19", + "faceset_001_2018-20", +] +CACHES = [ + Path("/opt/face-sets/work/cache/nl_full.npz"), + Path("/opt/face-sets/work/cache/immich_peter.npz"), + Path("/opt/face-sets/work/cache/immich_nic.npz"), +] +EXIF_CACHE = Path("/opt/face-sets/work/cache/age_split_exif.json") + +# anchor-fragment thresholds (mirror age_split_001.py) +DIST_MAX = 0.40 +YEAR_MAX = 5 + + +# ----------------------------- caches ----------------------------- + +def load_caches(): + rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} + alias_map: dict[str, str] = {} + for c in CACHES: + if not c.exists(): + print(f"[warn] cache missing: {c}", file=sys.stderr) + continue + d = np.load(c, allow_pickle=True) + emb = d["embeddings"] + meta = json.loads(str(d["meta"])) + face_records = [m for m in meta if not m.get("noface")] + if len(face_records) != len(emb): + raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}") + if "path_aliases" in d.files: + paliases = json.loads(str(d["path_aliases"])) + for canon, alist in paliases.items(): + alias_map.setdefault(canon, canon) + for a in alist: + alias_map[a] = canon + for i, rec in enumerate(face_records): + p = rec["path"] + bbox = tuple(int(x) for x in rec["bbox"]) + v = emb[i].astype(np.float32) + n = float(np.linalg.norm(v)) + if n > 0: + v = v / n + rec_index[(p, bbox)] = v + alias_map.setdefault(p, p) + print(f"[cache] indexed {len(rec_index)} face records, {len(alias_map)} aliases", file=sys.stderr) + return rec_index, alias_map + + +def lookup_emb(rec_index, alias_map, src: str, bbox): + bbox_t = tuple(int(x) for x in bbox) + canon = alias_map.get(src, src) + v = rec_index.get((canon, bbox_t)) + if v is None and canon != src: + v = rec_index.get((src, bbox_t)) + return v + + +# ----------------------------- exif ----------------------------- + +def load_exif_cache(): + if not EXIF_CACHE.exists(): + return {} + return json.loads(EXIF_CACHE.read_text()) + + +def save_exif_cache(cache): + tmp = EXIF_CACHE.with_suffix(".tmp.json") + tmp.write_text(json.dumps(cache, indent=2)) + tmp.replace(EXIF_CACHE) + + +def exif_year(path: Path) -> int | None: + try: + with Image.open(path) as im: + ex = im._getexif() + if not ex: + return None + for tag_id, val in ex.items(): + tag = ExifTags.TAGS.get(tag_id, tag_id) + if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4: + return int(val[:4]) + except Exception: + return None + return None + + +def get_year(src: str, exif_cache) -> int | None: + """Return EXIF year for src, using cache. 
Mutates cache for new lookups.""" + if src in exif_cache: + return exif_cache[src] + p = Path(src) + y = exif_year(p) if p.exists() else None + exif_cache[src] = y + return y + + +# ----------------------------- analyze ----------------------------- + +def cmd_analyze(args): + rec_index, alias_map = load_caches() + exif_cache = load_exif_cache() + exif_cache_dirty = False + + parent_dir = ROOT / PARENT + parent_manifest = json.loads((parent_dir / "manifest.json").read_text()) + parent_faces = parent_manifest.get("faces", []) + print(f"[parent] {PARENT}: {len(parent_faces)} face entries", file=sys.stderr) + + # Build "in_bucket" set + each anchor's centroid + dom_year + anchors = [] + in_bucket: set[tuple[str, tuple[int, int, int, int]]] = set() + for era in ACTIVE_ERAS: + ed = ROOT / era + if not ed.is_dir(): + print(f"[warn] missing era bucket: {era}", file=sys.stderr) + continue + em = json.loads((ed / "manifest.json").read_text()) + emb_list = [] + years = [] + n_missing_emb = 0 + for f in em.get("faces", []): + src = f.get("source") + bbox = f.get("bbox") + if not src or not bbox: + continue + key = (alias_map.get(src, src), tuple(int(x) for x in bbox)) + in_bucket.add(key) + in_bucket.add((src, tuple(int(x) for x in bbox))) # cover both alias and raw + v = lookup_emb(rec_index, alias_map, src, bbox) + if v is None: + n_missing_emb += 1 + else: + emb_list.append(v) + y = get_year(src, exif_cache) + if y is None: + exif_cache_dirty = True + else: + years.append(y) + if src not in exif_cache: + exif_cache_dirty = True + if not emb_list: + print(f"[warn] {era}: no embeddings found, skipping anchor", file=sys.stderr) + continue + arr = np.stack(emb_list).astype(np.float32) + c = arr.mean(axis=0) + n = float(np.linalg.norm(c)) + if n > 0: + c = c / n + dom_year = Counter(years).most_common(1)[0][0] if years else None + anchors.append({ + "name": era, "centroid": c, "n_faces": len(em.get("faces", [])), + "n_emb_used": len(emb_list), "n_emb_missing": n_missing_emb, + "dom_year": dom_year, + "year_min": min(years) if years else None, + "year_max": max(years) if years else None, + }) + print(f"[anchor] {era}: n={len(em.get('faces', []))} emb_used={len(emb_list)} " + f"emb_miss={n_missing_emb} dom_year={dom_year} years=[{min(years) if years else '-'}..{max(years) if years else '-'}]", + file=sys.stderr) + + # Find unbucketed faces in parent + unbucketed = [] + for f in parent_faces: + src = f.get("source") + bbox = f.get("bbox") + if not src or not bbox: + continue + bbox_t = tuple(int(x) for x in bbox) + key1 = (alias_map.get(src, src), bbox_t) + key2 = (src, bbox_t) + if key1 in in_bucket or key2 in in_bucket: + continue + unbucketed.append(f) + print(f"[parent] {len(unbucketed)} unbucketed face entries (in {PARENT} but no era bucket)", file=sys.stderr) + + # Score each unbucketed face against every anchor + proposals = [] + skipped_no_emb = 0 + skipped_no_year = 0 + for f in unbucketed: + src = f["source"] + bbox = f["bbox"] + v = lookup_emb(rec_index, alias_map, src, bbox) + if v is None: + skipped_no_emb += 1 + continue + y = get_year(src, exif_cache) + if y is None: + skipped_no_year += 1 + exif_cache_dirty = True + continue + if src not in exif_cache: + exif_cache_dirty = True + # nearest anchor + best = None # (dist, idx) + for i, a in enumerate(anchors): + d = 1.0 - float(np.dot(a["centroid"], v)) + if best is None or d < best[0]: + best = (d, i) + if best is None: + continue + dist, bidx = best + anchor = anchors[bidx] + year_delta = abs(y - anchor["dom_year"]) if anchor["dom_year"] is 
not None else None + accept = (dist <= DIST_MAX and year_delta is not None and year_delta <= YEAR_MAX) + proposals.append({ + "png": f["png"], + "source": src, + "bbox": [int(x) for x in bbox], + "year": y, + "rank_in_parent": f.get("rank"), + "quality_composite": f.get("quality", {}).get("composite"), + "quality": f.get("quality", {}), + "best_anchor": anchor["name"], + "best_anchor_dom_year": anchor["dom_year"], + "centroid_dist": round(dist, 4), + "year_delta": year_delta, + "accept": bool(accept), + "all_anchor_dists": { + a["name"]: round(1.0 - float(np.dot(a["centroid"], v)), 4) for a in anchors + }, + }) + + if exif_cache_dirty: + save_exif_cache(exif_cache) + print(f"[exif] cache flushed ({len(exif_cache)} entries total)", file=sys.stderr) + + # Summarize + accepted = [p for p in proposals if p["accept"]] + rejected = [p for p in proposals if not p["accept"]] + by_anchor = Counter(p["best_anchor"] for p in accepted) + print(f"[summary] unbucketed={len(unbucketed)} scored={len(proposals)} " + f"accepted={len(accepted)} rejected={len(rejected)} " + f"skipped(no_emb={skipped_no_emb}, no_year={skipped_no_year})", file=sys.stderr) + for k, v in by_anchor.most_common(): + print(f" {k}: +{v}", file=sys.stderr) + + out = { + "thresholds": {"dist_max": DIST_MAX, "year_max": YEAR_MAX}, + "anchors": [ + {k: v for k, v in a.items() if k != "centroid"} + for a in anchors + ], + "n_unbucketed": len(unbucketed), + "skipped": {"no_emb": skipped_no_emb, "no_year": skipped_no_year}, + "proposals": sorted(proposals, key=lambda p: (not p["accept"], p["best_anchor"], -1 * (p["quality_composite"] or 0))), + "by_anchor": dict(by_anchor), + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(out, indent=2)) + print(f"[done] {len(proposals)} proposals -> {op}", file=sys.stderr) + + +# ----------------------------- report ----------------------------- + +def cmd_report(args): + cand = json.loads(Path(args.candidates).read_text()) + out_dir = Path(args.out) + thumbs_dir = out_dir / "thumbs" + thumbs_dir.mkdir(parents=True, exist_ok=True) + THUMB = 140 + + def make_thumb(png_relpath: str) -> str: + # png_relpath looks like "faces/0042.png" + src = ROOT / PARENT / png_relpath + name = Path(png_relpath).stem + dst = thumbs_dir / f"{name}.jpg" + if not dst.exists(): + try: + img = Image.open(src).convert("RGB") + img.thumbnail((THUMB, THUMB), Image.LANCZOS) + img.save(dst, "JPEG", quality=82) + except Exception as e: + print(f"[thumb-skip] {src}: {e}", file=sys.stderr) + return "" + return f"thumbs/{name}.jpg" + + # group accepted proposals by target anchor + by_anchor: dict[str, list] = {} + rejected = [] + for p in cand["proposals"]: + if p["accept"]: + by_anchor.setdefault(p["best_anchor"], []).append(p) + else: + rejected.append(p) + + rows = [] + rows.append("

<h1>faceset_001 age extension — review</h1>")
+    rows.append(f"<p class='sub'>{cand['n_unbucketed']} unbucketed faces in {PARENT}; "
+                f"{sum(len(v) for v in by_anchor.values())} accepted / {len(rejected)} rejected; "
+                f"thresholds dist≤{cand['thresholds']['dist_max']} AND |year_delta|≤{cand['thresholds']['year_max']}.</p>")
+    nav = " · ".join(f"{a} (+{len(by_anchor[a])})" for a in by_anchor) + " · rejected"
+    rows.append(f"<p class='nav'>{nav}</p>")
+
+    for anchor_name in ACTIVE_ERAS:
+        if anchor_name not in by_anchor:
+            continue
+        items = by_anchor[anchor_name]
+        anchor_meta = next((a for a in cand["anchors"] if a["name"] == anchor_name), {})
+        rows.append(f"<div class='group' id='{anchor_name}'>")
+        rows.append(f"<h2>{anchor_name} (dom_year={anchor_meta.get('dom_year')}; "
+                    f"existing n={anchor_meta.get('n_faces')}; +{len(items)} new)</h2>")
+        rows.append("<div class='grid'>")
+        for p in sorted(items, key=lambda x: (x["centroid_dist"], -1 * (x["quality_composite"] or 0))):
+            thumb = make_thumb(p["png"])
+            cls = "hi" if p["centroid_dist"] <= 0.30 else "mid"
+            rows.append(
+                f"<div class='cell {cls}'>"
+                f"<img src='{thumb}'>"
+                f"<div class='cap'>{p['png']}<br>year {p['year']} (Δ{p['year_delta']})<br>"
+                f"dist {p['centroid_dist']:.3f}</div>"
+                f"</div>"
+            )
+        rows.append("</div></div>")
+
+    if rejected:
+        rows.append("<div class='group'>")
+        rows.append(f"<h2>rejected ({len(rejected)} faces don't fit any anchor)</h2>")
+        rows.append("<div class='grid'>")
+        for p in sorted(rejected, key=lambda x: x["centroid_dist"])[:200]:
+            thumb = make_thumb(p["png"])
+            why = []
+            if p["centroid_dist"] > cand['thresholds']['dist_max']:
+                why.append(f"dist {p['centroid_dist']:.2f}>{cand['thresholds']['dist_max']}")
+            if p["year_delta"] is None or p["year_delta"] > cand['thresholds']['year_max']:
+                why.append(f"yΔ{p['year_delta']}>{cand['thresholds']['year_max']}")
+            rows.append(
+                f"<div class='cell'>"
+                f"<img src='{thumb}'>"
+                f"<div class='cap'>{p['png']}<br>year {p['year']} → best {p['best_anchor']}<br>"
+                f"{'; '.join(why)}</div>"
+                f"</div>"
+            )
+        if len(rejected) > 200:
+            rows.append(f"<p class='sub'>...{len(rejected)-200} more truncated.</p>")
+        rows.append("</div></div>
") + + html = f""" +faceset_001 age extension + + +{''.join(rows)} +""" + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +# ----------------------------- apply ----------------------------- + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def cmd_apply(args): + cand = json.loads(Path(args.candidates).read_text()) + accepted = [p for p in cand["proposals"] if p["accept"]] + if args.dry_run: + from collections import Counter as C + by = C(p["best_anchor"] for p in accepted) + print(f"=== dry-run: {len(accepted)} assignments across {len(by)} anchors ===") + for k, v in by.most_common(): + print(f" {k}: +{v}") + return + + parent_dir = ROOT / PARENT + master_path = ROOT / "manifest.json" + master = json.loads(master_path.read_text()) + facesets_by_name = {f["name"]: f for f in master.get("facesets", [])} + + by_anchor: dict[str, list] = {} + for p in accepted: + by_anchor.setdefault(p["best_anchor"], []).append(p) + + total_added = 0 + for anchor_name, props in by_anchor.items(): + ed = ROOT / anchor_name + em_path = ed / "manifest.json" + em = json.loads(em_path.read_text()) + existing = list(em.get("faces", [])) + + # gather new entries with their source PNG paths in faceset_001/faces/ + new_with_src = [] + for p in props: + src_png = parent_dir / p["png"] + if not src_png.exists(): + print(f"[warn] missing parent PNG {src_png}; skip", file=sys.stderr) + continue + face_entry = { + "source": p["source"], + "bbox": p["bbox"], + "quality": p["quality"], + "exif_year": p["year"], + "centroid_dist_at_assign": p["centroid_dist"], + "year_delta_at_assign": p["year_delta"], + "extended_from_parent": True, + } + new_with_src.append((face_entry, src_png)) + + # combine; rank by quality.composite desc (existing entries already have rank, + # but we re-rank globally so new entries slot in by quality) + combined: list[tuple[dict, Path | None]] = [] + for f in existing: + combined.append((f, None)) + combined.extend(new_with_src) + combined.sort(key=lambda x: -x[0].get("quality", {}).get("composite", 0)) + + # stage fresh + staging = ed / "_faces_new" + if staging.exists(): + shutil.rmtree(staging) + staging.mkdir() + new_face_entries = [] + for new_rank, (face, src_png_or_none) in enumerate(combined, start=1): + new_name = f"{new_rank:04d}.png" + if src_png_or_none is None: + # existing entry: copy from current era bucket faces/ + old_name = Path(face["png"]).name + src = ed / "faces" / old_name + if not src.exists(): + print(f"[warn] {anchor_name}: missing existing PNG {src}; skip", file=sys.stderr) + continue + shutil.copy2(src, staging / new_name) + else: + shutil.copy2(src_png_or_none, staging / new_name) + face = dict(face) + face["rank"] = new_rank + face["png"] = f"faces/{new_name}" + new_face_entries.append(face) + + # swap dirs + old_holding = ed / "_faces_old" + if old_holding.exists(): + shutil.rmtree(old_holding) + (ed / "faces").rename(old_holding) + staging.rename(ed / "faces") + shutil.rmtree(old_holding) + + # re-zip .fsz + survivor_pngs = sorted((ed / "faces").glob("*.png")) + top_n = em.get("top_n", 30) + top_n_eff = min(top_n, len(survivor_pngs)) + for old in ed.glob("*.fsz"): + old.unlink() + top_fsz_name = f"{anchor_name}_top{top_n_eff}.fsz" + all_fsz_name = f"{anchor_name}_all.fsz" + 
_zip_png_list(survivor_pngs[:top_n_eff], ed / top_fsz_name) + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, ed / all_fsz_name) + all_fsz_used = all_fsz_name + else: + all_fsz_used = None + + # update local + master manifests + em["faces"] = new_face_entries + em["exported"] = len(new_face_entries) + em["fsz_top"] = top_fsz_name + em["fsz_all"] = all_fsz_used + em["top_n"] = top_n_eff + em.setdefault("age_extend_history", []).append({ + "added": len(new_with_src), + "thresholds": cand["thresholds"], + }) + em_path.write_text(json.dumps(em, indent=2)) + + if anchor_name in facesets_by_name: + facesets_by_name[anchor_name]["exported"] = len(new_face_entries) + facesets_by_name[anchor_name]["fsz_top"] = top_fsz_name + facesets_by_name[anchor_name]["fsz_all"] = all_fsz_used + facesets_by_name[anchor_name]["top_n"] = top_n_eff + + added_here = len(new_with_src) + total_added += added_here + print(f"[applied] {anchor_name}: +{added_here} (now {len(new_face_entries)} faces)", file=sys.stderr) + + # rewrite master with ordering preserved + new_facesets = [] + for entry in master.get("facesets", []): + new_facesets.append(facesets_by_name.get(entry["name"], entry)) + master["facesets"] = new_facesets + master.setdefault("age_extend_runs", []).append({ + "parent": PARENT, + "thresholds": cand["thresholds"], + "anchors": list(by_anchor.keys()), + "added_total": total_added, + }) + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(master, indent=2)) + tmp.replace(master_path) + print(f"[done] +{total_added} faces across {len(by_anchor)} anchors", file=sys.stderr) + + +# ----------------------------- main ----------------------------- + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + a = sub.add_parser("analyze") + a.add_argument("--out", required=True) + a.set_defaults(func=cmd_analyze) + + r = sub.add_parser("report") + r.add_argument("--candidates", required=True) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + p = sub.add_parser("apply") + p.add_argument("--candidates", required=True) + p.add_argument("--dry-run", action="store_true") + p.set_defaults(func=cmd_apply) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/clip_worker.py b/work/clip_worker.py new file mode 100644 index 0000000..9d28536 --- /dev/null +++ b/work/clip_worker.py @@ -0,0 +1,221 @@ +"""Windows / DirectML CLIP worker for occlusion scoring. + +Reads a queue.json staged by /opt/face-sets/work/filter_occlusions.py (WSL side), +runs open_clip ViT-L-14 (dfn2b_s39b) on each PNG via torch-directml on the AMD +Vega, and writes a scores.json with mask + sunglasses softmax probabilities. 
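+Similarities are multiplied by model.logit_scale.exp() (~100) before the softmax;
+without that scale the raw CLIP cosines (~0.2–0.3) collapse every score to ~0.5.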
+ +CLI: + py -3.12 clip_worker.py [--limit N] [--batch 8] + +queue.json shape: list of objects + {"wsl_path": "...", "win_path": "E:\\...\\faceset_NNN\\faces\\NNNN.png", + "faceset": "faceset_NNN", "file": "NNNN.png"} + +scores.json shape: + {"model": "ViT-L-14/dfn2b_s39b", + "logit_scale": 100.0, + "prompts": {...}, + "results": [{"wsl_path": "...", "faceset": "...", "file": "...", + "mask": float, "sunglasses": float}], + "processed": [wsl_path, ...]} +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +import warnings +from pathlib import Path + +# DML emits a verbose UserWarning per attention call -- silence at import time +warnings.filterwarnings("ignore", category=UserWarning) + +import torch +import torch_directml +import open_clip +from PIL import Image + +MODEL_NAME = "ViT-L-14" +PRETRAINED = "dfn2b_s39b" + +# kept in sync with /opt/face-sets/work/filter_occlusions.py PROMPTS +PROMPTS = { + "mask": { + "pos": [ + "a photo of a person wearing a surgical face mask", + "a photo of a person wearing an FFP2 respirator covering mouth and nose", + "a photo of a person wearing a cloth face mask", + "a face partially covered by a medical mask", + "a person whose mouth and nose are hidden by a face mask", + ], + "neg": [ + "a photo of a person's face with mouth and nose clearly visible", + "a clear, unobstructed photo of a face", + "a photo of a face without any mask or covering", + "a portrait of a person showing their full face", + "a photo of a person with a beard and visible mouth", + ], + }, + "sunglasses": { + "pos": [ + "a face with dark sunglasses covering the eyes", + "a portrait with the eyes hidden behind opaque sunglasses", + "a person wearing dark sunglasses over their eyes, eyes not visible", + "a face where the eyes are completely concealed by tinted lenses", + "a close-up portrait wearing aviator sunglasses on the eyes", + ], + "neg": [ + "a portrait with both eyes clearly visible and uncovered", + "a face with sunglasses pushed up on the forehead, eyes visible below", + "a face with sunglasses resting on top of the head, eyes visible", + "a person with sunglasses hanging from their shirt, eyes visible", + "a face wearing clear prescription eyeglasses with visible eyes", + "a portrait with no eyewear and visible eyes", + ], + }, +} + +FLUSH_EVERY = 100 + + +def load_existing(out_path: Path): + if not out_path.exists(): + return None, set() + try: + d = json.loads(out_path.read_text()) + processed = set(d.get("processed", [])) + return d, processed + except Exception as e: + print(f"[warn] could not parse existing {out_path}: {e}; starting fresh", file=sys.stderr) + return None, set() + + +def save_atomic(out_path: Path, data: dict): + tmp = out_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(data, indent=2)) + os.replace(tmp, out_path) + + +@torch.no_grad() +def build_text_features(model, tokenizer, device): + out = {} + for attr, sides in PROMPTS.items(): + feats = {} + for side in ("pos", "neg"): + tokens = tokenizer(sides[side]).to(device) + f = model.encode_text(tokens) + f = f / f.norm(dim=-1, keepdim=True) + mean = f.mean(dim=0) + feats[side] = mean / mean.norm() + out[attr] = (feats["pos"], feats["neg"]) + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("queue", type=Path) + ap.add_argument("out", type=Path) + ap.add_argument("--limit", type=int, default=None) + ap.add_argument("--batch", type=int, default=8) + args = ap.parse_args() + + queue = 
json.loads(args.queue.read_text()) + print(f"[queue] {len(queue)} entries from {args.queue}") + + args.out.parent.mkdir(parents=True, exist_ok=True) + existing, processed = load_existing(args.out) + if existing: + print(f"[resume] {len(processed)} entries already scored") + results = existing.get("results", []) + else: + results = [] + + pending = [e for e in queue if e["wsl_path"] not in processed] + if args.limit is not None: + pending = pending[: args.limit] + print(f"[pending] {len(pending)} entries to score") + + if not pending: + print("[done] nothing to do") + return + + device = torch_directml.device() + print(f"[load] {MODEL_NAME}/{PRETRAINED} on {torch_directml.device_name(0)}") + t0 = time.time() + model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED) + tokenizer = open_clip.get_tokenizer(MODEL_NAME) + model = model.to(device).eval() + logit_scale = float(model.logit_scale.exp().detach().cpu()) + print(f"[load] ready in {time.time()-t0:.1f}s logit_scale={logit_scale:.2f}") + text_feats = build_text_features(model, tokenizer, device) + + def flush(): + save_atomic(args.out, { + "model": f"{MODEL_NAME}/{PRETRAINED}", + "logit_scale": logit_scale, + "prompts": PROMPTS, + "results": results, + "processed": sorted(processed), + }) + + n_done_this_run = 0 + n_load_err = 0 + last_flush = time.time() + t_start = time.time() + + for i in range(0, len(pending), args.batch): + chunk = pending[i:i + args.batch] + imgs = [] + keep = [] + for entry in chunk: + try: + img = Image.open(entry["win_path"]).convert("RGB") + imgs.append(preprocess(img)) + keep.append(entry) + except Exception as e: + print(f"[skip] {entry['win_path']}: {e}", file=sys.stderr) + n_load_err += 1 + processed.add(entry["wsl_path"]) + if not imgs: + continue + x = torch.stack(imgs).to(device) + with torch.no_grad(): + feats = model.encode_image(x) + feats = feats / feats.norm(dim=-1, keepdim=True) + scores_per_attr = {} + for attr, (pos, neg) in text_feats.items(): + sims = torch.stack([feats @ pos, feats @ neg], dim=1) * logit_scale + probs = sims.softmax(dim=1)[:, 0].detach().cpu().tolist() + scores_per_attr[attr] = probs + for j, entry in enumerate(keep): + results.append({ + "wsl_path": entry["wsl_path"], + "faceset": entry["faceset"], + "file": entry["file"], + "mask": round(scores_per_attr["mask"][j], 4), + "sunglasses": round(scores_per_attr["sunglasses"][j], 4), + }) + processed.add(entry["wsl_path"]) + n_done_this_run += 1 + + if (n_done_this_run % FLUSH_EVERY < args.batch) or (time.time() - last_flush) > 30.0: + flush() + last_flush = time.time() + elapsed = time.time() - t_start + rate = n_done_this_run / max(0.1, elapsed) + eta_min = (len(pending) - n_done_this_run) / max(0.1, rate) / 60.0 + print(f"[score] {n_done_this_run}/{len(pending)} " + f"rate={rate:.2f} img/s eta={eta_min:.1f}min " + f"load_err={n_load_err}", flush=True) + + flush() + elapsed = time.time() - t_start + print(f"[done] {n_done_this_run} scored, {n_load_err} load errors, " + f"{elapsed:.1f}s ({n_done_this_run/max(0.1,elapsed):.2f} img/s) -> {args.out}") + + +if __name__ == "__main__": + main() diff --git a/work/consolidate_facesets.py b/work/consolidate_facesets.py new file mode 100644 index 0000000..c476cfa --- /dev/null +++ b/work/consolidate_facesets.py @@ -0,0 +1,634 @@ +"""Consolidate facesets_swap_ready/ — find duplicate identities and merge. + +Pipeline: + 1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every + active faceset (skipping _masked, _thin, era splits). 
Compute L2-normalized + centroid per faceset. Build similarity graph at sim>=0.45, extract components. + Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size. + 2. report: HTML contact sheet at work/merge_review/index.html grouped by + candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and + "merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted. + 3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite + descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries + to facesets_swap_ready/_merged//, update master manifest with + `merged[]` array + `merge_run` provenance block. + +Embeddings come from caches (no GPU re-embed needed); the original clusterer used +exactly these vectors so they are the right yardstick. Era splits are excluded +entirely (intentional time-period segmentation, not a duplication). +""" + +from __future__ import annotations + +import argparse +import json +import re +import shutil +import sys +import time +from pathlib import Path + +import numpy as np +from PIL import Image +from scipy.cluster.hierarchy import linkage, fcluster +from scipy.spatial.distance import squareform + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +CACHES = [ + Path("/opt/face-sets/work/cache/nl_full.npz"), + Path("/opt/face-sets/work/cache/immich_peter.npz"), + Path("/opt/face-sets/work/cache/immich_nic.npz"), +] + +ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$") + + +# ----------------------------- helpers ----------------------------- + +def load_caches(): + """Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple) + -> embedding (np.float32, shape (512,) L2-normalized). + alias_map maps every alias path -> canonical path.""" + rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} + alias_map: dict[str, str] = {} + n_total = 0 + for c in CACHES: + if not c.exists(): + print(f"[warn] cache missing: {c}", file=sys.stderr) + continue + d = np.load(c, allow_pickle=True) + emb = d["embeddings"] + meta = json.loads(str(d["meta"])) + face_records = [m for m in meta if not m.get("noface")] + if len(face_records) != len(emb): + raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}") + # path_aliases may be present + if "path_aliases" in d.files: + paliases = json.loads(str(d["path_aliases"])) + for canon, alist in paliases.items(): + alias_map.setdefault(canon, canon) + for a in alist: + alias_map[a] = canon + for i, rec in enumerate(face_records): + p = rec["path"] + bbox = tuple(int(x) for x in rec["bbox"]) + v = emb[i].astype(np.float32) + n = float(np.linalg.norm(v)) + if n > 0: + v = v / n + rec_index[(p, bbox)] = v + alias_map.setdefault(p, p) + print(f"[cache] {c.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr) + n_total += len(face_records) + print(f"[cache] indexed {n_total} face records, {len(alias_map)} path aliases", file=sys.stderr) + return rec_index, alias_map + + +def faceset_tier(name: str) -> int: + """Lower number = higher priority for primary selection.""" + m = re.match(r"^faceset_0*(\d+)$", name) + if not m: + return 99 # unknown structure + n = int(m.group(1)) + if 13 <= n <= 19: + return 0 # hand-sorted + if 1 <= n <= 12: + return 1 # auto-clustered + if 20 <= n <= 25: + return 2 # osrc + if 26 <= n <= 264: + return 3 # immich peter + if 265 <= n: + return 4 # immich nic and beyond + return 99 + + +def is_era_split(name: 
str) -> bool: + return bool(ERA_SPLIT_RE.match(name)) + + +def faceset_centroid(faceset_dir: Path, rec_index, alias_map): + """Return (centroid, n_used, n_missing) where centroid is L2-normalized mean + of embeddings of the faces listed in the per-faceset manifest. Falls back to + None if too few embeddings found.""" + manifest = faceset_dir / "manifest.json" + if not manifest.exists(): + return None, 0, 0 + m = json.loads(manifest.read_text()) + vecs = [] + n_missing = 0 + for f in m.get("faces", []): + src = f.get("source") + bbox = f.get("bbox") + if src is None or bbox is None: + n_missing += 1 + continue + bbox_t = tuple(int(x) for x in bbox) + canon = alias_map.get(src, src) + v = rec_index.get((canon, bbox_t)) + if v is None and canon != src: + v = rec_index.get((src, bbox_t)) + if v is None: + n_missing += 1 + continue + vecs.append(v) + if len(vecs) < 3: + return None, len(vecs), n_missing + arr = np.stack(vecs).astype(np.float32) + c = arr.mean(axis=0) + n = float(np.linalg.norm(c)) + if n > 0: + c = c / n + return c, len(vecs), n_missing + + +def connected_components(adj: dict[int, set[int]]) -> list[list[int]]: + seen: set[int] = set() + comps = [] + for node in adj: + if node in seen: + continue + stack = [node] + comp = [] + while stack: + x = stack.pop() + if x in seen: + continue + seen.add(x) + comp.append(x) + for y in adj.get(x, set()): + if y not in seen: + stack.append(y) + comps.append(sorted(comp)) + return comps + + +# ----------------------------- analyze ----------------------------- + +def cmd_analyze(args): + rec_index, alias_map = load_caches() + + # collect active facesets + active = [] + for d in sorted(ROOT.iterdir()): + if not d.is_dir() or d.name.startswith("_"): + continue + if is_era_split(d.name): + continue + active.append(d) + print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)", file=sys.stderr) + + centroids: dict[str, np.ndarray] = {} + sizes: dict[str, int] = {} + skipped = [] + t0 = time.time() + for fs in active: + c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map) + if c is None: + skipped.append((fs.name, n_used, n_miss)) + continue + centroids[fs.name] = c + sizes[fs.name] = n_used + print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; " + f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr) + if skipped: + for n, u, m in skipped[:10]: + print(f" skip {n}: used={u} missing={m}", file=sys.stderr) + if len(skipped) > 10: + print(f" ... +{len(skipped)-10} more", file=sys.stderr) + + names = sorted(centroids.keys()) + if not names: + raise SystemExit("no centroids built") + + # similarity matrix + M = np.stack([centroids[n] for n in names]).astype(np.float32) # (N, 512), normalized + sim = M @ M.T # (N, N) cosine since unit-normalized + np.clip(sim, -1.0, 1.0, out=sim) + + edge_thr = args.edge + confident_thr = args.confident + + # complete-linkage agglomerative clustering on cosine distance. + # Cut at edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr. + # This avoids the chaining problem of single-link / connected-components. 
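+    # Illustrative example (made-up numbers, not from a real run): with centroid
+    # sims A·B = 0.52, B·C = 0.49, A·C = 0.12 and edge_thr = 0.45, single-link /
+    # connected-components would chain A-B-C into one group through B even though
+    # A and C clearly differ; the complete-link cut below only keeps a group when
+    # EVERY pair clears edge_thr, so this example splits into {A, B} and {C}.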
+ n = len(names) + dist = 1.0 - sim + np.fill_diagonal(dist, 0.0) + # symmetrize numerical noise + dist = (dist + dist.T) / 2.0 + np.clip(dist, 0.0, 2.0, out=dist) + cond = squareform(dist, checks=False) + Z = linkage(cond, method="complete") + cut_dist = 1.0 - edge_thr # complete-link distance corresponds to (1 - min sim) + labels = fcluster(Z, t=cut_dist, criterion="distance") # 1-indexed cluster ids + + cluster_members: dict[int, list[int]] = {} + for idx, lbl in enumerate(labels): + cluster_members.setdefault(int(lbl), []).append(idx) + comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1] + + n_pairs_in_groups = 0 + for c in comps: + n_pairs_in_groups += len(c) * (len(c) - 1) // 2 + print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups " + f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr) + + # pick primary per group: lowest tier number, then largest size + groups_out = [] + for comp in comps: + members = [names[i] for i in comp] + members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x)) + primary = members_sorted[0] + secondaries = members_sorted[1:] + # gather pairwise sims within group + pair_sims = [] + idx_of = {names[i]: i for i in comp} + for a in members: + for b in members: + if a >= b: + continue + pair_sims.append({"a": a, "b": b, "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)}) + # confidence: minimum within-group sim (the weakest link) + min_link = min(p["sim"] for p in pair_sims) + max_link = max(p["sim"] for p in pair_sims) + confidence = "confident" if min_link >= confident_thr else "uncertain" + groups_out.append({ + "primary": primary, + "secondaries": secondaries, + "members": members_sorted, + "tiers": {n: faceset_tier(n) for n in members}, + "sizes": {n: sizes.get(n, 0) for n in members}, + "pair_sims": pair_sims, + "min_link": round(min_link, 4), + "max_link": round(max_link, 4), + "confidence": confidence, + }) + # sort: confident first, then by max_link desc + groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"])) + + out = { + "thresholds": {"edge": edge_thr, "confident": confident_thr}, + "n_active": len(active), + "n_centroided": len(centroids), + "n_skipped": len(skipped), + "skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped], + "n_groups": len(groups_out), + "n_facesets_in_groups": sum(len(g["members"]) for g in groups_out), + "groups": groups_out, + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(out, indent=2)) + confident = sum(1 for g in groups_out if g["confidence"] == "confident") + uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain") + print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}", file=sys.stderr) + + +# ----------------------------- report ----------------------------- + +def cmd_report(args): + candidates = json.loads(Path(args.candidates).read_text()) + out_dir = Path(args.out) + thumbs_dir = out_dir / "thumbs" + thumbs_dir.mkdir(parents=True, exist_ok=True) + + THUMB = 140 + THUMBS_PER_FACESET = 4 + + def make_thumb(faceset: str, fname: str) -> str: + d = thumbs_dir / faceset + d.mkdir(parents=True, exist_ok=True) + dst = d / (Path(fname).stem + ".jpg") + if not dst.exists(): + try: + src = ROOT / faceset / "faces" / fname + img = Image.open(src).convert("RGB") + img.thumbnail((THUMB, THUMB), Image.LANCZOS) + img.save(dst, "JPEG", quality=82) + except 
Exception as e:
+                print(f"[thumb-skip] {faceset}/{fname}: {e}", file=sys.stderr)
+                return ""
+        return f"thumbs/{faceset}/{Path(fname).stem}.jpg"
+
+    rows = []
+    for gi, g in enumerate(candidates["groups"]):
+        primary = g["primary"]
+        sec = g["secondaries"]
+        conf_cls = "confident" if g["confidence"] == "confident" else "uncertain"
+        rows.append(f"<div class='group {conf_cls}' id='g{gi+1}'>")
+        rows.append(f"<h2>group #{gi+1} ({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})</h2>")
+        rows.append(f"<p class='plan'>merge {', '.join(sec)} &rarr; <b>{primary}</b></p>")
+        # member rows
+        for name in g["members"]:
+            tier = g["tiers"][name]
+            sz = g["sizes"][name]
+            tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)]
+            badge = "PRIMARY" if name == primary else "secondary"
+            rows.append(f"<div class='member'>")
+            rows.append(f"<p><span class='badge'>{badge}</span> "
+                        f"<b>{name}</b> tier={tier_label} · n={sz}</p>")
+            rows.append("<div class='thumbs'>")
+            faces_dir = ROOT / name / "faces"
+            files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET]
+            for f in files:
+                rel = make_thumb(name, f.name)
+                if rel:
+                    rows.append(f"<img src='{rel}'>")
+            rows.append("</div></div>")
+        # pairwise sims
+        rows.append("<table class='sims'><tr><th>a</th><th>b</th><th>sim</th></tr>")
+        for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]):
+            cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid"
+            rows.append(f"<tr><td>{ps['a']}</td><td>{ps['b']}</td><td class='{cls}'>{ps['sim']:.3f}</td></tr>")
+        rows.append("</table>")
+        rows.append("</div>")
+
+    nav = " · ".join(f"<a href='#g{i+1}'>#{i+1}</a>" for i in range(len(candidates["groups"])))
+
+    html = f"""<!doctype html>
+<html><head><meta charset='utf-8'><title>Faceset merge review</title>
+<style>.group{{border:1px solid #ccc;margin:1em 0;padding:.5em 1em}}.confident{{border-left:6px solid #2a2}}.uncertain{{border-left:6px solid #e80}}.hi{{color:#2a2;font-weight:bold}}.mid{{color:#e80}}img{{margin:2px}}</style></head>
+<body>
+<h1>Merge review — {len(candidates['groups'])} candidate groups
+    (edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})</h1>
+<p class='nav'>{nav}</p>
+<p>{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided
+  (skipped {candidates['n_skipped']} for too few cached embeddings).
+  Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.</p>
+ +{''.join(rows)} +""" + + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +# ----------------------------- apply ----------------------------- + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def cmd_apply(args): + candidates = json.loads(Path(args.candidates).read_text()) + master_path = ROOT / "manifest.json" + master = json.loads(master_path.read_text()) + by_name = {f["name"]: f for f in master.get("facesets", [])} + + # filter: skip "uncertain" groups unless --include-uncertain + accepted = [g for g in candidates["groups"] + if g["confidence"] == "confident" or args.include_uncertain] + skipped_unc = [g for g in candidates["groups"] + if g["confidence"] == "uncertain" and not args.include_uncertain] + # explicit --exclude / --only filters (group indices in the candidates file) + if args.only: + only = {int(s) for s in args.only.split(",")} + accepted = [g for i, g in enumerate(candidates["groups"]) if i in only] + if args.exclude: + excl = {int(s) for s in args.exclude.split(",")} + accepted = [g for i, g in enumerate(accepted) if i not in excl] + + print(f"[plan] {len(accepted)} groups will be merged " + f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr) + + if args.dry_run: + for g in accepted: + print(f" merge {g['secondaries']} -> {g['primary']} " + f"({g['confidence']}, min_sim={g['min_link']:.3f})") + return + + merged_dir = ROOT / "_merged" + merged_dir.mkdir(exist_ok=True) + new_facesets: list[dict] = [] + new_merged: list[dict] = list(master.get("merged", [])) + consumed_names: set[str] = set() + primary_updates: dict[str, dict] = {} # name -> new entry + primary_absorbed: dict[str, list[dict]] = {} # primary_name -> [secondary entries] + + for g in accepted: + primary = g["primary"] + if primary not in by_name: + print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr) + continue + primary_dir = ROOT / primary + if not primary_dir.is_dir(): + print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr) + continue + primary_faces = primary_dir / "faces" + primary_manifest_path = primary_dir / "manifest.json" + primary_manifest = json.loads(primary_manifest_path.read_text()) + + # gather all face entries: primary + each secondary + combined_faces: list[dict] = list(primary_manifest.get("faces", [])) + # adjust composite quality fall-back: ensure key exists + for f in combined_faces: + f.setdefault("origin_faceset", primary) + + for sec in g["secondaries"]: + sec_dir = ROOT / sec + if not sec_dir.is_dir(): + print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr) + continue + sec_manifest_path = sec_dir / "manifest.json" + sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []} + for f in sec_manifest.get("faces", []): + f = dict(f) + f["origin_faceset"] = sec + combined_faces.append(f) + + # rank by quality.composite descending; ties broken by lower cosd_centroid + def sort_key(f): + q = f.get("quality", {}).get("composite", 0) + d = f.get("cosd_centroid", 1.0) + return (-q, d) + combined_faces.sort(key=sort_key) + + # renumber and stage PNGs into a fresh staging dir, then atomically swap + staging = primary_dir / "_faces_new" + if staging.exists(): + shutil.rmtree(staging) + staging.mkdir() + 
new_face_entries = [] + for new_rank, f in enumerate(combined_faces, start=1): + origin = f.pop("origin_faceset") + old_png_rel = f["png"] # e.g. "faces/0042.png" + old_png_name = Path(old_png_rel).name + origin_png = ROOT / origin / "faces" / old_png_name + if not origin_png.exists(): + # could be in _dropped if occlusion-pruned; skip + continue + new_name = f"{new_rank:04d}.png" + shutil.copy2(origin_png, staging / new_name) + f = dict(f) + f["rank"] = new_rank + f["png"] = f"faces/{new_name}" + f["origin_faceset"] = origin # preserve provenance in manifest + new_face_entries.append(f) + + # swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces + old_faces_holding = primary_dir / "_faces_old" + if old_faces_holding.exists(): + shutil.rmtree(old_faces_holding) + if primary_faces.exists(): + primary_faces.rename(old_faces_holding) + staging.rename(primary_faces) + # migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible) + old_dropped = old_faces_holding / "_dropped" + if old_dropped.exists(): + (primary_faces / "_dropped").mkdir(exist_ok=True) + for x in old_dropped.iterdir(): + shutil.move(str(x), str(primary_faces / "_dropped" / x.name)) + shutil.rmtree(old_faces_holding) + + # re-zip .fsz + survivor_pngs = sorted(primary_faces.glob("*.png")) + top_n = primary_manifest.get("top_n", 30) + top_n_eff = min(top_n, len(survivor_pngs)) + # remove old .fsz files + for old in primary_dir.glob("*.fsz"): + old.unlink() + top_fsz_name = f"{primary}_top{top_n_eff}.fsz" + all_fsz_name = f"{primary}_all.fsz" + _zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name) + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, primary_dir / all_fsz_name) + all_fsz_used = all_fsz_name + else: + all_fsz_used = None + + # update primary's local manifest + primary_manifest["faces"] = new_face_entries + primary_manifest["exported"] = len(new_face_entries) + primary_manifest["fsz_top"] = top_fsz_name + primary_manifest["fsz_all"] = all_fsz_used + primary_manifest["top_n"] = top_n_eff + primary_manifest.setdefault("merge_history", []).append({ + "absorbed": g["secondaries"], + "min_link": g["min_link"], + "max_link": g["max_link"], + "confidence": g["confidence"], + }) + primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2)) + + # move secondary directories into _merged/ + absorbed_master_entries: list[dict] = [] + for sec in g["secondaries"]: + sec_dir = ROOT / sec + target = merged_dir / sec + if not sec_dir.is_dir(): + continue + if target.exists(): + shutil.rmtree(sec_dir) # already moved by previous run; clean stub + else: + shutil.move(str(sec_dir), str(target)) + sec_master = dict(by_name.get(sec, {"name": sec})) + sec_master["merged_into"] = primary + sec_master["relpath"] = f"_merged/{sec}" + sec_master["fsz_top"] = None + sec_master["fsz_all"] = None + absorbed_master_entries.append(sec_master) + consumed_names.add(sec) + + new_merged.extend(absorbed_master_entries) + + # bump primary master entry + prim_master = dict(by_name[primary]) + prim_master["exported"] = len(new_face_entries) + prim_master["top_n"] = top_n_eff + prim_master["fsz_top"] = top_fsz_name + prim_master["fsz_all"] = all_fsz_used + prim_master.setdefault("merge_history", []).append({ + "absorbed": g["secondaries"], + "min_link": g["min_link"], + "max_link": g["max_link"], + }) + primary_updates[primary] = prim_master + + print(f"[merged] {g['secondaries']} -> {primary} " + f"now {len(new_face_entries)} png", file=sys.stderr) + + # rebuild master 
facesets list + for entry in master.get("facesets", []): + nm = entry["name"] + if nm in consumed_names: + continue + if nm in primary_updates: + new_facesets.append(primary_updates[nm]) + else: + new_facesets.append(entry) + + new_master = dict(master) + new_master["facesets"] = new_facesets + new_master["merged"] = new_merged + new_master["merge_run"] = { + "thresholds": candidates["thresholds"], + "groups_applied": len(accepted), + "facesets_consumed": len(consumed_names), + "include_uncertain": bool(args.include_uncertain), + } + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(new_master, indent=2)) + tmp.replace(master_path) + print(f"[done] master manifest updated: {len(new_facesets)} active, " + f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run", + file=sys.stderr) + + +# ----------------------------- main ----------------------------- + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + a = sub.add_parser("analyze") + a.add_argument("--out", required=True) + a.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)") + a.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)") + a.set_defaults(func=cmd_analyze) + + r = sub.add_parser("report") + r.add_argument("--candidates", required=True) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + p = sub.add_parser("apply") + p.add_argument("--candidates", required=True) + p.add_argument("--include-uncertain", action="store_true", + help="apply uncertain groups too (default: confident only)") + p.add_argument("--only", default=None, help="comma-separated group indices to apply") + p.add_argument("--exclude", default=None, help="comma-separated group indices to skip") + p.add_argument("--dry-run", action="store_true") + p.set_defaults(func=cmd_apply) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/dedup_optimize.py b/work/dedup_optimize.py new file mode 100644 index 0000000..c337403 --- /dev/null +++ b/work/dedup_optimize.py @@ -0,0 +1,594 @@ +"""Corpus-wide dedup + roop-unleashed optimization. + +Two passes: + 1. Cross-family byte-identical PNG dedup (same SHA256 in two different identity + families) — keep the higher-tier family copy. Era splits of the same parent + identity (faceset_NNN_*) are intentional duplications and are NOT deduped + within their family. + 2. Within-faceset near-duplicate dedup using cached arcface embeddings + (cosine sim >= 0.95). Keep highest quality.composite, drop the rest. + +Plus a Windows-DML multi-face audit (separate phase via clip_worker-style split): + 3. Re-detect each PNG with insightface; flag any with 0 or >1 detected faces. + The roop loader appends every detected face per PNG, so multi-face crops + pollute identity averaging. + +All flagged PNGs are MOVED to /faces/_dropped/ (reversible). Affected +.fsz files are re-zipped, manifests updated. + +CLI: + analyze --out work/dedup_audit/dedup_plan.json + apply --plan ... [--dry-run] + stage_multiface --out work/dedup_audit/multiface_queue.json + merge_multiface --results --out work/dedup_audit/multiface_plan.json + apply_multiface --plan ... [--dry-run] + report --dedup ... --multiface ... 
--out work/dedup_audit +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import re +import shutil +import sys +import time +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +WIN_ROOT = r"E:\temp_things\fcswp\nl_sorted\facesets_swap_ready" +CACHES = [ + Path("/opt/face-sets/work/cache/nl_full.npz"), + Path("/opt/face-sets/work/cache/immich_peter.npz"), + Path("/opt/face-sets/work/cache/immich_nic.npz"), +] + +NEAR_DUP_THRESHOLD = 0.95 +HASH_PARALLEL = 16 + + +# ----------------------------- helpers ----------------------------- + +def faceset_tier(name: str) -> int: + m = re.match(r"^faceset_0*(\d+)(?:_.+)?$", name) + if not m: + return 99 + n = int(m.group(1)) + if 13 <= n <= 19: + return 0 + if 1 <= n <= 12: + return 1 + if 20 <= n <= 25: + return 2 + if 26 <= n <= 264: + return 3 + if 265 <= n: + return 4 + return 99 + + +def faceset_family(name: str) -> str: + """faceset_001_2010-13 → faceset_001; faceset_001 → faceset_001.""" + m = re.match(r"^(faceset_\d+)(?:_.+)?$", name) + return m.group(1) if m else name + + +def wsl_to_win(p: str) -> str: + s = str(p) + if s.startswith("/mnt/"): + return f"{s[5].upper()}:\\{s[7:].replace('/', chr(92))}" + return s + + +def iter_active_facesets() -> list[Path]: + out = [] + for d in sorted(ROOT.iterdir()): + if d.is_dir() and not d.name.startswith("_"): + out.append(d) + return out + + +def sha256_file(p: Path) -> str: + h = hashlib.sha256() + with open(p, "rb") as f: + while True: + b = f.read(1 << 20) + if not b: + break + h.update(b) + return h.hexdigest() + + +def load_caches(): + rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} + alias_map: dict[str, str] = {} + for c in CACHES: + if not c.exists(): + continue + d = np.load(c, allow_pickle=True) + emb = d["embeddings"] + meta = json.loads(str(d["meta"])) + face_records = [m for m in meta if not m.get("noface")] + if "path_aliases" in d.files: + paliases = json.loads(str(d["path_aliases"])) + for canon, alist in paliases.items(): + alias_map.setdefault(canon, canon) + for a in alist: + alias_map[a] = canon + for i, rec in enumerate(face_records): + p = rec["path"] + bbox = tuple(int(x) for x in rec["bbox"]) + v = emb[i].astype(np.float32) + n = float(np.linalg.norm(v)) + if n > 0: + v = v / n + rec_index[(p, bbox)] = v + alias_map.setdefault(p, p) + return rec_index, alias_map + + +def lookup_emb(rec_index, alias_map, src: str, bbox): + bbox_t = tuple(int(x) for x in bbox) + canon = alias_map.get(src, src) + v = rec_index.get((canon, bbox_t)) + if v is None and canon != src: + v = rec_index.get((src, bbox_t)) + return v + + +# ----------------------------- analyze ----------------------------- + +def cmd_analyze(args): + rec_index, alias_map = load_caches() + facesets = iter_active_facesets() + print(f"[scan] {len(facesets)} active facesets", file=sys.stderr) + + # Phase 1: walk every PNG, collect (faceset, file, src, bbox, quality, emb, sha256) + all_pngs = [] # list of dicts + t0 = time.time() + for fs in facesets: + manifest_path = fs / "manifest.json" + if not manifest_path.exists(): + continue + m = json.loads(manifest_path.read_text()) + for f in m.get("faces", []): + png_rel = f.get("png") + if not png_rel: + continue + disk_path = fs / png_rel + if not disk_path.exists(): + continue + all_pngs.append({ + "faceset": fs.name, + "family": faceset_family(fs.name), + "tier": faceset_tier(fs.name), + "file": 
Path(png_rel).name, + "rank": f.get("rank"), + "source": f.get("source"), + "bbox": f.get("bbox"), + "quality": f.get("quality", {}).get("composite", 0), + "disk_path": str(disk_path), + }) + print(f"[scan] {len(all_pngs)} PNGs walked in {time.time()-t0:.1f}s", file=sys.stderr) + + # Phase 2: SHA256 hash each PNG (parallel I/O) + t0 = time.time() + def _hash_one(idx): + all_pngs[idx]["sha256"] = sha256_file(Path(all_pngs[idx]["disk_path"])) + with ThreadPoolExecutor(max_workers=HASH_PARALLEL) as ex: + # exhaust the iterator to actually run + for _ in ex.map(_hash_one, range(len(all_pngs)), chunksize=16): + pass + print(f"[hash] {len(all_pngs)} PNGs hashed in {time.time()-t0:.1f}s", file=sys.stderr) + + # Phase 3: cross-family byte-dedup + by_sha: dict[str, list[int]] = {} + for i, p in enumerate(all_pngs): + by_sha.setdefault(p["sha256"], []).append(i) + + cross_family_groups = [] + byte_drops: set[int] = set() # indices of PNGs to drop + for sha, idxs in by_sha.items(): + if len(idxs) < 2: + continue + families = {all_pngs[i]["family"] for i in idxs} + if len(families) < 2: + continue # all in same family — intentional era duplication + # multiple families share this content → dedup keeping the best one + cross_family_groups.append({"sha256": sha, "members": [ + {"faceset": all_pngs[i]["faceset"], "file": all_pngs[i]["file"], + "tier": all_pngs[i]["tier"], "quality": all_pngs[i]["quality"], + "rank": all_pngs[i]["rank"]} for i in idxs + ]}) + # keeper rule: lowest tier number, then highest quality + best = sorted(idxs, key=lambda i: (all_pngs[i]["tier"], -all_pngs[i]["quality"]))[0] + for i in idxs: + # NEVER drop within-family copies (preserve era duplication intentionally) + # We only drop indices whose family != best's family + if i != best and all_pngs[i]["family"] != all_pngs[best]["family"]: + byte_drops.add(i) + print(f"[byte] {len(cross_family_groups)} cross-family hash groups; " + f"{len(byte_drops)} PNGs marked for byte-dedup drop", file=sys.stderr) + + # Phase 4: within-faceset near-dup (embedding sim >= threshold) + by_faceset: dict[str, list[int]] = {} + for i, p in enumerate(all_pngs): + by_faceset.setdefault(p["faceset"], []).append(i) + + near_dup_groups = [] + near_drops: set[int] = set() + miss_emb_total = 0 + t0 = time.time() + for fs_name, idxs in by_faceset.items(): + if len(idxs) < 2: + continue + # gather embeddings + embs = [] + kept_idxs = [] + for i in idxs: + v = lookup_emb(rec_index, alias_map, all_pngs[i]["source"], all_pngs[i]["bbox"]) + if v is None: + miss_emb_total += 1 + continue + embs.append(v) + kept_idxs.append(i) + if len(kept_idxs) < 2: + continue + M = np.stack(embs).astype(np.float32) + sim = M @ M.T + np.fill_diagonal(sim, -1) # ignore self + # find connected components in the (sim >= threshold) graph + adj = {k: set() for k in range(len(kept_idxs))} + for a in range(len(kept_idxs)): + # only check a < b to avoid double work + hi = np.where(sim[a, a+1:] >= NEAR_DUP_THRESHOLD)[0] + for off in hi: + b = a + 1 + int(off) + adj[a].add(b) + adj[b].add(a) + seen = set() + for k in adj: + if k in seen or not adj[k]: + continue + stack = [k] + comp = [] + while stack: + x = stack.pop() + if x in seen: + continue + seen.add(x) + comp.append(x) + for y in adj[x]: + if y not in seen: + stack.append(y) + if len(comp) < 2: + continue + comp_idxs = [kept_idxs[c] for c in comp] + # keeper: highest quality.composite, tie-break: lowest rank + best = sorted(comp_idxs, key=lambda i: (-all_pngs[i]["quality"], all_pngs[i]["rank"] or 9999))[0] + sims_in_group = [] + 
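# Collect every pairwise sim inside this component so the plan records the
+            # group's min/max similarity alongside the chosen keeper.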
for ci in range(len(comp)): + for cj in range(ci+1, len(comp)): + sims_in_group.append(float(sim[comp[ci], comp[cj]])) + near_dup_groups.append({ + "faceset": fs_name, + "members": [{"file": all_pngs[i]["file"], "rank": all_pngs[i]["rank"], + "quality": all_pngs[i]["quality"]} for i in comp_idxs], + "keeper": all_pngs[best]["file"], + "min_sim": min(sims_in_group) if sims_in_group else None, + "max_sim": max(sims_in_group) if sims_in_group else None, + }) + for i in comp_idxs: + if i != best: + near_drops.add(i) + print(f"[near] {len(near_dup_groups)} near-dup groups; " + f"{len(near_drops)} PNGs marked for near-dup drop " + f"(miss_emb={miss_emb_total}); {time.time()-t0:.1f}s", file=sys.stderr) + + # Combined drop set; for output, group by faceset + all_drops = byte_drops | near_drops + drops_by_faceset: dict[str, list] = {} + for i in all_drops: + p = all_pngs[i] + reason = [] + if i in byte_drops: reason.append("byte_dup") + if i in near_drops: reason.append("near_dup") + drops_by_faceset.setdefault(p["faceset"], []).append({ + "file": p["file"], "rank": p["rank"], "reason": "+".join(reason), + "sha256": p["sha256"], "quality": p["quality"], + }) + + plan = { + "thresholds": {"near_dup_sim": NEAR_DUP_THRESHOLD}, + "totals": { + "active_facesets": len(facesets), + "active_pngs": len(all_pngs), + "byte_dup_groups": len(cross_family_groups), + "byte_dup_drops": len(byte_drops), + "near_dup_groups": len(near_dup_groups), + "near_dup_drops": len(near_drops), + "all_drops": len(all_drops), + "facesets_affected": len(drops_by_faceset), + }, + "byte_dup_groups": cross_family_groups, + "near_dup_groups": near_dup_groups, + "drops_by_faceset": drops_by_faceset, + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(plan, indent=2)) + print(f"[done] plan -> {op}", file=sys.stderr) + + +# ----------------------------- apply ----------------------------- + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def _apply_drops_to_facesets(drops_by_faceset: dict[str, list], reason_label: str, master_path: Path): + """Move flagged PNGs to /faces/_dropped/, rebuild manifests + .fsz. + drops_by_faceset values are lists of {"file": str, ...}. 
+ Returns total moved + counts per faceset.""" + master = json.loads(master_path.read_text()) + by_name = {f["name"]: f for f in master.get("facesets", [])} + total_moved = 0 + per_faceset_counts = {} + + for fs_name, drops in drops_by_faceset.items(): + fs_dir = ROOT / fs_name + if not fs_dir.is_dir(): + print(f"[warn] {fs_name}: dir missing, skip", file=sys.stderr) + continue + faces_dir = fs_dir / "faces" + dropped_dir = faces_dir / "_dropped" + dropped_dir.mkdir(exist_ok=True) + drop_files = {d["file"] for d in drops} + + moved_here = 0 + for fname in sorted(drop_files): + src = faces_dir / fname + if not src.exists(): + continue + shutil.move(str(src), str(dropped_dir / fname)) + moved_here += 1 + + # rebuild manifest by filtering out dropped files + manifest_path = fs_dir / "manifest.json" + if manifest_path.exists(): + mm = json.loads(manifest_path.read_text()) + new_faces = [f for f in mm.get("faces", []) if Path(f.get("png", "")).name not in drop_files] + mm["faces"] = new_faces + mm["exported"] = len(new_faces) + mm.setdefault(f"{reason_label}_history", []).append({"dropped": moved_here}) + + # re-zip + survivor_pngs = sorted(faces_dir.glob("*.png")) + top_n = mm.get("top_n", 30) + top_n_eff = min(top_n, len(survivor_pngs)) + for old in fs_dir.glob("*.fsz"): + old.unlink() + top_fsz_name = f"{fs_name}_top{top_n_eff}.fsz" + all_fsz_name = f"{fs_name}_all.fsz" + if top_n_eff > 0: + _zip_png_list(survivor_pngs[:top_n_eff], fs_dir / top_fsz_name) + mm["fsz_top"] = top_fsz_name + mm["top_n"] = top_n_eff + else: + mm["fsz_top"] = None + mm["top_n"] = 0 + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, fs_dir / all_fsz_name) + mm["fsz_all"] = all_fsz_name + else: + mm["fsz_all"] = None + manifest_path.write_text(json.dumps(mm, indent=2)) + + if fs_name in by_name: + by_name[fs_name]["exported"] = len(new_faces) + by_name[fs_name]["fsz_top"] = mm["fsz_top"] + by_name[fs_name]["fsz_all"] = mm["fsz_all"] + by_name[fs_name]["top_n"] = mm["top_n"] + by_name[fs_name].setdefault(f"{reason_label}_dropped", 0) + by_name[fs_name][f"{reason_label}_dropped"] += moved_here + + total_moved += moved_here + per_faceset_counts[fs_name] = moved_here + + # rewrite master with same ordering + new_facesets = [by_name.get(e["name"], e) for e in master.get("facesets", [])] + master["facesets"] = new_facesets + master.setdefault(f"{reason_label}_runs", []).append({ + "facesets_affected": len(per_faceset_counts), + "pngs_moved": total_moved, + }) + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(master, indent=2)) + tmp.replace(master_path) + return total_moved, per_faceset_counts + + +def cmd_apply(args): + plan = json.loads(Path(args.plan).read_text()) + drops = plan["drops_by_faceset"] + if args.dry_run: + for fs, items in sorted(drops.items()): + reasons = {} + for it in items: + reasons[it["reason"]] = reasons.get(it["reason"], 0) + 1 + print(f" {fs}: {len(items)} dropped ({reasons})") + print(f"=== total: {sum(len(v) for v in drops.values())} PNGs across {len(drops)} facesets ===") + return + master_path = ROOT / "manifest.json" + total, _ = _apply_drops_to_facesets(drops, "dedup", master_path) + print(f"[done] {total} PNGs moved to faces/_dropped/ across {len(drops)} facesets", file=sys.stderr) + + +# ----------------------------- multiface staging + apply ----------------------------- + +def cmd_stage_multiface(args): + """Build queue.json of all currently-active PNGs in the corpus + for the Windows DML multi-face audit worker.""" + queue = [] + for fs in 
iter_active_facesets(): + faces_dir = fs / "faces" + if not faces_dir.is_dir(): + continue + for p in sorted(faces_dir.glob("*.png")): + queue.append({ + "wsl_path": str(p), + "win_path": wsl_to_win(str(p)), + "faceset": fs.name, + "file": p.name, + }) + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(queue, indent=2)) + print(f"[stage] {len(queue)} PNGs -> {op}", file=sys.stderr) + + +def cmd_merge_multiface(args): + """Convert worker results.json into a drops_by_faceset plan.""" + src = json.loads(Path(args.results).read_text()) + drops_by_faceset: dict[str, list] = {} + bad_count = 0 + for r in src.get("results", []): + n_faces = r.get("face_count", -1) + if n_faces == 1: + continue + bad_count += 1 + drops_by_faceset.setdefault(r["faceset"], []).append({ + "file": r["file"], + "reason": f"multiface_{n_faces}", + "face_count": n_faces, + }) + plan = { + "totals": {"bad_pngs": bad_count, "facesets_affected": len(drops_by_faceset), + "scored": len(src.get("results", []))}, + "drops_by_faceset": drops_by_faceset, + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(plan, indent=2)) + print(f"[merge] {bad_count} bad PNGs across {len(drops_by_faceset)} facesets -> {op}", file=sys.stderr) + + +def cmd_apply_multiface(args): + plan = json.loads(Path(args.plan).read_text()) + drops = plan["drops_by_faceset"] + if args.dry_run: + for fs, items in sorted(drops.items()): + print(f" {fs}: {len(items)} bad PNG(s)") + print(f"=== total: {sum(len(v) for v in drops.values())} ===") + return + master_path = ROOT / "manifest.json" + total, _ = _apply_drops_to_facesets(drops, "multiface", master_path) + print(f"[done] {total} PNGs moved to faces/_dropped/ across {len(drops)} facesets", file=sys.stderr) + + +# ----------------------------- report ----------------------------- + +def cmd_report(args): + out_dir = Path(args.out) + out_dir.mkdir(parents=True, exist_ok=True) + + sections = [] + if args.dedup: + d = json.loads(Path(args.dedup).read_text()) + t = d["totals"] + sections.append(f"
<h2>Dedup</h2>")
+        sections.append(
+            f"<ul>"
+            f"<li>Active facesets: {t['active_facesets']}, active PNGs: {t['active_pngs']}</li>"
+            f"<li>Cross-family byte-dup groups: {t['byte_dup_groups']} → {t['byte_dup_drops']} PNGs dropped</li>"
+            f"<li>Within-faceset near-dup groups (sim≥{d['thresholds']['near_dup_sim']}): {t['near_dup_groups']} → {t['near_dup_drops']} PNGs dropped</li>"
+            f"<li>Total dedup drops: {t['all_drops']} across {t['facesets_affected']} facesets</li>"
+            f"</ul>"
+        )
+        # top-N affected facesets
+        rows = sorted(d["drops_by_faceset"].items(), key=lambda x: -len(x[1]))[:25]
+        sections.append("<h3>Top 25 most-affected facesets</h3>"
+                        "<table><tr><th>faceset</th><th>dropped</th><th>reasons</th></tr>")
+        for fs, items in rows:
+            r = {}
+            for it in items:
+                r[it["reason"]] = r.get(it["reason"], 0) + 1
+            sections.append(f"<tr><td>{fs}</td><td>{len(items)}</td><td>{r}</td></tr>")
+        sections.append("</table>")
+
+    if args.multiface:
+        m = json.loads(Path(args.multiface).read_text())
+        t = m["totals"]
+        sections.append("<h2>Multi-face audit</h2>")
+        sections.append(
+            f"<ul>"
+            f"<li>PNGs scored: {t['scored']}</li>"
+            f"<li>Bad PNGs (0 or >1 face): {t['bad_pngs']} across {t['facesets_affected']} facesets</li>"
+            f"</ul>"
+        )
+
+    html = f"""<!doctype html>
+<html><head><meta charset='utf-8'><title>Dedup + multi-face audit</title></head>
+<body>
+<h1>facesets_swap_ready dedup + roop optimization audit</h1>
+{''.join(sections)} +""" + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +# ----------------------------- main ----------------------------- + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + a = sub.add_parser("analyze") + a.add_argument("--out", required=True) + a.set_defaults(func=cmd_analyze) + + p = sub.add_parser("apply") + p.add_argument("--plan", required=True) + p.add_argument("--dry-run", action="store_true") + p.set_defaults(func=cmd_apply) + + sm = sub.add_parser("stage_multiface") + sm.add_argument("--out", required=True) + sm.set_defaults(func=cmd_stage_multiface) + + mm = sub.add_parser("merge_multiface") + mm.add_argument("--results", required=True) + mm.add_argument("--out", required=True) + mm.set_defaults(func=cmd_merge_multiface) + + am = sub.add_parser("apply_multiface") + am.add_argument("--plan", required=True) + am.add_argument("--dry-run", action="store_true") + am.set_defaults(func=cmd_apply_multiface) + + r = sub.add_parser("report") + r.add_argument("--dedup", default=None) + r.add_argument("--multiface", default=None) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/filter_occlusions.py b/work/filter_occlusions.py new file mode 100644 index 0000000..6daa1e3 --- /dev/null +++ b/work/filter_occlusions.py @@ -0,0 +1,574 @@ +"""CLIP zero-shot scoring for masks + sunglasses across facesets_swap_ready/. + +Usage: + # score one or more specific facesets (test mode) + python work/filter_occlusions.py score --facesets faceset_001,faceset_050 \ + --out work/test_batch_occlusion/scores.json + + # score everything (full corpus) + python work/filter_occlusions.py score --out work/occlusion_scores.json + + # render HTML contact sheet from a scores.json + python work/filter_occlusions.py report --scores work/test_batch_occlusion/scores.json \ + --out work/test_batch_occlusion + +Notes: +- This script never modifies facesets_swap_ready/. An --apply step lives elsewhere + (or will be added once thresholds are validated). +- Model: open_clip ViT-L-14 / dfn2b_s39b (best public zero-shot at this size). +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from pathlib import Path +from typing import Iterable + +import torch +from PIL import Image +import open_clip + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +WIN_ROOT = r"E:\temp_things\fcswp\nl_sorted\facesets_swap_ready" + +MODEL_NAME = "ViT-L-14" +PRETRAINED = "dfn2b_s39b" + + +def wsl_to_win(wsl_path: str) -> str: + """Translate a /mnt/e/... wsl path to E:\\... for the Windows worker.""" + s = str(wsl_path) + if s.startswith("/mnt/"): + drive = s[5] + rest = s[7:].replace("/", "\\") + return f"{drive.upper()}:\\{rest}" + return s + +# Prompt ensembles. Each pair (positive, negative) becomes one binary classifier. +# We average text embeddings within each list, then softmax across the two means. 
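+# Illustrative numbers (assumed, not from a run): with logit_scale ≈ 100 and cosine
+# sims of 0.26 to the positive mean and 0.23 to the negative mean, the two-way
+# softmax gives P(pos) = 1 / (1 + exp(-100 * (0.26 - 0.23))) ≈ 0.95, i.e. small
+# cosine gaps map to near-saturated probabilities.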
+PROMPTS = { + "mask": { + "pos": [ + "a photo of a person wearing a surgical face mask", + "a photo of a person wearing an FFP2 respirator covering mouth and nose", + "a photo of a person wearing a cloth face mask", + "a face partially covered by a medical mask", + "a person whose mouth and nose are hidden by a face mask", + ], + "neg": [ + "a photo of a person's face with mouth and nose clearly visible", + "a clear, unobstructed photo of a face", + "a photo of a face without any mask or covering", + "a portrait of a person showing their full face", + "a photo of a person with a beard and visible mouth", # avoid beard false positives + ], + }, + "sunglasses": { + # We want to flag ONLY images where sunglasses occlude the eyes. + # False positives to defeat: sunglasses pushed up on the head/forehead, hanging on a shirt collar. + "pos": [ + "a face with dark sunglasses covering the eyes", + "a portrait with the eyes hidden behind opaque sunglasses", + "a person wearing dark sunglasses over their eyes, eyes not visible", + "a face where the eyes are completely concealed by tinted lenses", + "a close-up portrait wearing aviator sunglasses on the eyes", + ], + "neg": [ + "a portrait with both eyes clearly visible and uncovered", + "a face with sunglasses pushed up on the forehead, eyes visible below", + "a face with sunglasses resting on top of the head, eyes visible", + "a person with sunglasses hanging from their shirt, eyes visible", + "a face wearing clear prescription eyeglasses with visible eyes", + "a portrait with no eyewear and visible eyes", + ], + }, +} + + +def load_model(device: str = "cpu"): + print(f"[clip] loading {MODEL_NAME} / {PRETRAINED} on {device} ...", file=sys.stderr) + t0 = time.time() + model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED) + tokenizer = open_clip.get_tokenizer(MODEL_NAME) + model = model.to(device).eval() + logit_scale = float(model.logit_scale.exp().detach().cpu()) + print(f"[clip] ready in {time.time()-t0:.1f}s, logit_scale={logit_scale:.2f}", file=sys.stderr) + return model, preprocess, tokenizer, logit_scale + + +@torch.no_grad() +def build_text_features(model, tokenizer, device: str): + """Return dict {attr: (pos_mean_emb, neg_mean_emb)} on device, both L2-normalized.""" + out = {} + for attr, sides in PROMPTS.items(): + feats = {} + for side in ("pos", "neg"): + tokens = tokenizer(sides[side]).to(device) + f = model.encode_text(tokens) + f = f / f.norm(dim=-1, keepdim=True) + mean = f.mean(dim=0) + feats[side] = mean / mean.norm() + out[attr] = (feats["pos"], feats["neg"]) + return out + + +@torch.no_grad() +def score_images(model, preprocess, text_feats, logit_scale: float, paths: list[Path], device: str, batch: int = 16): + """Yield (path, {attr: pos_prob}) per image. 
logit_scale is CLIP's learned temperature (~100).""" + for i in range(0, len(paths), batch): + chunk = paths[i:i + batch] + imgs = [] + keep = [] + for p in chunk: + try: + img = Image.open(p).convert("RGB") + imgs.append(preprocess(img)) + keep.append(p) + except Exception as e: + print(f"[skip] {p}: {e}", file=sys.stderr) + if not imgs: + continue + x = torch.stack(imgs).to(device) + feats = model.encode_image(x) + feats = feats / feats.norm(dim=-1, keepdim=True) # (B, D) + results = {} + for attr, (pos, neg) in text_feats.items(): + sims = torch.stack([feats @ pos, feats @ neg], dim=1) * logit_scale # (B, 2) + probs = sims.softmax(dim=1)[:, 0].tolist() # P(pos) + results[attr] = probs + for j, p in enumerate(keep): + yield p, {attr: results[attr][j] for attr in text_feats} + + +def iter_facesets(root: Path, only: list[str] | None) -> Iterable[Path]: + if only: + for name in only: + d = root / name + if d.is_dir(): + yield d + else: + print(f"[warn] not a directory: {d}", file=sys.stderr) + return + for d in sorted(root.iterdir()): + if d.is_dir() and not d.name.startswith("_"): + yield d + + +def cmd_score(args): + device = "cuda" if torch.cuda.is_available() else "cpu" + model, preprocess, tokenizer, logit_scale = load_model(device) + text_feats = build_text_features(model, tokenizer, device) + + only = [s.strip() for s in args.facesets.split(",")] if args.facesets else None + facesets = list(iter_facesets(ROOT, only)) + if args.sample_per_faceset: + # take first N PNGs per faceset (cheap deterministic sample for test batches) + pass + + report = { + "model": f"{MODEL_NAME}/{PRETRAINED}", + "root": str(ROOT), + "prompts": PROMPTS, + "facesets": {}, + } + total_imgs = 0 + t0 = time.time() + for fs in facesets: + faces = sorted((fs / "faces").glob("*.png")) if (fs / "faces").is_dir() else sorted(fs.glob("*.png")) + if args.sample_per_faceset: + faces = faces[: args.sample_per_faceset] + if not faces: + continue + print(f"[scan] {fs.name}: {len(faces)} png", file=sys.stderr) + per_image = [] + for p, scores in score_images(model, preprocess, text_feats, logit_scale, faces, device): + per_image.append({"file": p.name, "mask": round(scores["mask"], 4), "sunglasses": round(scores["sunglasses"], 4)}) + total_imgs += 1 + report["facesets"][fs.name] = per_image + + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(report, indent=2)) + dt = time.time() - t0 + print(f"[done] {total_imgs} images, {dt:.1f}s ({total_imgs/max(dt,1e-3):.2f} img/s) -> {out}", file=sys.stderr) + + +def cmd_report(args): + """Render an HTML contact sheet from scores.json. 
Generates JPG thumbs.""" + import io + scores = json.loads(Path(args.scores).read_text()) + out_dir = Path(args.out) + thumbs_dir = out_dir / "thumbs" + thumbs_dir.mkdir(parents=True, exist_ok=True) + + THUMB = 160 + rows_html = [] + + def thumb_path(faceset: str, fname: str) -> Path: + d = thumbs_dir / faceset + d.mkdir(parents=True, exist_ok=True) + return d / (Path(fname).stem + ".jpg") + + def make_thumb(src: Path, dst: Path): + if dst.exists(): + return + try: + img = Image.open(src).convert("RGB") + img.thumbnail((THUMB, THUMB), Image.LANCZOS) + img.save(dst, "JPEG", quality=82) + except Exception as e: + print(f"[thumb-skip] {src}: {e}", file=sys.stderr) + + facesets = scores["facesets"] + for faceset, items in facesets.items(): + # sort: high score first so borderline cases group at the boundary + items_sorted = sorted(items, key=lambda x: max(x["mask"], x["sunglasses"]), reverse=True) + # faceset summary + n = len(items) + n_mask = sum(1 for x in items if x["mask"] >= 0.7) + n_sg = sum(1 for x in items if x["sunglasses"] >= 0.7) + pct_mask = (100 * n_mask / n) if n else 0 + pct_sg = (100 * n_sg / n) if n else 0 + rows_html.append(f"
<h2 id='{faceset}'>{faceset} ({n} imgs · mask≥0.7: {n_mask} ({pct_mask:.0f}%) · sunglasses≥0.7: {n_sg} ({pct_sg:.0f}%))</h2>")
+        rows_html.append("<div class='grid'>")
+        src_root = ROOT / faceset
+        faces_root = (src_root / "faces") if (src_root / "faces").is_dir() else src_root
+        for it in items_sorted:
+            src = faces_root / it["file"]
+            dst = thumb_path(faceset, it["file"])
+            make_thumb(src, dst)
+            rel = f"thumbs/{faceset}/{Path(it['file']).stem}.jpg"
+            m, s = it["mask"], it["sunglasses"]
+            cls_m = "hi" if m >= 0.7 else ("mid" if m >= 0.4 else "lo")
+            cls_s = "hi" if s >= 0.7 else ("mid" if s >= 0.4 else "lo")
+            rows_html.append(
+                f"<div class='cell'>"
+                f"<img src='{rel}'>"
+                f"<div class='scores'><span class='{cls_m}'>M {m:.2f}</span> <span class='{cls_s}'>S {s:.2f}</span></div>"
+                f"</div>"
+            )
+        rows_html.append("</div>")
+
+    nav = " · ".join(f"<a href='#{f}'>{f}</a>" for f in facesets)
+
+    html = f"""<!doctype html>
+<html><head><meta charset='utf-8'><title>Occlusion test batch</title>
+<style>.cell{{display:inline-block;margin:4px;text-align:center}}.hi{{color:#c33;font-weight:bold}}.mid{{color:#d80}}.lo{{color:#999}}</style></head>
+<body>
+<h1>Occlusion scores — {scores['model']}</h1>
+<p class='nav'>{nav}</p>
+<p>Sorted within each faceset by max(mask, sunglasses) descending.
+Color: <span class='hi'>≥0.70</span> · <span class='mid'>0.40–0.70</span> · <span class='lo'>&lt;0.40</span></p>
+ +{''.join(rows_html)} +""" + + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + """Mirror of sort_faces.py:_zip_png_list. Renames PNGs to 0000.png, 0001.png, ...""" + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def cmd_apply(args): + """Prune mask/sunglasses PNGs, quarantine occlusion-dominated facesets, + re-zip .fsz, update top-level manifest. --dry-run prints the plan only.""" + import shutil + + threshold = args.threshold + domain_pct = args.domain_pct + min_survivors = args.min_survivors + top_n_target = args.top_n + + scores = json.loads(Path(args.scores).read_text()) + master_path = ROOT / "manifest.json" + master = json.loads(master_path.read_text()) + by_name = {f["name"]: f for f in master.get("facesets", [])} + + masked_dir = ROOT / "_masked" + thin_dir = ROOT / "_thin" + + plan = [] + for faceset, items in scores["facesets"].items(): + if faceset not in by_name: + print(f"[warn] {faceset} not in master manifest — skipping", file=sys.stderr) + continue + n = len(items) + flagged_files = sorted( + it["file"] for it in items + if it["mask"] >= threshold or it["sunglasses"] >= threshold + ) + survivors_items = [it for it in items if it["file"] not in set(flagged_files)] + # preserve quality order from filename (0001.png is highest-rank) + survivors_files = sorted(it["file"] for it in survivors_items) + + n_mask = sum(1 for it in items if it["mask"] >= threshold) + n_sg = sum(1 for it in items if it["sunglasses"] >= threshold) + pct_mask = n_mask / n if n else 0 + pct_sg = n_sg / n if n else 0 + + if pct_mask >= domain_pct: + action, reason = "quarantine_masked", f"mask_pct={pct_mask:.0%}" + elif pct_sg >= domain_pct: + action, reason = "quarantine_masked", f"sunglasses_pct={pct_sg:.0%}" + elif flagged_files and len(survivors_files) < min_survivors: + # only quarantine-as-thin if pruning is the cause of the drop below threshold; + # pre-existing small facesets without occlusions are left alone + action, reason = "quarantine_thin", f"survivors={len(survivors_files)}<{min_survivors}" + elif flagged_files: + action, reason = "prune", f"drop {len(flagged_files)}" + else: + action, reason = "keep", "clean" + + plan.append({ + "faceset": faceset, "action": action, "reason": reason, + "n": n, "n_mask": n_mask, "n_sg": n_sg, + "n_dropped": len(flagged_files), "n_survivors": len(survivors_files), + "dropped_files": flagged_files, + }) + + # Summary + counts = {a: 0 for a in ("keep", "prune", "quarantine_masked", "quarantine_thin")} + for p in plan: + counts[p["action"]] += 1 + total_dropped_pngs = sum(p["n_dropped"] for p in plan if p["action"] == "prune") + total_quarantined_pngs = sum(p["n"] for p in plan if p["action"].startswith("quarantine")) + print(f"=== plan summary (threshold={threshold} domain_pct={domain_pct} min_survivors={min_survivors}) ===") + for a, c in counts.items(): + print(f" {a}: {c}") + print(f" PNGs to drop (prune): {total_dropped_pngs}") + print(f" PNGs to quarantine (whole): {total_quarantined_pngs}") + print(f" facesets in master: {len(master['facesets'])}") + print(f" facesets scored: {len(plan)}") + + # Write plan for audit + plan_path = Path(args.out_plan) + plan_path.parent.mkdir(parents=True, exist_ok=True) + plan_path.write_text(json.dumps({ + "thresholds": {"image": threshold, "domain_pct": 
domain_pct, "min_survivors": min_survivors}, + "counts": counts, + "totals": {"dropped_pngs": total_dropped_pngs, "quarantined_pngs": total_quarantined_pngs}, + "plan": plan, + }, indent=2)) + print(f" plan written to {plan_path}") + + if args.dry_run: + # pretty list of quarantines + for p in plan: + if p["action"].startswith("quarantine"): + print(f" [{p['action']:>18s}] {p['faceset']} ({p['reason']}, n={p['n']})") + return + + # ----- destructive section ----- + masked_dir.mkdir(parents=True, exist_ok=True) + thin_dir.mkdir(parents=True, exist_ok=True) + + new_facesets = [] + new_masked = list(master.get("masked", [])) # preserve any prior runs + new_thin = list(master.get("thin_eras", [])) + + # build a name -> existing-thin/masked entry index, to update relpath if we re-quarantine + by_name_thin = {e["name"]: e for e in new_thin} + by_name_masked = {e["name"]: e for e in new_masked} + + for p in plan: + entry = dict(by_name[p["faceset"]]) # copy + fs_dir = ROOT / p["faceset"] + faces_dir = fs_dir / "faces" + + if p["action"] == "keep": + new_facesets.append(entry) + continue + + # prune dropped PNGs first (applies to both prune and quarantine_thin paths) + if p["dropped_files"]: + dropped_holding = faces_dir / "_dropped" + dropped_holding.mkdir(exist_ok=True) + for fname in p["dropped_files"]: + src = faces_dir / fname + if src.exists(): + shutil.move(str(src), str(dropped_holding / fname)) + + if p["action"].startswith("quarantine"): + target_root = masked_dir if p["action"] == "quarantine_masked" else thin_dir + target = target_root / p["faceset"] + if target.exists(): + # idempotency: if a previous run already moved it, skip move + pass + else: + shutil.move(str(fs_dir), str(target)) + entry["occlusion_filter"] = { + "action": p["action"], "reason": p["reason"], + "n_input": p["n"], "n_mask": p["n_mask"], "n_sg": p["n_sg"], + "n_dropped": p["n_dropped"], "n_survivors": p["n_survivors"], + "threshold": threshold, "domain_pct": domain_pct, + } + entry["relpath"] = f"{'_masked' if p['action']=='quarantine_masked' else '_thin'}/{p['faceset']}" + entry["fsz_top"] = None + entry["fsz_all"] = None + if p["action"] == "quarantine_masked": + entry["masked"] = True + new_masked.append(entry) + else: + entry["thin"] = True + new_thin.append(entry) + continue + + # action == prune + survivor_pngs = sorted([pp for pp in faces_dir.glob("*.png")]) + if not survivor_pngs: + print(f"[warn] {p['faceset']}: no survivor PNGs after prune", file=sys.stderr) + new_facesets.append(entry) + continue + + # re-zip .fsz from survivors in quality order + top_n_eff = min(top_n_target, len(survivor_pngs)) + top_fsz = fs_dir / f"{p['faceset']}_top{top_n_eff}.fsz" + all_fsz = fs_dir / f"{p['faceset']}_all.fsz" + # remove old .fsz files (they may have different top_n in name) + for old in fs_dir.glob("*.fsz"): + old.unlink() + _zip_png_list(survivor_pngs[:top_n_eff], top_fsz) + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, all_fsz) + entry["fsz_all"] = all_fsz.name + else: + entry["fsz_all"] = None + entry["fsz_top"] = top_fsz.name + entry["top_n"] = top_n_eff + entry["exported"] = len(survivor_pngs) + entry["dropped_occlusion"] = p["n_dropped"] + entry["occlusion_filter"] = { + "action": "prune", "n_input": p["n"], "n_mask": p["n_mask"], + "n_sg": p["n_sg"], "n_dropped": p["n_dropped"], "n_survivors": p["n_survivors"], + "threshold": threshold, + } + new_facesets.append(entry) + + # write updated master manifest + new_master = dict(master) + new_master["facesets"] = new_facesets + 
new_master["masked"] = new_masked + new_master["thin_eras"] = new_thin + new_master["occlusion_filter_run"] = { + "model": scores.get("model"), + "threshold": threshold, + "domain_pct": domain_pct, + "min_survivors": min_survivors, + "counts": counts, + "totals": {"dropped_pngs": total_dropped_pngs, "quarantined_pngs": total_quarantined_pngs}, + } + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(new_master, indent=2)) + tmp.replace(master_path) + print(f"[done] master manifest updated: {len(new_facesets)} active, " + f"{len(new_masked)} masked, {len(new_thin)} thin") + + +def cmd_stage(args): + """Walk facesets_swap_ready/ and write a queue.json for the Windows clip_worker.""" + only = [s.strip() for s in args.facesets.split(",")] if args.facesets else None + queue = [] + for fs in iter_facesets(ROOT, only): + faces = sorted((fs / "faces").glob("*.png")) if (fs / "faces").is_dir() else sorted(fs.glob("*.png")) + for p in faces: + queue.append({ + "wsl_path": str(p), + "win_path": wsl_to_win(str(p)), + "faceset": fs.name, + "file": p.name, + }) + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(queue, indent=2)) + print(f"[stage] {len(queue)} png paths -> {out}", file=sys.stderr) + print(f"[stage] win queue file: {wsl_to_win(str(out))}", file=sys.stderr) + + +def cmd_merge(args): + """Ingest worker scores.json into the per-faceset shape that `report` reads.""" + src = json.loads(Path(args.scores).read_text()) + by_faceset: dict[str, list] = {} + for r in src.get("results", []): + by_faceset.setdefault(r["faceset"], []).append({ + "file": r["file"], + "mask": r["mask"], + "sunglasses": r["sunglasses"], + }) + # stable ordering: faceset by name, files by name + out_data = { + "model": src.get("model", f"{MODEL_NAME}/{PRETRAINED}"), + "root": str(ROOT), + "prompts": src.get("prompts", PROMPTS), + "facesets": {fs: sorted(items, key=lambda x: x["file"]) for fs, items in sorted(by_faceset.items())}, + } + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(out_data, indent=2)) + total = sum(len(v) for v in by_faceset.values()) + print(f"[merge] {total} scores across {len(by_faceset)} facesets -> {out}", file=sys.stderr) + + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + s = sub.add_parser("score", help="WSL CPU scoring (slow but no GPU dependency)") + s.add_argument("--facesets", default=None, help="comma-separated faceset names; default = all") + s.add_argument("--sample-per-faceset", type=int, default=0, help="cap PNGs per faceset (0 = all)") + s.add_argument("--out", required=True) + s.set_defaults(func=cmd_score) + + st = sub.add_parser("stage", help="Build queue.json for Windows clip_worker.py") + st.add_argument("--facesets", default=None, help="comma-separated faceset names; default = all") + st.add_argument("--out", required=True) + st.set_defaults(func=cmd_stage) + + m = sub.add_parser("merge", help="Convert worker scores.json into per-faceset report format") + m.add_argument("--scores", required=True, help="worker output (flat list of results)") + m.add_argument("--out", required=True, help="output path for per-faceset format") + m.set_defaults(func=cmd_merge) + + r = sub.add_parser("report", help="Render HTML contact sheet from a per-faceset scores.json") + r.add_argument("--scores", required=True) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + a = sub.add_parser("apply", help="Prune 
flagged PNGs, quarantine dominated facesets, re-zip .fsz, update manifest") + a.add_argument("--scores", required=True, help="per-faceset scores.json (output of `merge` or `score`)") + a.add_argument("--out-plan", required=True, help="path to write the apply plan json (audit)") + a.add_argument("--threshold", type=float, default=0.7, help="image-level drop threshold for mask/sunglasses (default 0.7)") + a.add_argument("--domain-pct", type=float, default=0.40, help="faceset-level quarantine threshold (default 0.40)") + a.add_argument("--min-survivors", type=int, default=5, help="quarantine to _thin if survivors below this (default 5)") + a.add_argument("--top-n", type=int, default=30, help="top-N for re-zipped _topN.fsz (default 30)") + a.add_argument("--dry-run", action="store_true", help="print plan only, no filesystem changes") + a.set_defaults(func=cmd_apply) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/multiface_worker.py b/work/multiface_worker.py new file mode 100644 index 0000000..033b75f --- /dev/null +++ b/work/multiface_worker.py @@ -0,0 +1,144 @@ +"""Windows / DirectML multi-face audit worker. + +For every PNG in queue.json, run insightface FaceAnalysis and record how many +faces were detected (filtering by det_score>=MIN_DET and face_short>=MIN_PIX). +Surfaces the load-bearing roop invariant: each .fsz PNG must hold exactly one +face, otherwise the loader's `extract_face_images` appends every detected face +into the FaceSet and pollutes the averaged identity embedding. + +CLI: + py -3.12 multiface_worker.py [--limit N] +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import numpy as np +from PIL import Image, ImageOps +from insightface.app import FaceAnalysis + +MODEL_ROOT = r"C:\face_embed_venv\models" +MIN_DET = 0.5 +MIN_FACE_PIX = 40 +FLUSH_EVERY = 200 + + +def load_existing(out_path: Path): + if not out_path.exists(): + return None, set() + try: + d = json.loads(out_path.read_text()) + processed = set(d.get("processed", [])) + return d, processed + except Exception as e: + print(f"[warn] could not parse {out_path}: {e}; starting fresh", file=sys.stderr) + return None, set() + + +def save_atomic(out_path: Path, data: dict): + tmp = out_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(data, indent=2)) + os.replace(tmp, out_path) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("queue", type=Path) + ap.add_argument("out", type=Path) + ap.add_argument("--limit", type=int, default=None) + args = ap.parse_args() + + queue = json.loads(args.queue.read_text()) + print(f"[queue] {len(queue)} entries from {args.queue}", flush=True) + args.out.parent.mkdir(parents=True, exist_ok=True) + existing, processed = load_existing(args.out) + if existing: + print(f"[resume] {len(processed)} already scored", flush=True) + results = existing.get("results", []) + else: + results = [] + pending = [e for e in queue if e["wsl_path"] not in processed] + if args.limit is not None: + pending = pending[: args.limit] + print(f"[pending] {len(pending)} entries", flush=True) + if not pending: + print("[done] nothing to do") + return + + print("[load] FaceAnalysis with DmlExecutionProvider", flush=True) + app = FaceAnalysis( + name="buffalo_l", + root=MODEL_ROOT, + providers=["DmlExecutionProvider", "CPUExecutionProvider"], + ) + app.prepare(ctx_id=0, det_size=(640, 640)) + + n_done = 0 + n_load_err = 0 + last_flush = time.time() + 
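# Loop below: each PNG is opened, EXIF-transposed, and converted RGB -> BGR
+    # (insightface expects OpenCV channel order); face_count records detections
+    # passing the det_score and min-pixel filters, and -1 marks a load error.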
t_start = time.time() + + def flush(): + save_atomic(args.out, { + "results": results, + "processed": sorted(processed), + }) + + for entry in pending: + try: + with Image.open(entry["win_path"]) as im: + im = ImageOps.exif_transpose(im) + im = im.convert("RGB") + rgb = np.array(im) + bgr = rgb[:, :, ::-1].copy() + except Exception as e: + n_load_err += 1 + results.append({ + "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"], + "face_count": -1, "error": "load", + }) + processed.add(entry["wsl_path"]) + n_done += 1 + continue + + faces = app.get(bgr) + kept = 0 + for f in faces: + if float(f.det_score) < MIN_DET: + continue + x1, y1, x2, y2 = [int(round(v)) for v in f.bbox] + short = min(max(x2 - x1, 0), max(y2 - y1, 0)) + if short < MIN_FACE_PIX: + continue + kept += 1 + + results.append({ + "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"], + "face_count": kept, + }) + processed.add(entry["wsl_path"]) + n_done += 1 + + if (n_done % FLUSH_EVERY == 0) or (time.time() - last_flush) > 30.0: + flush() + last_flush = time.time() + elapsed = time.time() - t_start + rate = n_done / max(0.1, elapsed) + eta = (len(pending) - n_done) / max(0.1, rate) / 60.0 + print(f"[scan] {n_done}/{len(pending)} rate={rate:.2f} img/s eta={eta:.1f}min " + f"load_err={n_load_err}", flush=True) + + flush() + elapsed = time.time() - t_start + print(f"[done] {n_done} scored, {n_load_err} load errors, {elapsed:.1f}s " + f"({n_done/max(0.1,elapsed):.2f} img/s) -> {args.out}", flush=True) + + +if __name__ == "__main__": + main()