From 49a43c7685b640897a4a757ec1c61cc0783ebe65 Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 27 Apr 2026 15:41:18 +0200 Subject: [PATCH] Add post-export corpus maintenance pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds four new orchestration scripts that operate on an already-built facesets_swap_ready/ to clean it up over time: - filter_occlusions.py + clip_worker.py: CLIP zero-shot mask + sunglasses filter (open_clip ViT-L-14/dfn2b_s39b). WSL stages, Windows DML scores via new C:\clip_dml_venv. Image-level threshold 0.7; faceset-level quarantine at 40% domain dominance. - consolidate_facesets.py: duplicate-identity merger using complete-linkage centroid clustering on cached arcface embeddings. Single-linkage chains catastrophically (60-faceset clusters with min sim < 0); complete-linkage guarantees within-group sim >= edge. - age_extend_001.py: slots newly-added PNGs into existing era buckets of faceset_001 using the same anchor-fragment rule as age_split_001.py (dist <= 0.40 AND |year_delta| <= 5). Anchors not re-centered. - dedup_optimize.py + multiface_worker.py: corpus-wide cleanup with three passes — cross-family SHA256 byte-dedup (preserves intra-family era duplication), within-faceset near-dup at sim >= 0.95, and a multi-face audit (the load-bearing roop invariant). Multi-face worker hits ~19 img/s on AMD Vega — ~7x embed_worker because input is 512x512 crops. Same-day corpus evolution: 311 active / 0 masked / 68 thin / 0 merged → 181 / 51 / 71 / 74; 6,440 → 3,849 active PNGs. All quarantines and prunes preserved on disk (faces/_dropped/, _masked/, _merged/, _thin/) for full reversibility. Master manifest gains masked[], merged[], plus per-run provenance blocks. Three new docs/analysis/ writeups cover model choice, threshold rationale, and per-pass run results. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 29 +- docs/analysis/clip-occlusion-filter.md | 154 +++++ docs/analysis/dedup-and-roop-optimization.md | 155 +++++ .../identity-consolidation-and-age-extend.md | 170 +++++ work/age_extend_001.py | 576 ++++++++++++++++ work/clip_worker.py | 221 ++++++ work/consolidate_facesets.py | 634 ++++++++++++++++++ work/dedup_optimize.py | 594 ++++++++++++++++ work/filter_occlusions.py | 574 ++++++++++++++++ work/multiface_worker.py | 144 ++++ 10 files changed, 3250 insertions(+), 1 deletion(-) create mode 100644 docs/analysis/clip-occlusion-filter.md create mode 100644 docs/analysis/dedup-and-roop-optimization.md create mode 100644 docs/analysis/identity-consolidation-and-age-extend.md create mode 100644 work/age_extend_001.py create mode 100644 work/clip_worker.py create mode 100644 work/consolidate_facesets.py create mode 100644 work/dedup_optimize.py create mode 100644 work/filter_occlusions.py create mode 100644 work/multiface_worker.py diff --git a/README.md b/README.md index ff2c943..2edcb2e 100644 --- a/README.md +++ b/README.md @@ -331,6 +331,27 @@ from the saved `state.json` without re-fetching what was already done. The composite quality score in `export-swap` is `0.30·frontality + 0.20·det_score + 0.20·landmark_symmetry + 0.15·face_size + 0.15·sharpness`, each normalized to `[0, 1]`. +## Post-export corpus maintenance + +The `sort_faces.py` pipeline above produces `facesets_swap_ready/`. 
Four +orchestration scripts under `work/` operate on that already-built corpus to +clean it up over time: + +| script | purpose | +|--------|---------| +| `work/filter_occlusions.py` (+ Windows `work/clip_worker.py`) | Drop PNGs of masked / sun-glassed faces using open_clip ViT-L-14/dfn2b_s39b zero-shot scoring. Image-level threshold 0.7; faceset-level quarantine at 40% domain dominance. WSL stages a queue, Windows DML scores, WSL applies. See `docs/analysis/clip-occlusion-filter.md`. | +| `work/consolidate_facesets.py` | Merge duplicate identities (centroid cosine sim ≥ 0.55 with confident ≥ 0.65, **complete-linkage** to defeat single-link chaining). Pulls embeddings from cache, no GPU. See `docs/analysis/identity-consolidation-and-age-extend.md`. | +| `work/age_extend_001.py` | Slot newly-added PNGs into existing era buckets of `faceset_001` (anchor cosine distance ≤ 0.40 AND `|year_delta|` ≤ 5). Same anchor-fragment rule as `age_split_001.py`. | +| `work/dedup_optimize.py` (+ Windows `work/multiface_worker.py`) | (a) cross-family SHA256 byte-dedup, (b) within-faceset near-dup at cosine sim ≥ 0.95, (c) multi-face audit (re-detect via insightface, drop PNGs with face_count ≠ 1). Multi-face is the load-bearing roop invariant. See `docs/analysis/dedup-and-roop-optimization.md`. | + +All four operate idempotently and reversibly: dropped PNGs go to +`/faces/_dropped/`, quarantined whole facesets go to +`facesets_swap_ready/_masked/` or `_merged/` (parallel to the existing +`_thin/`). The master `manifest.json` partitions entries across `facesets[]`, +`masked[]`, `thin_eras[]`, and `merged[]` arrays, plus per-run provenance +blocks (`occlusion_filter_run`, `merge_run`, `age_extend_runs`, `dedup_runs`, +`multiface_runs`). + ## Downstream: roop-unleashed The `.fsz` bundles emitted by `export-swap` drop straight into roop-unleashed's Face Swap tab. Each PNG inside is already a clean single-face crop — critical, because the roop-unleashed loader appends every face it re-detects in each PNG to the averaged identity embedding. @@ -350,11 +371,17 @@ Highly recommended at swap time: enable **Select post-processing = GFPGAN** with ├─ build_folders.py (hand-sorted-folder orchestration) ├─ check_faceset001_age.py (age-split readiness probe) ├─ age_split_001.py (age-split orchestration; faceset_001) + ├─ age_extend_001.py (extends existing era buckets with new PNGs) ├─ cluster_osrc.py (mixed-bucket identity discovery) ├─ immich_stage.py (Immich library staging, parallel) - ├─ embed_worker.py (Windows DML embed worker, runs from C:\face_embed_venv\) + ├─ embed_worker.py (Windows DML embed worker; C:\face_embed_venv\) ├─ cluster_immich.py (Immich identity discovery + export) ├─ finalize_immich.sh (chains queue → embed → cluster) + ├─ filter_occlusions.py (CLIP zero-shot mask + sunglasses filter) + ├─ clip_worker.py (Windows DML CLIP worker; C:\clip_dml_venv\) + ├─ consolidate_facesets.py (duplicate-identity merger; complete-linkage) + ├─ dedup_optimize.py (byte + near-dup + multi-face audit driver) + ├─ multiface_worker.py (Windows DML multi-face audit worker) ├─ synthetic_*_manifest.json (per-run synthetic refine manifests) ├─ immich/ │ ├─ users.json (label -> userId map; gitignored) diff --git a/docs/analysis/clip-occlusion-filter.md b/docs/analysis/clip-occlusion-filter.md new file mode 100644 index 0000000..a977736 --- /dev/null +++ b/docs/analysis/clip-occlusion-filter.md @@ -0,0 +1,154 @@ +# CLIP zero-shot occlusion filter (masks + sunglasses) + +_Run date: 2026-04-27. 
Driver scripts: `work/filter_occlusions.py`, `work/clip_worker.py`._ + +## 1. Why + +`facesets_swap_ready/` ended the Immich import day with 311 substantive +facesets and a long tail of identities whose clusters had latched onto +*eyewear or mask appearance* instead of identity (covid-era shots, vacation +photos with sunglasses dominating the frame). Two failure modes: + +1. **Pollution of averaged identity** — roop's `FaceSet.AverageEmbeddings()` + averages every face in the .fsz. A faceset where 40 % of images are + sunglassed gives a biased centroid; the swap reproduces sunglass-shaped + eye sockets. +2. **Whole-cluster identity drift** — clustering at the embedding level + sometimes anchors on the eyewear silhouette rather than the face, + producing clusters of "the same sunglasses across multiple people". + +A targeted attribute scorer was the cleanest fix. + +## 2. Model + prompts + +**Model**: `open_clip` `ViT-L-14` / `dfn2b_s39b` (Apple Data Filtering Networks). +Best public zero-shot at this size. Loads weights from HF Hub (~890 MB). +Bit-identical scores between WSL CPU and Windows DML. + +**Prompt design**: per-attribute ensembles of 5–6 positive + 5–6 negative +prompts. Positive ensembles are mean-pooled and L2-normalized before softmax. + +**Critical bug if forgotten**: CLIP cosine similarities are tiny (0.2–0.3 +range). Raw `softmax([sim_pos, sim_neg])` collapses to ~0.5/0.5 on every +image. **Multiply by `model.logit_scale.exp()` (~100) before softmax.** +Without that scale the entire scorer outputs a uniform 0.5. + +**Sunglasses prompt pitfall**: the first set caught faces with sunglasses +*pushed up on the forehead* with the same probability as faces with +sunglasses *covering the eyes* — CLIP detects "presence of sunglasses in +frame", not "eyes occluded". Fixed by putting the false positive into the +*negative* class explicitly: + +``` +positive: "a face with dark sunglasses covering the eyes" + "a portrait with the eyes hidden behind opaque sunglasses" + ... +negative: "a face with sunglasses pushed up on the forehead, eyes visible below" + "a face with sunglasses resting on top of the head, eyes visible" + "a face wearing clear prescription eyeglasses with visible eyes" + ... +``` + +Validation pair (faceset_005): sunglasses-on-eyes → 0.91, sunglasses-on-forehead +→ 0.39. Threshold 0.7 cleanly separates. + +## 3. Architecture + +``` + ┌─────────────────────────────────────────────┐ + │ WSL /opt/face-sets/work/filter_occlusions.py │ + │ • stage: walk facesets/, write queue.json │ + │ • merge: ingest worker results │ + │ • report: HTML contact sheet │ + │ • apply: prune + quarantine + re-zip │ + └────────────┬────────────────────────────────┘ + │ queue.json (paths) via \\wsl.localhost\ + ▼ + ┌─────────────────────────────────────────────┐ + │ Windows C:\clip_dml_venv\ │ + │ /opt/face-sets/work/clip_worker.py │ + │ Python 3.12 + torch 2.4.1 CPU │ + │ + torch-directml 0.2.5 + open_clip_torch │ + │ Reads PNGs from native E:\, writes scores │ + └─────────────────────────────────────────────┘ +``` + +A separate Windows venv (not the existing `C:\face_embed_venv\`) is needed +because `torch-directml` brings ~1.5 GB of wheels and version-pinned +numpy/pillow that risk breaking the embed_worker venv's +`onnxruntime-directml` + `insightface` stack. + +## 4. 
DML throughput surprise + +Measured on AMD Radeon RX Vega: + +| input | model | throughput | speedup vs WSL CPU | +|------|-------|-----------:|-------------------:| +| ViT-L-14 (CLIP, this filter) | open_clip | **1.43 img/s** | **2.4×** | +| buffalo_l (insightface, embed_worker) | onnxruntime | 2.6 img/s | 7.5× | + +Only 2.4× because `aten::_native_multi_head_attention` is not implemented in +the directml plugin and falls back to CPU. The vision encoder runs on GPU, +attention runs on CPU per layer, both alternating. A silenced UserWarning +makes this near-invisible. Workable for a one-shot 73-min corpus run, but +the embed_worker pattern (pure ONNX) remains the gold standard for DML. + +## 5. Thresholds (validated 2026-04-27 on 6,318 PNGs) + +| level | threshold | semantics | +|-------|----------:|-----------| +| image | P(positive) ≥ 0.7 | drop the PNG | +| faceset | ≥ 40 % of images flagged for either attr | quarantine whole faceset to `_masked/` | +| min-survivors | < 5 surviving AND something pruned | quarantine to `_thin/` | + +The `AND something pruned` guard is essential — without it, naturally-small +facesets (hand-sorted with ≤4 PNGs) get incorrectly quarantined for being +small even when they have zero occlusions. + +## 6. Run results + +| action | count | net effect | +|--------|------:|------------| +| keep | 209 | unchanged | +| prune | 46 | 183 PNGs dropped within survivors | +| quarantine_masked | 51 | whole faceset → `_masked/` (11 mask-driven, 40 sunglasses-driven) | +| quarantine_thin | 3 | survivors < 5 → `_thin/` | + +Net: 311 active → 255 active after the filter run. 763 PNGs quarantined +whole-faceset, 183 pruned within survivors. All dropped PNGs preserved at +`/faces/_dropped/` for reversibility. Master manifest gained a +`masked[]` array parallel to `thin_eras[]`, plus an `occlusion_filter_run` +provenance block. + +## 7. Known limitations + +- **Per-faceset manifests are NOT updated by `apply`** — only the master + manifest is. Each faceset's own `/manifest.json` retains stale + `faces[]` entries pointing at PNGs that moved into `_dropped/`. Harmless + for `.fsz` consumers (the .fsz is re-zipped from current disk state) but + downstream tools reading `faces[]` will see broken references. Discovered + later by `age_extend_001.py`'s rebuild loop, which generated 42 missing-PNG + warnings before being caught. + +## 8. Re-running + +```bash +# 1. Stage queue from current corpus state +python work/filter_occlusions.py stage --out work/clip_dml/queue.json + +# 2. Score on Windows DML (resumable) +"/mnt/c/clip_dml_venv/Scripts/python.exe" work/clip_worker.py \ + work/clip_dml/queue.json work/clip_dml/scores.json --batch 8 + +# 3. Reshape into per-faceset format, then HTML for visual approval +python work/filter_occlusions.py merge \ + --scores work/clip_dml/scores.json --out work/occlusion_scores.json +python work/filter_occlusions.py report \ + --scores work/occlusion_scores.json --out work/occlusion_review + +# 4. 
Apply (always dry-run first) +python work/filter_occlusions.py apply \ + --scores work/occlusion_scores.json --out-plan work/occlusion_apply_plan.json --dry-run +python work/filter_occlusions.py apply \ + --scores work/occlusion_scores.json --out-plan work/occlusion_apply_plan.json +``` diff --git a/docs/analysis/dedup-and-roop-optimization.md b/docs/analysis/dedup-and-roop-optimization.md new file mode 100644 index 0000000..a6b4373 --- /dev/null +++ b/docs/analysis/dedup-and-roop-optimization.md @@ -0,0 +1,155 @@ +# Corpus dedup + roop-unleashed optimization + +_Run date: 2026-04-27. Driver scripts: `work/dedup_optimize.py`, `work/multiface_worker.py`._ + +After consolidation collapsed duplicate identities and age-extend slotted +new PNGs into era buckets, the corpus still carried artifacts that hurt +roop's averaged-embedding quality: + +- **Burst-photo near-duplicates** within facesets, especially in + immich-discovered identities where source libraries had many similar + shots within seconds. +- **Cross-faceset byte-identical PNGs** that escaped consolidation's + centroid-similarity matching when individual PNGs matched exactly but + cluster centroids diverged. +- **Multi-face PNGs** that polluted identity averaging because the roop + loader appends every detected face per PNG to the FaceSet (load-bearing + invariant — see § 2). + +This pipeline runs three independent passes and an optional fourth, all +moving dropped PNGs to `/faces/_dropped/` for reversibility. + +## 1. Cross-family byte-dedup + +SHA256-hash every PNG in the active corpus (parallel I/O via +`ThreadPoolExecutor(max_workers=16)`, ~17 s for 5,386 PNGs over the +`/mnt/e/` Windows mount). Group by hash; for groups with members in +multiple identity families, keep the higher-tier copy. + +**Family detection**: regex `^(faceset_\d+)(?:_.+)?$` — captures the parent +identity. Same family includes parent + era splits (e.g. `faceset_001` + +`faceset_001_2010-13`); these are intentional duplications for the era +.fsz files and are preserved. + +Run results: 20 cross-family hash groups → 24 PNGs dropped. Most cases were +small immich identity-cluster errors that consolidation missed because +individual PNG embeddings matched but the cluster mean did not. + +## 2. Within-faceset near-dup at sim ≥ 0.95 + +Per-faceset pairwise cosine similarity on cached arcface embeddings. +Connected components in the `sim ≥ 0.95` graph. Keep highest +`quality.composite` per component, drop the rest. + +**Threshold rationale**: legitimate same-person-different-pose pairs land at +0.5–0.85; ≥ 0.95 means essentially the same shot (burst frames or +recompressed dupes). Roop's `FaceSet.AverageEmbeddings()` averages all faces +into `faces[0].embedding`; near-identical embeddings averaged ≈ averaging +once. Removing them does not lose identity information; it removes a bias +weight on the most-photographed moments. + +Run results: 851 groups → **1,225 PNGs dropped** (23 % of corpus). +Most-affected: `faceset_026` (-132 of 262), `faceset_027` (-107), +`faceset_028` (-92), `faceset_030` (-92). All immich-discovered identities +where the source library had burst sequences. + +## 3. Multi-face audit (load-bearing roop invariant) + +The roop loader at `roop/ui/tabs/faceswap_tab.py:661–691` runs +`extract_face_images(filename, (False, 0))` on every PNG and **appends every +detected face** to `face_set.faces`. A multi-face PNG therefore pollutes the +averaged identity. 
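+A minimal sketch of the effect (illustrative vectors only, not roop's code: just mean
+pooling of unit-normalized 512-d embeddings the way the loader averages appended faces):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def unit(v):
+    return v / np.linalg.norm(v)
+
+identity = unit(rng.normal(size=512))            # the target person
+bystander = unit(rng.normal(size=512))           # extra face hiding in a multi-face PNG
+crops = [unit(identity + 0.1 * rng.normal(size=512)) for _ in range(3)]
+
+clean = unit(np.mean(crops, axis=0))
+polluted = unit(np.mean(crops + [bystander], axis=0))
+
+print(round(float(clean @ identity), 3))     # ~0.995: centroid stays on-identity
+print(round(float(polluted @ identity), 3))  # ~0.95: one stray face already drags it off
+```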
The export-swap pipeline drops multi-face crops at +creation, but post-pipeline operations (consolidation, age-extend) move +PNGs across facesets without re-checking. + +**This audit re-detects every PNG** with insightface FaceAnalysis and flags +any with `face_count ≠ 1` (filtered by `det_score ≥ 0.5` and +`face_short ≥ 40`). Includes: +- ≥ 2 faces → loader will inject extra identities into averaging +- 0 faces → insightface can't find a face on the cropped PNG; useless for + roop, would silently fail + +Run results: 4,146 PNGs scored, 332 flagged (272 with 2 faces, 9 with 3, +2 with 4, **49 with 0**). 82 facesets affected. + +## 4. DML throughput jump for face crops + +The audit reuses the same insightface + onnxruntime-directml stack as +`embed_worker.py` but achieves **~19 img/s** on AMD Vega vs embed_worker's +2.6 img/s — same model, same hardware. The difference is input size: + +| stage | typical input | DML throughput | +|-------|--------------|---------------:| +| `embed_worker.py` (Immich import) | 1024–4000 px source | 2.6 img/s | +| `multiface_worker.py` (this audit) | 512×512 face crops | **19 img/s** | + +Detection on small inputs is fast; recognition on aligned 112×112 inputs is +the same cost either way. Implication: **any pipeline operating on +already-cropped face PNGs can rely on a roughly 7× higher DML throughput +ceiling than full-resolution embedding**. + +## 5. Architecture + +``` + ┌────────────────────────────────────────────┐ + │ WSL /opt/face-sets/work/dedup_optimize.py │ + │ • analyze: hashes + within-faceset sim │ + │ • apply: move + re-zip (no GPU) │ + │ • stage_multiface: write queue.json │ + │ • merge_multiface: ingest worker results │ + │ • apply_multiface: move + re-zip │ + │ • report: HTML audit │ + └────────────┬───────────────────────────────┘ + │ queue.json via \\wsl.localhost\ + ▼ + ┌────────────────────────────────────────────┐ + │ Windows C:\face_embed_venv\ │ + │ /opt/face-sets/work/multiface_worker.py │ + │ insightface FaceAnalysis on DmlExecutionProvider │ + │ Reads PNGs from native E:\, writes face_count │ + └────────────────────────────────────────────┘ +``` + +Reuses the existing `C:\face_embed_venv\` (no new venv needed — same +insightface stack as `embed_worker.py`). + +## 6. Final corpus state (2026-04-27 night) + +| metric | start of day | after occlusion filter | after consolidation | after age-extend | after this dedup + multiface | +|--------|-------------:|----------------------:|-------------------:|-----------------:|----------------------------:| +| active facesets | 311 | 255 | 181 | 181 | **181** | +| active PNGs | ~6,440 | 5,386 | 5,386 | 5,400 | **3,849** | +| `_masked/` | 0 | 51 | 51 | 51 | 51 | +| `_thin/` | 68 | 71 | 71 | 71 | 71 | +| `_merged/` | 0 | 0 | 74 | 74 | 74 | + +Net reduction at the end of the day: **2,591 PNGs and 130 facesets** removed +or quarantined from the active pool. All preserved on disk for +reversibility (`/faces/_dropped/` for prunes, `_masked/_merged/_thin/` +for quarantines). + +## 7. Re-running + +Run after any new import / consolidation / extend: + +```bash +# 1. Byte-dedup + within-faceset near-dup (CPU only) +python work/dedup_optimize.py analyze --out work/dedup_audit/dedup_plan.json +python work/dedup_optimize.py apply --plan work/dedup_audit/dedup_plan.json + +# 2. 
Multi-face audit on Windows DML (resumable) +python work/dedup_optimize.py stage_multiface --out work/dedup_audit/multiface_queue.json +"/mnt/c/face_embed_venv/Scripts/python.exe" work/multiface_worker.py \ + work/dedup_audit/multiface_queue.json work/dedup_audit/multiface_results.json +python work/dedup_optimize.py merge_multiface \ + --results work/dedup_audit/multiface_results.json \ + --out work/dedup_audit/multiface_plan.json +python work/dedup_optimize.py apply_multiface \ + --plan work/dedup_audit/multiface_plan.json + +# 3. HTML audit +python work/dedup_optimize.py report \ + --dedup work/dedup_audit/dedup_plan.json \ + --multiface work/dedup_audit/multiface_plan.json \ + --out work/dedup_audit +``` diff --git a/docs/analysis/identity-consolidation-and-age-extend.md b/docs/analysis/identity-consolidation-and-age-extend.md new file mode 100644 index 0000000..936975e --- /dev/null +++ b/docs/analysis/identity-consolidation-and-age-extend.md @@ -0,0 +1,170 @@ +# Identity consolidation + age-bucket extension + +_Run date: 2026-04-27. Driver scripts: `work/consolidate_facesets.py`, `work/age_extend_001.py`._ + +After the Immich peter + nic imports added 280 new facesets to a corpus that +had ~25 canonical identities, many "new" identities were duplicates of +existing household members at lower clustering confidence. Two cooperating +passes clean this up: identity consolidation merges duplicates, then +age-extend slots newly-merged PNGs into the existing era buckets of +`faceset_001`. + +## 1. Identity consolidation + +### 1.1 Approach + +For each active faceset, pull cached arcface embeddings from +`work/cache/{nl_full,immich_peter,immich_nic}.npz` keyed by +`(source, bbox)` from the per-faceset manifest's `faces[]`. Compute +L2-normalized centroid. Pairwise cosine similarity matrix. + +**Tier-based primary selection** (lowest tier number wins, size breaks ties): + +| tier | sources | rationale | +|-----:|---------|-----------| +| 0 | `faceset_013..019` (hand-sorted) | user's curated labels | +| 1 | `faceset_001..012` (auto-clustered) | well-established household | +| 2 | `faceset_020..025` (osrc) | mixed-bucket discovery | +| 3 | `faceset_026..264` (immich peter) | speculative | +| 4 | `faceset_265+` (immich nic) | speculative | + +**Era splits and quarantines excluded** — `faceset_NNN_`, `_masked/`, +`_thin/` are skipped during analysis. + +### 1.2 Single-linkage chains catastrophically — complete-linkage required + +First attempt used connected-components on edge ≥ 0.45 → produced a +**60-faceset cluster** around `faceset_001` with min within-group sim of +**−0.16** (definitely-different people bridged via chains +`A↔B↔C` where `A`, `C` are not similar). Bumping to edge ≥ 0.55 still +chained (group of 17 with min 0.20). + +Real fix: `scipy.cluster.hierarchy.linkage(method='complete')` then +`fcluster(Z, t=1-edge_threshold, criterion='distance')`. Complete-linkage +**guarantees** every within-group pair sim ≥ edge threshold. Without this +guarantee the report is unusable and the apply step would produce +identity-poisoned merges. + +### 1.3 Thresholds + run results + +`edge=0.55`, `confident=0.65` → 48 multi-faceset groups (29 confident, 19 +uncertain). Max group size 7, all bilateral or small triplets after +complete-linkage. + +After applying all 48 (with `--include-uncertain` after visual approval): + +- **74 facesets consumed** (some groups had multiple secondaries: + `[10, 45, 135] → faceset_002`; `[113, 96, 178, 109, 110, 286] → faceset_095`; + etc.) 
+- Active count 255 → 181 +- Notable absorptions: `faceset_001` (peter) 707 → 753 PNGs (+ 7, 132, 151); + `faceset_002` 209 → 247; `faceset_026` 60 → 262 (+ 168, 146, 325); + `faceset_028` → 207 +- Master manifest gained `merged[]` array (parallel to `thin_eras[]`); each + entry has `merged_into` field pointing at the primary + +### 1.4 Apply mechanics + +Combine all PNGs from primary + secondaries, re-rank by existing +`quality.composite` desc (no re-enrich), renumber `0001..NNNN`, copy into a +fresh staging dir, atomic swap. Move secondary directories to +`_merged//` (preserved in full for reversibility). Re-zip +`_topN.fsz` and `_all.fsz`. + +The primary's existing per-PNG quality scores are reused — re-ranking does +not require re-running `enrich`-equivalent landmarks/pose on the cropped +PNGs. The primary's `_dropped/` (from prior occlusion filter) is preserved +through the merge. + +## 2. Age extension of faceset_001 era buckets + +### 2.1 Why a follow-on pass + +Consolidation absorbed faceset_007/132/151 into faceset_001 (+46 PNGs). +The original `age_split_001.py` had bucketed peter into 6 era anchors +(`_2005-10`, `_2010-13`, `_2011`, `_2014-17`, `_2018-19`, `_2018-20`), but +those new PNGs had never been seen by age_split. They sat in faceset_001's +parent-only set, missing from every era .fsz. + +### 2.2 Era-label pitfall + +The 6 anchor era labels are NOT strict year ranges. They are +`Counter(years).most_common(1)`-derived dom-years from the original sub-cluster: + +| label | dom_year | actual span of members | +|-------|---------:|-----------------------:| +| `_2005-10` | 2010 | 2005–2010 | +| `_2010-13` | 2011 | **2007–2024** | +| `_2011` | 2011 | 2011 only | +| `_2014-17` | 2016 | 2005–2018 | +| `_2018-19` | 2018 | 2012–2020 | +| `_2018-20` | 2019 | 2014–2022 | + +The clusters are *appearance-anchored*, not year-bounded. Year is a +descriptive label. Assignment rule must use dom-year, not member span. + +### 2.3 Algorithm + +For each unbucketed face entry in `faceset_001`'s manifest (50 of 753): + +1. Look up embedding in cache by `(source, bbox)`. +2. Look up EXIF year via `work/cache/age_split_exif.json`; fetch on cache miss. +3. Find single nearest era anchor by cosine distance to its centroid. +4. Accept iff `dist ≤ 0.40` AND `|year − anchor.dom_year| ≤ 5`. + These thresholds match `age_split_001.py`'s anchor-fragment rule. +5. Anchors are NOT re-centered after absorption (preserves age_split's + drift-prevention guarantee). + +### 2.4 Run results + +50 unbucketed → 21 with EXIF year → **14 accepted**: + +| anchor | dom_year | added | +|--------|---------:|------:| +| `_2005-10` | 2010 | +2 | +| `_2010-13` | 2011 | +1 | +| `_2014-17` | 2016 | **+9** | +| `_2018-20` | 2019 | +2 | + +29 PNGs skipped for missing EXIF year (mostly immich-stripped +photos). 7 dist/year-rejected (e.g. two PNGs from 2025 want +`_2018-19` but year-delta 7 > 5). + +### 2.5 Reconciliation side effect + +The apply rebuilds each affected era bucket's `faces/` from staging. This +incidentally reconciled the per-bucket manifests with disk after the prior +occlusion filter run had left era manifests stale at 282/126/132 entries vs +~248/125/129 actual files (occlusion filter only updates the master +manifest, never per-faceset manifests — see +`docs/analysis/clip-occlusion-filter.md` §7). 42 occlusion-dropped era PNGs +inside the old `faces/_dropped/` were removed during rebuild. 
The +parent `faceset_001/faces/_dropped/` still has the corpus-level audit; all +source images are intact at `/mnt/x/src/`, so the era-level dropped PNGs +are regeneratable via `cmd_export_swap`. + +## 3. Re-running + +Always run both passes after any new identity import (Immich, osrc, +hand-sorted folder): + +```bash +# 1. Find duplicate identities +python work/consolidate_facesets.py analyze \ + --out work/merge_review/candidates.json [--edge 0.55 --confident 0.65] +python work/consolidate_facesets.py report \ + --candidates work/merge_review/candidates.json --out work/merge_review +# inspect work/merge_review/index.html +python work/consolidate_facesets.py apply \ + --candidates work/merge_review/candidates.json [--include-uncertain] + +# 2. Slot new faceset_001 PNGs into existing era buckets +python work/age_extend_001.py analyze --out work/age_extend/candidates.json +python work/age_extend_001.py report \ + --candidates work/age_extend/candidates.json --out work/age_extend +python work/age_extend_001.py apply --candidates work/age_extend/candidates.json +``` + +Both are idempotent. `consolidate_facesets` skips secondaries already in +`_merged/`; `age_extend_001` recomputes anchor centroids + dom-year fresh +on every run. diff --git a/work/age_extend_001.py b/work/age_extend_001.py new file mode 100644 index 0000000..d611d9e --- /dev/null +++ b/work/age_extend_001.py @@ -0,0 +1,576 @@ +"""Extend the existing 6 era buckets of faceset_001 by absorbing PNGs that +post-date the original age_split run (from consolidation merges, etc.). + +Mirrors the anchor-fragment assignment logic in age_split_001.py: + - For each unbucketed face in faceset_001's manifest, find the nearest active + era anchor by cosine distance to the anchor's centroid. + - Accept the assignment iff dist <= 0.40 AND |year_delta| <= 5 + (where year_delta = exif_year(face) - dom_year(anchor)). + - Undated PNGs are skipped (no assignment). + - Anchors are NOT re-centered after absorption (preserves the same drift + guarantees as the original age_split). + +CLI: + python work/age_extend_001.py analyze --out work/age_extend/candidates.json + python work/age_extend_001.py report --candidates ... --out work/age_extend + python work/age_extend_001.py apply --candidates ... 
[--dry-run] +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +import time +from collections import Counter +from pathlib import Path + +import numpy as np +from PIL import Image, ExifTags + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +PARENT = "faceset_001" +ACTIVE_ERAS = [ + "faceset_001_2005-10", + "faceset_001_2010-13", + "faceset_001_2011", + "faceset_001_2014-17", + "faceset_001_2018-19", + "faceset_001_2018-20", +] +CACHES = [ + Path("/opt/face-sets/work/cache/nl_full.npz"), + Path("/opt/face-sets/work/cache/immich_peter.npz"), + Path("/opt/face-sets/work/cache/immich_nic.npz"), +] +EXIF_CACHE = Path("/opt/face-sets/work/cache/age_split_exif.json") + +# anchor-fragment thresholds (mirror age_split_001.py) +DIST_MAX = 0.40 +YEAR_MAX = 5 + + +# ----------------------------- caches ----------------------------- + +def load_caches(): + rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} + alias_map: dict[str, str] = {} + for c in CACHES: + if not c.exists(): + print(f"[warn] cache missing: {c}", file=sys.stderr) + continue + d = np.load(c, allow_pickle=True) + emb = d["embeddings"] + meta = json.loads(str(d["meta"])) + face_records = [m for m in meta if not m.get("noface")] + if len(face_records) != len(emb): + raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}") + if "path_aliases" in d.files: + paliases = json.loads(str(d["path_aliases"])) + for canon, alist in paliases.items(): + alias_map.setdefault(canon, canon) + for a in alist: + alias_map[a] = canon + for i, rec in enumerate(face_records): + p = rec["path"] + bbox = tuple(int(x) for x in rec["bbox"]) + v = emb[i].astype(np.float32) + n = float(np.linalg.norm(v)) + if n > 0: + v = v / n + rec_index[(p, bbox)] = v + alias_map.setdefault(p, p) + print(f"[cache] indexed {len(rec_index)} face records, {len(alias_map)} aliases", file=sys.stderr) + return rec_index, alias_map + + +def lookup_emb(rec_index, alias_map, src: str, bbox): + bbox_t = tuple(int(x) for x in bbox) + canon = alias_map.get(src, src) + v = rec_index.get((canon, bbox_t)) + if v is None and canon != src: + v = rec_index.get((src, bbox_t)) + return v + + +# ----------------------------- exif ----------------------------- + +def load_exif_cache(): + if not EXIF_CACHE.exists(): + return {} + return json.loads(EXIF_CACHE.read_text()) + + +def save_exif_cache(cache): + tmp = EXIF_CACHE.with_suffix(".tmp.json") + tmp.write_text(json.dumps(cache, indent=2)) + tmp.replace(EXIF_CACHE) + + +def exif_year(path: Path) -> int | None: + try: + with Image.open(path) as im: + ex = im._getexif() + if not ex: + return None + for tag_id, val in ex.items(): + tag = ExifTags.TAGS.get(tag_id, tag_id) + if tag == "DateTimeOriginal" and isinstance(val, str) and len(val) >= 4: + return int(val[:4]) + except Exception: + return None + return None + + +def get_year(src: str, exif_cache) -> int | None: + """Return EXIF year for src, using cache. 
Mutates cache for new lookups.""" + if src in exif_cache: + return exif_cache[src] + p = Path(src) + y = exif_year(p) if p.exists() else None + exif_cache[src] = y + return y + + +# ----------------------------- analyze ----------------------------- + +def cmd_analyze(args): + rec_index, alias_map = load_caches() + exif_cache = load_exif_cache() + exif_cache_dirty = False + + parent_dir = ROOT / PARENT + parent_manifest = json.loads((parent_dir / "manifest.json").read_text()) + parent_faces = parent_manifest.get("faces", []) + print(f"[parent] {PARENT}: {len(parent_faces)} face entries", file=sys.stderr) + + # Build "in_bucket" set + each anchor's centroid + dom_year + anchors = [] + in_bucket: set[tuple[str, tuple[int, int, int, int]]] = set() + for era in ACTIVE_ERAS: + ed = ROOT / era + if not ed.is_dir(): + print(f"[warn] missing era bucket: {era}", file=sys.stderr) + continue + em = json.loads((ed / "manifest.json").read_text()) + emb_list = [] + years = [] + n_missing_emb = 0 + for f in em.get("faces", []): + src = f.get("source") + bbox = f.get("bbox") + if not src or not bbox: + continue + key = (alias_map.get(src, src), tuple(int(x) for x in bbox)) + in_bucket.add(key) + in_bucket.add((src, tuple(int(x) for x in bbox))) # cover both alias and raw + v = lookup_emb(rec_index, alias_map, src, bbox) + if v is None: + n_missing_emb += 1 + else: + emb_list.append(v) + y = get_year(src, exif_cache) + if y is None: + exif_cache_dirty = True + else: + years.append(y) + if src not in exif_cache: + exif_cache_dirty = True + if not emb_list: + print(f"[warn] {era}: no embeddings found, skipping anchor", file=sys.stderr) + continue + arr = np.stack(emb_list).astype(np.float32) + c = arr.mean(axis=0) + n = float(np.linalg.norm(c)) + if n > 0: + c = c / n + dom_year = Counter(years).most_common(1)[0][0] if years else None + anchors.append({ + "name": era, "centroid": c, "n_faces": len(em.get("faces", [])), + "n_emb_used": len(emb_list), "n_emb_missing": n_missing_emb, + "dom_year": dom_year, + "year_min": min(years) if years else None, + "year_max": max(years) if years else None, + }) + print(f"[anchor] {era}: n={len(em.get('faces', []))} emb_used={len(emb_list)} " + f"emb_miss={n_missing_emb} dom_year={dom_year} years=[{min(years) if years else '-'}..{max(years) if years else '-'}]", + file=sys.stderr) + + # Find unbucketed faces in parent + unbucketed = [] + for f in parent_faces: + src = f.get("source") + bbox = f.get("bbox") + if not src or not bbox: + continue + bbox_t = tuple(int(x) for x in bbox) + key1 = (alias_map.get(src, src), bbox_t) + key2 = (src, bbox_t) + if key1 in in_bucket or key2 in in_bucket: + continue + unbucketed.append(f) + print(f"[parent] {len(unbucketed)} unbucketed face entries (in {PARENT} but no era bucket)", file=sys.stderr) + + # Score each unbucketed face against every anchor + proposals = [] + skipped_no_emb = 0 + skipped_no_year = 0 + for f in unbucketed: + src = f["source"] + bbox = f["bbox"] + v = lookup_emb(rec_index, alias_map, src, bbox) + if v is None: + skipped_no_emb += 1 + continue + y = get_year(src, exif_cache) + if y is None: + skipped_no_year += 1 + exif_cache_dirty = True + continue + if src not in exif_cache: + exif_cache_dirty = True + # nearest anchor + best = None # (dist, idx) + for i, a in enumerate(anchors): + d = 1.0 - float(np.dot(a["centroid"], v)) + if best is None or d < best[0]: + best = (d, i) + if best is None: + continue + dist, bidx = best + anchor = anchors[bidx] + year_delta = abs(y - anchor["dom_year"]) if anchor["dom_year"] is 
not None else None + accept = (dist <= DIST_MAX and year_delta is not None and year_delta <= YEAR_MAX) + proposals.append({ + "png": f["png"], + "source": src, + "bbox": [int(x) for x in bbox], + "year": y, + "rank_in_parent": f.get("rank"), + "quality_composite": f.get("quality", {}).get("composite"), + "quality": f.get("quality", {}), + "best_anchor": anchor["name"], + "best_anchor_dom_year": anchor["dom_year"], + "centroid_dist": round(dist, 4), + "year_delta": year_delta, + "accept": bool(accept), + "all_anchor_dists": { + a["name"]: round(1.0 - float(np.dot(a["centroid"], v)), 4) for a in anchors + }, + }) + + if exif_cache_dirty: + save_exif_cache(exif_cache) + print(f"[exif] cache flushed ({len(exif_cache)} entries total)", file=sys.stderr) + + # Summarize + accepted = [p for p in proposals if p["accept"]] + rejected = [p for p in proposals if not p["accept"]] + by_anchor = Counter(p["best_anchor"] for p in accepted) + print(f"[summary] unbucketed={len(unbucketed)} scored={len(proposals)} " + f"accepted={len(accepted)} rejected={len(rejected)} " + f"skipped(no_emb={skipped_no_emb}, no_year={skipped_no_year})", file=sys.stderr) + for k, v in by_anchor.most_common(): + print(f" {k}: +{v}", file=sys.stderr) + + out = { + "thresholds": {"dist_max": DIST_MAX, "year_max": YEAR_MAX}, + "anchors": [ + {k: v for k, v in a.items() if k != "centroid"} + for a in anchors + ], + "n_unbucketed": len(unbucketed), + "skipped": {"no_emb": skipped_no_emb, "no_year": skipped_no_year}, + "proposals": sorted(proposals, key=lambda p: (not p["accept"], p["best_anchor"], -1 * (p["quality_composite"] or 0))), + "by_anchor": dict(by_anchor), + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(out, indent=2)) + print(f"[done] {len(proposals)} proposals -> {op}", file=sys.stderr) + + +# ----------------------------- report ----------------------------- + +def cmd_report(args): + cand = json.loads(Path(args.candidates).read_text()) + out_dir = Path(args.out) + thumbs_dir = out_dir / "thumbs" + thumbs_dir.mkdir(parents=True, exist_ok=True) + THUMB = 140 + + def make_thumb(png_relpath: str) -> str: + # png_relpath looks like "faces/0042.png" + src = ROOT / PARENT / png_relpath + name = Path(png_relpath).stem + dst = thumbs_dir / f"{name}.jpg" + if not dst.exists(): + try: + img = Image.open(src).convert("RGB") + img.thumbnail((THUMB, THUMB), Image.LANCZOS) + img.save(dst, "JPEG", quality=82) + except Exception as e: + print(f"[thumb-skip] {src}: {e}", file=sys.stderr) + return "" + return f"thumbs/{name}.jpg" + + # group accepted proposals by target anchor + by_anchor: dict[str, list] = {} + rejected = [] + for p in cand["proposals"]: + if p["accept"]: + by_anchor.setdefault(p["best_anchor"], []).append(p) + else: + rejected.append(p) + + rows = [] + rows.append("

<h1>faceset_001 age extension — review</h1>")
+    rows.append(f"<p class='sub'>{cand['n_unbucketed']} unbucketed faces in {PARENT}; "
+                f"{sum(len(v) for v in by_anchor.values())} accepted / {len(rejected)} rejected; "
+                f"thresholds dist≤{cand['thresholds']['dist_max']} AND |year_delta|≤{cand['thresholds']['year_max']}.</p>")
+    nav = " · ".join(f"{a} (+{len(by_anchor[a])})" for a in by_anchor) + " · rejected"
+    rows.append(f"<p class='nav'>{nav}</p>")
+
+    for anchor_name in ACTIVE_ERAS:
+        if anchor_name not in by_anchor:
+            continue
+        items = by_anchor[anchor_name]
+        anchor_meta = next((a for a in cand["anchors"] if a["name"] == anchor_name), {})
+        rows.append(f"<div class='group' id='{anchor_name}'>")
+        rows.append(f"<h2>{anchor_name} (dom_year={anchor_meta.get('dom_year')}; "
+                    f"existing n={anchor_meta.get('n_faces')}; +{len(items)} new)</h2>")
+        rows.append("<div class='grid'>")
+        for p in sorted(items, key=lambda x: (x["centroid_dist"], -1 * (x["quality_composite"] or 0))):
+            thumb = make_thumb(p["png"])
+            cls = "hi" if p["centroid_dist"] <= 0.30 else "mid"
+            rows.append(
+                f"<div class='cell {cls}'>"
+                f"<img src='{thumb}'>"
+                f"<div class='cap'>{p['png']}<br>year {p['year']} (Δ{p['year_delta']})<br>"
+                f"dist {p['centroid_dist']:.3f}</div>"
+                f"</div>"
+            )
+        rows.append("</div></div>")
+
+    if rejected:
+        rows.append("<div class='group'>")
+        rows.append(f"<h2>rejected ({len(rejected)} faces don't fit any anchor)</h2>")
+        rows.append("<div class='grid'>")
+        for p in sorted(rejected, key=lambda x: x["centroid_dist"])[:200]:
+            thumb = make_thumb(p["png"])
+            why = []
+            if p["centroid_dist"] > cand['thresholds']['dist_max']:
+                why.append(f"dist {p['centroid_dist']:.2f}>{cand['thresholds']['dist_max']}")
+            if p["year_delta"] is None or p["year_delta"] > cand['thresholds']['year_max']:
+                why.append(f"yΔ{p['year_delta']}>{cand['thresholds']['year_max']}")
+            rows.append(
+                f"<div class='cell'>"
+                f"<img src='{thumb}'>"
+                f"<div class='cap'>{p['png']}<br>year {p['year']} → best {p['best_anchor']}<br>"
+                f"{'; '.join(why)}</div>"
+                f"</div>"
+            )
+        if len(rejected) > 200:
+            rows.append(f"<p class='sub'>...{len(rejected)-200} more truncated.</p>")
+        rows.append("</div></div>
") + + html = f""" +faceset_001 age extension + + +{''.join(rows)} +""" + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +# ----------------------------- apply ----------------------------- + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def cmd_apply(args): + cand = json.loads(Path(args.candidates).read_text()) + accepted = [p for p in cand["proposals"] if p["accept"]] + if args.dry_run: + from collections import Counter as C + by = C(p["best_anchor"] for p in accepted) + print(f"=== dry-run: {len(accepted)} assignments across {len(by)} anchors ===") + for k, v in by.most_common(): + print(f" {k}: +{v}") + return + + parent_dir = ROOT / PARENT + master_path = ROOT / "manifest.json" + master = json.loads(master_path.read_text()) + facesets_by_name = {f["name"]: f for f in master.get("facesets", [])} + + by_anchor: dict[str, list] = {} + for p in accepted: + by_anchor.setdefault(p["best_anchor"], []).append(p) + + total_added = 0 + for anchor_name, props in by_anchor.items(): + ed = ROOT / anchor_name + em_path = ed / "manifest.json" + em = json.loads(em_path.read_text()) + existing = list(em.get("faces", [])) + + # gather new entries with their source PNG paths in faceset_001/faces/ + new_with_src = [] + for p in props: + src_png = parent_dir / p["png"] + if not src_png.exists(): + print(f"[warn] missing parent PNG {src_png}; skip", file=sys.stderr) + continue + face_entry = { + "source": p["source"], + "bbox": p["bbox"], + "quality": p["quality"], + "exif_year": p["year"], + "centroid_dist_at_assign": p["centroid_dist"], + "year_delta_at_assign": p["year_delta"], + "extended_from_parent": True, + } + new_with_src.append((face_entry, src_png)) + + # combine; rank by quality.composite desc (existing entries already have rank, + # but we re-rank globally so new entries slot in by quality) + combined: list[tuple[dict, Path | None]] = [] + for f in existing: + combined.append((f, None)) + combined.extend(new_with_src) + combined.sort(key=lambda x: -x[0].get("quality", {}).get("composite", 0)) + + # stage fresh + staging = ed / "_faces_new" + if staging.exists(): + shutil.rmtree(staging) + staging.mkdir() + new_face_entries = [] + for new_rank, (face, src_png_or_none) in enumerate(combined, start=1): + new_name = f"{new_rank:04d}.png" + if src_png_or_none is None: + # existing entry: copy from current era bucket faces/ + old_name = Path(face["png"]).name + src = ed / "faces" / old_name + if not src.exists(): + print(f"[warn] {anchor_name}: missing existing PNG {src}; skip", file=sys.stderr) + continue + shutil.copy2(src, staging / new_name) + else: + shutil.copy2(src_png_or_none, staging / new_name) + face = dict(face) + face["rank"] = new_rank + face["png"] = f"faces/{new_name}" + new_face_entries.append(face) + + # swap dirs + old_holding = ed / "_faces_old" + if old_holding.exists(): + shutil.rmtree(old_holding) + (ed / "faces").rename(old_holding) + staging.rename(ed / "faces") + shutil.rmtree(old_holding) + + # re-zip .fsz + survivor_pngs = sorted((ed / "faces").glob("*.png")) + top_n = em.get("top_n", 30) + top_n_eff = min(top_n, len(survivor_pngs)) + for old in ed.glob("*.fsz"): + old.unlink() + top_fsz_name = f"{anchor_name}_top{top_n_eff}.fsz" + all_fsz_name = f"{anchor_name}_all.fsz" + 
_zip_png_list(survivor_pngs[:top_n_eff], ed / top_fsz_name) + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, ed / all_fsz_name) + all_fsz_used = all_fsz_name + else: + all_fsz_used = None + + # update local + master manifests + em["faces"] = new_face_entries + em["exported"] = len(new_face_entries) + em["fsz_top"] = top_fsz_name + em["fsz_all"] = all_fsz_used + em["top_n"] = top_n_eff + em.setdefault("age_extend_history", []).append({ + "added": len(new_with_src), + "thresholds": cand["thresholds"], + }) + em_path.write_text(json.dumps(em, indent=2)) + + if anchor_name in facesets_by_name: + facesets_by_name[anchor_name]["exported"] = len(new_face_entries) + facesets_by_name[anchor_name]["fsz_top"] = top_fsz_name + facesets_by_name[anchor_name]["fsz_all"] = all_fsz_used + facesets_by_name[anchor_name]["top_n"] = top_n_eff + + added_here = len(new_with_src) + total_added += added_here + print(f"[applied] {anchor_name}: +{added_here} (now {len(new_face_entries)} faces)", file=sys.stderr) + + # rewrite master with ordering preserved + new_facesets = [] + for entry in master.get("facesets", []): + new_facesets.append(facesets_by_name.get(entry["name"], entry)) + master["facesets"] = new_facesets + master.setdefault("age_extend_runs", []).append({ + "parent": PARENT, + "thresholds": cand["thresholds"], + "anchors": list(by_anchor.keys()), + "added_total": total_added, + }) + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(master, indent=2)) + tmp.replace(master_path) + print(f"[done] +{total_added} faces across {len(by_anchor)} anchors", file=sys.stderr) + + +# ----------------------------- main ----------------------------- + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + a = sub.add_parser("analyze") + a.add_argument("--out", required=True) + a.set_defaults(func=cmd_analyze) + + r = sub.add_parser("report") + r.add_argument("--candidates", required=True) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + p = sub.add_parser("apply") + p.add_argument("--candidates", required=True) + p.add_argument("--dry-run", action="store_true") + p.set_defaults(func=cmd_apply) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/clip_worker.py b/work/clip_worker.py new file mode 100644 index 0000000..9d28536 --- /dev/null +++ b/work/clip_worker.py @@ -0,0 +1,221 @@ +"""Windows / DirectML CLIP worker for occlusion scoring. + +Reads a queue.json staged by /opt/face-sets/work/filter_occlusions.py (WSL side), +runs open_clip ViT-L-14 (dfn2b_s39b) on each PNG via torch-directml on the AMD +Vega, and writes a scores.json with mask + sunglasses softmax probabilities. 
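+Similarities are multiplied by model.logit_scale.exp() (~100) before the softmax;
+without that scale the raw CLIP cosines (~0.2–0.3) collapse every score to ~0.5.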
+ +CLI: + py -3.12 clip_worker.py [--limit N] [--batch 8] + +queue.json shape: list of objects + {"wsl_path": "...", "win_path": "E:\\...\\faceset_NNN\\faces\\NNNN.png", + "faceset": "faceset_NNN", "file": "NNNN.png"} + +scores.json shape: + {"model": "ViT-L-14/dfn2b_s39b", + "logit_scale": 100.0, + "prompts": {...}, + "results": [{"wsl_path": "...", "faceset": "...", "file": "...", + "mask": float, "sunglasses": float}], + "processed": [wsl_path, ...]} +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +import warnings +from pathlib import Path + +# DML emits a verbose UserWarning per attention call -- silence at import time +warnings.filterwarnings("ignore", category=UserWarning) + +import torch +import torch_directml +import open_clip +from PIL import Image + +MODEL_NAME = "ViT-L-14" +PRETRAINED = "dfn2b_s39b" + +# kept in sync with /opt/face-sets/work/filter_occlusions.py PROMPTS +PROMPTS = { + "mask": { + "pos": [ + "a photo of a person wearing a surgical face mask", + "a photo of a person wearing an FFP2 respirator covering mouth and nose", + "a photo of a person wearing a cloth face mask", + "a face partially covered by a medical mask", + "a person whose mouth and nose are hidden by a face mask", + ], + "neg": [ + "a photo of a person's face with mouth and nose clearly visible", + "a clear, unobstructed photo of a face", + "a photo of a face without any mask or covering", + "a portrait of a person showing their full face", + "a photo of a person with a beard and visible mouth", + ], + }, + "sunglasses": { + "pos": [ + "a face with dark sunglasses covering the eyes", + "a portrait with the eyes hidden behind opaque sunglasses", + "a person wearing dark sunglasses over their eyes, eyes not visible", + "a face where the eyes are completely concealed by tinted lenses", + "a close-up portrait wearing aviator sunglasses on the eyes", + ], + "neg": [ + "a portrait with both eyes clearly visible and uncovered", + "a face with sunglasses pushed up on the forehead, eyes visible below", + "a face with sunglasses resting on top of the head, eyes visible", + "a person with sunglasses hanging from their shirt, eyes visible", + "a face wearing clear prescription eyeglasses with visible eyes", + "a portrait with no eyewear and visible eyes", + ], + }, +} + +FLUSH_EVERY = 100 + + +def load_existing(out_path: Path): + if not out_path.exists(): + return None, set() + try: + d = json.loads(out_path.read_text()) + processed = set(d.get("processed", [])) + return d, processed + except Exception as e: + print(f"[warn] could not parse existing {out_path}: {e}; starting fresh", file=sys.stderr) + return None, set() + + +def save_atomic(out_path: Path, data: dict): + tmp = out_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(data, indent=2)) + os.replace(tmp, out_path) + + +@torch.no_grad() +def build_text_features(model, tokenizer, device): + out = {} + for attr, sides in PROMPTS.items(): + feats = {} + for side in ("pos", "neg"): + tokens = tokenizer(sides[side]).to(device) + f = model.encode_text(tokens) + f = f / f.norm(dim=-1, keepdim=True) + mean = f.mean(dim=0) + feats[side] = mean / mean.norm() + out[attr] = (feats["pos"], feats["neg"]) + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("queue", type=Path) + ap.add_argument("out", type=Path) + ap.add_argument("--limit", type=int, default=None) + ap.add_argument("--batch", type=int, default=8) + args = ap.parse_args() + + queue = 
json.loads(args.queue.read_text()) + print(f"[queue] {len(queue)} entries from {args.queue}") + + args.out.parent.mkdir(parents=True, exist_ok=True) + existing, processed = load_existing(args.out) + if existing: + print(f"[resume] {len(processed)} entries already scored") + results = existing.get("results", []) + else: + results = [] + + pending = [e for e in queue if e["wsl_path"] not in processed] + if args.limit is not None: + pending = pending[: args.limit] + print(f"[pending] {len(pending)} entries to score") + + if not pending: + print("[done] nothing to do") + return + + device = torch_directml.device() + print(f"[load] {MODEL_NAME}/{PRETRAINED} on {torch_directml.device_name(0)}") + t0 = time.time() + model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED) + tokenizer = open_clip.get_tokenizer(MODEL_NAME) + model = model.to(device).eval() + logit_scale = float(model.logit_scale.exp().detach().cpu()) + print(f"[load] ready in {time.time()-t0:.1f}s logit_scale={logit_scale:.2f}") + text_feats = build_text_features(model, tokenizer, device) + + def flush(): + save_atomic(args.out, { + "model": f"{MODEL_NAME}/{PRETRAINED}", + "logit_scale": logit_scale, + "prompts": PROMPTS, + "results": results, + "processed": sorted(processed), + }) + + n_done_this_run = 0 + n_load_err = 0 + last_flush = time.time() + t_start = time.time() + + for i in range(0, len(pending), args.batch): + chunk = pending[i:i + args.batch] + imgs = [] + keep = [] + for entry in chunk: + try: + img = Image.open(entry["win_path"]).convert("RGB") + imgs.append(preprocess(img)) + keep.append(entry) + except Exception as e: + print(f"[skip] {entry['win_path']}: {e}", file=sys.stderr) + n_load_err += 1 + processed.add(entry["wsl_path"]) + if not imgs: + continue + x = torch.stack(imgs).to(device) + with torch.no_grad(): + feats = model.encode_image(x) + feats = feats / feats.norm(dim=-1, keepdim=True) + scores_per_attr = {} + for attr, (pos, neg) in text_feats.items(): + sims = torch.stack([feats @ pos, feats @ neg], dim=1) * logit_scale + probs = sims.softmax(dim=1)[:, 0].detach().cpu().tolist() + scores_per_attr[attr] = probs + for j, entry in enumerate(keep): + results.append({ + "wsl_path": entry["wsl_path"], + "faceset": entry["faceset"], + "file": entry["file"], + "mask": round(scores_per_attr["mask"][j], 4), + "sunglasses": round(scores_per_attr["sunglasses"][j], 4), + }) + processed.add(entry["wsl_path"]) + n_done_this_run += 1 + + if (n_done_this_run % FLUSH_EVERY < args.batch) or (time.time() - last_flush) > 30.0: + flush() + last_flush = time.time() + elapsed = time.time() - t_start + rate = n_done_this_run / max(0.1, elapsed) + eta_min = (len(pending) - n_done_this_run) / max(0.1, rate) / 60.0 + print(f"[score] {n_done_this_run}/{len(pending)} " + f"rate={rate:.2f} img/s eta={eta_min:.1f}min " + f"load_err={n_load_err}", flush=True) + + flush() + elapsed = time.time() - t_start + print(f"[done] {n_done_this_run} scored, {n_load_err} load errors, " + f"{elapsed:.1f}s ({n_done_this_run/max(0.1,elapsed):.2f} img/s) -> {args.out}") + + +if __name__ == "__main__": + main() diff --git a/work/consolidate_facesets.py b/work/consolidate_facesets.py new file mode 100644 index 0000000..c476cfa --- /dev/null +++ b/work/consolidate_facesets.py @@ -0,0 +1,634 @@ +"""Consolidate facesets_swap_ready/ — find duplicate identities and merge. + +Pipeline: + 1. analyze: pull arcface embeddings from work/cache/*.npz for every PNG in every + active faceset (skipping _masked, _thin, era splits). 
Compute L2-normalized + centroid per faceset. Build similarity graph at sim>=0.45, extract components. + Pick primary per component by tier (hand-sorted > auto > osrc > immich) + size. + 2. report: HTML contact sheet at work/merge_review/index.html grouped by + candidate cluster, with top-3 thumbs per faceset, all pairwise sims, and + "merge X,Y -> Z" plan. Confident edges (sim>=0.65) are highlighted. + 3. apply: combine PNGs of secondaries into primary, re-rank by quality.composite + descending, renumber 0001..NNNN, re-zip _topN.fsz + _all.fsz, move secondaries + to facesets_swap_ready/_merged//, update master manifest with + `merged[]` array + `merge_run` provenance block. + +Embeddings come from caches (no GPU re-embed needed); the original clusterer used +exactly these vectors so they are the right yardstick. Era splits are excluded +entirely (intentional time-period segmentation, not a duplication). +""" + +from __future__ import annotations + +import argparse +import json +import re +import shutil +import sys +import time +from pathlib import Path + +import numpy as np +from PIL import Image +from scipy.cluster.hierarchy import linkage, fcluster +from scipy.spatial.distance import squareform + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +CACHES = [ + Path("/opt/face-sets/work/cache/nl_full.npz"), + Path("/opt/face-sets/work/cache/immich_peter.npz"), + Path("/opt/face-sets/work/cache/immich_nic.npz"), +] + +ERA_SPLIT_RE = re.compile(r"^faceset_\d+_(?:\d{4}-\d{2,4}|\d{4}|undated)$") + + +# ----------------------------- helpers ----------------------------- + +def load_caches(): + """Return (rec_index, alias_map). rec_index keyed by (path, bbox_tuple) + -> embedding (np.float32, shape (512,) L2-normalized). + alias_map maps every alias path -> canonical path.""" + rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} + alias_map: dict[str, str] = {} + n_total = 0 + for c in CACHES: + if not c.exists(): + print(f"[warn] cache missing: {c}", file=sys.stderr) + continue + d = np.load(c, allow_pickle=True) + emb = d["embeddings"] + meta = json.loads(str(d["meta"])) + face_records = [m for m in meta if not m.get("noface")] + if len(face_records) != len(emb): + raise SystemExit(f"meta/emb mismatch in {c}: {len(face_records)} vs {len(emb)}") + # path_aliases may be present + if "path_aliases" in d.files: + paliases = json.loads(str(d["path_aliases"])) + for canon, alist in paliases.items(): + alias_map.setdefault(canon, canon) + for a in alist: + alias_map[a] = canon + for i, rec in enumerate(face_records): + p = rec["path"] + bbox = tuple(int(x) for x in rec["bbox"]) + v = emb[i].astype(np.float32) + n = float(np.linalg.norm(v)) + if n > 0: + v = v / n + rec_index[(p, bbox)] = v + alias_map.setdefault(p, p) + print(f"[cache] {c.name}: +{len(face_records)} face records (running total {len(rec_index)})", file=sys.stderr) + n_total += len(face_records) + print(f"[cache] indexed {n_total} face records, {len(alias_map)} path aliases", file=sys.stderr) + return rec_index, alias_map + + +def faceset_tier(name: str) -> int: + """Lower number = higher priority for primary selection.""" + m = re.match(r"^faceset_0*(\d+)$", name) + if not m: + return 99 # unknown structure + n = int(m.group(1)) + if 13 <= n <= 19: + return 0 # hand-sorted + if 1 <= n <= 12: + return 1 # auto-clustered + if 20 <= n <= 25: + return 2 # osrc + if 26 <= n <= 264: + return 3 # immich peter + if 265 <= n: + return 4 # immich nic and beyond + return 99 + + +def is_era_split(name: 
str) -> bool: + return bool(ERA_SPLIT_RE.match(name)) + + +def faceset_centroid(faceset_dir: Path, rec_index, alias_map): + """Return (centroid, n_used, n_missing) where centroid is L2-normalized mean + of embeddings of the faces listed in the per-faceset manifest. Falls back to + None if too few embeddings found.""" + manifest = faceset_dir / "manifest.json" + if not manifest.exists(): + return None, 0, 0 + m = json.loads(manifest.read_text()) + vecs = [] + n_missing = 0 + for f in m.get("faces", []): + src = f.get("source") + bbox = f.get("bbox") + if src is None or bbox is None: + n_missing += 1 + continue + bbox_t = tuple(int(x) for x in bbox) + canon = alias_map.get(src, src) + v = rec_index.get((canon, bbox_t)) + if v is None and canon != src: + v = rec_index.get((src, bbox_t)) + if v is None: + n_missing += 1 + continue + vecs.append(v) + if len(vecs) < 3: + return None, len(vecs), n_missing + arr = np.stack(vecs).astype(np.float32) + c = arr.mean(axis=0) + n = float(np.linalg.norm(c)) + if n > 0: + c = c / n + return c, len(vecs), n_missing + + +def connected_components(adj: dict[int, set[int]]) -> list[list[int]]: + seen: set[int] = set() + comps = [] + for node in adj: + if node in seen: + continue + stack = [node] + comp = [] + while stack: + x = stack.pop() + if x in seen: + continue + seen.add(x) + comp.append(x) + for y in adj.get(x, set()): + if y not in seen: + stack.append(y) + comps.append(sorted(comp)) + return comps + + +# ----------------------------- analyze ----------------------------- + +def cmd_analyze(args): + rec_index, alias_map = load_caches() + + # collect active facesets + active = [] + for d in sorted(ROOT.iterdir()): + if not d.is_dir() or d.name.startswith("_"): + continue + if is_era_split(d.name): + continue + active.append(d) + print(f"[scan] {len(active)} active facesets (era splits + _masked + _thin excluded)", file=sys.stderr) + + centroids: dict[str, np.ndarray] = {} + sizes: dict[str, int] = {} + skipped = [] + t0 = time.time() + for fs in active: + c, n_used, n_miss = faceset_centroid(fs, rec_index, alias_map) + if c is None: + skipped.append((fs.name, n_used, n_miss)) + continue + centroids[fs.name] = c + sizes[fs.name] = n_used + print(f"[centroid] {len(centroids)} facesets centroided in {time.time()-t0:.1f}s; " + f"{len(skipped)} skipped (too few embeddings)", file=sys.stderr) + if skipped: + for n, u, m in skipped[:10]: + print(f" skip {n}: used={u} missing={m}", file=sys.stderr) + if len(skipped) > 10: + print(f" ... +{len(skipped)-10} more", file=sys.stderr) + + names = sorted(centroids.keys()) + if not names: + raise SystemExit("no centroids built") + + # similarity matrix + M = np.stack([centroids[n] for n in names]).astype(np.float32) # (N, 512), normalized + sim = M @ M.T # (N, N) cosine since unit-normalized + np.clip(sim, -1.0, 1.0, out=sim) + + edge_thr = args.edge + confident_thr = args.confident + + # complete-linkage agglomerative clustering on cosine distance. + # Cut at edge threshold: groups are guaranteed to have ALL pairs sim >= edge_thr. + # This avoids the chaining problem of single-link / connected-components. 
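+    # Illustrative example (made-up numbers, not from a real run): with centroid
+    # sims A·B = 0.52, B·C = 0.49, A·C = 0.12 and edge_thr = 0.45, single-link /
+    # connected-components would chain A-B-C into one group through B even though
+    # A and C clearly differ; the complete-link cut below only keeps a group when
+    # EVERY pair clears edge_thr, so this example splits into {A, B} and {C}.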
+ n = len(names) + dist = 1.0 - sim + np.fill_diagonal(dist, 0.0) + # symmetrize numerical noise + dist = (dist + dist.T) / 2.0 + np.clip(dist, 0.0, 2.0, out=dist) + cond = squareform(dist, checks=False) + Z = linkage(cond, method="complete") + cut_dist = 1.0 - edge_thr # complete-link distance corresponds to (1 - min sim) + labels = fcluster(Z, t=cut_dist, criterion="distance") # 1-indexed cluster ids + + cluster_members: dict[int, list[int]] = {} + for idx, lbl in enumerate(labels): + cluster_members.setdefault(int(lbl), []).append(idx) + comps = [sorted(idxs) for idxs in cluster_members.values() if len(idxs) > 1] + + n_pairs_in_groups = 0 + for c in comps: + n_pairs_in_groups += len(c) * (len(c) - 1) // 2 + print(f"[graph] complete-linkage cut at sim>={edge_thr}: {len(comps)} multi-faceset groups " + f"({n_pairs_in_groups} within-group pairs)", file=sys.stderr) + + # pick primary per group: lowest tier number, then largest size + groups_out = [] + for comp in comps: + members = [names[i] for i in comp] + members_sorted = sorted(members, key=lambda x: (faceset_tier(x), -sizes.get(x, 0), x)) + primary = members_sorted[0] + secondaries = members_sorted[1:] + # gather pairwise sims within group + pair_sims = [] + idx_of = {names[i]: i for i in comp} + for a in members: + for b in members: + if a >= b: + continue + pair_sims.append({"a": a, "b": b, "sim": round(float(sim[idx_of[a], idx_of[b]]), 4)}) + # confidence: minimum within-group sim (the weakest link) + min_link = min(p["sim"] for p in pair_sims) + max_link = max(p["sim"] for p in pair_sims) + confidence = "confident" if min_link >= confident_thr else "uncertain" + groups_out.append({ + "primary": primary, + "secondaries": secondaries, + "members": members_sorted, + "tiers": {n: faceset_tier(n) for n in members}, + "sizes": {n: sizes.get(n, 0) for n in members}, + "pair_sims": pair_sims, + "min_link": round(min_link, 4), + "max_link": round(max_link, 4), + "confidence": confidence, + }) + # sort: confident first, then by max_link desc + groups_out.sort(key=lambda g: (0 if g["confidence"] == "confident" else 1, -g["max_link"])) + + out = { + "thresholds": {"edge": edge_thr, "confident": confident_thr}, + "n_active": len(active), + "n_centroided": len(centroids), + "n_skipped": len(skipped), + "skipped_reasons": [{"name": n, "used": u, "missing": m} for n, u, m in skipped], + "n_groups": len(groups_out), + "n_facesets_in_groups": sum(len(g["members"]) for g in groups_out), + "groups": groups_out, + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(out, indent=2)) + confident = sum(1 for g in groups_out if g["confidence"] == "confident") + uncertain = sum(1 for g in groups_out if g["confidence"] == "uncertain") + print(f"[done] {len(groups_out)} groups ({confident} confident, {uncertain} uncertain) -> {op}", file=sys.stderr) + + +# ----------------------------- report ----------------------------- + +def cmd_report(args): + candidates = json.loads(Path(args.candidates).read_text()) + out_dir = Path(args.out) + thumbs_dir = out_dir / "thumbs" + thumbs_dir.mkdir(parents=True, exist_ok=True) + + THUMB = 140 + THUMBS_PER_FACESET = 4 + + def make_thumb(faceset: str, fname: str) -> str: + d = thumbs_dir / faceset + d.mkdir(parents=True, exist_ok=True) + dst = d / (Path(fname).stem + ".jpg") + if not dst.exists(): + try: + src = ROOT / faceset / "faces" / fname + img = Image.open(src).convert("RGB") + img.thumbnail((THUMB, THUMB), Image.LANCZOS) + img.save(dst, "JPEG", quality=82) + except 
Exception as e:
+                print(f"[thumb-skip] {faceset}/{fname}: {e}", file=sys.stderr)
+                return ""
+        return f"thumbs/{faceset}/{Path(fname).stem}.jpg"
+
+    rows = []
+    for gi, g in enumerate(candidates["groups"]):
+        primary = g["primary"]
+        sec = g["secondaries"]
+        conf_cls = "confident" if g["confidence"] == "confident" else "uncertain"
+        rows.append(f"<div class='group {conf_cls}' id='g{gi+1}'>")
+        rows.append(f"<h2>group #{gi+1} ({g['confidence']}; min_sim={g['min_link']:.3f}, max_sim={g['max_link']:.3f})</h2>")
+        rows.append(f"<p class='plan'>merge {', '.join(sec)} &rarr; <b>{primary}</b></p>")
+        # member rows
+        for name in g["members"]:
+            tier = g["tiers"][name]
+            sz = g["sizes"][name]
+            tier_label = ["hand-sorted", "auto", "osrc", "immich-peter", "immich-nic", "?"][min(tier, 5)]
+            badge = "PRIMARY" if name == primary else "secondary"
+            rows.append(f"<div class='member'>")
+            rows.append(f"<p><span class='badge'>{badge}</span> "
+                        f"<b>{name}</b> tier={tier_label} · n={sz}</p>")
+            rows.append("<div class='thumbs'>")
+            faces_dir = ROOT / name / "faces"
+            files = sorted(faces_dir.glob("*.png"))[:THUMBS_PER_FACESET]
+            for f in files:
+                rel = make_thumb(name, f.name)
+                if rel:
+                    rows.append(f"<img src='{rel}'>")
+            rows.append("</div></div>")
+        # pairwise sims
+        rows.append("<table class='sims'><tr><th>a</th><th>b</th><th>sim</th></tr>")
+        for ps in sorted(g["pair_sims"], key=lambda x: -x["sim"]):
+            cls = "hi" if ps["sim"] >= candidates["thresholds"]["confident"] else "mid"
+            rows.append(f"<tr><td>{ps['a']}</td><td>{ps['b']}</td><td class='{cls}'>{ps['sim']:.3f}</td></tr>")
+        rows.append("</table>")
+        rows.append("</div>")
+
+    nav = " · ".join(f"<a href='#g{i+1}'>#{i+1}</a>" for i in range(len(candidates["groups"])))
+
+    html = f"""<!doctype html>
+<html><head><meta charset='utf-8'><title>Faceset merge review</title>
+<style>.group{{border:1px solid #ccc;margin:1em 0;padding:.5em 1em}}.confident{{border-left:6px solid #2a2}}.uncertain{{border-left:6px solid #e80}}.hi{{color:#2a2;font-weight:bold}}.mid{{color:#e80}}img{{margin:2px}}</style></head>
+<body>
+<h1>Merge review — {len(candidates['groups'])} candidate groups
+    (edge>={candidates['thresholds']['edge']}, confident>={candidates['thresholds']['confident']})</h1>
+<p class='nav'>{nav}</p>
+<p>{candidates['n_centroided']} of {candidates['n_active']} active facesets centroided
+  (skipped {candidates['n_skipped']} for too few cached embeddings).
+  Green = confident (min within-group sim >= {candidates['thresholds']['confident']}); orange = uncertain.</p>
+ +{''.join(rows)} +""" + + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +# ----------------------------- apply ----------------------------- + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def cmd_apply(args): + candidates = json.loads(Path(args.candidates).read_text()) + master_path = ROOT / "manifest.json" + master = json.loads(master_path.read_text()) + by_name = {f["name"]: f for f in master.get("facesets", [])} + + # filter: skip "uncertain" groups unless --include-uncertain + accepted = [g for g in candidates["groups"] + if g["confidence"] == "confident" or args.include_uncertain] + skipped_unc = [g for g in candidates["groups"] + if g["confidence"] == "uncertain" and not args.include_uncertain] + # explicit --exclude / --only filters (group indices in the candidates file) + if args.only: + only = {int(s) for s in args.only.split(",")} + accepted = [g for i, g in enumerate(candidates["groups"]) if i in only] + if args.exclude: + excl = {int(s) for s in args.exclude.split(",")} + accepted = [g for i, g in enumerate(accepted) if i not in excl] + + print(f"[plan] {len(accepted)} groups will be merged " + f"({len(skipped_unc)} uncertain skipped)", file=sys.stderr) + + if args.dry_run: + for g in accepted: + print(f" merge {g['secondaries']} -> {g['primary']} " + f"({g['confidence']}, min_sim={g['min_link']:.3f})") + return + + merged_dir = ROOT / "_merged" + merged_dir.mkdir(exist_ok=True) + new_facesets: list[dict] = [] + new_merged: list[dict] = list(master.get("merged", [])) + consumed_names: set[str] = set() + primary_updates: dict[str, dict] = {} # name -> new entry + primary_absorbed: dict[str, list[dict]] = {} # primary_name -> [secondary entries] + + for g in accepted: + primary = g["primary"] + if primary not in by_name: + print(f"[warn] primary {primary} not in master; skipping group", file=sys.stderr) + continue + primary_dir = ROOT / primary + if not primary_dir.is_dir(): + print(f"[warn] primary dir {primary_dir} missing; skipping group", file=sys.stderr) + continue + primary_faces = primary_dir / "faces" + primary_manifest_path = primary_dir / "manifest.json" + primary_manifest = json.loads(primary_manifest_path.read_text()) + + # gather all face entries: primary + each secondary + combined_faces: list[dict] = list(primary_manifest.get("faces", [])) + # adjust composite quality fall-back: ensure key exists + for f in combined_faces: + f.setdefault("origin_faceset", primary) + + for sec in g["secondaries"]: + sec_dir = ROOT / sec + if not sec_dir.is_dir(): + print(f"[warn] secondary {sec} missing; skipping", file=sys.stderr) + continue + sec_manifest_path = sec_dir / "manifest.json" + sec_manifest = json.loads(sec_manifest_path.read_text()) if sec_manifest_path.exists() else {"faces": []} + for f in sec_manifest.get("faces", []): + f = dict(f) + f["origin_faceset"] = sec + combined_faces.append(f) + + # rank by quality.composite descending; ties broken by lower cosd_centroid + def sort_key(f): + q = f.get("quality", {}).get("composite", 0) + d = f.get("cosd_centroid", 1.0) + return (-q, d) + combined_faces.sort(key=sort_key) + + # renumber and stage PNGs into a fresh staging dir, then atomically swap + staging = primary_dir / "_faces_new" + if staging.exists(): + shutil.rmtree(staging) + staging.mkdir() + 
new_face_entries = [] + for new_rank, f in enumerate(combined_faces, start=1): + origin = f.pop("origin_faceset") + old_png_rel = f["png"] # e.g. "faces/0042.png" + old_png_name = Path(old_png_rel).name + origin_png = ROOT / origin / "faces" / old_png_name + if not origin_png.exists(): + # could be in _dropped if occlusion-pruned; skip + continue + new_name = f"{new_rank:04d}.png" + shutil.copy2(origin_png, staging / new_name) + f = dict(f) + f["rank"] = new_rank + f["png"] = f"faces/{new_name}" + f["origin_faceset"] = origin # preserve provenance in manifest + new_face_entries.append(f) + + # swap directories: primary/faces -> primary/_faces_old, staging -> primary/faces + old_faces_holding = primary_dir / "_faces_old" + if old_faces_holding.exists(): + shutil.rmtree(old_faces_holding) + if primary_faces.exists(): + primary_faces.rename(old_faces_holding) + staging.rename(primary_faces) + # migrate _dropped/ from old holding (so occlusion-pruned PNGs remain accessible) + old_dropped = old_faces_holding / "_dropped" + if old_dropped.exists(): + (primary_faces / "_dropped").mkdir(exist_ok=True) + for x in old_dropped.iterdir(): + shutil.move(str(x), str(primary_faces / "_dropped" / x.name)) + shutil.rmtree(old_faces_holding) + + # re-zip .fsz + survivor_pngs = sorted(primary_faces.glob("*.png")) + top_n = primary_manifest.get("top_n", 30) + top_n_eff = min(top_n, len(survivor_pngs)) + # remove old .fsz files + for old in primary_dir.glob("*.fsz"): + old.unlink() + top_fsz_name = f"{primary}_top{top_n_eff}.fsz" + all_fsz_name = f"{primary}_all.fsz" + _zip_png_list(survivor_pngs[:top_n_eff], primary_dir / top_fsz_name) + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, primary_dir / all_fsz_name) + all_fsz_used = all_fsz_name + else: + all_fsz_used = None + + # update primary's local manifest + primary_manifest["faces"] = new_face_entries + primary_manifest["exported"] = len(new_face_entries) + primary_manifest["fsz_top"] = top_fsz_name + primary_manifest["fsz_all"] = all_fsz_used + primary_manifest["top_n"] = top_n_eff + primary_manifest.setdefault("merge_history", []).append({ + "absorbed": g["secondaries"], + "min_link": g["min_link"], + "max_link": g["max_link"], + "confidence": g["confidence"], + }) + primary_manifest_path.write_text(json.dumps(primary_manifest, indent=2)) + + # move secondary directories into _merged/ + absorbed_master_entries: list[dict] = [] + for sec in g["secondaries"]: + sec_dir = ROOT / sec + target = merged_dir / sec + if not sec_dir.is_dir(): + continue + if target.exists(): + shutil.rmtree(sec_dir) # already moved by previous run; clean stub + else: + shutil.move(str(sec_dir), str(target)) + sec_master = dict(by_name.get(sec, {"name": sec})) + sec_master["merged_into"] = primary + sec_master["relpath"] = f"_merged/{sec}" + sec_master["fsz_top"] = None + sec_master["fsz_all"] = None + absorbed_master_entries.append(sec_master) + consumed_names.add(sec) + + new_merged.extend(absorbed_master_entries) + + # bump primary master entry + prim_master = dict(by_name[primary]) + prim_master["exported"] = len(new_face_entries) + prim_master["top_n"] = top_n_eff + prim_master["fsz_top"] = top_fsz_name + prim_master["fsz_all"] = all_fsz_used + prim_master.setdefault("merge_history", []).append({ + "absorbed": g["secondaries"], + "min_link": g["min_link"], + "max_link": g["max_link"], + }) + primary_updates[primary] = prim_master + + print(f"[merged] {g['secondaries']} -> {primary} " + f"now {len(new_face_entries)} png", file=sys.stderr) + + # rebuild master 
facesets list + for entry in master.get("facesets", []): + nm = entry["name"] + if nm in consumed_names: + continue + if nm in primary_updates: + new_facesets.append(primary_updates[nm]) + else: + new_facesets.append(entry) + + new_master = dict(master) + new_master["facesets"] = new_facesets + new_master["merged"] = new_merged + new_master["merge_run"] = { + "thresholds": candidates["thresholds"], + "groups_applied": len(accepted), + "facesets_consumed": len(consumed_names), + "include_uncertain": bool(args.include_uncertain), + } + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(new_master, indent=2)) + tmp.replace(master_path) + print(f"[done] master manifest updated: {len(new_facesets)} active, " + f"{len(new_merged)} merged, {len(consumed_names)} consumed in this run", + file=sys.stderr) + + +# ----------------------------- main ----------------------------- + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + a = sub.add_parser("analyze") + a.add_argument("--out", required=True) + a.add_argument("--edge", type=float, default=0.45, help="min cosine sim to draw an edge (default 0.45)") + a.add_argument("--confident", type=float, default=0.65, help="min within-group sim to be confident (default 0.65)") + a.set_defaults(func=cmd_analyze) + + r = sub.add_parser("report") + r.add_argument("--candidates", required=True) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + p = sub.add_parser("apply") + p.add_argument("--candidates", required=True) + p.add_argument("--include-uncertain", action="store_true", + help="apply uncertain groups too (default: confident only)") + p.add_argument("--only", default=None, help="comma-separated group indices to apply") + p.add_argument("--exclude", default=None, help="comma-separated group indices to skip") + p.add_argument("--dry-run", action="store_true") + p.set_defaults(func=cmd_apply) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/dedup_optimize.py b/work/dedup_optimize.py new file mode 100644 index 0000000..c337403 --- /dev/null +++ b/work/dedup_optimize.py @@ -0,0 +1,594 @@ +"""Corpus-wide dedup + roop-unleashed optimization. + +Two passes: + 1. Cross-family byte-identical PNG dedup (same SHA256 in two different identity + families) — keep the higher-tier family copy. Era splits of the same parent + identity (faceset_NNN_*) are intentional duplications and are NOT deduped + within their family. + 2. Within-faceset near-duplicate dedup using cached arcface embeddings + (cosine sim >= 0.95). Keep highest quality.composite, drop the rest. + +Plus a Windows-DML multi-face audit (separate phase via clip_worker-style split): + 3. Re-detect each PNG with insightface; flag any with 0 or >1 detected faces. + The roop loader appends every detected face per PNG, so multi-face crops + pollute identity averaging. + +All flagged PNGs are MOVED to /faces/_dropped/ (reversible). Affected +.fsz files are re-zipped, manifests updated. + +CLI: + analyze --out work/dedup_audit/dedup_plan.json + apply --plan ... [--dry-run] + stage_multiface --out work/dedup_audit/multiface_queue.json + merge_multiface --results --out work/dedup_audit/multiface_plan.json + apply_multiface --plan ... [--dry-run] + report --dedup ... --multiface ... 
--out work/dedup_audit +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import re +import shutil +import sys +import time +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +WIN_ROOT = r"E:\temp_things\fcswp\nl_sorted\facesets_swap_ready" +CACHES = [ + Path("/opt/face-sets/work/cache/nl_full.npz"), + Path("/opt/face-sets/work/cache/immich_peter.npz"), + Path("/opt/face-sets/work/cache/immich_nic.npz"), +] + +NEAR_DUP_THRESHOLD = 0.95 +HASH_PARALLEL = 16 + + +# ----------------------------- helpers ----------------------------- + +def faceset_tier(name: str) -> int: + m = re.match(r"^faceset_0*(\d+)(?:_.+)?$", name) + if not m: + return 99 + n = int(m.group(1)) + if 13 <= n <= 19: + return 0 + if 1 <= n <= 12: + return 1 + if 20 <= n <= 25: + return 2 + if 26 <= n <= 264: + return 3 + if 265 <= n: + return 4 + return 99 + + +def faceset_family(name: str) -> str: + """faceset_001_2010-13 → faceset_001; faceset_001 → faceset_001.""" + m = re.match(r"^(faceset_\d+)(?:_.+)?$", name) + return m.group(1) if m else name + + +def wsl_to_win(p: str) -> str: + s = str(p) + if s.startswith("/mnt/"): + return f"{s[5].upper()}:\\{s[7:].replace('/', chr(92))}" + return s + + +def iter_active_facesets() -> list[Path]: + out = [] + for d in sorted(ROOT.iterdir()): + if d.is_dir() and not d.name.startswith("_"): + out.append(d) + return out + + +def sha256_file(p: Path) -> str: + h = hashlib.sha256() + with open(p, "rb") as f: + while True: + b = f.read(1 << 20) + if not b: + break + h.update(b) + return h.hexdigest() + + +def load_caches(): + rec_index: dict[tuple[str, tuple[int, int, int, int]], np.ndarray] = {} + alias_map: dict[str, str] = {} + for c in CACHES: + if not c.exists(): + continue + d = np.load(c, allow_pickle=True) + emb = d["embeddings"] + meta = json.loads(str(d["meta"])) + face_records = [m for m in meta if not m.get("noface")] + if "path_aliases" in d.files: + paliases = json.loads(str(d["path_aliases"])) + for canon, alist in paliases.items(): + alias_map.setdefault(canon, canon) + for a in alist: + alias_map[a] = canon + for i, rec in enumerate(face_records): + p = rec["path"] + bbox = tuple(int(x) for x in rec["bbox"]) + v = emb[i].astype(np.float32) + n = float(np.linalg.norm(v)) + if n > 0: + v = v / n + rec_index[(p, bbox)] = v + alias_map.setdefault(p, p) + return rec_index, alias_map + + +def lookup_emb(rec_index, alias_map, src: str, bbox): + bbox_t = tuple(int(x) for x in bbox) + canon = alias_map.get(src, src) + v = rec_index.get((canon, bbox_t)) + if v is None and canon != src: + v = rec_index.get((src, bbox_t)) + return v + + +# ----------------------------- analyze ----------------------------- + +def cmd_analyze(args): + rec_index, alias_map = load_caches() + facesets = iter_active_facesets() + print(f"[scan] {len(facesets)} active facesets", file=sys.stderr) + + # Phase 1: walk every PNG, collect (faceset, file, src, bbox, quality, emb, sha256) + all_pngs = [] # list of dicts + t0 = time.time() + for fs in facesets: + manifest_path = fs / "manifest.json" + if not manifest_path.exists(): + continue + m = json.loads(manifest_path.read_text()) + for f in m.get("faces", []): + png_rel = f.get("png") + if not png_rel: + continue + disk_path = fs / png_rel + if not disk_path.exists(): + continue + all_pngs.append({ + "faceset": fs.name, + "family": faceset_family(fs.name), + "tier": faceset_tier(fs.name), + "file": 
Path(png_rel).name, + "rank": f.get("rank"), + "source": f.get("source"), + "bbox": f.get("bbox"), + "quality": f.get("quality", {}).get("composite", 0), + "disk_path": str(disk_path), + }) + print(f"[scan] {len(all_pngs)} PNGs walked in {time.time()-t0:.1f}s", file=sys.stderr) + + # Phase 2: SHA256 hash each PNG (parallel I/O) + t0 = time.time() + def _hash_one(idx): + all_pngs[idx]["sha256"] = sha256_file(Path(all_pngs[idx]["disk_path"])) + with ThreadPoolExecutor(max_workers=HASH_PARALLEL) as ex: + # exhaust the iterator to actually run + for _ in ex.map(_hash_one, range(len(all_pngs)), chunksize=16): + pass + print(f"[hash] {len(all_pngs)} PNGs hashed in {time.time()-t0:.1f}s", file=sys.stderr) + + # Phase 3: cross-family byte-dedup + by_sha: dict[str, list[int]] = {} + for i, p in enumerate(all_pngs): + by_sha.setdefault(p["sha256"], []).append(i) + + cross_family_groups = [] + byte_drops: set[int] = set() # indices of PNGs to drop + for sha, idxs in by_sha.items(): + if len(idxs) < 2: + continue + families = {all_pngs[i]["family"] for i in idxs} + if len(families) < 2: + continue # all in same family — intentional era duplication + # multiple families share this content → dedup keeping the best one + cross_family_groups.append({"sha256": sha, "members": [ + {"faceset": all_pngs[i]["faceset"], "file": all_pngs[i]["file"], + "tier": all_pngs[i]["tier"], "quality": all_pngs[i]["quality"], + "rank": all_pngs[i]["rank"]} for i in idxs + ]}) + # keeper rule: lowest tier number, then highest quality + best = sorted(idxs, key=lambda i: (all_pngs[i]["tier"], -all_pngs[i]["quality"]))[0] + for i in idxs: + # NEVER drop within-family copies (preserve era duplication intentionally) + # We only drop indices whose family != best's family + if i != best and all_pngs[i]["family"] != all_pngs[best]["family"]: + byte_drops.add(i) + print(f"[byte] {len(cross_family_groups)} cross-family hash groups; " + f"{len(byte_drops)} PNGs marked for byte-dedup drop", file=sys.stderr) + + # Phase 4: within-faceset near-dup (embedding sim >= threshold) + by_faceset: dict[str, list[int]] = {} + for i, p in enumerate(all_pngs): + by_faceset.setdefault(p["faceset"], []).append(i) + + near_dup_groups = [] + near_drops: set[int] = set() + miss_emb_total = 0 + t0 = time.time() + for fs_name, idxs in by_faceset.items(): + if len(idxs) < 2: + continue + # gather embeddings + embs = [] + kept_idxs = [] + for i in idxs: + v = lookup_emb(rec_index, alias_map, all_pngs[i]["source"], all_pngs[i]["bbox"]) + if v is None: + miss_emb_total += 1 + continue + embs.append(v) + kept_idxs.append(i) + if len(kept_idxs) < 2: + continue + M = np.stack(embs).astype(np.float32) + sim = M @ M.T + np.fill_diagonal(sim, -1) # ignore self + # find connected components in the (sim >= threshold) graph + adj = {k: set() for k in range(len(kept_idxs))} + for a in range(len(kept_idxs)): + # only check a < b to avoid double work + hi = np.where(sim[a, a+1:] >= NEAR_DUP_THRESHOLD)[0] + for off in hi: + b = a + 1 + int(off) + adj[a].add(b) + adj[b].add(a) + seen = set() + for k in adj: + if k in seen or not adj[k]: + continue + stack = [k] + comp = [] + while stack: + x = stack.pop() + if x in seen: + continue + seen.add(x) + comp.append(x) + for y in adj[x]: + if y not in seen: + stack.append(y) + if len(comp) < 2: + continue + comp_idxs = [kept_idxs[c] for c in comp] + # keeper: highest quality.composite, tie-break: lowest rank + best = sorted(comp_idxs, key=lambda i: (-all_pngs[i]["quality"], all_pngs[i]["rank"] or 9999))[0] + sims_in_group = [] + 
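# Collect every pairwise sim inside this component so the plan records the
+            # group's min/max similarity alongside the chosen keeper.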
for ci in range(len(comp)): + for cj in range(ci+1, len(comp)): + sims_in_group.append(float(sim[comp[ci], comp[cj]])) + near_dup_groups.append({ + "faceset": fs_name, + "members": [{"file": all_pngs[i]["file"], "rank": all_pngs[i]["rank"], + "quality": all_pngs[i]["quality"]} for i in comp_idxs], + "keeper": all_pngs[best]["file"], + "min_sim": min(sims_in_group) if sims_in_group else None, + "max_sim": max(sims_in_group) if sims_in_group else None, + }) + for i in comp_idxs: + if i != best: + near_drops.add(i) + print(f"[near] {len(near_dup_groups)} near-dup groups; " + f"{len(near_drops)} PNGs marked for near-dup drop " + f"(miss_emb={miss_emb_total}); {time.time()-t0:.1f}s", file=sys.stderr) + + # Combined drop set; for output, group by faceset + all_drops = byte_drops | near_drops + drops_by_faceset: dict[str, list] = {} + for i in all_drops: + p = all_pngs[i] + reason = [] + if i in byte_drops: reason.append("byte_dup") + if i in near_drops: reason.append("near_dup") + drops_by_faceset.setdefault(p["faceset"], []).append({ + "file": p["file"], "rank": p["rank"], "reason": "+".join(reason), + "sha256": p["sha256"], "quality": p["quality"], + }) + + plan = { + "thresholds": {"near_dup_sim": NEAR_DUP_THRESHOLD}, + "totals": { + "active_facesets": len(facesets), + "active_pngs": len(all_pngs), + "byte_dup_groups": len(cross_family_groups), + "byte_dup_drops": len(byte_drops), + "near_dup_groups": len(near_dup_groups), + "near_dup_drops": len(near_drops), + "all_drops": len(all_drops), + "facesets_affected": len(drops_by_faceset), + }, + "byte_dup_groups": cross_family_groups, + "near_dup_groups": near_dup_groups, + "drops_by_faceset": drops_by_faceset, + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(plan, indent=2)) + print(f"[done] plan -> {op}", file=sys.stderr) + + +# ----------------------------- apply ----------------------------- + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def _apply_drops_to_facesets(drops_by_faceset: dict[str, list], reason_label: str, master_path: Path): + """Move flagged PNGs to /faces/_dropped/, rebuild manifests + .fsz. + drops_by_faceset values are lists of {"file": str, ...}. 
+ Returns total moved + counts per faceset.""" + master = json.loads(master_path.read_text()) + by_name = {f["name"]: f for f in master.get("facesets", [])} + total_moved = 0 + per_faceset_counts = {} + + for fs_name, drops in drops_by_faceset.items(): + fs_dir = ROOT / fs_name + if not fs_dir.is_dir(): + print(f"[warn] {fs_name}: dir missing, skip", file=sys.stderr) + continue + faces_dir = fs_dir / "faces" + dropped_dir = faces_dir / "_dropped" + dropped_dir.mkdir(exist_ok=True) + drop_files = {d["file"] for d in drops} + + moved_here = 0 + for fname in sorted(drop_files): + src = faces_dir / fname + if not src.exists(): + continue + shutil.move(str(src), str(dropped_dir / fname)) + moved_here += 1 + + # rebuild manifest by filtering out dropped files + manifest_path = fs_dir / "manifest.json" + if manifest_path.exists(): + mm = json.loads(manifest_path.read_text()) + new_faces = [f for f in mm.get("faces", []) if Path(f.get("png", "")).name not in drop_files] + mm["faces"] = new_faces + mm["exported"] = len(new_faces) + mm.setdefault(f"{reason_label}_history", []).append({"dropped": moved_here}) + + # re-zip + survivor_pngs = sorted(faces_dir.glob("*.png")) + top_n = mm.get("top_n", 30) + top_n_eff = min(top_n, len(survivor_pngs)) + for old in fs_dir.glob("*.fsz"): + old.unlink() + top_fsz_name = f"{fs_name}_top{top_n_eff}.fsz" + all_fsz_name = f"{fs_name}_all.fsz" + if top_n_eff > 0: + _zip_png_list(survivor_pngs[:top_n_eff], fs_dir / top_fsz_name) + mm["fsz_top"] = top_fsz_name + mm["top_n"] = top_n_eff + else: + mm["fsz_top"] = None + mm["top_n"] = 0 + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, fs_dir / all_fsz_name) + mm["fsz_all"] = all_fsz_name + else: + mm["fsz_all"] = None + manifest_path.write_text(json.dumps(mm, indent=2)) + + if fs_name in by_name: + by_name[fs_name]["exported"] = len(new_faces) + by_name[fs_name]["fsz_top"] = mm["fsz_top"] + by_name[fs_name]["fsz_all"] = mm["fsz_all"] + by_name[fs_name]["top_n"] = mm["top_n"] + by_name[fs_name].setdefault(f"{reason_label}_dropped", 0) + by_name[fs_name][f"{reason_label}_dropped"] += moved_here + + total_moved += moved_here + per_faceset_counts[fs_name] = moved_here + + # rewrite master with same ordering + new_facesets = [by_name.get(e["name"], e) for e in master.get("facesets", [])] + master["facesets"] = new_facesets + master.setdefault(f"{reason_label}_runs", []).append({ + "facesets_affected": len(per_faceset_counts), + "pngs_moved": total_moved, + }) + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(master, indent=2)) + tmp.replace(master_path) + return total_moved, per_faceset_counts + + +def cmd_apply(args): + plan = json.loads(Path(args.plan).read_text()) + drops = plan["drops_by_faceset"] + if args.dry_run: + for fs, items in sorted(drops.items()): + reasons = {} + for it in items: + reasons[it["reason"]] = reasons.get(it["reason"], 0) + 1 + print(f" {fs}: {len(items)} dropped ({reasons})") + print(f"=== total: {sum(len(v) for v in drops.values())} PNGs across {len(drops)} facesets ===") + return + master_path = ROOT / "manifest.json" + total, _ = _apply_drops_to_facesets(drops, "dedup", master_path) + print(f"[done] {total} PNGs moved to faces/_dropped/ across {len(drops)} facesets", file=sys.stderr) + + +# ----------------------------- multiface staging + apply ----------------------------- + +def cmd_stage_multiface(args): + """Build queue.json of all currently-active PNGs in the corpus + for the Windows DML multi-face audit worker.""" + queue = [] + for fs in 
iter_active_facesets(): + faces_dir = fs / "faces" + if not faces_dir.is_dir(): + continue + for p in sorted(faces_dir.glob("*.png")): + queue.append({ + "wsl_path": str(p), + "win_path": wsl_to_win(str(p)), + "faceset": fs.name, + "file": p.name, + }) + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(queue, indent=2)) + print(f"[stage] {len(queue)} PNGs -> {op}", file=sys.stderr) + + +def cmd_merge_multiface(args): + """Convert worker results.json into a drops_by_faceset plan.""" + src = json.loads(Path(args.results).read_text()) + drops_by_faceset: dict[str, list] = {} + bad_count = 0 + for r in src.get("results", []): + n_faces = r.get("face_count", -1) + if n_faces == 1: + continue + bad_count += 1 + drops_by_faceset.setdefault(r["faceset"], []).append({ + "file": r["file"], + "reason": f"multiface_{n_faces}", + "face_count": n_faces, + }) + plan = { + "totals": {"bad_pngs": bad_count, "facesets_affected": len(drops_by_faceset), + "scored": len(src.get("results", []))}, + "drops_by_faceset": drops_by_faceset, + } + op = Path(args.out) + op.parent.mkdir(parents=True, exist_ok=True) + op.write_text(json.dumps(plan, indent=2)) + print(f"[merge] {bad_count} bad PNGs across {len(drops_by_faceset)} facesets -> {op}", file=sys.stderr) + + +def cmd_apply_multiface(args): + plan = json.loads(Path(args.plan).read_text()) + drops = plan["drops_by_faceset"] + if args.dry_run: + for fs, items in sorted(drops.items()): + print(f" {fs}: {len(items)} bad PNG(s)") + print(f"=== total: {sum(len(v) for v in drops.values())} ===") + return + master_path = ROOT / "manifest.json" + total, _ = _apply_drops_to_facesets(drops, "multiface", master_path) + print(f"[done] {total} PNGs moved to faces/_dropped/ across {len(drops)} facesets", file=sys.stderr) + + +# ----------------------------- report ----------------------------- + +def cmd_report(args): + out_dir = Path(args.out) + out_dir.mkdir(parents=True, exist_ok=True) + + sections = [] + if args.dedup: + d = json.loads(Path(args.dedup).read_text()) + t = d["totals"] + sections.append(f"
<h2>Dedup</h2>")
+        sections.append(
+            f"<ul>"
+            f"<li>Active facesets: {t['active_facesets']}, active PNGs: {t['active_pngs']}</li>"
+            f"<li>Cross-family byte-dup groups: {t['byte_dup_groups']} → {t['byte_dup_drops']} PNGs dropped</li>"
+            f"<li>Within-faceset near-dup groups (sim≥{d['thresholds']['near_dup_sim']}): {t['near_dup_groups']} → {t['near_dup_drops']} PNGs dropped</li>"
+            f"<li>Total dedup drops: {t['all_drops']} across {t['facesets_affected']} facesets</li>"
+            f"</ul>"
+        )
+        # top-N affected facesets
+        rows = sorted(d["drops_by_faceset"].items(), key=lambda x: -len(x[1]))[:25]
+        sections.append("<h3>Top 25 most-affected facesets</h3>"
+                        "<table><tr><th>faceset</th><th>dropped</th><th>reasons</th></tr>")
+        for fs, items in rows:
+            r = {}
+            for it in items:
+                r[it["reason"]] = r.get(it["reason"], 0) + 1
+            sections.append(f"<tr><td>{fs}</td><td>{len(items)}</td><td>{r}</td></tr>")
+        sections.append("</table>")
+
+    if args.multiface:
+        m = json.loads(Path(args.multiface).read_text())
+        t = m["totals"]
+        sections.append("<h2>Multi-face audit</h2>")
+        sections.append(
+            f"<ul>"
+            f"<li>PNGs scored: {t['scored']}</li>"
+            f"<li>Bad PNGs (0 or >1 face): {t['bad_pngs']} across {t['facesets_affected']} facesets</li>"
+            f"</ul>"
+        )
+
+    html = f"""<!doctype html>
+<html><head><meta charset='utf-8'><title>Dedup + multi-face audit</title></head>
+<body>
+<h1>facesets_swap_ready dedup + roop optimization audit</h1>
+{''.join(sections)} +""" + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +# ----------------------------- main ----------------------------- + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + a = sub.add_parser("analyze") + a.add_argument("--out", required=True) + a.set_defaults(func=cmd_analyze) + + p = sub.add_parser("apply") + p.add_argument("--plan", required=True) + p.add_argument("--dry-run", action="store_true") + p.set_defaults(func=cmd_apply) + + sm = sub.add_parser("stage_multiface") + sm.add_argument("--out", required=True) + sm.set_defaults(func=cmd_stage_multiface) + + mm = sub.add_parser("merge_multiface") + mm.add_argument("--results", required=True) + mm.add_argument("--out", required=True) + mm.set_defaults(func=cmd_merge_multiface) + + am = sub.add_parser("apply_multiface") + am.add_argument("--plan", required=True) + am.add_argument("--dry-run", action="store_true") + am.set_defaults(func=cmd_apply_multiface) + + r = sub.add_parser("report") + r.add_argument("--dedup", default=None) + r.add_argument("--multiface", default=None) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/filter_occlusions.py b/work/filter_occlusions.py new file mode 100644 index 0000000..6daa1e3 --- /dev/null +++ b/work/filter_occlusions.py @@ -0,0 +1,574 @@ +"""CLIP zero-shot scoring for masks + sunglasses across facesets_swap_ready/. + +Usage: + # score one or more specific facesets (test mode) + python work/filter_occlusions.py score --facesets faceset_001,faceset_050 \ + --out work/test_batch_occlusion/scores.json + + # score everything (full corpus) + python work/filter_occlusions.py score --out work/occlusion_scores.json + + # render HTML contact sheet from a scores.json + python work/filter_occlusions.py report --scores work/test_batch_occlusion/scores.json \ + --out work/test_batch_occlusion + +Notes: +- This script never modifies facesets_swap_ready/. An --apply step lives elsewhere + (or will be added once thresholds are validated). +- Model: open_clip ViT-L-14 / dfn2b_s39b (best public zero-shot at this size). +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from pathlib import Path +from typing import Iterable + +import torch +from PIL import Image +import open_clip + +ROOT = Path("/mnt/e/temp_things/fcswp/nl_sorted/facesets_swap_ready") +WIN_ROOT = r"E:\temp_things\fcswp\nl_sorted\facesets_swap_ready" + +MODEL_NAME = "ViT-L-14" +PRETRAINED = "dfn2b_s39b" + + +def wsl_to_win(wsl_path: str) -> str: + """Translate a /mnt/e/... wsl path to E:\\... for the Windows worker.""" + s = str(wsl_path) + if s.startswith("/mnt/"): + drive = s[5] + rest = s[7:].replace("/", "\\") + return f"{drive.upper()}:\\{rest}" + return s + +# Prompt ensembles. Each pair (positive, negative) becomes one binary classifier. +# We average text embeddings within each list, then softmax across the two means. 
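+# Illustrative numbers (assumed, not from a run): with logit_scale ≈ 100 and cosine
+# sims of 0.26 to the positive mean and 0.23 to the negative mean, the two-way
+# softmax gives P(pos) = 1 / (1 + exp(-100 * (0.26 - 0.23))) ≈ 0.95, i.e. small
+# cosine gaps map to near-saturated probabilities.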
+PROMPTS = { + "mask": { + "pos": [ + "a photo of a person wearing a surgical face mask", + "a photo of a person wearing an FFP2 respirator covering mouth and nose", + "a photo of a person wearing a cloth face mask", + "a face partially covered by a medical mask", + "a person whose mouth and nose are hidden by a face mask", + ], + "neg": [ + "a photo of a person's face with mouth and nose clearly visible", + "a clear, unobstructed photo of a face", + "a photo of a face without any mask or covering", + "a portrait of a person showing their full face", + "a photo of a person with a beard and visible mouth", # avoid beard false positives + ], + }, + "sunglasses": { + # We want to flag ONLY images where sunglasses occlude the eyes. + # False positives to defeat: sunglasses pushed up on the head/forehead, hanging on a shirt collar. + "pos": [ + "a face with dark sunglasses covering the eyes", + "a portrait with the eyes hidden behind opaque sunglasses", + "a person wearing dark sunglasses over their eyes, eyes not visible", + "a face where the eyes are completely concealed by tinted lenses", + "a close-up portrait wearing aviator sunglasses on the eyes", + ], + "neg": [ + "a portrait with both eyes clearly visible and uncovered", + "a face with sunglasses pushed up on the forehead, eyes visible below", + "a face with sunglasses resting on top of the head, eyes visible", + "a person with sunglasses hanging from their shirt, eyes visible", + "a face wearing clear prescription eyeglasses with visible eyes", + "a portrait with no eyewear and visible eyes", + ], + }, +} + + +def load_model(device: str = "cpu"): + print(f"[clip] loading {MODEL_NAME} / {PRETRAINED} on {device} ...", file=sys.stderr) + t0 = time.time() + model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED) + tokenizer = open_clip.get_tokenizer(MODEL_NAME) + model = model.to(device).eval() + logit_scale = float(model.logit_scale.exp().detach().cpu()) + print(f"[clip] ready in {time.time()-t0:.1f}s, logit_scale={logit_scale:.2f}", file=sys.stderr) + return model, preprocess, tokenizer, logit_scale + + +@torch.no_grad() +def build_text_features(model, tokenizer, device: str): + """Return dict {attr: (pos_mean_emb, neg_mean_emb)} on device, both L2-normalized.""" + out = {} + for attr, sides in PROMPTS.items(): + feats = {} + for side in ("pos", "neg"): + tokens = tokenizer(sides[side]).to(device) + f = model.encode_text(tokens) + f = f / f.norm(dim=-1, keepdim=True) + mean = f.mean(dim=0) + feats[side] = mean / mean.norm() + out[attr] = (feats["pos"], feats["neg"]) + return out + + +@torch.no_grad() +def score_images(model, preprocess, text_feats, logit_scale: float, paths: list[Path], device: str, batch: int = 16): + """Yield (path, {attr: pos_prob}) per image. 
logit_scale is CLIP's learned temperature (~100).""" + for i in range(0, len(paths), batch): + chunk = paths[i:i + batch] + imgs = [] + keep = [] + for p in chunk: + try: + img = Image.open(p).convert("RGB") + imgs.append(preprocess(img)) + keep.append(p) + except Exception as e: + print(f"[skip] {p}: {e}", file=sys.stderr) + if not imgs: + continue + x = torch.stack(imgs).to(device) + feats = model.encode_image(x) + feats = feats / feats.norm(dim=-1, keepdim=True) # (B, D) + results = {} + for attr, (pos, neg) in text_feats.items(): + sims = torch.stack([feats @ pos, feats @ neg], dim=1) * logit_scale # (B, 2) + probs = sims.softmax(dim=1)[:, 0].tolist() # P(pos) + results[attr] = probs + for j, p in enumerate(keep): + yield p, {attr: results[attr][j] for attr in text_feats} + + +def iter_facesets(root: Path, only: list[str] | None) -> Iterable[Path]: + if only: + for name in only: + d = root / name + if d.is_dir(): + yield d + else: + print(f"[warn] not a directory: {d}", file=sys.stderr) + return + for d in sorted(root.iterdir()): + if d.is_dir() and not d.name.startswith("_"): + yield d + + +def cmd_score(args): + device = "cuda" if torch.cuda.is_available() else "cpu" + model, preprocess, tokenizer, logit_scale = load_model(device) + text_feats = build_text_features(model, tokenizer, device) + + only = [s.strip() for s in args.facesets.split(",")] if args.facesets else None + facesets = list(iter_facesets(ROOT, only)) + if args.sample_per_faceset: + # take first N PNGs per faceset (cheap deterministic sample for test batches) + pass + + report = { + "model": f"{MODEL_NAME}/{PRETRAINED}", + "root": str(ROOT), + "prompts": PROMPTS, + "facesets": {}, + } + total_imgs = 0 + t0 = time.time() + for fs in facesets: + faces = sorted((fs / "faces").glob("*.png")) if (fs / "faces").is_dir() else sorted(fs.glob("*.png")) + if args.sample_per_faceset: + faces = faces[: args.sample_per_faceset] + if not faces: + continue + print(f"[scan] {fs.name}: {len(faces)} png", file=sys.stderr) + per_image = [] + for p, scores in score_images(model, preprocess, text_feats, logit_scale, faces, device): + per_image.append({"file": p.name, "mask": round(scores["mask"], 4), "sunglasses": round(scores["sunglasses"], 4)}) + total_imgs += 1 + report["facesets"][fs.name] = per_image + + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(report, indent=2)) + dt = time.time() - t0 + print(f"[done] {total_imgs} images, {dt:.1f}s ({total_imgs/max(dt,1e-3):.2f} img/s) -> {out}", file=sys.stderr) + + +def cmd_report(args): + """Render an HTML contact sheet from scores.json. 
Generates JPG thumbs.""" + import io + scores = json.loads(Path(args.scores).read_text()) + out_dir = Path(args.out) + thumbs_dir = out_dir / "thumbs" + thumbs_dir.mkdir(parents=True, exist_ok=True) + + THUMB = 160 + rows_html = [] + + def thumb_path(faceset: str, fname: str) -> Path: + d = thumbs_dir / faceset + d.mkdir(parents=True, exist_ok=True) + return d / (Path(fname).stem + ".jpg") + + def make_thumb(src: Path, dst: Path): + if dst.exists(): + return + try: + img = Image.open(src).convert("RGB") + img.thumbnail((THUMB, THUMB), Image.LANCZOS) + img.save(dst, "JPEG", quality=82) + except Exception as e: + print(f"[thumb-skip] {src}: {e}", file=sys.stderr) + + facesets = scores["facesets"] + for faceset, items in facesets.items(): + # sort: high score first so borderline cases group at the boundary + items_sorted = sorted(items, key=lambda x: max(x["mask"], x["sunglasses"]), reverse=True) + # faceset summary + n = len(items) + n_mask = sum(1 for x in items if x["mask"] >= 0.7) + n_sg = sum(1 for x in items if x["sunglasses"] >= 0.7) + pct_mask = (100 * n_mask / n) if n else 0 + pct_sg = (100 * n_sg / n) if n else 0 + rows_html.append(f"
<h2 id='{faceset}'>{faceset} ({n} imgs · mask≥0.7: {n_mask} ({pct_mask:.0f}%) · sunglasses≥0.7: {n_sg} ({pct_sg:.0f}%))</h2>")
+        rows_html.append("<div class='grid'>")
+        src_root = ROOT / faceset
+        faces_root = (src_root / "faces") if (src_root / "faces").is_dir() else src_root
+        for it in items_sorted:
+            src = faces_root / it["file"]
+            dst = thumb_path(faceset, it["file"])
+            make_thumb(src, dst)
+            rel = f"thumbs/{faceset}/{Path(it['file']).stem}.jpg"
+            m, s = it["mask"], it["sunglasses"]
+            cls_m = "hi" if m >= 0.7 else ("mid" if m >= 0.4 else "lo")
+            cls_s = "hi" if s >= 0.7 else ("mid" if s >= 0.4 else "lo")
+            rows_html.append(
+                f"<div class='cell'>"
+                f"<img src='{rel}'>"
+                f"<div class='scores'><span class='{cls_m}'>M {m:.2f}</span> <span class='{cls_s}'>S {s:.2f}</span></div>"
+                f"</div>"
+            )
+        rows_html.append("</div>")
+
+    nav = " · ".join(f"<a href='#{f}'>{f}</a>" for f in facesets)
+
+    html = f"""<!doctype html>
+<html><head><meta charset='utf-8'><title>Occlusion test batch</title>
+<style>.cell{{display:inline-block;margin:4px;text-align:center}}.hi{{color:#c33;font-weight:bold}}.mid{{color:#d80}}.lo{{color:#999}}</style></head>
+<body>
+<h1>Occlusion scores — {scores['model']}</h1>
+<p class='nav'>{nav}</p>
+<p>Sorted within each faceset by max(mask, sunglasses) descending.
+Color: <span class='hi'>≥0.70</span> · <span class='mid'>0.40–0.70</span> · <span class='lo'>&lt;0.40</span></p>
+ +{''.join(rows_html)} +""" + + out_html = out_dir / "index.html" + out_html.write_text(html) + print(f"[done] {out_html}", file=sys.stderr) + + +def _zip_png_list(pngs: list[Path], zip_path: Path) -> None: + """Mirror of sort_faces.py:_zip_png_list. Renames PNGs to 0000.png, 0001.png, ...""" + import zipfile + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=4) as zf: + for i, p in enumerate(pngs): + zf.write(p, arcname=f"{i:04d}.png") + + +def cmd_apply(args): + """Prune mask/sunglasses PNGs, quarantine occlusion-dominated facesets, + re-zip .fsz, update top-level manifest. --dry-run prints the plan only.""" + import shutil + + threshold = args.threshold + domain_pct = args.domain_pct + min_survivors = args.min_survivors + top_n_target = args.top_n + + scores = json.loads(Path(args.scores).read_text()) + master_path = ROOT / "manifest.json" + master = json.loads(master_path.read_text()) + by_name = {f["name"]: f for f in master.get("facesets", [])} + + masked_dir = ROOT / "_masked" + thin_dir = ROOT / "_thin" + + plan = [] + for faceset, items in scores["facesets"].items(): + if faceset not in by_name: + print(f"[warn] {faceset} not in master manifest — skipping", file=sys.stderr) + continue + n = len(items) + flagged_files = sorted( + it["file"] for it in items + if it["mask"] >= threshold or it["sunglasses"] >= threshold + ) + survivors_items = [it for it in items if it["file"] not in set(flagged_files)] + # preserve quality order from filename (0001.png is highest-rank) + survivors_files = sorted(it["file"] for it in survivors_items) + + n_mask = sum(1 for it in items if it["mask"] >= threshold) + n_sg = sum(1 for it in items if it["sunglasses"] >= threshold) + pct_mask = n_mask / n if n else 0 + pct_sg = n_sg / n if n else 0 + + if pct_mask >= domain_pct: + action, reason = "quarantine_masked", f"mask_pct={pct_mask:.0%}" + elif pct_sg >= domain_pct: + action, reason = "quarantine_masked", f"sunglasses_pct={pct_sg:.0%}" + elif flagged_files and len(survivors_files) < min_survivors: + # only quarantine-as-thin if pruning is the cause of the drop below threshold; + # pre-existing small facesets without occlusions are left alone + action, reason = "quarantine_thin", f"survivors={len(survivors_files)}<{min_survivors}" + elif flagged_files: + action, reason = "prune", f"drop {len(flagged_files)}" + else: + action, reason = "keep", "clean" + + plan.append({ + "faceset": faceset, "action": action, "reason": reason, + "n": n, "n_mask": n_mask, "n_sg": n_sg, + "n_dropped": len(flagged_files), "n_survivors": len(survivors_files), + "dropped_files": flagged_files, + }) + + # Summary + counts = {a: 0 for a in ("keep", "prune", "quarantine_masked", "quarantine_thin")} + for p in plan: + counts[p["action"]] += 1 + total_dropped_pngs = sum(p["n_dropped"] for p in plan if p["action"] == "prune") + total_quarantined_pngs = sum(p["n"] for p in plan if p["action"].startswith("quarantine")) + print(f"=== plan summary (threshold={threshold} domain_pct={domain_pct} min_survivors={min_survivors}) ===") + for a, c in counts.items(): + print(f" {a}: {c}") + print(f" PNGs to drop (prune): {total_dropped_pngs}") + print(f" PNGs to quarantine (whole): {total_quarantined_pngs}") + print(f" facesets in master: {len(master['facesets'])}") + print(f" facesets scored: {len(plan)}") + + # Write plan for audit + plan_path = Path(args.out_plan) + plan_path.parent.mkdir(parents=True, exist_ok=True) + plan_path.write_text(json.dumps({ + "thresholds": {"image": threshold, "domain_pct": 
domain_pct, "min_survivors": min_survivors}, + "counts": counts, + "totals": {"dropped_pngs": total_dropped_pngs, "quarantined_pngs": total_quarantined_pngs}, + "plan": plan, + }, indent=2)) + print(f" plan written to {plan_path}") + + if args.dry_run: + # pretty list of quarantines + for p in plan: + if p["action"].startswith("quarantine"): + print(f" [{p['action']:>18s}] {p['faceset']} ({p['reason']}, n={p['n']})") + return + + # ----- destructive section ----- + masked_dir.mkdir(parents=True, exist_ok=True) + thin_dir.mkdir(parents=True, exist_ok=True) + + new_facesets = [] + new_masked = list(master.get("masked", [])) # preserve any prior runs + new_thin = list(master.get("thin_eras", [])) + + # build a name -> existing-thin/masked entry index, to update relpath if we re-quarantine + by_name_thin = {e["name"]: e for e in new_thin} + by_name_masked = {e["name"]: e for e in new_masked} + + for p in plan: + entry = dict(by_name[p["faceset"]]) # copy + fs_dir = ROOT / p["faceset"] + faces_dir = fs_dir / "faces" + + if p["action"] == "keep": + new_facesets.append(entry) + continue + + # prune dropped PNGs first (applies to both prune and quarantine_thin paths) + if p["dropped_files"]: + dropped_holding = faces_dir / "_dropped" + dropped_holding.mkdir(exist_ok=True) + for fname in p["dropped_files"]: + src = faces_dir / fname + if src.exists(): + shutil.move(str(src), str(dropped_holding / fname)) + + if p["action"].startswith("quarantine"): + target_root = masked_dir if p["action"] == "quarantine_masked" else thin_dir + target = target_root / p["faceset"] + if target.exists(): + # idempotency: if a previous run already moved it, skip move + pass + else: + shutil.move(str(fs_dir), str(target)) + entry["occlusion_filter"] = { + "action": p["action"], "reason": p["reason"], + "n_input": p["n"], "n_mask": p["n_mask"], "n_sg": p["n_sg"], + "n_dropped": p["n_dropped"], "n_survivors": p["n_survivors"], + "threshold": threshold, "domain_pct": domain_pct, + } + entry["relpath"] = f"{'_masked' if p['action']=='quarantine_masked' else '_thin'}/{p['faceset']}" + entry["fsz_top"] = None + entry["fsz_all"] = None + if p["action"] == "quarantine_masked": + entry["masked"] = True + new_masked.append(entry) + else: + entry["thin"] = True + new_thin.append(entry) + continue + + # action == prune + survivor_pngs = sorted([pp for pp in faces_dir.glob("*.png")]) + if not survivor_pngs: + print(f"[warn] {p['faceset']}: no survivor PNGs after prune", file=sys.stderr) + new_facesets.append(entry) + continue + + # re-zip .fsz from survivors in quality order + top_n_eff = min(top_n_target, len(survivor_pngs)) + top_fsz = fs_dir / f"{p['faceset']}_top{top_n_eff}.fsz" + all_fsz = fs_dir / f"{p['faceset']}_all.fsz" + # remove old .fsz files (they may have different top_n in name) + for old in fs_dir.glob("*.fsz"): + old.unlink() + _zip_png_list(survivor_pngs[:top_n_eff], top_fsz) + if len(survivor_pngs) > top_n_eff: + _zip_png_list(survivor_pngs, all_fsz) + entry["fsz_all"] = all_fsz.name + else: + entry["fsz_all"] = None + entry["fsz_top"] = top_fsz.name + entry["top_n"] = top_n_eff + entry["exported"] = len(survivor_pngs) + entry["dropped_occlusion"] = p["n_dropped"] + entry["occlusion_filter"] = { + "action": "prune", "n_input": p["n"], "n_mask": p["n_mask"], + "n_sg": p["n_sg"], "n_dropped": p["n_dropped"], "n_survivors": p["n_survivors"], + "threshold": threshold, + } + new_facesets.append(entry) + + # write updated master manifest + new_master = dict(master) + new_master["facesets"] = new_facesets + 
new_master["masked"] = new_masked + new_master["thin_eras"] = new_thin + new_master["occlusion_filter_run"] = { + "model": scores.get("model"), + "threshold": threshold, + "domain_pct": domain_pct, + "min_survivors": min_survivors, + "counts": counts, + "totals": {"dropped_pngs": total_dropped_pngs, "quarantined_pngs": total_quarantined_pngs}, + } + tmp = master_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(new_master, indent=2)) + tmp.replace(master_path) + print(f"[done] master manifest updated: {len(new_facesets)} active, " + f"{len(new_masked)} masked, {len(new_thin)} thin") + + +def cmd_stage(args): + """Walk facesets_swap_ready/ and write a queue.json for the Windows clip_worker.""" + only = [s.strip() for s in args.facesets.split(",")] if args.facesets else None + queue = [] + for fs in iter_facesets(ROOT, only): + faces = sorted((fs / "faces").glob("*.png")) if (fs / "faces").is_dir() else sorted(fs.glob("*.png")) + for p in faces: + queue.append({ + "wsl_path": str(p), + "win_path": wsl_to_win(str(p)), + "faceset": fs.name, + "file": p.name, + }) + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(queue, indent=2)) + print(f"[stage] {len(queue)} png paths -> {out}", file=sys.stderr) + print(f"[stage] win queue file: {wsl_to_win(str(out))}", file=sys.stderr) + + +def cmd_merge(args): + """Ingest worker scores.json into the per-faceset shape that `report` reads.""" + src = json.loads(Path(args.scores).read_text()) + by_faceset: dict[str, list] = {} + for r in src.get("results", []): + by_faceset.setdefault(r["faceset"], []).append({ + "file": r["file"], + "mask": r["mask"], + "sunglasses": r["sunglasses"], + }) + # stable ordering: faceset by name, files by name + out_data = { + "model": src.get("model", f"{MODEL_NAME}/{PRETRAINED}"), + "root": str(ROOT), + "prompts": src.get("prompts", PROMPTS), + "facesets": {fs: sorted(items, key=lambda x: x["file"]) for fs, items in sorted(by_faceset.items())}, + } + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(out_data, indent=2)) + total = sum(len(v) for v in by_faceset.values()) + print(f"[merge] {total} scores across {len(by_faceset)} facesets -> {out}", file=sys.stderr) + + +def main(): + ap = argparse.ArgumentParser() + sub = ap.add_subparsers(dest="cmd", required=True) + + s = sub.add_parser("score", help="WSL CPU scoring (slow but no GPU dependency)") + s.add_argument("--facesets", default=None, help="comma-separated faceset names; default = all") + s.add_argument("--sample-per-faceset", type=int, default=0, help="cap PNGs per faceset (0 = all)") + s.add_argument("--out", required=True) + s.set_defaults(func=cmd_score) + + st = sub.add_parser("stage", help="Build queue.json for Windows clip_worker.py") + st.add_argument("--facesets", default=None, help="comma-separated faceset names; default = all") + st.add_argument("--out", required=True) + st.set_defaults(func=cmd_stage) + + m = sub.add_parser("merge", help="Convert worker scores.json into per-faceset report format") + m.add_argument("--scores", required=True, help="worker output (flat list of results)") + m.add_argument("--out", required=True, help="output path for per-faceset format") + m.set_defaults(func=cmd_merge) + + r = sub.add_parser("report", help="Render HTML contact sheet from a per-faceset scores.json") + r.add_argument("--scores", required=True) + r.add_argument("--out", required=True) + r.set_defaults(func=cmd_report) + + a = sub.add_parser("apply", help="Prune 
flagged PNGs, quarantine dominated facesets, re-zip .fsz, update manifest") + a.add_argument("--scores", required=True, help="per-faceset scores.json (output of `merge` or `score`)") + a.add_argument("--out-plan", required=True, help="path to write the apply plan json (audit)") + a.add_argument("--threshold", type=float, default=0.7, help="image-level drop threshold for mask/sunglasses (default 0.7)") + a.add_argument("--domain-pct", type=float, default=0.40, help="faceset-level quarantine threshold (default 0.40)") + a.add_argument("--min-survivors", type=int, default=5, help="quarantine to _thin if survivors below this (default 5)") + a.add_argument("--top-n", type=int, default=30, help="top-N for re-zipped _topN.fsz (default 30)") + a.add_argument("--dry-run", action="store_true", help="print plan only, no filesystem changes") + a.set_defaults(func=cmd_apply) + + args = ap.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/work/multiface_worker.py b/work/multiface_worker.py new file mode 100644 index 0000000..033b75f --- /dev/null +++ b/work/multiface_worker.py @@ -0,0 +1,144 @@ +"""Windows / DirectML multi-face audit worker. + +For every PNG in queue.json, run insightface FaceAnalysis and record how many +faces were detected (filtering by det_score>=MIN_DET and face_short>=MIN_PIX). +Surfaces the load-bearing roop invariant: each .fsz PNG must hold exactly one +face, otherwise the loader's `extract_face_images` appends every detected face +into the FaceSet and pollutes the averaged identity embedding. + +CLI: + py -3.12 multiface_worker.py [--limit N] +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import numpy as np +from PIL import Image, ImageOps +from insightface.app import FaceAnalysis + +MODEL_ROOT = r"C:\face_embed_venv\models" +MIN_DET = 0.5 +MIN_FACE_PIX = 40 +FLUSH_EVERY = 200 + + +def load_existing(out_path: Path): + if not out_path.exists(): + return None, set() + try: + d = json.loads(out_path.read_text()) + processed = set(d.get("processed", [])) + return d, processed + except Exception as e: + print(f"[warn] could not parse {out_path}: {e}; starting fresh", file=sys.stderr) + return None, set() + + +def save_atomic(out_path: Path, data: dict): + tmp = out_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(data, indent=2)) + os.replace(tmp, out_path) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("queue", type=Path) + ap.add_argument("out", type=Path) + ap.add_argument("--limit", type=int, default=None) + args = ap.parse_args() + + queue = json.loads(args.queue.read_text()) + print(f"[queue] {len(queue)} entries from {args.queue}", flush=True) + args.out.parent.mkdir(parents=True, exist_ok=True) + existing, processed = load_existing(args.out) + if existing: + print(f"[resume] {len(processed)} already scored", flush=True) + results = existing.get("results", []) + else: + results = [] + pending = [e for e in queue if e["wsl_path"] not in processed] + if args.limit is not None: + pending = pending[: args.limit] + print(f"[pending] {len(pending)} entries", flush=True) + if not pending: + print("[done] nothing to do") + return + + print("[load] FaceAnalysis with DmlExecutionProvider", flush=True) + app = FaceAnalysis( + name="buffalo_l", + root=MODEL_ROOT, + providers=["DmlExecutionProvider", "CPUExecutionProvider"], + ) + app.prepare(ctx_id=0, det_size=(640, 640)) + + n_done = 0 + n_load_err = 0 + last_flush = time.time() + 
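# Loop below: each PNG is opened, EXIF-transposed, and converted RGB -> BGR
+    # (insightface expects OpenCV channel order); face_count records detections
+    # passing the det_score and min-pixel filters, and -1 marks a load error.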
t_start = time.time() + + def flush(): + save_atomic(args.out, { + "results": results, + "processed": sorted(processed), + }) + + for entry in pending: + try: + with Image.open(entry["win_path"]) as im: + im = ImageOps.exif_transpose(im) + im = im.convert("RGB") + rgb = np.array(im) + bgr = rgb[:, :, ::-1].copy() + except Exception as e: + n_load_err += 1 + results.append({ + "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"], + "face_count": -1, "error": "load", + }) + processed.add(entry["wsl_path"]) + n_done += 1 + continue + + faces = app.get(bgr) + kept = 0 + for f in faces: + if float(f.det_score) < MIN_DET: + continue + x1, y1, x2, y2 = [int(round(v)) for v in f.bbox] + short = min(max(x2 - x1, 0), max(y2 - y1, 0)) + if short < MIN_FACE_PIX: + continue + kept += 1 + + results.append({ + "wsl_path": entry["wsl_path"], "faceset": entry["faceset"], "file": entry["file"], + "face_count": kept, + }) + processed.add(entry["wsl_path"]) + n_done += 1 + + if (n_done % FLUSH_EVERY == 0) or (time.time() - last_flush) > 30.0: + flush() + last_flush = time.time() + elapsed = time.time() - t_start + rate = n_done / max(0.1, elapsed) + eta = (len(pending) - n_done) / max(0.1, rate) / 60.0 + print(f"[scan] {n_done}/{len(pending)} rate={rate:.2f} img/s eta={eta:.1f}min " + f"load_err={n_load_err}", flush=True) + + flush() + elapsed = time.time() - t_start + print(f"[done] {n_done} scored, {n_load_err} load errors, {elapsed:.1f}s " + f"({n_done/max(0.1,elapsed):.2f} img/s) -> {args.out}", flush=True) + + +if __name__ == "__main__": + main()