"""Windows / DirectML multi-face audit worker.

For every PNG in queue.json, run insightface FaceAnalysis and record how many
faces were detected (filtering by det_score>=MIN_DET and face_short>=MIN_FACE_PIX).
Surfaces the load-bearing roop invariant: each .fsz PNG must hold exactly one
face, otherwise the loader's `extract_face_images` appends every detected face
into the FaceSet and pollutes the averaged identity embedding.

CLI:
    py -3.12 multiface_worker.py <queue.json> <out_results.json> [--limit N]
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import time
from pathlib import Path

import numpy as np
from PIL import Image, ImageOps
from insightface.app import FaceAnalysis

# Directory passed to FaceAnalysis as its `root` argument
# (presumably where the "buffalo_l" model pack is stored -- confirm on the host).
MODEL_ROOT = r"C:\face_embed_venv\models"
# Detections with det_score below this value are not counted as faces.
MIN_DET = 0.5
# Detections whose bounding-box short side is below this value are not counted.
MIN_FACE_PIX = 40
# Results are written to disk every FLUSH_EVERY processed entries
# (main() additionally flushes when more than 30 s passed since the last write).
FLUSH_EVERY = 200


def load_existing(out_path: Path):
    """Load a previously written results file so a run can be resumed.

    Args:
        out_path: Location of the results JSON written by an earlier run.

    Returns:
        A ``(data, processed)`` tuple. ``data`` is the decoded JSON object and
        ``processed`` is the set built from its ``"processed"`` list (empty if
        the key is absent). When the file does not exist, or reading/decoding
        it fails for any reason, ``(None, set())`` is returned instead; in the
        failure case a warning is printed to stderr.
    """
    if out_path.exists():
        try:
            payload = json.loads(out_path.read_text())
            already_done = set(payload.get("processed", []))
        except Exception as exc:
            # Best effort: an unreadable checkpoint means "start over", not "abort".
            print(f"[warn] could not parse {out_path}: {exc}; starting fresh", file=sys.stderr)
        else:
            return payload, already_done
    return None, set()


def save_atomic(out_path: Path, data: dict):
    """Write *data* as indented JSON to *out_path* via a sibling temp file.

    The JSON text is first written to a file next to *out_path* whose suffix
    is replaced by ``.tmp.json``; that file is then moved over *out_path* with
    ``os.replace`` so readers never observe a half-written results file.

    Args:
        out_path: Final destination of the JSON document.
        data: JSON-serialisable mapping to store.
    """
    scratch = out_path.with_suffix(".tmp.json")
    serialized = json.dumps(data, indent=2)
    with scratch.open("w") as handle:
        handle.write(serialized)
    os.replace(scratch, out_path)


def _load_bgr(path):
    """Decode the image at *path* and return it as an H x W x 3 BGR array.

    EXIF orientation is applied before the image is converted to RGB; the
    channel axis is then reversed (RGB -> BGR) and copied so the result is a
    fresh contiguous array rather than a negative-stride view.
    NOTE(review): BGR is presumably the channel order FaceAnalysis.get expects.
    """
    with Image.open(path) as im:
        upright = ImageOps.exif_transpose(im)
        rgb = np.array(upright.convert("RGB"))
    return rgb[:, :, ::-1].copy()


def _count_kept_faces(faces) -> int:
    """Count detections passing the MIN_DET score and MIN_FACE_PIX size filters."""
    kept = 0
    for face in faces:
        if float(face.det_score) < MIN_DET:
            continue
        x1, y1, x2, y2 = (int(round(v)) for v in face.bbox)
        # Clamp to zero so a degenerate/inverted box can never look "large".
        short = min(max(x2 - x1, 0), max(y2 - y1, 0))
        if short >= MIN_FACE_PIX:
            kept += 1
    return kept


def main():
    """Run the multi-face audit over every pending queue entry.

    Command line: ``<queue.json> <out_results.json> [--limit N]``.

    The queue is a JSON list of entries carrying ``wsl_path``, ``win_path``,
    ``faceset`` and ``file``. Entries whose ``wsl_path`` already appears in the
    ``processed`` list of an existing output file are skipped, so an
    interrupted run can be resumed. For each remaining entry one record is
    appended to ``results`` with ``face_count`` set to the number of accepted
    detections, or ``-1`` plus ``"error": "load"`` when the image could not be
    decoded.

    Results are flushed to disk every FLUSH_EVERY entries or when more than
    30 s passed since the last write, and always once more when the loop ends,
    including when it ends because of an exception or Ctrl-C, so completed work
    is not lost.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("queue", type=Path)
    ap.add_argument("out", type=Path)
    ap.add_argument("--limit", type=int, default=None)
    args = ap.parse_args()

    queue = json.loads(args.queue.read_text())
    print(f"[queue] {len(queue)} entries from {args.queue}", flush=True)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    existing, processed = load_existing(args.out)
    if existing:
        print(f"[resume] {len(processed)} already scored", flush=True)
        results = existing.get("results", [])
    else:
        results = []
    pending = [e for e in queue if e["wsl_path"] not in processed]
    if args.limit is not None:
        pending = pending[: args.limit]
    print(f"[pending] {len(pending)} entries", flush=True)
    if not pending:
        print("[done] nothing to do", flush=True)
        return

    print("[load] FaceAnalysis with DmlExecutionProvider", flush=True)
    app = FaceAnalysis(
        name="buffalo_l",
        root=MODEL_ROOT,
        providers=["DmlExecutionProvider", "CPUExecutionProvider"],
    )
    app.prepare(ctx_id=0, det_size=(640, 640))

    n_done = 0
    n_load_err = 0
    t_start = time.time()
    last_flush = t_start

    def flush():
        save_atomic(args.out, {
            "results": results,
            "processed": sorted(processed),
        })

    try:
        for entry in pending:
            record = {
                "wsl_path": entry["wsl_path"],
                "faceset": entry["faceset"],
                "file": entry["file"],
            }
            try:
                bgr = _load_bgr(entry["win_path"])
            except Exception as e:
                # Unreadable image: record it as processed so a resumed run
                # does not retry it forever, but keep the cause visible.
                n_load_err += 1
                print(f"[warn] could not load {entry.get('win_path')!r}: {e}",
                      file=sys.stderr, flush=True)
                record["face_count"] = -1
                record["error"] = "load"
            else:
                record["face_count"] = _count_kept_faces(app.get(bgr))

            results.append(record)
            processed.add(entry["wsl_path"])
            n_done += 1

            # Checked on both the success and the load-error path so a run of
            # broken files still checkpoints and reports progress.
            if (n_done % FLUSH_EVERY == 0) or (time.time() - last_flush) > 30.0:
                flush()
                last_flush = time.time()
                elapsed = time.time() - t_start
                rate = n_done / max(0.1, elapsed)
                # rate is strictly positive here (n_done >= 1), so no clamp is
                # needed; clamping it would understate the ETA on slow runs.
                eta = (len(pending) - n_done) / rate / 60.0
                print(f"[scan] {n_done}/{len(pending)} rate={rate:.2f} img/s eta={eta:.1f}min "
                      f"load_err={n_load_err}", flush=True)
    finally:
        # Persist whatever was scored, even if detection raised or the user
        # interrupted the run.
        flush()

    elapsed = time.time() - t_start
    print(f"[done] {n_done} scored, {n_load_err} load errors, {elapsed:.1f}s "
          f"({n_done/max(0.1,elapsed):.2f} img/s) -> {args.out}", flush=True)


# Run the audit only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()