Add Immich outage circuit breaker; document nic run + Tailscale quirk
work/immich_stage.py: - Startup probe of /server/version (exit 2 if unreachable). - Outage circuit breaker: after OUTAGE_FAIL_STREAK=12 consecutive faces_error/download_error results, run a quick probe; if the probe also fails, persist state and exit with code 2 so a long unattended run can pause rather than silently churning through tens of thousands of retries during an upstream outage. Resume by re-running the same command -- state.json + queue.json are intact. README: - Document the nic run (per-user API key necessary; second pipeline invocation confirmed expected behavior; cleaner library than peter's with 0 internal byte-dupes vs 2,976). - Mention the circuit breaker as the mechanism that keeps long unattended runs safe under the known Tailscale flicker pattern at this site. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
14
README.md
14
README.md
@@ -259,6 +259,20 @@ v2.7.2), with the admin API key:
|
|||||||
| matched existing identities | **8,103 of 19,480 (42%)** at cos-dist ≤ 0.45; biggest hits faceset_002 (+2,666), faceset_001 (+1,856), faceset_003 (+670) |
|
| matched existing identities | **8,103 of 19,480 (42%)** at cos-dist ≤ 0.45; biggest hits faceset_002 (+2,666), faceset_001 (+1,856), faceset_003 (+670) |
|
||||||
| new clusters | 2,534 at threshold 0.55 → 239 surviving refine gates → **185 emitted** as `faceset_026..264` (gaps where export-swap's tighter outlier filter dropped clusters below the export quality bar) |
|
| new clusters | 2,534 at threshold 0.55 → 239 surviving refine gates → **185 emitted** as `faceset_026..264` (gaps where export-swap's tighter outlier filter dropped clusters below the export quality bar) |
|
||||||
|
|
||||||
|
A second 2026-04-26 run with **nic's per-user API key** confirmed the
|
||||||
|
expected behavior: 25,777 of nic's IMAGE assets were enumerated (matching
|
||||||
|
her `/server/statistics` count of 25,786, off by 9 ≈ the transient errors
|
||||||
|
that didn't get marked seen), **7,834 staged** (30% face-bearing-with-big-face,
|
||||||
|
denser than peter's 19%), 519 byte-deduped vs `nl_full.npz`, **0 internal
|
||||||
|
byte-duplicates** (cleaner library than peter's 2,976), 54 transient errors.
|
||||||
|
|
||||||
|
`work/immich_stage.py` carries a built-in **outage circuit breaker**:
|
||||||
|
after 12 consecutive failed assets (face-lookup or download errors) it probes Immich; if that probe also
|
||||||
|
fails, the script exits cleanly with code 2, state preserved. This made
|
||||||
|
the nic run survive a mid-stage Immich outage — the script paused, the
|
||||||
|
operator confirmed connectivity was back, and the same command resumed
|
||||||
|
from the saved `state.json` without re-fetching what was already done.
|
||||||
|
|
||||||
**Important caveats for Immich v2.7.2**:
|
**Important caveats for Immich v2.7.2**:
|
||||||
- The `userIds` filter on `/search/metadata` is **silently ignored** when
|
- The `userIds` filter on `/search/metadata` is **silently ignored** when
|
||||||
the API key is bound to a different user. The "import everything the
|
the API key is bound to a different user. The "import everything the
|
||||||
|
|||||||
@@ -78,6 +78,12 @@ HTTP_TIMEOUT = 60 # seconds, conservative for big originals
|
|||||||
HTTP_RETRIES = 3
|
HTTP_RETRIES = 3
|
||||||
HTTP_BACKOFF = 2.0
|
HTTP_BACKOFF = 2.0
|
||||||
|
|
||||||
|
# Circuit breaker: if this many consecutive asset results fail with network errors,
|
||||||
|
# probe Immich; if probe also fails, exit cleanly with code 2 so the orchestrator
|
||||||
|
# can pause until the user says resume. State is preserved (resume-safe).
|
||||||
|
OUTAGE_FAIL_STREAK = 12
|
||||||
|
OUTAGE_PROBE_TIMEOUT = 8
|
||||||
|
|
||||||
# ---- helpers ------------------------------------------------------------- #
|
# ---- helpers ------------------------------------------------------------- #
|
||||||
|
|
||||||
def http_get(url: str, accept_bytes: bool = False) -> bytes | dict:
|
def http_get(url: str, accept_bytes: bool = False) -> bytes | dict:
|
||||||
@@ -96,6 +102,16 @@ def http_get(url: str, accept_bytes: bool = False) -> bytes | dict:
|
|||||||
raise RuntimeError(f"GET {url} failed after {HTTP_RETRIES} attempts: {last_err}")
|
raise RuntimeError(f"GET {url} failed after {HTTP_RETRIES} attempts: {last_err}")
|
||||||
|
|
||||||
|
|
||||||
|
def probe_immich() -> bool:
    """Quick connectivity probe (no retry). Used by the circuit breaker.

    Issues a single GET against ``{API}/server/version`` with a short
    timeout (``OUTAGE_PROBE_TIMEOUT`` seconds) and reports whether it
    completed.

    Returns:
        True if the request completed and the body could be read; False on
        any failure (connection error, timeout, HTTP error status, malformed
        URL, ...).
    """
    try:
        req = urllib.request.Request(f"{API}/server/version", headers=HEADERS)
        # Use the response as a context manager so the underlying socket is
        # always released, even if reading the body fails part-way.
        with urllib.request.urlopen(req, timeout=OUTAGE_PROBE_TIMEOUT) as resp:
            resp.read()
    except Exception:
        # Deliberately broad: for the breaker, any failure means "not
        # reachable right now"; the caller decides whether to pause the run.
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def http_post(url: str, payload: dict) -> dict:
|
def http_post(url: str, payload: dict) -> dict:
|
||||||
last_err = None
|
last_err = None
|
||||||
body = json.dumps(payload).encode("utf-8")
|
body = json.dumps(payload).encode("utf-8")
|
||||||
@@ -241,6 +257,12 @@ def stage(user_label: str, limit: int | None, workers: int) -> None:
|
|||||||
f"{len(queue)} in queue, {len(aliases)} aliased to existing cache")
|
f"{len(queue)} in queue, {len(aliases)} aliased to existing cache")
|
||||||
seen = set(state["seen_asset_ids"])
|
seen = set(state["seen_asset_ids"])
|
||||||
|
|
||||||
|
# ---- startup connectivity probe ---- #
|
||||||
|
if not probe_immich():
|
||||||
|
print(f"[init] Immich probe failed at {API}/server/version -- exiting code 2")
|
||||||
|
sys.exit(2)
|
||||||
|
print("[init] Immich reachable")
|
||||||
|
|
||||||
# ---- load existing canonical cache hashes (sha256) ---- #
|
# ---- load existing canonical cache hashes (sha256) ---- #
|
||||||
print(f"[init] loading existing cache hashes from {CACHE_PATH}")
|
print(f"[init] loading existing cache hashes from {CACHE_PATH}")
|
||||||
_emb, meta, _src, _proc, _aliases = load_cache(CACHE_PATH)
|
_emb, meta, _src, _proc, _aliases = load_cache(CACHE_PATH)
|
||||||
@@ -280,6 +302,7 @@ def stage(user_label: str, limit: int | None, workers: int) -> None:
|
|||||||
return asset, faces, blob, eligible, None
|
return asset, faces, blob, eligible, None
|
||||||
|
|
||||||
n = 0
|
n = 0
|
||||||
|
err_streak = 0
|
||||||
last_flush = time.time()
|
last_flush = time.time()
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
pool = ThreadPoolExecutor(max_workers=workers)
|
pool = ThreadPoolExecutor(max_workers=workers)
|
||||||
@@ -304,7 +327,22 @@ def stage(user_label: str, limit: int | None, workers: int) -> None:
|
|||||||
detail = err.split(":", 1)[1][:160] if ":" in err else err
|
detail = err.split(":", 1)[1][:160] if ":" in err else err
|
||||||
print(f"[err] {kind} {aid}: {detail}")
|
print(f"[err] {kind} {aid}: {detail}")
|
||||||
state["skipped_download_error"] += 1
|
state["skipped_download_error"] += 1
|
||||||
|
err_streak += 1
|
||||||
|
# Circuit breaker: long streak -> probe; if down, save and exit.
|
||||||
|
if err_streak >= OUTAGE_FAIL_STREAK:
|
||||||
|
print(f"[breaker] {err_streak} consecutive errors; probing Immich...")
|
||||||
|
if probe_immich():
|
||||||
|
print("[breaker] probe ok, treating as transient; continuing")
|
||||||
|
err_streak = 0
|
||||||
|
else:
|
||||||
|
print("[breaker] probe FAILED -- pausing run; resume with same command")
|
||||||
|
queue_path.write_text(json.dumps(queue, indent=2))
|
||||||
|
state_path.write_text(json.dumps(state, indent=2))
|
||||||
|
aliases_path.write_text(json.dumps(aliases, indent=2))
|
||||||
|
sys.exit(2)
|
||||||
continue
|
continue
|
||||||
|
else:
|
||||||
|
err_streak = 0
|
||||||
|
|
||||||
# Permanent classifications -> seen.
|
# Permanent classifications -> seen.
|
||||||
if err == "no_faces":
|
if err == "no_faces":
|
||||||
|
|||||||
Reference in New Issue
Block a user