# lakehouse/scripts/staffing/fetch_face_pool.py

#!/usr/bin/env python3
"""
fetch_face_pool.py — pull N synthetic headshots from
https://thispersondoesnotexist.com/, write to data/headshots/face_NNNN.jpg,
optionally tag each with gender via deepface, emit a JSONL manifest.

Each fetch is a fresh StyleGAN face — no real people. The deterministic
per-worker mapping happens at serve time (mcp-server hashes the worker key
into the pool); this script just builds the pool.

Usage:
    python3 scripts/staffing/fetch_face_pool.py --count 300 --concurrency 3
    python3 scripts/staffing/fetch_face_pool.py --count 50  --no-gender

Re-running is idempotent: existing face_NNNN.jpg files are skipped, and
the manifest is rewritten from disk state.
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import urllib.request
import urllib.error

# Endpoint requested once per face; the module docstring describes every
# response as a freshly generated synthetic (StyleGAN) headshot.
URL = "https://thispersondoesnotexist.com/"
# Value sent as the User-Agent header on every fetch request.
UA = "Lakehouse/1.0 (face-pool fetch · synthetic-only · no real-person tracking)"


def fetch_one(idx: int, out_dir: str) -> tuple[int, str, bool, str | None]:
    """Fetch one face into ``out_dir`` unless a usable copy is already there.

    The download is written to a temporary sibling file and moved into place
    with ``os.replace``, so an interrupted run never leaves a truncated
    ``face_NNNN.jpg`` that a later run would accept as cached.

    Args:
        idx: Pool slot; determines the file name ``face_{idx:04d}.jpg``.
        out_dir: Existing directory the image is stored in.

    Returns:
        ``(idx, basename, cached, error)``. ``cached`` is True when an
        existing file of at least 1024 bytes was reused without a request.
        ``error`` is None on success, otherwise a short description of the
        failure. Failures derived from ``Exception`` are reported through
        ``error`` rather than raised, so one bad fetch cannot abort the
        caller's whole batch.
    """
    # Anything smaller is treated as a failed/placeholder response. The same
    # threshold decides both "accept download" and "reuse cached file".
    min_bytes = 1024
    fname = f"face_{idx:04d}.jpg"
    full = os.path.join(out_dir, fname)

    # Single stat call instead of exists() + getsize(): a file that is
    # missing (or disappears mid-check) simply counts as "not cached".
    try:
        if os.path.getsize(full) >= min_bytes:
            return idx, fname, True, None
    except OSError:
        pass

    # Include the pid so overlapping runs cannot clobber each other's
    # partial downloads.
    tmp = f"{full}.{os.getpid()}.part"
    try:
        req = urllib.request.Request(URL, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as resp:
            blob = resp.read()
        if len(blob) < min_bytes:
            return idx, fname, False, f"response too small ({len(blob)} bytes)"
        with open(tmp, "wb") as f:
            f.write(blob)
        # Atomic rename: readers see either the old state or the full image.
        os.replace(tmp, full)
        return idx, fname, False, None
    except urllib.error.URLError as e:
        return idx, fname, False, f"urlerror: {e}"
    except Exception as e:
        # Deliberate catch-all at the worker boundary (see Returns).
        return idx, fname, False, f"{type(e).__name__}: {e}"
    finally:
        # Remove a leftover partial file; after a successful replace the
        # temp path no longer exists and the error is ignored.
        try:
            os.remove(tmp)
        except OSError:
            pass


def maybe_tag_gender(records: list[dict], out_dir: str) -> dict[str, int]:
    """Give every record a ``gender`` key, using deepface when importable.

    Records are mutated in place. Each ends up with ``gender`` set to
    ``"man"``, ``"woman"`` or ``None``; when analysing a face raises, the
    record additionally gets a ``gender_error`` string.

    Args:
        records: Dicts carrying at least a ``file`` key (image basename).
        out_dir: Directory the image files are read from.

    Returns:
        A mapping from label (``"man"``, ``"woman"``, ``"unknown"``) to the
        number of records given that label. If deepface cannot be imported,
        every record is tagged ``None`` and the result is
        ``{"unknown": len(records)}``.
    """
    try:
        from deepface import DeepFace  # type: ignore
    except Exception as import_err:
        # deepface is optional and imported lazily; any import-time failure
        # just disables tagging.
        print(f"  (deepface unavailable: {import_err}) — pool will mix naturally")
        for record in records:
            record["gender"] = None
        return {"unknown": len(records)}

    print("  tagging gender via deepface (CPU; ~0.5-1s per face)…")
    total = len(records)
    tally: dict[str, int] = {}
    for position, record in enumerate(records, start=1):
        image_path = os.path.join(out_dir, record["file"])
        try:
            analysis = DeepFace.analyze(
                img_path=image_path,
                actions=["gender"],
                enforce_detection=False,
                silent=True,
            )
            # The result may be a list; only its first entry is used.
            if isinstance(analysis, list):
                analysis = analysis[0] if analysis else {}
            dominant = (analysis.get("dominant_gender") or "").lower().strip()
            if dominant.startswith("man"):
                label = "man"
            elif dominant.startswith("woman"):
                label = "woman"
            else:
                label = None
            record["gender"] = label
        except Exception as analyze_err:
            # Best effort: a face that cannot be analysed stays untagged and
            # keeps a note of why.
            record["gender"] = None
            record["gender_error"] = f"{type(analyze_err).__name__}: {analyze_err}"

        bucket = record["gender"] or "unknown"
        tally[bucket] = tally.get(bucket, 0) + 1
        if position % 25 == 0:
            print(f"    [{position}/{total}] {tally}")
    return tally


def main():
    """Build or refresh the face pool and rewrite its manifest.

    Steps:
      1. Parse ``--count``, ``--out``, ``--concurrency`` and ``--no-gender``.
      2. Run ``fetch_one`` for slots ``0 .. count-1`` on a thread pool,
         printing progress every 25 completions.
      3. Report failed fetches on stderr and leave them out of the pool.
      4. Tag gender via ``maybe_tag_gender`` unless ``--no-gender`` was given
         (in which case every record gets ``gender = None``).
      5. Write one JSON object per surviving record to
         ``<out>/manifest.jsonl`` and print a short sha256 fingerprint of
         the (file, gender) pairs.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--count", type=int, default=300, help="how many faces to maintain in pool")
    p.add_argument(
        "--out",
        default=os.path.join(os.path.dirname(__file__), "..", "..", "data", "headshots"),
    )
    p.add_argument("--concurrency", type=int, default=3, help="parallel fetches (be polite)")
    p.add_argument("--no-gender", action="store_true", help="skip deepface gender tagging")
    args = p.parse_args()

    out = os.path.realpath(args.out)
    os.makedirs(out, exist_ok=True)

    print(f"Fetching {args.count} faces → {out}")
    print(f"Source: {URL} (synthetic StyleGAN — no real people)")

    # Pre-sized so each result lands in its own slot regardless of the
    # order in which the futures complete.
    results: list[dict | None] = [None] * args.count
    ok = 0  # running count of successful slots, for the progress line
    t0 = time.time()
    with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as ex:
        futs = [ex.submit(fetch_one, i, out) for i in range(args.count)]
        for done, fut in enumerate(as_completed(futs), 1):
            idx, fname, cached, err = fut.result()
            results[idx] = {
                "id": idx,
                "file": fname,
                "cached": cached,
                "error": err,
            }
            if not err:
                ok += 1
            if done % 25 == 0 or done == args.count:
                print(f"  [{done}/{args.count}] {ok} ok  ({time.time()-t0:.1f}s)")

    # Surface failures instead of silently dropping them: the manifest only
    # lists successful slots, so without this the reason for a short pool
    # would be invisible.
    failed = [r for r in results if r and r.get("error")]
    if failed:
        max_shown = 10
        print(
            f"\n{len(failed)} fetch(es) failed and are left out of the manifest:",
            file=sys.stderr,
        )
        for r in failed[:max_shown]:
            print(f"  {r['file']}: {r['error']}", file=sys.stderr)
        if len(failed) > max_shown:
            print(f"  … and {len(failed) - max_shown} more", file=sys.stderr)

    # Drop slots that errored or are still None (shouldn't happen)
    records = [r for r in results if r and not r.get("error")]
    print(f"\nPool ready: {len(records)} faces, {sum(1 for r in records if r['cached'])} from cache")

    if not args.no_gender and records:
        print("\nGender-tagging pass:")
        summary = maybe_tag_gender(records, out)
        print(f"  distribution: {summary}")
    else:
        for r in records:
            r["gender"] = None

    # Strip transient flags before persisting
    for r in records:
        r.pop("cached", None)
        r.pop("error", None)

    manifest = os.path.join(out, "manifest.jsonl")
    # json.dumps escapes non-ASCII by default; the explicit encoding just
    # removes any dependence on the platform's default.
    with open(manifest, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r) + "\n")
    print(f"\nManifest: {manifest}  ({len(records)} entries)")

    # Quick checksum of the pool contents for downstream debugging: hashes
    # each record's file name and gender ("?" when untagged), in order.
    h = hashlib.sha256()
    for r in records:
        h.update(r["file"].encode())
        h.update(b"|")
        h.update((r.get("gender") or "?").encode())
    print(f"Pool fingerprint (sha256): {h.hexdigest()[:16]}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()