Three layers shipped:
1. SCRIPT — scripts/staffing/fetch_face_pool.py
Pulls N synthetic StyleGAN faces from thispersondoesnotexist.com
into data/headshots/face_NNNN.jpg, writes manifest.jsonl. Idempotent:
re-running skips existing files. Optional gender tagging via deepface
(currently unavailable on this box; the script handles ImportError
gracefully and tags everything as untagged). Fetched 198 faces with
concurrency=3 in ~67s.
2. SERVER — /headshots/:key route in mcp-server/index.ts
Loads manifest at first hit, caches in globalThis._faces. Hashes the
key with djb2-style mixing → pool index → returns the JPG. Same
key always gets the same face (deterministic). Accepts
?g=man|woman&e=caucasian|black|hispanic|south_asian|east_asian|middle_eastern
to bias pool selection — the gender/ethnicity buckets fall back to
the full pool when no tagged matches exist. Cache-Control:
86400 immutable so faces ride the browser cache after first hit.
/headshots/__reload re-reads the manifest without restart.
3. UI — search.html + console.html worker cards
Re-added overlay <img> on top of the monogram .av circle. img.src
= /headshots/<encoded-key>?g=<hint>&e=<hint>. img.onerror removes
the failed image so the monogram stays visible if the face pool
isn't fetched / CDN is blocked. .av now has overflow:hidden +
position:relative to clip the img to a perfect circle.
Forced-confident name resolution (J: "we're CREATING the profile,
created as though you truly have the information Xavier is more
likely Hispanic and he's a male"):
genderFor(name) — looks the name up in MALE_NAMES and FEMALE_NAMES,
then falls back to a deterministic hash split so unknown names
spread ~50/50. MALE_NAMES now includes cross-cultural names:
Alejandro/Andres/Mateo/Santiago/Joaquin/Cesar/Hugo/Felipe/Gerardo/
Salvador/Ramon (Hispanic), Raj/Anil/Vikram/Krishna/Pradeep (South
Asian), Wei/Yi/Hiroshi/Akira/Hyun (East Asian),
Demetrius/Kareem/DaQuan/Khalil (Black), Omar/Khalid/Hassan/Ahmed/
Bilal (Middle Eastern). FEMALE_NAMES was extended in parallel.
guessEthnicityFromFirstName(name) — confident default of 'caucasian'
for any name not in the cultural buckets, so every worker resolves
to a category the face pool can be biased toward. Lookup order:
Middle Eastern → Black → Hispanic → South Asian → East Asian →
Caucasian. The order matters where names overlap: e.g. Aisha appears
in both the Middle Eastern and Black buckets and resolves to Middle
Eastern for visual fit.
Both helpers also ported into console.html so the triage backfills
and try-it-yourself rendering get the same hint stack.
Privacy note in the script + route comments: the synthetic data uses
the worker's name as the seed; production should hash worker_id (not
name) to avoid leaking PII to a third-party CDN. The fetch URL itself
is referenced once per pool build, not per-worker.
.gitignore — added data/headshots/face_*.jpg (~100MB for 198 faces;
the manifest + script are tracked). Re-running the script on a fresh
checkout rebuilds the pool from scratch.
Verified end-to-end via playwright on devop.live/lakehouse:
forklift query → 10 worker cards
10/10 with face images (real synthetic headshots, not monograms)
0/10 broken
Alejandro G. Nelson → ?g=man&e=hispanic
Patricia K. Garcia → ?g=woman&e=caucasian
Each name → unique face, deterministic across loads.
Console triage backfills get the same treatment.
File: scripts/staffing/fetch_face_pool.py (162 lines, 5.9 KiB, Python)
#!/usr/bin/env python3
|
|
"""
fetch_face_pool.py — pull N synthetic headshots from
https://thispersondoesnotexist.com/, write to data/headshots/face_NNNN.jpg,
optionally tag each with gender via deepface, emit a JSONL manifest.

Each fetch is a fresh StyleGAN face — no real people. Deterministic
per-worker mapping happens at serve time (mcp-server hashes the worker key
into the pool); this script just builds the pool.

Usage:
    python3 scripts/staffing/fetch_face_pool.py --count 300 --concurrency 3
    python3 scripts/staffing/fetch_face_pool.py --count 50 --no-gender

Re-running is idempotent: existing face_NNNN.jpg files are skipped, and
the manifest is rewritten from disk state.
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
import urllib.request
|
|
import urllib.error
|
|
|
|
# Endpoint requested by fetch_one for every pool slot that is not already on disk.
URL = "https://thispersondoesnotexist.com/"
# User-Agent header value sent with every request to URL.
UA = "Lakehouse/1.0 (face-pool fetch · synthetic-only · no real-person tracking)"
|
|
|
|
|
|
def fetch_one(idx: int, out_dir: str) -> tuple[int, str, bool, str | None]:
    """Fetch one synthetic face into ``out_dir`` unless it is already on disk.

    Args:
        idx: Pool slot; determines the file name ``face_NNNN.jpg``.
        out_dir: Existing directory that holds the pool.

    Returns:
        ``(idx, basename, cached, error)``. ``cached`` is True when a
        plausibly complete file already existed and no request was made.
        ``error`` is None on success, otherwise a short description.
        Network and filesystem failures are reported through ``error``
        rather than raised, so one bad slot cannot abort the pool build.
    """
    # Anything smaller than this is treated as a failed/truncated download.
    min_bytes = 1024
    fname = f"face_{idx:04d}.jpg"
    full = os.path.join(out_dir, fname)

    # Use the same threshold as the download check below, so a file that
    # was accepted on fetch is also accepted as cached on the next run.
    try:
        if os.path.getsize(full) >= min_bytes:
            return idx, fname, True, None
    except OSError:
        pass  # missing or unreadable: fall through and (re)fetch

    # Download to a sibling temp file and rename into place: an interrupted
    # write must never leave a truncated face_NNNN.jpg that a later run
    # would mistake for a cached image.
    tmp = f"{full}.part"
    try:
        req = urllib.request.Request(URL, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as resp:
            blob = resp.read()
        if len(blob) < min_bytes:
            return idx, fname, False, f"response too small ({len(blob)} bytes)"
        # JPEG streams start with the SOI marker FF D8; reject HTML error or
        # challenge pages that arrive with a 200 status and a large body.
        if not blob.startswith(b"\xff\xd8"):
            return idx, fname, False, "response is not a JPEG image"
        with open(tmp, "wb") as f:
            f.write(blob)
        os.replace(tmp, full)
        return idx, fname, False, None
    except urllib.error.URLError as e:
        return idx, fname, False, f"urlerror: {e}"
    except Exception as e:
        return idx, fname, False, f"{type(e).__name__}: {e}"
    finally:
        # Remove a leftover temp file after a failed write; after a
        # successful rename it no longer exists and this is a no-op.
        try:
            os.remove(tmp)
        except OSError:
            pass
|
|
|
|
|
|
def maybe_tag_gender(records: list[dict], out_dir: str) -> dict[str, int]:
    """Give every record a ``gender`` key, using deepface when it imports.

    ``records`` is mutated in place: each entry's ``gender`` becomes
    ``"man"``, ``"woman"`` or ``None``. When analysing a file raises, the
    entry also gets a ``gender_error`` string describing the failure.

    Returns a tally keyed by ``"man"`` / ``"woman"`` / ``"unknown"``. If
    deepface cannot be imported, every record is tagged ``None`` and the
    tally is ``{"unknown": len(records)}``.
    """
    try:
        from deepface import DeepFace  # type: ignore
    except Exception as e:
        print(f" (deepface unavailable: {e}) — pool will mix naturally")
        for rec in records:
            rec["gender"] = None
        return {"unknown": len(records)}

    print(" tagging gender via deepface (CPU; ~0.5-1s per face)…")
    tally: dict[str, int] = {}
    total = len(records)
    for seen, rec in enumerate(records, 1):
        path = os.path.join(out_dir, rec["file"])
        try:
            result = DeepFace.analyze(
                img_path=path,
                actions=["gender"],
                enforce_detection=False,
                silent=True,
            )
            # analyze() may hand back a list of results; only the first is used.
            if isinstance(result, list):
                result = result[0] if result else {}
            label = (result.get("dominant_gender") or "").lower().strip()
            if label.startswith("man"):
                rec["gender"] = "man"
            elif label.startswith("woman"):
                rec["gender"] = "woman"
            else:
                rec["gender"] = None
        except Exception as e:
            # A face that cannot be analysed stays in the pool, untagged.
            rec["gender"] = None
            rec["gender_error"] = f"{type(e).__name__}: {e}"

        bucket = rec["gender"] or "unknown"
        tally[bucket] = tally.get(bucket, 0) + 1
        if seen % 25 == 0:
            print(f" [{seen}/{total}] {tally}")
    return tally
|
|
|
|
|
|
def main():
    """Build or refresh the face pool and rewrite its manifest.

    Reads the command-line flags, fetches every pool slot in parallel
    through ``fetch_one``, optionally runs the gender-tagging pass, writes
    ``manifest.jsonl`` into the output directory, and prints a short
    sha256 fingerprint of the resulting pool.
    """
    default_out = os.path.join(os.path.dirname(__file__), "..", "..", "data", "headshots")
    parser = argparse.ArgumentParser()
    parser.add_argument("--count", type=int, default=300, help="how many faces to maintain in pool")
    parser.add_argument("--out", default=default_out)
    parser.add_argument("--concurrency", type=int, default=3, help="parallel fetches (be polite)")
    parser.add_argument("--no-gender", action="store_true", help="skip deepface gender tagging")
    opts = parser.parse_args()

    total = opts.count
    pool_dir = os.path.realpath(opts.out)
    os.makedirs(pool_dir, exist_ok=True)

    print(f"Fetching {total} faces → {pool_dir}")
    print(f"Source: {URL} (synthetic StyleGAN — no real people)")

    # One slot per pool index, filled out of order as fetches complete.
    slots: list[dict] = [None] * total  # type: ignore
    started = time.time()
    finished = 0
    ok_count = 0
    with ThreadPoolExecutor(max_workers=max(1, opts.concurrency)) as pool:
        pending = [pool.submit(fetch_one, slot, pool_dir) for slot in range(total)]
        for future in as_completed(pending):
            finished += 1
            slot, basename, was_cached, failure = future.result()
            slots[slot] = {"id": slot, "file": basename, "cached": was_cached, "error": failure}
            if not failure:
                ok_count += 1
            if finished % 25 == 0 or finished == total:
                print(f" [{finished}/{total}] {ok_count} ok ({time.time() - started:.1f}s)")

    # Keep only slots that were filled and did not report an error.
    records = [entry for entry in slots if entry is not None and not entry["error"]]
    from_cache = len([entry for entry in records if entry["cached"]])
    print(f"\nPool ready: {len(records)} faces, {from_cache} from cache")

    if opts.no_gender or not records:
        for entry in records:
            entry["gender"] = None
    else:
        print("\nGender-tagging pass:")
        summary = maybe_tag_gender(records, pool_dir)
        print(f" distribution: {summary}")

    # The bookkeeping flags are only meaningful for this run; keep them
    # out of the persisted manifest.
    for entry in records:
        for transient in ("cached", "error"):
            entry.pop(transient, None)

    manifest = os.path.join(pool_dir, "manifest.jsonl")
    with open(manifest, "w") as fh:
        fh.writelines(json.dumps(entry) + "\n" for entry in records)
    print(f"\nManifest: {manifest} ({len(records)} entries)")

    # Short digest over file names and gender tags, for downstream debugging.
    digest = hashlib.sha256()
    for entry in records:
        gender = entry.get("gender") or "?"
        digest.update(f"{entry['file']}|{gender}".encode())
    print(f"Pool fingerprint (sha256): {digest.hexdigest()[:16]}")
|
|
|
|
|
|
# Run the pool build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|