#!/usr/bin/env bash
# attest_pre_identityd_biometric_state — one-shot defense artifact.
#
# Specification: docs/PHASE_1_6_BIPA_GATES.md §2 (Cryptographic
# attestation that no biometric data exists pre-identityd).
#
# Why this exists: in a BIPA dispute, plaintiffs may argue that the
# EXISTENCE of biometric schema fields constitutes constructive notice
# of intent to collect. The defense: prove that no biometric data was
# actually collected from real candidates before the identity service +
# consent gate (Phase 1.6 Gates 1-3) shipped.
#
# This script produces a record of:
#   1. workers_500k.parquet schema has NO column named photo / biometric /
#      face / image, bare or with a _suffix (case-insensitive).
#   2. data/_kb/*.jsonl (searched up to two directory levels deep) and
#      data/_pathway_memory/state.json contain NO base64 image magic
#      bytes (JPEG /9j/, PNG iVBOR), no data:image/ MIME prefixes, and no
#      field-name patterns that imply biometric payload ("photo":,
#      "biometric", "deepface_).
#   3. data/headshots/manifest.jsonl has no row whose "source" field is
#      real / candidate_upload / photo_upload. A missing manifest counts
#      as a pass.
#      NOTE(review): an earlier revision of this header also claimed the
#      row count is compared against the face_pool size and that every
#      row's source is verified to be a synthetic generator. Neither is
#      implemented below; only the deny-list above is checked.
#
# Environment overrides (all optional):
#   OVERRIDE_DATE, WORKERS_PARQUET, KB_DIR, PATHWAY_STATE,
#   HEADSHOTS_MANIFEST
#
# Output:
#   docs/attestations/BIPA_PRE_IDENTITYD_ATTESTATION_YYYY-MM-DD.md
#     — markdown attestation document with all evidence + a SHA-256
#       hash of the evidence section. Ready for J + counsel signature.
#
# Exit codes:
#   0 — clean, attestation written, ready for signature
#   1 — at least one evidence check FAILED. The document is still
#       written, but carries "Status: NOT READY FOR SIGNATURE";
#       investigate before signing.
#       NOTE(review): the previous header said the attestation is NOT
#       written in this case, which never matched the code. Confirm the
#       intended contract with counsel.
#   2 — script error (missing tools, unreadable files, failed writes)

set -uo pipefail

# Print a script-error diagnostic to stderr and exit with status 2.
die() {
  printf '[attest] FAIL: %s\n' "$1" >&2
  exit 2
}

# All relative paths below are resolved from the repository root.
cd -- "$(dirname -- "$0")/../.." || die "cannot cd to repository root"

# Dependency gate: pyarrow is required to read the parquet schema. Fail
# fast with a clear message rather than letting python3 emit a stack
# trace mid-run. (Caught 2026-05-03 kimi scrum WARN python3-reliance.)
if ! python3 -c "import pyarrow" 2>/dev/null; then
  echo "[attest] FAIL: python3 -c 'import pyarrow' failed." >&2
  echo "[attest] pyarrow is required to verify workers_500k.parquet schema." >&2
  echo "[attest] Install with: pip install pyarrow" >&2
  exit 2
fi
command -v sha256sum >/dev/null 2>&1 || die "sha256sum not found on PATH"

DATE="${OVERRIDE_DATE:-$(date -u +%Y-%m-%d)}"
OUT_DIR="docs/attestations"
OUT="$OUT_DIR/BIPA_PRE_IDENTITYD_ATTESTATION_${DATE}.md"
mkdir -p -- "$OUT_DIR" || die "cannot create $OUT_DIR"

WORKERS_PARQUET="${WORKERS_PARQUET:-data/datasets/workers_500k.parquet}"
KB_DIR="${KB_DIR:-data/_kb}"
PATHWAY_STATE="${PATHWAY_STATE:-data/_pathway_memory/state.json}"
HEADSHOTS_MANIFEST="${HEADSHOTS_MANIFEST:-data/headshots/manifest.jsonl}"

# Every scratch file lives in one private directory that the EXIT trap
# removes on any exit path (success, die, or signal-triggered exit).
WORK_DIR=$(mktemp -d) || die "mktemp -d failed"
trap 'rm -rf -- "$WORK_DIR"' EXIT

PASS=0
FAIL=0
EVIDENCE="$WORK_DIR/evidence.md"
: > "$EVIDENCE" || die "cannot create evidence scratch file"

# Append one line of markdown to the evidence buffer.
note() { printf '%s\n' "$1" >> "$EVIDENCE"; }
# Record a passing / failing check in both the counters and the evidence.
mark_pass() { PASS=$((PASS + 1)); note "  - PASS: $1"; }
mark_fail() { FAIL=$((FAIL + 1)); note "  - FAIL: $1"; }

# ── Check 1: workers_500k.parquet schema ────────────────────────────
note "## Check 1 — workers_500k.parquet schema (no biometric columns)"
note ""
note "**Source:** \`$WORKERS_PARQUET\`"
note ""

if [[ ! -r "$WORKERS_PARQUET" ]]; then
  die "cannot read $WORKERS_PARQUET"
fi

# Hash NAME + TYPE + nullability per column, not just names. A schema
# fingerprint over names alone would not invalidate if a column got
# repurposed (e.g. resume_text reused to hold base64 photo bytes under
# its existing name). Including types catches that class of evasion.
# (Caught 2026-05-03 opus scrum WARN on attestation:18.)
#
# The path is passed as argv rather than spliced into the Python source,
# so a quote character in the path cannot break or alter the program.
SCHEMA_PY='
import sys
import pyarrow.parquet as pq

for field in pq.read_schema(sys.argv[1]):
    print(f"{field.name}\t{field.type}\tnullable={field.nullable}")
'
# stderr is kept out of SCHEMA so a warning printed on a successful run
# cannot end up in the evidence or perturb the schema hash.
SCHEMA_ERR="$WORK_DIR/schema.err"
if ! SCHEMA=$(python3 -c "$SCHEMA_PY" "$WORKERS_PARQUET" 2>"$SCHEMA_ERR"); then
  die "schema read error: $(cat -- "$SCHEMA_ERR")"
fi
if [[ -s "$SCHEMA_ERR" ]]; then
  cat -- "$SCHEMA_ERR" >&2
fi

SCHEMA_HASH=$(printf '%s\n' "$SCHEMA" | sha256sum | awk '{print $1}')
if [[ -z "$SCHEMA" ]]; then
  SCHEMA_LINES=0
else
  SCHEMA_LINES=$(printf '%s\n' "$SCHEMA" | awk 'END { print NR }')
fi

note "**Schema columns** ($SCHEMA_LINES total):"
note ""
note '```'
note "$SCHEMA"
note '```'
note ""
note "**Schema SHA-256:** \`$SCHEMA_HASH\`"
note ""

# Forbidden column patterns (case-insensitive). Each schema line is
# "name<TAB>type<TAB>nullable=...", so the match must be made against
# the first tab-separated field only. Anchoring the pattern to the whole
# line lets a column named exactly "photo" (followed by a tab) slip
# through. The full line is reported so the offending type is visible.
FORBIDDEN_COLS=$(
  printf '%s\n' "$SCHEMA" \
    | awk -F'\t' 'tolower($1) ~ /^(photo|biometric|face|image)(_.*)?$/'
)
if [[ -z "$FORBIDDEN_COLS" ]]; then
  mark_pass "no biometric / photo / face / image column present"
else
  mark_fail "forbidden columns present: $FORBIDDEN_COLS"
fi
note ""

# ── Check 2: KB JSONL + pathway state — no base64 image / MIME ──────
note "## Check 2 — KB + pathway memory contain no biometric payloads"
note ""
note "**Sources scanned:**"
note "- \`$KB_DIR/*.jsonl\` (knowledge base)"
note "- \`$PATHWAY_STATE\` (pathway memory state)"
note ""

SCAN_PATHS=()
if [[ -d "$KB_DIR" ]]; then
  # NUL-delimited so unusual file names survive; sorted so the order of
  # any reported hits is reproducible between runs.
  while IFS= read -r -d '' f; do
    SCAN_PATHS+=("$f")
  done < <(find "$KB_DIR" -maxdepth 2 -type f -name '*.jsonl' -print0 | LC_ALL=C sort -z)
fi
if [[ -r "$PATHWAY_STATE" ]]; then
  SCAN_PATHS+=("$PATHWAY_STATE")
elif [[ -e "$PATHWAY_STATE" ]]; then
  # Present but unreadable is not the same as absent: refusing here
  # avoids attesting to a file that was never actually scanned.
  die "cannot read $PATHWAY_STATE"
fi

# Forbidden patterns (extended regex, one per line):
#   data:image/                      — explicit MIME embed
#   "photo"\s*:                      — bare photo field
#   "biometric"                      — field name
#   "deepface_                       — deepface output prefix
#   /9j/[A-Za-z0-9+/=]{40,}          — JPEG base64 magic + length floor (false-positive guard)
#   iVBORw0KGgo[A-Za-z0-9+/=]{20,}   — PNG base64 magic + length floor
PATTERN_FILE="$WORK_DIR/patterns"
cat > "$PATTERN_FILE" <<'PATTERNS'
data:image/
"photo"\s*:
"biometric"
"deepface_
/9j/[A-Za-z0-9+/=]{40,}
iVBORw0KGgo[A-Za-z0-9+/=]{20,}
PATTERNS

HIT_DETAIL="$WORK_DIR/hits"
GREP_ERR="$WORK_DIR/grep.err"
: > "$HIT_DETAIL"
# The ${arr[@]+...} form keeps `set -u` from aborting on an empty array
# under bash releases older than 4.4.
for path in ${SCAN_PATHS[@]+"${SCAN_PATHS[@]}"}; do
  grep -aHEf "$PATTERN_FILE" -- "$path" >> "$HIT_DETAIL" 2>"$GREP_ERR"
  rc=$?
  # grep exits 0 on match and 1 on no match; anything higher is an error
  # and must not be reported as a clean scan.
  if (( rc > 1 )); then
    die "scan error on $path: $(cat -- "$GREP_ERR")"
  fi
done
HITS=$(wc -l < "$HIT_DETAIL" | tr -d '[:space:]')

note "**Files scanned:** ${#SCAN_PATHS[@]}"
note "**Forbidden-pattern hits:** $HITS"
note ""
if [[ "$HITS" -eq 0 ]]; then
  mark_pass "no biometric payload patterns found in scanned files"
else
  mark_fail "$HITS forbidden-pattern hits — see detail below"
  note ""
  note "### Detail (first 20 hits)"
  note ""
  note '```'
  head -20 "$HIT_DETAIL" >> "$EVIDENCE"
  note '```'
fi
note ""

# ── Check 3: headshots manifest is synthetic-only ───────────────────
note "## Check 3 — Headshots manifest is synthetic-only"
note ""
note "**Source:** \`$HEADSHOTS_MANIFEST\`"
note ""

if [[ -e "$HEADSHOTS_MANIFEST" && ! -r "$HEADSHOTS_MANIFEST" ]]; then
  # An unreadable manifest must not be reported as "no manifest".
  die "cannot read $HEADSHOTS_MANIFEST"
fi

if [[ ! -r "$HEADSHOTS_MANIFEST" ]]; then
  note "**SKIP** — manifest not present (no headshot UI deployed)."
  note ""
  mark_pass "no headshots manifest = no headshot data exists at all"
else
  # awk's NR also counts a final row that lacks a trailing newline.
  TOTAL_ROWS=$(awk 'END { print NR }' "$HEADSHOTS_MANIFEST")
  # Deny-list check: count rows whose "source" value marks a real upload.
  # NOTE(review): rows with no "source" field, or with an unrecognised
  # value, are NOT flagged. A positive allow-list of synthetic markers
  # would be stronger, but those marker values are not defined in this
  # script.
  NON_SYNTHETIC=$(grep -cE '"source"[[:space:]]*:[[:space:]]*"(real|candidate_upload|photo_upload)"' -- "$HEADSHOTS_MANIFEST" 2>/dev/null)
  rc=$?
  if (( rc > 1 )); then
    die "scan error on $HEADSHOTS_MANIFEST"
  fi
  # Strip any newlines / whitespace defensively; default to 0 if empty.
  NON_SYNTHETIC=$(printf '%s' "$NON_SYNTHETIC" | tr -d '[:space:]')
  : "${NON_SYNTHETIC:=0}"

  note "**Total rows:** $TOTAL_ROWS"
  note "**Rows tagged real/candidate_upload/photo_upload:** $NON_SYNTHETIC"
  note ""
  if [[ "$NON_SYNTHETIC" = "0" ]]; then
    mark_pass "all $TOTAL_ROWS rows are synthetic (no real-candidate uploads)"
  else
    mark_fail "$NON_SYNTHETIC rows tagged as non-synthetic — investigate"
  fi
fi
note ""

# ── Summary + final hash ────────────────────────────────────────────
TOTAL=$((PASS + FAIL))
note "## Summary"
note ""
note "**$PASS / $TOTAL** evidence checks pass."
note ""
if (( FAIL > 0 )); then
  note "**Status: NOT READY FOR SIGNATURE** — at least one check failed. Resolve before counsel review."
  note ""
fi

# Hash of the evidence section exactly as it is embedded in the document
# below, so a later modification of that section is detectable.
EVIDENCE_HASH=$(sha256sum "$EVIDENCE" | awk '{print $1}')

# ── Render final attestation document ───────────────────────────────
{
  printf '%s\n' \
    "# BIPA Pre-IdentityD Biometric Attestation" \
    "" \
    "**Date:** $DATE" \
    "**Spec:** docs/PHASE_1_6_BIPA_GATES.md §2" \
    "**Generator:** scripts/staffing/attest_pre_identityd_biometric_state.sh" \
    "" \
    "## Purpose" \
    "" \
    "This is a one-time defense artifact establishing that, as of" \
    "$DATE, no biometric identifiers or biometric information" \
    "from real candidates have been collected, processed, or stored" \
    "by the Lakehouse system. It is intended to be signed by J" \
    "(operator of record) and outside counsel, then anchored to a" \
    "tamper-evident store (filesystem with backups + version control)." \
    "" \
    "## Evidence" \
    ""
  cat -- "$EVIDENCE"
  printf '%s\n' \
    "" \
    "---" \
    "" \
    "## Attestation" \
    "" \
    "I, the undersigned, attest that the above evidence accurately" \
    "reflects the state of the Lakehouse system as of $DATE." \
    "No biometric identifiers or biometric information from real" \
    "candidates have been collected, processed, or stored prior to" \
    "the deployment of the Phase 1.6 BIPA pre-launch gates." \
    "" \
    "**Evidence SHA-256:** \`$EVIDENCE_HASH\`" \
    "" \
    "---" \
    "" \
    "**Operator (J):** _______________________________ Date: __________" \
    "" \
    "**Outside counsel:** ___________________________ Date: __________" \
    ""
} > "$OUT" || die "cannot write $OUT"

echo "[attest] $PASS / $TOTAL checks pass — attestation: $OUT"
echo "[attest] evidence SHA-256: $EVIDENCE_HASH"

# Exit 1 when any evidence check failed, 0 otherwise.
if (( FAIL > 0 )); then
  exit 1
fi
exit 0