lakehouse/mcp-server/phase_1_6_gate_4.test.ts
root 4708717f6b phase 1.6 BIPA gates — engineering wave (4 of 7 staged)
Per docs/PHASE_1_6_BIPA_GATES.md. Status table now reflects:

  DONE (engineering-only, no counsel dependency):
  - Gate 4: name→ethnicity inference removed from mcp-server.
    Removal note in search.html:3372 + new Bun absence test
    (mcp-server/phase_1_6_gate_4.test.ts) with 3 assertions:
    walker actually scans files, regex catches synthetic positives,
    no offending DEFINITION patterns in any .html/.ts/.js source.
    3/3 pass.

  ENG-DONE, signature pending:
  - §2 attestation: scripts/staffing/attest_pre_identityd_biometric_state.sh
    runs three checks against the live state:
      1. workers_500k.parquet schema has no biometric/photo/face/image col
      2. data/_kb/*.jsonl + pathway state contain no base64 image magic
         bytes (JPEG /9j/, PNG iVBOR), no data:image/* MIME prefixes,
         no field-name patterns ("photo", "biometric", "deepface_*")
      3. data/headshots/manifest.jsonl is entirely synthetic-tagged
    3/3 evidence checks pass on the live data dir. Generates a
    signed-by-operator+counsel attestation document committed at
    docs/attestations/BIPA_PRE_IDENTITYD_ATTESTATION_2026-05-03.md
    with SHA-256 of the evidence summary so post-signature tampering
    is detectable.

  ENG-STAGED, awaiting counsel review:
  - Gate 1 retention schedule scaffold at
    docs/policies/consent/biometric_retention_schedule_v1.md (BIPA
    §15(a)). Engineering facts (categories, 18-month operational
    ceiling vs 3-year statutory cap, destruction procedure pointer
    to Gate 5 runbook) plus ⚖ COUNSEL markers for the binding text.
  - Gate 2 consent template scaffold at
    docs/policies/consent/biometric_consent_template_v1.md (BIPA
    §15(b)(1)-(3)). Required disclosures + plain-language summary +
    withdrawal procedure + the structured fields the consent UI must
    post to identityd.
  - Gate 5 destruction runbook at docs/runbooks/BIPA_DESTRUCTION_RUNBOOK.md.
    Triggers, pre-destruction checks (incl. chain-verified gate via
    /audit/subject/{id}), procedure (legal-tier endpoint), automatic
    audit row append (subject_audit.v1 with kind=biometric_erasure),
    backup-window disclosure, monthly reporting cadence, audit-trail
    attestation procedure cross-referencing the cross-runtime parity
    probe.

  BLOCKED on engineering design:
  - Gate 3 photo-upload endpoint. Requires identityd photo intake
    design + deepface integration scope. Deferred to its own session.

  DEFERRED:
  - §3 employee training material. Gate 5 runbook §7 may serve as
    substrate; counsel decides whether a separate program is needed.

Calendar bottleneck is now counsel review. Engineering can stage no
further deliverables until either (a) Gate 3's design conversation
happens or (b) counsel completes review of items 1/2/5/6.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 04:38:49 -05:00

131 lines
4.9 KiB
TypeScript

// Phase 1.6 Gate 4 absence test.
//
// Spec: docs/PHASE_1_6_BIPA_GATES.md §1 Gate 4 — Engineering acceptance:
// "Unit test asserts no protected-attribute inference functions exist
// in search.html or any mcp-server module"
//
// What this guards: the FEMALE_NAMES / NAMES_HISPANIC / SURNAMES_* lookup
// tables and the genderFor() / guessEthnicityFromFirstName() / etc.
// inference functions removed 2026-05-03. Re-introduction would re-open
// (1) Title VII / IL Human Rights Act discriminatory-feature risk and
// (2) BIPA's broad-reading "biometric information derived from a biometric
// identifier" pattern when combined with deepface output.
//
// Strategy: walk every .html / .ts / .tsx / .js / .mjs file under
// mcp-server/ and grep-assert that none of them DEFINE the forbidden
// symbols. We deliberately allow the symbol NAMES to appear inside
// comments — search.html has a removal note that names them so future
// readers know what was excised — but we forbid actual definition
// patterns (var / const / let / function / class member / object literal).
import { test, expect } from "bun:test";
import { readdirSync, statSync, readFileSync } from "node:fs";
import { join } from "node:path";
// Identifiers of the name-keyed lookup tables that must never be defined
// again anywhere under mcp-server. Each entry is matched as a whole symbol
// by the definition patterns built further down in this file.
const FORBIDDEN_DATA_TABLES = [
  // Gender-keyed first-name tables
  "FEMALE_NAMES", "MALE_NAMES",

  // Ethnicity-keyed first-name tables
  "NAMES_HISPANIC", "NAMES_BLACK",
  "NAMES_SOUTH_ASIAN", "NAMES_EAST_ASIAN",
  "NAMES_MIDDLE_EASTERN",

  // Ethnicity-keyed surname tables
  "SURNAMES_HISPANIC", "SURNAMES_BLACK",
  "SURNAMES_SOUTH_ASIAN", "SURNAMES_EAST_ASIAN",
  "SURNAMES_MIDDLE_EASTERN",
];
// Identifiers of the inference functions that must never be defined again
// anywhere under mcp-server (gender and ethnicity guessed from a name).
const FORBIDDEN_FUNCTIONS = [
  // Gender inference
  "guessGenderFromFirstName",
  // Ethnicity inference
  "guessEthnicityFromName", "guessEthnicityFromFirstName",
  // Short-named gender helper
  "genderFor",
];
// walkSource: lazily yields the path of every scannable source file
// (.html / .ts / .tsx / .js / .mjs) found beneath `dir`, depth-first.
//
// Entries named "node_modules" or "dist", and any entry whose name starts
// with a dot, are skipped without being descended into. The Gate 4 test
// file is never yielded: it spells out the forbidden tokens as match
// targets, not as definitions.
function* walkSource(dir: string): Generator<string> {
  const skippedNames = new Set(["node_modules", "dist"]);
  const scannableExtension = /\.(html|ts|tsx|js|mjs)$/;

  for (const name of readdirSync(dir)) {
    if (skippedNames.has(name) || name.startsWith(".")) continue;

    const fullPath = join(dir, name);
    if (statSync(fullPath).isDirectory()) {
      yield* walkSource(fullPath);
      continue;
    }

    const isSelf = fullPath.endsWith("phase_1_6_gate_4.test.ts");
    if (scannableExtension.test(name) && !isSelf) {
      yield fullPath;
    }
  }
}
// definitionPatternsFor: returns regexes that match common DEFINITION
// forms of `symbol` in JS/TS/HTML embedded scripts. A bare mention of the
// name (for example inside a removal note) is intentionally NOT matched;
// every pattern requires definition syntax around the symbol.
//
// Forms covered:
//   - var / const / let declarations, with or without a TS type annotation
//   - function declarations
//   - assignment of a function expression or arrow function, including
//     arrows with a return-type annotation or a single bare parameter
//   - class / object method shorthand at the start of a line
//   - object-literal members whose value is a function or arrow
//   - assignment of an array / object literal or a new Set / Map, which is
//     how a lookup table would be re-attached to an existing object
//
// `symbol` is regex-escaped before interpolation so that a name containing
// a metacharacter cannot corrupt (or crash) pattern construction.
function definitionPatternsFor(symbol: string): RegExp[] {
  const sym = symbol.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Optional same-line TypeScript type annotation between a name and `=`.
  const typeAnnotation = "(?::[^=;\\n]*)?";
  return [
    // var / const / let SYMBOL =     (also: const SYMBOL: Type =)
    new RegExp(`\\b(?:var|const|let)\\s+${sym}\\b\\s*${typeAnnotation}=`),
    // function SYMBOL(
    new RegExp(`\\bfunction\\s+${sym}\\s*\\(`),
    // SYMBOL = function(   OR   SYMBOL = (...) =>   OR   SYMBOL = async ...
    // plus typed arrows `(a: T): R =>` and single-parameter arrows `a =>`.
    new RegExp(
      `\\b${sym}\\s*${typeAnnotation}=\\s*(?:` +
        `function\\s*\\(` +
        `|\\([^)]*\\)\\s*(?::[^=\\n]*)?=>` +
        `|async\\s*(?:\\(|function)` +
        `|(?:async\\s+)?[A-Za-z_$][\\w$]*\\s*=>` +
        `)`,
    ),
    // class member: SYMBOL(...) {   (a method declaration)
    new RegExp(`^\\s*${sym}\\s*\\([^)]*\\)\\s*\\{`, "m"),
    // object-literal member: SYMBOL: function ...   OR   SYMBOL: (...) =>
    new RegExp(
      `\\b${sym}\\s*:\\s*(?:async\\s+)?(?:` +
        `function\\b` +
        `|\\([^)]*\\)\\s*(?::[^=\\n]*)?=>` +
        `|[A-Za-z_$][\\w$]*\\s*=>` +
        `)`,
    ),
    // table attached by assignment: SYMBOL = [ ...   / { ...   / new Set|Map
    new RegExp(`\\b${sym}\\s*${typeAnnotation}=\\s*(?:\\[|\\{|new\\s+(?:Set|Map)\\b)`),
  ];
}
// findOffenders: checks `text` against every definition pattern of every
// forbidden symbol (data tables first, then functions) and returns one
// human-readable line per pattern that matched. `filePath` is used only
// to label the report lines. An empty array means the text is clean.
function findOffenders(filePath: string, text: string): string[] {
  const forbiddenSymbols = [...FORBIDDEN_DATA_TABLES, ...FORBIDDEN_FUNCTIONS];
  return forbiddenSymbols.flatMap((name) =>
    definitionPatternsFor(name)
      .filter((re) => re.test(text))
      .map((re) => `${filePath}: definition of ${name} (matched ${re})`),
  );
}
// The gate itself: scan every source file next to this test and fail with
// a full list of offending definitions if any forbidden symbol is defined.
test("Gate 4: no protected-attribute inference DEFINITIONS in mcp-server", () => {
  const violations: string[] = [];
  for (const file of walkSource(import.meta.dir)) {
    const contents = readFileSync(file, "utf8");
    for (const hit of findOffenders(file, contents)) {
      violations.push(hit);
    }
  }

  // Throw with the itemised list first so the failure output names every
  // offending file, not just a count mismatch.
  if (violations.length !== 0) {
    const bullets = violations.map((v) => ` - ${v}`).join("\n");
    throw new Error(
      `Phase 1.6 Gate 4 violation — protected-attribute inference symbols defined in mcp-server:\n` +
        bullets,
    );
  }

  expect(violations.length).toBe(0);
});
// Vacuity guard: the absence assertion is only meaningful if the walker
// really visits files. Should the mcp-server source tree ever go missing,
// this test fails instead of letting the gate pass on an empty scan.
test("Gate 4: walker actually finds source files to scan", () => {
  const scanned = Array.from(walkSource(import.meta.dir));
  expect(scanned.length).toBeGreaterThan(5); // mcp-server has more than 5 source files
});
// Defense in depth: the regexes themselves must catch synthetic positives.
// If a definition pattern ever stops matching real code, the absence test
// would silently pass on actual reintroductions. One sample is included
// per definition form (declaration, function declaration, arrow
// assignment, method shorthand) so that no single pattern can rot
// unnoticed, and a comment-only mention is checked to stay clean so the
// removal note in search.html cannot trip the gate.
test("Gate 4: regex catches a synthetic positive (defense in depth)", () => {
  const synthetic =
    `var NAMES_HISPANIC = ["Maria"];\n` +
    `function guessEthnicityFromFirstName(name) { return "?"; }\n` +
    `genderFor = (name) => "?";\n` +
    `class Probe {\n` +
    `  guessGenderFromFirstName(name) { return "?"; }\n` +
    `}\n`;
  const offenders = findOffenders("synthetic_test_input", synthetic);
  expect(offenders.length).toBeGreaterThanOrEqual(2);
  const expectedSymbols = [
    "NAMES_HISPANIC",
    "guessEthnicityFromFirstName",
    "genderFor",
    "guessGenderFromFirstName",
  ];
  for (const sym of expectedSymbols) {
    expect(offenders.some((o) => o.includes(sym))).toBe(true);
  }

  // Negative control: naming the symbols inside a comment is allowed.
  const commentOnly = `// Removed 2026-05-03: FEMALE_NAMES, genderFor() and related helpers\n`;
  expect(findOffenders("synthetic_comment_input", commentOnly)).toEqual([]);
});