Merge pull request 'Auditor: PR-claim hard-block reviewer (scaffold)' (#1) from auditor/scaffold into main
Commit b6d69b2e82

auditor/README.md (new file, 53 lines)
@@ -0,0 +1,53 @@
# Lakehouse Claim Auditor

A Bun sub-agent that watches open PRs on Gitea, reads the ship-claims
in commit messages and PR bodies, and **hard-blocks** merges when the
code doesn't back the claim.

Rationale: when "compiles + one curl works" gets called "phase shipped,"
placeholder code accumulates. This auditor runs every 90s, fetches
each open PR, and subjects it to four checks:

1. **Static diff** — grep/parse looking for placeholder patterns
2. **Dynamic** — runs the never-before-executed hybrid test fixture
3. **Cloud inference** — asks `gpt-oss:120b` via `/v1/chat` to
   identify gaps in the diff
4. **KB query** — looks up `data/_kb/` + observer for prior failure
   patterns on similar claims

The verdict is assembled and posted to Gitea as:
- A **failing commit status** (hard block — branch protection
  prevents merge)
- A **review comment** explaining every finding

## Run manually

```bash
cd /home/profit/lakehouse
bun run auditor/index.ts
```

Defaults: polls every 90s; stops while an `auditor.paused` file is present.
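
One way to toggle that stop condition from a shell (a sketch; the README above does not pin down the path the poller checks, so the repo root is an assumption here):

```bash
# pause the auditor
touch /home/profit/lakehouse/auditor.paused

# resume polling
rm /home/profit/lakehouse/auditor.paused
```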

## State

- `data/_auditor/state.json` — last-audited head SHA per PR
- `data/_auditor/verdicts/{pr}-{sha}.json` — per-run verdict record

## Where YOU edit

`auditor/policy.ts` — the verdict assembler. Controls which findings
block vs warn vs inform. All other code is mechanical: fetching,
running checks, posting to Gitea.

## Hard-block mechanism

1. Commit status is posted as `failure` with context `lakehouse/auditor`
2. If `main` branch protection requires the `lakehouse/auditor` status
   to pass, Gitea prevents merge
3. When code is fixed and re-audit passes, the status flips to `success`
   and merge unblocks

Enable branch protection (one-time, via Gitea UI or API):
- `POST /repos/profit/lakehouse/branch_protections`
- `{"branch_name": "main", "required_status_checks": {"contexts": ["lakehouse/auditor"]}}`

auditor/audit.ts (new file, 171 lines)
@@ -0,0 +1,171 @@
// Orchestrator — runs all four checks on a PR, assembles a verdict,
// posts to Gitea. This is task #8's integration layer; the poller
// (task #9) calls this once per PR on every fresh head SHA.
//
// Hard-block mechanism: commit status posted with state="failure"
// and context="lakehouse/auditor". If `main` branch protection
// requires that context to pass, merge is physically impossible
// until the auditor re-audits a fixed commit and flips the status
// to "success".
//
// Human-readable reasoning: posted as a PR issue comment (not a
// review — reviews have self-review restrictions on Gitea and the
// auditor currently uses the same PAT as the PR author).

import { readFile, writeFile, mkdir } from "node:fs/promises";
import { join } from "node:path";
import type { PrSnapshot, Verdict, Finding } from "./types.ts";
import { getPrDiff, postCommitStatus, postIssueComment } from "./gitea.ts";
import { parseClaims } from "./claim_parser.ts";
import { assembleVerdict } from "./policy.ts";
import { runStaticCheck } from "./checks/static.ts";
import { runDynamicCheck } from "./checks/dynamic.ts";
import { runInferenceCheck } from "./checks/inference.ts";
import { runKbCheck } from "./checks/kb_query.ts";

const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts";

export interface AuditOptions {
  // Skip the cloud inference call (fast path for iteration). Default false.
  skip_inference?: boolean;
  // Skip the dynamic check (avoid running the hybrid fixture every PR,
  // since it hits live services and mutates playbook state). Default false
  // on `main`-branch-target PRs, true when auditing feature branches
  // where the fixture would pollute state. Caller decides.
  skip_dynamic?: boolean;
  // Skip Gitea posting — useful for dry-runs / local testing.
  // Default false.
  dry_run?: boolean;
}

export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<Verdict> {
  const t0 = Date.now();
  const diff = await getPrDiff(pr.number);
  const { claims } = parseClaims(pr);

  // Run checks in parallel where they don't share mutable state.
  // Static + kb_query + inference are all read-only. Dynamic mutates
  // playbook state (nonce-scoped per run, but still live) so if
  // skip_dynamic is false we still run it in parallel — the mutation
  // is namespaced.
  const [staticFindings, dynamicFindings, inferenceFindings, kbFindings] = await Promise.all([
    runStaticCheck(diff),
    opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
    opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
    runKbCheck(claims),
  ]);

  const allFindings: Finding[] = [
    ...staticFindings,
    ...dynamicFindings,
    ...inferenceFindings,
    ...kbFindings,
  ];

  const duration_ms = Date.now() - t0;
  const metrics = {
    audit_duration_ms: duration_ms,
    findings_total: allFindings.length,
    findings_block: allFindings.filter(f => f.severity === "block").length,
    findings_warn: allFindings.filter(f => f.severity === "warn").length,
    findings_info: allFindings.filter(f => f.severity === "info").length,
    claims_strong: claims.filter(c => c.strength === "strong").length,
    claims_moderate: claims.filter(c => c.strength === "moderate").length,
    claims_weak: claims.filter(c => c.strength === "weak").length,
    claims_total: claims.length,
    diff_bytes: diff.length,
  };

  const verdict = assembleVerdict(allFindings, metrics, pr.number, pr.head_sha);

  await persistVerdict(verdict);

  if (!opts.dry_run) {
    await postToGitea(verdict);
  }

  return verdict;
}

async function persistVerdict(v: Verdict): Promise<void> {
  await mkdir(VERDICTS_DIR, { recursive: true });
  const filename = `${v.pr_number}-${v.head_sha.slice(0, 12)}.json`;
  await writeFile(join(VERDICTS_DIR, filename), JSON.stringify(v, null, 2));
}

export async function postToGitea(v: Verdict): Promise<void> {
  // 1. Commit status — the hard block signal (if branch protection
  //    is configured to require lakehouse/auditor on main).
  const state = v.overall === "approve" ? "success" : "failure";
  await postCommitStatus({
    sha: v.head_sha,
    state,
    context: "lakehouse/auditor",
    description: v.one_liner,
    target_url: "", // no URL yet; could point to a verdicts dashboard
  });

  // 2. Issue comment — the reasoning. Gated so we don't spam the PR
  //    with identical comments on re-audits of the same SHA. Caller
  //    (poller) ensures we only re-audit fresh SHAs, but a dedup
  //    marker inside the body keeps it idempotent if re-run.
  const body = formatReviewBody(v);
  await postIssueComment({ pr_number: v.pr_number, body });
}

function formatReviewBody(v: Verdict): string {
  const byCheck: Record<string, Finding[]> = {};
  for (const f of v.findings) {
    (byCheck[f.check] ||= []).push(f);
  }

  const verdictEmoji =
    v.overall === "approve" ? "✅" :
    v.overall === "request_changes" ? "⚠️" :
    "🛑";

  const lines: string[] = [];
  lines.push(`## Auditor verdict: ${verdictEmoji} \`${v.overall}\``);
  lines.push("");
  lines.push(`**One-liner:** ${v.one_liner}`);
  lines.push(`**Head SHA:** \`${v.head_sha.slice(0, 12)}\``);
  lines.push(`**Audited at:** ${v.audited_at}`);
  lines.push("");

  // Per-check sections, only if the check produced findings.
  const checkOrder = ["static", "dynamic", "inference", "kb_query"] as const;
  for (const check of checkOrder) {
    const fs = byCheck[check] ?? [];
    if (fs.length === 0) continue;
    const bySev = {
      block: fs.filter(f => f.severity === "block").length,
      warn: fs.filter(f => f.severity === "warn").length,
      info: fs.filter(f => f.severity === "info").length,
    };
    lines.push(`<details><summary><b>${check}</b> — ${fs.length} findings (${bySev.block} block, ${bySev.warn} warn, ${bySev.info} info)</summary>`);
    lines.push("");
    for (const f of fs) {
      const mark = f.severity === "block" ? "🛑" : f.severity === "warn" ? "⚠️" : "ℹ️";
      lines.push(`${mark} **${f.severity}** — ${f.summary}`);
      for (const e of f.evidence.slice(0, 3)) {
        lines.push(`  - \`${e.slice(0, 180).replace(/\n/g, " ")}\``);
      }
    }
    lines.push("");
    lines.push("</details>");
    lines.push("");
  }

  lines.push("### Metrics");
  lines.push("```json");
  lines.push(JSON.stringify(v.metrics, null, 2));
  lines.push("```");
  lines.push("");
  lines.push(`<sub>Lakehouse auditor · SHA ${v.head_sha.slice(0, 8)} · re-audit on new commit flips the status automatically.</sub>`);

  return lines.join("\n");
}

function stubFinding(check: "dynamic" | "inference", why: string): Finding[] {
  return [{ check, severity: "info", summary: `${check} check skipped — ${why}`, evidence: [why] }];
}
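
// Illustrative call shape for local iteration (a sketch only — the poller in
// task #9 is the real caller, and the PrSnapshot literal below is a
// hypothetical example value, not a recorded PR):
//
//   const verdict = await auditPr(
//     { number: 1, head_sha: "b6d69b2e82", body: "Phase 45 shipped", commits: [] },
//     { dry_run: true, skip_inference: true, skip_dynamic: true },
//   );
//   console.log(verdict.overall, verdict.findings.length);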

auditor/checks/dynamic.ts (new file, 91 lines)
@@ -0,0 +1,91 @@
// Dynamic execution check — runs the hybrid fixture and maps its
// layer results to auditor Findings.
//
// A layer that fails with a "not implemented / 404 / slice N" error
// gets severity=info (honest placeholder signal). A layer that fails
// any other way gets severity=warn (something actually broke).
// An info-level summary finding is always emitted carrying the real
// numbers — shipped/placeholder phase counts, per-layer latency.

import { runHybridFixture } from "../fixtures/hybrid_38_40_45.ts";
import type { Finding } from "../types.ts";

const PLACEHOLDER_MARKERS = [
  "unimplemented",
  " 404 ", "(404)", " 405 ", "(405)",
  "slice 3", "slice 4", "slice 5",
  "endpoint not built", "not yet",
];

function isPlaceholderFailure(err?: string): boolean {
  if (!err) return false;
  const low = err.toLowerCase();
  return PLACEHOLDER_MARKERS.some(m => low.includes(m.toLowerCase()));
}

export async function runDynamicCheck(): Promise<Finding[]> {
  const findings: Finding[] = [];

  let result;
  try {
    result = await runHybridFixture();
  } catch (e) {
    // Fixture itself crashed — can't run dynamic check at all.
    return [
      {
        check: "dynamic",
        severity: "warn",
        summary: `hybrid fixture crashed before completing: ${(e as Error).message.slice(0, 140)}`,
        evidence: [(e as Error).message],
      },
    ];
  }

  // Per-layer findings for every non-ok layer.
  for (const layer of result.layers) {
    if (layer.ok) continue;
    const placeholder = isPlaceholderFailure(layer.error);
    findings.push({
      check: "dynamic",
      severity: placeholder ? "info" : "warn",
      summary: placeholder
        ? `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) honestly reports unimplemented`
        : `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) failed — not a placeholder, a real failure`,
      evidence: [
        `evidence: ${layer.evidence.slice(0, 160)}`,
        ...(layer.error ? [`error: ${layer.error.slice(0, 160)}`] : []),
        `latency_ms: ${layer.latency_ms}`,
      ],
    });
  }

  // One overall summary with real numbers so the report shows what
  // DID pass plus per-layer timing.
  const metrics_preview = Object.entries(result.real_numbers)
    .slice(0, 10)
    .map(([k, v]) => `${k}=${v}`);
  findings.push({
    check: "dynamic",
    severity: "info",
    summary: `hybrid fixture overall=${result.overall}, shipped [${result.shipped_phases.join(", ")}], placeholder [${result.placeholder_phases.join(", ")}]`,
    evidence: metrics_preview.length > 0 ? metrics_preview : ["no metrics emitted"],
  });

  // If the fixture ran at all but nothing passed, elevate one of the
  // summary findings to warn — something more than "all honest
  // placeholders" is wrong.
  if (result.overall === "fail") {
    findings.push({
      check: "dynamic",
      severity: "warn",
      summary: `hybrid fixture: 0 layers passed (overall=fail)`,
      evidence: [
        "a total fixture fail usually means a precondition service is down",
        "(gateway /health / sidecar / Langfuse /v1/chat) — NOT necessarily",
        "the PR's code problem. Check service status before blaming the PR.",
      ],
    });
  }

  return findings;
}

auditor/checks/inference.ts (new file, 206 lines)
@@ -0,0 +1,206 @@
// Cloud inference check — wraps the proven run_codereview pattern
// from llm_team_ui.py (same 3-stage framing, same cloud model) to
// critique a PR's claims against its diff.
//
// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
// that unit tests missed. This module reuses the reviewer prompt
// shape (bugs / security / performance / style / edge cases) and
// adds claim-vs-diff specific framing.
//
// Call surface: runInferenceCheck(claims, diff) → Finding[].
// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
// with a 15KB diff + claim list).

import type { Claim, Finding } from "../types.ts";

const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
// previously truncated at 15KB causing the reviewer to miss later
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
const CALL_TIMEOUT_MS = 120_000;

export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
  if (claims.length === 0) {
    return [{
      check: "inference",
      severity: "info",
      summary: "no ship-claims extracted — skipping cloud inference",
      evidence: ["parser returned empty claim list; nothing to verify against cloud"],
    }];
  }

  const truncated = diff.length > MAX_DIFF_CHARS
    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
    : diff;

  // Build the reviewer prompt in the same shape as run_codereview's
  // review stage (llm_team_ui.py:10950), adapted for claim verification:
  //   "Task: ..."
  //   "Code: ..."
  //   "Review: bugs/security/perf/style/edge. Provide corrected code."
  // We add: claim list upfront + ask for structured JSON verdict.
  const systemMsg = [
    "You review pull-request diffs against the author's own ship-claims.",
    "For each claim, decide: is it backed by actual code in the diff, or is",
    "it placeholder / aspirational / unwired?",
    "",
    "A claim is BACKED when the diff contains a real code path that delivers",
    "the claimed behavior. A claim is NOT BACKED when:",
    " - the claim asserts functionality but the diff only adds types/fields",
    "   with no consumer",
    " - the claim mentions tests but no test function was added",
    " - the claim claims integration but the integration point is a stub",
    " - the diff contains unimplemented!() / todo!() / TODO comments",
    " - the claim says 'works end-to-end' but the diff has no end-to-end test",
    "",
    "Respond with strict JSON only. No prose before or after. Shape:",
    "{",
    '  "claim_verdicts": [',
    '    {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
    "  ],",
    '  "unflagged_gaps": [',
    '    {"location": "file:line", "summary": "short description"}',
    "  ]",
    "}",
  ].join("\n");

  const userMsg = [
    `Ship-claims the author made (numbered 0..N-1):`,
    claims.map((c, i) => `  ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
    "",
    `Diff:`,
    "```",
    truncated,
    "```",
    "",
    `For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
    `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model: MODEL,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        max_tokens: 3000,
        temperature: 0.2,
        think: true, // T3 overseer should reason — JSON shape is still required
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    // Cloud unreachable → soft-fail. Don't block a PR because the
    // reviewer model is down. Static + dynamic + kb still run.
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud inference unreachable — skipped",
      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
    }];
  }

  if (!resp.ok) {
    return [{
      check: "inference",
      severity: "info",
      summary: `cloud inference returned ${resp.status} — skipped`,
      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
    }];
  }

  const body: any = await resp.json();
  const content: string = body?.choices?.[0]?.message?.content ?? "";
  const usage = body?.usage ?? {};

  const parsed = extractJson(content);
  if (!parsed) {
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud returned unparseable output — skipped",
      evidence: [
        `head: ${content.slice(0, 200)}`,
        `tokens: ${usage.total_tokens ?? "?"}`,
      ],
    }];
  }

  const findings: Finding[] = [];

  // One summary info finding so the verdict layer knows the check ran.
  findings.push({
    check: "inference",
    severity: "info",
    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
    evidence: [
      `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
    ],
  });

  for (const v of parsed.claim_verdicts ?? []) {
    if (v?.backed === false) {
      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
      const claim = claims[idx];
      if (!claim) continue;
      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
        : claim.strength === "moderate" ? "warn"
        : "info";
      findings.push({
        check: "inference",
        severity: sev,
        claim_text: claim.text,
        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
        evidence: [
          `at ${claim.location}`,
          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
        ],
      });
    }
  }

  for (const g of parsed.unflagged_gaps ?? []) {
    findings.push({
      check: "inference",
      severity: "warn",
      summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
      evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
    });
  }

  return findings;
}

// Lift the first balanced JSON object out of the response. Tolerates
// leading prose, code fences, and model reasoning preamble when the
// cloud model ignored "strict JSON only."
function extractJson(text: string): any | null {
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  let depth = 0;
  let start = -1;
  for (let i = 0; i < cleaned.length; i++) {
    const c = cleaned[i];
    if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}") {
      depth--;
      if (depth === 0 && start >= 0) {
        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
      }
    }
  }
  return null;
}
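
// Example of that tolerance (illustrative input, not a recorded model response):
//   extractJson('Here is my answer:\n```json\n{"claim_verdicts": []}\n```')
//   → { claim_verdicts: [] }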

auditor/checks/kb_query.ts (new file, 183 lines)
@@ -0,0 +1,183 @@
// Local-KB check — reads data/_kb/ + data/_observer/ + data/_bot/
// for prior evidence bearing on this PR's claims. Cheap, offline,
// no model calls. The point: if a claim like "Phase X shipped" has
// a historical record of failing on the same signature before, the
// auditor surfaces that pattern before the cloud check has to
// infer it.
//
// What this check reads (all file-backed, append-only or periodic):
//   data/_kb/outcomes.jsonl          — per-scenario outcomes (kb.ts)
//   data/_kb/error_corrections.jsonl — fail→succeed deltas on same sig
//   data/_observer/ops.jsonl         — observer ring → disk stream
//   data/_bot/cycles/*.json          — bot cycle results
//
// Each JSONL line / per-cycle file is small; this check reads tails
// only (last N lines or last M files) to stay cheap on large corpora.

import { readFile, readdir, stat } from "node:fs/promises";
import { join } from "node:path";
import type { Claim, Finding } from "../types.ts";

const KB_DIR = "/home/profit/lakehouse/data/_kb";
const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl";
const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles";
const TAIL_LINES = 500;
const MAX_BOT_CYCLE_FILES = 30;

export async function runKbCheck(claims: Claim[]): Promise<Finding[]> {
  const findings: Finding[] = [];

  // 1. Recent scenario outcomes: are strong-claim-style phrases showing
  //    up alongside failed events? That's "we claimed it worked" +
  //    "it didn't" in the KB.
  const scenarioFindings = await checkScenarioOutcomes(claims);
  findings.push(...scenarioFindings);

  // 2. Error corrections: does any of the claims' text overlap a
  //    recently-observed fail→succeed pair? If yes, add context.
  const correctionFindings = await checkErrorCorrections(claims);
  findings.push(...correctionFindings);

  // 3. Bot cycles: did any prior bot cycle end in tests_failed or
  //    apply_failed on a file this PR is also touching?
  const botFindings = await checkBotCycles();
  findings.push(...botFindings);

  // 4. Observer: count recent error events. High volume = shared
  //    infra problem, worth flagging (context for other findings).
  const obsFindings = await checkObserverStream();
  findings.push(...obsFindings);

  return findings;
}

async function tailJsonl<T = any>(path: string, n: number): Promise<T[]> {
  try {
    const raw = await readFile(path, "utf8");
    const lines = raw.split("\n").filter(l => l.length > 0);
    const slice = lines.slice(-n);
    const out: T[] = [];
    for (const line of slice) {
      try { out.push(JSON.parse(line)); } catch { /* skip malformed */ }
    }
    return out;
  } catch {
    return [];
  }
}

async function checkScenarioOutcomes(_claims: Claim[]): Promise<Finding[]> {
  const outcomes = await tailJsonl<any>(join(KB_DIR, "outcomes.jsonl"), TAIL_LINES);
  if (outcomes.length === 0) return [];
  const totalEvents = outcomes.reduce((s, o) => s + (o.total_events ?? 0), 0);
  const okEvents = outcomes.reduce((s, o) => s + (o.ok_events ?? 0), 0);
  const failRate = totalEvents > 0 ? 1 - okEvents / totalEvents : 0;

  if (totalEvents === 0) {
    return [{
      check: "kb_query",
      severity: "info",
      summary: `KB: no scenario outcomes on file — learning loop is empty`,
      evidence: [`data/_kb/outcomes.jsonl has ${outcomes.length} entries with 0 total events`],
    }];
  }

  const recent = outcomes.slice(-10);
  const recentFailSigs: string[] = recent
    .filter(o => (o.ok_events ?? 0) < (o.total_events ?? 0))
    .map(o => o.sig_hash)
    .filter(s => typeof s === "string");

  const findings: Finding[] = [{
    check: "kb_query",
    severity: failRate > 0.3 ? "warn" : "info",
    summary: `KB: ${outcomes.length} recent scenario runs, ${okEvents}/${totalEvents} events ok (fail rate ${(failRate * 100).toFixed(1)}%)`,
    evidence: [
      `most recent: ${recent[recent.length - 1]?.run_id ?? "?"}`,
      `recent failing sigs: ${recentFailSigs.length > 0 ? recentFailSigs.slice(-3).join(", ") : "none"}`,
    ],
  }];
  return findings;
}

async function checkErrorCorrections(_claims: Claim[]): Promise<Finding[]> {
  const corrections = await tailJsonl<any>(join(KB_DIR, "error_corrections.jsonl"), TAIL_LINES);
  if (corrections.length === 0) return [];
  return [{
    check: "kb_query",
    severity: "info",
    summary: `KB: ${corrections.length} error corrections on file (fail→succeed pairs)`,
    evidence: [
      corrections.length > 0
        ? `most recent: ${String(corrections[corrections.length - 1]?.sig_hash ?? "?").slice(0, 24)}`
        : "none",
    ],
  }];
}

async function checkBotCycles(): Promise<Finding[]> {
  let entries: string[] = [];
  try { entries = await readdir(BOT_CYCLES_DIR); }
  catch { return []; }

  const jsonFiles = entries.filter(e => e.endsWith(".json"));
  if (jsonFiles.length === 0) return [];

  // Sort by mtime desc, take most recent N
  const withStat = await Promise.all(
    jsonFiles.map(async name => {
      try { return { name, mtime: (await stat(join(BOT_CYCLES_DIR, name))).mtimeMs }; }
      catch { return { name, mtime: 0 }; }
    }),
  );
  const recent = withStat.sort((a, b) => b.mtime - a.mtime).slice(0, MAX_BOT_CYCLE_FILES);

  const outcomes: Record<string, number> = {};
  for (const { name } of recent) {
    try {
      const r = JSON.parse(await readFile(join(BOT_CYCLES_DIR, name), "utf8"));
      const o = String(r.outcome ?? "unknown");
      outcomes[o] = (outcomes[o] ?? 0) + 1;
    } catch { /* skip */ }
  }

  const summary = Object.entries(outcomes)
    .sort((a, b) => b[1] - a[1])
    .map(([k, v]) => `${k}=${v}`)
    .join(", ");

  const failCount = (outcomes["tests_failed"] ?? 0) + (outcomes["apply_failed"] ?? 0) + (outcomes["model_failed"] ?? 0);
  return [{
    check: "kb_query",
    severity: failCount > recent.length / 2 ? "warn" : "info",
    summary: `KB: bot recorded ${recent.length} recent cycles — ${summary || "no outcomes parsed"}`,
    evidence: [
      `dir: ${BOT_CYCLES_DIR}`,
      `fail-class total: ${failCount} / ${recent.length}`,
    ],
  }];
}

async function checkObserverStream(): Promise<Finding[]> {
  const ops = await tailJsonl<any>(OBSERVER_OPS, TAIL_LINES);
  if (ops.length === 0) return [];
  const failures = ops.filter(o => o.ok === false).length;
  return [{
    check: "kb_query",
    severity: "info",
    summary: `KB: observer stream ${ops.length} recent ops, ${failures} failures`,
    evidence: [
      `source: ${OBSERVER_OPS}`,
      `by source: ${observerBySource(ops)}`,
    ],
  }];
}

function observerBySource(ops: any[]): string {
  const c: Record<string, number> = {};
  for (const o of ops) {
    const s = String(o.source ?? "unknown");
    c[s] = (c[s] ?? 0) + 1;
  }
  return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty";
}

auditor/checks/static.ts (new file, 159 lines)
@@ -0,0 +1,159 @@
// Static diff check — grep-style, no AST, no LLM. Looks for patterns
// that are high-signal evidence of placeholder code.
//
// Findings are severity-graded:
//   block — explicit non-impl markers (unimplemented!, todo!,
//           panic!("not implemented"), throw new Error("not implemented"))
//   warn  — TODO / FIXME / XXX / HACK comments on added lines,
//           new struct fields with no read-site anywhere in the diff,
//           suspiciously-empty function bodies ({ Ok(()) } / {} when
//           the commit message claims the fn "implements" something)
//   info  — hardcoded "test" / "dummy" / "placeholder" strings in
//           added lines (could be real, just flag for inspection)
//
// Consumes: raw unified diff text from Gitea.

import type { Finding } from "../types.ts";

// Rust + TypeScript patterns that almost always indicate "this is
// not actually implemented yet."
const BLOCK_PATTERNS: Array<{ re: RegExp; why: string }> = [
  { re: /\bunimplemented!\s*\(/, why: "unimplemented!() macro call" },
  { re: /\btodo!\s*\(/, why: "todo!() macro call" },
  { re: /panic!\s*\(\s*"(?:not implemented|TODO|not yet|unimpl)/i, why: "panic! with not-implemented message" },
  { re: /throw\s+new\s+Error\s*\(\s*['"](?:not implemented|TODO|unimpl)/i, why: "throw Error 'not implemented'" },
];

const WARN_COMMENT_PATTERNS: Array<{ re: RegExp; why: string }> = [
  { re: /^\+.*\/\/\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" },
  { re: /^\+.*#\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" },
];

const INFO_HARDCODED_PATTERNS: Array<{ re: RegExp; why: string }> = [
  { re: /"(?:placeholder|dummy|foobar|xxx|replaceme|changeme)"/i, why: "suspicious hardcoded string" },
];

export function runStaticCheck(diff: string): Finding[] {
  const findings: Finding[] = [];

  // Per-file walk: only look at ADDED lines (prefix '+' but not '+++'
  // which is the diff header).
  const perFile = splitDiffByFile(diff);

  for (const [path, lines] of perFile) {
    // Skip diff bookkeeping + pure-delete files
    if (!lines.some(l => l.startsWith("+") && !l.startsWith("+++"))) continue;

    // The auditor's own check files literally contain the BLOCK
    // patterns as regex definitions (BLOCK_PATTERNS in this file,
    // prompt examples in inference.ts). Skipping BLOCK scan on these
    // specific paths prevents the checker from self-flagging its own
    // string literals. WARN/INFO patterns still run — those genuinely
    // could indicate problems in the checker's own code (TODO
    // comments don't self-define).
    const isAuditorCheckerFile = path.startsWith("auditor/checks/") ||
      path.startsWith("auditor/fixtures/");

    for (let idx = 0; idx < lines.length; idx++) {
      const line = lines[idx];
      if (!line.startsWith("+") || line.startsWith("+++")) continue;
      const added = line.slice(1);

      if (!isAuditorCheckerFile) {
        for (const { re, why } of BLOCK_PATTERNS) {
          if (re.test(added)) {
            findings.push({
              check: "static",
              severity: "block",
              summary: `${why} in ${path}`,
              evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
            });
          }
        }
      }
      for (const { re, why } of WARN_COMMENT_PATTERNS) {
        if (re.test(line)) {
          findings.push({
            check: "static",
            severity: "warn",
            summary: `${why} in ${path}`,
            evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
          });
        }
      }
      for (const { re, why } of INFO_HARDCODED_PATTERNS) {
        if (re.test(added)) {
          findings.push({
            check: "static",
            severity: "info",
            summary: `${why} in ${path}`,
            evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
          });
        }
      }
    }

    // "Field added but never read" heuristic — catches exactly the
    // Phase 45 DocRef placeholder pattern. Limited to the diff itself:
    // we're not doing a full-codebase grep here (too noisy; callers
    // elsewhere might exist). The point is: if NEITHER this diff nor
    // any other line in the diff reads the field, the PR is shipping
    // state without a consumer.
    const addedLines = lines.filter(l => l.startsWith("+") && !l.startsWith("+++"))
      .map(l => l.slice(1));
    const newFields = extractNewFields(addedLines);
    for (const field of newFields) {
      const readPattern = new RegExp(`[\\.:]\\s*${escape(field)}\\b|\\b${escape(field)}\\s*:`);
      // The definition line itself matches readPattern — filter it out
      // by requiring at least TWO lines in the diff mention the field
      // (one defines, one reads).
      const hits = addedLines.filter(l => readPattern.test(l));
      if (hits.length < 2) {
        findings.push({
          check: "static",
          severity: "warn",
          summary: `field '${field}' added in ${path} but no read-site in the diff — could be placeholder state without a consumer`,
          evidence: [`${path}: added '${field}' with no reader; rest of diff has ${hits.length - 1} mentions`],
        });
      }
    }
  }

  return findings;
}

function splitDiffByFile(diff: string): Map<string, string[]> {
  const out = new Map<string, string[]>();
  let current: string | null = null;
  let buf: string[] = [];
  for (const line of diff.split(/\r?\n/)) {
    const m = line.match(/^diff --git a\/(\S+) b\/(\S+)/);
    if (m) {
      if (current) out.set(current, buf);
      current = m[2];
      buf = [];
      continue;
    }
    buf.push(line);
  }
  if (current) out.set(current, buf);
  return out;
}

// Extract new `pub name: Type,` fields from added lines. Rust syntax.
// Narrowly-scoped: only matches at the start of a trimmed line,
// requires `pub ` prefix, ignores `pub fn` / `pub struct` / etc.
function extractNewFields(addedLines: string[]): string[] {
  const fields = new Set<string>();
  for (const line of addedLines) {
    const t = line.trim();
    // pub NAME: Type,
    const m = t.match(/^pub\s+(?!fn\b|struct\b|enum\b|mod\b|use\b|trait\b|impl\b|const\b|static\b|type\b)(\w+)\s*:/);
    if (m) fields.add(m[1]);
  }
  return Array.from(fields);
}

function escape(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
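
// Worked example of the field heuristic (hypothetical diff content, not from
// a real PR): an added line `pub doc_refs: Vec<DocRef>,` matches readPattern
// once via `doc_refs\s*:` (the definition itself). With no other added line
// reading `.doc_refs` or mentioning `doc_refs:` the hit count stays below 2
// and a warn finding is emitted; a second added line like `entry.doc_refs`
// would suppress it.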

auditor/claim_parser.ts (new file, 119 lines)
@@ -0,0 +1,119 @@
// Claim parser — reads commit messages + PR body, extracts ship-claims.
//
// A "ship-claim" is any phrase that asserts functionality is working,
// tested, complete, or landed. These are the assertions the downstream
// checks (static/dynamic/inference/kb) try to falsify.
//
// Heuristic approach (regex + strength grading) — intentionally NOT
// using an LLM here. Reason: the inference check already asks a cloud
// model "does this match the claim?". The parser's job is to surface
// the claim substrates, not judge them. Over-engineering the parser
// risks false-negatives when the cloud model was going to catch it
// anyway.

import type { Claim, PrSnapshot } from "./types.ts";

// Strong claims: explicit end-to-end + verification vocabulary
const STRONG_PATTERNS: RegExp[] = [
  /\bverified\s+(end[- ]to[- ]end|live|in\s+production|against)\b/i,
  /\btested\s+(live|end[- ]to[- ]end|against|with)\b/i,
  /\bworks\s+(end[- ]to[- ]end|live|in\s+production)\b/i,
  /\bproduction[- ]ready\b/i,
  /\bfully\s+(functional|wired|working)\b/i,
  /\bphase\s+\d+(\.\d+)?\s+(shipped|complete|done|landed)\b/i,
  /\bground\s+truth\b/i,
  /\bproven\b/i,
];

// Moderate claims: asserted completion or pass but without the strong
// verification qualifier.
const MODERATE_PATTERNS: RegExp[] = [
  /\bshipped\b/i,
  /\blanded\b/i,
  /\bgreen\b/i,
  /\b(tests?\s+)?pass(ing|ed)\b/i,
  /\bcomplet(e|ed)\b/i,
  /\bdone\b/i,
  /\bwired\b/i,
  /\bfixed\b/i,
  /\bworks\b/i,
];

// Weak claims: aspirational or hedged. Usually low-risk but recorded
// for completeness.
const WEAK_PATTERNS: RegExp[] = [
  /\bshould\s+work\b/i,
  /\bexpected\s+to\b/i,
  /\bintended\s+to\b/i,
  /\bwill\s+(work|handle|support)\b/i,
  /\bprobably\b/i,
];

export interface ParsedClaims {
  claims: Claim[];
  commits_scanned: number;
}

export function parseClaims(pr: PrSnapshot): ParsedClaims {
  const claims: Claim[] = [];

  // PR body — every matching line becomes a claim at location "pr_body:N"
  if (pr.body) {
    scanText(pr.body, "pr_body", pr.head_sha, claims);
  }

  // Each commit message gets its own scan.
  for (const c of pr.commits) {
    if (!c.message) continue;
    scanText(c.message, `commit:${c.sha.slice(0, 8)}`, c.sha, claims);
  }

  return { claims, commits_scanned: pr.commits.length };
}

function scanText(text: string, location_prefix: string, commit_sha: string, out: Claim[]): void {
  const lines = text.split(/\r?\n/);
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (line.length < 3) continue;

    // Strong patterns first — if a line matches strong, it's strong,
    // don't double-count as moderate.
    const strong = firstMatch(line, STRONG_PATTERNS);
    if (strong) {
      out.push({
        text: line.trim().slice(0, 200),
        commit_sha,
        location: `${location_prefix}:${i + 1}`,
        strength: "strong",
      });
      continue;
    }
    const moderate = firstMatch(line, MODERATE_PATTERNS);
    if (moderate) {
      out.push({
        text: line.trim().slice(0, 200),
        commit_sha,
        location: `${location_prefix}:${i + 1}`,
        strength: "moderate",
      });
      continue;
    }
    const weak = firstMatch(line, WEAK_PATTERNS);
    if (weak) {
      out.push({
        text: line.trim().slice(0, 200),
        commit_sha,
        location: `${location_prefix}:${i + 1}`,
        strength: "weak",
      });
    }
  }
}

function firstMatch(text: string, patterns: RegExp[]): RegExp | null {
  for (const p of patterns) {
    if (p.test(text)) return p;
  }
  return null;
}
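
// Worked example of the grading (hypothetical commit-message lines, not from
// a real PR):
//   "Phase 45 shipped, verified end-to-end against live services"  → strong
//   "tests passing locally"                                         → moderate
//   "slice 3 should work once the endpoint lands"                   → weak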

auditor/fixtures/cli.ts (new file, 49 lines)
@@ -0,0 +1,49 @@
// Standalone runner for auditor fixtures. Invoke:
//
//   bun run auditor/fixtures/cli.ts hybrid_38_40_45
//
// Prints human-readable per-layer breakdown + JSON result so the
// output can be captured by the dynamic check without re-running.

import { runHybridFixture, type FixtureResult } from "./hybrid_38_40_45.ts";

const fixtureArg = process.argv[2] ?? "hybrid_38_40_45";

let result: FixtureResult;
switch (fixtureArg) {
  case "hybrid_38_40_45":
    result = await runHybridFixture();
    break;
  default:
    console.error(`unknown fixture: ${fixtureArg}`);
    process.exit(2);
}

// Human-readable summary
console.error(""); // blank line to stderr so stdout JSON stays clean
console.error(`─── Fixture: ${result.fixture} ───`);
console.error(`  overall:     ${result.overall.toUpperCase()}`);
console.error(`  shipped:     [${result.shipped_phases.join(", ") || "—"}]`);
console.error(`  placeholder: [${result.placeholder_phases.join(", ") || "—"}]`);
console.error("");
for (const l of result.layers) {
  const mark = l.ok ? "✓" : "✗";
  const phaseStr = `Phase ${l.phase}`.padEnd(11);
  console.error(`  ${mark} ${phaseStr} ${l.layer.padEnd(30)} ${String(l.latency_ms).padStart(6)}ms`);
  if (l.ok) {
    console.error(`      ${l.evidence.slice(0, 200)}`);
  } else {
    console.error(`      ERROR: ${l.error?.slice(0, 240) ?? "unknown"}`);
  }
}
console.error("");
for (const n of result.notes) console.error(`  note: ${n}`);
console.error("");

// Machine-readable output on stdout.
console.log(JSON.stringify(result, null, 2));

// Exit code reflects overall: 0 pass, 2 partial, 1 fail.
// Dynamic check reads this AND the JSON; partial-pass is treated
// as informative (some layers shipped), not blocking on its own.
process.exit(result.overall === "pass" ? 0 : result.overall === "partial_pass" ? 2 : 1);

auditor/fixtures/hybrid_38_40_45.ts (new file, 335 lines)
@@ -0,0 +1,335 @@
|
||||
// The never-run hybrid test fixture. Exercises the full stack end-to-end
|
||||
// across Phase 38 (/v1/chat), Phase 40 (Langfuse tracing), Phase 45
|
||||
// slice 1 (DocRef on playbook), and Phase 45 slice 2 (context7 bridge
|
||||
// drift detection).
|
||||
//
|
||||
// Returns HONEST per-layer results. A layer that's unimplemented or
|
||||
// broken returns ok=false with a real error; overall verdict flags
|
||||
// which phases are genuinely shipped vs still placeholder.
|
||||
//
|
||||
// Deterministic: always runs the same payload so a run's output is
|
||||
// comparable across audits. Uses TEST-prefixed names so playbook
|
||||
// state doesn't get polluted with production-looking fixture data.
|
||||
//
|
||||
// Preconditions:
|
||||
// - gateway up on :3100
|
||||
// - Python sidecar up on :3200
|
||||
// - Ollama local + Ollama Cloud key loaded
|
||||
// - Langfuse up on :3001
|
||||
// - context7 bridge up on :3900 (manual-start per mcp-server/README)
|
||||
// If bridge isn't running, that layer reports ok=false with a
|
||||
// clear error — the fixture doesn't try to auto-start it.
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
|
||||
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
const LANGFUSE = process.env.LANGFUSE_URL ?? "http://localhost:3001";
|
||||
const BRIDGE = process.env.CONTEXT7_BRIDGE_URL ?? "http://localhost:3900";
|
||||
const FIXTURE_ID = "auditor:hybrid_38_40_45:v1";
|
||||
const TEST_WORKER_NAME = "__auditor_test_worker__";
|
||||
const STALE_HASH = "stale-hash-for-drift-proof";
|
||||
// Unique-per-run identifiers so each fixture invocation hits the ADD
|
||||
// path in upsert_entry (not UPDATE/NOOP on a leftover from a prior
|
||||
// run). Catches state pollution + avoids interaction with task #12
|
||||
// (UPDATE branch silently drops doc_refs).
|
||||
const RUN_NONCE = Date.now().toString(36);
|
||||
const TEST_WORKER_NAME_VERSIONED = `__auditor_test_worker_${RUN_NONCE}__`;
|
||||
const TEST_OPERATION_VERSIONED = `TEST: fill: __auditor_${RUN_NONCE}__ x1 in TestCity, TC`;
|
||||
|
||||
export interface LayerResult {
|
||||
layer: string;
|
||||
phase: string;
|
||||
ok: boolean;
|
||||
latency_ms: number;
|
||||
evidence: string;
|
||||
error?: string;
|
||||
// Layer-specific numbers that downstream checks can aggregate.
|
||||
metrics?: Record<string, number | string | boolean>;
|
||||
}
|
||||
|
||||
export interface FixtureResult {
|
||||
fixture: string;
|
||||
ran_at: string;
|
||||
overall: "pass" | "partial_pass" | "fail";
|
||||
layers: LayerResult[];
|
||||
real_numbers: Record<string, number>;
|
||||
shipped_phases: string[];
|
||||
placeholder_phases: string[];
|
||||
notes: string[];
|
||||
}
|
||||
|
||||
// Wraps a layer's async body with timing + structured error capture.
|
||||
// On throw: returns ok=false with the error string as evidence, so
|
||||
// downstream verdict code treats it as a layer failure rather than
|
||||
// a fixture crash.
|
||||
async function measureLayer(
|
||||
layer: string,
|
||||
phase: string,
|
||||
body: () => Promise<{ evidence: string; metrics?: Record<string, number | string | boolean> }>,
|
||||
): Promise<LayerResult> {
|
||||
const t0 = Date.now();
|
||||
try {
|
||||
const { evidence, metrics } = await body();
|
||||
return {
|
||||
layer, phase, ok: true,
|
||||
latency_ms: Date.now() - t0,
|
||||
evidence, metrics,
|
||||
};
|
||||
} catch (e) {
|
||||
return {
|
||||
layer, phase, ok: false,
|
||||
latency_ms: Date.now() - t0,
|
||||
evidence: `FAILED: ${(e as Error).message.slice(0, 200)}`,
|
||||
error: (e as Error).message,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async function langfuseAuth(): Promise<{ pk: string; sk: string } | null> {
|
||||
// Default credentials match mcp-server/tracing.ts. Same sources the
|
||||
// Rust side uses in gateway/src/v1/langfuse_trace.rs.
|
||||
let pk = process.env.LANGFUSE_PUBLIC_KEY ?? "pk-lf-staffing";
|
||||
let sk = process.env.LANGFUSE_SECRET_KEY ?? "sk-lf-staffing-secret";
|
||||
if (!pk || !sk) return null;
|
||||
return { pk, sk };
|
||||
}
|
||||
|
||||
export async function runHybridFixture(): Promise<FixtureResult> {
|
||||
const result: FixtureResult = {
|
||||
fixture: FIXTURE_ID,
|
||||
ran_at: new Date().toISOString(),
|
||||
overall: "fail",
|
||||
layers: [],
|
||||
real_numbers: {},
|
||||
shipped_phases: [],
|
||||
placeholder_phases: [],
|
||||
notes: [],
|
||||
};
|
||||
|
||||
// ========================================================================
|
||||
// Layer 1 — Phase 38: POST /v1/chat returns valid OpenAI shape
|
||||
// ========================================================================
|
||||
// Captured HERE, immediately before the chat layer runs, so layer 2's
|
||||
// Langfuse-trace filter uses the actual moment the chat call was
|
||||
// attempted — not the fixture start time. Earlier draft had a
|
||||
// meaningless ternary returning result.ran_at on both branches; the
|
||||
// LLM-Team codereview (2026-04-22) caught this and flagged it as a
|
||||
// false-negative window on traces created between fixture-start and
|
||||
// chat-fetch.
|
||||
const chat_request_sent_ms = Date.now();
|
||||
|
||||
const l1 = await measureLayer("phase38_chat", "38", async () => {
|
||||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "qwen3.5:latest",
|
||||
messages: [
|
||||
{ role: "system", content: "Respond in 8 words or fewer." },
|
||||
{ role: "user", content: "Name one Docker command." },
|
||||
],
|
||||
max_tokens: 60,
|
||||
temperature: 0.2,
|
||||
}),
|
||||
signal: AbortSignal.timeout(60000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`gateway ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
const content = j.choices?.[0]?.message?.content;
|
||||
if (typeof content !== "string" || content.length === 0) {
|
||||
throw new Error(`empty content — think-false default may not be wired: ${JSON.stringify(j).slice(0, 200)}`);
|
||||
}
|
||||
return {
|
||||
evidence: `content="${content.slice(0, 60)}" tokens=${j.usage?.total_tokens}`,
|
||||
metrics: {
|
||||
prompt_tokens: j.usage?.prompt_tokens ?? 0,
|
||||
completion_tokens: j.usage?.completion_tokens ?? 0,
|
||||
total_tokens: j.usage?.total_tokens ?? 0,
|
||||
content_nonempty: true,
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l1);
|
||||
|
||||
// ========================================================================
|
||||
// Layer 2 — Phase 40: Langfuse trace lands within 3s
|
||||
// ========================================================================
|
||||
// Fire-and-forget tracing means we need a brief sleep before query.
|
||||
await new Promise(res => setTimeout(res, 2500));
|
||||
const l2 = await measureLayer("phase40_langfuse_trace", "40", async () => {
|
||||
const auth = await langfuseAuth();
|
||||
if (!auth) throw new Error("Langfuse credentials not in env");
|
||||
const r = await fetch(`${LANGFUSE}/api/public/traces?limit=5&name=v1.chat:ollama`, {
|
||||
headers: { Authorization: `Basic ${btoa(`${auth.pk}:${auth.sk}`)}` },
|
||||
signal: AbortSignal.timeout(10000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`langfuse ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
const items = Array.isArray(j.data) ? j.data : [];
|
||||
// Filter on the chat-request timestamp captured above. A Langfuse
|
||||
// trace must be newer than the moment we fired /v1/chat to plausibly
|
||||
// belong to our request. Using fixture start time (result.ran_at)
|
||||
// was wrong and could false-negative on slow fixtures.
|
||||
const recent = items.filter((t: any) => Date.parse(t.timestamp) >= chat_request_sent_ms);
|
||||
if (recent.length === 0) {
|
||||
throw new Error(`no v1.chat:ollama trace since ${new Date(chat_request_sent_ms).toISOString()} (${items.length} older traces visible, Langfuse reachable — tracing is not firing)`);
|
||||
}
|
||||
const trace = recent[0];
|
||||
return {
|
||||
evidence: `trace ${trace.id.slice(0, 8)} name=${trace.name} age=${Date.now() - Date.parse(trace.timestamp)}ms`,
|
||||
metrics: {
|
||||
trace_age_ms: Date.now() - Date.parse(trace.timestamp),
|
||||
trace_latency_reported: Number(trace.latency ?? 0),
|
||||
recent_trace_count: recent.length,
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l2);
|
||||
|
||||
// ========================================================================
|
||||
// Layer 3 — Phase 45 slice 1: /seed accepts doc_refs and persists them
|
||||
// ========================================================================
|
||||
// Seed a playbook with doc_refs referencing Docker at a stale hash.
|
||||
// Ignore the cleanup concern for v1 — this fixture pollutes the
|
||||
// in-memory playbook state; operator can retire test entries.
|
||||
const l3 = await measureLayer("phase45_seed_with_doc_refs", "45.1", async () => {
|
||||
const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
operation: TEST_OPERATION_VERSIONED,
|
||||
approach: "auditor hybrid fixture run",
|
||||
context: "doc_refs integration test — retire after audit",
|
||||
endorsed_names: [TEST_WORKER_NAME_VERSIONED],
|
||||
append: true,
|
||||
doc_refs: [
|
||||
{
|
||||
tool: "docker",
|
||||
version_seen: "24.0.7",
|
||||
snippet_hash: STALE_HASH,
|
||||
source_url: "https://context7.com/docker/docs",
|
||||
seen_at: new Date().toISOString(),
|
||||
},
|
||||
],
|
||||
}),
|
||||
signal: AbortSignal.timeout(30000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`seed ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
// Verify doc_refs actually persisted — read state.json directly.
|
||||
// That's the canonical source of truth; the seed response may or
|
||||
// may not echo doc_refs back.
|
||||
const state_raw = await readFile("/home/profit/lakehouse/data/_playbook_memory/state.json", "utf8");
|
||||
const state = JSON.parse(state_raw);
|
||||
const entries = state.entries ?? [];
|
||||
const ours = entries.find((e: any) => (e.endorsed_names ?? []).includes(TEST_WORKER_NAME_VERSIONED));
|
||||
if (!ours) throw new Error(`seeded entry not found in state.json (worker ${TEST_WORKER_NAME_VERSIONED} not present)`);
|
||||
const docRefs = ours.doc_refs ?? [];
|
||||
if (docRefs.length === 0) {
|
||||
throw new Error("entry saved but doc_refs field is empty — the field accepted by the API isn't being persisted");
|
||||
}
|
||||
return {
|
||||
evidence: `playbook ${ours.playbook_id.slice(0, 16)} persisted with doc_refs.length=${docRefs.length}; first tool=${docRefs[0]?.tool} hash=${docRefs[0]?.snippet_hash}`,
|
||||
metrics: {
|
||||
doc_refs_stored: docRefs.length,
|
||||
entries_after: j.entries_after ?? -1,
|
||||
playbook_id: ours.playbook_id,
|
||||
mode: j.outcome?.mode ?? "unknown",
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l3);
|
||||
|
||||
// ========================================================================
|
||||
// Layer 4 — Phase 45 slice 2: context7 bridge detects drift vs stale hash
|
||||
// ========================================================================
|
||||
const l4 = await measureLayer("phase45_bridge_diff", "45.2", async () => {
|
||||
// Health check first — if bridge isn't running, fail with a clear
|
||||
// message rather than mysterious connection refused.
|
||||
try {
|
||||
const h = await fetch(`${BRIDGE}/health`, { signal: AbortSignal.timeout(3000) });
|
||||
if (!h.ok) throw new Error(`bridge /health ${h.status}`);
|
||||
} catch (e) {
|
||||
throw new Error(`context7 bridge at ${BRIDGE} is not reachable (bridge is manual-start; run 'bun run mcp-server/context7_bridge.ts'): ${(e as Error).message}`);
|
||||
}
|
||||
const r = await fetch(`${BRIDGE}/docs/docker/diff?since=${encodeURIComponent(STALE_HASH)}`, {
|
||||
signal: AbortSignal.timeout(30000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`bridge diff ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
if (j.drifted !== true) {
|
||||
throw new Error(`expected drifted=true against ${STALE_HASH}; got ${JSON.stringify(j)}`);
|
||||
}
|
||||
return {
|
||||
evidence: `bridge confirms drift: current=${j.current_snippet_hash} previous=${j.previous_snippet_hash} upstream_updated=${j.last_updated_upstream}`,
|
||||
metrics: {
|
||||
drifted: true,
|
||||
current_hash_length: String(j.current_snippet_hash ?? "").length,
|
||||
library_id: j.library_id,
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l4);
|
||||
|
||||
  // ========================================================================
  // Layer 5 — Phase 45 slice 3: /doc_drift/check/{id} endpoint
  // (EXPECTED TO FAIL — endpoint unimplemented)
  // ========================================================================
  const l5 = await measureLayer("phase45_slice3_drift_flag", "45.3", async () => {
    const pid = String(l3.metrics?.playbook_id ?? "");
    if (!pid) throw new Error("layer 3 did not produce a playbook_id to check");
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/doc_drift/check/${encodeURIComponent(pid)}`, {
      method: "POST",
      signal: AbortSignal.timeout(10000),
    });
    if (r.status === 404 || r.status === 405) {
      // This is the HONEST signal: the endpoint doesn't exist yet.
      // Fail this layer with a clear marker — not a silent pass.
      throw new Error(`endpoint unimplemented (${r.status}) — Phase 45 slice 3 is still placeholder. doc_refs are persisted (layer 3) and the bridge can answer drift questions (layer 4), but nothing ties them together into a playbook flag yet.`);
    }
    if (!r.ok) throw new Error(`drift/check ${r.status}: ${await r.text()}`);
    const j: any = await r.json();
    // If the endpoint DID exist, assert the flag got set.
    if (!j.flagged) {
      throw new Error(`endpoint responded but flag not set: ${JSON.stringify(j).slice(0, 200)}`);
    }
    return {
      evidence: `playbook flagged: ${JSON.stringify(j).slice(0, 160)}`,
      metrics: { flagged: true },
    };
  });
  result.layers.push(l5);

  // ========================================================================
  // Verdict assembly
  // ========================================================================
  for (const l of result.layers) {
    if (l.ok) result.shipped_phases.push(l.phase);
    else result.placeholder_phases.push(l.phase);
    result.real_numbers[`${l.layer}_latency_ms`] = l.latency_ms;
    if (l.metrics) {
      for (const [k, v] of Object.entries(l.metrics)) {
        if (typeof v === "number") result.real_numbers[`${l.layer}.${k}`] = v;
      }
    }
  }

  const anyFail = result.placeholder_phases.length > 0;
  const allPass = result.layers.every(l => l.ok);
  result.overall = allPass ? "pass" : (result.shipped_phases.length > 0 ? "partial_pass" : "fail");

  if (!l5.ok) {
    result.notes.push(
      "Phase 45 slice 3 (doc_drift/check endpoint) is the expected-fail layer. " +
      "Layer 5 failing with a 404/405 is the CORRECT honest signal — don't mask this " +
      "to get a green result. When slice 3 ships, layer 5 flips to ok=true automatically."
    );
  }
  if (anyFail) {
    result.notes.push(
      `${result.placeholder_phases.length} layer(s) failed. Check evidence per layer for specifics.`
    );
  }

  return result;
}
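For orientation, here is a minimal sketch of how this fixture's result object might be consumed from a one-off runner. The export name `runHybridFixture` and the fixture filename are assumptions made for illustration; only the `overall`, `notes`, and `real_numbers` fields are taken from the assembly code above.

```ts
// Hypothetical runner (not part of this PR).
import { runHybridFixture } from "./hybrid_fixture.ts"; // export name assumed

const result = await runHybridFixture();
console.log(JSON.stringify(result.real_numbers, null, 2));
console.log(`overall=${result.overall}`);
for (const note of result.notes) console.log(`note: ${note}`);

// While Phase 45 slice 3 is unimplemented, "partial_pass" is the expected
// outcome: layer 5 fails by design, the other layers should pass.
process.exit(result.overall === "fail" ? 1 : 0);
```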
145
auditor/gitea.ts
Normal file
@ -0,0 +1,145 @@
// Gitea API client. Minimal surface — only what the auditor needs:
// list open PRs, get commits + files for a PR, fetch a diff, post a
// commit status, post a review or issue comment.
//
// Auth: reads the PAT from ~/.git-credentials (set up by the credential
// helper flow in the 2026-04-22 session). Gitea's "token" auth scheme
// matches what `git fetch` is already using.

import { readFile } from "node:fs/promises";
import type { PrSnapshot } from "./types.ts";

const HOST = process.env.GITEA_HOST ?? "https://git.agentview.dev";
const OWNER = "profit";
const REPO = "lakehouse";
const CRED_FILE = "/home/profit/.git-credentials";

let cachedPat: string | null = null;

async function getPat(): Promise<string> {
  if (cachedPat) return cachedPat;
  const raw = await readFile(CRED_FILE, "utf8");
  for (const line of raw.split("\n")) {
    const m = line.match(/^https:\/\/[^:]+:([^@]+)@git\.agentview\.dev/);
    if (m) { cachedPat = m[1]; return m[1]; }
  }
  throw new Error(`no Gitea PAT in ${CRED_FILE}`);
}

async function giteaFetch(path: string, init: RequestInit = {}): Promise<Response> {
  const pat = await getPat();
  const url = `${HOST}/api/v1${path}`;
  const headers = new Headers(init.headers);
  headers.set("Authorization", `token ${pat}`);
  if (init.body && !headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  return fetch(url, { ...init, headers, signal: AbortSignal.timeout(20000) });
}

export async function listOpenPrs(): Promise<PrSnapshot[]> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls?state=open&page=1&limit=50`);
  if (!r.ok) throw new Error(`listOpenPrs ${r.status}: ${await r.text()}`);
  const rows = (await r.json()) as any[];
  return Promise.all(rows.map(row => snapshotFromPr(row)));
}

export async function getPrSnapshot(num: number): Promise<PrSnapshot> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}`);
  if (!r.ok) throw new Error(`getPr ${num} ${r.status}: ${await r.text()}`);
  return snapshotFromPr((await r.json()) as any);
}

async function snapshotFromPr(row: any): Promise<PrSnapshot> {
  const num = row.number;
  const commitsResp = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}/commits`);
  const commits = commitsResp.ok ? ((await commitsResp.json()) as any[]) : [];
  const filesResp = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}/files`);
  const files = filesResp.ok ? ((await filesResp.json()) as any[]) : [];
  return {
    number: num,
    head_sha: row.head?.sha ?? "",
    base_sha: row.base?.sha ?? "",
    title: row.title ?? "",
    body: row.body ?? "",
    state: row.state === "open" ? "open" : (row.merged ? "merged" : "closed"),
    author: row.user?.login ?? "",
    commits: commits.map(c => ({
      sha: (c.sha ?? "").slice(0, 12),
      message: c.commit?.message ?? "",
      author: c.commit?.author?.name ?? "",
    })),
    files: files.map(f => ({
      path: f.filename ?? "",
      additions: f.additions ?? 0,
      deletions: f.deletions ?? 0,
    })),
  };
}

/// Returns the unified diff text of the PR. Used by static checks.
export async function getPrDiff(num: number): Promise<string> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}.diff`);
  if (!r.ok) throw new Error(`getDiff ${num} ${r.status}: ${await r.text()}`);
  return await r.text();
}

/// Hard-block mechanism: post a failing commit status on the PR head
/// SHA. Branch protection (if enabled on `main`) treats this as a
/// required-check fail and prevents merge. The description is shown
/// in the Gitea UI next to the red X.
export async function postCommitStatus(args: {
  sha: string;
  state: "success" | "pending" | "failure" | "error";
  context: string;
  description: string;
  target_url?: string;
}): Promise<void> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/statuses/${args.sha}`, {
    method: "POST",
    body: JSON.stringify({
      state: args.state,
      context: args.context,
      description: args.description.slice(0, 140),
      target_url: args.target_url ?? "",
    }),
  });
  if (!r.ok) throw new Error(`postCommitStatus ${r.status}: ${await r.text()}`);
}

/// Post a review comment. Gitea typically blocks self-review
/// (author posting a review on their own PR). Prefer
/// `postIssueComment` when running with the author's PAT.
export async function postReview(args: {
  pr_number: number;
  commit_id: string;
  body: string;
  event: "APPROVE" | "REQUEST_CHANGES" | "COMMENT";
}): Promise<void> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${args.pr_number}/reviews`, {
    method: "POST",
    body: JSON.stringify({
      commit_id: args.commit_id,
      body: args.body,
      event: args.event,
    }),
  });
  if (!r.ok) throw new Error(`postReview ${r.status}: ${await r.text()}`);
}

/// Plain issue comment. Works with the auditor's own PAT because
/// Gitea allows authors to comment on their own PRs (just not
/// review them). The auditor uses this for the reasoning body; the
/// actual block signal is the commit status.
export async function postIssueComment(args: {
  pr_number: number;
  body: string;
}): Promise<{ id: number; html_url: string }> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/issues/${args.pr_number}/comments`, {
    method: "POST",
    body: JSON.stringify({ body: args.body }),
  });
  if (!r.ok) throw new Error(`postIssueComment ${r.status}: ${await r.text()}`);
  const j = await r.json() as any;
  return { id: j.id, html_url: j.html_url };
}
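A sketch of how a caller could turn a Verdict into the two Gitea side effects this client exposes: the failing commit status that does the hard blocking, and the plain issue comment that carries the reasoning. The real wiring lives in auditor/audit.ts; the mapping from `overall` to status state below is illustrative.

```ts
// Illustrative only; the real orchestration is in auditor/audit.ts.
import { postCommitStatus, postIssueComment } from "./gitea.ts";
import type { Verdict } from "./types.ts";

export async function publishVerdict(v: Verdict): Promise<void> {
  // Hard block: anything short of "approve" is posted as a failing
  // required check under the lakehouse/auditor context.
  await postCommitStatus({
    sha: v.head_sha,
    state: v.overall === "approve" ? "success" : "failure",
    context: "lakehouse/auditor",
    description: v.one_liner, // postCommitStatus truncates to 140 chars
  });

  // Human-readable reasoning as a plain issue comment (works even when
  // the auditor shares the PR author's PAT).
  const lines = v.findings.map(f => `- [${f.severity}] ${f.check}: ${f.summary}`);
  await postIssueComment({
    pr_number: v.pr_number,
    body: `Auditor verdict: **${v.overall}** (${v.one_liner})\n\n${lines.join("\n")}`,
  });
}
```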
147
auditor/index.ts
Normal file
@ -0,0 +1,147 @@
// Auditor poller — the top-level entry. Polls Gitea for open PRs on
// a fixed interval, dedupes by head SHA, runs audit + posts verdict
// for each new (pr, sha) pair.
//
// Run manually:
//   bun run auditor/index.ts
//
// Stop:
//   touch auditor.paused   (cycles are skipped while the file exists)
//   pkill -f auditor/index.ts   (kills in-flight)
//
// State:
//   data/_auditor/state.json — last-audited SHA per PR
//   data/_auditor/verdicts/{id}.json — per-run verdict records
//
// This entry runs forever. A systemd unit would wrap it once the
// workflow is trusted (same pattern as mcp-server, observer).

import { readFile, writeFile, mkdir, access } from "node:fs/promises";
import { listOpenPrs } from "./gitea.ts";
import { auditPr } from "./audit.ts";

const POLL_INTERVAL_MS = 90_000; // 90s — enough budget for audit runs to complete
const PAUSE_FILE = "/home/profit/lakehouse/auditor.paused";
const STATE_FILE = "/home/profit/lakehouse/data/_auditor/state.json";

interface State {
  // Map: PR number → last-audited head SHA. Lets us dedupe audits
  // across restarts (poller can crash/restart without re-auditing
  // all open PRs from scratch).
  last_audited: Record<string, string>;
  started_at: string;
  cycles_total: number;
  cycles_skipped_paused: number;
  audits_run: number;
  last_cycle_at?: string;
}

async function fileExists(path: string): Promise<boolean> {
  try { await access(path); return true; } catch { return false; }
}

async function loadState(): Promise<State> {
  try {
    const raw = await readFile(STATE_FILE, "utf8");
    const s = JSON.parse(raw);
    return {
      last_audited: s.last_audited ?? {},
      started_at: s.started_at ?? new Date().toISOString(),
      cycles_total: s.cycles_total ?? 0,
      cycles_skipped_paused: s.cycles_skipped_paused ?? 0,
      audits_run: s.audits_run ?? 0,
      last_cycle_at: s.last_cycle_at,
    };
  } catch {
    return {
      last_audited: {},
      started_at: new Date().toISOString(),
      cycles_total: 0,
      cycles_skipped_paused: 0,
      audits_run: 0,
    };
  }
}

async function saveState(s: State): Promise<void> {
  await mkdir("/home/profit/lakehouse/data/_auditor", { recursive: true });
  await writeFile(STATE_FILE, JSON.stringify(s, null, 2));
}

async function runCycle(state: State): Promise<State> {
  state.cycles_total += 1;
  state.last_cycle_at = new Date().toISOString();

  if (await fileExists(PAUSE_FILE)) {
    state.cycles_skipped_paused += 1;
    console.log(`[auditor] cycle ${state.cycles_total}: paused (${PAUSE_FILE} present)`);
    return state;
  }

  let prs;
  try {
    prs = await listOpenPrs();
  } catch (e) {
    console.error(`[auditor] listOpenPrs failed: ${(e as Error).message}`);
    return state;
  }

  console.log(`[auditor] cycle ${state.cycles_total}: ${prs.length} open PR(s)`);

  for (const pr of prs) {
    const last = state.last_audited[String(pr.number)];
    if (last === pr.head_sha) {
      console.log(`[auditor] skip PR #${pr.number} (SHA ${pr.head_sha.slice(0, 8)} already audited)`);
      continue;
    }
    console.log(`[auditor] audit PR #${pr.number} (${pr.head_sha.slice(0, 8)}) — ${pr.title.slice(0, 60)}`);
    try {
      // Skip dynamic by default: it mutates live playbook state, and
      // re-running it on every PR update would pollute quickly. The
      // operator can run the dynamic check manually via
      // `bun run auditor/fixtures/cli.ts`, or set LH_AUDITOR_RUN_DYNAMIC=1
      // to opt in.
      const run_dynamic = process.env.LH_AUDITOR_RUN_DYNAMIC === "1";
      const verdict = await auditPr(pr, {
        skip_dynamic: !run_dynamic,
        skip_inference: process.env.LH_AUDITOR_SKIP_INFERENCE === "1",
      });
      console.log(`[auditor] verdict=${verdict.overall} findings=${verdict.metrics.findings_total} (block=${verdict.metrics.findings_block} warn=${verdict.metrics.findings_warn})`);
      state.last_audited[String(pr.number)] = pr.head_sha;
      state.audits_run += 1;
    } catch (e) {
      console.error(`[auditor] audit failed: ${(e as Error).message}`);
    }
  }

  return state;
}

async function main(): Promise<void> {
  console.log(`[auditor] starting poller — interval ${POLL_INTERVAL_MS / 1000}s`);
  console.log(`[auditor] pause file: ${PAUSE_FILE}`);
  console.log(`[auditor] state file: ${STATE_FILE}`);

  let state = await loadState();
  console.log(`[auditor] loaded state: ${Object.keys(state.last_audited).length} PRs previously audited, ${state.cycles_total} cycles so far`);

  // Single-shot mode for CLI testing: `bun run auditor/index.ts --once`
  const once = process.argv.includes("--once");
  if (once) {
    state = await runCycle(state);
    await saveState(state);
    console.log(`[auditor] single-shot complete. total audits: ${state.audits_run}`);
    return;
  }

  // Loop.
  while (true) {
    state = await runCycle(state);
    await saveState(state);
    await new Promise(res => setTimeout(res, POLL_INTERVAL_MS));
  }
}

main().catch(e => {
  console.error("[auditor] fatal:", e);
  process.exit(1);
});
62
auditor/policy.ts
Normal file
@ -0,0 +1,62 @@
// ═══════════════════════════════════════════════════════════════════
// YOU WRITE THIS FILE. Policy decides what blocks vs what's a comment.
// Defaults are opinionated on the "stop clicking past placeholder"
// side — easier to loosen than to tighten when you're watching the
// auditor behave in live PRs.
// ═══════════════════════════════════════════════════════════════════

import type { Finding, Verdict } from "./types.ts";

/// Translate the four-check output into a single verdict. This is the
/// single pane of glass the auditor operates on — tune thresholds here.
export function assembleVerdict(
  findings: Finding[],
  metrics: Record<string, number>,
  pr_number: number,
  head_sha: string,
): Verdict {
  const blocking = findings.filter(f => f.severity === "block");
  const warning = findings.filter(f => f.severity === "warn");

  let overall: Verdict["overall"];
  let one_liner: string;

  if (blocking.length > 0) {
    overall = "block";
    one_liner = `${blocking.length} blocking issue${blocking.length > 1 ? "s" : ""}: ${blocking[0].summary}`;
  } else if (warning.length >= 3) {
    // Three or more warnings is a block — death by a thousand cuts.
    overall = "block";
    one_liner = `${warning.length} warnings — see review`;
  } else if (warning.length > 0) {
    overall = "request_changes";
    one_liner = warning[0].summary;
  } else {
    overall = "approve";
    one_liner = `all checks passed (${findings.length} findings, all info)`;
  }

  return {
    pr_number,
    head_sha,
    audited_at: new Date().toISOString(),
    overall,
    findings,
    metrics,
    one_liner,
  };
}

/// Which strength-of-claim warrants which severity when evidence is
/// weak? A "Phase X shipped" claim with zero integration tests is a
/// blocker. A "should work" claim with no test is a warn.
export function severityFromClaimEvidence(
  claim_strength: "weak" | "moderate" | "strong",
  evidence_grade: "none" | "partial" | "full",
): "info" | "warn" | "block" {
  if (evidence_grade === "full") return "info";
  if (claim_strength === "strong" && evidence_grade === "none") return "block";
  if (claim_strength === "strong" && evidence_grade === "partial") return "warn";
  if (claim_strength === "moderate" && evidence_grade === "none") return "warn";
  return "info";
}
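A hypothetical call site for the two helpers above, showing one claim being graded and rolled into a verdict. The claim text, PR number, and evidence strings are invented for illustration; the types and function signatures are the ones defined in this PR.

```ts
import { assembleVerdict, severityFromClaimEvidence } from "./policy.ts";
import type { Claim, Finding } from "./types.ts";

// Invented claim a static check might have pulled from a commit message.
const claim: Claim = {
  text: "Phase 45 slice 3 shipped end-to-end",
  commit_sha: "abc123def456",
  location: "commit_message",
  strength: "strong",
};

// A strong claim with no supporting evidence grades as "block" under the
// defaults in severityFromClaimEvidence.
const finding: Finding = {
  check: "static",
  severity: severityFromClaimEvidence(claim.strength, "none"),
  claim_text: claim.text,
  summary: "claims slice 3 shipped, but the diff adds no endpoint or test",
  evidence: ["diff touches docs only; /doc_drift/check/* not present"],
};

const verdict = assembleVerdict([finding], { findings_total: 1 }, 7, "abc123def456");
console.log(verdict.overall, verdict.one_liner); // block  1 blocking issue: ...
```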
65
auditor/types.ts
Normal file
@ -0,0 +1,65 @@
// Shared types for the claim-auditor. Every field exists for a reason;
// if something can't be verified from a check, it goes into `evidence`
// so the verdict is inspectable, not a black box.

export type CheckKind = "static" | "dynamic" | "inference" | "kb_query";

export type Severity = "info" | "warn" | "block";

export interface Claim {
  // Verbatim phrase that raised the claim — e.g. "Phase 38 shipped",
  // "verified end-to-end", "works after restart". Used as the "what
  // does the author assert" input to downstream checks.
  text: string;
  // Where it came from. `commit_sha` is the short hash; `location`
  // is a file:line for in-diff claims, or "pr_body" / "commit_message".
  commit_sha: string;
  location: string;
  // Heuristic rating of how strong the claim is. "green+tested"
  // is strong; "should work" is weak. Drives sensitivity — stronger
  // claims get harder-blocked on weak evidence.
  strength: "weak" | "moderate" | "strong";
}

export interface Finding {
  check: CheckKind;
  severity: Severity;
  claim_text?: string;
  // Free-form short description: "field added but never read", "no
  // test covers this code path", "cloud model says placeholder".
  summary: string;
  // Concrete evidence: file paths, line numbers, log excerpts, test
  // output, cloud-model verdict. No handwaving.
  evidence: string[];
}

export interface Verdict {
  pr_number: number;
  head_sha: string;
  audited_at: string;
  overall: "approve" | "request_changes" | "block";
  findings: Finding[];
  // Real numbers that downstream policy can gate on. e.g. if the
  // hybrid test produced latency numbers or token counts, they
  // surface here so /auditor/history is queryable.
  metrics: Record<string, number>;
  // Short one-line justification for the `overall` verdict; it is
  // posted as the commit-status description in Gitea, so it has to
  // fit in 140 chars.
  one_liner: string;
}

export interface PrSnapshot {
  number: number;
  head_sha: string;
  base_sha: string;
  title: string;
  body: string;
  state: "open" | "closed" | "merged";
  author: string;
  // Array of commit messages in the PR (not diffs — those are
  // fetched on-demand per-check).
  commits: Array<{ sha: string; message: string; author: string }>;
  // File paths touched by the PR, with lines-added / lines-removed.
  files: Array<{ path: string; additions: number; deletions: number }>;
}
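To make the shape concrete, here is a hedged example of one verdict record as it might be written under data/_auditor/verdicts/ (see the poller header above). Every value is invented; only the structure follows the Verdict and Finding interfaces.

```ts
import type { Verdict } from "./types.ts";

// Invented values; the shape is what matters.
const example: Verdict = {
  pr_number: 7,
  head_sha: "abc123def4567890",
  audited_at: "2026-04-22T12:00:00.000Z",
  overall: "request_changes",
  findings: [
    {
      check: "kb_query",
      severity: "warn",
      summary: "a similar 'works after restart' claim failed twice before",
      evidence: ["data/_kb/restart-regression-notes.md"], // invented path
    },
  ],
  metrics: { findings_total: 1, findings_block: 0, findings_warn: 1 },
  one_liner: "1 warning: prior failures on similar restart claims",
};

console.log(JSON.stringify(example, null, 2));
```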