profit 6e39d8778f
All checks were successful
lakehouse/auditor all checks passed (3 findings, all info)
Cohesion: Python inventory + integration plan + Phase A verdict indexing
Three artifacts in one PR:

1. docs/PYTHON_INVENTORY.md — every .py file in the repo classified:
   Production (sidecar routers + 3 systemd services), Documented
   (kb_measure, kb_staffer_report), Manual (one-off tools), Dead
   (sidecar/sidecar/lab_ui.py + pipeline_lab.py are genuinely
   not imported anywhere).

2. docs/COHESION_INTEGRATION_PLAN.md — the "smarter DB" loop J
   called out as missing. Six phases A-F. Phase A ships here; B-F
   are named + sequenced for follow-up PRs. Each phase adds ONE
   wire of the loop; no single PR does them all.

3. Phase A wire (auditor verdicts → observer + KB):
   - auditor/audit.ts: after assembleVerdict, fire-and-forget POST
     to :3800/event with source="auditor" AND append to
     data/_kb/outcomes.jsonl with kind="audit". Errors log + drop
     — the verdict is still on disk at _auditor/verdicts/.
   - mcp-server/observer.ts: extend source union to include
     "auditor" | "bot" (was "mcp" | "scenario" only, which silently
     coerced my first auditor POST to source="scenario"). Accept
     body.ok OR body.success. Accept body.audit_duration_ms as a
     fallback for duration_ms. Uses body.one_liner as
     output_summary when set.

Live-verified after observer restart:
   re-audit PR #6 → verdict=request_changes, 4 findings (1 warn)
     observer: by_source={'auditor': 1}  (previously coerced to 'scenario')
     _kb/outcomes.jsonl tail: kind=audit sig=pr6-7fe47bab
       pr=6 overall=request_changes

The shape of the loop is now visible to downstream consumers. Phase
B (auditor's kb_query check reads these audit rows for history)
lands in a follow-up PR. Phases C–F follow the same pattern.

NOT in this PR:
- Actually deleting lab_ui.py + pipeline_lab.py (operator decision,
  called out in the inventory doc)
- Cleaning up the 5 overlapping Python scripts (same)
- Phases B-F of the cohesion plan (separate PRs per wire)
- Integration test that asserts "smarter DB" across runs (Phase F)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 17:22:42 -05:00

244 lines
9.4 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Orchestrator — runs all four checks on a PR, assembles a verdict,
// posts to Gitea. This is task #8's integration layer; the poller
// (task #9) calls this once per PR on every fresh head SHA.
//
// Hard-block mechanism: commit status posted with state="failure"
// and context="lakehouse/auditor". If `main` branch protection
// requires that context to pass, merge is physically impossible
// until the auditor re-audits a fixed commit and flips the status
// to "success".
//
// Human-readable reasoning: posted as a PR issue comment (not a
// review — reviews have self-review restrictions on Gitea and the
// auditor currently uses the same PAT as the PR author).
import { appendFile, mkdir, readFile, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import type { PrSnapshot, Verdict, Finding } from "./types.ts";
import { getPrDiff, postCommitStatus, postIssueComment } from "./gitea.ts";
import { parseClaims } from "./claim_parser.ts";
import { assembleVerdict } from "./policy.ts";
import { runStaticCheck } from "./checks/static.ts";
import { runDynamicCheck } from "./checks/dynamic.ts";
import { runInferenceCheck } from "./checks/inference.ts";
import { runKbCheck } from "./checks/kb_query.ts";
const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts";
/** Options controlling which checks run and whether results are posted. */
export interface AuditOptions {
/** Skip the cloud inference call (fast path for iteration). Default false. */
skip_inference?: boolean;
/**
 * Skip the dynamic check (avoid running the hybrid fixture every PR,
 * since it hits live services and mutates playbook state). Default false
 * on `main`-branch-target PRs, true when auditing feature branches
 * where the fixture would pollute state. Caller decides.
 */
skip_dynamic?: boolean;
/**
 * Skip Gitea posting — useful for dry-runs / local testing.
 * Default false.
 */
dry_run?: boolean;
}
/**
 * Audit a single PR: run all four checks, assemble a verdict, persist
 * it to disk, index it to the observer + KB (Phase A), and post the
 * result back to Gitea (unless dry_run).
 *
 * @param pr   Snapshot of the PR under audit (number, head SHA, body).
 * @param opts See AuditOptions; all flags default to false.
 * @returns The assembled Verdict (also written to VERDICTS_DIR).
 */
export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<Verdict> {
  const t0 = Date.now();
  const diff = await getPrDiff(pr.number);
  const { claims } = parseClaims(pr);
  // Run checks in parallel where they don't share mutable state.
  // Static + kb_query + inference are all read-only. Dynamic mutates
  // playbook state (nonce-scoped per run, but still live) so if
  // skip_dynamic is false we still run it in parallel — the mutation
  // is namespaced.
  const [staticFindings, dynamicFindings, inferenceFindings, kbFindings] = await Promise.all([
    runStaticCheck(diff),
    opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
    opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
    runKbCheck(claims),
  ]);
  const allFindings: Finding[] = [
    ...staticFindings,
    ...dynamicFindings,
    ...inferenceFindings,
    ...kbFindings,
  ];
  const duration_ms = Date.now() - t0;
  const metrics = {
    audit_duration_ms: duration_ms,
    findings_total: allFindings.length,
    findings_block: allFindings.filter(f => f.severity === "block").length,
    findings_warn: allFindings.filter(f => f.severity === "warn").length,
    findings_info: allFindings.filter(f => f.severity === "info").length,
    claims_strong: claims.filter(c => c.strength === "strong").length,
    claims_moderate: claims.filter(c => c.strength === "moderate").length,
    claims_weak: claims.filter(c => c.strength === "weak").length,
    claims_total: claims.length,
    diff_bytes: diff.length,
  };
  const verdict = assembleVerdict(allFindings, metrics, pr.number, pr.head_sha);
  await persistVerdict(verdict);
  // Phase A of the cohesion plan (docs/COHESION_INTEGRATION_PLAN.md):
  // make every audit verdict visible to the observer + KB. Enables
  // future Phase B (kb_query sees prior audit history) without a
  // separate backfill. Fire-and-forget (`void` marks intent): observer
  // and KB failures log + drop and never block the Gitea post. The
  // rejection value is narrowed, not cast — a non-Error rejection
  // would previously print `undefined` (or throw inside the handler).
  void indexVerdictToObserver(verdict).catch((e: unknown) =>
    console.error(`[auditor] observer indexing failed: ${e instanceof Error ? e.message : String(e)}`));
  void appendVerdictToKbOutcomes(verdict).catch((e: unknown) =>
    console.error(`[auditor] kb outcomes append failed: ${e instanceof Error ? e.message : String(e)}`));
  if (!opts.dry_run) {
    await postToGitea(verdict);
  }
  return verdict;
}
// Phase A — verdict indexing.
//
// Two destinations, both append-only + non-blocking:
// 1. observer :3800/event — ring buffer + data/_observer/ops.jsonl
// 2. data/_kb/outcomes.jsonl — same file scenarios write to, with
// kind:"audit" so readers can filter
//
// Errors log + drop. The verdict is still on disk at
// _auditor/verdicts/{pr}-{sha}.json; observer + KB are a convenience
// surface, not a source of truth.
// Observer base URL; overridable via env for tests / non-default deployments.
const OBSERVER_URL = process.env.LH_OBSERVER_URL ?? "http://localhost:3800";
// Shared KB outcomes file — the same JSONL that scenario runs append to.
const KB_OUTCOMES = "/home/profit/lakehouse/data/_kb/outcomes.jsonl";
// Push a compact summary of the verdict to the observer's /event
// endpoint. Times out after 3s; throws on timeout or non-2xx so the
// caller's .catch can log + drop (best-effort surface, not a source
// of truth — the verdict is already on disk).
async function indexVerdictToObserver(v: Verdict): Promise<void> {
  const body = JSON.stringify({
    source: "auditor",
    event_kind: "audit",
    ok: v.overall === "approve",
    sig_hash: `pr${v.pr_number}-${v.head_sha.slice(0, 8)}`,
    pr_number: v.pr_number,
    head_sha: v.head_sha,
    overall: v.overall,
    one_liner: v.one_liner,
    findings_block: v.metrics.findings_block,
    findings_warn: v.metrics.findings_warn,
    audit_duration_ms: v.metrics.audit_duration_ms,
    audited_at: v.audited_at,
  });
  const resp = await fetch(`${OBSERVER_URL}/event`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body,
    signal: AbortSignal.timeout(3000),
  });
  if (!resp.ok) {
    throw new Error(`observer ${resp.status}: ${await resp.text()}`);
  }
}
// Append a one-row JSONL summary of the verdict to the shared KB
// outcomes file (the same file scenario runs write to). kind:"audit"
// lets readers filter audit rows from scenario rows. Throws on fs
// errors; the caller logs + drops.
//
// Fix: the previous version dynamically re-imported
// "node:fs/promises" / "node:path" on every call even though this
// file already imports both statically at the top (the local `mkdir`
// also shadowed the top-level binding). Uses the static imports now.
async function appendVerdictToKbOutcomes(v: Verdict): Promise<void> {
  // Ensure data/_kb/ exists on the first run of a fresh deployment.
  await mkdir(dirname(KB_OUTCOMES), { recursive: true });
  const row = {
    kind: "audit",
    sig_hash: `pr${v.pr_number}-${v.head_sha.slice(0, 8)}`,
    audited_at: v.audited_at,
    pr_number: v.pr_number,
    head_sha: v.head_sha,
    overall: v.overall,
    one_liner: v.one_liner,
    // ok_events/total_events mirror the scenario-row schema so
    // aggregators can sum audit and scenario rows uniformly.
    ok_events: v.overall === "approve" ? 1 : 0,
    total_events: 1,
    findings: {
      block: v.metrics.findings_block,
      warn: v.metrics.findings_warn,
      info: v.metrics.findings_info,
    },
    elapsed_secs: (v.metrics.audit_duration_ms ?? 0) / 1000,
  };
  await appendFile(KB_OUTCOMES, JSON.stringify(row) + "\n");
}
// Write the full verdict to disk — the durable source of truth at
// _auditor/verdicts/{pr}-{sha12}.json. Observer/KB indexing is a
// convenience layer on top of this.
async function persistVerdict(v: Verdict): Promise<void> {
  await mkdir(VERDICTS_DIR, { recursive: true });
  const shortSha = v.head_sha.slice(0, 12);
  const target = join(VERDICTS_DIR, `${v.pr_number}-${shortSha}.json`);
  await writeFile(target, JSON.stringify(v, null, 2));
}
// Publish the verdict to Gitea as two artifacts.
export async function postToGitea(v: Verdict): Promise<void> {
  // 1. Commit status — the hard-block signal (if branch protection
  // is configured to require lakehouse/auditor on main). Anything
  // short of "approve" posts a failing status.
  await postCommitStatus({
    sha: v.head_sha,
    state: v.overall === "approve" ? "success" : "failure",
    context: "lakehouse/auditor",
    description: v.one_liner,
    target_url: "", // no URL yet; could point to a verdicts dashboard
  });
  // 2. Issue comment — the human-readable reasoning. Caller (poller)
  // only re-audits fresh SHAs, and the body carries a dedup marker,
  // so re-runs stay idempotent rather than spamming the PR.
  await postIssueComment({ pr_number: v.pr_number, body: formatReviewBody(v) });
}
// Render the verdict as a Markdown PR comment: header, per-check
// <details> sections (only for checks that produced findings), a
// metrics JSON block, and a footer carrying the short SHA.
function formatReviewBody(v: Verdict): string {
  // Bucket findings by the check that produced them.
  const grouped = new Map<string, Finding[]>();
  for (const f of v.findings) {
    const bucket = grouped.get(f.check);
    if (bucket) {
      bucket.push(f);
    } else {
      grouped.set(f.check, [f]);
    }
  }
  const emoji =
    ({ approve: "✅", request_changes: "⚠️" } as Record<string, string>)[v.overall] ?? "🛑";
  const out: string[] = [
    `## Auditor verdict: ${emoji} \`${v.overall}\``,
    "",
    `**One-liner:** ${v.one_liner}`,
    `**Head SHA:** \`${v.head_sha.slice(0, 12)}\``,
    `**Audited at:** ${v.audited_at}`,
    "",
  ];
  // Fixed section order; checks with zero findings are omitted.
  for (const check of ["static", "dynamic", "inference", "kb_query"]) {
    const found = grouped.get(check) ?? [];
    if (found.length === 0) continue;
    const countBySeverity = (sev: string) =>
      found.filter(f => f.severity === sev).length;
    const nBlock = countBySeverity("block");
    const nWarn = countBySeverity("warn");
    const nInfo = countBySeverity("info");
    out.push(
      `<details><summary><b>${check}</b> — ${found.length} findings (${nBlock} block, ${nWarn} warn, ${nInfo} info)</summary>`,
      "",
    );
    for (const f of found) {
      const mark = f.severity === "block" ? "🛑" : f.severity === "warn" ? "⚠️" : "";
      out.push(`${mark} **${f.severity}** — ${f.summary}`);
      // Cap evidence at 3 items, 180 chars each, newlines flattened.
      for (const e of f.evidence.slice(0, 3)) {
        out.push(` - \`${e.slice(0, 180).replace(/\n/g, " ")}\``);
      }
    }
    out.push("", "</details>", "");
  }
  out.push(
    "### Metrics",
    "```json",
    JSON.stringify(v.metrics, null, 2),
    "```",
    "",
    `<sub>Lakehouse auditor · SHA ${v.head_sha.slice(0, 8)} · re-audit on new commit flips the status automatically.</sub>`,
  );
  return out.join("\n");
}
// Placeholder emitted when a check is skipped via options, so the
// verdict still records that the check existed (severity info) and why.
function stubFinding(check: "dynamic" | "inference", why: string): Finding[] {
  const summary = `${check} check skipped — ${why}`;
  return [{ check, severity: "info", summary, evidence: [why] }];
}