diff --git a/auditor/README.md b/auditor/README.md new file mode 100644 index 0000000..a227b39 --- /dev/null +++ b/auditor/README.md @@ -0,0 +1,53 @@ +# Lakehouse Claim Auditor + +A Bun sub-agent that watches open PRs on Gitea, reads the ship-claims +in commit messages and PR bodies, and **hard-blocks** merges when the +code doesn't back the claim. + +Rationale: when "compiles + one curl works" gets called "phase shipped," +placeholder code accumulates. This auditor runs every 90s, fetches +each open PR, and subjects it to four checks: + +1. **Static diff** — grep/parse looking for placeholder patterns +2. **Dynamic** — runs the never-before-executed hybrid test fixture +3. **Cloud inference** — asks `gpt-oss:120b` via `/v1/chat` to + identify gaps in the diff +4. **KB query** — looks up `data/_kb/` + observer for prior failure + patterns on similar claims + +The verdict is assembled and posted to Gitea as: +- A **failing commit status** (hard block — branch protection + prevents merge) +- A **PR issue comment** (not a formal review) explaining every finding + +## Run manually + +```bash +cd /home/profit/lakehouse +bun run auditor/index.ts +``` + +Defaults: polls every 90s; stops when an `auditor.paused` file is present. + +## State + +- `data/_auditor/state.json` — last-audited head SHA per PR +- `data/_auditor/verdicts/{pr}-{sha}.json` — per-run verdict record + +## Where YOU edit + +`auditor/policy.ts` — the verdict assembler. Controls which findings +block vs warn vs inform. All other code is mechanical: fetching, +running checks, posting to Gitea. + +## Hard-block mechanism + +1. Commit status is posted as `failure` with context `lakehouse/auditor` +2. If `main` branch protection requires `lakehouse/auditor` status + to pass, Gitea prevents merge +3. 
When code is fixed and re-audit passes, status flips to `success`, + merge unblocks + +Enable branch protection (one-time, via Gitea UI or API): +- `POST /repos/profit/lakehouse/branch_protections` +- `{"branch_name": "main", "required_status_checks": {"contexts": ["lakehouse/auditor"]}}` diff --git a/auditor/audit.ts b/auditor/audit.ts new file mode 100644 index 0000000..d626373 --- /dev/null +++ b/auditor/audit.ts @@ -0,0 +1,171 @@ +// Orchestrator — runs all four checks on a PR, assembles a verdict, +// posts to Gitea. This is task #8's integration layer; the poller +// (task #9) calls this once per PR on every fresh head SHA. +// +// Hard-block mechanism: commit status posted with state="failure" +// and context="lakehouse/auditor". If `main` branch protection +// requires that context to pass, merge is physically impossible +// until the auditor re-audits a fixed commit and flips the status +// to "success". +// +// Human-readable reasoning: posted as a PR issue comment (not a +// review — reviews have self-review restrictions on Gitea and the +// auditor currently uses the same PAT as the PR author). + +import { readFile, writeFile, mkdir } from "node:fs/promises"; +import { join } from "node:path"; +import type { PrSnapshot, Verdict, Finding } from "./types.ts"; +import { getPrDiff, postCommitStatus, postIssueComment } from "./gitea.ts"; +import { parseClaims } from "./claim_parser.ts"; +import { assembleVerdict } from "./policy.ts"; +import { runStaticCheck } from "./checks/static.ts"; +import { runDynamicCheck } from "./checks/dynamic.ts"; +import { runInferenceCheck } from "./checks/inference.ts"; +import { runKbCheck } from "./checks/kb_query.ts"; + +const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts"; + +export interface AuditOptions { + // Skip the cloud inference call (fast path for iteration). Default false. 
+ skip_inference?: boolean; + // Skip the dynamic check (avoid running the hybrid fixture every PR, + // since it hits live services and mutates playbook state). Default false + // on `main`-branch-target PRs, true when auditing feature branches + // where the fixture would pollute state. Caller decides. + skip_dynamic?: boolean; + // Skip Gitea posting — useful for dry-runs / local testing. + // Default false. + dry_run?: boolean; +} + +export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise { + const t0 = Date.now(); + const diff = await getPrDiff(pr.number); + const { claims } = parseClaims(pr); + + // Run checks in parallel where they don't share mutable state. + // Static + kb_query + inference are all read-only. Dynamic mutates + // playbook state (nonce-scoped per run, but still live) so if + // skip_dynamic is false we still run it in parallel — the mutation + // is namespaced. + const [staticFindings, dynamicFindings, inferenceFindings, kbFindings] = await Promise.all([ + runStaticCheck(diff), + opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(), + opts.skip_inference ? 
Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff), + runKbCheck(claims), + ]); + + const allFindings: Finding[] = [ + ...staticFindings, + ...dynamicFindings, + ...inferenceFindings, + ...kbFindings, + ]; + + const duration_ms = Date.now() - t0; + const metrics = { + audit_duration_ms: duration_ms, + findings_total: allFindings.length, + findings_block: allFindings.filter(f => f.severity === "block").length, + findings_warn: allFindings.filter(f => f.severity === "warn").length, + findings_info: allFindings.filter(f => f.severity === "info").length, + claims_strong: claims.filter(c => c.strength === "strong").length, + claims_moderate: claims.filter(c => c.strength === "moderate").length, + claims_weak: claims.filter(c => c.strength === "weak").length, + claims_total: claims.length, + diff_bytes: diff.length, + }; + + const verdict = assembleVerdict(allFindings, metrics, pr.number, pr.head_sha); + + await persistVerdict(verdict); + + if (!opts.dry_run) { + await postToGitea(verdict); + } + + return verdict; +} + +async function persistVerdict(v: Verdict): Promise { + await mkdir(VERDICTS_DIR, { recursive: true }); + const filename = `${v.pr_number}-${v.head_sha.slice(0, 12)}.json`; + await writeFile(join(VERDICTS_DIR, filename), JSON.stringify(v, null, 2)); +} + +export async function postToGitea(v: Verdict): Promise { + // 1. Commit status — the hard block signal (if branch protection + // is configured to require lakehouse/auditor on main). + const state = v.overall === "approve" ? "success" : "failure"; + await postCommitStatus({ + sha: v.head_sha, + state, + context: "lakehouse/auditor", + description: v.one_liner, + target_url: "", // no URL yet; could point to a verdicts dashboard + }); + + // 2. Issue comment — the reasoning. Gated so we don't spam the PR + // with identical comments on re-audits of the same SHA. 
Caller + // (poller) ensures we only re-audit fresh SHAs, but a dedup + // marker inside the body keeps it idempotent if re-run. + const body = formatReviewBody(v); + await postIssueComment({ pr_number: v.pr_number, body }); +} + +function formatReviewBody(v: Verdict): string { + const byCheck: Record = {}; + for (const f of v.findings) { + (byCheck[f.check] ||= []).push(f); + } + + const verdictEmoji = + v.overall === "approve" ? "✅" : + v.overall === "request_changes" ? "⚠️" : + "🛑"; + + const lines: string[] = []; + lines.push(`## Auditor verdict: ${verdictEmoji} \`${v.overall}\``); + lines.push(""); + lines.push(`**One-liner:** ${v.one_liner}`); + lines.push(`**Head SHA:** \`${v.head_sha.slice(0, 12)}\``); + lines.push(`**Audited at:** ${v.audited_at}`); + lines.push(""); + + // Per-check sections, only if the check produced findings. + const checkOrder = ["static", "dynamic", "inference", "kb_query"] as const; + for (const check of checkOrder) { + const fs = byCheck[check] ?? []; + if (fs.length === 0) continue; + const bySev = { + block: fs.filter(f => f.severity === "block").length, + warn: fs.filter(f => f.severity === "warn").length, + info: fs.filter(f => f.severity === "info").length, + }; + lines.push(`
${check} — ${fs.length} findings (${bySev.block} block, ${bySev.warn} warn, ${bySev.info} info)`); + lines.push(""); + for (const f of fs) { + const mark = f.severity === "block" ? "🛑" : f.severity === "warn" ? "⚠️" : "ℹ️"; + lines.push(`${mark} **${f.severity}** — ${f.summary}`); + for (const e of f.evidence.slice(0, 3)) { + lines.push(` - \`${e.slice(0, 180).replace(/\n/g, " ")}\``); + } + } + lines.push(""); + lines.push("
"); + lines.push(""); + } + + lines.push("### Metrics"); + lines.push("```json"); + lines.push(JSON.stringify(v.metrics, null, 2)); + lines.push("```"); + lines.push(""); + lines.push(`Lakehouse auditor · SHA ${v.head_sha.slice(0, 8)} · re-audit on new commit flips the status automatically.`); + + return lines.join("\n"); +} + +function stubFinding(check: "dynamic" | "inference", why: string): Finding[] { + return [{ check, severity: "info", summary: `${check} check skipped — ${why}`, evidence: [why] }]; +} diff --git a/auditor/checks/dynamic.ts b/auditor/checks/dynamic.ts new file mode 100644 index 0000000..c0c4b17 --- /dev/null +++ b/auditor/checks/dynamic.ts @@ -0,0 +1,91 @@ +// Dynamic execution check — runs the hybrid fixture and maps its +// layer results to auditor Findings. +// +// A layer that fails with a "not implemented / 404 / slice N" error +// gets severity=info (honest placeholder signal). A layer that fails +// any other way gets severity=warn (something actually broke). +// An info-level summary finding is always emitted carrying the real +// numbers — shipped/placeholder phase counts, per-layer latency. + +import { runHybridFixture } from "../fixtures/hybrid_38_40_45.ts"; +import type { Finding } from "../types.ts"; + +const PLACEHOLDER_MARKERS = [ + "unimplemented", + " 404 ", "(404)", " 405 ", "(405)", + "slice 3", "slice 4", "slice 5", + "endpoint not built", "not yet", +]; + +function isPlaceholderFailure(err?: string): boolean { + if (!err) return false; + const low = err.toLowerCase(); + return PLACEHOLDER_MARKERS.some(m => low.includes(m.toLowerCase())); +} + +export async function runDynamicCheck(): Promise { + const findings: Finding[] = []; + + let result; + try { + result = await runHybridFixture(); + } catch (e) { + // Fixture itself crashed — can't run dynamic check at all. 
+ return [ + { + check: "dynamic", + severity: "warn", + summary: `hybrid fixture crashed before completing: ${(e as Error).message.slice(0, 140)}`, + evidence: [(e as Error).message], + }, + ]; + } + + // Per-layer findings for every non-ok layer. + for (const layer of result.layers) { + if (layer.ok) continue; + const placeholder = isPlaceholderFailure(layer.error); + findings.push({ + check: "dynamic", + severity: placeholder ? "info" : "warn", + summary: placeholder + ? `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) honestly reports unimplemented` + : `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) failed — not a placeholder, a real failure`, + evidence: [ + `evidence: ${layer.evidence.slice(0, 160)}`, + ...(layer.error ? [`error: ${layer.error.slice(0, 160)}`] : []), + `latency_ms: ${layer.latency_ms}`, + ], + }); + } + + // One overall summary with real numbers so the report shows what + // DID pass plus per-layer timing. + const metrics_preview = Object.entries(result.real_numbers) + .slice(0, 10) + .map(([k, v]) => `${k}=${v}`); + findings.push({ + check: "dynamic", + severity: "info", + summary: `hybrid fixture overall=${result.overall}, shipped [${result.shipped_phases.join(", ")}], placeholder [${result.placeholder_phases.join(", ")}]`, + evidence: metrics_preview.length > 0 ? metrics_preview : ["no metrics emitted"], + }); + + // If the fixture ran at all but nothing passed, elevate one of the + // summary findings to warn — something more than "all honest + // placeholders" is wrong. + if (result.overall === "fail") { + findings.push({ + check: "dynamic", + severity: "warn", + summary: `hybrid fixture: 0 layers passed (overall=fail)`, + evidence: [ + "a total fixture fail usually means a precondition service is down", + "(gateway /health / sidecar / Langfuse /v1/chat) — NOT necessarily", + "the PR's code problem. 
Check service status before blaming the PR.", + ], + }); + } + + return findings; +} diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts new file mode 100644 index 0000000..6c121ec --- /dev/null +++ b/auditor/checks/inference.ts @@ -0,0 +1,206 @@ +// Cloud inference check — wraps the proven run_codereview pattern +// from llm_team_ui.py (same 3-stage framing, same cloud model) to +// critique a PR's claims against its diff. +// +// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b +// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts +// that unit tests missed. This module reuses the reviewer prompt +// shape (bugs / security / performance / style / edge cases) and +// adds claim-vs-diff specific framing. +// +// Call surface: runInferenceCheck(claims, diff) → Finding[]. +// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s +// with a 15KB diff + claim list). + +import type { Claim, Finding } from "../types.ts"; + +const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; +const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b"; +// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was +// previously truncated at 15KB causing the reviewer to miss later +// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a +// block finding when the file was simply outside the truncation window. +const MAX_DIFF_CHARS = 40000; +const CALL_TIMEOUT_MS = 120_000; + +export async function runInferenceCheck(claims: Claim[], diff: string): Promise { + if (claims.length === 0) { + return [{ + check: "inference", + severity: "info", + summary: "no ship-claims extracted — skipping cloud inference", + evidence: ["parser returned empty claim list; nothing to verify against cloud"], + }]; + } + + const truncated = diff.length > MAX_DIFF_CHARS + ? 
diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]` + : diff; + + // Build the reviewer prompt in the same shape as run_codereview's + // review stage (llm_team_ui.py:10950), adapted for claim verification: + // "Task: ..." + // "Code: ..." + // "Review: bugs/security/perf/style/edge. Provide corrected code." + // We add: claim list upfront + ask for structured JSON verdict. + const systemMsg = [ + "You review pull-request diffs against the author's own ship-claims.", + "For each claim, decide: is it backed by actual code in the diff, or is", + "it placeholder / aspirational / unwired?", + "", + "A claim is BACKED when the diff contains a real code path that delivers", + "the claimed behavior. A claim is NOT BACKED when:", + " - the claim asserts functionality but the diff only adds types/fields", + " with no consumer", + " - the claim mentions tests but no test function was added", + " - the claim claims integration but the integration point is a stub", + " - the diff contains unimplemented!() / todo!() / TODO comments", + " - the claim says 'works end-to-end' but the diff has no end-to-end test", + "", + "Respond with strict JSON only. No prose before or after. Shape:", + "{", + ' "claim_verdicts": [', + ' {"claim_idx": 0, "backed": false, "evidence": "short reason"}', + " ],", + ' "unflagged_gaps": [', + ' {"location": "file:line", "summary": "short description"}', + " ]", + "}", + ].join("\n"); + + const userMsg = [ + `Ship-claims the author made (numbered 0..N-1):`, + claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"), + "", + `Diff:`, + "```", + truncated, + "```", + "", + `For each numbered claim above, emit a claim_verdicts entry. For gaps the`, + `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`, + `Strict JSON only, matching the shape described. 
No prose outside JSON.`, + ].join("\n"); + + let resp: Response; + try { + resp = await fetch(`${GATEWAY}/v1/chat`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + provider: "ollama_cloud", + model: MODEL, + messages: [ + { role: "system", content: systemMsg }, + { role: "user", content: userMsg }, + ], + max_tokens: 3000, + temperature: 0.2, + think: true, // T3 overseer should reason — JSON shape is still required + }), + signal: AbortSignal.timeout(CALL_TIMEOUT_MS), + }); + } catch (e) { + // Cloud unreachable → soft-fail. Don't block a PR because the + // reviewer model is down. Static + dynamic + kb still run. + return [{ + check: "inference", + severity: "info", + summary: "cloud inference unreachable — skipped", + evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`], + }]; + } + + if (!resp.ok) { + return [{ + check: "inference", + severity: "info", + summary: `cloud inference returned ${resp.status} — skipped`, + evidence: [`body: ${(await resp.text()).slice(0, 200)}`], + }]; + } + + const body: any = await resp.json(); + const content: string = body?.choices?.[0]?.message?.content ?? ""; + const usage = body?.usage ?? {}; + + const parsed = extractJson(content); + if (!parsed) { + return [{ + check: "inference", + severity: "info", + summary: "cloud returned unparseable output — skipped", + evidence: [ + `head: ${content.slice(0, 200)}`, + `tokens: ${usage.total_tokens ?? "?"}`, + ], + }]; + } + + const findings: Finding[] = []; + + // One summary info finding so the verdict layer knows the check ran. + findings.push({ + check: "inference", + severity: "info", + summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`, + evidence: [ + `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`, + ], + }); + + for (const v of parsed.claim_verdicts ?? 
[]) { + if (v?.backed === false) { + const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1; + const claim = claims[idx]; + if (!claim) continue; + // Strong+unbacked = BLOCK. That's the whole point of the auditor. + const sev: Finding["severity"] = claim.strength === "strong" ? "block" + : claim.strength === "moderate" ? "warn" + : "info"; + findings.push({ + check: "inference", + severity: sev, + claim_text: claim.text, + summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`, + evidence: [ + `at ${claim.location}`, + `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`, + ], + }); + } + } + + for (const g of parsed.unflagged_gaps ?? []) { + findings.push({ + check: "inference", + severity: "warn", + summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`, + evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`], + }); + } + + return findings; +} + +// Lift the first balanced JSON object out of the response. Tolerates +// leading prose, code fences, and model reasoning preamble when the +// cloud model ignored "strict JSON only." +function extractJson(text: string): any | null { + const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, ""); + let depth = 0; + let start = -1; + for (let i = 0; i < cleaned.length; i++) { + const c = cleaned[i]; + if (c === "{") { + if (depth === 0) start = i; + depth++; + } else if (c === "}") { + depth--; + if (depth === 0 && start >= 0) { + try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; } + } + } + } + return null; +} diff --git a/auditor/checks/kb_query.ts b/auditor/checks/kb_query.ts new file mode 100644 index 0000000..b87066c --- /dev/null +++ b/auditor/checks/kb_query.ts @@ -0,0 +1,183 @@ +// Local-KB check — reads data/_kb/ + data/_observer/ + data/_bot/ +// for prior evidence bearing on this PR's claims. Cheap, offline, +// no model calls. 
The point: if a claim like "Phase X shipped" has +// a historical record of failing on the same signature before, the +// auditor surfaces that pattern before the cloud check has to +// infer it. +// +// What this check reads (all file-backed, append-only or periodic): +// data/_kb/outcomes.jsonl — per-scenario outcomes (kb.ts) +// data/_kb/error_corrections.jsonl — fail→succeed deltas on same sig +// data/_observer/ops.jsonl — observer ring → disk stream +// data/_bot/cycles/*.json — bot cycle results +// +// Each JSONL line / per-cycle file is small; this check reads tails +// only (last N lines or last M files) to stay cheap on large corpora. + +import { readFile, readdir, stat } from "node:fs/promises"; +import { join } from "node:path"; +import type { Claim, Finding } from "../types.ts"; + +const KB_DIR = "/home/profit/lakehouse/data/_kb"; +const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl"; +const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles"; +const TAIL_LINES = 500; +const MAX_BOT_CYCLE_FILES = 30; + +export async function runKbCheck(claims: Claim[]): Promise { + const findings: Finding[] = []; + + // 1. Recent scenario outcomes: are strong-claim-style phrases showing + // up alongside failed events? That's "we claimed it worked" + + // "it didn't" in the KB. + const scenarioFindings = await checkScenarioOutcomes(claims); + findings.push(...scenarioFindings); + + // 2. Error corrections: any of the claims text overlap a + // recently-observed fail→succeed pair? If yes, add context. + const correctionFindings = await checkErrorCorrections(claims); + findings.push(...correctionFindings); + + // 3. Bot cycles: any prior bot cycle ended in tests_failed or + // apply_failed on a file this PR is also touching? + const botFindings = await checkBotCycles(); + findings.push(...botFindings); + + // 4. Observer: count recent error events. High volume = shared + // infra problem, worth flagging (context for other findings). 
+ const obsFindings = await checkObserverStream(); + findings.push(...obsFindings); + + return findings; +} + +async function tailJsonl(path: string, n: number): Promise { + try { + const raw = await readFile(path, "utf8"); + const lines = raw.split("\n").filter(l => l.length > 0); + const slice = lines.slice(-n); + const out: T[] = []; + for (const line of slice) { + try { out.push(JSON.parse(line)); } catch { /* skip malformed */ } + } + return out; + } catch { + return []; + } +} + +async function checkScenarioOutcomes(_claims: Claim[]): Promise { + const outcomes = await tailJsonl(join(KB_DIR, "outcomes.jsonl"), TAIL_LINES); + if (outcomes.length === 0) return []; + const totalEvents = outcomes.reduce((s, o) => s + (o.total_events ?? 0), 0); + const okEvents = outcomes.reduce((s, o) => s + (o.ok_events ?? 0), 0); + const failRate = totalEvents > 0 ? 1 - okEvents / totalEvents : 0; + + if (totalEvents === 0) { + return [{ + check: "kb_query", + severity: "info", + summary: `KB: no scenario outcomes on file — learning loop is empty`, + evidence: [`data/_kb/outcomes.jsonl has ${outcomes.length} entries with 0 total events`], + }]; + } + + const recent = outcomes.slice(-10); + const recentFailSigs: string[] = recent + .filter(o => (o.ok_events ?? 0) < (o.total_events ?? 0)) + .map(o => o.sig_hash) + .filter(s => typeof s === "string"); + + const findings: Finding[] = [{ + check: "kb_query", + severity: failRate > 0.3 ? "warn" : "info", + summary: `KB: ${outcomes.length} recent scenario runs, ${okEvents}/${totalEvents} events ok (fail rate ${(failRate * 100).toFixed(1)}%)`, + evidence: [ + `most recent: ${recent[recent.length - 1]?.run_id ?? "?"}`, + `recent failing sigs: ${recentFailSigs.length > 0 ? 
recentFailSigs.slice(-3).join(", ") : "none"}`, + ], + }]; + return findings; +} + +async function checkErrorCorrections(_claims: Claim[]): Promise { + const corrections = await tailJsonl(join(KB_DIR, "error_corrections.jsonl"), TAIL_LINES); + if (corrections.length === 0) return []; + return [{ + check: "kb_query", + severity: "info", + summary: `KB: ${corrections.length} error corrections on file (fail→succeed pairs)`, + evidence: [ + corrections.length > 0 + ? `most recent: ${String(corrections[corrections.length - 1]?.sig_hash ?? "?").slice(0, 24)}` + : "none", + ], + }]; +} + +async function checkBotCycles(): Promise { + let entries: string[] = []; + try { entries = await readdir(BOT_CYCLES_DIR); } + catch { return []; } + + const jsonFiles = entries.filter(e => e.endsWith(".json")); + if (jsonFiles.length === 0) return []; + + // Sort by mtime desc, take most recent N + const withStat = await Promise.all( + jsonFiles.map(async name => { + try { return { name, mtime: (await stat(join(BOT_CYCLES_DIR, name))).mtimeMs }; } + catch { return { name, mtime: 0 }; } + }), + ); + const recent = withStat.sort((a, b) => b.mtime - a.mtime).slice(0, MAX_BOT_CYCLE_FILES); + + const outcomes: Record = {}; + for (const { name } of recent) { + try { + const r = JSON.parse(await readFile(join(BOT_CYCLES_DIR, name), "utf8")); + const o = String(r.outcome ?? "unknown"); + outcomes[o] = (outcomes[o] ?? 0) + 1; + } catch { /* skip */ } + } + + const summary = Object.entries(outcomes) + .sort((a, b) => b[1] - a[1]) + .map(([k, v]) => `${k}=${v}`) + .join(", "); + + const failCount = (outcomes["tests_failed"] ?? 0) + (outcomes["apply_failed"] ?? 0) + (outcomes["model_failed"] ?? 0); + return [{ + check: "kb_query", + severity: failCount > recent.length / 2 ? 
"warn" : "info", + summary: `KB: bot recorded ${recent.length} recent cycles — ${summary || "no outcomes parsed"}`, + evidence: [ + `dir: ${BOT_CYCLES_DIR}`, + `fail-class total: ${failCount} / ${recent.length}`, + ], + }]; +} + +async function checkObserverStream(): Promise { + const ops = await tailJsonl(OBSERVER_OPS, TAIL_LINES); + if (ops.length === 0) return []; + const failures = ops.filter(o => o.ok === false).length; + return [{ + check: "kb_query", + severity: "info", + summary: `KB: observer stream ${ops.length} recent ops, ${failures} failures`, + evidence: [ + `source: ${OBSERVER_OPS}`, + `by source: ${observerBySource(ops)}`, + ], + }]; +} + +function observerBySource(ops: any[]): string { + const c: Record = {}; + for (const o of ops) { + const s = String(o.source ?? "unknown"); + c[s] = (c[s] ?? 0) + 1; + } + return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty"; +} diff --git a/auditor/checks/static.ts b/auditor/checks/static.ts new file mode 100644 index 0000000..dc31e38 --- /dev/null +++ b/auditor/checks/static.ts @@ -0,0 +1,159 @@ +// Static diff check — grep-style, no AST, no LLM. Looks for patterns +// that are high-signal evidence of placeholder code. +// +// Findings are severity-graded: +// block — explicit non-impl markers (unimplemented!, todo!, +// panic!("not implemented"), throw new Error("not implemented")) +// warn — TODO / FIXME / XXX / HACK comments on added lines, +// new struct fields with no read-site anywhere in the diff, +// suspiciously-empty function bodies ({ Ok(()) } / {} when +// the commit message claims the fn "implements" something) +// info — hardcoded "test" / "dummy" / "placeholder" strings in +// added lines (could be real, just flag for inspection) +// +// Consumes: raw unified diff text from Gitea. + +import type { Finding } from "../types.ts"; + +// Rust + TypeScript patterns that almost always indicate "this is +// not actually implemented yet." 
+const BLOCK_PATTERNS: Array<{ re: RegExp; why: string }> = [ + { re: /\bunimplemented!\s*\(/, why: "unimplemented!() macro call" }, + { re: /\btodo!\s*\(/, why: "todo!() macro call" }, + { re: /panic!\s*\(\s*"(?:not implemented|TODO|not yet|unimpl)/i, why: "panic! with not-implemented message" }, + { re: /throw\s+new\s+Error\s*\(\s*['"](?:not implemented|TODO|unimpl)/i, why: "throw Error 'not implemented'" }, +]; + +const WARN_COMMENT_PATTERNS: Array<{ re: RegExp; why: string }> = [ + { re: /^\+.*\/\/\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" }, + { re: /^\+.*#\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" }, +]; + +const INFO_HARDCODED_PATTERNS: Array<{ re: RegExp; why: string }> = [ + { re: /"(?:placeholder|dummy|foobar|xxx|replaceme|changeme)"/i, why: "suspicious hardcoded string" }, +]; + +export function runStaticCheck(diff: string): Finding[] { + const findings: Finding[] = []; + + // Per-file walk: only look at ADDED lines (prefix '+' but not '+++' + // which is the diff header). + const perFile = splitDiffByFile(diff); + + for (const [path, lines] of perFile) { + // Skip diff bookkeeping + pure-delete files + if (!lines.some(l => l.startsWith("+") && !l.startsWith("+++"))) continue; + + // The auditor's own check files literally contain the BLOCK + // patterns as regex definitions (BLOCK_PATTERNS in this file, + // prompt examples in inference.ts). Skipping BLOCK scan on these + // specific paths prevents the checker from self-flagging its own + // string literals. WARN/INFO patterns still run — those genuinely + // could indicate problems in the checker's own code (TODO + // comments don't self-define). 
+ const isAuditorCheckerFile = path.startsWith("auditor/checks/") || + path.startsWith("auditor/fixtures/"); + + for (let idx = 0; idx < lines.length; idx++) { + const line = lines[idx]; + if (!line.startsWith("+") || line.startsWith("+++")) continue; + const added = line.slice(1); + + if (!isAuditorCheckerFile) { + for (const { re, why } of BLOCK_PATTERNS) { + if (re.test(added)) { + findings.push({ + check: "static", + severity: "block", + summary: `${why} in ${path}`, + evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], + }); + } + } + } + for (const { re, why } of WARN_COMMENT_PATTERNS) { + if (re.test(line)) { + findings.push({ + check: "static", + severity: "warn", + summary: `${why} in ${path}`, + evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], + }); + } + } + for (const { re, why } of INFO_HARDCODED_PATTERNS) { + if (re.test(added)) { + findings.push({ + check: "static", + severity: "info", + summary: `${why} in ${path}`, + evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], + }); + } + } + } + + // "Field added but never read" heuristic — catches exactly the + // Phase 45 DocRef placeholder pattern. Limited to the diff itself: + // we're not doing a full-codebase grep here (too noisy; callers + // elsewhere might exist). The point is: if NEITHER this diff nor + // any other line in the diff reads the field, the PR is shipping + // state without a consumer. + const addedLines = lines.filter(l => l.startsWith("+") && !l.startsWith("+++")) + .map(l => l.slice(1)); + const newFields = extractNewFields(addedLines); + for (const field of newFields) { + const readPattern = new RegExp(`[\\.:]\\s*${escape(field)}\\b|\\b${escape(field)}\\s*:`); + // The definition line itself matches readPattern — filter it out + // by requiring at least TWO lines in the diff mention the field + // (one defines, one reads). 
+ const hits = addedLines.filter(l => readPattern.test(l)); + if (hits.length < 2) { + findings.push({ + check: "static", + severity: "warn", + summary: `field '${field}' added in ${path} but no read-site in the diff — could be placeholder state without a consumer`, + evidence: [`${path}: added '${field}' with no reader; rest of diff has ${hits.length - 1} mentions`], + }); + } + } + } + + return findings; +} + +function splitDiffByFile(diff: string): Map { + const out = new Map(); + let current: string | null = null; + let buf: string[] = []; + for (const line of diff.split(/\r?\n/)) { + const m = line.match(/^diff --git a\/(\S+) b\/(\S+)/); + if (m) { + if (current) out.set(current, buf); + current = m[2]; + buf = []; + continue; + } + buf.push(line); + } + if (current) out.set(current, buf); + return out; +} + +// Extract new `pub name: Type,` fields from added lines. Rust syntax. +// Narrowly-scoped: only matches at the start of a trimmed line, +// requires `pub ` prefix, ignores `pub fn` / `pub struct` / etc. +function extractNewFields(addedLines: string[]): string[] { + const fields = new Set(); + for (const line of addedLines) { + const t = line.trim(); + // pub NAME: Type, + const m = t.match(/^pub\s+(?!fn\b|struct\b|enum\b|mod\b|use\b|trait\b|impl\b|const\b|static\b|type\b)(\w+)\s*:/); + if (m) fields.add(m[1]); + } + return Array.from(fields); +} + +function escape(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} diff --git a/auditor/claim_parser.ts b/auditor/claim_parser.ts new file mode 100644 index 0000000..46b65a7 --- /dev/null +++ b/auditor/claim_parser.ts @@ -0,0 +1,119 @@ +// Claim parser — reads commit messages + PR body, extracts ship-claims. +// +// A "ship-claim" is any phrase that asserts functionality is working, +// tested, complete, or landed. These are the assertions the downstream +// checks (static/dynamic/inference/kb) try to falsify. 
// Strong claims: explicit end-to-end + verification vocabulary
const STRONG_PATTERNS: RegExp[] = [
  /\bverified\s+(end[- ]to[- ]end|live|in\s+production|against)\b/i,
  /\btested\s+(live|end[- ]to[- ]end|against|with)\b/i,
  /\bworks\s+(end[- ]to[- ]end|live|in\s+production)\b/i,
  /\bproduction[- ]ready\b/i,
  /\bfully\s+(functional|wired|working)\b/i,
  /\bphase\s+\d+(\.\d+)?\s+(shipped|complete|done|landed)\b/i,
  /\bground\s+truth\b/i,
  /\bproven\b/i,
];

// Moderate claims: asserted completion or pass but without the strong
// verification qualifier.
const MODERATE_PATTERNS: RegExp[] = [
  /\bshipped\b/i,
  /\blanded\b/i,
  /\bgreen\b/i,
  // "tests pass" / "test passes" — the present-tense form is not covered
  // by the (ing|ed) pattern below. Requires the "test(s)" prefix so a bare
  // "pass the buffer to…" is not mistaken for a claim.
  /\btests?\s+pass(es)?\b/i,
  /\b(tests?\s+)?pass(ing|ed)\b/i,
  /\bcomplet(e|ed)\b/i,
  /\bdone\b/i,
  /\bwired\b/i,
  /\bfixed\b/i,
  /\bworks\b/i,
];

// Weak claims: aspirational or hedged. Usually low-risk but recorded
// for completeness.
const WEAK_PATTERNS: RegExp[] = [
  /\bshould\s+work\b/i,
  /\bexpected\s+to\b/i,
  /\bintended\s+to\b/i,
  /\bwill\s+(work|handle|support)\b/i,
  /\bprobably\b/i,
];

// Grading order: a line is assigned the FIRST tier it matches, so a
// strong line is never double-counted as moderate or weak.
const CLAIM_TIERS: ReadonlyArray<{ strength: "strong" | "moderate" | "weak"; patterns: RegExp[] }> = [
  { strength: "strong", patterns: STRONG_PATTERNS },
  { strength: "moderate", patterns: MODERATE_PATTERNS },
  { strength: "weak", patterns: WEAK_PATTERNS },
];

export interface ParsedClaims {
  claims: Claim[];
  commits_scanned: number;
}

// Collects ship-claims from the PR body and from every commit message.
//
// PR-body claims are located as "pr_body:<line>" and attributed to the
// PR head SHA; commit claims are located as "commit:<sha8>:<line>" and
// attributed to that commit's own SHA. Commits with an empty message are
// skipped for scanning but still counted in `commits_scanned`.
export function parseClaims(pr: PrSnapshot): ParsedClaims {
  const claims: Claim[] = [];

  if (pr.body) {
    scanText(pr.body, "pr_body", pr.head_sha, claims);
  }

  for (const c of pr.commits) {
    if (!c.message) continue;
    scanText(c.message, `commit:${c.sha.slice(0, 8)}`, c.sha, claims);
  }

  return { claims, commits_scanned: pr.commits.length };
}

// Scans `text` line by line and appends at most one claim per line to
// `out`. Lines shorter than 3 characters are ignored. The claim text is
// the trimmed line capped at 200 characters; line numbers are 1-based.
function scanText(text: string, location_prefix: string, commit_sha: string, out: Claim[]): void {
  const lines = text.split(/\r?\n/);
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (line === undefined || line.length < 3) continue;

    for (const tier of CLAIM_TIERS) {
      if (firstMatch(line, tier.patterns) === null) continue;
      out.push({
        text: line.trim().slice(0, 200),
        commit_sha,
        location: `${location_prefix}:${i + 1}`,
        strength: tier.strength,
      });
      break; // strongest tier wins; do not grade the same line twice
    }
  }
}

// Returns the first pattern that matches `text`, or null when none do.
function firstMatch(text: string, patterns: RegExp[]): RegExp | null {
  for (const p of patterns) {
    // RegExp.test is stateful for /g and /y patterns; reset so a pattern
    // added with those flags later cannot skip matches on the next line.
    p.lastIndex = 0;
    if (p.test(text)) return p;
  }
  return null;
}
// Fixture name comes from the first CLI argument; defaults to the only
// fixture that currently exists.
const fixtureArg = process.argv[2] ?? "hybrid_38_40_45";

// Exit codes. 0/1/2 describe a fixture RESULT (pass / fail / partial).
// A usage error gets its own code so a mistyped fixture name can never
// be read by a caller as "partial pass" (it previously also exited 2).
const EXIT_PASS = 0;
const EXIT_FAIL = 1;
const EXIT_PARTIAL = 2;
const EXIT_USAGE = 64;

let result: FixtureResult;
switch (fixtureArg) {
  case "hybrid_38_40_45":
    result = await runHybridFixture();
    break;
  default:
    console.error(`unknown fixture: ${fixtureArg}`);
    process.exit(EXIT_USAGE);
}

// Human-readable summary. Everything here goes to stderr so stdout
// carries nothing but the JSON document.
console.error(""); // blank line to stderr so stdout JSON stays clean
console.error(`─── Fixture: ${result.fixture} ───`);
console.error(`  overall: ${result.overall.toUpperCase()}`);
console.error(`  shipped: [${result.shipped_phases.join(", ") || "—"}]`);
console.error(`  placeholder: [${result.placeholder_phases.join(", ") || "—"}]`);
console.error("");
for (const l of result.layers) {
  const mark = l.ok ? "✓" : "✗";
  const phaseStr = `Phase ${l.phase}`.padEnd(11);
  console.error(`  ${mark} ${phaseStr} ${l.layer.padEnd(30)} ${String(l.latency_ms).padStart(6)}ms`);
  if (l.ok) {
    console.error(`      ${l.evidence.slice(0, 200)}`);
  } else {
    console.error(`      ERROR: ${l.error?.slice(0, 240) ?? "unknown"}`);
  }
}
console.error("");
for (const n of result.notes) console.error(`  note: ${n}`);
console.error("");

// Exit code reflects overall: 0 pass, 2 partial, 1 fail.
// Dynamic check reads this AND the JSON; partial-pass is treated
// as informative (some layers shipped), not blocking on its own.
const exitCode =
  result.overall === "pass" ? EXIT_PASS
  : result.overall === "partial_pass" ? EXIT_PARTIAL
  : EXIT_FAIL;

// Machine-readable output on stdout. Exit only from the write callback:
// calling process.exit() right after a stdout write can drop buffered
// output when stdout is a pipe, which is exactly how the dynamic check
// captures this JSON.
process.stdout.write(`${JSON.stringify(result, null, 2)}\n`, () => {
  process.exit(exitCode);
});
// Service endpoints; each can be overridden through the environment.
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const LANGFUSE = process.env.LANGFUSE_URL ?? "http://localhost:3001";
const BRIDGE = process.env.CONTEXT7_BRIDGE_URL ?? "http://localhost:3900";
const FIXTURE_ID = "auditor:hybrid_38_40_45:v1";
const TEST_WORKER_NAME = "__auditor_test_worker__";
const STALE_HASH = "stale-hash-for-drift-proof";
// Unique-per-run identifiers so each fixture invocation hits the ADD
// path in upsert_entry (not UPDATE/NOOP on a leftover from a prior
// run). Catches state pollution + avoids interaction with task #12
// (UPDATE branch silently drops doc_refs).
const RUN_NONCE = Date.now().toString(36);
const TEST_WORKER_NAME_VERSIONED = `__auditor_test_worker_${RUN_NONCE}__`;
const TEST_OPERATION_VERSIONED = `TEST: fill: __auditor_${RUN_NONCE}__ x1 in TestCity, TC`;

// Value type of per-layer metrics. NOTE(review): reconstructed from how
// the layers below populate it (numbers, booleans, and string ids) —
// confirm against the original declaration.
type LayerMetrics = Record<string, number | string | boolean>;

export interface LayerResult {
  layer: string;
  phase: string;
  ok: boolean;
  latency_ms: number;
  evidence: string;
  error?: string;
  // Layer-specific numbers that downstream checks can aggregate.
  metrics?: LayerMetrics;
}

export interface FixtureResult {
  fixture: string;
  ran_at: string;
  overall: "pass" | "partial_pass" | "fail";
  layers: LayerResult[];
  // Flattened numeric metrics: "<layer>_latency_ms" and "<layer>.<metric>".
  real_numbers: Record<string, number>;
  shipped_phases: string[];
  placeholder_phases: string[];
  notes: string[];
}

// Best-effort message extraction for anything that can be thrown.
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

// Wraps a layer's async body with timing + structured error capture.
// On throw: returns ok=false with the error string as evidence, so
// downstream verdict code treats it as a layer failure rather than
// a fixture crash. Non-Error throwables (strings, plain objects) are
// stringified instead of being dereferenced, so the catch path itself
// cannot throw.
async function measureLayer(
  layer: string,
  phase: string,
  body: () => Promise<{ evidence: string; metrics?: LayerMetrics }>,
): Promise<LayerResult> {
  const t0 = Date.now();
  try {
    const { evidence, metrics } = await body();
    return {
      layer, phase, ok: true,
      latency_ms: Date.now() - t0,
      evidence, metrics,
    };
  } catch (e: unknown) {
    const message = errorMessage(e);
    return {
      layer, phase, ok: false,
      latency_ms: Date.now() - t0,
      evidence: `FAILED: ${message.slice(0, 200)}`,
      error: message,
    };
  }
}

// Resolves the Langfuse key pair from the environment, falling back to
// the built-in defaults. Returns null only when a variable is set to an
// empty string.
async function langfuseAuth(): Promise<{ pk: string; sk: string } | null> {
  // Default credentials match mcp-server/tracing.ts. Same sources the
  // Rust side uses in gateway/src/v1/langfuse_trace.rs.
  const pk = process.env.LANGFUSE_PUBLIC_KEY ?? "pk-lf-staffing";
  const sk = process.env.LANGFUSE_SECRET_KEY ?? "sk-lf-staffing-secret";
  if (!pk || !sk) return null;
  return { pk, sk };
}

// Runs the five fixture layers in order and assembles the result.
//
// Each layer is wrapped in measureLayer, so a failing layer is recorded
// as ok=false and the remaining layers still run. `overall` is "pass"
// when every layer is ok, "partial_pass" when at least one is, "fail"
// otherwise.
export async function runHybridFixture(): Promise<FixtureResult> {
  const result: FixtureResult = {
    fixture: FIXTURE_ID,
    ran_at: new Date().toISOString(),
    overall: "fail",
    layers: [],
    real_numbers: {},
    shipped_phases: [],
    placeholder_phases: [],
    notes: [],
  };

  // ========================================================================
  // Layer 1 — Phase 38: POST /v1/chat returns valid OpenAI shape
  // ========================================================================
  // Captured HERE, immediately before the chat layer runs, so layer 2's
  // Langfuse-trace filter uses the actual moment the chat call was
  // attempted — not the fixture start time.
  const chat_request_sent_ms = Date.now();

  const l1 = await measureLayer("phase38_chat", "38", async () => {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        model: "qwen3.5:latest",
        messages: [
          { role: "system", content: "Respond in 8 words or fewer." },
          { role: "user", content: "Name one Docker command." },
        ],
        max_tokens: 60,
        temperature: 0.2,
      }),
      signal: AbortSignal.timeout(60000),
    });
    if (!r.ok) throw new Error(`gateway ${r.status}: ${await r.text()}`);
    const j: any = await r.json();
    const content = j.choices?.[0]?.message?.content;
    if (typeof content !== "string" || content.length === 0) {
      throw new Error(`empty content — think-false default may not be wired: ${JSON.stringify(j).slice(0, 200)}`);
    }
    return {
      evidence: `content="${content.slice(0, 60)}" tokens=${j.usage?.total_tokens}`,
      metrics: {
        prompt_tokens: j.usage?.prompt_tokens ?? 0,
        completion_tokens: j.usage?.completion_tokens ?? 0,
        total_tokens: j.usage?.total_tokens ?? 0,
        content_nonempty: true,
      },
    };
  });
  result.layers.push(l1);

  // ========================================================================
  // Layer 2 — Phase 40: Langfuse trace lands within 3s
  // ========================================================================
  // Fire-and-forget tracing means we need a brief sleep before query.
  await new Promise(res => setTimeout(res, 2500));
  const l2 = await measureLayer("phase40_langfuse_trace", "40", async () => {
    const auth = await langfuseAuth();
    if (!auth) throw new Error("Langfuse credentials not in env");
    const r = await fetch(`${LANGFUSE}/api/public/traces?limit=5&name=v1.chat:ollama`, {
      headers: { Authorization: `Basic ${btoa(`${auth.pk}:${auth.sk}`)}` },
      signal: AbortSignal.timeout(10000),
    });
    if (!r.ok) throw new Error(`langfuse ${r.status}: ${await r.text()}`);
    const j: any = await r.json();
    const items = Array.isArray(j.data) ? j.data : [];
    // Filter on the chat-request timestamp captured above. A Langfuse
    // trace must be newer than the moment we fired /v1/chat to plausibly
    // belong to our request.
    const recent = items.filter((t: any) => Date.parse(t.timestamp) >= chat_request_sent_ms);
    if (recent.length === 0) {
      throw new Error(`no v1.chat:ollama trace since ${new Date(chat_request_sent_ms).toISOString()} (${items.length} older traces visible, Langfuse reachable — tracing is not firing)`);
    }
    const trace = recent[0];
    return {
      evidence: `trace ${trace.id.slice(0, 8)} name=${trace.name} age=${Date.now() - Date.parse(trace.timestamp)}ms`,
      metrics: {
        trace_age_ms: Date.now() - Date.parse(trace.timestamp),
        trace_latency_reported: Number(trace.latency ?? 0),
        recent_trace_count: recent.length,
      },
    };
  });
  result.layers.push(l2);

  // ========================================================================
  // Layer 3 — Phase 45 slice 1: /seed accepts doc_refs and persists them
  // ========================================================================
  // Seed a playbook with doc_refs referencing Docker at a stale hash.
  // Ignore the cleanup concern for v1 — this fixture pollutes the
  // in-memory playbook state; operator can retire test entries.
  const l3 = await measureLayer("phase45_seed_with_doc_refs", "45.1", async () => {
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        operation: TEST_OPERATION_VERSIONED,
        approach: "auditor hybrid fixture run",
        context: "doc_refs integration test — retire after audit",
        endorsed_names: [TEST_WORKER_NAME_VERSIONED],
        append: true,
        doc_refs: [
          {
            tool: "docker",
            version_seen: "24.0.7",
            snippet_hash: STALE_HASH,
            source_url: "https://context7.com/docker/docs",
            seen_at: new Date().toISOString(),
          },
        ],
      }),
      signal: AbortSignal.timeout(30000),
    });
    if (!r.ok) throw new Error(`seed ${r.status}: ${await r.text()}`);
    const j: any = await r.json();
    // Verify doc_refs actually persisted — read state.json directly.
    // That's the canonical source of truth; the seed response may or
    // may not echo doc_refs back.
    const state_raw = await readFile("/home/profit/lakehouse/data/_playbook_memory/state.json", "utf8");
    const state = JSON.parse(state_raw);
    const entries = state.entries ?? [];
    const ours = entries.find((e: any) => (e.endorsed_names ?? []).includes(TEST_WORKER_NAME_VERSIONED));
    if (!ours) throw new Error(`seeded entry not found in state.json (worker ${TEST_WORKER_NAME_VERSIONED} not present)`);
    const docRefs = ours.doc_refs ?? [];
    if (docRefs.length === 0) {
      throw new Error("entry saved but doc_refs field is empty — the field accepted by the API isn't being persisted");
    }
    return {
      evidence: `playbook ${ours.playbook_id.slice(0, 16)} persisted with doc_refs.length=${docRefs.length}; first tool=${docRefs[0]?.tool} hash=${docRefs[0]?.snippet_hash}`,
      metrics: {
        doc_refs_stored: docRefs.length,
        entries_after: j.entries_after ?? -1,
        playbook_id: ours.playbook_id,
        mode: j.outcome?.mode ?? "unknown",
      },
    };
  });
  result.layers.push(l3);

  // ========================================================================
  // Layer 4 — Phase 45 slice 2: context7 bridge detects drift vs stale hash
  // ========================================================================
  const l4 = await measureLayer("phase45_bridge_diff", "45.2", async () => {
    // Health check first — if bridge isn't running, fail with a clear
    // message rather than mysterious connection refused.
    try {
      const h = await fetch(`${BRIDGE}/health`, { signal: AbortSignal.timeout(3000) });
      if (!h.ok) throw new Error(`bridge /health ${h.status}`);
    } catch (e: unknown) {
      throw new Error(`context7 bridge at ${BRIDGE} is not reachable (bridge is manual-start; run 'bun run mcp-server/context7_bridge.ts'): ${errorMessage(e)}`);
    }
    const r = await fetch(`${BRIDGE}/docs/docker/diff?since=${encodeURIComponent(STALE_HASH)}`, {
      signal: AbortSignal.timeout(30000),
    });
    if (!r.ok) throw new Error(`bridge diff ${r.status}: ${await r.text()}`);
    const j: any = await r.json();
    if (j.drifted !== true) {
      throw new Error(`expected drifted=true against ${STALE_HASH}; got ${JSON.stringify(j)}`);
    }
    return {
      evidence: `bridge confirms drift: current=${j.current_snippet_hash} previous=${j.previous_snippet_hash} upstream_updated=${j.last_updated_upstream}`,
      metrics: {
        drifted: true,
        current_hash_length: String(j.current_snippet_hash ?? "").length,
        library_id: j.library_id,
      },
    };
  });
  result.layers.push(l4);

  // ========================================================================
  // Layer 5 — Phase 45 slice 3: /doc_drift/check/{id} endpoint
  // (EXPECTED TO FAIL — endpoint unimplemented)
  // ========================================================================
  const UNIMPLEMENTED_MARKER = "endpoint unimplemented";
  const l5 = await measureLayer("phase45_slice3_drift_flag", "45.3", async () => {
    const pid = String(l3.metrics?.playbook_id ?? "");
    if (!pid) throw new Error("layer 3 did not produce a playbook_id to check");
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/doc_drift/check/${encodeURIComponent(pid)}`, {
      method: "POST",
      signal: AbortSignal.timeout(10000),
    });
    if (r.status === 404 || r.status === 405) {
      // This is the HONEST signal: the endpoint doesn't exist yet.
      // Fail this layer with a clear marker — not a silent pass.
      throw new Error(`${UNIMPLEMENTED_MARKER} (${r.status}) — Phase 45 slice 3 is still placeholder. doc_refs are persisted (layer 3) and the bridge can answer drift questions (layer 4), but nothing ties them together into a playbook flag yet.`);
    }
    if (!r.ok) throw new Error(`drift/check ${r.status}: ${await r.text()}`);
    const j: any = await r.json();
    // If the endpoint DID exist, assert the flag got set.
    if (!j.flagged) {
      throw new Error(`endpoint responded but flag not set: ${JSON.stringify(j).slice(0, 200)}`);
    }
    return {
      evidence: `playbook flagged: ${JSON.stringify(j).slice(0, 160)}`,
      metrics: { flagged: true },
    };
  });
  result.layers.push(l5);

  // ========================================================================
  // Verdict assembly
  // ========================================================================
  for (const l of result.layers) {
    if (l.ok) result.shipped_phases.push(l.phase);
    else result.placeholder_phases.push(l.phase);
    result.real_numbers[`${l.layer}_latency_ms`] = l.latency_ms;
    if (l.metrics) {
      // Only numeric metrics are aggregated; ids and flags stay on the layer.
      for (const [k, v] of Object.entries(l.metrics)) {
        if (typeof v === "number") result.real_numbers[`${l.layer}.${k}`] = v;
      }
    }
  }

  const anyFail = result.placeholder_phases.length > 0;
  const allPass = result.layers.every(l => l.ok);
  result.overall = allPass ? "pass" : (result.shipped_phases.length > 0 ? "partial_pass" : "fail");

  if (!l5.ok) {
    // Only describe the failure as "expected" when it really is the
    // 404/405 placeholder signal. A 5xx, a timeout, or a missing
    // playbook_id from layer 3 is a different problem and must not be
    // reported as the known gap.
    if (l5.error?.startsWith(UNIMPLEMENTED_MARKER)) {
      result.notes.push(
        "Phase 45 slice 3 (doc_drift/check endpoint) is the expected-fail layer. " +
        "Layer 5 failing with a 404/405 is the CORRECT honest signal — don't mask this " +
        "to get a green result. When slice 3 ships, layer 5 flips to ok=true automatically."
      );
    } else {
      result.notes.push(
        "Layer 5 failed for a reason OTHER than the expected 404/405 placeholder signal: " +
        `${(l5.error ?? "unknown").slice(0, 200)}`
      );
    }
  }
  if (anyFail) {
    result.notes.push(
      `${result.placeholder_phases.length} layer(s) failed. Check evidence per layer for specifics.`
    );
  }

  return result;
}
const HOST = process.env.GITEA_HOST ?? "https://git.agentview.dev";
const OWNER = "profit";
const REPO = "lakehouse";
const CRED_FILE = "/home/profit/.git-credentials";
// Page size requested from the pulls listing, and an upper bound on how
// many pages one listing will walk.
const PR_PAGE_SIZE = 50;
const MAX_PR_PAGES = 20;

let cachedPat: string | null = null;

/// Finds the password/token stored for `hostUrl` in git-credentials
/// text (one `scheme://user:secret@host` entry per line). Returns null
/// when no entry matches the scheme and host[:port] of `hostUrl`.
///
/// The host is taken from `hostUrl` rather than hard-coded so that a
/// token stored for one server is never picked up and sent to another
/// when GITEA_HOST is overridden.
function patFromCredentials(raw: string, hostUrl: string): string | null {
  const u = new URL(hostUrl);
  const esc = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`^${esc(u.protocol)}//[^:]+:([^@]+)@${esc(u.host)}(?:/|$)`);
  for (const line of raw.split(/\r?\n/)) {
    const token = line.match(re)?.[1];
    if (token !== undefined) return token;
  }
  return null;
}

/// Returns the Gitea PAT for HOST, reading CRED_FILE on first use and
/// caching the value for the life of the process. Throws when the file
/// has no entry for HOST.
async function getPat(): Promise<string> {
  if (cachedPat) return cachedPat;
  const raw = await readFile(CRED_FILE, "utf8");
  const pat = patFromCredentials(raw, HOST);
  if (pat === null) throw new Error(`no Gitea PAT in ${CRED_FILE}`);
  cachedPat = pat;
  return pat;
}

/// Authenticated fetch against `${HOST}/api/v1${path}`. Adds the token
/// header, defaults content-type to JSON when a body is present, and
/// applies a 20s timeout (any `signal` in `init` is replaced by it).
async function giteaFetch(path: string, init: RequestInit = {}): Promise<Response> {
  const pat = await getPat();
  const url = `${HOST}/api/v1${path}`;
  const headers = new Headers(init.headers);
  headers.set("Authorization", `token ${pat}`);
  if (init.body && !headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  return fetch(url, { ...init, headers, signal: AbortSignal.timeout(20000) });
}

/// Lists open PRs as snapshots. Walks result pages until a short page is
/// returned (or MAX_PR_PAGES is hit) so PRs beyond the first page are
/// not silently ignored.
export async function listOpenPrs(): Promise<PrSnapshot[]> {
  const rows: any[] = [];
  for (let page = 1; page <= MAX_PR_PAGES; page++) {
    const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls?state=open&page=${page}&limit=${PR_PAGE_SIZE}`);
    if (!r.ok) throw new Error(`listOpenPrs ${r.status}: ${await r.text()}`);
    const batch = (await r.json()) as any[];
    rows.push(...batch);
    if (batch.length < PR_PAGE_SIZE) break;
  }
  return Promise.all(rows.map(row => snapshotFromPr(row)));
}

/// Fetches one PR by number and returns its snapshot.
export async function getPrSnapshot(num: number): Promise<PrSnapshot> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}`);
  if (!r.ok) throw new Error(`getPr ${num} ${r.status}: ${await r.text()}`);
  return snapshotFromPr((await r.json()) as any);
}

/// GETs a JSON array; on a non-2xx response logs a warning and returns
/// []. The snapshot is still produced in that case (best effort), but
/// the gap is made visible because an empty commit list means no claims
/// get parsed for the PR.
async function fetchListBestEffort(path: string, label: string): Promise<any[]> {
  const r = await giteaFetch(path);
  if (r.ok) return (await r.json()) as any[];
  console.warn(`[gitea] ${label} ${r.status} — continuing with an empty list`);
  return [];
}

/// Builds a PrSnapshot from a raw pull-request row plus its commits and
/// changed files (fetched in parallel). Commit SHAs are truncated to 12
/// characters; missing fields fall back to empty strings / zero.
async function snapshotFromPr(row: any): Promise<PrSnapshot> {
  const num = row.number;
  const [commits, files] = await Promise.all([
    fetchListBestEffort(`/repos/${OWNER}/${REPO}/pulls/${num}/commits`, `PR #${num} commits`),
    fetchListBestEffort(`/repos/${OWNER}/${REPO}/pulls/${num}/files`, `PR #${num} files`),
  ]);
  return {
    number: num,
    head_sha: row.head?.sha ?? "",
    base_sha: row.base?.sha ?? "",
    title: row.title ?? "",
    body: row.body ?? "",
    state: row.state === "open" ? "open" : (row.merged ? "merged" : "closed"),
    author: row.user?.login ?? "",
    commits: commits.map(c => ({
      sha: (c.sha ?? "").slice(0, 12),
      message: c.commit?.message ?? "",
      author: c.commit?.author?.name ?? "",
    })),
    files: files.map(f => ({
      path: f.filename ?? "",
      additions: f.additions ?? 0,
      deletions: f.deletions ?? 0,
    })),
  };
}

/// Returns the unified diff text of the PR. Used by static checks.
export async function getPrDiff(num: number): Promise<string> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}.diff`);
  if (!r.ok) throw new Error(`getDiff ${num} ${r.status}: ${await r.text()}`);
  return await r.text();
}

/// Hard-block mechanism: post a failing commit status on the PR head
/// SHA. Branch protection (if enabled on `main`) treats this as a
/// required-check fail and prevents merge. The description is shown
/// in the Gitea UI next to the red X and is truncated to 140 chars.
export async function postCommitStatus(args: {
  sha: string;
  state: "success" | "pending" | "failure" | "error";
  context: string;
  description: string;
  target_url?: string;
}): Promise<void> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/statuses/${args.sha}`, {
    method: "POST",
    body: JSON.stringify({
      state: args.state,
      context: args.context,
      description: args.description.slice(0, 140),
      target_url: args.target_url ?? "",
    }),
  });
  if (!r.ok) throw new Error(`postCommitStatus ${r.status}: ${await r.text()}`);
}
/// Post a review on a PR, pinned to `commit_id`, with the given event
/// (APPROVE / REQUEST_CHANGES / COMMENT). Gitea typically blocks
/// self-review (author posting a review on their own PR). Prefer
/// `postIssueComment` when running with the author's PAT.
/// Throws on any non-2xx response, including the status and body text.
export async function postReview(args: {
  pr_number: number;
  commit_id: string;
  body: string;
  event: "APPROVE" | "REQUEST_CHANGES" | "COMMENT";
}): Promise<void> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${args.pr_number}/reviews`, {
    method: "POST",
    body: JSON.stringify({
      commit_id: args.commit_id,
      body: args.body,
      event: args.event,
    }),
  });
  if (!r.ok) throw new Error(`postReview ${r.status}: ${await r.text()}`);
}

/// Plain issue comment. Works for the auditor's own PAT because
/// Gitea allows authors to comment on their own PRs (just not
/// review them). Auditor uses this for the reasoning body; the
/// actual block signal is the commit status.
/// Returns the `id` and `html_url` fields of the response as-is (they
/// are not validated). Throws on any non-2xx response.
export async function postIssueComment(args: {
  pr_number: number;
  body: string;
}): Promise<{ id: number; html_url: string }> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/issues/${args.pr_number}/comments`, {
    method: "POST",
    body: JSON.stringify({ body: args.body }),
  });
  if (!r.ok) throw new Error(`postIssueComment ${r.status}: ${await r.text()}`);
  const j = await r.json() as any;
  return { id: j.id, html_url: j.html_url };
}
const POLL_INTERVAL_MS = 90_000; // 90s — enough budget for audit runs to complete
const PAUSE_FILE = "/home/profit/lakehouse/auditor.paused";
const STATE_FILE = "/home/profit/lakehouse/data/_auditor/state.json";

interface State {
  // Map: PR number → last-audited head SHA. Lets us dedupe audits
  // across restarts (poller can crash/restart without re-auditing
  // all open PRs from scratch).
  last_audited: Record<string, string>;
  started_at: string;
  cycles_total: number;
  cycles_skipped_paused: number;
  audits_run: number;
  last_cycle_at?: string;
}

// True when `path` is accessible to this process.
async function fileExists(path: string): Promise<boolean> {
  try { await access(path); return true; } catch { return false; }
}

// A zeroed state stamped with the current time.
function freshState(): State {
  return {
    last_audited: {},
    started_at: new Date().toISOString(),
    cycles_total: 0,
    cycles_skipped_paused: 0,
    audits_run: 0,
  };
}

// Loads persisted poller state, defaulting any missing field.
// A missing file is the normal first-run case and is silent. Any other
// failure (unreadable file, invalid JSON) still falls back to a fresh
// state, but is logged: with an empty `last_audited` every open PR gets
// audited again, and the operator should be able to see why.
async function loadState(): Promise<State> {
  try {
    const raw = await readFile(STATE_FILE, "utf8");
    const s = JSON.parse(raw);
    return {
      last_audited: s.last_audited ?? {},
      started_at: s.started_at ?? new Date().toISOString(),
      cycles_total: s.cycles_total ?? 0,
      cycles_skipped_paused: s.cycles_skipped_paused ?? 0,
      audits_run: s.audits_run ?? 0,
      last_cycle_at: s.last_cycle_at,
    };
  } catch (e: unknown) {
    const code = typeof e === "object" && e !== null && "code" in e
      ? (e as { code?: unknown }).code
      : undefined;
    if (code !== "ENOENT") {
      const msg = e instanceof Error ? e.message : String(e);
      console.error(`[auditor] state file ${STATE_FILE} unusable (${msg}) — starting from empty state; open PRs will be re-audited`);
    }
    return freshState();
  }
}

// Writes the state as pretty-printed JSON, creating the directory if
// needed.
async function saveState(s: State): Promise<void> {
  await mkdir("/home/profit/lakehouse/data/_auditor", { recursive: true });
  await writeFile(STATE_FILE, JSON.stringify(s, null, 2));
}

// One poll cycle: honour the pause file, list open PRs, and audit every
// PR whose head SHA differs from the last audited one. Mutates and
// returns `state`. A failed listing or a failed audit is logged and the
// cycle continues; a PR whose audit failed is NOT marked as audited, so
// it is retried next cycle.
async function runCycle(state: State): Promise<State> {
  state.cycles_total += 1;
  state.last_cycle_at = new Date().toISOString();

  if (await fileExists(PAUSE_FILE)) {
    state.cycles_skipped_paused += 1;
    console.log(`[auditor] cycle ${state.cycles_total}: paused (touch ${PAUSE_FILE} exists)`);
    return state;
  }

  let prs: Awaited<ReturnType<typeof listOpenPrs>>;
  try {
    prs = await listOpenPrs();
  } catch (e) {
    console.error(`[auditor] listOpenPrs failed: ${(e as Error).message}`);
    return state;
  }

  console.log(`[auditor] cycle ${state.cycles_total}: ${prs.length} open PR(s)`);

  for (const pr of prs) {
    const last = state.last_audited[String(pr.number)];
    if (last === pr.head_sha) {
      console.log(`[auditor] skip PR #${pr.number} (SHA ${pr.head_sha.slice(0, 8)} already audited)`);
      continue;
    }
    console.log(`[auditor] audit PR #${pr.number} (${pr.head_sha.slice(0, 8)}) — ${pr.title.slice(0, 60)}`);
    try {
      // Skip dynamic by default: it mutates live playbook state and
      // re-runs on every PR update would pollute quickly. Operator
      // can run dynamic via `bun run auditor/fixtures/cli.ts` manually
      // OR set LH_AUDITOR_RUN_DYNAMIC=1 to opt in.
      const run_dynamic = process.env.LH_AUDITOR_RUN_DYNAMIC === "1";
      const verdict = await auditPr(pr, {
        skip_dynamic: !run_dynamic,
        skip_inference: process.env.LH_AUDITOR_SKIP_INFERENCE === "1",
      });
      console.log(`[auditor] verdict=${verdict.overall} findings=${verdict.metrics.findings_total} (block=${verdict.metrics.findings_block} warn=${verdict.metrics.findings_warn})`);
      state.last_audited[String(pr.number)] = pr.head_sha;
      state.audits_run += 1;
    } catch (e) {
      console.error(`[auditor] audit failed: ${(e as Error).message}`);
      continue;
    }
    // Persist right after each completed audit. If the process dies
    // later in the cycle, the PRs already audited (and commented on)
    // are not audited and commented on a second time after restart.
    try {
      await saveState(state);
    } catch (e) {
      console.error(`[auditor] could not persist state after PR #${pr.number}: ${(e as Error).message}`);
    }
  }

  return state;
}

// Entry point: loads state, then either runs one cycle (`--once`) or
// loops forever, saving state after every cycle and sleeping
// POLL_INTERVAL_MS between cycles.
async function main(): Promise<void> {
  console.log(`[auditor] starting poller — interval ${POLL_INTERVAL_MS / 1000}s`);
  console.log(`[auditor] pause file: ${PAUSE_FILE}`);
  console.log(`[auditor] state file: ${STATE_FILE}`);

  let state = await loadState();
  console.log(`[auditor] loaded state: ${Object.keys(state.last_audited).length} PRs previously audited, ${state.cycles_total} cycles so far`);

  // Single-shot mode for CLI testing: `bun run auditor/index.ts --once`
  const once = process.argv.includes("--once");
  if (once) {
    state = await runCycle(state);
    await saveState(state);
    console.log(`[auditor] single-shot complete. total audits: ${state.audits_run}`);
    return;
  }

  // Loop.
  while (true) {
    state = await runCycle(state);
    await saveState(state);
    await new Promise(res => setTimeout(res, POLL_INTERVAL_MS));
  }
}

main().catch(e => {
  console.error("[auditor] fatal:", e);
  process.exit(1);
});
/// Translate the four-check output into a single verdict. This is the
/// single pane of glass the auditor operates on — tune thresholds here.
///
/// Rules, first match wins:
///   1. any "block" finding           → "block"
///   2. three or more "warn" findings → "block" (death by a thousand cuts)
///   3. one or two "warn" findings    → "request_changes"
///   4. otherwise                     → "approve"
///
/// `findings` and `metrics` are passed through unchanged; `audited_at`
/// is stamped with the current time. `one_liner` is a short
/// justification for the verdict.
export function assembleVerdict(
  findings: Finding[],
  metrics: Record<string, number>,
  pr_number: number,
  head_sha: string,
): Verdict {
  const blocking = findings.filter(f => f.severity === "block");
  const warning = findings.filter(f => f.severity === "warn");
  const firstBlocking = blocking[0];
  const firstWarning = warning[0];

  let overall: Verdict["overall"];
  let one_liner: string;

  if (firstBlocking !== undefined) {
    overall = "block";
    one_liner = `${blocking.length} blocking issue${blocking.length > 1 ? "s" : ""}: ${firstBlocking.summary}`;
  } else if (warning.length >= 3) {
    // Three or more warnings is a block — death by a thousand cuts.
    // (This branch used to return "request_changes", which made it
    // indistinguishable from the 1–2 warning case below.)
    overall = "block";
    one_liner = `${warning.length} warnings — see review`;
  } else if (firstWarning !== undefined) {
    overall = "request_changes";
    one_liner = firstWarning.summary;
  } else {
    overall = "approve";
    one_liner = `all checks passed (${findings.length} findings, all info)`;
  }

  return {
    pr_number,
    head_sha,
    audited_at: new Date().toISOString(),
    overall,
    findings,
    metrics,
    one_liner,
  };
}
/// Which strength-of-claim warrants which severity when evidence is
/// weak? A "Phase X shipped" claim with zero integration tests is a
/// blocker. A "should work" claim with no test is a warn.
///
///   evidence \ claim | strong | moderate | weak
///   -----------------+--------+----------+------
///   none             | block  | warn     | warn
///   partial          | warn   | info     | info
///   full             | info   | info     | info
///
/// The weak/none cell previously fell through to "info", contradicting
/// the "should work … is a warn" rule above ("should work" is graded as
/// a weak claim by the claim parser).
export function severityFromClaimEvidence(
  claim_strength: "weak" | "moderate" | "strong",
  evidence_grade: "none" | "partial" | "full",
): "info" | "warn" | "block" {
  if (evidence_grade === "full") return "info";
  if (evidence_grade === "none") {
    return claim_strength === "strong" ? "block" : "warn";
  }
  // evidence_grade === "partial"
  return claim_strength === "strong" ? "warn" : "info";
}

// Shared types for the claim-auditor. Every field exists for a reason;
// if something can't be verified from a check, it goes into `evidence`
// so the verdict is inspectable, not a black box.

export type CheckKind = "static" | "dynamic" | "inference" | "kb_query";

export type Severity = "info" | "warn" | "block";

export interface Claim {
  // Verbatim phrase that raised the claim — e.g. "Phase 38 shipped",
  // "verified end-to-end", "works after restart". Used as the "what
  // does the author assert" input to downstream checks.
  text: string;
  // Where it came from. `commit_sha` is the short hash; `location`
  // is a file:line for in-diff claims, or "pr_body" / "commit_message".
  commit_sha: string;
  location: string;
  // Heuristic rating of how strong the claim is. "green+tested"
  // is strong; "should work" is weak. Drives sensitivity — stronger
  // claims get harder-blocked on weak evidence.
  strength: "weak" | "moderate" | "strong";
}

export interface Finding {
  check: CheckKind;
  severity: Severity;
  claim_text?: string;
  // Free-form short description: "field added but never read", "no
  // test covers this code path", "cloud model says placeholder".
  summary: string;
  // Concrete evidence: file paths, line numbers, log excerpts, test
  // output, cloud-model verdict. No handwaving.
  evidence: string[];
}

export interface Verdict {
  pr_number: number;
  head_sha: string;
  audited_at: string;
  overall: "approve" | "request_changes" | "block";
  findings: Finding[];
  // Real numbers that downstream policy can gate on. e.g. if the
  // hybrid test produced latency numbers or token counts, they
  // surface here so /auditor/history is queryable.
  // NOTE(review): value type reconstructed as number from the comment
  // above and from the numeric findings_* counters — confirm.
  metrics: Record<string, number>;
  // Short one-line justification for the `overall` verdict. What
  // gets posted as the commit-status description in Gitea (max 140
  // chars) must fit here.
  one_liner: string;
}

export interface PrSnapshot {
  number: number;
  head_sha: string;
  base_sha: string;
  title: string;
  body: string;
  state: "open" | "closed" | "merged";
  author: string;
  // Array of commit messages in the PR (not diffs — those are
  // fetched on-demand per-check).
  commits: Array<{ sha: string; message: string; author: string }>;
  // File paths touched by the PR, with lines-added / lines-removed.
  files: Array<{ path: string; additions: number; deletions: number }>;
}