Merge pull request 'Auditor: PR-claim hard-block reviewer (scaffold)' (#1) from auditor/scaffold into main
Commit b6d69b2e82

auditor/README.md (new file, 53 lines)
@@ -0,0 +1,53 @@
# Lakehouse Claim Auditor

A Bun sub-agent that watches open PRs on Gitea, reads the ship-claims
in commit messages and PR bodies, and **hard-blocks** merges when the
code doesn't back the claim.

Rationale: when "compiles + one curl works" gets called "phase shipped,"
placeholder code accumulates. This auditor runs every 90s, fetches
each open PR, and subjects it to four checks:

1. **Static diff** — grep/parse looking for placeholder patterns
2. **Dynamic** — runs the never-before-executed hybrid test fixture
3. **Cloud inference** — asks `gpt-oss:120b` via `/v1/chat` to
   identify gaps in the diff
4. **KB query** — looks up `data/_kb/` + observer for prior failure
   patterns on similar claims

The verdict is assembled and posted to Gitea as:
- A **failing commit status** (hard block — branch protection
  prevents merge)
- A **review comment** explaining every finding

## Run manually

```bash
cd /home/profit/lakehouse
bun run auditor/index.ts
```

Defaults: polls every 90s; stops while an `auditor.paused` file is present.
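
One way to toggle that stop condition from a shell (a sketch; the README above does not pin down the path the poller checks, so the repo root is an assumption here):

```bash
# pause the auditor
touch /home/profit/lakehouse/auditor.paused

# resume polling
rm /home/profit/lakehouse/auditor.paused
```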

## State

- `data/_auditor/state.json` — last-audited head SHA per PR
- `data/_auditor/verdicts/{pr}-{sha}.json` — per-run verdict record

## Where YOU edit

`auditor/policy.ts` — the verdict assembler. Controls which findings
block vs warn vs inform. All other code is mechanical: fetching,
running checks, posting to Gitea.

## Hard-block mechanism

1. Commit status is posted as `failure` with context `lakehouse/auditor`
2. If `main` branch protection requires the `lakehouse/auditor` status
   to pass, Gitea prevents merge
3. When code is fixed and re-audit passes, the status flips to `success`
   and merge unblocks

Enable branch protection (one-time, via Gitea UI or API):
- `POST /repos/profit/lakehouse/branch_protections`
- `{"branch_name": "main", "required_status_checks": {"contexts": ["lakehouse/auditor"]}}`

auditor/audit.ts (new file, 171 lines)
@@ -0,0 +1,171 @@
// Orchestrator — runs all four checks on a PR, assembles a verdict,
// posts to Gitea. This is task #8's integration layer; the poller
// (task #9) calls this once per PR on every fresh head SHA.
//
// Hard-block mechanism: commit status posted with state="failure"
// and context="lakehouse/auditor". If `main` branch protection
// requires that context to pass, merge is physically impossible
// until the auditor re-audits a fixed commit and flips the status
// to "success".
//
// Human-readable reasoning: posted as a PR issue comment (not a
// review — reviews have self-review restrictions on Gitea and the
// auditor currently uses the same PAT as the PR author).

import { readFile, writeFile, mkdir } from "node:fs/promises";
import { join } from "node:path";
import type { PrSnapshot, Verdict, Finding } from "./types.ts";
import { getPrDiff, postCommitStatus, postIssueComment } from "./gitea.ts";
import { parseClaims } from "./claim_parser.ts";
import { assembleVerdict } from "./policy.ts";
import { runStaticCheck } from "./checks/static.ts";
import { runDynamicCheck } from "./checks/dynamic.ts";
import { runInferenceCheck } from "./checks/inference.ts";
import { runKbCheck } from "./checks/kb_query.ts";

const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts";

export interface AuditOptions {
  // Skip the cloud inference call (fast path for iteration). Default false.
  skip_inference?: boolean;
  // Skip the dynamic check (avoid running the hybrid fixture every PR,
  // since it hits live services and mutates playbook state). Default false
  // on `main`-branch-target PRs, true when auditing feature branches
  // where the fixture would pollute state. Caller decides.
  skip_dynamic?: boolean;
  // Skip Gitea posting — useful for dry-runs / local testing.
  // Default false.
  dry_run?: boolean;
}

export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<Verdict> {
  const t0 = Date.now();
  const diff = await getPrDiff(pr.number);
  const { claims } = parseClaims(pr);

  // Run checks in parallel where they don't share mutable state.
  // Static + kb_query + inference are all read-only. Dynamic mutates
  // playbook state (nonce-scoped per run, but still live) so if
  // skip_dynamic is false we still run it in parallel — the mutation
  // is namespaced.
  const [staticFindings, dynamicFindings, inferenceFindings, kbFindings] = await Promise.all([
    runStaticCheck(diff),
    opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
    opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
    runKbCheck(claims),
  ]);

  const allFindings: Finding[] = [
    ...staticFindings,
    ...dynamicFindings,
    ...inferenceFindings,
    ...kbFindings,
  ];

  const duration_ms = Date.now() - t0;
  const metrics = {
    audit_duration_ms: duration_ms,
    findings_total: allFindings.length,
    findings_block: allFindings.filter(f => f.severity === "block").length,
    findings_warn: allFindings.filter(f => f.severity === "warn").length,
    findings_info: allFindings.filter(f => f.severity === "info").length,
    claims_strong: claims.filter(c => c.strength === "strong").length,
    claims_moderate: claims.filter(c => c.strength === "moderate").length,
    claims_weak: claims.filter(c => c.strength === "weak").length,
    claims_total: claims.length,
    diff_bytes: diff.length,
  };

  const verdict = assembleVerdict(allFindings, metrics, pr.number, pr.head_sha);

  await persistVerdict(verdict);

  if (!opts.dry_run) {
    await postToGitea(verdict);
  }

  return verdict;
}

async function persistVerdict(v: Verdict): Promise<void> {
  await mkdir(VERDICTS_DIR, { recursive: true });
  const filename = `${v.pr_number}-${v.head_sha.slice(0, 12)}.json`;
  await writeFile(join(VERDICTS_DIR, filename), JSON.stringify(v, null, 2));
}

export async function postToGitea(v: Verdict): Promise<void> {
  // 1. Commit status — the hard block signal (if branch protection
  //    is configured to require lakehouse/auditor on main).
  const state = v.overall === "approve" ? "success" : "failure";
  await postCommitStatus({
    sha: v.head_sha,
    state,
    context: "lakehouse/auditor",
    description: v.one_liner,
    target_url: "", // no URL yet; could point to a verdicts dashboard
  });

  // 2. Issue comment — the reasoning. Gated so we don't spam the PR
  //    with identical comments on re-audits of the same SHA. Caller
  //    (poller) ensures we only re-audit fresh SHAs, but a dedup
  //    marker inside the body keeps it idempotent if re-run.
  const body = formatReviewBody(v);
  await postIssueComment({ pr_number: v.pr_number, body });
}

function formatReviewBody(v: Verdict): string {
  const byCheck: Record<string, Finding[]> = {};
  for (const f of v.findings) {
    (byCheck[f.check] ||= []).push(f);
  }

  const verdictEmoji =
    v.overall === "approve" ? "✅" :
    v.overall === "request_changes" ? "⚠️" :
    "🛑";

  const lines: string[] = [];
  lines.push(`## Auditor verdict: ${verdictEmoji} \`${v.overall}\``);
  lines.push("");
  lines.push(`**One-liner:** ${v.one_liner}`);
  lines.push(`**Head SHA:** \`${v.head_sha.slice(0, 12)}\``);
  lines.push(`**Audited at:** ${v.audited_at}`);
  lines.push("");

  // Per-check sections, only if the check produced findings.
  const checkOrder = ["static", "dynamic", "inference", "kb_query"] as const;
  for (const check of checkOrder) {
    const fs = byCheck[check] ?? [];
    if (fs.length === 0) continue;
    const bySev = {
      block: fs.filter(f => f.severity === "block").length,
      warn: fs.filter(f => f.severity === "warn").length,
      info: fs.filter(f => f.severity === "info").length,
    };
    lines.push(`<details><summary><b>${check}</b> — ${fs.length} findings (${bySev.block} block, ${bySev.warn} warn, ${bySev.info} info)</summary>`);
    lines.push("");
    for (const f of fs) {
      const mark = f.severity === "block" ? "🛑" : f.severity === "warn" ? "⚠️" : "ℹ️";
      lines.push(`${mark} **${f.severity}** — ${f.summary}`);
      for (const e of f.evidence.slice(0, 3)) {
        lines.push(`  - \`${e.slice(0, 180).replace(/\n/g, " ")}\``);
      }
    }
    lines.push("");
    lines.push("</details>");
    lines.push("");
  }

  lines.push("### Metrics");
  lines.push("```json");
  lines.push(JSON.stringify(v.metrics, null, 2));
  lines.push("```");
  lines.push("");
  lines.push(`<sub>Lakehouse auditor · SHA ${v.head_sha.slice(0, 8)} · re-audit on new commit flips the status automatically.</sub>`);

  return lines.join("\n");
}

function stubFinding(check: "dynamic" | "inference", why: string): Finding[] {
  return [{ check, severity: "info", summary: `${check} check skipped — ${why}`, evidence: [why] }];
}
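
// Illustrative call shape for local iteration (a sketch only — the poller in
// task #9 is the real caller, and the PrSnapshot literal below is a
// hypothetical example value, not a recorded PR):
//
//   const verdict = await auditPr(
//     { number: 1, head_sha: "b6d69b2e82", body: "Phase 45 shipped", commits: [] },
//     { dry_run: true, skip_inference: true, skip_dynamic: true },
//   );
//   console.log(verdict.overall, verdict.findings.length);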

auditor/checks/dynamic.ts (new file, 91 lines)
@@ -0,0 +1,91 @@
// Dynamic execution check — runs the hybrid fixture and maps its
// layer results to auditor Findings.
//
// A layer that fails with a "not implemented / 404 / slice N" error
// gets severity=info (honest placeholder signal). A layer that fails
// any other way gets severity=warn (something actually broke).
// An info-level summary finding is always emitted carrying the real
// numbers — shipped/placeholder phase counts, per-layer latency.

import { runHybridFixture } from "../fixtures/hybrid_38_40_45.ts";
import type { Finding } from "../types.ts";

const PLACEHOLDER_MARKERS = [
  "unimplemented",
  " 404 ", "(404)", " 405 ", "(405)",
  "slice 3", "slice 4", "slice 5",
  "endpoint not built", "not yet",
];

function isPlaceholderFailure(err?: string): boolean {
  if (!err) return false;
  const low = err.toLowerCase();
  return PLACEHOLDER_MARKERS.some(m => low.includes(m.toLowerCase()));
}

export async function runDynamicCheck(): Promise<Finding[]> {
  const findings: Finding[] = [];

  let result;
  try {
    result = await runHybridFixture();
  } catch (e) {
    // Fixture itself crashed — can't run dynamic check at all.
    return [
      {
        check: "dynamic",
        severity: "warn",
        summary: `hybrid fixture crashed before completing: ${(e as Error).message.slice(0, 140)}`,
        evidence: [(e as Error).message],
      },
    ];
  }

  // Per-layer findings for every non-ok layer.
  for (const layer of result.layers) {
    if (layer.ok) continue;
    const placeholder = isPlaceholderFailure(layer.error);
    findings.push({
      check: "dynamic",
      severity: placeholder ? "info" : "warn",
      summary: placeholder
        ? `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) honestly reports unimplemented`
        : `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) failed — not a placeholder, a real failure`,
      evidence: [
        `evidence: ${layer.evidence.slice(0, 160)}`,
        ...(layer.error ? [`error: ${layer.error.slice(0, 160)}`] : []),
        `latency_ms: ${layer.latency_ms}`,
      ],
    });
  }

  // One overall summary with real numbers so the report shows what
  // DID pass plus per-layer timing.
  const metrics_preview = Object.entries(result.real_numbers)
    .slice(0, 10)
    .map(([k, v]) => `${k}=${v}`);
  findings.push({
    check: "dynamic",
    severity: "info",
    summary: `hybrid fixture overall=${result.overall}, shipped [${result.shipped_phases.join(", ")}], placeholder [${result.placeholder_phases.join(", ")}]`,
    evidence: metrics_preview.length > 0 ? metrics_preview : ["no metrics emitted"],
  });

  // If the fixture ran at all but nothing passed, elevate one of the
  // summary findings to warn — something more than "all honest
  // placeholders" is wrong.
  if (result.overall === "fail") {
    findings.push({
      check: "dynamic",
      severity: "warn",
      summary: `hybrid fixture: 0 layers passed (overall=fail)`,
      evidence: [
        "a total fixture fail usually means a precondition service is down",
        "(gateway /health / sidecar / Langfuse /v1/chat) — NOT necessarily",
        "the PR's code problem. Check service status before blaming the PR.",
      ],
    });
  }

  return findings;
}

auditor/checks/inference.ts (new file, 206 lines)
@@ -0,0 +1,206 @@
// Cloud inference check — wraps the proven run_codereview pattern
// from llm_team_ui.py (same 3-stage framing, same cloud model) to
// critique a PR's claims against its diff.
//
// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
// that unit tests missed. This module reuses the reviewer prompt
// shape (bugs / security / performance / style / edge cases) and
// adds claim-vs-diff specific framing.
//
// Call surface: runInferenceCheck(claims, diff) → Finding[].
// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
// with a 15KB diff + claim list).

import type { Claim, Finding } from "../types.ts";

const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
// previously truncated at 15KB causing the reviewer to miss later
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
const CALL_TIMEOUT_MS = 120_000;

export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
  if (claims.length === 0) {
    return [{
      check: "inference",
      severity: "info",
      summary: "no ship-claims extracted — skipping cloud inference",
      evidence: ["parser returned empty claim list; nothing to verify against cloud"],
    }];
  }

  const truncated = diff.length > MAX_DIFF_CHARS
    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
    : diff;

  // Build the reviewer prompt in the same shape as run_codereview's
  // review stage (llm_team_ui.py:10950), adapted for claim verification:
  //   "Task: ..."
  //   "Code: ..."
  //   "Review: bugs/security/perf/style/edge. Provide corrected code."
  // We add: claim list upfront + ask for structured JSON verdict.
  const systemMsg = [
    "You review pull-request diffs against the author's own ship-claims.",
    "For each claim, decide: is it backed by actual code in the diff, or is",
    "it placeholder / aspirational / unwired?",
    "",
    "A claim is BACKED when the diff contains a real code path that delivers",
    "the claimed behavior. A claim is NOT BACKED when:",
    " - the claim asserts functionality but the diff only adds types/fields",
    "   with no consumer",
    " - the claim mentions tests but no test function was added",
    " - the claim claims integration but the integration point is a stub",
    " - the diff contains unimplemented!() / todo!() / TODO comments",
    " - the claim says 'works end-to-end' but the diff has no end-to-end test",
    "",
    "Respond with strict JSON only. No prose before or after. Shape:",
    "{",
    '  "claim_verdicts": [',
    '    {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
    "  ],",
    '  "unflagged_gaps": [',
    '    {"location": "file:line", "summary": "short description"}',
    "  ]",
    "}",
  ].join("\n");

  const userMsg = [
    `Ship-claims the author made (numbered 0..N-1):`,
    claims.map((c, i) => `  ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
    "",
    `Diff:`,
    "```",
    truncated,
    "```",
    "",
    `For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
    `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model: MODEL,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        max_tokens: 3000,
        temperature: 0.2,
        think: true, // T3 overseer should reason — JSON shape is still required
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    // Cloud unreachable → soft-fail. Don't block a PR because the
    // reviewer model is down. Static + dynamic + kb still run.
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud inference unreachable — skipped",
      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
    }];
  }

  if (!resp.ok) {
    return [{
      check: "inference",
      severity: "info",
      summary: `cloud inference returned ${resp.status} — skipped`,
      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
    }];
  }

  const body: any = await resp.json();
  const content: string = body?.choices?.[0]?.message?.content ?? "";
  const usage = body?.usage ?? {};

  const parsed = extractJson(content);
  if (!parsed) {
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud returned unparseable output — skipped",
      evidence: [
        `head: ${content.slice(0, 200)}`,
        `tokens: ${usage.total_tokens ?? "?"}`,
      ],
    }];
  }

  const findings: Finding[] = [];

  // One summary info finding so the verdict layer knows the check ran.
  findings.push({
    check: "inference",
    severity: "info",
    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
    evidence: [
      `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
    ],
  });

  for (const v of parsed.claim_verdicts ?? []) {
    if (v?.backed === false) {
      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
      const claim = claims[idx];
      if (!claim) continue;
      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
        : claim.strength === "moderate" ? "warn"
        : "info";
      findings.push({
        check: "inference",
        severity: sev,
        claim_text: claim.text,
        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
        evidence: [
          `at ${claim.location}`,
          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
        ],
      });
    }
  }

  for (const g of parsed.unflagged_gaps ?? []) {
    findings.push({
      check: "inference",
      severity: "warn",
      summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
      evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
    });
  }

  return findings;
}

// Lift the first balanced JSON object out of the response. Tolerates
// leading prose, code fences, and model reasoning preamble when the
// cloud model ignored "strict JSON only."
function extractJson(text: string): any | null {
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  let depth = 0;
  let start = -1;
  for (let i = 0; i < cleaned.length; i++) {
    const c = cleaned[i];
    if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}") {
      depth--;
      if (depth === 0 && start >= 0) {
        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
      }
    }
  }
  return null;
}
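
// Example of that tolerance (illustrative input, not a recorded model response):
//   extractJson('Here is my answer:\n```json\n{"claim_verdicts": []}\n```')
//   → { claim_verdicts: [] }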

auditor/checks/kb_query.ts (new file, 183 lines)
@@ -0,0 +1,183 @@
// Local-KB check — reads data/_kb/ + data/_observer/ + data/_bot/
// for prior evidence bearing on this PR's claims. Cheap, offline,
// no model calls. The point: if a claim like "Phase X shipped" has
// a historical record of failing on the same signature before, the
// auditor surfaces that pattern before the cloud check has to
// infer it.
//
// What this check reads (all file-backed, append-only or periodic):
//   data/_kb/outcomes.jsonl          — per-scenario outcomes (kb.ts)
//   data/_kb/error_corrections.jsonl — fail→succeed deltas on same sig
//   data/_observer/ops.jsonl         — observer ring → disk stream
//   data/_bot/cycles/*.json          — bot cycle results
//
// Each JSONL line / per-cycle file is small; this check reads tails
// only (last N lines or last M files) to stay cheap on large corpora.

import { readFile, readdir, stat } from "node:fs/promises";
import { join } from "node:path";
import type { Claim, Finding } from "../types.ts";

const KB_DIR = "/home/profit/lakehouse/data/_kb";
const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl";
const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles";
const TAIL_LINES = 500;
const MAX_BOT_CYCLE_FILES = 30;

export async function runKbCheck(claims: Claim[]): Promise<Finding[]> {
  const findings: Finding[] = [];

  // 1. Recent scenario outcomes: are strong-claim-style phrases showing
  //    up alongside failed events? That's "we claimed it worked" +
  //    "it didn't" in the KB.
  const scenarioFindings = await checkScenarioOutcomes(claims);
  findings.push(...scenarioFindings);

  // 2. Error corrections: does any of the claims' text overlap a
  //    recently-observed fail→succeed pair? If yes, add context.
  const correctionFindings = await checkErrorCorrections(claims);
  findings.push(...correctionFindings);

  // 3. Bot cycles: did any prior bot cycle end in tests_failed or
  //    apply_failed on a file this PR is also touching?
  const botFindings = await checkBotCycles();
  findings.push(...botFindings);

  // 4. Observer: count recent error events. High volume = shared
  //    infra problem, worth flagging (context for other findings).
  const obsFindings = await checkObserverStream();
  findings.push(...obsFindings);

  return findings;
}

async function tailJsonl<T = any>(path: string, n: number): Promise<T[]> {
  try {
    const raw = await readFile(path, "utf8");
    const lines = raw.split("\n").filter(l => l.length > 0);
    const slice = lines.slice(-n);
    const out: T[] = [];
    for (const line of slice) {
      try { out.push(JSON.parse(line)); } catch { /* skip malformed */ }
    }
    return out;
  } catch {
    return [];
  }
}

async function checkScenarioOutcomes(_claims: Claim[]): Promise<Finding[]> {
  const outcomes = await tailJsonl<any>(join(KB_DIR, "outcomes.jsonl"), TAIL_LINES);
  if (outcomes.length === 0) return [];
  const totalEvents = outcomes.reduce((s, o) => s + (o.total_events ?? 0), 0);
  const okEvents = outcomes.reduce((s, o) => s + (o.ok_events ?? 0), 0);
  const failRate = totalEvents > 0 ? 1 - okEvents / totalEvents : 0;

  if (totalEvents === 0) {
    return [{
      check: "kb_query",
      severity: "info",
      summary: `KB: no scenario outcomes on file — learning loop is empty`,
      evidence: [`data/_kb/outcomes.jsonl has ${outcomes.length} entries with 0 total events`],
    }];
  }

  const recent = outcomes.slice(-10);
  const recentFailSigs: string[] = recent
    .filter(o => (o.ok_events ?? 0) < (o.total_events ?? 0))
    .map(o => o.sig_hash)
    .filter(s => typeof s === "string");

  const findings: Finding[] = [{
    check: "kb_query",
    severity: failRate > 0.3 ? "warn" : "info",
    summary: `KB: ${outcomes.length} recent scenario runs, ${okEvents}/${totalEvents} events ok (fail rate ${(failRate * 100).toFixed(1)}%)`,
    evidence: [
      `most recent: ${recent[recent.length - 1]?.run_id ?? "?"}`,
      `recent failing sigs: ${recentFailSigs.length > 0 ? recentFailSigs.slice(-3).join(", ") : "none"}`,
    ],
  }];
  return findings;
}

async function checkErrorCorrections(_claims: Claim[]): Promise<Finding[]> {
  const corrections = await tailJsonl<any>(join(KB_DIR, "error_corrections.jsonl"), TAIL_LINES);
  if (corrections.length === 0) return [];
  return [{
    check: "kb_query",
    severity: "info",
    summary: `KB: ${corrections.length} error corrections on file (fail→succeed pairs)`,
    evidence: [
      corrections.length > 0
        ? `most recent: ${String(corrections[corrections.length - 1]?.sig_hash ?? "?").slice(0, 24)}`
        : "none",
    ],
  }];
}

async function checkBotCycles(): Promise<Finding[]> {
  let entries: string[] = [];
  try { entries = await readdir(BOT_CYCLES_DIR); }
  catch { return []; }

  const jsonFiles = entries.filter(e => e.endsWith(".json"));
  if (jsonFiles.length === 0) return [];

  // Sort by mtime desc, take most recent N
  const withStat = await Promise.all(
    jsonFiles.map(async name => {
      try { return { name, mtime: (await stat(join(BOT_CYCLES_DIR, name))).mtimeMs }; }
      catch { return { name, mtime: 0 }; }
    }),
  );
  const recent = withStat.sort((a, b) => b.mtime - a.mtime).slice(0, MAX_BOT_CYCLE_FILES);

  const outcomes: Record<string, number> = {};
  for (const { name } of recent) {
    try {
      const r = JSON.parse(await readFile(join(BOT_CYCLES_DIR, name), "utf8"));
      const o = String(r.outcome ?? "unknown");
      outcomes[o] = (outcomes[o] ?? 0) + 1;
    } catch { /* skip */ }
  }

  const summary = Object.entries(outcomes)
    .sort((a, b) => b[1] - a[1])
    .map(([k, v]) => `${k}=${v}`)
    .join(", ");

  const failCount = (outcomes["tests_failed"] ?? 0) + (outcomes["apply_failed"] ?? 0) + (outcomes["model_failed"] ?? 0);
  return [{
    check: "kb_query",
    severity: failCount > recent.length / 2 ? "warn" : "info",
    summary: `KB: bot recorded ${recent.length} recent cycles — ${summary || "no outcomes parsed"}`,
    evidence: [
      `dir: ${BOT_CYCLES_DIR}`,
      `fail-class total: ${failCount} / ${recent.length}`,
    ],
  }];
}

async function checkObserverStream(): Promise<Finding[]> {
  const ops = await tailJsonl<any>(OBSERVER_OPS, TAIL_LINES);
  if (ops.length === 0) return [];
  const failures = ops.filter(o => o.ok === false).length;
  return [{
    check: "kb_query",
    severity: "info",
    summary: `KB: observer stream ${ops.length} recent ops, ${failures} failures`,
    evidence: [
      `source: ${OBSERVER_OPS}`,
      `by source: ${observerBySource(ops)}`,
    ],
  }];
}

function observerBySource(ops: any[]): string {
  const c: Record<string, number> = {};
  for (const o of ops) {
    const s = String(o.source ?? "unknown");
    c[s] = (c[s] ?? 0) + 1;
  }
  return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty";
}

auditor/checks/static.ts (new file, 159 lines)
@@ -0,0 +1,159 @@
// Static diff check — grep-style, no AST, no LLM. Looks for patterns
// that are high-signal evidence of placeholder code.
//
// Findings are severity-graded:
//   block — explicit non-impl markers (unimplemented!, todo!,
//           panic!("not implemented"), throw new Error("not implemented"))
//   warn  — TODO / FIXME / XXX / HACK comments on added lines,
//           new struct fields with no read-site anywhere in the diff,
//           suspiciously-empty function bodies ({ Ok(()) } / {} when
//           the commit message claims the fn "implements" something)
//   info  — hardcoded "test" / "dummy" / "placeholder" strings in
//           added lines (could be real, just flag for inspection)
//
// Consumes: raw unified diff text from Gitea.

import type { Finding } from "../types.ts";

// Rust + TypeScript patterns that almost always indicate "this is
// not actually implemented yet."
const BLOCK_PATTERNS: Array<{ re: RegExp; why: string }> = [
  { re: /\bunimplemented!\s*\(/, why: "unimplemented!() macro call" },
  { re: /\btodo!\s*\(/, why: "todo!() macro call" },
  { re: /panic!\s*\(\s*"(?:not implemented|TODO|not yet|unimpl)/i, why: "panic! with not-implemented message" },
  { re: /throw\s+new\s+Error\s*\(\s*['"](?:not implemented|TODO|unimpl)/i, why: "throw Error 'not implemented'" },
];

const WARN_COMMENT_PATTERNS: Array<{ re: RegExp; why: string }> = [
  { re: /^\+.*\/\/\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" },
  { re: /^\+.*#\s*(TODO|FIXME|XXX|HACK)\b/i, why: "TODO/FIXME/XXX/HACK comment added" },
];

const INFO_HARDCODED_PATTERNS: Array<{ re: RegExp; why: string }> = [
  { re: /"(?:placeholder|dummy|foobar|xxx|replaceme|changeme)"/i, why: "suspicious hardcoded string" },
];

export function runStaticCheck(diff: string): Finding[] {
  const findings: Finding[] = [];

  // Per-file walk: only look at ADDED lines (prefix '+' but not '+++'
  // which is the diff header).
  const perFile = splitDiffByFile(diff);

  for (const [path, lines] of perFile) {
    // Skip diff bookkeeping + pure-delete files
    if (!lines.some(l => l.startsWith("+") && !l.startsWith("+++"))) continue;

    // The auditor's own check files literally contain the BLOCK
    // patterns as regex definitions (BLOCK_PATTERNS in this file,
    // prompt examples in inference.ts). Skipping BLOCK scan on these
    // specific paths prevents the checker from self-flagging its own
    // string literals. WARN/INFO patterns still run — those genuinely
    // could indicate problems in the checker's own code (TODO
    // comments don't self-define).
    const isAuditorCheckerFile = path.startsWith("auditor/checks/") ||
      path.startsWith("auditor/fixtures/");

    for (let idx = 0; idx < lines.length; idx++) {
      const line = lines[idx];
      if (!line.startsWith("+") || line.startsWith("+++")) continue;
      const added = line.slice(1);

      if (!isAuditorCheckerFile) {
        for (const { re, why } of BLOCK_PATTERNS) {
          if (re.test(added)) {
            findings.push({
              check: "static",
              severity: "block",
              summary: `${why} in ${path}`,
              evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
            });
          }
        }
      }
      for (const { re, why } of WARN_COMMENT_PATTERNS) {
        if (re.test(line)) {
          findings.push({
            check: "static",
            severity: "warn",
            summary: `${why} in ${path}`,
            evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
          });
        }
      }
      for (const { re, why } of INFO_HARDCODED_PATTERNS) {
        if (re.test(added)) {
          findings.push({
            check: "static",
            severity: "info",
            summary: `${why} in ${path}`,
            evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`],
          });
        }
      }
    }

    // "Field added but never read" heuristic — catches exactly the
    // Phase 45 DocRef placeholder pattern. Limited to the diff itself:
    // we're not doing a full-codebase grep here (too noisy; callers
    // elsewhere might exist). The point is: if NEITHER this diff nor
    // any other line in the diff reads the field, the PR is shipping
    // state without a consumer.
    const addedLines = lines.filter(l => l.startsWith("+") && !l.startsWith("+++"))
      .map(l => l.slice(1));
    const newFields = extractNewFields(addedLines);
    for (const field of newFields) {
      const readPattern = new RegExp(`[\\.:]\\s*${escape(field)}\\b|\\b${escape(field)}\\s*:`);
      // The definition line itself matches readPattern — filter it out
      // by requiring at least TWO lines in the diff mention the field
      // (one defines, one reads).
      const hits = addedLines.filter(l => readPattern.test(l));
      if (hits.length < 2) {
        findings.push({
          check: "static",
          severity: "warn",
          summary: `field '${field}' added in ${path} but no read-site in the diff — could be placeholder state without a consumer`,
          evidence: [`${path}: added '${field}' with no reader; rest of diff has ${hits.length - 1} mentions`],
        });
      }
    }
  }

  return findings;
}

function splitDiffByFile(diff: string): Map<string, string[]> {
  const out = new Map<string, string[]>();
  let current: string | null = null;
  let buf: string[] = [];
  for (const line of diff.split(/\r?\n/)) {
    const m = line.match(/^diff --git a\/(\S+) b\/(\S+)/);
    if (m) {
      if (current) out.set(current, buf);
      current = m[2];
      buf = [];
      continue;
    }
    buf.push(line);
  }
  if (current) out.set(current, buf);
  return out;
}

// Extract new `pub name: Type,` fields from added lines. Rust syntax.
// Narrowly-scoped: only matches at the start of a trimmed line,
// requires `pub ` prefix, ignores `pub fn` / `pub struct` / etc.
function extractNewFields(addedLines: string[]): string[] {
  const fields = new Set<string>();
  for (const line of addedLines) {
    const t = line.trim();
    // pub NAME: Type,
    const m = t.match(/^pub\s+(?!fn\b|struct\b|enum\b|mod\b|use\b|trait\b|impl\b|const\b|static\b|type\b)(\w+)\s*:/);
    if (m) fields.add(m[1]);
  }
  return Array.from(fields);
}

function escape(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
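
// Worked example of the field heuristic (hypothetical diff content, not from
// a real PR): an added line `pub doc_refs: Vec<DocRef>,` matches readPattern
// once via `doc_refs\s*:` (the definition itself). With no other added line
// reading `.doc_refs` or mentioning `doc_refs:` the hit count stays below 2
// and a warn finding is emitted; a second added line like `entry.doc_refs`
// would suppress it.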

auditor/claim_parser.ts (new file, 119 lines)
@@ -0,0 +1,119 @@
// Claim parser — reads commit messages + PR body, extracts ship-claims.
//
// A "ship-claim" is any phrase that asserts functionality is working,
// tested, complete, or landed. These are the assertions the downstream
// checks (static/dynamic/inference/kb) try to falsify.
//
// Heuristic approach (regex + strength grading) — intentionally NOT
// using an LLM here. Reason: the inference check already asks a cloud
// model "does this match the claim?". The parser's job is to surface
// the claim substrates, not judge them. Over-engineering the parser
// risks false-negatives when the cloud model was going to catch it
// anyway.

import type { Claim, PrSnapshot } from "./types.ts";

// Strong claims: explicit end-to-end + verification vocabulary
const STRONG_PATTERNS: RegExp[] = [
  /\bverified\s+(end[- ]to[- ]end|live|in\s+production|against)\b/i,
  /\btested\s+(live|end[- ]to[- ]end|against|with)\b/i,
  /\bworks\s+(end[- ]to[- ]end|live|in\s+production)\b/i,
  /\bproduction[- ]ready\b/i,
  /\bfully\s+(functional|wired|working)\b/i,
  /\bphase\s+\d+(\.\d+)?\s+(shipped|complete|done|landed)\b/i,
  /\bground\s+truth\b/i,
  /\bproven\b/i,
];

// Moderate claims: asserted completion or pass but without the strong
// verification qualifier.
const MODERATE_PATTERNS: RegExp[] = [
  /\bshipped\b/i,
  /\blanded\b/i,
  /\bgreen\b/i,
  /\b(tests?\s+)?pass(ing|ed)\b/i,
  /\bcomplet(e|ed)\b/i,
  /\bdone\b/i,
  /\bwired\b/i,
  /\bfixed\b/i,
  /\bworks\b/i,
];

// Weak claims: aspirational or hedged. Usually low-risk but recorded
// for completeness.
const WEAK_PATTERNS: RegExp[] = [
  /\bshould\s+work\b/i,
  /\bexpected\s+to\b/i,
  /\bintended\s+to\b/i,
  /\bwill\s+(work|handle|support)\b/i,
  /\bprobably\b/i,
];

export interface ParsedClaims {
  claims: Claim[];
  commits_scanned: number;
}

export function parseClaims(pr: PrSnapshot): ParsedClaims {
  const claims: Claim[] = [];

  // PR body — every matching line becomes a claim at location "pr_body:N"
  if (pr.body) {
    scanText(pr.body, "pr_body", pr.head_sha, claims);
  }

  // Each commit message gets its own scan.
  for (const c of pr.commits) {
    if (!c.message) continue;
    scanText(c.message, `commit:${c.sha.slice(0, 8)}`, c.sha, claims);
  }

  return { claims, commits_scanned: pr.commits.length };
}

function scanText(text: string, location_prefix: string, commit_sha: string, out: Claim[]): void {
  const lines = text.split(/\r?\n/);
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (line.length < 3) continue;

    // Strong patterns first — if a line matches strong, it's strong,
    // don't double-count as moderate.
    const strong = firstMatch(line, STRONG_PATTERNS);
    if (strong) {
      out.push({
        text: line.trim().slice(0, 200),
        commit_sha,
        location: `${location_prefix}:${i + 1}`,
        strength: "strong",
      });
      continue;
    }
    const moderate = firstMatch(line, MODERATE_PATTERNS);
    if (moderate) {
      out.push({
        text: line.trim().slice(0, 200),
        commit_sha,
        location: `${location_prefix}:${i + 1}`,
        strength: "moderate",
      });
      continue;
    }
    const weak = firstMatch(line, WEAK_PATTERNS);
    if (weak) {
      out.push({
        text: line.trim().slice(0, 200),
        commit_sha,
        location: `${location_prefix}:${i + 1}`,
        strength: "weak",
      });
    }
  }
}

function firstMatch(text: string, patterns: RegExp[]): RegExp | null {
  for (const p of patterns) {
    if (p.test(text)) return p;
  }
  return null;
}
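
// Worked example of the grading (hypothetical commit-message lines, not from
// a real PR):
//   "Phase 45 shipped, verified end-to-end against live services"  → strong
//   "tests passing locally"                                         → moderate
//   "slice 3 should work once the endpoint lands"                   → weak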

auditor/fixtures/cli.ts (new file, 49 lines)
@@ -0,0 +1,49 @@
// Standalone runner for auditor fixtures. Invoke:
//
//   bun run auditor/fixtures/cli.ts hybrid_38_40_45
//
// Prints human-readable per-layer breakdown + JSON result so the
// output can be captured by the dynamic check without re-running.

import { runHybridFixture, type FixtureResult } from "./hybrid_38_40_45.ts";

const fixtureArg = process.argv[2] ?? "hybrid_38_40_45";

let result: FixtureResult;
switch (fixtureArg) {
  case "hybrid_38_40_45":
    result = await runHybridFixture();
    break;
  default:
    console.error(`unknown fixture: ${fixtureArg}`);
    process.exit(2);
}

// Human-readable summary
console.error(""); // blank line to stderr so stdout JSON stays clean
console.error(`─── Fixture: ${result.fixture} ───`);
console.error(`  overall:     ${result.overall.toUpperCase()}`);
console.error(`  shipped:     [${result.shipped_phases.join(", ") || "—"}]`);
console.error(`  placeholder: [${result.placeholder_phases.join(", ") || "—"}]`);
console.error("");
for (const l of result.layers) {
  const mark = l.ok ? "✓" : "✗";
  const phaseStr = `Phase ${l.phase}`.padEnd(11);
  console.error(`  ${mark} ${phaseStr} ${l.layer.padEnd(30)} ${String(l.latency_ms).padStart(6)}ms`);
  if (l.ok) {
    console.error(`      ${l.evidence.slice(0, 200)}`);
  } else {
    console.error(`      ERROR: ${l.error?.slice(0, 240) ?? "unknown"}`);
  }
}
console.error("");
for (const n of result.notes) console.error(`  note: ${n}`);
console.error("");

// Machine-readable output on stdout.
console.log(JSON.stringify(result, null, 2));

// Exit code reflects overall: 0 pass, 2 partial, 1 fail.
// Dynamic check reads this AND the JSON; partial-pass is treated
// as informative (some layers shipped), not blocking on its own.
process.exit(result.overall === "pass" ? 0 : result.overall === "partial_pass" ? 2 : 1);

auditor/fixtures/hybrid_38_40_45.ts (new file, 335 lines)
@@ -0,0 +1,335 @@
|
||||
// The never-run hybrid test fixture. Exercises the full stack end-to-end
|
||||
// across Phase 38 (/v1/chat), Phase 40 (Langfuse tracing), Phase 45
|
||||
// slice 1 (DocRef on playbook), and Phase 45 slice 2 (context7 bridge
|
||||
// drift detection).
|
||||
//
|
||||
// Returns HONEST per-layer results. A layer that's unimplemented or
|
||||
// broken returns ok=false with a real error; overall verdict flags
|
||||
// which phases are genuinely shipped vs still placeholder.
|
||||
//
|
||||
// Deterministic: always runs the same payload so a run's output is
|
||||
// comparable across audits. Uses TEST-prefixed names so playbook
|
||||
// state doesn't get polluted with production-looking fixture data.
|
||||
//
|
||||
// Preconditions:
|
||||
// - gateway up on :3100
|
||||
// - Python sidecar up on :3200
|
||||
// - Ollama local + Ollama Cloud key loaded
|
||||
// - Langfuse up on :3001
|
||||
// - context7 bridge up on :3900 (manual-start per mcp-server/README)
|
||||
// If bridge isn't running, that layer reports ok=false with a
|
||||
// clear error — the fixture doesn't try to auto-start it.
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
|
||||
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
const LANGFUSE = process.env.LANGFUSE_URL ?? "http://localhost:3001";
|
||||
const BRIDGE = process.env.CONTEXT7_BRIDGE_URL ?? "http://localhost:3900";
|
||||
const FIXTURE_ID = "auditor:hybrid_38_40_45:v1";
|
||||
const TEST_WORKER_NAME = "__auditor_test_worker__";
|
||||
const STALE_HASH = "stale-hash-for-drift-proof";
|
||||
// Unique-per-run identifiers so each fixture invocation hits the ADD
|
||||
// path in upsert_entry (not UPDATE/NOOP on a leftover from a prior
|
||||
// run). Catches state pollution + avoids interaction with task #12
|
||||
// (UPDATE branch silently drops doc_refs).
|
||||
const RUN_NONCE = Date.now().toString(36);
|
||||
const TEST_WORKER_NAME_VERSIONED = `__auditor_test_worker_${RUN_NONCE}__`;
|
||||
const TEST_OPERATION_VERSIONED = `TEST: fill: __auditor_${RUN_NONCE}__ x1 in TestCity, TC`;
|
||||
|
||||
export interface LayerResult {
|
||||
layer: string;
|
||||
phase: string;
|
||||
ok: boolean;
|
||||
latency_ms: number;
|
||||
evidence: string;
|
||||
error?: string;
|
||||
// Layer-specific numbers that downstream checks can aggregate.
|
||||
metrics?: Record<string, number | string | boolean>;
|
||||
}
|
||||
|
||||
export interface FixtureResult {
|
||||
fixture: string;
|
||||
ran_at: string;
|
||||
overall: "pass" | "partial_pass" | "fail";
|
||||
layers: LayerResult[];
|
||||
real_numbers: Record<string, number>;
|
||||
shipped_phases: string[];
|
||||
placeholder_phases: string[];
|
||||
notes: string[];
|
||||
}
|
||||
|
||||
// Wraps a layer's async body with timing + structured error capture.
|
||||
// On throw: returns ok=false with the error string as evidence, so
|
||||
// downstream verdict code treats it as a layer failure rather than
|
||||
// a fixture crash.
|
||||
async function measureLayer(
|
||||
layer: string,
|
||||
phase: string,
|
||||
body: () => Promise<{ evidence: string; metrics?: Record<string, number | string | boolean> }>,
|
||||
): Promise<LayerResult> {
|
||||
const t0 = Date.now();
|
||||
try {
|
||||
const { evidence, metrics } = await body();
|
||||
return {
|
||||
layer, phase, ok: true,
|
||||
latency_ms: Date.now() - t0,
|
||||
evidence, metrics,
|
||||
};
|
||||
} catch (e) {
|
||||
return {
|
||||
layer, phase, ok: false,
|
||||
latency_ms: Date.now() - t0,
|
||||
evidence: `FAILED: ${(e as Error).message.slice(0, 200)}`,
|
||||
error: (e as Error).message,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async function langfuseAuth(): Promise<{ pk: string; sk: string } | null> {
|
||||
// Default credentials match mcp-server/tracing.ts. Same sources the
|
||||
// Rust side uses in gateway/src/v1/langfuse_trace.rs.
|
||||
let pk = process.env.LANGFUSE_PUBLIC_KEY ?? "pk-lf-staffing";
|
||||
let sk = process.env.LANGFUSE_SECRET_KEY ?? "sk-lf-staffing-secret";
|
||||
if (!pk || !sk) return null;
|
||||
return { pk, sk };
|
||||
}
|
||||
|
||||
export async function runHybridFixture(): Promise<FixtureResult> {
|
||||
const result: FixtureResult = {
|
||||
fixture: FIXTURE_ID,
|
||||
ran_at: new Date().toISOString(),
|
||||
overall: "fail",
|
||||
layers: [],
|
||||
real_numbers: {},
|
||||
shipped_phases: [],
|
||||
placeholder_phases: [],
|
||||
notes: [],
|
||||
};
|
||||
|
||||
// ========================================================================
|
||||
// Layer 1 — Phase 38: POST /v1/chat returns valid OpenAI shape
|
||||
// ========================================================================
|
||||
// Captured HERE, immediately before the chat layer runs, so layer 2's
|
||||
// Langfuse-trace filter uses the actual moment the chat call was
|
||||
// attempted — not the fixture start time. Earlier draft had a
|
||||
// meaningless ternary returning result.ran_at on both branches; the
|
||||
// LLM-Team codereview (2026-04-22) caught this and flagged it as a
|
||||
// false-negative window on traces created between fixture-start and
|
||||
// chat-fetch.
|
||||
const chat_request_sent_ms = Date.now();
|
||||
|
||||
const l1 = await measureLayer("phase38_chat", "38", async () => {
|
||||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "qwen3.5:latest",
|
||||
messages: [
|
||||
{ role: "system", content: "Respond in 8 words or fewer." },
|
||||
{ role: "user", content: "Name one Docker command." },
|
||||
],
|
||||
max_tokens: 60,
|
||||
temperature: 0.2,
|
||||
}),
|
||||
signal: AbortSignal.timeout(60000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`gateway ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
const content = j.choices?.[0]?.message?.content;
|
||||
if (typeof content !== "string" || content.length === 0) {
|
||||
throw new Error(`empty content — think-false default may not be wired: ${JSON.stringify(j).slice(0, 200)}`);
|
||||
}
|
||||
return {
|
||||
evidence: `content="${content.slice(0, 60)}" tokens=${j.usage?.total_tokens}`,
|
||||
metrics: {
|
||||
prompt_tokens: j.usage?.prompt_tokens ?? 0,
|
||||
completion_tokens: j.usage?.completion_tokens ?? 0,
|
||||
total_tokens: j.usage?.total_tokens ?? 0,
|
||||
content_nonempty: true,
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l1);
|
||||
|
||||
// ========================================================================
|
||||
// Layer 2 — Phase 40: Langfuse trace lands within 3s
|
||||
// ========================================================================
|
||||
// Fire-and-forget tracing means we need a brief sleep before query.
|
||||
await new Promise(res => setTimeout(res, 2500));
|
||||
const l2 = await measureLayer("phase40_langfuse_trace", "40", async () => {
|
||||
const auth = await langfuseAuth();
|
||||
if (!auth) throw new Error("Langfuse credentials not in env");
|
||||
const r = await fetch(`${LANGFUSE}/api/public/traces?limit=5&name=v1.chat:ollama`, {
|
||||
headers: { Authorization: `Basic ${btoa(`${auth.pk}:${auth.sk}`)}` },
|
||||
signal: AbortSignal.timeout(10000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`langfuse ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
const items = Array.isArray(j.data) ? j.data : [];
|
||||
// Filter on the chat-request timestamp captured above. A Langfuse
|
||||
// trace must be newer than the moment we fired /v1/chat to plausibly
|
||||
// belong to our request. Using fixture start time (result.ran_at)
|
||||
// was wrong and could false-negative on slow fixtures.
|
||||
const recent = items.filter((t: any) => Date.parse(t.timestamp) >= chat_request_sent_ms);
|
||||
if (recent.length === 0) {
|
||||
throw new Error(`no v1.chat:ollama trace since ${new Date(chat_request_sent_ms).toISOString()} (${items.length} older traces visible, Langfuse reachable — tracing is not firing)`);
|
||||
}
|
||||
const trace = recent[0];
|
||||
return {
|
||||
evidence: `trace ${trace.id.slice(0, 8)} name=${trace.name} age=${Date.now() - Date.parse(trace.timestamp)}ms`,
|
||||
metrics: {
|
||||
trace_age_ms: Date.now() - Date.parse(trace.timestamp),
|
||||
trace_latency_reported: Number(trace.latency ?? 0),
|
||||
recent_trace_count: recent.length,
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l2);
|
||||
|
||||
// ========================================================================
|
||||
// Layer 3 — Phase 45 slice 1: /seed accepts doc_refs and persists them
|
||||
// ========================================================================
|
||||
// Seed a playbook with doc_refs referencing Docker at a stale hash.
|
||||
// Ignore the cleanup concern for v1 — this fixture pollutes the
|
||||
// in-memory playbook state; operator can retire test entries.
|
||||
const l3 = await measureLayer("phase45_seed_with_doc_refs", "45.1", async () => {
|
||||
const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
operation: TEST_OPERATION_VERSIONED,
|
||||
approach: "auditor hybrid fixture run",
|
||||
context: "doc_refs integration test — retire after audit",
|
||||
endorsed_names: [TEST_WORKER_NAME_VERSIONED],
|
||||
append: true,
|
||||
doc_refs: [
|
||||
{
|
||||
tool: "docker",
|
||||
version_seen: "24.0.7",
|
||||
snippet_hash: STALE_HASH,
|
||||
source_url: "https://context7.com/docker/docs",
|
||||
seen_at: new Date().toISOString(),
|
||||
},
|
||||
],
|
||||
}),
|
||||
signal: AbortSignal.timeout(30000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`seed ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
// Verify doc_refs actually persisted — read state.json directly.
|
||||
// That's the canonical source of truth; the seed response may or
|
||||
// may not echo doc_refs back.
|
||||
const state_raw = await readFile("/home/profit/lakehouse/data/_playbook_memory/state.json", "utf8");
|
||||
const state = JSON.parse(state_raw);
|
||||
const entries = state.entries ?? [];
|
||||
const ours = entries.find((e: any) => (e.endorsed_names ?? []).includes(TEST_WORKER_NAME_VERSIONED));
|
||||
if (!ours) throw new Error(`seeded entry not found in state.json (worker ${TEST_WORKER_NAME_VERSIONED} not present)`);
|
||||
const docRefs = ours.doc_refs ?? [];
|
||||
if (docRefs.length === 0) {
|
||||
throw new Error("entry saved but doc_refs field is empty — the field accepted by the API isn't being persisted");
|
||||
}
|
||||
return {
|
||||
evidence: `playbook ${ours.playbook_id.slice(0, 16)} persisted with doc_refs.length=${docRefs.length}; first tool=${docRefs[0]?.tool} hash=${docRefs[0]?.snippet_hash}`,
|
||||
metrics: {
|
||||
doc_refs_stored: docRefs.length,
|
||||
entries_after: j.entries_after ?? -1,
|
||||
playbook_id: ours.playbook_id,
|
||||
mode: j.outcome?.mode ?? "unknown",
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l3);
|
||||
|
||||
// ========================================================================
|
||||
// Layer 4 — Phase 45 slice 2: context7 bridge detects drift vs stale hash
|
||||
// ========================================================================
|
||||
const l4 = await measureLayer("phase45_bridge_diff", "45.2", async () => {
|
||||
// Health check first — if bridge isn't running, fail with a clear
|
||||
// message rather than mysterious connection refused.
|
||||
try {
|
||||
const h = await fetch(`${BRIDGE}/health`, { signal: AbortSignal.timeout(3000) });
|
||||
if (!h.ok) throw new Error(`bridge /health ${h.status}`);
|
||||
} catch (e) {
|
||||
throw new Error(`context7 bridge at ${BRIDGE} is not reachable (bridge is manual-start; run 'bun run mcp-server/context7_bridge.ts'): ${(e as Error).message}`);
|
||||
}
|
||||
const r = await fetch(`${BRIDGE}/docs/docker/diff?since=${encodeURIComponent(STALE_HASH)}`, {
|
||||
signal: AbortSignal.timeout(30000),
|
||||
});
|
||||
if (!r.ok) throw new Error(`bridge diff ${r.status}: ${await r.text()}`);
|
||||
const j: any = await r.json();
|
||||
if (j.drifted !== true) {
|
||||
throw new Error(`expected drifted=true against ${STALE_HASH}; got ${JSON.stringify(j)}`);
|
||||
}
|
||||
return {
|
||||
evidence: `bridge confirms drift: current=${j.current_snippet_hash} previous=${j.previous_snippet_hash} upstream_updated=${j.last_updated_upstream}`,
|
||||
metrics: {
|
||||
drifted: true,
|
||||
current_hash_length: String(j.current_snippet_hash ?? "").length,
|
||||
library_id: j.library_id,
|
||||
},
|
||||
};
|
||||
});
|
||||
result.layers.push(l4);
|
||||
|
||||
  // ========================================================================
  // Layer 5 — Phase 45 slice 3: /doc_drift/check/{id} endpoint
  // (EXPECTED TO FAIL — endpoint unimplemented)
  // ========================================================================
  const l5 = await measureLayer("phase45_slice3_drift_flag", "45.3", async () => {
    const pid = String(l3.metrics?.playbook_id ?? "");
    if (!pid) throw new Error("layer 3 did not produce a playbook_id to check");
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/doc_drift/check/${encodeURIComponent(pid)}`, {
      method: "POST",
      signal: AbortSignal.timeout(10000),
    });
    if (r.status === 404 || r.status === 405) {
      // This is the HONEST signal: the endpoint doesn't exist yet.
      // Fail this layer with a clear marker — not a silent pass.
      throw new Error(`endpoint unimplemented (${r.status}) — Phase 45 slice 3 is still placeholder. doc_refs are persisted (layer 3) and the bridge can answer drift questions (layer 4), but nothing ties them together into a playbook flag yet.`);
    }
    if (!r.ok) throw new Error(`drift/check ${r.status}: ${await r.text()}`);
    const j: any = await r.json();
    // If the endpoint DID exist, assert the flag got set.
    if (!j.flagged) {
      throw new Error(`endpoint responded but flag not set: ${JSON.stringify(j).slice(0, 200)}`);
    }
    return {
      evidence: `playbook flagged: ${JSON.stringify(j).slice(0, 160)}`,
      metrics: { flagged: true },
    };
  });
  result.layers.push(l5);

  // ========================================================================
  // Verdict assembly
  // ========================================================================
  for (const l of result.layers) {
    if (l.ok) result.shipped_phases.push(l.phase);
    else result.placeholder_phases.push(l.phase);
    result.real_numbers[`${l.layer}_latency_ms`] = l.latency_ms;
    if (l.metrics) {
      for (const [k, v] of Object.entries(l.metrics)) {
        if (typeof v === "number") result.real_numbers[`${l.layer}.${k}`] = v;
      }
    }
  }

  const anyFail = result.placeholder_phases.length > 0;
  const allPass = result.layers.every(l => l.ok);
  result.overall = allPass ? "pass" : (result.shipped_phases.length > 0 ? "partial_pass" : "fail");

  if (!l5.ok) {
    result.notes.push(
      "Phase 45 slice 3 (doc_drift/check endpoint) is the expected-fail layer. " +
      "Layer 5 failing with a 404/405 is the CORRECT honest signal — don't mask this " +
      "to get a green result. When slice 3 ships, layer 5 flips to ok=true automatically."
    );
  }
  if (anyFail) {
    result.notes.push(
      `${result.placeholder_phases.length} layer(s) failed. Check evidence per layer for specifics.`
    );
  }

  return result;
}
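For orientation, here is a minimal sketch of how this fixture's result object might be consumed from a one-off runner. The export name `runHybridFixture` and the fixture filename are assumptions made for illustration; only the `overall`, `notes`, and `real_numbers` fields are taken from the assembly code above.

```ts
// Hypothetical runner (not part of this PR).
import { runHybridFixture } from "./hybrid_fixture.ts"; // export name assumed

const result = await runHybridFixture();
console.log(JSON.stringify(result.real_numbers, null, 2));
console.log(`overall=${result.overall}`);
for (const note of result.notes) console.log(`note: ${note}`);

// While Phase 45 slice 3 is unimplemented, "partial_pass" is the expected
// outcome: layer 5 fails by design, the other layers should pass.
process.exit(result.overall === "fail" ? 1 : 0);
```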
145
auditor/gitea.ts
Normal file
@ -0,0 +1,145 @@
// Gitea API client. Minimal surface — only what the auditor needs:
// list open PRs, get commits + files for a PR, fetch a diff, post a
// commit status, post a review or issue comment.
//
// Auth: reads the PAT from ~/.git-credentials (set up by the credential
// helper flow in the 2026-04-22 session). Gitea's "token" auth scheme
// matches what `git fetch` is already using.

import { readFile } from "node:fs/promises";
import type { PrSnapshot } from "./types.ts";

const HOST = process.env.GITEA_HOST ?? "https://git.agentview.dev";
const OWNER = "profit";
const REPO = "lakehouse";
const CRED_FILE = "/home/profit/.git-credentials";

let cachedPat: string | null = null;

async function getPat(): Promise<string> {
  if (cachedPat) return cachedPat;
  const raw = await readFile(CRED_FILE, "utf8");
  for (const line of raw.split("\n")) {
    const m = line.match(/^https:\/\/[^:]+:([^@]+)@git\.agentview\.dev/);
    if (m) { cachedPat = m[1]; return m[1]; }
  }
  throw new Error(`no Gitea PAT in ${CRED_FILE}`);
}

async function giteaFetch(path: string, init: RequestInit = {}): Promise<Response> {
  const pat = await getPat();
  const url = `${HOST}/api/v1${path}`;
  const headers = new Headers(init.headers);
  headers.set("Authorization", `token ${pat}`);
  if (init.body && !headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  return fetch(url, { ...init, headers, signal: AbortSignal.timeout(20000) });
}

export async function listOpenPrs(): Promise<PrSnapshot[]> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls?state=open&page=1&limit=50`);
  if (!r.ok) throw new Error(`listOpenPrs ${r.status}: ${await r.text()}`);
  const rows = (await r.json()) as any[];
  return Promise.all(rows.map(row => snapshotFromPr(row)));
}

export async function getPrSnapshot(num: number): Promise<PrSnapshot> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}`);
  if (!r.ok) throw new Error(`getPr ${num} ${r.status}: ${await r.text()}`);
  return snapshotFromPr((await r.json()) as any);
}

async function snapshotFromPr(row: any): Promise<PrSnapshot> {
  const num = row.number;
  const commitsResp = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}/commits`);
  const commits = commitsResp.ok ? ((await commitsResp.json()) as any[]) : [];
  const filesResp = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}/files`);
  const files = filesResp.ok ? ((await filesResp.json()) as any[]) : [];
  return {
    number: num,
    head_sha: row.head?.sha ?? "",
    base_sha: row.base?.sha ?? "",
    title: row.title ?? "",
    body: row.body ?? "",
    state: row.state === "open" ? "open" : (row.merged ? "merged" : "closed"),
    author: row.user?.login ?? "",
    commits: commits.map(c => ({
      sha: (c.sha ?? "").slice(0, 12),
      message: c.commit?.message ?? "",
      author: c.commit?.author?.name ?? "",
    })),
    files: files.map(f => ({
      path: f.filename ?? "",
      additions: f.additions ?? 0,
      deletions: f.deletions ?? 0,
    })),
  };
}

/// Returns the unified diff text of the PR. Used by static checks.
export async function getPrDiff(num: number): Promise<string> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${num}.diff`);
  if (!r.ok) throw new Error(`getDiff ${num} ${r.status}: ${await r.text()}`);
  return await r.text();
}

/// Hard-block mechanism: post a failing commit status on the PR head
/// SHA. Branch protection (if enabled on `main`) treats this as a
/// required-check fail and prevents merge. The description is shown
/// in the Gitea UI next to the red X.
export async function postCommitStatus(args: {
  sha: string;
  state: "success" | "pending" | "failure" | "error";
  context: string;
  description: string;
  target_url?: string;
}): Promise<void> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/statuses/${args.sha}`, {
    method: "POST",
    body: JSON.stringify({
      state: args.state,
      context: args.context,
      description: args.description.slice(0, 140),
      target_url: args.target_url ?? "",
    }),
  });
  if (!r.ok) throw new Error(`postCommitStatus ${r.status}: ${await r.text()}`);
}

/// Post a review comment. Gitea typically blocks self-review
/// (author posting a review on their own PR). Prefer
/// `postIssueComment` when running with the author's PAT.
export async function postReview(args: {
  pr_number: number;
  commit_id: string;
  body: string;
  event: "APPROVE" | "REQUEST_CHANGES" | "COMMENT";
}): Promise<void> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/pulls/${args.pr_number}/reviews`, {
    method: "POST",
    body: JSON.stringify({
      commit_id: args.commit_id,
      body: args.body,
      event: args.event,
    }),
  });
  if (!r.ok) throw new Error(`postReview ${r.status}: ${await r.text()}`);
}

/// Plain issue comment. Works with the auditor's own PAT because
/// Gitea allows authors to comment on their own PRs (just not
/// review them). The auditor uses this for the reasoning body; the
/// actual block signal is the commit status.
export async function postIssueComment(args: {
  pr_number: number;
  body: string;
}): Promise<{ id: number; html_url: string }> {
  const r = await giteaFetch(`/repos/${OWNER}/${REPO}/issues/${args.pr_number}/comments`, {
    method: "POST",
    body: JSON.stringify({ body: args.body }),
  });
  if (!r.ok) throw new Error(`postIssueComment ${r.status}: ${await r.text()}`);
  const j = await r.json() as any;
  return { id: j.id, html_url: j.html_url };
}
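A sketch of how a caller could turn a Verdict into the two Gitea side effects this client exposes: the failing commit status that does the hard blocking, and the plain issue comment that carries the reasoning. The real wiring lives in auditor/audit.ts; the mapping from `overall` to status state below is illustrative.

```ts
// Illustrative only; the real orchestration is in auditor/audit.ts.
import { postCommitStatus, postIssueComment } from "./gitea.ts";
import type { Verdict } from "./types.ts";

export async function publishVerdict(v: Verdict): Promise<void> {
  // Hard block: anything short of "approve" is posted as a failing
  // required check under the lakehouse/auditor context.
  await postCommitStatus({
    sha: v.head_sha,
    state: v.overall === "approve" ? "success" : "failure",
    context: "lakehouse/auditor",
    description: v.one_liner, // postCommitStatus truncates to 140 chars
  });

  // Human-readable reasoning as a plain issue comment (works even when
  // the auditor shares the PR author's PAT).
  const lines = v.findings.map(f => `- [${f.severity}] ${f.check}: ${f.summary}`);
  await postIssueComment({
    pr_number: v.pr_number,
    body: `Auditor verdict: **${v.overall}** (${v.one_liner})\n\n${lines.join("\n")}`,
  });
}
```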
147
auditor/index.ts
Normal file
@ -0,0 +1,147 @@
// Auditor poller — the top-level entry. Polls Gitea for open PRs on
// a fixed interval, dedupes by head SHA, runs audit + posts verdict
// for each new (pr, sha) pair.
//
// Run manually:
//   bun run auditor/index.ts
//
// Stop:
//   touch auditor.paused   (cycles are skipped while the file exists)
//   pkill -f auditor/index.ts   (kills in-flight)
//
// State:
//   data/_auditor/state.json — last-audited SHA per PR
//   data/_auditor/verdicts/{id}.json — per-run verdict records
//
// This entry runs forever. A systemd unit would wrap it once the
// workflow is trusted (same pattern as mcp-server, observer).

import { readFile, writeFile, mkdir, access } from "node:fs/promises";
import { listOpenPrs } from "./gitea.ts";
import { auditPr } from "./audit.ts";

const POLL_INTERVAL_MS = 90_000; // 90s — enough budget for audit runs to complete
const PAUSE_FILE = "/home/profit/lakehouse/auditor.paused";
const STATE_FILE = "/home/profit/lakehouse/data/_auditor/state.json";

interface State {
  // Map: PR number → last-audited head SHA. Lets us dedupe audits
  // across restarts (poller can crash/restart without re-auditing
  // all open PRs from scratch).
  last_audited: Record<string, string>;
  started_at: string;
  cycles_total: number;
  cycles_skipped_paused: number;
  audits_run: number;
  last_cycle_at?: string;
}

async function fileExists(path: string): Promise<boolean> {
  try { await access(path); return true; } catch { return false; }
}

async function loadState(): Promise<State> {
  try {
    const raw = await readFile(STATE_FILE, "utf8");
    const s = JSON.parse(raw);
    return {
      last_audited: s.last_audited ?? {},
      started_at: s.started_at ?? new Date().toISOString(),
      cycles_total: s.cycles_total ?? 0,
      cycles_skipped_paused: s.cycles_skipped_paused ?? 0,
      audits_run: s.audits_run ?? 0,
      last_cycle_at: s.last_cycle_at,
    };
  } catch {
    return {
      last_audited: {},
      started_at: new Date().toISOString(),
      cycles_total: 0,
      cycles_skipped_paused: 0,
      audits_run: 0,
    };
  }
}

async function saveState(s: State): Promise<void> {
  await mkdir("/home/profit/lakehouse/data/_auditor", { recursive: true });
  await writeFile(STATE_FILE, JSON.stringify(s, null, 2));
}

async function runCycle(state: State): Promise<State> {
  state.cycles_total += 1;
  state.last_cycle_at = new Date().toISOString();

  if (await fileExists(PAUSE_FILE)) {
    state.cycles_skipped_paused += 1;
    console.log(`[auditor] cycle ${state.cycles_total}: paused (${PAUSE_FILE} present)`);
    return state;
  }

  let prs;
  try {
    prs = await listOpenPrs();
  } catch (e) {
    console.error(`[auditor] listOpenPrs failed: ${(e as Error).message}`);
    return state;
  }

  console.log(`[auditor] cycle ${state.cycles_total}: ${prs.length} open PR(s)`);

  for (const pr of prs) {
    const last = state.last_audited[String(pr.number)];
    if (last === pr.head_sha) {
      console.log(`[auditor] skip PR #${pr.number} (SHA ${pr.head_sha.slice(0, 8)} already audited)`);
      continue;
    }
    console.log(`[auditor] audit PR #${pr.number} (${pr.head_sha.slice(0, 8)}) — ${pr.title.slice(0, 60)}`);
    try {
      // Skip dynamic by default: it mutates live playbook state, and
      // re-running it on every PR update would pollute quickly. The
      // operator can run the dynamic check manually via
      // `bun run auditor/fixtures/cli.ts`, or set LH_AUDITOR_RUN_DYNAMIC=1
      // to opt in.
      const run_dynamic = process.env.LH_AUDITOR_RUN_DYNAMIC === "1";
      const verdict = await auditPr(pr, {
        skip_dynamic: !run_dynamic,
        skip_inference: process.env.LH_AUDITOR_SKIP_INFERENCE === "1",
      });
      console.log(`[auditor] verdict=${verdict.overall} findings=${verdict.metrics.findings_total} (block=${verdict.metrics.findings_block} warn=${verdict.metrics.findings_warn})`);
      state.last_audited[String(pr.number)] = pr.head_sha;
      state.audits_run += 1;
    } catch (e) {
      console.error(`[auditor] audit failed: ${(e as Error).message}`);
    }
  }

  return state;
}

async function main(): Promise<void> {
  console.log(`[auditor] starting poller — interval ${POLL_INTERVAL_MS / 1000}s`);
  console.log(`[auditor] pause file: ${PAUSE_FILE}`);
  console.log(`[auditor] state file: ${STATE_FILE}`);

  let state = await loadState();
  console.log(`[auditor] loaded state: ${Object.keys(state.last_audited).length} PRs previously audited, ${state.cycles_total} cycles so far`);

  // Single-shot mode for CLI testing: `bun run auditor/index.ts --once`
  const once = process.argv.includes("--once");
  if (once) {
    state = await runCycle(state);
    await saveState(state);
    console.log(`[auditor] single-shot complete. total audits: ${state.audits_run}`);
    return;
  }

  // Loop.
  while (true) {
    state = await runCycle(state);
    await saveState(state);
    await new Promise(res => setTimeout(res, POLL_INTERVAL_MS));
  }
}

main().catch(e => {
  console.error("[auditor] fatal:", e);
  process.exit(1);
});
62
auditor/policy.ts
Normal file
@ -0,0 +1,62 @@
// ═══════════════════════════════════════════════════════════════════
// YOU WRITE THIS FILE. Policy decides what blocks vs what's a comment.
// Defaults are opinionated on the "stop clicking past placeholder"
// side — easier to loosen than to tighten when you're watching the
// auditor behave in live PRs.
// ═══════════════════════════════════════════════════════════════════

import type { Finding, Verdict } from "./types.ts";

/// Translate the four-check output into a single verdict. This is the
/// single pane of glass the auditor operates on — tune thresholds here.
export function assembleVerdict(
  findings: Finding[],
  metrics: Record<string, number>,
  pr_number: number,
  head_sha: string,
): Verdict {
  const blocking = findings.filter(f => f.severity === "block");
  const warning = findings.filter(f => f.severity === "warn");

  let overall: Verdict["overall"];
  let one_liner: string;

  if (blocking.length > 0) {
    overall = "block";
    one_liner = `${blocking.length} blocking issue${blocking.length > 1 ? "s" : ""}: ${blocking[0].summary}`;
  } else if (warning.length >= 3) {
    // Three or more warnings is a block — death by a thousand cuts.
    overall = "block";
    one_liner = `${warning.length} warnings — see review`;
  } else if (warning.length > 0) {
    overall = "request_changes";
    one_liner = warning[0].summary;
  } else {
    overall = "approve";
    one_liner = `all checks passed (${findings.length} findings, all info)`;
  }

  return {
    pr_number,
    head_sha,
    audited_at: new Date().toISOString(),
    overall,
    findings,
    metrics,
    one_liner,
  };
}

/// Which strength-of-claim warrants which severity when evidence is
/// weak? A "Phase X shipped" claim with zero integration tests is a
/// blocker. A "should work" claim with no test is a warn.
export function severityFromClaimEvidence(
  claim_strength: "weak" | "moderate" | "strong",
  evidence_grade: "none" | "partial" | "full",
): "info" | "warn" | "block" {
  if (evidence_grade === "full") return "info";
  if (claim_strength === "strong" && evidence_grade === "none") return "block";
  if (claim_strength === "strong" && evidence_grade === "partial") return "warn";
  if (claim_strength === "moderate" && evidence_grade === "none") return "warn";
  return "info";
}
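A hypothetical call site for the two helpers above, showing one claim being graded and rolled into a verdict. The claim text, PR number, and evidence strings are invented for illustration; the types and function signatures are the ones defined in this PR.

```ts
import { assembleVerdict, severityFromClaimEvidence } from "./policy.ts";
import type { Claim, Finding } from "./types.ts";

// Invented claim a static check might have pulled from a commit message.
const claim: Claim = {
  text: "Phase 45 slice 3 shipped end-to-end",
  commit_sha: "abc123def456",
  location: "commit_message",
  strength: "strong",
};

// A strong claim with no supporting evidence grades as "block" under the
// defaults in severityFromClaimEvidence.
const finding: Finding = {
  check: "static",
  severity: severityFromClaimEvidence(claim.strength, "none"),
  claim_text: claim.text,
  summary: "claims slice 3 shipped, but the diff adds no endpoint or test",
  evidence: ["diff touches docs only; /doc_drift/check/* not present"],
};

const verdict = assembleVerdict([finding], { findings_total: 1 }, 7, "abc123def456");
console.log(verdict.overall, verdict.one_liner); // block  1 blocking issue: ...
```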
65
auditor/types.ts
Normal file
@ -0,0 +1,65 @@
// Shared types for the claim-auditor. Every field exists for a reason;
// if something can't be verified from a check, it goes into `evidence`
// so the verdict is inspectable, not a black box.

export type CheckKind = "static" | "dynamic" | "inference" | "kb_query";

export type Severity = "info" | "warn" | "block";

export interface Claim {
  // Verbatim phrase that raised the claim — e.g. "Phase 38 shipped",
  // "verified end-to-end", "works after restart". Used as the "what
  // does the author assert" input to downstream checks.
  text: string;
  // Where it came from. `commit_sha` is the short hash; `location`
  // is a file:line for in-diff claims, or "pr_body" / "commit_message".
  commit_sha: string;
  location: string;
  // Heuristic rating of how strong the claim is. "green+tested"
  // is strong; "should work" is weak. Drives sensitivity — stronger
  // claims get harder-blocked on weak evidence.
  strength: "weak" | "moderate" | "strong";
}

export interface Finding {
  check: CheckKind;
  severity: Severity;
  claim_text?: string;
  // Free-form short description: "field added but never read", "no
  // test covers this code path", "cloud model says placeholder".
  summary: string;
  // Concrete evidence: file paths, line numbers, log excerpts, test
  // output, cloud-model verdict. No handwaving.
  evidence: string[];
}

export interface Verdict {
  pr_number: number;
  head_sha: string;
  audited_at: string;
  overall: "approve" | "request_changes" | "block";
  findings: Finding[];
  // Real numbers that downstream policy can gate on. e.g. if the
  // hybrid test produced latency numbers or token counts, they
  // surface here so /auditor/history is queryable.
  metrics: Record<string, number>;
  // Short one-line justification for the `overall` verdict; it is
  // posted as the commit-status description in Gitea, so it has to
  // fit in 140 chars.
  one_liner: string;
}

export interface PrSnapshot {
  number: number;
  head_sha: string;
  base_sha: string;
  title: string;
  body: string;
  state: "open" | "closed" | "merged";
  author: string;
  // Array of commit messages in the PR (not diffs — those are
  // fetched on-demand per-check).
  commits: Array<{ sha: string; message: string; author: string }>;
  // File paths touched by the PR, with lines-added / lines-removed.
  files: Array<{ path: string; additions: number; deletions: number }>;
}
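To make the shape concrete, here is a hedged example of one verdict record as it might be written under data/_auditor/verdicts/ (see the poller header above). Every value is invented; only the structure follows the Verdict and Finding interfaces.

```ts
import type { Verdict } from "./types.ts";

// Invented values; the shape is what matters.
const example: Verdict = {
  pr_number: 7,
  head_sha: "abc123def4567890",
  audited_at: "2026-04-22T12:00:00.000Z",
  overall: "request_changes",
  findings: [
    {
      check: "kb_query",
      severity: "warn",
      summary: "a similar 'works after restart' claim failed twice before",
      evidence: ["data/_kb/restart-regression-notes.md"], // invented path
    },
  ],
  metrics: { findings_total: 1, findings_block: 0, findings_warn: 1 },
  one_liner: "1 warning: prior failures on similar restart claims",
};

console.log(JSON.stringify(example, null, 2));
```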