Compare commits
3 Commits
77650c4ba3
...
a264bcf3fc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a264bcf3fc | ||
|
|
181c35b829 | ||
|
|
2afad0f83f |
@ -19,6 +19,15 @@ import { extractFacts } from "../fact_extractor.ts";
|
|||||||
|
|
||||||
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||||
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
||||||
|
// Tie-breaker for claims where the N=3 consensus produces a 1-1-1
|
||||||
|
// split (genuinely borderline). Different architecture from the
|
||||||
|
// primary reviewer (gpt-oss) so the tie-break isn't correlated with
|
||||||
|
// the original disagreement. qwen3-coder:480b is a newer coding
|
||||||
|
// specialist at 480B params, well-suited to PR-diff claim verification
|
||||||
|
// and distinct in training lineage from gpt-oss.
|
||||||
|
const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "qwen3-coder:480b";
|
||||||
|
const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3);
|
||||||
|
const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl";
|
||||||
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
|
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
|
||||||
// previously truncated at 15KB causing the reviewer to miss later
|
// previously truncated at 15KB causing the reviewer to miss later
|
||||||
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
|
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
|
||||||
@ -168,94 +177,131 @@ export async function runInferenceCheck(
|
|||||||
`Strict JSON only, matching the shape described. No prose outside JSON.`,
|
`Strict JSON only, matching the shape described. No prose outside JSON.`,
|
||||||
].join("\n");
|
].join("\n");
|
||||||
|
|
||||||
let resp: Response;
|
// N=3 consensus — run the primary reviewer in parallel, collect
|
||||||
try {
|
// all three parsed responses, majority-vote per claim. Parallel
|
||||||
resp = await fetch(`${GATEWAY}/v1/chat`, {
|
// (Promise.all) because each call is ~20-30s and they're independent;
|
||||||
method: "POST",
|
// wall-clock stays ~same as single call, cost 3x tokens. Empirical
|
||||||
headers: { "content-type": "application/json" },
|
// justification: in 3-run determinism tests, 7/8 findings were
|
||||||
body: JSON.stringify({
|
// stable but 1 flipped across runs — majority vote stabilizes the
|
||||||
provider: "ollama_cloud",
|
// flipping class without losing the stable signal.
|
||||||
model: MODEL,
|
const primaryRuns = await Promise.all(
|
||||||
messages: [
|
Array.from({ length: N_CONSENSUS }, () =>
|
||||||
{ role: "system", content: systemMsg },
|
runCloudInference(systemMsg, userMsg, MODEL)),
|
||||||
{ role: "user", content: userMsg },
|
);
|
||||||
],
|
|
||||||
// Deterministic classification — temp=0 is greedy-sample, so
|
const parsedRuns = primaryRuns.filter(r => r.parsed !== null);
|
||||||
// identical input yields identical output on the same model
|
if (parsedRuns.length === 0) {
|
||||||
// version. This kills the signature creep we observed in the
|
// All N calls failed. Surface the first-run diagnostic so the
|
||||||
// 9-run empirical test (sig_count 16→27 from cloud phrasing
|
// operator sees *why* (unreachable / non-200 / unparseable).
|
||||||
// variance at temp=0.2).
|
const first = primaryRuns[0];
|
||||||
//
|
|
||||||
// IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
|
|
||||||
// model; setting think=false caused it to return empty content
|
|
||||||
// on large prompts (observed during Level 1 validation: 13421
|
|
||||||
// tokens used, empty content returned). The reasoning trace is
|
|
||||||
// variable prose, but at temp=0 the FINAL classification is
|
|
||||||
// still deterministic because greedy sampling converges to
|
|
||||||
// the same conclusion from the same starting state.
|
|
||||||
max_tokens: 3000,
|
|
||||||
temperature: 0,
|
|
||||||
think: true,
|
|
||||||
}),
|
|
||||||
signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
|
|
||||||
});
|
|
||||||
} catch (e) {
|
|
||||||
// Cloud unreachable → soft-fail. Don't block a PR because the
|
|
||||||
// reviewer model is down. Static + dynamic + kb still run.
|
|
||||||
return [{
|
return [{
|
||||||
check: "inference",
|
check: "inference",
|
||||||
severity: "info",
|
severity: "info",
|
||||||
summary: "cloud inference unreachable — skipped",
|
summary: `cloud inference all ${N_CONSENSUS} consensus runs failed — ${first.error ?? "unknown"}`,
|
||||||
evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
|
|
||||||
}];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!resp.ok) {
|
|
||||||
return [{
|
|
||||||
check: "inference",
|
|
||||||
severity: "info",
|
|
||||||
summary: `cloud inference returned ${resp.status} — skipped`,
|
|
||||||
evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
|
|
||||||
}];
|
|
||||||
}
|
|
||||||
|
|
||||||
const body: any = await resp.json();
|
|
||||||
const content: string = body?.choices?.[0]?.message?.content ?? "";
|
|
||||||
const usage = body?.usage ?? {};
|
|
||||||
|
|
||||||
const parsed = extractJson(content);
|
|
||||||
if (!parsed) {
|
|
||||||
return [{
|
|
||||||
check: "inference",
|
|
||||||
severity: "info",
|
|
||||||
summary: "cloud returned unparseable output — skipped",
|
|
||||||
evidence: [
|
evidence: [
|
||||||
`head: ${content.slice(0, 200)}`,
|
`first-run diagnostic: ${first.diagnostic ?? "(none)"}`,
|
||||||
`tokens: ${usage.total_tokens ?? "?"}`,
|
`successful runs: 0 / ${N_CONSENSUS}`,
|
||||||
],
|
],
|
||||||
}];
|
}];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Aggregate votes per claim_idx.
|
||||||
|
interface Votes { trues: number; falses: number; evidences: string[] }
|
||||||
|
const votesByClaim = new Map<number, Votes>();
|
||||||
|
const unflaggedByRun: any[][] = [];
|
||||||
|
let totalTokens = 0;
|
||||||
|
for (const run of parsedRuns) {
|
||||||
|
totalTokens += run.tokens;
|
||||||
|
unflaggedByRun.push(Array.isArray(run.parsed?.unflagged_gaps) ? run.parsed.unflagged_gaps : []);
|
||||||
|
for (const v of run.parsed?.claim_verdicts ?? []) {
|
||||||
|
const idx = Number(v?.claim_idx);
|
||||||
|
if (!Number.isFinite(idx)) continue;
|
||||||
|
const rec = votesByClaim.get(idx) ?? { trues: 0, falses: 0, evidences: [] };
|
||||||
|
if (v.backed === false) {
|
||||||
|
rec.falses++;
|
||||||
|
rec.evidences.push(String(v.evidence ?? ""));
|
||||||
|
} else if (v.backed === true) {
|
||||||
|
rec.trues++;
|
||||||
|
}
|
||||||
|
votesByClaim.set(idx, rec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const findings: Finding[] = [];
|
const findings: Finding[] = [];
|
||||||
|
|
||||||
// One summary info finding so the verdict layer knows the check ran.
|
// Summary finding so the verdict layer knows the check ran.
|
||||||
findings.push({
|
findings.push({
|
||||||
check: "inference",
|
check: "inference",
|
||||||
severity: "info",
|
severity: "info",
|
||||||
summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})${curationNote}`,
|
summary: `cloud review completed (model=${MODEL}, consensus=${parsedRuns.length}/${N_CONSENSUS}, tokens=${totalTokens})${curationNote}`,
|
||||||
evidence: [
|
evidence: [
|
||||||
`claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
|
`claims voted: ${votesByClaim.size}`,
|
||||||
|
`parsed runs: ${parsedRuns.length} / ${N_CONSENSUS}`,
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
|
|
||||||
for (const v of parsed.claim_verdicts ?? []) {
|
// Per-claim majority vote; tie-break if no majority.
|
||||||
if (v?.backed === false) {
|
const discrepancies: Array<{
|
||||||
const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
|
claim_idx: number;
|
||||||
// Indices point at the verifiable[] list we sent the cloud,
|
claim_text: string;
|
||||||
// not the full claims[] list. Translate back.
|
votes: { trues: number; falses: number };
|
||||||
|
resolution: "majority_backed" | "majority_not_backed" | "tiebreaker_backed" | "tiebreaker_not_backed" | "unresolved";
|
||||||
|
tiebreaker_model?: string;
|
||||||
|
}> = [];
|
||||||
|
|
||||||
|
for (const [idx, votes] of votesByClaim) {
|
||||||
const claim = verifiable[idx];
|
const claim = verifiable[idx];
|
||||||
if (!claim) continue;
|
if (!claim) continue;
|
||||||
// Strong+unbacked = BLOCK. That's the whole point of the auditor.
|
const totalVotes = votes.trues + votes.falses;
|
||||||
|
let notBacked: boolean | null = null;
|
||||||
|
let resolution: typeof discrepancies[number]["resolution"] = "majority_backed";
|
||||||
|
let evidenceText = "";
|
||||||
|
let tbModel: string | undefined;
|
||||||
|
|
||||||
|
if (votes.falses > votes.trues) {
|
||||||
|
notBacked = true;
|
||||||
|
resolution = "majority_not_backed";
|
||||||
|
evidenceText = votes.evidences[0] ?? "(no reason given)";
|
||||||
|
} else if (votes.trues > votes.falses) {
|
||||||
|
notBacked = false;
|
||||||
|
resolution = "majority_backed";
|
||||||
|
} else {
|
||||||
|
// Tie. Run tie-breaker with a different-architecture model.
|
||||||
|
const tb = await runCloudInference(systemMsg, userMsg, TIEBREAKER_MODEL);
|
||||||
|
if (tb.parsed) {
|
||||||
|
const tv = (tb.parsed.claim_verdicts ?? []).find((v: any) => Number(v?.claim_idx) === idx);
|
||||||
|
if (tv?.backed === false) {
|
||||||
|
notBacked = true;
|
||||||
|
resolution = "tiebreaker_not_backed";
|
||||||
|
evidenceText = `(tie-breaker ${TIEBREAKER_MODEL}) ${String(tv.evidence ?? "")}`;
|
||||||
|
tbModel = TIEBREAKER_MODEL;
|
||||||
|
} else if (tv?.backed === true) {
|
||||||
|
notBacked = false;
|
||||||
|
resolution = "tiebreaker_backed";
|
||||||
|
tbModel = TIEBREAKER_MODEL;
|
||||||
|
} else {
|
||||||
|
resolution = "unresolved";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
resolution = "unresolved";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log every case where the N runs disagreed — discrepancies are
|
||||||
|
// signal, not noise. Separate from audit_lessons.jsonl because
|
||||||
|
// they're about the *auditor's* quality, not the PR's quality.
|
||||||
|
const disagreed = totalVotes >= 2 && votes.trues > 0 && votes.falses > 0;
|
||||||
|
if (disagreed || resolution.startsWith("tiebreaker") || resolution === "unresolved") {
|
||||||
|
discrepancies.push({
|
||||||
|
claim_idx: idx,
|
||||||
|
claim_text: claim.text,
|
||||||
|
votes: { trues: votes.trues, falses: votes.falses },
|
||||||
|
resolution,
|
||||||
|
tiebreaker_model: tbModel,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (notBacked === true) {
|
||||||
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
|
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
|
||||||
: claim.strength === "moderate" ? "warn"
|
: claim.strength === "moderate" ? "warn"
|
||||||
: "info";
|
: "info";
|
||||||
@ -266,12 +312,22 @@ export async function runInferenceCheck(
|
|||||||
summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
|
summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
|
||||||
evidence: [
|
evidence: [
|
||||||
`at ${claim.location}`,
|
`at ${claim.location}`,
|
||||||
`cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
|
`consensus: ${votes.falses}/${totalVotes} not-backed (resolution: ${resolution})`,
|
||||||
|
`cloud reason: ${evidenceText.slice(0, 200)}`,
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Persist discrepancies so we can measure consensus drift over time.
|
||||||
|
if (discrepancies.length > 0 && ctx) {
|
||||||
|
persistDiscrepancies(ctx, discrepancies).catch(e =>
|
||||||
|
console.error(`[inference] discrepancy log failed: ${(e as Error).message}`));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use first run's parsed for downstream unflagged_gaps processing.
|
||||||
|
const parsed = parsedRuns[0].parsed;
|
||||||
|
|
||||||
// Route the curated scratchpad through llm_team's extract-facts
|
// Route the curated scratchpad through llm_team's extract-facts
|
||||||
// pipeline when we have (a) a curated scratchpad (best signal about
|
// pipeline when we have (a) a curated scratchpad (best signal about
|
||||||
// what the PR actually changed) and (b) PR context to scope facts.
|
// what the PR actually changed) and (b) PR context to scope facts.
|
||||||
@ -338,6 +394,71 @@ export async function runInferenceCheck(
|
|||||||
return findings;
|
return findings;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Single cloud call — the consensus loop calls this N times in
// parallel. Returns the parsed JSON shape + token usage + any error
// diagnostic. NEVER throws: every await that can reject (fetch, body
// read, JSON decode) is guarded, so one bad run cannot reject the
// caller's Promise.all. The consensus aggregator handles partial
// failures by dropping non-parsed runs from the vote.
interface CloudRunResult {
  parsed: any | null;
  tokens: number;
  error?: string;       // "unreachable" | "non_200" | "unparseable"
  diagnostic?: string;  // first 200 chars for debugging
  model: string;
}

/**
 * Issue one chat completion against the gateway and lift the JSON
 * payload out of the reply.
 *
 * @param systemMsg system prompt sent as the first message
 * @param userMsg   user prompt sent as the second message
 * @param model     model name forwarded to the gateway
 * @returns a CloudRunResult; `parsed` is null whenever `error` is set
 */
async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<CloudRunResult> {
  // Anything can be thrown, not just Error instances — never assume `.message`.
  const describe = (e: unknown): string =>
    (e instanceof Error ? e.message : String(e)).slice(0, 200);
  const failure = (error: string, diagnostic: string, tokens = 0): CloudRunResult =>
    ({ parsed: null, tokens, error, diagnostic, model });

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        // temp=0 (greedy) + think=true. think=true is required for
        // gpt-oss:120b — without it the model returns empty content
        // on large prompts. Variance from the think trace is observed
        // in practice, which is why we use N=3 consensus, not single-
        // call determinism.
        max_tokens: 3000,
        temperature: 0,
        think: true,
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    return failure("unreachable", describe(e));
  }

  if (!resp.ok) {
    // Reading the error body can itself fail (aborted stream, timeout).
    let bodyText: string;
    try {
      bodyText = await resp.text();
    } catch (e) {
      bodyText = `(body unreadable: ${describe(e)})`;
    }
    return failure("non_200", `${resp.status}: ${bodyText.slice(0, 160)}`);
  }

  let body: any;
  try {
    body = await resp.json();
  } catch (e) {
    return failure("unparseable", describe(e));
  }

  const rawContent: unknown = body?.choices?.[0]?.message?.content;
  const content = typeof rawContent === "string" ? rawContent : "";
  const rawTokens: unknown = body?.usage?.total_tokens;
  // Callers sum tokens numerically; a string here would concatenate.
  const tokens = typeof rawTokens === "number" && Number.isFinite(rawTokens) ? rawTokens : 0;

  let parsed: any | null;
  try {
    parsed = extractJson(content);
  } catch {
    parsed = null;
  }
  if (!parsed) {
    return failure("unparseable", content.slice(0, 200), tokens);
  }
  return { parsed, tokens, model };
}
|
||||||
|
|
||||||
|
/**
 * Append one JSONL row per discrepancy to AUDIT_DISCREPANCIES_JSONL,
 * each stamped with the PR number, head SHA and a shared timestamp.
 *
 * An empty list is a no-op, so the log never gains a bare blank line.
 *
 * @param ctx           supplies `pr_number` and `head_sha` for each row
 * @param discrepancies records to persist; their own keys are spread
 *                      last and therefore win on a name clash
 */
async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
  if (discrepancies.length === 0) return;
  // One timestamp for the whole batch: the rows come from a single audit.
  const loggedAt = new Date().toISOString();
  const rows = discrepancies.map(d => JSON.stringify({
    pr_number: ctx.pr_number,
    head_sha: ctx.head_sha,
    logged_at: loggedAt,
    ...d,
  }));
  await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
  await appendFile(AUDIT_DISCREPANCIES_JSONL, rows.join("\n") + "\n");
}
|
||||||
|
|
||||||
// Extract structured knowledge from the curated scratchpad and append
|
// Extract structured knowledge from the curated scratchpad and append
|
||||||
// to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
|
// to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
|
||||||
// PR number + head SHA for scope tracking. kb_query tails this next
|
// PR number + head SHA for scope tracking. kb_query tails this next
|
||||||
@ -360,6 +481,10 @@ async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext)
|
|||||||
entities: ex.entities,
|
entities: ex.entities,
|
||||||
relationships: ex.relationships,
|
relationships: ex.relationships,
|
||||||
verification_preview: ex.verification.slice(0, 400),
|
verification_preview: ex.verification.slice(0, 400),
|
||||||
|
verifier_verdicts: ex.verifier_verdicts,
|
||||||
|
facts_dropped_by_verifier: ex.facts_dropped_by_verifier ?? 0,
|
||||||
|
schema_version: 2,
|
||||||
|
source: "audit_inference",
|
||||||
};
|
};
|
||||||
await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
|
await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
|
||||||
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
|
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
|
||||||
|
|||||||
@ -48,6 +48,12 @@ export interface ExtractedFacts {
|
|||||||
// persists to its own team_runs; this is for our own cross-ref).
|
// persists to its own team_runs; this is for our own cross-ref).
|
||||||
llm_team_run_id?: number;
|
llm_team_run_id?: number;
|
||||||
extracted_at: string;
|
extracted_at: string;
|
||||||
|
// Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/
|
||||||
|
// UNVERIFIABLE/UNCHECKED). Aligned 1:1 with the *raw* fact list
|
||||||
|
// pre-drop so operators can see which verdicts mapped to dropped
|
||||||
|
// facts if needed.
|
||||||
|
verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">;
|
||||||
|
facts_dropped_by_verifier?: number;
|
||||||
error?: string;
|
error?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,9 +148,25 @@ export async function extractFacts(source: string): Promise<ExtractedFacts> {
|
|||||||
return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
|
return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const rawFacts: string[] = Array.isArray(parsed.facts)
|
||||||
|
? parsed.facts.slice(0, 50).map(String)
|
||||||
|
: [];
|
||||||
|
|
||||||
|
// Parse the verifier's free-form prose into per-fact verdicts, then
|
||||||
|
// drop any fact the verifier explicitly marked INCORRECT. Leave
|
||||||
|
// UNVERIFIABLE in place: many of our extractions are domain-specific
|
||||||
|
// (Lakehouse internals) and the verifier has no prior-knowledge
|
||||||
|
// anchor, so UNVERIFIABLE is the expected verdict for new signal,
|
||||||
|
// not a quality fail. This is verifier-gated persistence: drop only
|
||||||
|
// what's affirmatively wrong, not what's novel.
|
||||||
|
const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length);
|
||||||
|
const incorrectIdx = new Set<number>();
|
||||||
|
verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); });
|
||||||
|
const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...base,
|
...base,
|
||||||
facts: Array.isArray(parsed.facts) ? parsed.facts.slice(0, 50).map(String) : [],
|
facts: kept,
|
||||||
entities: Array.isArray(parsed.entities)
|
entities: Array.isArray(parsed.entities)
|
||||||
? parsed.entities.slice(0, 30).map((e: any) => ({
|
? parsed.entities.slice(0, 30).map((e: any) => ({
|
||||||
name: String(e?.name ?? ""),
|
name: String(e?.name ?? ""),
|
||||||
@ -160,10 +182,36 @@ export async function extractFacts(source: string): Promise<ExtractedFacts> {
|
|||||||
})).filter(r => r.from.length > 0 && r.to.length > 0)
|
})).filter(r => r.from.length > 0 && r.to.length > 0)
|
||||||
: [],
|
: [],
|
||||||
verification: verifierText.slice(0, 1500),
|
verification: verifierText.slice(0, 1500),
|
||||||
|
facts_dropped_by_verifier: incorrectIdx.size,
|
||||||
|
verifier_verdicts: verdicts,
|
||||||
llm_team_run_id: runId,
|
llm_team_run_id: runId,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse the verifier's free-form output into a per-fact verdict array.
// The verifier output typically looks like:
//   **1.** The claim...
//     * **Verdict:** CORRECT
//   **2.** ...
//     **Verdict:** UNVERIFIABLE
//
// Strategy: find numbered headers at the start of a line, treat the
// text up to the next header as that fact's section, and take the
// first "Verdict: X" inside it. Bounding the search per section stops
// a fact with no verdict from stealing the next fact's verdict.
// If no line-anchored header exists (e.g. everything on one line), the
// older whole-text pattern is used as a fallback.
//
// Returns an array of length numFacts; unmatched positions stay
// UNCHECKED, and header numbers outside 1..numFacts are ignored.
function parseVerifierVerdicts(
  verifierText: string,
  numFacts: number,
): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> {
  type Verdict = "CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED";
  const out: Verdict[] = new Array<Verdict>(numFacts).fill("UNCHECKED");

  const record = (oneBased: string, verdict: string): void => {
    const idx = Number(oneBased) - 1;
    if (idx >= 0 && idx < numFacts) {
      out[idx] = verdict.toUpperCase() as Verdict;
    }
  };

  // Header: optional bullet, markdown heading, bold markers and a
  // "Fact"/"Claim" label, then the number, then one of . ) : —
  // optionally followed by closing bold markers ("**1.**").
  const headerRe =
    /^[ \t]*(?:[-*+][ \t]+)?(?:#+[ \t]*)?(?:\*\*)?(?:(?:Fact|Claim)[ \t]*#?[ \t]*)?(\d+)[.):](?:\*\*)?(?=\s|$)/gim;
  // Verdict: tolerate bold/italic markers on either side of the colon.
  const verdictRe = /\bVerdict\b[\s*_]*:[\s*_]*(INCORRECT|CORRECT|UNVERIFIABLE)\b/i;

  const headers = Array.from(verifierText.matchAll(headerRe));
  if (headers.length > 0) {
    headers.forEach((header, i) => {
      const start = (header.index ?? 0) + header[0].length;
      const next = headers[i + 1];
      const end = next?.index ?? verifierText.length;
      const found = verdictRe.exec(verifierText.slice(start, end));
      if (found) record(header[1], found[1]);
    });
    return out;
  }

  // Fallback for unstructured output with no line-anchored headers.
  const legacyRe = /(?:\*\*|#+\s*)?(\d+)[.):]\s[\s\S]*?\bVerdict\s*:\s*\*?\*?\s*(CORRECT|INCORRECT|UNVERIFIABLE)/gi;
  for (const m of verifierText.matchAll(legacyRe)) {
    record(m[1], m[2]);
  }
  return out;
}
|
||||||
|
|
||||||
// Lift the first balanced JSON object out of (possibly fenced) text.
|
// Lift the first balanced JSON object out of (possibly fenced) text.
|
||||||
// Same discipline as inference.ts::extractJson.
|
// Same discipline as inference.ts::extractJson.
|
||||||
function extractFirstJsonObject(text: string): any | null {
|
function extractFirstJsonObject(text: string): any | null {
|
||||||
|
|||||||
269
auditor/kb_stats.ts
Normal file
269
auditor/kb_stats.ts
Normal file
@ -0,0 +1,269 @@
|
|||||||
|
// kb_stats — on-demand dashboard numbers from the KB scratchpad
|
||||||
|
// files. Reads data/_auditor/verdicts/*, data/_kb/audit_lessons.jsonl,
|
||||||
|
// data/_kb/audit_facts.jsonl, data/_kb/audit_discrepancies.jsonl,
|
||||||
|
// data/_kb/scrum_reviews.jsonl and prints:
|
||||||
|
//
|
||||||
|
// - verdict flip-flop rate (same SHA re-audited, verdict changed?)
|
||||||
|
// - consensus discrepancy rate (N runs disagreed on a claim)
|
||||||
|
// - confidence distribution from kb_index aggregator
|
||||||
|
// - top N recurring entities from audit_facts
|
||||||
|
// - fact growth over time
|
||||||
|
// - scrum vs inference KB split
|
||||||
|
//
|
||||||
|
// Run: bun run auditor/kb_stats.ts
|
||||||
|
// bun run auditor/kb_stats.ts --top 15 # show top 15 entities
|
||||||
|
// bun run auditor/kb_stats.ts --json # machine-readable
|
||||||
|
//
|
||||||
|
// This is the "dashboard" without running Grafana. If someone really
|
||||||
|
// wants a dashboard, wire this output into a static HTML page + cron.
|
||||||
|
|
||||||
|
import { readFile, readdir } from "node:fs/promises";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { aggregate } from "./kb_index.ts";
|
||||||
|
|
||||||
|
const REPO = "/home/profit/lakehouse";
|
||||||
|
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
|
||||||
|
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
|
||||||
|
const AUDIT_FACTS = `${REPO}/data/_kb/audit_facts.jsonl`;
|
||||||
|
const AUDIT_DISCREPANCIES = `${REPO}/data/_kb/audit_discrepancies.jsonl`;
|
||||||
|
const SCRUM_REVIEWS = `${REPO}/data/_kb/scrum_reviews.jsonl`;
|
||||||
|
|
||||||
|
/** Command-line options for the stats report. */
interface Args {
  /** How many entities to list; a non-negative integer. */
  top: number;
  /** True when `--json` was passed. */
  json: boolean;
}

/**
 * Parse `--top <n>` and `--json` from a process-style argv (the first
 * two entries are skipped). Unknown arguments are ignored.
 *
 * `--top` keeps its default of 10 when the value is missing, is another
 * `--flag`, or is not a finite non-negative number; fractional values
 * are floored. A following `--flag` is not consumed as the value, so
 * `--top --json` still enables JSON output.
 */
function parseArgs(argv: string[]): Args {
  const parsed: Args = { top: 10, json: false };
  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === "--json") {
      parsed.json = true;
    } else if (arg === "--top") {
      const next = argv[i + 1];
      if (next === undefined || next.startsWith("--")) continue;
      i++; // the value belongs to --top even when it is invalid
      const n = Number(next);
      if (next.trim() !== "" && Number.isFinite(n) && n >= 0) {
        parsed.top = Math.floor(n);
      }
    }
  }
  return parsed;
}
|
||||||
|
|
||||||
|
/**
 * Best-effort JSONL reader: one parsed value per non-empty line.
 *
 * Lines that fail to parse, and lines whose value is JSON `null`, are
 * dropped. If the file cannot be read at all the result is an empty
 * array — callers never see an exception.
 */
async function readJsonl<T = any>(path: string): Promise<T[]> {
  let text: string;
  try {
    text = await readFile(path, "utf8");
  } catch {
    return [];
  }

  const rows: T[] = [];
  for (const line of text.split("\n")) {
    if (line.length === 0) continue;
    let value: any;
    try {
      value = JSON.parse(line);
    } catch {
      continue; // skip malformed row
    }
    if (value !== null) rows.push(value as T);
  }
  return rows;
}
|
||||||
|
|
||||||
|
/**
 * Load every `<pr>-<sha>.json` verdict file from VERDICTS_DIR.
 *
 * Files whose name does not match that pattern are ignored, as are
 * files that cannot be read or parsed. A missing or unreadable
 * directory yields an empty array.
 *
 * Results are ordered by PR number, then by file name. Directory
 * listing order is not guaranteed, so sorting keeps the per-PR verdict
 * sequences built from this list stable across runs and machines.
 * NOTE(review): this is a deterministic order, not a chronological one —
 * the file name carries no timestamp.
 *
 * Metric counts that are missing or non-numeric are reported as 0.
 */
async function loadVerdicts(): Promise<Array<{ pr: number; sha: string; overall: string; findings_total: number; findings_block: number; findings_warn: number }>> {
  let files: string[];
  try {
    files = await readdir(VERDICTS_DIR);
  } catch {
    return [];
  }

  const candidates = files
    .map(name => ({ name, m: /^(\d+)-([0-9a-f]+)\.json$/.exec(name) }))
    .filter((c): c is { name: string; m: RegExpExecArray } => c.m !== null)
    .map(c => ({ name: c.name, pr: Number(c.m[1]), sha: c.m[2] }))
    .sort((a, b) => a.pr - b.pr || (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

  const toCount = (raw: unknown): number => {
    const n = Number(raw ?? 0);
    return Number.isFinite(n) ? n : 0;
  };

  const out: Array<{ pr: number; sha: string; overall: string; findings_total: number; findings_block: number; findings_warn: number }> = [];
  for (const c of candidates) {
    try {
      const v = JSON.parse(await readFile(join(VERDICTS_DIR, c.name), "utf8"));
      out.push({
        pr: c.pr,
        sha: c.sha,
        overall: String(v.overall),
        findings_total: toCount(v.metrics?.findings_total),
        findings_block: toCount(v.metrics?.findings_block),
        findings_warn: toCount(v.metrics?.findings_warn),
      });
    } catch { /* skip corrupt */ }
  }
  return out;
}
|
||||||
|
|
||||||
|
/** Everything the report prints, in one serialisable object (built by collect()). */
interface Stats {
  audit_count: number;
  verdict_distribution: Record<string, number>;
  /**
   * Verdicts recorded for each PR, one entry per audited SHA. Differing
   * entries mean drift across the PR's commit history — not a flip-flop
   * in the classical sense, but worth surfacing
   * (e.g. "PR #8 was block block req req block").
   */
  per_pr_verdict_sequences: Record<number, string[]>;
  /**
   * How many PRs produced more than one distinct verdict.
   * 1 distinct verdict = stable; 2+ = some flipping.
   */
  verdict_instability: {
    pr_count: number;
    pr_with_multiple_verdicts: number;
    pr_with_3plus: number;
  };
  consensus: {
    discrepancy_count: number;
    tiebreaker_used: number;
    unresolved: number;
  };
  kb: {
    audit_lessons_rows: number;
    audit_facts_rows: number;
    scrum_reviews_rows: number;
    distinct_finding_signatures: number;
    distinct_entities_across_prs: number;
    entities_in_2plus_prs: number;
    entities_in_5plus_prs: number;
  };
  fact_quality: {
    verifier_verdict_distribution: Record<string, number>;
    facts_dropped_by_verifier_total: number;
    extraction_success_rate: number;
  };
  top_entities: Array<{
    name: string;
    distinct_prs: number;
    count: number;
    types: string[];
  }>;
  kb_by_source: Record<string, number>;
}
|
||||||
|
|
||||||
|
/**
 * Read the verdict files and KB JSONL logs and fold them into a Stats
 * object.
 *
 * @param args only `args.top` is used: the number of entities to return
 *             in `top_entities`. A non-finite value falls back to 10;
 *             negative values are clamped to 0 and fractions floored.
 * @returns the aggregated statistics
 */
async function collect(args: Args): Promise<Stats> {
  // The five inputs are independent, so load them concurrently.
  const [verdicts, lessons, facts, disc, reviews] = await Promise.all([
    loadVerdicts(),
    readJsonl<any>(AUDIT_LESSONS),
    readJsonl<any>(AUDIT_FACTS),
    readJsonl<any>(AUDIT_DISCREPANCIES),
    readJsonl<any>(SCRUM_REVIEWS),
  ]);

  // Verdict stability
  const byPr: Record<number, string[]> = {};
  const verdictDist: Record<string, number> = {};
  for (const v of verdicts) {
    (byPr[v.pr] ??= []).push(v.overall);
    verdictDist[v.overall] = (verdictDist[v.overall] ?? 0) + 1;
  }
  let multi = 0;
  let tri = 0;
  for (const seq of Object.values(byPr)) {
    const distinct = new Set(seq).size;
    if (distinct >= 2) multi++;
    if (distinct >= 3) tri++;
  }

  // Consensus drift
  const consensus = {
    discrepancy_count: disc.length,
    tiebreaker_used: disc.filter(d => String(d.resolution).startsWith("tiebreaker")).length,
    unresolved: disc.filter(d => d.resolution === "unresolved").length,
  };

  // Lesson signatures
  const lessonAgg = await aggregate<any>(AUDIT_LESSONS, {
    keyFn: r => r?.signature,
    scopeFn: r => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });

  // Entity aggregation across audit_facts rows. Entities are keyed by
  // lower-cased name; the first spelling seen is the one reported.
  interface EntAgg { distinct_prs: Set<number>; count: number; types: Set<string>; name: string }
  const entAgg = new Map<string, EntAgg>();
  const sourceCount: Record<string, number> = {};
  const totalVerdictDist: Record<string, number> = { CORRECT: 0, INCORRECT: 0, UNVERIFIABLE: 0, UNCHECKED: 0 };
  let factsDroppedTotal = 0;
  let extractionsWithFacts = 0;

  for (const row of facts) {
    const src = String(row.source ?? "unknown");
    sourceCount[src] = (sourceCount[src] ?? 0) + 1;
    const pr = Number(row.pr_number);

    if (Array.isArray(row.verifier_verdicts)) {
      for (const v of row.verifier_verdicts) {
        const key = String(v);
        totalVerdictDist[key] = (totalVerdictDist[key] ?? 0) + 1;
      }
    }
    factsDroppedTotal += Number(row.facts_dropped_by_verifier ?? 0);

    const entities: any[] = Array.isArray(row.entities) ? row.entities : [];
    // A row counts as a successful extraction if it has facts or entities.
    if ((Array.isArray(row.facts) && row.facts.length > 0) || entities.length > 0) {
      extractionsWithFacts++;
    }

    for (const e of entities) {
      const name = String(e?.name ?? "").trim();
      if (name.length < 3) continue; // too short to be a meaningful entity
      const key = name.toLowerCase();
      const agg = entAgg.get(key) ?? { distinct_prs: new Set<number>(), count: 0, types: new Set<string>(), name };
      agg.count++;
      if (Number.isFinite(pr) && pr > 0) agg.distinct_prs.add(pr);
      if (e?.type) agg.types.add(String(e.type));
      entAgg.set(key, agg);
    }
  }

  const allEntities = Array.from(entAgg.values());
  const entitiesIn2Plus = allEntities.filter(a => a.distinct_prs.size >= 2).length;
  const entitiesIn5Plus = allEntities.filter(a => a.distinct_prs.size >= 5).length;

  // slice(0, NaN) is empty and slice(0, -n) silently drops the tail,
  // so normalise the limit before using it.
  const topN = Number.isFinite(args.top) ? Math.max(0, Math.floor(args.top)) : 10;
  const topEntities = allEntities
    .sort((a, b) => b.distinct_prs.size - a.distinct_prs.size || b.count - a.count)
    .slice(0, topN)
    .map(a => ({
      name: a.name,
      distinct_prs: a.distinct_prs.size,
      count: a.count,
      types: Array.from(a.types),
    }));

  const stats: Stats = {
    audit_count: verdicts.length,
    verdict_distribution: verdictDist,
    per_pr_verdict_sequences: byPr,
    verdict_instability: {
      pr_count: Object.keys(byPr).length,
      pr_with_multiple_verdicts: multi,
      pr_with_3plus: tri,
    },
    consensus,
    kb: {
      audit_lessons_rows: lessons.length,
      audit_facts_rows: facts.length,
      scrum_reviews_rows: reviews.length,
      distinct_finding_signatures: lessonAgg.size,
      distinct_entities_across_prs: entAgg.size,
      entities_in_2plus_prs: entitiesIn2Plus,
      entities_in_5plus_prs: entitiesIn5Plus,
    },
    fact_quality: {
      verifier_verdict_distribution: totalVerdictDist,
      facts_dropped_by_verifier_total: factsDroppedTotal,
      extraction_success_rate: facts.length > 0 ? extractionsWithFacts / facts.length : 0,
    },
    top_entities: topEntities,
    kb_by_source: sourceCount,
  };
  return stats;
}
|
||||||
|
|
||||||
|
/**
 * Renders collected KB statistics as a plain-text report.
 *
 * Sections are emitted in a fixed order: audit/verdict summary, consensus,
 * KB size, fact quality, KB sources, and the top recurring entities.
 *
 * @param s - Aggregated statistics to render.
 * @returns The report as a single newline-joined string (no trailing newline).
 */
function renderHuman(s: Stats): string {
  const lines: string[] = [];
  const instability = s.verdict_instability;

  lines.push("═══ KB STATS ═══");
  lines.push("");
  lines.push(`Audits: ${s.audit_count} total across ${instability.pr_count} distinct PRs`);
  lines.push(`Verdicts: ${Object.entries(s.verdict_distribution).map(([verdict, n]) => `${verdict}=${n}`).join("  ")}`);
  // Guard the division so an empty KB reports 0% instead of NaN.
  const multiplePct = instability.pr_count > 0
    ? Math.round(100 * instability.pr_with_multiple_verdicts / instability.pr_count)
    : 0;
  lines.push(`Verdict instability: ${instability.pr_with_multiple_verdicts}/${instability.pr_count} PRs had 2+ distinct verdicts (${multiplePct}%) — 3+ distinct: ${instability.pr_with_3plus}`);
  lines.push("");

  lines.push("─── Consensus ───");
  lines.push(`  discrepancies logged:  ${s.consensus.discrepancy_count}`);
  lines.push(`  tiebreaker used:       ${s.consensus.tiebreaker_used}`);
  lines.push(`  unresolved:            ${s.consensus.unresolved}`);
  // The zero-audit fallback uses the same one-decimal format as toFixed(1)
  // so the column reads "0.0%" rather than switching to "0%".
  const dRate = s.audit_count > 0
    ? (100 * s.consensus.discrepancy_count / s.audit_count).toFixed(1)
    : "0.0";
  lines.push(`  discrepancy rate:      ${dRate}% of audits`);
  lines.push("");

  lines.push("─── KB size ───");
  lines.push(`  audit_lessons.jsonl:   ${s.kb.audit_lessons_rows} rows, ${s.kb.distinct_finding_signatures} distinct signatures`);
  lines.push(`  audit_facts.jsonl:     ${s.kb.audit_facts_rows} rows, ${s.kb.distinct_entities_across_prs} distinct entities`);
  lines.push(`  scrum_reviews.jsonl:   ${s.kb.scrum_reviews_rows} rows`);
  lines.push(`  entities in 2+ PRs:    ${s.kb.entities_in_2plus_prs}`);
  lines.push(`  entities in 5+ PRs:    ${s.kb.entities_in_5plus_prs}  ← strong cross-cutting signal`);
  lines.push("");

  lines.push("─── Fact quality ───");
  const v = s.fact_quality.verifier_verdict_distribution;
  // Verdict keys that never occurred are absent from the map; show them as 0.
  lines.push(`  verifier verdicts: CORRECT=${v.CORRECT ?? 0} UNVERIFIABLE=${v.UNVERIFIABLE ?? 0} UNCHECKED=${v.UNCHECKED ?? 0} INCORRECT=${v.INCORRECT ?? 0}`);
  lines.push(`  facts dropped by verifier: ${s.fact_quality.facts_dropped_by_verifier_total}`);
  lines.push(`  extraction success rate:   ${(s.fact_quality.extraction_success_rate * 100).toFixed(1)}%`);
  lines.push("");

  lines.push("─── KB sources ───");
  for (const [src, n] of Object.entries(s.kb_by_source)) {
    lines.push(`  ${src}: ${n}`);
  }
  lines.push("");

  lines.push(`─── Top ${s.top_entities.length} recurring entities ───`);
  for (const e of s.top_entities) {
    lines.push(`  [${e.distinct_prs} PRs × ${e.count} obs] ${e.name} (${e.types.join(",")})`);
  }
  return lines.join("\n");
}
|
||||||
|
|
||||||
|
/**
 * CLI entry point: parses argv, collects the statistics, and prints them
 * either as pretty-printed JSON (when `args.json` is set) or as the
 * human-readable report.
 */
async function main() {
  const args = parseArgs(process.argv);
  const stats = await collect(args);
  // A Set would otherwise serialize as `{}`, so the replacer converts any
  // Set it encounters into a plain array.
  const output = args.json
    ? JSON.stringify(stats, (_key, value) => (value instanceof Set ? Array.from(value) : value), 2)
    : renderHuman(stats);
  console.log(output);
}
|
||||||
|
|
||||||
|
// Run the CLI; a rejected promise is logged and the process exits with status 1.
main().catch((err) => {
  console.error("[kb_stats] fatal:", err);
  process.exit(1);
});
|
||||||
@ -343,12 +343,50 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
|||||||
attempts_made: history.length,
|
attempts_made: history.length,
|
||||||
tree_split_fired: treeSplitFired,
|
tree_split_fired: treeSplitFired,
|
||||||
suggestions_preview: accepted.slice(0, 2000),
|
suggestions_preview: accepted.slice(0, 2000),
|
||||||
|
schema_version: 2,
|
||||||
|
scrum_master_reviewed: true,
|
||||||
};
|
};
|
||||||
try {
|
try {
|
||||||
await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
|
await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
|
console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Route the accepted review through llm_team's fact extractor so
|
||||||
|
// its entities + relationships land in audit_facts.jsonl alongside
|
||||||
|
// inference-side extractions. Same index, two sources. Tagged
|
||||||
|
// source:"scrum_review" + scrum_master_reviewed:true so downstream
|
||||||
|
// queries can filter by provenance. Reviews shorter than 120
|
||||||
|
// chars are skipped — they're usually one-liners ("LGTM") with
|
||||||
|
// no extractable knowledge.
|
||||||
|
if (accepted.length >= 120 && process.env.LH_SCRUM_SKIP_EXTRACT !== "1") {
|
||||||
|
try {
|
||||||
|
const { extractFacts } = await import("../../auditor/fact_extractor.ts");
|
||||||
|
const ex = await extractFacts(accepted);
|
||||||
|
if (!ex.error || ex.entities.length + ex.facts.length > 0) {
|
||||||
|
const factRow = {
|
||||||
|
pr_number: 0, // scrum runs outside a PR scope
|
||||||
|
file: rel,
|
||||||
|
head_sha: "", // no SHA scope; scope is the file+timestamp
|
||||||
|
extracted_at: ex.extracted_at,
|
||||||
|
extractor: ex.extractor_model,
|
||||||
|
verifier: ex.verifier_model,
|
||||||
|
llm_team_run_id: ex.llm_team_run_id ?? null,
|
||||||
|
facts: ex.facts,
|
||||||
|
entities: ex.entities,
|
||||||
|
relationships: ex.relationships,
|
||||||
|
verification_preview: ex.verification.slice(0, 400),
|
||||||
|
schema_version: 2,
|
||||||
|
source: "scrum_review",
|
||||||
|
scrum_master_reviewed: true,
|
||||||
|
};
|
||||||
|
const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
|
||||||
|
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(factRow) + "\n");
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[scrum] fact extraction failed for ${rel}: ${(e as Error).message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return review;
|
return review;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user