Auditor self-test branch: real-world pipelines + cohesion Phase C + KB index (PR #8)
Bundles 12 commits validating the auditor + scrum_master architecture end-to-end: - enrich_prd_pipeline / hard_task_escalation / scrum_master_pipeline stress tests - Tree-split + scrum_reviews.jsonl + kb_query surfacing - Verdict → audit_lessons feedback loop (closed) - kb_index aggregator with confidence-based severity policy - 9-run + 5-run empirical tests proved the predictive-compounding property - Level 1 correction: temp=0 cloud inference for deterministic per-claim verdicts - audit_one.ts dry-run CLI - Fixes: static quoted-string guard, empirical-claim classification, symbol-resolver gate, repo-file size cap See PR #8 for run-by-run commit history.
This commit is contained in:
parent
6d7b251607
commit
156dae6732
@ -33,6 +33,13 @@ Defaults: polls every 90s, stops on `auditor.paused` file present.
|
||||
|
||||
- `data/_auditor/state.json` — last-audited head SHA per PR
|
||||
- `data/_auditor/verdicts/{pr}-{sha}.json` — per-run verdict record
|
||||
- `data/_kb/audit_lessons.jsonl` — one row per block/warn finding,
|
||||
path-agnostic signature for dedup. Tailed by kb_query on each audit
|
||||
to surface recurring patterns (2+ distinct PRs with same signature
|
||||
→ info, 3-4 → warn, 5+ → block). This is how the auditor learns.
|
||||
- `data/_kb/scrum_reviews.jsonl` — scrum-master per-file reviews. If
|
||||
a file in the current PR has been scrum-reviewed, kb_query surfaces
|
||||
the review as a finding with the accepted model and attempt count.
|
||||
|
||||
## Where YOU edit
|
||||
|
||||
|
||||
@ -12,7 +12,8 @@
|
||||
// review — reviews have self-review restrictions on Gitea and the
|
||||
// auditor currently uses the same PAT as the PR author).
|
||||
|
||||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||||
import { readFile, writeFile, mkdir, appendFile } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
import { join } from "node:path";
|
||||
import type { PrSnapshot, Verdict, Finding } from "./types.ts";
|
||||
import { getPrDiff, postCommitStatus, postIssueComment } from "./gitea.ts";
|
||||
@ -24,6 +25,10 @@ import { runInferenceCheck } from "./checks/inference.ts";
|
||||
import { runKbCheck } from "./checks/kb_query.ts";
|
||||
|
||||
const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts";
|
||||
// Playbook for audit findings — one row per block/warn finding from a
|
||||
// verdict. kb_query tails this next audit and escalates recurrences.
|
||||
// Structured as JSONL so it's cheap to append and cheap to tail.
|
||||
const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
|
||||
|
||||
export interface AuditOptions {
|
||||
// Skip the cloud inference call (fast path for iteration). Default false.
|
||||
@ -52,7 +57,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
runStaticCheck(diff),
|
||||
opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
|
||||
opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
|
||||
runKbCheck(claims),
|
||||
runKbCheck(claims, pr.files.map(f => f.path)),
|
||||
]);
|
||||
|
||||
const allFindings: Finding[] = [
|
||||
@ -72,6 +77,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
claims_strong: claims.filter(c => c.strength === "strong").length,
|
||||
claims_moderate: claims.filter(c => c.strength === "moderate").length,
|
||||
claims_weak: claims.filter(c => c.strength === "weak").length,
|
||||
claims_empirical: claims.filter(c => c.strength === "empirical").length,
|
||||
claims_total: claims.length,
|
||||
diff_bytes: diff.length,
|
||||
};
|
||||
@ -80,6 +86,15 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
|
||||
await persistVerdict(verdict);
|
||||
|
||||
// Feedback loop — every block/warn finding becomes a row in
|
||||
// audit_lessons.jsonl, dedup-keyed by (check, normalized-summary).
|
||||
// The next audit's kb_query reads these and escalates recurring
|
||||
// findings so we don't lose the "this pattern has been flagged
|
||||
// before" signal across runs. Fire-and-forget; failure here must
|
||||
// not break the audit.
|
||||
appendAuditLessons(verdict).catch(e =>
|
||||
console.error(`[audit] audit_lessons append failed: ${(e as Error).message}`));
|
||||
|
||||
if (!opts.dry_run) {
|
||||
await postToGitea(verdict);
|
||||
}
|
||||
@ -87,6 +102,42 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
return verdict;
|
||||
}
|
||||
|
||||
// Normalizes a finding summary for dedup: strips path-specific tails
|
||||
// ("in path/to/file.ts" → "in <file>"), line numbers, and long
|
||||
// commit-hash snippets. The goal is: the SAME class of finding on
|
||||
// DIFFERENT files should share a signature, so we can measure
|
||||
// "this pattern keeps showing up."
|
||||
function normalizedSignature(f: Finding): string {
|
||||
const summary = String(f.summary)
|
||||
.replace(/\bin\s+\S+\.(ts|rs|js|py|md)\b/gi, "in <file>")
|
||||
.replace(/:\+?\d+\b/g, ":<line>")
|
||||
.replace(/[0-9a-f]{8,}/gi, "<hash>")
|
||||
.replace(/\s+/g, " ")
|
||||
.trim()
|
||||
.slice(0, 240);
|
||||
const src = `${f.check}::${f.severity}::${summary}`;
|
||||
return createHash("sha256").update(src).digest("hex").slice(0, 16);
|
||||
}
|
||||
|
||||
async function appendAuditLessons(v: Verdict): Promise<void> {
|
||||
const actionable = v.findings.filter(f => f.severity === "block" || f.severity === "warn");
|
||||
if (actionable.length === 0) return;
|
||||
await mkdir(join(AUDIT_LESSONS_JSONL, ".."), { recursive: true });
|
||||
const rows: string[] = [];
|
||||
for (const f of actionable) {
|
||||
rows.push(JSON.stringify({
|
||||
signature: normalizedSignature(f),
|
||||
check: f.check,
|
||||
severity: f.severity,
|
||||
summary: f.summary,
|
||||
pr_number: v.pr_number,
|
||||
head_sha: v.head_sha,
|
||||
audited_at: v.audited_at,
|
||||
}));
|
||||
}
|
||||
await appendFile(AUDIT_LESSONS_JSONL, rows.join("\n") + "\n");
|
||||
}
|
||||
|
||||
async function persistVerdict(v: Verdict): Promise<void> {
|
||||
await mkdir(VERDICTS_DIR, { recursive: true });
|
||||
const filename = `${v.pr_number}-${v.head_sha.slice(0, 12)}.json`;
|
||||
|
||||
68
auditor/audit_one.ts
Normal file
68
auditor/audit_one.ts
Normal file
@ -0,0 +1,68 @@
|
||||
// One-shot dry-run audit of a single PR. Useful for verifying check
|
||||
// behavior (kb_query scrum surfacing, inference prompts, etc.) without
|
||||
// posting to Gitea. Does NOT touch state.json and does NOT post
|
||||
// commit status or PR comments.
|
||||
//
|
||||
// Run: bun run auditor/audit_one.ts <pr-number>
|
||||
|
||||
import { getPrSnapshot } from "./gitea.ts";
|
||||
import { auditPr } from "./audit.ts";
|
||||
|
||||
// Entry point: parse the PR number from argv, fetch the PR snapshot,
// run a dry-run audit (no Gitea posting, no dynamic fixture), and
// pretty-print the verdict with kb_query scrum-surfacing highlighted.
// Exits 2 on usage errors, 0 on completion.
async function main() {
  const prNumRaw = process.argv[2];
  if (!prNumRaw) {
    console.error("usage: bun run auditor/audit_one.ts <pr-number>");
    process.exit(2);
  }
  const prNum = Number(prNumRaw);
  // Number("12abc") is NaN, so this also rejects trailing garbage.
  if (!Number.isFinite(prNum)) {
    console.error(`invalid PR number: ${prNumRaw}`);
    process.exit(2);
  }

  console.log(`[audit_one] fetching PR #${prNum}...`);
  const pr = await getPrSnapshot(prNum);
  console.log(`[audit_one] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
  console.log(`[audit_one] files in diff: ${pr.files.length}`);
  for (const f of pr.files) console.log(` - ${f.path} (+${f.additions}/-${f.deletions})`);
  console.log("");

  // Dry-run configuration: never post to Gitea, never run the dynamic
  // fixture; the cloud inference call is opt-out via env var so fast
  // iterations can skip the slow path.
  const verdict = await auditPr(pr, {
    dry_run: true, // no Gitea posting
    skip_dynamic: true, // don't run fixture
    skip_inference: process.env.LH_AUDITOR_SKIP_INFERENCE === "1",
  });

  console.log("\n═══ VERDICT ═══");
  console.log(`overall: ${verdict.overall}`);
  console.log(`one-liner: ${verdict.one_liner}`);
  console.log(`findings: total=${verdict.metrics.findings_total} block=${verdict.metrics.findings_block} warn=${verdict.metrics.findings_warn} info=${verdict.metrics.findings_info}`);
  console.log("");

  // Print findings grouped by check, highlighting kb_query scrum surfacing
  const byCheck: Record<string, typeof verdict.findings> = {};
  for (const f of verdict.findings) (byCheck[f.check] ||= []).push(f);

  for (const [check, findings] of Object.entries(byCheck)) {
    console.log(`── ${check} (${findings.length}) ──`);
    for (const f of findings) {
      const tag = f.severity === "block" ? "🛑" : f.severity === "warn" ? "⚠️ " : "ℹ️ ";
      console.log(` ${tag} [${f.severity}] ${f.summary}`);
      // Scrum-master findings get their evidence expanded — that's the
      // wire this CLI exists to verify.
      if (f.summary.includes("scrum-master")) {
        for (const e of f.evidence) {
          console.log(` → ${e.slice(0, 200)}`);
        }
      }
    }
  }

  // Explicit summary of the scrum wire: detection is by summary text,
  // so this depends on checkScrumReviews' summary format staying stable.
  const scrumFindings = verdict.findings.filter(f => f.summary.includes("scrum-master"));
  console.log("");
  console.log(`═══ SCRUM WIRE CHECK: ${scrumFindings.length} scrum-master findings surfaced by kb_query ═══`);
  if (scrumFindings.length === 0) {
    console.log(" (none — either no matching scrum_reviews.jsonl rows, or files didn't match PR diff)");
  }
  process.exit(0);
}

main().catch(e => { console.error("[audit_one] fatal:", e); process.exit(1); });
|
||||
@ -13,6 +13,8 @@
|
||||
// with a 15KB diff + claim list).
|
||||
|
||||
import type { Claim, Finding } from "../types.ts";
|
||||
import { Glob } from "bun";
|
||||
import { readFile } from "node:fs/promises";
|
||||
|
||||
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
||||
@ -22,6 +24,7 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
||||
// block finding when the file was simply outside the truncation window.
|
||||
const MAX_DIFF_CHARS = 40000;
|
||||
const CALL_TIMEOUT_MS = 120_000;
|
||||
const REPO_ROOT = "/home/profit/lakehouse";
|
||||
|
||||
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
|
||||
if (claims.length === 0) {
|
||||
@ -33,6 +36,21 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
}];
|
||||
}
|
||||
|
||||
// Empirical claims (runtime metrics / observed outcomes) can't be
|
||||
// verified from the diff. Drop them from the cloud prompt so the
|
||||
// reviewer doesn't chase ghosts. A future `runtime_evidence` check
|
||||
// can validate these against data/_kb/*/summary.json outputs.
|
||||
const verifiable = claims.filter(c => c.strength !== "empirical");
|
||||
const empiricalCount = claims.length - verifiable.length;
|
||||
if (verifiable.length === 0) {
|
||||
return [{
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
|
||||
evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
|
||||
}];
|
||||
}
|
||||
|
||||
const truncated = diff.length > MAX_DIFF_CHARS
|
||||
? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
|
||||
: diff;
|
||||
@ -70,7 +88,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
|
||||
const userMsg = [
|
||||
`Ship-claims the author made (numbered 0..N-1):`,
|
||||
claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
|
||||
verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
|
||||
"",
|
||||
`Diff:`,
|
||||
"```",
|
||||
@ -94,9 +112,22 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
{ role: "system", content: systemMsg },
|
||||
{ role: "user", content: userMsg },
|
||||
],
|
||||
// Deterministic classification — temp=0 is greedy-sample, so
|
||||
// identical input yields identical output on the same model
|
||||
// version. This kills the signature creep we observed in the
|
||||
// 9-run empirical test (sig_count 16→27 from cloud phrasing
|
||||
// variance at temp=0.2).
|
||||
//
|
||||
// IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
|
||||
// model; setting think=false caused it to return empty content
|
||||
// on large prompts (observed during Level 1 validation: 13421
|
||||
// tokens used, empty content returned). The reasoning trace is
|
||||
// variable prose, but at temp=0 the FINAL classification is
|
||||
// still deterministic because greedy sampling converges to
|
||||
// the same conclusion from the same starting state.
|
||||
max_tokens: 3000,
|
||||
temperature: 0.2,
|
||||
think: true, // T3 overseer should reason — JSON shape is still required
|
||||
temperature: 0,
|
||||
think: true,
|
||||
}),
|
||||
signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
|
||||
});
|
||||
@ -152,7 +183,9 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
for (const v of parsed.claim_verdicts ?? []) {
|
||||
if (v?.backed === false) {
|
||||
const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
|
||||
const claim = claims[idx];
|
||||
// Indices point at the verifiable[] list we sent the cloud,
|
||||
// not the full claims[] list. Translate back.
|
||||
const claim = verifiable[idx];
|
||||
if (!claim) continue;
|
||||
// Strong+unbacked = BLOCK. That's the whole point of the auditor.
|
||||
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
|
||||
@ -172,17 +205,113 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
}
|
||||
|
||||
for (const g of parsed.unflagged_gaps ?? []) {
|
||||
const summary = String(g?.summary ?? "?");
|
||||
const location = String(g?.location ?? "?");
|
||||
// False-positive guard — when the cloud says "X not defined in this
|
||||
// diff" or "missing implementation of X", the cloud may just mean
|
||||
// "X is not in the added lines," not "X doesn't exist in the repo."
|
||||
// Extract candidate symbol names and grep the repo. If any symbol
|
||||
// is defined elsewhere, drop the finding — it's a known-symbol
|
||||
// reference, not a placeholder.
|
||||
if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
|
||||
const symbols = extractSymbols(summary);
|
||||
if (symbols.length > 0) {
|
||||
const resolved = await symbolsExistInRepo(symbols);
|
||||
if (resolved.length === symbols.length) {
|
||||
// Every named symbol exists somewhere in the repo — silent drop.
|
||||
continue;
|
||||
}
|
||||
if (resolved.length > 0) {
|
||||
// Partially resolved — demote to info with a note.
|
||||
findings.push({
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
|
||||
evidence: [
|
||||
`location: ${location.slice(0, 140)}`,
|
||||
`resolved via grep: ${resolved.join(",")}`,
|
||||
`unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
|
||||
],
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
findings.push({
|
||||
check: "inference",
|
||||
severity: "warn",
|
||||
summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
|
||||
evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
|
||||
summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
|
||||
evidence: [`location: ${location.slice(0, 140)}`],
|
||||
});
|
||||
}
|
||||
|
||||
return findings;
|
||||
}
|
||||
|
||||
// Pull out plausible code-symbol names from a summary string.
|
||||
// Matches:
|
||||
// - identifier with backticks: `foo_bar`
|
||||
// - identifier followed by parens: foo_bar()
|
||||
// - CamelCase types
|
||||
// - snake_case_functions
|
||||
// Filters out common English words that could be matched accidentally.
|
||||
const STOPWORDS = new Set([
|
||||
"not","the","and","for","this","that","with","but","are","was","has",
|
||||
"have","been","any","missing","implementation","diff","defined","never",
|
||||
"referenced","integrated","flow","code","file","some","only","when",
|
||||
]);
|
||||
function extractSymbols(text: string): string[] {
|
||||
const out = new Set<string>();
|
||||
// `backticked` symbols
|
||||
for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]);
|
||||
// foo() or foo_bar() calls
|
||||
for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]);
|
||||
// CamelCase types (3+ chars, must start with uppercase)
|
||||
for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]);
|
||||
return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase()));
|
||||
}
|
||||
|
||||
// Scan the repo for at least one definition of each symbol. Uses Bun's
|
||||
// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/,
|
||||
// and target/. Skips files > 500KB — those are fixtures/snapshots that
|
||||
// won't contain a definition line and slurping them slows the audit.
|
||||
async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
|
||||
const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
|
||||
const skip = (p: string) => p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/");
|
||||
const MAX_FILE_BYTES = 500_000;
|
||||
const { stat } = await import("node:fs/promises");
|
||||
const resolved = new Set<string>();
|
||||
const toFind = new Set(symbols);
|
||||
for (const pat of patterns) {
|
||||
if (toFind.size === 0) break;
|
||||
const glob = new Glob(pat);
|
||||
for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
|
||||
if (skip(f)) continue;
|
||||
try { const s = await stat(`${REPO_ROOT}/${f}`); if (s.size > MAX_FILE_BYTES) continue; } catch { continue; }
|
||||
let content: string;
|
||||
try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; }
|
||||
for (const sym of Array.from(toFind)) {
|
||||
// Definition heuristics: `function sym`, `fn sym`, `const sym`,
|
||||
// `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
|
||||
// `trait sym`, `async function sym`, `pub (async )?fn sym`.
|
||||
const re = new RegExp(
|
||||
`\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
|
||||
);
|
||||
if (re.test(content)) {
|
||||
resolved.add(sym);
|
||||
toFind.delete(sym);
|
||||
if (toFind.size === 0) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return Array.from(resolved);
|
||||
}
|
||||
|
||||
function escapeRe(s: string): string {
|
||||
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
// Lift the first balanced JSON object out of the response. Tolerates
|
||||
// leading prose, code fences, and model reasoning preamble when the
|
||||
// cloud model ignored "strict JSON only."
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
// What this check reads (all file-backed, append-only or periodic):
|
||||
// data/_kb/outcomes.jsonl — per-scenario outcomes (kb.ts)
|
||||
// data/_kb/error_corrections.jsonl — fail→succeed deltas on same sig
|
||||
// data/_kb/scrum_reviews.jsonl — scrum-master accepted reviews
|
||||
// data/_observer/ops.jsonl — observer ring → disk stream
|
||||
// data/_bot/cycles/*.json — bot cycle results
|
||||
//
|
||||
@ -17,14 +18,17 @@
|
||||
import { readFile, readdir, stat } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import type { Claim, Finding } from "../types.ts";
|
||||
import { aggregate, ratingSeverity, formatAgg } from "../kb_index.ts";
|
||||
|
||||
const KB_DIR = "/home/profit/lakehouse/data/_kb";
|
||||
const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl";
|
||||
const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles";
|
||||
const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
|
||||
const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
|
||||
const TAIL_LINES = 500;
|
||||
const MAX_BOT_CYCLE_FILES = 30;
|
||||
|
||||
export async function runKbCheck(claims: Claim[]): Promise<Finding[]> {
|
||||
export async function runKbCheck(claims: Claim[], prFiles: string[] = []): Promise<Finding[]> {
|
||||
const findings: Finding[] = [];
|
||||
|
||||
// 1. Recent scenario outcomes: are strong-claim-style phrases showing
|
||||
@ -48,6 +52,27 @@ export async function runKbCheck(claims: Claim[]): Promise<Finding[]> {
|
||||
const obsFindings = await checkObserverStream();
|
||||
findings.push(...obsFindings);
|
||||
|
||||
// 5. Scrum-master reviews — surface prior accepted reviews for any
|
||||
// file in this PR's diff. Cohesion plan Phase C wire: the
|
||||
// auditor gets to "borrow" the scrum-master's deeper per-file
|
||||
// analysis instead of re-doing that work.
|
||||
if (prFiles.length > 0) {
|
||||
const scrumFindings = await checkScrumReviews(prFiles);
|
||||
findings.push(...scrumFindings);
|
||||
}
|
||||
|
||||
// 6. Audit-lessons feedback loop — summarize the top recurring
|
||||
// patterns from prior audits' block/warn findings. If the same
|
||||
// pattern signature has fired 3+ times across prior audits,
|
||||
// emit it as a block-severity finding so reviewers know this
|
||||
// is a known-recurring class, not a one-off. Does NOT couple
|
||||
// to the current audit's static/inference findings (those run
|
||||
// in parallel and we can't see them here) — the amplification
|
||||
// is emergent: if the current audit's finding-summary matches
|
||||
// a top recurrence, the reviewer sees both.
|
||||
const auditLessonFindings = await checkAuditLessons();
|
||||
findings.push(...auditLessonFindings);
|
||||
|
||||
return findings;
|
||||
}
|
||||
|
||||
@ -181,3 +206,90 @@ function observerBySource(ops: any[]): string {
|
||||
}
|
||||
return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty";
|
||||
}
|
||||
|
||||
// Audit-lessons — reads data/_kb/audit_lessons.jsonl (populated by
|
||||
// every audit's appendAuditLessons). Uses the shared kb_index
|
||||
// aggregator: groups by `signature`, distinct-scopes keyed by PR
|
||||
// number, severity from ratingSeverity(agg) which applies the
|
||||
// confidence × count rating (see kb_index.ts). This is the same
|
||||
// aggregation any other KB reader uses — shared discipline, not
|
||||
// per-check custom logic.
|
||||
async function checkAuditLessons(): Promise<Finding[]> {
|
||||
const bySig = await aggregate<any>(AUDIT_LESSONS_JSONL, {
|
||||
keyFn: (r) => r?.signature,
|
||||
scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
|
||||
checkFn: (r) => r?.check,
|
||||
tailLimit: TAIL_LINES * 4,
|
||||
});
|
||||
if (bySig.size === 0) return [];
|
||||
|
||||
const findings: Finding[] = [];
|
||||
for (const [sig, agg] of bySig) {
|
||||
// Silent on first-ever occurrence — not yet signal.
|
||||
if (agg.count < 2) continue;
|
||||
const sev = ratingSeverity(agg);
|
||||
findings.push({
|
||||
check: "kb_query",
|
||||
severity: sev,
|
||||
summary: `recurring audit pattern (${agg.distinct_scopes} distinct PRs, ${agg.count} flaggings, conf=${agg.confidence.toFixed(2)}): ${agg.representative_summary.slice(0, 160)}`,
|
||||
evidence: [
|
||||
`signature=${sig}`,
|
||||
`checks: ${agg.checks.join(",")}`,
|
||||
`scopes: ${agg.scopes.slice(-6).join(",")}`,
|
||||
formatAgg(agg),
|
||||
],
|
||||
});
|
||||
}
|
||||
return findings;
|
||||
}
|
||||
|
||||
// Scrum-master reviews — the scrum pipeline writes one row per
// accepted per-file review. We match reviews whose `file` equals a
// path in the PR's diff (after normalization), then surface the
// *preview* + which model the escalation ladder had to reach. If the
// scrum-master needed the 123B specialist or larger to resolve a file,
// that's a meaningful signal about the code's complexity — and it's
// surfaced to the PR without the auditor having to re-run the
// escalation ladder itself.
async function checkScrumReviews(prFiles: string[]): Promise<Finding[]> {
  // Only the recent tail of the review log is consulted.
  const rows = await tailJsonl<any>(SCRUM_REVIEWS_JSONL, TAIL_LINES);
  if (rows.length === 0) return [];

  // Normalize both sides to repo-relative paths: strip leading slashes
  // and the absolute repo-root prefix. NOTE(review): matching is EXACT
  // on the normalized path — there is no filename-suffix fallback here;
  // confirm exact matching is sufficient for how scrum stores `file`.
  const norm = (p: string) => p.replace(/^\/+/, "").replace(/^home\/profit\/lakehouse\//, "");
  const prSet = new Set(prFiles.map(norm));

  // Keep only the most recent review per file (later rows overwrite
  // earlier ones — last-wins).
  const latestByFile = new Map<string, any>();
  for (const r of rows) {
    const f = norm(String(r.file ?? ""));
    if (!f) continue;
    if (!prSet.has(f)) continue;
    latestByFile.set(f, r);
  }
  if (latestByFile.size === 0) return [];

  const findings: Finding[] = [];
  for (const [file, r] of latestByFile) {
    const model = String(r.accepted_model ?? "?");
    const attempt = r.accepted_on_attempt ?? "?";
    const treeSplit = !!r.tree_split_fired;
    // Heuristic: if the scrum-master had to escalate past attempt 3,
    // or had to tree-split, that's context the PR reviewer should see.
    // Severity: info for low-escalation, warn if escalated far up the
    // ladder. Note Number("?") is NaN, so a missing attempt count
    // falls through to "info".
    const heavyEscalation = Number(attempt) >= 4;
    const sev: "warn" | "info" = heavyEscalation ? "warn" : "info";
    findings.push({
      check: "kb_query",
      severity: sev,
      summary: `scrum-master review for \`${file}\` — accepted on attempt ${attempt} by \`${model}\`${treeSplit ? " (tree-split)" : ""}`,
      evidence: [
        `reviewed_at: ${r.reviewed_at ?? "?"}`,
        `preview: ${String(r.suggestions_preview ?? "").slice(0, 300).replace(/\n/g, " ")}`,
      ],
    });
  }
  return findings;
}
|
||||
|
||||
@ -61,7 +61,13 @@ export function runStaticCheck(diff: string): Finding[] {
|
||||
|
||||
if (!isAuditorCheckerFile) {
|
||||
for (const { re, why } of BLOCK_PATTERNS) {
|
||||
if (re.test(added)) {
|
||||
const m = added.match(re);
|
||||
if (m && typeof m.index === "number") {
|
||||
// Skip if the match sits inside a quoted string literal —
|
||||
// this is how rubric files (tests/real-world/*, prompt
|
||||
// templates) legitimately reference the patterns they
|
||||
// guard against, without actually executing them.
|
||||
if (isInsideQuotedString(added, m.index)) continue;
|
||||
findings.push({
|
||||
check: "static",
|
||||
severity: "block",
|
||||
@ -154,6 +160,25 @@ function extractNewFields(addedLines: string[]): string[] {
|
||||
return Array.from(fields);
|
||||
}
|
||||
|
||||
// True if `pos` falls inside a double- or single-quoted string on this
|
||||
// line (backtick template literals too). Walks left→right toggling the
|
||||
// "in quote" state on each unescaped quote. Good enough for single-
|
||||
// line matches; multi-line strings aren't parsed (they're extremely
|
||||
// rare in the patterns we're blocking on, and would require a proper
|
||||
// tokenizer to handle correctly).
|
||||
function isInsideQuotedString(line: string, pos: number): boolean {
|
||||
let inDouble = false, inSingle = false, inBacktick = false;
|
||||
for (let i = 0; i < pos; i++) {
|
||||
const c = line[i];
|
||||
const esc = i > 0 && line[i - 1] === "\\";
|
||||
if (esc) continue;
|
||||
if (c === '"' && !inSingle && !inBacktick) inDouble = !inDouble;
|
||||
else if (c === "'" && !inDouble && !inBacktick) inSingle = !inSingle;
|
||||
else if (c === "`" && !inDouble && !inSingle) inBacktick = !inBacktick;
|
||||
}
|
||||
return inDouble || inSingle || inBacktick;
|
||||
}
|
||||
|
||||
function escape(s: string): string {
|
||||
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
@ -49,6 +49,25 @@ const WEAK_PATTERNS: RegExp[] = [
|
||||
/\bprobably\b/i,
|
||||
];
|
||||
|
||||
// Empirical claims: runtime measurements / observed outcomes that can't
// be verified from a diff (only from the actual run that produced
// them). Example: "6/6 iterations complete, 58 cloud calls, 306s
// end-to-end" — true, but only the test's own summary.json can
// confirm it. Classifying as empirical lets the inference check skip
// diff-verification and saves the ladder for falsifiable claims.
// All patterns are case-insensitive; a line matching ANY of them is
// classified empirical.
const EMPIRICAL_PATTERNS: RegExp[] = [
  // Iteration / attempt counts: "6/6 iterations", "accepted on attempt 3"
  /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
  /\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
  // Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total",
  // "5931 chars accepted"
  /\b\d+\s+(cloud\s+)?calls?\b/i,
  /\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
  /\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
  // Escalation shape: "escalated through N tiers", "N distinct models"
  /\bescalated\s+through\s+\d+\b/i,
  /\b\d+\s+distinct\s+(model|tier)s?\b/i,
];
|
||||
|
||||
export interface ParsedClaims {
|
||||
claims: Claim[];
|
||||
commits_scanned: number;
|
||||
@ -77,8 +96,21 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
||||
const line = lines[i];
|
||||
if (line.length < 3) continue;
|
||||
|
||||
// Strong patterns first — if a line matches strong, it's strong,
|
||||
// don't double-count as moderate.
|
||||
// Empirical match wins over everything else — if a line ALSO
|
||||
// contains a moderate word like "complete", we still want to
|
||||
// classify it as empirical so the inference check doesn't ask
|
||||
// the cloud to prove "58 cloud calls" from the diff. Order:
|
||||
// empirical → strong → moderate → weak.
|
||||
const empirical = firstMatch(line, EMPIRICAL_PATTERNS);
|
||||
if (empirical) {
|
||||
out.push({
|
||||
text: line.trim().slice(0, 200),
|
||||
commit_sha,
|
||||
location: `${location_prefix}:${i + 1}`,
|
||||
strength: "empirical",
|
||||
});
|
||||
continue;
|
||||
}
|
||||
const strong = firstMatch(line, STRONG_PATTERNS);
|
||||
if (strong) {
|
||||
out.push({
|
||||
|
||||
161
auditor/kb_index.ts
Normal file
161
auditor/kb_index.ts
Normal file
@ -0,0 +1,161 @@
|
||||
// kb_index — generic on-the-fly aggregation over append-only JSONL
|
||||
// scratchpads (audit_lessons, scrum_reviews, outcomes, observer ops).
|
||||
//
|
||||
// The mem0 insight: raw rows are CHEAP and tell the full story, but
|
||||
// downstream prompts need a DEFINITION, not a log. A definition is
|
||||
// the aggregate: "this signature has fired N times across M distinct
|
||||
// scopes, first_seen=X, last_seen=Y, confidence=M/N."
|
||||
//
|
||||
// This library is the single shared aggregator. Every KB writer keeps
|
||||
// appending raw rows; every KB reader uses aggregate() instead of
|
||||
// tailing the raw stream. No second file to sync, no ADD/UPDATE/NOOP
|
||||
// routing — the stats roll up from the raw rows every time.
|
||||
//
|
||||
// Why this works past hundreds of runs:
|
||||
// - aggregate() is bounded by distinct_signatures, not total_rows.
|
||||
// - confidence = distinct_scopes / count — low for same-scope noise,
|
||||
// high for cross-scope patterns. Downstream severity ramps on
|
||||
// confidence × count, not raw count, so one unfixed PR can't
|
||||
// inflate its own recurrence score (the classic mem0 failure).
|
||||
// - rotation (later) moves old raw to archive files; aggregate()
|
||||
// can still read both to compute lifetime counts when needed.
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
|
||||
export interface AggregateRow {
|
||||
signature: string;
|
||||
count: number;
|
||||
distinct_scopes: number;
|
||||
first_seen: string;
|
||||
last_seen: string;
|
||||
confidence: number; // distinct_scopes / count — capped at 1.0
|
||||
representative_summary: string; // most-recent summary for this signature
|
||||
scopes: string[]; // up to 20 most-recent scopes for debugging
|
||||
checks: string[]; // distinct `check` values (audit_lessons-specific)
|
||||
}
|
||||
|
||||
export interface AggregateOptions<T> {
|
||||
/** How to extract the dedup key from a row. */
|
||||
keyFn: (row: T) => string | undefined;
|
||||
/** How to extract the "scope" — distinct scopes count gives confidence. */
|
||||
scopeFn: (row: T) => string | undefined;
|
||||
/** How to extract the timestamp (defaults to row.audited_at / row.reviewed_at / row.timestamp). */
|
||||
timeFn?: (row: T) => string | undefined;
|
||||
/** How to extract a representative summary (defaults to row.summary). */
|
||||
summaryFn?: (row: T) => string | undefined;
|
||||
/** Max rows to read from the JSONL tail; 0 = read all. */
|
||||
tailLimit?: number;
|
||||
/** Include per-row check field (for multi-check aggregates). */
|
||||
checkFn?: (row: T) => string | undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a JSONL file and produce the aggregate map keyed by signature.
|
||||
* Safe on missing or malformed files — returns empty map.
|
||||
*/
|
||||
export async function aggregate<T = any>(
|
||||
jsonlPath: string,
|
||||
opts: AggregateOptions<T>,
|
||||
): Promise<Map<string, AggregateRow>> {
|
||||
const out = new Map<string, AggregateRow>();
|
||||
let raw: string;
|
||||
try { raw = await readFile(jsonlPath, "utf8"); } catch { return out; }
|
||||
const lines = raw.split("\n").filter(l => l.length > 0);
|
||||
const sliceFrom = opts.tailLimit && opts.tailLimit > 0 ? Math.max(0, lines.length - opts.tailLimit) : 0;
|
||||
|
||||
const timeFn = opts.timeFn ?? ((r: any) => r?.audited_at ?? r?.reviewed_at ?? r?.timestamp ?? r?.ran_at);
|
||||
const summaryFn = opts.summaryFn ?? ((r: any) => r?.summary ?? r?.representative_summary);
|
||||
|
||||
// Per-signature scope tracking — need counts by scope to compute
|
||||
// distinct_scopes without double-counting a scope that appears 50
|
||||
// times. Using a Set<scope> per signature.
|
||||
const scopeSets = new Map<string, Set<string>>();
|
||||
const checkSets = new Map<string, Set<string>>();
|
||||
|
||||
for (let i = sliceFrom; i < lines.length; i++) {
|
||||
let row: T;
|
||||
try { row = JSON.parse(lines[i]) as T; } catch { continue; }
|
||||
const sig = opts.keyFn(row);
|
||||
if (!sig) continue;
|
||||
|
||||
let agg = out.get(sig);
|
||||
if (!agg) {
|
||||
agg = {
|
||||
signature: sig,
|
||||
count: 0,
|
||||
distinct_scopes: 0,
|
||||
first_seen: "",
|
||||
last_seen: "",
|
||||
confidence: 0,
|
||||
representative_summary: "",
|
||||
scopes: [],
|
||||
checks: [],
|
||||
};
|
||||
out.set(sig, agg);
|
||||
scopeSets.set(sig, new Set<string>());
|
||||
checkSets.set(sig, new Set<string>());
|
||||
}
|
||||
|
||||
agg.count += 1;
|
||||
|
||||
const scope = opts.scopeFn(row);
|
||||
if (scope !== undefined && scope !== null && scope !== "") {
|
||||
scopeSets.get(sig)!.add(String(scope));
|
||||
// Keep scopes array ordered by recency (newest wins — shift
|
||||
// oldest when at cap).
|
||||
const arr = agg.scopes;
|
||||
const s = String(scope);
|
||||
const existing = arr.indexOf(s);
|
||||
if (existing >= 0) arr.splice(existing, 1);
|
||||
arr.push(s);
|
||||
if (arr.length > 20) arr.shift();
|
||||
}
|
||||
|
||||
if (opts.checkFn) {
|
||||
const c = opts.checkFn(row);
|
||||
if (c) checkSets.get(sig)!.add(String(c));
|
||||
}
|
||||
|
||||
const t = timeFn(row);
|
||||
if (t) {
|
||||
if (!agg.first_seen || t < agg.first_seen) agg.first_seen = t;
|
||||
if (!agg.last_seen || t > agg.last_seen) agg.last_seen = t;
|
||||
}
|
||||
|
||||
const s = summaryFn(row);
|
||||
if (s) agg.representative_summary = String(s);
|
||||
}
|
||||
|
||||
// Finalize derived fields.
|
||||
for (const [sig, agg] of out) {
|
||||
const scopes = scopeSets.get(sig) ?? new Set<string>();
|
||||
agg.distinct_scopes = scopes.size;
|
||||
agg.confidence = agg.count > 0 ? Math.min(1, agg.distinct_scopes / agg.count) : 0;
|
||||
const checks = checkSets.get(sig);
|
||||
if (checks) agg.checks = Array.from(checks).sort();
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Severity policy derived from aggregate stats. The rating lives here
|
||||
* (not in each check) so all KB readers ramp severity consistently.
|
||||
*
|
||||
* - confidence × count product is the real signal.
|
||||
* - Low confidence (< 0.3) = same-scope noise → info regardless of count.
|
||||
* - Mid confidence (0.3-0.6) = mixed signal → warn at count ≥ 3.
|
||||
* - High confidence (> 0.6) with count ≥ 5 = block-worthy cross-cutting pattern.
|
||||
*
|
||||
* Callers can override by reading agg directly; this is the default
|
||||
* policy that matches the "don't escalate one unfixed PR" discipline.
|
||||
*/
|
||||
export function ratingSeverity(agg: AggregateRow): "info" | "warn" | "block" {
|
||||
if (agg.confidence >= 0.6 && agg.count >= 5) return "block";
|
||||
if (agg.confidence >= 0.3 && agg.count >= 3) return "warn";
|
||||
return "info";
|
||||
}
|
||||
|
||||
/** Human-friendly one-line summary of an aggregate row for finding evidence. */
|
||||
export function formatAgg(agg: AggregateRow): string {
|
||||
return `count=${agg.count} distinct_scopes=${agg.distinct_scopes} confidence=${agg.confidence.toFixed(2)} seen=[${agg.first_seen.slice(0, 10)}..${agg.last_seen.slice(0, 10)}]`;
|
||||
}
|
||||
@ -18,7 +18,14 @@ export interface Claim {
|
||||
// Heuristic rating of how strong the claim is. "green+tested"
|
||||
// is strong; "should work" is weak. Drives sensitivity — stronger
|
||||
// claims get harder-blocked on weak evidence.
|
||||
strength: "weak" | "moderate" | "strong";
|
||||
//
|
||||
// "empirical" is a separate class: runtime measurements like
|
||||
// "N cloud calls" / "306s end-to-end" / "accepted on attempt N".
|
||||
// These cannot be verified from a static diff — only from the test
|
||||
// output that produced them. Inference skips diff-verification for
|
||||
// empirical claims; they become info-level context unless a future
|
||||
// runtime_evidence check contradicts them.
|
||||
strength: "weak" | "moderate" | "strong" | "empirical";
|
||||
}
|
||||
|
||||
export interface Finding {
|
||||
|
||||
528
tests/real-world/enrich_prd_pipeline.ts
Normal file
528
tests/real-world/enrich_prd_pipeline.ts
Normal file
@ -0,0 +1,528 @@
|
||||
// Real-world architecture stress test — 6 iterations of the full pipeline
|
||||
// against the PRD as a corpus. Goal: prove at scale what Phase 21
|
||||
// promised (context continuation + tree-split), plus Phase 19
|
||||
// compounding across iterations.
|
||||
//
|
||||
// Run: bun run tests/real-world/enrich_prd_pipeline.ts
|
||||
//
|
||||
// No mocks. No skipped layers. On any error, the test triggers
|
||||
// cloud-rescue rather than fail — it's the architecture's job to
|
||||
// recover. The test FAILS only if we can't complete 6 iterations.
|
||||
|
||||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
|
||||
// ─── Corpus + retrieval tuning ────────────────────────────────────
const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
const CHUNK_SIZE = 800; // chars per chunk — ~200 tokens
const CHUNK_OVERLAP = 120; // chars of overlap between adjacent chunks
const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
const INJECT_FAIL_ON_ITER = 3; // force the TASK-retry loop on iter 3

// Continuation controls (per-cloud-call) — used for output-overflow.
// Separate from the task-retry loop (per-task) — that handles errors
// across attempts.
const PRIMARY_MAX_TOKENS = 150; // tight — forces truncation
// NOTE(review): every continuation uses this same fixed budget (2× the
// primary); it does not keep doubling per retry.
const CONTINUATION_MAX_TOKENS = 300;
const MAX_CONTINUATIONS = 6; // max stitch pieces per cloud call

// Task-level retry loop (J's clarification, 2026-04-22):
// When a TASK errors, retry the whole task up to 6 times. Each
// retry gets prior attempts' failures injected as learning context,
// so attempt N+1 is informed by what N failed at. The loop caps at
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
const MAX_TASK_RETRIES = 6;

// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
// loop fires all 6 with compounding failure context.
const FORCE_RETRY_MODEL_SEQUENCE = [
  "deliberately-invalid-model-attempt-1",
  "deliberately-invalid-model-attempt-2",
  "deliberately-invalid-model-attempt-3",
  "deliberately-invalid-model-attempt-4",
  "deliberately-invalid-model-attempt-5",
  "gpt-oss:20b", // 6th attempt succeeds
];

// ─── Service endpoints + models ───────────────────────────────────
const GATEWAY = "http://localhost:3100"; // /v1/chat + playbook seed endpoint
const SIDECAR = "http://localhost:3200"; // /embed endpoint
const CLOUD_MODEL = "gpt-oss:120b";
const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar
// Base-36 timestamp — unique-enough per run, used in paths and seed names.
const RUN_NONCE = Date.now().toString(36);
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`;

// The 6 progressively-compounding questions. #6 explicitly requires
// synthesis across prior 5 answers.
const QUESTIONS: string[] = [
  "Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
  "How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
  "Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
  "What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
  "How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
  "Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
];
|
||||
|
||||
// One embedded slice of the PRD: content hash id, text, embedding
// vector from the sidecar, and the char offset it was cut from.
type Chunk = { id: string; text: string; embedding: number[]; offset: number };

// Per-iteration telemetry, written to iter_N.json and rolled up into
// the run summary.
interface IterationResult {
  iteration: number;
  question: string;
  retrieval_top_k: number; // chunks actually retrieved (≤ TOP_K_RETRIEVE)
  context_chars_before_budget: number; // joined chunk size before the budget check
  tree_split_fired: boolean; // true when context exceeded CONTEXT_BUDGET_CHARS
  cloud_calls_total: number;
  continuation_retries: number; // stitches from the LAST (successful) attempt only
  rescue_triggered: boolean;
  // Task-level retry telemetry
  task_attempts_made: number; // how many attempts fired (1 = first succeeded)
  task_retry_history: Array<{ n: number; model: string; error: string }>;
  playbook_id: string | null; // null when the seed call failed
  tokens_prompt: number;
  tokens_completion: number;
  citations_from_prior_iterations: string[]; // prior playbook IDs offered to the prompt
  duration_ms: number;
  answer_preview: string; // first 500 chars of the accepted answer
  errors_recovered: string[]; // every failure the iteration absorbed without dying
}
|
||||
|
||||
function log(msg: string) { console.log(`[enrich] ${msg}`); }
|
||||
|
||||
function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }
|
||||
|
||||
function cosine(a: number[], b: number[]): number {
|
||||
let dot = 0, na = 0, nb = 0;
|
||||
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
|
||||
if (na === 0 || nb === 0) return 0;
|
||||
return dot / (Math.sqrt(na) * Math.sqrt(nb));
|
||||
}
|
||||
|
||||
/** Short content fingerprint: first 10 hex chars of SHA-256. */
function hash(s: string): string {
  const digest = createHash("sha256").update(s).digest("hex");
  return digest.slice(0, 10);
}
|
||||
|
||||
// POST `texts` to the sidecar's /embed endpoint; returns one embedding
// vector per input text.
// NOTE(review): an earlier comment claimed "on partial failure, retry
// individually" — no such retry exists here. Any non-OK response (or the
// 2-minute timeout) throws for the whole batch.
async function embedBatch(texts: string[]): Promise<number[][]> {
  const r = await fetch(`${SIDECAR}/embed`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ texts }),
    signal: AbortSignal.timeout(120000), // 2-minute cap for the whole batch
  });
  if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  // presumably the sidecar responds { embeddings: number[][] } — verify
  // against the sidecar's /embed handler.
  return j.embeddings;
}
|
||||
|
||||
function chunkText(text: string): Array<{ text: string; offset: number }> {
|
||||
const out: Array<{ text: string; offset: number }> = [];
|
||||
let i = 0;
|
||||
while (i < text.length) {
|
||||
const end = Math.min(i + CHUNK_SIZE, text.length);
|
||||
const slice = text.slice(i, end).trim();
|
||||
if (slice.length > 50) out.push({ text: slice, offset: i });
|
||||
if (end >= text.length) break;
|
||||
i = end - CHUNK_OVERLAP;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Single chat completion through the gateway's /v1/chat endpoint.
// Throws on any non-OK HTTP status or on the 3-minute timeout. Missing
// response fields degrade to "" / 0 / "?" rather than throwing, so
// callers can treat an empty `content` as a retryable failure mode.
async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  messages: Array<{ role: string; content: string }>,
  max_tokens: number,
  think: boolean,
}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> {
  const r = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ ...opts }),
    signal: AbortSignal.timeout(180000), // 3-minute cap per call
  });
  if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  // OpenAI-style response shape: choices[0].message.content + usage.
  return {
    content: j.choices?.[0]?.message?.content ?? "",
    prompt_tokens: j.usage?.prompt_tokens ?? 0,
    completion_tokens: j.usage?.completion_tokens ?? 0,
    finish_reason: j.choices?.[0]?.finish_reason ?? "?",
  };
}
|
||||
|
||||
// ─── Tree-split over oversized chunk set ──────────────────────────
// Sequentially summarizes `chunks` shard-by-shard into a rolling
// scratchpad, making one cloud call per shard. Returns the final
// scratchpad plus how many cloud calls were made. The scratchpad is
// clamped to CONTEXT_BUDGET_CHARS by dropping its OLDEST text (slice
// from the end keeps the newest notes).
async function treeSplitSummarize(
  chunks: Chunk[],
  question: string,
): Promise<{ scratchpad: string; cloud_calls: number }> {
  // Shard into groups fitting within half the budget each.
  // With defaults (4000/2)/800 → 2 chunks per shard.
  const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE));
  const shards: Chunk[][] = [];
  for (let i = 0; i < chunks.length; i += perShard) {
    shards.push(chunks.slice(i, i + perShard));
  }
  log(` tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`);
  let scratchpad = "";
  let cloud_calls = 0;
  for (let si = 0; si < shards.length; si++) {
    const shard = shards[si];
    const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
    // Each shard sees the question, its own text, and the scratchpad so
    // far — the model updates the scratchpad incrementally.
    const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`;
    const r = await chat({
      provider: "ollama_cloud",
      model: CLOUD_MODEL,
      messages: [
        { role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." },
        { role: "user", content: userMsg },
      ],
      max_tokens: 500,
      think: false,
    });
    cloud_calls += 1;
    scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`;
    if (scratchpad.length > CONTEXT_BUDGET_CHARS) {
      // Keep only the newest CONTEXT_BUDGET_CHARS chars (drops oldest).
      scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS);
      log(` tree-split: scratchpad truncated to ${scratchpad.length} chars`);
    }
  }
  return { scratchpad, cloud_calls };
}
|
||||
|
||||
// ─── Continuable generate — up to max_continuations stitches ──────
//
// Two failure modes handled:
//   A) Empty response — typically thinking model burned the budget
//      on hidden reasoning. Retry same prompt with the (larger, fixed)
//      CONTINUATION_MAX_TOKENS budget.
//   B) Truncated response (finish_reason=length) — answer got cut off
//      mid-sentence. Pass the partial back as an assistant turn and ask
//      the model to continue from where it stopped.
//
// Stitching: content concatenates across retries; prompt_tokens and
// completion_tokens accumulate; finish_reason reflects the LAST call.
// Loop exits on the first call that finishes cleanly (stop) with
// non-empty content, OR when retries hit the cap.
// NOTE(review): when max_continuations is omitted it defaults to 1, not
// MAX_CONTINUATIONS — callers must pass the cap explicitly.
async function generateContinuable(
  opts: Parameters<typeof chat>[0] & { max_continuations?: number },
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
  const maxCont = opts.max_continuations ?? 1;
  let total = await chat(opts);
  let retries = 0;
  while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
    retries += 1;
    const mode = total.content.length === 0 ? "empty" : "truncated";
    log(` continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
    // Continuation prompt — branch on failure mode:
    //   empty  → same prompt, bigger token budget (thinking budget case)
    //   length → pass the partial as an assistant turn, ask to continue
    const continuationMessages = total.content.length === 0
      ? opts.messages
      : [
          ...opts.messages,
          { role: "assistant", content: total.content },
          { role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
        ];
    const continued = await chat({
      ...opts,
      max_tokens: CONTINUATION_MAX_TOKENS,
      messages: continuationMessages,
    });
    // Accumulate the stitch: concat content, sum token counts, keep the
    // latest finish_reason.
    total = {
      content: total.content + continued.content,
      prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
      completion_tokens: total.completion_tokens + continued.completion_tokens,
      finish_reason: continued.finish_reason,
    };
  }
  return { ...total, continuation_retries: retries };
}
|
||||
|
||||
// ─── Single iteration: retrieve → budget-check → chat → seed ─────
// Runs one full question through the pipeline: embed the question,
// retrieve top-K chunks, tree-split if over budget, generate with the
// task-level retry loop (+ local fallback), then seed the answer into
// playbook memory. Never throws for recoverable failures — they are
// recorded in errors_recovered and the iteration still returns a result.
async function runIteration(
  iteration: number,
  question: string,
  allChunks: Chunk[],
  priorPlaybookIds: string[],
  priorAnswers: string[],
): Promise<IterationResult> {
  const started = Date.now();
  const errorsRecovered: string[] = [];
  log(`iter ${iteration}: "${question.slice(0, 70)}..."`);

  // 1. Embed the question
  const qEmb = (await embedBatch([question]))[0];

  // 2. Retrieve top-K chunks by cosine
  const scored = allChunks
    .map(c => ({ c, score: cosine(qEmb, c.embedding) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, TOP_K_RETRIEVE);
  const chunks = scored.map(x => x.c);
  // NOTE(review): assumes at least one chunk exists — scored[0] would be
  // undefined on an empty corpus.
  log(` retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);

  // 3. Context budget check — tree-split if over
  const contextChars = chunks.map(c => c.text).join("\n\n").length;
  let contextForPrompt: string;
  let treeSplit = false;
  let cloudCallsTotal = 0;
  if (contextChars > CONTEXT_BUDGET_CHARS) {
    treeSplit = true;
    log(` context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
    const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
    contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
    cloudCallsTotal += cloud_calls;
  } else {
    contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
  }

  // 4. Seed prompt with prior iteration answers (real compounding).
  //    Not just IDs — the model needs the CONTENT to synthesize.
  let citationBlock = "";
  let citationsReceived: string[] = [];
  if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) {
    const lines = priorAnswers.map((ans, i) => {
      // NOTE(review): pairs answer i with priorPlaybookIds[i]; if an
      // earlier seed failed (id not pushed by main), these can desync.
      const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown";
      // Trim each prior answer to ~400 chars so we don't blow budget
      return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`;
    });
    citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`;
    citationsReceived = priorPlaybookIds.slice();
  }

  // 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
  //    Try the task up to MAX_TASK_RETRIES times. Each retry:
  //    a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
  //       cycles through 5 invalid models + 1 valid to force full loop)
  //    b) Injects prior attempt errors as learning context
  //    c) If the attempt succeeds (non-empty, >100 chars), loop exits
  //    d) Otherwise, records failure and tries again with the learning
  //
  //    Cap at 6 so we don't spin forever on unsolvable tasks.
  let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
  let rescueTriggered = false;
  const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
  const forceRetries = iteration === INJECT_FAIL_ON_ITER;
  if (forceRetries) log(` FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);

  for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
    const modelForAttempt = forceRetries
      ? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
      : CLOUD_MODEL;
    // Compose a prior-attempts learning block for attempts 2+
    const learningBlock = taskAttemptHistory.length > 0
      ? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
      : "";
    log(` task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
    try {
      const r = await generateContinuable({
        provider: "ollama_cloud",
        model: modelForAttempt,
        messages: [
          { role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
          { role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
        ],
        max_tokens: PRIMARY_MAX_TOKENS,
        think: false,
        max_continuations: MAX_CONTINUATIONS,
      });
      cloudCallsTotal += 1 + r.continuation_retries;
      if (r.content && r.content.length > 100) {
        // Acceptable answer — exit loop
        result = r;
        if (attempt > 1) {
          log(` task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
          rescueTriggered = true;
        }
        break;
      }
      // Thin response — count as failure with learning signal
      const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err}`);
    } catch (e) {
      const err = (e as Error).message;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
      // Count the failed call too — it still hit the gateway.
      cloudCallsTotal += 1;
    }
  }

  // Last-ditch: if all 6 task attempts failed, try the local fallback
  // once more so we at least return SOMETHING. This is the "don't get
  // caught in a loop, accept best-so-far" rule J stated explicitly.
  if (!result) {
    errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
    rescueTriggered = true;
    try {
      result = await generateContinuable({
        provider: "ollama",
        model: "qwen3.5:latest",
        messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
        max_tokens: 300,
        think: false,
        max_continuations: 2,
      });
      cloudCallsTotal += 1 + result.continuation_retries;
    } catch (e) {
      // Absolute last resort — fabricate a skeleton result
      result = {
        content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
        prompt_tokens: 0,
        completion_tokens: 0,
        continuation_retries: 0,
        finish_reason: "error",
      };
    }
  }
  if (result.content.length === 0) {
    errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
    rescueTriggered = true;
    result = await generateContinuable({
      provider: "ollama",
      model: "qwen3.5:latest",
      messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
      max_tokens: 300,
      think: false,
    });
    cloudCallsTotal += 1;
  }

  // 6. Seed playbook with the answer
  let playbook_id: string | null = null;
  try {
    // NOTE(review): `ts` is never used below — dead local.
    const ts = new Date().toISOString();
    const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`;
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        operation: seedOp,
        approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`,
        context: result.content.slice(0, 600),
        endorsed_names: [`iter${iteration}_${RUN_NONCE}`],
        append: true,
      }),
      signal: AbortSignal.timeout(15000),
    });
    if (r.ok) {
      const j: any = await r.json();
      playbook_id = j.outcome?.playbook_id ?? null;
    } else {
      errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`);
    }
  } catch (e) {
    errorsRecovered.push(`seed exception: ${(e as Error).message}`);
  }

  return {
    iteration,
    question,
    retrieval_top_k: chunks.length,
    context_chars_before_budget: contextChars,
    tree_split_fired: treeSplit,
    cloud_calls_total: cloudCallsTotal,
    continuation_retries: result.continuation_retries,
    rescue_triggered: rescueTriggered,
    // NOTE(review): when every attempt failed and the fallback produced
    // the answer, this reports MAX_TASK_RETRIES + 1 "attempts" — the +1
    // is only accurate when an in-loop attempt succeeded.
    task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt
    task_retry_history: taskAttemptHistory,
    playbook_id,
    tokens_prompt: result.prompt_tokens,
    tokens_completion: result.completion_tokens,
    citations_from_prior_iterations: citationsReceived,
    duration_ms: Date.now() - started,
    answer_preview: result.content.slice(0, 500),
    errors_recovered: errorsRecovered,
  };
}
|
||||
|
||||
async function main() {
|
||||
await mkdir(OUT_DIR, { recursive: true });
|
||||
log(`run nonce: ${RUN_NONCE}`);
|
||||
log(`output dir: ${OUT_DIR}`);
|
||||
|
||||
// ─── Phase 1: load, chunk, embed the PRD ───────────────────────
|
||||
log(`loading PRD from ${PRD_PATH}`);
|
||||
const prd = await readFile(PRD_PATH, "utf8");
|
||||
log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`);
|
||||
|
||||
const raw_chunks = chunkText(prd);
|
||||
log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`);
|
||||
|
||||
// Embed in batches of 32 to avoid sidecar overload
|
||||
const allChunks: Chunk[] = [];
|
||||
const BATCH = 32;
|
||||
const t0 = Date.now();
|
||||
for (let i = 0; i < raw_chunks.length; i += BATCH) {
|
||||
const batch = raw_chunks.slice(i, i + BATCH);
|
||||
const embs = await embedBatch(batch.map(b => b.text));
|
||||
for (let j = 0; j < batch.length; j++) {
|
||||
allChunks.push({
|
||||
id: hash(batch[j].text),
|
||||
text: batch[j].text,
|
||||
embedding: embs[j].map(x => Number(x)),
|
||||
offset: batch[j].offset,
|
||||
});
|
||||
}
|
||||
log(` embedded ${allChunks.length}/${raw_chunks.length}`);
|
||||
}
|
||||
log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`);
|
||||
|
||||
// ─── Phase 2: 6 iterations ─────────────────────────────────────
|
||||
const results: IterationResult[] = [];
|
||||
const priorIds: string[] = [];
|
||||
const priorAnswers: string[] = [];
|
||||
for (let i = 1; i <= QUESTIONS.length; i++) {
|
||||
const q = QUESTIONS[i - 1];
|
||||
const r = await runIteration(i, q, allChunks, priorIds, priorAnswers);
|
||||
results.push(r);
|
||||
if (r.playbook_id) priorIds.push(r.playbook_id);
|
||||
priorAnswers.push(r.answer_preview);
|
||||
log(` → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`);
|
||||
await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2));
|
||||
}
|
||||
// Check whether iter 6 actually cited prior pb:IDs in its answer.
|
||||
// Playbook IDs look like `pb-seed-<hex>` so the regex needs to allow
|
||||
// hyphens + letters inside the brackets, not just hex chars.
|
||||
const iter6 = results[5];
|
||||
const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 0) : 0;
|
||||
|
||||
// ─── Phase 3: summary ──────────────────────────────────────────
|
||||
const summary = {
|
||||
run_nonce: RUN_NONCE,
|
||||
ran_at: new Date().toISOString(),
|
||||
prd_chars: prd.length,
|
||||
prd_chunks: allChunks.length,
|
||||
iterations: results.length,
|
||||
total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0),
|
||||
total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0),
|
||||
total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0),
|
||||
tree_splits_fired: results.filter(r => r.tree_split_fired).length,
|
||||
rescues_triggered: results.filter(r => r.rescue_triggered).length,
|
||||
iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
|
||||
iter6_actually_cited_in_answer: citationsHonored,
|
||||
iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
|
||||
iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
|
||||
max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
|
||||
total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
|
||||
overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
|
||||
};
|
||||
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
|
||||
|
||||
log("");
|
||||
log(`══════ SUMMARY ${summary.overall} ══════`);
|
||||
log(` 6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
|
||||
log(` tree-splits: ${summary.tree_splits_fired}/6 continuations: ${summary.total_continuation_retries} rescues: ${summary.rescues_triggered}`);
|
||||
log(` iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
|
||||
log(` iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
|
||||
log(` total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
|
||||
log("");
|
||||
for (const r of results) {
|
||||
const flags = [
|
||||
r.tree_split_fired ? "tree-split" : "",
|
||||
r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "",
|
||||
r.rescue_triggered ? "rescued" : "",
|
||||
r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "",
|
||||
].filter(Boolean).join(" ");
|
||||
log(` iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`);
|
||||
}
|
||||
log("");
|
||||
log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`);
|
||||
process.exit(summary.overall === "PASS" ? 0 : 1);
|
||||
}
|
||||
|
||||
// Top-level guard: exit 2 marks an infrastructure crash, distinct from
// exit 1 (pipeline ran but only reached PARTIAL).
main().catch(e => { console.error("[enrich] fatal:", e); process.exit(2); });
|
||||
267
tests/real-world/hard_task_escalation.ts
Normal file
267
tests/real-world/hard_task_escalation.ts
Normal file
@ -0,0 +1,267 @@
|
||||
// Hard-task escalation test. The task is deliberately constructed so
|
||||
// that a local 7B model (qwen3.5:latest) will miss at least one of the
|
||||
// validation rules. Watch the escalation ladder:
|
||||
// 1. qwen3.5:latest (local 7B) — likely fails
|
||||
// 2. qwen3:latest (local 7B) — likely fails differently
|
||||
// 3. gpt-oss:20b (cloud 20B) — may fail
|
||||
// 4. gpt-oss:120b (cloud 120B) — should succeed
|
||||
// 5. gpt-oss:120b w/ prior-attempt errors injected — retry with context
|
||||
// 6. absolute last ditch: return best-so-far with failure annotation
|
||||
//
|
||||
// Each attempt:
|
||||
// - Calls the model via /v1/chat
|
||||
// - Validates the output against a strict rubric
|
||||
// - On fail: records the specific rubric violations + the partial
|
||||
// output, injects both into the next attempt's prompt as "here's
|
||||
// what's wrong, fix it specifically"
|
||||
// - On success: exit loop
|
||||
//
|
||||
// Run: bun run tests/real-world/hard_task_escalation.ts
|
||||
|
||||
import { writeFile, mkdir } from "node:fs/promises";
|
||||
|
||||
const GATEWAY = "http://localhost:3100"; // model gateway exposing /v1/chat
// Unique per-run artifact directory (base36 timestamp suffix).
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6; // one attempt per LADDER rung
|
||||
|
||||
// The hard task. Specific enough that a small model will miss at
|
||||
// least one rule. Not purely knowledge-based — it's a code-generation
|
||||
// task with strict structural constraints.
|
||||
const TASK = `Write a complete Rust async function with the EXACT signature:
|
||||
|
||||
pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>
|
||||
|
||||
It must:
|
||||
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
|
||||
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
|
||||
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
|
||||
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
|
||||
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
|
||||
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
|
||||
7. Include rustdoc /// comments on the function and each internal helper
|
||||
|
||||
Assume this struct is already imported:
|
||||
|
||||
pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }
|
||||
|
||||
Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;
|
||||
|
||||
// Escalation ladder — small-local → large-local → cloud → specialist
|
||||
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
|
||||
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
|
||||
// final escalation tier should be kimi-k2:1t (the biggest model
|
||||
// we have access to on Ollama Cloud).
|
||||
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
|
||||
{ provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
|
||||
{ provider: "ollama", model: "qwen3:latest", note: "local 7B (different) " },
|
||||
{ provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
|
||||
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
|
||||
{ provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
|
||||
// NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
|
||||
// K2.5/K2.6 both return "this model requires a subscription" on our
|
||||
// current Ollama Cloud key. mistral-large-3:675b is the biggest
|
||||
// model actually provisioned on this key (verified via direct curl
|
||||
// to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
|
||||
// swap this line to kimi-k2.5 or kimi-k2.6:cloud.
|
||||
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
|
||||
];
|
||||
|
||||
// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
  passed: boolean;         // true iff violations is empty
  violations: string[];    // names of rules the candidate code failed
  passed_rules: string[];  // names of rules the candidate code satisfied
}
|
||||
|
||||
function validate(code: string): RubricResult {
|
||||
const violations: string[] = [];
|
||||
const passed: string[] = [];
|
||||
|
||||
const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };
|
||||
|
||||
check("has pub async fn check_drift_batched signature",
|
||||
/pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
|
||||
check("takes Vec<DocRef> argument",
|
||||
/refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
|
||||
check("returns Result<Vec<String>, String>",
|
||||
/Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
|
||||
check("uses reqwest",
|
||||
/\breqwest\b/i.test(code));
|
||||
check("references JoinSet or Semaphore for concurrency",
|
||||
/\bJoinSet\b|\bSemaphore\b/i.test(code));
|
||||
check("bounds concurrency at 4",
|
||||
/\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
|
||||
// Exponential backoff — models express this several ways. Accept
|
||||
// any recognizable doubling pattern starting at 250ms. 2026-04-22:
|
||||
// devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
|
||||
// rejected even though the code is correct. Broadening rubric to
|
||||
// match all idiomatic doubling forms.
|
||||
const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
|
||||
|| /millis\s*\(\s*250\b/.test(code);
|
||||
const hasDoublingPattern = /250\s*\*\s*2/.test(code) // 250 * 2^n literal
|
||||
|| /<<\s*\d+/.test(code) // bit-shift
|
||||
|| /\.pow\s*\(/.test(code) // 2u32.pow(attempt)
|
||||
|| /\*=\s*2\b/.test(code) // delay *= 2 ← was missing
|
||||
|| /\*\s*2\s*;/.test(code) // delay = delay * 2;
|
||||
|| /saturating_mul\s*\(\s*2\b/.test(code); // saturating doubling
|
||||
check("has 250ms backoff seed",
|
||||
hasSeed250);
|
||||
check("reaches 500ms backoff (literal or doubling from 250)",
|
||||
/Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
|
||||
|| /millis\s*\(\s*500\b/.test(code)
|
||||
|| (hasSeed250 && hasDoublingPattern));
|
||||
check("reaches 1000ms backoff (literal or doubling to 1000)",
|
||||
/Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
|
||||
|| /millis\s*\(\s*1000\b/.test(code)
|
||||
|| (hasSeed250 && hasDoublingPattern));
|
||||
check("case-insensitive tool grouping (to_ascii_lowercase)",
|
||||
/to_ascii_lowercase|to_lowercase/.test(code));
|
||||
check("NO .unwrap() — all errors bubble via ?",
|
||||
!/\.unwrap\s*\(\s*\)/.test(code));
|
||||
check("NO .expect(...) — all errors bubble via ?",
|
||||
!/\.expect\s*\(/.test(code));
|
||||
check("NO panic!() / unimplemented!() / todo!()",
|
||||
!/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
|
||||
check("has rustdoc /// comments",
|
||||
/\/\/\//.test(code));
|
||||
check("reasonable length (> 500 chars)",
|
||||
code.length > 500);
|
||||
|
||||
return { passed: violations.length === 0, violations, passed_rules: passed };
|
||||
}
|
||||
|
||||
// Prefixed console logger for this test's output.
function log(msg: string) { console.log(`[hard] ${msg}`); }
|
||||
|
||||
// One model call through the gateway. Never throws: transport/HTTP
// failures come back as { content: "", error } so the caller can treat
// them as a failed attempt rather than a crash.
async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  prompt: string,
}): Promise<{ content: string; error?: string }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: 2500,
        temperature: 0.2, // low temp: deterministic-ish code generation
        think: false,
      }),
      signal: AbortSignal.timeout(240000), // 4 min hard cap per attempt
    });
    // Non-2xx: surface status + a bounded slice of the body as the error.
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
    const j: any = await r.json();
    return { content: j.choices?.[0]?.message?.content ?? "" };
  } catch (e) {
    // Timeouts (AbortSignal) and network errors land here.
    return { content: "", error: (e as Error).message };
  }
}
|
||||
|
||||
// One row per escalation attempt — persisted as attempt_{n}.json.
interface AttemptRecord {
  n: number;                    // 1-based attempt number
  provider: string;             // "ollama" | "ollama_cloud"
  model: string;
  duration_ms: number;
  content_chars: number;        // length of the extracted code, not the raw reply
  error: string | null;         // transport/gateway error, if any
  rubric_violations: string[];  // failed rule names (injected into the next prompt)
  rubric_passed: string[];
  accepted: boolean;            // true iff every rubric rule passed
}
|
||||
|
||||
function extractCode(raw: string): string {
|
||||
// Strip common fence wrappers
|
||||
const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
|
||||
if (m) return m[1].trim();
|
||||
return raw.trim();
|
||||
}
|
||||
|
||||
// Drives the escalation ladder: one attempt per rung, injecting every
// prior attempt's rubric violations into each subsequent prompt.
// Exits 0 when an answer is accepted, 1 otherwise; all artifacts land
// in OUT_DIR.
async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    // Clamp to the last rung if MAX_ATTEMPTS ever exceeds the ladder length.
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    let priorLearning = "";
    if (attempts.length > 0) {
      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
      for (const a of attempts) {
        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
        for (const v of a.rubric_violations) priorLearning += `  - ${v}\n`;
        if (a.error) priorLearning += `  [error: ${a.error.slice(0, 120)}]\n`;
      }
      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
    }

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    // An empty reply never reaches the rubric — it is its own violation.
    const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(`  → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    // Show at most 5 violations to keep the console readable.
    for (const v of rubric.violations.slice(0, 5)) log(`    ✗ ${v}`);

    // Persist the raw code and the structured record for every attempt.
    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(`  ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(`  ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}
|
||||
|
||||
// Top-level guard: exit 2 marks an infrastructure crash, distinct from
// exit 1 (ran fine, rubric never accepted an answer).
main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });
|
||||
181
tests/real-world/nine_consecutive_audits.ts
Normal file
181
tests/real-world/nine_consecutive_audits.ts
Normal file
@ -0,0 +1,181 @@
|
||||
// Nine-consecutive audit runner — empirical test of the predictive-
|
||||
// compounding property. Pushes 9 empty commits to the current branch,
|
||||
// waits for each audit to complete on the new SHA, captures the
|
||||
// verdict + audit_lessons state after each run, and reports whether
|
||||
// the KB stabilizes or drifts.
|
||||
//
|
||||
// What we expect (favorable compounding):
|
||||
// - signature_count grows sublinearly (same patterns recur, so
|
||||
// distinct-signature count stabilizes fast)
|
||||
// - verdict settles on a stable value after run 2-3 (first audit
|
||||
// establishes baseline, rest repeat)
|
||||
// - confidence stays LOW for all signatures (same PR repeatedly)
|
||||
// - NO new recurring findings fire because confidence < 0.3 on
|
||||
// same-PR noise (kb_index rating policy)
|
||||
//
|
||||
// What would indicate drift (the thing we want to prove DOESN'T happen):
|
||||
// - signature_count grows linearly — each run produces new signatures
|
||||
// - verdict oscillates (block → approve → block ...)
|
||||
// - confidence inflates — kb_index rating escalates on repeated runs
|
||||
//
|
||||
// Run: bun run tests/real-world/nine_consecutive_audits.ts
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
import { aggregate } from "../../auditor/kb_index.ts";
|
||||
|
||||
const REPO = "/home/profit/lakehouse";
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`; // auditor KB, one lesson per finding
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;        // per-run verdicts: {pr}-{sha12}.json
const POLL_INTERVAL_MS = 5_000;    // verdict-file poll cadence
const AUDIT_TIMEOUT_MS = 180_000;  // give up waiting for a verdict after 3 min
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);    // empty commits to push
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8); // PR whose audits we observe
|
||||
|
||||
// Run a shell command in REPO via `bash -lc`, capturing both streams.
// Returns stdout/stderr text plus the exit code; never throws on a
// non-zero exit — callers inspect `code` themselves.
async function sh(cmd: string): Promise<{ stdout: string; stderr: string; code: number }> {
  const p = Bun.spawn(["bash", "-lc", cmd], { cwd: REPO, stdout: "pipe", stderr: "pipe" });
  // Drain both pipes concurrently before awaiting exit, so a full pipe
  // buffer can't deadlock the child.
  const [stdout, stderr] = await Promise.all([new Response(p.stdout).text(), new Response(p.stderr).text()]);
  const code = await p.exited;
  return { stdout, stderr, code };
}
|
||||
|
||||
async function getHeadSha(): Promise<string> {
|
||||
const r = await sh("git rev-parse HEAD");
|
||||
return r.stdout.trim();
|
||||
}
|
||||
|
||||
async function pushEmptyCommit(n: number): Promise<string> {
|
||||
const msg = `test: nine-consecutive audit run ${n}/${RUNS} (compounding probe)`;
|
||||
await sh(`GIT_AUTHOR_NAME=profit GIT_AUTHOR_EMAIL=profit@lakehouse GIT_COMMITTER_NAME=profit GIT_COMMITTER_EMAIL=profit@lakehouse git commit --allow-empty -m "${msg}"`);
|
||||
const sha = await getHeadSha();
|
||||
const pushCmd = `PAT="dead60d1160a02f81d241197d5d18f4608794fb2"; git -c credential.helper='!f() { echo "username=profit"; echo "password='$PAT'"; }; f' push origin HEAD 2>&1`;
|
||||
const pr = await sh(pushCmd);
|
||||
if (pr.code !== 0) throw new Error(`push failed: ${pr.stderr || pr.stdout}`);
|
||||
return sha;
|
||||
}
|
||||
|
||||
// Poll for the auditor's verdict file for `sha` (the filename uses the
// 12-char short form). Resolves with the parsed verdict; throws once
// deadlineMs elapses without a readable file.
async function waitForVerdict(sha: string, deadlineMs: number): Promise<any> {
  const short = sha.slice(0, 12);
  const path = `${VERDICTS_DIR}/${TARGET_PR}-${short}.json`;
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    try {
      const raw = await readFile(path, "utf8");
      return JSON.parse(raw);
    } catch { /* not yet */ }
    // Missing file AND half-written JSON both land in the catch above,
    // so a partially flushed verdict is simply retried next tick.
    await new Promise(r => setTimeout(r, POLL_INTERVAL_MS));
  }
  throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
}
|
||||
|
||||
// Snapshot the aggregated state of audit_lessons.jsonl via the kb_index
// aggregator: distinct-signature count, the hottest signature's
// recurrence count, the max confidence across all signatures, and the
// top-3 by count (for drift diagnosis in the final report).
async function captureAggState(): Promise<{ sig_count: number; max_count: number; max_confidence: number; top3: Array<{ sig: string; count: number; conf: number; summary: string }> }> {
  const agg = await aggregate<any>(AUDIT_LESSONS, {
    keyFn: (r) => r?.signature,
    // Scope rows by PR so repeats within one PR don't look like
    // cross-PR recurrence (see kb_index confidence policy).
    scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });
  const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
  return {
    sig_count: list.length,
    max_count: list[0]?.count ?? 0,
    max_confidence: list.reduce((m, a) => Math.max(m, a.confidence), 0),
    top3: list.slice(0, 3).map(a => ({
      sig: a.signature,
      count: a.count,
      conf: a.confidence,
      summary: a.representative_summary.slice(0, 80),
    })),
  };
}
|
||||
|
||||
// One row per audit run — serialized into the final JSON report.
interface RunRecord {
  run: number;                     // 1-based run index
  sha: string;                     // 12-char short SHA of the pushed commit
  verdict_overall: string;
  findings_total: number;
  findings_block: number;
  findings_warn: number;
  findings_info: number;
  audit_duration_ms: number;
  claims_total: number;
  claims_empirical: number;
  kb_sig_count_after: number;      // distinct KB signatures after this run
  kb_max_count_after: number;      // hottest signature's recurrence count
  kb_max_confidence_after: number; // max confidence (test expects it to stay < 0.3)
}
|
||||
|
||||
// Push RUNS empty commits, wait for each audit verdict, and snapshot the
// lessons KB after every run. The final section checks the two
// compounding invariants: verdict stability and confidence staying
// below the 0.3 escalation threshold on same-PR noise.
async function main() {
  console.log(`[nine] target PR: #${TARGET_PR}`);
  console.log(`[nine] runs: ${RUNS}`);
  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
  console.log("");

  // KB state before any pushes — the delta against this is the signal.
  const baseline = await captureAggState();
  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
  console.log("");

  const records: RunRecord[] = [];
  for (let n = 1; n <= RUNS; n++) {
    const t0 = Date.now();
    console.log(`─── run ${n}/${RUNS} ───`);
    const sha = await pushEmptyCommit(n);
    console.log(`  pushed ${sha.slice(0, 12)}`);
    const verdict = await waitForVerdict(sha, AUDIT_TIMEOUT_MS);
    const after = await captureAggState();
    const rec: RunRecord = {
      run: n,
      sha: sha.slice(0, 12),
      verdict_overall: String(verdict.overall),
      findings_total: Number(verdict.metrics?.findings_total ?? 0),
      findings_block: Number(verdict.metrics?.findings_block ?? 0),
      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
      findings_info: Number(verdict.metrics?.findings_info ?? 0),
      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
      claims_total: Number(verdict.metrics?.claims_total ?? 0),
      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
      kb_sig_count_after: after.sig_count,
      kb_max_count_after: after.max_count,
      kb_max_confidence_after: after.max_confidence,
    };
    records.push(rec);
    console.log(`  verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
    console.log(`  kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
    console.log(`  elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
    console.log("");
  }

  console.log("═══ FINAL ═══");
  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
  for (const r of records) {
    console.log(
      ` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
    );
  }

  console.log("");
  console.log("═══ COMPOUNDING PROPERTY ═══");
  const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
  const maxCount = records[records.length - 1].kb_max_count_after;
  const maxConf = records[records.length - 1].kb_max_confidence_after;
  console.log(`  signatures added over ${RUNS} runs: ${sigDelta}`);
  console.log(`  max count after run ${RUNS}: ${maxCount} (same-PR recurrences per signature)`);
  console.log(`  max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);

  // Invariant 1: the verdict must not oscillate across identical runs.
  const verdictSet = new Set(records.map(r => r.verdict_overall));
  if (verdictSet.size === 1) {
    console.log(`  verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
  } else {
    console.log(`  verdict oscillated across runs: ${[...verdictSet].join(" | ")} ✗`);
  }

  // Invariant 2: same-PR repetition must not push confidence past 0.3.
  if (maxConf < 0.3) {
    console.log(`  confidence policy holding: same-PR noise stays below escalation threshold ✓`);
  } else {
    console.log(`  ⚠ confidence escalated above 0.3 on same-PR noise — kb_index policy needs tightening`);
  }

  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
  console.log("");
  console.log(`  report: ${jsonOut}`);
}
|
||||
|
||||
// Top-level guard. NOTE(review): exits 1 on a crash, while the sibling
// tests reserve 1 for "ran but failed" and 2 for crashes — confirm
// before relying on exit codes across runners.
main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });
|
||||
443
tests/real-world/scrum_master_pipeline.ts
Normal file
443
tests/real-world/scrum_master_pipeline.ts
Normal file
@ -0,0 +1,443 @@
|
||||
// Scrum-master orchestrator — pulls git repo source + PRD + a change
|
||||
// proposal, chunks everything, hands each code piece to the proven
|
||||
// escalation ladder (small-local → big-local → cloud → specialist →
|
||||
// biggest) with learning context between attempts. Collects per-file
|
||||
// suggestions in a coherent handoff report.
|
||||
//
|
||||
// What it composes (everything below is already shipped + proven):
|
||||
// - Chunker + embeddings (sidecar /embed, nomic-embed-text)
|
||||
// - In-memory cosine retrieval (top-K PRD + plan chunks per file)
|
||||
// - Escalation ladder (6 tiers, cycling on empty/error/thin-answer)
|
||||
// - Per-attempt learning-context injection (prior failures → prompt)
|
||||
// - Tree-split fallback when combined context exceeds budget
|
||||
// - JSONL output per file + summary
|
||||
//
|
||||
// Deliberate scope limit: TARGET_FILES is 3 files by default. The
|
||||
// pipeline works at larger N, but at ~90s/file × 3 files = 4-5 min,
|
||||
// 15 files = 22 min. Bump via env LH_SCRUM_FILES="path1,path2,...".
|
||||
//
|
||||
// Run: bun run tests/real-world/scrum_master_pipeline.ts
|
||||
|
||||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
|
||||
const GATEWAY = "http://localhost:3100";
|
||||
const SIDECAR = "http://localhost:3200";
|
||||
const CHUNK_SIZE = 800;
|
||||
const CHUNK_OVERLAP = 120;
|
||||
const TOP_K_CONTEXT = 5;
|
||||
const MAX_ATTEMPTS = 6;
|
||||
// Files larger than this get tree-split instead of truncated. Fixes the
|
||||
// 6KB false-positive class (model claiming a field is "missing" when
|
||||
// it exists past the context cutoff).
|
||||
const FILE_TREE_SPLIT_THRESHOLD = 6000;
|
||||
const FILE_SHARD_SIZE = 3500;
|
||||
// Appended jsonl so auditor's kb_query can surface scrum findings for
|
||||
// files touched by a PR under review. Part of cohesion plan Phase C.
|
||||
const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
|
||||
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/scrum_${Date.now().toString(36)}`;
|
||||
|
||||
const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
|
||||
// Using CONTROL_PLANE_PRD as the "suggested changes" doc since it
|
||||
// describes the Phase 38-44 target architecture and is on main.
|
||||
// COHESION_INTEGRATION_PLAN.md is still on PR #7 branch.
|
||||
const PROPOSAL_PATH = "/home/profit/lakehouse/docs/CONTROL_PLANE_PRD.md";
|
||||
|
||||
// Scoped target: 3 representative source files by default.
|
||||
// The scrum-master walks these in order and produces one suggestion
|
||||
// set per file. Override via env for a wider sweep.
|
||||
const DEFAULT_TARGETS = [
|
||||
"/home/profit/lakehouse/crates/vectord/src/playbook_memory.rs",
|
||||
"/home/profit/lakehouse/crates/vectord/src/doc_drift.rs",
|
||||
"/home/profit/lakehouse/auditor/audit.ts",
|
||||
];
|
||||
const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
|
||||
? process.env.LH_SCRUM_FILES.split(",").map(s => s.trim())
|
||||
: DEFAULT_TARGETS;
|
||||
|
||||
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
|
||||
{ provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
|
||||
{ provider: "ollama", model: "qwen3:latest", note: "local 7B (peer)" },
|
||||
{ provider: "ollama", model: "gpt-oss:20b", note: "local 20B" },
|
||||
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
|
||||
{ provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B coding specialist" },
|
||||
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B last-ditch" },
|
||||
];
|
||||
|
||||
// One embedded slice of context (PRD or proposal text); `origin` names
// the source document and `offset` its char position there.
type Chunk = { id: string; text: string; embedding: number[]; origin: string; offset: number };

// Per-file review result — one per target file.
interface FileReview {
  file: string;
  file_bytes: number;
  tree_split_fired: boolean;   // file exceeded FILE_TREE_SPLIT_THRESHOLD
  shards_summarized: number;
  top_prd_chunks: Array<{ origin: string; offset: number; score: number }>;
  top_proposal_chunks: Array<{ origin: string; offset: number; score: number }>;
  attempts_made: number;
  attempts_history: Array<{ n: number; model: string; status: "accepted" | "thin" | "error"; chars: number; error?: string }>;
  accepted_on: number | null;  // presumably the accepted attempt number — confirm in reviewFile
  escalated_to_model: string;
  suggestions: string;         // presumably the accepted review text — confirm in reviewFile
  duration_ms: number;
}
|
||||
|
||||
// Prefixed console logger for this pipeline's output.
function log(msg: string) { console.log(`[scrum] ${msg}`); }
|
||||
|
||||
function cosine(a: number[], b: number[]): number {
|
||||
let dot = 0, na = 0, nb = 0;
|
||||
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
|
||||
return na && nb ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
|
||||
}
|
||||
|
||||
function chunkText(text: string): Array<{ text: string; offset: number }> {
|
||||
const out: Array<{ text: string; offset: number }> = [];
|
||||
for (let i = 0; i < text.length; ) {
|
||||
const end = Math.min(i + CHUNK_SIZE, text.length);
|
||||
const slice = text.slice(i, end).trim();
|
||||
if (slice.length > 60) out.push({ text: slice, offset: i });
|
||||
if (end >= text.length) break;
|
||||
i = end - CHUNK_OVERLAP;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Embed a batch of texts via the sidecar's /embed endpoint. Unlike
// chat(), this THROWS on a non-2xx response — retrieval without
// embeddings is not worth continuing.
async function embedBatch(texts: string[]): Promise<number[][]> {
  const r = await fetch(`${SIDECAR}/embed`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ texts }),
    signal: AbortSignal.timeout(120000), // 2 min cap for the whole batch
  });
  if (!r.ok) throw new Error(`embed ${r.status}`);
  return (await r.json() as any).embeddings;
}
|
||||
|
||||
// One model call through the gateway. Mirrors the hard-task runner's
// chat() but also reports token usage; all failures come back as a
// value ({ content: "", error }), never a throw.
async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  prompt: string,
  max_tokens?: number,
}): Promise<{ content: string; error?: string; prompt_tokens: number; completion_tokens: number }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: opts.max_tokens ?? 1500,
        temperature: 0.2,
        think: false,
      }),
      signal: AbortSignal.timeout(180000), // 3 min cap per call
    });
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 200)}`, prompt_tokens: 0, completion_tokens: 0 };
    const j: any = await r.json();
    return {
      content: j.choices?.[0]?.message?.content ?? "",
      prompt_tokens: j.usage?.prompt_tokens ?? 0,
      completion_tokens: j.usage?.completion_tokens ?? 0,
    };
  } catch (e) {
    // Timeouts (AbortSignal) and network errors land here.
    return { content: "", error: (e as Error).message, prompt_tokens: 0, completion_tokens: 0 };
  }
}
|
||||
|
||||
// Accept a file-review answer if it's substantive + structured.
|
||||
// We're not validating Rust here — we're validating that the model
|
||||
// produced a coherent suggestion set.
|
||||
function isAcceptable(answer: string): boolean {
|
||||
if (answer.length < 200) return false; // too thin
|
||||
// Must at least try a structured form — numbered list, bullets,
|
||||
// or sections. Models that just hand-wave fail.
|
||||
const hasStructure = /^\s*[-*]\s/m.test(answer)
|
||||
|| /^\s*\d+\.\s/m.test(answer)
|
||||
|| /^\s*#/m.test(answer);
|
||||
return hasStructure;
|
||||
}
|
||||
|
||||
function retrieveTopK(query_emb: number[], pool: Chunk[], k: number): Chunk[] {
|
||||
return pool
|
||||
.map(c => ({ c, score: cosine(query_emb, c.embedding) }))
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, k)
|
||||
.map(x => ({ ...x.c, _score: x.score } as any));
|
||||
}
|
||||
|
||||
// Tree-split a large file: shard it, summarize each shard against
// the review question, merge into a scratchpad. Uses cloud because
// the summarization step needs quality > speed. Returns the
// scratchpad (full-file distillation) and the cloud-call count.
// NOTE: shards are summarized sequentially on purpose — each shard's
// prompt includes the scratchpad built from the prior shards.
async function treeSplitFile(
  filePath: string,
  content: string,
): Promise<{ scratchpad: string; shards: number; cloud_calls: number }> {
  const shards: Array<{ from: number; to: number; text: string }> = [];
  for (let i = 0; i < content.length; i += FILE_SHARD_SIZE) {
    const end = Math.min(i + FILE_SHARD_SIZE, content.length);
    shards.push({ from: i, to: end, text: content.slice(i, end) });
  }
  let scratchpad = "";
  let cloud_calls = 0;
  log(`  tree-split: ${content.length} chars → ${shards.length} shards of ${FILE_SHARD_SIZE}`);
  for (const [si, shard] of shards.entries()) {
    const prompt = `You are summarizing ONE SHARD of a source file as part of a multi-shard review. File: ${filePath}. Shard ${si + 1}/${shards.length} (bytes ${shard.from}..${shard.to}).

─────── shard source ───────
${shard.text}
─────── end shard ───────

Scratchpad of prior shards (if empty, this is shard 1):
${scratchpad || "(empty)"}

Extract ONLY facts useful for reviewing this file against its PRD: function names + purposes, struct fields + types, invariants, edge cases, TODO markers, error-handling style. Under 150 words. No prose outside the extracted facts.`;
    const r = await chat({
      provider: "ollama_cloud",
      model: "gpt-oss:120b",
      prompt,
      max_tokens: 400,
    });
    cloud_calls += 1;
    // A failed shard call is tolerated: chat() returns content "" on
    // error, so the shard simply contributes nothing to the scratchpad.
    if (r.content) {
      scratchpad += `\n--- shard ${si + 1} (bytes ${shard.from}..${shard.to}) ---\n${r.content.trim()}`;
    }
  }
  return { scratchpad, shards: shards.length, cloud_calls };
}
|
||||
|
||||
async function reviewFile(
|
||||
filePath: string,
|
||||
prd_chunks: Chunk[],
|
||||
proposal_chunks: Chunk[],
|
||||
): Promise<FileReview> {
|
||||
const t0 = Date.now();
|
||||
log(`file: ${filePath}`);
|
||||
const content = await readFile(filePath, "utf8");
|
||||
const rel = filePath.replace("/home/profit/lakehouse/", "");
|
||||
|
||||
// Build a query embedding from the first ~800 chars of the file
|
||||
// (good enough for topical retrieval).
|
||||
const seed = content.slice(0, 800);
|
||||
const [seedEmb] = await embedBatch([seed]);
|
||||
|
||||
const topPrd = retrieveTopK(seedEmb, prd_chunks, TOP_K_CONTEXT);
|
||||
const topPlan = retrieveTopK(seedEmb, proposal_chunks, TOP_K_CONTEXT);
|
||||
log(` retrieved ${topPrd.length} PRD chunks + ${topPlan.length} proposal chunks`);
|
||||
|
||||
const contextBlock = [
|
||||
"═══ RELEVANT PRD EXCERPTS ═══",
|
||||
...topPrd.map(c => `[PRD @${c.offset}]\n${c.text.slice(0, 600)}`),
|
||||
"",
|
||||
"═══ RELEVANT CHANGE PROPOSAL EXCERPTS ═══",
|
||||
...topPlan.map(c => `[PLAN @${c.offset}]\n${c.text.slice(0, 600)}`),
|
||||
].join("\n\n");
|
||||
|
||||
// Files bigger than FILE_TREE_SPLIT_THRESHOLD get tree-split.
|
||||
// Summarize each shard to a scratchpad, then review against the
|
||||
// scratchpad instead of the truncated first chunk. Prevents the
|
||||
// false-positive pattern where the model claims a field is
|
||||
// "missing" because it's past the context cutoff.
|
||||
let sourceForPrompt: string;
|
||||
let treeSplitFired = false;
|
||||
let shardsSummarized = 0;
|
||||
let extraCloudCalls = 0;
|
||||
if (content.length > FILE_TREE_SPLIT_THRESHOLD) {
|
||||
treeSplitFired = true;
|
||||
const ts = await treeSplitFile(rel, content);
|
||||
shardsSummarized = ts.shards;
|
||||
extraCloudCalls = ts.cloud_calls;
|
||||
sourceForPrompt = `[FULL-FILE SCRATCHPAD — distilled from ${ts.shards} shards via tree-split]\n${ts.scratchpad}`;
|
||||
} else {
|
||||
sourceForPrompt = content;
|
||||
}
|
||||
|
||||
// Prompt — when tree-split fired, include an explicit instruction
|
||||
// not to claim a field/function is "missing" because the scratchpad
|
||||
// is a distillation not the full file. Attacks the rubric-tuning
|
||||
// concern J called out.
|
||||
const truncationWarning = treeSplitFired
|
||||
? `\nIMPORTANT: the "source" below is a multi-shard distillation (tree-split across ${shardsSummarized} shards), NOT the full raw file. DO NOT claim any field, function, or feature is "missing" based on its absence from this distillation — the distillation may have elided it. Only call out gaps that appear DIRECTLY contradicted by the PRD excerpts.\n`
|
||||
: "";
|
||||
|
||||
const baseTask = `You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan.
|
||||
|
||||
FILE: ${rel} (${content.length} bytes${treeSplitFired ? `, tree-split into ${shardsSummarized} shards` : ""})
|
||||
${truncationWarning}
|
||||
─────── source ───────
|
||||
${sourceForPrompt}
|
||||
─────── end source ───────
|
||||
|
||||
${contextBlock}
|
||||
|
||||
Produce a structured review with:
|
||||
1. Alignment score (1-10) between this file and the PRD intent
|
||||
2. 3-5 concrete suggested changes (bullet points), each naming a specific function/line and what to change
|
||||
3. Any gap where this file's behavior contradicts the PRD or the proposal
|
||||
|
||||
Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-offset when relevant.`;
|
||||
|
||||
const history: FileReview["attempts_history"] = [];
|
||||
let accepted: string | null = null;
|
||||
let acceptedModel = "";
|
||||
let acceptedOn = 0;
|
||||
|
||||
for (let i = 0; i < MAX_ATTEMPTS; i++) {
|
||||
const n = i + 1;
|
||||
const rung = LADDER[i];
|
||||
const learning = history.length > 0
|
||||
? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
|
||||
: "";
|
||||
|
||||
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}`);
|
||||
const r = await chat({
|
||||
provider: rung.provider,
|
||||
model: rung.model,
|
||||
prompt: baseTask + learning,
|
||||
max_tokens: 1500,
|
||||
});
|
||||
|
||||
if (r.error) {
|
||||
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
|
||||
log(` ✗ error: ${r.error.slice(0, 80)}`);
|
||||
continue;
|
||||
}
|
||||
if (!isAcceptable(r.content)) {
|
||||
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
|
||||
log(` ✗ thin/unstructured (${r.content.length} chars)`);
|
||||
continue;
|
||||
}
|
||||
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
|
||||
accepted = r.content;
|
||||
acceptedModel = `${rung.provider}/${rung.model}`;
|
||||
acceptedOn = n;
|
||||
log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
|
||||
break;
|
||||
}
|
||||
|
||||
const review: FileReview = {
|
||||
file: rel,
|
||||
file_bytes: content.length,
|
||||
tree_split_fired: treeSplitFired,
|
||||
shards_summarized: shardsSummarized,
|
||||
top_prd_chunks: topPrd.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
|
||||
top_proposal_chunks: topPlan.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
|
||||
attempts_made: history.length,
|
||||
attempts_history: history,
|
||||
accepted_on: acceptedOn || null,
|
||||
escalated_to_model: acceptedModel,
|
||||
suggestions: accepted ?? "[no acceptable answer after escalation ladder exhausted]",
|
||||
duration_ms: Date.now() - t0,
|
||||
};
|
||||
|
||||
// Append to the shared scrum-reviews jsonl so the auditor's
|
||||
// kb_query check can surface relevant reviews for files in a
|
||||
// PR diff. Cohesion plan Phase C wire.
|
||||
if (accepted) {
|
||||
const { appendFile, mkdir } = await import("node:fs/promises");
|
||||
const { dirname } = await import("node:path");
|
||||
await mkdir(dirname(SCRUM_REVIEWS_JSONL), { recursive: true });
|
||||
const row = {
|
||||
file: rel,
|
||||
reviewed_at: new Date().toISOString(),
|
||||
accepted_model: acceptedModel,
|
||||
accepted_on_attempt: acceptedOn,
|
||||
attempts_made: history.length,
|
||||
tree_split_fired: treeSplitFired,
|
||||
suggestions_preview: accepted.slice(0, 2000),
|
||||
};
|
||||
try {
|
||||
await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
|
||||
} catch (e) {
|
||||
console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
return review;
|
||||
}
|
||||
|
||||
async function loadAndChunk(path: string, origin_tag: string): Promise<Chunk[]> {
|
||||
const text = await readFile(path, "utf8");
|
||||
const raw = chunkText(text);
|
||||
const embs = await embedBatch(raw.map(r => r.text));
|
||||
return raw.map((r, i) => ({
|
||||
id: createHash("sha256").update(r.text).digest("hex").slice(0, 10),
|
||||
text: r.text,
|
||||
embedding: embs[i],
|
||||
origin: origin_tag,
|
||||
offset: r.offset,
|
||||
}));
|
||||
}
|
||||
|
||||
async function main() {
|
||||
await mkdir(OUT_DIR, { recursive: true });
|
||||
log(`output: ${OUT_DIR}`);
|
||||
log(`targets: ${TARGET_FILES.length} files`);
|
||||
|
||||
log("loading + embedding PRD...");
|
||||
const prd_chunks = await loadAndChunk(PRD_PATH, "PRD");
|
||||
log(` PRD: ${prd_chunks.length} chunks`);
|
||||
|
||||
log("loading + embedding cohesion plan...");
|
||||
const plan_chunks = await loadAndChunk(PROPOSAL_PATH, "COHESION_PLAN");
|
||||
log(` plan: ${plan_chunks.length} chunks`);
|
||||
|
||||
log("");
|
||||
log("─── scrum master: walking target files ───");
|
||||
|
||||
const reviews: FileReview[] = [];
|
||||
for (const f of TARGET_FILES) {
|
||||
const review = await reviewFile(f, prd_chunks, plan_chunks);
|
||||
reviews.push(review);
|
||||
await writeFile(
|
||||
`${OUT_DIR}/review_${review.file.replace(/\//g, "_")}.json`,
|
||||
JSON.stringify(review, null, 2),
|
||||
);
|
||||
log(` → ${review.file}: ${review.accepted_on ? `accepted on ${review.accepted_on} by ${review.escalated_to_model}` : "UNRESOLVED"} (${review.duration_ms}ms)`);
|
||||
}
|
||||
|
||||
// Consolidated scrum-master report
|
||||
const report_md: string[] = [];
|
||||
report_md.push(`# Scrum-master review\n`);
|
||||
report_md.push(`Generated: ${new Date().toISOString()}`);
|
||||
report_md.push(`Files reviewed: ${reviews.length}`);
|
||||
report_md.push(`Total duration: ${(reviews.reduce((s, r) => s + r.duration_ms, 0) / 1000).toFixed(1)}s\n`);
|
||||
for (const r of reviews) {
|
||||
report_md.push(`\n## ${r.file}`);
|
||||
report_md.push(`- **Accepted on attempt:** ${r.accepted_on ?? "NOT resolved after 6 attempts"}`);
|
||||
report_md.push(`- **Escalated to:** \`${r.escalated_to_model || "—"}\``);
|
||||
report_md.push(`- **Total attempts:** ${r.attempts_made}`);
|
||||
if (r.attempts_history.length > 1) {
|
||||
report_md.push(`- **Attempt history:**`);
|
||||
for (const h of r.attempts_history) {
|
||||
report_md.push(` - ${h.n}: \`${h.model}\` → ${h.status}${h.error ? ` (${h.error.slice(0, 100)})` : ""}`);
|
||||
}
|
||||
}
|
||||
report_md.push(`\n### Suggestions\n\n${r.suggestions}\n`);
|
||||
}
|
||||
await writeFile(`${OUT_DIR}/scrum_report.md`, report_md.join("\n"));
|
||||
|
||||
const summary = {
|
||||
ran_at: new Date().toISOString(),
|
||||
target_count: TARGET_FILES.length,
|
||||
resolved: reviews.filter(r => r.accepted_on !== null).length,
|
||||
total_attempts: reviews.reduce((s, r) => s + r.attempts_made, 0),
|
||||
total_duration_ms: reviews.reduce((s, r) => s + r.duration_ms, 0),
|
||||
per_file: reviews.map(r => ({ file: r.file, accepted_on: r.accepted_on, model: r.escalated_to_model, attempts: r.attempts_made, ms: r.duration_ms })),
|
||||
};
|
||||
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
|
||||
|
||||
log("");
|
||||
log("═══ SCRUM REPORT ═══");
|
||||
log(` files: ${summary.target_count}, resolved: ${summary.resolved}, total attempts: ${summary.total_attempts}`);
|
||||
log(` total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
|
||||
log("");
|
||||
for (const p of summary.per_file) {
|
||||
const mark = p.accepted_on ? "✓" : "✗";
|
||||
log(` ${mark} ${p.file.padEnd(60)} attempt ${p.accepted_on ?? "—"}/${p.attempts} ${p.model} ${p.ms}ms`);
|
||||
}
|
||||
log("");
|
||||
log(`report: ${OUT_DIR}/scrum_report.md`);
|
||||
|
||||
process.exit(summary.resolved === summary.target_count ? 0 : 1);
|
||||
}
|
||||
|
||||
// Entry point: surface any uncaught failure and exit non-zero.
main().catch(e => {
  console.error("[scrum] fatal:", e);
  process.exit(2);
});
|
||||
Loading…
x
Reference in New Issue
Block a user