Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing matrix corpora is anti-additive on strong models — composed lakehouse_arch + symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded findings, p=0.031). Default flips to isolation; the matrix path now auto-downgrades when the resolved model is strong.

Mode runner:
- matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec)
- top_k=6 from each corpus, merge by score, take top 8 globally
- chunk tag prefers doc_id over source so the reviewer sees [adr:009] vs [lakehouse_arch]
- is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation for strong models (default-strong; weak = :free suffix or local last-resort)
- LH_FORCE_FULL_ENRICHMENT=1 bypasses the gate for diagnostic runs
- EnrichmentSources.downgraded_from records when the gate fires

Three corpora indexed via /vectors/index (5849 chunks total):
- lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks)
- scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED from defaults — 24% out-of-bounds line citations from cross-file drift)
- lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks)

Experiment infra:
- scripts/build_*_corpus.ts — re-runnable when source content changes
- scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file
- scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head; parser handles numbered, path-with-line, and path-with-symbol finding tables
- scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora
- scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast, --corpus flag for per-call override

Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
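The per-corpus merge described in the mode-runner bullets (top_k=6 from each corpus, pooled, global top 8 by score) is roughly the following. A minimal TypeScript sketch only; the real implementation lives in the Rust mode runner, and `Chunk` and its field names here are illustrative:

```ts
// Illustrative sketch of the matrix-corpus merge, not the mode runner itself.
interface Chunk { doc_id: string; score: number; text: string }

function mergeMatrixCorpora(
  perCorpus: Chunk[][],  // one retrieval result set per corpus
  perCorpusK = 6,        // top_k taken from each corpus
  globalK = 8,           // global cap after pooling
): Chunk[] {
  return perCorpus
    .flatMap(chunks => [...chunks].sort((a, b) => b.score - a.score).slice(0, perCorpusK))
    .sort((a, b) => b.score - a.score)
    .slice(0, globalK);
}
```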
142 lines · 5.1 KiB · TypeScript
#!/usr/bin/env bun
/**
 * Build the `lakehouse_symbols_v1` corpus — Option C from 2026-04-26
 * pass. Extracts public Rust items with their /// doc comments from
 * crates/**\/*.rs. Regex-based — covers ~80% of definitions without
 * pulling in a syn-based parser.
 *
 * doc_id: `symbol:<crate>::<kind>::<name>` e.g. symbol:vectord::struct::PathwayTrace
 *
 * Each chunk includes: doc comment (if any) + signature + 1-2 lines
 * after the brace so reviewer sees field types / variants for structs
 * and enums.
 */

import { readFileSync, readdirSync, statSync } from "node:fs";
import { resolve, relative } from "node:path";

const ROOT = resolve(import.meta.dir, "..");
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_symbols_v1";
const SOURCE_LABEL = "lakehouse_symbols";
const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 800);
const OVERLAP = Number(process.env.LH_OVERLAP ?? 80);
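// All defaults above can be overridden per run via env, e.g. (values illustrative):
//   LH_CHUNK_SIZE=512 LH_OVERLAP=64 LH_GATEWAY=http://host:3100 bun <this-script>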

interface Doc { id: string; text: string }

function walkRs(dir: string): string[] {
  const out: string[] = [];
  for (const entry of readdirSync(dir)) {
    if (entry === "target" || entry.startsWith(".")) continue;
    const full = resolve(dir, entry);
    const st = statSync(full);
    if (st.isDirectory()) out.push(...walkRs(full));
    else if (entry.endsWith(".rs")) out.push(full);
  }
  return out;
}

function crateOf(rsPath: string): string {
  const rel = relative(resolve(ROOT, "crates"), rsPath);
  return rel.split("/")[0];
}
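// e.g. crateOf("<ROOT>/crates/vectord/src/trace.rs") → "vectord"
// (path illustrative; the first segment relative to crates/ is the crate name)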

// Match pub fn|struct|enum|trait declarations at line start. The preceding
// contiguous /// doc block is recovered by the backward walk in extractItems
// (keeping it out of the match so m.index points at the signature), and a few
// lines after are kept for signature + body preview. Items inside `mod tests`
// blocks and #[cfg(test)] are dropped by the cutoff in extractItems.
const ITEM_RE = /^[ \t]*pub(?:\([^)]+\))?[ \t]+(fn|struct|enum|trait|async[ \t]+fn)[ \t]+([A-Za-z_][A-Za-z0-9_]*)/gm;
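// Examples (names illustrative):
//   `pub struct PathwayTrace {`           → kind "struct",  name "PathwayTrace"
//   `pub(crate) async fn rebuild_index(`  → kind "async fn" (stored as async_fn), name "rebuild_index"
// `pub const`, `pub type`, and `pub mod` items are not captured.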

function extractItems(src: string, crate: string, relPath: string): Doc[] {
  const docs: Doc[] = [];
  const seen = new Set<string>();

  // Quick test-module guard: drop everything from a `mod tests {` or
  // `#[cfg(test)]` line onward. Coarse but adequate — public items
  // inside tests are rare.
  const cutoff = src.search(/^(#\[cfg\(test\)\]|mod tests\b)/m);
  const usable = cutoff >= 0 ? src.slice(0, cutoff) : src;

  for (const m of usable.matchAll(ITEM_RE)) {
    const matchStart = m.index!;
    const kind = m[1].replace(/^async[ \t]+/, "async_");
    const name = m[2];

    const id = `symbol:${crate}::${kind}::${name}`;
    if (seen.has(id)) continue;
    seen.add(id);

    // Walk backward to capture the contiguous /// doc block above.
    const lines = usable.slice(0, matchStart).split("\n");
    const docLines: string[] = [];
    for (let i = lines.length - 1; i >= 0; i--) {
      const t = lines[i].trim();
      if (t.startsWith("///")) docLines.unshift(t.replace(/^\/\/\/\s?/, ""));
      else if (t === "" || t.startsWith("#[")) continue; // skip attributes between doc and item
      else break;
    }

    // Capture signature + ~6 lines of body preview.
    const after = usable.slice(matchStart, matchStart + 800);
    const bodyEnd = after.search(/\n\}\n|\n\n[a-z#]/);
    const body = bodyEnd > 0 ? after.slice(0, bodyEnd) : after;

    const header = `${crate}::${name} (${kind}) — ${relPath}`;
    const docText = docLines.length > 0 ? `\n${docLines.join("\n")}\n` : "\n";
    docs.push({ id, text: `${header}\n${docText}\n\`\`\`rust\n${body}\n\`\`\`` });
  }
  return docs;
}
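// A resulting Doc, roughly (values illustrative):
//   id:   symbol:vectord::struct::PathwayTrace
//   text: vectord::PathwayTrace (struct) — crates/vectord/src/trace.rs
//         <doc comment lines, if any>
//         ```rust
//         pub struct PathwayTrace { ... }
//         ```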

function buildDocs(): Doc[] {
  const cratesDir = resolve(ROOT, "crates");
  const docs: Doc[] = [];
  for (const f of walkRs(cratesDir)) {
    const src = readFileSync(f, "utf8");
    const crate = crateOf(f);
    const rel = relative(ROOT, f);
    docs.push(...extractItems(src, crate, rel));
  }
  return docs;
}

async function main() {
  const printOnly = process.argv.includes("--print");
  const dryRun = process.argv.includes("--dry-run") || printOnly;

  const docs = buildDocs();
  const totalBytes = docs.reduce((s, d) => s + d.text.length, 0);
  const byCrate = new Map<string, number>();
  for (const d of docs) {
    const c = d.id.split("::")[0].replace("symbol:", "");
    byCrate.set(c, (byCrate.get(c) ?? 0) + 1);
  }
  console.log(`[corpus-C] ${docs.length} symbols · ${totalBytes} bytes · chunk_size=${CHUNK_SIZE}`);
  console.log(`[corpus-C] by crate: ${[...byCrate.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);

  if (printOnly) {
    docs.slice(0, 3).forEach(d => console.log(` ${d.id} (${d.text.length}b)\n ${d.text.slice(0, 200).replace(/\n/g, "\n ")}\n`));
    return;
  }
  if (dryRun) return;

  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME,
      source: SOURCE_LABEL,
      documents: docs,
      chunk_size: CHUNK_SIZE,
      overlap: OVERLAP,
    }),
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[corpus-C] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }
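  // Gateway response fields used below (shape inferred from this call site):
  // { job_id, documents, chunks }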
  const j: any = await r.json();
  console.log(`[corpus-C] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
}

main().catch(e => { console.error(e); process.exit(1); });
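// Usage (script path illustrative; flags and env vars as defined above):
//   bun scripts/build_symbols_corpus.ts --print    # preview 3 sample docs, no network
//   bun scripts/build_symbols_corpus.ts --dry-run  # build + count only, skip indexing
//   LH_GATEWAY=http://host:3100 bun scripts/build_symbols_corpus.ts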