lakehouse/scripts/build_symbols_corpus.ts
root 2dbc8dbc83
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
v1/mode: model-aware enrichment downgrade + 3 corpora + variance harness
Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing
matrix corpora is anti-additive on strong models — composed lakehouse_arch
+ symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded
findings, p=0.031). Default flips to isolation; matrix path now auto-
downgrades when the resolved model is strong.

Mode runner:
- matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec)
- top_k=6 from each corpus, merge by score, take top 8 globally
- chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch]
- is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation
  for strong models (default-strong; weak = :free suffix or local last-resort)
- LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs
- EnrichmentSources.downgraded_from records when the gate fires

Three corpora indexed via /vectors/index (5849 chunks total):
- lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks)
- scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED
  from defaults — 24% out-of-bounds line citations from cross-file drift)
- lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks)

Experiment infra:
- scripts/build_*_corpus.ts — re-runnable when source content changes
- scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file
- scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles
  numbered + path-with-line + path-with-symbol finding tables
- scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora
- scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast,
  --corpus flag for per-call override

Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 17:29:17 -05:00

142 lines
5.1 KiB
TypeScript

#!/usr/bin/env bun
/**
* Build the `lakehouse_symbols_v1` corpus — Option C from 2026-04-26
* pass. Extracts public Rust items with their /// doc comments from
* crates/**\/*.rs. Regex-based — covers ~80% of definitions without
* pulling in a syn-based parser.
*
* doc_id: `symbol:<crate>::<kind>::<name>` e.g. symbol:vectord::struct::PathwayTrace
*
* Each chunk includes: doc comment (if any) + signature + 1-2 lines
* after the brace so reviewer sees field types / variants for structs
* and enums.
*/
import { readFileSync, readdirSync, statSync } from "node:fs";
import { resolve, relative } from "node:path";
// Repo root: this script lives one level below it (Bun-specific import.meta.dir).
const ROOT = resolve(import.meta.dir, "..");
// Vector-gateway base URL and target index name, both overridable via env.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_symbols_v1";
// `source` label sent with the index request so chunks can be attributed
// to this corpus downstream.
const SOURCE_LABEL = "lakehouse_symbols";
// Server-side chunking parameters forwarded to /vectors/index.
// NOTE(review): units (chars vs tokens) are decided by the gateway —
// confirm against the /vectors/index contract.
const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 800);
const OVERLAP = Number(process.env.LH_OVERLAP ?? 80);
// One indexable document: stable doc_id + text payload.
interface Doc { id: string; text: string }
/**
 * Recursively collect every `.rs` file under `dir`, skipping Cargo build
 * output (`target/`) and hidden entries (names starting with `.`).
 * Returns absolute paths in directory-listing order.
 */
function walkRs(dir: string): string[] {
  const visible = readdirSync(dir).filter(
    (name) => name !== "target" && !name.startsWith("."),
  );
  return visible.flatMap((name) => {
    const full = resolve(dir, name);
    if (statSync(full).isDirectory()) return walkRs(full);
    return name.endsWith(".rs") ? [full] : [];
  });
}
/**
 * Crate name for a `.rs` path under <ROOT>/crates — the first path
 * component of the path made relative to the crates/ directory.
 */
function crateOf(rsPath: string): string {
  const rel = relative(resolve(ROOT, "crates"), rsPath);
  // Split on either separator: path.relative() yields backslash-separated
  // segments on Windows, where the original split("/") returned the whole
  // relative path instead of the crate name.
  return rel.split(/[\\/]/)[0];
}
// Match pub fn|struct|enum|trait declarations. The (optional) contiguous
// preceding /// doc block is consumed by the match so adjacent items never
// overlap; extractItems() re-locates the `pub` line inside each match to
// split docs from the signature. Test-only items are excluded by the
// cutoff guard in extractItems().
const ITEM_RE = /(?:^[ \t]*\/\/\/.*\n)*[ \t]*pub(?:\([^)]+\))?[ \t]+(fn|struct|enum|trait|async[ \t]+fn)[ \t]+([A-Za-z_][A-Za-z0-9_]*)/gm;
/**
 * Extract public Rust items from one source file as indexable Docs.
 *
 * Each Doc's text is: a header line, the item's /// doc comment (if any)
 * as prose, then a ```rust fence with the signature + a short body preview.
 *
 * @param src     full text of the .rs file
 * @param crate   crate name, used in the `symbol:<crate>::<kind>::<name>` id
 * @param relPath repo-relative path, shown in the chunk header
 */
function extractItems(src: string, crate: string, relPath: string): Doc[] {
  const docs: Doc[] = [];
  const seen = new Set<string>();
  // Quick test-module guard: drop everything from a `mod tests {` or
  // `#[cfg(test)]` line onward. Coarse but adequate — public items inside
  // tests are rare. `>= 0` so a marker at byte 0 also truncates (the old
  // `> 0` silently kept a file that *started* with #[cfg(test)]).
  const cutoff = src.search(/^(#\[cfg\(test\)\]|mod tests\b)/m);
  const usable = cutoff >= 0 ? src.slice(0, cutoff) : src;
  for (const m of usable.matchAll(ITEM_RE)) {
    const kind = m[1].replace(/^async[ \t]+/, "async_");
    const name = m[2];
    const id = `symbol:${crate}::${kind}::${name}`;
    // Dedupe before any slicing work — first occurrence wins.
    if (seen.has(id)) continue;
    seen.add(id);
    // m.index points at the start of the WHOLE match, which includes the
    // consumed /// doc block. Anchor on the `pub` line inside the match so
    // the doc walk below can see the docs and the body preview starts at
    // the signature (previously docLines was always empty and the docs
    // were duplicated inside the rust fence).
    const sigOffset = Math.max(m[0].search(/^[ \t]*pub/m), 0);
    const itemStart = m.index! + sigOffset;
    // Walk backward from the signature to capture the contiguous /// doc
    // block above it, tolerating blank lines and #[...] attribute lines.
    const lines = usable.slice(0, itemStart).split("\n");
    const docLines: string[] = [];
    for (let i = lines.length - 1; i >= 0; i--) {
      const t = lines[i].trim();
      if (t.startsWith("///")) docLines.unshift(t.replace(/^\/\/\/\s?/, ""));
      else if (t === "" || t.startsWith("#[")) continue;
      else break;
    }
    // Signature + body preview: stop at a column-0 closing brace or a blank
    // line followed by a new top-level item; cap at 800 chars either way.
    const after = usable.slice(itemStart, itemStart + 800);
    const bodyEnd = after.search(/\n\}\n|\n\n[a-z#]/);
    const body = bodyEnd > 0 ? after.slice(0, Math.min(bodyEnd, 800)) : after.slice(0, 800);
    const header = `${crate}::${name} (${kind}) — ${relPath}`;
    const docText = docLines.length > 0 ? `\n${docLines.join("\n")}\n` : "\n";
    docs.push({ id, text: `${header}\n${docText}\n\`\`\`rust\n${body}\n\`\`\`` });
  }
  return docs;
}
function buildDocs(): Doc[] {
const cratesDir = resolve(ROOT, "crates");
const docs: Doc[] = [];
for (const f of walkRs(cratesDir)) {
const src = readFileSync(f, "utf8");
const crate = crateOf(f);
const rel = relative(ROOT, f);
docs.push(...extractItems(src, crate, rel));
}
return docs;
}
/**
 * CLI entry point: build the symbol corpus, print stats, and (unless
 * suppressed) POST it to the gateway for indexing.
 *
 * Flags:
 *   --print    log the first 3 docs (truncated) and exit; implies --dry-run
 *   --dry-run  build + report stats only, skip the indexing POST
 *
 * Exits non-zero on an HTTP error so callers/CI can detect failure.
 */
async function main(): Promise<void> {
  const printOnly = process.argv.includes("--print");
  const dryRun = process.argv.includes("--dry-run") || printOnly;
  const docs = buildDocs();
  const totalBytes = docs.reduce((s, d) => s + d.text.length, 0);
  // Per-crate symbol counts, recovered from the `symbol:<crate>::…` doc ids.
  const byCrate = new Map<string, number>();
  for (const d of docs) {
    const c = d.id.split("::")[0].replace("symbol:", "");
    byCrate.set(c, (byCrate.get(c) ?? 0) + 1);
  }
  console.log(`[corpus-C] ${docs.length} symbols · ${totalBytes} bytes · chunk_size=${CHUNK_SIZE}`);
  console.log(`[corpus-C] by crate: ${[...byCrate.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);
  if (printOnly) {
    docs.slice(0, 3).forEach(d => console.log(` ${d.id} (${d.text.length}b)\n ${d.text.slice(0, 200).replace(/\n/g, "\n ")}\n`));
    return;
  }
  if (dryRun) return;
  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME,
      source: SOURCE_LABEL,
      documents: docs,
      chunk_size: CHUNK_SIZE,
      overlap: OVERLAP,
    }),
    // Fail fast rather than hang if the gateway stalls on a large corpus.
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[corpus-C] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }
  // Narrow structural assertion instead of `any`, so typos on these fields
  // are caught at compile time. Fields are optional because the response is
  // not validated here — NOTE(review): confirm against the /vectors/index
  // response contract.
  const j = (await r.json()) as { job_id?: string; documents?: number; chunks?: number };
  console.log(`[corpus-C] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
}
main().catch(e => { console.error(e); process.exit(1); });