#!/usr/bin/env bun
/**
 * Build the `lakehouse_symbols_v1` corpus — Option C from 2026-04-26
 * pass. Extracts public Rust items with their /// doc comments from
 * crates/**\/*.rs. Regex-based — covers ~80% of definitions without
 * pulling in a syn-based parser.
 *
 * doc_id: `symbol:<crate>::<kind>::<name>` e.g. symbol:vectord::struct::PathwayTrace
 *
 * Each chunk includes: doc comment (if any) + signature + 1-2 lines
 * after the brace so reviewer sees field types / variants for structs
 * and enums.
 */

import { readFileSync, readdirSync, statSync } from "node:fs";
import { resolve, relative, sep } from "node:path";

/**
 * Read a non-negative integer setting from the environment.
 *
 * An unset or blank variable yields `fallback`. Anything else must parse to a
 * non-negative integer; otherwise this throws, so a typo does not silently
 * become `NaN` (which JSON-serialises to `null`) in the request body.
 *
 * @param name     environment variable to read
 * @param fallback value used when the variable is unset or blank
 * @throws Error when the variable is set but is not a non-negative integer
 */
function envInt(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") return fallback;
  const value = Number(raw);
  if (!Number.isInteger(value) || value < 0) {
    throw new Error(`${name} must be a non-negative integer, got ${JSON.stringify(raw)}`);
  }
  return value;
}

// Repository root: the parent of the directory containing this script.
const ROOT = resolve(import.meta.dir, "..");
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_symbols_v1";
const SOURCE_LABEL = "lakehouse_symbols";
const CHUNK_SIZE = envInt("LH_CHUNK_SIZE", 800);
const OVERLAP = envInt("LH_OVERLAP", 80);

/** One corpus document: a doc id plus the text to be indexed. */
interface Doc {
  id: string;
  text: string;
}

/**
 * Recursively collect the absolute paths of all `.rs` files under `dir`.
 *
 * Entries named `target` or starting with `.` are skipped at every level,
 * whether they are files or directories.
 *
 * @param dir directory to scan
 * @returns paths of the `.rs` files found, in directory-listing order
 */
function walkRs(dir: string): string[] {
  const found: string[] = [];
  for (const entry of readdirSync(dir)) {
    if (entry === "target" || entry.startsWith(".")) continue;
    const full = resolve(dir, entry);
    if (statSync(full).isDirectory()) {
      found.push(...walkRs(full));
    } else if (entry.endsWith(".rs")) {
      found.push(full);
    }
  }
  return found;
}

/**
 * Crate name for a Rust source file: the first segment of `rsPath` taken
 * relative to `<ROOT>/crates`.
 *
 * @param rsPath path of a file below `<ROOT>/crates`
 */
function crateOf(rsPath: string): string {
  const rel = relative(resolve(ROOT, "crates"), rsPath);
  // `relative` returns platform separators, so split on `sep` rather than "/".
  return rel.split(sep)[0] ?? "";
}

// Match pub fn|struct|enum|trait declarations. Capture the (optional)
// preceding contiguous /// doc block and a few lines after for signature
// + body preview. Skips items inside `mod tests` blocks and #[cfg(test)].
// Anchored at line start (`^` + `m` flag) so that `pub fn …` appearing in the
// middle of a line — e.g. in a `//` comment or in doc prose — is not taken
// for a declaration. Capture 1 is the item kind, capture 2 the identifier.
const ITEM_RE = /^[ \t]*pub(?:\([^)]+\))?[ \t]+(fn|struct|enum|trait|async[ \t]+fn)[ \t]+([A-Za-z_][A-Za-z0-9_]*)/gm;

/**
 * Collect the `///` doc block sitting directly above a declaration.
 *
 * Walks upward one line at a time from `declStart`, keeping `///` lines (with
 * the marker and one following whitespace character stripped), stepping over
 * blank lines and `#[...]` attribute lines, and stopping at the first line of
 * any other kind.
 *
 * @param src       text being scanned
 * @param declStart offset of the first character of the declaration's line
 * @returns doc lines in source order; empty when there is no doc block
 */
function docBlockAbove(src: string, declStart: number): string[] {
  const docLines: string[] = [];
  // `lineEnd` is the offset of the "\n" that terminates the line under inspection.
  let lineEnd = declStart - 1;
  while (lineEnd >= 0) {
    const lineStart = lineEnd > 0 ? src.lastIndexOf("\n", lineEnd - 1) + 1 : 0;
    const line = src.slice(lineStart, lineEnd).trim();
    if (line.startsWith("///")) {
      docLines.unshift(line.replace(/^\/\/\/\s?/, ""));
    } else if (line !== "" && !line.startsWith("#[")) {
      break;
    }
    lineEnd = lineStart - 1;
  }
  return docLines;
}

/**
 * Extract one document per public item declared in a Rust source file.
 *
 * Each document's text is a header line (`crate::name (kind) — relPath`), the
 * item's doc comment if it has one, and a fenced `rust` snippet that starts
 * at the declaration line and runs for at most 800 characters.
 *
 * @param src     contents of the `.rs` file
 * @param crate   crate name used in the id and the header
 * @param relPath path shown in the header
 * @returns documents in source order; only the first item per id is kept
 */
function extractItems(src: string, crate: string, relPath: string): Doc[] {
  const docs: Doc[] = [];
  const seen = new Set<string>();

  // Quick test-module guard: drop everything from the first column-0
  // `#[cfg(test)]` / `mod tests` line onward. Coarse but adequate — public
  // items inside tests are rare. `>= 0` so a marker on the very first line
  // also counts.
  const cutoff = src.search(/^(#\[cfg\(test\)\]|mod tests\b)/m);
  const usable = cutoff >= 0 ? src.slice(0, cutoff) : src;

  for (const m of usable.matchAll(ITEM_RE)) {
    const declStart = m.index ?? 0;
    const [, rawKind = "", name = ""] = m;
    const kind = rawKind.replace(/^async[ \t]+/, "async_");

    const id = `symbol:${crate}::${kind}::${name}`;
    if (seen.has(id)) continue;
    seen.add(id);

    const docLines = docBlockAbove(usable, declStart);

    // Signature + body preview: stop before a column-0 closing brace or at a
    // blank line followed by a new top-level line, else take the full window.
    const after = usable.slice(declStart, declStart + 800);
    const bodyEnd = after.search(/\n\}\n|\n\n[a-z#]/);
    const body = bodyEnd > 0 ? after.slice(0, bodyEnd) : after;

    const header = `${crate}::${name} (${kind}) — ${relPath}`;
    const docText = docLines.length > 0 ? `\n${docLines.join("\n")}\n` : "\n";
    docs.push({ id, text: `${header}\n${docText}\n\`\`\`rust\n${body}\n\`\`\`` });
  }
  return docs;
}

/**
 * Build the full corpus: run `extractItems` over every `.rs` file found under
 * `<ROOT>/crates`.
 *
 * NOTE(review): ids are de-duplicated per file only, so same-named items in
 * different files of one crate (e.g. several `pub fn new`) still produce
 * duplicate ids here — confirm the gateway tolerates that.
 */
function buildDocs(): Doc[] {
  const cratesDir = resolve(ROOT, "crates");
  const docs: Doc[] = [];
  for (const file of walkRs(cratesDir)) {
    const src = readFileSync(file, "utf8");
    docs.push(...extractItems(src, crateOf(file), relative(ROOT, file)));
  }
  return docs;
}

/**
 * CLI entry point.
 *
 * Always prints corpus statistics. With `--print` it also prints the first
 * three documents and stops; with `--dry-run` it stops after the statistics.
 * Otherwise it POSTs the corpus to `${GATEWAY}/vectors/index` and exits with
 * status 1 on a non-OK response.
 */
async function main(): Promise<void> {
  const printOnly = process.argv.includes("--print");
  const dryRun = process.argv.includes("--dry-run") || printOnly;

  const docs = buildDocs();
  // Note: this sums string lengths (UTF-16 code units), labelled "bytes" below.
  const totalBytes = docs.reduce((sum, d) => sum + d.text.length, 0);

  const byCrate = new Map<string, number>();
  for (const d of docs) {
    const crate = (d.id.split("::")[0] ?? "").replace("symbol:", "");
    byCrate.set(crate, (byCrate.get(crate) ?? 0) + 1);
  }

  console.log(`[corpus-C] ${docs.length} symbols · ${totalBytes} bytes · chunk_size=${CHUNK_SIZE}`);
  console.log(`[corpus-C] by crate: ${[...byCrate.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);

  if (printOnly) {
    for (const d of docs.slice(0, 3)) {
      console.log(` ${d.id} (${d.text.length}b)\n ${d.text.slice(0, 200).replace(/\n/g, "\n ")}\n`);
    }
    return;
  }
  if (dryRun) return;

  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME,
      source: SOURCE_LABEL,
      documents: docs,
      chunk_size: CHUNK_SIZE,
      overlap: OVERLAP,
    }),
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[corpus-C] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }
  // Only these three fields of the response are read, and only for logging.
  const j = (await r.json()) as { job_id?: unknown; documents?: unknown; chunks?: unknown };
  console.log(`[corpus-C] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
}

main().catch((e: unknown) => {
  console.error(e);
  process.exit(1);
});