Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing matrix corpora is anti-additive on strong models — composed lakehouse_arch + symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded findings, p=0.031). Default flips to isolation; the matrix path now auto-downgrades when the resolved model is strong.

Mode runner:
- matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec)
- top_k=6 from each corpus, merge by score, take top 8 globally
- chunk tag prefers doc_id over source so the reviewer sees [adr:009] vs [lakehouse_arch]
- is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation for strong models (default-strong; weak = :free suffix or local last-resort)
- LH_FORCE_FULL_ENRICHMENT=1 bypasses the gate for diagnostic runs
- EnrichmentSources.downgraded_from records when the gate fires

Three corpora indexed via /vectors/index (5849 chunks total):
- lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks)
- scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED from defaults — 24% out-of-bounds line citations from cross-file drift)
- lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks)

Experiment infra:
- scripts/build_*_corpus.ts — re-runnable when source content changes
- scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file
- scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head; parser handles numbered, path-with-line, and path-with-symbol finding tables
- scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora
- scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast, --corpus flag for per-call override

Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
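The per-corpus merge described in the mode-runner bullets (top_k=6 from each corpus, pooled, global top 8 by score) is roughly the following. A minimal TypeScript sketch only; the real implementation lives in the Rust mode runner, and `Chunk` and its field names here are illustrative:

```ts
// Illustrative sketch of the matrix-corpus merge, not the mode runner itself.
interface Chunk { doc_id: string; score: number; text: string }

function mergeMatrixCorpora(
  perCorpus: Chunk[][],  // one retrieval result set per corpus
  perCorpusK = 6,        // top_k taken from each corpus
  globalK = 8,           // global cap after pooling
): Chunk[] {
  return perCorpus
    .flatMap(chunks => [...chunks].sort((a, b) => b.score - a.score).slice(0, perCorpusK))
    .sort((a, b) => b.score - a.score)
    .slice(0, globalK);
}
```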
142 lines · 5.1 KiB · TypeScript
#!/usr/bin/env bun
/**
 * Build the `lakehouse_symbols_v1` corpus — Option C from 2026-04-26
 * pass. Extracts public Rust items with their /// doc comments from
 * crates/**\/*.rs. Regex-based — covers ~80% of definitions without
 * pulling in a syn-based parser.
 *
 * doc_id: `symbol:<crate>::<kind>::<name>` e.g. symbol:vectord::struct::PathwayTrace
 *
 * Each chunk includes: doc comment (if any) + signature + 1-2 lines
 * after the brace so reviewer sees field types / variants for structs
 * and enums.
 */

import { readFileSync, readdirSync, statSync } from "node:fs";
import { resolve, relative } from "node:path";

const ROOT = resolve(import.meta.dir, "..");
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_symbols_v1";
const SOURCE_LABEL = "lakehouse_symbols";
const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 800);
const OVERLAP = Number(process.env.LH_OVERLAP ?? 80);
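// All defaults above can be overridden per run via env, e.g. (values illustrative):
//   LH_CHUNK_SIZE=512 LH_OVERLAP=64 LH_GATEWAY=http://host:3100 bun <this-script>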

interface Doc { id: string; text: string }

function walkRs(dir: string): string[] {
  const out: string[] = [];
  for (const entry of readdirSync(dir)) {
    if (entry === "target" || entry.startsWith(".")) continue;
    const full = resolve(dir, entry);
    const st = statSync(full);
    if (st.isDirectory()) out.push(...walkRs(full));
    else if (entry.endsWith(".rs")) out.push(full);
  }
  return out;
}

function crateOf(rsPath: string): string {
  const rel = relative(resolve(ROOT, "crates"), rsPath);
  return rel.split("/")[0];
}
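// e.g. crateOf("<ROOT>/crates/vectord/src/trace.rs") → "vectord"
// (path illustrative; the first segment relative to crates/ is the crate name)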

// Match pub fn|struct|enum|trait declarations at line start. The preceding
// contiguous /// doc block is recovered by the backward walk in extractItems
// (keeping it out of the match so m.index points at the signature), and a few
// lines after are kept for signature + body preview. Items inside `mod tests`
// blocks and #[cfg(test)] are dropped by the cutoff in extractItems.
const ITEM_RE = /^[ \t]*pub(?:\([^)]+\))?[ \t]+(fn|struct|enum|trait|async[ \t]+fn)[ \t]+([A-Za-z_][A-Za-z0-9_]*)/gm;
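// Examples (names illustrative):
//   `pub struct PathwayTrace {`           → kind "struct",  name "PathwayTrace"
//   `pub(crate) async fn rebuild_index(`  → kind "async fn" (stored as async_fn), name "rebuild_index"
// `pub const`, `pub type`, and `pub mod` items are not captured.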

function extractItems(src: string, crate: string, relPath: string): Doc[] {
  const docs: Doc[] = [];
  const seen = new Set<string>();

  // Quick test-module guard: drop everything from a `mod tests {` or
  // `#[cfg(test)]` line onward. Coarse but adequate — public items
  // inside tests are rare.
  const cutoff = src.search(/^(#\[cfg\(test\)\]|mod tests\b)/m);
  const usable = cutoff >= 0 ? src.slice(0, cutoff) : src;

  for (const m of usable.matchAll(ITEM_RE)) {
    const matchStart = m.index!;
    const kind = m[1].replace(/^async[ \t]+/, "async_");
    const name = m[2];

    const id = `symbol:${crate}::${kind}::${name}`;
    if (seen.has(id)) continue;
    seen.add(id);

    // Walk backward to capture the contiguous /// doc block above.
    const lines = usable.slice(0, matchStart).split("\n");
    const docLines: string[] = [];
    for (let i = lines.length - 1; i >= 0; i--) {
      const t = lines[i].trim();
      if (t.startsWith("///")) docLines.unshift(t.replace(/^\/\/\/\s?/, ""));
      else if (t === "" || t.startsWith("#[")) continue; // skip attributes between doc and item
      else break;
    }

    // Capture signature + ~6 lines of body preview.
    const after = usable.slice(matchStart, matchStart + 800);
    const bodyEnd = after.search(/\n\}\n|\n\n[a-z#]/);
    const body = bodyEnd > 0 ? after.slice(0, bodyEnd) : after;

    const header = `${crate}::${name} (${kind}) — ${relPath}`;
    const docText = docLines.length > 0 ? `\n${docLines.join("\n")}\n` : "\n";
    docs.push({ id, text: `${header}\n${docText}\n\`\`\`rust\n${body}\n\`\`\`` });
  }
  return docs;
}
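// A resulting Doc, roughly (values illustrative):
//   id:   symbol:vectord::struct::PathwayTrace
//   text: vectord::PathwayTrace (struct) — crates/vectord/src/trace.rs
//         <doc comment lines, if any>
//         ```rust
//         pub struct PathwayTrace { ... }
//         ```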

function buildDocs(): Doc[] {
  const cratesDir = resolve(ROOT, "crates");
  const docs: Doc[] = [];
  for (const f of walkRs(cratesDir)) {
    const src = readFileSync(f, "utf8");
    const crate = crateOf(f);
    const rel = relative(ROOT, f);
    docs.push(...extractItems(src, crate, rel));
  }
  return docs;
}

async function main() {
  const printOnly = process.argv.includes("--print");
  const dryRun = process.argv.includes("--dry-run") || printOnly;

  const docs = buildDocs();
  const totalBytes = docs.reduce((s, d) => s + d.text.length, 0);
  const byCrate = new Map<string, number>();
  for (const d of docs) {
    const c = d.id.split("::")[0].replace("symbol:", "");
    byCrate.set(c, (byCrate.get(c) ?? 0) + 1);
  }
  console.log(`[corpus-C] ${docs.length} symbols · ${totalBytes} bytes · chunk_size=${CHUNK_SIZE}`);
  console.log(`[corpus-C] by crate: ${[...byCrate.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);

  if (printOnly) {
    docs.slice(0, 3).forEach(d => console.log(` ${d.id} (${d.text.length}b)\n ${d.text.slice(0, 200).replace(/\n/g, "\n ")}\n`));
    return;
  }
  if (dryRun) return;

  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME,
      source: SOURCE_LABEL,
      documents: docs,
      chunk_size: CHUNK_SIZE,
      overlap: OVERLAP,
    }),
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[corpus-C] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }
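  // Gateway response fields used below (shape inferred from this call site):
  // { job_id, documents, chunks }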
  const j: any = await r.json();
  console.log(`[corpus-C] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
}

main().catch(e => { console.error(e); process.exit(1); });
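// Usage (script path illustrative; flags and env vars as defined above):
//   bun scripts/build_symbols_corpus.ts --print    # preview 3 sample docs, no network
//   bun scripts/build_symbols_corpus.ts --dry-run  # build + count only, skip indexing
//   LH_GATEWAY=http://host:3100 bun scripts/build_symbols_corpus.ts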