lakehouse/scripts/build_lakehouse_corpus.ts

#!/usr/bin/env bun
/**
 * Build the `lakehouse_arch_v1` corpus — Option A from 2026-04-26
 * corpus-tightening pass. Sources: DECISIONS.md ADRs, standalone
 * ADR-NNN-*.md docs, PHASES.md per-phase entries, PRD.md,
 * CONTROL_PLANE_PRD.md, SCRUM_MASTER_SPEC.md sections.
 *
 * doc_id encodes origin (adr:017, phase:19, prd:executive_summary, ...)
 * so the reviewer prompt's [tag] surfaces useful context.
 *
 * Usage:
 *   bun run scripts/build_lakehouse_corpus.ts            # build
 *   bun run scripts/build_lakehouse_corpus.ts --dry-run  # print summary only, don't POST
 *   bun run scripts/build_lakehouse_corpus.ts --print    # summary + first/last 3 doc IDs, don't POST
 */

import { readFileSync, readdirSync } from "node:fs";
import { resolve } from "node:path";

// Directory one level above this script's own directory (`import.meta.dir`);
// every input document path is resolved relative to it.
const ROOT = resolve(import.meta.dir, "..");
// Base URL that the index request is POSTed to; override with LH_GATEWAY.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
// Name of the target index; override with LH_CORPUS_NAME.
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_arch_v1";
// Value sent as the `source` field of the index request.
const SOURCE_LABEL = "lakehouse_arch";
// Chunking parameters forwarded in the request body. Both pass through
// Number(), so a non-numeric environment value yields NaN here.
const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 1500);
const OVERLAP = Number(process.env.LH_OVERLAP ?? 150);

/** One corpus document: an origin-encoding id plus its markdown text. */
interface Doc { id: string; text: string }

/**
 * Turn arbitrary text into an identifier fragment: lower-cased, every run of
 * characters outside `[a-z0-9]` collapsed to a single `_`, no leading or
 * trailing `_`, at most 60 characters.
 *
 * Only ASCII letters and digits survive; any other character (including
 * accented letters) is treated as a separator.
 *
 * @param s Text to convert (for example a heading or a file-name stem).
 * @returns The slug; empty when `s` contains no ASCII letters or digits.
 */
function slug(s: string): string {
  return s
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "_")
    // Strip the leading separator first so the 60-character budget is spent
    // on real content.
    .replace(/^_+/, "")
    .slice(0, 60)
    // Strip the trailing separator AFTER truncating: the cut can land right
    // behind a "_" and would otherwise leave it dangling at the end.
    .replace(/_+$/, "");
}

/**
 * Break DECISIONS.md into one document per `## ADR-NNN: title` heading.
 *
 * The first `**Date:**` line of each entry is removed so it doesn't dilute
 * the embedding (ADRs are about intent, not when they happened). Text before
 * the first ADR heading is discarded.
 *
 * @param md Full markdown text of DECISIONS.md.
 * @returns Documents with id `adr:NNN` (number zero-padded to three digits)
 *          and text `# ADR-NNN: title`, a blank line, then the trimmed body.
 */
function chunkDecisionsMd(md: string): Doc[] {
  // split() splices both capture groups into its result, giving
  // [preamble, num, title, body, num, title, body, ...].
  const pieces = md.split(/^## ADR-(\d+):\s*(.+)$/m);
  const adrCount = (pieces.length - 1) / 3;

  return Array.from({ length: adrCount }, (_unused, k) => {
    const base = 1 + k * 3;
    const adrNum = pieces[base].padStart(3, "0");
    const heading = pieces[base + 1].trim();
    const withoutDate = pieces[base + 2].replace(/^\*\*Date:\*\*.*$/m, "");
    return {
      id: `adr:${adrNum}`,
      text: `# ADR-${adrNum}: ${heading}\n\n${withoutDate.trim()}`,
    };
  });
}

/**
 * Load every standalone `ADR-NNN-*.md` file found directly in `dir`, one
 * document per file, with the file content used verbatim as the text.
 *
 * @param dir Directory to scan (not recursive); other entries are ignored.
 * @returns Documents with id `adr_doc:NNN_<slug of the rest of the name>`,
 *          ordered by id so the result does not depend on directory listing
 *          order.
 * @throws Whatever `readdirSync` / `readFileSync` throw (missing directory,
 *         unreadable file).
 */
function chunkStandaloneAdrs(dir: string): Doc[] {
  const docs: Doc[] = [];
  for (const f of readdirSync(dir)) {
    const m = f.match(/^ADR-(\d+)-(.+)\.md$/);
    if (!m) continue;
    const num = m[1].padStart(3, "0");
    docs.push({
      id: `adr_doc:${num}_${slug(m[2])}`,
      text: readFileSync(resolve(dir, f), "utf8"),
    });
  }
  // Directory listing order is filesystem-dependent. Sorting by id (the ADR
  // number is zero-padded, so this is numeric order up to three digits)
  // keeps the request payload and the --print output reproducible.
  docs.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
  return docs;
}

/**
 * Break PHASES.md into one document per `## Phase ...` heading. Everything
 * up to the next such heading (nested checklists, sub-bullets) stays with
 * its phase so context is intact. Phases with an empty body are skipped, and
 * text before the first phase heading is discarded.
 *
 * @param md Full markdown text of PHASES.md.
 * @returns Documents with id `phase:<slug>`, where the slug comes from the
 *          first whitespace-delimited token after "Phase" when there is one,
 *          otherwise from the whole heading.
 */
function chunkPhasesMd(md: string): Doc[] {
  // One capture group => [preamble, heading, body, heading, body, ...].
  const parts = md.split(/^## (Phase[^\n]*)$/m);
  const out: Doc[] = [];

  let idx = 1;
  while (idx < parts.length) {
    const title = parts[idx].trim();
    const content = parts[idx + 1].trim();
    idx += 2;
    if (content === "") continue;

    const numMatch = /Phase\s+(\S+)/.exec(title);
    const key = numMatch === null ? title : numMatch[1];
    out.push({ id: `phase:${slug(key)}`, text: `## ${title}\n${content}` });
  }
  return out;
}

/**
 * Generic splitter: read a markdown file and emit one document per
 * `## Section` heading.
 *
 * - No `## ` heading at all: the whole file is returned as a single
 *   `<originPrefix>:_full` document and the server-side chunker is left to
 *   handle it.
 * - Text before the first heading is kept as `<originPrefix>:_preamble`
 *   only when it is longer than 200 characters after trimming.
 * - Sections with an empty body are skipped.
 *
 * Any line beginning with `## ` counts as a heading, including one inside a
 * fenced code block.
 *
 * @param filePath Path of the markdown file to read.
 * @param originPrefix Prefix placed before the `:` in every produced id.
 * @returns The documents in file order.
 */
function chunkBySectionH2(filePath: string, originPrefix: string): Doc[] {
  const raw = readFileSync(filePath, "utf8");
  // One capture group => [preamble, heading, body, heading, body, ...].
  const [preamble, ...rest] = raw.split(/^## (.+)$/m);
  if (rest.length < 2) {
    return [{ id: `${originPrefix}:_full`, text: raw }];
  }

  const out: Doc[] = [];
  const intro = preamble.trim();
  if (intro.length > 200) {
    out.push({ id: `${originPrefix}:_preamble`, text: intro });
  }

  for (let k = 0; k < rest.length; k += 2) {
    const title = rest[k].trim();
    const content = rest[k + 1].trim();
    if (content !== "") {
      out.push({
        id: `${originPrefix}:${slug(title)}`,
        text: `## ${title}\n${content}`,
      });
    }
  }
  return out;
}

/**
 * Assemble the full corpus by running each source-specific splitter and
 * concatenating the results in a fixed order: DECISIONS.md ADRs, standalone
 * ADR files, PHASES.md phases, then the H2 sections of PRD.md,
 * CONTROL_PLANE_PRD.md and SCRUM_MASTER_SPEC.md.
 *
 * @returns Every document to index.
 * @throws If any of the source files or the docs directory cannot be read.
 */
function buildAllDocs(): Doc[] {
  const fromRoot = (rel: string): string => resolve(ROOT, rel);
  const readText = (rel: string): string => readFileSync(fromRoot(rel), "utf8");

  // Array elements are evaluated in order, so files are read in the order
  // listed here.
  const groups: Doc[][] = [
    chunkDecisionsMd(readText("docs/DECISIONS.md")),
    chunkStandaloneAdrs(fromRoot("docs")),
    chunkPhasesMd(readText("docs/PHASES.md")),
    chunkBySectionH2(fromRoot("docs/PRD.md"), "prd"),
    chunkBySectionH2(fromRoot("docs/CONTROL_PLANE_PRD.md"), "ctrl_prd"),
    chunkBySectionH2(fromRoot("docs/SCRUM_MASTER_SPEC.md"), "scrum_spec"),
  ];
  return groups.flat();
}

/**
 * Script entry point: build the corpus, print a summary, and (unless
 * `--dry-run` or `--print` is given) POST it to `${GATEWAY}/vectors/index`.
 *
 * Flags read from `process.argv`:
 *   --dry-run  print the summary only, then return without POSTing
 *   --print    print the summary plus the first and last three doc IDs,
 *              then return without POSTing
 *
 * @throws Error when the chunk size / overlap configuration is unusable, or
 *         when building the corpus or the request itself fails. A non-2xx
 *         response is logged and ends the process with status 1.
 */
async function main(): Promise<void> {
  // Fail fast on unusable chunking parameters. A non-numeric env value turns
  // into NaN (which JSON.stringify would send as null), and an overlap that
  // is not smaller than the chunk size makes the estimate below divide by
  // zero or a negative number.
  if (!Number.isFinite(CHUNK_SIZE) || CHUNK_SIZE <= 0) {
    throw new Error(`[corpus] invalid LH_CHUNK_SIZE: expected a positive number, got ${CHUNK_SIZE}`);
  }
  if (!Number.isFinite(OVERLAP) || OVERLAP < 0 || OVERLAP >= CHUNK_SIZE) {
    throw new Error(
      `[corpus] invalid LH_OVERLAP: expected 0 <= overlap < chunk size (${CHUNK_SIZE}), got ${OVERLAP}`,
    );
  }

  const printOnly = process.argv.includes("--print");
  // --print implies --dry-run: neither mode sends anything.
  const dryRun = printOnly || process.argv.includes("--dry-run");

  const docs = buildAllDocs();
  // String length counts UTF-16 code units, so "bytes" in the output below
  // is an approximation for non-ASCII text.
  const totalBytes = docs.reduce((s, d) => s + d.text.length, 0);
  // Rough estimate: each chunk advances by (chunk size - overlap).
  const expectedChunks = Math.ceil(totalBytes / (CHUNK_SIZE - OVERLAP));

  console.log(`[corpus] ${docs.length} documents · ${totalBytes} bytes · ~${expectedChunks} chunks at ${CHUNK_SIZE}/${OVERLAP}`);
  console.log(`[corpus] origins: ${[...new Set(docs.map(d => d.id.split(":")[0]))].join(", ")}`);

  if (printOnly) {
    const describe = (d: Doc): void =>
      console.log(`  ${d.id} (${d.text.length} bytes) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`);
    console.log("\n[corpus] first 3 doc IDs:");
    docs.slice(0, 3).forEach(describe);
    console.log("\n[corpus] last 3 doc IDs:");
    docs.slice(-3).forEach(describe);
    return;
  }
  if (dryRun) return;

  const body = {
    index_name: INDEX_NAME,
    source: SOURCE_LABEL,
    documents: docs,
    chunk_size: CHUNK_SIZE,
    overlap: OVERLAP,
  };

  console.log(`[corpus] POST ${GATEWAY}/vectors/index → ${INDEX_NAME}`);
  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    // Abort the request if it has not completed within 60 seconds.
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[corpus] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }

  // Only the fields echoed below are read. They are optional and untyped
  // because the response body is not validated here.
  interface IndexJobAck { job_id?: unknown; documents?: unknown; chunks?: unknown }
  const j = (await r.json()) as IndexJobAck;
  console.log(`[corpus] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
  console.log(`[corpus] poll: curl -s ${GATEWAY}/vectors/jobs/${j.job_id} | jq`);
  console.log(`[corpus] verify: curl -s '${GATEWAY}/vectors/indexes' | jq '.[]|select(.index_name=="${INDEX_NAME}")'`);
}

// Run the script; log any rejection from main() and exit with status 1.
main().catch(e => { console.error(e); process.exit(1); });