#!/usr/bin/env bun
/**
 * Build the `lakehouse_arch_v1` corpus — Option A from the 2026-04-26
 * corpus-tightening pass. Sources: DECISIONS.md ADRs, standalone
 * ADR-NNN-*.md docs, PHASES.md per-phase entries, PRD.md,
 * CONTROL_PLANE_PRD.md, SCRUM_MASTER_SPEC.md sections.
 *
 * doc_id encodes origin (adr:017, phase:19, prd:executive_summary, ...)
 * so the reviewer prompt's [tag] surfaces useful context.
 *
 * Usage:
 *   bun run scripts/build_lakehouse_corpus.ts            # build
 *   bun run scripts/build_lakehouse_corpus.ts --dry-run  # print summary, don't POST
 *   bun run scripts/build_lakehouse_corpus.ts --print    # summary + first/last doc IDs, don't POST
 */
import { readFileSync, readdirSync } from "node:fs";
import { resolve } from "node:path";

const ROOT = resolve(import.meta.dir, "..");
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_arch_v1";
const SOURCE_LABEL = "lakehouse_arch";
const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 1500);
const OVERLAP = Number(process.env.LH_OVERLAP ?? 150);

interface Doc { id: string; text: string }

function slug(s: string): string {
  return s
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "_")
    .replace(/^_+|_+$/g, "")
    .slice(0, 60);
}

// Split DECISIONS.md by `## ADR-NNN: title`. Drop the date line so it doesn't
// dilute the embedding (ADRs are about intent, not when they happened).
function chunkDecisionsMd(md: string): Doc[] {
  const docs: Doc[] = [];
  const sections = md.split(/^## ADR-(\d+):\s*(.+)$/m);
  // sections = [preamble, num, title, body, num, title, body, ...]
  for (let i = 1; i < sections.length; i += 3) {
    const num = sections[i].padStart(3, "0");
    const title = sections[i + 1].trim();
    const body = sections[i + 2]
      .replace(/^\*\*Date:\*\*.*$/m, "")
      .trim();
    docs.push({
      id: `adr:${num}`,
      text: `# ADR-${num}: ${title}\n\n${body}`,
    });
  }
  return docs;
}

// Standalone ADR-NNN-*.md files in docs/ — keep one doc per file.
function chunkStandaloneAdrs(dir: string): Doc[] {
  const docs: Doc[] = [];
  for (const f of readdirSync(dir)) {
    const m = f.match(/^ADR-(\d+)-(.+)\.md$/);
    if (!m) continue;
    const num = m[1].padStart(3, "0");
    const nameSlug = slug(m[2]);
    docs.push({
      id: `adr_doc:${num}_${nameSlug}`,
      text: readFileSync(resolve(dir, f), "utf8"),
    });
  }
  return docs;
}

// PHASES.md uses `## Phase N: title` headings + nested checklists. Split
// by phase. Sub-bullets stay with their parent phase so context is intact.
function chunkPhasesMd(md: string): Doc[] {
  const docs: Doc[] = [];
  const sections = md.split(/^## (Phase[^\n]*)$/m);
  for (let i = 1; i < sections.length; i += 2) {
    const heading = sections[i].trim();
    const body = sections[i + 1].trim();
    if (!body) continue;
    const phaseMatch = heading.match(/Phase\s+(\S+)/);
    const id = phaseMatch
      ? `phase:${slug(phaseMatch[1])}`
      : `phase:${slug(heading)}`;
    docs.push({ id, text: `## ${heading}\n${body}` });
  }
  return docs;
}

// Generic doc: split by `## Section` (top-level inside a single doc). If the
// file has no `##` headings at all, return the whole file as one doc and let
// the server-side chunker handle it.
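//
// A quick illustration (made-up input, not from the repo) of why every
// chunker above and below indexes the split array the way it does:
// String.prototype.split interleaves regex capture groups into its result,
//   "intro\n## Scope\nbody A\n## Risks\nbody B".split(/^## (.+)$/m)
//   → ["intro\n", "Scope", "\nbody A\n", "Risks", "\nbody B"]
// i.e. [preamble, heading, body, heading, body, ...], so a one-capture
// split steps by 2 and the two-capture ADR split steps by 3.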
function chunkBySectionH2(filePath: string, originPrefix: string): Doc[] {
  const md = readFileSync(filePath, "utf8");
  const sections = md.split(/^## (.+)$/m);
  if (sections.length < 3) {
    return [{ id: `${originPrefix}:_full`, text: md }];
  }
  const docs: Doc[] = [];
  // Capture the preamble (before any ## heading) if non-trivial.
  if (sections[0].trim().length > 200) {
    docs.push({
      id: `${originPrefix}:_preamble`,
      text: sections[0].trim(),
    });
  }
  for (let i = 1; i < sections.length; i += 2) {
    const heading = sections[i].trim();
    const body = sections[i + 1].trim();
    if (!body) continue;
    docs.push({
      id: `${originPrefix}:${slug(heading)}`,
      text: `## ${heading}\n${body}`,
    });
  }
  return docs;
}

function buildAllDocs(): Doc[] {
  const docs: Doc[] = [];
  docs.push(...chunkDecisionsMd(readFileSync(resolve(ROOT, "docs/DECISIONS.md"), "utf8")));
  docs.push(...chunkStandaloneAdrs(resolve(ROOT, "docs")));
  docs.push(...chunkPhasesMd(readFileSync(resolve(ROOT, "docs/PHASES.md"), "utf8")));
  docs.push(...chunkBySectionH2(resolve(ROOT, "docs/PRD.md"), "prd"));
  docs.push(...chunkBySectionH2(resolve(ROOT, "docs/CONTROL_PLANE_PRD.md"), "ctrl_prd"));
  docs.push(...chunkBySectionH2(resolve(ROOT, "docs/SCRUM_MASTER_SPEC.md"), "scrum_spec"));
  return docs;
}

async function main() {
  const dryRun = process.argv.includes("--dry-run") || process.argv.includes("--print");
  const printOnly = process.argv.includes("--print");

  const docs = buildAllDocs();
  // `.length` counts UTF-16 code units (chars, not bytes); close enough
  // for a rough chunk estimate.
  const totalChars = docs.reduce((s, d) => s + d.text.length, 0);
  const expectedChunks = Math.ceil(totalChars / (CHUNK_SIZE - OVERLAP));
  console.log(`[corpus] ${docs.length} documents · ${totalChars} chars · ~${expectedChunks} chunks at ${CHUNK_SIZE}/${OVERLAP}`);
  console.log(`[corpus] origins: ${[...new Set(docs.map(d => d.id.split(":")[0]))].join(", ")}`);

  if (printOnly) {
    console.log("\n[corpus] first 3 doc IDs:");
    docs.slice(0, 3).forEach(d =>
      console.log(`  ${d.id} (${d.text.length} chars) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`));
    console.log("\n[corpus] last 3 doc IDs:");
    docs.slice(-3).forEach(d =>
      console.log(`  ${d.id} (${d.text.length} chars) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`));
    return;
  }
  if (dryRun) return;

  const body = {
    index_name: INDEX_NAME,
    source: SOURCE_LABEL,
    documents: docs,
    chunk_size: CHUNK_SIZE,
    overlap: OVERLAP,
  };
  console.log(`[corpus] POST ${GATEWAY}/vectors/index → ${INDEX_NAME}`);
  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[corpus] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }
  const j: any = await r.json();
  console.log(`[corpus] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
  console.log(`[corpus] poll:   curl -s ${GATEWAY}/vectors/jobs/${j.job_id} | jq`);
  console.log(`[corpus] verify: curl -s '${GATEWAY}/vectors/indexes' | jq '.[]|select(.index_name=="${INDEX_NAME}")'`);
}

main().catch(e => { console.error(e); process.exit(1); });
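
// Example invocation against a non-default gateway (the env var names are
// the ones read at the top of this file; the host and index name below are
// illustrative, not real endpoints):
//   LH_GATEWAY=http://gateway.internal:3100 LH_CORPUS_NAME=lakehouse_arch_v2 \
//     bun run scripts/build_lakehouse_corpus.ts --dry-run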