Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing matrix corpora is anti-additive on strong models — composed lakehouse_arch + symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded findings, p=0.031). Default flips to isolation; the matrix path now auto-downgrades when the resolved model is strong.

Mode runner:
- matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec)
- top_k=6 from each corpus, merge by score, take top 8 globally (see the sketch below)
- chunk tag prefers doc_id over source so the reviewer sees [adr:009] vs [lakehouse_arch]
- is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation for strong models (default-strong; weak = :free suffix or local last-resort)
- LH_FORCE_FULL_ENRICHMENT=1 bypasses the gate for diagnostic runs
- EnrichmentSources.downgraded_from records when the gate fires

Three corpora indexed via /vectors/index (5849 chunks total):
- lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks)
- scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED from defaults — 24% out-of-bounds line citations from cross-file drift)
- lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks)

Experiment infra:
- scripts/build_*_corpus.ts — re-runnable when source content changes
- scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file
- scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head; parser handles numbered, path-with-line, and path-with-symbol finding tables
- scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora
- scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast, --corpus flag for per-call override

Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
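A minimal sketch of the merge and downgrade behavior above, in TypeScript rather than the runner's actual Rust. Chunk, SearchFn, mergeMatrixCorpora, chunkTag, resolveMode, and the local/ prefix check are invented for illustration; only the constants (top_k=6, top 8 globally, the mode names, LH_FORCE_FULL_ENRICHMENT) come from the change itself:

interface Chunk { docId: string; source: string; score: number; text: string }

type SearchFn = (corpus: string, query: string, topK: number) => Promise<Chunk[]>;

async function mergeMatrixCorpora(
  corpora: string[],
  query: string,
  search: SearchFn,
): Promise<Chunk[]> {
  // top_k=6 from each corpus, merged by score, top 8 kept globally
  const perCorpus = await Promise.all(corpora.map(c => search(c, query, 6)));
  return perCorpus.flat().sort((a, b) => b.score - a.score).slice(0, 8);
}

function chunkTag(c: Chunk): string {
  // prefer doc_id over source so the reviewer sees [adr:009], not [lakehouse_arch]
  return `[${c.docId || c.source}]`;
}

function isWeakModel(model: string): boolean {
  // weak = ":free"-suffixed model or the local last-resort (prefix is a guess)
  return model.endsWith(":free") || model.startsWith("local/");
}

function resolveMode(requested: string, model: string): string {
  // strong models auto-downgrade; LH_FORCE_FULL_ENRICHMENT=1 bypasses the gate
  if (requested === "codereview_lakehouse" && !isWeakModel(model)
      && process.env.LH_FORCE_FULL_ENRICHMENT !== "1") {
    return "codereview_isolation"; // record in EnrichmentSources.downgraded_from
  }
  return requested;
}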
177 lines · 6.4 KiB · TypeScript
#!/usr/bin/env bun
/**
 * Build the `lakehouse_arch_v1` corpus — Option A from 2026-04-26
 * corpus-tightening pass. Sources: DECISIONS.md ADRs, standalone
 * ADR-NNN-*.md docs, PHASES.md per-phase entries, PRD.md,
 * CONTROL_PLANE_PRD.md, SCRUM_MASTER_SPEC.md sections.
 *
 * doc_id encodes origin (adr:017, phase:19, prd:executive_summary, ...)
 * so the reviewer prompt's [tag] surfaces useful context.
 *
 * Usage:
 *   bun run scripts/build_lakehouse_corpus.ts            # build
 *   bun run scripts/build_lakehouse_corpus.ts --dry-run  # show docs, don't POST
 *   bun run scripts/build_lakehouse_corpus.ts --print    # dump first chunk + count
 */

import { readFileSync, readdirSync } from "node:fs";
import { resolve } from "node:path";

const ROOT = resolve(import.meta.dir, "..");
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_arch_v1";
const SOURCE_LABEL = "lakehouse_arch";
const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 1500);
const OVERLAP = Number(process.env.LH_OVERLAP ?? 150);

interface Doc { id: string; text: string }

function slug(s: string): string {
  return s
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "_")
    .replace(/^_+|_+$/g, "")
    .slice(0, 60);
}

// Split DECISIONS.md by `## ADR-NNN: title`. Drop date line so it doesn't
// dilute the embedding (ADRs are about intent, not when they happened).
function chunkDecisionsMd(md: string): Doc[] {
  const docs: Doc[] = [];
  const sections = md.split(/^## ADR-(\d+):\s*(.+)$/m);
  // sections = [preamble, num, title, body, num, title, body, ...]
  for (let i = 1; i < sections.length; i += 3) {
    const num = sections[i].padStart(3, "0");
    const title = sections[i + 1].trim();
    const body = sections[i + 2]
      .replace(/^\*\*Date:\*\*.*$/m, "")
      .trim();
    docs.push({
      id: `adr:${num}`,
      text: `# ADR-${num}: ${title}\n\n${body}`,
    });
  }
  return docs;
}
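// Shape check for the stride-3 iteration above (hypothetical input, not part
// of the build): a split with two capture groups yields
//   [preamble, num, title, body, num, title, body, ...]
// e.g. chunkDecisionsMd("intro\n## ADR-9: Use Parquet\n**Date:** 2026-01-01\nBody.")
//   → [{ id: "adr:009", text: "# ADR-009: Use Parquet\n\nBody." }]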
// Standalone ADR-NNN-*.md files in docs/ — keep one doc per file.
function chunkStandaloneAdrs(dir: string): Doc[] {
  const docs: Doc[] = [];
  for (const f of readdirSync(dir)) {
    const m = f.match(/^ADR-(\d+)-(.+)\.md$/);
    if (!m) continue;
    const num = m[1].padStart(3, "0");
    const slug_ = slug(m[2]);
    docs.push({
      id: `adr_doc:${num}_${slug_}`,
      text: readFileSync(resolve(dir, f), "utf8"),
    });
  }
  return docs;
}

// PHASES.md uses `## Phase N: title` headings + nested checklists. Split
// by phase. Sub-bullets stay with their parent phase so context is intact.
function chunkPhasesMd(md: string): Doc[] {
  const docs: Doc[] = [];
  const sections = md.split(/^## (Phase[^\n]*)$/m);
  for (let i = 1; i < sections.length; i += 2) {
    const heading = sections[i].trim();
    const body = sections[i + 1].trim();
    if (!body) continue;
    const phase_num_match = heading.match(/Phase\s+(\S+)/);
    const id_part = phase_num_match
      ? `phase:${slug(phase_num_match[1])}`
      : `phase:${slug(heading)}`;
    docs.push({ id: id_part, text: `## ${heading}\n${body}` });
  }
  return docs;
}

// Generic doc: split by `## Section` (top-level inside a single doc). If
// the section list is empty, return the whole file as one doc and let the
// server-side chunker handle it.
function chunkBySectionH2(filePath: string, originPrefix: string): Doc[] {
  const md = readFileSync(filePath, "utf8");
  const sections = md.split(/^## (.+)$/m);
  if (sections.length < 3) {
    return [{ id: `${originPrefix}:_full`, text: md }];
  }
  const docs: Doc[] = [];
  // Capture preamble (before any ## heading) if non-trivial
  if (sections[0].trim().length > 200) {
    docs.push({
      id: `${originPrefix}:_preamble`,
      text: sections[0].trim(),
    });
  }
  for (let i = 1; i < sections.length; i += 2) {
    const heading = sections[i].trim();
    const body = sections[i + 1].trim();
    if (!body) continue;
    docs.push({
      id: `${originPrefix}:${slug(heading)}`,
      text: `## ${heading}\n${body}`,
    });
  }
  return docs;
}
function buildAllDocs(): Doc[] {
  const docs: Doc[] = [];
  docs.push(...chunkDecisionsMd(readFileSync(resolve(ROOT, "docs/DECISIONS.md"), "utf8")));
  docs.push(...chunkStandaloneAdrs(resolve(ROOT, "docs")));
  docs.push(...chunkPhasesMd(readFileSync(resolve(ROOT, "docs/PHASES.md"), "utf8")));
  docs.push(...chunkBySectionH2(resolve(ROOT, "docs/PRD.md"), "prd"));
  docs.push(...chunkBySectionH2(resolve(ROOT, "docs/CONTROL_PLANE_PRD.md"), "ctrl_prd"));
  docs.push(...chunkBySectionH2(resolve(ROOT, "docs/SCRUM_MASTER_SPEC.md"), "scrum_spec"));
  return docs;
}

async function main() {
  const dryRun = process.argv.includes("--dry-run") || process.argv.includes("--print");
  const printOnly = process.argv.includes("--print");

  const docs = buildAllDocs();
  const totalBytes = docs.reduce((s, d) => s + d.text.length, 0);
  // Rough estimate: the server-side chunker advances CHUNK_SIZE - OVERLAP per chunk.
  const expectedChunks = Math.ceil(totalBytes / (CHUNK_SIZE - OVERLAP));

  console.log(`[corpus] ${docs.length} documents · ${totalBytes} bytes · ~${expectedChunks} chunks at ${CHUNK_SIZE}/${OVERLAP}`);
  console.log(`[corpus] origins: ${[...new Set(docs.map(d => d.id.split(":")[0]))].join(", ")}`);

  if (printOnly) {
    console.log("\n[corpus] first 3 doc IDs:");
    docs.slice(0, 3).forEach(d => console.log(`  ${d.id} (${d.text.length} bytes) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`));
    console.log("\n[corpus] last 3 doc IDs:");
    docs.slice(-3).forEach(d => console.log(`  ${d.id} (${d.text.length} bytes) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`));
    return;
  }
  if (dryRun) return;

  const body = {
    index_name: INDEX_NAME,
    source: SOURCE_LABEL,
    documents: docs,
    chunk_size: CHUNK_SIZE,
    overlap: OVERLAP,
  };

  console.log(`[corpus] POST ${GATEWAY}/vectors/index → ${INDEX_NAME}`);
  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[corpus] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }
  const j: any = await r.json();
  console.log(`[corpus] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
  console.log(`[corpus] poll: curl -s ${GATEWAY}/vectors/jobs/${j.job_id} | jq`);
  console.log(`[corpus] verify: curl -s '${GATEWAY}/vectors/indexes' | jq '.[]|select(.index_name=="${INDEX_NAME}")'`);
}

main().catch(e => { console.error(e); process.exit(1); });
|