#!/usr/bin/env bun
/**
 * Build `lakehouse_answers_v1` — the gold-standard answer corpus.
 *
 * Sources:
 * - data/_kb/scrum_reviews.jsonl (per-file scrum reviews)
 * - data/_kb/observer_escalations.jsonl (deepseek-v3.1-terminus diagnoses)
 *
 * doc_id prefixes distinguish origin so consumers can same-file-gate
 * (only return `review:` chunks when the focus file matches) or
 * broaden (always include `escalation:` for general task-class signal).
 *
 * Replaces scrum_findings_v1 — broader scope, cleaner gating story.
 *
 * Re-run after every scrum run / observer escalation lands. The
 * pipeline's epilogue calls this; manual runs work too.
 */
import { readFileSync, existsSync } from "node:fs";
import { resolve } from "node:path";

const ROOT = resolve(import.meta.dir, "..");
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_answers_v1";
const SOURCE_LABEL = "lakehouse_answers";
const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 1500);
const OVERLAP = Number(process.env.LH_OVERLAP ?? 150);
const MIN_BYTES = 200; // skip stub rows

interface Doc { id: string; text: string }

/** Slug a source path into a short, id-safe token: drop the `crates/` prefix,
 * collapse non-alphanumeric runs to `_`, truncate to 40 chars. */
function slugFile(path: string): string {
  return path.replace(/^crates\//, "").replace(/[^a-z0-9]+/gi, "_").slice(0, 40);
}

/** Compact an ISO timestamp to digits (YYYYMMDDHHmmss) for use in doc ids. */
function compactTs(iso: string): string {
  return iso.replace(/[-:T]/g, "").slice(0, 14);
}

function buildScrumReviewDocs(): Doc[] {
  const path = resolve(ROOT, "data/_kb/scrum_reviews.jsonl");
  if (!existsSync(path)) return [];
  const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
  const docs: Doc[] = [];
  const idCounts = new Map<string, number>();
  for (const line of lines) {
    let row: any;
    try { row = JSON.parse(line); } catch { continue; }
    const file = row.file ?? "";
    const preview = row.suggestions_preview ?? "";
    if (!file || preview.length < MIN_BYTES) continue;
    const ts = compactTs(row.reviewed_at ?? "");
    const baseId = `review:${slugFile(file)}:${ts || "no_ts"}`;
    // Disambiguate same-file, same-timestamp rows with a numeric suffix.
    const count = (idCounts.get(baseId) ?? 0) + 1;
    idCounts.set(baseId, count);
    const id = count === 1 ? baseId : `${baseId}_${count}`;
    const header = `File: ${file}\nReviewed: ${row.reviewed_at ?? "?"}\nModel: ${row.accepted_model ?? "?"}\nVerdict: ${row.verdict ?? "?"}\n\n`;
    docs.push({ id, text: header + preview });
  }
  return docs;
}

function buildEscalationDocs(): Doc[] {
  const path = resolve(ROOT, "data/_kb/observer_escalations.jsonl");
  if (!existsSync(path)) return [];
  const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
  const docs: Doc[] = [];
  const idCounts = new Map<string, number>();
  for (const line of lines) {
    let row: any;
    try { row = JSON.parse(line); } catch { continue; }
    const analysis = row.analysis ?? "";
    if (analysis.length < MIN_BYTES) continue;
    const ts = compactTs(row.ts ?? "");
    const sigSlug = String(row.sig_hash ?? "no_sig").slice(0, 12);
    const baseId = `escalation:${sigSlug}:${ts || "no_ts"}`;
    // Same dedupe scheme as review docs.
    const count = (idCounts.get(baseId) ?? 0) + 1;
    idCounts.set(baseId, count);
    const id = count === 1 ? baseId : `${baseId}_${count}`;
    const header = `Failure cluster · sig_hash=${row.sig_hash ?? "?"} · cluster_size=${row.cluster_size ?? "?"} · endpoint=${row.cluster_endpoint ?? "?"}\nDiagnosed by: ${row.mode ?? "?"}\nWhen: ${row.ts ?? "?"}\n\n`;
    docs.push({ id, text: header + analysis });
  }
  return docs;
}

async function main() {
  const dryRun = process.argv.includes("--dry-run") || process.argv.includes("--print");
  const printOnly = process.argv.includes("--print");

  const reviews = buildScrumReviewDocs();
  const escalations = buildEscalationDocs();
  const docs = [...reviews, ...escalations];
  const totalBytes = docs.reduce((s, d) => s + d.text.length, 0);

  console.log(`[answers] ${docs.length} docs · ${totalBytes} bytes`);
  console.log(`[answers] reviews: ${reviews.length}`);
  console.log(`[answers] escalations: ${escalations.length}`);

  if (printOnly) {
    docs.slice(0, 2).forEach(d =>
      console.log(` ${d.id} (${d.text.length}b) ${d.text.slice(0, 100).replace(/\n/g, " ")}…`));
    if (escalations.length > 0) {
      console.log(` ... (last escalation): ${escalations[escalations.length - 1].id}`);
    }
    return;
  }
  if (dryRun) return;
  if (docs.length === 0) {
    console.log("[answers] no docs to index — skipping POST");
    return;
  }

  // POST the corpus to the gateway indexing endpoint; it replies with a job id and chunk count.
  const r = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME,
      source: SOURCE_LABEL,
      documents: docs,
      chunk_size: CHUNK_SIZE,
      overlap: OVERLAP,
    }),
    signal: AbortSignal.timeout(60_000),
  });
  if (!r.ok) {
    console.error(`[answers] HTTP ${r.status}: ${await r.text()}`);
    process.exit(1);
  }
  const j: any = await r.json();
  console.log(`[answers] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`);
}

main().catch(e => { console.error(e); process.exit(1); });