Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
The compose-don't-add discipline applied to the original ask: when big
models produce good results (scrum reviews + observer escalations),
save them into the matrix indexer so future small-model handlers can
retrieve them as scaffolding. The local model gets near-paid quality at
a fraction of the cost.
New: scripts/build_answers_corpus.ts indexes lakehouse_answers_v1
from data/_kb/scrum_reviews.jsonl + data/_kb/observer_escalations.jsonl.
doc_id prefixes ('review:' vs 'escalation:') let consumers same-file-
gate the prior-reviews case while keeping escalations broad.
observer.ts: buildKbPreamble adds lakehouse_answers_v1 as a third
retrieval source alongside pathway/bug_fingerprints + lakehouse_arch_v1.
qwen3.5:latest synthesis now compresses three lenses into a single
briefing for the cloud reviewer.
scrum_master_pipeline.ts: epilogue dispatches a fire-and-forget rebuild
of lakehouse_answers_v1 after each run so this run's accepted reviews
are retrievable within ~30s. LH_SCRUM_SKIP_ANSWERS_REBUILD=1 disables.
Verified live: kb_preamble grew 416 → 727 chars after wiring third
source; qwen3.5:latest synthesis (702 → 128 tokens) compresses
correctly; deepseek-v3.1-terminus diagnosis (301 → 148 tokens) is
sharper, citing architectural patterns (circuit breaker, adapter
files) instead of generic timeouts. Total cost per escalation
unchanged at ~$0.0002.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
139 lines
5.0 KiB
TypeScript
139 lines
5.0 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
|
|
* Build `lakehouse_answers_v1` — the gold-standard answer corpus.
|
|
*
|
|
* Sources:
|
|
* - data/_kb/scrum_reviews.jsonl (per-file scrum reviews)
|
|
* - data/_kb/observer_escalations.jsonl (deepseek-v3.1-terminus diagnoses)
|
|
*
|
|
* doc_id prefixes distinguish origin so consumers can same-file-gate
|
|
* (only return `review:<file>` chunks when the focus file matches) or
|
|
* broaden (always include `escalation:` for general task-class signal).
|
|
*
|
|
* Replaces scrum_findings_v1 — broader scope, cleaner gating story.
|
|
*
|
|
* Re-run after every scrum run / observer escalation lands. The
|
|
* pipeline's epilogue calls this; manual runs work too.
|
|
*/
|
|
|
|
import { readFileSync, existsSync } from "node:fs";
|
|
import { resolve } from "node:path";
|
|
|
|
// Parent of the directory that contains this script; the data/_kb input
// files are resolved against it. `import.meta.dir` is a Bun extension
// (matching the `bun` shebang), so this script is expected to run under Bun.
const ROOT = resolve(import.meta.dir, "..");
|
|
/**
 * Read a numeric tuning knob from the environment.
 *
 * An unset or blank variable yields `fallback`. A value that is not a finite
 * number, or is below `min`, is ignored with a warning and also yields
 * `fallback` — otherwise `Number("abc")` would produce NaN, which
 * JSON.stringify turns into `null` in the request body.
 *
 * @param name     Environment variable name.
 * @param fallback Value used when the variable is unset, blank, or invalid.
 * @param min      Smallest accepted value (inclusive).
 * @returns The parsed value, or `fallback`.
 */
function envNumber(name: string, fallback: number, min: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") return fallback;
  const value = Number(raw);
  if (!Number.isFinite(value) || value < min) {
    console.warn(`[answers] ignoring ${name}=${JSON.stringify(raw)}; using ${fallback}`);
    return fallback;
  }
  return value;
}

// Base URL of the gateway that receives the indexing request.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
// Name of the vector index to (re)build; overridable for experiments.
const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_answers_v1";
// Value sent as the `source` field of the indexing request.
const SOURCE_LABEL = "lakehouse_answers";
// Chunking parameters forwarded to the gateway as `chunk_size` / `overlap`.
const CHUNK_SIZE = envNumber("LH_CHUNK_SIZE", 1500, 1);
const OVERLAP = envNumber("LH_OVERLAP", 150, 0);
// Rows whose body text is shorter than this (measured via string `.length`)
// are treated as stubs and skipped.
const MIN_BYTES = 200;
|
|
|
|
/** One document submitted to the vector index. */
interface Doc {
  /** Unique document id; builders in this file prefix it with the origin (`review:` / `escalation:`). */
  id: string;
  /** Text to be chunked and embedded: a short metadata header followed by the body. */
  text: string;
}
|
|
|
|
/**
 * Derive a compact, id-safe slug from a file path.
 *
 * A leading `crates/` is dropped, every run of characters outside
 * `[A-Za-z0-9]` becomes a single `_`, and the result is capped at 40
 * characters. Letter case is preserved.
 *
 * @param path File path as recorded in a review row.
 * @returns The slug (at most 40 characters).
 */
function slugFile(path: string): string {
  const cratesPrefix = "crates/";
  const trimmed = path.startsWith(cratesPrefix) ? path.substring(cratesPrefix.length) : path;
  // Splitting on separator runs and re-joining is the same as replacing each run with "_".
  const underscored = trimmed.split(/[^a-z0-9]+/i).join("_");
  return underscored.substring(0, 40);
}
|
|
|
|
/**
 * Squash an ISO-style timestamp into a compact stamp.
 *
 * Removes every `-`, `:` and `T`, then keeps the first 14 characters
 * (e.g. `2025-01-02T03:04:05.678Z` -> `20250102030405`).
 *
 * @param iso Timestamp string; an empty string yields an empty string.
 * @returns The compacted stamp (at most 14 characters).
 */
function compactTs(iso: string): string {
  const stripped = iso.split(/[-:T]/).join("");
  return stripped.substring(0, 14);
}
|
|
|
|
/**
 * Build one Doc per usable row of `data/_kb/scrum_reviews.jsonl`.
 *
 * Each non-empty line is parsed as JSON. A row is skipped when it is not
 * valid JSON, is not a JSON object, has no string `file`, or its
 * `suggestions_preview` is not a string of at least MIN_BYTES characters.
 * Malformed rows never abort the build.
 *
 * Ids have the form `review:<slug>:<stamp>`, where `<stamp>` is the compacted
 * `reviewed_at` or `no_ts` when that field is missing or not a string. When
 * several rows produce the same id, the 2nd, 3rd, ... get `_2`, `_3`, ...
 * appended so ids stay unique within one run.
 *
 * @returns The documents in file order; an empty array when the file does not exist.
 */
function buildScrumReviewDocs(): Doc[] {
  const path = resolve(ROOT, "data/_kb/scrum_reviews.jsonl");
  if (!existsSync(path)) return [];

  const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
  const docs: Doc[] = [];
  const idCounts = new Map<string, number>();

  for (const line of lines) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    // JSON.parse also succeeds on `null`, numbers and strings; property
    // access on `null` would throw and take the whole build down.
    if (typeof parsed !== "object" || parsed === null) continue;
    const row = parsed as Record<string, unknown>;

    // Only strings are usable here: slugFile/compactTs call string methods,
    // and the stub filter relies on a real string length.
    const file = typeof row.file === "string" ? row.file : "";
    const preview = typeof row.suggestions_preview === "string" ? row.suggestions_preview : "";
    if (!file || preview.length < MIN_BYTES) continue;

    const ts = typeof row.reviewed_at === "string" ? compactTs(row.reviewed_at) : "";
    const baseId = `review:${slugFile(file)}:${ts || "no_ts"}`;
    const count = (idCounts.get(baseId) ?? 0) + 1;
    idCounts.set(baseId, count);
    const id = count === 1 ? baseId : `${baseId}_${count}`;

    const header = `File: ${file}\nReviewed: ${row.reviewed_at ?? "?"}\nModel: ${row.accepted_model ?? "?"}\nVerdict: ${row.verdict ?? "?"}\n\n`;
    docs.push({ id, text: header + preview });
  }
  return docs;
}
|
|
|
|
/**
 * Build one Doc per usable row of `data/_kb/observer_escalations.jsonl`.
 *
 * Each non-empty line is parsed as JSON. A row is skipped when it is not
 * valid JSON, is not a JSON object, or its `analysis` is not a string of at
 * least MIN_BYTES characters. Malformed rows never abort the build.
 *
 * Ids have the form `escalation:<sig>:<stamp>`, where `<sig>` is the first 12
 * characters of `sig_hash` (or `no_sig`) and `<stamp>` is the compacted `ts`,
 * or `no_ts` when that field is missing or not a string. When several rows
 * produce the same id, the 2nd, 3rd, ... get `_2`, `_3`, ... appended so ids
 * stay unique within one run.
 *
 * @returns The documents in file order; an empty array when the file does not exist.
 */
function buildEscalationDocs(): Doc[] {
  const path = resolve(ROOT, "data/_kb/observer_escalations.jsonl");
  if (!existsSync(path)) return [];

  const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
  const docs: Doc[] = [];
  const idCounts = new Map<string, number>();

  for (const line of lines) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    // JSON.parse also succeeds on `null`, numbers and strings; property
    // access on `null` would throw and take the whole build down.
    if (typeof parsed !== "object" || parsed === null) continue;
    const row = parsed as Record<string, unknown>;

    const analysis = typeof row.analysis === "string" ? row.analysis : "";
    if (analysis.length < MIN_BYTES) continue;

    // compactTs calls string methods, so a non-string `ts` (e.g. a numeric
    // epoch) falls back to the `no_ts` stamp instead of throwing.
    const ts = typeof row.ts === "string" ? compactTs(row.ts) : "";
    const sigSlug = String(row.sig_hash ?? "no_sig").slice(0, 12);
    const baseId = `escalation:${sigSlug}:${ts || "no_ts"}`;
    const count = (idCounts.get(baseId) ?? 0) + 1;
    idCounts.set(baseId, count);
    const id = count === 1 ? baseId : `${baseId}_${count}`;

    const header = `Failure cluster · sig_hash=${row.sig_hash ?? "?"} · cluster_size=${row.cluster_size ?? "?"} · endpoint=${row.cluster_endpoint ?? "?"}\nDiagnosed by: ${row.mode ?? "?"}\nWhen: ${row.ts ?? "?"}\n\n`;
    docs.push({ id, text: header + analysis });
  }
  return docs;
}
|
|
|
|
/**
 * CLI entry point: collect review and escalation docs, report counts, and
 * POST them to the gateway's `/vectors/index` endpoint.
 *
 * Flags (read from process.argv):
 *   --print    log a short sample of the docs and stop without indexing
 *   --dry-run  log the counts only and stop without indexing
 *
 * Nothing is posted when there are no docs. A non-2xx response is logged and
 * terminates the process with exit code 1.
 */
async function main(): Promise<void> {
  const argv = process.argv;
  const printOnly = argv.includes("--print");
  const dryRun = printOnly || argv.includes("--dry-run");

  const reviews = buildScrumReviewDocs();
  const escalations = buildEscalationDocs();
  const docs = [...reviews, ...escalations];

  let totalBytes = 0;
  for (const doc of docs) {
    totalBytes += doc.text.length;
  }

  console.log(`[answers] ${docs.length} docs · ${totalBytes} bytes`);
  console.log(`[answers] reviews: ${reviews.length}`);
  console.log(`[answers] escalations: ${escalations.length}`);

  if (printOnly) {
    // Sample: the first two docs overall, plus the id of the newest escalation.
    for (const doc of docs.slice(0, 2)) {
      const preview = doc.text.slice(0, 100).replace(/\n/g, " ");
      console.log(` ${doc.id} (${doc.text.length}b) ${preview}…`);
    }
    const lastEscalation = escalations[escalations.length - 1];
    if (lastEscalation !== undefined) {
      console.log(` ... (last escalation): ${lastEscalation.id}`);
    }
    return;
  }

  if (dryRun) return;

  if (docs.length === 0) {
    console.log("[answers] no docs to index — skipping POST");
    return;
  }

  const payload = {
    index_name: INDEX_NAME,
    source: SOURCE_LABEL,
    documents: docs,
    chunk_size: CHUNK_SIZE,
    overlap: OVERLAP,
  };
  const response = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(payload),
    // Abort the request if the gateway has not answered within 60 seconds.
    signal: AbortSignal.timeout(60_000),
  });

  if (!response.ok) {
    const detail = await response.text();
    console.error(`[answers] HTTP ${response.status}: ${detail}`);
    process.exit(1);
  }

  const job: any = await response.json();
  console.log(`[answers] job ${job.job_id} · ${job.documents} docs → ${job.chunks} chunks queued`);
}
|
|
|
|
// Run the CLI; any rejection from main() is logged and becomes exit code 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|