lakehouse/tests/real-world/consensus_reducer_design.ts
root 41b0a99ed2 chore: add real content that was sitting untracked
Surfaced by today's untracked-files audit. None of these are accidents —
multiple are referenced by name in CLAUDE.md and memory files but were
never added.

Categories:
- docs/PHASE_AUDIT_GUIDE.md (106 LOC) — Claude Code phase audit guidance
- ops/systemd/lakehouse-langfuse-bridge.service — Langfuse bridge unit
- package.json — top-level npm manifest
- scripts/e2e_pipeline_check.sh + production_smoke.sh — real test scripts
- reports/kimi/audit-last-week*.md — the "Two reports live" CLAUDE.md cites
- tests/multi-agent/scenarios/ — 44 staffing scenarios (cutover decision A)
- tests/multi-agent/playbooks/ — 102 playbook records
- tests/battery/, tests/agent_test/PRD.md, tests/real-world/* — real tests
- sidecar/sidecar/{lab_ui,pipeline_lab}.py — 888 LOC dev-only UIs that
  remain in service post-sidecar-drop (commit ba928b1 explicitly kept them)

Sensitivity check: scenarios use synthetic company names ("Heritage Foods",
"Cornerstone Fabrication"); audit reports describe code findings only;
no PII or secrets surfaced.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 22:22:10 -05:00

// consensus_reducer_design.ts — multi-model design consultation (round 3).
//
// J's ask: enhance the tree-split reducer to preserve FULL backtrack-able
// context (endpoints tried, attempt count per model in the ladder, KB
// sources retrieved, context7 bridge hits, MCP observer signals, audit
// verdicts) instead of collapsing to a summary. Then index the full
// context through our existing vectord matrix indexing (HNSW + Lance +
// playbook_memory) so successful pathways become hot-swappable — the
// system asks "what did we try, what worked, in what order" for a
// similar task class and gets a ranked playbook back.
//
// Before building, consult a set of diverse models and print their proposals
// side-by-side so we can pick the convergent design. (This file is round 3
// of the consultation; see the PROBES note below for this round's probe set.)
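// ---------------------------------------------------------------------------
// Illustrative sketch only (nothing below exists in the codebase yet): one
// possible shape for the hot-swap rule the brief asks the probes to design.
// PathwayHit, shouldShortCircuit, and the thresholds are hypothetical names
// invented here; the real rule is exactly what this consultation should
// produce.
interface PathwayHit {
  fingerprint: string; // e.g. task_class + file-path prefix + signal class
  rung: number;        // ladder rung that produced the accepted review
  hits: number;        // past files that matched this fingerprint
  winRate: number;     // fraction of those matches that passed audit
}

function shouldShortCircuit(hit: PathwayHit | null, minHits = 3, minWinRate = 0.8): boolean {
  // Skip the ladder only when the same fingerprint has won repeatedly.
  return hit !== null && hit.hits >= minHits && hit.winRate >= minWinRate;
}
// ---------------------------------------------------------------------------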
const GATEWAY = "http://localhost:3100";
const DESIGN_BRIEF = `
# Context — Lakehouse signal→commit loop
We run 6x scrum-master iterations that audit Rust crates for PRD
alignment, produce findings + confidence, and feed an auto-applier that
lands small mechanical commits through a cargo-green-and-warning-stable
gate. Key components:
- \`tests/real-world/scrum_master_pipeline.ts\` — orchestrator. 9-rung
  model LADDER (kimi-k2:1t → qwen3-coder:480b → deepseek → mistral-large
  → gpt-oss:120b → qwen3.5:397b → openrouter free rescues → local
  qwen3.5:latest). Each target file retrieves 5 PRD chunks + 5
  proposal-doc chunks via vectord RAG, tree-splits large files into 3.5K
  shards, asks each rung in order, accepts first response passing
  structural checks.
- \`mcp-server/observer.ts\` — receives scrum \`/event\` emissions
  (file, verdict, critical_failures_count, gradient_tier, attempts,
  reviewer_model, tree_split_fired). Escalates failure clusters to LLM
  Team by POSTing to /v1/chat with qwen3-coder:480b.
- \`context7-bridge\` — external library docs lookup.
- \`auditor/audit.ts\` — independent N=3 consensus re-check of scrum
  findings; writes to data/_kb/audit_facts.jsonl via LLM Team
  \`/api/run?mode=extract\`.
- \`crates/vectord/src/playbook_memory.rs\` — indexing for proven
  playbooks: PlaybookEntry, DocRef, FailureRecord, BoostEntry,
  PatternReport. Uses HNSW index + Lance columnar backend + promotion
  pipeline. Already battle-tested for workers/staffing queries.
- Tree-split REDUCER: after shards return map-style summaries, they are
  concatenated with internal §N§ markers and fed to a reviewer model to
  produce ONE file-level review. Currently the reducer sees summaries,
  not the full context behind each shard's conclusion.
# The problem
The reducer currently TRUNCATES to a short summary. When the auditor or
a future iteration wants to backtrack WHY the reducer concluded what it
did — which attempt succeeded, which failed, what KB chunks were
retrieved, what observer signal classified the file as LOOPING vs
CONVERGING — that context is lost. So:
1. Auditor can't verify citation provenance beyond the summary line.
2. Applier can't tell a "tried X, failed, qwen fixed it" playbook from a
   "tried X and it was easy" playbook — they look identical downstream.
3. The matrix indexing is only used for RAG chunks during the scrum
   pass, NOT for storing the full end-to-end pathway of a successful
   review.
# The design question
Propose an enhanced reducer + indexing design that:
(a) Preserves the FULL backtrack context per reviewed file:
    - every ladder attempt (model, ms, accepted_y/n, reject_reason)
    - every retrieved KB chunk (source doc, chunk id, cosine score, rank)
    - every observer signal (class, priors, prior-iter outcomes)
    - every context7 bridge hit (library, version pulled)
    - every sub-pipeline call (LLM Team extract results, audit consensus)
(b) Stores this pathway into vectord's matrix indexing alongside the
    review verdict so it becomes retrievable by similarity. When a new
    file's fingerprint (task_class + file-path prefix + signal class)
    matches a past successful pathway, the system can hot-swap by
    replaying or short-circuiting to the model/KB combo that worked.
(c) Surfaces the matrix-index hit rate as a feedback signal on the
    scrum's UI — "this file was solved 3 times before by the same ladder
    rung; consider short-circuiting to rung 5."
(d) Is compatible with the existing playbook_memory.rs primitives
    (PlaybookEntry, DocRef, FailureRecord, BoostEntry) — extend, don't
    replace. The indexing layer is in production for workers/staffing;
    we want the reducer pathway to piggyback on proven infrastructure.
# Constraints
- NO new crate. Extend vectord + scrum_master_pipeline.
- Full context can be LARGE — a reviewed file might have 5 retrievals,
  4 ladder attempts, 8 observer priors. Design the embedding /
  fingerprint so similar-but-not-identical pathways cluster.
- The reducer summary is still needed for the reviewer LLM input —
  don't remove it, ADD the full-context sidecar.
- Audit trail: every pathway must be replayable deterministically from
  what's stored (i.e., enough context to re-run without the original
  prompt cache).
# Required output (STRICT JSON, no prose, no markdown fences):
{
  "approach": "one-paragraph summary of your proposed design",
  "data_model": {
    "new_fields_on_playbook_entry": [...],
    "new_types": [ {"name": "...", "purpose": "...", "fields": [...]} ]
  },
  "storage_strategy": {
    "what_to_vectorize": "the text that becomes the embedding",
    "fingerprint_key": "the deterministic key for similarity retrieval",
    "backend": "HNSW, Lance, playbook_memory — pick"
  },
  "reducer_changes": {
    "inputs_added": [...],
    "outputs_added": [...],
    "compatibility_notes": "how existing callers stay working"
  },
  "hot_swap_logic": "concrete rule for when to skip the ladder and replay a past pathway",
  "ui_signal": "what to surface so J sees whether matrix indexing is earning its keep",
  "risks": [...],
  "why_this_beats_summarization": "one-paragraph argument"
}
`.trim();
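
// For concreteness only: one hypothetical answer to requirement (a) of the
// brief, sketching what a per-file backtrack record could look like. None of
// these types (PathwayRecord, LadderAttempt, KbRetrieval, ObserverSignal)
// exist in the codebase; the probes are being asked to propose the real model.
interface LadderAttempt {
  model: string;         // ladder rung tried
  ms: number;            // wall time for the attempt
  accepted: boolean;     // passed structural checks?
  rejectReason?: string;
}
interface KbRetrieval {
  sourceDoc: string;     // PRD or proposal doc
  chunkId: string;
  cosine: number;        // similarity score at retrieval time
  rank: number;
}
interface ObserverSignal {
  class: string;         // e.g. "LOOPING" | "CONVERGING"
  priors: string[];      // prior-iteration outcomes the observer saw
}
interface PathwayRecord {
  file: string;
  verdict: string;
  attempts: LadderAttempt[];
  retrievals: KbRetrieval[];
  signals: ObserverSignal[];
  context7Hits: { library: string; version: string }[];
  subPipelineCalls: string[]; // LLM Team extract / audit-consensus call ids
}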
interface Probe {
  name: string;
  provider: "ollama" | "ollama_cloud" | "openrouter";
  model: string;
}
// Round-3 probe set — 4 probes covering the remaining ladder rungs +
// architecture/provider diversity. J wanted all 4 of the untouched
// options so the aggregated 10-model signal is saturated across the
// usable ladder.
const PROBES: Probe[] = [
  { name: "qwen35-397b", provider: "ollama_cloud", model: "qwen3.5:397b" },
  { name: "openrouter-gpt-oss", provider: "openrouter", model: "openai/gpt-oss-120b:free" },
  { name: "openrouter-gemma3", provider: "openrouter", model: "google/gemma-3-27b-it:free" },
  { name: "qwen3-coder-480b-2", provider: "ollama_cloud", model: "qwen3-coder:480b" }, // second probe of the coding specialist — stability check
];
async function ask(p: Probe): Promise<{ name: string; raw: string; ms: number; error?: string }> {
  const started = Date.now();
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        provider: p.provider,
        model: p.model,
        messages: [
          { role: "system", content: "You are a senior architect. Output STRICT JSON only." },
          { role: "user", content: DESIGN_BRIEF },
        ],
        max_tokens: 3000,
        temperature: 0,
      }),
    });
    const ms = Date.now() - started;
    if (!r.ok) return { name: p.name, raw: "", ms, error: `HTTP ${r.status}: ${(await r.text()).slice(0, 200)}` };
    const j = await r.json();
    // Normalize across response shapes: direct, Ollama chat, OpenAI chat-completions.
    const content = j.content ?? j.message?.content ?? j.choices?.[0]?.message?.content ?? "";
    return { name: p.name, raw: String(content), ms };
  } catch (e: any) {
    return { name: p.name, raw: "", ms: Date.now() - started, error: String(e).slice(0, 200) };
  }
}
function extractJson(raw: string): any | null {
  let s = raw.trim();
  // Strip a leading ```/```json fence and a trailing ``` if the model added them.
  const fence = s.match(/^```(?:json)?\s*/);
  if (fence) s = s.slice(fence[0].length);
  if (s.endsWith("```")) s = s.slice(0, -3).trim();
  // Parse the outermost {...} span, ignoring any surrounding prose.
  const first = s.indexOf("{");
  const last = s.lastIndexOf("}");
  if (first < 0 || last <= first) return null;
  try {
    return JSON.parse(s.slice(first, last + 1));
  } catch {
    return null;
  }
}
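// Example behavior (hypothetical inputs, not from a real run):
//   extractJson('```json\n{"a":1}\n```')  → { a: 1 }   (fence stripped)
//   extractJson('Sure thing: {"a":1}')    → { a: 1 }   (outermost braces)
//   extractJson('no JSON here')           → null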
function summarize(obj: any, max = 240): string {
  if (!obj) return "(no JSON parsed)";
  if (typeof obj === "string") return obj.length > max ? obj.slice(0, max) + "…" : obj;
  if (Array.isArray(obj)) return obj.map((x) => summarize(x, max)).join("; ");
  return Object.entries(obj)
    .map(([k, v]) => `${k}=${summarize(v, max)}`)
    .join(" | ");
}
async function main() {
  console.log(`\n── N=${PROBES.length} design consensus ──`);
  console.log(`models: ${PROBES.map((p) => p.model).join(", ")}\n`);
  const results = await Promise.all(PROBES.map(ask));
  for (const r of results) {
    console.log(`\n── ${r.name} (${r.ms}ms) ──`);
    if (r.error) { console.log(` ERROR: ${r.error}`); continue; }
    const j = extractJson(r.raw);
    if (!j) {
      console.log(` raw (no JSON): ${r.raw.slice(0, 600)}`);
      continue;
    }
    console.log(` approach: ${summarize(j.approach, 400)}`);
    console.log(` fingerprint: ${summarize(j.storage_strategy?.fingerprint_key, 200)}`);
    console.log(` vectorize: ${summarize(j.storage_strategy?.what_to_vectorize, 200)}`);
    console.log(` backend: ${summarize(j.storage_strategy?.backend, 200)}`);
    console.log(` hot_swap: ${summarize(j.hot_swap_logic, 300)}`);
    console.log(` new_types: ${summarize(j.data_model?.new_types, 400)}`);
    console.log(` risks: ${summarize(j.risks, 300)}`);
    console.log(` why>summary: ${summarize(j.why_this_beats_summarization, 300)}`);
  }
  // Write full JSON to disk so we can inspect later.
  const outPath = `/home/profit/lakehouse/data/_kb/consensus_reducer_design_${Date.now().toString(36)}.json`;
  await Bun.write(outPath, JSON.stringify(results, null, 2));
  console.log(`\nfull responses → ${outPath}`);
}
await main();
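
// Run note (assumptions, not checked by this script): executed with Bun
// (top-level await and Bun.write above), the gateway from GATEWAY reachable
// on localhost:3100, and the data/_kb output directory writable.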