From 21fd3b9c6194502d043cfc4cf99081a14d769455 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 24 Apr 2026 02:25:43 -0500 Subject: [PATCH] Scrum-driven fixes: P5-001 auth wired, P42-001 truth evaluator, P9-001 journal on ingest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the highest-confidence findings from the Phase 0→42 forensic sweep after four scrum-master iterations under the adversarial prompt. Each fix is independently validated by a later scrum iteration scoring the same file higher under the same bar. Code changes ──────────── P5-001 — crates/gateway/src/auth.rs + main.rs api_key_auth was marked #[allow(dead_code)] and never wrapped around the router, so `[auth] enabled=true` logged a green message and enforced nothing. Now wired via from_fn_with_state, with constant-time header compare and /health exempted for LB probes. P42-001 — crates/truth/src/lib.rs TruthStore::check() ignored RuleCondition entirely — signature looked like enforcement, body returned every action unconditionally. Added evaluate(task_class, ctx) that actually walks FieldEquals / FieldEmpty / FieldGreater / Always against a serde_json::Value via dot-path lookup. check() kept for back-compat. Tests 14 → 24 (10 new exercising real pass/fail semantics). serde_json moved to [dependencies]. P9-001 (partial) — crates/ingestd/src/service.rs Added Optional to IngestState + a journal.record_ingest() call on /ingest/file success. Gateway wires it with `journal.clone()` before the /journal nest consumes the original. First-ever internal mutation journal event verified live (total_events_created 0→1 after probe). 
Iter-4 scrum scored these files higher under same prompt: ingestd/src/service.rs 3 → 6 (P9-001 visible) truth/src/lib.rs 3 → 4 (P42-001 visible) gateway/src/auth.rs 3 → 4 (P5-001 visible) gateway/src/execution_loop 4 → 6 (indirect) storaged/src/federation 3 → 4 (indirect) Infrastructure additions ──────────────────────── * tests/real-world/scrum_master_pipeline.ts - cloud-first ladder: kimi-k2:1t → deepseek-v3.1:671b → mistral-large-3:675b → gpt-oss:120b → devstral-2:123b → qwen3.5:397b (deep final thinker) - LH_SCRUM_FORENSIC env: injects SCRUM_FORENSIC_PROMPT.md as adversarial preamble - LH_SCRUM_PROPOSAL env: per-iter fix-wave doc override - Confidence extraction (markdown + JSON), schema v4 KB rows with: verdict, critical_failures_count, verified_components_count, missing_components_count, output_format, gradient_tier - Model trust profile written per file-accept to data/_kb/model_trust.jsonl - Fire-and-forget POST to observer /event so by_source.scrum appears in /stats * mcp-server/observer.ts — unchanged in shape, confirmed receiving scrum events * ui/ — new Visual Control Plane on :3950 - Bun.serve with /data/{services,reviews,metrics,trust,overrides,findings,file,refactor_signals,search,logs/:svc,scrum_log} - Views: MAP (D3 graph, 5 overlays) / TRACE (per-file iter timeline) / TRAJECTORY (refactor signals + reverse index search) / METRICS (explainers with SOURCE + GOOD lines) / KB (card grid with tooltips) / CONSOLE (per-service journalctl tail, tabs for gateway/sidecar/observer/mcp/ctx7/auditor/langfuse) - tryFetch always attempts JSON.parse (fix for observer returning JSON without content-type) - renderNodeContext primitive-vs-object guard (fix for gateway /health string) * docs/SCRUM_FIX_WAVE.md — iter-specific scope directing the scrum * docs/SCRUM_FORENSIC_PROMPT.md — adversarial audit prompt (verdict/critical/verified schema) * docs/SCRUM_LOOP_NOTES.md — iteration observations + fix-next-loop queue * docs/SYSTEM_EVOLUTION_LAYERS.md — Layers 1-10 
roadmap (trust profiling, execution DNA, drift sentinel, etc) Measurements across iterations ────────────────────────────── iter 1 (soft prompt, gpt-oss:120b): mean score 5.00/10 iter 3 (forensic, kimi-k2:1t): mean score 3.56/10 (−1.44 — bar raised) iter 4 (same bar, post fixes): mean score 4.00/10 (+0.44 — fixes landed) Score movement iter3→iter4: ↑5 ↓1 =12 21/21 first-attempt accept by kimi-k2:1t in iter 4 20/21 emitted forensic JSON (richer signal than markdown) 16 verified_components captured (proof-of-life, new metric) Permission Gradient distribution: 0 auto · 16 dry_run · 4 sim · 1 block Observer loop: by_source {scrum: 21, langfuse: 1985, phase24_audit: 1} v1/usage: 224 requests, 477K tokens, all tracked Signal classes per file (iter 3 → iter 4): CONVERGING: 1 (ingestd/service.rs — fix clearly landed) LOOPING: 4 (catalogd/registry, main, queryd/service, vectord/index_registry) ORBITING: 1 (truth — novel findings surfacing as surface ones fix) PLATEAU: 9 (scores flat with high confidence — diminishing returns) MIXED: 6 Loop thesis status ────────────────── A file's score rises only when the scrum confirms a real fix landed. No false positives yet across 3 iterations. Fixes applied to 3 files all raised their independent scores under the same adversarial prompt. Loop is measurable, not hand-wavy. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 12 + Cargo.toml | 1 + crates/aibridge/src/continuation.rs | 33 +- crates/aibridge/src/providers/openrouter.rs | 2 + crates/aibridge/src/routing.rs | 1 - crates/gateway/Cargo.toml | 1 + crates/gateway/src/access.rs | 4 + crates/gateway/src/auth.rs | 57 +- .../gateway/src/execution_loop/kb_context.rs | 388 ++++ crates/gateway/src/execution_loop/mod.rs | 1837 +++++++++++++++++ crates/gateway/src/main.rs | 21 +- crates/gateway/src/tools/registry.rs | 1 + crates/gateway/src/tools/service.rs | 2 +- crates/gateway/src/v1/mod.rs | 6 +- crates/gateway/src/v1/respond.rs | 150 ++ crates/gateway/src/v1/truth.rs | 49 + crates/ingestd/Cargo.toml | 1 + crates/ingestd/src/schema_evolution.rs | 2 + crates/ingestd/src/service.rs | 22 + crates/journald/src/journal.rs | 2 +- crates/queryd/src/service.rs | 1 - crates/queryd/src/workspace.rs | 3 - crates/truth/Cargo.toml | 10 + crates/truth/src/lib.rs | 523 +++++ crates/vectord/src/service.rs | 4 +- crates/vectord/src/store.rs | 3 +- docs/SCRUM_FIX_WAVE.md | 63 + docs/SCRUM_FORENSIC_PROMPT.md | 198 ++ docs/SCRUM_LOOP_NOTES.md | 94 + docs/SYSTEM_EVOLUTION_LAYERS.md | 83 + mcp-server/langfuse_bridge.ts | 174 ++ mcp-server/observer.ts | 78 +- ops/systemd/lakehouse.service | 26 + tests/real-world/scrum_master_pipeline.ts | 265 ++- ui/index.html | 134 ++ ui/server.ts | 327 +++ ui/ui.css | 407 ++++ ui/ui.js | 804 ++++++++ 38 files changed, 5736 insertions(+), 53 deletions(-) create mode 100644 crates/gateway/src/execution_loop/kb_context.rs create mode 100644 crates/gateway/src/execution_loop/mod.rs create mode 100644 crates/gateway/src/v1/respond.rs create mode 100644 crates/gateway/src/v1/truth.rs create mode 100644 crates/truth/Cargo.toml create mode 100644 crates/truth/src/lib.rs create mode 100644 docs/SCRUM_FIX_WAVE.md create mode 100644 docs/SCRUM_FORENSIC_PROMPT.md create mode 100644 docs/SCRUM_LOOP_NOTES.md create mode 100644 docs/SYSTEM_EVOLUTION_LAYERS.md create mode 
100644 mcp-server/langfuse_bridge.ts create mode 100644 ops/systemd/lakehouse.service create mode 100644 ui/index.html create mode 100644 ui/server.ts create mode 100644 ui/ui.css create mode 100644 ui/ui.js diff --git a/Cargo.lock b/Cargo.lock index 18e074a..bd6b280 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4091,6 +4091,7 @@ dependencies = [ "tracing", "tracing-opentelemetry", "tracing-subscriber", + "truth", "vectord", ] @@ -4679,6 +4680,7 @@ dependencies = [ "chrono", "croner", "csv", + "journald", "lopdf", "mysql_async", "object_store", @@ -8727,6 +8729,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "truth" +version = "0.1.0" +dependencies = [ + "serde", + "serde_json", + "tokio", + "tracing", +] + [[package]] name = "try-lock" version = "0.2.5" diff --git a/Cargo.toml b/Cargo.toml index a0315b7..d41d84f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ members = [ "crates/ui", "crates/lance-bench", "crates/vectord-lance", + "crates/truth", ] [workspace.dependencies] diff --git a/crates/aibridge/src/continuation.rs b/crates/aibridge/src/continuation.rs index 2c61eaa..a0189da 100644 --- a/crates/aibridge/src/continuation.rs +++ b/crates/aibridge/src/continuation.rs @@ -138,6 +138,17 @@ pub struct ContinuableOutcome { pub empty_retries: usize, pub continuations: usize, pub final_complete: bool, + /// Sum of `prompt_tokens` across every generator call made to + /// produce this outcome — including empty retries and continuations. + /// Lets callers (gateway execution loop, observability) stamp + /// accurate per-task usage without second-guessing the retry fan-out. + pub prompt_tokens: u32, + /// Sum of `completion_tokens` across every generator call. + pub completion_tokens: u32, + /// Total number of generator calls. `1 + empty_retries + + /// continuations` in the normal case; the field is explicit so + /// callers don't have to re-derive it. 
+ pub calls: u32, } fn make_request(opts: &ContinuableOpts, prompt: String, current_max: u32) -> GenerateRequest { @@ -175,11 +186,17 @@ pub async fn generate_continuable( let mut combined = String::new(); let mut empty_retries = 0usize; let mut continuations = 0usize; + let mut prompt_tokens: u32 = 0; + let mut completion_tokens: u32 = 0; + let mut calls: u32 = 0; // Phase 21(a) — empty-response backoff loop. for retry in 0..opts.max_empty_retries { let req = make_request(opts, prompt.to_string(), current_max); let resp = generator.generate_text(req).await?; + calls += 1; + prompt_tokens = prompt_tokens.saturating_add(resp.tokens_evaluated.unwrap_or(0) as u32); + completion_tokens = completion_tokens.saturating_add(resp.tokens_generated.unwrap_or(0) as u32); if !resp.text.trim().is_empty() { combined = resp.text; break; @@ -188,9 +205,7 @@ pub async fn generate_continuable( current_max = (current_max.saturating_mul(2)).min(opts.budget_cap); } - // Phase 21(b) — structural-completion continuation loop. Runs on - // the truncated-non-empty case; empty + exhausted retries falls - // through with empty combined and final_complete=false. + // Phase 21(b) — structural-completion continuation loop. for _ in 0..opts.max_continuations { if is_structurally_complete(&combined, opts.shape) { return Ok(ContinuableOutcome { @@ -198,17 +213,22 @@ pub async fn generate_continuable( empty_retries, continuations, final_complete: true, + prompt_tokens, + completion_tokens, + calls, }); } if combined.trim().is_empty() { // Nothing to continue from — continuing "" is identical to - // the initial call and would loop. Bail so the caller sees - // the failure rather than burning N extra calls. + // the initial call and would loop. 
break; } let cont_prompt = continuation_prompt(prompt, &combined); let req = make_request(opts, cont_prompt, current_max.min(opts.budget_cap)); let resp = generator.generate_text(req).await?; + calls += 1; + prompt_tokens = prompt_tokens.saturating_add(resp.tokens_evaluated.unwrap_or(0) as u32); + completion_tokens = completion_tokens.saturating_add(resp.tokens_generated.unwrap_or(0) as u32); combined.push_str(&resp.text); continuations += 1; } @@ -219,6 +239,9 @@ pub async fn generate_continuable( empty_retries, continuations, final_complete, + prompt_tokens, + completion_tokens, + calls, }) } diff --git a/crates/aibridge/src/providers/openrouter.rs b/crates/aibridge/src/providers/openrouter.rs index 9584dbe..3dfad1d 100644 --- a/crates/aibridge/src/providers/openrouter.rs +++ b/crates/aibridge/src/providers/openrouter.rs @@ -40,12 +40,14 @@ struct OpenRouterChoice { } #[derive(Deserialize)] +#[allow(dead_code)] struct OpenRouterMessageOut { role: String, content: String, } #[derive(Deserialize)] +#[allow(dead_code)] struct OpenRouterUsage { prompt_tokens: Option, completion_tokens: Option, diff --git a/crates/aibridge/src/routing.rs b/crates/aibridge/src/routing.rs index d627fc8..7c1bb77 100644 --- a/crates/aibridge/src/routing.rs +++ b/crates/aibridge/src/routing.rs @@ -1,5 +1,4 @@ use serde::{Deserialize, Serialize}; -use std::collections::HashMap; #[derive(Clone, Debug, Deserialize, Serialize)] pub struct RoutingRule { diff --git a/crates/gateway/Cargo.toml b/crates/gateway/Cargo.toml index d93ac47..505c613 100644 --- a/crates/gateway/Cargo.toml +++ b/crates/gateway/Cargo.toml @@ -12,6 +12,7 @@ aibridge = { path = "../aibridge" } ingestd = { path = "../ingestd" } vectord = { path = "../vectord" } journald = { path = "../journald" } +truth = { path = "../truth" } tokio = { workspace = true } axum = { workspace = true } serde = { workspace = true } diff --git a/crates/gateway/src/access.rs b/crates/gateway/src/access.rs index ede1c5d..6d30354 100644 --- 
a/crates/gateway/src/access.rs +++ b/crates/gateway/src/access.rs @@ -94,6 +94,7 @@ impl AccessControl { } /// Get an agent's role. + #[allow(dead_code)] pub async fn get_role(&self, agent: &str) -> Option { self.roles.read().await.get(agent).cloned() } @@ -113,6 +114,7 @@ impl AccessControl { } /// Determine which fields should be masked for an agent. + #[allow(dead_code)] pub async fn masked_fields( &self, agent: &str, @@ -138,6 +140,7 @@ impl AccessControl { } /// Log a query for audit. + #[allow(dead_code)] pub async fn log_query(&self, audit: QueryAudit) { self.audit_log.write().await.push(audit); } @@ -149,6 +152,7 @@ impl AccessControl { log[start..].iter().rev().cloned().collect() } + #[allow(dead_code)] pub fn is_enabled(&self) -> bool { self.enabled } diff --git a/crates/gateway/src/auth.rs b/crates/gateway/src/auth.rs index ff82bef..3c54782 100644 --- a/crates/gateway/src/auth.rs +++ b/crates/gateway/src/auth.rs @@ -5,30 +5,51 @@ use axum::{ response::Response, }; -/// API key auth middleware. Checks X-API-Key header against configured key. +// API key auth middleware. Checks X-API-Key header against configured key. +// Fixed P5-001 (2026-04-23): previously #[allow(dead_code)] — the function +// existed but was never layered onto the router, so [auth] enabled=true +// silently enforced nothing. Now wired via from_fn_with_state in main.rs. 
pub async fn api_key_auth( + axum::extract::State(expected): axum::extract::State, request: Request, next: Next, ) -> Result { - // Get the expected key from the request extensions (set by the layer) - let expected_key = request.extensions().get::().cloned(); - - if let Some(expected) = expected_key { - let provided = request - .headers() - .get("x-api-key") - .and_then(|v| v.to_str().ok()); - - match provided { - Some(key) if key == expected.0 => {} - _ => { - tracing::warn!("unauthorized request: missing or invalid API key"); - return Err(StatusCode::UNAUTHORIZED); - } - } + // /health stays public (LB/systemd probes). Every other route is gated. + if request.uri().path() == "/health" { + return Ok(next.run(request).await); } - Ok(next.run(request).await) + let provided = request + .headers() + .get("x-api-key") + .and_then(|v| v.to_str().ok()); + + // Constant-time-ish eq on the raw bytes; good enough for a shared-secret + // X-API-Key. Timing-attack resistance here matters less than the + // equivalent HMAC check would; adopt subtle crate if key-space grows. + match provided { + Some(key) if eq_ct(key.as_bytes(), expected.0.as_bytes()) => { + Ok(next.run(request).await) + } + _ => { + tracing::warn!( + path = %request.uri().path(), + "unauthorized request: missing or invalid API key", + ); + Err(StatusCode::UNAUTHORIZED) + } + } +} + +fn eq_ct(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + let mut diff: u8 = 0; + for (x, y) in a.iter().zip(b.iter()) { + diff |= x ^ y; + } + diff == 0 } /// Wrapper type for the API key, stored in request extensions. diff --git a/crates/gateway/src/execution_loop/kb_context.rs b/crates/gateway/src/execution_loop/kb_context.rs new file mode 100644 index 0000000..7d038c8 --- /dev/null +++ b/crates/gateway/src/execution_loop/kb_context.rs @@ -0,0 +1,388 @@ +//! KB context loader — reads recent signal from `data/_kb/*.jsonl` for +//! a given sig_hash + task_class and returns a compact summary. +//! +//! 
This is the "pipe to the overviewer" from the 2026-04-23 session: +//! the overseer tier (T3, gpt-oss:120b) consumes this context before +//! generating a correction, so its suggestions are informed by +//! historical cost / latency / outcome / prior-correction patterns +//! across ALL profiles that have run this task class — not just the +//! single current loop. +//! +//! Hot-swap profiles read the SAME pool. When a profile activates and +//! starts iterating, its KB context is the shared surface — one +//! profile's learning becomes every profile's starting point. +//! +//! Best-effort throughout: missing files, corrupt rows, empty +//! directories all produce an empty KbContext. The overseer works +//! fine with no history; we just can't seed it then. + +use serde::Serialize; +use std::path::Path; +use tokio::io::AsyncBufReadExt; + +/// Compact summary returned to the overseer. Bounded size — recent +/// outcomes + corrections plus rolled-up rates. Goal is to fit in a +/// prompt without eating the overseer's context budget. 
+#[derive(Debug, Clone, Default, Serialize)] +pub struct KbContext { + pub sig_hash: String, + pub task_class: String, + pub recent_outcomes: Vec, + pub recent_corrections: Vec, + pub success_rate: Option, + pub avg_turns: Option, + pub avg_latency_ms: Option, + pub total_observed: u32, +} + +#[derive(Debug, Clone, Serialize)] +pub struct OutcomeSummary { + pub created_at: String, + pub ok: bool, + pub polarity: String, + pub turns: u32, + pub latency_ms: u64, + pub total_tokens: u64, + pub error: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct CorrectionSummary { + pub created_at: String, + pub reason: String, + pub correction_preview: String, // first 300 chars + pub applied_at_turn: u32, +} + +const OUTCOMES_PATH: &str = "data/_kb/outcomes.jsonl"; +const CORRECTIONS_PATH: &str = "data/_kb/overseer_corrections.jsonl"; +const RECENT_OUTCOME_LIMIT: usize = 5; +const RECENT_CORRECTION_LIMIT: usize = 3; +const AGGREGATE_WINDOW: usize = 50; + +impl KbContext { + /// Build context from the default KB paths. + pub async fn load_for(sig_hash: &str, task_class: &str) -> Self { + Self::load_from( + sig_hash, task_class, + Path::new(OUTCOMES_PATH), Path::new(CORRECTIONS_PATH), + ).await + } + + /// Path-taking variant — tests inject tmp files without touching + /// the real KB directory (same pattern as append_outcomes_row_at). + pub async fn load_from( + sig_hash: &str, + task_class: &str, + outcomes_path: &Path, + corrections_path: &Path, + ) -> Self { + let mut ctx = KbContext { + sig_hash: sig_hash.to_string(), + task_class: task_class.to_string(), + ..Default::default() + }; + + // Scan outcomes — matches on sig_hash primary, task_class + // secondary (so different geos for the same task_class still + // contribute to aggregate rates even though they won't make + // the top-5 recent). The bounded window keeps scan cost + // linear in file size — we're reading tail only. 
+ let outcome_rows = tail_matching( + outcomes_path, AGGREGATE_WINDOW * 4, + |row| { + let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or(""); + let row_tc = row.get("task_class").and_then(|v| v.as_str()).unwrap_or(""); + row_sig == sig_hash || row_tc == task_class + }, + ).await; + + // Recent outcomes: exact sig_hash match first (strongest + // signal), then task_class fallback up to the limit. + let mut exact: Vec = Vec::new(); + let mut loose: Vec = Vec::new(); + for row in &outcome_rows { + let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or(""); + let summary = summarize_outcome(row); + if row_sig == sig_hash { exact.push(summary); } + else { loose.push(summary); } + } + ctx.recent_outcomes = exact.into_iter().rev().take(RECENT_OUTCOME_LIMIT).collect(); + if ctx.recent_outcomes.len() < RECENT_OUTCOME_LIMIT { + let need = RECENT_OUTCOME_LIMIT - ctx.recent_outcomes.len(); + ctx.recent_outcomes.extend(loose.into_iter().rev().take(need)); + } + + // Aggregate rates across the full matched window (both + // sig_hash and task_class matches — gives a stable rate even + // on sparse sig_hash history). + let window = outcome_rows.iter().rev().take(AGGREGATE_WINDOW); + let mut ok_count = 0u32; + let mut total = 0u32; + let mut turn_sum = 0u32; + let mut latency_sum = 0u64; + for row in window { + total += 1; + if row.get("ok").and_then(|v| v.as_bool()).unwrap_or(false) { ok_count += 1; } + turn_sum += row.get("turns").and_then(|v| v.as_u64()).unwrap_or(0) as u32; + latency_sum += row.get("usage") + .and_then(|u| u.get("latency_ms")) + .and_then(|v| v.as_u64()).unwrap_or(0); + } + if total > 0 { + ctx.total_observed = total; + ctx.success_rate = Some(ok_count as f64 / total as f64); + ctx.avg_turns = Some(turn_sum as f64 / total as f64); + ctx.avg_latency_ms = Some(latency_sum / total as u64); + } + + // Overseer corrections. Prefer sig_hash match; fall back to + // task_class. 
The overseer reading its OWN prior corrections + // is the main point — if the last 3 attempts produced + // corrections X, Y, Z, the new correction should acknowledge + // those patterns rather than suggest X for the fourth time. + let correction_rows = tail_matching( + corrections_path, RECENT_CORRECTION_LIMIT * 4, + |row| { + let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or(""); + let row_tc = row.get("task_class").and_then(|v| v.as_str()).unwrap_or(""); + row_sig == sig_hash || row_tc == task_class + }, + ).await; + let mut c_exact: Vec = Vec::new(); + let mut c_loose: Vec = Vec::new(); + for row in &correction_rows { + let row_sig = row.get("sig_hash").and_then(|v| v.as_str()).unwrap_or(""); + let summary = summarize_correction(row); + if row_sig == sig_hash { c_exact.push(summary); } + else { c_loose.push(summary); } + } + ctx.recent_corrections = c_exact.into_iter().rev().take(RECENT_CORRECTION_LIMIT).collect(); + if ctx.recent_corrections.len() < RECENT_CORRECTION_LIMIT { + let need = RECENT_CORRECTION_LIMIT - ctx.recent_corrections.len(); + ctx.recent_corrections.extend(c_loose.into_iter().rev().take(need)); + } + + ctx + } + + /// Compact string form for the overseer prompt. Deterministic + /// ordering + bounded length so prompt caching stays stable + /// across iterations on the same task. 
+ pub fn to_prompt_section(&self) -> String { + let mut s = String::new(); + s.push_str("## Knowledge Base Context\n"); + if let (Some(rate), Some(turns), Some(lat)) = (self.success_rate, self.avg_turns, self.avg_latency_ms) { + s.push_str(&format!( + "Across {} prior similar runs: success_rate={:.1}%, avg_turns={:.1}, avg_latency_ms={}\n", + self.total_observed, rate * 100.0, turns, lat, + )); + } else { + s.push_str("No prior similar runs recorded.\n"); + } + + if !self.recent_outcomes.is_empty() { + s.push_str(&format!("\nRecent {} outcomes:\n", self.recent_outcomes.len())); + for o in &self.recent_outcomes { + let err = o.error.as_deref().map(|e| format!(" — {}", truncate(e, 80))).unwrap_or_default(); + s.push_str(&format!( + " [{}] ok={} turns={} tokens={} lat={}ms{}\n", + &o.created_at[..19.min(o.created_at.len())], + o.ok, o.turns, o.total_tokens, o.latency_ms, err, + )); + } + } + + if !self.recent_corrections.is_empty() { + s.push_str(&format!("\nRecent {} overseer corrections (yours — don't repeat):\n", self.recent_corrections.len())); + for c in &self.recent_corrections { + s.push_str(&format!( + " [{}] turn={} reason={} correction={}\n", + &c.created_at[..19.min(c.created_at.len())], + c.applied_at_turn, + truncate(&c.reason, 40), + truncate(&c.correction_preview, 200), + )); + } + } + + s + } +} + +fn summarize_outcome(row: &serde_json::Value) -> OutcomeSummary { + OutcomeSummary { + created_at: row.get("created_at").and_then(|v| v.as_str()).unwrap_or("").to_string(), + ok: row.get("ok").and_then(|v| v.as_bool()).unwrap_or(false), + polarity: row.get("polarity").and_then(|v| v.as_str()).unwrap_or("").to_string(), + turns: row.get("turns").and_then(|v| v.as_u64()).unwrap_or(0) as u32, + latency_ms: row.get("usage").and_then(|u| u.get("latency_ms")) + .and_then(|v| v.as_u64()).unwrap_or(0), + total_tokens: row.get("usage").and_then(|u| u.get("total_tokens")) + .and_then(|v| v.as_u64()).unwrap_or(0), + error: row.get("error").and_then(|v| 
v.as_str()).map(String::from), + } +} + +fn summarize_correction(row: &serde_json::Value) -> CorrectionSummary { + let preview = row.get("correction").and_then(|v| v.as_str()).unwrap_or(""); + CorrectionSummary { + created_at: row.get("created_at").and_then(|v| v.as_str()).unwrap_or("").to_string(), + reason: row.get("reason").and_then(|v| v.as_str()).unwrap_or("").to_string(), + correction_preview: truncate(preview, 300), + applied_at_turn: row.get("applied_at_turn").and_then(|v| v.as_u64()).unwrap_or(0) as u32, + } +} + +fn truncate(s: &str, n: usize) -> String { + if s.len() <= n { s.to_string() } else { format!("{}…", &s[..n]) } +} + +/// Read a JSONL file from the tail, returning at most `limit` rows +/// that match `filter`. Missing file returns empty. Corrupt lines are +/// skipped. Limit is honored from the tail — a full-file scan with an +/// in-memory ring would be wasteful for large outcomes histories, but +/// we cap at reading the whole file and filtering post-hoc for now +/// (reverse-seek line iteration is a real engineering task and the +/// file is bounded by ingest rate; revisit when it bites). +async fn tail_matching( + path: &Path, + limit: usize, + filter: F, +) -> Vec +where + F: Fn(&serde_json::Value) -> bool, +{ + let Ok(file) = tokio::fs::File::open(path).await else { return Vec::new(); }; + let reader = tokio::io::BufReader::new(file); + let mut lines = reader.lines(); + let mut matches: Vec = Vec::new(); + while let Ok(Some(line)) = lines.next_line().await { + let Ok(v) = serde_json::from_str::(&line) else { continue }; + if filter(&v) { + matches.push(v); + if matches.len() > limit { + // Keep the most-recent window only — drop from the + // front as we go rather than buffering everything. 
+ matches.remove(0); + } + } + } + matches +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncWriteExt; + + async fn write_fixture(path: &Path, rows: Vec) { + if let Some(dir) = path.parent() { + tokio::fs::create_dir_all(dir).await.unwrap(); + } + let mut f = tokio::fs::OpenOptions::new() + .create(true).write(true).truncate(true).open(path).await.unwrap(); + for r in rows { + let mut line = serde_json::to_string(&r).unwrap(); + line.push('\n'); + f.write_all(line.as_bytes()).await.unwrap(); + } + } + + fn tmp_path(name: &str) -> std::path::PathBuf { + let nanos = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_nanos(); + std::env::temp_dir().join(format!("lh_kb_ctx_{}_{}_{}", std::process::id(), nanos, name)) + } + + #[tokio::test] + async fn empty_files_produce_empty_context() { + let op = tmp_path("outcomes.jsonl"); + let cp = tmp_path("corrections.jsonl"); + let ctx = KbContext::load_from("sig123", "staffing.fill", &op, &cp).await; + assert!(ctx.recent_outcomes.is_empty()); + assert!(ctx.recent_corrections.is_empty()); + assert!(ctx.success_rate.is_none()); + assert_eq!(ctx.total_observed, 0); + } + + #[tokio::test] + async fn exact_sig_hash_matches_take_priority() { + let op = tmp_path("outcomes.jsonl"); + let cp = tmp_path("corrections.jsonl"); + write_fixture(&op, vec![ + // Other sig_hash, same task_class — loose match + serde_json::json!({ + "sig_hash": "other", "task_class": "staffing.fill", + "ok": false, "polarity": "failure_pattern", "turns": 1, + "usage": {"latency_ms": 1000, "total_tokens": 100}, + "created_at": "2026-04-22T10:00:00Z", + }), + // Exact sig_hash — should lead + serde_json::json!({ + "sig_hash": "sig123", "task_class": "staffing.fill", + "ok": true, "polarity": "success_confirmation", "turns": 3, + "usage": {"latency_ms": 2000, "total_tokens": 500}, + "created_at": "2026-04-23T10:00:00Z", + }), + ]).await; + write_fixture(&cp, vec![]).await; + + let ctx = KbContext::load_from("sig123", 
"staffing.fill", &op, &cp).await; + assert_eq!(ctx.recent_outcomes.len(), 2); + assert_eq!(ctx.recent_outcomes[0].created_at, "2026-04-23T10:00:00Z"); + assert_eq!(ctx.recent_outcomes[0].ok, true); + assert_eq!(ctx.total_observed, 2); + assert!((ctx.success_rate.unwrap() - 0.5).abs() < 0.001); + } + + #[tokio::test] + async fn corrupt_rows_are_skipped() { + let op = tmp_path("outcomes.jsonl"); + let cp = tmp_path("corrections.jsonl"); + // Mix valid + invalid — invalid should be silently skipped. + if let Some(dir) = op.parent() { tokio::fs::create_dir_all(dir).await.unwrap(); } + tokio::fs::write(&op, "not json\n{\"sig_hash\":\"sig1\",\"task_class\":\"tc\",\"ok\":true,\"turns\":1,\"usage\":{}}\ngarbage\n").await.unwrap(); + write_fixture(&cp, vec![]).await; + let ctx = KbContext::load_from("sig1", "tc", &op, &cp).await; + assert_eq!(ctx.recent_outcomes.len(), 1); + } + + #[tokio::test] + async fn corrections_preview_is_truncated() { + let op = tmp_path("outcomes.jsonl"); + let cp = tmp_path("corrections.jsonl"); + let long = "x".repeat(500); + write_fixture(&op, vec![]).await; + write_fixture(&cp, vec![serde_json::json!({ + "sig_hash": "sig1", "task_class": "tc", + "reason": "abort", "correction": long, "applied_at_turn": 3, + "created_at": "2026-04-23T10:00:00Z", + })]).await; + let ctx = KbContext::load_from("sig1", "tc", &op, &cp).await; + assert_eq!(ctx.recent_corrections.len(), 1); + // 300-char cap + 3-byte UTF-8 ellipsis character = 303-byte worst case. 
+ assert!(ctx.recent_corrections[0].correction_preview.len() <= 303); + } + + #[test] + fn prompt_section_is_stable_for_empty_context() { + let ctx = KbContext::default(); + let s = ctx.to_prompt_section(); + assert!(s.contains("No prior similar runs recorded")); + } + + #[test] + fn prompt_section_reports_aggregate_rates() { + let ctx = KbContext { + total_observed: 10, + success_rate: Some(0.7), + avg_turns: Some(4.2), + avg_latency_ms: Some(45000), + ..Default::default() + }; + let s = ctx.to_prompt_section(); + assert!(s.contains("success_rate=70.0%")); + assert!(s.contains("avg_turns=4.2")); + assert!(s.contains("avg_latency_ms=45000")); + } +} diff --git a/crates/gateway/src/execution_loop/mod.rs b/crates/gateway/src/execution_loop/mod.rs new file mode 100644 index 0000000..a45e7af --- /dev/null +++ b/crates/gateway/src/execution_loop/mod.rs @@ -0,0 +1,1837 @@ +//! `ExecutionLoop` — the Rust port of `tests/multi-agent/orchestrator.ts`. +//! +//! Incremental port (2026-04-23). Pieces in order of landing: +//! 1. ✅ Playbook-boost context retrieval +//! 2. ✅ Executor turn via the shared ollama::chat path +//! 3. ✅ Reviewer turn + critique parse (this commit) +//! 4. ⬜ Tool-call dispatch — hybrid_search / sql / Phase-12 tools (orchestrator.ts:101-124) +//! 5. ✅ Consensus detection + drift counter (this commit) +//! 6. ⬜ Truth-layer gate (Phase 42 — refuse before burning tokens) +//! 7. ⬜ Validator call (Phase 43 stub) +//! 8. ⬜ Cloud escalation on repeat failure (T3 gpt-oss:120b) +//! 9. ⬜ Playbook seal + /vectors/playbook_memory/seed (orchestrator.ts:255-293) +//! 10. ⬜ KB write-through: outcomes + facts (Phase 22) + +pub mod kb_context; + +use serde::{Deserialize, Serialize}; + +use crate::v1::{respond::RespondRequest, V1State}; +use kb_context::KbContext; + +const DEFAULT_EXECUTOR_MODEL: &str = "qwen3.5:latest"; +const DEFAULT_REVIEWER_MODEL: &str = "qwen3:latest"; +const DEFAULT_MAX_TURNS: u32 = 12; +/// Matches orchestrator.ts:31. 
Three consecutive drift flags OR tool +/// errors aborts the loop — the executor isn't self-correcting. +const MAX_CONSECUTIVE_DRIFTS: u32 = 3; + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct LogEntry { + pub turn: u32, + pub role: String, + pub model: String, + pub kind: String, + pub content: serde_json::Value, + pub at: String, +} + +impl LogEntry { + fn new(turn: u32, role: &str, model: &str, kind: &str, content: serde_json::Value) -> Self { + Self { + turn, + role: role.to_string(), + model: model.to_string(), + kind: kind.to_string(), + content, + at: chrono::Utc::now().to_rfc3339(), + } + } +} + +/// Action = what an agent returns on one turn. PORT FROM agent.ts:312. +/// Strict-shape enum so the executor/reviewer can't wedge the loop +/// with ambiguous output — either it parses, or `parse_action` throws +/// and the orchestrator appends an error turn. +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum Action { + Plan { steps: Vec }, + ToolCall { tool: String, args: serde_json::Value, #[serde(default)] rationale: String }, + ProposeDone { fills: Vec, #[serde(default)] rationale: String }, + Critique { verdict: Verdict, #[serde(default)] notes: String }, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum Verdict { + Continue, + Drift, + ApproveDone, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct Fill { + pub candidate_id: String, + pub name: String, + /// Optional — legacy models still emit it. agent.ts:321 rationale. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub reason: Option, +} + +pub enum RespondOutcome { + Ok { artifact: serde_json::Value, log: Vec }, + Failed { reason: String, log: Vec }, + #[allow(dead_code)] // reserved for Phase 42 truth-gate (step 6) + Blocked { reason: String, log: Vec }, +} + +impl RespondOutcome { + pub fn artifact(&self) -> serde_json::Value { + match self { + Self::Ok { artifact, .. } => artifact.clone(), + _ => serde_json::Value::Null, + } + } + pub fn into_log(self) -> Vec { + match self { + Self::Ok { log, .. } | Self::Failed { log, .. } | Self::Blocked { log, .. } => log, + } + } +} + +pub struct ExecutionLoop { + state: V1State, + req: RespondRequest, + log: Vec, + turns_used: u32, + stats: LoopStats, + /// Phase 20 budget — at most one T3 overseer call per loop + /// invocation. Cloud calls cost real money and the whole point is + /// "hyperfocus local + one strategic cloud nudge", not a cloud + /// retry loop. See docs/CONTROL_PLANE_PRD.md §4.3. + overseer_called: bool, +} + +/// Per-invocation usage accumulator. Separate from the gateway-wide +/// `V1State.usage` (which is lifetime-across-all-requests) so the +/// outcomes row can stamp this-task tokens/latency without subtracting +/// two snapshots. 
+#[derive(Default, Clone, Serialize)] +pub struct LoopStats { + pub requests: u64, + pub prompt_tokens: u64, + pub completion_tokens: u64, + pub total_tokens: u64, + pub latency_ms: u64, +} + +impl ExecutionLoop { + pub fn new(state: V1State, req: RespondRequest) -> Self { + Self { + state, req, + log: Vec::new(), + turns_used: 0, + stats: LoopStats::default(), + overseer_called: false, + } + } + + pub fn turns_used(&self) -> u32 { + self.turns_used + } + + pub async fn run(&mut self) -> Result { + let outcome = self.run_inner().await?; + Ok(self.finalize(outcome).await) + } + + async fn run_inner(&mut self) -> Result { + let executor_model = self.req.executor_model + .as_deref().unwrap_or(DEFAULT_EXECUTOR_MODEL).to_string(); + let reviewer_model = self.req.reviewer_model + .as_deref().unwrap_or(DEFAULT_REVIEWER_MODEL).to_string(); + let max_turns = self.req.max_turns.unwrap_or(DEFAULT_MAX_TURNS); + + // --- (6) TRUTH GATE — PORT FROM Phase 42 (TODO) --- + + // --- (1) PLAYBOOK BOOST --- + let boost = self.fetch_playbook_boost(&self.req.operation).await.unwrap_or_default(); + if !boost.is_empty() { + self.append(LogEntry::new( + 0, "system", "playbook_memory", "boost_loaded", + serde_json::json!({ "count": boost.len(), "preview": boost.iter().take(3).collect::>() }), + )); + } + + let mut consecutive_drifts: u32 = 0; + + // --- MAIN TURN LOOP --- + for turn in 1..=max_turns { + self.turns_used = turn; + + // --- (2) EXECUTOR TURN --- + let executor_prompt = build_executor_prompt(&self.req, &boost, &self.log); + let executor_raw = self.chat_once(&executor_model, &executor_prompt, 0.2, false).await?; + let exec_action = match parse_action(&executor_raw, Role::Executor) { + Ok(a) => a, + Err(e) => { + self.append(LogEntry::new( + turn, "executor", &executor_model, "error", + serde_json::json!({ "message": e, "raw": truncate(&executor_raw, 400) }), + )); + return Ok(RespondOutcome::Failed { + reason: format!("executor parse failure on turn {turn}: {e}"), + log: 
std::mem::take(&mut self.log), + }); + } + }; + self.append(LogEntry::new( + turn, "executor", &executor_model, action_kind(&exec_action), action_content(&exec_action), + )); + + // --- (4) TOOL DISPATCH — PORT FROM orchestrator.ts:101-124 --- + // Soft-fail: a tool error is a log entry, not a loop abort. + // The executor reads its own error next turn and self-corrects + // (orchestrator.ts:169-189). Only MAX_CONSECUTIVE_DRIFTS tool + // errors in a row → hard abort. + if let Action::ToolCall { tool, args, .. } = &exec_action { + match self.dispatch_tool(tool, args).await { + Ok(result) => { + let trimmed = trim_result(&result); + self.append(LogEntry::new( + turn, "executor", &executor_model, "tool_result", trimmed, + )); + } + Err(e) => { + self.append(LogEntry::new( + turn, "executor", &executor_model, "tool_result", + serde_json::json!({ "error": e, "tool": tool, "args": args }), + )); + consecutive_drifts += 1; + if consecutive_drifts >= MAX_CONSECUTIVE_DRIFTS { + return Ok(RespondOutcome::Failed { + reason: format!( + "aborting — {MAX_CONSECUTIVE_DRIFTS} consecutive tool errors, executor can't self-correct" + ), + log: std::mem::take(&mut self.log), + }); + } + } + } + } + + // --- (3) REVIEWER TURN --- + let reviewer_prompt = build_reviewer_prompt(&self.req, &self.log); + let reviewer_raw = self.chat_once(&reviewer_model, &reviewer_prompt, 0.1, false).await?; + let rev_action = match parse_action(&reviewer_raw, Role::Reviewer) { + Ok(a) => a, + Err(e) => { + self.append(LogEntry::new( + turn, "reviewer", &reviewer_model, "error", + serde_json::json!({ "message": e, "raw": truncate(&reviewer_raw, 400) }), + )); + return Ok(RespondOutcome::Failed { + reason: format!("reviewer parse failure on turn {turn}: {e}"), + log: std::mem::take(&mut self.log), + }); + } + }; + self.append(LogEntry::new( + turn, "reviewer", &reviewer_model, "critique", action_content(&rev_action), + )); + + let verdict = match &rev_action { + Action::Critique { verdict, .. 
} => verdict.clone(), + _ => { + return Ok(RespondOutcome::Failed { + reason: format!("reviewer emitted non-critique on turn {turn}"), + log: std::mem::take(&mut self.log), + }); + } + }; + + // --- (5) CONSENSUS DETECTION + DRIFT COUNTER --- + if verdict == Verdict::Drift { + consecutive_drifts += 1; + // --- (8) OVERSEER ESCALATION --- + // One chance before abort: when the local loop is + // about to give up, call the T3 overseer with the KB + // context (what worked / didn't on this task class + // historically) + the recent log tail. The overseer + // emits a correction which feeds back into the next + // executor turn. Only fires once per loop to honor + // Phase 20 "1-3 calls/scenario" budget. + if consecutive_drifts == MAX_CONSECUTIVE_DRIFTS.saturating_sub(1) + && !self.overseer_called + { + if let Err(e) = self.escalate_to_overseer(turn, "drift_approaching_abort").await { + tracing::warn!("overseer escalation failed: {e}"); + } + // Reset so the executor gets one clean turn with + // the correction in context before we re-evaluate. 
+ consecutive_drifts = 0; + } else if consecutive_drifts >= MAX_CONSECUTIVE_DRIFTS { + return Ok(RespondOutcome::Failed { + reason: format!( + "aborting — {MAX_CONSECUTIVE_DRIFTS} consecutive drift flags, executor can't self-correct (overseer_called={})", + self.overseer_called, + ), + log: std::mem::take(&mut self.log), + }); + } + } else { + consecutive_drifts = 0; + } + + if let (Action::ProposeDone { fills, rationale }, Verdict::ApproveDone) + = (&exec_action, &verdict) + { + let target_count = spec_target_count(&self.req.spec); + if target_count > 0 && fills.len() as u64 != target_count { + return Ok(RespondOutcome::Failed { + reason: format!( + "consensus malformed — {} fills vs target {}", + fills.len(), target_count + ), + log: std::mem::take(&mut self.log), + }); + } + self.append(LogEntry::new( + turn, "reviewer", &reviewer_model, "consensus_done", + serde_json::json!({ "fills": fills }), + )); + // Seal + write-through runs in `finalize` after this + // returns — outcomes row + playbook_memory seed with + // retries + stats stamping all land there. + let artifact = serde_json::json!({ + "fills": fills, + "approach": rationale, + "turns": turn, + }); + return Ok(RespondOutcome::Ok { + artifact, + log: std::mem::take(&mut self.log), + }); + } + } + + Ok(RespondOutcome::Failed { + reason: format!("no consensus after {max_turns} turns — task incomplete"), + log: std::mem::take(&mut self.log), + }) + } + + fn append(&mut self, e: LogEntry) { + tracing::debug!(turn = e.turn, role = %e.role, kind = %e.kind, "execution_loop"); + self.log.push(e); + } + + /// Dispatch: model name prefix → provider. + /// Local path uses Phase 21 `generate_continuable` (auto-continuation, + /// retry on empty thinking-model response). Cloud path hits + /// Ollama Cloud directly — no continuation since cloud budgets are + /// generous and Phase 21's Rust port is local-only. 
Truncation on + /// cloud surfaces as a parse failure in the loop; that's fail-fast + /// and a real signal (we want to know when cloud didn't finish). + async fn chat_once( + &mut self, + model: &str, + prompt: &str, + temperature: f64, + think: bool, + ) -> Result { + let is_cloud = is_cloud_model(model); + let provider = if is_cloud { "ollama_cloud" } else { "ollama" }; + let start_time = chrono::Utc::now(); + let started = std::time::Instant::now(); + + let (text, prompt_tokens, completion_tokens, calls) = if is_cloud { + let key = self.state.ollama_cloud_key.as_deref().ok_or_else(|| { + format!("cloud model {model} requested but OLLAMA_CLOUD_KEY not configured") + })?; + use crate::v1::{ChatRequest, Message}; + // Cloud path: retry up to 3× on empty response. gpt-oss:* + // models sometimes return empty after internal reasoning + // — this is the cloud-side analog of Phase 21's empty- + // response backoff, inlined since generate_continuable is + // local-only. + let mut text = String::new(); + let mut tokens_p = 0u32; + let mut tokens_c = 0u32; + let mut attempts = 0u32; + for attempt in 0..3 { + attempts = attempt + 1; + let req = ChatRequest { + model: model.to_string(), + messages: vec![Message { role: "user".into(), content: prompt.to_string() }], + temperature: Some(temperature), + max_tokens: None, + stream: Some(false), + think: Some(think), + provider: Some("ollama_cloud".into()), + }; + let resp = crate::v1::ollama_cloud::chat(key, &req).await + .map_err(|e| format!("ollama_cloud: {e}"))?; + tokens_p = tokens_p.saturating_add(resp.usage.prompt_tokens); + tokens_c = tokens_c.saturating_add(resp.usage.completion_tokens); + let t = resp.choices.into_iter().next() + .map(|c| c.message.content).unwrap_or_default(); + if !t.trim().is_empty() { + text = t; + break; + } + tracing::warn!(model = %model, attempt, "cloud returned empty, retrying"); + } + (text, tokens_p, tokens_c, attempts) + } else { + use aibridge::continuation::{generate_continuable, 
ContinuableOpts, ResponseShape}; + let mut opts = ContinuableOpts::new(model); + opts.temperature = Some(temperature); + opts.think = Some(think); + opts.shape = ResponseShape::Json; + let outcome = generate_continuable(&self.state.ai_client, prompt, &opts).await?; + if outcome.empty_retries > 0 || outcome.continuations > 0 || !outcome.final_complete { + tracing::info!( + model = %model, + empty_retries = outcome.empty_retries, + continuations = outcome.continuations, + final_complete = outcome.final_complete, + calls = outcome.calls, + "execution_loop.chat_once: continuation telemetry" + ); + } + (outcome.text, outcome.prompt_tokens, outcome.completion_tokens, outcome.calls) + }; + + let elapsed_ms = started.elapsed().as_millis() as u64; + let end_time = chrono::Utc::now(); + + // Langfuse trace — uniform across local + cloud, provider tag + // lets the bridge / observer differentiate downstream. + if let Some(lf) = &self.state.langfuse { + use crate::v1::{langfuse_trace::ChatTrace, Message}; + lf.emit_chat(ChatTrace { + provider: provider.to_string(), + model: model.to_string(), + input: vec![Message { role: "user".into(), content: prompt.to_string() }], + output: text.clone(), + prompt_tokens, + completion_tokens, + temperature: Some(temperature), + max_tokens: None, + think: Some(think), + start_time: start_time.to_rfc3339(), + end_time: end_time.to_rfc3339(), + latency_ms: elapsed_ms, + }); + } + + // Per-task stats (stamps the outcomes row) + gateway-wide + // /v1/usage counters. Both updated uniformly; the by_provider + // split lets operators see the local/cloud mix per task. 
+ let total_tokens = (prompt_tokens + completion_tokens) as u64; + self.stats.requests = self.stats.requests.saturating_add(calls as u64); + self.stats.prompt_tokens = self.stats.prompt_tokens.saturating_add(prompt_tokens as u64); + self.stats.completion_tokens = self.stats.completion_tokens.saturating_add(completion_tokens as u64); + self.stats.total_tokens = self.stats.total_tokens.saturating_add(total_tokens); + self.stats.latency_ms += elapsed_ms; + + { + let mut u = self.state.usage.write().await; + u.requests = u.requests.saturating_add(calls as u64); + u.prompt_tokens = u.prompt_tokens.saturating_add(prompt_tokens as u64); + u.completion_tokens = u.completion_tokens.saturating_add(completion_tokens as u64); + u.total_tokens = u.total_tokens.saturating_add(total_tokens); + let pu = u.by_provider.entry(provider.to_string()).or_default(); + pu.requests = pu.requests.saturating_add(calls as u64); + pu.prompt_tokens = pu.prompt_tokens.saturating_add(prompt_tokens as u64); + pu.completion_tokens = pu.completion_tokens.saturating_add(completion_tokens as u64); + pu.total_tokens = pu.total_tokens.saturating_add(total_tokens); + } + + Ok(text) + } + + /// Final step for every terminal path — write the outcomes row (with + /// the full indicator set stamped) and, on success, seed the playbook + /// back into memory so the next similar task hits the fast path. + /// The write-through is what closes the 0→85% compounding loop. + /// + /// Both writes are best-effort: KB-write failure emits a warn but + /// doesn't convert an Ok into a Failed. The caller's response should + /// reflect what the loop actually accomplished, not whether the log + /// sink was reachable. + async fn finalize(&mut self, mut outcome: RespondOutcome) -> RespondOutcome { + // PORT FROM orchestrator.ts:251-293. On consensus, write-through + // to playbook_memory so the next semantically-similar query + // surfaces the endorsed names. + let seed_outcome = if let RespondOutcome::Ok { artifact, .. 
} = &outcome { + match self.seed_playbook_memory(artifact).await { + Ok(v) => Some(v), + Err(e) => { + tracing::warn!("playbook_memory seed failed: {e}"); + Some(serde_json::json!({ "error": e })) + } + } + } else { + None + }; + + // Append the outcomes row — polarity derived from the variant, + // indicators stamped from loop state. schema_version=2 flags + // this as a per-task row (distinct from the scenario-level rows + // already in outcomes.jsonl). + let outcomes_row = build_outcomes_row( + &self.req, &self.stats, self.turns_used, + self.overseer_called, + &outcome, seed_outcome.clone(), + ); + if let Err(e) = append_outcomes_row(&outcomes_row).await { + tracing::warn!("outcomes.jsonl append failed: {e}"); + } + + // Enrich the response artifact with the seed + usage info so + // the API caller can see compounding state without a second call. + if let RespondOutcome::Ok { artifact, .. } = &mut outcome { + if let Some(obj) = artifact.as_object_mut() { + if let Some(seed) = seed_outcome { + obj.insert("playbook_seed".into(), seed); + } + obj.insert("usage".into(), serde_json::to_value(&self.stats).unwrap_or_default()); + obj.insert("sig_hash".into(), serde_json::Value::String(sig_hash(&self.req))); + } + } + + outcome + } + + /// PORT FROM orchestrator.ts:255-293. Three retries with geometric + /// backoff. `append: true` routes through Phase 26 upsert semantics + /// (ADD/UPDATE/NOOP on operation+day+city+state), so a re-seal of + /// the same fill on the same day merges names instead of duplicating. 
+ async fn seed_playbook_memory( + &self, + artifact: &serde_json::Value, + ) -> Result { + let fills = artifact.get("fills").and_then(|v| v.as_array()) + .ok_or_else(|| "artifact missing fills".to_string())?; + let endorsed_names: Vec = fills.iter() + .filter_map(|f| f.get("name").and_then(|v| v.as_str()).map(String::from)) + .collect(); + if endorsed_names.is_empty() { + return Err("no endorsed_names to seed".into()); + } + + // Seed context is what the embedding model sees — carry + // task-semantic content (role, city, scenario) not orchestrator + // bookkeeping. Falls back to approach_hint, then to a built + // string from spec. Matches orchestrator.ts:262-263. + let approach = artifact.get("approach").and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .unwrap_or("multi-agent → hybrid search") + .to_string(); + let context = seed_context(&self.req); + + let body = serde_json::json!({ + "operation": self.req.operation, + "approach": approach, + "context": context, + "endorsed_names": endorsed_names, + "append": true, + }); + + let client = reqwest::Client::new(); + let mut last_err = String::new(); + for attempt in 0..3u32 { + match client.post("http://127.0.0.1:3100/vectors/playbook_memory/seed") + .json(&body).send().await + { + Ok(resp) => { + let status = resp.status(); + let text = resp.text().await.unwrap_or_default(); + if status.is_success() { + let j: serde_json::Value = serde_json::from_str(&text) + .unwrap_or(serde_json::json!({ "raw": text })); + return Ok(j); + } + last_err = format!("{}: {}", status, truncate(&text, 200)); + } + Err(e) => last_err = format!("transport: {e}"), + } + // Geometric backoff: 1s, 2s, 3s (matches orchestrator.ts:281). + tokio::time::sleep(std::time::Duration::from_secs(attempt as u64 + 1)).await; + } + Err(format!("after 3 attempts: {last_err}")) + } + + /// Phase 20 step (8) — T3 overseer escalation. 
+ /// + /// When the local executor/reviewer loop can't self-correct, call + /// the cloud overseer (`gpt-oss:120b` via Ollama Cloud) with (a) + /// the KB context — recent outcomes + prior corrections for this + /// sig_hash + task_class, across every profile that has run it — + /// and (b) the recent log tail. Its output is appended as a + /// `system` role turn so the next executor generation sees it, + /// AND written to `data/_kb/overseer_corrections.jsonl` so every + /// future profile activation reads from the same learning pool. + /// + /// This is the "pipe to the overviewer" piece from 2026-04-23 — + /// the overseer is now a first-class KB consumer AND producer, not + /// a one-shot correction oracle. + async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> { + let Some(cloud_key) = self.state.ollama_cloud_key.clone() else { + return Err("OLLAMA_CLOUD_KEY not configured — skipping escalation".into()); + }; + + let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await; + let prompt = build_overseer_prompt(&self.req, &kb, &self.log, reason); + + let started = std::time::Instant::now(); + let start_time = chrono::Utc::now(); + let chat_req = crate::v1::ChatRequest { + model: "gpt-oss:120b".to_string(), + messages: vec![crate::v1::Message { + role: "user".into(), + content: prompt.clone(), + }], + temperature: Some(0.1), + max_tokens: None, + stream: Some(false), + think: Some(true), // overseer KEEPS thinking (Phase 20 rule) + provider: Some("ollama_cloud".into()), + }; + let resp = crate::v1::ollama_cloud::chat(&cloud_key, &chat_req).await + .map_err(|e| format!("ollama_cloud: {e}"))?; + let latency_ms = started.elapsed().as_millis() as u64; + let end_time = chrono::Utc::now(); + let correction_text = resp.choices.into_iter().next() + .map(|c| c.message.content).unwrap_or_default(); + + // Stamp per-task stats — cloud call counts against the same + // usage counter so `/v1/usage` shows cloud token spend 
too. + self.stats.requests = self.stats.requests.saturating_add(1); + self.stats.prompt_tokens = self.stats.prompt_tokens.saturating_add(resp.usage.prompt_tokens as u64); + self.stats.completion_tokens = self.stats.completion_tokens.saturating_add(resp.usage.completion_tokens as u64); + self.stats.total_tokens = self.stats.total_tokens.saturating_add(resp.usage.total_tokens as u64); + self.stats.latency_ms = self.stats.latency_ms.saturating_add(latency_ms); + + // Langfuse trace for the overseer call (same pipe that feeds + // the observer/KB, so this correction's cost lands in the KB + // too — closing the loop). + if let Some(lf) = &self.state.langfuse { + use crate::v1::langfuse_trace::ChatTrace; + lf.emit_chat(ChatTrace { + provider: "ollama_cloud".into(), + model: "gpt-oss:120b".into(), + input: vec![crate::v1::Message { role: "user".into(), content: prompt.clone() }], + output: correction_text.clone(), + prompt_tokens: resp.usage.prompt_tokens, + completion_tokens: resp.usage.completion_tokens, + temperature: Some(0.1), + max_tokens: None, + think: Some(true), + start_time: start_time.to_rfc3339(), + end_time: end_time.to_rfc3339(), + latency_ms, + }); + } + + // Append to the transcript so the next executor turn sees it. + self.append(LogEntry::new( + turn, "system", "gpt-oss:120b", "overseer_correction", + serde_json::json!({ + "reason": reason, + "correction": correction_text, + "kb_context_summary": { + "total_observed": kb.total_observed, + "success_rate": kb.success_rate, + "prior_corrections": kb.recent_corrections.len(), + }, + }), + )); + + // Write to the KB — read by KbContext::load_for on every + // subsequent escalation, AND by any profile that iterates on + // this task class later. 
+ let row = serde_json::json!({ + "schema_version": 2, + "source_service": "v1.respond.overseer", + "sig_hash": sig_hash(&self.req), + "task_class": self.req.task_class, + "operation": self.req.operation, + "reason": reason, + "model": "gpt-oss:120b", + "correction": correction_text, + "applied_at_turn": turn, + "kb_context_used": kb, + "usage": { + "prompt_tokens": resp.usage.prompt_tokens, + "completion_tokens": resp.usage.completion_tokens, + "total_tokens": resp.usage.total_tokens, + "latency_ms": latency_ms, + }, + "created_at": chrono::Utc::now().to_rfc3339(), + }); + if let Err(e) = append_corrections_row(&row).await { + tracing::warn!("overseer_corrections.jsonl append failed: {e}"); + } + + self.overseer_called = true; + Ok(()) + } + + async fn fetch_playbook_boost(&self, operation: &str) -> Result, ()> { + let body = serde_json::json!({ "operation": operation, "top_k": 5 }); + let client = reqwest::Client::new(); + let resp = client + .post("http://127.0.0.1:3100/vectors/playbook_memory/search") + .json(&body) + .send().await.map_err(|_| ())?; + if !resp.status().is_success() { + return Ok(Vec::new()); + } + let j: serde_json::Value = resp.json().await.map_err(|_| ())?; + Ok(j.get("boosts").and_then(|v| v.as_array()).cloned().unwrap_or_default()) + } + + /// PORT FROM orchestrator.ts:101-124 + agent.ts:348-364. + /// Three tool surfaces unified behind one dispatcher: + /// - `hybrid_search` → `POST /vectors/hybrid` (pseudo-tool, not in + /// the Phase 12 registry — lives in vectord) + /// - `sql` → `POST /query/sql` with a SELECT-only guard + /// - anything else → `POST /tools/{name}/call` via the Phase 12 + /// registry (permissions, audit, validation all happen there) + /// + /// Loopback HTTP on 127.0.0.1:3100 on purpose: mirrors the TS + /// behavior exactly (every call goes through the same middleware, + /// auth, audit, CORS path), and lets us swap to in-process routing + /// later without changing the dispatch contract. 
+ async fn dispatch_tool( + &self, + tool: &str, + args: &serde_json::Value, + ) -> Result { + let client = reqwest::Client::new(); + match tool { + "hybrid_search" => { + let sql_filter = args.get("sql_filter").and_then(|v| v.as_str()) + .ok_or_else(|| "hybrid_search needs sql_filter (string)".to_string())?; + let question = args.get("question").and_then(|v| v.as_str()) + .ok_or_else(|| "hybrid_search needs question (string)".to_string())?; + let index_name = args.get("index_name").and_then(|v| v.as_str()) + .ok_or_else(|| "hybrid_search needs index_name (string)".to_string())?; + // Accept either `top_k` or `k` from the model — same + // tolerance as orchestrator.ts. Default 10. + let top_k = args.get("top_k").or_else(|| args.get("k")) + .and_then(|v| v.as_u64()).unwrap_or(10); + let body = serde_json::json!({ + "sql_filter": sql_filter, + "question": question, + "index_name": index_name, + "top_k": top_k, + "generate": false, + }); + let resp = client.post("http://127.0.0.1:3100/vectors/hybrid") + .json(&body).send().await + .map_err(|e| format!("hybrid_search transport: {e}"))?; + parse_tool_response(resp).await + } + "sql" => { + let query = args.get("query").and_then(|v| v.as_str()) + .ok_or_else(|| "sql needs query (string)".to_string())?; + // SELECT-only guard mirroring orchestrator.ts:119. The + // tool is read-only; any mutation needs the Phase 12 + // registry + its permission + audit flow, not the + // unchecked raw sql surface. + if !query.trim_start().to_ascii_uppercase().starts_with("SELECT") { + return Err(format!("sql tool allows SELECT only: {}", truncate(query, 120))); + } + let body = serde_json::json!({ "sql": query, "format": "json" }); + let resp = client.post("http://127.0.0.1:3100/query/sql") + .json(&body).send().await + .map_err(|e| format!("sql transport: {e}"))?; + parse_tool_response(resp).await + } + other => { + // Phase 12 registry — any registered staffing tool lands here. 
+ // Body shape matches agent.ts::callTool (POST /tools/{name}/call + // with {params, agent}). + let url = format!("http://127.0.0.1:3100/tools/{}/call", other); + let body = serde_json::json!({ + "params": args, + "agent": "v1.respond", + }); + let resp = client.post(&url).json(&body).send().await + .map_err(|e| format!("{other} transport: {e}"))?; + parse_tool_response(resp).await + } + } + } +} + +/// Read a tool response body into JSON, or surface the status + text +/// as an error. Keeps the `error` path structurally identical whether +/// the transport fails (caller handles), the server 5xx's (here), or +/// the tool returns a 200 with an `{"error":"..."}` payload (caller +/// surfaces to the executor as normal tool_result content). +async fn parse_tool_response(resp: reqwest::Response) -> Result { + let status = resp.status(); + let text = resp.text().await.map_err(|e| format!("body read: {e}"))?; + if !status.is_success() { + return Err(format!("{}: {}", status, truncate(&text, 300))); + } + serde_json::from_str(&text) + .map_err(|e| format!("non-JSON response: {e} | body: {}", truncate(&text, 200))) +} + +fn seed_context(req: &RespondRequest) -> String { + let hint = spec_field_str(&req.spec, "approach_hint"); + if !hint.is_empty() { + return hint.to_string(); + } + let role = spec_field_str(&req.spec, "target_role"); + let city = spec_field_str(&req.spec, "target_city"); + let state = spec_field_str(&req.spec, "target_state"); + if !role.is_empty() && !city.is_empty() { + return format!("{role} fill in {city}, {state}"); + } + // Non-staffing task class — use the operation verbatim. The + // embedding surface still works; it just has less geo signal. + req.operation.clone() +} + +/// Stable rollup key. PORT FROM the sig_hash usage in observer/kb. +/// DefaultHasher isn't cryptographic but is stable for a single +/// deployment and matches the 16-char hex format already in +/// outcomes.jsonl. Swap to sha256 if cross-deployment stability is +/// needed. 
+fn sig_hash(req: &RespondRequest) -> String { + use std::hash::{Hash, Hasher}; + let mut h = std::collections::hash_map::DefaultHasher::new(); + req.task_class.hash(&mut h); + req.operation.hash(&mut h); + spec_field_str(&req.spec, "target_role").hash(&mut h); + spec_field_str(&req.spec, "target_city").hash(&mut h); + spec_field_str(&req.spec, "target_state").hash(&mut h); + format!("{:016x}", h.finish()) +} + +/// Build the per-task outcomes row with every indicator the +/// 2026-04-23 audit called out. schema_version=2 distinguishes +/// per-task rows from the scenario-level rows already in the file. +fn build_outcomes_row( + req: &RespondRequest, + stats: &LoopStats, + turns_used: u32, + overseer_called: bool, + outcome: &RespondOutcome, + seed_outcome: Option, +) -> serde_json::Value { + let (ok, polarity, error) = match outcome { + RespondOutcome::Ok { .. } => (true, "success_confirmation", serde_json::Value::Null), + RespondOutcome::Failed { reason, .. } => (false, "failure_pattern", serde_json::Value::String(reason.clone())), + RespondOutcome::Blocked { reason, .. } => (false, "truth_block", serde_json::Value::String(reason.clone())), + }; + let fills = match outcome { + RespondOutcome::Ok { artifact, .. } => artifact.get("fills").cloned().unwrap_or(serde_json::Value::Null), + _ => serde_json::Value::Null, + }; + + // Correction effectiveness: if the overseer was called this loop, + // the outcome tells us whether the correction helped. OK = it + // worked, Failed/Blocked = it didn't. When overseer wasn't called, + // these fields stay null so aggregators can filter cleanly. 
+ let correction_applied = overseer_called; + let correction_effective = if overseer_called { + serde_json::Value::Bool(ok) + } else { + serde_json::Value::Null + }; + + serde_json::json!({ + "schema_version": 2, + "source_service": "v1.respond", + "sig_hash": sig_hash(req), + "task_class": req.task_class, + "operation": req.operation, + "ok": ok, + "polarity": polarity, + "iterations": turns_used, + "turns": turns_used, + "fills": fills, + "models": { + "executor": req.executor_model.clone().unwrap_or_else(|| DEFAULT_EXECUTOR_MODEL.to_string()), + "reviewer": req.reviewer_model.clone().unwrap_or_else(|| DEFAULT_REVIEWER_MODEL.to_string()), + }, + "usage": stats, + "provider": "ollama", + "playbook_seed": seed_outcome, + "truth_rule_citations": [], // Phase 42 gate hook — empty until wired + "validator_report": null, // Phase 43 hook + "correction_applied": correction_applied, + "correction_effective": correction_effective, + "error": error, + "created_at": chrono::Utc::now().to_rfc3339(), + }) +} + +/// PORT FROM Phase 20's T3 overseer prompt shape. The overseer sees: +/// - Task + spec +/// - KB context (historical outcomes + prior corrections across +/// every profile that ran this task class) +/// - Recent log tail (last 12 turns) +/// - Specific reason the local loop escalated +/// It returns prose guidance the executor reads next turn. We do NOT +/// ask it to emit a JSON action — the executor still owns the final +/// shape. The overseer is a strategist, not a tool-caller. +fn build_overseer_prompt( + req: &RespondRequest, + kb: &KbContext, + log: &[LogEntry], + reason: &str, +) -> String { + let mut p = String::new(); + p.push_str("You are the OVERSEER (T3 strategic tier). The local executor/reviewer loop has hit a wall and escalated to you for a strategic correction. 
You do not call tools; you read the record and tell the executor what to do differently on its next turn.\n\n"); + p.push_str(&format!("## Task\n{}\n", req.operation)); + p.push_str(&format!("Task class: {}\n", req.task_class)); + if !req.spec.is_null() { + p.push_str(&format!("Spec: {}\n", req.spec)); + } + p.push_str(&format!("\n## Reason for escalation\n{}\n\n", reason)); + + p.push_str(&kb.to_prompt_section()); + + p.push_str("\n## Recent log (last 12 turns, most recent last):\n"); + let start = log.len().saturating_sub(12); + for e in &log[start..] { + let content = e.content.to_string(); + p.push_str(&format!( + " [t{:02} {} {}] {}\n", + e.turn, e.role, e.kind, truncate(&content, 200), + )); + } + + p.push_str("\n## Your output\n"); + p.push_str("Write 3-6 sentences of CONCRETE guidance the executor will read next turn. "); + p.push_str("Reference what specifically went wrong, what to try instead, and what to AVOID "); + p.push_str("(especially if it appears in the \"Recent overseer corrections\" above — don't repeat yourself). "); + p.push_str("No JSON, no tool syntax — the executor will translate your guidance into action.\n"); + p +} + +async fn append_corrections_row(row: &serde_json::Value) -> Result<(), String> { + append_outcomes_row_at( + std::path::Path::new("data/_kb/overseer_corrections.jsonl"), + row, + ).await +} + +/// Append one JSONL row to `data/_kb/outcomes.jsonl`. Creates the +/// directory if missing. Same write shape as the TS pipeline; the +/// Phase 24 observer fix taught us `/ingest/file` has REPLACE +/// semantics, so this writes the JSONL directly — APPEND, not replace. +async fn append_outcomes_row(row: &serde_json::Value) -> Result<(), String> { + append_outcomes_row_at(std::path::Path::new("data/_kb/outcomes.jsonl"), row).await +} + +/// Path-taking variant — lets tests write to a tmp path without +/// mutating the process CWD (which isn't thread-safe under parallel +/// test execution). 
+async fn append_outcomes_row_at( + path: &std::path::Path, + row: &serde_json::Value, +) -> Result<(), String> { + use tokio::io::AsyncWriteExt; + + if let Some(dir) = path.parent() { + tokio::fs::create_dir_all(dir).await.map_err(|e| format!("mkdir: {e}"))?; + } + let mut line = serde_json::to_string(row).map_err(|e| format!("serialize: {e}"))?; + line.push('\n'); + let mut f = tokio::fs::OpenOptions::new() + .create(true).append(true).open(path).await + .map_err(|e| format!("open: {e}"))?; + f.write_all(line.as_bytes()).await.map_err(|e| format!("write: {e}"))?; + // Explicit flush + sync before drop. tokio::fs::File uses a + // threadpool; plain drop doesn't guarantee the write is + // durable by the time the next open sees the file, which + // surfaced as a 3/8 flake on the back-to-back-append test. + f.flush().await.map_err(|e| format!("flush: {e}"))?; + Ok(()) +} + +/// PORT FROM orchestrator.ts:306-311. Cap `rows` at 20 entries and +/// annotate the truncation so the executor sees it on the next turn +/// prompt — prevents a 1000-row hybrid_search result from wiping the +/// context budget on a single tool call. +fn trim_result(r: &serde_json::Value) -> serde_json::Value { + if let Some(rows) = r.get("rows").and_then(|v| v.as_array()) { + if rows.len() > 20 { + let mut truncated = r.clone(); + if let Some(obj) = truncated.as_object_mut() { + obj.insert("rows".into(), serde_json::Value::Array(rows.iter().take(20).cloned().collect())); + obj.insert("_trimmed".into(), serde_json::Value::String( + format!("{} more rows", rows.len() - 20), + )); + } + return truncated; + } + } + r.clone() +} + +// --- Parsing + prompt builders (PORT FROM agent.ts:566-698) --- + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum Role { Executor, Reviewer } + +/// PORT FROM agent.ts:650. Strip ```json fences, find the first {...} +/// block, soft-fix the two common model mistakes: stray `)}`, trailing +/// commas. 
Reviewer shape tolerance: bare `kind: "approve_done"` gets +/// normalized to `{kind: "critique", verdict: "approve_done"}` — some +/// models (qwen2.5) emit that way and the semantic content is identical. +pub fn parse_action(raw: &str, role: Role) -> Result { + let mut s = raw.trim().to_string(); + if let Some(stripped) = s.strip_prefix("```json") { + s = stripped.trim_start_matches('\n').to_string(); + } else if let Some(stripped) = s.strip_prefix("```") { + s = stripped.trim_start_matches('\n').to_string(); + } + if let Some(stripped) = s.strip_suffix("```") { + s = stripped.trim_end().to_string(); + } + let start = s.find('{').ok_or_else(|| format!("no JSON object in {role:?} response: {}", truncate(raw, 300)))?; + let end = s.rfind('}').ok_or_else(|| format!("no closing brace in {role:?} response: {}", truncate(raw, 300)))?; + if end <= start { + return Err(format!("no JSON object in {role:?} response: {}", truncate(raw, 300))); + } + + // Soft-fix: stray ")}" (qwen2.5 tool_call quirk) + trailing commas. 
+ let mut json = s[start..=end].to_string(); + json = json.replace(")}", "}"); + json = fix_trailing_commas(&json); + + let obj: serde_json::Value = serde_json::from_str(&json) + .map_err(|e| format!("invalid JSON from {role:?}: {e} | raw: {}", truncate(&json, 300)))?; + + let kind = obj.get("kind").and_then(|v| v.as_str()).unwrap_or("").to_string(); + + match role { + Role::Executor => match kind.as_str() { + "plan" | "tool_call" | "propose_done" => { + serde_json::from_value(obj).map_err(|e| format!("executor shape mismatch: {e}")) + } + _ => Err(format!("executor returned unexpected shape: {}", truncate(&obj.to_string(), 200))), + }, + Role::Reviewer => { + // Accept the wrapped shape: {kind:"critique", verdict:"continue"|...} + if kind == "critique" { + return serde_json::from_value(obj) + .map_err(|e| format!("reviewer shape mismatch: {e}")); + } + // Accept the bare-verdict shape: {kind:"approve_done", notes:"..."} + if matches!(kind.as_str(), "continue" | "drift" | "approve_done") { + let verdict = match kind.as_str() { + "continue" => Verdict::Continue, + "drift" => Verdict::Drift, + "approve_done" => Verdict::ApproveDone, + _ => unreachable!(), + }; + let notes = obj.get("notes").and_then(|v| v.as_str()).unwrap_or("").to_string(); + return Ok(Action::Critique { verdict, notes }); + } + Err(format!("reviewer returned unexpected shape: {}", truncate(&obj.to_string(), 200))) + } + } +} + +/// Remove `,` immediately followed by `}` or `]` (with optional whitespace). +/// Same intent as the TS regex `,(\s*[}\]])`. 
+fn fix_trailing_commas(s: &str) -> String { + let bytes = s.as_bytes(); + let mut out = String::with_capacity(s.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b',' { + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { j += 1; } + if j < bytes.len() && (bytes[j] == b'}' || bytes[j] == b']') { + // skip the comma + i += 1; + continue; + } + } + out.push(bytes[i] as char); + i += 1; + } + out +} + +fn action_kind(a: &Action) -> &'static str { + match a { + Action::Plan { .. } => "plan", + Action::ToolCall { .. } => "tool_call", + Action::ProposeDone { .. } => "propose_done", + Action::Critique { .. } => "critique", + } +} + +fn action_content(a: &Action) -> serde_json::Value { + serde_json::to_value(a).unwrap_or(serde_json::Value::Null) +} + +/// Returns true if the model name belongs to Ollama Cloud. Prefix-based +/// so new cloud models are pickable by name without a config update — +/// match the rough family prefixes Phase 20's matrix declares. +/// `qwen3.5:397b` lives in the cloud; `qwen3.5:latest` is local — +/// hence the `:3` suffix check rather than matching all of `qwen3.5:`. +pub fn is_cloud_model(model: &str) -> bool { + model.starts_with("gpt-oss:") + || model.starts_with("qwen3-coder:") + || model.starts_with("qwen3.5:3") + || model.starts_with("kimi-") + || model.starts_with("kimi/") +} + +fn truncate(s: &str, n: usize) -> String { + if s.len() <= n { s.to_string() } else { format!("{}…", &s[..n]) } +} + +fn spec_field_str<'a>(spec: &'a serde_json::Value, key: &str) -> &'a str { + spec.get(key).and_then(|v| v.as_str()).unwrap_or("") +} + +fn spec_target_count(spec: &serde_json::Value) -> u64 { + spec.get("target_count").and_then(|v| v.as_u64()).unwrap_or(0) +} + +/// PORT FROM agent.ts:566. Same structural shape: operation + target + +/// candidates-surfaced hint + recent log + ONE-JSON-action instruction. 
+/// Staffing-specific fields degrade gracefully when spec is empty (non- +/// staffing task classes still get a usable prompt, just without the +/// target_role / target_count scaffolding). +fn build_executor_prompt( + req: &RespondRequest, + boost: &[serde_json::Value], + log: &[LogEntry], +) -> String { + let target_role = spec_field_str(&req.spec, "target_role"); + let target_count = spec_target_count(&req.spec); + let target_city = spec_field_str(&req.spec, "target_city"); + let target_state = spec_field_str(&req.spec, "target_state"); + let approach_hint = spec_field_str(&req.spec, "approach_hint"); + + let mut p = String::new(); + p.push_str("You are the EXECUTOR agent. Your job is to complete this task:\n\n"); + p.push_str(&format!("OPERATION: {}\n", req.operation)); + if target_count > 0 && !target_role.is_empty() { + p.push_str(&format!( + "TARGET: {target_count} × {target_role} in {target_city}, {target_state}\n" + )); + } else { + p.push_str(&format!("TASK CLASS: {}\n", req.task_class)); + if !req.spec.is_null() { + p.push_str(&format!("SPEC: {}\n", req.spec)); + } + } + if !approach_hint.is_empty() { + p.push_str(&format!("HINT: {approach_hint}\n")); + } + p.push_str("\nThe REVIEWER agent is watching every turn. They will flag drift. Stay on target.\n\n"); + + if !boost.is_empty() { + p.push_str("SIMILAR PAST PLAYBOOKS (reference, not prescription):\n"); + for (i, b) in boost.iter().take(3).enumerate() { + p.push_str(&format!(" {}. {}\n", i + 1, b)); + } + p.push('\n'); + } + + // Orchestrator-tracked candidate memory (agent.ts:568). The log- + // render cap chops tool_result content, so the executor can't + // always see what earlier searches returned. This block is a + // durable rollup — every candidate the loop has seen, formatted + // for prompt reading. Critical for letting the executor reach + // propose_done instead of re-searching. 
+ let seen = candidates_seen(log); + p.push_str("CANDIDATES SURFACED SO FAR (orchestrator-tracked, do not forget):\n"); + if seen.is_empty() { + p.push_str(" (none yet — start with hybrid_search)\n"); + } else { + p.push_str(" # Use the name + city + state for sql verification (NOT doc_id — that's the vector-index key, not workers_500k.worker_id)\n"); + for c in seen.iter().take(30) { + p.push_str(&format!(" - name=\"{}\" city=\"{}\" state=\"{}\" (vector doc_id={})\n", + c.name, c.city, c.state, c.doc_id)); + } + if seen.len() > 30 { + p.push_str(&format!(" ... {} more surfaced\n", seen.len() - 30)); + } + } + p.push('\n'); + + p.push_str("SHARED LOG (recent turns):\n"); + p.push_str(&render_log_for_prompt(log, 8)); + p.push('\n'); + + p.push_str("AVAILABLE TOOLS (use tool_call with these exact names — DO NOT invent others):\n"); + p.push_str(" hybrid_search(sql_filter: string, question: string, index_name: string, k?: number)\n"); + p.push_str(" SQL-narrow + vector-rerank. Use for: \"find candidates matching criteria X, ranked by semantic match to Y\".\n"); + p.push_str(" For staffing fills, index_name is typically \"w500k_b18\" or \"w500k_b3\" (workers_500k).\n"); + p.push_str(" Example: {\"tool\":\"hybrid_search\",\"args\":{\"sql_filter\":\"role='Welder' AND city='Toledo' AND state='OH'\",\"question\":\"reliable welders with OSHA certs\",\"index_name\":\"w500k_b18\",\"k\":10},\"rationale\":\"pull top 10 welder candidates in Toledo\"}\n"); + p.push_str(" sql(query: string) — SELECT-only. 
Use for: verification queries before propose_done.\n"); + p.push_str(" IMPORTANT: workers_500k.worker_id is an INTEGER internal key — NOT the doc_id from hybrid_search.\n"); + p.push_str(" To verify a candidate from hybrid_search results, query by name+city+state (which ARE in the chunk_text you already received):\n"); + p.push_str(" Example: {\"tool\":\"sql\",\"args\":{\"query\":\"SELECT worker_id, name, role FROM workers_500k WHERE name = 'Donna Hall' AND city = 'Columbus' AND state = 'OH' LIMIT 1\"},\"rationale\":\"confirm Donna Hall exists as a Warehouse Associate in Columbus\"}\n\n"); + p.push_str("Your next action MUST be a JSON object matching one of these shapes:\n"); + p.push_str("{\"kind\":\"plan\",\"steps\":[\"short step 1\",\"short step 2\"]}\n"); + p.push_str("{\"kind\":\"tool_call\",\"tool\":\"...\",\"args\":{...},\"rationale\":\"why\"}\n"); + if target_count > 0 { + p.push_str(&format!( + "{{\"kind\":\"propose_done\",\"fills\":[{{\"candidate_id\":\"...\",\"name\":\"First Last\"}}],\"rationale\":\"...\"}} — fills MUST have EXACTLY {target_count} entries.\n" + )); + } else { + p.push_str("{\"kind\":\"propose_done\",\"fills\":[...],\"rationale\":\"...\"}\n"); + } + if target_count > 0 { + p.push_str(&format!( + "\nSTRATEGY: once prior tool_result rows contain ≥ {target_count} candidates in {target_city}, {target_state} matching role \"{target_role}\", STOP SEARCHING. Pick the top {target_count} by score, verify ONE via `sql` tool, then emit propose_done. Do NOT repeat hybrid_search if you already have enough candidates.\n" + )); + } + p.push_str("\nRespond with ONLY the JSON object. No markdown fences, no prose.\n"); + p +} + +/// PORT FROM agent.ts:602. Reviewer prompt with the `awaitingApproval` +/// hard rule: if the most recent executor action was propose_done, the +/// reviewer cannot emit `continue` (would stall the loop). 
+fn build_reviewer_prompt(req: &RespondRequest, log: &[LogEntry]) -> String { + let target_role = spec_field_str(&req.spec, "target_role"); + let target_count = spec_target_count(&req.spec); + let target_city = spec_field_str(&req.spec, "target_city"); + let target_state = spec_field_str(&req.spec, "target_state"); + + let last_executor_kind = log.iter().rev() + .find(|e| e.role == "executor") + .map(|e| e.kind.as_str()) + .unwrap_or(""); + let awaiting_approval = last_executor_kind == "propose_done"; + + let mut p = String::new(); + p.push_str("You are the REVIEWER agent. The EXECUTOR is trying to complete this task:\n\n"); + p.push_str(&format!("OPERATION: {}\n", req.operation)); + if target_count > 0 && !target_role.is_empty() { + p.push_str(&format!( + "TARGET: {target_count} × {target_role} in {target_city}, {target_state}\n\n" + )); + } + p.push_str("Your job: catch drift. Agents often wander from the actual objective. Specifically watch for:\n"); + if target_count > 0 && !target_city.is_empty() { + p.push_str(&format!("- Proposing candidates who aren't in {target_city}, {target_state}.\n")); + p.push_str(&format!("- Proposing candidates who don't have {target_role} skill.\n")); + p.push_str(&format!("- Proposing fewer or more than {target_count} fills.\n")); + } else { + p.push_str("- Drifting from the stated task class or spec.\n"); + } + p.push_str("- Irrelevant tool calls.\n\n"); + + p.push_str("SHARED LOG (recent turns):\n"); + p.push_str(&render_log_for_prompt(log, 10)); + p.push('\n'); + + p.push_str("Your next action MUST be a JSON object:\n"); + p.push_str("{\"kind\":\"critique\",\"verdict\":\"continue\" | \"drift\" | \"approve_done\",\"notes\":\"...\"}\n\n"); + p.push_str("- \"continue\" → executor is on a reasonable path, let them keep going.\n"); + p.push_str("- \"drift\" → executor is off-track; notes MUST tell them how to redirect.\n"); + p.push_str("- \"approve_done\" → executor's propose_done meets the criteria. 
Seal it.\n\n"); + if target_count > 0 { + p.push_str(&format!( + "APPROVAL CRITERIA (use only for propose_done):\n\ + 1. Exactly {target_count} fills.\n\ + 2. Each fill's name appears in a prior tool_result from {target_city}, {target_state} matching role \"{target_role}\".\n\ + 3. Executor has SQL-verified at least one fill.\n\ + If 1-3 all hold, return approve_done.\n" + )); + } + if awaiting_approval { + p.push_str("\nHARD RULE: The executor's most recent action was propose_done. On this turn you CANNOT return \"continue\" — it would stall the task. Choose approve_done or drift (state which criterion failed in notes).\n"); + } + + // Loop-detection: if the executor has tool_called ≥ 3 times since + // the last propose_done without proposing, it's stuck in a search + // loop. Reviewer rubber-stamping "continue" here is the failure + // pattern the 2026-04-23 battery surfaced in phase α task 2 — + // 12 turns, 0 proposes, 100% reviewer:continue. + let stuck_tool_calls = tool_calls_since_last_propose(log); + if stuck_tool_calls >= 3 { + p.push_str(&format!( + "\nLOOP DETECTION: The executor has called tools {stuck_tool_calls} times without proposing done. \ + Look at the CANDIDATES SURFACED SO FAR (visible in executor's view): if there are already ≥ {} \ + matching candidates in {target_city}, {target_state} for role \"{target_role}\", respond with \ + verdict=\"drift\" and notes=\"You have enough candidates — pick the top {} by score and emit \ + propose_done this turn. Stop re-searching.\"\n", + target_count, target_count, + )); + } + + p.push_str("\nRespond with ONLY the JSON object.\n"); + p +} + +fn render_log_for_prompt(log: &[LogEntry], tail: usize) -> String { + if log.is_empty() { + return "(no prior turns)\n".into(); + } + let start = log.len().saturating_sub(tail); + let mut s = String::new(); + for e in &log[start..] 
{ + let content = e.content.to_string(); + // tool_result is the executor's eyes — candidate data lives + // there and a 160-char cap chops off every name/doc_id the + // executor needs for propose_done. Keep these generous; cap + // other kinds tighter since they're decision/status entries + // and don't carry payload the executor will re-read. + let cap = if e.kind == "tool_result" { 1200 } else { 200 }; + s.push_str(&format!( + " [t{:02} {} {}] {}\n", + e.turn, e.role, e.kind, truncate(&content, cap) + )); + } + s +} + +/// Ports agent.ts:538 `candidatesSeen`. Walks tool_result entries, +/// parses `sources[].chunk_text` for the staffing "Name — Role in +/// City, ST" shape, dedupes by doc_id. Returns an orchestrator-tracked +/// surface the executor prompt can show verbatim — stopping the +/// executor from "forgetting" candidates when the log-render truncates. +fn candidates_seen(log: &[LogEntry]) -> Vec { + let mut out: Vec = Vec::new(); + let mut seen_ids: std::collections::HashSet = std::collections::HashSet::new(); + for e in log { + if e.kind != "tool_result" { continue; } + let Some(sources) = e.content.get("sources").and_then(|v| v.as_array()) else { continue }; + for s in sources { + let Some(doc_id) = s.get("doc_id").and_then(|v| v.as_str()) else { continue }; + if seen_ids.contains(doc_id) { continue; } + let chunk_text = s.get("chunk_text").and_then(|v| v.as_str()).unwrap_or(""); + let Some((name_part, rest)) = chunk_text.split_once('—') else { continue }; + let name = name_part.trim().to_string(); + let loc = rest.split_once(" in ").map(|(_, r)| r).unwrap_or(""); + let Some((city, state_raw)) = loc.split_once(',') else { continue }; + let city = city.trim().to_string(); + let state = state_raw + .trim() + .chars() + .take_while(|c| c.is_alphabetic()) + .collect::(); + if name.is_empty() || city.is_empty() || state.is_empty() { continue; } + seen_ids.insert(doc_id.to_string()); + out.push(CandidateHint { + doc_id: doc_id.to_string(), + name, + city, 
+ state, + }); + } + } + out +} + +#[derive(Debug, Clone)] +struct CandidateHint { + doc_id: String, + name: String, + city: String, + state: String, +} + +/// Count executor tool_calls since the last propose_done (or since +/// loop start if none). Used by the reviewer prompt to flag stuck +/// search loops — if an executor has tool_called ≥ 3× without +/// proposing, the reviewer should verdict:drift with a stop-searching +/// note rather than rubber-stamping continue. +fn tool_calls_since_last_propose(log: &[LogEntry]) -> u32 { + let mut count = 0u32; + for e in log.iter().rev() { + if e.role != "executor" { continue; } + if e.kind == "propose_done" { break; } + if e.kind == "tool_call" { count += 1; } + } + count +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn log_entry_serializes_to_orchestrator_shape() { + let e = LogEntry::new(3, "executor", "qwen3.5:latest", "tool_call", + serde_json::json!({"tool": "hybrid_search"})); + let j = serde_json::to_value(&e).unwrap(); + for k in ["turn", "role", "kind", "model", "content", "at"] { + assert!(j.get(k).is_some(), "missing field: {k}"); + } + } + + #[test] + fn outcome_into_log_is_lossless() { + let e = LogEntry::new(1, "system", "m", "boost_loaded", serde_json::json!({})); + let o = RespondOutcome::Failed { reason: "scaffold".into(), log: vec![e] }; + assert_eq!(o.into_log().len(), 1); + } + + #[test] + fn parse_executor_plan() { + let raw = r#"{"kind":"plan","steps":["hybrid_search","verify","propose_done"]}"#; + let a = parse_action(raw, Role::Executor).unwrap(); + match a { + Action::Plan { steps } => assert_eq!(steps.len(), 3), + _ => panic!("wrong variant"), + } + } + + #[test] + fn parse_executor_tool_call_with_stray_paren() { + // Mimics the qwen2.5 quirk where the model closes with ")}" — + // agent.ts:666 has the same fix. PORT from TS test territory. 
+ let raw = r#"{"kind":"tool_call","tool":"sql","args":{"query":"SELECT 1"},"rationale":"verify")}"#; + let a = parse_action(raw, Role::Executor).unwrap(); + match a { + Action::ToolCall { tool, .. } => assert_eq!(tool, "sql"), + _ => panic!("wrong variant"), + } + } + + #[test] + fn parse_executor_propose_done_with_fence() { + let raw = "```json\n{\"kind\":\"propose_done\",\"fills\":[{\"candidate_id\":\"W-1\",\"name\":\"A B\"}],\"rationale\":\"ok\"}\n```"; + let a = parse_action(raw, Role::Executor).unwrap(); + match a { + Action::ProposeDone { fills, .. } => { + assert_eq!(fills.len(), 1); + assert_eq!(fills[0].candidate_id, "W-1"); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn parse_reviewer_wrapped_verdict() { + let raw = r#"{"kind":"critique","verdict":"approve_done","notes":"ok"}"#; + let a = parse_action(raw, Role::Reviewer).unwrap(); + match a { + Action::Critique { verdict, .. } => assert_eq!(verdict, Verdict::ApproveDone), + _ => panic!("wrong variant"), + } + } + + #[test] + fn parse_reviewer_bare_verdict_normalizes() { + // agent.ts:690-694 — qwen2.5/mistral emit the verdict as `kind`. 
+ let raw = r#"{"kind":"drift","notes":"wrong city"}"#; + let a = parse_action(raw, Role::Reviewer).unwrap(); + match a { + Action::Critique { verdict, notes } => { + assert_eq!(verdict, Verdict::Drift); + assert_eq!(notes, "wrong city"); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn parse_reviewer_rejects_unknown_verdict() { + let raw = r#"{"kind":"maybe","notes":"?"}"#; + assert!(parse_action(raw, Role::Reviewer).is_err()); + } + + #[test] + fn parse_trailing_comma() { + let raw = r#"{"kind":"plan","steps":["a","b",]}"#; + assert!(parse_action(raw, Role::Executor).is_ok()); + } + + #[test] + fn parse_no_json_errors_cleanly() { + let raw = "sorry I cannot comply"; + let err = parse_action(raw, Role::Executor).unwrap_err(); + assert!(err.contains("no JSON")); + } + + #[test] + fn candidates_seen_parses_sources() { + let log = vec![ + LogEntry::new(1, "executor", "m", "tool_result", serde_json::json!({ + "sources": [ + {"doc_id": "W-1", "chunk_text": "Alice Smith — Welder in Toledo, OH. 5 years experience."}, + {"doc_id": "W-2", "chunk_text": "Bob Jones — Welder in Toledo, OH. Night shift."}, + ] + })), + LogEntry::new(2, "reviewer", "m", "critique", serde_json::json!({ + "verdict": "continue", "notes": "" + })), + LogEntry::new(3, "executor", "m", "tool_result", serde_json::json!({ + "sources": [ + {"doc_id": "W-2", "chunk_text": "Bob Jones — Welder in Toledo, OH. Night shift."}, + {"doc_id": "W-3", "chunk_text": "Carol Davis — Welder in Toledo, OH. 
AWS certified."}, + ] + })), + ]; + let seen = candidates_seen(&log); + assert_eq!(seen.len(), 3, "dedup by doc_id"); + assert_eq!(seen[0].name, "Alice Smith"); + assert_eq!(seen[0].city, "Toledo"); + assert_eq!(seen[0].state, "OH"); + assert_eq!(seen[2].name, "Carol Davis"); + } + + #[test] + fn candidates_seen_ignores_malformed() { + let log = vec![ + LogEntry::new(1, "executor", "m", "tool_result", serde_json::json!({ + "sources": [ + {"doc_id": "W-1", "chunk_text": "no dash here"}, + {"doc_id": "W-2", "chunk_text": "Name — but no 'in' keyword"}, + {"doc_id": "W-3"}, // no chunk_text + ] + })), + ]; + assert_eq!(candidates_seen(&log).len(), 0); + } + + #[test] + fn tool_calls_since_propose_counts_correctly() { + let log = vec![ + LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})), + LogEntry::new(2, "executor", "m", "tool_call", serde_json::json!({})), + LogEntry::new(3, "executor", "m", "tool_call", serde_json::json!({})), + ]; + assert_eq!(tool_calls_since_last_propose(&log), 3); + + // propose_done resets the counter + let log2 = vec![ + LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})), + LogEntry::new(2, "executor", "m", "propose_done", serde_json::json!({})), + LogEntry::new(3, "executor", "m", "tool_call", serde_json::json!({})), + ]; + assert_eq!(tool_calls_since_last_propose(&log2), 1); + } + + #[test] + fn executor_prompt_includes_surfaced_candidates() { + let req = req_with_spec(serde_json::json!({ + "target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH" + })); + let log = vec![ + LogEntry::new(1, "executor", "m", "tool_result", serde_json::json!({ + "sources": [ + {"doc_id": "W-1", "chunk_text": "Alice Smith — Welder in Toledo, OH."}, + ] + })), + ]; + let p = build_executor_prompt(&req, &[], &log); + assert!(p.contains("CANDIDATES SURFACED SO FAR")); + assert!(p.contains("W-1 Alice Smith")); + assert!(p.contains("Toledo, OH")); + } + + #[test] + fn 
reviewer_prompt_flags_loop_after_three_tool_calls() { + let req = req_with_spec(serde_json::json!({ + "target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH" + })); + let log = vec![ + LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})), + LogEntry::new(2, "executor", "m", "tool_call", serde_json::json!({})), + LogEntry::new(3, "executor", "m", "tool_call", serde_json::json!({})), + ]; + let p = build_reviewer_prompt(&req, &log); + assert!(p.contains("LOOP DETECTION")); + assert!(p.contains("Stop re-searching")); + } + + #[test] + fn reviewer_prompt_no_loop_clause_before_three_calls() { + let req = req_with_spec(serde_json::json!({ + "target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH" + })); + let log = vec![ + LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})), + ]; + let p = build_reviewer_prompt(&req, &log); + assert!(!p.contains("LOOP DETECTION")); + } + + #[test] + fn is_cloud_model_recognizes_cloud_prefixes() { + assert!(is_cloud_model("gpt-oss:120b")); + assert!(is_cloud_model("gpt-oss:20b")); + assert!(is_cloud_model("qwen3-coder:480b")); + assert!(is_cloud_model("qwen3.5:397b")); + assert!(is_cloud_model("kimi-k2.5")); + assert!(is_cloud_model("kimi/k2-thinking")); + } + + #[test] + fn is_cloud_model_rejects_local_prefixes() { + assert!(!is_cloud_model("qwen3.5:latest")); + assert!(!is_cloud_model("qwen3:latest")); + assert!(!is_cloud_model("qwen2.5:latest")); + assert!(!is_cloud_model("mistral")); + assert!(!is_cloud_model("nomic-embed-text")); + } + + #[test] + fn spec_target_count_defaults_to_zero() { + let spec = serde_json::json!({}); + assert_eq!(spec_target_count(&spec), 0); + } + + #[test] + fn executor_prompt_includes_target_when_spec_has_it() { + let req = RespondRequest { + task_class: "staffing.fill".into(), + operation: "fill: Welder x2 in Toledo, OH".into(), + spec: serde_json::json!({ + "target_role": "Welder", "target_count": 2, + 
"target_city": "Toledo", "target_state": "OH" + }), + executor_model: None, reviewer_model: None, max_turns: None, + }; + let p = build_executor_prompt(&req, &[], &[]); + assert!(p.contains("TARGET: 2 × Welder in Toledo, OH")); + assert!(p.contains("EXACTLY 2 entries")); + assert!(p.contains("hybrid_search"), "executor prompt must list hybrid_search in tool catalog"); + assert!(p.contains("sql(query"), "executor prompt must list sql tool signature"); + assert!(p.contains("DO NOT invent others"), "executor prompt must warn against tool-name invention"); + } + + #[test] + fn executor_prompt_degrades_without_spec() { + let req = RespondRequest { + task_class: "code.review".into(), + operation: "review PR #42".into(), + spec: serde_json::json!(null), + executor_model: None, reviewer_model: None, max_turns: None, + }; + let p = build_executor_prompt(&req, &[], &[]); + assert!(p.contains("TASK CLASS: code.review")); + assert!(!p.contains("TARGET:")); + } + + #[test] + fn reviewer_prompt_adds_hard_rule_when_awaiting_approval() { + let req = RespondRequest { + task_class: "staffing.fill".into(), + operation: "fill: Welder x2 in Toledo, OH".into(), + spec: serde_json::json!({"target_count": 2}), + executor_model: None, reviewer_model: None, max_turns: None, + }; + let log = vec![LogEntry::new(1, "executor", "m", "propose_done", serde_json::json!({}))]; + let p = build_reviewer_prompt(&req, &log); + assert!(p.contains("HARD RULE")); + } + + fn req_with_spec(spec: serde_json::Value) -> RespondRequest { + RespondRequest { + task_class: "staffing.fill".into(), + operation: "fill: Welder x2 in Toledo, OH".into(), + spec, + executor_model: None, + reviewer_model: None, + max_turns: None, + } + } + + fn sample_stats() -> LoopStats { + LoopStats { + requests: 8, prompt_tokens: 12345, completion_tokens: 2345, + total_tokens: 14690, latency_ms: 42000, + } + } + + #[test] + fn sig_hash_is_stable_for_same_inputs() { + let spec = serde_json::json!({ + "target_role": "Welder", 
"target_city": "Toledo", "target_state": "OH" + }); + let a = sig_hash(&req_with_spec(spec.clone())); + let b = sig_hash(&req_with_spec(spec)); + assert_eq!(a, b); + assert_eq!(a.len(), 16); + } + + #[test] + fn sig_hash_differs_by_geo() { + let a = sig_hash(&req_with_spec(serde_json::json!({ + "target_role": "Welder", "target_city": "Toledo", "target_state": "OH" + }))); + let b = sig_hash(&req_with_spec(serde_json::json!({ + "target_role": "Welder", "target_city": "Dayton", "target_state": "OH" + }))); + assert_ne!(a, b); + } + + #[test] + fn seed_context_uses_hint_when_present() { + let req = req_with_spec(serde_json::json!({ + "approach_hint": "hybrid search", "target_role": "Welder", "target_city": "Toledo" + })); + assert_eq!(seed_context(&req), "hybrid search"); + } + + #[test] + fn seed_context_falls_back_to_role_city_state() { + let req = req_with_spec(serde_json::json!({ + "target_role": "Welder", "target_city": "Toledo", "target_state": "OH" + })); + assert_eq!(seed_context(&req), "Welder fill in Toledo, OH"); + } + + #[test] + fn seed_context_falls_back_to_operation_for_non_staffing() { + let req = req_with_spec(serde_json::json!({})); + assert_eq!(seed_context(&req), "fill: Welder x2 in Toledo, OH"); + } + + #[test] + fn outcomes_row_stamps_full_indicator_set_on_success() { + let req = req_with_spec(serde_json::json!({ + "target_role": "Welder", "target_city": "Toledo", "target_state": "OH" + })); + let stats = sample_stats(); + let outcome = RespondOutcome::Ok { + artifact: serde_json::json!({"fills": [{"candidate_id": "W-1", "name": "A B"}]}), + log: vec![], + }; + let seed = serde_json::json!({"outcome": {"mode": "added"}, "entries_after": 1337}); + let row = build_outcomes_row(&req, &stats, 4, false, &outcome, Some(seed)); + assert_eq!(row["schema_version"], 2); + assert_eq!(row["source_service"], "v1.respond"); + assert_eq!(row["task_class"], "staffing.fill"); + assert_eq!(row["ok"], true); + assert_eq!(row["polarity"], "success_confirmation"); + 
assert_eq!(row["iterations"], 4); + assert_eq!(row["turns"], 4); + assert_eq!(row["usage"]["total_tokens"], 14690); + assert_eq!(row["usage"]["requests"], 8); + assert_eq!(row["models"]["executor"], "qwen3.5:latest"); + assert_eq!(row["provider"], "ollama"); + assert_eq!(row["playbook_seed"]["entries_after"], 1337); + assert!(row["sig_hash"].as_str().unwrap().len() == 16); + assert!(row["truth_rule_citations"].is_array()); + } + + #[test] + fn outcomes_row_stamps_failure_polarity() { + let req = req_with_spec(serde_json::json!({})); + let stats = sample_stats(); + let outcome = RespondOutcome::Failed { + reason: "3 consecutive drifts".into(), + log: vec![], + }; + let row = build_outcomes_row(&req, &stats, 2, false, &outcome, None); + assert_eq!(row["ok"], false); + assert_eq!(row["polarity"], "failure_pattern"); + assert_eq!(row["error"], "3 consecutive drifts"); + assert_eq!(row["fills"], serde_json::Value::Null); + assert!(row["playbook_seed"].is_null()); + assert_eq!(row["correction_applied"], false); + assert!(row["correction_effective"].is_null()); + } + + #[test] + fn outcomes_row_marks_correction_effective_when_overseer_called_and_ok() { + let req = req_with_spec(serde_json::json!({})); + let stats = sample_stats(); + let outcome = RespondOutcome::Ok { + artifact: serde_json::json!({"fills": []}), + log: vec![], + }; + let row = build_outcomes_row(&req, &stats, 3, true, &outcome, None); + assert_eq!(row["correction_applied"], true); + assert_eq!(row["correction_effective"], true); + } + + #[test] + fn outcomes_row_marks_correction_ineffective_when_overseer_called_and_failed() { + let req = req_with_spec(serde_json::json!({})); + let stats = sample_stats(); + let outcome = RespondOutcome::Failed { + reason: "still drifting after overseer".into(), + log: vec![], + }; + let row = build_outcomes_row(&req, &stats, 3, true, &outcome, None); + assert_eq!(row["correction_applied"], true); + assert_eq!(row["correction_effective"], false); + } + + // Atomic counter + 
PID guarantees a unique path across parallel + // test invocations. Nanos-only showed 1/5 flake under `cargo + // test` because SystemTime can repeat across threads that run + // within sub-ns of each other. + static APPEND_TEST_SEQ: std::sync::atomic::AtomicU64 = + std::sync::atomic::AtomicU64::new(0); + + #[tokio::test] + async fn append_outcomes_row_at_writes_valid_jsonl() { + let seq = APPEND_TEST_SEQ.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let tmpdir = std::env::temp_dir().join(format!( + "lh_outcomes_{}_{}", std::process::id(), seq, + )); + let path = tmpdir.join("outcomes.jsonl"); + + let row = serde_json::json!({"schema_version": 2, "ok": true, "test": "marker"}); + append_outcomes_row_at(&path, &row).await.unwrap(); + append_outcomes_row_at(&path, &row).await.unwrap(); + + let written = std::fs::read_to_string(&path).unwrap(); + let lines: Vec<_> = written.lines().collect(); + assert_eq!(lines.len(), 2); + for line in lines { + let parsed: serde_json::Value = serde_json::from_str(line).unwrap(); + assert_eq!(parsed["test"], "marker"); + } + std::fs::remove_dir_all(&tmpdir).ok(); + } + + #[test] + fn trim_result_leaves_small_arrays_alone() { + let r = serde_json::json!({ "rows": [1, 2, 3] }); + let t = trim_result(&r); + assert_eq!(t["rows"].as_array().unwrap().len(), 3); + assert!(t.get("_trimmed").is_none()); + } + + #[test] + fn trim_result_caps_at_20_and_annotates() { + let rows: Vec<_> = (0..100).map(serde_json::Value::from).collect(); + let r = serde_json::json!({ "rows": rows, "other_field": "kept" }); + let t = trim_result(&r); + assert_eq!(t["rows"].as_array().unwrap().len(), 20); + assert_eq!(t["_trimmed"], "80 more rows"); + assert_eq!(t["other_field"], "kept"); + } + + #[test] + fn trim_result_passthrough_when_no_rows() { + let r = serde_json::json!({ "answer": "42" }); + let t = trim_result(&r); + assert_eq!(t["answer"], "42"); + } + + #[test] + fn reviewer_prompt_omits_hard_rule_otherwise() { + let req = RespondRequest { + 
task_class: "staffing.fill".into(), + operation: "fill: Welder x2 in Toledo, OH".into(), + spec: serde_json::json!({"target_count": 2}), + executor_model: None, reviewer_model: None, max_turns: None, + }; + let log = vec![LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({}))]; + let p = build_reviewer_prompt(&req, &log); + assert!(!p.contains("HARD RULE")); + } +} diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs index 539398b..8483df9 100644 --- a/crates/gateway/src/main.rs +++ b/crates/gateway/src/main.rs @@ -1,6 +1,7 @@ mod access; mod access_service; mod auth; +mod execution_loop; mod observability; mod tools; mod v1; @@ -153,6 +154,11 @@ async fn main() { agent_handle: agent_handle.clone(), index_registry: index_reg.clone(), schedules: sched_store, + // P9-001 fix 2026-04-23: journal reference flows into ingest so + // successful uploads emit a record_ingest event. Journal is Clone + // (Arc inside) so the /journal nest below still sees the + // same buffer + persistence. + journal: Some(journal.clone()), })) .nest("/vectors", vectord::service::router(vectord::service::VectorState { store: store.clone(), @@ -218,14 +224,19 @@ async fn main() { }, })); - // Auth middleware (if enabled) + // Auth middleware (if enabled) — P5-001 fix 2026-04-23: + // previously only inserted the ApiKey as an extension and never layered + // the middleware, so auth.enabled=true enforced nothing. Now wraps the + // router with from_fn_with_state, which calls api_key_auth on every + // request. /health is exempted inside the middleware (LB probes). 
if config.auth.enabled { if let Some(ref key) = config.auth.api_key { - tracing::info!("API key auth enabled"); + tracing::info!("API key auth enabled — enforcing on all routes except /health"); let api_key = auth::ApiKey(key.clone()); - app = app.layer(axum::Extension(api_key)); - // Note: auth middleware applied per-route in production - // For now, the ApiKey extension is available for handlers to check + app = app.layer(axum::middleware::from_fn_with_state( + api_key, + auth::api_key_auth, + )); } else { tracing::warn!("auth enabled but no api_key set — all requests allowed"); } diff --git a/crates/gateway/src/tools/registry.rs b/crates/gateway/src/tools/registry.rs index a5254ac..728d66a 100644 --- a/crates/gateway/src/tools/registry.rs +++ b/crates/gateway/src/tools/registry.rs @@ -67,6 +67,7 @@ pub struct ToolRegistry { } impl ToolRegistry { + #[allow(dead_code)] pub fn new() -> Self { let registry = Self { tools: Arc::new(RwLock::new(HashMap::new())), diff --git a/crates/gateway/src/tools/service.rs b/crates/gateway/src/tools/service.rs index 408b2bb..e5b75d1 100644 --- a/crates/gateway/src/tools/service.rs +++ b/crates/gateway/src/tools/service.rs @@ -7,7 +7,7 @@ use axum::{ }; use serde::Deserialize; -use super::registry::{Permission, ToolInvocation, ToolRegistry}; +use super::registry::{ToolInvocation, ToolRegistry}; use crate::tools::ToolState; pub fn router(state: ToolState) -> Router { diff --git a/crates/gateway/src/v1/mod.rs b/crates/gateway/src/v1/mod.rs index d8ba8a3..d02e942 100644 --- a/crates/gateway/src/v1/mod.rs +++ b/crates/gateway/src/v1/mod.rs @@ -14,6 +14,8 @@ pub mod ollama; pub mod ollama_cloud; pub mod langfuse_trace; +pub mod respond; +pub mod truth; use axum::{ Router, @@ -24,7 +26,7 @@ use axum::{ Json, }; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, sync::Arc}; +use std::sync::Arc; use tokio::sync::RwLock; #[derive(Clone)] @@ -61,8 +63,10 @@ pub struct ProviderUsage { pub fn router(state: V1State) -> Router 
{ Router::new() .route("/chat", post(chat)) + .route("/respond", post(respond::respond)) .route("/usage", get(usage)) .route("/sessions", get(sessions)) + .route("/context", get(truth::context)) .with_state(state) } diff --git a/crates/gateway/src/v1/respond.rs b/crates/gateway/src/v1/respond.rs new file mode 100644 index 0000000..640c0ff --- /dev/null +++ b/crates/gateway/src/v1/respond.rs @@ -0,0 +1,150 @@ +//! `/v1/respond` — the **execution** API (distinct from `/v1/chat`, the +//! completion API). +//! +//! This is the consolidation move called out in the 2026-04-23 session: +//! lift the proven pipeline from `tests/multi-agent/orchestrator.ts` +//! (executor → reviewer → escalate → validate → seal playbook → +//! write-through to KB) into the gateway, so the production path has +//! the intelligence the tests already proved. +//! +//! `/v1/chat` stays a naive completion proxy for callers that want one. +//! `/v1/respond` is where the loop lives. Every orchestrator-style +//! caller migrates here and the TS harnesses become thin clients. +//! +//! This file holds the HTTP surface + request/response shapes. The loop +//! itself lives in `execution_loop::ExecutionLoop`. + +use axum::{extract::State, http::StatusCode, Json}; +use serde::{Deserialize, Serialize}; + +use super::V1State; +use crate::execution_loop::{ExecutionLoop, LogEntry, RespondOutcome}; + +/// A structured task — mirrors `TaskSpec` in `tests/multi-agent/agent.ts`. +/// Kept deliberately open so non-staffing task classes (code-gen, +/// DevOps-long-horizon) can land without a schema fight. +#[derive(Deserialize, Debug, Clone)] +pub struct RespondRequest { + /// Task class — routes to the right truth rules + validator. For the + /// staffing substrate: `staffing.fill`, `staffing.rescue`, + /// `staffing.sms_draft`. Truth-layer lookup is a no-op until a rule + /// set is registered for the class. 
+ pub task_class: String, + + /// Human-readable operation description — becomes the playbook + /// `operation` field on seal, and the primary signal for + /// playbook_memory embedding. + pub operation: String, + + /// Free-form structured context. Passed to the executor prompt and + /// to the playbook seeder. Staffing tasks expect + /// `{target_role, target_count, target_city, target_state, approach_hint}` + /// but nothing here validates that — the validator crate will (Phase 43). + #[serde(default)] + pub spec: serde_json::Value, + + /// Executor model. Defaults to the hot-path local model if omitted. + /// See orchestrator.ts:28 (`EXECUTOR_MODEL = "qwen3.5:latest"`). + #[serde(default)] + pub executor_model: Option<String>, + + /// Reviewer model. Defaults to the hot-path local reviewer. + /// See orchestrator.ts:29 (`REVIEWER_MODEL = "qwen3:latest"`). + #[serde(default)] + pub reviewer_model: Option<String>, + + /// Hard cap on executor turns. Default matches orchestrator.ts:30 + /// (`MAX_TURNS = 12`). Cloud escalation counts as a turn. + #[serde(default)] + pub max_turns: Option<u32>, +} + +#[derive(Serialize)] +pub struct RespondResponse { + /// `ok` = consensus reached, playbook sealed. `failed` = loop ran + /// out of turns or hit the drift cap. `blocked` = truth-layer + /// veto (Phase 42 rule citation in `error`). + pub status: &'static str, + + /// The final artifact — for staffing fills, `{fills: [{candidate_id, name}]}`. + /// Empty on failure / block. + pub artifact: serde_json::Value, + + /// Structured cross-turn log. Same shape as orchestrator.ts LogEntry + /// so existing tooling (kb extractors, fact_extractor.ts) reads it + /// without change. + pub log: Vec<LogEntry>, + + /// Iteration count actually used. ≤ max_turns. Stamped on + /// outcomes.jsonl per the indicator audit (2026-04-23). + pub iterations: u32, + + /// Error message on non-ok status. Truth-rule citations land here + /// when `status == "blocked"`.
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub error: Option<String>, +} + +pub async fn respond( + State(state): State<V1State>, + Json(req): Json<RespondRequest>, +) -> Result<Json<RespondResponse>, (StatusCode, String)> { + if req.operation.is_empty() { + return Err((StatusCode::BAD_REQUEST, "operation must be non-empty".into())); + } + if req.task_class.is_empty() { + return Err((StatusCode::BAD_REQUEST, "task_class must be non-empty".into())); + } + + let mut loop_runner = ExecutionLoop::new(state, req); + let outcome = loop_runner.run().await.map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, format!("execution loop: {e}")) + })?; + + let (status, error) = match &outcome { + RespondOutcome::Ok { .. } => ("ok", None), + RespondOutcome::Failed { reason, .. } => ("failed", Some(reason.clone())), + RespondOutcome::Blocked { reason, .. } => ("blocked", Some(reason.clone())), + }; + + Ok(Json(RespondResponse { + status, + artifact: outcome.artifact(), + log: outcome.into_log(), + iterations: loop_runner.turns_used(), + error, + })) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn respond_request_parses_minimal() { + let raw = r#"{ + "task_class": "staffing.fill", + "operation": "fill: Welder x2 in Toledo, OH" + }"#; + let r: RespondRequest = serde_json::from_str(raw).unwrap(); + assert_eq!(r.task_class, "staffing.fill"); + assert_eq!(r.executor_model, None); + assert_eq!(r.max_turns, None); + } + + #[test] + fn respond_request_parses_full() { + let raw = r#"{ + "task_class": "staffing.fill", + "operation": "fill: Welder x2 in Toledo, OH", + "spec": {"target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH"}, + "executor_model": "qwen3.5:latest", + "reviewer_model": "qwen3:latest", + "max_turns": 12 + }"#; + let r: RespondRequest = serde_json::from_str(raw).unwrap(); + assert_eq!(r.executor_model.as_deref(), Some("qwen3.5:latest")); + assert_eq!(r.max_turns, Some(12)); + assert_eq!(r.spec["target_count"], 2); + } +} diff --git
a/crates/gateway/src/v1/truth.rs b/crates/gateway/src/v1/truth.rs new file mode 100644 index 0000000..b99bdc2 --- /dev/null +++ b/crates/gateway/src/v1/truth.rs @@ -0,0 +1,49 @@ +use axum::{routing::get, Router}; +use serde::Serialize; +use truth::default_truth_store; + +#[allow(dead_code)] +pub fn truth_router() -> Router { + Router::new() + .route("/context", get(context)) +} + +#[derive(Serialize)] +pub struct ContextResponse { + pub task_classes: Vec<String>, + pub rules: Vec<RuleInfo>, +} + +#[derive(Serialize)] +pub struct RuleInfo { + pub id: String, + pub task_class: String, + pub description: String, +} + +pub async fn context() -> axum::Json<ContextResponse> { + let store = default_truth_store(); + + let task_classes: Vec<String> = vec![ + "staffing.fill".to_string(), + "staffing.rescue".to_string(), + "staffing.sms_draft".to_string(), + "staffing.any".to_string(), + ]; + + let mut rules = Vec::new(); + for tc in &task_classes { + for rule in store.get_rules(tc) { + rules.push(RuleInfo { + id: rule.id.clone(), + task_class: rule.task_class.clone(), + description: rule.description.clone(), + }); + } + } + + axum::Json(ContextResponse { + task_classes, + rules, + }) +} \ No newline at end of file diff --git a/crates/ingestd/Cargo.toml b/crates/ingestd/Cargo.toml index bb41bf6..61a78dc 100644 --- a/crates/ingestd/Cargo.toml +++ b/crates/ingestd/Cargo.toml @@ -8,6 +8,7 @@ shared = { path = "../shared" } storaged = { path = "../storaged" } catalogd = { path = "../catalogd" } vectord = { path = "../vectord" } +journald = { path = "../journald" } tokio = { workspace = true } axum = { workspace = true, features = ["multipart"] } lopdf = { workspace = true } diff --git a/crates/ingestd/src/schema_evolution.rs b/crates/ingestd/src/schema_evolution.rs index c2ef057..fa7db93 100644 --- a/crates/ingestd/src/schema_evolution.rs +++ b/crates/ingestd/src/schema_evolution.rs @@ -2,9 +2,11 @@ /// When a source changes format (columns renamed, added, removed, type changed), /// the system detects the diff and can
auto-map using AI or heuristic matching. +#[allow(unused_imports)] use arrow::datatypes::{DataType, Schema, SchemaRef}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +#[allow(unused_imports)] use std::sync::Arc; /// A detected change between two schema versions. diff --git a/crates/ingestd/src/service.rs b/crates/ingestd/src/service.rs index b7e74c6..34ca116 100644 --- a/crates/ingestd/src/service.rs +++ b/crates/ingestd/src/service.rs @@ -1,3 +1,4 @@ +#[allow(unused_imports)] use axum::{ Json, Router, extract::{Multipart, Path, Query, State}, @@ -33,6 +34,11 @@ pub struct IngestState { /// Scheduled-ingest registry. The scheduler task runs against this /// store; HTTP CRUD endpoints write through it. pub schedules: schedule::ScheduleStore, + /// Event journal for ADR-012 mutation history. Optional for back-compat + /// with callers (like scheduled ingest tests) that don't wire it yet. + /// When present, successful ingests emit a record_ingest event — closes + /// P9-001 on the file-upload path. (2026-04-23) + pub journal: Option, } /// Push `DatasetAppended` triggers for every HNSW index bound to this @@ -136,6 +142,22 @@ async fn ingest_file( Ok(result) => { if !result.deduplicated { notify_agent_on_append(&state, &result.dataset_name).await; + // P9-001 fix (2026-04-23): emit a mutation event on every + // non-deduplicated ingest. Dedup no-ops don't need events + // (ADR-020 register() is already idempotent on same fingerprint). 
+ if let Some(ref journal) = state.journal { + if let Err(e) = journal.record_ingest( + &result.dataset_name, + result.rows as usize, + "ingest_api", + &filename, + ).await { + tracing::warn!( + "journal record_ingest failed for '{}': {}", + result.dataset_name, e, + ); + } + } } if result.deduplicated { Ok((StatusCode::OK, Json(result))) diff --git a/crates/journald/src/journal.rs b/crates/journald/src/journal.rs index 12e70a9..9019238 100644 --- a/crates/journald/src/journal.rs +++ b/crates/journald/src/journal.rs @@ -5,7 +5,7 @@ /// Storage: events buffer in memory, flush to Parquet periodically. /// Query: load Parquet files, filter by entity/field/actor/time. -use arrow::array::{ArrayRef, RecordBatch, StringArray, UInt64Array}; +use arrow::array::{ArrayRef, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use chrono::{DateTime, Utc}; use object_store::ObjectStore; diff --git a/crates/queryd/src/service.rs b/crates/queryd/src/service.rs index 5f1a8bf..7f0028c 100644 --- a/crates/queryd/src/service.rs +++ b/crates/queryd/src/service.rs @@ -9,7 +9,6 @@ use axum::{ }; use serde::{Deserialize, Serialize}; -use crate::cache::CacheStats; use crate::context::QueryEngine; use crate::delta; use crate::paged::ResultStore; diff --git a/crates/queryd/src/workspace.rs b/crates/queryd/src/workspace.rs index f127af0..20ed1a6 100644 --- a/crates/queryd/src/workspace.rs +++ b/crates/queryd/src/workspace.rs @@ -2,15 +2,12 @@ /// Each workspace tracks an agent's activity on a specific contract or search, /// with daily/weekly/monthly tiers and instant handoff capability. -use arrow::array::{ArrayRef, RecordBatch, StringArray, Int64Array}; -use arrow::datatypes::{DataType, Field, Schema}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::RwLock; -use crate::delta; use object_store::ObjectStore; /// Retention tier for workspace data. 
diff --git a/crates/truth/Cargo.toml b/crates/truth/Cargo.toml new file mode 100644 index 0000000..dbdbabb --- /dev/null +++ b/crates/truth/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "truth" +version = "0.1.0" +edition = "2024" + +[dependencies] +serde = { workspace = true } +serde_json = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } \ No newline at end of file diff --git a/crates/truth/src/lib.rs b/crates/truth/src/lib.rs new file mode 100644 index 0000000..0fd57c3 --- /dev/null +++ b/crates/truth/src/lib.rs @@ -0,0 +1,523 @@ +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct TruthRule { + pub id: String, + pub task_class: String, + pub description: String, + pub condition: RuleCondition, + pub action: RuleAction, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(tag = "type")] +pub enum RuleCondition { + Always, + FieldEquals { field: String, value: String }, + FieldMismatch { field: String, value: String }, + FieldEmpty { field: String }, + FieldGreater { field: String, threshold: i64 }, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(tag = "type")] +pub enum RuleAction { + Pass, + Reject { message: String }, + Redact { fields: Vec<String> }, + Block { message: String }, +} + +#[derive(Default)] +pub struct TruthStore { + rules: HashMap<String, Vec<TruthRule>>, +} + +impl TruthStore { + pub fn new() -> Self { + Self::default() + } + + pub fn add_rule(&mut self, rule: TruthRule) { + self.rules + .entry(rule.task_class.clone()) + .or_default() + .push(rule); + } + + pub fn get_rules(&self, task_class: &str) -> Vec<&TruthRule> { + self.rules + .get(task_class) + .map(|v| v.iter().collect()) + .unwrap_or_default() + } + + /// Legacy API: returns the list of actions registered for a task class + /// without evaluating conditions. Retained for backward compatibility + /// with callers that only want the action catalog.
New callers should + /// prefer `evaluate()`, which actually walks `RuleCondition` against + /// a context and reports per-rule pass/fail. + pub fn check(&self, task_class: &str) -> Vec<RuleAction> { + let rules = self.get_rules(task_class); + rules + .into_iter() + .map(|r| r.action.clone()) + .collect() + } + + /// Evaluate every rule registered for `task_class` against `ctx`, + /// returning one `RuleOutcome` per rule. `passed = true` means the + /// rule's `condition` held; the rule's action is still attached so + /// callers can distinguish "passed and therefore no-op" (RuleAction::Pass) + /// from "passed and apply Redact". `passed = false` means the condition + /// failed — callers should treat the attached action as the enforcement + /// response (Reject/Block). + /// + /// Fixed P42-001 (2026-04-23): previously `check()` returned all actions + /// unconditionally — the `RuleCondition` field was ignored. Now every + /// rule is actually walked against the provided context. + pub fn evaluate(&self, task_class: &str, ctx: &serde_json::Value) -> Vec<RuleOutcome> { + self.get_rules(task_class) + .into_iter() + .map(|r| RuleOutcome { + rule_id: r.id.clone(), + passed: evaluate_condition(&r.condition, ctx), + action: r.action.clone(), + }) + .collect() + } +} + +/// Result of evaluating one rule against a context. `passed` reports +/// whether the condition held; `action` is the rule's declared action +/// regardless (callers decide how to apply it based on `passed`).
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct RuleOutcome { + pub rule_id: String, + pub passed: bool, + pub action: RuleAction, +} + +fn evaluate_condition(cond: &RuleCondition, ctx: &serde_json::Value) -> bool { + match cond { + RuleCondition::Always => true, + RuleCondition::FieldEquals { field, value } => { + field_as_string(ctx, field) + .map(|s| s == *value) + .unwrap_or(false) + } + RuleCondition::FieldMismatch { field, value } => { + field_as_string(ctx, field) + .map(|s| s != *value) + .unwrap_or(false) + } + RuleCondition::FieldEmpty { field } => { + match lookup(ctx, field) { + None => true, + Some(v) => v.is_null() || v.as_str().map(|s| s.is_empty()).unwrap_or(false), + } + } + RuleCondition::FieldGreater { field, threshold } => { + lookup(ctx, field) + .and_then(|v| v.as_i64().or_else(|| v.as_f64().map(|f| f as i64))) + .map(|n| n > *threshold) + .unwrap_or(false) + } + } +} + +/// Walk a dot-separated path through a serde_json::Value. `"worker.status"` +/// → `ctx["worker"]["status"]`. Returns None if any segment is missing or +/// a non-object is encountered mid-path. 
+fn lookup<'a>(ctx: &'a serde_json::Value, path: &str) -> Option<&'a serde_json::Value> { + let mut cur = ctx; + for seg in path.split('.') { + cur = cur.get(seg)?; + } + Some(cur) +} + +fn field_as_string(ctx: &serde_json::Value, path: &str) -> Option<String> { + lookup(ctx, path).and_then(|v| match v { + serde_json::Value::String(s) => Some(s.clone()), + serde_json::Value::Bool(b) => Some(b.to_string()), + serde_json::Value::Number(n) => Some(n.to_string()), + _ => None, + }) +} + +pub fn default_truth_store() -> TruthStore { + let mut store = TruthStore::new(); + + store.add_rule(TruthRule { + id: "worker-active".to_string(), + task_class: "staffing.fill".to_string(), + description: "Worker must be active".to_string(), + condition: RuleCondition::FieldEquals { + field: "worker.status".to_string(), + value: "active".to_string(), + }, + action: RuleAction::Pass, + }); + + store.add_rule(TruthRule { + id: "client-not-blacklisted".to_string(), + task_class: "staffing.fill".to_string(), + description: "Worker cannot be blacklisted for client".to_string(), + condition: RuleCondition::FieldEquals { + field: "worker.client_blacklisted".to_string(), + value: "false".to_string(), + }, + action: RuleAction::Pass, + }); + + store.add_rule(TruthRule { + id: "deadline-required".to_string(), + task_class: "staffing.fill".to_string(), + description: "Contract must have deadline".to_string(), + condition: RuleCondition::FieldEmpty { + field: "contract.deadline".to_string(), + }, + action: RuleAction::Reject { + message: "Contract deadline is required".to_string(), + }, + }); + + store.add_rule(TruthRule { + id: "budget-required".to_string(), + task_class: "staffing.fill".to_string(), + description: "Budget must be non-negative".to_string(), + condition: RuleCondition::FieldGreater { + field: "contract.budget_per_hour_max".to_string(), + threshold: 0, + }, + action: RuleAction::Pass, + }); + + store.add_rule(TruthRule { + id: "pii-redact".to_string(), + task_class:
"staffing.any".to_string(), + description: "Redact PII before cloud calls".to_string(), + condition: RuleCondition::Always, + action: RuleAction::Redact { + fields: vec!["ssn".to_string(), "salary".to_string()], + }, + }); + + store +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn truth_store_new_is_empty() { + let store = TruthStore::new(); + assert!(store.rules.is_empty()); + } + + #[test] + fn add_rule_inserts_into_correct_task_class() { + let mut store = TruthStore::new(); + store.add_rule(TruthRule { + id: "test-rule".to_string(), + task_class: "test.task".to_string(), + description: "Test rule".to_string(), + condition: RuleCondition::Always, + action: RuleAction::Pass, + }); + let rules = store.get_rules("test.task"); + assert_eq!(rules.len(), 1); + assert_eq!(rules[0].id, "test-rule"); + } + + #[test] + fn get_rules_returns_empty_for_unknown_class() { + let store = TruthStore::new(); + let rules = store.get_rules("unknown.class"); + assert!(rules.is_empty()); + } + + #[test] + fn check_returns_actions_for_task_class() { + let mut store = TruthStore::new(); + store.add_rule(TruthRule { + id: "a1".to_string(), + task_class: "test".to_string(), + description: "A1".to_string(), + condition: RuleCondition::Always, + action: RuleAction::Pass, + }); + store.add_rule(TruthRule { + id: "a2".to_string(), + task_class: "test".to_string(), + description: "A2".to_string(), + condition: RuleCondition::Always, + action: RuleAction::Reject { + message: "test reject".to_string(), + }, + }); + let actions = store.check("test"); + assert_eq!(actions.len(), 2); + } + + #[test] + fn rule_condition_serialize_always() { + let cond = RuleCondition::Always; + let json = serde_json::to_string(&cond).unwrap(); + assert!(json.contains(r#""type":"Always"#)); + } + + #[test] + fn rule_condition_serialize_field_equals() { + let cond = RuleCondition::FieldEquals { + field: "foo".to_string(), + value: "bar".to_string(), + }; + let json = 
serde_json::to_string(&cond).unwrap(); + assert!(json.contains(r#""type":"FieldEquals""#)); + assert!(json.contains(r#""field":"foo""#)); + assert!(json.contains(r#""value":"bar""#)); + } + + #[test] + fn rule_action_serialize_redact() { + let action = RuleAction::Redact { + fields: vec!["ssn".to_string()], + }; + let json = serde_json::to_string(&action).unwrap(); + assert!(json.contains(r#""type":"Redact""#)); + assert!(json.contains("ssn")); + } + + #[test] + fn rule_action_serialize_reject() { + let action = RuleAction::Reject { + message: "test".to_string(), + }; + let json = serde_json::to_string(&action).unwrap(); + assert!(json.contains(r#""type":"Reject""#)); + } + + #[test] + fn default_truth_store_has_staffing_rules() { + let store = default_truth_store(); + let fill_rules = store.get_rules("staffing.fill"); + assert!(!fill_rules.is_empty()); + let any_rules = store.get_rules("staffing.any"); + assert!(!any_rules.is_empty()); + } + + #[test] + fn multiple_rules_same_task_class() { + let mut store = TruthStore::new(); + for i in 0..5 { + store.add_rule(TruthRule { + id: format!("rule-{}", i), + task_class: "test".to_string(), + description: format!("Rule {}", i), + condition: RuleCondition::Always, + action: RuleAction::Pass, + }); + } + let rules = store.get_rules("test"); + assert_eq!(rules.len(), 5); + } + + #[test] + fn truth_rule_clone_preserves_data() { + let rule = TruthRule { + id: "clone-test".to_string(), + task_class: "clone.task".to_string(), + description: "Clone test".to_string(), + condition: RuleCondition::FieldEquals { + field: "x".to_string(), + value: "y".to_string(), + }, + action: RuleAction::Block { + message: "blocked".to_string(), + }, + }; + let cloned = rule.clone(); + assert_eq!(cloned.id, rule.id); + assert_eq!(cloned.condition, rule.condition); + assert_eq!(cloned.action, rule.action); + } + + #[test] + fn field_greater_condition_parse() { + let json = r#"{"type":"FieldGreater","field":"count","threshold":10}"#; + let cond: 
RuleCondition = serde_json::from_str(json).unwrap(); + match cond { + RuleCondition::FieldGreater { field, threshold } => { + assert_eq!(field, "count"); + assert_eq!(threshold, 10); + } + _ => panic!("Expected FieldGreater"), + } + } + + #[test] + fn block_action_blocks_with_message() { + let action = RuleAction::Block { + message: "Rate limited".to_string(), + }; + let json = serde_json::to_string(&action).unwrap(); + assert!(json.contains("Rate limited")); + } + + #[test] + fn empty_store_check_returns_empty() { + let store = TruthStore::new(); + let actions = store.check("empty.class"); + assert!(actions.is_empty()); + } + + // ── P42-001 evaluate() tests — actually walk RuleCondition ── + + fn fill_store() -> TruthStore { + let mut s = TruthStore::new(); + s.add_rule(TruthRule { + id: "active".into(), + task_class: "t".into(), + description: "must be active".into(), + condition: RuleCondition::FieldEquals { + field: "worker.status".into(), + value: "active".into(), + }, + action: RuleAction::Reject { + message: "worker not active".into(), + }, + }); + s.add_rule(TruthRule { + id: "deadline".into(), + task_class: "t".into(), + description: "deadline required".into(), + condition: RuleCondition::FieldEmpty { + field: "contract.deadline".into(), + }, + action: RuleAction::Reject { + message: "missing deadline".into(), + }, + }); + s.add_rule(TruthRule { + id: "budget".into(), + task_class: "t".into(), + description: "budget positive".into(), + condition: RuleCondition::FieldGreater { + field: "contract.budget".into(), + threshold: 0, + }, + action: RuleAction::Block { + message: "budget must be positive".into(), + }, + }); + s + } + + #[test] + fn evaluate_field_equals_pass_on_match() { + let s = fill_store(); + let ctx = serde_json::json!({"worker": {"status": "active"}}); + let o = s.evaluate("t", &ctx); + let active = o.iter().find(|r| r.rule_id == "active").unwrap(); + assert!(active.passed, "active condition should hold"); + } + + #[test] + fn 
evaluate_field_equals_fail_on_mismatch() { + let s = fill_store(); + let ctx = serde_json::json!({"worker": {"status": "terminated"}}); + let o = s.evaluate("t", &ctx); + let active = o.iter().find(|r| r.rule_id == "active").unwrap(); + assert!(!active.passed, "terminated should fail active condition"); + } + + #[test] + fn evaluate_field_equals_fail_on_missing() { + let s = fill_store(); + let ctx = serde_json::json!({}); + let o = s.evaluate("t", &ctx); + let active = o.iter().find(|r| r.rule_id == "active").unwrap(); + assert!(!active.passed, "missing worker.status should fail"); + } + + #[test] + fn evaluate_field_empty_pass_when_absent() { + let s = fill_store(); + // FieldEmpty passes when the field is missing/null/empty string. + // Deadline rule says "field empty means action fires" — so passed=true + // here means the rule's condition held (deadline IS empty). + let ctx = serde_json::json!({}); + let o = s.evaluate("t", &ctx); + let deadline = o.iter().find(|r| r.rule_id == "deadline").unwrap(); + assert!(deadline.passed); + } + + #[test] + fn evaluate_field_empty_fail_when_present() { + let s = fill_store(); + let ctx = serde_json::json!({"contract": {"deadline": "2026-05-01"}}); + let o = s.evaluate("t", &ctx); + let deadline = o.iter().find(|r| r.rule_id == "deadline").unwrap(); + assert!(!deadline.passed, "non-empty deadline should fail FieldEmpty check"); + } + + #[test] + fn evaluate_field_greater_pass_and_fail() { + let s = fill_store(); + let ctx_ok = serde_json::json!({"contract": {"budget": 100}}); + let ctx_bad = serde_json::json!({"contract": {"budget": 0}}); + let ok = s.evaluate("t", &ctx_ok); + let bad = s.evaluate("t", &ctx_bad); + assert!(ok.iter().find(|r| r.rule_id == "budget").unwrap().passed); + assert!(!bad.iter().find(|r| r.rule_id == "budget").unwrap().passed); + } + + #[test] + fn evaluate_always_condition_passes_unconditionally() { + let mut s = TruthStore::new(); + s.add_rule(TruthRule { + id: "always".into(), + task_class: 
"x".into(), + description: "".into(), + condition: RuleCondition::Always, + action: RuleAction::Pass, + }); + let o = s.evaluate("x", &serde_json::json!(null)); + assert!(o[0].passed); + } + + #[test] + fn evaluate_preserves_action_regardless_of_outcome() { + let s = fill_store(); + let ctx = serde_json::json!({"worker": {"status": "active"}}); + let o = s.evaluate("t", &ctx); + let active = o.iter().find(|r| r.rule_id == "active").unwrap(); + // Action is attached whether the rule passed or not — the consumer + // decides how to use it. + assert_eq!( + active.action, + RuleAction::Reject { + message: "worker not active".into() + } + ); + } + + #[test] + fn evaluate_on_unknown_task_class_returns_empty() { + let s = fill_store(); + let o = s.evaluate("nonexistent", &serde_json::json!({})); + assert!(o.is_empty()); + } + + #[test] + fn check_still_returns_actions_unconditionally_for_back_compat() { + // Legacy API should still behave the same — no condition walking. + let s = fill_store(); + let actions = s.check("t"); + assert_eq!(actions.len(), 3, "check returns one action per rule regardless of condition"); + } +} \ No newline at end of file diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs index 2f54920..b81d1ba 100644 --- a/crates/vectord/src/service.rs +++ b/crates/vectord/src/service.rs @@ -1381,7 +1381,7 @@ async fn activate_profile( let job_id = state.job_tracker.create_profile_activation(&profile_id).await; let job_id_for_response = job_id.clone(); let tracker = state.job_tracker.clone(); - let catalog = state.catalog.clone(); + let _catalog = state.catalog.clone(); let index_registry = state.index_registry.clone(); let bucket_registry = state.bucket_registry.clone(); let lance = state.lance.clone(); @@ -1396,7 +1396,7 @@ async fn activate_profile( let profile_bound = profile.bound_datasets.clone(); let profile_hnsw = profile.hnsw_config.clone(); let profile_backend = profile.vector_backend.clone(); - let profile_full = 
profile.clone(); + let _profile_full = profile.clone(); tokio::spawn(async move { let t0 = std::time::Instant::now(); diff --git a/crates/vectord/src/store.rs b/crates/vectord/src/store.rs index c220263..85269ef 100644 --- a/crates/vectord/src/store.rs +++ b/crates/vectord/src/store.rs @@ -2,9 +2,8 @@ /// Each embedding index is stored as: source, doc_id, chunk_idx, chunk_text, vector (binary blob). /// Vectors are stored as raw f32 bytes for compact storage and fast loading. -use arrow::array::{ArrayRef, BinaryArray, Float32Array, Int32Array, RecordBatch, StringArray}; +use arrow::array::{ArrayRef, BinaryArray, Int32Array, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; -use bytes::Bytes; use object_store::ObjectStore; use std::sync::Arc; diff --git a/docs/SCRUM_FIX_WAVE.md b/docs/SCRUM_FIX_WAVE.md new file mode 100644 index 0000000..2ae76a6 --- /dev/null +++ b/docs/SCRUM_FIX_WAVE.md @@ -0,0 +1,63 @@ +# Scrum Fix Wave — Phase-Sweep 2026-04-23 + +**Purpose:** Direct the scrum-master pipeline at concrete findings from the Phase 0→42 audit sweep, not at the high-level vision alone. Findings live in `data/_kb/phase_sweep_findings.jsonl` (19 items). + +## What the auditor expects you to produce per file + +For each file the scrum sees: concrete code-level suggestions that close the listed findings. Not rewrites for style. Not vision drift. Land the invariant or admit the checkbox was premature. + +## Meta-pattern to fix (read this first) + +The sweep surfaced **one root cause repeated across 5 phases**: primitives exist, cross-cutting enforcement doesn't. Auth, journal, access control, truth rules — the machinery is built, nothing calls it from the actual request path. + +One PR-cluster retires the pattern: +1. **Identity.** Auth middleware wires X-API-Key → extension `AgentIdentity { name, role, api_key_hash }`. (P5-001) +2. **Request pipeline.** `/query/sql` and `/tools/*/call` read `AgentIdentity`, pass into queryd + tools handlers. +3. 
**Access enforcement.** Handlers call `access.can_access()` / `masked_columns()` before returning data; log via `access.log_query()`. (P13-001) +4. **Mutation journaling.** Every ingest / delta-write / tombstone-add / catalog-register calls the corresponding `journal.record_*`. (P9-001) +5. **Truth enforcement.** `TruthStore::check()` rewritten to `evaluate(task_class, ctx) -> Vec<RuleOutcome>`, actually walking `RuleCondition` against context. (P42-001, P42-002) + +After this cluster lands, Phases 5, 9, 13, 42 become "truly shipped" rather than "machinery shipped." + +## Findings by severity + +### 🔴 High + +- **P9-001** `journald/src/journal.rs`, `crates/gateway/src/main.rs`, all mutation sites. Journal has zero internal callers. Every `ingestd::service::upload_file` success, every `queryd::delta::write_delta`, every `catalogd::tombstones::add`, every `catalogd::registry::register` should emit a journal event. Plus fix: `event_counter` resets on process restart — seed from max existing `event_id` on rebuild or switch to UUID v7. +- **P13-001** `crates/gateway/src/main.rs` + `crates/queryd/src/service.rs` + `crates/gateway/src/tools/service.rs`. `AccessControl.can_access` / `masked_columns` / `log_query` have zero callers. Query path ignores role. Phase 17's `profile_scoped_search` (service.rs:1641) is the template — copy that shape. +- **P42-001** `crates/truth/src/lib.rs:56`. `TruthStore::check(task_class)` ignores `RuleCondition` entirely. Signature needs `evaluate(task_class, ctx: &serde_json::Value) -> Vec<RuleOutcome>` that actually walks conditions. Update all 14 tests to exercise fail/pass semantics, not just storage. + +### 🟡 Medium + +- **P5-001** `crates/gateway/src/auth.rs` + `crates/gateway/src/main.rs:222-233`. `api_key_auth` is `#[allow(dead_code)]`. Wire with `axum::middleware::from_fn_with_state(api_key, auth::api_key_auth)` on protected routes. `/health` stays public. +- **P10-001** Legacy datasets (including `candidates`, 2.47M rows) have no PII flags. 
Add `POST /catalog/resync-metadata` mirroring `/catalog/resync-missing`. +- **P14-001** `crates/ingestd/src/schema_evolution.rs`. Module has 5 passing tests and zero callers. Add `POST /catalog/datasets/by-name/{name}/schema-diff`. When ADR-020 `register()` returns 409, include a `migration_rules[]` body. +- **P20-001** `config/models.json` is spec-only — never loaded by Rust. Load into `shared::model_matrix::ModelMatrix` at startup; delegate `aibridge::context::context_window_for` to matrix. +- **P21-001** `generate_continuable` has one prod caller (`rag.rs:171`). Audit every `ai_client.generate()` site. Convert the truncation-prone + thinking-empty-prone sites (auditor paths, reranker, autotune) to `generate_continuable`. +- **P39-001** `ProviderAdapter` trait + adapters ship, zero callers. `/v1/chat` in `v1/mod.rs:152` uses hardcoded `match req.provider`. Replace with adapter dispatch. +- **P40-001** `config/routing.toml` is spec-only. `RoutingEngine::new` has no callers. Add `RoutingEngine::from_toml(path)`; `V1State` carries `routing: Arc<RoutingEngine>`; `/v1/chat` consults it before provider match. +- **P42-002** Truth has no enforcement site. `/v1/chat` or execution_loop should call `truth.evaluate(task_class, ctx)` post-response. + +### 🟢 Low + +- **P1-001** `crates/storaged/src/federation_service.rs:34-35`. Bucket-qualified routes wire only PUT + GET. Add DELETE + LIST on `/buckets/{bucket}/objects/{*key}`. +- **P4-001** UI deploy stale. Either rebuild (`just ui-build` + restart `lakehouse-ui.service`) or amend `PHASES.md` to note pre-Phase-9 drift. +- **P7-001** `vectord::index_registry`. Orphan index registrations (parquet deleted) still list in `/vectors/indexes`. Add a startup sweep + `POST /vectors/resync-missing`. +- **P12-001** `crates/gateway/src/tools/service.rs`. Audit row has `row_count=null`. Propagate `QueryResponse.row_count` + add `latency_ms`. +- **P20-002** `crates/gateway/src/v1/mod.rs`. No model-prefix auto-routing. 
Caller must set `provider` explicitly. Tie to P39-001 + P40-001 fix. +- **P21-002** `crates/aibridge/src/context.rs`. `context_window_for` hardcoded HashMap duplicates `config/models.json`. Delegate once P20-001 lands. +- **P38-001** `crates/gateway/src/execution_loop/mod.rs:1523`. Test `executor_prompt_includes_surfaced_candidates` fails on "W-1 Alice Smith" assertion. Either update prompt formatter or update test. +- **P40-002** Cost gating absent. Add `cost_ceiling_usd_per_hour` to `RoutingEngine` rule, pre-request check against `Usage.by_provider`. + +## What "done" looks like for each file the scrum touches + +- Name the specific finding(s) the file participates in. +- Show code-level diff (minimum: function signature + first 5 lines of body) for the fix. +- Call out any test that needs updating + one new test that would catch the bug on reintroduction. +- Flag if the fix is too big for one PR and should be split (most of the cross-cutting cluster wants a shared identity/middleware PR first, per-service PRs after). + +## Out of scope for this wave + +- New features beyond what the findings describe. +- UI work (Phase 4 stale is known). +- DevOps / long-horizon domain work (Terraform/Ansible — Phase 43+). diff --git a/docs/SCRUM_FORENSIC_PROMPT.md b/docs/SCRUM_FORENSIC_PROMPT.md new file mode 100644 index 0000000..e4c258f --- /dev/null +++ b/docs/SCRUM_FORENSIC_PROMPT.md @@ -0,0 +1,198 @@ +# Scrum Master PR Loop — Forensic Validation Prompt (iter 2+) + +Adopted 2026-04-23 from J. Replaces the default scrum prompt starting iter 2. Iter 1 used the softer "fix-wave" framing; iter 2 onward uses this adversarial one. + +--- + +You are acting as an adversarial **Scrum Master + Systems Auditor**. + +Your job is to **prove whether this system actually works**, not to describe it. 
+ +You are auditing a system with the following architecture: + +- AI Gateway with per-model adapters +- Output normalization + schema validation layer +- Execution pipeline (Terraform / Ansible / shell) +- Task-scoped execution memory (S3 + Apache Arrow/Parquet) +- Relevance orchestration (context filtering, freshness validation, fact extraction) +- Local → Cloud fallback loop for failed tasks +- Iterative repair loop with stored execution evidence + +--- + +## PRIMARY OBJECTIVE + +Determine if the system is: + +1. Executable (real, not pseudocode) +2. Aligned with PRD contracts +3. Deterministic enough to trust +4. Protected from model output drift +5. Actually closing the loop (fail → repair → reuse) + +--- + +## NON-NEGOTIABLE RULES + +- Do NOT summarize +- Do NOT explain architecture unless tied to failure +- Do NOT assume code works — verify +- Every claim MUST reference files, functions, or execution evidence +- If something is unclear → mark as FAIL + +--- + +## AUDIT PASSES (RUN ALL) + +### 1. PSEUDOCODE / FAKE IMPLEMENTATION DETECTION +Find any: +- TODO / stub / placeholder +- hardcoded outputs where AI should decide +- mocked execution paths +- fake success returns + +Output exact file + line references. + +### 2. PRD CONTRACT VALIDATION +Verify implementation exists for: + +- Gateway routing logic +- Per-model adapters +- Output normalization (strip, parse, canonicalize) +- Schema validation layer +- Repair loop (retry with modification) +- Raw output storage +- Execution memory persistence +- Retrieval based on prior failures +- Relevance filtering (freshness / protocol awareness) +- Execution permission gate + +For each component: +- status: implemented | partial | missing +- include file references + +### 3. 
NORMALIZATION + VALIDATION PIPELINE +Prove that: + +- Raw model output is NEVER executed directly +- JSON extraction is enforced +- Unknown fields are rejected or handled +- Schema validation blocks bad output +- Repair loop triggers on failure + +If any path bypasses validation → FAIL + +### 4. FAILURE → CLOUD → REPAIR LOOP +Trace the loop: + +- Local model fails +- Failure is classified +- Context is packaged +- Cloud model returns corrective instruction +- Local model retries +- Result is validated +- Successful pattern is stored + +If any step is missing or non-deterministic → FAIL + +### 5. EXECUTION MEMORY (S3 / ARROW) +Verify: + +- Raw runs are stored (input, raw output, normalized output) +- Failures are recorded with signatures +- Successful retries are recorded +- Retrieval pulls based on: + - task similarity + - failure signature + - execution success history + +If memory is only logs and not reused → FAIL + +### 6. RELEVANCE ORCHESTRATION +Verify: + +- Context is filtered before model input +- Freshness or version awareness exists +- Fact extraction reduces noise +- Context inclusion is explainable + +If system blindly injects context → FAIL + +### 7. EXECUTION SAFETY +Verify: + +- No shell / terraform / ansible execution without validation gate +- No direct model-to-command execution +- Clear permission boundary exists + +If AI can execute commands unchecked → CRITICAL FAIL + +### 8. TESTING + EVIDENCE +Find: + +- real tests (not mocks) +- execution logs +- validation results +- success/failure traces + +If no proof of execution → FAIL + +--- + +## OUTPUT FORMAT (STRICT) + +Each finding in any array MUST include a `confidence` field (integer 0–100). The confidence represents your self-assessed probability that the finding is correct and actionable. Low confidence is valuable — do not inflate. A finding with confidence < 50 is still recorded (it signals investigation needed) but downstream consumers will weight it less. 
+ +```json +{ + "verdict": "pass | fail | needs_patch", + "critical_failures": [ + {"id": "CF-1", "file": "path:line", "description": "...", "confidence": 95} + ], + "pseudocode_flags": [ + {"file": "path:line", "reason": "...", "confidence": 88} + ], + "prd_mismatches": [ + {"component": "...", "status": "partial|missing", "file_ref": "...", "confidence": 80} + ], + "broken_pipelines": [ + {"pipeline": "...", "break_point": "...", "confidence": 70} + ], + "missing_components": [ + {"component": "...", "required_by": "PRD section X", "confidence": 85} + ], + "risk_points": [ + {"area": "...", "risk": "...", "confidence": 60} + ], + "verified_components": [ + {"component": "...", "evidence": "file:line or test name", "confidence": 95} + ], + "evidence": { + "files_inspected": [], + "execution_paths_traced": [], + "tests_found": [], + "tests_missing": [] + }, + "required_next_actions": [ + {"action": "...", "file_hint": "...", "confidence": 75} + ] +} +``` + +**Calibration guide:** +- 90–100: pattern seen repeatedly in shipped code; mechanical; low regression risk +- 70–89: confident in direction, API shape or naming may vary +- 50–69: plausible fix but may not match conventions, could cascade +- <50: genuinely uncertain — record anyway so downstream knows to investigate + +--- + +## FINAL DIRECTIVE + +You are not reviewing code. + +You are answering: + +> "Can this system be trusted to execute real-world DevOps tasks without hallucinating, bypassing validation, or collapsing under edge cases?" + +If the answer is not provably yes, the verdict is FAIL. diff --git a/docs/SCRUM_LOOP_NOTES.md b/docs/SCRUM_LOOP_NOTES.md new file mode 100644 index 0000000..0cbe538 --- /dev/null +++ b/docs/SCRUM_LOOP_NOTES.md @@ -0,0 +1,94 @@ +# Scrum Loop Notes — Observations across iterations + +Running notes from the 6x scrum loop (started 2026-04-23). One section per iteration. 
"Fix next loop" items accumulate here so the next scrum run picks them up — do not fix inline during a running iteration. + +## Iteration tracker + +| Iter | Status | Scrum started | Scrum finished | Fixes applied | Build green | Re-sweep findings | +|---|---|---|---|---|---|---| +| 1 | 🟡 scrum running | 2026-04-23 (brqz3jxgo) | - | - | - | - (baseline = 19) | +| 2 | 🟡 scrum running | 2026-04-23 (bzs6miehr) | - | - | - | pending | +| 3 | ⬜ queued | - | - | - | - | - | +| 4 | ⬜ queued | - | - | - | - | - | +| 5 | ⬜ queued | - | - | - | - | - | +| 6 | ⬜ queued | - | - | - | - | - | + +## Iteration 1 — in flight + +**Target files:** 21 source files extracted from the 19 Phase 0→42 findings. +**Ladder:** cloud-first per feedback_scrum_cloud_first.md (gpt-oss:120b → qwen3.5:397b → devstral-2:123b → mistral-large-3:675b → gpt-oss:20b → qwen3.5:latest). +**Proposal:** `docs/SCRUM_FIX_WAVE.md` (via LH_SCRUM_PROPOSAL env). + +### Fix next loop — observations accumulating + +*Add items here as the scrum runs. Keep each item to one line with a pointer to file + reason. Don't fix inline.* + +**[ITER 2 OBSERVATIONS]** +- **[FORENSIC vs thin-detector mismatch]** iter 2 first attempt on auth.rs triggered "thin/unstructured" rejection at 2031 chars. Cause: forensic prompt asks for strict JSON verdict output, scrum's thin-answer detector expects markdown with score + table. The detector logic needs a forensic-aware branch OR the forensic prompt should preserve markdown output shape while still applying the 8 audit passes. File: `tests/real-world/scrum_master_pipeline.ts`, function that scores accepted vs thin. Fix next loop: add `isForensicAcceptable(text)` that checks for `"verdict"` field + at least one of `critical_failures`/`pseudocode_flags`/`required_next_actions`. + +- **[OBSERVATION metric]** 11 `#[allow(dead_code)]` markers cluster in crates/gateway/{auth,access,tools/registry,execution_loop,v1/truth} + crates/aibridge/providers/openrouter + crates/vectord/service. 
Each one maps cleanly to an audit finding. The `execution_loop/mod.rs:85` comment even admits it: `// reserved for Phase 42 truth-gate (step 6)`. **Metric:** fewer `#[allow(dead_code)]` markers per iteration = less pseudo-real code. Baseline = 11. Target after iter 6: ≤ 2 (only ones that are genuinely optional helpers). +- **[OBSERVATION gateway-as-router]** scrum_master_pipeline currently fetches `GATEWAY/v1/chat` directly but its LADDER is still a hardcoded const. Should be driven by `config/routing.toml` via RoutingEngine (blocked by P40-001 until iter 1 lands fix). File: `tests/real-world/scrum_master_pipeline.ts:53`. +- **[OBSERVATION file-type]** iter 1 target list is `.rs` only. Iter 2 must include `tests/multi-agent/*.ts` (executor, observer, kb consumer), `auditor/checks/*.ts`, `sidecar/sidecar/*.py`, and `config/*.{json,toml}`. The scrum pipeline handles any text file. +- **[OBSERVATION triangulation]** auth.rs scrum review (first file out) independently identified P5-001 exactly: flagged `#[allow(dead_code)]`, scored alignment 4/10, prescribed an `AgentIdentity { name, role, hashed_key }` type matching SCRUM_FIX_WAVE. Audit + scrum converged without seeing each other's output — strong signal the findings are real, not artifacts of one method. +- **[RULE from J 2026-04-23]** Wiring-gap fixes happen AFTER the scrum completes, not inline. Accumulate observations, apply in one coherent pass. Matches feedback_audit_findings_log.md. +- **[OBSERVATION oversize-file]** `crates/gateway/src/execution_loop/mod.rs` is 80,901 chars → 24 shards (scrum pipeline's tree-split kicks in at 6KB threshold). A single-file-of-this-size for an execution module is itself a smell — it's the Phase 43 scaffold we kept piling into. Split candidates: executor prompts, reviewer prompts, budget accounting, truth-gate hook, fixtures. Not a fix for this iter, but queue for iter 3. 
+- **[OBSERVATION cost-tracking]** zero escalations across first 8 files — 0.0 dollar cloud spend above the minimum. Per-request cost on gpt-oss:120b via Ollama Cloud is effectively $0 in this environment (self-hosted or flat-rate per the llm_team_config key). If we add per-iter token totals to scrum_loop_metrics.jsonl we can show trajectory even when cost is flat. + +**[ITER 3 OBSERVATIONS]** +- **[LARGE-HANDLER thin]** kimi-k2:1t went thin on `crates/gateway/src/tools/service.rs` (~11KB, single large axum handler). deepseek-v3.1:671b rescued on attempt 2 (92.8s, 5408 chars, accepted). Pattern: very large routing files challenge even 1T models. Fix next loop: raise tree-split threshold for handler files OR shard by function boundaries not byte count. +- **[WRITE-ONLY INDICATORS STILL]** 8 KB files write-only after iter 3: `audits.jsonl` (189 rows/1.9MB — biggest waste), `phase_sweep_findings.jsonl` (35), `distilled_facts.jsonl` (17), `human_overrides.jsonl` (8), `classifications.jsonl` (5), `scrum_loop_metrics.jsonl` (2), `distilled_config_hints.jsonl` (2), `distilled_procedures.jsonl` (2). Fix next loop: extend `auditor/checks/kb_query.ts` to surface these on PR review, OR build a single "KB health dashboard" reader. +- **[ISOLATED AUTOTUNE]** `crates/vectord/src/agent.rs` has zero refs to scrum/audit/human_override KB. It tunes HNSW but doesn't know which indexes are attached to files the scrum flagged. Fix next loop: add `TriggerEvent::CodeReviewFlag { index_name, gradient_tier }` that biases trial budget toward indexes of flagged files. +- **[CONFIDENCE WELL-CALIBRATED]** kimi-k2:1t confidences span 75-98 across iter3 files, cluster 85-95. No 100% inflation; min 75 = honest edge-case uncertainty. Good signal — the model is calibrating, not performance-signaling. Do NOT "fix" this by prompt-boosting confidence. +- **[SCRUM→OBSERVER WIRED]** 2026-04-24 fix landed in iter-3 source but applies to iter 4+ (bun loaded code pre-edit). 
Verify next loop: `curl :3800/stats` should show `by_source.scrum > 0` after iter 4 runs. If zero, observer /event payload schema mismatch. +- **[LOW-CONFIDENCE BLOCK flag WORKING]** `crates/storaged/src/federation_service.rs` scored 3/10 with confidences **[40, 35, 50]** → avg 42, min 35. Permission Gradient correctly classifies this file as `block` tier (min <50) — exactly the signal we want. kimi-k2:1t is honestly saying "I'm not confident about these suggestions." Human must review before any auto-apply. Validates Layer #6 working end-to-end. **DO NOT fix this** — it's the system protecting us. +- **[CONFIDENCE-COLUMN DROPOUT]** `crates/gateway/src/main.rs` accepted review has 0 confidences extracted despite the prompt requiring them. Either kimi-k2:1t omitted the Confidence column in its table format (regex didn't match), or the specific content ran into token budget and confidence got trimmed. Fix next loop: strengthen the prompt instruction to say "if you omit the Confidence column you MUST provide Confidence: NN% inline after every suggestion" AND post-accept sanity-check the row (reject if findings_count=0 across 5+ suggestions). +- **[SCORE-PARSER MISS on some files]** `crates/queryd/src/service.rs` and `crates/queryd/src/delta.rs` reviews return score='?' — regex `(\d)\s*/\s*10` isn't matching. Models may have written score as "Score: 4.5/10" or "4/10 (mid)" which the parser can't see. Fix next loop: regex accepts decimals + surrounding context. Not critical — confidences still landed. +- **[SCORE DISTRIBUTION]** across 12 iter-3 files: 3/10 (7 files), 4/10 (2), 6/10 (1), ?/? (2). vectord/index_registry.rs alone scored 6/10 — meaningfully higher. That's worth a closer look: what does that file do right? If we find the pattern there and replicate to peers, scores should rise. This is learning-loop signal. 
+ +- **[FORENSIC JSON OUTPUT IS RICHER THAN MARKDOWN]** kimi-k2:1t emitted strict JSON-schema output (matching SCRUM_FORENSIC_PROMPT.md) for `vectord/src/index_registry.rs`. Structure separates critical_failures / pseudocode_flags / prd_mismatches / broken_pipelines / missing_components / risk_points / **verified_components** / required_next_actions — each with confidence per entry. Markdown reviews never captured `verified_components` (what's confirmed working). Fix next loop: scrum pipeline detects JSON format in accepted output and extracts structured fields into KB row (`verified_components_count`, `critical_failures_count`, `missing_components_count`, `verdict`). Downstream consumers then filter PR files by `verdict: needs_patch` or prioritize by `missing_components_count`. **This is the biggest next-iter KB quality jump available** — goes from "confidence as a scalar" to "confidence per specific claim with evidence field." Verified_components in particular is the PROOF-OF-LIFE signal that tells us what's real — flipside of P9/P13/P42 gaps. + +- **[🔴 TREE-SPLIT REVIEW SHARD-LEAKAGE]** `ingestd/src/service.rs` (7 shards, 24.3KB) accepted output is titled "Forensic Audit Report – ... (shard 3)". The review covers only the Postgres-import path (shard 3). The `ingest_file` handler where my P9-001 fix lives (journal.record_ingest call) is in shard 1 or 2 — that reviewer never saw the fix. **`tree_split_fired: true` is supposed to mean the output is the reducer-merged summary of all shards**, but this review retained shard-specific scope. Either (a) the reduce step didn't integrate shard summaries, (b) the accepted attempt was one individual shard response that slipped past the reducer, or (c) the reducer prompt doesn't instruct the model to present the file holistically. 
**This is a real correctness bug** — it means file-level findings can be ghost-negative (fix applied, reviewer blind to it) and ghost-positive (gap exists in unreviewed shard, reviewer gives clean bill). Fix next loop: examine the reduce-step prompt in tree-split path, ensure accepted output comes from reduce step not from any individual shard. Validate by running on a file with a known fix and confirming the review notices it. + +**[ITER 4 OBSERVATIONS]** +- **[🔴 OBSERVER DROPS SCRUM RICH FIELDS]** Scrum→observer wiring works (by_source.scrum appears in /stats on iter 4 file 1). BUT observer.ts:262-283 `ObservedOp = {...}` literal only spreads known keys (endpoint, success, duration_ms, role, city, state, count, rescue_*). My scrum-specific fields (confidence_avg, confidence_min, gradient_tier, verdict, critical_failures_count, verified_components_count, missing_components_count, alignment_score, output_format, findings_count, attempts_made, thin_rejections) are silently discarded. Observer knows the scrum event happened but loses review-quality data. Fix next loop: add `metadata?: Record` passthrough on ObservedOp, or declare scrum-specific fields explicitly. Preferred: metadata passthrough so future sources (auditor, kb_extractor) land the same way. +- **[SCHEMA V4 LANDING CORRECTLY]** main.rs iter-4 KB row has alignment_score=3 (decimal parser fixed), output_format="markdown" (classifier works), verdict=null (correct — only forensic_json produces verdict), confidence_avg=91 (previous iter got 0 due to column dropout — run-to-run variance self-healed this). Structured counters (critical/verified/missing) = 0 on markdown rows, populated on forensic_json rows. +- **[RING BUFFER EVICTING LANGFUSE]** observer ring hit 2000 cap; first 2 scrum events pushed 2 langfuse entries out (1999 → 1997). Not a bug — ring works as designed — but means old-context retention is bounded. 
If we care about historical Langfuse traces we need a larger ring OR a separate per-source ring. +- **[UI Playwright probe found 2 real bugs]** (fixed 2026-04-24): (a) ui/server.ts tryFetch relied on content-type header to decide JSON vs text; observer Bun.serve returns JSON without `application/json` content-type, so stats were strings — UI showed "0 ops" instead of 2000. Fixed: always attempt JSON.parse, fall back to raw text. (b) ui.js renderNodeContext used Object.entries(n.health) which iterates characters on a string — gateway /health returns "lakehouse ok" and the panel showed rows like `0=l, 1=a, 2=k, ...`. Fixed: primitive-vs-object guard. **Both were invisible in functional tests — only a real browser render exposed them.** Worth adding a Playwright smoke test to CI for any future UI changes. + +### Iter 1 results + +*Populated after scrum finishes.* + +## Iteration 2 — queued + +**Prompt shape change (from J 2026-04-23):** iter 2+ uses `docs/SCRUM_FORENSIC_PROMPT.md` as the system prompt, replacing the softer iter-1 framing. Adversarial auditor tone with 8 audit passes. Strict JSON output format with `verdict: pass|fail|needs_patch`. If system can't prove itself, verdict is FAIL. + +**Scrum pipeline change:** `scrum_master_pipeline.ts` needs an env `LH_SCRUM_SYSTEM_PROMPT` (new) to inject the forensic frame alongside the proposal doc. The file-level loop still asks for suggestions per file but under the 8-pass adversarial lens. + +**Goal:** Self-host. Pipeline loads its ladder from `config/routing.toml` via the RoutingEngine that iter 1 wired. If that still isn't loaded, note gap, proceed with hardcoded ladder, flag for iter 3. +**Target expansion:** beyond `.rs` to `.ts` (tests/multi-agent, auditor/), `.py` (sidecar), `.md` (docs). + +## Iterations 3-6 — queued + +**Goal:** measure trajectory. Each iteration reduces finding count, raises unit test count, reduces grep-for-fake-markers count. If any iteration doesn't improve, that's the data point. 
+ +## Metrics per iteration + +Capture after each re-sweep: + +- `findings_total` (baseline: 19) +- `findings_by_severity` (baseline: 3h / 8m / 8l) +- `phases_partial_count` (baseline: 9) +- `phases_real_count` (baseline: 25 of 35) +- `rust_test_count` (baseline: 194+) +- `gateway_test_fail_count` (baseline: 1 — P38-001) +- `grep_hits_unimplemented` run: `grep -rEc 'todo!\(\)|unimplemented!\(\)|FIXME' crates/` +- `grep_hits_pseudo` run: `grep -rEc '\"placeholder\"|\"stub\"|\"mock\"|\"fake\"' crates/` + +## Rules for this loop + +1. **Cloud-first for every iteration.** Per feedback_scrum_cloud_first.md, strategic review uses 120B+ tier. +2. **One cross-cutting PR per iteration when possible.** Meta-pattern from audit: identity+auth+access+journal+truth share a pipe. Fix them together. +3. **Build must be green before next iteration starts.** A broken build is evidence the last iteration regressed, not progressed. +4. **Log findings to the jsonl as new rows per iteration** with `sweep_id: phase_sweep_2026-04-23-iterN`. Never overwrite prior iteration's findings — the trajectory is the whole point. +5. **Don't fix things during an iteration.** Every observation goes into "Fix next loop" section above. Next iteration's scrum picks them up. diff --git a/docs/SYSTEM_EVOLUTION_LAYERS.md b/docs/SYSTEM_EVOLUTION_LAYERS.md new file mode 100644 index 0000000..7fdca57 --- /dev/null +++ b/docs/SYSTEM_EVOLUTION_LAYERS.md @@ -0,0 +1,83 @@ +# Future Expansion — Advanced System Evolution Layers + +Adopted 2026-04-24 from J. The system stops optimizing for task completion. 
It optimizes for **provable execution, repeatable outcomes, resilience under drift, failure, and adversarial conditions.** + +## Layer roster + iteration mapping + +| # | Layer | Short form | Target iter | +|---|---|---|---:| +| 1 | Counterfactual Execution | Generate synthetic failure variants from each success | iter 5 | +| 2 | Model Trust Profiling | Per-(model, task_type) success rate → routing weight | **iter 3** | +| 3 | Execution DNA | Compress successful runs into reusable patterns | iter 4 | +| 4 | Drift Sentinel | Re-validate historical tasks on a schedule | iter 5 | +| 5 | Adversarial Injection | Inject poisoned context / malformed outputs / conflicts | iter 6 | +| 6 | Permission Gradient | Confidence → execution tier (≥0.9 full, ≥0.7 dry-run, ≥0.5 sim, <0.5 block) | **iter 3** | +| 7 | Multi-Agent Disagreement | Planner/Critic/Validator — disagreement = signal | iter 4 | +| 8 | Temporal Context | Time-aware memory with decay_score + last_validated_at | iter 4 | +| 9 | Execution Cost Intelligence | Tokens, iterations, cloud_calls, latency per task | **iter 3** | +| 10 | Human Override as Data | Capture manual fixes as jsonl rows | **iter 3** | + +## Detail (J's original framing preserved) + +### 1. Counterfactual Execution Layer +Simulate alternate failure paths for every successful task. Real Execution → Success → Generate Variations (env, version, inputs) → Simulate Failure Cases → Store Synthetic Failure Signatures. **Purpose:** pre-train against unseen failures before real exposure. + +### 2. Model Trust Profiling ← iter 3 +Per-(model, task_type) performance tracking. +``` +{ "model": "...", "task_type": "...", "success_rate": 0.0, "failure_modes": [], "trust_score": 0.0 } +``` +**Usage:** route by trust score, adjust validation strictness dynamically, per-model risk budgets. + +### 3. Execution DNA (Trace Compression) +Successful executions → reusable fragments. 
+``` +{ "dna_id": "hash", "task_signature": "...", "critical_steps": [], "failure_avoidance": [] } +``` +Replaces doc retrieval with pattern retrieval; faster convergence on similar tasks. + +### 4. Drift Sentinel +Select Historical Task → Re-run Current Env → Compare → If Failure → Mark Drifted → Trigger Re-learning. Detect silent decay; maintain long-term reliability. + +### 5. Adversarial Injection Engine +Inject malformed outputs / outdated docs / conflicting instructions / poisoned memory. Verify validation catches, execution blocks unsafe actions, memory rejects corrupted data. Build system immunity. + +### 6. Permission Gradient Execution ← iter 3 +Confidence-based control replacing binary: +- confidence ≥ 0.9 → full execution +- confidence ≥ 0.7 → dry-run + diff +- confidence ≥ 0.5 → simulation only +- confidence < 0.5 → block +Inputs: validation score, model trust score, memory match confidence. Risk-aware control; reduced catastrophic-failure surface. + +### 7. Multi-Agent Disagreement Engine +Planner / Critic / Validator; disagreement triggers more context, bigger model, stricter validation. Disagreement is signal, not noise. + +### 8. Temporal Context Layer +``` +{ "created_at": "ts", "last_validated_at": "ts", "decay_score": 0.0 } +``` +Retrieval priority: recent + validated + high success rate. Avoid stale knowledge. + +### 9. Execution Cost Intelligence ← iter 3 +``` +{ "task": "...", "tokens_used": 0, "iterations": 0, "cloud_calls": 0, "latency_ms": 0 } +``` +Optimize local vs cloud; reduce unnecessary iterations. + +### 10. Human Override as Data ← iter 3 +``` +{ "human_fix": "...", "reason": "...", "task_signature": "...", "validated": true } +``` +Manual fixes become reusable knowledge. + +## Final Principle + +Memory is not passive recall. It is operational substrate: +- failures become structured knowledge +- successes become reusable execution patterns +- all outputs are validated before reuse + +## System Directive + +Not speed. 
Not convenience. **Correctness. Verifiability. Resilience under change.** diff --git a/mcp-server/langfuse_bridge.ts b/mcp-server/langfuse_bridge.ts new file mode 100644 index 0000000..e66eea5 --- /dev/null +++ b/mcp-server/langfuse_bridge.ts @@ -0,0 +1,174 @@ +// langfuse_bridge — the missing piece called out in project_lost_stack.md +// and Phase 40 PRD. Polls Langfuse `/api/public/traces` at interval, +// forwards every completed trace to observer `:3800/event` with +// `source: "langfuse"`. Observer's existing ring buffer + analyzer +// pick it up, so the KB learns from cost/latency/provider deltas per +// model — not just from scenario outcomes. +// +// Loopback: observer persistOp() appends to data/_observer/ops.jsonl +// and its aggregator produces pathway_recommendations.jsonl. This +// bridge closes the feedback loop between LLM call metadata and the +// playbook/KB learning surface. +// +// State persistence: last-seen trace timestamp written to a JSON file +// so restarts don't double-emit. Bounded forward window (50/tick) so +// first-run catch-up doesn't hammer the observer. + +const LANGFUSE_URL = process.env.LANGFUSE_URL ?? "http://localhost:3000"; +const LANGFUSE_PUBLIC = process.env.LANGFUSE_PUBLIC_KEY; +const LANGFUSE_SECRET = process.env.LANGFUSE_SECRET_KEY; +const OBSERVER_URL = process.env.OBSERVER_URL ?? "http://localhost:3800"; +const POLL_INTERVAL_MS = Number(process.env.LANGFUSE_POLL_MS ?? 30000); +const BATCH_LIMIT = Number(process.env.LANGFUSE_BATCH_LIMIT ?? 50); +const STATE_FILE = process.env.LANGFUSE_STATE_FILE + ?? 
"/var/lib/lakehouse-guard/langfuse_last_seen.json"; + +interface LangfuseTrace { + id: string; + name?: string; + timestamp: string; + input?: any; + output?: any; + latency?: number; // seconds, per Langfuse API + totalCost?: number; + usage?: { input?: number; output?: number; total?: number }; + metadata?: any; +} + +interface State { last_seen_ts?: string } + +function basicAuth(): string { + return "Basic " + btoa(`${LANGFUSE_PUBLIC}:${LANGFUSE_SECRET}`); +} + +async function loadState(): Promise { + try { + const f = Bun.file(STATE_FILE); + if (!(await f.exists())) return {}; + return JSON.parse(await f.text()) as State; + } catch (e) { + console.warn(`[langfuse-bridge] state load failed: ${e}`); + return {}; + } +} + +async function saveState(s: State): Promise { + try { + await Bun.write(STATE_FILE, JSON.stringify(s)); + } catch (e) { + console.warn(`[langfuse-bridge] state save failed: ${e}`); + } +} + +async function fetchTracesSince(cursor?: string): Promise { + const url = new URL("/api/public/traces", LANGFUSE_URL); + url.searchParams.set("limit", String(BATCH_LIMIT)); + url.searchParams.set("orderBy", "timestamp.asc"); + if (cursor) url.searchParams.set("fromTimestamp", cursor); + const resp = await fetch(url, { + headers: { authorization: basicAuth() }, + signal: AbortSignal.timeout(10_000), + }); + if (!resp.ok) { + throw new Error(`langfuse ${resp.status}: ${(await resp.text()).slice(0, 200)}`); + } + const body: any = await resp.json(); + return (body.data ?? []) as LangfuseTrace[]; +} + +// Shape one Langfuse trace into the ObservedOp the observer expects +// (see mcp-server/observer.ts:29). `source: "langfuse"` is the +// provenance flag so the analyzer can weight traces differently from +// scenario-sourced events. +function toObservedOp(t: LangfuseTrace): Record { + const endpoint = t.metadata?.provider + ?? t.metadata?.model + ?? t.name + ?? "langfuse.trace"; + const inputSummary = typeof t.input === "string" + ? 
t.input.slice(0, 200) + : JSON.stringify(t.input ?? {}).slice(0, 200); + const outputSummary = typeof t.output === "string" + ? t.output.slice(0, 200) + : JSON.stringify(t.output ?? {}).slice(0, 200); + return { + timestamp: t.timestamp, + endpoint: `langfuse:${endpoint}`, + input_summary: inputSummary, + success: !t.metadata?.error, + duration_ms: Math.round((t.latency ?? 0) * 1000), + output_summary: outputSummary, + source: "langfuse", + sig_hash: t.metadata?.sig_hash, + event_kind: t.metadata?.task_class, + // Extra fields the observer doesn't schema but the KB aggregator + // can still pick up via JSON passthrough. + model: t.metadata?.model, + provider: t.metadata?.provider, + prompt_tokens: t.usage?.input, + completion_tokens: t.usage?.output, + total_tokens: t.usage?.total, + total_cost: t.totalCost, + }; +} + +async function forwardToObserver(op: Record): Promise { + const resp = await fetch(`${OBSERVER_URL}/event`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify(op), + signal: AbortSignal.timeout(5_000), + }); + if (!resp.ok) { + throw new Error(`observer ${resp.status}: ${(await resp.text()).slice(0, 200)}`); + } +} + +async function tick(): Promise { + const state = await loadState(); + let traces: LangfuseTrace[]; + try { + traces = await fetchTracesSince(state.last_seen_ts); + } catch (e) { + console.warn(`[langfuse-bridge] fetch failed: ${e}`); + return; + } + if (traces.length === 0) { + console.log(`[langfuse-bridge] no new traces since ${state.last_seen_ts ?? "start"}`); + return; + } + let last = state.last_seen_ts ?? ""; + let forwarded = 0; + for (const t of traces) { + try { + await forwardToObserver(toObservedOp(t)); + forwarded++; + if (t.timestamp > last) last = t.timestamp; + } catch (e) { + console.warn(`[langfuse-bridge] forward ${t.id} failed: ${e}`); + // Don't advance cursor on forward failure — retry next tick. 
+ break; + } + } + if (last) await saveState({ last_seen_ts: last }); + console.log( + `[langfuse-bridge] forwarded ${forwarded}/${traces.length}, last_seen=${last}`, + ); +} + +async function main(): Promise { + if (!LANGFUSE_PUBLIC || !LANGFUSE_SECRET) { + console.error("LANGFUSE_PUBLIC_KEY + LANGFUSE_SECRET_KEY required"); + process.exit(1); + } + console.log( + `[langfuse-bridge] polling ${LANGFUSE_URL} every ${POLL_INTERVAL_MS}ms → ${OBSERVER_URL}/event`, + ); + await tick(); + setInterval(tick, POLL_INTERVAL_MS); +} + +main().catch(e => { + console.error(`[langfuse-bridge] fatal: ${e}`); + process.exit(1); +}); diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts index 13a2be2..6894401 100644 --- a/mcp-server/observer.ts +++ b/mcp-server/observer.ts @@ -37,7 +37,7 @@ interface ObservedOp { // Phase 24 — optional provenance so error analyzer and playbook // builder can differentiate MCP-layer ops from scenario-sourced // events. Scenarios set source="scenario" + staffer_id + sig_hash. - source?: "mcp" | "scenario"; + source?: "mcp" | "scenario" | "langfuse" | "overseer_correction"; staffer_id?: string; sig_hash?: string; event_kind?: string; @@ -47,6 +47,12 @@ interface ObservedOp { count?: number; rescue_attempted?: boolean; rescue_succeeded?: boolean; + // Overseer-correction-specific (2026-04-23): lets the analyzer + // correlate corrections with the drift that prompted them and with + // subsequent outcomes that either validated or invalidated the advice. + task_class?: string; + correction?: string; + applied_at_turn?: number; } const recentOps: ObservedOp[] = []; @@ -261,7 +267,10 @@ function startHttpListener() { duration_ms: Number(body.duration_ms ?? 0), output_summary: body.output_summary ?? (body.success ? "filled" : (body.error ?? "failed")), error: body.error, - source: "scenario", + // Respect the client's provenance if set (langfuse bridge + // sends source:"langfuse", etc.). 
Default to "scenario" + // to keep legacy Phase 24 callers working. + source: body.source ?? "scenario", staffer_id: body.staffer_id, sig_hash: body.sig_hash, event_kind: body.event_kind, @@ -284,6 +293,58 @@ function startHttpListener() { console.error(`[observer] HTTP listener bound to 0.0.0.0:${OBSERVER_PORT}`); } +// ─── Overseer corrections tailer (2026-04-23) ─── + +// The gateway's /v1/respond loop writes T3 overseer corrections to +// data/_kb/overseer_corrections.jsonl. Tail it once per cycle and +// inject each new row into the same recentOps ring that analyzeErrors +// + consolidatePlaybooks read — so a correction that just fired shows +// up alongside the outcomes it was meant to repair, and the analyzer +// can flag patterns like "three corrections on staffing.fill with the +// same advice — underlying problem isn't a drift, it's a data gap". +const CORRECTIONS_PATH = process.env.OVERSEER_CORRECTIONS_PATH + ?? "/home/profit/lakehouse/data/_kb/overseer_corrections.jsonl"; +let correctionsCursor = 0; // byte offset + +async function tailOverseerCorrections(): Promise { + const f = Bun.file(CORRECTIONS_PATH); + if (!(await f.exists())) return 0; + const size = f.size; + if (size <= correctionsCursor) return 0; + + // Read only the suffix since the last cursor; keeps tail work + // bounded even as the file grows. + const text = await f.slice(correctionsCursor, size).text(); + correctionsCursor = size; + + let forwarded = 0; + for (const line of text.split("\n")) { + if (!line.trim()) continue; + let row: any; + try { row = JSON.parse(line); } catch { continue; } + const op: ObservedOp = { + timestamp: row.created_at ?? new Date().toISOString(), + endpoint: `overseer:${row.model ?? "gpt-oss:120b"}`, + input_summary: `${row.task_class ?? "?"}: ${row.reason ?? "escalation"}`, + // Correction itself is neither success nor failure — it's a + // mitigation attempt. 
We mark success=true so analyzeErrors + // doesn't count it as a failure, but the preview lets the + // analyzer see what was tried. + success: true, + duration_ms: Number(row.usage?.latency_ms ?? 0), + output_summary: String(row.correction ?? "").slice(0, 200), + source: "overseer_correction", + sig_hash: row.sig_hash, + task_class: row.task_class, + correction: String(row.correction ?? ""), + applied_at_turn: Number(row.applied_at_turn ?? 0), + }; + recordExternalOp(op); + forwarded++; + } + return forwarded; +} + // ─── Main loop ─── async function main() { @@ -306,7 +367,14 @@ async function main() { await Bun.sleep(CYCLE_SECS * 1000); cycle++; - // Every cycle: analyze errors if any + // Every cycle: tail the overseer corrections KB stream, then + // analyze errors. Order matters — if an overseer correction just + // landed for a sig_hash that previously failed, the analyzer + // should see both. + const newCorrections = await tailOverseerCorrections(); + if (newCorrections > 0) { + console.error(`[observer] pulled ${newCorrections} new overseer correction(s) into ring`); + } await analyzeErrors(); // Every 5 cycles: consolidate playbooks @@ -315,12 +383,16 @@ async function main() { } const scenarioOps = recentOps.filter(o => o.source === "scenario").length; + const langfuseOps = recentOps.filter(o => o.source === "langfuse").length; + const correctionOps = recentOps.filter(o => o.source === "overseer_correction").length; const stats = { cycle, total_ops: recentOps.length, successes: recentOps.filter(o => o.success).length, failures: recentOps.filter(o => !o.success).length, scenario_ops: scenarioOps, + langfuse_ops: langfuseOps, + overseer_corrections: correctionOps, }; console.error(`[observer] cycle ${cycle}: ${JSON.stringify(stats)}`); } diff --git a/ops/systemd/lakehouse.service b/ops/systemd/lakehouse.service new file mode 100644 index 0000000..4075f46 --- /dev/null +++ b/ops/systemd/lakehouse.service @@ -0,0 +1,26 @@ +[Unit] +Description=Lakehouse 
Gateway +After=network.target ollama.service + +[Service] +Type=simple +WorkingDirectory=/home/profit/lakehouse +ExecStart=/home/profit/lakehouse/target/release/gateway +Restart=always +RestartSec=5 +Environment=RUST_LOG=info +# Lance S3 support — connects to MinIO via the AWS SDK env vars. +# AWS_ALLOW_HTTP required for non-TLS MinIO endpoints. +Environment=AWS_ACCESS_KEY_ID=profit +Environment=AWS_SECRET_ACCESS_KEY=29IgevhKQjE2WHg088ieI1wP +Environment=AWS_ENDPOINT=http://localhost:9000 +Environment=AWS_ALLOW_HTTP=true +Environment=AWS_DEFAULT_REGION=us-east-1 +# Langfuse keys — shared with lakehouse-langfuse-bridge.service so the +# gateway-side emitter (`/v1/chat`, `/v1/respond`) and the observer +# ingest path see the same project. Leading `-` makes missing-file a +# warn-not-fatal so gateway still starts if Langfuse is torn down. +EnvironmentFile=-/etc/lakehouse/langfuse.env + +[Install] +WantedBy=multi-user.target diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts index 369b6f0..853d554 100644 --- a/tests/real-world/scrum_master_pipeline.ts +++ b/tests/real-world/scrum_master_pipeline.ts @@ -40,8 +40,27 @@ const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/scrum_${Date.now() const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md"; // Using CONTROL_PLANE_PRD as the "suggested changes" doc since it // describes the Phase 38-44 target architecture and is on main. -// COHESION_INTEGRATION_PLAN.md is still on PR #7 branch. -const PROPOSAL_PATH = "/home/profit/lakehouse/docs/CONTROL_PLANE_PRD.md"; +// Override via LH_SCRUM_PROPOSAL env to point at a fix-wave doc +// generated from a phase-sweep audit, so the scrum pulls direction +// from concrete findings instead of the high-level PRD alone. 
+const PROPOSAL_PATH = process.env.LH_SCRUM_PROPOSAL + || "/home/profit/lakehouse/docs/CONTROL_PLANE_PRD.md"; + +// Iter 2+ — when LH_SCRUM_FORENSIC is set to a file path, prepend its +// contents as an adversarial auditor preamble to every per-file prompt. +// This flips the review tone from "suggest improvements" to "prove it +// works or mark FAIL." Added 2026-04-23 for iter 2 of the 6x loop. +// Empty string = no preamble (iter-1 behavior). +const FORENSIC_PREAMBLE = process.env.LH_SCRUM_FORENSIC + ? (() => { + try { + return require("node:fs").readFileSync(process.env.LH_SCRUM_FORENSIC!, "utf8"); + } catch (e) { + console.error(`[scrum] warning: could not read LH_SCRUM_FORENSIC=${process.env.LH_SCRUM_FORENSIC}: ${e}`); + return ""; + } + })() + : ""; // Scoped target: 3 representative source files by default. // The scrum-master walks these in order and produces one suggestion @@ -55,13 +74,34 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES ? process.env.LH_SCRUM_FILES.split(",").map(s => s.trim()) : DEFAULT_TARGETS; +// Cloud-first ladder, STRONGEST-MODEL-FIRST (iter 3+, 2026-04-24). +// J's direction: "switch to the strongest cloud model" for iter 3 — +// the forensic prompt is demanding enough that even 120B gets rejected +// as thin. Rank by parameter count / reasoning strength: +// 1. kimi-k2:1t — 1T params, Moonshot flagship (biggest) +// 2. kimi-k2.6 — Moonshot next-gen, pro tier +// 3. deepseek-v3.1:671b — 671B, strong reasoning + coding +// 4. mistral-large-3:675b — 675B, deep analysis +// 5. qwen3.5:397b — 397B (iter 2's rescue model) +// 6. gpt-oss:120b — 120B (iter 1's primary; still strong fallback) +// Local fallbacks kept for cloud-down scenarios. +// Hot-path pipelines (scenario.ts / execution_loop) stay local per +// Phase 20 t1_hot — this scrum is not hot path. 
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [ - { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" }, - { provider: "ollama", model: "qwen3:latest", note: "local 7B (peer)" }, - { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, - { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" }, - { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B coding specialist" }, - { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B last-ditch" }, + { provider: "ollama_cloud", model: "kimi-k2:1t", note: "cloud 1T — biggest available, 1.4s probe" }, + { provider: "ollama_cloud", model: "deepseek-v3.1:671b", note: "cloud 671B — fast reasoning (1.0s probe)" }, + { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B — deep analysis (0.9s probe)" }, + { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B — reliable workhorse (iter1 baseline)" }, + { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B — coding specialist" }, + // qwen3.5:397b is the deep final thinker — J's note 2026-04-24: + // "qwen3.5 is really smart maybe the last call use that one". + // When every other cloud model has produced thin output, this dense + // 397B reviewer is the one that tends to push through. Keeping it + // LAST in cloud tier, before the local fallback. + { provider: "ollama_cloud", model: "qwen3.5:397b", note: "cloud 397B dense — last-ditch smart reviewer" }, + { provider: "ollama", model: "gpt-oss:20b", note: "local 20B — cloud-down fallback" }, + // kimi-k2.6 removed 2026-04-24: probe returned empty (not available + // on current tier). Keeping note for when pro tier upgrade lands. ]; type Chunk = { id: string; text: string; embedding: number[]; origin: string; offset: number }; @@ -258,7 +298,11 @@ async function reviewFile( ? 
`\nIMPORTANT: the "source" below is a multi-shard distillation (tree-split across ${shardsSummarized} shards), NOT the full raw file. DO NOT claim any field, function, or feature is "missing" based on its absence from this distillation — the distillation may have elided it. Only call out gaps that appear DIRECTLY contradicted by the PRD excerpts.\n` : ""; - const baseTask = `You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan. + const forensicPrefix = FORENSIC_PREAMBLE + ? `${FORENSIC_PREAMBLE}\n\n═══ FILE UNDER AUDIT ═══\n\n` + : ""; + + const baseTask = `${forensicPrefix}You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan. FILE: ${rel} (${content.length} bytes${treeSplitFired ? `, tree-split into ${shardsSummarized} shards` : ""}) ${truncationWarning} @@ -272,6 +316,15 @@ Produce a structured review with: 1. Alignment score (1-10) between this file and the PRD intent 2. 3-5 concrete suggested changes (bullet points), each naming a specific function/line and what to change 3. Any gap where this file's behavior contradicts the PRD or the proposal +${FORENSIC_PREAMBLE ? "4. Apply the forensic audit passes from the preamble: pseudocode detection, PRD contract status, normalization/validation pipeline, failure→repair loop, execution memory, relevance orchestration, execution safety, testing evidence. Issue a verdict pass|needs_patch|fail." : ""} + +**Per-finding confidence (required on every suggestion):** +Attach a self-assessed **Confidence: NN%** to every suggested change AND every gap you list. The percentage is your belief that the suggestion is correct, will compile, and lands the PRD intent. 
Calibration guide: +- 90-100%: pattern seen repeatedly in shipped code; change is mechanical; low risk of regressions +- 70-89%: confident in direction, some room for interpretation on API shape or naming +- 50-69%: plausible fix but may not match existing conventions or may cascade to other files +- <50%: genuinely uncertain — include regardless so downstream knows to investigate before applying +Format each finding as: \`**1.** . **Confidence: NN%.**\` (in tables, add a final "Confidence" column.) Low confidence is valuable signal — do not round up. Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-offset when relevant.`; @@ -335,6 +388,91 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of const { appendFile, mkdir } = await import("node:fs/promises"); const { dirname } = await import("node:path"); await mkdir(dirname(SCRUM_REVIEWS_JSONL), { recursive: true }); + + // Extract per-finding confidences from the accepted markdown. + // Patterns tried: "Confidence: NN%", "Confidence**: NN%", + // and table-cell "| 92% |". Cap at 20 matches to bound row size. + // Added 2026-04-23 (iter 2 direction from J: "make scrum output + // include self-assessed confidence per finding"). + const confidences: number[] = []; + // Markdown format: "Confidence: 92%" / "Confidence**: 92%" / "| 92% |" + const patMarkdown = /(?:Confidence[*:\s]*\s*|\|\s*)(\d{1,3})\s*%/gi; + // JSON format (forensic strict output): "confidence": 92 + const patJson = /"confidence"\s*:\s*(\d{1,3})(?!\d)/gi; + for (const pat of [patMarkdown, patJson]) { + const matches = accepted.matchAll(pat); + for (const hit of matches) { + if (confidences.length >= 40) break; + const pct = parseInt(hit[1], 10); + if (pct >= 0 && pct <= 100) confidences.push(pct); + } + } + const conf_avg = confidences.length + ? Math.round(confidences.reduce((a, b) => a + b, 0) / confidences.length) + : null; + const conf_min = confidences.length ? 
Math.min(...confidences) : null; + + // Score extraction — regex accepts decimals ("Score: 4.5/10") and + // surrounding punctuation ("4/10 — mid"). iter 3 had 4 unparseable + // scores because the prior regex /(\d)\s*\/\s*10/ missed decimals. + const scoreMatch = accepted.match(/(?:score[\s*:]*)?(\d(?:\.\d)?)\s*\/\s*10\b/i); + const alignment_score = scoreMatch ? parseFloat(scoreMatch[1]) : null; + + // Forensic JSON extraction — iter 3 showed 20/21 files came back + // as JSON (verdict + critical_failures[] + verified_components[] + ...) + // rather than markdown tables. Previously we only stored suggestions_preview + // (truncated to 2KB); now we also capture the structured counters so + // consumers can filter by verdict, sort by critical_failures_count, etc. + let verdict: string | null = null; + let critical_failures_count = 0; + let pseudocode_flags_count = 0; + let prd_mismatches_count = 0; + let missing_components_count = 0; + let verified_components_count = 0; + let risk_points_count = 0; + const isJsonShape = accepted.includes('"verdict"'); + if (isJsonShape) { + const vm = accepted.match(/"verdict"\s*:\s*"([a-z_]+)"/i); + verdict = vm ? vm[1] : null; + // Count object entries per array by counting occurrences of + // either a unique-per-entry field name or {...} bracket pairs + // inside the array span. A straight "count opening braces inside + // the array range" is simplest and robust to field order. + const countArrayEntries = (arrayName: string): number => { + const re = new RegExp(`"${arrayName}"\\s*:\\s*\\[([\\s\\S]*?)\\]`, "i"); + const m = accepted.match(re); + if (!m || !m[1].trim()) return 0; + // Count opening braces of direct-child objects. 
+ let depth = 0, entries = 0; + for (const ch of m[1]) { + if (ch === '{') { if (depth === 0) entries++; depth++; } + else if (ch === '}') depth--; + } + return entries; + }; + critical_failures_count = countArrayEntries("critical_failures"); + pseudocode_flags_count = countArrayEntries("pseudocode_flags"); + prd_mismatches_count = countArrayEntries("prd_mismatches"); + missing_components_count = countArrayEntries("missing_components"); + verified_components_count = countArrayEntries("verified_components"); + risk_points_count = countArrayEntries("risk_points"); + } + + // Permission Gradient (Layer #6 from SYSTEM_EVOLUTION_LAYERS.md). + // Classify the overall finding set by confidence_avg: + // ≥90 auto-apply-safe, ≥70 dry-run + diff, ≥50 simulation only, + // <50 block (human review). Use conf_min as the tier-lower-bound + // so one shaky finding drags the whole row down to the safer tier. + const tierFor = (c: number | null): string => { + if (c === null) return "unknown"; + if (c >= 90) return "auto"; + if (c >= 70) return "dry_run"; + if (c >= 50) return "simulation"; + return "block"; + }; + const gradient_tier = tierFor(conf_min); // conservative: weakest finding decides + const gradient_tier_avg = tierFor(conf_avg); + const row = { file: rel, reviewed_at: new Date().toISOString(), @@ -343,15 +481,122 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of attempts_made: history.length, tree_split_fired: treeSplitFired, suggestions_preview: accepted.slice(0, 2000), - schema_version: 2, + // Iter-3+ confidence fields. + confidences_per_finding: confidences, + confidence_avg: conf_avg, + confidence_min: conf_min, + findings_count: confidences.length, + // Layer #6 Permission Gradient — downstream consumers decide + // apply-semantics based on these fields instead of blindly trusting + // every suggestion. + gradient_tier, + gradient_tier_avg, + // Score (decimal-aware) and forensic JSON structured counters. 
+ // iter 4+ fields (schema_version 4). + alignment_score, + output_format: isJsonShape ? "forensic_json" : "markdown", + verdict, + critical_failures_count, + pseudocode_flags_count, + prd_mismatches_count, + missing_components_count, + verified_components_count, + risk_points_count, + schema_version: 4, scrum_master_reviewed: true, }; + + // Layer #2 Model Trust Profiling — append one row per file-accept + // so over time we can compute per-(model, task_type) success/thin + // rates and trust scores. task_type here is inferred from the file + // path — good enough for initial stats, refine later. + try { + const taskType = rel.includes("/truth/") ? "truth" + : rel.includes("/gateway/") ? "gateway" + : rel.includes("/vectord") ? "vector" + : rel.includes("/ingestd") ? "ingest" + : rel.includes("/queryd") ? "query" + : rel.includes("/storaged") || rel.includes("/catalogd") || rel.includes("/journald") ? "storage" + : rel.includes("/aibridge") ? "aibridge" + : "other"; + const trustRow = { + run_id: OUT_DIR.split("/").pop(), + reviewed_at: row.reviewed_at, + file: rel, + task_type: taskType, + accepted_model: acceptedModel, + accepted_on_attempt: acceptedOn, + attempts_made: history.length, + thin_rejections: history.filter(h => h.status === "thin").length, + errors: history.filter(h => h.status === "error").length, + confidence_avg: conf_avg, + tree_split_fired: treeSplitFired, + }; + await appendFile("/home/profit/lakehouse/data/_kb/model_trust.jsonl", + JSON.stringify(trustRow) + "\n"); + } catch (e) { + console.error(`[scrum] model_trust append failed: ${(e as Error).message}`); + } try { await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n"); } catch (e) { console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`); } + // Close the scrum → observer loop (fix 2026-04-24). Architecture + // audit surfaced: observer ring had 2000 ops, 1999 from Langfuse, + // zero from scrum. 
Observer's analyzeErrors + PLAYBOOK_BUILDER loops + // were blind to the very pipeline most likely to teach them. One + // fire-and-forget POST wires them in. Observer tolerates unreachable + // backends; no scrum run fails if observer is down. + // + // Schema matches observer's ObservedOp shape (source, staffer_id, + // sig_hash, event_kind, success, ...). file + accepted_model + + // confidence_avg + gradient_tier give downstream analyzers enough + // signal to correlate reviews with later regressions. + try { + const sigHash = createHash("sha256") + .update(`${rel}|${OUT_DIR.split("/").pop()}`) + .digest("hex") + .slice(0, 16); + fetch("http://localhost:3800/event", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + source: "scrum", + staffer_id: "scrum_master", + sig_hash: sigHash, + event_kind: "file_review", + success: true, + run_id: OUT_DIR.split("/").pop(), + file: rel, + accepted_model: acceptedModel, + accepted_on_attempt: acceptedOn, + attempts_made: history.length, + thin_rejections: history.filter(h => h.status === "thin").length, + confidence_avg: conf_avg, + confidence_min: conf_min, + findings_count: confidences.length, + gradient_tier, + tree_split_fired: treeSplitFired, + // iter4+ forensic-JSON fields so observer's analyzer can + // route by verdict / sort by critical_failures_count + alignment_score, + verdict, + output_format: isJsonShape ? "forensic_json" : "markdown", + critical_failures_count, + verified_components_count, + missing_components_count, + ts: row.reviewed_at, + }), + signal: AbortSignal.timeout(3000), + }).catch(() => { + // observer down — not a scrum-run failure, just lose the signal. + }); + } catch (e) { + // Synchronous construction error — ignore. + } + // Route the accepted review through llm_team's fact extractor so // its entities + relationships land in audit_facts.jsonl alongside // inference-side extractions. Same index, two sources. 
Tagged diff --git a/ui/index.html b/ui/index.html new file mode 100644 index 0000000..6c29794 --- /dev/null +++ b/ui/index.html @@ -0,0 +1,134 @@ + + + + + +Lakehouse · Visual Control Plane + + + + +
+
+ + LAKEHOUSE · VCP + +
+ +
+ GW + SC + OBS + MCP + CTX7 +
+
+ +
+ +
+ +
+
+ OVERLAY: + + + + + +
+ +
+ ● healthy + ● degraded + ● down + ◆ active flow +
+
+ + +
+
+ TASK TRACE — file: + + + +
+
+
+
+ + +
+
+ +
+
+
+
+ + +
+
+
+ + +
+
+
+ + +
+
+ SERVICE LOGS + + + + + + + + + + + + +
+
+
+
+ + + +
+ + +
+
+ STREAM + + + +
+
+
+ + + + + diff --git a/ui/server.ts b/ui/server.ts new file mode 100644 index 0000000..dc2ec6e --- /dev/null +++ b/ui/server.ts @@ -0,0 +1,327 @@ +// Visual Control Plane server — v1 +// Single Bun.serve process on :3950. Serves static index.html and +// /data/* endpoints that fan out to the live services + tail jsonl KB +// files. No build step, no node_modules. Restart via systemd or +// `bun run ui/server.ts`. + +const PORT = Number(process.env.LH_UI_PORT ?? 3950); +const KB = "/home/profit/lakehouse/data/_kb"; +const REPO = "/home/profit/lakehouse"; + +const GATEWAY = "http://localhost:3100"; +const SIDECAR = "http://localhost:3200"; +const OBSERVER = "http://localhost:3800"; +const MCP = "http://localhost:3700"; +const CONTEXT7 = "http://localhost:3900"; + +// Tail helper — read last N lines of a jsonl file without loading +// the whole thing. For files up to a few MB this is fine to read fully. +async function tailJsonl(path: string, n = 50): Promise { + try { + const text = await Bun.file(path).text(); + const lines = text.trim().split("\n").filter(Boolean); + const tail = lines.slice(-n); + return tail.map(l => { + try { return JSON.parse(l); } catch { return { _raw: l, _error: "parse" }; } + }); + } catch (e) { + return []; + } +} + +async function tryFetch(url: string, timeout = 1500): Promise { + try { + const r = await fetch(url, { signal: AbortSignal.timeout(timeout) }); + if (!r.ok) return null; + // Fix 2026-04-24: some upstream services (observer Bun.serve) return + // JSON without an application/json content-type. Don't rely on header + // — try parsing the body as JSON; fall back to raw text on failure. 
+ const body = await r.text(); + try { return JSON.parse(body); } catch { return body; } + } catch { + return null; + } +} + +// Compact the massive /vectors/indexes response into just the shape the +// UI needs: [{name, source, model, dims, chunks, bucket, backend}] +async function indexesSummary(): Promise { + const j = await tryFetch(`${GATEWAY}/vectors/indexes`); + if (!Array.isArray(j)) return { count: 0, items: [] }; + const items = j.slice(0, 12).map((i: any) => ({ + name: i.index_name, + source: i.source, + dims: i.dimensions, + chunks: i.chunk_count, + backend: i.vector_backend, + bucket: i.bucket, + })); + return { count: j.length, items }; +} + +async function servicesSnapshot() { + const [gw, sc, obs, mcp, c7, jstats, ustats] = await Promise.all([ + tryFetch(`${GATEWAY}/health`), + tryFetch(`${SIDECAR}/health`), + tryFetch(`${OBSERVER}/health`), + tryFetch(`${MCP}/health`), + tryFetch(`${CONTEXT7}/health`), + tryFetch(`${GATEWAY}/journal/stats`), + tryFetch(`${GATEWAY}/v1/usage`), + ]); + return { + ts: new Date().toISOString(), + nodes: [ + { id: "gateway", label: "Gateway :3100", status: gw ? "healthy" : "down", health: gw }, + { id: "sidecar", label: "Sidecar :3200", status: sc ? "healthy" : "down", health: sc }, + { id: "observer", label: "Observer :3800", status: obs ? "healthy" : "down", health: obs, + stats: await tryFetch(`${OBSERVER}/stats`) }, + { id: "mcp", label: "MCP :3700", status: mcp ? "healthy" : "down", health: mcp }, + { id: "context7", label: "Context7 :3900", status: c7 ? 
"healthy" : "down", health: c7 }, + ], + // Virtual nodes — backed by gateway subsystems rather than own ports + subsystems: [ + { id: "journal", label: "Journal", stats: jstats }, + { id: "usage", label: "Usage /v1", stats: ustats }, + { id: "vectord", label: "Vectord", stats: await indexesSummary() }, + { id: "playbook", label: "Playbook", stats: await tryFetch(`${GATEWAY}/vectors/playbook_memory/status`) }, + { id: "agent", label: "Autotune", stats: await tryFetch(`${GATEWAY}/vectors/agent/status`) }, + ], + }; +} + +// Extract phrase-level markers that indicate "this should be removed, +// simplified, or refactored" across scrum suggestions. These are the +// signals that accumulate into a refactor recommendation. +const REFACTOR_PHRASES = [ + "should be removed", "remove this", "dead code", "unused", "unnecessary", + "duplicate of", "duplicates", "redundant", + "consolidate", "merge with", "extract into", + "refactor", "rewrite", "replace with", + "orphaned", "stale", "deprecated", + "pseudocode", "placeholder", "stub", + "split this file", "too large", +]; + +async function refactorSignals(): Promise { + // Walk every accepted review across all scrum runs. For each file, + // count how many times its suggestions mention a refactor phrase. + // Return a sorted list — files most often flagged for refactor first. + const runsDir = `${REPO}/tests/real-world/runs`; + const perFile: Record; examples: string[]; iterations: number }> = {}; + try { + const dirs = await Array.fromAsync(new Bun.Glob("scrum_*").scan({ cwd: runsDir, onlyFiles: false })); + for (const d of dirs) { + const files = await Array.fromAsync(new Bun.Glob("review_*.json").scan({ cwd: `${runsDir}/${d}` })); + for (const f of files) { + const p = `${runsDir}/${d}/${f}`; + try { + const j = JSON.parse(await Bun.file(p).text()); + const file = j.file?.replace("/home/profit/lakehouse/", "") ?? "?"; + const sug = (j.suggestions ?? 
"").toLowerCase(); + if (!perFile[file]) perFile[file] = { file, hits: 0, phrases: {}, examples: [], iterations: 0 }; + perFile[file].iterations++; + for (const phrase of REFACTOR_PHRASES) { + const count = (sug.match(new RegExp(phrase, "gi")) ?? []).length; + if (count > 0) { + perFile[file].hits += count; + perFile[file].phrases[phrase] = (perFile[file].phrases[phrase] ?? 0) + count; + // Pull one example sentence around the phrase + if (perFile[file].examples.length < 3) { + const idx = sug.indexOf(phrase); + if (idx >= 0) { + const s = Math.max(0, idx - 60); + const e = Math.min(sug.length, idx + phrase.length + 80); + perFile[file].examples.push("…" + sug.slice(s, e).replace(/\s+/g, " ") + "…"); + } + } + } + } + } catch {} + } + } + } catch (e) { + return { error: String(e), signals: [] }; + } + const signals = Object.values(perFile) + .filter(x => x.hits > 0) + .sort((a, b) => b.hits - a.hits) + .slice(0, 30); + return { signals, scanned: Object.keys(perFile).length }; +} + +async function reverseIndex(query: string, limit = 20): Promise { + // Grep-like substring search across every review's suggestions. + // Returns file + snippet + which iter it was in + score + verdict. + const runsDir = `${REPO}/tests/real-world/runs`; + if (!query || query.length < 2) return { query, hits: [] }; + const q = query.toLowerCase(); + const hits: any[] = []; + try { + const dirs = await Array.fromAsync(new Bun.Glob("scrum_*").scan({ cwd: runsDir, onlyFiles: false })); + for (const d of dirs) { + const files = await Array.fromAsync(new Bun.Glob("review_*.json").scan({ cwd: `${runsDir}/${d}` })); + for (const f of files) { + const p = `${runsDir}/${d}/${f}`; + try { + const j = JSON.parse(await Bun.file(p).text()); + const sug = j.suggestions ?? 
""; + const lower = sug.toLowerCase(); + const idx = lower.indexOf(q); + if (idx < 0) continue; + const s = Math.max(0, idx - 80); + const e = Math.min(sug.length, idx + q.length + 200); + hits.push({ + file: j.file?.replace("/home/profit/lakehouse/", ""), + run_id: d, + model: j.escalated_to_model, + snippet: sug.slice(s, e).replace(/\s+/g, " "), + }); + if (hits.length >= limit) break; + } catch {} + } + if (hits.length >= limit) break; + } + } catch (e) { + return { query, error: String(e), hits: [] }; + } + return { query, hits }; +} + +async function fileHistory(relpath: string): Promise { + // Walk all scrum_/review_*.json files and gather every review + // for this file path. Returns timeline rows keyed by run_id. + const runsDir = `${REPO}/tests/real-world/runs`; + const out: any[] = []; + try { + const dirs = await Array.fromAsync(new Bun.Glob("scrum_*").scan({ cwd: runsDir, onlyFiles: false })); + for (const d of dirs) { + const safe = relpath.replaceAll("/", "_"); + const p = `${runsDir}/${d}/review_${safe}.json`; + if (await Bun.file(p).exists()) { + const j = JSON.parse(await Bun.file(p).text()); + const sug = j.suggestions ?? ""; + const scoreMatch = sug.match(/(?:score[\s*:]*)?(\d(?:\.\d)?)\s*\/\s*10\b/i); + const score = scoreMatch ? parseFloat(scoreMatch[1]) : null; + const confs = [...sug.matchAll(/(?:Confidence[*:\s]*\s*|\|\s*)(\d{1,3})\s*%/gi)] + .map(m => parseInt(m[1], 10)).filter(x => x >= 0 && x <= 100); + const jsonConfs = [...sug.matchAll(/"confidence"\s*:\s*(\d{1,3})(?!\d)/gi)] + .map(m => parseInt(m[1], 10)).filter(x => x >= 0 && x <= 100); + const all = [...confs, ...jsonConfs]; + const mt = await Bun.file(p).stat(); + out.push({ + run_id: d, + reviewed_at: j.reviewed_at ?? mt.mtime, + model: j.escalated_to_model, + score, + chars: sug.length, + conf_avg: all.length ? Math.round(all.reduce((a,b)=>a+b,0)/all.length) : null, + conf_min: all.length ? 
Math.min(...all) : null, + findings: all.length, + output_format: sug.includes('"verdict"') ? "forensic_json" : "markdown", + // first 1200 chars preview + preview: sug.slice(0, 1200), + }); + } + } + } catch (e) { + return { error: String(e), history: [] }; + } + out.sort((a, b) => String(a.reviewed_at).localeCompare(String(b.reviewed_at))); + return { file: relpath, history: out }; +} + +Bun.serve({ + port: PORT, + hostname: "0.0.0.0", + async fetch(req) { + const url = new URL(req.url); + const path = url.pathname; + + // Static shell + if (path === "/" || path === "/index.html") { + return new Response(Bun.file(`${REPO}/ui/index.html`)); + } + if (path === "/ui.css") { + return new Response(Bun.file(`${REPO}/ui/ui.css`), { headers: { "content-type": "text/css" } }); + } + if (path === "/ui.js") { + return new Response(Bun.file(`${REPO}/ui/ui.js`), { headers: { "content-type": "application/javascript" } }); + } + + // Data API + if (path === "/data/services") return Response.json(await servicesSnapshot()); + if (path === "/data/reviews") { + const n = Number(url.searchParams.get("tail") ?? 
50); + return Response.json(await tailJsonl(`${KB}/scrum_reviews.jsonl`, n)); + } + if (path === "/data/findings") return Response.json(await tailJsonl(`${KB}/phase_sweep_findings.jsonl`)); + if (path === "/data/metrics") return Response.json(await tailJsonl(`${KB}/scrum_loop_metrics.jsonl`)); + if (path === "/data/trust") return Response.json(await tailJsonl(`${KB}/model_trust.jsonl`, 200)); + if (path === "/data/overrides") return Response.json(await tailJsonl(`${KB}/human_overrides.jsonl`)); + if (path === "/data/outcomes") return Response.json(await tailJsonl(`${KB}/outcomes.jsonl`, 30)); + if (path === "/data/audit_facts") return Response.json(await tailJsonl(`${KB}/audit_facts.jsonl`, 30)); + + if (path.startsWith("/data/file/")) { + const relpath = decodeURIComponent(path.slice("/data/file/".length)); + return Response.json(await fileHistory(relpath)); + } + if (path === "/data/refactor_signals") { + return Response.json(await refactorSignals()); + } + if (path === "/data/search") { + const q = url.searchParams.get("q") ?? ""; + return Response.json(await reverseIndex(q, 30)); + } + + // Per-service systemd log tail. Allowed service list is fixed so the + // :service path param can never be used to invoke arbitrary units. + if (path.startsWith("/data/logs/")) { + const svc = path.slice("/data/logs/".length).split("?")[0]; + const UNITS: Record = { + gateway: "lakehouse.service", + sidecar: "lakehouse-sidecar.service", + observer: "lakehouse-observer.service", + mcp: "lakehouse-agent.service", + context7: "lakehouse-context7-bridge.service", + auditor: "lakehouse-auditor.service", + langfuse: "lakehouse-langfuse-bridge.service", + }; + const unit = UNITS[svc]; + if (!unit) return Response.json({ error: "unknown service", allowed: Object.keys(UNITS) }, { status: 400 }); + const n = Number(url.searchParams.get("n") ?? 
60); + try { + // Use execFile-style API: pass args as array, never shell-interpolate + const proc = Bun.spawn(["journalctl", "-u", unit, "-n", String(n), "--no-pager", "--output=short-iso"], { + stdout: "pipe", + stderr: "pipe", + }); + const text = await new Response(proc.stdout).text(); + await proc.exited; + const lines = text.split("\n").filter(Boolean); + return Response.json({ service: svc, unit, lines }); + } catch (e) { + return Response.json({ service: svc, unit, error: String(e), lines: [] }); + } + } + + // Live scrum log tail — best-effort + if (path === "/data/scrum_log") { + try { + const bg = await Array.fromAsync(new Bun.Glob("scrum_iter*.log").scan({ cwd: "/tmp" })); + if (bg.length === 0) return Response.json({ lines: [] }); + bg.sort(); + const latest = `/tmp/${bg[bg.length - 1]}`; + const text = await Bun.file(latest).text(); + const lines = text.split("\n").slice(-80); + return Response.json({ file: latest, lines }); + } catch (e) { + return Response.json({ error: String(e) }); + } + } + + return new Response("not found", { status: 404 }); + }, +}); + +console.log(`[ui] visual control plane listening on http://0.0.0.0:${PORT}`); diff --git a/ui/ui.css b/ui/ui.css new file mode 100644 index 0000000..f2cdf6c --- /dev/null +++ b/ui/ui.css @@ -0,0 +1,407 @@ +/* Lakehouse Visual Control Plane — neo-brutalist dark */ + +:root { + --bg: #0a0c10; + --bg-1: #10141a; + --bg-2: #171c24; + --border: #2a303b; + --border-hi: #3a4252; + --fg: #e8ecf3; + --fg-dim: #8a94a7; + --fg-muted: #525c6f; + --green: #3eed86; + --yellow: #ffbf3c; + --red: #ff4d6e; + --blue: #55c5ff; + --purple: #b57cff; + --orange: #ff9f43; + --shadow: 0 2px 0 #000; + --mono: ui-monospace, "JetBrains Mono", "SF Mono", Menlo, monospace; + --sans: -apple-system, Inter, system-ui, sans-serif; +} + +* { box-sizing: border-box; margin: 0; padding: 0; } +html, body { height: 100%; overflow: hidden; } +body { + background: var(--bg); + color: var(--fg); + font-family: var(--sans); + font-size: 
13px; + display: flex; + flex-direction: column; +} + +/* ────── TOP BAR ────── */ +#topbar { + height: 44px; + display: flex; + align-items: center; + gap: 16px; + padding: 0 16px; + border-bottom: 1px solid var(--border); + background: var(--bg-1); + flex-shrink: 0; +} +.brand { display: flex; align-items: center; gap: 8px; font-weight: 700; letter-spacing: 0.08em; } +.brand .sig { color: var(--green); font-size: 16px; } +.brand .build { color: var(--fg-muted); font-size: 10px; font-family: var(--mono); margin-left: 6px; } + +#views { display: flex; gap: 2px; margin-left: 20px; } +#views button { + background: transparent; border: 1px solid var(--border); color: var(--fg-dim); + font-family: var(--mono); font-size: 11px; letter-spacing: 0.1em; + padding: 5px 10px; cursor: pointer; text-transform: uppercase; +} +#views button:hover { border-color: var(--border-hi); color: var(--fg); } +#views button.on { background: var(--fg); color: var(--bg); border-color: var(--fg); font-weight: 700; } + +#hb { margin-left: auto; display: flex; gap: 6px; } +.hbchip { + font-family: var(--mono); font-size: 10px; letter-spacing: 0.05em; + padding: 4px 8px; border: 1px solid var(--border); border-radius: 2px; + color: var(--fg-muted); +} +.hbchip[data-status="healthy"] { border-color: var(--green); color: var(--green); } +.hbchip[data-status="down"] { border-color: var(--red); color: var(--red); } +.hbchip[data-status="degraded"]{ border-color: var(--yellow); color: var(--yellow); } + +/* ────── MAIN ────── */ +main { + flex: 1; + display: grid; + grid-template-columns: 1fr 380px; + min-height: 0; +} +#stage { position: relative; border-right: 1px solid var(--border); min-height: 0; overflow: hidden; } + +.view { display: none; width: 100%; height: 100%; } +.view.on { display: block; } + +.subhead { + height: 32px; display: flex; align-items: center; gap: 10px; + padding: 0 14px; border-bottom: 1px solid var(--border); + background: var(--bg-1); + font-family: var(--mono); 
font-size: 11px; color: var(--fg-dim); + text-transform: uppercase; letter-spacing: 0.08em; +} +.subhead .spacer { flex: 1; } + +/* ────── MAP ────── */ +#view-map { position: relative; } +#overlay-controls { + position: absolute; top: 10px; left: 10px; z-index: 2; + display: flex; align-items: center; gap: 4px; + background: rgba(16,20,26,0.95); + border: 1px solid var(--border); padding: 4px; +} +#overlay-controls .lbl { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); padding: 0 8px; } +#overlay-controls button { + background: transparent; color: var(--fg-dim); border: 1px solid var(--border); + font-family: var(--mono); font-size: 10px; padding: 3px 8px; cursor: pointer; + text-transform: lowercase; letter-spacing: 0.05em; +} +#overlay-controls button:hover { border-color: var(--border-hi); color: var(--fg); } +#overlay-controls button.on { background: var(--fg); color: var(--bg); border-color: var(--fg); } + +#map { width: 100%; height: 100%; } +#map .node-circle { stroke: var(--fg); stroke-width: 2; cursor: pointer; transition: r 200ms; } +#map .node-circle:hover { stroke-width: 3; } +#map .node-label { + font-family: var(--mono); font-size: 11px; fill: var(--fg); + pointer-events: none; text-anchor: middle; font-weight: 600; +} +#map .node-sub { + font-family: var(--mono); font-size: 9px; fill: var(--fg-muted); + pointer-events: none; text-anchor: middle; +} +#map .edge { stroke: var(--border-hi); stroke-width: 1.5; fill: none; } +#map .edge.active { stroke: var(--blue); stroke-width: 2; stroke-dasharray: 4 3; animation: dash 1.5s linear infinite; } +@keyframes dash { to { stroke-dashoffset: -14; } } + +#map .node-selected { stroke: var(--yellow); stroke-width: 3; } + +#legend { + position: absolute; bottom: 10px; left: 10px; + display: flex; gap: 16px; font-family: var(--mono); font-size: 10px; + background: rgba(16,20,26,0.95); border: 1px solid var(--border); padding: 6px 10px; +} +.lg { color: var(--fg-muted); } +.lg.healthy::before { 
content: ''; } +.lg.healthy { color: var(--green); } +.lg.degraded { color: var(--yellow); } +.lg.down { color: var(--red); } +.lg.active { color: var(--blue); } + +/* ────── CONTEXT PANEL ────── */ +#context { + background: var(--bg-1); overflow-y: auto; overflow-x: hidden; + display: flex; flex-direction: column; +} +.ctx-header { + height: 44px; display: flex; flex-direction: column; justify-content: center; + padding: 4px 14px; border-bottom: 1px solid var(--border); + background: var(--bg-2); +} +.ctx-eyebrow { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); letter-spacing: 0.1em; } +#ctx-target { font-family: var(--mono); font-size: 12px; color: var(--fg); font-weight: 600; margin-top: 2px; } +#ctx-body { padding: 12px 14px; flex: 1; overflow-y: auto; } +.ctx-hint { color: var(--fg-muted); font-style: italic; font-size: 11px; } + +.ctx-row { padding: 6px 0; border-bottom: 1px solid var(--border); display: flex; justify-content: space-between; gap: 10px; font-family: var(--mono); font-size: 11px; } +.ctx-row .k { color: var(--fg-muted); text-transform: uppercase; letter-spacing: 0.06em; } +.ctx-row .v { color: var(--fg); text-align: right; word-break: break-all; } +.ctx-row .v.good { color: var(--green); } +.ctx-row .v.warn { color: var(--yellow); } +.ctx-row .v.bad { color: var(--red); } + +.ctx-section-hd { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); letter-spacing: 0.1em; margin: 14px 0 4px; text-transform: uppercase; } + +.pill { + display: inline-block; font-family: var(--mono); font-size: 10px; + padding: 2px 7px; border: 1px solid var(--border); margin-right: 4px; +} +.pill.tier-auto { border-color: var(--green); color: var(--green); } +.pill.tier-dry_run { border-color: var(--blue); color: var(--blue); } +.pill.tier-simulation { border-color: var(--yellow); color: var(--yellow); } +.pill.tier-block { border-color: var(--red); color: var(--red); } +.pill.ver-needs_patch { border-color: var(--orange); color: 
var(--orange); } +.pill.ver-pass { border-color: var(--green); color: var(--green); } +.pill.ver-fail { border-color: var(--red); color: var(--red); } +.pill.fmt-forensic_json { border-color: var(--purple); color: var(--purple); } +.pill.fmt-markdown { border-color: var(--fg-dim); color: var(--fg-dim); } + +/* ────── TRACE ────── */ +#trace-timeline { + display: flex; gap: 0; padding: 20px; + overflow-x: auto; border-bottom: 1px solid var(--border); + min-height: 140px; +} +.trace-node { + position: relative; flex: 0 0 140px; padding: 10px 12px; + border: 2px solid var(--border); background: var(--bg-1); cursor: pointer; + font-family: var(--mono); font-size: 10px; +} +.trace-node:hover { border-color: var(--border-hi); } +.trace-node.active { border-color: var(--yellow); background: var(--bg-2); } +.trace-node::after { + content: '→'; position: absolute; right: -14px; top: 50%; transform: translateY(-50%); + color: var(--fg-muted); font-size: 16px; +} +.trace-node:last-child::after { display: none; } +.trace-node .tn-run { color: var(--fg-muted); letter-spacing: 0.05em; margin-bottom: 4px; } +.trace-node .tn-score { font-size: 22px; font-weight: 700; color: var(--fg); } +.trace-node .tn-conf { color: var(--fg-dim); margin-top: 4px; } +.trace-node .tn-model { color: var(--purple); margin-top: 4px; font-size: 9px; } + +#trace-detail { padding: 16px; overflow-y: auto; height: calc(100% - 172px); } +#trace-detail pre { font-family: var(--mono); font-size: 11px; color: var(--fg-dim); white-space: pre-wrap; word-break: break-word; } + +/* ────── METRICS ────── */ +.metric-grid { + display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); + gap: 12px; padding: 16px; overflow-y: auto; max-height: 100%; +} +.metric { + border: 1px solid var(--border); background: var(--bg-1); + padding: 14px; display: flex; flex-direction: column; gap: 6px; +} +.metric .m-label { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); letter-spacing: 0.08em; 
text-transform: uppercase; } +.metric .m-big { font-size: 28px; font-weight: 800; letter-spacing: -0.02em; } +.metric .m-sub { font-family: var(--mono); font-size: 11px; color: var(--fg-dim); } +.metric .m-explain { + color: var(--fg); font-family: var(--sans); font-size: 12px; + line-height: 1.45; margin-top: 2px; font-weight: 400; +} +.metric .m-source { + color: var(--fg-muted); font-size: 10px; letter-spacing: 0.05em; + border-top: 1px dashed var(--border); padding-top: 6px; margin-top: 6px; +} +.metric .m-good { + color: var(--green); font-size: 10px; letter-spacing: 0.03em; + line-height: 1.5; opacity: 0.85; +} +.metric.warn .m-good { color: var(--yellow); } +.metric.bad .m-good { color: var(--red); } +.metric.good .m-big { color: var(--green); } +.metric.warn .m-big { color: var(--yellow); } +.metric.bad .m-big { color: var(--red); } + +.bar { display: flex; height: 8px; border: 1px solid var(--border); background: var(--bg); margin-top: 4px; } +.bar > span { display: block; height: 100%; } +.bar .seg-auto { background: var(--green); } +.bar .seg-dry_run { background: var(--blue); } +.bar .seg-simulation { background: var(--yellow); } +.bar .seg-block { background: var(--red); } + +/* ────── KB ────── */ +.kb-grid { + padding: 16px; display: grid; gap: 10px; + grid-template-columns: repeat(auto-fill, minmax(420px, 1fr)); + overflow-y: auto; max-height: 100%; +} +.kb-banner { + grid-column: 1 / -1; + border: 1px solid var(--border); background: var(--bg-1); + padding: 14px 16px; border-left: 3px solid var(--blue); +} +.kb-banner-title { + font-family: var(--mono); font-size: 11px; color: var(--blue); + letter-spacing: 0.1em; font-weight: 700; margin-bottom: 6px; +} +.kb-banner-body { + color: var(--fg); font-size: 12px; line-height: 1.55; +} +.kb-statline { + grid-column: 1 / -1; + display: flex; gap: 18px; + font-family: var(--mono); font-size: 11px; color: var(--fg-dim); + padding: 8px 14px; border: 1px solid var(--border); + background: var(--bg-2); 
margin-bottom: 2px; +} +.kb-statline .stat-warn { color: var(--yellow); font-weight: 700; } +.kb-file { border: 1px solid var(--border); background: var(--bg-1); padding: 10px 12px; cursor: pointer; } +.kb-file:hover { border-color: var(--border-hi); } +.kb-file .kf-path { font-family: var(--mono); font-size: 11px; color: var(--fg); word-break: break-all; } +.kb-file .kf-meta { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); margin-top: 4px; display: flex; gap: 10px; flex-wrap: wrap; } +.kb-file .kf-score { font-weight: 700; color: var(--fg); } +.kb-file .kf-delta.up { color: var(--green); } +.kb-file .kf-delta.down { color: var(--red); } + +/* ────── CONSOLE ────── */ +#view-console { background: #000; display: none; flex-direction: column; } +#view-console.on { display: flex; } +.console-toolbar { + height: 36px; display: flex; align-items: center; gap: 8px; + padding: 0 12px; background: var(--bg-1); border-bottom: 1px solid var(--border); + flex-shrink: 0; +} +.console-toolbar .con-eyebrow { + font-family: var(--mono); font-size: 10px; color: var(--fg-muted); letter-spacing: 0.1em; + text-transform: uppercase; margin-right: 6px; +} +.console-toolbar .spacer { flex: 1; } +#con-tabs { display: flex; gap: 2px; } +#con-tabs button { + background: transparent; border: 1px solid var(--border); color: var(--fg-dim); + font-family: var(--mono); font-size: 10px; letter-spacing: 0.05em; + padding: 4px 10px; cursor: pointer; text-transform: lowercase; +} +#con-tabs button:hover { border-color: var(--border-hi); color: var(--fg); } +#con-tabs button.on { background: var(--fg); color: var(--bg); border-color: var(--fg); font-weight: 700; } +#con-unit { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); } + +#console-log { + flex: 1; padding: 12px 16px; overflow-y: auto; + font-family: var(--mono); font-size: 11px; color: var(--fg); + line-height: 1.5; +} +#console-log .cl-line { white-space: pre-wrap; word-break: break-all; } +#console-log 
.cl-info { color: var(--fg-dim); } +#console-log .cl-ok { color: var(--green); } +#console-log .cl-warn { color: var(--yellow); } +#console-log .cl-err { color: var(--red); } + +/* ────── STREAM (bottom) ────── */ +#stream { + height: 180px; background: var(--bg-1); + border-top: 1px solid var(--border); + display: flex; flex-direction: column; flex-shrink: 0; +} +.stream-head { + height: 26px; display: flex; align-items: center; gap: 10px; + padding: 0 14px; border-bottom: 1px solid var(--border); + font-family: var(--mono); font-size: 10px; color: var(--fg-muted); + letter-spacing: 0.1em; text-transform: uppercase; +} +.stream-head .spacer { flex: 1; } +#stream-file { color: var(--fg-dim); font-size: 10px; } +.dot { width: 8px; height: 8px; border-radius: 50%; background: var(--green); animation: pulse 1.4s ease-in-out infinite; } +@keyframes pulse { 50% { opacity: 0.35; transform: scale(0.8); } } + +#stream-body { + flex: 1; padding: 8px 14px; overflow-y: auto; font-family: var(--mono); font-size: 11px; color: var(--fg-dim); + display: flex; flex-direction: column; +} +.sline { padding: 1px 0; white-space: pre; } +.sline.ok { color: var(--green); } +.sline.thin { color: var(--yellow); } +.sline.err { color: var(--red); } +.sline.info { color: var(--fg-dim); } +.sline.head { color: var(--fg); font-weight: 600; } + +/* ────── TRAJECTORY view ────── */ +#view-trajectory { display: none; flex-direction: column; } +#view-trajectory.on { display: flex; } +.traj-header { + border-bottom: 1px solid var(--border); background: var(--bg-1); + padding: 12px 16px; display: flex; flex-direction: column; gap: 6px; +} +#traj-search { + width: 100%; background: var(--bg-2); color: var(--fg); + border: 1px solid var(--border); padding: 8px 10px; + font-family: var(--mono); font-size: 12px; +} +#traj-search:focus { outline: none; border-color: var(--blue); } +#traj-stats { + font-family: var(--mono); font-size: 10px; color: var(--fg-muted); letter-spacing: 0.06em; +} +#traj-body { 
+ flex: 1; overflow-y: auto; padding: 16px; + display: flex; flex-direction: column; gap: 10px; +} +.traj-section-head { + font-family: var(--mono); font-size: 11px; color: var(--blue); + letter-spacing: 0.1em; font-weight: 700; margin-top: 14px; margin-bottom: 4px; + border-bottom: 1px solid var(--border); padding-bottom: 6px; +} +.traj-section-head:first-child { margin-top: 0; } +.traj-section-explain { + color: var(--fg); font-size: 12px; line-height: 1.55; margin-bottom: 6px; + padding: 8px 10px; background: var(--bg-1); border-left: 2px solid var(--blue); +} +.traj-table { display: flex; flex-direction: column; border: 1px solid var(--border); } +.traj-row { + display: grid; grid-template-columns: 40px 1.5fr 80px 2fr 80px; gap: 12px; + padding: 8px 12px; border-bottom: 1px solid var(--border); + font-family: var(--mono); font-size: 11px; cursor: pointer; +} +.traj-row:hover { background: var(--bg-2); } +.traj-row:last-child { border-bottom: none; } +.traj-col-rank { color: var(--fg-muted); font-weight: 700; } +.traj-col-file { color: var(--fg); } +.traj-col-hits { color: var(--red); font-weight: 700; } +.traj-col-phrases { color: var(--fg-dim); white-space: nowrap; overflow: hidden; text-overflow: ellipsis; } +.traj-col-iters { color: var(--fg-muted); text-align: right; } + +.traj-spark-grid { + display: grid; grid-template-columns: repeat(auto-fill, minmax(480px, 1fr)); gap: 10px; +} +.traj-spark { + border: 1px solid var(--border); background: var(--bg-1); + padding: 12px; cursor: pointer; +} +.traj-spark:hover { border-color: var(--border-hi); } +.traj-spark-file { font-family: var(--mono); font-size: 11px; color: var(--fg); margin-bottom: 8px; } +.traj-spark-line { display: flex; align-items: center; gap: 6px; } +.traj-spark-pt { + flex: 0 0 80px; padding: 6px 8px; border: 1px solid var(--border); + background: var(--bg-2); text-align: center; +} +.traj-pt-score { font-family: var(--mono); font-size: 14px; font-weight: 800; color: var(--fg); } 
+.traj-pt-conf { font-family: var(--mono); font-size: 10px; color: var(--fg-dim); } +.traj-pt-label { font-family: var(--mono); font-size: 9px; color: var(--fg-muted); letter-spacing: 0.06em; } +.traj-spark-arrow { color: var(--fg-muted); font-size: 14px; } +.traj-spark-delta { font-family: var(--mono); font-size: 11px; color: var(--fg-dim); margin-top: 8px; } +.traj-spark-delta .delta-up { color: var(--green); } +.traj-spark-delta .delta-down { color: var(--red); } +.traj-spark-empty { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); font-style: italic; } + +.traj-hit { + border: 1px solid var(--border); padding: 10px 12px; background: var(--bg-1); cursor: pointer; +} +.traj-hit:hover { border-color: var(--border-hi); } +.traj-hit-top { display: flex; gap: 14px; margin-bottom: 6px; } +.traj-hit-file { font-family: var(--mono); font-size: 11px; color: var(--fg); font-weight: 600; } +.traj-hit-meta { font-family: var(--mono); font-size: 10px; color: var(--fg-muted); } +.traj-hit-snip { font-family: var(--mono); font-size: 11px; color: var(--fg-dim); line-height: 1.5; } + diff --git a/ui/ui.js b/ui/ui.js new file mode 100644 index 0000000..2ac5341 --- /dev/null +++ b/ui/ui.js @@ -0,0 +1,804 @@ +// Visual Control Plane — client (vanilla JS, D3 from CDN) +// Design note: KB data flows from local jsonl files we control, but we +// still use DOM methods (createElement/textContent) for every +// data-derived node to satisfy static analysis and keep a clean XSS +// boundary if the UI ever gets exposed. 
+ +const POLL_MS = 3000; + +const state = { + view: "map", + overlay: "status", + selected: null, + services: null, + reviews: [], + metrics: [], + overrides: [], + trust: [], + findings: [], +}; + +// ───── view switcher ───── +document.querySelectorAll("#views button").forEach(b => { + b.addEventListener("click", () => { + document.querySelectorAll("#views button").forEach(x => x.classList.remove("on")); + b.classList.add("on"); + state.view = b.dataset.view; + document.querySelectorAll(".view").forEach(v => v.classList.remove("on")); + document.getElementById(`view-${state.view}`).classList.add("on"); + renderView(); + }); +}); + +document.querySelectorAll("#overlay-controls button").forEach(b => { + b.addEventListener("click", () => { + document.querySelectorAll("#overlay-controls button").forEach(x => x.classList.remove("on")); + b.classList.add("on"); + state.overlay = b.dataset.ov; + if (state.services) drawMap(state.services); + }); +}); + +// ───── helpers ───── +function el(tag, opts = {}, ...kids) { + const n = document.createElement(tag); + if (opts.className) n.className = opts.className; + if (opts.text != null) n.textContent = String(opts.text); + if (opts.data) for (const k in opts.data) n.dataset[k] = opts.data[k]; + if (opts.attrs) for (const k in opts.attrs) n.setAttribute(k, opts.attrs[k]); + if (opts.style) for (const k in opts.style) n.style[k] = opts.style[k]; + for (const k of kids) if (k != null) n.append(k); + return n; +} +function clear(node) { while (node.firstChild) node.removeChild(node.firstChild); } +function row(k, v, valClass) { + const r = el("div", { className: "ctx-row" }); + r.append(el("span", { className: "k", text: k })); + const vv = el("span", { className: "v" + (valClass ? " " + valClass : ""), text: String(v ?? 
"-") }); + r.append(vv); + return r; +} +function short(v) { + if (v == null) return "-"; + if (typeof v === "object") return JSON.stringify(v).slice(0, 80); + return String(v).slice(0, 80); +} + +// ───── polling ───── +async function poll() { + try { + const [svc, rev, met, ov, tr, fin] = await Promise.all([ + fetch("/data/services").then(r => r.json()), + fetch("/data/reviews?tail=80").then(r => r.json()), + fetch("/data/metrics").then(r => r.json()), + fetch("/data/overrides").then(r => r.json()), + fetch("/data/trust").then(r => r.json()), + fetch("/data/findings").then(r => r.json()), + ]); + state.services = svc; + state.reviews = Array.isArray(rev) ? rev : []; + state.metrics = Array.isArray(met) ? met : []; + state.overrides = Array.isArray(ov) ? ov : []; + state.trust = Array.isArray(tr) ? tr : []; + state.findings = Array.isArray(fin) ? fin : []; + document.getElementById("build-ts").textContent = new Date(svc.ts).toLocaleTimeString(); + svc.nodes.forEach(n => { + const chip = document.querySelector(`.hbchip[data-svc="${n.id}"]`); + if (chip) chip.setAttribute("data-status", n.status); + }); + renderView(); + renderContext(); + pollStream(); + } catch (e) { console.error("poll error", e); } +} + +async function pollStream() { + try { + const j = await fetch("/data/scrum_log").then(r => r.json()); + if (!j.lines) return; + document.getElementById("stream-file").textContent = j.file ? j.file.split("/").pop() : "—"; + const body = document.getElementById("stream-body"); + clear(body); + j.lines.slice(-30).forEach(line => { + const cls = /✓ ACCEPTED/.test(line) ? "ok" + : /✗ thin/.test(line) ? "thin" + : /error|failed|FAIL/i.test(line) ? "err" + : /^\[scrum\] file:/.test(line) ? 
"head" + : "info"; + body.append(el("div", { className: "sline " + cls, text: line })); + }); + body.scrollTop = body.scrollHeight; + } catch {} +} + +function renderView() { + if (!state.services) return; + if (state.view === "map") drawMap(state.services); + else if (state.view === "trace") drawTrace(); + else if (state.view === "trajectory") drawTrajectory(); + else if (state.view === "metrics") drawMetrics(); + else if (state.view === "kb") drawKB(); + else if (state.view === "console") drawConsole(); +} + +// ───── MAP ───── +const NODES_STATIC = [ + { id: "gateway", x: 0.5, y: 0.15 }, + { id: "sidecar", x: 0.2, y: 0.3 }, + { id: "observer", x: 0.8, y: 0.3 }, + { id: "mcp", x: 0.85, y: 0.1 }, + { id: "context7", x: 0.15, y: 0.1 }, + { id: "journal", x: 0.35, y: 0.55 }, + { id: "vectord", x: 0.5, y: 0.5 }, + { id: "playbook", x: 0.65, y: 0.55 }, + { id: "agent", x: 0.5, y: 0.75 }, + { id: "usage", x: 0.2, y: 0.75 }, +]; +const EDGES = [ + ["gateway","sidecar"],["gateway","observer"],["gateway","mcp"],["gateway","context7"], + ["gateway","journal"],["gateway","vectord"],["gateway","playbook"],["gateway","agent"],["gateway","usage"], + ["vectord","playbook"],["agent","vectord"],["observer","playbook"],["sidecar","vectord"], +]; + +function drawMap(svc) { + const svg = d3.select("#map"); + const box = svg.node().getBoundingClientRect(); + const W = box.width, H = box.height; + svg.selectAll("*").remove(); + const statusMap = {}; + [...svc.nodes, ...svc.subsystems].forEach(n => statusMap[n.id] = n); + svg.selectAll(".edge").data(EDGES).enter().append("line") + .attr("class", d => "edge" + (overlayEdgeActive(d) ? 
" active" : "")) + .attr("x1", d => nodePos(d[0]).x * W).attr("y1", d => nodePos(d[0]).y * H) + .attr("x2", d => nodePos(d[1]).x * W).attr("y2", d => nodePos(d[1]).y * H); + const g = svg.selectAll(".node").data(NODES_STATIC).enter().append("g") + .attr("class", "node") + .attr("transform", d => `translate(${d.x * W}, ${d.y * H})`) + .on("click", (_ev, d) => { state.selected = { type:"node", id:d.id }; renderContext(); drawMap(svc); }); + g.append("circle") + .attr("class", d => "node-circle" + (state.selected?.type==="node" && state.selected.id===d.id ? " node-selected" : "")) + .attr("r", d => nodeRadius(d, statusMap)) + .attr("fill", d => nodeColor(d, statusMap)); + // SVG tooltip — hover a node, browser shows a native tooltip with + // what this node DOES, not just its name. + g.append("title").text(d => nodeTooltip(d.id)); + g.append("text").attr("class","node-label").attr("y", -30).text(d => nodeLabel(d.id)); + g.append("text").attr("class","node-sub").attr("y", 40).text(d => nodeSub(d, statusMap)); +} +function nodePos(id) { return NODES_STATIC.find(x => x.id === id) ?? { x:0, y:0 }; } +function nodeLabel(id) { + return ({gateway:"GATEWAY",sidecar:"SIDECAR",observer:"OBSERVER",mcp:"MCP",context7:"CTX7", + journal:"JOURNAL",vectord:"VECTORD",playbook:"PLAYBOOK",agent:"AUTOTUNE",usage:"USAGE"})[id] ?? 
id; +} +function nodeRadius(d, m) { + const n = m[d.id]; + if (state.overlay === "activity") { + if (d.id === "journal" && n?.stats?.total_events_created != null) return 14 + Math.min(20, Math.log2(n.stats.total_events_created + 1) * 2); + if (d.id === "vectord" && n?.stats?.count != null) return 14 + Math.min(20, Math.log2(n.stats.count + 1) * 2); + if (d.id === "playbook" && n?.stats?.total != null) return 14 + Math.min(20, Math.log2(n.stats.total + 1)); + if (d.id === "observer" && n?.stats?.total != null) return 14 + Math.min(20, Math.log2(n.stats.total + 1)); + if (d.id === "usage" && n?.stats?.requests != null) return 14 + Math.min(20, Math.log2(n.stats.requests + 1) * 2); + } + return 18; +} +function nodeColor(d, m) { + const n = m[d.id]; + const ov = state.overlay; + if (ov === "status" || ov === "activity") { + const st = n?.status ?? (n?.stats ? "healthy" : "unknown"); + return { healthy:"#3eed86", degraded:"#ffbf3c", down:"#ff4d6e", unknown:"#525c6f" }[st] ?? "#525c6f"; + } + if (ov === "confidence") { + const c = recentAvgConfidence(d.id); + if (c == null) return "#525c6f"; + if (c >= 88) return "#3eed86"; + if (c >= 70) return "#55c5ff"; + if (c >= 50) return "#ffbf3c"; + return "#ff4d6e"; + } + if (ov === "gradient") { + const t = recentGradientTier(d.id); + return t ? ({auto:"#3eed86",dry_run:"#55c5ff",simulation:"#ffbf3c",block:"#ff4d6e"}[t] ?? "#525c6f") : "#525c6f"; + } + if (ov === "verdict") { + const v = recentVerdict(d.id); + return {pass:"#3eed86",needs_patch:"#ff9f43",fail:"#ff4d6e"}[v] ?? "#525c6f"; + } + return "#55c5ff"; +} +function nodeSub(d, m) { + const n = m[d.id]; + if (!n) return "…"; + if (d.id === "journal" && n.stats) return `${n.stats.total_events_created ?? 0} events · ${n.stats.persisted_files ?? 0} parquet`; + if (d.id === "usage" && n.stats) return `${n.stats.requests ?? 0} requests · ${Math.round((n.stats.total_tokens ?? 
0)/1000)}k tokens`; + if (d.id === "vectord" && typeof n.stats === "object" && n.stats) return `${n.stats.count ?? 0} indexes`; + if (d.id === "playbook" && n.stats) return `${n.stats.active ?? 0} active · ${n.stats.retired ?? 0} retired`; + if (d.id === "agent" && n.stats) return `${n.stats.trials_run ?? 0} trials · ${n.stats.promotions ?? 0} promotions`; + if (d.id === "observer" && n.stats) return `${n.stats.total ?? 0} observed ops`; + return String(n.status ?? ""); +} + +// Describes what each node DOES — shown as SVG tooltip. +function nodeTooltip(id) { + return ({ + gateway: "GATEWAY — Rust/Axum HTTP on :3100. Every external call enters here: /v1/chat, /ingest, /query, /tools, /journal, /vectors. Also hosts gRPC on :3101.", + sidecar: "SIDECAR — Python FastAPI on :3200. Adapter from Rust to local Ollama (:11434). Handles /embed /generate /rerank. Stateless.", + observer: "OBSERVER — Bun on :3800. Ring buffer of recent ops across the system. Feeds analyzeErrors + PLAYBOOK_BUILDER loops. Scrum events now land here (P45 fix).", + mcp: "MCP — Bun on :3700. Model Context Protocol tool gateway. Agent-facing tool endpoints.", + context7: "CONTEXT7 — Bun on :3900. Doc-drift resolver — checks playbook doc_refs against current docs for version drift (Phase 45 target).", + journal: "JOURNAL — ADR-012 append-only mutation log inside the gateway. Every ingest/delta-write/tombstone should record here. Currently ~1 real event (P9-001 still mostly unwired).", + vectord: "VECTORD — Embeddings store + HNSW index + autotune harness. The 'indexes' count = named vector indexes live right now (one per source × model_version).", + playbook: "PLAYBOOK — Meta-index. Each entry = a successful past pattern + geo/role + 768d embedding. Active entries boost future vector-search results (Phase 19).", + agent: "AUTOTUNE — Background agent that continuously proposes HNSW config trials, picks Pareto winners above min_recall, promotes, and rolls back. 
Self-tuning vector index.", + usage: "USAGE — /v1/chat token counters. Tracks requests, prompt/completion tokens, per-provider breakdown. Grows with scrum + audit traffic.", + })[id] ?? id; +} +function overlayEdgeActive(edge) { + if (!state.reviews.length) return false; + const latest = state.reviews[state.reviews.length - 1]; + if (!latest?.reviewed_at) return false; + const age = Date.now() - new Date(latest.reviewed_at).getTime(); + if (age > 60000) return false; + return edge.includes("gateway") && (edge.includes("observer") || edge.includes("vectord")); +} +function matchesNode(r, id) { + if (!r?.file) return false; + const f = r.file.toLowerCase(); + if (id === "gateway") return f.includes("/gateway/"); + if (id === "vectord") return f.includes("/vectord"); + if (id === "journal") return f.includes("/journald"); + if (id === "playbook")return f.includes("playbook_memory"); + if (id === "sidecar") return f.includes("sidecar"); + if (id === "agent") return f.includes("agent.rs") || f.includes("autotune"); + return false; +} +function recentAvgConfidence(id) { + const rs = state.reviews.filter(r => matchesNode(r, id)); + const vs = rs.map(r => r.confidence_avg).filter(v => v != null); + return vs.length ? vs.reduce((a,b)=>a+b,0)/vs.length : null; +} +function recentGradientTier(id) { + const rs = state.reviews.filter(r => matchesNode(r, id)); + const ts = rs.map(r => r.gradient_tier).filter(Boolean); + return ts[ts.length - 1] ?? null; +} +function recentVerdict(id) { + const rs = state.reviews.filter(r => matchesNode(r, id)); + const vs = rs.map(r => r.verdict).filter(Boolean); + return vs[vs.length - 1] ?? null; +} + +// ───── CONTEXT ───── +function renderContext() { + const target = document.getElementById("ctx-target"); + const body = document.getElementById("ctx-body"); + clear(body); + if (!state.selected) { + target.textContent = "no selection"; + body.append(el("div", { className: "ctx-hint", text: "Click a node or a file in KB to inspect. 
Context persists across view switches." })); + body.append(el("div", { className: "ctx-section-hd", text: "System totals" })); + appendSummaryKV(body); + return; + } + if (state.selected.type === "node") renderNodeContext(state.selected.id, target, body); + else if (state.selected.type === "file") renderFileContext(state.selected.id, target, body); +} + +function appendSummaryKV(body) { + const s = state.services; + if (!s) { body.append(el("div", { className: "ctx-hint", text: "loading…" })); return; } + const get = id => s.nodes.concat(s.subsystems).find(n => n.id === id); + const journal = get("journal")?.stats ?? {}; + const usage = get("usage")?.stats ?? {}; + const playbook = get("playbook")?.stats ?? {}; + const agent = get("agent")?.stats ?? {}; + const observer = get("observer")?.stats ?? {}; + body.append(row("scrum reviews", state.reviews.length)); + body.append(row("journal events", journal.total_events_created ?? 0)); + body.append(row("usage tokens", (usage.total_tokens ?? 0).toLocaleString())); + body.append(row("playbook active", playbook.active ?? 0)); + body.append(row("autotune trials", agent.trials_run ?? 0)); + body.append(row("observer ops", observer.total ?? 0)); + body.append(row("findings (h/m/l)", `${countFindingsSev("high")}/${countFindingsSev("medium")}/${countFindingsSev("low")}`)); +} + +function countFindingsSev(sev) { + let n = 0; + for (const row of state.findings) for (const f of row.findings ?? []) if (f.severity === sev) n++; + return n; +} + +function renderNodeContext(id, target, body) { + target.textContent = `NODE · ${id.toUpperCase()}`; + const n = [...state.services.nodes, ...state.services.subsystems].find(x => x.id === id); + if (n?.health) { + body.append(el("div", { className: "ctx-section-hd", text: "Health" })); + // Fix 2026-04-24: some /health endpoints return a plain string like + // "lakehouse ok". Don't Object.entries() on strings — that iterates + // characters. Detect primitive vs object explicitly. 
+ if (typeof n.health === "string" || typeof n.health === "number" || typeof n.health === "boolean") { + body.append(row("response", String(n.health).slice(0, 80))); + } else if (typeof n.health === "object" && n.health !== null) { + Object.entries(n.health).slice(0, 8).forEach(([k,v]) => body.append(row(k, short(v)))); + } + } + if (n?.stats) { + body.append(el("div", { className: "ctx-section-hd", text: "Stats" })); + if (typeof n.stats === "string") { + body.append(row("raw", String(n.stats).slice(0, 80))); + } else if (typeof n.stats === "object" && n.stats !== null) { + Object.entries(n.stats).slice(0, 10).forEach(([k,v]) => body.append(row(k, short(v)))); + } + } + const related = state.reviews.filter(r => matchesNode(r, id)).slice(-5).reverse(); + if (related.length) { + body.append(el("div", { className: "ctx-section-hd", text: "Recent reviews" })); + related.forEach(r => { + const rr = row(r.file.split("/").pop(), `${r.confidence_avg ?? "-"}% · ${r.alignment_score ?? "?"}/10`); + rr.style.cursor = "pointer"; + rr.addEventListener("click", () => { state.selected = { type:"file", id:r.file }; renderContext(); }); + body.append(rr); + }); + } + if (!body.firstChild) body.append(el("div", { className: "ctx-hint", text: "no data yet" })); +} + +function renderFileContext(fpath, target, body) { + target.textContent = fpath.split("/").slice(-3).join("/"); + const fileReviews = state.reviews.filter(r => r.file === fpath).slice(-6); + if (!fileReviews.length) { + body.append(el("div", { className: "ctx-hint", text: `no reviews for ${fpath}` })); + return; + } + const latest = fileReviews[fileReviews.length - 1]; + const pillRow = el("div", { style: { paddingBottom: "6px" } }); + if (latest.gradient_tier) pillRow.append(el("span", { className: `pill tier-${latest.gradient_tier}`, text: latest.gradient_tier })); + if (latest.verdict) pillRow.append(el("span", { className: `pill ver-${latest.verdict}`, text: latest.verdict })); + if (latest.output_format) 
pillRow.append(el("span", { className: `pill fmt-${latest.output_format}`, text: latest.output_format })); + body.append(pillRow); + const rows = [ + ["file", fpath], + ["score", latest.alignment_score != null ? `${latest.alignment_score}/10` : "-"], + ["conf avg", latest.confidence_avg != null ? `${latest.confidence_avg}%` : "-"], + ["conf min", latest.confidence_min != null ? `${latest.confidence_min}%` : "-"], + ["findings", latest.findings_count ?? 0], + ["critical", latest.critical_failures_count ?? 0], + ["verified", latest.verified_components_count ?? 0], + ["missing", latest.missing_components_count ?? 0], + ["model", latest.accepted_model ?? "-"], + ["attempts", latest.attempts_made ?? 1], + ["tree split", latest.tree_split_fired ? "yes" : "no"], + ]; + rows.forEach(([k,v]) => body.append(row(k, short(v)))); + body.append(el("div", { className: "ctx-section-hd", text: "Score history" })); + fileReviews.forEach(r => body.append(row(new Date(r.reviewed_at).toLocaleTimeString(), `${r.alignment_score ?? "?"}/10 · ${r.confidence_avg ?? "-"}%`))); + body.append(el("div", { className: "ctx-section-hd", text: "Preview" })); + const pre = el("pre", { text: latest.suggestions_preview ?? "", style: { whiteSpace: "pre-wrap", fontFamily: "var(--mono)", fontSize: "10px", color: "var(--fg-dim)", maxHeight: "200px", overflowY: "auto" } }); + body.append(pre); + document.getElementById("stream-file").textContent = fpath.split("/").pop(); +} + +// ───── TRACE ───── +async function drawTrace() { + const fpath = state.selected?.type === "file" ? state.selected.id : state.reviews[state.reviews.length-1]?.file; + const tl = document.getElementById("trace-timeline"); + const detail = document.getElementById("trace-detail"); + clear(tl); clear(detail); + document.getElementById("trace-file").textContent = fpath ?? 
"—"; + if (!fpath) { tl.append(el("div", { className: "ctx-hint", text: "no file selected — pick one in KB view" })); return; } + const r = await fetch(`/data/file/${encodeURIComponent(fpath)}`).then(r => r.json()); + const history = r.history ?? []; + document.getElementById("trace-runs").textContent = `${history.length} runs`; + history.forEach((h, i) => { + const node = el("div", { className: "trace-node" + (i === history.length - 1 ? " active" : "") }); + node.append(el("div", { className: "tn-run", text: h.run_id })); + node.append(el("div", { className: "tn-score", text: h.score != null ? String(h.score) : "?" })); + node.append(el("div", { className: "tn-conf", text: `conf ${h.conf_avg ?? "-"}% · ${h.findings}f` })); + node.append(el("div", { className: "tn-model", text: (h.model ?? "").split("/").pop() })); + node.addEventListener("click", () => { + tl.querySelectorAll(".trace-node").forEach(x => x.classList.remove("active")); + node.classList.add("active"); + clear(detail); + detail.append(el("pre", { text: h.preview ?? "" })); + }); + tl.append(node); + }); + if (history.length) { clear(detail); detail.append(el("pre", { text: history[history.length-1].preview ?? "" })); } +} + +// ───── TRAJECTORY — refactor signals + reverse index + per-file delta ───── + +let trajectorySearchTimer = null; +document.getElementById("traj-search")?.addEventListener("input", (e) => { + const q = e.target.value.trim(); + clearTimeout(trajectorySearchTimer); + trajectorySearchTimer = setTimeout(() => runReverseIndex(q), 300); +}); + +async function runReverseIndex(query) { + const body = document.getElementById("traj-body"); + if (!query) { drawTrajectory(); return; } + clear(body); + const res = await fetch(`/data/search?q=${encodeURIComponent(query)}`).then(r => r.json()); + const hdr = el("div", { className: "traj-section-head", text: `REVERSE INDEX · "${query}" · ${res.hits?.length ?? 0} hits` }); + body.append(hdr); + (res.hits ?? 
[]).forEach(h => { + const card = el("div", { className: "traj-hit" }); + card.append(el("div", { className: "traj-hit-top" }, + el("span", { className: "traj-hit-file", text: h.file }), + el("span", { className: "traj-hit-meta", text: `${h.run_id} · ${(h.model ?? "").split("/").pop()}` }) + )); + card.append(el("div", { className: "traj-hit-snip", text: h.snippet })); + card.addEventListener("click", () => { + state.selected = { type: "file", id: `/home/profit/lakehouse/${h.file}` }; + renderContext(); + document.querySelector('#views button[data-view="trace"]').click(); + }); + body.append(card); + }); +} + +async function drawTrajectory() { + const body = document.getElementById("traj-body"); + clear(body); + const statsEl = document.getElementById("traj-stats"); + clear(statsEl); + + // SECTION 1 — refactor signals + const sig = await fetch("/data/refactor_signals").then(r => r.json()); + const sigs = sig.signals ?? []; + const totalHits = sigs.reduce((a,s) => a + s.hits, 0); + statsEl.textContent = `${sig.scanned ?? 0} files scanned · ${sigs.length} with refactor hints · ${totalHits} phrase hits total`; + + const sigHead = el("div", { className: "traj-section-head", text: "REFACTOR SIGNALS · files the scrum repeatedly flagged as dead / redundant / stub / needs-rewrite" }); + body.append(sigHead); + + const explain = el("div", { className: "traj-section-explain", text: + "Aggregates across all scrum iterations. A phrase hit = one time the reviewer used language like 'remove', 'duplicate', 'refactor', 'pseudocode', 'orphaned'. " + + "Files near the top are the strongest refactor candidates — the scrum keeps calling them out. Click a row to jump to its per-iteration trace." 
+ }); + body.append(explain); + + const table = el("div", { className: "traj-table" }); + sigs.slice(0, 30).forEach(s => { + const r = el("div", { className: "traj-row" }); + r.append(el("div", { className: "traj-col-rank", text: String(sigs.indexOf(s) + 1) })); + r.append(el("div", { className: "traj-col-file", text: s.file })); + r.append(el("div", { className: "traj-col-hits", text: `${s.hits}×` })); + const topPhrases = Object.entries(s.phrases).sort((a,b)=>b[1]-a[1]).slice(0,3) + .map(([p,n]) => `${p} (${n})`).join(", "); + r.append(el("div", { className: "traj-col-phrases", text: topPhrases })); + r.append(el("div", { className: "traj-col-iters", text: `${s.iterations} iter` })); + r.addEventListener("click", () => { + state.selected = { type: "file", id: `/home/profit/lakehouse/${s.file}` }; + renderContext(); + document.querySelector('#views button[data-view="trace"]').click(); + }); + table.append(r); + }); + body.append(table); + + // SECTION 2 — per-file trajectory: pick the top-5 refactor candidates and + // show their score/conf delta across iterations inline. + if (sigs.length) { + body.append(el("div", { className: "traj-section-head", text: "SCORE TRAJECTORY — top refactor candidates" })); + const grid = el("div", { className: "traj-spark-grid" }); + for (const s of sigs.slice(0, 6)) { + const card = el("div", { className: "traj-spark" }); + card.append(el("div", { className: "traj-spark-file", text: s.file })); + // pull history + const hist = await fetch(`/data/file/${encodeURIComponent("/home/profit/lakehouse/" + s.file)}`) + .then(r => r.json()).catch(() => ({ history: [] })); + const runs = hist.history ?? []; + if (runs.length === 0) { card.append(el("div", { className: "traj-spark-empty", text: "no history" })); } + else { + const line = el("div", { className: "traj-spark-line" }); + runs.forEach((h,i) => { + const pt = el("div", { className: "traj-spark-pt" }); + pt.append(el("div", { className: "traj-pt-score", text: h.score != null ? 
`${h.score}/10` : "?" })); + pt.append(el("div", { className: "traj-pt-conf", text: `${h.conf_avg ?? "-"}%` })); + pt.append(el("div", { className: "traj-pt-label", text: `iter${i+1}` })); + line.append(pt); + if (i < runs.length - 1) line.append(el("div", { className: "traj-spark-arrow", text: "→" })); + }); + card.append(line); + // delta summary + if (runs.length >= 2) { + const first = runs[0], last = runs[runs.length - 1]; + const dScore = (last.score != null && first.score != null) ? (last.score - first.score) : null; + const dConf = (last.conf_avg != null && first.conf_avg != null) ? (last.conf_avg - first.conf_avg) : null; + const delta = el("div", { className: "traj-spark-delta" }); + if (dScore != null) delta.append(el("span", { text: `Δscore ${dScore > 0 ? "+" : ""}${dScore.toFixed(1)}`, className: dScore < 0 ? "delta-down" : dScore > 0 ? "delta-up" : "" })); + if (dConf != null) delta.append(el("span", { text: ` · Δconf ${dConf > 0 ? "+" : ""}${dConf}%`, className: dConf > 0 ? "delta-up" : dConf < 0 ? "delta-down" : "" })); + card.append(delta); + } + } + card.addEventListener("click", () => { + state.selected = { type: "file", id: `/home/profit/lakehouse/${s.file}` }; + renderContext(); + document.querySelector('#views button[data-view="trace"]').click(); + }); + grid.append(card); + } + body.append(grid); + } +} + +// ───── METRICS ───── +function metricBox(label, big, kind, opts = {}) { + // opts: { source, good, explain } + // source = where the number comes from (data path) + // good = the "what's a healthy value" sentence + // explain = one-line definition of what this counts + const box = el("div", { className: "metric" + (kind ? 
" " + kind : "") }); + box.append(el("div", { className: "m-label", text: label })); + box.append(el("div", { className: "m-big", text: big })); + if (opts.explain) box.append(el("div", { className: "m-sub m-explain", text: opts.explain })); + if (opts.source) box.append(el("div", { className: "m-sub m-source", text: "SOURCE · " + opts.source })); + if (opts.good) box.append(el("div", { className: "m-sub m-good", text: "GOOD · " + opts.good })); + return box; +} +function drawMetrics() { + const grid = document.getElementById("metric-grid"); + clear(grid); + const byTier = { auto:0, dry_run:0, simulation:0, block:0, unknown:0 }; + state.reviews.forEach(r => { const t = r.gradient_tier ?? "unknown"; if (byTier[t] != null) byTier[t]++; }); + const total = state.reviews.length || 1; + const confRows = state.reviews.filter(r => r.confidence_avg != null); + const avg = confRows.length ? Math.round(confRows.reduce((a,r)=>a+r.confidence_avg,0)/confRows.length) : 0; + const verdictCount = { pass:0, needs_patch:0, fail:0, unknown:0 }; + state.reviews.forEach(r => { const v=r.verdict??"unknown"; if(verdictCount[v]!=null) verdictCount[v]++; }); + const findingsTotal = state.reviews.reduce((a,r)=>a+(r.findings_count??0),0); + const critTotal = state.reviews.reduce((a,r)=>a+(r.critical_failures_count??0),0); + const verTotal = state.reviews.reduce((a,r)=>a+(r.verified_components_count??0),0); + const usage = state.services?.subsystems?.find(n=>n.id==="usage")?.stats ?? {}; + const journal = state.services?.subsystems?.find(n=>n.id==="journal")?.stats ?? {}; + + grid.append(metricBox("avg confidence", `${avg}%`, avg>=85?"good":avg>=70?"warn":"bad", { + explain: "Self-assessed probability per suggestion, averaged across every review.", + source: "scrum_reviews.jsonl .confidence_avg", + good: "≥85% — model is confident. 70-84% routine. 
<70% means the scrum is uncertain and findings need human review.", + })); + grid.append(metricBox("scrum reviews", String(state.reviews.length), "good", { + explain: "Every source file reviewed by the scrum master, across all iterations.", + source: `${state.metrics.length} scrum runs tracked in scrum_loop_metrics.jsonl`, + good: "Grows every run — 21 files × N iterations. Stall = pipeline broken.", + })); + grid.append(metricBox("critical failures", String(critTotal), critTotal>50?"bad":critTotal>10?"warn":"good", { + explain: "Hard FAILs flagged by the forensic reviewer — pseudocode, fake implementations, unwired invariants. Each one is a concrete code-level gap.", + source: "scrum_reviews.jsonl .critical_failures_count (forensic JSON format only)", + good: "Trending DOWN each iteration = fixes are landing. Rising = new gaps surfacing faster than we close them.", + })); + grid.append(metricBox("verified components", String(verTotal), verTotal>0?"good":"warn", { + explain: "What the scrum CONFIRMED is working — with file/line evidence. The inverse of critical_failures.", + source: "scrum_reviews.jsonl .verified_components_count", + good: "Trending UP = the system has more provably-real parts over time. Should grow as fixes land.", + })); + grid.append(metricBox("findings captured", String(findingsTotal), "good", { + explain: "Total individual suggestions the scrum produced across all reviews (tables + JSON).", + source: "scrum_reviews.jsonl .findings_count summed", + good: "Higher = more scrutiny per file. Per-file average ≥10 means the review is substantive.", + })); + grid.append(metricBox("journal events", String(journal.total_events_created ?? 0), "good", { + explain: "Mutation events recorded via ADR-012 append-only journal. Every ingest/delta-write should emit one.", + source: "/journal/stats → total_events_created", + good: "Should grow with ingest traffic. 
1 = only a test probe fired; internal callers still unwired on most paths (P9-001).", + })); + grid.append(metricBox("v1 requests", String(usage.requests ?? 0), "good", { + explain: "Calls through the Universal API /v1/chat endpoint (Phase 38). Captures all scrum + audit traffic.", + source: `/v1/usage → requests. ${(usage.total_tokens ?? 0).toLocaleString()} tokens total`, + good: "Every iteration adds ~21 requests. Stall = scrum paused OR callers bypassing the gateway (P44-style bypass).", + })); + + // gradient bar + const gb = el("div", { className: "metric" }); + gb.append(el("div", { className: "m-label", text: "permission gradient" })); + gb.append(el("div", { className: "m-big", text: String(state.reviews.length) })); + gb.append(el("div", { className: "m-sub m-explain", text: "Tiers the scrum's suggestions by confidence: how much auto-apply we can trust per file." })); + const bar = el("div", { className: "bar" }); + bar.append(el("span", { className: "seg-auto", style: { width: `${100*byTier.auto/total}%` } })); + bar.append(el("span", { className: "seg-dry_run", style: { width: `${100*byTier.dry_run/total}%` } })); + bar.append(el("span", { className: "seg-simulation", style: { width: `${100*byTier.simulation/total}%` } })); + bar.append(el("span", { className: "seg-block", style: { width: `${100*byTier.block/total}%` } })); + gb.append(bar); + gb.append(el("div", { className: "m-sub", text: `auto ${byTier.auto} · dry_run ${byTier.dry_run} · sim ${byTier.simulation} · block ${byTier.block}` })); + gb.append(el("div", { className: "m-sub m-good", text: + "AUTO (≥90%): ship the suggestion. DRY_RUN (70-89): apply then diff. SIMULATION (50-69): test first. BLOCK (<50): human review — the model doesn't trust itself." 
+ })); + grid.append(gb); + + const vb = el("div", { className: "metric" }); + vb.append(el("div", { className: "m-label", text: "verdict distribution" })); + vb.append(el("div", { className: "m-big", text: String(verdictCount.pass + verdictCount.needs_patch + verdictCount.fail) })); + vb.append(el("div", { className: "m-sub m-explain", text: "Forensic audit verdict per file: pass = works, needs_patch = fixable gaps, fail = not trustable." })); + vb.append(el("div", { className: "m-sub", text: `pass ${verdictCount.pass} · needs_patch ${verdictCount.needs_patch} · fail ${verdictCount.fail}` })); + vb.append(el("div", { className: "m-sub m-source", text: "SOURCE · scrum_reviews.jsonl .verdict (forensic JSON only — markdown rows count as unknown)" })); + grid.append(vb); +} + +// ───── KB ───── +function drawKB() { + const grid = document.getElementById("kb-grid"); + clear(grid); + + // Explanatory banner — each iteration the scrum re-reviews every + // target file and writes a row here. A card = one file's latest + // review. Click to drill into its trace across all iterations. + const banner = el("div", { className: "kb-banner" }); + banner.append(el("div", { className: "kb-banner-title", text: "KNOWLEDGE BASE — every source file reviewed by the scrum master" })); + banner.append(el("div", { className: "kb-banner-body", text: + "Each card below is the LATEST scrum review of one source file. The review itself lives in data/_kb/scrum_reviews.jsonl. " + + "Fields: score (scrum's alignment rating, 1-10 vs PRD intent), conf (model's self-assessed confidence per suggestion, avg'd), " + + "findings (# of suggestions), crit (critical_failures — hard FAILs found), verified (verified_components — what's confirmed working). " + + "Pills show: permission gradient (can we trust auto-apply), verdict (pass/needs_patch/fail), output format (JSON = forensic, markdown = legacy). " + + "Click a card to see its trace across all iterations (iter 1 → iter N) and watch scores trend." 
+ })); + grid.append(banner); + + const byFile = new Map(); + state.reviews.forEach(r => { if (r.file) byFile.set(r.file, r); }); + const rows = [...byFile.values()].sort((a,b) => (b.confidence_avg??0) - (a.confidence_avg??0)); + + // Quick stats above the cards + const statLine = el("div", { className: "kb-statline" }); + const avgConf = rows.length ? Math.round(rows.reduce((a,r)=>a+(r.confidence_avg??0),0) / rows.length) : 0; + const scoreMean = rows.filter(r=>r.alignment_score!=null); + const avgScore = scoreMean.length ? (scoreMean.reduce((a,r)=>a+r.alignment_score,0) / scoreMean.length).toFixed(1) : "?"; + const blockCount = rows.filter(r => r.gradient_tier === "block").length; + statLine.append(el("span", { text: `${rows.length} files tracked` })); + statLine.append(el("span", { text: `mean score ${avgScore}/10` })); + statLine.append(el("span", { text: `mean confidence ${avgConf}%` })); + statLine.append(el("span", { text: `${blockCount} blocked (need human review)`, className: blockCount > 0 ? "stat-warn" : "" })); + grid.append(statLine); + + rows.forEach(r => { + const card = el("div", { className: "kb-file", data: { file: r.file } }); + card.append(el("div", { className: "kf-path", text: r.file })); + const meta = el("div", { className: "kf-meta" }); + const scoreSpan = el("span", { className: "kf-score", text: `${r.alignment_score ?? "?"}/10` }); + scoreSpan.title = "Scrum's alignment score (1-10) — how well this file matches PRD intent. Lower = more gaps."; + meta.append(scoreSpan); + const confSpan = el("span", { text: `conf ${r.confidence_avg ?? "-"}%` }); + confSpan.title = "Average self-confidence across suggestions. <70% = model uncertain, treat carefully."; + meta.append(confSpan); + const findingsSpan = el("span", { text: `${r.findings_count ?? 
0} findings` }); + findingsSpan.title = "Total suggestions in this review (table rows or JSON array entries)."; + meta.append(findingsSpan); + const critSpan = el("span", { text: `${r.critical_failures_count ?? 0} crit` }); + critSpan.title = "Critical failures: pseudocode, fake implementations, unwired invariants. Hard FAILs."; + if ((r.critical_failures_count ?? 0) > 0) critSpan.style.color = "var(--red)"; + meta.append(critSpan); + const verSpan = el("span", { text: `${r.verified_components_count ?? 0} verified` }); + verSpan.title = "Verified components: things the scrum CONFIRMED work, with file/line evidence."; + if ((r.verified_components_count ?? 0) > 0) verSpan.style.color = "var(--green)"; + meta.append(verSpan); + meta.append(el("span", { text: (r.accepted_model ?? "").split("/").pop(), attrs: { title: "Which model produced this review" } })); + card.append(meta); + const pills = el("div", { className: "kf-meta" }); + if (r.gradient_tier) { + const p = el("span", { className: `pill tier-${r.gradient_tier}`, text: r.gradient_tier }); + p.title = ({ + auto: "AUTO — confidence ≥90%, suggestions safe to apply automatically", + dry_run: "DRY_RUN — confidence 70-89%, apply then review the diff", + simulation: "SIMULATION — confidence 50-69%, test in sandbox first", + block: "BLOCK — confidence <50%, requires human review, do not auto-apply", + })[r.gradient_tier] ?? r.gradient_tier; + pills.append(p); + } + if (r.verdict) { + const p = el("span", { className: `pill ver-${r.verdict}`, text: r.verdict }); + p.title = ({ + pass: "PASS — scrum confirms this file meets its PRD intent", + needs_patch: "NEEDS_PATCH — gaps exist but are fixable; scrum has concrete suggestions", + fail: "FAIL — file cannot be trusted for its claimed purpose without structural changes", + })[r.verdict] ?? 
r.verdict; + pills.append(p); + } + if (r.output_format) { + const p = el("span", { className: `pill fmt-${r.output_format}`, text: r.output_format }); + p.title = r.output_format === "forensic_json" + ? "FORENSIC_JSON — structured output with verdict/critical/verified/missing fields. Richer signal." + : "MARKDOWN — legacy tabular output. Lower structure; we only extract confidence scalars from these."; + pills.append(p); + } + card.append(pills); + card.addEventListener("click", () => { + state.selected = { type: "file", id: r.file }; + renderContext(); + document.querySelector('#views button[data-view="trace"]').click(); + }); + grid.append(card); + }); +} + +// ───── CONSOLE ───── +// Persistent selection across polls so tab switches survive. +state.consoleSvc = "gateway"; + +// Hook tab buttons once +document.querySelectorAll("#con-tabs button").forEach(b => { + b.addEventListener("click", () => { + document.querySelectorAll("#con-tabs button").forEach(x => x.classList.remove("on")); + b.classList.add("on"); + state.consoleSvc = b.dataset.svc; + drawConsole(); + }); +}); + +async function drawConsole() { + const log = document.getElementById("console-log"); + clear(log); + const unit = document.getElementById("con-unit"); + if (unit) unit.textContent = ""; + + if (state.consoleSvc === "summary") { + drawConsoleSummary(log); + return; + } + + // Per-service log tail + const svc = state.consoleSvc; + try { + const res = await fetch(`/data/logs/${svc}?n=120`).then(r => r.json()); + if (unit && res.unit) unit.textContent = `unit · ${res.unit}`; + if (res.error) { + log.append(lineInfo(`[error] ${res.error}`, "cl-err")); + return; + } + const lines = res.lines ?? []; + if (!lines.length) { log.append(lineInfo("(no log lines — unit may have just started)", "cl-info")); return; } + lines.forEach(l => { + const cls = /\berror\b|\bERROR\b|panic|\[ERROR|failed/.test(l) ? "cl-err" + : /\bwarn\b|\bWARN\b|\bwarning\b|\[WARN/.test(l) ? 
"cl-warn" + : /\baccepted\b|\bok\b|\bOK\b|success|complete|ready/.test(l) ? "cl-ok" + : "cl-info"; + log.append(lineInfo(l, cls)); + }); + // autoscroll to bottom + log.scrollTop = log.scrollHeight; + } catch (e) { + log.append(lineInfo(`[fetch-error] ${e}`, "cl-err")); + } +} + +function lineInfo(text, cls) { + return el("div", { className: "cl-line " + cls, text }); +} + +function drawConsoleSummary(log) { + const info = t => lineInfo(t, "cl-info"); + const ok = t => lineInfo(t, "cl-ok"); + const warn = t => lineInfo(t, "cl-warn"); + const err = t => lineInfo(t, "cl-err"); + log.append(info(`# Lakehouse VCP · ${new Date().toLocaleTimeString()}`)); + log.append(info(`# Services`)); + for (const n of state.services?.nodes ?? []) { + const line = `[${String(n.status).padEnd(8)}] ${n.label}`; + log.append(n.status === "healthy" ? ok(line) : n.status === "down" ? err(line) : warn(line)); + } + log.append(info(`# Subsystems`)); + for (const s of state.services?.subsystems ?? []) { + log.append(info(` ${String(s.id).padEnd(10)} ${JSON.stringify(s.stats ?? {}).slice(0, 120)}`)); + } + log.append(info(`# Recent overrides (layer 10)`)); + for (const o of state.overrides.slice(-6)) { + log.append(warn(` [${o.ts}] ${o.task_signature}: ${o.human_fix}`)); + } + log.append(info(`# Model trust accumulated`)); + const agg = {}; + for (const t of state.trust) { + const k = t.accepted_model ?? "?"; + agg[k] = agg[k] ?? { accepts:0, thin:0, attempts:0 }; + agg[k].accepts++; + agg[k].thin += t.thin_rejections ?? 0; + agg[k].attempts += t.attempts_made ?? 0; + } + for (const [m, s] of Object.entries(agg)) { + log.append(info(` ${String(m).padEnd(48)} accepts=${s.accepts} thin=${s.thin} attempts=${s.attempts}`)); + } +} + +// ───── boot ───── +poll(); +setInterval(poll, POLL_MS); +window.addEventListener("resize", () => { if (state.services && state.view === "map") drawMap(state.services); });