root 4087dde780
Some checks failed
lakehouse/auditor 2 blocking issues: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
execution_loop: update stale test assertion to match current prompt format
Pre-existing failure I've been noting across this session —
`executor_prompt_includes_surfaced_candidates` expected the substring
"W-1 Alice Smith" but the prompt format was intentionally changed
(probably in a Phase 38/39 commit) to separate doc_id from name so
the executor doesn't conflate `doc_id` (vector-index key) with
`workers_500k.worker_id` (integer PK).

Current prompt format (line 1178 in build_executor_prompt):
  - name="Alice Smith"  city="Toledo"  state="OH"  (vector doc_id=W-1)

The prompt body explicitly instructs the model NOT to conflate the
two IDs — the format separation is the mechanism enforcing that
instruction. The OLD test assertion predated that separation.

Assertion now checks the semantic contract (both tokens present,
any order) instead of the exact old concatenation.

Workspace test result after this commit: 343 passed, 0 failed, 0
warnings (both lib + tests).

This is the last stale-test hole from the phase-audit sweep — it
popped up during the 41-commit push but I was leaving it as
pre-existing-unrelated. J called it: sitting broken for hours is
worse than a one-line assertion update.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 14:06:24 -05:00

1859 lines
81 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! `ExecutionLoop` — the Rust port of `tests/multi-agent/orchestrator.ts`.
//!
//! Incremental port (2026-04-23). Pieces in order of landing:
//! 1. ✅ Playbook-boost context retrieval
//! 2. ✅ Executor turn via the shared ollama::chat path
//! 3. ✅ Reviewer turn + critique parse (this commit)
//! 4. ⬜ Tool-call dispatch — hybrid_search / sql / Phase-12 tools (orchestrator.ts:101-124)
//! 5. ✅ Consensus detection + drift counter (this commit)
//! 6. ⬜ Truth-layer gate (Phase 42 — refuse before burning tokens)
//! 7. ⬜ Validator call (Phase 43 stub)
//! 8. ⬜ Cloud escalation on repeat failure (T3 gpt-oss:120b)
//! 9. ⬜ Playbook seal + /vectors/playbook_memory/seed (orchestrator.ts:255-293)
//! 10. ⬜ KB write-through: outcomes + facts (Phase 22)
pub mod kb_context;
use serde::{Deserialize, Serialize};
use crate::v1::{respond::RespondRequest, V1State};
use kb_context::KbContext;
/// Default executor model when the request doesn't override it.
const DEFAULT_EXECUTOR_MODEL: &str = "qwen3.5:latest";
/// Default reviewer model when the request doesn't override it.
const DEFAULT_REVIEWER_MODEL: &str = "qwen3:latest";
/// Hard cap on executor/reviewer turn pairs per invocation.
const DEFAULT_MAX_TURNS: u32 = 12;
/// Matches orchestrator.ts:31. Three consecutive drift flags OR tool
/// errors aborts the loop — the executor isn't self-correcting.
const MAX_CONSECUTIVE_DRIFTS: u32 = 3;
/// One entry in the loop's turn-by-turn transcript.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct LogEntry {
    /// 1-based turn index; 0 for pre-loop system entries (truth block,
    /// playbook boost).
    pub turn: u32,
    /// Producer: "executor", "reviewer", or "system".
    pub role: String,
    /// Model name, or a pseudo-source like "truth" / "playbook_memory".
    pub model: String,
    /// Entry kind, e.g. "tool_result", "critique", "error", "block".
    pub kind: String,
    /// Kind-specific JSON payload.
    pub content: serde_json::Value,
    /// RFC 3339 UTC timestamp stamped at construction.
    pub at: String,
}
impl LogEntry {
    /// Build an entry stamped with the current UTC time (RFC 3339).
    fn new(turn: u32, role: &str, model: &str, kind: &str, content: serde_json::Value) -> Self {
        let at = chrono::Utc::now().to_rfc3339();
        Self {
            turn,
            role: role.into(),
            model: model.into(),
            kind: kind.into(),
            content,
            at,
        }
    }
}
/// Action = what an agent returns on one turn. PORT FROM agent.ts:312.
/// Strict-shape enum so the executor/reviewer can't wedge the loop
/// with ambiguous output — either it parses, or `parse_action` throws
/// and the orchestrator appends an error turn.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum Action {
    /// Executor lays out intended steps; no side effects.
    Plan { steps: Vec<String> },
    /// Executor requests one tool invocation; routed by `dispatch_tool`.
    ToolCall { tool: String, args: serde_json::Value, #[serde(default)] rationale: String },
    /// Executor claims completion with its final fills.
    ProposeDone { fills: Vec<Fill>, #[serde(default)] rationale: String },
    /// Reviewer's judgment of the executor's latest turn.
    Critique { verdict: Verdict, #[serde(default)] notes: String },
}
/// Reviewer verdicts carried inside `Action::Critique`.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum Verdict {
    /// Executor is on track — keep looping.
    Continue,
    /// Executor is off track; bumps the consecutive-drift counter.
    Drift,
    /// Reviewer endorses the executor's ProposeDone — consensus.
    ApproveDone,
}
/// One candidate placement inside a `ProposeDone`.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Fill {
    pub candidate_id: String,
    pub name: String,
    /// Optional — legacy models still emit it. agent.ts:321 rationale.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reason: Option<String>,
}
/// Terminal result of one loop invocation.
pub enum RespondOutcome {
    /// Consensus reached — `artifact` carries fills/approach/turns.
    Ok { artifact: serde_json::Value, log: Vec<LogEntry> },
    /// Loop aborted: parse failure, drift/tool-error cap, or turn budget
    /// exhausted without consensus.
    Failed { reason: String, log: Vec<LogEntry> },
    // Constructed by the truth-gate check in run_inner (step 6, 2026-04-24).
    Blocked { reason: String, log: Vec<LogEntry> },
}
impl RespondOutcome {
    /// The success artifact, or `Null` for Failed/Blocked outcomes.
    pub fn artifact(&self) -> serde_json::Value {
        if let Self::Ok { artifact, .. } = self {
            artifact.clone()
        } else {
            serde_json::Value::Null
        }
    }
    /// Consume the outcome, yielding the transcript for every variant.
    pub fn into_log(self) -> Vec<LogEntry> {
        match self {
            Self::Ok { log, .. } => log,
            Self::Failed { log, .. } => log,
            Self::Blocked { log, .. } => log,
        }
    }
}
/// Drives one executor/reviewer loop for a single `RespondRequest`.
pub struct ExecutionLoop {
    state: V1State,
    req: RespondRequest,
    /// Turn-by-turn transcript; moved into the terminal outcome.
    log: Vec<LogEntry>,
    /// Last turn index reached (0 until the loop starts).
    turns_used: u32,
    /// Per-invocation usage accumulator (see `LoopStats`).
    stats: LoopStats,
    /// Phase 20 budget — at most one T3 overseer call per loop
    /// invocation. Cloud calls cost real money and the whole point is
    /// "hyperfocus local + one strategic cloud nudge", not a cloud
    /// retry loop. See docs/CONTROL_PLANE_PRD.md §4.3.
    overseer_called: bool,
}
/// Per-invocation usage accumulator. Separate from the gateway-wide
/// `V1State.usage` (which is lifetime-across-all-requests) so the
/// outcomes row can stamp this-task tokens/latency without subtracting
/// two snapshots.
#[derive(Default, Clone, Serialize)]
pub struct LoopStats {
    /// Model calls made (continuation retries count individually).
    pub requests: u64,
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
    pub total_tokens: u64,
    /// Wall-clock time spent inside model calls, summed.
    pub latency_ms: u64,
}
impl ExecutionLoop {
    /// Fresh loop for one request; no work happens until `run`.
    pub fn new(state: V1State, req: RespondRequest) -> Self {
        Self {
            state, req,
            log: Vec::new(),
            turns_used: 0,
            stats: LoopStats::default(),
            overseer_called: false,
        }
    }
    /// Last turn index the loop reached (0 before the first turn).
    pub fn turns_used(&self) -> u32 {
        self.turns_used
    }
pub async fn run(&mut self) -> Result<RespondOutcome, String> {
let outcome = self.run_inner().await?;
Ok(self.finalize(outcome).await)
}
    /// Core turn loop: truth gate → playbook boost → (executor turn →
    /// optional tool dispatch → reviewer turn → consensus/drift check)
    /// repeated up to `max_turns`. Returns `Err` only for transport-level
    /// failures; every loop-semantic failure is a `RespondOutcome` variant.
    async fn run_inner(&mut self) -> Result<RespondOutcome, String> {
        // Resolve models and turn budget from the request, falling back
        // to the module defaults.
        let executor_model = self.req.executor_model
            .as_deref().unwrap_or(DEFAULT_EXECUTOR_MODEL).to_string();
        let reviewer_model = self.req.reviewer_model
            .as_deref().unwrap_or(DEFAULT_REVIEWER_MODEL).to_string();
        let max_turns = self.req.max_turns.unwrap_or(DEFAULT_MAX_TURNS);
        // --- (6) TRUTH GATE — Phase 42 wiring (2026-04-24) ---
        // Evaluate truth rules for the request's task_class against a
        // ctx built from the spec. Any rule whose condition holds AND
        // whose action is Reject/Block short-circuits to Blocked before
        // the executor loop runs. Mirrors queryd/service.rs SQL gate.
        let truth_store = truth::default_truth_store();
        for outcome in truth_store.evaluate(&self.req.task_class, &self.req.spec) {
            if !outcome.passed { continue; }
            if let truth::RuleAction::Reject { message } | truth::RuleAction::Block { message } = &outcome.action {
                let reason = format!("truth rule {} blocked: {message}", outcome.rule_id);
                self.append(LogEntry::new(0, "system", "truth", "block",
                    serde_json::json!({ "rule_id": outcome.rule_id, "reason": reason.clone() })));
                return Ok(RespondOutcome::Blocked { reason, log: self.log.clone() });
            }
        }
        // --- (1) PLAYBOOK BOOST ---
        // Best-effort: a failed fetch degrades to an empty boost list.
        let boost = self.fetch_playbook_boost(&self.req.operation).await.unwrap_or_default();
        if !boost.is_empty() {
            self.append(LogEntry::new(
                0, "system", "playbook_memory", "boost_loaded",
                serde_json::json!({ "count": boost.len(), "preview": boost.iter().take(3).collect::<Vec<_>>() }),
            ));
        }
        let mut consecutive_drifts: u32 = 0;
        // --- MAIN TURN LOOP ---
        for turn in 1..=max_turns {
            self.turns_used = turn;
            // --- (2) EXECUTOR TURN ---
            let executor_prompt = build_executor_prompt(&self.req, &boost, &self.log);
            let executor_raw = self.chat_once(&executor_model, &executor_prompt, 0.2, false).await?;
            let exec_action = match parse_action(&executor_raw, Role::Executor) {
                Ok(a) => a,
                Err(e) => {
                    self.append(LogEntry::new(
                        turn, "executor", &executor_model, "error",
                        serde_json::json!({ "message": e, "raw": truncate(&executor_raw, 400) }),
                    ));
                    return Ok(RespondOutcome::Failed {
                        reason: format!("executor parse failure on turn {turn}: {e}"),
                        log: std::mem::take(&mut self.log),
                    });
                }
            };
            self.append(LogEntry::new(
                turn, "executor", &executor_model, action_kind(&exec_action), action_content(&exec_action),
            ));
            // --- (4) TOOL DISPATCH — PORT FROM orchestrator.ts:101-124 ---
            // Soft-fail: a tool error is a log entry, not a loop abort.
            // The executor reads its own error next turn and self-corrects
            // (orchestrator.ts:169-189). Only MAX_CONSECUTIVE_DRIFTS tool
            // errors in a row → hard abort.
            if let Action::ToolCall { tool, args, .. } = &exec_action {
                match self.dispatch_tool(tool, args).await {
                    Ok(result) => {
                        let trimmed = trim_result(&result);
                        self.append(LogEntry::new(
                            turn, "executor", &executor_model, "tool_result", trimmed,
                        ));
                    }
                    Err(e) => {
                        self.append(LogEntry::new(
                            turn, "executor", &executor_model, "tool_result",
                            serde_json::json!({ "error": e, "tool": tool, "args": args }),
                        ));
                        consecutive_drifts += 1;
                        if consecutive_drifts >= MAX_CONSECUTIVE_DRIFTS {
                            return Ok(RespondOutcome::Failed {
                                reason: format!(
                                    "aborting — {MAX_CONSECUTIVE_DRIFTS} consecutive tool errors, executor can't self-correct"
                                ),
                                log: std::mem::take(&mut self.log),
                            });
                        }
                    }
                }
            }
            // --- (3) REVIEWER TURN ---
            let reviewer_prompt = build_reviewer_prompt(&self.req, &self.log);
            let reviewer_raw = self.chat_once(&reviewer_model, &reviewer_prompt, 0.1, false).await?;
            let rev_action = match parse_action(&reviewer_raw, Role::Reviewer) {
                Ok(a) => a,
                Err(e) => {
                    self.append(LogEntry::new(
                        turn, "reviewer", &reviewer_model, "error",
                        serde_json::json!({ "message": e, "raw": truncate(&reviewer_raw, 400) }),
                    ));
                    return Ok(RespondOutcome::Failed {
                        reason: format!("reviewer parse failure on turn {turn}: {e}"),
                        log: std::mem::take(&mut self.log),
                    });
                }
            };
            self.append(LogEntry::new(
                turn, "reviewer", &reviewer_model, "critique", action_content(&rev_action),
            ));
            // Reviewer MUST emit a Critique — anything else is malformed.
            let verdict = match &rev_action {
                Action::Critique { verdict, .. } => verdict.clone(),
                _ => {
                    return Ok(RespondOutcome::Failed {
                        reason: format!("reviewer emitted non-critique on turn {turn}"),
                        log: std::mem::take(&mut self.log),
                    });
                }
            };
            // --- (5) CONSENSUS DETECTION + DRIFT COUNTER ---
            if verdict == Verdict::Drift {
                consecutive_drifts += 1;
                // --- (8) OVERSEER ESCALATION ---
                // One chance before abort: when the local loop is
                // about to give up, call the T3 overseer with the KB
                // context (what worked / didn't on this task class
                // historically) + the recent log tail. The overseer
                // emits a correction which feeds back into the next
                // executor turn. Only fires once per loop to honor
                // Phase 20 "1-3 calls/scenario" budget.
                if consecutive_drifts == MAX_CONSECUTIVE_DRIFTS.saturating_sub(1)
                    && !self.overseer_called
                {
                    if let Err(e) = self.escalate_to_overseer(turn, "drift_approaching_abort").await {
                        tracing::warn!("overseer escalation failed: {e}");
                    }
                    // Reset so the executor gets one clean turn with
                    // the correction in context before we re-evaluate.
                    consecutive_drifts = 0;
                } else if consecutive_drifts >= MAX_CONSECUTIVE_DRIFTS {
                    return Ok(RespondOutcome::Failed {
                        reason: format!(
                            "aborting — {MAX_CONSECUTIVE_DRIFTS} consecutive drift flags, executor can't self-correct (overseer_called={})",
                            self.overseer_called,
                        ),
                        log: std::mem::take(&mut self.log),
                    });
                }
            } else {
                consecutive_drifts = 0;
            }
            // Consensus requires BOTH a same-turn ProposeDone from the
            // executor AND an ApproveDone verdict from the reviewer.
            if let (Action::ProposeDone { fills, rationale }, Verdict::ApproveDone)
                = (&exec_action, &verdict)
            {
                let target_count = spec_target_count(&self.req.spec);
                if target_count > 0 && fills.len() as u64 != target_count {
                    return Ok(RespondOutcome::Failed {
                        reason: format!(
                            "consensus malformed — {} fills vs target {}",
                            fills.len(), target_count
                        ),
                        log: std::mem::take(&mut self.log),
                    });
                }
                self.append(LogEntry::new(
                    turn, "reviewer", &reviewer_model, "consensus_done",
                    serde_json::json!({ "fills": fills }),
                ));
                // Seal + write-through runs in `finalize` after this
                // returns — outcomes row + playbook_memory seed with
                // retries + stats stamping all land there.
                let artifact = serde_json::json!({
                    "fills": fills,
                    "approach": rationale,
                    "turns": turn,
                });
                return Ok(RespondOutcome::Ok {
                    artifact,
                    log: std::mem::take(&mut self.log),
                });
            }
        }
        Ok(RespondOutcome::Failed {
            reason: format!("no consensus after {max_turns} turns — task incomplete"),
            log: std::mem::take(&mut self.log),
        })
    }
fn append(&mut self, e: LogEntry) {
tracing::debug!(turn = e.turn, role = %e.role, kind = %e.kind, "execution_loop");
self.log.push(e);
}
/// Dispatch: model name prefix → provider.
/// Local path uses Phase 21 `generate_continuable` (auto-continuation,
/// retry on empty thinking-model response). Cloud path hits
/// Ollama Cloud directly — no continuation since cloud budgets are
/// generous and Phase 21's Rust port is local-only. Truncation on
/// cloud surfaces as a parse failure in the loop; that's fail-fast
/// and a real signal (we want to know when cloud didn't finish).
async fn chat_once(
&mut self,
model: &str,
prompt: &str,
temperature: f64,
think: bool,
) -> Result<String, String> {
let is_cloud = is_cloud_model(model);
let provider = if is_cloud { "ollama_cloud" } else { "ollama" };
let start_time = chrono::Utc::now();
let started = std::time::Instant::now();
let (text, prompt_tokens, completion_tokens, calls) = if is_cloud {
let key = self.state.ollama_cloud_key.as_deref().ok_or_else(|| {
format!("cloud model {model} requested but OLLAMA_CLOUD_KEY not configured")
})?;
use crate::v1::{ChatRequest, Message};
// Cloud path: retry up to 3× on empty response. gpt-oss:*
// models sometimes return empty after internal reasoning
// — this is the cloud-side analog of Phase 21's empty-
// response backoff, inlined since generate_continuable is
// local-only.
let mut text = String::new();
let mut tokens_p = 0u32;
let mut tokens_c = 0u32;
let mut attempts = 0u32;
for attempt in 0..3 {
attempts = attempt + 1;
let req = ChatRequest {
model: model.to_string(),
messages: vec![Message { role: "user".into(), content: prompt.to_string() }],
temperature: Some(temperature),
max_tokens: None,
stream: Some(false),
think: Some(think),
provider: Some("ollama_cloud".into()),
};
let resp = crate::v1::ollama_cloud::chat(key, &req).await
.map_err(|e| format!("ollama_cloud: {e}"))?;
tokens_p = tokens_p.saturating_add(resp.usage.prompt_tokens);
tokens_c = tokens_c.saturating_add(resp.usage.completion_tokens);
let t = resp.choices.into_iter().next()
.map(|c| c.message.content).unwrap_or_default();
if !t.trim().is_empty() {
text = t;
break;
}
tracing::warn!(model = %model, attempt, "cloud returned empty, retrying");
}
(text, tokens_p, tokens_c, attempts)
} else {
use aibridge::continuation::{generate_continuable, ContinuableOpts, ResponseShape};
let mut opts = ContinuableOpts::new(model);
opts.temperature = Some(temperature);
opts.think = Some(think);
opts.shape = ResponseShape::Json;
let outcome = generate_continuable(&self.state.ai_client, prompt, &opts).await?;
if outcome.empty_retries > 0 || outcome.continuations > 0 || !outcome.final_complete {
tracing::info!(
model = %model,
empty_retries = outcome.empty_retries,
continuations = outcome.continuations,
final_complete = outcome.final_complete,
calls = outcome.calls,
"execution_loop.chat_once: continuation telemetry"
);
}
(outcome.text, outcome.prompt_tokens, outcome.completion_tokens, outcome.calls)
};
let elapsed_ms = started.elapsed().as_millis() as u64;
let end_time = chrono::Utc::now();
// Langfuse trace — uniform across local + cloud, provider tag
// lets the bridge / observer differentiate downstream.
if let Some(lf) = &self.state.langfuse {
use crate::v1::{langfuse_trace::ChatTrace, Message};
lf.emit_chat(ChatTrace {
provider: provider.to_string(),
model: model.to_string(),
input: vec![Message { role: "user".into(), content: prompt.to_string() }],
output: text.clone(),
prompt_tokens,
completion_tokens,
temperature: Some(temperature),
max_tokens: None,
think: Some(think),
start_time: start_time.to_rfc3339(),
end_time: end_time.to_rfc3339(),
latency_ms: elapsed_ms,
});
}
// Per-task stats (stamps the outcomes row) + gateway-wide
// /v1/usage counters. Both updated uniformly; the by_provider
// split lets operators see the local/cloud mix per task.
let total_tokens = (prompt_tokens + completion_tokens) as u64;
self.stats.requests = self.stats.requests.saturating_add(calls as u64);
self.stats.prompt_tokens = self.stats.prompt_tokens.saturating_add(prompt_tokens as u64);
self.stats.completion_tokens = self.stats.completion_tokens.saturating_add(completion_tokens as u64);
self.stats.total_tokens = self.stats.total_tokens.saturating_add(total_tokens);
self.stats.latency_ms += elapsed_ms;
{
let mut u = self.state.usage.write().await;
u.requests = u.requests.saturating_add(calls as u64);
u.prompt_tokens = u.prompt_tokens.saturating_add(prompt_tokens as u64);
u.completion_tokens = u.completion_tokens.saturating_add(completion_tokens as u64);
u.total_tokens = u.total_tokens.saturating_add(total_tokens);
let pu = u.by_provider.entry(provider.to_string()).or_default();
pu.requests = pu.requests.saturating_add(calls as u64);
pu.prompt_tokens = pu.prompt_tokens.saturating_add(prompt_tokens as u64);
pu.completion_tokens = pu.completion_tokens.saturating_add(completion_tokens as u64);
pu.total_tokens = pu.total_tokens.saturating_add(total_tokens);
}
Ok(text)
}
    /// Final step for every terminal path — write the outcomes row (with
    /// the full indicator set stamped) and, on success, seed the playbook
    /// back into memory so the next similar task hits the fast path.
    /// The write-through is what closes the 0→85% compounding loop.
    ///
    /// Both writes are best-effort: KB-write failure emits a warn but
    /// doesn't convert an Ok into a Failed. The caller's response should
    /// reflect what the loop actually accomplished, not whether the log
    /// sink was reachable.
    async fn finalize(&mut self, mut outcome: RespondOutcome) -> RespondOutcome {
        // PORT FROM orchestrator.ts:251-293. On consensus, write-through
        // to playbook_memory so the next semantically-similar query
        // surfaces the endorsed names.
        // Seed errors are captured as a JSON payload (not dropped) so
        // the outcomes row records the failure.
        let seed_outcome = if let RespondOutcome::Ok { artifact, .. } = &outcome {
            match self.seed_playbook_memory(artifact).await {
                Ok(v) => Some(v),
                Err(e) => {
                    tracing::warn!("playbook_memory seed failed: {e}");
                    Some(serde_json::json!({ "error": e }))
                }
            }
        } else {
            None
        };
        // Append the outcomes row — polarity derived from the variant,
        // indicators stamped from loop state. schema_version=2 flags
        // this as a per-task row (distinct from the scenario-level rows
        // already in outcomes.jsonl).
        let outcomes_row = build_outcomes_row(
            &self.req, &self.stats, self.turns_used,
            self.overseer_called,
            &outcome, seed_outcome.clone(),
        );
        if let Err(e) = append_outcomes_row(&outcomes_row).await {
            tracing::warn!("outcomes.jsonl append failed: {e}");
        }
        // Enrich the response artifact with the seed + usage info so
        // the API caller can see compounding state without a second call.
        if let RespondOutcome::Ok { artifact, .. } = &mut outcome {
            if let Some(obj) = artifact.as_object_mut() {
                if let Some(seed) = seed_outcome {
                    obj.insert("playbook_seed".into(), seed);
                }
                obj.insert("usage".into(), serde_json::to_value(&self.stats).unwrap_or_default());
                obj.insert("sig_hash".into(), serde_json::Value::String(sig_hash(&self.req)));
            }
        }
        outcome
    }
/// PORT FROM orchestrator.ts:255-293. Three retries with geometric
/// backoff. `append: true` routes through Phase 26 upsert semantics
/// (ADD/UPDATE/NOOP on operation+day+city+state), so a re-seal of
/// the same fill on the same day merges names instead of duplicating.
async fn seed_playbook_memory(
&self,
artifact: &serde_json::Value,
) -> Result<serde_json::Value, String> {
let fills = artifact.get("fills").and_then(|v| v.as_array())
.ok_or_else(|| "artifact missing fills".to_string())?;
let endorsed_names: Vec<String> = fills.iter()
.filter_map(|f| f.get("name").and_then(|v| v.as_str()).map(String::from))
.collect();
if endorsed_names.is_empty() {
return Err("no endorsed_names to seed".into());
}
// Seed context is what the embedding model sees — carry
// task-semantic content (role, city, scenario) not orchestrator
// bookkeeping. Falls back to approach_hint, then to a built
// string from spec. Matches orchestrator.ts:262-263.
let approach = artifact.get("approach").and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
.unwrap_or("multi-agent → hybrid search")
.to_string();
let context = seed_context(&self.req);
let body = serde_json::json!({
"operation": self.req.operation,
"approach": approach,
"context": context,
"endorsed_names": endorsed_names,
"append": true,
});
let client = reqwest::Client::new();
let mut last_err = String::new();
for attempt in 0..3u32 {
match client.post("http://127.0.0.1:3100/vectors/playbook_memory/seed")
.json(&body).send().await
{
Ok(resp) => {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
if status.is_success() {
let j: serde_json::Value = serde_json::from_str(&text)
.unwrap_or(serde_json::json!({ "raw": text }));
return Ok(j);
}
last_err = format!("{}: {}", status, truncate(&text, 200));
}
Err(e) => last_err = format!("transport: {e}"),
}
// Geometric backoff: 1s, 2s, 3s (matches orchestrator.ts:281).
tokio::time::sleep(std::time::Duration::from_secs(attempt as u64 + 1)).await;
}
Err(format!("after 3 attempts: {last_err}"))
}
    /// Phase 20 step (8) — T3 overseer escalation.
    ///
    /// When the local executor/reviewer loop can't self-correct, call
    /// the cloud overseer (`gpt-oss:120b` via Ollama Cloud) with (a)
    /// the KB context — recent outcomes + prior corrections for this
    /// sig_hash + task_class, across every profile that has run it —
    /// and (b) the recent log tail. Its output is appended as a
    /// `system` role turn so the next executor generation sees it,
    /// AND written to `data/_kb/overseer_corrections.jsonl` so every
    /// future profile activation reads from the same learning pool.
    ///
    /// This is the "pipe to the overviewer" piece from 2026-04-23 —
    /// the overseer is now a first-class KB consumer AND producer, not
    /// a one-shot correction oracle.
    ///
    /// Errors when the cloud key is missing or the cloud call fails;
    /// a KB-append failure is warn-only (best-effort write-through).
    async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> {
        let Some(cloud_key) = self.state.ollama_cloud_key.clone() else {
            return Err("OLLAMA_CLOUD_KEY not configured — skipping escalation".into());
        };
        let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await;
        let prompt = build_overseer_prompt(&self.req, &kb, &self.log, reason);
        let started = std::time::Instant::now();
        let start_time = chrono::Utc::now();
        let chat_req = crate::v1::ChatRequest {
            model: "gpt-oss:120b".to_string(),
            messages: vec![crate::v1::Message {
                role: "user".into(),
                content: prompt.clone(),
            }],
            temperature: Some(0.1),
            max_tokens: None,
            stream: Some(false),
            think: Some(true), // overseer KEEPS thinking (Phase 20 rule)
            provider: Some("ollama_cloud".into()),
        };
        let resp = crate::v1::ollama_cloud::chat(&cloud_key, &chat_req).await
            .map_err(|e| format!("ollama_cloud: {e}"))?;
        let latency_ms = started.elapsed().as_millis() as u64;
        let end_time = chrono::Utc::now();
        let correction_text = resp.choices.into_iter().next()
            .map(|c| c.message.content).unwrap_or_default();
        // Stamp per-task stats — cloud call counts against the same
        // usage counter so `/v1/usage` shows cloud token spend too.
        self.stats.requests = self.stats.requests.saturating_add(1);
        self.stats.prompt_tokens = self.stats.prompt_tokens.saturating_add(resp.usage.prompt_tokens as u64);
        self.stats.completion_tokens = self.stats.completion_tokens.saturating_add(resp.usage.completion_tokens as u64);
        self.stats.total_tokens = self.stats.total_tokens.saturating_add(resp.usage.total_tokens as u64);
        self.stats.latency_ms = self.stats.latency_ms.saturating_add(latency_ms);
        // Langfuse trace for the overseer call (same pipe that feeds
        // the observer/KB, so this correction's cost lands in the KB
        // too — closing the loop).
        if let Some(lf) = &self.state.langfuse {
            use crate::v1::langfuse_trace::ChatTrace;
            lf.emit_chat(ChatTrace {
                provider: "ollama_cloud".into(),
                model: "gpt-oss:120b".into(),
                input: vec![crate::v1::Message { role: "user".into(), content: prompt.clone() }],
                output: correction_text.clone(),
                prompt_tokens: resp.usage.prompt_tokens,
                completion_tokens: resp.usage.completion_tokens,
                temperature: Some(0.1),
                max_tokens: None,
                think: Some(true),
                start_time: start_time.to_rfc3339(),
                end_time: end_time.to_rfc3339(),
                latency_ms,
            });
        }
        // Append to the transcript so the next executor turn sees it.
        self.append(LogEntry::new(
            turn, "system", "gpt-oss:120b", "overseer_correction",
            serde_json::json!({
                "reason": reason,
                "correction": correction_text,
                "kb_context_summary": {
                    "total_observed": kb.total_observed,
                    "success_rate": kb.success_rate,
                    "prior_corrections": kb.recent_corrections.len(),
                },
            }),
        ));
        // Write to the KB — read by KbContext::load_for on every
        // subsequent escalation, AND by any profile that iterates on
        // this task class later.
        let row = serde_json::json!({
            "schema_version": 2,
            "source_service": "v1.respond.overseer",
            "sig_hash": sig_hash(&self.req),
            "task_class": self.req.task_class,
            "operation": self.req.operation,
            "reason": reason,
            "model": "gpt-oss:120b",
            "correction": correction_text,
            "applied_at_turn": turn,
            "kb_context_used": kb,
            "usage": {
                "prompt_tokens": resp.usage.prompt_tokens,
                "completion_tokens": resp.usage.completion_tokens,
                "total_tokens": resp.usage.total_tokens,
                "latency_ms": latency_ms,
            },
            "created_at": chrono::Utc::now().to_rfc3339(),
        });
        if let Err(e) = append_corrections_row(&row).await {
            tracing::warn!("overseer_corrections.jsonl append failed: {e}");
        }
        // Flag AFTER the full pipeline so a hard failure above leaves
        // the budget unspent (the `?` paths return before this line).
        self.overseer_called = true;
        Ok(())
    }
async fn fetch_playbook_boost(&self, operation: &str) -> Result<Vec<serde_json::Value>, ()> {
let body = serde_json::json!({ "operation": operation, "top_k": 5 });
let client = reqwest::Client::new();
let resp = client
.post("http://127.0.0.1:3100/vectors/playbook_memory/search")
.json(&body)
.send().await.map_err(|_| ())?;
if !resp.status().is_success() {
return Ok(Vec::new());
}
let j: serde_json::Value = resp.json().await.map_err(|_| ())?;
Ok(j.get("boosts").and_then(|v| v.as_array()).cloned().unwrap_or_default())
}
/// PORT FROM orchestrator.ts:101-124 + agent.ts:348-364.
/// Three tool surfaces unified behind one dispatcher:
/// - `hybrid_search` → `POST /vectors/hybrid` (pseudo-tool, not in
/// the Phase 12 registry — lives in vectord)
/// - `sql` → `POST /query/sql` with a SELECT-only guard
/// - anything else → `POST /tools/{name}/call` via the Phase 12
/// registry (permissions, audit, validation all happen there)
///
/// Loopback HTTP on 127.0.0.1:3100 on purpose: mirrors the TS
/// behavior exactly (every call goes through the same middleware,
/// auth, audit, CORS path), and lets us swap to in-process routing
/// later without changing the dispatch contract.
async fn dispatch_tool(
&self,
tool: &str,
args: &serde_json::Value,
) -> Result<serde_json::Value, String> {
let client = reqwest::Client::new();
match tool {
"hybrid_search" => {
let sql_filter = args.get("sql_filter").and_then(|v| v.as_str())
.ok_or_else(|| "hybrid_search needs sql_filter (string)".to_string())?;
let question = args.get("question").and_then(|v| v.as_str())
.ok_or_else(|| "hybrid_search needs question (string)".to_string())?;
let index_name = args.get("index_name").and_then(|v| v.as_str())
.ok_or_else(|| "hybrid_search needs index_name (string)".to_string())?;
// Accept either `top_k` or `k` from the model — same
// tolerance as orchestrator.ts. Default 10.
let top_k = args.get("top_k").or_else(|| args.get("k"))
.and_then(|v| v.as_u64()).unwrap_or(10);
let body = serde_json::json!({
"sql_filter": sql_filter,
"question": question,
"index_name": index_name,
"top_k": top_k,
"generate": false,
});
let resp = client.post("http://127.0.0.1:3100/vectors/hybrid")
.json(&body).send().await
.map_err(|e| format!("hybrid_search transport: {e}"))?;
parse_tool_response(resp).await
}
"sql" => {
let query = args.get("query").and_then(|v| v.as_str())
.ok_or_else(|| "sql needs query (string)".to_string())?;
// SELECT-only guard mirroring orchestrator.ts:119. The
// tool is read-only; any mutation needs the Phase 12
// registry + its permission + audit flow, not the
// unchecked raw sql surface.
if !query.trim_start().to_ascii_uppercase().starts_with("SELECT") {
return Err(format!("sql tool allows SELECT only: {}", truncate(query, 120)));
}
let body = serde_json::json!({ "sql": query, "format": "json" });
let resp = client.post("http://127.0.0.1:3100/query/sql")
.json(&body).send().await
.map_err(|e| format!("sql transport: {e}"))?;
parse_tool_response(resp).await
}
other => {
// Phase 12 registry — any registered staffing tool lands here.
// Body shape matches agent.ts::callTool (POST /tools/{name}/call
// with {params, agent}).
let url = format!("http://127.0.0.1:3100/tools/{}/call", other);
let body = serde_json::json!({
"params": args,
"agent": "v1.respond",
});
let resp = client.post(&url).json(&body).send().await
.map_err(|e| format!("{other} transport: {e}"))?;
parse_tool_response(resp).await
}
}
}
}
/// Read a tool response body into JSON, or surface the status + text
/// as an error. Keeps the `error` path structurally identical whether
/// the transport fails (caller handles), the server 5xx's (here), or
/// the tool returns a 200 with an `{"error":"..."}` payload (caller
/// surfaces to the executor as normal tool_result content).
async fn parse_tool_response(resp: reqwest::Response) -> Result<serde_json::Value, String> {
    let status = resp.status();
    let text = resp.text().await.map_err(|e| format!("body read: {e}"))?;
    if status.is_success() {
        serde_json::from_str(&text)
            .map_err(|e| format!("non-JSON response: {e} | body: {}", truncate(&text, 200)))
    } else {
        Err(format!("{}: {}", status, truncate(&text, 300)))
    }
}
/// Build the embedding-facing context string for a playbook seed:
/// explicit approach_hint wins, then a role/city/state sentence, then
/// the raw operation as a last resort.
fn seed_context(req: &RespondRequest) -> String {
    let hint = spec_field_str(&req.spec, "approach_hint");
    if !hint.is_empty() {
        return hint.to_string();
    }
    let role = spec_field_str(&req.spec, "target_role");
    let city = spec_field_str(&req.spec, "target_city");
    let state = spec_field_str(&req.spec, "target_state");
    match (role.is_empty(), city.is_empty()) {
        // Geo-rich form used whenever role AND city are present.
        (false, false) => format!("{role} fill in {city}, {state}"),
        // Non-staffing task class — use the operation verbatim. The
        // embedding surface still works; it just has less geo signal.
        _ => req.operation.clone(),
    }
}
/// Stable rollup key. PORT FROM the sig_hash usage in observer/kb.
/// DefaultHasher isn't cryptographic but is stable for a single
/// deployment and matches the 16-char hex format already in
/// outcomes.jsonl. Swap to sha256 if cross-deployment stability is
/// needed.
fn sig_hash(req: &RespondRequest) -> String {
    use std::hash::{Hash, Hasher};
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    // Order matters: task_class, operation, then the three spec fields.
    req.task_class.hash(&mut hasher);
    req.operation.hash(&mut hasher);
    for key in ["target_role", "target_city", "target_state"] {
        spec_field_str(&req.spec, key).hash(&mut hasher);
    }
    format!("{:016x}", hasher.finish())
}
/// Build the per-task outcomes row with every indicator the
/// 2026-04-23 audit called out. schema_version=2 distinguishes
/// per-task rows from the scenario-level rows already in the file.
fn build_outcomes_row(
    req: &RespondRequest,
    stats: &LoopStats,
    turns_used: u32,
    overseer_called: bool,
    outcome: &RespondOutcome,
    seed_outcome: Option<serde_json::Value>,
) -> serde_json::Value {
    // Polarity is a straight function of the outcome variant.
    let (ok, polarity, error) = match outcome {
        RespondOutcome::Ok { .. } => (true, "success_confirmation", serde_json::Value::Null),
        RespondOutcome::Failed { reason, .. } => (false, "failure_pattern", serde_json::Value::String(reason.clone())),
        RespondOutcome::Blocked { reason, .. } => (false, "truth_block", serde_json::Value::String(reason.clone())),
    };
    // Fills only exist on consensus; Failed/Blocked rows carry null.
    let fills = match outcome {
        RespondOutcome::Ok { artifact, .. } => artifact.get("fills").cloned().unwrap_or(serde_json::Value::Null),
        _ => serde_json::Value::Null,
    };
    // Correction effectiveness: if the overseer was called this loop,
    // the outcome tells us whether the correction helped. OK = it
    // worked, Failed/Blocked = it didn't. When overseer wasn't called,
    // these fields stay null so aggregators can filter cleanly.
    let correction_applied = overseer_called;
    let correction_effective = if overseer_called {
        serde_json::Value::Bool(ok)
    } else {
        serde_json::Value::Null
    };
    serde_json::json!({
        "schema_version": 2,
        "source_service": "v1.respond",
        "sig_hash": sig_hash(req),
        "task_class": req.task_class,
        "operation": req.operation,
        "ok": ok,
        "polarity": polarity,
        "iterations": turns_used,
        "turns": turns_used,
        "fills": fills,
        "models": {
            "executor": req.executor_model.clone().unwrap_or_else(|| DEFAULT_EXECUTOR_MODEL.to_string()),
            "reviewer": req.reviewer_model.clone().unwrap_or_else(|| DEFAULT_REVIEWER_MODEL.to_string()),
        },
        "usage": stats,
        "provider": "ollama",
        "playbook_seed": seed_outcome,
        "truth_rule_citations": [], // Phase 42 gate hook — empty until wired
        "validator_report": null,   // Phase 43 hook
        "correction_applied": correction_applied,
        "correction_effective": correction_effective,
        "error": error,
        "created_at": chrono::Utc::now().to_rfc3339(),
    })
}
/// PORT FROM Phase 20's T3 overseer prompt shape. The overseer sees:
/// - Task + spec
/// - KB context (historical outcomes + prior corrections across
///   every profile that ran this task class)
/// - Recent log tail (last 12 turns)
/// - Specific reason the local loop escalated
/// It returns prose guidance the executor reads next turn. We do NOT
/// ask it to emit a JSON action — the executor still owns the final
/// shape. The overseer is a strategist, not a tool-caller.
fn build_overseer_prompt(
    req: &RespondRequest,
    kb: &KbContext,
    log: &[LogEntry],
    reason: &str,
) -> String {
    let mut p = String::new();
    p.push_str("You are the OVERSEER (T3 strategic tier). The local executor/reviewer loop has hit a wall and escalated to you for a strategic correction. You do not call tools; you read the record and tell the executor what to do differently on its next turn.\n\n");
    p.push_str(&format!("## Task\n{}\n", req.operation));
    p.push_str(&format!("Task class: {}\n", req.task_class));
    // Spec may legitimately be null for non-staffing task classes.
    if !req.spec.is_null() {
        p.push_str(&format!("Spec: {}\n", req.spec));
    }
    p.push_str(&format!("\n## Reason for escalation\n{}\n\n", reason));
    // KB section renders its own heading — don't add one here.
    p.push_str(&kb.to_prompt_section());
    p.push_str("\n## Recent log (last 12 turns, most recent last):\n");
    let start = log.len().saturating_sub(12);
    for e in &log[start..] {
        let content = e.content.to_string();
        // 200-char cap: the overseer needs the shape of the run, not
        // full tool payloads.
        p.push_str(&format!(
            "  [t{:02} {} {}] {}\n",
            e.turn, e.role, e.kind, truncate(&content, 200),
        ));
    }
    p.push_str("\n## Your output\n");
    p.push_str("Write 3-6 sentences of CONCRETE guidance the executor will read next turn. ");
    p.push_str("Reference what specifically went wrong, what to try instead, and what to AVOID ");
    p.push_str("(especially if it appears in the \"Recent overseer corrections\" above — don't repeat yourself). ");
    p.push_str("No JSON, no tool syntax — the executor will translate your guidance into action.\n");
    p
}
/// Append one overseer-correction row to the corrections JSONL
/// sidecar (same append semantics as `append_outcomes_row`).
async fn append_corrections_row(row: &serde_json::Value) -> Result<(), String> {
    let path = std::path::Path::new("data/_kb/overseer_corrections.jsonl");
    append_outcomes_row_at(path, row).await
}
/// Append one JSONL row to `data/_kb/outcomes.jsonl`, creating the
/// directory if missing. Same write shape as the TS pipeline; the
/// Phase 24 observer fix taught us `/ingest/file` has REPLACE
/// semantics, so this writes the JSONL directly — APPEND, not replace.
async fn append_outcomes_row(row: &serde_json::Value) -> Result<(), String> {
    let path = std::path::Path::new("data/_kb/outcomes.jsonl");
    append_outcomes_row_at(path, row).await
}
/// Path-taking variant — lets tests write to a tmp path without
/// mutating the process CWD (which isn't thread-safe under parallel
/// test execution).
///
/// Errors are stringified with a short stage prefix ("mkdir:",
/// "serialize:", "open:", "write:", "flush:") so callers can tell
/// which step failed without a structured error type.
async fn append_outcomes_row_at(
    path: &std::path::Path,
    row: &serde_json::Value,
) -> Result<(), String> {
    use tokio::io::AsyncWriteExt;
    if let Some(dir) = path.parent() {
        tokio::fs::create_dir_all(dir).await.map_err(|e| format!("mkdir: {e}"))?;
    }
    let mut line = serde_json::to_string(row).map_err(|e| format!("serialize: {e}"))?;
    line.push('\n');
    let mut f = tokio::fs::OpenOptions::new()
        .create(true).append(true).open(path).await
        .map_err(|e| format!("open: {e}"))?;
    f.write_all(line.as_bytes()).await.map_err(|e| format!("write: {e}"))?;
    // Explicit flush before drop. tokio::fs::File buffers writes and
    // hands them to a blocking threadpool; plain drop doesn't
    // guarantee the bytes have reached the OS by the time the next
    // open sees the file, which surfaced as a 3/8 flake on the
    // back-to-back-append test. (This flushes to the OS only — there
    // is no fsync here; add `sync_all()` if crash durability is ever
    // required.)
    f.flush().await.map_err(|e| format!("flush: {e}"))?;
    Ok(())
}
/// PORT FROM orchestrator.ts:306-311. Cap `rows` at 20 entries and
/// annotate the truncation so the executor sees it on the next turn
/// prompt — prevents a 1000-row hybrid_search result from wiping the
/// context budget on a single tool call.
fn trim_result(r: &serde_json::Value) -> serde_json::Value {
    const MAX_ROWS: usize = 20;
    // No rows array, or a small one → pass through untouched.
    let Some(rows) = r.get("rows").and_then(|v| v.as_array()) else {
        return r.clone();
    };
    if rows.len() <= MAX_ROWS {
        return r.clone();
    }
    let overflow = rows.len() - MAX_ROWS;
    let mut trimmed = r.clone();
    if let Some(obj) = trimmed.as_object_mut() {
        obj.insert(
            "rows".into(),
            serde_json::Value::Array(rows[..MAX_ROWS].to_vec()),
        );
        // Breadcrumb so the executor knows the result was cut.
        obj.insert(
            "_trimmed".into(),
            serde_json::Value::String(format!("{overflow} more rows")),
        );
    }
    trimmed
}
// --- Parsing + prompt builders (PORT FROM agent.ts:566-698) ---
/// Which agent produced the raw model output being parsed. The two
/// roles emit different action shapes (executor: plan/tool_call/
/// propose_done; reviewer: critique), so `parse_action` branches on
/// this and error messages name the offending role.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum Role { Executor, Reviewer }
/// PORT FROM agent.ts:650. Strip ```json fences, find the first {...}
/// block, soft-fix the two common model mistakes: stray `)}`, trailing
/// commas. Reviewer shape tolerance: bare `kind: "approve_done"` gets
/// normalized to `{kind: "critique", verdict: "approve_done"}` — some
/// models (qwen2.5) emit that way and the semantic content is identical.
pub fn parse_action(raw: &str, role: Role) -> Result<Action, String> {
    let mut s = raw.trim().to_string();
    // Peel a leading markdown fence (```json or bare ```), then a
    // trailing fence, before hunting for the JSON object.
    if let Some(stripped) = s.strip_prefix("```json") {
        s = stripped.trim_start_matches('\n').to_string();
    } else if let Some(stripped) = s.strip_prefix("```") {
        s = stripped.trim_start_matches('\n').to_string();
    }
    if let Some(stripped) = s.strip_suffix("```") {
        s = stripped.trim_end().to_string();
    }
    // Widest-span extraction: first '{' to last '}'. Anything outside
    // that span is model chatter and is dropped.
    let start = s.find('{').ok_or_else(|| format!("no JSON object in {role:?} response: {}", truncate(raw, 300)))?;
    let end = s.rfind('}').ok_or_else(|| format!("no closing brace in {role:?} response: {}", truncate(raw, 300)))?;
    if end <= start {
        return Err(format!("no JSON object in {role:?} response: {}", truncate(raw, 300)));
    }
    // Soft-fix: stray ")}" (qwen2.5 tool_call quirk) + trailing commas.
    // NOTE: the ")}"→"}" replace is global, so a legitimate ")}" inside
    // a string value would also be rewritten — accepted trade-off from
    // the TS port.
    let mut json = s[start..=end].to_string();
    json = json.replace(")}", "}");
    json = fix_trailing_commas(&json);
    let obj: serde_json::Value = serde_json::from_str(&json)
        .map_err(|e| format!("invalid JSON from {role:?}: {e} | raw: {}", truncate(&json, 300)))?;
    let kind = obj.get("kind").and_then(|v| v.as_str()).unwrap_or("").to_string();
    match role {
        Role::Executor => match kind.as_str() {
            "plan" | "tool_call" | "propose_done" => {
                serde_json::from_value(obj).map_err(|e| format!("executor shape mismatch: {e}"))
            }
            _ => Err(format!("executor returned unexpected shape: {}", truncate(&obj.to_string(), 200))),
        },
        Role::Reviewer => {
            // Accept the wrapped shape: {kind:"critique", verdict:"continue"|...}
            if kind == "critique" {
                return serde_json::from_value(obj)
                    .map_err(|e| format!("reviewer shape mismatch: {e}"));
            }
            // Accept the bare-verdict shape: {kind:"approve_done", notes:"..."}
            // and normalize it to a Critique action.
            if matches!(kind.as_str(), "continue" | "drift" | "approve_done") {
                let verdict = match kind.as_str() {
                    "continue" => Verdict::Continue,
                    "drift" => Verdict::Drift,
                    "approve_done" => Verdict::ApproveDone,
                    _ => unreachable!(),
                };
                let notes = obj.get("notes").and_then(|v| v.as_str()).unwrap_or("").to_string();
                return Ok(Action::Critique { verdict, notes });
            }
            Err(format!("reviewer returned unexpected shape: {}", truncate(&obj.to_string(), 200)))
        }
    }
}
/// Remove `,` immediately followed by `}` or `]` (with optional whitespace).
/// Same intent as the TS regex `,(\s*[}\]])`.
///
/// Operates on raw bytes and reassembles with `String::from_utf8`: the
/// previous implementation pushed `bytes[i] as char`, which reinterprets
/// each byte as Latin-1 and mangles any multi-byte UTF-8 in the input
/// (accented names, em dashes echoed back by the model). Only ASCII
/// comma bytes are ever dropped, so the output remains valid UTF-8.
fn fix_trailing_commas(s: &str) -> String {
    let bytes = s.as_bytes();
    let mut out: Vec<u8> = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b',' {
            // Look ahead past ASCII whitespace for a closing brace/bracket.
            let mut j = i + 1;
            while j < bytes.len() && bytes[j].is_ascii_whitespace() { j += 1; }
            if j < bytes.len() && (bytes[j] == b'}' || bytes[j] == b']') {
                // Skip the comma; the whitespace (if any) is kept.
                i += 1;
                continue;
            }
        }
        out.push(bytes[i]);
        i += 1;
    }
    String::from_utf8(out).expect("only ASCII commas removed; UTF-8 preserved")
}
/// Stable string label for a parsed action — used as the `kind` field
/// on shared-log entries and in prompt rendering.
fn action_kind(a: &Action) -> &'static str {
    match a {
        Action::Plan { .. } => "plan",
        Action::ToolCall { .. } => "tool_call",
        Action::ProposeDone { .. } => "propose_done",
        Action::Critique { .. } => "critique",
    }
}
/// Serialize an action for the shared log. Serialization of these
/// derive-based shapes shouldn't fail; degrade to Null rather than
/// panic if it ever does.
fn action_content(a: &Action) -> serde_json::Value {
    serde_json::to_value(a).unwrap_or(serde_json::Value::Null)
}
/// Returns true if the model name belongs to Ollama Cloud. Prefix-based
/// so new cloud models are pickable by name without a config update —
/// match the rough family prefixes Phase 20's matrix declares.
/// `qwen3.5:397b` lives in the cloud; `qwen3.5:latest` is local —
/// hence the `qwen3.5:3` prefix rather than matching all of `qwen3.5:`.
pub fn is_cloud_model(model: &str) -> bool {
    const CLOUD_PREFIXES: &[&str] = &[
        "gpt-oss:",
        "qwen3-coder:",
        "qwen3.5:3",
        "kimi-",
        "kimi/",
    ];
    CLOUD_PREFIXES.iter().any(|prefix| model.starts_with(prefix))
}
/// Truncate `s` to at most `n` bytes without splitting a UTF-8
/// character: the cut point backs off to the nearest char boundary.
///
/// The previous `&s[..n]` slice panicked whenever byte `n` landed
/// inside a multi-byte char — log/chunk content here routinely
/// carries em dashes ("Alice Smith — Welder…"), so that was a live
/// panic path on every prompt-render of long content.
fn truncate(s: &str, n: usize) -> String {
    if s.len() <= n {
        return s.to_string();
    }
    let mut end = n;
    // Back off (at most 3 steps — UTF-8 chars are ≤ 4 bytes) to a
    // valid boundary; index 0 is always a boundary, so this terminates.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    s[..end].to_string()
}
fn spec_field_str<'a>(spec: &'a serde_json::Value, key: &str) -> &'a str {
spec.get(key).and_then(|v| v.as_str()).unwrap_or("")
}
fn spec_target_count(spec: &serde_json::Value) -> u64 {
spec.get("target_count").and_then(|v| v.as_u64()).unwrap_or(0)
}
/// PORT FROM agent.ts:566. Same structural shape: operation + target +
/// candidates-surfaced hint + recent log + ONE-JSON-action instruction.
/// Staffing-specific fields degrade gracefully when spec is empty (non-
/// staffing task classes still get a usable prompt, just without the
/// target_role / target_count scaffolding).
///
/// Sections in order: role/task header → playbook boost → surfaced-
/// candidate rollup → log tail → tool catalog → action shapes →
/// stop-searching strategy. Tests pin several literal substrings of
/// this prompt, so treat the exact text as part of the contract.
fn build_executor_prompt(
    req: &RespondRequest,
    boost: &[serde_json::Value],
    log: &[LogEntry],
) -> String {
    let target_role = spec_field_str(&req.spec, "target_role");
    let target_count = spec_target_count(&req.spec);
    let target_city = spec_field_str(&req.spec, "target_city");
    let target_state = spec_field_str(&req.spec, "target_state");
    let approach_hint = spec_field_str(&req.spec, "approach_hint");
    let mut p = String::new();
    p.push_str("You are the EXECUTOR agent. Your job is to complete this task:\n\n");
    p.push_str(&format!("OPERATION: {}\n", req.operation));
    // Staffing specs get the full TARGET scaffold; anything else
    // degrades to task class + raw spec.
    if target_count > 0 && !target_role.is_empty() {
        p.push_str(&format!(
            "TARGET: {target_count} × {target_role} in {target_city}, {target_state}\n"
        ));
    } else {
        p.push_str(&format!("TASK CLASS: {}\n", req.task_class));
        if !req.spec.is_null() {
            p.push_str(&format!("SPEC: {}\n", req.spec));
        }
    }
    if !approach_hint.is_empty() {
        p.push_str(&format!("HINT: {approach_hint}\n"));
    }
    p.push_str("\nThe REVIEWER agent is watching every turn. They will flag drift. Stay on target.\n\n");
    // Playbook boost: at most 3 prior playbooks, framed as reference
    // material rather than a script to follow.
    if !boost.is_empty() {
        p.push_str("SIMILAR PAST PLAYBOOKS (reference, not prescription):\n");
        for (i, b) in boost.iter().take(3).enumerate() {
            p.push_str(&format!("  {}. {}\n", i + 1, b));
        }
        p.push('\n');
    }
    // Orchestrator-tracked candidate memory (agent.ts:568). The log-
    // render cap chops tool_result content, so the executor can't
    // always see what earlier searches returned. This block is a
    // durable rollup — every candidate the loop has seen, formatted
    // for prompt reading. Critical for letting the executor reach
    // propose_done instead of re-searching.
    let seen = candidates_seen(log);
    p.push_str("CANDIDATES SURFACED SO FAR (orchestrator-tracked, do not forget):\n");
    if seen.is_empty() {
        p.push_str("  (none yet — start with hybrid_search)\n");
    } else {
        p.push_str("  # Use the name + city + state for sql verification (NOT doc_id — that's the vector-index key, not workers_500k.worker_id)\n");
        for c in seen.iter().take(30) {
            p.push_str(&format!("  - name=\"{}\"  city=\"{}\"  state=\"{}\"  (vector doc_id={})\n",
                c.name, c.city, c.state, c.doc_id));
        }
        if seen.len() > 30 {
            p.push_str(&format!("  ... {} more surfaced\n", seen.len() - 30));
        }
    }
    p.push('\n');
    p.push_str("SHARED LOG (recent turns):\n");
    p.push_str(&render_log_for_prompt(log, 8));
    p.push('\n');
    // Tool catalog with worked examples — the examples double as the
    // doc_id-vs-worker_id guardrail.
    p.push_str("AVAILABLE TOOLS (use tool_call with these exact names — DO NOT invent others):\n");
    p.push_str("  hybrid_search(sql_filter: string, question: string, index_name: string, k?: number)\n");
    p.push_str("    SQL-narrow + vector-rerank. Use for: \"find candidates matching criteria X, ranked by semantic match to Y\".\n");
    p.push_str("    For staffing fills, index_name is typically \"w500k_b18\" or \"w500k_b3\" (workers_500k).\n");
    p.push_str("    Example: {\"tool\":\"hybrid_search\",\"args\":{\"sql_filter\":\"role='Welder' AND city='Toledo' AND state='OH'\",\"question\":\"reliable welders with OSHA certs\",\"index_name\":\"w500k_b18\",\"k\":10},\"rationale\":\"pull top 10 welder candidates in Toledo\"}\n");
    p.push_str("  sql(query: string) — SELECT-only. Use for: verification queries before propose_done.\n");
    p.push_str("    IMPORTANT: workers_500k.worker_id is an INTEGER internal key — NOT the doc_id from hybrid_search.\n");
    p.push_str("    To verify a candidate from hybrid_search results, query by name+city+state (which ARE in the chunk_text you already received):\n");
    p.push_str("    Example: {\"tool\":\"sql\",\"args\":{\"query\":\"SELECT worker_id, name, role FROM workers_500k WHERE name = 'Donna Hall' AND city = 'Columbus' AND state = 'OH' LIMIT 1\"},\"rationale\":\"confirm Donna Hall exists as a Warehouse Associate in Columbus\"}\n\n");
    p.push_str("Your next action MUST be a JSON object matching one of these shapes:\n");
    p.push_str("{\"kind\":\"plan\",\"steps\":[\"short step 1\",\"short step 2\"]}\n");
    p.push_str("{\"kind\":\"tool_call\",\"tool\":\"...\",\"args\":{...},\"rationale\":\"why\"}\n");
    if target_count > 0 {
        p.push_str(&format!(
            "{{\"kind\":\"propose_done\",\"fills\":[{{\"candidate_id\":\"...\",\"name\":\"First Last\"}}],\"rationale\":\"...\"}} — fills MUST have EXACTLY {target_count} entries.\n"
        ));
    } else {
        p.push_str("{\"kind\":\"propose_done\",\"fills\":[...],\"rationale\":\"...\"}\n");
    }
    // Stop-searching nudge — counterpart of the reviewer's loop
    // detection clause.
    if target_count > 0 {
        p.push_str(&format!(
            "\nSTRATEGY: once prior tool_result rows contain ≥ {target_count} candidates in {target_city}, {target_state} matching role \"{target_role}\", STOP SEARCHING. Pick the top {target_count} by score, verify ONE via `sql` tool, then emit propose_done. Do NOT repeat hybrid_search if you already have enough candidates.\n"
        ));
    }
    p.push_str("\nRespond with ONLY the JSON object. No markdown fences, no prose.\n");
    p
}
/// PORT FROM agent.ts:602. Reviewer prompt with the `awaitingApproval`
/// hard rule: if the most recent executor action was propose_done, the
/// reviewer cannot emit `continue` (would stall the loop).
fn build_reviewer_prompt(req: &RespondRequest, log: &[LogEntry]) -> String {
    let target_role = spec_field_str(&req.spec, "target_role");
    let target_count = spec_target_count(&req.spec);
    let target_city = spec_field_str(&req.spec, "target_city");
    let target_state = spec_field_str(&req.spec, "target_state");
    // The most recent executor entry (any kind) decides the hard rule
    // below: a pending propose_done forbids a "continue" verdict.
    let last_executor_kind = log.iter().rev()
        .find(|e| e.role == "executor")
        .map(|e| e.kind.as_str())
        .unwrap_or("");
    let awaiting_approval = last_executor_kind == "propose_done";
    let mut p = String::new();
    p.push_str("You are the REVIEWER agent. The EXECUTOR is trying to complete this task:\n\n");
    p.push_str(&format!("OPERATION: {}\n", req.operation));
    if target_count > 0 && !target_role.is_empty() {
        p.push_str(&format!(
            "TARGET: {target_count} × {target_role} in {target_city}, {target_state}\n\n"
        ));
    }
    // Drift checklist — staffing specs get concrete geo/role/count
    // bullets, everything else a generic task-class bullet.
    p.push_str("Your job: catch drift. Agents often wander from the actual objective. Specifically watch for:\n");
    if target_count > 0 && !target_city.is_empty() {
        p.push_str(&format!("- Proposing candidates who aren't in {target_city}, {target_state}.\n"));
        p.push_str(&format!("- Proposing candidates who don't have {target_role} skill.\n"));
        p.push_str(&format!("- Proposing fewer or more than {target_count} fills.\n"));
    } else {
        p.push_str("- Drifting from the stated task class or spec.\n");
    }
    p.push_str("- Irrelevant tool calls.\n\n");
    p.push_str("SHARED LOG (recent turns):\n");
    p.push_str(&render_log_for_prompt(log, 10));
    p.push('\n');
    p.push_str("Your next action MUST be a JSON object:\n");
    p.push_str("{\"kind\":\"critique\",\"verdict\":\"continue\" | \"drift\" | \"approve_done\",\"notes\":\"...\"}\n\n");
    p.push_str("- \"continue\" → executor is on a reasonable path, let them keep going.\n");
    p.push_str("- \"drift\" → executor is off-track; notes MUST tell them how to redirect.\n");
    p.push_str("- \"approve_done\" → executor's propose_done meets the criteria. Seal it.\n\n");
    if target_count > 0 {
        p.push_str(&format!(
            "APPROVAL CRITERIA (use only for propose_done):\n\
             1. Exactly {target_count} fills.\n\
             2. Each fill's name appears in a prior tool_result from {target_city}, {target_state} matching role \"{target_role}\".\n\
             3. Executor has SQL-verified at least one fill.\n\
             If 1-3 all hold, return approve_done.\n"
        ));
    }
    if awaiting_approval {
        p.push_str("\nHARD RULE: The executor's most recent action was propose_done. On this turn you CANNOT return \"continue\" — it would stall the task. Choose approve_done or drift (state which criterion failed in notes).\n");
    }
    // Loop-detection: if the executor has tool_called ≥ 3 times since
    // the last propose_done without proposing, it's stuck in a search
    // loop. Reviewer rubber-stamping "continue" here is the failure
    // pattern the 2026-04-23 battery surfaced in phase α task 2 —
    // 12 turns, 0 proposes, 100% reviewer:continue.
    let stuck_tool_calls = tool_calls_since_last_propose(log);
    if stuck_tool_calls >= 3 {
        p.push_str(&format!(
            "\nLOOP DETECTION: The executor has called tools {stuck_tool_calls} times without proposing done. \
             Look at the CANDIDATES SURFACED SO FAR (visible in executor's view): if there are already ≥ {} \
             matching candidates in {target_city}, {target_state} for role \"{target_role}\", respond with \
             verdict=\"drift\" and notes=\"You have enough candidates — pick the top {} by score and emit \
             propose_done this turn. Stop re-searching.\"\n",
            target_count, target_count,
        ));
    }
    p.push_str("\nRespond with ONLY the JSON object.\n");
    p
}
/// Render the last `tail` log entries as indented prompt lines, one
/// per turn, or a placeholder when the log is empty.
fn render_log_for_prompt(log: &[LogEntry], tail: usize) -> String {
    if log.is_empty() {
        return "(no prior turns)\n".into();
    }
    let first = log.len().saturating_sub(tail);
    log[first..]
        .iter()
        .map(|entry| {
            // tool_result is the executor's eyes — candidate data lives
            // there, and a tight cap would chop off every name/doc_id
            // the executor needs for propose_done. Keep those generous;
            // cap other kinds tighter since they're decision/status
            // entries that don't carry payload the executor re-reads.
            let cap = if entry.kind == "tool_result" { 1200 } else { 200 };
            let body = entry.content.to_string();
            format!(
                "  [t{:02} {} {}] {}\n",
                entry.turn, entry.role, entry.kind, truncate(&body, cap)
            )
        })
        .collect()
}
/// Ports agent.ts:538 `candidatesSeen`. Walks tool_result entries,
/// parses `sources[].chunk_text` for the staffing "Name — Role in
/// City, ST" shape, dedupes by doc_id. Returns an orchestrator-tracked
/// surface the executor prompt can show verbatim — stopping the
/// executor from "forgetting" candidates when the log-render truncates.
fn candidates_seen(log: &[LogEntry]) -> Vec<CandidateHint> {
let mut out: Vec<CandidateHint> = Vec::new();
let mut seen_ids: std::collections::HashSet<String> = std::collections::HashSet::new();
for e in log {
if e.kind != "tool_result" { continue; }
let Some(sources) = e.content.get("sources").and_then(|v| v.as_array()) else { continue };
for s in sources {
let Some(doc_id) = s.get("doc_id").and_then(|v| v.as_str()) else { continue };
if seen_ids.contains(doc_id) { continue; }
let chunk_text = s.get("chunk_text").and_then(|v| v.as_str()).unwrap_or("");
let Some((name_part, rest)) = chunk_text.split_once('—') else { continue };
let name = name_part.trim().to_string();
let loc = rest.split_once(" in ").map(|(_, r)| r).unwrap_or("");
let Some((city, state_raw)) = loc.split_once(',') else { continue };
let city = city.trim().to_string();
let state = state_raw
.trim()
.chars()
.take_while(|c| c.is_alphabetic())
.collect::<String>();
if name.is_empty() || city.is_empty() || state.is_empty() { continue; }
seen_ids.insert(doc_id.to_string());
out.push(CandidateHint {
doc_id: doc_id.to_string(),
name,
city,
state,
});
}
}
out
}
/// One candidate surfaced by a hybrid_search tool_result, parsed out
/// of the staffing chunk_text shape ("Name — Role in City, ST").
#[derive(Debug, Clone)]
struct CandidateHint {
    /// Vector-index document id — NOT workers_500k.worker_id.
    doc_id: String,
    name: String,
    city: String,
    /// Two-letter state code (leading alphabetic run of the chunk tail).
    state: String,
}
/// Count executor tool_calls since the last propose_done (or since
/// loop start if none). Used by the reviewer prompt to flag stuck
/// search loops — if an executor has tool_called ≥ 3× without
/// proposing, the reviewer should verdict:drift with a stop-searching
/// note rather than rubber-stamping continue.
fn tool_calls_since_last_propose(log: &[LogEntry]) -> u32 {
    log.iter()
        .rev()
        .filter(|entry| entry.role == "executor")
        .take_while(|entry| entry.kind != "propose_done")
        .filter(|entry| entry.kind == "tool_call")
        .count() as u32 // loop logs are tiny; cast cannot truncate in practice
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn log_entry_serializes_to_orchestrator_shape() {
let e = LogEntry::new(3, "executor", "qwen3.5:latest", "tool_call",
serde_json::json!({"tool": "hybrid_search"}));
let j = serde_json::to_value(&e).unwrap();
for k in ["turn", "role", "kind", "model", "content", "at"] {
assert!(j.get(k).is_some(), "missing field: {k}");
}
}
#[test]
fn outcome_into_log_is_lossless() {
let e = LogEntry::new(1, "system", "m", "boost_loaded", serde_json::json!({}));
let o = RespondOutcome::Failed { reason: "scaffold".into(), log: vec![e] };
assert_eq!(o.into_log().len(), 1);
}
#[test]
fn parse_executor_plan() {
let raw = r#"{"kind":"plan","steps":["hybrid_search","verify","propose_done"]}"#;
let a = parse_action(raw, Role::Executor).unwrap();
match a {
Action::Plan { steps } => assert_eq!(steps.len(), 3),
_ => panic!("wrong variant"),
}
}
#[test]
fn parse_executor_tool_call_with_stray_paren() {
// Mimics the qwen2.5 quirk where the model closes with ")}" —
// agent.ts:666 has the same fix. PORT from TS test territory.
let raw = r#"{"kind":"tool_call","tool":"sql","args":{"query":"SELECT 1"},"rationale":"verify")}"#;
let a = parse_action(raw, Role::Executor).unwrap();
match a {
Action::ToolCall { tool, .. } => assert_eq!(tool, "sql"),
_ => panic!("wrong variant"),
}
}
#[test]
fn parse_executor_propose_done_with_fence() {
let raw = "```json\n{\"kind\":\"propose_done\",\"fills\":[{\"candidate_id\":\"W-1\",\"name\":\"A B\"}],\"rationale\":\"ok\"}\n```";
let a = parse_action(raw, Role::Executor).unwrap();
match a {
Action::ProposeDone { fills, .. } => {
assert_eq!(fills.len(), 1);
assert_eq!(fills[0].candidate_id, "W-1");
}
_ => panic!("wrong variant"),
}
}
#[test]
fn parse_reviewer_wrapped_verdict() {
let raw = r#"{"kind":"critique","verdict":"approve_done","notes":"ok"}"#;
let a = parse_action(raw, Role::Reviewer).unwrap();
match a {
Action::Critique { verdict, .. } => assert_eq!(verdict, Verdict::ApproveDone),
_ => panic!("wrong variant"),
}
}
#[test]
fn parse_reviewer_bare_verdict_normalizes() {
// agent.ts:690-694 — qwen2.5/mistral emit the verdict as `kind`.
let raw = r#"{"kind":"drift","notes":"wrong city"}"#;
let a = parse_action(raw, Role::Reviewer).unwrap();
match a {
Action::Critique { verdict, notes } => {
assert_eq!(verdict, Verdict::Drift);
assert_eq!(notes, "wrong city");
}
_ => panic!("wrong variant"),
}
}
#[test]
fn parse_reviewer_rejects_unknown_verdict() {
let raw = r#"{"kind":"maybe","notes":"?"}"#;
assert!(parse_action(raw, Role::Reviewer).is_err());
}
#[test]
fn parse_trailing_comma() {
let raw = r#"{"kind":"plan","steps":["a","b",]}"#;
assert!(parse_action(raw, Role::Executor).is_ok());
}
#[test]
fn parse_no_json_errors_cleanly() {
let raw = "sorry I cannot comply";
let err = parse_action(raw, Role::Executor).unwrap_err();
assert!(err.contains("no JSON"));
}
#[test]
fn candidates_seen_parses_sources() {
let log = vec![
LogEntry::new(1, "executor", "m", "tool_result", serde_json::json!({
"sources": [
{"doc_id": "W-1", "chunk_text": "Alice Smith — Welder in Toledo, OH. 5 years experience."},
{"doc_id": "W-2", "chunk_text": "Bob Jones — Welder in Toledo, OH. Night shift."},
]
})),
LogEntry::new(2, "reviewer", "m", "critique", serde_json::json!({
"verdict": "continue", "notes": ""
})),
LogEntry::new(3, "executor", "m", "tool_result", serde_json::json!({
"sources": [
{"doc_id": "W-2", "chunk_text": "Bob Jones — Welder in Toledo, OH. Night shift."},
{"doc_id": "W-3", "chunk_text": "Carol Davis — Welder in Toledo, OH. AWS certified."},
]
})),
];
let seen = candidates_seen(&log);
assert_eq!(seen.len(), 3, "dedup by doc_id");
assert_eq!(seen[0].name, "Alice Smith");
assert_eq!(seen[0].city, "Toledo");
assert_eq!(seen[0].state, "OH");
assert_eq!(seen[2].name, "Carol Davis");
}
#[test]
fn candidates_seen_ignores_malformed() {
let log = vec![
LogEntry::new(1, "executor", "m", "tool_result", serde_json::json!({
"sources": [
{"doc_id": "W-1", "chunk_text": "no dash here"},
{"doc_id": "W-2", "chunk_text": "Name — but no 'in' keyword"},
{"doc_id": "W-3"}, // no chunk_text
]
})),
];
assert_eq!(candidates_seen(&log).len(), 0);
}
#[test]
fn tool_calls_since_propose_counts_correctly() {
let log = vec![
LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})),
LogEntry::new(2, "executor", "m", "tool_call", serde_json::json!({})),
LogEntry::new(3, "executor", "m", "tool_call", serde_json::json!({})),
];
assert_eq!(tool_calls_since_last_propose(&log), 3);
// propose_done resets the counter
let log2 = vec![
LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})),
LogEntry::new(2, "executor", "m", "propose_done", serde_json::json!({})),
LogEntry::new(3, "executor", "m", "tool_call", serde_json::json!({})),
];
assert_eq!(tool_calls_since_last_propose(&log2), 1);
}
#[test]
fn executor_prompt_includes_surfaced_candidates() {
let req = req_with_spec(serde_json::json!({
"target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH"
}));
let log = vec![
LogEntry::new(1, "executor", "m", "tool_result", serde_json::json!({
"sources": [
{"doc_id": "W-1", "chunk_text": "Alice Smith — Welder in Toledo, OH."},
]
})),
];
let p = build_executor_prompt(&req, &[], &log);
assert!(p.contains("CANDIDATES SURFACED SO FAR"));
// Prompt format deliberately separates name from doc_id now —
// the line reads `name="Alice Smith" ... (vector doc_id=W-1)`
// so the executor prompt explicitly tells the model NOT to
// conflate doc_id with workers_500k.worker_id. Assertion was
// expecting the old concatenated format; update to match the
// semantic contract (both tokens present, any order).
assert!(p.contains("Alice Smith"));
assert!(p.contains("W-1"));
assert!(p.contains("Toledo"));
}
#[test]
fn reviewer_prompt_flags_loop_after_three_tool_calls() {
let req = req_with_spec(serde_json::json!({
"target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH"
}));
let log = vec![
LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})),
LogEntry::new(2, "executor", "m", "tool_call", serde_json::json!({})),
LogEntry::new(3, "executor", "m", "tool_call", serde_json::json!({})),
];
let p = build_reviewer_prompt(&req, &log);
assert!(p.contains("LOOP DETECTION"));
assert!(p.contains("Stop re-searching"));
}
#[test]
fn reviewer_prompt_no_loop_clause_before_three_calls() {
let req = req_with_spec(serde_json::json!({
"target_role": "Welder", "target_count": 2, "target_city": "Toledo", "target_state": "OH"
}));
let log = vec![
LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({})),
];
let p = build_reviewer_prompt(&req, &log);
assert!(!p.contains("LOOP DETECTION"));
}
#[test]
fn is_cloud_model_recognizes_cloud_prefixes() {
assert!(is_cloud_model("gpt-oss:120b"));
assert!(is_cloud_model("gpt-oss:20b"));
assert!(is_cloud_model("qwen3-coder:480b"));
assert!(is_cloud_model("qwen3.5:397b"));
assert!(is_cloud_model("kimi-k2.5"));
assert!(is_cloud_model("kimi/k2-thinking"));
}
#[test]
fn is_cloud_model_rejects_local_prefixes() {
assert!(!is_cloud_model("qwen3.5:latest"));
assert!(!is_cloud_model("qwen3:latest"));
assert!(!is_cloud_model("qwen2.5:latest"));
assert!(!is_cloud_model("mistral"));
assert!(!is_cloud_model("nomic-embed-text"));
}
#[test]
fn spec_target_count_defaults_to_zero() {
let spec = serde_json::json!({});
assert_eq!(spec_target_count(&spec), 0);
}
    // --- prompt spec/no-spec paths + shared test fixtures ---

    #[test]
    fn executor_prompt_includes_target_when_spec_has_it() {
        let req = RespondRequest {
            task_class: "staffing.fill".into(),
            operation: "fill: Welder x2 in Toledo, OH".into(),
            spec: serde_json::json!({
                "target_role": "Welder", "target_count": 2,
                "target_city": "Toledo", "target_state": "OH"
            }),
            executor_model: None, reviewer_model: None, max_turns: None,
        };
        let p = build_executor_prompt(&req, &[], &[]);
        assert!(p.contains("TARGET: 2 × Welder in Toledo, OH"));
        assert!(p.contains("EXACTLY 2 entries"));
        assert!(p.contains("hybrid_search"), "executor prompt must list hybrid_search in tool catalog");
        assert!(p.contains("sql(query"), "executor prompt must list sql tool signature");
        assert!(p.contains("DO NOT invent others"), "executor prompt must warn against tool-name invention");
    }
    #[test]
    fn executor_prompt_degrades_without_spec() {
        // Null spec → task-class header instead of the TARGET scaffold.
        let req = RespondRequest {
            task_class: "code.review".into(),
            operation: "review PR #42".into(),
            spec: serde_json::json!(null),
            executor_model: None, reviewer_model: None, max_turns: None,
        };
        let p = build_executor_prompt(&req, &[], &[]);
        assert!(p.contains("TASK CLASS: code.review"));
        assert!(!p.contains("TARGET:"));
    }
    #[test]
    fn reviewer_prompt_adds_hard_rule_when_awaiting_approval() {
        let req = RespondRequest {
            task_class: "staffing.fill".into(),
            operation: "fill: Welder x2 in Toledo, OH".into(),
            spec: serde_json::json!({"target_count": 2}),
            executor_model: None, reviewer_model: None, max_turns: None,
        };
        let log = vec![LogEntry::new(1, "executor", "m", "propose_done", serde_json::json!({}))];
        let p = build_reviewer_prompt(&req, &log);
        assert!(p.contains("HARD RULE"));
    }
    // Fixture: staffing request with a caller-supplied spec.
    fn req_with_spec(spec: serde_json::Value) -> RespondRequest {
        RespondRequest {
            task_class: "staffing.fill".into(),
            operation: "fill: Welder x2 in Toledo, OH".into(),
            spec,
            executor_model: None,
            reviewer_model: None,
            max_turns: None,
        }
    }
    // Fixture: representative usage stats for outcomes-row tests.
    fn sample_stats() -> LoopStats {
        LoopStats {
            requests: 8, prompt_tokens: 12345, completion_tokens: 2345,
            total_tokens: 14690, latency_ms: 42000,
        }
    }
#[test]
fn sig_hash_is_stable_for_same_inputs() {
let spec = serde_json::json!({
"target_role": "Welder", "target_city": "Toledo", "target_state": "OH"
});
let a = sig_hash(&req_with_spec(spec.clone()));
let b = sig_hash(&req_with_spec(spec));
assert_eq!(a, b);
assert_eq!(a.len(), 16);
}
#[test]
fn sig_hash_differs_by_geo() {
let a = sig_hash(&req_with_spec(serde_json::json!({
"target_role": "Welder", "target_city": "Toledo", "target_state": "OH"
})));
let b = sig_hash(&req_with_spec(serde_json::json!({
"target_role": "Welder", "target_city": "Dayton", "target_state": "OH"
})));
assert_ne!(a, b);
}
#[test]
fn seed_context_uses_hint_when_present() {
let req = req_with_spec(serde_json::json!({
"approach_hint": "hybrid search", "target_role": "Welder", "target_city": "Toledo"
}));
assert_eq!(seed_context(&req), "hybrid search");
}
#[test]
fn seed_context_falls_back_to_role_city_state() {
let req = req_with_spec(serde_json::json!({
"target_role": "Welder", "target_city": "Toledo", "target_state": "OH"
}));
assert_eq!(seed_context(&req), "Welder fill in Toledo, OH");
}
#[test]
fn seed_context_falls_back_to_operation_for_non_staffing() {
    // An empty spec leaves only the raw operation string as seed context.
    assert_eq!(
        seed_context(&req_with_spec(serde_json::json!({}))),
        "fill: Welder x2 in Toledo, OH",
    );
}
#[test]
fn outcomes_row_stamps_full_indicator_set_on_success() {
    // A successful outcome must stamp the complete schema-v2 indicator set:
    // polarity, turn counts, usage, model/provider identity, the playbook
    // seed, the 16-char sig_hash, and the truth-rule citation array.
    let req = req_with_spec(serde_json::json!({
        "target_role": "Welder", "target_city": "Toledo", "target_state": "OH"
    }));
    let stats = sample_stats();
    let outcome = RespondOutcome::Ok {
        artifact: serde_json::json!({"fills": [{"candidate_id": "W-1", "name": "A B"}]}),
        log: vec![],
    };
    let seed = serde_json::json!({"outcome": {"mode": "added"}, "entries_after": 1337});
    let row = build_outcomes_row(&req, &stats, 4, false, &outcome, Some(seed));
    assert_eq!(row["schema_version"], 2);
    assert_eq!(row["source_service"], "v1.respond");
    assert_eq!(row["task_class"], "staffing.fill");
    assert_eq!(row["ok"], true);
    assert_eq!(row["polarity"], "success_confirmation");
    assert_eq!(row["iterations"], 4);
    assert_eq!(row["turns"], 4);
    assert_eq!(row["usage"]["total_tokens"], 14690);
    assert_eq!(row["usage"]["requests"], 8);
    assert_eq!(row["models"]["executor"], "qwen3.5:latest");
    assert_eq!(row["provider"], "ollama");
    assert_eq!(row["playbook_seed"]["entries_after"], 1337);
    // assert_eq! rather than assert!(.. == ..) so a mismatch reports the
    // actual length instead of a bare `false` (clippy: bool_assert_comparison
    // family / assert_eq idiom).
    assert_eq!(row["sig_hash"].as_str().unwrap().len(), 16);
    assert!(row["truth_rule_citations"].is_array());
}
#[test]
fn outcomes_row_stamps_failure_polarity() {
    // A Failed outcome flips ok/polarity, records the reason, and leaves the
    // success-only and correction fields null or false.
    let outcome = RespondOutcome::Failed {
        reason: "3 consecutive drifts".into(),
        log: vec![],
    };
    let row = build_outcomes_row(
        &req_with_spec(serde_json::json!({})),
        &sample_stats(),
        2,
        false,
        &outcome,
        None,
    );
    assert_eq!(row["ok"], false);
    assert_eq!(row["polarity"], "failure_pattern");
    assert_eq!(row["error"], "3 consecutive drifts");
    assert_eq!(row["fills"], serde_json::Value::Null);
    assert!(row["playbook_seed"].is_null());
    assert_eq!(row["correction_applied"], false);
    assert!(row["correction_effective"].is_null());
}
#[test]
fn outcomes_row_marks_correction_effective_when_overseer_called_and_ok() {
    // overseer_called = true combined with an Ok outcome means the
    // correction was both applied and effective.
    let ok = RespondOutcome::Ok {
        artifact: serde_json::json!({"fills": []}),
        log: vec![],
    };
    let row = build_outcomes_row(
        &req_with_spec(serde_json::json!({})),
        &sample_stats(),
        3,
        true,
        &ok,
        None,
    );
    assert_eq!(row["correction_applied"], true);
    assert_eq!(row["correction_effective"], true);
}
#[test]
fn outcomes_row_marks_correction_ineffective_when_overseer_called_and_failed() {
    // overseer_called = true combined with a Failed outcome means the
    // correction was applied but did not work.
    let failed = RespondOutcome::Failed {
        reason: "still drifting after overseer".into(),
        log: vec![],
    };
    let row = build_outcomes_row(
        &req_with_spec(serde_json::json!({})),
        &sample_stats(),
        3,
        true,
        &failed,
        None,
    );
    assert_eq!(row["correction_applied"], true);
    assert_eq!(row["correction_effective"], false);
}
// Atomic counter + PID guarantees a unique temp-dir path across parallel
// test invocations. A nanos-only timestamp showed a 1/5 flake under
// `cargo test` because SystemTime can repeat across threads that run
// within sub-ns of each other.
static APPEND_TEST_SEQ: std::sync::atomic::AtomicU64 =
    std::sync::atomic::AtomicU64::new(0);
#[tokio::test]
async fn append_outcomes_row_at_writes_valid_jsonl() {
    // Unique per-process, per-invocation directory; see APPEND_TEST_SEQ.
    let seq = APPEND_TEST_SEQ.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
    let dir = std::env::temp_dir().join(format!("lh_outcomes_{}_{}", std::process::id(), seq));
    let path = dir.join("outcomes.jsonl");
    let row = serde_json::json!({"schema_version": 2, "ok": true, "test": "marker"});

    // Append twice: each call must emit exactly one parseable JSON line.
    for _ in 0..2 {
        append_outcomes_row_at(&path, &row).await.unwrap();
    }

    let contents = std::fs::read_to_string(&path).unwrap();
    assert_eq!(contents.lines().count(), 2);
    for line in contents.lines() {
        let parsed: serde_json::Value = serde_json::from_str(line).unwrap();
        assert_eq!(parsed["test"], "marker");
    }
    std::fs::remove_dir_all(&dir).ok();
}
#[test]
fn trim_result_leaves_small_arrays_alone() {
    // Arrays at or under the row cap pass through without a _trimmed marker.
    let trimmed = trim_result(&serde_json::json!({ "rows": [1, 2, 3] }));
    assert_eq!(trimmed["rows"].as_array().unwrap().len(), 3);
    assert!(trimmed.get("_trimmed").is_none());
}
#[test]
fn trim_result_caps_at_20_and_annotates() {
    // 100 rows in: keep the first 20, annotate the 80 dropped, and preserve
    // sibling fields untouched.
    let rows = (0..100).map(serde_json::Value::from).collect::<Vec<_>>();
    let input = serde_json::json!({ "rows": rows, "other_field": "kept" });
    let trimmed = trim_result(&input);
    assert_eq!(trimmed["rows"].as_array().unwrap().len(), 20);
    assert_eq!(trimmed["_trimmed"], "80 more rows");
    assert_eq!(trimmed["other_field"], "kept");
}
#[test]
fn trim_result_passthrough_when_no_rows() {
    // Objects without a "rows" key come back untouched.
    let trimmed = trim_result(&serde_json::json!({ "answer": "42" }));
    assert_eq!(trimmed["answer"], "42");
}
#[test]
fn reviewer_prompt_omits_hard_rule_otherwise() {
    // With no `propose_done` entry in the log, the reviewer prompt must not
    // contain the approval-gate HARD RULE text. Consistency fix: build the
    // request via the shared `req_with_spec` helper instead of duplicating
    // the `RespondRequest` literal.
    let req = req_with_spec(serde_json::json!({"target_count": 2}));
    let log = vec![LogEntry::new(1, "executor", "m", "tool_call", serde_json::json!({}))];
    let p = build_reviewer_prompt(&req, &log);
    assert!(!p.contains("HARD RULE"));
}
}