From 626f18d4914d1bb17f84a5ebc1b82146aeb2fbe9 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Apr 2026 00:01:20 -0500 Subject: [PATCH] =?UTF-8?q?pathway=5Fmemory:=20audit-consensus=20=E2=86=92?= =?UTF-8?q?=20retire=20wire?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When observer's hand-review explicitly rejects the output of a hot-swap-recommended model, the matrix's recommendation was wrong for this context. Auto-retire the trace so future agents don't get the same poisoned recommendation in their preamble. crates/vectord/src/pathway_memory.rs — add `trace_uid` to HotSwapCandidate response and populate from the matched trace. This gives consumers single-trace precision for /pathway/retire. tests/real-world/scrum_master_pipeline.ts: - HotSwapCandidate interface gains trace_uid - new retirePathwayTrace() helper (fire-and-forget, fall-open) - in the obsVerdict reject branch: if hotSwap was active AND the rejected model is the hot-swap-recommended one AND observer confidence ≥0.7, fire retire and null hotSwap so post-loop replay bookkeeping doesn't double-process. - hotSwap declared `let` (was const) so it can be nulled Cycle verdicts ("needs different angle") don't trigger retire — only outright rejects do. Confidence gate avoids retiring on heuristic-fallback verdicts that come back without a confidence number. Closes the "audit-consensus → retire" item from HANDOVER.md. Live-tested: insert synthetic trace → /pathway/retire by trace_uid → retired counter 1 → 2. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vectord/src/pathway_memory.rs | 6 +++ tests/real-world/scrum_master_pipeline.ts | 46 ++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/crates/vectord/src/pathway_memory.rs b/crates/vectord/src/pathway_memory.rs index f3bde00..603dfa4 100644 --- a/crates/vectord/src/pathway_memory.rs +++ b/crates/vectord/src/pathway_memory.rs @@ -373,6 +373,11 @@ pub struct PathwayMemory { #[derive(Debug, Serialize)] pub struct HotSwapCandidate { pub pathway_id: String, + /// trace_uid of the SPECIFIC trace this hot-swap recommendation + /// came from. Lets a caller call /pathway/retire with single-trace + /// precision when observer rejects the result — the audit-consensus + /// → retire wire (HANDOVER §queued, ADR-021). + pub trace_uid: String, pub similarity: f32, pub replay_count: u32, pub success_rate: f32, @@ -754,6 +759,7 @@ impl PathwayMemory { let accepted = p.ladder_attempts.iter().find(|a| a.accepted)?; Some(HotSwapCandidate { pathway_id: p.pathway_id.clone(), + trace_uid: p.trace_uid.clone(), similarity, replay_count: p.replay_count, success_rate: p.success_rate(), diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts index 9e956e6..509180f 100644 --- a/tests/real-world/scrum_master_pipeline.ts +++ b/tests/real-world/scrum_master_pipeline.ts @@ -224,6 +224,7 @@ function buildQueryVec(taskClass: string, filePath: string, signalClass: string interface HotSwapCandidate { pathway_id: string; + trace_uid: string; similarity: number; replay_count: number; success_rate: number; @@ -231,6 +232,25 @@ interface HotSwapCandidate { recommended_model: string; } +// Audit-consensus → retire wire (2026-04-25). When observer rejects the +// output of a hot-swap-recommended model, the matrix's recommendation +// was wrong for this context — retire the trace so future agents don't +// get the same poisoned recommendation in their preamble. 
Server-side +// retire is idempotent so duplicate calls are safe. +async function retirePathwayTrace(traceUid: string, reason: string): Promise<void> { + if (!traceUid) return; + try { + await fetch(`${GATEWAY}/vectors/pathway/retire`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ trace_uid: traceUid, reason }), + signal: AbortSignal.timeout(3000), + }); + } catch { + // Fire-and-forget; pathway memory is a hint store, not a hard gate. + } +} + async function queryHotSwap(taskClass: string, filePath: string, signalClass: string | null): Promise<HotSwapCandidate | null> { try { const query_vec = buildQueryVec(taskClass, filePath, signalClass); @@ -1361,7 +1381,9 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of // service unavailable → null candidate → business as usual. const signalClass = await lookupSignalClass(rel); const taskClass = "scrum_review"; - const hotSwap = await queryHotSwap(taskClass, rel, signalClass); + // mutable so retire-on-reject can null it after firing — prevents + // the post-loop replay bookkeeping from re-touching a retired trace. + let hotSwap: HotSwapCandidate | null = await queryHotSwap(taskClass, rel, signalClass); // ADR-021 Phase C: pre-review enrichment. Pull aggregated bug // fingerprints the matrix index has learned for this narrow @@ -1515,6 +1537,28 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`; history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason }); + // Audit-consensus → retire: if a hot-swap influenced THIS + // attempt (we're trying its recommended model) and observer + // explicitly rejects, the matrix recommendation is wrong for + // this context. 
Retire the trace so future agents don't repeat + // it. Cycle verdicts ("needs different angle") don't trigger + // retire — only outright rejects do. Confidence ≥0.7 gate + // avoids retiring on heuristic-fallback verdicts (which return + // no confidence). + if ( + hotSwap && + obsVerdict.verdict === "reject" && + rung.model === hotSwap.recommended_model && + (obsVerdict.confidence ?? 0) >= 0.7 + ) { + const retireReason = `observer reject on hot-swap replay: ${obsVerdict.notes ?? "no notes"}`; + log(` 🗑 retiring pathway ${hotSwap.trace_uid.slice(0, 8)}… (${retireReason})`); + retirePathwayTrace(hotSwap.trace_uid, retireReason); + // Null out hotSwap so the post-loop replay bookkeeping doesn't + // also try to record success/failure against the now-retired + // trace. + hotSwap = null; + } qualityRetriesOnCurrentModel++; if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) { log(` ✗ ${reason} — quality retries exhausted on ${rung.model}, advancing fallback`);