pathway_memory: audit-consensus → retire wire

When observer's hand-review explicitly rejects the output of a hot-swap-recommended model, the matrix's recommendation was wrong for this context. Auto-retire the trace so future agents don't get the same poisoned recommendation in their preamble.

crates/vectord/src/pathway_memory.rs:
- add `trace_uid` to the HotSwapCandidate response and populate it from the matched trace. This gives consumers single-trace precision for /pathway/retire.

tests/real-world/scrum_master_pipeline.ts:
- HotSwapCandidate interface gains trace_uid
- new retirePathwayTrace() helper (fire-and-forget, fail-open)
- in the obsVerdict reject branch: if hotSwap was active AND the rejected model is the hot-swap-recommended one AND observer confidence ≥0.7, fire retire and null hotSwap so post-loop replay bookkeeping doesn't double-process.
- hotSwap declared `let` (was const) so it can be nulled

Cycle verdicts ("needs different angle") don't trigger retire — only outright rejects do. Confidence gate avoids retiring on heuristic-fallback verdicts that come back without a confidence number.

Closes the "audit-consensus → retire" item from HANDOVER.md.

Live-tested: insert synthetic trace → /pathway/retire by trace_uid → retired counter 1 → 2.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 00:01:20 -05:00 · 2026-04-26 00:01:20 -05:00 · 626f18d491
commit 626f18d491
parent e1ef185868
2 changed files with 51 additions and 1 deletions
--- a/crates/vectord/src/pathway_memory.rs
+++ b/crates/vectord/src/pathway_memory.rs
@ -373,6 +373,11 @@ pub struct PathwayMemory {
 #[derive(Debug, Serialize)]
 pub struct HotSwapCandidate {
    pub pathway_id: String,
+    /// trace_uid of the SPECIFIC trace this hot-swap recommendation
+    /// came from. Lets a caller call /pathway/retire with single-trace
+    /// precision when observer rejects the result — the audit-consensus
+    /// → retire wire (HANDOVER §queued, ADR-021).
+    pub trace_uid: String,
    pub similarity: f32,
    pub replay_count: u32,
    pub success_rate: f32,
@ -754,6 +759,7 @@ impl PathwayMemory {
        let accepted = p.ladder_attempts.iter().find(|a| a.accepted)?;
        Some(HotSwapCandidate {
            pathway_id: p.pathway_id.clone(),
+            trace_uid: p.trace_uid.clone(),
            similarity,
            replay_count: p.replay_count,
            success_rate: p.success_rate(),
--- a/tests/real-world/scrum_master_pipeline.ts
+++ b/tests/real-world/scrum_master_pipeline.ts
@ -224,6 +224,7 @@ function buildQueryVec(taskClass: string, filePath: string, signalClass: string

 interface HotSwapCandidate {
  pathway_id: string;
+  trace_uid: string;
  similarity: number;
  replay_count: number;
  success_rate: number;
@ -231,6 +232,25 @@ interface HotSwapCandidate {
  recommended_model: string;
 }

+/**
+ * Audit-consensus → retire wire (2026-04-25). Asks the gateway to retire a
+ * single pathway trace. Intended for the case where observer rejects the
+ * output of a hot-swap-recommended model: the matrix's recommendation was
+ * wrong for this context, so the trace is retired and future agents don't
+ * get the same poisoned recommendation in their preamble.
+ *
+ * Best-effort by design: this never throws. An empty `traceUid` is a no-op,
+ * and network errors, the 3s timeout, and non-2xx replies are all ignored.
+ * Server-side retire is idempotent, so duplicate calls are safe.
+ *
+ * @param traceUid trace_uid of the specific trace to retire.
+ * @param reason   Human-readable justification, sent as `reason` in the
+ *                 JSON request body.
+ */
+async function retirePathwayTrace(traceUid: string, reason: string): Promise<void> {
+  if (!traceUid) return;
+  try {
+    const res = await fetch(`${GATEWAY}/vectors/pathway/retire`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ trace_uid: traceUid, reason }),
+      signal: AbortSignal.timeout(3000),
+    });
+    // Nothing in the reply is needed, but an unread body can keep the
+    // underlying connection checked out until GC. Cancel it so the socket
+    // is released promptly.
+    await res.body?.cancel();
+  } catch {
+    // Fire-and-forget; pathway memory is a hint store, not a hard gate.
+  }
+}
+
 async function queryHotSwap(taskClass: string, filePath: string, signalClass: string | null): Promise<HotSwapCandidate | null> {
  try {
    const query_vec = buildQueryVec(taskClass, filePath, signalClass);
@ -1361,7 +1381,9 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
  // service unavailable → null candidate → business as usual.
  const signalClass = await lookupSignalClass(rel);
  const taskClass = "scrum_review";
-  const hotSwap = await queryHotSwap(taskClass, rel, signalClass);
+  // mutable so retire-on-reject can null it after firing — prevents
+  // the post-loop replay bookkeeping from re-touching a retired trace.
+  let hotSwap: HotSwapCandidate | null = await queryHotSwap(taskClass, rel, signalClass);

  // ADR-021 Phase C: pre-review enrichment. Pull aggregated bug
  // fingerprints the matrix index has learned for this narrow
@ -1515,6 +1537,28 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
      const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
      history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
      pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
+      // Audit-consensus → retire: if a hot-swap influenced THIS
+      // attempt (we're trying its recommended model) and observer
+      // explicitly rejects, the matrix recommendation is wrong for
+      // this context. Retire the trace so future agents don't repeat
+      // it. Cycle verdicts ("needs different angle") don't trigger
+      // retire — only outright rejects do. Confidence ≥0.7 gate
+      // avoids retiring on heuristic-fallback verdicts (which return
+      // no confidence).
+      if (
+        hotSwap &&
+        obsVerdict.verdict === "reject" &&
+        rung.model === hotSwap.recommended_model &&
+        (obsVerdict.confidence ?? 0) >= 0.7
+      ) {
+        const retireReason = `observer reject on hot-swap replay: ${obsVerdict.notes ?? "no notes"}`;
+        log(`    🗑 retiring pathway ${hotSwap.trace_uid.slice(0, 8)}… (${retireReason})`);
+        retirePathwayTrace(hotSwap.trace_uid, retireReason);
+        // Null out hotSwap so the post-loop replay bookkeeping doesn't
+        // also try to record success/failure against the now-retired
+        // trace.
+        hotSwap = null;
+      }
      qualityRetriesOnCurrentModel++;
      if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
        log(`    ✗ ${reason} — quality retries exhausted on ${rung.model}, advancing fallback`);