Add self-improving pipeline: auto-scoring, analytics, reactive refine, routing intelligence

Phase 1 — Run Quality Scoring:
- Auto-score every run in background via qwen2.5 judge (1-10)
- Thumbs up/down vote buttons on output cards
- POST /api/runs/<id>/score for user feedback
- run_saved SSE event enables vote buttons after run completes
- User votes override auto-scores (race-condition safe)
- DB: quality_score, score_method, score_metadata on team_runs

Phase 1 — Analytics Dashboard:
- GET /api/admin/analytics: score-by-mode, score-by-model, heatmap, trend
- New Analytics tab on Admin page with bar charts, heatmap table, trend sparkline
- Scoring coverage tracker (scored vs total runs)
- Model × Mode heatmap with color-coded cells

Phase 2 — Reactive Pipeline:
- _assess_stage(): orchestrator evaluates each stage's output mid-run
- _reactive_decide(): can insert/skip stages based on assessment
- Dynamic stage loop replaces fixed iteration in run_refine()
- Budget tracking prevents infinite loops (max_stages hard cap)
- Reactive decisions render as dashed notification bars between cards
- Pipeline adjusts in real-time: "Inserting VALIDATE — high severity gaps found"

Phase 3 — Cross-Run Learning:
- _build_routing_table(): queries historical scores for model×mode performance
- Best stage sequences per content_type from pipeline_runs
- Routing table cached with 30-min TTL
- Auto-Refine strategist prompt augmented with historical data
- GET /api/suggest-models?mode=X returns top 3 models for that mode

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-29 06:18:32 -05:00
parent c2cc211f21
commit 8ad221b41f

View File

@ -1776,16 +1776,73 @@ def get_db():
def save_run(mode, prompt, config_data, responses):
    """Persist a completed run to team_runs and kick off background auto-scoring.

    Args:
        mode: run mode name (stored and later used for analytics grouping).
        prompt: the user prompt for the run.
        config_data: run configuration (JSON-serialized into the row).
        responses: list of response dicts; each may carry "model"/"text"/"role".

    Returns:
        The new team_runs row id, or None if the insert failed. DB errors
        are logged and swallowed — saving must never break the request path.
    """
    models = list({r.get("model", "") for r in responses if r.get("model")})
    run_id = None
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # RETURNING id gives us the row id to hand to the scorer thread.
                cur.execute(
                    "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                    (mode, prompt, json.dumps(config_data), json.dumps(responses), models)
                )
                run_id = cur.fetchone()[0]
            conn.commit()
    except Exception as e:
        print(f"[DB] save_run error: {e}")
    if run_id and responses:
        # Fire-and-forget: judge-model scoring must not block the caller.
        threading.Thread(target=_auto_score_run, args=(run_id, mode, prompt, responses), daemon=True).start()
    return run_id
# ─── AUTO-SCORING ENGINE ─────────────────────────────────────
_SCORE_MODEL = "qwen2.5:latest"  # local judge model used for background grading

def _auto_score_run(run_id, mode, prompt, responses):
    """Background worker: grade a completed run (1-10) with a judge model.

    Picks the longest non-error response as representative, asks
    _SCORE_MODEL for a JSON verdict, and writes the score to team_runs.
    The UPDATE only touches rows whose score_method is NULL or 'auto',
    so a user vote recorded in the meantime is never overwritten.
    Runs in a daemon thread — every failure is logged, never raised.
    """
    try:
        candidates = [r for r in responses if r.get("role") != "error" and r.get("text")]
        if not candidates:
            return
        best = max(candidates, key=lambda r: len(r.get("text", "")))
        text = best["text"][:3000]  # cap judge input to keep latency bounded
        judge_prompt = (
            f"Rate the quality of this AI response on a scale of 1-10.\n"
            f"Consider: relevance to the prompt, completeness, accuracy, clarity, usefulness.\n\n"
            f"PROMPT: {prompt[:500]}\n\n"
            f"MODE: {mode}\n\n"
            f"RESPONSE:\n{text}\n\n"
            f"Return ONLY a JSON object: {{\"score\": N, \"reason\": \"one sentence\"}}"
        )
        judgment = query_model(_SCORE_MODEL, judge_prompt)
        score = _parse_judge_score(judgment)
        if score is None:
            return
        with get_db() as conn:
            with conn.cursor() as cur:
                # score_method guard keeps user votes authoritative over auto-scores.
                cur.execute(
                    "UPDATE team_runs SET quality_score = %s, score_method = 'auto', score_metadata = %s WHERE id = %s AND (score_method IS NULL OR score_method = 'auto')",
                    (score, json.dumps({"judge": _SCORE_MODEL, "judgment": judgment[:500], "scored_model": best.get("model", ""), "reason": judgment[:200]}), run_id)
                )
            conn.commit()
        print(f"[SCORE] run {run_id} scored {score}/10 by {_SCORE_MODEL}")
    except Exception as e:
        print(f"[SCORE] auto-score error for run {run_id}: {e}")


def _parse_judge_score(judgment):
    """Extract a 1-10 score from judge-model output.

    Tries the JSON object the prompt requested first; on failure falls back
    to a digit adjacent to the word "score" (so stray numbers in the
    free-text reason, e.g. "3 issues", are not mistaken for the score),
    then to the first bare 1-10 anywhere in the text.

    Returns:
        float score in [1, 10], or None if nothing parseable/in range.
    """
    score = None
    try:
        j_start = judgment.find("{")
        j_end = judgment.rfind("}") + 1
        if j_start >= 0 and j_end > j_start:
            parsed = json.loads(judgment[j_start:j_end])
            score = float(parsed.get("score", 0))
    except Exception:
        score = None
    if score is None:
        m = (re.search(r'score\D{0,10}?\b(10|[1-9])\b', judgment, re.IGNORECASE)
             or re.search(r'\b(10|[1-9])\b', judgment))
        score = float(m.group(1)) if m else None
    if score is None or score < 1 or score > 10:
        return None
    return score
HTML = r"""
<!DOCTYPE html>
@ -2881,6 +2938,7 @@ function buildConfig() {
let _runStartTime = 0;
let _runTimer = null;
let _lastRunId = null;
let _runEventCount = 0;
let _runResponseCount = 0;
let _runTotalChars = 0;
@ -3132,6 +3190,11 @@ function handleEvent(evt) {
return;
}
if (evt.type === 'done') { const bar = output.querySelector('.status-bar'); if (bar) bar.remove(); return; }
if (evt.type === 'run_saved') {
_lastRunId = evt.run_id;
document.querySelectorAll('.vote-btn').forEach(function(b) { b.disabled = false; });
return;
}
if (evt.type === 'response') {
_runResponseCount++;
_runTotalChars += (evt.text || '').length;
@ -3150,6 +3213,14 @@ function handleEvent(evt) {
label.textContent = phaseName;
output.appendChild(label);
}
// Reactive pipeline notification not a full card
if (evt.role === 'reactive') {
var note = document.createElement('div');
note.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--accent);border:1px dashed var(--accent);border-radius:2px;padding:8px 12px;margin:4px 0;opacity:0.8;font-style:italic';
note.textContent = '\u26A1 ' + evt.text;
output.appendChild(note);
return;
}
const mi = availableModels.findIndex(m => m.name === evt.model);
const color = COLORS[(mi >= 0 ? mi : 0) % COLORS.length];
const displayName = mi >= 0 ? (availableModels[mi].display_name || evt.model) : evt.model;
@ -3161,7 +3232,7 @@ function handleEvent(evt) {
const roleTag = evt.role ? `<span class="role-tag">${evt.role}</span>` : '';
const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4);
const errorLink = isError ? `<a class="error-link" href="/admin/monitor">View error details in monitor </a>` : '';
card.innerHTML = `<div class="card-header" style="cursor:pointer" onclick="openRepipe('${uid}')"><div class="dot" style="background:${isError ? 'var(--red)' : color}"></div>${displayName}${roleTag}</div><div class="card-body" id="${uid}">${escapeHtml(evt.text)}</div>${errorLink}<div class="card-actions"><button class="card-act" onclick="event.stopPropagation();copyCard('${uid}',this)">Copy</button><button class="card-act" onclick="event.stopPropagation();useAsPrompt('${uid}')">Use as Prompt</button><button class="card-act" onclick="event.stopPropagation();openRepipe('${uid}')">Iterate</button></div>`;
card.innerHTML = `<div class="card-header" style="cursor:pointer" onclick="openRepipe('${uid}')"><div class="dot" style="background:${isError ? 'var(--red)' : color}"></div>${displayName}${roleTag}</div><div class="card-body" id="${uid}">${escapeHtml(evt.text)}</div>${errorLink}<div class="card-actions"><button class="card-act" onclick="event.stopPropagation();copyCard('${uid}',this)">Copy</button><button class="card-act" onclick="event.stopPropagation();useAsPrompt('${uid}')">Use as Prompt</button><button class="card-act" onclick="event.stopPropagation();openRepipe('${uid}')">Iterate</button><span style="flex:1"></span><button class="card-act vote-btn" disabled onclick="event.stopPropagation();voteRun(this,'up')" title="Good output">\u{1F44D}</button><button class="card-act vote-btn" disabled onclick="event.stopPropagation();voteRun(this,'down')" title="Bad output">\u{1F44E}</button></div>`;
card.dataset.model = evt.model;
card.dataset.role = evt.role || '';
card.dataset.displayName = displayName;
@ -3174,6 +3245,22 @@ function handleEvent(evt) {
function escapeHtml(t) { return t.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); }
// CARD ACTIONS
// Record a thumbs up/down vote for the most recently saved run.
// On success, locks both vote buttons on the card and tints the chosen
// one green (up) or red (down). Network failures are logged only.
async function voteRun(btn, vote) {
  if (!_lastRunId) return;
  const accent = vote === 'up' ? 'var(--green)' : 'var(--red)';
  try {
    const resp = await fetch('/api/runs/' + _lastRunId + '/score', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({vote: vote})
    });
    if (!resp.ok) return;
    const siblings = btn.closest('.card-actions').querySelectorAll('.vote-btn');
    siblings.forEach(function(b) {
      b.style.opacity = '0.3';
      b.disabled = true;
    });
    btn.style.opacity = '1';
    btn.style.borderColor = accent;
    btn.style.color = accent;
  } catch (err) { console.error('Vote error:', err); }
}
function copyCard(uid, btn) {
const el = document.getElementById(uid);
if (!el) return;
@ -3584,6 +3671,7 @@ ADMIN_HTML = r"""
<div class="tab" onclick="switchTab('openrouter')">OpenRouter</div>
<div class="tab" onclick="switchTab('timeouts')">Timeouts</div>
<div class="tab" onclick="switchTab('security')">Security</div>
<div class="tab" onclick="switchTab('analytics')">Analytics</div>
</div>
<!-- PROVIDERS TAB -->
@ -3687,6 +3775,32 @@ ADMIN_HTML = r"""
<div id="allowlist"></div>
</div>
</div>
<!-- ANALYTICS TAB -->
<div id="tab-analytics" class="tab-content">
<div class="card" id="ana-coverage-card">
<h3>Scoring Coverage</h3>
<div id="ana-coverage" style="font-size:13px;color:var(--text2)">Loading...</div>
</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
<div class="card">
<h3>Score by Mode</h3>
<div id="ana-by-mode" style="font-size:12px">Loading...</div>
</div>
<div class="card">
<h3>Score by Model</h3>
<div id="ana-by-model" style="font-size:12px">Loading...</div>
</div>
</div>
<div class="card">
<h3>Model × Mode Heatmap</h3>
<div id="ana-heatmap" style="font-size:11px;overflow-x:auto">Loading...</div>
</div>
<div class="card">
<h3>Score Trend (30 days)</h3>
<div id="ana-trend" style="font-size:12px">Loading...</div>
</div>
</div>
</div>
<script>
@ -3918,6 +4032,91 @@ function switchTab(name) {
if (name === 'timeouts') renderTimeouts();
if (name === 'models') { loadOllamaModels(); renderCloudModels(); }
if (name === 'security') { loadDemoStatus(); loadAllowlist(); }
if (name === 'analytics') loadAnalytics();
}
// Populate the admin Analytics tab from /api/admin/analytics:
// scoring coverage, score-by-mode and score-by-model bar lists,
// the model x mode heatmap, and the 30-day trend sparkline.
// All DB-derived strings (mode/model names, dates, scores) are
// HTML-escaped before interpolation into innerHTML, so a hostile
// model name cannot inject markup into the admin page.
async function loadAnalytics() {
  function esc(s) { return String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
  try {
    var r = await fetch('/api/admin/analytics');
    var d = await r.json();
    if (d.error) { document.getElementById('ana-coverage').textContent = 'Error: ' + d.error; return; }
    // Coverage: scored vs total runs
    var cov = d.coverage || {};
    document.getElementById('ana-coverage').innerHTML =
      '<strong>' + (cov.scored||0) + '</strong> / ' + (cov.total||0) + ' runs scored (' +
      (cov.total ? Math.round((cov.scored||0)/(cov.total)*100) : 0) + '%)';
    // Score by Mode - horizontal bars (bar width = score * 10%)
    var modeHtml = '';
    (d.by_mode||[]).forEach(function(m) {
      var pct = Math.round((parseFloat(m.avg_score)||0) * 10);
      modeHtml += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px">' +
        '<span style="min-width:100px;font-family:JetBrains Mono,monospace;font-size:10px;text-transform:uppercase">' + esc(m.mode) + '</span>' +
        '<div style="flex:1;height:14px;background:rgba(0,0,0,0.15);border-radius:2px;overflow:hidden">' +
        '<div style="width:' + pct + '%;height:100%;background:var(--accent);border-radius:2px"></div></div>' +
        '<span style="font-family:JetBrains Mono,monospace;font-size:11px;font-weight:700;min-width:30px">' + esc(m.avg_score) + '</span>' +
        '<span style="font-size:9px;color:var(--text2)">' + m.runs + ' runs</span></div>';
    });
    document.getElementById('ana-by-mode').innerHTML = modeHtml || '<span style="color:var(--text2)">No scored runs yet</span>';
    // Score by Model
    var modelHtml = '';
    (d.by_model||[]).forEach(function(m) {
      var pct = Math.round((parseFloat(m.avg_score)||0) * 10);
      var name = m.model.length > 20 ? m.model.substring(0,18) + '..' : m.model;
      modelHtml += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px">' +
        '<span style="min-width:120px;font-family:JetBrains Mono,monospace;font-size:10px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap">' + esc(name) + '</span>' +
        '<div style="flex:1;height:14px;background:rgba(0,0,0,0.15);border-radius:2px;overflow:hidden">' +
        '<div style="width:' + pct + '%;height:100%;background:var(--green);border-radius:2px"></div></div>' +
        '<span style="font-family:JetBrains Mono,monospace;font-size:11px;font-weight:700;min-width:30px">' + esc(m.avg_score) + '</span>' +
        '<span style="font-size:9px;color:var(--text2)">' + m.runs + '</span></div>';
    });
    document.getElementById('ana-by-model').innerHTML = modelHtml || '<span style="color:var(--text2)">No data yet</span>';
    // Heatmap - one cell per model x mode pair (server requires >= 2 runs per cell)
    var hm = d.heatmap || [];
    if (hm.length) {
      var models = [...new Set(hm.map(function(h){return h.model}))];
      var modes = [...new Set(hm.map(function(h){return h.mode}))];
      var lookup = {};
      hm.forEach(function(h) { lookup[h.mode+'|'+h.model] = h.avg_score; });
      var tbl = '<table style="width:100%;border-collapse:collapse;font-family:JetBrains Mono,monospace;font-size:10px"><tr><th style="text-align:left;padding:4px">Mode</th>';
      models.forEach(function(m) { tbl += '<th style="padding:4px;text-align:center">' + esc(m.length>12?m.substring(0,10)+'..':m) + '</th>'; });
      tbl += '</tr>';
      modes.forEach(function(mode) {
        tbl += '<tr><td style="padding:4px;text-transform:uppercase;letter-spacing:0.5px">' + esc(mode) + '</td>';
        models.forEach(function(model) {
          var score = lookup[mode+'|'+model];
          // Cell tint: green >=7, amber >=5, red below; alpha scales with score.
          var bg = score ? 'rgba(' + (score >= 7 ? '74,222,128' : score >= 5 ? '226,181,90' : '224,82,82') + ',' + (parseFloat(score)/15) + ')' : 'transparent';
          tbl += '<td style="padding:4px;text-align:center;background:' + bg + ';font-weight:700">' + (score ? esc(score) : '-') + '</td>';
        });
        tbl += '</tr>';
      });
      tbl += '</table>';
      document.getElementById('ana-heatmap').innerHTML = tbl;
    } else {
      document.getElementById('ana-heatmap').innerHTML = '<span style="color:var(--text2)">Need 2+ scored runs per model/mode combination</span>';
    }
    // Trend: one bar per day; height ~ run count, color ~ avg score
    var trend = d.trend || [];
    if (trend.length) {
      var trendHtml = '<div style="display:flex;align-items:flex-end;gap:2px;height:80px">';
      var maxRuns = Math.max(...trend.map(function(t){return t.runs}));
      trend.forEach(function(t) {
        var h = Math.max(4, Math.round((t.runs/maxRuns)*70));
        var color = parseFloat(t.avg_score) >= 7 ? 'var(--green)' : parseFloat(t.avg_score) >= 5 ? 'var(--accent)' : 'var(--red)';
        trendHtml += '<div title="' + esc(t.day) + ': ' + esc(t.avg_score) + ' avg (' + t.runs + ' runs)" style="flex:1;height:' + h + 'px;background:' + color + ';border-radius:1px;min-width:4px"></div>';
      });
      trendHtml += '</div><div style="display:flex;justify-content:space-between;font-size:8px;color:var(--text2);margin-top:4px"><span>' + esc(trend[0].day) + '</span><span>' + esc(trend[trend.length-1].day) + '</span></div>';
      document.getElementById('ana-trend').innerHTML = trendHtml;
    } else {
      document.getElementById('ana-trend').innerHTML = '<span style="color:var(--text2)">No data in last 30 days</span>';
    }
  } catch(e) {
    document.getElementById('ana-coverage').textContent = 'Error loading analytics: ' + e.message;
  }
}
async function loadDemoStatus() {
@ -5795,6 +5994,63 @@ def admin_mass_ban():
# ─── ADMIN MONITOR ─────────────────────────────────────────────
@app.route("/api/admin/analytics")
@admin_required
def admin_analytics():
    """Analytics: score-by-mode, score-by-model, heatmap, trend.

    Returns JSON with five sections, all derived from team_runs.quality_score:
      by_mode  — runs / avg / stddev per mode (scored runs only)
      by_model — runs / avg per individual model; unnest(models_used) fans a
                 multi-model run out so its score credits every participant
      heatmap  — mode x model cells, only pairs with >= 2 scored runs
      trend    — daily run count and avg score for the last 30 days
      coverage — scored vs total counts over non-archived runs
    On any DB error, responds {"error": ...} with HTTP 500.
    """
    try:
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                # Per-mode quality: average and spread, best-scoring modes first.
                cur.execute("""
                    SELECT mode, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score,
                    ROUND(STDDEV(quality_score)::numeric, 2) as std_score
                    FROM team_runs WHERE quality_score IS NOT NULL
                    GROUP BY mode ORDER BY avg_score DESC
                """)
                by_mode = [dict(r) for r in cur.fetchall()]
                # Per-model quality across all modes.
                cur.execute("""
                    SELECT m as model, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL
                    GROUP BY m ORDER BY avg_score DESC
                """)
                by_model = [dict(r) for r in cur.fetchall()]
                # Heatmap cells; HAVING >= 2 filters single-sample noise.
                cur.execute("""
                    SELECT mode, m as model, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL
                    GROUP BY mode, m HAVING COUNT(*) >= 2
                    ORDER BY avg_score DESC
                """)
                heatmap = [dict(r) for r in cur.fetchall()]
                # Daily trend; dates are stringified so jsonify can serialize them.
                cur.execute("""
                    SELECT DATE(created_at) as day, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs WHERE quality_score IS NOT NULL AND created_at > NOW() - INTERVAL '30 days'
                    GROUP BY DATE(created_at) ORDER BY day
                """)
                trend = [{"day": str(r["day"]), "runs": r["runs"], "avg_score": float(r["avg_score"])} for r in cur.fetchall()]
                # NOTE(review): coverage excludes archived runs, while the score
                # aggregates above include them — confirm this asymmetry is intended.
                cur.execute("SELECT COUNT(*) as total, COUNT(quality_score) as scored FROM team_runs WHERE archived = false")
                coverage = dict(cur.fetchone())
                return jsonify({"by_mode": by_mode, "by_model": by_model, "heatmap": heatmap, "trend": trend, "coverage": coverage})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route("/api/suggest-models")
@login_required
def suggest_models():
    """Return top-performing models for a given mode based on historical scores."""
    requested_mode = request.args.get("mode", "")
    # Routing table is cached with a TTL, so this stays cheap per request.
    table = _build_routing_table()
    ranked = table.get("model_perf", {}).get(requested_mode, [])
    top_three = ranked[:3]
    return jsonify({"mode": requested_mode, "suggestions": top_three})
@app.route("/admin/monitor")
@admin_required
def monitor_page():
@ -6159,6 +6415,29 @@ def delete_run(run_id):
return jsonify({"error": str(e)}), 500
@app.route("/api/runs/<int:run_id>/score", methods=["POST"])
@login_required
def score_run(run_id):
    """User thumbs up/down on a run — overrides any auto-score.

    Maps up -> 8.0 and down -> 3.0 so user feedback lands on the same
    1-10 scale the judge model uses. score_method becomes
    'user_up'/'user_down', which the auto-scorer's UPDATE guard treats
    as final (it only rewrites NULL or 'auto' rows).

    Returns:
        {"ok": True, "score": ..., "method": ...} on success,
        400 on a bad vote value, 500 on DB failure.
    """
    data = request.json or {}
    vote = data.get("vote")
    if vote not in ("up", "down"):
        return jsonify({"error": "vote must be 'up' or 'down'"}), 400
    score = 8.0 if vote == "up" else 3.0
    method = f"user_{vote}"
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # COALESCE is required: in Postgres, NULL || jsonb is NULL,
                # which would silently drop the vote metadata the first time
                # a not-yet-scored run (score_metadata IS NULL) is voted on.
                cur.execute(
                    "UPDATE team_runs SET quality_score = %s, score_method = %s, score_metadata = COALESCE(score_metadata, '{}'::jsonb) || %s::jsonb WHERE id = %s",
                    (score, method, json.dumps({"user": session.get("username", "unknown"), "voted_at": time.time()}), run_id)
                )
            conn.commit()
        return jsonify({"ok": True, "score": score, "method": method})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
@login_required
def archive_run(run_id):
@ -7537,7 +7816,9 @@ def run_team():
_log_run(dict(run, run_id=run_id))
_active_runs.pop(run_id, None)
if collected:
save_run(mode, config.get("prompt", ""), config, collected)
rid = save_run(mode, config.get("prompt", ""), config, collected)
if rid:
yield sse({"type": "run_saved", "run_id": rid})
return Response(generate(), mimetype="text/event-stream",
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
@ -8533,6 +8814,105 @@ def run_extract(config):
_save_pipeline("extract", prompt or source, steps, result_data, all_models, start)
# ─── CROSS-RUN LEARNING ───────────────────────────────────────
_routing_table = {}      # cached {"model_perf": ..., "stage_perf": ...}
_routing_table_ts = 0    # epoch seconds of last successful rebuild
_ROUTING_TTL = 1800      # 30 minutes

def _build_routing_table():
    """Build (and cache) routing intelligence from historical scored runs.

    Returns:
        {"model_perf": {mode: [{"model", "avg_score", "runs"}, ...]},
         "stage_perf": {content_type: {"stages": ..., "runs": ...}}}
        Results are cached for _ROUTING_TTL seconds. On query failure the
        previous (possibly stale or empty) table is returned and the error
        is logged — callers always get a dict.
    """
    global _routing_table, _routing_table_ts
    now = time.time()
    # Gate on the timestamp rather than the table contents, so an
    # empty-but-fresh result (no scored runs yet) is also cached instead
    # of re-querying the DB on every call.
    if _routing_table_ts and (now - _routing_table_ts) < _ROUTING_TTL:
        return _routing_table
    try:
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                # Best models per mode: only runs scoring >= 5, and only
                # model/mode pairs with at least 2 samples.
                cur.execute("""
                    SELECT mode, m as model, ROUND(AVG(quality_score)::numeric, 2) as avg_score, COUNT(*) as runs
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL AND quality_score >= 5
                    GROUP BY mode, m HAVING COUNT(*) >= 2
                    ORDER BY mode, avg_score DESC
                """)
                model_perf = {}
                for r in cur.fetchall():
                    model_perf.setdefault(r["mode"], []).append(
                        {"model": r["model"], "avg_score": float(r["avg_score"]), "runs": r["runs"]}
                    )
                # Stage sequences used for refine pipelines, most popular first.
                cur.execute("""
                    SELECT result->>'content_type' as content_type,
                    result->'stages_run' as stages,
                    COUNT(*) as runs
                    FROM pipeline_runs
                    WHERE pipeline = 'refine' AND result->>'content_type' IS NOT NULL
                    GROUP BY result->>'content_type', result->'stages_run'
                    ORDER BY runs DESC
                """)
                stage_perf = {}
                for r in cur.fetchall():
                    ct = r["content_type"]
                    # Rows arrive ordered by popularity: keep only the first
                    # (most frequent) sequence per content type.
                    if ct and ct not in stage_perf:
                        stage_perf[ct] = {"stages": r["stages"], "runs": r["runs"]}
                _routing_table = {"model_perf": model_perf, "stage_perf": stage_perf}
                _routing_table_ts = now
    except Exception as e:
        print(f"[ROUTING] build error: {e}")
    return _routing_table
def _assess_stage(orchestrator, stage_name, stage_output, content_type):
    """Ask the orchestrator model to assess a stage's output mid-run.

    Returns a normalized dict the reactive loop can rely on:
      confidence (float), gaps (list[str]),
      severity ('low'|'medium'|'high'), suggest_stage (str|None).
    The model's JSON is untrusted — every field is type-coerced, and any
    query/parse failure yields a neutral assessment so the pipeline
    degrades to 'continue' instead of crashing.
    """
    assess_prompt = (
        f"You just reviewed the output of a {stage_name} stage on a {content_type}.\n"
        f"Assess the output briefly. Return ONLY a JSON object:\n"
        f'{{"confidence": 0.0-1.0, "gaps": ["gap1", "gap2"], "severity": "low|medium|high", "suggest_stage": null}}\n\n'
        f"If the output reveals a critical problem that needs a specific follow-up stage, set suggest_stage to one of: "
        f"VALIDATE, CRITIQUE, EXPAND, STRUCTURE, STAKEHOLDER, CLARITY, EDGE_CASES, ALIGN\n"
        f"Otherwise leave suggest_stage as null.\n\n"
        f"OUTPUT TO ASSESS:\n{stage_output[:2000]}"
    )
    fallback = {"confidence": 0.5, "gaps": [], "severity": "low", "suggest_stage": None}
    try:
        raw = safe_query(orchestrator, assess_prompt)
        text = raw.strip()
        j_start = text.find("{")
        j_end = text.rfind("}") + 1
        if j_start < 0 or j_end <= j_start:
            return fallback
        parsed = json.loads(text[j_start:j_end])
        # Coerce each field to the type _reactive_decide expects; a string
        # confidence or null gaps would otherwise crash the decision logic.
        try:
            confidence = float(parsed.get("confidence", 0.5))
        except (TypeError, ValueError):
            confidence = 0.5
        gaps = parsed.get("gaps")
        if not isinstance(gaps, list):
            gaps = []
        severity = parsed.get("severity")
        if severity not in ("low", "medium", "high"):
            severity = "low"
        suggest = parsed.get("suggest_stage")
        if not isinstance(suggest, str) or not suggest:
            suggest = None
        return {"confidence": confidence, "gaps": [str(g) for g in gaps],
                "severity": severity, "suggest_stage": suggest}
    except Exception:
        return fallback
def _reactive_decide(assessment, remaining_stages, stages_executed, max_stages):
"""Decide whether to insert, skip, or continue based on assessment."""
budget_left = max_stages - stages_executed
if budget_left <= 1:
return "continue", None, "budget exhausted"
suggested = assessment.get("suggest_stage")
severity = assessment.get("severity", "low")
confidence = assessment.get("confidence", 0.5)
# Insert a stage if the assessment suggests one and it's not already planned
if suggested and suggested not in remaining_stages and severity in ("medium", "high") and budget_left >= 2:
return "insert", suggested, f"{severity} severity — {', '.join(assessment.get('gaps', [])[:2])}"
# Skip next stage if confidence is very high and remaining stage seems redundant
if confidence > 0.9 and len(remaining_stages) > 1 and severity == "low":
next_stage = remaining_stages[0]
# Don't skip synthesis-oriented stages
if next_stage in ("EXPAND", "CLARITY"):
return "skip", next_stage, f"high confidence ({confidence:.0%}) — {next_stage} likely unnecessary"
return "continue", None, None
def run_refine(config):
"""Auto-Refine: AI analyzes content, selects the best sequence of modes, executes them, synthesizes final version."""
import time
@ -8549,10 +8929,20 @@ def run_refine(config):
yield sse({"type": "status", "message": "Analyzing content and planning refinement pipeline..."})
yield sse({"type": "progress", "current": 0, "total": 3, "label": "analyzing"})
# Inject routing intelligence from historical data
routing = _build_routing_table()
routing_context = ""
if routing.get("stage_perf"):
routing_context = "\nHISTORICAL DATA (from past successful runs):\n"
for ct, data in list(routing["stage_perf"].items())[:5]:
routing_context += f"- For '{ct}' content, sequence {data['stages']} was used {data['runs']} times\n"
routing_context += "Use this as guidance but adapt to the specific content.\n"
plan_prompt = f"""You are a refinement strategist. Analyze this content and determine the optimal sequence of refinement stages to improve it.
CONTENT TO REFINE:
{prompt[:8000]}
{routing_context}
AVAILABLE REFINEMENT STAGES (pick 3-{max_stages} in the best order):
- VALIDATE: Fact-check claims, verify accuracy, flag unsupported statements
@ -8617,11 +9007,18 @@ Pick ONLY the stages that will meaningfully improve THIS specific content. Not e
}
prev_output = ""
for si, stage in enumerate(stages):
stage_num = si + 1
worker = workers[si % len(workers)]
yield sse({"type": "progress", "current": stage_num, "total": total_stages, "label": stage.lower()})
yield sse({"type": "status", "message": f"Stage {stage_num}/{total_stages}: {stage} ({worker})..."})
remaining = list(stages)
stages_executed = 0
worker_idx = 0
while remaining and stages_executed < max_stages:
stage = remaining.pop(0)
stages_executed += 1
worker = workers[worker_idx % len(workers)]
worker_idx += 1
total_stages = stages_executed + len(remaining) + 1 # +1 for synthesis
yield sse({"type": "progress", "current": stages_executed, "total": total_stages, "label": stage.lower()})
yield sse({"type": "status", "message": f"Stage {stages_executed}: {stage} ({worker})..."})
template = stage_prompts.get(stage, "Analyze and improve this {type}:\n\n{content}")
stage_prompt = template.format(
@ -8637,13 +9034,26 @@ Pick ONLY the stages that will meaningfully improve THIS specific content. Not e
prev_output = result
steps.append({"step": stage, "model": worker, "output": result[:1000]})
# For STRUCTURE and CLARITY stages, the output replaces the working content
if stage in ("STRUCTURE", "CLARITY"):
current_content = result
# Reactive assessment — should we adjust the pipeline?
if remaining and stages_executed < max_stages - 1:
assessment = _assess_stage(orchestrator, stage, result, content_type)
decision, target, reason = _reactive_decide(assessment, remaining, stages_executed, max_stages)
if decision == "insert" and target:
remaining.insert(0, target)
yield sse({"type": "response", "model": "system", "text": f"Reactive: inserting {target} stage — {reason}", "role": "reactive"})
steps.append({"step": "reactive_insert", "target": target, "reason": reason})
elif decision == "skip" and target:
remaining.remove(target)
yield sse({"type": "response", "model": "system", "text": f"Reactive: skipping {target}{reason}", "role": "reactive"})
steps.append({"step": "reactive_skip", "target": target, "reason": reason})
except Exception as e:
yield sse({"type": "response", "model": worker, "text": f"{stage} failed: {e}", "role": "error"})
stage_outputs[stage] = f"Error: {e}"
total_stages = stages_executed + 1
# Stage 3: Final synthesis — combine all insights into the definitive refined version
yield sse({"type": "progress", "current": total_stages, "total": total_stages, "label": "synthesize"})
yield sse({"type": "status", "message": f"Final synthesis with {orchestrator}..."})