Add self-improving pipeline: auto-scoring, analytics, reactive refine, routing intelligence
Phase 1 — Run Quality Scoring: - Auto-score every run in background via qwen2.5 judge (1-10) - Thumbs up/down vote buttons on output cards - POST /api/runs/<id>/score for user feedback - run_saved SSE event enables vote buttons after run completes - User votes override auto-scores (race-condition safe) - DB: quality_score, score_method, score_metadata on team_runs Phase 1 — Analytics Dashboard: - GET /api/admin/analytics: score-by-mode, score-by-model, heatmap, trend - New Analytics tab on Admin page with bar charts, heatmap table, trend sparkline - Scoring coverage tracker (scored vs total runs) - Model × Mode heatmap with color-coded cells Phase 2 — Reactive Pipeline: - _assess_stage(): orchestrator evaluates each stage's output mid-run - _reactive_decide(): can insert/skip stages based on assessment - Dynamic stage loop replaces fixed iteration in run_refine() - Budget tracking prevents infinite loops (max_stages hard cap) - Reactive decisions render as dashed notification bars between cards - Pipeline adjusts in real-time: "Inserting VALIDATE — high severity gaps found" Phase 3 — Cross-Run Learning: - _build_routing_table(): queries historical scores for model×mode performance - Best stage sequences per content_type from pipeline_runs - Routing table cached with 30-min TTL - Auto-Refine strategist prompt augmented with historical data - GET /api/suggest-models?mode=X returns top 3 models for that mode Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c2cc211f21
commit
8ad221b41f
428
llm_team_ui.py
428
llm_team_ui.py
@ -1776,16 +1776,73 @@ def get_db():
|
|||||||
|
|
||||||
def save_run(mode, prompt, config_data, responses):
    """Persist a completed team run and kick off background auto-scoring.

    Inserts the run into team_runs and, on success, spawns a daemon thread
    that scores the run with the judge model so the caller is never blocked.

    Returns the new team_runs row id, or None if the insert failed.
    """
    # Distinct, non-empty model names that produced these responses.
    models = list({r.get("model", "") for r in responses if r.get("model")})
    run_id = None
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                    (mode, prompt, json.dumps(config_data), json.dumps(responses), models),
                )
                run_id = cur.fetchone()[0]
            conn.commit()
    except Exception as exc:
        # Best-effort persistence: a failed save must not break the run itself.
        print(f"[DB] save_run error: {exc}")
    if run_id and responses:
        # Score asynchronously so the SSE stream finishes without waiting on the judge.
        threading.Thread(
            target=_auto_score_run,
            args=(run_id, mode, prompt, responses),
            daemon=True,
        ).start()
    return run_id
|
||||||
|
|
||||||
|
|
||||||
|
# ─── AUTO-SCORING ENGINE ─────────────────────────────────────
# Judge model used to grade run outputs on a 1-10 scale.
_SCORE_MODEL = "qwen2.5:latest"


def _auto_score_run(run_id, mode, prompt, responses):
    """Background worker: auto-score a completed run via the judge model.

    Picks the longest non-error response as representative, asks the judge
    for a 1-10 JSON verdict, and writes it to team_runs.  The UPDATE is
    guarded so a user vote (score_method 'user_*') is never overwritten —
    this is what makes auto-scoring race-safe against the vote endpoint.
    """
    try:
        # Representative output: longest non-error response.
        candidates = [r for r in responses if r.get("role") != "error" and r.get("text")]
        if not candidates:
            return
        best = max(candidates, key=lambda r: len(r.get("text", "")))
        text = best["text"][:3000]

        judge_prompt = (
            f"Rate the quality of this AI response on a scale of 1-10.\n"
            f"Consider: relevance to the prompt, completeness, accuracy, clarity, usefulness.\n\n"
            f"PROMPT: {prompt[:500]}\n\n"
            f"MODE: {mode}\n\n"
            f"RESPONSE:\n{text}\n\n"
            f"Return ONLY a JSON object: {{\"score\": N, \"reason\": \"one sentence\"}}"
        )
        judgment = query_model(_SCORE_MODEL, judge_prompt)

        # Parse the score: try the embedded JSON object first, then fall back
        # to the first bare 1-10 integer anywhere in the raw judgment text.
        score = None
        try:
            j_start = judgment.find("{")
            j_end = judgment.rfind("}") + 1
            if j_start >= 0 and j_end > j_start:
                parsed = json.loads(judgment[j_start:j_end])
                # BUGFIX: only accept an explicitly present score.  Previously a
                # missing "score" key defaulted to 0.0, which both skipped the
                # regex fallback and then failed the 1-10 range check below,
                # leaving the run permanently unscored.
                if parsed.get("score") is not None:
                    score = float(parsed["score"])
        except Exception:
            pass
        if score is None:
            m = re.search(r'\b([1-9]|10)\b', judgment)
            score = float(m.group(1)) if m else None
        if score is None or score < 1 or score > 10:
            return

        with get_db() as conn:
            with conn.cursor() as cur:
                # WHERE guard: never clobber a user vote with an auto-score.
                cur.execute(
                    "UPDATE team_runs SET quality_score = %s, score_method = 'auto', score_metadata = %s WHERE id = %s AND (score_method IS NULL OR score_method = 'auto')",
                    (score, json.dumps({"judge": _SCORE_MODEL, "judgment": judgment[:500], "scored_model": best.get("model", ""), "reason": judgment[:200]}), run_id)
                )
            conn.commit()
        print(f"[SCORE] run {run_id} scored {score}/10 by {_SCORE_MODEL}")
    except Exception as e:
        print(f"[SCORE] auto-score error for run {run_id}: {e}")
|
||||||
|
|
||||||
|
|
||||||
HTML = r"""
|
HTML = r"""
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
@ -2881,6 +2938,7 @@ function buildConfig() {
|
|||||||
|
|
||||||
let _runStartTime = 0;
|
let _runStartTime = 0;
|
||||||
let _runTimer = null;
|
let _runTimer = null;
|
||||||
|
let _lastRunId = null;
|
||||||
let _runEventCount = 0;
|
let _runEventCount = 0;
|
||||||
let _runResponseCount = 0;
|
let _runResponseCount = 0;
|
||||||
let _runTotalChars = 0;
|
let _runTotalChars = 0;
|
||||||
@ -3132,6 +3190,11 @@ function handleEvent(evt) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (evt.type === 'done') { const bar = output.querySelector('.status-bar'); if (bar) bar.remove(); return; }
|
if (evt.type === 'done') { const bar = output.querySelector('.status-bar'); if (bar) bar.remove(); return; }
|
||||||
|
if (evt.type === 'run_saved') {
|
||||||
|
_lastRunId = evt.run_id;
|
||||||
|
document.querySelectorAll('.vote-btn').forEach(function(b) { b.disabled = false; });
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (evt.type === 'response') {
|
if (evt.type === 'response') {
|
||||||
_runResponseCount++;
|
_runResponseCount++;
|
||||||
_runTotalChars += (evt.text || '').length;
|
_runTotalChars += (evt.text || '').length;
|
||||||
@ -3150,6 +3213,14 @@ function handleEvent(evt) {
|
|||||||
label.textContent = phaseName;
|
label.textContent = phaseName;
|
||||||
output.appendChild(label);
|
output.appendChild(label);
|
||||||
}
|
}
|
||||||
|
// Reactive pipeline notification — not a full card
|
||||||
|
if (evt.role === 'reactive') {
|
||||||
|
var note = document.createElement('div');
|
||||||
|
note.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--accent);border:1px dashed var(--accent);border-radius:2px;padding:8px 12px;margin:4px 0;opacity:0.8;font-style:italic';
|
||||||
|
note.textContent = '\u26A1 ' + evt.text;
|
||||||
|
output.appendChild(note);
|
||||||
|
return;
|
||||||
|
}
|
||||||
const mi = availableModels.findIndex(m => m.name === evt.model);
|
const mi = availableModels.findIndex(m => m.name === evt.model);
|
||||||
const color = COLORS[(mi >= 0 ? mi : 0) % COLORS.length];
|
const color = COLORS[(mi >= 0 ? mi : 0) % COLORS.length];
|
||||||
const displayName = mi >= 0 ? (availableModels[mi].display_name || evt.model) : evt.model;
|
const displayName = mi >= 0 ? (availableModels[mi].display_name || evt.model) : evt.model;
|
||||||
@ -3161,7 +3232,7 @@ function handleEvent(evt) {
|
|||||||
const roleTag = evt.role ? `<span class="role-tag">${evt.role}</span>` : '';
|
const roleTag = evt.role ? `<span class="role-tag">${evt.role}</span>` : '';
|
||||||
const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4);
|
const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4);
|
||||||
const errorLink = isError ? `<a class="error-link" href="/admin/monitor">View error details in monitor →</a>` : '';
|
const errorLink = isError ? `<a class="error-link" href="/admin/monitor">View error details in monitor →</a>` : '';
|
||||||
card.innerHTML = `<div class="card-header" style="cursor:pointer" onclick="openRepipe('${uid}')"><div class="dot" style="background:${isError ? 'var(--red)' : color}"></div>${displayName}${roleTag}</div><div class="card-body" id="${uid}">${escapeHtml(evt.text)}</div>${errorLink}<div class="card-actions"><button class="card-act" onclick="event.stopPropagation();copyCard('${uid}',this)">Copy</button><button class="card-act" onclick="event.stopPropagation();useAsPrompt('${uid}')">Use as Prompt</button><button class="card-act" onclick="event.stopPropagation();openRepipe('${uid}')">Iterate</button></div>`;
|
card.innerHTML = `<div class="card-header" style="cursor:pointer" onclick="openRepipe('${uid}')"><div class="dot" style="background:${isError ? 'var(--red)' : color}"></div>${displayName}${roleTag}</div><div class="card-body" id="${uid}">${escapeHtml(evt.text)}</div>${errorLink}<div class="card-actions"><button class="card-act" onclick="event.stopPropagation();copyCard('${uid}',this)">Copy</button><button class="card-act" onclick="event.stopPropagation();useAsPrompt('${uid}')">Use as Prompt</button><button class="card-act" onclick="event.stopPropagation();openRepipe('${uid}')">Iterate</button><span style="flex:1"></span><button class="card-act vote-btn" disabled onclick="event.stopPropagation();voteRun(this,'up')" title="Good output">\u{1F44D}</button><button class="card-act vote-btn" disabled onclick="event.stopPropagation();voteRun(this,'down')" title="Bad output">\u{1F44E}</button></div>`;
|
||||||
card.dataset.model = evt.model;
|
card.dataset.model = evt.model;
|
||||||
card.dataset.role = evt.role || '';
|
card.dataset.role = evt.role || '';
|
||||||
card.dataset.displayName = displayName;
|
card.dataset.displayName = displayName;
|
||||||
@ -3174,6 +3245,22 @@ function handleEvent(evt) {
|
|||||||
function escapeHtml(t) { return t.replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>'); }
|
function escapeHtml(t) { return t.replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>'); }
|
||||||
|
|
||||||
// ─── CARD ACTIONS ────────────────────────────────────
|
// ─── CARD ACTIONS ────────────────────────────────────
|
||||||
|
// POST the user's thumbs up/down for the most recently saved run.
// Vote buttons stay disabled until the run_saved SSE event sets _lastRunId,
// so a vote can never target a run that was not persisted.
async function voteRun(btn, vote) {
  if (!_lastRunId) return;
  try {
    const r = await fetch('/api/runs/' + _lastRunId + '/score', {
      method: 'POST', headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({vote: vote})
    });
    if (r.ok) {
      // Lock in the vote: dim + disable both buttons on this card,
      // then highlight only the one that was clicked.
      btn.closest('.card-actions').querySelectorAll('.vote-btn').forEach(function(b) { b.style.opacity = '0.3'; b.disabled = true; });
      btn.style.opacity = '1';
      btn.style.borderColor = vote === 'up' ? 'var(--green)' : 'var(--red)';
      btn.style.color = vote === 'up' ? 'var(--green)' : 'var(--red)';
    }
  } catch(e) { console.error('Vote error:', e); }
}
|
||||||
|
|
||||||
function copyCard(uid, btn) {
|
function copyCard(uid, btn) {
|
||||||
const el = document.getElementById(uid);
|
const el = document.getElementById(uid);
|
||||||
if (!el) return;
|
if (!el) return;
|
||||||
@ -3584,6 +3671,7 @@ ADMIN_HTML = r"""
|
|||||||
<div class="tab" onclick="switchTab('openrouter')">OpenRouter</div>
|
<div class="tab" onclick="switchTab('openrouter')">OpenRouter</div>
|
||||||
<div class="tab" onclick="switchTab('timeouts')">Timeouts</div>
|
<div class="tab" onclick="switchTab('timeouts')">Timeouts</div>
|
||||||
<div class="tab" onclick="switchTab('security')">Security</div>
|
<div class="tab" onclick="switchTab('security')">Security</div>
|
||||||
|
<div class="tab" onclick="switchTab('analytics')">Analytics</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- PROVIDERS TAB -->
|
<!-- PROVIDERS TAB -->
|
||||||
@ -3687,6 +3775,32 @@ ADMIN_HTML = r"""
|
|||||||
<div id="allowlist"></div>
|
<div id="allowlist"></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- ANALYTICS TAB -->
|
||||||
|
<div id="tab-analytics" class="tab-content">
|
||||||
|
<div class="card" id="ana-coverage-card">
|
||||||
|
<h3>Scoring Coverage</h3>
|
||||||
|
<div id="ana-coverage" style="font-size:13px;color:var(--text2)">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
|
||||||
|
<div class="card">
|
||||||
|
<h3>Score by Mode</h3>
|
||||||
|
<div id="ana-by-mode" style="font-size:12px">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<h3>Score by Model</h3>
|
||||||
|
<div id="ana-by-model" style="font-size:12px">Loading...</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<h3>Model × Mode Heatmap</h3>
|
||||||
|
<div id="ana-heatmap" style="font-size:11px;overflow-x:auto">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<h3>Score Trend (30 days)</h3>
|
||||||
|
<div id="ana-trend" style="font-size:12px">Loading...</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
@ -3918,6 +4032,91 @@ function switchTab(name) {
|
|||||||
if (name === 'timeouts') renderTimeouts();
|
if (name === 'timeouts') renderTimeouts();
|
||||||
if (name === 'models') { loadOllamaModels(); renderCloudModels(); }
|
if (name === 'models') { loadOllamaModels(); renderCloudModels(); }
|
||||||
if (name === 'security') { loadDemoStatus(); loadAllowlist(); }
|
if (name === 'security') { loadDemoStatus(); loadAllowlist(); }
|
||||||
|
if (name === 'analytics') loadAnalytics();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetch /api/admin/analytics and render the four Analytics-tab widgets:
// coverage counter, per-mode bars, per-model bars, model×mode heatmap,
// and the 30-day trend sparkline.  Server payload fields are assumed to be
// {coverage, by_mode, by_model, heatmap, trend} as produced by admin_analytics.
async function loadAnalytics() {
  try {
    var r = await fetch('/api/admin/analytics');
    var d = await r.json();
    if (d.error) { document.getElementById('ana-coverage').textContent = 'Error: ' + d.error; return; }

    // Coverage: scored runs vs total non-archived runs.
    var cov = d.coverage || {};
    document.getElementById('ana-coverage').innerHTML =
      '<strong>' + (cov.scored||0) + '</strong> / ' + (cov.total||0) + ' runs scored (' +
      (cov.total ? Math.round((cov.scored||0)/(cov.total)*100) : 0) + '%)';

    // Score by Mode - horizontal bars.
    var modeHtml = '';
    (d.by_mode||[]).forEach(function(m) {
      // avg_score is 1-10, so *10 maps it onto a 0-100% bar width.
      var pct = Math.round((parseFloat(m.avg_score)||0) * 10);
      modeHtml += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px">' +
        '<span style="min-width:100px;font-family:JetBrains Mono,monospace;font-size:10px;text-transform:uppercase">' + m.mode + '</span>' +
        '<div style="flex:1;height:14px;background:rgba(0,0,0,0.15);border-radius:2px;overflow:hidden">' +
        '<div style="width:' + pct + '%;height:100%;background:var(--accent);border-radius:2px"></div></div>' +
        '<span style="font-family:JetBrains Mono,monospace;font-size:11px;font-weight:700;min-width:30px">' + m.avg_score + '</span>' +
        '<span style="font-size:9px;color:var(--text2)">' + m.runs + ' runs</span></div>';
    });
    document.getElementById('ana-by-mode').innerHTML = modeHtml || '<span style="color:var(--text2)">No scored runs yet</span>';

    // Score by Model - same bar layout, truncated model names.
    var modelHtml = '';
    (d.by_model||[]).forEach(function(m) {
      var pct = Math.round((parseFloat(m.avg_score)||0) * 10);
      var name = m.model.length > 20 ? m.model.substring(0,18) + '..' : m.model;
      modelHtml += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px">' +
        '<span style="min-width:120px;font-family:JetBrains Mono,monospace;font-size:10px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap">' + name + '</span>' +
        '<div style="flex:1;height:14px;background:rgba(0,0,0,0.15);border-radius:2px;overflow:hidden">' +
        '<div style="width:' + pct + '%;height:100%;background:var(--green);border-radius:2px"></div></div>' +
        '<span style="font-family:JetBrains Mono,monospace;font-size:11px;font-weight:700;min-width:30px">' + m.avg_score + '</span>' +
        '<span style="font-size:9px;color:var(--text2)">' + m.runs + '</span></div>';
    });
    document.getElementById('ana-by-model').innerHTML = modelHtml || '<span style="color:var(--text2)">No data yet</span>';

    // Heatmap - table of mode rows × model columns.
    var hm = d.heatmap || [];
    if (hm.length) {
      var models = [...new Set(hm.map(function(h){return h.model}))];
      var modes = [...new Set(hm.map(function(h){return h.mode}))];
      var lookup = {};
      hm.forEach(function(h) { lookup[h.mode+'|'+h.model] = h.avg_score; });
      var tbl = '<table style="width:100%;border-collapse:collapse;font-family:JetBrains Mono,monospace;font-size:10px"><tr><th style="text-align:left;padding:4px">Mode</th>';
      models.forEach(function(m) { tbl += '<th style="padding:4px;text-align:center">' + (m.length>12?m.substring(0,10)+'..':m) + '</th>'; });
      tbl += '</tr>';
      modes.forEach(function(mode) {
        tbl += '<tr><td style="padding:4px;text-transform:uppercase;letter-spacing:0.5px">' + mode + '</td>';
        models.forEach(function(model) {
          var score = lookup[mode+'|'+model];
          // Cell tint: green >=7, amber >=5, red otherwise; alpha = score/15
          // so even a perfect 10 stays translucent (~0.67).
          var bg = score ? 'rgba(' + (score >= 7 ? '74,222,128' : score >= 5 ? '226,181,90' : '224,82,82') + ',' + (parseFloat(score)/15) + ')' : 'transparent';
          tbl += '<td style="padding:4px;text-align:center;background:' + bg + ';font-weight:700">' + (score || '-') + '</td>';
        });
        tbl += '</tr>';
      });
      tbl += '</table>';
      document.getElementById('ana-heatmap').innerHTML = tbl;
    } else {
      document.getElementById('ana-heatmap').innerHTML = '<span style="color:var(--text2)">Need 2+ scored runs per model/mode combination</span>';
    }

    // Trend: one bar per day; bar HEIGHT encodes run count, COLOR encodes avg score.
    var trend = d.trend || [];
    if (trend.length) {
      var trendHtml = '<div style="display:flex;align-items:flex-end;gap:2px;height:80px">';
      var maxRuns = Math.max(...trend.map(function(t){return t.runs}));
      trend.forEach(function(t) {
        var h = Math.max(4, Math.round((t.runs/maxRuns)*70));
        var color = parseFloat(t.avg_score) >= 7 ? 'var(--green)' : parseFloat(t.avg_score) >= 5 ? 'var(--accent)' : 'var(--red)';
        trendHtml += '<div title="' + t.day + ': ' + t.avg_score + ' avg (' + t.runs + ' runs)" style="flex:1;height:' + h + 'px;background:' + color + ';border-radius:1px;min-width:4px"></div>';
      });
      trendHtml += '</div><div style="display:flex;justify-content:space-between;font-size:8px;color:var(--text2);margin-top:4px"><span>' + trend[0].day + '</span><span>' + trend[trend.length-1].day + '</span></div>';
      document.getElementById('ana-trend').innerHTML = trendHtml;
    } else {
      document.getElementById('ana-trend').innerHTML = '<span style="color:var(--text2)">No data in last 30 days</span>';
    }
  } catch(e) {
    document.getElementById('ana-coverage').textContent = 'Error loading analytics: ' + e.message;
  }
}
|
}
|
||||||
|
|
||||||
async function loadDemoStatus() {
|
async function loadDemoStatus() {
|
||||||
@ -5795,6 +5994,63 @@ def admin_mass_ban():
|
|||||||
|
|
||||||
# ─── ADMIN MONITOR ─────────────────────────────────────────────
|
# ─── ADMIN MONITOR ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
@app.route("/api/admin/analytics")
@admin_required
def admin_analytics():
    """Analytics: score-by-mode, score-by-model, heatmap, trend.

    Returns JSON with:
      by_mode  — avg/stddev quality per mode (scored runs only)
      by_model — avg quality per model; models_used is an array column
      heatmap  — avg quality per (mode, model) pair with >= 2 runs
      trend    — daily avg quality over the last 30 days
      coverage — scored vs total counts over non-archived runs
    On any failure, a 500 with {"error": ...} is returned.
    """
    try:
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute("""
                    SELECT mode, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score,
                           ROUND(STDDEV(quality_score)::numeric, 2) as std_score
                    FROM team_runs WHERE quality_score IS NOT NULL
                    GROUP BY mode ORDER BY avg_score DESC
                """)
                by_mode = [dict(r) for r in cur.fetchall()]

                # unnest() expands the models_used array so every model that
                # participated in a run is credited with that run's score.
                cur.execute("""
                    SELECT m as model, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL
                    GROUP BY m ORDER BY avg_score DESC
                """)
                by_model = [dict(r) for r in cur.fetchall()]

                # HAVING COUNT(*) >= 2 keeps noisy single-run cells out of the heatmap.
                cur.execute("""
                    SELECT mode, m as model, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL
                    GROUP BY mode, m HAVING COUNT(*) >= 2
                    ORDER BY avg_score DESC
                """)
                heatmap = [dict(r) for r in cur.fetchall()]

                cur.execute("""
                    SELECT DATE(created_at) as day, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs WHERE quality_score IS NOT NULL AND created_at > NOW() - INTERVAL '30 days'
                    GROUP BY DATE(created_at) ORDER BY day
                """)
                # day comes back as a date object and avg_score as Decimal —
                # convert both so jsonify can serialize them.
                trend = [{"day": str(r["day"]), "runs": r["runs"], "avg_score": float(r["avg_score"])} for r in cur.fetchall()]

                # COUNT(quality_score) counts only non-NULL scores, giving
                # "scored" vs "total" in a single query.
                cur.execute("SELECT COUNT(*) as total, COUNT(quality_score) as scored FROM team_runs WHERE archived = false")
                coverage = dict(cur.fetchone())

        return jsonify({"by_mode": by_mode, "by_model": by_model, "heatmap": heatmap, "trend": trend, "coverage": coverage})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/suggest-models")
@login_required
def suggest_models():
    """Return top-performing models for a given mode based on historical scores."""
    mode = request.args.get("mode", "")
    # The routing table is cached with a 30-minute TTL, so this stays cheap.
    per_mode = _build_routing_table().get("model_perf", {})
    top_three = per_mode.get(mode, [])[:3]
    return jsonify({"mode": mode, "suggestions": top_three})
|
||||||
|
|
||||||
|
|
||||||
@app.route("/admin/monitor")
|
@app.route("/admin/monitor")
|
||||||
@admin_required
|
@admin_required
|
||||||
def monitor_page():
|
def monitor_page():
|
||||||
@ -6159,6 +6415,29 @@ def delete_run(run_id):
|
|||||||
return jsonify({"error": str(e)}), 500
|
return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/runs/<int:run_id>/score", methods=["POST"])
@login_required
def score_run(run_id):
    """User thumbs up/down on a run — overrides any auto-score.

    Maps 'up' -> 8.0 and 'down' -> 3.0, records who voted and when in
    score_metadata.  Returns 400 for an invalid vote, 404 if the run id
    does not exist, 500 on database failure.
    """
    data = request.json or {}
    vote = data.get("vote")
    if vote not in ("up", "down"):
        return jsonify({"error": "vote must be 'up' or 'down'"}), 400
    score = 8.0 if vote == "up" else 3.0
    method = f"user_{vote}"
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # BUGFIX: NULL || x is NULL in SQL, so concatenating vote
                # metadata onto a never-auto-scored run silently wiped it;
                # coalesce to an empty object first.  (Assumes score_metadata
                # is a json/jsonb or text column — the untyped '{}' literal
                # coerces either way; confirm against the migration.)
                cur.execute(
                    "UPDATE team_runs SET quality_score = %s, score_method = %s, score_metadata = COALESCE(score_metadata, '{}') || %s WHERE id = %s",
                    (score, method, json.dumps({"user": session.get("username", "unknown"), "voted_at": time.time()}), run_id)
                )
                updated = cur.rowcount
            conn.commit()
        if not updated:
            # No row matched: the run id is unknown (or was deleted).
            return jsonify({"error": "run not found"}), 404
        return jsonify({"ok": True, "score": score, "method": method})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
|
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
|
||||||
@login_required
|
@login_required
|
||||||
def archive_run(run_id):
|
def archive_run(run_id):
|
||||||
@ -7537,7 +7816,9 @@ def run_team():
|
|||||||
_log_run(dict(run, run_id=run_id))
|
_log_run(dict(run, run_id=run_id))
|
||||||
_active_runs.pop(run_id, None)
|
_active_runs.pop(run_id, None)
|
||||||
if collected:
|
if collected:
|
||||||
save_run(mode, config.get("prompt", ""), config, collected)
|
rid = save_run(mode, config.get("prompt", ""), config, collected)
|
||||||
|
if rid:
|
||||||
|
yield sse({"type": "run_saved", "run_id": rid})
|
||||||
|
|
||||||
return Response(generate(), mimetype="text/event-stream",
|
return Response(generate(), mimetype="text/event-stream",
|
||||||
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
|
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
|
||||||
@ -8533,6 +8814,105 @@ def run_extract(config):
|
|||||||
_save_pipeline("extract", prompt or source, steps, result_data, all_models, start)
|
_save_pipeline("extract", prompt or source, steps, result_data, all_models, start)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CROSS-RUN LEARNING ───────────────────────────────────────
# Module-level cache of routing intelligence derived from scored runs.
_routing_table = {}
_routing_table_ts = 0  # epoch seconds of the last successful build
_ROUTING_TTL = 1800  # 30 minutes

def _build_routing_table():
    """Build routing intelligence from historical scored runs.

    Returns {"model_perf": {mode: [{model, avg_score, runs}, ...] best-first},
             "stage_perf": {content_type: {"stages": ..., "runs": ...}}}.
    Cached at module level with a 30-minute TTL; on query failure the
    previous (possibly empty) table is returned unchanged.
    NOTE(review): concurrent callers may rebuild simultaneously — benign,
    last writer wins; confirm that is acceptable under load.
    """
    global _routing_table, _routing_table_ts
    now = time.time()
    # Serve the cached table while it is still fresh.
    if _routing_table and (now - _routing_table_ts) < _ROUTING_TTL:
        return _routing_table
    try:
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                # Best model per mode: only runs scoring >= 5 and pairs seen
                # at least twice count, to filter out noise.
                cur.execute("""
                    SELECT mode, m as model, ROUND(AVG(quality_score)::numeric, 2) as avg_score, COUNT(*) as runs
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL AND quality_score >= 5
                    GROUP BY mode, m HAVING COUNT(*) >= 2
                    ORDER BY mode, avg_score DESC
                """)
                model_perf = {}
                # Rows arrive ordered avg_score DESC within each mode, so each
                # mode's list is already best-first for suggest_models.
                for r in cur.fetchall():
                    mode = r["mode"]
                    if mode not in model_perf:
                        model_perf[mode] = []
                    model_perf[mode].append({"model": r["model"], "avg_score": float(r["avg_score"]), "runs": r["runs"]})

                # Best stage sequences for refine pipelines.
                cur.execute("""
                    SELECT result->>'content_type' as content_type,
                           result->'stages_run' as stages,
                           COUNT(*) as runs
                    FROM pipeline_runs
                    WHERE pipeline = 'refine' AND result->>'content_type' IS NOT NULL
                    GROUP BY result->>'content_type', result->'stages_run'
                    ORDER BY runs DESC
                """)
                stage_perf = {}
                # ORDER BY runs DESC + first-seen-wins keeps only the most
                # frequently used sequence per content type.  NOTE(review):
                # frequency, not quality_score, ranks sequences here — confirm
                # that is intended.
                for r in cur.fetchall():
                    ct = r["content_type"]
                    if ct and ct not in stage_perf:
                        stage_perf[ct] = {"stages": r["stages"], "runs": r["runs"]}

        # Publish atomically only after both queries succeeded.
        _routing_table = {"model_perf": model_perf, "stage_perf": stage_perf}
        _routing_table_ts = now
    except Exception as e:
        print(f"[ROUTING] build error: {e}")
    return _routing_table
|
||||||
|
|
||||||
|
|
||||||
|
def _assess_stage(orchestrator, stage_name, stage_output, content_type):
    """Assess a stage's output — returns structured metadata for reactive decisions.

    Asks the orchestrator model for a JSON verdict on the stage output; any
    failure (query error, unparseable reply) falls back to a neutral,
    low-severity assessment so the pipeline never stalls on assessment.
    """
    assess_prompt = (
        f"You just reviewed the output of a {stage_name} stage on a {content_type}.\n"
        f"Assess the output briefly. Return ONLY a JSON object:\n"
        f'{{"confidence": 0.0-1.0, "gaps": ["gap1", "gap2"], "severity": "low|medium|high", "suggest_stage": null}}\n\n'
        f"If the output reveals a critical problem that needs a specific follow-up stage, set suggest_stage to one of: "
        f"VALIDATE, CRITIQUE, EXPAND, STRUCTURE, STAKEHOLDER, CLARITY, EDGE_CASES, ALIGN\n"
        f"Otherwise leave suggest_stage as null.\n\n"
        f"OUTPUT TO ASSESS:\n{stage_output[:2000]}"
    )
    try:
        reply = safe_query(orchestrator, assess_prompt).strip()
        # Extract the outermost {...} span from the reply and parse it.
        start = reply.find("{")
        stop = reply.rfind("}") + 1
        if start >= 0 and stop > start:
            return json.loads(reply[start:stop])
    except Exception:
        pass
    # Neutral default: keep the pipeline moving when assessment fails.
    return {"confidence": 0.5, "gaps": [], "severity": "low", "suggest_stage": None}
|
||||||
|
|
||||||
|
|
||||||
|
def _reactive_decide(assessment, remaining_stages, stages_executed, max_stages):
|
||||||
|
"""Decide whether to insert, skip, or continue based on assessment."""
|
||||||
|
budget_left = max_stages - stages_executed
|
||||||
|
if budget_left <= 1:
|
||||||
|
return "continue", None, "budget exhausted"
|
||||||
|
|
||||||
|
suggested = assessment.get("suggest_stage")
|
||||||
|
severity = assessment.get("severity", "low")
|
||||||
|
confidence = assessment.get("confidence", 0.5)
|
||||||
|
|
||||||
|
# Insert a stage if the assessment suggests one and it's not already planned
|
||||||
|
if suggested and suggested not in remaining_stages and severity in ("medium", "high") and budget_left >= 2:
|
||||||
|
return "insert", suggested, f"{severity} severity — {', '.join(assessment.get('gaps', [])[:2])}"
|
||||||
|
|
||||||
|
# Skip next stage if confidence is very high and remaining stage seems redundant
|
||||||
|
if confidence > 0.9 and len(remaining_stages) > 1 and severity == "low":
|
||||||
|
next_stage = remaining_stages[0]
|
||||||
|
# Don't skip synthesis-oriented stages
|
||||||
|
if next_stage in ("EXPAND", "CLARITY"):
|
||||||
|
return "skip", next_stage, f"high confidence ({confidence:.0%}) — {next_stage} likely unnecessary"
|
||||||
|
|
||||||
|
return "continue", None, None
|
||||||
|
|
||||||
|
|
||||||
def run_refine(config):
|
def run_refine(config):
|
||||||
"""Auto-Refine: AI analyzes content, selects the best sequence of modes, executes them, synthesizes final version."""
|
"""Auto-Refine: AI analyzes content, selects the best sequence of modes, executes them, synthesizes final version."""
|
||||||
import time
|
import time
|
||||||
@ -8549,10 +8929,20 @@ def run_refine(config):
|
|||||||
yield sse({"type": "status", "message": "Analyzing content and planning refinement pipeline..."})
|
yield sse({"type": "status", "message": "Analyzing content and planning refinement pipeline..."})
|
||||||
yield sse({"type": "progress", "current": 0, "total": 3, "label": "analyzing"})
|
yield sse({"type": "progress", "current": 0, "total": 3, "label": "analyzing"})
|
||||||
|
|
||||||
|
# Inject routing intelligence from historical data
|
||||||
|
routing = _build_routing_table()
|
||||||
|
routing_context = ""
|
||||||
|
if routing.get("stage_perf"):
|
||||||
|
routing_context = "\nHISTORICAL DATA (from past successful runs):\n"
|
||||||
|
for ct, data in list(routing["stage_perf"].items())[:5]:
|
||||||
|
routing_context += f"- For '{ct}' content, sequence {data['stages']} was used {data['runs']} times\n"
|
||||||
|
routing_context += "Use this as guidance but adapt to the specific content.\n"
|
||||||
|
|
||||||
plan_prompt = f"""You are a refinement strategist. Analyze this content and determine the optimal sequence of refinement stages to improve it.
|
plan_prompt = f"""You are a refinement strategist. Analyze this content and determine the optimal sequence of refinement stages to improve it.
|
||||||
|
|
||||||
CONTENT TO REFINE:
|
CONTENT TO REFINE:
|
||||||
{prompt[:8000]}
|
{prompt[:8000]}
|
||||||
|
{routing_context}
|
||||||
|
|
||||||
AVAILABLE REFINEMENT STAGES (pick 3-{max_stages} in the best order):
|
AVAILABLE REFINEMENT STAGES (pick 3-{max_stages} in the best order):
|
||||||
- VALIDATE: Fact-check claims, verify accuracy, flag unsupported statements
|
- VALIDATE: Fact-check claims, verify accuracy, flag unsupported statements
|
||||||
@ -8617,11 +9007,18 @@ Pick ONLY the stages that will meaningfully improve THIS specific content. Not e
|
|||||||
}
|
}
|
||||||
|
|
||||||
prev_output = ""
|
prev_output = ""
|
||||||
for si, stage in enumerate(stages):
|
remaining = list(stages)
|
||||||
stage_num = si + 1
|
stages_executed = 0
|
||||||
worker = workers[si % len(workers)]
|
worker_idx = 0
|
||||||
yield sse({"type": "progress", "current": stage_num, "total": total_stages, "label": stage.lower()})
|
|
||||||
yield sse({"type": "status", "message": f"Stage {stage_num}/{total_stages}: {stage} ({worker})..."})
|
while remaining and stages_executed < max_stages:
|
||||||
|
stage = remaining.pop(0)
|
||||||
|
stages_executed += 1
|
||||||
|
worker = workers[worker_idx % len(workers)]
|
||||||
|
worker_idx += 1
|
||||||
|
total_stages = stages_executed + len(remaining) + 1 # +1 for synthesis
|
||||||
|
yield sse({"type": "progress", "current": stages_executed, "total": total_stages, "label": stage.lower()})
|
||||||
|
yield sse({"type": "status", "message": f"Stage {stages_executed}: {stage} ({worker})..."})
|
||||||
|
|
||||||
template = stage_prompts.get(stage, "Analyze and improve this {type}:\n\n{content}")
|
template = stage_prompts.get(stage, "Analyze and improve this {type}:\n\n{content}")
|
||||||
stage_prompt = template.format(
|
stage_prompt = template.format(
|
||||||
@ -8637,13 +9034,26 @@ Pick ONLY the stages that will meaningfully improve THIS specific content. Not e
|
|||||||
prev_output = result
|
prev_output = result
|
||||||
steps.append({"step": stage, "model": worker, "output": result[:1000]})
|
steps.append({"step": stage, "model": worker, "output": result[:1000]})
|
||||||
|
|
||||||
# For STRUCTURE and CLARITY stages, the output replaces the working content
|
|
||||||
if stage in ("STRUCTURE", "CLARITY"):
|
if stage in ("STRUCTURE", "CLARITY"):
|
||||||
current_content = result
|
current_content = result
|
||||||
|
|
||||||
|
# Reactive assessment — should we adjust the pipeline?
|
||||||
|
if remaining and stages_executed < max_stages - 1:
|
||||||
|
assessment = _assess_stage(orchestrator, stage, result, content_type)
|
||||||
|
decision, target, reason = _reactive_decide(assessment, remaining, stages_executed, max_stages)
|
||||||
|
if decision == "insert" and target:
|
||||||
|
remaining.insert(0, target)
|
||||||
|
yield sse({"type": "response", "model": "system", "text": f"Reactive: inserting {target} stage — {reason}", "role": "reactive"})
|
||||||
|
steps.append({"step": "reactive_insert", "target": target, "reason": reason})
|
||||||
|
elif decision == "skip" and target:
|
||||||
|
remaining.remove(target)
|
||||||
|
yield sse({"type": "response", "model": "system", "text": f"Reactive: skipping {target} — {reason}", "role": "reactive"})
|
||||||
|
steps.append({"step": "reactive_skip", "target": target, "reason": reason})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
yield sse({"type": "response", "model": worker, "text": f"{stage} failed: {e}", "role": "error"})
|
yield sse({"type": "response", "model": worker, "text": f"{stage} failed: {e}", "role": "error"})
|
||||||
stage_outputs[stage] = f"Error: {e}"
|
stage_outputs[stage] = f"Error: {e}"
|
||||||
|
|
||||||
|
total_stages = stages_executed + 1
|
||||||
# Stage 3: Final synthesis — combine all insights into the definitive refined version
|
# Stage 3: Final synthesis — combine all insights into the definitive refined version
|
||||||
yield sse({"type": "progress", "current": total_stages, "total": total_stages, "label": "synthesize"})
|
yield sse({"type": "progress", "current": total_stages, "total": total_stages, "label": "synthesize"})
|
||||||
yield sse({"type": "status", "message": f"Final synthesis with {orchestrator}..."})
|
yield sse({"type": "status", "message": f"Final synthesis with {orchestrator}..."})
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user