From 8ad221b41f9ef2152492ee4e1f694798a6e5d95d Mon Sep 17 00:00:00 2001
From: root
Date: Sun, 29 Mar 2026 06:18:32 -0500
Subject: [PATCH] Add self-improving pipeline: auto-scoring, analytics,
 reactive refine, routing intelligence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 — Run Quality Scoring:
- Auto-score every run in the background via a qwen2.5 judge (1-10)
- Thumbs up/down vote buttons on output cards
- POST /api/runs/<id>/score for user feedback
- run_saved SSE event enables the vote buttons after the run completes
- User votes override auto-scores (race-condition safe)
- DB: quality_score, score_method, score_metadata columns on team_runs

Phase 1 — Analytics Dashboard:
- GET /api/admin/analytics: score-by-mode, score-by-model, heatmap, trend
- New Analytics tab on the Admin page with bar charts, heatmap table, trend sparkline
- Scoring coverage tracker (scored vs. total runs)
- Model × Mode heatmap with color-coded cells

Phase 2 — Reactive Pipeline:
- _assess_stage(): orchestrator evaluates each stage's output mid-run
- _reactive_decide(): can insert or skip stages based on that assessment
- Dynamic stage loop replaces the fixed iteration in run_refine()
- Budget tracking prevents infinite loops (max_stages hard cap)
- Reactive decisions render as dashed notification bars between cards
- Pipeline adjusts in real time: "Inserting VALIDATE — high severity gaps found"

Phase 3 — Cross-Run Learning:
- _build_routing_table(): queries historical scores for model×mode performance
- Best stage sequences per content_type from pipeline_runs
- Routing table cached with a 30-min TTL
- Auto-Refine strategist prompt augmented with the historical data
- GET /api/suggest-models?mode=X returns the top 3 models for that mode

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 llm_team_ui.py | 428 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 419 insertions(+), 9 deletions(-)

diff --git a/llm_team_ui.py b/llm_team_ui.py
index 12595c5..aadae10 100644
--- a/llm_team_ui.py
+++ b/llm_team_ui.py
@@ -1776,16 +1776,73 @@ def get_db():
 def save_run(mode, prompt, config_data, responses):
     models = list({r.get("model", "") for r in responses if r.get("model")})
+    run_id = None
     try:
         with get_db() as conn:
             with conn.cursor() as cur:
                 cur.execute(
-                    "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s)",
+                    "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                     (mode, prompt, json.dumps(config_data), json.dumps(responses), models)
                 )
+                run_id = cur.fetchone()[0]
             conn.commit()
     except Exception as e:
         print(f"[DB] save_run error: {e}")
+    if run_id and responses:
+        threading.Thread(target=_auto_score_run, args=(run_id, mode, prompt, responses), daemon=True).start()
+    return run_id
+
+
+# ─── AUTO-SCORING ENGINE ─────────────────────────────────────
+_SCORE_MODEL = "qwen2.5:latest"
+
+def _auto_score_run(run_id, mode, prompt, responses):
+    """Background: auto-score a completed run via judge model."""
+    try:
+        # Pick the longest non-error response as representative
+        candidates = [r for r in responses if r.get("role") != "error" and r.get("text")]
+        if not candidates:
+            return
+        best = max(candidates, key=lambda r: len(r.get("text", "")))
+        text = best["text"][:3000]
+
+        judge_prompt = (
+            f"Rate the quality of this AI response on a scale of 1-10.\n"
+            f"Consider: relevance to the prompt, completeness, accuracy, clarity, usefulness.\n\n"
+            f"PROMPT: {prompt[:500]}\n\n"
+            f"MODE: {mode}\n\n"
+            f"RESPONSE:\n{text}\n\n"
+            f"Return ONLY a JSON object: {{\"score\": N, \"reason\": \"one sentence\"}}"
+        )
+        judgment = query_model(_SCORE_MODEL, judge_prompt)
+
+        # Parse score
+        score = None
+        try:
+            j_start = judgment.find("{")
+            j_end = judgment.rfind("}") + 1
+            if j_start >= 0 and j_end > j_start:
+                parsed = json.loads(judgment[j_start:j_end])
+                score = float(parsed.get("score", 0))
+        except Exception:
+            pass
+        if score is None:
+            m = re.search(r'\b([1-9]|10)\b', judgment)
+            score = float(m.group(1)) if m else None
+        if score is None or score < 1 or score > 10:
+            return
+
+        with get_db() as conn:
+            with conn.cursor() as cur:
+                cur.execute(
+                    "UPDATE team_runs SET quality_score = %s, score_method = 'auto', score_metadata = %s WHERE id = %s AND (score_method IS NULL OR score_method = 'auto')",
+                    (score, json.dumps({"judge": _SCORE_MODEL, "judgment": judgment[:500], "scored_model": best.get("model", ""), "reason": judgment[:200]}), run_id)
+                )
+            conn.commit()
+        print(f"[SCORE] run {run_id} scored {score}/10 by {_SCORE_MODEL}")
+    except Exception as e:
+        print(f"[SCORE] auto-score error for run {run_id}: {e}")
+
+
 HTML = r"""
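The POST /api/runs/<id>/score endpoint named in the commit message is not part of this excerpt. A minimal sketch of what it could look like, assuming a Flask app object and the get_db() helper above; the handler name and the vote-to-score mapping are assumptions, not taken from the patch:

    @app.route("/api/runs/<int:run_id>/score", methods=["POST"])
    def score_run(run_id):
        # Hypothetical handler: map a thumb vote onto the 1-10 scale.
        vote = (request.get_json(silent=True) or {}).get("vote")
        if vote not in ("up", "down"):
            return jsonify({"error": "vote must be 'up' or 'down'"}), 400
        score = 9.0 if vote == "up" else 2.0  # assumed mapping, not from the patch
        with get_db() as conn:
            with conn.cursor() as cur:
                # Unconditional write: user feedback always wins over auto-scores.
                cur.execute(
                    "UPDATE team_runs SET quality_score = %s, score_method = 'user', score_metadata = %s WHERE id = %s",
                    (score, json.dumps({"vote": vote}), run_id)
                )
            conn.commit()
        return jsonify({"ok": True, "score": score})

The race-condition safety claimed in the commit message follows from the pairing: a user vote overwrites unconditionally and stamps score_method = 'user', while the auto-scorer's UPDATE above only fires while score_method is NULL or 'auto', so a late judge can never clobber a vote.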
f"RESPONSE:\n{text}\n\n" + f"Return ONLY a JSON object: {{\"score\": N, \"reason\": \"one sentence\"}}" + ) + judgment = query_model(_SCORE_MODEL, judge_prompt) + + # Parse score + score = None + try: + j_start = judgment.find("{") + j_end = judgment.rfind("}") + 1 + if j_start >= 0 and j_end > j_start: + parsed = json.loads(judgment[j_start:j_end]) + score = float(parsed.get("score", 0)) + except Exception: + pass + if score is None: + m = re.search(r'\b([1-9]|10)\b', judgment) + score = float(m.group(1)) if m else None + if score is None or score < 1 or score > 10: + return + + with get_db() as conn: + with conn.cursor() as cur: + cur.execute( + "UPDATE team_runs SET quality_score = %s, score_method = 'auto', score_metadata = %s WHERE id = %s AND (score_method IS NULL OR score_method = 'auto')", + (score, json.dumps({"judge": _SCORE_MODEL, "judgment": judgment[:500], "scored_model": best.get("model", ""), "reason": judgment[:200]}), run_id) + ) + conn.commit() + print(f"[SCORE] run {run_id} scored {score}/10 by {_SCORE_MODEL}") + except Exception as e: + print(f"[SCORE] auto-score error for run {run_id}: {e}") + HTML = r""" @@ -2881,6 +2938,7 @@ function buildConfig() { let _runStartTime = 0; let _runTimer = null; +let _lastRunId = null; let _runEventCount = 0; let _runResponseCount = 0; let _runTotalChars = 0; @@ -3132,6 +3190,11 @@ function handleEvent(evt) { return; } if (evt.type === 'done') { const bar = output.querySelector('.status-bar'); if (bar) bar.remove(); return; } + if (evt.type === 'run_saved') { + _lastRunId = evt.run_id; + document.querySelectorAll('.vote-btn').forEach(function(b) { b.disabled = false; }); + return; + } if (evt.type === 'response') { _runResponseCount++; _runTotalChars += (evt.text || '').length; @@ -3150,6 +3213,14 @@ function handleEvent(evt) { label.textContent = phaseName; output.appendChild(label); } + // Reactive pipeline notification — not a full card + if (evt.role === 'reactive') { + var note = document.createElement('div'); + note.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--accent);border:1px dashed var(--accent);border-radius:2px;padding:8px 12px;margin:4px 0;opacity:0.8;font-style:italic'; + note.textContent = '\u26A1 ' + evt.text; + output.appendChild(note); + return; + } const mi = availableModels.findIndex(m => m.name === evt.model); const color = COLORS[(mi >= 0 ? mi : 0) % COLORS.length]; const displayName = mi >= 0 ? (availableModels[mi].display_name || evt.model) : evt.model; @@ -3161,7 +3232,7 @@ function handleEvent(evt) { const roleTag = evt.role ? `${evt.role}` : ''; const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4); const errorLink = isError ? `View error details in monitor →` : ''; - card.innerHTML = `
@@ -3161,7 +3232,7 @@
   const roleTag = evt.role ? `<span class="role-tag">${evt.role}</span>` : '';
   const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4);
   const errorLink = isError ? `<a class="error-link" href="#">View error details in monitor →</a>` : '';
-  card.innerHTML = `<div class="card-head"><span style="color:${color}">${displayName}</span>${roleTag}</div><div class="card-text" id="${uid}">${escapeHtml(evt.text)}</div><div class="card-actions"><button onclick="copyCard('${uid}', this)">copy</button>${errorLink}</div>`;
+  card.innerHTML = `<div class="card-head"><span style="color:${color}">${displayName}</span>${roleTag}</div><div class="card-text" id="${uid}">${escapeHtml(evt.text)}</div><div class="card-actions"><button onclick="copyCard('${uid}', this)">copy</button><button class="vote-btn" disabled onclick="voteRun(this, 'up')">&#128077;</button><button class="vote-btn" disabled onclick="voteRun(this, 'down')">&#128078;</button>${errorLink}</div>`;
   card.dataset.model = evt.model;
   card.dataset.role = evt.role || '';
   card.dataset.displayName = displayName;
@@ -3174,6 +3245,22 @@
 function escapeHtml(t) { return t.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); }
 
 // ─── CARD ACTIONS ────────────────────────────────────
+async function voteRun(btn, vote) {
+  if (!_lastRunId) return;
+  try {
+    const r = await fetch('/api/runs/' + _lastRunId + '/score', {
+      method: 'POST', headers: {'Content-Type': 'application/json'},
+      body: JSON.stringify({vote: vote})
+    });
+    if (r.ok) {
+      btn.closest('.card-actions').querySelectorAll('.vote-btn').forEach(function(b) { b.style.opacity = '0.3'; b.disabled = true; });
+      btn.style.opacity = '1';
+      btn.style.borderColor = vote === 'up' ? 'var(--green)' : 'var(--red)';
+      btn.style.color = vote === 'up' ? 'var(--green)' : 'var(--red)';
+    }
+  } catch(e) { console.error('Vote error:', e); }
+}
+
 function copyCard(uid, btn) {
   const el = document.getElementById(uid);
   if (!el) return;
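For completeness: the run_saved event that re-enables these vote buttons carries only the new row id. Since save_run() now returns it (first hunk), the end of the streaming run handler can emit the event with standard SSE framing; the surrounding generator and its name are assumptions:

    def _finish_stream(mode, prompt, config_data, responses):
        # Sketch: persist the run, then tell the page its id so voting activates.
        run_id = save_run(mode, prompt, config_data, responses)
        if run_id is not None:
            yield f"data: {json.dumps({'type': 'run_saved', 'run_id': run_id})}\n\n"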
@@ -3584,6 +3671,7 @@ ADMIN_HTML = r"""
   <div class="admin-tabs">
     <button class="admin-tab" data-tab="openrouter">OpenRouter</button>
     <button class="admin-tab" data-tab="timeouts">Timeouts</button>
     <button class="admin-tab" data-tab="security">Security</button>
+    <button class="admin-tab" data-tab="analytics">Analytics</button>
   </div>
 
@@ -3687,6 +3775,32 @@ ADMIN_HTML = r"""
+
+<div id="panel-analytics" class="admin-panel" style="display:none">
+  <div class="admin-grid">
+    <div class="admin-card">
+      <div class="admin-card-title">Scoring Coverage</div>
+      <div id="an-coverage">Loading...</div>
+    </div>
+    <div class="admin-card">
+      <div class="admin-card-title">Score by Mode</div>
+      <div id="an-by-mode">Loading...</div>
+    </div>
+    <div class="admin-card">
+      <div class="admin-card-title">Score by Model</div>
+      <div id="an-by-model">Loading...</div>
+    </div>
+    <div class="admin-card">
+      <div class="admin-card-title">Model × Mode Heatmap</div>
+      <div id="an-heatmap">Loading...</div>
+    </div>
+    <div class="admin-card">
+      <div class="admin-card-title">Score Trend (30 days)</div>
+      <div id="an-trend">Loading...</div>
+    </div>
+  </div>
+</div>
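These panels are populated from GET /api/admin/analytics, which is outside this excerpt. The aggregations are plain GROUP BYs over team_runs; a sketch assuming Flask and Postgres (the quality_score, score_method, and models_used columns come from the first hunk, while the handler name, response shape, and models_used being a text[] array are assumptions):

    @app.route("/api/admin/analytics")
    def admin_analytics():
        with get_db() as conn:
            with conn.cursor() as cur:
                # Average score per mode, for the bar chart.
                cur.execute(
                    "SELECT mode, AVG(quality_score), COUNT(*) FROM team_runs "
                    "WHERE quality_score IS NOT NULL GROUP BY mode"
                )
                by_mode = [{"mode": m, "avg": float(a), "n": n} for m, a, n in cur.fetchall()]
                # Model × Mode heatmap: unnest the models_used array per run.
                cur.execute(
                    "SELECT u.model, t.mode, AVG(t.quality_score) "
                    "FROM team_runs t, UNNEST(t.models_used) AS u(model) "
                    "WHERE t.quality_score IS NOT NULL GROUP BY u.model, t.mode"
                )
                heatmap = [{"model": mo, "mode": md, "avg": float(a)} for mo, md, a in cur.fetchall()]
                # Coverage: scored runs vs. all runs.
                cur.execute(
                    "SELECT COUNT(*) FILTER (WHERE quality_score IS NOT NULL), COUNT(*) FROM team_runs"
                )
                scored, total = cur.fetchone()
        return jsonify({"by_mode": by_mode, "heatmap": heatmap,
                        "coverage": {"scored": scored, "total": total}})

Score-by-model and the 30-day trend follow the same pattern, grouping by UNNEST(models_used) and by DATE(created_at) respectively.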
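Phase 3's routing table is also not in this excerpt, but the commit message pins down its behavior: historical model×mode averages, cached with a 30-minute TTL. A sketch with an assumed cache shape (time is assumed to be imported at module top; the query reuses the heatmap aggregation above):

    _ROUTING_CACHE = {"table": None, "ts": 0.0}
    _ROUTING_TTL = 30 * 60  # 30-minute TTL per the commit message

    def _build_routing_table():
        # Sketch: rank models per mode by average historical quality score.
        now = time.time()
        if _ROUTING_CACHE["table"] is not None and now - _ROUTING_CACHE["ts"] < _ROUTING_TTL:
            return _ROUTING_CACHE["table"]
        table = {}
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT u.model, t.mode, AVG(t.quality_score) "
                    "FROM team_runs t, UNNEST(t.models_used) AS u(model) "
                    "WHERE t.quality_score IS NOT NULL GROUP BY u.model, t.mode"
                )
                for model, mode, avg in cur.fetchall():
                    table.setdefault(mode, []).append((model, float(avg)))
        for mode in table:
            table[mode].sort(key=lambda pair: pair[1], reverse=True)  # best first
        _ROUTING_CACHE["table"] = table
        _ROUTING_CACHE["ts"] = now
        return table

GET /api/suggest-models?mode=X then reduces to table.get(mode, [])[:3], and the same table can be serialized into the Auto-Refine strategist prompt as the historical context the commit message mentions.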