diff --git a/llm_team_ui.py b/llm_team_ui.py
index cd9c942..f1bf4db 100644
--- a/llm_team_ui.py
+++ b/llm_team_ui.py
@@ -1967,13 +1967,17 @@ def cache_store(cache_key, prompt, mode, models, run_id, score, responses):
 
 def save_run(mode, prompt, config_data, responses):
     models = list({r.get("model", "") for r in responses if r.get("model")})
+    # Calculate token usage from actual content
+    input_chars = len(prompt)
+    output_chars = sum(len(r.get("text", "")) for r in responses if r.get("text"))
+    est_tokens = estimate_tokens(prompt) + sum(estimate_tokens(r.get("text", "")) for r in responses if r.get("text"))
     run_id = None
     try:
         with get_db() as conn:
             with conn.cursor() as cur:
                 cur.execute(
-                    "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s) RETURNING id",
-                    (mode, prompt, json.dumps(config_data), json.dumps(responses), models)
+                    "INSERT INTO team_runs (mode, prompt, config, responses, models_used, est_tokens, input_chars, output_chars) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) RETURNING id",
+                    (mode, prompt, json.dumps(config_data), json.dumps(responses), models, est_tokens, input_chars, output_chars)
                 )
                 run_id = cur.fetchone()[0]
                 conn.commit()
@@ -2490,6 +2494,7 @@ HTML = r"""
   Auto-Refine · AI pipeline
   Knowledge · Extract facts
   Adaptive · Self-eval + RAG
+  Deep Analysis · Full pipeline
   All models answer in parallel, then one synthesizes the best parts into a final answer.
@@ -2658,6 +2663,22 @@ HTML = r"""
Models self-evaluate confidence. Below threshold → retrieves context from knowledge base → escalates to next model. Order models from weakest to strongest. Successful responses are stored for future RAG retrieval.
+  [Deep Analysis settings panel: model multiselect "ml-deep_analysis", synthesizer select "deep_analysis-synthesizer"]
@@ -3157,7 +3178,7 @@
 let availableModels = [];
 let currentMode = 'brainstorm';
 const modelSets = {};
-const ML_IDS = ['ml-brainstorm','ml-validator','ml-roundrobin','ml-consensus','ml-ladder','ml-tournament','ml-evolution','ml-blindassembly','ml-mesh','ml-hallucination','ml-research','ml-eval','ml-refine','ml-adaptive'];
+const ML_IDS = ['ml-brainstorm','ml-validator','ml-roundrobin','ml-consensus','ml-ladder','ml-tournament','ml-evolution','ml-blindassembly','ml-mesh','ml-hallucination','ml-research','ml-eval','ml-refine','ml-adaptive','ml-deep_analysis'];
 
 const MODE_DESCS = {
     brainstorm: 'All models answer in parallel, then one synthesizes the best parts.',
@@ -3181,7 +3202,8 @@ const MODE_DESCS = {
     eval: 'AUTONOMOUS: Same prompts sent to all selected models. Judge scores each on accuracy, reasoning, clarity. Produces a ranked leaderboard across multiple rounds.',
     extract: 'AUTONOMOUS: Extracts structured facts, entities, and relationships from text or local docs. Verifier cross-checks claims. Output saved as queryable JSON.',
     refine: 'AUTONOMOUS: AI analyzes your content, selects the best refinement stages (critique, expand, structure, validate, etc.), and runs them in the optimal order. Turns a good draft into a polished final version.',
-    adaptive: 'ADAPTIVE: Each model self-evaluates its confidence. If below threshold, the pipeline retrieves context from a vectorized knowledge base and escalates to a stronger model. Successful responses are stored for future RAG retrieval. The system gets smarter with every run.'
+    adaptive: 'ADAPTIVE: Each model self-evaluates its confidence. If below threshold, the pipeline retrieves context from a vectorized knowledge base and escalates to a stronger model. Successful responses are stored for future RAG retrieval. The system gets smarter with every run.',
+    deep_analysis: 'DEEP ANALYSIS: 6-phase autonomous pipeline — Research (all models) → Debate (challenge findings) → Consensus (merge perspectives) → Self-Eval (score quality) → Final Synthesis (strongest model) → Knowledge Base (store for future RAG). Designed for cloud models. Results train local models.'
 };
 
 const SAMPLE_PROMPTS = {
@@ -3602,6 +3624,19 @@ const SAMPLE_PROMPTS = {
     'Design a privacy-preserving federated learning system for healthcare where patient data never leaves hospital networks but a central model improves from all participants. Address differential privacy, secure aggregation, and regulatory compliance.',
     'Build an autonomous incident response system that correlates alerts from 15 monitoring tools, classifies severity, executes runbooks, and escalates to humans only when confidence is below threshold.',
     'Design a real-time stream processing platform handling 1M events/sec with exactly-once semantics, schema evolution, time-travel debugging, and automatic partition rebalancing across 100 nodes.'
-  ]}
+  ]},
+  deep_analysis: { basic: [
+    'What is the most effective approach to implementing AI in a staffing agency that currently uses spreadsheets and phone calls?',
+    'Compare the costs and benefits of building vs buying an internal data platform for a 200-person company.',
+    'How should a company evaluate whether to adopt a local LLM deployment vs cloud API for sensitive internal data?'
+  ], mid: [
+    'Design a hybrid search architecture that combines SQL filtering with vector semantic search for a database of 500K worker profiles. Address recall, latency, and ranking.',
+    'What is the optimal strategy for a staffing company to use AI to predict workforce demand from public building permit data? Cover data sources, models, and integration.',
+    'Design a learning feedback loop where every user interaction with a search system improves future results. Address cold start, data quality, and convergence.'
+  ], advanced: [
+    'Design a complete AI-powered staffing platform that anticipates client needs before they call, pre-matches workers to contracts, learns from every placement, and handles the sparse data problem where new clients have only a name and phone number. Address architecture, data pipeline, AI models, and the change management challenge of convincing skeptical staffers.',
+    'Architect a system that ingests real-time public data (building permits, government contracts, economic indicators) to predict regional labor demand 3-6 months ahead, cross-references with an existing workforce database, and automatically generates recruiting strategies for identified gaps.',
+    'Design an AI system that can be trusted by non-technical users who are actively hostile to AI adoption. Cover transparency, explainability, graceful degradation, and the specific UX patterns that build trust over time.'
+  ]}
 };
 
@@ -3736,7 +3771,7 @@ function populateAllSelects() {
     'staircase-challenger','drift-target','drift-analyzer','mesh-synthesizer','halluc-answerer',
     'timeloop-answerer','timeloop-chaos',
     'research-scout','research-checker','research-synth',
-    'adaptive-synthesizer',
+    'adaptive-synthesizer','deep_analysis-synthesizer',
     'eval-judge','extract-model','extract-verifier','refine-orchestrator'];
   ids.forEach(id => {
     const el = document.getElementById(id);
@@ -3820,6 +3855,7 @@ function buildConfig() {
     case 'extract': c.extractor = getVal('extract-model'); c.verifier = getVal('extract-verifier'); c.source = getVal('extract-source'); break;
     case 'refine': c.orchestrator = getVal('refine-orchestrator'); c.models = getModels('ml-refine'); c.max_stages = getNum('refine-stages'); break;
     case 'adaptive': c.models = getModels('ml-adaptive'); c.synthesizer = getVal('adaptive-synthesizer'); c.confidence_threshold = parseFloat(document.getElementById('adaptive-confidence').value) || 0.7; break;
+    case 'deep_analysis': c.models = getModels('ml-deep_analysis'); c.synthesizer = getVal('deep_analysis-synthesizer'); break;
   }
   return c;
 }
@@ -10580,7 +10616,7 @@ def run_team():
         "staircase": run_staircase, "drift": run_drift, "mesh": run_mesh,
         "hallucination": run_hallucination, "timeloop": run_timeloop,
         "research": run_research, "eval": run_eval, "extract": run_extract,
-        "refine": run_refine, "adaptive": run_adaptive,
+        "refine": run_refine, "adaptive": run_adaptive, "deep_analysis": run_deep_analysis,
     }
 
     run_id = str(_uuid.uuid4())[:8]
@@ -12255,6 +12291,163 @@ def run_adaptive(config):
         f"Knowledge base: {'updated' if best_score is None or best_score >= score_threshold else 'not stored (below threshold)'}"
     )
     yield sse({"type": "response", "model": "system", "text": summary, "role": "summary"})
+
+
+def run_deep_analysis(config):
+    """Deep Analysis: chains Research → Debate → Consensus → Adaptive scoring → Final synthesis.
+    Designed for cloud models — produces high-quality results that train the local knowledge base."""
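+    # Phase map (mirrors the total_steps=6 progress events below):
+    #   1. Research:  every selected model answers independently
+    #   2. Debate:    the first two models (models[:2]) critique the pooled research
+    #   3. Consensus: a mid-list model merges the research with the critiques
+    #   4. Self-eval: the synthesizer scores the consensus as strict JSON
+    #   5. Synthesis: the synthesizer writes the final answer, addressing gaps
+    #   6. KB store:  the result is saved via _kb_store for future RAG retrieval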
+    import time as _time
+    start = _time.time()
+    prompt = config["prompt"]
+    models = config.get("models", [])
+    synthesizer = config.get("synthesizer", models[0] if models else "")
+    if len(models) < 2:
+        yield sse({"type": "response", "model": "system", "text": "Deep Analysis requires at least 2 models. Select your strongest cloud models.", "role": "error"})
+        return
+
+    yield sse({"type": "clear"})
+    all_outputs = {}
+    phase_times = {}
+
+    # ═══ PHASE 1: Multi-model Research ═══
+    yield sse({"type": "progress", "step": 1, "total_steps": 6, "substep": "Phase 1: Researching with all models...", "percent": 5})
+    yield sse({"type": "status", "message": "Phase 1/6: Research"})
+    research_prompt = (
+        f"You are a senior research analyst. Provide a thorough, well-structured response to this question. "
+        f"Include relevant context, consider multiple angles, cite your reasoning, and identify what you're uncertain about.\n\n"
+        f"QUESTION:\n{prompt}"
+    )
+    research_results = {}
+    p1_start = _time.time()
+    for i, model in enumerate(models):
+        pct = 5 + int((i / len(models)) * 15)
+        yield sse({"type": "progress", "step": 1, "total_steps": 6, "substep": f"Researching: {model}...", "percent": pct})
+        try:
+            result = safe_query(model, research_prompt)
+            research_results[model] = result
+            yield sse({"type": "response", "model": model, "text": result, "role": "researcher"})
+        except Exception as e:
+            yield sse({"type": "response", "model": model, "text": f"Error: {e}", "role": "error"})
+    phase_times["research"] = int((_time.time() - p1_start) * 1000)
+    all_outputs["research"] = research_results
+
+    if not research_results:
+        yield sse({"type": "response", "model": "system", "text": "All models failed in research phase.", "role": "error"})
+        return
+
+    # ═══ PHASE 2: Critical Debate ═══
+    yield sse({"type": "progress", "step": 2, "total_steps": 6, "substep": "Phase 2: Challenging findings...", "percent": 25})
+    yield sse({"type": "status", "message": "Phase 2/6: Debate"})
+    combined_research = "\n\n---\n\n".join([f"[{m}]:\n{r[:2000]}" for m, r in research_results.items()])
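+    # The [:2000] per-model slices above and the [:6000] slice in the debate
+    # prompt below are rough context-budget caps, not token-accurate limits.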
+    debate_prompt = (
+        f"You are a critical analyst. Multiple researchers have responded to a question. "
+        f"Challenge their findings. What are the weak points? What assumptions are being made? "
+        f"What alternative perspectives exist? What's missing?\n\n"
+        f"ORIGINAL QUESTION:\n{prompt}\n\n"
+        f"RESEARCH RESPONSES:\n{combined_research[:6000]}"
+    )
+    # Use 2 models for debate — different perspectives
+    debaters = models[:2] if len(models) >= 2 else models
+    debate_results = {}
+    p2_start = _time.time()
+    for model in debaters:
+        yield sse({"type": "progress", "step": 2, "total_steps": 6, "substep": f"Debating: {model}...", "percent": 30})
+        try:
+            result = safe_query(model, debate_prompt)
+            debate_results[model] = result
+            yield sse({"type": "response", "model": model, "text": result, "role": "critic"})
+        except Exception as e:
+            yield sse({"type": "response", "model": model, "text": f"Error: {e}", "role": "error"})
+    phase_times["debate"] = int((_time.time() - p2_start) * 1000)
+    all_outputs["debate"] = debate_results
+
+    # ═══ PHASE 3: Consensus Building ═══
+    yield sse({"type": "progress", "step": 3, "total_steps": 6, "substep": "Phase 3: Building consensus...", "percent": 45})
+    yield sse({"type": "status", "message": "Phase 3/6: Consensus"})
+    combined_debate = "\n\n---\n\n".join([f"[{m}]:\n{r[:2000]}" for m, r in debate_results.items()])
+    consensus_prompt = (
+        f"You are synthesizing research findings with critical analysis. "
+        f"Merge the research with the critiques. For each major point, state: "
+        f"(1) what's strongly supported, (2) what's contested, (3) what needs more investigation.\n\n"
+        f"ORIGINAL QUESTION:\n{prompt}\n\n"
+        f"RESEARCH:\n{combined_research[:3000]}\n\n"
+        f"CRITIQUES:\n{combined_debate[:3000]}"
+    )
+    p3_start = _time.time()
+    consensus_model = models[len(models) // 2] if len(models) > 2 else models[-1]
+    try:
+        consensus = safe_query(consensus_model, consensus_prompt)
+        yield sse({"type": "response", "model": consensus_model, "text": consensus, "role": "consensus"})
+    except Exception as e:
+        consensus = combined_research[:3000]
+        yield sse({"type": "response", "model": "system", "text": f"Consensus error, using raw research: {e}", "role": "error"})
+    phase_times["consensus"] = int((_time.time() - p3_start) * 1000)
+    all_outputs["consensus"] = consensus
+
+    # ═══ PHASE 4: Self-Evaluation ═══
+    yield sse({"type": "progress", "step": 4, "total_steps": 6, "substep": "Phase 4: Self-evaluation...", "percent": 60})
+    yield sse({"type": "status", "message": "Phase 4/6: Self-eval"})
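+    # The evaluator is asked to return strict JSON; the brace-slicing parse
+    # below tolerates prose around the JSON, and on any failure the pipeline
+    # falls back to the default overall score in Phase 6.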
" + f"Return JSON: {{\"scores\": {{\"accuracy\": N, \"completeness\": N, \"actionability\": N, \"nuance\": N}}, \"overall\": N, \"strengths\": \"...\", \"gaps\": \"...\"}}\n\n" + f"QUESTION:\n{prompt[:500]}\n\nANALYSIS:\n{consensus[:4000]}" + ) + p4_start = _time.time() + eval_result = {"overall": 0} + try: + eval_raw = safe_query(synthesizer, eval_prompt) + j_s, j_e = eval_raw.find("{"), eval_raw.rfind("}") + 1 + if j_s >= 0 and j_e > j_s: + eval_result = json.loads(eval_raw[j_s:j_e]) + yield sse({"type": "response", "model": synthesizer, "text": eval_raw, "role": "evaluator"}) + except Exception as e: + yield sse({"type": "response", "model": "system", "text": f"Eval error: {e}", "role": "error"}) + phase_times["evaluation"] = int((_time.time() - p4_start) * 1000) + + # ═══ PHASE 5: Final Synthesis ═══ + yield sse({"type": "progress", "step": 5, "total_steps": 6, "substep": "Phase 5: Final synthesis by strongest model...", "percent": 75}) + yield sse({"type": "status", "message": "Phase 5/6: Final synthesis"}) + gaps = eval_result.get("gaps", "") + synth_prompt = ( + f"You are producing the definitive response to a question that has been researched by multiple models, " + f"critically debated, and evaluated. Produce the best possible answer.\n\n" + f"ORIGINAL QUESTION:\n{prompt}\n\n" + f"CONSENSUS ANALYSIS:\n{consensus[:4000]}\n\n" + + (f"IDENTIFIED GAPS TO ADDRESS:\n{gaps}\n\n" if gaps else "") + + f"Produce a comprehensive, well-structured final answer. Be specific and actionable." + ) + p5_start = _time.time() + try: + final = safe_query(synthesizer, synth_prompt) + yield sse({"type": "response", "model": synthesizer, "text": final, "role": "final"}) + except Exception as e: + final = consensus + yield sse({"type": "response", "model": "system", "text": f"Synthesis error, using consensus: {e}", "role": "error"}) + phase_times["synthesis"] = int((_time.time() - p5_start) * 1000) + + # ═══ PHASE 6: Store in Knowledge Base ═══ + yield sse({"type": "progress", "step": 6, "total_steps": 6, "substep": "Phase 6: Storing in knowledge base...", "percent": 95}) + yield sse({"type": "status", "message": "Phase 6/6: Knowledge base"}) + overall_score = eval_result.get("overall", 7) + _kb_store(prompt, final, "deep_analysis", synthesizer, overall_score, 0.9) + yield sse({"type": "response", "model": "system", + "text": f"Final response stored in knowledge base (score: {overall_score}/10). Local models will benefit from this on future similar queries.", + "role": "notice"}) + + # Summary + total_ms = int((_time.time() - start) * 1000) + model_list = ", ".join(models) + time_breakdown = " → ".join([f"{k}: {v}ms" for k, v in phase_times.items()]) + summary = ( + f"Deep Analysis complete in {total_ms}ms\n" + f"Pipeline: Research → Debate → Consensus → Eval → Synthesis\n" + f"Models: {model_list}\n" + f"Synthesizer: {synthesizer}\n" + f"Quality: {overall_score}/10\n" + f"Phases: {time_breakdown}\n" + f"Knowledge base updated — future adaptive runs on similar topics will use this result" + ) + yield sse({"type": "response", "model": "system", "text": summary, "role": "summary"}) yield sse({"type": "progress", "step": 4, "total_steps": 4, "substep": "Complete", "percent": 100}) # Save adaptive run log