diff --git a/crates/vectord/src/rag.rs b/crates/vectord/src/rag.rs
index 007329a..286c00d 100644
--- a/crates/vectord/src/rag.rs
+++ b/crates/vectord/src/rag.rs
@@ -163,7 +163,11 @@ pub async fn query(
     // production caller of the Phase 21 primitives — see audit finding
     // "Phase 21 Rust primitives are wired but not CALLED by any
     // production surface" from 2026-04-21.
-    let mut cont_opts = ContinuableOpts::new("qwen2.5:latest");
+    // 2026-04-30 model bump: qwen2.5:latest → qwen3.5:latest to match
+    // the small-model-pipeline local-tier default. Same JSON-clean
+    // property, more capacity. think=Some(false) preserved — RAG hot
+    // path doesn't need reasoning traces; direct answers only.
+    let mut cont_opts = ContinuableOpts::new("qwen3.5:latest");
     cont_opts.max_tokens = Some(512);
     cont_opts.temperature = Some(0.2);
     cont_opts.shape = ResponseShape::Text;
@@ -176,7 +180,7 @@ pub async fn query(
         // echoes whatever Ollama loaded). Use the configured tier model
         // for now; if RAG needs to report the actual resolved model,
         // the runner can add a post-call ps probe later.
-        model: "qwen2.5:latest".to_string(),
+        model: "qwen3.5:latest".to_string(),
         sources: results,
         tokens_generated: None,
     })
diff --git a/lakehouse.toml b/lakehouse.toml
index 19061a1..4e828e1 100644
--- a/lakehouse.toml
+++ b/lakehouse.toml
@@ -48,8 +48,13 @@ url = "http://localhost:3200"
 
 [ai]
 embed_model = "nomic-embed-text"
-gen_model = "qwen2.5"
-rerank_model = "qwen2.5"
+# Local-tier defaults bumped 2026-04-30: qwen3.5:latest is the
+# stronger local rung in the 5-loop substrate (per
+# project_small_model_pipeline_vision.md). Same JSON-clean property
+# as qwen2.5, more capacity. Ollama still serves both — bump back
+# in this file if a workload regressed.
+gen_model = "qwen3.5:latest"
+rerank_model = "qwen3.5:latest"
 
 [auth]
 enabled = false
@@ -72,7 +77,9 @@ min_recall = 0.9 # never promote below this
 max_trials_per_hour = 20  # hard budget cap
 
 # Model roster — available for profile hot-swap
+# qwen3.5:latest: stronger local rung — JSON-clean, 8K+ context,
+# default for gen_model and rerank_model
 # qwen3: 8.2B, 40K context, thinking+tools, best for reasoning tasks
-# qwen2.5: 7B, 8K context, fast, good for SQL generation
-# mistral: 7B, 8K context, good for general generation
+# qwen2.5: 7B, 8K context, fast — kept loaded for the 2026-04 era
+# comparison runs; new defaults use qwen3.5:latest
 # nomic-embed-text: 137M, embedding-only, used by all profiles
diff --git a/mcp-server/index.ts b/mcp-server/index.ts
index ffeff3f..5576f79 100644
--- a/mcp-server/index.ts
+++ b/mcp-server/index.ts
@@ -313,9 +313,9 @@ ${(buckets as any[] || []).map((b: any) => `- ${b.name}: ${b.backend} (${b.reach
 - Ollama: :11434
 
 ## Available Models
+- qwen3.5:latest: stronger local rung, JSON-clean (default for gen + rerank)
 - qwen3: 8.2B, 40K context, thinking+tools (best for reasoning)
-- qwen2.5: 7B, 8K context (best for fast SQL generation)
-- mistral: 7B, 8K context (general generation)
+- qwen2.5: 7B, 8K context (legacy — 2026-04 era comparison runs only)
 - nomic-embed-text: 137M (embedding, automatic)
 `;
     return { contents: [{ uri: uri.href, mimeType: "text/plain", text }] };
diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts
index 24e8042..7085b0c 100644
--- a/mcp-server/observer.ts
+++ b/mcp-server/observer.ts
@@ -146,15 +146,16 @@ async function persistOp(op: ObservedOp) {
 // ─── LLM Team escalation (code_review mode) ───
 //
 // When recent failures on a single sig_hash cross a threshold the
-// local qwen2.5 analysis is probably insufficient. J's 2026-04-24
+// local-model analysis is probably insufficient. J's 2026-04-24
 // direction: "the observer would trigger to give more context" —
 // route failure clusters to LLM Team's specialized code_review mode
 // (via /api/run) so richer structured signal lands in the KB for
 // scrum + auditor + playbook memory to consume next pass.
 //
-// Non-destructive: runs in parallel to the existing qwen2.5 analysis,
-// never replaces it. Writes to data/_kb/observer_escalations.jsonl
-// as a dedicated audit surface.
+// Non-destructive: runs in parallel to the existing local diagnose
+// call (qwen3.5:latest after the 2026-04-30 bump), never replaces
+// it. Writes to data/_kb/observer_escalations.jsonl as a dedicated
+// audit surface.
 
 const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
 const LLM_TEAM_ESCALATIONS = "/home/profit/lakehouse/data/_kb/observer_escalations.jsonl";
@@ -542,7 +543,7 @@ async function analyzeErrors() {
   if (failures.length === 0) return;
 
   // NEW 2026-04-24: escalate recurring sig_hash clusters to LLM Team
-  // code_review mode. Runs in parallel to the local qwen2.5 analysis
+  // code_review mode. Runs in parallel to the local diagnose call
   // below — non-blocking, richer downstream signal for scrum/auditor.
   maybeEscalate(failures).catch(() => {});
 
@@ -552,13 +553,14 @@ async function analyzeErrors() {
 
   // Ask local model to diagnose. Phase 44 migration (2026-04-27):
   // /v1/chat instead of legacy /ai/generate so /v1/usage tracks the
-  // call + Langfuse traces it. Same upstream model (qwen2.5 local).
+  // call + Langfuse traces it. 2026-04-30 model bump: qwen2.5 →
+  // qwen3.5:latest to match the small-model-pipeline local-tier default.
   try {
     const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
       method: "POST",
       headers: { "Content-Type": "application/json" },
       body: JSON.stringify({
-        model: "qwen2.5",
+        model: "qwen3.5:latest",
         provider: "ollama",
         messages: [{ role: "user",