cleanup: bump qwen2.5 → qwen3.5:latest in active defaults
Some checks failed
lakehouse/auditor 16 blocking issues: cloud: claim not backed — "Verified end-to-end via playwright on devop.live/lakehouse:"

stronger local rung is now the small-model-pipeline tier-1 default
across both Rust legacy + Go rewrite (cf. golangLAKEHOUSE phase 1).
same JSON-clean property as qwen2.5, more capacity. ollama still
serves both side-by-side; rollback is a 4-line revert if a workload
regresses.

active-default sites:
- lakehouse.toml [ai] gen_model + rerank_model → qwen3.5:latest
- mcp-server/observer.ts diagnose call (Phase 44 /v1/chat path) → qwen3.5:latest
- mcp-server/index.ts model roster doc → qwen3.5:latest first
- crates/vectord/src/rag.rs ContinuableOpts + RagResponse.model → qwen3.5:latest

skipped: execution_loop/mod.rs comments describing historic qwen2.5
tool_call quirks — those are documentation of past behavior, not
active defaults. data/_catalog/profiles/*.json are runtime-generated
(gitignored), not in scope for tracked changes.

cargo check -p vectord: clean. no behavioral change in the audit
pipeline — same JSON-clean local model, same think=Some(false)
posture, just stronger upstream.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-30 00:10:57 -05:00
parent d475fc7fff
commit 8de94eba08
4 changed files with 28 additions and 15 deletions

View File

@@ -163,7 +163,11 @@ pub async fn query(
 // production caller of the Phase 21 primitives — see audit finding
 // "Phase 21 Rust primitives are wired but not CALLED by any
 // production surface" from 2026-04-21.
-let mut cont_opts = ContinuableOpts::new("qwen2.5:latest");
+// 2026-04-30 model bump: qwen2.5:latest → qwen3.5:latest to match
+// the small-model-pipeline local-tier default. Same JSON-clean
+// property, more capacity. think=Some(false) preserved — RAG hot
+// path doesn't need reasoning traces; direct answers only.
+let mut cont_opts = ContinuableOpts::new("qwen3.5:latest");
 cont_opts.max_tokens = Some(512);
 cont_opts.temperature = Some(0.2);
 cont_opts.shape = ResponseShape::Text;
@@ -176,7 +180,7 @@ pub async fn query(
 // echoes whatever Ollama loaded). Use the configured tier model
 // for now; if RAG needs to report the actual resolved model,
 // the runner can add a post-call ps probe later.
-model: "qwen2.5:latest".to_string(),
+model: "qwen3.5:latest".to_string(),
 sources: results,
 tokens_generated: None,
 })

View File

@@ -48,8 +48,13 @@ url = "http://localhost:3200"
 
 [ai]
 embed_model = "nomic-embed-text"
-gen_model = "qwen2.5"
-rerank_model = "qwen2.5"
+# Local-tier defaults bumped 2026-04-30: qwen3.5:latest is the
+# stronger local rung in the 5-loop substrate (per
+# project_small_model_pipeline_vision.md). Same JSON-clean property
+# as qwen2.5, more capacity. Ollama still serves both — bump back
+# in this file if a workload regressed.
+gen_model = "qwen3.5:latest"
+rerank_model = "qwen3.5:latest"
 
 [auth]
 enabled = false
@@ -72,7 +77,9 @@ min_recall = 0.9 # never promote below this
 max_trials_per_hour = 20 # hard budget cap
 
 # Model roster — available for profile hot-swap
+# qwen3.5:latest: stronger local rung — JSON-clean, 8K+ context,
+# default for gen_model and rerank_model
 # qwen3: 8.2B, 40K context, thinking+tools, best for reasoning tasks
-# qwen2.5: 7B, 8K context, fast, good for SQL generation
-# mistral: 7B, 8K context, good for general generation
+# qwen2.5: 7B, 8K context, fast — kept loaded for the 2026-04 era
+# comparison runs; new defaults use qwen3.5:latest
 # nomic-embed-text: 137M, embedding-only, used by all profiles

View File

@@ -313,9 +313,9 @@ ${(buckets as any[] || []).map((b: any) => `- ${b.name}: ${b.backend} (${b.reach
 - Ollama: :11434
 
 ## Available Models
+- qwen3.5:latest: stronger local rung, JSON-clean (default for gen + rerank)
 - qwen3: 8.2B, 40K context, thinking+tools (best for reasoning)
-- qwen2.5: 7B, 8K context (best for fast SQL generation)
-- mistral: 7B, 8K context (general generation)
+- qwen2.5: 7B, 8K context (legacy 2026-04 era comparison runs only)
 - nomic-embed-text: 137M (embedding, automatic)
 `;
 return { contents: [{ uri: uri.href, mimeType: "text/plain", text }] };

View File

@@ -146,15 +146,16 @@ async function persistOp(op: ObservedOp) {
 // ─── LLM Team escalation (code_review mode) ───
 //
 // When recent failures on a single sig_hash cross a threshold the
-// local qwen2.5 analysis is probably insufficient. J's 2026-04-24
+// local-model analysis is probably insufficient. J's 2026-04-24
 // direction: "the observer would trigger to give more context" —
 // route failure clusters to LLM Team's specialized code_review mode
 // (via /api/run) so richer structured signal lands in the KB for
 // scrum + auditor + playbook memory to consume next pass.
 //
-// Non-destructive: runs in parallel to the existing qwen2.5 analysis,
-// never replaces it. Writes to data/_kb/observer_escalations.jsonl
-// as a dedicated audit surface.
+// Non-destructive: runs in parallel to the existing local diagnose
+// call (qwen3.5:latest after the 2026-04-30 bump), never replaces
+// it. Writes to data/_kb/observer_escalations.jsonl as a dedicated
+// audit surface.
 
 const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
 const LLM_TEAM_ESCALATIONS = "/home/profit/lakehouse/data/_kb/observer_escalations.jsonl";
@@ -542,7 +543,7 @@ async function analyzeErrors() {
 if (failures.length === 0) return;
 
 // NEW 2026-04-24: escalate recurring sig_hash clusters to LLM Team
-// code_review mode. Runs in parallel to the local qwen2.5 analysis
+// code_review mode. Runs in parallel to the local diagnose call
 // below — non-blocking, richer downstream signal for scrum/auditor.
 maybeEscalate(failures).catch(() => {});
 
@@ -552,13 +553,14 @@ async function analyzeErrors() {
 
 // Ask local model to diagnose. Phase 44 migration (2026-04-27):
 // /v1/chat instead of legacy /ai/generate so /v1/usage tracks the
-// call + Langfuse traces it. Same upstream model (qwen2.5 local).
+// call + Langfuse traces it. 2026-04-30 model bump: qwen2.5 →
+// qwen3.5:latest to match the small-model-pipeline local-tier default.
 try {
 const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
 method: "POST",
 headers: { "Content-Type": "application/json" },
 body: JSON.stringify({
-model: "qwen2.5",
+model: "qwen3.5:latest",
 provider: "ollama",
 messages: [{
 role: "user",