diff --git a/config/providers.toml b/config/providers.toml
index 23acf45..13bdbce 100644
--- a/config/providers.toml
+++ b/config/providers.toml
@@ -35,7 +35,9 @@ default_model = "deepseek-v3.2"
 # includes deepseek-v3.2, deepseek-v4-{flash,pro}, gemini-3-flash-
 # preview, glm-{5,5.1}, kimi-k2.6, qwen3-coder-next.
 # 2026-04-28: default upgraded gpt-oss:120b → deepseek-v3.2 (newest
-# DeepSeek revision; kimi-k2:1t still upstream-broken with HTTP 500).
+# DeepSeek revision). NOTE: kimi-k2:1t is upstream-broken (HTTP 500
+# on Ollama Pro probe 2026-04-28) — do not route to it. Use kimi-k2.6
+# instead, which is what staffing_inference points at.
 
 [[provider]]
 name = "openrouter"
@@ -79,8 +81,10 @@ auth_env = "KIMI_API_KEY"
 default_model = "kimi-for-coding"
 # Direct Kimi For Coding provider. `api.kimi.com` is a SEPARATE account
 # system from `api.moonshot.ai` and `api.moonshot.cn` — keys are NOT
-# interchangeable. Used when Ollama Cloud's `kimi-k2:1t` is upstream-
-# broken and OpenRouter's `moonshotai/kimi-k2.6` is rate-limited.
+# interchangeable. Used as a fallback when Ollama Cloud's kimi-k2.6 is
+# unavailable and OpenRouter's `moonshotai/kimi-k2.6` is rate-limited.
+# (Was `kimi-k2:1t` here pre-2026-05-03 — that model is upstream-broken
+# and removed from operator guidance.)
 # Model id: `kimi-for-coding` (kimi-k2.6 underneath).
 # Key file: /etc/lakehouse/kimi.env (loaded via systemd EnvironmentFile).
 # Model-prefix routing: "kimi/" auto-routes here, prefix stripped.
diff --git a/crates/vectord-lance/src/lib.rs b/crates/vectord-lance/src/lib.rs
index 3a99b64..cd9ab60 100644
--- a/crates/vectord-lance/src/lib.rs
+++ b/crates/vectord-lance/src/lib.rs
@@ -620,13 +620,23 @@ mod tests {
     use super::*;
 
     fn temp_path(label: &str) -> String {
-        let n = std::time::SystemTime::now()
+        // Per-process atomic counter — guarantees uniqueness regardless
+        // of clock resolution or test scheduling. Combined with pid, the
+        // result is unique within and across processes for any practical
+        // test workload. Nanosecond timestamps were not enough on their
+        // own: opus WARN at lib.rs:622 from the 2026-05-02 scrum noted
+        // that under tokio scheduling, multiple tests in the same cargo
+        // process can hit the same nanos bucket.
+        use std::sync::atomic::{AtomicU64, Ordering};
+        static COUNTER: AtomicU64 = AtomicU64::new(0);
+        let seq = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let pid = std::process::id();
+        let nanos = std::time::SystemTime::now()
             .duration_since(std::time::UNIX_EPOCH)
             .map(|d| d.subsec_nanos())
             .unwrap_or(0);
-        let pid = std::process::id();
         std::env::temp_dir()
-            .join(format!("vlance_test_{label}_{pid}_{n}"))
+            .join(format!("vlance_test_{label}_{pid}_{nanos}_{seq}"))
             .to_string_lossy()
             .to_string()
     }
diff --git a/scripts/lance_smoke.sh b/scripts/lance_smoke.sh
index 404eb1f..b639a6d 100755
--- a/scripts/lance_smoke.sh
+++ b/scripts/lance_smoke.sh
@@ -33,9 +33,18 @@
 PROBE "gateway /v1/health responds" \
     bash -c "curl -sf -m 3 $GATEWAY/v1/health -o /dev/null"
 
 # ── 1. Search returns IVF_PQ results on existing dataset ────────
+# Capture curl status separately so a transport-level failure (gateway
+# down, network broken, timeout) shows up as its own probe — instead of
+# being swallowed by `|| echo '{}'` which would surface as the next jq
+# probe failing with a misleading "no method field" message. Per opus
+# INFO at lance_smoke.sh:38 from the 2026-05-02 scrum.
 RESP=$(curl -sS -m 30 -X POST "$PREFIX/search/$DATASET" \
     -H 'Content-Type: application/json' \
-    -d '{"query":"forklift operator","top_k":3}' 2>/dev/null || echo '{}')
+    -d '{"query":"forklift operator","top_k":3}' 2>/dev/null)
+CURL_RC=$?
+PROBE "search/$DATASET curl reachable (exit 0)" \
+    test "$CURL_RC" = "0"
+[ "$CURL_RC" != "0" ] && RESP='{}'
 PROBE "search/$DATASET returns top-3 lance_ivf_pq results" \
     bash -c "echo '$RESP' | jq -e '.method == \"lance_ivf_pq\" and (.results | length) == 3' >/dev/null"
diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts
index bf3a474..faeaa7d 100644
--- a/tests/real-world/scrum_master_pipeline.ts
+++ b/tests/real-world/scrum_master_pipeline.ts
@@ -37,6 +37,13 @@ const MAX_ATTEMPTS = 9;
 // crates//src/*.rs.
 const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
 const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);
+
+// Centralized to keep MAP and REDUCE phases in lockstep — diverging the two
+// breaks tree-split consistency (per-shard digests must come from the same
+// model the reducer collapses). Surfaced 2026-05-02 by kimi scrum WARN at
+// scrum_master_pipeline.ts:1143.
+const TREE_SPLIT_MODEL = "gemini-3-flash-preview";
+const TREE_SPLIT_PROVIDER = "ollama_cloud";
 // Same-model retry budget after observer rejection. After this many
 // quality rejects on the current model, advance to the next provider-
 // error fallback. Counts ONLY observer/quality rejects, not provider
@@ -1143,15 +1150,15 @@ Format each as a code-fenced block with the byte offset within the shard:
 EXACT LINE OF SOURCE — DO NOT PARAPHRASE, DO NOT TRUNCATE
 \`\`\`
 Pick the most reviewer-relevant lines: route definitions (e.g. \`@app.route(...)\`), function signatures, security-sensitive calls (auth/SQL/exec/template/secrets), hardcoded credentials/defaults, exception handlers, sensitive imports. The reviewer will REFUSE to act on any claim not backed by a verbatim anchor — so anchors are how you prove findings are real.`;
-    // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama
-    // Pro. Tree-split MAP fires once per shard (potentially 5-20×
-    // per file), so latency dominates total scrum time. Gemini 3
-    // flash returns shard digests substantially faster than the old
-    // 120B free model while staying strong enough for byte-anchored
+    // 2026-04-28: gpt-oss:120b → TREE_SPLIT_MODEL via Ollama Pro.
+    // Tree-split MAP fires once per shard (potentially 5-20× per
+    // file), so latency dominates total scrum time. Gemini 3 flash
+    // returns shard digests substantially faster than the old 120B
+    // free model while staying strong enough for byte-anchored
     // extraction.
     const r = await chat({
-      provider: "ollama_cloud",
-      model: "gemini-3-flash-preview",
+      provider: TREE_SPLIT_PROVIDER,
+      model: TREE_SPLIT_MODEL,
       prompt,
       max_tokens: 900,
     });
@@ -1201,14 +1208,14 @@ COPY EVERY anchor block from the piece notes IN ORDER, character-perfect. DO NOT
 Output the anchor blocks under their original \`\`\`@offset...\`\`\` fences, each on its own with a blank line between.
 
 The reviewer rejects findings that don't quote a string from this anchors block, so completeness here directly determines review quality.`;
-    // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama
-    // Pro. The reducer runs once per file (vs once per shard for MAP)
-    // but on a much larger context (all shard digests stacked), so
-    // throughput per token still matters. Same model as MAP for
-    // consistency in tree-split outputs.
+    // 2026-04-28: gpt-oss:120b → TREE_SPLIT_MODEL via Ollama Pro. The
+    // reducer runs once per file (vs once per shard for MAP) but on a
+    // much larger context (all shard digests stacked), so throughput
+    // per token still matters. Must match MAP model exactly — diverging
+    // the two breaks tree-split coherence.
     const reduced = await chat({
-      provider: "ollama_cloud",
-      model: "gemini-3-flash-preview",
+      provider: TREE_SPLIT_PROVIDER,
+      model: TREE_SPLIT_MODEL,
       prompt: reducePrompt,
       max_tokens: 2400,
     });
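
Verification sketch (not part of the patch): a minimal way to exercise the touched pieces after applying the diff, assuming the Cargo package under crates/vectord-lance is named vectord-lance and that lance_smoke.sh finds its GATEWAY/PREFIX/DATASET settings in the environment or in its own defaults (both are assumptions, not confirmed by the diff):

    # apply the patch from a saved file (filename is hypothetical)
    git apply lakehouse-followups.patch
    # re-run the crate tests that use the counter-based temp_path helper (package name assumed)
    cargo test -p vectord-lance
    # re-run the smoke probes; a curl transport failure now reports as its own probe
    bash scripts/lance_smoke.sh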