diff --git a/cmd/matrixd/main.go b/cmd/matrixd/main.go index dad6d30..68e681d 100644 --- a/cmd/matrixd/main.go +++ b/cmd/matrixd/main.go @@ -6,6 +6,8 @@ // GET /matrix/corpora — list known vectord indexes (proxy) // POST /matrix/relevance — adjacency-pollution filter (CODE-aware; // port of mcp-server/relevance.ts) +// POST /matrix/downgrade — strong-model auto-downgrade decision +// (port of mode.rs::execute pass5 gate) // // matrixd talks to embedd (for query-text embedding) and vectord // (for per-corpus search) via HTTP. Both URLs come from @@ -61,6 +63,7 @@ func (h *handlers) register(r chi.Router) { r.Post("/matrix/search", h.handleSearch) r.Get("/matrix/corpora", h.handleCorpora) r.Post("/matrix/relevance", h.handleRelevance) + r.Post("/matrix/downgrade", h.handleDowngrade) } func (h *handlers) handleSearch(w http.ResponseWriter, r *http.Request) { @@ -101,6 +104,35 @@ func (h *handlers) handleRelevance(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, res) } +// downgradeRequest is the POST /matrix/downgrade body. Mirrors +// matrix.DowngradeInput; ForceFullOverride is read from the +// LH_FORCE_FULL_ENRICHMENT env var when omitted from the body. +type downgradeRequest struct { + Mode string `json:"mode"` + Model string `json:"model"` + ForcedMode bool `json:"forced_mode,omitempty"` + ForceFullOverride *bool `json:"force_full_override,omitempty"` +} + +// handleDowngrade serves POST /matrix/downgrade. It returns early when +// decodeJSON reports failure, rejects an empty mode or model with 400, +// seeds the gate input via matrix.NewDowngradeInputFromEnv, lets a +// force_full_override present in the body replace the env-derived value, +// and responds 200 with the result of matrix.MaybeDowngrade. +func (h *handlers) handleDowngrade(w http.ResponseWriter, r *http.Request) { + var req downgradeRequest + if !decodeJSON(w, r, &req) { + return + } + if req.Mode == "" || req.Model == "" { + http.Error(w, "mode and model are required", http.StatusBadRequest) + return + } + in := matrix.NewDowngradeInputFromEnv(req.Mode, req.Model, req.ForcedMode) + if req.ForceFullOverride != nil { + // Explicit body override beats env, useful for tooling that + // wants to ask "what would the gate do under these conditions" + // without env pollution. 
+ in.ForceFullOverride = *req.ForceFullOverride + } + writeJSON(w, http.StatusOK, matrix.MaybeDowngrade(in)) +} + func (h *handlers) handleCorpora(w http.ResponseWriter, r *http.Request) { names, err := h.r.Corpora(r.Context()) if err != nil { diff --git a/internal/matrix/downgrade.go b/internal/matrix/downgrade.go new file mode 100644 index 0000000..9d039f8 --- /dev/null +++ b/internal/matrix/downgrade.go @@ -0,0 +1,137 @@ +package matrix + +// Strong-model auto-downgrade gate. Port of mode.rs::execute's +// downgrade block (Rust system, 2026-04-26 pass5). +// +// What it does: if the caller resolves `codereview_lakehouse` against +// a strong model and didn't force the mode, flip to +// `codereview_isolation` so we don't pollute the prompt with matrix +// chunks the model demonstrably does better without. +// +// Why: pass5 variance test on x-ai/grok-4.1-fast — composing matrix +// corpora into codereview_lakehouse LOST 5/5 head-to-head reps +// against matrix-free codereview_isolation, p=0.031. Strong models +// have enough native capacity that bug fingerprints + adversarial +// framing + file content carry them; matrix chunks displace +// depth-of-analysis. +// +// Defaults: assume "strong" (the gate fires and matrix is dropped). The +// explicit IsWeakModel predicate keeps the weak-list small — anything +// matching `:free` (OpenRouter free tier) or the local last-resort +// rungs (exactly `qwen3.5:latest` / `qwen3:latest`) stays on the full +// lakehouse path where matrix demonstrably helped during the +// 2026-04-26 free-tier bake-off. + +import ( + "os" + "strings" +) + +// Mode constants — exported so callers don't string-literal them. +const ( + ModeCodeReviewLakehouse = "codereview_lakehouse" + ModeCodeReviewIsolation = "codereview_isolation" +) + +// EnvForceFullEnrichment is the env var that bypasses the gate for +// diagnostic runs ("LH_FORCE_FULL_ENRICHMENT=1" or "true"; the value is +// whitespace-trimmed and matched case-insensitively). 
const EnvForceFullEnrichment = "LH_FORCE_FULL_ENRICHMENT"

// IsWeakModel reports whether model is on the short "weak" list, i.e. one
// of the models that matrix-corpus composition demonstrably helped during
// the 2026-04-26 pass5 bake-off. Any model not on the list is treated as
// strong, and strong models get matrix dropped to avoid the "composed lost
// 5/5 vs isolation" effect.
//
// A model is weak when any one of these holds:
//   - it is exactly `qwen3.5:latest` or `qwen3:latest` (local last-resort rung)
//   - it ends in `:free` (OpenRouter free tier, e.g. `gpt-oss-120b:free`)
//   - it contains `:free/` (routing-prefixed names like `or:free/x`)
//
// Matching is literal and case-sensitive; the empty string is not weak.
// Extend the list only alongside variance data that justifies the addition.
func IsWeakModel(model string) bool {
	// Local rungs are matched by their exact tag, so a sibling such as
	// "qwen3-coder:480b" is not caught.
	if model == "qwen3.5:latest" || model == "qwen3:latest" {
		return true
	}
	freeSuffix := strings.HasSuffix(model, ":free")
	freeInfix := strings.Contains(model, ":free/")
	return freeSuffix || freeInfix
}

// DowngradeInput carries everything MaybeDowngrade looks at.
//
// Mode and Model are the resolved mode name and model identifier.
//
// ForcedMode is set when the caller explicitly chose the mode (mirrors
// Rust's req.force_mode.is_some()). It is treated as opt-in to that exact
// mode and skips the downgrade, because experiments need exact-mode control.
//
// ForceFullOverride is the LH_FORCE_FULL_ENRICHMENT escape hatch. It is
// usually filled from the env var by NewDowngradeInputFromEnv, but it is an
// explicit field so a config or a test can set it deterministically.
type DowngradeInput struct {
	Mode              string
	Model             string
	ForcedMode        bool
	ForceFullOverride bool
}

// DowngradeDecision is the gate's result.
//
// Mode is the mode to run. DowngradedFrom is non-empty only when a
// downgrade fired, and callers should record it for audit (it matches the
// Rust EnrichmentSources.downgraded_from field). Reason is a short
// human-readable string for logs and responses, useful when debugging why
// the gate did or did not fire.
type DowngradeDecision struct {
	Mode           string `json:"mode"`
	DowngradedFrom string `json:"downgraded_from,omitempty"`
	Reason         string `json:"reason"`
}

// MaybeDowngrade applies the strong-model auto-downgrade gate.
+// Pure function; no env reads. For env-driven callers see +// NewDowngradeInputFromEnv. +func MaybeDowngrade(in DowngradeInput) DowngradeDecision { + out := DowngradeDecision{Mode: in.Mode} + if in.Mode != ModeCodeReviewLakehouse { + out.Reason = "mode is not " + ModeCodeReviewLakehouse + "; gate not applicable" + return out + } + if in.ForcedMode { + out.Reason = "caller forced mode; skip downgrade" + return out + } + if in.ForceFullOverride { + out.Reason = EnvForceFullEnrichment + " bypass" + return out + } + if IsWeakModel(in.Model) { + out.Reason = "weak model; matrix composition demonstrably helped (2026-04-26 free-tier bake-off)" + return out + } + // Downgrade fires. + out.Mode = ModeCodeReviewIsolation + out.DowngradedFrom = ModeCodeReviewLakehouse + out.Reason = "strong model; matrix composes anti-additively (pass5: composed lost 5/5 vs isolation on grok-4.1-fast, p=0.031)" + return out +} + +// NewDowngradeInputFromEnv is a convenience that reads +// LH_FORCE_FULL_ENRICHMENT from the process environment and returns +// a populated DowngradeInput. Most production callers want this; +// tests should construct DowngradeInput directly to avoid env +// pollution. 
+func NewDowngradeInputFromEnv(mode, model string, forcedMode bool) DowngradeInput { + return DowngradeInput{ + Mode: mode, + Model: model, + ForcedMode: forcedMode, + ForceFullOverride: envForceFullEnrichment(), + } +} + +// envForceFullEnrichment reports whether the LH_FORCE_FULL_ENRICHMENT env +// var is set to "1" or "true", ignoring surrounding whitespace and letter +// case. Any other value, including unset, yields false. +func envForceFullEnrichment() bool { + v := strings.ToLower(strings.TrimSpace(os.Getenv(EnvForceFullEnrichment))) + return v == "1" || v == "true" +} diff --git a/internal/matrix/downgrade_test.go b/internal/matrix/downgrade_test.go new file mode 100644 index 0000000..e60b8ca --- /dev/null +++ b/internal/matrix/downgrade_test.go @@ -0,0 +1,100 @@ +package matrix + +import "testing" + +// TestIsWeakModel pins IsWeakModel's classification for representative +// weak and strong model names, including the empty string. +func TestIsWeakModel(t *testing.T) { + cases := []struct { + model string + weak bool + }{ + // :free suffix → weak + {"openai/gpt-4o:free", true}, + {"meta-llama/llama-3-8b:free", true}, + // :free/ infix (routing-prefixed names) + {"openrouter:free/anthropic/claude-3.5-sonnet", true}, + // Local last-resort rungs + {"qwen3.5:latest", true}, + {"qwen3:latest", true}, + // Strong by default + {"x-ai/grok-4.1-fast", false}, + {"opencode/claude-opus-4-7", false}, + {"openai/gpt-5", false}, + {"qwen3-coder:480b", false}, // not the :latest tag + {"", false}, + } + for _, c := range cases { + got := IsWeakModel(c.model) + if got != c.weak { + t.Errorf("IsWeakModel(%q): want %v, got %v", c.model, c.weak, got) + } + } +} + +// TestMaybeDowngrade_TruthTable checks Mode and DowngradedFrom for each +// gate outcome and requires a non-empty Reason in every case; the exact +// Reason text is not compared. +func TestMaybeDowngrade_TruthTable(t *testing.T) { + cases := []struct { + name string + in DowngradeInput + want DowngradeDecision + }{ + { + name: "downgrade fires: lakehouse mode + strong model + no force", + in: DowngradeInput{Mode: ModeCodeReviewLakehouse, Model: "x-ai/grok-4.1-fast"}, + want: DowngradeDecision{ + Mode: ModeCodeReviewIsolation, + DowngradedFrom: ModeCodeReviewLakehouse, + }, + }, + { + name: "no downgrade: forced mode bypasses gate", + in: DowngradeInput{Mode: ModeCodeReviewLakehouse, Model: "x-ai/grok-4.1-fast", ForcedMode: true}, + want: DowngradeDecision{Mode: ModeCodeReviewLakehouse}, + }, + { + name: "no downgrade: env 
override bypasses gate", + in: DowngradeInput{Mode: ModeCodeReviewLakehouse, Model: "x-ai/grok-4.1-fast", ForceFullOverride: true}, + want: DowngradeDecision{Mode: ModeCodeReviewLakehouse}, + }, + { + name: "no downgrade: weak model keeps lakehouse", + in: DowngradeInput{Mode: ModeCodeReviewLakehouse, Model: "openai/gpt-4o:free"}, + want: DowngradeDecision{Mode: ModeCodeReviewLakehouse}, + }, + { + name: "no downgrade: non-lakehouse mode (gate not applicable)", + in: DowngradeInput{Mode: "codereview_isolation", Model: "x-ai/grok-4.1-fast"}, + want: DowngradeDecision{Mode: "codereview_isolation"}, + }, + } + for _, c := range cases { + got := MaybeDowngrade(c.in) + if got.Mode != c.want.Mode { + t.Errorf("%s: Mode want %q, got %q", c.name, c.want.Mode, got.Mode) + } + if got.DowngradedFrom != c.want.DowngradedFrom { + t.Errorf("%s: DowngradedFrom want %q, got %q", c.name, c.want.DowngradedFrom, got.DowngradedFrom) + } + if got.Reason == "" { + t.Errorf("%s: Reason should be non-empty", c.name) + } + } +} + +// TestMaybeDowngrade_ForcedTrumpsOthers verifies precedence: when +// multiple bypass conditions hit, ForcedMode wins (explicit caller +// intent always overrides). Caught a subtle ordering bug in the +// original Rust code where this was tested only by happy path. 
+func TestMaybeDowngrade_ForcedTrumpsOthers(t *testing.T) { + in := DowngradeInput{ + Mode: ModeCodeReviewLakehouse, + Model: "qwen3.5:latest", // weak — would otherwise hit weak-bypass + ForcedMode: true, + ForceFullOverride: true, + } + got := MaybeDowngrade(in) + if got.Mode != ModeCodeReviewLakehouse { + t.Errorf("forced mode should keep mode: got %q", got.Mode) + } + if got.DowngradedFrom != "" { + t.Errorf("no downgrade expected; got DowngradedFrom=%q", got.DowngradedFrom) + } + // All three bypasses set here yield the same Mode and DowngradedFrom, + // so Reason is the only field that shows the ForcedMode branch ran + // first. Without this check the test passes under any ordering. + if want := "caller forced mode; skip downgrade"; got.Reason != want { + t.Errorf("ForcedMode should win precedence: Reason want %q, got %q", want, got.Reason) + } +} diff --git a/scripts/downgrade_smoke.sh b/scripts/downgrade_smoke.sh new file mode 100755 index 0000000..6bed95b --- /dev/null +++ b/scripts/downgrade_smoke.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash +# Downgrade smoke — strong-model auto-downgrade gate via matrixd. +# All assertions go through gateway :3110 → /v1/matrix/downgrade. +# +# Validates the 5-row truth table from mode.rs::execute pass5 (checks 1-5) +# plus one negative-path check (6): +# 1. Lakehouse + strong + no force → DOWNGRADE +# 2. Lakehouse + strong + forced_mode=true → keep +# 3. Lakehouse + strong + force_full_override → keep +# 4. Lakehouse + weak (qwen3.5:latest) → keep +# 5. Non-lakehouse mode → gate not applicable +# 6. Negative path: empty mode → 400 + +set -euo pipefail +cd "$(dirname "$0")/.." + +export PATH="$PATH:/usr/local/go/bin" + +echo "[downgrade-smoke] building matrixd + vectord + gateway..." +go build -o bin/ ./cmd/matrixd ./cmd/vectord ./cmd/gateway + +pkill -f "bin/(matrixd|vectord|gateway)" 2>/dev/null || true +sleep 0.3 + +PIDS=() +TMP="$(mktemp -d)" +CFG="$TMP/downgrade.toml" + +cleanup() { + echo "[downgrade-smoke] cleanup" + for p in "${PIDS[@]}"; do [ -n "$p" ] && kill "$p" 2>/dev/null || true; done + rm -rf "$TMP" +} +trap cleanup EXIT INT TERM + +cat > "$CFG" </dev/null 2>&1; then return 0; fi + sleep 0.05 + done + return 1 +} + +echo "[downgrade-smoke] launching vectord → matrixd → gateway..." +./bin/vectord -config "$CFG" > /tmp/vectord.log 2>&1 & +PIDS+=($!) 
+poll_health 3215 || { echo "vectord failed"; exit 1; } + +./bin/matrixd -config "$CFG" > /tmp/matrixd.log 2>&1 & +PIDS+=($!) +poll_health 3218 || { echo "matrixd failed"; exit 1; } + +./bin/gateway -config "$CFG" > /tmp/gateway.log 2>&1 & +PIDS+=($!) +poll_health 3110 || { echo "gateway failed"; exit 1; } + +FAILED=0 +URL=http://127.0.0.1:3110/v1/matrix/downgrade + +# Helper: POST the JSON body in $1 to $URL and print the raw response; each +# check below pulls mode/downgraded_from/reason out of it with jq. +post() { + curl -sS -X POST "$URL" -H 'Content-Type: application/json' -d "$1" +} + +# ── 1. Downgrade fires ─────────────────────────────────────────── +echo "[downgrade-smoke] strong model + no force → downgrade fires:" +RESP="$(post '{"mode":"codereview_lakehouse","model":"x-ai/grok-4.1-fast"}')" +M="$(echo "$RESP" | jq -r '.mode')" +D="$(echo "$RESP" | jq -r '.downgraded_from')" +if [ "$M" = "codereview_isolation" ] && [ "$D" = "codereview_lakehouse" ]; then + echo " ✓ codereview_lakehouse → codereview_isolation (downgraded_from=lakehouse)" +else + echo " ✗ mode=$M downgraded_from=$D"; FAILED=1 +fi + +# ── 2. Forced mode bypasses ────────────────────────────────────── +echo "[downgrade-smoke] forced_mode=true bypasses:" +RESP="$(post '{"mode":"codereview_lakehouse","model":"x-ai/grok-4.1-fast","forced_mode":true}')" +M="$(echo "$RESP" | jq -r '.mode')" +D="$(echo "$RESP" | jq -r '.downgraded_from // ""')" +if [ "$M" = "codereview_lakehouse" ] && [ "$D" = "" ]; then + echo " ✓ caller-forced mode preserved, no downgrade" +else + echo " ✗ mode=$M downgraded_from=$D"; FAILED=1 +fi + +# ── 3. 
force_full_override bypasses ────────────────────────────── +echo "[downgrade-smoke] force_full_override=true bypasses:" +RESP="$(post '{"mode":"codereview_lakehouse","model":"x-ai/grok-4.1-fast","force_full_override":true}')" +M="$(echo "$RESP" | jq -r '.mode')" +D="$(echo "$RESP" | jq -r '.downgraded_from // ""')" +if [ "$M" = "codereview_lakehouse" ] && [ "$D" = "" ]; then + echo " ✓ env-override bypass, no downgrade" +else + echo " ✗ mode=$M downgraded_from=$D"; FAILED=1 +fi + +# ── 4. Weak model bypasses ─────────────────────────────────────── +echo "[downgrade-smoke] weak model (qwen3.5:latest) bypasses:" +RESP="$(post '{"mode":"codereview_lakehouse","model":"qwen3.5:latest"}')" +M="$(echo "$RESP" | jq -r '.mode')" +D="$(echo "$RESP" | jq -r '.downgraded_from // ""')" +if [ "$M" = "codereview_lakehouse" ] && [ "$D" = "" ]; then + echo " ✓ weak model keeps lakehouse" +else + echo " ✗ mode=$M downgraded_from=$D"; FAILED=1 +fi + +# ── 5. Non-lakehouse mode → gate not applicable ────────────────── +echo "[downgrade-smoke] non-lakehouse mode → gate not applicable:" +RESP="$(post '{"mode":"codereview_isolation","model":"x-ai/grok-4.1-fast"}')" +M="$(echo "$RESP" | jq -r '.mode')" +D="$(echo "$RESP" | jq -r '.downgraded_from // ""')" +R="$(echo "$RESP" | jq -r '.reason')" +if [ "$M" = "codereview_isolation" ] && [ "$D" = "" ] && echo "$R" | grep -q "not applicable"; then + echo " ✓ codereview_isolation passes through unchanged" +else + echo " ✗ mode=$M downgraded_from=$D reason='$R'"; FAILED=1 +fi + +# ── 6. 
Negative: empty mode → 400 ──────────────────────────────── +echo "[downgrade-smoke] empty mode → 400:" +HTTP="$(curl -sS -o /dev/null -w '%{http_code}' -X POST "$URL" \ + -H 'Content-Type: application/json' -d '{"mode":"","model":"x"}')" +if [ "$HTTP" = "400" ]; then + echo " ✓ empty mode → 400" +else + echo " ✗ got $HTTP"; FAILED=1 +fi + +if [ "$FAILED" -eq 0 ]; then + echo "[downgrade-smoke] Downgrade gate acceptance: PASSED" + exit 0 +else + echo "[downgrade-smoke] Downgrade gate acceptance: FAILED" + exit 1 +fi