diff --git a/cmd/matrixd/main.go b/cmd/matrixd/main.go index e67c49e..4698d38 100644 --- a/cmd/matrixd/main.go +++ b/cmd/matrixd/main.go @@ -52,7 +52,7 @@ func main() { } retriever := matrix.New(cfg.Matrixd.EmbeddURL, cfg.Matrixd.VectordURL) - h := &handlers{r: retriever} + h := &handlers{r: retriever, weakModels: cfg.Models.WeakModels} if err := shared.Run("matrixd", cfg.Matrixd.Bind, h.register, cfg.Auth); err != nil { slog.Error("server", "err", err) @@ -62,6 +62,11 @@ func main() { type handlers struct { r *matrix.Retriever + // weakModels comes from cfg.Models.WeakModels at startup. Threaded + // into every DowngradeInput so the gate uses the configured list + // instead of matrix.DefaultWeakModels. Only a nil list falls back to + // the package default (a non-nil empty list does not) — matches pre-Phase 2 behavior. + weakModels []string } func (h *handlers) register(r chi.Router) { @@ -240,6 +245,7 @@ func (h *handlers) handleDowngrade(w http.ResponseWriter, r *http.Request) { // without env pollution. in.ForceFullOverride = *req.ForceFullOverride } + in.WeakModels = h.weakModels writeJSON(w, http.StatusOK, matrix.MaybeDowngrade(in)) } diff --git a/cmd/observerd/main.go b/cmd/observerd/main.go index 25cd309..a95239c 100644 --- a/cmd/observerd/main.go +++ b/cmd/observerd/main.go @@ -74,7 +74,7 @@ func main() { // to gateway's matrixd_url so a single-toml deploy works without // duplicating the address. 
matrixdURL := cfg.Gateway.MatrixdURL - registerBuiltinModes(runner, matrixdURL) + registerBuiltinModes(runner, matrixdURL, cfg.Models.WeakModels) h := &handlers{store: store, runner: runner} if err := shared.Run("observerd", cfg.Observerd.Bind, h.register, cfg.Auth); err != nil { @@ -204,7 +204,7 @@ func summarizeOutput(output map[string]any) string { // - playbook.record (HTTP to matrixd) // - playbook.lookup (HTTP to matrixd) // - llm.chat (HTTP to gateway /v1/chat) -func registerBuiltinModes(r *workflow.Runner, matrixdURL string) { +func registerBuiltinModes(r *workflow.Runner, matrixdURL string, weakModels []string) { // Fixture modes for runner mechanics smokes. r.RegisterMode("fixture.echo", func(_ workflow.Context, input map[string]any) (map[string]any, error) { out := make(map[string]any, len(input)) @@ -220,7 +220,10 @@ func registerBuiltinModes(r *workflow.Runner, matrixdURL string) { // Real modes — pure-function wrappers (no I/O). r.RegisterMode("matrix.relevance", workflow.MatrixRelevance) - r.RegisterMode("matrix.downgrade", workflow.MatrixDowngrade) + // matrix.downgrade reads weakModels from config — Phase 2. + // A nil list falls back to matrix.DefaultWeakModels inside + // matrix.MaybeDowngrade; a non-nil empty list does not. + r.RegisterMode("matrix.downgrade", workflow.MatrixDowngradeWithWeakList(weakModels)) r.RegisterMode("distillation.score", workflow.DistillationScore) r.RegisterMode("drift.scorer", workflow.DriftScorer) diff --git a/internal/matrix/downgrade.go b/internal/matrix/downgrade.go index 9d039f8..5e10ded 100644 --- a/internal/matrix/downgrade.go +++ b/internal/matrix/downgrade.go @@ -37,6 +37,12 @@ const ( // diagnostic runs ("LH_FORCE_FULL_ENRICHMENT=1" or "true"). const EnvForceFullEnrichment = "LH_FORCE_FULL_ENRICHMENT" +// DefaultWeakModels is the fallback weak-model list used when no +// config-driven list is provided. 
Matches the pre-config hardcoded +// switch — preserved for backward compatibility with callers that +// haven't migrated to the [models] tier config (Phase 1). +var DefaultWeakModels = []string{"qwen3.5:latest", "qwen3:latest"} + // IsWeakModel returns true for models matrix-corpus composition // demonstrably helped during the 2026-04-26 pass5 bake-off. Strong // models (default) get matrix dropped to avoid the "composed lost @@ -45,17 +51,29 @@ const EnvForceFullEnrichment = "LH_FORCE_FULL_ENRICHMENT" // Weak signals: // - `:free` suffix (OpenRouter free tier, e.g. `gpt-oss-120b:free`) // - `:free/` infix (handles routing-prefixed names like `or:free/x`) -// - `qwen3.5:latest` / `qwen3:latest` — local last-resort rung +// - DefaultWeakModels list — local-hot-path eligible. // -// Add new weak models by extending this function alongside variance -// data that justifies it. +// For config-driven lists, use IsWeakModelInList. func IsWeakModel(model string) bool { + return IsWeakModelInList(model, DefaultWeakModels) +} + +// IsWeakModelInList is the config-aware variant. Callers pass the +// configured weak-model list (typically cfg.Models.WeakModels) and +// the function applies the same rule-based checks (`:free` suffix / +// infix) plus the literal list match. +// +// Empty list still applies the rule-based checks — only the literal +// match becomes a no-op. That preserves "free-tier always weak" +// regardless of what the operator configured. +func IsWeakModelInList(model string, weakList []string) bool { if strings.HasSuffix(model, ":free") || strings.Contains(model, ":free/") { return true } - switch model { - case "qwen3.5:latest", "qwen3:latest": - return true + for _, w := range weakList { + if w == model { + return true + } } return false } @@ -75,6 +93,10 @@ type DowngradeInput struct { Model string ForcedMode bool ForceFullOverride bool + // WeakModels overrides the default weak-model list (DefaultWeakModels) + // when non-nil. 
Typically populated from cfg.Models.WeakModels at + // startup. nil = use DefaultWeakModels (backward compat). + WeakModels []string } // DowngradeDecision is the output. DowngradedFrom is non-empty @@ -106,7 +128,11 @@ func MaybeDowngrade(in DowngradeInput) DowngradeDecision { out.Reason = EnvForceFullEnrichment + " bypass" return out } - if IsWeakModel(in.Model) { + weakList := in.WeakModels + if weakList == nil { + weakList = DefaultWeakModels + } + if IsWeakModelInList(in.Model, weakList) { out.Reason = "weak model; matrix composition demonstrably helped (2026-04-26 free-tier bake-off)" return out } diff --git a/internal/matrix/downgrade_test.go b/internal/matrix/downgrade_test.go index e60b8ca..b27266e 100644 --- a/internal/matrix/downgrade_test.go +++ b/internal/matrix/downgrade_test.go @@ -79,6 +79,68 @@ func TestMaybeDowngrade_TruthTable(t *testing.T) { } } +// TestIsWeakModelInList covers the config-driven variant added in +// Phase 2 — callers pass cfg.Models.WeakModels instead of relying on +// the package-level DefaultWeakModels. +func TestIsWeakModelInList(t *testing.T) { + custom := []string{"my-custom-7b:latest", "qwen3:latest"} + cases := []struct { + model string + list []string + weak bool + }{ + // Custom literal hits (config-driven) + {"my-custom-7b:latest", custom, true}, + {"qwen3:latest", custom, true}, + // qwen3.5:latest is in DefaultWeakModels but NOT in custom list → + // not weak when caller supplies custom list (intentional — operator + // owns the list). 
+ {"qwen3.5:latest", custom, false}, + // :free/free-tier rules apply regardless of list + {"openai/gpt-4o:free", custom, true}, + {"openrouter:free/anthropic/claude", custom, true}, + // Empty list → only rule-based checks apply + {"qwen3.5:latest", []string{}, false}, + {"openai/gpt-4o:free", []string{}, true}, + // nil list behaves like empty (caller hasn't migrated yet) + {"qwen3.5:latest", nil, false}, + } + for _, c := range cases { + got := IsWeakModelInList(c.model, c.list) + if got != c.weak { + t.Errorf("IsWeakModelInList(%q, %v): want %v, got %v", c.model, c.list, c.weak, got) + } + } +} + +// TestMaybeDowngrade_WithConfigList verifies the DowngradeInput.WeakModels +// field overrides DefaultWeakModels when populated, so callers reading +// cfg.Models.WeakModels at startup get the configured behavior end-to-end. +func TestMaybeDowngrade_WithConfigList(t *testing.T) { + // Config says "only my-custom is weak". qwen3.5:latest is now strong + // (operator opted out of the local-hot-path treatment). + configList := []string{"my-custom:latest"} + in := DowngradeInput{ + Mode: ModeCodeReviewLakehouse, + Model: "qwen3.5:latest", + WeakModels: configList, + } + got := MaybeDowngrade(in) + if got.Mode != ModeCodeReviewIsolation { + t.Errorf("qwen3.5:latest with config list excluding it should downgrade; got Mode=%q", got.Mode) + } + if got.DowngradedFrom != ModeCodeReviewLakehouse { + t.Errorf("DowngradedFrom should be lakehouse; got %q", got.DowngradedFrom) + } + + // Same input but weak by config — gate must NOT fire. + in.Model = "my-custom:latest" + got = MaybeDowngrade(in) + if got.Mode != ModeCodeReviewLakehouse { + t.Errorf("my-custom:latest is in config list — should stay lakehouse; got %q", got.Mode) + } +} + // TestMaybeDowngrade_ForcedTrumpsOthers verifies precedence: when // multiple bypass conditions hit, ForcedMode wins (explicit caller // intent always overrides). 
Caught a subtle ordering bug in the diff --git a/internal/workflow/modes.go b/internal/workflow/modes.go index 59f4ca2..bdd3342 100644 --- a/internal/workflow/modes.go +++ b/internal/workflow/modes.go @@ -96,6 +96,40 @@ func MatrixDowngrade(_ Context, input map[string]any) (map[string]any, error) { }, nil } +// MatrixDowngradeWithWeakList is the config-driven variant of +// MatrixDowngrade — callers pass cfg.Models.WeakModels at startup +// and the closure includes that list in every DowngradeInput. +// A nil list falls back to matrix.DefaultWeakModels (matching the +// plain MatrixDowngrade behavior); a non-nil empty list does not. +func MatrixDowngradeWithWeakList(weakModels []string) Mode { + return func(_ Context, input map[string]any) (map[string]any, error) { + var req struct { + Mode string `json:"mode"` + Model string `json:"model"` + ForcedMode bool `json:"forced_mode"` + ForceFullOverride bool `json:"force_full_override"` + } + if err := remarshalInput(input, &req); err != nil { + return nil, fmt.Errorf("matrix.downgrade: %w", err) + } + if req.Mode == "" || req.Model == "" { + return nil, fmt.Errorf("matrix.downgrade: mode and model are required") + } + dec := matrix.MaybeDowngrade(matrix.DowngradeInput{ + Mode: req.Mode, + Model: req.Model, + ForcedMode: req.ForcedMode, + ForceFullOverride: req.ForceFullOverride, + WeakModels: weakModels, + }) + return map[string]any{ + "mode": dec.Mode, + "downgraded_from": dec.DowngradedFrom, + "reason": dec.Reason, + }, nil + } +} + // DistillationScore wraps distillation.ScoreRecord — re-runs the // scorer over a single EvidenceRecord. Useful as a workflow node // that grades a freshly-produced evidence row.