Migrate the strong-model auto-downgrade gate from a hardcoded weak list to cfg.Models.WeakModels.

Backward compatible: the existing API is preserved, and callers that don't migrate keep using DefaultWeakModels.

Changes:
- internal/matrix/downgrade.go: split IsWeakModel into a rule-based base (`:free` suffix/infix) plus a literal-list lookup. The new IsWeakModelInList(model, list) takes the config-supplied list. DowngradeInput grows a WeakModels field; nil falls back to DefaultWeakModels (preserves pre-phase-2 behavior).
- internal/workflow/modes.go: add a MatrixDowngradeWithWeakList(list) factory mirroring MatrixSearch's pattern. Plain MatrixDowngrade is kept for backward compatibility.
- cmd/matrixd/main.go: the handlers struct holds weakModels, populated from cfg.Models.WeakModels at startup; handleDowngrade threads it into every DowngradeInput.
- cmd/observerd/main.go: registerBuiltinModes accepts weakModels and uses the factory variant. observerd reads cfg.Models.WeakModels in main().

End-to-end verified: the downgrade, matrix, observer, and workflow smokes all pass. The existing TestMaybeDowngrade_TruthTable and TestIsWeakModel are unchanged (backward compat). Two new tests cover the config path:
- TestIsWeakModelInList — covers rule + literal + empty + nil.
- TestMaybeDowngrade_WithConfigList — verifies the cfg list overrides the default.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
164 lines
6.0 KiB
Go
164 lines
6.0 KiB
Go
package matrix
|
|
|
|
// Strong-model auto-downgrade gate. Port of the downgrade block in
// mode.rs::execute (Rust system, 2026-04-26 pass5).
//
// What it does: if the caller resolves `codereview_lakehouse` against
// a strong model and didn't force the mode, flip to
// `codereview_isolation` so we don't pollute the prompt with matrix
// chunks the model demonstrably does better without.
//
// Why: in the pass5 variance test on x-ai/grok-4.1-fast, composing
// matrix corpora into codereview_lakehouse LOST 5/5 head-to-head reps
// against matrix-free codereview_isolation (p=0.031). Strong models
// have enough native capacity that bug fingerprints + adversarial
// framing + file content carry them; matrix chunks displace
// depth-of-analysis.
//
// Default: every model is assumed "strong" (the gate fires and the
// matrix is dropped) unless it is explicitly weak. The IsWeakModel
// predicate keeps the weak list small — anything matching `:free`
// (OpenRouter free tier) or the local last-resort rungs
// (qwen3.5:latest / qwen3:latest by default) stays on the full
// lakehouse path, where matrix demonstrably helped during the
// 2026-04-26 free-tier bake-off.
|
|
|
|
import (
|
|
"os"
|
|
"strings"
|
|
)
|
|
|
|
// Mode constants — exported so callers don't string-literal them.
const (
	// ModeCodeReviewLakehouse is the only mode the downgrade gate applies
	// to; MaybeDowngrade leaves every other mode untouched.
	ModeCodeReviewLakehouse = "codereview_lakehouse"
	// ModeCodeReviewIsolation is the mode MaybeDowngrade switches to when
	// the gate fires.
	ModeCodeReviewIsolation = "codereview_isolation"
)
|
|
|
|
// EnvForceFullEnrichment is the env var that bypasses the gate for
// diagnostic runs. The bypass is enabled when the value, after trimming
// surrounding whitespace and lower-casing, is "1" or "true"
// (e.g. LH_FORCE_FULL_ENRICHMENT=1).
const EnvForceFullEnrichment = "LH_FORCE_FULL_ENRICHMENT"
|
|
|
|
// DefaultWeakModels is the fallback weak-model list used when no
// config-driven list is provided. Matches the pre-config hardcoded
// switch — preserved for backward compatibility with callers that
// haven't migrated to the [models] tier config (Phase 1).
//
// The slice is shared package state: IsWeakModel and MaybeDowngrade
// read it directly, so treat it as read-only. Callers that need a
// different list should pass their own to IsWeakModelInList or set
// DowngradeInput.WeakModels instead of mutating this value.
var DefaultWeakModels = []string{"qwen3.5:latest", "qwen3:latest"}
|
|
|
|
// IsWeakModel returns true for models matrix-corpus composition
|
|
// demonstrably helped during the 2026-04-26 pass5 bake-off. Strong
|
|
// models (default) get matrix dropped to avoid the "composed lost
|
|
// 5/5 vs isolation" effect.
|
|
//
|
|
// Weak signals:
|
|
// - `:free` suffix (OpenRouter free tier, e.g. `gpt-oss-120b:free`)
|
|
// - `:free/` infix (handles routing-prefixed names like `or:free/x`)
|
|
// - DefaultWeakModels list — local-hot-path eligible.
|
|
//
|
|
// For config-driven lists, use IsWeakModelInList.
|
|
func IsWeakModel(model string) bool {
|
|
return IsWeakModelInList(model, DefaultWeakModels)
|
|
}
|
|
|
|
// IsWeakModelInList is the config-aware variant. Callers pass the
// configured weak-model list (typically cfg.Models.WeakModels) and
// the function applies the same rule-based checks (`:free` suffix /
// infix) plus the literal list match.
//
// The literal match is an exact, case-sensitive string comparison
// against each entry of weakList.
//
// A nil or empty list still applies the rule-based checks — only the
// literal match becomes a no-op. That preserves "free-tier always
// weak" regardless of what the operator configured.
func IsWeakModelInList(model string, weakList []string) bool {
	// Rule-based signals come first so they hold for any list.
	if strings.HasSuffix(model, ":free") || strings.Contains(model, ":free/") {
		return true
	}
	for _, w := range weakList {
		if w == model {
			return true
		}
	}
	return false
}
|
|
|
|
// DowngradeInput is what MaybeDowngrade evaluates.
type DowngradeInput struct {
	// Mode is the mode the caller resolved. The gate only acts when it
	// equals ModeCodeReviewLakehouse.
	Mode string
	// Model is the model name checked against the weak-model rules.
	Model string
	// ForcedMode reports that the caller explicitly set their mode
	// (mirrors Rust's req.force_mode.is_some()). It is treated as an
	// opt-in to the chosen mode and skips the downgrade; experiments
	// need exact-mode control.
	ForcedMode bool
	// ForceFullOverride is the LH_FORCE_FULL_ENRICHMENT escape hatch —
	// usually populated from the env var via NewDowngradeInputFromEnv,
	// but the field is explicit so callers can pass it from a config or
	// test deterministically.
	ForceFullOverride bool
	// WeakModels overrides the default weak-model list (DefaultWeakModels)
	// when non-nil. Typically populated from cfg.Models.WeakModels at
	// startup. nil = use DefaultWeakModels (backward compat). A non-nil
	// empty slice is honored as "no literal weak models".
	WeakModels []string
}
|
|
|
|
// DowngradeDecision is the output of MaybeDowngrade.
type DowngradeDecision struct {
	// Mode is the mode to run: the input mode when the gate did not
	// fire, ModeCodeReviewIsolation when it did.
	Mode string `json:"mode"`
	// DowngradedFrom is non-empty only when a downgrade fired — callers
	// should record it for audit (matches the Rust
	// EnrichmentSources.downgraded_from field).
	DowngradedFrom string `json:"downgraded_from,omitempty"`
	// Reason is a short human-readable string for logs/responses;
	// useful for debugging "why did/didn't the gate fire."
	Reason string `json:"reason"`
}
|
|
|
|
// MaybeDowngrade applies the strong-model auto-downgrade gate.
|
|
// Pure function; no env reads. For env-driven callers see
|
|
// NewDowngradeInputFromEnv.
|
|
func MaybeDowngrade(in DowngradeInput) DowngradeDecision {
|
|
out := DowngradeDecision{Mode: in.Mode}
|
|
if in.Mode != ModeCodeReviewLakehouse {
|
|
out.Reason = "mode is not " + ModeCodeReviewLakehouse + "; gate not applicable"
|
|
return out
|
|
}
|
|
if in.ForcedMode {
|
|
out.Reason = "caller forced mode; skip downgrade"
|
|
return out
|
|
}
|
|
if in.ForceFullOverride {
|
|
out.Reason = EnvForceFullEnrichment + " bypass"
|
|
return out
|
|
}
|
|
weakList := in.WeakModels
|
|
if weakList == nil {
|
|
weakList = DefaultWeakModels
|
|
}
|
|
if IsWeakModelInList(in.Model, weakList) {
|
|
out.Reason = "weak model; matrix composition demonstrably helped (2026-04-26 free-tier bake-off)"
|
|
return out
|
|
}
|
|
// Downgrade fires.
|
|
out.Mode = ModeCodeReviewIsolation
|
|
out.DowngradedFrom = ModeCodeReviewLakehouse
|
|
out.Reason = "strong model; matrix composes anti-additively (pass5: composed lost 5/5 vs isolation on grok-4.1-fast, p=0.031)"
|
|
return out
|
|
}
|
|
|
|
// NewDowngradeInputFromEnv is a convenience that reads
|
|
// LH_FORCE_FULL_ENRICHMENT from the process environment and returns
|
|
// a populated DowngradeInput. Most production callers want this;
|
|
// tests should construct DowngradeInput directly to avoid env
|
|
// pollution.
|
|
func NewDowngradeInputFromEnv(mode, model string, forcedMode bool) DowngradeInput {
|
|
return DowngradeInput{
|
|
Mode: mode,
|
|
Model: model,
|
|
ForcedMode: forcedMode,
|
|
ForceFullOverride: envForceFullEnrichment(),
|
|
}
|
|
}
|
|
|
|
func envForceFullEnrichment() bool {
|
|
v := strings.ToLower(strings.TrimSpace(os.Getenv(EnvForceFullEnrichment)))
|
|
return v == "1" || v == "true"
|
|
}
|