diff --git a/internal/llm/ollama.go b/internal/llm/ollama.go index eb54ebb..10a8885 100644 --- a/internal/llm/ollama.go +++ b/internal/llm/ollama.go @@ -97,11 +97,18 @@ func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback stri return st } - // 2. Basic completion + // 2. Basic completion. Scrum fix B3 (Kimi BLOCK + Opus BLOCK, + // 2026-04-30): checks that the response actually contains "OK" + // (case-insensitive, substring) — pre-fix accepted any non-empty + // string, so a thinking-model's `...` trace or an apology + // passed silently. Substring rather than equality because some + // models surround the answer with whitespace or a trailing period. if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil { st.Errors = append(st.Errors, "basic prompt: "+err.Error()) - } else if strings.TrimSpace(got) != "" { + } else if upper := strings.ToUpper(strings.TrimSpace(got)); upper == "OK" || strings.Contains(upper, "OK") { st.BasicPromptOK = true + } else { + st.Errors = append(st.Errors, fmt.Sprintf("basic prompt: expected 'OK', got %q", abbrev(got, 80))) } // 3. JSON-mode completion @@ -140,9 +147,17 @@ func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string, func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any { options := map[string]any{} - if opts.Temperature != 0 { - options["temperature"] = opts.Temperature - } + // Scrum fix B4 (Opus BLOCK, 2026-04-30): always forward the + // caller-supplied Temperature, including 0. Pre-fix `if != 0` + // silently dropped the field for callers wanting deterministic + // generation, so Ollama's ~0.8 default applied to the JSON + // probe + every reviewer call. CompleteOptions.Temperature is + // still float64 (not *float64) — the harness's two callers + // (HealthCheck, Reviewer) always set it explicitly, so "0 + // means 0" is the right semantic. If a future caller wants + // "use Ollama default", they can set MaxTokens=0 + delete the + // option (or we'll switch to *float64 like chatd did). + options["temperature"] = opts.Temperature if opts.MaxTokens > 0 { options["num_predict"] = opts.MaxTokens } diff --git a/internal/pipeline/pipeline.go b/internal/pipeline/pipeline.go index ac53782..2fd0e05 100644 --- a/internal/pipeline/pipeline.go +++ b/internal/pipeline/pipeline.go @@ -174,9 +174,18 @@ func RunRepo(ctx context.Context, in Inputs) (*Result, error) { validatePhase.OutputHash = sha } if len(valOut.Rejected) > 0 { - // Rejected file is informational, not gate-blocking — the - // audit trail belongs in version control. - if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err == nil { + // Scrum fix B2 (Kimi WARN + Opus WARN, 2026-04-30): surface + // write errors into validatePhase.Errors. Pre-fix the error + // was swallowed (`if _, err := ...; err == nil`), which broke + // the receipts-honesty contract — phase status said "ok" while + // the rejected-findings audit trail silently vanished. + if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err != nil { + validatePhase.Status = "degraded" + validatePhase.Errors = append(validatePhase.Errors, "rejected-findings.json: "+err.Error()) + if res.ExitCode == 0 { + res.ExitCode = 66 + } + } else { validatePhase.OutputFiles = append(validatePhase.OutputFiles, "rejected-findings.json") res.OutputFiles = append(res.OutputFiles, "rejected-findings.json") } diff --git a/internal/validators/validate.go b/internal/validators/validate.go index 55bafcf..949143f 100644 --- a/internal/validators/validate.go +++ b/internal/validators/validate.go @@ -178,35 +178,57 @@ func check(repoPath string, f analyzers.Finding, cache map[string]string) Result } // evidencePresent returns true if the evidence appears verbatim in -// the file. Multi-line evidence is checked as-is; single-line evidence -// is also compared trimmed (models often add/drop leading whitespace -// when quoting code). +// the file, OR every evidence line trim-matches some line in the +// file (models often re-indent quotes when quoting code). +// +// Scrum fix B1 (Kimi BLOCK + Opus WARN, 2026-04-30): +// - reject trivially-matchable evidence FIRST (empty, lone braces, +// single-char/punct quotes); strings.Contains on tiny strings +// hits half the file and lets a non-quoting LLM pass +// - per-line trim-match no longer advances a cursor on hit; earlier +// version drove cursor forward unconditionally, both preventing +// same-line repeated matches and skipping unseen lines, so +// out-of-order evidence spuriously failed +// +// Order is no longer enforced — every evidence line just needs to +// appear somewhere in the file. The contract is "evidence quotes +// real text from the file," not "evidence quotes contiguous text in +// the same order." func evidencePresent(content, evidence string) bool { + trimmed := strings.TrimSpace(evidence) + if trimmed == "" { + return false + } + // Trivial-match guard: if the *entire* evidence is shorter than 4 + // non-whitespace chars, reject regardless of how it's being matched. + // Lone `}` / `{` / `)` / `(` would substring-hit any matching brace + // in the file. Min-length picked at 4 because real verbatim quotes + // (variable names, function calls) are essentially always longer. + if nonWSLen(trimmed) < 4 { + return false + } if strings.Contains(content, evidence) { return true } - // Trim each line of the evidence + match line-by-line. Conservative: - // every evidence line must appear (in order) in the file's trimmed - // lines for the evidence to count as found. - evLines := strings.Split(strings.TrimSpace(evidence), "\n") - if len(evLines) == 0 { - return false - } + evLines := strings.Split(trimmed, "\n") contentLines := strings.Split(content, "\n") - cursor := 0 for _, ev := range evLines { want := strings.TrimSpace(ev) if want == "" { continue } + // Per-line trivial guard: even within a multi-line evidence + // block, a line of `}` shouldn't satisfy the "this evidence + // line appears in the file" check. + if nonWSLen(want) < 4 { + return false + } found := false - for cursor < len(contentLines) { - if strings.Contains(strings.TrimSpace(contentLines[cursor]), want) { + for _, cl := range contentLines { + if strings.Contains(strings.TrimSpace(cl), want) { found = true - cursor++ break } - cursor++ } if !found { return false @@ -215,6 +237,16 @@ func evidencePresent(content, evidence string) bool { return true } +func nonWSLen(s string) int { + n := 0 + for _, r := range s { + if r != ' ' && r != '\t' && r != '\n' && r != '\r' { + n++ + } + } + return n +} + // highestLine extracts the largest line number cited in the hint. // Accepts "42", "10-20" (returns 20), "line 42", "L42", "42:5". // Returns (n, true) on parse; (0, false) if no number found. diff --git a/internal/validators/validate_test.go b/internal/validators/validate_test.go index e393a20..db844a1 100644 --- a/internal/validators/validate_test.go +++ b/internal/validators/validate_test.go @@ -184,6 +184,38 @@ func TestValidate_AcceptsRelativeRepoPath(t *testing.T) { } } +// TestEvidencePresent_RejectsTrivialMatches locks in scrum fix B1. +// Pre-fix, evidence "}" or "{" trim-matched almost any line in the +// file via strings.Contains, so an LLM emitting a single brace would +// pass validation against any file with braces. +func TestEvidencePresent_RejectsTrivialMatches(t *testing.T) { + content := "package main\nfunc main() {\n\tfmt.Println(\"hi\")\n}\n" + cases := []struct { + evidence string + want bool + }{ + // Verbatim full content matches + {"func main() {", true}, + {"fmt.Println(\"hi\")", true}, + // Trivial single-char/-line matches must REJECT + {"}", false}, + {"{", false}, + {")", false}, + // Empty evidence is rejected (caller-side enforced too) + {"", false}, + // Multi-line evidence in different order — must still pass + // (cursor-reset fix B1 — pre-fix this would fail because the + // cursor advanced past the first match) + {"fmt.Println(\"hi\")\nfunc main() {", true}, + } + for _, c := range cases { + got := evidencePresent(content, c.evidence) + if got != c.want { + t.Errorf("evidencePresent(%q) = %v; want %v", c.evidence, got, c.want) + } + } +} + // === Path-traversal protection: hallucinated "../../../etc/passwd" must reject === func TestValidate_RejectsPathEscapingRepo(t *testing.T) {