Apply B1+B2+B3+B4 from 2026-04-30 cross-lineage scrum

Cross-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder via chatd's /v1/chat) on the harness's first 4 commits surfaced 5 real bugs; this commit lands the 4 inside the LLM/validator stack. B5 (scanner skip-list semantics) ships separately as it changes scan behavior on every target repo. B1 (Kimi BLOCK + Opus WARN convergent) — internal/validators: evidencePresent had two flaws: (1) cursor advanced on match in the trim-line fallback, breaking same-line repeated matches AND skipping not-yet-considered lines so out-of-order evidence spuriously failed; (2) strings.Contains on a single `}` trim-matched any closing brace in the file, defeating the "evidence quotes real text" contract. Fix: trivial-evidence guard FIRST (reject anything <4 non-whitespace chars) + per-line search no longer advances a cursor. New regression test TestEvidencePresent_RejectsTrivialMatches covers `}`, `{`, `)`, empty, and out-of-order multi-line evidence (which now passes — order isn't part of the contract). B2 (Kimi WARN + Opus WARN convergent) — internal/pipeline: WriteJSON error for rejected-findings.json was swallowed with `if err == nil`, so a write failure left the validation phase reporting status="ok" while the audit trail vanished. Mirror the validated-findings branch: surface the error in validatePhase.Errors + bump status to degraded + ExitCode=66. B3 (Kimi BLOCK + Opus BLOCK convergent) — internal/llm/ollama.go: HealthCheck.basic_prompt_ok was set to true on ANY non-empty response, so a model emitting `<think>...` traces or apologies passed silently. Now requires the response to contain "OK" (uppercase, substring). Substring rather than equality lets minor whitespace/punctuation variations through (some models add a trailing period). Errors now record what the model actually said when it fails the check. B4 (Opus BLOCK only — same class as today's chatd Anthropic-temp fix) — internal/llm/ollama.go: chatBody had `if opts.Temperature != 0` which silently dropped Temperature=0 from the request, so HealthCheck + Reviewer (both pass Temperature=0 expecting determinism) actually ran at Ollama's ~0.8 default. Always forward Temperature now. The two callers always set explicit values, so "0 means 0" is correct; if a future caller wants Ollama's default they'll switch CompleteOptions.Temperature to *float64 like chatd did this morning. Verified end-to-end: insecure-repo + --enable-llm still produces 25 confirmed findings (16 static + 9 LLM), 0 rejected. Validator unit tests: 11 pass (added TestEvidencePresent_RejectsTrivialMatches). Same-day-as-shipping scrum, same-day-as-shipping fixes. The convergent-≥2 gate caught 3 of these; the 4th was Opus-only but verified by reading the code (same idiom as today's chatd bug). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 01:33:01 -05:00 · 2026-04-30 01:33:01 -05:00 · 305f5b9ac0
commit 305f5b9ac0
parent 4dc53c5798
4 changed files with 111 additions and 23 deletions
--- a/internal/llm/ollama.go
+++ b/internal/llm/ollama.go
@ -97,11 +97,18 @@ func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback stri
 		return st
 	}

-	// 2. Basic completion
+	// 2. Basic completion. Scrum fix B3 (Kimi BLOCK + Opus BLOCK,
+	// 2026-04-30): checks that the response actually contains "OK"
+	// (case-insensitive, substring) — pre-fix accepted any non-empty
+	// string, so a thinking-model's `<think>...` trace or an apology
+	// passed silently. Substring rather than equality because some
+	// models surround the answer with whitespace or a trailing period.
 	if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil {
 		st.Errors = append(st.Errors, "basic prompt: "+err.Error())
-	} else if strings.TrimSpace(got) != "" {
+	} else if upper := strings.ToUpper(strings.TrimSpace(got)); upper == "OK" || strings.Contains(upper, "OK") {
 		st.BasicPromptOK = true
+	} else {
+		st.Errors = append(st.Errors, fmt.Sprintf("basic prompt: expected 'OK', got %q", abbrev(got, 80)))
 	}

 	// 3. JSON-mode completion
@ -140,9 +147,17 @@ func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string,

 func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any {
 	options := map[string]any{}
-	if opts.Temperature != 0 {
-		options["temperature"] = opts.Temperature
-	}
+	// Scrum fix B4 (Opus BLOCK, 2026-04-30): always forward the
+	// caller-supplied Temperature, including 0. Pre-fix `if != 0`
+	// silently dropped the field for callers wanting deterministic
+	// generation, so Ollama's ~0.8 default applied to the JSON
+	// probe + every reviewer call. CompleteOptions.Temperature is
+	// still float64 (not *float64) — the harness's two callers
+	// (HealthCheck, Reviewer) always set it explicitly, so "0
+	// means 0" is the right semantic. If a future caller wants
+	// "use Ollama default", they can set MaxTokens=0 + delete the
+	// option (or we'll switch to *float64 like chatd did).
+	options["temperature"] = opts.Temperature
 	if opts.MaxTokens > 0 {
 		options["num_predict"] = opts.MaxTokens
 	}
--- a/internal/pipeline/pipeline.go
+++ b/internal/pipeline/pipeline.go
@ -174,9 +174,18 @@ func RunRepo(ctx context.Context, in Inputs) (*Result, error) {
 		validatePhase.OutputHash = sha
 	}
 	if len(valOut.Rejected) > 0 {
-		// Rejected file is informational, not gate-blocking — the
-		// audit trail belongs in version control.
-		if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err == nil {
+		// Scrum fix B2 (Kimi WARN + Opus WARN, 2026-04-30): surface
+		// write errors into validatePhase.Errors. Pre-fix the error
+		// was swallowed (`if _, err := ...; err == nil`), which broke
+		// the receipts-honesty contract — phase status said "ok" while
+		// the rejected-findings audit trail silently vanished.
+		if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err != nil {
+			validatePhase.Status = "degraded"
+			validatePhase.Errors = append(validatePhase.Errors, "rejected-findings.json: "+err.Error())
+			if res.ExitCode == 0 {
+				res.ExitCode = 66
+			}
+		} else {
 			validatePhase.OutputFiles = append(validatePhase.OutputFiles, "rejected-findings.json")
 			res.OutputFiles = append(res.OutputFiles, "rejected-findings.json")
 		}
--- a/internal/validators/validate.go
+++ b/internal/validators/validate.go
@ -178,35 +178,57 @@ func check(repoPath string, f analyzers.Finding, cache map[string]string) Result
 }

 // evidencePresent returns true if the evidence appears verbatim in
-// the file. Multi-line evidence is checked as-is; single-line evidence
-// is also compared trimmed (models often add/drop leading whitespace
-// when quoting code).
+// the file, OR every evidence line trim-matches some line in the
+// file (models often re-indent quotes when quoting code).
+//
+// Scrum fix B1 (Kimi BLOCK + Opus WARN, 2026-04-30):
+//   - reject trivially-matchable evidence FIRST (empty, lone braces,
+//     single-char/punct quotes); strings.Contains on tiny strings
+//     hits half the file and lets a non-quoting LLM pass
+//   - per-line trim-match no longer advances a cursor on hit; earlier
+//     version drove cursor forward unconditionally, both preventing
+//     same-line repeated matches and skipping unseen lines, so
+//     out-of-order evidence spuriously failed
+//
+// Order is no longer enforced — every evidence line just needs to
+// appear somewhere in the file. The contract is "evidence quotes
+// real text from the file," not "evidence quotes contiguous text in
+// the same order."
 func evidencePresent(content, evidence string) bool {
+	trimmed := strings.TrimSpace(evidence)
+	if trimmed == "" {
+		return false
+	}
+	// Trivial-match guard: if the *entire* evidence is shorter than 4
+	// non-whitespace chars, reject regardless of how it's being matched.
+	// Lone `}` / `{` / `)` / `(` would substring-hit any matching brace
+	// in the file. Min-length picked at 4 because real verbatim quotes
+	// (variable names, function calls) are essentially always longer.
+	if nonWSLen(trimmed) < 4 {
+		return false
+	}
 	if strings.Contains(content, evidence) {
 		return true
 	}
-	// Trim each line of the evidence + match line-by-line. Conservative:
-	// every evidence line must appear (in order) in the file's trimmed
-	// lines for the evidence to count as found.
-	evLines := strings.Split(strings.TrimSpace(evidence), "\n")
-	if len(evLines) == 0 {
-		return false
-	}
+	evLines := strings.Split(trimmed, "\n")
 	contentLines := strings.Split(content, "\n")
-	cursor := 0
 	for _, ev := range evLines {
 		want := strings.TrimSpace(ev)
 		if want == "" {
 			continue
 		}
+		// Per-line trivial guard: even within a multi-line evidence
+		// block, a line of `}` shouldn't satisfy the "this evidence
+		// line appears in the file" check.
+		if nonWSLen(want) < 4 {
+			return false
+		}
 		found := false
-		for cursor < len(contentLines) {
-			if strings.Contains(strings.TrimSpace(contentLines[cursor]), want) {
+		for _, cl := range contentLines {
+			if strings.Contains(strings.TrimSpace(cl), want) {
 				found = true
-				cursor++
 				break
 			}
-			cursor++
 		}
 		if !found {
 			return false
@ -215,6 +237,16 @@ func evidencePresent(content, evidence string) bool {
 	return true
 }

+func nonWSLen(s string) int {
+	n := 0
+	for _, r := range s {
+		if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
+			n++
+		}
+	}
+	return n
+}
+
 // highestLine extracts the largest line number cited in the hint.
 // Accepts "42", "10-20" (returns 20), "line 42", "L42", "42:5".
 // Returns (n, true) on parse; (0, false) if no number found.
--- a/internal/validators/validate_test.go
+++ b/internal/validators/validate_test.go
@ -184,6 +184,38 @@ func TestValidate_AcceptsRelativeRepoPath(t *testing.T) {
 	}
 }

+// TestEvidencePresent_RejectsTrivialMatches locks in scrum fix B1.
+// Pre-fix, evidence "}" or "{" trim-matched almost any line in the
+// file via strings.Contains, so an LLM emitting a single brace would
+// pass validation against any file with braces.
+func TestEvidencePresent_RejectsTrivialMatches(t *testing.T) {
+	content := "package main\nfunc main() {\n\tfmt.Println(\"hi\")\n}\n"
+	cases := []struct {
+		evidence string
+		want     bool
+	}{
+		// Verbatim full content matches
+		{"func main() {", true},
+		{"fmt.Println(\"hi\")", true},
+		// Trivial single-char/-line matches must REJECT
+		{"}", false},
+		{"{", false},
+		{")", false},
+		// Empty evidence is rejected (caller-side enforced too)
+		{"", false},
+		// Multi-line evidence in different order — must still pass
+		// (cursor-reset fix B1 — pre-fix this would fail because the
+		// cursor advanced past the first match)
+		{"fmt.Println(\"hi\")\nfunc main() {", true},
+	}
+	for _, c := range cases {
+		got := evidencePresent(content, c.evidence)
+		if got != c.want {
+			t.Errorf("evidencePresent(%q) = %v; want %v", c.evidence, got, c.want)
+		}
+	}
+}
+
 // === Path-traversal protection: hallucinated "../../../etc/passwd" must reject ===

 func TestValidate_RejectsPathEscapingRepo(t *testing.T) {