From 305f5b9ac00f41d026774cf306d8098c5467c6b3 Mon Sep 17 00:00:00 2001 From: "Claude (review-harness setup)" Date: Thu, 30 Apr 2026 01:33:01 -0500 Subject: [PATCH] Apply B1+B2+B3+B4 from 2026-04-30 cross-lineage scrum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder via chatd's /v1/chat) on the harness's first 4 commits surfaced 5 real bugs; this commit lands the 4 inside the LLM/validator stack. B5 (scanner skip-list semantics) ships separately as it changes scan behavior on every target repo. B1 (Kimi BLOCK + Opus WARN convergent) — internal/validators: evidencePresent had two flaws: (1) cursor advanced on match in the trim-line fallback, breaking same-line repeated matches AND skipping not-yet-considered lines so out-of-order evidence spuriously failed; (2) strings.Contains on a single `}` trim-matched any closing brace in the file, defeating the "evidence quotes real text" contract. Fix: trivial-evidence guard FIRST (reject anything <4 non-whitespace chars) + per-line search no longer advances a cursor. New regression test TestEvidencePresent_RejectsTrivialMatches covers `}`, `{`, `)`, empty, and out-of-order multi-line evidence (which now passes — order isn't part of the contract). B2 (Kimi WARN + Opus WARN convergent) — internal/pipeline: WriteJSON error for rejected-findings.json was swallowed with `if err == nil`, so a write failure left the validation phase reporting status="ok" while the audit trail vanished. Mirror the validated-findings branch: surface the error in validatePhase.Errors + bump status to degraded + ExitCode=66. B3 (Kimi BLOCK + Opus BLOCK convergent) — internal/llm/ollama.go: HealthCheck.basic_prompt_ok was set to true on ANY non-empty response, so a model emitting `...` traces or apologies passed silently. Now requires the response to contain "OK" (uppercase, substring). Substring rather than equality lets minor whitespace/punctuation variations through (some models add a trailing period). Errors now record what the model actually said when it fails the check. B4 (Opus BLOCK only — same class as today's chatd Anthropic-temp fix) — internal/llm/ollama.go: chatBody had `if opts.Temperature != 0` which silently dropped Temperature=0 from the request, so HealthCheck + Reviewer (both pass Temperature=0 expecting determinism) actually ran at Ollama's ~0.8 default. Always forward Temperature now. The two callers always set explicit values, so "0 means 0" is correct; if a future caller wants Ollama's default they'll switch CompleteOptions.Temperature to *float64 like chatd did this morning. Verified end-to-end: insecure-repo + --enable-llm still produces 25 confirmed findings (16 static + 9 LLM), 0 rejected. Validator unit tests: 11 pass (added TestEvidencePresent_RejectsTrivialMatches). Same-day-as-shipping scrum, same-day-as-shipping fixes. The convergent-≥2 gate caught 3 of these; the 4th was Opus-only but verified by reading the code (same idiom as today's chatd bug). Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/llm/ollama.go | 25 ++++++++--- internal/pipeline/pipeline.go | 15 +++++-- internal/validators/validate.go | 62 +++++++++++++++++++++------- internal/validators/validate_test.go | 32 ++++++++++++++ 4 files changed, 111 insertions(+), 23 deletions(-) diff --git a/internal/llm/ollama.go b/internal/llm/ollama.go index eb54ebb..10a8885 100644 --- a/internal/llm/ollama.go +++ b/internal/llm/ollama.go @@ -97,11 +97,18 @@ func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback stri return st } - // 2. Basic completion + // 2. Basic completion. Scrum fix B3 (Kimi BLOCK + Opus BLOCK, + // 2026-04-30): checks that the response actually contains "OK" + // (case-insensitive, substring) — pre-fix accepted any non-empty + // string, so a thinking-model's `...` trace or an apology + // passed silently. Substring rather than equality because some + // models surround the answer with whitespace or a trailing period. if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil { st.Errors = append(st.Errors, "basic prompt: "+err.Error()) - } else if strings.TrimSpace(got) != "" { + } else if upper := strings.ToUpper(strings.TrimSpace(got)); upper == "OK" || strings.Contains(upper, "OK") { st.BasicPromptOK = true + } else { + st.Errors = append(st.Errors, fmt.Sprintf("basic prompt: expected 'OK', got %q", abbrev(got, 80))) } // 3. JSON-mode completion @@ -140,9 +147,17 @@ func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string, func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any { options := map[string]any{} - if opts.Temperature != 0 { - options["temperature"] = opts.Temperature - } + // Scrum fix B4 (Opus BLOCK, 2026-04-30): always forward the + // caller-supplied Temperature, including 0. Pre-fix `if != 0` + // silently dropped the field for callers wanting deterministic + // generation, so Ollama's ~0.8 default applied to the JSON + // probe + every reviewer call. CompleteOptions.Temperature is + // still float64 (not *float64) — the harness's two callers + // (HealthCheck, Reviewer) always set it explicitly, so "0 + // means 0" is the right semantic. If a future caller wants + // "use Ollama default", they can set MaxTokens=0 + delete the + // option (or we'll switch to *float64 like chatd did). + options["temperature"] = opts.Temperature if opts.MaxTokens > 0 { options["num_predict"] = opts.MaxTokens } diff --git a/internal/pipeline/pipeline.go b/internal/pipeline/pipeline.go index ac53782..2fd0e05 100644 --- a/internal/pipeline/pipeline.go +++ b/internal/pipeline/pipeline.go @@ -174,9 +174,18 @@ func RunRepo(ctx context.Context, in Inputs) (*Result, error) { validatePhase.OutputHash = sha } if len(valOut.Rejected) > 0 { - // Rejected file is informational, not gate-blocking — the - // audit trail belongs in version control. - if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err == nil { + // Scrum fix B2 (Kimi WARN + Opus WARN, 2026-04-30): surface + // write errors into validatePhase.Errors. Pre-fix the error + // was swallowed (`if _, err := ...; err == nil`), which broke + // the receipts-honesty contract — phase status said "ok" while + // the rejected-findings audit trail silently vanished. + if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err != nil { + validatePhase.Status = "degraded" + validatePhase.Errors = append(validatePhase.Errors, "rejected-findings.json: "+err.Error()) + if res.ExitCode == 0 { + res.ExitCode = 66 + } + } else { validatePhase.OutputFiles = append(validatePhase.OutputFiles, "rejected-findings.json") res.OutputFiles = append(res.OutputFiles, "rejected-findings.json") } diff --git a/internal/validators/validate.go b/internal/validators/validate.go index 55bafcf..949143f 100644 --- a/internal/validators/validate.go +++ b/internal/validators/validate.go @@ -178,35 +178,57 @@ func check(repoPath string, f analyzers.Finding, cache map[string]string) Result } // evidencePresent returns true if the evidence appears verbatim in -// the file. Multi-line evidence is checked as-is; single-line evidence -// is also compared trimmed (models often add/drop leading whitespace -// when quoting code). +// the file, OR every evidence line trim-matches some line in the +// file (models often re-indent quotes when quoting code). +// +// Scrum fix B1 (Kimi BLOCK + Opus WARN, 2026-04-30): +// - reject trivially-matchable evidence FIRST (empty, lone braces, +// single-char/punct quotes); strings.Contains on tiny strings +// hits half the file and lets a non-quoting LLM pass +// - per-line trim-match no longer advances a cursor on hit; earlier +// version drove cursor forward unconditionally, both preventing +// same-line repeated matches and skipping unseen lines, so +// out-of-order evidence spuriously failed +// +// Order is no longer enforced — every evidence line just needs to +// appear somewhere in the file. The contract is "evidence quotes +// real text from the file," not "evidence quotes contiguous text in +// the same order." func evidencePresent(content, evidence string) bool { + trimmed := strings.TrimSpace(evidence) + if trimmed == "" { + return false + } + // Trivial-match guard: if the *entire* evidence is shorter than 4 + // non-whitespace chars, reject regardless of how it's being matched. + // Lone `}` / `{` / `)` / `(` would substring-hit any matching brace + // in the file. Min-length picked at 4 because real verbatim quotes + // (variable names, function calls) are essentially always longer. + if nonWSLen(trimmed) < 4 { + return false + } if strings.Contains(content, evidence) { return true } - // Trim each line of the evidence + match line-by-line. Conservative: - // every evidence line must appear (in order) in the file's trimmed - // lines for the evidence to count as found. - evLines := strings.Split(strings.TrimSpace(evidence), "\n") - if len(evLines) == 0 { - return false - } + evLines := strings.Split(trimmed, "\n") contentLines := strings.Split(content, "\n") - cursor := 0 for _, ev := range evLines { want := strings.TrimSpace(ev) if want == "" { continue } + // Per-line trivial guard: even within a multi-line evidence + // block, a line of `}` shouldn't satisfy the "this evidence + // line appears in the file" check. + if nonWSLen(want) < 4 { + return false + } found := false - for cursor < len(contentLines) { - if strings.Contains(strings.TrimSpace(contentLines[cursor]), want) { + for _, cl := range contentLines { + if strings.Contains(strings.TrimSpace(cl), want) { found = true - cursor++ break } - cursor++ } if !found { return false @@ -215,6 +237,16 @@ func evidencePresent(content, evidence string) bool { return true } +func nonWSLen(s string) int { + n := 0 + for _, r := range s { + if r != ' ' && r != '\t' && r != '\n' && r != '\r' { + n++ + } + } + return n +} + // highestLine extracts the largest line number cited in the hint. // Accepts "42", "10-20" (returns 20), "line 42", "L42", "42:5". // Returns (n, true) on parse; (0, false) if no number found. diff --git a/internal/validators/validate_test.go b/internal/validators/validate_test.go index e393a20..db844a1 100644 --- a/internal/validators/validate_test.go +++ b/internal/validators/validate_test.go @@ -184,6 +184,38 @@ func TestValidate_AcceptsRelativeRepoPath(t *testing.T) { } } +// TestEvidencePresent_RejectsTrivialMatches locks in scrum fix B1. +// Pre-fix, evidence "}" or "{" trim-matched almost any line in the +// file via strings.Contains, so an LLM emitting a single brace would +// pass validation against any file with braces. +func TestEvidencePresent_RejectsTrivialMatches(t *testing.T) { + content := "package main\nfunc main() {\n\tfmt.Println(\"hi\")\n}\n" + cases := []struct { + evidence string + want bool + }{ + // Verbatim full content matches + {"func main() {", true}, + {"fmt.Println(\"hi\")", true}, + // Trivial single-char/-line matches must REJECT + {"}", false}, + {"{", false}, + {")", false}, + // Empty evidence is rejected (caller-side enforced too) + {"", false}, + // Multi-line evidence in different order — must still pass + // (cursor-reset fix B1 — pre-fix this would fail because the + // cursor advanced past the first match) + {"fmt.Println(\"hi\")\nfunc main() {", true}, + } + for _, c := range cases { + got := evidencePresent(content, c.evidence) + if got != c.want { + t.Errorf("evidencePresent(%q) = %v; want %v", c.evidence, got, c.want) + } + } +} + // === Path-traversal protection: hallucinated "../../../etc/passwd" must reject === func TestValidate_RejectsPathEscapingRepo(t *testing.T) {