Apply B1+B2+B3+B4 from 2026-04-30 cross-lineage scrum

Cross-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder via chatd's
/v1/chat) on the harness's first 4 commits surfaced 5 real bugs;
this commit lands the 4 inside the LLM/validator stack. B5 (scanner
skip-list semantics) ships separately as it changes scan behavior
on every target repo.

B1 (Kimi BLOCK + Opus WARN convergent) — internal/validators:
evidencePresent had two flaws: (1) cursor advanced on match in the
trim-line fallback, breaking same-line repeated matches AND skipping
not-yet-considered lines so out-of-order evidence spuriously failed;
(2) strings.Contains on a single `}` trim-matched any closing brace
in the file, defeating the "evidence quotes real text" contract.
Fix: trivial-evidence guard FIRST (reject anything <4 non-whitespace
chars) + per-line search no longer advances a cursor. New regression
test TestEvidencePresent_RejectsTrivialMatches covers `}`, `{`, `)`,
empty, and out-of-order multi-line evidence (which now passes —
order isn't part of the contract).

B2 (Kimi WARN + Opus WARN convergent) — internal/pipeline:
WriteJSON error for rejected-findings.json was swallowed with
`if err == nil`, so a write failure left the validation phase
reporting status="ok" while the audit trail vanished. Mirror the
validated-findings branch: surface the error in
validatePhase.Errors + bump status to degraded + ExitCode=66.

B3 (Kimi BLOCK + Opus BLOCK convergent) — internal/llm/ollama.go:
HealthCheck.basic_prompt_ok was set to true on ANY non-empty
response, so a model emitting `<think>...` traces or apologies
passed silently. Now requires the response to contain "OK"
(uppercase, substring). Substring rather than equality lets minor
whitespace/punctuation variations through (some models add a
trailing period). Errors now record what the model actually said
when it fails the check.

B4 (Opus BLOCK only — same class as today's chatd Anthropic-temp
fix) — internal/llm/ollama.go: chatBody had `if opts.Temperature != 0`
which silently dropped Temperature=0 from the request, so HealthCheck
+ Reviewer (both pass Temperature=0 expecting determinism) actually
ran at Ollama's ~0.8 default. Always forward Temperature now. The
two callers always set explicit values, so "0 means 0" is correct;
if a future caller wants Ollama's default they'll switch
CompleteOptions.Temperature to *float64 like chatd did this morning.

Verified end-to-end: insecure-repo + --enable-llm still produces 25
confirmed findings (16 static + 9 LLM), 0 rejected. Validator unit
tests: 11 pass (added TestEvidencePresent_RejectsTrivialMatches).

Same-day-as-shipping scrum, same-day-as-shipping fixes. The
convergent-≥2 gate caught 3 of these; the 4th was Opus-only but
verified by reading the code (same idiom as today's chatd bug).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude (review-harness setup) 2026-04-30 01:33:01 -05:00
parent 4dc53c5798
commit 305f5b9ac0
4 changed files with 111 additions and 23 deletions

View File

@ -97,11 +97,18 @@ func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback stri
return st
}
// 2. Basic completion
// 2. Basic completion. Scrum fix B3 (Kimi BLOCK + Opus BLOCK,
// 2026-04-30): checks that the response actually contains "OK"
// (case-insensitive, substring) — pre-fix accepted any non-empty
// string, so a thinking-model's `<think>...` trace or an apology
// passed silently. Substring rather than equality because some
// models surround the answer with whitespace or a trailing period.
if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil {
st.Errors = append(st.Errors, "basic prompt: "+err.Error())
} else if strings.TrimSpace(got) != "" {
} else if upper := strings.ToUpper(strings.TrimSpace(got)); upper == "OK" || strings.Contains(upper, "OK") {
st.BasicPromptOK = true
} else {
st.Errors = append(st.Errors, fmt.Sprintf("basic prompt: expected 'OK', got %q", abbrev(got, 80)))
}
// 3. JSON-mode completion
@ -140,9 +147,17 @@ func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string,
func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any {
options := map[string]any{}
if opts.Temperature != 0 {
options["temperature"] = opts.Temperature
}
// Scrum fix B4 (Opus BLOCK, 2026-04-30): always forward the
// caller-supplied Temperature, including 0. Pre-fix `if != 0`
// silently dropped the field for callers wanting deterministic
// generation, so Ollama's ~0.8 default applied to the JSON
// probe + every reviewer call. CompleteOptions.Temperature is
// still float64 (not *float64) — the harness's two callers
// (HealthCheck, Reviewer) always set it explicitly, so "0
// means 0" is the right semantic. If a future caller wants
// "use Ollama default", they can set MaxTokens=0 + delete the
// option (or we'll switch to *float64 like chatd did).
options["temperature"] = opts.Temperature
if opts.MaxTokens > 0 {
options["num_predict"] = opts.MaxTokens
}

View File

@ -174,9 +174,18 @@ func RunRepo(ctx context.Context, in Inputs) (*Result, error) {
validatePhase.OutputHash = sha
}
if len(valOut.Rejected) > 0 {
// Rejected file is informational, not gate-blocking — the
// audit trail belongs in version control.
if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err == nil {
// Scrum fix B2 (Kimi WARN + Opus WARN, 2026-04-30): surface
// write errors into validatePhase.Errors. Pre-fix the error
// was swallowed (`if _, err := ...; err == nil`), which broke
// the receipts-honesty contract — phase status said "ok" while
// the rejected-findings audit trail silently vanished.
if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err != nil {
validatePhase.Status = "degraded"
validatePhase.Errors = append(validatePhase.Errors, "rejected-findings.json: "+err.Error())
if res.ExitCode == 0 {
res.ExitCode = 66
}
} else {
validatePhase.OutputFiles = append(validatePhase.OutputFiles, "rejected-findings.json")
res.OutputFiles = append(res.OutputFiles, "rejected-findings.json")
}

View File

@ -178,35 +178,57 @@ func check(repoPath string, f analyzers.Finding, cache map[string]string) Result
}
// evidencePresent returns true if the evidence appears verbatim in
// the file. Multi-line evidence is checked as-is; single-line evidence
// is also compared trimmed (models often add/drop leading whitespace
// when quoting code).
// the file, OR every evidence line trim-matches some line in the
// file (models often re-indent quotes when quoting code).
//
// Scrum fix B1 (Kimi BLOCK + Opus WARN, 2026-04-30):
// - reject trivially-matchable evidence FIRST (empty, lone braces,
// single-char/punct quotes); strings.Contains on tiny strings
// hits half the file and lets a non-quoting LLM pass
// - per-line trim-match no longer advances a cursor on hit; earlier
// version drove cursor forward unconditionally, both preventing
// same-line repeated matches and skipping unseen lines, so
// out-of-order evidence spuriously failed
//
// Order is no longer enforced — every evidence line just needs to
// appear somewhere in the file. The contract is "evidence quotes
// real text from the file," not "evidence quotes contiguous text in
// the same order."
func evidencePresent(content, evidence string) bool {
trimmed := strings.TrimSpace(evidence)
if trimmed == "" {
return false
}
// Trivial-match guard: if the *entire* evidence is shorter than 4
// non-whitespace chars, reject regardless of how it's being matched.
// Lone `}` / `{` / `)` / `(` would substring-hit any matching brace
// in the file. Min-length picked at 4 because real verbatim quotes
// (variable names, function calls) are essentially always longer.
if nonWSLen(trimmed) < 4 {
return false
}
if strings.Contains(content, evidence) {
return true
}
// Trim each line of the evidence + match line-by-line. Conservative:
// every evidence line must appear (in order) in the file's trimmed
// lines for the evidence to count as found.
evLines := strings.Split(strings.TrimSpace(evidence), "\n")
if len(evLines) == 0 {
return false
}
evLines := strings.Split(trimmed, "\n")
contentLines := strings.Split(content, "\n")
cursor := 0
for _, ev := range evLines {
want := strings.TrimSpace(ev)
if want == "" {
continue
}
// Per-line trivial guard: even within a multi-line evidence
// block, a line of `}` shouldn't satisfy the "this evidence
// line appears in the file" check.
if nonWSLen(want) < 4 {
return false
}
found := false
for cursor < len(contentLines) {
if strings.Contains(strings.TrimSpace(contentLines[cursor]), want) {
for _, cl := range contentLines {
if strings.Contains(strings.TrimSpace(cl), want) {
found = true
cursor++
break
}
cursor++
}
if !found {
return false
@ -215,6 +237,16 @@ func evidencePresent(content, evidence string) bool {
return true
}
func nonWSLen(s string) int {
n := 0
for _, r := range s {
if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
n++
}
}
return n
}
// highestLine extracts the largest line number cited in the hint.
// Accepts "42", "10-20" (returns 20), "line 42", "L42", "42:5".
// Returns (n, true) on parse; (0, false) if no number found.

View File

@ -184,6 +184,38 @@ func TestValidate_AcceptsRelativeRepoPath(t *testing.T) {
}
}
// TestEvidencePresent_RejectsTrivialMatches locks in scrum fix B1.
// Pre-fix, evidence "}" or "{" trim-matched almost any line in the
// file via strings.Contains, so an LLM emitting a single brace would
// pass validation against any file with braces.
func TestEvidencePresent_RejectsTrivialMatches(t *testing.T) {
content := "package main\nfunc main() {\n\tfmt.Println(\"hi\")\n}\n"
cases := []struct {
evidence string
want bool
}{
// Verbatim full content matches
{"func main() {", true},
{"fmt.Println(\"hi\")", true},
// Trivial single-char/-line matches must REJECT
{"}", false},
{"{", false},
{")", false},
// Empty evidence is rejected (caller-side enforced too)
{"", false},
// Multi-line evidence in different order — must still pass
// (cursor-reset fix B1 — pre-fix this would fail because the
// cursor advanced past the first match)
{"fmt.Println(\"hi\")\nfunc main() {", true},
}
for _, c := range cases {
got := evidencePresent(content, c.evidence)
if got != c.want {
t.Errorf("evidencePresent(%q) = %v; want %v", c.evidence, got, c.want)
}
}
}
// === Path-traversal protection: hallucinated "../../../etc/passwd" must reject ===
func TestValidate_RejectsPathEscapingRepo(t *testing.T) {