Apply B1+B2+B3+B4 from 2026-04-30 cross-lineage scrum
Cross-lineage scrum (Opus 4.7 / Kimi K2.6 / Qwen3-coder via chatd's
/v1/chat) on the harness's first 4 commits surfaced 5 real bugs;
this commit lands the 4 inside the LLM/validator stack. B5 (scanner
skip-list semantics) ships separately as it changes scan behavior
on every target repo.
B1 (Kimi BLOCK + Opus WARN convergent) — internal/validators:
evidencePresent had two flaws: (1) cursor advanced on match in the
trim-line fallback, breaking same-line repeated matches AND skipping
not-yet-considered lines so out-of-order evidence spuriously failed;
(2) strings.Contains on a single `}` trim-matched any closing brace
in the file, defeating the "evidence quotes real text" contract.
Fix: trivial-evidence guard FIRST (reject anything <4 non-whitespace
chars) + per-line search no longer advances a cursor. New regression
test TestEvidencePresent_RejectsTrivialMatches covers `}`, `{`, `)`,
empty, and out-of-order multi-line evidence (which now passes —
order isn't part of the contract).
B2 (Kimi WARN + Opus WARN convergent) — internal/pipeline:
WriteJSON error for rejected-findings.json was swallowed with
`if err == nil`, so a write failure left the validation phase
reporting status="ok" while the audit trail vanished. Mirror the
validated-findings branch: surface the error in
validatePhase.Errors + bump status to degraded + ExitCode=66.
B3 (Kimi BLOCK + Opus BLOCK convergent) — internal/llm/ollama.go:
HealthCheck.basic_prompt_ok was set to true on ANY non-empty
response, so a model emitting `<think>...` traces or apologies
passed silently. Now requires the response to contain "OK"
(uppercase, substring). Substring rather than equality lets minor
whitespace/punctuation variations through (some models add a
trailing period). Errors now record what the model actually said
when it fails the check.
B4 (Opus BLOCK only — same class as today's chatd Anthropic-temp
fix) — internal/llm/ollama.go: chatBody had `if opts.Temperature != 0`
which silently dropped Temperature=0 from the request, so HealthCheck
+ Reviewer (both pass Temperature=0 expecting determinism) actually
ran at Ollama's ~0.8 default. Always forward Temperature now. The
two callers always set explicit values, so "0 means 0" is correct;
if a future caller wants Ollama's default they'll switch
CompleteOptions.Temperature to *float64 like chatd did this morning.
Verified end-to-end: insecure-repo + --enable-llm still produces 25
confirmed findings (16 static + 9 LLM), 0 rejected. Validator unit
tests: 11 pass (added TestEvidencePresent_RejectsTrivialMatches).
Same-day-as-shipping scrum, same-day-as-shipping fixes. The
convergent-≥2 gate caught 3 of these; the 4th was Opus-only but
verified by reading the code (same idiom as today's chatd bug).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4dc53c5798
commit
305f5b9ac0
@ -97,11 +97,18 @@ func (o *OllamaProvider) HealthCheck(ctx context.Context, primary, fallback stri
|
||||
return st
|
||||
}
|
||||
|
||||
// 2. Basic completion
|
||||
// 2. Basic completion. Scrum fix B3 (Kimi BLOCK + Opus BLOCK,
|
||||
// 2026-04-30): checks that the response actually contains "OK"
|
||||
// (case-insensitive, substring) — pre-fix accepted any non-empty
|
||||
// string, so a thinking-model's `<think>...` trace or an apology
|
||||
// passed silently. Substring rather than equality because some
|
||||
// models surround the answer with whitespace or a trailing period.
|
||||
if got, err := o.Complete(ctx, probeModel, "Reply with the single word: OK", CompleteOptions{Temperature: 0, MaxTokens: 8, TimeoutSeconds: 30}); err != nil {
|
||||
st.Errors = append(st.Errors, "basic prompt: "+err.Error())
|
||||
} else if strings.TrimSpace(got) != "" {
|
||||
} else if upper := strings.ToUpper(strings.TrimSpace(got)); upper == "OK" || strings.Contains(upper, "OK") {
|
||||
st.BasicPromptOK = true
|
||||
} else {
|
||||
st.Errors = append(st.Errors, fmt.Sprintf("basic prompt: expected 'OK', got %q", abbrev(got, 80)))
|
||||
}
|
||||
|
||||
// 3. JSON-mode completion
|
||||
@ -140,9 +147,17 @@ func (o *OllamaProvider) CompleteJSON(ctx context.Context, model, prompt string,
|
||||
|
||||
func (o *OllamaProvider) chatBody(model, prompt string, opts CompleteOptions, jsonMode bool) map[string]any {
|
||||
options := map[string]any{}
|
||||
if opts.Temperature != 0 {
|
||||
options["temperature"] = opts.Temperature
|
||||
}
|
||||
// Scrum fix B4 (Opus BLOCK, 2026-04-30): always forward the
|
||||
// caller-supplied Temperature, including 0. Pre-fix `if != 0`
|
||||
// silently dropped the field for callers wanting deterministic
|
||||
// generation, so Ollama's ~0.8 default applied to the JSON
|
||||
// probe + every reviewer call. CompleteOptions.Temperature is
|
||||
// still float64 (not *float64) — the harness's two callers
|
||||
// (HealthCheck, Reviewer) always set it explicitly, so "0
|
||||
// means 0" is the right semantic. If a future caller wants
|
||||
// "use Ollama default", they can set MaxTokens=0 + delete the
|
||||
// option (or we'll switch to *float64 like chatd did).
|
||||
options["temperature"] = opts.Temperature
|
||||
if opts.MaxTokens > 0 {
|
||||
options["num_predict"] = opts.MaxTokens
|
||||
}
|
||||
|
||||
@ -174,9 +174,18 @@ func RunRepo(ctx context.Context, in Inputs) (*Result, error) {
|
||||
validatePhase.OutputHash = sha
|
||||
}
|
||||
if len(valOut.Rejected) > 0 {
|
||||
// Rejected file is informational, not gate-blocking — the
|
||||
// audit trail belongs in version control.
|
||||
if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err == nil {
|
||||
// Scrum fix B2 (Kimi WARN + Opus WARN, 2026-04-30): surface
|
||||
// write errors into validatePhase.Errors. Pre-fix the error
|
||||
// was swallowed (`if _, err := ...; err == nil`), which broke
|
||||
// the receipts-honesty contract — phase status said "ok" while
|
||||
// the rejected-findings audit trail silently vanished.
|
||||
if _, err := reporters.WriteJSON(filepath.Join(in.OutputDir, "rejected-findings.json"), valOut); err != nil {
|
||||
validatePhase.Status = "degraded"
|
||||
validatePhase.Errors = append(validatePhase.Errors, "rejected-findings.json: "+err.Error())
|
||||
if res.ExitCode == 0 {
|
||||
res.ExitCode = 66
|
||||
}
|
||||
} else {
|
||||
validatePhase.OutputFiles = append(validatePhase.OutputFiles, "rejected-findings.json")
|
||||
res.OutputFiles = append(res.OutputFiles, "rejected-findings.json")
|
||||
}
|
||||
|
||||
@ -178,35 +178,57 @@ func check(repoPath string, f analyzers.Finding, cache map[string]string) Result
|
||||
}
|
||||
|
||||
// evidencePresent returns true if the evidence appears verbatim in
|
||||
// the file. Multi-line evidence is checked as-is; single-line evidence
|
||||
// is also compared trimmed (models often add/drop leading whitespace
|
||||
// when quoting code).
|
||||
// the file, OR every evidence line trim-matches some line in the
|
||||
// file (models often re-indent quotes when quoting code).
|
||||
//
|
||||
// Scrum fix B1 (Kimi BLOCK + Opus WARN, 2026-04-30):
|
||||
// - reject trivially-matchable evidence FIRST (empty, lone braces,
|
||||
// single-char/punct quotes); strings.Contains on tiny strings
|
||||
// hits half the file and lets a non-quoting LLM pass
|
||||
// - per-line trim-match no longer advances a cursor on hit; earlier
|
||||
// version drove cursor forward unconditionally, both preventing
|
||||
// same-line repeated matches and skipping unseen lines, so
|
||||
// out-of-order evidence spuriously failed
|
||||
//
|
||||
// Order is no longer enforced — every evidence line just needs to
|
||||
// appear somewhere in the file. The contract is "evidence quotes
|
||||
// real text from the file," not "evidence quotes contiguous text in
|
||||
// the same order."
|
||||
func evidencePresent(content, evidence string) bool {
|
||||
trimmed := strings.TrimSpace(evidence)
|
||||
if trimmed == "" {
|
||||
return false
|
||||
}
|
||||
// Trivial-match guard: if the *entire* evidence is shorter than 4
|
||||
// non-whitespace chars, reject regardless of how it's being matched.
|
||||
// Lone `}` / `{` / `)` / `(` would substring-hit any matching brace
|
||||
// in the file. Min-length picked at 4 because real verbatim quotes
|
||||
// (variable names, function calls) are essentially always longer.
|
||||
if nonWSLen(trimmed) < 4 {
|
||||
return false
|
||||
}
|
||||
if strings.Contains(content, evidence) {
|
||||
return true
|
||||
}
|
||||
// Trim each line of the evidence + match line-by-line. Conservative:
|
||||
// every evidence line must appear (in order) in the file's trimmed
|
||||
// lines for the evidence to count as found.
|
||||
evLines := strings.Split(strings.TrimSpace(evidence), "\n")
|
||||
if len(evLines) == 0 {
|
||||
return false
|
||||
}
|
||||
evLines := strings.Split(trimmed, "\n")
|
||||
contentLines := strings.Split(content, "\n")
|
||||
cursor := 0
|
||||
for _, ev := range evLines {
|
||||
want := strings.TrimSpace(ev)
|
||||
if want == "" {
|
||||
continue
|
||||
}
|
||||
// Per-line trivial guard: even within a multi-line evidence
|
||||
// block, a line of `}` shouldn't satisfy the "this evidence
|
||||
// line appears in the file" check.
|
||||
if nonWSLen(want) < 4 {
|
||||
return false
|
||||
}
|
||||
found := false
|
||||
for cursor < len(contentLines) {
|
||||
if strings.Contains(strings.TrimSpace(contentLines[cursor]), want) {
|
||||
for _, cl := range contentLines {
|
||||
if strings.Contains(strings.TrimSpace(cl), want) {
|
||||
found = true
|
||||
cursor++
|
||||
break
|
||||
}
|
||||
cursor++
|
||||
}
|
||||
if !found {
|
||||
return false
|
||||
@ -215,6 +237,16 @@ func evidencePresent(content, evidence string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func nonWSLen(s string) int {
|
||||
n := 0
|
||||
for _, r := range s {
|
||||
if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
|
||||
n++
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// highestLine extracts the largest line number cited in the hint.
|
||||
// Accepts "42", "10-20" (returns 20), "line 42", "L42", "42:5".
|
||||
// Returns (n, true) on parse; (0, false) if no number found.
|
||||
|
||||
@ -184,6 +184,38 @@ func TestValidate_AcceptsRelativeRepoPath(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestEvidencePresent_RejectsTrivialMatches locks in scrum fix B1.
|
||||
// Pre-fix, evidence "}" or "{" trim-matched almost any line in the
|
||||
// file via strings.Contains, so an LLM emitting a single brace would
|
||||
// pass validation against any file with braces.
|
||||
func TestEvidencePresent_RejectsTrivialMatches(t *testing.T) {
|
||||
content := "package main\nfunc main() {\n\tfmt.Println(\"hi\")\n}\n"
|
||||
cases := []struct {
|
||||
evidence string
|
||||
want bool
|
||||
}{
|
||||
// Verbatim full content matches
|
||||
{"func main() {", true},
|
||||
{"fmt.Println(\"hi\")", true},
|
||||
// Trivial single-char/-line matches must REJECT
|
||||
{"}", false},
|
||||
{"{", false},
|
||||
{")", false},
|
||||
// Empty evidence is rejected (caller-side enforced too)
|
||||
{"", false},
|
||||
// Multi-line evidence in different order — must still pass
|
||||
// (cursor-reset fix B1 — pre-fix this would fail because the
|
||||
// cursor advanced past the first match)
|
||||
{"fmt.Println(\"hi\")\nfunc main() {", true},
|
||||
}
|
||||
for _, c := range cases {
|
||||
got := evidencePresent(content, c.evidence)
|
||||
if got != c.want {
|
||||
t.Errorf("evidencePresent(%q) = %v; want %v", c.evidence, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// === Path-traversal protection: hallucinated "../../../etc/passwd" must reject ===
|
||||
|
||||
func TestValidate_RejectsPathEscapingRepo(t *testing.T) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user