local-review-harness/internal/validators/validate.go

// Package validators cross-checks LLM-generated findings against
// real repository evidence. PROMPT.md / REVIEW_PIPELINE.md Phase 3:
// "AI may suggest. Code validates." Findings that pass validation
// move from status=suspected → status=confirmed; failures land in a
// separate rejected-findings.json with a per-rejection reason.
//
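// V0 implements 3 hard checks per the PROMPT.md "Reject A Finding If"
// list, preceded by a pre-check that rejects findings whose evidence
// field is empty:
//   - file does not exist (or resolves outside the repo root)
//   - cited evidence does not appear in the file (verbatim, or line by
//     line after trimming surrounding whitespace)
//   - line hint is impossible (file has fewer lines than claimed)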
//
// 3 softer checks from the same list are NOT v0 — documented as
// "open" so the audit trail is honest:
//   - claim is unsupported (semantic, requires another LLM pass)
//   - suggested fix targets unrelated code (semantic)
//   - model invents tests/commands/files (covered by file-exists for
//     files; tests/commands need a Phase D+1 fact-check)
package validators

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"

	"local-review-harness/internal/analyzers"
)

// Reason identifies why a finding was rejected. It is a string type and
// is stored in Result.RejectionReason (JSON key "rejection_reason").
// The values are kept stable so reports + receipts can group/sort by
// reason.
type Reason string

// Rejection reasons assigned by check. The string values are part of the
// serialized output; do not rename them casually.
const (
	// ReasonFileNotFound is used when the cited path cannot be made
	// absolute, resolves outside the repo root, or cannot be read.
	ReasonFileNotFound    Reason = "file_not_found"
	// ReasonNoEvidence is used when the evidence quote is not found in
	// the cited file.
	ReasonNoEvidence      Reason = "evidence_not_in_file"
	// ReasonLineHintTooHigh is used when the highest line number in the
	// line hint is greater than the number of lines in the file.
	ReasonLineHintTooHigh Reason = "line_hint_exceeds_file_length"
	// ReasonEmptyEvidence is used when the evidence field is empty or
	// whitespace-only.
	ReasonEmptyEvidence   Reason = "empty_evidence_field"
)

// Result is the validator's output for one finding.
type Result struct {
	// Finding is the finding this result refers to.
	Finding         analyzers.Finding `json:"finding"`
	// Validated is true when the finding passed every check (or was
	// passed through without checking because it is not an LLM finding).
	Validated       bool              `json:"validated"`
	// RejectionReason is the first check that failed; empty (and omitted
	// from JSON) when Validated is true.
	RejectionReason Reason            `json:"rejection_reason,omitempty"`
	// RejectionDetail is a human-readable explanation accompanying
	// RejectionReason; omitted from JSON when empty.
	RejectionDetail string            `json:"rejection_detail,omitempty"`
}

// Outputs splits the input list into validated + rejected. Only LLM
// findings (Source == SourceLLM) get validated — static findings
// already have grep-able evidence by construction.
type Outputs struct {
	// Validated holds the findings promoted to confirmed: non-LLM
	// findings plus LLM findings that passed every check. Excluded from
	// JSON output.
	Validated []analyzers.Finding `json:"-"`
	// Rejected holds one Result per LLM finding that failed a check,
	// with the rejection reason and detail filled in.
	Rejected  []Result            `json:"rejected"`
	// Pass holds one Result (Validated == true) per finding that ended
	// up in Validated.
	Pass      []Result            `json:"pass"`
}

// Validate runs the hard checks against every LLM finding and sorts the
// results into the three Outputs buckets.
//
// Findings whose Source is not analyzers.SourceLLM are not checked (they
// have their own evidence pipeline): they are marked StatusConfirmed and
// appended to both Validated and Pass. LLM findings go through check;
// those that pass are marked StatusConfirmed (Validated + Pass), the
// rest are marked StatusRejected and recorded in Rejected together with
// the per-rejection reason for the audit trail.
//
// repoPath is the root of the repository the LLM was asked to review;
// relative finding File paths are joined under it.
//
// All three slices of the returned Outputs are non-nil even when empty,
// so the JSON-tagged Rejected and Pass fields encode as [] rather than
// null when nothing was rejected (or nothing passed).
func Validate(repoPath string, findings []analyzers.Finding) Outputs {
	out := Outputs{
		Validated: make([]analyzers.Finding, 0, len(findings)),
		Rejected:  []Result{},
		Pass:      make([]Result, 0, len(findings)),
	}
	// abs path → content, so a file cited by several findings is read once.
	contentCache := make(map[string]string)

	for _, f := range findings {
		if f.Source != analyzers.SourceLLM {
			// Non-LLM findings carry their own evidence path; they are
			// confirmed without running the checks.
			f.Status = analyzers.StatusConfirmed
			out.Validated = append(out.Validated, f)
			out.Pass = append(out.Pass, Result{Finding: f, Validated: true})
			continue
		}

		res := check(repoPath, f, contentCache)
		if !res.Validated {
			res.Finding.Status = analyzers.StatusRejected
			out.Rejected = append(out.Rejected, res)
			continue
		}
		res.Finding.Status = analyzers.StatusConfirmed
		out.Validated = append(out.Validated, res.Finding)
		out.Pass = append(out.Pass, res)
	}
	return out
}

// check validates a single finding against the repository on disk. It
// stops at the first failing check — operators only need to see one
// rejection reason — so a rejected Result carries exactly one
// RejectionReason/RejectionDetail pair.
//
// Checks, in order:
//  1. the evidence field is not blank (ReasonEmptyEvidence)
//  2. f.File resolves inside repoPath and can be read (ReasonFileNotFound)
//  3. the evidence quote is found in the file (ReasonNoEvidence)
//  4. the highest number in f.LineHint, when one can be parsed, does not
//     exceed the file's line count (ReasonLineHintTooHigh)
//
// cache maps absolute path → file content; it is consulted before
// reading and filled after a successful read, so each file is loaded at
// most once per cache.
//
// The returned Result has Validated == true only if every check passed.
func check(repoPath string, f analyzers.Finding, cache map[string]string) Result {
	res := Result{Finding: f}

	// Empty evidence is unusable — the model didn't quote anything.
	if strings.TrimSpace(f.Evidence) == "" {
		res.RejectionReason = ReasonEmptyEvidence
		res.RejectionDetail = "finding has no evidence quote — can't be validated"
		return res
	}

	// Resolve the cited file. f.File is expected to be repo-relative, but
	// an absolute path is accepted too and goes through the same
	// containment check. Both the target and the repo root are made
	// absolute before comparing: comparing a relative path against an
	// absolute root never matches and would reject every finding.
	// filepath.Abs already returns a cleaned path.
	joined := f.File
	if !filepath.IsAbs(joined) {
		joined = filepath.Join(repoPath, f.File)
	}
	abs, err := filepath.Abs(joined)
	if err != nil {
		res.RejectionReason = ReasonFileNotFound
		res.RejectionDetail = "abs(" + joined + "): " + err.Error()
		return res
	}

	// Refuse to traverse outside the repo (path-traversal protection
	// — the LLM might have hallucinated a "../../../etc/passwd" file).
	repoAbs, err := filepath.Abs(repoPath)
	if err != nil {
		res.RejectionReason = ReasonFileNotFound
		res.RejectionDetail = "abs(" + repoPath + "): " + err.Error()
		return res
	}
	// Build the "root + separator" prefix without doubling the separator
	// when the root already ends in one (e.g. "/"); a doubled separator
	// would never prefix-match a cleaned path.
	rootPrefix := repoAbs
	if !strings.HasSuffix(rootPrefix, string(filepath.Separator)) {
		rootPrefix += string(filepath.Separator)
	}
	if abs != repoAbs && !strings.HasPrefix(abs, rootPrefix) {
		res.RejectionReason = ReasonFileNotFound
		res.RejectionDetail = fmt.Sprintf("path %q escapes repo root %q (resolved: abs=%q repo_abs=%q)", f.File, repoPath, abs, repoAbs)
		return res
	}

	// Read once + cache.
	content, ok := cache[abs]
	if !ok {
		b, err := os.ReadFile(abs)
		if err != nil {
			res.RejectionReason = ReasonFileNotFound
			res.RejectionDetail = err.Error()
			return res
		}
		content = string(b)
		cache[abs] = content
	}

	// Evidence presence check — the quote must appear in the file,
	// either verbatim or line by line after trimming surrounding
	// whitespace (models often re-indent quotes). See evidencePresent
	// for the exact matching rules.
	if !evidencePresent(content, f.Evidence) {
		res.RejectionReason = ReasonNoEvidence
		res.RejectionDetail = fmt.Sprintf("evidence %q not found in %s", abbrev(f.Evidence, 80), f.File)
		return res
	}

	// Line hint plausibility — parse "42" or "10-20" or "line 42";
	// reject if file has fewer lines than the highest cited number.
	// A hint with no parsable number is not checked.
	if hint := strings.TrimSpace(f.LineHint); hint != "" {
		if hi, ok := highestLine(hint); ok {
			// A trailing newline terminates the last line rather than
			// starting a new one, so only count an extra line when the
			// file does not end in "\n".
			fileLines := strings.Count(content, "\n")
			if !strings.HasSuffix(content, "\n") {
				fileLines++
			}
			if hi > fileLines {
				res.RejectionReason = ReasonLineHintTooHigh
				res.RejectionDetail = fmt.Sprintf("line %d cited but file has only %d lines", hi, fileLines)
				return res
			}
		}
	}

	res.Validated = true
	return res
}

// evidencePresent reports whether evidence quotes real text from content.
//
// Matching rules, applied in order:
//   - Evidence that is blank, or that has fewer than 4 non-whitespace
//     characters in total, never matches. Tiny quotes such as a lone
//     `}` or `(` would substring-hit almost any file and let a
//     non-quoting model pass.
//   - If evidence occurs in content exactly as given, it matches.
//   - Otherwise the trimmed evidence is split on newlines and each
//     non-blank line, with surrounding whitespace trimmed, must occur as
//     a substring of some whitespace-trimmed line of content. This
//     tolerates re-indented quotes.
//
// The line-by-line fallback does not enforce order or contiguity: every
// evidence line just has to appear somewhere in the file. A non-blank
// evidence line with fewer than 4 non-whitespace characters makes the
// whole fallback fail, so a re-indented multi-line quote that contains
// e.g. a bare `}` line is rejected.
func evidencePresent(content, evidence string) bool {
	quote := strings.TrimSpace(evidence)
	if quote == "" || nonWSLen(quote) < 4 {
		return false
	}

	// Fast path: the quote is in the file byte-for-byte.
	if strings.Contains(content, evidence) {
		return true
	}

	fileLines := strings.Split(content, "\n")

nextEvidenceLine:
	for _, raw := range strings.Split(quote, "\n") {
		needle := strings.TrimSpace(raw)
		switch {
		case needle == "":
			// Blank lines inside the quote carry no information.
			continue
		case nonWSLen(needle) < 4:
			// Too short to be meaningful evidence on its own.
			return false
		}
		for i := range fileLines {
			if strings.Contains(strings.TrimSpace(fileLines[i]), needle) {
				continue nextEvidenceLine
			}
		}
		// This evidence line appears nowhere in the file.
		return false
	}
	return true
}

// nonWSLen returns the number of runes in s that are not a space, tab,
// newline or carriage return. Other whitespace-like runes (e.g. vertical
// tab) are counted.
func nonWSLen(s string) int {
	count := 0
	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r':
			// whitespace: not counted
		default:
			count++
		}
	}
	return count
}

// lineHintNumRe matches each run of decimal digits in a line hint.
var lineHintNumRe = regexp.MustCompile(`\d+`)

// highestLine extracts the largest line number cited in hint.
// Accepts "42", "10-20" (returns 20), "line 42", "L42", "42:5".
//
// Returns (n, true) when a positive number was found and (0, false) when
// the hint contains no digits or only zeros. A digit run too large to
// fit in an int is treated as the maximum int value rather than being
// ignored, so an absurdly large hint still counts as "the highest line"
// and cannot slip past the caller's file-length comparison.
func highestLine(hint string) (int, bool) {
	const maxInt = int(^uint(0) >> 1)

	hi := 0
	for _, m := range lineHintNumRe.FindAllString(hint, -1) {
		n, err := strconv.Atoi(m)
		if err != nil {
			// m consists solely of digits, so the only way Atoi can
			// fail is by overflowing int.
			n = maxInt
		}
		if n > hi {
			hi = n
		}
	}
	return hi, hi > 0
}

// abbrev trims surrounding whitespace from s and, if the result is longer
// than n bytes, shortens it to at most n bytes and appends "…".
//
// The cut point is moved back to the nearest rune boundary at or before
// byte n, so a multi-byte UTF-8 character is never split and the result
// stays valid UTF-8 whenever s is. For ASCII input the result is exactly
// the first n bytes plus the ellipsis. A non-positive n yields just "…"
// for any s that needs shortening.
func abbrev(s string, n int) string {
	s = strings.TrimSpace(s)
	if len(s) <= n {
		return s
	}
	// Ranging over a string yields the byte offset of each rune start;
	// keep the last one that does not exceed n.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "…"
}