Implements the MVP cutline from the planning artifact: - Phase A: skeleton + CLI dispatch + provider interface + stub model doctor - Phase B: scanner + git probe + 12 static analyzers + reporters + pipeline - Phase B fixtures: clean-repo, insecure-repo, degraded-repo 12 static analyzers per PROMPT.md "Suggested Static Checks For MVP": hardcoded_paths, shell_execution, raw_sql_interpolation, broad_cors, secret_patterns, large_files, todo_comments, missing_tests, env_file_committed, unsafe_file_io, exposed_mutation_endpoint, hardcoded_local_ip. Acceptance gates passing: - B1 (intake produces accurate counts) ✓ - B2 (insecure fixture fires ≥8 distinct check_ids — actually 11/12) ✓ - B3 (clean fixture produces 0 confirmed findings — no false positives) ✓ - B4 (scrum mode produces all 6 required markdown + JSON reports) ✓ - B5 (receipts.json marks degraded phases honestly) ✓ - F (self-review on this repo runs without crashing) ✓ — exit 66 (degraded because Phase C LLM review is hardcoded skipped) Phases C (LLM review), D (validation cross-check), E (memory + diff + rules subcommands) deferred per the cutline. The MVP delivers the evidence-first path; LLM is purely additive. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
130 lines
3.6 KiB
Go
130 lines
3.6 KiB
Go
package analyzers
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"local-review-harness/internal/config"
|
|
"local-review-harness/internal/scanner"
|
|
)
|
|
|
|
// Analyzer is the contract every static check implements. Pure
// function over the scan result; no I/O outside reading files
// (which the runner does once and passes in).
type Analyzer interface {
	// ID is the stable check identifier (e.g. "static.hardcoded_paths").
	ID() string

	// Enabled reports whether the review profile turned this check on.
	Enabled(rp config.ReviewProfile) bool

	// Inspect returns findings for one file. The runner skips this
	// for binary / non-text files based on extension heuristics.
	// content may be the empty string when the file exceeded
	// rp.Limits.MaxFileBytes or could not be read — implementations
	// must tolerate that and fall back to metadata-only checks.
	Inspect(file scanner.File, content string, rp config.ReviewProfile) []Finding
}
|
|
|
|
// All returns the 12 MVP analyzers. Order is stable so report
|
|
// determinism flows from analyzer ordering.
|
|
func All() []Analyzer {
|
|
return []Analyzer{
|
|
&hardcodedPathsAnalyzer{},
|
|
&shellExecAnalyzer{},
|
|
&rawSQLAnalyzer{},
|
|
&corsAnalyzer{},
|
|
&secretPatternsAnalyzer{},
|
|
&largeFilesAnalyzer{},
|
|
&todoFixmeAnalyzer{},
|
|
&missingTestsAnalyzer{},
|
|
&envFileAnalyzer{},
|
|
&unsafeFileIOAnalyzer{},
|
|
&exposedMutationAnalyzer{},
|
|
&hardcodedIPsAnalyzer{},
|
|
}
|
|
}
|
|
|
|
// Run executes every enabled analyzer over the scan result. Reads
|
|
// each text file once + dispatches the content to all analyzers.
|
|
// Files larger than rp.Limits.MaxFileBytes are skipped (analyzers
|
|
// run on file metadata only — e.g. large-files check still fires).
|
|
func Run(scan *scanner.Result, rp config.ReviewProfile) []Finding {
|
|
all := All()
|
|
enabled := make([]Analyzer, 0, len(all))
|
|
for _, a := range all {
|
|
if a.Enabled(rp) {
|
|
enabled = append(enabled, a)
|
|
}
|
|
}
|
|
|
|
findings := []Finding{}
|
|
|
|
// Per-file analyzers (read content once)
|
|
for _, f := range scan.Files {
|
|
if !isTextLike(f) {
|
|
continue
|
|
}
|
|
var content string
|
|
if f.Size <= int64(rp.Limits.MaxFileBytes) {
|
|
b, err := os.ReadFile(f.Abs)
|
|
if err == nil {
|
|
content = string(b)
|
|
}
|
|
}
|
|
for _, a := range enabled {
|
|
fs := a.Inspect(f, content, rp)
|
|
findings = append(findings, fs...)
|
|
}
|
|
}
|
|
|
|
// Repo-level analyzers (scan-result-only checks)
|
|
for _, a := range enabled {
|
|
if rl, ok := a.(repoLevelAnalyzer); ok {
|
|
findings = append(findings, rl.InspectRepo(scan, rp)...)
|
|
}
|
|
}
|
|
|
|
// Stable ID assignment per finding so memory dedup works across runs.
|
|
for i := range findings {
|
|
findings[i].ID = stableID(findings[i])
|
|
}
|
|
return findings
|
|
}
|
|
|
|
// repoLevelAnalyzer is for checks that operate on the whole scan
// (e.g. "missing tests" — only fires once per repo, not per file).
// Implemented optionally, alongside Analyzer; Run detects it via a
// type assertion and invokes it once after the per-file pass.
type repoLevelAnalyzer interface {
	// InspectRepo returns findings derived from the scan result as a
	// whole rather than from any single file's content.
	InspectRepo(scan *scanner.Result, rp config.ReviewProfile) []Finding
}
|
|
|
|
// isTextLike filters out files where regex scanning is meaningless.
|
|
// Conservative — when in doubt, scan; analyzers handle their own noise.
|
|
func isTextLike(f scanner.File) bool {
|
|
switch strings.ToLower(filepath.Ext(f.Path)) {
|
|
case ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".ico",
|
|
".pdf", ".zip", ".tar", ".gz", ".bz2", ".xz",
|
|
".woff", ".woff2", ".ttf", ".otf",
|
|
".mp3", ".mp4", ".mov", ".wav",
|
|
".so", ".dll", ".dylib", ".exe",
|
|
".parquet", ".lance", ".arrow":
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// stableID is sha256(check_id|file|line_hint|evidence) truncated to
|
|
// 12 hex chars. Same finding across runs → same ID. Used by memory
|
|
// for append-only dedup signal (Phase E).
|
|
func stableID(f Finding) string {
|
|
h := sha256.New()
|
|
h.Write([]byte(f.CheckID))
|
|
h.Write([]byte("|"))
|
|
h.Write([]byte(f.File))
|
|
h.Write([]byte("|"))
|
|
h.Write([]byte(f.LineHint))
|
|
h.Write([]byte("|"))
|
|
h.Write([]byte(f.Evidence))
|
|
return hex.EncodeToString(h.Sum(nil))[:12]
|
|
}
|