Claude (review-harness setup) f3ee4722a8 Phase A + B (MVP) — local review harness
Implements the MVP cutline from the planning artifact:
- Phase A: skeleton + CLI dispatch + provider interface + stub model doctor
- Phase B: scanner + git probe + 12 static analyzers + reporters + pipeline
- Phase B fixtures: clean-repo, insecure-repo, degraded-repo

12 static analyzers per PROMPT.md "Suggested Static Checks For MVP":
hardcoded_paths, shell_execution, raw_sql_interpolation, broad_cors,
secret_patterns, large_files, todo_comments, missing_tests,
env_file_committed, unsafe_file_io, exposed_mutation_endpoint,
hardcoded_local_ip.

Acceptance gates passing:
- B1 (intake produces accurate counts) ✓
- B2 (insecure fixture fires ≥8 distinct check_ids — actually 11/12) ✓
- B3 (clean fixture produces 0 confirmed findings — no false positives) ✓
- B4 (scrum mode produces all 6 required markdown + JSON reports) ✓
- B5 (receipts.json marks degraded phases honestly) ✓
- F  (self-review on this repo runs without crashing) ✓ — exit 66 (degraded
  because Phase C LLM review is hardcoded skipped)

Phases C (LLM review), D (validation cross-check), E (memory + diff +
rules subcommands) deferred per the cutline. The MVP delivers the
evidence-first path; LLM is purely additive.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 00:56:02 -05:00

168 lines
4.5 KiB
Go

// Package scanner walks a repository tree, classifies files, and
// surfaces metadata for the analyzers + repo-intake report.
//
// Skip-list defaults to common build/dependency dirs that nobody
// wants to scan. Operators can extend via review-profile (Phase E).
package scanner
import (
	"bytes"
	"io/fs"
	"os"
	"path/filepath"
	"sort"
	"strings"
)
// SkipDirs is the default skip-list: directory basenames that are never
// worth scanning. A match anywhere in the tree (not just at the root)
// prunes the whole subtree. The harness's own output directory is
// included so review-on-self doesn't loop.
var SkipDirs = map[string]bool{
	// Version-control metadata.
	".git": true,
	".hg":  true,
	".svn": true,
	// Dependency and build output.
	"node_modules": true,
	"vendor":       true,
	"target":       true, // Rust
	"dist":         true,
	"build":        true,
	"bin":          true, // Go convention; the harness's own output lands here too
	// Python environments and caches.
	"__pycache__": true,
	".venv":       true,
	"venv":        true,
	// Editor/IDE state.
	".idea":   true,
	".vscode": true,
	// The harness's own report output.
	"reports": true,
}
// File is one entry in the scan result: a single regular file that
// survived the skip-list and dotfile filters.
type File struct {
Path string // path relative to repo root (filepath.Rel; falls back to absolute if Rel fails)
Abs string // absolute path on disk
Size int64 // size in bytes at scan time
Lines int // newline-based line count; 0 if not counted (countLines=false or file too large)
Language string // best-effort detection from the filename, "" if unknown
}
// Result is the scan summary the analyzers + reporters consume.
// Files preserves WalkDir's lexical order; the manifest slices are
// sorted so downstream output is deterministic.
type Result struct {
RepoPath string // absolute path of the scanned repo root
Files []File // every non-skipped file, in lexical walk order
LanguageBreakdown map[string]int // count of files by language (files with unknown language excluded)
LargestFiles []File // top 10 by size
DependencyManifests []string // relative paths to package.json / go.mod / etc, sorted
TestManifests []string // tests/ dirs, *_test.go, *.test.ts, etc, sorted
}
// Walk produces a Result for repoPath. It errors when repoPath does not
// exist or is not a directory; skipped dirs and uninteresting dotfiles
// are silently filtered. When countLines is true, each file under the
// size cap is read to populate File.Lines (Phase B needs this; Phase A
// wires false for speed).
func Walk(repoPath string, countLines bool) (*Result, error) {
	abs, err := filepath.Abs(repoPath)
	if err != nil {
		return nil, err
	}
	if st, err := os.Stat(abs); err != nil {
		return nil, err
	} else if !st.IsDir() {
		return nil, fs.ErrInvalid
	}
	res := &Result{
		RepoPath:          abs,
		LanguageBreakdown: map[string]int{},
	}
	walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, entErr error) error {
		if entErr != nil {
			return nil // best-effort; permission errors etc. are silent
		}
		if d.IsDir() {
			// Prune skip-listed subtrees, but never the root itself
			// (a repo whose basename happens to match, e.g. "build").
			if SkipDirs[d.Name()] && p != abs {
				return filepath.SkipDir
			}
			return nil
		}
		// Skip dotfiles at file level (.gitignore etc. are interesting,
		// but most dotfiles are noise; analyzers can opt back in).
		if strings.HasPrefix(d.Name(), ".") && !interestingDotfile(d.Name()) {
			return nil
		}
		info, err := d.Info()
		if err != nil {
			return nil // entry vanished mid-walk; drop it
		}
		rel, err := filepath.Rel(abs, p)
		if err != nil {
			rel = p // keep the file under its absolute path rather than dropping it
		}
		f := File{
			Path:     rel,
			Abs:      p,
			Size:     info.Size(),
			Language: detectLanguage(d.Name()),
		}
		// 5MB cap: massive files lose line precision but stay scannable.
		if countLines && info.Size() < 5_000_000 {
			if n, err := countFileLines(p); err == nil {
				f.Lines = n
			}
		}
		res.Files = append(res.Files, f)
		if f.Language != "" {
			res.LanguageBreakdown[f.Language]++
		}
		if isManifest(d.Name()) {
			res.DependencyManifests = append(res.DependencyManifests, rel)
		}
		if isTestPath(rel) {
			res.TestManifests = append(res.TestManifests, rel)
		}
		return nil
	})
	if walkErr != nil {
		return nil, walkErr
	}
	// Largest top-10 by size. Tie-break on Path so equal-size files keep
	// a deterministic order across runs (sort.Slice is not stable).
	sorted := make([]File, len(res.Files))
	copy(sorted, res.Files)
	sort.Slice(sorted, func(i, j int) bool {
		if sorted[i].Size != sorted[j].Size {
			return sorted[i].Size > sorted[j].Size
		}
		return sorted[i].Path < sorted[j].Path
	})
	if len(sorted) > 10 {
		sorted = sorted[:10]
	}
	res.LargestFiles = sorted
	// Stable order for downstream determinism.
	sort.Strings(res.DependencyManifests)
	sort.Strings(res.TestManifests)
	return res, nil
}
// interestingDotfile reports whether a leading-dot filename should
// survive the dotfile filter. A handful of well-known config files are
// allowed through because they often hold the real risk (e.g. a
// committed .env).
func interestingDotfile(name string) bool {
	allowed := [...]string{
		".env", ".env.local", ".env.production",
		".gitignore", ".dockerignore", ".github",
		".review-rules.md", ".review-profile.yaml",
	}
	for _, a := range allowed {
		if name == a {
			return true
		}
	}
	return false
}
func countFileLines(path string) (int, error) {
b, err := os.ReadFile(path)
if err != nil {
return 0, err
}
if len(b) == 0 {
return 0, nil
}
n := strings.Count(string(b), "\n")
if !strings.HasSuffix(string(b), "\n") {
n++ // last line without trailing newline
}
return n, nil
}