// Package scanner walks a repository tree, classifies files, and // surfaces metadata for the analyzers + repo-intake report. // // Skip-list defaults to common build/dependency dirs that nobody // wants to scan. Operators can extend via review-profile (Phase E). package scanner import ( "io/fs" "os" "path/filepath" "sort" "strings" ) // SkipDirs is the default skip-list. Matches dir basenames anywhere // in the tree (not just at root). Limited to dirs that are universally // not source code — VCS metadata, dependency caches, language-specific // virtual environments, editor state. // // Scrum fix B5 (Opus BLOCK, 2026-04-30): pre-fix this list also // included `bin`, `build`, `dist`, `target`, `reports`. Those names // have legitimate source-dir uses on target repos: // - lakehouse Rust has `reports/` with markdown documentation // - Java/Maven projects have `target/` build output but Gradle // projects don't — and some projects use `target/` as a source dir // - some Python projects keep scripts in `bin/` // // Skipping them globally produced silent false-negative scans on // real targets. The harness's own self-skip is now path-scoped (see // SkipPathsAbsolute below), not basename-scoped, so reviewing the // harness against itself doesn't recurse into its own bin/reports // while reviewing other repos with those names still works. // // Compiled binaries inside bin/ are filtered by the analyzers' // isTextLike extension check (.exe / .dll / .so / etc.), so the // scan doesn't waste time decoding them. var SkipDirs = map[string]bool{ ".git": true, ".hg": true, ".svn": true, "node_modules": true, "vendor": true, "__pycache__": true, ".venv": true, "venv": true, ".idea": true, ".vscode": true, // Harness's own memory dir. Per PROMPT.md the harness writes to // /.memory/ which contains JSONL findings + run history; // scanning those would surface the harness's own audit trail // quotes (e.g. planted-secret evidence) as new findings. ".memory": true, } // SkipPathsAbsolute is checked when scanning the harness's own repo — // these are absolute paths (resolved from the repoPath at scan time) // that the harness should never recurse into. Today: `bin/` (the // review-harness binary lands here) and `reports/` (per-run output). // // Populated via PathScopedSkipsForSelf when the scanner detects it's // scanning its own tree (heuristic: target path contains both // `cmd/review-harness` AND `internal/analyzers`). Empty otherwise so // other repos see the more permissive default skip-list. var SkipPathsAbsolute = map[string]bool{} // File is one entry in the scan result. type File struct { Path string // relative to repo root Abs string // absolute path on disk Size int64 // bytes Lines int // 0 if not counted Language string // best-effort, "" if unknown } // Result is the scan summary the analyzers + reporters consume. type Result struct { RepoPath string Files []File LanguageBreakdown map[string]int // count of files by language LargestFiles []File // top 10 by size DependencyManifests []string // relative paths to package.json / go.mod / etc TestManifests []string // tests/ dirs, *_test.go, *.test.ts, etc } // Walk produces a Result for repoPath. Errors on missing dir; skipped // dirs are silently filtered. countLines is true → reads each file // for line counts (Phase B needs this; Phase A wires false for speed). // // Skip layers, applied in order: // 1. Universal-noise basenames (.git, node_modules, vendor, etc.) // 2. .gitignore patterns (read once at repoPath/.gitignore) // 3. Path-scoped self-skip (only when the harness scans its own tree) // 4. Dotfile filter (skip leading-dot files unless interestingDotfile) // // Layer 2 is the "scrum fix B5+1" — without it, target repos with // large data dirs (lakehouse Rust has 67GB under data/) stalled the // scan. With it, the scanner respects the operator's intent. func Walk(repoPath string, countLines bool) (*Result, error) { abs, err := filepath.Abs(repoPath) if err != nil { return nil, err } if st, err := os.Stat(abs); err != nil { return nil, err } else if !st.IsDir() { return nil, fs.ErrInvalid } res := &Result{ RepoPath: abs, LanguageBreakdown: map[string]int{}, } // Path-scoped self-skip: when scanning the harness's own tree, // also skip the dirs that hold its build artifacts + run output. // Path-scoped (not basename-scoped) so other repos that happen // to have `bin/` or `reports/` still get scanned. See SkipDirs // docstring for B5 background. pathSkips := selfSkipsFor(abs) // Load .gitignore at repo root if present. Empty/missing → // matcher.Skip always returns false; behavior matches the pre- // 2026-04-30 scanner. Negations (!pattern) are NOT supported v0; // matcher.HasNegations() lets the caller surface a warning. gitignore, _ := LoadGitignore(abs) walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, walkErr error) error { if walkErr != nil { return nil // best-effort; permission errors etc. are silent } if d.IsDir() { if SkipDirs[d.Name()] && p != abs { return filepath.SkipDir } if pathSkips[p] { return filepath.SkipDir } if gitignore.Skip(p, true) { return filepath.SkipDir } return nil } if gitignore.Skip(p, false) { return nil } // Skip dotfiles at file level (.gitignore etc. are interesting, // but most dotfiles are noise; Analyzers can opt back in). if strings.HasPrefix(d.Name(), ".") && !interestingDotfile(d.Name()) { return nil } info, err := d.Info() if err != nil { return nil } rel, err := filepath.Rel(abs, p) if err != nil { rel = p } f := File{ Path: rel, Abs: p, Size: info.Size(), Language: detectLanguage(d.Name()), } if countLines && info.Size() < 5_000_000 { // 5MB cap; massive files lose line precision but stay scannable if n, err := countFileLines(p); err == nil { f.Lines = n } } res.Files = append(res.Files, f) if f.Language != "" { res.LanguageBreakdown[f.Language]++ } if isManifest(d.Name()) { res.DependencyManifests = append(res.DependencyManifests, rel) } if isTestPath(rel) { res.TestManifests = append(res.TestManifests, rel) } return nil }) if walkErr != nil { return nil, walkErr } // Largest top-10 by size. sorted := make([]File, len(res.Files)) copy(sorted, res.Files) sort.Slice(sorted, func(i, j int) bool { return sorted[i].Size > sorted[j].Size }) if len(sorted) > 10 { sorted = sorted[:10] } res.LargestFiles = sorted // Stable order for downstream determinism. sort.Strings(res.DependencyManifests) sort.Strings(res.TestManifests) return res, nil } // selfSkipsFor returns absolute paths to skip when scanning the // harness's own tree (where `bin/` is build output and `reports/` // is per-run runtime data). Empty when scanning anything else, so // target repos with legitimate `bin/` / `reports/` directories // don't get those silently excluded. // // Heuristic: harness self-detected when the target absolute path // contains both `cmd/review-harness` AND `internal/analyzers` // directories — that combination doesn't appear in arbitrary repos. func selfSkipsFor(abs string) map[string]bool { cmdMarker := filepath.Join(abs, "cmd", "review-harness") intMarker := filepath.Join(abs, "internal", "analyzers") st1, err1 := os.Stat(cmdMarker) st2, err2 := os.Stat(intMarker) if err1 != nil || err2 != nil || !st1.IsDir() || !st2.IsDir() { return nil } return map[string]bool{ filepath.Join(abs, "bin"): true, filepath.Join(abs, "reports"): true, } } // interestingDotfile lets a few well-known dotfiles through despite // the leading-dot filter. Keeps the scan honest about config files // that often hold the real risk (e.g. committed .env). func interestingDotfile(name string) bool { switch name { case ".env", ".env.local", ".env.production", ".gitignore", ".dockerignore", ".github", ".review-rules.md", ".review-profile.yaml": return true } return false } func countFileLines(path string) (int, error) { b, err := os.ReadFile(path) if err != nil { return 0, err } if len(b) == 0 { return 0, nil } n := strings.Count(string(b), "\n") if !strings.HasSuffix(string(b), "\n") { n++ // last line without trailing newline } return n, nil }