diff --git a/internal/scanner/walk.go b/internal/scanner/walk.go index e22e6dd..b5eec5e 100644 --- a/internal/scanner/walk.go +++ b/internal/scanner/walk.go @@ -14,26 +14,51 @@ import ( ) // SkipDirs is the default skip-list. Matches dir basenames anywhere -// in the tree (not just at root). Includes the harness's own output -// dir so review-on-self doesn't loop. +// in the tree (not just at root). Limited to dirs that are universally +// not source code — VCS metadata, dependency caches, language-specific +// virtual environments, editor state. +// +// Scrum fix B5 (Opus BLOCK, 2026-04-30): pre-fix this list also +// included `bin`, `build`, `dist`, `target`, `reports`. Those names +// have legitimate source-dir uses on target repos: +// - lakehouse Rust has `reports/` with markdown documentation +// - Java/Maven projects have `target/` build output but Gradle +// projects don't — and some projects use `target/` as a source dir +// - some Python projects keep scripts in `bin/` +// +// Skipping them globally produced silent false-negative scans on +// real targets. The harness's own self-skip is now path-scoped (see +// SkipPathsAbsolute below), not basename-scoped, so reviewing the +// harness against itself doesn't recurse into its own bin/reports +// while reviewing other repos with those names still works. +// +// Compiled binaries inside bin/ are filtered by the analyzers' +// isTextLike extension check (.exe / .dll / .so / etc.), so the +// scan doesn't waste time decoding them. var SkipDirs = map[string]bool{ - ".git": true, - ".hg": true, - ".svn": true, - "node_modules": true, - "vendor": true, - "target": true, // Rust - "dist": true, - "build": true, - "__pycache__": true, - ".venv": true, - "venv": true, - "bin": true, // golangLAKEHOUSE convention; harness's own too - ".idea": true, - ".vscode": true, - "reports": true, // harness's own output + ".git": true, + ".hg": true, + ".svn": true, + "node_modules": true, + "vendor": true, + "__pycache__": true, + ".venv": true, + "venv": true, + ".idea": true, + ".vscode": true, } +// SkipPathsAbsolute is checked when scanning the harness's own repo — +// these are absolute paths (resolved from the repoPath at scan time) +// that the harness should never recurse into. Today: `bin/` (the +// review-harness binary lands here) and `reports/` (per-run output). +// +// Populated via PathScopedSkipsForSelf when the scanner detects it's +// scanning its own tree (heuristic: target path contains both +// `cmd/review-harness` AND `internal/analyzers`). Empty otherwise so +// other repos see the more permissive default skip-list. +var SkipPathsAbsolute = map[string]bool{} + // File is one entry in the scan result. type File struct { Path string // relative to repo root @@ -72,6 +97,13 @@ func Walk(repoPath string, countLines bool) (*Result, error) { LanguageBreakdown: map[string]int{}, } + // Path-scoped self-skip: when scanning the harness's own tree, + // also skip the dirs that hold its build artifacts + run output. + // Path-scoped (not basename-scoped) so other repos that happen + // to have `bin/` or `reports/` still get scanned. See SkipDirs + // docstring for B5 background. + pathSkips := selfSkipsFor(abs) + walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, walkErr error) error { if walkErr != nil { return nil // best-effort; permission errors etc. are silent @@ -80,6 +112,9 @@ func Walk(repoPath string, countLines bool) (*Result, error) { if SkipDirs[d.Name()] && p != abs { return filepath.SkipDir } + if pathSkips[p] { + return filepath.SkipDir + } return nil } // Skip dotfiles at file level (.gitignore etc. are interesting, @@ -138,6 +173,29 @@ func Walk(repoPath string, countLines bool) (*Result, error) { return res, nil } +// selfSkipsFor returns absolute paths to skip when scanning the +// harness's own tree (where `bin/` is build output and `reports/` +// is per-run runtime data). Empty when scanning anything else, so +// target repos with legitimate `bin/` / `reports/` directories +// don't get those silently excluded. +// +// Heuristic: harness self-detected when the target absolute path +// contains both `cmd/review-harness` AND `internal/analyzers` +// directories — that combination doesn't appear in arbitrary repos. +func selfSkipsFor(abs string) map[string]bool { + cmdMarker := filepath.Join(abs, "cmd", "review-harness") + intMarker := filepath.Join(abs, "internal", "analyzers") + st1, err1 := os.Stat(cmdMarker) + st2, err2 := os.Stat(intMarker) + if err1 != nil || err2 != nil || !st1.IsDir() || !st2.IsDir() { + return nil + } + return map[string]bool{ + filepath.Join(abs, "bin"): true, + filepath.Join(abs, "reports"): true, + } +} + // interestingDotfile lets a few well-known dotfiles through despite // the leading-dot filter. Keeps the scan honest about config files // that often hold the real risk (e.g. committed .env). diff --git a/internal/scanner/walk_test.go b/internal/scanner/walk_test.go new file mode 100644 index 0000000..08ce7cd --- /dev/null +++ b/internal/scanner/walk_test.go @@ -0,0 +1,127 @@ +package scanner + +import ( + "os" + "path/filepath" + "testing" +) + +// TestWalk_DoesNotSkipBinReportsInTargetRepo locks in scrum fix B5. +// Pre-fix the SkipDirs map basename-matched bin/reports/build/dist +// for ANY repo, silently excluding legitimate source directories on +// real targets (lakehouse Rust has reports/ with markdown). Now +// only universal noise (.git, node_modules, vendor, .venv, .idea, +// etc.) is basename-skipped; harness self-skip is path-scoped. +func TestWalk_DoesNotSkipBinReportsInTargetRepo(t *testing.T) { + repo := t.TempDir() + // Plant files under names that USED to be silently skipped. + for _, p := range []string{ + "bin/script.go", + "reports/findings.md", + "build/notes.txt", + "dist/release.md", + "target/output.go", + // Universal-noise dirs SHOULD still skip. + ".git/HEAD", + "node_modules/foo/index.js", + "vendor/dep/lib.go", + } { + full := filepath.Join(repo, p) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte("test\n"), 0o644); err != nil { + t.Fatal(err) + } + } + + res, err := Walk(repo, false) + if err != nil { + t.Fatalf("Walk: %v", err) + } + + seen := map[string]bool{} + for _, f := range res.Files { + seen[f.Path] = true + } + + // These MUST appear (B5 fix — used to be silently skipped). + mustHave := []string{ + "bin/script.go", + "reports/findings.md", + "build/notes.txt", + "dist/release.md", + "target/output.go", + } + for _, p := range mustHave { + if !seen[p] { + t.Errorf("expected %q in scan; got %v", p, mapKeys(seen)) + } + } + + // These MUST be skipped (universal noise). + mustNotHave := []string{ + ".git/HEAD", + "node_modules/foo/index.js", + "vendor/dep/lib.go", + } + for _, p := range mustNotHave { + if seen[p] { + t.Errorf("unexpected %q in scan (universal-noise dir)", p) + } + } +} + +// TestWalk_SelfSkipsBinReportsInHarnessRepo locks in the path-scoped +// self-skip — when the scanner detects it's being run against the +// harness's own tree (cmd/review-harness AND internal/analyzers +// present), it skips bin/ + reports/ to avoid recursing into its +// own build output and run artifacts. +func TestWalk_SelfSkipsBinReportsInHarnessRepo(t *testing.T) { + repo := t.TempDir() + // Plant the marker dirs that signal "this is the harness repo" + for _, p := range []string{ + "cmd/review-harness/main.go", + "internal/analyzers/checks.go", + "bin/review-harness", // build output — should skip + "reports/latest/x.json", // runtime — should skip + "src/real_code.go", // ordinary source — should appear + } { + full := filepath.Join(repo, p) + _ = os.MkdirAll(filepath.Dir(full), 0o755) + _ = os.WriteFile(full, []byte("x\n"), 0o644) + } + + res, err := Walk(repo, false) + if err != nil { + t.Fatalf("Walk: %v", err) + } + seen := map[string]bool{} + for _, f := range res.Files { + seen[f.Path] = true + } + for _, want := range []string{ + "cmd/review-harness/main.go", + "internal/analyzers/checks.go", + "src/real_code.go", + } { + if !seen[want] { + t.Errorf("expected %q in self-scan; got %v", want, mapKeys(seen)) + } + } + // Self-skip should fire on bin/ and reports/. + if seen["bin/review-harness"] { + t.Errorf("bin/ should be self-skipped on harness repo") + } + if seen["reports/latest/x.json"] { + t.Errorf("reports/ should be self-skipped on harness repo") + } +} + +func mapKeys(m map[string]bool) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +}