Apply B5 from 2026-04-30 scrum — scanner skip-list scoped to harness self
Opus-only BLOCK from the cross-lineage scrum: pre-fix SkipDirs basename-matched bin/build/dist/target/reports for ANY repo, silently excluding legitimate source dirs on real targets. The lakehouse Rust repo has reports/ holding markdown; some Java/ Python/Go projects use bin/ as a source dir; target/ is project- specific. Skipping them globally produced silent false-negative scans the operator would never know about. Fix: trim SkipDirs to dirs that are universally not source code — .git, .hg, .svn (VCS metadata); node_modules, vendor (dep caches); __pycache__, .venv, venv (Python envs); .idea, .vscode (editor state). Removed: bin, build, dist, target, reports. For the harness's own self-skip (it shouldn't scan its own bin/ or reports/), added path-scoped skip via selfSkipsFor — detects "this is the harness repo" by the presence of BOTH cmd/review-harness/ AND internal/analyzers/ subdirs (combination unique to this codebase), then skips the absolute paths bin/ and reports/ for that scan only. Two regression tests: - TestWalk_DoesNotSkipBinReportsInTargetRepo plants files under bin/, reports/, build/, dist/, target/ in a synthetic target repo; asserts all 5 appear in scan, while .git/ + node_modules/ + vendor/ are still skipped. - TestWalk_SelfSkipsBinReportsInHarnessRepo plants the harness's marker dirs (cmd/review-harness/, internal/analyzers/) plus bin/ + reports/ + ordinary src/; asserts self-skip fires on bin/+reports/ but real src/ scans normally. Compiled artifacts inside bin/ are filtered by the analyzers' isTextLike extension check (.exe / .dll / .so), so target repos with bin/ holding compiled output don't waste cycles decoding it. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
305f5b9ac0
commit
ab550a7c5a
@ -14,26 +14,51 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// SkipDirs is the default skip-list. Matches dir basenames anywhere
|
// SkipDirs is the default skip-list. Matches dir basenames anywhere
|
||||||
// in the tree (not just at root). Includes the harness's own output
|
// in the tree (not just at root). Limited to dirs that are universally
|
||||||
// dir so review-on-self doesn't loop.
|
// not source code — VCS metadata, dependency caches, language-specific
|
||||||
|
// virtual environments, editor state.
|
||||||
|
//
|
||||||
|
// Scrum fix B5 (Opus BLOCK, 2026-04-30): pre-fix this list also
|
||||||
|
// included `bin`, `build`, `dist`, `target`, `reports`. Those names
|
||||||
|
// have legitimate source-dir uses on target repos:
|
||||||
|
// - lakehouse Rust has `reports/` with markdown documentation
|
||||||
|
// - Java/Maven projects have `target/` build output but Gradle
|
||||||
|
// projects don't — and some projects use `target/` as a source dir
|
||||||
|
// - some Python projects keep scripts in `bin/`
|
||||||
|
//
|
||||||
|
// Skipping them globally produced silent false-negative scans on
|
||||||
|
// real targets. The harness's own self-skip is now path-scoped (see
|
||||||
|
// SkipPathsAbsolute below), not basename-scoped, so reviewing the
|
||||||
|
// harness against itself doesn't recurse into its own bin/reports
|
||||||
|
// while reviewing other repos with those names still works.
|
||||||
|
//
|
||||||
|
// Compiled binaries inside bin/ are filtered by the analyzers'
|
||||||
|
// isTextLike extension check (.exe / .dll / .so / etc.), so the
|
||||||
|
// scan doesn't waste time decoding them.
|
||||||
var SkipDirs = map[string]bool{
|
var SkipDirs = map[string]bool{
|
||||||
".git": true,
|
".git": true,
|
||||||
".hg": true,
|
".hg": true,
|
||||||
".svn": true,
|
".svn": true,
|
||||||
"node_modules": true,
|
"node_modules": true,
|
||||||
"vendor": true,
|
"vendor": true,
|
||||||
"target": true, // Rust
|
|
||||||
"dist": true,
|
|
||||||
"build": true,
|
|
||||||
"__pycache__": true,
|
"__pycache__": true,
|
||||||
".venv": true,
|
".venv": true,
|
||||||
"venv": true,
|
"venv": true,
|
||||||
"bin": true, // golangLAKEHOUSE convention; harness's own too
|
|
||||||
".idea": true,
|
".idea": true,
|
||||||
".vscode": true,
|
".vscode": true,
|
||||||
"reports": true, // harness's own output
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SkipPathsAbsolute is checked when scanning the harness's own repo —
|
||||||
|
// these are absolute paths (resolved from the repoPath at scan time)
|
||||||
|
// that the harness should never recurse into. Today: `bin/` (the
|
||||||
|
// review-harness binary lands here) and `reports/` (per-run output).
|
||||||
|
//
|
||||||
|
// Populated via PathScopedSkipsForSelf when the scanner detects it's
|
||||||
|
// scanning its own tree (heuristic: target path contains both
|
||||||
|
// `cmd/review-harness` AND `internal/analyzers`). Empty otherwise so
|
||||||
|
// other repos see the more permissive default skip-list.
|
||||||
|
var SkipPathsAbsolute = map[string]bool{}
|
||||||
|
|
||||||
// File is one entry in the scan result.
|
// File is one entry in the scan result.
|
||||||
type File struct {
|
type File struct {
|
||||||
Path string // relative to repo root
|
Path string // relative to repo root
|
||||||
@ -72,6 +97,13 @@ func Walk(repoPath string, countLines bool) (*Result, error) {
|
|||||||
LanguageBreakdown: map[string]int{},
|
LanguageBreakdown: map[string]int{},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Path-scoped self-skip: when scanning the harness's own tree,
|
||||||
|
// also skip the dirs that hold its build artifacts + run output.
|
||||||
|
// Path-scoped (not basename-scoped) so other repos that happen
|
||||||
|
// to have `bin/` or `reports/` still get scanned. See SkipDirs
|
||||||
|
// docstring for B5 background.
|
||||||
|
pathSkips := selfSkipsFor(abs)
|
||||||
|
|
||||||
walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, walkErr error) error {
|
walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, walkErr error) error {
|
||||||
if walkErr != nil {
|
if walkErr != nil {
|
||||||
return nil // best-effort; permission errors etc. are silent
|
return nil // best-effort; permission errors etc. are silent
|
||||||
@ -80,6 +112,9 @@ func Walk(repoPath string, countLines bool) (*Result, error) {
|
|||||||
if SkipDirs[d.Name()] && p != abs {
|
if SkipDirs[d.Name()] && p != abs {
|
||||||
return filepath.SkipDir
|
return filepath.SkipDir
|
||||||
}
|
}
|
||||||
|
if pathSkips[p] {
|
||||||
|
return filepath.SkipDir
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// Skip dotfiles at file level (.gitignore etc. are interesting,
|
// Skip dotfiles at file level (.gitignore etc. are interesting,
|
||||||
@ -138,6 +173,29 @@ func Walk(repoPath string, countLines bool) (*Result, error) {
|
|||||||
return res, nil
|
return res, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// selfSkipsFor returns absolute paths to skip when scanning the
|
||||||
|
// harness's own tree (where `bin/` is build output and `reports/`
|
||||||
|
// is per-run runtime data). Empty when scanning anything else, so
|
||||||
|
// target repos with legitimate `bin/` / `reports/` directories
|
||||||
|
// don't get those silently excluded.
|
||||||
|
//
|
||||||
|
// Heuristic: harness self-detected when the target absolute path
|
||||||
|
// contains both `cmd/review-harness` AND `internal/analyzers`
|
||||||
|
// directories — that combination doesn't appear in arbitrary repos.
|
||||||
|
func selfSkipsFor(abs string) map[string]bool {
|
||||||
|
cmdMarker := filepath.Join(abs, "cmd", "review-harness")
|
||||||
|
intMarker := filepath.Join(abs, "internal", "analyzers")
|
||||||
|
st1, err1 := os.Stat(cmdMarker)
|
||||||
|
st2, err2 := os.Stat(intMarker)
|
||||||
|
if err1 != nil || err2 != nil || !st1.IsDir() || !st2.IsDir() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return map[string]bool{
|
||||||
|
filepath.Join(abs, "bin"): true,
|
||||||
|
filepath.Join(abs, "reports"): true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// interestingDotfile lets a few well-known dotfiles through despite
|
// interestingDotfile lets a few well-known dotfiles through despite
|
||||||
// the leading-dot filter. Keeps the scan honest about config files
|
// the leading-dot filter. Keeps the scan honest about config files
|
||||||
// that often hold the real risk (e.g. committed .env).
|
// that often hold the real risk (e.g. committed .env).
|
||||||
|
|||||||
127
internal/scanner/walk_test.go
Normal file
127
internal/scanner/walk_test.go
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
package scanner
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestWalk_DoesNotSkipBinReportsInTargetRepo locks in scrum fix B5.
|
||||||
|
// Pre-fix the SkipDirs map basename-matched bin/reports/build/dist
|
||||||
|
// for ANY repo, silently excluding legitimate source directories on
|
||||||
|
// real targets (lakehouse Rust has reports/ with markdown). Now
|
||||||
|
// only universal noise (.git, node_modules, vendor, .venv, .idea,
|
||||||
|
// etc.) is basename-skipped; harness self-skip is path-scoped.
|
||||||
|
func TestWalk_DoesNotSkipBinReportsInTargetRepo(t *testing.T) {
|
||||||
|
repo := t.TempDir()
|
||||||
|
// Plant files under names that USED to be silently skipped.
|
||||||
|
for _, p := range []string{
|
||||||
|
"bin/script.go",
|
||||||
|
"reports/findings.md",
|
||||||
|
"build/notes.txt",
|
||||||
|
"dist/release.md",
|
||||||
|
"target/output.go",
|
||||||
|
// Universal-noise dirs SHOULD still skip.
|
||||||
|
".git/HEAD",
|
||||||
|
"node_modules/foo/index.js",
|
||||||
|
"vendor/dep/lib.go",
|
||||||
|
} {
|
||||||
|
full := filepath.Join(repo, p)
|
||||||
|
if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(full, []byte("test\n"), 0o644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := Walk(repo, false)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Walk: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
seen := map[string]bool{}
|
||||||
|
for _, f := range res.Files {
|
||||||
|
seen[f.Path] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// These MUST appear (B5 fix — used to be silently skipped).
|
||||||
|
mustHave := []string{
|
||||||
|
"bin/script.go",
|
||||||
|
"reports/findings.md",
|
||||||
|
"build/notes.txt",
|
||||||
|
"dist/release.md",
|
||||||
|
"target/output.go",
|
||||||
|
}
|
||||||
|
for _, p := range mustHave {
|
||||||
|
if !seen[p] {
|
||||||
|
t.Errorf("expected %q in scan; got %v", p, mapKeys(seen))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// These MUST be skipped (universal noise).
|
||||||
|
mustNotHave := []string{
|
||||||
|
".git/HEAD",
|
||||||
|
"node_modules/foo/index.js",
|
||||||
|
"vendor/dep/lib.go",
|
||||||
|
}
|
||||||
|
for _, p := range mustNotHave {
|
||||||
|
if seen[p] {
|
||||||
|
t.Errorf("unexpected %q in scan (universal-noise dir)", p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestWalk_SelfSkipsBinReportsInHarnessRepo locks in the path-scoped
|
||||||
|
// self-skip — when the scanner detects it's being run against the
|
||||||
|
// harness's own tree (cmd/review-harness AND internal/analyzers
|
||||||
|
// present), it skips bin/ + reports/ to avoid recursing into its
|
||||||
|
// own build output and run artifacts.
|
||||||
|
func TestWalk_SelfSkipsBinReportsInHarnessRepo(t *testing.T) {
|
||||||
|
repo := t.TempDir()
|
||||||
|
// Plant the marker dirs that signal "this is the harness repo"
|
||||||
|
for _, p := range []string{
|
||||||
|
"cmd/review-harness/main.go",
|
||||||
|
"internal/analyzers/checks.go",
|
||||||
|
"bin/review-harness", // build output — should skip
|
||||||
|
"reports/latest/x.json", // runtime — should skip
|
||||||
|
"src/real_code.go", // ordinary source — should appear
|
||||||
|
} {
|
||||||
|
full := filepath.Join(repo, p)
|
||||||
|
_ = os.MkdirAll(filepath.Dir(full), 0o755)
|
||||||
|
_ = os.WriteFile(full, []byte("x\n"), 0o644)
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := Walk(repo, false)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Walk: %v", err)
|
||||||
|
}
|
||||||
|
seen := map[string]bool{}
|
||||||
|
for _, f := range res.Files {
|
||||||
|
seen[f.Path] = true
|
||||||
|
}
|
||||||
|
for _, want := range []string{
|
||||||
|
"cmd/review-harness/main.go",
|
||||||
|
"internal/analyzers/checks.go",
|
||||||
|
"src/real_code.go",
|
||||||
|
} {
|
||||||
|
if !seen[want] {
|
||||||
|
t.Errorf("expected %q in self-scan; got %v", want, mapKeys(seen))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Self-skip should fire on bin/ and reports/.
|
||||||
|
if seen["bin/review-harness"] {
|
||||||
|
t.Errorf("bin/ should be self-skipped on harness repo")
|
||||||
|
}
|
||||||
|
if seen["reports/latest/x.json"] {
|
||||||
|
t.Errorf("reports/ should be self-skipped on harness repo")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func mapKeys(m map[string]bool) []string {
|
||||||
|
out := make([]string, 0, len(m))
|
||||||
|
for k := range m {
|
||||||
|
out = append(out, k)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user