Single biggest unblock for using the harness on real targets. The lakehouse Rust repo has a 67GB data/ directory holding parquet, JSONL pathway memory, headshots, and other runtime data — all gitignored. Pre-fix, the scanner walked it all (and stalled). Post-fix, the full Rust scan completes in 15s. internal/scanner/gitignore.go — minimal Matcher that handles the patterns real .gitignore files use ~99% of the time: - basename match anywhere (`pattern`) - dir-only match (`pattern/`) - root-anchored (`/pattern`) - path-anchored (`pattern/sub` — interior slash) - extension globs (`*.ext`) - path + extension (`path/*.ext`) - comments + blank lines ignored. Negations (!pattern) intentionally NOT supported in v0; matcher records HasNegations() so callers can surface a warning if encountered. internal/scanner/gitignore_test.go — 14 cases against a synthetic .gitignore covering all 6 pattern shapes, plus missing-file and negation-recording tests. walk.go integration: gitignore loaded once at scan start; checked in the dir-skip branch (SkipDir cascades) and the file-emit branch. Skip layers in order: universal-noise basenames → .gitignore → path-scoped self-skip → dotfile filter. Verified end-to-end: - lakehouse Rust full repo: 15s scan, 1031 findings, 0 critical (no committed secrets in source — independently confirms what scrum2 + the Rust auditor said) - 529 hardcoded-path findings IS the Sprint 4 gap the audit kept naming; the harness just put a number on it. This was Opus's WARN B5 from the cross-lineage scrum, plus the "harness stalls on real repos" gap exposed when running it against the actual Lakehouse repos. Both addressed in one wave. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
117 lines
3.6 KiB
Go
117 lines
3.6 KiB
Go
package scanner
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
// TestLoadGitignore_SkipsPatternedPaths covers the patterns most
|
|
// real-world .gitignore files use — the lakehouse Rust 14-line file
|
|
// + a few additions that exercise edge cases.
|
|
func TestLoadGitignore_SkipsPatternedPaths(t *testing.T) {
|
|
repo := t.TempDir()
|
|
|
|
gitignoreContent := `
|
|
# Comment line — ignored
|
|
/target
|
|
*.swp
|
|
.env
|
|
__pycache__/
|
|
data/headshots/face_*.jpg
|
|
data/headshots_gen/
|
|
node_modules
|
|
`
|
|
if err := os.WriteFile(filepath.Join(repo, ".gitignore"), []byte(gitignoreContent), 0o644); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
m, err := LoadGitignore(repo)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
cases := []struct {
|
|
path string
|
|
isDir bool
|
|
skip bool
|
|
why string
|
|
}{
|
|
// /target — anchored at repo root
|
|
{"target", true, true, "anchored /target dir"},
|
|
{"target/release/foo.rs", false, true, "file under anchored /target"},
|
|
{"crates/foo/target", true, false, "nested target NOT anchored"},
|
|
|
|
// *.swp — extension glob anywhere
|
|
{"foo.swp", false, true, "swp file at root"},
|
|
{"a/b/c.swp", false, true, "swp file nested"},
|
|
{"foo.swp.bak", false, false, "swp not the suffix"},
|
|
|
|
// .env — exact basename
|
|
{".env", false, true, ".env at root"},
|
|
{".env.example", false, false, "example variant should NOT match"},
|
|
|
|
// __pycache__/ — dir-only basename match
|
|
{"__pycache__", true, true, "pycache dir at root"},
|
|
{"a/b/__pycache__", true, true, "pycache dir nested"},
|
|
{"__pycache__", false, false, "pycache as file (impossible but tested)"},
|
|
|
|
// data/headshots/face_*.jpg — anchored path glob
|
|
{"data/headshots/face_001.jpg", false, true, "matches glob"},
|
|
{"data/headshots/manifest.json", false, false, "different name"},
|
|
{"a/data/headshots/face_001.jpg", false, false, "anchored — depth matters"},
|
|
|
|
// data/headshots_gen/ — anchored dir-only match. The walker
|
|
// SkipDirs on the dir match, so descendant files are never
|
|
// queried; we only assert the dir match itself.
|
|
{"data/headshots_gen", true, true, "anchored dir matches"},
|
|
|
|
// node_modules — unanchored basename, matches anywhere
|
|
{"node_modules", true, true, "at root"},
|
|
{"a/b/node_modules", true, true, "nested"},
|
|
{"my_node_modules", true, false, "NOT exact basename"},
|
|
}
|
|
|
|
for _, c := range cases {
|
|
abs := filepath.Join(repo, c.path)
|
|
got := m.Skip(abs, c.isDir)
|
|
if got != c.skip {
|
|
t.Errorf("Skip(%q, isDir=%v) = %v; want %v (%s)", c.path, c.isDir, got, c.skip, c.why)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestLoadGitignore_MissingFileReturnsEmptyMatcher — no .gitignore →
|
|
// matcher.Skip always false.
|
|
func TestLoadGitignore_MissingFileReturnsEmptyMatcher(t *testing.T) {
|
|
repo := t.TempDir()
|
|
m, err := LoadGitignore(repo)
|
|
if err != nil {
|
|
t.Fatalf("missing .gitignore should NOT error; got %v", err)
|
|
}
|
|
if m.Skip(filepath.Join(repo, "anything"), true) {
|
|
t.Errorf("empty matcher should never skip")
|
|
}
|
|
}
|
|
|
|
// TestLoadGitignore_NegationsRecorded — negations (!pattern) aren't
|
|
// supported v0; just record that they were seen so callers can warn.
|
|
func TestLoadGitignore_NegationsRecorded(t *testing.T) {
|
|
repo := t.TempDir()
|
|
if err := os.WriteFile(filepath.Join(repo, ".gitignore"), []byte("*.log\n!keep.log\n"), 0o644); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
m, err := LoadGitignore(repo)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if !m.HasNegations() {
|
|
t.Errorf("HasNegations should be true after parsing !keep.log")
|
|
}
|
|
// keep.log STILL gets skipped because we don't honor negations
|
|
// — operator gets a warning via HasNegations.
|
|
if !m.Skip(filepath.Join(repo, "keep.log"), false) {
|
|
t.Errorf("v0 doesn't honor negations — keep.log should still be skipped")
|
|
}
|
|
}
|