Claude (review-harness setup) 2fc047487f Scanner respects .gitignore — full Lakehouse Rust scan now possible
Single biggest unblock for using the harness on real targets. The
lakehouse Rust repo has a 67GB data/ directory holding parquet,
JSONL pathway memory, headshots, and other runtime data — all
gitignored. Pre-fix the scanner walked it all (and stalled). Post-
fix the full Rust scan completes in 15s.

internal/scanner/gitignore.go — minimal Matcher that handles the
patterns real .gitignore files use ~99% of the time:
  - basename match anywhere (`pattern`)
  - dir-only match (`pattern/`)
  - root-anchored (`/pattern`)
  - path-anchored (`pattern/sub` — interior slash)
  - extension globs (`*.ext`)
  - path + extension (`path/*.ext`)
  - comments + blank lines ignored

Negations (!pattern) are intentionally NOT supported in v0; the matcher
records HasNegations() so callers can surface a warning if one is encountered.

internal/scanner/gitignore_test.go — 18 table-driven cases against a
synthetic .gitignore covering all 6 pattern shapes, plus missing-file and
negation-recording tests.

walk.go integration: gitignore loaded once at scan start; checked
in the dir-skip branch (SkipDir cascades) and the file-emit branch.
Skip layers in order: universal-noise basenames → .gitignore →
path-scoped self-skip → dotfile filter.

Verified end-to-end:
- lakehouse Rust full repo: 15s scan, 1031 findings, 0 critical
  (no committed secrets in source — independently confirms what
  scrum2 + the Rust auditor said)
- the 529 hardcoded-path findings ARE the Sprint 4 gap the audit kept
  naming; the harness just put a number on it

This was Opus's WARN B5 from the cross-lineage scrum, plus the
"harness stalls on real repos" gap exposed when running it against
the actual Lakehouse repos. Both addressed in one wave.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 02:14:10 -05:00

117 lines
3.6 KiB
Go

package scanner
import (
"os"
"path/filepath"
"testing"
)
// TestLoadGitignore_SkipsPatternedPaths writes a synthetic .gitignore
// into a temp repo, loads it, and checks Skip against a table of paths.
// The rules mirror the shapes found in typical .gitignore files: a
// root-anchored entry, an extension glob, an exact basename, a dir-only
// entry, an interior-slash path glob, an anchored dir, and a bare
// basename.
func TestLoadGitignore_SkipsPatternedPaths(t *testing.T) {
	// The raw string is the .gitignore body verbatim; keep it flush-left
	// so no rule picks up leading whitespace.
	const rules = `
# Comment line — ignored
/target
*.swp
.env
__pycache__/
data/headshots/face_*.jpg
data/headshots_gen/
node_modules
`
	root := t.TempDir()
	if err := os.WriteFile(filepath.Join(root, ".gitignore"), []byte(rules), 0o644); err != nil {
		t.Fatal(err)
	}
	matcher, err := LoadGitignore(root)
	if err != nil {
		t.Fatal(err)
	}

	// probe is one table row: a repo-relative path, whether it is
	// presented to Skip as a directory, the expected verdict, and a
	// short rationale echoed in the failure message.
	type probe struct {
		path  string
		isDir bool
		skip  bool
		why   string
	}
	table := []probe{
		// /target — anchored at repo root
		{path: "target", isDir: true, skip: true, why: "anchored /target dir"},
		{path: "target/release/foo.rs", isDir: false, skip: true, why: "file under anchored /target"},
		{path: "crates/foo/target", isDir: true, skip: false, why: "nested target NOT anchored"},

		// *.swp — extension glob anywhere
		{path: "foo.swp", isDir: false, skip: true, why: "swp file at root"},
		{path: "a/b/c.swp", isDir: false, skip: true, why: "swp file nested"},
		{path: "foo.swp.bak", isDir: false, skip: false, why: "swp not the suffix"},

		// .env — exact basename
		{path: ".env", isDir: false, skip: true, why: ".env at root"},
		{path: ".env.example", isDir: false, skip: false, why: "example variant should NOT match"},

		// __pycache__/ — dir-only basename match
		{path: "__pycache__", isDir: true, skip: true, why: "pycache dir at root"},
		{path: "a/b/__pycache__", isDir: true, skip: true, why: "pycache dir nested"},
		{path: "__pycache__", isDir: false, skip: false, why: "pycache as file (impossible but tested)"},

		// data/headshots/face_*.jpg — anchored path glob
		{path: "data/headshots/face_001.jpg", isDir: false, skip: true, why: "matches glob"},
		{path: "data/headshots/manifest.json", isDir: false, skip: false, why: "different name"},
		{path: "a/data/headshots/face_001.jpg", isDir: false, skip: false, why: "anchored — depth matters"},

		// data/headshots_gen/ — anchored dir-only match. Only the
		// directory itself is asserted here: the walker SkipDirs on
		// the dir match, so descendant files are never queried.
		{path: "data/headshots_gen", isDir: true, skip: true, why: "anchored dir matches"},

		// node_modules — unanchored basename, matches anywhere
		{path: "node_modules", isDir: true, skip: true, why: "at root"},
		{path: "a/b/node_modules", isDir: true, skip: true, why: "nested"},
		{path: "my_node_modules", isDir: true, skip: false, why: "NOT exact basename"},
	}

	for i := range table {
		tc := &table[i]
		// Skip is queried with the absolute path under the temp repo.
		got := matcher.Skip(filepath.Join(root, tc.path), tc.isDir)
		if got == tc.skip {
			continue
		}
		t.Errorf("Skip(%q, isDir=%v) = %v; want %v (%s)", tc.path, tc.isDir, got, tc.skip, tc.why)
	}
}
// TestLoadGitignore_MissingFileReturnsEmptyMatcher checks that loading
// from a repo with no .gitignore is not an error, and that the
// resulting matcher does not skip a queried path.
func TestLoadGitignore_MissingFileReturnsEmptyMatcher(t *testing.T) {
	// Fresh temp dir: no .gitignore is ever written into it.
	root := t.TempDir()

	matcher, err := LoadGitignore(root)
	if err != nil {
		t.Fatalf("missing .gitignore should NOT error; got %v", err)
	}

	probe := filepath.Join(root, "anything")
	if skipped := matcher.Skip(probe, true); skipped {
		t.Errorf("empty matcher should never skip")
	}
}
// TestLoadGitignore_NegationsRecorded pins the v0 contract for
// negation lines (!pattern): they are not honored, but the matcher
// remembers having seen one so callers can warn.
func TestLoadGitignore_NegationsRecorded(t *testing.T) {
	root := t.TempDir()

	ignoreFile := filepath.Join(root, ".gitignore")
	if err := os.WriteFile(ignoreFile, []byte("*.log\n!keep.log\n"), 0o644); err != nil {
		t.Fatal(err)
	}

	matcher, err := LoadGitignore(root)
	if err != nil {
		t.Fatal(err)
	}

	if sawNegation := matcher.HasNegations(); !sawNegation {
		t.Errorf("HasNegations should be true after parsing !keep.log")
	}

	// The *.log rule still applies to keep.log because the negation is
	// ignored; the operator learns about it through HasNegations.
	keepLog := filepath.Join(root, "keep.log")
	if skipped := matcher.Skip(keepLog, false); !skipped {
		t.Errorf("v0 doesn't honor negations — keep.log should still be skipped")
	}
}