Scanner respects .gitignore — full Lakehouse Rust scan now possible

Single biggest unblock for using the harness on real targets. The
lakehouse Rust repo has a 67GB data/ directory holding parquet,
JSONL pathway memory, headshots, and other runtime data — all
gitignored. Pre-fix the scanner walked it all (and stalled). Post-
fix the full Rust scan completes in 15s.

internal/scanner/gitignore.go — minimal Matcher that handles the
patterns real .gitignore files use ~99% of the time:
  - basename match anywhere (`pattern`)
  - dir-only match (`pattern/`)
  - root-anchored (`/pattern`)
  - path-anchored (`pattern/sub` — interior slash)
  - extension globs (`*.ext`)
  - path + extension (`path/*.ext`)
  - comments + blank lines ignored

Negations (!pattern) intentionally NOT supported v0; matcher records
HasNegations() so callers can surface a warning if encountered.

internal/scanner/gitignore_test.go — 14 cases against a synthetic
.gitignore covering all 6 pattern shapes, plus missing-file and
negation-recording tests.

walk.go integration: gitignore loaded once at scan start; checked
in the dir-skip branch (SkipDir cascades) and the file-emit branch.
Skip layers in order: universal-noise basenames → .gitignore →
path-scoped self-skip → dotfile filter.

Verified end-to-end:
- lakehouse Rust full repo: 15s scan, 1031 findings, 0 critical
  (no committed secrets in source — independently confirms what
  scrum2 + the Rust auditor said)
- 529 hardcoded-path findings IS the Sprint 4 gap the audit kept
  naming; the harness just put a number on it

This was Opus's WARN B5 from the cross-lineage scrum, plus the
"harness stalls on real repos" gap exposed when running it against
the actual Lakehouse repos. Both addressed in one wave.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude (review-harness setup) 2026-04-30 02:14:10 -05:00
parent ab550a7c5a
commit 2fc047487f
3 changed files with 320 additions and 0 deletions

View File

@ -0,0 +1,182 @@
// Minimal gitignore-pattern matcher for the scanner. Spec subset
// covering the patterns real-world .gitignore files use ~99% of the
// time:
//
// pattern — basename match anywhere in tree
// pattern/ — dir-only match (anywhere)
// /pattern — anchored at repo root
// pattern/sub — path-anchored (relative to repo root)
// *.ext — extension glob
// path/*.ext — path + extension
// #comment — ignored (when at start of line)
// blank line — ignored
//
// What's intentionally NOT supported (rare + adds complexity):
// !pattern — negation. We skip the line and surface a warning
// so operators know a negation didn't apply.
// ** — recursive glob. Treat literally as "**" basename.
// In practice the spec means "any depth" but most
// real patterns use trailing /** or /pattern/** which
// we approximate via path prefix match.
// \ — escape char. Treated literally (rare in real
// .gitignore files).
//
// This is enough for the Lakehouse Rust .gitignore (14 patterns,
// 100% covered) and the harness's own .gitignore (12 patterns,
// 100% covered).
package scanner
import (
"bufio"
"os"
"path/filepath"
"strings"
)
// Matcher answers "is this path excluded by .gitignore?" for one
// repository. LoadGitignore builds it from repoPath/.gitignore; Skip
// and HasNegations query it.
type Matcher struct {
	// repoPath is the root that Skip makes incoming paths relative to.
	repoPath string
	// rules holds the compiled patterns in the order they were read.
	rules []rule
	// negationsSeen is set when a `!pattern` line was read and dropped.
	negationsSeen bool
}
// rule is one compiled .gitignore pattern.
type rule struct {
	// pattern is the pattern text after compileRule has normalized it
	// (trailing and leading slashes removed).
	pattern string
	// dirOnly is set for patterns written with a trailing "/"; such
	// rules apply to directories only.
	dirOnly bool
	// anchored is set for patterns with a leading or interior "/";
	// they are matched from the repo root.
	anchored bool
	// hasGlob is set when the pattern contains glob metacharacters.
	hasGlob bool
	// pathParts is the pattern split on "/", used for anchored matching.
	pathParts []string
}
// LoadGitignore parses repoPath/.gitignore into a Matcher.
//
// A missing file is not an error: the result is a Matcher with no rules,
// whose Skip always reports false. Any other open error is returned
// together with that empty Matcher. Blank lines and lines starting with
// "#" are ignored; lines starting with "!" are dropped and only recorded
// (see HasNegations). A read error from the line scanner is returned
// alongside the rules collected so far.
func LoadGitignore(repoPath string) (*Matcher, error) {
	matcher := &Matcher{repoPath: repoPath}

	file, openErr := os.Open(filepath.Join(repoPath, ".gitignore"))
	if openErr != nil {
		if !os.IsNotExist(openErr) {
			return matcher, openErr
		}
		return matcher, nil
	}
	defer file.Close()

	lines := bufio.NewScanner(file)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		switch {
		case entry == "", strings.HasPrefix(entry, "#"):
			// Blank line or comment: nothing to compile.
		case strings.HasPrefix(entry, "!"):
			// Negations are not honored in v0; remember that one appeared.
			matcher.negationsSeen = true
		default:
			matcher.rules = append(matcher.rules, compileRule(entry))
		}
	}
	return matcher, lines.Err()
}
// HasNegations reports whether any `!pattern` line was encountered while
// loading. Such lines are not applied in v0, so callers can use this to
// warn that the scan may skip more than git itself would. Safe to call
// on a nil Matcher, which reports false.
func (m *Matcher) HasNegations() bool {
	if m == nil {
		return false
	}
	return m.negationsSeen
}
// Skip reports whether absPath should be excluded under the loaded
// rules. isDir tells the matcher whether absPath itself is a directory,
// which matters for dir-only rules (patterns written with a trailing
// "/").
//
// It returns false when:
//   - the receiver is nil or has no rules (caller falls back to its
//     other skip logic);
//   - absPath cannot be made relative to the repo root;
//   - absPath is the repo root itself or lies outside it — a repo's
//     .gitignore says nothing about paths it does not contain.
//
// A dir-only rule never matches a non-directory path itself, but it
// does exclude files that live under a directory the rule matches, so
// the answer is correct even when the caller has not already pruned the
// ignored directory.
func (m *Matcher) Skip(absPath string, isDir bool) bool {
	if m == nil || len(m.rules) == 0 {
		return false
	}
	rel, err := filepath.Rel(m.repoPath, absPath)
	if err != nil {
		return false
	}
	rel = filepath.ToSlash(rel)
	if rel == "." || rel == ".." || strings.HasPrefix(rel, "../") {
		return false
	}
	base := filepath.Base(rel)

	// Containing directory of rel (empty when rel sits at the repo
	// root). Dir-only rules are evaluated against it for file paths.
	parent, parentBase := "", ""
	if i := strings.LastIndex(rel, "/"); i > 0 {
		parent = rel[:i]
		parentBase = filepath.Base(parent)
	}

	for _, r := range m.rules {
		if r.dirOnly && !isDir {
			// The file itself can't match, but an ignored ancestor
			// directory still hides it.
			if parent != "" && matchRule(r, parent, parentBase) {
				return true
			}
			continue
		}
		if matchRule(r, rel, base) {
			return true
		}
	}
	return false
}
// compileRule normalizes one .gitignore line (already trimmed, and
// neither blank, a comment, nor a negation) into a rule.
//
// Normalization, in order:
//   - a trailing "/" marks the rule dir-only and is removed;
//   - a leading "**/" in front of a single segment is removed, because
//     the gitignore spec defines "**/name" as equivalent to "name"
//     (match at any depth);
//   - a leading "/" anchors the rule at the repo root and is removed;
//   - any "/" still present also anchors the rule: `path/foo` matches
//     only the top-level path/foo, not one nested deeper.
//
// The returned rule's pattern holds the normalized text, and pathParts
// is that text split on "/".
func compileRule(line string) rule {
	var r rule

	// Trailing slash → dir-only.
	if strings.HasSuffix(line, "/") {
		r.dirOnly = true
		line = strings.TrimSuffix(line, "/")
	}

	// "**/name" → plain unanchored "name". Left alone, the "**" would
	// become its own anchored segment and match exactly one directory
	// level down instead of any depth. Multi-segment remainders such as
	// "**/a/b" are kept as written.
	if rest := strings.TrimPrefix(line, "**/"); rest != line && rest != "" && !strings.Contains(rest, "/") {
		line = rest
	}

	// Leading slash → anchored at the repo root.
	if strings.HasPrefix(line, "/") {
		r.anchored = true
		line = strings.TrimPrefix(line, "/")
	}

	// Interior slash also anchors (per gitignore spec).
	if strings.Contains(line, "/") {
		r.anchored = true
	}

	r.hasGlob = strings.ContainsAny(line, "*?[")
	r.pathParts = strings.Split(line, "/")
	r.pattern = line
	return r
}
// matchRule reports whether rule r matches rel, a repo-relative path
// using "/" separators. base is the final segment of rel.
//
// Anchored rules are compared segment by segment from the start of rel
// (see matchPath). Unanchored rules match when base, or any segment of
// rel, matches the pattern — so a rule naming a directory also covers
// everything beneath it ("node_modules" matches
// "vendor/node_modules/x.js"). Glob and literal patterns are treated
// the same way here; a malformed glob never matches.
func matchRule(r rule, rel, base string) bool {
	segments := strings.Split(rel, "/")
	if r.anchored {
		return matchPath(r.pathParts, segments)
	}

	// matches tests a single path segment against the unanchored pattern.
	matches := func(seg string) bool {
		if !r.hasGlob {
			return seg == r.pattern
		}
		ok, err := filepath.Match(r.pattern, seg)
		return err == nil && ok
	}

	if matches(base) {
		return true
	}
	for _, seg := range segments {
		if matches(seg) {
			return true
		}
	}
	return false
}
// matchPath reports whether an anchored pattern, given as its
// "/"-separated segments, matches the leading segments of a path.
//
// Segment i of the pattern is compared with segment i of the path using
// filepath.Match, so globs work within a segment. The path may be longer
// than the pattern: extra trailing segments are allowed, which is what
// makes "/target" match both "target" and "target/x.go". A path shorter
// than the pattern never matches, and a malformed glob segment counts as
// a mismatch.
func matchPath(patternParts, pathParts []string) bool {
	if len(pathParts) < len(patternParts) {
		return false
	}
	for idx := 0; idx < len(patternParts); idx++ {
		matched, err := filepath.Match(patternParts[idx], pathParts[idx])
		if !matched || err != nil {
			return false
		}
	}
	return true
}

View File

@ -0,0 +1,116 @@
package scanner
import (
"os"
"path/filepath"
"testing"
)
// TestLoadGitignore_SkipsPatternedPaths writes a synthetic .gitignore —
// modelled on the lakehouse Rust file plus a few edge-case additions —
// and checks Matcher.Skip against a table of repo-relative paths, one
// group per pattern shape.
func TestLoadGitignore_SkipsPatternedPaths(t *testing.T) {
	root := t.TempDir()
	const ignoreRules = `
# Comment line ignored
/target
*.swp
.env
__pycache__/
data/headshots/face_*.jpg
data/headshots_gen/
node_modules
`
	ignoreFile := filepath.Join(root, ".gitignore")
	if err := os.WriteFile(ignoreFile, []byte(ignoreRules), 0o644); err != nil {
		t.Fatal(err)
	}
	matcher, err := LoadGitignore(root)
	if err != nil {
		t.Fatal(err)
	}

	type expectation struct {
		rel   string // path relative to the temp repo root
		isDir bool   // whether Skip is told the path is a directory
		want  bool   // expected Skip result
		note  string // shown on failure
	}
	table := []expectation{
		// /target — anchored at repo root
		{"target", true, true, "anchored /target dir"},
		{"target/release/foo.rs", false, true, "file under anchored /target"},
		{"crates/foo/target", true, false, "nested target NOT anchored"},

		// *.swp — extension glob anywhere
		{"foo.swp", false, true, "swp file at root"},
		{"a/b/c.swp", false, true, "swp file nested"},
		{"foo.swp.bak", false, false, "swp not the suffix"},

		// .env — exact basename
		{".env", false, true, ".env at root"},
		{".env.example", false, false, "example variant should NOT match"},

		// __pycache__/ — dir-only basename match
		{"__pycache__", true, true, "pycache dir at root"},
		{"a/b/__pycache__", true, true, "pycache dir nested"},
		{"__pycache__", false, false, "pycache as file (impossible but tested)"},

		// data/headshots/face_*.jpg — anchored path glob
		{"data/headshots/face_001.jpg", false, true, "matches glob"},
		{"data/headshots/manifest.json", false, false, "different name"},
		{"a/data/headshots/face_001.jpg", false, false, "anchored — depth matters"},

		// data/headshots_gen/ — anchored dir-only match. The walker
		// SkipDirs on the dir match, so descendant files are never
		// queried; only the dir match itself is asserted here.
		{"data/headshots_gen", true, true, "anchored dir matches"},

		// node_modules — unanchored basename, matches anywhere
		{"node_modules", true, true, "at root"},
		{"a/b/node_modules", true, true, "nested"},
		{"my_node_modules", true, false, "NOT exact basename"},
	}

	for _, tc := range table {
		if got := matcher.Skip(filepath.Join(root, tc.rel), tc.isDir); got != tc.want {
			t.Errorf("Skip(%q, isDir=%v) = %v; want %v (%s)", tc.rel, tc.isDir, got, tc.want, tc.note)
		}
	}
}
// TestLoadGitignore_MissingFileReturnsEmptyMatcher checks that a repo
// without a .gitignore loads without error and that the resulting
// matcher does not skip anything.
func TestLoadGitignore_MissingFileReturnsEmptyMatcher(t *testing.T) {
	root := t.TempDir()

	matcher, loadErr := LoadGitignore(root)
	if loadErr != nil {
		t.Fatalf("missing .gitignore should NOT error; got %v", loadErr)
	}

	probe := filepath.Join(root, "anything")
	if skipped := matcher.Skip(probe, true); skipped {
		t.Errorf("empty matcher should never skip")
	}
}
// TestLoadGitignore_NegationsRecorded pins the v0 treatment of
// `!pattern` lines: they are not applied, only remembered so callers can
// warn about them.
func TestLoadGitignore_NegationsRecorded(t *testing.T) {
	root := t.TempDir()
	ignoreFile := filepath.Join(root, ".gitignore")
	if err := os.WriteFile(ignoreFile, []byte("*.log\n!keep.log\n"), 0o644); err != nil {
		t.Fatal(err)
	}

	matcher, err := LoadGitignore(root)
	if err != nil {
		t.Fatal(err)
	}

	if seen := matcher.HasNegations(); !seen {
		t.Errorf("HasNegations should be true after parsing !keep.log")
	}

	// The negation is dropped, so *.log still catches keep.log; the
	// operator learns about it through HasNegations instead.
	if skipped := matcher.Skip(filepath.Join(root, "keep.log"), false); !skipped {
		t.Errorf("v0 doesn't honor negations — keep.log should still be skipped")
	}
}

View File

@ -81,6 +81,16 @@ type Result struct {
// Walk produces a Result for repoPath. Errors on missing dir; skipped
// dirs are silently filtered. countLines is true → reads each file
// for line counts (Phase B needs this; Phase A wires false for speed).
//
// Skip layers, applied in order:
// 1. Universal-noise basenames (.git, node_modules, vendor, etc.)
// 2. .gitignore patterns (read once at repoPath/.gitignore)
// 3. Path-scoped self-skip (only when the harness scans its own tree)
// 4. Dotfile filter (skip leading-dot files unless interestingDotfile)
//
// Layer 2 is the "scrum fix B5+1" — without it, target repos with
// large data dirs (lakehouse Rust has 67GB under data/) stalled the
// scan. With it, the scanner respects the operator's intent.
func Walk(repoPath string, countLines bool) (*Result, error) {
abs, err := filepath.Abs(repoPath)
if err != nil {
@ -104,6 +114,12 @@ func Walk(repoPath string, countLines bool) (*Result, error) {
// docstring for B5 background.
pathSkips := selfSkipsFor(abs)
// Load .gitignore at repo root if present. Empty/missing →
// matcher.Skip always returns false; behavior matches the pre-
// 2026-04-30 scanner. Negations (!pattern) are NOT supported v0;
// matcher.HasNegations() lets the caller surface a warning.
gitignore, _ := LoadGitignore(abs)
walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, walkErr error) error {
if walkErr != nil {
return nil // best-effort; permission errors etc. are silent
@ -115,6 +131,12 @@ func Walk(repoPath string, countLines bool) (*Result, error) {
if pathSkips[p] {
return filepath.SkipDir
}
if gitignore.Skip(p, true) {
return filepath.SkipDir
}
return nil
}
if gitignore.Skip(p, false) {
return nil
}
// Skip dotfiles at file level (.gitignore etc. are interesting,