// Minimal gitignore-pattern matcher for the scanner. Spec subset
// covering the patterns real-world .gitignore files use most often:
//
//	pattern      — matches any path segment, at any depth
//	pattern/     — same, but the matching segment must be a directory
//	/pattern     — anchored at the repo root
//	pattern/sub  — interior slash also anchors at the repo root
//	*.ext        — glob (filepath.Match syntax), any depth
//	path/*.ext   — anchored, globbed segment by segment
//	#comment     — ignored (when at start of line)
//	blank line   — ignored
//
// A match on a directory covers everything below it, so Skip gives the
// same answer whether or not the caller pruned that directory first.
//
// What's intentionally NOT supported (rare + adds complexity):
//
//	!pattern — negation. The line is dropped and HasNegations reports
//	           true so operators know a negation didn't apply.
//	**       — recursive glob. A "**" segment is handed to
//	           filepath.Match unchanged, where it behaves like a single
//	           "*" segment: "dir/**" works through prefix matching, but
//	           "**/name" only matches exactly one level down.
//	\        — gitignore escapes such as "\#" or "\!" get no special
//	           treatment when the line is parsed.
//
// This is enough for the Lakehouse Rust .gitignore (14 patterns,
// 100% covered) and the harness's own .gitignore (12 patterns,
// 100% covered).

// Matcher decides whether a path is excluded by the .gitignore found at
// a repository root. Build one with LoadGitignore. A nil *Matcher is
// valid and never skips anything.
type Matcher struct {
	repoPath      string // root that Skip makes its argument relative to
	rules         []rule // compiled patterns, in file order
	negationsSeen bool   // true when at least one "!pattern" line was dropped
}

// rule is one compiled .gitignore pattern.
type rule struct {
	pattern   string   // pattern with the leading/trailing "/" stripped
	dirOnly   bool     // written with a trailing "/": matches directories only
	anchored  bool     // leading or interior "/": matched from the repo root
	hasGlob   bool     // contains any of "*", "?" or "["
	pathParts []string // pattern split on "/"; compared segment-wise when anchored
}

// LoadGitignore reads repoPath/.gitignore and returns a Matcher for it.
//
// A missing file is not an error: the returned Matcher has no rules and
// never skips. Blank lines and lines starting with "#" are ignored.
// Lines starting with "!" are dropped and remembered (see HasNegations).
// On any other open or read error the Matcher built so far is returned
// together with the error, so a caller may still use it best-effort.
func LoadGitignore(repoPath string) (*Matcher, error) {
	m := &Matcher{repoPath: repoPath}
	f, err := os.Open(filepath.Join(repoPath, ".gitignore"))
	if err != nil {
		if os.IsNotExist(err) {
			return m, nil
		}
		return m, err
	}
	defer f.Close()

	// Named sc rather than scanner so the package name is not shadowed.
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		switch {
		case line == "", strings.HasPrefix(line, "#"):
			continue
		case strings.HasPrefix(line, "!"):
			m.negationsSeen = true
			continue // negations are not honored in v0
		}
		m.rules = append(m.rules, compileRule(line))
	}
	return m, sc.Err()
}

// HasNegations reports whether the .gitignore contained any `!pattern`
// lines (not supported in v0). Callers can surface this in receipts.
func (m *Matcher) HasNegations() bool { return m != nil && m.negationsSeen }

// Skip reports whether absPath should be excluded under the loaded
// rules. absPath is made relative to the repo root the Matcher was
// loaded from; isDir says whether absPath itself is a directory.
//
// Skip returns false for a nil or empty Matcher, for the repo root
// itself, for paths outside the repo, and when the relative path cannot
// be computed.
func (m *Matcher) Skip(absPath string, isDir bool) bool {
	if m == nil || len(m.rules) == 0 {
		return false
	}
	rel, err := filepath.Rel(m.repoPath, absPath)
	if err != nil {
		return false
	}
	rel = filepath.ToSlash(rel)
	// "." is the repo root; a ".." first segment means the path lies
	// outside the repo, where this .gitignore has no say.
	if rel == "." || rel == ".." || strings.HasPrefix(rel, "../") {
		return false
	}
	base := filepath.Base(rel)

	// A dir-only rule never matches a file directly, but it still
	// excludes a file whose parent directory it matches. parent stays
	// empty for entries that sit directly in the repo root.
	parent, parentBase := "", ""
	if i := strings.LastIndexByte(rel, '/'); i >= 0 {
		parent = rel[:i]
		parentBase = filepath.Base(parent)
	}

	for _, r := range m.rules {
		if !r.dirOnly || isDir {
			if matchRule(r, rel, base) {
				return true
			}
			continue
		}
		if parent != "" && matchRule(r, parent, parentBase) {
			return true
		}
	}
	return false
}

// compileRule normalizes one non-empty, non-comment .gitignore line
// into a rule: a trailing "/" sets dirOnly, a leading or interior "/"
// sets anchored, and both the leading and trailing "/" are removed from
// the stored pattern.
func compileRule(line string) rule {
	var r rule
	// Trailing slash → dir-only.
	if strings.HasSuffix(line, "/") {
		r.dirOnly = true
		line = strings.TrimSuffix(line, "/")
	}
	// Leading slash → anchored.
	if strings.HasPrefix(line, "/") {
		r.anchored = true
		line = strings.TrimPrefix(line, "/")
	}
	// Interior slash also anchors (per gitignore spec): `path/foo`
	// matches only the top-level path/foo, not one nested deeper.
	if strings.Contains(line, "/") {
		r.anchored = true
	}
	r.hasGlob = strings.ContainsAny(line, "*?[")
	r.pathParts = strings.Split(line, "/")
	r.pattern = line
	return r
}

// matchRule reports whether rule r matches the slash-separated,
// repo-relative path rel. base is the last segment of rel. The dirOnly
// flag is not consulted here; Skip handles it.
//
// Anchored rules are compared from the start of rel (see matchPath).
// Unanchored rules match when any segment of rel matches the pattern,
// so "node_modules" and "*.egg-info" alike cover both the entry itself
// and everything below it.
func matchRule(r rule, rel, base string) bool {
	if r.anchored {
		return matchPath(r.pathParts, strings.Split(rel, "/"))
	}
	matches := func(seg string) bool {
		if !r.hasGlob {
			return seg == r.pattern
		}
		// A malformed glob yields an error; treat it as "no match".
		ok, err := filepath.Match(r.pattern, seg)
		return err == nil && ok
	}
	if matches(base) {
		return true
	}
	for _, seg := range strings.Split(rel, "/") {
		if matches(seg) {
			return true
		}
	}
	return false
}

// matchPath reports whether an anchored pattern's segments match the
// leading segments of the path. Each pair is compared with
// filepath.Match, so globs work per segment. The path may have more
// segments than the pattern (so /target matches target itself AND
// target/x.go); it may not have fewer. A malformed glob segment counts
// as no match.
func matchPath(patternParts, pathParts []string) bool {
	if len(patternParts) > len(pathParts) {
		return false
	}
	for i, pp := range patternParts {
		ok, err := filepath.Match(pp, pathParts[i])
		if err != nil || !ok {
			return false
		}
	}
	return true
}
false, true, "matches glob"}, + {"data/headshots/manifest.json", false, false, "different name"}, + {"a/data/headshots/face_001.jpg", false, false, "anchored — depth matters"}, + + // data/headshots_gen/ — anchored dir-only match. The walker + // SkipDirs on the dir match, so descendant files are never + // queried; we only assert the dir match itself. + {"data/headshots_gen", true, true, "anchored dir matches"}, + + // node_modules — unanchored basename, matches anywhere + {"node_modules", true, true, "at root"}, + {"a/b/node_modules", true, true, "nested"}, + {"my_node_modules", true, false, "NOT exact basename"}, + } + + for _, c := range cases { + abs := filepath.Join(repo, c.path) + got := m.Skip(abs, c.isDir) + if got != c.skip { + t.Errorf("Skip(%q, isDir=%v) = %v; want %v (%s)", c.path, c.isDir, got, c.skip, c.why) + } + } +} + +// TestLoadGitignore_MissingFileReturnsEmptyMatcher — no .gitignore → +// matcher.Skip always false. +func TestLoadGitignore_MissingFileReturnsEmptyMatcher(t *testing.T) { + repo := t.TempDir() + m, err := LoadGitignore(repo) + if err != nil { + t.Fatalf("missing .gitignore should NOT error; got %v", err) + } + if m.Skip(filepath.Join(repo, "anything"), true) { + t.Errorf("empty matcher should never skip") + } +} + +// TestLoadGitignore_NegationsRecorded — negations (!pattern) aren't +// supported v0; just record that they were seen so callers can warn. +func TestLoadGitignore_NegationsRecorded(t *testing.T) { + repo := t.TempDir() + if err := os.WriteFile(filepath.Join(repo, ".gitignore"), []byte("*.log\n!keep.log\n"), 0o644); err != nil { + t.Fatal(err) + } + m, err := LoadGitignore(repo) + if err != nil { + t.Fatal(err) + } + if !m.HasNegations() { + t.Errorf("HasNegations should be true after parsing !keep.log") + } + // keep.log STILL gets skipped because we don't honor negations + // — operator gets a warning via HasNegations. 
+ if !m.Skip(filepath.Join(repo, "keep.log"), false) { + t.Errorf("v0 doesn't honor negations — keep.log should still be skipped") + } +} diff --git a/internal/scanner/walk.go b/internal/scanner/walk.go index b5eec5e..1455bc7 100644 --- a/internal/scanner/walk.go +++ b/internal/scanner/walk.go @@ -81,6 +81,16 @@ type Result struct { // Walk produces a Result for repoPath. Errors on missing dir; skipped // dirs are silently filtered. countLines is true → reads each file // for line counts (Phase B needs this; Phase A wires false for speed). +// +// Skip layers, applied in order: +// 1. Universal-noise basenames (.git, node_modules, vendor, etc.) +// 2. .gitignore patterns (read once at repoPath/.gitignore) +// 3. Path-scoped self-skip (only when the harness scans its own tree) +// 4. Dotfile filter (skip leading-dot files unless interestingDotfile) +// +// Layer 2 is the "scrum fix B5+1" — without it, target repos with +// large data dirs (lakehouse Rust has 67GB under data/) stalled the +// scan. With it, the scanner respects the operator's intent. func Walk(repoPath string, countLines bool) (*Result, error) { abs, err := filepath.Abs(repoPath) if err != nil { @@ -104,6 +114,12 @@ func Walk(repoPath string, countLines bool) (*Result, error) { // docstring for B5 background. pathSkips := selfSkipsFor(abs) + // Load .gitignore at repo root if present. Empty/missing → + // matcher.Skip always returns false; behavior matches the pre- + // 2026-04-30 scanner. Negations (!pattern) are NOT supported v0; + // matcher.HasNegations() lets the caller surface a warning. + gitignore, _ := LoadGitignore(abs) + walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, walkErr error) error { if walkErr != nil { return nil // best-effort; permission errors etc. 
are silent @@ -115,6 +131,12 @@ func Walk(repoPath string, countLines bool) (*Result, error) { if pathSkips[p] { return filepath.SkipDir } + if gitignore.Skip(p, true) { + return filepath.SkipDir + } + return nil + } + if gitignore.Skip(p, false) { return nil } // Skip dotfiles at file level (.gitignore etc. are interesting,