Single biggest unblock for using the harness on real targets. The lakehouse Rust repo has a 67GB data/ directory holding parquet, JSONL pathway memory, headshots, and other runtime data — all gitignored. Pre-fix the scanner walked it all (and stalled). Post-fix the full Rust scan completes in 15s. internal/scanner/gitignore.go — minimal Matcher that handles the patterns real .gitignore files use ~99% of the time: - basename match anywhere (`pattern`) - dir-only match (`pattern/`) - root-anchored (`/pattern`) - path-anchored (`pattern/sub` — interior slash) - extension globs (`*.ext`) - path + extension (`path/*.ext`) - comments + blank lines ignored Negations (!pattern) intentionally NOT supported in v0; matcher records HasNegations() so callers can surface a warning if encountered. internal/scanner/gitignore_test.go — 14 cases against a synthetic .gitignore covering all 6 pattern shapes, plus missing-file and negation-recording tests. walk.go integration: gitignore loaded once at scan start; checked in the dir-skip branch (SkipDir cascades) and the file-emit branch. Skip layers in order: universal-noise basenames → .gitignore → path-scoped self-skip → dotfile filter. Verified end-to-end: - lakehouse Rust full repo: 15s scan, 1031 findings, 0 critical (no committed secrets in source — independently confirms what scrum2 + the Rust auditor said) - 529 hardcoded-path findings IS the Sprint 4 gap the audit kept naming; the harness just put a number on it This was Opus's WARN B5 from the cross-lineage scrum, plus the "harness stalls on real repos" gap exposed when running it against the actual Lakehouse repos. Both addressed in one wave. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
183 lines
5.4 KiB
Go
183 lines
5.4 KiB
Go
// Minimal gitignore-pattern matcher for the scanner. Spec subset
|
|
// covering the patterns real-world .gitignore files use ~99% of the
|
|
// time:
|
|
//
|
|
// pattern — basename match anywhere in tree
|
|
// pattern/ — dir-only match (anywhere)
|
|
// /pattern — anchored at repo root
|
|
// pattern/sub — path-anchored (relative to repo root)
|
|
// *.ext — extension glob
|
|
// path/*.ext — path + extension
|
|
// #comment — ignored (when at start of line)
|
|
// blank line — ignored
|
|
//
|
|
// What's intentionally NOT supported (rare + adds complexity):
|
|
// !pattern — negation. We skip the line and surface a warning
|
|
// so operators know a negation didn't apply.
|
|
// ** — recursive glob. NOT given its "any depth" meaning:
// filepath.Match treats "**" the same as a single "*"
// (one path segment, never crossing a "/"). The spec
// means "any depth", but most real patterns use trailing
// /** or /pattern/**, which we approximate via path
// prefix match.
|
|
// \ — escape char. Treated literally (rare in real
|
|
// .gitignore files).
|
|
//
|
|
// This is enough for the Lakehouse Rust .gitignore (14 patterns,
|
|
// 100% covered) and the harness's own .gitignore (12 patterns,
|
|
// 100% covered).
|
|
package scanner
|
|
|
|
import (
|
|
"bufio"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// Matcher tests whether a path should be skipped per the loaded
|
|
// .gitignore. Built once per Walk; reads the file at repoPath/.gitignore.
|
|
type Matcher struct {
|
|
repoPath string
|
|
rules []rule
|
|
negationsSeen bool
|
|
}
|
|
|
|
// rule is the compiled form of a single .gitignore line, as produced
// by compileRule.
type rule struct {
	// pattern is the normalized pattern text: one trailing "/" and one
	// leading "/" are removed from the line as written.
	pattern string

	// dirOnly is true when the line ended in "/", i.e. the rule is
	// meant for directories only.
	dirOnly bool

	// anchored is true when the line started with "/" or contains an
	// interior "/"; such rules are matched from the repo root.
	anchored bool

	// hasGlob is true when the pattern contains any of '*', '?' or '['.
	hasGlob bool

	// pathParts is pattern split on "/"; used for anchored
	// multi-segment matching.
	pathParts []string
}
|
|
|
|
// LoadGitignore reads .gitignore at the repo root + returns a Matcher.
|
|
// Missing file → empty matcher (matches nothing, behaves like the
|
|
// pre-2026-04-30 scanner).
|
|
func LoadGitignore(repoPath string) (*Matcher, error) {
|
|
m := &Matcher{repoPath: repoPath}
|
|
f, err := os.Open(filepath.Join(repoPath, ".gitignore"))
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return m, nil
|
|
}
|
|
return m, err
|
|
}
|
|
defer f.Close()
|
|
scanner := bufio.NewScanner(f)
|
|
for scanner.Scan() {
|
|
line := strings.TrimSpace(scanner.Text())
|
|
if line == "" || strings.HasPrefix(line, "#") {
|
|
continue
|
|
}
|
|
if strings.HasPrefix(line, "!") {
|
|
m.negationsSeen = true
|
|
continue // skip negations for v0
|
|
}
|
|
m.rules = append(m.rules, compileRule(line))
|
|
}
|
|
return m, scanner.Err()
|
|
}
|
|
|
|
// HasNegations reports whether the .gitignore contained any `!pattern`
|
|
// lines (not supported in v0). Callers can surface this in receipts.
|
|
func (m *Matcher) HasNegations() bool { return m != nil && m.negationsSeen }
|
|
|
|
// Skip reports whether path (absolute) should be excluded under the
|
|
// loaded rules. isDir affects dir-only rules. Empty matcher → false
|
|
// (caller falls back to other skip logic).
|
|
func (m *Matcher) Skip(absPath string, isDir bool) bool {
|
|
if m == nil || len(m.rules) == 0 {
|
|
return false
|
|
}
|
|
rel, err := filepath.Rel(m.repoPath, absPath)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
rel = filepath.ToSlash(rel)
|
|
if rel == "." {
|
|
return false
|
|
}
|
|
base := filepath.Base(rel)
|
|
|
|
for _, r := range m.rules {
|
|
if r.dirOnly && !isDir {
|
|
continue
|
|
}
|
|
if matchRule(r, rel, base) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// compileRule normalizes one .gitignore line into a Rule.
|
|
func compileRule(line string) rule {
|
|
r := rule{pattern: line}
|
|
// Trailing slash → dir-only
|
|
if strings.HasSuffix(line, "/") {
|
|
r.dirOnly = true
|
|
line = strings.TrimSuffix(line, "/")
|
|
}
|
|
// Leading slash → anchored
|
|
if strings.HasPrefix(line, "/") {
|
|
r.anchored = true
|
|
line = strings.TrimPrefix(line, "/")
|
|
}
|
|
// Interior slash also anchors (per gitignore spec):
|
|
// `path/foo` matches only the top-level path/foo, not path/foo
|
|
// nested deeper.
|
|
if strings.Contains(line, "/") {
|
|
r.anchored = true
|
|
}
|
|
r.hasGlob = strings.ContainsAny(line, "*?[")
|
|
r.pathParts = strings.Split(line, "/")
|
|
r.pattern = line
|
|
return r
|
|
}
|
|
|
|
func matchRule(r rule, rel, base string) bool {
|
|
if r.anchored {
|
|
// Match the whole rel path against the rule's path
|
|
return matchPath(r.pathParts, strings.Split(rel, "/"))
|
|
}
|
|
// Unanchored — match basename, but ALSO allow subpath match for
|
|
// patterns like "node_modules" that should match
|
|
// "vendor/node_modules" too.
|
|
if r.hasGlob {
|
|
ok, _ := filepath.Match(r.pattern, base)
|
|
if ok {
|
|
return true
|
|
}
|
|
} else {
|
|
if r.pattern == base {
|
|
return true
|
|
}
|
|
// Walk path segments — any segment matching the pattern triggers.
|
|
for _, seg := range strings.Split(rel, "/") {
|
|
if seg == r.pattern {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// matchPath reports whether an anchored pattern, given as its
// "/"-separated segments, matches the leading segments of a path.
//
// Each pattern segment is compared with the path segment at the same
// position using filepath.Match, so globs work within a segment. The
// path may be longer than the pattern: extra trailing segments are
// allowed, which is why /target matches both target itself and
// target/x.go. A path shorter than the pattern never matches, and a
// malformed pattern segment counts as a mismatch.
func matchPath(patternParts, pathParts []string) bool {
	want := len(patternParts)
	if len(pathParts) < want {
		return false
	}
	for i, seg := range pathParts[:want] {
		if ok, err := filepath.Match(patternParts[i], seg); err != nil || !ok {
			return false
		}
	}
	return true
}
|