Claude (review-harness setup) 2fc047487f Scanner respects .gitignore — full Lakehouse Rust scan now possible
Single biggest unblock for using the harness on real targets. The
lakehouse Rust repo has a 67GB data/ directory holding parquet,
JSONL pathway memory, headshots, and other runtime data — all
gitignored. Pre-fix the scanner walked it all (and stalled). Post-
fix the full Rust scan completes in 15s.

internal/scanner/gitignore.go — minimal Matcher that handles the
patterns real .gitignore files use ~99% of the time:
  - basename match anywhere (`pattern`)
  - dir-only match (`pattern/`)
  - root-anchored (`/pattern`)
  - path-anchored (`pattern/sub` — interior slash)
  - extension globs (`*.ext`)
  - path + extension (`path/*.ext`)
  - comments + blank lines ignored

Negations (!pattern) are intentionally NOT supported in v0; the matcher
exposes HasNegations() so callers can surface a warning when one is encountered.

internal/scanner/gitignore_test.go — 14 cases against a synthetic
.gitignore covering all 6 pattern shapes, plus missing-file and
negation-recording tests.

walk.go integration: gitignore loaded once at scan start; checked
in the dir-skip branch (SkipDir cascades) and the file-emit branch.
Skip layers in order: universal-noise basenames → .gitignore →
path-scoped self-skip → dotfile filter.

Verified end-to-end:
- lakehouse Rust full repo: 15s scan, 1031 findings, 0 critical
  (no committed secrets in source — independently confirms what
  scrum2 + the Rust auditor said)
- 529 hardcoded-path findings IS the Sprint 4 gap the audit kept
  naming; the harness just put a number on it

This was Opus's WARN B5 from the cross-lineage scrum, plus the
"harness stalls on real repos" gap exposed when running it against
the actual Lakehouse repos. Both addressed in one wave.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 02:14:10 -05:00

183 lines
5.4 KiB
Go

// Minimal gitignore-pattern matcher for the scanner. Spec subset
// covering the patterns real-world .gitignore files use ~99% of the
// time:
//
// pattern — basename match anywhere in tree
// pattern/ — dir-only match (anywhere)
// /pattern — anchored at repo root
// pattern/sub — path-anchored (relative to repo root)
// *.ext — extension glob
// path/*.ext — path + extension
// #comment — ignored (when at start of line)
// blank line — ignored
//
// What's intentionally NOT supported (rare + adds complexity):
// !pattern — negation. We skip the line and surface a warning
// so operators know a negation didn't apply.
// ** — recursive glob. Not implemented as "any depth": a "**"
// segment that survives compileRule is handed to
// filepath.Match, where it behaves like "*" (exactly one
// path segment). Trailing /** still works in practice
// because anchored patterns match by path prefix.
// \ — escape char. Treated literally (rare in real
// .gitignore files).
//
// This is enough for the Lakehouse Rust .gitignore (14 patterns,
// 100% covered) and the harness's own .gitignore (12 patterns,
// 100% covered).
package scanner
import (
"bufio"
"os"
"path/filepath"
"strings"
)
// Matcher tests whether a path should be skipped per the loaded
// .gitignore. Built once per Walk; reads the file at repoPath/.gitignore.
// A nil Matcher, or one with no rules, skips nothing (see Skip).
type Matcher struct {
// repoPath is the root passed to LoadGitignore; Skip makes paths relative to it.
repoPath string
// rules holds one compiled entry per accepted pattern line, in file order.
rules []rule
// negationsSeen is set when LoadGitignore dropped at least one "!pattern" line.
negationsSeen bool
}
// rule is one compiled .gitignore pattern line, as produced by compileRule.
type rule struct {
pattern string // normalized pattern text: compileRule removes the trailing "/" and leading "/" markers
dirOnly bool // matches only directories (trailing /)
anchored bool // anchored at repo root (leading / or has interior /)
hasGlob bool // contains any of the glob metacharacters * ? [
pathParts []string // pattern split on "/"; set for every rule, read by matchRule only for anchored ones
}
// LoadGitignore builds a Matcher from the .gitignore file directly under
// repoPath.
//
// Each line is whitespace-trimmed. Blank lines and lines starting with "#"
// are dropped; lines starting with "!" are dropped too, but remembered so
// HasNegations can report them. Every other line is compiled into a rule.
//
// A missing .gitignore is not an error: the returned Matcher simply has no
// rules. Any other open failure, or a read failure reported by the line
// scanner, is returned together with the Matcher built so far.
func LoadGitignore(repoPath string) (*Matcher, error) {
	matcher := &Matcher{repoPath: repoPath}

	ignoreFile := filepath.Join(repoPath, ".gitignore")
	file, openErr := os.Open(ignoreFile)
	if os.IsNotExist(openErr) {
		return matcher, nil
	}
	if openErr != nil {
		return matcher, openErr
	}
	defer file.Close()

	lines := bufio.NewScanner(file)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		switch {
		case entry == "", strings.HasPrefix(entry, "#"):
			// Blank line or comment: contributes nothing.
		case strings.HasPrefix(entry, "!"):
			// Negation: unsupported in v0, only note that one was present.
			matcher.negationsSeen = true
		default:
			matcher.rules = append(matcher.rules, compileRule(entry))
		}
	}
	return matcher, lines.Err()
}
// HasNegations tells the caller whether the loaded .gitignore contained at
// least one `!pattern` line. Those lines are not applied in v0, so callers
// can use this to surface a warning in receipts. A nil Matcher reports false.
func (m *Matcher) HasNegations() bool {
	if m == nil {
		return false
	}
	return m.negationsSeen
}
// Skip reports whether absPath should be excluded under the loaded rules.
//
// absPath is made relative to the Matcher's repoPath and converted to
// forward slashes before matching. isDir says whether absPath is a
// directory; it decides how dir-only rules (trailing "/") are applied:
//
//   - for a directory, a dir-only rule is matched against the path itself;
//   - for a file, a dir-only rule is matched against the file's parent
//     directory, so a file inside an excluded directory is excluded even
//     when the caller did not prune that directory first. A file whose own
//     name equals a dir-only pattern is not excluded by that rule.
//
// Skip returns false (the caller falls back to its other skip logic) when
// the Matcher is nil or has no rules, when the path cannot be made relative
// to repoPath, when it is the repo root itself, and when it lies outside the
// repo: a .gitignore has no say over paths above its own directory.
func (m *Matcher) Skip(absPath string, isDir bool) bool {
	if m == nil || len(m.rules) == 0 {
		return false
	}
	rel, err := filepath.Rel(m.repoPath, absPath)
	if err != nil {
		return false
	}
	rel = filepath.ToSlash(rel)
	if rel == "." {
		return false
	}
	// filepath.Rel expresses paths outside repoPath with leading ".."
	// elements; matching their segments against repo rules would give
	// false positives (e.g. ../other/target vs. the rule "target").
	if rel == ".." || strings.HasPrefix(rel, "../") {
		return false
	}
	base := filepath.Base(rel)

	// Parent directory of rel, used for dir-only rules when rel is a file.
	// Empty when rel sits directly in the repo root.
	parent, parentBase := "", ""
	if i := strings.LastIndex(rel, "/"); i > 0 {
		parent = rel[:i]
		parentBase = filepath.Base(parent)
	}

	for _, r := range m.rules {
		target, name := rel, base
		if r.dirOnly && !isDir {
			if parent == "" {
				continue
			}
			target, name = parent, parentBase
		}
		if matchRule(r, target, name) {
			return true
		}
	}
	return false
}
// compileRule normalizes one .gitignore line into a rule. The caller has
// already trimmed whitespace and filtered out blanks, comments and
// negations.
//
// Normalization steps, in order:
//
//   - a trailing "/" marks the rule dir-only and is removed;
//   - a leading "/" marks the rule anchored at the repo root and is removed;
//   - otherwise a leading "**/" followed by a single segment is removed and
//     the rule stays unanchored, because gitignore defines "**/foo" as
//     equivalent to "foo" (match at any depth);
//   - any remaining interior "/" anchors the rule, since "path/foo" only
//     matches relative to the repo root.
//
// Other uses of "**" (for example "**/a/b" or "a/**/b") are left untouched.
// The stored pattern is the normalized text, and pathParts is that text
// split on "/".
func compileRule(line string) rule {
	var r rule

	// Trailing slash → dir-only.
	if strings.HasSuffix(line, "/") {
		r.dirOnly = true
		line = strings.TrimSuffix(line, "/")
	}

	switch {
	case strings.HasPrefix(line, "/"):
		// Leading slash → anchored at the repo root.
		r.anchored = true
		line = strings.TrimPrefix(line, "/")
	case strings.HasPrefix(line, "**/"):
		// "**/name" means "name in any directory", which is exactly what an
		// unanchored single-segment rule already expresses. Without this the
		// interior slash below would anchor it and it would match at one
		// fixed depth only.
		if rest := strings.TrimPrefix(line, "**/"); rest != "" && !strings.Contains(rest, "/") {
			line = rest
		}
	}

	// Interior slash also anchors (per gitignore spec).
	if strings.Contains(line, "/") {
		r.anchored = true
	}

	r.hasGlob = strings.ContainsAny(line, "*?[")
	r.pathParts = strings.Split(line, "/")
	r.pattern = line
	return r
}
// matchRule reports whether a single compiled rule excludes rel.
//
// rel is the slash-separated path relative to the repo root and base is its
// final segment.
//
// Anchored rules are compared segment by segment from the root via
// matchPath. Unanchored rules are compared against base and against every
// segment of rel, so a path is excluded when its own name or the name of any
// enclosing directory matches. This applies equally to plain names
// ("node_modules" excludes "vendor/node_modules/x.js") and to glob patterns
// ("*.egg-info" excludes "pkg.egg-info/PKG-INFO").
func matchRule(r rule, rel, base string) bool {
	segs := strings.Split(rel, "/")
	if r.anchored {
		// Match the whole rel path against the rule's path.
		return matchPath(r.pathParts, segs)
	}

	matches := func(name string) bool {
		if !r.hasGlob {
			return name == r.pattern
		}
		// filepath.Match fails only on a malformed pattern; such a rule
		// simply never matches.
		ok, err := filepath.Match(r.pattern, name)
		return err == nil && ok
	}

	if matches(base) {
		return true
	}
	for _, seg := range segs {
		if matches(seg) {
			return true
		}
	}
	return false
}
// matchPath reports whether an anchored pattern, given as its "/"-separated
// segments, matches the leading segments of a path.
//
// Segment i of the pattern is compared with segment i of the path using
// filepath.Match, so each segment may contain globs. The path may be longer
// than the pattern: extra trailing segments are allowed, which is how
// "/target" covers both "target" and "target/x.go". A path shorter than the
// pattern never matches, and a malformed glob segment counts as a mismatch.
func matchPath(patternParts, pathParts []string) bool {
	if len(pathParts) < len(patternParts) {
		return false
	}
	for idx := range patternParts {
		matched, matchErr := filepath.Match(patternParts[idx], pathParts[idx])
		if matchErr != nil {
			return false
		}
		if !matched {
			return false
		}
	}
	return true
}