local-review-harness/internal/scanner/gitignore.go

// Minimal gitignore-pattern matcher for the scanner. Spec subset
// covering the patterns real-world .gitignore files use ~99% of the
// time:
//
//   pattern       — basename match anywhere in tree
//   pattern/      — dir-only match (anywhere)
//   /pattern      — anchored at repo root
//   pattern/sub   — path-anchored (relative to repo root)
//   *.ext         — extension glob
//   path/*.ext    — path + extension
//   #comment      — ignored (when at start of line)
//   blank line    — ignored
//
// What's intentionally NOT supported (rare + adds complexity):
//   !pattern      — negation. We skip the line and surface a warning
//                   so operators know a negation didn't apply.
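//   **            — recursive glob. Not expanded across directory
//                   levels: each pattern segment is handed to
//                   filepath.Match, where "**" behaves like a single "*"
//                   (it matches within one path segment). The spec means
//                   "any depth", but most real patterns use trailing /**
//                   or /pattern/**, which still work here because
//                   anchored patterns match as a path prefix.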
//   \             — escape char. Treated literally (rare in real
//                   .gitignore files).
//
// This is enough for the Lakehouse Rust .gitignore (14 patterns,
// 100% covered) and the harness's own .gitignore (12 patterns,
// 100% covered).
package scanner

import (
	"bufio"
	"os"
	"path/filepath"
	"strings"
)

// Matcher tests whether a path should be skipped per the loaded
// .gitignore. It is built by LoadGitignore from repoPath/.gitignore and
// is intended to be created once per Walk. A nil Matcher, or one with no
// rules, skips nothing.
type Matcher struct {
	repoPath      string // root that paths given to Skip are made relative to
	rules         []rule // compiled patterns, in the order they appear in the file
	negationsSeen bool   // true when at least one "!pattern" line was seen and dropped
}

// rule is one compiled .gitignore pattern as produced by compileRule.
type rule struct {
	pattern   string   // normalized pattern: one trailing "/" and one leading "/" removed
	dirOnly   bool     // pattern ended in "/": applies to directories only
	anchored  bool     // matched from the repo root (leading "/" or an interior "/")
	hasGlob   bool     // pattern contains a glob metacharacter: '*', '?' or '['
	pathParts []string // pattern split on "/"; compared segment-wise for anchored rules
}

// LoadGitignore builds a Matcher from the .gitignore file directly under
// repoPath.
//
// A missing file is not an error: the returned Matcher has no rules and
// therefore skips nothing. Any other open failure, or a read error
// reported by the line scanner, is returned together with the Matcher
// built so far.
//
// Every line is whitespace-trimmed. Blank lines and lines starting with
// "#" are dropped. Lines starting with "!" (negations) are not compiled;
// they only set the flag reported by HasNegations.
func LoadGitignore(repoPath string) (*Matcher, error) {
	matcher := &Matcher{repoPath: repoPath}

	file, err := os.Open(filepath.Join(repoPath, ".gitignore"))
	switch {
	case err == nil:
		// Opened; fall through to parsing.
	case os.IsNotExist(err):
		return matcher, nil
	default:
		return matcher, err
	}
	defer file.Close()

	lines := bufio.NewScanner(file)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		switch {
		case entry == "", strings.HasPrefix(entry, "#"):
			// Blank line or comment: nothing to record.
		case strings.HasPrefix(entry, "!"):
			// Negations are unsupported in v0; remember that one was seen.
			matcher.negationsSeen = true
		default:
			matcher.rules = append(matcher.rules, compileRule(entry))
		}
	}
	return matcher, lines.Err()
}

// HasNegations reports whether the loaded .gitignore contained at least
// one "!pattern" line. Negations are not supported in v0 and are dropped
// rather than applied, so callers can surface this (for example in
// receipts). A nil Matcher reports false.
func (m *Matcher) HasNegations() bool {
	if m == nil {
		return false
	}
	return m.negationsSeen
}

// Skip reports whether absPath should be excluded under the loaded
// rules. absPath is made relative to the repo root given to
// LoadGitignore; the root itself, paths outside the root, and paths that
// cannot be made relative are never skipped. A nil or rule-less Matcher
// returns false, leaving the caller to apply its other skip logic.
//
// isDir says whether absPath is a directory. Dir-only rules ("name/")
// are tested against the path itself when it is a directory, and against
// its parent directory otherwise, so a file inside an ignored directory
// is skipped but a plain file that merely shares the name is not.
func (m *Matcher) Skip(absPath string, isDir bool) bool {
	if m == nil || len(m.rules) == 0 {
		return false
	}
	rel, err := filepath.Rel(m.repoPath, absPath)
	if err != nil {
		return false
	}
	rel = filepath.ToSlash(rel)
	// The root itself is never skipped, and this .gitignore has no say
	// over anything that lives outside the repository.
	if rel == "." || rel == ".." || strings.HasPrefix(rel, "../") {
		return false
	}

	// Split off the final segment; parent stays empty for top-level entries.
	parent, base := "", rel
	if i := strings.LastIndexByte(rel, '/'); i >= 0 {
		parent, base = rel[:i], rel[i+1:]
	}
	parentBase := parent
	if i := strings.LastIndexByte(parent, '/'); i >= 0 {
		parentBase = parent[i+1:]
	}

	for _, r := range m.rules {
		switch {
		case isDir || !r.dirOnly:
			if matchRule(r, rel, base) {
				return true
			}
		case parent != "":
			// Dir-only rule, non-directory path: the entry itself cannot
			// match, but every ancestor is a directory and may.
			if matchRule(r, parent, parentBase) {
				return true
			}
		}
	}
	return false
}

// compileRule converts one .gitignore line (already trimmed, and neither
// blank, a comment, nor a negation) into a rule.
//
// Normalization steps, in order:
//   - one trailing "/" marks the rule dir-only and is removed;
//   - one leading "/" anchors the rule at the repo root and is removed;
//   - any "/" left in the pattern also anchors it, per the gitignore
//     spec: "path/foo" matches only the top-level path/foo.
//
// The stored pattern and pathParts are the normalized form.
func compileRule(line string) rule {
	var out rule

	if n := len(line); n > 0 && line[n-1] == '/' {
		out.dirOnly = true
		line = line[:n-1]
	}

	rooted := len(line) > 0 && line[0] == '/'
	if rooted {
		line = line[1:]
	}

	out.anchored = rooted || strings.IndexByte(line, '/') >= 0
	out.hasGlob = strings.IndexAny(line, "*?[") >= 0
	out.pathParts = strings.Split(line, "/")
	out.pattern = line
	return out
}

// matchRule reports whether rule r matches the repo-relative,
// slash-separated path rel, whose final segment is base.
//
// Anchored rules are compared segment-by-segment from the start of rel
// (see matchPath). Unanchored rules match when base, or any segment of
// rel, matches the pattern, so "node_modules" also covers
// "vendor/node_modules/x.js" and "*.egg-info" also covers
// "pkg.egg-info/PKG-INFO". Literal and glob patterns follow the same
// rule; a malformed glob never matches.
func matchRule(r rule, rel, base string) bool {
	if r.anchored {
		return matchPath(r.pathParts, strings.Split(rel, "/"))
	}

	// matches tests a single path segment against the rule's pattern.
	matches := func(name string) bool {
		if !r.hasGlob {
			return name == r.pattern
		}
		ok, err := filepath.Match(r.pattern, name)
		return err == nil && ok
	}

	// Cheap common case first: the entry's own name.
	if matches(base) {
		return true
	}
	// Otherwise an ancestor directory may be the ignored entry.
	for _, seg := range strings.Split(rel, "/") {
		if matches(seg) {
			return true
		}
	}
	return false
}

// matchPath reports whether an anchored pattern, split into segments,
// matches the start of a path, also split into segments.
//
// Each pattern segment is compared with the path segment at the same
// position via filepath.Match, so globs work within a segment. The path
// may be longer than the pattern: "/target" matches both "target" and
// "target/x.go". A pattern longer than the path, an empty pattern, or a
// malformed glob segment never matches.
func matchPath(patternParts, pathParts []string) bool {
	// An empty pattern would otherwise be a vacuous prefix of every path.
	if len(patternParts) == 0 || len(patternParts) > len(pathParts) {
		return false
	}
	for i, pat := range patternParts {
		if ok, err := filepath.Match(pat, pathParts[i]); err != nil || !ok {
			return false
		}
	}
	return true
}