Claude (review-harness setup) f3ee4722a8 Phase A + B (MVP) — local review harness
Implements the MVP cutline from the planning artifact:
- Phase A: skeleton + CLI dispatch + provider interface + stub model doctor
- Phase B: scanner + git probe + 12 static analyzers + reporters + pipeline
- Phase B fixtures: clean-repo, insecure-repo, degraded-repo

12 static analyzers per PROMPT.md "Suggested Static Checks For MVP":
hardcoded_paths, shell_execution, raw_sql_interpolation, broad_cors,
secret_patterns, large_files, todo_comments, missing_tests,
env_file_committed, unsafe_file_io, exposed_mutation_endpoint,
hardcoded_local_ip.

Acceptance gates passing:
- B1 (intake produces accurate counts) ✓
- B2 (insecure fixture fires ≥8 distinct check_ids — actually 11/12) ✓
- B3 (clean fixture produces 0 confirmed findings — no false positives) ✓
- B4 (scrum mode produces all 6 required markdown + JSON reports) ✓
- B5 (receipts.json marks degraded phases honestly) ✓
- F  (self-review on this repo runs without crashing) ✓ — exit 66 (degraded
  because Phase C LLM review is hardcoded skipped)

Phases C (LLM review), D (validation cross-check), E (memory + diff +
rules subcommands) deferred per the cutline. The MVP delivers the
evidence-first path; LLM is purely additive.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 00:56:02 -05:00

168 lines
4.5 KiB
Go

// Package scanner walks a repository tree, classifies files, and
// surfaces metadata for the analyzers + repo-intake report.
//
// Skip-list defaults to common build/dependency dirs that nobody
// wants to scan. Operators can extend via review-profile (Phase E).
package scanner
import (
	"bytes"
	"io/fs"
	"os"
	"path/filepath"
	"sort"
	"strings"
)
// SkipDirs is the default skip-list: directory basenames that are never
// worth scanning. A match anywhere in the tree (not just at the root)
// prunes the whole subtree. The harness's own output directory is
// included so review-on-self doesn't loop.
var SkipDirs = map[string]bool{
	// Version-control metadata.
	".git": true,
	".hg":  true,
	".svn": true,
	// Dependency and build output.
	"node_modules": true,
	"vendor":       true,
	"target":       true, // Rust
	"dist":         true,
	"build":        true,
	"bin":          true, // Go convention; the harness's own output lands here too
	// Python environments and caches.
	"__pycache__": true,
	".venv":       true,
	"venv":        true,
	// Editor/IDE state.
	".idea":   true,
	".vscode": true,
	// The harness's own report output.
	"reports": true,
}
// File is one entry in the scan result: a single regular file that
// survived the skip-list and dotfile filters.
type File struct {
Path string // path relative to repo root (filepath.Rel; falls back to absolute if Rel fails)
Abs string // absolute path on disk
Size int64 // size in bytes at scan time
Lines int // newline-based line count; 0 if not counted (countLines=false or file too large)
Language string // best-effort detection from the filename, "" if unknown
}
// Result is the scan summary the analyzers + reporters consume.
// Files preserves WalkDir's lexical order; the manifest slices are
// sorted so downstream output is deterministic.
type Result struct {
RepoPath string // absolute path of the scanned repo root
Files []File // every non-skipped file, in lexical walk order
LanguageBreakdown map[string]int // count of files by language (files with unknown language excluded)
LargestFiles []File // top 10 by size
DependencyManifests []string // relative paths to package.json / go.mod / etc, sorted
TestManifests []string // tests/ dirs, *_test.go, *.test.ts, etc, sorted
}
// Walk produces a Result for repoPath. It errors when repoPath does not
// exist or is not a directory; skipped dirs and uninteresting dotfiles
// are silently filtered. When countLines is true, each file under the
// size cap is read to populate File.Lines (Phase B needs this; Phase A
// wires false for speed).
func Walk(repoPath string, countLines bool) (*Result, error) {
	abs, err := filepath.Abs(repoPath)
	if err != nil {
		return nil, err
	}
	if st, err := os.Stat(abs); err != nil {
		return nil, err
	} else if !st.IsDir() {
		return nil, fs.ErrInvalid
	}
	res := &Result{
		RepoPath:          abs,
		LanguageBreakdown: map[string]int{},
	}
	walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, entErr error) error {
		if entErr != nil {
			return nil // best-effort; permission errors etc. are silent
		}
		if d.IsDir() {
			// Prune skip-listed subtrees, but never the root itself
			// (a repo whose basename happens to match, e.g. "build").
			if SkipDirs[d.Name()] && p != abs {
				return filepath.SkipDir
			}
			return nil
		}
		// Skip dotfiles at file level (.gitignore etc. are interesting,
		// but most dotfiles are noise; analyzers can opt back in).
		if strings.HasPrefix(d.Name(), ".") && !interestingDotfile(d.Name()) {
			return nil
		}
		info, err := d.Info()
		if err != nil {
			return nil // entry vanished mid-walk; drop it
		}
		rel, err := filepath.Rel(abs, p)
		if err != nil {
			rel = p // keep the file under its absolute path rather than dropping it
		}
		f := File{
			Path:     rel,
			Abs:      p,
			Size:     info.Size(),
			Language: detectLanguage(d.Name()),
		}
		// 5MB cap: massive files lose line precision but stay scannable.
		if countLines && info.Size() < 5_000_000 {
			if n, err := countFileLines(p); err == nil {
				f.Lines = n
			}
		}
		res.Files = append(res.Files, f)
		if f.Language != "" {
			res.LanguageBreakdown[f.Language]++
		}
		if isManifest(d.Name()) {
			res.DependencyManifests = append(res.DependencyManifests, rel)
		}
		if isTestPath(rel) {
			res.TestManifests = append(res.TestManifests, rel)
		}
		return nil
	})
	if walkErr != nil {
		return nil, walkErr
	}
	// Largest top-10 by size. Tie-break on Path so equal-size files keep
	// a deterministic order across runs (sort.Slice is not stable).
	sorted := make([]File, len(res.Files))
	copy(sorted, res.Files)
	sort.Slice(sorted, func(i, j int) bool {
		if sorted[i].Size != sorted[j].Size {
			return sorted[i].Size > sorted[j].Size
		}
		return sorted[i].Path < sorted[j].Path
	})
	if len(sorted) > 10 {
		sorted = sorted[:10]
	}
	res.LargestFiles = sorted
	// Stable order for downstream determinism.
	sort.Strings(res.DependencyManifests)
	sort.Strings(res.TestManifests)
	return res, nil
}
// interestingDotfile reports whether a leading-dot filename should
// survive the dotfile filter. A handful of well-known config files are
// allowed through because they often hold the real risk (e.g. a
// committed .env).
func interestingDotfile(name string) bool {
	allowed := [...]string{
		".env", ".env.local", ".env.production",
		".gitignore", ".dockerignore", ".github",
		".review-rules.md", ".review-profile.yaml",
	}
	for _, a := range allowed {
		if name == a {
			return true
		}
	}
	return false
}
func countFileLines(path string) (int, error) {
b, err := os.ReadFile(path)
if err != nil {
return 0, err
}
if len(b) == 0 {
return 0, nil
}
n := strings.Count(string(b), "\n")
if !strings.HasSuffix(string(b), "\n") {
n++ // last line without trailing newline
}
return n, nil
}