// Package scanner walks a repository tree, classifies files, and
// surfaces metadata for the analyzers + repo-intake report.
//
// Skip-list defaults to common build/dependency dirs that nobody
// wants to scan. Operators can extend via review-profile (Phase E).
package scanner

import (
	"io/fs"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// SkipDirs is the default skip-list. Matches dir basenames anywhere
// in the tree (not just at root). Includes the harness's own output
// dir so review-on-self doesn't loop.
//
// Walk never applies the list to the scan root itself, so a repository
// whose top-level directory happens to be called e.g. "build" is still
// scanned.
var SkipDirs = map[string]bool{
	".git":         true,
	".hg":          true,
	".svn":         true,
	"node_modules": true,
	"vendor":       true,
	"target":       true, // Rust
	"dist":         true,
	"build":        true,
	"__pycache__":  true,
	".venv":        true,
	"venv":         true,
	"bin":          true, // golangLAKEHOUSE convention; harness's own too
	".idea":        true,
	".vscode":      true,
	"reports":      true, // harness's own output
}

// File is one entry in the scan result.
type File struct {
	Path     string // relative to repo root
	Abs      string // absolute path on disk
	Size     int64  // bytes
	Lines    int    // 0 if not counted
	Language string // best-effort, "" if unknown
}

// Result is the scan summary the analyzers + reporters consume.
//
// Walk returns DependencyManifests and TestManifests sorted
// lexically, and LargestFiles ordered by descending size.
type Result struct {
	RepoPath            string
	Files               []File
	LanguageBreakdown   map[string]int // count of files by language
	LargestFiles        []File         // top 10 by size
	DependencyManifests []string       // relative paths to package.json / go.mod / etc
	TestManifests       []string       // tests/ dirs, *_test.go, *.test.ts, etc
}

// Walk produces a Result for repoPath. It returns an error when
// repoPath cannot be resolved or stat'ed, and fs.ErrInvalid when it is
// not a directory. Directories named in SkipDirs are silently
// filtered, and errors on individual entries are ignored (best
// effort). When countLines is true, files under the 5 MB cap are read
// for line counts (Phase B needs this; Phase A wires false for speed).
func Walk(repoPath string, countLines bool) (*Result, error) { abs, err := filepath.Abs(repoPath) if err != nil { return nil, err } if st, err := os.Stat(abs); err != nil { return nil, err } else if !st.IsDir() { return nil, fs.ErrInvalid } res := &Result{ RepoPath: abs, LanguageBreakdown: map[string]int{}, } walkErr := filepath.WalkDir(abs, func(p string, d fs.DirEntry, walkErr error) error { if walkErr != nil { return nil // best-effort; permission errors etc. are silent } if d.IsDir() { if SkipDirs[d.Name()] && p != abs { return filepath.SkipDir } return nil } // Skip dotfiles at file level (.gitignore etc. are interesting, // but most dotfiles are noise; Analyzers can opt back in). if strings.HasPrefix(d.Name(), ".") && !interestingDotfile(d.Name()) { return nil } info, err := d.Info() if err != nil { return nil } rel, err := filepath.Rel(abs, p) if err != nil { rel = p } f := File{ Path: rel, Abs: p, Size: info.Size(), Language: detectLanguage(d.Name()), } if countLines && info.Size() < 5_000_000 { // 5MB cap; massive files lose line precision but stay scannable if n, err := countFileLines(p); err == nil { f.Lines = n } } res.Files = append(res.Files, f) if f.Language != "" { res.LanguageBreakdown[f.Language]++ } if isManifest(d.Name()) { res.DependencyManifests = append(res.DependencyManifests, rel) } if isTestPath(rel) { res.TestManifests = append(res.TestManifests, rel) } return nil }) if walkErr != nil { return nil, walkErr } // Largest top-10 by size. sorted := make([]File, len(res.Files)) copy(sorted, res.Files) sort.Slice(sorted, func(i, j int) bool { return sorted[i].Size > sorted[j].Size }) if len(sorted) > 10 { sorted = sorted[:10] } res.LargestFiles = sorted // Stable order for downstream determinism. sort.Strings(res.DependencyManifests) sort.Strings(res.TestManifests) return res, nil } // interestingDotfile lets a few well-known dotfiles through despite // the leading-dot filter. 
// Keeps the scan honest about config files
// that often hold the real risk (e.g. committed .env).
//
// It reports true only for an exact, case-sensitive match of name
// against the allow-list below.
//
// NOTE(review): ".github" is normally a directory, and Walk applies
// this filter to non-directory entries only, so that entry looks
// inert — confirm before removing it.
func interestingDotfile(name string) bool {
	switch name {
	case ".env",
		".env.local",
		".env.production",
		".gitignore",
		".dockerignore",
		".github",
		".review-rules.md",
		".review-profile.yaml":
		return true
	}
	return false
}

// countFileLines reads the whole file at path and returns its line
// count: the number of '\n' bytes, plus one when the final line has
// no trailing newline. An empty file has 0 lines. A read error is
// returned unchanged together with a count of 0.
func countFileLines(path string) (int, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}
	if len(data) == 0 {
		return 0, nil
	}

	// Scan the bytes directly; converting to string would copy the
	// whole buffer just to count newlines.
	lines := 0
	for _, c := range data {
		if c == '\n' {
			lines++
		}
	}
	if data[len(data)-1] != '\n' {
		lines++ // last line without trailing newline
	}
	return lines, nil
}