analyze.go

  1// Package onstart provides codebase analysis used to inform the initial system prompt.
  2package onstart
  3
  4import (
  5	"bufio"
  6	"bytes"
  7	"cmp"
  8	"context"
  9	"fmt"
 10	"io"
 11	"os"
 12	"os/exec"
 13	"path/filepath"
 14	"slices"
 15	"strings"
 16
 17	"golang.org/x/sync/errgroup"
 18)
 19
// Codebase contains metadata about the codebase, as gathered by AnalyzeCodebase.
// All file paths are relative to the repository root, in the form git ls-files reports them.
type Codebase struct {
	// ExtensionCounts tracks the number of files with each extension.
	// Keys are lowercased extensions including the leading dot (e.g. ".go");
	// files without an extension are counted under "<no-extension>".
	ExtensionCounts map[string]int
	// TotalFiles is the total number of files analyzed.
	TotalFiles int
	// BuildFiles contains paths to build and configuration files
	// (for example Makefiles and .vscode/tasks.json).
	BuildFiles []string
	// DocumentationFiles contains paths to documentation files
	// (for example README and CONTRIBUTING files).
	DocumentationFiles []string
	// GuidanceFiles contains paths to files that provide context and guidance to LLMs
	// but are not injected into the system prompt.
	GuidanceFiles []string
	// InjectFiles contains paths to critical guidance files (like DEAR_LLM.md, claude.md, and cursorrules)
	// that need to be injected into the system prompt for highest visibility.
	InjectFiles []string
	// InjectFileContents maps paths to file contents for critical inject files
	// to avoid requiring an extra file read during template rendering.
	// An inject file that could not be read has no entry here.
	InjectFileContents map[string]string
}
 39
 40// AnalyzeCodebase walks the codebase and analyzes the paths it finds.
 41func AnalyzeCodebase(ctx context.Context, repoPath string) (*Codebase, error) {
 42	// TODO: do a filesystem walk instead?
 43	// There's a balance: git ls-files skips node_modules etc,
 44	// but some guidance files might be locally .gitignored.
 45	cmd := exec.Command("git", "ls-files", "-z")
 46	cmd.Dir = repoPath
 47
 48	r, w := io.Pipe() // stream and scan rather than buffer
 49	cmd.Stdout = w
 50
 51	err := cmd.Start()
 52	if err != nil {
 53		return nil, err
 54	}
 55
 56	extCounts := make(map[string]int)
 57	var buildFiles []string
 58	var documentationFiles []string
 59	var guidanceFiles []string
 60	var injectFiles []string
 61	injectFileContents := make(map[string]string)
 62	var totalFiles int
 63
 64	eg, _ := errgroup.WithContext(ctx)
 65
 66	eg.Go(func() error {
 67		defer r.Close()
 68
 69		scanner := bufio.NewScanner(r)
 70		scanner.Split(scanZero)
 71		for scanner.Scan() {
 72			file := scanner.Text()
 73			file = strings.TrimSpace(file)
 74			if file == "" {
 75				continue
 76			}
 77			totalFiles++
 78			ext := strings.ToLower(filepath.Ext(file))
 79			ext = cmp.Or(ext, "<no-extension>")
 80			extCounts[ext]++
 81
 82			fileCategory := categorizeFile(file)
 83			// fmt.Println(file, "->", fileCategory)
 84			switch fileCategory {
 85			case "build":
 86				buildFiles = append(buildFiles, file)
 87			case "documentation":
 88				documentationFiles = append(documentationFiles, file)
 89			case "guidance":
 90				guidanceFiles = append(guidanceFiles, file)
 91			case "inject":
 92				injectFiles = append(injectFiles, file)
 93			}
 94		}
 95		return scanner.Err()
 96	})
 97
 98	// Wait for the command to complete
 99	eg.Go(func() error {
100		err := cmd.Wait()
101		if err != nil {
102			w.CloseWithError(err)
103		} else {
104			w.Close()
105		}
106		return err
107	})
108
109	if err := eg.Wait(); err != nil {
110		return nil, err
111	}
112
113	// Read content of inject files
114	for _, filePath := range injectFiles {
115		absPath := filepath.Join(repoPath, filePath)
116		content, err := os.ReadFile(absPath)
117		if err != nil {
118			fmt.Printf("Warning: Failed to read inject file %s: %v\n", filePath, err)
119			continue
120		}
121		injectFileContents[filePath] = string(content)
122	}
123
124	return &Codebase{
125		ExtensionCounts:    extCounts,
126		TotalFiles:         totalFiles,
127		BuildFiles:         buildFiles,
128		DocumentationFiles: documentationFiles,
129		GuidanceFiles:      guidanceFiles,
130		InjectFiles:        injectFiles,
131		InjectFileContents: injectFileContents,
132	}, nil
133}
134
// categorizeFile classifies a repository-relative path (as returned by git ls-files,
// using "/" separators) as "build", "documentation", "guidance", or "inject".
// It returns the empty string when the path fits none of those categories.
//
// Rules are evaluated in order and the first match wins; all filename
// comparisons are case-insensitive, while the Copilot path is matched exactly.
func categorizeFile(path string) string {
	base := strings.ToLower(filepath.Base(path))

	startsWith := func(prefix string) bool {
		return strings.HasPrefix(base, prefix)
	}
	// markdownNamed reports whether the filename looks like "<stem>….md".
	markdownNamed := func(stem string) bool {
		return startsWith(stem) && strings.HasSuffix(base, ".md")
	}

	// git ls-files paths are relative to the repo root, so a path without a
	// separator is a file that sits directly in the root.
	inRepoRoot := !strings.Contains(path, "/")

	switch {
	// Critical guidance files: only honored at the repository root.
	case inRepoRoot && (markdownNamed("claude.") ||
		startsWith("dear_llm") ||
		markdownNamed("agents.") ||
		strings.Contains(base, "cursorrules")):
		return "inject"

	// GitHub Copilot: https://code.visualstudio.com/docs/copilot/copilot-customization
	case path == ".github/copilot-instructions.md":
		return "inject"

	// Build and configuration files.
	case startsWith("makefile"),
		strings.HasSuffix(strings.ToLower(path), ".vscode/tasks.json"):
		return "build"

	// General documentation.
	case startsWith("readme"), startsWith("contributing"):
		return "documentation"

	// Guidance that is not critical enough to inject, e.g. claude.md files
	// outside the repo root.
	// NOTE(review): this matches "agent.*.md" while the inject rule matches
	// "agents.*.md" — confirm the asymmetry is intentional.
	case markdownNamed("claude."), markdownNamed("agent."):
		return "guidance"
	}

	return ""
}
184
185// TopExtensions returns the top 5 most common file extensions in the codebase
186func (c *Codebase) TopExtensions() []string {
187	type extCount struct {
188		ext   string
189		count int
190	}
191	pairs := make([]extCount, 0, len(c.ExtensionCounts))
192	for ext, count := range c.ExtensionCounts {
193		pairs = append(pairs, extCount{ext, count})
194	}
195
196	// Sort by count (descending), then by extension (ascending)
197	slices.SortFunc(pairs, func(a, b extCount) int {
198		return cmp.Or(
199			-cmp.Compare(a.count, b.count),
200			cmp.Compare(a.ext, b.ext),
201		)
202	})
203
204	const nTop = 5
205	count := min(nTop, len(pairs))
206	result := make([]string, count)
207	for i := range count {
208		result[i] = fmt.Sprintf("%v: %v (%0.0f%%)", pairs[i].ext, pairs[i].count, 100*float64(pairs[i].count)/float64(c.TotalFiles))
209	}
210
211	return result
212}
213
// scanZero is a bufio.SplitFunc that splits input into NUL-terminated tokens.
// The terminating NUL byte is consumed but not included in the token. At EOF,
// any remaining bytes are returned as a final, unterminated token; otherwise
// the function asks the scanner for more data by returning a zero advance.
func scanZero(data []byte, atEOF bool) (advance int, token []byte, err error) {
	switch nul := bytes.IndexByte(data, 0); {
	case nul >= 0:
		// A complete NUL-terminated token is available.
		return nul + 1, data[:nul], nil
	case atEOF && len(data) > 0:
		// Trailing data without a terminator: hand it over as the last token.
		return len(data), data, nil
	default:
		// Either nothing is left, or the token is incomplete: request more data.
		return 0, nil, nil
	}
}