patchkit.go

  1package patchkit
  2
  3import (
  4	"fmt"
  5	"go/scanner"
  6	"go/token"
  7	"slices"
  8	"strings"
  9	"unicode"
 10
 11	"sketch.dev/claudetool/editbuf"
 12)
 13
// A Spec specifies a single patch: replace the Len bytes of Src
// starting at Off (which hold Old) with New.
type Spec struct {
	Off int    // Byte offset in Src at which to apply the replacement
	Len int    // Number of bytes to replace, starting at Off
	Src string // Original string (for debugging)
	Old string // Search string; the text currently at Src[Off:Off+Len]
	New string // Replacement string
}
 22
 23// Unique generates a patch spec to apply op, given a unique occurrence of needle in haystack and replacement text replace.
 24// It reports the number of matches found for needle in haystack: 0, 1, or 2 (for any value > 1).
 25func Unique(haystack, needle, replace string) (*Spec, int) {
 26	prefix, rest, ok := strings.Cut(haystack, needle)
 27	if !ok {
 28		return nil, 0
 29	}
 30	if strings.Contains(rest, needle) {
 31		return nil, 2
 32	}
 33	s := &Spec{
 34		Off: len(prefix),
 35		Len: len(needle),
 36		Src: haystack,
 37		Old: needle,
 38		New: replace,
 39	}
 40	return s, 1
 41}
 42
 43// minimize reduces the size of the patch by removing any shared prefix and suffix.
 44func (s *Spec) minimize() {
 45	pre := commonPrefixLen(s.Old, s.New)
 46	s.Off += pre
 47	s.Len -= pre
 48	s.Old = s.Old[pre:]
 49	s.New = s.New[pre:]
 50	suf := commonSuffixLen(s.Old, s.New)
 51	s.Len -= suf
 52	s.Old = s.Old[:len(s.Old)-suf]
 53	s.New = s.New[:len(s.New)-suf]
 54}
 55
 56// ApplyToEditBuf applies the patch to the given edit buffer.
 57func (s *Spec) ApplyToEditBuf(buf *editbuf.Buffer) {
 58	s.minimize()
 59	buf.Replace(s.Off, s.Off+s.Len, s.New)
 60}
 61
// UniqueDedent is Unique, but with flexibility around consistent whitespace prefix changes.
// Unlike Unique, which returns a count of matches,
// UniqueDedent returns a boolean indicating whether a unique match was found.
// It is for LLMs that have a hard time reliably reproducing uniform whitespace prefixes.
// For example, they may generate 8 spaces instead of 6 for all relevant lines.
// UniqueDedent adjusts the needle's whitespace prefix to match the haystack's
// and then replaces the unique instance of needle in haystack with replacement.
func UniqueDedent(haystack, needle, replace string) (*Spec, bool) {
	// TODO: this all definitely admits of some optimization
	haystackLines := slices.Collect(strings.Lines(haystack))
	needleLines := slices.Collect(strings.Lines(needle))
	// Find the unique haystack line range that matches needle when
	// leading/trailing whitespace is ignored on every line.
	match := uniqueTrimmedLineMatch(haystackLines, needleLines)
	if match == -1 {
		return nil, false
	}
	// We now systematically adjust needle's whitespace prefix to match haystack.
	// The first line gets special treatment, because its leading whitespace is irrelevant,
	// and models often skip past it (or part of it).
	if len(needleLines) == 0 {
		return nil, false
	}
	// First line: cut leading whitespace and make corresponding fixes to replacement.
	// The leading whitespace will come out in the wash in Unique.
	// We need to make corresponding fixes to the replacement.
	nl0 := needleLines[0]
	noWS := strings.TrimLeftFunc(nl0, unicode.IsSpace)
	ws0, _ := strings.CutSuffix(nl0, noWS) // can't fail: noWS is a suffix of nl0 by construction
	rest, ok := strings.CutPrefix(replace, ws0)
	if ok {
		// Adjust needle and replacement in tandem, so the patch stays consistent.
		nl0 = noWS
		replace = rest
	}
	// Calculate common whitespace prefixes for the rest.
	// haystackPrefix covers the matched haystack lines; needlePrefix covers
	// the needle lines after the (specially handled) first line.
	haystackPrefix := commonWhitespacePrefix(haystackLines[match : match+len(needleLines)])
	needlePrefix := commonWhitespacePrefix(needleLines[1:])
	nbuf := new(strings.Builder)
	for i, line := range needleLines {
		if i == 0 {
			// First line was already adjusted above.
			nbuf.WriteString(nl0)
			continue
		}
		// Allow empty (newline-only) lines not to be prefixed.
		if strings.TrimRight(line, "\n\r") == "" {
			nbuf.WriteString(line)
			continue
		}
		// Swap in haystackPrefix for needlePrefix.
		nbuf.WriteString(haystackPrefix)
		nbuf.WriteString(line[len(needlePrefix):])
	}
	// Do a replacement with our new-and-improved needle.
	needle = nbuf.String()
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
121
// tok is a single scanned Go token: its kind, source position, and
// literal text (when the scanner provides one).
type tok struct {
	pos token.Position // position in the scanned source
	tok token.Token    // token kind
	lit string         // literal text, if any (identifiers, literals, comments)
}
127
128func (t tok) String() string {
129	if t.lit == "" {
130		return fmt.Sprintf("%s", t.tok)
131	}
132	return fmt.Sprintf("%s(%q)", t.tok, t.lit)
133}
134
135func tokenize(code string) ([]tok, bool) {
136	var s scanner.Scanner
137	fset := token.NewFileSet()
138	file := fset.AddFile("", fset.Base(), len(code))
139	s.Init(file, []byte(code), nil, scanner.ScanComments)
140	var tokens []tok
141	for {
142		pos, t, lit := s.Scan()
143		if s.ErrorCount > 0 {
144			return nil, false // invalid Go code (or not Go code at all)
145		}
146		if t == token.EOF {
147			return tokens, true
148		}
149		tokens = append(tokens, tok{pos: fset.PositionFor(pos, false), tok: t, lit: lit})
150	}
151}
152
153func tokensEqual(a, b []tok) bool {
154	if len(a) != len(b) {
155		return false
156	}
157	for i := range a {
158		at, bt := a[i], b[i]
159		// positions are expected to differ
160		if at.tok != bt.tok || at.lit != bt.lit {
161			return false
162		}
163	}
164	return true
165}
166
167func tokensUniqueMatch(haystack, needle []tok) int {
168	// TODO: optimize
169	match := -1
170	for i := range haystack {
171		rest := haystack[i:]
172		if len(rest) < len(needle) {
173			break
174		}
175		rest = rest[:len(needle)]
176		if !tokensEqual(rest, needle) {
177			continue
178		}
179		if match != -1 {
180			return -1 // multiple matches
181		}
182		match = i
183	}
184	return match
185}
186
// UniqueGoTokens is Unique, but with flexibility around all insignificant whitespace.
// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
// It is safe (enough) because it ensures that the needle alterations occurs only in places
// where whitespace is not semantically significant.
// In practice, this appears safe.
func UniqueGoTokens(haystack, needle, replace string) (*Spec, bool) {
	// Both needle and haystack must scan cleanly as Go for a token-level match.
	nt, ok := tokenize(needle)
	if !ok {
		return nil, false
	}
	ht, ok := tokenize(haystack)
	if !ok {
		return nil, false
	}
	match := tokensUniqueMatch(ht, nt)
	if match == -1 {
		return nil, false
	}
	matchEnd := match + len(nt) - 1 // index of the last matched token
	start := ht[match].pos.Offset
	// Rewrite needle as the exact haystack text spanning the matched tokens
	// (from the first matched token up to the start of the following token),
	// so that Unique below can match it byte-for-byte.
	needle = haystack[start:]
	if matchEnd+1 < len(ht) {
		// todo: handle match at very end of file
		end := ht[matchEnd+1].pos.Offset
		needle = needle[:end-start]
	}
	// OK, declare this very fuzzy match to be our new needle.
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
220
221// UniqueInValidGo is Unique, but with flexibility around all leading and trailing whitespace.
222// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
223// It is safe (enough) because it ensures that the needle alterations occurs only in places
224// where whitespace is not semantically significant.
225// In practice, this appears safe.
226func UniqueInValidGo(haystack, needle, replace string) (*Spec, bool) {
227	haystackLines := slices.Collect(strings.Lines(haystack))
228	needleLines := slices.Collect(strings.Lines(needle))
229	matchStart := uniqueTrimmedLineMatch(haystackLines, needleLines)
230	if matchStart == -1 {
231		return nil, false
232	}
233	needle, replace = improveNeedle(haystack, needle, replace, matchStart)
234	matchEnd := matchStart + strings.Count(needle, "\n")
235	// Ensure that none of the lines that we fuzzy-matched involve a multiline comment or string literal.
236	var s scanner.Scanner
237	fset := token.NewFileSet()
238	file := fset.AddFile("", fset.Base(), len(haystack))
239	s.Init(file, []byte(haystack), nil, scanner.ScanComments)
240	for {
241		pos, tok, lit := s.Scan()
242		if s.ErrorCount > 0 {
243			return nil, false // invalid Go code (or not Go code at all)
244		}
245		if tok == token.EOF {
246			break
247		}
248		if tok == token.SEMICOLON || !strings.Contains(lit, "\n") {
249			continue
250		}
251		// In a token that spans multiple lines,
252		// so not perfectly matching whitespace might be unsafe.
253		p := fset.Position(pos)
254		tokenStart := p.Line - 1 // 1-based to 0-based
255		tokenEnd := tokenStart + strings.Count(lit, "\n")
256		// Check whether [matchStart, matchEnd] overlaps [tokenStart, tokenEnd]
257		// TODO: think more about edge conditions here. Any off-by-one errors?
258		// For example, leading whitespace and trailing whitespace
259		// on this token's lines are not semantically significant.
260		if tokenStart <= matchEnd && matchStart <= tokenEnd {
261			// if tokenStart <= matchStart && tokenEnd <= tokenEnd {}
262			return nil, false // this token overlaps the range we're replacing, not safe
263		}
264	}
265
266	// TODO: restore this sanity check? it's mildly expensive and i've never seen it fail.
267	// replaced := strings.Join(haystackLines[:matchStart], "") + replacement + strings.Join(haystackLines[matchEnd:], "")
268	// _, err := format.Source([]byte(replaced))
269	// if err != nil {
270	//     return nil, false
271	// }
272
273	// OK, declare this very fuzzy match to be our new needle.
274	needle = strings.Join(haystackLines[matchStart:matchEnd], "")
275	spec, count := Unique(haystack, needle, replace)
276	if count != 1 {
277		return nil, false
278	}
279	return spec, true
280}
281
282// UniqueTrim is Unique, but with flexibility to shrink old/replace in tandem.
283func UniqueTrim(haystack, needle, replace string) (*Spec, bool) {
284	// LLMs appear to particularly struggle with the first line of a patch.
285	// If that first line is replicated in replace,
286	// and removing it yields a unique match,
287	// we can remove that line entirely from both.
288	n0, nRest, nOK := strings.Cut(needle, "\n")
289	r0, rRest, rOK := strings.Cut(replace, "\n")
290	if !nOK || !rOK || n0 != r0 {
291		return nil, false
292	}
293	spec, count := Unique(haystack, nRest, rRest)
294	if count != 1 {
295		return nil, false
296	}
297	return spec, true
298}
299
// uniqueTrimmedLineMatch returns the index of the line in haystackLines at
// which needleLines match, comparing every line with its leading and trailing
// whitespace stripped. It returns -1 when there is no match, or more than one.
func uniqueTrimmedLineMatch(haystackLines, needleLines []string) int {
	// TODO: optimize
	trim := func(lines []string) []string {
		out := make([]string, len(lines))
		for i, line := range lines {
			out[i] = strings.TrimSpace(line)
		}
		return out
	}
	hay := trim(haystackLines)
	want := trim(needleLines)
	found := -1
	for i := range hay {
		if len(hay)-i < len(want) {
			break // remaining window too short; no further match possible
		}
		if !slices.Equal(hay[i:i+len(want)], want) {
			continue
		}
		if found != -1 {
			return -1 // multiple matches
		}
		found = i
	}
	return found
}
324
// trimSpaceAll returns a new slice whose elements are the elements of x
// with leading and trailing whitespace removed. x is left unmodified.
func trimSpaceAll(x []string) []string {
	out := make([]string, len(x))
	for i := range x {
		out[i] = strings.TrimSpace(x[i])
	}
	return out
}
332
333// improveNeedle adjusts both needle and replacement in tandem to better match haystack.
334// Note that we adjust search and replace together.
335func improveNeedle(haystack, needle, replacement string, matchLine int) (string, string) {
336	// TODO: we make new slices too much
337	needleLines := slices.Collect(strings.Lines(needle))
338	if len(needleLines) == 0 {
339		return needle, replacement
340	}
341	haystackLines := slices.Collect(strings.Lines(haystack))
342	if matchLine+len(needleLines) > len(haystackLines) {
343		// should be impossible, but just in case
344		return needle, replacement
345	}
346	// Add trailing last-line newline if needed to better match haystack.
347	if !strings.HasSuffix(needle, "\n") && strings.HasSuffix(haystackLines[matchLine+len(needleLines)-1], "\n") {
348		needle += "\n"
349		replacement += "\n"
350	}
351	// Add leading first-line prefix if needed to better match haystack.
352	rest, ok := strings.CutSuffix(haystackLines[matchLine], needleLines[0])
353	if ok {
354		needle = rest + needle
355		replacement = rest + replacement
356	}
357	return needle, replacement
358}
359
// isNonSpace reports whether r is not Unicode whitespace.
// It is the complement of unicode.IsSpace, shaped for use with the
// strings Index/Trim *Func helpers.
func isNonSpace(r rune) bool {
	return !unicode.IsSpace(r)
}
363
// whitespacePrefix returns the run of leading whitespace in s.
// When s contains no non-space rune at all (including when s is empty),
// it returns "" rather than s, preserving the "not found" mapping.
func whitespacePrefix(s string) string {
	end := strings.IndexFunc(s, func(r rune) bool { return !unicode.IsSpace(r) })
	if end < 0 {
		return "" // all whitespace (or empty): treated as no usable prefix
	}
	return s[:end]
}
368
369// commonWhitespacePrefix returns the longest common whitespace prefix of the elements of x, somewhat flexibly.
370func commonWhitespacePrefix(x []string) string {
371	var pre string
372	for i, s := range x {
373		if i == 0 {
374			pre = s
375			continue
376		}
377		// ignore line endings for the moment
378		// (this is just for prefixes)
379		s = strings.TrimRight(s, "\n\r")
380		if s == "" {
381			continue
382		}
383		n := commonPrefixLen(pre, whitespacePrefix(s))
384		if n == 0 {
385			return ""
386		}
387		pre = pre[:n]
388	}
389	pre = strings.TrimRightFunc(pre, isNonSpace)
390	return pre
391}
392
// commonPrefixLen returns the length in bytes of the common prefix of a and b.
func commonPrefixLen(a, b string) int {
	// TODO: optimize, see https://go-review.googlesource.com/c/go/+/408116
	limit := min(len(a), len(b))
	n := 0
	for n < limit && a[n] == b[n] {
		n++
	}
	return n
}
404
// commonSuffixLen returns the length in bytes of the common suffix of a and b.
func commonSuffixLen(a, b string) int {
	// TODO: optimize
	limit := min(len(a), len(b))
	n := 0
	for n < limit && a[len(a)-1-n] == b[len(b)-1-n] {
		n++
	}
	return n
}