old.go

  1// Copyright 2022 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package lcs
  6
  7// TODO(adonovan): remove unclear references to "old" in this package.
  8
  9import (
 10	"fmt"
 11)
 12
 13// A Diff is a replacement of a portion of A by a portion of B.
 14type Diff struct {
 15	Start, End         int // offsets of portion to delete in A
 16	ReplStart, ReplEnd int // offset of replacement text in B
 17}
 18
 19// DiffStrings returns the differences between two strings.
 20// It does not respect rune boundaries.
 21func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) }
 22
 23// DiffBytes returns the differences between two byte sequences.
 24// It does not respect rune boundaries.
 25func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) }
 26
 27// DiffRunes returns the differences between two rune sequences.
 28func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) }
 29
 30func diff(seqs sequences) []Diff {
 31	// A limit on how deeply the LCS algorithm should search. The value is just a guess.
 32	const maxDiffs = 100
 33	diff, _ := compute(seqs, twosided, maxDiffs/2)
 34	return diff
 35}
 36
 37// compute computes the list of differences between two sequences,
 38// along with the LCS. It is exercised directly by tests.
 39// The algorithm is one of {forward, backward, twosided}.
 40func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) {
 41	if limit <= 0 {
 42		limit = 1 << 25 // effectively infinity
 43	}
 44	alen, blen := seqs.lengths()
 45	g := &editGraph{
 46		seqs:  seqs,
 47		vf:    newtriang(limit),
 48		vb:    newtriang(limit),
 49		limit: limit,
 50		ux:    alen,
 51		uy:    blen,
 52		delta: alen - blen,
 53	}
 54	lcs := algo(g)
 55	diffs := lcs.toDiffs(alen, blen)
 56	return diffs, lcs
 57}
 58
 59// editGraph carries the information for computing the lcs of two sequences.
 60type editGraph struct {
 61	seqs   sequences
 62	vf, vb label // forward and backward labels
 63
 64	limit int // maximal value of D
 65	// the bounding rectangle of the current edit graph
 66	lx, ly, ux, uy int
 67	delta          int // common subexpression: (ux-lx)-(uy-ly)
 68}
 69
 70// toDiffs converts an LCS to a list of edits.
 71func (lcs lcs) toDiffs(alen, blen int) []Diff {
 72	var diffs []Diff
 73	var pa, pb int // offsets in a, b
 74	for _, l := range lcs {
 75		if pa < l.X || pb < l.Y {
 76			diffs = append(diffs, Diff{pa, l.X, pb, l.Y})
 77		}
 78		pa = l.X + l.Len
 79		pb = l.Y + l.Len
 80	}
 81	if pa < alen || pb < blen {
 82		diffs = append(diffs, Diff{pa, alen, pb, blen})
 83	}
 84	return diffs
 85}
 86
 87// --- FORWARD ---
 88
 89// fdone decides if the forward path has reached the upper right
 90// corner of the rectangle. If so, it also returns the computed lcs.
 91func (e *editGraph) fdone(D, k int) (bool, lcs) {
 92	// x, y, k are relative to the rectangle
 93	x := e.vf.get(D, k)
 94	y := x - k
 95	if x == e.ux && y == e.uy {
 96		return true, e.forwardlcs(D, k)
 97	}
 98	return false, nil
 99}
100
101// run the forward algorithm, until success or up to the limit on D.
102func forward(e *editGraph) lcs {
103	e.setForward(0, 0, e.lx)
104	if ok, ans := e.fdone(0, 0); ok {
105		return ans
106	}
107	// from D to D+1
108	for D := range e.limit {
109		e.setForward(D+1, -(D + 1), e.getForward(D, -D))
110		if ok, ans := e.fdone(D+1, -(D + 1)); ok {
111			return ans
112		}
113		e.setForward(D+1, D+1, e.getForward(D, D)+1)
114		if ok, ans := e.fdone(D+1, D+1); ok {
115			return ans
116		}
117		for k := -D + 1; k <= D-1; k += 2 {
118			// these are tricky and easy to get backwards
119			lookv := e.lookForward(k, e.getForward(D, k-1)+1)
120			lookh := e.lookForward(k, e.getForward(D, k+1))
121			if lookv > lookh {
122				e.setForward(D+1, k, lookv)
123			} else {
124				e.setForward(D+1, k, lookh)
125			}
126			if ok, ans := e.fdone(D+1, k); ok {
127				return ans
128			}
129		}
130	}
131	// D is too large
132	// find the D path with maximal x+y inside the rectangle and
133	// use that to compute the found part of the lcs
134	kmax := -e.limit - 1
135	diagmax := -1
136	for k := -e.limit; k <= e.limit; k += 2 {
137		x := e.getForward(e.limit, k)
138		y := x - k
139		if x+y > diagmax && x <= e.ux && y <= e.uy {
140			diagmax, kmax = x+y, k
141		}
142	}
143	return e.forwardlcs(e.limit, kmax)
144}
145
146// recover the lcs by backtracking from the farthest point reached
147func (e *editGraph) forwardlcs(D, k int) lcs {
148	var ans lcs
149	for x := e.getForward(D, k); x != 0 || x-k != 0; {
150		if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) {
151			// if (x-1,y) is labelled D-1, x--,D--,k--,continue
152			D, k, x = D-1, k-1, x-1
153			continue
154		} else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) {
155			// if (x,y-1) is labelled D-1, x, D--,k++, continue
156			D, k = D-1, k+1
157			continue
158		}
159		// if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue
160		y := x - k
161		ans = ans.prepend(x+e.lx-1, y+e.ly-1)
162		x--
163	}
164	return ans
165}
166
167// start at (x,y), go up the diagonal as far as possible,
168// and label the result with d
169func (e *editGraph) lookForward(k, relx int) int {
170	rely := relx - k
171	x, y := relx+e.lx, rely+e.ly
172	if x < e.ux && y < e.uy {
173		x += e.seqs.commonPrefixLen(x, e.ux, y, e.uy)
174	}
175	return x
176}
177
178func (e *editGraph) setForward(d, k, relx int) {
179	x := e.lookForward(k, relx)
180	e.vf.set(d, k, x-e.lx)
181}
182
183func (e *editGraph) getForward(d, k int) int {
184	x := e.vf.get(d, k)
185	return x
186}
187
188// --- BACKWARD ---
189
190// bdone decides if the backward path has reached the lower left corner
191func (e *editGraph) bdone(D, k int) (bool, lcs) {
192	// x, y, k are relative to the rectangle
193	x := e.vb.get(D, k)
194	y := x - (k + e.delta)
195	if x == 0 && y == 0 {
196		return true, e.backwardlcs(D, k)
197	}
198	return false, nil
199}
200
201// run the backward algorithm, until success or up to the limit on D.
202// (used only by tests)
203func backward(e *editGraph) lcs {
204	e.setBackward(0, 0, e.ux)
205	if ok, ans := e.bdone(0, 0); ok {
206		return ans
207	}
208	// from D to D+1
209	for D := range e.limit {
210		e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
211		if ok, ans := e.bdone(D+1, -(D + 1)); ok {
212			return ans
213		}
214		e.setBackward(D+1, D+1, e.getBackward(D, D))
215		if ok, ans := e.bdone(D+1, D+1); ok {
216			return ans
217		}
218		for k := -D + 1; k <= D-1; k += 2 {
219			// these are tricky and easy to get wrong
220			lookv := e.lookBackward(k, e.getBackward(D, k-1))
221			lookh := e.lookBackward(k, e.getBackward(D, k+1)-1)
222			if lookv < lookh {
223				e.setBackward(D+1, k, lookv)
224			} else {
225				e.setBackward(D+1, k, lookh)
226			}
227			if ok, ans := e.bdone(D+1, k); ok {
228				return ans
229			}
230		}
231	}
232
233	// D is too large
234	// find the D path with minimal x+y inside the rectangle and
235	// use that to compute the part of the lcs found
236	kmax := -e.limit - 1
237	diagmin := 1 << 25
238	for k := -e.limit; k <= e.limit; k += 2 {
239		x := e.getBackward(e.limit, k)
240		y := x - (k + e.delta)
241		if x+y < diagmin && x >= 0 && y >= 0 {
242			diagmin, kmax = x+y, k
243		}
244	}
245	if kmax < -e.limit {
246		panic(fmt.Sprintf("no paths when limit=%d?", e.limit))
247	}
248	return e.backwardlcs(e.limit, kmax)
249}
250
251// recover the lcs by backtracking
252func (e *editGraph) backwardlcs(D, k int) lcs {
253	var ans lcs
254	for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != e.uy; {
255		if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) {
256			// D--, k--, x unchanged
257			D, k = D-1, k-1
258			continue
259		} else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) {
260			// D--, k++, x++
261			D, k, x = D-1, k+1, x+1
262			continue
263		}
264		y := x - (k + e.delta)
265		ans = ans.append(x+e.lx, y+e.ly)
266		x++
267	}
268	return ans
269}
270
271// start at (x,y), go down the diagonal as far as possible,
272func (e *editGraph) lookBackward(k, relx int) int {
273	rely := relx - (k + e.delta) // forward k = k + e.delta
274	x, y := relx+e.lx, rely+e.ly
275	if x > 0 && y > 0 {
276		x -= e.seqs.commonSuffixLen(0, x, 0, y)
277	}
278	return x
279}
280
281// convert to rectangle, and label the result with d
282func (e *editGraph) setBackward(d, k, relx int) {
283	x := e.lookBackward(k, relx)
284	e.vb.set(d, k, x-e.lx)
285}
286
287func (e *editGraph) getBackward(d, k int) int {
288	x := e.vb.get(d, k)
289	return x
290}
291
292// -- TWOSIDED ---
293
294func twosided(e *editGraph) lcs {
295	// The termination condition could be improved, as either the forward
296	// or backward pass could succeed before Myers' Lemma applies.
297	// Aside from questions of efficiency (is the extra testing cost-effective)
298	// this is more likely to matter when e.limit is reached.
299	e.setForward(0, 0, e.lx)
300	e.setBackward(0, 0, e.ux)
301
302	// from D to D+1
303	for D := range e.limit {
304		// just finished a backwards pass, so check
305		if got, ok := e.twoDone(D, D); ok {
306			return e.twolcs(D, D, got)
307		}
308		// do a forwards pass (D to D+1)
309		e.setForward(D+1, -(D + 1), e.getForward(D, -D))
310		e.setForward(D+1, D+1, e.getForward(D, D)+1)
311		for k := -D + 1; k <= D-1; k += 2 {
312			// these are tricky and easy to get backwards
313			lookv := e.lookForward(k, e.getForward(D, k-1)+1)
314			lookh := e.lookForward(k, e.getForward(D, k+1))
315			if lookv > lookh {
316				e.setForward(D+1, k, lookv)
317			} else {
318				e.setForward(D+1, k, lookh)
319			}
320		}
321		// just did a forward pass, so check
322		if got, ok := e.twoDone(D+1, D); ok {
323			return e.twolcs(D+1, D, got)
324		}
325		// do a backward pass, D to D+1
326		e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
327		e.setBackward(D+1, D+1, e.getBackward(D, D))
328		for k := -D + 1; k <= D-1; k += 2 {
329			// these are tricky and easy to get wrong
330			lookv := e.lookBackward(k, e.getBackward(D, k-1))
331			lookh := e.lookBackward(k, e.getBackward(D, k+1)-1)
332			if lookv < lookh {
333				e.setBackward(D+1, k, lookv)
334			} else {
335				e.setBackward(D+1, k, lookh)
336			}
337		}
338	}
339
340	// D too large. combine a forward and backward partial lcs
341	// first, a forward one
342	kmax := -e.limit - 1
343	diagmax := -1
344	for k := -e.limit; k <= e.limit; k += 2 {
345		x := e.getForward(e.limit, k)
346		y := x - k
347		if x+y > diagmax && x <= e.ux && y <= e.uy {
348			diagmax, kmax = x+y, k
349		}
350	}
351	if kmax < -e.limit {
352		panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit))
353	}
354	lcs := e.forwardlcs(e.limit, kmax)
355	// now a backward one
356	// find the D path with minimal x+y inside the rectangle and
357	// use that to compute the lcs
358	diagmin := 1 << 25 // infinity
359	for k := -e.limit; k <= e.limit; k += 2 {
360		x := e.getBackward(e.limit, k)
361		y := x - (k + e.delta)
362		if x+y < diagmin && x >= 0 && y >= 0 {
363			diagmin, kmax = x+y, k
364		}
365	}
366	if kmax < -e.limit {
367		panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit))
368	}
369	lcs = append(lcs, e.backwardlcs(e.limit, kmax)...)
370	// These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs)
371	ans := lcs.fix()
372	return ans
373}
374
375// Does Myers' Lemma apply?
376func (e *editGraph) twoDone(df, db int) (int, bool) {
377	if (df+db+e.delta)%2 != 0 {
378		return 0, false // diagonals cannot overlap
379	}
380	kmin := max(-df, -db+e.delta)
381	kmax := db + e.delta
382	if df < kmax {
383		kmax = df
384	}
385	for k := kmin; k <= kmax; k += 2 {
386		x := e.vf.get(df, k)
387		u := e.vb.get(db, k-e.delta)
388		if u <= x {
389			// is it worth looking at all the other k?
390			for l := k; l <= kmax; l += 2 {
391				x := e.vf.get(df, l)
392				y := x - l
393				u := e.vb.get(db, l-e.delta)
394				v := u - l
395				if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux {
396					return l, true
397				}
398			}
399			return k, true
400		}
401	}
402	return 0, false
403}
404
405func (e *editGraph) twolcs(df, db, kf int) lcs {
406	// db==df || db+1==df
407	x := e.vf.get(df, kf)
408	y := x - kf
409	kb := kf - e.delta
410	u := e.vb.get(db, kb)
411	v := u - kf
412
413	// Myers proved there is a df-path from (0,0) to (u,v)
414	// and a db-path from (x,y) to (N,M).
415	// In the first case the overall path is the forward path
416	// to (u,v) followed by the backward path to (N,M).
417	// In the second case the path is the backward path to (x,y)
418	// followed by the forward path to (x,y) from (0,0).
419
420	// Look for some special cases to avoid computing either of these paths.
421	if x == u {
422		// "babaab" "cccaba"
423		// already patched together
424		lcs := e.forwardlcs(df, kf)
425		lcs = append(lcs, e.backwardlcs(db, kb)...)
426		return lcs.sort()
427	}
428
429	// is (u-1,v) or (u,v-1) labelled df-1?
430	// if so, that forward df-1-path plus a horizontal or vertical edge
431	// is the df-path to (u,v), then plus the db-path to (N,M)
432	if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 {
433		//  "aabbab" "cbcabc"
434		lcs := e.forwardlcs(df-1, u-1-v)
435		lcs = append(lcs, e.backwardlcs(db, kb)...)
436		return lcs.sort()
437	}
438	if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u {
439		//  "abaabb" "bcacab"
440		lcs := e.forwardlcs(df-1, u-(v-1))
441		lcs = append(lcs, e.backwardlcs(db, kb)...)
442		return lcs.sort()
443	}
444
445	// The path can't possibly contribute to the lcs because it
446	// is all horizontal or vertical edges
447	if u == 0 || v == 0 || x == e.ux || y == e.uy {
448		// "abaabb" "abaaaa"
449		if u == 0 || v == 0 {
450			return e.backwardlcs(db, kb)
451		}
452		return e.forwardlcs(df, kf)
453	}
454
455	// is (x+1,y) or (x,y+1) labelled db-1?
456	if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 {
457		// "bababb" "baaabb"
458		lcs := e.backwardlcs(db-1, kb+1)
459		lcs = append(lcs, e.forwardlcs(df, kf)...)
460		return lcs.sort()
461	}
462	if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x {
463		// "abbbaa" "cabacc"
464		lcs := e.backwardlcs(db-1, kb-1)
465		lcs = append(lcs, e.forwardlcs(df, kf)...)
466		return lcs.sort()
467	}
468
469	// need to compute another path
470	// "aabbaa" "aacaba"
471	lcs := e.backwardlcs(db, kb)
472	oldx, oldy := e.ux, e.uy
473	e.ux = u
474	e.uy = v
475	lcs = append(lcs, forward(e)...)
476	e.ux, e.uy = oldx, oldy
477	return lcs.sort()
478}