1// Copyright 2022 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package lcs
6
7// TODO(adonovan): remove unclear references to "old" in this package.
8
9import (
10 "fmt"
11)
12
13// A Diff is a replacement of a portion of A by a portion of B.
14type Diff struct {
15 Start, End int // offsets of portion to delete in A
16 ReplStart, ReplEnd int // offset of replacement text in B
17}
18
19// DiffStrings returns the differences between two strings.
20// It does not respect rune boundaries.
21func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) }
22
23// DiffBytes returns the differences between two byte sequences.
24// It does not respect rune boundaries.
25func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) }
26
27// DiffRunes returns the differences between two rune sequences.
28func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) }
29
30func diff(seqs sequences) []Diff {
31 // A limit on how deeply the LCS algorithm should search. The value is just a guess.
32 const maxDiffs = 100
33 diff, _ := compute(seqs, twosided, maxDiffs/2)
34 return diff
35}
36
37// compute computes the list of differences between two sequences,
38// along with the LCS. It is exercised directly by tests.
39// The algorithm is one of {forward, backward, twosided}.
40func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) {
41 if limit <= 0 {
42 limit = 1 << 25 // effectively infinity
43 }
44 alen, blen := seqs.lengths()
45 g := &editGraph{
46 seqs: seqs,
47 vf: newtriang(limit),
48 vb: newtriang(limit),
49 limit: limit,
50 ux: alen,
51 uy: blen,
52 delta: alen - blen,
53 }
54 lcs := algo(g)
55 diffs := lcs.toDiffs(alen, blen)
56 return diffs, lcs
57}
58
59// editGraph carries the information for computing the lcs of two sequences.
60type editGraph struct {
61 seqs sequences
62 vf, vb label // forward and backward labels
63
64 limit int // maximal value of D
65 // the bounding rectangle of the current edit graph
66 lx, ly, ux, uy int
67 delta int // common subexpression: (ux-lx)-(uy-ly)
68}
69
70// toDiffs converts an LCS to a list of edits.
71func (lcs lcs) toDiffs(alen, blen int) []Diff {
72 var diffs []Diff
73 var pa, pb int // offsets in a, b
74 for _, l := range lcs {
75 if pa < l.X || pb < l.Y {
76 diffs = append(diffs, Diff{pa, l.X, pb, l.Y})
77 }
78 pa = l.X + l.Len
79 pb = l.Y + l.Len
80 }
81 if pa < alen || pb < blen {
82 diffs = append(diffs, Diff{pa, alen, pb, blen})
83 }
84 return diffs
85}
86
87// --- FORWARD ---
88
89// fdone decides if the forward path has reached the upper right
90// corner of the rectangle. If so, it also returns the computed lcs.
91func (e *editGraph) fdone(D, k int) (bool, lcs) {
92 // x, y, k are relative to the rectangle
93 x := e.vf.get(D, k)
94 y := x - k
95 if x == e.ux && y == e.uy {
96 return true, e.forwardlcs(D, k)
97 }
98 return false, nil
99}
100
101// run the forward algorithm, until success or up to the limit on D.
102func forward(e *editGraph) lcs {
103 e.setForward(0, 0, e.lx)
104 if ok, ans := e.fdone(0, 0); ok {
105 return ans
106 }
107 // from D to D+1
108 for D := range e.limit {
109 e.setForward(D+1, -(D + 1), e.getForward(D, -D))
110 if ok, ans := e.fdone(D+1, -(D + 1)); ok {
111 return ans
112 }
113 e.setForward(D+1, D+1, e.getForward(D, D)+1)
114 if ok, ans := e.fdone(D+1, D+1); ok {
115 return ans
116 }
117 for k := -D + 1; k <= D-1; k += 2 {
118 // these are tricky and easy to get backwards
119 lookv := e.lookForward(k, e.getForward(D, k-1)+1)
120 lookh := e.lookForward(k, e.getForward(D, k+1))
121 if lookv > lookh {
122 e.setForward(D+1, k, lookv)
123 } else {
124 e.setForward(D+1, k, lookh)
125 }
126 if ok, ans := e.fdone(D+1, k); ok {
127 return ans
128 }
129 }
130 }
131 // D is too large
132 // find the D path with maximal x+y inside the rectangle and
133 // use that to compute the found part of the lcs
134 kmax := -e.limit - 1
135 diagmax := -1
136 for k := -e.limit; k <= e.limit; k += 2 {
137 x := e.getForward(e.limit, k)
138 y := x - k
139 if x+y > diagmax && x <= e.ux && y <= e.uy {
140 diagmax, kmax = x+y, k
141 }
142 }
143 return e.forwardlcs(e.limit, kmax)
144}
145
146// recover the lcs by backtracking from the farthest point reached
147func (e *editGraph) forwardlcs(D, k int) lcs {
148 var ans lcs
149 for x := e.getForward(D, k); x != 0 || x-k != 0; {
150 if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) {
151 // if (x-1,y) is labelled D-1, x--,D--,k--,continue
152 D, k, x = D-1, k-1, x-1
153 continue
154 } else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) {
155 // if (x,y-1) is labelled D-1, x, D--,k++, continue
156 D, k = D-1, k+1
157 continue
158 }
159 // if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue
160 y := x - k
161 ans = ans.prepend(x+e.lx-1, y+e.ly-1)
162 x--
163 }
164 return ans
165}
166
167// start at (x,y), go up the diagonal as far as possible,
168// and label the result with d
169func (e *editGraph) lookForward(k, relx int) int {
170 rely := relx - k
171 x, y := relx+e.lx, rely+e.ly
172 if x < e.ux && y < e.uy {
173 x += e.seqs.commonPrefixLen(x, e.ux, y, e.uy)
174 }
175 return x
176}
177
178func (e *editGraph) setForward(d, k, relx int) {
179 x := e.lookForward(k, relx)
180 e.vf.set(d, k, x-e.lx)
181}
182
183func (e *editGraph) getForward(d, k int) int {
184 x := e.vf.get(d, k)
185 return x
186}
187
188// --- BACKWARD ---
189
190// bdone decides if the backward path has reached the lower left corner
191func (e *editGraph) bdone(D, k int) (bool, lcs) {
192 // x, y, k are relative to the rectangle
193 x := e.vb.get(D, k)
194 y := x - (k + e.delta)
195 if x == 0 && y == 0 {
196 return true, e.backwardlcs(D, k)
197 }
198 return false, nil
199}
200
201// run the backward algorithm, until success or up to the limit on D.
202// (used only by tests)
203func backward(e *editGraph) lcs {
204 e.setBackward(0, 0, e.ux)
205 if ok, ans := e.bdone(0, 0); ok {
206 return ans
207 }
208 // from D to D+1
209 for D := range e.limit {
210 e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
211 if ok, ans := e.bdone(D+1, -(D + 1)); ok {
212 return ans
213 }
214 e.setBackward(D+1, D+1, e.getBackward(D, D))
215 if ok, ans := e.bdone(D+1, D+1); ok {
216 return ans
217 }
218 for k := -D + 1; k <= D-1; k += 2 {
219 // these are tricky and easy to get wrong
220 lookv := e.lookBackward(k, e.getBackward(D, k-1))
221 lookh := e.lookBackward(k, e.getBackward(D, k+1)-1)
222 if lookv < lookh {
223 e.setBackward(D+1, k, lookv)
224 } else {
225 e.setBackward(D+1, k, lookh)
226 }
227 if ok, ans := e.bdone(D+1, k); ok {
228 return ans
229 }
230 }
231 }
232
233 // D is too large
234 // find the D path with minimal x+y inside the rectangle and
235 // use that to compute the part of the lcs found
236 kmax := -e.limit - 1
237 diagmin := 1 << 25
238 for k := -e.limit; k <= e.limit; k += 2 {
239 x := e.getBackward(e.limit, k)
240 y := x - (k + e.delta)
241 if x+y < diagmin && x >= 0 && y >= 0 {
242 diagmin, kmax = x+y, k
243 }
244 }
245 if kmax < -e.limit {
246 panic(fmt.Sprintf("no paths when limit=%d?", e.limit))
247 }
248 return e.backwardlcs(e.limit, kmax)
249}
250
251// recover the lcs by backtracking
252func (e *editGraph) backwardlcs(D, k int) lcs {
253 var ans lcs
254 for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != e.uy; {
255 if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) {
256 // D--, k--, x unchanged
257 D, k = D-1, k-1
258 continue
259 } else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) {
260 // D--, k++, x++
261 D, k, x = D-1, k+1, x+1
262 continue
263 }
264 y := x - (k + e.delta)
265 ans = ans.append(x+e.lx, y+e.ly)
266 x++
267 }
268 return ans
269}
270
271// start at (x,y), go down the diagonal as far as possible,
272func (e *editGraph) lookBackward(k, relx int) int {
273 rely := relx - (k + e.delta) // forward k = k + e.delta
274 x, y := relx+e.lx, rely+e.ly
275 if x > 0 && y > 0 {
276 x -= e.seqs.commonSuffixLen(0, x, 0, y)
277 }
278 return x
279}
280
281// convert to rectangle, and label the result with d
282func (e *editGraph) setBackward(d, k, relx int) {
283 x := e.lookBackward(k, relx)
284 e.vb.set(d, k, x-e.lx)
285}
286
287func (e *editGraph) getBackward(d, k int) int {
288 x := e.vb.get(d, k)
289 return x
290}
291
292// -- TWOSIDED ---
293
294func twosided(e *editGraph) lcs {
295 // The termination condition could be improved, as either the forward
296 // or backward pass could succeed before Myers' Lemma applies.
297 // Aside from questions of efficiency (is the extra testing cost-effective)
298 // this is more likely to matter when e.limit is reached.
299 e.setForward(0, 0, e.lx)
300 e.setBackward(0, 0, e.ux)
301
302 // from D to D+1
303 for D := range e.limit {
304 // just finished a backwards pass, so check
305 if got, ok := e.twoDone(D, D); ok {
306 return e.twolcs(D, D, got)
307 }
308 // do a forwards pass (D to D+1)
309 e.setForward(D+1, -(D + 1), e.getForward(D, -D))
310 e.setForward(D+1, D+1, e.getForward(D, D)+1)
311 for k := -D + 1; k <= D-1; k += 2 {
312 // these are tricky and easy to get backwards
313 lookv := e.lookForward(k, e.getForward(D, k-1)+1)
314 lookh := e.lookForward(k, e.getForward(D, k+1))
315 if lookv > lookh {
316 e.setForward(D+1, k, lookv)
317 } else {
318 e.setForward(D+1, k, lookh)
319 }
320 }
321 // just did a forward pass, so check
322 if got, ok := e.twoDone(D+1, D); ok {
323 return e.twolcs(D+1, D, got)
324 }
325 // do a backward pass, D to D+1
326 e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
327 e.setBackward(D+1, D+1, e.getBackward(D, D))
328 for k := -D + 1; k <= D-1; k += 2 {
329 // these are tricky and easy to get wrong
330 lookv := e.lookBackward(k, e.getBackward(D, k-1))
331 lookh := e.lookBackward(k, e.getBackward(D, k+1)-1)
332 if lookv < lookh {
333 e.setBackward(D+1, k, lookv)
334 } else {
335 e.setBackward(D+1, k, lookh)
336 }
337 }
338 }
339
340 // D too large. combine a forward and backward partial lcs
341 // first, a forward one
342 kmax := -e.limit - 1
343 diagmax := -1
344 for k := -e.limit; k <= e.limit; k += 2 {
345 x := e.getForward(e.limit, k)
346 y := x - k
347 if x+y > diagmax && x <= e.ux && y <= e.uy {
348 diagmax, kmax = x+y, k
349 }
350 }
351 if kmax < -e.limit {
352 panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit))
353 }
354 lcs := e.forwardlcs(e.limit, kmax)
355 // now a backward one
356 // find the D path with minimal x+y inside the rectangle and
357 // use that to compute the lcs
358 diagmin := 1 << 25 // infinity
359 for k := -e.limit; k <= e.limit; k += 2 {
360 x := e.getBackward(e.limit, k)
361 y := x - (k + e.delta)
362 if x+y < diagmin && x >= 0 && y >= 0 {
363 diagmin, kmax = x+y, k
364 }
365 }
366 if kmax < -e.limit {
367 panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit))
368 }
369 lcs = append(lcs, e.backwardlcs(e.limit, kmax)...)
370 // These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs)
371 ans := lcs.fix()
372 return ans
373}
374
375// Does Myers' Lemma apply?
376func (e *editGraph) twoDone(df, db int) (int, bool) {
377 if (df+db+e.delta)%2 != 0 {
378 return 0, false // diagonals cannot overlap
379 }
380 kmin := max(-df, -db+e.delta)
381 kmax := db + e.delta
382 if df < kmax {
383 kmax = df
384 }
385 for k := kmin; k <= kmax; k += 2 {
386 x := e.vf.get(df, k)
387 u := e.vb.get(db, k-e.delta)
388 if u <= x {
389 // is it worth looking at all the other k?
390 for l := k; l <= kmax; l += 2 {
391 x := e.vf.get(df, l)
392 y := x - l
393 u := e.vb.get(db, l-e.delta)
394 v := u - l
395 if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux {
396 return l, true
397 }
398 }
399 return k, true
400 }
401 }
402 return 0, false
403}
404
405func (e *editGraph) twolcs(df, db, kf int) lcs {
406 // db==df || db+1==df
407 x := e.vf.get(df, kf)
408 y := x - kf
409 kb := kf - e.delta
410 u := e.vb.get(db, kb)
411 v := u - kf
412
413 // Myers proved there is a df-path from (0,0) to (u,v)
414 // and a db-path from (x,y) to (N,M).
415 // In the first case the overall path is the forward path
416 // to (u,v) followed by the backward path to (N,M).
417 // In the second case the path is the backward path to (x,y)
418 // followed by the forward path to (x,y) from (0,0).
419
420 // Look for some special cases to avoid computing either of these paths.
421 if x == u {
422 // "babaab" "cccaba"
423 // already patched together
424 lcs := e.forwardlcs(df, kf)
425 lcs = append(lcs, e.backwardlcs(db, kb)...)
426 return lcs.sort()
427 }
428
429 // is (u-1,v) or (u,v-1) labelled df-1?
430 // if so, that forward df-1-path plus a horizontal or vertical edge
431 // is the df-path to (u,v), then plus the db-path to (N,M)
432 if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 {
433 // "aabbab" "cbcabc"
434 lcs := e.forwardlcs(df-1, u-1-v)
435 lcs = append(lcs, e.backwardlcs(db, kb)...)
436 return lcs.sort()
437 }
438 if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u {
439 // "abaabb" "bcacab"
440 lcs := e.forwardlcs(df-1, u-(v-1))
441 lcs = append(lcs, e.backwardlcs(db, kb)...)
442 return lcs.sort()
443 }
444
445 // The path can't possibly contribute to the lcs because it
446 // is all horizontal or vertical edges
447 if u == 0 || v == 0 || x == e.ux || y == e.uy {
448 // "abaabb" "abaaaa"
449 if u == 0 || v == 0 {
450 return e.backwardlcs(db, kb)
451 }
452 return e.forwardlcs(df, kf)
453 }
454
455 // is (x+1,y) or (x,y+1) labelled db-1?
456 if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 {
457 // "bababb" "baaabb"
458 lcs := e.backwardlcs(db-1, kb+1)
459 lcs = append(lcs, e.forwardlcs(df, kf)...)
460 return lcs.sort()
461 }
462 if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x {
463 // "abbbaa" "cabacc"
464 lcs := e.backwardlcs(db-1, kb-1)
465 lcs = append(lcs, e.forwardlcs(df, kf)...)
466 return lcs.sort()
467 }
468
469 // need to compute another path
470 // "aabbaa" "aacaba"
471 lcs := e.backwardlcs(db, kb)
472 oldx, oldy := e.ux, e.uy
473 e.ux = u
474 e.uy = v
475 lcs = append(lcs, forward(e)...)
476 e.ux, e.uy = oldx, oldy
477 return lcs.sort()
478}