ndiff.go

 1// Copyright 2022 The Go Authors. All rights reserved.
 2// Use of this source code is governed by a BSD-style
 3// license that can be found in the LICENSE file.
 4
 5package udiff
 6
 7import (
 8	"bytes"
 9	"unicode/utf8"
10
11	"github.com/aymanbagabas/go-udiff/lcs"
12)
13
14// Strings computes the differences between two strings.
15// The resulting edits respect rune boundaries.
16func Strings(before, after string) []Edit {
17	if before == after {
18		return nil // common case
19	}
20
21	if isASCII(before) && isASCII(after) {
22		// TODO(adonovan): opt: specialize diffASCII for strings.
23		return diffASCII([]byte(before), []byte(after))
24	}
25	return diffRunes([]rune(before), []rune(after))
26}
27
28// Bytes computes the differences between two byte slices.
29// The resulting edits respect rune boundaries.
30func Bytes(before, after []byte) []Edit {
31	if bytes.Equal(before, after) {
32		return nil // common case
33	}
34
35	if isASCII(before) && isASCII(after) {
36		return diffASCII(before, after)
37	}
38	return diffRunes(runes(before), runes(after))
39}
40
41func diffASCII(before, after []byte) []Edit {
42	diffs := lcs.DiffBytes(before, after)
43
44	// Convert from LCS diffs.
45	res := make([]Edit, len(diffs))
46	for i, d := range diffs {
47		res[i] = Edit{d.Start, d.End, string(after[d.ReplStart:d.ReplEnd])}
48	}
49	return res
50}
51
52func diffRunes(before, after []rune) []Edit {
53	diffs := lcs.DiffRunes(before, after)
54
55	// The diffs returned by the lcs package use indexes
56	// into whatever slice was passed in.
57	// Convert rune offsets to byte offsets.
58	res := make([]Edit, len(diffs))
59	lastEnd := 0
60	utf8Len := 0
61	for i, d := range diffs {
62		utf8Len += runesLen(before[lastEnd:d.Start]) // text between edits
63		start := utf8Len
64		utf8Len += runesLen(before[d.Start:d.End]) // text deleted by this edit
65		res[i] = Edit{start, utf8Len, string(after[d.ReplStart:d.ReplEnd])}
66		lastEnd = d.End
67	}
68	return res
69}
70
71// runes is like []rune(string(bytes)) without the duplicate allocation.
72func runes(bytes []byte) []rune {
73	n := utf8.RuneCount(bytes)
74	runes := make([]rune, n)
75	for i := range n {
76		r, sz := utf8.DecodeRune(bytes)
77		bytes = bytes[sz:]
78		runes[i] = r
79	}
80	return runes
81}
82
83// runesLen returns the length in bytes of the UTF-8 encoding of runes.
84func runesLen(runes []rune) (len int) {
85	for _, r := range runes {
86		len += utf8.RuneLen(r)
87	}
88	return len
89}
90
91// isASCII reports whether s contains only ASCII.
92func isASCII[S string | []byte](s S) bool {
93	for i := 0; i < len(s); i++ {
94		if s[i] >= utf8.RuneSelf {
95			return false
96		}
97	}
98	return true
99}