Implement a new wrap algorithm and pass all text tests

Yang Zhang created

Change summary

util/text/text.go      | 239 +++++++++++++++++++++++++++++++------------
util/text/text_test.go |  15 +-
2 files changed, 177 insertions(+), 77 deletions(-)

Detailed changes

util/text/text.go 🔗

@@ -1,9 +1,9 @@
 package text
 
 import (
-	"bytes"
 	"github.com/mattn/go-runewidth"
 	"strings"
+	"unicode/utf8"
 )
 
 // Wrap a text for an exact line size
@@ -15,96 +15,195 @@ func Wrap(text string, lineWidth int) (string, int) {
 // Wrap a text for an exact line size with a left padding
 // Handle properly terminal color escape code
 func WrapLeftPadded(text string, lineWidth int, leftPad int) (string, int) {
-	var textBuffer bytes.Buffer
-	nbLine := 0
 	pad := strings.Repeat(" ", leftPad)
+	var lines []string
+	nbLine := 0
 
 	// tabs are formatted as 4 spaces
-	text = strings.Replace(text, "\t", "    ", 4)
-	wrapped := wrapText(text, lineWidth-leftPad)
-	for _, line := range strings.Split(wrapped, "\n") {
-		textBuffer.WriteString(pad + line)
-		textBuffer.WriteString("\n")
-		nbLine++
+	text = strings.Replace(text, "\t", "    ", -1)
+	for _, line := range strings.Split(text, "\n") {
+		if line == "" || strings.TrimSpace(line) == "" {
+			lines = append(lines, "")
+			nbLine++
+		} else {
+			wrapped := softwrapLine(line, lineWidth-leftPad)
+			firstLine := true
+			for _, seg := range strings.Split(wrapped, "\n") {
+				if firstLine {
+					lines = append(lines, pad+strings.TrimRight(seg, " "))
+					firstLine = false
+				} else {
+					lines = append(lines, pad+strings.TrimSpace(seg))
+				}
+				nbLine++
+			}
+		}
 	}
-	return textBuffer.String(), nbLine
+	return strings.Join(lines, "\n"), nbLine
 }
 
-// Wrap text so that each line fills at most w cells. Lines break at word
-// boundary or multibyte chars.
-//
-// Wrapping Algorithm: Treat the text as a sequence of words, with each word be
-// an alphanumeric word, or a multibyte char. We scan through the text and
-// construct the word, and flush the word into the paragraph once a word is
-// ready. A word is ready when a word boundary is detected: a boundary char such
-// as '\n', '\t', and ' ' is encountered; a multibyte char is found; or a
-// multibyte to single-byte switch is encountered. '\n' is handled in a special
-// manner.
-func wrapText(s string, w int) string {
-	word := ""
-	out := ""
-
-	width := 0
-	firstWord := true
-	isMultibyteWord := false
-
-	flushWord := func() {
-		wl := wordLen(word)
-		if isMultibyteWord {
-			if width+wl > w {
-				out += "\n" + word
-				width = wl
+type EscapeItem struct {
+	item string
+	pos  int
+}
+
+func recordTermEscape(s string) (string, []EscapeItem) {
+	var result []EscapeItem
+	var newStr string
+
+	pos := 0
+	item := ""
+	occupiedRuneCount := 0
+	inEscape := false
+	for i, r := range []rune(s) {
+		if r == '\x1b' {
+			pos = i
+			item = string(r)
+			inEscape = true
+			continue
+		}
+		if inEscape {
+			item += string(r)
+			if r == 'm' {
+				result = append(result, EscapeItem{item: item, pos: pos - occupiedRuneCount})
+				occupiedRuneCount += utf8.RuneCountInString(item)
+				inEscape = false
+			}
+			continue
+		}
+		newStr += string(r)
+	}
+
+	return newStr, result
+}
+
+func replayTermEscape(s string, sequence []EscapeItem) string {
+	if len(sequence) == 0 {
+		return string(s)
+	}
+	// Assume the original string contains no newline and that wrapping only
+	// inserts newlines, so that we can recover the positions at which the
+	// terminal escapes must be re-inserted.
+	var out string = ""
+
+	currPos := 0
+	currItem := 0
+	for _, r := range []rune(s) {
+		if currItem < len(sequence) && currPos == sequence[currItem].pos {
+			if r == '\n' {
+				out += "\n" + sequence[currItem].item
 			} else {
-				out += word
-				width += wl
+				out += sequence[currItem].item + string(r)
+				currPos++
 			}
+			currItem++
 		} else {
-			if width == 0 {
-				out += word
-				width += wl
-			} else if width+wl+1 > w {
-				out += "\n" + word
-				width = wl
-			} else {
-				out += " " + word
-				width += wl + 1
+			if r != '\n' {
+				currPos++
 			}
+			out += string(r)
 		}
-		word = ""
 	}
 
-	for _, r := range []rune(s) {
-		cw := runewidth.RuneWidth(r)
-		if firstWord {
-			word = string(r)
-			isMultibyteWord = cw > 1
-			firstWord = false
+	return out
+}
+
+// Break a line into several lines so that each line consumes at most 'w' cells.
+// Lines break at groups of white space and at wide (multi-cell) chars. Nothing
+// is removed from the line, so it behaves like a softwrap.
+//
+// Required: The line shall not contain '\n' (so it is a single line).
+//
+// WRAPPING ALGORITHM: The line is broken into non-breakable groups, then line
+// breaks ("\n") are inserted between these groups so that the total width
+// between breaks does not exceed the required width. Words that are wider than
+// the width are split into several pieces as `M+M+...+N`.
+func softwrapLine(s string, w int) string {
+	newStr, termSeqs := recordTermEscape(s)
+
+	const (
+		WIDE_CHAR     = iota
+		INVISIBLE     = iota
+		SHORT_UNICODE = iota
+		SPACE         = iota
+		VISIBLE_ASCII = iota
+		NONE          = iota
+	)
+
+	// In order to simplify the terminal color sequence handling, we first strip
+	// them out of the text and record their position, then do the wrap. After
+	// that, we insert back these sequences.
+	runeType := func(r rune) int {
+		rw := runewidth.RuneWidth(r)
+		if rw > 1 {
+			return WIDE_CHAR
+		} else if rw == 0 {
+			return INVISIBLE
+		} else if r > 127 {
+			return SHORT_UNICODE
+		} else if r == ' ' {
+			return SPACE
+		} else {
+			return VISIBLE_ASCII
+		}
+	}
+
+	var chunks []string
+	var word string
+	wordType := NONE
+	for _, r := range []rune(newStr) {
+		// A WIDE_CHAR itself constitutes a group.
+		thisType := runeType(r)
+		if thisType == WIDE_CHAR {
+			chunks = append(chunks, string(r))
 			continue
 		}
-		if r == '\n' {
-			flushWord()
-			out += "\n"
-			width = 0
-		} else if r == ' ' || r == '\t' {
-			flushWord()
-		} else if cw > 1 {
-			flushWord()
-			word = string(r)
-			isMultibyteWord = true
-			word = string(r)
-		} else if cw == 1 && isMultibyteWord {
-			flushWord()
+		// Every other type of group starts with a char of that type and ends
+		// before a char of a different type, or at the end of the string.
+		if thisType != wordType {
+			if wordType != NONE {
+				chunks = append(chunks, word)
+			}
 			word = string(r)
-			isMultibyteWord = false
+			wordType = thisType
 		} else {
 			word += string(r)
 		}
 	}
-	// The text may end without newlines, ensure flushing it or we can lose the
-	// last word.
-	flushWord()
+	if word != "" {
+		chunks = append(chunks, word)
+	}
 
-	return out
+	var line string = ""
+	var width int = 0
+	// Reverse the chunk array so we can use it as a stack.
+	for i, j := 0, len(chunks)-1; i < j; i, j = i+1, j-1 {
+		chunks[i], chunks[j] = chunks[j], chunks[i]
+	}
+	for len(chunks) > 0 {
+		thisWord := chunks[len(chunks)-1]
+		wl := wordLen(thisWord)
+		if width+wl <= w {
+			line += chunks[len(chunks)-1]
+			chunks = chunks[:len(chunks)-1]
+			width += wl
+			if width == w && len(chunks) > 0{
+				line += "\n"
+				width = 0
+			}
+		} else if wl > w {
+			left, right := splitWord(chunks[len(chunks)-1], w)
+			line += left + "\n"
+			chunks[len(chunks)-1] = right
+			width = 0
+		} else {
+			line += "\n"
+			width = 0
+		}
+	}
+
+	line = replayTermEscape(line, termSeqs)
+	return line
 }
 
 // wordLen return the length of a word, while ignoring the terminal escape

util/text/text_test.go 🔗

@@ -5,6 +5,7 @@ import (
 	"testing"
 )
 
+
 func TestWrap(t *testing.T) {
 	cases := []struct {
 		Input, Output string
@@ -43,7 +44,7 @@ func TestWrap(t *testing.T) {
 		// A tab counts as 4 characters.
 		{
 			"foo\nb\t r\n baz",
-			"foo\nb\n  r\n baz",
+			"foo\nb\nr\n baz",
 			4,
 		},
 		// Trailing whitespace is removed after used for wrapping.
@@ -86,19 +87,19 @@ func TestWrap(t *testing.T) {
 		// Complete example:
 		{
 			" This is a list: \n\n\t* foo\n\t* bar\n\n\n\t* baz  \nBAM    ",
-			" This\nis a\nlist:\n\n\n    *\nfoo\n    *\nbar\n\n\n    *\nbaz\nBAM\n",
+			" This\nis a\nlist:\n\n    *\nfoo\n    *\nbar\n\n\n    *\nbaz\nBAM\n",
 			6,
 		},
 		// Handle chinese (wide characters)
 		{
-			"婞一枳郲逴靲屮蜧曀殳,掫乇峔掮傎溒兀緉冘仜。",
-			"婞一枳郲逴靲\n屮蜧曀殳,掫\n乇峔掮傎溒兀\n緉冘仜。",
+			"一只敏捷的狐狸跳过了一只懒狗。",
+			"一只敏捷的狐\n狸跳过了一只\n懒狗。",
 			12,
 		},
 		// Handle chinese with colors
 		{
-			"婞一枳郲逴\x1b[31m靲屮蜧曀殳,掫乇峔掮傎溒\x1b[0m兀緉冘仜。",
-			"婞一枳郲逴\x1b[31m靲\n屮蜧曀殳,掫\n乇峔掮傎溒\x1b[0m兀\n緉冘仜。",
+			"一只敏捷的\x1b[31m狐狸跳过\x1b[0m了一只懒狗。",
+			"一只敏捷的\x1b[31m狐\n狸跳过\x1b[0m了一只\n懒狗。",
 			12,
 		},
 	}
@@ -106,7 +107,7 @@ func TestWrap(t *testing.T) {
 	for i, tc := range cases {
 		actual, lines := Wrap(tc.Input, tc.Lim)
 		if actual != tc.Output {
-			t.Fatalf("Case %d Input:\n\n`%s`\n\nExpected Output:\n\n`%s`\n\nActual Output:\n`\n%s`",
+			t.Fatalf("Case %d Input:\n\n`%s`\n\nExpected Output:\n\n`%s`\n\nActual Output:\n\n`%s`",
 				i, tc.Input, tc.Output, actual)
 		}