Merge pull request #86 from ProgramFan/master

Michael Muré created 6 years ago

Implement CJK support in termui

Change summary

util/text/left_padded.go |  19 +-
util/text/text.go        | 306 +++++++++++++++++++++++++++++++----------
util/text/text_test.go   | 109 ++++++++++++++
3 files changed, 342 insertions(+), 92 deletions(-)

Detailed changes

util/text/left_padded.go 🔗

@@ -3,25 +3,26 @@ package text
 import (
 	"bytes"
 	"fmt"
+	"github.com/mattn/go-runewidth"
 	"strings"
 )
 
-// LeftPadMaxLine pads a string on the left by a specified amount and pads the string on the right to fill the maxLength
+// LeftPadMaxLine pads a string on the left by a specified amount and pads the
+// string on the right to fill the maxLength
 func LeftPadMaxLine(text string, length, leftPad int) string {
-	runes := []rune(text)
+	var rightPart string = text
 
+	scrWidth := runewidth.StringWidth(text)
 	// truncate and ellipse if needed
-	if len(runes)+leftPad > length {
-		runes = append(runes[:(length-leftPad-1)], '…')
-	}
-
-	if len(runes)+leftPad < length {
-		runes = append(runes, []rune(strings.Repeat(" ", length-len(runes)-leftPad))...)
+	if scrWidth+leftPad > length {
+		rightPart = runewidth.Truncate(text, length-leftPad, "…")
+	} else if scrWidth+leftPad < length {
+		rightPart = runewidth.FillRight(text, length-leftPad)
 	}
 
 	return fmt.Sprintf("%s%s",
 		strings.Repeat(" ", leftPad),
-		string(runes),
+		rightPart,
 	)
 }

util/text/text.go 🔗

@@ -1,12 +1,18 @@
 package text
 
 import (
-	"bytes"
-	"strings"
-
 	"github.com/mattn/go-runewidth"
+	"strings"
+	"unicode/utf8"
 )
 
+// Force runewidth not to treat ambiguous runes as wide chars, so that things
+// like unicode ellipsis/up/down/left/right glyphs can have correct runewidth
+// and can be displayed correctly in terminals.
+func init() {
+	runewidth.DefaultCondition.EastAsianWidth = false
+}
+
 // Wrap a text for an exact line size
 // Handle properly terminal color escape code
 func Wrap(text string, lineWidth int) (string, int) {
@@ -16,98 +22,248 @@ func Wrap(text string, lineWidth int) (string, int) {
 // Wrap a text for an exact line size with a left padding
 // Handle properly terminal color escape code
 func WrapLeftPadded(text string, lineWidth int, leftPad int) (string, int) {
-	var textBuffer bytes.Buffer
-	var lineBuffer bytes.Buffer
-	nbLine := 1
-	firstLine := true
+	var lines []string
+	nbLine := 0
 	pad := strings.Repeat(" ", leftPad)
 
 	// tabs are formatted as 4 spaces
-	text = strings.Replace(text, "\t", "    ", 4)
-
+	text = strings.Replace(text, "\t", "    ", -1)
+	// NOTE: text is first split into lines because softwrapLine requires input without "\n".
 	for _, line := range strings.Split(text, "\n") {
-		spaceLeft := lineWidth - leftPad
-
-		if !firstLine {
-			textBuffer.WriteString("\n")
+		if line == "" || strings.TrimSpace(line) == "" {
+			lines = append(lines, "")
 			nbLine++
+		} else {
+			wrapped := softwrapLine(line, lineWidth-leftPad)
+			firstLine := true
+			for _, seg := range strings.Split(wrapped, "\n") {
+				if firstLine {
+					lines = append(lines, pad+strings.TrimRight(seg, " "))
+					firstLine = false
+				} else {
+					lines = append(lines, pad+strings.TrimSpace(seg))
+				}
+				nbLine++
+			}
 		}
+	}
+	return strings.Join(lines, "\n"), nbLine
+}
 
-		firstWord := true
+// Break a line into several lines so that each line consumes at most
+// 'textWidth' cells.  Lines break at groups of white spaces and multibyte
+// chars. Nothing is removed from the original text so that it behaves like a
+// softwrap.
+//
+// Required: The line shall not contain '\n'
+//
+// WRAPPING ALGORITHM: The line is broken into non-breakable chunks, then line
+// breaks ("\n") are inserted between these groups so that the total length
+// between breaks does not exceed the required width. Words that are longer than
+// the textWidth are broken into pieces no longer than textWidth.
+//
+func softwrapLine(line string, textWidth int) string {
+	// NOTE: terminal escapes are stripped out of the line so the algorithm is
+	// simpler. Do not try to mix them in the wrapping algorithm, as it can get
+	// complicated quickly.
+	line1, termEscapes := extractTermEscapes(line)
+
+	chunks := segmentLine(line1)
+	// Reverse the chunk array so we can use it as a stack.
+	for i, j := 0, len(chunks)-1; i < j; i, j = i+1, j-1 {
+		chunks[i], chunks[j] = chunks[j], chunks[i]
+	}
+	var line2 string = ""
+	var width int = 0
+	for len(chunks) > 0 {
+		thisWord := chunks[len(chunks)-1]
+		wl := wordLen(thisWord)
+		if width+wl <= textWidth {
+			line2 += chunks[len(chunks)-1]
+			chunks = chunks[:len(chunks)-1]
+			width += wl
+			if width == textWidth && len(chunks) > 0 {
+				// NOTE: new line begins when current line is full and there are more
+				// chunks to come.
+				line2 += "\n"
+				width = 0
+			}
+		} else if wl > textWidth {
+			// NOTE: By default, long words are split to fill the remaining space.
+			// But if the long word is the first non-space word in the middle of the
+			// line, preceding spaces shall not be counted in word splitting.
+			splitWidth := textWidth - width
+			if strings.HasSuffix(line2, "\n"+strings.Repeat(" ", width)) {
+				splitWidth += width
+			}
+			left, right := splitWord(chunks[len(chunks)-1], splitWidth)
+			chunks[len(chunks)-1] = right
+			line2 += left + "\n"
+			width = 0
+		} else {
+			line2 += "\n"
+			width = 0
+		}
+	}
 
-		for _, word := range strings.Split(line, " ") {
-			wordLength := wordLen(word)
+	line3 := applyTermEscapes(line2, termEscapes)
+	return line3
+}
 
-			if !firstWord {
-				lineBuffer.WriteString(" ")
-				spaceLeft -= 1
+// escapeItem: Storage of terminal escapes in a line. 'item' is the actual
+// escape command, and 'pos' is the index in the rune array where the 'item'
+// shall be inserted back. For example, the escape item in "F\x1b33mox" is
+// {"\x1b33m", 1}.
+type escapeItem struct {
+	item string
+	pos  int
+}
 
-				if spaceLeft <= 0 {
-					textBuffer.WriteString(pad + strings.TrimRight(lineBuffer.String(), " "))
-					textBuffer.WriteString("\n")
-					lineBuffer.Reset()
-					spaceLeft = lineWidth - leftPad
-					nbLine++
-					firstLine = false
-				}
+// Extract terminal escapes out of a line, returns a new line without terminal
+// escapes and a slice of escape items. The terminal escapes can be inserted
+// back into the new line at rune index 'item.pos' to recover the original line.
+//
+// Required: The line shall not contain "\n"
+//
+func extractTermEscapes(line string) (string, []escapeItem) {
+	var termEscapes []escapeItem
+	var line1 string
+
+	pos := 0
+	item := ""
+	occupiedRuneCount := 0
+	inEscape := false
+	for i, r := range []rune(line) {
+		if r == '\x1b' {
+			pos = i
+			item = string(r)
+			inEscape = true
+			continue
+		}
+		if inEscape {
+			item += string(r)
+			if r == 'm' {
+				termEscapes = append(termEscapes, escapeItem{item, pos - occupiedRuneCount})
+				occupiedRuneCount += utf8.RuneCountInString(item)
+				inEscape = false
 			}
+			continue
+		}
+		line1 += string(r)
+	}
+
+	return line1, termEscapes
+}
 
-			// Word fit in the current line
-			if spaceLeft >= wordLength {
-				lineBuffer.WriteString(word)
-				spaceLeft -= wordLength
-				firstWord = false
+// Apply the extracted terminal escapes to the edited line. The only edit
+// allowed is to insert "\n" like that in softwrapLine. Callers shall ensure
+// this since this function is not able to check it.
+func applyTermEscapes(line string, escapes []escapeItem) string {
+	if len(escapes) == 0 {
+		return line
+	}
+
+	var out string = ""
+
+	currPos := 0
+	currItem := 0
+	for _, r := range line {
+		if currItem < len(escapes) && currPos == escapes[currItem].pos {
+			// NOTE: We avoid terminal escapes at the end of a line by moving them
+			// just past the end of the line, so that algorithms that trim right
+			// spaces are happy. But algorithms that trim left spaces are still unhappy.
+			if r == '\n' {
+				out += "\n" + escapes[currItem].item
 			} else {
-				// Break a word longer than a line
-				if wordLength > lineWidth {
-					for wordLength > 0 && wordLen(word) > 0 {
-						l := minInt(spaceLeft, wordLength)
-						part, leftover := splitWord(word, l)
-						word = leftover
-						wordLength = wordLen(word)
-
-						lineBuffer.WriteString(part)
-						textBuffer.WriteString(pad)
-						textBuffer.Write(lineBuffer.Bytes())
-						lineBuffer.Reset()
-
-						spaceLeft -= l
-
-						if spaceLeft <= 0 {
-							textBuffer.WriteString("\n")
-							nbLine++
-							spaceLeft = lineWidth - leftPad
-						}
-
-						if wordLength <= 0 {
-							break
-						}
-					}
-				} else {
-					// Normal break
-					textBuffer.WriteString(pad + strings.TrimRight(lineBuffer.String(), " "))
-					textBuffer.WriteString("\n")
-					lineBuffer.Reset()
-					lineBuffer.WriteString(word)
-					firstWord = false
-					spaceLeft = lineWidth - leftPad - wordLength
-					nbLine++
-				}
+				out += escapes[currItem].item + string(r)
+				currPos++
 			}
+			currItem++
+		} else {
+			if r != '\n' {
+				currPos++
+			}
+			out += string(r)
 		}
+	}
 
-		if lineBuffer.Len() > 0 {
-			textBuffer.WriteString(pad + strings.TrimRight(lineBuffer.String(), " "))
-			lineBuffer.Reset()
-		}
+	return out
+}
 
-		firstLine = false
+// Segment a line into chunks, where each chunk consists of chars with the same
+// type and is not breakable.
+func segmentLine(s string) []string {
+	var chunks []string
+
+	var word string
+	wordType := none
+	flushWord := func() {
+		chunks = append(chunks, word)
+		word = ""
+		wordType = none
 	}
 
-	return textBuffer.String(), nbLine
+	for _, r := range s {
+		// A WIDE_CHAR itself constitutes a chunk.
+		thisType := runeType(r)
+		if thisType == wideChar {
+			if wordType != none {
+				flushWord()
+			}
+			chunks = append(chunks, string(r))
+			continue
+		}
+		// Other type of chunks starts with a char of that type, and ends with a
+		// char with different type or end of string.
+		if thisType != wordType {
+			if wordType != none {
+				flushWord()
+			}
+			word = string(r)
+			wordType = thisType
+		} else {
+			word += string(r)
+		}
+	}
+	if word != "" {
+		flushWord()
+	}
+
+	return chunks
+}
+
+// Rune categories
+//
+// These categories are defined so that each category forms a non-breakable
+// chunk. They ARE NOT the same as Unicode code point categories.
+//
+const (
+	none int = iota
+	wideChar
+	invisible
+	shortUnicode
+	space
+	visibleAscii
+)
+
+// Determine the category of a rune.
+func runeType(r rune) int {
+	rw := runewidth.RuneWidth(r)
+	if rw > 1 {
+		return wideChar
+	} else if rw == 0 {
+		return invisible
+	} else if r > 127 {
+		return shortUnicode
+	} else if r == ' ' {
+		return space
+	} else {
+		return visibleAscii
+	}
 }
 
-// wordLen return the length of a word, while ignoring the terminal escape sequences
+// wordLen returns the length of a word, while ignoring the terminal escape
+// sequences
 func wordLen(word string) int {
 	length := 0
 	escape := false
@@ -116,11 +272,9 @@ func wordLen(word string) int {
 		if char == '\x1b' {
 			escape = true
 		}
-
 		if !escape {
 			length += runewidth.RuneWidth(rune(char))
 		}
-
 		if char == 'm' {
 			escape = false
 		}

util/text/text_test.go 🔗

@@ -1,6 +1,7 @@
 package text
 
 import (
+	"reflect"
 	"strings"
 	"testing"
 )
@@ -43,7 +44,7 @@ func TestWrap(t *testing.T) {
 		// A tab counts as 4 characters.
 		{
 			"foo\nb\t r\n baz",
-			"foo\nb\n  r\n baz",
+			"foo\nb\nr\n baz",
 			4,
 		},
 		// Trailing whitespace is removed after used for wrapping.
@@ -86,19 +87,31 @@ func TestWrap(t *testing.T) {
 		// Complete example:
 		{
 			" This is a list: \n\n\t* foo\n\t* bar\n\n\n\t* baz  \nBAM    ",
-			" This\nis a\nlist:\n\n\n    *\nfoo\n    *\nbar\n\n\n    *\nbaz\nBAM\n",
+			" This\nis a\nlist:\n\n    *\nfoo\n    *\nbar\n\n\n    *\nbaz\nBAM\n",
 			6,
 		},
 		// Handle chinese (wide characters)
 		{
-			"婞一枳郲逴靲屮蜧曀殳，掫乇峔掮傎溒兀緉冘仜。",
-			"婞一枳郲逴靲\n屮蜧曀殳，掫\n乇峔掮傎溒兀\n緉冘仜。",
+			"一只敏捷的狐狸跳过了一只懒狗。",
+			"一只敏捷的狐\n狸跳过了一只\n懒狗。",
 			12,
 		},
 		// Handle chinese with colors
 		{
-			"婞一枳郲逴\x1b[31m靲屮蜧曀殳，掫乇峔掮傎溒\x1b[0m兀緉冘仜。",
-			"婞一枳郲逴\x1b[31m靲\n屮蜧曀殳，掫\n乇峔掮傎溒\x1b[0m兀\n緉冘仜。",
+			"一只敏捷的\x1b[31m狐狸跳过\x1b[0m了一只懒狗。",
+			"一只敏捷的\x1b[31m狐\n狸跳过\x1b[0m了一只\n懒狗。",
+			12,
+		},
+		// Handle mixed wide and short characters
+		{
+			"敏捷 A quick 的狐狸 fox 跳过 jumps over a lazy 了一只懒狗 dog。",
+			"敏捷 A quick\n的狐狸 fox\n跳过 jumps\nover a lazy\n了一只懒狗\ndog。",
+			12,
+		},
+		// Handle mixed wide and short characters with color
+		{
+			"敏捷 A \x1b31mquick 的狐狸 fox 跳\x1b0m过 jumps over a lazy 了一只懒狗 dog。",
+			"敏捷 A \x1b31mquick\n的狐狸 fox\n跳\x1b0m过 jumps\nover a lazy\n了一只懒狗\ndog。",
 			12,
 		},
 	}
@@ -106,7 +119,7 @@ func TestWrap(t *testing.T) {
 	for i, tc := range cases {
 		actual, lines := Wrap(tc.Input, tc.Lim)
 		if actual != tc.Output {
-			t.Fatalf("Case %d Input:\n\n`%s`\n\nExpected Output:\n\n`%s`\n\nActual Output:\n`\n%s`",
+			t.Fatalf("Case %d Input:\n\n`%s`\n\nExpected Output:\n\n`%s`\n\nActual Output:\n\n`%s`",
 				i, tc.Input, tc.Output, actual)
 		}
 
@@ -144,6 +157,14 @@ func TestWrapLeftPadded(t *testing.T) {
     蚗佶庂咺丌，輀鈁乇彽洢溦洰氶乇构碨洐巿阹。`,
 			59, 4,
 		},
+		// Handle long unbreakable words in a full sentence
+		{
+			"OT: there are alternatives to maintainer-/user-set priority, e.g. \"[user pain](http://www.lostgarden.com/2008/05/improving-bug-triage-with-user-pain.html)\".",
+			`    OT: there are alternatives to maintainer-/user-set
+    priority, e.g. "[user pain](http://www.lostgarden.com/
+    2008/05/improving-bug-triage-with-user-pain.html)".`,
+			58, 4,
+		},
 	}
 
 	for i, tc := range cases {
@@ -273,3 +294,77 @@ func TestSplitWord(t *testing.T) {
 		}
 	}
 }
+
+func TestExtractApplyTermEscapes(t *testing.T) {
+	cases := []struct {
+		Input       string
+		Output      string
+		TermEscapes []escapeItem
+	}{
+		// A plain ascii line with escapes.
+		{
+			"This \x1b[31mis an\x1b[0m example.",
+			"This is an example.",
+			[]escapeItem{{"\x1b[31m", 5}, {"\x1b[0m", 10}},
+		},
+		// A plain wide line with escapes.
+		{
+			"一只敏捷\x1b[31m的狐狸\x1b[0m跳过了一只懒狗。",
+			"一只敏捷的狐狸跳过了一只懒狗。",
+			[]escapeItem{{"\x1b[31m", 4}, {"\x1b[0m", 7}},
+		},
+		// A normal-wide mixed line with escapes.
+		{
+			"一只 A Quick 敏捷\x1b[31m的狐 Fox 狸\x1b[0m跳过了Dog一只懒狗。",
+			"一只 A Quick 敏捷的狐 Fox 狸跳过了Dog一只懒狗。",
+			[]escapeItem{{"\x1b[31m", 13}, {"\x1b[0m", 21}},
+		},
+	}
+
+	for i, tc := range cases {
+		line2, escapes := extractTermEscapes(tc.Input)
+		if line2 != tc.Output || !reflect.DeepEqual(escapes, tc.TermEscapes) {
+			t.Fatalf("Case %d Input:\n\n`%s`\n\nExpected Output:\n\nLine: `%s`\nEscapes: `%+v`\n\nActual Output:\n\nLine: `%s`\nEscapes: `%+v`\n\n",
+				i, tc.Input, tc.Output, tc.TermEscapes, line2, escapes)
+		}
+		line3 := applyTermEscapes(line2, escapes)
+		if line3 != tc.Input {
+			t.Fatalf("Case %d Input:\n\n`%s`\n\nExpected Result:\n\n`%s`\n\nActual Result:\n\n`%s`\n\n",
+				i, tc.Input, tc.Input, line3)
+		}
+	}
+}
+
+func TestSegmentLines(t *testing.T) {
+	cases := []struct {
+		Input  string
+		Output []string
+	}{
+		// A plain ascii line.
+		{
+			"This is an example.",
+			[]string{"This", " ", "is", " ", "an", " ", "example."},
+		},
+		// A plain wide line.
+		{
+			"一只敏捷的狐狸跳过了一只懒狗。",
+			[]string{"一", "只", "敏", "捷", "的", "狐", "狸", "跳", "过",
+				"了", "一", "只", "懒", "狗", "。"},
+		},
+		// A complex sentence.
+		{
+			"This is a 'complex' example, where   一只 and English 混合了。",
+			[]string{"This", " ", "is", " ", "a", " ", "'complex'", " ", "example,",
+				" ", "where", "   ", "一", "只", " ", "and", " ", "English", " ", "混",
+				"合", "了", "。"},
+		},
+	}
+
+	for i, tc := range cases {
+		chunks := segmentLine(tc.Input)
+		if !reflect.DeepEqual(chunks, tc.Output) {
+			t.Fatalf("Case %d Input:\n\n`%s`\n\nExpected Output:\n\n`[%s]`\n\nActual Output:\n\n`[%s]`\n\n",
+				i, tc.Input, strings.Join(tc.Output, ", "), strings.Join(chunks, ", "))
+		}
+	}
+}