gen_breaktest.go

  1//go:build generate
  2
// This program generates a Go file containing a slice of test cases based on
// the Unicode Character Database auxiliary data files. The command line
// arguments are as follows:
  6//
  7//   1. The name of the Unicode data file (just the filename, without extension).
  8//   2. The name of the locally generated Go file.
  9//   3. The name of the slice containing the test cases.
 10//   4. The name of the generator, for logging purposes.
 11//
 12//go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
 13//go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
 14//go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
 15//go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
 16
 17package main
 18
 19import (
 20	"bufio"
 21	"bytes"
 22	"errors"
 23	"fmt"
 24	"go/format"
 25	"io/ioutil"
 26	"log"
 27	"net/http"
 28	"os"
 29	"time"
 30)
 31
 32// We want to test against a specific version rather than the latest. When the
 33// package is upgraded to a new version, change these to generate new tests.
 34const (
 35	testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
 36)
 37
 38func main() {
 39	if len(os.Args) < 5 {
 40		fmt.Println("Not enough arguments, see code for details")
 41		os.Exit(1)
 42	}
 43
 44	log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
 45	log.SetFlags(0)
 46
 47	// Read text of testcases and parse into Go source code.
 48	src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
 49	if err != nil {
 50		log.Fatal(err)
 51	}
 52
 53	// Format the Go code.
 54	formatted, err := format.Source(src)
 55	if err != nil {
 56		log.Fatalln("gofmt:", err)
 57	}
 58
 59	// Write it out.
 60	log.Print("Writing to ", os.Args[2])
 61	if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
 62		log.Fatal(err)
 63	}
 64}
 65
 66// parse reads a break text file, either from a local file or from a URL. It
 67// parses the file data into Go source code representing the test cases.
 68func parse(url string) ([]byte, error) {
 69	log.Printf("Parsing %s", url)
 70	res, err := http.Get(url)
 71	if err != nil {
 72		return nil, err
 73	}
 74	body := res.Body
 75	defer body.Close()
 76
 77	buf := new(bytes.Buffer)
 78	buf.Grow(120 << 10)
 79	buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
 80
 81package uniseg
 82
 83// ` + os.Args[3] + ` are Grapheme testcases taken from
 84// ` + url + `
 85// on ` + time.Now().Format("January 2, 2006") + `. See
 86// https://www.unicode.org/license.html for the Unicode license agreement.
 87var ` + os.Args[3] + ` = []testCase {
 88`)
 89
 90	sc := bufio.NewScanner(body)
 91	num := 1
 92	var line []byte
 93	original := make([]byte, 0, 64)
 94	expected := make([]byte, 0, 64)
 95	for sc.Scan() {
 96		num++
 97		line = sc.Bytes()
 98		if len(line) == 0 || line[0] == '#' {
 99			continue
100		}
101		var comment []byte
102		if i := bytes.IndexByte(line, '#'); i >= 0 {
103			comment = bytes.TrimSpace(line[i+1:])
104			line = bytes.TrimSpace(line[:i])
105		}
106		original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
107		if err != nil {
108			return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
109		}
110		fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
111	}
112	if err := sc.Err(); err != nil {
113		return nil, err
114	}
115
116	// Check for final "# EOF", useful check if we're streaming via HTTP
117	if !bytes.Equal(line, []byte("# EOF")) {
118		return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
119	}
120	buf.WriteString("}\n")
121	return buf.Bytes(), nil
122}
123
// Break markers matched by parseRuneSequence via bytes.HasPrefix. The prefix*
// variants include the space that must follow the first marker of a sequence.
var (
	prefixBreak     = []byte("÷ ")
	prefixDontBreak = []byte("× ")
	breakOk         = []byte("÷")
	breakNo         = []byte("×")
)

// parseRuneSequence parses a rune + breaking opportunity sequence from b
// and appends the Go code for testcase.original to orig
// and appends the Go code for testcase.expected to exp.
// It returns the new orig and exp slices.
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
//
//	"\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//
// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
//
// The sequence must start with "÷ " or "× " (both are treated alike, as the
// start of the first group) and must end with ÷. Code points must be written
// as 4, 5 or 6 hex digits and be separated from the markers by single spaces.
// On any violation the returned slices are nil and the error describes the
// problem.
func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
	// Check for and remove first ÷ or ×.
	switch {
	case bytes.HasPrefix(b, prefixBreak):
		b = b[len(prefixBreak):]
	case bytes.HasPrefix(b, prefixDontBreak):
		b = b[len(prefixDontBreak):]
	default:
		return nil, nil, errors.New("expected ÷ or × as first character")
	}

	// boundary reports whether the most recent marker was ÷, i.e. whether the
	// next code point opens a new group.
	boundary := true
	exp = append(exp, "[][]rune{"...)
	for len(b) > 0 {
		if boundary {
			exp = append(exp, '{')
		}
		exp = append(exp, "0x"...)

		// Find end of hex digits.
		i := 0
		for ; i < len(b) && b[i] != ' '; i++ {
			d := b[i]
			isHex := ('0' <= d && d <= '9') ||
				('A' <= d && d <= 'F') ||
				('a' <= d && d <= 'f')
			if !isHex {
				return nil, nil, errors.New("bad hex digit")
			}
		}

		// Pick the escape whose zero padding completes the digit count that
		// Go requires: 4 digits for \u, 8 digits for \U.
		switch i {
		case 4:
			orig = append(orig, `\u`...)
		case 5:
			orig = append(orig, `\U000`...)
		case 6:
			orig = append(orig, `\U00`...)
		default:
			return nil, nil, errors.New("unsupported code point hex length")
		}
		orig = append(orig, b[:i]...)
		exp = append(exp, b[:i]...)
		b = b[i:]

		// Check for space between hex and ÷ or ×.
		if len(b) < 1 || b[0] != ' ' {
			return nil, nil, errors.New("bad input")
		}
		b = b[1:]

		// Check for next boundary.
		switch {
		case bytes.HasPrefix(b, breakOk):
			boundary = true
			b = b[len(breakOk):]
		case bytes.HasPrefix(b, breakNo):
			boundary = false
			b = b[len(breakNo):]
		default:
			return nil, nil, errors.New("missing ÷ or ×")
		}
		if boundary {
			exp = append(exp, '}')
		}
		exp = append(exp, ',')
		if len(b) > 0 && b[0] == ' ' {
			b = b[1:]
		}
	}

	// A trailing × would leave the last group unclosed and produce Go code
	// that does not parse, so reject it here with a clear message.
	if !boundary {
		return nil, nil, errors.New("expected ÷ as last character")
	}
	exp = append(exp, '}')
	return orig, exp, nil
}