1//go:build generate
2
// This program generates a Go file containing a slice of test cases based on
// the Unicode Character Database auxiliary data files. The command line
// arguments are as follows:
6//
7// 1. The name of the Unicode data file (just the filename, without extension).
8// 2. The name of the locally generated Go file.
9// 3. The name of the slice containing the test cases.
10// 4. The name of the generator, for logging purposes.
11//
12//go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
13//go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
14//go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
15//go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
16
17package main
18
19import (
20 "bufio"
21 "bytes"
22 "errors"
23 "fmt"
24 "go/format"
25 "io/ioutil"
26 "log"
27 "net/http"
28 "os"
29 "time"
30)
31
// testCaseURL is the URL template of the Unicode break test files. Its %s verb
// is filled in with the name of the data file (without extension).
//
// The Unicode version is pinned on purpose: we want to test against a specific
// version rather than the latest. When the package is upgraded to a new
// version, change the version in this URL to generate new tests.
const testCaseURL = "https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt"
37
38func main() {
39 if len(os.Args) < 5 {
40 fmt.Println("Not enough arguments, see code for details")
41 os.Exit(1)
42 }
43
44 log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
45 log.SetFlags(0)
46
47 // Read text of testcases and parse into Go source code.
48 src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
49 if err != nil {
50 log.Fatal(err)
51 }
52
53 // Format the Go code.
54 formatted, err := format.Source(src)
55 if err != nil {
56 log.Fatalln("gofmt:", err)
57 }
58
59 // Write it out.
60 log.Print("Writing to ", os.Args[2])
61 if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
62 log.Fatal(err)
63 }
64}
65
66// parse reads a break text file, either from a local file or from a URL. It
67// parses the file data into Go source code representing the test cases.
68func parse(url string) ([]byte, error) {
69 log.Printf("Parsing %s", url)
70 res, err := http.Get(url)
71 if err != nil {
72 return nil, err
73 }
74 body := res.Body
75 defer body.Close()
76
77 buf := new(bytes.Buffer)
78 buf.Grow(120 << 10)
79 buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
80
81package uniseg
82
83// ` + os.Args[3] + ` are Grapheme testcases taken from
84// ` + url + `
85// on ` + time.Now().Format("January 2, 2006") + `. See
86// https://www.unicode.org/license.html for the Unicode license agreement.
87var ` + os.Args[3] + ` = []testCase {
88`)
89
90 sc := bufio.NewScanner(body)
91 num := 1
92 var line []byte
93 original := make([]byte, 0, 64)
94 expected := make([]byte, 0, 64)
95 for sc.Scan() {
96 num++
97 line = sc.Bytes()
98 if len(line) == 0 || line[0] == '#' {
99 continue
100 }
101 var comment []byte
102 if i := bytes.IndexByte(line, '#'); i >= 0 {
103 comment = bytes.TrimSpace(line[i+1:])
104 line = bytes.TrimSpace(line[:i])
105 }
106 original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
107 if err != nil {
108 return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
109 }
110 fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
111 }
112 if err := sc.Err(); err != nil {
113 return nil, err
114 }
115
116 // Check for final "# EOF", useful check if we're streaming via HTTP
117 if !bytes.Equal(line, []byte("# EOF")) {
118 return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
119 }
120 buf.WriteString("}\n")
121 return buf.Bytes(), nil
122}
123
// Byte patterns that parseRuneSequence matches against its input via
// bytes.HasPrefix. "÷" marks a position where a break is allowed, "×" a
// position where it is not.

// prefixBreak is a leading "÷" together with the space that follows it.
var prefixBreak = []byte("÷ ")

// prefixDontBreak is a leading "×" together with the space that follows it.
var prefixDontBreak = []byte("× ")

// breakOk is the bare "÷" marker found between code points.
var breakOk = []byte("÷")

// breakNo is the bare "×" marker found between code points.
var breakNo = []byte("×")
131
132// parseRuneSequence parses a rune + breaking opportunity sequence from b
133// and appends the Go code for testcase.original to orig
134// and appends the Go code for testcase.expected to exp.
135// It retuns the new orig and exp slices.
136//
137// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
138// it will append
139//
140// "\u0020\u0308\U0001F1E6"
141//
142// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
143// to orig and exp respectively.
144//
145// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
146// Note we explicitly require the sequence to start with ÷ and we implicitly
147// require it to end with ÷.
148func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
149 // Check for and remove first ÷ or ×.
150 if !bytes.HasPrefix(b, prefixBreak) && !bytes.HasPrefix(b, prefixDontBreak) {
151 return nil, nil, errors.New("expected ÷ or × as first character")
152 }
153 if bytes.HasPrefix(b, prefixBreak) {
154 b = b[len(prefixBreak):]
155 } else {
156 b = b[len(prefixDontBreak):]
157 }
158
159 boundary := true
160 exp = append(exp, "[][]rune{"...)
161 for len(b) > 0 {
162 if boundary {
163 exp = append(exp, '{')
164 }
165 exp = append(exp, "0x"...)
166 // Find end of hex digits.
167 var i int
168 for i = 0; i < len(b) && b[i] != ' '; i++ {
169 if d := b[i]; ('0' <= d || d <= '9') ||
170 ('A' <= d || d <= 'F') ||
171 ('a' <= d || d <= 'f') {
172 continue
173 }
174 return nil, nil, errors.New("bad hex digit")
175 }
176 switch i {
177 case 4:
178 orig = append(orig, "\\u"...)
179 case 5:
180 orig = append(orig, "\\U000"...)
181 default:
182 return nil, nil, errors.New("unsupport code point hex length")
183 }
184 orig = append(orig, b[:i]...)
185 exp = append(exp, b[:i]...)
186 b = b[i:]
187
188 // Check for space between hex and ÷ or ×.
189 if len(b) < 1 || b[0] != ' ' {
190 return nil, nil, errors.New("bad input")
191 }
192 b = b[1:]
193
194 // Check for next boundary.
195 switch {
196 case bytes.HasPrefix(b, breakOk):
197 boundary = true
198 b = b[len(breakOk):]
199 case bytes.HasPrefix(b, breakNo):
200 boundary = false
201 b = b[len(breakNo):]
202 default:
203 return nil, nil, errors.New("missing ÷ or ×")
204 }
205 if boundary {
206 exp = append(exp, '}')
207 }
208 exp = append(exp, ',')
209 if len(b) > 0 && b[0] == ' ' {
210 b = b[1:]
211 }
212 }
213 exp = append(exp, '}')
214 return orig, exp, nil
215}