1//go:build generate
2
// This program generates a Go file containing code point properties from
// Unicode Character Database data files. The command line arguments are as follows:
5//
// 1. The name of the Unicode data file, relative to the UCD directory and
//    without the ".txt" extension (e.g. "auxiliary/WordBreakProperty"). Can be
//    "-" (to skip) if the "emojis" flag is included.
8// 2. The name of the locally generated Go file.
9// 3. The name of the slice mapping code points to properties.
10// 4. The name of the generator, for logging purposes.
11// 5. (Optional) Flags, comma-separated. The following flags are available:
12// - "emojis=<property>": include the specified emoji properties (e.g.
13// "Extended_Pictographic").
14// - "gencat": include general category properties.
15//
16//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
17//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
18//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
19//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
20//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
21//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
22package main
23
24import (
25 "bufio"
26 "bytes"
27 "errors"
28 "fmt"
29 "go/format"
30 "io/ioutil"
31 "log"
32 "net/http"
33 "os"
34 "regexp"
35 "sort"
36 "strconv"
37 "strings"
38 "time"
39)
40
// We generate from a specific Unicode version rather than the latest. When the
// package is upgraded to a new version, change the version in both URLs and
// regenerate the property files.
const (
	// propertyURL is a format string; its single %s verb is replaced with the
	// data file name given as the first command line argument.
	propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
	// emojiURL is the location of the emoji data file, which is parsed when
	// the "emojis" flag is given.
	emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
)
47
// The regular expression for a line containing a code point range property.
// Its capture groups are: 1 = first code point (4 to 6 upper-case hex digits),
// 3 = last code point of an optional ".." range (empty for a single code
// point), 4 = property name, 5 = the comment text following "#" and one
// whitespace character.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
50
51func main() {
52 if len(os.Args) < 5 {
53 fmt.Println("Not enough arguments, see code for details")
54 os.Exit(1)
55 }
56
57 log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
58 log.SetFlags(0)
59
60 // Parse flags.
61 flags := make(map[string]string)
62 if len(os.Args) >= 6 {
63 for _, flag := range strings.Split(os.Args[5], ",") {
64 flagFields := strings.Split(flag, "=")
65 if len(flagFields) == 1 {
66 flags[flagFields[0]] = "yes"
67 } else {
68 flags[flagFields[0]] = flagFields[1]
69 }
70 }
71 }
72
73 // Parse the text file and generate Go source code from it.
74 _, includeGeneralCategory := flags["gencat"]
75 var mainURL string
76 if os.Args[1] != "-" {
77 mainURL = fmt.Sprintf(propertyURL, os.Args[1])
78 }
79 src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
80 if err != nil {
81 log.Fatal(err)
82 }
83
84 // Format the Go code.
85 formatted, err := format.Source([]byte(src))
86 if err != nil {
87 log.Fatal("gofmt:", err)
88 }
89
90 // Save it to the (local) target file.
91 log.Print("Writing to ", os.Args[2])
92 if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
93 log.Fatal(err)
94 }
95}
96
97// parse parses the Unicode Properties text files located at the given URLs and
98// returns their equivalent Go source code to be used in the uniseg package. If
99// "emojiProperty" is not an empty string, emoji code points for that emoji
100// property (e.g. "Extended_Pictographic") will be included. In those cases, you
101// may pass an empty "propertyURL" to skip parsing the main properties file. If
102// "includeGeneralCategory" is true, the Unicode General Category property will
103// be extracted from the comments and included in the output.
104func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
105 if propertyURL == "" && emojiProperty == "" {
106 return "", errors.New("no properties to parse")
107 }
108
109 // Temporary buffer to hold properties.
110 var properties [][4]string
111
112 // Open the first URL.
113 if propertyURL != "" {
114 log.Printf("Parsing %s", propertyURL)
115 res, err := http.Get(propertyURL)
116 if err != nil {
117 return "", err
118 }
119 in1 := res.Body
120 defer in1.Close()
121
122 // Parse it.
123 scanner := bufio.NewScanner(in1)
124 num := 0
125 for scanner.Scan() {
126 num++
127 line := strings.TrimSpace(scanner.Text())
128
129 // Skip comments and empty lines.
130 if strings.HasPrefix(line, "#") || line == "" {
131 continue
132 }
133
134 // Everything else must be a code point range, a property and a comment.
135 from, to, property, comment, err := parseProperty(line)
136 if err != nil {
137 return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
138 }
139 properties = append(properties, [4]string{from, to, property, comment})
140 }
141 if err := scanner.Err(); err != nil {
142 return "", err
143 }
144 }
145
146 // Open the second URL.
147 if emojiProperty != "" {
148 log.Printf("Parsing %s", emojiURL)
149 res, err := http.Get(emojiURL)
150 if err != nil {
151 return "", err
152 }
153 in2 := res.Body
154 defer in2.Close()
155
156 // Parse it.
157 scanner := bufio.NewScanner(in2)
158 num := 0
159 for scanner.Scan() {
160 num++
161 line := scanner.Text()
162
163 // Skip comments, empty lines, and everything not containing
164 // "Extended_Pictographic".
165 if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
166 continue
167 }
168
169 // Everything else must be a code point range, a property and a comment.
170 from, to, property, comment, err := parseProperty(line)
171 if err != nil {
172 return "", fmt.Errorf("emojis line %d: %v", num, err)
173 }
174 properties = append(properties, [4]string{from, to, property, comment})
175 }
176 if err := scanner.Err(); err != nil {
177 return "", err
178 }
179 }
180
181 // Avoid overflow during binary search.
182 if len(properties) >= 1<<31 {
183 return "", errors.New("too many properties")
184 }
185
186 // Sort properties.
187 sort.Slice(properties, func(i, j int) bool {
188 left, _ := strconv.ParseUint(properties[i][0], 16, 64)
189 right, _ := strconv.ParseUint(properties[j][0], 16, 64)
190 return left < right
191 })
192
193 // Header.
194 var (
195 buf bytes.Buffer
196 emojiComment string
197 )
198 columns := 3
199 if includeGeneralCategory {
200 columns = 4
201 }
202 if emojiURL != "" {
203 emojiComment = `
204// and
205// ` + emojiURL + `
206// ("Extended_Pictographic" only)`
207 }
208 buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
209
210package uniseg
211
212// ` + os.Args[3] + ` are taken from
213// ` + propertyURL + emojiComment + `
214// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
215// license agreement.
216var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
217 `)
218
219 // Properties.
220 for _, prop := range properties {
221 if includeGeneralCategory {
222 generalCategory := "gc" + prop[3][:2]
223 if generalCategory == "gcL&" {
224 generalCategory = "gcLC"
225 }
226 prop[3] = prop[3][3:]
227 fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
228 } else {
229 fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
230 }
231 }
232
233 // Tail.
234 buf.WriteString("}")
235
236 return buf.String(), nil
237}
238
239// parseProperty parses a line of the Unicode properties text file containing a
240// property for a code point range and returns it along with its comment.
241func parseProperty(line string) (from, to, property, comment string, err error) {
242 fields := propertyPattern.FindStringSubmatch(line)
243 if fields == nil {
244 err = errors.New("no property found")
245 return
246 }
247 from = fields[1]
248 to = fields[3]
249 if to == "" {
250 to = from
251 }
252 property = fields[4]
253 comment = fields[5]
254 return
255}
256
// translateProperty converts a property name as it appears in the Unicode data
// file into the name of the corresponding Go identifier: all underscores are
// dropped and the given prefix is put in front.
func translateProperty(prefix, property string) string {
	var name strings.Builder
	name.WriteString(prefix)
	for _, part := range strings.Split(property, "_") {
		name.WriteString(part)
	}
	return name.String()
}