gen_properties.go

  1//go:build generate
  2
// This program generates a Go source file containing a property table from
// Unicode Character Database data files. The command line arguments are as follows:
  5//
//  1. The path of the Unicode data file relative to the UCD directory, without
//     the ".txt" extension (e.g. "auxiliary/GraphemeBreakProperty"). Can be
//     "-" (to skip) if the "emojis" flag is included.
  8//  2. The name of the locally generated Go file.
  9//  3. The name of the slice mapping code points to properties.
 10//  4. The name of the generator, for logging purposes.
 11//  5. (Optional) Flags, comma-separated. The following flags are available:
 12//     - "emojis=<property>": include the specified emoji properties (e.g.
 13//     "Extended_Pictographic").
 14//     - "gencat": include general category properties.
 15//
 16//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
 17//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
 18//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
 19//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
 20//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
 21//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
 22package main
 23
 24import (
 25	"bufio"
 26	"bytes"
 27	"errors"
 28	"fmt"
 29	"go/format"
 30	"io/ioutil"
 31	"log"
 32	"net/http"
 33	"os"
 34	"regexp"
 35	"sort"
 36	"strconv"
 37	"strings"
 38	"time"
 39)
 40
// The data files are pinned to a specific Unicode version rather than the
// latest one. When the package is upgraded to a new Unicode version, change
// the version in both URLs and regenerate. propertyURL is a format string
// whose %s verb is filled with the first command line argument; emojiURL is
// the emoji data file read when an emoji property is requested.
const (
	propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
	emojiURL    = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
)
 47
// propertyPattern is the regular expression for a line containing a code point
// range property: a code point or code point range in upper-case hexadecimal
// ("0041" or "0041..005A"), a semicolon, the property name, and a trailing "#"
// comment. Submatch 1 is the first code point, submatch 3 the optional last
// code point, submatch 4 the property name, and submatch 5 the comment text.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
 50
 51func main() {
 52	if len(os.Args) < 5 {
 53		fmt.Println("Not enough arguments, see code for details")
 54		os.Exit(1)
 55	}
 56
 57	log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
 58	log.SetFlags(0)
 59
 60	// Parse flags.
 61	flags := make(map[string]string)
 62	if len(os.Args) >= 6 {
 63		for _, flag := range strings.Split(os.Args[5], ",") {
 64			flagFields := strings.Split(flag, "=")
 65			if len(flagFields) == 1 {
 66				flags[flagFields[0]] = "yes"
 67			} else {
 68				flags[flagFields[0]] = flagFields[1]
 69			}
 70		}
 71	}
 72
 73	// Parse the text file and generate Go source code from it.
 74	_, includeGeneralCategory := flags["gencat"]
 75	var mainURL string
 76	if os.Args[1] != "-" {
 77		mainURL = fmt.Sprintf(propertyURL, os.Args[1])
 78	}
 79	src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
 80	if err != nil {
 81		log.Fatal(err)
 82	}
 83
 84	// Format the Go code.
 85	formatted, err := format.Source([]byte(src))
 86	if err != nil {
 87		log.Fatal("gofmt:", err)
 88	}
 89
 90	// Save it to the (local) target file.
 91	log.Print("Writing to ", os.Args[2])
 92	if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
 93		log.Fatal(err)
 94	}
 95}
 96
 97// parse parses the Unicode Properties text files located at the given URLs and
 98// returns their equivalent Go source code to be used in the uniseg package. If
 99// "emojiProperty" is not an empty string, emoji code points for that emoji
100// property (e.g. "Extended_Pictographic") will be included. In those cases, you
101// may pass an empty "propertyURL" to skip parsing the main properties file. If
102// "includeGeneralCategory" is true, the Unicode General Category property will
103// be extracted from the comments and included in the output.
104func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
105	if propertyURL == "" && emojiProperty == "" {
106		return "", errors.New("no properties to parse")
107	}
108
109	// Temporary buffer to hold properties.
110	var properties [][4]string
111
112	// Open the first URL.
113	if propertyURL != "" {
114		log.Printf("Parsing %s", propertyURL)
115		res, err := http.Get(propertyURL)
116		if err != nil {
117			return "", err
118		}
119		in1 := res.Body
120		defer in1.Close()
121
122		// Parse it.
123		scanner := bufio.NewScanner(in1)
124		num := 0
125		for scanner.Scan() {
126			num++
127			line := strings.TrimSpace(scanner.Text())
128
129			// Skip comments and empty lines.
130			if strings.HasPrefix(line, "#") || line == "" {
131				continue
132			}
133
134			// Everything else must be a code point range, a property and a comment.
135			from, to, property, comment, err := parseProperty(line)
136			if err != nil {
137				return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
138			}
139			properties = append(properties, [4]string{from, to, property, comment})
140		}
141		if err := scanner.Err(); err != nil {
142			return "", err
143		}
144	}
145
146	// Open the second URL.
147	if emojiProperty != "" {
148		log.Printf("Parsing %s", emojiURL)
149		res, err := http.Get(emojiURL)
150		if err != nil {
151			return "", err
152		}
153		in2 := res.Body
154		defer in2.Close()
155
156		// Parse it.
157		scanner := bufio.NewScanner(in2)
158		num := 0
159		for scanner.Scan() {
160			num++
161			line := scanner.Text()
162
163			// Skip comments, empty lines, and everything not containing
164			// "Extended_Pictographic".
165			if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
166				continue
167			}
168
169			// Everything else must be a code point range, a property and a comment.
170			from, to, property, comment, err := parseProperty(line)
171			if err != nil {
172				return "", fmt.Errorf("emojis line %d: %v", num, err)
173			}
174			properties = append(properties, [4]string{from, to, property, comment})
175		}
176		if err := scanner.Err(); err != nil {
177			return "", err
178		}
179	}
180
181	// Avoid overflow during binary search.
182	if len(properties) >= 1<<31 {
183		return "", errors.New("too many properties")
184	}
185
186	// Sort properties.
187	sort.Slice(properties, func(i, j int) bool {
188		left, _ := strconv.ParseUint(properties[i][0], 16, 64)
189		right, _ := strconv.ParseUint(properties[j][0], 16, 64)
190		return left < right
191	})
192
193	// Header.
194	var (
195		buf          bytes.Buffer
196		emojiComment string
197	)
198	columns := 3
199	if includeGeneralCategory {
200		columns = 4
201	}
202	if emojiURL != "" {
203		emojiComment = `
204// and
205// ` + emojiURL + `
206// ("Extended_Pictographic" only)`
207	}
208	buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
209
210package uniseg
211
212// ` + os.Args[3] + ` are taken from
213// ` + propertyURL + emojiComment + `
214// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
215// license agreement.
216var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
217	`)
218
219	// Properties.
220	for _, prop := range properties {
221		if includeGeneralCategory {
222			generalCategory := "gc" + prop[3][:2]
223			if generalCategory == "gcL&" {
224				generalCategory = "gcLC"
225			}
226			prop[3] = prop[3][3:]
227			fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
228		} else {
229			fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
230		}
231	}
232
233	// Tail.
234	buf.WriteString("}")
235
236	return buf.String(), nil
237}
238
239// parseProperty parses a line of the Unicode properties text file containing a
240// property for a code point range and returns it along with its comment.
241func parseProperty(line string) (from, to, property, comment string, err error) {
242	fields := propertyPattern.FindStringSubmatch(line)
243	if fields == nil {
244		err = errors.New("no property found")
245		return
246	}
247	from = fields[1]
248	to = fields[3]
249	if to == "" {
250		to = from
251	}
252	property = fields[4]
253	comment = fields[5]
254	return
255}
256
// translateProperty turns a property name as spelled in the Unicode data file
// into the name of the corresponding Go identifier: all underscores are
// dropped and "prefix" is put in front of the result.
func translateProperty(prefix, property string) string {
	var name strings.Builder
	name.Grow(len(prefix) + len(property))
	name.WriteString(prefix)
	for _, part := range strings.Split(property, "_") {
		name.WriteString(part)
	}
	return name.String()
}