parser.go

  1package parser
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"golang.org/x/net/html"
  7	"regexp"
  8	"strings"
  9)
 10
 11type parser interface {
 12	Parse() (string, error)
 13}
 14
 15type parserType int
 16
 17const (
 18	TitleParser parserType = iota
 19)
 20
 21// NewWithInput returns a new parser instance
 22func NewWithInput(t parserType, input string) parser {
 23	var p parser
 24
 25	switch t {
 26	case TitleParser:
 27		p = titleParser{input: input}
 28	}
 29
 30	return p
 31}
 32
 33type titleParser struct {
 34	input string
 35}
 36
 37// Parse is used to fetch the new title from a "changed title" event
 38//
 39// this func is a great example of something that is _extremely_ fragile; the
 40// input string is pulled from the body of a gitlab message containing html
 41// fragments, and has changed on at least [one occasion][0], breaking our test
 42// pipelines and preventing feature development. i think querying for an issue's
 43// _iterations_ [1] would likely be a better approach.
 44//
 45// example p.input values:
 46// - changed title from **some title** to **some{+ new +}title**
 47// - changed title from **some{- new-} title** to **some title**
 48// - <p>changed title from <code class="idiff">some title</code> to <code class="idiff">some<span class="idiff left addition"> new</span> title</code></p>
 49//
 50// [0]: https://github.com/git-bug/git-bug/issues/1367
 51// [1]: https://docs.gitlab.com/api/resource_iteration_events/#list-project-issue-iteration-events
 52func (p titleParser) Parse() (string, error) {
 53	var reHTML = regexp.MustCompile(`.* to <code\s+class="idiff"\s*>(.*?)</code>`)
 54	var reMD = regexp.MustCompile(`.* to \*\*(.*)\*\*`)
 55
 56	matchHTML := reHTML.FindAllStringSubmatch(p.input, -1)
 57	matchMD := reMD.FindAllStringSubmatch(p.input, -1)
 58
 59	if len(matchHTML) == 1 {
 60		t, err := p.stripHTML(matchHTML[0][1])
 61		if err != nil {
 62			return "", fmt.Errorf("unable to strip HTML from new title: %q", t)
 63		}
 64		return strings.TrimSpace(t), nil
 65	}
 66
 67	if len(matchMD) == 1 {
 68		reDiff := regexp.MustCompile(`{\+(.*?)\+}`)
 69
 70		t := matchMD[0][1]
 71		t = reDiff.ReplaceAllString(t, "$1")
 72
 73		return strings.TrimSpace(t), nil
 74	}
 75
 76	return "", fmt.Errorf(
 77		"failed to extract title: html=%d md=%d input=%q",
 78		len(matchHTML),
 79		len(matchMD),
 80		p.input,
 81	)
 82}
 83
 84// stripHTML removes all html tags from a provided string
 85func (p titleParser) stripHTML(s string) (string, error) {
 86	nodes, err := html.Parse(strings.NewReader(s))
 87	if err != nil {
 88		// return the original unmodified string in the event html.Parse()
 89		// fails; let the downstream callsites decide if they want to proceed
 90		// with the value or not.
 91		return s, err
 92	}
 93
 94	var buf bytes.Buffer
 95	var walk func(*html.Node)
 96	walk = func(n *html.Node) {
 97		if n.Type == html.TextNode {
 98			buf.WriteString(n.Data)
 99		}
100		for c := n.FirstChild; c != nil; c = c.NextSibling {
101			walk(c)
102		}
103	}
104	walk(nodes)
105
106	return buf.String(), nil
107}