doc.go

  1// Copyright 2010 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5/*
  6Package html implements an HTML5-compliant tokenizer and parser.
  7
  8Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
  9caller's responsibility to ensure that r provides UTF-8 encoded HTML.
 10
 11	z := html.NewTokenizer(r)
 12
 13Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(),
 14which parses the next token and returns its type, or ErrorToken on error:
 15
 16	for {
 17		tt := z.Next()
 18		if tt == html.ErrorToken {
 19			// ...
 20			return ...
 21		}
 22		// Process the current token.
 23	}
 24
 25There are two APIs for retrieving the current token. The high-level API is to
 26call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs
 27allow optionally calling Raw after Next but before Token, Text, TagName, or
 28TagAttr. In EBNF notation, the valid call sequence per token is:
 29
 30	Next {Raw} [ Token | Text | TagName {TagAttr} ]
 31
 32Token returns an independent data structure that completely describes a token.
 33Entities (such as "&lt;") are unescaped, tag names and attribute keys are
 34lower-cased, and attributes are collected into a []Attribute. For example:
 35
 36	for {
 37		if z.Next() == html.ErrorToken {
 38			// Returning io.EOF indicates success.
 39			return z.Err()
 40		}
 41		emitToken(z.Token())
 42	}
 43
 44The low-level API performs fewer allocations and copies, but the contents of
 45the []byte values returned by Text, TagName and TagAttr may change on the next
 46call to Next. For example, to extract an HTML page's anchor text:
 47
 48	depth := 0
 49	for {
 50		tt := z.Next()
 51		switch tt {
 52		case html.ErrorToken:
 53			return z.Err()
 54		case html.TextToken:
 55			if depth > 0 {
 56				// emitBytes should copy the []byte it receives,
 57				// if it doesn't process it immediately.
 58				emitBytes(z.Text())
 59			}
 60		case html.StartTagToken, html.EndTagToken:
 61			tn, _ := z.TagName()
 62			if len(tn) == 1 && tn[0] == 'a' {
 63				if tt == html.StartTagToken {
 64					depth++
 65				} else {
 66					depth--
 67				}
 68			}
 69		}
 70	}
 71
 72Parsing is done by calling Parse with an io.Reader, which returns the root of
 73the parse tree (the document element) as a *Node. It is the caller's
 74responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
 75example, to process each anchor node in depth-first order:
 76
 77	doc, err := html.Parse(r)
 78	if err != nil {
 79		// ...
 80	}
 81	for n := range doc.Descendants() {
 82		if n.Type == html.ElementNode && n.Data == "a" {
 83			// Do something with n...
 84		}
 85	}
 86
 87The relevant specifications include:
 88https://html.spec.whatwg.org/multipage/syntax.html and
 89https://html.spec.whatwg.org/multipage/syntax.html#tokenization
 90
 91# Security Considerations
 92
 93Care should be taken when parsing and interpreting HTML, whether full documents
 94or fragments, within the framework of the HTML specification, especially with
 95regard to untrusted inputs.
 96
 97This package provides a tokenizer, which implements the tokenization stage of
 98the WHATWG HTML parsing specification, and a parser, which implements both that
 99stage and the tree construction stage. While the tokenizer parses and normalizes
100individual HTML tokens, only the parser constructs the DOM tree from the
101tokenized HTML, as described in the tree construction stage of the
102specification, dynamically modifying or extending the document's DOM tree.
103
104If your use case requires semantically well-formed HTML documents, as defined by
105the WHATWG specification, the parser should be used rather than the tokenizer.
106
107In security contexts, if trust decisions are being made using the tokenized or
108parsed content, the input must be re-serialized (for instance by using Render or
109Token.String) in order for those trust decisions to hold, as the process of
110tokenization or parsing may alter the content.
111*/
112package html // import "golang.org/x/net/html"
113
114// The tokenization algorithm implemented by this package is not a line-by-line
115// transliteration of the relatively verbose state-machine in the WHATWG
116// specification. A more direct approach is used instead, where the program
117// counter implies the state, such as whether it is tokenizing a tag or a text
118// node. Specification compliance is verified by checking expected and actual
119// outputs over a test suite rather than aiming for algorithmic fidelity.
120
121// TODO(nigeltao): Does a DOM API belong in this package or a separate one?
122// TODO(nigeltao): How does parsing interact with a JavaScript engine?