Fork 0
mirror of https://github.com/Luzifer/nginx-sso.git synced 2024-10-18 15:44:21 +00:00

749 lines
14 KiB
Raw Normal View History

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
// tokenTest describes one tokenizer test case: an HTML input and the token
// stream it is expected to produce.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}
// tokenTests is the table of tokenizer cases. It is ranged over by
// TestTokenizer, TestMaxBufferReconstruction and TestPassthrough.
//
// NOTE(review): in this listing several entries show only a description
// string (for example the consecutive "not a tag #N" lines) and no closing
// braces are present — lines appear to have been lost in extraction. Verify
// every entry against the upstream file before relying on it.
var tokenTests = []tokenTest{
// A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node.
"foo bar",
"foo bar",
// An entity.
"one < two",
"one < two",
// A start, self-closing and end tag. The tokenizer does not care if the start
// and end tokens don't match; that is the job of the parser.
// Angle brackets that aren't a tag.
"not a tag #0",
"not a tag #1",
"not a tag #2",
"not a tag #3",
"not a tag #4",
"</ >",
"<!-- -->",
"not a tag #5",
"not a tag #6",
"not a tag #7",
"a < b",
"a &lt; b",
"not a tag #8",
"not a tag #9",
"not a tag #10",
"if x<0 and y < 0 then x*y>0",
"if x&lt;0 and y &lt; 0 then x*y&gt;0",
"not a tag #11",
// EOF in a tag name.
"tag name eof #0",
"tag name eof #1",
"<a ",
"tag name eof #2",
"tag name eof #3",
"tag name eof #4",
`<a x`,
// Some malformed tags that are missing a '>'.
"malformed tag #0",
`<p< p="">`,
"malformed tag #1",
`<p </p>`,
`<p <="" p="">`,
"malformed tag #2",
`<p id`,
"malformed tag #3",
`<p id=`,
"malformed tag #4",
`<p id=>`,
`<p id="">`,
"malformed tag #5",
`<p id=0`,
"malformed tag #6",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
"malformed tag #7",
`<p id="0</p>`,
"malformed tag #8",
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
"malformed tag #9",
`<p></p id`,
// Raw text and RCDATA.
"basic raw text",
"unfinished script end tag",
"broken script end tag",
"<SCRIPT>a</SCR ipt>",
"<script>$a&lt;/SCR ipt&gt;",
"EOF in script end tag",
"scriptx end tag",
"' ' completes script end tag",
"<SCRIPT>a</SCRipt ",
"'>' completes script end tag",
"self-closing script end tag",
"nested script tag",
"script end tag after unfinished",
"script/style mismatched tags",
"style element with entity",
"textarea with tag",
"title with tag and entity",
"<title><b>K&amp;R C</b></title>",
"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
// DOCTYPE tests.
"Proper DOCTYPE",
"<!DOCTYPE html>",
"<!DOCTYPE html>",
"DOCTYPE with no space",
"<!DOCTYPE html>",
"DOCTYPE with two spaces",
"<!doctype html>",
"<!DOCTYPE html>",
"looks like DOCTYPE but isn't",
"<!DOCUMENT html>",
"<!--DOCUMENT html-->",
"<!DOCTYPE >",
// XML processing instructions.
"XML processing instruction",
// Comments.
"abc<b><!-- skipme --></b>def",
"abc$<b>$<!-- skipme -->$</b>$def",
// An attribute with a backslash.
`<p id="a\"b">`,
`<p id="a\" b"="">`,
// Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag.
"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
// A nonexistent entity. Tokenizing and converting back to a string should
// escape the "&" to become "&amp;".
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
"entity without semicolon",
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
"entity with digits",
// Attribute tests:
// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
"Empty attribute",
`<input disabled FOO>`,
`<input disabled="" foo="">`,
"Empty attribute, whitespace",
`<input disabled FOO >`,
`<input disabled="" foo="">`,
"Unquoted attribute value",
`<input value=yes FOO=BAR>`,
`<input value="yes" foo="BAR">`,
"Unquoted attribute value, spaces",
`<input value = yes FOO = BAR>`,
`<input value="yes" foo="BAR">`,
"Unquoted attribute value, trailing space",
`<input value=yes FOO=BAR >`,
`<input value="yes" foo="BAR">`,
"Single-quoted attribute value",
`<input value='yes' FOO='BAR'>`,
`<input value="yes" foo="BAR">`,
"Single-quoted attribute value, trailing space",
`<input value='yes' FOO='BAR' >`,
`<input value="yes" foo="BAR">`,
"Double-quoted attribute value",
`<input value="I'm an attribute" FOO="BAR">`,
`<input value="I&#39;m an attribute" foo="BAR">`,
"Attribute name characters",
`<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`,
"Mixed attributes",
`a<P V="0 1" w='2' X=3 y>z`,
`a$<p v="0 1" w="2" x="3" y="">$z`,
"Attributes with a solitary single quote",
`<p id=can't><p id=won't>`,
`<p id="can&#39;t">$<p id="won&#39;t">`,
// TestTokenizer tokenizes the html of every tokenTests entry and compares the
// string form of each token with the matching '$'-separated piece of golden,
// then expects the tokenizer's error to be io.EOF.
//
// NOTE(review): `continue loop` targets a label that is not visible in this
// listing, and no closing braces are present — lines appear to have been lost
// in extraction; restore them from upstream before compiling.
func TestTokenizer(t *testing.T) {
for _, tt := range tokenTests {
z := NewTokenizer(strings.NewReader(tt.html))
// An empty golden string skips the per-token comparison for this case.
if tt.golden != "" {
for i, s := range strings.Split(tt.golden, "$") {
if z.Next() == ErrorToken {
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
continue loop
actual := z.Token().String()
if s != actual {
t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
continue loop
// Once the expected tokens are consumed the tokenizer should report io.EOF.
if z.Err() != io.EOF {
t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
// TestMaxBuffer feeds the tokenizer an unterminated tag ("<" followed by ten
// 't' bytes) and expects Next to return ErrorToken, Err to return
// ErrBufferExceeded, and Raw to hold "<tttt".
//
// NOTE(review): no statement that sets a buffer limit is visible here, yet the
// expected raw prefix is 5 bytes of an 11-byte input — presumably a line
// configuring the limit was lost in extraction; confirm against upstream.
func TestMaxBuffer(t *testing.T) {
// Exceeding the maximum buffer size generates ErrBufferExceeded.
z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
tt := z.Next()
if got, want := tt, ErrorToken; got != want {
t.Fatalf("token type: got: %v want: %v", got, want)
if got, want := z.Err(), ErrBufferExceeded; got != want {
t.Errorf("error type: got: %v want: %v", got, want)
// Raw should expose what was buffered before the overflow was detected.
if got, want := string(z.Raw()), "<tttt"; got != want {
t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
// TestMaxBufferReconstruction checks, for every tokenTests entry and for
// increasing values of maxBuf, that the original html can be reassembled from
// the tokenized bytes, the tokenizer's buffered bytes, and whatever remains in
// the reader.
//
// NOTE(review): `continue tests` targets a label that is not visible, maxBuf
// is never used in the visible lines, nothing visible writes to `tokenized`,
// and most closing braces are absent — lines appear to have been lost in
// extraction; restore them from upstream before compiling.
func TestMaxBufferReconstruction(t *testing.T) {
// Exceeding the maximum buffer size at any point while tokenizing permits
// reconstructing the original input.
for _, test := range tokenTests {
for maxBuf := 1; ; maxBuf++ {
r := strings.NewReader(test.html)
z := NewTokenizer(r)
var tokenized bytes.Buffer
for {
tt := z.Next()
if tt == ErrorToken {
// Only io.EOF and ErrBufferExceeded are acceptable ways to stop.
if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
t.Errorf("%s: unexpected error: %v", test.desc, err)
// Anything tokenized along with untokenized input or data left in the reader.
assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
if err != nil {
t.Errorf("%s: ReadAll: %v", test.desc, err)
continue tests
if got, want := string(assembled), test.html; got != want {
t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
continue tests
// EOF indicates that we completed tokenization and hence found the max
// maxBuf that generates ErrBufferExceeded, so continue to the next test.
if z.Err() == io.EOF {
} // buffer sizes
} // tests
// TestPassthrough tokenizes the html of every tokenTests entry and expects the
// bytes collected in `parsed` to equal that html exactly.
//
// NOTE(review): the statement that appends to `parsed`, the loop exit, and the
// closing braces are not visible in this listing — lines appear to have been
// lost in extraction; restore them from upstream before compiling.
func TestPassthrough(t *testing.T) {
// Accumulating the raw output for each parse event should reconstruct the
// original input.
for _, test := range tokenTests {
z := NewTokenizer(strings.NewReader(test.html))
var parsed bytes.Buffer
for {
tt := z.Next()
if tt == ErrorToken {
if got, want := parsed.String(), test.html; got != want {
t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
// TestBufAPI walks a fixed document with Next and TagName, tracking a `depth`
// counter around <a> tags, and expects the bytes gathered in `result` to be
// "14567".
//
// NOTE(review): the bodies of the switch cases (what is written to `result`
// and how `depth` changes), the `loop` label targeted by `break loop`, and the
// closing braces are not visible in this listing — lines appear to have been
// lost in extraction; restore them from upstream before compiling.
func TestBufAPI(t *testing.T) {
s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
z := NewTokenizer(bytes.NewBufferString(s))
var result bytes.Buffer
depth := 0
for {
tt := z.Next()
switch tt {
case ErrorToken:
if z.Err() != io.EOF {
break loop
case TextToken:
if depth > 0 {
case StartTagToken, EndTagToken:
tn, _ := z.TagName()
// Only single-byte tag names equal to 'a' are of interest.
if len(tn) == 1 && tn[0] == 'a' {
if tt == StartTagToken {
} else {
u := "14567"
v := string(result.Bytes())
if u != v {
t.Errorf("TestBufAPI: want %q got %q", u, v)
func TestConvertNewlines(t *testing.T) {
testCases := map[string]string{
"Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
"Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
"": "",
"\n": "\n",
"\n\r": "\n\n",
"\r": "\n",
"\r\n": "\n",
"\r\n\n": "\n\n",
"\r\n\r": "\n\n",
"\r\n\r\n": "\n\n",
"\r\r": "\n\n",
"\r\r\n": "\n\n",
"\r\r\n\n": "\n\n\n",
"\r\r\r\n": "\n\n\n",
"\r \n": "\n \n",
"xyz": "xyz",
for in, want := range testCases {
if got := string(convertNewlines([]byte(in))); got != want {
t.Errorf("input %q: got %q, want %q", in, got, want)
// TestReaderEdgeCases runs the tokenizer over the same document supplied by
// two unusual readers (zeroOneByteReader and eofStringsReader), records the
// token types returned by Next, and compares them with `want` using
// reflect.DeepEqual.
//
// NOTE(review): the contents of `want`, the loop exits, and the closing braces
// are not visible in this listing — lines appear to have been lost in
// extraction; restore them from upstream before compiling.
func TestReaderEdgeCases(t *testing.T) {
const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
testCases := []io.Reader{
&zeroOneByteReader{s: s},
&eofStringsReader{s: s},
for i, tc := range testCases {
got := []TokenType{}
z := NewTokenizer(tc)
for {
tt := z.Next()
if tt == ErrorToken {
got = append(got, tt)
// A nil error or io.EOF is not reported as a failure.
if err := z.Err(); err != nil && err != io.EOF {
if err != io.ErrNoProgress {
t.Errorf("i=%d: %v", i, err)
want := []TokenType{
if !reflect.DeepEqual(got, want) {
t.Errorf("i=%d: got %v, want %v", i, got, want)
// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string // unread remainder of the input
	n int    // number of Read calls that reached the alternation step
}

// Read implements io.Reader. A zero-length p yields (0, nil) without touching
// any state, and exhausted input yields (0, io.EOF). Otherwise calls alternate
// between (0, nil) and delivering exactly one byte, starting with (0, nil).
func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	// Advance the call counter; without this the parity check below could
	// never fire and the reader would not alternate as documented.
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}
// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string // unread remainder of the input
}

// Read implements io.Reader. It copies as much of the remaining input as fits
// in p and reports io.EOF in the same call that delivers the final bytes,
// rather than on a later empty read.
func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}
// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

// Read implements io.Reader. It never writes to p and always returns (0, nil),
// so a caller reading from it makes no progress.
func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}
// Levels accepted by benchmarkTokenizer. Each one selects a different case of
// that function's switch, so the three values must be distinct.
const (
	rawLevel = iota
	lowLevel
	highLevel
)
// benchmarkTokenizer reads testdata/go1.html once, then tokenizes it b.N
// times. The level argument (rawLevel, lowLevel or highLevel) selects which
// case of the switch runs for each token. The benchmark aborts with b.Fatalf
// if the file cannot be read or the tokenizer reports an error other than
// io.EOF.
//
// NOTE(review): the calls described by the comments in the switch (z.Raw,
// z.Text, z.Token), the loop exit, and the closing braces are not visible in
// this listing — lines appear to have been lost in extraction; restore them
// from upstream before compiling.
func benchmarkTokenizer(b *testing.B, level int) {
buf, err := ioutil.ReadFile("testdata/go1.html")
if err != nil {
b.Fatalf("could not read testdata/go1.html: %v", err)
for i := 0; i < b.N; i++ {
// A new buffer over the same bytes is created for every iteration.
z := NewTokenizer(bytes.NewBuffer(buf))
for {
tt := z.Next()
if tt == ErrorToken {
if err := z.Err(); err != nil && err != io.EOF {
b.Fatalf("tokenizer error: %v", err)
switch level {
case rawLevel:
// Calling z.Raw just returns the raw bytes of the token. It does
// not unescape &lt; to <, or lower-case tag names and attribute keys.
case lowLevel:
// Calling z.Text, z.TagName and z.TagAttr returns []byte values
// whose contents may change on the next call to z.Next.
switch tt {
case TextToken, CommentToken, DoctypeToken:
case StartTagToken, SelfClosingTagToken:
_, more := z.TagName()
// Walk every attribute; TagAttr reports whether more remain.
for more {
_, _, more = z.TagAttr()
case EndTagToken:
case highLevel:
// Calling z.Token converts []byte values to strings whose validity
// extends beyond the next call to z.Next.
// BenchmarkRawLevelTokenizer runs benchmarkTokenizer at rawLevel.
func BenchmarkRawLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, rawLevel)
}
// BenchmarkLowLevelTokenizer runs benchmarkTokenizer at lowLevel.
func BenchmarkLowLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, lowLevel)
}
// BenchmarkHighLevelTokenizer runs benchmarkTokenizer at highLevel.
func BenchmarkHighLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, highLevel)
}