mirror of
https://github.com/Luzifer/twitch-bot.git
synced 2024-11-09 16:50:01 +00:00
[linkcheck] Add support for meta-redirects
Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
parent
f1d4c1a283
commit
621d266391
4 changed files with 165 additions and 6 deletions
66
internal/linkcheck/meta.go
Normal file
66
internal/linkcheck/meta.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
package linkcheck
|
||||
|
||||
import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)
|
||||
|
||||
var (
	// errNoMetaRedir is returned by resolveMetaRedirect when the
	// document does not contain a usable meta-redirect.
	errNoMetaRedir = errors.New("no meta-redir found")

	// metaRedirContent matches the content attribute of a
	// <meta http-equiv="refresh"> element ("<seconds>; url=<target>")
	// and captures the target in its first group. The match is
	// case-insensitive (`URL=` is as common as `url=`), tolerates
	// whitespace around the individual parts and accepts both `;` and
	// `,` as separator between the delay and the target.
	metaRedirContent = regexp.MustCompile(`(?i)^\s*[0-9]+\s*[;,]\s*url\s*=\s*(.*)$`)
)
|
||||
|
||||
//nolint:gocognit // Makes no sense to split
|
||||
func resolveMetaRedirect(body []byte) (redir string, err error) {
|
||||
tok := html.NewTokenizer(bytes.NewReader(body))
|
||||
|
||||
tokenLoop:
|
||||
for {
|
||||
token := tok.Next()
|
||||
switch token {
|
||||
case html.ErrorToken:
|
||||
if errors.Is(tok.Err(), io.EOF) {
|
||||
break tokenLoop
|
||||
}
|
||||
return "", fmt.Errorf("scanning tokens: %w", tok.Err())
|
||||
|
||||
case html.StartTagToken:
|
||||
t := tok.Token()
|
||||
if t.Data == "meta" {
|
||||
var (
|
||||
content string
|
||||
isRedirect bool
|
||||
)
|
||||
|
||||
for _, attr := range t.Attr {
|
||||
isRedirect = isRedirect || attr.Key == "http-equiv" && attr.Val == "refresh"
|
||||
|
||||
if attr.Key == "content" {
|
||||
content = attr.Val
|
||||
}
|
||||
}
|
||||
|
||||
if !isRedirect {
|
||||
continue tokenLoop
|
||||
}
|
||||
|
||||
// It is a redirect, get the content and parse it
|
||||
if matches := metaRedirContent.FindStringSubmatch(content); len(matches) > 1 {
|
||||
redir = matches[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if redir == "" {
|
||||
// We did not find anything
|
||||
return "", errNoMetaRedir
|
||||
}
|
||||
|
||||
return redir, nil
|
||||
}
|
41
internal/linkcheck/meta_test.go
Normal file
41
internal/linkcheck/meta_test.go
Normal file
|
@ -0,0 +1,41 @@
|
|||
package linkcheck
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestResolveMetaRedir(t *testing.T) {
|
||||
testDoc := []byte(`<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta property="twitter:image" content="">
|
||||
<meta http-equiv='refresh' content='0; url=https://github.com/Luzifer/twitch-bot'>
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>`)
|
||||
|
||||
redir, err := resolveMetaRedirect(testDoc)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "https://github.com/Luzifer/twitch-bot", redir)
|
||||
|
||||
testDoc = []byte(`<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta property="twitter:image" content="">
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>`)
|
||||
|
||||
redir, err = resolveMetaRedirect(testDoc)
|
||||
require.ErrorIs(t, err, errNoMetaRedir)
|
||||
assert.Equal(t, "", redir)
|
||||
}
|
|
@ -4,6 +4,7 @@ import (
|
|||
"context"
|
||||
"crypto/rand"
|
||||
_ "embed"
|
||||
"io"
|
||||
"math/big"
|
||||
"net/http"
|
||||
"net/http/cookiejar"
|
||||
|
@ -13,7 +14,6 @@ import (
|
|||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Luzifer/go_helpers/v2/str"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
|
@ -88,12 +88,12 @@ func (resolver) getJar() *cookiejar.Jar {
|
|||
// that link after all redirects were followed
|
||||
//
|
||||
//nolint:gocyclo
|
||||
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack []string, userAgent string) string {
|
||||
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string {
|
||||
if !linkTest.MatchString(link) && !r.skipValidation {
|
||||
return ""
|
||||
}
|
||||
|
||||
if str.StringInSlice(link, callStack) || len(callStack) == maxRedirects {
|
||||
if callStack.Count(link) > 2 || callStack.Height() == maxRedirects {
|
||||
// We got ourselves a loop: Yay!
|
||||
return link
|
||||
}
|
||||
|
@ -155,10 +155,35 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
|
|||
return ""
|
||||
}
|
||||
target := r.resolveReference(u, tu)
|
||||
return r.resolveFinal(target, cookieJar, append(callStack, link), userAgent)
|
||||
callStack.Visit(link)
|
||||
return r.resolveFinal(target, cookieJar, callStack, userAgent)
|
||||
}
|
||||
|
||||
// We got a response, it's no redirect, we count this as a success
|
||||
// We got a response, it's no redirect, lets check for in-document stuff
|
||||
docBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
if metaRedir, err := resolveMetaRedirect(docBody); err == nil {
|
||||
// Meta-Redirect found
|
||||
tu, err := url.Parse(metaRedir)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
target := r.resolveReference(u, tu)
|
||||
callStack.Visit(link)
|
||||
return r.resolveFinal(target, cookieJar, callStack, userAgent)
|
||||
}
|
||||
|
||||
if resp.Header.Get("Set-Cookie") != "" {
|
||||
// A new cookie was set, lets refresh the page once to see if stuff
|
||||
// changes with that new cookie
|
||||
callStack.Visit(link)
|
||||
return r.resolveFinal(u.String(), cookieJar, callStack, userAgent)
|
||||
}
|
||||
|
||||
// We had no in-document redirects: we count this as a success
|
||||
return u.String()
|
||||
}
|
||||
|
||||
|
@ -201,7 +226,7 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
|
|||
|
||||
func (r resolver) runResolver() {
|
||||
for qe := range r.resolverC {
|
||||
if link := r.resolveFinal(qe.Link, r.getJar(), nil, r.userAgent()); link != "" {
|
||||
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" {
|
||||
qe.Callback(link)
|
||||
}
|
||||
qe.WaitGroup.Done()
|
||||
|
|
27
internal/linkcheck/stack.go
Normal file
27
internal/linkcheck/stack.go
Normal file
|
@ -0,0 +1,27 @@
|
|||
package linkcheck
|
||||
|
||||
import "strings"
|
||||
|
||||
// stack records the links visited while following a chain of
// redirects, in the order they were visited. The zero value is an
// empty stack ready to use.
type stack struct {
	visits []string
}

// Count returns how many times url has been recorded through Visit.
// Links are compared case-insensitively (strings.EqualFold). A nil
// stack is treated as empty.
func (s *stack) Count(url string) (n int) {
	if s == nil {
		return 0
	}

	for _, v := range s.visits {
		if strings.EqualFold(v, url) {
			n++
		}
	}

	return n
}

// Height returns the total number of recorded visits including
// repeated visits of the same link. A nil stack is treated as empty.
func (s *stack) Height() int {
	if s == nil {
		return 0
	}

	return len(s.visits)
}

// Visit records url as visited. The receiver must not be nil.
func (s *stack) Visit(url string) {
	s.visits = append(s.visits, url)
}
|
Loading…
Reference in a new issue