mirror of
https://github.com/Luzifer/twitch-bot.git
synced 2024-12-20 11:51:17 +00:00
[linkcheck] Add support for meta-redirects
Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
parent
f1d4c1a283
commit
621d266391
4 changed files with 165 additions and 6 deletions
66
internal/linkcheck/meta.go
Normal file
66
internal/linkcheck/meta.go
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
package linkcheck
|
||||||
|
|
||||||
|
import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)
|
||||||
|
|
||||||
|
var (
	// errNoMetaRedir signals that a document contained no usable
	// meta-refresh redirect target.
	errNoMetaRedir = errors.New("no meta-redir found")

	// metaRedirContent matches the content attribute of a meta-refresh
	// tag (`<delay>; url=<target>`) and captures the target as group 1.
	//
	// The match is case-insensitive (`URL=` is as common as `url=`),
	// accepts `;` or `,` as separator, tolerates whitespace around the
	// individual parts and strips one optional pair of single or double
	// quotes wrapped around the target.
	metaRedirContent = regexp.MustCompile(`(?i)^\s*[0-9]+\s*[;,]\s*url\s*=\s*["']?(.*?)["']?\s*$`)
)
|
||||||
|
|
||||||
|
//nolint:gocognit // Makes no sense to split
|
||||||
|
func resolveMetaRedirect(body []byte) (redir string, err error) {
|
||||||
|
tok := html.NewTokenizer(bytes.NewReader(body))
|
||||||
|
|
||||||
|
tokenLoop:
|
||||||
|
for {
|
||||||
|
token := tok.Next()
|
||||||
|
switch token {
|
||||||
|
case html.ErrorToken:
|
||||||
|
if errors.Is(tok.Err(), io.EOF) {
|
||||||
|
break tokenLoop
|
||||||
|
}
|
||||||
|
return "", fmt.Errorf("scanning tokens: %w", tok.Err())
|
||||||
|
|
||||||
|
case html.StartTagToken:
|
||||||
|
t := tok.Token()
|
||||||
|
if t.Data == "meta" {
|
||||||
|
var (
|
||||||
|
content string
|
||||||
|
isRedirect bool
|
||||||
|
)
|
||||||
|
|
||||||
|
for _, attr := range t.Attr {
|
||||||
|
isRedirect = isRedirect || attr.Key == "http-equiv" && attr.Val == "refresh"
|
||||||
|
|
||||||
|
if attr.Key == "content" {
|
||||||
|
content = attr.Val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !isRedirect {
|
||||||
|
continue tokenLoop
|
||||||
|
}
|
||||||
|
|
||||||
|
// It is a redirect, get the content and parse it
|
||||||
|
if matches := metaRedirContent.FindStringSubmatch(content); len(matches) > 1 {
|
||||||
|
redir = matches[1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if redir == "" {
|
||||||
|
// We did not find anything
|
||||||
|
return "", errNoMetaRedir
|
||||||
|
}
|
||||||
|
|
||||||
|
return redir, nil
|
||||||
|
}
|
41
internal/linkcheck/meta_test.go
Normal file
41
internal/linkcheck/meta_test.go
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
package linkcheck
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestResolveMetaRedir(t *testing.T) {
|
||||||
|
testDoc := []byte(`<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title></title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||||
|
<meta property="twitter:image" content="">
|
||||||
|
<meta http-equiv='refresh' content='0; url=https://github.com/Luzifer/twitch-bot'>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
</body>
|
||||||
|
</html>`)
|
||||||
|
|
||||||
|
redir, err := resolveMetaRedirect(testDoc)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, "https://github.com/Luzifer/twitch-bot", redir)
|
||||||
|
|
||||||
|
testDoc = []byte(`<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title></title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||||
|
<meta property="twitter:image" content="">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
</body>
|
||||||
|
</html>`)
|
||||||
|
|
||||||
|
redir, err = resolveMetaRedirect(testDoc)
|
||||||
|
require.ErrorIs(t, err, errNoMetaRedir)
|
||||||
|
assert.Equal(t, "", redir)
|
||||||
|
}
|
|
@ -4,6 +4,7 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
_ "embed"
|
_ "embed"
|
||||||
|
"io"
|
||||||
"math/big"
|
"math/big"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/cookiejar"
|
"net/http/cookiejar"
|
||||||
|
@ -13,7 +14,6 @@ import (
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/Luzifer/go_helpers/v2/str"
|
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -88,12 +88,12 @@ func (resolver) getJar() *cookiejar.Jar {
|
||||||
// that link after all redirects were followed
|
// that link after all redirects were followed
|
||||||
//
|
//
|
||||||
//nolint:gocyclo
|
//nolint:gocyclo
|
||||||
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack []string, userAgent string) string {
|
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string {
|
||||||
if !linkTest.MatchString(link) && !r.skipValidation {
|
if !linkTest.MatchString(link) && !r.skipValidation {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
if str.StringInSlice(link, callStack) || len(callStack) == maxRedirects {
|
if callStack.Count(link) > 2 || callStack.Height() == maxRedirects {
|
||||||
// We got ourselves a loop: Yay!
|
// We got ourselves a loop: Yay!
|
||||||
return link
|
return link
|
||||||
}
|
}
|
||||||
|
@ -155,10 +155,35 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
target := r.resolveReference(u, tu)
|
target := r.resolveReference(u, tu)
|
||||||
return r.resolveFinal(target, cookieJar, append(callStack, link), userAgent)
|
callStack.Visit(link)
|
||||||
|
return r.resolveFinal(target, cookieJar, callStack, userAgent)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We got a response, it's no redirect, we count this as a success
|
// We got a response, it's no redirect, lets check for in-document stuff
|
||||||
|
docBody, err := io.ReadAll(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
if metaRedir, err := resolveMetaRedirect(docBody); err == nil {
|
||||||
|
// Meta-Redirect found
|
||||||
|
tu, err := url.Parse(metaRedir)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
target := r.resolveReference(u, tu)
|
||||||
|
callStack.Visit(link)
|
||||||
|
return r.resolveFinal(target, cookieJar, callStack, userAgent)
|
||||||
|
}
|
||||||
|
|
||||||
|
if resp.Header.Get("Set-Cookie") != "" {
|
||||||
|
// A new cookie was set, lets refresh the page once to see if stuff
|
||||||
|
// changes with that new cookie
|
||||||
|
callStack.Visit(link)
|
||||||
|
return r.resolveFinal(u.String(), cookieJar, callStack, userAgent)
|
||||||
|
}
|
||||||
|
|
||||||
|
// We had no in-document redirects: we count this as a success
|
||||||
return u.String()
|
return u.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,7 +226,7 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
|
||||||
|
|
||||||
func (r resolver) runResolver() {
|
func (r resolver) runResolver() {
|
||||||
for qe := range r.resolverC {
|
for qe := range r.resolverC {
|
||||||
if link := r.resolveFinal(qe.Link, r.getJar(), nil, r.userAgent()); link != "" {
|
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" {
|
||||||
qe.Callback(link)
|
qe.Callback(link)
|
||||||
}
|
}
|
||||||
qe.WaitGroup.Done()
|
qe.WaitGroup.Done()
|
||||||
|
|
27
internal/linkcheck/stack.go
Normal file
27
internal/linkcheck/stack.go
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
package linkcheck
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
// stack records every URL visited while following a chain of
// redirects, in visiting order, so callers can detect loops and cap
// the chain length.
type stack struct {
	visits []string
}

// Count returns how often url has been recorded through Visit. The
// comparison uses strings.EqualFold, so URLs differing only in letter
// case are counted as the same entry. A nil stack has no entries.
func (s *stack) Count(url string) (n int) {
	if s == nil {
		return 0
	}

	for _, v := range s.visits {
		if strings.EqualFold(v, url) {
			n++
		}
	}

	return n
}

// Height returns the total number of recorded visits, duplicates
// included. A nil stack has height 0.
func (s *stack) Height() int {
	if s == nil {
		return 0
	}

	return len(s.visits)
}

// Visit appends url to the recorded visits. It must be called on a
// non-nil stack as it needs to store the grown slice.
func (s *stack) Visit(url string) {
	s.visits = append(s.visits, url)
}
|
Loading…
Reference in a new issue