[linkcheck] Add support for meta-redirects

Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
Knut Ahlers 2024-06-10 14:17:49 +02:00
parent f1d4c1a283
commit 621d266391
Signed by: luzifer
SSH key fingerprint: SHA256:/xtE5lCgiRDQr8SLxHMS92ZBlACmATUmF1crK16Ks4E
4 changed files with 165 additions and 6 deletions

View file

@ -0,0 +1,66 @@
package linkcheck
import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)
var (
	// errNoMetaRedir is returned by resolveMetaRedirect when the document
	// contains no usable meta-refresh redirect.
	errNoMetaRedir = errors.New("no meta-redir found")

	// metaRedirContent matches the content attribute of a meta-refresh tag
	// ("<delay>; url=<target>") and captures the target. Matching is
	// case-insensitive ("URL=" is common in the wild) and tolerates
	// optional whitespace around the delay, the separator and the target.
	metaRedirContent = regexp.MustCompile(`(?i)^\s*[0-9]+\s*;\s*url\s*=\s*(.*?)\s*$`)
)

// resolveMetaRedirect scans the HTML document in body for
// <meta http-equiv="refresh" content="<delay>; url=<target>"> tags and
// returns the redirect target.
//
// Both regular (<meta ...>) and self-closing (<meta ... />) tags are
// considered, the http-equiv value is compared case-insensitively and a
// target wrapped in matching single or double quotes is unquoted. When
// several refresh tags carry a parseable content attribute the last one
// wins.
//
// It returns errNoMetaRedir when no target was found and a wrapped
// tokenizer error when scanning fails for a reason other than io.EOF.
//
//nolint:gocognit // Makes no sense to split
func resolveMetaRedirect(body []byte) (redir string, err error) {
	tok := html.NewTokenizer(bytes.NewReader(body))

	for {
		switch tok.Next() {
		case html.ErrorToken:
			if !errors.Is(tok.Err(), io.EOF) {
				return "", fmt.Errorf("scanning tokens: %w", tok.Err())
			}

			// End of document: report whatever we collected on the way
			if redir == "" {
				return "", errNoMetaRedir
			}
			return redir, nil

		case html.StartTagToken, html.SelfClosingTagToken:
			t := tok.Token()
			if t.Data != "meta" {
				continue
			}

			var (
				content    string
				isRedirect bool
			)
			for _, attr := range t.Attr {
				switch attr.Key {
				case "http-equiv":
					// HTML treats the pragma name case-insensitively
					isRedirect = isRedirect || strings.EqualFold(attr.Val, "refresh")
				case "content":
					content = attr.Val
				}
			}

			if !isRedirect {
				continue
			}

			// It is a redirect, get the content and parse it
			matches := metaRedirContent.FindStringSubmatch(content)
			if len(matches) < 2 {
				continue
			}

			target := matches[1]
			// Strip one pair of matching quotes: url='...' / url="..."
			if n := len(target); n >= 2 && (target[0] == '\'' || target[0] == '"') && target[n-1] == target[0] {
				target = target[1 : n-1]
			}
			redir = target
		}
	}
}

View file

@ -0,0 +1,41 @@
package linkcheck
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestResolveMetaRedir feeds resolveMetaRedirect one document that
// carries a meta-refresh tag and one that does not, and verifies the
// extracted target respectively the errNoMetaRedir sentinel.
func TestResolveMetaRedir(t *testing.T) {
	const (
		// Document containing a meta-refresh pointing to GitHub
		docWithRedir = `<!DOCTYPE html>
<html>
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta property="twitter:image" content="">
<meta http-equiv='refresh' content='0; url=https://github.com/Luzifer/twitch-bot'>
</head>
<body>
</body>
</html>`

		// Same document without the meta-refresh tag
		docWithoutRedir = `<!DOCTYPE html>
<html>
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta property="twitter:image" content="">
</head>
<body>
</body>
</html>`
	)

	target, err := resolveMetaRedirect([]byte(docWithRedir))
	require.NoError(t, err)
	assert.Equal(t, "https://github.com/Luzifer/twitch-bot", target)

	target, err = resolveMetaRedirect([]byte(docWithoutRedir))
	require.ErrorIs(t, err, errNoMetaRedir)
	assert.Equal(t, "", target)
}

View file

@ -4,6 +4,7 @@ import (
"context"
"crypto/rand"
_ "embed"
"io"
"math/big"
"net/http"
"net/http/cookiejar"
@ -13,7 +14,6 @@ import (
"sync"
"time"
"github.com/Luzifer/go_helpers/v2/str"
"github.com/sirupsen/logrus"
)
@ -88,12 +88,12 @@ func (resolver) getJar() *cookiejar.Jar {
// that link after all redirects were followed
//
//nolint:gocyclo
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack []string, userAgent string) string {
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string {
if !linkTest.MatchString(link) && !r.skipValidation {
return ""
}
if str.StringInSlice(link, callStack) || len(callStack) == maxRedirects {
if callStack.Count(link) > 2 || callStack.Height() == maxRedirects {
// We got ourselves a loop: Yay!
return link
}
@ -155,10 +155,35 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
return ""
}
target := r.resolveReference(u, tu)
return r.resolveFinal(target, cookieJar, append(callStack, link), userAgent)
callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack, userAgent)
}
// We got a response, it's no redirect, we count this as a success
// We got a response, it's no redirect, let's check for in-document stuff
docBody, err := io.ReadAll(resp.Body)
if err != nil {
return ""
}
if metaRedir, err := resolveMetaRedirect(docBody); err == nil {
// Meta-Redirect found
tu, err := url.Parse(metaRedir)
if err != nil {
return ""
}
target := r.resolveReference(u, tu)
callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack, userAgent)
}
if resp.Header.Get("Set-Cookie") != "" {
// A new cookie was set, let's refresh the page once to see if stuff
// changes with that new cookie
callStack.Visit(link)
return r.resolveFinal(u.String(), cookieJar, callStack, userAgent)
}
// We had no in-document redirects: we count this as a success
return u.String()
}
@ -201,7 +226,7 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
func (r resolver) runResolver() {
for qe := range r.resolverC {
if link := r.resolveFinal(qe.Link, r.getJar(), nil, r.userAgent()); link != "" {
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" {
qe.Callback(link)
}
qe.WaitGroup.Done()

View file

@ -0,0 +1,27 @@
package linkcheck
import "strings"
// stack records the URLs visited while following a chain of redirects
// so callers can detect loops and bound the length of the chain.
//
// All methods use pointer receivers so the type is consistently handled
// as *stack; the read-only methods treat a nil *stack as empty.
type stack struct {
	// visits holds the URLs in the order they were passed to Visit.
	visits []string
}

// Count returns how many recorded visits equal url. The comparison is
// done with strings.EqualFold, i.e. case-insensitively. A nil stack
// yields 0.
func (s *stack) Count(url string) (n int) {
	if s == nil {
		return 0
	}

	for _, v := range s.visits {
		if strings.EqualFold(v, url) {
			n++
		}
	}

	return n
}

// Height returns the total number of recorded visits. A nil stack
// yields 0.
func (s *stack) Height() int {
	if s == nil {
		return 0
	}

	return len(s.visits)
}

// Visit appends url to the recorded visits. It must not be called on a
// nil stack.
func (s *stack) Visit(url string) {
	s.visits = append(s.visits, url)
}