From 621d266391aa5869376768d887c61af08063a0e5 Mon Sep 17 00:00:00 2001
From: Knut Ahlers
Date: Mon, 10 Jun 2024 14:17:49 +0200
Subject: [PATCH] [linkcheck] Add support for meta-redirects

Signed-off-by: Knut Ahlers
---
Review notes:
- meta.go: also match self-closing <meta ... /> tags, compare
  "refresh" and "url=" case-insensitively, accept quoted targets,
  return the first redirect found, use errors.New for the sentinel
- meta_test.go: HTML fixtures restored, case for a self-closing /
  upper-case / quoted tag added
- resolver.go: cap the amount of body read for meta-redirect detection
- stack.go: pointer receivers on all methods, doc comments
- blob IDs in the "index" lines are those of the previous revision

 internal/linkcheck/meta.go      | 88 ++++++++++++++++++++++++++++++++++++++++++++
 internal/linkcheck/meta_test.go | 48 ++++++++++++++++++++++++
 internal/linkcheck/resolver.go  | 37 +++++++++++++++---
 internal/linkcheck/stack.go     | 33 ++++++++++++++++
 4 files changed, 200 insertions(+), 6 deletions(-)
 create mode 100644 internal/linkcheck/meta.go
 create mode 100644 internal/linkcheck/meta_test.go
 create mode 100644 internal/linkcheck/stack.go

diff --git a/internal/linkcheck/meta.go b/internal/linkcheck/meta.go
new file mode 100644
index 0000000..6217844
--- /dev/null
+++ b/internal/linkcheck/meta.go
@@ -0,0 +1,88 @@
+package linkcheck
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"regexp"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// maxMetaRedirBodySize limits how much of a response body is read in
+// order to look for a meta-redirect
+const maxMetaRedirBodySize = 1 << 20 // 1 MiB
+
+var (
+	errNoMetaRedir = errors.New("no meta-redir found")
+
+	// metaRedirContent matches the content of a refresh meta-tag
+	// ("<delay>; url=<target>") and captures the target. The keyword is
+	// matched case-insensitively as documents commonly contain "URL=".
+	metaRedirContent = regexp.MustCompile(`(?i)^\s*[0-9]+(?:\.[0-9]*)?\s*[;,]\s*url\s*=\s*(.*?)\s*$`)
+)
+
+// resolveMetaRedirect scans an HTML document for a
+// <meta http-equiv="refresh" content="<delay>; url=<target>"> tag and
+// returns the target of the first one found. If the document contains
+// no such tag errNoMetaRedir is returned.
+func resolveMetaRedirect(body []byte) (redir string, err error) {
+	tok := html.NewTokenizer(bytes.NewReader(body))
+
+	for {
+		switch tok.Next() {
+		case html.ErrorToken:
+			if errors.Is(tok.Err(), io.EOF) {
+				// Whole document scanned, nothing found
+				return "", errNoMetaRedir
+			}
+			return "", fmt.Errorf("scanning tokens: %w", tok.Err())
+
+		case html.StartTagToken, html.SelfClosingTagToken:
+			// <meta> is a void element and is often written
+			// self-closing (<meta ... />) which is a separate token type
+			t := tok.Token()
+			if t.Data != "meta" {
+				continue
+			}
+
+			if target := metaRefreshTarget(t.Attr); target != "" {
+				return target, nil
+			}
+		}
+	}
+}
+
+// metaRefreshTarget returns the redirect target announced by the
+// attributes of a <meta> tag or an empty string if the tag is no
+// refresh-tag or its content does not contain a target.
+func metaRefreshTarget(attrs []html.Attribute) string {
+	var (
+		content   string
+		isRefresh bool
+	)
+
+	for _, attr := range attrs {
+		switch attr.Key {
+		case "http-equiv":
+			// Attribute values are not normalized by the tokenizer
+			isRefresh = isRefresh || strings.EqualFold(strings.TrimSpace(attr.Val), "refresh")
+		case "content":
+			content = attr.Val
+		}
+	}
+
+	if !isRefresh {
+		return ""
+	}
+
+	matches := metaRedirContent.FindStringSubmatch(content)
+	if matches == nil {
+		return ""
+	}
+
+	// The target may be quoted: content="0; url='https://example.com'"
+	return strings.Trim(matches[1], `"'`)
+}
diff --git a/internal/linkcheck/meta_test.go b/internal/linkcheck/meta_test.go
new file mode 100644
index 0000000..4edc30b
--- /dev/null
+++ b/internal/linkcheck/meta_test.go
@@ -0,0 +1,48 @@
+package linkcheck
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestResolveMetaRedir(t *testing.T) {
+	testDoc := []byte(`
+<html>
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="refresh" content="0; url=https://github.com/Luzifer/twitch-bot">
+</head>
+<body>
+  <p>Redirecting</p>
+</body>
+</html>
+`)
+
+	redir, err := resolveMetaRedirect(testDoc)
+	require.NoError(t, err)
+	assert.Equal(t, "https://github.com/Luzifer/twitch-bot", redir)
+
+	// Self-closing tag, upper-case keywords and a quoted target
+	testDoc = []byte(`<html><head><META HTTP-EQUIV="Refresh" CONTENT="0; URL='https://github.com/Luzifer/twitch-bot'" /></head></html>`)
+
+	redir, err = resolveMetaRedirect(testDoc)
+	require.NoError(t, err)
+	assert.Equal(t, "https://github.com/Luzifer/twitch-bot", redir)
+
+	testDoc = []byte(`
+<html>
+<head>
+  <meta charset="utf-8">
+</head>
+<body>
+  <p>No redirect here</p>
+</body>
+</html>
+`)
+
+	redir, err = resolveMetaRedirect(testDoc)
+	require.ErrorIs(t, err, errNoMetaRedir)
+	assert.Equal(t, "", redir)
+}
diff --git a/internal/linkcheck/resolver.go b/internal/linkcheck/resolver.go
index 310980d..e854027 100644
--- a/internal/linkcheck/resolver.go
+++ b/internal/linkcheck/resolver.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"crypto/rand"
 	_ "embed"
+	"io"
 	"math/big"
 	"net/http"
 	"net/http/cookiejar"
@@ -13,7 +14,6 @@ import (
 	"sync"
 	"time"
 
-	"github.com/Luzifer/go_helpers/v2/str"
 	"github.com/sirupsen/logrus"
 )
 
@@ -88,12 +88,12 @@ func (resolver) getJar() *cookiejar.Jar {
 // that link after all redirects were followed
 //
 //nolint:gocyclo
-func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack []string, userAgent string) string {
+func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string {
 	if !linkTest.MatchString(link) && !r.skipValidation {
 		return ""
 	}
 
-	if str.StringInSlice(link, callStack) || len(callStack) == maxRedirects {
+	if callStack.Count(link) > 2 || callStack.Height() == maxRedirects {
 		// We got ourselves a loop: Yay!
 		return link
 	}
@@ -155,10 +155,35 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
 			return ""
 		}
 		target := r.resolveReference(u, tu)
-		return r.resolveFinal(target, cookieJar, append(callStack, link), userAgent)
+		callStack.Visit(link)
+		return r.resolveFinal(target, cookieJar, callStack, userAgent)
 	}
 
-	// We got a response, it's no redirect, we count this as a success
+	// We got a response, it's no redirect, lets check for in-document stuff
+	docBody, err := io.ReadAll(io.LimitReader(resp.Body, maxMetaRedirBodySize))
+	if err != nil {
+		return ""
+	}
+
+	if metaRedir, err := resolveMetaRedirect(docBody); err == nil {
+		// Meta-Redirect found
+		tu, err := url.Parse(metaRedir)
+		if err != nil {
+			return ""
+		}
+		target := r.resolveReference(u, tu)
+		callStack.Visit(link)
+		return r.resolveFinal(target, cookieJar, callStack, userAgent)
+	}
+
+	if resp.Header.Get("Set-Cookie") != "" {
+		// A new cookie was set, lets refresh the page once to see if stuff
+		// changes with that new cookie
+		callStack.Visit(link)
+		return r.resolveFinal(u.String(), cookieJar, callStack, userAgent)
+	}
+
+	// We had no in-document redirects: we count this as a success
 	return u.String()
 }
 
@@ -201,7 +226,7 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
 
 func (r resolver) runResolver() {
 	for qe := range r.resolverC {
-		if link := r.resolveFinal(qe.Link, r.getJar(), nil, r.userAgent()); link != "" {
+		if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" {
 			qe.Callback(link)
 		}
 		qe.WaitGroup.Done()
diff --git a/internal/linkcheck/stack.go b/internal/linkcheck/stack.go
new file mode 100644
index 0000000..6074a2e
--- /dev/null
+++ b/internal/linkcheck/stack.go
@@ -0,0 +1,33 @@
+package linkcheck
+
+import "strings"
+
+type (
+	// stack records the links visited while resolving a link so that
+	// redirect loops can be detected
+	stack struct {
+		visits []string
+	}
+)
+
+// Count returns how often the given URL was visited. URLs are compared
+// case-insensitively.
+func (s *stack) Count(url string) (n int) {
+	for _, v := range s.visits {
+		if strings.EqualFold(v, url) {
+			n++
+		}
+	}
+
+	return n
+}
+
+// Height returns the total number of recorded visits
+func (s *stack) Height() int {
+	return len(s.visits)
+}
+
+// Visit records a visit of the given URL
+func (s *stack) Visit(url string) {
+	s.visits = append(s.visits, url)
+}