twitch-bot/internal/linkcheck/resolver.go
2024-09-05 12:50:58 +02:00

237 lines
5.5 KiB
Go

package linkcheck
import (
"context"
"io"
"net/http"
"net/http/cookiejar"
"net/url"
"regexp"
"strings"
"sync"
"testing"
"time"
"github.com/sirupsen/logrus"
)
const (
// DefaultCheckTimeout defines the default time the request to a site
// may take to answer
DefaultCheckTimeout = 10 * time.Second
maxRedirects = 50
resolverPoolSize = 25
)
type (
resolver struct {
resolverC chan resolverQueueEntry
skipValidation bool
t *testing.T
}
resolverQueueEntry struct {
Link string
Callback func(string)
WaitGroup *sync.WaitGroup
}
)
var (
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
defaultResolver = newResolver(resolverPoolSize)
)
func newResolver(poolSize int, opts ...func(*resolver)) *resolver {
r := &resolver{
resolverC: make(chan resolverQueueEntry),
}
for _, o := range opts {
o(r)
}
for i := 0; i < poolSize; i++ {
go r.runResolver()
}
return r
}
func withSkipVerify() func(*resolver) {
return func(r *resolver) { r.skipValidation = true }
}
func withTesting(t *testing.T) func(*resolver) {
return func(r *resolver) { r.t = t }
}
func (r resolver) Resolve(qe resolverQueueEntry) {
qe.WaitGroup.Add(1)
r.resolverC <- qe
}
func (resolver) getJar() *cookiejar.Jar {
jar, _ := cookiejar.New(nil)
return jar
}
// resolveFinal takes a link and looks up the final destination of
// that link after all redirects were followed
//
//nolint:funlen,gocyclo
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack) string {
if !linkTest.MatchString(link) && !r.skipValidation {
return ""
}
if callStack.Count(link) > 2 || callStack.Height() == maxRedirects {
// We got ourselves a loop: Yay!
return link
}
client := &http.Client{
CheckRedirect: func(*http.Request, []*http.Request) error {
return http.ErrUseLastResponse
},
Jar: cookieJar,
}
ctx, cancel := context.WithTimeout(context.Background(), DefaultCheckTimeout)
defer cancel()
u, err := url.Parse(link)
if err != nil {
return ""
}
if u.Scheme == "" {
// We have no scheme and the url is in the path, lets add the
// scheme and re-parse the URL to avoid some confusion
u.Scheme = "http"
u, err = url.Parse(u.String())
if err != nil {
return ""
}
}
if numericHost.MatchString(u.Host) && !r.skipValidation {
// Host is fully numeric: We don't support scanning that
return ""
}
// Sanitize host: Trailing dots are valid but not required
u.Host = strings.TrimRight(u.Host, ".")
if r.t != nil {
r.t.Logf("resolving link: link=%q jar_c=%#v stack_c=%d stack_h=%d",
link, len(cookieJar.Cookies(u)), callStack.Count(link), callStack.Height())
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
if err != nil {
return ""
}
for k, v := range generateUserAgentHeaders() {
req.Header.Set(k, v)
}
resp, err := client.Do(req)
if err != nil {
return ""
}
defer func() {
if err := resp.Body.Close(); err != nil {
logrus.WithError(err).Error("closing response body (leaked fd)")
}
}()
if resp.StatusCode > 299 && resp.StatusCode < 400 {
// We got a redirect
tu, err := url.Parse(resp.Header.Get("location"))
if err != nil {
return ""
}
target := r.resolveReference(u, tu)
callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack)
}
// We got a response, it's no redirect, lets check for in-document stuff
docBody, err := io.ReadAll(resp.Body)
if err != nil {
return ""
}
if metaRedir, err := resolveMetaRedirect(docBody); err == nil {
// Meta-Redirect found
tu, err := url.Parse(metaRedir)
if err != nil {
return ""
}
target := r.resolveReference(u, tu)
callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack)
}
if resp.Header.Get("Set-Cookie") != "" {
// A new cookie was set, lets refresh the page once to see if stuff
// changes with that new cookie
callStack.Visit(link)
return r.resolveFinal(u.String(), cookieJar, callStack)
}
// We had no in-document redirects: we count this as a success
return u.String()
}
func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
// Special Case: vkontakte used as shortener / obfuscation
if loc.Path == "/away.php" && loc.Query().Has("to") {
// VK is doing HTML / JS redirect magic so we take that from them
// and execute the redirect directly here in code
return loc.Query().Get("to")
}
if loc.Host == "consent.youtube.com" && loc.Query().Has("continue") {
// Youtube links end up in consent page but we want the real
// target so we use the continue parameter where we strip the
// cbrd query parameters as that one causes an infinite loop.
contTarget, err := url.Parse(loc.Query().Get("continue"))
if err == nil {
v := contTarget.Query()
v.Del("cbrd")
contTarget.RawQuery = v.Encode()
return contTarget.String()
}
return loc.Query().Get("continue")
}
if loc.Host == "www.instagram.com" && loc.Query().Has("next") {
// Instagram likes its login page, we on the other side don't
// care about the sign-in or even the content. Therefore we
// just take their redirect target and use that as the next
// URL
return loc.Query().Get("next")
}
// Default fallback behavior: Do a normal resolve
return origin.ResolveReference(loc).String()
}
func (r resolver) runResolver() {
for qe := range r.resolverC {
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}); link != "" {
qe.Callback(link)
}
qe.WaitGroup.Done()
}
}