[linkcheck] Fix: Replace static (deprecated) user-agent list

Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
Knut Ahlers 2024-09-05 11:22:26 +02:00
parent 5a8459cedc
commit e0a8ce3684
Signed by: luzifer
SSH key fingerprint: SHA256:/xtE5lCgiRDQr8SLxHMS92ZBlACmATUmF1crK16Ks4E
5 changed files with 73 additions and 93 deletions

View file

@ -40,9 +40,10 @@ node_modules:
# --- Tools # --- Tools
update_ua_list: update-chrome-major:
# User-Agents provided by https://www.useragents.me/ sed -i -E \
curl -sSf https://www.useragents.me/api | jq -r '.data[].ua' | grep -v 'Trident' >internal/linkcheck/user-agents.txt 's/chromeMajor = [0-9]+/chromeMajor = $(shell curl -sSf https://lv.luzifer.io/v1/catalog/google-chrome/stable/version | cut -d '.' -f 1)/' \
internal/linkcheck/useragent.go
gh-workflow: gh-workflow:
bash ci/create-workflow.sh bash ci/create-workflow.sh

View file

@ -57,13 +57,12 @@ func TestScanForLinks(t *testing.T) {
t.SkipNow() t.SkipNow()
} }
c := New()
for _, testCase := range []struct { for _, testCase := range []struct {
Heuristic bool Heuristic bool
Message string Message string
ExpectedLinks []string ExpectedLinks []string
ExpectedContains bool ExpectedContains bool
TraceStack bool
}{ }{
// Case: full URL is present in the message // Case: full URL is present in the message
{ {
@ -183,6 +182,13 @@ func TestScanForLinks(t *testing.T) {
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil}, {Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
} { } {
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) { t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
var c *Checker
if testCase.TraceStack {
c = New(withResolver(newResolver(resolverPoolSize, withTesting(t))))
} else {
c = New()
}
var linksFound []string var linksFound []string
if testCase.Heuristic { if testCase.Heuristic {
linksFound = c.HeuristicScanForLinks(testCase.Message) linksFound = c.HeuristicScanForLinks(testCase.Message)
@ -209,23 +215,3 @@ func TestScanForLinks(t *testing.T) {
}) })
} }
} }
func TestUserAgentListNotEmpty(t *testing.T) {
if len(defaultUserAgents) == 0 {
t.Fatal("found empty user-agent list")
}
}
func TestUserAgentRandomizer(t *testing.T) {
uas := map[string]int{}
for i := 0; i < 10; i++ {
uas[defaultResolver.userAgent()]++
}
for _, c := range uas {
assert.Less(t, c, 10)
}
assert.Equal(t, 0, uas[""]) // there should be no empty UA
}

View file

@ -2,16 +2,14 @@ package linkcheck
import ( import (
"context" "context"
"crypto/rand"
_ "embed"
"io" "io"
"math/big"
"net/http" "net/http"
"net/http/cookiejar" "net/http/cookiejar"
"net/url" "net/url"
"regexp" "regexp"
"strings" "strings"
"sync" "sync"
"testing"
"time" "time"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
@ -30,6 +28,8 @@ type (
resolver struct { resolver struct {
resolverC chan resolverQueueEntry resolverC chan resolverQueueEntry
skipValidation bool skipValidation bool
t *testing.T
} }
resolverQueueEntry struct { resolverQueueEntry struct {
@ -40,20 +40,12 @@ type (
) )
var ( var (
defaultUserAgents = []string{}
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`) linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`) numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
//go:embed user-agents.txt
uaList string
defaultResolver = newResolver(resolverPoolSize) defaultResolver = newResolver(resolverPoolSize)
) )
func init() {
defaultUserAgents = strings.Split(strings.TrimSpace(uaList), "\n")
}
func newResolver(poolSize int, opts ...func(*resolver)) *resolver { func newResolver(poolSize int, opts ...func(*resolver)) *resolver {
r := &resolver{ r := &resolver{
resolverC: make(chan resolverQueueEntry), resolverC: make(chan resolverQueueEntry),
@ -74,6 +66,10 @@ func withSkipVerify() func(*resolver) {
return func(r *resolver) { r.skipValidation = true } return func(r *resolver) { r.skipValidation = true }
} }
func withTesting(t *testing.T) func(*resolver) {
return func(r *resolver) { r.t = t }
}
func (r resolver) Resolve(qe resolverQueueEntry) { func (r resolver) Resolve(qe resolverQueueEntry) {
qe.WaitGroup.Add(1) qe.WaitGroup.Add(1)
r.resolverC <- qe r.resolverC <- qe
@ -87,8 +83,8 @@ func (resolver) getJar() *cookiejar.Jar {
// resolveFinal takes a link and looks up the final destination of // resolveFinal takes a link and looks up the final destination of
// that link after all redirects were followed // that link after all redirects were followed
// //
//nolint:gocyclo //nolint:funlen,gocyclo
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string { func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack) string {
if !linkTest.MatchString(link) && !r.skipValidation { if !linkTest.MatchString(link) && !r.skipValidation {
return "" return ""
} }
@ -131,12 +127,19 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
// Sanitize host: Trailing dots are valid but not required // Sanitize host: Trailing dots are valid but not required
u.Host = strings.TrimRight(u.Host, ".") u.Host = strings.TrimRight(u.Host, ".")
if r.t != nil {
r.t.Logf("resolving link: link=%q jar_c=%#v stack_c=%d stack_h=%d",
link, len(cookieJar.Cookies(u)), callStack.Count(link), callStack.Height())
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
if err != nil { if err != nil {
return "" return ""
} }
req.Header.Set("User-Agent", userAgent) for k, v := range generateUserAgentHeaders() {
req.Header.Set(k, v)
}
resp, err := client.Do(req) resp, err := client.Do(req)
if err != nil { if err != nil {
@ -156,7 +159,7 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
} }
target := r.resolveReference(u, tu) target := r.resolveReference(u, tu)
callStack.Visit(link) callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack, userAgent) return r.resolveFinal(target, cookieJar, callStack)
} }
// We got a response, it's no redirect, lets check for in-document stuff // We got a response, it's no redirect, lets check for in-document stuff
@ -173,14 +176,14 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
} }
target := r.resolveReference(u, tu) target := r.resolveReference(u, tu)
callStack.Visit(link) callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack, userAgent) return r.resolveFinal(target, cookieJar, callStack)
} }
if resp.Header.Get("Set-Cookie") != "" { if resp.Header.Get("Set-Cookie") != "" {
// A new cookie was set, lets refresh the page once to see if stuff // A new cookie was set, lets refresh the page once to see if stuff
// changes with that new cookie // changes with that new cookie
callStack.Visit(link) callStack.Visit(link)
return r.resolveFinal(u.String(), cookieJar, callStack, userAgent) return r.resolveFinal(u.String(), cookieJar, callStack)
} }
// We had no in-document redirects: we count this as a success // We had no in-document redirects: we count this as a success
@ -226,14 +229,9 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
func (r resolver) runResolver() { func (r resolver) runResolver() {
for qe := range r.resolverC { for qe := range r.resolverC {
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" { if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}); link != "" {
qe.Callback(link) qe.Callback(link)
} }
qe.WaitGroup.Done() qe.WaitGroup.Done()
} }
} }
func (resolver) userAgent() string {
n, _ := rand.Int(rand.Reader, big.NewInt(int64(len(defaultUserAgents))))
return defaultUserAgents[n.Int64()]
}

View file

@ -1,43 +0,0 @@
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.56
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Whale/3.19.166.16 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.192.400 QQBrowser/11.5.5250.400
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36
Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763
Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61
Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70

View file

@ -0,0 +1,38 @@
package linkcheck
import (
"fmt"
)
const (
chromeMajor = 128
webkitMajor = 537
webkitMinor = 36
)
// generateUserAgent resembles the Chrome user agent generation as
// closely as possible in order to blend into the crowd of browsers
//
// https://github.com/chromium/chromium/blob/58e23d958ee8d2bb4b085c843a18eb28b9da17da/content/common/user_agent.cc
func generateUserAgentHeaders() map[string]string {
return map[string]string{
// New UA hints method
"Sec-CH-UA": fmt.Sprintf(
`"Chromium";v="%[1]d", "Not;A=Brand";v="24", "Google Chrome";v="%[1]d"`,
chromeMajor,
),
// Not a mobile browser
"Sec-CH-UA-Mobile": "?0",
// We're always Windows
"Sec-CH-UA-Platform": "Windows",
// "old" user-agent
"User-Agent": fmt.Sprintf(
"Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) %s Safari/537.36",
"Windows NT 10.0; Win64; x64", // We're always Windows 10 / 11 on x64
fmt.Sprintf("Chrome/%d.0.0.0", chromeMajor), // UA-Reduction enabled
),
}
}