[linkcheck] Fix: Replace static (deprecated) user-agent list

Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
Knut Ahlers 2024-09-05 11:22:26 +02:00
parent 5a8459cedc
commit e0a8ce3684
Signed by: luzifer
SSH Key Fingerprint: SHA256:/xtE5lCgiRDQr8SLxHMS92ZBlACmATUmF1crK16Ks4E
5 changed files with 73 additions and 93 deletions

View File

@ -40,9 +40,10 @@ node_modules:
# --- Tools
# Regenerates internal/linkcheck/user-agents.txt from the useragents.me API:
# the `ua` field of every element in `.data` is written one per line, and
# every line containing "Trident" is dropped.
update_ua_list:
# User-Agents provided by https://www.useragents.me/
curl -sSf https://www.useragents.me/api | jq -r '.data[].ua' | grep -v 'Trident' >internal/linkcheck/user-agents.txt
# Rewrites the `chromeMajor = <number>` assignment in
# internal/linkcheck/useragent.go in place. The new number is the first
# dot-separated field of the version string returned by the
# google-chrome/stable catalog endpoint on lv.luzifer.io.
update-chrome-major:
sed -i -E \
's/chromeMajor = [0-9]+/chromeMajor = $(shell curl -sSf https://lv.luzifer.io/v1/catalog/google-chrome/stable/version | cut -d '.' -f 1)/' \
internal/linkcheck/useragent.go
# Runs ci/create-workflow.sh through bash.
# NOTE(review): presumably this regenerates the CI workflow definition;
# confirm against the script itself.
gh-workflow:
bash ci/create-workflow.sh

View File

@ -57,13 +57,12 @@ func TestScanForLinks(t *testing.T) {
t.SkipNow()
}
c := New()
for _, testCase := range []struct {
Heuristic bool
Message string
ExpectedLinks []string
ExpectedContains bool
TraceStack bool
}{
// Case: full URL is present in the message
{
@ -183,6 +182,13 @@ func TestScanForLinks(t *testing.T) {
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
} {
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
var c *Checker
if testCase.TraceStack {
c = New(withResolver(newResolver(resolverPoolSize, withTesting(t))))
} else {
c = New()
}
var linksFound []string
if testCase.Heuristic {
linksFound = c.HeuristicScanForLinks(testCase.Message)
@ -209,23 +215,3 @@ func TestScanForLinks(t *testing.T) {
})
}
}
// TestUserAgentListNotEmpty aborts the test run when the package-level
// defaultUserAgents list contains no entries at all.
func TestUserAgentListNotEmpty(t *testing.T) {
	if count := len(defaultUserAgents); count == 0 {
		t.Fatal("found empty user-agent list")
	}
}
// TestUserAgentRandomizer draws ten user-agents from the default
// resolver and verifies that no single value was returned on every
// draw and that the empty string was never returned.
func TestUserAgentRandomizer(t *testing.T) {
	const draws = 10

	seen := make(map[string]int)
	for n := 0; n < draws; n++ {
		ua := defaultResolver.userAgent()
		seen[ua]++
	}

	// A count equal to the number of draws would mean every draw
	// yielded the same user-agent
	for _, hits := range seen {
		assert.Less(t, hits, draws)
	}

	// there should be no empty UA
	assert.Equal(t, 0, seen[""])
}

View File

@ -2,16 +2,14 @@ package linkcheck
import (
"context"
"crypto/rand"
_ "embed"
"io"
"math/big"
"net/http"
"net/http/cookiejar"
"net/url"
"regexp"
"strings"
"sync"
"testing"
"time"
"github.com/sirupsen/logrus"
@ -30,6 +28,8 @@ type (
// resolver follows links to their final destination. Work is handed
// over through resolverC: Resolve sends entries on it and runResolver
// receives them.
resolver struct {
// resolverC carries the queued entries waiting to be resolved
resolverC chan resolverQueueEntry
// skipValidation disables the linkTest pre-check in resolveFinal
skipValidation bool
// t, when non-nil, receives a trace log line from resolveFinal for
// every link being resolved (set through withTesting)
t *testing.T
}
resolverQueueEntry struct {
@ -40,20 +40,12 @@ type (
)
var (
defaultUserAgents = []string{}
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
//go:embed user-agents.txt
uaList string
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
defaultResolver = newResolver(resolverPoolSize)
)
// init fills defaultUserAgents with one entry per line of the embedded
// uaList after stripping leading and trailing whitespace from the list
// as a whole.
func init() {
	list := strings.TrimSpace(uaList)
	defaultUserAgents = strings.Split(list, "\n")
}
func newResolver(poolSize int, opts ...func(*resolver)) *resolver {
r := &resolver{
resolverC: make(chan resolverQueueEntry),
@ -74,6 +66,10 @@ func withSkipVerify() func(*resolver) {
return func(r *resolver) { r.skipValidation = true }
}
// withTesting returns a resolver option which stores t on the resolver.
// With t set, resolveFinal emits a trace log line through t.Logf for
// every link it resolves.
func withTesting(t *testing.T) func(*resolver) {
	return func(res *resolver) {
		res.t = t
	}
}
func (r resolver) Resolve(qe resolverQueueEntry) {
qe.WaitGroup.Add(1)
r.resolverC <- qe
@ -87,8 +83,8 @@ func (resolver) getJar() *cookiejar.Jar {
// resolveFinal takes a link and looks up the final destination of
// that link after all redirects were followed
//
//nolint:gocyclo
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string {
//nolint:funlen,gocyclo
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack) string {
if !linkTest.MatchString(link) && !r.skipValidation {
return ""
}
@ -131,12 +127,19 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
// Sanitize host: Trailing dots are valid but not required
u.Host = strings.TrimRight(u.Host, ".")
if r.t != nil {
r.t.Logf("resolving link: link=%q jar_c=%#v stack_c=%d stack_h=%d",
link, len(cookieJar.Cookies(u)), callStack.Count(link), callStack.Height())
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
if err != nil {
return ""
}
req.Header.Set("User-Agent", userAgent)
for k, v := range generateUserAgentHeaders() {
req.Header.Set(k, v)
}
resp, err := client.Do(req)
if err != nil {
@ -156,7 +159,7 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
}
target := r.resolveReference(u, tu)
callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack, userAgent)
return r.resolveFinal(target, cookieJar, callStack)
}
// We got a response, it's no redirect, lets check for in-document stuff
@ -173,14 +176,14 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
}
target := r.resolveReference(u, tu)
callStack.Visit(link)
return r.resolveFinal(target, cookieJar, callStack, userAgent)
return r.resolveFinal(target, cookieJar, callStack)
}
if resp.Header.Get("Set-Cookie") != "" {
// A new cookie was set, lets refresh the page once to see if stuff
// changes with that new cookie
callStack.Visit(link)
return r.resolveFinal(u.String(), cookieJar, callStack, userAgent)
return r.resolveFinal(u.String(), cookieJar, callStack)
}
// We had no in-document redirects: we count this as a success
@ -226,14 +229,9 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
// runResolver processes queue entries from resolverC until the channel
// is closed. Each link is resolved through resolveFinal using a cookie
// jar from getJar and an empty call stack; the entry's Callback is
// invoked with the result only when it is non-empty. The entry's
// WaitGroup is marked done afterwards in either case.
func (r resolver) runResolver() {
for qe := range r.resolverC {
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" {
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}); link != "" {
qe.Callback(link)
}
qe.WaitGroup.Done()
}
}
// userAgent returns a randomly chosen entry from defaultUserAgents.
//
// It returns an empty string when the list has no entries, as
// rand.Int panics for a non-positive upper bound. If the random source
// fails, the first entry is returned instead of dereferencing the nil
// result.
func (resolver) userAgent() string {
	if len(defaultUserAgents) == 0 {
		return ""
	}

	n, err := rand.Int(rand.Reader, big.NewInt(int64(len(defaultUserAgents))))
	if err != nil {
		// No randomness available: any valid user-agent is better
		// than crashing the resolver
		return defaultUserAgents[0]
	}

	return defaultUserAgents[n.Int64()]
}

View File

@ -1,43 +0,0 @@
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.56
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Whale/3.19.166.16 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.192.400 QQBrowser/11.5.5250.400
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36
Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763
Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61
Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70

View File

@ -0,0 +1,38 @@
package linkcheck
import (
"fmt"
)
const (
	// chromeMajor is the Chrome major version the generated headers
	// claim to be. The `update-chrome-major` Makefile target rewrites
	// this assignment with sed, so the `chromeMajor = <number>` form
	// must be kept intact.
	chromeMajor = 128
	// webkitMajor and webkitMinor form the version used for the
	// AppleWebKit and Safari tokens of the User-Agent string
	webkitMajor = 537
	webkitMinor = 36
)

// generateUserAgentHeaders resembles the Chrome user agent generation
// as closely as possible in order to blend into the crowd of browsers.
// It returns the request headers (header name => value) to set on an
// outgoing request: the UA client hints plus the classic User-Agent.
//
// https://github.com/chromium/chromium/blob/58e23d958ee8d2bb4b085c843a18eb28b9da17da/content/common/user_agent.cc
func generateUserAgentHeaders() map[string]string {
	webkitVersion := fmt.Sprintf("%d.%d", webkitMajor, webkitMinor)

	return map[string]string{
		// New UA hints method
		"Sec-CH-UA": fmt.Sprintf(
			`"Chromium";v="%[1]d", "Not;A=Brand";v="24", "Google Chrome";v="%[1]d"`,
			chromeMajor,
		),
		// Not a mobile browser
		"Sec-CH-UA-Mobile": "?0",
		// We're always Windows. The value is a structured-field string
		// and therefore must carry its double quotes on the wire, just
		// like the brand names in Sec-CH-UA do.
		"Sec-CH-UA-Platform": `"Windows"`,
		// "old" user-agent
		"User-Agent": fmt.Sprintf(
			"Mozilla/5.0 (%s) AppleWebKit/%s (KHTML, like Gecko) Chrome/%d.0.0.0 Safari/%s",
			"Windows NT 10.0; Win64; x64", // We're always Windows 10 / 11 on x64
			webkitVersion,
			chromeMajor, // UA-Reduction enabled: only the major version is exposed
			webkitVersion,
		),
	}
}