From e0a8ce36841634d7f782c011310363373c0beb21 Mon Sep 17 00:00:00 2001 From: Knut Ahlers Date: Thu, 5 Sep 2024 11:22:26 +0200 Subject: [PATCH] [linkcheck] Fix: Replace static (deprecated) user-agent list Signed-off-by: Knut Ahlers --- Makefile | 7 ++-- internal/linkcheck/linkcheck_test.go | 30 +++++------------ internal/linkcheck/resolver.go | 48 +++++++++++++--------------- internal/linkcheck/user-agents.txt | 43 ------------------------- internal/linkcheck/useragent.go | 38 ++++++++++++++++++++++ 5 files changed, 73 insertions(+), 93 deletions(-) delete mode 100644 internal/linkcheck/user-agents.txt create mode 100644 internal/linkcheck/useragent.go diff --git a/Makefile b/Makefile index 6bc850f..aa62855 100644 --- a/Makefile +++ b/Makefile @@ -40,9 +40,10 @@ node_modules: # --- Tools -update_ua_list: - # User-Agents provided by https://www.useragents.me/ - curl -sSf https://www.useragents.me/api | jq -r '.data[].ua' | grep -v 'Trident' >internal/linkcheck/user-agents.txt +update-chrome-major: + sed -i -E \ + 's/chromeMajor = [0-9]+/chromeMajor = $(shell curl -sSf https://lv.luzifer.io/v1/catalog/google-chrome/stable/version | cut -d '.' -f 1)/' \ + internal/linkcheck/useragent.go gh-workflow: bash ci/create-workflow.sh diff --git a/internal/linkcheck/linkcheck_test.go b/internal/linkcheck/linkcheck_test.go index 239122e..f1664b7 100644 --- a/internal/linkcheck/linkcheck_test.go +++ b/internal/linkcheck/linkcheck_test.go @@ -57,13 +57,12 @@ func TestScanForLinks(t *testing.T) { t.SkipNow() } - c := New() - for _, testCase := range []struct { Heuristic bool Message string ExpectedLinks []string ExpectedContains bool + TraceStack bool }{ // Case: full URL is present in the message { @@ -183,6 +182,13 @@ func TestScanForLinks(t *testing.T) { {Heuristic: false, Message: "Hey btw. 
es kann sein, dass", ExpectedLinks: nil}, } { t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) { + var c *Checker + if testCase.TraceStack { + c = New(withResolver(newResolver(resolverPoolSize, withTesting(t)))) + } else { + c = New() + } + var linksFound []string if testCase.Heuristic { linksFound = c.HeuristicScanForLinks(testCase.Message) @@ -209,23 +215,3 @@ func TestScanForLinks(t *testing.T) { }) } } - -func TestUserAgentListNotEmpty(t *testing.T) { - if len(defaultUserAgents) == 0 { - t.Fatal("found empty user-agent list") - } -} - -func TestUserAgentRandomizer(t *testing.T) { - uas := map[string]int{} - - for i := 0; i < 10; i++ { - uas[defaultResolver.userAgent()]++ - } - - for _, c := range uas { - assert.Less(t, c, 10) - } - - assert.Equal(t, 0, uas[""]) // there should be no empty UA -} diff --git a/internal/linkcheck/resolver.go b/internal/linkcheck/resolver.go index e854027..f391189 100644 --- a/internal/linkcheck/resolver.go +++ b/internal/linkcheck/resolver.go @@ -2,16 +2,14 @@ package linkcheck import ( "context" - "crypto/rand" - _ "embed" "io" - "math/big" "net/http" "net/http/cookiejar" "net/url" "regexp" "strings" "sync" + "testing" "time" "github.com/sirupsen/logrus" @@ -30,6 +28,8 @@ type ( resolver struct { resolverC chan resolverQueueEntry skipValidation bool + + t *testing.T } resolverQueueEntry struct { @@ -40,20 +40,12 @@ type ( ) var ( - defaultUserAgents = []string{} - linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`) - numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`) - - //go:embed user-agents.txt - uaList string + linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`) + numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`) defaultResolver = newResolver(resolverPoolSize) ) -func init() { - defaultUserAgents = 
strings.Split(strings.TrimSpace(uaList), "\n") -} - func newResolver(poolSize int, opts ...func(*resolver)) *resolver { r := &resolver{ resolverC: make(chan resolverQueueEntry), @@ -74,6 +66,10 @@ func withSkipVerify() func(*resolver) { return func(r *resolver) { r.skipValidation = true } } +func withTesting(t *testing.T) func(*resolver) { + return func(r *resolver) { r.t = t } +} + func (r resolver) Resolve(qe resolverQueueEntry) { qe.WaitGroup.Add(1) r.resolverC <- qe @@ -87,8 +83,8 @@ func (resolver) getJar() *cookiejar.Jar { // resolveFinal takes a link and looks up the final destination of // that link after all redirects were followed // -//nolint:gocyclo -func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string { +//nolint:funlen,gocyclo +func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack) string { if !linkTest.MatchString(link) && !r.skipValidation { return "" } @@ -131,12 +127,19 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack // Sanitize host: Trailing dots are valid but not required u.Host = strings.TrimRight(u.Host, ".") + if r.t != nil { + r.t.Logf("resolving link: link=%q jar_c=%#v stack_c=%d stack_h=%d", + link, len(cookieJar.Cookies(u)), callStack.Count(link), callStack.Height()) + } + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) if err != nil { return "" } - req.Header.Set("User-Agent", userAgent) + for k, v := range generateUserAgentHeaders() { + req.Header.Set(k, v) + } resp, err := client.Do(req) if err != nil { @@ -156,7 +159,7 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack } target := r.resolveReference(u, tu) callStack.Visit(link) - return r.resolveFinal(target, cookieJar, callStack, userAgent) + return r.resolveFinal(target, cookieJar, callStack) } // We got a response, it's no redirect, lets check for in-document stuff @@ -173,14 +176,14 @@ func (r 
resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack } target := r.resolveReference(u, tu) callStack.Visit(link) - return r.resolveFinal(target, cookieJar, callStack, userAgent) + return r.resolveFinal(target, cookieJar, callStack) } if resp.Header.Get("Set-Cookie") != "" { // A new cookie was set, lets refresh the page once to see if stuff // changes with that new cookie callStack.Visit(link) - return r.resolveFinal(u.String(), cookieJar, callStack, userAgent) + return r.resolveFinal(u.String(), cookieJar, callStack) } // We had no in-document redirects: we count this as a success @@ -226,14 +229,9 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string { func (r resolver) runResolver() { for qe := range r.resolverC { - if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" { + if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}); link != "" { qe.Callback(link) } qe.WaitGroup.Done() } } - -func (resolver) userAgent() string { - n, _ := rand.Int(rand.Reader, big.NewInt(int64(len(defaultUserAgents)))) - return defaultUserAgents[n.Int64()] -} diff --git a/internal/linkcheck/user-agents.txt b/internal/linkcheck/user-agents.txt deleted file mode 100644 index 43e1e57..0000000 --- a/internal/linkcheck/user-agents.txt +++ /dev/null @@ -1,43 +0,0 @@ -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63 -Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 -Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57 -Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/109.0.0.0 Safari/537.36 -Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0 -Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 -Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.56 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37 -Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Whale/3.19.166.16 Safari/537.36 -Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76 -Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46 -Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0 -Mozilla/5.0 (Windows NT 
6.3; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0 -Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.192.400 QQBrowser/11.5.5250.400 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36 -Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0 -Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67 -Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0 -Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 -Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763 -Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 -Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63 -Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61 -Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/110.0 -Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70 diff --git a/internal/linkcheck/useragent.go b/internal/linkcheck/useragent.go new file mode 100644 index 0000000..2c549f8 --- /dev/null +++ b/internal/linkcheck/useragent.go @@ -0,0 +1,38 @@ +package linkcheck + +import ( + "fmt" +) + +const ( + chromeMajor = 128 + webkitMajor = 537 + webkitMinor = 36 +) + +// generateUserAgentHeaders resembles the Chrome user agent generation as +// closely as possible in order to blend into the crowd of browsers +// +// https://github.com/chromium/chromium/blob/58e23d958ee8d2bb4b085c843a18eb28b9da17da/content/common/user_agent.cc +func generateUserAgentHeaders() map[string]string { + return map[string]string{ + // New UA hints method + "Sec-CH-UA": fmt.Sprintf( + `"Chromium";v="%[1]d", "Not;A=Brand";v="24", "Google Chrome";v="%[1]d"`, + chromeMajor, + ), + + // Not a mobile browser + "Sec-CH-UA-Mobile": "?0", + + // We're always Windows + "Sec-CH-UA-Platform": "Windows", + + // "old" user-agent + "User-Agent": fmt.Sprintf( + "Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) %s Safari/537.36", + "Windows NT 10.0; Win64; x64", // We're always Windows 10 / 11 on x64 + fmt.Sprintf("Chrome/%d.0.0.0", chromeMajor), // UA-Reduction enabled + ), + } +}