mirror of
https://github.com/Luzifer/twitch-bot.git
synced 2024-11-09 08:40:01 +00:00
[linkcheck] Fix: Replace static (deprecated) user-agent list
Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
parent
5a8459cedc
commit
e0a8ce3684
5 changed files with 73 additions and 93 deletions
7
Makefile
7
Makefile
|
@ -40,9 +40,10 @@ node_modules:
|
||||||
|
|
||||||
# --- Tools
|
# --- Tools
|
||||||
|
|
||||||
update_ua_list:
|
update-chrome-major:
|
||||||
# User-Agents provided by https://www.useragents.me/
|
sed -i -E \
|
||||||
curl -sSf https://www.useragents.me/api | jq -r '.data[].ua' | grep -v 'Trident' >internal/linkcheck/user-agents.txt
|
's/chromeMajor = [0-9]+/chromeMajor = $(shell curl -sSf https://lv.luzifer.io/v1/catalog/google-chrome/stable/version | cut -d '.' -f 1)/' \
|
||||||
|
internal/linkcheck/useragent.go
|
||||||
|
|
||||||
gh-workflow:
|
gh-workflow:
|
||||||
bash ci/create-workflow.sh
|
bash ci/create-workflow.sh
|
||||||
|
|
|
@ -57,13 +57,12 @@ func TestScanForLinks(t *testing.T) {
|
||||||
t.SkipNow()
|
t.SkipNow()
|
||||||
}
|
}
|
||||||
|
|
||||||
c := New()
|
|
||||||
|
|
||||||
for _, testCase := range []struct {
|
for _, testCase := range []struct {
|
||||||
Heuristic bool
|
Heuristic bool
|
||||||
Message string
|
Message string
|
||||||
ExpectedLinks []string
|
ExpectedLinks []string
|
||||||
ExpectedContains bool
|
ExpectedContains bool
|
||||||
|
TraceStack bool
|
||||||
}{
|
}{
|
||||||
// Case: full URL is present in the message
|
// Case: full URL is present in the message
|
||||||
{
|
{
|
||||||
|
@ -183,6 +182,13 @@ func TestScanForLinks(t *testing.T) {
|
||||||
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
|
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
|
||||||
} {
|
} {
|
||||||
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
|
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
|
||||||
|
var c *Checker
|
||||||
|
if testCase.TraceStack {
|
||||||
|
c = New(withResolver(newResolver(resolverPoolSize, withTesting(t))))
|
||||||
|
} else {
|
||||||
|
c = New()
|
||||||
|
}
|
||||||
|
|
||||||
var linksFound []string
|
var linksFound []string
|
||||||
if testCase.Heuristic {
|
if testCase.Heuristic {
|
||||||
linksFound = c.HeuristicScanForLinks(testCase.Message)
|
linksFound = c.HeuristicScanForLinks(testCase.Message)
|
||||||
|
@ -209,23 +215,3 @@ func TestScanForLinks(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestUserAgentListNotEmpty(t *testing.T) {
|
|
||||||
if len(defaultUserAgents) == 0 {
|
|
||||||
t.Fatal("found empty user-agent list")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUserAgentRandomizer(t *testing.T) {
|
|
||||||
uas := map[string]int{}
|
|
||||||
|
|
||||||
for i := 0; i < 10; i++ {
|
|
||||||
uas[defaultResolver.userAgent()]++
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, c := range uas {
|
|
||||||
assert.Less(t, c, 10)
|
|
||||||
}
|
|
||||||
|
|
||||||
assert.Equal(t, 0, uas[""]) // there should be no empty UA
|
|
||||||
}
|
|
||||||
|
|
|
@ -2,16 +2,14 @@ package linkcheck
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"crypto/rand"
|
|
||||||
_ "embed"
|
|
||||||
"io"
|
"io"
|
||||||
"math/big"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/cookiejar"
|
"net/http/cookiejar"
|
||||||
"net/url"
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
|
@ -30,6 +28,8 @@ type (
|
||||||
resolver struct {
|
resolver struct {
|
||||||
resolverC chan resolverQueueEntry
|
resolverC chan resolverQueueEntry
|
||||||
skipValidation bool
|
skipValidation bool
|
||||||
|
|
||||||
|
t *testing.T
|
||||||
}
|
}
|
||||||
|
|
||||||
resolverQueueEntry struct {
|
resolverQueueEntry struct {
|
||||||
|
@ -40,20 +40,12 @@ type (
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
defaultUserAgents = []string{}
|
|
||||||
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
|
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
|
||||||
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
|
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
|
||||||
|
|
||||||
//go:embed user-agents.txt
|
|
||||||
uaList string
|
|
||||||
|
|
||||||
defaultResolver = newResolver(resolverPoolSize)
|
defaultResolver = newResolver(resolverPoolSize)
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
|
||||||
defaultUserAgents = strings.Split(strings.TrimSpace(uaList), "\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
func newResolver(poolSize int, opts ...func(*resolver)) *resolver {
|
func newResolver(poolSize int, opts ...func(*resolver)) *resolver {
|
||||||
r := &resolver{
|
r := &resolver{
|
||||||
resolverC: make(chan resolverQueueEntry),
|
resolverC: make(chan resolverQueueEntry),
|
||||||
|
@ -74,6 +66,10 @@ func withSkipVerify() func(*resolver) {
|
||||||
return func(r *resolver) { r.skipValidation = true }
|
return func(r *resolver) { r.skipValidation = true }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func withTesting(t *testing.T) func(*resolver) {
|
||||||
|
return func(r *resolver) { r.t = t }
|
||||||
|
}
|
||||||
|
|
||||||
func (r resolver) Resolve(qe resolverQueueEntry) {
|
func (r resolver) Resolve(qe resolverQueueEntry) {
|
||||||
qe.WaitGroup.Add(1)
|
qe.WaitGroup.Add(1)
|
||||||
r.resolverC <- qe
|
r.resolverC <- qe
|
||||||
|
@ -87,8 +83,8 @@ func (resolver) getJar() *cookiejar.Jar {
|
||||||
// resolveFinal takes a link and looks up the final destination of
|
// resolveFinal takes a link and looks up the final destination of
|
||||||
// that link after all redirects were followed
|
// that link after all redirects were followed
|
||||||
//
|
//
|
||||||
//nolint:gocyclo
|
//nolint:funlen,gocyclo
|
||||||
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack, userAgent string) string {
|
func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack *stack) string {
|
||||||
if !linkTest.MatchString(link) && !r.skipValidation {
|
if !linkTest.MatchString(link) && !r.skipValidation {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
@ -131,12 +127,19 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
|
||||||
// Sanitize host: Trailing dots are valid but not required
|
// Sanitize host: Trailing dots are valid but not required
|
||||||
u.Host = strings.TrimRight(u.Host, ".")
|
u.Host = strings.TrimRight(u.Host, ".")
|
||||||
|
|
||||||
|
if r.t != nil {
|
||||||
|
r.t.Logf("resolving link: link=%q jar_c=%#v stack_c=%d stack_h=%d",
|
||||||
|
link, len(cookieJar.Cookies(u)), callStack.Count(link), callStack.Height())
|
||||||
|
}
|
||||||
|
|
||||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
req.Header.Set("User-Agent", userAgent)
|
for k, v := range generateUserAgentHeaders() {
|
||||||
|
req.Header.Set(k, v)
|
||||||
|
}
|
||||||
|
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -156,7 +159,7 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
|
||||||
}
|
}
|
||||||
target := r.resolveReference(u, tu)
|
target := r.resolveReference(u, tu)
|
||||||
callStack.Visit(link)
|
callStack.Visit(link)
|
||||||
return r.resolveFinal(target, cookieJar, callStack, userAgent)
|
return r.resolveFinal(target, cookieJar, callStack)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We got a response, it's no redirect, lets check for in-document stuff
|
// We got a response, it's no redirect, lets check for in-document stuff
|
||||||
|
@ -173,14 +176,14 @@ func (r resolver) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack
|
||||||
}
|
}
|
||||||
target := r.resolveReference(u, tu)
|
target := r.resolveReference(u, tu)
|
||||||
callStack.Visit(link)
|
callStack.Visit(link)
|
||||||
return r.resolveFinal(target, cookieJar, callStack, userAgent)
|
return r.resolveFinal(target, cookieJar, callStack)
|
||||||
}
|
}
|
||||||
|
|
||||||
if resp.Header.Get("Set-Cookie") != "" {
|
if resp.Header.Get("Set-Cookie") != "" {
|
||||||
// A new cookie was set, lets refresh the page once to see if stuff
|
// A new cookie was set, lets refresh the page once to see if stuff
|
||||||
// changes with that new cookie
|
// changes with that new cookie
|
||||||
callStack.Visit(link)
|
callStack.Visit(link)
|
||||||
return r.resolveFinal(u.String(), cookieJar, callStack, userAgent)
|
return r.resolveFinal(u.String(), cookieJar, callStack)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We had no in-document redirects: we count this as a success
|
// We had no in-document redirects: we count this as a success
|
||||||
|
@ -226,14 +229,9 @@ func (resolver) resolveReference(origin *url.URL, loc *url.URL) string {
|
||||||
|
|
||||||
func (r resolver) runResolver() {
|
func (r resolver) runResolver() {
|
||||||
for qe := range r.resolverC {
|
for qe := range r.resolverC {
|
||||||
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}, r.userAgent()); link != "" {
|
if link := r.resolveFinal(qe.Link, r.getJar(), &stack{}); link != "" {
|
||||||
qe.Callback(link)
|
qe.Callback(link)
|
||||||
}
|
}
|
||||||
qe.WaitGroup.Done()
|
qe.WaitGroup.Done()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (resolver) userAgent() string {
|
|
||||||
n, _ := rand.Int(rand.Reader, big.NewInt(int64(len(defaultUserAgents))))
|
|
||||||
return defaultUserAgents[n.Int64()]
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,43 +0,0 @@
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
|
|
||||||
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57
|
|
||||||
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
|
|
||||||
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
|
|
||||||
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
|
|
||||||
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.56
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Whale/3.19.166.16 Safari/537.36
|
|
||||||
Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76
|
|
||||||
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46
|
|
||||||
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
|
|
||||||
Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.192.400 QQBrowser/11.5.5250.400
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
|
|
||||||
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0
|
|
||||||
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763
|
|
||||||
Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36
|
|
||||||
Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/110.0
|
|
||||||
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70
|
|
38
internal/linkcheck/useragent.go
Normal file
38
internal/linkcheck/useragent.go
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
package linkcheck
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// chromeMajor is the Chrome major version advertised in the
	// generated headers. The `update-chrome-major` Makefile target
	// rewrites this line through sed, so the `chromeMajor = <number>`
	// spelling must be kept as is.
	chromeMajor = 128
	// webkitMajor and webkitMinor make up the AppleWebKit / Safari
	// version token placed into the legacy user-agent string.
	webkitMajor = 537
	webkitMinor = 36
)

// generateUserAgentHeaders returns the set of request headers a desktop
// Chrome on Windows sends to identify itself: the UA client hints
// (Sec-CH-UA*) plus the legacy User-Agent string. It resembles the
// Chrome user agent generation as closely as possible in order to blend
// into the crowd of browsers.
//
// A fresh map is returned on every call, so callers may modify it.
//
// https://github.com/chromium/chromium/blob/58e23d958ee8d2bb4b085c843a18eb28b9da17da/content/common/user_agent.cc
func generateUserAgentHeaders() map[string]string {
	// The same WebKit version appears twice in the legacy string
	// (AppleWebKit/… and Safari/…), so it is built once from the consts.
	webkitVersion := fmt.Sprintf("%d.%d", webkitMajor, webkitMinor)

	return map[string]string{
		// New UA hints method: brand list including the GREASE brand
		"Sec-CH-UA": fmt.Sprintf(
			`"Chromium";v="%[1]d", "Not;A=Brand";v="24", "Google Chrome";v="%[1]d"`,
			chromeMajor,
		),

		// Not a mobile browser
		"Sec-CH-UA-Mobile": "?0",

		// We're always Windows. Client hints are structured-header
		// strings, therefore the double quotes are part of the value.
		"Sec-CH-UA-Platform": `"Windows"`,

		// "old" user-agent; with UA-Reduction enabled only the major
		// version is exposed and the remaining parts are zeroed.
		"User-Agent": fmt.Sprintf(
			"Mozilla/5.0 (%[1]s) AppleWebKit/%[2]s (KHTML, like Gecko) Chrome/%[3]d.0.0.0 Safari/%[2]s",
			"Windows NT 10.0; Win64; x64", // We're always Windows 10 / 11 on x64
			webkitVersion,
			chromeMajor,
		),
	}
}
|
Loading…
Reference in a new issue