package linkcheck import ( "context" "crypto/rand" _ "embed" "math/big" "net/http" "net/http/cookiejar" "net/url" "regexp" "strings" "time" "github.com/Luzifer/go_helpers/v2/str" ) const ( // DefaultCheckTimeout defines the default time the request to a site // may take to answer DefaultCheckTimeout = 10 * time.Second maxRedirects = 50 ) type ( // Checker contains logic to detect and resolve links in a message Checker struct { checkTimeout time.Duration userAgents []string skipValidation bool // Only for tests, not settable from the outside } ) var ( defaultUserAgents = []string{} dropSet = regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`) linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`) numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`) //go:embed user-agents.txt uaList string ) func init() { defaultUserAgents = strings.Split(strings.TrimSpace(uaList), "\n") } // New creates a new Checker instance with default settings func New() *Checker { return &Checker{ checkTimeout: DefaultCheckTimeout, userAgents: defaultUserAgents, } } // HeuristicScanForLinks takes a message and tries to find links // within that message. Common methods like putting spaces into links // are tried to circumvent. func (c Checker) HeuristicScanForLinks(message string) []string { return c.scan(message, c.scanPlainNoObfuscate, c.scanObfuscateSpace, c.scanObfuscateSpecialCharsAndSpaces, c.scanDotObfuscation, ) } // ScanForLinks takes a message and tries to find links within that // message. This only detects links without any means of obfuscation // like putting spaces into the link. func (c Checker) ScanForLinks(message string) (links []string) { return c.scan(message, c.scanPlainNoObfuscate) } // resolveFinal takes a link and looks up the final destination of // that link after all redirects were followed func (c Checker) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack []string, userAgent string) string { if !linkTest.MatchString(link) && !c.skipValidation { return "" } if str.StringInSlice(link, callStack) || len(callStack) == maxRedirects { // We got ourselves a loop: Yay! return link } client := &http.Client{ CheckRedirect: func(req *http.Request, via []*http.Request) error { return http.ErrUseLastResponse }, Jar: cookieJar, } ctx, cancel := context.WithTimeout(context.Background(), c.checkTimeout) defer cancel() u, err := url.Parse(link) if err != nil { return "" } if u.Scheme == "" { // We have no scheme and the url is in the path, lets add the // scheme and re-parse the URL to avoid some confusion u.Scheme = "http" u, err = url.Parse(u.String()) if err != nil { return "" } } if numericHost.MatchString(u.Host) && !c.skipValidation { // Host is fully numeric: We don't support scanning that return "" } req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) if err != nil { return "" } req.Header.Set("User-Agent", userAgent) resp, err := client.Do(req) if err != nil { return "" } defer resp.Body.Close() if resp.StatusCode > 299 && resp.StatusCode < 400 { // We got a redirect tu, err := url.Parse(resp.Header.Get("location")) if err != nil { return "" } target := c.resolveReference(u, tu) return c.resolveFinal(target, cookieJar, append(callStack, link), userAgent) } // We got a response, it's no redirect, we count this as a success return u.String() } func (Checker) resolveReference(origin *url.URL, loc *url.URL) string { // Special Case: vkontakte used as shortener / obfuscation if loc.Path == "/away.php" && loc.Query().Has("to") { // VK is doing HTML / JS redirect magic so we take that from them // and execute the redirect directly here in code return loc.Query().Get("to") } if loc.Host == "consent.youtube.com" && loc.Query().Has("continue") { // Youtube links end up in consent page but we want the real // target so we use the continue parameter where we strip the // cbrd query parameters as that one causes an infinite loop. contTarget, err := url.Parse(loc.Query().Get("continue")) if err == nil { v := contTarget.Query() v.Del("cbrd") contTarget.RawQuery = v.Encode() return contTarget.String() } return loc.Query().Get("continue") } if loc.Host == "www.instagram.com" && loc.Query().Has("next") { // Instagram likes its login page, we on the other side don't // care about the sign-in or even the content. Therefore we // just take their redirect target and use that as the next // URL return loc.Query().Get("next") } // Default fallback behavior: Do a normal resolve return origin.ResolveReference(loc).String() } func (Checker) getJar() *cookiejar.Jar { jar, _ := cookiejar.New(nil) return jar } func (c Checker) scan(message string, scanFns ...func(string) []string) (links []string) { for _, scanner := range scanFns { if links = scanner(message); links != nil { return links } } return links } func (c Checker) scanDotObfuscation(message string) (links []string) { message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".") return c.scanPlainNoObfuscate(message) } func (c Checker) scanObfuscateSpace(message string) (links []string) { // Spammers use spaces in their links to prevent link protection matches parts := regexp.MustCompile(`\s+`).Split(message, -1) for ptJoin := 2; ptJoin < len(parts); ptJoin++ { for i := 0; i <= len(parts)-ptJoin; i++ { if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" { links = append(links, link) } } } return links } func (c Checker) scanObfuscateSpecialCharsAndSpaces(message string) (links []string) { // First clean URL from all characters not acceptable in Domains (plus some extra chars) message = dropSet.ReplaceAllString(message, "") return c.scanObfuscateSpace(message) } func (c Checker) scanPlainNoObfuscate(message string) (links []string) { parts := regexp.MustCompile(`\s+`).Split(message, -1) for _, part := range parts { if link := c.resolveFinal(part, c.getJar(), nil, c.userAgent()); link != "" { links = append(links, link) } } return links } func (c Checker) userAgent() string { n, _ := rand.Int(rand.Reader, big.NewInt(int64(len(c.userAgents)))) return c.userAgents[n.Int64()] }