2024-01-01 16:52:18 +00:00
|
|
|
// Package linkcheck implements a helper library to search for links
|
|
|
|
// in a message text and validate them by trying to call them
|
2023-04-07 22:41:00 +00:00
|
|
|
package linkcheck
|
|
|
|
|
|
|
|
import (
|
|
|
|
"regexp"
|
|
|
|
"strings"
|
2023-12-06 08:17:32 +00:00
|
|
|
"sync"
|
2023-04-07 22:41:00 +00:00
|
|
|
|
|
|
|
"github.com/Luzifer/go_helpers/v2/str"
|
|
|
|
)
|
|
|
|
|
|
|
|
type (
|
|
|
|
// Checker contains logic to detect and resolve links in a message
|
|
|
|
Checker struct {
|
2023-12-06 08:17:32 +00:00
|
|
|
res *resolver
|
2023-04-07 22:41:00 +00:00
|
|
|
}
|
|
|
|
)
|
|
|
|
|
2023-12-06 08:17:32 +00:00
|
|
|
// New creates a new Checker instance with default settings
|
|
|
|
func New(opts ...func(*Checker)) *Checker {
|
|
|
|
c := &Checker{
|
|
|
|
res: defaultResolver,
|
|
|
|
}
|
2023-04-07 22:41:00 +00:00
|
|
|
|
2023-12-06 08:17:32 +00:00
|
|
|
for _, o := range opts {
|
|
|
|
o(c)
|
|
|
|
}
|
2023-04-07 22:41:00 +00:00
|
|
|
|
2023-12-06 08:17:32 +00:00
|
|
|
return c
|
2023-04-07 22:41:00 +00:00
|
|
|
}
|
|
|
|
|
2023-12-06 08:17:32 +00:00
|
|
|
func withResolver(r *resolver) func(*Checker) {
|
|
|
|
return func(c *Checker) { c.res = r }
|
2023-04-07 22:41:00 +00:00
|
|
|
}
|
|
|
|
|
2023-07-24 21:27:25 +00:00
|
|
|
// HeuristicScanForLinks takes a message and tries to find links
|
|
|
|
// within that message. Common methods like putting spaces into links
|
|
|
|
// are tried to circumvent.
|
|
|
|
func (c Checker) HeuristicScanForLinks(message string) []string {
|
|
|
|
return c.scan(message,
|
2023-04-07 22:41:00 +00:00
|
|
|
c.scanPlainNoObfuscate,
|
|
|
|
c.scanDotObfuscation,
|
2023-12-05 17:58:23 +00:00
|
|
|
c.scanObfuscateSpace,
|
|
|
|
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`), ""), // Leave dots intact and just join parts
|
|
|
|
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9:/\s_-]`), "."), // Remove dots also and connect by them
|
2023-07-24 21:27:25 +00:00
|
|
|
)
|
|
|
|
}
|
2023-04-07 22:41:00 +00:00
|
|
|
|
2023-07-24 21:27:25 +00:00
|
|
|
// ScanForLinks takes a message and tries to find links within that
|
|
|
|
// message. This only detects links without any means of obfuscation
|
|
|
|
// like putting spaces into the link.
|
|
|
|
func (c Checker) ScanForLinks(message string) (links []string) {
|
|
|
|
return c.scan(message, c.scanPlainNoObfuscate)
|
2023-04-07 22:41:00 +00:00
|
|
|
}
|
|
|
|
|
2024-01-01 16:52:18 +00:00
|
|
|
func (Checker) scan(message string, scanFns ...func(string) []string) (links []string) {
|
2023-07-24 21:27:25 +00:00
|
|
|
for _, scanner := range scanFns {
|
|
|
|
if links = scanner(message); links != nil {
|
|
|
|
return links
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return links
|
|
|
|
}
|
|
|
|
|
2023-04-07 22:41:00 +00:00
|
|
|
func (c Checker) scanDotObfuscation(message string) (links []string) {
|
|
|
|
message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".")
|
|
|
|
return c.scanPlainNoObfuscate(message)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
|
|
|
// Spammers use spaces in their links to prevent link protection matches
|
|
|
|
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
2023-12-05 17:58:23 +00:00
|
|
|
return c.scanPartsConnected(parts, "")
|
|
|
|
}
|
2023-04-07 22:41:00 +00:00
|
|
|
|
2023-12-05 17:58:23 +00:00
|
|
|
func (c Checker) scanObfuscateSpecialCharsAndSpaces(set *regexp.Regexp, connector string) func(string) []string {
|
|
|
|
return func(message string) (links []string) {
|
|
|
|
// First clean URL from all characters not acceptable in Domains (plus some extra chars)
|
|
|
|
message = set.ReplaceAllString(message, " ")
|
|
|
|
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
|
|
|
return c.scanPartsConnected(parts, connector)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c Checker) scanPartsConnected(parts []string, connector string) (links []string) {
|
2023-12-06 08:17:32 +00:00
|
|
|
wg := new(sync.WaitGroup)
|
|
|
|
|
2023-07-24 21:27:25 +00:00
|
|
|
for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
|
|
|
|
for i := 0; i <= len(parts)-ptJoin; i++ {
|
2023-12-06 08:17:32 +00:00
|
|
|
c.res.Resolve(resolverQueueEntry{
|
|
|
|
Link: strings.Join(parts[i:i+ptJoin], connector),
|
|
|
|
Callback: func(link string) { links = str.AppendIfMissing(links, link) },
|
|
|
|
WaitGroup: wg,
|
|
|
|
})
|
2023-04-07 22:41:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-06 08:17:32 +00:00
|
|
|
wg.Wait()
|
|
|
|
|
2023-04-07 22:41:00 +00:00
|
|
|
return links
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c Checker) scanPlainNoObfuscate(message string) (links []string) {
|
2023-12-06 08:17:32 +00:00
|
|
|
var (
|
|
|
|
parts = regexp.MustCompile(`\s+`).Split(message, -1)
|
|
|
|
wg = new(sync.WaitGroup)
|
|
|
|
)
|
2023-04-07 22:41:00 +00:00
|
|
|
|
|
|
|
for _, part := range parts {
|
2023-12-06 08:17:32 +00:00
|
|
|
c.res.Resolve(resolverQueueEntry{
|
|
|
|
Link: part,
|
|
|
|
Callback: func(link string) { links = str.AppendIfMissing(links, link) },
|
|
|
|
WaitGroup: wg,
|
|
|
|
})
|
2023-04-07 22:41:00 +00:00
|
|
|
}
|
|
|
|
|
2023-12-06 08:17:32 +00:00
|
|
|
wg.Wait()
|
2023-04-07 22:41:00 +00:00
|
|
|
|
2023-12-06 08:17:32 +00:00
|
|
|
return links
|
2023-04-07 22:41:00 +00:00
|
|
|
}
|