[linkdetector] Add more ways of link detection in heuristic mode

Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
Knut Ahlers 2023-12-05 18:58:23 +01:00
parent a07ad6fe83
commit 5ec6baaf2c
Signed by: luzifer
GPG key ID: D91C3E91E4CAD6F5
3 changed files with 29 additions and 13 deletions

View file

@ -22,7 +22,7 @@ func Register(args plugins.RegistrationArguments) error {
Fields: []plugins.ActionDocumentationField{
{
Default: "false",
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them",
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them (quite slow and will detect MANY false-positive links, only use for blacklisting links!)",
Key: "heuristic",
Name: "Heuristic Scan",
Optional: true,

View file

@ -35,7 +35,6 @@ type (
var (
defaultUserAgents = []string{}
dropSet = regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`)
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
@ -61,9 +60,10 @@ func New() *Checker {
func (c Checker) HeuristicScanForLinks(message string) []string {
return c.scan(message,
c.scanPlainNoObfuscate,
c.scanObfuscateSpace,
c.scanObfuscateSpecialCharsAndSpaces,
c.scanDotObfuscation,
c.scanObfuscateSpace,
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`), ""), // Leave dots intact and just join parts
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9:/\s_-]`), "."), // Remove dots also and connect by them
)
}
@ -116,6 +116,9 @@ func (c Checker) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack [
return ""
}
// Sanitize host: Trailing dots are valid but not required
u.Host = strings.TrimRight(u.Host, ".")
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
if err != nil {
return ""
@ -203,10 +206,22 @@ func (c Checker) scanDotObfuscation(message string) (links []string) {
// scanObfuscateSpace searches for links whose pieces were pulled apart with
// whitespace. The message is split on runs of whitespace and the resulting
// pieces are handed to scanPartsConnected with an empty connector, so
// neighbouring pieces are glued together without any separator.
func (c Checker) scanObfuscateSpace(message string) (links []string) {
	// Spammers put spaces into their links so link protection does not match
	whitespace := regexp.MustCompile(`\s+`)
	links = c.scanPartsConnected(whitespace.Split(message, -1), "")
	return links
}
// scanObfuscateSpecialCharsAndSpaces builds a scanner function for the given
// character set and connector. The returned function replaces every match of
// set in the message with a single space, splits the outcome on runs of
// whitespace and passes the pieces to scanPartsConnected together with
// connector.
func (c Checker) scanObfuscateSpecialCharsAndSpaces(set *regexp.Regexp, connector string) func(string) []string {
	return func(message string) (links []string) {
		// Turn everything matched by set into a space so those characters
		// act as separators between pieces instead of silently vanishing
		cleaned := set.ReplaceAllString(message, " ")
		pieces := regexp.MustCompile(`\s+`).Split(cleaned, -1)

		links = c.scanPartsConnected(pieces, connector)
		return links
	}
}
func (c Checker) scanPartsConnected(parts []string, connector string) (links []string) {
for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
for i := 0; i <= len(parts)-ptJoin; i++ {
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" {
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], connector), c.getJar(), nil, c.userAgent()); link != "" && !str.StringInSlice(link, links) {
links = append(links, link)
}
}
@ -215,17 +230,11 @@ func (c Checker) scanObfuscateSpace(message string) (links []string) {
return links
}
// scanObfuscateSpecialCharsAndSpaces deletes every character matched by
// dropSet from the message and returns whatever scanObfuscateSpace finds in
// the cleaned text.
func (c Checker) scanObfuscateSpecialCharsAndSpaces(message string) (links []string) {
// Strip all characters matched by dropSet (anything that is not a letter,
// digit, dot, colon, slash, whitespace, underscore or dash); matches are
// removed outright, not replaced by a separator
message = dropSet.ReplaceAllString(message, "")
return c.scanObfuscateSpace(message)
}
func (c Checker) scanPlainNoObfuscate(message string) (links []string) {
parts := regexp.MustCompile(`\s+`).Split(message, -1)
for _, part := range parts {
if link := c.resolveFinal(part, c.getJar(), nil, c.userAgent()); link != "" {
if link := c.resolveFinal(part, c.getJar(), nil, c.userAgent()); link != "" && !str.StringInSlice(link, links) {
links = append(links, link)
}
}

View file

@ -170,9 +170,16 @@ func TestScanForLinks(t *testing.T) {
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
ExpectedLinks: []string{"http://example.com"},
},
// Case: Dot in the end of the link with space
{
Heuristic: true,
Message: "See example com. Nice testing stuff there!",
ExpectedLinks: []string{"http://example.com"},
},
// Case: false positives
{Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
{Heuristic: false, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: []string{"http://You.re"}},
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
} {
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {