[linkdetector] Add new option to enable heuristic scan

Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
Knut Ahlers 2023-07-24 23:27:25 +02:00
parent d74105bc00
commit a74f210cd2
Signed by: luzifer
GPG key ID: D91C3E91E4CAD6F5
3 changed files with 98 additions and 31 deletions
internal
actors/linkdetector
linkcheck

View file

@ -9,6 +9,8 @@ import (
const actorName = "linkdetector" const actorName = "linkdetector"
// ptrFalse is a pointer to a bool holding false. It is handed to
// attrs.MustBool as its second argument when reading the "heuristic"
// attribute.
// NOTE(review): presumably that argument is the fallback used when
// the attribute is unset - confirm against the FieldCollection API.
var ptrFalse = func(v bool) *bool { return &v }(false)
func Register(args plugins.RegistrationArguments) error { func Register(args plugins.RegistrationArguments) error {
args.RegisterActor(actorName, func() plugins.Actor { return &Actor{} }) args.RegisterActor(actorName, func() plugins.Actor { return &Actor{} })
@ -16,6 +18,18 @@ func Register(args plugins.RegistrationArguments) error {
Description: `Scans for links in the message and adds the "links" field to the event data`, Description: `Scans for links in the message and adds the "links" field to the event data`,
Name: "Scan for Links", Name: "Scan for Links",
Type: actorName, Type: actorName,
Fields: []plugins.ActionDocumentationField{
{
Default: "false",
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them",
Key: "heuristic",
Name: "Heuristic Scan",
Optional: true,
SupportTemplate: false,
Type: plugins.ActionDocumentationFieldTypeBool,
},
},
}) })
return nil return nil
@ -23,13 +37,18 @@ func Register(args plugins.RegistrationArguments) error {
type Actor struct{} type Actor struct{}
func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, _ *plugins.FieldCollection) (preventCooldown bool, err error) { func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, attrs *plugins.FieldCollection) (preventCooldown bool, err error) {
if eventData.HasAll("links") { if eventData.HasAll("links") {
// We already detected links, lets not do it again // We already detected links, lets not do it again
return false, nil return false, nil
} }
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing())) if attrs.MustBool("heuristic", ptrFalse) {
eventData.Set("links", linkcheck.New().HeuristicScanForLinks(m.Trailing()))
} else {
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing()))
}
return false, nil return false, nil
} }

View file

@ -55,22 +55,23 @@ func New() *Checker {
} }
} }
// ScanForLinks takes a message and tries to find links within that // HeuristicScanForLinks takes a message and tries to find links
// message. Common methods like putting spaces into links are tried // within that message. Common methods like putting spaces into links
// to circumvent. // are tried to circumvent.
func (c Checker) ScanForLinks(message string) (links []string) { func (c Checker) HeuristicScanForLinks(message string) []string {
for _, scanner := range []func(string) []string{ return c.scan(message,
c.scanPlainNoObfuscate, c.scanPlainNoObfuscate,
c.scanObfuscateSpace, c.scanObfuscateSpace,
c.scanObfuscateSpecialCharsAndSpaces, c.scanObfuscateSpecialCharsAndSpaces,
c.scanDotObfuscation, c.scanDotObfuscation,
} { )
if links = scanner(message); links != nil { }
return links
}
}
return links // ScanForLinks takes a message and tries to find links within that
// message. This only detects links without any means of obfuscation
// like putting spaces into the link.
func (c Checker) ScanForLinks(message string) (links []string) {
return c.scan(message, c.scanPlainNoObfuscate)
} }
// resolveFinal takes a link and looks up the final destination of // resolveFinal takes a link and looks up the final destination of
@ -184,6 +185,16 @@ func (Checker) getJar() *cookiejar.Jar {
return jar return jar
} }
// scan applies the given scan functions to message one after another,
// in the order they were passed, and returns the result of the first
// one yielding a non-nil slice. Later functions are not invoked once
// a result was found. If no function is given or all of them return
// nil the result is nil.
func (c Checker) scan(message string, scanFns ...func(string) []string) (links []string) {
	for idx := range scanFns {
		found := scanFns[idx](message)
		if found == nil {
			// This strategy had no match, try the next one
			continue
		}

		return found
	}

	return nil
}
func (c Checker) scanDotObfuscation(message string) (links []string) { func (c Checker) scanDotObfuscation(message string) (links []string) {
message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".") message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".")
return c.scanPlainNoObfuscate(message) return c.scanPlainNoObfuscate(message)
@ -193,9 +204,11 @@ func (c Checker) scanObfuscateSpace(message string) (links []string) {
// Spammers use spaces in their links to prevent link protection matches // Spammers use spaces in their links to prevent link protection matches
parts := regexp.MustCompile(`\s+`).Split(message, -1) parts := regexp.MustCompile(`\s+`).Split(message, -1)
for i := 0; i < len(parts)-1; i++ { for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
if link := c.resolveFinal(strings.Join(parts[i:i+2], ""), c.getJar(), nil, c.userAgent()); link != "" { for i := 0; i <= len(parts)-ptJoin; i++ {
links = append(links, link) if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" {
links = append(links, link)
}
} }
} }

View file

@ -54,6 +54,7 @@ func TestMaxRedirects(t *testing.T) {
assert.Equal(t, []string{fmt.Sprintf("%s/%d", ts.URL, maxRedirects)}, c.ScanForLinks(msg)) assert.Equal(t, []string{fmt.Sprintf("%s/%d", ts.URL, maxRedirects)}, c.ScanForLinks(msg))
} }
//nolint:funlen
func TestScanForLinks(t *testing.T) { func TestScanForLinks(t *testing.T) {
if testing.Short() { if testing.Short() {
t.SkipNow() t.SkipNow()
@ -62,54 +63,64 @@ func TestScanForLinks(t *testing.T) {
c := New() c := New()
for _, testCase := range []struct { for _, testCase := range []struct {
Heuristic bool
Message string Message string
ExpectedLinks []string ExpectedLinks []string
}{ }{
// Case: full URL is present in the message // Case: full URL is present in the message
{ {
Message: "https://example.com", Heuristic: false,
Message: "https://example.com",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://example.com", "https://example.com",
}, },
}, },
// Case: full bitly link is present in the message // Case: full bitly link is present in the message
{ {
Message: "https://bit.ly/438obkJ", Heuristic: false,
Message: "https://bit.ly/438obkJ",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://example.com/", "https://example.com/",
}, },
}, },
// Case: link is present just without the protocol // Case: link is present just without the protocol
{ {
Message: "Here, take a look at this: bit.ly/438obkJ", Heuristic: false,
Message: "Here, take a look at this: bit.ly/438obkJ",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://example.com/", "https://example.com/",
}, },
}, },
// Case: message with vk.cc shortener // Case: message with vk.cc shortener
{ {
Message: "See more here: vk.cc/ckGZN2", Heuristic: false,
Message: "See more here: vk.cc/ckGZN2",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://vk.com/club206261664", "https://vk.com/club206261664",
}, },
}, },
// Case: link is obfuscated using space // Case: link is obfuscated using space
{ {
Message: "Take a look at example. com", Heuristic: true,
Message: "Take a look at example. com",
ExpectedLinks: []string{ ExpectedLinks: []string{
"http://example.com", "http://example.com",
"http://www.atexample.com/",
}, },
}, },
// Case: link is obfuscated using space and braces // Case: link is obfuscated using space and braces
{ {
Message: "Take a look at example. (com)", Heuristic: true,
Message: "Take a look at example. (com)",
ExpectedLinks: []string{ ExpectedLinks: []string{
"http://example.com", "http://example.com",
"http://www.atexample.com/",
}, },
}, },
// Case: multiple links in one message // Case: multiple links in one message
{ {
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ", Heuristic: false,
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT", "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT",
"https://example.com/", "https://example.com/",
@ -117,38 +128,62 @@ func TestScanForLinks(t *testing.T) {
}, },
// Case: obfuscation with "dot" // Case: obfuscation with "dot"
{ {
Message: "I'm live now on twitch dot tv/twitch", Heuristic: true,
Message: "I'm live now on twitch dot tv/twitch",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://www.twitch.tv/twitch", "https://www.twitch.tv/twitch",
}, },
}, },
// Case: enhanced "dot" obfuscation // Case: enhanced "dot" obfuscation
{ {
Message: "You can visit Archive(Dot) org in your browser", Heuristic: true,
Message: "You can visit Archive(Dot) org in your browser",
ExpectedLinks: []string{ ExpectedLinks: []string{
"http://Archive.org", "http://Archive.org",
}, },
}, },
// Case: Youtube does weird stuff // Case: Youtube does weird stuff
{ {
Message: "https://luziferus.tv/youtube", Heuristic: false,
Message: "https://luziferus.tv/youtube",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://www.youtube.com/channel/UCjsRmaAQ0IHR2CNEBqfNOSQ", "https://www.youtube.com/channel/UCjsRmaAQ0IHR2CNEBqfNOSQ",
}, },
}, },
// Case: Instagram also does weird things // Case: Instagram also does weird things
{ {
Message: "https://bit.ly/3KHpJuy", Heuristic: false,
Message: "https://bit.ly/3KHpJuy",
ExpectedLinks: []string{ ExpectedLinks: []string{
"https://www.instagram.com/instagram/", "https://www.instagram.com/instagram/",
}, },
}, },
// Case: Heuristic enabled with a German sentence accidentally
// forming a valid link to a spanish site (btw.es) - known and
// valid false-positive
{
Heuristic: true,
Message: "Hey btw. es kann sein, dass",
ExpectedLinks: []string{"https://trusted.evo-media.eu/btw.es"},
},
// Case: Multiple spaces in the link
{
Heuristic: true,
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
ExpectedLinks: []string{"http://example.com"},
},
// Case: false positives // Case: false positives
{Message: "game dot exe has stopped working", ExpectedLinks: nil}, {Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
{Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil}, {Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
} { } {
t.Run(testCase.Message, func(t *testing.T) { t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
linksFound := c.ScanForLinks(testCase.Message) var linksFound []string
if testCase.Heuristic {
linksFound = c.HeuristicScanForLinks(testCase.Message)
} else {
linksFound = c.ScanForLinks(testCase.Message)
}
sort.Strings(linksFound) sort.Strings(linksFound)
assert.Equal(t, testCase.ExpectedLinks, linksFound, "links from message %q", testCase.Message) assert.Equal(t, testCase.ExpectedLinks, linksFound, "links from message %q", testCase.Message)