[linkdetector] Add new option to enable heuristic scan

Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
Knut Ahlers 2023-07-24 23:27:25 +02:00
parent d74105bc00
commit a74f210cd2
Signed by: luzifer
GPG key ID: D91C3E91E4CAD6F5
3 changed files with 98 additions and 31 deletions

View file

@ -9,6 +9,8 @@ import (
const actorName = "linkdetector"
// ptrFalse is a non-nil pointer to a bool holding false. Execute hands
// it to attrs.MustBool when reading the "heuristic" attribute
// (presumably as the fallback for an unset attribute - confirm against
// the FieldCollection API).
var ptrFalse = func() *bool {
	value := false
	return &value
}()
func Register(args plugins.RegistrationArguments) error {
args.RegisterActor(actorName, func() plugins.Actor { return &Actor{} })
@ -16,6 +18,18 @@ func Register(args plugins.RegistrationArguments) error {
Description: `Scans for links in the message and adds the "links" field to the event data`,
Name: "Scan for Links",
Type: actorName,
Fields: []plugins.ActionDocumentationField{
{
Default: "false",
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them",
Key: "heuristic",
Name: "Heuristic Scan",
Optional: true,
SupportTemplate: false,
Type: plugins.ActionDocumentationFieldTypeBool,
},
},
})
return nil
@ -23,13 +37,18 @@ func Register(args plugins.RegistrationArguments) error {
type Actor struct{}
func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, _ *plugins.FieldCollection) (preventCooldown bool, err error) {
func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, attrs *plugins.FieldCollection) (preventCooldown bool, err error) {
if eventData.HasAll("links") {
// We already detected links, let's not do it again
return false, nil
}
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing()))
if attrs.MustBool("heuristic", ptrFalse) {
eventData.Set("links", linkcheck.New().HeuristicScanForLinks(m.Trailing()))
} else {
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing()))
}
return false, nil
}

View file

@ -55,22 +55,23 @@ func New() *Checker {
}
}
// ScanForLinks takes a message and tries to find links within that
// message. Common methods like putting spaces into links are tried
// to circumvent.
func (c Checker) ScanForLinks(message string) (links []string) {
for _, scanner := range []func(string) []string{
// HeuristicScanForLinks takes a message and tries to find links
// within that message. Besides plain links it also tries to see
// through common obfuscation methods like putting spaces into links.
func (c Checker) HeuristicScanForLinks(message string) []string {
return c.scan(message,
c.scanPlainNoObfuscate,
c.scanObfuscateSpace,
c.scanObfuscateSpecialCharsAndSpaces,
c.scanDotObfuscation,
} {
if links = scanner(message); links != nil {
return links
}
}
)
}
return links
// ScanForLinks searches the given message for links using only the
// plain scanner. Links hidden by obfuscation (for example by putting
// spaces into the link) are not looked for here.
func (c Checker) ScanForLinks(message string) (links []string) {
	links = c.scan(message, c.scanPlainNoObfuscate)
	return links
}
// resolveFinal takes a link and looks up the final destination of
@ -184,6 +185,16 @@ func (Checker) getJar() *cookiejar.Jar {
return jar
}
// scan applies the given scan functions to message in the order they
// were passed and returns the result of the first one yielding a
// non-nil slice. Later functions are not called once one has matched.
// If no function yields a non-nil result (or none was given) nil is
// returned.
func (c Checker) scan(message string, scanFns ...func(string) []string) (links []string) {
	for idx := range scanFns {
		found := scanFns[idx](message)
		if found == nil {
			continue
		}

		return found
	}

	return nil
}
func (c Checker) scanDotObfuscation(message string) (links []string) {
message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".")
return c.scanPlainNoObfuscate(message)
@ -193,9 +204,11 @@ func (c Checker) scanObfuscateSpace(message string) (links []string) {
// Spammers use spaces in their links to prevent link protection matches
parts := regexp.MustCompile(`\s+`).Split(message, -1)
for i := 0; i < len(parts)-1; i++ {
if link := c.resolveFinal(strings.Join(parts[i:i+2], ""), c.getJar(), nil, c.userAgent()); link != "" {
links = append(links, link)
// Join every window of ptJoin consecutive parts, starting with pairs
// and growing up to the whole message, and try to resolve each joined
// string as a link. The upper bound is inclusive: with "<" a message
// consisting of exactly two parts (e.g. "example. com") never entered
// the loop and was therefore never checked.
for ptJoin := 2; ptJoin <= len(parts); ptJoin++ {
	// i is the index of the first part of the window; the last valid
	// start is len(parts)-ptJoin so the slice stays within bounds.
	for i := 0; i <= len(parts)-ptJoin; i++ {
		if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" {
			links = append(links, link)
		}
	}
}

View file

@ -54,6 +54,7 @@ func TestMaxRedirects(t *testing.T) {
assert.Equal(t, []string{fmt.Sprintf("%s/%d", ts.URL, maxRedirects)}, c.ScanForLinks(msg))
}
//nolint:funlen
func TestScanForLinks(t *testing.T) {
if testing.Short() {
t.SkipNow()
@ -62,54 +63,64 @@ func TestScanForLinks(t *testing.T) {
c := New()
for _, testCase := range []struct {
Heuristic bool
Message string
ExpectedLinks []string
}{
// Case: full URL is present in the message
{
Message: "https://example.com",
Heuristic: false,
Message: "https://example.com",
ExpectedLinks: []string{
"https://example.com",
},
},
// Case: full bitly link is present in the message
{
Message: "https://bit.ly/438obkJ",
Heuristic: false,
Message: "https://bit.ly/438obkJ",
ExpectedLinks: []string{
"https://example.com/",
},
},
// Case: link is present just without the protocol
{
Message: "Here, take a look at this: bit.ly/438obkJ",
Heuristic: false,
Message: "Here, take a look at this: bit.ly/438obkJ",
ExpectedLinks: []string{
"https://example.com/",
},
},
// Case: message with vk.cc shortener
{
Message: "See more here: vk.cc/ckGZN2",
Heuristic: false,
Message: "See more here: vk.cc/ckGZN2",
ExpectedLinks: []string{
"https://vk.com/club206261664",
},
},
// Case: link is obfuscated using space
{
Message: "Take a look at example. com",
Heuristic: true,
Message: "Take a look at example. com",
ExpectedLinks: []string{
"http://example.com",
"http://www.atexample.com/",
},
},
// Case: link is obfuscated using space and braces
{
Message: "Take a look at example. (com)",
Heuristic: true,
Message: "Take a look at example. (com)",
ExpectedLinks: []string{
"http://example.com",
"http://www.atexample.com/",
},
},
// Case: multiple links in one message
{
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ",
Heuristic: false,
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ",
ExpectedLinks: []string{
"https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT",
"https://example.com/",
@ -117,38 +128,62 @@ func TestScanForLinks(t *testing.T) {
},
// Case: obfuscation with "dot"
{
Message: "I'm live now on twitch dot tv/twitch",
Heuristic: true,
Message: "I'm live now on twitch dot tv/twitch",
ExpectedLinks: []string{
"https://www.twitch.tv/twitch",
},
},
// Case: enhanced "dot" obfuscation
{
Message: "You can visit Archive(Dot) org in your browser",
Heuristic: true,
Message: "You can visit Archive(Dot) org in your browser",
ExpectedLinks: []string{
"http://Archive.org",
},
},
// Case: Youtube does weird stuff
{
Message: "https://luziferus.tv/youtube",
Heuristic: false,
Message: "https://luziferus.tv/youtube",
ExpectedLinks: []string{
"https://www.youtube.com/channel/UCjsRmaAQ0IHR2CNEBqfNOSQ",
},
},
// Case: Instagram also does weird things
{
Message: "https://bit.ly/3KHpJuy",
Heuristic: false,
Message: "https://bit.ly/3KHpJuy",
ExpectedLinks: []string{
"https://www.instagram.com/instagram/",
},
},
// Case: Heuristic enabled with a German sentence accidentally
// forming a valid link to a Spanish site (btw.es) - known and
// valid false-positive
{
Heuristic: true,
Message: "Hey btw. es kann sein, dass",
ExpectedLinks: []string{"https://trusted.evo-media.eu/btw.es"},
},
// Case: Multiple spaces in the link
{
Heuristic: true,
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
ExpectedLinks: []string{"http://example.com"},
},
// Case: false positives
{Message: "game dot exe has stopped working", ExpectedLinks: nil},
{Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
{Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
} {
t.Run(testCase.Message, func(t *testing.T) {
linksFound := c.ScanForLinks(testCase.Message)
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
var linksFound []string
if testCase.Heuristic {
linksFound = c.HeuristicScanForLinks(testCase.Message)
} else {
linksFound = c.ScanForLinks(testCase.Message)
}
sort.Strings(linksFound)
assert.Equal(t, testCase.ExpectedLinks, linksFound, "links from message %q", testCase.Message)