[linkdetector] Add new option to enable heuristic scan
Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
parent
d74105bc00
commit
a74f210cd2
3 changed files with 98 additions and 31 deletions
internal
|
@ -9,6 +9,8 @@ import (
|
||||||
|
|
||||||
const actorName = "linkdetector"
|
const actorName = "linkdetector"
|
||||||
|
|
||||||
|
var ptrFalse = func(v bool) *bool { return &v }(false)
|
||||||
|
|
||||||
func Register(args plugins.RegistrationArguments) error {
|
func Register(args plugins.RegistrationArguments) error {
|
||||||
args.RegisterActor(actorName, func() plugins.Actor { return &Actor{} })
|
args.RegisterActor(actorName, func() plugins.Actor { return &Actor{} })
|
||||||
|
|
||||||
|
@ -16,6 +18,18 @@ func Register(args plugins.RegistrationArguments) error {
|
||||||
Description: `Scans for links in the message and adds the "links" field to the event data`,
|
Description: `Scans for links in the message and adds the "links" field to the event data`,
|
||||||
Name: "Scan for Links",
|
Name: "Scan for Links",
|
||||||
Type: actorName,
|
Type: actorName,
|
||||||
|
|
||||||
|
Fields: []plugins.ActionDocumentationField{
|
||||||
|
{
|
||||||
|
Default: "false",
|
||||||
|
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them",
|
||||||
|
Key: "heuristic",
|
||||||
|
Name: "Heuristic Scan",
|
||||||
|
Optional: true,
|
||||||
|
SupportTemplate: false,
|
||||||
|
Type: plugins.ActionDocumentationFieldTypeBool,
|
||||||
|
},
|
||||||
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -23,13 +37,18 @@ func Register(args plugins.RegistrationArguments) error {
|
||||||
|
|
||||||
type Actor struct{}
|
type Actor struct{}
|
||||||
|
|
||||||
func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, _ *plugins.FieldCollection) (preventCooldown bool, err error) {
|
func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, attrs *plugins.FieldCollection) (preventCooldown bool, err error) {
|
||||||
if eventData.HasAll("links") {
|
if eventData.HasAll("links") {
|
||||||
// We already detected links, let's not do it again
|
// We already detected links, let's not do it again
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing()))
|
if attrs.MustBool("heuristic", ptrFalse) {
|
||||||
|
eventData.Set("links", linkcheck.New().HeuristicScanForLinks(m.Trailing()))
|
||||||
|
} else {
|
||||||
|
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing()))
|
||||||
|
}
|
||||||
|
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -55,22 +55,23 @@ func New() *Checker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ScanForLinks takes a message and tries to find links within that
|
// HeuristicScanForLinks takes a message and tries to find links
|
||||||
// message. Common methods like putting spaces into links are tried
|
// within that message. It also tries to see through common obfuscation
|
||||||
// to circumvent.
|
// methods such as putting spaces into links.
|
||||||
func (c Checker) ScanForLinks(message string) (links []string) {
|
func (c Checker) HeuristicScanForLinks(message string) []string {
|
||||||
for _, scanner := range []func(string) []string{
|
return c.scan(message,
|
||||||
c.scanPlainNoObfuscate,
|
c.scanPlainNoObfuscate,
|
||||||
c.scanObfuscateSpace,
|
c.scanObfuscateSpace,
|
||||||
c.scanObfuscateSpecialCharsAndSpaces,
|
c.scanObfuscateSpecialCharsAndSpaces,
|
||||||
c.scanDotObfuscation,
|
c.scanDotObfuscation,
|
||||||
} {
|
)
|
||||||
if links = scanner(message); links != nil {
|
}
|
||||||
return links
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return links
|
// ScanForLinks takes a message and tries to find links within that
|
||||||
|
// message. This only detects links without any means of obfuscation
|
||||||
|
// like putting spaces into the link.
|
||||||
|
func (c Checker) ScanForLinks(message string) (links []string) {
|
||||||
|
return c.scan(message, c.scanPlainNoObfuscate)
|
||||||
}
|
}
|
||||||
|
|
||||||
// resolveFinal takes a link and looks up the final destination of
|
// resolveFinal takes a link and looks up the final destination of
|
||||||
|
@ -184,6 +185,16 @@ func (Checker) getJar() *cookiejar.Jar {
|
||||||
return jar
|
return jar
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c Checker) scan(message string, scanFns ...func(string) []string) (links []string) {
|
||||||
|
for _, scanner := range scanFns {
|
||||||
|
if links = scanner(message); links != nil {
|
||||||
|
return links
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return links
|
||||||
|
}
|
||||||
|
|
||||||
func (c Checker) scanDotObfuscation(message string) (links []string) {
|
func (c Checker) scanDotObfuscation(message string) (links []string) {
|
||||||
message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".")
|
message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".")
|
||||||
return c.scanPlainNoObfuscate(message)
|
return c.scanPlainNoObfuscate(message)
|
||||||
|
@ -193,9 +204,11 @@ func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
||||||
// Spammers use spaces in their links to prevent link protection matches
|
// Spammers use spaces in their links to prevent link protection matches
|
||||||
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||||
|
|
||||||
for i := 0; i < len(parts)-1; i++ {
|
for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
|
||||||
if link := c.resolveFinal(strings.Join(parts[i:i+2], ""), c.getJar(), nil, c.userAgent()); link != "" {
|
for i := 0; i <= len(parts)-ptJoin; i++ {
|
||||||
links = append(links, link)
|
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" {
|
||||||
|
links = append(links, link)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,7 @@ func TestMaxRedirects(t *testing.T) {
|
||||||
assert.Equal(t, []string{fmt.Sprintf("%s/%d", ts.URL, maxRedirects)}, c.ScanForLinks(msg))
|
assert.Equal(t, []string{fmt.Sprintf("%s/%d", ts.URL, maxRedirects)}, c.ScanForLinks(msg))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//nolint:funlen
|
||||||
func TestScanForLinks(t *testing.T) {
|
func TestScanForLinks(t *testing.T) {
|
||||||
if testing.Short() {
|
if testing.Short() {
|
||||||
t.SkipNow()
|
t.SkipNow()
|
||||||
|
@ -62,54 +63,64 @@ func TestScanForLinks(t *testing.T) {
|
||||||
c := New()
|
c := New()
|
||||||
|
|
||||||
for _, testCase := range []struct {
|
for _, testCase := range []struct {
|
||||||
|
Heuristic bool
|
||||||
Message string
|
Message string
|
||||||
ExpectedLinks []string
|
ExpectedLinks []string
|
||||||
}{
|
}{
|
||||||
// Case: full URL is present in the message
|
// Case: full URL is present in the message
|
||||||
{
|
{
|
||||||
Message: "https://example.com",
|
Heuristic: false,
|
||||||
|
Message: "https://example.com",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://example.com",
|
"https://example.com",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: full bitly link is present in the message
|
// Case: full bitly link is present in the message
|
||||||
{
|
{
|
||||||
Message: "https://bit.ly/438obkJ",
|
Heuristic: false,
|
||||||
|
Message: "https://bit.ly/438obkJ",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://example.com/",
|
"https://example.com/",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: link is present just without the protocol
|
// Case: link is present just without the protocol
|
||||||
{
|
{
|
||||||
Message: "Here, take a look at this: bit.ly/438obkJ",
|
Heuristic: false,
|
||||||
|
Message: "Here, take a look at this: bit.ly/438obkJ",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://example.com/",
|
"https://example.com/",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: message with vk.cc shortener
|
// Case: message with vk.cc shortener
|
||||||
{
|
{
|
||||||
Message: "See more here: vk.cc/ckGZN2",
|
Heuristic: false,
|
||||||
|
Message: "See more here: vk.cc/ckGZN2",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://vk.com/club206261664",
|
"https://vk.com/club206261664",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: link is obfuscated using space
|
// Case: link is obfuscated using space
|
||||||
{
|
{
|
||||||
Message: "Take a look at example. com",
|
Heuristic: true,
|
||||||
|
Message: "Take a look at example. com",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"http://example.com",
|
"http://example.com",
|
||||||
|
"http://www.atexample.com/",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: link is obfuscated using space and braces
|
// Case: link is obfuscated using space and braces
|
||||||
{
|
{
|
||||||
Message: "Take a look at example. (com)",
|
Heuristic: true,
|
||||||
|
Message: "Take a look at example. (com)",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"http://example.com",
|
"http://example.com",
|
||||||
|
"http://www.atexample.com/",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: multiple links in one message
|
// Case: multiple links in one message
|
||||||
{
|
{
|
||||||
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ",
|
Heuristic: false,
|
||||||
|
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT",
|
"https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT",
|
||||||
"https://example.com/",
|
"https://example.com/",
|
||||||
|
@ -117,38 +128,62 @@ func TestScanForLinks(t *testing.T) {
|
||||||
},
|
},
|
||||||
// Case: obfuscation with "dot"
|
// Case: obfuscation with "dot"
|
||||||
{
|
{
|
||||||
Message: "I'm live now on twitch dot tv/twitch",
|
Heuristic: true,
|
||||||
|
Message: "I'm live now on twitch dot tv/twitch",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://www.twitch.tv/twitch",
|
"https://www.twitch.tv/twitch",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: enhanced "dot" obfuscation
|
// Case: enhanced "dot" obfuscation
|
||||||
{
|
{
|
||||||
Message: "You can visit Archive(Dot) org in your browser",
|
Heuristic: true,
|
||||||
|
Message: "You can visit Archive(Dot) org in your browser",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"http://Archive.org",
|
"http://Archive.org",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: Youtube does weird stuff
|
// Case: Youtube does weird stuff
|
||||||
{
|
{
|
||||||
Message: "https://luziferus.tv/youtube",
|
Heuristic: false,
|
||||||
|
Message: "https://luziferus.tv/youtube",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://www.youtube.com/channel/UCjsRmaAQ0IHR2CNEBqfNOSQ",
|
"https://www.youtube.com/channel/UCjsRmaAQ0IHR2CNEBqfNOSQ",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Case: Instagram also does weird things
|
// Case: Instagram also does weird things
|
||||||
{
|
{
|
||||||
Message: "https://bit.ly/3KHpJuy",
|
Heuristic: false,
|
||||||
|
Message: "https://bit.ly/3KHpJuy",
|
||||||
ExpectedLinks: []string{
|
ExpectedLinks: []string{
|
||||||
"https://www.instagram.com/instagram/",
|
"https://www.instagram.com/instagram/",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
// Case: Heuristic enabled with a German sentence accidentally
|
||||||
|
// forming a valid link to a Spanish site (btw.es) - known and
|
||||||
|
// valid false-positive
|
||||||
|
{
|
||||||
|
Heuristic: true,
|
||||||
|
Message: "Hey btw. es kann sein, dass",
|
||||||
|
ExpectedLinks: []string{"https://trusted.evo-media.eu/btw.es"},
|
||||||
|
},
|
||||||
|
// Case: Multiple spaces in the link
|
||||||
|
{
|
||||||
|
Heuristic: true,
|
||||||
|
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
|
||||||
|
ExpectedLinks: []string{"http://example.com"},
|
||||||
|
},
|
||||||
// Case: false positives
|
// Case: false positives
|
||||||
{Message: "game dot exe has stopped working", ExpectedLinks: nil},
|
{Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
|
||||||
{Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
||||||
|
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
|
||||||
} {
|
} {
|
||||||
t.Run(testCase.Message, func(t *testing.T) {
|
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
|
||||||
linksFound := c.ScanForLinks(testCase.Message)
|
var linksFound []string
|
||||||
|
if testCase.Heuristic {
|
||||||
|
linksFound = c.HeuristicScanForLinks(testCase.Message)
|
||||||
|
} else {
|
||||||
|
linksFound = c.ScanForLinks(testCase.Message)
|
||||||
|
}
|
||||||
sort.Strings(linksFound)
|
sort.Strings(linksFound)
|
||||||
|
|
||||||
assert.Equal(t, testCase.ExpectedLinks, linksFound, "links from message %q", testCase.Message)
|
assert.Equal(t, testCase.ExpectedLinks, linksFound, "links from message %q", testCase.Message)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue