mirror of
https://github.com/Luzifer/twitch-bot.git
synced 2024-12-20 11:51:17 +00:00
[linkdetector] Add new option to enable heuristic scan
Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
parent
d74105bc00
commit
a74f210cd2
3 changed files with 98 additions and 31 deletions
|
@ -9,6 +9,8 @@ import (
|
|||
|
||||
// actorName is the type identifier under which the actor and its
// documentation are registered.
const actorName = "linkdetector"
|
||||
|
||||
// ptrFalse is a pointer to a bool holding false. It is handed to
// MustBool when reading the optional "heuristic" attribute
// (presumably as the fallback value, matching the documented
// default of "false" - confirm against the plugins package).
var ptrFalse = func(v bool) *bool { return &v }(false)
|
||||
|
||||
func Register(args plugins.RegistrationArguments) error {
|
||||
args.RegisterActor(actorName, func() plugins.Actor { return &Actor{} })
|
||||
|
||||
|
@ -16,6 +18,18 @@ func Register(args plugins.RegistrationArguments) error {
|
|||
Description: `Scans for links in the message and adds the "links" field to the event data`,
|
||||
Name: "Scan for Links",
|
||||
Type: actorName,
|
||||
|
||||
Fields: []plugins.ActionDocumentationField{
|
||||
{
|
||||
Default: "false",
|
||||
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them",
|
||||
Key: "heuristic",
|
||||
Name: "Heuristic Scan",
|
||||
Optional: true,
|
||||
SupportTemplate: false,
|
||||
Type: plugins.ActionDocumentationFieldTypeBool,
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
return nil
|
||||
|
@ -23,13 +37,18 @@ func Register(args plugins.RegistrationArguments) error {
|
|||
|
||||
// Actor is the actor registered under the "linkdetector" type. Its
// Execute method scans the message for links and stores the result in
// the "links" field of the event data.
type Actor struct{}
|
||||
|
||||
func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, _ *plugins.FieldCollection) (preventCooldown bool, err error) {
|
||||
func (Actor) Execute(_ *irc.Client, m *irc.Message, _ *plugins.Rule, eventData *plugins.FieldCollection, attrs *plugins.FieldCollection) (preventCooldown bool, err error) {
|
||||
if eventData.HasAll("links") {
|
||||
// We already detected links, let's not do it again
|
||||
return false, nil
|
||||
}
|
||||
|
||||
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing()))
|
||||
if attrs.MustBool("heuristic", ptrFalse) {
|
||||
eventData.Set("links", linkcheck.New().HeuristicScanForLinks(m.Trailing()))
|
||||
} else {
|
||||
eventData.Set("links", linkcheck.New().ScanForLinks(m.Trailing()))
|
||||
}
|
||||
|
||||
return false, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -55,22 +55,23 @@ func New() *Checker {
|
|||
}
|
||||
}
|
||||
|
||||
// ScanForLinks takes a message and tries to find links within that
|
||||
// message. Common methods like putting spaces into links are tried
|
||||
// to circumvent.
|
||||
func (c Checker) ScanForLinks(message string) (links []string) {
|
||||
for _, scanner := range []func(string) []string{
|
||||
// HeuristicScanForLinks takes a message and tries to find links
|
||||
// within that message. It also tries to see through common obfuscation
|
||||
// methods such as putting spaces into links.
|
||||
func (c Checker) HeuristicScanForLinks(message string) []string {
|
||||
return c.scan(message,
|
||||
c.scanPlainNoObfuscate,
|
||||
c.scanObfuscateSpace,
|
||||
c.scanObfuscateSpecialCharsAndSpaces,
|
||||
c.scanDotObfuscation,
|
||||
} {
|
||||
if links = scanner(message); links != nil {
|
||||
return links
|
||||
}
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
return links
|
||||
// ScanForLinks takes a message and tries to find links within that
|
||||
// message. This only detects links without any means of obfuscation
|
||||
// like putting spaces into the link.
|
||||
func (c Checker) ScanForLinks(message string) (links []string) {
|
||||
return c.scan(message, c.scanPlainNoObfuscate)
|
||||
}
|
||||
|
||||
// resolveFinal takes a link and looks up the final destination of
|
||||
|
@ -184,6 +185,16 @@ func (Checker) getJar() *cookiejar.Jar {
|
|||
return jar
|
||||
}
|
||||
|
||||
func (c Checker) scan(message string, scanFns ...func(string) []string) (links []string) {
|
||||
for _, scanner := range scanFns {
|
||||
if links = scanner(message); links != nil {
|
||||
return links
|
||||
}
|
||||
}
|
||||
|
||||
return links
|
||||
}
|
||||
|
||||
func (c Checker) scanDotObfuscation(message string) (links []string) {
|
||||
message = regexp.MustCompile(`(?i)\s*\(?dot\)?\s*`).ReplaceAllString(message, ".")
|
||||
return c.scanPlainNoObfuscate(message)
|
||||
|
@ -193,9 +204,11 @@ func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
|||
// Spammers use spaces in their links to prevent link protection matches
|
||||
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||
|
||||
for i := 0; i < len(parts)-1; i++ {
|
||||
if link := c.resolveFinal(strings.Join(parts[i:i+2], ""), c.getJar(), nil, c.userAgent()); link != "" {
|
||||
links = append(links, link)
|
||||
for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
|
||||
for i := 0; i <= len(parts)-ptJoin; i++ {
|
||||
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" {
|
||||
links = append(links, link)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -54,6 +54,7 @@ func TestMaxRedirects(t *testing.T) {
|
|||
assert.Equal(t, []string{fmt.Sprintf("%s/%d", ts.URL, maxRedirects)}, c.ScanForLinks(msg))
|
||||
}
|
||||
|
||||
//nolint:funlen
|
||||
func TestScanForLinks(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.SkipNow()
|
||||
|
@ -62,54 +63,64 @@ func TestScanForLinks(t *testing.T) {
|
|||
c := New()
|
||||
|
||||
for _, testCase := range []struct {
|
||||
Heuristic bool
|
||||
Message string
|
||||
ExpectedLinks []string
|
||||
}{
|
||||
// Case: full URL is present in the message
|
||||
{
|
||||
Message: "https://example.com",
|
||||
Heuristic: false,
|
||||
Message: "https://example.com",
|
||||
ExpectedLinks: []string{
|
||||
"https://example.com",
|
||||
},
|
||||
},
|
||||
// Case: full bitly link is present in the message
|
||||
{
|
||||
Message: "https://bit.ly/438obkJ",
|
||||
Heuristic: false,
|
||||
Message: "https://bit.ly/438obkJ",
|
||||
ExpectedLinks: []string{
|
||||
"https://example.com/",
|
||||
},
|
||||
},
|
||||
// Case: link is present just without the protocol
|
||||
{
|
||||
Message: "Here, take a look at this: bit.ly/438obkJ",
|
||||
Heuristic: false,
|
||||
Message: "Here, take a look at this: bit.ly/438obkJ",
|
||||
ExpectedLinks: []string{
|
||||
"https://example.com/",
|
||||
},
|
||||
},
|
||||
// Case: message with vk.cc shortener
|
||||
{
|
||||
Message: "See more here: vk.cc/ckGZN2",
|
||||
Heuristic: false,
|
||||
Message: "See more here: vk.cc/ckGZN2",
|
||||
ExpectedLinks: []string{
|
||||
"https://vk.com/club206261664",
|
||||
},
|
||||
},
|
||||
// Case: link is obfuscated using space
|
||||
{
|
||||
Message: "Take a look at example. com",
|
||||
Heuristic: true,
|
||||
Message: "Take a look at example. com",
|
||||
ExpectedLinks: []string{
|
||||
"http://example.com",
|
||||
"http://www.atexample.com/",
|
||||
},
|
||||
},
|
||||
// Case: link is obfuscated using space and braces
|
||||
{
|
||||
Message: "Take a look at example. (com)",
|
||||
Heuristic: true,
|
||||
Message: "Take a look at example. (com)",
|
||||
ExpectedLinks: []string{
|
||||
"http://example.com",
|
||||
"http://www.atexample.com/",
|
||||
},
|
||||
},
|
||||
// Case: multiple links in one message
|
||||
{
|
||||
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ",
|
||||
Heuristic: false,
|
||||
Message: "https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT https://bit.ly/438obkJ",
|
||||
ExpectedLinks: []string{
|
||||
"https://clips.twitch.tv/WrongEnchantingMinkFutureMan-EKlDjYkvDeurO9XT",
|
||||
"https://example.com/",
|
||||
|
@ -117,38 +128,62 @@ func TestScanForLinks(t *testing.T) {
|
|||
},
|
||||
// Case: obfuscation with "dot"
|
||||
{
|
||||
Message: "I'm live now on twitch dot tv/twitch",
|
||||
Heuristic: true,
|
||||
Message: "I'm live now on twitch dot tv/twitch",
|
||||
ExpectedLinks: []string{
|
||||
"https://www.twitch.tv/twitch",
|
||||
},
|
||||
},
|
||||
// Case: enhanced "dot" obfuscation
|
||||
{
|
||||
Message: "You can visit Archive(Dot) org in your browser",
|
||||
Heuristic: true,
|
||||
Message: "You can visit Archive(Dot) org in your browser",
|
||||
ExpectedLinks: []string{
|
||||
"http://Archive.org",
|
||||
},
|
||||
},
|
||||
// Case: Youtube does weird stuff
|
||||
{
|
||||
Message: "https://luziferus.tv/youtube",
|
||||
Heuristic: false,
|
||||
Message: "https://luziferus.tv/youtube",
|
||||
ExpectedLinks: []string{
|
||||
"https://www.youtube.com/channel/UCjsRmaAQ0IHR2CNEBqfNOSQ",
|
||||
},
|
||||
},
|
||||
// Case: Instagram also does weird things
|
||||
{
|
||||
Message: "https://bit.ly/3KHpJuy",
|
||||
Heuristic: false,
|
||||
Message: "https://bit.ly/3KHpJuy",
|
||||
ExpectedLinks: []string{
|
||||
"https://www.instagram.com/instagram/",
|
||||
},
|
||||
},
|
||||
// Case: Heuristic enabled with a German sentence accidentally
|
||||
// forming a valid link to a Spanish site (btw.es) - known and
|
||||
// valid false-positive
|
||||
{
|
||||
Heuristic: true,
|
||||
Message: "Hey btw. es kann sein, dass",
|
||||
ExpectedLinks: []string{"https://trusted.evo-media.eu/btw.es"},
|
||||
},
|
||||
// Case: Multiple spaces in the link
|
||||
{
|
||||
Heuristic: true,
|
||||
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
|
||||
ExpectedLinks: []string{"http://example.com"},
|
||||
},
|
||||
// Case: false positives
|
||||
{Message: "game dot exe has stopped working", ExpectedLinks: nil},
|
||||
{Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
||||
{Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
|
||||
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
||||
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
|
||||
} {
|
||||
t.Run(testCase.Message, func(t *testing.T) {
|
||||
linksFound := c.ScanForLinks(testCase.Message)
|
||||
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
|
||||
var linksFound []string
|
||||
if testCase.Heuristic {
|
||||
linksFound = c.HeuristicScanForLinks(testCase.Message)
|
||||
} else {
|
||||
linksFound = c.ScanForLinks(testCase.Message)
|
||||
}
|
||||
sort.Strings(linksFound)
|
||||
|
||||
assert.Equal(t, testCase.ExpectedLinks, linksFound, "links from message %q", testCase.Message)
|
||||
|
|
Loading…
Reference in a new issue