mirror of
https://github.com/Luzifer/twitch-bot.git
synced 2024-12-20 11:51:17 +00:00
[linkdetector] Add more ways of link detection in heuristic mode
Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
parent
a07ad6fe83
commit
5ec6baaf2c
3 changed files with 29 additions and 13 deletions
|
@ -22,7 +22,7 @@ func Register(args plugins.RegistrationArguments) error {
|
||||||
Fields: []plugins.ActionDocumentationField{
|
Fields: []plugins.ActionDocumentationField{
|
||||||
{
|
{
|
||||||
Default: "false",
|
Default: "false",
|
||||||
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them",
|
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them (quite slow and will detect MANY false-positive links, only use for blacklisting links!)",
|
||||||
Key: "heuristic",
|
Key: "heuristic",
|
||||||
Name: "Heuristic Scan",
|
Name: "Heuristic Scan",
|
||||||
Optional: true,
|
Optional: true,
|
||||||
|
|
|
@ -35,7 +35,6 @@ type (
|
||||||
|
|
||||||
var (
|
var (
|
||||||
defaultUserAgents = []string{}
|
defaultUserAgents = []string{}
|
||||||
dropSet = regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`)
|
|
||||||
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
|
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
|
||||||
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
|
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
|
||||||
|
|
||||||
|
@ -61,9 +60,10 @@ func New() *Checker {
|
||||||
func (c Checker) HeuristicScanForLinks(message string) []string {
|
func (c Checker) HeuristicScanForLinks(message string) []string {
|
||||||
return c.scan(message,
|
return c.scan(message,
|
||||||
c.scanPlainNoObfuscate,
|
c.scanPlainNoObfuscate,
|
||||||
c.scanObfuscateSpace,
|
|
||||||
c.scanObfuscateSpecialCharsAndSpaces,
|
|
||||||
c.scanDotObfuscation,
|
c.scanDotObfuscation,
|
||||||
|
c.scanObfuscateSpace,
|
||||||
|
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`), ""), // Leave dots intact and just join parts
|
||||||
|
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9:/\s_-]`), "."), // Remove dots also and connect by them
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -116,6 +116,9 @@ func (c Checker) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack [
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sanitize host: Trailing dots are valid but not required
|
||||||
|
u.Host = strings.TrimRight(u.Host, ".")
|
||||||
|
|
||||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ""
|
return ""
|
||||||
|
@ -203,10 +206,22 @@ func (c Checker) scanDotObfuscation(message string) (links []string) {
|
||||||
func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
||||||
// Spammers use spaces in their links to prevent link protection matches
|
// Spammers use spaces in their links to prevent link protection matches
|
||||||
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||||
|
return c.scanPartsConnected(parts, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c Checker) scanObfuscateSpecialCharsAndSpaces(set *regexp.Regexp, connector string) func(string) []string {
|
||||||
|
return func(message string) (links []string) {
|
||||||
|
// First clean URL from all characters not acceptable in Domains (plus some extra chars)
|
||||||
|
message = set.ReplaceAllString(message, " ")
|
||||||
|
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||||
|
return c.scanPartsConnected(parts, connector)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c Checker) scanPartsConnected(parts []string, connector string) (links []string) {
|
||||||
for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
|
for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
|
||||||
for i := 0; i <= len(parts)-ptJoin; i++ {
|
for i := 0; i <= len(parts)-ptJoin; i++ {
|
||||||
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" {
|
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], connector), c.getJar(), nil, c.userAgent()); link != "" && !str.StringInSlice(link, links) {
|
||||||
links = append(links, link)
|
links = append(links, link)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -215,17 +230,11 @@ func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
||||||
return links
|
return links
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c Checker) scanObfuscateSpecialCharsAndSpaces(message string) (links []string) {
|
|
||||||
// First clean URL from all characters not acceptable in Domains (plus some extra chars)
|
|
||||||
message = dropSet.ReplaceAllString(message, "")
|
|
||||||
return c.scanObfuscateSpace(message)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c Checker) scanPlainNoObfuscate(message string) (links []string) {
|
func (c Checker) scanPlainNoObfuscate(message string) (links []string) {
|
||||||
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||||
|
|
||||||
for _, part := range parts {
|
for _, part := range parts {
|
||||||
if link := c.resolveFinal(part, c.getJar(), nil, c.userAgent()); link != "" {
|
if link := c.resolveFinal(part, c.getJar(), nil, c.userAgent()); link != "" && !str.StringInSlice(link, links) {
|
||||||
links = append(links, link)
|
links = append(links, link)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -170,9 +170,16 @@ func TestScanForLinks(t *testing.T) {
|
||||||
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
|
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
|
||||||
ExpectedLinks: []string{"http://example.com"},
|
ExpectedLinks: []string{"http://example.com"},
|
||||||
},
|
},
|
||||||
|
// Case: Dot in the end of the link with space
|
||||||
|
{
|
||||||
|
Heuristic: true,
|
||||||
|
Message: "See example com. Nice testing stuff there!",
|
||||||
|
ExpectedLinks: []string{"http://example.com"},
|
||||||
|
},
|
||||||
// Case: false positives
|
// Case: false positives
|
||||||
{Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
|
{Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
|
||||||
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
{Heuristic: false, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
||||||
|
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: []string{"http://You.re"}},
|
||||||
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
|
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
|
||||||
} {
|
} {
|
||||||
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
|
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
|
||||||
|
|
Loading…
Reference in a new issue