mirror of
https://github.com/Luzifer/twitch-bot.git
synced 2024-12-20 03:41:16 +00:00
[linkdetector] Add more ways of link detection in heuristic mode
Signed-off-by: Knut Ahlers <knut@ahlers.me>
This commit is contained in:
parent
a07ad6fe83
commit
5ec6baaf2c
3 changed files with 29 additions and 13 deletions
|
@ -22,7 +22,7 @@ func Register(args plugins.RegistrationArguments) error {
|
|||
Fields: []plugins.ActionDocumentationField{
|
||||
{
|
||||
Default: "false",
|
||||
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them",
|
||||
Description: "Enable heuristic scans to find links with spaces or other means of obfuscation in them (quite slow and will detect MANY false-positive links, only use for blacklisting links!)",
|
||||
Key: "heuristic",
|
||||
Name: "Heuristic Scan",
|
||||
Optional: true,
|
||||
|
|
|
@ -35,7 +35,6 @@ type (
|
|||
|
||||
var (
|
||||
defaultUserAgents = []string{}
|
||||
dropSet = regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`)
|
||||
linkTest = regexp.MustCompile(`(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]`)
|
||||
numericHost = regexp.MustCompile(`^(?:[0-9]+\.)*[0-9]+(?::[0-9]+)?$`)
|
||||
|
||||
|
@ -61,9 +60,10 @@ func New() *Checker {
|
|||
func (c Checker) HeuristicScanForLinks(message string) []string {
|
||||
return c.scan(message,
|
||||
c.scanPlainNoObfuscate,
|
||||
c.scanObfuscateSpace,
|
||||
c.scanObfuscateSpecialCharsAndSpaces,
|
||||
c.scanDotObfuscation,
|
||||
c.scanObfuscateSpace,
|
||||
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9.:/\s_-]`), ""), // Leave dots intact and just join parts
|
||||
c.scanObfuscateSpecialCharsAndSpaces(regexp.MustCompile(`[^a-zA-Z0-9:/\s_-]`), "."), // Remove dots also and connect by them
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -116,6 +116,9 @@ func (c Checker) resolveFinal(link string, cookieJar *cookiejar.Jar, callStack [
|
|||
return ""
|
||||
}
|
||||
|
||||
// Sanitize host: Trailing dots are valid but not required
|
||||
u.Host = strings.TrimRight(u.Host, ".")
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
|
||||
if err != nil {
|
||||
return ""
|
||||
|
@ -203,10 +206,22 @@ func (c Checker) scanDotObfuscation(message string) (links []string) {
|
|||
func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
||||
// Spammers use spaces in their links to prevent link protection matches
|
||||
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||
return c.scanPartsConnected(parts, "")
|
||||
}
|
||||
|
||||
func (c Checker) scanObfuscateSpecialCharsAndSpaces(set *regexp.Regexp, connector string) func(string) []string {
|
||||
return func(message string) (links []string) {
|
||||
// First clean URL from all characters not acceptable in Domains (plus some extra chars)
|
||||
message = set.ReplaceAllString(message, " ")
|
||||
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||
return c.scanPartsConnected(parts, connector)
|
||||
}
|
||||
}
|
||||
|
||||
func (c Checker) scanPartsConnected(parts []string, connector string) (links []string) {
|
||||
for ptJoin := 2; ptJoin < len(parts); ptJoin++ {
|
||||
for i := 0; i <= len(parts)-ptJoin; i++ {
|
||||
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], ""), c.getJar(), nil, c.userAgent()); link != "" {
|
||||
if link := c.resolveFinal(strings.Join(parts[i:i+ptJoin], connector), c.getJar(), nil, c.userAgent()); link != "" && !str.StringInSlice(link, links) {
|
||||
links = append(links, link)
|
||||
}
|
||||
}
|
||||
|
@ -215,17 +230,11 @@ func (c Checker) scanObfuscateSpace(message string) (links []string) {
|
|||
return links
|
||||
}
|
||||
|
||||
func (c Checker) scanObfuscateSpecialCharsAndSpaces(message string) (links []string) {
|
||||
// First clean URL from all characters not acceptable in Domains (plus some extra chars)
|
||||
message = dropSet.ReplaceAllString(message, "")
|
||||
return c.scanObfuscateSpace(message)
|
||||
}
|
||||
|
||||
func (c Checker) scanPlainNoObfuscate(message string) (links []string) {
|
||||
parts := regexp.MustCompile(`\s+`).Split(message, -1)
|
||||
|
||||
for _, part := range parts {
|
||||
if link := c.resolveFinal(part, c.getJar(), nil, c.userAgent()); link != "" {
|
||||
if link := c.resolveFinal(part, c.getJar(), nil, c.userAgent()); link != "" && !str.StringInSlice(link, links) {
|
||||
links = append(links, link)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -170,9 +170,16 @@ func TestScanForLinks(t *testing.T) {
|
|||
Message: "Hey there, see my new project on exa mpl e. com! Get it fast now!",
|
||||
ExpectedLinks: []string{"http://example.com"},
|
||||
},
|
||||
// Case: Dot in the end of the link with space
|
||||
{
|
||||
Heuristic: true,
|
||||
Message: "See example com. Nice testing stuff there!",
|
||||
ExpectedLinks: []string{"http://example.com"},
|
||||
},
|
||||
// Case: false positives
|
||||
{Heuristic: true, Message: "game dot exe has stopped working", ExpectedLinks: nil},
|
||||
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
||||
{Heuristic: false, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: nil},
|
||||
{Heuristic: true, Message: "You're following since 12.12.2020 DogChamp", ExpectedLinks: []string{"http://You.re"}},
|
||||
{Heuristic: false, Message: "Hey btw. es kann sein, dass", ExpectedLinks: nil},
|
||||
} {
|
||||
t.Run(fmt.Sprintf("h:%v lc:%d m:%s", testCase.Heuristic, len(testCase.ExpectedLinks), testCase.Message), func(t *testing.T) {
|
||||
|
|
Loading…
Reference in a new issue