go-latestver/internal/fetcher/html.go

99 lines
3 KiB
Go
Raw Permalink Normal View History

2021-11-22 02:39:25 +00:00
package fetcher
import (
"context"
"regexp"
"time"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xpath"
"github.com/pkg/errors"
"golang.org/x/net/html"
"github.com/Luzifer/go-latestver/internal/database"
"github.com/Luzifer/go_helpers/v2/fieldcollection"
2021-11-22 02:39:25 +00:00
)
/*
* @module html
* @module_desc Downloads website, selects text-node using XPath and optionally applies custom regular expression
*/
2021-11-22 02:39:25 +00:00
// htmlFetcherDefaultRegex is the pattern applied to the selected text node
// when no "regex" attribute is configured. It matches an optional leading "v"
// followed by two or more runs of digits, each optionally followed by a dot,
// and exposes the whole match as capture group 1.
//
// It is a variable rather than a constant because FetchVersion passes its
// address as the default value to MustString.
var htmlFetcherDefaultRegex = `(v?(?:[0-9]+\.?){2,})`
type (
	// HTMLFetcher implements the fetcher interface to monitor versions on
	// websites by xpath queries. The struct has no fields; all configuration
	// is passed to its methods through the attribute collection.
	HTMLFetcher struct{}
)
// init makes the HTML fetcher available under the name "html" by handing a
// constructor for it to registerFetcher.
func init() {
	registerFetcher("html", func() Fetcher {
		return &HTMLFetcher{}
	})
}
// FetchVersion retrieves the latest version for the catalog entry
func (HTMLFetcher) FetchVersion(_ context.Context, attrs *fieldcollection.FieldCollection) (string, time.Time, error) {
2021-11-22 02:39:25 +00:00
doc, err := htmlquery.LoadURL(attrs.MustString("url", nil))
if err != nil {
return "", time.Time{}, errors.Wrap(err, "loading URL")
}
node, err := htmlquery.Query(doc, attrs.MustString("xpath", nil))
if err != nil {
return "", time.Time{}, errors.Wrap(err, "querying xpath")
}
if node == nil {
return "", time.Time{}, errors.New("xpath expression lead to nil-node")
}
if node.Type == html.ElementNode && node.FirstChild != nil && node.FirstChild.Type == html.TextNode {
2021-11-22 02:39:25 +00:00
node = node.FirstChild
}
if node.Type != html.TextNode {
return "", time.Time{}, errors.Errorf("xpath expression lead to unexpected node type: %d", node.Type)
}
match := regexp.MustCompile(attrs.MustString("regex", &htmlFetcherDefaultRegex)).FindStringSubmatch(node.Data)
if len(match) < 2 { //nolint:mnd // Simple count of fields, no need for constant
2021-11-22 02:39:25 +00:00
return "", time.Time{}, errors.New("regular expression did not yield version")
}
return match[1], time.Now(), nil
}
// Links retrieves a collection of links for the fetcher
func (HTMLFetcher) Links(attrs *fieldcollection.FieldCollection) []database.CatalogLink {
2021-11-22 02:39:25 +00:00
return []database.CatalogLink{
{
IconClass: "fas fa-globe",
Name: "Website",
URL: attrs.MustString("url", nil),
},
}
}
// Validate validates the configuration given to the fetcher
func (HTMLFetcher) Validate(attrs *fieldcollection.FieldCollection) error {
// @attr url required string "" URL to fetch the HTML from
2021-11-22 02:39:25 +00:00
if v, err := attrs.String("url"); err != nil || v == "" {
return errors.New("url is expected to be non-empty string")
}
// @attr xpath required string "" XPath expression leading to the text-node containing the version
2021-11-22 02:39:25 +00:00
if v, err := attrs.String("xpath"); err != nil || v == "" {
return errors.New("xpath is expected to be non-empty string")
}
if _, err := xpath.Compile(attrs.MustString("xpath", nil)); err != nil {
return errors.Wrap(err, "compiling xpath expression")
}
// @attr regex optional string "(v?(?:[0-9]+\.?){2,})" Regular expression to apply to the text from the XPath expression
2021-11-22 02:39:25 +00:00
if attrs.CanString("regex") {
if _, err := regexp.Compile(attrs.MustString("regex", nil)); err != nil {
return errors.Wrap(err, "invalid regex given")
}
}
return nil
}