diff --git a/README.md b/README.md
index d40959f..6acfee6 100644
--- a/README.md
+++ b/README.md
@@ -76,9 +76,7 @@ An example Prometheus configuration file demonstrating how to scrape metrics can
 
 ## HTTP Service Discovery
 
-When enabled via the `--ns1.enable-service-discovery` flag, the exporter will also expose an HTTP endpoint `/sd` that can be used to output NS1 DNS records in a format that is compatible with [Prometheus's HTTP service discovery](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config). In order to be kind to NS1 API rate limits, the SD mechanism will update every 5 minutes and cache scrape targets. To override the default SD refresh interval, use the `--ns1.sd-refresh-interval` flag.
-
-> :warning: _NOTE_: The NS1 API has an [account activity](https://ns1.com/api?docId=2285) endpoint that can be used to retrieve recent account activity (such as creating/modifying/deleting DNS records). However, the [ns1-go SDK](https://github.com/ns1/ns1-go) being used by this exporter does not currently support the account activity endpoint. This means that at each HTTP SD refresh interval, the exporter will do a full refresh of all DNS records available to the API token. If/when the go SDK adds support for the account activity endpoint, the HTTP SD mechanism will be updated to use a more intelligent refresh algorithm that polls the activity log on each refresh, only updating the scrape target cache when recent changes are detected to the account's DNS records.
+When enabled via the `--ns1.enable-service-discovery` flag, the exporter will also expose an HTTP endpoint `/sd` that outputs NS1 DNS records in a format compatible with [Prometheus's HTTP service discovery](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config). To be kind to NS1 API rate limits, the SD mechanism polls the `/account/activity` endpoint once per minute and checks whether any recent API actions could have affected its cache; if such actions are detected, the SD mechanism assumes its cache is invalid and refreshes the data it serves for targets. As a failsafe, the SD mechanism allows a maximum of 10 "empty" responses from the account activity endpoint (meaning no activity since the last poll), after which it refreshes its cache regardless to ensure the data is fresh. To override the default SD refresh interval, use the `--ns1.sd-refresh-interval` flag.
 
 Example HTTP SD entry for an `A` record pointing to a testing instance on Hetzner Cloud:
 
diff --git a/cmd/ns1_exporter/ns1_exporter.go b/cmd/ns1_exporter/ns1_exporter.go
index 89d8332..12fa21e 100644
--- a/cmd/ns1_exporter/ns1_exporter.go
+++ b/cmd/ns1_exporter/ns1_exporter.go
@@ -112,7 +112,7 @@ var (
 	flagNS1SDRefreshInterval = kingpin.Flag(
 		"ns1.sd-refresh-interval",
 		"The interval at which targets for Prometheus HTTP service discovery will be refreshed from the NS1 API.",
-	).Default("5m").Duration()
+	).Default("1m").Duration()
 
 	flagNS1SDZoneBlacklistRegex = kingpin.Flag(
 		"ns1.sd-zone-blacklist",
diff --git a/pkg/servicediscovery/sd.go b/pkg/servicediscovery/sd.go
index 24a4fd7..fc0c360 100644
--- a/pkg/servicediscovery/sd.go
+++ b/pkg/servicediscovery/sd.go
@@ -22,12 +22,14 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
 	promModel "github.com/prometheus/common/model"
 	"github.com/samber/lo"
 	api "gopkg.in/ns1/ns1-go.v2/rest"
+	"gopkg.in/ns1/ns1-go.v2/rest/model/account"
 	"gopkg.in/ns1/ns1-go.v2/rest/model/data"
 	"gopkg.in/ns1/ns1-go.v2/rest/model/dns"
 	"gopkg.in/ns1/ns1-go.v2/rest/model/filter"
@@ -67,11 +69,13 @@ type Worker struct {
 	ZoneWhitelist       *regexp.Regexp
 	RecordTypeWhitelist *regexp.Regexp
 
-	logger      log.Logger
-	client      *api.Client
-	zoneCache   map[string]*ns1_internal.Zone
-	recordCache []*dns.Record
-	targetCache []*HTTPSDTarget
+	logger               log.Logger
+	client               *api.Client
+	zoneCache            map[string]*ns1_internal.Zone
+	recordCache          []*dns.Record
+	targetCache          []*HTTPSDTarget
+	lastRefreshTimestamp time.Time
+	pollCount            int
 }
 
 func NewWorker(logger log.Logger, client *api.Client, blacklist, whitelist, recordType *regexp.Regexp) *Worker {
@@ -271,7 +275,7 @@
 	level.Debug(w.logger).Log("msg", "Worker record cache updated", "num_records", len(w.recordCache))
 }
 
-func (w *Worker) Refresh() {
+func (w *Worker) RefreshData() {
 	level.Info(w.logger).Log("msg", "Updating record data from NS1 API")
 	w.RefreshZoneData()
 	w.RefreshRecordData()
@@ -279,6 +283,53 @@
 	w.RefreshPrometheusTargetData()
 }
 
+func (w *Worker) Refresh() {
+	needsRefresh := true
+	ts := time.Now().UTC()
+
+	// if we already have data, poll the account activity log to decide whether to refresh the cache or skip this cycle
+	if w.recordCache != nil {
+		params := []api.Param{
+			{Key: "start", Value: strconv.FormatInt(w.lastRefreshTimestamp.Unix(), 10)},
+			{Key: "limit", Value: "1000"},
+		}
+		activity, _, err := w.client.Activity.List(params...)
+		if err != nil {
+			level.Error(w.logger).Log("msg", "Failed to get account activity from NS1 API", "err", err.Error())
+			metrics.MetricExporterNS1APIFailures.Inc()
+		}
+		w.pollCount++
+
+		switch len(activity) {
+		case 0:
+			if w.pollCount < 10 {
+				needsRefresh = false
+			}
+		default:
+			// activity detected; keep only actions that can affect zones/records
+			var filteredActivity []*account.Activity
+			for _, a := range activity {
+				switch a.ResourceType {
+				case "dns_zone", "record", "notify_list", "datasource", "datafeed", "job":
+					filteredActivity = append(filteredActivity, a)
+				default:
+				}
+			}
+
+			if len(filteredActivity) == 0 && w.pollCount < 10 {
+				needsRefresh = false
+			}
+		}
+	}
+
+	if needsRefresh {
+		w.RefreshData()
+		w.pollCount = 0
+	}
+
+	w.lastRefreshTimestamp = ts
+}
+
 func (w *Worker) ServeHTTP(writer http.ResponseWriter, req *http.Request) {
 	buf, err := json.MarshalIndent(w.targetCache, "", " ")
 	if err != nil {
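
For reference, a minimal Prometheus scrape configuration consuming the `/sd` endpoint might look like the sketch below. The job name and exporter address are placeholders, not defaults taken from this repository, and `refresh_interval` is simply aligned with the new 1-minute SD default.

```yaml
scrape_configs:
  - job_name: ns1_records               # placeholder job name
    http_sd_configs:
      - url: http://localhost:8080/sd   # placeholder exporter address; adjust for your deployment
        refresh_interval: 1m            # keep roughly in step with --ns1.sd-refresh-interval
```

Prometheus re-queries the `/sd` endpoint at each `refresh_interval`; with the activity-gated refresh above, those queries are served from the worker's cache and only trigger a full NS1 API refresh when relevant account activity is detected (or the failsafe poll count is reached).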