From a0f3937e1901db55cb1f232598b9687c4e16c601 Mon Sep 17 00:00:00 2001
From: TJ Hoplock <t.hoplock@gmail.com>
Date: Mon, 22 Apr 2024 23:48:01 -0400
Subject: [PATCH] feat!: enable activity polling for more efficient SD
 refreshes

I added support to ns1-go to list account activity upstream:
https://github.com/ns1/ns1-go/pull/233

This adds support for utilizing this endpoint for more efficient service
discovery updates.

It uses a similar refresh mechanism to what I used when working on linode
service discovery for prometheus:
https://github.com/prometheus/prometheus/blob/76b0318ed52e655e96a3a4734e3678bf55801616/discovery/linode/linode.go#L183-L231

Rather than refreshing all zone/record cache on every refresh interval,
the SD mechanism will now poll the `/account/activity` endpoint. If
activity is detected that could affect the zone/record data used to
create SD targets, then the SD mechanism will actually go ahead and
refresh its cache. There's a max of 10 polls with no activity detected
before refreshing cache regardless of activity status, to ensure that
cache is valid/recent.
---
 README.md                        |  4 +-
 cmd/ns1_exporter/ns1_exporter.go |  2 +-
 pkg/servicediscovery/sd.go       | 63 +++++++++++++++++++++++++++++---
 3 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index d40959f..6acfee6 100644
--- a/README.md
+++ b/README.md
@@ -76,9 +76,7 @@ An example Prometheus configuration file demonstrating how to scrape metrics can
 
 ## HTTP Service Discovery
 
-When enabled via the `--ns1.enable-service-discovery` flag, the exporter will also expose an HTTP endpoint `/sd` that can be used to output NS1 DNS records in a format that is compatible with [Prometheus's HTTP service discovery](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config). In order to be kind to NS1 API rate limits, the SD mechanism will update every 5 minutes and cache scrape targets. To override the default SD refresh interval, use the `--ns1.sd-refresh-interval` flag.
-
-> :warning: _NOTE_: The NS1 API has an [account activity](https://ns1.com/api?docId=2285) endpoint that can be used to retrieve recent account activity (such as creating/modifying/deleting DNS records). However, the [ns1-go SDK](https://github.com/ns1/ns1-go) being used by this exporter does not currently support the account activity endpoint. This means that at each HTTP SD refresh interval, the exporter will do a full refresh of all DNS records available to the API token. If/when the go SDK adds support for the account activity endpoint, the HTTP SD mechanism will be updated to use a more intelligent refresh algorithm that polls the activity log on each refresh, only updating the scrape target cache when recent changes are detected to the account's DNS records.
+When enabled via the `--ns1.enable-service-discovery` flag, the exporter will also expose an HTTP endpoint `/sd` that can be used to output NS1 DNS records in a format that is compatible with [Prometheus's HTTP service discovery](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config). In order to be kind to NS1 API rate limits, the SD mechanism will poll the `/account/activity` endpoint every minute and check to see if any recent API actions have been performed that would affect the SD's cache; if recent account actions are detected, the SD mechanism will assume its cache is invalid and refresh data to serve for targets. As a failsafe, the SD mechanism will allow a maximum of 10 "empty" responses from the account activity endpoint (meaning no activity since the last poll), at which point it will refresh its cache regardless to ensure it's fresh. To override the default SD refresh interval, use the `--ns1.sd-refresh-interval` flag.
 
 Example HTTP SD entry for an `A` record pointing to a testing instance on Hetzner Cloud:
 
diff --git a/cmd/ns1_exporter/ns1_exporter.go b/cmd/ns1_exporter/ns1_exporter.go
index 89d8332..12fa21e 100644
--- a/cmd/ns1_exporter/ns1_exporter.go
+++ b/cmd/ns1_exporter/ns1_exporter.go
@@ -112,7 +112,7 @@ var (
 	flagNS1SDRefreshInterval = kingpin.Flag(
 		"ns1.sd-refresh-interval",
 		"The interval at which targets for Prometheus HTTP service discovery will be refreshed from the NS1 API.",
-	).Default("5m").Duration()
+	).Default("1m").Duration()
 
 	flagNS1SDZoneBlacklistRegex = kingpin.Flag(
 		"ns1.sd-zone-blacklist",
diff --git a/pkg/servicediscovery/sd.go b/pkg/servicediscovery/sd.go
index 24a4fd7..fc0c360 100644
--- a/pkg/servicediscovery/sd.go
+++ b/pkg/servicediscovery/sd.go
@@ -22,12 +22,14 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
 	promModel "github.com/prometheus/common/model"
 	"github.com/samber/lo"
 	api "gopkg.in/ns1/ns1-go.v2/rest"
+	"gopkg.in/ns1/ns1-go.v2/rest/model/account"
 	"gopkg.in/ns1/ns1-go.v2/rest/model/data"
 	"gopkg.in/ns1/ns1-go.v2/rest/model/dns"
 	"gopkg.in/ns1/ns1-go.v2/rest/model/filter"
@@ -67,11 +69,13 @@ type Worker struct {
 	ZoneWhitelist       *regexp.Regexp
 	RecordTypeWhitelist *regexp.Regexp
 
-	logger      log.Logger
-	client      *api.Client
-	zoneCache   map[string]*ns1_internal.Zone
-	recordCache []*dns.Record
-	targetCache []*HTTPSDTarget
+	logger               log.Logger
+	client               *api.Client
+	zoneCache            map[string]*ns1_internal.Zone
+	recordCache          []*dns.Record
+	targetCache          []*HTTPSDTarget
+	lastRefreshTimestamp time.Time
+	pollCount            int
 }
 
 func NewWorker(logger log.Logger, client *api.Client, blacklist, whitelist, recordType *regexp.Regexp) *Worker {
@@ -271,7 +275,7 @@ func (w *Worker) RefreshRecordData() {
 	level.Debug(w.logger).Log("msg", "Worker record cache updated", "num_records", len(w.recordCache))
 }
 
-func (w *Worker) Refresh() {
+func (w *Worker) RefreshData() {
 	level.Info(w.logger).Log("msg", "Updating record data from NS1 API")
 	w.RefreshZoneData()
 	w.RefreshRecordData()
@@ -279,6 +283,53 @@ func (w *Worker) Refresh() {
 	w.RefreshPrometheusTargetData()
 }
 
+// Refresh polls the NS1 account activity log and rebuilds the caches only
+// when activity that can affect zone/record data is found, or once 10 polls
+// have passed without a refresh. Calls made while the record cache is nil
+// always refresh. A failed poll leaves lastRefreshTimestamp untouched so the
+// next poll re-examines the same window instead of silently skipping it.
+func (w *Worker) Refresh() {
+	needsRefresh := true
+	ts := time.Now().UTC()
+
+	// once data is cached, poll for activity to decide whether the refresh can be skipped
+	if w.recordCache != nil {
+		params := []api.Param{
+			{Key: "start", Value: strconv.FormatInt(w.lastRefreshTimestamp.Unix(), 10)},
+			{Key: "limit", Value: "1000"},
+		}
+		activity, _, err := w.client.Activity.List(params...)
+		w.pollCount++
+		if err != nil {
+			level.Error(w.logger).Log("msg", "Failed to get account activity from NS1 API", "err", err.Error())
+			metrics.MetricExporterNS1APIFailures.Inc()
+			if w.pollCount < 10 {
+				// keep the old timestamp: the activity in this window has not been seen yet
+				return
+			}
+		}
+
+		// failsafe: force a refresh once 10 polls have gone by without one
+		needsRefresh = w.pollCount >= 10
+
+		// otherwise only activity on resources that can affect zones/records invalidates the cache
+		for _, a := range activity {
+			switch a.ResourceType {
+			case "dns_zone", "record", "notify_list", "datasource", "datafeed", "job":
+				needsRefresh = true
+			}
+		}
+	}
+
+	if needsRefresh {
+		w.RefreshData()
+		w.pollCount = 0
+	}
+
+	// the next poll passes this call's start time as its "start" parameter
+	w.lastRefreshTimestamp = ts
+}
+
 func (w *Worker) ServeHTTP(writer http.ResponseWriter, req *http.Request) {
 	buf, err := json.MarshalIndent(w.targetCache, "", "    ")
 	if err != nil {