From 74c047cb0364705014126f4b90f724f14ffbcad1 Mon Sep 17 00:00:00 2001 From: Poorna Date: Thu, 1 Aug 2024 17:55:27 -0700 Subject: [PATCH] fix replication last hour metric (#20199) also adding missing recent_backlog_count metric to v3 metrics --- cmd/bucket-stats.go | 11 ++++++++--- cmd/metrics-v3-replication.go | 4 ++++ cmd/metrics-v3.go | 1 + docs/metrics/v3.md | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cmd/bucket-stats.go b/cmd/bucket-stats.go index d691af93b7a5f..a51fbf41d0b7b 100644 --- a/cmd/bucket-stats.go +++ b/cmd/bucket-stats.go @@ -20,6 +20,7 @@ package cmd import ( "fmt" "math" + "sync/atomic" "time" "github.com/minio/madmin-go/v3" @@ -127,8 +128,7 @@ func (l *ReplicationLastHour) getTotal() AccElem { // forwardTo time t, clearing any entries in between. func (l *ReplicationLastHour) forwardTo(t int64) { - tMin := t / 60 - if l.LastMin >= tMin { + if l.LastMin >= t { return } if t-l.LastMin >= 60 { @@ -314,6 +314,9 @@ func (r *ReplicationStats) getNodeQueueStats(bucket string) (qs ReplQNodeStats) qs.XferStats = make(map[RMetricName]XferStats) qs.QStats = r.qCache.getBucketStats(bucket) qs.TgtXferStats = make(map[string]map[RMetricName]XferStats) + qs.MRFStats = ReplicationMRFStats{ + LastFailedCount: atomic.LoadUint64(&r.mrfStats.LastFailedCount), + } r.RLock() defer r.RUnlock() @@ -402,7 +405,9 @@ func (r *ReplicationStats) getNodeQueueStatsSummary() (qs ReplQNodeStats) { qs.ActiveWorkers = globalReplicationStats.ActiveWorkers() qs.XferStats = make(map[RMetricName]XferStats) qs.QStats = r.qCache.getSiteStats() - + qs.MRFStats = ReplicationMRFStats{ + LastFailedCount: atomic.LoadUint64(&r.mrfStats.LastFailedCount), + } r.RLock() defer r.RUnlock() tx := newXferStats() diff --git a/cmd/metrics-v3-replication.go b/cmd/metrics-v3-replication.go index 1961c3304f1ec..da26e0956ef6e 100644 --- a/cmd/metrics-v3-replication.go +++ b/cmd/metrics-v3-replication.go @@ -34,6 +34,7 @@ const ( replicationMaxQueuedBytes = "max_queued_bytes" 
replicationMaxQueuedCount = "max_queued_count" replicationMaxDataTransferRate = "max_data_transfer_rate" + replicationRecentBacklogCount = "recent_backlog_count" ) var ( @@ -61,6 +62,8 @@ var ( "Maximum number of objects queued for replication since server start") replicationMaxDataTransferRateMD = NewGaugeMD(replicationMaxDataTransferRate, "Maximum replication data transfer rate in bytes/sec seen since server start") + replicationRecentBacklogCountMD = NewGaugeMD(replicationRecentBacklogCount, + "Total number of objects seen in replication backlog in the last 5 minutes") ) // loadClusterReplicationMetrics - `MetricsLoaderFn` for cluster replication metrics @@ -91,6 +94,7 @@ func loadClusterReplicationMetrics(ctx context.Context, m MetricValues, c *metri m.Set(replicationCurrentDataTransferRate, tots.Curr) m.Set(replicationMaxDataTransferRate, tots.Peak) } + m.Set(replicationRecentBacklogCount, float64(qs.MRFStats.LastFailedCount)) return nil } diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go index d00a447d379d9..93749258cb11a 100644 --- a/cmd/metrics-v3.go +++ b/cmd/metrics-v3.go @@ -341,6 +341,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { replicationMaxQueuedBytesMD, replicationMaxQueuedCountMD, replicationMaxDataTransferRateMD, + replicationRecentBacklogCountMD, }, loadClusterReplicationMetrics, ) diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md index 8cbf46517a5fc..1805232deaa53 100644 --- a/docs/metrics/v3.md +++ b/docs/metrics/v3.md @@ -275,7 +275,7 @@ Metrics about MinIO site and bucket replication. | `minio_replication_max_queued_bytes` | Maximum number of bytes queued for replication since server start.

Type: gauge | `server` | | `minio_replication_max_queued_count` | Maximum number of objects queued for replication since server start.

Type: gauge | `server` | | `minio_replication_max_data_transfer_rate` | Maximum replication data transfer rate in bytes/sec since server start.

Type: gauge | `server` | - +| `minio_replication_recent_backlog_count` | Total number of objects seen in replication backlog in the last 5 minutes.

Type: gauge | `server` | #### `/bucket/replication` | Name | Description | Labels |