From 7350a29fec3f34d0b1218786c1a36db4fed58ffb Mon Sep 17 00:00:00 2001 From: Shireesh Anjal <355479+anjalshireesh@users.noreply.github.com> Date: Thu, 7 Dec 2023 02:49:59 +0530 Subject: [PATCH] Capture percentage of cpu load and memory used (#18596) By default the cpu load is the cumulative of all cores. Capture the percentage load (load * 100 / cpu-count) Also capture the percentage memory used (used * 100 / total) --- cmd/metrics-realtime.go | 9 ++- cmd/metrics-resource.go | 36 +++++++--- docs/metrics/prometheus/list.md | 116 ++++++++++++++++++-------------- go.mod | 2 +- go.sum | 4 +- 5 files changed, 102 insertions(+), 65 deletions(-) diff --git a/cmd/metrics-realtime.go b/cmd/metrics-realtime.go index 7754c3025..08e4353d5 100644 --- a/cmd/metrics-realtime.go +++ b/cmd/metrics-realtime.go @@ -91,7 +91,7 @@ func collectLocalMetrics(types madmin.MetricType, opts collectMetricsOpts) (m ma } cm, err := c.Times(false) if err != nil { - m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cputimes)", globalMinioAddr, err.Error())) + m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cpuTimes)", globalMinioAddr, err.Error())) } else { // not collecting per-cpu stats, so there will be only one element if len(cm) == 1 { @@ -100,6 +100,13 @@ func collectLocalMetrics(types madmin.MetricType, opts collectMetricsOpts) (m ma m.Errors = append(m.Errors, fmt.Sprintf("%s: Expected one CPU stat, got %d", globalMinioAddr, len(cm))) } } + cpuCount, err := c.Counts(true) + if err != nil { + m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cpuCount)", globalMinioAddr, err.Error())) + } else { + m.Aggregated.CPU.CPUCount = cpuCount + } + loadStat, err := load.Avg() if err != nil { m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (loadStat)", globalMinioAddr, err.Error())) diff --git a/cmd/metrics-resource.go b/cmd/metrics-resource.go index 171a7caff..c4a677469 100644 --- a/cmd/metrics-resource.go +++ b/cmd/metrics-resource.go @@ -53,6 +53,7 @@ const ( // memory stats memUsed MetricName = 
"used" + memUsedPerc MetricName = "used_perc" memFree MetricName = "free" memShared MetricName = "shared" memBuffers MetricName = "buffers" @@ -60,15 +61,18 @@ const ( memAvailable MetricName = "available" // cpu stats - cpuUser MetricName = "user" - cpuSystem MetricName = "system" - cpuIOWait MetricName = "iowait" - cpuIdle MetricName = "idle" - cpuNice MetricName = "nice" - cpuSteal MetricName = "steal" - cpuLoad1 MetricName = "load1" - cpuLoad5 MetricName = "load5" - cpuLoad15 MetricName = "load15" + cpuUser MetricName = "user" + cpuSystem MetricName = "system" + cpuIOWait MetricName = "iowait" + cpuIdle MetricName = "idle" + cpuNice MetricName = "nice" + cpuSteal MetricName = "steal" + cpuLoad1 MetricName = "load1" + cpuLoad5 MetricName = "load5" + cpuLoad15 MetricName = "load15" + cpuLoad1Perc MetricName = "load1_perc" + cpuLoad5Perc MetricName = "load5_perc" + cpuLoad15Perc MetricName = "load15_perc" ) var ( @@ -126,6 +130,7 @@ func init() { interfaceTxErrors: "Transmit errors in " + interval, total: "Total memory on the node", memUsed: "Used memory on the node", + memUsedPerc: "Used memory percentage on the node", memFree: "Free memory on the node", memShared: "Shared memory on the node", memBuffers: "Buffers memory on the node", @@ -151,6 +156,9 @@ func init() { cpuLoad1: "CPU load average 1min", cpuLoad5: "CPU load average 5min", cpuLoad15: "CPU load average 15min", + cpuLoad1Perc: "CPU load average 1min (perentage)", + cpuLoad5Perc: "CPU load average 5min (percentage)", + cpuLoad15Perc: "CPU load average 15min (percentage)", } resourceMetricsGroups = []*MetricsGroup{ getResourceMetrics(), @@ -283,6 +291,8 @@ func collectLocalResourceMetrics() { stats := hm.Mem.Info updateResourceMetrics(memSubsystem, total, float64(stats.Total), labels, false) updateResourceMetrics(memSubsystem, memUsed, float64(stats.Used), labels, false) + perc := math.Round(float64(stats.Used*100*100)/float64(stats.Total)) / 100 + updateResourceMetrics(memSubsystem, memUsedPerc, perc, 
labels, false) updateResourceMetrics(memSubsystem, memFree, float64(stats.Free), labels, false) updateResourceMetrics(memSubsystem, memShared, float64(stats.Shared), labels, false) updateResourceMetrics(memSubsystem, memBuffers, float64(stats.Buffers), labels, false) @@ -312,6 +322,14 @@ func collectLocalResourceMetrics() { updateResourceMetrics(cpuSubsystem, cpuLoad1, ls.Load1, labels, false) updateResourceMetrics(cpuSubsystem, cpuLoad5, ls.Load5, labels, false) updateResourceMetrics(cpuSubsystem, cpuLoad15, ls.Load15, labels, false) + if hm.CPU.CPUCount > 0 { + perc := math.Round(ls.Load1*100*100/float64(hm.CPU.CPUCount)) / 100 + updateResourceMetrics(cpuSubsystem, cpuLoad1Perc, perc, labels, false) + perc = math.Round(ls.Load5*100*100/float64(hm.CPU.CPUCount)) / 100 + updateResourceMetrics(cpuSubsystem, cpuLoad5Perc, perc, labels, false) + perc = math.Round(ls.Load15*100*100/float64(hm.CPU.CPUCount)) / 100 + updateResourceMetrics(cpuSubsystem, cpuLoad15Perc, perc, labels, false) + } } } break // only one host expected diff --git a/docs/metrics/prometheus/list.md b/docs/metrics/prometheus/list.md index f9294ec31..3cb7edfe5 100644 --- a/docs/metrics/prometheus/list.md +++ b/docs/metrics/prometheus/list.md @@ -345,58 +345,70 @@ For deployments behind a load balancer, use the load balancer hostname instead o ## CPU Metrics -| Name | Description | -|:--------------------------------|:------------------------------| -| `minio_node_cpu_avg_user` | CPU user time. | -| `minio_node_cpu_avg_user_avg` | CPU user time (avg). | -| `minio_node_cpu_avg_user_max` | CPU user time (max). | -| `minio_node_cpu_avg_system` | CPU system time. | -| `minio_node_cpu_avg_system_avg` | CPU system time (avg). | -| `minio_node_cpu_avg_system_max` | CPU system time (max). | -| `minio_node_cpu_avg_idle` | CPU idle time. | -| `minio_node_cpu_avg_idle_avg` | CPU idle time (avg). | -| `minio_node_cpu_avg_idle_max` | CPU idle time (max). | -| `minio_node_cpu_avg_iowait` | CPU ioWait time. 
| -| `minio_node_cpu_avg_iowait_avg` | CPU ioWait time (avg). | -| `minio_node_cpu_avg_iowait_max` | CPU ioWait time (max). | -| `minio_node_cpu_avg_nice` | CPU nice time. | -| `minio_node_cpu_avg_nice_avg` | CPU nice time (avg). | -| `minio_node_cpu_avg_nice_max` | CPU nice time (max). | -| `minio_node_cpu_avg_steal` | CPU steam time. | -| `minio_node_cpu_avg_steal_avg` | CPU steam time (avg). | -| `minio_node_cpu_avg_steal_max` | CPU steam time (max). | -| `minio_node_cpu_avg_load1` | CPU load average 1min. | -| `minio_node_cpu_avg_load1_avg` | CPU load average 1min (avg). | -| `minio_node_cpu_avg_load1_max` | CPU load average 1min (max). | -| `minio_node_cpu_avg_load5` | CPU load average 5min. | -| `minio_node_cpu_avg_load5_avg` | CPU load average 5min (avg). | -| `minio_node_cpu_avg_load5_max` | CPU load average 5min (max). | -| `minio_node_cpu_avg_load15` | CPU load average 15min. | -| `minio_node_cpu_avg_load15_avg` | CPU load average 15min (avg). | -| `minio_node_cpu_avg_load15_max` | CPU load average 15min (max). | +| Name | Description | +|:-------------------------------------|:-------------------------------------------| +| `minio_node_cpu_avg_user` | CPU user time. | +| `minio_node_cpu_avg_user_avg` | CPU user time (avg). | +| `minio_node_cpu_avg_user_max` | CPU user time (max). | +| `minio_node_cpu_avg_system` | CPU system time. | +| `minio_node_cpu_avg_system_avg` | CPU system time (avg). | +| `minio_node_cpu_avg_system_max` | CPU system time (max). | +| `minio_node_cpu_avg_idle` | CPU idle time. | +| `minio_node_cpu_avg_idle_avg` | CPU idle time (avg). | +| `minio_node_cpu_avg_idle_max` | CPU idle time (max). | +| `minio_node_cpu_avg_iowait` | CPU ioWait time. | +| `minio_node_cpu_avg_iowait_avg` | CPU ioWait time (avg). | +| `minio_node_cpu_avg_iowait_max` | CPU ioWait time (max). | +| `minio_node_cpu_avg_nice` | CPU nice time. | +| `minio_node_cpu_avg_nice_avg` | CPU nice time (avg). | +| `minio_node_cpu_avg_nice_max` | CPU nice time (max). 
| +| `minio_node_cpu_avg_steal` | CPU steal time. | +| `minio_node_cpu_avg_steal_avg` | CPU steal time (avg). | +| `minio_node_cpu_avg_steal_max` | CPU steal time (max). | +| `minio_node_cpu_avg_load1` | CPU load average 1min. | +| `minio_node_cpu_avg_load1_avg` | CPU load average 1min (avg). | +| `minio_node_cpu_avg_load1_max` | CPU load average 1min (max). | +| `minio_node_cpu_avg_load1_perc` | CPU load average 1min (percentage). | +| `minio_node_cpu_avg_load1_perc_avg` | CPU load average 1min (percentage) (avg). | +| `minio_node_cpu_avg_load1_perc_max` | CPU load average 1min (percentage) (max). | +| `minio_node_cpu_avg_load5` | CPU load average 5min. | +| `minio_node_cpu_avg_load5_avg` | CPU load average 5min (avg). | +| `minio_node_cpu_avg_load5_max` | CPU load average 5min (max). | +| `minio_node_cpu_avg_load5_perc` | CPU load average 5min (percentage). | +| `minio_node_cpu_avg_load5_perc_avg` | CPU load average 5min (percentage) (avg). | +| `minio_node_cpu_avg_load5_perc_max` | CPU load average 5min (percentage) (max). | +| `minio_node_cpu_avg_load15` | CPU load average 15min. | +| `minio_node_cpu_avg_load15_avg` | CPU load average 15min (avg). | +| `minio_node_cpu_avg_load15_max` | CPU load average 15min (max). | +| `minio_node_cpu_avg_load15_perc` | CPU load average 15min (percentage). | +| `minio_node_cpu_avg_load15_perc_avg` | CPU load average 15min (percentage) (avg). | +| `minio_node_cpu_avg_load15_perc_max` | CPU load average 15min (percentage) (max). | ## Memory Metrics -| Name | Description | -|:-------------------------------|:------------------------------------| -| `minio_node_mem_available` | Available memory on the node. | -| `minio_node_mem_available_avg` | Available memory on the node (avg). | -| `minio_node_mem_available_max` | Available memory on the node (max). | -| `minio_node_mem_buffers` | Buffers memory on the node. | -| `minio_node_mem_buffers_avg` | Buffers memory on the node (avg). 
| -| `minio_node_mem_buffers_max` | Buffers memory on the node (max). | -| `minio_node_mem_cache` | Cache memory on the node. | -| `minio_node_mem_cache_avg` | Cache memory on the node (avg). | -| `minio_node_mem_cache_max` | Cache memory on the node (max). | -| `minio_node_mem_free` | Free memory on the node. | -| `minio_node_mem_free_avg` | Free memory on the node (avg). | -| `minio_node_mem_free_max` | Free memory on the node (max). | -| `minio_node_mem_shared` | Shared memory on the node. | -| `minio_node_mem_shared_avg` | Shared memory on the node (avg). | -| `minio_node_mem_shared_max` | Shared memory on the node (max). | -| `minio_node_mem_total` | Total memory on the node. | -| `minio_node_mem_total_avg` | Total memory on the node (avg). | -| `minio_node_mem_total_max` | Total memory on the node (max). | -| `minio_node_mem_used` | Used memory on the node. | -| `minio_node_mem_used_avg` | Used memory on the node (avg). | -| `minio_node_mem_used_max` | Used memory on the node (max). | +| Name | Description | +|:-------------------------------|:------------------------------------------| +| `minio_node_mem_available` | Available memory on the node. | +| `minio_node_mem_available_avg` | Available memory on the node (avg). | +| `minio_node_mem_available_max` | Available memory on the node (max). | +| `minio_node_mem_buffers` | Buffers memory on the node. | +| `minio_node_mem_buffers_avg` | Buffers memory on the node (avg). | +| `minio_node_mem_buffers_max` | Buffers memory on the node (max). | +| `minio_node_mem_cache` | Cache memory on the node. | +| `minio_node_mem_cache_avg` | Cache memory on the node (avg). | +| `minio_node_mem_cache_max` | Cache memory on the node (max). | +| `minio_node_mem_free` | Free memory on the node. | +| `minio_node_mem_free_avg` | Free memory on the node (avg). | +| `minio_node_mem_free_max` | Free memory on the node (max). | +| `minio_node_mem_shared` | Shared memory on the node. 
| +| `minio_node_mem_shared_avg` | Shared memory on the node (avg). | +| `minio_node_mem_shared_max` | Shared memory on the node (max). | +| `minio_node_mem_total` | Total memory on the node. | +| `minio_node_mem_total_avg` | Total memory on the node (avg). | +| `minio_node_mem_total_max` | Total memory on the node (max). | +| `minio_node_mem_used` | Used memory on the node. | +| `minio_node_mem_used_avg` | Used memory on the node (avg). | +| `minio_node_mem_used_max` | Used memory on the node (max). | +| `minio_node_mem_used_perc` | Used memory percentage on the node. | +| `minio_node_mem_used_perc_avg` | Used memory percentage on the node (avg). | +| `minio_node_mem_used_perc_max` | Used memory percentage on the node (max). | diff --git a/go.mod b/go.mod index f3c4c44af..6bdacff8e 100644 --- a/go.mod +++ b/go.mod @@ -49,7 +49,7 @@ require ( github.com/minio/dperf v0.5.2 github.com/minio/highwayhash v1.0.2 github.com/minio/kes-go v0.2.0 - github.com/minio/madmin-go/v3 v3.0.35-0.20231130082526-199918d0ff20 + github.com/minio/madmin-go/v3 v3.0.36 github.com/minio/minio-go/v7 v7.0.65-0.20231122233251-1f7dd6b7e3e1 github.com/minio/mux v1.9.0 github.com/minio/pkg/v2 v2.0.4 diff --git a/go.sum b/go.sum index dc6aa13c3..8c4c83daa 100644 --- a/go.sum +++ b/go.sum @@ -446,8 +446,8 @@ github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= github.com/minio/kes-go v0.2.0 h1:HA33arq9s3MErbsj3PAXFVfFo4U4yw7lTKQ5kWFrpCA= github.com/minio/kes-go v0.2.0/go.mod h1:VorHLaIYis9/MxAHAtXN4d8PUMNKhIxTIlvFt0hBOEo= -github.com/minio/madmin-go/v3 v3.0.35-0.20231130082526-199918d0ff20 h1:5kfjAypPN18QOOQaZjR3jfGzXyIwzLdKMS7d/cPY3Wc= -github.com/minio/madmin-go/v3 v3.0.35-0.20231130082526-199918d0ff20/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8= +github.com/minio/madmin-go/v3 v3.0.36 h1:Ewu/Rt7WVSs9slWW+SZHRc5RPQdYAGIdNZnRr+gyN4k= +github.com/minio/madmin-go/v3 
v3.0.36/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8= github.com/minio/mc v0.0.0-20231127112613-5e6ae2172e25 h1:8jT9Tz4opgrX6mnyFWW+TQ90AnrJqJ0mzeFXUWDHNGo= github.com/minio/mc v0.0.0-20231127112613-5e6ae2172e25/go.mod h1:8kat72LmpzZ2/xykDcq64tcRRJkkWo1Kd/Z5coC6t0w= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=