You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

487 lines
14 KiB

  1. // Copyright (c) 2015-2024 MinIO, Inc.
  2. //
  3. // This file is part of MinIO Object Storage stack
  4. //
  5. // This program is free software: you can redistribute it and/or modify
  6. // it under the terms of the GNU Affero General Public License as published by
  7. // the Free Software Foundation, either version 3 of the License, or
  8. // (at your option) any later version.
  9. //
  10. // This program is distributed in the hope that it will be useful
  11. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. // GNU Affero General Public License for more details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. package cmd
  18. import (
  19. "slices"
  20. "strings"
  21. "github.com/prometheus/client_golang/prometheus"
  22. "github.com/prometheus/client_golang/prometheus/collectors"
  23. )
  24. // Collector paths.
  25. //
  26. // These are paths under the top-level /minio/metrics/v3 metrics endpoint. Each
  27. // of these paths returns a set of V3 metrics.
  28. //
  29. // Per-bucket metrics endpoints always start with /bucket and the bucket name is
  30. // appended to the path. e.g. if the collector path is /bucket/api, the endpoint
  31. // for the bucket "mybucket" would be /minio/metrics/v3/bucket/api/mybucket
  32. const (
  33. apiRequestsCollectorPath collectorPath = "/api/requests"
  34. bucketAPICollectorPath collectorPath = "/bucket/api"
  35. bucketReplicationCollectorPath collectorPath = "/bucket/replication"
  36. systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode"
  37. systemDriveCollectorPath collectorPath = "/system/drive"
  38. systemMemoryCollectorPath collectorPath = "/system/memory"
  39. systemCPUCollectorPath collectorPath = "/system/cpu"
  40. systemProcessCollectorPath collectorPath = "/system/process"
  41. debugGoCollectorPath collectorPath = "/debug/go"
  42. clusterHealthCollectorPath collectorPath = "/cluster/health"
  43. clusterUsageObjectsCollectorPath collectorPath = "/cluster/usage/objects"
  44. clusterUsageBucketsCollectorPath collectorPath = "/cluster/usage/buckets"
  45. clusterErasureSetCollectorPath collectorPath = "/cluster/erasure-set"
  46. clusterIAMCollectorPath collectorPath = "/cluster/iam"
  47. clusterConfigCollectorPath collectorPath = "/cluster/config"
  48. ilmCollectorPath collectorPath = "/ilm"
  49. auditCollectorPath collectorPath = "/audit"
  50. loggerWebhookCollectorPath collectorPath = "/logger/webhook"
  51. replicationCollectorPath collectorPath = "/replication"
  52. notificationCollectorPath collectorPath = "/notification"
  53. scannerCollectorPath collectorPath = "/scanner"
  54. )
  55. const (
  56. clusterBasePath = "/cluster"
  57. )
  58. type metricsV3Collection struct {
  59. mgMap map[collectorPath]*MetricsGroup
  60. bucketMGMap map[collectorPath]*MetricsGroup
  61. // Gatherers for non-bucket MetricsGroup's
  62. mgGatherers map[collectorPath]prometheus.Gatherer
  63. collectorPaths []collectorPath
  64. }
  65. func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
  66. // Create all metric groups.
  67. apiRequestsMG := NewMetricsGroup(apiRequestsCollectorPath,
  68. []MetricDescriptor{
  69. apiRejectedAuthTotalMD,
  70. apiRejectedHeaderTotalMD,
  71. apiRejectedTimestampTotalMD,
  72. apiRejectedInvalidTotalMD,
  73. apiRequestsWaitingTotalMD,
  74. apiRequestsIncomingTotalMD,
  75. apiRequestsInFlightTotalMD,
  76. apiRequestsTotalMD,
  77. apiRequestsErrorsTotalMD,
  78. apiRequests5xxErrorsTotalMD,
  79. apiRequests4xxErrorsTotalMD,
  80. apiRequestsCanceledTotalMD,
  81. apiRequestsTTFBSecondsDistributionMD,
  82. apiTrafficSentBytesMD,
  83. apiTrafficRecvBytesMD,
  84. },
  85. JoinLoaders(loadAPIRequestsHTTPMetrics, loadAPIRequestsTTFBMetrics,
  86. loadAPIRequestsNetworkMetrics),
  87. )
  88. bucketAPIMG := NewBucketMetricsGroup(bucketAPICollectorPath,
  89. []MetricDescriptor{
  90. bucketAPITrafficRecvBytesMD,
  91. bucketAPITrafficSentBytesMD,
  92. bucketAPIRequestsInFlightMD,
  93. bucketAPIRequestsTotalMD,
  94. bucketAPIRequestsCanceledMD,
  95. bucketAPIRequests4xxErrorsMD,
  96. bucketAPIRequests5xxErrorsMD,
  97. bucketAPIRequestsTTFBSecondsDistributionMD,
  98. },
  99. JoinBucketLoaders(loadBucketAPIHTTPMetrics, loadBucketAPITTFBMetrics),
  100. )
  101. bucketReplicationMG := NewBucketMetricsGroup(bucketReplicationCollectorPath,
  102. []MetricDescriptor{
  103. bucketReplLastHrFailedBytesMD,
  104. bucketReplLastHrFailedCountMD,
  105. bucketReplLastMinFailedBytesMD,
  106. bucketReplLastMinFailedCountMD,
  107. bucketReplLatencyMsMD,
  108. bucketReplProxiedDeleteTaggingRequestsTotalMD,
  109. bucketReplProxiedGetRequestsFailuresMD,
  110. bucketReplProxiedGetRequestsTotalMD,
  111. bucketReplProxiedGetTaggingRequestsFailuresMD,
  112. bucketReplProxiedGetTaggingRequestsTotalMD,
  113. bucketReplProxiedHeadRequestsFailuresMD,
  114. bucketReplProxiedHeadRequestsTotalMD,
  115. bucketReplProxiedPutTaggingRequestsFailuresMD,
  116. bucketReplProxiedPutTaggingRequestsTotalMD,
  117. bucketReplSentBytesMD,
  118. bucketReplSentCountMD,
  119. bucketReplTotalFailedBytesMD,
  120. bucketReplTotalFailedCountMD,
  121. bucketReplProxiedDeleteTaggingRequestsFailuresMD,
  122. },
  123. loadBucketReplicationMetrics,
  124. )
  125. systemNetworkInternodeMG := NewMetricsGroup(systemNetworkInternodeCollectorPath,
  126. []MetricDescriptor{
  127. internodeErrorsTotalMD,
  128. internodeDialedErrorsTotalMD,
  129. internodeDialAvgTimeNanosMD,
  130. internodeSentBytesTotalMD,
  131. internodeRecvBytesTotalMD,
  132. },
  133. loadNetworkInternodeMetrics,
  134. )
  135. systemMemoryMG := NewMetricsGroup(systemMemoryCollectorPath,
  136. []MetricDescriptor{
  137. memTotalMD,
  138. memUsedMD,
  139. memFreeMD,
  140. memAvailableMD,
  141. memBuffersMD,
  142. memCacheMD,
  143. memSharedMD,
  144. memUsedPercMD,
  145. },
  146. loadMemoryMetrics,
  147. )
  148. systemCPUMG := NewMetricsGroup(systemCPUCollectorPath,
  149. []MetricDescriptor{
  150. sysCPUAvgIdleMD,
  151. sysCPUAvgIOWaitMD,
  152. sysCPULoadMD,
  153. sysCPULoadPercMD,
  154. sysCPUNiceMD,
  155. sysCPUStealMD,
  156. sysCPUSystemMD,
  157. sysCPUUserMD,
  158. },
  159. loadCPUMetrics,
  160. )
  161. systemProcessMG := NewMetricsGroup(systemProcessCollectorPath,
  162. []MetricDescriptor{
  163. processLocksReadTotalMD,
  164. processLocksWriteTotalMD,
  165. processCPUTotalSecondsMD,
  166. processGoRoutineTotalMD,
  167. processIORCharBytesMD,
  168. processIOReadBytesMD,
  169. processIOWCharBytesMD,
  170. processIOWriteBytesMD,
  171. processStarttimeSecondsMD,
  172. processUptimeSecondsMD,
  173. processFileDescriptorLimitTotalMD,
  174. processFileDescriptorOpenTotalMD,
  175. processSyscallReadTotalMD,
  176. processSyscallWriteTotalMD,
  177. processResidentMemoryBytesMD,
  178. processVirtualMemoryBytesMD,
  179. processVirtualMemoryMaxBytesMD,
  180. },
  181. loadProcessMetrics,
  182. )
  183. systemDriveMG := NewMetricsGroup(systemDriveCollectorPath,
  184. []MetricDescriptor{
  185. driveUsedBytesMD,
  186. driveFreeBytesMD,
  187. driveTotalBytesMD,
  188. driveUsedInodesMD,
  189. driveFreeInodesMD,
  190. driveTotalInodesMD,
  191. driveTimeoutErrorsMD,
  192. driveIOErrorsMD,
  193. driveAvailabilityErrorsMD,
  194. driveWaitingIOMD,
  195. driveAPILatencyMD,
  196. driveHealthMD,
  197. driveOfflineCountMD,
  198. driveOnlineCountMD,
  199. driveCountMD,
  200. // iostat related
  201. driveReadsPerSecMD,
  202. driveReadsKBPerSecMD,
  203. driveReadsAwaitMD,
  204. driveWritesPerSecMD,
  205. driveWritesKBPerSecMD,
  206. driveWritesAwaitMD,
  207. drivePercUtilMD,
  208. },
  209. loadDriveMetrics,
  210. )
  211. clusterHealthMG := NewMetricsGroup(clusterHealthCollectorPath,
  212. []MetricDescriptor{
  213. healthDrivesOfflineCountMD,
  214. healthDrivesOnlineCountMD,
  215. healthDrivesCountMD,
  216. healthNodesOfflineCountMD,
  217. healthNodesOnlineCountMD,
  218. healthCapacityRawTotalBytesMD,
  219. healthCapacityRawFreeBytesMD,
  220. healthCapacityUsableTotalBytesMD,
  221. healthCapacityUsableFreeBytesMD,
  222. },
  223. JoinLoaders(loadClusterHealthDriveMetrics,
  224. loadClusterHealthNodeMetrics,
  225. loadClusterHealthCapacityMetrics),
  226. )
  227. clusterUsageObjectsMG := NewMetricsGroup(clusterUsageObjectsCollectorPath,
  228. []MetricDescriptor{
  229. usageSinceLastUpdateSecondsMD,
  230. usageTotalBytesMD,
  231. usageObjectsCountMD,
  232. usageVersionsCountMD,
  233. usageDeleteMarkersCountMD,
  234. usageBucketsCountMD,
  235. usageObjectsDistributionMD,
  236. usageVersionsDistributionMD,
  237. },
  238. loadClusterUsageObjectMetrics,
  239. )
  240. clusterUsageBucketsMG := NewMetricsGroup(clusterUsageBucketsCollectorPath,
  241. []MetricDescriptor{
  242. usageSinceLastUpdateSecondsMD,
  243. usageBucketTotalBytesMD,
  244. usageBucketObjectsTotalMD,
  245. usageBucketVersionsCountMD,
  246. usageBucketDeleteMarkersCountMD,
  247. usageBucketQuotaTotalBytesMD,
  248. usageBucketObjectSizeDistributionMD,
  249. usageBucketObjectVersionCountDistributionMD,
  250. },
  251. loadClusterUsageBucketMetrics,
  252. )
  253. clusterErasureSetMG := NewMetricsGroup(clusterErasureSetCollectorPath,
  254. []MetricDescriptor{
  255. erasureSetOverallWriteQuorumMD,
  256. erasureSetOverallHealthMD,
  257. erasureSetReadQuorumMD,
  258. erasureSetWriteQuorumMD,
  259. erasureSetOnlineDrivesCountMD,
  260. erasureSetHealingDrivesCountMD,
  261. erasureSetHealthMD,
  262. erasureSetReadToleranceMD,
  263. erasureSetWriteToleranceMD,
  264. erasureSetReadHealthMD,
  265. erasureSetWriteHealthMD,
  266. },
  267. loadClusterErasureSetMetrics,
  268. )
  269. clusterNotificationMG := NewMetricsGroup(notificationCollectorPath,
  270. []MetricDescriptor{
  271. notificationCurrentSendInProgressMD,
  272. notificationEventsErrorsTotalMD,
  273. notificationEventsSentTotalMD,
  274. notificationEventsSkippedTotalMD,
  275. },
  276. loadClusterNotificationMetrics,
  277. )
  278. clusterIAMMG := NewMetricsGroup(clusterIAMCollectorPath,
  279. []MetricDescriptor{
  280. lastSyncDurationMillisMD,
  281. pluginAuthnServiceFailedRequestsMinuteMD,
  282. pluginAuthnServiceLastFailSecondsMD,
  283. pluginAuthnServiceLastSuccSecondsMD,
  284. pluginAuthnServiceSuccAvgRttMsMinuteMD,
  285. pluginAuthnServiceSuccMaxRttMsMinuteMD,
  286. pluginAuthnServiceTotalRequestsMinuteMD,
  287. sinceLastSyncMillisMD,
  288. syncFailuresMD,
  289. syncSuccessesMD,
  290. },
  291. loadClusterIAMMetrics,
  292. )
  293. clusterReplicationMG := NewMetricsGroup(replicationCollectorPath,
  294. []MetricDescriptor{
  295. replicationAverageActiveWorkersMD,
  296. replicationAverageQueuedBytesMD,
  297. replicationAverageQueuedCountMD,
  298. replicationAverageDataTransferRateMD,
  299. replicationCurrentActiveWorkersMD,
  300. replicationCurrentDataTransferRateMD,
  301. replicationLastMinuteQueuedBytesMD,
  302. replicationLastMinuteQueuedCountMD,
  303. replicationMaxActiveWorkersMD,
  304. replicationMaxQueuedBytesMD,
  305. replicationMaxQueuedCountMD,
  306. replicationMaxDataTransferRateMD,
  307. replicationRecentBacklogCountMD,
  308. },
  309. loadClusterReplicationMetrics,
  310. )
  311. clusterConfigMG := NewMetricsGroup(clusterConfigCollectorPath,
  312. []MetricDescriptor{
  313. configRRSParityMD,
  314. configStandardParityMD,
  315. },
  316. loadClusterConfigMetrics,
  317. )
  318. scannerMG := NewMetricsGroup(scannerCollectorPath,
  319. []MetricDescriptor{
  320. scannerBucketScansFinishedMD,
  321. scannerBucketScansStartedMD,
  322. scannerDirectoriesScannedMD,
  323. scannerObjectsScannedMD,
  324. scannerVersionsScannedMD,
  325. scannerLastActivitySecondsMD,
  326. },
  327. loadClusterScannerMetrics,
  328. )
  329. loggerWebhookMG := NewMetricsGroup(loggerWebhookCollectorPath,
  330. []MetricDescriptor{
  331. webhookFailedMessagesMD,
  332. webhookQueueLengthMD,
  333. webhookTotalMessagesMD,
  334. },
  335. loadLoggerWebhookMetrics,
  336. )
  337. auditMG := NewMetricsGroup(auditCollectorPath,
  338. []MetricDescriptor{
  339. auditFailedMessagesMD,
  340. auditTargetQueueLengthMD,
  341. auditTotalMessagesMD,
  342. },
  343. loadAuditMetrics,
  344. )
  345. ilmMG := NewMetricsGroup(ilmCollectorPath,
  346. []MetricDescriptor{
  347. ilmExpiryPendingTasksMD,
  348. ilmTransitionActiveTasksMD,
  349. ilmTransitionPendingTasksMD,
  350. ilmTransitionMissedImmediateTasksMD,
  351. ilmVersionsScannedMD,
  352. },
  353. loadILMMetrics,
  354. )
  355. allMetricGroups := []*MetricsGroup{
  356. apiRequestsMG,
  357. bucketAPIMG,
  358. bucketReplicationMG,
  359. systemNetworkInternodeMG,
  360. systemDriveMG,
  361. systemMemoryMG,
  362. systemCPUMG,
  363. systemProcessMG,
  364. clusterHealthMG,
  365. clusterUsageObjectsMG,
  366. clusterUsageBucketsMG,
  367. clusterErasureSetMG,
  368. clusterNotificationMG,
  369. clusterIAMMG,
  370. clusterReplicationMG,
  371. clusterConfigMG,
  372. ilmMG,
  373. scannerMG,
  374. auditMG,
  375. loggerWebhookMG,
  376. }
  377. // Bucket metrics are special, they always include the bucket label. These
  378. // metrics required a list of buckets to be passed to the loader, and the list
  379. // of buckets is not known until the request is made. So we keep a separate
  380. // map for bucket metrics and handle them specially.
  381. // Add the serverName and poolIndex labels to all non-cluster metrics.
  382. //
  383. // Also create metric group maps and set the cache.
  384. metricsCache := newMetricsCache()
  385. mgMap := make(map[collectorPath]*MetricsGroup)
  386. bucketMGMap := make(map[collectorPath]*MetricsGroup)
  387. for _, mg := range allMetricGroups {
  388. if !strings.HasPrefix(string(mg.CollectorPath), clusterBasePath) {
  389. mg.AddExtraLabels(
  390. serverName, globalLocalNodeName,
  391. // poolIndex, strconv.Itoa(globalLocalPoolIdx),
  392. )
  393. }
  394. mg.SetCache(metricsCache)
  395. if mg.IsBucketMetricsGroup() {
  396. bucketMGMap[mg.CollectorPath] = mg
  397. } else {
  398. mgMap[mg.CollectorPath] = mg
  399. }
  400. }
  401. // Prepare to register the collectors. Other than `MetricGroup` collectors,
  402. // we also have standard collectors like `GoCollector`.
  403. // Create all Non-`MetricGroup` collectors here.
  404. collectors := map[collectorPath]prometheus.Collector{
  405. debugGoCollectorPath: collectors.NewGoCollector(),
  406. }
  407. // Add all `MetricGroup` collectors to the map.
  408. for _, mg := range allMetricGroups {
  409. collectors[mg.CollectorPath] = mg
  410. }
  411. // Helper function to register a collector and return a gatherer for it.
  412. mustRegister := func(c ...prometheus.Collector) prometheus.Gatherer {
  413. subRegistry := prometheus.NewRegistry()
  414. for _, col := range c {
  415. subRegistry.MustRegister(col)
  416. }
  417. r.MustRegister(subRegistry)
  418. return subRegistry
  419. }
  420. // Register all collectors and create gatherers for them.
  421. gatherers := make(map[collectorPath]prometheus.Gatherer, len(collectors))
  422. collectorPaths := make([]collectorPath, 0, len(collectors))
  423. for path, collector := range collectors {
  424. gatherers[path] = mustRegister(collector)
  425. collectorPaths = append(collectorPaths, path)
  426. }
  427. slices.Sort(collectorPaths)
  428. return &metricsV3Collection{
  429. mgMap: mgMap,
  430. bucketMGMap: bucketMGMap,
  431. mgGatherers: gatherers,
  432. collectorPaths: collectorPaths,
  433. }
  434. }