// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"github.com/minio/minio/internal/bucket/replication"
	"github.com/rcrowley/go-metrics"
)

// hasReplicationUsage returns true if replication activity has been recorded
// for any of the bucket's replication targets.
func (b *BucketReplicationStats) hasReplicationUsage() bool {
	for _, s := range b.Stats {
		if s.hasReplicationUsage() {
			return true
		}
	}
	return false
}

// ReplicationStats holds the global in-memory replication stats
type ReplicationStats struct {
	// map of site deployment ID to site replication status
	// for site replication - maintain stats at global level
	srStats *SRStats
	// active worker stats
	workers *ActiveWorkerStat
	// queue stats cache
	qCache queueCache
	pCache proxyStatsCache
	// mrf backlog stats
	mrfStats ReplicationMRFStats
	// for bucket replication, continue to use existing cache
	Cache             map[string]*BucketReplicationStats
	mostRecentStats   BucketStatsMap
	registry          metrics.Registry
	sync.RWMutex                   // mutex for Cache
	mostRecentStatsMu sync.Mutex   // mutex for mostRecentStats
	wlock             sync.RWMutex // mutex for active workers
	movingAvgTicker   *time.Ticker // Ticker for calculating moving averages
	wTimer            *time.Ticker // ticker for calculating active workers
	qTimer            *time.Ticker // ticker for calculating queue stats
}

// trackEWMA refreshes the transfer-rate exponential moving averages on every
// tick of the moving average ticker until the server shuts down.
func (r *ReplicationStats) trackEWMA() {
	for {
		select {
		case <-r.movingAvgTicker.C:
			r.updateMovingAvg()
		case <-GlobalContext.Done():
			return
		}
	}
}

// updateMovingAvg updates the large- and small-object transfer rate moving
// averages for every bucket target in the cache.
func (r *ReplicationStats) updateMovingAvg() {
	r.RLock()
	for _, s := range r.Cache {
		for _, st := range s.Stats {
			st.XferRateLrg.measure.updateExponentialMovingAverage(time.Now())
			st.XferRateSml.measure.updateExponentialMovingAverage(time.Now())
		}
	}
	r.RUnlock()
}

// ActiveWorkers returns worker stats
func (r *ReplicationStats) ActiveWorkers() ActiveWorkerStat {
	if r == nil {
		return ActiveWorkerStat{}
	}
	r.wlock.RLock()
	defer r.wlock.RUnlock()
	w := r.workers.get()
	return ActiveWorkerStat{
		Curr: w.Curr,
		Max:  w.Max,
		Avg:  w.Avg,
	}
}

// collectWorkerMetrics periodically samples the active replication worker
// counts until the context is canceled.
func (r *ReplicationStats) collectWorkerMetrics(ctx context.Context) {
	if r == nil {
		return
	}
	for {
		select {
		case <-ctx.Done():
			return
		case <-r.wTimer.C:
			r.wlock.Lock()
			r.workers.update()
			r.wlock.Unlock()
		}
	}
}

// collectQueueMetrics periodically refreshes the replication queue stats
// cache until the context is canceled.
func (r *ReplicationStats) collectQueueMetrics(ctx context.Context) {
	if r == nil {
		return
	}
	for {
		select {
		case <-ctx.Done():
			return
		case <-r.qTimer.C:
			r.qCache.update()
		}
	}
}

// Delete deletes in-memory replication statistics for a bucket.
func (r *ReplicationStats) Delete(bucket string) {
	if r == nil {
		return
	}
	r.Lock()
	defer r.Unlock()
	delete(r.Cache, bucket)
}

// UpdateReplicaStat updates in-memory replica statistics with new values.
func (r *ReplicationStats) UpdateReplicaStat(bucket string, n int64) {
	if r == nil {
		return
	}
	r.Lock()
	defer r.Unlock()
	bs, ok := r.Cache[bucket]
	if !ok {
		bs = newBucketReplicationStats()
	}
	bs.ReplicaSize += n
	bs.ReplicaCount++
	r.Cache[bucket] = bs
	r.srUpdateReplicaStat(n)
}

// srUpdateReplicaStat accounts an incoming replica of size sz in the
// site replication totals.
func (r *ReplicationStats) srUpdateReplicaStat(sz int64) {
	if r == nil {
		return
	}
	atomic.AddInt64(&r.srStats.ReplicaSize, sz)
	atomic.AddInt64(&r.srStats.ReplicaCount, 1)
}

// srUpdate records a replication event against the site replication stats of
// the deployment resolved from the target endpoint.
func (r *ReplicationStats) srUpdate(sr replStat) {
	dID, err := globalSiteReplicationSys.getDeplIDForEndpoint(sr.endpoint())
	if err == nil {
		r.srStats.update(sr, dID)
	}
}

// Update updates in-memory replication statistics with new values.
func (r *ReplicationStats) Update(bucket string, ri replicatedTargetInfo, status, prevStatus replication.StatusType) {
	if r == nil {
		return
	}
	var rs replStat
	switch status {
	case replication.Pending:
		if ri.OpType.IsDataReplication() && prevStatus != status {
			rs.set(ri.Arn, ri.Size, 0, status, ri.OpType, ri.endpoint, ri.secure, ri.Err)
		}
	case replication.Completed:
		if ri.OpType.IsDataReplication() {
			rs.set(ri.Arn, ri.Size, ri.Duration, status, ri.OpType, ri.endpoint, ri.secure, ri.Err)
		}
	case replication.Failed:
		if ri.OpType.IsDataReplication() && prevStatus == replication.Pending {
			rs.set(ri.Arn, ri.Size, ri.Duration, status, ri.OpType, ri.endpoint, ri.secure, ri.Err)
		}
	case replication.Replica:
		if ri.OpType == replication.ObjectReplicationType {
			rs.set(ri.Arn, ri.Size, 0, status, ri.OpType, "", false, ri.Err)
		}
	}

	// update site-replication in-memory stats
	if rs.Completed || rs.Failed {
		r.srUpdate(rs)
	}

	r.Lock()
	defer r.Unlock()

	// update bucket replication in-memory stats
	bs, ok := r.Cache[bucket]
	if !ok {
		bs = newBucketReplicationStats()
		r.Cache[bucket] = bs
	}
	b, ok := bs.Stats[ri.Arn]
	if !ok {
		b = &BucketReplicationStat{
			XferRateLrg: newXferStats(),
			XferRateSml: newXferStats(),
		}
		bs.Stats[ri.Arn] = b
	}
	switch {
	case rs.Completed:
		b.ReplicatedSize += rs.TransferSize
		b.ReplicatedCount++
		if rs.TransferDuration > 0 {
			b.Latency.update(rs.TransferSize, rs.TransferDuration)
			b.updateXferRate(rs.TransferSize, rs.TransferDuration)
		}
	case rs.Failed:
		b.FailStats.addsize(rs.TransferSize, rs.Err)
	case rs.Pending:
	}
}
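
// Illustrative only (not part of the upstream file): a replication worker that
// has just finished copying an object to a target could record the transition
// roughly as follows, where stats is a *ReplicationStats and ri is the
// replicatedTargetInfo produced by the worker (both names are placeholders):
//
//	stats.Update(bucket, ri, replication.Completed, replication.Pending)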

// replStat captures a single replication event for updating in-memory stats.
type replStat struct {
	Arn       string
	Completed bool
	Pending   bool
	Failed    bool
	opType    replication.Type
	// transfer size
	TransferSize int64
	// transfer duration
	TransferDuration time.Duration
	Endpoint         string
	Secure           bool
	Err              error
}

// endpoint returns the target endpoint with the appropriate URL scheme.
func (rs *replStat) endpoint() string {
	scheme := "http"
	if rs.Secure {
		scheme = "https"
	}
	return scheme + "://" + rs.Endpoint
}

// set records the attributes of a replication event on rs and marks its
// state based on the replication status.
func (rs *replStat) set(arn string, n int64, duration time.Duration, status replication.StatusType, opType replication.Type, endpoint string, secure bool, err error) {
	rs.Endpoint = endpoint
	rs.Secure = secure
	rs.TransferSize = n
	rs.Arn = arn
	rs.TransferDuration = duration
	rs.opType = opType
	switch status {
	case replication.Completed:
		rs.Completed = true
	case replication.Pending:
		rs.Pending = true
	case replication.Failed:
		rs.Failed = true
		rs.Err = err
	}
}

// GetAll returns replication metrics for all buckets at once.
func (r *ReplicationStats) GetAll() map[string]BucketReplicationStats {
	if r == nil {
		return map[string]BucketReplicationStats{}
	}
	r.RLock()
	bucketReplicationStats := make(map[string]BucketReplicationStats, len(r.Cache))
	for k, v := range r.Cache {
		bucketReplicationStats[k] = v.Clone()
	}
	r.RUnlock()
	for k, v := range bucketReplicationStats {
		v.QStat = r.qCache.getBucketStats(k)
		bucketReplicationStats[k] = v
	}
	return bucketReplicationStats
}

// getSRMetricsForNode returns a summary of site replication metrics collected
// on this node since it came up.
func (r *ReplicationStats) getSRMetricsForNode() SRMetricsSummary {
	if r == nil {
		return SRMetricsSummary{}
	}
	m := SRMetricsSummary{
		Uptime:        UTCNow().Unix() - globalBootTime.Unix(),
		Queued:        r.qCache.getSiteStats(),
		ActiveWorkers: r.ActiveWorkers(),
		Metrics:       r.srStats.get(),
		Proxied:       r.pCache.getSiteStats(),
		ReplicaSize:   atomic.LoadInt64(&r.srStats.ReplicaSize),
		ReplicaCount:  atomic.LoadInt64(&r.srStats.ReplicaCount),
	}
	return m
}

// Get returns replication metrics for a bucket from this node since it came up.
func (r *ReplicationStats) Get(bucket string) BucketReplicationStats {
	if r == nil {
		return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
	}
	r.RLock()
	defer r.RUnlock()
	st, ok := r.Cache[bucket]
	if !ok {
		return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
	}
	return st.Clone()
}

// NewReplicationStats initializes in-memory replication statistics
func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *ReplicationStats {
	r := metrics.NewRegistry()
	rs := ReplicationStats{
		Cache:           make(map[string]*BucketReplicationStats),
		qCache:          newQueueCache(r),
		pCache:          newProxyStatsCache(),
		srStats:         newSRStats(),
		movingAvgTicker: time.NewTicker(2 * time.Second),
		wTimer:          time.NewTicker(2 * time.Second),
		qTimer:          time.NewTicker(2 * time.Second),
		workers:         newActiveWorkerStat(r),
		registry:        r,
	}
	go rs.collectWorkerMetrics(ctx)
	go rs.collectQueueMetrics(ctx)
	return &rs
}
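
// Illustrative only (not part of the upstream file): a minimal sketch of how a
// caller might wire up these stats, assuming a server context ctx and an
// initialized ObjectLayer objAPI; the goroutine placement for trackEWMA is an
// assumption about the startup path, not taken from this file:
//
//	stats := NewReplicationStats(ctx, objAPI)
//	go stats.trackEWMA()             // keep transfer-rate EWMAs fresh
//	workers := stats.ActiveWorkers() // current/max/avg worker counts
//	perBucket := stats.GetAll()      // per-bucket totals since startup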

// getAllLatest returns the latest cluster-wide replication stats for every
// bucket present in the data usage info, aggregated across peer nodes.
func (r *ReplicationStats) getAllLatest(bucketsUsage map[string]BucketUsageInfo) (bucketsReplicationStats map[string]BucketStats) {
	if r == nil {
		return nil
	}
	peerBucketStatsList := globalNotificationSys.GetClusterAllBucketStats(GlobalContext)
	bucketsReplicationStats = make(map[string]BucketStats, len(bucketsUsage))
	for bucket := range bucketsUsage {
		bucketStats := make([]BucketStats, len(peerBucketStatsList))
		for i, peerBucketStats := range peerBucketStatsList {
			bucketStat, ok := peerBucketStats.Stats[bucket]
			if !ok {
				continue
			}
			bucketStats[i] = bucketStat
		}
		bucketsReplicationStats[bucket] = r.calculateBucketReplicationStats(bucket, bucketStats)
	}
	return bucketsReplicationStats
}

// calculateBucketReplicationStats aggregates per-node bucket stats into a
// single cluster-wide view for the bucket and records the most recent result.
func (r *ReplicationStats) calculateBucketReplicationStats(bucket string, bucketStats []BucketStats) (bs BucketStats) {
	if r == nil {
		bs = BucketStats{
			ReplicationStats: BucketReplicationStats{
				Stats: make(map[string]*BucketReplicationStat),
			},
			QueueStats: ReplicationQueueStats{},
			ProxyStats: ProxyMetric{},
		}
		return bs
	}
	var s BucketReplicationStats
	// accumulate cluster bucket stats
	stats := make(map[string]*BucketReplicationStat)
	var (
		totReplicaSize, totReplicatedSize   int64
		totReplicaCount, totReplicatedCount int64
		totFailed                           RTimedMetrics
		tq                                  InQueueMetric
	)
	for _, bucketStat := range bucketStats {
		totReplicaSize += bucketStat.ReplicationStats.ReplicaSize
		totReplicaCount += bucketStat.ReplicationStats.ReplicaCount
		for _, q := range bucketStat.QueueStats.Nodes {
			tq = tq.merge(q.QStats)
		}
		for arn, stat := range bucketStat.ReplicationStats.Stats {
			oldst := stats[arn]
			if oldst == nil {
				oldst = &BucketReplicationStat{
					XferRateLrg: newXferStats(),
					XferRateSml: newXferStats(),
				}
			}
			fstats := stat.FailStats.merge(oldst.FailStats)
			lrg := oldst.XferRateLrg.merge(*stat.XferRateLrg)
			sml := oldst.XferRateSml.merge(*stat.XferRateSml)
			stats[arn] = &BucketReplicationStat{
				Failed:          fstats.toMetric(),
				FailStats:       fstats,
				ReplicatedSize:  stat.ReplicatedSize + oldst.ReplicatedSize,
				ReplicatedCount: stat.ReplicatedCount + oldst.ReplicatedCount,
				Latency:         stat.Latency.merge(oldst.Latency),
				XferRateLrg:     &lrg,
				XferRateSml:     &sml,
			}
			totReplicatedSize += stat.ReplicatedSize
			totReplicatedCount += stat.ReplicatedCount
			totFailed = totFailed.merge(stat.FailStats)
		}
	}
	s = BucketReplicationStats{
		Stats:           stats,
		QStat:           tq,
		ReplicaSize:     totReplicaSize,
		ReplicaCount:    totReplicaCount,
		ReplicatedSize:  totReplicatedSize,
		ReplicatedCount: totReplicatedCount,
		Failed:          totFailed.toMetric(),
	}
	var qs ReplicationQueueStats
	for _, bs := range bucketStats {
		qs.Nodes = append(qs.Nodes, bs.QueueStats.Nodes...)
	}
	qs.Uptime = UTCNow().Unix() - globalBootTime.Unix()
	var ps ProxyMetric
	for _, bs := range bucketStats {
		ps.add(bs.ProxyStats)
	}
	bs = BucketStats{
		ReplicationStats: s,
		QueueStats:       qs,
		ProxyStats:       ps,
	}
	r.mostRecentStatsMu.Lock()
	if len(r.mostRecentStats.Stats) == 0 {
		r.mostRecentStats = BucketStatsMap{Stats: make(map[string]BucketStats, 1), Timestamp: UTCNow()}
	}
	if len(bs.ReplicationStats.Stats) > 0 {
		r.mostRecentStats.Stats[bucket] = bs
	}
	r.mostRecentStats.Timestamp = UTCNow()
	r.mostRecentStatsMu.Unlock()
	return bs
}

// getLatestReplicationStats returns the most current in-memory replication
// stats and data usage info from the crawler.
func (r *ReplicationStats) getLatestReplicationStats(bucket string) (s BucketStats) {
	if r == nil {
		return s
	}
	bucketStats := globalNotificationSys.GetClusterBucketStats(GlobalContext, bucket)
	return r.calculateBucketReplicationStats(bucket, bucketStats)
}

// incQ increments the in-queue byte and count metrics for the bucket and for
// the site-level queue totals.
func (r *ReplicationStats) incQ(bucket string, sz int64, isDeleteRepl bool, opType replication.Type) {
	r.qCache.Lock()
	defer r.qCache.Unlock()
	v, ok := r.qCache.bucketStats[bucket]
	if !ok {
		v = newInQueueStats(r.registry, bucket)
	}
	atomic.AddInt64(&v.nowBytes, sz)
	atomic.AddInt64(&v.nowCount, 1)
	r.qCache.bucketStats[bucket] = v
	atomic.AddInt64(&r.qCache.srQueueStats.nowBytes, sz)
	atomic.AddInt64(&r.qCache.srQueueStats.nowCount, 1)
}

// decQ decrements the in-queue byte and count metrics for the bucket and for
// the site-level queue totals.
func (r *ReplicationStats) decQ(bucket string, sz int64, isDelMarker bool, opType replication.Type) {
	r.qCache.Lock()
	defer r.qCache.Unlock()
	v, ok := r.qCache.bucketStats[bucket]
	if !ok {
		v = newInQueueStats(r.registry, bucket)
	}
	atomic.AddInt64(&v.nowBytes, -1*sz)
	atomic.AddInt64(&v.nowCount, -1)
	r.qCache.bucketStats[bucket] = v
	atomic.AddInt64(&r.qCache.srQueueStats.nowBytes, -1*sz)
	atomic.AddInt64(&r.qCache.srQueueStats.nowCount, -1)
}

// incProxy increments proxy metrics for proxied calls
func (r *ReplicationStats) incProxy(bucket string, api replProxyAPI, isErr bool) {
	if r != nil {
		r.pCache.inc(bucket, api, isErr)
	}
}

// getProxyStats returns the proxy metrics recorded for a bucket.
func (r *ReplicationStats) getProxyStats(bucket string) ProxyMetric {
	if r == nil {
		return ProxyMetric{}
	}
	return r.pCache.getBucketStats(bucket)
}