
Move admin APIs to new path and add redesigned heal APIs (#5351)

- Changes related to moving admin APIs
  - admin APIs now have an endpoint under /minio/admin
  - admin APIs are now versioned - a new API to serve the version is
    added at "GET /minio/admin/version" and all API operations have the
    path prefix /minio/admin/v1/<operation>
  - new service stop API added
  - credentials change API is moved to /minio/admin/v1/config/credential
  - credentials change API and configuration get/set API now require TLS
    so that credentials are protected
  - all API requests now receive JSON
  - heal APIs are disabled as they will be changed substantially

- Heal API changes
  The heal API is now provided at a single endpoint with the ability for a
  client to start a heal sequence on all the data in the server, a single
  bucket, or under a prefix within a bucket.

  When a heal sequence is started, the server returns a unique token that
  needs to be used for subsequent 'status' requests to fetch heal results.
  On each status request from the client, the server returns heal result
  records that it has accumulated since the previous status request. The
  server accumulates up to 1000 records and pauses healing further objects
  until the client requests status. If the client does not request any
  further records for a long time, the server aborts the heal sequence
  automatically.

  A heal result record is returned for each entity healed on the server,
  such as system metadata, object metadata, buckets and objects, and has
  information about the before and after states on each disk.

  A client may request to force restart a heal sequence - this causes the
  running heal sequence to be aborted at the next safe spot and a new heal
  sequence to be started.
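Below is a minimal client-side sketch of the heal flow described above: start a sequence, keep its token, and poll for accumulated results. It assumes the present-day madmin-go Heal API; the endpoint, credentials, bucket name, and the "finished"/"stopped" summary strings are placeholders, not details confirmed by this revision.

    package main

    import (
        "context"
        "log"

        "github.com/minio/madmin-go/v3"
    )

    func main() {
        // Placeholder endpoint and credentials (assumed values).
        adm, err := madmin.New("localhost:9000", "ACCESS_KEY", "SECRET_KEY", true)
        if err != nil {
            log.Fatalln(err)
        }
        ctx := context.Background()
        opts := madmin.HealOpts{Recursive: true}

        // Start a heal sequence on a bucket; the server replies with a
        // unique client token identifying this sequence.
        start, _, err := adm.Heal(ctx, "mybucket", "", opts, "", false, false)
        if err != nil {
            log.Fatalln(err)
        }

        // Poll status with the token; each call returns the heal result
        // records accumulated since the previous status request.
        for {
            _, status, err := adm.Heal(ctx, "mybucket", "", opts, start.ClientToken, false, false)
            if err != nil {
                log.Fatalln(err)
            }
            for _, item := range status.Items {
                log.Printf("healed %s/%s", item.Bucket, item.Object)
            }
            // The summary values are assumptions about the server's states.
            if status.Summary == "finished" || status.Summary == "stopped" {
                break
            }
        }
    }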
// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
    "bytes"
    "context"
    "slices"
    "time"

    "github.com/minio/madmin-go/v3"
)

// commonETags returns the most commonly occurring etag from a list of etags,
// along with its occurrence count.
func commonETags(etags []string) (etag string, maxima int) {
    etagOccurrenceMap := make(map[string]int, len(etags))

    // Ignore empty etags and count the rest.
    for _, etag := range etags {
        if etag == "" {
            continue
        }
        etagOccurrenceMap[etag]++
    }

    maxima = 0 // Counter for remembering max occurrence of elements.
    latest := ""

    // Find the common cardinality from previously collected
    // occurrences of elements.
    for etag, count := range etagOccurrenceMap {
        if count < maxima {
            continue
        }

        // We are at or above maxima
        if count > maxima {
            maxima = count
            latest = etag
        }
    }

    // Return the most common etag, with its occurrence count.
    return latest, maxima
}

// commonTimeAndOccurrence returns the maximally occurring time from a list of
// times, along with its occurrence count. Times that fall within 'group' of an
// already counted time also increment that time's count.
func commonTimeAndOccurrence(times []time.Time, group time.Duration) (maxTime time.Time, maxima int) {
    timeOccurrenceMap := make(map[int64]int, len(times))
    groupNano := group.Nanoseconds()

    // Ignore the time sentinel and count the rest.
    for _, t := range times {
        if t.Equal(timeSentinel) || t.IsZero() {
            continue
        }
        nano := t.UnixNano()
        if group > 0 {
            for k := range timeOccurrenceMap {
                if k == nano {
                    // We add to ourself later
                    continue
                }
                diff := k - nano
                if diff < 0 {
                    diff = -diff
                }
                // We are within the limit
                if diff < groupNano {
                    timeOccurrenceMap[k]++
                }
            }
        }
        // Add ourself...
        timeOccurrenceMap[nano]++
    }

    maxima = 0 // Counter for remembering max occurrence of elements.
    latest := int64(0)

    // Find the common cardinality from previously collected
    // occurrences of elements.
    for nano, count := range timeOccurrenceMap {
        if count < maxima {
            continue
        }

        // We are at or above maxima
        if count > maxima || nano > latest {
            maxima = count
            latest = nano
        }
    }

    // Return the collected common max time, with maxima
    return time.Unix(0, latest).UTC(), maxima
}
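
// Illustrative example (not part of the original code): with input times
// {12:00:00, 12:00:01, 12:00:10} in that order and group = 2*time.Second,
// 12:00:01 falls within the grouping window of 12:00:00, so its count
// accumulates on the key seen first; the function returns 12:00:00 with
// maxima == 2, while 12:00:10 is counted only once.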

// commonTime returns a maximally occurring time from a list of times if it
// occurs >= quorum, else returns timeSentinel
func commonTime(modTimes []time.Time, quorum int) time.Time {
    if modTime, count := commonTimeAndOccurrence(modTimes, 0); count >= quorum {
        return modTime
    }

    return timeSentinel
}

// commonETag returns the most commonly occurring etag from a list of etags if
// it occurs >= quorum, else returns an empty string.
func commonETag(etags []string, quorum int) string {
    if etag, count := commonETags(etags); count >= quorum {
        return etag
    }

    return ""
}
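
// Illustrative example (not part of the original code):
//
//   commonETag([]string{"abc", "abc", "def", ""}, 2) == "abc"
//   commonETag([]string{"abc", "abc", "def", ""}, 3) == ""
//
// The empty etag is ignored, so "abc" occurs twice: enough for quorum 2,
// but not for quorum 3.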

// Beginning of unix time is treated as sentinel value here.
var (
    timeSentinel     = time.Unix(0, 0).UTC()
    timeSentinel1970 = time.Unix(0, 1).UTC() // 1970 used for special cases when xlmeta.version == 0
)

// Boot modTimes up to disk count, setting the value to time sentinel.
func bootModtimes(diskCount int) []time.Time {
    modTimes := make([]time.Time, diskCount)
    // Boots up all the modtimes.
    for i := range modTimes {
        modTimes[i] = timeSentinel
    }
    return modTimes
}

// listObjectETags returns the per-disk etags from the parts metadata, but
// only if some single version id occurs at least quorum times; otherwise a
// slice of empty etags is returned.
func listObjectETags(partsMetadata []FileInfo, errs []error, quorum int) (etags []string) {
    etags = make([]string, len(partsMetadata))
    vidMap := map[string]int{}
    for index, metadata := range partsMetadata {
        if errs[index] != nil {
            continue
        }
        vid := metadata.VersionID
        if metadata.VersionID == "" {
            vid = nullVersionID
        }
        vidMap[vid]++
        etags[index] = metadata.Metadata["etag"]
    }
    for _, count := range vidMap {
        // Do we have enough occurrences of a common version id
        // to satisfy quorum for the etags?
        if count >= quorum {
            return etags
        }
    }
    return make([]string, len(partsMetadata))
}

// Extracts the list of modTimes from a FileInfo slice and returns it; slice
// elements which have errors are skipped (left as timeSentinel).
func listObjectModtimes(partsMetadata []FileInfo, errs []error) (modTimes []time.Time) {
    modTimes = bootModtimes(len(partsMetadata))
    for index, metadata := range partsMetadata {
        if errs[index] != nil {
            continue
        }
        // Once the file is found, save its modTime from disk.
        modTimes[index] = metadata.ModTime
    }
    return modTimes
}

// filterOnlineDisksInplace sets to nil the disks whose metadata format
// version (XLV1) differs from that of the reference FileInfo.
func filterOnlineDisksInplace(fi FileInfo, partsMetadata []FileInfo, onlineDisks []StorageAPI) {
    for i, meta := range partsMetadata {
        if fi.XLV1 == meta.XLV1 {
            continue
        }
        onlineDisks[i] = nil
    }
}

// Notes:
// There are 5 possible states a disk could be in,
// 1. __online__ - has the latest copy of xl.meta - returned by listOnlineDisks
//
// 2. __offline__ - err == errDiskNotFound
//
// 3. __availableWithParts__ - has the latest copy of xl.meta and has all
//    parts with checksums matching; returned by disksWithAllParts
//
// 4. __outdated__ - returned by outDatedDisk, provided []StorageAPI
//    returned by disksWithAllParts is passed for latestDisks.
//    - has an old copy of xl.meta
//    - doesn't have xl.meta (errFileNotFound)
//    - has the latest xl.meta but one or more parts are corrupt
//
// 5. __missingParts__ - has the latest copy of xl.meta but has some parts
//    missing. This is identified separately since this may need manual
//    inspection to understand the root cause. E.g., this could be due to
//    backend filesystem corruption.

// listOnlineDisks - returns
//   - a slice of disks where disks having an 'older' xl.meta (or nothing)
//     are set to nil.
//   - latest (in time) of the maximally occurring modTime(s), which has at
//     least quorum occurrences.
func listOnlineDisks(disks []StorageAPI, partsMetadata []FileInfo, errs []error, quorum int) (onlineDisks []StorageAPI, modTime time.Time, etag string) {
    onlineDisks = make([]StorageAPI, len(disks))

    // List all the modTimes from the parts metadata.
    modTimes := listObjectModtimes(partsMetadata, errs)

    // Reduce the list of modTimes to a single common value.
    modTime = commonTime(modTimes, quorum)

    if modTime.IsZero() || modTime.Equal(timeSentinel) {
        etags := listObjectETags(partsMetadata, errs, quorum)

        etag = commonETag(etags, quorum)

        if etag != "" { // allow this fallback only if a non-empty etag is found.
            for index, e := range etags {
                if partsMetadata[index].IsValid() && e == etag {
                    onlineDisks[index] = disks[index]
                } else {
                    onlineDisks[index] = nil
                }
            }
            return onlineDisks, modTime, etag
        }
    }

    // Create a new online disks slice populated with the disks that carry
    // the common modTime.
    for index, t := range modTimes {
        if partsMetadata[index].IsValid() && t.Equal(modTime) {
            onlineDisks[index] = disks[index]
        } else {
            onlineDisks[index] = nil
        }
    }

    return onlineDisks, modTime, ""
}
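
// Illustrative example (not part of the original code): with 4 disks where
// disks 0-2 share modTime T and disk 3 has an older modTime, and quorum 3,
// listOnlineDisks returns {d0, d1, d2, nil} and T; the etag fallback is only
// consulted when no common modTime is found.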

// convPartErrToInt converts an error returned by verify or check parts into
// its integer representation.
func convPartErrToInt(err error) int {
    err = unwrapAll(err)
    switch err {
    case nil:
        return checkPartSuccess
    case errFileNotFound, errFileVersionNotFound:
        return checkPartFileNotFound
    case errFileCorrupt:
        return checkPartFileCorrupt
    case errVolumeNotFound:
        return checkPartVolumeNotFound
    case errDiskNotFound:
        return checkPartDiskNotFound
    default:
        return checkPartUnknown
    }
}

// partNeedsHealing reports whether any part has a result other than success
// or unknown.
func partNeedsHealing(partErrs []int) bool {
    return slices.IndexFunc(partErrs, func(i int) bool { return i != checkPartSuccess && i != checkPartUnknown }) > -1
}
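
// Illustrative example (not part of the original code): partNeedsHealing
// returns true for {checkPartSuccess, checkPartFileNotFound} but false for
// {checkPartSuccess, checkPartUnknown}; an unknown result alone does not
// trigger healing.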

// countPartNotSuccess returns the number of part results that are not
// successful.
func countPartNotSuccess(partErrs []int) (c int) {
    for _, pe := range partErrs {
        if pe != checkPartSuccess {
            c++
        }
    }
    return
}

// checkObjectWithAllParts resets the partsMetadata and onlineDisks entries
// whose xl.meta is nonexistent, corrupted or outdated; it also checks the
// status of each part (corrupted, missing, ok) on each drive.
func checkObjectWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetadata []FileInfo,
    errs []error, latestMeta FileInfo, filterByETag bool, bucket, object string,
    scanMode madmin.HealScanMode,
) (dataErrsByDisk map[int][]int, dataErrsByPart map[int][]int) {
    dataErrsByDisk = make(map[int][]int, len(onlineDisks))
    for i := range onlineDisks {
        dataErrsByDisk[i] = make([]int, len(latestMeta.Parts))
    }

    dataErrsByPart = make(map[int][]int, len(latestMeta.Parts))
    for i := range latestMeta.Parts {
        dataErrsByPart[i] = make([]int, len(onlineDisks))
    }
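
    // The two maps hold the same results indexed both ways:
    // dataErrsByDisk[disk][part] == dataErrsByPart[part][disk].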

    inconsistent := 0
    for i, meta := range partsMetadata {
        if !meta.IsValid() {
            // Since for the majority of cases erasure.Index matches erasure.Distribution,
            // we can consider the offline disks as consistent.
            continue
        }
        if !meta.Deleted {
            if len(meta.Erasure.Distribution) != len(onlineDisks) {
                // Erasure distribution has a different number of items
                // than the number of online disks.
                inconsistent++
                continue
            }
            if meta.Erasure.Distribution[i] != meta.Erasure.Index {
                // Index does not match the distribution order.
                inconsistent++
            }
        }
    }

    // Trust the erasure distribution only when at most half of the
    // metadata entries are inconsistent.
    erasureDistributionReliable := inconsistent <= len(partsMetadata)/2

    metaErrs := make([]error, len(errs))

    for i := range onlineDisks {
        if errs[i] != nil {
            metaErrs[i] = errs[i]
            continue
        }
        if onlineDisks[i] == OfflineDisk {
            metaErrs[i] = errDiskNotFound
            continue
        }

        meta := partsMetadata[i]
        corrupted := false
        if filterByETag {
            corrupted = meta.Metadata["etag"] != latestMeta.Metadata["etag"]
        } else {
            corrupted = !meta.ModTime.Equal(latestMeta.ModTime) || meta.DataDir != latestMeta.DataDir
        }

        if corrupted {
            metaErrs[i] = errFileCorrupt
            partsMetadata[i] = FileInfo{}
            onlineDisks[i] = nil
            continue
        }

        if erasureDistributionReliable {
            if !meta.IsValid() {
                partsMetadata[i] = FileInfo{}
                metaErrs[i] = errFileCorrupt
                onlineDisks[i] = nil
                continue
            }

            if !meta.Deleted {
                if len(meta.Erasure.Distribution) != len(onlineDisks) {
                    // Erasure distribution is not the same as onlineDisks;
                    // attempt a fix if possible, assuming other entries
                    // might have the right erasure distribution.
                    partsMetadata[i] = FileInfo{}
                    metaErrs[i] = errFileCorrupt
                    onlineDisks[i] = nil
                    continue
                }
            }
        }
    }

    // Copy meta errors to part errors
    for i, err := range metaErrs {
        if err != nil {
            partErr := convPartErrToInt(err)
            for p := range latestMeta.Parts {
                dataErrsByPart[p][i] = partErr
            }
        }
    }

    for i, onlineDisk := range onlineDisks {
        if metaErrs[i] != nil {
            continue
        }

        meta := partsMetadata[i]
        if meta.Deleted || meta.IsRemote() {
            continue
        }

        // Always check data, if we got it.
        if (len(meta.Data) > 0 || meta.Size == 0) && len(meta.Parts) > 0 {
            checksumInfo := meta.Erasure.GetChecksumInfo(meta.Parts[0].Number)
            verifyErr := bitrotVerify(bytes.NewReader(meta.Data),
                int64(len(meta.Data)),
                meta.Erasure.ShardFileSize(meta.Size),
                checksumInfo.Algorithm,
                checksumInfo.Hash, meta.Erasure.ShardSize())
            dataErrsByPart[0][i] = convPartErrToInt(verifyErr)
            continue
        }

        var (
            verifyErr  error
            verifyResp *CheckPartsResp
        )

        switch scanMode {
        case madmin.HealDeepScan:
            // The disk has a valid xl.meta but may not have all the
            // parts. This is considered an outdated disk, since
            // it needs healing too.
            verifyResp, verifyErr = onlineDisk.VerifyFile(ctx, bucket, object, meta)
        default:
            verifyResp, verifyErr = onlineDisk.CheckParts(ctx, bucket, object, meta)
        }

        for p := range latestMeta.Parts {
            if verifyErr != nil {
                dataErrsByPart[p][i] = convPartErrToInt(verifyErr)
            } else {
                dataErrsByPart[p][i] = verifyResp.Results[p]
            }
        }
    }

    // Build dataErrs by disk from dataErrs by part
    for part, disks := range dataErrsByPart {
        for disk := range disks {
            dataErrsByDisk[disk][part] = dataErrsByPart[part][disk]
        }
    }

    return
}
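
// Illustrative note (not part of the original code): a caller can decide
// whether disk i needs healing by checking partNeedsHealing(dataErrsByDisk[i]).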