Add PutObject Ring Buffer (#19605)

Replace the `io.Pipe` from streamingBitrotWriter -> CreateFile with a fixed size ring buffer.

This adds an output buffer for encoded shards to be written to disk - potentially via RPC.

This removes blocking when `(*streamingBitrotWriter).Write` is called to write hashes and data.

With current settings, the write path looks like this:

```
Outbound
┌───────────────────┐             ┌────────────────┐             ┌───────────────┐                      ┌────────────────┐
│                   │    Parr.    │                │ (http body) │               │                      │                │
│    Bitrot Hash    │    Write    │      Pipe      │    Read     │  HTTP buffer  │   Write (syscall)    │   TCP Buffer   │
│   Erasure Shard   │ ──────────► │  (unbuffered)  │────────────►│   (64K Max)   │ ───────────────────► │     (4MB)      │
│                   │             │                │             │   (io.Copy)   │                      │                │
└───────────────────┘             └────────────────┘             └───────────────┘                      └────────────────┘
```

We write a Hash (32 bytes). Since the pipe is unbuffered, it will block until the 32 bytes have been delivered to the TCP buffer and the next Read hits the Pipe. Then we write the shard data. This will typically be bigger than 64KB, so it will block until two blocks have been read from the pipe.

When we insert a ring buffer:

```
Outbound
┌───────────────────┐             ┌────────────────┐             ┌───────────────┐                      ┌────────────────┐
│                   │             │                │ (http body) │               │                      │                │
│    Bitrot Hash    │    Write    │  Ring Buffer   │    Read     │  HTTP buffer  │   Write (syscall)    │   TCP Buffer   │
│   Erasure Shard   │ ──────────► │     (2MB)      │────────────►│   (64K Max)   │ ───────────────────► │     (4MB)      │
│                   │             │                │             │   (io.Copy)   │                      │                │
└───────────────────┘             └────────────────┘             └───────────────┘                      └────────────────┘
```

The hash+shard will fit within the ring buffer, so writes will not block - they complete after a memcopy. Reads can fill the 64KB buffer if there is data for it. If the network is congested, the ring buffer will fill up, and all syscalls will be on full buffers. Only when the ring buffer is full will erasure coding start blocking.

Since there is always "space" to write output data, we remove the parallel writing: we are always writing to memory now, and the goroutine synchronization overhead is probably not worth it. If the output were blocked in the existing code, the parallel write would still have to wait for it to unblock, so it makes no difference there - except that now the ring buffer smooths out the load.

There are some micro-optimizations we could look at later. The biggest is that, in most cases, we could encode directly to the ring buffer - if we are not at a boundary. Also, "force filling" the Read requests (i.e., blocking until a full read can be completed) could be investigated and might allow concurrent memory access on reads and writes.
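To make the blocking behavior concrete, here is a minimal sketch of a fixed-size ring buffer exposing `io.Writer`/`io.Reader`. This illustrates the technique only and is not MinIO's actual implementation; the `ringBuffer` type, `newRingBuffer`, and the sizes are hypothetical. The property the commit relies on is visible in `Write`: a small write (such as a 32-byte hash) returns after a memcopy and only blocks once the buffer is full.

```go
package main

import (
	"fmt"
	"io"
	"sync"
)

// ringBuffer is a fixed-size FIFO byte buffer. Writes block only when
// the buffer is full; reads block only when it is empty.
type ringBuffer struct {
	mu     sync.Mutex
	cond   *sync.Cond
	buf    []byte
	r, w   int  // read and write positions
	full   bool // distinguishes full from empty when r == w
	closed bool
}

func newRingBuffer(size int) *ringBuffer {
	rb := &ringBuffer{buf: make([]byte, size)}
	rb.cond = sync.NewCond(&rb.mu)
	return rb
}

// free returns the number of bytes that can be written without blocking.
func (rb *ringBuffer) free() int {
	if rb.full {
		return 0
	}
	if rb.w >= rb.r {
		return len(rb.buf) - rb.w + rb.r
	}
	return rb.r - rb.w
}

// Write copies p into the buffer. It completes after a memcopy while
// space remains and blocks only once the buffer is full.
func (rb *ringBuffer) Write(p []byte) (n int, err error) {
	rb.mu.Lock()
	defer rb.mu.Unlock()
	for len(p) > 0 {
		for rb.free() == 0 && !rb.closed {
			rb.cond.Wait() // buffer full: only now does the producer block
		}
		if rb.closed {
			return n, io.ErrClosedPipe
		}
		// Copy as much as fits before the write position wraps or hits r.
		end := len(rb.buf)
		if rb.r > rb.w {
			end = rb.r
		}
		c := copy(rb.buf[rb.w:end], p)
		rb.w = (rb.w + c) % len(rb.buf)
		rb.full = rb.w == rb.r
		p, n = p[c:], n+c
		rb.cond.Broadcast() // wake a waiting reader
	}
	return n, nil
}

// Read drains whatever is buffered, up to len(p) bytes.
func (rb *ringBuffer) Read(p []byte) (int, error) {
	rb.mu.Lock()
	defer rb.mu.Unlock()
	for rb.r == rb.w && !rb.full {
		if rb.closed {
			return 0, io.EOF
		}
		rb.cond.Wait() // buffer empty: wait for a writer
	}
	end := len(rb.buf)
	if rb.w > rb.r {
		end = rb.w
	}
	n := copy(p, rb.buf[rb.r:end])
	rb.r = (rb.r + n) % len(rb.buf)
	rb.full = false
	rb.cond.Broadcast() // wake a waiting writer
	return n, nil
}

// Close releases blocked writers and lets readers drain to EOF.
func (rb *ringBuffer) Close() error {
	rb.mu.Lock()
	rb.closed = true
	rb.cond.Broadcast()
	rb.mu.Unlock()
	return nil
}

func main() {
	rb := newRingBuffer(8) // tiny buffer to force wrap-around
	go func() {
		rb.Write([]byte("hello ring")) // blocks only after the 8 bytes fill up
		rb.Close()
	}()
	out, _ := io.ReadAll(rb)
	fmt.Println(string(out)) // "hello ring"
}
```

Compared with an unbuffered `io.Pipe`, where every `Write` rendezvouses with a `Read`, the producer here is decoupled from the consumer up to the buffer size - which is exactly why erasure coding only starts blocking once the ring buffer is full.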
// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"sync/atomic"

	xioutil "github.com/minio/minio/internal/ioutil"
)

// Reads in parallel from readers.
type parallelReader struct {
	readers       []io.ReaderAt
	orgReaders    []io.ReaderAt
	dataBlocks    int
	offset        int64
	shardSize     int64
	shardFileSize int64
	buf           [][]byte
	readerToBuf   []int  // maps a reader index to its buffer index in buf
	stashBuffer   []byte // pooled allocation backing the seeded buffers; released by Done
}

// newParallelReader returns parallelReader.
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
	r2b := make([]int, len(readers))
	for i := range r2b {
		r2b[i] = i
	}
	bufs := make([][]byte, len(readers))
	shardSize := int(e.ShardSize())
	var b []byte

	// We should always have enough capacity, but older objects may be bigger;
	// we do not need the stash buffer for them.
	if globalBytePoolCap.Load().WidthCap() >= len(readers)*shardSize {
		// Fill buffers
		b = globalBytePoolCap.Load().Get()
		// Seed the buffers.
		for i := range bufs {
			bufs[i] = b[i*shardSize : (i+1)*shardSize]
		}
	}

	return &parallelReader{
		readers:       readers,
		orgReaders:    readers,
		dataBlocks:    e.dataBlocks,
		offset:        (offset / e.blockSize) * e.ShardSize(),
		shardSize:     e.ShardSize(),
		shardFileSize: e.ShardFileSize(totalLength),
		buf:           bufs, // reuse the buffers seeded from the stash allocation above
		readerToBuf:   r2b,
		stashBuffer:   b,
	}
}

// Done will release any resources used by the parallelReader.
func (p *parallelReader) Done() {
	if p.stashBuffer != nil {
		globalBytePoolCap.Load().Put(p.stashBuffer)
		p.stashBuffer = nil
	}
}

// preferReaders can mark readers as preferred.
// These will be chosen before others.
func (p *parallelReader) preferReaders(prefer []bool) {
	if len(prefer) != len(p.orgReaders) {
		return
	}
	// Copy so we don't change our input.
	tmp := make([]io.ReaderAt, len(p.orgReaders))
	copy(tmp, p.orgReaders)
	p.readers = tmp

	// next is the next non-preferred index.
	next := 0
	for i, ok := range prefer {
		if !ok || p.readers[i] == nil {
			continue
		}
		if i == next {
			next++
			continue
		}
		// Move reader with index i to index next.
		// Do this by swapping next and i.
		p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
		p.readerToBuf[next] = i
		p.readerToBuf[i] = next
		next++
	}
}

// canDecode reports whether buf can be erasure decoded.
func (p *parallelReader) canDecode(buf [][]byte) bool {
	bufCount := 0
	for _, b := range buf {
		if len(b) > 0 {
			bufCount++
		}
	}
	return bufCount >= p.dataBlocks
}

// Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
	newBuf := dst
	if len(dst) != len(p.readers) {
		newBuf = make([][]byte, len(p.readers))
	} else {
		for i := range newBuf {
			newBuf[i] = newBuf[i][:0]
		}
	}
	var newBufLK sync.RWMutex

	if p.offset+p.shardSize > p.shardFileSize {
		p.shardSize = p.shardFileSize - p.offset
	}
	if p.shardSize == 0 {
		return newBuf, nil
	}

	readTriggerCh := make(chan bool, len(p.readers))
	defer xioutil.SafeClose(readTriggerCh) // close the channel upon return

	for i := 0; i < p.dataBlocks; i++ {
		// Set up read triggers for p.dataBlocks number of reads so that it reads in parallel.
		readTriggerCh <- true
	}

	disksNotFound := int32(0)
	bitrotHeal := int32(0)       // Atomic bool flag.
	missingPartsHeal := int32(0) // Atomic bool flag.
	readerIndex := 0
	var wg sync.WaitGroup
	// if readTrigger is true, it implies next disk.ReadAt() should be tried
	// if readTrigger is false, it implies previous disk.ReadAt() was successful and there is no need
	// to try reading the next disk.
	for readTrigger := range readTriggerCh {
		newBufLK.RLock()
		canDecode := p.canDecode(newBuf)
		newBufLK.RUnlock()
		if canDecode {
			break
		}
		if readerIndex == len(p.readers) {
			break
		}
		if !readTrigger {
			continue
		}
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			rr := p.readers[i]
			if rr == nil {
				// Since reader is nil, trigger another read.
				readTriggerCh <- true
				return
			}
			bufIdx := p.readerToBuf[i]
			if p.buf[bufIdx] == nil {
				// Reading first time on this disk, hence the buffer needs to be allocated.
				// Subsequent reads will reuse this buffer.
				p.buf[bufIdx] = make([]byte, p.shardSize)
			}
			// For the last shard, the shard size might be less than previous shard sizes.
			// Hence the following statement ensures that the buffer size is reset to the right size.
			p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
			n, err := rr.ReadAt(p.buf[bufIdx], p.offset)
			if err != nil {
				switch {
				case errors.Is(err, errFileNotFound):
					atomic.StoreInt32(&missingPartsHeal, 1)
				case errors.Is(err, errFileCorrupt):
					atomic.StoreInt32(&bitrotHeal, 1)
				case errors.Is(err, errDiskNotFound):
					atomic.AddInt32(&disksNotFound, 1)
				}

				// This will be communicated upstream.
				p.orgReaders[bufIdx] = nil
				if br, ok := p.readers[i].(io.Closer); ok {
					br.Close()
				}
				p.readers[i] = nil

				// Since ReadAt returned error, trigger another read.
				readTriggerCh <- true
				return
			}
			newBufLK.Lock()
			newBuf[bufIdx] = p.buf[bufIdx][:n]
			newBufLK.Unlock()
			// Since ReadAt returned success, there is no need to trigger another read.
			readTriggerCh <- false
		}(readerIndex)
		readerIndex++
	}
	wg.Wait()

	if p.canDecode(newBuf) {
		p.offset += p.shardSize
		if missingPartsHeal == 1 {
			return newBuf, errFileNotFound
		} else if bitrotHeal == 1 {
			return newBuf, errFileCorrupt
		}
		return newBuf, nil
	}

	// If we cannot decode, just return read quorum error.
	return nil, fmt.Errorf("%w (offline-disks=%d/%d)", errErasureReadQuorum, disksNotFound, len(p.readers))
}

// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
// A set of preferred drives can be supplied. In that case they will be used and the data reconstructed.
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) {
	if offset < 0 || length < 0 {
		return -1, errInvalidArgument
	}
	if offset+length > totalLength {
		return -1, errInvalidArgument
	}
	if length == 0 {
		return 0, nil
	}

	reader := newParallelReader(readers, e, offset, totalLength)
	if len(prefer) == len(readers) {
		reader.preferReaders(prefer)
	}
	defer reader.Done()

	startBlock := offset / e.blockSize
	endBlock := (offset + length) / e.blockSize

	var bytesWritten int64
	var bufs [][]byte
	for block := startBlock; block <= endBlock; block++ {
		var blockOffset, blockLength int64
		switch {
		case startBlock == endBlock:
			blockOffset = offset % e.blockSize
			blockLength = length
		case block == startBlock:
			blockOffset = offset % e.blockSize
			blockLength = e.blockSize - blockOffset
		case block == endBlock:
			blockOffset = 0
			blockLength = (offset + length) % e.blockSize
		default:
			blockOffset = 0
			blockLength = e.blockSize
		}
		if blockLength == 0 {
			break
		}

		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			// Set only if there is enough data for reconstruction,
			// and only for expected errors; also set once.
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			// For all errors that cannot be reconstructed fail the read operation.
			return -1, err
		}

		if err = e.DecodeDataBlocks(bufs); err != nil {
			return -1, err
		}

		n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength)
		if err != nil {
			return -1, err
		}

		bytesWritten += n
	}

	if bytesWritten != length {
		return bytesWritten, errLessData
	}

	return bytesWritten, derr
}

// Heal reads from readers, reconstructs shards and writes the data to the writers.
func (e Erasure) Heal(ctx context.Context, writers []io.Writer, readers []io.ReaderAt, totalLength int64, prefer []bool) (derr error) {
	if len(writers) != e.parityBlocks+e.dataBlocks {
		return errInvalidArgument
	}

	reader := newParallelReader(readers, e, 0, totalLength)
	if len(readers) == len(prefer) {
		reader.preferReaders(prefer)
	}
	defer reader.Done()

	startBlock := int64(0)
	endBlock := totalLength / e.blockSize
	if totalLength%e.blockSize != 0 {
		endBlock++
	}

	var bufs [][]byte
	for block := startBlock; block < endBlock; block++ {
		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			return err
		}

		if err = e.DecodeDataAndParityBlocks(ctx, bufs); err != nil {
			return err
		}

		w := multiWriter{
			writers:     writers,
			writeQuorum: 1,
			errs:        make([]error, len(writers)),
		}
		if err = w.Write(ctx, bufs); err != nil {
			return err
		}
	}

	return derr
}
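
The block-window switch in `Decode` above is the subtlest arithmetic in this file. The following standalone sketch mirrors that switch and prints the per-block windows for one read; the `blockWindow` helper and the sizes are hypothetical, purely for illustration.

```go
package main

import "fmt"

// blockWindow mirrors the switch in Erasure.Decode: for a byte range
// [offset, offset+length) over blocks of blockSize, it returns the
// offset and length to read within the given block.
func blockWindow(block, startBlock, endBlock, offset, length, blockSize int64) (blockOffset, blockLength int64) {
	switch {
	case startBlock == endBlock:
		return offset % blockSize, length
	case block == startBlock:
		return offset % blockSize, blockSize - offset%blockSize
	case block == endBlock:
		return 0, (offset + length) % blockSize
	default:
		return 0, blockSize
	}
}

func main() {
	// Hypothetical read: 1 MiB blocks, 2.5 MiB starting 512 KiB into the object.
	const blockSize = int64(1 << 20)
	offset, length := int64(512<<10), int64(5<<19)
	startBlock := offset / blockSize
	endBlock := (offset + length) / blockSize
	for block := startBlock; block <= endBlock; block++ {
		bo, bl := blockWindow(block, startBlock, endBlock, offset, length, blockSize)
		// The final block gets length 0 here, which is exactly why
		// Decode breaks out of its loop when blockLength == 0.
		fmt.Printf("block %d: blockOffset=%d blockLength=%d\n", block, bo, bl)
	}
}
```

The nonzero window lengths sum to `length`, matching the `bytesWritten != length` check that makes `Decode` return `errLessData` when a read comes up short.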