Add PutObject Ring Buffer (#19605)

Replace the `io.Pipe` from streamingBitrotWriter -> CreateFile with a fixed-size ring buffer.

This adds an output buffer for encoded shards before they are written to disk, potentially via RPC.

This removes blocking when `(*streamingBitrotWriter).Write` is called to write hashes and data.

With current settings, the write looks like this:

```
Outbound
┌───────────────────┐             ┌────────────────┐               ┌───────────────┐                      ┌────────────────┐
│                   │   Parr.     │                │  (http body)  │               │                      │                │
│ Bitrot Hash       │     Write   │      Pipe      │      Read     │  HTTP buffer  │    Write (syscall)   │  TCP Buffer    │
│ Erasure Shard     │ ──────────► │  (unbuffered)  │ ────────────► │   (64K Max)   │ ───────────────────► │    (4MB)       │
│                   │             │                │               │  (io.Copy)    │                      │                │
└───────────────────┘             └────────────────┘               └───────────────┘                      └────────────────┘
```

We write a hash (32 bytes). Since the pipe is unbuffered, the write blocks until the 32 bytes have 
been delivered to the TCP buffer and the next Read hits the pipe.

Then we write the shard data. This will typically be bigger than 64KB, so it will block until two blocks 
have been read from the pipe.
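
For reference, here is a minimal, standalone Go sketch (not MinIO code) of this blocking behavior: an `io.Pipe` write does not return until a reader has consumed the bytes.

```go
package main

import (
	"fmt"
	"io"
	"time"
)

func main() {
	r, w := io.Pipe()

	go func() {
		start := time.Now()
		w.Write(make([]byte, 32)) // the 32-byte "hash"; blocks until the reader below consumes it
		fmt.Println("hash write returned after", time.Since(start))
		w.Close()
	}()

	time.Sleep(100 * time.Millisecond) // simulate a slow consumer (e.g. a congested network)
	io.Copy(io.Discard, r)             // the reader side, analogous to the HTTP body copy
}
```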

When we insert a ring buffer:

```
Outbound
┌───────────────────┐             ┌────────────────┐               ┌───────────────┐                      ┌────────────────┐
│                   │             │                │  (http body)  │               │                      │                │
│ Bitrot Hash       │     Write   │  Ring Buffer   │      Read     │  HTTP buffer  │    Write (syscall)   │  TCP Buffer    │
│ Erasure Shard     │ ──────────► │    (2MB)       │ ────────────► │   (64K Max)   │ ───────────────────► │    (4MB)       │
│                   │             │                │               │  (io.Copy)    │                      │                │
└───────────────────┘             └────────────────┘               └───────────────┘                      └────────────────┘
```

The hash+shard will fit within the ring buffer, so writes will not block, but complete after a 
memcopy. Reads can fill the 64KB buffer if there is data for it.

If the network is congested, the ring buffer will become filled, and all syscalls will be on full buffers.
Only when the ring buffer is filled will erasure coding start blocking.
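
To illustrate how the blocking ring buffer decouples the producer and consumer sides, here is a rough sketch. It only uses the calls visible in the diff below (`ringbuffer.NewBuffer`, `SetBlocking`, `WriteCloser`, `CloseWithError`, and the buffer as an `io.Reader`); the package is internal to the MinIO module, so this compiles only inside that tree, and `conn`, `hash`, and `shard` are placeholders.

```go
package main

import (
	"io"
	"net"

	"github.com/minio/minio/internal/ringbuffer" // internal package: only available inside the MinIO tree
)

// streamShard sketches the producer/consumer split around the ring buffer.
// hash and shard stand in for the 32-byte checksum and the encoded shard data.
func streamShard(conn net.Conn, hash, shard []byte) error {
	buf := make([]byte, 2<<20) // 2 MiB backing slice, matching the diagram above
	rb := ringbuffer.NewBuffer(buf).SetBlocking(true)

	// Consumer: rb is an io.Reader; in MinIO this is what gets handed to
	// disk.CreateFile. A failed copy closes the ring with the error, which
	// unblocks and fails any in-flight producer Write.
	done := make(chan struct{})
	go func() {
		defer close(done)
		_, err := io.Copy(conn, rb)
		rb.CloseWithError(err)
	}()

	// Producer: each write normally completes after a memcopy; it only blocks
	// if the 2 MiB ring is full (i.e. the consumer cannot keep up).
	w := rb.WriteCloser()
	if _, err := w.Write(hash); err != nil {
		return err
	}
	if _, err := w.Write(shard); err != nil {
		return err
	}
	err := w.Close() // flush remaining data and signal EOF to the consumer
	<-done           // wait for the consumer, like canClose.Wait() in the real Close()
	return err
}

func main() {}
```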

Since there is always "space" to write output data, we remove the parallel writing: we are 
always writing to memory now, and the goroutine synchronization overhead is probably not worth it.

If the output were blocked in the existing code, the parallel write would still be waiting for it to 
unblock, so it makes no difference there, except that the ring buffer now smooths out the load.
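
To illustrate the simplification (the real before/after is the `parallelWriter` -> `multiWriter` change in the diff further down), here is a hypothetical, self-contained sketch of the two write patterns; `writeParallel` and `writeSequential` are illustrative names, not MinIO functions.

```go
package main

import (
	"io"
	"sync"
)

// writeParallel mirrors the old pattern: one goroutine per destination,
// joined with a WaitGroup. Worth it when a Write may block on I/O.
func writeParallel(writers []io.Writer, blocks [][]byte, errs []error) {
	var wg sync.WaitGroup
	for i := range writers {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			_, errs[i] = writers[i].Write(blocks[i])
		}(i)
	}
	wg.Wait()
}

// writeSequential mirrors the new pattern: with a ring buffer behind each
// writer, Write is normally just a memcopy, so a plain loop avoids the
// per-block goroutine and WaitGroup overhead.
func writeSequential(writers []io.Writer, blocks [][]byte, errs []error) {
	for i := range writers {
		_, errs[i] = writers[i].Write(blocks[i])
	}
}

func main() {
	writers := []io.Writer{io.Discard, io.Discard}
	blocks := [][]byte{make([]byte, 1024), make([]byte, 1024)}
	errs := make([]error, len(writers))
	writeParallel(writers, blocks, errs)
	writeSequential(writers, blocks, errs)
}
```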

There are some micro-optimizations we could look at later. The biggest is that, in most cases, 
we could encode directly into the ring buffer, if we are not at a boundary. Also, "force filling" the 
Read requests (i.e., blocking until a full read can be completed) could be investigated and might 
allow concurrent memory copies on read and write.

Author: Klaus Post
Date: 2024-05-14 17:11:04 -07:00
Committed by: GitHub
Parent: de4d3dac00
Commit: d4b391de1b
9 changed files with 1902 additions and 36 deletions


```diff
@@ -26,15 +26,17 @@ import (
 	xhttp "github.com/minio/minio/internal/http"
 	"github.com/minio/minio/internal/ioutil"
+	"github.com/minio/minio/internal/ringbuffer"
 )
 
 // Calculates bitrot in chunks and writes the hash into the stream.
 type streamingBitrotWriter struct {
 	iow          io.WriteCloser
-	closeWithErr func(err error) error
+	closeWithErr func(err error)
 	h            hash.Hash
 	shardSize    int64
 	canClose     *sync.WaitGroup
+	byteBuf      []byte
 }
 
 func (b *streamingBitrotWriter) Write(p []byte) (int, error) {
@@ -62,7 +64,10 @@ func (b *streamingBitrotWriter) Write(p []byte) (int, error) {
 }
 
 func (b *streamingBitrotWriter) Close() error {
+	// Close the underlying writer.
+	// This will also flush the ring buffer if used.
 	err := b.iow.Close()
+
 	// Wait for all data to be written before returning else it causes race conditions.
 	// Race condition is because of io.PipeWriter implementation. i.e consider the following
 	// sequent of operations:
@@ -73,29 +78,34 @@ func (b *streamingBitrotWriter) Close() error {
 	if b.canClose != nil {
 		b.canClose.Wait()
 	}
+	// Recycle the buffer.
+	if b.byteBuf != nil {
+		globalBytePoolCap.Load().Put(b.byteBuf)
+		b.byteBuf = nil
+	}
 	return err
 }
 
 // newStreamingBitrotWriterBuffer returns streaming bitrot writer implementation.
 // The output is written to the supplied writer w.
 func newStreamingBitrotWriterBuffer(w io.Writer, algo BitrotAlgorithm, shardSize int64) io.Writer {
-	return &streamingBitrotWriter{iow: ioutil.NopCloser(w), h: algo.New(), shardSize: shardSize, canClose: nil, closeWithErr: func(err error) error {
-		// Similar to CloseWithError on pipes we always return nil.
-		return nil
-	}}
+	return &streamingBitrotWriter{iow: ioutil.NopCloser(w), h: algo.New(), shardSize: shardSize, canClose: nil, closeWithErr: func(err error) {}}
 }
 
 // Returns streaming bitrot writer implementation.
 func newStreamingBitrotWriter(disk StorageAPI, origvolume, volume, filePath string, length int64, algo BitrotAlgorithm, shardSize int64) io.Writer {
-	r, w := io.Pipe()
 	h := algo.New()
+
+	buf := globalBytePoolCap.Load().Get()
+	rb := ringbuffer.NewBuffer(buf[:cap(buf)]).SetBlocking(true)
 	bw := &streamingBitrotWriter{
-		iow:          ioutil.NewDeadlineWriter(w, globalDriveConfig.GetMaxTimeout()),
-		closeWithErr: w.CloseWithError,
+		iow:          ioutil.NewDeadlineWriter(rb.WriteCloser(), globalDriveConfig.GetMaxTimeout()),
+		closeWithErr: rb.CloseWithError,
 		h:            h,
 		shardSize:    shardSize,
 		canClose:     &sync.WaitGroup{},
+		byteBuf:      buf,
 	}
 	bw.canClose.Add(1)
 	go func() {
@@ -106,7 +116,7 @@ func newStreamingBitrotWriter(disk StorageAPI, origvolume, volume, filePath stri
 			bitrotSumsTotalSize := ceilFrac(length, shardSize) * int64(h.Size()) // Size used for storing bitrot checksums.
 			totalFileSize = bitrotSumsTotalSize + length
 		}
-		r.CloseWithError(disk.CreateFile(context.TODO(), origvolume, volume, filePath, totalFileSize, r))
+		rb.CloseWithError(disk.CreateFile(context.TODO(), origvolume, volume, filePath, totalFileSize, rb))
 	}()
 	return bw
 }
```


```diff
@@ -346,7 +346,7 @@ func (e Erasure) Heal(ctx context.Context, writers []io.Writer, readers []io.Rea
 		return err
 	}
-	w := parallelWriter{
+	w := multiWriter{
 		writers:     writers,
 		writeQuorum: 1,
 		errs:        make([]error, len(writers)),
 	}
```


```diff
@@ -21,44 +21,36 @@ import (
 	"context"
 	"fmt"
 	"io"
-	"sync"
 )
 
-// Writes in parallel to writers
-type parallelWriter struct {
+// Writes to multiple writers
+type multiWriter struct {
 	writers     []io.Writer
 	writeQuorum int
 	errs        []error
 }
 
-// Write writes data to writers in parallel.
-func (p *parallelWriter) Write(ctx context.Context, blocks [][]byte) error {
-	var wg sync.WaitGroup
-
+// Write writes data to writers.
+func (p *multiWriter) Write(ctx context.Context, blocks [][]byte) error {
 	for i := range p.writers {
+		if p.errs[i] != nil {
+			continue
+		}
 		if p.writers[i] == nil {
 			p.errs[i] = errDiskNotFound
 			continue
 		}
-		if p.errs[i] != nil {
-			continue
-		}
-		wg.Add(1)
-		go func(i int) {
-			defer wg.Done()
-			var n int
-			n, p.errs[i] = p.writers[i].Write(blocks[i])
-			if p.errs[i] == nil {
-				if n != len(blocks[i]) {
-					p.errs[i] = io.ErrShortWrite
-					p.writers[i] = nil
-				}
-			} else {
-				p.writers[i] = nil
-			}
-		}(i)
+		var n int
+		n, p.errs[i] = p.writers[i].Write(blocks[i])
+		if p.errs[i] == nil {
+			if n != len(blocks[i]) {
+				p.errs[i] = io.ErrShortWrite
+				p.writers[i] = nil
+			}
+		} else {
+			p.writers[i] = nil
+		}
 	}
-	wg.Wait()
 
 	// If nilCount >= p.writeQuorum, we return nil. This is because HealFile() uses
 	// CreateFile with p.writeQuorum=1 to accommodate healing of single disk.
@@ -75,7 +67,7 @@ func (p *parallelWriter) Write(ctx context.Context, blocks [][]byte) error {
 
 // Encode reads from the reader, erasure-encodes the data and writes to the writers.
 func (e *Erasure) Encode(ctx context.Context, src io.Reader, writers []io.Writer, buf []byte, quorum int) (total int64, err error) {
-	writer := &parallelWriter{
+	writer := &multiWriter{
 		writers:     writers,
 		writeQuorum: quorum,
 		errs:        make([]error, len(writers)),
```