Mirror of https://github.com/minio/minio.git
fs: Re-implement object layer to remember the fd (#3509)
This patch re-writes the FS backend to support a shared backend, sharing locks for safe concurrent access across multiple servers.
@@ -18,6 +18,8 @@ package cmd
 
 import (
     "errors"
+    "io"
+    "os"
     "reflect"
     "sync"
     "time"
@@ -30,15 +32,16 @@ import (
 var errPartsMissing = errors.New("required parts missing")
 
 // Error sent when appendParts go-routine has waited long enough and timedout.
-var errAppendPartsTimeout = errors.New("appendParts goroutine timeout")
+var errAppendPartsTimeout = errors.New("appendParts go-routine timeout")
 
 // Timeout value for the appendParts go-routine.
-var appendPartsTimeout = 24 * 60 * 60 * time.Second // 24 hours.
+var appendPartsTimeout = 24 * 60 * 60 * time.Second // 24 Hours.
 
 // Holds a map of uploadID->appendParts go-routine
 type backgroundAppend struct {
-    infoMap map[string]bgAppendPartsInfo
     sync.Mutex
+    infoMap    map[string]bgAppendPartsInfo
+    appendFile io.WriteCloser
 }
 
 // Input to the appendParts go-routine
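Note: backgroundAppend acts as a small registry that maps each uploadID to the channels of its dedicated appendParts go-routine, guarded by the embedded sync.Mutex. A minimal, self-contained sketch of that registry shape, using illustrative names rather than MinIO's types:

package main

import (
    "fmt"
    "sync"
)

// workerChans plays the role of bgAppendPartsInfo: the channels used to talk
// to one background append worker.
type workerChans struct {
    inputCh chan string
    doneCh  chan struct{}
}

// workerRegistry plays the role of backgroundAppend: a mutex-guarded map from
// uploadID to that upload's worker channels.
type workerRegistry struct {
    sync.Mutex
    infoMap map[string]workerChans
}

func (r *workerRegistry) register(uploadID string, w workerChans) {
    r.Lock()
    defer r.Unlock()
    r.infoMap[uploadID] = w
}

func (r *workerRegistry) lookup(uploadID string) (workerChans, bool) {
    r.Lock()
    defer r.Unlock()
    w, ok := r.infoMap[uploadID]
    return w, ok
}

func main() {
    r := &workerRegistry{infoMap: make(map[string]workerChans)}
    r.register("upload-1", workerChans{make(chan string), make(chan struct{})})

    _, ok := r.lookup("upload-1")
    fmt.Println("worker registered:", ok) // worker registered: true
}

The real struct stores bgAppendPartsInfo values (input, timeout, abort and complete channels) rather than this trimmed-down pair.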
@@ -56,9 +59,9 @@ type bgAppendPartsInfo struct {
 }
 
 // Called after a part is uploaded so that it can be appended in the background.
-func (b *backgroundAppend) append(disk StorageAPI, bucket, object, uploadID string, meta fsMetaV1) chan error {
-    b.Lock()
-    info, ok := b.infoMap[uploadID]
+func (fs fsObjects) append(bucket, object, uploadID string, meta fsMetaV1) chan error {
+    fs.bgAppend.Lock()
+    info, ok := fs.bgAppend.infoMap[uploadID]
     if !ok {
         // Corresponding appendParts go-routine was not found, create a new one. Would happen when the first
         // part of a multipart upload is uploaded.
@@ -68,11 +71,12 @@ func (b *backgroundAppend) append(disk StorageAPI, bucket, object, uploadID stri
         completeCh := make(chan struct{})
 
         info = bgAppendPartsInfo{inputCh, timeoutCh, abortCh, completeCh}
-        b.infoMap[uploadID] = info
-        go b.appendParts(disk, bucket, object, uploadID, info)
+        fs.bgAppend.infoMap[uploadID] = info
+
+        go fs.appendParts(bucket, object, uploadID, info)
     }
-    b.Unlock()
+    fs.bgAppend.Unlock()
 
     errCh := make(chan error)
     go func() {
         // send input in a goroutine as send on the inputCh can block if appendParts go-routine
@@ -85,19 +89,23 @@ func (b *backgroundAppend) append(disk StorageAPI, bucket, object, uploadID stri
         case info.inputCh <- bgAppendPartsInput{meta, errCh}:
         }
     }()
+
     return errCh
 }
 
 // Called on complete-multipart-upload. Returns nil if the required parts have been appended.
-func (b *backgroundAppend) complete(disk StorageAPI, bucket, object, uploadID string, meta fsMetaV1) error {
-    b.Lock()
-    defer b.Unlock()
-    info, ok := b.infoMap[uploadID]
-    delete(b.infoMap, uploadID)
+func (fs *fsObjects) complete(bucket, object, uploadID string, meta fsMetaV1) error {
+    fs.bgAppend.Lock()
+    defer fs.bgAppend.Unlock()
+
+    info, ok := fs.bgAppend.infoMap[uploadID]
+    delete(fs.bgAppend.infoMap, uploadID)
     if !ok {
         return errPartsMissing
     }
+
    errCh := make(chan error)
+
    select {
    case <-info.timeoutCh:
        // This is to handle a rare race condition where we found info in b.infoMap
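Note: both append and complete hand work to the background go-routine with the same guarded send. They select between receiving on info.timeoutCh (which the worker closes when it exits) and sending on info.inputCh, so a caller never blocks forever on a worker that has already timed out; the race-condition comment above covers the case where both cases are ready at once. A standalone sketch of the pattern, with illustrative names:

package main

import (
    "errors"
    "fmt"
)

var errWorkerGone = errors.New("background worker already exited")

// sendInput delivers a value to a worker without risking a permanent block:
// if the worker has exited it has closed doneCh, so that receive case fires
// immediately instead of waiting on a channel nobody reads.
func sendInput(inputCh chan<- string, doneCh <-chan struct{}, v string) error {
    select {
    case <-doneCh:
        return errWorkerGone
    case inputCh <- v:
        return nil
    }
}

func main() {
    inputCh := make(chan string)
    doneCh := make(chan struct{})

    // Simulate a worker that has already timed out and exited.
    close(doneCh)

    fmt.Println(sendInput(inputCh, doneCh, "part-1"))
}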
@@ -105,6 +113,7 @@ func (b *backgroundAppend) complete(disk StorageAPI, bucket, object, uploadID st
         return errAppendPartsTimeout
     case info.inputCh <- bgAppendPartsInput{meta, errCh}:
     }
+
     err := <-errCh
 
     close(info.completeCh)
@@ -113,21 +122,26 @@ func (b *backgroundAppend) complete(disk StorageAPI, bucket, object, uploadID st
 }
 
 // Called after complete-multipart-upload or abort-multipart-upload so that the appendParts go-routine is not left dangling.
-func (b *backgroundAppend) abort(uploadID string) {
-    b.Lock()
-    defer b.Unlock()
-    info, ok := b.infoMap[uploadID]
+func (fs fsObjects) abort(uploadID string) {
+    fs.bgAppend.Lock()
+    defer fs.bgAppend.Unlock()
+
+    info, ok := fs.bgAppend.infoMap[uploadID]
     if !ok {
         return
     }
-    delete(b.infoMap, uploadID)
+
+    delete(fs.bgAppend.infoMap, uploadID)
 
     info.abortCh <- struct{}{}
 }
 
 // This is run as a go-routine that appends the parts in the background.
-func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID string, info bgAppendPartsInfo) {
+func (fs fsObjects) appendParts(bucket, object, uploadID string, info bgAppendPartsInfo) {
+    appendPath := pathJoin(fs.fsPath, minioMetaTmpBucket, fs.fsUUID, uploadID)
     // Holds the list of parts that is already appended to the "append" file.
     appendMeta := fsMetaV1{}
+
+    // Allocate staging read buffer.
     buf := make([]byte, readSizeV1)
     for {
@@ -136,6 +150,7 @@ func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID
             // We receive on this channel when new part gets uploaded or when complete-multipart sends
             // a value on this channel to confirm if all the required parts are appended.
             meta := input.meta
+
             for {
                 // Append should be done such a way that if part-3 and part-2 is uploaded before part-1, we
                 // wait till part-1 is uploaded after which we append part-2 and part-3 as well in this for-loop.
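Note: the helper that picks which part to append next is not part of this hunk; the loop above only works because parts are concatenated strictly in order, holding part-2 and part-3 back until part-1 has arrived. A simplified, hypothetical stand-in for that "next contiguous part" check:

package main

import "fmt"

// nextPartToAppend returns the number of the next part that may be appended,
// given how many parts are already in the append file and which part numbers
// have been uploaded so far. It enforces strictly sequential appends: part N+1
// can only be appended once parts 1..N are in.
func nextPartToAppend(appended int, uploaded map[int]bool) (int, bool) {
    next := appended + 1
    if uploaded[next] {
        return next, true
    }
    return 0, false
}

func main() {
    uploaded := map[int]bool{2: true, 3: true} // parts 2 and 3 arrived before part 1
    appended := 0

    if _, ok := nextPartToAppend(appended, uploaded); !ok {
        fmt.Println("holding back: part 1 not uploaded yet")
    }

    uploaded[1] = true // part 1 arrives
    for {
        n, ok := nextPartToAppend(appended, uploaded)
        if !ok {
            break
        }
        fmt.Println("appending part", n)
        appended = n
    }
}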
@@ -152,18 +167,23 @@ func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID
                     }
                     break
                 }
-                if err := appendPart(disk, bucket, object, uploadID, part, buf); err != nil {
-                    disk.DeleteFile(minioMetaTmpBucket, uploadID)
+
+                if err := fs.appendPart(bucket, object, uploadID, part, buf); err != nil {
+                    fsRemoveFile(appendPath)
                     appendMeta.Parts = nil
                     input.errCh <- err
                     break
                 }
+
                 appendMeta.AddObjectPart(part.Number, part.Name, part.ETag, part.Size)
             }
         case <-info.abortCh:
             // abort-multipart-upload closed abortCh to end the appendParts go-routine.
-            disk.DeleteFile(minioMetaTmpBucket, uploadID)
-            close(info.timeoutCh) // So that any racing PutObjectPart does not leave a dangling go-routine.
+            fsRemoveFile(appendPath)
+
+            // So that any racing PutObjectPart does not leave a dangling go-routine.
+            close(info.timeoutCh)
+
             return
         case <-info.completeCh:
             // complete-multipart-upload closed completeCh to end the appendParts go-routine.
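Note on close(info.timeoutCh) above: closing a channel is effectively a broadcast in Go, because every pending and future receive on a closed channel returns immediately. That is what guarantees a racing PutObjectPart, stuck in the guarded select shown earlier, gets unblocked instead of leaving a dangling go-routine. A tiny self-contained demonstration:

package main

import (
    "fmt"
    "sync"
)

func main() {
    done := make(chan struct{})
    var wg sync.WaitGroup

    // Several goroutines block on receives, like racing callers selecting on
    // info.timeoutCh above.
    for i := 0; i < 3; i++ {
        wg.Add(1)
        go func(id int) {
            defer wg.Done()
            <-done // a receive on a closed channel returns immediately
            fmt.Println("caller", id, "unblocked")
        }(i)
    }

    // Closing the channel wakes every pending and future receiver, so no
    // caller is left dangling.
    close(done)
    wg.Wait()
}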
@@ -172,11 +192,12 @@ func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID
         case <-time.After(appendPartsTimeout):
             // Timeout the goroutine to garbage collect its resources. This would happen if the client initiates
             // a multipart upload and does not complete/abort it.
-            b.Lock()
-            delete(b.infoMap, uploadID)
-            b.Unlock()
+            fs.bgAppend.Lock()
+            delete(fs.bgAppend.infoMap, uploadID)
+            fs.bgAppend.Unlock()
 
             // Delete the temporary append file as well.
-            disk.DeleteFile(minioMetaTmpBucket, uploadID)
+            fsRemoveFile(appendPath)
+
             close(info.timeoutCh)
             return
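Note: taken together, appendParts is a per-upload event loop: it appends on new input, cleans up on abort or complete, and garbage-collects itself via time.After if the client abandons the upload. A stripped-down sketch of that loop shape, with illustrative names and none of the metadata or temp-file handling the real code does:

package main

import (
    "fmt"
    "time"
)

// worker mirrors the shape of appendParts above: loop forever, reacting to
// new input, an abort signal, a completion signal, or an idle timeout.
// time.After is re-armed on every iteration, so it is an idle timeout since
// the last event (24 hours in the real code).
func worker(inputCh <-chan string, abortCh, completeCh <-chan struct{}, idle time.Duration) {
    for {
        select {
        case in := <-inputCh:
            fmt.Println("appending", in)
        case <-abortCh:
            fmt.Println("aborted: cleaning up temp file")
            return
        case <-completeCh:
            fmt.Println("completed: temp file promoted elsewhere")
            return
        case <-time.After(idle):
            fmt.Println("idle timeout: garbage collecting abandoned upload")
            return
        }
    }
}

func main() {
    inputCh := make(chan string)
    abortCh := make(chan struct{})
    completeCh := make(chan struct{})

    go worker(inputCh, abortCh, completeCh, 200*time.Millisecond)

    inputCh <- "part-1"
    close(completeCh)
    time.Sleep(100 * time.Millisecond) // give the worker time to print
}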
@@ -186,29 +207,34 @@ func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID
 
 // Appends the "part" to the append-file inside "tmp/" that finally gets moved to the actual location
 // upon complete-multipart-upload.
-func appendPart(disk StorageAPI, bucket, object, uploadID string, part objectPartInfo, buf []byte) error {
-    partPath := pathJoin(bucket, object, uploadID, part.Name)
+func (fs fsObjects) appendPart(bucket, object, uploadID string, part objectPartInfo, buf []byte) error {
+    partPath := pathJoin(fs.fsPath, minioMetaMultipartBucket, bucket, object, uploadID, part.Name)
 
     offset := int64(0)
-    totalLeft := part.Size
-    for totalLeft > 0 {
-        curLeft := int64(readSizeV1)
-        if totalLeft < readSizeV1 {
-            curLeft = totalLeft
-        }
-        n, err := disk.ReadFile(minioMetaMultipartBucket, partPath, offset, buf[:curLeft])
-        if err != nil {
-            // Check for EOF/ErrUnexpectedEOF not needed as it should never happen as we know
-            // the exact size of the file and hence know the size of buf[]
-            // EOF/ErrUnexpectedEOF indicates that the length of file was shorter than part.Size and
-            // hence considered as an error condition.
-            return err
-        }
-        if err = disk.AppendFile(minioMetaTmpBucket, uploadID, buf[:n]); err != nil {
-            return err
-        }
-        offset += n
-        totalLeft -= n
+    // Read each file part to start writing to the temporary concatenated object.
+    file, size, err := fsOpenFile(partPath, offset)
+    if err != nil {
+        if err == errFileNotFound {
+            return errPartsMissing
+        }
+        return err
     }
-    return nil
+    defer file.Close()
+
+    tmpObjPath := pathJoin(fs.fsPath, minioMetaTmpBucket, fs.fsUUID, uploadID)
+    // No need to hold a lock, this is a unique file and will be only written
+    // to one one process per uploadID per minio process.
+    wfile, err := os.OpenFile(preparePath(tmpObjPath), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666)
+    if err != nil {
+        return err
+    }
+    defer wfile.Close()
+
+    // Fallocate more space as we concatenate.
+    if err = fsFAllocate(int(wfile.Fd()), 0, size); err != nil {
+        return err
+    }
+
+    _, err = io.CopyBuffer(wfile, file, buf)
+    return err
 }
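Note: fsOpenFile, fsFAllocate and preparePath are MinIO-internal helpers that are not part of this diff. Ignoring preallocation and offset handling, the new appendPart boils down to: open the uploaded part, open the per-upload temp object with O_APPEND, and io.CopyBuffer one into the other through a shared staging buffer. A standard-library-only sketch of that shape, using hypothetical paths rather than MinIO's layout:

package main

import (
    "fmt"
    "io"
    "log"
    "os"
    "path/filepath"
)

// appendFileTo copies one uploaded part onto the end of the temporary
// concatenated object, much like fs.appendPart above but with only the
// standard library (no fallocate, no MinIO path helpers).
func appendFileTo(tmpObjPath, partPath string, buf []byte) error {
    part, err := os.Open(partPath)
    if err != nil {
        return err
    }
    defer part.Close()

    // O_APPEND means each part lands after whatever was concatenated before,
    // and the file is re-opened per call, as the diff does.
    w, err := os.OpenFile(tmpObjPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666)
    if err != nil {
        return err
    }
    defer w.Close()

    // Reuse the caller's staging buffer, as the diff does with readSizeV1.
    _, err = io.CopyBuffer(w, part, buf)
    return err
}

func main() {
    dir, err := os.MkdirTemp("", "append-demo")
    if err != nil {
        log.Fatal(err)
    }
    defer os.RemoveAll(dir)

    p1 := filepath.Join(dir, "part.1")
    p2 := filepath.Join(dir, "part.2")
    os.WriteFile(p1, []byte("hello "), 0666)
    os.WriteFile(p2, []byte("world"), 0666)

    tmpObj := filepath.Join(dir, "upload-id.tmp")
    buf := make([]byte, 32*1024)
    for _, p := range []string{p1, p2} {
        if err := appendFileTo(tmpObj, p, buf); err != nil {
            log.Fatal(err)
        }
    }

    data, _ := os.ReadFile(tmpObj)
    fmt.Println(string(data)) // "hello world"
}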