xl PutObject: Split object into parts (#3651)

For faster time-to-first-byte when we try to download a big object
This commit is contained in:
Anis Elleuch 2017-01-31 00:44:42 +01:00 committed by Harshavardhana
parent 46743c7918
commit e9394dc22d
10 changed files with 169 additions and 43 deletions

View File

@ -606,6 +606,11 @@ func toAPIErrorCode(err error) (apiErr APIErrorCode) {
apiErr = ErrSignatureDoesNotMatch apiErr = ErrSignatureDoesNotMatch
case errContentSHA256Mismatch: case errContentSHA256Mismatch:
apiErr = ErrContentSHA256Mismatch apiErr = ErrContentSHA256Mismatch
case errDataTooLarge:
apiErr = ErrEntityTooLarge
case errDataTooSmall:
apiErr = ErrEntityTooSmall
} }
if apiErr != ErrNone { if apiErr != ErrNone {

View File

@ -28,7 +28,7 @@ import (
// erasureCreateFile - writes an entire stream by erasure coding to // erasureCreateFile - writes an entire stream by erasure coding to
// all the disks, writes also calculate individual block's checksum // all the disks, writes also calculate individual block's checksum
// for future bit-rot protection. // for future bit-rot protection.
func erasureCreateFile(disks []StorageAPI, volume, path string, reader io.Reader, blockSize int64, dataBlocks int, parityBlocks int, algo string, writeQuorum int) (bytesWritten int64, checkSums []string, err error) { func erasureCreateFile(disks []StorageAPI, volume, path string, reader io.Reader, allowEmpty bool, blockSize int64, dataBlocks int, parityBlocks int, algo string, writeQuorum int) (bytesWritten int64, checkSums []string, err error) {
// Allocated blockSized buffer for reading from incoming stream. // Allocated blockSized buffer for reading from incoming stream.
buf := make([]byte, blockSize) buf := make([]byte, blockSize)
@ -47,7 +47,7 @@ func erasureCreateFile(disks []StorageAPI, volume, path string, reader io.Reader
// We have reached EOF on the first byte read, io.Reader // We have reached EOF on the first byte read, io.Reader
// must be 0bytes, we don't need to erasure code // must be 0bytes, we don't need to erasure code
// data. Will create a 0byte file instead. // data. Will create a 0byte file instead.
if bytesWritten == 0 { if bytesWritten == 0 && allowEmpty {
blocks = make([][]byte, len(disks)) blocks = make([][]byte, len(disks))
rErr = appendFile(disks, volume, path, blocks, hashWriters, writeQuorum) rErr = appendFile(disks, volume, path, blocks, hashWriters, writeQuorum)
if rErr != nil { if rErr != nil {

View File

@ -56,7 +56,7 @@ func TestErasureCreateFile(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
// Test when all disks are up. // Test when all disks are up.
size, _, err := erasureCreateFile(disks, "testbucket", "testobject1", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) size, _, err := erasureCreateFile(disks, "testbucket", "testobject1", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -69,7 +69,7 @@ func TestErasureCreateFile(t *testing.T) {
disks[5] = AppendDiskDown{disks[5].(*posix)} disks[5] = AppendDiskDown{disks[5].(*posix)}
// Test when two disks are down. // Test when two disks are down.
size, _, err = erasureCreateFile(disks, "testbucket", "testobject2", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) size, _, err = erasureCreateFile(disks, "testbucket", "testobject2", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -83,7 +83,7 @@ func TestErasureCreateFile(t *testing.T) {
disks[8] = AppendDiskDown{disks[8].(*posix)} disks[8] = AppendDiskDown{disks[8].(*posix)}
disks[9] = AppendDiskDown{disks[9].(*posix)} disks[9] = AppendDiskDown{disks[9].(*posix)}
size, _, err = erasureCreateFile(disks, "testbucket", "testobject3", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) size, _, err = erasureCreateFile(disks, "testbucket", "testobject3", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -93,7 +93,7 @@ func TestErasureCreateFile(t *testing.T) {
// 1 more disk down. 7 disk down in total. Should return quorum error. // 1 more disk down. 7 disk down in total. Should return quorum error.
disks[10] = AppendDiskDown{disks[10].(*posix)} disks[10] = AppendDiskDown{disks[10].(*posix)}
_, _, err = erasureCreateFile(disks, "testbucket", "testobject4", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) _, _, err = erasureCreateFile(disks, "testbucket", "testobject4", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if errorCause(err) != errXLWriteQuorum { if errorCause(err) != errXLWriteQuorum {
t.Errorf("erasureCreateFile return value: expected errXLWriteQuorum, got %s", err) t.Errorf("erasureCreateFile return value: expected errXLWriteQuorum, got %s", err)
} }

View File

@ -48,7 +48,7 @@ func TestErasureHealFile(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
// Create a test file. // Create a test file.
size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject1", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject1", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }

View File

@ -271,7 +271,7 @@ func TestErasureReadFileDiskFail(t *testing.T) {
} }
// Create a test file to read from. // Create a test file to read from.
size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -354,7 +354,7 @@ func TestErasureReadFileOffsetLength(t *testing.T) {
} }
// Create a test file to read from. // Create a test file to read from.
size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -433,7 +433,7 @@ func TestErasureReadFileRandomOffsetLength(t *testing.T) {
iterations := 10000 iterations := 10000
// Create a test file to read from. // Create a test file to read from.
size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1) size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }

View File

@ -82,6 +82,9 @@ var (
// Caching is enabled only for RAM size > 8GiB. // Caching is enabled only for RAM size > 8GiB.
globalMaxCacheSize = uint64(0) globalMaxCacheSize = uint64(0)
// Maximum size of internal objects parts
globalPutPartSize = int64(64 * 1024 * 1024)
// Cache expiry. // Cache expiry.
globalCacheExpiry = objcache.DefaultExpiry globalCacheExpiry = objcache.DefaultExpiry

View File

@ -627,8 +627,11 @@ func (xl xlObjects) PutObjectPart(bucket, object, uploadID string, partID int, s
} }
} }
// We always allow empty part.
allowEmpty := true
// Erasure code data and write across all disks. // Erasure code data and write across all disks.
sizeWritten, checkSums, err := erasureCreateFile(onlineDisks, minioMetaTmpBucket, tmpPartPath, teeReader, xlMeta.Erasure.BlockSize, xl.dataBlocks, xl.parityBlocks, bitRotAlgo, xl.writeQuorum) sizeWritten, checkSums, err := erasureCreateFile(onlineDisks, minioMetaTmpBucket, tmpPartPath, teeReader, allowEmpty, xlMeta.Erasure.BlockSize, xl.dataBlocks, xl.parityBlocks, bitRotAlgo, xl.writeQuorum)
if err != nil { if err != nil {
return "", toObjectErr(err, bucket, object) return "", toObjectErr(err, bucket, object)
} }

View File

@ -22,6 +22,7 @@ import (
"hash" "hash"
"io" "io"
"path" "path"
"strconv"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -461,7 +462,6 @@ func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.
} }
uniqueID := mustGetUUID() uniqueID := mustGetUUID()
tempErasureObj := path.Join(uniqueID, "part.1")
tempObj := uniqueID tempObj := uniqueID
// Initialize md5 writer. // Initialize md5 writer.
@ -512,40 +512,101 @@ func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.
// Tee reader combines incoming data stream and md5, data read from input stream is written to md5. // Tee reader combines incoming data stream and md5, data read from input stream is written to md5.
teeReader := io.TeeReader(limitDataReader, mw) teeReader := io.TeeReader(limitDataReader, mw)
// Initialize xl meta. // Initialize parts metadata
partsMetadata := make([]xlMetaV1, len(xl.storageDisks))
xlMeta := newXLMetaV1(object, xl.dataBlocks, xl.parityBlocks) xlMeta := newXLMetaV1(object, xl.dataBlocks, xl.parityBlocks)
onlineDisks := getOrderedDisks(xlMeta.Erasure.Distribution, xl.storageDisks) // Initialize xl meta.
for index := range partsMetadata {
partsMetadata[index] = xlMeta
}
// Order disks according to erasure distribution
onlineDisks := getOrderedDisks(partsMetadata[0].Erasure.Distribution, xl.storageDisks)
// Delete temporary object in the event of failure. // Delete temporary object in the event of failure.
// If PutObject succeeded there would be no temporary // If PutObject succeeded there would be no temporary
// object to delete. // object to delete.
defer xl.deleteObject(minioMetaTmpBucket, tempObj) defer xl.deleteObject(minioMetaTmpBucket, tempObj)
if size > 0 { // Total size of the written object
for _, disk := range onlineDisks { sizeWritten := int64(0)
if disk != nil {
actualSize := xl.sizeOnDisk(size, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks) // Read data and split into parts - similar to multipart mechanism
disk.PrepareFile(minioMetaTmpBucket, tempErasureObj, actualSize) for partIdx := 1; ; partIdx++ {
// Compute part name
partName := "part." + strconv.Itoa(partIdx)
// Compute the path of current part
tempErasureObj := path.Join(uniqueID, partName)
// Calculate the size of the current part, if size is unknown, curPartSize wil be unknown too.
// allowEmptyPart will always be true if this is the first part and false otherwise.
curPartSize := getPartSizeFromIdx(size, globalPutPartSize, partIdx)
// Prepare file for eventual optimization in the disk
if curPartSize > 0 {
// Calculate the real size of the part in the disk and prepare it for eventual optimization
actualSize := xl.sizeOnDisk(curPartSize, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks)
for _, disk := range onlineDisks {
if disk != nil {
disk.PrepareFile(minioMetaTmpBucket, tempErasureObj, actualSize)
}
} }
} }
}
// Erasure code data and write across all disks. // partReader streams at most maximum part size
sizeWritten, checkSums, err := erasureCreateFile(onlineDisks, minioMetaTmpBucket, tempErasureObj, teeReader, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks, xlMeta.Erasure.ParityBlocks, bitRotAlgo, xl.writeQuorum) partReader := io.LimitReader(teeReader, globalPutPartSize)
if err != nil {
return ObjectInfo{}, toObjectErr(err, minioMetaTmpBucket, tempErasureObj) // Allow creating empty erasure file only when this is the first part. This flag is useful
} // when size == -1 because in this case, we are not able to predict how many parts we will have.
// Should return IncompleteBody{} error when reader has fewer bytes allowEmptyPart := partIdx == 1
// than specified in request header.
if sizeWritten < size { // Erasure code data and write across all disks.
return ObjectInfo{}, traceError(IncompleteBody{}) partSizeWritten, checkSums, erasureErr := erasureCreateFile(onlineDisks, minioMetaTmpBucket, tempErasureObj, partReader, allowEmptyPart, partsMetadata[0].Erasure.BlockSize, partsMetadata[0].Erasure.DataBlocks, partsMetadata[0].Erasure.ParityBlocks, bitRotAlgo, xl.writeQuorum)
if erasureErr != nil {
return ObjectInfo{}, toObjectErr(erasureErr, minioMetaTmpBucket, tempErasureObj)
}
// Should return IncompleteBody{} error when reader has fewer bytes
// than specified in request header.
if partSizeWritten < int64(curPartSize) {
return ObjectInfo{}, traceError(IncompleteBody{})
}
// Update the total written size
sizeWritten += partSizeWritten
// If erasure stored some data in the loop or created an empty file
if partSizeWritten > 0 || allowEmptyPart {
for index := range partsMetadata {
// Add the part to xl.json.
partsMetadata[index].AddObjectPart(partIdx, partName, "", partSizeWritten)
// Add part checksum info to xl.json.
partsMetadata[index].Erasure.AddCheckSumInfo(checkSumInfo{
Name: partName,
Hash: checkSums[index],
Algorithm: bitRotAlgo,
})
}
}
// If we didn't write anything or we know that the next part doesn't have any
// data to write, we should quit this loop immediately
if partSizeWritten == 0 || getPartSizeFromIdx(size, globalPutPartSize, partIdx+1) == 0 {
break
}
} }
// For size == -1, perhaps client is sending in chunked encoding // For size == -1, perhaps client is sending in chunked encoding
// set the size as size that was actually written. // set the size as size that was actually written.
if size == -1 { if size == -1 {
size = sizeWritten size = sizeWritten
} else {
// Check if stored data satisfies what is asked
if sizeWritten < size {
return ObjectInfo{}, traceError(IncompleteBody{})
}
} }
// Save additional erasureMetadata. // Save additional erasureMetadata.
@ -604,22 +665,11 @@ func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.
} }
// Fill all the necessary metadata. // Fill all the necessary metadata.
xlMeta.Meta = metadata
xlMeta.Stat.Size = size
xlMeta.Stat.ModTime = modTime
// Add the final part.
xlMeta.AddObjectPart(1, "part.1", newMD5Hex, xlMeta.Stat.Size)
partsMetadata := make([]xlMetaV1, len(xl.storageDisks))
// Update `xl.json` content on each disks. // Update `xl.json` content on each disks.
for index := range partsMetadata { for index := range partsMetadata {
partsMetadata[index] = xlMeta partsMetadata[index].Meta = metadata
partsMetadata[index].Erasure.AddCheckSumInfo(checkSumInfo{ partsMetadata[index].Stat.Size = size
Name: "part.1", partsMetadata[index].Stat.ModTime = modTime
Hash: checkSums[index],
Algorithm: bitRotAlgo,
})
} }
// Write unique `xl.json` for each disk. // Write unique `xl.json` for each disk.
@ -639,6 +689,10 @@ func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.
newBuffer.Close() newBuffer.Close()
} }
// Object info is the same in all disks, so we can pick the first meta
// of the first disk
xlMeta = partsMetadata[0]
objInfo = ObjectInfo{ objInfo = ObjectInfo{
IsDir: false, IsDir: false,
Bucket: bucket, Bucket: bucket,

View File

@ -343,3 +343,30 @@ func getOrderedDisks(distribution []int, disks []StorageAPI) (orderedDisks []Sto
} }
return orderedDisks return orderedDisks
} }
// getPartSizeFromIdx predicts the size of the part with the given 1-based
// index when an object of totalSize bytes is split into chunks of partSize.
// It returns totalSize unchanged when totalSize is -1 (unknown length) or 0
// (empty object), partSize for every full part, the remainder for the final
// part (which is 0 when totalSize is an exact multiple of partSize), and 0
// for any index past the end of the object.
func getPartSizeFromIdx(totalSize int64, partSize int64, partIndex int) int64 {
	// Guard against non-positive part sizes; a zero or negative chunk size
	// would make the parts-count arithmetic below meaningless.
	if partSize <= 0 {
		panic("Part size must be positive.")
	}
	// Part indexes are 1-based.
	if partIndex < 1 {
		panic("Part index cannot be smaller than 1.")
	}
	switch totalSize {
	case -1, 0:
		// Unknown (-1) or empty (0) object: propagate as-is.
		return totalSize
	}
	// Total number of part slots, including a possibly zero-sized trailing
	// slot when totalSize divides evenly by partSize.
	partsCount := totalSize/partSize + 1
	// Return the part's size
	switch {
	case int64(partIndex) < partsCount:
		// A full-sized part before the last slot.
		return partSize
	case int64(partIndex) == partsCount:
		// Size of last part: whatever remains after the full parts.
		return totalSize % partSize
	default:
		// Index is beyond the end of the object.
		return 0
	}
}

View File

@ -22,6 +22,8 @@ import (
"strconv" "strconv"
"testing" "testing"
"time" "time"
humanize "github.com/dustin/go-humanize"
) )
// Tests calculating disk count. // Tests calculating disk count.
@ -325,3 +327,35 @@ func TestGetXLMetaV1GJson10(t *testing.T) {
} }
compareXLMetaV1(t, unMarshalXLMeta, gjsonXLMeta) compareXLMetaV1(t, unMarshalXLMeta, gjsonXLMeta)
} }
// TestGetPartSizeFromIdx verifies that getPartSizeFromIdx predicts the
// correct chunk size for every part index, including the unknown-size (-1),
// empty-object, exact-multiple and remainder cases.
func TestGetPartSizeFromIdx(t *testing.T) {
	testCases := []struct {
		totalSize    int64
		partSize     int64
		partIndex    int
		expectedSize int64
	}{
		// Unknown total size is propagated unchanged.
		{-1, 10, 1, -1},
		// Empty object yields a zero-sized first part.
		{0, 10, 1, 0},
		// 4 MiB object cut into 2 MiB parts: two full parts, then nothing.
		{4 * humanize.MiByte, 2 * humanize.MiByte, 1, 2 * humanize.MiByte},
		{4 * humanize.MiByte, 2 * humanize.MiByte, 2, 2 * humanize.MiByte},
		{4 * humanize.MiByte, 2 * humanize.MiByte, 3, 0},
		// 5 MiB object cut into 2 MiB parts: two full parts plus a 1 MiB tail.
		{5 * humanize.MiByte, 2 * humanize.MiByte, 1, 2 * humanize.MiByte},
		{5 * humanize.MiByte, 2 * humanize.MiByte, 2, 2 * humanize.MiByte},
		{5 * humanize.MiByte, 2 * humanize.MiByte, 3, 1 * humanize.MiByte},
		{5 * humanize.MiByte, 2 * humanize.MiByte, 4, 0},
	}

	for i, tc := range testCases {
		got := getPartSizeFromIdx(tc.totalSize, tc.partSize, tc.partIndex)
		if got != tc.expectedSize {
			t.Errorf("Test %d: The calculated part size is incorrect: expected = %d, found = %d\n", i+1, tc.expectedSize, got)
		}
	}
}