From 293d246f95e9f3984628efbcd1ccf1476475c3ad Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Fri, 20 May 2016 20:48:47 -0700 Subject: [PATCH] XL/FS: Rewrite in new format. --- docs/backend/json-files/fs/format.json | 4 + docs/backend/json-files/fs/fs.json | 14 + docs/backend/json-files/fs/uploads.json | 10 + docs/backend/json-files/xl/format.json | 20 + docs/backend/json-files/xl/uploads.json | 10 + docs/backend/json-files/xl/xl.json | 44 ++ erasure-createfile.go | 172 +++++ ...sure-v1-readfile.go => erasure-readfile.go | 122 ++- xl-erasure-v1-utils.go => erasure-utils.go | 13 +- ...-v1-waitcloser.go => erasure-waitcloser.go | 0 erasure.go | 60 ++ fs-objects-multipart.go | 150 ---- fs-v1-metadata.go | 106 +++ ...-common-multipart.go => fs-v1-multipart.go | 719 +++++++++--------- fs-objects.go => fs-v1.go | 175 ++++- object-common.go | 194 +---- object-utils.go | 15 + object_api_suite_test.go | 2 - posix.go | 3 +- test-utils_test.go | 9 +- tree-walk.go => tree-walk-fs.go | 131 +--- tree-walk-xl.go | 265 +++++++ xl-erasure-v1-common.go | 204 ----- xl-erasure-v1-createfile.go | 287 ------- xl-erasure-v1-healfile.go | 185 ----- xl-erasure-v1-metadata.go | 61 -- xl-erasure-v1.go | 546 ------------- xl-objects-multipart.go | 336 -------- xl-objects.go | 581 -------------- xl-v1-bucket.go | 355 +++++++++ xl-v1-list-objects.go | 116 +++ xl-v1-metadata.go | 287 +++++++ xl-v1-multipart-common.go | 474 ++++++++++++ xl-v1-multipart.go | 432 +++++++++++ xl-v1-object.go | 357 +++++++++ xl-v1.go | 177 +++++ 36 files changed, 3560 insertions(+), 3076 deletions(-) create mode 100644 docs/backend/json-files/fs/format.json create mode 100644 docs/backend/json-files/fs/fs.json create mode 100644 docs/backend/json-files/fs/uploads.json create mode 100644 docs/backend/json-files/xl/format.json create mode 100644 docs/backend/json-files/xl/uploads.json create mode 100644 docs/backend/json-files/xl/xl.json create mode 100644 erasure-createfile.go rename xl-erasure-v1-readfile.go => 
erasure-readfile.go (54%) rename xl-erasure-v1-utils.go => erasure-utils.go (82%) rename xl-erasure-v1-waitcloser.go => erasure-waitcloser.go (100%) create mode 100644 erasure.go delete mode 100644 fs-objects-multipart.go create mode 100644 fs-v1-metadata.go rename object-common-multipart.go => fs-v1-multipart.go (50%) rename fs-objects.go => fs-v1.go (56%) rename tree-walk.go => tree-walk-fs.go (59%) create mode 100644 tree-walk-xl.go delete mode 100644 xl-erasure-v1-common.go delete mode 100644 xl-erasure-v1-createfile.go delete mode 100644 xl-erasure-v1-healfile.go delete mode 100644 xl-erasure-v1-metadata.go delete mode 100644 xl-erasure-v1.go delete mode 100644 xl-objects-multipart.go delete mode 100644 xl-objects.go create mode 100644 xl-v1-bucket.go create mode 100644 xl-v1-list-objects.go create mode 100644 xl-v1-metadata.go create mode 100644 xl-v1-multipart-common.go create mode 100644 xl-v1-multipart.go create mode 100644 xl-v1-object.go create mode 100644 xl-v1.go diff --git a/docs/backend/json-files/fs/format.json b/docs/backend/json-files/fs/format.json new file mode 100644 index 000000000..244e25856 --- /dev/null +++ b/docs/backend/json-files/fs/format.json @@ -0,0 +1,4 @@ +{ + "format": "fs", + "version": "1" +} diff --git a/docs/backend/json-files/fs/fs.json b/docs/backend/json-files/fs/fs.json new file mode 100644 index 000000000..5d5594828 --- /dev/null +++ b/docs/backend/json-files/fs/fs.json @@ -0,0 +1,14 @@ +{ + "version": "1", + "format": "fs", + "minio": { + "release": "DEVELOPMENT.GOGET" + }, + "parts": [ + { + "name": "object1", + "size": 29, + "eTag": "", + }, + ] +} diff --git a/docs/backend/json-files/fs/uploads.json b/docs/backend/json-files/fs/uploads.json new file mode 100644 index 000000000..339d5ecff --- /dev/null +++ b/docs/backend/json-files/fs/uploads.json @@ -0,0 +1,10 @@ +{ + "version": "1", + "format": "fs", + "uploadIds": [ + { + "uploadID": "id", + "startTime": "time", + } + ] +} diff --git 
a/docs/backend/json-files/xl/format.json b/docs/backend/json-files/xl/format.json new file mode 100644 index 000000000..c3acdd6cf --- /dev/null +++ b/docs/backend/json-files/xl/format.json @@ -0,0 +1,20 @@ +{ + "xl": { + "jbod": [ + "8aa2b1bc-0e5a-49e0-8221-05228336b040", + "3467a69b-0266-478a-9e10-e819447e4545", + "d4a4505b-4e4f-4864-befd-4f36adb0bc66", + "592b6583-ca26-47af-b991-ba6d097e34e8", + "c7ef69f0-dbf5-4c0e-b167-d30a441bad7e", + "f0b36ea3-fe96-4f2b-bced-22c7f33e0e0c", + "b83abf39-e39d-4e7b-8e16-6f9953455a48", + "7d63dfc9-5441-4243-bd36-de8db0691982", + "c1bbffc5-81f9-4251-9398-33a959b3ce37", + "64408f94-26e0-4277-9593-2d703f4d5a91" + ], + "disk": "8aa2b1bc-0e5a-49e0-8221-05228336b040", + "version": "1" + }, + "format": "xl", + "version": "1" +} diff --git a/docs/backend/json-files/xl/uploads.json b/docs/backend/json-files/xl/uploads.json new file mode 100644 index 000000000..301f731ec --- /dev/null +++ b/docs/backend/json-files/xl/uploads.json @@ -0,0 +1,10 @@ +{ + "version": "1", + "format": "xl", + "uploadIds": [ + { + "uploadID": "id", + "startTime": "time", + } + ] +} diff --git a/docs/backend/json-files/xl/xl.json b/docs/backend/json-files/xl/xl.json new file mode 100644 index 000000000..ebd73fa86 --- /dev/null +++ b/docs/backend/json-files/xl/xl.json @@ -0,0 +1,44 @@ +{ + "parts": [ + { + "size": 5242880, + "etag": "3565c6e741e69a007a5ac7db893a62b5", + "name": "object1" + }, + { + "size": 5242880, + "etag": "d416712335c280ab1e39498552937764", + "name": "object2" + }, + { + "size": 4338324, + "etag": "8a98c5c54d81c6c95ed9bdcaeb941aaf", + "name": "object3" + } + ], + "meta": { + "md5Sum": "97586a5290d4f5a41328062d6a7da593-3", + "content-type": "application\/octet-stream", + "content-encoding": "" + }, + "minio": { + "release": "DEVELOPMENT.GOGET" + }, + "erasure": { + "index": 2, + "distribution": [ 1, 3, 4, 2, 5, 8, 7, 6, 9 ], + "blockSize": 4194304, + "parity": 5, + "data": 5 + }, + "checksum": { + "enable": false, + }, + "stat": { + "version": 0, + 
"modTime": "2016-05-24T00:09:40.122390255Z", + "size": 14824084 + }, + "format": "xl", + "version": "1" +} diff --git a/erasure-createfile.go b/erasure-createfile.go new file mode 100644 index 000000000..e5f049f48 --- /dev/null +++ b/erasure-createfile.go @@ -0,0 +1,172 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "io" + "sync" +) + +// cleanupCreateFileOps - cleans up all the temporary files and other +// temporary data upon any failure. +func (e erasure) cleanupCreateFileOps(volume, path string, writers []io.WriteCloser) { + // Close and remove temporary writers. + for _, writer := range writers { + if err := safeCloseAndRemove(writer); err != nil { + errorIf(err, "Failed to close writer.") + } + } + // Remove any temporary written data. + for _, disk := range e.storageDisks { + if err := disk.DeleteFile(volume, path); err != nil { + errorIf(err, "Unable to delete file.") + } + } +} + +// WriteErasure reads predefined blocks, encodes them and writes to +// configured storage disks. +func (e erasure) writeErasure(volume, path string, reader *io.PipeReader, wcloser *waitCloser) { + // Release the block writer upon function return. + defer wcloser.release() + + writers := make([]io.WriteCloser, len(e.storageDisks)) + + // Initialize all writers. 
+ for index, disk := range e.storageDisks { + writer, err := disk.CreateFile(volume, path) + if err != nil { + e.cleanupCreateFileOps(volume, path, writers) + reader.CloseWithError(err) + return + } + writers[index] = writer + } + + // Allocate 4MiB block size buffer for reading. + dataBuffer := make([]byte, erasureBlockSize) + for { + // Read up to allocated block size. + n, err := io.ReadFull(reader, dataBuffer) + if err != nil { + // Any unexpected errors, close the pipe reader with error. + if err != io.ErrUnexpectedEOF && err != io.EOF { + // Remove all temp writers. + e.cleanupCreateFileOps(volume, path, writers) + reader.CloseWithError(err) + return + } + } + // At EOF break out. + if err == io.EOF { + break + } + if n > 0 { + // Split the input buffer into data and parity blocks. + var dataBlocks [][]byte + dataBlocks, err = e.ReedSolomon.Split(dataBuffer[0:n]) + if err != nil { + // Remove all temp writers. + e.cleanupCreateFileOps(volume, path, writers) + reader.CloseWithError(err) + return + } + + // Encode parity blocks using data blocks. + err = e.ReedSolomon.Encode(dataBlocks) + if err != nil { + // Remove all temp writers upon error. + e.cleanupCreateFileOps(volume, path, writers) + reader.CloseWithError(err) + return + } + + var wg = &sync.WaitGroup{} + var wErrs = make([]error, len(writers)) + // Write encoded data to quorum disks in parallel. + for index, writer := range writers { + if writer == nil { + continue + } + wg.Add(1) + // Write encoded data in routine. + go func(index int, writer io.Writer) { + defer wg.Done() + encodedData := dataBlocks[index] + _, wErr := writers[index].Write(encodedData) + if wErr != nil { + wErrs[index] = wErr + return + } + wErrs[index] = nil + }(index, writer) + } + wg.Wait() + + // Cleanup and return on first non-nil error. + for _, wErr := range wErrs { + if wErr == nil { + continue + } + // Remove all temp writers upon error. 
+ e.cleanupCreateFileOps(volume, path, writers) + reader.CloseWithError(wErr) + return + } + } + } + + // Close all writers and metadata writers in routines. + for _, writer := range writers { + if writer == nil { + continue + } + // Safely wrote, now rename to its actual location. + if err := writer.Close(); err != nil { + // Remove all temp writers upon error. + e.cleanupCreateFileOps(volume, path, writers) + reader.CloseWithError(err) + return + } + } + + // Close the pipe reader and return. + reader.Close() + return +} + +// CreateFile - create a file. +func (e erasure) CreateFile(volume, path string) (writeCloser io.WriteCloser, err error) { + if !isValidVolname(volume) { + return nil, errInvalidArgument + } + if !isValidPath(path) { + return nil, errInvalidArgument + } + + // Initialize pipe for data pipe line. + pipeReader, pipeWriter := io.Pipe() + + // Initialize a new wait closer, implements both Write and Close. + wcloser := newWaitCloser(pipeWriter) + + // Start erasure encoding in routine, reading data block by block from pipeReader. + go e.writeErasure(volume, path, pipeReader, wcloser) + + // Return the writer, caller should start writing to this. + return wcloser, nil +} diff --git a/xl-erasure-v1-readfile.go b/erasure-readfile.go similarity index 54% rename from xl-erasure-v1-readfile.go rename to erasure-readfile.go index 987cd6143..9c35058a7 100644 --- a/xl-erasure-v1-readfile.go +++ b/erasure-readfile.go @@ -18,14 +18,12 @@ package main import ( "errors" - "fmt" "io" - slashpath "path" "sync" ) -// ReadFile - read file -func (xl XL) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, error) { +// ReadFile - decoded erasure coded file. +func (e erasure) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, error) { // Input validation. 
if !isValidVolname(volume) { return nil, errInvalidArgument @@ -34,52 +32,34 @@ func (xl XL) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, er return nil, errInvalidArgument } - onlineDisks, metadata, heal, err := xl.listOnlineDisks(volume, path) - if err != nil { - return nil, err + var wg = &sync.WaitGroup{} + + readers := make([]io.ReadCloser, len(e.storageDisks)) + for index, disk := range e.storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + // If disk.ReadFile returns error and we don't have read + // quorum it will be taken care as ReedSolomon.Reconstruct() + // will fail later. + offset := int64(0) + if reader, err := disk.ReadFile(volume, path, offset); err == nil { + readers[index] = reader + } + }(index, disk) } - if heal { - // Heal in background safely, since we already have read - // quorum disks. Let the reads continue. - go func() { - hErr := xl.healFile(volume, path) - errorIf(hErr, "Unable to heal file "+volume+"/"+path+".") - }() - } - - readers := make([]io.ReadCloser, len(xl.storageDisks)) - for index, disk := range onlineDisks { - if disk == nil { - continue - } - erasurePart := slashpath.Join(path, fmt.Sprintf("file.%d", index)) - // If disk.ReadFile returns error and we don't have read quorum it will be taken care as - // ReedSolomon.Reconstruct() will fail later. - var reader io.ReadCloser - offset := int64(0) - if reader, err = disk.ReadFile(volume, erasurePart, offset); err == nil { - readers[index] = reader - } - } + wg.Wait() // Initialize pipe. pipeReader, pipeWriter := io.Pipe() + go func() { - var totalLeft = metadata.Stat.Size - // Read until the totalLeft. - for totalLeft > 0 { - // Figure out the right blockSize as it was encoded before. - var curBlockSize int64 - if metadata.Erasure.BlockSize < totalLeft { - curBlockSize = metadata.Erasure.BlockSize - } else { - curBlockSize = totalLeft - } + // Read until EOF. + for { // Calculate the current encoded block size. 
- curEncBlockSize := getEncodedBlockLen(curBlockSize, metadata.Erasure.DataBlocks) - enBlocks := make([][]byte, len(xl.storageDisks)) - var wg = &sync.WaitGroup{} + curEncBlockSize := getEncodedBlockLen(erasureBlockSize, e.DataBlocks) + enBlocks := make([][]byte, len(e.storageDisks)) // Loop through all readers and read. for index, reader := range readers { // Initialize shard slice and fill the data from each parts. @@ -87,19 +67,28 @@ func (xl XL) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, er if reader == nil { continue } - // Parallelize reading. - wg.Add(1) - go func(index int, reader io.Reader) { - defer wg.Done() - // Read the necessary blocks. - _, rErr := io.ReadFull(reader, enBlocks[index]) - if rErr != nil && rErr != io.ErrUnexpectedEOF { - readers[index] = nil + // Read the necessary blocks. + n, rErr := io.ReadFull(reader, enBlocks[index]) + if rErr == io.EOF { + // Close the pipe. + pipeWriter.Close() + + // Cleanly close all the underlying data readers. + for _, reader := range readers { + if reader == nil { + continue + } + reader.Close() } - }(index, reader) + return + } + if rErr != nil && rErr != io.ErrUnexpectedEOF { + readers[index].Close() + readers[index] = nil + continue + } + enBlocks[index] = enBlocks[index][:n] } - // Wait for the read routines to finish. - wg.Wait() // Check blocks if they are all zero in length. if checkBlockSize(enBlocks) == 0 { @@ -108,8 +97,7 @@ func (xl XL) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, er } // Verify the blocks. 
- var ok bool - ok, err = xl.ReedSolomon.Verify(enBlocks) + ok, err := e.ReedSolomon.Verify(enBlocks) if err != nil { pipeWriter.CloseWithError(err) return @@ -123,13 +111,13 @@ func (xl XL) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, er enBlocks[index] = nil } } - err = xl.ReedSolomon.Reconstruct(enBlocks) + err = e.ReedSolomon.Reconstruct(enBlocks) if err != nil { pipeWriter.CloseWithError(err) return } // Verify reconstructed blocks again. - ok, err = xl.ReedSolomon.Verify(enBlocks) + ok, err = e.ReedSolomon.Verify(enBlocks) if err != nil { pipeWriter.CloseWithError(err) return @@ -143,16 +131,14 @@ func (xl XL) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, er } // Get all the data blocks. - dataBlocks := getDataBlocks(enBlocks, metadata.Erasure.DataBlocks, int(curBlockSize)) + dataBlocks := getDataBlocks(enBlocks, e.DataBlocks) // Verify if the offset is right for the block, if not move to // the next block. - if startOffset > 0 { startOffset = startOffset - int64(len(dataBlocks)) // Start offset is greater than or equal to zero, skip the dataBlocks. if startOffset >= 0 { - totalLeft = totalLeft - metadata.Erasure.BlockSize continue } // Now get back the remaining offset if startOffset is negative. @@ -168,20 +154,6 @@ func (xl XL) ReadFile(volume, path string, startOffset int64) (io.ReadCloser, er // Reset offset to '0' to read rest of the blocks. startOffset = int64(0) - - // Save what's left after reading erasureBlockSize. - totalLeft = totalLeft - metadata.Erasure.BlockSize - } - - // Cleanly end the pipe after a successful decoding. - pipeWriter.Close() - - // Cleanly close all the underlying data readers. 
- for _, reader := range readers { - if reader == nil { - continue - } - reader.Close() } }() diff --git a/xl-erasure-v1-utils.go b/erasure-utils.go similarity index 82% rename from xl-erasure-v1-utils.go rename to erasure-utils.go index ff505b143..c291dda4a 100644 --- a/xl-erasure-v1-utils.go +++ b/erasure-utils.go @@ -17,12 +17,19 @@ package main // getDataBlocks - fetches the data block only part of the input encoded blocks. -func getDataBlocks(enBlocks [][]byte, dataBlocks int, curBlockSize int) []byte { +func getDataBlocks(enBlocks [][]byte, dataBlocks int) []byte { var data []byte for _, block := range enBlocks[:dataBlocks] { - data = append(data, block...) + var newBlock []byte + // FIXME: Find a better way to skip the padding zeros. + for _, b := range block { + if b == 0 { + continue + } + newBlock = append(newBlock, b) + } + data = append(data, newBlock...) } - data = data[:curBlockSize] return data } diff --git a/xl-erasure-v1-waitcloser.go b/erasure-waitcloser.go similarity index 100% rename from xl-erasure-v1-waitcloser.go rename to erasure-waitcloser.go diff --git a/erasure.go b/erasure.go new file mode 100644 index 000000000..45d121d2f --- /dev/null +++ b/erasure.go @@ -0,0 +1,60 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "errors" + + "github.com/klauspost/reedsolomon" +) + +// erasure storage layer. 
+type erasure struct { + ReedSolomon reedsolomon.Encoder // Erasure encoder/decoder. + DataBlocks int + ParityBlocks int + storageDisks []StorageAPI +} + +// errUnexpected - returned for any unexpected error. +var errUnexpected = errors.New("Unexpected error - please report at https://github.com/minio/minio/issues") + +// newErasure instantiate a new erasure. +func newErasure(disks []StorageAPI) (*erasure, error) { + // Initialize E. + e := &erasure{} + + // Calculate data and parity blocks. + dataBlocks, parityBlocks := len(disks)/2, len(disks)/2 + + // Initialize reed solomon encoding. + rs, err := reedsolomon.New(dataBlocks, parityBlocks) + if err != nil { + return nil, err + } + + // Save the reedsolomon. + e.DataBlocks = dataBlocks + e.ParityBlocks = parityBlocks + e.ReedSolomon = rs + + // Save all the initialized storage disks. + e.storageDisks = disks + + // Return successfully initialized. + return e, nil +} diff --git a/fs-objects-multipart.go b/fs-objects-multipart.go deleted file mode 100644 index 99cadfef8..000000000 --- a/fs-objects-multipart.go +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "fmt" - "io" - "path" -) - -// ListMultipartUploads - list multipart uploads. 
-func (fs fsObjects) ListMultipartUploads(bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { - return listMultipartUploadsCommon(fs, bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads) -} - -// NewMultipartUpload - initialize a new multipart upload, returns a unique id. -func (fs fsObjects) NewMultipartUpload(bucket, object string, meta map[string]string) (string, error) { - meta = make(map[string]string) // Reset the meta value, we are not going to save headers for fs. - return newMultipartUploadCommon(fs.storage, bucket, object, meta) -} - -// PutObjectPart - writes the multipart upload chunks. -func (fs fsObjects) PutObjectPart(bucket, object, uploadID string, partID int, size int64, data io.Reader, md5Hex string) (string, error) { - return putObjectPartCommon(fs.storage, bucket, object, uploadID, partID, size, data, md5Hex) -} - -func (fs fsObjects) ListObjectParts(bucket, object, uploadID string, partNumberMarker, maxParts int) (ListPartsInfo, error) { - return listObjectPartsCommon(fs.storage, bucket, object, uploadID, partNumberMarker, maxParts) -} - -func (fs fsObjects) CompleteMultipartUpload(bucket string, object string, uploadID string, parts []completePart) (string, error) { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return "", BucketNameInvalid{Bucket: bucket} - } - // Verify whether the bucket exists. - if !isBucketExist(fs.storage, bucket) { - return "", BucketNotFound{Bucket: bucket} - } - if !IsValidObjectName(object) { - return "", ObjectNameInvalid{ - Bucket: bucket, - Object: object, - } - } - if !isUploadIDExists(fs.storage, bucket, object, uploadID) { - return "", InvalidUploadID{UploadID: uploadID} - } - - // Calculate s3 compatible md5sum for complete multipart. - s3MD5, err := completeMultipartMD5(parts...) 
- if err != nil { - return "", err - } - - tempObj := path.Join(tmpMetaPrefix, bucket, object, uploadID, incompleteFile) - fileWriter, err := fs.storage.CreateFile(minioMetaBucket, tempObj) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - - // Loop through all parts, validate them and then commit to disk. - for i, part := range parts { - // Construct part suffix. - partSuffix := fmt.Sprintf("%.5d.%s", part.PartNumber, part.ETag) - multipartPartFile := path.Join(mpartMetaPrefix, bucket, object, uploadID, partSuffix) - var fi FileInfo - fi, err = fs.storage.StatFile(minioMetaBucket, multipartPartFile) - if err != nil { - if err == errFileNotFound { - return "", InvalidPart{} - } - return "", err - } - // All parts except the last part has to be atleast 5MB. - if (i < len(parts)-1) && !isMinAllowedPartSize(fi.Size) { - return "", PartTooSmall{} - } - var fileReader io.ReadCloser - fileReader, err = fs.storage.ReadFile(minioMetaBucket, multipartPartFile, 0) - if err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", clErr - } - if err == errFileNotFound { - return "", InvalidPart{} - } - return "", err - } - _, err = io.Copy(fileWriter, fileReader) - if err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", clErr - } - return "", err - } - err = fileReader.Close() - if err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", clErr - } - return "", err - } - } - - err = fileWriter.Close() - if err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", clErr - } - return "", err - } - - // Rename the file back to original location, if not delete the - // temporary object. 
- err = fs.storage.RenameFile(minioMetaBucket, tempObj, bucket, object) - if err != nil { - if derr := fs.storage.DeleteFile(minioMetaBucket, tempObj); derr != nil { - return "", toObjectErr(derr, minioMetaBucket, tempObj) - } - return "", toObjectErr(err, bucket, object) - } - - // Cleanup all the parts if everything else has been safely committed. - if err = cleanupUploadedParts(fs.storage, bucket, object, uploadID); err != nil { - return "", err - } - - // Return md5sum. - return s3MD5, nil -} - -// AbortMultipartUpload - aborts a multipart upload. -func (fs fsObjects) AbortMultipartUpload(bucket, object, uploadID string) error { - return abortMultipartUploadCommon(fs.storage, bucket, object, uploadID) -} diff --git a/fs-v1-metadata.go b/fs-v1-metadata.go new file mode 100644 index 000000000..b045a52df --- /dev/null +++ b/fs-v1-metadata.go @@ -0,0 +1,106 @@ +package main + +import ( + "bytes" + "encoding/json" + "io" + "path" + "sort" +) + +// A fsMetaV1 represents a metadata header mapping keys to sets of values. +type fsMetaV1 struct { + Version string `json:"version"` + Format string `json:"format"` + Minio struct { + Release string `json:"release"` + } `json:"minio"` + Checksum struct { + Enable bool `json:"enable"` + } `json:"checksum"` + Parts []objectPartInfo `json:"parts,omitempty"` +} + +// ReadFrom - read from implements io.ReaderFrom interface for +// unmarshalling fsMetaV1. +func (m *fsMetaV1) ReadFrom(reader io.Reader) (n int64, err error) { + var buffer bytes.Buffer + n, err = buffer.ReadFrom(reader) + if err != nil { + return 0, err + } + err = json.Unmarshal(buffer.Bytes(), m) + return n, err +} + +// WriteTo - write to implements io.WriterTo interface for marshalling fsMetaV1. +func (m fsMetaV1) WriteTo(writer io.Writer) (n int64, err error) { + metadataBytes, err := json.Marshal(m) + if err != nil { + return 0, err + } + p, err := writer.Write(metadataBytes) + return int64(p), err +} + +// SearchObjectPart - search object part name and etag. 
+func (m fsMetaV1) SearchObjectPart(name string, etag string) int { + for i, part := range m.Parts { + if name == part.Name && etag == part.ETag { + return i + } + } + return -1 +} + +// AddObjectPart - add a new object part in order. +func (m *fsMetaV1) AddObjectPart(name string, etag string, size int64) { + m.Parts = append(m.Parts, objectPartInfo{ + Name: name, + ETag: etag, + Size: size, + }) + sort.Sort(byPartName(m.Parts)) +} + +const ( + fsMetaJSONFile = "fs.json" +) + +// readFSMetadata - read `fs.json`. +func (fs fsObjects) readFSMetadata(bucket, object string) (fsMeta fsMetaV1, err error) { + r, err := fs.storage.ReadFile(bucket, path.Join(object, fsMetaJSONFile), int64(0)) + if err != nil { + return fsMetaV1{}, err + } + defer r.Close() + _, err = fsMeta.ReadFrom(r) + if err != nil { + return fsMetaV1{}, err + } + return fsMeta, nil +} + +// writeFSMetadata - write `fs.json`. +func (fs fsObjects) writeFSMetadata(bucket, prefix string, fsMeta fsMetaV1) error { + // Initialize metadata map, save all erasure related metadata. 
+ fsMeta.Minio.Release = minioReleaseTag + w, err := fs.storage.CreateFile(bucket, path.Join(prefix, fsMetaJSONFile)) + if err != nil { + return err + } + _, err = fsMeta.WriteTo(w) + if err != nil { + if mErr := safeCloseAndRemove(w); mErr != nil { + return mErr + } + return err + } + if err = w.Close(); err != nil { + if mErr := safeCloseAndRemove(w); mErr != nil { + return mErr + } + return err + } + return nil +} diff --git a/object-common-multipart.go b/fs-v1-multipart.go similarity index 50% rename from object-common-multipart.go rename to fs-v1-multipart.go index 583cd2f25..3530a1c78 100644 --- a/object-common-multipart.go +++ b/fs-v1-multipart.go @@ -19,66 +19,39 @@ package main import ( "crypto/md5" "encoding/hex" - "encoding/json" "fmt" "io" "io/ioutil" "path" - "sort" "strconv" "strings" + "time" "github.com/skyrings/skyring-common/tools/uuid" ) -const ( - incompleteFile = "00000.incomplete" - uploadsJSONFile = "uploads.json" -) - -// createUploadsJSON - create uploads.json placeholder file. -func createUploadsJSON(storage StorageAPI, bucket, object, uploadID string) error { - // Place holder uploads.json - uploadsPath := path.Join(mpartMetaPrefix, bucket, object, uploadsJSONFile) - uploadsJSONSuffix := fmt.Sprintf("%s.%s", uploadID, uploadsJSONFile) - tmpUploadsPath := path.Join(tmpMetaPrefix, bucket, object, uploadsJSONSuffix) - w, err := storage.CreateFile(minioMetaBucket, uploadsPath) +// Checks whether bucket exists. +func (fs fsObjects) isBucketExist(bucket string) bool { + // Check whether bucket exists. 
+ _, err := fs.storage.StatVol(bucket) if err != nil { - return err - } - if err = w.Close(); err != nil { - if clErr := safeCloseAndRemove(w); clErr != nil { - return clErr + if err == errVolumeNotFound { + return false } - return err + errorIf(err, "Stat failed on bucket "+bucket+".") + return false } - _, err = storage.StatFile(minioMetaBucket, uploadsPath) - if err != nil { - if err == errFileNotFound { - err = storage.RenameFile(minioMetaBucket, tmpUploadsPath, minioMetaBucket, uploadsPath) - if err == nil { - return nil - } - } - if derr := storage.DeleteFile(minioMetaBucket, tmpUploadsPath); derr != nil { - return derr - } - return err - } - return nil + return true } -/// Common multipart object layer functions. - -// newMultipartUploadCommon - initialize a new multipart, is a common -// function for both object layers. -func newMultipartUploadCommon(storage StorageAPI, bucket string, object string, meta map[string]string) (uploadID string, err error) { +// newMultipartUploadCommon - initialize a new multipart, is a common function for both object layers. +func (fs fsObjects) newMultipartUploadCommon(bucket string, object string, meta map[string]string) (uploadID string, err error) { // Verify if bucket name is valid. if !IsValidBucketName(bucket) { return "", BucketNameInvalid{Bucket: bucket} } // Verify whether the bucket exists. - if !isBucketExist(storage, bucket) { + if !fs.isBucketExist(bucket) { return "", BucketNotFound{Bucket: bucket} } // Verify if object name is valid. 
@@ -89,266 +62,68 @@ func newMultipartUploadCommon(storage StorageAPI, bucket string, object string, if meta == nil { meta = make(map[string]string) } + + fsMeta := fsMetaV1{} + fsMeta.Format = "fs" + fsMeta.Version = "1" + // This lock needs to be held for any changes to the directory contents of ".minio/multipart/object/" nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) - // Loops through until successfully generates a new unique upload id. - for { - uuid, err := uuid.New() - if err != nil { - return "", err - } - uploadID := uuid.String() - // Create placeholder file 'uploads.json' - err = createUploadsJSON(storage, bucket, object, uploadID) - if err != nil { - return "", err - } - uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID, incompleteFile) - incompleteSuffix := fmt.Sprintf("%s.%s", uploadID, incompleteFile) - tempUploadIDPath := path.Join(tmpMetaPrefix, bucket, object, incompleteSuffix) - if _, err = storage.StatFile(minioMetaBucket, uploadIDPath); err != nil { - if err != errFileNotFound { - return "", toObjectErr(err, minioMetaBucket, uploadIDPath) - } - // uploadIDPath doesn't exist, so create empty file to reserve the name - var w io.WriteCloser - if w, err = storage.CreateFile(minioMetaBucket, tempUploadIDPath); err != nil { - return "", toObjectErr(err, minioMetaBucket, tempUploadIDPath) - } - // Encode the uploaded metadata into incomplete file. - encoder := json.NewEncoder(w) - err = encoder.Encode(&meta) - if err != nil { - if clErr := safeCloseAndRemove(w); clErr != nil { - return "", toObjectErr(clErr, minioMetaBucket, tempUploadIDPath) - } - return "", toObjectErr(err, minioMetaBucket, tempUploadIDPath) - } - - // Close the writer. 
- if err = w.Close(); err != nil { - if clErr := safeCloseAndRemove(w); clErr != nil { - return "", toObjectErr(clErr, minioMetaBucket, tempUploadIDPath) - } - return "", toObjectErr(err, minioMetaBucket, tempUploadIDPath) - } - - // Rename the file to the actual location from temporary path. - err = storage.RenameFile(minioMetaBucket, tempUploadIDPath, minioMetaBucket, uploadIDPath) - if err != nil { - if derr := storage.DeleteFile(minioMetaBucket, tempUploadIDPath); derr != nil { - return "", toObjectErr(derr, minioMetaBucket, tempUploadIDPath) - } - return "", toObjectErr(err, minioMetaBucket, uploadIDPath) - } - return uploadID, nil - } - // uploadIDPath already exists. - // loop again to try with different uuid generated. - } -} - -// putObjectPartCommon - put object part. -func putObjectPartCommon(storage StorageAPI, bucket string, object string, uploadID string, partID int, size int64, data io.Reader, md5Hex string) (string, error) { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return "", BucketNameInvalid{Bucket: bucket} - } - // Verify whether the bucket exists. - if !isBucketExist(storage, bucket) { - return "", BucketNotFound{Bucket: bucket} - } - if !IsValidObjectName(object) { - return "", ObjectNameInvalid{Bucket: bucket, Object: object} - } - if !isUploadIDExists(storage, bucket, object, uploadID) { - return "", InvalidUploadID{UploadID: uploadID} - } - // Hold read lock on the uploadID so that no one aborts it. - nsMutex.RLock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) - defer nsMutex.RUnlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) - - // Hold write lock on the part so that there is no parallel upload on the part. 
- nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID, strconv.Itoa(partID))) - defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID, strconv.Itoa(partID))) - - partSuffix := fmt.Sprintf("%s.%.5d", uploadID, partID) - partSuffixPath := path.Join(tmpMetaPrefix, bucket, object, partSuffix) - fileWriter, err := storage.CreateFile(minioMetaBucket, partSuffixPath) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - - // Initialize md5 writer. - md5Writer := md5.New() - - // Instantiate a new multi writer. - multiWriter := io.MultiWriter(md5Writer, fileWriter) - - // Instantiate checksum hashers and create a multiwriter. - if size > 0 { - if _, err = io.CopyN(multiWriter, data, size); err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - // Reader shouldn't have more data what mentioned in size argument. - // reading one more byte from the reader to validate it. - // expected to fail, success validates existence of more data in the reader. 
- if _, err = io.CopyN(ioutil.Discard, data, 1); err == nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", UnExpectedDataSize{Size: int(size)} - } - } else { - if _, err = io.Copy(multiWriter, data); err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - } - - newMD5Hex := hex.EncodeToString(md5Writer.Sum(nil)) - if md5Hex != "" { - if newMD5Hex != md5Hex { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", BadDigest{md5Hex, newMD5Hex} - } - } - err = fileWriter.Close() - if err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } + uploadID = getUUID() + initiated := time.Now().UTC() + // Create 'uploads.json' + if err = writeUploadJSON(bucket, object, uploadID, initiated, fs.storage); err != nil { return "", err } - - partSuffixMD5 := fmt.Sprintf("%.5d.%s", partID, newMD5Hex) - partSuffixMD5Path := path.Join(mpartMetaPrefix, bucket, object, uploadID, partSuffixMD5) - if _, err = storage.StatFile(minioMetaBucket, partSuffixMD5Path); err == nil { - // Part already uploaded as md5sum matches with the previous part. - // Just delete the temporary file. 
- if err = storage.DeleteFile(minioMetaBucket, partSuffixPath); err != nil { - return "", toObjectErr(err, minioMetaBucket, partSuffixPath) - } - return newMD5Hex, nil + uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID) + tempUploadIDPath := path.Join(tmpMetaPrefix, bucket, object, uploadID) + if err = fs.writeFSMetadata(minioMetaBucket, tempUploadIDPath, fsMeta); err != nil { + return "", toObjectErr(err, minioMetaBucket, tempUploadIDPath) } - err = storage.RenameFile(minioMetaBucket, partSuffixPath, minioMetaBucket, partSuffixMD5Path) + err = fs.storage.RenameFile(minioMetaBucket, path.Join(tempUploadIDPath, fsMetaJSONFile), minioMetaBucket, path.Join(uploadIDPath, fsMetaJSONFile)) if err != nil { - if derr := storage.DeleteFile(minioMetaBucket, partSuffixPath); derr != nil { - return "", toObjectErr(derr, minioMetaBucket, partSuffixPath) + if dErr := fs.storage.DeleteFile(minioMetaBucket, path.Join(tempUploadIDPath, fsMetaJSONFile)); dErr != nil { + return "", toObjectErr(dErr, minioMetaBucket, tempUploadIDPath) } - return "", toObjectErr(err, minioMetaBucket, partSuffixMD5Path) + return "", toObjectErr(err, minioMetaBucket, uploadIDPath) } - return newMD5Hex, nil + // Return success. + return uploadID, nil } -// Wrapper to which removes all the uploaded parts after a successful -// complete multipart upload. -func cleanupUploadedParts(storage StorageAPI, bucket, object, uploadID string) error { - return cleanupDir(storage, minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object, uploadID)) -} - -// abortMultipartUploadCommon - aborts a multipart upload, common -// function used by both object layers. -func abortMultipartUploadCommon(storage StorageAPI, bucket, object, uploadID string) error { - // Verify if bucket is valid. 
- if !IsValidBucketName(bucket) { - return BucketNameInvalid{Bucket: bucket} - } - if !isBucketExist(storage, bucket) { - return BucketNotFound{Bucket: bucket} - } - if !IsValidObjectName(object) { - return ObjectNameInvalid{Bucket: bucket, Object: object} - } - if !isUploadIDExists(storage, bucket, object, uploadID) { - return InvalidUploadID{UploadID: uploadID} - } - - // Hold lock so that there is no competing complete-multipart-upload or put-object-part. - nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) - defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) - - if err := cleanupUploadedParts(storage, bucket, object, uploadID); err != nil { - return err - } - - // Validate if there are other incomplete upload-id's present for - // the object, if yes do not attempt to delete 'uploads.json'. - if entries, err := storage.ListDir(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object)); err == nil { - if len(entries) > 1 { - return nil - } - } - - uploadsJSONPath := path.Join(mpartMetaPrefix, bucket, object, uploadsJSONFile) - if err := storage.DeleteFile(minioMetaBucket, uploadsJSONPath); err != nil { - return err - } - - return nil -} - -// isIncompleteMultipart - is object incomplete multipart. 
-func isIncompleteMultipart(storage StorageAPI, objectPath string) (bool, error) {
-	_, err := storage.StatFile(minioMetaBucket, path.Join(objectPath, uploadsJSONFile))
+// isMultipartObject - reports whether the fsMetaJSONFile entry exists under
+// the given prefix of the given bucket. A stat failure other than
+// errFileNotFound is logged through errorIf and is also reported as false.
+func isMultipartObject(storage StorageAPI, bucket, prefix string) bool {
+	_, err := storage.StatFile(bucket, path.Join(prefix, fsMetaJSONFile))
 	if err != nil {
 		if err == errFileNotFound {
-			return false, nil
+			return false
 		}
-		return false, err
+		errorIf(err, "Unable to access "+path.Join(prefix, fsMetaJSONFile))
+		return false
 	}
-	return true, nil
+	return true
 }

-// listLeafEntries - lists all entries if a given prefixPath is a leaf
-// directory, returns error if any - returns empty list if prefixPath
-// is not a leaf directory.
-func listLeafEntries(storage StorageAPI, prefixPath string) (entries []string, err error) {
-	var ok bool
-	if ok, err = isIncompleteMultipart(storage, prefixPath); err != nil {
-		return nil, err
-	} else if !ok {
-		return nil, nil
-	}
-	entries, err = storage.ListDir(minioMetaBucket, prefixPath)
+// listUploadsInfo - list all uploads info.
+//
+// prefixPath is split on "/" into at most three components; the second and
+// third are handed to getUploadIDs as the bucket and the object. An empty,
+// non-nil list is returned when getUploadIDs reports errFileNotFound, and
+// also when prefixPath has fewer than three components (there is no
+// bucket/object pair to look up in that case).
+func (fs fsObjects) listUploadsInfo(prefixPath string) (uploads []uploadInfo, err error) {
+	splitPrefixes := strings.SplitN(prefixPath, "/", 3)
+	if len(splitPrefixes) < 3 {
+		// Indexing [1] and [2] below would panic on such a prefix.
+		return []uploadInfo{}, nil
+	}
+	uploadIDs, err := getUploadIDs(splitPrefixes[1], splitPrefixes[2], fs.storage)
 	if err != nil {
+		if err == errFileNotFound {
+			return []uploadInfo{}, nil
+		}
 		return nil, err
 	}
-	var newEntries []string
-	for _, entry := range entries {
-		if strings.HasSuffix(entry, slashSeparator) {
-			newEntries = append(newEntries, entry)
-		}
-	}
-	return newEntries, nil
+	uploads = uploadIDs.Uploads
+	return uploads, nil
 }

-// listMetaBucketMultipartFiles - list all files at a given prefix inside minioMetaBucket.
-func listMetaBucketMultipartFiles(layer ObjectLayer, prefixPath string, markerPath string, recursive bool, maxKeys int) (fileInfos []FileInfo, eof bool, err error) {
-	var storage StorageAPI
-	switch l := layer.(type) {
-	case fsObjects:
-		storage = l.storage
-	case xlObjects:
-		storage = l.storage
-	}
-
-	if recursive && markerPath != "" {
-		markerPath = pathJoin(markerPath, incompleteFile)
-	}
-
-	walker := lookupTreeWalk(layer, listParams{minioMetaBucket, recursive, markerPath, prefixPath})
+// listMetaBucketMultipart - list all objects at a given prefix inside minioMetaBucket.
+//
+// Entries are pulled from a tree walker (a saved one when lookupTreeWalk
+// finds a match, a fresh one otherwise). Directory entries are expanded
+// through listUploadsInfo into one FileInfo per upload id; collection stops
+// as soon as maxKeys entries have been gathered.
+func (fs fsObjects) listMetaBucketMultipart(prefixPath string, markerPath string, recursive bool, maxKeys int) (fileInfos []FileInfo, eof bool, err error) {
+	walker := fs.lookupTreeWalk(listParams{minioMetaBucket, recursive, markerPath, prefixPath})
 	if walker == nil {
-		walker = startTreeWalk(layer, minioMetaBucket, prefixPath, markerPath, recursive)
+		walker = fs.startTreeWalk(minioMetaBucket, prefixPath, markerPath, recursive)
 	}

 	// newMaxKeys tracks the size of entries which are going to be
@@ -357,7 +132,6 @@ func listMetaBucketMultipartFiles(layer ObjectLayer, prefixPath string, markerPa

 	// Following loop gathers and filters out special files inside
 	// minio meta volume.
 outerLoop:
 	for {
 		walkResult, ok := <-walker.ch
 		if !ok {
@@ -373,47 +147,41 @@ outerLoop:
 			}
 			return nil, false, toObjectErr(walkResult.err, minioMetaBucket, prefixPath)
 		}
-		fi := walkResult.fileInfo
-		var entries []string
-		if fi.Mode.IsDir() {
+		fileInfo := walkResult.fileInfo
+		var uploads []uploadInfo
+		if fileInfo.Mode.IsDir() {
 			// List all the entries if fi.Name is a leaf directory, if
 			// fi.Name is not a leaf directory then the resulting
 			// entries are empty.
-			entries, err = listLeafEntries(storage, fi.Name)
+			uploads, err = fs.listUploadsInfo(fileInfo.Name)
 			if err != nil {
 				return nil, false, err
 			}
 		}
-		if len(entries) > 0 {
-			// We reach here for non-recursive case and a leaf entry.
-			sort.Strings(entries)
-			for _, entry := range entries {
-				var fileInfo FileInfo
-				incompleteUploadFile := path.Join(fi.Name, entry, incompleteFile)
-				fileInfo, err = storage.StatFile(minioMetaBucket, incompleteUploadFile)
-				if err != nil {
-					return nil, false, err
-				}
-				fileInfo.Name = path.Join(fi.Name, entry)
-				fileInfos = append(fileInfos, fileInfo)
+		if len(uploads) > 0 {
+			for _, upload := range uploads {
+				fileInfos = append(fileInfos, FileInfo{
+					Name:    path.Join(fileInfo.Name, upload.UploadID),
+					ModTime: upload.Initiated,
+				})
 				newMaxKeys++
 				// If we have reached the maxKeys, it means we have listed
 				// everything that was requested.
 				if newMaxKeys == maxKeys {
+					// The label is required here: a bare break would only
+					// leave this range loop and the walk would keep
+					// collecting entries past maxKeys.
 					break outerLoop
 				}
 			}
 		} else {
 			// We reach here for a non-recursive case non-leaf entry
 			// OR recursive case with fi.Name.
-			if !fi.Mode.IsDir() { // Do not skip non-recursive case directory entries.
+			if !fileInfo.Mode.IsDir() { // Do not skip non-recursive case directory entries.
 				// Validate if 'fi.Name' is incomplete multipart.
-				if !strings.HasSuffix(fi.Name, incompleteFile) {
+				if !strings.HasSuffix(fileInfo.Name, fsMetaJSONFile) {
 					continue
 				}
-				fi.Name = path.Dir(fi.Name)
+				fileInfo.Name = path.Dir(fileInfo.Name)
 			}
-			fileInfos = append(fileInfos, fi)
+			fileInfos = append(fileInfos, fileInfo)
 			newMaxKeys++
 			// If we have reached the maxKeys, it means we have listed
 			// everything that was requested.
@@ -428,34 +196,27 @@ outerLoop:
 		// can continue from where it left off for the next list request.
 		lastFileInfo := fileInfos[len(fileInfos)-1]
 		markerPath = lastFileInfo.Name
-		saveTreeWalk(layer, listParams{minioMetaBucket, recursive, markerPath, prefixPath}, walker)
+		fs.saveTreeWalk(listParams{minioMetaBucket, recursive, markerPath, prefixPath}, walker)
 	}
+	// Return entries here.
 	return fileInfos, eof, nil
 }

 // FIXME: Currently the code sorts based on keyName/upload-id which is
-// in correct based on the S3 specs. According to s3 specs we are
+// not correct based on the S3 specs.
According to s3 specs we are // supposed to only lexically sort keyNames and then for keyNames with // multiple upload ids should be sorted based on the initiated time. // Currently this case is not handled. -// listMultipartUploadsCommon - lists all multipart uploads, common -// function for both object layers. -func listMultipartUploadsCommon(layer ObjectLayer, bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { - var storage StorageAPI - switch l := layer.(type) { - case xlObjects: - storage = l.storage - case fsObjects: - storage = l.storage - } +// listMultipartUploadsCommon - lists all multipart uploads, common function for both object layers. +func (fs fsObjects) listMultipartUploadsCommon(bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { result := ListMultipartsInfo{} // Verify if bucket is valid. if !IsValidBucketName(bucket) { return ListMultipartsInfo{}, BucketNameInvalid{Bucket: bucket} } - if !isBucketExist(storage, bucket) { + if !fs.isBucketExist(bucket) { return ListMultipartsInfo{}, BucketNotFound{Bucket: bucket} } if !IsValidObjectPrefix(prefix) { @@ -514,27 +275,27 @@ func listMultipartUploadsCommon(layer ObjectLayer, bucket, prefix, keyMarker, up } // List all the multipart files at prefixPath, starting with marker keyMarkerPath. - fileInfos, eof, err := listMetaBucketMultipartFiles(layer, multipartPrefixPath, multipartMarkerPath, recursive, maxUploads) + fileInfos, eof, err := fs.listMetaBucketMultipart(multipartPrefixPath, multipartMarkerPath, recursive, maxUploads) if err != nil { return ListMultipartsInfo{}, err } // Loop through all the received files fill in the multiparts result. - for _, fi := range fileInfos { + for _, fileInfo := range fileInfos { var objectName string var uploadID string - if fi.Mode.IsDir() { + if fileInfo.Mode.IsDir() { // All directory entries are common prefixes. 
uploadID = "" // Upload ids are empty for CommonPrefixes. - objectName = strings.TrimPrefix(fi.Name, retainSlash(pathJoin(mpartMetaPrefix, bucket))) + objectName = strings.TrimPrefix(fileInfo.Name, retainSlash(pathJoin(mpartMetaPrefix, bucket))) result.CommonPrefixes = append(result.CommonPrefixes, objectName) } else { - uploadID = path.Base(fi.Name) - objectName = strings.TrimPrefix(path.Dir(fi.Name), retainSlash(pathJoin(mpartMetaPrefix, bucket))) + uploadID = path.Base(fileInfo.Name) + objectName = strings.TrimPrefix(path.Dir(fileInfo.Name), retainSlash(pathJoin(mpartMetaPrefix, bucket))) result.Uploads = append(result.Uploads, uploadMetadata{ Object: objectName, UploadID: uploadID, - Initiated: fi.ModTime, + Initiated: fileInfo.ModTime, }) } result.NextKeyMarker = objectName @@ -548,51 +309,165 @@ func listMultipartUploadsCommon(layer ObjectLayer, bucket, prefix, keyMarker, up return result, nil } -// ListObjectParts - list object parts, common function across both object layers. -func listObjectPartsCommon(storage StorageAPI, bucket, object, uploadID string, partNumberMarker, maxParts int) (ListPartsInfo, error) { +// ListMultipartUploads - list multipart uploads. +func (fs fsObjects) ListMultipartUploads(bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { + return fs.listMultipartUploadsCommon(bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads) +} + +// NewMultipartUpload - initialize a new multipart upload, returns a unique id. +func (fs fsObjects) NewMultipartUpload(bucket, object string, meta map[string]string) (string, error) { + meta = make(map[string]string) // Reset the meta value, we are not going to save headers for fs. + return fs.newMultipartUploadCommon(bucket, object, meta) +} + +// putObjectPartCommon - put object part. 
+//
+// The payload is streamed into a temporary file while its MD5 is computed,
+// verified against md5Hex when one is supplied, renamed into the upload's
+// directory and then recorded in the upload's metadata. It returns the hex
+// MD5 of the written data. When size is not positive the whole reader is
+// consumed and the number of bytes copied is recorded as the part size.
+func (fs fsObjects) putObjectPartCommon(bucket string, object string, uploadID string, partID int, size int64, data io.Reader, md5Hex string) (string, error) {
+	// Verify if bucket is valid.
+	if !IsValidBucketName(bucket) {
+		return "", BucketNameInvalid{Bucket: bucket}
+	}
+	// Verify whether the bucket exists.
+	if !fs.isBucketExist(bucket) {
+		return "", BucketNotFound{Bucket: bucket}
+	}
+	if !IsValidObjectName(object) {
+		return "", ObjectNameInvalid{Bucket: bucket, Object: object}
+	}
+	if !fs.isUploadIDExists(bucket, object, uploadID) {
+		return "", InvalidUploadID{UploadID: uploadID}
+	}
+	// Hold read lock on the uploadID so that no one aborts it.
+	nsMutex.RLock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
+	defer nsMutex.RUnlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
+
+	// Hold write lock on the part so that there is no parallel upload on the part.
+	nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID, strconv.Itoa(partID)))
+	defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID, strconv.Itoa(partID)))
+
+	partSuffix := fmt.Sprintf("object%d", partID)
+	tmpPartPath := path.Join(tmpMetaPrefix, bucket, object, uploadID, partSuffix)
+	fileWriter, err := fs.storage.CreateFile(minioMetaBucket, tmpPartPath)
+	if err != nil {
+		return "", toObjectErr(err, bucket, object)
+	}
+
+	// Initialize md5 writer.
+	md5Writer := md5.New()
+
+	// Instantiate a new multi writer.
+	multiWriter := io.MultiWriter(md5Writer, fileWriter)
+
+	// Instantiate checksum hashers and create a multiwriter.
+	if size > 0 {
+		if _, err = io.CopyN(multiWriter, data, size); err != nil {
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", toObjectErr(clErr, bucket, object)
+			}
+			return "", toObjectErr(err, bucket, object)
+		}
+		// Reader shouldn't have more data than what is mentioned in the size
+		// argument; read one more byte from the reader to validate it. The
+		// read is expected to fail, success means there is more data.
+		if _, err = io.CopyN(ioutil.Discard, data, 1); err == nil {
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", toObjectErr(clErr, bucket, object)
+			}
+			return "", UnExpectedDataSize{Size: int(size)}
+		}
+	} else {
+		var n int64
+		if n, err = io.Copy(multiWriter, data); err != nil {
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", toObjectErr(clErr, bucket, object)
+			}
+			return "", toObjectErr(err, bucket, object)
+		}
+		size = n
+	}
+
+	newMD5Hex := hex.EncodeToString(md5Writer.Sum(nil))
+	if md5Hex != "" {
+		if newMD5Hex != md5Hex {
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", toObjectErr(clErr, bucket, object)
+			}
+			return "", BadDigest{md5Hex, newMD5Hex}
+		}
+	}
+	err = fileWriter.Close()
+	if err != nil {
+		if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+			return "", toObjectErr(clErr, bucket, object)
+		}
+		return "", toObjectErr(err, bucket, object)
+	}
+
+	uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID)
+
+	// The upload-ID lock above is only a read lock and the part lock is per
+	// part, so uploads of different parts reach this point concurrently.
+	// Serialize the read-modify-write of the upload metadata on a key of its
+	// own, otherwise one writer can overwrite the part entry just added by
+	// another.
+	nsMutex.Lock(minioMetaBucket, pathJoin(uploadIDPath, fsMetaJSONFile))
+	defer nsMutex.Unlock(minioMetaBucket, pathJoin(uploadIDPath, fsMetaJSONFile))
+
+	fsMeta, err := fs.readFSMetadata(minioMetaBucket, uploadIDPath)
+	if err != nil {
+		// Do not leave the staged part behind in the temporary area.
+		if dErr := fs.storage.DeleteFile(minioMetaBucket, tmpPartPath); dErr != nil {
+			return "", toObjectErr(dErr, minioMetaBucket, tmpPartPath)
+		}
+		return "", toObjectErr(err, minioMetaBucket, uploadIDPath)
+	}
+	fsMeta.AddObjectPart(partSuffix, newMD5Hex, size)
+
+	partPath := path.Join(mpartMetaPrefix, bucket, object, uploadID, partSuffix)
+	err = fs.storage.RenameFile(minioMetaBucket, tmpPartPath, minioMetaBucket, partPath)
+	if err != nil {
+		if dErr := fs.storage.DeleteFile(minioMetaBucket, tmpPartPath); dErr != nil {
+			return "", toObjectErr(dErr, minioMetaBucket, tmpPartPath)
+		}
+		return "", toObjectErr(err, minioMetaBucket, partPath)
+	}
+	if err = fs.writeFSMetadata(minioMetaBucket, uploadIDPath, fsMeta); err != nil {
+		return "", toObjectErr(err, minioMetaBucket, uploadIDPath)
+	}
+	return newMD5Hex, nil
+}
+
+//
PutObjectPart - writes the multipart upload chunks.
+func (fs fsObjects) PutObjectPart(bucket, object, uploadID string, partID int, size int64, data io.Reader, md5Hex string) (string, error) {
+	return fs.putObjectPartCommon(bucket, object, uploadID, partID, size, data, md5Hex)
+}
+
+// listObjectPartsCommon - lists the parts recorded in the metadata of the
+// given upload id, skipping the first partNumberMarker recorded parts.
+// Size and modification time of every listed part come from a StatFile call
+// on the part file; the ETag comes from the metadata.
+//
+// NOTE(review): part numbers are derived from the position of a part in
+// fsMeta.Parts (index + 1), not from the part name; this presumably relies
+// on Parts being kept in part-number order without gaps - confirm against
+// AddObjectPart.
+func (fs fsObjects) listObjectPartsCommon(bucket, object, uploadID string, partNumberMarker, maxParts int) (ListPartsInfo, error) {
 	// Verify if bucket is valid.
 	if !IsValidBucketName(bucket) {
 		return ListPartsInfo{}, BucketNameInvalid{Bucket: bucket}
 	}
 	// Verify whether the bucket exists.
-	if !isBucketExist(storage, bucket) {
+	if !fs.isBucketExist(bucket) {
 		return ListPartsInfo{}, BucketNotFound{Bucket: bucket}
 	}
 	if !IsValidObjectName(object) {
 		return ListPartsInfo{}, ObjectNameInvalid{Bucket: bucket, Object: object}
 	}
-	if !isUploadIDExists(storage, bucket, object, uploadID) {
+	if !fs.isUploadIDExists(bucket, object, uploadID) {
 		return ListPartsInfo{}, InvalidUploadID{UploadID: uploadID}
 	}
 	// Hold lock so that there is no competing abort-multipart-upload or complete-multipart-upload.
 	nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
 	defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
 	result := ListPartsInfo{}
-	entries, err := storage.ListDir(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object, uploadID))
+
+	uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID)
+	fsMeta, err := fs.readFSMetadata(minioMetaBucket, uploadIDPath)
 	if err != nil {
-		return result, err
+		return ListPartsInfo{}, toObjectErr(err, minioMetaBucket, uploadIDPath)
 	}
-	sort.Strings(entries)
-	var newEntries []string
-	for _, entry := range entries {
-		newEntries = append(newEntries, path.Base(entry))
-	}
-	idx := sort.SearchStrings(newEntries, fmt.Sprintf("%.5d.", partNumberMarker+1))
-	newEntries = newEntries[idx:]
+	// Only parts with higher part numbers will be listed.
+	// The marker comes from the caller and may be negative or larger than
+	// the number of recorded parts; slicing with it unchecked would panic.
+	// Clamp a local copy so that partNumberMarker itself stays untouched.
+	start := partNumberMarker
+	if start < 0 {
+		start = 0
+	}
+	if start > len(fsMeta.Parts) {
+		start = len(fsMeta.Parts)
+	}
+	parts := fsMeta.Parts[start:]
 	count := maxParts
-	for _, entry := range newEntries {
-		fi, err := storage.StatFile(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object, uploadID, entry))
-		splitEntry := strings.SplitN(entry, ".", 2)
-		partStr := splitEntry[0]
-		etagStr := splitEntry[1]
-		partNum, err := strconv.Atoi(partStr)
+	for i, part := range parts {
+		var fi FileInfo
+		partNamePath := path.Join(mpartMetaPrefix, bucket, object, uploadID, part.Name)
+		fi, err = fs.storage.StatFile(minioMetaBucket, partNamePath)
 		if err != nil {
-			return ListPartsInfo{}, err
+			return ListPartsInfo{}, toObjectErr(err, minioMetaBucket, partNamePath)
 		}
+		partNum := i + start + 1
 		result.Parts = append(result.Parts, partInfo{
 			PartNumber:   partNum,
+			ETag:         part.ETag,
 			LastModified: fi.ModTime,
-			ETag:         etagStr,
 			Size:         fi.Size,
 		})
 		count--
@@ -601,7 +476,7 @@ func listObjectPartsCommon(storage StorageAPI, bucket, object, uploadID string,
 		}
 	}
 	// If listed entries are more than maxParts, we set IsTruncated as true.
-	if len(newEntries) > len(result.Parts) {
+	if len(parts) > len(result.Parts) {
 		result.IsTruncated = true
 		// Make sure to fill next part number marker if IsTruncated is
 		// true for subsequent listing.
@@ -615,16 +490,170 @@ func listObjectPartsCommon(storage StorageAPI, bucket, object, uploadID string,
 	return result, nil
 }

+// ListObjectParts - list the uploaded parts of a multipart upload.
+func (fs fsObjects) ListObjectParts(bucket, object, uploadID string, partNumberMarker, maxParts int) (ListPartsInfo, error) {
+	return fs.listObjectPartsCommon(bucket, object, uploadID, partNumberMarker, maxParts)
+}
+
 // isUploadIDExists - verify if a given uploadID exists and is valid.
-func isUploadIDExists(storage StorageAPI, bucket, object, uploadID string) bool {
-	uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID, incompleteFile)
-	st, err := storage.StatFile(minioMetaBucket, uploadIDPath)
+func (fs fsObjects) isUploadIDExists(bucket, object, uploadID string) bool {
+	uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID)
+	_, err := fs.storage.StatFile(minioMetaBucket, path.Join(uploadIDPath, fsMetaJSONFile))
 	if err != nil {
 		if err == errFileNotFound {
 			return false
 		}
-		errorIf(err, "Stat failed on "+minioMetaBucket+"/"+uploadIDPath+".")
+		errorIf(err, "Unable to access upload id"+uploadIDPath)
 		return false
 	}
-	return st.Mode.IsRegular()
+	return true
+}
+
+// CompleteMultipartUpload - concatenates the listed parts, in the order
+// given, into a temporary object, renames it to bucket/object and then
+// removes the uploaded parts. Returns the md5sum computed by
+// completeMultipartMD5 over the supplied parts.
+//
+// Every part except the last must satisfy isMinAllowedPartSize; a part
+// whose file is missing yields InvalidPart. On any failure after the
+// temporary object has been created, its writer is closed and removed
+// through safeCloseAndRemove.
+func (fs fsObjects) CompleteMultipartUpload(bucket string, object string, uploadID string, parts []completePart) (string, error) {
+	// Verify if bucket is valid.
+	if !IsValidBucketName(bucket) {
+		return "", BucketNameInvalid{Bucket: bucket}
+	}
+	// Verify whether the bucket exists.
+	if !fs.isBucketExist(bucket) {
+		return "", BucketNotFound{Bucket: bucket}
+	}
+	if !IsValidObjectName(object) {
+		return "", ObjectNameInvalid{
+			Bucket: bucket,
+			Object: object,
+		}
+	}
+	if !fs.isUploadIDExists(bucket, object, uploadID) {
+		return "", InvalidUploadID{UploadID: uploadID}
+	}
+
+	// Hold lock so that there is no competing abort-multipart-upload,
+	// list-object-parts or put-object-part on this upload id. Part uploads
+	// stage their data under the same temporary directory as 'tempObj'
+	// below, so they must not run while the object is being assembled.
+	nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
+	defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
+
+	// Calculate s3 compatible md5sum for complete multipart.
+	s3MD5, err := completeMultipartMD5(parts...)
+	if err != nil {
+		return "", err
+	}
+
+	tempObj := path.Join(tmpMetaPrefix, bucket, object, uploadID, "object1")
+	fileWriter, err := fs.storage.CreateFile(minioMetaBucket, tempObj)
+	if err != nil {
+		return "", toObjectErr(err, bucket, object)
+	}
+
+	// Loop through all parts, validate them and then commit to disk.
+	for i, part := range parts {
+		// Construct part suffix.
+		partSuffix := fmt.Sprintf("object%d", part.PartNumber)
+		multipartPartFile := path.Join(mpartMetaPrefix, bucket, object, uploadID, partSuffix)
+		var fi FileInfo
+		fi, err = fs.storage.StatFile(minioMetaBucket, multipartPartFile)
+		if err != nil {
+			// Discard the partially assembled temporary object before bailing out.
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", clErr
+			}
+			if err == errFileNotFound {
+				return "", InvalidPart{}
+			}
+			return "", err
+		}
+		// All parts except the last part have to be at least 5MB.
+		if (i < len(parts)-1) && !isMinAllowedPartSize(fi.Size) {
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", clErr
+			}
+			return "", PartTooSmall{}
+		}
+		var fileReader io.ReadCloser
+		fileReader, err = fs.storage.ReadFile(minioMetaBucket, multipartPartFile, 0)
+		if err != nil {
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", clErr
+			}
+			if err == errFileNotFound {
+				return "", InvalidPart{}
+			}
+			return "", err
+		}
+		_, err = io.Copy(fileWriter, fileReader)
+		if err != nil {
+			// Release the part reader as well; the copy failure is the
+			// error worth reporting, so a close error is dropped here.
+			_ = fileReader.Close()
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", clErr
+			}
+			return "", err
+		}
+		err = fileReader.Close()
+		if err != nil {
+			if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+				return "", clErr
+			}
+			return "", err
+		}
+	}
+
+	err = fileWriter.Close()
+	if err != nil {
+		if clErr := safeCloseAndRemove(fileWriter); clErr != nil {
+			return "", clErr
+		}
+		return "", err
+	}
+
+	// Rename the file back to original location, if not delete the temporary object.
+	err = fs.storage.RenameFile(minioMetaBucket, tempObj, bucket, object)
+	if err != nil {
+		if dErr := fs.storage.DeleteFile(minioMetaBucket, tempObj); dErr != nil {
+			return "", toObjectErr(dErr, minioMetaBucket, tempObj)
+		}
+		return "", toObjectErr(err, bucket, object)
+	}
+
+	// Cleanup all the parts if everything else has been safely committed.
+	if err = cleanupUploadedParts(bucket, object, uploadID, fs.storage); err != nil {
+		return "", err
+	}
+
+	// Return md5sum.
+	return s3MD5, nil
+}
+
+// abortMultipartUploadCommon - aborts a multipart upload, common
+// function used by both object layers.
+//
+// After validating its arguments it takes the exclusive namespace lock on
+// the upload id, removes the uploaded parts through cleanupUploadedParts
+// and finally deletes 'uploads.json' unless getUploadIDs still reports
+// other uploads for the object.
+func (fs fsObjects) abortMultipartUploadCommon(bucket, object, uploadID string) error {
+	// Verify if bucket is valid.
+	if !IsValidBucketName(bucket) {
+		return BucketNameInvalid{Bucket: bucket}
+	}
+	if !fs.isBucketExist(bucket) {
+		return BucketNotFound{Bucket: bucket}
+	}
+	if !IsValidObjectName(object) {
+		return ObjectNameInvalid{Bucket: bucket, Object: object}
+	}
+	if !fs.isUploadIDExists(bucket, object, uploadID) {
+		return InvalidUploadID{UploadID: uploadID}
+	}
+
+	// Hold lock so that there is no competing complete-multipart-upload or put-object-part.
+	nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
+	defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID))
+
+	// Cleanup all uploaded parts.
+	if err := cleanupUploadedParts(bucket, object, uploadID, fs.storage); err != nil {
+		return err
+	}
+
+	// Validate if there are other incomplete upload-id's present for
+	// the object, if yes do not attempt to delete 'uploads.json'.
+	uploadIDs, err := getUploadIDs(bucket, object, fs.storage)
+	if err == nil {
+		// Drop the aborted upload id from the in-memory list only.
+		uploadIDIdx := uploadIDs.SearchUploadID(uploadID)
+		if uploadIDIdx != -1 {
+			uploadIDs.Uploads = append(uploadIDs.Uploads[:uploadIDIdx], uploadIDs.Uploads[uploadIDIdx+1:]...)
+		}
+		// NOTE(review): when other uploads remain, the function returns
+		// without writing the shortened list back, so the aborted upload id
+		// presumably stays recorded in 'uploads.json' - confirm and persist
+		// the updated list here.
+		if len(uploadIDs.Uploads) > 0 {
+			return nil
+		}
+	}
+	// Reached when no other upload remains, and also when getUploadIDs
+	// failed for any reason.
+	// NOTE(review): deleting 'uploads.json' after a read failure other than
+	// "file not found" looks unintended - confirm.
+	if err = fs.storage.DeleteFile(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object, uploadsJSONFile)); err != nil {
+		return toObjectErr(err, minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object))
+	}
+	return nil
+}
+
+// AbortMultipartUpload - aborts a multipart upload.
+func (fs fsObjects) AbortMultipartUpload(bucket, object, uploadID string) error { + return fs.abortMultipartUploadCommon(bucket, object, uploadID) } diff --git a/fs-objects.go b/fs-v1.go similarity index 56% rename from fs-objects.go rename to fs-v1.go index d1f7b89f2..f4ab2a060 100644 --- a/fs-objects.go +++ b/fs-v1.go @@ -21,6 +21,7 @@ import ( "encoding/hex" "io" "path/filepath" + "sort" "strings" "sync" @@ -30,7 +31,7 @@ import ( // fsObjects - Implements fs object layer. type fsObjects struct { storage StorageAPI - listObjectMap map[listParams][]*treeWalker + listObjectMap map[listParams][]*treeWalkerFS listObjectMapMutex *sync.Mutex } @@ -59,7 +60,7 @@ func newFSObjects(exportPath string) (ObjectLayer, error) { // Return successfully initialized object layer. return fsObjects{ storage: storage, - listObjectMap: make(map[listParams][]*treeWalker), + listObjectMap: make(map[listParams][]*treeWalkerFS), listObjectMapMutex: &sync.Mutex{}, }, nil } @@ -68,22 +69,68 @@ func newFSObjects(exportPath string) (ObjectLayer, error) { // MakeBucket - make a bucket. func (fs fsObjects) MakeBucket(bucket string) error { - return makeBucket(fs.storage, bucket) + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketNameInvalid{Bucket: bucket} + } + if err := fs.storage.MakeVol(bucket); err != nil { + return toObjectErr(err, bucket) + } + return nil } // GetBucketInfo - get bucket info. func (fs fsObjects) GetBucketInfo(bucket string) (BucketInfo, error) { - return getBucketInfo(fs.storage, bucket) + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketInfo{}, BucketNameInvalid{Bucket: bucket} + } + vi, err := fs.storage.StatVol(bucket) + if err != nil { + return BucketInfo{}, toObjectErr(err, bucket) + } + return BucketInfo{ + Name: bucket, + Created: vi.Created, + Total: vi.Total, + Free: vi.Free, + }, nil } // ListBuckets - list buckets. 
func (fs fsObjects) ListBuckets() ([]BucketInfo, error) { - return listBuckets(fs.storage) + var bucketInfos []BucketInfo + vols, err := fs.storage.ListVols() + if err != nil { + return nil, toObjectErr(err) + } + for _, vol := range vols { + // StorageAPI can send volume names which are incompatible + // with buckets, handle it and skip them. + if !IsValidBucketName(vol.Name) { + continue + } + bucketInfos = append(bucketInfos, BucketInfo{ + Name: vol.Name, + Created: vol.Created, + Total: vol.Total, + Free: vol.Free, + }) + } + sort.Sort(byBucketName(bucketInfos)) + return bucketInfos, nil } // DeleteBucket - delete a bucket. func (fs fsObjects) DeleteBucket(bucket string) error { - return deleteBucket(fs.storage, bucket) + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketNameInvalid{Bucket: bucket} + } + if err := fs.storage.DeleteVol(bucket); err != nil { + return toObjectErr(err, bucket) + } + return nil } /// Object Operations @@ -218,7 +265,121 @@ func (fs fsObjects) DeleteObject(bucket, object string) error { return nil } +// Checks whether bucket exists. +func isBucketExist(storage StorageAPI, bucketName string) bool { + // Check whether bucket exists. + _, err := storage.StatVol(bucketName) + if err != nil { + if err == errVolumeNotFound { + return false + } + errorIf(err, "Stat failed on bucket "+bucketName+".") + return false + } + return true +} + +func (fs fsObjects) listObjectsFS(bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return ListObjectsInfo{}, BucketNameInvalid{Bucket: bucket} + } + // Verify if bucket exists. + if !isBucketExist(fs.storage, bucket) { + return ListObjectsInfo{}, BucketNotFound{Bucket: bucket} + } + if !IsValidObjectPrefix(prefix) { + return ListObjectsInfo{}, ObjectNameInvalid{Bucket: bucket, Object: prefix} + } + // Verify if delimiter is anything other than '/', which we do not support. 
+ if delimiter != "" && delimiter != slashSeparator { + return ListObjectsInfo{}, UnsupportedDelimiter{ + Delimiter: delimiter, + } + } + // Verify if marker has prefix. + if marker != "" { + if !strings.HasPrefix(marker, prefix) { + return ListObjectsInfo{}, InvalidMarkerPrefixCombination{ + Marker: marker, + Prefix: prefix, + } + } + } + + // With max keys of zero we have reached eof, return right here. + if maxKeys == 0 { + return ListObjectsInfo{}, nil + } + + // Over flowing count - reset to maxObjectList. + if maxKeys < 0 || maxKeys > maxObjectList { + maxKeys = maxObjectList + } + + // Default is recursive, if delimiter is set then list non recursive. + recursive := true + if delimiter == slashSeparator { + recursive = false + } + + walker := fs.lookupTreeWalk(listParams{bucket, recursive, marker, prefix}) + if walker == nil { + walker = fs.startTreeWalk(bucket, prefix, marker, recursive) + } + var fileInfos []FileInfo + var eof bool + var nextMarker string + for i := 0; i < maxKeys; { + walkResult, ok := <-walker.ch + if !ok { + // Closed channel. + eof = true + break + } + // For any walk error return right away. + if walkResult.err != nil { + // File not found is a valid case. + if walkResult.err == errFileNotFound { + return ListObjectsInfo{}, nil + } + return ListObjectsInfo{}, toObjectErr(walkResult.err, bucket, prefix) + } + fileInfo := walkResult.fileInfo + nextMarker = fileInfo.Name + fileInfos = append(fileInfos, fileInfo) + if walkResult.end { + eof = true + break + } + i++ + } + params := listParams{bucket, recursive, nextMarker, prefix} + if !eof { + fs.saveTreeWalk(params, walker) + } + + result := ListObjectsInfo{IsTruncated: !eof} + for _, fileInfo := range fileInfos { + // With delimiter set we fill in NextMarker and Prefixes. 
+ if delimiter == slashSeparator { + result.NextMarker = fileInfo.Name + if fileInfo.Mode.IsDir() { + result.Prefixes = append(result.Prefixes, fileInfo.Name) + continue + } + } + result.Objects = append(result.Objects, ObjectInfo{ + Name: fileInfo.Name, + ModTime: fileInfo.ModTime, + Size: fileInfo.Size, + IsDir: false, + }) + } + return result, nil +} + // ListObjects - list all objects. func (fs fsObjects) ListObjects(bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { - return listObjectsCommon(fs, bucket, prefix, marker, delimiter, maxKeys) + return fs.listObjectsFS(bucket, prefix, marker, delimiter, maxKeys) } diff --git a/object-common.go b/object-common.go index a95615b1a..193868009 100644 --- a/object-common.go +++ b/object-common.go @@ -16,10 +16,7 @@ package main -import ( - "sort" - "strings" -) +import "strings" // Common initialization needed for both object layers. func initObjectLayer(storageDisks ...StorageAPI) error { @@ -69,192 +66,3 @@ func cleanupDir(storage StorageAPI, volume, dirPath string) error { } return delFunc(retainSlash(pathJoin(dirPath))) } - -/// Common object layer functions. - -// makeBucket - create a bucket, is a common function for both object layers. -func makeBucket(storage StorageAPI, bucket string) error { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return BucketNameInvalid{Bucket: bucket} - } - if err := storage.MakeVol(bucket); err != nil { - return toObjectErr(err, bucket) - } - return nil -} - -// getBucketInfo - fetch bucket info, is a common function for both object layers. -func getBucketInfo(storage StorageAPI, bucket string) (BucketInfo, error) { - // Verify if bucket is valid. 
- if !IsValidBucketName(bucket) { - return BucketInfo{}, BucketNameInvalid{Bucket: bucket} - } - vi, err := storage.StatVol(bucket) - if err != nil { - return BucketInfo{}, toObjectErr(err, bucket) - } - return BucketInfo{ - Name: bucket, - Created: vi.Created, - Total: vi.Total, - Free: vi.Free, - }, nil -} - -// listBuckets - list all buckets, is a common function for both object layers. -func listBuckets(storage StorageAPI) ([]BucketInfo, error) { - var bucketInfos []BucketInfo - vols, err := storage.ListVols() - if err != nil { - return nil, toObjectErr(err) - } - for _, vol := range vols { - // StorageAPI can send volume names which are incompatible - // with buckets, handle it and skip them. - if !IsValidBucketName(vol.Name) { - continue - } - bucketInfos = append(bucketInfos, BucketInfo{ - Name: vol.Name, - Created: vol.Created, - Total: vol.Total, - Free: vol.Free, - }) - } - sort.Sort(byBucketName(bucketInfos)) - return bucketInfos, nil -} - -// deleteBucket - deletes a bucket, is a common function for both the layers. -func deleteBucket(storage StorageAPI, bucket string) error { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return BucketNameInvalid{Bucket: bucket} - } - if err := storage.DeleteVol(bucket); err != nil { - return toObjectErr(err, bucket) - } - return nil -} - -func listObjectsCommon(layer ObjectLayer, bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { - var storage StorageAPI - switch l := layer.(type) { - case xlObjects: - storage = l.storage - case fsObjects: - storage = l.storage - } - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return ListObjectsInfo{}, BucketNameInvalid{Bucket: bucket} - } - // Verify if bucket exists. 
- if !isBucketExist(storage, bucket) { - return ListObjectsInfo{}, BucketNotFound{Bucket: bucket} - } - if !IsValidObjectPrefix(prefix) { - return ListObjectsInfo{}, ObjectNameInvalid{Bucket: bucket, Object: prefix} - } - // Verify if delimiter is anything other than '/', which we do not support. - if delimiter != "" && delimiter != slashSeparator { - return ListObjectsInfo{}, UnsupportedDelimiter{ - Delimiter: delimiter, - } - } - // Verify if marker has prefix. - if marker != "" { - if !strings.HasPrefix(marker, prefix) { - return ListObjectsInfo{}, InvalidMarkerPrefixCombination{ - Marker: marker, - Prefix: prefix, - } - } - } - - // With max keys of zero we have reached eof, return right here. - if maxKeys == 0 { - return ListObjectsInfo{}, nil - } - - // Over flowing count - reset to maxObjectList. - if maxKeys < 0 || maxKeys > maxObjectList { - maxKeys = maxObjectList - } - - // Default is recursive, if delimiter is set then list non recursive. - recursive := true - if delimiter == slashSeparator { - recursive = false - } - - walker := lookupTreeWalk(layer, listParams{bucket, recursive, marker, prefix}) - if walker == nil { - walker = startTreeWalk(layer, bucket, prefix, marker, recursive) - } - var fileInfos []FileInfo - var eof bool - var nextMarker string - for i := 0; i < maxKeys; { - walkResult, ok := <-walker.ch - if !ok { - // Closed channel. - eof = true - break - } - // For any walk error return right away. - if walkResult.err != nil { - // File not found is a valid case. 
- if walkResult.err == errFileNotFound { - return ListObjectsInfo{}, nil - } - return ListObjectsInfo{}, toObjectErr(walkResult.err, bucket, prefix) - } - fileInfo := walkResult.fileInfo - nextMarker = fileInfo.Name - fileInfos = append(fileInfos, fileInfo) - if walkResult.end { - eof = true - break - } - i++ - } - params := listParams{bucket, recursive, nextMarker, prefix} - if !eof { - saveTreeWalk(layer, params, walker) - } - - result := ListObjectsInfo{IsTruncated: !eof} - for _, fileInfo := range fileInfos { - // With delimiter set we fill in NextMarker and Prefixes. - if delimiter == slashSeparator { - result.NextMarker = fileInfo.Name - if fileInfo.Mode.IsDir() { - result.Prefixes = append(result.Prefixes, fileInfo.Name) - continue - } - } - result.Objects = append(result.Objects, ObjectInfo{ - Name: fileInfo.Name, - ModTime: fileInfo.ModTime, - Size: fileInfo.Size, - IsDir: false, - }) - } - return result, nil -} - -// checks whether bucket exists. -func isBucketExist(storage StorageAPI, bucketName string) bool { - // Check whether bucket exists. - _, err := storage.StatVol(bucketName) - if err != nil { - if err == errVolumeNotFound { - return false - } - errorIf(err, "Stat failed on bucket "+bucketName+".") - return false - } - return true -} diff --git a/object-utils.go b/object-utils.go index c0b0a59ff..2b9f027e0 100644 --- a/object-utils.go +++ b/object-utils.go @@ -28,6 +28,7 @@ import ( "unicode/utf8" "github.com/minio/minio/pkg/safe" + "github.com/skyrings/skyring-common/tools/uuid" ) const ( @@ -123,6 +124,20 @@ func pathJoin(elem ...string) string { return path.Join(elem...) + trailingSlash } +// getUUID() - get a unique uuid. +func getUUID() (uuidStr string) { + for { + uuid, err := uuid.New() + if err != nil { + errorIf(err, "Unable to initialize uuid") + continue + } + uuidStr = uuid.String() + break + } + return uuidStr +} + // Create an s3 compatible MD5sum for complete multipart transaction. 
func completeMultipartMD5(parts ...completePart) (string, error) { var finalMD5Bytes []byte diff --git a/object_api_suite_test.go b/object_api_suite_test.go index 4363ac21e..96e744202 100644 --- a/object_api_suite_test.go +++ b/object_api_suite_test.go @@ -27,8 +27,6 @@ import ( "gopkg.in/check.v1" ) -// TODO - enable all the commented tests. - // APITestSuite - collection of API tests. func APITestSuite(c *check.C, create func() ObjectLayer) { testMakeBucket(c, create) diff --git a/posix.go b/posix.go index bc0de22e9..fe4240b35 100644 --- a/posix.go +++ b/posix.go @@ -333,6 +333,8 @@ func (s fsStorage) ReadFile(volume string, path string, offset int64) (readClose return nil, errFileNotFound } else if os.IsPermission(err) { return nil, errFileAccessDenied + } else if strings.Contains(err.Error(), "not a directory") { + return nil, errFileNotFound } return nil, err } @@ -425,7 +427,6 @@ func (s fsStorage) StatFile(volume, path string) (file FileInfo, err error) { // Return all errors here. return FileInfo{}, err } - // If its a directory its not a regular file. if st.Mode().IsDir() { return FileInfo{}, errFileNotFound diff --git a/test-utils_test.go b/test-utils_test.go index 9c9747e5e..7f45bf127 100644 --- a/test-utils_test.go +++ b/test-utils_test.go @@ -44,6 +44,10 @@ func ExecObjectLayerTest(t *testing.T, objTest func(obj ObjectLayer, instanceTyp } erasureDisks = append(erasureDisks, path) } + + // Initialize name space lock. + initNSLock() + objLayer, err := newXLObjects(erasureDisks) if err != nil { return nil, nil, err @@ -59,6 +63,9 @@ func ExecObjectLayerTest(t *testing.T, objTest func(obj ObjectLayer, instanceTyp return nil, "", err } + // Initialize name space lock. + initNSLock() + // Create the obj. objLayer, err := newFSObjects(fsDir) if err != nil { @@ -80,7 +87,7 @@ func ExecObjectLayerTest(t *testing.T, objTest func(obj ObjectLayer, instanceTyp } // Executing the object layer tests for single node setup. 
objTest(objLayer, singleNodeTestStr, t) - initNSLock() + objLayer, fsDirs, err := getXLObjectLayer() if err != nil { t.Fatalf("Initialization of object layer failed for XL setup: %s", err.Error()) diff --git a/tree-walk.go b/tree-walk-fs.go similarity index 59% rename from tree-walk.go rename to tree-walk-fs.go index 3f61a6af7..3394e3e7f 100644 --- a/tree-walk.go +++ b/tree-walk-fs.go @@ -21,49 +21,30 @@ import ( "path" "sort" "strings" - "sync" "time" ) -// listParams - list object params used for list object map -type listParams struct { - bucket string - recursive bool - marker string - prefix string +// Tree walk notify carries a channel which notifies tree walk +// results, additionally it also carries information if treeWalk +// should be timedOut. +type treeWalkerFS struct { + ch <-chan treeWalkResultFS + timedOut bool } // Tree walk result carries results of tree walking. -type treeWalkResult struct { +type treeWalkResultFS struct { fileInfo FileInfo err error end bool } -// Tree walk notify carries a channel which notifies tree walk -// results, additionally it also carries information if treeWalk -// should be timedOut. -type treeWalker struct { - ch <-chan treeWalkResult - timedOut bool -} - // treeWalk walks FS directory tree recursively pushing fileInfo into the channel as and when it encounters files. 
-func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker string, recursive bool, send func(treeWalkResult) bool, count *int) bool { +func (fs fsObjects) treeWalk(bucket, prefixDir, entryPrefixMatch, marker string, recursive bool, send func(treeWalkResultFS) bool, count *int) bool { // Example: // if prefixDir="one/two/three/" and marker="four/five.txt" treeWalk is recursively // called with prefixDir="one/two/three/four/" and marker="five.txt" - var isXL bool - var disk StorageAPI - switch l := layer.(type) { - case xlObjects: - isXL = true - disk = l.storage - case fsObjects: - disk = l.storage - } - // Convert entry to FileInfo entryToFileInfo := func(entry string) (fileInfo FileInfo, err error) { if strings.HasSuffix(entry, slashSeparator) { @@ -73,26 +54,7 @@ func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker str fileInfo.Mode = os.ModeDir return } - if isXL && strings.HasSuffix(entry, multipartSuffix) { - // If the entry was detected as a multipart file we use - // getMultipartObjectInfo() to fill the FileInfo structure. - entry = strings.TrimSuffix(entry, multipartSuffix) - var info MultipartObjectInfo - info, err = getMultipartObjectInfo(disk, bucket, path.Join(prefixDir, entry)) - if err != nil { - return - } - // Set the Mode to a "regular" file. - fileInfo.Mode = 0 - // Trim the suffix that was temporarily added to indicate that this - // is a multipart file. - fileInfo.Name = path.Join(prefixDir, entry) - fileInfo.Size = info.Size - fileInfo.MD5Sum = info.MD5Sum - fileInfo.ModTime = info.ModTime - return - } - if fileInfo, err = disk.StatFile(bucket, path.Join(prefixDir, entry)); err != nil { + if fileInfo, err = fs.storage.StatFile(bucket, path.Join(prefixDir, entry)); err != nil { return } // Object name needs to be full path. 
@@ -110,9 +72,9 @@ func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker str markerBase = markerSplit[1] } } - entries, err := disk.ListDir(bucket, prefixDir) + entries, err := fs.storage.ListDir(bucket, prefixDir) if err != nil { - send(treeWalkResult{err: err}) + send(treeWalkResultFS{err: err}) return false } @@ -123,16 +85,7 @@ func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker str } } } - // For XL multipart files strip the trailing "/" and append ".minio.multipart" to the entry so that - // entryToFileInfo() can call StatFile for regular files or getMultipartObjectInfo() for multipart files. - for i, entry := range entries { - if isXL && strings.HasSuffix(entry, slashSeparator) { - if isMultipartObject(disk, bucket, path.Join(prefixDir, entry)) { - entries[i] = strings.TrimSuffix(entry, slashSeparator) + multipartSuffix - } - } - } - sort.Sort(byMultipartFiles(entries)) + sort.Strings(entries) // Skip the empty strings for len(entries) > 0 && entries[0] == "" { entries = entries[1:] @@ -144,7 +97,7 @@ func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker str // If markerDir="four/" Search() returns the index of "four/" in the sorted // entries list so we skip all the entries till "four/" idx := sort.Search(len(entries), func(i int) bool { - return strings.TrimSuffix(entries[i], multipartSuffix) >= markerDir + return entries[i] >= markerDir }) entries = entries[idx:] *count += len(entries) @@ -176,7 +129,7 @@ func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker str } *count-- prefixMatch := "" // Valid only for first level treeWalk and empty for subdirectories. 
- if !treeWalk(layer, bucket, path.Join(prefixDir, entry), prefixMatch, markerArg, recursive, send, count) { + if !fs.treeWalk(bucket, path.Join(prefixDir, entry), prefixMatch, markerArg, recursive, send, count) { return false } continue @@ -188,7 +141,7 @@ func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker str // Ignore error and continue. continue } - if !send(treeWalkResult{fileInfo: fileInfo}) { + if !send(treeWalkResultFS{fileInfo: fileInfo}) { return false } } @@ -196,7 +149,7 @@ func treeWalk(layer ObjectLayer, bucket, prefixDir, entryPrefixMatch, marker str } // Initiate a new treeWalk in a goroutine. -func startTreeWalk(layer ObjectLayer, bucket, prefix, marker string, recursive bool) *treeWalker { +func (fs fsObjects) startTreeWalk(bucket, prefix, marker string, recursive bool) *treeWalkerFS { // Example 1 // If prefix is "one/two/three/" and marker is "one/two/three/four/five.txt" // treeWalk is called with prefixDir="one/two/three/" and marker="four/five.txt" @@ -207,8 +160,8 @@ func startTreeWalk(layer ObjectLayer, bucket, prefix, marker string, recursive b // treeWalk is called with prefixDir="one/two/" and marker="three/four/five.txt" // and entryPrefixMatch="th" - ch := make(chan treeWalkResult, maxObjectList) - walkNotify := treeWalker{ch: ch} + ch := make(chan treeWalkResultFS, maxObjectList) + walkNotify := treeWalkerFS{ch: ch} entryPrefixMatch := prefix prefixDir := "" lastIndex := strings.LastIndex(prefix, slashSeparator) @@ -220,7 +173,7 @@ func startTreeWalk(layer ObjectLayer, bucket, prefix, marker string, recursive b marker = strings.TrimPrefix(marker, prefixDir) go func() { defer close(ch) - send := func(walkResult treeWalkResult) bool { + send := func(walkResult treeWalkResultFS) bool { if count == 0 { walkResult.end = true } @@ -233,61 +186,41 @@ func startTreeWalk(layer ObjectLayer, bucket, prefix, marker string, recursive b return false } } - treeWalk(layer, bucket, prefixDir, entryPrefixMatch, marker, 
recursive, send, &count) + fs.treeWalk(bucket, prefixDir, entryPrefixMatch, marker, recursive, send, &count) }() return &walkNotify } // Save the goroutine reference in the map -func saveTreeWalk(layer ObjectLayer, params listParams, walker *treeWalker) { - var listObjectMap map[listParams][]*treeWalker - var listObjectMapMutex *sync.Mutex - switch l := layer.(type) { - case xlObjects: - listObjectMap = l.listObjectMap - listObjectMapMutex = l.listObjectMapMutex - case fsObjects: - listObjectMap = l.listObjectMap - listObjectMapMutex = l.listObjectMapMutex - } - listObjectMapMutex.Lock() - defer listObjectMapMutex.Unlock() +func (fs fsObjects) saveTreeWalk(params listParams, walker *treeWalkerFS) { + fs.listObjectMapMutex.Lock() + defer fs.listObjectMapMutex.Unlock() - walkers, _ := listObjectMap[params] + walkers, _ := fs.listObjectMap[params] walkers = append(walkers, walker) - listObjectMap[params] = walkers + fs.listObjectMap[params] = walkers } // Lookup the goroutine reference from map -func lookupTreeWalk(layer ObjectLayer, params listParams) *treeWalker { - var listObjectMap map[listParams][]*treeWalker - var listObjectMapMutex *sync.Mutex - switch l := layer.(type) { - case xlObjects: - listObjectMap = l.listObjectMap - listObjectMapMutex = l.listObjectMapMutex - case fsObjects: - listObjectMap = l.listObjectMap - listObjectMapMutex = l.listObjectMapMutex - } - listObjectMapMutex.Lock() - defer listObjectMapMutex.Unlock() +func (fs fsObjects) lookupTreeWalk(params listParams) *treeWalkerFS { + fs.listObjectMapMutex.Lock() + defer fs.listObjectMapMutex.Unlock() - if walkChs, ok := listObjectMap[params]; ok { + if walkChs, ok := fs.listObjectMap[params]; ok { for i, walkCh := range walkChs { if !walkCh.timedOut { newWalkChs := walkChs[i+1:] if len(newWalkChs) > 0 { - listObjectMap[params] = newWalkChs + fs.listObjectMap[params] = newWalkChs } else { - delete(listObjectMap, params) + delete(fs.listObjectMap, params) } return walkCh } } // As all channels are 
timed out, delete the map entry - delete(listObjectMap, params) + delete(fs.listObjectMap, params) } return nil } diff --git a/tree-walk-xl.go b/tree-walk-xl.go new file mode 100644 index 000000000..119de840d --- /dev/null +++ b/tree-walk-xl.go @@ -0,0 +1,265 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "math/rand" + "path" + "sort" + "strings" + "time" +) + +// listParams - list object params used for list object map +type listParams struct { + bucket string + recursive bool + marker string + prefix string +} + +// Tree walk result carries results of tree walking. +type treeWalkResult struct { + objInfo ObjectInfo + err error + end bool +} + +// Tree walk notify carries a channel which notifies tree walk +// results, additionally it also carries information if treeWalk +// should be timedOut. +type treeWalker struct { + ch <-chan treeWalkResult + timedOut bool +} + +// listDir - listDir. +func (xl xlObjects) listDir(bucket, prefixDir string, filter func(entry string) bool) (entries []string, err error) { + // Count for list errors encountered. + var listErrCount = 0 + + // Loop through and return the first success entry based on the + // selected random disk. + for listErrCount < len(xl.storageDisks) { + // Choose a random disk on each attempt, do not hit the same disk all the time. 
+ randIndex := rand.Intn(len(xl.storageDisks) - 1) + disk := xl.storageDisks[randIndex] // Pick a random disk. + if entries, err = disk.ListDir(bucket, prefixDir); err == nil { + // Skip the entries which do not match the filter. + for i, entry := range entries { + if filter(entry) { + entries[i] = "" + continue + } + if strings.HasSuffix(entry, slashSeparator) && xl.isObject(bucket, path.Join(prefixDir, entry)) { + entries[i] = strings.TrimSuffix(entry, slashSeparator) + } + } + sort.Strings(entries) + // Skip the empty strings + for len(entries) > 0 && entries[0] == "" { + entries = entries[1:] + } + return entries, nil + } + listErrCount++ // Update list error count. + } + + // Return error at the end. + return nil, err +} + +// getRandomDisk - gives a random disk at any point in time from the +// available disk pool. +func (xl xlObjects) getRandomDisk() (disk StorageAPI) { + randIndex := rand.Intn(len(xl.storageDisks) - 1) + disk = xl.storageDisks[randIndex] // Pick a random disk. + return disk +} + +// treeWalkXL walks directory tree recursively pushing fileInfo into the channel as and when it encounters files. +func (xl xlObjects) treeWalkXL(bucket, prefixDir, entryPrefixMatch, marker string, recursive bool, send func(treeWalkResult) bool, count *int) bool { + // Example: + // if prefixDir="one/two/three/" and marker="four/five.txt" treeWalk is recursively + // called with prefixDir="one/two/three/four/" and marker="five.txt" + + // Convert entry to FileInfo + entryToObjectInfo := func(entry string) (objInfo ObjectInfo, err error) { + if strings.HasSuffix(entry, slashSeparator) { + // Object name needs to be full path. + objInfo.Bucket = bucket + objInfo.Name = path.Join(prefixDir, entry) + objInfo.Name += slashSeparator + objInfo.IsDir = true + return objInfo, nil + } + // Set the Mode to a "regular" file. 
+ return xl.getObjectInfo(bucket, path.Join(prefixDir, entry)) + } + + var markerBase, markerDir string + if marker != "" { + // Ex: if marker="four/five.txt", markerDir="four/" markerBase="five.txt" + markerSplit := strings.SplitN(marker, slashSeparator, 2) + markerDir = markerSplit[0] + if len(markerSplit) == 2 { + markerDir += slashSeparator + markerBase = markerSplit[1] + } + } + entries, err := xl.listDir(bucket, prefixDir, func(entry string) bool { + return !strings.HasPrefix(entry, entryPrefixMatch) + }) + if err != nil { + send(treeWalkResult{err: err}) + return false + } + if len(entries) == 0 { + return true + } + + // example: + // If markerDir="four/" Search() returns the index of "four/" in the sorted + // entries list so we skip all the entries till "four/" + idx := sort.Search(len(entries), func(i int) bool { + return entries[i] >= markerDir + }) + entries = entries[idx:] + *count += len(entries) + for i, entry := range entries { + if i == 0 && markerDir == entry { + if !recursive { + // Skip as the marker would already be listed in the previous listing. + *count-- + continue + } + if recursive && !strings.HasSuffix(entry, slashSeparator) { + // We should not skip for recursive listing and if markerDir is a directory + // for ex. if marker is "four/five.txt" markerDir will be "four/" which + // should not be skipped, instead it will need to be treeWalkXL()'ed into. + + // Skip if it is a file though as it would be listed in previous listing. + *count-- + continue + } + } + + if recursive && strings.HasSuffix(entry, slashSeparator) { + // If the entry is a directory, we will need recurse into it. + markerArg := "" + if entry == markerDir { + // We need to pass "five.txt" as marker only if we are + // recursing into "four/" + markerArg = markerBase + } + *count-- + prefixMatch := "" // Valid only for first level treeWalk and empty for subdirectories. 
+ if !xl.treeWalkXL(bucket, path.Join(prefixDir, entry), prefixMatch, markerArg, recursive, send, count) { + return false + } + continue + } + *count-- + objInfo, err := entryToObjectInfo(entry) + if err != nil { + // The file got deleted in the interim between ListDir() and StatFile() + // Ignore error and continue. + continue + } + if !send(treeWalkResult{objInfo: objInfo}) { + return false + } + } + return true +} + +// Initiate a new treeWalk in a goroutine. +func (xl xlObjects) startTreeWalkXL(bucket, prefix, marker string, recursive bool) *treeWalker { + // Example 1 + // If prefix is "one/two/three/" and marker is "one/two/three/four/five.txt" + // treeWalk is called with prefixDir="one/two/three/" and marker="four/five.txt" + // and entryPrefixMatch="" + + // Example 2 + // if prefix is "one/two/th" and marker is "one/two/three/four/five.txt" + // treeWalk is called with prefixDir="one/two/" and marker="three/four/five.txt" + // and entryPrefixMatch="th" + + ch := make(chan treeWalkResult, maxObjectList) + walkNotify := treeWalker{ch: ch} + entryPrefixMatch := prefix + prefixDir := "" + lastIndex := strings.LastIndex(prefix, slashSeparator) + if lastIndex != -1 { + entryPrefixMatch = prefix[lastIndex+1:] + prefixDir = prefix[:lastIndex+1] + } + count := 0 + marker = strings.TrimPrefix(marker, prefixDir) + go func() { + defer close(ch) + send := func(walkResult treeWalkResult) bool { + if count == 0 { + walkResult.end = true + } + timer := time.After(time.Second * 60) + select { + case ch <- walkResult: + return true + case <-timer: + walkNotify.timedOut = true + return false + } + } + xl.treeWalkXL(bucket, prefixDir, entryPrefixMatch, marker, recursive, send, &count) + }() + return &walkNotify +} + +// Save the goroutine reference in the map +func (xl xlObjects) saveTreeWalkXL(params listParams, walker *treeWalker) { + xl.listObjectMapMutex.Lock() + defer xl.listObjectMapMutex.Unlock() + + walkers, _ := xl.listObjectMap[params] + walkers = append(walkers, 
walker) + + xl.listObjectMap[params] = walkers +} + +// Lookup the goroutine reference from map +func (xl xlObjects) lookupTreeWalkXL(params listParams) *treeWalker { + xl.listObjectMapMutex.Lock() + defer xl.listObjectMapMutex.Unlock() + + if walkChs, ok := xl.listObjectMap[params]; ok { + for i, walkCh := range walkChs { + if !walkCh.timedOut { + newWalkChs := walkChs[i+1:] + if len(newWalkChs) > 0 { + xl.listObjectMap[params] = newWalkChs + } else { + delete(xl.listObjectMap, params) + } + return walkCh + } + } + // As all channels are timed out, delete the map entry + delete(xl.listObjectMap, params) + } + return nil +} diff --git a/xl-erasure-v1-common.go b/xl-erasure-v1-common.go deleted file mode 100644 index 663c26878..000000000 --- a/xl-erasure-v1-common.go +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "errors" - slashpath "path" - "sync" -) - -// Get the highest integer from a given integer slice. -func highestInt(intSlice []int64) (highestInteger int64) { - highestInteger = int64(0) - for _, integer := range intSlice { - if highestInteger < integer { - highestInteger = integer - } - } - return highestInteger -} - -// Extracts file versions from partsMetadata slice and returns version slice. 
-func listFileVersions(partsMetadata []xlMetaV1, errs []error) (versions []int64) { - versions = make([]int64, len(partsMetadata)) - for index, metadata := range partsMetadata { - if errs[index] == nil { - versions[index] = metadata.Stat.Version - } else { - versions[index] = -1 - } - } - return versions -} - -// reduceError - convert collection of errors into a single -// error based on total errors and read quorum. -func (xl XL) reduceError(errs []error) error { - fileNotFoundCount := 0 - diskNotFoundCount := 0 - volumeNotFoundCount := 0 - diskAccessDeniedCount := 0 - for _, err := range errs { - if err == errFileNotFound { - fileNotFoundCount++ - } else if err == errDiskNotFound { - diskNotFoundCount++ - } else if err == errVolumeAccessDenied { - diskAccessDeniedCount++ - } else if err == errVolumeNotFound { - volumeNotFoundCount++ - } - } - // If we have errors with 'file not found' greater than - // readQuorum, return as errFileNotFound. - // else if we have errors with 'volume not found' greater than - // readQuorum, return as errVolumeNotFound. - if fileNotFoundCount > len(xl.storageDisks)-xl.readQuorum { - return errFileNotFound - } else if volumeNotFoundCount > len(xl.storageDisks)-xl.readQuorum { - return errVolumeNotFound - } - // If we have errors with disk not found equal to the - // number of disks, return as errDiskNotFound. - if diskNotFoundCount == len(xl.storageDisks) { - return errDiskNotFound - } else if diskNotFoundCount > len(xl.storageDisks)-xl.readQuorum { - // If we have errors with 'disk not found' greater than - // readQuorum, return as errFileNotFound. - return errFileNotFound - } - // If we have errors with disk not found equal to the - // number of disks, return as errDiskNotFound. - if diskAccessDeniedCount == len(xl.storageDisks) { - return errVolumeAccessDenied - } - return nil -} - -// Returns slice of online disks needed. -// - slice returing readable disks. -// - xlMetaV1 -// - bool value indicating if healing is needed. 
-// - error if any. -func (xl XL) listOnlineDisks(volume, path string) (onlineDisks []StorageAPI, mdata xlMetaV1, heal bool, err error) { - partsMetadata, errs := xl.getPartsMetadata(volume, path) - if err = xl.reduceError(errs); err != nil { - return nil, xlMetaV1{}, false, err - } - highestVersion := int64(0) - onlineDisks = make([]StorageAPI, len(xl.storageDisks)) - // List all the file versions from partsMetadata list. - versions := listFileVersions(partsMetadata, errs) - - // Get highest file version. - highestVersion = highestInt(versions) - - // Pick online disks with version set to highestVersion. - onlineDiskCount := 0 - for index, version := range versions { - if version == highestVersion { - mdata = partsMetadata[index] - onlineDisks[index] = xl.storageDisks[index] - onlineDiskCount++ - } else { - onlineDisks[index] = nil - } - } - - // If online disks count is lesser than configured disks, most - // probably we need to heal the file, additionally verify if the - // count is lesser than readQuorum, if not we throw an error. - if onlineDiskCount < len(xl.storageDisks) { - // Online disks lesser than total storage disks, needs to be - // healed. unless we do not have readQuorum. - heal = true - // Verify if online disks count are lesser than readQuorum - // threshold, return an error if yes. - if onlineDiskCount < xl.readQuorum { - return nil, xlMetaV1{}, false, errReadQuorum - } - } - return onlineDisks, mdata, heal, nil -} - -// Get file.json metadata as a map slice. -// Returns error slice indicating the failed metadata reads. -// Read lockNS() should be done by caller. 
-func (xl XL) getPartsMetadata(volume, path string) ([]xlMetaV1, []error) { - errs := make([]error, len(xl.storageDisks)) - metadataArray := make([]xlMetaV1, len(xl.storageDisks)) - xlMetaV1FilePath := slashpath.Join(path, xlMetaV1File) - var wg = &sync.WaitGroup{} - for index, disk := range xl.storageDisks { - wg.Add(1) - go func(index int, disk StorageAPI) { - defer wg.Done() - offset := int64(0) - metadataReader, err := disk.ReadFile(volume, xlMetaV1FilePath, offset) - if err != nil { - errs[index] = err - return - } - defer metadataReader.Close() - - metadata, err := xlMetaV1Decode(metadataReader) - if err != nil { - // Unable to parse file.json, set error. - errs[index] = err - return - } - metadataArray[index] = metadata - }(index, disk) - } - wg.Wait() - return metadataArray, errs -} - -// Writes/Updates `file.json` for given file. updateParts carries -// index of disks where `file.json` needs to be updated. -// -// Returns collection of errors, indexed in accordance with input -// updateParts order. -// Write lockNS() should be done by caller. -func (xl XL) updatePartsMetadata(volume, path string, metadata xlMetaV1, updateParts []bool) []error { - xlMetaV1FilePath := pathJoin(path, xlMetaV1File) - errs := make([]error, len(xl.storageDisks)) - - for index := range updateParts { - errs[index] = errors.New("Metadata not updated") - } - - for index, shouldUpdate := range updateParts { - if !shouldUpdate { - continue - } - writer, err := xl.storageDisks[index].CreateFile(volume, xlMetaV1FilePath) - errs[index] = err - if err != nil { - continue - } - err = metadata.Write(writer) - if err != nil { - errs[index] = err - safeCloseAndRemove(writer) - continue - } - writer.Close() - } - return errs -} diff --git a/xl-erasure-v1-createfile.go b/xl-erasure-v1-createfile.go deleted file mode 100644 index 45da7becb..000000000 --- a/xl-erasure-v1-createfile.go +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "fmt" - "io" - slashpath "path" - "sync" - "time" -) - -// Erasure block size. -const erasureBlockSize = 4 * 1024 * 1024 // 4MiB. - -// cleanupCreateFileOps - cleans up all the temporary files and other -// temporary data upon any failure. -func (xl XL) cleanupCreateFileOps(volume, path string, writers ...io.WriteCloser) { - closeAndRemoveWriters(writers...) - for _, disk := range xl.storageDisks { - if err := disk.DeleteFile(volume, path); err != nil { - errorIf(err, "Unable to delete file.") - } - } -} - -// Close and remove writers if they are safeFile. -func closeAndRemoveWriters(writers ...io.WriteCloser) { - for _, writer := range writers { - if err := safeCloseAndRemove(writer); err != nil { - errorIf(err, "Failed to close writer.") - } - } -} - -// WriteErasure reads predefined blocks, encodes them and writes to -// configured storage disks. -func (xl XL) writeErasure(volume, path string, reader *io.PipeReader, wcloser *waitCloser) { - // Release the block writer upon function return. - defer wcloser.release() - - partsMetadata, errs := xl.getPartsMetadata(volume, path) - - // Convert errs into meaningful err to be sent upwards if possible - // based on total number of errors and read quorum. - err := xl.reduceError(errs) - if err != nil && err != errFileNotFound { - reader.CloseWithError(err) - return - } - - // List all the file versions on existing files. 
- versions := listFileVersions(partsMetadata, errs) - // Get highest file version. - higherVersion := highestInt(versions) - // Increment to have next higher version. - higherVersion++ - - writers := make([]io.WriteCloser, len(xl.storageDisks)) - - xlMetaV1FilePath := slashpath.Join(path, xlMetaV1File) - metadataWriters := make([]io.WriteCloser, len(xl.storageDisks)) - - // Save additional erasureMetadata. - modTime := time.Now().UTC() - - createFileError := 0 - for index, disk := range xl.storageDisks { - erasurePart := slashpath.Join(path, fmt.Sprintf("file.%d", index)) - var writer io.WriteCloser - writer, err = disk.CreateFile(volume, erasurePart) - if err != nil { - // Treat errFileNameTooLong specially - if err == errFileNameTooLong { - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(err) - return - } - - createFileError++ - - // We can safely allow CreateFile errors up to len(xl.storageDisks) - xl.writeQuorum - // otherwise return failure. - if createFileError <= len(xl.storageDisks)-xl.writeQuorum { - continue - } - - // Remove previous temp writers for any failure. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(errWriteQuorum) - return - } - - // Create meta data file. - var metadataWriter io.WriteCloser - metadataWriter, err = disk.CreateFile(volume, xlMetaV1FilePath) - if err != nil { - createFileError++ - - // We can safely allow CreateFile errors up to - // len(xl.storageDisks) - xl.writeQuorum otherwise return failure. - if createFileError <= len(xl.storageDisks)-xl.writeQuorum { - continue - } - - // Remove previous temp writers for any failure. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(errWriteQuorum) - return - } - - writers[index] = writer - metadataWriters[index] = metadataWriter - } - - // Allocate 4MiB block size buffer for reading. 
- dataBuffer := make([]byte, erasureBlockSize) - var totalSize int64 // Saves total incoming stream size. - for { - // Read up to allocated block size. - var n int - n, err = io.ReadFull(reader, dataBuffer) - if err != nil { - // Any unexpected errors, close the pipe reader with error. - if err != io.ErrUnexpectedEOF && err != io.EOF { - // Remove all temp writers. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(err) - return - } - } - // At EOF break out. - if err == io.EOF { - break - } - if n > 0 { - // Split the input buffer into data and parity blocks. - var dataBlocks [][]byte - dataBlocks, err = xl.ReedSolomon.Split(dataBuffer[0:n]) - if err != nil { - // Remove all temp writers. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(err) - return - } - - // Encode parity blocks using data blocks. - err = xl.ReedSolomon.Encode(dataBlocks) - if err != nil { - // Remove all temp writers upon error. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(err) - return - } - - var wg = &sync.WaitGroup{} - var wErrs = make([]error, len(writers)) - // Loop through and write encoded data to quorum disks. - for index, writer := range writers { - if writer == nil { - continue - } - wg.Add(1) - go func(index int, writer io.Writer) { - defer wg.Done() - encodedData := dataBlocks[index] - _, wErr := writers[index].Write(encodedData) - wErrs[index] = wErr - }(index, writer) - } - wg.Wait() - for _, wErr := range wErrs { - if wErr == nil { - continue - } - // Remove all temp writers upon error. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(wErr) - return - } - - // Update total written. - totalSize += int64(n) - } - } - - // Initialize metadata map, save all erasure related metadata. 
- metadata := xlMetaV1{} - metadata.Version = "1" - metadata.Stat.Size = totalSize - metadata.Stat.ModTime = modTime - metadata.Minio.Release = minioReleaseTag - if len(xl.storageDisks) > len(writers) { - // Save file.version only if we wrote to less disks than all - // storage disks. - metadata.Stat.Version = higherVersion - } - metadata.Erasure.DataBlocks = xl.DataBlocks - metadata.Erasure.ParityBlocks = xl.ParityBlocks - metadata.Erasure.BlockSize = erasureBlockSize - - // Write all the metadata. - // below case is not handled here - // Case: when storageDisks is 16 and write quorumDisks is 13, - // meta data write failure up to 2 can be considered. - // currently we fail for any meta data writes - for _, metadataWriter := range metadataWriters { - if metadataWriter == nil { - continue - } - - // Write metadata. - err = metadata.Write(metadataWriter) - if err != nil { - // Remove temporary files. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(err) - return - } - } - - // Close all writers and metadata writers in routines. - for index, writer := range writers { - if writer == nil { - continue - } - // Safely wrote, now rename to its actual location. - if err = writer.Close(); err != nil { - // Remove all temp writers upon error. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(err) - return - } - - if metadataWriters[index] == nil { - continue - } - // Safely wrote, now rename to its actual location. - if err = metadataWriters[index].Close(); err != nil { - // Remove all temp writers upon error. - xl.cleanupCreateFileOps(volume, path, append(writers, metadataWriters...)...) - reader.CloseWithError(err) - return - } - - } - - // Close the pipe reader and return. - reader.Close() - return -} - -// CreateFile - create a file. 
-func (xl XL) CreateFile(volume, path string) (writeCloser io.WriteCloser, err error) { - if !isValidVolname(volume) { - return nil, errInvalidArgument - } - if !isValidPath(path) { - return nil, errInvalidArgument - } - - // Initialize pipe for data pipe line. - pipeReader, pipeWriter := io.Pipe() - - // Initialize a new wait closer, implements both Write and Close. - wcloser := newWaitCloser(pipeWriter) - - // Start erasure encoding in routine, reading data block by block from pipeReader. - go xl.writeErasure(volume, path, pipeReader, wcloser) - - // Return the writer, caller should start writing to this. - return wcloser, nil -} diff --git a/xl-erasure-v1-healfile.go b/xl-erasure-v1-healfile.go deleted file mode 100644 index 7ea7ec001..000000000 --- a/xl-erasure-v1-healfile.go +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "errors" - "fmt" - "io" - slashpath "path" -) - -// healHeal - heals the file at path. -func (xl XL) healFile(volume string, path string) error { - totalBlocks := xl.DataBlocks + xl.ParityBlocks - needsHeal := make([]bool, totalBlocks) - var readers = make([]io.Reader, totalBlocks) - var writers = make([]io.WriteCloser, totalBlocks) - - // List all online disks to verify if we need to heal. 
- onlineDisks, metadata, heal, err := xl.listOnlineDisks(volume, path) - if err != nil { - return err - } - if !heal { - return nil - } - - for index, disk := range onlineDisks { - if disk == nil { - needsHeal[index] = true - continue - } - erasurePart := slashpath.Join(path, fmt.Sprintf("file.%d", index)) - // If disk.ReadFile returns error and we don't have read quorum it will be taken care as - // ReedSolomon.Reconstruct() will fail later. - var reader io.ReadCloser - offset := int64(0) - if reader, err = xl.storageDisks[index].ReadFile(volume, erasurePart, offset); err == nil { - readers[index] = reader - defer reader.Close() - } - } - - // create writers for parts where healing is needed. - for index, healNeeded := range needsHeal { - if !healNeeded { - continue - } - erasurePart := slashpath.Join(path, fmt.Sprintf("file.%d", index)) - writers[index], err = xl.storageDisks[index].CreateFile(volume, erasurePart) - if err != nil { - needsHeal[index] = false - safeCloseAndRemove(writers[index]) - continue - } - } - - // Check if there is atleast one part that needs to be healed. - atleastOneHeal := false - for _, healNeeded := range needsHeal { - if healNeeded { - atleastOneHeal = true - break - } - } - if !atleastOneHeal { - // Return if healing not needed anywhere. - return nil - } - - var totalLeft = metadata.Stat.Size - for totalLeft > 0 { - // Figure out the right blockSize. - var curBlockSize int64 - if metadata.Erasure.BlockSize < totalLeft { - curBlockSize = metadata.Erasure.BlockSize - } else { - curBlockSize = totalLeft - } - // Calculate the current block size. - curBlockSize = getEncodedBlockLen(curBlockSize, metadata.Erasure.DataBlocks) - enBlocks := make([][]byte, totalBlocks) - // Loop through all readers and read. - for index, reader := range readers { - // Initialize block slice and fill the data from each parts. - // ReedSolomon.Verify() expects that slice is not nil even if the particular - // part needs healing. 
- enBlocks[index] = make([]byte, curBlockSize) - if needsHeal[index] { - // Skip reading if the part needs healing. - continue - } - if reader == nil { - // If ReadFile() had returned error, do not read from this disk. - continue - } - _, err = io.ReadFull(reader, enBlocks[index]) - if err != nil && err != io.ErrUnexpectedEOF { - enBlocks[index] = nil - } - } - - // Check blocks if they are all zero in length. - if checkBlockSize(enBlocks) == 0 { - return errDataCorrupt - } - - // Verify the blocks. - ok, err := xl.ReedSolomon.Verify(enBlocks) - if err != nil { - closeAndRemoveWriters(writers...) - return err - } - - // Verification failed, blocks require reconstruction. - if !ok { - for index, healNeeded := range needsHeal { - if healNeeded { - // Reconstructs() reconstructs the parts if the array is nil. - enBlocks[index] = nil - } - } - err = xl.ReedSolomon.Reconstruct(enBlocks) - if err != nil { - closeAndRemoveWriters(writers...) - return err - } - // Verify reconstructed blocks again. - ok, err = xl.ReedSolomon.Verify(enBlocks) - if err != nil { - closeAndRemoveWriters(writers...) - return err - } - if !ok { - // Blocks cannot be reconstructed, corrupted data. - err = errors.New("Verification failed after reconstruction, data likely corrupted.") - closeAndRemoveWriters(writers...) - return err - } - } - for index, healNeeded := range needsHeal { - if !healNeeded { - continue - } - _, err := writers[index].Write(enBlocks[index]) - if err != nil { - safeCloseAndRemove(writers[index]) - continue - } - } - totalLeft = totalLeft - metadata.Erasure.BlockSize - } - - // After successful healing Close() the writer so that the temp - // files are committed to their location. - for _, writer := range writers { - if writer == nil { - continue - } - writer.Close() - } - - // Update the quorum metadata after heal. 
- errs := xl.updatePartsMetadata(volume, path, metadata, needsHeal) - for index, healNeeded := range needsHeal { - if healNeeded && errs[index] != nil { - return errs[index] - } - } - return nil -} diff --git a/xl-erasure-v1-metadata.go b/xl-erasure-v1-metadata.go deleted file mode 100644 index e5c29ff45..000000000 --- a/xl-erasure-v1-metadata.go +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "encoding/json" - "io" - "time" -) - -// A xlMetaV1 represents a metadata header mapping keys to sets of values. -type xlMetaV1 struct { - Version string `json:"version"` - Stat struct { - Size int64 `json:"size"` - ModTime time.Time `json:"modTime"` - Version int64 `json:"version"` - } `json:"stat"` - Erasure struct { - DataBlocks int `json:"data"` - ParityBlocks int `json:"parity"` - BlockSize int64 `json:"blockSize"` - } `json:"erasure"` - Minio struct { - Release string `json:"release"` - } `json:"minio"` -} - -// Write writes a metadata in wire format. -func (m xlMetaV1) Write(writer io.Writer) error { - metadataBytes, err := json.Marshal(m) - if err != nil { - return err - } - _, err = writer.Write(metadataBytes) - return err -} - -// xlMetaV1Decode - file metadata decode. -func xlMetaV1Decode(reader io.Reader) (metadata xlMetaV1, err error) { - decoder := json.NewDecoder(reader) - // Unmarshalling failed, file possibly corrupted. 
- if err = decoder.Decode(&metadata); err != nil { - return xlMetaV1{}, err - } - return metadata, nil -} diff --git a/xl-erasure-v1.go b/xl-erasure-v1.go deleted file mode 100644 index 5858dbe6e..000000000 --- a/xl-erasure-v1.go +++ /dev/null @@ -1,546 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "errors" - "fmt" - "math/rand" - "os" - slashpath "path" - "strings" - - "path" - "sync" - - "github.com/klauspost/reedsolomon" -) - -const ( - // XL erasure metadata file. - xlMetaV1File = "file.json" -) - -// XL layer structure. -type XL struct { - ReedSolomon reedsolomon.Encoder // Erasure encoder/decoder. - DataBlocks int - ParityBlocks int - storageDisks []StorageAPI - readQuorum int - writeQuorum int -} - -// errUnexpected - returned for any unexpected error. -var errUnexpected = errors.New("Unexpected error - please report at https://github.com/minio/minio/issues") - -// newXL instantiate a new XL. -func newXL(disks []StorageAPI) (StorageAPI, error) { - // Initialize XL. - xl := &XL{} - - // Calculate data and parity blocks. - dataBlocks, parityBlocks := len(disks)/2, len(disks)/2 - - // Initialize reed solomon encoding. - rs, err := reedsolomon.New(dataBlocks, parityBlocks) - if err != nil { - return nil, err - } - - // Save the reedsolomon. 
- xl.DataBlocks = dataBlocks - xl.ParityBlocks = parityBlocks - xl.ReedSolomon = rs - - // Save all the initialized storage disks. - xl.storageDisks = disks - - // Figure out read and write quorum based on number of storage disks. - // Read quorum should be always N/2 + 1 (due to Vandermonde matrix - // erasure requirements) - xl.readQuorum = len(xl.storageDisks)/2 + 1 - - // Write quorum is assumed if we have total disks + 3 - // parity. (Need to discuss this again) - xl.writeQuorum = len(xl.storageDisks)/2 + 3 - if xl.writeQuorum > len(xl.storageDisks) { - xl.writeQuorum = len(xl.storageDisks) - } - - // Return successfully initialized. - return xl, nil -} - -// MakeVol - make a volume. -func (xl XL) MakeVol(volume string) error { - if !isValidVolname(volume) { - return errInvalidArgument - } - - // Err counters. - createVolErr := 0 // Count generic create vol errs. - volumeExistsErrCnt := 0 // Count all errVolumeExists errs. - - // Initialize sync waitgroup. - var wg = &sync.WaitGroup{} - - // Initialize list of errors. - var dErrs = make([]error, len(xl.storageDisks)) - - // Make a volume entry on all underlying storage disks. - for index, disk := range xl.storageDisks { - wg.Add(1) - // Make a volume inside a go-routine. - go func(index int, disk StorageAPI) { - defer wg.Done() - if disk == nil { - return - } - dErrs[index] = disk.MakeVol(volume) - }(index, disk) - } - - // Wait for all make vol to finish. - wg.Wait() - - // Loop through all the concocted errors. - for _, err := range dErrs { - if err == nil { - continue - } - // if volume already exists, count them. - if err == errVolumeExists { - volumeExistsErrCnt++ - continue - } - - // Update error counter separately. - createVolErr++ - } - // Return err if all disks report volume exists. - if volumeExistsErrCnt == len(xl.storageDisks) { - return errVolumeExists - } else if createVolErr > len(xl.storageDisks)-xl.writeQuorum { - // Return errWriteQuorum if errors were more than - // allowed write quorum. 
- return errWriteQuorum - } - return nil -} - -// DeleteVol - delete a volume. -func (xl XL) DeleteVol(volume string) error { - if !isValidVolname(volume) { - return errInvalidArgument - } - - // Collect if all disks report volume not found. - var volumeNotFoundErrCnt int - - var wg = &sync.WaitGroup{} - var dErrs = make([]error, len(xl.storageDisks)) - - // Remove a volume entry on all underlying storage disks. - for index, disk := range xl.storageDisks { - wg.Add(1) - // Delete volume inside a go-routine. - go func(index int, disk StorageAPI) { - defer wg.Done() - dErrs[index] = disk.DeleteVol(volume) - }(index, disk) - } - - // Wait for all the delete vols to finish. - wg.Wait() - - // Loop through concocted errors and return anything unusual. - for _, err := range dErrs { - if err != nil { - // We ignore error if errVolumeNotFound or errDiskNotFound - if err == errVolumeNotFound || err == errDiskNotFound { - volumeNotFoundErrCnt++ - continue - } - return err - } - } - // Return err if all disks report volume not found. - if volumeNotFoundErrCnt == len(xl.storageDisks) { - return errVolumeNotFound - } - return nil -} - -// ListVols - list volumes. -func (xl XL) ListVols() (volsInfo []VolInfo, err error) { - // Initialize sync waitgroup. - var wg = &sync.WaitGroup{} - - // Success vols map carries successful results of ListVols from each disks. - var successVols = make([][]VolInfo, len(xl.storageDisks)) - for index, disk := range xl.storageDisks { - wg.Add(1) // Add each go-routine to wait for. - go func(index int, disk StorageAPI) { - // Indicate wait group as finished. - defer wg.Done() - - // Initiate listing. - vlsInfo, _ := disk.ListVols() - successVols[index] = vlsInfo - }(index, disk) - } - - // For all the list volumes running in parallel to finish. - wg.Wait() - - // Loop through success vols and get aggregated usage values. 
- var vlsInfo []VolInfo - var total, free int64 - for _, vlsInfo = range successVols { - if len(vlsInfo) <= 1 { - continue - } - var vlInfo VolInfo - for _, vlInfo = range vlsInfo { - if vlInfo.Name == "" { - continue - } - break - } - free += vlInfo.Free - total += vlInfo.Total - } - - // Save the updated usage values back into the vols. - for _, vlInfo := range vlsInfo { - vlInfo.Free = free - vlInfo.Total = total - volsInfo = append(volsInfo, vlInfo) - } - - // NOTE: The assumption here is that volumes across all disks in - // readQuorum have consistent view i.e they all have same number - // of buckets. This is essentially not verified since healing - // should take care of this. - return volsInfo, nil -} - -// getAllVolInfo - list bucket volume info from all disks. -// Returns error slice indicating the failed volume stat operations. -func (xl XL) getAllVolInfo(volume string) ([]VolInfo, []error) { - // Create errs and volInfo slices of storageDisks size. - var errs = make([]error, len(xl.storageDisks)) - var volsInfo = make([]VolInfo, len(xl.storageDisks)) - - // Allocate a new waitgroup. - var wg = &sync.WaitGroup{} - for index, disk := range xl.storageDisks { - wg.Add(1) - // Stat volume on all the disks in a routine. - go func(index int, disk StorageAPI) { - defer wg.Done() - volInfo, err := disk.StatVol(volume) - if err != nil { - errs[index] = err - return - } - volsInfo[index] = volInfo - }(index, disk) - } - - // Wait for all the Stat operations to finish. - wg.Wait() - - // Return the concocted values. - return volsInfo, errs -} - -// listAllVolInfo - list all stat volume info from all disks. -// Returns -// - stat volume info for all online disks. -// - boolean to indicate if healing is necessary. -// - error if any. 
-func (xl XL) listAllVolInfo(volume string) ([]VolInfo, bool, error) { - volsInfo, errs := xl.getAllVolInfo(volume) - notFoundCount := 0 - for _, err := range errs { - if err == errVolumeNotFound { - notFoundCount++ - // If we have errors with file not found greater than allowed read - // quorum we return err as errFileNotFound. - if notFoundCount > len(xl.storageDisks)-xl.readQuorum { - return nil, false, errVolumeNotFound - } - } - } - - // Calculate online disk count. - onlineDiskCount := 0 - for index := range errs { - if errs[index] == nil { - onlineDiskCount++ - } - } - - var heal bool - // If online disks count is lesser than configured disks, most - // probably we need to heal the file, additionally verify if the - // count is lesser than readQuorum, if not we throw an error. - if onlineDiskCount < len(xl.storageDisks) { - // Online disks lesser than total storage disks, needs to be - // healed. unless we do not have readQuorum. - heal = true - // Verify if online disks count are lesser than readQuorum - // threshold, return an error if yes. - if onlineDiskCount < xl.readQuorum { - return nil, false, errReadQuorum - } - } - - // Return success. - return volsInfo, heal, nil -} - -// StatVol - get volume stat info. -func (xl XL) StatVol(volume string) (volInfo VolInfo, err error) { - if !isValidVolname(volume) { - return VolInfo{}, errInvalidArgument - } - - // List and figured out if we need healing. - volsInfo, heal, err := xl.listAllVolInfo(volume) - if err != nil { - return VolInfo{}, err - } - - // Heal for missing entries. - if heal { - go func() { - // Create volume if missing on disks. - for index, volInfo := range volsInfo { - if volInfo.Name != "" { - continue - } - // Volinfo name would be an empty string, create it. - xl.storageDisks[index].MakeVol(volume) - } - }() - } - - // Loop through all statVols, calculate the actual usage values. 
- var total, free int64 - for _, volInfo = range volsInfo { - if volInfo.Name == "" { - continue - } - free += volInfo.Free - total += volInfo.Total - } - // Update the aggregated values. - volInfo.Free = free - volInfo.Total = total - return volInfo, nil -} - -// isLeafDirectoryXL - check if a given path is leaf directory. i.e -// if it contains file xlMetaV1File -func isLeafDirectoryXL(disk StorageAPI, volume, leafPath string) (isLeaf bool) { - _, err := disk.StatFile(volume, path.Join(leafPath, xlMetaV1File)) - return err == nil -} - -// ListDir - return all the entries at the given directory path. -// If an entry is a directory it will be returned with a trailing "/". -func (xl XL) ListDir(volume, dirPath string) (entries []string, err error) { - if !isValidVolname(volume) { - return nil, errInvalidArgument - } - - // Count for list errors encountered. - var listErrCount = 0 - - // Loop through and return the first success entry based on the - // selected random disk. - for listErrCount < len(xl.storageDisks) { - // Choose a random disk on each attempt, do not hit the same disk all the time. - randIndex := rand.Intn(len(xl.storageDisks) - 1) - disk := xl.storageDisks[randIndex] // Pick a random disk. - // Initiate a list operation, if successful filter and return quickly. - if entries, err = disk.ListDir(volume, dirPath); err == nil { - for i, entry := range entries { - isLeaf := isLeafDirectoryXL(disk, volume, path.Join(dirPath, entry)) - isDir := strings.HasSuffix(entry, slashSeparator) - if isDir && isLeaf { - entries[i] = strings.TrimSuffix(entry, slashSeparator) - } - } - // We got the entries successfully return. - return entries, nil - } - listErrCount++ // Update list error count. - } - // Return error at the end. - return nil, err -} - -// Object API. 
- -// StatFile - stat a file -func (xl XL) StatFile(volume, path string) (FileInfo, error) { - if !isValidVolname(volume) { - return FileInfo{}, errInvalidArgument - } - if !isValidPath(path) { - return FileInfo{}, errInvalidArgument - } - - _, metadata, heal, err := xl.listOnlineDisks(volume, path) - if err != nil { - return FileInfo{}, err - } - - if heal { - // Heal in background safely, since we already have read quorum disks. - go func() { - hErr := xl.healFile(volume, path) - errorIf(hErr, "Unable to heal file "+volume+"/"+path+".") - }() - } - - // Return file info. - return FileInfo{ - Volume: volume, - Name: path, - Size: metadata.Stat.Size, - ModTime: metadata.Stat.ModTime, - Mode: os.FileMode(0644), - }, nil -} - -// deleteXLFiles - delete all XL backend files. -func (xl XL) deleteXLFiles(volume, path string) error { - errCount := 0 - // Update meta data file and remove part file - for index, disk := range xl.storageDisks { - erasureFilePart := slashpath.Join(path, fmt.Sprintf("file.%d", index)) - err := disk.DeleteFile(volume, erasureFilePart) - if err != nil { - errCount++ - - // We can safely allow DeleteFile errors up to len(xl.storageDisks) - xl.writeQuorum - // otherwise return failure. - if errCount <= len(xl.storageDisks)-xl.writeQuorum { - continue - } - - return err - } - - xlMetaV1FilePath := slashpath.Join(path, "file.json") - err = disk.DeleteFile(volume, xlMetaV1FilePath) - if err != nil { - errCount++ - - // We can safely allow DeleteFile errors up to len(xl.storageDisks) - xl.writeQuorum - // otherwise return failure. - if errCount <= len(xl.storageDisks)-xl.writeQuorum { - continue - } - - return err - } - } - // Return success. - return nil -} - -// DeleteFile - delete a file -func (xl XL) DeleteFile(volume, path string) error { - if !isValidVolname(volume) { - return errInvalidArgument - } - if !isValidPath(path) { - return errInvalidArgument - } - - // Delete all XL files. 
- return xl.deleteXLFiles(volume, path) -} - -// RenameFile - rename file. -func (xl XL) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) error { - // Validate inputs. - if !isValidVolname(srcVolume) { - return errInvalidArgument - } - if !isValidPath(srcPath) { - return errInvalidArgument - } - if !isValidVolname(dstVolume) { - return errInvalidArgument - } - if !isValidPath(dstPath) { - return errInvalidArgument - } - - // Initialize sync waitgroup. - var wg = &sync.WaitGroup{} - - // Initialize list of errors. - var errs = make([]error, len(xl.storageDisks)) - - // Rename file on all underlying storage disks. - for index, disk := range xl.storageDisks { - // Append "/" as srcPath and dstPath are either leaf-dirs or non-leaf-dris. - // If srcPath is an object instead of prefix we just rename the leaf-dir and - // not rename the part and metadata files separately. - wg.Add(1) - go func(index int, disk StorageAPI) { - defer wg.Done() - err := disk.RenameFile(srcVolume, retainSlash(srcPath), dstVolume, retainSlash(dstPath)) - if err != nil { - errs[index] = err - } - errs[index] = nil - }(index, disk) - } - - // Wait for all RenameFile to finish. - wg.Wait() - - // Gather err count. - var errCount = 0 - for _, err := range errs { - if err == nil { - continue - } - errCount++ - } - // We can safely allow RenameFile errors up to len(xl.storageDisks) - xl.writeQuorum - // otherwise return failure. Cleanup successful renames. - if errCount > len(xl.storageDisks)-xl.writeQuorum { - // Special condition if readQuorum exists, then return success. - if errCount <= len(xl.storageDisks)-xl.readQuorum { - return nil - } - // Ignore errors here, delete all successfully written files. 
- xl.deleteXLFiles(dstVolume, dstPath) - return errWriteQuorum - } - return nil -} diff --git a/xl-objects-multipart.go b/xl-objects-multipart.go deleted file mode 100644 index 6a8d6e081..000000000 --- a/xl-objects-multipart.go +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "encoding/json" - "fmt" - "io" - "path" - "strings" - "sync" - "time" -) - -// MultipartPartInfo Info of each part kept in the multipart metadata file after -// CompleteMultipartUpload() is called. -type MultipartPartInfo struct { - PartNumber int - ETag string - Size int64 -} - -// MultipartObjectInfo - contents of the multipart metadata file after -// CompleteMultipartUpload() is called. -type MultipartObjectInfo struct { - Parts []MultipartPartInfo - ModTime time.Time - Size int64 - MD5Sum string - ContentType string - ContentEncoding string - // Add more fields here. -} - -type byMultipartFiles []string - -func (files byMultipartFiles) Len() int { return len(files) } -func (files byMultipartFiles) Less(i, j int) bool { - first := strings.TrimSuffix(files[i], multipartSuffix) - second := strings.TrimSuffix(files[j], multipartSuffix) - return first < second -} -func (files byMultipartFiles) Swap(i, j int) { files[i], files[j] = files[j], files[i] } - -// GetPartNumberOffset - given an offset for the whole object, return the part and offset in that part. 
-func (m MultipartObjectInfo) GetPartNumberOffset(offset int64) (partIndex int, partOffset int64, err error) { - partOffset = offset - for i, part := range m.Parts { - partIndex = i - if partOffset < part.Size { - return - } - partOffset -= part.Size - } - // Offset beyond the size of the object - err = errUnexpected - return -} - -// getMultipartObjectMeta - incomplete meta file and extract meta information if any. -func getMultipartObjectMeta(storage StorageAPI, metaFile string) (meta map[string]string, err error) { - meta = make(map[string]string) - offset := int64(0) - objMetaReader, err := storage.ReadFile(minioMetaBucket, metaFile, offset) - if err != nil { - return nil, err - } - // Close the metadata reader. - defer objMetaReader.Close() - - decoder := json.NewDecoder(objMetaReader) - err = decoder.Decode(&meta) - if err != nil { - return nil, err - } - return meta, nil -} - -func partNumToPartFileName(partNum int) string { - return fmt.Sprintf("%.5d%s", partNum, multipartSuffix) -} - -// ListMultipartUploads - list multipart uploads. -func (xl xlObjects) ListMultipartUploads(bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { - return listMultipartUploadsCommon(xl, bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads) -} - -// NewMultipartUpload - initialize a new multipart upload, returns a unique id. -func (xl xlObjects) NewMultipartUpload(bucket, object string, meta map[string]string) (string, error) { - return newMultipartUploadCommon(xl.storage, bucket, object, meta) -} - -// PutObjectPart - writes the multipart upload chunks. -func (xl xlObjects) PutObjectPart(bucket, object, uploadID string, partID int, size int64, data io.Reader, md5Hex string) (string, error) { - return putObjectPartCommon(xl.storage, bucket, object, uploadID, partID, size, data, md5Hex) -} - -// ListObjectParts - list object parts. 
-func (xl xlObjects) ListObjectParts(bucket, object, uploadID string, partNumberMarker, maxParts int) (ListPartsInfo, error) { - return listObjectPartsCommon(xl.storage, bucket, object, uploadID, partNumberMarker, maxParts) -} - -// This function does the following check, suppose -// object is "a/b/c/d", stat makes sure that objects ""a/b/c"" -// "a/b" and "a" do not exist. -func (xl xlObjects) parentDirIsObject(bucket, parent string) error { - var stat func(string) error - stat = func(p string) error { - if p == "." { - return nil - } - _, err := xl.getObjectInfo(bucket, p) - if err == nil { - // If there is already a file at prefix "p" return error. - return errFileAccessDenied - } - if err == errFileNotFound { - // Check if there is a file as one of the parent paths. - return stat(path.Dir(p)) - } - return err - } - return stat(parent) -} - -func (xl xlObjects) CompleteMultipartUpload(bucket string, object string, uploadID string, parts []completePart) (string, error) { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return "", BucketNameInvalid{Bucket: bucket} - } - // Verify whether the bucket exists. - if !isBucketExist(xl.storage, bucket) { - return "", BucketNotFound{Bucket: bucket} - } - if !IsValidObjectName(object) { - return "", ObjectNameInvalid{ - Bucket: bucket, - Object: object, - } - } - if !isUploadIDExists(xl.storage, bucket, object, uploadID) { - return "", InvalidUploadID{UploadID: uploadID} - } - // Hold lock so that - // 1) no one aborts this multipart upload - // 2) no one does a parallel complete-multipart-upload on this multipart upload - nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) - defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) - - // Calculate s3 compatible md5sum for complete multipart. - s3MD5, err := completeMultipartMD5(parts...) 
- if err != nil { - return "", err - } - - var metadata = MultipartObjectInfo{} - var errs = make([]error, len(parts)) - - uploadIDIncompletePath := path.Join(mpartMetaPrefix, bucket, object, uploadID, incompleteFile) - objMeta, err := getMultipartObjectMeta(xl.storage, uploadIDIncompletePath) - if err != nil { - return "", toObjectErr(err, minioMetaBucket, uploadIDIncompletePath) - } - - // Waitgroup to wait for go-routines. - var wg = &sync.WaitGroup{} - - // Loop through all parts, validate them and then commit to disk. - for i, part := range parts { - // Construct part suffix. - partSuffix := fmt.Sprintf("%.5d.%s", part.PartNumber, part.ETag) - multipartPartFile := path.Join(mpartMetaPrefix, bucket, object, uploadID, partSuffix) - var fi FileInfo - fi, err = xl.storage.StatFile(minioMetaBucket, multipartPartFile) - if err != nil { - if err == errFileNotFound { - return "", InvalidPart{} - } - return "", err - } - // All parts except the last part has to be atleast 5MB. - if (i < len(parts)-1) && !isMinAllowedPartSize(fi.Size) { - return "", PartTooSmall{} - } - // Update metadata parts. - metadata.Parts = append(metadata.Parts, MultipartPartInfo{ - PartNumber: part.PartNumber, - ETag: part.ETag, - Size: fi.Size, - }) - metadata.Size += fi.Size - } - - // check if an object is present as one of the parent dir. - if err = xl.parentDirIsObject(bucket, path.Dir(object)); err != nil { - return "", toObjectErr(err, bucket, object) - } - - // Save successfully calculated md5sum. - metadata.MD5Sum = s3MD5 - metadata.ContentType = objMeta["content-type"] - metadata.ContentEncoding = objMeta["content-encoding"] - - // Save modTime as well as the current time. - metadata.ModTime = time.Now().UTC() - - // Create temporary multipart meta file to write and then rename. 
- multipartMetaSuffix := fmt.Sprintf("%s.%s", uploadID, multipartMetaFile) - tempMultipartMetaFile := path.Join(tmpMetaPrefix, bucket, object, multipartMetaSuffix) - w, err := xl.storage.CreateFile(minioMetaBucket, tempMultipartMetaFile) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - encoder := json.NewEncoder(w) - err = encoder.Encode(&metadata) - if err != nil { - if err = safeCloseAndRemove(w); err != nil { - return "", toObjectErr(err, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - // Close the writer. - if err = w.Close(); err != nil { - if err = safeCloseAndRemove(w); err != nil { - return "", toObjectErr(err, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - - // Attempt a Rename of multipart meta file to final namespace. - multipartObjFile := path.Join(mpartMetaPrefix, bucket, object, uploadID, multipartMetaFile) - err = xl.storage.RenameFile(minioMetaBucket, tempMultipartMetaFile, minioMetaBucket, multipartObjFile) - if err != nil { - if derr := xl.storage.DeleteFile(minioMetaBucket, tempMultipartMetaFile); derr != nil { - return "", toObjectErr(err, minioMetaBucket, tempMultipartMetaFile) - } - return "", toObjectErr(err, bucket, multipartObjFile) - } - - // Loop through and atomically rename the parts to their actual location. - for index, part := range parts { - wg.Add(1) - go func(index int, part completePart) { - defer wg.Done() - partSuffix := fmt.Sprintf("%.5d.%s", part.PartNumber, part.ETag) - src := path.Join(mpartMetaPrefix, bucket, object, uploadID, partSuffix) - dst := path.Join(mpartMetaPrefix, bucket, object, uploadID, partNumToPartFileName(part.PartNumber)) - errs[index] = xl.storage.RenameFile(minioMetaBucket, src, minioMetaBucket, dst) - errorIf(errs[index], "Unable to rename file %s to %s.", src, dst) - }(index, part) - } - - // Wait for all the renames to finish. - wg.Wait() - - // Loop through errs list and return first error. 
- for _, err := range errs { - if err != nil { - return "", toObjectErr(err, bucket, object) - } - } - - // Delete the incomplete file place holder. - err = xl.storage.DeleteFile(minioMetaBucket, uploadIDIncompletePath) - if err != nil { - return "", toObjectErr(err, minioMetaBucket, uploadIDIncompletePath) - } - - // Hold write lock on the destination before rename - nsMutex.Lock(bucket, object) - defer nsMutex.Unlock(bucket, object) - - // Delete if an object already exists. - // FIXME: rename it to tmp file and delete only after - // the newly uploaded file is renamed from tmp location to - // the original location. - // Verify if the object is a multipart object. - if isMultipartObject(xl.storage, bucket, object) { - err = xl.deleteMultipartObject(bucket, object) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - return s3MD5, nil - } - err = xl.deleteObject(bucket, object) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - - uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID) - if err = xl.storage.RenameFile(minioMetaBucket, uploadIDPath, bucket, object); err != nil { - return "", toObjectErr(err, bucket, object) - } - - // Hold the lock so that two parallel complete-multipart-uploads do no - // leave a stale uploads.json behind. - nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) - defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) - - // Validate if there are other incomplete upload-id's present for - // the object, if yes do not attempt to delete 'uploads.json'. 
- var entries []string - if entries, err = xl.storage.ListDir(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object)); err == nil { - if len(entries) > 1 { - return s3MD5, nil - } - } - - uploadsJSONPath := path.Join(mpartMetaPrefix, bucket, object, uploadsJSONFile) - err = xl.storage.DeleteFile(minioMetaBucket, uploadsJSONPath) - if err != nil { - return "", toObjectErr(err, minioMetaBucket, uploadsJSONPath) - } - - // Return md5sum. - return s3MD5, nil -} - -// AbortMultipartUpload - aborts a multipart upload. -func (xl xlObjects) AbortMultipartUpload(bucket, object, uploadID string) error { - return abortMultipartUploadCommon(xl.storage, bucket, object, uploadID) -} diff --git a/xl-objects.go b/xl-objects.go deleted file mode 100644 index 91e4758e9..000000000 --- a/xl-objects.go +++ /dev/null @@ -1,581 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "crypto/md5" - "encoding/hex" - "encoding/json" - "errors" - "fmt" - "io" - "path" - "path/filepath" - "strings" - "sync" - - "github.com/minio/minio/pkg/mimedb" -) - -const ( - multipartSuffix = ".minio.multipart" - multipartMetaFile = "00000" + multipartSuffix - formatConfigFile = "format.json" -) - -// xlObjects - Implements fs object layer. 
-type xlObjects struct { - storage StorageAPI - listObjectMap map[listParams][]*treeWalker - listObjectMapMutex *sync.Mutex -} - -// errMaxDisks - returned for reached maximum of disks. -var errMaxDisks = errors.New("Number of disks are higher than supported maximum count '16'") - -// errMinDisks - returned for minimum number of disks. -var errMinDisks = errors.New("Number of disks are smaller than supported minimum count '8'") - -// errNumDisks - returned for odd number of disks. -var errNumDisks = errors.New("Number of disks should be multiples of '2'") - -const ( - // Maximum erasure blocks. - maxErasureBlocks = 16 - // Minimum erasure blocks. - minErasureBlocks = 8 -) - -func checkSufficientDisks(disks []string) error { - // Verify total number of disks. - totalDisks := len(disks) - if totalDisks > maxErasureBlocks { - return errMaxDisks - } - if totalDisks < minErasureBlocks { - return errMinDisks - } - - // isEven function to verify if a given number if even. - isEven := func(number int) bool { - return number%2 == 0 - } - - // Verify if we have even number of disks. - // only combination of 8, 10, 12, 14, 16 are supported. - if !isEven(totalDisks) { - return errNumDisks - } - - return nil -} - -// Depending on the disk type network or local, initialize storage layer. -func newStorageLayer(disk string) (storage StorageAPI, err error) { - if !strings.ContainsRune(disk, ':') || filepath.VolumeName(disk) != "" { - // Initialize filesystem storage API. - return newPosix(disk) - } - // Initialize rpc client storage API. - return newRPCClient(disk) -} - -// Initialize all storage disks to bootstrap. -func bootstrapDisks(disks []string) ([]StorageAPI, error) { - storageDisks := make([]StorageAPI, len(disks)) - for index, disk := range disks { - var err error - // Intentionally ignore disk not found errors while - // initializing POSIX, so that we have successfully - // initialized posix Storage. 
Subsequent calls to XL/Erasure - // will manage any errors related to disks. - storageDisks[index], err = newStorageLayer(disk) - if err != nil && err != errDiskNotFound { - return nil, err - } - } - return storageDisks, nil -} - -// newXLObjects - initialize new xl object layer. -func newXLObjects(disks []string) (ObjectLayer, error) { - if err := checkSufficientDisks(disks); err != nil { - return nil, err - } - - storageDisks, err := bootstrapDisks(disks) - if err != nil { - return nil, err - } - - // Initialize object layer - like creating minioMetaBucket, cleaning up tmp files etc. - initObjectLayer(storageDisks...) - - // Load saved XL format.json and validate. - newDisks, err := loadFormatXL(storageDisks) - if err != nil { - switch err { - case errUnformattedDisk: - // Save new XL format. - errSave := initFormatXL(storageDisks) - if errSave != nil { - return nil, errSave - } - newDisks = storageDisks - default: - // errCorruptedDisk - error. - return nil, fmt.Errorf("Unable to recognize backend format, %s", err) - } - } - - // FIXME: healFormatXL(newDisks) - - storage, err := newXL(newDisks) - if err != nil { - return nil, err - } - - // Return successfully initialized object layer. - return xlObjects{ - storage: storage, - listObjectMap: make(map[listParams][]*treeWalker), - listObjectMapMutex: &sync.Mutex{}, - }, nil -} - -/// Bucket operations - -// MakeBucket - make a bucket. -func (xl xlObjects) MakeBucket(bucket string) error { - nsMutex.Lock(bucket, "") - defer nsMutex.Unlock(bucket, "") - return makeBucket(xl.storage, bucket) -} - -// GetBucketInfo - get bucket info. -func (xl xlObjects) GetBucketInfo(bucket string) (BucketInfo, error) { - nsMutex.RLock(bucket, "") - defer nsMutex.RUnlock(bucket, "") - return getBucketInfo(xl.storage, bucket) -} - -// ListBuckets - list buckets. -func (xl xlObjects) ListBuckets() ([]BucketInfo, error) { - return listBuckets(xl.storage) -} - -// DeleteBucket - delete a bucket. 
-func (xl xlObjects) DeleteBucket(bucket string) error { - nsMutex.Lock(bucket, "") - nsMutex.Unlock(bucket, "") - return deleteBucket(xl.storage, bucket) -} - -/// Object Operations - -// GetObject - get an object. -func (xl xlObjects) GetObject(bucket, object string, startOffset int64) (io.ReadCloser, error) { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return nil, BucketNameInvalid{Bucket: bucket} - } - // Verify if object is valid. - if !IsValidObjectName(object) { - return nil, ObjectNameInvalid{Bucket: bucket, Object: object} - } - nsMutex.RLock(bucket, object) - defer nsMutex.RUnlock(bucket, object) - if !isMultipartObject(xl.storage, bucket, object) { - _, err := xl.storage.StatFile(bucket, object) - if err == nil { - var reader io.ReadCloser - reader, err = xl.storage.ReadFile(bucket, object, startOffset) - if err != nil { - return nil, toObjectErr(err, bucket, object) - } - return reader, nil - } - return nil, toObjectErr(err, bucket, object) - } - fileReader, fileWriter := io.Pipe() - info, err := getMultipartObjectInfo(xl.storage, bucket, object) - if err != nil { - return nil, toObjectErr(err, bucket, object) - } - partIndex, offset, err := info.GetPartNumberOffset(startOffset) - if err != nil { - return nil, toObjectErr(err, bucket, object) - } - - // Hold a read lock once more which can be released after the following go-routine ends. - // We hold RLock once more because the current function would return before the go routine below - // executes and hence releasing the read lock (because of defer'ed nsMutex.RUnlock() call). 
- nsMutex.RLock(bucket, object) - go func() { - defer nsMutex.RUnlock(bucket, object) - for ; partIndex < len(info.Parts); partIndex++ { - part := info.Parts[partIndex] - r, err := xl.storage.ReadFile(bucket, pathJoin(object, partNumToPartFileName(part.PartNumber)), offset) - if err != nil { - fileWriter.CloseWithError(err) - return - } - // Reset offset to 0 as it would be non-0 only for the first loop if startOffset is non-0. - offset = 0 - if _, err = io.Copy(fileWriter, r); err != nil { - switch reader := r.(type) { - case *io.PipeReader: - reader.CloseWithError(err) - case io.ReadCloser: - reader.Close() - } - fileWriter.CloseWithError(err) - return - } - // Close the readerCloser that reads multiparts of an object from the xl storage layer. - // Not closing leaks underlying file descriptors. - r.Close() - } - fileWriter.Close() - }() - return fileReader, nil -} - -// Return the partsInfo of a special multipart object. -func getMultipartObjectInfo(storage StorageAPI, bucket, object string) (info MultipartObjectInfo, err error) { - offset := int64(0) - r, err := storage.ReadFile(bucket, pathJoin(object, multipartMetaFile), offset) - if err != nil { - return MultipartObjectInfo{}, err - } - decoder := json.NewDecoder(r) - err = decoder.Decode(&info) - if err != nil { - return MultipartObjectInfo{}, err - } - return info, nil -} - -// Return ObjectInfo. -func (xl xlObjects) getObjectInfo(bucket, object string) (objInfo ObjectInfo, err error) { - objInfo.Bucket = bucket - objInfo.Name = object - // First see if the object was a simple-PUT upload. - fi, err := xl.storage.StatFile(bucket, object) - if err != nil { - if err != errFileNotFound { - return ObjectInfo{}, err - } - var info MultipartObjectInfo - // Check if the object was multipart upload. 
- info, err = getMultipartObjectInfo(xl.storage, bucket, object) - if err != nil { - return ObjectInfo{}, err - } - objInfo.Size = info.Size - objInfo.ModTime = info.ModTime - objInfo.MD5Sum = info.MD5Sum - objInfo.ContentType = info.ContentType - objInfo.ContentEncoding = info.ContentEncoding - } else { - metadata := make(map[string]string) - offset := int64(0) // To read entire content - r, err := xl.storage.ReadFile(bucket, pathJoin(object, "meta.json"), offset) - if err != nil { - return ObjectInfo{}, toObjectErr(err, bucket, object) - } - decoder := json.NewDecoder(r) - if err = decoder.Decode(&metadata); err != nil { - return ObjectInfo{}, toObjectErr(err, bucket, object) - } - contentType := metadata["content-type"] - if len(contentType) == 0 { - contentType = "application/octet-stream" - if objectExt := filepath.Ext(object); objectExt != "" { - content, ok := mimedb.DB[strings.ToLower(strings.TrimPrefix(objectExt, "."))] - if ok { - contentType = content.ContentType - } - } - } - objInfo.Size = fi.Size - objInfo.IsDir = fi.Mode.IsDir() - objInfo.ModTime = fi.ModTime - objInfo.MD5Sum = metadata["md5Sum"] - objInfo.ContentType = contentType - objInfo.ContentEncoding = metadata["content-encoding"] - } - return objInfo, nil -} - -// GetObjectInfo - get object info. -func (xl xlObjects) GetObjectInfo(bucket, object string) (ObjectInfo, error) { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return ObjectInfo{}, BucketNameInvalid{Bucket: bucket} - } - // Verify if object is valid. - if !IsValidObjectName(object) { - return ObjectInfo{}, ObjectNameInvalid{Bucket: bucket, Object: object} - } - nsMutex.RLock(bucket, object) - defer nsMutex.RUnlock(bucket, object) - info, err := xl.getObjectInfo(bucket, object) - if err != nil { - return ObjectInfo{}, toObjectErr(err, bucket, object) - } - return info, nil -} - -// PutObject - create an object. 
-func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.Reader, metadata map[string]string) (string, error) { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return "", BucketNameInvalid{Bucket: bucket} - } - // Verify bucket exists. - if !isBucketExist(xl.storage, bucket) { - return "", BucketNotFound{Bucket: bucket} - } - if !IsValidObjectName(object) { - return "", ObjectNameInvalid{ - Bucket: bucket, - Object: object, - } - } - // No metadata is set, allocate a new one. - if metadata == nil { - metadata = make(map[string]string) - } - nsMutex.Lock(bucket, object) - defer nsMutex.Unlock(bucket, object) - - tempObj := path.Join(tmpMetaPrefix, bucket, object) - fileWriter, err := xl.storage.CreateFile(minioMetaBucket, tempObj) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - - // Initialize md5 writer. - md5Writer := md5.New() - - // Instantiate a new multi writer. - multiWriter := io.MultiWriter(md5Writer, fileWriter) - - // Instantiate checksum hashers and create a multiwriter. - if size > 0 { - if _, err = io.CopyN(multiWriter, data, size); err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - } else { - if _, err = io.Copy(multiWriter, data); err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - } - - newMD5Hex := hex.EncodeToString(md5Writer.Sum(nil)) - // Update the md5sum if not set with the newly calculated one. - if len(metadata["md5Sum"]) == 0 { - metadata["md5Sum"] = newMD5Hex - } - - // md5Hex representation. 
- md5Hex := metadata["md5Sum"] - if md5Hex != "" { - if newMD5Hex != md5Hex { - if err = safeCloseAndRemove(fileWriter); err != nil { - return "", toObjectErr(err, bucket, object) - } - return "", BadDigest{md5Hex, newMD5Hex} - } - } - - err = fileWriter.Close() - if err != nil { - if clErr := safeCloseAndRemove(fileWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - - // Check if an object is present as one of the parent dir. - if err = xl.parentDirIsObject(bucket, path.Dir(object)); err != nil { - return "", toObjectErr(err, bucket, object) - } - - // Delete if an object already exists. - // FIXME: rename it to tmp file and delete only after - // the newly uploaded file is renamed from tmp location to - // the original location. - // Verify if the object is a multipart object. - if isMultipartObject(xl.storage, bucket, object) { - err = xl.deleteMultipartObject(bucket, object) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - } else { - err = xl.deleteObject(bucket, object) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - } - - err = xl.storage.RenameFile(minioMetaBucket, tempObj, bucket, object) - if err != nil { - if dErr := xl.storage.DeleteFile(minioMetaBucket, tempObj); dErr != nil { - return "", toObjectErr(dErr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - - tempMetaJSONFile := path.Join(tmpMetaPrefix, bucket, object, "meta.json") - metaWriter, err := xl.storage.CreateFile(minioMetaBucket, tempMetaJSONFile) - if err != nil { - return "", toObjectErr(err, bucket, object) - } - - encoder := json.NewEncoder(metaWriter) - err = encoder.Encode(&metadata) - if err != nil { - if clErr := safeCloseAndRemove(metaWriter); clErr != nil { - return "", toObjectErr(clErr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - if err = metaWriter.Close(); err != nil { - if err = safeCloseAndRemove(metaWriter); 
err != nil { - return "", toObjectErr(err, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - - metaJSONFile := path.Join(object, "meta.json") - err = xl.storage.RenameFile(minioMetaBucket, tempMetaJSONFile, bucket, metaJSONFile) - if err != nil { - if derr := xl.storage.DeleteFile(minioMetaBucket, tempMetaJSONFile); derr != nil { - return "", toObjectErr(derr, bucket, object) - } - return "", toObjectErr(err, bucket, object) - } - - // Return md5sum, successfully wrote object. - return newMD5Hex, nil -} - -// isMultipartObject - verifies if an object is special multipart file. -func isMultipartObject(storage StorageAPI, bucket, object string) bool { - _, err := storage.StatFile(bucket, pathJoin(object, multipartMetaFile)) - if err != nil { - if err == errFileNotFound { - return false - } - errorIf(err, "Failed to stat file "+bucket+pathJoin(object, multipartMetaFile)) - return false - } - return true -} - -// deleteMultipartObject - deletes only multipart object. -func (xl xlObjects) deleteMultipartObject(bucket, object string) error { - // Get parts info. - info, err := getMultipartObjectInfo(xl.storage, bucket, object) - if err != nil { - return err - } - // Range through all files and delete it. - var wg = &sync.WaitGroup{} - var errs = make([]error, len(info.Parts)) - for index, part := range info.Parts { - wg.Add(1) - // Start deleting parts in routine. - go func(index int, part MultipartPartInfo) { - defer wg.Done() - partFileName := partNumToPartFileName(part.PartNumber) - errs[index] = xl.storage.DeleteFile(bucket, pathJoin(object, partFileName)) - }(index, part) - } - // Wait for all the deletes to finish. - wg.Wait() - // Loop through and validate if any errors, if we are unable to remove any part return - // "unexpected" error as returning any other error might be misleading. For ex. - // if DeleteFile() had returned errFileNotFound and we return it, then client would see - // ObjectNotFound which is misleading. 
- for _, err := range errs { - if err != nil { - return errUnexpected - } - } - err = xl.storage.DeleteFile(bucket, pathJoin(object, multipartMetaFile)) - if err != nil { - return err - } - return nil -} - -// deleteObject - deletes a regular object. -func (xl xlObjects) deleteObject(bucket, object string) error { - metaJSONFile := path.Join(object, "meta.json") - // Ignore if meta.json file doesn't exist. - if err := xl.storage.DeleteFile(bucket, metaJSONFile); err != nil { - if err != errFileNotFound { - return err - } - } - if err := xl.storage.DeleteFile(bucket, object); err != nil { - if err != errFileNotFound { - return err - } - } - return nil -} - -// DeleteObject - delete the object. -func (xl xlObjects) DeleteObject(bucket, object string) error { - // Verify if bucket is valid. - if !IsValidBucketName(bucket) { - return BucketNameInvalid{Bucket: bucket} - } - if !IsValidObjectName(object) { - return ObjectNameInvalid{Bucket: bucket, Object: object} - } - nsMutex.Lock(bucket, object) - defer nsMutex.Unlock(bucket, object) - // Verify if the object is a multipart object. - if isMultipartObject(xl.storage, bucket, object) { - err := xl.deleteMultipartObject(bucket, object) - if err != nil { - return toObjectErr(err, bucket, object) - } - return nil - } - err := xl.deleteObject(bucket, object) - if err != nil { - return toObjectErr(err, bucket, object) - } - return nil -} - -// ListObjects - list all objects at prefix, delimited by '/'. -func (xl xlObjects) ListObjects(bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { - return listObjectsCommon(xl, bucket, prefix, marker, delimiter, maxKeys) -} diff --git a/xl-v1-bucket.go b/xl-v1-bucket.go new file mode 100644 index 000000000..99158b6b3 --- /dev/null +++ b/xl-v1-bucket.go @@ -0,0 +1,355 @@ +package main + +import ( + "sort" + "sync" +) + +/// Bucket operations + +// MakeBucket - make a bucket. 
+func (xl xlObjects) MakeBucket(bucket string) error { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketNameInvalid{Bucket: bucket} + } + + nsMutex.Lock(bucket, "") + defer nsMutex.Unlock(bucket, "") + + // Err counters. + createVolErr := 0 // Count generic create vol errs. + volumeExistsErrCnt := 0 // Count all errVolumeExists errs. + + // Initialize sync waitgroup. + var wg = &sync.WaitGroup{} + + // Initialize list of errors. + var dErrs = make([]error, len(xl.storageDisks)) + + // Make a volume entry on all underlying storage disks. + for index, disk := range xl.storageDisks { + wg.Add(1) + // Make a volume inside a go-routine. + go func(index int, disk StorageAPI) { + defer wg.Done() + err := disk.MakeVol(bucket) + if err != nil { + dErrs[index] = err + return + } + dErrs[index] = nil + }(index, disk) + } + + // Wait for all make vol to finish. + wg.Wait() + + // Loop through all the concocted errors. + for _, err := range dErrs { + if err == nil { + continue + } + // if volume already exists, count them. + if err == errVolumeExists { + volumeExistsErrCnt++ + continue + } + + // Update error counter separately. + createVolErr++ + } + + // Return err if all disks report volume exists. + if volumeExistsErrCnt == len(xl.storageDisks) { + return toObjectErr(errVolumeExists, bucket) + } else if createVolErr > len(xl.storageDisks)-xl.writeQuorum { + // Return errWriteQuorum if errors were more than + // allowed write quorum. + return toObjectErr(errWriteQuorum, bucket) + } + return nil +} + +// getAllBucketInfo - list bucket info from all disks. +// Returns error slice indicating the failed volume stat operations. +func (xl xlObjects) getAllBucketInfo(bucketName string) ([]BucketInfo, []error) { + // Create errs and volInfo slices of storageDisks size. + var errs = make([]error, len(xl.storageDisks)) + var volsInfo = make([]VolInfo, len(xl.storageDisks)) + + // Allocate a new waitgroup. 
+ var wg = &sync.WaitGroup{} + for index, disk := range xl.storageDisks { + wg.Add(1) + // Stat volume on all the disks in a routine. + go func(index int, disk StorageAPI) { + defer wg.Done() + volInfo, err := disk.StatVol(bucketName) + if err != nil { + errs[index] = err + return + } + volsInfo[index] = volInfo + errs[index] = nil + }(index, disk) + } + + // Wait for all the Stat operations to finish. + wg.Wait() + + // Return the concocted values. + var bucketsInfo = make([]BucketInfo, len(xl.storageDisks)) + for _, volInfo := range volsInfo { + if IsValidBucketName(volInfo.Name) { + bucketsInfo = append(bucketsInfo, BucketInfo{ + Name: volInfo.Name, + Created: volInfo.Created, + }) + } + } + return bucketsInfo, errs +} + +// listAllBucketInfo - list all stat volume info from all disks. +// Returns +// - stat volume info for all online disks. +// - boolean to indicate if healing is necessary. +// - error if any. +func (xl xlObjects) listAllBucketInfo(bucketName string) ([]BucketInfo, bool, error) { + bucketsInfo, errs := xl.getAllBucketInfo(bucketName) + notFoundCount := 0 + for _, err := range errs { + if err == errVolumeNotFound { + notFoundCount++ + // If we have errors with file not found greater than allowed read + // quorum we return err as errFileNotFound. + if notFoundCount > len(xl.storageDisks)-xl.readQuorum { + return nil, false, errVolumeNotFound + } + } + } + + // Calculate online disk count. + onlineDiskCount := 0 + for index := range errs { + if errs[index] == nil { + onlineDiskCount++ + } + } + + var heal bool + // If online disks count is lesser than configured disks, most + // probably we need to heal the file, additionally verify if the + // count is lesser than readQuorum, if not we throw an error. + if onlineDiskCount < len(xl.storageDisks) { + // Online disks lesser than total storage disks, needs to be + // healed. unless we do not have readQuorum. 
+ heal = true + // Verify if online disks count are lesser than readQuorum + // threshold, return an error if yes. + if onlineDiskCount < xl.readQuorum { + return nil, false, errReadQuorum + } + } + + // Return success. + return bucketsInfo, heal, nil +} + +// Checks whether bucket exists. +func (xl xlObjects) isBucketExist(bucketName string) bool { + // Check whether bucket exists. + _, _, err := xl.listAllBucketInfo(bucketName) + if err != nil { + if err == errVolumeNotFound { + return false + } + errorIf(err, "Stat failed on bucket "+bucketName+".") + return false + } + return true +} + +// GetBucketInfo - get bucket info. +func (xl xlObjects) GetBucketInfo(bucket string) (BucketInfo, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketInfo{}, BucketNameInvalid{Bucket: bucket} + } + + nsMutex.RLock(bucket, "") + defer nsMutex.RUnlock(bucket, "") + + // List and figured out if we need healing. + bucketsInfo, heal, err := xl.listAllBucketInfo(bucket) + if err != nil { + return BucketInfo{}, toObjectErr(err, bucket) + } + + // Heal for missing entries. + if heal { + go func() { + // Create bucket if missing on disks. + for index, bktInfo := range bucketsInfo { + if bktInfo.Name != "" { + continue + } + // Bucketinfo name would be an empty string, create it. + xl.storageDisks[index].MakeVol(bucket) + } + }() + } + + // Loop through all statVols, calculate the actual usage values. + var total, free int64 + var bucketInfo BucketInfo + for _, bucketInfo = range bucketsInfo { + if bucketInfo.Name == "" { + continue + } + free += bucketInfo.Free + total += bucketInfo.Total + } + // Update the aggregated values. + bucketInfo.Free = free + bucketInfo.Total = total + + return BucketInfo{ + Name: bucket, + Created: bucketInfo.Created, + Total: bucketInfo.Total, + Free: bucketInfo.Free, + }, nil +} + +func (xl xlObjects) listBuckets() ([]BucketInfo, error) { + // Initialize sync waitgroup. 
+ var wg = &sync.WaitGroup{} + + // Success vols map carries successful results of ListVols from each disks. + var successVols = make([][]VolInfo, len(xl.storageDisks)) + for index, disk := range xl.storageDisks { + wg.Add(1) // Add each go-routine to wait for. + go func(index int, disk StorageAPI) { + // Indicate wait group as finished. + defer wg.Done() + + // Initiate listing. + volsInfo, _ := disk.ListVols() + successVols[index] = volsInfo + }(index, disk) + } + + // For all the list volumes running in parallel to finish. + wg.Wait() + + // Loop through success vols and get aggregated usage values. + var volsInfo []VolInfo + var total, free int64 + for _, volsInfo = range successVols { + var volInfo VolInfo + for _, volInfo = range volsInfo { + if volInfo.Name == "" { + continue + } + if !IsValidBucketName(volInfo.Name) { + continue + } + break + } + free += volInfo.Free + total += volInfo.Total + } + + // Save the updated usage values back into the vols. + for index, volInfo := range volsInfo { + volInfo.Free = free + volInfo.Total = total + volsInfo[index] = volInfo + } + + // NOTE: The assumption here is that volumes across all disks in + // readQuorum have consistent view i.e they all have same number + // of buckets. This is essentially not verified since healing + // should take care of this. + var bucketsInfo []BucketInfo + for _, volInfo := range volsInfo { + // StorageAPI can send volume names which are incompatible + // with buckets, handle it and skip them. + if !IsValidBucketName(volInfo.Name) { + continue + } + bucketsInfo = append(bucketsInfo, BucketInfo{ + Name: volInfo.Name, + Created: volInfo.Created, + Total: volInfo.Total, + Free: volInfo.Free, + }) + } + return bucketsInfo, nil +} + +// ListBuckets - list buckets. 
+func (xl xlObjects) ListBuckets() ([]BucketInfo, error) { + bucketInfos, err := xl.listBuckets() + if err != nil { + return nil, toObjectErr(err) + } + sort.Sort(byBucketName(bucketInfos)) + return bucketInfos, nil +} + +// DeleteBucket - delete a bucket. +func (xl xlObjects) DeleteBucket(bucket string) error { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketNameInvalid{Bucket: bucket} + } + + nsMutex.Lock(bucket, "") + nsMutex.Unlock(bucket, "") + + // Collect if all disks report volume not found. + var volumeNotFoundErrCnt int + + var wg = &sync.WaitGroup{} + var dErrs = make([]error, len(xl.storageDisks)) + + // Remove a volume entry on all underlying storage disks. + for index, disk := range xl.storageDisks { + wg.Add(1) + // Delete volume inside a go-routine. + go func(index int, disk StorageAPI) { + defer wg.Done() + err := disk.DeleteVol(bucket) + if err != nil { + dErrs[index] = err + return + } + dErrs[index] = nil + }(index, disk) + } + + // Wait for all the delete vols to finish. + wg.Wait() + + // Loop through concocted errors and return anything unusual. + for _, err := range dErrs { + if err != nil { + // We ignore error if errVolumeNotFound or errDiskNotFound + if err == errVolumeNotFound || err == errDiskNotFound { + volumeNotFoundErrCnt++ + continue + } + return toObjectErr(err, bucket) + } + } + + // Return err if all disks report volume not found. + if volumeNotFoundErrCnt == len(xl.storageDisks) { + return toObjectErr(errVolumeNotFound, bucket) + } + + return nil +} diff --git a/xl-v1-list-objects.go b/xl-v1-list-objects.go new file mode 100644 index 000000000..eb446bfd9 --- /dev/null +++ b/xl-v1-list-objects.go @@ -0,0 +1,116 @@ +package main + +import "strings" + +func (xl xlObjects) listObjectsXL(bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { + // Default is recursive, if delimiter is set then list non recursive. 
+ recursive := true + if delimiter == slashSeparator { + recursive = false + } + + walker := xl.lookupTreeWalkXL(listParams{bucket, recursive, marker, prefix}) + if walker == nil { + walker = xl.startTreeWalkXL(bucket, prefix, marker, recursive) + } + var objInfos []ObjectInfo + var eof bool + var nextMarker string + for i := 0; i < maxKeys; { + walkResult, ok := <-walker.ch + if !ok { + // Closed channel. + eof = true + break + } + // For any walk error return right away. + if walkResult.err != nil { + // File not found is a valid case. + if walkResult.err == errFileNotFound { + return ListObjectsInfo{}, nil + } + return ListObjectsInfo{}, toObjectErr(walkResult.err, bucket, prefix) + } + objInfo := walkResult.objInfo + nextMarker = objInfo.Name + objInfos = append(objInfos, objInfo) + if walkResult.end { + eof = true + break + } + i++ + } + params := listParams{bucket, recursive, nextMarker, prefix} + if !eof { + xl.saveTreeWalkXL(params, walker) + } + + result := ListObjectsInfo{IsTruncated: !eof} + for _, objInfo := range objInfos { + // With delimiter set we fill in NextMarker and Prefixes. + if delimiter == slashSeparator { + result.NextMarker = objInfo.Name + if objInfo.IsDir { + result.Prefixes = append(result.Prefixes, objInfo.Name) + continue + } + } + result.Objects = append(result.Objects, ObjectInfo{ + Name: objInfo.Name, + ModTime: objInfo.ModTime, + Size: objInfo.Size, + IsDir: false, + }) + } + return result, nil +} + +// ListObjects - list all objects at prefix, delimited by '/'. +func (xl xlObjects) ListObjects(bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return ListObjectsInfo{}, BucketNameInvalid{Bucket: bucket} + } + // Verify if bucket exists. 
+ if !xl.isBucketExist(bucket) { + return ListObjectsInfo{}, BucketNotFound{Bucket: bucket} + } + if !IsValidObjectPrefix(prefix) { + return ListObjectsInfo{}, ObjectNameInvalid{Bucket: bucket, Object: prefix} + } + // Verify if delimiter is anything other than '/', which we do not support. + if delimiter != "" && delimiter != slashSeparator { + return ListObjectsInfo{}, UnsupportedDelimiter{ + Delimiter: delimiter, + } + } + // Verify if marker has prefix. + if marker != "" { + if !strings.HasPrefix(marker, prefix) { + return ListObjectsInfo{}, InvalidMarkerPrefixCombination{ + Marker: marker, + Prefix: prefix, + } + } + } + + // With max keys of zero we have reached eof, return right here. + if maxKeys == 0 { + return ListObjectsInfo{}, nil + } + + // Over flowing count - reset to maxObjectList. + if maxKeys < 0 || maxKeys > maxObjectList { + maxKeys = maxObjectList + } + + // Initiate a list operation, if successful filter and return quickly. + listObjInfo, err := xl.listObjectsXL(bucket, prefix, marker, delimiter, maxKeys) + if err == nil { + // We got the entries successfully return. + return listObjInfo, nil + } + + // Return error at the end. + return ListObjectsInfo{}, toObjectErr(err, bucket, prefix) +} diff --git a/xl-v1-metadata.go b/xl-v1-metadata.go new file mode 100644 index 000000000..c11ae2ed5 --- /dev/null +++ b/xl-v1-metadata.go @@ -0,0 +1,287 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "bytes" + "encoding/json" + "io" + "path" + "sort" + "sync" + "time" +) + +// Erasure block size. +const erasureBlockSize = 4 * 1024 * 1024 // 4MiB. + +// objectPartInfo Info of each part kept in the multipart metadata +// file after CompleteMultipartUpload() is called. +type objectPartInfo struct { + Name string `json:"name"` + ETag string `json:"etag"` + Size int64 `json:"size"` +} + +// A xlMetaV1 represents a metadata header mapping keys to sets of values. +type xlMetaV1 struct { + Version string `json:"version"` + Format string `json:"format"` + Stat struct { + Size int64 `json:"size"` + ModTime time.Time `json:"modTime"` + Version int64 `json:"version"` + } `json:"stat"` + Erasure struct { + DataBlocks int `json:"data"` + ParityBlocks int `json:"parity"` + BlockSize int64 `json:"blockSize"` + Index int `json:"index"` + Distribution []int `json:"distribution"` + } `json:"erasure"` + Checksum struct { + Enable bool `json:"enable"` + } `json:"checksum"` + Minio struct { + Release string `json:"release"` + } `json:"minio"` + Meta map[string]string `json:"meta"` + Parts []objectPartInfo `json:"parts,omitempty"` +} + +// ReadFrom - read from implements io.ReaderFrom interface for +// unmarshalling xlMetaV1. +func (m *xlMetaV1) ReadFrom(reader io.Reader) (n int64, err error) { + var buffer bytes.Buffer + n, err = buffer.ReadFrom(reader) + if err != nil { + return 0, err + } + err = json.Unmarshal(buffer.Bytes(), m) + return n, err +} + +// WriteTo - write to implements io.WriterTo interface for marshalling xlMetaV1. +func (m xlMetaV1) WriteTo(writer io.Writer) (n int64, err error) { + metadataBytes, err := json.Marshal(m) + if err != nil { + return 0, err + } + p, err := writer.Write(metadataBytes) + return int64(p), err +} + +// byPartName is a collection satisfying sort.Interface. 
+type byPartName []objectPartInfo + +func (t byPartName) Len() int { return len(t) } +func (t byPartName) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t byPartName) Less(i, j int) bool { return t[i].Name < t[j].Name } + +// SearchObjectPart - searches for part name and etag, returns the +// index if found. +func (m xlMetaV1) SearchObjectPart(name string, etag string) int { + for i, part := range m.Parts { + if name == part.Name && etag == part.ETag { + return i + } + } + return -1 +} + +// AddObjectPart - add a new object part in order. +func (m *xlMetaV1) AddObjectPart(name string, etag string, size int64) { + m.Parts = append(m.Parts, objectPartInfo{ + Name: name, + ETag: etag, + Size: size, + }) + sort.Sort(byPartName(m.Parts)) +} + +// getPartNumberOffset - given an offset for the whole object, return the part and offset in that part. +func (m xlMetaV1) getPartNumberOffset(offset int64) (partNumber int, partOffset int64, err error) { + partOffset = offset + for i, part := range m.Parts { + partNumber = i + if part.Size == 0 { + return partNumber, partOffset, nil + } + if partOffset < part.Size { + return partNumber, partOffset, nil + } + partOffset -= part.Size + } + // Offset beyond the size of the object + err = errUnexpected + return 0, 0, err +} + +// This function does the following check, suppose +// object is "a/b/c/d", stat makes sure that objects ""a/b/c"" +// "a/b" and "a" do not exist. +func (xl xlObjects) parentDirIsObject(bucket, parent string) bool { + var isParentDirObject func(string) bool + isParentDirObject = func(p string) bool { + if p == "." { + return false + } + if xl.isObject(bucket, p) { + // If there is already a file at prefix "p" return error. + return true + } + // Check if there is a file as one of the parent paths. + return isParentDirObject(path.Dir(p)) + } + return isParentDirObject(parent) +} + +func (xl xlObjects) isObject(bucket, prefix string) bool { + // Create errs and volInfo slices of storageDisks size. 
+ var errs = make([]error, len(xl.storageDisks)) + + // Allocate a new waitgroup. + var wg = &sync.WaitGroup{} + for index, disk := range xl.storageDisks { + wg.Add(1) + // Stat file on all the disks in a routine. + go func(index int, disk StorageAPI) { + defer wg.Done() + _, err := disk.StatFile(bucket, path.Join(prefix, xlMetaJSONFile)) + if err != nil { + errs[index] = err + return + } + errs[index] = nil + }(index, disk) + } + + // Wait for all the Stat operations to finish. + wg.Wait() + + var errFileNotFoundCount int + for _, err := range errs { + if err != nil { + if err == errFileNotFound { + errFileNotFoundCount++ + // If we have errors with file not found greater than allowed read + // quorum we return err as errFileNotFound. + if errFileNotFoundCount > len(xl.storageDisks)-xl.readQuorum { + return false + } + continue + } + errorIf(err, "Unable to access file "+path.Join(bucket, prefix)) + return false + } + } + return true +} + +// readXLMetadata - read xl metadata. +func readXLMetadata(disk StorageAPI, bucket, object string) (xlMeta xlMetaV1, err error) { + r, err := disk.ReadFile(bucket, path.Join(object, xlMetaJSONFile), int64(0)) + if err != nil { + return xlMetaV1{}, err + } + defer r.Close() + _, err = xlMeta.ReadFrom(r) + if err != nil { + return xlMetaV1{}, err + } + return xlMeta, nil +} + +// deleteXLJson - delete `xl.json` on all disks. +func (xl xlObjects) deleteXLMetadata(bucket, object string) error { + return xl.deleteObject(bucket, path.Join(object, xlMetaJSONFile)) +} + +// renameXLJson - rename `xl.json` on all disks. +func (xl xlObjects) renameXLMetadata(srcBucket, srcPrefix, dstBucket, dstPrefix string) error { + return xl.renameObject(srcBucket, path.Join(srcPrefix, xlMetaJSONFile), dstBucket, path.Join(dstPrefix, xlMetaJSONFile)) +} + +// getDiskDistribution - get disk distribution. 
+func (xl xlObjects) getDiskDistribution() []int { + var distribution = make([]int, len(xl.storageDisks)) + for index := range xl.storageDisks { + distribution[index] = index + 1 + } + return distribution +} + +// writeXLJson - write `xl.json` on all disks in order. +func (xl xlObjects) writeXLMetadata(bucket, prefix string, xlMeta xlMetaV1) error { + var wg = &sync.WaitGroup{} + var mErrs = make([]error, len(xl.storageDisks)) + + // Initialize metadata map, save all erasure related metadata. + xlMeta.Minio.Release = minioReleaseTag + xlMeta.Erasure.DataBlocks = xl.dataBlocks + xlMeta.Erasure.ParityBlocks = xl.parityBlocks + xlMeta.Erasure.BlockSize = erasureBlockSize + xlMeta.Erasure.Distribution = xl.getDiskDistribution() + + for index, disk := range xl.storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI, metadata xlMetaV1) { + defer wg.Done() + + metaJSONFile := path.Join(prefix, xlMetaJSONFile) + metaWriter, mErr := disk.CreateFile(bucket, metaJSONFile) + if mErr != nil { + mErrs[index] = mErr + return + } + + // Save the order. + metadata.Erasure.Index = index + 1 + _, mErr = metadata.WriteTo(metaWriter) + if mErr != nil { + if mErr = safeCloseAndRemove(metaWriter); mErr != nil { + mErrs[index] = mErr + return + } + mErrs[index] = mErr + return + } + if mErr = metaWriter.Close(); mErr != nil { + if mErr = safeCloseAndRemove(metaWriter); mErr != nil { + mErrs[index] = mErr + return + } + mErrs[index] = mErr + return + } + mErrs[index] = nil + }(index, disk, xlMeta) + } + + // Wait for all the routines. + wg.Wait() + + // FIXME: check for quorum. + // Loop through concocted errors and return the first one. + for _, err := range mErrs { + if err == nil { + continue + } + return err + } + return nil +} diff --git a/xl-v1-multipart-common.go b/xl-v1-multipart-common.go new file mode 100644 index 000000000..ee9b057c6 --- /dev/null +++ b/xl-v1-multipart-common.go @@ -0,0 +1,474 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "bytes" + "encoding/json" + "io" + "path" + "sort" + "strings" + "sync" + "time" + + "github.com/skyrings/skyring-common/tools/uuid" +) + +// uploadInfo - +type uploadInfo struct { + UploadID string `json:"uploadId"` + Initiated time.Time `json:"initiated"` +} + +// uploadsV1 - +type uploadsV1 struct { + Version string `json:"version"` + Format string `json:"format"` + Uploads []uploadInfo `json:"uploadIds"` +} + +// byInitiatedTime is a collection satisfying sort.Interface. +type byInitiatedTime []uploadInfo + +func (t byInitiatedTime) Len() int { return len(t) } +func (t byInitiatedTime) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t byInitiatedTime) Less(i, j int) bool { + return t[i].Initiated.After(t[j].Initiated) +} + +// AddUploadID - adds a new upload id in order of its initiated time. +func (u *uploadsV1) AddUploadID(uploadID string, initiated time.Time) { + u.Uploads = append(u.Uploads, uploadInfo{ + UploadID: uploadID, + Initiated: initiated, + }) + sort.Sort(byInitiatedTime(u.Uploads)) +} + +func (u uploadsV1) SearchUploadID(uploadID string) int { + for i, u := range u.Uploads { + if u.UploadID == uploadID { + return i + } + } + return -1 +} + +// ReadFrom - read from implements io.ReaderFrom interface for unmarshalling uploads. 
+func (u *uploadsV1) ReadFrom(reader io.Reader) (n int64, err error) { + var buffer bytes.Buffer + n, err = buffer.ReadFrom(reader) + if err != nil { + return 0, err + } + err = json.Unmarshal(buffer.Bytes(), &u) + return n, err +} + +// WriteTo - write to implements io.WriterTo interface for marshalling uploads. +func (u uploadsV1) WriteTo(writer io.Writer) (n int64, err error) { + metadataBytes, err := json.Marshal(u) + if err != nil { + return 0, err + } + m, err := writer.Write(metadataBytes) + return int64(m), err +} + +// getUploadIDs - get saved upload id's. +func getUploadIDs(bucket, object string, storageDisks ...StorageAPI) (uploadIDs uploadsV1, err error) { + uploadJSONPath := path.Join(mpartMetaPrefix, bucket, object, uploadsJSONFile) + var errs = make([]error, len(storageDisks)) + var uploads = make([]uploadsV1, len(storageDisks)) + var wg = &sync.WaitGroup{} + + for index, disk := range storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + r, rErr := disk.ReadFile(minioMetaBucket, uploadJSONPath, int64(0)) + if rErr != nil { + errs[index] = rErr + return + } + defer r.Close() + _, rErr = uploads[index].ReadFrom(r) + if rErr != nil { + errs[index] = rErr + return + } + errs[index] = nil + }(index, disk) + } + wg.Wait() + + for _, err = range errs { + if err != nil { + return uploadsV1{}, err + } + } + + // FIXME: Do not know if it should pick the picks the first successful one and returns. 
+ return uploads[0], nil +} + +func updateUploadJSON(bucket, object string, uploadIDs uploadsV1, storageDisks ...StorageAPI) error { + uploadsPath := path.Join(mpartMetaPrefix, bucket, object, uploadsJSONFile) + var errs = make([]error, len(storageDisks)) + var wg = &sync.WaitGroup{} + + for index, disk := range storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + w, wErr := disk.CreateFile(minioMetaBucket, uploadsPath) + if wErr != nil { + errs[index] = wErr + return + } + _, wErr = uploadIDs.WriteTo(w) + if wErr != nil { + errs[index] = wErr + return + } + if wErr = w.Close(); wErr != nil { + if clErr := safeCloseAndRemove(w); clErr != nil { + errs[index] = clErr + return + } + errs[index] = wErr + return + } + }(index, disk) + } + + wg.Wait() + + for _, err := range errs { + if err != nil { + return err + } + } + + return nil +} + +// writeUploadJSON - create `uploads.json` or update it with new uploadID. +func writeUploadJSON(bucket, object, uploadID string, initiated time.Time, storageDisks ...StorageAPI) error { + uploadsPath := path.Join(mpartMetaPrefix, bucket, object, uploadsJSONFile) + tmpUploadsPath := path.Join(tmpMetaPrefix, bucket, object, uploadsJSONFile) + + var errs = make([]error, len(storageDisks)) + var wg = &sync.WaitGroup{} + + uploadIDs, err := getUploadIDs(bucket, object, storageDisks...) 
+ if err != nil && err != errFileNotFound { + return err + } + uploadIDs.Version = "1" + uploadIDs.Format = "xl" + uploadIDs.AddUploadID(uploadID, initiated) + + for index, disk := range storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + w, wErr := disk.CreateFile(minioMetaBucket, tmpUploadsPath) + if wErr != nil { + errs[index] = wErr + return + } + _, wErr = uploadIDs.WriteTo(w) + if wErr != nil { + errs[index] = wErr + return + } + if wErr = w.Close(); wErr != nil { + if clErr := safeCloseAndRemove(w); clErr != nil { + errs[index] = clErr + return + } + errs[index] = wErr + return + } + + _, wErr = disk.StatFile(minioMetaBucket, uploadsPath) + if wErr != nil { + if wErr == errFileNotFound { + wErr = disk.RenameFile(minioMetaBucket, tmpUploadsPath, minioMetaBucket, uploadsPath) + if wErr == nil { + return + } + } + if dErr := disk.DeleteFile(minioMetaBucket, tmpUploadsPath); dErr != nil { + errs[index] = dErr + return + } + errs[index] = wErr + return + } + }(index, disk) + } + + wg.Wait() + + for _, err = range errs { + if err != nil { + return err + } + } + + return nil +} + +// Wrapper which removes all the uploaded parts. +func cleanupUploadedParts(bucket, object, uploadID string, storageDisks ...StorageAPI) error { + var errs = make([]error, len(storageDisks)) + var wg = &sync.WaitGroup{} + for index, disk := range storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + err := cleanupDir(disk, minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object, uploadID)) + if err != nil { + errs[index] = err + return + } + errs[index] = nil + }(index, disk) + } + wg.Wait() + + for _, err := range errs { + if err != nil { + return err + } + } + return nil +} + +// listUploadsInfo - list all uploads info. 
+func (xl xlObjects) listUploadsInfo(prefixPath string) (uploads []uploadInfo, err error) { + disk := xl.getRandomDisk() + splitPrefixes := strings.SplitN(prefixPath, "/", 3) + uploadIDs, err := getUploadIDs(splitPrefixes[1], splitPrefixes[2], disk) + if err != nil { + if err == errFileNotFound { + return []uploadInfo{}, nil + } + return nil, err + } + uploads = uploadIDs.Uploads + return uploads, nil +} + +// listMetaBucketMultipart - list all objects at a given prefix inside minioMetaBucket. +func (xl xlObjects) listMetaBucketMultipart(prefixPath string, markerPath string, recursive bool, maxKeys int) (objInfos []ObjectInfo, eof bool, err error) { + walker := xl.lookupTreeWalkXL(listParams{minioMetaBucket, recursive, markerPath, prefixPath}) + if walker == nil { + walker = xl.startTreeWalkXL(minioMetaBucket, prefixPath, markerPath, recursive) + } + + // newMaxKeys tracks the size of entries which are going to be + // returned back. + var newMaxKeys int + + // Following loop gathers and filters out special files inside minio meta volume. + for { + walkResult, ok := <-walker.ch + if !ok { + // Closed channel. + eof = true + break + } + // For any walk error return right away. + if walkResult.err != nil { + // File not found or Disk not found is a valid case. + if walkResult.err == errFileNotFound || walkResult.err == errDiskNotFound { + return nil, true, nil + } + return nil, false, toObjectErr(walkResult.err, minioMetaBucket, prefixPath) + } + objInfo := walkResult.objInfo + var uploads []uploadInfo + if objInfo.IsDir { + // List all the entries if fi.Name is a leaf directory, if + // fi.Name is not a leaf directory then the resulting + // entries are empty. 
+ uploads, err = xl.listUploadsInfo(objInfo.Name) + if err != nil { + return nil, false, err + } + } + if len(uploads) > 0 { + for _, upload := range uploads { + objInfos = append(objInfos, ObjectInfo{ + Name: path.Join(objInfo.Name, upload.UploadID), + ModTime: upload.Initiated, + }) + newMaxKeys++ + // If we have reached the maxKeys, it means we have listed + // everything that was requested. + if newMaxKeys == maxKeys { + break + } + } + } else { + // We reach here for a non-recursive case non-leaf entry + // OR recursive case with fi.Name. + if !objInfo.IsDir { // Do not skip non-recursive case directory entries. + // Validate if 'fi.Name' is incomplete multipart. + if !strings.HasSuffix(objInfo.Name, xlMetaJSONFile) { + continue + } + objInfo.Name = path.Dir(objInfo.Name) + } + objInfos = append(objInfos, objInfo) + newMaxKeys++ + // If we have reached the maxKeys, it means we have listed + // everything that was requested. + if newMaxKeys == maxKeys { + break + } + } + } + + if !eof && len(objInfos) != 0 { + // EOF has not reached, hence save the walker channel to the map so that the walker go routine + // can continue from where it left off for the next list request. + lastObjInfo := objInfos[len(objInfos)-1] + markerPath = lastObjInfo.Name + xl.saveTreeWalkXL(listParams{minioMetaBucket, recursive, markerPath, prefixPath}, walker) + } + + // Return entries here. + return objInfos, eof, nil +} + +// FIXME: Currently the code sorts based on keyName/upload-id which is +// not correct based on the S3 specs. According to s3 specs we are +// supposed to only lexically sort keyNames and then for keyNames with +// multiple upload ids should be sorted based on the initiated time. +// Currently this case is not handled. + +// listMultipartUploadsCommon - lists all multipart uploads, common +// function for both object layers. 
+func (xl xlObjects) listMultipartUploadsCommon(bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { + result := ListMultipartsInfo{} + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return ListMultipartsInfo{}, BucketNameInvalid{Bucket: bucket} + } + if !xl.isBucketExist(bucket) { + return ListMultipartsInfo{}, BucketNotFound{Bucket: bucket} + } + if !IsValidObjectPrefix(prefix) { + return ListMultipartsInfo{}, ObjectNameInvalid{Bucket: bucket, Object: prefix} + } + // Verify if delimiter is anything other than '/', which we do not support. + if delimiter != "" && delimiter != slashSeparator { + return ListMultipartsInfo{}, UnsupportedDelimiter{ + Delimiter: delimiter, + } + } + // Verify if marker has prefix. + if keyMarker != "" && !strings.HasPrefix(keyMarker, prefix) { + return ListMultipartsInfo{}, InvalidMarkerPrefixCombination{ + Marker: keyMarker, + Prefix: prefix, + } + } + if uploadIDMarker != "" { + if strings.HasSuffix(keyMarker, slashSeparator) { + return result, InvalidUploadIDKeyCombination{ + UploadIDMarker: uploadIDMarker, + KeyMarker: keyMarker, + } + } + id, err := uuid.Parse(uploadIDMarker) + if err != nil { + return result, err + } + if id.IsZero() { + return result, MalformedUploadID{ + UploadID: uploadIDMarker, + } + } + } + + recursive := true + if delimiter == slashSeparator { + recursive = false + } + + result.IsTruncated = true + result.MaxUploads = maxUploads + + // Not using path.Join() as it strips off the trailing '/'. + multipartPrefixPath := pathJoin(mpartMetaPrefix, pathJoin(bucket, prefix)) + if prefix == "" { + // Should have a trailing "/" if prefix is "" + // For ex. 
multipartPrefixPath should be "multipart/bucket/" if prefix is "" + multipartPrefixPath += slashSeparator + } + multipartMarkerPath := "" + if keyMarker != "" { + keyMarkerPath := pathJoin(pathJoin(bucket, keyMarker), uploadIDMarker) + multipartMarkerPath = pathJoin(mpartMetaPrefix, keyMarkerPath) + } + + // List all the multipart files at prefixPath, starting with marker keyMarkerPath. + objInfos, eof, err := xl.listMetaBucketMultipart(multipartPrefixPath, multipartMarkerPath, recursive, maxUploads) + if err != nil { + return ListMultipartsInfo{}, err + } + + // Loop through all the received files fill in the multiparts result. + for _, objInfo := range objInfos { + var objectName string + var uploadID string + if objInfo.IsDir { + // All directory entries are common prefixes. + uploadID = "" // Upload ids are empty for CommonPrefixes. + objectName = strings.TrimPrefix(objInfo.Name, retainSlash(pathJoin(mpartMetaPrefix, bucket))) + result.CommonPrefixes = append(result.CommonPrefixes, objectName) + } else { + uploadID = path.Base(objInfo.Name) + objectName = strings.TrimPrefix(path.Dir(objInfo.Name), retainSlash(pathJoin(mpartMetaPrefix, bucket))) + result.Uploads = append(result.Uploads, uploadMetadata{ + Object: objectName, + UploadID: uploadID, + Initiated: objInfo.ModTime, + }) + } + result.NextKeyMarker = objectName + result.NextUploadIDMarker = uploadID + } + result.IsTruncated = !eof + if !result.IsTruncated { + result.NextKeyMarker = "" + result.NextUploadIDMarker = "" + } + return result, nil +} + +// isUploadIDExists - verify if a given uploadID exists and is valid. 
+func (xl xlObjects) isUploadIDExists(bucket, object, uploadID string) bool { + uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID) + return xl.isObject(minioMetaBucket, uploadIDPath) +} diff --git a/xl-v1-multipart.go b/xl-v1-multipart.go new file mode 100644 index 000000000..c3928e30e --- /dev/null +++ b/xl-v1-multipart.go @@ -0,0 +1,432 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "crypto/md5" + "encoding/hex" + "fmt" + "io" + "io/ioutil" + "path" + "strconv" + "time" +) + +// ListMultipartUploads - list multipart uploads. +func (xl xlObjects) ListMultipartUploads(bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { + return xl.listMultipartUploadsCommon(bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads) +} + +/// Common multipart object layer functions. + +// newMultipartUploadCommon - initialize a new multipart, is a common function for both object layers. +func (xl xlObjects) newMultipartUploadCommon(bucket string, object string, meta map[string]string) (uploadID string, err error) { + // Verify if bucket name is valid. + if !IsValidBucketName(bucket) { + return "", BucketNameInvalid{Bucket: bucket} + } + // Verify whether the bucket exists. + if !xl.isBucketExist(bucket) { + return "", BucketNotFound{Bucket: bucket} + } + // Verify if object name is valid. 
+ if !IsValidObjectName(object) { + return "", ObjectNameInvalid{Bucket: bucket, Object: object} + } + // No metadata is set, allocate a new one. + if meta == nil { + meta = make(map[string]string) + } + + xlMeta := xlMetaV1{} + xlMeta.Format = "xl" + xlMeta.Version = "1" + // If not set default to "application/octet-stream" + if meta["content-type"] == "" { + meta["content-type"] = "application/octet-stream" + } + xlMeta.Meta = meta + + // This lock needs to be held for any changes to the directory contents of ".minio/multipart/object/" + nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) + defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) + + uploadID = getUUID() + initiated := time.Now().UTC() + // Create 'uploads.json' + if err = writeUploadJSON(bucket, object, uploadID, initiated, xl.storageDisks...); err != nil { + return "", err + } + uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID) + tempUploadIDPath := path.Join(tmpMetaPrefix, bucket, object, uploadID) + if err = xl.writeXLMetadata(minioMetaBucket, tempUploadIDPath, xlMeta); err != nil { + return "", toObjectErr(err, minioMetaBucket, tempUploadIDPath) + } + if err = xl.renameXLMetadata(minioMetaBucket, tempUploadIDPath, minioMetaBucket, uploadIDPath); err != nil { + if dErr := xl.deleteXLMetadata(minioMetaBucket, tempUploadIDPath); dErr != nil { + return "", toObjectErr(dErr, minioMetaBucket, tempUploadIDPath) + } + return "", toObjectErr(err, minioMetaBucket, uploadIDPath) + } + // Return success. + return uploadID, nil +} + +// NewMultipartUpload - initialize a new multipart upload, returns a unique id. +func (xl xlObjects) NewMultipartUpload(bucket, object string, meta map[string]string) (string, error) { + return xl.newMultipartUploadCommon(bucket, object, meta) +} + +// putObjectPartCommon - put object part. 
+func (xl xlObjects) putObjectPartCommon(bucket string, object string, uploadID string, partID int, size int64, data io.Reader, md5Hex string) (string, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return "", BucketNameInvalid{Bucket: bucket} + } + // Verify whether the bucket exists. + if !xl.isBucketExist(bucket) { + return "", BucketNotFound{Bucket: bucket} + } + if !IsValidObjectName(object) { + return "", ObjectNameInvalid{Bucket: bucket, Object: object} + } + if !xl.isUploadIDExists(bucket, object, uploadID) { + return "", InvalidUploadID{UploadID: uploadID} + } + // Hold read lock on the uploadID so that no one aborts it. + nsMutex.RLock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + defer nsMutex.RUnlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + + // Hold write lock on the part so that there is no parallel upload on the part. + nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID, strconv.Itoa(partID))) + defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID, strconv.Itoa(partID))) + + partSuffix := fmt.Sprintf("object%d", partID) + tmpPartPath := path.Join(tmpMetaPrefix, bucket, object, uploadID, partSuffix) + fileWriter, err := xl.erasureDisk.CreateFile(minioMetaBucket, tmpPartPath) + if err != nil { + return "", toObjectErr(err, bucket, object) + } + + // Initialize md5 writer. + md5Writer := md5.New() + + // Instantiate a new multi writer. + multiWriter := io.MultiWriter(md5Writer, fileWriter) + + // Instantiate checksum hashers and create a multiwriter. + if size > 0 { + if _, err = io.CopyN(multiWriter, data, size); err != nil { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", toObjectErr(err, bucket, object) + } + // Reader shouldn't have more data what mentioned in size argument. 
+ // reading one more byte from the reader to validate it. + // expected to fail, success validates existence of more data in the reader. + if _, err = io.CopyN(ioutil.Discard, data, 1); err == nil { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", UnExpectedDataSize{Size: int(size)} + } + } else { + var n int64 + if n, err = io.Copy(multiWriter, data); err != nil { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", toObjectErr(err, bucket, object) + } + size = n + } + + newMD5Hex := hex.EncodeToString(md5Writer.Sum(nil)) + if md5Hex != "" { + if newMD5Hex != md5Hex { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", BadDigest{md5Hex, newMD5Hex} + } + } + err = fileWriter.Close() + if err != nil { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", err + } + + uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID) + xlMeta, err := readXLMetadata(xl.getRandomDisk(), minioMetaBucket, uploadIDPath) + if err != nil { + return "", toObjectErr(err, minioMetaBucket, uploadIDPath) + } + xlMeta.AddObjectPart(partSuffix, newMD5Hex, size) + + partPath := path.Join(mpartMetaPrefix, bucket, object, uploadID, partSuffix) + err = xl.renameObject(minioMetaBucket, tmpPartPath, minioMetaBucket, partPath) + if err != nil { + if dErr := xl.deleteObject(minioMetaBucket, tmpPartPath); dErr != nil { + return "", toObjectErr(dErr, minioMetaBucket, tmpPartPath) + } + return "", toObjectErr(err, minioMetaBucket, partPath) + } + if err = xl.writeXLMetadata(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object, uploadID), xlMeta); err != nil { + return "", toObjectErr(err, minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object, uploadID)) + } + return newMD5Hex, nil +} + 
+// PutObjectPart - writes the multipart upload chunks. +func (xl xlObjects) PutObjectPart(bucket, object, uploadID string, partID int, size int64, data io.Reader, md5Hex string) (string, error) { + return xl.putObjectPartCommon(bucket, object, uploadID, partID, size, data, md5Hex) +} + +// ListObjectParts - list object parts, common function across both object layers. +func (xl xlObjects) listObjectPartsCommon(bucket, object, uploadID string, partNumberMarker, maxParts int) (ListPartsInfo, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return ListPartsInfo{}, BucketNameInvalid{Bucket: bucket} + } + // Verify whether the bucket exists. + if !xl.isBucketExist(bucket) { + return ListPartsInfo{}, BucketNotFound{Bucket: bucket} + } + if !IsValidObjectName(object) { + return ListPartsInfo{}, ObjectNameInvalid{Bucket: bucket, Object: object} + } + if !xl.isUploadIDExists(bucket, object, uploadID) { + return ListPartsInfo{}, InvalidUploadID{UploadID: uploadID} + } + // Hold lock so that there is no competing abort-multipart-upload or complete-multipart-upload. + nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + result := ListPartsInfo{} + + disk := xl.getRandomDisk() // Pick a random disk and read `xl.json` from there. + uploadIDPath := path.Join(mpartMetaPrefix, bucket, object, uploadID) + xlMeta, err := readXLMetadata(disk, minioMetaBucket, uploadIDPath) + if err != nil { + return ListPartsInfo{}, toObjectErr(err, minioMetaBucket, uploadIDPath) + } + // Only parts with higher part numbers will be listed. 
+ parts := xlMeta.Parts[partNumberMarker:] + count := maxParts + for i, part := range parts { + var fi FileInfo + partNamePath := path.Join(mpartMetaPrefix, bucket, object, uploadID, part.Name) + fi, err = disk.StatFile(minioMetaBucket, partNamePath) + if err != nil { + return ListPartsInfo{}, toObjectErr(err, minioMetaBucket, partNamePath) + } + partNum := i + partNumberMarker + 1 + result.Parts = append(result.Parts, partInfo{ + PartNumber: partNum, + ETag: part.ETag, + LastModified: fi.ModTime, + Size: fi.Size, + }) + count-- + if count == 0 { + break + } + } + // If listed entries are more than maxParts, we set IsTruncated as true. + if len(parts) > len(result.Parts) { + result.IsTruncated = true + // Make sure to fill next part number marker if IsTruncated is + // true for subsequent listing. + nextPartNumberMarker := result.Parts[len(result.Parts)-1].PartNumber + result.NextPartNumberMarker = nextPartNumberMarker + } + result.Bucket = bucket + result.Object = object + result.UploadID = uploadID + result.MaxParts = maxParts + return result, nil +} + +// ListObjectParts - list object parts. +func (xl xlObjects) ListObjectParts(bucket, object, uploadID string, partNumberMarker, maxParts int) (ListPartsInfo, error) { + return xl.listObjectPartsCommon(bucket, object, uploadID, partNumberMarker, maxParts) +} + +func (xl xlObjects) CompleteMultipartUpload(bucket string, object string, uploadID string, parts []completePart) (string, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return "", BucketNameInvalid{Bucket: bucket} + } + // Verify whether the bucket exists. 
+ if !xl.isBucketExist(bucket) { + return "", BucketNotFound{Bucket: bucket} + } + if !IsValidObjectName(object) { + return "", ObjectNameInvalid{ + Bucket: bucket, + Object: object, + } + } + if !xl.isUploadIDExists(bucket, object, uploadID) { + return "", InvalidUploadID{UploadID: uploadID} + } + // Hold lock so that + // 1) no one aborts this multipart upload + // 2) no one does a parallel complete-multipart-upload on this multipart upload + nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + + // Calculate s3 compatible md5sum for complete multipart. + s3MD5, err := completeMultipartMD5(parts...) + if err != nil { + return "", err + } + + uploadIDPath := pathJoin(mpartMetaPrefix, bucket, object, uploadID) + xlMeta, err := readXLMetadata(xl.getRandomDisk(), minioMetaBucket, uploadIDPath) + if err != nil { + return "", err + } + + var objectSize int64 + // Loop through all parts, validate them and then commit to disk. + for i, part := range parts { + // Construct part suffix. + partSuffix := fmt.Sprintf("object%d", part.PartNumber) + if xlMeta.SearchObjectPart(partSuffix, part.ETag) == -1 { + return "", InvalidPart{} + } + // All parts except the last part has to be atleast 5MB. + if (i < len(parts)-1) && !isMinAllowedPartSize(xlMeta.Parts[i].Size) { + return "", PartTooSmall{} + } + objectSize += xlMeta.Parts[i].Size + } + + // Check if an object is present as one of the parent dir. + if xl.parentDirIsObject(bucket, path.Dir(object)) { + return "", toObjectErr(errFileAccessDenied, bucket, object) + } + + // Save the final object size and modtime. + xlMeta.Stat.Size = objectSize + xlMeta.Stat.ModTime = time.Now().UTC() + + // Save successfully calculated md5sum. 
+ xlMeta.Meta["md5Sum"] = s3MD5 + if err = xl.writeXLMetadata(minioMetaBucket, uploadIDPath, xlMeta); err != nil { + return "", toObjectErr(err, minioMetaBucket, uploadIDPath) + } + + // Hold write lock on the destination before rename + nsMutex.Lock(bucket, object) + defer nsMutex.Unlock(bucket, object) + + // Delete if an object already exists. + // FIXME: rename it to tmp file and delete only after + // the newly uploaded file is renamed from tmp location to + // the original location. Verify if the object is a multipart object. + err = xl.deleteObject(bucket, object) + if err != nil { + return "", toObjectErr(err, bucket, object) + } + + if err = xl.renameObject(minioMetaBucket, uploadIDPath, bucket, object); err != nil { + return "", toObjectErr(err, bucket, object) + } + + // Hold the lock so that two parallel complete-multipart-uploads do no + // leave a stale uploads.json behind. + nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) + defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object)) + + // Validate if there are other incomplete upload-id's present for + // the object, if yes do not attempt to delete 'uploads.json'. + uploadIDs, err := getUploadIDs(bucket, object, xl.storageDisks...) + if err == nil { + uploadIDIdx := uploadIDs.SearchUploadID(uploadID) + if uploadIDIdx != -1 { + uploadIDs.Uploads = append(uploadIDs.Uploads[:uploadIDIdx], uploadIDs.Uploads[uploadIDIdx+1:]...) + } + if len(uploadIDs.Uploads) > 0 { + if err = updateUploadJSON(bucket, object, uploadIDs, xl.storageDisks...); err != nil { + return "", err + } + return s3MD5, nil + } + } + + err = xl.deleteObject(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object)) + if err != nil { + return "", toObjectErr(err, minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object)) + } + + // Return md5sum. + return s3MD5, nil +} + +// abortMultipartUploadCommon - aborts a multipart upload, common +// function used by both object layers. 
+func (xl xlObjects) abortMultipartUploadCommon(bucket, object, uploadID string) error { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketNameInvalid{Bucket: bucket} + } + if !xl.isBucketExist(bucket) { + return BucketNotFound{Bucket: bucket} + } + if !IsValidObjectName(object) { + return ObjectNameInvalid{Bucket: bucket, Object: object} + } + if !xl.isUploadIDExists(bucket, object, uploadID) { + return InvalidUploadID{UploadID: uploadID} + } + + // Hold lock so that there is no competing complete-multipart-upload or put-object-part. + nsMutex.Lock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + defer nsMutex.Unlock(minioMetaBucket, pathJoin(mpartMetaPrefix, bucket, object, uploadID)) + + // Cleanup all uploaded parts. + if err := cleanupUploadedParts(bucket, object, uploadID, xl.storageDisks...); err != nil { + return err + } + + // Validate if there are other incomplete upload-id's present for + // the object, if yes do not attempt to delete 'uploads.json'. + uploadIDs, err := getUploadIDs(bucket, object, xl.storageDisks...) + if err == nil { + uploadIDIdx := uploadIDs.SearchUploadID(uploadID) + if uploadIDIdx != -1 { + uploadIDs.Uploads = append(uploadIDs.Uploads[:uploadIDIdx], uploadIDs.Uploads[uploadIDIdx+1:]...) + } + if len(uploadIDs.Uploads) > 0 { + return nil + } + } + if err = xl.deleteObject(minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object)); err != nil { + return toObjectErr(err, minioMetaBucket, path.Join(mpartMetaPrefix, bucket, object)) + } + return nil +} + +// AbortMultipartUpload - aborts a multipart upload. 
+func (xl xlObjects) AbortMultipartUpload(bucket, object, uploadID string) error { + return xl.abortMultipartUploadCommon(bucket, object, uploadID) +} diff --git a/xl-v1-object.go b/xl-v1-object.go new file mode 100644 index 000000000..5b81e4c08 --- /dev/null +++ b/xl-v1-object.go @@ -0,0 +1,357 @@ +package main + +import ( + "crypto/md5" + "encoding/hex" + "io" + "path" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/minio/minio/pkg/mimedb" +) + +/// Object Operations + +// GetObject - get an object. +func (xl xlObjects) GetObject(bucket, object string, startOffset int64) (io.ReadCloser, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return nil, BucketNameInvalid{Bucket: bucket} + } + // Verify if object is valid. + if !IsValidObjectName(object) { + return nil, ObjectNameInvalid{Bucket: bucket, Object: object} + } + nsMutex.RLock(bucket, object) + defer nsMutex.RUnlock(bucket, object) + fileReader, fileWriter := io.Pipe() + xlMeta, err := readXLMetadata(xl.getRandomDisk(), bucket, object) + if err != nil { + return nil, toObjectErr(err, bucket, object) + } + partIndex, offset, err := xlMeta.getPartNumberOffset(startOffset) + if err != nil { + return nil, toObjectErr(err, bucket, object) + } + + // Hold a read lock once more which can be released after the following go-routine ends. + // We hold RLock once more because the current function would return before the go routine below + // executes and hence releasing the read lock (because of defer'ed nsMutex.RUnlock() call). + nsMutex.RLock(bucket, object) + go func() { + defer nsMutex.RUnlock(bucket, object) + for ; partIndex < len(xlMeta.Parts); partIndex++ { + part := xlMeta.Parts[partIndex] + r, err := xl.erasureDisk.ReadFile(bucket, pathJoin(object, part.Name), offset) + if err != nil { + fileWriter.CloseWithError(err) + return + } + // Reset offset to 0 as it would be non-0 only for the first loop if startOffset is non-0. 
+ offset = 0 + if _, err = io.Copy(fileWriter, r); err != nil { + switch reader := r.(type) { + case *io.PipeReader: + reader.CloseWithError(err) + case io.ReadCloser: + reader.Close() + } + fileWriter.CloseWithError(err) + return + } + // Close the readerCloser that reads multiparts of an object from the xl storage layer. + // Not closing leaks underlying file descriptors. + r.Close() + } + fileWriter.Close() + }() + return fileReader, nil +} + +// GetObjectInfo - get object info. +func (xl xlObjects) GetObjectInfo(bucket, object string) (ObjectInfo, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return ObjectInfo{}, BucketNameInvalid{Bucket: bucket} + } + // Verify if object is valid. + if !IsValidObjectName(object) { + return ObjectInfo{}, ObjectNameInvalid{Bucket: bucket, Object: object} + } + nsMutex.RLock(bucket, object) + defer nsMutex.RUnlock(bucket, object) + info, err := xl.getObjectInfo(bucket, object) + if err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + return info, nil +} + +func (xl xlObjects) getObjectInfo(bucket, object string) (objInfo ObjectInfo, err error) { + // Count for errors encountered. + var xlJSONErrCount = 0 + + // Loop through and return the first success entry based on the + // selected random disk. + for xlJSONErrCount < len(xl.storageDisks) { + // Choose a random disk on each attempt, do not hit the same disk all the time. + disk := xl.getRandomDisk() // Pick a random disk. + var xlMeta xlMetaV1 + xlMeta, err = readXLMetadata(disk, bucket, object) + if err == nil { + objInfo = ObjectInfo{} + objInfo.IsDir = false + objInfo.Bucket = bucket + objInfo.Name = object + objInfo.Size = xlMeta.Stat.Size + objInfo.ModTime = xlMeta.Stat.ModTime + objInfo.MD5Sum = xlMeta.Meta["md5Sum"] + objInfo.ContentType = xlMeta.Meta["content-type"] + objInfo.ContentEncoding = xlMeta.Meta["content-encoding"] + return objInfo, nil + } + xlJSONErrCount++ // Update error count. 
+ } + + // Return error at the end. + return ObjectInfo{}, err +} + +// renameObject - renaming all source objects to destination object across all disks. +func (xl xlObjects) renameObject(srcBucket, srcObject, dstBucket, dstObject string) error { + // Initialize sync waitgroup. + var wg = &sync.WaitGroup{} + + // Initialize list of errors. + var errs = make([]error, len(xl.storageDisks)) + + // Rename file on all underlying storage disks. + for index, disk := range xl.storageDisks { + // Append "/" as srcObject and dstObject are either leaf-dirs or non-leaf-dris. + // If srcObject is an object instead of prefix we just rename the leaf-dir and + // not rename the part and metadata files separately. + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + err := disk.RenameFile(srcBucket, retainSlash(srcObject), dstBucket, retainSlash(dstObject)) + if err != nil { + errs[index] = err + } + errs[index] = nil + }(index, disk) + } + + // Wait for all RenameFile to finish. + wg.Wait() + + // Gather err count. + var errCount = 0 + for _, err := range errs { + if err == nil { + continue + } + errCount++ + } + // We can safely allow RenameFile errors up to len(xl.storageDisks) - xl.writeQuorum + // otherwise return failure. Cleanup successful renames. + if errCount > len(xl.storageDisks)-xl.writeQuorum { + // Special condition if readQuorum exists, then return success. + if errCount <= len(xl.storageDisks)-xl.readQuorum { + return nil + } + xl.deleteObject(srcBucket, srcObject) + return errWriteQuorum + } + return nil +} + +// PutObject - create an object. +func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.Reader, metadata map[string]string) (string, error) { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return "", BucketNameInvalid{Bucket: bucket} + } + // Verify bucket exists. 
+ if !xl.isBucketExist(bucket) { + return "", BucketNotFound{Bucket: bucket} + } + if !IsValidObjectName(object) { + return "", ObjectNameInvalid{ + Bucket: bucket, + Object: object, + } + } + // No metadata is set, allocate a new one. + if metadata == nil { + metadata = make(map[string]string) + } + nsMutex.Lock(bucket, object) + defer nsMutex.Unlock(bucket, object) + + tempErasureObj := path.Join(tmpMetaPrefix, bucket, object, "object1") + tempObj := path.Join(tmpMetaPrefix, bucket, object) + fileWriter, err := xl.erasureDisk.CreateFile(minioMetaBucket, tempErasureObj) + if err != nil { + return "", toObjectErr(err, bucket, object) + } + + // Initialize md5 writer. + md5Writer := md5.New() + + // Instantiate a new multi writer. + multiWriter := io.MultiWriter(md5Writer, fileWriter) + + // Instantiate checksum hashers and create a multiwriter. + if size > 0 { + if _, err = io.CopyN(multiWriter, data, size); err != nil { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", toObjectErr(err, bucket, object) + } + } else { + if _, err = io.Copy(multiWriter, data); err != nil { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", toObjectErr(err, bucket, object) + } + } + + // Save additional erasureMetadata. + modTime := time.Now().UTC() + + newMD5Hex := hex.EncodeToString(md5Writer.Sum(nil)) + // Update the md5sum if not set with the newly calculated one. + if len(metadata["md5Sum"]) == 0 { + metadata["md5Sum"] = newMD5Hex + } + // If not set default to "application/octet-stream" + if metadata["content-type"] == "" { + contentType := "application/octet-stream" + if objectExt := filepath.Ext(object); objectExt != "" { + content, ok := mimedb.DB[strings.ToLower(strings.TrimPrefix(objectExt, "."))] + if ok { + contentType = content.ContentType + } + } + metadata["content-type"] = contentType + } + + // md5Hex representation. 
+ md5Hex := metadata["md5Sum"] + if md5Hex != "" { + if newMD5Hex != md5Hex { + if err = safeCloseAndRemove(fileWriter); err != nil { + return "", toObjectErr(err, bucket, object) + } + return "", BadDigest{md5Hex, newMD5Hex} + } + } + + err = fileWriter.Close() + if err != nil { + if clErr := safeCloseAndRemove(fileWriter); clErr != nil { + return "", toObjectErr(clErr, bucket, object) + } + return "", toObjectErr(err, bucket, object) + } + + // Check if an object is present as one of the parent dir. + if xl.parentDirIsObject(bucket, path.Dir(object)) { + return "", toObjectErr(errFileAccessDenied, bucket, object) + } + + // Delete if an object already exists. + err = xl.deleteObject(bucket, object) + if err != nil { + return "", toObjectErr(err, bucket, object) + } + + err = xl.renameObject(minioMetaBucket, tempObj, bucket, object) + if err != nil { + if dErr := xl.deleteObject(minioMetaBucket, tempObj); dErr != nil { + return "", toObjectErr(dErr, minioMetaBucket, tempObj) + } + return "", toObjectErr(err, bucket, object) + } + + xlMeta := xlMetaV1{} + xlMeta.Version = "1" + xlMeta.Format = "xl" + xlMeta.Meta = metadata + xlMeta.Stat.Size = size + xlMeta.Stat.ModTime = modTime + xlMeta.AddObjectPart("object1", newMD5Hex, xlMeta.Stat.Size) + if err = xl.writeXLMetadata(bucket, object, xlMeta); err != nil { + return "", toObjectErr(err, bucket, object) + } + + // Return md5sum, successfully wrote object. + return newMD5Hex, nil +} + +// deleteObject - deletes a regular object. +func (xl xlObjects) deleteObject(bucket, object string) error { + // Initialize sync waitgroup. + var wg = &sync.WaitGroup{} + + // Initialize list of errors. + var dErrs = make([]error, len(xl.storageDisks)) + + for index, disk := range xl.storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + dErrs[index] = cleanupDir(disk, bucket, object) + }(index, disk) + } + + // Wait for all routines to finish. 
+ wg.Wait() + + var fileNotFoundCnt, deleteFileErr int + // Loop through all the concocted errors. + for _, err := range dErrs { + if err == nil { + continue + } + // If file not found, count them. + if err == errFileNotFound { + fileNotFoundCnt++ + continue + } + + // Update error counter separately. + deleteFileErr++ + } + + // Return err if all disks report file not found. + if fileNotFoundCnt == len(xl.storageDisks) { + return errFileNotFound + } else if deleteFileErr > len(xl.storageDisks)-xl.writeQuorum { + // Return errWriteQuorum if errors were more than + // allowed write quorum. + return errWriteQuorum + } + + return nil +} + +// DeleteObject - delete the object. +func (xl xlObjects) DeleteObject(bucket, object string) error { + // Verify if bucket is valid. + if !IsValidBucketName(bucket) { + return BucketNameInvalid{Bucket: bucket} + } + if !IsValidObjectName(object) { + return ObjectNameInvalid{Bucket: bucket, Object: object} + } + nsMutex.Lock(bucket, object) + defer nsMutex.Unlock(bucket, object) + xl.deleteObject(bucket, object) + return nil +} diff --git a/xl-v1.go b/xl-v1.go new file mode 100644 index 000000000..4475c5642 --- /dev/null +++ b/xl-v1.go @@ -0,0 +1,177 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package main + +import ( + "errors" + "fmt" + "path/filepath" + "strings" + "sync" +) + +const ( + formatConfigFile = "format.json" + xlMetaJSONFile = "xl.json" + uploadsJSONFile = "uploads.json" +) + +// xlObjects - Implements fs object layer. +type xlObjects struct { + storageDisks []StorageAPI + erasureDisk *erasure + dataBlocks int + parityBlocks int + readQuorum int + writeQuorum int + listObjectMap map[listParams][]*treeWalker + listObjectMapMutex *sync.Mutex +} + +// errMaxDisks - returned for reached maximum of disks. +var errMaxDisks = errors.New("Number of disks are higher than supported maximum count '16'") + +// errMinDisks - returned for minimum number of disks. +var errMinDisks = errors.New("Number of disks are smaller than supported minimum count '8'") + +// errNumDisks - returned for odd number of disks. +var errNumDisks = errors.New("Number of disks should be multiples of '2'") + +const ( + // Maximum erasure blocks. + maxErasureBlocks = 16 + // Minimum erasure blocks. + minErasureBlocks = 8 +) + +func checkSufficientDisks(disks []string) error { + // Verify total number of disks. + totalDisks := len(disks) + if totalDisks > maxErasureBlocks { + return errMaxDisks + } + if totalDisks < minErasureBlocks { + return errMinDisks + } + + // isEven function to verify if a given number if even. + isEven := func(number int) bool { + return number%2 == 0 + } + + // Verify if we have even number of disks. + // only combination of 8, 10, 12, 14, 16 are supported. + if !isEven(totalDisks) { + return errNumDisks + } + + return nil +} + +// Depending on the disk type network or local, initialize storage layer. +func newStorageLayer(disk string) (storage StorageAPI, err error) { + if !strings.ContainsRune(disk, ':') || filepath.VolumeName(disk) != "" { + // Initialize filesystem storage API. + return newPosix(disk) + } + // Initialize rpc client storage API. + return newRPCClient(disk) +} + +// Initialize all storage disks to bootstrap. 
+func bootstrapDisks(disks []string) ([]StorageAPI, error) { + storageDisks := make([]StorageAPI, len(disks)) + for index, disk := range disks { + var err error + // Intentionally ignore disk not found errors while + // initializing POSIX, so that we have successfully + // initialized posix Storage. Subsequent calls to XL/Erasure + // will manage any errors related to disks. + storageDisks[index], err = newStorageLayer(disk) + if err != nil && err != errDiskNotFound { + return nil, err + } + } + return storageDisks, nil +} + +// newXLObjects - initialize new xl object layer. +func newXLObjects(disks []string) (ObjectLayer, error) { + if err := checkSufficientDisks(disks); err != nil { + return nil, err + } + + // Bootstrap disks. + storageDisks, err := bootstrapDisks(disks) + if err != nil { + return nil, err + } + + // Initialize object layer - like creating minioMetaBucket, cleaning up tmp files etc. + initObjectLayer(storageDisks...) + + // Load saved XL format.json and validate. + newPosixDisks, err := loadFormatXL(storageDisks) + if err != nil { + switch err { + case errUnformattedDisk: + // Save new XL format. + errSave := initFormatXL(storageDisks) + if errSave != nil { + return nil, errSave + } + newPosixDisks = storageDisks + default: + // errCorruptedDisk - error. + return nil, fmt.Errorf("Unable to recognize backend format, %s", err) + } + } + + // FIXME: healFormatXL(newDisks) + + newErasureDisk, err := newErasure(newPosixDisks) + if err != nil { + return nil, err + } + + // Calculate data and parity blocks. + dataBlocks, parityBlocks := len(newPosixDisks)/2, len(newPosixDisks)/2 + + xl := xlObjects{ + storageDisks: newPosixDisks, + erasureDisk: newErasureDisk, + dataBlocks: dataBlocks, + parityBlocks: parityBlocks, + listObjectMap: make(map[listParams][]*treeWalker), + listObjectMapMutex: &sync.Mutex{}, + } + + // Figure out read and write quorum based on number of storage disks. 
+ // Read quorum should be always N/2 + 1 (due to Vandermonde matrix + // erasure requirements) + xl.readQuorum = len(xl.storageDisks)/2 + 1 + + // Write quorum is assumed if we have total disks + 3 + // parity. (Need to discuss this again) + xl.writeQuorum = len(xl.storageDisks)/2 + 3 + if xl.writeQuorum > len(xl.storageDisks) { + xl.writeQuorum = len(xl.storageDisks) + } + + // Return successfully initialized object layer. + return xl, nil +}