diff --git a/cmd/fallocate.go b/cmd/fallocate.go new file mode 100644 index 000000000..d624c85b4 --- /dev/null +++ b/cmd/fallocate.go @@ -0,0 +1,25 @@ +// +build !linux + +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cmd + +// Fallocate is not part of POSIX and is not supported under Windows. +// This stub always returns success. +func Fallocate(fd int, offset int64, len int64) error { + return nil +} diff --git a/cmd/fallocate_linux.go b/cmd/fallocate_linux.go new file mode 100644 index 000000000..fa25693b0 --- /dev/null +++ b/cmd/fallocate_linux.go @@ -0,0 +1,31 @@ +// +build linux + +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package cmd + +import "syscall" + +// Fallocate uses the linux Fallocate syscall, which helps us to be +// sure that subsequent writes on a file just created will not fail; +// in addition, file allocation will be contiguous on the disk +func Fallocate(fd int, offset int64, len int64) error { + return syscall.Fallocate(fd, + 1, // FALLOC_FL_KEEP_SIZE + offset, + len) +} diff --git a/cmd/fs-v1-multipart.go b/cmd/fs-v1-multipart.go index 036ebe40f..56041e3dd 100644 --- a/cmd/fs-v1-multipart.go +++ b/cmd/fs-v1-multipart.go @@ -450,6 +450,15 @@ func (fs fsObjects) PutObjectPart(bucket, object, uploadID string, partID int, s bufSize = size } buf := make([]byte, int(bufSize)) + + if size > 0 { + // Prepare file to avoid disk fragmentation + err := fs.storage.PrepareFile(minioMetaBucket, tmpPartPath, size) + if err != nil { + return "", toObjectErr(err, minioMetaBucket, tmpPartPath) + } + } + bytesWritten, cErr := fsCreateFile(fs.storage, teeReader, buf, minioMetaBucket, tmpPartPath) if cErr != nil { fs.storage.DeleteFile(minioMetaBucket, tmpPartPath) @@ -599,6 +608,18 @@ func (fs fsObjects) ListObjectParts(bucket, object, uploadID string, partNumberM return fs.listObjectParts(bucket, object, uploadID, partNumberMarker, maxParts) } +func (fs fsObjects) totalObjectSize(fsMeta fsMetaV1, parts []completePart) (int64, error) { + objSize := int64(0) + for _, part := range parts { + partIdx := fsMeta.ObjectPartIndex(part.PartNumber) + if partIdx == -1 { + return 0, InvalidPart{} + } + objSize += fsMeta.Parts[partIdx].Size + } + return objSize, nil +} + // CompleteMultipartUpload - completes an ongoing multipart // transaction after receiving all the parts indicated by the client. // Returns an md5sum calculated by concatenating all the individual @@ -668,6 +689,19 @@ func (fs fsObjects) CompleteMultipartUpload(bucket string, object string, upload // Allocate staging buffer.
var buf = make([]byte, readSizeV1) + var objSize int64 + + objSize, err = fs.totalObjectSize(fsMeta, parts) + if err != nil { + return "", traceError(err) + } + if objSize > 0 { + // Prepare file to avoid disk fragmentation + err = fs.storage.PrepareFile(minioMetaBucket, tempObj, objSize) + if err != nil { + return "", traceError(err) + } + } // Loop through all parts, validate them and then commit to disk. for i, part := range parts { diff --git a/cmd/fs-v1-multipart_test.go b/cmd/fs-v1-multipart_test.go index 52299412f..ab4431999 100644 --- a/cmd/fs-v1-multipart_test.go +++ b/cmd/fs-v1-multipart_test.go @@ -90,13 +90,18 @@ func TestPutObjectPartFaultyDisk(t *testing.T) { for i := 1; i <= 7; i++ { // Faulty disk generates errFaultyDisk at 'i' storage api call number fs.storage = newNaughtyDisk(fsStorage, map[int]error{i: errFaultyDisk}, nil) - if _, err := fs.PutObjectPart(bucketName, objectName, uploadID, 1, dataLen, bytes.NewReader(data), md5Hex, sha256sum); errorCause(err) != errFaultyDisk { + md5sum, err := fs.PutObjectPart(bucketName, objectName, uploadID, 1, dataLen, bytes.NewReader(data), md5Hex, sha256sum) + if errorCause(err) != errFaultyDisk { + if errorCause(err) == nil { + t.Fatalf("Test %d shouldn't succeed, md5sum = %s\n", i, md5sum) + } switch i { case 1: if !isSameType(errorCause(err), BucketNotFound{}) { t.Fatal("Unexpected error ", err) } - case 2, 4: + case 3: + case 2, 4, 5: if !isSameType(errorCause(err), InvalidUploadID{}) { t.Fatal("Unexpected error ", err) } diff --git a/cmd/fs-v1.go b/cmd/fs-v1.go index bdfefc080..c41abb3b0 100644 --- a/cmd/fs-v1.go +++ b/cmd/fs-v1.go @@ -388,6 +388,15 @@ func (fs fsObjects) PutObject(bucket string, object string, size int64, data io. 
return ObjectInfo{}, toObjectErr(traceError(err), bucket, object) } } else { + + // Prepare file to avoid disk fragmentation + if size > 0 { + err = fs.storage.PrepareFile(minioMetaBucket, tempObj, size) + if err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + } + // Allocate a buffer to Read() from request body bufSize := int64(readSizeV1) if size > 0 && bufSize > size { diff --git a/cmd/naughty-disk_test.go b/cmd/naughty-disk_test.go index 34c0c68e0..10f81e195 100644 --- a/cmd/naughty-disk_test.go +++ b/cmd/naughty-disk_test.go @@ -108,6 +108,13 @@ func (d *naughtyDisk) ReadFile(volume string, path string, offset int64, buf []b return d.disk.ReadFile(volume, path, offset, buf) } +func (d *naughtyDisk) PrepareFile(volume, path string, length int64) error { + if err := d.calcError(); err != nil { + return err + } + return d.disk.PrepareFile(volume, path, length) +} + func (d *naughtyDisk) AppendFile(volume, path string, buf []byte) error { if err := d.calcError(); err != nil { return err diff --git a/cmd/posix-errors.go b/cmd/posix-errors.go index bbdc60495..337c068ff 100644 --- a/cmd/posix-errors.go +++ b/cmd/posix-errors.go @@ -22,6 +22,26 @@ import ( "syscall" ) +// Function not implemented error +func isSysErrNoSys(err error) bool { + return err != nil && err == syscall.ENOSYS +} + +// Not supported error +func isSysErrOpNotSupported(err error) bool { + return err != nil && err == syscall.EOPNOTSUPP +} + +// No space left on device error +func isSysErrNoSpace(err error) bool { + return err != nil && err == syscall.ENOSPC +} + +// Input/output error +func isSysErrIO(err error) bool { + return err != nil && err == syscall.EIO +} + // Check if the given error corresponds to ENOTDIR (is not a directory) func isSysErrNotDir(err error) bool { if pathErr, ok := err.(*os.PathError); ok { diff --git a/cmd/posix.go b/cmd/posix.go index 9e8424a35..029a9f991 100644 --- a/cmd/posix.go +++ b/cmd/posix.go @@ -546,6 +546,121 @@ func (s *posix) 
ReadFile(volume string, path string, offset int64, buf []byte) ( return int64(m), err } +func (s *posix) createFile(volume, path string) (f *os.File, err error) { + defer func() { + if err == syscall.EIO { + atomic.AddInt32(&s.ioErrCount, 1) + } + }() + + if s.ioErrCount > maxAllowedIOError { + return nil, errFaultyDisk + } + + // Validate if disk is free. + if err = s.checkDiskFree(); err != nil { + return nil, err + } + + volumeDir, err := s.getVolDir(volume) + if err != nil { + return nil, err + } + // Stat a volume entry. + _, err = os.Stat(preparePath(volumeDir)) + if err != nil { + if os.IsNotExist(err) { + return nil, errVolumeNotFound + } + return nil, err + } + + filePath := pathJoin(volumeDir, path) + if err = checkPathLength(filePath); err != nil { + return nil, err + } + + // Verify if the file already exists and is not of regular type. + var st os.FileInfo + if st, err = os.Stat(preparePath(filePath)); err == nil { + if !st.Mode().IsRegular() { + return nil, errIsNotRegular + } + } else { + // Create top level directories if they don't exist. + // with mode 0777 mkdir honors system umask. + if err = mkdirAll(preparePath(slashpath.Dir(filePath)), 0777); err != nil { + // File path cannot be verified since one of the parents is a file. + if isSysErrNotDir(err) { + return nil, errFileAccessDenied + } else if isSysErrPathNotFound(err) { + // Add specific case for windows. + return nil, errFileAccessDenied + } + return nil, err + } + } + + w, err := os.OpenFile(preparePath(filePath), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666) + if err != nil { + // File path cannot be verified since one of the parents is a file. 
+ if isSysErrNotDir(err) { + return nil, errFileAccessDenied + } + return nil, err + } + + return w, nil +} + +// PrepareFile - run prior actions before creating a new file for optimization purposes +// Currently we use fallocate when available to avoid disk fragmentation as much as possible +func (s *posix) PrepareFile(volume, path string, fileSize int64) (err error) { + + // It doesn't make sense to prepare a file with a zero or negative size + if fileSize <= 0 { + return errInvalidArgument + } + + defer func() { + if err == syscall.EIO { + atomic.AddInt32(&s.ioErrCount, 1) + } + }() + + if s.ioErrCount > maxAllowedIOError { + return errFaultyDisk + } + + // Create file if not found + w, err := s.createFile(volume, path) + if err != nil { + return err + } + + // Close upon return. + defer w.Close() + + // Allocate needed disk space to append data + e := Fallocate(int(w.Fd()), 0, fileSize) + + // Ignore errors when Fallocate is not supported in the current system + if e != nil && !isSysErrNoSys(e) && !isSysErrOpNotSupported(e) { + switch { + case isSysErrNoSpace(e): + err = errDiskFull + case isSysErrIO(e): + err = e + default: + // For errors: EBADF, EINTR, EINVAL, ENODEV, EPERM, ESPIPE and ETXTBSY + // The allocation failed anyway, so return an unexpected error + err = errUnexpected + } + return err + } + return nil +} + // AppendFile - append a byte array at path, if file doesn't exist at // path this call explicitly creates it. func (s *posix) AppendFile(volume, path string, buf []byte) (err error) { @@ -559,57 +674,11 @@ func (s *posix) AppendFile(volume, path string, buf []byte) (err error) { return errFaultyDisk } - // Validate if disk is free. - if err = s.checkDiskFree(); err != nil { - return err - } - - volumeDir, err := s.getVolDir(volume) + // Create file if not found + w, err := s.createFile(volume, path) if err != nil { return err } - // Stat a volume entry.
- _, err = os.Stat(preparePath(volumeDir)) - if err != nil { - if os.IsNotExist(err) { - return errVolumeNotFound - } - return err - } - filePath := pathJoin(volumeDir, path) - if err = checkPathLength(filePath); err != nil { - return err - } - // Verify if the file already exists and is not of regular type. - var st os.FileInfo - if st, err = os.Stat(preparePath(filePath)); err == nil { - if !st.Mode().IsRegular() { - return errIsNotRegular - } - } - // Create top level directories if they don't exist. - // with mode 0777 mkdir honors system umask. - if err = mkdirAll(preparePath(slashpath.Dir(filePath)), 0777); err != nil { - // File path cannot be verified since one of the parents is a file. - if isSysErrNotDir(err) { - return errFileAccessDenied - } else if isSysErrPathNotFound(err) { - // Add specific case for windows. - return errFileAccessDenied - } - return err - } - - // Creates the named file with mode 0666 (before umask), or starts appending - // to an existig file. - w, err := os.OpenFile(preparePath(filePath), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666) - if err != nil { - // File path cannot be verified since one of the parents is a file. - if isSysErrNotDir(err) { - return errFileAccessDenied - } - return err - } // Close upon return. defer w.Close() diff --git a/cmd/posix_test.go b/cmd/posix_test.go index 470473e86..2c55f0c7f 100644 --- a/cmd/posix_test.go +++ b/cmd/posix_test.go @@ -1126,6 +1126,100 @@ func TestAppendFile(t *testing.T) { } } +// Test posix.PrepareFile() +func TestPrepareFile(t *testing.T) { + // create posix test setup + posixStorage, path, err := newPosixTestSetup() + if err != nil { + t.Fatalf("Unable to create posix test setup, %s", err) + } + defer removeAll(path) + + // Setup test environment. 
+ if err = posixStorage.MakeVol("success-vol"); err != nil { + t.Fatalf("Unable to create volume, %s", err) + } + + if err = os.Mkdir(slashpath.Join(path, "success-vol", "object-as-dir"), 0777); err != nil { + t.Fatalf("Unable to create directory, %s", err) + } + + testCases := []struct { + fileName string + expectedErr error + }{ + {"myobject", nil}, + {"path/to/my/object", nil}, + // Test to append to previously created file. + {"myobject", nil}, + // Test to use same path of previously created file. + {"path/to/my/testobject", nil}, + {"object-as-dir", errIsNotRegular}, + // path segment uses previously uploaded object. + {"myobject/testobject", errFileAccessDenied}, + // One path segment length is > 255 chars long. + {"path/to/my/object0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001", errFileNameTooLong}, + } + + // Add path length > 1024 test specially as OS X system does not support 1024 long path. + err = errFileNameTooLong + if runtime.GOOS != "darwin" { + err = nil + } + // path length is 1024 chars long. 
+ testCases = append(testCases, struct { + fileName string + expectedErr error + }{"level0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001/level0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002/level0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003/object000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001", err}) + + for _, testCase := range testCases { + if err = posixStorage.PrepareFile("success-vol", testCase.fileName, 16); err != testCase.expectedErr { + t.Errorf("Case: %s, expected: %s, got: %s", testCase, testCase.expectedErr, err) + } + } + + // Test for permission denied. + if runtime.GOOS != "windows" { + // Initialize posix storage layer for permission denied error. + posixStorage, err = newPosix("/usr") + if err != nil { + t.Fatalf("Unable to initialize posix, %s", err) + } + + if err = posixStorage.PrepareFile("bin", "yes", 16); !os.IsPermission(err) { + t.Errorf("expected: Permission error, got: %s", err) + } + } + + // Test case with invalid file size which should be strictly positive + err = posixStorage.PrepareFile("bn", "yes", -3) + if err != errInvalidArgument { + t.Fatalf("should fail: %v", err) + } + + // Test case with invalid volume name. 
+ // A valid volume name should be at least 3 characters long. + err = posixStorage.PrepareFile("bn", "yes", 16) + if err != errInvalidArgument { + t.Fatalf("expected: \"Invalid argument error\", got: \"%s\"", err) + } + + // Test case with IO error count > max limit. + + // setting ioErrCount to 6. + // should fail with errFaultyDisk. + if posixType, ok := posixStorage.(*posix); ok { + // setting the io error count as specified in the test case. + posixType.ioErrCount = int32(6) + err = posixType.PrepareFile("abc", "yes", 16) + if err != errFaultyDisk { + t.Fatalf("Expected \"Faulty Disk\", got: \"%s\"", err) + } + } else { + t.Fatalf("Expected the StorageAPI to be of type *posix") + } +} + // Test posix.RenameFile() func TestRenameFile(t *testing.T) { // create posix test setup diff --git a/cmd/storage-interface.go b/cmd/storage-interface.go index ee82d1d9c..16eb7ec8a 100644 --- a/cmd/storage-interface.go +++ b/cmd/storage-interface.go @@ -35,6 +35,7 @@ type StorageAPI interface { // File operations. ListDir(volume, dirPath string) ([]string, error) ReadFile(volume string, path string, offset int64, buf []byte) (n int64, err error) + PrepareFile(volume string, path string, len int64) (err error) AppendFile(volume string, path string, buf []byte) (err error) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) error StatFile(volume string, path string) (file FileInfo, err error) diff --git a/cmd/storage-rpc-client.go b/cmd/storage-rpc-client.go index 0e75a1f87..311a38a24 100644 --- a/cmd/storage-rpc-client.go +++ b/cmd/storage-rpc-client.go @@ -186,6 +186,18 @@ func (n networkStorage) DeleteVol(volume string) error { // File operations.
+func (n networkStorage) PrepareFile(volume, path string, length int64) (err error) { + reply := GenericReply{} + if err = n.rpcClient.Call("Storage.PrepareFileHandler", &PrepareFileArgs{ + Vol: volume, + Path: path, + Size: length, + }, &reply); err != nil { + return toStorageErr(err) + } + return nil +} + // CreateFile - create file. func (n networkStorage) AppendFile(volume, path string, buffer []byte) (err error) { reply := GenericReply{} diff --git a/cmd/storage-rpc-client_test.go b/cmd/storage-rpc-client_test.go index 7a4c61b9d..f1d84f0cc 100644 --- a/cmd/storage-rpc-client_test.go +++ b/cmd/storage-rpc-client_test.go @@ -243,6 +243,10 @@ func (s *TestRPCStorageSuite) testRPCStorageFileOps(t *testing.T) { if err != nil { t.Error("Unable to initiate MakeVol", err) } + err = storageDisk.PrepareFile("myvol", "file1", int64(len([]byte("Hello, world")))) + if err != nil { + t.Error("Unable to initiate AppendFile", err) + } err = storageDisk.AppendFile("myvol", "file1", []byte("Hello, world")) if err != nil { t.Error("Unable to initiate AppendFile", err) diff --git a/cmd/storage-rpc-server-datatypes.go b/cmd/storage-rpc-server-datatypes.go index 58fa36581..f0b75d153 100644 --- a/cmd/storage-rpc-server-datatypes.go +++ b/cmd/storage-rpc-server-datatypes.go @@ -61,6 +61,21 @@ type ReadFileArgs struct { Size int } +// PrepareFileArgs represents prepare file RPC arguments. +type PrepareFileArgs struct { + // Authentication token generated by Login. + GenericArgs + + // Name of the volume. + Vol string + + // Name of the path. + Path string + + // Size of the file to be prepared + Size int64 +} + // AppendFileArgs represents append file RPC arguments. type AppendFileArgs struct { // Authentication token generated by Login.
diff --git a/cmd/storage-rpc-server.go b/cmd/storage-rpc-server.go index a8d563de5..6c81b7dd2 100644 --- a/cmd/storage-rpc-server.go +++ b/cmd/storage-rpc-server.go @@ -180,6 +180,14 @@ func (s *storageServer) ReadFileHandler(args *ReadFileArgs, reply *[]byte) (err return err } +// PrepareFileHandler - prepare file handler is rpc wrapper to prepare file. +func (s *storageServer) PrepareFileHandler(args *PrepareFileArgs, reply *GenericReply) error { + if !isRPCTokenValid(args.Token) { + return errInvalidToken + } + return s.storage.PrepareFile(args.Vol, args.Path, args.Size) +} + // AppendFileHandler - append file handler is rpc wrapper to append file. func (s *storageServer) AppendFileHandler(args *AppendFileArgs, reply *GenericReply) error { if !isRPCTokenValid(args.Token) { diff --git a/cmd/xl-v1-common.go b/cmd/xl-v1-common.go index 7c04390a7..433322a1d 100644 --- a/cmd/xl-v1-common.go +++ b/cmd/xl-v1-common.go @@ -69,3 +69,16 @@ func (xl xlObjects) isObject(bucket, prefix string) (ok bool) { } // Exhausted all disks - return false. return false } + +// Calculate the space occupied by an object in a single disk +func (xl xlObjects) sizeOnDisk(fileSize int64, blockSize int64, dataBlocks int) int64 { + numBlocks := fileSize / blockSize + chunkSize := getChunkSize(blockSize, dataBlocks) + sizeInDisk := numBlocks * chunkSize + remaining := fileSize % blockSize + if remaining > 0 { + sizeInDisk += getChunkSize(remaining, dataBlocks) + } + + return sizeInDisk +} diff --git a/cmd/xl-v1-multipart.go b/cmd/xl-v1-multipart.go index 82b2b6d73..7bfe04234 100644 --- a/cmd/xl-v1-multipart.go +++ b/cmd/xl-v1-multipart.go @@ -424,6 +424,15 @@ func (xl xlObjects) PutObjectPart(bucket, object, uploadID string, partID int, s // Delete the temporary object part. If PutObjectPart succeeds there would be nothing to delete. 
defer xl.deleteObject(minioMetaBucket, tmpPartPath) + if size > 0 { + for _, disk := range onlineDisks { + if disk != nil { + actualSize := xl.sizeOnDisk(size, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks) + disk.PrepareFile(minioMetaBucket, tmpPartPath, actualSize) + } + } + } + // Erasure code data and write across all disks. sizeWritten, checkSums, err := erasureCreateFile(onlineDisks, minioMetaBucket, tmpPartPath, teeReader, xlMeta.Erasure.BlockSize, xl.dataBlocks, xl.parityBlocks, bitRotAlgo, xl.writeQuorum) if err != nil { diff --git a/cmd/xl-v1-object.go b/cmd/xl-v1-object.go index 450a5bc28..6f0103173 100644 --- a/cmd/xl-v1-object.go +++ b/cmd/xl-v1-object.go @@ -448,6 +448,15 @@ func (xl xlObjects) PutObject(bucket string, object string, size int64, data io. // delete. defer xl.deleteObject(minioMetaTmpBucket, tempObj) + if size > 0 { + for _, disk := range onlineDisks { + if disk != nil { + actualSize := xl.sizeOnDisk(size, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks) + disk.PrepareFile(minioMetaBucket, tempErasureObj, actualSize) + } + } + } + // Erasure code data and write across all disks. sizeWritten, checkSums, err := erasureCreateFile(onlineDisks, minioMetaBucket, tempErasureObj, teeReader, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks, xlMeta.Erasure.ParityBlocks, bitRotAlgo, xl.writeQuorum) if err != nil {