From 3a2f89b3c05a39c04b72ed2c879a2d2f570db42b Mon Sep 17 00:00:00 2001 From: Krishna Srinivas <634494+krishnasrinivas@users.noreply.github.com> Date: Fri, 30 Oct 2020 11:04:29 -0700 Subject: [PATCH] fix: add support for O_DIRECT reads for erasure backends (#10718) --- cmd/config/storageclass/storage-class.go | 47 ++++++++++- cmd/xl-storage.go | 74 +++++++++++++++++ cmd/xl-storage_test.go | 100 +++++++++++++---------- 3 files changed, 173 insertions(+), 48 deletions(-) diff --git a/cmd/config/storageclass/storage-class.go b/cmd/config/storageclass/storage-class.go index e1465f946..d026bdf4e 100644 --- a/cmd/config/storageclass/storage-class.go +++ b/cmd/config/storageclass/storage-class.go @@ -18,6 +18,7 @@ package storageclass import ( "encoding/json" + "errors" "fmt" "strconv" "strings" @@ -32,17 +33,26 @@ const ( RRS = "REDUCED_REDUNDANCY" // Standard storage class STANDARD = "STANDARD" + // DMA storage class + DMA = "DMA" + + // Valid values are "write" and "read-write" + DMAWrite = "write" + DMAReadWrite = "read-write" ) // Standard constats for config info storage class const ( ClassStandard = "standard" ClassRRS = "rrs" + ClassDMA = "dma" // Reduced redundancy storage class environment variable RRSEnv = "MINIO_STORAGE_CLASS_RRS" // Standard storage class environment variable StandardEnv = "MINIO_STORAGE_CLASS_STANDARD" + // DMA storage class environment variable + DMAEnv = "MINIO_STORAGE_CLASS_DMA" // Supported storage class scheme is EC schemePrefix = "EC" @@ -52,6 +62,9 @@ const ( // Default RRS parity is always minimum parity. defaultRRSParity = minParityDisks + + // Default DMA value + defaultDMA = DMAWrite ) // DefaultKVS - default storage class config @@ -65,18 +78,24 @@ var ( Key: ClassRRS, Value: "EC:2", }, + config.KV{ + Key: ClassDMA, + Value: defaultDMA, + }, } ) // StorageClass - holds storage class information type StorageClass struct { Parity int + DMA string } // Config storage class configuration type Config struct { Standard StorageClass `json:"standard"` RRS StorageClass `json:"rrs"` + DMA StorageClass `json:"dma"` } // UnmarshalJSON - Validate SS and RRS parity when unmarshalling JSON. @@ -93,7 +112,7 @@ func (sCfg *Config) UnmarshalJSON(data []byte) error { // IsValid - returns true if input string is a valid // storage class kind supported. func IsValid(sc string) bool { - return sc == RRS || sc == STANDARD + return sc == RRS || sc == STANDARD || sc == DMA } // UnmarshalText unmarshals storage class from its textual form into @@ -103,6 +122,14 @@ func (sc *StorageClass) UnmarshalText(b []byte) error { if scStr == "" { return nil } + if scStr == DMAWrite { + sc.DMA = DMAWrite + return nil + } + if scStr == DMAReadWrite { + sc.DMA = DMAReadWrite + return nil + } s, err := parseStorageClass(scStr) if err != nil { return err @@ -116,14 +143,14 @@ func (sc *StorageClass) MarshalText() ([]byte, error) { if sc.Parity != 0 { return []byte(fmt.Sprintf("%s:%d", schemePrefix, sc.Parity)), nil } - return []byte(""), nil + return []byte(sc.DMA), nil } func (sc *StorageClass) String() string { if sc.Parity != 0 { return fmt.Sprintf("%s:%d", schemePrefix, sc.Parity) } - return "" + return sc.DMA } // Parses given storageClassEnv and returns a storageClass structure. @@ -212,6 +239,11 @@ func (sCfg Config) GetParityForSC(sc string) (parity int) { } } +// GetDMA - returns DMA configuration. +func (sCfg Config) GetDMA() string { + return sCfg.DMA.DMA +} + // Enabled returns if etcd is enabled. func Enabled(kvs config.KVS) bool { ssc := kvs.Get(ClassStandard) @@ -231,6 +263,7 @@ func LookupConfig(kvs config.KVS, setDriveCount int) (cfg Config, err error) { ssc := env.Get(StandardEnv, kvs.Get(ClassStandard)) rrsc := env.Get(RRSEnv, kvs.Get(ClassRRS)) + dma := env.Get(DMAEnv, kvs.Get(ClassDMA)) // Check for environment variables and parse into storageClass struct if ssc != "" { cfg.Standard, err = parseStorageClass(ssc) @@ -252,6 +285,14 @@ func LookupConfig(kvs config.KVS, setDriveCount int) (cfg Config, err error) { cfg.RRS.Parity = defaultRRSParity } + if dma == "" { + dma = defaultDMA + } + if dma != DMAReadWrite && dma != DMAWrite { + return Config{}, errors.New(`valid dma values are "read-write" and "write"`) + } + cfg.DMA.DMA = dma + // Validation is done after parsing both the storage classes. This is needed because we need one // storage class value to deduce the correct value of the other storage class. if err = validateParity(cfg.Standard.Parity, cfg.RRS.Parity, setDriveCount); err != nil { diff --git a/cmd/xl-storage.go b/cmd/xl-storage.go index e72ed03a7..90d83992e 100644 --- a/cmd/xl-storage.go +++ b/cmd/xl-storage.go @@ -42,6 +42,7 @@ import ( jsoniter "github.com/json-iterator/go" "github.com/klauspost/readahead" "github.com/minio/minio/cmd/config" + "github.com/minio/minio/cmd/config/storageclass" "github.com/minio/minio/cmd/logger" "github.com/minio/minio/pkg/disk" "github.com/minio/minio/pkg/env" @@ -1584,6 +1585,56 @@ func (s *xlStorage) openFile(volume, path string, mode int) (f *os.File, err err return w, nil } +// To support O_DIRECT reads for erasure backends. +type odirectreader struct { + f *os.File + buf []byte + bufp *[]byte + freshRead bool + s *xlStorage + err error +} + +// Read - Implements Reader interface. +func (o *odirectreader) Read(buf []byte) (n int, err error) { + if o.err != nil { + return 0, o.err + } + if o.buf == nil { + o.bufp = o.s.pool.Get().(*[]byte) + } + if o.freshRead { + o.buf = *o.bufp + n, err = o.f.Read(o.buf) + if err != nil && err != io.EOF { + o.err = err + return n, err + } + if n == 0 { + // err is io.EOF + o.err = err + return n, err + } + o.buf = o.buf[:n] + o.freshRead = false + } + if len(buf) >= len(o.buf) { + n = copy(buf, o.buf) + o.freshRead = true + return n, nil + } + n = copy(buf, o.buf) + o.buf = o.buf[n:] + return n, nil +} + +// Close - Release the buffer and close the file. +func (o *odirectreader) Close() error { + o.s.pool.Put(o.bufp) + atomic.AddInt32(&o.s.activeIOCount, -1) + return o.f.Close() +} + // ReadFileStream - Returns the read stream of the file. func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, offset, length int64) (io.ReadCloser, error) { if offset < 0 { @@ -1611,6 +1662,29 @@ func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, off return nil, err } + if offset == 0 && globalStorageClass.GetDMA() == storageclass.DMAReadWrite { + file, err := disk.OpenFileDirectIO(filePath, os.O_RDONLY, 0666) + if err != nil { + switch { + case os.IsNotExist(err): + return nil, errFileNotFound + case os.IsPermission(err): + return nil, errFileAccessDenied + case isSysErrNotDir(err): + return nil, errFileAccessDenied + case isSysErrIO(err): + return nil, errFaultyDisk + case isSysErrTooManyFiles(err): + return nil, errTooManyOpenFiles + default: + return nil, err + } + } + + atomic.AddInt32(&s.activeIOCount, 1) + return &odirectreader{file, nil, nil, true, s, nil}, nil + } + // Open the file for reading. file, err := os.Open(filePath) if err != nil { diff --git a/cmd/xl-storage_test.go b/cmd/xl-storage_test.go index 937921b66..5ab1317fa 100644 --- a/cmd/xl-storage_test.go +++ b/cmd/xl-storage_test.go @@ -30,6 +30,7 @@ import ( "syscall" "testing" + "github.com/minio/minio/cmd/config/storageclass" "github.com/minio/minio/pkg/disk" ) @@ -1063,62 +1064,71 @@ func TestXLStorageReadFile(t *testing.T) { } } - // Following block validates all ReadFile test cases. - for i, testCase := range testCases { - var n int64 - // Common read buffer. - var buf = make([]byte, testCase.bufSize) - n, err = xlStorage.ReadFile(context.Background(), testCase.volume, testCase.fileName, testCase.offset, buf, v) - if err != nil && testCase.expectedErr != nil { - // Validate if the type string of the errors are an exact match. - if err.Error() != testCase.expectedErr.Error() { - if runtime.GOOS != globalWindowsOSName { - t.Errorf("Case: %d %#v, expected: %s, got: %s", i+1, testCase, testCase.expectedErr, err) - } else { - var resultErrno, expectErrno uintptr - if pathErr, ok := err.(*os.PathError); ok { - if errno, pok := pathErr.Err.(syscall.Errno); pok { - resultErrno = uintptr(errno) - } - } - if pathErr, ok := testCase.expectedErr.(*os.PathError); ok { - if errno, pok := pathErr.Err.(syscall.Errno); pok { - expectErrno = uintptr(errno) - } - } - if !(expectErrno != 0 && resultErrno != 0 && expectErrno == resultErrno) { + for l := 0; l < 2; l++ { + // 1st loop tests with dma=write, 2nd loop tests with dma=read-write. + if l == 1 { + globalStorageClass.DMA.DMA = storageclass.DMAReadWrite + } + // Following block validates all ReadFile test cases. + for i, testCase := range testCases { + var n int64 + // Common read buffer. + var buf = make([]byte, testCase.bufSize) + n, err = xlStorage.ReadFile(context.Background(), testCase.volume, testCase.fileName, testCase.offset, buf, v) + if err != nil && testCase.expectedErr != nil { + // Validate if the type string of the errors are an exact match. + if err.Error() != testCase.expectedErr.Error() { + if runtime.GOOS != globalWindowsOSName { t.Errorf("Case: %d %#v, expected: %s, got: %s", i+1, testCase, testCase.expectedErr, err) + } else { + var resultErrno, expectErrno uintptr + if pathErr, ok := err.(*os.PathError); ok { + if errno, pok := pathErr.Err.(syscall.Errno); pok { + resultErrno = uintptr(errno) + } + } + if pathErr, ok := testCase.expectedErr.(*os.PathError); ok { + if errno, pok := pathErr.Err.(syscall.Errno); pok { + expectErrno = uintptr(errno) + } + } + if !(expectErrno != 0 && resultErrno != 0 && expectErrno == resultErrno) { + t.Errorf("Case: %d %#v, expected: %s, got: %s", i+1, testCase, testCase.expectedErr, err) + } + } + } + // Err unexpected EOF special case, where we verify we have provided a larger + // buffer than the data itself, but the results are in-fact valid. So we validate + // this error condition specifically treating it as a good condition with valid + // results. In this scenario return 'n' is always lesser than the input buffer. + if err == io.ErrUnexpectedEOF { + if !bytes.Equal(testCase.expectedBuf, buf[:n]) { + t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:n])) + } + if n > int64(len(buf)) { + t.Errorf("Case: %d %#v, expected: %d, got: %d", i+1, testCase, testCase.bufSize, n) } } } - // Err unexpected EOF special case, where we verify we have provided a larger - // buffer than the data itself, but the results are in-fact valid. So we validate - // this error condition specifically treating it as a good condition with valid - // results. In this scenario return 'n' is always lesser than the input buffer. - if err == io.ErrUnexpectedEOF { - if !bytes.Equal(testCase.expectedBuf, buf[:n]) { - t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:n])) + // ReadFile has returned success, but our expected error is non 'nil'. + if err == nil && err != testCase.expectedErr { + t.Errorf("Case: %d %#v, expected: %s, got :%s", i+1, testCase, testCase.expectedErr, err) + } + // Expected error retured, proceed further to validate the returned results. + if err == nil && err == testCase.expectedErr { + if !bytes.Equal(testCase.expectedBuf, buf) { + t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:testCase.bufSize])) } - if n > int64(len(buf)) { + if n != int64(testCase.bufSize) { t.Errorf("Case: %d %#v, expected: %d, got: %d", i+1, testCase, testCase.bufSize, n) } } } - // ReadFile has returned success, but our expected error is non 'nil'. - if err == nil && err != testCase.expectedErr { - t.Errorf("Case: %d %#v, expected: %s, got :%s", i+1, testCase, testCase.expectedErr, err) - } - // Expected error retured, proceed further to validate the returned results. - if err == nil && err == testCase.expectedErr { - if !bytes.Equal(testCase.expectedBuf, buf) { - t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:testCase.bufSize])) - } - if n != int64(testCase.bufSize) { - t.Errorf("Case: %d %#v, expected: %d, got: %d", i+1, testCase, testCase.bufSize, n) - } - } } + // Reset the flag. + globalStorageClass.DMA.DMA = storageclass.DMAWrite + // TestXLStorage for permission denied. if runtime.GOOS != globalWindowsOSName { permDeniedDir := createPermDeniedFile(t)