From a3ec71bc28e68764642d6ef5d82d9d56d4a36d1e Mon Sep 17 00:00:00 2001 From: Krishna Srinivas <634494+krishnasrinivas@users.noreply.github.com> Date: Tue, 23 Apr 2019 21:25:06 -0700 Subject: [PATCH] Use O_DIRECT while writing to disk (#7479) - Use O_DIRECT while writing to disk - Remove MINIO_DRIVE_SYNC option --- cmd/posix.go | 139 ++++++++++++++++++++++++------- go.mod | 1 + go.sum | 2 + pkg/disk/directio_darwin.go | 36 ++++++++ pkg/disk/directio_unix.go | 44 ++++++++++ pkg/disk/directio_unsupported.go | 35 ++++++++ 6 files changed, 226 insertions(+), 31 deletions(-) create mode 100644 pkg/disk/directio_darwin.go create mode 100644 pkg/disk/directio_unix.go create mode 100644 pkg/disk/directio_unsupported.go diff --git a/cmd/posix.go b/cmd/posix.go index 12af7e1a6..86ccdb386 100644 --- a/cmd/posix.go +++ b/cmd/posix.go @@ -37,12 +37,14 @@ import ( "github.com/minio/minio/cmd/logger" "github.com/minio/minio/pkg/disk" "github.com/minio/minio/pkg/mountinfo" + "github.com/ncw/directio" ) const ( - diskMinFreeSpace = 900 * humanize.MiByte // Min 900MiB free space. - diskMinTotalSpace = diskMinFreeSpace // Min 900MiB total space. - maxAllowedIOError = 5 + diskMinFreeSpace = 900 * humanize.MiByte // Min 900MiB free space. + diskMinTotalSpace = diskMinFreeSpace // Min 900MiB total space. + maxAllowedIOError = 5 + posixWriteBlockSize = 4 * humanize.MiByte ) // isValidVolname verifies a volname name in accordance with object @@ -71,7 +73,6 @@ type posix struct { connected bool diskMount bool // indicates if the path is an actual mount. - driveSync bool // indicates if the backend is synchronous. diskFileInfo os.FileInfo // Disk usage metrics @@ -188,7 +189,7 @@ func newPosix(path string) (*posix, error) { // 1MiB buffer pool for posix internal operations. pool: sync.Pool{ New: func() interface{} { - b := make([]byte, readSizeV1) + b := directio.AlignedBlock(posixWriteBlockSize) return &b }, }, @@ -197,15 +198,6 @@ func newPosix(path string) (*posix, error) { diskMount: mountinfo.IsLikelyMountPoint(path), } - var pf BoolFlag - if driveSync := os.Getenv("MINIO_DRIVE_SYNC"); driveSync != "" { - pf, err = ParseBoolFlag(driveSync) - if err != nil { - return nil, err - } - p.driveSync = bool(pf) - } - if !p.diskMount { go p.diskUsage(GlobalServiceDoneCh) } @@ -1057,13 +1049,49 @@ func (s *posix) CreateFile(volume, path string, fileSize int64, r io.Reader) (er return err } - // Create file if not found. Note that it is created with os.O_EXCL flag as the file - // always is supposed to be created in the tmp directory with a unique file name. - w, err := s.openFile(volume, path, os.O_CREATE|os.O_APPEND|os.O_WRONLY|os.O_EXCL) - if err != nil { + if err = s.checkDiskFound(); err != nil { return err } + volumeDir, err := s.getVolDir(volume) + if err != nil { + return err + } + // Stat a volume entry. + _, err = os.Stat((volumeDir)) + if err != nil { + if os.IsNotExist(err) { + return errVolumeNotFound + } else if isSysErrIO(err) { + return errFaultyDisk + } + return err + } + + filePath := pathJoin(volumeDir, path) + if err = checkPathLength((filePath)); err != nil { + return err + } + + // Create top level directories if they don't exist. + // with mode 0777 mkdir honors system umask. + if err = mkdirAll(slashpath.Dir(filePath), 0777); err != nil { + return err + } + + w, err := disk.OpenFileDirectIO(filePath, os.O_CREATE|os.O_WRONLY|os.O_EXCL|os.O_SYNC, 0666) + if err != nil { + switch { + case os.IsPermission(err): + return errFileAccessDenied + case os.IsExist(err): + return errFileAccessDenied + case isSysErrIO(err): + return errFaultyDisk + default: + return err + } + } defer w.Close() var e error @@ -1090,16 +1118,69 @@ func (s *posix) CreateFile(volume, path string, fileSize int64, r io.Reader) (er bufp := s.pool.Get().(*[]byte) defer s.pool.Put(bufp) - n, err := io.CopyBuffer(w, r, *bufp) - if err != nil { - return err + buf := *bufp + var written int64 + dioCount := int(fileSize) / len(buf) + for i := 0; i < dioCount; i++ { + var n int + _, err = io.ReadFull(r, buf) + if err != nil { + return err + } + n, err = w.Write(buf) + if err != nil { + return err + } + written += int64(n) } - if n < fileSize { - return errLessData + // The following logic writes the remainging data such that it writes whatever best is possible (aligned buffer) + // in O_DIRECT mode and remaining (unaligned buffer) in non-O_DIRECT mode. + remaining := fileSize % int64(len(buf)) + if remaining != 0 { + buf = buf[:remaining] + _, err = io.ReadFull(r, buf) + if err != nil { + return err + } + remainingAligned := (remaining / directio.AlignSize) * directio.AlignSize + remainingAlignedBuf := buf[:remainingAligned] + remainingUnalignedBuf := buf[remainingAligned:] + if len(remainingAlignedBuf) > 0 { + var n int + n, err = w.Write(remainingAlignedBuf) + if err != nil { + return err + } + written += int64(n) + } + if len(remainingUnalignedBuf) > 0 { + var n int + // Write on O_DIRECT fds fail if buffer is not 4K aligned, hence disable O_DIRECT. + if err = disk.DisableDirectIO(w); err != nil { + return err + } + n, err = w.Write(remainingUnalignedBuf) + if err != nil { + return err + } + written += int64(n) + } } - if n > fileSize { + + // Do some sanity checks. + _, err = io.ReadFull(r, buf) + if err != io.EOF { return errMoreData } + + if written < fileSize { + return errLessData + } + + if written > fileSize { + return errMoreData + } + return nil } @@ -1142,13 +1223,9 @@ func (s *posix) AppendFile(volume, path string, buf []byte) (err error) { } var w *os.File - // Create file if not found, additionally also enables synchronous - // operation if asked by the user. - if s.driveSync { - w, err = s.openFile(volume, path, os.O_CREATE|os.O_SYNC|os.O_APPEND|os.O_WRONLY) - } else { - w, err = s.openFile(volume, path, os.O_CREATE|os.O_APPEND|os.O_WRONLY) - } + // Create file if not found. Not doing O_DIRECT here to avoid the code that does buffer aligned writes. + // AppendFile() is only used by healing code to heal objects written in old format. + w, err = s.openFile(volume, path, os.O_CREATE|os.O_SYNC|os.O_APPEND|os.O_WRONLY) if err != nil { return err } diff --git a/go.mod b/go.mod index c0cee249d..00fbc391a 100644 --- a/go.mod +++ b/go.mod @@ -69,6 +69,7 @@ require ( github.com/nats-io/nats v1.7.2 github.com/nats-io/nkeys v0.0.2 // indirect github.com/nats-io/nuid v1.0.1 // indirect + github.com/ncw/directio v1.0.5 github.com/nsqio/go-nsq v1.0.7 github.com/pascaldekloe/goe v0.1.0 // indirect github.com/pkg/errors v0.8.1 // indirect diff --git a/go.sum b/go.sum index 4991ee865..733e182d3 100644 --- a/go.sum +++ b/go.sum @@ -432,6 +432,8 @@ github.com/nats-io/nuid v1.0.0 h1:44QGdhbiANq8ZCbUkdn6W5bqtg+mHuDE4wOUuxxndFs= github.com/nats-io/nuid v1.0.0/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= +github.com/ncw/directio v1.0.5 h1:JSUBhdjEvVaJvOoyPAbcW0fnd0tvRXD76wEfZ1KcQz4= +github.com/ncw/directio v1.0.5/go.mod h1:rX/pKEYkOXBGOggmcyJeJGloCkleSvphPx2eV3t6ROk= github.com/nsqio/go-nsq v0.0.0-20181028195256-0527e80f3ba5/go.mod h1:XP5zaUs3pqf+Q71EqUJs3HYfBIqfK6G83WQMdNN+Ito= github.com/nsqio/go-nsq v1.0.7 h1:O0pIZJYTf+x7cZBA0UMY8WxFG79lYTURmWzAAh48ljY= github.com/nsqio/go-nsq v1.0.7/go.mod h1:XP5zaUs3pqf+Q71EqUJs3HYfBIqfK6G83WQMdNN+Ito= diff --git a/pkg/disk/directio_darwin.go b/pkg/disk/directio_darwin.go new file mode 100644 index 000000000..9c3ede6d5 --- /dev/null +++ b/pkg/disk/directio_darwin.go @@ -0,0 +1,36 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package disk + +import ( + "os" + + "github.com/ncw/directio" + "golang.org/x/sys/unix" +) + +// OpenFileDirectIO - bypass kernel cache. +func OpenFileDirectIO(filePath string, flag int, perm os.FileMode) (*os.File, error) { + return directio.OpenFile(filePath, flag, perm) +} + +// DisableDirectIO - disables directio mode. +func DisableDirectIO(f *os.File) error { + fd := f.Fd() + _, err := unix.FcntlInt(fd, unix.F_NOCACHE, 0) + return err +} diff --git a/pkg/disk/directio_unix.go b/pkg/disk/directio_unix.go new file mode 100644 index 000000000..b5be409cc --- /dev/null +++ b/pkg/disk/directio_unix.go @@ -0,0 +1,44 @@ +// +build linux netbsd freebsd + +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package disk + +import ( + "os" + "syscall" + + "github.com/ncw/directio" + "golang.org/x/sys/unix" +) + +// OpenFileDirectIO - bypass kernel cache. +func OpenFileDirectIO(filePath string, flag int, perm os.FileMode) (*os.File, error) { + return directio.OpenFile(filePath, flag, perm) +} + +// DisableDirectIO - disables directio mode. +func DisableDirectIO(f *os.File) error { + fd := f.Fd() + flag, err := unix.FcntlInt(fd, unix.F_GETFL, 0) + if err != nil { + return err + } + flag = flag & ^(syscall.O_DIRECT) + _, err = unix.FcntlInt(fd, unix.F_SETFL, flag) + return err +} diff --git a/pkg/disk/directio_unsupported.go b/pkg/disk/directio_unsupported.go new file mode 100644 index 000000000..3c6342b51 --- /dev/null +++ b/pkg/disk/directio_unsupported.go @@ -0,0 +1,35 @@ +// +build !linux,!netbsd,!freebsd,!darwin + +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package disk + +import ( + "os" +) + +// OpenBSD and Windows not supported. +// On OpenBSD O_DIRECT is not supported +// On Windows there is no documentation on disabling O_DIRECT + +func OpenFileDirectIO(filePath string, flag int, perm os.FileMode) (*os.File, error) { + return os.OpenFile(filePath, flag, perm) +} + +func DisableDirectIO(f *os.File) error { + return nil +}