[feat]: use DSYNC for xl.meta writes and NOATIME for reads (#11615)

Instead of using O_SYNC, we are better off using O_DSYNC
instead since we are only ever interested in data to be
persisted to disk not the associated filesystem metadata.

For reads we ask customers to turn off noatime, but instead
we can proactively use O_NOATIME flag to avoid atime updates
upon reads.
This commit is contained in:
Harshavardhana 2021-02-24 00:14:16 -08:00 committed by GitHub
parent 14aef52004
commit b517c791e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 278 additions and 74 deletions

View File

@ -20,7 +20,6 @@ import (
"context"
"io"
"io/ioutil"
"log"
"os"
"testing"
)
@ -28,7 +27,7 @@ import (
func testBitrotReaderWriterAlgo(t *testing.T, bitrotAlgo BitrotAlgorithm) {
tmpDir, err := ioutil.TempDir("", "")
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
defer os.RemoveAll(tmpDir)
@ -46,35 +45,35 @@ func testBitrotReaderWriterAlgo(t *testing.T, bitrotAlgo BitrotAlgorithm) {
_, err = writer.Write([]byte("aaaaaaaaaa"))
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
_, err = writer.Write([]byte("aaaaaaaaaa"))
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
_, err = writer.Write([]byte("aaaaaaaaaa"))
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
_, err = writer.Write([]byte("aaaaa"))
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
writer.(io.Closer).Close()
reader := newBitrotReader(disk, nil, volume, filePath, 35, bitrotAlgo, bitrotWriterSum(writer), 10)
b := make([]byte, 10)
if _, err = reader.ReadAt(b, 0); err != nil {
log.Fatal(err)
t.Fatal(err)
}
if _, err = reader.ReadAt(b, 10); err != nil {
log.Fatal(err)
t.Fatal(err)
}
if _, err = reader.ReadAt(b, 20); err != nil {
log.Fatal(err)
t.Fatal(err)
}
if _, err = reader.ReadAt(b[:5], 30); err != nil {
log.Fatal(err)
t.Fatal(err)
}
}

View File

@ -31,6 +31,7 @@ import (
"github.com/minio/minio/cmd/config/storageclass"
"github.com/minio/minio/cmd/logger"
"github.com/minio/minio/pkg/color"
xioutil "github.com/minio/minio/pkg/ioutil"
"github.com/minio/minio/pkg/sync/errgroup"
sha256 "github.com/minio/sha256-simd"
)
@ -156,7 +157,7 @@ func newFormatErasureV3(numSets int, setLen int) *formatErasureV3 {
// successfully the version only if the backend is Erasure.
func formatGetBackendErasureVersion(formatPath string) (string, error) {
meta := &formatMetaV1{}
b, err := ioutil.ReadFile(formatPath)
b, err := xioutil.ReadFile(formatPath)
if err != nil {
return "", err
}
@ -218,7 +219,7 @@ func formatErasureMigrateV1ToV2(export, version string) error {
formatPath := pathJoin(export, minioMetaBucket, formatConfigFile)
formatV1 := &formatErasureV1{}
b, err := ioutil.ReadFile(formatPath)
b, err := xioutil.ReadFile(formatPath)
if err != nil {
return err
}
@ -251,7 +252,7 @@ func formatErasureMigrateV2ToV3(export, version string) error {
formatPath := pathJoin(export, minioMetaBucket, formatConfigFile)
formatV2 := &formatErasureV2{}
b, err := ioutil.ReadFile(formatPath)
b, err := xioutil.ReadFile(formatPath)
if err != nil {
return err
}

View File

@ -31,7 +31,7 @@ import (
jsoniter "github.com/json-iterator/go"
"github.com/minio/minio/cmd/logger"
mioutil "github.com/minio/minio/pkg/ioutil"
xioutil "github.com/minio/minio/pkg/ioutil"
"github.com/minio/minio/pkg/trie"
)
@ -114,7 +114,7 @@ func (fs *FSObjects) backgroundAppend(ctx context.Context, bucket, object, uploa
}
partPath := pathJoin(uploadIDDir, entry)
err = mioutil.AppendFile(file.filePath, partPath, globalFSOSync)
err = xioutil.AppendFile(file.filePath, partPath, globalFSOSync)
if err != nil {
reqInfo := logger.GetReqInfo(ctx).AppendTags("partPath", partPath)
reqInfo.AppendTags("filepath", file.filePath)
@ -390,7 +390,7 @@ func (fs *FSObjects) GetMultipartInfo(ctx context.Context, bucket, object, uploa
return minfo, toObjectErr(err, bucket, object)
}
fsMetaBytes, err := ioutil.ReadFile(pathJoin(uploadIDDir, fs.metaJSONFile))
fsMetaBytes, err := xioutil.ReadFile(pathJoin(uploadIDDir, fs.metaJSONFile))
if err != nil {
logger.LogIf(ctx, err)
return minfo, toObjectErr(err, bucket, object)
@ -700,7 +700,7 @@ func (fs *FSObjects) CompleteMultipartUpload(ctx context.Context, bucket string,
GotETag: part.ETag,
}
}
if err = mioutil.AppendFile(appendFilePath, pathJoin(uploadIDDir, partFile), globalFSOSync); err != nil {
if err = xioutil.AppendFile(appendFilePath, pathJoin(uploadIDDir, partFile), globalFSOSync); err != nil {
logger.LogIf(ctx, err)
return oi, toObjectErr(err)
}
@ -744,7 +744,7 @@ func (fs *FSObjects) CompleteMultipartUpload(ctx context.Context, bucket string,
}()
// Read saved fs metadata for ongoing multipart.
fsMetaBuf, err := ioutil.ReadFile(pathJoin(uploadIDDir, fs.metaJSONFile))
fsMetaBuf, err := xioutil.ReadFile(pathJoin(uploadIDDir, fs.metaJSONFile))
if err != nil {
logger.LogIf(ctx, err)
return oi, toObjectErr(err, bucket, object)

View File

@ -40,6 +40,7 @@ import (
"github.com/minio/minio/cmd/logger"
"github.com/minio/minio/pkg/bucket/policy"
"github.com/minio/minio/pkg/color"
xioutil "github.com/minio/minio/pkg/ioutil"
"github.com/minio/minio/pkg/lock"
"github.com/minio/minio/pkg/madmin"
"github.com/minio/minio/pkg/mimedb"
@ -323,7 +324,7 @@ func (fs *FSObjects) crawlBucket(ctx context.Context, bucket string, cache dataU
// Load bucket info.
cache, err = crawlDataFolder(ctx, fs.fsPath, cache, func(item crawlItem) (sizeSummary, error) {
bucket, object := item.bucket, item.objectPath()
fsMetaBytes, err := ioutil.ReadFile(pathJoin(fs.fsPath, minioMetaBucket, bucketMetaPrefix, bucket, object, fs.metaJSONFile))
fsMetaBytes, err := xioutil.ReadFile(pathJoin(fs.fsPath, minioMetaBucket, bucketMetaPrefix, bucket, object, fs.metaJSONFile))
if err != nil && !osIsNotExist(err) {
if intDataUpdateTracker.debug {
logger.Info(color.Green("crawlBucket:")+" object return unexpected error: %v/%v: %w", item.bucket, item.objectPath(), err)

View File

@ -19,7 +19,6 @@ package cmd
import (
"context"
"io"
"io/ioutil"
"net/http"
"net/url"
"os"
@ -30,6 +29,7 @@ import (
"github.com/gorilla/mux"
"github.com/minio/minio/cmd/logger"
xioutil "github.com/minio/minio/pkg/ioutil"
)
// WalkDirOptions provides options for WalkDir operations.
@ -91,7 +91,7 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
// Fast exit track to check if we are listing an object with
// a trailing slash, this will avoid to list the object content.
if HasSuffix(opts.BaseDir, SlashSeparator) {
metadata, err := ioutil.ReadFile(pathJoin(volumeDir,
metadata, err := xioutil.ReadFile(pathJoin(volumeDir,
opts.BaseDir[:len(opts.BaseDir)-1]+globalDirSuffix,
xlStorageFormatFile))
if err == nil {
@ -151,7 +151,7 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
// If root was an object return it as such.
if HasSuffix(entry, xlStorageFormatFile) {
var meta metaCacheEntry
meta.metadata, err = ioutil.ReadFile(pathJoin(volumeDir, current, entry))
meta.metadata, err = xioutil.ReadFile(pathJoin(volumeDir, current, entry))
if err != nil {
logger.LogIf(ctx, err)
continue
@ -166,7 +166,7 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
// Check legacy.
if HasSuffix(entry, xlStorageFormatFileV1) {
var meta metaCacheEntry
meta.metadata, err = ioutil.ReadFile(pathJoin(volumeDir, current, entry))
meta.metadata, err = xioutil.ReadFile(pathJoin(volumeDir, current, entry))
if err != nil {
logger.LogIf(ctx, err)
continue
@ -213,7 +213,7 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
meta.name = meta.name[:len(meta.name)-1] + globalDirSuffixWithSlash
}
meta.metadata, err = ioutil.ReadFile(pathJoin(volumeDir, meta.name, xlStorageFormatFile))
meta.metadata, err = xioutil.ReadFile(pathJoin(volumeDir, meta.name, xlStorageFormatFile))
switch {
case err == nil:
// It was an object
@ -222,7 +222,7 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
}
out <- meta
case osIsNotExist(err):
meta.metadata, err = ioutil.ReadFile(pathJoin(volumeDir, meta.name, xlStorageFormatFileV1))
meta.metadata, err = xioutil.ReadFile(pathJoin(volumeDir, meta.name, xlStorageFormatFileV1))
if err == nil {
// Maybe rename? Would make it inconsistent across disks though.
// os.Rename(pathJoin(volumeDir, meta.name, xlStorageFormatFileV1), pathJoin(volumeDir, meta.name, xlStorageFormatFile))

View File

@ -50,10 +50,11 @@ func readDirFn(dirPath string, filter func(name string, typ os.FileMode) error)
if err == io.EOF {
break
}
if osErrToFileErr(err) == errFileNotFound {
err = osErrToFileErr(err)
if err == errFileNotFound {
return nil
}
return osErrToFileErr(err)
return err
}
for _, fi := range fis {
if fi.Mode()&os.ModeSymlink == os.ModeSymlink {

View File

@ -109,6 +109,10 @@ func readDirFn(dirPath string, fn func(name string, typ os.FileMode) error) erro
if isSysErrNotDir(err) {
return nil
}
err = osErrToFileErr(err)
if err == errFileNotFound {
return nil
}
return err
}
if nbuf <= 0 {
@ -183,7 +187,7 @@ func readDirN(dirPath string, count int) (entries []string, err error) {
if isSysErrNotDir(err) {
return nil, errFileNotFound
}
return nil, err
return nil, osErrToFileErr(err)
}
if nbuf <= 0 {
break

View File

@ -61,11 +61,15 @@ func readDirFn(dirPath string, filter func(name string, typ os.FileMode) error)
if isSysErrPathNotFound(e) {
return nil
}
return osErrToFileErr(&os.PathError{
err = osErrToFileErr(&os.PathError{
Op: "FindNextFile",
Path: dirPath,
Err: e,
})
if err == errFileNotFound {
return nil
}
return err
}
}
name := syscall.UTF16ToString(data.FileName[0:])

View File

@ -264,9 +264,10 @@ func newXLStorage(ep Endpoint) (*xlStorage, error) {
return &b
},
},
globalSync: env.Get(config.EnvFSOSync, config.EnableOff) == config.EnableOn,
ctx: GlobalContext,
rootDisk: rootDisk,
globalSync: env.Get(config.EnvFSOSync, config.EnableOff) == config.EnableOn,
ctx: GlobalContext,
rootDisk: rootDisk,
readODirectSupported: true,
}
// Create all necessary bucket folders if possible.
@ -382,7 +383,7 @@ func (s *xlStorage) CrawlAndGetDataUsage(ctx context.Context, cache dataUsageCac
return sizeSummary{}, errSkipFile
}
buf, err := ioutil.ReadFile(item.Path)
buf, err := xioutil.ReadFile(item.Path)
if err != nil {
if intDataUpdateTracker.debug {
console.Debugf(color.Green("crawlBucket:")+" object path missing: %v: %w\n", item.Path, err)
@ -539,7 +540,7 @@ func (s *xlStorage) GetDiskID() (string, error) {
return diskID, nil
}
b, err := ioutil.ReadFile(formatFile)
b, err := xioutil.ReadFile(formatFile)
if err != nil {
// If the disk is still not initialized.
if osIsNotExist(err) {
@ -827,7 +828,7 @@ func (s *xlStorage) WalkVersions(ctx context.Context, volume, dirPath, marker st
},
}
} else {
xlMetaBuf, err := ioutil.ReadFile(pathJoin(volumeDir, walkResult.entry, xlStorageFormatFile))
xlMetaBuf, err := xioutil.ReadFile(pathJoin(volumeDir, walkResult.entry, xlStorageFormatFile))
if err != nil {
continue
}
@ -1150,9 +1151,9 @@ func (s *xlStorage) ReadVersion(ctx context.Context, volume, path, versionID str
func (s *xlStorage) readAllData(volumeDir string, filePath string, requireDirectIO bool) (buf []byte, err error) {
var f *os.File
if requireDirectIO {
f, err = disk.OpenFileDirectIO(filePath, os.O_RDONLY, 0666)
f, err = disk.OpenFileDirectIO(filePath, readMode, 0666)
} else {
f, err = os.Open(filePath)
f, err = os.OpenFile(filePath, readMode, 0)
}
if err != nil {
if osIsNotExist(err) {
@ -1175,6 +1176,13 @@ func (s *xlStorage) readAllData(volumeDir string, filePath string, requireDirect
} else if isSysErrTooManyFiles(err) {
return nil, errTooManyOpenFiles
} else if isSysErrInvalidArg(err) {
st, _ := os.Lstat(filePath)
if st != nil && st.IsDir() {
// Linux returns InvalidArg for directory O_DIRECT
// we need to keep this fallback code to return correct
// errors upwards.
return nil, errFileNotFound
}
return nil, errUnsupportedDisk
}
return nil, err
@ -1328,42 +1336,23 @@ func (s *xlStorage) openFile(volume, path string, mode int) (f *os.File, err err
return nil, err
}
// Stat a volume entry.
_, err = os.Lstat(volumeDir)
if err != nil {
if osIsNotExist(err) {
return nil, errVolumeNotFound
} else if isSysErrIO(err) {
return nil, errFaultyDisk
}
return nil, err
}
filePath := pathJoin(volumeDir, path)
if err = checkPathLength(filePath); err != nil {
return nil, err
}
// Verify if the file already exists and is not of regular type.
var st os.FileInfo
if st, err = os.Lstat(filePath); err == nil {
if !st.Mode().IsRegular() {
return nil, errIsNotRegular
}
} else {
// Create top level directories if they don't exist.
// with mode 0777 mkdir honors system umask.
if err = mkdirAll(pathutil.Dir(filePath), 0777); err != nil {
return nil, err
}
// Create top level directories if they don't exist.
// with mode 0777 mkdir honors system umask.
if err = mkdirAll(pathutil.Dir(filePath), 0777); err != nil {
return nil, err
}
w, err := os.OpenFile(filePath, mode, 0666)
w, err := os.OpenFile(filePath, mode|writeMode, 0666)
if err != nil {
// File path cannot be verified since one of the parents is a file.
switch {
case isSysErrNotDir(err):
return nil, errFileAccessDenied
case isSysErrIsDir(err):
return nil, errIsNotRegular
case osIsPermission(err):
return nil, errFileAccessDenied
case isSysErrIO(err):
@ -1668,19 +1657,21 @@ func (s *xlStorage) WriteAll(ctx context.Context, volume string, path string, b
atomic.AddInt32(&s.activeIOCount, -1)
}()
w, err := s.openFile(volume, path, os.O_CREATE|os.O_SYNC|os.O_WRONLY)
w, err := s.openFile(volume, path, os.O_CREATE|os.O_WRONLY)
if err != nil {
return err
}
defer w.Close()
n, err := w.Write(b)
if err != nil {
return err
}
if n != len(b) {
return io.ErrShortWrite
}
return nil
}
@ -1692,19 +1683,38 @@ func (s *xlStorage) AppendFile(ctx context.Context, volume string, path string,
atomic.AddInt32(&s.activeIOCount, -1)
}()
var w *os.File
// Create file if not found. Not doing O_DIRECT here to avoid the code that does buffer aligned writes.
// AppendFile() is only used by healing code to heal objects written in old format.
w, err = s.openFile(volume, path, os.O_CREATE|os.O_SYNC|os.O_APPEND|os.O_WRONLY)
volumeDir, err := s.getVolDir(volume)
if err != nil {
return err
}
if _, err = w.Write(buf); err != nil {
// Stat a volume entry.
if _, err = os.Lstat(volumeDir); err != nil {
if osIsNotExist(err) {
return errVolumeNotFound
}
return err
}
return w.Close()
var w *os.File
// Create file if not found. Not doing O_DIRECT here to avoid the code that does buffer aligned writes.
// AppendFile() is only used by healing code to heal objects written in old format.
w, err = s.openFile(volume, path, os.O_CREATE|os.O_APPEND|os.O_WRONLY)
if err != nil {
return err
}
defer w.Close()
n, err := w.Write(buf)
if err != nil {
return err
}
if n != len(buf) {
return io.ErrShortWrite
}
return nil
}
// CheckParts check if path has necessary parts available.
@ -2000,7 +2010,7 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath, dataDir,
return err
}
srcBuf, err := ioutil.ReadFile(srcFilePath)
srcBuf, err := xioutil.ReadFile(srcFilePath)
if err != nil {
return osErrToFileErr(err)
}
@ -2010,7 +2020,7 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath, dataDir,
return err
}
dstBuf, err := ioutil.ReadFile(dstFilePath)
dstBuf, err := xioutil.ReadFile(dstFilePath)
if err != nil {
if !osIsNotExist(err) {
return osErrToFileErr(err)
@ -2021,7 +2031,7 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath, dataDir,
return err
}
if err == nil {
dstBuf, err = ioutil.ReadFile(dstFilePath)
dstBuf, err = xioutil.ReadFile(dstFilePath)
if err != nil && !osIsNotExist(err) {
return osErrToFileErr(err)
}

View File

@ -0,0 +1,31 @@
// +build windows darwin
/*
* MinIO Cloud Storage, (C) 2021 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"os"
)
var (
// No special option for reads on windows
readMode = os.O_RDONLY
// Write with sync no buffering only used only for `xl.meta` writes
writeMode = os.O_SYNC
)

View File

@ -0,0 +1,31 @@
// +build !windows,!darwin
/*
* MinIO Cloud Storage, (C) 2021 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"os"
)
var (
// Disallow updating access times
readMode = os.O_RDONLY | 0x40000 // O_NOATIME
// Write with data sync only used only for `xl.meta` writes
writeMode = 0x1000 // O_DSYNC
)

74
pkg/ioutil/read_file.go Normal file
View File

@ -0,0 +1,74 @@
/*
* MinIO Cloud Storage, (C) 2021 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Forked from golang.org/pkg/os.ReadFile with NOATIME support.
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ioutil
import (
"io"
"os"
)
// ReadFile reads the named file and returns the contents.
// A successful call returns err == nil, not err == EOF.
// Because ReadFile reads the whole file, it does not treat an EOF from Read
// as an error to be reported.
//
// passes NOATIME flag for reads on Unix systems to avoid atime updates.
func ReadFile(name string) ([]byte, error) {
f, err := os.OpenFile(name, readMode, 0)
if err != nil {
return nil, err
}
defer f.Close()
var size int
if info, err := f.Stat(); err == nil {
size64 := info.Size()
if int64(int(size64)) == size64 {
size = int(size64)
}
}
size++ // one byte for final read at EOF
// If a file claims a small size, read at least 512 bytes.
// In particular, files in Linux's /proc claim size 0 but
// then do not work right if read in small pieces,
// so an initial read of 1 byte would not work correctly.
if size < 512 {
size = 512
}
data := make([]byte, 0, size)
for {
if len(data) >= cap(data) {
d := append(data[:cap(data)], 0)
data = d[:len(data)]
}
n, err := f.Read(data[len(data):cap(data)])
data = data[:len(data)+n]
if err != nil {
if err == io.EOF {
err = nil
}
return data, err
}
}
}

View File

@ -0,0 +1,23 @@
// +build windows darwin
/*
* MinIO Cloud Storage, (C) 2021 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ioutil
import "os"
var readMode = os.O_RDONLY

View File

@ -0,0 +1,25 @@
// +build !windows,!darwin
/*
* MinIO Cloud Storage, (C) 2021 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ioutil
import (
"os"
)
var readMode = os.O_RDONLY | 0x40000 // read with O_NOATIME