remove serializing WalkDir() across all buckets/prefixes on SSDs (#17707)

Slower drives already get knocked off by active monitoring when
they are too slow, so we do not need to block calls arbitrarily.

Serializing adds latency to already slow calls; remove
it for SSDs/NVMes.

Also, add a `select` on the context when writing to the `out`
channel, to avoid any potential blocking.
This commit is contained in:
Harshavardhana 2023-07-24 09:30:19 -07:00 committed by GitHub
parent a7fb3a3853
commit 14e1ace552
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 108 additions and 52 deletions

View File

@ -98,6 +98,15 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
objsReturned++ objsReturned++
} }
} }
send := func(entry metaCacheEntry) error {
objReturned(entry.metadata)
select {
case <-ctx.Done():
return ctx.Err()
case out <- entry:
}
return nil
}
// Fast exit track to check if we are listing an object with // Fast exit track to check if we are listing an object with
// a trailing slash, this will avoid to list the object content. // a trailing slash, this will avoid to list the object content.
@ -109,11 +118,12 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
// if baseDir is already a directory object, consider it // if baseDir is already a directory object, consider it
// as part of the list call, this is AWS S3 specific // as part of the list call, this is AWS S3 specific
// behavior. // behavior.
out <- metaCacheEntry{ if err := send(metaCacheEntry{
name: opts.BaseDir, name: opts.BaseDir,
metadata: metadata, metadata: metadata,
}); err != nil {
return err
} }
objReturned(metadata)
} else { } else {
st, sterr := Lstat(pathJoin(volumeDir, opts.BaseDir, xlStorageFormatFile)) st, sterr := Lstat(pathJoin(volumeDir, opts.BaseDir, xlStorageFormatFile))
if sterr == nil && st.Mode().IsRegular() { if sterr == nil && st.Mode().IsRegular() {
@ -143,19 +153,25 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
return nil return nil
} }
if s.walkMu != nil {
s.walkMu.Lock() s.walkMu.Lock()
}
entries, err := s.ListDir(ctx, opts.Bucket, current, -1) entries, err := s.ListDir(ctx, opts.Bucket, current, -1)
if s.walkMu != nil {
s.walkMu.Unlock() s.walkMu.Unlock()
}
if err != nil { if err != nil {
// Folder could have gone away in-between // Folder could have gone away in-between
if err != errVolumeNotFound && err != errFileNotFound { if err != errVolumeNotFound && err != errFileNotFound {
logger.LogOnceIf(ctx, err, "metacache-walk-scan-dir") logger.LogOnceIf(ctx, err, "metacache-walk-scan-dir")
} }
if opts.ReportNotFound && err == errFileNotFound && current == opts.BaseDir { if opts.ReportNotFound && err == errFileNotFound && current == opts.BaseDir {
return errFileNotFound err = errFileNotFound
} else {
err = nil
} }
// Forward some errors? diskHealthCheckOK(ctx, err)
return nil return err
} }
diskHealthCheckOK(ctx, err) diskHealthCheckOK(ctx, err)
if len(entries) == 0 { if len(entries) == 0 {
@ -202,9 +218,13 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
// If root was an object return it as such. // If root was an object return it as such.
if HasSuffix(entry, xlStorageFormatFile) { if HasSuffix(entry, xlStorageFormatFile) {
var meta metaCacheEntry var meta metaCacheEntry
if s.walkReadMu != nil {
s.walkReadMu.Lock() s.walkReadMu.Lock()
}
meta.metadata, err = s.readMetadata(ctx, pathJoinBuf(&sb, volumeDir, current, entry)) meta.metadata, err = s.readMetadata(ctx, pathJoinBuf(&sb, volumeDir, current, entry))
if s.walkReadMu != nil {
s.walkReadMu.Unlock() s.walkReadMu.Unlock()
}
diskHealthCheckOK(ctx, err) diskHealthCheckOK(ctx, err)
if err != nil { if err != nil {
// It is totally possible that xl.meta was overwritten // It is totally possible that xl.meta was overwritten
@ -219,17 +239,15 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
meta.name = strings.TrimSuffix(meta.name, SlashSeparator) meta.name = strings.TrimSuffix(meta.name, SlashSeparator)
meta.name = pathJoinBuf(&sb, current, meta.name) meta.name = pathJoinBuf(&sb, current, meta.name)
meta.name = decodeDirObject(meta.name) meta.name = decodeDirObject(meta.name)
if err := send(meta); err != nil {
objReturned(meta.metadata) return err
out <- meta }
return nil return nil
} }
// Check legacy. // Check legacy.
if HasSuffix(entry, xlStorageFormatFileV1) { if HasSuffix(entry, xlStorageFormatFileV1) {
var meta metaCacheEntry var meta metaCacheEntry
s.walkReadMu.Lock()
meta.metadata, err = xioutil.ReadFile(pathJoinBuf(&sb, volumeDir, current, entry)) meta.metadata, err = xioutil.ReadFile(pathJoinBuf(&sb, volumeDir, current, entry))
s.walkReadMu.Unlock()
diskHealthCheckOK(ctx, err) diskHealthCheckOK(ctx, err)
if err != nil { if err != nil {
if !IsErrIgnored(err, io.EOF, io.ErrUnexpectedEOF) { if !IsErrIgnored(err, io.EOF, io.ErrUnexpectedEOF) {
@ -240,9 +258,9 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
meta.name = strings.TrimSuffix(entry, xlStorageFormatFileV1) meta.name = strings.TrimSuffix(entry, xlStorageFormatFileV1)
meta.name = strings.TrimSuffix(meta.name, SlashSeparator) meta.name = strings.TrimSuffix(meta.name, SlashSeparator)
meta.name = pathJoinBuf(&sb, current, meta.name) meta.name = pathJoinBuf(&sb, current, meta.name)
objReturned(meta.metadata) if err := send(meta); err != nil {
return err
out <- meta }
return nil return nil
} }
// Skip all other files. // Skip all other files.
@ -295,9 +313,13 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
meta.name = meta.name[:len(meta.name)-1] + globalDirSuffixWithSlash meta.name = meta.name[:len(meta.name)-1] + globalDirSuffixWithSlash
} }
if s.walkReadMu != nil {
s.walkReadMu.Lock() s.walkReadMu.Lock()
}
meta.metadata, err = s.readMetadata(ctx, pathJoinBuf(&sb, volumeDir, meta.name, xlStorageFormatFile)) meta.metadata, err = s.readMetadata(ctx, pathJoinBuf(&sb, volumeDir, meta.name, xlStorageFormatFile))
if s.walkReadMu != nil {
s.walkReadMu.Unlock() s.walkReadMu.Unlock()
}
diskHealthCheckOK(ctx, err) diskHealthCheckOK(ctx, err)
switch { switch {
case err == nil: case err == nil:
@ -305,17 +327,17 @@ func (s *xlStorage) WalkDir(ctx context.Context, opts WalkDirOptions, wr io.Writ
if isDirObj { if isDirObj {
meta.name = strings.TrimSuffix(meta.name, globalDirSuffixWithSlash) + slashSeparator meta.name = strings.TrimSuffix(meta.name, globalDirSuffixWithSlash) + slashSeparator
} }
objReturned(meta.metadata) if err := send(meta); err != nil {
return err
out <- meta }
case osIsNotExist(err), isSysErrIsDir(err): case osIsNotExist(err), isSysErrIsDir(err):
meta.metadata, err = xioutil.ReadFile(pathJoinBuf(&sb, volumeDir, meta.name, xlStorageFormatFileV1)) meta.metadata, err = xioutil.ReadFile(pathJoinBuf(&sb, volumeDir, meta.name, xlStorageFormatFileV1))
diskHealthCheckOK(ctx, err) diskHealthCheckOK(ctx, err)
if err == nil { if err == nil {
// It was an object // It was an object
objReturned(meta.metadata) if err := send(meta); err != nil {
return err
out <- meta }
continue continue
} }

View File

@ -115,8 +115,8 @@ type xlStorage struct {
formatData []byte formatData []byte
// mutex to prevent concurrent read operations overloading walks. // mutex to prevent concurrent read operations overloading walks.
walkMu sync.Mutex walkMu *sync.Mutex
walkReadMu sync.Mutex walkReadMu *sync.Mutex
} }
// checkPathLength - returns error if given path name length more than 255 // checkPathLength - returns error if given path name length more than 255
@ -216,18 +216,17 @@ func newXLStorage(ep Endpoint, cleanUp bool) (s *xlStorage, err error) {
return nil, err return nil, err
} }
var rootDisk bool
if !globalIsCICD && !globalIsErasureSD {
if globalRootDiskThreshold > 0 {
// Use MINIO_ROOTDISK_THRESHOLD_SIZE to figure out if
// this disk is a root disk.
info, err := disk.GetInfo(path) info, err := disk.GetInfo(path)
if err != nil { if err != nil {
return nil, err return nil, err
} }
// treat those disks with size less than or equal to the var rootDisk bool
// threshold as rootDisks. if !globalIsCICD && !globalIsErasureSD {
if globalRootDiskThreshold > 0 {
// Use MINIO_ROOTDISK_THRESHOLD_SIZE to figure out if
// this disk is a root disk. treat those disks with
// size less than or equal to the threshold as rootDisks.
rootDisk = info.Total <= globalRootDiskThreshold rootDisk = info.Total <= globalRootDiskThreshold
} else { } else {
rootDisk, err = disk.IsRootDisk(path, SlashSeparator) rootDisk, err = disk.IsRootDisk(path, SlashSeparator)
@ -247,6 +246,12 @@ func newXLStorage(ep Endpoint, cleanUp bool) (s *xlStorage, err error) {
diskIndex: -1, diskIndex: -1,
} }
// We stagger listings only on HDDs.
if info.Rotational == nil || *info.Rotational {
s.walkMu = &sync.Mutex{}
s.walkReadMu = &sync.Mutex{}
}
if cleanUp { if cleanUp {
bgFormatErasureCleanupTmp(s.diskPath) // cleanup any old data. bgFormatErasureCleanupTmp(s.diskPath) // cleanup any old data.
} }

View File

@ -23,6 +23,9 @@ package disk
// Files - total inodes available // Files - total inodes available
// Ffree - free inodes available // Ffree - free inodes available
// FSType - file system type // FSType - file system type
// Major - major dev id
// Minor - minor dev id
// Name - device name
// Rotational - true if the device is an HDD, nil if it could not be determined
type Info struct { type Info struct {
Total uint64 Total uint64
Free uint64 Free uint64
@ -32,6 +35,8 @@ type Info struct {
FSType string FSType string
Major uint32 Major uint32
Minor uint32 Minor uint32
Name string
Rotational *bool
} }
// DevID is the drive major and minor ids // DevID is the drive major and minor ids

View File

@ -1,7 +1,7 @@
//go:build linux && !s390x && !arm && !386 //go:build linux && !s390x && !arm && !386
// +build linux,!s390x,!arm,!386 // +build linux,!s390x,!arm,!386
// Copyright (c) 2015-2021 MinIO, Inc. // Copyright (c) 2015-2023 MinIO, Inc.
// //
// This file is part of MinIO Object Storage stack // This file is part of MinIO Object Storage stack
// //
@ -28,6 +28,7 @@ import (
"strings" "strings"
"syscall" "syscall"
"github.com/prometheus/procfs/blockdevice"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -47,14 +48,6 @@ func GetInfo(path string) (info Info, err error) {
//nolint:unconvert //nolint:unconvert
FSType: getFSType(int64(s.Type)), FSType: getFSType(int64(s.Type)),
} }
// Check for overflows.
// https://github.com/minio/minio/issues/8035
// XFS can show wrong values at times error out
// in such scenarios.
if info.Free > info.Total {
return info, fmt.Errorf("detected free space (%d) > total drive space (%d), fs corruption at (%s). please run 'fsck'", info.Free, info.Total, path)
}
info.Used = info.Total - info.Free
st := syscall.Stat_t{} st := syscall.Stat_t{}
err = syscall.Stat(path, &st) err = syscall.Stat(path, &st)
@ -65,6 +58,37 @@ func GetInfo(path string) (info Info, err error) {
devID := uint64(st.Dev) // Needed to support multiple GOARCHs devID := uint64(st.Dev) // Needed to support multiple GOARCHs
info.Major = unix.Major(devID) info.Major = unix.Major(devID)
info.Minor = unix.Minor(devID) info.Minor = unix.Minor(devID)
// Check for overflows.
// https://github.com/minio/minio/issues/8035
// XFS can show wrong values at times error out
// in such scenarios.
if info.Free > info.Total {
return info, fmt.Errorf("detected free space (%d) > total drive space (%d), fs corruption at (%s). please run 'fsck'", info.Free, info.Total, path)
}
info.Used = info.Total - info.Free
bfs, err := blockdevice.NewDefaultFS()
if err == nil {
diskstats, _ := bfs.ProcDiskstats()
for _, dstat := range diskstats {
// ignore all loop devices
if strings.HasPrefix(dstat.DeviceName, "loop") {
continue
}
qst, err := bfs.SysBlockDeviceQueueStats(dstat.DeviceName)
if err != nil {
continue
}
rot := qst.Rotational == 1 // Rotational is '1' if the device is HDD
if dstat.MajorNumber == info.Major && dstat.MinorNumber == info.Minor {
info.Name = dstat.DeviceName
info.Rotational = &rot
break
}
}
}
return info, nil return info, nil
} }