use O_DIRECT for all ReadFileStream (#13324)

This PR also removes #13312 to ensure
that we can use a better mechanism to
handle page-cache, using O_DIRECT
even for Range GETs.
This commit is contained in:
Harshavardhana 2021-09-29 16:40:28 -07:00 committed by GitHub
parent 1d9e91e00f
commit d00ff3c453
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 198 additions and 220 deletions

View File

@ -27,8 +27,10 @@ import (
"io" "io"
"github.com/minio/highwayhash" "github.com/minio/highwayhash"
"github.com/minio/minio/internal/logger"
"golang.org/x/crypto/blake2b" "golang.org/x/crypto/blake2b"
xioutil "github.com/minio/minio/internal/ioutil"
"github.com/minio/minio/internal/logger"
) )
// magic HH-256 key as HH-256 hash of the first 100 decimals of π as utf-8 string with a zero key. // magic HH-256 key as HH-256 hash of the first 100 decimals of π as utf-8 string with a zero key.
@ -172,8 +174,8 @@ func bitrotVerify(r io.Reader, wantSize, partSize int64, algo BitrotAlgorithm, w
return errFileCorrupt return errFileCorrupt
} }
bufp := xlPoolSmall.Get().(*[]byte) bufp := xioutil.ODirectPoolSmall.Get().(*[]byte)
defer xlPoolSmall.Put(bufp) defer xioutil.ODirectPoolSmall.Put(bufp)
for left > 0 { for left > 0 {
// Read expected hash... // Read expected hash...

View File

@ -22,9 +22,11 @@ import (
"sync" "sync"
"time" "time"
"github.com/minio/minio/internal/config/api"
"github.com/minio/minio/internal/logger"
mem "github.com/shirou/gopsutil/v3/mem" mem "github.com/shirou/gopsutil/v3/mem"
"github.com/minio/minio/internal/config/api"
xioutil "github.com/minio/minio/internal/ioutil"
"github.com/minio/minio/internal/logger"
) )
type apiConfig struct { type apiConfig struct {
@ -71,7 +73,8 @@ func (t *apiConfig) init(cfg api.Config, setDriveCounts []int) {
// total_ram / ram_per_request // total_ram / ram_per_request
// ram_per_request is (2MiB+128KiB) * driveCount \ // ram_per_request is (2MiB+128KiB) * driveCount \
// + 2 * 10MiB (default erasure block size v1) + 2 * 1MiB (default erasure block size v2) // + 2 * 10MiB (default erasure block size v1) + 2 * 1MiB (default erasure block size v2)
apiRequestsMaxPerNode = int(maxMem / uint64(maxSetDrives*(blockSizeLarge+blockSizeSmall)+int(blockSizeV1*2+blockSizeV2*2))) blockSize := xioutil.BlockSizeLarge + xioutil.BlockSizeSmall
apiRequestsMaxPerNode = int(maxMem / uint64(maxSetDrives*blockSize+int(blockSizeV1*2+blockSizeV2*2)))
if globalIsErasure { if globalIsErasure {
logger.Info("Automatically configured API requests per node based on available memory on the system: %d", apiRequestsMaxPerNode) logger.Info("Automatically configured API requests per node based on available memory on the system: %d", apiRequestsMaxPerNode)

View File

@ -28,7 +28,6 @@ import (
"syscall" "syscall"
"unsafe" "unsafe"
"github.com/minio/minio/internal/disk"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -111,11 +110,6 @@ func readDirFn(dirPath string, fn func(name string, typ os.FileMode) error) erro
} }
return osErrToFileErr(err) return osErrToFileErr(err)
} }
if err := disk.Fadvise(f, disk.FadvSequential); err != nil {
return err
}
defer disk.Fadvise(f, disk.FadvNoReuse)
defer f.Close() defer f.Close()
bufp := direntPool.Get().(*[]byte) bufp := direntPool.Get().(*[]byte)
@ -191,12 +185,6 @@ func readDirWithOpts(dirPath string, opts readDirOpts) (entries []string, err er
if err != nil { if err != nil {
return nil, osErrToFileErr(err) return nil, osErrToFileErr(err)
} }
if err := disk.Fadvise(f, disk.FadvSequential); err != nil {
return nil, err
}
defer disk.Fadvise(f, disk.FadvNoReuse)
defer f.Close() defer f.Close()
bufp := direntPool.Get().(*[]byte) bufp := direntPool.Get().(*[]byte)

View File

@ -18,7 +18,6 @@
package cmd package cmd
import ( import (
"bufio"
"bytes" "bytes"
"context" "context"
"crypto/rand" "crypto/rand"
@ -39,7 +38,6 @@ import (
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
jsoniter "github.com/json-iterator/go" jsoniter "github.com/json-iterator/go"
"github.com/klauspost/readahead"
"github.com/minio/minio/internal/bucket/lifecycle" "github.com/minio/minio/internal/bucket/lifecycle"
"github.com/minio/minio/internal/color" "github.com/minio/minio/internal/color"
"github.com/minio/minio/internal/config" "github.com/minio/minio/internal/config"
@ -51,18 +49,7 @@ import (
) )
const ( const (
nullVersionID = "null" nullVersionID = "null"
blockSizeSmall = 128 * humanize.KiByte // Default r/w block size for smaller objects.
blockSizeLarge = 2 * humanize.MiByte // Default r/w block size for larger objects.
blockSizeReallyLarge = 4 * humanize.MiByte // Default write block size for objects per shard >= 64MiB
// On regular files bigger than this;
readAheadSize = 16 << 20
// Read this many buffers ahead.
readAheadBuffers = 4
// Size of each buffer.
readAheadBufSize = 1 << 20
// Really large streams threshold per shard. // Really large streams threshold per shard.
reallyLargeFileThreshold = 64 * humanize.MiByte // Optimized for HDDs reallyLargeFileThreshold = 64 * humanize.MiByte // Optimized for HDDs
@ -78,7 +65,7 @@ const (
var alignedBuf []byte var alignedBuf []byte
func init() { func init() {
alignedBuf = disk.AlignedBlock(4096) alignedBuf = disk.AlignedBlock(xioutil.DirectioAlignSize)
_, _ = rand.Read(alignedBuf) _, _ = rand.Read(alignedBuf)
} }
@ -97,27 +84,6 @@ func isValidVolname(volname string) bool {
return true return true
} }
var (
xlPoolReallyLarge = sync.Pool{
New: func() interface{} {
b := disk.AlignedBlock(blockSizeReallyLarge)
return &b
},
}
xlPoolLarge = sync.Pool{
New: func() interface{} {
b := disk.AlignedBlock(blockSizeLarge)
return &b
},
}
xlPoolSmall = sync.Pool{
New: func() interface{} {
b := disk.AlignedBlock(blockSizeSmall)
return &b
},
}
)
// xlStorage - implements StorageAPI interface. // xlStorage - implements StorageAPI interface.
type xlStorage struct { type xlStorage struct {
diskPath string diskPath string
@ -410,12 +376,6 @@ func (s *xlStorage) readMetadata(ctx context.Context, itemPath string) ([]byte,
if err != nil { if err != nil {
return nil, err return nil, err
} }
if err := disk.Fadvise(f, disk.FadvSequential); err != nil {
return nil, err
}
defer disk.Fadvise(f, disk.FadvNoReuse)
defer f.Close() defer f.Close()
stat, err := f.Stat() stat, err := f.Stat()
if err != nil { if err != nil {
@ -1234,11 +1194,10 @@ func (s *xlStorage) readAllData(volumeDir string, filePath string) (buf []byte,
} }
return nil, err return nil, err
} }
if err := disk.Fadvise(f, disk.FadvSequential); err != nil { r := &xioutil.ODirectReader{
return nil, err File: f,
SmallFile: true,
} }
defer disk.Fadvise(f, disk.FadvNoReuse)
r := &odirectReader{f, nil, nil, true, true, s, nil}
defer r.Close() defer r.Close()
buf, err = ioutil.ReadAll(r) buf, err = ioutil.ReadAll(r)
if err != nil { if err != nil {
@ -1428,75 +1387,6 @@ func (s *xlStorage) openFileNoSync(filePath string, mode int) (f *os.File, err e
return w, nil return w, nil
} }
// To support O_DIRECT reads for erasure backends.
type odirectReader struct {
f *os.File
buf []byte
bufp *[]byte
freshRead bool
smallFile bool
s *xlStorage
err error
}
// Read - Implements Reader interface.
func (o *odirectReader) Read(buf []byte) (n int, err error) {
if o.err != nil && (len(o.buf) == 0 || o.freshRead) {
return 0, o.err
}
if o.buf == nil {
if o.smallFile {
o.bufp = xlPoolSmall.Get().(*[]byte)
} else {
o.bufp = xlPoolLarge.Get().(*[]byte)
}
}
if o.freshRead {
o.buf = *o.bufp
n, err = o.f.Read(o.buf)
if err != nil && err != io.EOF {
if isSysErrInvalidArg(err) {
if err = disk.DisableDirectIO(o.f); err != nil {
o.err = err
return n, err
}
n, err = o.f.Read(o.buf)
}
if err != nil && err != io.EOF {
o.err = err
return n, err
}
}
if n == 0 {
// err is likely io.EOF
o.err = err
return n, err
}
o.err = err
o.buf = o.buf[:n]
o.freshRead = false
}
if len(buf) >= len(o.buf) {
n = copy(buf, o.buf)
o.freshRead = true
return n, o.err
}
n = copy(buf, o.buf)
o.buf = o.buf[n:]
// There is more left in buffer, do not return any EOF yet.
return n, nil
}
// Close - Release the buffer and close the file.
func (o *odirectReader) Close() error {
if o.smallFile {
xlPoolSmall.Put(o.bufp)
} else {
xlPoolLarge.Put(o.bufp)
}
return o.f.Close()
}
// ReadFileStream - Returns the read stream of the file. // ReadFileStream - Returns the read stream of the file.
func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, offset, length int64) (io.ReadCloser, error) { func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, offset, length int64) (io.ReadCloser, error) {
if offset < 0 { if offset < 0 {
@ -1514,14 +1404,7 @@ func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, off
return nil, err return nil, err
} }
var file *os.File file, err := OpenFileDirectIO(filePath, readMode, 0666)
// O_DIRECT only supported if offset is zero
if offset == 0 {
file, err = OpenFileDirectIO(filePath, readMode, 0666)
} else {
// Open the file for reading.
file, err = OpenFile(filePath, readMode, 0666)
}
if err != nil { if err != nil {
switch { switch {
case osIsNotExist(err): case osIsNotExist(err):
@ -1557,52 +1440,44 @@ func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, off
return nil, errIsNotRegular return nil, errIsNotRegular
} }
// Enable sequential read access pattern - only applicable on Linux. alignment := offset%xioutil.DirectioAlignSize == 0
if err := disk.Fadvise(file, disk.FadvSequential); err != nil { if !alignment {
return nil, err if err = disk.DisableDirectIO(file); err != nil {
file.Close()
return nil, err
}
} }
if offset == 0 { if offset > 0 {
or := &odirectReader{file, nil, nil, true, false, s, nil} if _, err = file.Seek(offset, io.SeekStart); err != nil {
if length <= smallFileThreshold { file.Close()
or = &odirectReader{file, nil, nil, true, true, s, nil} return nil, err
}
}
or := &xioutil.ODirectReader{
File: file,
SmallFile: false,
}
if length <= smallFileThreshold {
or = &xioutil.ODirectReader{
File: file,
SmallFile: true,
} }
r := struct {
io.Reader
io.Closer
}{Reader: io.LimitReader(or, length), Closer: closeWrapper(func() error {
return or.Close()
})}
return r, nil
} }
r := struct { r := struct {
io.Reader io.Reader
io.Closer io.Closer
}{Reader: io.LimitReader(file, length), Closer: closeWrapper(func() error { }{Reader: io.LimitReader(or, length), Closer: closeWrapper(func() error {
disk.Fadvise(file, disk.FadvNoReuse) if !alignment || offset+length%xioutil.DirectioAlignSize != 0 {
return file.Close() // invalidate page-cache for unaligned reads.
disk.FadviseDontNeed(file)
}
return or.Close()
})} })}
if offset > 0 {
if _, err = file.Seek(offset, io.SeekStart); err != nil {
r.Close()
return nil, err
}
}
// Add readahead to big reads
if length >= readAheadSize {
rc, err := readahead.NewReadCloserSize(r, readAheadBuffers, readAheadBufSize)
if err != nil {
r.Close()
return nil, err
}
return rc, nil
}
// Just add a small 64k buffer.
r.Reader = bufio.NewReaderSize(r.Reader, 64<<10)
return r, nil return r, nil
} }
@ -1683,11 +1558,11 @@ func (s *xlStorage) CreateFile(ctx context.Context, volume, path string, fileSiz
var bufp *[]byte var bufp *[]byte
if fileSize > 0 && fileSize >= reallyLargeFileThreshold { if fileSize > 0 && fileSize >= reallyLargeFileThreshold {
// use a larger 4MiB buffer for really large streams. // use a larger 4MiB buffer for really large streams.
bufp = xlPoolReallyLarge.Get().(*[]byte) bufp = xioutil.ODirectPoolXLarge.Get().(*[]byte)
defer xlPoolReallyLarge.Put(bufp) defer xioutil.ODirectPoolXLarge.Put(bufp)
} else { } else {
bufp = xlPoolLarge.Get().(*[]byte) bufp = xioutil.ODirectPoolLarge.Get().(*[]byte)
defer xlPoolLarge.Put(bufp) defer xioutil.ODirectPoolLarge.Put(bufp)
} }
written, err := xioutil.CopyAligned(w, r, *bufp, fileSize) written, err := xioutil.CopyAligned(w, r, *bufp, fileSize)

View File

@ -41,14 +41,7 @@ func Fdatasync(f *os.File) error {
return syscall.Fdatasync(int(f.Fd())) return syscall.Fdatasync(int(f.Fd()))
} }
// fdavise advice constants // FadviseDontNeed invalidates page-cache
const ( func FadviseDontNeed(f *os.File) error {
FadvSequential = unix.FADV_SEQUENTIAL return unix.Fadvise(int(f.Fd()), 0, 0, unix.FADV_DONTNEED)
FadvNoReuse = unix.FADV_NOREUSE
)
// Fadvise implements possibility of choosing
// offset: 0, length: 0
func Fadvise(f *os.File, advice int) error {
return unix.Fadvise(int(f.Fd()), 0, 0, advice)
} }

View File

@ -30,14 +30,7 @@ func Fdatasync(f *os.File) error {
return syscall.Fsync(int(f.Fd())) return syscall.Fsync(int(f.Fd()))
} }
// fdavise advice constants // FadviseDontNeed is a no-op
const ( func FadviseDontNeed(f *os.File) error {
FadvSequential = 0
FadvNoReuse = 0
)
// Fadvise implements possibility of choosing
// offset: 0, length: 0
func Fadvise(f *os.File, advice int) error {
return nil return nil
} }

View File

@ -29,14 +29,7 @@ func Fdatasync(f *os.File) error {
return nil return nil
} }
// fdavise advice constants // FadviseDontNeed is a no-op
const ( func FadviseDontNeed(f *os.File) error {
FadvSequential = 0
FadvNoReuse = 0
)
// Fadvise implements possibility of choosing
// offset: 0, length: 0
func Fadvise(f *os.File, advice int) error {
return nil return nil
} }

View File

@ -227,9 +227,9 @@ func SameFile(fi1, fi2 os.FileInfo) bool {
return true return true
} }
// DirectIO alignment needs to be 4K. Defined here as // DirectioAlignSize - DirectIO alignment needs to be 4K. Defined here as
// directio.AlignSize is defined as 0 in MacOS causing divide by 0 error. // directio.AlignSize is defined as 0 in MacOS causing divide by 0 error.
const directioAlignSize = 4096 const DirectioAlignSize = 4096
// CopyAligned - copies from reader to writer using the aligned input // CopyAligned - copies from reader to writer using the aligned input
// buffer, it is expected that input buffer is page aligned to // buffer, it is expected that input buffer is page aligned to
@ -269,7 +269,7 @@ func CopyAligned(w *os.File, r io.Reader, alignedBuf []byte, totalSize int64) (i
} }
buf = buf[:nr] buf = buf[:nr]
var nw int64 var nw int64
if len(buf)%directioAlignSize == 0 { if len(buf)%DirectioAlignSize == 0 {
var n int var n int
// buf is aligned for directio write() // buf is aligned for directio write()
n, err = w.Write(buf) n, err = w.Write(buf)

View File

@ -0,0 +1,131 @@
// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package ioutil
import (
"errors"
"io"
"os"
"sync"
"syscall"
"github.com/dustin/go-humanize"
"github.com/minio/minio/internal/disk"
)
// ODirectReader - to support O_DIRECT reads for erasure backends.
// Read buffers are drawn lazily from the package's aligned pools and
// must be returned via Close.
type ODirectReader struct {
	// File is the underlying file, expected to be opened with O_DIRECT.
	File *os.File
	// SmallFile selects the small pool buffer (BlockSizeSmall) instead
	// of the large one (BlockSizeLarge).
	SmallFile bool

	err      error   // sticky error, surfaced once buffered data drains
	buf      []byte  // unread remainder of the current pooled buffer
	bufp     *[]byte // pooled buffer, acquired on first Read; nil until then
	seenRead bool    // true while buf still holds unconsumed data
}
// Block sizes for the pooled, O_DIRECT-aligned read/write buffers.
const (
	BlockSizeSmall       = 128 * humanize.KiByte // Default r/w block size for smaller objects.
	BlockSizeLarge       = 2 * humanize.MiByte   // Default r/w block size for larger objects.
	BlockSizeReallyLarge = 4 * humanize.MiByte   // Default write block size for objects per shard >= 64MiB
)
// O_DIRECT aligned sync.Pool's. Each pool hands out *[]byte pointing at
// page-aligned blocks (disk.AlignedBlock) so they are safe to pass to
// read(2)/write(2) on an O_DIRECT file descriptor.
var (
	// ODirectPoolXLarge - BlockSizeReallyLarge buffers, used for writes
	// of really large (>= 64MiB per shard) streams.
	ODirectPoolXLarge = sync.Pool{
		New: func() interface{} {
			b := disk.AlignedBlock(BlockSizeReallyLarge)
			return &b
		},
	}
	// ODirectPoolLarge - BlockSizeLarge buffers, the default r/w size.
	ODirectPoolLarge = sync.Pool{
		New: func() interface{} {
			b := disk.AlignedBlock(BlockSizeLarge)
			return &b
		},
	}
	// ODirectPoolSmall - BlockSizeSmall buffers for small objects.
	ODirectPoolSmall = sync.Pool{
		New: func() interface{} {
			b := disk.AlignedBlock(BlockSizeSmall)
			return &b
		},
	}
)
// Invalid argument, unsupported flags such as O_DIRECT
func isSysErrInvalidArg(err error) bool {
return errors.Is(err, syscall.EINVAL)
}
// Read - Implements Reader interface.
//
// Bytes are fetched from File in pool-sized chunks and served to the
// caller out of an internal buffer. If the very first read fails with
// EINVAL (filesystem rejects O_DIRECT), direct I/O is disabled on the
// fd and the read is retried once through the page cache.
func (o *ODirectReader) Read(buf []byte) (n int, err error) {
	// A sticky error (including io.EOF) is only surfaced once any
	// previously buffered data has been drained.
	if o.err != nil && (len(o.buf) == 0 || !o.seenRead) {
		return 0, o.err
	}
	// Lazily acquire an aligned buffer from the matching pool.
	if o.buf == nil {
		if o.SmallFile {
			o.bufp = ODirectPoolSmall.Get().(*[]byte)
		} else {
			o.bufp = ODirectPoolLarge.Get().(*[]byte)
		}
	}
	// No unconsumed data buffered: refill from the file.
	if !o.seenRead {
		o.buf = *o.bufp
		n, err = o.File.Read(o.buf)
		if err != nil && err != io.EOF {
			if isSysErrInvalidArg(err) {
				// O_DIRECT unsupported here; switch the fd to buffered
				// I/O and retry the read once.
				if err = disk.DisableDirectIO(o.File); err != nil {
					o.err = err
					return n, err
				}
				n, err = o.File.Read(o.buf)
			}
			if err != nil && err != io.EOF {
				o.err = err
				return n, err
			}
		}
		if n == 0 {
			// err is likely io.EOF
			o.err = err
			return n, err
		}
		// Stash err (possibly io.EOF) to be returned after the bytes
		// below have been handed to the caller.
		o.err = err
		o.buf = o.buf[:n]
		o.seenRead = true
	}
	// Caller's buffer can take everything we have: hand it all over and
	// arrange a fresh file read on the next call.
	if len(buf) >= len(o.buf) {
		n = copy(buf, o.buf)
		o.seenRead = false
		return n, o.err
	}
	n = copy(buf, o.buf)
	o.buf = o.buf[n:]
	// There is more left in buffer, do not return any EOF yet.
	return n, nil
}
// Close - Release the buffer and close the file.
//
// The pooled buffer is returned only if one was actually acquired:
// Read() allocates o.bufp lazily, so Close() on a never-read reader
// would otherwise Put a nil *[]byte into the pool, and a later Get()
// would hand that nil pointer to an unrelated reader which panics on
// dereference. Clearing bufp also makes a double Close harmless
// (no double-Put of the same buffer, which would alias it across
// two concurrent readers).
func (o *ODirectReader) Close() error {
	if o.bufp != nil {
		if o.SmallFile {
			ODirectPoolSmall.Put(o.bufp)
		} else {
			ODirectPoolLarge.Put(o.bufp)
		}
		o.bufp = nil
		o.buf = nil
	}
	return o.File.Close()
}

View File

@ -19,7 +19,6 @@ package ioutil
import ( import (
"io" "io"
"os"
"github.com/minio/minio/internal/disk" "github.com/minio/minio/internal/disk"
) )
@ -31,20 +30,21 @@ import (
// //
// passes NOATIME flag for reads on Unix systems to avoid atime updates. // passes NOATIME flag for reads on Unix systems to avoid atime updates.
func ReadFile(name string) ([]byte, error) { func ReadFile(name string) ([]byte, error) {
f, err := os.OpenFile(name, readMode, 0) f, err := disk.OpenFileDirectIO(name, readMode, 0666)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if err := disk.Fadvise(f, disk.FadvSequential); err != nil { r := &ODirectReader{
return nil, err File: f,
SmallFile: true,
} }
defer disk.Fadvise(f, disk.FadvNoReuse) defer r.Close()
defer f.Close()
st, err := f.Stat() st, err := f.Stat()
if err != nil { if err != nil {
return io.ReadAll(f) return io.ReadAll(r)
} }
dst := make([]byte, st.Size()) dst := make([]byte, st.Size())
_, err = io.ReadFull(f, dst) _, err = io.ReadFull(r, dst)
return dst, err return dst, err
} }