use renameat2 Linux extension syscall (#17757)

this is a faster and safer alternative
on newer kernel versions.
This commit is contained in:
Harshavardhana 2023-08-27 09:57:11 -07:00 committed by GitHub
parent 6f0ed2a091
commit 8a57b6bced
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 146 additions and 67 deletions

View File

@ -20,7 +20,6 @@ package cmd
import (
"bytes"
"context"
"errors"
"fmt"
"os"
"path/filepath"
@ -426,7 +425,7 @@ func TestListOnlineDisksSmallObjects(t *testing.T) {
}
partsMetadata, errs := readAllFileInfo(ctx, erasureDisks, bucket, object, "", true)
_, err = getLatestFileInfo(ctx, partsMetadata, z.serverPools[0].sets[0].defaultParityCount, errs)
fi, err := getLatestFileInfo(ctx, partsMetadata, z.serverPools[0].sets[0].defaultParityCount, errs)
if err != nil {
t.Fatalf("Failed to getLatestFileInfo %v", err)
}
@ -484,11 +483,6 @@ func TestListOnlineDisksSmallObjects(t *testing.T) {
}
}
partsMetadata, errs = readAllFileInfo(ctx, erasureDisks, bucket, object, "", true)
fi, err := getLatestFileInfo(ctx, partsMetadata, z.serverPools[0].sets[0].defaultParityCount, errs)
if !errors.Is(err, errErasureReadQuorum) {
t.Fatalf("Failed to getLatestFileInfo, expected %v, got %v", errErasureReadQuorum, err)
}
rQuorum := len(errs) - z.serverPools[0].sets[0].defaultParityCount
onlineDisks, modTime, _ := listOnlineDisks(erasureDisks, partsMetadata, test.errs, rQuorum)

View File

@ -644,9 +644,8 @@ func (er erasureObjects) PutObjectPart(ctx context.Context, bucket, object, uplo
tmpPartPath := pathJoin(tmpPart, partSuffix)
// Delete the temporary object part. If PutObjectPart succeeds there would be nothing to delete.
var online int
defer func() {
if online != len(onlineDisks) {
if countOnlineDisks(onlineDisks) != len(onlineDisks) {
er.deleteAll(context.Background(), minioMetaTmpBucket, tmpPart)
}
}()
@ -1214,6 +1213,7 @@ func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket str
partsMetadata[index].Metadata = fi.Metadata
partsMetadata[index].Parts = fi.Parts
partsMetadata[index].Checksum = fi.Checksum
partsMetadata[index].Versioned = opts.Versioned || opts.VersionSuspended
}
}
@ -1228,6 +1228,9 @@ func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket str
// Remove parts that weren't present in CompleteMultipartUpload request.
for _, curpart := range currentFI.Parts {
// Remove part.meta which is not needed anymore.
er.removePartMeta(bucket, object, uploadID, currentFI.DataDir, curpart.Number)
if objectPartIndex(fi.Parts, curpart.Number) == -1 {
// Delete the missing part files. e.g,
// Request 1: NewMultipart
@ -1239,10 +1242,7 @@ func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket str
}
}
// Remove part.meta which is not needed anymore.
for _, part := range currentFI.Parts {
er.removePartMeta(bucket, object, uploadID, currentFI.DataDir, part.Number)
}
defer er.deleteAll(context.Background(), minioMetaMultipartBucket, uploadIDPath)
// Rename the multipart object to final location.
onlineDisks, versionsDisparity, err := renameData(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath,

View File

@ -1196,15 +1196,7 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
partName := "part.1"
tempErasureObj := pathJoin(uniqueID, fi.DataDir, partName)
// Delete temporary object in the event of failure.
// If PutObject succeeded there would be no temporary
// object to delete.
var online int
defer func() {
if online != len(onlineDisks) {
er.deleteAll(context.Background(), minioMetaTmpBucket, tempObj)
}
}()
defer er.deleteAll(context.Background(), minioMetaTmpBucket, tempObj)
shardFileSize := erasure.ShardFileSize(data.Size())
writers := make([]io.Writer, len(onlineDisks))
@ -1309,6 +1301,7 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
Algorithm: DefaultBitrotAlgorithm,
Hash: bitrotWriterSum(w),
})
partsMetadata[i].Versioned = opts.Versioned || opts.VersionSuspended
}
userDefined["etag"] = r.MD5CurrentHexString()
@ -1389,7 +1382,6 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
}
fi.ReplicationState = opts.PutReplicationState()
online = countOnlineDisks(onlineDisks)
// we are adding a new version to this object under the namespace lock, so this is the latest version.
fi.IsLatest = true

View File

@ -49,6 +49,7 @@ const (
osMetricReadDirent
osMetricFdatasync
osMetricSync
osMetricRename2 // Linux specific
// .... add more
osMetricLast
@ -86,7 +87,11 @@ func (o *osMetrics) incTime(s osMetric, d time.Duration) {
o.latency[s].add(d)
}
func osTrace(s osMetric, startTime time.Time, duration time.Duration, path string) madmin.TraceInfo {
func osTrace(s osMetric, startTime time.Time, duration time.Duration, path string, err error) madmin.TraceInfo {
var errStr string
if err != nil {
errStr = err.Error()
}
return madmin.TraceInfo{
TraceType: madmin.TraceOS,
Time: startTime,
@ -94,53 +99,54 @@ func osTrace(s osMetric, startTime time.Time, duration time.Duration, path strin
FuncName: "os." + s.String(),
Duration: duration,
Path: path,
Error: errStr,
}
}
func updateOSMetrics(s osMetric, paths ...string) func() {
func updateOSMetrics(s osMetric, paths ...string) func(err error) {
if globalTrace.NumSubscribers(madmin.TraceOS) == 0 {
return globalOSMetrics.time(s)
return func(err error) { globalOSMetrics.time(s) }
}
startTime := time.Now()
return func() {
return func(err error) {
duration := time.Since(startTime)
globalOSMetrics.incTime(s, duration)
globalTrace.Publish(osTrace(s, startTime, duration, strings.Join(paths, " -> ")))
globalTrace.Publish(osTrace(s, startTime, duration, strings.Join(paths, " -> "), err))
}
}
// RemoveAll captures time taken to call the underlying os.RemoveAll
func RemoveAll(dirPath string) error {
defer updateOSMetrics(osMetricRemoveAll, dirPath)()
func RemoveAll(dirPath string) (err error) {
defer updateOSMetrics(osMetricRemoveAll, dirPath)(err)
return os.RemoveAll(dirPath)
}
// Mkdir captures time taken to call os.Mkdir
func Mkdir(dirPath string, mode os.FileMode) error {
defer updateOSMetrics(osMetricMkdir, dirPath)()
func Mkdir(dirPath string, mode os.FileMode) (err error) {
defer updateOSMetrics(osMetricMkdir, dirPath)(err)
return os.Mkdir(dirPath, mode)
}
// MkdirAll captures time taken to call os.MkdirAll
func MkdirAll(dirPath string, mode os.FileMode) error {
defer updateOSMetrics(osMetricMkdirAll, dirPath)()
func MkdirAll(dirPath string, mode os.FileMode) (err error) {
defer updateOSMetrics(osMetricMkdirAll, dirPath)(err)
return osMkdirAll(dirPath, mode)
}
// Rename captures time taken to call os.Rename
func Rename(src, dst string) error {
defer updateOSMetrics(osMetricRename, src, dst)()
func Rename(src, dst string) (err error) {
defer updateOSMetrics(osMetricRename, src, dst)(err)
return os.Rename(src, dst)
}
// OpenFile captures time taken to call os.OpenFile
func OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) {
func OpenFile(name string, flag int, perm os.FileMode) (f *os.File, err error) {
switch flag & writeMode {
case writeMode:
defer updateOSMetrics(osMetricOpenFileW, name)()
defer updateOSMetrics(osMetricOpenFileW, name)(err)
default:
defer updateOSMetrics(osMetricOpenFileR, name)()
defer updateOSMetrics(osMetricOpenFileR, name)(err)
}
return os.OpenFile(name, flag, perm)
}
@ -148,54 +154,54 @@ func OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) {
// Access captures time taken to call syscall.Access()
// on windows, plan9 and solaris syscall.Access uses
// os.Lstat()
func Access(name string) error {
defer updateOSMetrics(osMetricAccess, name)()
func Access(name string) (err error) {
defer updateOSMetrics(osMetricAccess, name)(err)
return access(name)
}
// Open captures time taken to call os.Open
func Open(name string) (*os.File, error) {
defer updateOSMetrics(osMetricOpen, name)()
func Open(name string) (f *os.File, err error) {
defer updateOSMetrics(osMetricOpen, name)(err)
return os.Open(name)
}
// OpenFileDirectIO captures time taken to call disk.OpenFileDirectIO
func OpenFileDirectIO(name string, flag int, perm os.FileMode) (*os.File, error) {
defer updateOSMetrics(osMetricOpenFileDirectIO, name)()
func OpenFileDirectIO(name string, flag int, perm os.FileMode) (f *os.File, err error) {
defer updateOSMetrics(osMetricOpenFileDirectIO, name)(err)
return disk.OpenFileDirectIO(name, flag, perm)
}
// Lstat captures time taken to call os.Lstat
func Lstat(name string) (os.FileInfo, error) {
defer updateOSMetrics(osMetricLstat, name)()
func Lstat(name string) (info os.FileInfo, err error) {
defer updateOSMetrics(osMetricLstat, name)(err)
return os.Lstat(name)
}
// Remove captures time taken to call os.Remove
func Remove(deletePath string) error {
defer updateOSMetrics(osMetricRemove, deletePath)()
func Remove(deletePath string) (err error) {
defer updateOSMetrics(osMetricRemove, deletePath)(err)
return os.Remove(deletePath)
}
// Stat captures time taken to call os.Stat
func Stat(name string) (os.FileInfo, error) {
defer updateOSMetrics(osMetricStat, name)()
func Stat(name string) (info os.FileInfo, err error) {
defer updateOSMetrics(osMetricStat, name)(err)
return os.Stat(name)
}
// Create captures time taken to call os.Create
func Create(name string) (*os.File, error) {
defer updateOSMetrics(osMetricCreate, name)()
func Create(name string) (f *os.File, err error) {
defer updateOSMetrics(osMetricCreate, name)(err)
return os.Create(name)
}
// Fdatasync captures time taken to call Fdatasync
func Fdatasync(f *os.File) error {
func Fdatasync(f *os.File) (err error) {
fn := ""
if f != nil {
fn = f.Name()
}
defer updateOSMetrics(osMetricFdatasync, fn)()
defer updateOSMetrics(osMetricFdatasync, fn)(err)
return disk.Fdatasync(f)
}

31
cmd/os-rename_linux.go Normal file
View File

@ -0,0 +1,31 @@
//go:build linux
// +build linux
// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import (
"golang.org/x/sys/unix"
)
// Rename2 captures time taken to call os.Rename
func Rename2(src, dst string) (err error) {
defer updateOSMetrics(osMetricRename2, src, dst)(err)
return unix.Renameat2(unix.AT_FDCWD, src, unix.AT_FDCWD, dst, uint(2)) // RENAME_EXCHANGE from 'man renameat2'
}

29
cmd/os-rename_nolinux.go Normal file
View File

@ -0,0 +1,29 @@
//go:build !linux
// +build !linux
// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import "errors"
// Rename captures time taken to call os.Rename
func Rename2(src, dst string) (err error) {
defer updateOSMetrics(osMetricRename2, src, dst)(errors.New("not implemented, skipping"))
return errSkipFile
}

View File

@ -24,12 +24,13 @@ func _() {
_ = x[osMetricReadDirent-13]
_ = x[osMetricFdatasync-14]
_ = x[osMetricSync-15]
_ = x[osMetricLast-16]
_ = x[osMetricRename2-16]
_ = x[osMetricLast-17]
}
const _osMetric_name = "RemoveAllMkdirAllMkdirRenameOpenFileWOpenFileROpenOpenFileDirectIOLstatRemoveStatAccessCreateReadDirentFdatasyncSyncLast"
const _osMetric_name = "RemoveAllMkdirAllMkdirRenameOpenFileWOpenFileROpenOpenFileDirectIOLstatRemoveStatAccessCreateReadDirentFdatasyncSyncRename2Last"
var _osMetric_index = [...]uint8{0, 9, 17, 22, 28, 37, 46, 50, 66, 71, 77, 81, 87, 93, 103, 112, 116, 120}
var _osMetric_index = [...]uint8{0, 9, 17, 22, 28, 37, 46, 50, 66, 71, 77, 81, 87, 93, 103, 112, 116, 123, 127}
func (i osMetric) String() string {
if i >= osMetric(len(_osMetric_index)-1) {

View File

@ -235,6 +235,9 @@ type FileInfo struct {
// Combined checksum when object was uploaded.
Checksum []byte `msg:"cs,allownil"`
// Versioned - indicates if this file is versioned or not.
Versioned bool `msg:"vs"`
}
// WriteQuorum returns expected write quorum for this FileInfo

View File

@ -669,8 +669,8 @@ func (z *FileInfo) DecodeMsg(dc *msgp.Reader) (err error) {
err = msgp.WrapError(err)
return
}
if zb0001 != 28 {
err = msgp.ArrayError{Wanted: 28, Got: zb0001}
if zb0001 != 29 {
err = msgp.ArrayError{Wanted: 29, Got: zb0001}
return
}
z.Volume, err = dc.ReadString()
@ -850,13 +850,18 @@ func (z *FileInfo) DecodeMsg(dc *msgp.Reader) (err error) {
err = msgp.WrapError(err, "Checksum")
return
}
z.Versioned, err = dc.ReadBool()
if err != nil {
err = msgp.WrapError(err, "Versioned")
return
}
return
}
// EncodeMsg implements msgp.Encodable
func (z *FileInfo) EncodeMsg(en *msgp.Writer) (err error) {
// array header, size 28
err = en.Append(0xdc, 0x0, 0x1c)
// array header, size 29
err = en.Append(0xdc, 0x0, 0x1d)
if err != nil {
return
}
@ -1019,14 +1024,19 @@ func (z *FileInfo) EncodeMsg(en *msgp.Writer) (err error) {
err = msgp.WrapError(err, "Checksum")
return
}
err = en.WriteBool(z.Versioned)
if err != nil {
err = msgp.WrapError(err, "Versioned")
return
}
return
}
// MarshalMsg implements msgp.Marshaler
func (z *FileInfo) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.Require(b, z.Msgsize())
// array header, size 28
o = append(o, 0xdc, 0x0, 0x1c)
// array header, size 29
o = append(o, 0xdc, 0x0, 0x1d)
o = msgp.AppendString(o, z.Volume)
o = msgp.AppendString(o, z.Name)
o = msgp.AppendString(o, z.VersionID)
@ -1074,6 +1084,7 @@ func (z *FileInfo) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.AppendInt(o, z.Idx)
o = msgp.AppendTime(o, z.DiskMTime)
o = msgp.AppendBytes(o, z.Checksum)
o = msgp.AppendBool(o, z.Versioned)
return
}
@ -1085,8 +1096,8 @@ func (z *FileInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
err = msgp.WrapError(err)
return
}
if zb0001 != 28 {
err = msgp.ArrayError{Wanted: 28, Got: zb0001}
if zb0001 != 29 {
err = msgp.ArrayError{Wanted: 29, Got: zb0001}
return
}
z.Volume, bts, err = msgp.ReadStringBytes(bts)
@ -1266,6 +1277,11 @@ func (z *FileInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
err = msgp.WrapError(err, "Checksum")
return
}
z.Versioned, bts, err = msgp.ReadBoolBytes(bts)
if err != nil {
err = msgp.WrapError(err, "Versioned")
return
}
o = bts
return
}
@ -1283,7 +1299,7 @@ func (z *FileInfo) Msgsize() (s int) {
for za0003 := range z.Parts {
s += z.Parts[za0003].Msgsize()
}
s += z.Erasure.Msgsize() + msgp.BoolSize + z.ReplicationState.Msgsize() + msgp.BytesPrefixSize + len(z.Data) + msgp.IntSize + msgp.TimeSize + msgp.BoolSize + msgp.IntSize + msgp.TimeSize + msgp.BytesPrefixSize + len(z.Checksum)
s += z.Erasure.Msgsize() + msgp.BoolSize + z.ReplicationState.Msgsize() + msgp.BytesPrefixSize + len(z.Data) + msgp.IntSize + msgp.TimeSize + msgp.BoolSize + msgp.IntSize + msgp.TimeSize + msgp.BytesPrefixSize + len(z.Checksum) + msgp.BoolSize
return
}

View File

@ -2485,6 +2485,13 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f
}
diskHealthCheckOK(ctx, err)
if !fi.Versioned && !fi.Healing() {
// Use https://man7.org/linux/man-pages/man2/rename.2.html if possible on unversioned bucket.
if err := Rename2(pathutil.Join(srcVolumeDir, srcPath), pathutil.Join(dstVolumeDir, dstPath)); err == nil {
return sign, nil
} // if Rename2 is not successful fallback.
}
// renameAll only for objects that have xl.meta not saved inline.
if len(fi.Data) == 0 && fi.Size > 0 {
s.moveToTrash(dstDataPath, true, false)