use renameat2 Linux extension syscall (#17757)

this is a faster and safer alternative
on newer kernel versions.
This commit is contained in:
Harshavardhana 2023-08-27 09:57:11 -07:00 committed by GitHub
parent 6f0ed2a091
commit 8a57b6bced
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 146 additions and 67 deletions

View File

@ -20,7 +20,6 @@ package cmd
import ( import (
"bytes" "bytes"
"context" "context"
"errors"
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
@ -426,7 +425,7 @@ func TestListOnlineDisksSmallObjects(t *testing.T) {
} }
partsMetadata, errs := readAllFileInfo(ctx, erasureDisks, bucket, object, "", true) partsMetadata, errs := readAllFileInfo(ctx, erasureDisks, bucket, object, "", true)
_, err = getLatestFileInfo(ctx, partsMetadata, z.serverPools[0].sets[0].defaultParityCount, errs) fi, err := getLatestFileInfo(ctx, partsMetadata, z.serverPools[0].sets[0].defaultParityCount, errs)
if err != nil { if err != nil {
t.Fatalf("Failed to getLatestFileInfo %v", err) t.Fatalf("Failed to getLatestFileInfo %v", err)
} }
@ -484,11 +483,6 @@ func TestListOnlineDisksSmallObjects(t *testing.T) {
} }
} }
partsMetadata, errs = readAllFileInfo(ctx, erasureDisks, bucket, object, "", true)
fi, err := getLatestFileInfo(ctx, partsMetadata, z.serverPools[0].sets[0].defaultParityCount, errs)
if !errors.Is(err, errErasureReadQuorum) {
t.Fatalf("Failed to getLatestFileInfo, expected %v, got %v", errErasureReadQuorum, err)
}
rQuorum := len(errs) - z.serverPools[0].sets[0].defaultParityCount rQuorum := len(errs) - z.serverPools[0].sets[0].defaultParityCount
onlineDisks, modTime, _ := listOnlineDisks(erasureDisks, partsMetadata, test.errs, rQuorum) onlineDisks, modTime, _ := listOnlineDisks(erasureDisks, partsMetadata, test.errs, rQuorum)

View File

@ -644,9 +644,8 @@ func (er erasureObjects) PutObjectPart(ctx context.Context, bucket, object, uplo
tmpPartPath := pathJoin(tmpPart, partSuffix) tmpPartPath := pathJoin(tmpPart, partSuffix)
// Delete the temporary object part. If PutObjectPart succeeds there would be nothing to delete. // Delete the temporary object part. If PutObjectPart succeeds there would be nothing to delete.
var online int
defer func() { defer func() {
if online != len(onlineDisks) { if countOnlineDisks(onlineDisks) != len(onlineDisks) {
er.deleteAll(context.Background(), minioMetaTmpBucket, tmpPart) er.deleteAll(context.Background(), minioMetaTmpBucket, tmpPart)
} }
}() }()
@ -1214,6 +1213,7 @@ func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket str
partsMetadata[index].Metadata = fi.Metadata partsMetadata[index].Metadata = fi.Metadata
partsMetadata[index].Parts = fi.Parts partsMetadata[index].Parts = fi.Parts
partsMetadata[index].Checksum = fi.Checksum partsMetadata[index].Checksum = fi.Checksum
partsMetadata[index].Versioned = opts.Versioned || opts.VersionSuspended
} }
} }
@ -1228,6 +1228,9 @@ func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket str
// Remove parts that weren't present in CompleteMultipartUpload request. // Remove parts that weren't present in CompleteMultipartUpload request.
for _, curpart := range currentFI.Parts { for _, curpart := range currentFI.Parts {
// Remove part.meta which is not needed anymore.
er.removePartMeta(bucket, object, uploadID, currentFI.DataDir, curpart.Number)
if objectPartIndex(fi.Parts, curpart.Number) == -1 { if objectPartIndex(fi.Parts, curpart.Number) == -1 {
// Delete the missing part files. e.g, // Delete the missing part files. e.g,
// Request 1: NewMultipart // Request 1: NewMultipart
@ -1239,10 +1242,7 @@ func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket str
} }
} }
// Remove part.meta which is not needed anymore. defer er.deleteAll(context.Background(), minioMetaMultipartBucket, uploadIDPath)
for _, part := range currentFI.Parts {
er.removePartMeta(bucket, object, uploadID, currentFI.DataDir, part.Number)
}
// Rename the multipart object to final location. // Rename the multipart object to final location.
onlineDisks, versionsDisparity, err := renameData(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, onlineDisks, versionsDisparity, err := renameData(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath,

View File

@ -1196,15 +1196,7 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
partName := "part.1" partName := "part.1"
tempErasureObj := pathJoin(uniqueID, fi.DataDir, partName) tempErasureObj := pathJoin(uniqueID, fi.DataDir, partName)
// Delete temporary object in the event of failure. defer er.deleteAll(context.Background(), minioMetaTmpBucket, tempObj)
// If PutObject succeeded there would be no temporary
// object to delete.
var online int
defer func() {
if online != len(onlineDisks) {
er.deleteAll(context.Background(), minioMetaTmpBucket, tempObj)
}
}()
shardFileSize := erasure.ShardFileSize(data.Size()) shardFileSize := erasure.ShardFileSize(data.Size())
writers := make([]io.Writer, len(onlineDisks)) writers := make([]io.Writer, len(onlineDisks))
@ -1309,6 +1301,7 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
Algorithm: DefaultBitrotAlgorithm, Algorithm: DefaultBitrotAlgorithm,
Hash: bitrotWriterSum(w), Hash: bitrotWriterSum(w),
}) })
partsMetadata[i].Versioned = opts.Versioned || opts.VersionSuspended
} }
userDefined["etag"] = r.MD5CurrentHexString() userDefined["etag"] = r.MD5CurrentHexString()
@ -1389,7 +1382,6 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
} }
fi.ReplicationState = opts.PutReplicationState() fi.ReplicationState = opts.PutReplicationState()
online = countOnlineDisks(onlineDisks)
// we are adding a new version to this object under the namespace lock, so this is the latest version. // we are adding a new version to this object under the namespace lock, so this is the latest version.
fi.IsLatest = true fi.IsLatest = true

View File

@ -49,6 +49,7 @@ const (
osMetricReadDirent osMetricReadDirent
osMetricFdatasync osMetricFdatasync
osMetricSync osMetricSync
osMetricRename2 // Linux specific
// .... add more // .... add more
osMetricLast osMetricLast
@ -86,7 +87,11 @@ func (o *osMetrics) incTime(s osMetric, d time.Duration) {
o.latency[s].add(d) o.latency[s].add(d)
} }
func osTrace(s osMetric, startTime time.Time, duration time.Duration, path string) madmin.TraceInfo { func osTrace(s osMetric, startTime time.Time, duration time.Duration, path string, err error) madmin.TraceInfo {
var errStr string
if err != nil {
errStr = err.Error()
}
return madmin.TraceInfo{ return madmin.TraceInfo{
TraceType: madmin.TraceOS, TraceType: madmin.TraceOS,
Time: startTime, Time: startTime,
@ -94,53 +99,54 @@ func osTrace(s osMetric, startTime time.Time, duration time.Duration, path strin
FuncName: "os." + s.String(), FuncName: "os." + s.String(),
Duration: duration, Duration: duration,
Path: path, Path: path,
Error: errStr,
} }
} }
func updateOSMetrics(s osMetric, paths ...string) func() { func updateOSMetrics(s osMetric, paths ...string) func(err error) {
if globalTrace.NumSubscribers(madmin.TraceOS) == 0 { if globalTrace.NumSubscribers(madmin.TraceOS) == 0 {
return globalOSMetrics.time(s) return func(err error) { globalOSMetrics.time(s) }
} }
startTime := time.Now() startTime := time.Now()
return func() { return func(err error) {
duration := time.Since(startTime) duration := time.Since(startTime)
globalOSMetrics.incTime(s, duration) globalOSMetrics.incTime(s, duration)
globalTrace.Publish(osTrace(s, startTime, duration, strings.Join(paths, " -> "))) globalTrace.Publish(osTrace(s, startTime, duration, strings.Join(paths, " -> "), err))
} }
} }
// RemoveAll captures time taken to call the underlying os.RemoveAll // RemoveAll captures time taken to call the underlying os.RemoveAll
func RemoveAll(dirPath string) error { func RemoveAll(dirPath string) (err error) {
defer updateOSMetrics(osMetricRemoveAll, dirPath)() defer updateOSMetrics(osMetricRemoveAll, dirPath)(err)
return os.RemoveAll(dirPath) return os.RemoveAll(dirPath)
} }
// Mkdir captures time taken to call os.Mkdir // Mkdir captures time taken to call os.Mkdir
func Mkdir(dirPath string, mode os.FileMode) error { func Mkdir(dirPath string, mode os.FileMode) (err error) {
defer updateOSMetrics(osMetricMkdir, dirPath)() defer updateOSMetrics(osMetricMkdir, dirPath)(err)
return os.Mkdir(dirPath, mode) return os.Mkdir(dirPath, mode)
} }
// MkdirAll captures time taken to call os.MkdirAll // MkdirAll captures time taken to call os.MkdirAll
func MkdirAll(dirPath string, mode os.FileMode) error { func MkdirAll(dirPath string, mode os.FileMode) (err error) {
defer updateOSMetrics(osMetricMkdirAll, dirPath)() defer updateOSMetrics(osMetricMkdirAll, dirPath)(err)
return osMkdirAll(dirPath, mode) return osMkdirAll(dirPath, mode)
} }
// Rename captures time taken to call os.Rename // Rename captures time taken to call os.Rename
func Rename(src, dst string) error { func Rename(src, dst string) (err error) {
defer updateOSMetrics(osMetricRename, src, dst)() defer updateOSMetrics(osMetricRename, src, dst)(err)
return os.Rename(src, dst) return os.Rename(src, dst)
} }
// OpenFile captures time taken to call os.OpenFile // OpenFile captures time taken to call os.OpenFile
func OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) { func OpenFile(name string, flag int, perm os.FileMode) (f *os.File, err error) {
switch flag & writeMode { switch flag & writeMode {
case writeMode: case writeMode:
defer updateOSMetrics(osMetricOpenFileW, name)() defer updateOSMetrics(osMetricOpenFileW, name)(err)
default: default:
defer updateOSMetrics(osMetricOpenFileR, name)() defer updateOSMetrics(osMetricOpenFileR, name)(err)
} }
return os.OpenFile(name, flag, perm) return os.OpenFile(name, flag, perm)
} }
@ -148,54 +154,54 @@ func OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) {
// Access captures time taken to call syscall.Access() // Access captures time taken to call syscall.Access()
// on windows, plan9 and solaris syscall.Access uses // on windows, plan9 and solaris syscall.Access uses
// os.Lstat() // os.Lstat()
func Access(name string) error { func Access(name string) (err error) {
defer updateOSMetrics(osMetricAccess, name)() defer updateOSMetrics(osMetricAccess, name)(err)
return access(name) return access(name)
} }
// Open captures time taken to call os.Open // Open captures time taken to call os.Open
func Open(name string) (*os.File, error) { func Open(name string) (f *os.File, err error) {
defer updateOSMetrics(osMetricOpen, name)() defer updateOSMetrics(osMetricOpen, name)(err)
return os.Open(name) return os.Open(name)
} }
// OpenFileDirectIO captures time taken to call disk.OpenFileDirectIO // OpenFileDirectIO captures time taken to call disk.OpenFileDirectIO
func OpenFileDirectIO(name string, flag int, perm os.FileMode) (*os.File, error) { func OpenFileDirectIO(name string, flag int, perm os.FileMode) (f *os.File, err error) {
defer updateOSMetrics(osMetricOpenFileDirectIO, name)() defer updateOSMetrics(osMetricOpenFileDirectIO, name)(err)
return disk.OpenFileDirectIO(name, flag, perm) return disk.OpenFileDirectIO(name, flag, perm)
} }
// Lstat captures time taken to call os.Lstat // Lstat captures time taken to call os.Lstat
func Lstat(name string) (os.FileInfo, error) { func Lstat(name string) (info os.FileInfo, err error) {
defer updateOSMetrics(osMetricLstat, name)() defer updateOSMetrics(osMetricLstat, name)(err)
return os.Lstat(name) return os.Lstat(name)
} }
// Remove captures time taken to call os.Remove // Remove captures time taken to call os.Remove
func Remove(deletePath string) error { func Remove(deletePath string) (err error) {
defer updateOSMetrics(osMetricRemove, deletePath)() defer updateOSMetrics(osMetricRemove, deletePath)(err)
return os.Remove(deletePath) return os.Remove(deletePath)
} }
// Stat captures time taken to call os.Stat // Stat captures time taken to call os.Stat
func Stat(name string) (os.FileInfo, error) { func Stat(name string) (info os.FileInfo, err error) {
defer updateOSMetrics(osMetricStat, name)() defer updateOSMetrics(osMetricStat, name)(err)
return os.Stat(name) return os.Stat(name)
} }
// Create captures time taken to call os.Create // Create captures time taken to call os.Create
func Create(name string) (*os.File, error) { func Create(name string) (f *os.File, err error) {
defer updateOSMetrics(osMetricCreate, name)() defer updateOSMetrics(osMetricCreate, name)(err)
return os.Create(name) return os.Create(name)
} }
// Fdatasync captures time taken to call Fdatasync // Fdatasync captures time taken to call Fdatasync
func Fdatasync(f *os.File) error { func Fdatasync(f *os.File) (err error) {
fn := "" fn := ""
if f != nil { if f != nil {
fn = f.Name() fn = f.Name()
} }
defer updateOSMetrics(osMetricFdatasync, fn)() defer updateOSMetrics(osMetricFdatasync, fn)(err)
return disk.Fdatasync(f) return disk.Fdatasync(f)
} }

31
cmd/os-rename_linux.go Normal file
View File

@ -0,0 +1,31 @@
//go:build linux
// +build linux
// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import (
"golang.org/x/sys/unix"
)
// Rename2 captures time taken to call os.Rename
func Rename2(src, dst string) (err error) {
defer updateOSMetrics(osMetricRename2, src, dst)(err)
return unix.Renameat2(unix.AT_FDCWD, src, unix.AT_FDCWD, dst, uint(2)) // RENAME_EXCHANGE from 'man renameat2'
}

29
cmd/os-rename_nolinux.go Normal file
View File

@ -0,0 +1,29 @@
//go:build !linux
// +build !linux
// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import "errors"
// Rename captures time taken to call os.Rename
func Rename2(src, dst string) (err error) {
defer updateOSMetrics(osMetricRename2, src, dst)(errors.New("not implemented, skipping"))
return errSkipFile
}

View File

@ -24,12 +24,13 @@ func _() {
_ = x[osMetricReadDirent-13] _ = x[osMetricReadDirent-13]
_ = x[osMetricFdatasync-14] _ = x[osMetricFdatasync-14]
_ = x[osMetricSync-15] _ = x[osMetricSync-15]
_ = x[osMetricLast-16] _ = x[osMetricRename2-16]
_ = x[osMetricLast-17]
} }
const _osMetric_name = "RemoveAllMkdirAllMkdirRenameOpenFileWOpenFileROpenOpenFileDirectIOLstatRemoveStatAccessCreateReadDirentFdatasyncSyncLast" const _osMetric_name = "RemoveAllMkdirAllMkdirRenameOpenFileWOpenFileROpenOpenFileDirectIOLstatRemoveStatAccessCreateReadDirentFdatasyncSyncRename2Last"
var _osMetric_index = [...]uint8{0, 9, 17, 22, 28, 37, 46, 50, 66, 71, 77, 81, 87, 93, 103, 112, 116, 120} var _osMetric_index = [...]uint8{0, 9, 17, 22, 28, 37, 46, 50, 66, 71, 77, 81, 87, 93, 103, 112, 116, 123, 127}
func (i osMetric) String() string { func (i osMetric) String() string {
if i >= osMetric(len(_osMetric_index)-1) { if i >= osMetric(len(_osMetric_index)-1) {

View File

@ -235,6 +235,9 @@ type FileInfo struct {
// Combined checksum when object was uploaded. // Combined checksum when object was uploaded.
Checksum []byte `msg:"cs,allownil"` Checksum []byte `msg:"cs,allownil"`
// Versioned - indicates if this file is versioned or not.
Versioned bool `msg:"vs"`
} }
// WriteQuorum returns expected write quorum for this FileInfo // WriteQuorum returns expected write quorum for this FileInfo

View File

@ -669,8 +669,8 @@ func (z *FileInfo) DecodeMsg(dc *msgp.Reader) (err error) {
err = msgp.WrapError(err) err = msgp.WrapError(err)
return return
} }
if zb0001 != 28 { if zb0001 != 29 {
err = msgp.ArrayError{Wanted: 28, Got: zb0001} err = msgp.ArrayError{Wanted: 29, Got: zb0001}
return return
} }
z.Volume, err = dc.ReadString() z.Volume, err = dc.ReadString()
@ -850,13 +850,18 @@ func (z *FileInfo) DecodeMsg(dc *msgp.Reader) (err error) {
err = msgp.WrapError(err, "Checksum") err = msgp.WrapError(err, "Checksum")
return return
} }
z.Versioned, err = dc.ReadBool()
if err != nil {
err = msgp.WrapError(err, "Versioned")
return
}
return return
} }
// EncodeMsg implements msgp.Encodable // EncodeMsg implements msgp.Encodable
func (z *FileInfo) EncodeMsg(en *msgp.Writer) (err error) { func (z *FileInfo) EncodeMsg(en *msgp.Writer) (err error) {
// array header, size 28 // array header, size 29
err = en.Append(0xdc, 0x0, 0x1c) err = en.Append(0xdc, 0x0, 0x1d)
if err != nil { if err != nil {
return return
} }
@ -1019,14 +1024,19 @@ func (z *FileInfo) EncodeMsg(en *msgp.Writer) (err error) {
err = msgp.WrapError(err, "Checksum") err = msgp.WrapError(err, "Checksum")
return return
} }
err = en.WriteBool(z.Versioned)
if err != nil {
err = msgp.WrapError(err, "Versioned")
return
}
return return
} }
// MarshalMsg implements msgp.Marshaler // MarshalMsg implements msgp.Marshaler
func (z *FileInfo) MarshalMsg(b []byte) (o []byte, err error) { func (z *FileInfo) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.Require(b, z.Msgsize()) o = msgp.Require(b, z.Msgsize())
// array header, size 28 // array header, size 29
o = append(o, 0xdc, 0x0, 0x1c) o = append(o, 0xdc, 0x0, 0x1d)
o = msgp.AppendString(o, z.Volume) o = msgp.AppendString(o, z.Volume)
o = msgp.AppendString(o, z.Name) o = msgp.AppendString(o, z.Name)
o = msgp.AppendString(o, z.VersionID) o = msgp.AppendString(o, z.VersionID)
@ -1074,6 +1084,7 @@ func (z *FileInfo) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.AppendInt(o, z.Idx) o = msgp.AppendInt(o, z.Idx)
o = msgp.AppendTime(o, z.DiskMTime) o = msgp.AppendTime(o, z.DiskMTime)
o = msgp.AppendBytes(o, z.Checksum) o = msgp.AppendBytes(o, z.Checksum)
o = msgp.AppendBool(o, z.Versioned)
return return
} }
@ -1085,8 +1096,8 @@ func (z *FileInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
err = msgp.WrapError(err) err = msgp.WrapError(err)
return return
} }
if zb0001 != 28 { if zb0001 != 29 {
err = msgp.ArrayError{Wanted: 28, Got: zb0001} err = msgp.ArrayError{Wanted: 29, Got: zb0001}
return return
} }
z.Volume, bts, err = msgp.ReadStringBytes(bts) z.Volume, bts, err = msgp.ReadStringBytes(bts)
@ -1266,6 +1277,11 @@ func (z *FileInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
err = msgp.WrapError(err, "Checksum") err = msgp.WrapError(err, "Checksum")
return return
} }
z.Versioned, bts, err = msgp.ReadBoolBytes(bts)
if err != nil {
err = msgp.WrapError(err, "Versioned")
return
}
o = bts o = bts
return return
} }
@ -1283,7 +1299,7 @@ func (z *FileInfo) Msgsize() (s int) {
for za0003 := range z.Parts { for za0003 := range z.Parts {
s += z.Parts[za0003].Msgsize() s += z.Parts[za0003].Msgsize()
} }
s += z.Erasure.Msgsize() + msgp.BoolSize + z.ReplicationState.Msgsize() + msgp.BytesPrefixSize + len(z.Data) + msgp.IntSize + msgp.TimeSize + msgp.BoolSize + msgp.IntSize + msgp.TimeSize + msgp.BytesPrefixSize + len(z.Checksum) s += z.Erasure.Msgsize() + msgp.BoolSize + z.ReplicationState.Msgsize() + msgp.BytesPrefixSize + len(z.Data) + msgp.IntSize + msgp.TimeSize + msgp.BoolSize + msgp.IntSize + msgp.TimeSize + msgp.BytesPrefixSize + len(z.Checksum) + msgp.BoolSize
return return
} }

View File

@ -2485,6 +2485,13 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f
} }
diskHealthCheckOK(ctx, err) diskHealthCheckOK(ctx, err)
if !fi.Versioned && !fi.Healing() {
// Use https://man7.org/linux/man-pages/man2/rename.2.html if possible on unversioned bucket.
if err := Rename2(pathutil.Join(srcVolumeDir, srcPath), pathutil.Join(dstVolumeDir, dstPath)); err == nil {
return sign, nil
} // if Rename2 is not successful fallback.
}
// renameAll only for objects that have xl.meta not saved inline. // renameAll only for objects that have xl.meta not saved inline.
if len(fi.Data) == 0 && fi.Size > 0 { if len(fi.Data) == 0 && fi.Size > 0 {
s.moveToTrash(dstDataPath, true, false) s.moveToTrash(dstDataPath, true, false)