From f1abb92f0c146b20d19e192d2b4abeb8c472c721 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 30 May 2022 10:58:37 -0700 Subject: [PATCH] feat: Single drive XL implementation (#14970) Main motivation is move towards a common backend format for all different types of modes in MinIO, allowing for a simpler code and predictable behavior across all features. This PR also brings features such as versioning, replication, transitioning to single drive setups. --- Makefile | 8 +- README.md | 6 - buildscripts/race.sh | 1 + cmd/admin-bucket-handlers.go | 7 +- cmd/admin-handlers-users-race_test.go | 8 +- cmd/admin-handlers-users_test.go | 8 +- cmd/admin-handlers.go | 9 +- cmd/admin-router.go | 78 +- cmd/bucket-handlers.go | 4 +- cmd/bucket-metadata-sys.go | 9 - cmd/bucket-replication.go | 4 - cmd/bucket-targets.go | 6 - cmd/endpoint.go | 2 +- cmd/endpoint_test.go | 6 +- cmd/erasure-coding.go | 2 +- cmd/erasure-metadata.go | 8 +- cmd/erasure-object.go | 5 + cmd/erasure-server-pool.go | 22 +- cmd/erasure-sets.go | 2 +- cmd/erasure-single-drive.go | 3289 +++++++++++++++++++++++++ cmd/erasure.go | 2 +- cmd/format-erasure.go | 16 +- cmd/format-fs.go | 7 + cmd/fs-v1-metadata_test.go | 3 + cmd/fs-v1-multipart_test.go | 2 + cmd/fs-v1.go | 5 + cmd/fs-v1_test.go | 7 + cmd/gateway-main.go | 5 + cmd/global-heal.go | 2 + cmd/globals.go | 4 + cmd/handler-api.go | 5 +- cmd/iam.go | 16 +- cmd/lock-rest-server.go | 17 +- cmd/metacache-bucket.go | 20 +- cmd/metacache-manager.go | 2 +- cmd/metacache-server-pool.go | 347 +++ cmd/metacache-set.go | 338 +++ cmd/metacache.go | 4 +- cmd/metrics-v2.go | 4 +- cmd/metrics.go | 2 +- cmd/object-api-listobjects_test.go | 20 +- cmd/object-api-multipart_test.go | 56 +- cmd/object-api-utils.go | 2 +- cmd/object-handlers_test.go | 2 +- cmd/prepare-storage.go | 8 +- cmd/server-main.go | 52 +- cmd/server-main_test.go | 4 +- cmd/server_test.go | 14 +- cmd/setup-type.go | 5 + cmd/storage-errors.go | 6 + cmd/storage-rest-server.go | 6 + cmd/sts-handlers.go | 4 +- cmd/sts-handlers_test.go | 8 +- cmd/test-utils_test.go | 32 +- cmd/tier-handlers.go | 12 +- cmd/utils.go | 2 + cmd/xl-storage-format-utils.go | 3 + cmd/xl-storage-format-v1.go | 2 +- cmd/xl-storage-format_test.go | 10 +- docs/config/README.md | 2 +- docs/gateway/nas.md | 4 +- internal/config/errors.go | 12 + 62 files changed, 4288 insertions(+), 270 deletions(-) create mode 100644 cmd/erasure-single-drive.go diff --git a/Makefile b/Makefile index 84a3ee7d4..df8e24b69 100644 --- a/Makefile +++ b/Makefile @@ -53,7 +53,7 @@ test-iam: build ## verify IAM (external IDP, etcd backends) @echo "Running tests for IAM (external IDP, etcd backends)" @CGO_ENABLED=0 go test -tags kqueue -v -run TestIAM* ./cmd @echo "Running tests for IAM (external IDP, etcd backends) with -race" - @CGO_ENABLED=1 go test -race -tags kqueue -v -run TestIAM* ./cmd + @GORACE=history_size=7 CGO_ENABLED=1 go test -race -tags kqueue -v -run TestIAM* ./cmd test-replication: install ## verify multi site replication @echo "Running tests for replicating three sites" @@ -73,18 +73,18 @@ test-site-replication-minio: install ## verify automatic site replication verify: ## verify minio various setups @echo "Verifying build with race" - @CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null + @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @(env bash $(PWD)/buildscripts/verify-build.sh) verify-healing: ## verify healing and replacing disks with minio binary @echo "Verify healing build with race" - @CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null + @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @(env bash $(PWD)/buildscripts/verify-healing.sh) @(env bash $(PWD)/buildscripts/unaligned-healing.sh) verify-healing-inconsistent-versions: ## verify resolving inconsistent versions @echo "Verify resolving inconsistent versions build with race" - @CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null + @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @(env bash $(PWD)/buildscripts/resolve-right-versions.sh) build: checks ## builds minio to $(PWD) diff --git a/README.md b/README.md index 4c80cad53..be9790a0b 100644 --- a/README.md +++ b/README.md @@ -196,12 +196,6 @@ iptables -A INPUT -p tcp --dport 9000:9010 -j ACCEPT service iptables restart ``` -## Pre-existing data - -When deployed on a single drive, MinIO server lets clients access any pre-existing data in the data directory. For example, if MinIO is started with the command `minio server /mnt/data`, any pre-existing data in the `/mnt/data` directory would be accessible to the clients. - -The above statement is also valid for all gateway backends. - ## Test MinIO Connectivity ### Test using MinIO Console diff --git a/buildscripts/race.sh b/buildscripts/race.sh index fbdd9b428..6b3964fcc 100755 --- a/buildscripts/race.sh +++ b/buildscripts/race.sh @@ -2,6 +2,7 @@ set -e +export GORACE="history_size=7" ## TODO remove `dsync` from race detector once this is merged and released https://go-review.googlesource.com/c/go/+/333529/ for d in $(go list ./... | grep -v dsync); do CGO_ENABLED=1 go test -v -race --timeout 100m "$d" diff --git a/cmd/admin-bucket-handlers.go b/cmd/admin-bucket-handlers.go index 49ab89443..360067fe3 100644 --- a/cmd/admin-bucket-handlers.go +++ b/cmd/admin-bucket-handlers.go @@ -145,7 +145,7 @@ func (a adminAPIHandlers) SetRemoteTargetHandler(w http.ResponseWriter, r *http. bucket := pathClean(vars["bucket"]) update := r.Form.Get("update") == "true" - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -274,7 +274,8 @@ func (a adminAPIHandlers) ListRemoteTargetsHandler(w http.ResponseWriter, r *htt vars := mux.Vars(r) bucket := pathClean(vars["bucket"]) arnType := vars["type"] - if !globalIsErasure { + + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -314,7 +315,7 @@ func (a adminAPIHandlers) RemoveRemoteTargetHandler(w http.ResponseWriter, r *ht bucket := pathClean(vars["bucket"]) arn := vars["arn"] - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } diff --git a/cmd/admin-handlers-users-race_test.go b/cmd/admin-handlers-users-race_test.go index 7b10d438e..c3524da1d 100644 --- a/cmd/admin-handlers-users-race_test.go +++ b/cmd/admin-handlers-users-race_test.go @@ -47,10 +47,10 @@ func TestIAMInternalIDPConcurrencyServerSuite(t *testing.T) { } baseTestCases := []TestSuiteCommon{ - // Init and run test on FS backend with signature v4. - {serverType: "FS", signer: signerV4}, - // Init and run test on FS backend, with tls enabled. - {serverType: "FS", signer: signerV4, secure: true}, + // Init and run test on ErasureSD backend with signature v4. + {serverType: "ErasureSD", signer: signerV4}, + // Init and run test on ErasureSD backend, with tls enabled. + {serverType: "ErasureSD", signer: signerV4, secure: true}, // Init and run test on Erasure backend. {serverType: "Erasure", signer: signerV4}, // Init and run test on ErasureSet backend. diff --git a/cmd/admin-handlers-users_test.go b/cmd/admin-handlers-users_test.go index 91ebdffa2..6291d70c4 100644 --- a/cmd/admin-handlers-users_test.go +++ b/cmd/admin-handlers-users_test.go @@ -102,10 +102,10 @@ func (s *TestSuiteIAM) iamSetup(c *check) { // common to tests. var iamTestSuites = func() []*TestSuiteIAM { baseTestCases := []TestSuiteCommon{ - // Init and run test on FS backend with signature v4. - {serverType: "FS", signer: signerV4}, - // Init and run test on FS backend, with tls enabled. - {serverType: "FS", signer: signerV4, secure: true}, + // Init and run test on ErasureSD backend with signature v4. + {serverType: "ErasureSD", signer: signerV4}, + // Init and run test on ErasureSD backend, with tls enabled. + {serverType: "ErasureSD", signer: signerV4, secure: true}, // Init and run test on Erasure backend. {serverType: "Erasure", signer: signerV4}, // Init and run test on ErasureSet backend. diff --git a/cmd/admin-handlers.go b/cmd/admin-handlers.go index 32b1cc741..af6f85433 100644 --- a/cmd/admin-handlers.go +++ b/cmd/admin-handlers.go @@ -805,8 +805,7 @@ func (a adminAPIHandlers) HealHandler(w http.ResponseWriter, r *http.Request) { return } - // Check if this setup has an erasure coded backend. - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrHealNotImplemented), r.URL) return } @@ -998,7 +997,7 @@ func (a adminAPIHandlers) BackgroundHealStatusHandler(w http.ResponseWriter, r * } // Check if this setup has an erasure coded backend. - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrHealNotImplemented), r.URL) return } @@ -1078,7 +1077,7 @@ func (a adminAPIHandlers) ObjectSpeedtestHandler(w http.ResponseWriter, r *http. return } - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -1228,7 +1227,7 @@ func (a adminAPIHandlers) DriveSpeedtestHandler(w http.ResponseWriter, r *http.R return } - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } diff --git a/cmd/admin-router.go b/cmd/admin-router.go index f30414e67..9a62dcede 100644 --- a/cmd/admin-router.go +++ b/cmd/admin-router.go @@ -170,50 +170,48 @@ func registerAdminRouter(router *mux.Router, enableConfigOps bool) { // Set Group Status adminRouter.Methods(http.MethodPut).Path(adminVersion+"/set-group-status").HandlerFunc(gz(httpTraceHdrs(adminAPI.SetGroupStatus))).Queries("group", "{group:.*}").Queries("status", "{status:.*}") - if globalIsDistErasure || globalIsErasure { - // GetBucketQuotaConfig - adminRouter.Methods(http.MethodGet).Path(adminVersion+"/get-bucket-quota").HandlerFunc( - gz(httpTraceHdrs(adminAPI.GetBucketQuotaConfigHandler))).Queries("bucket", "{bucket:.*}") - // PutBucketQuotaConfig - adminRouter.Methods(http.MethodPut).Path(adminVersion+"/set-bucket-quota").HandlerFunc( - gz(httpTraceHdrs(adminAPI.PutBucketQuotaConfigHandler))).Queries("bucket", "{bucket:.*}") + // GetBucketQuotaConfig + adminRouter.Methods(http.MethodGet).Path(adminVersion+"/get-bucket-quota").HandlerFunc( + gz(httpTraceHdrs(adminAPI.GetBucketQuotaConfigHandler))).Queries("bucket", "{bucket:.*}") + // PutBucketQuotaConfig + adminRouter.Methods(http.MethodPut).Path(adminVersion+"/set-bucket-quota").HandlerFunc( + gz(httpTraceHdrs(adminAPI.PutBucketQuotaConfigHandler))).Queries("bucket", "{bucket:.*}") - // Bucket replication operations - // GetBucketTargetHandler - adminRouter.Methods(http.MethodGet).Path(adminVersion+"/list-remote-targets").HandlerFunc( - gz(httpTraceHdrs(adminAPI.ListRemoteTargetsHandler))).Queries("bucket", "{bucket:.*}", "type", "{type:.*}") - // SetRemoteTargetHandler - adminRouter.Methods(http.MethodPut).Path(adminVersion+"/set-remote-target").HandlerFunc( - gz(httpTraceHdrs(adminAPI.SetRemoteTargetHandler))).Queries("bucket", "{bucket:.*}") - // RemoveRemoteTargetHandler - adminRouter.Methods(http.MethodDelete).Path(adminVersion+"/remove-remote-target").HandlerFunc( - gz(httpTraceHdrs(adminAPI.RemoveRemoteTargetHandler))).Queries("bucket", "{bucket:.*}", "arn", "{arn:.*}") + // Bucket replication operations + // GetBucketTargetHandler + adminRouter.Methods(http.MethodGet).Path(adminVersion+"/list-remote-targets").HandlerFunc( + gz(httpTraceHdrs(adminAPI.ListRemoteTargetsHandler))).Queries("bucket", "{bucket:.*}", "type", "{type:.*}") + // SetRemoteTargetHandler + adminRouter.Methods(http.MethodPut).Path(adminVersion+"/set-remote-target").HandlerFunc( + gz(httpTraceHdrs(adminAPI.SetRemoteTargetHandler))).Queries("bucket", "{bucket:.*}") + // RemoveRemoteTargetHandler + adminRouter.Methods(http.MethodDelete).Path(adminVersion+"/remove-remote-target").HandlerFunc( + gz(httpTraceHdrs(adminAPI.RemoveRemoteTargetHandler))).Queries("bucket", "{bucket:.*}", "arn", "{arn:.*}") - // Remote Tier management operations - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/tier").HandlerFunc(gz(httpTraceHdrs(adminAPI.AddTierHandler))) - adminRouter.Methods(http.MethodPost).Path(adminVersion + "/tier/{tier}").HandlerFunc(gz(httpTraceHdrs(adminAPI.EditTierHandler))) - adminRouter.Methods(http.MethodGet).Path(adminVersion + "/tier").HandlerFunc(gz(httpTraceHdrs(adminAPI.ListTierHandler))) - adminRouter.Methods(http.MethodDelete).Path(adminVersion + "/tier/{tier}").HandlerFunc(gz(httpTraceHdrs(adminAPI.RemoveTierHandler))) - adminRouter.Methods(http.MethodGet).Path(adminVersion + "/tier/{tier}").HandlerFunc(gz(httpTraceHdrs(adminAPI.VerifyTierHandler))) - // Tier stats - adminRouter.Methods(http.MethodGet).Path(adminVersion + "/tier-stats").HandlerFunc(gz(httpTraceHdrs(adminAPI.TierStatsHandler))) + // Remote Tier management operations + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/tier").HandlerFunc(gz(httpTraceHdrs(adminAPI.AddTierHandler))) + adminRouter.Methods(http.MethodPost).Path(adminVersion + "/tier/{tier}").HandlerFunc(gz(httpTraceHdrs(adminAPI.EditTierHandler))) + adminRouter.Methods(http.MethodGet).Path(adminVersion + "/tier").HandlerFunc(gz(httpTraceHdrs(adminAPI.ListTierHandler))) + adminRouter.Methods(http.MethodDelete).Path(adminVersion + "/tier/{tier}").HandlerFunc(gz(httpTraceHdrs(adminAPI.RemoveTierHandler))) + adminRouter.Methods(http.MethodGet).Path(adminVersion + "/tier/{tier}").HandlerFunc(gz(httpTraceHdrs(adminAPI.VerifyTierHandler))) + // Tier stats + adminRouter.Methods(http.MethodGet).Path(adminVersion + "/tier-stats").HandlerFunc(gz(httpTraceHdrs(adminAPI.TierStatsHandler))) - // Cluster Replication APIs - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/add").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationAdd))) - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/remove").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationRemove))) - adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/info").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationInfo))) - adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/metainfo").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationMetaInfo))) - adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/status").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationStatus))) + // Cluster Replication APIs + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/add").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationAdd))) + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/remove").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationRemove))) + adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/info").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationInfo))) + adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/metainfo").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationMetaInfo))) + adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/status").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationStatus))) - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/join").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerJoin))) - adminRouter.Methods(http.MethodPut).Path(adminVersion+"/site-replication/peer/bucket-ops").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerBucketOps))).Queries("bucket", "{bucket:.*}").Queries("operation", "{operation:.*}") - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/iam-item").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerReplicateIAMItem))) - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/bucket-meta").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerReplicateBucketItem))) - adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/peer/idp-settings").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerGetIDPSettings))) - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/edit").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationEdit))) - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/edit").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerEdit))) - adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/remove").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerRemove))) - } + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/join").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerJoin))) + adminRouter.Methods(http.MethodPut).Path(adminVersion+"/site-replication/peer/bucket-ops").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerBucketOps))).Queries("bucket", "{bucket:.*}").Queries("operation", "{operation:.*}") + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/iam-item").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerReplicateIAMItem))) + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/bucket-meta").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerReplicateBucketItem))) + adminRouter.Methods(http.MethodGet).Path(adminVersion + "/site-replication/peer/idp-settings").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerGetIDPSettings))) + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/edit").HandlerFunc(gz(httpTraceHdrs(adminAPI.SiteReplicationEdit))) + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/edit").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerEdit))) + adminRouter.Methods(http.MethodPut).Path(adminVersion + "/site-replication/peer/remove").HandlerFunc(gz(httpTraceHdrs(adminAPI.SRPeerRemove))) if globalIsDistErasure { // Top locks diff --git a/cmd/bucket-handlers.go b/cmd/bucket-handlers.go index 5ec439f3a..3475dacad 100644 --- a/cmd/bucket-handlers.go +++ b/cmd/bucket-handlers.go @@ -1364,7 +1364,7 @@ func (api objectAPIHandlers) PutBucketObjectLockConfigHandler(w http.ResponseWri writeErrorResponse(ctx, w, errorCodes.ToAPIErr(ErrServerNotInitialized), r.URL) return } - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -1611,7 +1611,7 @@ func (api objectAPIHandlers) PutBucketReplicationConfigHandler(w http.ResponseWr writeErrorResponse(ctx, w, errorCodes.ToAPIErr(ErrServerNotInitialized), r.URL) return } - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } diff --git a/cmd/bucket-metadata-sys.go b/cmd/bucket-metadata-sys.go index 125d6a2fd..c8fbdca84 100644 --- a/cmd/bucket-metadata-sys.go +++ b/cmd/bucket-metadata-sys.go @@ -127,20 +127,11 @@ func (sys *BucketMetadataSys) Update(ctx context.Context, bucket string, configF meta.QuotaConfigJSON = configData meta.QuotaConfigUpdatedAt = UTCNow() case objectLockConfig: - if !globalIsErasure && !globalIsDistErasure { - return NotImplemented{} - } meta.ObjectLockConfigXML = configData meta.ObjectLockConfigUpdatedAt = UTCNow() case bucketVersioningConfig: - if !globalIsErasure && !globalIsDistErasure { - return NotImplemented{} - } meta.VersioningConfigXML = configData case bucketReplicationConfig: - if !globalIsErasure && !globalIsDistErasure { - return NotImplemented{} - } meta.ReplicationConfigXML = configData meta.ReplicationConfigUpdatedAt = UTCNow() case bucketTargetsFile: diff --git a/cmd/bucket-replication.go b/cmd/bucket-replication.go index dd89af7fe..260f4c653 100644 --- a/cmd/bucket-replication.go +++ b/cmd/bucket-replication.go @@ -2187,10 +2187,6 @@ func (p *ReplicationPool) initResync(ctx context.Context, buckets []BucketInfo, if objAPI == nil { return errServerNotInitialized } - // replication applies only to erasure coded setups - if !globalIsErasure { - return nil - } // Load bucket metadata sys in background go p.loadResync(ctx, buckets, objAPI) return nil diff --git a/cmd/bucket-targets.go b/cmd/bucket-targets.go index 776188ab0..26dc5685b 100644 --- a/cmd/bucket-targets.go +++ b/cmd/bucket-targets.go @@ -120,9 +120,6 @@ func (sys *BucketTargetSys) SetTarget(ctx context.Context, bucket string, tgt *m return BucketRemoteConnectionErr{Bucket: tgt.TargetBucket, Err: err} } if tgt.Type == madmin.ReplicationService { - if !globalIsErasure { - return NotImplemented{Message: "Replication is not implemented in " + getMinioMode()} - } if !globalBucketVersioningSys.Enabled(bucket) { return BucketReplicationSourceNotVersioned{Bucket: bucket} } @@ -184,9 +181,6 @@ func (sys *BucketTargetSys) RemoveTarget(ctx context.Context, bucket, arnStr str if globalIsGateway { return nil } - if !globalIsErasure { - return NotImplemented{Message: "Replication is not implemented in " + getMinioMode()} - } if arnStr == "" { return BucketRemoteArnInvalid{Bucket: bucket} diff --git a/cmd/endpoint.go b/cmd/endpoint.go index aadfa17c6..5cf67f4a3 100644 --- a/cmd/endpoint.go +++ b/cmd/endpoint.go @@ -582,7 +582,7 @@ func CreateEndpoints(serverAddr string, foundLocal bool, args ...[]string) (Endp return endpoints, setupType, config.ErrInvalidFSEndpoint(nil).Msg("use path style endpoint for FS setup") } endpoints = append(endpoints, endpoint) - setupType = FSSetupType + setupType = ErasureSDSetupType // Check for cross device mounts if any. if err = checkCrossDeviceMounts(endpoints); err != nil { diff --git a/cmd/endpoint_test.go b/cmd/endpoint_test.go index 073d9e41c..4925f0617 100644 --- a/cmd/endpoint_test.go +++ b/cmd/endpoint_test.go @@ -231,10 +231,10 @@ func TestCreateEndpoints(t *testing.T) { }{ {"localhost", [][]string{}, "", Endpoints{}, -1, fmt.Errorf("address localhost: missing port in address")}, - // FS Setup + // Erasure Single Drive {"localhost:9000", [][]string{{"http://localhost/d1"}}, "", Endpoints{}, -1, fmt.Errorf("use path style endpoint for FS setup")}, - {":443", [][]string{{"/d1"}}, ":443", Endpoints{Endpoint{URL: &url.URL{Path: mustAbs("/d1")}, IsLocal: true}}, FSSetupType, nil}, - {"localhost:10000", [][]string{{"/d1"}}, "localhost:10000", Endpoints{Endpoint{URL: &url.URL{Path: mustAbs("/d1")}, IsLocal: true}}, FSSetupType, nil}, + {":443", [][]string{{"/d1"}}, ":443", Endpoints{Endpoint{URL: &url.URL{Path: mustAbs("/d1")}, IsLocal: true}}, ErasureSDSetupType, nil}, + {"localhost:10000", [][]string{{"/d1"}}, "localhost:10000", Endpoints{Endpoint{URL: &url.URL{Path: mustAbs("/d1")}, IsLocal: true}}, ErasureSDSetupType, nil}, {"localhost:9000", [][]string{{"https://127.0.0.1:9000/d1", "https://localhost:9001/d1", "https://example.com/d1", "https://example.com/d2"}}, "", Endpoints{}, -1, fmt.Errorf("path '/d1' can not be served by different port on same address")}, // Erasure Setup with PathEndpointType diff --git a/cmd/erasure-coding.go b/cmd/erasure-coding.go index b52eba69b..a61a003e3 100644 --- a/cmd/erasure-coding.go +++ b/cmd/erasure-coding.go @@ -41,7 +41,7 @@ type Erasure struct { // NewErasure creates a new ErasureStorage. func NewErasure(ctx context.Context, dataBlocks, parityBlocks int, blockSize int64) (e Erasure, err error) { // Check the parameters for sanity now. - if dataBlocks <= 0 || parityBlocks <= 0 { + if dataBlocks <= 0 || parityBlocks < 0 { return e, reedsolomon.ErrInvShardNum } diff --git a/cmd/erasure-metadata.go b/cmd/erasure-metadata.go index dc58788ad..6d6c141f6 100644 --- a/cmd/erasure-metadata.go +++ b/cmd/erasure-metadata.go @@ -99,7 +99,7 @@ func (fi FileInfo) IsValid() bool { fi.Erasure.Index <= dataBlocks+parityBlocks && len(fi.Erasure.Distribution) == (dataBlocks+parityBlocks)) return ((dataBlocks >= parityBlocks) && - (dataBlocks != 0) && (parityBlocks != 0) && + (dataBlocks > 0) && (parityBlocks >= 0) && correctIndexes) } @@ -284,7 +284,7 @@ func (fi FileInfo) ObjectToPartOffset(ctx context.Context, offset int64) (partIn func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time.Time, quorum int) (FileInfo, error) { // with less quorum return error. - if quorum < 2 { + if quorum < 1 { return FileInfo{}, errErasureReadQuorum } metaHashes := make([]string, len(metaArr)) @@ -398,6 +398,10 @@ func writeUniqueFileInfo(ctx context.Context, disks []StorageAPI, bucket, prefix // readQuorum is the min required disks to read data. // writeQuorum is the min required disks to write data. func objectQuorumFromMeta(ctx context.Context, partsMetaData []FileInfo, errs []error, defaultParityCount int) (objectReadQuorum, objectWriteQuorum int, err error) { + if defaultParityCount == 0 { + return 1, 1, nil + } + // get the latest updated Metadata and a count of all the latest updated FileInfo(s) latestFileInfo, err := getLatestFileInfo(ctx, partsMetaData, errs) if err != nil { diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go index 66f06d84e..a8a7253a1 100644 --- a/cmd/erasure-object.go +++ b/cmd/erasure-object.go @@ -1327,12 +1327,17 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec func (er erasureObjects) deletePrefix(ctx context.Context, bucket, prefix string) error { disks := er.getDisks() g := errgroup.WithNErrs(len(disks)) + dirPrefix := encodeDirObject(prefix) for index := range disks { index := index g.Go(func() error { if disks[index] == nil { return nil } + // Deletes + // - The prefix and its children + // - The prefix__XLDIR__ + defer disks[index].Delete(ctx, bucket, dirPrefix, true) return disks[index].Delete(ctx, bucket, prefix, true) }, index) } diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go index adf9c9a3d..e4e02d784 100644 --- a/cmd/erasure-server-pool.go +++ b/cmd/erasure-server-pool.go @@ -61,6 +61,22 @@ func (z *erasureServerPools) SinglePool() bool { // Initialize new pool of erasure sets. func newErasureServerPools(ctx context.Context, endpointServerPools EndpointServerPools) (ObjectLayer, error) { + if endpointServerPools.NEndpoints() == 1 { + ep := endpointServerPools[0] + storageDisks, format, err := waitForFormatErasure(true, ep.Endpoints, 1, ep.SetCount, ep.DrivesPerSet, "", "") + if err != nil { + return nil, err + } + + objLayer, err := newErasureSingle(ctx, storageDisks[0], format) + if err != nil { + return nil, err + } + + globalLocalDrives = storageDisks + return objLayer, nil + } + var ( deploymentID string distributionAlgo string @@ -320,7 +336,7 @@ func (z *erasureServerPools) getServerPoolsAvailableSpace(ctx context.Context, b nSets[index] = pool.setCount g.Go(func() error { // Get the set where it would be placed. - storageInfos[index] = getDiskInfos(ctx, pool.getHashedSet(object).getDisks()) + storageInfos[index] = getDiskInfos(ctx, pool.getHashedSet(object).getDisks()...) return nil }, index) } @@ -933,7 +949,7 @@ func (z *erasureServerPools) PutObject(ctx context.Context, bucket string, objec object = encodeDirObject(object) if z.SinglePool() { - if !isMinioMetaBucketName(bucket) && !hasSpaceFor(getDiskInfos(ctx, z.serverPools[0].getHashedSet(object).getDisks()), data.Size()) { + if !isMinioMetaBucketName(bucket) && !hasSpaceFor(getDiskInfos(ctx, z.serverPools[0].getHashedSet(object).getDisks()...), data.Size()) { return ObjectInfo{}, toObjectErr(errDiskFull) } return z.serverPools[0].PutObject(ctx, bucket, object, data, opts) @@ -1325,7 +1341,7 @@ func (z *erasureServerPools) NewMultipartUpload(ctx context.Context, bucket, obj } if z.SinglePool() { - if !isMinioMetaBucketName(bucket) && !hasSpaceFor(getDiskInfos(ctx, z.serverPools[0].getHashedSet(object).getDisks()), -1) { + if !isMinioMetaBucketName(bucket) && !hasSpaceFor(getDiskInfos(ctx, z.serverPools[0].getHashedSet(object).getDisks()...), -1) { return "", toObjectErr(errDiskFull) } return z.serverPools[0].NewMultipartUpload(ctx, bucket, object, opts) diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go index bd0d013fd..04976f3df 100644 --- a/cmd/erasure-sets.go +++ b/cmd/erasure-sets.go @@ -1252,7 +1252,7 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H defer func(storageDisks []StorageAPI) { if err != nil { - closeStorageDisks(storageDisks) + closeStorageDisks(storageDisks...) } }(storageDisks) diff --git a/cmd/erasure-single-drive.go b/cmd/erasure-single-drive.go new file mode 100644 index 000000000..0f165a5c1 --- /dev/null +++ b/cmd/erasure-single-drive.go @@ -0,0 +1,3289 @@ +// Copyright (c) 2015-2022 MinIO, Inc. +// +// This file is part of MinIO Object Storage stack +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package cmd + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "math/rand" + "net/http" + "os" + "path" + "sort" + "strconv" + "strings" + "sync" + "time" + + "github.com/dustin/go-humanize" + "github.com/klauspost/readahead" + "github.com/minio/madmin-go" + "github.com/minio/minio-go/v7/pkg/s3utils" + "github.com/minio/minio-go/v7/pkg/set" + "github.com/minio/minio-go/v7/pkg/tags" + "github.com/minio/minio/internal/bpool" + "github.com/minio/minio/internal/bucket/lifecycle" + "github.com/minio/minio/internal/bucket/object/lock" + "github.com/minio/minio/internal/bucket/replication" + "github.com/minio/minio/internal/event" + "github.com/minio/minio/internal/hash" + xhttp "github.com/minio/minio/internal/http" + xioutil "github.com/minio/minio/internal/ioutil" + "github.com/minio/minio/internal/logger" + "github.com/minio/minio/internal/sync/errgroup" + "github.com/minio/pkg/mimedb" +) + +// erasureSingle - Implements single drive XL layer +type erasureSingle struct { + GatewayUnsupported + + disk StorageAPI + + endpoint Endpoint + + // Locker mutex map. + nsMutex *nsLockMap + + // Byte pools used for temporary i/o buffers. + bp *bpool.BytePoolCap + + deletedCleanupSleeper *dynamicSleeper + + // Shut down async operations + shutdown context.CancelFunc + + format *formatErasureV3 +} + +// Initialize new set of erasure coded sets. +func newErasureSingle(ctx context.Context, storageDisk StorageAPI, format *formatErasureV3) (ObjectLayer, error) { + // Number of buffers, max 2GB + n := (2 * humanize.GiByte) / (blockSizeV2 * 2) + + // Initialize byte pool once for all sets, bpool size is set to + // setCount * setDriveCount with each memory upto blockSizeV2. + bp := bpool.NewBytePoolCap(n, blockSizeV2, blockSizeV2*2) + + // Initialize the erasure sets instance. + s := &erasureSingle{ + disk: storageDisk, + endpoint: storageDisk.Endpoint(), + format: format, + nsMutex: newNSLock(false), + bp: bp, + deletedCleanupSleeper: newDynamicSleeper(10, 2*time.Second), + } + + // start cleanup stale uploads go-routine. + go s.cleanupStaleUploads(ctx) + + // start cleanup of deleted objects. + go s.cleanupDeletedObjects(ctx) + + ctx, s.shutdown = context.WithCancel(ctx) + go intDataUpdateTracker.start(ctx, s.endpoint.Path) + + return s, nil +} + +// List all buckets from one of the set, we are not doing merge +// sort here just for simplification. As per design it is assumed +// that all buckets are present on all sets. +func (es *erasureSingle) ListBuckets(ctx context.Context) (buckets []BucketInfo, err error) { + var listBuckets []BucketInfo + healBuckets := map[string]VolInfo{} + // lists all unique buckets across drives. + if err := listAllBuckets(ctx, []StorageAPI{es.disk}, healBuckets, 0); err != nil { + return nil, err + } + + for _, v := range healBuckets { + listBuckets = append(listBuckets, BucketInfo(v)) + } + + sort.Slice(listBuckets, func(i, j int) bool { + return listBuckets[i].Name < listBuckets[j].Name + }) + + for i := range listBuckets { + meta, err := globalBucketMetadataSys.Get(listBuckets[i].Name) + if err == nil { + listBuckets[i].Created = meta.Created + } + } + + return listBuckets, nil +} + +func (es *erasureSingle) cleanupStaleUploads(ctx context.Context) { + timer := time.NewTimer(globalAPIConfig.getStaleUploadsCleanupInterval()) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-timer.C: + es.cleanupStaleUploadsOnDisk(ctx, es.disk, globalAPIConfig.getStaleUploadsExpiry()) + + // Reset for the next interval + timer.Reset(globalAPIConfig.getStaleUploadsCleanupInterval()) + } + } +} + +// cleanup ".trash/" folder every 5m minutes with sufficient sleep cycles, between each +// deletes a dynamic sleeper is used with a factor of 10 ratio with max delay between +// deletes to be 2 seconds. +func (es *erasureSingle) cleanupDeletedObjects(ctx context.Context) { + timer := time.NewTimer(globalAPIConfig.getDeleteCleanupInterval()) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-timer.C: + es.cleanupDeletedObjectsInner(ctx) + // Reset for the next interval + timer.Reset(globalAPIConfig.getDeleteCleanupInterval()) + } + } +} + +// NewNSLock - initialize a new namespace RWLocker instance. +func (es *erasureSingle) NewNSLock(bucket string, objects ...string) RWLocker { + return es.nsMutex.NewNSLock(nil, bucket, objects...) +} + +// Shutdown function for object storage interface. +func (es *erasureSingle) Shutdown(ctx context.Context) error { + defer es.shutdown() + + // Add any object layer shutdown activities here. + closeStorageDisks(es.disk) + return nil +} + +func (es *erasureSingle) BackendInfo() (b madmin.BackendInfo) { + b.Type = madmin.Erasure + + scParity := 0 + rrSCParity := 0 + + // Data blocks can vary per pool, but parity is same. + for _, setDriveCount := range es.SetDriveCounts() { + b.StandardSCData = append(b.StandardSCData, setDriveCount-scParity) + b.RRSCData = append(b.RRSCData, setDriveCount-rrSCParity) + } + + b.StandardSCParity = scParity + b.RRSCParity = rrSCParity + return +} + +// StorageInfo - returns underlying storage statistics. +func (es *erasureSingle) StorageInfo(ctx context.Context) (StorageInfo, []error) { + disks := []StorageAPI{es.disk} + endpoints := []Endpoint{es.endpoint} + + storageInfo, errs := getStorageInfo(disks, endpoints) + storageInfo.Backend = es.BackendInfo() + return storageInfo, errs +} + +// LocalStorageInfo - returns underlying local storage statistics. +func (es *erasureSingle) LocalStorageInfo(ctx context.Context) (StorageInfo, []error) { + disks := []StorageAPI{es.disk} + endpoints := []Endpoint{es.endpoint} + + var localDisks []StorageAPI + var localEndpoints []Endpoint + + for i, endpoint := range endpoints { + if endpoint.IsLocal { + localDisks = append(localDisks, disks[i]) + localEndpoints = append(localEndpoints, endpoint) + } + } + + return getStorageInfo(localDisks, localEndpoints) +} + +// Clean-up previously deleted objects. from .minio.sys/tmp/.trash/ +func (es *erasureSingle) cleanupDeletedObjectsInner(ctx context.Context) { + diskPath := es.disk.Endpoint().Path + readDirFn(pathJoin(diskPath, minioMetaTmpDeletedBucket), func(ddir string, typ os.FileMode) error { + wait := es.deletedCleanupSleeper.Timer(ctx) + removeAll(pathJoin(diskPath, minioMetaTmpDeletedBucket, ddir)) + wait() + return nil + }) +} + +func (es *erasureSingle) renameAll(ctx context.Context, bucket, prefix string) { + if es.disk != nil { + es.disk.RenameFile(ctx, bucket, prefix, minioMetaTmpDeletedBucket, mustGetUUID()) + } +} + +type renameAllStorager interface { + renameAll(ctx context.Context, bucket, prefix string) +} + +// Bucket operations +// MakeBucket - make a bucket. +func (es *erasureSingle) MakeBucketWithLocation(ctx context.Context, bucket string, opts BucketOptions) error { + defer NSUpdated(bucket, slashSeparator) + + // Lock the bucket name before creating. + lk := es.NewNSLock(minioMetaTmpBucket, bucket+".lck") + lkctx, err := lk.GetLock(ctx, globalOperationTimeout) + if err != nil { + return err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + + // Verify if bucket is valid. + if !isMinioMetaBucketName(bucket) { + if err := s3utils.CheckValidBucketNameStrict(bucket); err != nil { + return BucketNameInvalid{Bucket: bucket} + } + } + + if err := es.disk.MakeVol(ctx, bucket); err != nil { + if opts.ForceCreate && errors.Is(err, errVolumeExists) { + // No need to return error when force create was + // requested. + return nil + } + if !errors.Is(err, errVolumeExists) { + logger.LogIf(ctx, err) + } + return toObjectErr(err, bucket) + } + + // If it doesn't exist we get a new, so ignore errors + meta := newBucketMetadata(bucket) + if opts.LockEnabled { + meta.VersioningConfigXML = enabledBucketVersioningConfig + meta.ObjectLockConfigXML = enabledBucketObjectLockConfig + } + + if opts.VersioningEnabled { + meta.VersioningConfigXML = enabledBucketVersioningConfig + } + + if err := meta.Save(context.Background(), es); err != nil { + return toObjectErr(err, bucket) + } + + globalBucketMetadataSys.Set(bucket, meta) + + return nil +} + +// GetBucketInfo - returns BucketInfo for a bucket. +func (es *erasureSingle) GetBucketInfo(ctx context.Context, bucket string) (bi BucketInfo, e error) { + volInfo, err := es.disk.StatVol(ctx, bucket) + if err != nil { + return bi, toObjectErr(err, bucket) + } + return BucketInfo(volInfo), nil +} + +// DeleteBucket - deletes a bucket. +func (es *erasureSingle) DeleteBucket(ctx context.Context, bucket string, opts DeleteBucketOptions) error { + // Collect if all disks report volume not found. + defer NSUpdated(bucket, slashSeparator) + + err := es.disk.DeleteVol(ctx, bucket, opts.Force) + return toObjectErr(err, bucket) +} + +// IsNotificationSupported returns whether bucket notification is applicable for this layer. +func (es *erasureSingle) IsNotificationSupported() bool { + return true +} + +// IsListenSupported returns whether listen bucket notification is applicable for this layer. +func (es *erasureSingle) IsListenSupported() bool { + return true +} + +// IsEncryptionSupported returns whether server side encryption is implemented for this layer. +func (es *erasureSingle) IsEncryptionSupported() bool { + return true +} + +// IsCompressionSupported returns whether compression is applicable for this layer. +func (es *erasureSingle) IsCompressionSupported() bool { + return true +} + +// IsTaggingSupported indicates whethes *erasureSingle implements tagging support. +func (es *erasureSingle) IsTaggingSupported() bool { + return true +} + +// Object Operations + +// CopyObject - copy object source object to destination object. +// if source object and destination object are same we only +// update metadata. +func (es *erasureSingle) CopyObject(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (oi ObjectInfo, err error) { + defer NSUpdated(dstBucket, dstObject) + + srcObject = encodeDirObject(srcObject) + dstObject = encodeDirObject(dstObject) + + cpSrcDstSame := isStringEqual(pathJoin(srcBucket, srcObject), pathJoin(dstBucket, dstObject)) + + if !dstOpts.NoLock { + ns := es.NewNSLock(dstBucket, dstObject) + lkctx, err := ns.GetLock(ctx, globalOperationTimeout) + if err != nil { + return ObjectInfo{}, err + } + ctx = lkctx.Context() + defer ns.Unlock(lkctx.Cancel) + dstOpts.NoLock = true + } + + if cpSrcDstSame && srcInfo.metadataOnly { + // Read metadata associated with the object from all disks. + storageDisks := []StorageAPI{es.disk} + + var metaArr []FileInfo + var errs []error + + // Read metadata associated with the object from all disks. + if srcOpts.VersionID != "" { + metaArr, errs = readAllFileInfo(ctx, storageDisks, srcBucket, srcObject, srcOpts.VersionID, true) + } else { + metaArr, errs = readAllXL(ctx, storageDisks, srcBucket, srcObject, true) + } + + readQuorum, writeQuorum, err := objectQuorumFromMeta(ctx, metaArr, errs, 0) + if err != nil { + return ObjectInfo{}, toObjectErr(err, srcBucket, srcObject) + } + + // List all online disks. + onlineDisks, modTime := listOnlineDisks(storageDisks, metaArr, errs) + + // Pick latest valid metadata. + fi, err := pickValidFileInfo(ctx, metaArr, modTime, readQuorum) + if err != nil { + return oi, toObjectErr(err, srcBucket, srcObject) + } + if fi.Deleted { + if srcOpts.VersionID == "" { + return oi, toObjectErr(errFileNotFound, srcBucket, srcObject) + } + return fi.ToObjectInfo(srcBucket, srcObject), toObjectErr(errMethodNotAllowed, srcBucket, srcObject) + } + + filterOnlineDisksInplace(fi, metaArr, onlineDisks) + + versionID := srcInfo.VersionID + if srcInfo.versionOnly { + versionID = dstOpts.VersionID + // preserve destination versionId if specified. + if versionID == "" { + versionID = mustGetUUID() + fi.IsLatest = true // we are creating a new version so this is latest. + } + modTime = UTCNow() + } + + // If the data is not inlined, we may end up incorrectly + // inlining the data here, that leads to an inconsistent + // situation where some objects are were not inlined + // were now inlined, make sure to `nil` the Data such + // that xl.meta is written as expected. + if !fi.InlineData() { + fi.Data = nil + } + + fi.VersionID = versionID // set any new versionID we might have created + fi.ModTime = modTime // set modTime for the new versionID + if !dstOpts.MTime.IsZero() { + modTime = dstOpts.MTime + fi.ModTime = dstOpts.MTime + } + fi.Metadata = srcInfo.UserDefined + srcInfo.UserDefined["etag"] = srcInfo.ETag + + // Update `xl.meta` content on each disks. + for index := range metaArr { + if metaArr[index].IsValid() { + metaArr[index].ModTime = modTime + metaArr[index].VersionID = versionID + metaArr[index].Metadata = srcInfo.UserDefined + if !metaArr[index].InlineData() { + // If the data is not inlined, we may end up incorrectly + // inlining the data here, that leads to an inconsistent + // situation where some objects are were not inlined + // were now inlined, make sure to `nil` the Data such + // that xl.meta is written as expected. + metaArr[index].Data = nil + } + } + } + + // Write unique `xl.meta` for each disk. + if _, err = writeUniqueFileInfo(ctx, onlineDisks, srcBucket, srcObject, metaArr, writeQuorum); err != nil { + return oi, toObjectErr(err, srcBucket, srcObject) + } + + return fi.ToObjectInfo(srcBucket, srcObject), nil + } + + putOpts := ObjectOptions{ + ServerSideEncryption: dstOpts.ServerSideEncryption, + UserDefined: srcInfo.UserDefined, + Versioned: dstOpts.Versioned, + VersionID: dstOpts.VersionID, + MTime: dstOpts.MTime, + NoLock: true, + } + + return es.PutObject(ctx, dstBucket, dstObject, srcInfo.PutObjReader, putOpts) +} + +// GetObjectNInfo - returns object info and an object +// Read(Closer). When err != nil, the returned reader is always nil. +func (es *erasureSingle) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, lockType LockType, opts ObjectOptions) (gr *GetObjectReader, err error) { + if err = checkGetObjArgs(ctx, bucket, object); err != nil { + return nil, err + } + + object = encodeDirObject(object) + + var unlockOnDefer bool + nsUnlocker := func() {} + defer func() { + if unlockOnDefer { + nsUnlocker() + } + }() + + // Acquire lock + if lockType != noLock { + lock := es.NewNSLock(bucket, object) + switch lockType { + case writeLock: + lkctx, err := lock.GetLock(ctx, globalOperationTimeout) + if err != nil { + return nil, err + } + ctx = lkctx.Context() + nsUnlocker = func() { lock.Unlock(lkctx.Cancel) } + case readLock: + lkctx, err := lock.GetRLock(ctx, globalOperationTimeout) + if err != nil { + return nil, err + } + ctx = lkctx.Context() + nsUnlocker = func() { lock.RUnlock(lkctx.Cancel) } + } + unlockOnDefer = true + } + + fi, metaArr, onlineDisks, err := es.getObjectFileInfo(ctx, bucket, object, opts, true) + if err != nil { + return nil, toObjectErr(err, bucket, object) + } + + objInfo := fi.ToObjectInfo(bucket, object) + if objInfo.DeleteMarker { + if opts.VersionID == "" { + return &GetObjectReader{ + ObjInfo: objInfo, + }, toObjectErr(errFileNotFound, bucket, object) + } + // Make sure to return object info to provide extra information. + return &GetObjectReader{ + ObjInfo: objInfo, + }, toObjectErr(errMethodNotAllowed, bucket, object) + } + if objInfo.IsRemote() { + gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, h, objInfo, opts) + if err != nil { + return nil, err + } + unlockOnDefer = false + return gr.WithCleanupFuncs(nsUnlocker), nil + } + + fn, off, length, err := NewGetObjectReader(rs, objInfo, opts) + if err != nil { + return nil, err + } + unlockOnDefer = false + + pr, pw := xioutil.WaitPipe() + go func() { + pw.CloseWithError(es.getObjectWithFileInfo(ctx, bucket, object, off, length, pw, fi, metaArr, onlineDisks)) + }() + + // Cleanup function to cause the go routine above to exit, in + // case of incomplete read. + pipeCloser := func() { + pr.CloseWithError(nil) + } + + return fn(pr, h, pipeCloser, nsUnlocker) +} + +func (es *erasureSingle) getObjectWithFileInfo(ctx context.Context, bucket, object string, startOffset int64, length int64, writer io.Writer, fi FileInfo, metaArr []FileInfo, onlineDisks []StorageAPI) error { + // Reorder online disks based on erasure distribution ordes. + // Reorder parts metadata based on erasure distribution ordes. + onlineDisks, metaArr = shuffleDisksAndPartsMetadataByIndex(onlineDisks, metaArr, fi) + + // For negative length read everything. + if length < 0 { + length = fi.Size - startOffset + } + + // Reply back invalid range if the input offset and length fall out of range. + if startOffset > fi.Size || startOffset+length > fi.Size { + logger.LogIf(ctx, InvalidRange{startOffset, length, fi.Size}, logger.Application) + return InvalidRange{startOffset, length, fi.Size} + } + + // Get start part index and offset. + partIndex, partOffset, err := fi.ObjectToPartOffset(ctx, startOffset) + if err != nil { + return InvalidRange{startOffset, length, fi.Size} + } + + // Calculate endOffset according to length + endOffset := startOffset + if length > 0 { + endOffset += length - 1 + } + + // Get last part index to read given length. + lastPartIndex, _, err := fi.ObjectToPartOffset(ctx, endOffset) + if err != nil { + return InvalidRange{startOffset, length, fi.Size} + } + + var totalBytesRead int64 + erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize) + if err != nil { + return toObjectErr(err, bucket, object) + } + + // once we have obtained a common FileInfo i.e latest, we should stick + // to single dataDir to read the content to avoid reading from some other + // dataDir that has stale FileInfo{} to ensure that we fail appropriately + // during reads and expect the same dataDir everywhere. + dataDir := fi.DataDir + for ; partIndex <= lastPartIndex; partIndex++ { + if length == totalBytesRead { + break + } + + partNumber := fi.Parts[partIndex].Number + + // Save the current part name and size. + partSize := fi.Parts[partIndex].Size + + partLength := partSize - partOffset + // partLength should be adjusted so that we don't write more data than what was requested. + if partLength > (length - totalBytesRead) { + partLength = length - totalBytesRead + } + + tillOffset := erasure.ShardFileOffset(partOffset, partLength, partSize) + // Get the checksums of the current part. + readers := make([]io.ReaderAt, len(onlineDisks)) + prefer := make([]bool, len(onlineDisks)) + for index, disk := range onlineDisks { + if disk == OfflineDisk { + continue + } + if !metaArr[index].IsValid() { + continue + } + checksumInfo := metaArr[index].Erasure.GetChecksumInfo(partNumber) + partPath := pathJoin(object, dataDir, fmt.Sprintf("part.%d", partNumber)) + readers[index] = newBitrotReader(disk, metaArr[index].Data, bucket, partPath, tillOffset, + checksumInfo.Algorithm, checksumInfo.Hash, erasure.ShardSize()) + + // Prefer local disks + prefer[index] = disk.Hostname() == "" + } + + _, err = erasure.Decode(ctx, writer, readers, partOffset, partLength, partSize, prefer) + // Note: we should not be defer'ing the following closeBitrotReaders() call as + // we are inside a for loop i.e if we use defer, we would accumulate a lot of open files by the time + // we return from this function. + closeBitrotReaders(readers) + if err != nil { + return toObjectErr(err, bucket, object) + } + for i, r := range readers { + if r == nil { + onlineDisks[i] = OfflineDisk + } + } + // Track total bytes read from disk and written to the client. + totalBytesRead += partLength + // partOffset will be valid only for the first part, hence reset it to 0 for + // the remaining parts. + partOffset = 0 + } // End of read all parts loop. + // Return success. + return nil +} + +// GetObjectInfo - reads object metadata and replies back ObjectInfo. +func (es *erasureSingle) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (info ObjectInfo, err error) { + if err = checkGetObjArgs(ctx, bucket, object); err != nil { + return info, err + } + + object = encodeDirObject(object) + if !opts.NoLock { + // Lock the object before reading. + lk := es.NewNSLock(bucket, object) + lkctx, err := lk.GetRLock(ctx, globalOperationTimeout) + if err != nil { + return ObjectInfo{}, err + } + ctx = lkctx.Context() + defer lk.RUnlock(lkctx.Cancel) + } + + return es.getObjectInfo(ctx, bucket, object, opts) +} + +func (es *erasureSingle) getObjectFileInfo(ctx context.Context, bucket, object string, opts ObjectOptions, readData bool) (fi FileInfo, metaArr []FileInfo, onlineDisks []StorageAPI, err error) { + disks := []StorageAPI{es.disk} + + var errs []error + + // Read metadata associated with the object from all disks. + metaArr, errs = readAllFileInfo(ctx, disks, bucket, object, opts.VersionID, readData) + readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, 0) + if err != nil { + return fi, nil, nil, toObjectErr(err, bucket, object) + } + if reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum); reducedErr != nil { + return fi, nil, nil, toObjectErr(reducedErr, bucket, object) + } + + // List all online disks. + onlineDisks, modTime := listOnlineDisks(disks, metaArr, errs) + + // Pick latest valid metadata. + fi, err = pickValidFileInfo(ctx, metaArr, modTime, readQuorum) + if err != nil { + return fi, nil, nil, err + } + + filterOnlineDisksInplace(fi, metaArr, onlineDisks) + return fi, metaArr, onlineDisks, nil +} + +// getObjectInfo - wrapper for reading object metadata and constructs ObjectInfo. +func (es *erasureSingle) getObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) { + fi, _, _, err := es.getObjectFileInfo(ctx, bucket, object, opts, false) + if err != nil { + return objInfo, toObjectErr(err, bucket, object) + } + objInfo = fi.ToObjectInfo(bucket, object) + if fi.Deleted { + if opts.VersionID == "" || opts.DeleteMarker { + return objInfo, toObjectErr(errFileNotFound, bucket, object) + } + // Make sure to return object info to provide extra information. + return objInfo, toObjectErr(errMethodNotAllowed, bucket, object) + } + + return objInfo, nil +} + +// getObjectInfoAndQuroum - wrapper for reading object metadata and constructs ObjectInfo, additionally returns write quorum for the object. +func (es *erasureSingle) getObjectInfoAndQuorum(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, wquorum int, err error) { + fi, _, _, err := es.getObjectFileInfo(ctx, bucket, object, opts, false) + if err != nil { + return objInfo, 1, toObjectErr(err, bucket, object) + } + + wquorum = fi.Erasure.DataBlocks + if fi.Erasure.DataBlocks == fi.Erasure.ParityBlocks { + wquorum++ + } + + objInfo = fi.ToObjectInfo(bucket, object) + if !fi.VersionPurgeStatus().Empty() && opts.VersionID != "" { + // Make sure to return object info to provide extra information. + return objInfo, wquorum, toObjectErr(errMethodNotAllowed, bucket, object) + } + + if fi.Deleted { + if opts.VersionID == "" || opts.DeleteMarker { + return objInfo, wquorum, toObjectErr(errFileNotFound, bucket, object) + } + // Make sure to return object info to provide extra information. + return objInfo, wquorum, toObjectErr(errMethodNotAllowed, bucket, object) + } + + return objInfo, wquorum, nil +} + +func (es *erasureSingle) putMetacacheObject(ctx context.Context, key string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { + data := r.Reader + + // No metadata is set, allocate a new one. + if opts.UserDefined == nil { + opts.UserDefined = make(map[string]string) + } + + storageDisks := []StorageAPI{es.disk} + // Get parity and data drive count based on storage class metadata + parityDrives := 0 + dataDrives := len(storageDisks) - parityDrives + + // we now know the number of blocks this object needs for data and parity. + // writeQuorum is dataBlocks + 1 + writeQuorum := dataDrives + if dataDrives == parityDrives { + writeQuorum++ + } + + // Validate input data size and it can never be less than zero. + if data.Size() < -1 { + logger.LogIf(ctx, errInvalidArgument, logger.Application) + return ObjectInfo{}, toObjectErr(errInvalidArgument) + } + + // Initialize parts metadata + partsMetadata := make([]FileInfo, len(storageDisks)) + + fi := newFileInfo(pathJoin(minioMetaBucket, key), dataDrives, parityDrives) + fi.DataDir = mustGetUUID() + + // Initialize erasure metadata. + for index := range partsMetadata { + partsMetadata[index] = fi + } + + // Order disks according to erasure distribution + var onlineDisks []StorageAPI + onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi) + + erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize) + if err != nil { + return ObjectInfo{}, toObjectErr(err, minioMetaBucket, key) + } + + // Fetch buffer for I/O, returns from the pool if not allocates a new one and returns. + var buffer []byte + switch size := data.Size(); { + case size == 0: + buffer = make([]byte, 1) // Allocate atleast a byte to reach EOF + case size >= fi.Erasure.BlockSize: + buffer = es.bp.Get() + defer es.bp.Put(buffer) + case size < fi.Erasure.BlockSize: + // No need to allocate fully blockSizeV1 buffer if the incoming data is smaller. + buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1)) + } + + if len(buffer) > int(fi.Erasure.BlockSize) { + buffer = buffer[:fi.Erasure.BlockSize] + } + + shardFileSize := erasure.ShardFileSize(data.Size()) + writers := make([]io.Writer, len(onlineDisks)) + inlineBuffers := make([]*bytes.Buffer, len(onlineDisks)) + for i, disk := range onlineDisks { + if disk == nil { + continue + } + if disk.IsOnline() { + inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, shardFileSize)) + writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize()) + } + } + + n, erasureErr := erasure.Encode(ctx, data, writers, buffer, writeQuorum) + closeBitrotWriters(writers) + if erasureErr != nil { + return ObjectInfo{}, toObjectErr(erasureErr, minioMetaBucket, key) + } + + // Should return IncompleteBody{} error when reader has fewer bytes + // than specified in request header. + if n < data.Size() { + return ObjectInfo{}, IncompleteBody{Bucket: minioMetaBucket, Object: key} + } + + for i, w := range writers { + if w == nil { + // Make sure to avoid writing to disks which we couldn't complete in erasure.Encode() + onlineDisks[i] = nil + continue + } + partsMetadata[i].Data = inlineBuffers[i].Bytes() + partsMetadata[i].AddObjectPart(1, "", n, data.ActualSize()) + partsMetadata[i].Erasure.AddChecksumInfo(ChecksumInfo{ + PartNumber: 1, + Algorithm: DefaultBitrotAlgorithm, + Hash: bitrotWriterSum(w), + }) + } + + modTime := UTCNow() + + // Fill all the necessary metadata. + // Update `xl.meta` content on each disks. + for index := range partsMetadata { + partsMetadata[index].Size = n + partsMetadata[index].Fresh = true + partsMetadata[index].ModTime = modTime + partsMetadata[index].Metadata = opts.UserDefined + } + + // Set an additional header when data is inlined. + for index := range partsMetadata { + partsMetadata[index].SetInlineData() + } + + for i := 0; i < len(onlineDisks); i++ { + if onlineDisks[i] != nil && onlineDisks[i].IsOnline() { + // Object info is the same in all disks, so we can pick + // the first meta from online disk + fi = partsMetadata[i] + break + } + } + + if _, err = writeUniqueFileInfo(ctx, onlineDisks, minioMetaBucket, key, partsMetadata, writeQuorum); err != nil { + return ObjectInfo{}, toObjectErr(err, minioMetaBucket, key) + } + + return fi.ToObjectInfo(minioMetaBucket, key), nil +} + +// PutObject - creates an object upon reading from the input stream +// until EOF, erasure codes the data across all disk and additionally +// writes `xl.meta` which carries the necessary metadata for future +// object operations. +func (es *erasureSingle) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { + // Validate put object input args. + if err := checkPutObjectArgs(ctx, bucket, object, es); err != nil { + return ObjectInfo{}, err + } + + object = encodeDirObject(object) + + if !isMinioMetaBucketName(bucket) && !hasSpaceFor(getDiskInfos(ctx, es.disk), data.Size()) { + return ObjectInfo{}, toObjectErr(errDiskFull) + } + + return es.putObject(ctx, bucket, object, data, opts) +} + +// putObject wrapper for erasureObjects PutObject +func (es *erasureSingle) putObject(ctx context.Context, bucket string, object string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { + data := r.Reader + + // No metadata is set, allocate a new one. + if opts.UserDefined == nil { + opts.UserDefined = make(map[string]string) + } + + storageDisks := []StorageAPI{es.disk} + parityDrives := 0 + dataDrives := len(storageDisks) - parityDrives + + // we now know the number of blocks this object needs for data and parity. + // writeQuorum is dataBlocks + 1 + writeQuorum := dataDrives + if dataDrives == parityDrives { + writeQuorum++ + } + + // Validate input data size and it can never be less than zero. + if data.Size() < -1 { + logger.LogIf(ctx, errInvalidArgument, logger.Application) + return ObjectInfo{}, toObjectErr(errInvalidArgument) + } + + // Initialize parts metadata + partsMetadata := make([]FileInfo, len(storageDisks)) + + fi := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives) + fi.VersionID = opts.VersionID + if opts.Versioned && fi.VersionID == "" { + fi.VersionID = mustGetUUID() + } + + fi.DataDir = mustGetUUID() + uniqueID := mustGetUUID() + tempObj := uniqueID + + // Initialize erasure metadata. + for index := range partsMetadata { + partsMetadata[index] = fi + } + + // Order disks according to erasure distribution + var onlineDisks []StorageAPI + onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi) + + erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize) + if err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + // Fetch buffer for I/O, returns from the pool if not allocates a new one and returns. + var buffer []byte + switch size := data.Size(); { + case size == 0: + buffer = make([]byte, 1) // Allocate atleast a byte to reach EOF + case size == -1: + if size := data.ActualSize(); size > 0 && size < fi.Erasure.BlockSize { + buffer = make([]byte, data.ActualSize()+256, data.ActualSize()*2+512) + } else { + buffer = es.bp.Get() + defer es.bp.Put(buffer) + } + case size >= fi.Erasure.BlockSize: + buffer = es.bp.Get() + defer es.bp.Put(buffer) + case size < fi.Erasure.BlockSize: + // No need to allocate fully blockSizeV1 buffer if the incoming data is smaller. + buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1)) + } + + if len(buffer) > int(fi.Erasure.BlockSize) { + buffer = buffer[:fi.Erasure.BlockSize] + } + + partName := "part.1" + tempErasureObj := pathJoin(uniqueID, fi.DataDir, partName) + + // Delete temporary object in the event of failure. + // If PutObject succeeded there would be no temporary + // object to delete. + var online int + defer func() { + if online != len(onlineDisks) { + es.disk.RenameFile(context.Background(), minioMetaTmpBucket, tempObj, minioMetaTmpDeletedBucket, mustGetUUID()) + } + }() + + shardFileSize := erasure.ShardFileSize(data.Size()) + writers := make([]io.Writer, len(onlineDisks)) + var inlineBuffers []*bytes.Buffer + if shardFileSize >= 0 { + if !opts.Versioned && shardFileSize < smallFileThreshold { + inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) + } else if shardFileSize < smallFileThreshold/8 { + inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) + } + } else { + // If compressed, use actual size to determine. + if sz := erasure.ShardFileSize(data.ActualSize()); sz > 0 { + if !opts.Versioned && sz < smallFileThreshold { + inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) + } else if sz < smallFileThreshold/8 { + inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) + } + } + } + for i, disk := range onlineDisks { + if disk == nil { + continue + } + + if !disk.IsOnline() { + continue + } + + if len(inlineBuffers) > 0 { + sz := shardFileSize + if sz < 0 { + sz = data.ActualSize() + } + inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, sz)) + writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize()) + continue + } + + writers[i] = newBitrotWriter(disk, minioMetaTmpBucket, tempErasureObj, shardFileSize, DefaultBitrotAlgorithm, erasure.ShardSize()) + } + + toEncode := io.Reader(data) + if data.Size() > bigFileThreshold { + // We use 2 buffers, so we always have a full buffer of input. + bufA := es.bp.Get() + bufB := es.bp.Get() + defer es.bp.Put(bufA) + defer es.bp.Put(bufB) + ra, err := readahead.NewReaderBuffer(data, [][]byte{bufA[:fi.Erasure.BlockSize], bufB[:fi.Erasure.BlockSize]}) + if err == nil { + toEncode = ra + defer ra.Close() + } + logger.LogIf(ctx, err) + } + n, erasureErr := erasure.Encode(ctx, toEncode, writers, buffer, writeQuorum) + closeBitrotWriters(writers) + if erasureErr != nil { + return ObjectInfo{}, toObjectErr(erasureErr, minioMetaTmpBucket, tempErasureObj) + } + + // Should return IncompleteBody{} error when reader has fewer bytes + // than specified in request header. + if n < data.Size() { + return ObjectInfo{}, IncompleteBody{Bucket: bucket, Object: object} + } + + if !opts.NoLock { + lk := es.NewNSLock(bucket, object) + lkctx, err := lk.GetLock(ctx, globalOperationTimeout) + if err != nil { + return ObjectInfo{}, err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + } + + for i, w := range writers { + if w == nil { + onlineDisks[i] = nil + continue + } + if len(inlineBuffers) > 0 && inlineBuffers[i] != nil { + partsMetadata[i].Data = inlineBuffers[i].Bytes() + } else { + partsMetadata[i].Data = nil + } + partsMetadata[i].AddObjectPart(1, "", n, data.ActualSize()) + partsMetadata[i].Erasure.AddChecksumInfo(ChecksumInfo{ + PartNumber: 1, + Algorithm: DefaultBitrotAlgorithm, + Hash: bitrotWriterSum(w), + }) + } + if opts.UserDefined["etag"] == "" { + opts.UserDefined["etag"] = r.MD5CurrentHexString() + } + + // Guess content-type from the extension if possible. + if opts.UserDefined["content-type"] == "" { + opts.UserDefined["content-type"] = mimedb.TypeByExtension(path.Ext(object)) + } + + modTime := opts.MTime + if opts.MTime.IsZero() { + modTime = UTCNow() + } + + // Fill all the necessary metadata. + // Update `xl.meta` content on each disks. + for index := range partsMetadata { + partsMetadata[index].Metadata = opts.UserDefined + partsMetadata[index].Size = n + partsMetadata[index].ModTime = modTime + } + + if len(inlineBuffers) > 0 { + // Set an additional header when data is inlined. + for index := range partsMetadata { + partsMetadata[index].SetInlineData() + } + } + + // Rename the successfully written temporary object to final location. + if onlineDisks, err = renameData(ctx, onlineDisks, minioMetaTmpBucket, tempObj, partsMetadata, bucket, object, writeQuorum); err != nil { + if errors.Is(err, errFileNotFound) { + return ObjectInfo{}, toObjectErr(errErasureWriteQuorum, bucket, object) + } + logger.LogIf(ctx, err) + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + for i := 0; i < len(onlineDisks); i++ { + if onlineDisks[i] != nil && onlineDisks[i].IsOnline() { + // Object info is the same in all disks, so we can pick + // the first meta from online disk + fi = partsMetadata[i] + break + } + } + + fi.ReplicationState = opts.PutReplicationState() + online = countOnlineDisks(onlineDisks) + + // we are adding a new version to this object under the namespace lock, so this is the latest version. + fi.IsLatest = true + + return fi.ToObjectInfo(bucket, object), nil +} + +func (es *erasureSingle) deleteObjectVersion(ctx context.Context, bucket, object string, writeQuorum int, fi FileInfo, forceDelMarker bool) error { + return es.disk.DeleteVersion(ctx, bucket, object, fi, forceDelMarker) +} + +// DeleteObjects deletes objects/versions in bulk, this function will still automatically split objects list +// into smaller bulks if some object names are found to be duplicated in the delete list, splitting +// into smaller bulks will avoid holding twice the write lock of the duplicated object names. +func (es *erasureSingle) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) { + errs := make([]error, len(objects)) + dobjects := make([]DeletedObject, len(objects)) + objSets := set.NewStringSet() + for i := range errs { + objects[i].ObjectName = encodeDirObject(objects[i].ObjectName) + + errs[i] = checkDelObjArgs(ctx, bucket, objects[i].ObjectName) + objSets.Add(objects[i].ObjectName) + } + + // Acquire a bulk write lock across 'objects' + multiDeleteLock := es.NewNSLock(bucket, objSets.ToSlice()...) + lkctx, err := multiDeleteLock.GetLock(ctx, globalOperationTimeout) + if err != nil { + for i := range errs { + errs[i] = err + } + return dobjects, errs + } + ctx = lkctx.Context() + defer multiDeleteLock.Unlock(lkctx.Cancel) + + writeQuorums := make([]int, len(objects)) + storageDisks := []StorageAPI{es.disk} + + for i := range objects { + // Single drive write quorum is '1' + writeQuorums[i] = 1 + } + + versionsMap := make(map[string]FileInfoVersions, len(objects)) + for i := range objects { + // Construct the FileInfo data that needs to be preserved on the disk. + vr := FileInfo{ + Name: objects[i].ObjectName, + VersionID: objects[i].VersionID, + ReplicationState: objects[i].ReplicationState(), + // save the index to set correct error at this index. + Idx: i, + } + vr.SetTierFreeVersionID(mustGetUUID()) + // VersionID is not set means delete is not specific about + // any version, look for if the bucket is versioned or not. + if objects[i].VersionID == "" { + if opts.Versioned || opts.VersionSuspended { + // Bucket is versioned and no version was explicitly + // mentioned for deletes, create a delete marker instead. + vr.ModTime = UTCNow() + vr.Deleted = true + // Versioning suspended means that we add a `null` version + // delete marker, if not add a new version for this delete + // marker. + if opts.Versioned { + vr.VersionID = mustGetUUID() + } + } + } + // De-dup same object name to collect multiple versions for same object. + v, ok := versionsMap[objects[i].ObjectName] + if ok { + v.Versions = append(v.Versions, vr) + } else { + v = FileInfoVersions{ + Name: vr.Name, + Versions: []FileInfo{vr}, + } + } + if vr.Deleted { + dobjects[i] = DeletedObject{ + DeleteMarker: vr.Deleted, + DeleteMarkerVersionID: vr.VersionID, + DeleteMarkerMTime: DeleteMarkerMTime{vr.ModTime}, + ObjectName: vr.Name, + ReplicationState: vr.ReplicationState, + } + } else { + dobjects[i] = DeletedObject{ + ObjectName: vr.Name, + VersionID: vr.VersionID, + ReplicationState: vr.ReplicationState, + } + } + versionsMap[objects[i].ObjectName] = v + } + + dedupVersions := make([]FileInfoVersions, 0, len(versionsMap)) + for _, version := range versionsMap { + dedupVersions = append(dedupVersions, version) + } + + // Initialize list of errors. + delObjErrs := make([][]error, len(storageDisks)) + + var wg sync.WaitGroup + // Remove versions in bulk for each disk + for index, disk := range storageDisks { + wg.Add(1) + go func(index int, disk StorageAPI) { + defer wg.Done() + delObjErrs[index] = make([]error, len(objects)) + if disk == nil { + for i := range objects { + delObjErrs[index][i] = errDiskNotFound + } + return + } + errs := disk.DeleteVersions(ctx, bucket, dedupVersions) + for i, err := range errs { + if err == nil { + continue + } + for _, v := range dedupVersions[i].Versions { + if err == errFileNotFound || err == errFileVersionNotFound { + if !dobjects[v.Idx].DeleteMarker { + // Not delete marker, if not found, ok. + continue + } + } + delObjErrs[index][v.Idx] = err + } + } + }(index, disk) + } + wg.Wait() + + // Reduce errors for each object + for objIndex := range objects { + diskErrs := make([]error, len(storageDisks)) + // Iterate over disks to fetch the error + // of deleting of the current object + for i := range delObjErrs { + // delObjErrs[i] is not nil when disks[i] is also not nil + if delObjErrs[i] != nil { + diskErrs[i] = delObjErrs[i][objIndex] + } + } + err := reduceWriteQuorumErrs(ctx, diskErrs, objectOpIgnoredErrs, writeQuorums[objIndex]) + if objects[objIndex].VersionID != "" { + errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName, objects[objIndex].VersionID) + } else { + errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName) + } + + defer NSUpdated(bucket, objects[objIndex].ObjectName) + } + + return dobjects, errs +} + +func (es *erasureSingle) deletePrefix(ctx context.Context, bucket, prefix string) error { + dirPrefix := encodeDirObject(prefix) + defer es.disk.Delete(ctx, bucket, dirPrefix, true) + return es.disk.Delete(ctx, bucket, prefix, true) +} + +// DeleteObject - deletes an object, this call doesn't necessary reply +// any error as it is not necessary for the handler to reply back a +// response to the client request. +func (es *erasureSingle) DeleteObject(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) { + if err = checkDelObjArgs(ctx, bucket, object); err != nil { + return objInfo, err + } + + if opts.DeletePrefix { + return ObjectInfo{}, toObjectErr(es.deletePrefix(ctx, bucket, object), bucket, object) + } + + object = encodeDirObject(object) + var lc *lifecycle.Lifecycle + var rcfg lock.Retention + if opts.Expiration.Expire { + // Check if the current bucket has a configured lifecycle policy + lc, _ = globalLifecycleSys.Get(bucket) + rcfg, _ = globalBucketObjectLockSys.Get(bucket) + } + + // expiration attempted on a bucket with no lifecycle + // rules shall be rejected. + if lc == nil && opts.Expiration.Expire { + if opts.VersionID != "" { + return objInfo, VersionNotFound{ + Bucket: bucket, + Object: object, + VersionID: opts.VersionID, + } + } + return objInfo, ObjectNotFound{ + Bucket: bucket, + Object: object, + } + } + + // Acquire a write lock before deleting the object. + lk := es.NewNSLock(bucket, object) + lkctx, err := lk.GetLock(ctx, globalDeleteOperationTimeout) + if err != nil { + return ObjectInfo{}, err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + + versionFound := true + objInfo = ObjectInfo{VersionID: opts.VersionID} // version id needed in Delete API response. + goi, writeQuorum, gerr := es.getObjectInfoAndQuorum(ctx, bucket, object, opts) + if gerr != nil && goi.Name == "" { + switch gerr.(type) { + case InsufficientReadQuorum: + return objInfo, InsufficientWriteQuorum{} + } + // For delete marker replication, versionID being replicated will not exist on disk + if opts.DeleteMarker { + versionFound = false + } else { + return objInfo, gerr + } + } + + if opts.Expiration.Expire { + action := evalActionFromLifecycle(ctx, *lc, rcfg, goi, false) + var isErr bool + switch action { + case lifecycle.NoneAction: + isErr = true + case lifecycle.TransitionAction, lifecycle.TransitionVersionAction: + isErr = true + } + if isErr { + if goi.VersionID != "" { + return goi, VersionNotFound{ + Bucket: bucket, + Object: object, + VersionID: goi.VersionID, + } + } + return goi, ObjectNotFound{ + Bucket: bucket, + Object: object, + } + } + } + + defer NSUpdated(bucket, object) + + var markDelete bool + // Determine whether to mark object deleted for replication + if goi.VersionID != "" { + markDelete = true + } + + // Default deleteMarker to true if object is under versioning + deleteMarker := opts.Versioned + + if opts.VersionID != "" { + // case where replica version needs to be deleted on target cluster + if versionFound && opts.DeleteMarkerReplicationStatus() == replication.Replica { + markDelete = false + } + if opts.VersionPurgeStatus().Empty() && opts.DeleteMarkerReplicationStatus().Empty() { + markDelete = false + } + if opts.VersionPurgeStatus() == Complete { + markDelete = false + } + + // Version is found but we do not wish to create more delete markers + // now, since VersionPurgeStatus() is already set, we can let the + // lower layers decide this. This fixes a regression that was introduced + // in PR #14555 where !VersionPurgeStatus.Empty() is automatically + // considered as Delete marker true to avoid listing such objects by + // regular ListObjects() calls. However for delete replication this + // ends up being a problem because "upon" a successful delete this + // ends up creating a new delete marker that is spurious and unnecessary. + if versionFound { + if !goi.VersionPurgeStatus.Empty() { + deleteMarker = false + } else if !goi.DeleteMarker { // implies a versioned delete of object + deleteMarker = false + } + } + } + + modTime := opts.MTime + if opts.MTime.IsZero() { + modTime = UTCNow() + } + fvID := mustGetUUID() + if markDelete { + if opts.Versioned || opts.VersionSuspended { + if !deleteMarker { + // versioning suspended means we add `null` version as + // delete marker, if its not decided already. + deleteMarker = opts.VersionSuspended && opts.VersionID == "" + } + fi := FileInfo{ + Name: object, + Deleted: deleteMarker, + MarkDeleted: markDelete, + ModTime: modTime, + ReplicationState: opts.DeleteReplication, + TransitionStatus: opts.Transition.Status, + ExpireRestored: opts.Transition.ExpireRestored, + } + fi.SetTierFreeVersionID(fvID) + if opts.Versioned { + fi.VersionID = mustGetUUID() + if opts.VersionID != "" { + fi.VersionID = opts.VersionID + } + } + // versioning suspended means we add `null` version as + // delete marker. Add delete marker, since we don't have + // any version specified explicitly. Or if a particular + // version id needs to be replicated. + if err = es.deleteObjectVersion(ctx, bucket, object, writeQuorum, fi, opts.DeleteMarker); err != nil { + return objInfo, toObjectErr(err, bucket, object) + } + return fi.ToObjectInfo(bucket, object), nil + } + } + + // Delete the object version on all disks. + dfi := FileInfo{ + Name: object, + VersionID: opts.VersionID, + MarkDeleted: markDelete, + Deleted: deleteMarker, + ModTime: modTime, + ReplicationState: opts.DeleteReplication, + TransitionStatus: opts.Transition.Status, + ExpireRestored: opts.Transition.ExpireRestored, + } + dfi.SetTierFreeVersionID(fvID) + if err = es.deleteObjectVersion(ctx, bucket, object, writeQuorum, dfi, opts.DeleteMarker); err != nil { + return objInfo, toObjectErr(err, bucket, object) + } + + return ObjectInfo{ + Bucket: bucket, + Name: object, + VersionID: opts.VersionID, + VersionPurgeStatusInternal: opts.DeleteReplication.VersionPurgeStatusInternal, + ReplicationStatusInternal: opts.DeleteReplication.ReplicationStatusInternal, + }, nil +} + +func (es *erasureSingle) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) { + if !opts.NoLock { + // Lock the object before updating metadata. + lk := es.NewNSLock(bucket, object) + lkctx, err := lk.GetLock(ctx, globalOperationTimeout) + if err != nil { + return ObjectInfo{}, err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + } + + disks := []StorageAPI{es.disk} + + var metaArr []FileInfo + var errs []error + + // Read metadata associated with the object from all disks. + metaArr, errs = readAllFileInfo(ctx, disks, bucket, object, opts.VersionID, false) + + readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, 0) + if err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + // List all online disks. + onlineDisks, modTime := listOnlineDisks(disks, metaArr, errs) + + // Pick latest valid metadata. + fi, err := pickValidFileInfo(ctx, metaArr, modTime, readQuorum) + if err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + if fi.Deleted { + return ObjectInfo{}, toObjectErr(errMethodNotAllowed, bucket, object) + } + + filterOnlineDisksInplace(fi, metaArr, onlineDisks) + + // if version-id is not specified retention is supposed to be set on the latest object. + if opts.VersionID == "" { + opts.VersionID = fi.VersionID + } + + objInfo := fi.ToObjectInfo(bucket, object) + if opts.EvalMetadataFn != nil { + if err := opts.EvalMetadataFn(objInfo); err != nil { + return ObjectInfo{}, err + } + } + for k, v := range objInfo.UserDefined { + fi.Metadata[k] = v + } + fi.ModTime = opts.MTime + fi.VersionID = opts.VersionID + + if err = es.updateObjectMeta(ctx, bucket, object, fi, onlineDisks...); err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + return fi.ToObjectInfo(bucket, object), nil +} + +// PutObjectTags - replace or add tags to an existing object +func (es *erasureSingle) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) { + // Lock the object before updating tags. + lk := es.NewNSLock(bucket, object) + lkctx, err := lk.GetLock(ctx, globalOperationTimeout) + if err != nil { + return ObjectInfo{}, err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + + disks := []StorageAPI{es.disk} + + var metaArr []FileInfo + var errs []error + + // Read metadata associated with the object from all disks. + if opts.VersionID != "" { + metaArr, errs = readAllFileInfo(ctx, disks, bucket, object, opts.VersionID, false) + } else { + metaArr, errs = readAllXL(ctx, disks, bucket, object, false) + } + + readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, 0) + if err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + // List all online disks. + onlineDisks, modTime := listOnlineDisks(disks, metaArr, errs) + + // Pick latest valid metadata. + fi, err := pickValidFileInfo(ctx, metaArr, modTime, readQuorum) + if err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + if fi.Deleted { + if opts.VersionID == "" { + return ObjectInfo{}, toObjectErr(errFileNotFound, bucket, object) + } + return ObjectInfo{}, toObjectErr(errMethodNotAllowed, bucket, object) + } + + filterOnlineDisksInplace(fi, metaArr, onlineDisks) + + fi.Metadata[xhttp.AmzObjectTagging] = tags + fi.ReplicationState = opts.PutReplicationState() + for k, v := range opts.UserDefined { + fi.Metadata[k] = v + } + + if err = es.updateObjectMeta(ctx, bucket, object, fi, onlineDisks...); err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + return fi.ToObjectInfo(bucket, object), nil +} + +// updateObjectMeta will update the metadata of a file. +func (es *erasureSingle) updateObjectMeta(ctx context.Context, bucket, object string, fi FileInfo, onlineDisks ...StorageAPI) error { + if len(fi.Metadata) == 0 { + return nil + } + + g := errgroup.WithNErrs(len(onlineDisks)) + + // Start writing `xl.meta` to all disks in parallel. + for index := range onlineDisks { + index := index + g.Go(func() error { + if onlineDisks[index] == nil { + return errDiskNotFound + } + return onlineDisks[index].UpdateMetadata(ctx, bucket, object, fi) + }, index) + } + + // Wait for all the routines. + mErrs := g.Wait() + + return reduceWriteQuorumErrs(ctx, mErrs, objectOpIgnoredErrs, 1) +} + +// DeleteObjectTags - delete object tags from an existing object +func (es *erasureSingle) DeleteObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) { + return es.PutObjectTags(ctx, bucket, object, "", opts) +} + +// GetObjectTags - get object tags from an existing object +func (es *erasureSingle) GetObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (*tags.Tags, error) { + // GetObjectInfo will return tag value as well + oi, err := es.GetObjectInfo(ctx, bucket, object, opts) + if err != nil { + return nil, err + } + + return tags.ParseObjectTags(oi.UserTags) +} + +// TransitionObject - transition object content to target tier. +func (es *erasureSingle) TransitionObject(ctx context.Context, bucket, object string, opts ObjectOptions) error { + tgtClient, err := globalTierConfigMgr.getDriver(opts.Transition.Tier) + if err != nil { + return err + } + + // Acquire write lock before starting to transition the object. + lk := es.NewNSLock(bucket, object) + lkctx, err := lk.GetLock(ctx, globalDeleteOperationTimeout) + if err != nil { + return err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + + fi, metaArr, onlineDisks, err := es.getObjectFileInfo(ctx, bucket, object, opts, true) + if err != nil { + return toObjectErr(err, bucket, object) + } + if fi.Deleted { + if opts.VersionID == "" { + return toObjectErr(errFileNotFound, bucket, object) + } + // Make sure to return object info to provide extra information. + return toObjectErr(errMethodNotAllowed, bucket, object) + } + // verify that the object queued for transition is identical to that on disk. + if !opts.MTime.Equal(fi.ModTime) || !strings.EqualFold(opts.Transition.ETag, extractETag(fi.Metadata)) { + return toObjectErr(errFileNotFound, bucket, object) + } + // if object already transitioned, return + if fi.TransitionStatus == lifecycle.TransitionComplete { + return nil + } + defer NSUpdated(bucket, object) + + destObj, err := genTransitionObjName(bucket) + if err != nil { + return err + } + + pr, pw := xioutil.WaitPipe() + go func() { + err := es.getObjectWithFileInfo(ctx, bucket, object, 0, fi.Size, pw, fi, metaArr, onlineDisks) + pw.CloseWithError(err) + }() + + var rv remoteVersionID + rv, err = tgtClient.Put(ctx, destObj, pr, fi.Size) + pr.CloseWithError(err) + if err != nil { + logger.LogIf(ctx, fmt.Errorf("Unable to transition %s/%s(%s) to %s tier: %w", bucket, object, opts.VersionID, opts.Transition.Tier, err)) + return err + } + fi.TransitionStatus = lifecycle.TransitionComplete + fi.TransitionedObjName = destObj + fi.TransitionTier = opts.Transition.Tier + fi.TransitionVersionID = string(rv) + eventName := event.ObjectTransitionComplete + + // we now know the number of blocks this object needs for data and parity. + // writeQuorum is dataBlocks + 1 + writeQuorum := fi.Erasure.DataBlocks + if fi.Erasure.DataBlocks == fi.Erasure.ParityBlocks { + writeQuorum++ + } + + if err = es.deleteObjectVersion(ctx, bucket, object, writeQuorum, fi, false); err != nil { + eventName = event.ObjectTransitionFailed + } + + objInfo := fi.ToObjectInfo(bucket, object) + sendEvent(eventArgs{ + EventName: eventName, + BucketName: bucket, + Object: objInfo, + Host: "Internal: [ILM-Transition]", + }) + auditLogLifecycle(ctx, objInfo, ILMTransition) + return err +} + +// RestoreTransitionedObject - restore transitioned object content locally on this cluster. +// This is similar to PostObjectRestore from AWS GLACIER +// storage class. When PostObjectRestore API is called, a temporary copy of the object +// is restored locally to the bucket on source cluster until the restore expiry date. +// The copy that was transitioned continues to reside in the transitioned tier. +func (es *erasureSingle) RestoreTransitionedObject(ctx context.Context, bucket, object string, opts ObjectOptions) error { + return es.restoreTransitionedObject(ctx, bucket, object, opts) +} + +// update restore status header in the metadata +func (es *erasureSingle) updateRestoreMetadata(ctx context.Context, bucket, object string, objInfo ObjectInfo, opts ObjectOptions, rerr error) error { + oi := objInfo.Clone() + oi.metadataOnly = true // Perform only metadata updates. + + if rerr == nil { + oi.UserDefined[xhttp.AmzRestore] = completedRestoreObj(opts.Transition.RestoreExpiry).String() + } else { // allow retry in the case of failure to restore + delete(oi.UserDefined, xhttp.AmzRestore) + } + if _, err := es.CopyObject(ctx, bucket, object, bucket, object, oi, ObjectOptions{ + VersionID: oi.VersionID, + }, ObjectOptions{ + VersionID: oi.VersionID, + }); err != nil { + logger.LogIf(ctx, fmt.Errorf("Unable to update transition restore metadata for %s/%s(%s): %s", bucket, object, oi.VersionID, err)) + return err + } + return nil +} + +// restoreTransitionedObject for multipart object chunks the file stream from remote tier into the same number of parts +// as in the xl.meta for this version and rehydrates the part.n into the fi.DataDir for this version as in the xl.meta +func (es *erasureSingle) restoreTransitionedObject(ctx context.Context, bucket string, object string, opts ObjectOptions) error { + setRestoreHeaderFn := func(oi ObjectInfo, rerr error) error { + es.updateRestoreMetadata(ctx, bucket, object, oi, opts, rerr) + return rerr + } + var oi ObjectInfo + // get the file info on disk for transitioned object + actualfi, _, _, err := es.getObjectFileInfo(ctx, bucket, object, opts, false) + if err != nil { + return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) + } + + oi = actualfi.ToObjectInfo(bucket, object) + ropts := putRestoreOpts(bucket, object, opts.Transition.RestoreRequest, oi) + if len(oi.Parts) == 1 { + var rs *HTTPRangeSpec + gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, http.Header{}, oi, opts) + if err != nil { + return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) + } + defer gr.Close() + hashReader, err := hash.NewReader(gr, gr.ObjInfo.Size, "", "", gr.ObjInfo.Size) + if err != nil { + return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) + } + pReader := NewPutObjReader(hashReader) + ropts.UserDefined[xhttp.AmzRestore] = completedRestoreObj(opts.Transition.RestoreExpiry).String() + _, err = es.PutObject(ctx, bucket, object, pReader, ropts) + return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) + } + + uploadID, err := es.NewMultipartUpload(ctx, bucket, object, ropts) + if err != nil { + return setRestoreHeaderFn(oi, err) + } + + var uploadedParts []CompletePart + var rs *HTTPRangeSpec + // get reader from the warm backend - note that even in the case of encrypted objects, this stream is still encrypted. + gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, http.Header{}, oi, opts) + if err != nil { + return setRestoreHeaderFn(oi, err) + } + defer gr.Close() + + // rehydrate the parts back on disk as per the original xl.meta prior to transition + for _, partInfo := range oi.Parts { + hr, err := hash.NewReader(gr, partInfo.Size, "", "", partInfo.Size) + if err != nil { + return setRestoreHeaderFn(oi, err) + } + pInfo, err := es.PutObjectPart(ctx, bucket, object, uploadID, partInfo.Number, NewPutObjReader(hr), ObjectOptions{}) + if err != nil { + return setRestoreHeaderFn(oi, err) + } + if pInfo.Size != partInfo.Size { + return setRestoreHeaderFn(oi, InvalidObjectState{Bucket: bucket, Object: object}) + } + uploadedParts = append(uploadedParts, CompletePart{ + PartNumber: pInfo.PartNumber, + ETag: pInfo.ETag, + }) + } + _, err = es.CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, ObjectOptions{ + MTime: oi.ModTime, + }) + return setRestoreHeaderFn(oi, err) +} + +func (es *erasureSingle) getUploadIDDir(bucket, object, uploadID string) string { + return pathJoin(es.getMultipartSHADir(bucket, object), uploadID) +} + +func (es *erasureSingle) getMultipartSHADir(bucket, object string) string { + return getSHA256Hash([]byte(pathJoin(bucket, object))) +} + +// checkUploadIDExists - verify if a given uploadID exists and is valid. +func (es *erasureSingle) checkUploadIDExists(ctx context.Context, bucket, object, uploadID string) (err error) { + defer func() { + if err == errFileNotFound { + err = errUploadIDNotFound + } + }() + + disks := []StorageAPI{es.disk} + + // Read metadata associated with the object from all disks. + metaArr, errs := readAllFileInfo(ctx, disks, minioMetaMultipartBucket, es.getUploadIDDir(bucket, object, uploadID), "", false) + + readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, 0) + if err != nil { + return err + } + + if reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum); reducedErr != nil { + return reducedErr + } + + // List all online disks. + _, modTime := listOnlineDisks(disks, metaArr, errs) + + // Pick latest valid metadata. + _, err = pickValidFileInfo(ctx, metaArr, modTime, readQuorum) + return err +} + +// Removes part given by partName belonging to a mulitpart upload from minioMetaBucket +func (es *erasureSingle) removeObjectPart(bucket, object, uploadID, dataDir string, partNumber int) { + uploadIDPath := es.getUploadIDDir(bucket, object, uploadID) + curpartPath := pathJoin(uploadIDPath, dataDir, fmt.Sprintf("part.%d", partNumber)) + storageDisks := []StorageAPI{es.disk} + + g := errgroup.WithNErrs(len(storageDisks)) + for index, disk := range storageDisks { + if disk == nil { + continue + } + index := index + g.Go(func() error { + // Ignoring failure to remove parts that weren't present in CompleteMultipartUpload + // requests. xl.meta is the authoritative source of truth on which parts constitute + // the object. The presence of parts that don't belong in the object doesn't affect correctness. + _ = storageDisks[index].Delete(context.TODO(), minioMetaMultipartBucket, curpartPath, false) + return nil + }, index) + } + g.Wait() +} + +// Remove the old multipart uploads on the given disk. +func (es *erasureSingle) cleanupStaleUploadsOnDisk(ctx context.Context, disk StorageAPI, expiry time.Duration) { + now := time.Now() + diskPath := disk.Endpoint().Path + + readDirFn(pathJoin(diskPath, minioMetaMultipartBucket), func(shaDir string, typ os.FileMode) error { + return readDirFn(pathJoin(diskPath, minioMetaMultipartBucket, shaDir), func(uploadIDDir string, typ os.FileMode) error { + uploadIDPath := pathJoin(shaDir, uploadIDDir) + fi, err := disk.ReadVersion(ctx, minioMetaMultipartBucket, uploadIDPath, "", false) + if err != nil { + return nil + } + wait := es.deletedCleanupSleeper.Timer(ctx) + if now.Sub(fi.ModTime) > expiry { + es.disk.RenameFile(context.Background(), minioMetaMultipartBucket, uploadIDPath, minioMetaTmpDeletedBucket, mustGetUUID()) + } + wait() + return nil + }) + }) + + readDirFn(pathJoin(diskPath, minioMetaTmpBucket), func(tmpDir string, typ os.FileMode) error { + if tmpDir == ".trash/" { // do not remove .trash/ here, it has its own routines + return nil + } + vi, err := disk.StatVol(ctx, pathJoin(minioMetaTmpBucket, tmpDir)) + if err != nil { + return nil + } + wait := es.deletedCleanupSleeper.Timer(ctx) + if now.Sub(vi.Created) > expiry { + disk.Delete(ctx, minioMetaTmpBucket, tmpDir, true) + } + wait() + return nil + }) +} + +// ListMultipartUploads - lists all the pending multipart +// uploads for a particular object in a bucket. +// +// Implements minimal S3 compatible ListMultipartUploads API. We do +// not support prefix based listing, this is a deliberate attempt +// towards simplification of multipart APIs. +// The resulting ListMultipartsInfo structure is unmarshalled directly as XML. +func (es *erasureSingle) ListMultipartUploads(ctx context.Context, bucket, object, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (result ListMultipartsInfo, err error) { + if err := checkListMultipartArgs(ctx, bucket, object, keyMarker, uploadIDMarker, delimiter, es); err != nil { + return ListMultipartsInfo{}, err + } + + result.MaxUploads = maxUploads + result.KeyMarker = keyMarker + result.Prefix = object + result.Delimiter = delimiter + + uploadIDs, err := es.disk.ListDir(ctx, minioMetaMultipartBucket, es.getMultipartSHADir(bucket, object), -1) + if err != nil { + if err == errFileNotFound { + return result, nil + } + logger.LogIf(ctx, err) + return result, toObjectErr(err, bucket, object) + } + + for i := range uploadIDs { + uploadIDs[i] = strings.TrimSuffix(uploadIDs[i], SlashSeparator) + } + + // S3 spec says uploadIDs should be sorted based on initiated time, we need + // to read the metadata entry. + var uploads []MultipartInfo + + populatedUploadIds := set.NewStringSet() + + for _, uploadID := range uploadIDs { + if populatedUploadIds.Contains(uploadID) { + continue + } + fi, err := es.disk.ReadVersion(ctx, minioMetaMultipartBucket, pathJoin(es.getUploadIDDir(bucket, object, uploadID)), "", false) + if err != nil { + return result, toObjectErr(err, bucket, object) + } + populatedUploadIds.Add(uploadID) + uploads = append(uploads, MultipartInfo{ + Object: object, + UploadID: uploadID, + Initiated: fi.ModTime, + }) + } + + sort.Slice(uploads, func(i int, j int) bool { + return uploads[i].Initiated.Before(uploads[j].Initiated) + }) + + uploadIndex := 0 + if uploadIDMarker != "" { + for uploadIndex < len(uploads) { + if uploads[uploadIndex].UploadID != uploadIDMarker { + uploadIndex++ + continue + } + if uploads[uploadIndex].UploadID == uploadIDMarker { + uploadIndex++ + break + } + uploadIndex++ + } + } + for uploadIndex < len(uploads) { + result.Uploads = append(result.Uploads, uploads[uploadIndex]) + result.NextUploadIDMarker = uploads[uploadIndex].UploadID + uploadIndex++ + if len(result.Uploads) == maxUploads { + break + } + } + + result.IsTruncated = uploadIndex < len(uploads) + + if !result.IsTruncated { + result.NextKeyMarker = "" + result.NextUploadIDMarker = "" + } + + return result, nil +} + +// newMultipartUpload - wrapper for initializing a new multipart +// request; returns a unique upload id. +// +// Internally this function creates 'uploads.json' associated for the +// incoming object at +// '.minio.sys/multipart/bucket/object/uploads.json' on all the +// disks. `uploads.json` carries metadata regarding on-going multipart +// operation(s) on the object. +func (es *erasureSingle) newMultipartUpload(ctx context.Context, bucket string, object string, opts ObjectOptions) (string, error) { + onlineDisks := []StorageAPI{es.disk} + parityDrives := 0 + dataDrives := len(onlineDisks) - parityDrives + + // we now know the number of blocks this object needs for data and parity. + // establish the writeQuorum using this data + writeQuorum := dataDrives + if dataDrives == parityDrives { + writeQuorum++ + } + + // Initialize parts metadata + partsMetadata := make([]FileInfo, len(onlineDisks)) + + fi := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives) + fi.VersionID = opts.VersionID + if opts.Versioned && fi.VersionID == "" { + fi.VersionID = mustGetUUID() + } + fi.DataDir = mustGetUUID() + + // Initialize erasure metadata. + for index := range partsMetadata { + partsMetadata[index] = fi + } + + // Guess content-type from the extension if possible. + if opts.UserDefined["content-type"] == "" { + opts.UserDefined["content-type"] = mimedb.TypeByExtension(path.Ext(object)) + } + + modTime := opts.MTime + if opts.MTime.IsZero() { + modTime = UTCNow() + } + + onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(onlineDisks, partsMetadata, fi) + + // Fill all the necessary metadata. + // Update `xl.meta` content on each disks. + for index := range partsMetadata { + partsMetadata[index].Fresh = true + partsMetadata[index].ModTime = modTime + partsMetadata[index].Metadata = opts.UserDefined + } + + uploadID := mustGetUUID() + uploadIDPath := es.getUploadIDDir(bucket, object, uploadID) + + // Write updated `xl.meta` to all disks. + if _, err := writeUniqueFileInfo(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, partsMetadata, writeQuorum); err != nil { + return "", toObjectErr(err, minioMetaMultipartBucket, uploadIDPath) + } + + // Return success. + return uploadID, nil +} + +// NewMultipartUpload - initialize a new multipart upload, returns a +// unique id. The unique id returned here is of UUID form, for each +// subsequent request each UUID is unique. +// +// Implements S3 compatible initiate multipart API. +func (es *erasureSingle) NewMultipartUpload(ctx context.Context, bucket, object string, opts ObjectOptions) (string, error) { + if err := checkNewMultipartArgs(ctx, bucket, object, es); err != nil { + return "", err + } + + // No metadata is set, allocate a new one. + if opts.UserDefined == nil { + opts.UserDefined = make(map[string]string) + } + return es.newMultipartUpload(ctx, bucket, object, opts) +} + +// CopyObjectPart - reads incoming stream and internally erasure codes +// them. This call is similar to put object part operation but the source +// data is read from an existing object. +// +// Implements S3 compatible Upload Part Copy API. +func (es *erasureSingle) CopyObjectPart(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject, uploadID string, partID int, startOffset int64, length int64, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (pi PartInfo, e error) { + partInfo, err := es.PutObjectPart(ctx, dstBucket, dstObject, uploadID, partID, NewPutObjReader(srcInfo.Reader), dstOpts) + if err != nil { + return pi, toObjectErr(err, dstBucket, dstObject) + } + + // Success. + return partInfo, nil +} + +// PutObjectPart - reads incoming stream and internally erasure codes +// them. This call is similar to single put operation but it is part +// of the multipart transaction. +// +// Implements S3 compatible Upload Part API. +func (es *erasureSingle) PutObjectPart(ctx context.Context, bucket, object, uploadID string, partID int, r *PutObjReader, opts ObjectOptions) (pi PartInfo, err error) { + if err := checkPutObjectPartArgs(ctx, bucket, object, es); err != nil { + return PartInfo{}, err + } + + // Write lock for this part ID. + // Held throughout the operation. + partIDLock := es.NewNSLock(bucket, pathJoin(object, uploadID, strconv.Itoa(partID))) + plkctx, err := partIDLock.GetLock(ctx, globalOperationTimeout) + if err != nil { + return PartInfo{}, err + } + pctx := plkctx.Context() + defer partIDLock.Unlock(plkctx.Cancel) + + // Read lock for upload id. + // Only held while reading the upload metadata. + uploadIDRLock := es.NewNSLock(bucket, pathJoin(object, uploadID)) + rlkctx, err := uploadIDRLock.GetRLock(ctx, globalOperationTimeout) + if err != nil { + return PartInfo{}, err + } + rctx := rlkctx.Context() + defer func() { + if uploadIDRLock != nil { + uploadIDRLock.RUnlock(rlkctx.Cancel) + } + }() + + data := r.Reader + // Validate input data size and it can never be less than zero. + if data.Size() < -1 { + logger.LogIf(rctx, errInvalidArgument, logger.Application) + return pi, toObjectErr(errInvalidArgument) + } + + var partsMetadata []FileInfo + var errs []error + uploadIDPath := es.getUploadIDDir(bucket, object, uploadID) + + // Validates if upload ID exists. + if err = es.checkUploadIDExists(rctx, bucket, object, uploadID); err != nil { + return pi, toObjectErr(err, bucket, object, uploadID) + } + + storageDisks := []StorageAPI{es.disk} + + // Read metadata associated with the object from all disks. + partsMetadata, errs = readAllFileInfo(rctx, storageDisks, minioMetaMultipartBucket, + uploadIDPath, "", false) + + // Unlock upload id locks before, so others can get it. + uploadIDRLock.RUnlock(rlkctx.Cancel) + uploadIDRLock = nil + + // get Quorum for this object + _, writeQuorum, err := objectQuorumFromMeta(pctx, partsMetadata, errs, 0) + if err != nil { + return pi, toObjectErr(err, bucket, object) + } + + reducedErr := reduceWriteQuorumErrs(pctx, errs, objectOpIgnoredErrs, writeQuorum) + if reducedErr == errErasureWriteQuorum { + return pi, toObjectErr(reducedErr, bucket, object) + } + + // List all online disks. + onlineDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs) + + // Pick one from the first valid metadata. + fi, err := pickValidFileInfo(pctx, partsMetadata, modTime, writeQuorum) + if err != nil { + return pi, err + } + + onlineDisks = shuffleDisks(onlineDisks, fi.Erasure.Distribution) + + // Need a unique name for the part being written in minioMetaBucket to + // accommodate concurrent PutObjectPart requests + + partSuffix := fmt.Sprintf("part.%d", partID) + tmpPart := mustGetUUID() + tmpPartPath := pathJoin(tmpPart, partSuffix) + + // Delete the temporary object part. If PutObjectPart succeeds there would be nothing to delete. + var online int + defer func() { + if online != len(onlineDisks) { + es.disk.RenameFile(context.Background(), minioMetaTmpBucket, tmpPart, minioMetaTmpDeletedBucket, mustGetUUID()) + } + }() + + erasure, err := NewErasure(pctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize) + if err != nil { + return pi, toObjectErr(err, bucket, object) + } + + // Fetch buffer for I/O, returns from the pool if not allocates a new one and returns. + var buffer []byte + switch size := data.Size(); { + case size == 0: + buffer = make([]byte, 1) // Allocate atleast a byte to reach EOF + case size == -1: + if size := data.ActualSize(); size > 0 && size < fi.Erasure.BlockSize { + buffer = make([]byte, data.ActualSize()+256, data.ActualSize()*2+512) + } else { + buffer = es.bp.Get() + defer es.bp.Put(buffer) + } + case size >= fi.Erasure.BlockSize: + buffer = es.bp.Get() + defer es.bp.Put(buffer) + case size < fi.Erasure.BlockSize: + // No need to allocate fully fi.Erasure.BlockSize buffer if the incoming data is smalles. + buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1)) + } + + if len(buffer) > int(fi.Erasure.BlockSize) { + buffer = buffer[:fi.Erasure.BlockSize] + } + writers := make([]io.Writer, len(onlineDisks)) + for i, disk := range onlineDisks { + if disk == nil { + continue + } + writers[i] = newBitrotWriter(disk, minioMetaTmpBucket, tmpPartPath, erasure.ShardFileSize(data.Size()), DefaultBitrotAlgorithm, erasure.ShardSize()) + } + + toEncode := io.Reader(data) + if data.Size() > bigFileThreshold { + // Add input readahead. + // We use 2 buffers, so we always have a full buffer of input. + bufA := es.bp.Get() + bufB := es.bp.Get() + defer es.bp.Put(bufA) + defer es.bp.Put(bufB) + ra, err := readahead.NewReaderBuffer(data, [][]byte{bufA[:fi.Erasure.BlockSize], bufB[:fi.Erasure.BlockSize]}) + if err == nil { + toEncode = ra + defer ra.Close() + } + } + + n, err := erasure.Encode(pctx, toEncode, writers, buffer, writeQuorum) + closeBitrotWriters(writers) + if err != nil { + return pi, toObjectErr(err, bucket, object) + } + + // Should return IncompleteBody{} error when reader has fewer bytes + // than specified in request header. + if n < data.Size() { + return pi, IncompleteBody{Bucket: bucket, Object: object} + } + + for i := range writers { + if writers[i] == nil { + onlineDisks[i] = nil + } + } + + // Acquire write lock to update metadata. + uploadIDWLock := es.NewNSLock(bucket, pathJoin(object, uploadID)) + wlkctx, err := uploadIDWLock.GetLock(pctx, globalOperationTimeout) + if err != nil { + return PartInfo{}, err + } + wctx := wlkctx.Context() + defer uploadIDWLock.Unlock(wlkctx.Cancel) + + // Validates if upload ID exists. + if err = es.checkUploadIDExists(wctx, bucket, object, uploadID); err != nil { + return pi, toObjectErr(err, bucket, object, uploadID) + } + + // Rename temporary part file to its final location. + partPath := pathJoin(uploadIDPath, fi.DataDir, partSuffix) + onlineDisks, err = renamePart(wctx, onlineDisks, minioMetaTmpBucket, tmpPartPath, minioMetaMultipartBucket, partPath, writeQuorum) + if err != nil { + return pi, toObjectErr(err, minioMetaMultipartBucket, partPath) + } + + // Read metadata again because it might be updated with parallel upload of another part. + partsMetadata, errs = readAllFileInfo(wctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, "", false) + reducedErr = reduceWriteQuorumErrs(wctx, errs, objectOpIgnoredErrs, writeQuorum) + if reducedErr == errErasureWriteQuorum { + return pi, toObjectErr(reducedErr, bucket, object) + } + + // Get current highest version based on re-read partsMetadata. + onlineDisks, modTime = listOnlineDisks(onlineDisks, partsMetadata, errs) + + // Pick one from the first valid metadata. + fi, err = pickValidFileInfo(wctx, partsMetadata, modTime, writeQuorum) + if err != nil { + return pi, err + } + + // Once part is successfully committed, proceed with updating erasure metadata. + fi.ModTime = UTCNow() + + md5hex := r.MD5CurrentHexString() + + // Add the current part. + fi.AddObjectPart(partID, md5hex, n, data.ActualSize()) + + for i, disk := range onlineDisks { + if disk == OfflineDisk { + continue + } + partsMetadata[i].Size = fi.Size + partsMetadata[i].ModTime = fi.ModTime + partsMetadata[i].Parts = fi.Parts + partsMetadata[i].Erasure.AddChecksumInfo(ChecksumInfo{ + PartNumber: partID, + Algorithm: DefaultBitrotAlgorithm, + Hash: bitrotWriterSum(writers[i]), + }) + } + + // Writes update `xl.meta` format for each disk. + if _, err = writeUniqueFileInfo(wctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, partsMetadata, writeQuorum); err != nil { + return pi, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath) + } + + online = countOnlineDisks(onlineDisks) + + // Return success. + return PartInfo{ + PartNumber: partID, + ETag: md5hex, + LastModified: fi.ModTime, + Size: n, + ActualSize: data.ActualSize(), + }, nil +} + +// GetMultipartInfo returns multipart metadata uploaded during newMultipartUpload, used +// by callers to verify object states +// - encrypted +// - compressed +func (es *erasureSingle) GetMultipartInfo(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (MultipartInfo, error) { + if err := checkListPartsArgs(ctx, bucket, object, es); err != nil { + return MultipartInfo{}, err + } + + result := MultipartInfo{ + Bucket: bucket, + Object: object, + UploadID: uploadID, + } + + uploadIDLock := es.NewNSLock(bucket, pathJoin(object, uploadID)) + lkctx, err := uploadIDLock.GetRLock(ctx, globalOperationTimeout) + if err != nil { + return MultipartInfo{}, err + } + ctx = lkctx.Context() + defer uploadIDLock.RUnlock(lkctx.Cancel) + + if err := es.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { + return result, toObjectErr(err, bucket, object, uploadID) + } + + uploadIDPath := es.getUploadIDDir(bucket, object, uploadID) + + storageDisks := []StorageAPI{es.disk} + + // Read metadata associated with the object from all disks. + partsMetadata, errs := readAllFileInfo(ctx, storageDisks, minioMetaMultipartBucket, uploadIDPath, opts.VersionID, false) + + // get Quorum for this object + readQuorum, _, err := objectQuorumFromMeta(ctx, partsMetadata, errs, 0) + if err != nil { + return result, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath) + } + + reducedErr := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum) + if reducedErr == errErasureReadQuorum { + return result, toObjectErr(reducedErr, minioMetaMultipartBucket, uploadIDPath) + } + + _, modTime := listOnlineDisks(storageDisks, partsMetadata, errs) + + // Pick one from the first valid metadata. + fi, err := pickValidFileInfo(ctx, partsMetadata, modTime, readQuorum) + if err != nil { + return result, err + } + + result.UserDefined = cloneMSS(fi.Metadata) + return result, nil +} + +// ListObjectParts - lists all previously uploaded parts for a given +// object and uploadID. Takes additional input of part-number-marker +// to indicate where the listing should begin from. +// +// Implements S3 compatible ListObjectParts API. The resulting +// ListPartsInfo structure is marshaled directly into XML and +// replied back to the client. +func (es *erasureSingle) ListObjectParts(ctx context.Context, bucket, object, uploadID string, partNumberMarker, maxParts int, opts ObjectOptions) (result ListPartsInfo, err error) { + if err := checkListPartsArgs(ctx, bucket, object, es); err != nil { + return ListPartsInfo{}, err + } + + uploadIDLock := es.NewNSLock(bucket, pathJoin(object, uploadID)) + lkctx, err := uploadIDLock.GetRLock(ctx, globalOperationTimeout) + if err != nil { + return ListPartsInfo{}, err + } + ctx = lkctx.Context() + defer uploadIDLock.RUnlock(lkctx.Cancel) + + if err := es.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { + return result, toObjectErr(err, bucket, object, uploadID) + } + + uploadIDPath := es.getUploadIDDir(bucket, object, uploadID) + + storageDisks := []StorageAPI{es.disk} + + // Read metadata associated with the object from all disks. + partsMetadata, errs := readAllFileInfo(ctx, storageDisks, minioMetaMultipartBucket, uploadIDPath, "", false) + + // get Quorum for this object + _, writeQuorum, err := objectQuorumFromMeta(ctx, partsMetadata, errs, 0) + if err != nil { + return result, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath) + } + + reducedErr := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum) + if reducedErr == errErasureWriteQuorum { + return result, toObjectErr(reducedErr, minioMetaMultipartBucket, uploadIDPath) + } + + _, modTime := listOnlineDisks(storageDisks, partsMetadata, errs) + + // Pick one from the first valid metadata. + fi, err := pickValidFileInfo(ctx, partsMetadata, modTime, writeQuorum) + if err != nil { + return result, err + } + + // Populate the result stub. + result.Bucket = bucket + result.Object = object + result.UploadID = uploadID + result.MaxParts = maxParts + result.PartNumberMarker = partNumberMarker + result.UserDefined = cloneMSS(fi.Metadata) + + // For empty number of parts or maxParts as zero, return right here. + if len(fi.Parts) == 0 || maxParts == 0 { + return result, nil + } + + // Limit output to maxPartsList. + if maxParts > maxPartsList { + maxParts = maxPartsList + } + + // Only parts with higher part numbers will be listed. + partIdx := objectPartIndex(fi.Parts, partNumberMarker) + parts := fi.Parts + if partIdx != -1 { + parts = fi.Parts[partIdx+1:] + } + count := maxParts + for _, part := range parts { + result.Parts = append(result.Parts, PartInfo{ + PartNumber: part.Number, + ETag: part.ETag, + LastModified: fi.ModTime, + Size: part.Size, + }) + count-- + if count == 0 { + break + } + } + // If listed entries are more than maxParts, we set IsTruncated as true. + if len(parts) > len(result.Parts) { + result.IsTruncated = true + // Make sure to fill next part number marker if IsTruncated is + // true for subsequent listing. + nextPartNumberMarker := result.Parts[len(result.Parts)-1].PartNumber + result.NextPartNumberMarker = nextPartNumberMarker + } + return result, nil +} + +// CompleteMultipartUpload - completes an ongoing multipart +// transaction after receiving all the parts indicated by the client. +// Returns an md5sum calculated by concatenating all the individual +// md5sums of all the parts. +// +// Implements S3 compatible Complete multipart API. +func (es *erasureSingle) CompleteMultipartUpload(ctx context.Context, bucket string, object string, uploadID string, parts []CompletePart, opts ObjectOptions) (oi ObjectInfo, err error) { + if err = checkCompleteMultipartArgs(ctx, bucket, object, es); err != nil { + return oi, err + } + + // Hold read-locks to verify uploaded parts, also disallows + // parallel part uploads as well. + uploadIDLock := es.NewNSLock(bucket, pathJoin(object, uploadID)) + rlkctx, err := uploadIDLock.GetRLock(ctx, globalOperationTimeout) + if err != nil { + return oi, err + } + rctx := rlkctx.Context() + defer uploadIDLock.RUnlock(rlkctx.Cancel) + + if err = es.checkUploadIDExists(rctx, bucket, object, uploadID); err != nil { + return oi, toObjectErr(err, bucket, object, uploadID) + } + + uploadIDPath := es.getUploadIDDir(bucket, object, uploadID) + + storageDisks := []StorageAPI{es.disk} + + // Read metadata associated with the object from all disks. + partsMetadata, errs := readAllFileInfo(rctx, storageDisks, minioMetaMultipartBucket, uploadIDPath, "", false) + + // get Quorum for this object + _, writeQuorum, err := objectQuorumFromMeta(rctx, partsMetadata, errs, 0) + if err != nil { + return oi, toObjectErr(err, bucket, object) + } + + reducedErr := reduceWriteQuorumErrs(rctx, errs, objectOpIgnoredErrs, writeQuorum) + if reducedErr == errErasureWriteQuorum { + return oi, toObjectErr(reducedErr, bucket, object) + } + + onlineDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs) + + // Pick one from the first valid metadata. + fi, err := pickValidFileInfo(rctx, partsMetadata, modTime, writeQuorum) + if err != nil { + return oi, err + } + + // Calculate full object size. + var objectSize int64 + + // Calculate consolidated actual size. + var objectActualSize int64 + + // Order online disks in accordance with distribution order. + // Order parts metadata in accordance with distribution order. + onlineDisks, partsMetadata = shuffleDisksAndPartsMetadataByIndex(onlineDisks, partsMetadata, fi) + + // Save current erasure metadata for validation. + currentFI := fi + + // Allocate parts similar to incoming slice. + fi.Parts = make([]ObjectPartInfo, len(parts)) + + // Validate each part and then commit to disk. + for i, part := range parts { + partIdx := objectPartIndex(currentFI.Parts, part.PartNumber) + // All parts should have same part number. + if partIdx == -1 { + invp := InvalidPart{ + PartNumber: part.PartNumber, + GotETag: part.ETag, + } + return oi, invp + } + + // ensure that part ETag is canonicalized to strip off extraneous quotes + part.ETag = canonicalizeETag(part.ETag) + if currentFI.Parts[partIdx].ETag != part.ETag { + invp := InvalidPart{ + PartNumber: part.PartNumber, + ExpETag: currentFI.Parts[partIdx].ETag, + GotETag: part.ETag, + } + return oi, invp + } + + // All parts except the last part has to be atleast 5MB. + if (i < len(parts)-1) && !isMinAllowedPartSize(currentFI.Parts[partIdx].ActualSize) { + return oi, PartTooSmall{ + PartNumber: part.PartNumber, + PartSize: currentFI.Parts[partIdx].ActualSize, + PartETag: part.ETag, + } + } + + // Save for total object size. + objectSize += currentFI.Parts[partIdx].Size + + // Save the consolidated actual size. + objectActualSize += currentFI.Parts[partIdx].ActualSize + + // Add incoming parts. + fi.Parts[i] = ObjectPartInfo{ + Number: part.PartNumber, + Size: currentFI.Parts[partIdx].Size, + ActualSize: currentFI.Parts[partIdx].ActualSize, + } + } + + // Save the final object size and modtime. + fi.Size = objectSize + fi.ModTime = opts.MTime + if opts.MTime.IsZero() { + fi.ModTime = UTCNow() + } + + // Save successfully calculated md5sum. + fi.Metadata["etag"] = opts.UserDefined["etag"] + if fi.Metadata["etag"] == "" { + fi.Metadata["etag"] = getCompleteMultipartMD5(parts) + } + + // Save the consolidated actual size. + fi.Metadata[ReservedMetadataPrefix+"actual-size"] = strconv.FormatInt(objectActualSize, 10) + + // Update all erasure metadata, make sure to not modify fields like + // checksum which are different on each disks. + for index := range partsMetadata { + if partsMetadata[index].IsValid() { + partsMetadata[index].Size = fi.Size + partsMetadata[index].ModTime = fi.ModTime + partsMetadata[index].Metadata = fi.Metadata + partsMetadata[index].Parts = fi.Parts + } + } + + // Hold namespace to complete the transaction + lk := es.NewNSLock(bucket, object) + lkctx, err := lk.GetLock(ctx, globalOperationTimeout) + if err != nil { + return oi, err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + + // Write final `xl.meta` at uploadID location + onlineDisks, err = writeUniqueFileInfo(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, partsMetadata, writeQuorum) + if err != nil { + return oi, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath) + } + + // Remove parts that weren't present in CompleteMultipartUpload request. + for _, curpart := range currentFI.Parts { + if objectPartIndex(fi.Parts, curpart.Number) == -1 { + // Delete the missing part files. e.g, + // Request 1: NewMultipart + // Request 2: PutObjectPart 1 + // Request 3: PutObjectPart 2 + // Request 4: CompleteMultipartUpload --part 2 + // N.B. 1st part is not present. This part should be removed from the storage. + es.removeObjectPart(bucket, object, uploadID, fi.DataDir, curpart.Number) + } + } + + // Rename the multipart object to final location. + if onlineDisks, err = renameData(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, + partsMetadata, bucket, object, writeQuorum); err != nil { + return oi, toObjectErr(err, bucket, object) + } + + for i := 0; i < len(onlineDisks); i++ { + if onlineDisks[i] != nil && onlineDisks[i].IsOnline() { + // Object info is the same in all disks, so we can pick + // the first meta from online disk + fi = partsMetadata[i] + break + } + } + + // we are adding a new version to this object under the namespace lock, so this is the latest version. + fi.IsLatest = true + + // Success, return object info. + return fi.ToObjectInfo(bucket, object), nil +} + +// AbortMultipartUpload - aborts an ongoing multipart operation +// signified by the input uploadID. This is an atomic operation +// doesn't require clients to initiate multiple such requests. +// +// All parts are purged from all disks and reference to the uploadID +// would be removed from the system, rollback is not possible on this +// operation. +func (es *erasureSingle) AbortMultipartUpload(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (err error) { + if err = checkAbortMultipartArgs(ctx, bucket, object, es); err != nil { + return err + } + + lk := es.NewNSLock(bucket, pathJoin(object, uploadID)) + lkctx, err := lk.GetLock(ctx, globalOperationTimeout) + if err != nil { + return err + } + ctx = lkctx.Context() + defer lk.Unlock(lkctx.Cancel) + + // Validates if upload ID exists. + if err := es.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { + return toObjectErr(err, bucket, object, uploadID) + } + + // Cleanup all uploaded parts. + es.disk.RenameFile(ctx, minioMetaMultipartBucket, es.getUploadIDDir(bucket, object, uploadID), minioMetaTmpDeletedBucket, mustGetUUID()) + + // Successfully purged. + return nil +} + +func (es *erasureSingle) ListObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { + var loi ListObjectsInfo + + // Automatically remove the object/version is an expiry lifecycle rule can be applied + lc, _ := globalLifecycleSys.Get(bucket) + + // Check if bucket is object locked. + rcfg, _ := globalBucketObjectLockSys.Get(bucket) + + if len(prefix) > 0 && maxKeys == 1 && delimiter == "" && marker == "" { + // Optimization for certain applications like + // - Cohesity + // - Actifio, Splunk etc. + // which send ListObjects requests where the actual object + // itself is the prefix and max-keys=1 in such scenarios + // we can simply verify locally if such an object exists + // to avoid the need for ListObjects(). + objInfo, err := es.GetObjectInfo(ctx, bucket, prefix, ObjectOptions{NoLock: true}) + if err == nil { + if lc != nil { + action := evalActionFromLifecycle(ctx, *lc, rcfg, objInfo, false) + switch action { + case lifecycle.DeleteVersionAction, lifecycle.DeleteAction: + fallthrough + case lifecycle.DeleteRestoredAction, lifecycle.DeleteRestoredVersionAction: + return loi, nil + } + } + loi.Objects = append(loi.Objects, objInfo) + return loi, nil + } + } + + opts := listPathOptions{ + Bucket: bucket, + Prefix: prefix, + Separator: delimiter, + Limit: maxKeysPlusOne(maxKeys, marker != ""), + Marker: marker, + InclDeleted: false, + AskDisks: globalAPIConfig.getListQuorum(), + Lifecycle: lc, + Retention: rcfg, + } + + merged, err := es.listPath(ctx, &opts) + if err != nil && err != io.EOF { + if !isErrBucketNotFound(err) { + logger.LogIf(ctx, err) + } + return loi, err + } + + merged.forwardPast(opts.Marker) + defer merged.truncate(0) // Release when returning + + // Default is recursive, if delimiter is set then list non recursive. + objects := merged.fileInfos(bucket, prefix, delimiter) + loi.IsTruncated = err == nil && len(objects) > 0 + if maxKeys > 0 && len(objects) > maxKeys { + objects = objects[:maxKeys] + loi.IsTruncated = true + } + for _, obj := range objects { + if obj.IsDir && obj.ModTime.IsZero() && delimiter != "" { + loi.Prefixes = append(loi.Prefixes, obj.Name) + } else { + loi.Objects = append(loi.Objects, obj) + } + } + if loi.IsTruncated { + last := objects[len(objects)-1] + loi.NextMarker = opts.encodeMarker(last.Name) + } + return loi, nil +} + +func (es *erasureSingle) ListObjectsV2(ctx context.Context, bucket, prefix, continuationToken, delimiter string, maxKeys int, fetchOwner bool, startAfter string) (ListObjectsV2Info, error) { + marker := continuationToken + if marker == "" { + marker = startAfter + } + + loi, err := es.ListObjects(ctx, bucket, prefix, marker, delimiter, maxKeys) + if err != nil { + return ListObjectsV2Info{}, err + } + + listObjectsV2Info := ListObjectsV2Info{ + IsTruncated: loi.IsTruncated, + ContinuationToken: continuationToken, + NextContinuationToken: loi.NextMarker, + Objects: loi.Objects, + Prefixes: loi.Prefixes, + } + return listObjectsV2Info, err +} + +func (es *erasureSingle) ListObjectVersions(ctx context.Context, bucket, prefix, marker, versionMarker, delimiter string, maxKeys int) (ListObjectVersionsInfo, error) { + loi := ListObjectVersionsInfo{} + if marker == "" && versionMarker != "" { + return loi, NotImplemented{} + } + + opts := listPathOptions{ + Bucket: bucket, + Prefix: prefix, + Separator: delimiter, + Limit: maxKeysPlusOne(maxKeys, marker != ""), + Marker: marker, + InclDeleted: true, + AskDisks: "strict", + Versioned: true, + } + + merged, err := es.listPath(ctx, &opts) + if err != nil && err != io.EOF { + return loi, err + } + defer merged.truncate(0) // Release when returning + if versionMarker == "" { + o := listPathOptions{Marker: marker} + // If we are not looking for a specific version skip it. + + o.parseMarker() + merged.forwardPast(o.Marker) + } + objects := merged.fileInfoVersions(bucket, prefix, delimiter, versionMarker) + loi.IsTruncated = err == nil && len(objects) > 0 + if maxKeys > 0 && len(objects) > maxKeys { + objects = objects[:maxKeys] + loi.IsTruncated = true + } + for _, obj := range objects { + if obj.IsDir && obj.ModTime.IsZero() && delimiter != "" { + loi.Prefixes = append(loi.Prefixes, obj.Name) + } else { + loi.Objects = append(loi.Objects, obj) + } + } + if loi.IsTruncated { + last := objects[len(objects)-1] + loi.NextMarker = opts.encodeMarker(last.Name) + loi.NextVersionIDMarker = last.VersionID + } + return loi, nil +} + +// Walk a bucket, optionally prefix recursively, until we have returned +// all the content to objectInfo channel, it is callers responsibility +// to allocate a receive channel for ObjectInfo, upon any unhandled +// error walker returns error. Optionally if context.Done() is received +// then Walk() stops the walker. +func (es *erasureSingle) Walk(ctx context.Context, bucket, prefix string, results chan<- ObjectInfo, opts ObjectOptions) error { + if err := checkListObjsArgs(ctx, bucket, prefix, "", es); err != nil { + // Upon error close the channel. + close(results) + return err + } + + ctx, cancel := context.WithCancel(ctx) + go func() { + defer cancel() + defer close(results) + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + loadEntry := func(entry metaCacheEntry) { + if entry.isDir() { + return + } + + fivs, err := entry.fileInfoVersions(bucket) + if err != nil { + cancel() + return + } + if opts.WalkAscending { + for i := len(fivs.Versions) - 1; i >= 0; i-- { + version := fivs.Versions[i] + results <- version.ToObjectInfo(bucket, version.Name) + } + return + } + for _, version := range fivs.Versions { + results <- version.ToObjectInfo(bucket, version.Name) + } + } + + // How to resolve partial results. + resolver := metadataResolutionParams{ + dirQuorum: 1, + objQuorum: 1, + bucket: bucket, + } + + path := baseDirFromPrefix(prefix) + filterPrefix := strings.Trim(strings.TrimPrefix(prefix, path), slashSeparator) + if path == prefix { + filterPrefix = "" + } + + lopts := listPathRawOptions{ + disks: []StorageAPI{es.disk}, + bucket: bucket, + path: path, + filterPrefix: filterPrefix, + recursive: true, + forwardTo: "", + minDisks: 1, + reportNotFound: false, + agreed: loadEntry, + partial: func(entries metaCacheEntries, nAgreed int, errs []error) { + entry, ok := entries.resolve(&resolver) + if !ok { + // check if we can get one entry atleast + // proceed to heal nonetheless. + entry, _ = entries.firstFound() + } + + loadEntry(*entry) + }, + finished: nil, + } + + if err := listPathRaw(ctx, lopts); err != nil { + logger.LogIf(ctx, fmt.Errorf("listPathRaw returned %w: opts(%#v)", err, lopts)) + return + } + }() + wg.Wait() + }() + + return nil +} + +// nsScanner will start scanning buckets and send updated totals as they are traversed. +// Updates are sent on a regular basis and the caller *must* consume them. +func (es *erasureSingle) nsScanner(ctx context.Context, buckets []BucketInfo, bf *bloomFilter, wantCycle uint32, updates chan<- dataUsageCache, healScanMode madmin.HealScanMode) error { + if len(buckets) == 0 { + return nil + } + + // Collect disks we can use. + disks := []StorageAPI{es.disk} + + // Load bucket totals + oldCache := dataUsageCache{} + if err := oldCache.load(ctx, es, dataUsageCacheName); err != nil { + return err + } + + // New cache.. + cache := dataUsageCache{ + Info: dataUsageCacheInfo{ + Name: dataUsageRoot, + NextCycle: oldCache.Info.NextCycle, + }, + Cache: make(map[string]dataUsageEntry, len(oldCache.Cache)), + } + bloom := bf.bytes() + + // Put all buckets into channel. + bucketCh := make(chan BucketInfo, len(buckets)) + // Add new buckets first + for _, b := range buckets { + if oldCache.find(b.Name) == nil { + bucketCh <- b + } + } + + // Add existing buckets. + for _, b := range buckets { + e := oldCache.find(b.Name) + if e != nil { + cache.replace(b.Name, dataUsageRoot, *e) + bucketCh <- b + } + } + + close(bucketCh) + bucketResults := make(chan dataUsageEntryInfo, len(disks)) + + // Start async collector/saver. + // This goroutine owns the cache. + var saverWg sync.WaitGroup + saverWg.Add(1) + go func() { + // Add jitter to the update time so multiple sets don't sync up. + updateTime := 30*time.Second + time.Duration(float64(10*time.Second)*rand.Float64()) + t := time.NewTicker(updateTime) + defer t.Stop() + defer saverWg.Done() + var lastSave time.Time + + for { + select { + case <-ctx.Done(): + // Return without saving. + return + case <-t.C: + if cache.Info.LastUpdate.Equal(lastSave) { + continue + } + logger.LogIf(ctx, cache.save(ctx, es, dataUsageCacheName)) + updates <- cache.clone() + lastSave = cache.Info.LastUpdate + case v, ok := <-bucketResults: + if !ok { + // Save final state... + cache.Info.NextCycle = wantCycle + cache.Info.LastUpdate = time.Now() + logger.LogIf(ctx, cache.save(ctx, es, dataUsageCacheName)) + updates <- cache + return + } + cache.replace(v.Name, v.Parent, v.Entry) + cache.Info.LastUpdate = time.Now() + } + } + }() + + // Shuffle disks to ensure a total randomness of bucket/disk association to ensure + // that objects that are not present in all disks are accounted and ILM applied. + r := rand.New(rand.NewSource(time.Now().UnixNano())) + r.Shuffle(len(disks), func(i, j int) { disks[i], disks[j] = disks[j], disks[i] }) + + // Start one scanner per disk + var wg sync.WaitGroup + wg.Add(len(disks)) + for i := range disks { + go func(i int) { + defer wg.Done() + disk := disks[i] + + for bucket := range bucketCh { + select { + case <-ctx.Done(): + return + default: + } + + // Load cache for bucket + cacheName := pathJoin(bucket.Name, dataUsageCacheName) + cache := dataUsageCache{} + logger.LogIf(ctx, cache.load(ctx, es, cacheName)) + if cache.Info.Name == "" { + cache.Info.Name = bucket.Name + } + cache.Info.BloomFilter = bloom + cache.Info.SkipHealing = true + cache.Info.NextCycle = wantCycle + if cache.Info.Name != bucket.Name { + logger.LogIf(ctx, fmt.Errorf("cache name mismatch: %s != %s", cache.Info.Name, bucket.Name)) + cache.Info = dataUsageCacheInfo{ + Name: bucket.Name, + LastUpdate: time.Time{}, + NextCycle: wantCycle, + } + } + // Collect updates. + updates := make(chan dataUsageEntry, 1) + var wg sync.WaitGroup + wg.Add(1) + go func(name string) { + defer wg.Done() + for update := range updates { + bucketResults <- dataUsageEntryInfo{ + Name: name, + Parent: dataUsageRoot, + Entry: update, + } + } + }(cache.Info.Name) + // Calc usage + before := cache.Info.LastUpdate + var err error + cache, err = disk.NSScanner(ctx, cache, updates, healScanMode) + cache.Info.BloomFilter = nil + if err != nil { + if !cache.Info.LastUpdate.IsZero() && cache.Info.LastUpdate.After(before) { + logger.LogIf(ctx, cache.save(ctx, es, cacheName)) + } else { + logger.LogIf(ctx, err) + } + // This ensures that we don't close + // bucketResults channel while the + // updates-collector goroutine still + // holds a reference to this. + wg.Wait() + continue + } + + wg.Wait() + var root dataUsageEntry + if r := cache.root(); r != nil { + root = cache.flatten(*r) + } + t := time.Now() + bucketResults <- dataUsageEntryInfo{ + Name: cache.Info.Name, + Parent: dataUsageRoot, + Entry: root, + } + // We want to avoid synchronizing up all writes in case + // the results are piled up. + time.Sleep(time.Duration(float64(time.Since(t)) * rand.Float64())) + // Save cache + logger.LogIf(ctx, cache.save(ctx, es, cacheName)) + } + }(i) + } + wg.Wait() + close(bucketResults) + saverWg.Wait() + + return nil +} + +func (es *erasureSingle) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32, healScanMode madmin.HealScanMode) error { + // Updates must be closed before we return. + defer close(updates) + + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + var wg sync.WaitGroup + var mu sync.Mutex + results := make([]dataUsageCache, 1) + var firstErr error + + allBuckets, err := es.ListBuckets(ctx) + if err != nil { + return err + } + + if len(allBuckets) == 0 { + updates <- DataUsageInfo{} // no buckets found update data usage to reflect latest state + return nil + } + + // Scanner latest allBuckets first. + sort.Slice(allBuckets, func(i, j int) bool { + return allBuckets[i].Created.After(allBuckets[j].Created) + }) + + wg.Add(1) + go func() { + updates := make(chan dataUsageCache, 1) + defer close(updates) + // Start update collector. + go func() { + defer wg.Done() + for info := range updates { + mu.Lock() + results[0] = info + mu.Unlock() + } + }() + + // Start scanner. Blocks until done. + err := es.nsScanner(ctx, allBuckets, bf, wantCycle, updates, healScanMode) + if err != nil { + logger.LogIf(ctx, err) + mu.Lock() + if firstErr == nil { + firstErr = err + } + // Cancel remaining... + cancel() + mu.Unlock() + return + } + }() + + updateCloser := make(chan chan struct{}) + go func() { + updateTicker := time.NewTicker(30 * time.Second) + defer updateTicker.Stop() + var lastUpdate time.Time + + // We need to merge since we will get the same buckets from each pool. + // Therefore to get the exact bucket sizes we must merge before we can convert. + var allMerged dataUsageCache + + update := func() { + mu.Lock() + defer mu.Unlock() + + allMerged = dataUsageCache{Info: dataUsageCacheInfo{Name: dataUsageRoot}} + for _, info := range results { + if info.Info.LastUpdate.IsZero() { + // Not filled yet. + return + } + allMerged.merge(info) + } + if allMerged.root() != nil && allMerged.Info.LastUpdate.After(lastUpdate) { + updates <- allMerged.dui(allMerged.Info.Name, allBuckets) + lastUpdate = allMerged.Info.LastUpdate + } + } + for { + select { + case <-ctx.Done(): + return + case v := <-updateCloser: + update() + close(v) + return + case <-updateTicker.C: + update() + } + } + }() + + wg.Wait() + ch := make(chan struct{}) + select { + case updateCloser <- ch: + <-ch + case <-ctx.Done(): + if firstErr == nil { + firstErr = ctx.Err() + } + } + return firstErr +} diff --git a/cmd/erasure.go b/cmd/erasure.go index 3a206a80b..45a51e035 100644 --- a/cmd/erasure.go +++ b/cmd/erasure.go @@ -79,7 +79,7 @@ func (er erasureObjects) NewNSLock(bucket string, objects ...string) RWLocker { // Shutdown function for object storage interface. func (er erasureObjects) Shutdown(ctx context.Context) error { // Add any object layer shutdown activities here. - closeStorageDisks(er.getDisks()) + closeStorageDisks(er.getDisks()...) return nil } diff --git a/cmd/format-erasure.go b/cmd/format-erasure.go index b152c0d95..2097da096 100644 --- a/cmd/format-erasure.go +++ b/cmd/format-erasure.go @@ -40,6 +40,9 @@ const ( // Represents Erasure backend. formatBackendErasure = "xl" + // Represents Erasure backend - single drive + formatBackendErasureSingle = "xl-single" + // formatErasureV1.Erasure.Version - version '1'. formatErasureVersionV1 = "1" @@ -146,6 +149,9 @@ func newFormatErasureV3(numSets int, setLen int) *formatErasureV3 { format := &formatErasureV3{} format.Version = formatMetaVersionV1 format.Format = formatBackendErasure + if setLen == 1 { + format.Format = formatBackendErasureSingle + } format.ID = mustGetUUID() format.Erasure.Version = formatErasureVersionV3 format.Erasure.DistributionAlgo = formatErasureVersionV3DistributionAlgoV3 @@ -170,8 +176,8 @@ func formatGetBackendErasureVersion(b []byte) (string, error) { if meta.Version != formatMetaVersionV1 { return "", fmt.Errorf(`format.Version expected: %s, got: %s`, formatMetaVersionV1, meta.Version) } - if meta.Format != formatBackendErasure { - return "", fmt.Errorf(`found backend type %s, expected %s`, meta.Format, formatBackendErasure) + if meta.Format != formatBackendErasure && meta.Format != formatBackendErasureSingle { + return "", fmt.Errorf(`found backend type %s, expected %s or %s`, meta.Format, formatBackendErasure, formatBackendErasureSingle) } // Erasure backend found, proceed to detect version. format := &formatErasureVersionDetect{} @@ -291,7 +297,7 @@ func formatErasureMigrateV2ToV3(data []byte, export, version string) ([]byte, er func countErrs(errs []error, err error) int { i := 0 for _, err1 := range errs { - if err1 == err { + if err1 == err || errors.Is(err1, err) { i++ } } @@ -410,7 +416,7 @@ func checkFormatErasureValue(formatErasure *formatErasureV3, disk StorageAPI) er if formatErasure.Version != formatMetaVersionV1 { return fmt.Errorf("Unsupported version of backend format [%s] found on %s", formatErasure.Version, disk) } - if formatErasure.Format != formatBackendErasure { + if formatErasure.Format != formatBackendErasure && formatErasure.Format != formatBackendErasureSingle { return fmt.Errorf("Unsupported backend format [%s] found on %s", formatErasure.Format, disk) } if formatErasure.Erasure.Version != formatErasureVersionV3 { @@ -643,7 +649,7 @@ func saveFormatErasureAll(ctx context.Context, storageDisks []StorageAPI, format } // relinquishes the underlying connection for all storage disks. -func closeStorageDisks(storageDisks []StorageAPI) { +func closeStorageDisks(storageDisks ...StorageAPI) { var wg sync.WaitGroup for _, disk := range storageDisks { if disk == nil { diff --git a/cmd/format-fs.go b/cmd/format-fs.go index 172924186..29fd029d7 100644 --- a/cmd/format-fs.go +++ b/cmd/format-fs.go @@ -240,6 +240,9 @@ func initFormatFS(ctx context.Context, fsPath string) (rlk *lock.RLockedFile, er if err != nil { return nil, err } + if formatBackend == formatBackendErasureSingle { + return nil, errFreshDisk + } if formatBackend != formatBackendFS { return nil, fmt.Errorf(`%s file: expected format-type: %s, found: %s`, formatConfigFile, formatBackendFS, formatBackend) } @@ -319,6 +322,10 @@ func formatFSFixDeploymentID(ctx context.Context, fsFormatPath string) error { rlk.Close() return err } + if formatBackend == formatBackendErasureSingle { + rlk.Close() + return errFreshDisk + } if formatBackend != formatBackendFS { rlk.Close() return fmt.Errorf(`%s file: expected format-type: %s, found: %s`, formatConfigFile, formatBackendFS, formatBackend) diff --git a/cmd/fs-v1-metadata_test.go b/cmd/fs-v1-metadata_test.go index 2003b67a4..1d16914d7 100644 --- a/cmd/fs-v1-metadata_test.go +++ b/cmd/fs-v1-metadata_test.go @@ -45,6 +45,8 @@ func TestFSV1MetadataObjInfo(t *testing.T) { // TestReadFSMetadata - readFSMetadata testing with a healthy and faulty disk func TestReadFSMetadata(t *testing.T) { + t.Skip() + disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) @@ -80,6 +82,7 @@ func TestReadFSMetadata(t *testing.T) { // TestWriteFSMetadata - tests of writeFSMetadata with healthy disk. func TestWriteFSMetadata(t *testing.T) { + t.Skip() disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) diff --git a/cmd/fs-v1-multipart_test.go b/cmd/fs-v1-multipart_test.go index 16210ea65..24d371c1b 100644 --- a/cmd/fs-v1-multipart_test.go +++ b/cmd/fs-v1-multipart_test.go @@ -32,6 +32,7 @@ import ( // Tests cleanup multipart uploads for filesystem backend. func TestFSCleanupMultipartUploadsInRoutine(t *testing.T) { + t.Skip() // Prepare for tests disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) @@ -88,6 +89,7 @@ func TestFSCleanupMultipartUploadsInRoutine(t *testing.T) { // TestNewMultipartUploadFaultyDisk - test NewMultipartUpload with faulty disks func TestNewMultipartUploadFaultyDisk(t *testing.T) { + t.Skip() // Prepare for tests disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) diff --git a/cmd/fs-v1.go b/cmd/fs-v1.go index 1f0d22314..3ec9e9425 100644 --- a/cmd/fs-v1.go +++ b/cmd/fs-v1.go @@ -142,6 +142,11 @@ func NewFSObjectLayer(fsPath string) (ObjectLayer, error) { return nil, config.ErrUnableToWriteInBackend(err).Hint(hint) } + fsFormatPath := pathJoin(fsPath, minioMetaBucket, formatConfigFile) + if _, err = fsStat(ctx, fsFormatPath); err != nil && os.IsNotExist(err) { + return nil, errFreshDisk + } + // Assign a new UUID for FS minio mode. Each server instance // gets its own UUID for temporary file transaction. fsUUID := mustGetUUID() diff --git a/cmd/fs-v1_test.go b/cmd/fs-v1_test.go index 785523451..471fff001 100644 --- a/cmd/fs-v1_test.go +++ b/cmd/fs-v1_test.go @@ -51,6 +51,8 @@ func TestNewFS(t *testing.T) { // TestFSShutdown - initialize a new FS object layer then calls // Shutdown to check returned results func TestFSShutdown(t *testing.T) { + t.Skip() + bucketName := "testbucket" objectName := "object" // Create and return an fsObject with its path in the disk @@ -83,6 +85,8 @@ func TestFSShutdown(t *testing.T) { // TestFSGetBucketInfo - test GetBucketInfo with healty and faulty disks func TestFSGetBucketInfo(t *testing.T) { + t.Skip() + // Prepare for testing disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) @@ -165,6 +169,7 @@ func TestFSPutObject(t *testing.T) { // TestFSDeleteObject - test fs.DeleteObject() with healthy and corrupted disks func TestFSDeleteObject(t *testing.T) { + t.Skip() // Prepare for tests disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) @@ -209,6 +214,7 @@ func TestFSDeleteObject(t *testing.T) { // TestFSDeleteBucket - tests for fs DeleteBucket func TestFSDeleteBucket(t *testing.T) { + t.Skip() // Prepare for testing disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) @@ -249,6 +255,7 @@ func TestFSDeleteBucket(t *testing.T) { // TestFSListBuckets - tests for fs ListBuckets func TestFSListBuckets(t *testing.T) { + t.Skip() // Prepare for tests disk := filepath.Join(globalTestTmpDir, "minio-"+nextSuffix()) defer os.RemoveAll(disk) diff --git a/cmd/gateway-main.go b/cmd/gateway-main.go index b257f959d..9e38c35a7 100644 --- a/cmd/gateway-main.go +++ b/cmd/gateway-main.go @@ -19,6 +19,7 @@ package cmd import ( "context" + "errors" "fmt" "io/ioutil" "log" @@ -32,6 +33,7 @@ import ( "github.com/minio/cli" "github.com/minio/madmin-go" "github.com/minio/minio/internal/color" + "github.com/minio/minio/internal/config" xhttp "github.com/minio/minio/internal/http" "github.com/minio/minio/internal/logger" "github.com/minio/pkg/certs" @@ -292,6 +294,9 @@ func StartGateway(ctx *cli.Context, gw Gateway) { SecretKey: globalActiveCred.SecretKey, }) if err != nil { + if errors.Is(err, errFreshDisk) { + err = config.ErrInvalidFSValue(err) + } logger.FatalIf(err, "Unable to initialize gateway backend") } newObject = NewGatewayLayerWithLocker(newObject) diff --git a/cmd/global-heal.go b/cmd/global-heal.go index 740522223..397b9af84 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -340,7 +340,9 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, // healObject heals given object path in deep to fix bitrot. func healObject(bucket, object, versionID string, scan madmin.HealScanMode) { // Get background heal sequence to send elements to heal + globalHealStateLK.Lock() bgSeq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) + globalHealStateLK.Unlock() if ok { bgSeq.queueHealTask(healSource{ bucket: bucket, diff --git a/cmd/globals.go b/cmd/globals.go index 5ede39527..37eeb23cd 100644 --- a/cmd/globals.go +++ b/cmd/globals.go @@ -74,6 +74,7 @@ const ( globalWindowsOSName = "windows" globalMacOSName = "darwin" globalMinioModeFS = "mode-server-fs" + globalMinioModeErasureSD = "mode-server-xl-single" globalMinioModeErasure = "mode-server-xl" globalMinioModeDistErasure = "mode-server-distributed-xl" globalMinioModeGatewayPrefix = "mode-gateway-" @@ -141,6 +142,9 @@ var ( // Indicates if the running minio server is an erasure-code backend. globalIsErasure = false + // Indicates if the running minio server is in single drive XL mode. + globalIsErasureSD = false + // Indicates if the running minio is in gateway mode. globalIsGateway = false diff --git a/cmd/handler-api.go b/cmd/handler-api.go index 1d2ac3971..45bce36a8 100644 --- a/cmd/handler-api.go +++ b/cmd/handler-api.go @@ -116,10 +116,7 @@ func (t *apiConfig) init(cfg api.Config, setDriveCounts []int) { // + 2 * 10MiB (default erasure block size v1) + 2 * 1MiB (default erasure block size v2) blockSize := xioutil.BlockSizeLarge + xioutil.BlockSizeSmall apiRequestsMaxPerNode = int(maxMem / uint64(maxSetDrives*blockSize+int(blockSizeV1*2+blockSizeV2*2))) - - if globalIsErasure { - logger.Info("Automatically configured API requests per node based on available memory on the system: %d", apiRequestsMaxPerNode) - } + logger.Info("Automatically configured API requests per node based on available memory on the system: %d", apiRequestsMaxPerNode) } else { apiRequestsMaxPerNode = cfg.RequestsMax if len(globalEndpoints.Hostnames()) > 0 { diff --git a/cmd/iam.go b/cmd/iam.go index 2564dd06e..772501dbb 100644 --- a/cmd/iam.go +++ b/cmd/iam.go @@ -314,18 +314,20 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer, etcdClient *etc break } + refreshInterval := sys.iamRefreshInterval + // Set up polling for expired accounts and credentials purging. switch { case sys.openIDConfig.ProviderEnabled(): go func() { - timer := time.NewTimer(sys.iamRefreshInterval) + timer := time.NewTimer(refreshInterval) defer timer.Stop() for { select { case <-timer.C: sys.purgeExpiredCredentialsForExternalSSO(ctx) - timer.Reset(sys.iamRefreshInterval) + timer.Reset(refreshInterval) case <-ctx.Done(): return } @@ -333,7 +335,7 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer, etcdClient *etc }() case sys.ldapConfig.Enabled: go func() { - timer := time.NewTimer(sys.iamRefreshInterval) + timer := time.NewTimer(refreshInterval) defer timer.Stop() for { @@ -342,7 +344,7 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer, etcdClient *etc sys.purgeExpiredCredentialsForLDAP(ctx) sys.updateGroupMembershipsForLDAP(ctx) - timer.Reset(sys.iamRefreshInterval) + timer.Reset(refreshInterval) case <-ctx.Done(): return } @@ -717,7 +719,7 @@ func (sys *IAMSys) SetTempUser(ctx context.Context, accessKey string, cred auth. return errServerNotInitialized } - if globalAuthZPlugin != nil { + if newGlobalAuthZPluginFn() != nil { // If OPA is set, we do not need to set a policy mapping. policyName = "" } @@ -1690,8 +1692,8 @@ func (sys *IAMSys) GetCombinedPolicy(policies ...string) iampolicy.Policy { // IsAllowed - checks given policy args is allowed to continue the Rest API. func (sys *IAMSys) IsAllowed(args iampolicy.Args) bool { // If opa is configured, use OPA always. - if globalAuthZPlugin != nil { - ok, err := globalAuthZPlugin.IsAllowed(args) + if authz := newGlobalAuthZPluginFn(); authz != nil { + ok, err := authz.IsAllowed(args) if err != nil { logger.LogIf(GlobalContext, err) } diff --git a/cmd/lock-rest-server.go b/cmd/lock-rest-server.go index 7d07004a3..43cd3955c 100644 --- a/cmd/lock-rest-server.go +++ b/cmd/lock-rest-server.go @@ -209,22 +209,7 @@ func (l *lockRESTServer) ForceUnlockHandler(w http.ResponseWriter, r *http.Reque // lockMaintenance loops over all locks and discards locks // that have not been refreshed for some time. func lockMaintenance(ctx context.Context) { - // Wait until the object API is ready - // no need to start the lock maintenance - // if ObjectAPI is not initialized. - - var objAPI ObjectLayer - - for { - objAPI = newObjectLayerFn() - if objAPI == nil { - time.Sleep(time.Second) - continue - } - break - } - - if _, ok := objAPI.(*erasureServerPools); !ok { + if !globalIsDistErasure { return } diff --git a/cmd/metacache-bucket.go b/cmd/metacache-bucket.go index fdad507cb..274223f8e 100644 --- a/cmd/metacache-bucket.go +++ b/cmd/metacache-bucket.go @@ -51,10 +51,12 @@ func newBucketMetacache(bucket string, cleanup bool) *bucketMetacache { if cleanup { // Recursively delete all caches. objAPI := newObjectLayerFn() - ez, ok := objAPI.(*erasureServerPools) - if ok { - ctx := context.Background() - ez.renameAll(ctx, minioMetaBucket, metacachePrefixForID(bucket, slashSeparator)) + if objAPI != nil { + ez, ok := objAPI.(renameAllStorager) + if ok { + ctx := context.Background() + ez.renameAll(ctx, minioMetaBucket, metacachePrefixForID(bucket, slashSeparator)) + } } } return &bucketMetacache{ @@ -207,9 +209,15 @@ func (b *bucketMetacache) cloneCaches() (map[string]metacache, map[string][]stri // Deletes are performed concurrently. func (b *bucketMetacache) deleteAll() { ctx := context.Background() - ez, ok := newObjectLayerFn().(*erasureServerPools) + + objAPI := newObjectLayerFn() + if objAPI == nil { + return + } + + ez, ok := objAPI.(renameAllStorager) if !ok { - logger.LogIf(ctx, errors.New("bucketMetacache: expected objAPI to be *erasurePools")) + logger.LogIf(ctx, errors.New("bucketMetacache: expected objAPI to be 'renameAllStorager'")) return } diff --git a/cmd/metacache-manager.go b/cmd/metacache-manager.go index 34282fad5..dc967f5e7 100644 --- a/cmd/metacache-manager.go +++ b/cmd/metacache-manager.go @@ -56,7 +56,7 @@ func (m *metacacheManager) initManager() { objAPI = newObjectLayerFn() } - if !globalIsErasure { + if globalIsGateway { return } diff --git a/cmd/metacache-server-pool.go b/cmd/metacache-server-pool.go index c38c3a864..c28c7d554 100644 --- a/cmd/metacache-server-pool.go +++ b/cmd/metacache-server-pool.go @@ -259,6 +259,286 @@ func (z *erasureServerPools) listPath(ctx context.Context, o *listPathOptions) ( return entries, nil } +// listPath will return the requested entries. +// If no more entries are in the listing io.EOF is returned, +// otherwise nil or an unexpected error is returned. +// The listPathOptions given will be checked and modified internally. +// Required important fields are Bucket, Prefix, Separator. +// Other important fields are Limit, Marker. +// List ID always derived from the Marker. +func (es *erasureSingle) listPath(ctx context.Context, o *listPathOptions) (entries metaCacheEntriesSorted, err error) { + if err := checkListObjsArgs(ctx, o.Bucket, o.Prefix, o.Marker, es); err != nil { + return entries, err + } + + // Marker is set validate pre-condition. + if o.Marker != "" && o.Prefix != "" { + // Marker not common with prefix is not implemented. Send an empty response + if !HasPrefix(o.Marker, o.Prefix) { + return entries, io.EOF + } + } + + // With max keys of zero we have reached eof, return right here. + if o.Limit == 0 { + return entries, io.EOF + } + + // For delimiter and prefix as '/' we do not list anything at all + // along // with the prefix. On a flat namespace with 'prefix' + // as '/' we don't have any entries, since all the keys are + // of form 'keyName/...' + if strings.HasPrefix(o.Prefix, SlashSeparator) { + return entries, io.EOF + } + + // If delimiter is slashSeparator we must return directories of + // the non-recursive scan unless explicitly requested. + o.IncludeDirectories = o.Separator == slashSeparator + if (o.Separator == slashSeparator || o.Separator == "") && !o.Recursive { + o.Recursive = o.Separator != slashSeparator + o.Separator = slashSeparator + } else { + // Default is recursive, if delimiter is set then list non recursive. + o.Recursive = true + } + + // Decode and get the optional list id from the marker. + o.parseMarker() + o.BaseDir = baseDirFromPrefix(o.Prefix) + o.Transient = o.Transient || isReservedOrInvalidBucket(o.Bucket, false) + o.SetFilter() + if o.Transient { + o.Create = false + } + + // We have 2 cases: + // 1) Cold listing, just list. + // 2) Returning, but with no id. Start async listing. + // 3) Returning, with ID, stream from list. + // + // If we don't have a list id we must ask the server if it has a cache or create a new. + if o.ID != "" && !o.Transient { + // Create or ping with handout... + rpc := globalNotificationSys.restClientFromHash(pathJoin(o.Bucket, o.Prefix)) + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + var c *metacache + if rpc == nil { + resp := localMetacacheMgr.getBucket(ctx, o.Bucket).findCache(*o) + c = &resp + } else { + c, err = rpc.GetMetacacheListing(ctx, *o) + } + if err != nil { + if errors.Is(err, context.Canceled) { + // Context is canceled, return at once. + // request canceled, no entries to return + return entries, io.EOF + } + if !errors.Is(err, context.DeadlineExceeded) { + o.debugln("listPath: got error", err) + } + o.Transient = true + o.Create = false + o.ID = mustGetUUID() + } else { + if c.fileNotFound { + // No cache found, no entries found. + return entries, io.EOF + } + if c.status == scanStateError || c.status == scanStateNone { + o.ID = "" + o.Create = false + o.debugln("scan status", c.status, " - waiting a roundtrip to create") + } else { + // Continue listing + o.ID = c.id + go func(meta metacache) { + // Continuously update while we wait. + t := time.NewTicker(metacacheMaxClientWait / 10) + defer t.Stop() + select { + case <-ctx.Done(): + // Request is done, stop updating. + return + case <-t.C: + meta.lastHandout = time.Now() + if rpc == nil { + meta, _ = localMetacacheMgr.updateCacheEntry(meta) + } + meta, _ = rpc.UpdateMetacacheListing(ctx, meta) + } + }(*c) + } + } + + // We have an existing list ID, continue streaming. + if o.Create { + o.debugln("Creating", o) + entries, err = es.listAndSave(ctx, o) + if err == nil || err == io.EOF { + return entries, err + } + entries.truncate(0) + } else { + o.debugln("Resuming", o) + entries, err = es.streamMetadataParts(ctx, *o) + entries.reuse = true // We read from stream and are not sharing results. + if err == nil { + return entries, nil + } + } + if IsErr(err, []error{ + nil, + context.Canceled, + context.DeadlineExceeded, + // io.EOF is expected and should be returned but no need to log it. + io.EOF, + }...) { + // Expected good errors we don't need to return error. + return entries, err + } + entries.truncate(0) + go func() { + rpc := globalNotificationSys.restClientFromHash(pathJoin(o.Bucket, o.Prefix)) + if rpc != nil { + ctx, cancel := context.WithTimeout(GlobalContext, 5*time.Second) + defer cancel() + c, err := rpc.GetMetacacheListing(ctx, *o) + if err == nil { + c.error = "no longer used" + c.status = scanStateError + rpc.UpdateMetacacheListing(ctx, *c) + } + } + }() + o.ID = "" + + if err != nil { + logger.LogIf(ctx, fmt.Errorf("Resuming listing from drives failed %w, proceeding to do raw listing", err)) + } + } + + // Do listing in-place. + // Create output for our results. + // Create filter for results. + o.debugln("Raw List", o) + filterCh := make(chan metaCacheEntry, o.Limit) + listCtx, cancelList := context.WithCancel(ctx) + filteredResults := o.gatherResults(listCtx, filterCh) + var wg sync.WaitGroup + wg.Add(1) + var listErr error + + go func(o listPathOptions) { + defer wg.Done() + o.Limit = 0 + listErr = es.listMerged(listCtx, o, filterCh) + o.debugln("listMerged returned with", listErr) + }(*o) + + entries, err = filteredResults() + cancelList() + wg.Wait() + if listErr != nil && !errors.Is(listErr, context.Canceled) { + return entries, listErr + } + entries.reuse = true + truncated := entries.len() > o.Limit || err == nil + entries.truncate(o.Limit) + if !o.Transient && truncated { + if o.ID == "" { + entries.listID = mustGetUUID() + } else { + entries.listID = o.ID + } + } + if !truncated { + return entries, io.EOF + } + return entries, nil +} + +// listMerged will list across all sets and return a merged results stream. +// The result channel is closed when no more results are expected. +func (es *erasureSingle) listMerged(ctx context.Context, o listPathOptions, results chan<- metaCacheEntry) error { + var mu sync.Mutex + var wg sync.WaitGroup + var listErr error + var inputs []chan metaCacheEntry + + innerResults := make(chan metaCacheEntry, 100) + inputs = append(inputs, innerResults) + + mu.Lock() + listCtx, cancelList := context.WithCancel(ctx) + defer cancelList() + + wg.Add(1) + go func() { + defer wg.Done() + err := es.listPathInner(listCtx, o, innerResults) + mu.Lock() + defer mu.Unlock() + listErr = err + }() + mu.Unlock() + + // Do lifecycle filtering. + if o.Lifecycle != nil { + filterIn := make(chan metaCacheEntry, 10) + go filterLifeCycle(ctx, o.Bucket, *o.Lifecycle, o.Retention, filterIn, results) + // Replace results. + results = filterIn + } + + // Gather results to a single channel. + err := mergeEntryChannels(ctx, inputs, results, func(existing, other *metaCacheEntry) (replace bool) { + // Pick object over directory + if existing.isDir() && !other.isDir() { + return true + } + if !existing.isDir() && other.isDir() { + return false + } + eMeta, err := existing.xlmeta() + if err != nil { + return true + } + oMeta, err := other.xlmeta() + if err != nil { + return false + } + // Replace if modtime is newer + if !oMeta.latestModtime().Equal(oMeta.latestModtime()) { + return oMeta.latestModtime().After(eMeta.latestModtime()) + } + // Use NumVersions as a final tiebreaker. + return len(oMeta.versions) > len(eMeta.versions) + }) + + cancelList() + wg.Wait() + if err != nil { + return err + } + if listErr != nil { + if contextCanceled(ctx) { + return nil + } + if listErr.Error() == io.EOF.Error() { + return nil + } + logger.LogIf(ctx, listErr) + return listErr + } + if contextCanceled(ctx) { + return ctx.Err() + } + return nil +} + // listMerged will list across all sets and return a merged results stream. // The result channel is closed when no more results are expected. func (z *erasureServerPools) listMerged(ctx context.Context, o listPathOptions, results chan<- metaCacheEntry) error { @@ -395,6 +675,73 @@ func filterLifeCycle(ctx context.Context, bucket string, lc lifecycle.Lifecycle, } } +func (es *erasureSingle) listAndSave(ctx context.Context, o *listPathOptions) (entries metaCacheEntriesSorted, err error) { + // Use ID as the object name... + o.pool = 0 + o.set = 0 + saver := es + + // Disconnect from call above, but cancel on exit. + listCtx, cancel := context.WithCancel(GlobalContext) + saveCh := make(chan metaCacheEntry, metacacheBlockSize) + inCh := make(chan metaCacheEntry, metacacheBlockSize) + outCh := make(chan metaCacheEntry, o.Limit) + + filteredResults := o.gatherResults(ctx, outCh) + + mc := o.newMetacache() + meta := metaCacheRPC{meta: &mc, cancel: cancel, rpc: globalNotificationSys.restClientFromHash(pathJoin(o.Bucket, o.Prefix)), o: *o} + + // Save listing... + go func() { + if err := saver.saveMetaCacheStream(listCtx, &meta, saveCh); err != nil { + meta.setErr(err.Error()) + } + cancel() + }() + + // Do listing... + go func(o listPathOptions) { + err := es.listMerged(listCtx, o, inCh) + if err != nil { + meta.setErr(err.Error()) + } + o.debugln("listAndSave: listing", o.ID, "finished with ", err) + }(*o) + + // Keep track of when we return since we no longer have to send entries to output. + var funcReturned bool + var funcReturnedMu sync.Mutex + defer func() { + funcReturnedMu.Lock() + funcReturned = true + funcReturnedMu.Unlock() + }() + // Write listing to results and saver. + go func() { + var returned bool + for entry := range inCh { + if !returned { + funcReturnedMu.Lock() + returned = funcReturned + funcReturnedMu.Unlock() + outCh <- entry + if returned { + close(outCh) + } + } + entry.reusable = returned + saveCh <- entry + } + if !returned { + close(outCh) + } + close(saveCh) + }() + + return filteredResults() +} + func (z *erasureServerPools) listAndSave(ctx context.Context, o *listPathOptions) (entries metaCacheEntriesSorted, err error) { // Use ID as the object name... o.pool = z.getAvailablePoolIdx(ctx, minioMetaBucket, o.ID, 10<<20) diff --git a/cmd/metacache-set.go b/cmd/metacache-set.go index f0c090a43..37f72a582 100644 --- a/cmd/metacache-set.go +++ b/cmd/metacache-set.go @@ -543,6 +543,170 @@ func (er *erasureObjects) streamMetadataParts(ctx context.Context, o listPathOpt } } +func (es *erasureSingle) streamMetadataParts(ctx context.Context, o listPathOptions) (entries metaCacheEntriesSorted, err error) { + retries := 0 + rpc := globalNotificationSys.restClientFromHash(pathJoin(o.Bucket, o.Prefix)) + + for { + if contextCanceled(ctx) { + return entries, ctx.Err() + } + + // If many failures, check the cache state. + if retries > 10 { + err := o.checkMetacacheState(ctx, rpc) + if err != nil { + return entries, fmt.Errorf("remote listing canceled: %w", err) + } + retries = 1 + } + + const retryDelay = 250 * time.Millisecond + // All operations are performed without locks, so we must be careful and allow for failures. + // Read metadata associated with the object from a disk. + if retries > 0 { + _, err := es.disk.ReadVersion(ctx, minioMetaBucket, + o.objectPath(0), "", false) + if err != nil { + time.Sleep(retryDelay) + retries++ + continue + } + } + + // Load first part metadata... + // Read metadata associated with the object from all disks. + fi, metaArr, onlineDisks, err := es.getObjectFileInfo(ctx, minioMetaBucket, o.objectPath(0), ObjectOptions{}, true) + if err != nil { + switch toObjectErr(err, minioMetaBucket, o.objectPath(0)).(type) { + case ObjectNotFound: + retries++ + time.Sleep(retryDelay) + continue + case InsufficientReadQuorum: + retries++ + time.Sleep(retryDelay) + continue + default: + return entries, fmt.Errorf("reading first part metadata: %w", err) + } + } + + partN, err := o.findFirstPart(fi) + switch { + case err == nil: + case errors.Is(err, io.ErrUnexpectedEOF): + if retries == 10 { + err := o.checkMetacacheState(ctx, rpc) + if err != nil { + return entries, fmt.Errorf("remote listing canceled: %w", err) + } + retries = -1 + } + retries++ + time.Sleep(retryDelay) + continue + case errors.Is(err, io.EOF): + return entries, io.EOF + } + + // We got a stream to start at. + loadedPart := 0 + for { + if contextCanceled(ctx) { + return entries, ctx.Err() + } + + if partN != loadedPart { + if retries > 10 { + err := o.checkMetacacheState(ctx, rpc) + if err != nil { + return entries, fmt.Errorf("waiting for next part %d: %w", partN, err) + } + retries = 1 + } + + if retries > 0 { + // Load from one disk only + _, err := es.disk.ReadVersion(ctx, minioMetaBucket, + o.objectPath(partN), "", false) + if err != nil { + time.Sleep(retryDelay) + retries++ + continue + } + } + + // Load partN metadata... + fi, metaArr, onlineDisks, err = es.getObjectFileInfo(ctx, minioMetaBucket, o.objectPath(partN), ObjectOptions{}, true) + if err != nil { + time.Sleep(retryDelay) + retries++ + continue + } + loadedPart = partN + bi, err := getMetacacheBlockInfo(fi, partN) + logger.LogIf(ctx, err) + if err == nil { + if bi.pastPrefix(o.Prefix) { + return entries, io.EOF + } + } + } + + pr, pw := io.Pipe() + go func() { + werr := es.getObjectWithFileInfo(ctx, minioMetaBucket, o.objectPath(partN), 0, + fi.Size, pw, fi, metaArr, onlineDisks) + pw.CloseWithError(werr) + }() + + tmp := newMetacacheReader(pr) + e, err := tmp.filter(o) + pr.CloseWithError(err) + entries.o = append(entries.o, e.o...) + if o.Limit > 0 && entries.len() > o.Limit { + entries.truncate(o.Limit) + return entries, nil + } + if err == nil { + // We stopped within the listing, we are done for now... + return entries, nil + } + if err != nil && err.Error() != io.EOF.Error() { + switch toObjectErr(err, minioMetaBucket, o.objectPath(partN)).(type) { + case ObjectNotFound: + retries++ + time.Sleep(retryDelay) + continue + case InsufficientReadQuorum: + retries++ + time.Sleep(retryDelay) + continue + default: + logger.LogIf(ctx, err) + return entries, err + } + } + + // We finished at the end of the block. + // And should not expect any more results. + bi, err := getMetacacheBlockInfo(fi, partN) + logger.LogIf(ctx, err) + if err != nil || bi.EOS { + // We are done and there are no more parts. + return entries, io.EOF + } + if bi.endedPrefix(o.Prefix) { + // Nothing more for prefix. + return entries, io.EOF + } + partN++ + retries = 0 + } + } +} + // getListQuorum interprets list quorum values and returns appropriate // acceptable quorum expected for list operations func getListQuorum(quorum string, driveCount int) int { @@ -562,6 +726,53 @@ func getListQuorum(quorum string, driveCount int) int { return 3 } +// Will return io.EOF if continuing would not yield more results. +func (es *erasureSingle) listPathInner(ctx context.Context, o listPathOptions, results chan<- metaCacheEntry) (err error) { + defer close(results) + o.debugf(color.Green("listPath:")+" with options: %#v", o) + + // How to resolve results. + resolver := metadataResolutionParams{ + dirQuorum: 1, + objQuorum: 1, + bucket: o.Bucket, + } + + // Maximum versions requested for "latest" object + // resolution on versioned buckets, this is to be only + // used when o.Versioned is false + if !o.Versioned { + resolver.requestedVersions = 1 + } + + ctxDone := ctx.Done() + return listPathRaw(ctx, listPathRawOptions{ + disks: []StorageAPI{es.disk}, + bucket: o.Bucket, + path: o.BaseDir, + recursive: o.Recursive, + filterPrefix: o.FilterPrefix, + minDisks: 1, + forwardTo: o.Marker, + agreed: func(entry metaCacheEntry) { + select { + case <-ctxDone: + case results <- entry: + } + }, + partial: func(entries metaCacheEntries, nAgreed int, errs []error) { + // Results Disagree :-( + entry, ok := entries.resolve(&resolver) + if ok { + select { + case <-ctxDone: + case results <- *entry: + } + } + }, + }) +} + // Will return io.EOF if continuing would not yield more results. func (er *erasureObjects) listPath(ctx context.Context, o listPathOptions, results chan<- metaCacheEntry) (err error) { defer close(results) @@ -654,6 +865,133 @@ func (m *metaCacheRPC) setErr(err string) { *m.meta = meta } +func (es *erasureSingle) saveMetaCacheStream(ctx context.Context, mc *metaCacheRPC, entries <-chan metaCacheEntry) (err error) { + o := mc.o + o.debugf(color.Green("saveMetaCacheStream:")+" with options: %#v", o) + + metaMu := &mc.mu + rpc := mc.rpc + cancel := mc.cancel + defer func() { + o.debugln(color.Green("saveMetaCacheStream:")+"err:", err) + if err != nil && !errors.Is(err, io.EOF) { + go mc.setErr(err.Error()) + cancel() + } + }() + + defer cancel() + // Save continuous updates + go func() { + var err error + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + var exit bool + for !exit { + select { + case <-ticker.C: + case <-ctx.Done(): + exit = true + } + metaMu.Lock() + meta := *mc.meta + meta, err = o.updateMetacacheListing(meta, rpc) + if err == nil && time.Since(meta.lastHandout) > metacacheMaxClientWait { + cancel() + exit = true + meta.status = scanStateError + meta.error = fmt.Sprintf("listing canceled since time since last handout was %v ago", time.Since(meta.lastHandout).Round(time.Second)) + o.debugln(color.Green("saveMetaCacheStream: ") + meta.error) + meta, err = o.updateMetacacheListing(meta, rpc) + } + if err == nil { + *mc.meta = meta + if meta.status == scanStateError { + cancel() + exit = true + } + } + metaMu.Unlock() + } + }() + + const retryDelay = 200 * time.Millisecond + const maxTries = 5 + + // Keep destination... + // Write results to disk. + bw := newMetacacheBlockWriter(entries, func(b *metacacheBlock) error { + // if the block is 0 bytes and its a first block skip it. + // skip only this for Transient caches. + if len(b.data) == 0 && b.n == 0 && o.Transient { + return nil + } + o.debugln(color.Green("saveMetaCacheStream:")+" saving block", b.n, "to", o.objectPath(b.n)) + r, err := hash.NewReader(bytes.NewReader(b.data), int64(len(b.data)), "", "", int64(len(b.data))) + logger.LogIf(ctx, err) + custom := b.headerKV() + _, err = es.putMetacacheObject(ctx, o.objectPath(b.n), NewPutObjReader(r), ObjectOptions{ + UserDefined: custom, + }) + if err != nil { + mc.setErr(err.Error()) + cancel() + return err + } + if b.n == 0 { + return nil + } + // Update block 0 metadata. + var retries int + for { + meta := b.headerKV() + fi := FileInfo{ + Metadata: make(map[string]string, len(meta)), + } + for k, v := range meta { + fi.Metadata[k] = v + } + err := es.updateObjectMeta(ctx, minioMetaBucket, o.objectPath(0), fi, es.disk) + if err == nil { + break + } + switch err.(type) { + case ObjectNotFound: + return err + case StorageErr: + return err + case InsufficientReadQuorum: + default: + logger.LogIf(ctx, err) + } + if retries >= maxTries { + return err + } + retries++ + time.Sleep(retryDelay) + } + return nil + }) + + // Blocks while consuming entries or an error occurs. + err = bw.Close() + if err != nil { + mc.setErr(err.Error()) + } + metaMu.Lock() + defer metaMu.Unlock() + if mc.meta.error != "" { + return err + } + // Save success + mc.meta.status = scanStateSuccess + meta, err := o.updateMetacacheListing(*mc.meta, rpc) + if err == nil { + *mc.meta = meta + } + return nil +} + func (er *erasureObjects) saveMetaCacheStream(ctx context.Context, mc *metaCacheRPC, entries <-chan metaCacheEntry) (err error) { o := mc.o o.debugf(color.Green("saveMetaCacheStream:")+" with options: %#v", o) diff --git a/cmd/metacache.go b/cmd/metacache.go index d1b1aaac8..885541253 100644 --- a/cmd/metacache.go +++ b/cmd/metacache.go @@ -155,9 +155,9 @@ func (m *metacache) delete(ctx context.Context) { logger.LogIf(ctx, errors.New("metacache.delete: no object layer")) return } - ez, ok := objAPI.(*erasureServerPools) + ez, ok := objAPI.(renameAllStorager) if !ok { - logger.LogIf(ctx, errors.New("metacache.delete: expected objAPI to be *erasureServerPools")) + logger.LogIf(ctx, errors.New("metacache.delete: expected objAPI to be 'renameAllStorager'")) return } ez.renameAll(ctx, minioMetaBucket, metacachePrefixForID(m.bucket, m.id)) diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index 1dd884adc..61f470f7e 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -1365,7 +1365,7 @@ func getMinioHealingMetrics() *MetricsGroup { mg := &MetricsGroup{} mg.RegisterRead(func(_ context.Context) (metrics []Metric) { metrics = make([]Metric, 0, 5) - if !globalIsErasure { + if globalIsGateway { return } bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) @@ -1817,7 +1817,7 @@ func getClusterStorageMetrics() *MetricsGroup { mg.RegisterRead(func(ctx context.Context) (metrics []Metric) { objLayer := newObjectLayerFn() // Service not initialized yet - if objLayer == nil || !globalIsErasure { + if objLayer == nil || globalIsGateway { return } diff --git a/cmd/metrics.go b/cmd/metrics.go index c287e27b5..79225d363 100644 --- a/cmd/metrics.go +++ b/cmd/metrics.go @@ -132,7 +132,7 @@ func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) { // collects healing specific metrics for MinIO instance in Prometheus specific format // and sends to given channel func healingMetricsPrometheus(ch chan<- prometheus.Metric) { - if !globalIsErasure { + if globalIsGateway { return } bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) diff --git a/cmd/object-api-listobjects_test.go b/cmd/object-api-listobjects_test.go index a808c44df..46dd5f94a 100644 --- a/cmd/object-api-listobjects_test.go +++ b/cmd/object-api-listobjects_test.go @@ -35,9 +35,6 @@ func TestListObjectsVersionedFolders(t *testing.T) { } func testListObjectsVersionedFolders(obj ObjectLayer, instanceType string, t1 TestErrHandler) { - if instanceType == FSTestStr { - return - } t, _ := t1.(*testing.T) testBuckets := []string{ // This bucket is used for testing ListObject operations. @@ -317,9 +314,6 @@ func testListObjects(obj ObjectLayer, instanceType string, t1 TestErrHandler) { } func _testListObjects(obj ObjectLayer, instanceType string, t1 TestErrHandler, versioned bool) { - if instanceType == FSTestStr && versioned { - return - } t, _ := t1.(*testing.T) testBuckets := []string{ // This bucket is used for testing ListObject operations. @@ -1020,10 +1014,6 @@ func TestDeleteObjectVersionMarker(t *testing.T) { } func testDeleteObjectVersion(obj ObjectLayer, instanceType string, t1 TestErrHandler) { - if instanceType == FSTestStr { - return - } - t, _ := t1.(*testing.T) testBuckets := []string{ @@ -1101,10 +1091,6 @@ func TestListObjectVersions(t *testing.T) { // Unit test for ListObjectVersions func testListObjectVersions(obj ObjectLayer, instanceType string, t1 TestErrHandler) { - if instanceType == FSTestStr { - return - } - t, _ := t1.(*testing.T) testBuckets := []string{ // This bucket is used for testing ListObject operations. @@ -1886,16 +1872,14 @@ func testListObjectsContinuation(obj ObjectLayer, instanceType string, t1 TestEr // Initialize FS backend for the benchmark. func initFSObjectsB(disk string, t *testing.B) (obj ObjectLayer) { - var err error - obj, err = NewFSObjectLayer(disk) + obj, _, err := initObjectLayer(context.Background(), mustGetPoolEndpoints(disk)) if err != nil { - t.Fatal("Unexpected err: ", err) + t.Fatal(err) } newTestConfig(globalMinioDefaultRegion, obj) initAllSubsystems() - return obj } diff --git a/cmd/object-api-multipart_test.go b/cmd/object-api-multipart_test.go index ec10d8c96..e18fd1dba 100644 --- a/cmd/object-api-multipart_test.go +++ b/cmd/object-api-multipart_test.go @@ -1620,39 +1620,35 @@ func testListObjectParts(obj ObjectLayer, instanceType string, t TestErrHandler) t.Errorf("Test %d: %s: Expected Bucket to be \"%s\", but instead found it to be \"%s\"", i+1, instanceType, expectedResult.Bucket, actualResult.Bucket) } - // ListObjectParts returns empty response always in FS mode - if instanceType != FSTestStr { - // Asserting IsTruncated. - if actualResult.IsTruncated != testCase.expectedResult.IsTruncated { - t.Errorf("Test %d: %s: Expected IsTruncated to be \"%v\", but found it to \"%v\"", i+1, instanceType, expectedResult.IsTruncated, actualResult.IsTruncated) - continue + // Asserting IsTruncated. + if actualResult.IsTruncated != testCase.expectedResult.IsTruncated { + t.Errorf("Test %d: %s: Expected IsTruncated to be \"%v\", but found it to \"%v\"", i+1, instanceType, expectedResult.IsTruncated, actualResult.IsTruncated) + continue + } + // Asserting NextPartNumberMarker. + if actualResult.NextPartNumberMarker != expectedResult.NextPartNumberMarker { + t.Errorf("Test %d: %s: Expected NextPartNumberMarker to be \"%d\", but instead found it to be \"%d\"", i+1, instanceType, expectedResult.NextPartNumberMarker, actualResult.NextPartNumberMarker) + continue + } + // Asserting the number of Parts. + if len(expectedResult.Parts) != len(actualResult.Parts) { + t.Errorf("Test %d: %s: Expected the result to contain info of %d Parts, but found %d instead", i+1, instanceType, len(expectedResult.Parts), len(actualResult.Parts)) + continue + } + // Iterating over the partInfos and asserting the fields. + for j, actualMetaData := range actualResult.Parts { + // Asserting the PartNumber in the PartInfo. + if actualMetaData.PartNumber != expectedResult.Parts[j].PartNumber { + t.Errorf("Test %d: %s: Part %d: Expected PartNumber to be \"%d\", but instead found \"%d\"", i+1, instanceType, j+1, expectedResult.Parts[j].PartNumber, actualMetaData.PartNumber) } - // Asserting NextPartNumberMarker. - if actualResult.NextPartNumberMarker != expectedResult.NextPartNumberMarker { - t.Errorf("Test %d: %s: Expected NextPartNumberMarker to be \"%d\", but instead found it to be \"%d\"", i+1, instanceType, expectedResult.NextPartNumberMarker, actualResult.NextPartNumberMarker) - continue + // Asserting the Size in the PartInfo. + if actualMetaData.Size != expectedResult.Parts[j].Size { + t.Errorf("Test %d: %s: Part %d: Expected Part Size to be \"%d\", but instead found \"%d\"", i+1, instanceType, j+1, expectedResult.Parts[j].Size, actualMetaData.Size) } - // Asserting the number of Parts. - if len(expectedResult.Parts) != len(actualResult.Parts) { - t.Errorf("Test %d: %s: Expected the result to contain info of %d Parts, but found %d instead", i+1, instanceType, len(expectedResult.Parts), len(actualResult.Parts)) - continue + // Asserting the ETag in the PartInfo. + if actualMetaData.ETag != expectedResult.Parts[j].ETag { + t.Errorf("Test %d: %s: Part %d: Expected Etag to be \"%s\", but instead found \"%s\"", i+1, instanceType, j+1, expectedResult.Parts[j].ETag, actualMetaData.ETag) } - // Iterating over the partInfos and asserting the fields. - for j, actualMetaData := range actualResult.Parts { - // Asserting the PartNumber in the PartInfo. - if actualMetaData.PartNumber != expectedResult.Parts[j].PartNumber { - t.Errorf("Test %d: %s: Part %d: Expected PartNumber to be \"%d\", but instead found \"%d\"", i+1, instanceType, j+1, expectedResult.Parts[j].PartNumber, actualMetaData.PartNumber) - } - // Asserting the Size in the PartInfo. - if actualMetaData.Size != expectedResult.Parts[j].Size { - t.Errorf("Test %d: %s: Part %d: Expected Part Size to be \"%d\", but instead found \"%d\"", i+1, instanceType, j+1, expectedResult.Parts[j].Size, actualMetaData.Size) - } - // Asserting the ETag in the PartInfo. - if actualMetaData.ETag != expectedResult.Parts[j].ETag { - t.Errorf("Test %d: %s: Part %d: Expected Etag to be \"%s\", but instead found \"%s\"", i+1, instanceType, j+1, expectedResult.Parts[j].ETag, actualMetaData.ETag) - } - } - } } } diff --git a/cmd/object-api-utils.go b/cmd/object-api-utils.go index 9a4078078..08ca9146a 100644 --- a/cmd/object-api-utils.go +++ b/cmd/object-api-utils.go @@ -950,7 +950,7 @@ func compressSelfTest() { // getDiskInfos returns the disk information for the provided disks. // If a disk is nil or an error is returned the result will be nil as well. -func getDiskInfos(ctx context.Context, disks []StorageAPI) []*DiskInfo { +func getDiskInfos(ctx context.Context, disks ...StorageAPI) []*DiskInfo { res := make([]*DiskInfo, len(disks)) for i, disk := range disks { if disk == nil { diff --git a/cmd/object-handlers_test.go b/cmd/object-handlers_test.go index 9384910de..c1c66cec2 100644 --- a/cmd/object-handlers_test.go +++ b/cmd/object-handlers_test.go @@ -1954,7 +1954,7 @@ func testAPICopyObjectPartHandler(obj ObjectLayer, instanceType, bucketName stri if err != nil { t.Fatalf("Test %d: %s: Failed to look for copied object part: %s", i+1, instanceType, err) } - if instanceType != FSTestStr && len(results.Parts) != 1 { + if len(results.Parts) != 1 { t.Fatalf("Test %d: %s: Expected only one entry returned %d entries", i+1, instanceType, len(results.Parts)) } } diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index fd3cbcad3..c4131358a 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -150,13 +150,13 @@ func connectLoadInitFormats(verboseLogging bool, firstDisk bool, endpoints Endpo defer func(storageDisks []StorageAPI) { if err != nil { - closeStorageDisks(storageDisks) + closeStorageDisks(storageDisks...) } }(storageDisks) for i, err := range errs { - if err != nil { - if err == errDiskNotFound && verboseLogging { + if err != nil && !errors.Is(err, errXLBackend) { + if errors.Is(err, errDiskNotFound) && verboseLogging { logger.Error("Unable to connect to %s: %v", endpoints[i], isServerResolvable(endpoints[i], time.Second)) } else { logger.Error("Unable to use the drive %s: %v", endpoints[i], err) @@ -173,7 +173,7 @@ func connectLoadInitFormats(verboseLogging bool, firstDisk bool, endpoints Endpo // Check if we have for i, sErr := range sErrs { // print the error, nonetheless, which is perhaps unhandled - if sErr != errUnformattedDisk && sErr != errDiskNotFound && verboseLogging { + if !errors.Is(sErr, errUnformattedDisk) && !errors.Is(sErr, errDiskNotFound) && verboseLogging { if sErr != nil { logger.Error("Unable to read 'format.json' from %s: %v\n", endpoints[i], sErr) } diff --git a/cmd/server-main.go b/cmd/server-main.go index a0cb5872e..158560cc5 100644 --- a/cmd/server-main.go +++ b/cmd/server-main.go @@ -222,6 +222,7 @@ func serverHandleCmdArgs(ctx *cli.Context) { if globalIsDistErasure { globalIsErasure = true } + globalIsErasureSD = (setupType == ErasureSDSetupType) } func serverHandleEnvVars() { @@ -232,13 +233,11 @@ func serverHandleEnvVars() { var globalHealStateLK sync.RWMutex func initAllSubsystems() { - if globalIsErasure { - globalHealStateLK.Lock() - // New global heal state - globalAllHealState = newHealState(true) - globalBackgroundHealState = newHealState(false) - globalHealStateLK.Unlock() - } + globalHealStateLK.Lock() + // New global heal state + globalAllHealState = newHealState(true) + globalBackgroundHealState = newHealState(false) + globalHealStateLK.Unlock() // Create new notification system and initialize notification peer targets globalNotificationSys = NewNotificationSys(globalEndpoints) @@ -527,11 +526,8 @@ func serverMain(ctx *cli.Context) { xhttp.SetMinIOVersion(Version) // Enable background operations for erasure coding - if globalIsErasure { - initAutoHeal(GlobalContext, newObject) - initHealMRF(GlobalContext, newObject) - } - + initAutoHeal(GlobalContext, newObject) + initHealMRF(GlobalContext, newObject) initBackgroundExpiry(GlobalContext, newObject) if globalActiveCred.Equal(auth.DefaultCredentials) { @@ -579,21 +575,19 @@ func serverMain(ctx *cli.Context) { // Background all other operations such as initializing bucket metadata etc. go func() { // Initialize transition tier configuration manager - if globalIsErasure { - initBackgroundReplication(GlobalContext, newObject) - initBackgroundTransition(GlobalContext, newObject) + initBackgroundReplication(GlobalContext, newObject) + initBackgroundTransition(GlobalContext, newObject) - go func() { - if err := globalTierConfigMgr.Init(GlobalContext, newObject); err != nil { - logger.LogIf(GlobalContext, err) - } + go func() { + if err := globalTierConfigMgr.Init(GlobalContext, newObject); err != nil { + logger.LogIf(GlobalContext, err) + } - globalTierJournal, err = initTierDeletionJournal(GlobalContext) - if err != nil { - logger.FatalIf(err, "Unable to initialize remote tier pending deletes journal") - } - }() - } + globalTierJournal, err = initTierDeletionJournal(GlobalContext) + if err != nil { + logger.FatalIf(err, "Unable to initialize remote tier pending deletes journal") + } + }() // Initialize site replication manager. globalSiteReplicationSys.Init(GlobalContext, newObject) @@ -664,7 +658,13 @@ func newObjectLayer(ctx context.Context, endpointServerPools EndpointServerPools // For FS only, directly use the disk. if endpointServerPools.NEndpoints() == 1 { // Initialize new FS object layer. - return NewFSObjectLayer(endpointServerPools[0].Endpoints[0].Path) + newObject, err = NewFSObjectLayer(endpointServerPools[0].Endpoints[0].Path) + if err == nil { + return newObject, nil + } + if err != nil && err != errFreshDisk { + return newObject, err + } } return newErasureServerPools(ctx, endpointServerPools) diff --git a/cmd/server-main_test.go b/cmd/server-main_test.go index 980b1961a..580068b1f 100644 --- a/cmd/server-main_test.go +++ b/cmd/server-main_test.go @@ -27,7 +27,7 @@ import ( func TestNewObjectLayer(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Tests for FS object layer. + // Tests for ErasureSD object layer. nDisks := 1 disks, err := getRandomDisks(nDisks) if err != nil { @@ -39,7 +39,7 @@ func TestNewObjectLayer(t *testing.T) { if err != nil { t.Fatal("Unexpected object layer initialization error", err) } - _, ok := obj.(*FSObjects) + _, ok := obj.(*erasureSingle) if !ok { t.Fatal("Unexpected object layer detected", reflect.TypeOf(obj)) } diff --git a/cmd/server_test.go b/cmd/server_test.go index 9cf0cb115..dd2ebef84 100644 --- a/cmd/server_test.go +++ b/cmd/server_test.go @@ -39,7 +39,7 @@ import ( "github.com/minio/pkg/bucket/policy" ) -// API suite container common to both FS and Erasure. +// API suite container common to both ErasureSD and Erasure. type TestSuiteCommon struct { serverType string testServer TestServer @@ -122,12 +122,12 @@ func runAllTests(suite *TestSuiteCommon, c *check) { func TestServerSuite(t *testing.T) { testCases := []*TestSuiteCommon{ - // Init and run test on FS backend with signature v4. - {serverType: "FS", signer: signerV4}, - // Init and run test on FS backend with signature v2. - {serverType: "FS", signer: signerV2}, - // Init and run test on FS backend, with tls enabled. - {serverType: "FS", signer: signerV4, secure: true}, + // Init and run test on ErasureSD backend with signature v4. + {serverType: "ErasureSD", signer: signerV4}, + // Init and run test on ErasureSD backend with signature v2. + {serverType: "ErasureSD", signer: signerV2}, + // Init and run test on ErasureSD backend, with tls enabled. + {serverType: "ErasureSD", signer: signerV4, secure: true}, // Init and run test on Erasure backend. {serverType: "Erasure", signer: signerV4}, // Init and run test on ErasureSet backend. diff --git a/cmd/setup-type.go b/cmd/setup-type.go index 1b81f5a46..c540f2ecc 100644 --- a/cmd/setup-type.go +++ b/cmd/setup-type.go @@ -27,6 +27,9 @@ const ( // FSSetupType - FS setup type enum. FSSetupType + // ErasureSDSetupType - Erasure single drive setup enum. + ErasureSDSetupType + // ErasureSetupType - Erasure setup type enum. ErasureSetupType @@ -41,6 +44,8 @@ func (setupType SetupType) String() string { switch setupType { case FSSetupType: return globalMinioModeFS + case ErasureSDSetupType: + return globalMinioModeErasureSD case ErasureSetupType: return globalMinioModeErasure case DistErasureSetupType: diff --git a/cmd/storage-errors.go b/cmd/storage-errors.go index b45779305..35153d07e 100644 --- a/cmd/storage-errors.go +++ b/cmd/storage-errors.go @@ -116,6 +116,12 @@ var errDoneForNow = errors.New("done for now") // to proceed to next entry. var errSkipFile = errors.New("skip this file") +// Returned by FS drive mode when a fresh disk is specified. +var errFreshDisk = errors.New("FS backend requires existing disk") + +// errXLBackend XL drive mode requires fresh deployment. +var errXLBackend = errors.New("XL backend requires fresh disk") + // StorageErr represents error generated by xlStorage call. type StorageErr string diff --git a/cmd/storage-rest-server.go b/cmd/storage-rest-server.go index 27a07dc89..99a1d03c9 100644 --- a/cmd/storage-rest-server.go +++ b/cmd/storage-rest-server.go @@ -1140,6 +1140,10 @@ func checkDiskFatalErrs(errs []error) error { return errFaultyDisk } + if countErrs(errs, errXLBackend) == len(errs) { + return errXLBackend + } + return nil } @@ -1152,6 +1156,8 @@ func checkDiskFatalErrs(errs []error) error { // Do not like it :-( func logFatalErrs(err error, endpoint Endpoint, exit bool) { switch { + case errors.Is(err, errXLBackend): + logger.Fatal(config.ErrInvalidXLValue(err), "Unable to initialize backend") case errors.Is(err, errUnsupportedDisk): var hint string if endpoint.URL != nil { diff --git a/cmd/sts-handlers.go b/cmd/sts-handlers.go index 0c0a42bdc..7daeb17c0 100644 --- a/cmd/sts-handlers.go +++ b/cmd/sts-handlers.go @@ -389,7 +389,7 @@ func (sts *stsAPIHandlers) AssumeRoleWithSSO(w http.ResponseWriter, r *http.Requ policyName = globalIAMSys.CurrentPolicies(policies) } - if globalAuthZPlugin == nil { + if newGlobalAuthZPluginFn() == nil { if !ok { writeSTSErrorResponse(ctx, w, true, ErrSTSInvalidParameterValue, fmt.Errorf("%s claim missing from the JWT token, credentials will not be generated", iamPolicyClaimNameOpenID())) @@ -598,7 +598,7 @@ func (sts *stsAPIHandlers) AssumeRoleWithLDAPIdentity(w http.ResponseWriter, r * // Check if this user or their groups have a policy applied. ldapPolicies, _ := globalIAMSys.PolicyDBGet(ldapUserDN, false, groupDistNames...) - if len(ldapPolicies) == 0 && globalAuthZPlugin == nil { + if len(ldapPolicies) == 0 && newGlobalAuthZPluginFn() == nil { writeSTSErrorResponse(ctx, w, true, ErrSTSInvalidParameterValue, fmt.Errorf("expecting a policy to be set for user `%s` or one of their groups: `%s` - rejecting this request", ldapUserDN, strings.Join(groupDistNames, "`,`"))) diff --git a/cmd/sts-handlers_test.go b/cmd/sts-handlers_test.go index 5ef18ee9f..f91c187aa 100644 --- a/cmd/sts-handlers_test.go +++ b/cmd/sts-handlers_test.go @@ -42,10 +42,10 @@ func runAllIAMSTSTests(suite *TestSuiteIAM, c *check) { func TestIAMInternalIDPSTSServerSuite(t *testing.T) { baseTestCases := []TestSuiteCommon{ - // Init and run test on FS backend with signature v4. - {serverType: "FS", signer: signerV4}, - // Init and run test on FS backend, with tls enabled. - {serverType: "FS", signer: signerV4, secure: true}, + // Init and run test on ErasureSD backend with signature v4. + {serverType: "ErasureSD", signer: signerV4}, + // Init and run test on ErasureSD backend, with tls enabled. + {serverType: "ErasureSD", signer: signerV4, secure: true}, // Init and run test on Erasure backend. {serverType: "Erasure", signer: signerV4}, // Init and run test on ErasureSet backend. diff --git a/cmd/test-utils_test.go b/cmd/test-utils_test.go index 17559b6e7..98485e917 100644 --- a/cmd/test-utils_test.go +++ b/cmd/test-utils_test.go @@ -78,6 +78,8 @@ func TestMain(m *testing.M) { // set to 'true' when testing is invoked globalIsTesting = true + globalIsCICD = globalIsTesting + globalActiveCred = auth.Credentials{ AccessKey: auth.DefaultAccessKey, SecretKey: auth.DefaultSecretKey, @@ -191,10 +193,14 @@ func prepareFS() (ObjectLayer, string, error) { if err != nil { return nil, "", err } - obj, err := NewFSObjectLayer(fsDirs[0]) + obj, _, err := initObjectLayer(context.Background(), mustGetPoolEndpoints(fsDirs...)) if err != nil { return nil, "", err } + + initAllSubsystems() + + globalIAMSys.Init(context.Background(), obj, globalEtcdClient, 2*time.Second) return obj, fsDirs[0], nil } @@ -221,8 +227,7 @@ func prepareErasure16(ctx context.Context) (ObjectLayer, []string, error) { // Initialize FS objects. func initFSObjects(disk string, t *testing.T) (obj ObjectLayer) { - var err error - obj, err = NewFSObjectLayer(disk) + obj, _, err := initObjectLayer(context.Background(), mustGetPoolEndpoints(disk)) if err != nil { t.Fatal(err) } @@ -242,8 +247,8 @@ type TestErrHandler interface { } const ( - // FSTestStr is the string which is used as notation for Single node ObjectLayer in the unit tests. - FSTestStr string = "FS" + // ErasureSDStr is the string which is used as notation for Single node ObjectLayer in the unit tests. + ErasureSDStr string = "ErasureSD" // ErasureTestStr is the string which is used as notation for Erasure ObjectLayer in the unit tests. ErasureTestStr string = "Erasure" @@ -1469,20 +1474,9 @@ func getRandomDisks(N int) ([]string, error) { // Initialize object layer with the supplied disks, objectLayer is nil upon any error. func newTestObjectLayer(ctx context.Context, endpointServerPools EndpointServerPools) (newObject ObjectLayer, err error) { - // For FS only, directly use the disk. - if endpointServerPools.NEndpoints() == 1 { - // Initialize new FS object layer. - return NewFSObjectLayer(endpointServerPools[0].Endpoints[0].Path) - } - - z, err := newErasureServerPools(ctx, endpointServerPools) - if err != nil { - return nil, err - } - initAllSubsystems() - return z, nil + return newErasureServerPools(ctx, endpointServerPools) } // initObjectLayer - Instantiates object layer and returns it. @@ -1750,7 +1744,7 @@ func ExecObjectLayerAPITest(t *testing.T, objAPITest objAPITestType, endpoints [ credentials := globalActiveCred // Executing the object layer tests for single node setup. - objAPITest(objLayer, FSTestStr, bucketFS, fsAPIRouter, credentials, t) + objAPITest(objLayer, ErasureSDStr, bucketFS, fsAPIRouter, credentials, t) objLayer, erasureDisks, err := prepareErasure16(ctx) if err != nil { @@ -1816,7 +1810,7 @@ func ExecObjectLayerTest(t TestErrHandler, objTest objTestType) { globalIAMSys.Init(ctx, objLayer, globalEtcdClient, 2*time.Second) // Executing the object layer tests for single node setup. - objTest(objLayer, FSTestStr, t) + objTest(objLayer, ErasureSDStr, t) // Call clean up functions cancel() diff --git a/cmd/tier-handlers.go b/cmd/tier-handlers.go index 133d67208..068d9c9ba 100644 --- a/cmd/tier-handlers.go +++ b/cmd/tier-handlers.go @@ -74,7 +74,7 @@ func (api adminAPIHandlers) AddTierHandler(w http.ResponseWriter, r *http.Reques defer logger.AuditLog(ctx, w, r, mustGetClaimsFromToken(r)) - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -132,7 +132,7 @@ func (api adminAPIHandlers) ListTierHandler(w http.ResponseWriter, r *http.Reque defer logger.AuditLog(ctx, w, r, mustGetClaimsFromToken(r)) - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -158,7 +158,7 @@ func (api adminAPIHandlers) EditTierHandler(w http.ResponseWriter, r *http.Reque defer logger.AuditLog(ctx, w, r, mustGetClaimsFromToken(r)) - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -210,7 +210,7 @@ func (api adminAPIHandlers) RemoveTierHandler(w http.ResponseWriter, r *http.Req defer logger.AuditLog(ctx, w, r, mustGetClaimsFromToken(r)) - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -247,7 +247,7 @@ func (api adminAPIHandlers) VerifyTierHandler(w http.ResponseWriter, r *http.Req defer logger.AuditLog(ctx, w, r, mustGetClaimsFromToken(r)) - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } @@ -273,7 +273,7 @@ func (api adminAPIHandlers) TierStatsHandler(w http.ResponseWriter, r *http.Requ defer logger.AuditLog(ctx, w, r, mustGetClaimsFromToken(r)) - if !globalIsErasure { + if globalIsGateway { writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErr(ErrNotImplemented), r.URL) return } diff --git a/cmd/utils.go b/cmd/utils.go index 5be2e14cb..69ca691ee 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -906,6 +906,8 @@ func getMinioMode() string { mode = globalMinioModeErasure } else if globalIsGateway { mode = globalMinioModeGatewayPrefix + globalGatewayName + } else if globalIsErasureSD { + mode = globalMinioModeErasureSD } return mode } diff --git a/cmd/xl-storage-format-utils.go b/cmd/xl-storage-format-utils.go index 0de7d7656..51ef99a77 100644 --- a/cmd/xl-storage-format-utils.go +++ b/cmd/xl-storage-format-utils.go @@ -141,6 +141,9 @@ func getFileInfo(xlMetaBuf []byte, volume, path, versionID string, data bool) (F // Will return -1 for unknown values. func getXLDiskLoc(diskID string) (poolIdx, setIdx, diskIdx int) { if api := newObjectLayerFn(); api != nil { + if globalIsErasureSD { + return 0, 0, 0 + } if ep, ok := api.(*erasureServerPools); ok { if pool, set, disk, err := ep.getPoolAndSet(diskID); err == nil { return pool, set, disk diff --git a/cmd/xl-storage-format-v1.go b/cmd/xl-storage-format-v1.go index 03aa6feea..fce93af16 100644 --- a/cmd/xl-storage-format-v1.go +++ b/cmd/xl-storage-format-v1.go @@ -53,7 +53,7 @@ func isXLMetaFormatValid(version, format string) bool { // Verifies if the backend format metadata is sane by validating // the ErasureInfo, i.e. data and parity blocks. func isXLMetaErasureInfoValid(data, parity int) bool { - return ((data >= parity) && (data != 0) && (parity != 0)) + return ((data >= parity) && (data > 0) && (parity >= 0)) } //go:generate msgp -file=$GOFILE -unexported diff --git a/cmd/xl-storage-format_test.go b/cmd/xl-storage-format_test.go index b4e68c941..ba776837c 100644 --- a/cmd/xl-storage-format_test.go +++ b/cmd/xl-storage-format_test.go @@ -62,13 +62,15 @@ func TestIsXLMetaErasureInfoValid(t *testing.T) { {1, 5, 6, false}, {2, 5, 5, true}, {3, 0, 5, false}, - {4, 5, 0, false}, - {5, 5, 0, false}, - {6, 5, 4, true}, + {3, -1, 5, false}, + {4, 5, -1, false}, + {5, 5, 0, true}, + {6, 5, 0, true}, + {7, 5, 4, true}, } for _, tt := range tests { if got := isXLMetaErasureInfoValid(tt.data, tt.parity); got != tt.want { - t.Errorf("Test %d: Expected %v but received %v", tt.name, got, tt.want) + t.Errorf("Test %d: Expected %v but received %v -> %#v", tt.name, got, tt.want, tt) } } } diff --git a/docs/config/README.md b/docs/config/README.md index d9002459b..3241867a1 100644 --- a/docs/config/README.md +++ b/docs/config/README.md @@ -309,7 +309,7 @@ Example: The following settings will increase the heal operation speed by allowi Once set the healer settings are automatically applied without the need for server restarts. -> NOTE: Healing is not supported for gateway and single drive mode. +> NOTE: Healing is not supported for Gateway deployments. ## Environment only settings (not in config) diff --git a/docs/gateway/nas.md b/docs/gateway/nas.md index b06ed632f..e1a160974 100644 --- a/docs/gateway/nas.md +++ b/docs/gateway/nas.md @@ -1,4 +1,6 @@ -# MinIO NAS Gateway [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io) +# MinIO NAS Gateway [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io) + +> NAS gateway is deprecated and will be removed in future, no more fresh deployments are supported. MinIO Gateway adds Amazon S3 compatibility to NAS storage. You may run multiple minio instances on the same shared NAS volume as a distributed object gateway. diff --git a/internal/config/errors.go b/internal/config/errors.go index 5e3a22bb0..0c63d3459 100644 --- a/internal/config/errors.go +++ b/internal/config/errors.go @@ -19,6 +19,18 @@ package config // UI errors var ( + ErrInvalidFSValue = newErrFn( + "Invalid drive path", + "Please provide an existing deployment with MinIO", + "MinIO does not support newer NAS gateway deployments anymore refer https://github.com/minio/minio/issues/14331", + ) + + ErrInvalidXLValue = newErrFn( + "Invalid drive path", + "Please provide a fresh drive for single drive MinIO setup", + "MinIO only supports fresh drive paths", + ) + ErrInvalidBrowserValue = newErrFn( "Invalid console value", "Please check the passed value",