2021-04-18 15:41:13 -04:00
// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
2018-02-15 20:45:57 -05:00
package cmd
import (
2018-03-14 15:01:47 -04:00
"context"
2021-02-26 19:53:06 -05:00
"encoding/binary"
2021-06-09 13:51:19 -04:00
"encoding/json"
2020-08-18 17:37:26 -04:00
"errors"
2018-02-15 20:45:57 -05:00
"fmt"
"hash/crc32"
2020-10-13 21:28:42 -04:00
"math/rand"
2018-09-20 22:22:09 -04:00
"net/http"
2021-11-15 12:46:55 -05:00
"reflect"
2020-05-06 17:25:05 -04:00
"sort"
2018-02-15 20:45:57 -05:00
"sync"
"time"
2020-06-12 23:04:01 -04:00
"github.com/dchest/siphash"
[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
2021-03-06 17:09:34 -05:00
"github.com/dustin/go-humanize"
2020-06-12 23:04:01 -04:00
"github.com/google/uuid"
2021-05-06 11:52:02 -04:00
"github.com/minio/madmin-go"
2020-12-10 10:28:37 -05:00
"github.com/minio/minio-go/v7/pkg/set"
2020-07-14 12:38:05 -04:00
"github.com/minio/minio-go/v7/pkg/tags"
2021-06-01 17:59:40 -04:00
"github.com/minio/minio/internal/bpool"
"github.com/minio/minio/internal/dsync"
"github.com/minio/minio/internal/logger"
"github.com/minio/minio/internal/sync/errgroup"
2021-05-28 18:17:01 -04:00
"github.com/minio/pkg/console"
2018-02-15 20:45:57 -05:00
)
2019-11-13 15:17:45 -05:00
// setsDsyncLockers holds, for every erasure set, the list of distributed
// lockers serving that set. It is a named type so that methods (such as
// Close, per the original comment) can be attached to it.
type setsDsyncLockers [][]dsync.NetLocker
2020-06-12 23:04:01 -04:00
// erasureSets implements ObjectLayer combining a static list of erasure coded
2018-02-15 20:45:57 -05:00
// object sets. NOTE: There is no dynamic scaling allowed or intended in
// current design.
2020-06-12 23:04:01 -04:00
type erasureSets struct {
2020-05-19 16:53:54 -04:00
GatewayUnsupported
2020-06-12 23:04:01 -04:00
sets [ ] * erasureObjects
2018-02-15 20:45:57 -05:00
// Reference format.
2020-06-12 23:04:01 -04:00
format * formatErasureV3
2018-02-15 20:45:57 -05:00
2020-06-12 23:04:01 -04:00
// erasureDisks mutex to lock erasureDisks.
erasureDisksMu sync . RWMutex
2018-02-15 20:45:57 -05:00
// Re-ordered list of disks per set.
2020-06-12 23:04:01 -04:00
erasureDisks [ ] [ ] StorageAPI
2018-02-15 20:45:57 -05:00
2019-11-13 15:17:45 -05:00
// Distributed locker clients.
2020-06-12 23:04:01 -04:00
erasureLockers setsDsyncLockers
2019-11-13 15:17:45 -05:00
2020-09-25 22:21:52 -04:00
// Distributed lock owner (constant per running instance).
erasureLockOwner string
2018-02-15 20:45:57 -05:00
// List of endpoints provided on the command line.
2022-01-10 12:07:49 -05:00
endpoints PoolEndpoints
2018-02-15 20:45:57 -05:00
2020-03-24 21:53:24 -04:00
// String version of all the endpoints, an optimization
// to avoid url.String() conversion taking CPU on
// large disk setups.
endpointStrings [ ] string
2018-02-15 20:45:57 -05:00
// Total number of sets and the number of disks per set.
2020-08-26 22:29:35 -04:00
setCount , setDriveCount int
2021-01-16 15:08:02 -05:00
defaultParityCount int
2018-02-15 20:45:57 -05:00
2021-03-04 17:36:23 -05:00
poolIndex int
2021-01-26 16:21:51 -05:00
2021-03-18 14:19:02 -04:00
// A channel to send the set index to the MRF when
// any disk belonging to that set is connected
setReconnectEvent chan int
2020-01-15 21:30:32 -05:00
2018-02-15 20:45:57 -05:00
// Distribution algorithm of choice.
distributionAlgo string
2020-06-12 23:04:01 -04:00
deploymentID [ 16 ] byte
2018-02-15 20:45:57 -05:00
2020-05-26 15:52:24 -04:00
disksStorageInfoCache timedValue
2021-05-11 12:19:15 -04:00
lastConnectDisksOpTime time . Time
2018-02-15 20:45:57 -05:00
}
2021-06-16 17:26:26 -04:00
func ( s * erasureSets ) getDiskMap ( ) map [ Endpoint ] StorageAPI {
diskMap := make ( map [ Endpoint ] StorageAPI )
2020-03-25 02:26:13 -04:00
2020-06-12 23:04:01 -04:00
s . erasureDisksMu . RLock ( )
defer s . erasureDisksMu . RUnlock ( )
2018-02-15 20:45:57 -05:00
for i := 0 ; i < s . setCount ; i ++ {
2020-08-26 22:29:35 -04:00
for j := 0 ; j < s . setDriveCount ; j ++ {
2020-06-12 23:04:01 -04:00
disk := s . erasureDisks [ i ] [ j ]
2020-09-17 00:14:35 -04:00
if disk == OfflineDisk {
2018-02-15 20:45:57 -05:00
continue
}
2020-03-25 02:26:13 -04:00
if ! disk . IsOnline ( ) {
2018-02-15 20:45:57 -05:00
continue
}
2021-06-16 17:26:26 -04:00
diskMap [ disk . Endpoint ( ) ] = disk
2018-02-15 20:45:57 -05:00
}
}
2020-03-25 02:26:13 -04:00
return diskMap
2018-02-15 20:45:57 -05:00
}
// Initializes a new StorageAPI from the endpoint argument, returns
// StorageAPI and also `format` which exists on the disk.
2020-06-12 23:04:01 -04:00
func connectEndpoint ( endpoint Endpoint ) ( StorageAPI , * formatErasureV3 , error ) {
2020-10-26 13:29:29 -04:00
disk , err := newStorageAPIWithoutHealthCheck ( endpoint )
2018-02-15 20:45:57 -05:00
if err != nil {
return nil , nil , err
}
2020-06-12 23:04:01 -04:00
format , err := loadFormatErasure ( disk )
2018-02-15 20:45:57 -05:00
if err != nil {
2020-08-18 17:37:26 -04:00
if errors . Is ( err , errUnformattedDisk ) {
2020-09-04 12:45:06 -04:00
info , derr := disk . DiskInfo ( context . TODO ( ) )
2020-08-18 17:37:26 -04:00
if derr != nil && info . RootDisk {
2022-01-31 20:28:20 -05:00
return nil , nil , fmt . Errorf ( "Disk: %s is a root disk" , disk )
2020-08-18 17:37:26 -04:00
}
}
return nil , nil , fmt . Errorf ( "Disk: %s returned %w" , disk , err ) // make sure to '%w' to wrap the error
2018-02-15 20:45:57 -05:00
}
return disk , format , nil
}
2020-03-27 17:48:30 -04:00
// findDiskIndex - returns the i,j'th position of the input `diskID` against the reference
// format, after successful validation.
// - i'th position is the set index
// - j'th position is the disk index in the current set
2020-06-12 23:04:01 -04:00
func findDiskIndexByDiskID ( refFormat * formatErasureV3 , diskID string ) ( int , int , error ) {
2022-01-24 22:40:02 -05:00
if diskID == "" {
return - 1 , - 1 , errDiskNotFound
}
2020-03-27 17:48:30 -04:00
if diskID == offlineDiskUUID {
return - 1 , - 1 , fmt . Errorf ( "diskID: %s is offline" , diskID )
}
2020-06-12 23:04:01 -04:00
for i := 0 ; i < len ( refFormat . Erasure . Sets ) ; i ++ {
for j := 0 ; j < len ( refFormat . Erasure . Sets [ 0 ] ) ; j ++ {
if refFormat . Erasure . Sets [ i ] [ j ] == diskID {
2020-03-27 17:48:30 -04:00
return i , j , nil
}
}
}
return - 1 , - 1 , fmt . Errorf ( "diskID: %s not found" , diskID )
}
2018-02-15 20:45:57 -05:00
// findDiskIndex - returns the i,j'th position of the input `format` against the reference
// format, after successful validation.
2020-01-15 21:30:32 -05:00
// - i'th position is the set index
// - j'th position is the disk index in the current set
2020-06-12 23:04:01 -04:00
func findDiskIndex ( refFormat , format * formatErasureV3 ) ( int , int , error ) {
if err := formatErasureV3Check ( refFormat , format ) ; err != nil {
2018-02-15 20:45:57 -05:00
return 0 , 0 , err
}
2020-06-12 23:04:01 -04:00
if format . Erasure . This == offlineDiskUUID {
return - 1 , - 1 , fmt . Errorf ( "diskID: %s is offline" , format . Erasure . This )
2018-02-15 20:45:57 -05:00
}
2020-06-12 23:04:01 -04:00
for i := 0 ; i < len ( refFormat . Erasure . Sets ) ; i ++ {
for j := 0 ; j < len ( refFormat . Erasure . Sets [ 0 ] ) ; j ++ {
if refFormat . Erasure . Sets [ i ] [ j ] == format . Erasure . This {
2018-02-15 20:45:57 -05:00
return i , j , nil
}
}
}
2020-06-12 23:04:01 -04:00
return - 1 , - 1 , fmt . Errorf ( "diskID: %s not found" , format . Erasure . This )
2018-02-15 20:45:57 -05:00
}
2020-01-10 05:35:06 -05:00
// connectDisks - attempt to connect all the endpoints, loads format
2018-03-27 21:11:39 -04:00
// and re-arranges the disks in proper position.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) connectDisks ( ) {
2021-05-11 12:19:15 -04:00
defer func ( ) {
s . lastConnectDisksOpTime = time . Now ( )
} ( )
2020-03-25 02:26:13 -04:00
var wg sync . WaitGroup
diskMap := s . getDiskMap ( )
2022-02-21 18:51:54 -05:00
setsJustConnected := make ( [ ] bool , s . setCount )
2022-01-10 12:07:49 -05:00
for _ , endpoint := range s . endpoints . Endpoints {
2022-02-21 18:51:54 -05:00
cdisk := diskMap [ endpoint ]
if cdisk != nil && cdisk . IsOnline ( ) {
if s . lastConnectDisksOpTime . IsZero ( ) {
continue
}
// An online-disk means its a valid disk but it may be a re-connected disk
// we verify that here based on LastConn(), however we make sure to avoid
// putting it back into the s.erasureDisks by re-placing the disk again.
_ , setIndex , _ := cdisk . GetDiskLoc ( )
if setIndex != - 1 {
// Recently disconnected disks must go to MRF
setsJustConnected [ setIndex ] = cdisk . LastConn ( ) . After ( s . lastConnectDisksOpTime )
continue
}
2018-03-27 21:11:39 -04:00
}
2022-02-21 18:51:54 -05:00
2020-03-25 02:26:13 -04:00
wg . Add ( 1 )
go func ( endpoint Endpoint ) {
defer wg . Done ( )
disk , format , err := connectEndpoint ( endpoint )
if err != nil {
2020-09-04 20:09:02 -04:00
if endpoint . IsLocal && errors . Is ( err , errUnformattedDisk ) {
globalBackgroundHealState . pushHealLocalDisks ( endpoint )
} else {
printEndpointError ( endpoint , err , true )
}
2020-03-25 02:26:13 -04:00
return
}
2021-03-04 17:36:23 -05:00
if disk . IsLocal ( ) && disk . Healing ( ) != nil {
2020-10-24 16:23:08 -04:00
globalBackgroundHealState . pushHealLocalDisks ( disk . Endpoint ( ) )
}
2020-09-17 00:14:35 -04:00
s . erasureDisksMu . RLock ( )
2020-03-25 02:26:13 -04:00
setIndex , diskIndex , err := findDiskIndex ( s . format , format )
2020-09-17 00:14:35 -04:00
s . erasureDisksMu . RUnlock ( )
2020-03-25 02:26:13 -04:00
if err != nil {
2020-10-24 16:23:08 -04:00
printEndpointError ( endpoint , err , false )
2021-11-15 12:46:55 -05:00
disk . Close ( )
2020-03-25 02:26:13 -04:00
return
}
2020-09-28 22:39:32 -04:00
2020-06-12 23:04:01 -04:00
s . erasureDisksMu . Lock ( )
2021-11-15 12:46:55 -05:00
if currentDisk := s . erasureDisks [ setIndex ] [ diskIndex ] ; currentDisk != nil {
if ! reflect . DeepEqual ( currentDisk . Endpoint ( ) , disk . Endpoint ( ) ) {
err = fmt . Errorf ( "Detected unexpected disk ordering refusing to use the disk: expecting %s, found %s, refusing to use the disk" ,
currentDisk . Endpoint ( ) , disk . Endpoint ( ) )
printEndpointError ( endpoint , err , false )
disk . Close ( )
s . erasureDisksMu . Unlock ( )
return
}
2020-06-12 23:04:01 -04:00
s . erasureDisks [ setIndex ] [ diskIndex ] . Close ( )
2020-04-03 21:06:31 -04:00
}
2020-10-26 13:29:29 -04:00
if disk . IsLocal ( ) {
disk . SetDiskID ( format . Erasure . This )
s . erasureDisks [ setIndex ] [ diskIndex ] = disk
} else {
// Enable healthcheck disk for remote endpoint.
disk , err = newStorageAPI ( endpoint )
if err != nil {
printEndpointError ( endpoint , err , false )
2021-11-15 12:46:55 -05:00
s . erasureDisksMu . Unlock ( )
2020-10-26 13:29:29 -04:00
return
}
disk . SetDiskID ( format . Erasure . This )
s . erasureDisks [ setIndex ] [ diskIndex ] = disk
}
2021-03-04 17:36:23 -05:00
disk . SetDiskLoc ( s . poolIndex , setIndex , diskIndex )
2022-02-21 18:51:54 -05:00
setsJustConnected [ setIndex ] = true // disk just went online we treat it is as MRF event
2021-04-06 14:33:10 -04:00
s . erasureDisksMu . Unlock ( )
2020-03-25 02:26:13 -04:00
} ( endpoint )
2018-03-27 21:11:39 -04:00
}
2021-03-18 14:19:02 -04:00
2020-03-25 02:26:13 -04:00
wg . Wait ( )
2021-03-18 14:19:02 -04:00
go func ( ) {
for setIndex , justConnected := range setsJustConnected {
if ! justConnected {
continue
}
2022-02-28 12:13:19 -05:00
globalMRFState . newSetReconnected ( s . poolIndex , setIndex )
2021-03-18 14:19:02 -04:00
}
} ( )
2018-03-27 21:11:39 -04:00
}
2018-02-15 20:45:57 -05:00
// monitorAndConnectEndpoints this is a monitoring loop to keep track of disconnected
// endpoints by reconnecting them and making sure to place them into right position in
// the set topology, this monitoring happens at a given monitoring interval.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) monitorAndConnectEndpoints ( ctx context . Context , monitorInterval time . Duration ) {
2020-10-13 21:28:42 -04:00
r := rand . New ( rand . NewSource ( time . Now ( ) . UnixNano ( ) ) )
time . Sleep ( time . Duration ( r . Float64 ( ) * float64 ( time . Second ) ) )
// Pre-emptively connect the disks if possible.
s . connectDisks ( )
2020-12-16 17:33:05 -05:00
monitor := time . NewTimer ( monitorInterval )
defer monitor . Stop ( )
2018-02-15 20:45:57 -05:00
for {
select {
2020-03-18 19:19:29 -04:00
case <- ctx . Done ( ) :
2018-04-09 13:25:41 -04:00
return
2020-12-16 17:33:05 -05:00
case <- monitor . C :
2020-12-17 15:35:02 -05:00
// Reset the timer once fired for required interval.
monitor . Reset ( monitorInterval )
2020-12-16 17:33:05 -05:00
2020-12-17 19:52:47 -05:00
if serverDebugLog {
console . Debugln ( "running disk monitoring" )
}
2020-12-17 15:35:02 -05:00
s . connectDisks ( )
2020-12-16 17:33:05 -05:00
}
2019-11-13 15:17:45 -05:00
}
}
2020-09-25 22:21:52 -04:00
func ( s * erasureSets ) GetLockers ( setIndex int ) func ( ) ( [ ] dsync . NetLocker , string ) {
return func ( ) ( [ ] dsync . NetLocker , string ) {
2020-12-10 10:28:37 -05:00
lockers := make ( [ ] dsync . NetLocker , len ( s . erasureLockers [ setIndex ] ) )
2020-06-12 23:04:01 -04:00
copy ( lockers , s . erasureLockers [ setIndex ] )
2020-09-25 22:21:52 -04:00
return lockers , s . erasureLockOwner
2018-02-15 20:45:57 -05:00
}
}
2021-09-29 14:36:19 -04:00
func ( s * erasureSets ) GetEndpoints ( setIndex int ) func ( ) [ ] Endpoint {
return func ( ) [ ] Endpoint {
2020-06-12 23:04:01 -04:00
s . erasureDisksMu . RLock ( )
defer s . erasureDisksMu . RUnlock ( )
2020-06-10 20:10:31 -04:00
2021-09-29 14:36:19 -04:00
eps := make ( [ ] Endpoint , s . setDriveCount )
2020-08-26 22:29:35 -04:00
for i := 0 ; i < s . setDriveCount ; i ++ {
2022-01-10 12:07:49 -05:00
eps [ i ] = s . endpoints . Endpoints [ setIndex * s . setDriveCount + i ]
2020-06-10 20:10:31 -04:00
}
return eps
}
}
2018-02-15 20:45:57 -05:00
// GetDisks returns a closure for a given set, which provides list of disks per set.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) GetDisks ( setIndex int ) func ( ) [ ] StorageAPI {
2018-02-15 20:45:57 -05:00
return func ( ) [ ] StorageAPI {
2020-06-12 23:04:01 -04:00
s . erasureDisksMu . RLock ( )
defer s . erasureDisksMu . RUnlock ( )
2020-08-26 22:29:35 -04:00
disks := make ( [ ] StorageAPI , s . setDriveCount )
2020-06-12 23:04:01 -04:00
copy ( disks , s . erasureDisks [ setIndex ] )
2018-02-15 20:45:57 -05:00
return disks
}
}
2020-09-04 20:09:02 -04:00
// defaultMonitorConnectEndpointInterval is how often endpoint connections
// are monitored. It must be bigger than defaultMonitorNewDiskInterval, so it
// is defined as that interval plus five seconds.
const defaultMonitorConnectEndpointInterval = 5*time.Second + defaultMonitorNewDiskInterval
2018-02-15 20:45:57 -05:00
// Initialize new set of erasure coded sets.
2022-01-10 12:07:49 -05:00
func newErasureSets ( ctx context . Context , endpoints PoolEndpoints , storageDisks [ ] StorageAPI , format * formatErasureV3 , defaultParityCount , poolIdx int ) ( * erasureSets , error ) {
2020-06-12 23:04:01 -04:00
setCount := len ( format . Erasure . Sets )
2020-08-26 22:29:35 -04:00
setDriveCount := len ( format . Erasure . Sets [ 0 ] )
2020-04-27 17:39:57 -04:00
2022-01-10 12:07:49 -05:00
endpointStrings := make ( [ ] string , len ( endpoints . Endpoints ) )
for i , endpoint := range endpoints . Endpoints {
2021-09-25 13:51:03 -04:00
endpointStrings [ i ] = endpoint . String ( )
}
2020-06-12 23:04:01 -04:00
// Initialize the erasure sets instance.
s := & erasureSets {
2021-01-16 15:08:02 -05:00
sets : make ( [ ] * erasureObjects , setCount ) ,
erasureDisks : make ( [ ] [ ] StorageAPI , setCount ) ,
erasureLockers : make ( [ ] [ ] dsync . NetLocker , setCount ) ,
2021-03-26 14:37:58 -04:00
erasureLockOwner : globalLocalNodeName ,
2021-01-16 15:08:02 -05:00
endpoints : endpoints ,
endpointStrings : endpointStrings ,
setCount : setCount ,
setDriveCount : setDriveCount ,
defaultParityCount : defaultParityCount ,
format : format ,
2021-03-18 14:19:02 -04:00
setReconnectEvent : make ( chan int ) ,
2021-01-16 15:08:02 -05:00
distributionAlgo : format . Erasure . DistributionAlgo ,
deploymentID : uuid . MustParse ( format . ID ) ,
2021-03-04 17:36:23 -05:00
poolIndex : poolIdx ,
2018-02-15 20:45:57 -05:00
}
2020-06-12 23:04:01 -04:00
mutex := newNSLock ( globalIsDistErasure )
2018-06-01 19:41:23 -04:00
[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
2021-03-06 17:09:34 -05:00
// Number of buffers, max 2GB
n := ( 2 * humanize . GiByte ) / ( blockSizeV2 * 2 )
2018-06-01 19:41:23 -04:00
// Initialize byte pool once for all sets, bpool size is set to
[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
2021-03-06 17:09:34 -05:00
// setCount * setDriveCount with each memory upto blockSizeV2.
bp := bpool . NewBytePoolCap ( n , blockSizeV2 , blockSizeV2 * 2 )
2018-06-01 19:41:23 -04:00
2021-06-07 13:06:06 -04:00
// Initialize byte pool for all sets, bpool size is set to
// setCount * setDriveCount with each memory upto blockSizeV1
//
// Number of buffers, max 10GiB
m := ( 10 * humanize . GiByte ) / ( blockSizeV1 * 2 )
bpOld := bpool . NewBytePoolCap ( m , blockSizeV1 , blockSizeV1 * 2 )
2020-03-27 17:48:30 -04:00
for i := 0 ; i < setCount ; i ++ {
2020-08-26 22:29:35 -04:00
s . erasureDisks [ i ] = make ( [ ] StorageAPI , setDriveCount )
2020-12-10 10:28:37 -05:00
}
2022-01-02 12:15:06 -05:00
erasureLockers := map [ string ] dsync . NetLocker { }
2022-01-10 12:07:49 -05:00
for _ , endpoint := range endpoints . Endpoints {
2020-12-10 10:28:37 -05:00
if _ , ok := erasureLockers [ endpoint . Host ] ; ! ok {
erasureLockers [ endpoint . Host ] = newLockAPI ( endpoint )
}
2020-04-29 16:42:37 -04:00
}
2018-02-15 20:45:57 -05:00
2020-04-29 16:42:37 -04:00
for i := 0 ; i < setCount ; i ++ {
2022-01-02 12:15:06 -05:00
lockerEpSet := set . NewStringSet ( )
2020-08-26 22:29:35 -04:00
for j := 0 ; j < setDriveCount ; j ++ {
2022-01-10 12:07:49 -05:00
endpoint := endpoints . Endpoints [ i * setDriveCount + j ]
// Only add lockers only one per endpoint and per erasure set.
2020-12-10 10:28:37 -05:00
if locker , ok := erasureLockers [ endpoint . Host ] ; ok && ! lockerEpSet . Contains ( endpoint . Host ) {
lockerEpSet . Add ( endpoint . Host )
s . erasureLockers [ i ] = append ( s . erasureLockers [ i ] , locker )
}
2020-03-04 19:18:32 -05:00
}
2022-01-24 14:28:45 -05:00
}
2020-03-04 19:18:32 -05:00
2022-01-24 14:28:45 -05:00
var wg sync . WaitGroup
for i := 0 ; i < setCount ; i ++ {
wg . Add ( 1 )
go func ( i int ) {
defer wg . Done ( )
var innerWg sync . WaitGroup
for j := 0 ; j < setDriveCount ; j ++ {
disk := storageDisks [ i * setDriveCount + j ]
if disk == nil {
continue
}
innerWg . Add ( 1 )
go func ( disk StorageAPI , i , j int ) {
defer innerWg . Done ( )
diskID , err := disk . GetDiskID ( )
if err != nil {
if ! errors . Is ( err , errUnformattedDisk ) {
logger . LogIf ( ctx , err )
}
return
}
2022-01-24 22:40:02 -05:00
if diskID == "" {
return
}
2022-01-24 14:28:45 -05:00
m , n , err := findDiskIndexByDiskID ( format , diskID )
if err != nil {
logger . LogIf ( ctx , err )
return
}
if m != i || n != j {
logger . LogIf ( ctx , fmt . Errorf ( "Detected unexpected disk ordering refusing to use the disk - poolID: %s, found disk mounted at (set=%s, disk=%s) expected mount at (set=%s, disk=%s): %s(%s)" , humanize . Ordinal ( poolIdx + 1 ) , humanize . Ordinal ( m + 1 ) , humanize . Ordinal ( n + 1 ) , humanize . Ordinal ( i + 1 ) , humanize . Ordinal ( j + 1 ) , disk , diskID ) )
s . erasureDisks [ i ] [ j ] = & unrecognizedDisk { storage : disk }
return
}
disk . SetDiskLoc ( s . poolIndex , m , n )
s . endpointStrings [ m * setDriveCount + n ] = disk . String ( )
s . erasureDisks [ m ] [ n ] = disk
} ( disk , i , j )
}
innerWg . Wait ( )
// Initialize erasure objects for a given set.
s . sets [ i ] = & erasureObjects {
setIndex : i ,
poolIndex : poolIdx ,
setDriveCount : setDriveCount ,
defaultParityCount : defaultParityCount ,
getDisks : s . GetDisks ( i ) ,
getLockers : s . GetLockers ( i ) ,
getEndpoints : s . GetEndpoints ( i ) ,
deletedCleanupSleeper : newDynamicSleeper ( 10 , 2 * time . Second ) ,
nsMutex : mutex ,
bp : bp ,
bpOld : bpOld ,
}
} ( i )
2018-02-15 20:45:57 -05:00
}
2022-01-24 14:28:45 -05:00
wg . Wait ( )
2020-12-10 10:28:37 -05:00
// start cleanup stale uploads go-routine.
2021-10-04 13:52:28 -04:00
go s . cleanupStaleUploads ( ctx )
2020-12-10 10:28:37 -05:00
2021-02-26 12:52:27 -05:00
// start cleanup of deleted objects.
2021-10-04 13:52:28 -04:00
go s . cleanupDeletedObjects ( ctx )
2021-02-26 12:52:27 -05:00
2018-02-15 20:45:57 -05:00
// Start the disk monitoring and connect routine.
2020-10-31 17:10:12 -04:00
go s . monitorAndConnectEndpoints ( ctx , defaultMonitorConnectEndpointInterval )
2020-01-15 21:30:32 -05:00
2018-02-15 20:45:57 -05:00
return s , nil
}
2021-10-04 13:52:28 -04:00
// cleanup ".trash/" folder every 5m minutes with sufficient sleep cycles, between each
// deletes a dynamic sleeper is used with a factor of 10 ratio with max delay between
// deletes to be 2 seconds.
func ( s * erasureSets ) cleanupDeletedObjects ( ctx context . Context ) {
timer := time . NewTimer ( globalAPIConfig . getDeleteCleanupInterval ( ) )
2021-02-26 12:52:27 -05:00
defer timer . Stop ( )
for {
select {
case <- ctx . Done ( ) :
return
case <- timer . C :
// Reset for the next interval
2021-10-04 13:52:28 -04:00
timer . Reset ( globalAPIConfig . getDeleteCleanupInterval ( ) )
2021-02-26 12:52:27 -05:00
2022-02-11 17:22:48 -05:00
var wg sync . WaitGroup
2021-02-26 12:52:27 -05:00
for _ , set := range s . sets {
2022-02-11 17:22:48 -05:00
wg . Add ( 1 )
go func ( set * erasureObjects ) {
defer wg . Done ( )
if set == nil {
return
}
set . cleanupDeletedObjects ( ctx )
} ( set )
2021-02-26 12:52:27 -05:00
}
2022-02-11 17:22:48 -05:00
wg . Wait ( )
2021-02-26 12:52:27 -05:00
}
}
}
2021-10-04 13:52:28 -04:00
func ( s * erasureSets ) cleanupStaleUploads ( ctx context . Context ) {
timer := time . NewTimer ( globalAPIConfig . getStaleUploadsCleanupInterval ( ) )
2021-02-05 22:23:48 -05:00
defer timer . Stop ( )
2020-12-10 10:28:37 -05:00
for {
select {
case <- ctx . Done ( ) :
return
2021-02-05 22:23:48 -05:00
case <- timer . C :
// Reset for the next interval
2021-10-04 13:52:28 -04:00
timer . Reset ( globalAPIConfig . getStaleUploadsCleanupInterval ( ) )
2021-02-05 22:23:48 -05:00
2022-02-11 17:22:48 -05:00
var wg sync . WaitGroup
2020-12-10 10:28:37 -05:00
for _ , set := range s . sets {
2022-02-11 17:22:48 -05:00
wg . Add ( 1 )
go func ( set * erasureObjects ) {
defer wg . Done ( )
if set == nil {
return
}
set . cleanupStaleUploads ( ctx , globalAPIConfig . getStaleUploadsExpiry ( ) )
} ( set )
2020-12-10 10:28:37 -05:00
}
2022-02-11 17:22:48 -05:00
wg . Wait ( )
2020-12-10 10:28:37 -05:00
}
}
}
2021-01-26 16:21:51 -05:00
// objectErasureMapKey is the request-info tag key under which the
// object -> erasure set details map is stored.
const objectErasureMapKey = "objectErasureMap"

// auditObjectOp describes where an object operation landed: the pool, the
// set, and the disks of that set.
type auditObjectOp struct {
	Pool  int      `json:"poolId"`
	Set   int      `json:"setId"`
	Disks []string `json:"disks"`
}

// auditObjectErasureMap is a concurrency-safe map from object name to
// auditObjectOp.
type auditObjectErasureMap struct {
	sync.Map
}

// MarshalJSON defines how auditObjectErasureMap is marshaled so it can be
// printed in the audit webhook notification request: the entries are copied
// into a plain map[string]auditObjectOp which is then JSON encoded.
func (a *auditObjectErasureMap) MarshalJSON() ([]byte, error) {
	snapshot := map[string]auditObjectOp{}
	a.Range(func(key, value interface{}) bool {
		name := key.(string)
		snapshot[name] = value.(auditObjectOp)
		return true
	})
	return json.Marshal(snapshot)
}
2021-03-04 17:36:23 -05:00
func auditObjectErasureSet ( ctx context . Context , object string , set * erasureObjects ) {
2021-10-28 10:35:28 -04:00
if len ( logger . AuditTargets ( ) ) == 0 {
2021-01-26 16:21:51 -05:00
return
}
object = decodeDirObject ( object )
2021-09-29 14:36:19 -04:00
var disksEndpoints [ ] string
for _ , endpoint := range set . getEndpoints ( ) {
disksEndpoints = append ( disksEndpoints , endpoint . String ( ) )
}
2021-01-26 16:21:51 -05:00
op := auditObjectOp {
2021-03-04 17:36:23 -05:00
Pool : set . poolIndex + 1 ,
Set : set . setIndex + 1 ,
2021-09-29 14:36:19 -04:00
Disks : disksEndpoints ,
2021-01-26 16:21:51 -05:00
}
2021-06-09 13:51:19 -04:00
var objectErasureSetTag * auditObjectErasureMap
2021-01-26 16:21:51 -05:00
reqInfo := logger . GetReqInfo ( ctx )
for _ , kv := range reqInfo . GetTags ( ) {
if kv . Key == objectErasureMapKey {
2021-06-09 13:51:19 -04:00
objectErasureSetTag = kv . Val . ( * auditObjectErasureMap )
2021-01-26 16:21:51 -05:00
break
}
}
if objectErasureSetTag == nil {
2021-06-09 13:51:19 -04:00
objectErasureSetTag = & auditObjectErasureMap { }
2021-01-26 16:21:51 -05:00
}
2021-06-09 13:51:19 -04:00
objectErasureSetTag . Store ( object , op )
2021-01-26 16:21:51 -05:00
reqInfo . SetTags ( objectErasureMapKey , objectErasureSetTag )
}
2019-11-13 15:17:45 -05:00
// NewNSLock - initialize a new namespace RWLocker instance.
2020-11-04 11:25:42 -05:00
func ( s * erasureSets ) NewNSLock ( bucket string , objects ... string ) RWLocker {
2020-02-21 00:59:57 -05:00
if len ( objects ) == 1 {
2020-11-04 11:25:42 -05:00
return s . getHashedSet ( objects [ 0 ] ) . NewNSLock ( bucket , objects ... )
2020-02-21 00:59:57 -05:00
}
2020-11-04 11:25:42 -05:00
return s . getHashedSet ( "" ) . NewNSLock ( bucket , objects ... )
2019-11-13 15:17:45 -05:00
}
2020-08-05 16:31:12 -04:00
// SetDriveCount returns the current drives per set.
func ( s * erasureSets ) SetDriveCount ( ) int {
2020-08-26 22:29:35 -04:00
return s . setDriveCount
2020-08-05 16:31:12 -04:00
}
2021-01-16 15:08:02 -05:00
// ParityCount reports the default parity count used while erasure coding
// objects.
func (s *erasureSets) ParityCount() int {
	return s.defaultParityCount
}
2020-05-28 16:03:04 -04:00
// StorageUsageInfo - combines output of StorageInfo across all erasure coded object sets.
2020-12-01 16:50:33 -05:00
// This only returns disk usage info for ServerPools to perform placement decision, this call
2020-05-28 16:03:04 -04:00
// is not implemented in Object interface and is not meant to be used by other object
// layer implementations.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) StorageUsageInfo ( ctx context . Context ) StorageInfo {
2020-05-28 16:03:04 -04:00
storageUsageInfo := func ( ) StorageInfo {
var storageInfo StorageInfo
storageInfos := make ( [ ] StorageInfo , len ( s . sets ) )
2021-03-04 17:36:23 -05:00
storageInfo . Backend . Type = madmin . Erasure
2020-05-28 16:03:04 -04:00
g := errgroup . WithNErrs ( len ( s . sets ) )
for index := range s . sets {
index := index
g . Go ( func ( ) error {
// ignoring errors on purpose
2021-01-04 12:42:09 -05:00
storageInfos [ index ] , _ = s . sets [ index ] . StorageInfo ( ctx )
2020-05-28 16:03:04 -04:00
return nil
} , index )
}
// Wait for the go routines.
g . Wait ( )
for _ , lstorageInfo := range storageInfos {
2020-07-13 12:51:07 -04:00
storageInfo . Disks = append ( storageInfo . Disks , lstorageInfo . Disks ... )
2020-05-28 16:03:04 -04:00
}
return storageInfo
}
2020-05-26 15:52:24 -04:00
s . disksStorageInfoCache . Once . Do ( func ( ) {
s . disksStorageInfoCache . TTL = time . Second
s . disksStorageInfoCache . Update = func ( ) ( interface { } , error ) {
2020-05-28 16:03:04 -04:00
return storageUsageInfo ( ) , nil
2020-05-26 15:52:24 -04:00
}
} )
2020-05-28 16:03:04 -04:00
2020-05-26 15:52:24 -04:00
v , _ := s . disksStorageInfoCache . Get ( )
return v . ( StorageInfo )
}
2020-05-28 16:03:04 -04:00
// StorageInfo - combines output of StorageInfo across all erasure coded object sets.
2021-01-04 12:42:09 -05:00
func ( s * erasureSets ) StorageInfo ( ctx context . Context ) ( StorageInfo , [ ] error ) {
2021-03-04 17:36:23 -05:00
var storageInfo madmin . StorageInfo
2019-08-22 23:02:40 -04:00
2021-03-04 17:36:23 -05:00
storageInfos := make ( [ ] madmin . StorageInfo , len ( s . sets ) )
2020-05-28 16:03:04 -04:00
storageInfoErrs := make ( [ ] [ ] error , len ( s . sets ) )
2019-10-14 12:44:51 -04:00
g := errgroup . WithNErrs ( len ( s . sets ) )
for index := range s . sets {
index := index
g . Go ( func ( ) error {
2021-01-04 12:42:09 -05:00
storageInfos [ index ] , storageInfoErrs [ index ] = s . sets [ index ] . StorageInfo ( ctx )
2019-10-14 12:44:51 -04:00
return nil
} , index )
2019-08-22 23:02:40 -04:00
}
2019-10-14 12:44:51 -04:00
2019-08-22 23:02:40 -04:00
// Wait for the go routines.
2019-10-14 12:44:51 -04:00
g . Wait ( )
2019-08-22 23:02:40 -04:00
for _ , lstorageInfo := range storageInfos {
2020-07-13 12:51:07 -04:00
storageInfo . Disks = append ( storageInfo . Disks , lstorageInfo . Disks ... )
2018-02-15 20:45:57 -05:00
}
2021-04-12 16:45:06 -04:00
errs := make ( [ ] error , 0 , len ( s . sets ) * s . setDriveCount )
2020-05-28 16:03:04 -04:00
for i := range s . sets {
errs = append ( errs , storageInfoErrs [ i ] ... )
}
return storageInfo , errs
2018-02-15 20:45:57 -05:00
}
2021-03-02 20:28:04 -05:00
// LocalStorageInfo - calls LocalStorageInfo on every erasure coded set
// concurrently, concatenates the drives they report into one StorageInfo and
// returns it together with the errors of all sets flattened into a single slice.
func (s *erasureSets) LocalStorageInfo(ctx context.Context) (StorageInfo, []error) {
	setCount := len(s.sets)
	perSetInfo := make([]StorageInfo, setCount)
	perSetErrs := make([][]error, setCount)

	g := errgroup.WithNErrs(setCount)
	for i := range s.sets {
		i := i
		g.Go(func() error {
			perSetInfo[i], perSetErrs[i] = s.sets[i].LocalStorageInfo(ctx)
			return nil
		}, i)
	}

	// Wait for the go routines.
	g.Wait()

	var combined StorageInfo
	for i := range perSetInfo {
		combined.Disks = append(combined.Disks, perSetInfo[i].Disks...)
	}

	var allErrs []error
	for _, setErrs := range perSetErrs {
		allErrs = append(allErrs, setErrs...)
	}

	return combined, allErrs
}
2018-02-15 20:45:57 -05:00
// Shutdown shutsdown all erasure coded sets in parallel
// returns error upon first error.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) Shutdown ( ctx context . Context ) error {
2018-02-15 20:45:57 -05:00
g := errgroup . WithNErrs ( len ( s . sets ) )
for index := range s . sets {
index := index
g . Go ( func ( ) error {
2018-03-14 15:01:47 -04:00
return s . sets [ index ] . Shutdown ( ctx )
2018-02-15 20:45:57 -05:00
} , index )
}
for _ , err := range g . Wait ( ) {
if err != nil {
return err
}
}
2020-09-10 12:18:19 -04:00
select {
2021-03-18 14:19:02 -04:00
case _ , ok := <- s . setReconnectEvent :
2020-09-10 12:18:19 -04:00
if ok {
2021-03-18 14:19:02 -04:00
close ( s . setReconnectEvent )
2020-09-10 12:18:19 -04:00
}
default :
2021-03-18 14:19:02 -04:00
close ( s . setReconnectEvent )
2020-09-10 12:18:19 -04:00
}
2018-02-15 20:45:57 -05:00
return nil
}
2020-05-12 18:20:42 -04:00
// MakeBucketLocation - creates a new bucket across all sets simultaneously,
// then return the first encountered error
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) MakeBucketWithLocation ( ctx context . Context , bucket string , opts BucketOptions ) error {
2018-02-15 20:45:57 -05:00
g := errgroup . WithNErrs ( len ( s . sets ) )
// Create buckets in parallel across all sets.
for index := range s . sets {
index := index
g . Go ( func ( ) error {
2020-06-12 23:04:01 -04:00
return s . sets [ index ] . MakeBucketWithLocation ( ctx , bucket , opts )
2018-02-15 20:45:57 -05:00
} , index )
}
errs := g . Wait ( )
2020-05-12 18:20:42 -04:00
// Return the first encountered error
2018-02-15 20:45:57 -05:00
for _ , err := range errs {
2018-02-16 23:16:48 -05:00
if err != nil {
return err
2018-02-15 20:45:57 -05:00
}
}
// Success.
return nil
}
// hashes the key returning an integer based on the input algorithm.
// This function currently supports
// - CRCMOD
2020-06-12 23:04:01 -04:00
// - SIPMOD
2018-02-15 20:45:57 -05:00
// - all new algos.
2020-06-12 23:04:01 -04:00
func sipHashMod ( key string , cardinality int , id [ 16 ] byte ) int {
if cardinality <= 0 {
return - 1
}
2021-02-26 19:53:06 -05:00
// use the faster version as per siphash docs
// https://github.com/dchest/siphash#usage
k0 , k1 := binary . LittleEndian . Uint64 ( id [ 0 : 8 ] ) , binary . LittleEndian . Uint64 ( id [ 8 : 16 ] )
sum64 := siphash . Hash ( k0 , k1 , [ ] byte ( key ) )
return int ( sum64 % uint64 ( cardinality ) )
2020-06-12 23:04:01 -04:00
}
2018-02-15 20:45:57 -05:00
// crcHashMod - computes the IEEE CRC-32 checksum of key and reduces it modulo
// cardinality. Returns -1 when cardinality is zero or negative.
func crcHashMod(key string, cardinality int) int {
	if cardinality > 0 {
		sum := crc32.ChecksumIEEE([]byte(key))
		return int(sum % uint32(cardinality))
	}
	return -1
}
2020-06-12 23:04:01 -04:00
func hashKey ( algo string , key string , cardinality int , id [ 16 ] byte ) int {
2018-02-15 20:45:57 -05:00
switch algo {
2021-01-16 15:08:02 -05:00
case formatErasureVersionV2DistributionAlgoV1 :
2018-02-15 20:45:57 -05:00
return crcHashMod ( key , cardinality )
2021-01-16 15:08:02 -05:00
case formatErasureVersionV3DistributionAlgoV2 , formatErasureVersionV3DistributionAlgoV3 :
2020-06-12 23:04:01 -04:00
return sipHashMod ( key , cardinality , id )
2018-08-06 13:26:40 -04:00
default :
// Unknown algorithm returns -1, also if cardinality is lesser than 0.
return - 1
2018-02-15 20:45:57 -05:00
}
}
2019-05-13 15:25:49 -04:00
// Returns always a same erasure coded set for a given input.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) getHashedSetIndex ( input string ) int {
return hashKey ( s . distributionAlgo , input , len ( s . sets ) , s . deploymentID )
2019-05-13 15:25:49 -04:00
}
2018-02-15 20:45:57 -05:00
// Returns always a same erasure coded set for a given input.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) getHashedSet ( input string ) ( set * erasureObjects ) {
2019-05-13 15:25:49 -04:00
return s . sets [ s . getHashedSetIndex ( input ) ]
2018-02-15 20:45:57 -05:00
}
// GetBucketInfo - returns bucket info from one of the erasure coded set.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) GetBucketInfo ( ctx context . Context , bucket string ) ( bucketInfo BucketInfo , err error ) {
2019-11-19 20:42:27 -05:00
return s . getHashedSet ( "" ) . GetBucketInfo ( ctx , bucket )
2018-02-15 20:45:57 -05:00
}
// IsNotificationSupported returns whether bucket notification is applicable for this layer.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) IsNotificationSupported ( ) bool {
2018-02-15 20:45:57 -05:00
return s . getHashedSet ( "" ) . IsNotificationSupported ( )
}
2020-07-20 15:52:49 -04:00
// IsListenSupported returns whether listen bucket notification is applicable for this layer.
func ( s * erasureSets ) IsListenSupported ( ) bool {
2018-12-05 17:03:42 -05:00
return true
}
2019-01-05 17:16:43 -05:00
// IsEncryptionSupported returns whether server side encryption is implemented for this layer.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) IsEncryptionSupported ( ) bool {
2018-02-15 20:45:57 -05:00
return s . getHashedSet ( "" ) . IsEncryptionSupported ( )
}
2018-09-27 23:36:17 -04:00
// IsCompressionSupported returns whether compression is applicable for this layer.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) IsCompressionSupported ( ) bool {
2018-09-27 23:36:17 -04:00
return s . getHashedSet ( "" ) . IsCompressionSupported ( )
}
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) IsTaggingSupported ( ) bool {
2020-05-23 14:09:35 -04:00
return true
}
2018-02-15 20:45:57 -05:00
// DeleteBucket - deletes a bucket on all sets simultaneously,
// even if one of the sets fail to delete buckets, we proceed to
// undo a successful operation.
2021-10-06 13:24:40 -04:00
func ( s * erasureSets ) DeleteBucket ( ctx context . Context , bucket string , opts DeleteBucketOptions ) error {
2018-02-15 20:45:57 -05:00
g := errgroup . WithNErrs ( len ( s . sets ) )
// Delete buckets in parallel across all sets.
for index := range s . sets {
index := index
g . Go ( func ( ) error {
2021-10-06 13:24:40 -04:00
return s . sets [ index ] . DeleteBucket ( ctx , bucket , opts )
2018-02-15 20:45:57 -05:00
} , index )
}
errs := g . Wait ( )
2020-04-27 17:18:02 -04:00
// For any failure, we attempt undo all the delete buckets operation
// by creating buckets again on all sets which were successfully deleted.
2018-02-15 20:45:57 -05:00
for _ , err := range errs {
2021-10-06 13:24:40 -04:00
if err != nil && ! opts . NoRecreate {
2020-06-12 23:04:01 -04:00
undoDeleteBucketSets ( ctx , bucket , s . sets , errs )
2018-02-16 23:16:48 -05:00
return err
2018-02-15 20:45:57 -05:00
}
}
// Success.
return nil
}
// This function is used to undo a successful DeleteBucket operation.
2020-06-12 23:04:01 -04:00
func undoDeleteBucketSets ( ctx context . Context , bucket string , sets [ ] * erasureObjects , errs [ ] error ) {
2018-02-15 20:45:57 -05:00
g := errgroup . WithNErrs ( len ( sets ) )
// Undo previous delete bucket on all underlying sets.
for index := range sets {
index := index
2019-10-14 12:44:51 -04:00
g . Go ( func ( ) error {
if errs [ index ] == nil {
2020-06-12 23:04:01 -04:00
return sets [ index ] . MakeBucketWithLocation ( ctx , bucket , BucketOptions { } )
2019-10-14 12:44:51 -04:00
}
return nil
} , index )
2018-02-15 20:45:57 -05:00
}
g . Wait ( )
}
// List all buckets from one of the set, we are not doing merge
// sort here just for simplification. As per design it is assumed
// that all buckets are present on all sets.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) ListBuckets ( ctx context . Context ) ( buckets [ ] BucketInfo , err error ) {
2020-12-14 15:07:07 -05:00
var listBuckets [ ] BucketInfo
2022-01-02 12:15:06 -05:00
healBuckets := map [ string ] VolInfo { }
2020-12-14 15:07:07 -05:00
for _ , set := range s . sets {
// lists all unique buckets across drives.
2021-11-15 12:46:55 -05:00
if err := listAllBuckets ( ctx , set . getDisks ( ) , healBuckets , s . defaultParityCount ) ; err != nil {
2020-12-14 15:07:07 -05:00
return nil , err
}
}
2020-12-15 20:34:54 -05:00
2020-12-14 15:07:07 -05:00
for _ , v := range healBuckets {
2021-09-17 18:02:21 -04:00
listBuckets = append ( listBuckets , BucketInfo ( v ) )
2020-12-14 15:07:07 -05:00
}
2020-12-15 20:34:54 -05:00
sort . Slice ( listBuckets , func ( i , j int ) bool {
return listBuckets [ i ] . Name < listBuckets [ j ] . Name
} )
2020-12-14 15:07:07 -05:00
return listBuckets , nil
2018-02-15 20:45:57 -05:00
}
// --- Object Operations ---
2018-09-20 22:22:09 -04:00
// GetObjectNInfo - returns object info and locked object ReadCloser
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) GetObjectNInfo ( ctx context . Context , bucket , object string , rs * HTTPRangeSpec , h http . Header , lockType LockType , opts ObjectOptions ) ( gr * GetObjectReader , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . GetObjectNInfo ( ctx , bucket , object , rs , h , lockType , opts )
2018-09-20 22:22:09 -04:00
}
2018-02-15 20:45:57 -05:00
// PutObject - writes an object to hashedSet based on the object name.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) PutObject ( ctx context . Context , bucket string , object string , data * PutObjReader , opts ObjectOptions ) ( objInfo ObjectInfo , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . PutObject ( ctx , bucket , object , data , opts )
2018-02-15 20:45:57 -05:00
}
// GetObjectInfo - reads object metadata from the hashedSet based on the object name.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) GetObjectInfo ( ctx context . Context , bucket , object string , opts ObjectOptions ) ( objInfo ObjectInfo , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . GetObjectInfo ( ctx , bucket , object , opts )
2018-02-15 20:45:57 -05:00
}
2021-06-15 21:43:14 -04:00
func ( s * erasureSets ) deletePrefix ( ctx context . Context , bucket string , prefix string ) error {
2021-09-17 22:34:48 -04:00
var wg sync . WaitGroup
wg . Add ( len ( s . sets ) )
2021-06-15 21:43:14 -04:00
for _ , s := range s . sets {
2021-09-17 22:34:48 -04:00
go func ( s * erasureObjects ) {
defer wg . Done ( )
// This is a force delete, no reason to throw errors.
s . DeleteObject ( ctx , bucket , prefix , ObjectOptions { DeletePrefix : true } )
} ( s )
2021-06-15 21:43:14 -04:00
}
2021-09-17 22:34:48 -04:00
wg . Wait ( )
2021-06-15 21:43:14 -04:00
return nil
}
2018-02-15 20:45:57 -05:00
// DeleteObject - deletes an object from the hashedSet based on the object name.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) DeleteObject ( ctx context . Context , bucket string , object string , opts ObjectOptions ) ( objInfo ObjectInfo , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-06-15 21:43:14 -04:00
if opts . DeletePrefix {
err := s . deletePrefix ( ctx , bucket , object )
return ObjectInfo { } , err
}
2021-01-26 16:21:51 -05:00
return set . DeleteObject ( ctx , bucket , object , opts )
2018-02-15 20:45:57 -05:00
}
2019-05-13 15:25:49 -04:00
// DeleteObjects - bulk delete of objects
// Bulk delete is only possible within one set. For that purpose
// objects are group by set first, and then bulk delete is invoked
// for each set, the error response of each delete will be returned
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) DeleteObjects ( ctx context . Context , bucket string , objects [ ] ObjectToDelete , opts ObjectOptions ) ( [ ] DeletedObject , [ ] error ) {
2019-05-13 15:25:49 -04:00
type delObj struct {
// Set index associated to this object
setIndex int
// Original index from the list of arguments
// where this object is passed
origIndex int
2020-06-12 23:04:01 -04:00
// object to delete
object ObjectToDelete
2019-05-13 15:25:49 -04:00
}
// Transform []delObj to the list of object names
2020-06-12 23:04:01 -04:00
toNames := func ( delObjs [ ] delObj ) [ ] ObjectToDelete {
objs := make ( [ ] ObjectToDelete , len ( delObjs ) )
2019-05-13 15:25:49 -04:00
for i , obj := range delObjs {
2020-06-12 23:04:01 -04:00
objs [ i ] = obj . object
2019-05-13 15:25:49 -04:00
}
2020-06-12 23:04:01 -04:00
return objs
2019-05-13 15:25:49 -04:00
}
// The result of delete operation on all passed objects
2022-01-02 12:15:06 -05:00
delErrs := make ( [ ] error , len ( objects ) )
2019-05-13 15:25:49 -04:00
2020-06-12 23:04:01 -04:00
// The result of delete objects
2022-01-02 12:15:06 -05:00
delObjects := make ( [ ] DeletedObject , len ( objects ) )
2020-06-12 23:04:01 -04:00
2019-05-13 15:25:49 -04:00
// A map between a set and its associated objects
2022-01-02 12:15:06 -05:00
objSetMap := make ( map [ int ] [ ] delObj )
2019-05-13 15:25:49 -04:00
// Group objects by set index
for i , object := range objects {
2020-06-12 23:04:01 -04:00
index := s . getHashedSetIndex ( object . ObjectName )
objSetMap [ index ] = append ( objSetMap [ index ] , delObj { setIndex : index , origIndex : i , object : object } )
2019-05-13 15:25:49 -04:00
}
// Invoke bulk delete on objects per set and save
// the result of the delete operation
2022-01-06 13:47:49 -05:00
var wg sync . WaitGroup
var mu sync . Mutex
wg . Add ( len ( objSetMap ) )
for setIdx , objsGroup := range objSetMap {
go func ( set * erasureObjects , group [ ] delObj ) {
defer wg . Done ( )
dobjects , errs := set . DeleteObjects ( ctx , bucket , toNames ( group ) , opts )
mu . Lock ( )
defer mu . Unlock ( )
for i , obj := range group {
delErrs [ obj . origIndex ] = errs [ i ]
delObjects [ obj . origIndex ] = dobjects [ i ]
if errs [ i ] == nil {
auditObjectErasureSet ( ctx , obj . object . ObjectName , set )
}
2021-01-26 16:21:51 -05:00
}
2022-01-06 13:47:49 -05:00
} ( s . sets [ setIdx ] , objsGroup )
2019-05-13 15:25:49 -04:00
}
2022-01-06 13:47:49 -05:00
wg . Wait ( )
2019-05-13 15:25:49 -04:00
2020-06-12 23:04:01 -04:00
return delObjects , delErrs
2019-05-13 15:25:49 -04:00
}
2018-02-15 20:45:57 -05:00
// CopyObject - copies objects from one hashedSet to another hashedSet, on server side.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) CopyObject ( ctx context . Context , srcBucket , srcObject , dstBucket , dstObject string , srcInfo ObjectInfo , srcOpts , dstOpts ObjectOptions ) ( objInfo ObjectInfo , err error ) {
2018-02-15 20:45:57 -05:00
srcSet := s . getHashedSet ( srcObject )
2020-05-28 17:36:38 -04:00
dstSet := s . getHashedSet ( dstObject )
2018-02-15 20:45:57 -05:00
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , dstObject , dstSet )
2021-01-26 16:21:51 -05:00
2020-08-03 19:21:10 -04:00
cpSrcDstSame := srcSet == dstSet
2018-02-15 20:45:57 -05:00
// Check if this request is only metadata update.
2020-08-03 19:21:10 -04:00
if cpSrcDstSame && srcInfo . metadataOnly {
2020-09-14 18:57:13 -04:00
// Version ID is set for the destination and source == destination version ID.
// perform an in-place update.
2020-06-19 11:44:51 -04:00
if dstOpts . VersionID != "" && srcOpts . VersionID == dstOpts . VersionID {
return srcSet . CopyObject ( ctx , srcBucket , srcObject , dstBucket , dstObject , srcInfo , srcOpts , dstOpts )
}
2020-09-14 18:57:13 -04:00
// Destination is not versioned and source version ID is empty
// perform an in-place update.
2020-06-19 11:44:51 -04:00
if ! dstOpts . Versioned && srcOpts . VersionID == "" {
return srcSet . CopyObject ( ctx , srcBucket , srcObject , dstBucket , dstObject , srcInfo , srcOpts , dstOpts )
}
2020-08-03 19:21:10 -04:00
// CopyObject optimization where we don't create an entire copy
// of the content, instead we add a reference, we disallow legacy
// objects to be self referenced in this manner so make sure
// that we actually create a new dataDir for legacy objects.
if dstOpts . Versioned && srcOpts . VersionID != dstOpts . VersionID && ! srcInfo . Legacy {
srcInfo . versionOnly = true
return srcSet . CopyObject ( ctx , srcBucket , srcObject , dstBucket , dstObject , srcInfo , srcOpts , dstOpts )
}
2018-02-15 20:45:57 -05:00
}
2020-06-17 14:13:41 -04:00
putOpts := ObjectOptions {
ServerSideEncryption : dstOpts . ServerSideEncryption ,
UserDefined : srcInfo . UserDefined ,
Versioned : dstOpts . Versioned ,
VersionID : dstOpts . VersionID ,
2020-11-19 14:50:22 -05:00
MTime : dstOpts . MTime ,
2020-06-17 14:13:41 -04:00
}
2020-06-19 11:44:51 -04:00
2020-05-28 17:36:38 -04:00
return dstSet . putObject ( ctx , dstBucket , dstObject , srcInfo . PutObjReader , putOpts )
2018-02-15 20:45:57 -05:00
}
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) ListMultipartUploads ( ctx context . Context , bucket , prefix , keyMarker , uploadIDMarker , delimiter string , maxUploads int ) ( result ListMultipartsInfo , err error ) {
2018-02-15 20:45:57 -05:00
// In list multipart uploads we are going to treat input prefix as the object,
// this means that we are not supporting directory navigation.
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( prefix )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , prefix , set )
2021-01-26 16:21:51 -05:00
return set . ListMultipartUploads ( ctx , bucket , prefix , keyMarker , uploadIDMarker , delimiter , maxUploads )
2018-02-15 20:45:57 -05:00
}
// Initiate a new multipart upload on a hashedSet based on object name.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) NewMultipartUpload ( ctx context . Context , bucket , object string , opts ObjectOptions ) ( uploadID string , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . NewMultipartUpload ( ctx , bucket , object , opts )
2018-02-15 20:45:57 -05:00
}
// Copies a part of an object from source hashedSet to destination hashedSet.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) CopyObjectPart ( ctx context . Context , srcBucket , srcObject , destBucket , destObject string , uploadID string , partID int ,
2022-04-13 15:00:11 -04:00
startOffset int64 , length int64 , srcInfo ObjectInfo , srcOpts , dstOpts ObjectOptions ,
) ( partInfo PartInfo , err error ) {
2018-02-15 20:45:57 -05:00
destSet := s . getHashedSet ( destObject )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , destObject , destSet )
2021-02-10 11:52:50 -05:00
return destSet . PutObjectPart ( ctx , destBucket , destObject , uploadID , partID , NewPutObjReader ( srcInfo . Reader ) , dstOpts )
2018-02-15 20:45:57 -05:00
}
// PutObjectPart - writes part of an object to hashedSet based on the object name.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) PutObjectPart ( ctx context . Context , bucket , object , uploadID string , partID int , data * PutObjReader , opts ObjectOptions ) ( info PartInfo , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . PutObjectPart ( ctx , bucket , object , uploadID , partID , data , opts )
2018-02-15 20:45:57 -05:00
}
2020-05-28 15:36:20 -04:00
// GetMultipartInfo - return multipart metadata info uploaded at hashedSet.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) GetMultipartInfo ( ctx context . Context , bucket , object , uploadID string , opts ObjectOptions ) ( result MultipartInfo , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . GetMultipartInfo ( ctx , bucket , object , uploadID , opts )
2020-05-28 15:36:20 -04:00
}
2018-02-15 20:45:57 -05:00
// ListObjectParts - lists all uploaded parts to an object in hashedSet.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) ListObjectParts ( ctx context . Context , bucket , object , uploadID string , partNumberMarker int , maxParts int , opts ObjectOptions ) ( result ListPartsInfo , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . ListObjectParts ( ctx , bucket , object , uploadID , partNumberMarker , maxParts , opts )
2018-02-15 20:45:57 -05:00
}
// Aborts an in-progress multipart operation on hashedSet based on the object name.
2020-09-14 18:57:13 -04:00
func ( s * erasureSets ) AbortMultipartUpload ( ctx context . Context , bucket , object , uploadID string , opts ObjectOptions ) error {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . AbortMultipartUpload ( ctx , bucket , object , uploadID , opts )
2018-02-15 20:45:57 -05:00
}
// CompleteMultipartUpload - completes a pending multipart transaction, on hashedSet based on object name.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) CompleteMultipartUpload ( ctx context . Context , bucket , object , uploadID string , uploadedParts [ ] CompletePart , opts ObjectOptions ) ( objInfo ObjectInfo , err error ) {
2021-01-26 16:21:51 -05:00
set := s . getHashedSet ( object )
2021-03-04 17:36:23 -05:00
auditObjectErasureSet ( ctx , object , set )
2021-01-26 16:21:51 -05:00
return set . CompleteMultipartUpload ( ctx , bucket , object , uploadID , uploadedParts , opts )
2018-02-15 20:45:57 -05:00
}
/*

All disks online
-----------------
- All Unformatted - format all and return success.
- Some Unformatted - format all and return success.
- Any JBOD inconsistent - return failure
- Some are corrupt (missing format.json) - return failure
- Any unrecognized disks - return failure

Some disks are offline and we have quorum.
-----------------
- Some unformatted - format all and return success,
  treat disks offline as corrupted.
- Any JBOD inconsistent - return failure
- Some are corrupt (missing format.json)
- Any unrecognized disks - return failure

No read quorum
-----------------
failure for all cases.

// Pseudo code for managing `format.json`.

// Generic checks.
if (no quorum) return error
if (any disk is corrupt) return error // Always error
if (jbod inconsistent) return error // Always error.
if (disks not recognized) // Always error.

// Specific checks.
if (all disks online)
  if (all disks return format.json)
     if (jbod consistent)
        if (all disks recognized)
          return
  else
     if (all disks return format.json not found)
        return error
     else (some disks return format.json not found)
        (heal format)
        return
     fi
   fi
else
   if (some disks return format.json not found)
        // Offline disks are marked as dead.
        (heal format) // Offline disks should be marked as dead.
        return success
   fi
fi
*/
2020-07-13 12:51:07 -04:00
func formatsToDrivesInfo ( endpoints Endpoints , formats [ ] * formatErasureV3 , sErrs [ ] error ) ( beforeDrives [ ] madmin . HealDriveInfo ) {
beforeDrives = make ( [ ] madmin . HealDriveInfo , len ( endpoints ) )
2018-02-15 20:45:57 -05:00
// Existing formats are available (i.e. ok), so save it in
// result, also populate disks to be healed.
for i , format := range formats {
drive := endpoints . GetString ( i )
2022-01-02 12:15:06 -05:00
state := madmin . DriveStateCorrupt
2018-02-15 20:45:57 -05:00
switch {
case format != nil :
2019-08-30 17:11:18 -04:00
state = madmin . DriveStateOk
2018-02-15 20:45:57 -05:00
case sErrs [ i ] == errUnformattedDisk :
2019-08-30 17:11:18 -04:00
state = madmin . DriveStateMissing
2019-08-02 15:17:26 -04:00
case sErrs [ i ] == errDiskNotFound :
2019-08-30 17:11:18 -04:00
state = madmin . DriveStateOffline
}
2020-07-13 12:51:07 -04:00
beforeDrives [ i ] = madmin . HealDriveInfo {
2019-08-30 17:11:18 -04:00
UUID : func ( ) string {
if format != nil {
2020-06-12 23:04:01 -04:00
return format . Erasure . This
2019-08-30 17:11:18 -04:00
}
return ""
} ( ) ,
Endpoint : drive ,
State : state ,
2018-02-15 20:45:57 -05:00
}
}
return beforeDrives
}
2020-08-18 17:37:26 -04:00
func getHealDiskInfos ( storageDisks [ ] StorageAPI , errs [ ] error ) ( [ ] DiskInfo , [ ] error ) {
2019-02-06 14:44:19 -05:00
infos := make ( [ ] DiskInfo , len ( storageDisks ) )
2019-10-14 12:44:51 -04:00
g := errgroup . WithNErrs ( len ( storageDisks ) )
for index := range storageDisks {
index := index
g . Go ( func ( ) error {
2020-08-18 17:37:26 -04:00
if errs [ index ] != nil && errs [ index ] != errUnformattedDisk {
return errs [ index ]
}
if storageDisks [ index ] == nil {
return errDiskNotFound
2019-10-14 12:44:51 -04:00
}
2020-08-18 17:37:26 -04:00
var err error
2020-09-04 12:45:06 -04:00
infos [ index ] , err = storageDisks [ index ] . DiskInfo ( context . TODO ( ) )
2019-10-14 12:44:51 -04:00
return err
} , index )
}
return infos , g . Wait ( )
2019-02-06 14:44:19 -05:00
}
// Mark root disks as down so as not to heal them.
2020-08-18 17:37:26 -04:00
func markRootDisksAsDown ( storageDisks [ ] StorageAPI , errs [ ] error ) {
2022-02-04 15:21:21 -05:00
if globalIsCICD {
2022-02-13 18:42:07 -05:00
// Do nothing
2022-02-04 15:21:21 -05:00
return
}
2022-02-13 18:42:07 -05:00
infos , _ := getHealDiskInfos ( storageDisks , errs )
for i := range storageDisks {
if storageDisks [ i ] != nil && infos [ i ] . RootDisk {
// We should not heal on root disk. i.e in a situation where the minio-administrator has unmounted a
// defective drive we should not heal a path on the root disk.
2022-03-03 16:21:16 -05:00
logger . LogIf ( GlobalContext , fmt . Errorf ( "Disk `%s` is part of root disk, will not be used" , storageDisks [ i ] ) )
2022-02-13 18:42:07 -05:00
storageDisks [ i ] = nil
2019-02-06 14:44:19 -05:00
}
}
}
2018-04-30 23:37:39 -04:00
// HealFormat - heals missing `format.json` on fresh unformatted disks.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) HealFormat ( ctx context . Context , dryRun bool ) ( res madmin . HealResultItem , err error ) {
2022-01-10 12:07:49 -05:00
storageDisks , _ := initStorageDisksWithErrorsWithoutHealthCheck ( s . endpoints . Endpoints )
2018-04-09 13:25:41 -04:00
defer func ( storageDisks [ ] StorageAPI ) {
if err != nil {
closeStorageDisks ( storageDisks )
}
} ( storageDisks )
2018-04-04 00:58:48 -04:00
2020-06-12 23:04:01 -04:00
formats , sErrs := loadFormatErasureAll ( storageDisks , true )
2021-01-29 14:40:55 -05:00
if err = checkFormatErasureValues ( formats , storageDisks , s . setDriveCount ) ; err != nil {
2018-02-15 20:45:57 -05:00
return madmin . HealResultItem { } , err
}
2020-08-18 17:37:26 -04:00
// Mark all root disks down
markRootDisksAsDown ( storageDisks , sErrs )
2020-09-04 20:09:02 -04:00
refFormat , err := getFormatErasureInQuorum ( formats )
if err != nil {
return res , err
}
2018-02-15 20:45:57 -05:00
// Prepare heal-result
2018-04-09 13:25:41 -04:00
res = madmin . HealResultItem {
2018-02-15 20:45:57 -05:00
Type : madmin . HealItemMetadata ,
Detail : "disk-format" ,
2020-08-26 22:29:35 -04:00
DiskCount : s . setCount * s . setDriveCount ,
2018-02-15 20:45:57 -05:00
SetCount : s . setCount ,
}
// Fetch all the drive info status.
2022-01-10 12:07:49 -05:00
beforeDrives := formatsToDrivesInfo ( s . endpoints . Endpoints , formats , sErrs )
2018-02-15 20:45:57 -05:00
res . After . Drives = make ( [ ] madmin . HealDriveInfo , len ( beforeDrives ) )
res . Before . Drives = make ( [ ] madmin . HealDriveInfo , len ( beforeDrives ) )
// Copy "after" drive state too from before.
for k , v := range beforeDrives {
2021-03-04 17:36:23 -05:00
res . Before . Drives [ k ] = v
res . After . Drives [ k ] = v
2018-02-15 20:45:57 -05:00
}
2019-09-24 21:47:26 -04:00
if countErrs ( sErrs , errUnformattedDisk ) == 0 {
2018-04-30 23:37:39 -04:00
return res , errNoHealRequired
}
2018-02-15 20:45:57 -05:00
// Initialize a new set of set formats which will be written to disk.
2020-08-26 22:29:35 -04:00
newFormatSets := newHealFormatSets ( refFormat , s . setCount , s . setDriveCount , formats , sErrs )
2018-02-15 20:45:57 -05:00
if ! dryRun {
2022-01-02 12:15:06 -05:00
tmpNewFormats := make ( [ ] * formatErasureV3 , s . setCount * s . setDriveCount )
2018-02-15 20:45:57 -05:00
for i := range newFormatSets {
for j := range newFormatSets [ i ] {
if newFormatSets [ i ] [ j ] == nil {
continue
}
2020-10-26 13:29:29 -04:00
res . After . Drives [ i * s . setDriveCount + j ] . UUID = newFormatSets [ i ] [ j ] . Erasure . This
res . After . Drives [ i * s . setDriveCount + j ] . State = madmin . DriveStateOk
2020-08-26 22:29:35 -04:00
tmpNewFormats [ i * s . setDriveCount + j ] = newFormatSets [ i ] [ j ]
2018-02-15 20:45:57 -05:00
}
}
2020-10-31 04:34:48 -04:00
// Save new formats `format.json` on unformatted disks.
2021-11-04 19:42:49 -04:00
for index , format := range tmpNewFormats {
if storageDisks [ index ] == nil || format == nil {
continue
}
if err := saveFormatErasure ( storageDisks [ index ] , format , true ) ; err != nil {
logger . LogIf ( ctx , fmt . Errorf ( "Disk %s failed to write updated 'format.json': %v" , storageDisks [ index ] , err ) )
tmpNewFormats [ index ] = nil // this disk failed to write new format
}
2020-08-07 16:22:53 -04:00
}
2020-09-17 00:14:35 -04:00
s . erasureDisksMu . Lock ( )
2018-04-09 13:25:41 -04:00
2020-10-31 04:34:48 -04:00
for index , format := range tmpNewFormats {
if format == nil {
2020-03-27 17:48:30 -04:00
continue
}
2020-10-31 04:34:48 -04:00
m , n , err := findDiskIndexByDiskID ( refFormat , format . Erasure . This )
2020-03-27 17:48:30 -04:00
if err != nil {
2022-01-24 22:40:02 -05:00
logger . LogIf ( ctx , err )
2020-03-27 17:48:30 -04:00
continue
}
2020-06-12 23:04:01 -04:00
if s . erasureDisks [ m ] [ n ] != nil {
s . erasureDisks [ m ] [ n ] . Close ( )
2020-03-27 17:48:30 -04:00
}
2021-09-14 18:10:00 -04:00
if storageDisks [ index ] != nil {
storageDisks [ index ] . SetDiskLoc ( s . poolIndex , m , n )
s . erasureDisks [ m ] [ n ] = storageDisks [ index ]
}
2020-03-27 17:48:30 -04:00
}
2018-04-09 13:25:41 -04:00
2020-10-31 17:10:12 -04:00
// Replace reference format with what was loaded from disks.
2020-10-31 04:34:48 -04:00
s . format = refFormat
2020-09-17 00:14:35 -04:00
s . erasureDisksMu . Unlock ( )
2018-02-15 20:45:57 -05:00
}
return res , nil
}
// HealBucket - heals inconsistent buckets and bucket metadata on all sets.
2020-12-13 14:57:08 -05:00
func ( s * erasureSets ) HealBucket ( ctx context . Context , bucket string , opts madmin . HealOpts ) ( result madmin . HealResultItem , err error ) {
2018-02-15 20:45:57 -05:00
// Initialize heal result info
2019-02-10 22:53:13 -05:00
result = madmin . HealResultItem {
2018-02-15 20:45:57 -05:00
Type : madmin . HealItemBucket ,
Bucket : bucket ,
2020-08-26 22:29:35 -04:00
DiskCount : s . setCount * s . setDriveCount ,
2018-02-15 20:45:57 -05:00
SetCount : s . setCount ,
}
2021-02-23 12:23:11 -05:00
for _ , set := range s . sets {
2021-11-15 12:46:55 -05:00
healResult , err := set . HealBucket ( ctx , bucket , opts )
2019-02-10 22:53:13 -05:00
if err != nil {
2021-02-23 12:23:11 -05:00
return result , toObjectErr ( err , bucket )
2019-02-10 22:53:13 -05:00
}
2019-02-13 07:59:36 -05:00
result . Before . Drives = append ( result . Before . Drives , healResult . Before . Drives ... )
result . After . Drives = append ( result . After . Drives , healResult . After . Drives ... )
2018-02-15 20:45:57 -05:00
}
2019-02-10 22:53:13 -05:00
return result , nil
2018-02-15 20:45:57 -05:00
}
// HealObject - heals inconsistent object on a hashedSet based on object name.
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) HealObject ( ctx context . Context , bucket , object , versionID string , opts madmin . HealOpts ) ( madmin . HealResultItem , error ) {
return s . getHashedSet ( object ) . HealObject ( ctx , bucket , object , versionID , opts )
2018-02-15 20:45:57 -05:00
}
2021-04-04 16:32:31 -04:00
// PutObjectMetadata replaces or adds metadata on an existing object/version.
// The call is forwarded to the erasure set selected by getHashedSet for the
// object name.
func (s *erasureSets) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
	return s.getHashedSet(object).PutObjectMetadata(ctx, bucket, object, opts)
}
2020-05-23 14:09:35 -04:00
// PutObjectTags - replace or add tags to an existing object
2021-02-01 16:52:51 -05:00
func ( s * erasureSets ) PutObjectTags ( ctx context . Context , bucket , object string , tags string , opts ObjectOptions ) ( ObjectInfo , error ) {
2021-01-26 16:21:51 -05:00
er := s . getHashedSet ( object )
return er . PutObjectTags ( ctx , bucket , object , tags , opts )
2020-01-20 11:45:59 -05:00
}
2020-05-23 14:09:35 -04:00
// DeleteObjectTags - delete object tags from an existing object
2021-02-01 16:52:51 -05:00
func ( s * erasureSets ) DeleteObjectTags ( ctx context . Context , bucket , object string , opts ObjectOptions ) ( ObjectInfo , error ) {
2021-01-26 16:21:51 -05:00
er := s . getHashedSet ( object )
return er . DeleteObjectTags ( ctx , bucket , object , opts )
2020-01-20 11:45:59 -05:00
}
2020-05-23 14:09:35 -04:00
// GetObjectTags - get object tags from an existing object
2020-06-12 23:04:01 -04:00
func ( s * erasureSets ) GetObjectTags ( ctx context . Context , bucket , object string , opts ObjectOptions ) ( * tags . Tags , error ) {
2021-01-26 16:21:51 -05:00
er := s . getHashedSet ( object )
return er . GetObjectTags ( ctx , bucket , object , opts )
2020-01-20 11:45:59 -05:00
}
2021-04-19 13:30:42 -04:00
// TransitionObject transitions object content to the target tier by
// delegating to the erasure set that getHashedSet picks for the object name.
func (s *erasureSets) TransitionObject(ctx context.Context, bucket, object string, opts ObjectOptions) error {
	hashedSet := s.getHashedSet(object)
	return hashedSet.TransitionObject(ctx, bucket, object, opts)
}
// RestoreTransitionedObject restores transitioned object content locally on
// this cluster by delegating to the erasure set that getHashedSet picks for
// the object name.
func (s *erasureSets) RestoreTransitionedObject(ctx context.Context, bucket, object string, opts ObjectOptions) error {
	hashedSet := s.getHashedSet(object)
	return hashedSet.RestoreTransitionedObject(ctx, bucket, object, opts)
}