Fix diskinfo race (#12857)

Fixes a shared info struct: the cached disk-info update closure captured a single `DiskInfo` value, so every cache refresh decoded into the same struct while other goroutines were reading it.

```
WARNING: DATA RACE
Read at 0x00c011780618 by goroutine 419:
  github.com/minio/minio/cmd.(*DiskMetrics).DecodeMsg()
      c:/gopath/src/github.com/minio/minio/cmd/storage-datatypes_gen.go:331 +0x247
  github.com/minio/minio/cmd.(*DiskInfo).DecodeMsg()
      c:/gopath/src/github.com/minio/minio/cmd/storage-datatypes_gen.go:76 +0x5ec
  github.com/tinylib/msgp/msgp.Decode()
      c:/gopath/pkg/mod/github.com/tinylib/msgp@v1.1.6-0.20210521143832-0becd170c402/msgp/read.go:105 +0x70
  github.com/minio/minio/cmd.(*storageRESTClient).DiskInfo.func1.1()
      c:/gopath/src/github.com/minio/minio/cmd/storage-rest-client.go:288 +0x235
  github.com/minio/minio/cmd.(*timedValue).Get()
      c:/gopath/src/github.com/minio/minio/cmd/utils.go:886 +0x77
  github.com/minio/minio/cmd.(*storageRESTClient).DiskInfo()
      c:/gopath/src/github.com/minio/minio/cmd/storage-rest-client.go:297 +0xf9
  github.com/minio/minio/cmd.getDiskInfos()
      c:/gopath/src/github.com/minio/minio/cmd/object-api-utils.go:962 +0x1a8
  github.com/minio/minio/cmd.(*erasureServerPools).getServerPoolsAvailableSpace.func1()
      c:/gopath/src/github.com/minio/minio/cmd/erasure-server-pool.go:241 +0x27c
  github.com/minio/minio/internal/sync/errgroup.(*Group).Go.func1()
      c:/gopath/src/github.com/minio/minio/internal/sync/errgroup/errgroup.go:123 +0xd7

Previous write at 0x00c011780618 by goroutine 423:
  github.com/minio/minio/cmd.(*DiskMetrics).DecodeMsg()
      c:/gopath/src/github.com/minio/minio/cmd/storage-datatypes_gen.go:332 +0x6e4
  github.com/minio/minio/cmd.(*DiskInfo).DecodeMsg()
      c:/gopath/src/github.com/minio/minio/cmd/storage-datatypes_gen.go:76 +0x5ec
  github.com/tinylib/msgp/msgp.Decode()
      c:/gopath/pkg/mod/github.com/tinylib/msgp@v1.1.6-0.20210521143832-0becd170c402/msgp/read.go:105 +0x70
  github.com/minio/minio/cmd.(*storageRESTClient).DiskInfo.func1.1()
      c:/gopath/src/github.com/minio/minio/cmd/storage-rest-client.go:288 +0x235
  github.com/minio/minio/cmd.(*timedValue).Get()
      c:/gopath/src/github.com/minio/minio/cmd/utils.go:886 +0x77
  github.com/minio/minio/cmd.(*storageRESTClient).DiskInfo()
      c:/gopath/src/github.com/minio/minio/cmd/storage-rest-client.go:297 +0xf9
  github.com/minio/minio/cmd.getDiskInfos()
      c:/gopath/src/github.com/minio/minio/cmd/object-api-utils.go:962 +0x1a8
  github.com/minio/minio/cmd.(*erasureServerPools).getServerPoolsAvailableSpace.func1()
      c:/gopath/src/github.com/minio/minio/cmd/erasure-server-pool.go:241 +0x27c
  github.com/minio/minio/internal/sync/errgroup.(*Group).Go.func1()
      c:/gopath/src/github.com/minio/minio/internal/sync/errgroup/errgroup.go:123 +0xd7
```
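To make the failure mode concrete, here is a minimal, self-contained sketch of the pattern the trace points at (the type and function names are illustrative, not MinIO's code): the update closure is installed exactly once, captures the first caller's named return value, and every later refresh writes into that one shared struct.

```go
package main

import (
	"fmt"
	"sync"
)

type DiskInfo struct{ Free uint64 }

type cache struct {
	once   sync.Once
	update func() (interface{}, error)
}

var infoCache cache

// diskInfoRacy mimics the removed code path: the update closure is
// installed once and captures the named return value "info" of
// whichever caller ran the sync.Once body first.
func diskInfoRacy() (info DiskInfo, err error) {
	infoCache.once.Do(func() {
		infoCache.update = func() (interface{}, error) {
			info.Free++ // BUG: every refresh mutates one shared struct
			return info, nil
		}
	})
	v, err := infoCache.update()
	return v.(DiskInfo), err // reading the shared struct races with writers
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			info, _ := diskInfoRacy() // `go run -race` reports the race
			fmt.Println(info.Free)
		}()
	}
	wg.Wait()
}
```

The fix below removes the capture entirely: the update function becomes a method that allocates a fresh `DiskInfo` on every call.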
Klaus Post, 2021-08-23 10:13:47 +02:00 (committed via GitHub)
commit 47de1d2e0e (parent 14fe8ecb58)
3 changed files with 32 additions and 23 deletions

buildscripts/verify-healing.sh

```diff
@@ -14,8 +14,6 @@ WORK_DIR="$PWD/.verify-$RANDOM"
 MINIO_CONFIG_DIR="$WORK_DIR/.minio"
 MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server )
-export GOGC=25
-
 function start_minio_3_node() {
     export MINIO_ROOT_USER=minio
     export MINIO_ROOT_PASSWORD=minio123
@@ -67,7 +65,7 @@ function start_minio_3_node() {
 function check_online() {
-    if grep -q 'Unable to initialize sub-systems' ${WORK_DIR}/dist-minio-*.log; then
+    if grep -q 'Unable to initialize' ${WORK_DIR}/dist-minio-*.log; then
         echo "1"
     fi
 }
@@ -88,22 +86,24 @@ function __init__()
 }
 function perform_test() {
-    start_minio_3_node 60
+    start_minio_3_node 120
     echo "Testing Distributed Erasure setup healing of drives"
     echo "Remove the contents of the disks belonging to '${1}' erasure set"
     set -x
     rm -rf ${WORK_DIR}/${1}/*/
-    start_minio_3_node 60
+    start_minio_3_node 200
     rv=$(check_online)
     if [ "$rv" == "1" ]; then
-        pkill -9 minio
         for i in $(seq 1 3); do
             echo "server$i log:"
             cat "${WORK_DIR}/dist-minio-server$i.log"
         done
+        pkill -9 minio
         echo "FAILED"
         purge "$WORK_DIR"
         exit 1
```

cmd/prepare-storage.go

```diff
@@ -328,7 +328,13 @@ func waitForFormatErasure(firstDisk bool, endpoints Endpoints, poolCount, setCou
 			continue
 		case errErasureReadQuorum:
 			// no quorum available continue to wait for minimum number of servers.
-			logger.Info("Waiting for a minimum of %d disks to come online (elapsed %s)\n", len(endpoints)/2, getElapsedTime())
+			logger.Info("Waiting for a minimum of %d disks to come online (elapsed %s)\n",
+				len(endpoints)/2, getElapsedTime())
+			continue
+		case errErasureWriteQuorum:
+			// no quorum available continue to wait for minimum number of servers.
+			logger.Info("Waiting for a minimum of %d disks to come online (elapsed %s)\n",
+				(len(endpoints)/2)+1, getElapsedTime())
 			continue
 		case errErasureV3ThisEmpty:
 			// need to wait for this error to be healed, so continue.
```
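The new `errErasureWriteQuorum` case waits for one more disk than the read-quorum case. A tiny sketch of the two thresholds (the `quorums` helper is hypothetical; only the arithmetic comes from the diff):

```go
package main

import "fmt"

// quorums restates the wait thresholds from the two cases above,
// given n total endpoints: reads need half the disks, writes need
// a strict majority.
func quorums(n int) (readQuorum, writeQuorum int) {
	readQuorum = n / 2     // errErasureReadQuorum threshold
	writeQuorum = n/2 + 1  // errErasureWriteQuorum threshold
	return readQuorum, writeQuorum
}

func main() {
	for _, n := range []int{4, 8, 16} {
		r, w := quorums(n)
		fmt.Printf("endpoints=%2d  read quorum=%2d  write quorum=%2d\n", n, r, w)
	}
}
```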

cmd/storage-rest-client.go

```diff
@@ -265,19 +265,8 @@ func (client *storageRESTClient) SetDiskID(id string) {
 	client.diskID = id
 }
 
-// DiskInfo - fetch disk information for a remote disk.
-func (client *storageRESTClient) DiskInfo(ctx context.Context) (info DiskInfo, err error) {
-	if !client.IsOnline() {
-		// make sure to check if the disk is offline, since the underlying
-		// value is cached we should attempt to invalidate it if such calls
-		// were attempted. This can lead to false success under certain conditions
-		// - this change attempts to avoid stale information if the underlying
-		// transport is already down.
-		return info, errDiskNotFound
-	}
-	client.diskInfoCache.Once.Do(func() {
-		client.diskInfoCache.TTL = time.Second
-		client.diskInfoCache.Update = func() (interface{}, error) {
+func (client *storageRESTClient) diskInfo() (interface{}, error) {
+	var info DiskInfo
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
 	respBody, err := client.call(ctx, storageRESTMethodDiskInfo, nil, nil, -1)
@@ -292,7 +281,21 @@ func (client *storageRESTClient) DiskInfo(ctx context.Context) (info DiskInfo, err error) {
 		return info, toStorageErr(errors.New(info.Error))
 	}
 	return info, nil
 }
+
+// DiskInfo - fetch disk information for a remote disk.
+func (client *storageRESTClient) DiskInfo(ctx context.Context) (info DiskInfo, err error) {
+	if !client.IsOnline() {
+		// make sure to check if the disk is offline, since the underlying
+		// value is cached we should attempt to invalidate it if such calls
+		// were attempted. This can lead to false success under certain conditions
+		// - this change attempts to avoid stale information if the underlying
+		// transport is already down.
+		return info, errDiskNotFound
+	}
+	client.diskInfoCache.Once.Do(func() {
+		client.diskInfoCache.TTL = time.Second
+		client.diskInfoCache.Update = client.diskInfo
+	})
 	val, err := client.diskInfoCache.Get()
 	if err == nil {
```
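Why the refactor fixes the race: `client.diskInfo` now allocates a fresh `DiskInfo` on every call, so even when two goroutines refresh an expired cache at once they decode into distinct structs. Below is a sketch of the cache semantics involved; the `Once`, `TTL`, and `Update` fields appear in the diff above, but the `Get` body is an assumption about `utils.go`, not its exact implementation.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// timedValue sketches a TTL cache: Get serves the cached value while
// it is fresh and otherwise calls Update to rebuild it.
type timedValue struct {
	Once   sync.Once
	Update func() (interface{}, error)
	TTL    time.Duration

	mu         sync.Mutex
	lastUpdate time.Time
	value      interface{}
}

func (t *timedValue) Get() (interface{}, error) {
	t.mu.Lock()
	if t.value != nil && time.Since(t.lastUpdate) < t.TTL {
		v := t.value
		t.mu.Unlock()
		return v, nil
	}
	t.mu.Unlock()

	// On a cold or expired cache, Update may run concurrently from
	// several goroutines; that is exactly why it must build a fresh
	// value instead of filling a struct shared between callers.
	v, err := t.Update()
	if err != nil {
		return nil, err
	}

	t.mu.Lock()
	t.value, t.lastUpdate = v, time.Now()
	t.mu.Unlock()
	return v, nil
}

func main() {
	tv := &timedValue{TTL: time.Second}
	tv.Update = func() (interface{}, error) {
		return time.Now(), nil // a fresh value per refresh, like diskInfo above
	}
	v1, _ := tv.Get()
	v2, _ := tv.Get() // served from cache within the TTL
	fmt.Println(v1, v2)
}
```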