// Copyright (c) 2015-2024 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

//go:generate msgp -file=$GOFILE

package cmd

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/uuid"
	"github.com/minio/madmin-go/v3"
	"github.com/minio/pkg/v3/wildcard"
	"github.com/tinylib/msgp/msgp"
)

const (
	// mrfOpsQueueSize is the capacity of the buffered channel that holds
	// pending MRF operations (see newMRFState).
	mrfOpsQueueSize = 100000
)

// Location and on-disk header values of the persisted MRF queue.
//
// healMRFDir is the directory (inside minioMetaBucket) under which the
// "list.bin" file is written on shutdown and read back on startup.
// healMRFMetaFormat and healMRFMetaVersionV1 are stored as two little-endian
// uint16 values at the very beginning of that file.
const (
	healDir              = ".heal"
	healMRFDir           = bucketMetaPrefix + SlashSeparator + healDir + SlashSeparator + "mrf"
	healMRFMetaFormat    = 1
	healMRFMetaVersionV1 = 1
)

// PartialOperation is a successful upload/delete of an object
// but not written in all disks (having quorum)
type PartialOperation struct {
	// Bucket is the bucket to heal. Entries for minioMetaBucket whose object
	// matches a temporary-data pattern are skipped by healRoutine.
	Bucket string
	// Object is the object to heal; when empty, healRoutine heals the bucket
	// itself instead.
	Object string
	// VersionID is the single version to heal; only used when Versions is
	// empty.
	VersionID string
	// Versions is consumed by healRoutine in 16-byte chunks, each chunk being
	// converted to a UUID and healed as a separate version.
	Versions []byte
	// SetIndex and PoolIndex are not read in this file; presumably they
	// identify the erasure set and pool of the object — confirm against
	// callers.
	SetIndex, PoolIndex int
	// Queued is the time the operation was queued; healRoutine waits one
	// second before healing entries queued less than a second ago.
	Queued time.Time
	// BitrotScan selects madmin.HealDeepScan instead of madmin.HealNormalScan.
	BitrotScan bool
}

// mrfState encapsulates all the information
// related to the global background MRF.
type mrfState struct {
	// opCh is the buffered queue of pending partial operations.
	opCh chan PartialOperation

	// closed is set to 1 by shutdown() once opCh has been closed.
	closed int32
	// closing is set to 1 at the beginning of shutdown() so that new
	// operations are rejected while in-flight senders are being waited for.
	closing int32
	// wg tracks in-flight addPartialOp() calls; shutdown() waits on it before
	// closing opCh.
	wg sync.WaitGroup
}

// newMRFState returns an mrfState whose operation queue can buffer up to
// mrfOpsQueueSize entries.
func newMRFState() mrfState {
	return mrfState{
		opCh: make(chan PartialOperation, mrfOpsQueueSize),
	}
}

// Add a partial S3 operation (put/delete) when one or more disks are offline.
func (m *mrfState) addPartialOp(op PartialOperation) { if m == nil { return } if atomic.LoadInt32(&m.closed) == 1 { return } m.wg.Add(1) defer m.wg.Done() if atomic.LoadInt32(&m.closing) == 1 { return } select { case m.opCh <- op: default: } } // Do not accept new MRF operations anymore and start to save // the current heal status in one available disk func (m *mrfState) shutdown() { atomic.StoreInt32(&m.closing, 1) m.wg.Wait() close(m.opCh) atomic.StoreInt32(&m.closed, 1) if len(m.opCh) > 0 { healingLogEvent(context.Background(), "Saving MRF healing data (%d entries)", len(m.opCh)) } newReader := func() io.ReadCloser { r, w := io.Pipe() go func() { // Initialize MRF meta header. var data [4]byte binary.LittleEndian.PutUint16(data[0:2], healMRFMetaFormat) binary.LittleEndian.PutUint16(data[2:4], healMRFMetaVersionV1) mw := msgp.NewWriter(w) n, err := mw.Write(data[:]) if err != nil { w.CloseWithError(err) return } if n != len(data) { w.CloseWithError(io.ErrShortWrite) return } for item := range m.opCh { err = item.EncodeMsg(mw) if err != nil { break } } mw.Flush() w.CloseWithError(err) }() return r } globalLocalDrivesMu.RLock() localDrives := cloneDrives(globalLocalDrivesMap) globalLocalDrivesMu.RUnlock() for _, localDrive := range localDrives { r := newReader() err := localDrive.CreateFile(context.Background(), "", minioMetaBucket, pathJoin(healMRFDir, "list.bin"), -1, r) r.Close() if err == nil { break } } } func (m *mrfState) startMRFPersistence() { loadMRF := func(rc io.ReadCloser, opCh chan PartialOperation) error { defer rc.Close() var data [4]byte n, err := rc.Read(data[:]) if err != nil { return err } if n != len(data) { return errors.New("heal mrf: no data") } // Read resync meta header switch binary.LittleEndian.Uint16(data[0:2]) { case healMRFMetaFormat: default: return fmt.Errorf("heal mrf: unknown format: %d", binary.LittleEndian.Uint16(data[0:2])) } switch binary.LittleEndian.Uint16(data[2:4]) { case healMRFMetaVersionV1: default: return 
fmt.Errorf("heal mrf: unknown version: %d", binary.LittleEndian.Uint16(data[2:4])) } mr := msgp.NewReader(rc) for { op := PartialOperation{} err = op.DecodeMsg(mr) if err != nil { break } opCh <- op } return nil } globalLocalDrivesMu.RLock() localDrives := cloneDrives(globalLocalDrivesMap) globalLocalDrivesMu.RUnlock() for _, localDrive := range localDrives { if localDrive == nil { continue } rc, err := localDrive.ReadFileStream(context.Background(), minioMetaBucket, pathJoin(healMRFDir, "list.bin"), 0, -1) if err != nil { continue } err = loadMRF(rc, m.opCh) if err != nil { continue } // finally delete the file after processing mrf entries localDrive.Delete(GlobalContext, minioMetaBucket, pathJoin(healMRFDir, "list.bin"), DeleteOptions{}) break } return } var healSleeper = newDynamicSleeper(5, time.Second, false) // healRoutine listens to new disks reconnection events and // issues healing requests for queued objects belonging to the // corresponding erasure set func (m *mrfState) healRoutine(z *erasureServerPools) { for { select { case <-GlobalContext.Done(): return case u, ok := <-m.opCh: if !ok { return } // We might land at .metacache, .trash, .multipart // no need to heal them skip, only when bucket // is '.minio.sys' if u.Bucket == minioMetaBucket { // No MRF needed for temporary objects if wildcard.Match("buckets/*/.metacache/*", u.Object) { continue } if wildcard.Match("tmp/*", u.Object) { continue } if wildcard.Match("multipart/*", u.Object) { continue } if wildcard.Match("tmp-old/*", u.Object) { continue } } now := time.Now() if now.Sub(u.Queued) < time.Second { // let recently failed networks to reconnect // making MRF wait for 1s before retrying, // i.e 4 reconnect attempts. 
time.Sleep(time.Second) } // wait on timer per heal wait := healSleeper.Timer(context.Background()) scan := madmin.HealNormalScan if u.BitrotScan { scan = madmin.HealDeepScan } if u.Object == "" { healBucket(u.Bucket, scan) } else { if len(u.Versions) > 0 { vers := len(u.Versions) / 16 if vers > 0 { for i := 0; i < vers; i++ { healObject(u.Bucket, u.Object, uuid.UUID(u.Versions[16*i:]).String(), scan) } } } else { healObject(u.Bucket, u.Object, u.VersionID, scan) } } wait() } } }