2022-11-14 10:16:40 -05:00
|
|
|
// Copyright (c) 2015-2022 MinIO, Inc.
|
|
|
|
//
|
|
|
|
// This file is part of MinIO Object Storage stack
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Affero General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
package cmd
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"math/rand"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2023-06-19 20:53:08 -04:00
|
|
|
"github.com/minio/madmin-go/v3"
|
2022-11-14 10:16:40 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
//go:generate msgp -file=$GOFILE
|
|
|
|
|
|
|
|
// SiteResyncStatus captures current replication resync status for a target site
type SiteResyncStatus struct {
	Version int `json:"version" msg:"v"`
	// Overall site status
	Status ResyncStatusType `json:"st" msg:"ss"`
	// DeplID is the deployment ID of the peer site this resync targets.
	DeplID string `json:"dId" msg:"did"`
	// BucketStatuses maps each bucket name to its individual resync status.
	BucketStatuses map[string]ResyncStatusType `json:"buckets" msg:"bkts"`
	// TotBuckets is the total number of buckets tracked by this resync.
	TotBuckets int `json:"totbuckets" msg:"tb"`
	// Embedded current per-target resync progress (replicated/failed
	// counts and sizes, current bucket/object, timestamps).
	TargetReplicationResyncStatus `json:"currSt" msg:"cst"`
}
|
|
|
|
|
|
|
|
func (s *SiteResyncStatus) clone() SiteResyncStatus {
|
|
|
|
if s == nil {
|
|
|
|
return SiteResyncStatus{}
|
|
|
|
}
|
|
|
|
o := *s
|
|
|
|
o.BucketStatuses = make(map[string]ResyncStatusType, len(s.BucketStatuses))
|
|
|
|
for b, st := range s.BucketStatuses {
|
|
|
|
o.BucketStatuses[b] = st
|
|
|
|
}
|
|
|
|
return o
|
|
|
|
}
|
|
|
|
|
|
|
|
const (
	// siteResyncPrefix is the object-store prefix under which site
	// replication resync metadata is persisted.
	siteResyncPrefix = bucketMetaPrefix + "/site-replication/resync"
)
|
|
|
|
|
|
|
|
// resyncState pairs a resync ID with the time its status was last
// persisted, tracked per peer deployment.
type resyncState struct {
	resyncID string
	// LastSaved is when this resync status was last saved to disk;
	// the zero value means it has never been saved.
	LastSaved time.Time
}
|
|
|
|
|
|
|
|
//msgp:ignore siteResyncMetrics
// siteResyncMetrics holds in-memory resync state for all peer sites.
// The embedded RWMutex guards both maps below.
type siteResyncMetrics struct {
	sync.RWMutex
	// resyncStatus maps resync ID to resync status for peer
	resyncStatus map[string]SiteResyncStatus
	// map peer deployment ID to resync ID
	peerResyncMap map[string]resyncState
}
|
|
|
|
|
|
|
|
func newSiteResyncMetrics(ctx context.Context) *siteResyncMetrics {
|
|
|
|
s := siteResyncMetrics{
|
|
|
|
resyncStatus: make(map[string]SiteResyncStatus),
|
|
|
|
peerResyncMap: make(map[string]resyncState),
|
|
|
|
}
|
|
|
|
go s.save(ctx)
|
|
|
|
go s.init(ctx)
|
|
|
|
return &s
|
|
|
|
}
|
|
|
|
|
|
|
|
// init site resync metrics
|
|
|
|
func (sm *siteResyncMetrics) init(ctx context.Context) {
|
|
|
|
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
|
|
// Run the site resync metrics load in a loop
|
|
|
|
for {
|
|
|
|
if err := sm.load(ctx, newObjectLayerFn()); err == nil {
|
|
|
|
<-ctx.Done()
|
|
|
|
return
|
|
|
|
}
|
|
|
|
duration := time.Duration(r.Float64() * float64(time.Second*10))
|
|
|
|
if duration < time.Second {
|
|
|
|
// Make sure to sleep atleast a second to avoid high CPU ticks.
|
|
|
|
duration = time.Second
|
|
|
|
}
|
|
|
|
time.Sleep(duration)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// load resync metrics saved on disk into memory
|
|
|
|
func (sm *siteResyncMetrics) load(ctx context.Context, objAPI ObjectLayer) error {
|
|
|
|
if objAPI == nil {
|
|
|
|
return errServerNotInitialized
|
|
|
|
}
|
|
|
|
info, err := globalSiteReplicationSys.GetClusterInfo(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if !info.Enabled {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
for _, peer := range info.Sites {
|
|
|
|
if peer.DeploymentID == globalDeploymentID {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
rs, err := loadSiteResyncMetadata(ctx, objAPI, peer.DeploymentID)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sm.Lock()
|
|
|
|
defer sm.Unlock()
|
|
|
|
if _, ok := sm.peerResyncMap[peer.DeploymentID]; !ok {
|
|
|
|
sm.peerResyncMap[peer.DeploymentID] = resyncState{resyncID: rs.ResyncID, LastSaved: time.Time{}}
|
|
|
|
sm.resyncStatus[rs.ResyncID] = rs
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (sm *siteResyncMetrics) report(dID string) *madmin.SiteResyncMetrics {
|
|
|
|
sm.RLock()
|
|
|
|
defer sm.RUnlock()
|
|
|
|
rst, ok := sm.peerResyncMap[dID]
|
|
|
|
if !ok {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
rs, ok := sm.resyncStatus[rst.resyncID]
|
|
|
|
if !ok {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
m := madmin.SiteResyncMetrics{
|
|
|
|
CollectedAt: rs.LastUpdate,
|
|
|
|
StartTime: rs.StartTime,
|
|
|
|
LastUpdate: rs.LastUpdate,
|
|
|
|
ResyncStatus: rs.Status.String(),
|
|
|
|
ResyncID: rst.resyncID,
|
|
|
|
DeplID: rs.DeplID,
|
|
|
|
ReplicatedSize: rs.ReplicatedSize,
|
|
|
|
ReplicatedCount: rs.ReplicatedCount,
|
|
|
|
FailedSize: rs.FailedSize,
|
|
|
|
FailedCount: rs.FailedCount,
|
|
|
|
Bucket: rs.Bucket,
|
|
|
|
Object: rs.Object,
|
|
|
|
NumBuckets: int64(rs.TotBuckets),
|
|
|
|
}
|
|
|
|
for b, st := range rs.BucketStatuses {
|
|
|
|
if st == ResyncFailed {
|
|
|
|
m.FailedBuckets = append(m.FailedBuckets, b)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return &m
|
|
|
|
}
|
|
|
|
|
|
|
|
// save periodically persists in-memory resync stats to disk. Every
// siteResyncSaveInterval it scans all tracked peers and, for each status
// that is valid and has been updated since it was last saved, writes the
// status to disk on a separate goroutine; it waits for all writes before
// releasing the lock. Runs until ctx is canceled.
func (sm *siteResyncMetrics) save(ctx context.Context) {
	sTimer := time.NewTimer(siteResyncSaveInterval)
	defer sTimer.Stop()
	for {
		select {
		case <-sTimer.C:
			if globalSiteReplicationSys.isEnabled() {
				sm.Lock()
				wg := sync.WaitGroup{}
				for dID, rs := range sm.peerResyncMap {
					st, ok := sm.resyncStatus[rs.resyncID]
					if ok {
						// Only persist when the status is valid and has
						// changed since the last successful save.
						updt := st.Status.isValid() && st.LastUpdate.After(rs.LastSaved)
						if !updt {
							continue
						}
						rs.LastSaved = UTCNow()
						sm.peerResyncMap[dID] = rs
						wg.Add(1)
						// st is declared inside the loop body, so each
						// goroutine captures its own copy — safe even
						// before Go 1.22 loop-variable semantics.
						go func() {
							defer wg.Done()
							saveSiteResyncMetadata(ctx, st, newObjectLayerFn())
						}()
					}
				}
				// NOTE(review): the write lock is held across wg.Wait(),
				// i.e. across disk I/O — presumably intentional to keep
				// the snapshot consistent, but confirm it cannot stall
				// readers for long under slow storage.
				wg.Wait()
				sm.Unlock()
			}
			sTimer.Reset(siteResyncSaveInterval)
		case <-ctx.Done():
			return
		}
	}
}
|
|
|
|
|
|
|
|
// update overall site resync state
|
|
|
|
func (sm *siteResyncMetrics) updateState(s SiteResyncStatus) {
|
|
|
|
if !globalSiteReplicationSys.isEnabled() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
sm.Lock()
|
|
|
|
defer sm.Unlock()
|
|
|
|
switch s.Status {
|
|
|
|
case ResyncStarted:
|
|
|
|
sm.peerResyncMap[s.DeplID] = resyncState{resyncID: s.ResyncID, LastSaved: time.Time{}}
|
|
|
|
sm.resyncStatus[s.ResyncID] = s
|
|
|
|
case ResyncCompleted, ResyncCanceled, ResyncFailed:
|
|
|
|
st, ok := sm.resyncStatus[s.ResyncID]
|
|
|
|
if ok {
|
|
|
|
st.LastUpdate = s.LastUpdate
|
|
|
|
st.Status = s.Status
|
|
|
|
}
|
|
|
|
sm.resyncStatus[s.ResyncID] = st
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// increment SyncedBuckets count
|
|
|
|
func (sm *siteResyncMetrics) incBucket(o resyncOpts, bktStatus ResyncStatusType) {
|
|
|
|
if !globalSiteReplicationSys.isEnabled() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
sm.Lock()
|
|
|
|
defer sm.Unlock()
|
|
|
|
st, ok := sm.resyncStatus[o.resyncID]
|
|
|
|
if ok {
|
2022-11-18 04:39:05 -05:00
|
|
|
if st.BucketStatuses == nil {
|
|
|
|
st.BucketStatuses = map[string]ResyncStatusType{}
|
|
|
|
}
|
2022-11-14 10:16:40 -05:00
|
|
|
switch bktStatus {
|
|
|
|
case ResyncCompleted:
|
|
|
|
st.BucketStatuses[o.bucket] = ResyncCompleted
|
|
|
|
st.Status = siteResyncStatus(st.Status, st.BucketStatuses)
|
|
|
|
st.LastUpdate = UTCNow()
|
|
|
|
sm.resyncStatus[o.resyncID] = st
|
|
|
|
case ResyncFailed:
|
|
|
|
st.BucketStatuses[o.bucket] = ResyncFailed
|
|
|
|
st.Status = siteResyncStatus(st.Status, st.BucketStatuses)
|
|
|
|
st.LastUpdate = UTCNow()
|
|
|
|
sm.resyncStatus[o.resyncID] = st
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// deleteBucket removes a deleted bucket from active resync tracking for
// every peer's in-progress resync. Resyncs already in a terminal state
// (completed/failed) are left untouched. No-op when site replication is
// not enabled.
func (sm *siteResyncMetrics) deleteBucket(b string) {
	if !globalSiteReplicationSys.isEnabled() {
		return
	}
	sm.Lock()
	defer sm.Unlock()
	for _, rs := range sm.peerResyncMap {
		st, ok := sm.resyncStatus[rs.resyncID]
		if !ok {
			// NOTE(review): `return` here abandons the remaining peers
			// when one has no tracked status — `continue` may be the
			// intent; confirm before changing.
			return
		}
		switch st.Status {
		case ResyncCompleted, ResyncFailed:
			// NOTE(review): likewise, a terminal status on one peer
			// returns instead of moving to the next peer — verify.
			return
		default:
			delete(st.BucketStatuses, b)
		}
	}
}
|
|
|
|
|
|
|
|
// returns overall resync status from individual bucket resync status map
|
|
|
|
func siteResyncStatus(currSt ResyncStatusType, m map[string]ResyncStatusType) ResyncStatusType {
|
|
|
|
// avoid overwriting canceled resync status
|
|
|
|
if currSt != ResyncStarted {
|
|
|
|
return currSt
|
|
|
|
}
|
|
|
|
totBuckets := len(m)
|
|
|
|
var cmpCount, failCount int
|
|
|
|
for _, st := range m {
|
|
|
|
switch st {
|
|
|
|
case ResyncCompleted:
|
|
|
|
cmpCount++
|
|
|
|
case ResyncFailed:
|
|
|
|
failCount++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if cmpCount == totBuckets {
|
|
|
|
return ResyncCompleted
|
|
|
|
}
|
|
|
|
if cmpCount+failCount == totBuckets {
|
|
|
|
return ResyncFailed
|
|
|
|
}
|
|
|
|
return ResyncStarted
|
|
|
|
}
|
|
|
|
|
|
|
|
// update resync metrics per object
|
|
|
|
func (sm *siteResyncMetrics) updateMetric(roi ReplicateObjectInfo, success bool, resyncID string) {
|
|
|
|
if !globalSiteReplicationSys.isEnabled() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
sm.Lock()
|
|
|
|
defer sm.Unlock()
|
|
|
|
s := sm.resyncStatus[resyncID]
|
|
|
|
if success {
|
|
|
|
s.ReplicatedCount++
|
|
|
|
s.ReplicatedSize += roi.Size
|
|
|
|
} else {
|
|
|
|
s.FailedCount++
|
|
|
|
s.FailedSize += roi.Size
|
|
|
|
}
|
|
|
|
s.Bucket = roi.Bucket
|
|
|
|
s.Object = roi.Name
|
|
|
|
s.LastUpdate = UTCNow()
|
|
|
|
sm.resyncStatus[resyncID] = s
|
|
|
|
}
|
|
|
|
|
|
|
|
// Status returns current in-memory resync status for this deployment
|
|
|
|
func (sm *siteResyncMetrics) status(dID string) (rs SiteResyncStatus, err error) {
|
|
|
|
sm.RLock()
|
|
|
|
defer sm.RUnlock()
|
|
|
|
if rst, ok1 := sm.peerResyncMap[dID]; ok1 {
|
|
|
|
if st, ok2 := sm.resyncStatus[rst.resyncID]; ok2 {
|
|
|
|
return st.clone(), nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rs, errSRNoResync
|
|
|
|
}
|
|
|
|
|
|
|
|
// Status returns latest resync status for this deployment
|
|
|
|
func (sm *siteResyncMetrics) siteStatus(ctx context.Context, objAPI ObjectLayer, dID string) (rs SiteResyncStatus, err error) {
|
|
|
|
if !globalSiteReplicationSys.isEnabled() {
|
|
|
|
return rs, errSRNotEnabled
|
|
|
|
}
|
|
|
|
// check in-memory status
|
|
|
|
rs, err = sm.status(dID)
|
|
|
|
if err == nil {
|
|
|
|
return rs, nil
|
|
|
|
}
|
|
|
|
// check disk resync status
|
|
|
|
rs, err = loadSiteResyncMetadata(ctx, objAPI, dID)
|
|
|
|
if err != nil && err == errConfigNotFound {
|
|
|
|
return rs, nil
|
|
|
|
}
|
|
|
|
return rs, err
|
|
|
|
}
|