Improve expiration of tiered objects (#18926)

- Use a shared worker pool for all ILM expiry tasks
- Free version cleanup executes in a separate goroutine
- Add a free version only if removing the remote object fails
- Add ILM expiry metrics to the node namespace
- Move tier journal tasks to expiryState
- Remove unused on-disk journal for tiered objects pending deletion
- Distribute expiry tasks across workers such that the expiry of versions of
  the same object is serialized
- Ability to resize worker pool without server restart
- Make scaling down of expiryState workers' concurrency safe; Thanks
  @klauspost
- Add error logs when expiryState and transition state are not
  initialized (yet)
* metrics: Add missed tier journal entry tasks
* Initialize the ILM worker pool after the object layer
This commit is contained in:
Krishnan Parthasarathi
2024-03-01 21:11:03 -08:00
committed by GitHub
parent 325fd80687
commit a7577da768
28 changed files with 611 additions and 1131 deletions

View File

@@ -273,10 +273,14 @@ const (
vmemory = "virtual_memory_bytes"
cpu = "cpu_total_seconds"
expiryPendingTasks MetricName = "expiry_pending_tasks"
transitionPendingTasks MetricName = "transition_pending_tasks"
transitionActiveTasks MetricName = "transition_active_tasks"
transitionMissedTasks MetricName = "transition_missed_immediate_tasks"
expiryPendingTasks MetricName = "expiry_pending_tasks"
expiryMissedTasks MetricName = "expiry_missed_tasks"
expiryMissedFreeVersions MetricName = "expiry_missed_freeversions"
expiryMissedTierJournalTasks MetricName = "expiry_missed_tierjournal_tasks"
expiryNumWorkers MetricName = "expiry_num_workers"
transitionPendingTasks MetricName = "transition_pending_tasks"
transitionActiveTasks MetricName = "transition_active_tasks"
transitionMissedTasks MetricName = "transition_missed_immediate_tasks"
transitionedBytes MetricName = "transitioned_bytes"
transitionedObjects MetricName = "transitioned_objects"
@@ -2000,6 +2004,42 @@ func getILMNodeMetrics() *MetricsGroup {
expPendingTasks := Metric{
Description: getExpiryPendingTasksMD(),
}
expMissedTasks := Metric{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: expiryMissedTasks,
Help: "Number of object version expiry missed due to busy system",
Type: counterMetric,
},
}
expMissedFreeVersions := Metric{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: expiryMissedFreeVersions,
Help: "Number of free versions expiry missed due to busy system",
Type: counterMetric,
},
}
expMissedTierJournalTasks := Metric{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: expiryMissedTierJournalTasks,
Help: "Number of tier journal entries cleanup missed due to busy system",
Type: counterMetric,
},
}
expNumWorkers := Metric{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: expiryNumWorkers,
Help: "Number of workers expiring object versions currently",
Type: gaugeMetric,
},
}
trPendingTasks := Metric{
Description: getTransitionPendingTasksMD(),
}
@@ -2011,6 +2051,10 @@ func getILMNodeMetrics() *MetricsGroup {
}
if globalExpiryState != nil {
expPendingTasks.Value = float64(globalExpiryState.PendingTasks())
expMissedTasks.Value = float64(globalExpiryState.stats.MissedTasks())
expMissedFreeVersions.Value = float64(globalExpiryState.stats.MissedFreeVersTasks())
expMissedTierJournalTasks.Value = float64(globalExpiryState.stats.MissedTierJournalTasks())
expNumWorkers.Value = float64(globalExpiryState.stats.NumWorkers())
}
if globalTransitionState != nil {
trPendingTasks.Value = float64(globalTransitionState.PendingTasks())
@@ -2019,6 +2063,10 @@ func getILMNodeMetrics() *MetricsGroup {
}
return []Metric{
expPendingTasks,
expMissedTasks,
expMissedFreeVersions,
expMissedTierJournalTasks,
expNumWorkers,
trPendingTasks,
trActiveTasks,
trMissedTasks,