Add support for audit/logger max retry and retry interval (#20402)

Current implementation retries forever until our
log buffer is full, and we start dropping events.

This PR allows you to set a value until we give
up on existing audit/logger batches to proceed to
process the new ones.

Bonus:
 - do not blow up buffers beyond batchSize value
 - do not leak the ticker if the worker returns
This commit is contained in:
Harshavardhana
2024-09-08 05:15:09 -07:00
committed by GitHub
parent 3f39da48ea
commit 8268c12cfb
4 changed files with 162 additions and 77 deletions

View File

@@ -79,6 +79,8 @@ type Config struct {
BatchSize int `json:"batchSize"`
QueueSize int `json:"queueSize"`
QueueDir string `json:"queueDir"`
MaxRetry int `json:"maxRetry"`
RetryIntvl time.Duration `json:"retryInterval"`
Proxy string `json:"string"`
Transport http.RoundTripper `json:"-"`
@@ -227,6 +229,7 @@ func (h *Target) send(ctx context.Context, payload []byte, payloadCount int, pay
if xnet.IsNetworkOrHostDown(err, false) {
h.status.Store(statusOffline)
}
h.failedMessages.Add(int64(payloadCount))
} else {
h.status.Store(statusOnline)
}
@@ -257,7 +260,6 @@ func (h *Target) send(ctx context.Context, payload []byte, payloadCount int, pay
resp, err := h.client.Do(req)
if err != nil {
h.failedMessages.Add(int64(payloadCount))
return fmt.Errorf("%s returned '%w', please check your endpoint configuration", h.Endpoint(), err)
}
@@ -268,10 +270,8 @@ func (h *Target) send(ctx context.Context, payload []byte, payloadCount int, pay
// accepted HTTP status codes.
return nil
} else if resp.StatusCode == http.StatusForbidden {
h.failedMessages.Add(int64(payloadCount))
return fmt.Errorf("%s returned '%s', please check if your auth token is correctly set", h.Endpoint(), resp.Status)
}
h.failedMessages.Add(int64(payloadCount))
return fmt.Errorf("%s returned '%s', please check your endpoint configuration", h.Endpoint(), resp.Status)
}
@@ -326,9 +326,6 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
}
}()
var entry interface{}
var ok bool
var err error
lastBatchProcess := time.Now()
buf := bytebufferpool.Get()
@@ -343,61 +340,76 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
globalBuffer := logChBuffers[name]
logChLock.Unlock()
newTicker := time.NewTicker(time.Second)
isTick := false
var count int
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
var count int
for {
isTick = false
select {
case _ = <-newTicker.C:
isTick = true
case entry, _ = <-globalBuffer:
case entry, ok = <-h.logCh:
if !ok {
var (
ok bool
entry any
)
if count < h.batchSize {
tickered := false
select {
case _ = <-ticker.C:
tickered = true
case entry, _ = <-globalBuffer:
case entry, ok = <-h.logCh:
if !ok {
return
}
case <-ctx.Done():
return
}
case <-ctx.Done():
return
}
if !isTick {
h.totalMessages.Add(1)
if !isDirQueue {
if err := enc.Encode(&entry); err != nil {
h.config.LogOnceIf(
ctx,
fmt.Errorf("unable to encode webhook log entry, err '%w' entry: %v\n", err, entry),
h.Name(),
)
h.failedMessages.Add(1)
continue
if !tickered {
h.totalMessages.Add(1)
if !isDirQueue {
if err := enc.Encode(&entry); err != nil {
h.config.LogOnceIf(
ctx,
fmt.Errorf("unable to encode webhook log entry, err '%w' entry: %v\n", err, entry),
h.Name(),
)
h.failedMessages.Add(1)
continue
}
} else {
entries = append(entries, entry)
}
count++
} else {
entries = append(entries, entry)
count++
}
}
if count != h.batchSize {
if len(h.logCh) > 0 || len(globalBuffer) > 0 || count == 0 {
// there is something in the log queue
// process it first, even if we tickered
// first, or we have not received any events
// yet, still wait on it.
continue
}
if h.batchSize > 1 {
// If we are doing batching, we should wait
// at least one second before sending.
// Even if there is nothing in the queue.
if time.Since(lastBatchProcess).Seconds() < 1 {
continue
}
// If we are doing batching, we should wait
// at least for a second, before sending.
// Even if there is nothing in the queue.
if h.batchSize > 1 && time.Since(lastBatchProcess) < time.Second {
continue
}
}
// if we have reached the count send at once
// or we have crossed last second before batch was sent, send at once
lastBatchProcess = time.Now()
var retries int
retryIntvl := h.config.RetryIntvl
if retryIntvl <= 0 {
retryIntvl = 3 * time.Second
}
maxRetries := h.config.MaxRetry
retry:
// If the channel reaches above half capacity
// we spawn more workers. The workers spawned
@@ -415,6 +427,7 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
}
}
var err error
if !isDirQueue {
err = h.send(ctx, buf.Bytes(), count, h.payloadType, webhookCallTimeout)
} else {
@@ -422,18 +435,24 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
}
if err != nil {
h.config.LogOnceIf(
context.Background(),
fmt.Errorf("unable to send webhook log entry(s) to '%s' err '%w': %d", name, err, count),
name,
)
if errors.Is(err, context.Canceled) {
return
}
time.Sleep(3 * time.Second)
goto retry
h.config.LogOnceIf(
context.Background(),
fmt.Errorf("unable to send audit/log entry(s) to '%s' err '%w': %d", name, err, count),
name,
)
time.Sleep(retryIntvl)
if maxRetries == 0 {
goto retry
}
retries++
if retries <= maxRetries {
goto retry
}
}
entries = make([]interface{}, 0)