Add support for audit/logger max retry and retry interval (#20402)

The current implementation retries a failed batch forever; meanwhile our log buffer fills up and we start dropping events. This PR adds a configurable maximum retry count and retry interval, so that we can give up on the existing audit/logger batch and proceed to process new ones. Bonus: - do not grow buffers beyond the batchSize value - do not leak the ticker when the worker returns
2025-11-07 21:02:58 -05:00 · 2024-09-08 05:15:09 -07:00
parent 3f39da48ea
commit 8268c12cfb
4 changed files with 162 additions and 77 deletions
--- a/internal/logger/target/http/http.go
+++ b/internal/logger/target/http/http.go
@@ -79,6 +79,8 @@ type Config struct {
 	BatchSize  int               `json:"batchSize"`
 	QueueSize  int               `json:"queueSize"`
 	QueueDir   string            `json:"queueDir"`
+	MaxRetry   int               `json:"maxRetry"`
+	RetryIntvl time.Duration     `json:"retryInterval"`
 	Proxy      string            `json:"string"`
 	Transport  http.RoundTripper `json:"-"`

@@ -227,6 +229,7 @@ func (h *Target) send(ctx context.Context, payload []byte, payloadCount int, pay
 			if xnet.IsNetworkOrHostDown(err, false) {
 				h.status.Store(statusOffline)
 			}
+			h.failedMessages.Add(int64(payloadCount))
 		} else {
 			h.status.Store(statusOnline)
 		}
@@ -257,7 +260,6 @@ func (h *Target) send(ctx context.Context, payload []byte, payloadCount int, pay

 	resp, err := h.client.Do(req)
 	if err != nil {
-		h.failedMessages.Add(int64(payloadCount))
 		return fmt.Errorf("%s returned '%w', please check your endpoint configuration", h.Endpoint(), err)
 	}

@@ -268,10 +270,8 @@ func (h *Target) send(ctx context.Context, payload []byte, payloadCount int, pay
 		// accepted HTTP status codes.
 		return nil
 	} else if resp.StatusCode == http.StatusForbidden {
-		h.failedMessages.Add(int64(payloadCount))
 		return fmt.Errorf("%s returned '%s', please check if your auth token is correctly set", h.Endpoint(), resp.Status)
 	}
-	h.failedMessages.Add(int64(payloadCount))
 	return fmt.Errorf("%s returned '%s', please check your endpoint configuration", h.Endpoint(), resp.Status)
 }

@@ -326,9 +326,6 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
 		}
 	}()

-	var entry interface{}
-	var ok bool
-	var err error
 	lastBatchProcess := time.Now()

 	buf := bytebufferpool.Get()
@@ -343,61 +340,76 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
 	globalBuffer := logChBuffers[name]
 	logChLock.Unlock()

-	newTicker := time.NewTicker(time.Second)
-	isTick := false
-	var count int
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()

+	var count int
 	for {
-		isTick = false
-		select {
-		case _ = <-newTicker.C:
-			isTick = true
-		case entry, _ = <-globalBuffer:
-		case entry, ok = <-h.logCh:
-			if !ok {
+		var (
+			ok    bool
+			entry any
+		)
+
+		if count < h.batchSize {
+			tickered := false
+			select {
+			case _ = <-ticker.C:
+				tickered = true
+			case entry, _ = <-globalBuffer:
+			case entry, ok = <-h.logCh:
+				if !ok {
+					return
+				}
+			case <-ctx.Done():
 				return
 			}
-		case <-ctx.Done():
-			return
-		}

-		if !isTick {
-			h.totalMessages.Add(1)
-
-			if !isDirQueue {
-				if err := enc.Encode(&entry); err != nil {
-					h.config.LogOnceIf(
-						ctx,
-						fmt.Errorf("unable to encode webhook log entry, err  '%w' entry: %v\n", err, entry),
-						h.Name(),
-					)
-					h.failedMessages.Add(1)
-					continue
+			if !tickered {
+				h.totalMessages.Add(1)
+				if !isDirQueue {
+					if err := enc.Encode(&entry); err != nil {
+						h.config.LogOnceIf(
+							ctx,
+							fmt.Errorf("unable to encode webhook log entry, err  '%w' entry: %v\n", err, entry),
+							h.Name(),
+						)
+						h.failedMessages.Add(1)
+						continue
+					}
+				} else {
+					entries = append(entries, entry)
 				}
 				count++
-			} else {
-				entries = append(entries, entry)
-				count++
 			}
-		}

-		if count != h.batchSize {
 			if len(h.logCh) > 0 || len(globalBuffer) > 0 || count == 0 {
+				// there is something in the log queue
+				// process it first, even if we tickered
+				// first, or we have not received any events
+				// yet, still wait on it.
 				continue
 			}

-			if h.batchSize > 1 {
-				// If we are doing batching, we should wait
-				// at least one second before sending.
-				// Even if there is nothing in the queue.
-				if time.Since(lastBatchProcess).Seconds() < 1 {
-					continue
-				}
+			// If we are doing batching, we should wait
+			// at least for a second, before sending.
+			// Even if there is nothing in the queue.
+			if h.batchSize > 1 && time.Since(lastBatchProcess) < time.Second {
+				continue
 			}
 		}

+		// if we have reached the count send at once
+		// or we have crossed last second before batch was sent, send at once
 		lastBatchProcess = time.Now()

+		var retries int
+		retryIntvl := h.config.RetryIntvl
+		if retryIntvl <= 0 {
+			retryIntvl = 3 * time.Second
+		}
+
+		maxRetries := h.config.MaxRetry
+
 	retry:
 		// If the channel reaches above half capacity
 		// we spawn more workers. The workers spawned
@@ -415,6 +427,7 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
 			}
 		}

+		var err error
 		if !isDirQueue {
 			err = h.send(ctx, buf.Bytes(), count, h.payloadType, webhookCallTimeout)
 		} else {
@@ -422,18 +435,24 @@ func (h *Target) startQueueProcessor(ctx context.Context, mainWorker bool) {
 		}

 		if err != nil {
-			h.config.LogOnceIf(
-				context.Background(),
-				fmt.Errorf("unable to send webhook log entry(s) to '%s' err '%w': %d", name, err, count),
-				name,
-			)
-
 			if errors.Is(err, context.Canceled) {
 				return
 			}

-			time.Sleep(3 * time.Second)
-			goto retry
+			h.config.LogOnceIf(
+				context.Background(),
+				fmt.Errorf("unable to send audit/log entry(s) to '%s' err '%w': %d", name, err, count),
+				name,
+			)
+
+			time.Sleep(retryIntvl)
+			if maxRetries == 0 {
+				goto retry
+			}
+			retries++
+			if retries <= maxRetries {
+				goto retry
+			}
 		}

 		entries = make([]interface{}, 0)