fix: reduce crawler memory usage by orders of magnitude (#11556)

Currently the crawler waits for the entire readdir call to
return before it processes usage, lifecycle, replication
and healing. Instead, we pass the applicator all the way
down so we never build a separate list of all the contents
of a single directory (see the sketch after the list below).

This allows for

- no need to remember the entire list of entries per directory
  before applying the required functions
- no need to wait for the entire readdir() call to finish
  before applying the required functions
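For illustration only, here is a minimal, self-contained sketch of the streaming approach. `walkDirStreaming`, the batch size of 64, and the `apply` callback are invented for this example; this is not MinIO's actual crawler code.

```go
// Minimal sketch, not MinIO's implementation: hand every directory entry to an
// "applicator" callback as soon as it is read, instead of collecting the whole
// listing into a slice first. Memory use is bounded by the batch size.
package main

import (
	"fmt"
	"io"
	"os"
)

// walkDirStreaming reads a directory in batches of 64 entries and applies
// `apply` to each entry immediately.
func walkDirStreaming(dir string, apply func(name string, typ os.FileMode) error) error {
	f, err := os.Open(dir)
	if err != nil {
		return err
	}
	defer f.Close()

	for {
		entries, err := f.ReadDir(64) // at most 64 entries held in memory at a time
		for _, e := range entries {
			if applyErr := apply(e.Name(), e.Type()); applyErr != nil {
				return applyErr
			}
		}
		if err == io.EOF {
			return nil // end of directory
		}
		if err != nil {
			return err
		}
	}
}

func main() {
	// Example applicator: just print each entry; the crawler would update
	// usage, lifecycle, replication and healing state here instead.
	err := walkDirStreaming(".", func(name string, typ os.FileMode) error {
		fmt.Println(name, typ)
		return nil
	})
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}
```

With this shape the per-directory memory cost no longer grows with the number of entries, which is where the reduction claimed in the title comes from.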
Author: Harshavardhana
Date: 2021-02-17 15:34:42 -08:00
Committed by: GitHub
Commit: 289e1d8b2a (parent e07918abe3)
8 changed files with 47 additions and 63 deletions

@@ -407,19 +407,19 @@ func (f *folderScanner) scanQueuedLevels(ctx context.Context, folders []cachedFo
 		if f.dataUsageCrawlDebug {
 			console.Debugf(scannerLogPrefix+" no bucket (%s,%s)\n", f.root, entName)
 		}
-		return nil
+		return errDoneForNow
 	}
 	if isReservedOrInvalidBucket(bucket, false) {
 		if f.dataUsageCrawlDebug {
 			console.Debugf(scannerLogPrefix+" invalid bucket: %v, entry: %v\n", bucket, entName)
 		}
-		return nil
+		return errDoneForNow
 	}
 	select {
 	case <-done:
-		return ctx.Err()
+		return errDoneForNow
 	default:
 	}
@@ -682,7 +682,7 @@ func (f *folderScanner) deepScanFolder(ctx context.Context, folder cachedFolder,
 	addDir = func(entName string, typ os.FileMode) error {
 		select {
 		case <-done:
-			return ctx.Err()
+			return errDoneForNow
 		default:
 		}
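Both hunks replace `return nil` / `return ctx.Err()` with `errDoneForNow`, a sentinel error that lets the callback stop the directory walk early while the caller treats the stop as expected rather than as a failure. Below is a rough standalone sketch of that pattern, assuming a hypothetical `stopWalk` sentinel and the standard library's filepath.WalkDir rather than the crawler's own walk helpers.

```go
// Rough sketch of the sentinel-error pattern, assuming a hypothetical stopWalk
// sentinel and filepath.WalkDir; errDoneForNow plays the same role in the diff.
package main

import (
	"errors"
	"fmt"
	"io/fs"
	"path/filepath"
)

// stopWalk means "stop walking now"; it is expected and not a real failure.
var stopWalk = errors.New("done for now")

func walk(root string, done <-chan struct{}) error {
	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		select {
		case <-done:
			// Returning the sentinel aborts the walk immediately.
			return stopWalk
		default:
		}
		fmt.Println(path)
		return nil
	})
	if errors.Is(err, stopWalk) {
		return nil // early termination is expected, not an error
	}
	return err
}

func main() {
	done := make(chan struct{})
	if err := walk(".", done); err != nil {
		fmt.Println("walk failed:", err)
	}
}
```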