add data update tracking using bloom filter (#9208)

By monitoring PUT/DELETE and heal operations, it is possible to track changed paths and keep a bloom filter for this data. This can help prioritize paths to scan. The bloom filter can identify paths that have not changed, and the few collisions will only result in a marginal extra workload. This can be implemented at a bucket+(1 prefix level) granularity with reasonable performance. The bloom filter is set to have a false positive rate of 1% at 1M entries. A bloom table of this size is about 2500 bytes when serialized. To avoid forcing a full scan of all paths that have changed, `cycle` bloom filters need to be kept, so that we can guarantee that dirty paths have been scanned within `cycle` runs. Until `cycle` bloom filters have been collected, all paths are considered dirty.
2025-11-20 18:06:10 -05:00 · 2020-04-27 19:06:21 +02:00
parent eff4127efd
commit 073aac3d92
24 changed files with 1270 additions and 61 deletions
--- a/cmd/data-usage-cache.go
+++ b/cmd/data-usage-cache.go
@@ -20,6 +20,7 @@ import (
 	"bytes"
 	"context"
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"io"
 	"path"
@@ -62,9 +63,10 @@ type dataUsageEntryInfo struct {

 type dataUsageCacheInfo struct {
 	// Name of the bucket. Also root element.
-	Name       string
-	LastUpdate time.Time
-	NextCycle  uint8
+	Name        string
+	LastUpdate  time.Time
+	NextCycle   uint32 // NOTE(review): presumably the cycle value passed to dataUsageHash.mod — confirm against callers
+	BloomFilter []byte `msg:"BloomFilter,omitempty"` // NOTE(review): presumably the serialized bloom filter of changed paths — confirm
 }

 // merge other data usage entry into this, excluding children.
@@ -77,8 +79,8 @@ func (e *dataUsageEntry) merge(other dataUsageEntry) {
 }

 // mod reports whether uint32(h) % cycles equals cycle % cycles.
 // The uint32 conversion keeps only the low 32 bits of a wider hash; cycles must be non-zero, since integer division by zero panics.
-func (h dataUsageHash) mod(cycle uint8, cycles uint8) bool {
-	return uint8(h)%cycles == cycle%cycles
+func (h dataUsageHash) mod(cycle uint32, cycles uint32) bool {
+	return uint32(h)%cycles == cycle%cycles
 }

 // addChildString will add a child based on its name.
@@ -110,6 +112,7 @@ func (d *dataUsageCache) find(path string) *dataUsageEntry {
 }

 // dui converts the flattened version of the path to DataUsageInfo.
+// As a side effect d will be flattened, use a clone if this is not ok.
 func (d *dataUsageCache) dui(path string, buckets []BucketInfo) DataUsageInfo {
 	e := d.find(path)
 	if e == nil {
@@ -158,6 +161,32 @@ func (d *dataUsageCache) replaceHashed(hash dataUsageHash, parent *dataUsageHash
 	}
 }

+// copyWithChildren copies the entry for hash from src into d and recurses into each of its children; it returns without copying when src has no such entry.
+// If parent is non-nil, hash is added as a child of d.Cache[*parent]; a parent that is missing from d starts from the zero-value entry.
+// Only a direct self-reference (a child equal to hash) is detected; it is logged and the function returns before linking the parent.
+func (d *dataUsageCache) copyWithChildren(src *dataUsageCache, hash dataUsageHash, parent *dataUsageHash) {
+	if d.Cache == nil {
+		d.Cache = make(map[dataUsageHash]dataUsageEntry, 100) // lazily allocate the destination map
+	}
+	e, ok := src.Cache[hash]
+	if !ok {
+		return // nothing to copy
+	}
+	d.Cache[hash] = e // NOTE(review): plain assignment; e.Children is presumably shared with src rather than deep-copied — confirm
+	for ch := range e.Children {
+		if ch == hash {
+			logger.LogIf(GlobalContext, errors.New("dataUsageCache.copyWithChildren: Circular reference"))
+			return // the parent link below is skipped on this path
+		}
+		d.copyWithChildren(src, ch, &hash)
+	}
+	if parent != nil {
+		p := d.Cache[*parent] // zero-value entry if the parent is not in d yet
+		p.addChild(hash)
+		d.Cache[*parent] = p // write back: the map stores entries by value
+	}
+}
+
 // StringAll returns a detailed string representation of all entries in the cache.
 func (d *dataUsageCache) StringAll() string {
 	s := fmt.Sprintf("info:%+v\n", d.Info)
@@ -167,6 +196,12 @@ func (d *dataUsageCache) StringAll() string {
 	return strings.TrimSpace(s)
 }

+// bytes writes the hash into the first 8 bytes of dst in little-endian byte order.
+// dst must be at least dataUsageHashLen bytes long.
+func (h dataUsageHash) bytes(dst []byte) {
+	binary.LittleEndian.PutUint64(dst, uint64(h)) // panics if len(dst) < 8
+}
+
 // String returns a human readable representation of the string.
 func (h dataUsageHash) String() string {
 	return fmt.Sprintf("%x", uint64(h))
@@ -297,7 +332,7 @@ func (d *dataUsageCache) load(ctx context.Context, store ObjectLayer, name strin
 	var buf bytes.Buffer
 	err := store.GetObject(ctx, dataUsageBucket, name, 0, -1, &buf, "", ObjectOptions{})
 	if err != nil {
-		if !isErrObjectNotFound(err) {
+		if !isErrObjectNotFound(err) && !isErrBucketNotFound(err) {
 			return toObjectErr(err, dataUsageBucket, name)
 		}
 		*d = dataUsageCache{}