Use concurrent bz2 decompression (#13360)

Tested with `mc sql --compression BZIP2 --csv-input "rd=\n,fh=USE,fd=;" --query="select COUNT(*) from S3Object" local2/testbucket/nyc-taxi-data-10M.csv.bz2`

Before: 96.98s; after: 10.79s. Uses about 70% CPU while running.
Klaus Post
2021-10-14 11:11:07 -07:00
committed by GitHub
parent 974073a2e5
commit 5e53f767c4
4 changed files with 31 additions and 4 deletions

cmd/untar.go

```diff
@@ -21,12 +21,14 @@ import (
 	"archive/tar"
 	"bufio"
 	"bytes"
-	"compress/bzip2"
+	"context"
 	"fmt"
 	"io"
 	"os"
 	"path"
+	"runtime"
 
+	"github.com/cosnicolaou/pbzip2"
 	"github.com/klauspost/compress/s2"
 	"github.com/klauspost/compress/zstd"
 	gzip "github.com/klauspost/pgzip"
@@ -112,7 +114,9 @@ func untar(r io.Reader, putObject func(reader io.Reader, info os.FileInfo, name
 		defer dec.Close()
 		r = dec
 	case formatBZ2:
-		r = bzip2.NewReader(bf)
+		ctx, cancel := context.WithCancel(context.Background())
+		defer cancel()
+		r = pbzip2.NewReader(ctx, bf, pbzip2.DecompressionOptions(pbzip2.BZConcurrency((runtime.GOMAXPROCS(0)+1)/2)))
 	case formatLZ4:
 		r = lz4.NewReader(bf)
 	case formatUnknown:
```
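
For context, a minimal standalone sketch of the same pbzip2 call pattern introduced in this diff. The input filename and the drain-and-count plumbing are illustrative assumptions; the reader construction, including the `BZConcurrency` option derived from `runtime.GOMAXPROCS(0)`, is taken verbatim from the change above.

```go
package main

import (
	"context"
	"io"
	"log"
	"os"
	"runtime"

	"github.com/cosnicolaou/pbzip2"
)

func main() {
	// Hypothetical input file; any bzip2 stream works.
	f, err := os.Open("nyc-taxi-data-10M.csv.bz2")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Cancelling the context tears down the decompression goroutines,
	// mirroring the defer cancel() in the commit.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Same concurrency choice as the commit: roughly half the available CPUs.
	r := pbzip2.NewReader(ctx, f,
		pbzip2.DecompressionOptions(pbzip2.BZConcurrency((runtime.GOMAXPROCS(0)+1)/2)))

	// Drain the stream and report how many decompressed bytes came out.
	n, err := io.Copy(io.Discard, r)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("decompressed %d bytes", n)
}
```

With `GOMAXPROCS(0)` reporting 8, `(8+1)/2` yields 4 decompression workers, deliberately capping the work below full saturation; that lines up with the benchmark using only part of the machine's CPU rather than all of it.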