Use concurrent bz2 decompression (#13360)

Testing with `mc sql --compression BZIP2 --csv-input "rd=\n,fh=USE,fd=;" --query="select COUNT(*) from S3Object" local2/testbucket/nyc-taxi-data-10M.csv.bz2`

Before: 96.98s. After: 10.79s (roughly 9x faster). Uses about 70% CPU while running.
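For context, here is a minimal standalone sketch (not part of this commit) of the pbzip2 API the patch switches to, using the same concurrency heuristic as the diff below; the input filename is hypothetical:

    package main

    import (
        "context"
        "io"
        "os"
        "runtime"

        "github.com/cosnicolaou/pbzip2"
    )

    func main() {
        f, err := os.Open("input.csv.bz2") // hypothetical input file
        if err != nil {
            panic(err)
        }
        defer f.Close()

        // Cancelling the context tears down pbzip2's decompression
        // goroutines; the patch wires this into the reader's Close path.
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()

        // Same concurrency heuristic as the patch: roughly half of GOMAXPROCS.
        conc := (runtime.GOMAXPROCS(0) + 1) / 2
        r := pbzip2.NewReader(ctx, f,
            pbzip2.DecompressionOptions(pbzip2.BZConcurrency(conc)))

        // Stream the decompressed bytes to stdout.
        if _, err := io.Copy(os.Stdout, r); err != nil {
            panic(err)
        }
    }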
Commit: 5e53f767c4 (parent 974073a2e5)
Author: Klaus Post
Date: 2021-10-14 11:11:07 -07:00
Committed by: GitHub

4 changed files with 31 additions and 4 deletions

internal/s3select/progress.go

@@ -18,13 +18,15 @@
 package s3select
 
 import (
-	"compress/bzip2"
+	"context"
 	"errors"
 	"fmt"
 	"io"
+	"runtime"
 	"sync"
 	"sync/atomic"
 
+	"github.com/cosnicolaou/pbzip2"
 	"github.com/klauspost/compress/s2"
 	"github.com/klauspost/compress/zstd"
 	gzip "github.com/klauspost/pgzip"
@@ -121,7 +123,9 @@ func newProgressReader(rc io.ReadCloser, compType CompressionType) (*progressRea
 		r = gzr
 		pr.closer = gzr
 	case bzip2Type:
-		r = bzip2.NewReader(scannedReader)
+		ctx, cancel := context.WithCancel(context.Background())
+		r = pbzip2.NewReader(ctx, scannedReader, pbzip2.DecompressionOptions(pbzip2.BZConcurrency((runtime.GOMAXPROCS(0)+1)/2)))
+		pr.closer = &nopReadCloser{fn: cancel}
 	case zstdType:
 		// Set a max window of 64MB. More than reasonable.
 		zr, err := zstd.NewReader(scannedReader, zstd.WithDecoderConcurrency(2), zstd.WithDecoderMaxWindow(64<<20))
@@ -143,3 +147,19 @@ func newProgressReader(rc io.ReadCloser, compType CompressionType) (*progressRea
 	return &pr, nil
 }
+
+type nopReadCloser struct {
+	fn func()
+}
+
+func (n2 *nopReadCloser) Read(p []byte) (n int, err error) {
+	panic("should not be called")
+}
+
+func (n2 *nopReadCloser) Close() error {
+	if n2.fn != nil {
+		n2.fn()
+	}
+	n2.fn = nil
+	return nil
+}
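The new nopReadCloser exists only to run the context cancel when the progress reader is closed; reads always go through the pbzip2 reader itself, which is why Read panics. A self-contained sketch (a hypothetical driver restating the type from the diff above) of how Close behaves:

    package main

    import (
        "context"
        "fmt"
    )

    // Same shape as the patch's type: an io.ReadCloser whose only job
    // is to forward Close to a stored cancel func.
    type nopReadCloser struct {
        fn func()
    }

    func (n2 *nopReadCloser) Read(p []byte) (n int, err error) {
        panic("should not be called")
    }

    func (n2 *nopReadCloser) Close() error {
        if n2.fn != nil {
            n2.fn()
        }
        n2.fn = nil // clearing fn makes repeat Close calls no-ops
        return nil
    }

    func main() {
        ctx, cancel := context.WithCancel(context.Background())
        c := &nopReadCloser{fn: cancel}

        c.Close() // in the patch, this cancels pbzip2's context
        c.Close() // safe: fn was cleared on the first call

        fmt.Println(ctx.Err()) // prints "context canceled"
    }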