Performance improvements to SELECT API on certain query operations (#6752)

This improves the performance of certain queries dramatically,
such as 'count(*)' etc.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Roughly a 2.4x speedup (wall-clock time drops from 42.5s to 17.6s).

This PR avoids a lot of type conversions and instead relies on raw
sequences of data and interprets them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
2025-11-22 18:47:43 -05:00 · 2018-11-14 15:55:10 -08:00
parent f9779b24ad
commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions
--- a/pkg/s3select/format/csv/csv.go
+++ b/pkg/s3select/format/csv/csv.go
@@ -23,6 +23,8 @@ import (
 	"strconv"
 	"strings"

+	"github.com/tidwall/sjson"
+
 	"github.com/minio/minio/pkg/ioutil"
 	"github.com/minio/minio/pkg/s3select/format"
 )
@@ -96,7 +98,6 @@ func New(opts *Options) (format.Select, error) {
 	reader.stats.BytesScanned = opts.StreamSize
 	reader.stats.BytesProcessed = 0
 	reader.stats.BytesReturned = 0
-
 	reader.firstRow = nil

 	reader.reader.FieldsPerRecord = -1
@@ -120,7 +121,14 @@ func New(opts *Options) (format.Select, error) {

 // Replace the spaces in columnnames with underscores
 func cleanHeader(columns []string) []string {
-	for i := 0; i < len(columns); i++ {
+	for i := range columns {
+		// Even when a header row is present, some CSVs
+		// have a mix of empty and non-empty column
+		// names. Give each empty column an indexed
+		// name ("_" + position) instead.
+		if columns[i] == "" {
+			columns[i] = "_" + strconv.Itoa(i)
+		}
 		columns[i] = strings.Replace(columns[i], " ", "_", -1)
 	}
 	return columns
@@ -137,15 +145,14 @@ func (reader *cinput) readHeader() error {
 		}
 		reader.header = cleanHeader(reader.firstRow)
 		reader.firstRow = nil
-		reader.minOutputLength = len(reader.header)
 	} else {
 		reader.firstRow, readErr = reader.reader.Read()
 		reader.header = make([]string, len(reader.firstRow))
-		for i := 0; i < reader.minOutputLength; i++ {
-			reader.header[i] = strconv.Itoa(i)
+		for i := range reader.firstRow {
+			reader.header[i] = "_" + strconv.Itoa(i)
 		}
-
 	}
+	reader.minOutputLength = len(reader.header)
 	return nil
 }

@@ -155,33 +162,24 @@ func (reader *cinput) Progress() bool {
 }

 // UpdateBytesProcessed - populates the bytes Processed
-func (reader *cinput) UpdateBytesProcessed(record map[string]interface{}) {
-	// Convert map to slice of values.
-	values := []string{}
-	for _, value := range record {
-		values = append(values, value.(string))
-	}
-
-	reader.stats.BytesProcessed += int64(len(values))
+func (reader *cinput) UpdateBytesProcessed(size int64) {
+	reader.stats.BytesProcessed += size

 }

-// Read the file and returns map[string]interface{}
-func (reader *cinput) Read() (map[string]interface{}, error) {
-	record := make(map[string]interface{})
+// Read returns byte sequence
+func (reader *cinput) Read() ([]byte, error) {
 	dec := reader.readRecord()
 	if dec != nil {
-		if reader.options.HasHeader {
-			columns := reader.header
-			for i, value := range dec {
-				record[columns[i]] = value
-			}
-		} else {
-			for i, value := range dec {
-				record["_"+strconv.Itoa(i)] = value
+		var data []byte
+		var err error
+		for i, value := range dec {
+			data, err = sjson.SetBytes(data, reader.header[i], value)
+			if err != nil {
+				return nil, err
 			}
 		}
-		return record, nil
+		return data, nil
 	}
 	return nil, nil
 }
--- a/pkg/s3select/format/json/json.go
+++ b/pkg/s3select/format/json/json.go
@@ -17,11 +17,10 @@
 package json

 import (
-	"encoding/json"
+	"bufio"
 	"encoding/xml"
 	"io"

-	jsoniter "github.com/json-iterator/go"
 	"github.com/minio/minio/pkg/s3select/format"
 )

@@ -57,7 +56,7 @@ type Options struct {
 // jinput represents a record producing input from a  formatted file or pipe.
 type jinput struct {
 	options         *Options
-	reader          *jsoniter.Decoder
+	reader          *bufio.Reader
 	firstRow        []string
 	header          []string
 	minOutputLength int
@@ -75,7 +74,7 @@ type jinput struct {
 func New(opts *Options) (format.Select, error) {
 	reader := &jinput{
 		options: opts,
-		reader:  jsoniter.NewDecoder(opts.ReadFrom),
+		reader:  bufio.NewReader(opts.ReadFrom),
 	}
 	reader.stats.BytesScanned = opts.StreamSize
 	reader.stats.BytesProcessed = 0
@@ -90,26 +89,21 @@ func (reader *jinput) Progress() bool {
 }

 // UpdateBytesProcessed - populates the bytes Processed
-func (reader *jinput) UpdateBytesProcessed(record map[string]interface{}) {
-	out, _ := json.Marshal(record)
-	reader.stats.BytesProcessed += int64(len(out))
+func (reader *jinput) UpdateBytesProcessed(size int64) {
+	reader.stats.BytesProcessed += size
 }

-// Read the file and returns map[string]interface{}
-func (reader *jinput) Read() (map[string]interface{}, error) {
-	dec := reader.reader
-	var record interface{}
-	for {
-		err := dec.Decode(&record)
+// Read returns the next line of input (up to and including '\n') as raw bytes.
+func (reader *jinput) Read() ([]byte, error) {
+	data, err := reader.reader.ReadBytes('\n')
+	if err != nil {
 		if err == io.EOF || err == io.ErrClosedPipe {
-			break
+			err = nil
+		} else {
+			err = format.ErrJSONParsingError
 		}
-		if err != nil {
-			return nil, format.ErrJSONParsingError
-		}
-		return record.(map[string]interface{}), nil
 	}
-	return nil, nil
+	return data, err
 }

 // OutputFieldDelimiter - returns the delimiter specified in input request
--- a/pkg/s3select/format/select.go
+++ b/pkg/s3select/format/select.go
@@ -22,11 +22,11 @@ import "encoding/xml"
 // https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
 type Select interface {
 	Type() Type
-	Read() (map[string]interface{}, error)
+	Read() ([]byte, error)
 	Header() []string
 	HasHeader() bool
 	OutputFieldDelimiter() string
-	UpdateBytesProcessed(record map[string]interface{})
+	UpdateBytesProcessed(int64)
 	Expression() string
 	UpdateBytesReturned(int64)
 	CreateStatXML() (string, error)