Performance improvements to SELECT API on certain query operations (#6752)

This dramatically improves the performance of certain queries,
such as `count(*)`.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Roughly a 2.4x speedup (42.5s down to 17.6s). The PR avoids a large
number of intermediate type conversions: records are kept as raw byte
sequences and their fields are interpreted lazily, only when a query
expression actually needs them.
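
For illustration, here is a minimal, hypothetical sketch of the idea (the record contents and function names are invented, not the PR's actual evaluator): keep each record as raw bytes and let gjson pull out a field only when an expression needs it, so `count(*)` never converts fields at all.

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/tidwall/gjson"
)

var records = [][]byte{
	[]byte(`{"Col1":"1","Col2":"2"}`),
	[]byte(`{"Col1":"3","Col2":"4"}`),
}

// eagerCount decodes every field of every record into typed Go
// values up front, which is wasted work for count(*), which needs none.
func eagerCount() int {
	n := 0
	for _, rec := range records {
		var m map[string]interface{}
		if err := json.Unmarshal(rec, &m); err == nil {
			n++
		}
	}
	return n
}

// lazySumCol1 keeps the raw bytes and lets gjson parse only the
// one field the expression touches, at the moment it touches it.
func lazySumCol1() float64 {
	sum := 0.0
	for _, rec := range records {
		sum += gjson.GetBytes(rec, "Col1").Float()
	}
	return sum
}

func main() {
	fmt.Println(eagerCount(), lazySumCol1()) // 2 4
}
```

The allocation deltas in the benchmark table below reflect exactly this effect: the aggregate queries touch few or no fields, so skipping the up-front conversions removes most of the per-record allocations.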

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
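For context: benchcmp (from golang.org/x/tools/cmd/benchcmp) compares the ns/op, allocation, and byte counts of two `go test -bench` runs; the `BenchmarkSQLAggregate_*` functions it is comparing here are added by this PR in the diff below.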
Harshavardhana, 2018-11-14 15:55:10 -08:00 (committed by kannappanr)
parent f9779b24ad, commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions


@@ -17,34 +17,20 @@
package s3select
import (
"bytes"
"encoding/csv"
"fmt"
"reflect"
"math/rand"
"strconv"
"testing"
"time"
humanize "github.com/dustin/go-humanize"
"github.com/tidwall/gjson"
"github.com/minio/minio/pkg/s3select/format"
)
// Unit Test for the checkForDuplicates function.
func TestCheckForDuplicates(t *testing.T) {
tables := []struct {
myReq []string
myHeaders map[string]int
myDup map[string]bool
myLow map[string]int
myErr error
}{
{[]string{"name", "id", "last_name", "last_name"}, make(map[string]int), make(map[string]bool), make(map[string]int), ErrAmbiguousFieldName},
{[]string{"name", "id", "last_name", "another_name"}, make(map[string]int), make(map[string]bool), make(map[string]int), nil},
}
for _, table := range tables {
err := checkForDuplicates(table.myReq, table.myHeaders, table.myDup, table.myLow)
if err != table.myErr {
t.Error()
}
}
}
// This function returns the index of a string in a list
func stringIndex(a string, list []string) int {
for i, v := range list {
@@ -55,9 +41,9 @@ func stringIndex(a string, list []string) int {
return -1
}
// TestMyHelperFunctions is a unit test which tests some small helper string
// functions.
func TestMyHelperFunctions(t *testing.T) {
// TestHelperFunctions is a unit test which tests some
// small helper string functions.
func TestHelperFunctions(t *testing.T) {
tables := []struct {
myReq string
myList []string
@@ -78,37 +64,44 @@ func TestMyHelperFunctions(t *testing.T) {
}
}
// TestMyStateMachine is a unit test which ensures that the lowest level of the
// TestStateMachine is a unit test which ensures that the lowest level of the
// interpreter is converting properly.
func TestMyStateMachine(t *testing.T) {
func TestStateMachine(t *testing.T) {
tables := []struct {
operand interface{}
operand string
operator string
leftArg string
err error
expected bool
}{
{"", ">", "2012", nil, true},
{"2005", ">", "2012", nil, true},
{2005, ">", "2012", nil, true},
{2012.0000, ">", "2014.000", nil, true},
{"NA", ">", "2014.000", nil, false},
{2014, ">", "Random", nil, false},
{"2005", ">", "2012", nil, true},
{"2012.0000", ">", "2014.000", nil, true},
{"2012", "!=", "2014.000", nil, true},
{"NA", ">", "2014.000", nil, true},
{"2012", ">", "2014.000", nil, false},
{"2012.0000", ">", "2014", nil, false},
{"", "<", "2012", nil, false},
{"2012.0000", "<", "2014.000", nil, false},
{"2014", ">", "Random", nil, false},
{"test3", ">", "aandom", nil, false},
{"true", ">", "true", ErrUnsupportedSyntax, false},
}
for _, table := range tables {
val, err := evaluateOperator(table.leftArg, table.operator, table.operand)
for i, table := range tables {
val, err := evaluateOperator(gjson.Parse(table.leftArg), table.operator, gjson.Parse(table.operand))
if err != table.err {
t.Error()
t.Errorf("Test %d: expected %v, got %v", i+1, table.err, err)
}
if val != table.expected {
t.Error()
t.Errorf("Test %d: expected %t, got %t", i+1, table.expected, val)
}
}
}
// TestMyOperators is a unit test which ensures that the appropriate values are
// TestOperators is a unit test which ensures that the appropriate values are
// being returned from the operators functions.
func TestMyOperators(t *testing.T) {
func TestOperators(t *testing.T) {
tables := []struct {
operator string
err error
@@ -124,27 +117,8 @@ func TestMyOperators(t *testing.T) {
}
}
// TestMyConversion ensures that the conversion of the value from the csv
// happens correctly.
func TestMyConversion(t *testing.T) {
tables := []struct {
myTblVal string
expected reflect.Kind
}{
{"2014", reflect.Int},
{"2014.000", reflect.Float64},
{"String!!!", reflect.String},
}
for _, table := range tables {
val := reflect.ValueOf(checkStringType(table.myTblVal)).Kind()
if val != table.expected {
t.Error()
}
}
}
// Unit tests for the main function that performs aggregation.
func TestMyAggregationFunc(t *testing.T) {
func TestAggregationFunc(t *testing.T) {
columnsMap := make(map[string]int)
columnsMap["Col1"] = 0
columnsMap["Col2"] = 1
@@ -155,22 +129,23 @@ func TestMyAggregationFunc(t *testing.T) {
columnsMap map[string]int
storeReqCols []string
storeFunctions []string
record string
record []byte
err error
expectedVal float64
}{
{10, 5, []float64{10, 11, 12, 13, 14}, columnsMap, []string{"Col1"}, []string{"count"}, "{\"Col1\":\"1\",\"Col2\":\"2\"}", nil, 11},
{10, 5, []float64{10}, columnsMap, []string{"Col1"}, []string{"min"}, "{\"Col1\":\"1\",\"Col2\":\"2\"}", nil, 1},
{10, 5, []float64{10}, columnsMap, []string{"Col1"}, []string{"max"}, "{\"Col1\":\"1\",\"Col2\":\"2\"}", nil, 10},
{10, 5, []float64{10}, columnsMap, []string{"Col1"}, []string{"sum"}, "{\"Col1\":\"1\",\"Col2\":\"2\"}", nil, 11},
{1, 1, []float64{10}, columnsMap, []string{"Col1"}, []string{"avg"}, "{\"Col1\":\"1\",\"Col2\":\"2\"}", nil, 5.500},
{10, 5, []float64{0.0000}, columnsMap, []string{"Col1"}, []string{"random"}, "{\"Col1\":\"1\",\"Col2\":\"2\"}", ErrParseNonUnaryAgregateFunctionCall, 0},
{0, 5, []float64{0}, columnsMap, []string{"0"}, []string{"count"}, "{\"Col1\":\"1\",\"Col2\":\"2\"}", nil, 1},
{10, 5, []float64{10}, columnsMap, []string{"1"}, []string{"min"}, "{\"_1\":\"1\",\"_2\":\"2\"}", nil, 1},
{10, 5, []float64{10, 11, 12, 13, 14}, columnsMap, []string{"Col1"}, []string{"count"}, []byte("{\"Col1\":\"1\",\"Col2\":\"2\"}"), nil, 11},
{10, 5, []float64{10}, columnsMap, []string{"Col1"}, []string{"min"}, []byte("{\"Col1\":\"1\",\"Col2\":\"2\"}"), nil, 1},
{10, 5, []float64{10}, columnsMap, []string{"Col1"}, []string{"max"}, []byte("{\"Col1\":\"1\",\"Col2\":\"2\"}"), nil, 10},
{10, 5, []float64{10}, columnsMap, []string{"Col1"}, []string{"sum"}, []byte("{\"Col1\":\"1\",\"Col2\":\"2\"}"), nil, 11},
{1, 1, []float64{10}, columnsMap, []string{"Col1"}, []string{"avg"}, []byte("{\"Col1\":\"1\",\"Col2\":\"2\"}"), nil, 5.500},
{10, 5, []float64{0.0000}, columnsMap, []string{"Col1"}, []string{"random"}, []byte("{\"Col1\":\"1\",\"Col2\":\"2\"}"),
ErrParseNonUnaryAgregateFunctionCall, 0},
{0, 5, []float64{0}, columnsMap, []string{"0"}, []string{"count"}, []byte("{\"Col1\":\"1\",\"Col2\":\"2\"}"), nil, 1},
{10, 5, []float64{10}, columnsMap, []string{"1"}, []string{"min"}, []byte("{\"_1\":\"1\",\"_2\":\"2\"}"), nil, 1},
}
for _, table := range tables {
err := aggregationFunctions(table.counter, table.filtrCount, table.myAggVals, table.storeReqCols, table.storeFunctions, table.record)
err := aggregationFns(table.counter, table.filtrCount, table.myAggVals, table.storeReqCols, table.storeFunctions, table.record)
if table.err != err {
t.Error()
}
@@ -181,9 +156,9 @@ func TestMyAggregationFunc(t *testing.T) {
}
}
// TestMyStringComparator is a unit test which ensures that the appropriate
// TestStringComparator is a unit test which ensures that the appropriate
// values are being compared for strings.
func TestMyStringComparator(t *testing.T) {
func TestStringComparator(t *testing.T) {
tables := []struct {
operand string
operator string
@@ -211,9 +186,9 @@ func TestMyStringComparator(t *testing.T) {
}
}
// TestMyFloatComparator is a unit test which ensures that the appropriate
// TestFloatComparator is a unit test which ensures that the appropriate
// values are being compared for floats.
func TestMyFloatComparator(t *testing.T) {
func TestFloatComparator(t *testing.T) {
tables := []struct {
operand float64
operator string
@@ -240,9 +215,9 @@ func TestMyFloatComparator(t *testing.T) {
}
}
// TestMyIntComparator is a unit test which ensures that the appropriate values
// TestIntComparator is a unit test which ensures that the appropriate values
// are being compared for ints.
func TestMyIntComparator(t *testing.T) {
func TestIntComparator(t *testing.T) {
tables := []struct {
operand int64
operator string
@@ -269,9 +244,9 @@ func TestMyIntComparator(t *testing.T) {
}
}
// TestMySizeFunction is a function which provides unit testing for the function
// TestSizeFunction is a function which provides unit testing for the function
// which calculates size.
func TestMySizeFunction(t *testing.T) {
func TestSizeFunction(t *testing.T) {
tables := []struct {
myRecord []string
expected int64
@@ -471,20 +446,19 @@ func TestMatch(t *testing.T) {
}
}
// TestMyFuncProcessing is a unit test which ensures that the appropriate values are
// TestFuncProcessing is a unit test which ensures that the appropriate values are
// being returned from the Processing... functions.
func TestMyFuncProcessing(t *testing.T) {
func TestFuncProcessing(t *testing.T) {
tables := []struct {
myString string
nullList []string
coalList []string
myValString string
myValCoal string
myValNull string
stringFunc string
}{
{"lower", []string{"yo", "yo"}, []string{"random", "hello", "random"}, "LOWER", "random", "", "UPPER"},
{"LOWER", []string{"null", "random"}, []string{"missing", "hello", "random"}, "lower", "hello", "null", "LOWER"},
{"lower", []string{"random", "hello", "random"}, "LOWER", "random", "", "UPPER"},
{"LOWER", []string{"missing", "hello", "random"}, "lower", "hello", "null", "LOWER"},
}
for _, table := range tables {
if table.coalList != nil {
@@ -493,16 +467,145 @@ func TestMyFuncProcessing(t *testing.T) {
t.Error()
}
}
if table.nullList != nil {
myVal := processNullIf(table.nullList)
if myVal != table.myValNull {
t.Error()
}
}
myVal := applyStrFunc(table.myString, table.stringFunc)
myVal := applyStrFunc(gjson.Result{
Type: gjson.String,
Str: table.myString,
}, table.stringFunc)
if myVal != table.myValString {
t.Error()
}
}
}
const charset = "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
var seededRand = rand.New(rand.NewSource(time.Now().UnixNano()))
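// StringWithCharset returns a random string of the given length drawn from charset.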
func StringWithCharset(length int, charset string) string {
b := make([]byte, length)
for i := range b {
b[i] = charset[seededRand.Intn(len(charset))]
}
return string(b)
}
func String(length int) string {
return StringWithCharset(length, charset)
}
func genCSV(b *bytes.Buffer, records int) error {
b.Reset()
w := csv.NewWriter(b)
w.Write([]string{"id", "name", "age", "city"})
for i := 0; i < records; i++ {
w.Write([]string{
strconv.Itoa(i),
String(10),
String(5),
String(10),
})
}
// Write any buffered data to the underlying writer (the bytes.Buffer).
w.Flush()
return w.Error()
}
func benchmarkSQLAll(b *testing.B, records int) {
benchmarkSQL(b, records, "select * from S3Object")
}
func benchmarkSQLAggregate(b *testing.B, records int) {
benchmarkSQL(b, records, "select count(*) from S3Object")
}
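// benchmarkSQL generates an in-memory CSV object with the given number of
// records and executes the query b.N times through the select pipeline.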
func benchmarkSQL(b *testing.B, records int, query string) {
var (
buf bytes.Buffer
output bytes.Buffer
)
genCSV(&buf, records)
b.ResetTimer()
b.ReportAllocs()
sreq := ObjectSelectRequest{}
sreq.Expression = query
sreq.ExpressionType = QueryExpressionTypeSQL
sreq.InputSerialization.CSV = &struct {
FileHeaderInfo CSVFileHeaderInfo
RecordDelimiter string
FieldDelimiter string
QuoteCharacter string
QuoteEscapeCharacter string
Comments string
}{}
sreq.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoUse
sreq.InputSerialization.CSV.RecordDelimiter = "\n"
sreq.InputSerialization.CSV.FieldDelimiter = ","
sreq.OutputSerialization.CSV = &struct {
QuoteFields CSVQuoteFields
RecordDelimiter string
FieldDelimiter string
QuoteCharacter string
QuoteEscapeCharacter string
}{}
sreq.OutputSerialization.CSV.RecordDelimiter = "\n"
sreq.OutputSerialization.CSV.FieldDelimiter = ","
s3s, err := New(&buf, int64(buf.Len()), sreq)
if err != nil {
b.Fatal(err)
}
for i := 0; i < b.N; i++ {
output.Reset()
if err = Execute(&output, s3s); err != nil {
b.Fatal(err)
}
}
}
// BenchmarkSQLAggregate_100K - benchmark count(*) function with 100k records.
func BenchmarkSQLAggregate_100K(b *testing.B) {
benchmarkSQLAggregate(b, humanize.KiByte*100)
}
// BenchmarkSQLAggregate_1M - benchmark count(*) function with 1m records.
func BenchmarkSQLAggregate_1M(b *testing.B) {
benchmarkSQLAggregate(b, humanize.MiByte)
}
// BenchmarkSQLAggregate_2M - benchmark count(*) function with 2m records.
func BenchmarkSQLAggregate_2M(b *testing.B) {
benchmarkSQLAggregate(b, 2*humanize.MiByte)
}
// BenchmarkSQLAggregate_10M - benchmark count(*) function with 10m records.
func BenchmarkSQLAggregate_10M(b *testing.B) {
benchmarkSQLAggregate(b, 10*humanize.MiByte)
}
// BenchmarkSQLAll_100K - benchmark * function with 100k records.
func BenchmarkSQLAll_100K(b *testing.B) {
benchmarkSQLAll(b, humanize.KiByte*100)
}
// BenchmarkSQLAll_1M - benchmark * function with 1m records.
func BenchmarkSQLAll_1M(b *testing.B) {
benchmarkSQLAll(b, humanize.MiByte)
}
// BenchmarkSQLAll_2M - benchmark * function with 2m records.
func BenchmarkSQLAll_2M(b *testing.B) {
benchmarkSQLAll(b, 2*humanize.MiByte)
}
// BenchmarkSQLAll_10M - benchmark * function with 10m records.
func BenchmarkSQLAll_10M(b *testing.B) {
benchmarkSQLAll(b, 10*humanize.MiByte)
}