Performance improvements to SELECT API on certain query operations (#6752)

This dramatically improves the performance of certain queries,
such as 'count(*)'.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Roughly a 2.4x speedup (42.5s down to 17.6s, about 59% less wall-clock
time). This PR avoids a lot of type conversions and instead relies on raw
sequences of data and interprets them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
This commit is contained in:
Harshavardhana
2018-11-14 15:55:10 -08:00
committed by kannappanr
parent f9779b24ad
commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions

View File

@@ -88,13 +88,14 @@ The dot and wildcard characters can be escaped with '\\'.
```
You can also query an array for the first match by using `#[...]`, or find all matches with `#[...]#`.
Queries support the `==`, `!=`, `<`, `<=`, `>`, `>=` comparison operators and the simple pattern matching `%` operator.
Queries support the `==`, `!=`, `<`, `<=`, `>`, `>=` comparison operators and the simple pattern matching `%` (like) and `!%` (not like) operators.
```
friends.#[last=="Murphy"].first >> "Dale"
friends.#[last=="Murphy"]#.first >> ["Dale","Jane"]
friends.#[age>45]#.last >> ["Craig","Murphy"]
friends.#[first%"D*"].last >> "Murphy"
friends.#[first!%"D*"].last >> "Craig"
```
## JSON Lines

View File

@@ -77,7 +77,20 @@ func (t Result) String() string {
case False:
return "false"
case Number:
return strconv.FormatFloat(t.Num, 'f', -1, 64)
if len(t.Raw) == 0 {
// calculated result
return strconv.FormatFloat(t.Num, 'f', -1, 64)
}
var i int
if t.Raw[0] == '-' {
i++
}
for ; i < len(t.Raw); i++ {
if t.Raw[i] < '0' || t.Raw[i] > '9' {
return strconv.FormatFloat(t.Num, 'f', -1, 64)
}
}
return t.Raw
case String:
return t.Str
case JSON:
@@ -344,24 +357,30 @@ func (t Result) arrayOrMap(vc byte, valueize bool) (r arrayOrMapResult) {
if (json[i] >= '0' && json[i] <= '9') || json[i] == '-' {
value.Type = Number
value.Raw, value.Num = tonum(json[i:])
value.Str = ""
} else {
continue
}
case '{', '[':
value.Type = JSON
value.Raw = squash(json[i:])
value.Str, value.Num = "", 0
case 'n':
value.Type = Null
value.Raw = tolit(json[i:])
value.Str, value.Num = "", 0
case 't':
value.Type = True
value.Raw = tolit(json[i:])
value.Str, value.Num = "", 0
case 'f':
value.Type = False
value.Raw = tolit(json[i:])
value.Str, value.Num = "", 0
case '"':
value.Type = String
value.Raw, value.Str = tostr(json[i:])
value.Num = 0
}
i += len(value.Raw) - 1
@@ -370,9 +389,13 @@ func (t Result) arrayOrMap(vc byte, valueize bool) (r arrayOrMapResult) {
key = value
} else {
if valueize {
r.oi[key.Str] = value.Value()
if _, ok := r.oi[key.Str]; !ok {
r.oi[key.Str] = value.Value()
}
} else {
r.o[key.Str] = value
if _, ok := r.o[key.Str]; !ok {
r.o[key.Str] = value
}
}
}
count++
@@ -732,7 +755,7 @@ func parseArrayPath(path string) (r arrayPathResult) {
if i < len(path) {
s = i
if path[i] == '!' {
if i < len(path)-1 && path[i+1] == '=' {
if i < len(path)-1 && (path[i+1] == '=' || path[i+1] == '%') {
i++
}
} else if path[i] == '<' || path[i] == '>' {
@@ -1076,6 +1099,8 @@ func queryMatches(rp *arrayPathResult, value Result) bool {
return value.Str >= rpv
case "%":
return match.Match(value.Str, rpv)
case "!%":
return !match.Match(value.Str, rpv)
}
case Number:
rpvn, _ := strconv.ParseFloat(rpv, 64)
@@ -1288,7 +1313,7 @@ func parseArray(c *parseContext, i int, path string) (int, bool) {
if rp.alogok {
break
}
c.value.Raw = val
c.value.Raw = ""
c.value.Type = Number
c.value.Num = float64(h - 1)
c.calcd = true
@@ -1603,7 +1628,11 @@ func GetMany(json string, path ...string) []Result {
// The return value is a Result array where the number of items
// will be equal to the number of input paths.
func GetManyBytes(json []byte, path ...string) []Result {
// NOTE(review): this listing appears to be a rendered diff without +/-
// markers, so the next line is presumably the removed implementation
// (convert the bytes to a string and delegate to GetMany) — confirm
// against the repository before relying on it.
return GetMany(string(json), path...)
// Allocate one Result slot per requested path so the output length always
// equals the number of input paths.
res := make([]Result, len(path))
// The loop variable shadows the variadic path parameter inside the loop
// body; each slot is filled by calling GetBytes on the byte slice directly.
for i, path := range path {
res[i] = GetBytes(json, path)
}
return res
}
var fieldsmu sync.RWMutex