Add concurrent CSV parsing and reduce S3 Select allocations (#8200)

```
CSV parsing, BEFORE:
BenchmarkReaderBasic-12         	    2842	    407533 ns/op	  397860 B/op	     957 allocs/op
BenchmarkReaderReplace-12       	    2718	    429914 ns/op	  397844 B/op	     957 allocs/op
BenchmarkReaderReplaceTwo-12    	    2718	    435556 ns/op	  397855 B/op	     957 allocs/op
BenchmarkAggregateCount_100K-12    	     171	   6798974 ns/op	16667102 B/op	  308077 allocs/op
BenchmarkAggregateCount_1M-12    	      19	  65657411 ns/op	168057743 B/op	 3146610 allocs/op
BenchmarkSelectAll_10M-12    	       1	20882119900 ns/op	2758799896 B/op	41978762 allocs/op

CSV parsing, AFTER:
BenchmarkReaderBasic-12         	    3721	    312549 ns/op	  101920 B/op	     338 allocs/op
BenchmarkReaderReplace-12       	    3776	    318810 ns/op	  101993 B/op	     340 allocs/op
BenchmarkReaderReplaceTwo-12    	    3610	    330967 ns/op	  102012 B/op	     341 allocs/op
BenchmarkAggregateCount_100K-12    	     295	   4149588 ns/op	 3553623 B/op	  103261 allocs/op
BenchmarkAggregateCount_1M-12    	      30	  37746503 ns/op	33827931 B/op	 1049435 allocs/op
BenchmarkSelectAll_10M-12    	       1	17608495800 ns/op	1416504040 B/op	21007082 allocs/op

~ benchcmp old.txt new.txt
benchmark                           old ns/op       new ns/op       delta
BenchmarkReaderBasic-12             407533          312549          -23.31%
BenchmarkReaderReplace-12           429914          318810          -25.84%
BenchmarkReaderReplaceTwo-12        435556          330967          -24.01%
BenchmarkAggregateCount_100K-12     6798974         4149588         -38.97%
BenchmarkAggregateCount_1M-12       65657411        37746503        -42.51%
BenchmarkSelectAll_10M-12           20882119900     17608495800     -15.68%

benchmark                           old allocs     new allocs     delta
BenchmarkReaderBasic-12             957            338            -64.68%
BenchmarkReaderReplace-12           957            340            -64.47%
BenchmarkReaderReplaceTwo-12        957            341            -64.37%
BenchmarkAggregateCount_100K-12     308077         103261         -66.48%
BenchmarkAggregateCount_1M-12       3146610        1049435        -66.65%
BenchmarkSelectAll_10M-12           41978762       21007082       -49.96%

benchmark                           old bytes      new bytes      delta
BenchmarkReaderBasic-12             397860         101920         -74.38%
BenchmarkReaderReplace-12           397844         101993         -74.36%
BenchmarkReaderReplaceTwo-12        397855         102012         -74.36%
BenchmarkAggregateCount_100K-12     16667102       3553623        -78.68%
BenchmarkAggregateCount_1M-12       168057743      33827931       -79.87%
BenchmarkSelectAll_10M-12           2758799896     1416504040     -48.66%
```

```
BenchmarkReaderHuge/97K-12         	    2200	    540840 ns/op	 184.32 MB/s	 1604450 B/op	     687 allocs/op
BenchmarkReaderHuge/194K-12        	    1522	    752257 ns/op	 265.04 MB/s	 2143135 B/op	    1335 allocs/op
BenchmarkReaderHuge/389K-12        	    1190	    947858 ns/op	 420.69 MB/s	 3221831 B/op	    2630 allocs/op
BenchmarkReaderHuge/778K-12        	     806	   1472486 ns/op	 541.61 MB/s	 5201856 B/op	    5187 allocs/op
BenchmarkReaderHuge/1557K-12       	     426	   2575269 ns/op	 619.36 MB/s	 9101330 B/op	   10233 allocs/op
BenchmarkReaderHuge/3115K-12       	     286	   4034656 ns/op	 790.66 MB/s	12397968 B/op	   16099 allocs/op
BenchmarkReaderHuge/6230K-12       	     172	   6830563 ns/op	 934.05 MB/s	16008416 B/op	   26844 allocs/op
BenchmarkReaderHuge/12461K-12      	     100	  11409467 ns/op	1118.39 MB/s	22655163 B/op	   48107 allocs/op
BenchmarkReaderHuge/24922K-12      	      66	  19780395 ns/op	1290.19 MB/s	35158559 B/op	   90216 allocs/op
BenchmarkReaderHuge/49844K-12      	      34	  37282559 ns/op	1369.03 MB/s	60528624 B/op	  174497 allocs/op
```
This commit is contained in:
Klaus Post
2019-09-13 14:18:35 -07:00
committed by Harshavardhana
parent e7f491a14b
commit ddea0bdf11
23 changed files with 1041 additions and 189 deletions

View File

@@ -18,6 +18,7 @@ package s3select
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"net/http"
@@ -108,26 +109,29 @@ func TestCSVInput(t *testing.T) {
2.5,baz,true
`)
for _, testCase := range testTable {
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
t.Fatal(err)
}
for i, testCase := range testTable {
t.Run(fmt.Sprint(i), func(t *testing.T) {
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
t.Fatal(err)
}
if err = s3Select.Open(func(offset, length int64) (io.ReadCloser, error) {
return ioutil.NopCloser(bytes.NewReader(csvData)), nil
}); err != nil {
t.Fatal(err)
}
if err = s3Select.Open(func(offset, length int64) (io.ReadCloser, error) {
return ioutil.NopCloser(bytes.NewReader(csvData)), nil
}); err != nil {
t.Fatal(err)
}
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Fatalf("received response does not match with expected reply")
}
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v", w.response, testCase.expectedResult)
}
})
}
}
func TestJSONInput(t *testing.T) {
@@ -191,26 +195,27 @@ func TestJSONInput(t *testing.T) {
{"three":true,"two":"baz","one":2.5}
`)
for _, testCase := range testTable {
for i, testCase := range testTable {
t.Run(fmt.Sprint(i), func(t *testing.T) {
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
t.Fatal(err)
}
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
t.Fatal(err)
}
if err = s3Select.Open(func(offset, length int64) (io.ReadCloser, error) {
return ioutil.NopCloser(bytes.NewReader(jsonData)), nil
}); err != nil {
t.Fatal(err)
}
if err = s3Select.Open(func(offset, length int64) (io.ReadCloser, error) {
return ioutil.NopCloser(bytes.NewReader(jsonData)), nil
}); err != nil {
t.Fatal(err)
}
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Fatalf("received response does not match with expected reply")
}
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Errorf("received response does not match with expected reply\ngot: %s\nwant:%s", string(w.response), string(testCase.expectedResult))
}
})
}
}
@@ -268,45 +273,47 @@ func TestParquetInput(t *testing.T) {
},
}
for _, testCase := range testTable {
getReader := func(offset int64, length int64) (io.ReadCloser, error) {
testdataFile := "testdata.parquet"
file, err := os.Open(testdataFile)
for i, testCase := range testTable {
t.Run(fmt.Sprint(i), func(t *testing.T) {
getReader := func(offset int64, length int64) (io.ReadCloser, error) {
testdataFile := "testdata.parquet"
file, err := os.Open(testdataFile)
if err != nil {
return nil, err
}
fi, err := file.Stat()
if err != nil {
return nil, err
}
if offset < 0 {
offset = fi.Size() + offset
}
if _, err = file.Seek(offset, os.SEEK_SET); err != nil {
return nil, err
}
return file, nil
}
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
return nil, err
t.Fatal(err)
}
fi, err := file.Stat()
if err != nil {
return nil, err
if err = s3Select.Open(getReader); err != nil {
t.Fatal(err)
}
if offset < 0 {
offset = fi.Size() + offset
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v", w.response, testCase.expectedResult)
}
if _, err = file.Seek(offset, os.SEEK_SET); err != nil {
return nil, err
}
return file, nil
}
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
t.Fatal(err)
}
if err = s3Select.Open(getReader); err != nil {
t.Fatal(err)
}
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Fatalf("received response does not match with expected reply")
}
})
}
}