SIMDJSON S3 select input (#8401)

This commit is contained in:
Klaus Post
2020-02-13 14:03:52 -08:00
committed by GitHub
parent d1144c2c7e
commit e4020fb41f
16 changed files with 1116 additions and 73 deletions

View File

@@ -21,13 +21,16 @@ import (
"fmt"
"io"
"io/ioutil"
"math"
"net/http"
"os"
"reflect"
"strings"
"testing"
"github.com/klauspost/cpuid"
"github.com/minio/minio-go/v6"
"github.com/minio/simdjson-go"
)
type testResponseWriter struct {
@@ -56,6 +59,7 @@ func TestJSONQueries(t *testing.T) {
{"id": 1,"title": "Second Record","desc": "another text","synonyms": ["some", "synonym", "value"]}
{"id": 2,"title": "Second Record","desc": "another text","numbers": [2, 3.0, 4]}
{"id": 3,"title": "Second Record","desc": "another text","nested": [[2, 3.0, 4], [7, 8.5, 9]]}`
var testTable = []struct {
name string
query string
@@ -229,6 +233,11 @@ func TestJSONQueries(t *testing.T) {
{"id":3, "value": "true"}
`,
},
{
name: "index-wildcard-in",
query: `SELECT * from s3object s WHERE title = 'Test Record'`,
wantResult: `{"id":0,"title":"Test Record","desc":"Some text","synonyms":["foo","bar","whatever"]}`,
},
{
name: "select-output-field-as-csv",
requestXML: []byte(`<?xml version="1.0" encoding="UTF-8"?>
@@ -260,7 +269,7 @@ func TestJSONQueries(t *testing.T) {
<InputSerialization>
<CompressionType>NONE</CompressionType>
<JSON>
<Type>DOCUMENT</Type>
<Type>LINES</Type>
</JSON>
</InputSerialization>
<OutputSerialization>
@@ -274,6 +283,59 @@ func TestJSONQueries(t *testing.T) {
for _, testCase := range testTable {
t.Run(testCase.name, func(t *testing.T) {
// Hack cpuid to the CPU doesn't appear to support AVX2.
// Restore whatever happens.
defer func(f cpuid.Flags) {
cpuid.CPU.Features = f
}(cpuid.CPU.Features)
cpuid.CPU.Features &= math.MaxUint64 - cpuid.AVX2
testReq := testCase.requestXML
if len(testReq) == 0 {
testReq = []byte(fmt.Sprintf(defRequest, testCase.query))
}
s3Select, err := NewS3Select(bytes.NewReader(testReq))
if err != nil {
t.Fatal(err)
}
if err = s3Select.Open(func(offset, length int64) (io.ReadCloser, error) {
in := input
if len(testCase.withJSON) > 0 {
in = testCase.withJSON
}
return ioutil.NopCloser(bytes.NewBufferString(in)), nil
}); err != nil {
t.Fatal(err)
}
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
resp := http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewReader(w.response)),
ContentLength: int64(len(w.response)),
}
res, err := minio.NewSelectResults(&resp, "testbucket")
if err != nil {
t.Error(err)
return
}
got, err := ioutil.ReadAll(res)
if err != nil {
t.Error(err)
return
}
gotS := strings.TrimSpace(string(got))
if !reflect.DeepEqual(gotS, testCase.wantResult) {
t.Errorf("received response does not match with expected reply. Query: %s\ngot: %s\nwant:%s", testCase.query, gotS, testCase.wantResult)
}
})
t.Run("simd-"+testCase.name, func(t *testing.T) {
if !simdjson.SupportedCPU() {
t.Skip("No CPU support")
}
testReq := testCase.requestXML
if len(testReq) == 0 {
testReq = []byte(fmt.Sprintf(defRequest, testCase.query))
@@ -320,6 +382,89 @@ func TestJSONQueries(t *testing.T) {
}
func TestCSVQueries(t *testing.T) {
input := `index,ID,CaseNumber,Date,Day,Month,Year,Block,IUCR,PrimaryType,Description,LocationDescription,Arrest,Domestic,Beat,District,Ward,CommunityArea,FBI Code,XCoordinate,YCoordinate,UpdatedOn,Latitude,Longitude,Location
2700763,7732229,,2010-05-26 00:00:00,26,May,2010,113XX S HALSTED ST,1150,,CREDIT CARD FRAUD,,False,False,2233,22.0,34.0,,11,,,,41.688043288,-87.6422444,"(41.688043288, -87.6422444)"`
var testTable = []struct {
name string
query string
requestXML []byte
wantResult string
}{
{
name: "select-in-text-simple",
query: `SELECT index FROM s3Object s WHERE "Month"='May'`,
wantResult: `2700763`,
},
}
defRequest := `<?xml version="1.0" encoding="UTF-8"?>
<SelectObjectContentRequest>
<Expression>%s</Expression>
<ExpressionType>SQL</ExpressionType>
<InputSerialization>
<CompressionType>NONE</CompressionType>
<CSV>
<FieldDelimiter>,</FieldDelimiter>
<FileHeaderInfo>USE</FileHeaderInfo>
<QuoteCharacter>"</QuoteCharacter>
<QuoteEscapeCharacter>"</QuoteEscapeCharacter>
<RecordDelimiter>\n</RecordDelimiter>
</CSV>
</InputSerialization>
<OutputSerialization>
<CSV>
</CSV>
</OutputSerialization>
<RequestProgress>
<Enabled>FALSE</Enabled>
</RequestProgress>
</SelectObjectContentRequest>`
for _, testCase := range testTable {
t.Run(testCase.name, func(t *testing.T) {
testReq := testCase.requestXML
if len(testReq) == 0 {
testReq = []byte(fmt.Sprintf(defRequest, testCase.query))
}
s3Select, err := NewS3Select(bytes.NewReader(testReq))
if err != nil {
t.Fatal(err)
}
if err = s3Select.Open(func(offset, length int64) (io.ReadCloser, error) {
return ioutil.NopCloser(bytes.NewBufferString(input)), nil
}); err != nil {
t.Fatal(err)
}
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
resp := http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewReader(w.response)),
ContentLength: int64(len(w.response)),
}
res, err := minio.NewSelectResults(&resp, "testbucket")
if err != nil {
t.Error(err)
return
}
got, err := ioutil.ReadAll(res)
if err != nil {
t.Error(err)
return
}
gotS := strings.TrimSpace(string(got))
if !reflect.DeepEqual(gotS, testCase.wantResult) {
t.Errorf("received response does not match with expected reply. Query: %s\ngot: %s\nwant:%s", testCase.query, gotS, testCase.wantResult)
}
})
}
}
func TestCSVQueries2(t *testing.T) {
input := `id,time,num,num2,text
1,2010-01-01T,7867786,4565.908123,"a text, with comma"
2,2017-01-02T03:04Z,-5, 0.765111,
@@ -559,7 +704,23 @@ func TestCSVInput(t *testing.T) {
s3Select.Close()
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v", w.response, testCase.expectedResult)
resp := http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewReader(w.response)),
ContentLength: int64(len(w.response)),
}
res, err := minio.NewSelectResults(&resp, "testbucket")
if err != nil {
t.Error(err)
return
}
got, err := ioutil.ReadAll(res)
if err != nil {
t.Error(err)
return
}
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v\ndecoded:%s", w.response, testCase.expectedResult, string(got))
}
})
}
@@ -667,7 +828,23 @@ func TestJSONInput(t *testing.T) {
s3Select.Close()
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v", w.response, testCase.expectedResult)
resp := http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewReader(w.response)),
ContentLength: int64(len(w.response)),
}
res, err := minio.NewSelectResults(&resp, "testbucket")
if err != nil {
t.Error(err)
return
}
got, err := ioutil.ReadAll(res)
if err != nil {
t.Error(err)
return
}
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v\ndecoded:%s", w.response, testCase.expectedResult, string(got))
}
})
}
@@ -766,7 +943,23 @@ func TestParquetInput(t *testing.T) {
s3Select.Close()
if !reflect.DeepEqual(w.response, testCase.expectedResult) {
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v", w.response, testCase.expectedResult)
resp := http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewReader(w.response)),
ContentLength: int64(len(w.response)),
}
res, err := minio.NewSelectResults(&resp, "testbucket")
if err != nil {
t.Error(err)
return
}
got, err := ioutil.ReadAll(res)
if err != nil {
t.Error(err)
return
}
t.Errorf("received response does not match with expected reply\ngot: %#v\nwant:%#v\ndecoded:%s", w.response, testCase.expectedResult, string(got))
}
})
}