select: Support Parquet dates (#11928)

Pass schema to parser to support dates.

Fixes #11926
This commit is contained in:
Klaus Post
2021-04-03 17:25:19 +02:00
committed by GitHub
parent bf106453b8
commit dca7cf7200
7 changed files with 280 additions and 13 deletions

View File

@@ -1069,7 +1069,7 @@ func TestParquetInput(t *testing.T) {
for i, testCase := range testTable {
t.Run(fmt.Sprint(i), func(t *testing.T) {
getReader := func(offset int64, length int64) (io.ReadCloser, error) {
testdataFile := "testdata.parquet"
testdataFile := "testdata/testdata.parquet"
file, err := os.Open(testdataFile)
if err != nil {
return nil, err
@@ -1126,3 +1126,243 @@ func TestParquetInput(t *testing.T) {
})
}
}
func TestParquetInputSchema(t *testing.T) {
os.Setenv("MINIO_API_SELECT_PARQUET", "on")
defer os.Setenv("MINIO_API_SELECT_PARQUET", "off")
var testTable = []struct {
requestXML []byte
wantResult string
}{
{
requestXML: []byte(`
<?xml version="1.0" encoding="UTF-8"?>
<SelectObjectContentRequest>
<Expression>SELECT * FROM S3Object LIMIT 5</Expression>
<ExpressionType>SQL</ExpressionType>
<InputSerialization>
<CompressionType>NONE</CompressionType>
<Parquet>
</Parquet>
</InputSerialization>
<OutputSerialization>
<JSON>
</JSON>
</OutputSerialization>
<RequestProgress>
<Enabled>FALSE</Enabled>
</RequestProgress>
</SelectObjectContentRequest>
`), wantResult: `{"shipdate":"1996-03-13T"}
{"shipdate":"1996-04-12T"}
{"shipdate":"1996-01-29T"}
{"shipdate":"1996-04-21T"}
{"shipdate":"1996-03-30T"}`,
},
{
requestXML: []byte(`
<?xml version="1.0" encoding="UTF-8"?>
<SelectObjectContentRequest>
<Expression>SELECT DATE_ADD(day, 2, shipdate) as shipdate FROM S3Object LIMIT 5</Expression>
<ExpressionType>SQL</ExpressionType>
<InputSerialization>
<CompressionType>NONE</CompressionType>
<Parquet>
</Parquet>
</InputSerialization>
<OutputSerialization>
<JSON>
</JSON>
</OutputSerialization>
<RequestProgress>
<Enabled>FALSE</Enabled>
</RequestProgress>
</SelectObjectContentRequest>
`), wantResult: `{"shipdate":"1996-03-15T"}
{"shipdate":"1996-04-14T"}
{"shipdate":"1996-01-31T"}
{"shipdate":"1996-04-23T"}
{"shipdate":"1996-04T"}`,
},
}
for i, testCase := range testTable {
t.Run(fmt.Sprint(i), func(t *testing.T) {
getReader := func(offset int64, length int64) (io.ReadCloser, error) {
testdataFile := "testdata/lineitem_shipdate.parquet"
file, err := os.Open(testdataFile)
if err != nil {
return nil, err
}
fi, err := file.Stat()
if err != nil {
return nil, err
}
if offset < 0 {
offset = fi.Size() + offset
}
if _, err = file.Seek(offset, io.SeekStart); err != nil {
return nil, err
}
return file, nil
}
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
t.Fatal(err)
}
if err = s3Select.Open(getReader); err != nil {
t.Fatal(err)
}
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
resp := http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewReader(w.response)),
ContentLength: int64(len(w.response)),
}
res, err := minio.NewSelectResults(&resp, "testbucket")
if err != nil {
t.Error(err)
return
}
got, err := ioutil.ReadAll(res)
if err != nil {
t.Error(err)
return
}
gotS := strings.TrimSpace(string(got))
if !reflect.DeepEqual(gotS, testCase.wantResult) {
t.Errorf("received response does not match with expected reply. Query: %s\ngot: %s\nwant:%s", testCase.requestXML, gotS, testCase.wantResult)
}
})
}
}
func TestParquetInputSchemaCSV(t *testing.T) {
os.Setenv("MINIO_API_SELECT_PARQUET", "on")
defer os.Setenv("MINIO_API_SELECT_PARQUET", "off")
var testTable = []struct {
requestXML []byte
wantResult string
}{
{
requestXML: []byte(`
<?xml version="1.0" encoding="UTF-8"?>
<SelectObjectContentRequest>
<Expression>SELECT * FROM S3Object LIMIT 5</Expression>
<ExpressionType>SQL</ExpressionType>
<InputSerialization>
<CompressionType>NONE</CompressionType>
<Parquet>
</Parquet>
</InputSerialization>
<OutputSerialization>
<CSV/>
</OutputSerialization>
<RequestProgress>
<Enabled>FALSE</Enabled>
</RequestProgress>
</SelectObjectContentRequest>
`), wantResult: `1996-03-13T
1996-04-12T
1996-01-29T
1996-04-21T
1996-03-30T`,
},
{
requestXML: []byte(`
<?xml version="1.0" encoding="UTF-8"?>
<SelectObjectContentRequest>
<Expression>SELECT DATE_ADD(day, 2, shipdate) as shipdate FROM S3Object LIMIT 5</Expression>
<ExpressionType>SQL</ExpressionType>
<InputSerialization>
<CompressionType>NONE</CompressionType>
<Parquet>
</Parquet>
</InputSerialization>
<OutputSerialization>
<CSV/>
</OutputSerialization>
<RequestProgress>
<Enabled>FALSE</Enabled>
</RequestProgress>
</SelectObjectContentRequest>
`), wantResult: `1996-03-15T
1996-04-14T
1996-01-31T
1996-04-23T
1996-04T`,
},
}
for i, testCase := range testTable {
t.Run(fmt.Sprint(i), func(t *testing.T) {
getReader := func(offset int64, length int64) (io.ReadCloser, error) {
testdataFile := "testdata/lineitem_shipdate.parquet"
file, err := os.Open(testdataFile)
if err != nil {
return nil, err
}
fi, err := file.Stat()
if err != nil {
return nil, err
}
if offset < 0 {
offset = fi.Size() + offset
}
if _, err = file.Seek(offset, io.SeekStart); err != nil {
return nil, err
}
return file, nil
}
s3Select, err := NewS3Select(bytes.NewReader(testCase.requestXML))
if err != nil {
t.Fatal(err)
}
if err = s3Select.Open(getReader); err != nil {
t.Fatal(err)
}
w := &testResponseWriter{}
s3Select.Evaluate(w)
s3Select.Close()
resp := http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewReader(w.response)),
ContentLength: int64(len(w.response)),
}
res, err := minio.NewSelectResults(&resp, "testbucket")
if err != nil {
t.Error(err)
return
}
got, err := ioutil.ReadAll(res)
if err != nil {
t.Error(err)
return
}
gotS := strings.TrimSpace(string(got))
if !reflect.DeepEqual(gotS, testCase.wantResult) {
t.Errorf("received response does not match with expected reply. Query: %s\ngot: %s\nwant:%s", testCase.requestXML, gotS, testCase.wantResult)
}
})
}
}