Add archived parquet as int. package (#9912)

Since github.com/minio/parquet-go is archived, add it as an internal package.
This commit is contained in:
Klaus Post
2020-06-25 07:31:16 -07:00
committed by GitHub
parent b1705599e1
commit 2d0f65a5e3
51 changed files with 19278 additions and 3 deletions

View File

@@ -0,0 +1,618 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"reflect"
"testing"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)
func TestPopulateGroupList(t *testing.T) {
requiredList1 := schema.NewTree()
{
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("group", requiredGroup); err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("group.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("group.list.element.col", requiredCol); err != nil {
t.Fatal(err)
}
if _, _, err := requiredList1.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
requiredList2 := schema.NewTree()
{
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("group", requiredGroup); err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("group.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("group.list.element.col", optionalCol); err != nil {
t.Fatal(err)
}
if _, _, err := requiredList2.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
requiredList3 := schema.NewTree()
{
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("group", requiredGroup); err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("group.list.element", optionalElement); err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("group.list.element.col", requiredCol); err != nil {
t.Fatal(err)
}
if _, _, err := requiredList3.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
requiredList4 := schema.NewTree()
{
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("group", requiredGroup); err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("group.list.element", optionalElement); err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("group.list.element.col", optionalCol); err != nil {
t.Fatal(err)
}
if _, _, err := requiredList4.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList1 := schema.NewTree()
{
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("group", optionalGroup); err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("group.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("group.list.element.col", requiredCol); err != nil {
t.Fatal(err)
}
if _, _, err := optionalList1.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList2 := schema.NewTree()
{
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("group", optionalGroup); err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("group.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("group.list.element.col", optionalCol); err != nil {
t.Fatal(err)
}
if _, _, err := optionalList2.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList3 := schema.NewTree()
{
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("group", optionalGroup); err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("group.list.element", optionalElement); err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("group.list.element.col", requiredCol); err != nil {
t.Fatal(err)
}
if _, _, err := optionalList3.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList4 := schema.NewTree()
{
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("group", optionalGroup); err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("group.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("group.list.element", optionalElement); err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("group.list.element.col", optionalCol); err != nil {
t.Fatal(err)
}
if _, _, err := optionalList4.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
result1 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{1},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result2 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20},
definitionLevels: []int64{1, 1},
repetitionLevels: []int64{0, 1},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v20,
},
}
result3 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{1},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result4 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result5 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20},
definitionLevels: []int64{2, 2},
repetitionLevels: []int64{0, 1},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v20,
},
}
result6 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result7 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{3},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result8 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20},
definitionLevels: []int64{3, 3},
repetitionLevels: []int64{0, 1},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v20,
},
}
result9 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result10 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{3},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result11 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{4},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result12 := map[string]*Column{
"group.list.element.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20},
definitionLevels: []int64{4, 4},
repetitionLevels: []int64{0, 1},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v20,
},
}
testCases := []struct {
schemaTree *schema.Tree
data string
expectedResult map[string]*Column
expectErr bool
}{
{requiredList1, `{}`, nil, true}, // err: group: nil value for required field
{requiredList1, `{"group": null}`, nil, true}, // err: group: nil value for required field
{requiredList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
{requiredList1, `{"group": [{"col": 10}]}`, result1, false},
{requiredList1, `{"group": [{"col": 10}, {"col": 20}]}`, result2, false},
{requiredList2, `{}`, nil, true}, // err: group: nil value for required field
{requiredList2, `{"group": null}`, nil, true}, // err: group: nil value for required field
{requiredList2, `{"group": [{"col": null}]}`, result3, false},
{requiredList2, `{"group": [{"col": 10}]}`, result4, false},
{requiredList2, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
{requiredList3, `{}`, nil, true}, // err: group: nil value for required field
{requiredList3, `{"group": null}`, nil, true}, // err: group: nil value for required field
{requiredList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
{requiredList3, `{"group": [{"col": 10}]}`, result4, false},
{requiredList3, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
{requiredList4, `{}`, nil, true}, // err: group: nil value for required field
{requiredList4, `{"group": null}`, nil, true}, // err: group: nil value for required field
{requiredList4, `{"group": [{"col": null}]}`, result6, false},
{requiredList4, `{"group": [{"col": 10}]}`, result7, false},
{requiredList4, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
{optionalList1, `{}`, result9, false},
{optionalList1, `{"group": null}`, result9, false},
{optionalList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
{optionalList1, `{"group": [{"col": 10}]}`, result4, false},
{optionalList1, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
{optionalList2, `{}`, result9, false},
{optionalList2, `{"group": null}`, result9, false},
{optionalList2, `{"group": [{"col": null}]}`, result6, false},
{optionalList2, `{"group": [{"col": 10}]}`, result7, false},
{optionalList2, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
{optionalList3, `{}`, result9, false},
{optionalList3, `{"group": null}`, result9, false},
{optionalList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
{optionalList3, `{"group": [{"col": 10}]}`, result7, false},
{optionalList3, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
{optionalList4, `{}`, result9, false},
{optionalList4, `{"group": null}`, result9, false},
{optionalList4, `{"group": [{"col": null}]}`, result10, false},
{optionalList4, `{"group": [{"col": 10}]}`, result11, false},
{optionalList4, `{"group": [{"col": 10}, {"col": 20}]}`, result12, false},
}
for i, testCase := range testCases {
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
expectErr := (err != nil)
if testCase.expectErr != expectErr {
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
}
if !testCase.expectErr {
if !reflect.DeepEqual(result, testCase.expectedResult) {
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
}
}
}
}

View File

@@ -0,0 +1,237 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"reflect"
"testing"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)
func TestPopulateGroupType(t *testing.T) {
requiredGroup1 := schema.NewTree()
{
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredGroup1.Set("group", requiredGroup); err != nil {
t.Fatal(err)
}
if err = requiredGroup1.Set("group.col", requiredCol); err != nil {
t.Fatal(err)
}
if _, _, err := requiredGroup1.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
requiredGroup2 := schema.NewTree()
{
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredGroup2.Set("group", requiredGroup); err != nil {
t.Fatal(err)
}
if err = requiredGroup2.Set("group.col", optionalCol); err != nil {
t.Fatal(err)
}
if _, _, err := requiredGroup2.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalGroup1 := schema.NewTree()
{
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalGroup1.Set("group", optionalGroup); err != nil {
t.Fatal(err)
}
if err = optionalGroup1.Set("group.col", requiredCol); err != nil {
t.Fatal(err)
}
if _, _, err := optionalGroup1.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalGroup2 := schema.NewTree()
{
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalGroup2.Set("group", optionalGroup); err != nil {
t.Fatal(err)
}
if err = optionalGroup2.Set("group.col", optionalCol); err != nil {
t.Fatal(err)
}
if _, _, err := optionalGroup2.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
result1 := map[string]*Column{
"group.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result2 := map[string]*Column{
"group.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result3 := map[string]*Column{
"group.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{1},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result4 := map[string]*Column{
"group.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{1},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result5 := map[string]*Column{
"group.col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
testCases := []struct {
schemaTree *schema.Tree
data string
expectedResult map[string]*Column
expectErr bool
}{
{requiredGroup1, `{}`, nil, true}, // err: group: nil value for required field
{requiredGroup1, `{"group": null}`, nil, true}, // err: group: nil value for required field
{requiredGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field
{requiredGroup1, `{"group": {"col": 10}}`, result1, false},
{requiredGroup2, `{}`, nil, true}, // err: group: nil value for required field
{requiredGroup2, `{"group": null}`, nil, true}, // err: group: nil value for required field
{requiredGroup2, `{"group": {"col": null}}`, result2, false},
{requiredGroup2, `{"group": {"col": 10}}`, result3, false},
{optionalGroup1, `{}`, result2, false},
{optionalGroup1, `{"group": null}`, result2, false},
{optionalGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field
{optionalGroup1, `{"group": {"col": 10}}`, result3, false},
{optionalGroup2, `{}`, result2, false},
{optionalGroup2, `{"group": null}`, result2, false},
{optionalGroup2, `{"group": {"col": null}}`, result4, false},
{optionalGroup2, `{"group": {"col": 10}}`, result5, false},
}
for i, testCase := range testCases {
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
expectErr := (err != nil)
if testCase.expectErr != expectErr {
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
}
if !testCase.expectErr {
if !reflect.DeepEqual(result, testCase.expectedResult) {
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
}
}
}
}

View File

@@ -0,0 +1,698 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"reflect"
"testing"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)
func TestPopulateListOfList(t *testing.T) {
requiredList1 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("col.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = requiredList1.Set("col.list.element.list.element", requiredSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = requiredList1.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
requiredList2 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("col.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = requiredList2.Set("col.list.element.list.element", optionalSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = requiredList2.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
requiredList3 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("col.list.element", optioonalElement); err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = requiredList3.Set("col.list.element.list.element", requiredSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = requiredList3.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
requiredList4 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("col.list.element", optioonalElement); err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = requiredList4.Set("col.list.element.list.element", optionalSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = requiredList4.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList1 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("col.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = optionalList1.Set("col.list.element.list.element", requiredSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = optionalList1.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList2 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("col.list.element", requiredElement); err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = optionalList2.Set("col.list.element.list.element", optionalSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = optionalList2.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList3 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("col.list.element", optioonalElement); err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = optionalList3.Set("col.list.element.list.element", requiredSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = optionalList3.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalList4 := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("col.list", list); err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("col.list.element", optioonalElement); err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("col.list.element.list", subList); err != nil {
t.Fatal(err)
}
if err = optionalList4.Set("col.list.element.list.element", optionalSubElement); err != nil {
t.Fatal(err)
}
if _, _, err = optionalList4.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
result1 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result2 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
definitionLevels: []int64{2, 2, 2, 2, 2, 2, 2},
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v30,
},
}
result3 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result4 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{3},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result5 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
definitionLevels: []int64{3, 3, 3, 3, 3, 3, 3},
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v30,
},
}
result6 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{3},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result7 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{4},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result8 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
definitionLevels: []int64{4, 4, 4, 4, 4, 4, 4},
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v30,
},
}
result9 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result10 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{4},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result11 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{5},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result12 := map[string]*Column{
"col.list.element.list.element": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
definitionLevels: []int64{5, 5, 5, 5, 5, 5, 5},
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
rowCount: 1,
maxBitWidth: 5,
minValue: v10,
maxValue: v30,
},
}
testCases := []struct {
schemaTree *schema.Tree
data string
expectedResult map[string]*Column
expectErr bool
}{
{requiredList1, `{}`, nil, true}, // err: col: nil value for required field
{requiredList1, `{"col": null}`, nil, true}, // err: col: nil value for required field
{requiredList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
{requiredList1, `{"col": [[10]]}`, result1, false},
{requiredList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result2, false},
{requiredList2, `{}`, nil, true}, // err: col: nil value for required field
{requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field
{requiredList2, `{"col": [[null]]}`, result3, false},
{requiredList2, `{"col": [[10]]}`, result4, false},
{requiredList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
{requiredList3, `{}`, nil, true}, // err: col: nil value for required field
{requiredList3, `{"col": null}`, nil, true}, // err: col: nil value for required field
{requiredList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
{requiredList3, `{"col": [[10]]}`, result4, false},
{requiredList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
{requiredList4, `{}`, nil, true}, // err: col: nil value for required field
{requiredList4, `{"col": null}`, nil, true}, // err: col: nil value for required field
{requiredList4, `{"col": [[null]]}`, result6, false},
{requiredList4, `{"col": [[10]]}`, result7, false},
{requiredList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
{optionalList1, `{}`, result9, false},
{optionalList1, `{"col": null}`, result9, false},
{optionalList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
{optionalList1, `{"col": [[10]]}`, result4, false},
{optionalList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
{optionalList2, `{}`, result9, false},
{optionalList2, `{"col": null}`, result9, false},
{optionalList2, `{"col": [[null]]}`, result6, false},
{optionalList2, `{"col": [[10]]}`, result7, false},
{optionalList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
{optionalList3, `{}`, result9, false},
{optionalList3, `{"col": null}`, result9, false},
{optionalList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
{optionalList3, `{"col": [[10]]}`, result7, false},
{optionalList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
{optionalList4, `{}`, result9, false},
{optionalList4, `{"col": null}`, result9, false},
{optionalList4, `{"col": [[null]]}`, result10, false},
{optionalList4, `{"col": [[10]]}`, result11, false},
{optionalList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result12, false},
}
for i, testCase := range testCases {
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
expectErr := (err != nil)
if testCase.expectErr != expectErr {
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
}
if !testCase.expectErr {
if !reflect.DeepEqual(result, testCase.expectedResult) {
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
}
}
}
}

View File

@@ -0,0 +1,370 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"reflect"
"testing"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)
// TestPopulateMap feeds JSON documents to UnmarshalJSON using MAP-annotated
// schemas shaped as map.key_value.{key,value}, covering each combination of a
// REQUIRED/OPTIONAL map group with a REQUIRED/OPTIONAL value column.
//
// The test is disabled by the t.Skip("Broken") call at its top.
func TestPopulateMap(t *testing.T) {
	t.Skip("Broken")

	const (
		keyPath   = "map.key_value.key"
		valuePath = "map.key_value.value"
	)

	// mustElement unwraps the result of schema.NewElement and aborts the test
	// if element creation failed.
	mustElement := func(element *schema.Element, err error) *schema.Element {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
		return element
	}

	// newMapTree builds and validates a schema tree holding one map column.
	// Only the repetition types of the map group and of the value column vary
	// between the schemas under test; the key is always a REQUIRED UTF8
	// BYTE_ARRAY and the value is always an INT32.
	newMapTree := func(mapRepetition, valueRepetition parquet.FieldRepetitionType) *schema.Tree {
		t.Helper()
		tree := schema.NewTree()

		// All elements are created before any of them is attached to the tree.
		nodes := []struct {
			path    string
			element *schema.Element
		}{
			{"map", mustElement(schema.NewElement("map", mapRepetition,
				nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
				nil, nil, nil))},
			{"map.key_value", mustElement(schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
				nil, nil,
				nil, nil, nil))},
			{keyPath, mustElement(schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
				parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
				nil, nil, nil))},
			{valuePath, mustElement(schema.NewElement("value", valueRepetition,
				parquet.TypePtr(parquet.Type_INT32), nil,
				nil, nil, nil))},
		}
		for _, node := range nodes {
			if err := tree.Set(node.path, node.element); err != nil {
				t.Fatal(err)
			}
		}
		if _, _, err := tree.ToParquetSchema(); err != nil {
			t.Fatal(err)
		}
		return tree
	}

	requiredMap1 := newMapTree(parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_REQUIRED)
	requiredMap2 := newMapTree(parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_OPTIONAL)
	optionalMap1 := newMapTree(parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_REQUIRED)
	optionalMap2 := newMapTree(parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_OPTIONAL)

	// keyColumn is the expected key column: a single value at repetition level 0.
	keyColumn := func(value interface{}, dl int64) *Column {
		return &Column{
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{value},
			definitionLevels: []int64{dl},
			repetitionLevels: []int64{0},
		}
	}
	// valueColumn is the expected value column: a single value at repetition level 1.
	valueColumn := func(value interface{}, dl int64) *Column {
		return &Column{
			parquetType:      parquet.Type_INT32,
			values:           []interface{}{value},
			definitionLevels: []int64{dl},
			repetitionLevels: []int64{1},
		}
	}
	// entry is the expected output for a map with the single key "ten".
	entry := func(keyDL int64, value interface{}, valueDL int64) map[string]*Column {
		return map[string]*Column{
			keyPath:   keyColumn(ten, keyDL),
			valuePath: valueColumn(value, valueDL),
		}
	}
	// absentMap is the expected output when an optional map is missing or null:
	// only a null key at definition level 0 is recorded.
	absentMap := map[string]*Column{keyPath: keyColumn(nil, 0)}

	testCases := []struct {
		schemaTree     *schema.Tree
		data           string
		expectedResult map[string]*Column
		expectErr      bool
	}{
		{requiredMap1, `{}`, nil, true},                     // err: map: nil value for required field
		{requiredMap1, `{"map": null}`, nil, true},          // err: map: nil value for required field
		{requiredMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field
		{requiredMap1, `{"map": {"ten": 10}}`, entry(1, v10, 1), false},
		{requiredMap2, `{}`, nil, true},            // err: map: nil value for required field
		{requiredMap2, `{"map": null}`, nil, true}, // err: map: nil value for required field
		{requiredMap2, `{"map": {"ten": null}}`, entry(1, nil, 1), false},
		{requiredMap2, `{"map": {"ten": 10}}`, entry(1, v10, 2), false},
		{optionalMap1, `{}`, absentMap, false},
		{optionalMap1, `{"map": null}`, absentMap, false},
		{optionalMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field
		{optionalMap1, `{"map": {"ten": 10}}`, entry(2, v10, 2), false},
		{optionalMap2, `{}`, absentMap, false},
		{optionalMap2, `{"map": null}`, absentMap, false},
		{optionalMap2, `{"map": {"ten": null}}`, entry(2, nil, 2), false},
		{optionalMap2, `{"map": {"ten": 10}}`, entry(2, v10, 3), false},
	}

	for idx, tc := range testCases {
		caseNumber := idx + 1 // cases are reported 1-based

		got, err := UnmarshalJSON([]byte(tc.data), tc.schemaTree)
		if gotErr := err != nil; gotErr != tc.expectErr {
			t.Fatalf("case %v: error: expected: %v, got: %v", caseNumber, tc.expectErr, gotErr)
		}
		if tc.expectErr {
			continue
		}

		// A result mismatch is reported without stopping the remaining cases.
		if !reflect.DeepEqual(got, tc.expectedResult) {
			t.Errorf("case %v: result: expected: %v, got: %v", caseNumber, tc.expectedResult, got)
		}
	}
}

View File

@@ -0,0 +1,330 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"reflect"
"testing"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)
// TestPopulatePrimitiveList feeds JSON documents to UnmarshalJSON using
// LIST-annotated schemas shaped as col.list.element with an INT32 element,
// covering each combination of a REQUIRED/OPTIONAL list column with a
// REQUIRED/OPTIONAL element.
func TestPopulatePrimitiveList(t *testing.T) {
	const elementPath = "col.list.element"

	// mustElement unwraps the result of schema.NewElement and aborts the test
	// if element creation failed.
	mustElement := func(element *schema.Element, err error) *schema.Element {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
		return element
	}

	// newListTree builds and validates a schema tree holding one list column.
	// The column and element repetition types and the element's converted type
	// are the only things that differ between the schemas under test.
	newListTree := func(colRepetition, elementRepetition parquet.FieldRepetitionType, elementConverted *parquet.ConvertedType) *schema.Tree {
		t.Helper()
		tree := schema.NewTree()

		// All elements are created before any of them is attached to the tree.
		nodes := []struct {
			path    string
			element *schema.Element
		}{
			{"col", mustElement(schema.NewElement("col", colRepetition,
				nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
				nil, nil, nil))},
			{"col.list", mustElement(schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
				nil, nil,
				nil, nil, nil))},
			{elementPath, mustElement(schema.NewElement("element", elementRepetition,
				parquet.TypePtr(parquet.Type_INT32), elementConverted,
				nil, nil, nil))},
		}
		for _, node := range nodes {
			if err := tree.Set(node.path, node.element); err != nil {
				t.Fatal(err)
			}
		}
		if _, _, err := tree.ToParquetSchema(); err != nil {
			t.Fatal(err)
		}
		return tree
	}

	requiredList1 := newListTree(parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_REQUIRED, nil)
	requiredList2 := newListTree(parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_OPTIONAL,
		parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32))
	optionalList1 := newListTree(parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_REQUIRED, nil)
	optionalList2 := newListTree(parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_OPTIONAL,
		parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32))

	// nullAt is the expected column for a single null recorded at definition
	// level dl; no statistics are set for it.
	nullAt := func(dl int64) map[string]*Column {
		return map[string]*Column{
			elementPath: {
				parquetType:      parquet.Type_INT32,
				values:           []interface{}{nil},
				definitionLevels: []int64{dl},
				repetitionLevels: []int64{0},
				rowCount:         1,
			},
		}
	}
	// tenAt is the expected column for the one-element list [10] recorded at
	// definition level dl.
	tenAt := func(dl int64) map[string]*Column {
		return map[string]*Column{
			elementPath: {
				parquetType:      parquet.Type_INT32,
				values:           []interface{}{v10},
				definitionLevels: []int64{dl},
				repetitionLevels: []int64{0},
				rowCount:         1,
				maxBitWidth:      4,
				minValue:         v10,
				maxValue:         v10,
			},
		}
	}
	// seqAt is the expected column for the list [10, 20, 30] with every value
	// recorded at definition level dl.
	seqAt := func(dl int64) map[string]*Column {
		return map[string]*Column{
			elementPath: {
				parquetType:      parquet.Type_INT32,
				values:           []interface{}{v10, v20, v30},
				definitionLevels: []int64{dl, dl, dl},
				repetitionLevels: []int64{0, 1, 1},
				rowCount:         1,
				maxBitWidth:      5,
				minValue:         v10,
				maxValue:         v30,
			},
		}
	}

	testCases := []struct {
		schemaTree     *schema.Tree
		data           string
		expectedResult map[string]*Column
		expectErr      bool
	}{
		{requiredList1, `{}`, nil, true},              // err: col: nil value for required field
		{requiredList1, `{"col": null}`, nil, true},   // err: col: nil value for required field
		{requiredList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field
		{requiredList1, `{"col": [10]}`, tenAt(1), false},
		{requiredList1, `{"col": [10, 20, 30]}`, seqAt(1), false},
		{requiredList2, `{}`, nil, true},            // err: col: nil value for required field
		{requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field
		{requiredList2, `{"col": [null]}`, nullAt(1), false},
		{requiredList2, `{"col": [10]}`, tenAt(2), false},
		{requiredList2, `{"col": [10, 20, 30]}`, seqAt(2), false},
		{optionalList1, `{}`, nullAt(0), false},
		{optionalList1, `{"col": null}`, nullAt(0), false},
		{optionalList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field
		{optionalList1, `{"col": [10]}`, tenAt(2), false},
		{optionalList1, `{"col": [10, 20, 30]}`, seqAt(2), false},
		{optionalList2, `{}`, nullAt(0), false},
		{optionalList2, `{"col": null}`, nullAt(0), false},
		{optionalList2, `{"col": [null]}`, nullAt(2), false},
		{optionalList2, `{"col": [10]}`, tenAt(3), false},
		{optionalList2, `{"col": [10, 20, 30]}`, seqAt(3), false},
	}

	for idx, tc := range testCases {
		caseNumber := idx + 1 // cases are reported 1-based

		got, err := UnmarshalJSON([]byte(tc.data), tc.schemaTree)
		if gotErr := err != nil; gotErr != tc.expectErr {
			t.Fatalf("case %v: error: expected: %v, got: %v", caseNumber, tc.expectErr, gotErr)
		}
		if tc.expectErr {
			continue
		}

		if !reflect.DeepEqual(got, tc.expectedResult) {
			t.Fatalf("case %v: result: expected: %v, got: %v", caseNumber, tc.expectedResult, got)
		}
	}
}

View File

@@ -0,0 +1,128 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"reflect"
"testing"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)
func TestPopulatePrimitiveType(t *testing.T) {
requiredField := schema.NewTree()
{
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_INT32), nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = requiredField.Set("col", requiredCol); err != nil {
t.Fatal(err)
}
if _, _, err = requiredField.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
optionalField := schema.NewTree()
{
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
if err = optionalField.Set("col", optionalCol); err != nil {
t.Fatal(err)
}
if _, _, err = optionalField.ToParquetSchema(); err != nil {
t.Fatal(err)
}
}
result1 := map[string]*Column{
"col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
result2 := map[string]*Column{
"col": {
parquetType: parquet.Type_INT32,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
rowCount: 1,
},
}
result3 := map[string]*Column{
"col": {
parquetType: parquet.Type_INT32,
values: []interface{}{v10},
definitionLevels: []int64{1},
repetitionLevels: []int64{0},
rowCount: 1,
maxBitWidth: 4,
minValue: v10,
maxValue: v10,
},
}
testCases := []struct {
schemaTree *schema.Tree
data string
expectedResult map[string]*Column
expectErr bool
}{
{requiredField, `{}`, nil, true},
{requiredField, `{"col": null}`, nil, true}, // err: col: nil value for required field
{requiredField, `{"col": 10}`, result1, false},
{optionalField, `{}`, result2, false},
{optionalField, `{"col": null}`, result2, false},
{optionalField, `{"col": 10}`, result3, false},
}
for i, testCase := range testCases {
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
expectErr := (err != nil)
if testCase.expectErr != expectErr {
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
}
if !testCase.expectErr {
if !reflect.DeepEqual(result, testCase.expectedResult) {
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
}
}
}
}

View File

@@ -0,0 +1,680 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"bytes"
"context"
"fmt"
"strings"
"git.apache.org/thrift.git/lib/go/thrift"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/encoding"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
"github.com/tidwall/gjson"
"github.com/tidwall/sjson"
)
// getDefaultEncoding maps a parquet physical type to the encoding chosen for it
// by default:
//
//   - BYTE_ARRAY                  -> DELTA_LENGTH_BYTE_ARRAY
//   - INT32, INT64, FLOAT, DOUBLE -> RLE_DICTIONARY
//   - BOOLEAN and any other type  -> PLAIN
func getDefaultEncoding(parquetType parquet.Type) parquet.Encoding {
	switch parquetType {
	case parquet.Type_BYTE_ARRAY:
		return parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY
	case parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE:
		return parquet.Encoding_RLE_DICTIONARY
	default:
		// Covers BOOLEAN as well as every type not listed above.
		return parquet.Encoding_PLAIN
	}
}
// getFirstValueElement descends through the tree and returns the first element
// that has no children. For an element that does have children, the search
// continues recursively inside those children.
//
// The Range callback always returns false; presumably this makes Range stop
// after the first element it visits (confirm against schema.Tree.Range). If the
// callback is never invoked, the result is nil.
func getFirstValueElement(tree *schema.Tree) (valueElement *schema.Element) {
	tree.Range(func(_ string, first *schema.Element) bool {
		valueElement = first
		if first.Children != nil {
			// Not a leaf: the answer is the first leaf below this element.
			valueElement = getFirstValueElement(first.Children)
		}
		return false
	})

	return valueElement
}
// populate walks the elements of tree, looks up the value with the same name
// in input and appends it (with its definition and repetition levels) to the
// matching column in columnDataMap. Columns are created on first use and are
// keyed by the element's PathInSchema.
//
// firstValueRL is the repetition level given to the first primitive element
// handled by this call and to the first entry of a list; later values use
// the element's MaxRepetitionLevel.
//
// It returns columnDataMap together with the first error encountered. It
// panics if an element directly handled here has REPEATED repetition type,
// which the code treats as a schema invariant violation.
func populate(columnDataMap map[string]*Column, input *jsonValue, tree *schema.Tree, firstValueRL int64) (map[string]*Column, error) {
	var err error

	pos := 0
	// handleElement returns false to ask Range to stop; err then carries the
	// reason (or stays nil for a deliberate early stop).
	handleElement := func(name string, element *schema.Element) bool {
		pos++

		dataPath := element.PathInTree

		if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
			panic(fmt.Errorf("%v: repetition type must be REQUIRED or OPTIONAL type", dataPath))
		}

		inputValue := input.Get(name)
		if *element.RepetitionType == parquet.FieldRepetitionType_REQUIRED && inputValue.IsNull() {
			err = fmt.Errorf("%v: nil value for required field", dataPath)
			return false
		}

		// add appends one value to the column of the given element, creating
		// the column when it does not exist yet.
		add := func(element *schema.Element, value interface{}, DL, RL int64) {
			columnData := columnDataMap[element.PathInSchema]
			if columnData == nil {
				columnData = NewColumn(*element.Type)
			}
			columnData.add(value, DL, RL)
			columnDataMap[element.PathInSchema] = columnData
		}

		// Handle primitive type element.
		if element.Type != nil {
			var value interface{}
			if value, err = inputValue.GetValue(*element.Type, element.ConvertedType); err != nil {
				return false
			}

			// A missing optional value is recorded one definition level lower.
			DL := element.MaxDefinitionLevel
			if value == nil && DL > 0 {
				DL--
			}

			RL := element.MaxRepetitionLevel
			if pos == 1 {
				RL = firstValueRL
			}

			add(element, value, DL, RL)
			return true
		}

		// addNull records a null for a missing group/list/map on the first
		// leaf column beneath the element, with both levels lowered by one
		// (never below zero).
		addNull := func() {
			valueElement := getFirstValueElement(element.Children)

			DL := element.MaxDefinitionLevel
			if DL > 0 {
				DL--
			}

			RL := element.MaxRepetitionLevel
			if RL > 0 {
				RL--
			}

			add(valueElement, nil, DL, RL)
		}

		// Handle group type element.
		if element.ConvertedType == nil {
			if inputValue.IsNull() {
				addNull()
				return true
			}

			columnDataMap, err = populate(columnDataMap, inputValue, element.Children, firstValueRL)
			return (err == nil)
		}

		// Handle list type element.
		if *element.ConvertedType == parquet.ConvertedType_LIST {
			if inputValue.IsNull() {
				addNull()
				return true
			}

			var results []gjson.Result
			if results, err = inputValue.GetArray(); err != nil {
				return false
			}

			// NOTE(review): the second results of Get are ignored; this
			// assumes the schema always has the list/element children.
			listElement, _ := element.Children.Get("list")
			valueElement, _ := listElement.Children.Get("element")
			for i := range results {
				rl := valueElement.MaxRepetitionLevel
				if i == 0 {
					rl = firstValueRL
				}

				// Wrap each entry as {"element": <entry>} so that it can be
				// populated against the children of the "list" element.
				var jsonData []byte
				if jsonData, err = sjson.SetBytes([]byte{}, "element", results[i].Value()); err != nil {
					return false
				}

				var jv *jsonValue
				if jv, err = bytesToJSONValue(jsonData); err != nil {
					return false
				}

				if columnDataMap, err = populate(columnDataMap, jv, listElement.Children, rl); err != nil {
					return false
				}
			}
			return true
		}

		// Handle map type element.
		if *element.ConvertedType == parquet.ConvertedType_MAP {
			if inputValue.IsNull() {
				addNull()
				return true
			}

			// NOTE(review): the second result of Get is ignored; this assumes
			// the schema always has the key_value child.
			keyValueElement, _ := element.Children.Get("key_value")
			// rerr collects failures raised inside the iterator; every check
			// below must test rerr (not the outer err, which is only assigned
			// once Range has returned).
			var rerr error
			err = inputValue.Range(func(key, value gjson.Result) bool {
				if !key.Exists() || key.Type == gjson.Null {
					rerr = fmt.Errorf("%v.key_value.key: not found or null", dataPath)
					return false
				}

				// Wrap each entry as {"key": <key>, "value": <value>} so that
				// it can be populated against the children of "key_value".
				var jsonData []byte
				if jsonData, rerr = sjson.SetBytes([]byte{}, "key", key.Value()); rerr != nil {
					return false
				}

				if jsonData, rerr = sjson.SetBytes(jsonData, "value", value.Value()); rerr != nil {
					return false
				}

				var jv *jsonValue
				if jv, rerr = bytesToJSONValue(jsonData); rerr != nil {
					return false
				}

				if columnDataMap, rerr = populate(columnDataMap, jv, keyValueElement.Children, firstValueRL); rerr != nil {
					return false
				}

				return true
			})

			if err != nil {
				return false
			}

			err = rerr
			return (err == nil)
		}

		err = fmt.Errorf("%v: unsupported converted type %v in %v field type", dataPath, *element.ConvertedType, *element.RepetitionType)
		return false
	}

	tree.Range(handleElement)
	return columnDataMap, err
}
// Column - denotes values of a column.
//
// Values are appended together with their definition and repetition levels;
// the remaining fields are statistics maintained while values are added.
type Column struct {
parquetType parquet.Type // value type.
values []interface{} // must be a slice of parquet typed values.
definitionLevels []int64 // exactly same length of values.
repetitionLevels []int64 // exactly same length of values.
rowCount int32 // number of values added with repetition level 0.
maxBitWidth int32 // largest per-value width computed by updateStats.
minValue interface{} // smallest non-nil value seen so far; nil until one is added.
maxValue interface{} // largest non-nil value seen so far; nil until one is added.
}
// updateMinMaxValue folds value into the column's running minimum and
// maximum. The first non-nil value initialises both bounds; later values
// replace a bound only when they compare strictly lower or higher.
//
// value must hold the Go type matching column.parquetType (bool, int32,
// int64, float32, float64 or []byte); any other dynamic type panics on the
// type assertion. A nil value is ignored because it carries no ordering
// information.
func (column *Column) updateMinMaxValue(value interface{}) {
	if value == nil {
		return
	}

	if column.minValue == nil && column.maxValue == nil {
		column.minValue = value
		column.maxValue = value
		return
	}

	switch column.parquetType {
	case parquet.Type_BOOLEAN:
		// false orders before true.
		if column.minValue.(bool) && !value.(bool) {
			column.minValue = value
		}

		if !column.maxValue.(bool) && value.(bool) {
			column.maxValue = value
		}

	case parquet.Type_INT32:
		if column.minValue.(int32) > value.(int32) {
			column.minValue = value
		}

		if column.maxValue.(int32) < value.(int32) {
			column.maxValue = value
		}

	case parquet.Type_INT64:
		if column.minValue.(int64) > value.(int64) {
			column.minValue = value
		}

		if column.maxValue.(int64) < value.(int64) {
			column.maxValue = value
		}

	case parquet.Type_FLOAT:
		if column.minValue.(float32) > value.(float32) {
			column.minValue = value
		}

		if column.maxValue.(float32) < value.(float32) {
			column.maxValue = value
		}

	case parquet.Type_DOUBLE:
		if column.minValue.(float64) > value.(float64) {
			column.minValue = value
		}

		if column.maxValue.(float64) < value.(float64) {
			column.maxValue = value
		}

	case parquet.Type_BYTE_ARRAY:
		// Byte arrays are ordered lexicographically.
		if bytes.Compare(column.minValue.([]byte), value.([]byte)) > 0 {
			column.minValue = value
		}

		// Compare against the current maximum (not the minimum), otherwise
		// any value above the minimum would overwrite a larger maximum.
		if bytes.Compare(column.maxValue.([]byte), value.([]byte)) < 0 {
			column.maxValue = value
		}
	}
}
// updateStats refreshes the column statistics for one appended value.
// A repetition level of zero increments the row count. Nil values affect
// nothing else; for non-nil values the maximum bit width and the min/max
// bounds are updated. DL is accepted for symmetry with add but is not used.
func (column *Column) updateStats(value interface{}, DL, RL int64) {
	if RL == 0 {
		column.rowCount++
	}

	if value == nil {
		return
	}

	var width int32
	switch column.parquetType {
	case parquet.Type_BYTE_ARRAY:
		// For byte arrays the "width" tracked is the length in bytes.
		width = int32(len(value.([]byte)))
	case parquet.Type_DOUBLE:
		width = 64
	case parquet.Type_FLOAT:
		width = 32
	case parquet.Type_INT64:
		width = common.BitWidth(uint64(value.(int64)))
	case parquet.Type_INT32:
		width = common.BitWidth(uint64(value.(int32)))
	case parquet.Type_BOOLEAN:
		width = 1
	}

	if width > column.maxBitWidth {
		column.maxBitWidth = width
	}

	column.updateMinMaxValue(value)
}
// add appends value with its definition level DL and repetition level RL to
// the column, keeping the three parallel slices the same length, and then
// updates the column statistics.
func (column *Column) add(value interface{}, DL, RL int64) {
	column.repetitionLevels = append(column.repetitionLevels, RL)
	column.definitionLevels = append(column.definitionLevels, DL)
	column.values = append(column.values, value)

	column.updateStats(value, DL, RL)
}
// AddNull - appends a nil value with the given definition level DL and
// repetition level RL. No type check is needed for nil.
func (column *Column) AddNull(DL, RL int64) {
	var null interface{}
	column.add(null, DL, RL)
}
// AddBoolean - appends a boolean value with the given definition level DL
// and repetition level RL. It panics if the column is not of BOOLEAN type.
func (column *Column) AddBoolean(value bool, DL, RL int64) {
	if column.parquetType == parquet.Type_BOOLEAN {
		column.add(value, DL, RL)
		return
	}

	panic(fmt.Errorf("expected %v value", column.parquetType))
}
// AddInt32 - appends an int32 value with the given definition level DL and
// repetition level RL. It panics if the column is not of INT32 type.
func (column *Column) AddInt32(value int32, DL, RL int64) {
	if column.parquetType == parquet.Type_INT32 {
		column.add(value, DL, RL)
		return
	}

	panic(fmt.Errorf("expected %v value", column.parquetType))
}
// AddInt64 - appends an int64 value with the given definition level DL and
// repetition level RL. It panics if the column is not of INT64 type.
func (column *Column) AddInt64(value int64, DL, RL int64) {
	if column.parquetType == parquet.Type_INT64 {
		column.add(value, DL, RL)
		return
	}

	panic(fmt.Errorf("expected %v value", column.parquetType))
}
// AddFloat - appends a float32 value with the given definition level DL and
// repetition level RL. It panics if the column is not of FLOAT type.
func (column *Column) AddFloat(value float32, DL, RL int64) {
	if column.parquetType == parquet.Type_FLOAT {
		column.add(value, DL, RL)
		return
	}

	panic(fmt.Errorf("expected %v value", column.parquetType))
}
// AddDouble - appends a float64 value with the given definition level DL and
// repetition level RL. It panics if the column is not of DOUBLE type.
func (column *Column) AddDouble(value float64, DL, RL int64) {
	if column.parquetType == parquet.Type_DOUBLE {
		column.add(value, DL, RL)
		return
	}

	panic(fmt.Errorf("expected %v value", column.parquetType))
}
// AddByteArray - appends a byte array value with the given definition level
// DL and repetition level RL. It panics if the column is not of BYTE_ARRAY
// type.
func (column *Column) AddByteArray(value []byte, DL, RL int64) {
	if column.parquetType == parquet.Type_BYTE_ARRAY {
		column.add(value, DL, RL)
		return
	}

	panic(fmt.Errorf("expected %v value", column.parquetType))
}
// Merge - appends the values, levels and statistics of column2 to column.
// It panics if the two columns have different parquet types.
func (column *Column) Merge(column2 *Column) {
	if column.parquetType != column2.parquetType {
		panic(fmt.Errorf("merge differs in parquet type"))
	}

	column.values = append(column.values, column2.values...)
	column.definitionLevels = append(column.definitionLevels, column2.definitionLevels...)
	column.repetitionLevels = append(column.repetitionLevels, column2.repetitionLevels...)

	column.rowCount += column2.rowCount
	if column.maxBitWidth < column2.maxBitWidth {
		column.maxBitWidth = column2.maxBitWidth
	}

	// column2 has nil bounds when it holds no non-nil value (empty or only
	// nulls). Skip them: they add no information, and comparing a nil bound
	// against existing bounds would panic on the type assertion.
	if column2.minValue != nil {
		column.updateMinMaxValue(column2.minValue)
	}
	if column2.maxValue != nil {
		column.updateMinMaxValue(column2.maxValue)
	}
}
// String renders every field of the column as "name: value" pairs, joined by
// ", " and wrapped in braces.
func (column *Column) String() string {
	fields := []string{
		fmt.Sprintf("parquetType: %v", column.parquetType),
		fmt.Sprintf("values: %v", column.values),
		fmt.Sprintf("definitionLevels: %v", column.definitionLevels),
		fmt.Sprintf("repetitionLevels: %v", column.repetitionLevels),
		fmt.Sprintf("rowCount: %v", column.rowCount),
		fmt.Sprintf("maxBitWidth: %v", column.maxBitWidth),
		fmt.Sprintf("minValue: %v", column.minValue),
		fmt.Sprintf("maxValue: %v", column.maxValue),
	}

	return "{" + strings.Join(fields, ", ") + "}"
}
// encodeValue PLAIN-encodes a single value of the column's type, as used
// for the min/max statistics. It returns nil for a nil value. For
// BYTE_ARRAY columns whose converted type is UTF8 or DECIMAL the first four
// bytes of the encoding are dropped (presumably the length prefix written by
// PlainEncode - TODO confirm).
func (column *Column) encodeValue(value interface{}, element *schema.Element) []byte {
	if value == nil {
		return nil
	}

	single := common.ToSliceValue([]interface{}{value}, column.parquetType)
	encoded := encoding.PlainEncode(single, column.parquetType)

	if column.parquetType != parquet.Type_BYTE_ARRAY || element.ConvertedType == nil {
		return encoded
	}

	if ct := *element.ConvertedType; ct == parquet.ConvertedType_UTF8 || ct == parquet.ConvertedType_DECIMAL {
		return encoded[4:]
	}

	return encoded
}
// toDataPageV2 serialises the column as a single DATA_PAGE_V2 page and
// returns it as a column chunk with its metadata filled in.
//
// parquetEncoding selects how the non-nil values are encoded; PLAIN and
// DELTA_LENGTH_BYTE_ARRAY are handled, any other encoding produces an empty
// value section. The values are compressed with the element's compression
// type (SNAPPY when unset), while the repetition and definition levels are
// written uncompressed ahead of them. It panics if compression or
// serialising the page header fails.
func (column *Column) toDataPageV2(element *schema.Element, parquetEncoding parquet.Encoding) *ColumnChunk {
	// Nulls are represented by the definition levels only; just the non-nil
	// values are encoded.
	var definedValues []interface{}
	for _, value := range column.values {
		if value != nil {
			definedValues = append(definedValues, value)
		}
	}

	var encodedData []byte
	switch parquetEncoding {
	case parquet.Encoding_PLAIN:
		encodedData = encoding.PlainEncode(common.ToSliceValue(definedValues, column.parquetType), column.parquetType)

	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
		// Iterate the defined values only: asserting []byte on a nil entry
		// of column.values would panic, and NumNulls below already assumes
		// that nulls are excluded from the encoded data.
		var bytesSlices [][]byte
		for _, value := range definedValues {
			bytesSlices = append(bytesSlices, value.([]byte))
		}
		encodedData = encoding.DeltaLengthByteArrayEncode(bytesSlices)
	}

	compressionType := parquet.CompressionCodec_SNAPPY
	if element.CompressionType != nil {
		compressionType = *element.CompressionType
	}

	compressedData, err := common.Compress(compressionType, encodedData)
	if err != nil {
		panic(err)
	}

	DLData := encoding.RLEBitPackedHybridEncode(
		column.definitionLevels,
		common.BitWidth(uint64(element.MaxDefinitionLevel)),
		parquet.Type_INT64,
	)

	RLData := encoding.RLEBitPackedHybridEncode(
		column.repetitionLevels,
		common.BitWidth(uint64(element.MaxRepetitionLevel)),
		parquet.Type_INT64,
	)

	pageHeader := parquet.NewPageHeader()
	pageHeader.Type = parquet.PageType_DATA_PAGE_V2
	// Both page sizes include the (uncompressed) level data.
	pageHeader.CompressedPageSize = int32(len(compressedData) + len(DLData) + len(RLData))
	pageHeader.UncompressedPageSize = int32(len(encodedData) + len(DLData) + len(RLData))
	pageHeader.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
	pageHeader.DataPageHeaderV2.NumValues = int32(len(column.values))
	pageHeader.DataPageHeaderV2.NumNulls = int32(len(column.values) - len(definedValues))
	pageHeader.DataPageHeaderV2.NumRows = column.rowCount
	pageHeader.DataPageHeaderV2.Encoding = parquetEncoding
	pageHeader.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(DLData))
	pageHeader.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(RLData))
	pageHeader.DataPageHeaderV2.IsCompressed = true
	pageHeader.DataPageHeaderV2.Statistics = parquet.NewStatistics()
	pageHeader.DataPageHeaderV2.Statistics.Min = column.encodeValue(column.minValue, element)
	pageHeader.DataPageHeaderV2.Statistics.Max = column.encodeValue(column.maxValue, element)

	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	rawData, err := ts.Write(context.TODO(), pageHeader)
	if err != nil {
		panic(err)
	}

	// Page layout: header, repetition levels, definition levels, values.
	rawData = append(rawData, RLData...)
	rawData = append(rawData, DLData...)
	rawData = append(rawData, compressedData...)

	metadata := parquet.NewColumnMetaData()
	metadata.Type = column.parquetType
	metadata.Encodings = []parquet.Encoding{
		parquet.Encoding_PLAIN,
		parquet.Encoding_RLE,
		parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
	}
	metadata.Codec = compressionType
	metadata.NumValues = int64(pageHeader.DataPageHeaderV2.NumValues)
	metadata.TotalCompressedSize = int64(len(rawData))
	// len(rawData) - CompressedPageSize is the serialised header length, so
	// this is the uncompressed page size plus the header.
	metadata.TotalUncompressedSize = int64(pageHeader.UncompressedPageSize) + int64(len(rawData)) - int64(pageHeader.CompressedPageSize)
	metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
	metadata.Statistics = parquet.NewStatistics()
	metadata.Statistics.Min = pageHeader.DataPageHeaderV2.Statistics.Min
	metadata.Statistics.Max = pageHeader.DataPageHeaderV2.Statistics.Max

	chunk := new(ColumnChunk)
	chunk.ColumnChunk.MetaData = metadata
	chunk.dataPageLen = int64(len(rawData))
	chunk.dataLen = int64(len(rawData))
	chunk.data = rawData

	return chunk
}
// toRLEDictPage serialises the column as a DICTIONARY_PAGE followed by a
// DATA_PAGE whose values are RLE dictionary indices, and returns both pages
// as one column chunk with its metadata filled in.
//
// Each page body is compressed with the element's compression type (SNAPPY
// when unset). In the data page the repetition levels, definition levels,
// index bit width and indices are compressed together. It panics if
// compression or serialising a page header fails.
func (column *Column) toRLEDictPage(element *schema.Element) *ColumnChunk {
	dictPageData, dataPageData, dictValueCount, indexBitWidth := encoding.RLEDictEncode(column.values, column.parquetType, column.maxBitWidth)

	compressionType := parquet.CompressionCodec_SNAPPY
	if element.CompressionType != nil {
		compressionType = *element.CompressionType
	}

	// Dictionary page.
	compressedData, err := common.Compress(compressionType, dictPageData)
	if err != nil {
		panic(err)
	}

	dictPageHeader := parquet.NewPageHeader()
	dictPageHeader.Type = parquet.PageType_DICTIONARY_PAGE
	dictPageHeader.CompressedPageSize = int32(len(compressedData))
	dictPageHeader.UncompressedPageSize = int32(len(dictPageData))
	dictPageHeader.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
	dictPageHeader.DictionaryPageHeader.NumValues = dictValueCount
	dictPageHeader.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN

	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	dictPageRawData, err := ts.Write(context.TODO(), dictPageHeader)
	if err != nil {
		panic(err)
	}

	dictPageRawData = append(dictPageRawData, compressedData...)

	// Data page body: repetition levels, definition levels, index bit width,
	// dictionary indices.
	RLData := encoding.RLEBitPackedHybridEncode(
		column.repetitionLevels,
		common.BitWidth(uint64(element.MaxRepetitionLevel)),
		parquet.Type_INT64,
	)
	encodedData := RLData

	DLData := encoding.RLEBitPackedHybridEncode(
		column.definitionLevels,
		common.BitWidth(uint64(element.MaxDefinitionLevel)),
		parquet.Type_INT64,
	)
	encodedData = append(encodedData, DLData...)

	encodedData = append(encodedData, indexBitWidth)
	encodedData = append(encodedData, dataPageData...)

	compressedData, err = common.Compress(compressionType, encodedData)
	if err != nil {
		panic(err)
	}

	dataPageHeader := parquet.NewPageHeader()
	dataPageHeader.Type = parquet.PageType_DATA_PAGE
	dataPageHeader.CompressedPageSize = int32(len(compressedData))
	dataPageHeader.UncompressedPageSize = int32(len(encodedData))
	dataPageHeader.DataPageHeader = parquet.NewDataPageHeader()
	dataPageHeader.DataPageHeader.NumValues = int32(len(column.values))
	dataPageHeader.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
	dataPageHeader.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
	dataPageHeader.DataPageHeader.Encoding = parquet.Encoding_RLE_DICTIONARY

	ts = thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	dataPageRawData, err := ts.Write(context.TODO(), dataPageHeader)
	if err != nil {
		panic(err)
	}

	dataPageRawData = append(dataPageRawData, compressedData...)

	metadata := parquet.NewColumnMetaData()
	metadata.Type = column.parquetType
	metadata.Encodings = []parquet.Encoding{
		parquet.Encoding_PLAIN,
		parquet.Encoding_RLE,
		parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
		parquet.Encoding_RLE_DICTIONARY,
	}
	metadata.Codec = compressionType
	metadata.NumValues = int64(dataPageHeader.DataPageHeader.NumValues)
	metadata.TotalCompressedSize = int64(len(dictPageRawData)) + int64(len(dataPageRawData))
	// The uncompressed size of a page is its uncompressed payload plus its
	// serialised header. The header length is the raw page length minus the
	// compressed payload length, the same derivation toDataPageV2 uses.
	dictHeaderLen := int64(len(dictPageRawData)) - int64(dictPageHeader.CompressedPageSize)
	dataHeaderLen := int64(len(dataPageRawData)) - int64(dataPageHeader.CompressedPageSize)
	uncompressedSize := int64(dictPageHeader.UncompressedPageSize) + dictHeaderLen
	uncompressedSize += int64(dataPageHeader.UncompressedPageSize) + dataHeaderLen
	metadata.TotalUncompressedSize = uncompressedSize
	metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
	metadata.Statistics = parquet.NewStatistics()
	metadata.Statistics.Min = column.encodeValue(column.minValue, element)
	metadata.Statistics.Max = column.encodeValue(column.maxValue, element)

	chunk := new(ColumnChunk)
	chunk.ColumnChunk.MetaData = metadata
	chunk.isDictPage = true
	chunk.dictPageLen = int64(len(dictPageRawData))
	chunk.dataPageLen = int64(len(dataPageRawData))
	chunk.dataLen = chunk.dictPageLen + chunk.dataPageLen
	// The dictionary page precedes the data page in the chunk data.
	chunk.data = append(dictPageRawData, dataPageRawData...)

	return chunk
}
// Encode serialises the column into a column chunk. The encoding comes from
// element.Encoding when set, otherwise from the default for the column's
// parquet type. PLAIN and DELTA_LENGTH_BYTE_ARRAY produce a data page V2;
// every other encoding produces an RLE dictionary page pair.
func (column *Column) Encode(element *schema.Element) *ColumnChunk {
	chosen := getDefaultEncoding(column.parquetType)
	if element.Encoding != nil {
		chosen = *element.Encoding
	}

	if chosen == parquet.Encoding_PLAIN || chosen == parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY {
		return column.toDataPageV2(element, chosen)
	}

	return column.toRLEDictPage(element)
}
// NewColumn - creates an empty column holding values of the given parquet
// type. It panics for any type other than BOOLEAN, INT32, INT64, FLOAT,
// DOUBLE and BYTE_ARRAY.
func NewColumn(parquetType parquet.Type) *Column {
	supported := false
	switch parquetType {
	case parquet.Type_BOOLEAN,
		parquet.Type_INT32,
		parquet.Type_INT64,
		parquet.Type_FLOAT,
		parquet.Type_DOUBLE,
		parquet.Type_BYTE_ARRAY:
		supported = true
	}

	if !supported {
		panic(fmt.Errorf("unsupported parquet type %v", parquetType))
	}

	column := new(Column)
	column.parquetType = parquetType
	return column
}
// UnmarshalJSON - decodes one JSON document into columns keyed by schema
// path, following the given schema tree. It returns an error if the tree is
// not read only, if data is not valid JSON or if the document does not fit
// the schema.
func UnmarshalJSON(data []byte, tree *schema.Tree) (map[string]*Column, error) {
	if tree.ReadOnly() {
		root, err := bytesToJSONValue(data)
		if err != nil {
			return nil, err
		}

		// Top-level values start a new row, hence repetition level 0.
		return populate(make(map[string]*Column), root, tree, 0)
	}

	return nil, fmt.Errorf("tree must be read only")
}

View File

@@ -0,0 +1,369 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"reflect"
"testing"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)
// Fixture values for the tests. TestAddressBookExample uses foo, bar and
// the phone numbers; the remaining values are not referenced in this file
// (presumably they are used by other tests in the package).
var (
	v10 int32 = 10
	v20 int32 = 20
	v30 int32 = 30
)

var (
	ten = []byte("ten")
	foo = []byte("foo")
	bar = []byte("bar")
)

var (
	phone1 = []byte("1-234-567-8901")
	phone2 = []byte("1-234-567-1098")
	phone3 = []byte("1-111-222-3333")
)
// TestAddressBookExample builds an address book schema (an owner string, a
// list of owner phone numbers and a list of contacts), then checks that
// UnmarshalJSON turns a series of JSON documents into the expected columns.
// The test is currently skipped: t.Skip("Broken") runs before anything else,
// so none of the code below is executed.
func TestAddressBookExample(t *testing.T) {
// message AddressBook {
// required string owner;
// repeated string ownerPhoneNumbers;
// repeated group contacts {
// required string name;
// optional string phoneNumber;
// }
// }
// NOTE(review): the repeated fields above are modelled below as OPTIONAL
// LIST groups with a REPEATED "list" child and a REQUIRED "element" child.
t.Skip("Broken")
addressBook := schema.NewTree()
{
owner, err := schema.NewElement("owner", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
ownerPhoneNumbers, err := schema.NewElement("ownerPhoneNumbers", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
ownerPhoneNumbersList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
ownerPhoneNumbersElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
contacts, err := schema.NewElement("contacts", parquet.FieldRepetitionType_OPTIONAL,
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
contactsList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
contactsElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
nil, nil,
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
contactName, err := schema.NewElement("name", parquet.FieldRepetitionType_REQUIRED,
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
contactPhoneNumber, err := schema.NewElement("phoneNumber", parquet.FieldRepetitionType_OPTIONAL,
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
nil, nil, nil)
if err != nil {
t.Fatal(err)
}
// Register the elements in the tree, parents before their children.
if err = addressBook.Set("owner", owner); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("ownerPhoneNumbers", ownerPhoneNumbers); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("ownerPhoneNumbers.list", ownerPhoneNumbersList); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("ownerPhoneNumbers.list.element", ownerPhoneNumbersElement); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("contacts", contacts); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("contacts.list", contactsList); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("contacts.list.element", contactsElement); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("contacts.list.element.name", contactName); err != nil {
t.Fatal(err)
}
if err = addressBook.Set("contacts.list.element.phoneNumber", contactPhoneNumber); err != nil {
t.Fatal(err)
}
}
// Only the error of ToParquetSchema is checked here; its results are
// discarded. NOTE(review): presumably this call also makes the tree read
// only, which UnmarshalJSON requires - confirm against the schema package.
if _, _, err := addressBook.ToParquetSchema(); err != nil {
t.Fatal(err)
}
// Case 2: only the required owner is present.
case2Data := `{
"owner": "foo"
}`
result2 := map[string]*Column{
"owner": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{foo},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"ownerPhoneNumbers.list.element": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"contacts.list.element.name": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
}
// Case 3: owner with a single phone number.
case3Data := `{
"owner": "foo",
"ownerPhoneNumbers": [
"1-234-567-8901"
]
}
`
result3 := map[string]*Column{
"owner": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{foo},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"ownerPhoneNumbers.list.element": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{phone1},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
},
"contacts.list.element.name": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
}
// Case 4: owner with two phone numbers.
case4Data := `{
"owner": "foo",
"ownerPhoneNumbers": [
"1-234-567-8901",
"1-234-567-1098"
]
}
`
result4 := map[string]*Column{
"owner": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{foo},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"ownerPhoneNumbers.list.element": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{phone1, phone2},
definitionLevels: []int64{2, 2},
repetitionLevels: []int64{0, 1},
},
"contacts.list.element.name": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
}
// Case 5: one contact without a phone number.
case5Data := `{
"contacts": [
{
"name": "bar"
}
],
"owner": "foo"
}`
result5 := map[string]*Column{
"owner": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{foo},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"ownerPhoneNumbers.list.element": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"contacts.list.element.name": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{bar},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
},
"contacts.list.element.phoneNumber": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{nil},
definitionLevels: []int64{2},
repetitionLevels: []int64{1},
},
}
// Case 6: one contact with a phone number.
case6Data := `{
"contacts": [
{
"name": "bar",
"phoneNumber": "1-111-222-3333"
}
],
"owner": "foo"
}`
result6 := map[string]*Column{
"owner": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{foo},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"ownerPhoneNumbers.list.element": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{nil},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"contacts.list.element.name": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{bar},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
},
"contacts.list.element.phoneNumber": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{phone3},
definitionLevels: []int64{3},
repetitionLevels: []int64{1},
},
}
// Case 7: owner phone numbers and a full contact together.
case7Data := `{
"contacts": [
{
"name": "bar",
"phoneNumber": "1-111-222-3333"
}
],
"owner": "foo",
"ownerPhoneNumbers": [
"1-234-567-8901",
"1-234-567-1098"
]
}`
result7 := map[string]*Column{
"owner": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{foo},
definitionLevels: []int64{0},
repetitionLevels: []int64{0},
},
"ownerPhoneNumbers.list.element": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{phone1, phone2},
definitionLevels: []int64{2, 2},
repetitionLevels: []int64{0, 1},
},
"contacts.list.element.name": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{bar},
definitionLevels: []int64{2},
repetitionLevels: []int64{0},
},
"contacts.list.element.phoneNumber": {
parquetType: parquet.Type_BYTE_ARRAY,
values: []interface{}{phone3},
definitionLevels: []int64{3},
repetitionLevels: []int64{1},
},
}
testCases := []struct {
data string
expectedResult map[string]*Column
expectErr bool
}{
{`{}`, nil, true}, // err: owner: nil value for required field
{case2Data, result2, false},
{case3Data, result3, false},
{case4Data, result4, false},
{case5Data, result5, false},
{case6Data, result6, false},
{case7Data, result7, false},
}
for i, testCase := range testCases {
result, err := UnmarshalJSON([]byte(testCase.data), addressBook)
expectErr := (err != nil)
// A mismatch in error expectation aborts the whole test.
if testCase.expectErr != expectErr {
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
}
// Results are only compared for cases that are expected to succeed.
if !testCase.expectErr {
if !reflect.DeepEqual(result, testCase.expectedResult) {
t.Errorf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
}
}
}
}

View File

@@ -0,0 +1,65 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)
// ColumnChunk wraps the generated parquet.ColumnChunk metadata together with
// the serialised page bytes it describes.
type ColumnChunk struct {
parquet.ColumnChunk
isDictPage bool // true when data starts with a dictionary page.
dictPageLen int64 // length in bytes of the dictionary page; only set when isDictPage is true.
dataPageLen int64 // length in bytes of the data page.
dataLen int64 // total length in bytes of data.
data []byte // serialised page(s): dictionary page first when present, then the data page.
}
// Data returns the serialised bytes of the chunk. The returned slice is the
// chunk's own buffer, not a copy.
func (chunk *ColumnChunk) Data() []byte {
	payload := chunk.data
	return payload
}
// DataLen returns the recorded length in bytes of the chunk's data.
func (chunk *ColumnChunk) DataLen() int64 {
	size := chunk.dataLen
	return size
}
// NewRowGroup creates a row group of numRows rows from the given column
// chunks. Chunks are laid out back to back starting at offset: each chunk's
// file offset, dictionary page offset (when it has a dictionary page) and
// data page offset are set on the chunk itself, so the passed chunks are
// modified.
func NewRowGroup(chunks []*ColumnChunk, numRows, offset int64) *parquet.RowGroup {
	rowGroup := parquet.NewRowGroup()
	rowGroup.NumRows = numRows

	for i := range chunks {
		current := chunks[i]

		rowGroup.Columns = append(rowGroup.Columns, &current.ColumnChunk)
		rowGroup.TotalByteSize += current.dataLen

		current.ColumnChunk.FileOffset = offset

		if current.isDictPage {
			// Take a copy so that the stored pointer does not alias offset.
			dictOffset := offset
			current.ColumnChunk.MetaData.DictionaryPageOffset = &dictOffset
			offset += current.dictPageLen
		}

		current.ColumnChunk.MetaData.DataPageOffset = offset
		offset += current.dataPageLen
	}

	return rowGroup
}

View File

@@ -0,0 +1,107 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"fmt"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/tidwall/gjson"
)
// jsonValue wraps an optional gjson result; a nil result stands for a
// missing value.
type jsonValue struct {
result *gjson.Result // parsed value, nil when the value is absent.
// When path is non-nil, Get returns result only for that exact path.
// NOTE(review): nothing visible in this file sets path - confirm whether
// it is still used.
path *string
}
// String formats the wrapped result with %v, or returns "<nil>" when there
// is no result.
func (v *jsonValue) String() string {
	if v.result != nil {
		return fmt.Sprintf("%v", *v.result)
	}

	return "<nil>"
}
// IsNull reports whether the value is absent or is a JSON null.
func (v *jsonValue) IsNull() bool {
	if v.result == nil {
		return true
	}

	return v.result.Type == gjson.Null
}
// Get returns the value found at path. It never returns nil: a missing
// value is returned as a jsonValue without a result.
//
// When v carries an explicit path, v's own result is returned only if path
// equals it; otherwise path is looked up in the wrapped result.
func (v *jsonValue) Get(path string) *jsonValue {
	switch {
	case v.path != nil:
		if *v.path != path {
			return resultToJSONValue(nil)
		}
		return resultToJSONValue(v.result)

	case v.result == nil:
		return resultToJSONValue(nil)
	}

	if child := v.result.Get(path); child.Exists() {
		return resultToJSONValue(&child)
	}

	return resultToJSONValue(nil)
}
// GetValue converts the wrapped result to a value of the given parquet type
// and converted type. An absent value yields (nil, nil).
func (v *jsonValue) GetValue(parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) {
	if v.result != nil {
		return resultToParquetValue(*v.result, parquetType, convertedType)
	}

	return nil, nil
}
// GetArray returns the wrapped result as a slice of results. An absent
// value yields (nil, nil).
func (v *jsonValue) GetArray() ([]gjson.Result, error) {
	if v.result != nil {
		return resultToArray(*v.result)
	}

	return nil, nil
}
// Range calls iterator for the entries of the wrapped JSON object via
// ForEach. An absent or null value is treated as empty and returns nil
// without calling iterator; any other non-object value returns an error.
func (v *jsonValue) Range(iterator func(key, value gjson.Result) bool) error {
	res := v.result
	if res == nil || res.Type == gjson.Null {
		return nil
	}

	if res.Type == gjson.JSON && res.IsObject() {
		res.ForEach(iterator)
		return nil
	}

	return fmt.Errorf("result is not Map but %v", res.Type)
}
// resultToJSONValue wraps result (which may be nil) in a jsonValue without
// an explicit path.
func resultToJSONValue(result *gjson.Result) *jsonValue {
	wrapped := new(jsonValue)
	wrapped.result = result
	return wrapped
}
// bytesToJSONValue validates data as JSON and wraps the parsed document in
// a jsonValue. It returns an error when data is not valid JSON.
func bytesToJSONValue(data []byte) (*jsonValue, error) {
	if gjson.ValidBytes(data) {
		parsed := gjson.ParseBytes(data)
		return resultToJSONValue(&parsed), nil
	}

	return nil, fmt.Errorf("invalid JSON data")
}

View File

@@ -0,0 +1,360 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data
import (
"fmt"
"math"
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
"github.com/tidwall/gjson"
)
// resultToBool returns the result as a bool; it fails unless the result is
// a JSON true or false.
func resultToBool(result gjson.Result) (value interface{}, err error) {
	if result.Type == gjson.False || result.Type == gjson.True {
		return result.Bool(), nil
	}

	return nil, fmt.Errorf("result is not Bool but %v", result.Type)
}
// resultToInt32 returns the result as an int32; it fails when the result is
// not a number or does not fit into 32 bits.
func resultToInt32(result gjson.Result) (value interface{}, err error) {
	wide, err := resultToInt64(result)
	if err != nil {
		return nil, err
	}

	n := wide.(int64)
	if n < math.MinInt32 || n > math.MaxInt32 {
		return nil, fmt.Errorf("int32 overflow")
	}

	return int32(n), nil
}
// resultToInt64 returns the result as an int64; it fails unless the result
// is a JSON number.
func resultToInt64(result gjson.Result) (value interface{}, err error) {
	if result.Type != gjson.Number {
		return nil, fmt.Errorf("result is not Number but %v", result.Type)
	}

	return result.Int(), nil
}
// resultToFloat returns the result as a float32; it fails unless the result
// is a JSON number.
func resultToFloat(result gjson.Result) (value interface{}, err error) {
	if result.Type != gjson.Number {
		return nil, fmt.Errorf("result is not float32 but %v", result.Type)
	}

	return float32(result.Float()), nil
}
// resultToDouble returns the result as a float64; it fails unless the
// result is a JSON number.
func resultToDouble(result gjson.Result) (value interface{}, err error) {
	if result.Type != gjson.Number {
		return nil, fmt.Errorf("result is not float64 but %v", result.Type)
	}

	return result.Float(), nil
}
// resultToBytes returns the result as a []byte; the result must be a JSON
// array whose entries are all numbers no larger than 255. An empty array
// yields an empty, non-nil slice.
func resultToBytes(result gjson.Result) (interface{}, error) {
	isArray := result.Type == gjson.JSON && result.IsArray()
	if !isArray {
		return nil, fmt.Errorf("result is not byte array but %v", result.Type)
	}

	items := result.Array()
	data := make([]byte, 0, len(items))
	for i := range items {
		item := items[i]
		if item.Type != gjson.Number {
			return nil, fmt.Errorf("result[%v] is not byte but %v", i, item.Type)
		}

		n := item.Uint()
		if n > math.MaxUint8 {
			return nil, fmt.Errorf("byte overflow in result[%v]", i)
		}

		data = append(data, byte(n))
	}

	return data, nil
}
// resultToString returns result.String() when result is a JSON string.
// Any other JSON type yields an error that names the type found.
func resultToString(result gjson.Result) (value interface{}, err error) {
	if result.Type != gjson.String {
		return nil, fmt.Errorf("result is not String but %v", result.Type)
	}

	return result.String(), nil
}
// resultToUint8 reads result through resultToUint64 and narrows the outcome to
// uint8. It returns an error when the underlying conversion fails or when the
// number is larger than math.MaxUint8.
func resultToUint8(result gjson.Result) (value interface{}, err error) {
	parsed, err := resultToUint64(result)
	if err != nil {
		return nil, err
	}

	if u := parsed.(uint64); u <= math.MaxUint8 {
		return uint8(u), nil
	}

	return nil, fmt.Errorf("uint8 overflow")
}
// resultToUint16 reads result through resultToUint64 and narrows the outcome
// to uint16. It returns an error when the underlying conversion fails or when
// the number is larger than math.MaxUint16.
func resultToUint16(result gjson.Result) (value interface{}, err error) {
	parsed, err := resultToUint64(result)
	if err != nil {
		return nil, err
	}

	u := parsed.(uint64)
	if u > math.MaxUint16 {
		return nil, fmt.Errorf("uint16 overflow")
	}

	return uint16(u), nil
}
// resultToUint32 reads result through resultToUint64 and narrows the outcome
// to uint32. It returns an error when the underlying conversion fails or when
// the number is larger than math.MaxUint32.
func resultToUint32(result gjson.Result) (value interface{}, err error) {
	parsed, err := resultToUint64(result)
	if err != nil {
		return nil, err
	}

	if u := parsed.(uint64); u <= math.MaxUint32 {
		return uint32(u), nil
	}

	return nil, fmt.Errorf("uint32 overflow")
}
// resultToUint64 returns result.Uint() as a uint64 when result is a
// non-negative JSON number.
//
// It returns an error when result is not a JSON number, or when the number is
// negative. Negative values are rejected here because converting a negative
// float to an unsigned integer is implementation-dependent in Go, so the value
// produced by result.Uint() for such input cannot be relied upon.
func resultToUint64(result gjson.Result) (value interface{}, err error) {
	if result.Type != gjson.Number {
		return nil, fmt.Errorf("result is not Number but %v", result.Type)
	}

	if result.Num < 0 {
		return nil, fmt.Errorf("negative number %v cannot be converted to unsigned integer", result.Num)
	}

	return result.Uint(), nil
}
// resultToInt8 reads result through resultToInt64 and narrows the outcome to
// int8. It returns an error when the underlying conversion fails or when the
// number lies outside [math.MinInt8, math.MaxInt8].
func resultToInt8(result gjson.Result) (value interface{}, err error) {
	wide, err := resultToInt64(result)
	if err != nil {
		return nil, err
	}

	n := wide.(int64)
	if n < math.MinInt8 || n > math.MaxInt8 {
		return nil, fmt.Errorf("int8 overflow")
	}

	return int8(n), nil
}
// resultToInt16 reads result through resultToInt64 and narrows the outcome to
// int16. It returns an error when the underlying conversion fails or when the
// number lies outside [math.MinInt16, math.MaxInt16].
func resultToInt16(result gjson.Result) (value interface{}, err error) {
	wide, err := resultToInt64(result)
	if err != nil {
		return nil, err
	}

	if n := wide.(int64); n >= math.MinInt16 && n <= math.MaxInt16 {
		return int16(n), nil
	}

	return nil, fmt.Errorf("int16 overflow")
}
// stringToParquetValue converts value, which must hold a string, into a []byte
// for the INT96, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY parquet types. Any other
// parquet type yields an error.
func stringToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	isBytesType := parquetType == parquet.Type_INT96 ||
		parquetType == parquet.Type_BYTE_ARRAY ||
		parquetType == parquet.Type_FIXED_LEN_BYTE_ARRAY
	if !isBytesType {
		return nil, fmt.Errorf("string cannot be converted to parquet type %v", parquetType)
	}

	// The type assertion is done only after the parquet type is accepted.
	return []byte(value.(string)), nil
}
// uint8ToParquetValue converts value, which must hold a uint8, to int32 or
// int64 for the INT32 and INT64 parquet types respectively. Any other parquet
// type yields an error.
func uint8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	if parquetType == parquet.Type_INT32 {
		return int32(value.(uint8)), nil
	}

	if parquetType == parquet.Type_INT64 {
		return int64(value.(uint8)), nil
	}

	return nil, fmt.Errorf("uint8 cannot be converted to parquet type %v", parquetType)
}
// uint16ToParquetValue converts value, which must hold a uint16, to int32 or
// int64 for the INT32 and INT64 parquet types respectively. Any other parquet
// type yields an error.
func uint16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	switch {
	case parquetType == parquet.Type_INT32:
		narrow := value.(uint16)
		return int32(narrow), nil
	case parquetType == parquet.Type_INT64:
		narrow := value.(uint16)
		return int64(narrow), nil
	default:
		return nil, fmt.Errorf("uint16 cannot be converted to parquet type %v", parquetType)
	}
}
// uint32ToParquetValue converts value, which must hold a uint32, to int32 or
// int64 for the INT32 and INT64 parquet types respectively. Any other parquet
// type yields an error.
func uint32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	if parquetType == parquet.Type_INT32 {
		return int32(value.(uint32)), nil
	}

	if parquetType == parquet.Type_INT64 {
		return int64(value.(uint32)), nil
	}

	return nil, fmt.Errorf("uint32 cannot be converted to parquet type %v", parquetType)
}
// uint64ToParquetValue converts value, which must hold a uint64, to the Go
// representation of the given parquet physical type.
//
// For INT64 the value is converted directly to int64. For INT32 the value is
// converted to int32 only when it fits in 32 bits (at most math.MaxUint32);
// larger values return an error instead of silently losing their high bits.
// Any other parquet type yields an error.
func uint64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	switch parquetType {
	case parquet.Type_INT32:
		v := value.(uint64)
		// Anything above 32 bits cannot be stored in an int32 without
		// dropping data, so report it rather than truncate.
		if v > math.MaxUint32 {
			return nil, fmt.Errorf("uint64 value %v overflows parquet type %v", v, parquetType)
		}
		return int32(v), nil
	case parquet.Type_INT64:
		return int64(value.(uint64)), nil
	}

	return nil, fmt.Errorf("uint64 cannot be converted to parquet type %v", parquetType)
}
// int8ToParquetValue converts value, which must hold an int8, to int32 or
// int64 for the INT32 and INT64 parquet types respectively. Any other parquet
// type yields an error.
func int8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	switch {
	case parquetType == parquet.Type_INT32:
		small := value.(int8)
		return int32(small), nil
	case parquetType == parquet.Type_INT64:
		small := value.(int8)
		return int64(small), nil
	default:
		return nil, fmt.Errorf("int8 cannot be converted to parquet type %v", parquetType)
	}
}
// int16ToParquetValue converts value, which must hold an int16, to int32 or
// int64 for the INT32 and INT64 parquet types respectively. Any other parquet
// type yields an error.
func int16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	if parquetType == parquet.Type_INT32 {
		return int32(value.(int16)), nil
	}

	if parquetType == parquet.Type_INT64 {
		return int64(value.(int16)), nil
	}

	return nil, fmt.Errorf("int16 cannot be converted to parquet type %v", parquetType)
}
// int32ToParquetValue converts value, which must hold an int32, for the INT32
// parquet type (returned unchanged) or the INT64 parquet type (widened to
// int64). Any other parquet type yields an error.
func int32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	switch {
	case parquetType == parquet.Type_INT32:
		return value.(int32), nil
	case parquetType == parquet.Type_INT64:
		same := value.(int32)
		return int64(same), nil
	default:
		return nil, fmt.Errorf("int32 cannot be converted to parquet type %v", parquetType)
	}
}
// int64ToParquetValue converts value, which must hold an int64, to the Go
// representation of the given parquet physical type.
//
// For INT64 the value is returned unchanged. For INT32 the value is narrowed
// to int32 only when it lies within [math.MinInt32, math.MaxInt32]; values
// outside that range return an error instead of being silently truncated.
// Any other parquet type yields an error.
func int64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
	switch parquetType {
	case parquet.Type_INT32:
		v := value.(int64)
		// A plain int32(v) conversion would wrap around for out-of-range
		// input and store a different number than the caller supplied.
		if v < math.MinInt32 || v > math.MaxInt32 {
			return nil, fmt.Errorf("int64 value %v overflows parquet type %v", v, parquetType)
		}
		return int32(v), nil
	case parquet.Type_INT64:
		return value.(int64), nil
	}

	return nil, fmt.Errorf("int64 cannot be converted to parquet type %v", parquetType)
}
// resultToParquetValueByConvertedValue converts result into a Go value for the
// given parquet physical type, interpreting the JSON according to
// convertedType.
//
// A JSON null yields (nil, nil) before convertedType is examined. For the
// supported converted types (UTF8, UINT_8/16/32/64, INT_8/16/32/64) the result
// is first decoded with the matching resultToXxx helper and then handed to the
// matching xxxToParquetValue helper together with parquetType. Any other
// converted type yields an error.
func resultToParquetValueByConvertedValue(result gjson.Result, convertedType parquet.ConvertedType, parquetType parquet.Type) (value interface{}, err error) {
	if result.Type == gjson.Null {
		return nil, nil
	}

	// decode reads the JSON result; encode maps the decoded value onto the
	// parquet physical type. Both are chosen together per converted type.
	var (
		decode func(gjson.Result) (interface{}, error)
		encode func(interface{}, parquet.Type) (interface{}, error)
	)

	switch convertedType {
	case parquet.ConvertedType_UTF8:
		decode, encode = resultToString, stringToParquetValue
	case parquet.ConvertedType_UINT_8:
		decode, encode = resultToUint8, uint8ToParquetValue
	case parquet.ConvertedType_UINT_16:
		decode, encode = resultToUint16, uint16ToParquetValue
	case parquet.ConvertedType_UINT_32:
		decode, encode = resultToUint32, uint32ToParquetValue
	case parquet.ConvertedType_UINT_64:
		decode, encode = resultToUint64, uint64ToParquetValue
	case parquet.ConvertedType_INT_8:
		decode, encode = resultToInt8, int8ToParquetValue
	case parquet.ConvertedType_INT_16:
		decode, encode = resultToInt16, int16ToParquetValue
	case parquet.ConvertedType_INT_32:
		decode, encode = resultToInt32, int32ToParquetValue
	case parquet.ConvertedType_INT_64:
		decode, encode = resultToInt64, int64ToParquetValue
	default:
		return nil, fmt.Errorf("unsupported converted type %v", convertedType)
	}

	decoded, err := decode(result)
	if err != nil {
		return nil, err
	}

	return encode(decoded, parquetType)
}
// resultToParquetValue converts result into a Go value for the given parquet
// physical type.
//
// When convertedType is non-nil the work is delegated to
// resultToParquetValueByConvertedValue. Otherwise a JSON null yields
// (nil, nil), and the remaining cases dispatch on parquetType to the matching
// resultToXxx helper. An unrecognized parquet type yields an error.
func resultToParquetValue(result gjson.Result, parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) {
	switch {
	case convertedType != nil:
		return resultToParquetValueByConvertedValue(result, *convertedType, parquetType)
	case result.Type == gjson.Null:
		return nil, nil
	case parquetType == parquet.Type_BOOLEAN:
		return resultToBool(result)
	case parquetType == parquet.Type_INT32:
		return resultToInt32(result)
	case parquetType == parquet.Type_INT64:
		return resultToInt64(result)
	case parquetType == parquet.Type_FLOAT:
		return resultToFloat(result)
	case parquetType == parquet.Type_DOUBLE:
		return resultToDouble(result)
	case parquetType == parquet.Type_INT96,
		parquetType == parquet.Type_BYTE_ARRAY,
		parquetType == parquet.Type_FIXED_LEN_BYTE_ARRAY:
		return resultToBytes(result)
	}

	return nil, fmt.Errorf("unknown parquet type %v", parquetType)
}
// resultToArray returns the elements of result when it is a JSON array.
// A JSON null yields (nil, nil); any other JSON type yields an error that
// names the type found.
func resultToArray(result gjson.Result) ([]gjson.Result, error) {
	switch {
	case result.Type == gjson.Null:
		return nil, nil
	case result.Type == gjson.JSON && result.IsArray():
		return result.Array(), nil
	}

	return nil, fmt.Errorf("result is not Array but %v", result.Type)
}