mirror of
https://github.com/minio/minio.git
synced 2025-11-11 06:20:14 -05:00
Add archived parquet as int. package (#9912)
Since github.com/minio/parquet-go is archived add it as internal package.
This commit is contained in:
126
pkg/s3select/internal/parquet-go/schema/element.go
Normal file
126
pkg/s3select/internal/parquet-go/schema/element.go
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package schema
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// nameRegexp - matches a name made only of ASCII letters, digits and underscores.
var nameRegexp = regexp.MustCompile("^[a-zA-Z0-9_]+$")

// validataPathSegments - checks every path segment against nameRegexp.
// It returns nil when all segments match (including an empty slice); on the
// first mismatch it returns an error that quotes the complete dotted path.
func validataPathSegments(pathSegments []string) error {
	for i := range pathSegments {
		if nameRegexp.MatchString(pathSegments[i]) {
			continue
		}

		// Report the whole path, not just the bad segment, so the caller sees the name it passed in.
		return fmt.Errorf("unsupported name %v", strings.Join(pathSegments, "."))
	}

	return nil
}
|
||||
|
||||
// Element - represents schema element and its children. Any element must have Name and RepetitionType fields set.
|
||||
type Element struct {
|
||||
parquet.SchemaElement
|
||||
numChildren int32
|
||||
Encoding *parquet.Encoding // Optional; defaults is computed.
|
||||
CompressionType *parquet.CompressionCodec // Optional; defaults to SNAPPY.
|
||||
Children *Tree
|
||||
MaxDefinitionLevel int64
|
||||
MaxRepetitionLevel int64
|
||||
PathInTree string
|
||||
PathInSchema string
|
||||
}
|
||||
|
||||
// String - stringify this element.
|
||||
func (element *Element) String() string {
|
||||
var s []string
|
||||
s = append(s, "Name:"+element.Name)
|
||||
s = append(s, "RepetitionType:"+element.RepetitionType.String())
|
||||
if element.Type != nil {
|
||||
s = append(s, "Type:"+element.Type.String())
|
||||
}
|
||||
if element.ConvertedType != nil {
|
||||
s = append(s, "ConvertedType:"+element.ConvertedType.String())
|
||||
}
|
||||
if element.Encoding != nil {
|
||||
s = append(s, "Encoding:"+element.Encoding.String())
|
||||
}
|
||||
if element.CompressionType != nil {
|
||||
s = append(s, "CompressionType:"+element.CompressionType.String())
|
||||
}
|
||||
if element.Children != nil && element.Children.Length() > 0 {
|
||||
s = append(s, "Children:"+element.Children.String())
|
||||
}
|
||||
s = append(s, fmt.Sprintf("MaxDefinitionLevel:%v", element.MaxDefinitionLevel))
|
||||
s = append(s, fmt.Sprintf("MaxRepetitionLevel:%v", element.MaxRepetitionLevel))
|
||||
if element.PathInTree != "" {
|
||||
s = append(s, "PathInTree:"+element.PathInTree)
|
||||
}
|
||||
if element.PathInSchema != "" {
|
||||
s = append(s, "PathInSchema:"+element.PathInSchema)
|
||||
}
|
||||
|
||||
return "{" + strings.Join(s, ", ") + "}"
|
||||
}
|
||||
|
||||
// NewElement - creates new element.
|
||||
func NewElement(name string, repetitionType parquet.FieldRepetitionType,
|
||||
elementType *parquet.Type, convertedType *parquet.ConvertedType,
|
||||
encoding *parquet.Encoding, compressionType *parquet.CompressionCodec,
|
||||
children *Tree) (*Element, error) {
|
||||
|
||||
if !nameRegexp.MatchString(name) {
|
||||
return nil, fmt.Errorf("unsupported name %v", name)
|
||||
}
|
||||
|
||||
switch repetitionType {
|
||||
case parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_REPEATED:
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown repetition type %v", repetitionType)
|
||||
}
|
||||
|
||||
if repetitionType == parquet.FieldRepetitionType_REPEATED && (elementType != nil || convertedType != nil) {
|
||||
return nil, fmt.Errorf("repetition type REPEATED should be used in group element")
|
||||
}
|
||||
|
||||
if children != nil && children.Length() != 0 {
|
||||
if elementType != nil {
|
||||
return nil, fmt.Errorf("type should be nil for group element")
|
||||
}
|
||||
}
|
||||
|
||||
element := Element{
|
||||
Encoding: encoding,
|
||||
CompressionType: compressionType,
|
||||
Children: children,
|
||||
}
|
||||
|
||||
element.Name = name
|
||||
element.RepetitionType = &repetitionType
|
||||
element.Type = elementType
|
||||
element.ConvertedType = convertedType
|
||||
element.NumChildren = &element.numChildren
|
||||
if element.Children != nil {
|
||||
element.numChildren = int32(element.Children.Length())
|
||||
}
|
||||
|
||||
return &element, nil
|
||||
}
|
||||
388
pkg/s3select/internal/parquet-go/schema/tree.go
Normal file
388
pkg/s3select/internal/parquet-go/schema/tree.go
Normal file
@@ -0,0 +1,388 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package schema
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func updateMaxDLRL(schemaMap map[string]*Element, maxDL, maxRL int64) {
|
||||
for _, element := range schemaMap {
|
||||
element.MaxDefinitionLevel = maxDL
|
||||
element.MaxRepetitionLevel = maxRL
|
||||
if *element.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
|
||||
element.MaxDefinitionLevel++
|
||||
if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
|
||||
element.MaxRepetitionLevel++
|
||||
}
|
||||
}
|
||||
|
||||
if element.Children != nil {
|
||||
updateMaxDLRL(element.Children.schemaMap, element.MaxDefinitionLevel, element.MaxRepetitionLevel)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func toParquetSchema(tree *Tree, treePrefix string, schemaPrefix string, schemaList *[]*parquet.SchemaElement, valueElements *[]*Element) (err error) {
|
||||
tree.Range(func(name string, element *Element) bool {
|
||||
pathInTree := name
|
||||
if treePrefix != "" {
|
||||
pathInTree = treePrefix + "." + name
|
||||
}
|
||||
|
||||
if element.Type == nil && element.ConvertedType == nil && element.Children == nil {
|
||||
err = fmt.Errorf("%v: group element must have children", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if element.ConvertedType != nil {
|
||||
switch *element.ConvertedType {
|
||||
case parquet.ConvertedType_LIST:
|
||||
// Supported structure.
|
||||
// <REQUIRED|OPTIONAL> group <name> (LIST) {
|
||||
// REPEATED group list {
|
||||
// <REQUIRED|OPTIONAL> <element-type> element;
|
||||
// }
|
||||
// }
|
||||
|
||||
if element.Type != nil {
|
||||
err = fmt.Errorf("%v: type must be nil for LIST ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if element.Children == nil || element.Children.Length() != 1 {
|
||||
err = fmt.Errorf("%v: children must have one element only for LIST ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
listElement, ok := element.Children.Get("list")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v: missing group element 'list' for LIST ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if listElement.Name != "list" {
|
||||
err = fmt.Errorf("%v.list: name must be 'list'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if *listElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
|
||||
err = fmt.Errorf("%v.list: repetition type must be REPEATED type", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if listElement.Type != nil || listElement.ConvertedType != nil {
|
||||
err = fmt.Errorf("%v.list: type and converted type must be nil", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if listElement.Children == nil || listElement.Children.Length() != 1 {
|
||||
err = fmt.Errorf("%v.list.element: not found", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
valueElement, ok := listElement.Children.Get("element")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v.list.element: not found", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if valueElement.Name != "element" {
|
||||
err = fmt.Errorf("%v.list.element: name must be 'element'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
case parquet.ConvertedType_MAP:
|
||||
// Supported structure:
|
||||
// <REQUIRED|OPTIONAL> group <name> (MAP) {
|
||||
// REPEATED group key_value {
|
||||
// REQUIRED <key-type> key;
|
||||
// <REQUIRED|OPTIONAL> <value-type> value;
|
||||
// }
|
||||
// }
|
||||
|
||||
if element.Type != nil {
|
||||
err = fmt.Errorf("%v: type must be nil for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if element.Children == nil || element.Children.Length() != 1 {
|
||||
err = fmt.Errorf("%v: children must have one element only for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
keyValueElement, ok := element.Children.Get("key_value")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v: missing group element 'key_value' for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyValueElement.Name != "key_value" {
|
||||
err = fmt.Errorf("%v.key_value: name must be 'key_value'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if *keyValueElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
|
||||
err = fmt.Errorf("%v.key_value: repetition type must be REPEATED type", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyValueElement.Children == nil || keyValueElement.Children.Length() < 1 || keyValueElement.Children.Length() > 2 {
|
||||
err = fmt.Errorf("%v.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
keyElement, ok := keyValueElement.Children.Get("key")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v.key_value: missing 'key' element for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyElement.Name != "key" {
|
||||
err = fmt.Errorf("%v.key_value.key: name must be 'key'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if *keyElement.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
|
||||
err = fmt.Errorf("%v.key_value: repetition type must be REQUIRED type", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyValueElement.Children.Length() == 2 {
|
||||
valueElement, ok := keyValueElement.Children.Get("value")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v.key_value: second element must be 'value' element for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if valueElement.Name != "value" {
|
||||
err = fmt.Errorf("%v.key_value.value: name must be 'value'", pathInTree)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_UINT_8, parquet.ConvertedType_UINT_16:
|
||||
fallthrough
|
||||
case parquet.ConvertedType_UINT_32, parquet.ConvertedType_UINT_64, parquet.ConvertedType_INT_8:
|
||||
fallthrough
|
||||
case parquet.ConvertedType_INT_16, parquet.ConvertedType_INT_32, parquet.ConvertedType_INT_64:
|
||||
if element.Type == nil {
|
||||
err = fmt.Errorf("%v: ConvertedType %v must have Type value", pathInTree, element.ConvertedType)
|
||||
return false
|
||||
}
|
||||
|
||||
default:
|
||||
err = fmt.Errorf("%v: unsupported ConvertedType %v", pathInTree, element.ConvertedType)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
element.PathInTree = pathInTree
|
||||
element.PathInSchema = element.Name
|
||||
if schemaPrefix != "" {
|
||||
element.PathInSchema = schemaPrefix + "." + element.Name
|
||||
}
|
||||
|
||||
if element.Type != nil {
|
||||
*valueElements = append(*valueElements, element)
|
||||
}
|
||||
|
||||
*schemaList = append(*schemaList, &element.SchemaElement)
|
||||
if element.Children != nil {
|
||||
element.numChildren = int32(element.Children.Length())
|
||||
err = toParquetSchema(element.Children, element.PathInTree, element.PathInSchema, schemaList, valueElements)
|
||||
}
|
||||
|
||||
return (err == nil)
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// Tree - represents tree of schema. Tree preserves order in which elements are added.
|
||||
type Tree struct {
|
||||
schemaMap map[string]*Element
|
||||
keys []string
|
||||
readOnly bool
|
||||
}
|
||||
|
||||
// String - stringify this tree.
|
||||
func (tree *Tree) String() string {
|
||||
var s []string
|
||||
tree.Range(func(name string, element *Element) bool {
|
||||
s = append(s, fmt.Sprintf("%v: %v", name, element))
|
||||
return true
|
||||
})
|
||||
|
||||
return "{" + strings.Join(s, ", ") + "}"
|
||||
}
|
||||
|
||||
// Length - returns length of tree.
|
||||
func (tree *Tree) Length() int {
|
||||
return len(tree.keys)
|
||||
}
|
||||
|
||||
func (tree *Tree) travel(pathSegments []string) (pathSegmentIndex int, pathSegment string, currElement *Element, parentTree *Tree, found bool) {
|
||||
parentTree = tree
|
||||
for pathSegmentIndex, pathSegment = range pathSegments {
|
||||
if tree == nil {
|
||||
found = false
|
||||
break
|
||||
}
|
||||
|
||||
var tmpCurrElement *Element
|
||||
if tmpCurrElement, found = tree.schemaMap[pathSegment]; !found {
|
||||
break
|
||||
}
|
||||
currElement = tmpCurrElement
|
||||
|
||||
parentTree = tree
|
||||
tree = currElement.Children
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// ReadOnly - returns whether this tree is read only or not.
|
||||
func (tree *Tree) ReadOnly() bool {
|
||||
return tree.readOnly
|
||||
}
|
||||
|
||||
// Get - returns the element stored for name.
|
||||
func (tree *Tree) Get(name string) (element *Element, ok bool) {
|
||||
pathSegments := strings.Split(name, ".")
|
||||
for _, pathSegment := range pathSegments {
|
||||
if tree == nil {
|
||||
element = nil
|
||||
ok = false
|
||||
break
|
||||
}
|
||||
|
||||
if element, ok = tree.schemaMap[pathSegment]; !ok {
|
||||
break
|
||||
}
|
||||
|
||||
tree = element.Children
|
||||
}
|
||||
|
||||
return element, ok
|
||||
}
|
||||
|
||||
// Set - adds or sets element to name.
|
||||
func (tree *Tree) Set(name string, element *Element) error {
|
||||
if tree.readOnly {
|
||||
return fmt.Errorf("read only tree")
|
||||
}
|
||||
|
||||
pathSegments := strings.Split(name, ".")
|
||||
if err := validataPathSegments(pathSegments); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
i, pathSegment, currElement, parentTree, found := tree.travel(pathSegments)
|
||||
|
||||
if !found {
|
||||
if i != len(pathSegments)-1 {
|
||||
return fmt.Errorf("parent %v does not exist", strings.Join(pathSegments[:i+1], "."))
|
||||
}
|
||||
|
||||
if currElement == nil {
|
||||
parentTree = tree
|
||||
} else {
|
||||
if currElement.Type != nil {
|
||||
return fmt.Errorf("parent %v is not group element", strings.Join(pathSegments[:i], "."))
|
||||
}
|
||||
|
||||
if currElement.Children == nil {
|
||||
currElement.Children = NewTree()
|
||||
}
|
||||
parentTree = currElement.Children
|
||||
}
|
||||
|
||||
parentTree.keys = append(parentTree.keys, pathSegment)
|
||||
}
|
||||
|
||||
parentTree.schemaMap[pathSegment] = element
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete - deletes name and its element.
|
||||
func (tree *Tree) Delete(name string) {
|
||||
if tree.readOnly {
|
||||
panic(fmt.Errorf("read only tree"))
|
||||
}
|
||||
|
||||
pathSegments := strings.Split(name, ".")
|
||||
|
||||
_, pathSegment, _, parentTree, found := tree.travel(pathSegments)
|
||||
|
||||
if found {
|
||||
for i := range parentTree.keys {
|
||||
if parentTree.keys[i] == pathSegment {
|
||||
copy(parentTree.keys[i:], parentTree.keys[i+1:])
|
||||
parentTree.keys = parentTree.keys[:len(parentTree.keys)-1]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
delete(parentTree.schemaMap, pathSegment)
|
||||
}
|
||||
}
|
||||
|
||||
// Range - calls f sequentially for each name and its element. If f returns false, range stops the iteration.
|
||||
func (tree *Tree) Range(f func(name string, element *Element) bool) {
|
||||
for _, name := range tree.keys {
|
||||
if !f(name, tree.schemaMap[name]) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ToParquetSchema - returns list of parquet SchemaElement and list of elements those stores values.
|
||||
func (tree *Tree) ToParquetSchema() (schemaList []*parquet.SchemaElement, valueElements []*Element, err error) {
|
||||
if tree.readOnly {
|
||||
return nil, nil, fmt.Errorf("read only tree")
|
||||
}
|
||||
|
||||
updateMaxDLRL(tree.schemaMap, 0, 0)
|
||||
|
||||
var schemaElements []*parquet.SchemaElement
|
||||
if err = toParquetSchema(tree, "", "", &schemaElements, &valueElements); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
tree.readOnly = true
|
||||
|
||||
numChildren := int32(len(tree.keys))
|
||||
schemaList = append(schemaList, &parquet.SchemaElement{
|
||||
Name: "schema",
|
||||
RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
|
||||
NumChildren: &numChildren,
|
||||
})
|
||||
schemaList = append(schemaList, schemaElements...)
|
||||
return schemaList, valueElements, nil
|
||||
}
|
||||
|
||||
// NewTree - creates new schema tree.
|
||||
func NewTree() *Tree {
|
||||
return &Tree{
|
||||
schemaMap: make(map[string]*Element),
|
||||
}
|
||||
}
|
||||
1092
pkg/s3select/internal/parquet-go/schema/tree_test.go
Normal file
1092
pkg/s3select/internal/parquet-go/schema/tree_test.go
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user