mirror of
https://github.com/minio/minio.git
synced 2024-12-24 22:25:54 -05:00
Add archived parquet as int. package (#9912)
Since github.com/minio/parquet-go is archived, add it as an internal package.
This commit is contained in:
parent
b1705599e1
commit
2d0f65a5e3
5
go.mod
5
go.mod
@ -5,6 +5,7 @@ go 1.13
|
||||
require (
|
||||
cloud.google.com/go v0.39.0
|
||||
contrib.go.opencensus.io/exporter/ocagent v0.5.0 // indirect
|
||||
git.apache.org/thrift.git v0.13.0
|
||||
github.com/Azure/azure-pipeline-go v0.2.1
|
||||
github.com/Azure/azure-storage-blob-go v0.8.0
|
||||
github.com/Azure/go-autorest v11.7.1+incompatible // indirect
|
||||
@ -69,7 +70,6 @@ require (
|
||||
github.com/minio/cli v1.22.0
|
||||
github.com/minio/highwayhash v1.0.0
|
||||
github.com/minio/minio-go/v6 v6.0.58-0.20200612001654-a57fec8037ec
|
||||
github.com/minio/parquet-go v0.0.0-20200414234858-838cfa8aae61
|
||||
github.com/minio/sha256-simd v0.1.1
|
||||
github.com/minio/simdjson-go v0.1.5-0.20200303142138-b17fe061ea37
|
||||
github.com/minio/sio v0.2.0
|
||||
@ -90,6 +90,7 @@ require (
|
||||
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect
|
||||
github.com/nsqio/go-nsq v1.0.7
|
||||
github.com/philhofer/fwd v1.0.0 // indirect
|
||||
github.com/pierrec/lz4 v2.4.0+incompatible
|
||||
github.com/pkg/errors v0.8.1
|
||||
github.com/prometheus/client_golang v0.9.3
|
||||
github.com/rcrowley/go-metrics v0.0.0-20190704165056-9c2d0518ed81 // indirect
|
||||
@ -103,6 +104,8 @@ require (
|
||||
github.com/soheilhy/cmux v0.1.4 // indirect
|
||||
github.com/spaolacci/murmur3 v1.1.0 // indirect
|
||||
github.com/streadway/amqp v0.0.0-20190404075320-75d898a42a94
|
||||
github.com/tidwall/gjson v1.3.5
|
||||
github.com/tidwall/sjson v1.0.4
|
||||
github.com/tinylib/msgp v1.1.1
|
||||
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5 // indirect
|
||||
github.com/ugorji/go v1.1.5-pre // indirect
|
||||
|
202
pkg/s3select/internal/parquet-go/LICENSE
Normal file
202
pkg/s3select/internal/parquet-go/LICENSE
Normal file
@ -0,0 +1,202 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
36
pkg/s3select/internal/parquet-go/Makefile
Normal file
36
pkg/s3select/internal/parquet-go/Makefile
Normal file
@ -0,0 +1,36 @@
|
||||
# Go workspace path; lint/analysis tool binaries are expected under $(GOPATH)/bin.
GOPATH := $(shell go env GOPATH)

# Default target: run every static check plus the unit tests.
all: check

# Install each analysis tool only if its binary is not already present.
getdeps:
	@if [ ! -f ${GOPATH}/bin/golint ]; then echo "Installing golint" && go get -u golang.org/x/lint/golint; fi
	@if [ ! -f ${GOPATH}/bin/gocyclo ]; then echo "Installing gocyclo" && go get -u github.com/fzipp/gocyclo; fi
	@if [ ! -f ${GOPATH}/bin/misspell ]; then echo "Installing misspell" && go get -u github.com/client9/misspell/cmd/misspell; fi
	@if [ ! -f ${GOPATH}/bin/ineffassign ]; then echo "Installing ineffassign" && go get -u github.com/gordonklaus/ineffassign; fi

# Run go vet over the package sources.
vet:
	@echo "Running $@"
	@go vet *.go

# Print gofmt diffs for unformatted files (does not rewrite them).
fmt:
	@echo "Running $@"
	@gofmt -d *.go

# Run golint; -set_exit_status makes lint findings fail the build.
lint:
	@echo "Running $@"
	@${GOPATH}/bin/golint -set_exit_status

# Report functions whose cyclomatic complexity exceeds 200.
cyclo:
	@echo "Running $@"
	@${GOPATH}/bin/gocyclo -over 200 .

# Check US-English spelling in Go sources and the README.
spelling:
	@${GOPATH}/bin/misspell -locale US -error *.go README.md

# Detect ineffectual assignments.
ineffassign:
	@echo "Running $@"
	@${GOPATH}/bin/ineffassign .

# Run all checks above, then the unit tests.
check: getdeps vet fmt lint cyclo spelling ineffassign
	@echo "Running unit tests"
	@go test -tags kqueue ./...
|
3
pkg/s3select/internal/parquet-go/README.md
Normal file
3
pkg/s3select/internal/parquet-go/README.md
Normal file
@ -0,0 +1,3 @@
|
||||
# parquet-go
|
||||
|
||||
Modified version of https://github.com/xitongsys/parquet-go
|
154
pkg/s3select/internal/parquet-go/column.go
Normal file
154
pkg/s3select/internal/parquet-go/column.go
Normal file
@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio-go/v6/pkg/set"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// getColumns builds a map of column name to *column for the columns of
// rowGroup selected by columnNames (a nil columnNames selects all columns).
// Each column gets its own reader, obtained from getReaderFunc at the
// column chunk's offset, wrapped in a buffered thrift transport.
//
// The returned map is nil when no column chunk matched the selection.
func getColumns(
	rowGroup *parquet.RowGroup,
	columnNames set.StringSet,
	schemaElements []*parquet.SchemaElement,
	getReaderFunc GetReaderFunc,
) (nameColumnMap map[string]*column, err error) {
	nameIndexMap := make(map[string]int)
	for colIndex, columnChunk := range rowGroup.GetColumns() {
		meta := columnChunk.GetMetaData()
		// Column name is the dotted path of the column in the schema.
		columnName := strings.Join(meta.GetPathInSchema(), ".")
		if columnNames != nil && !columnNames.Contains(columnName) {
			continue
		}

		// Ignore column spanning into another file.
		if columnChunk.GetFilePath() != "" {
			continue
		}

		// When a dictionary page exists it precedes the data pages, so
		// reading must start at the dictionary page offset instead.
		offset := meta.GetDataPageOffset()
		if meta.DictionaryPageOffset != nil {
			offset = meta.GetDictionaryPageOffset()
		}

		size := meta.GetTotalCompressedSize()

		rc, err := getReaderFunc(offset, size)
		if err != nil {
			return nil, err
		}

		// Buffer the whole compressed chunk size for thrift decoding.
		thriftReader := thrift.NewTBufferedTransport(thrift.NewStreamTransportR(rc), int(size))

		if nameColumnMap == nil {
			nameColumnMap = make(map[string]*column)
		}

		nameColumnMap[columnName] = &column{
			name:           columnName,
			metadata:       meta,
			schemaElements: schemaElements,
			rc:             rc,
			thriftReader:   thriftReader,
			valueType:      meta.GetType(),
		}

		// First element of []*parquet.SchemaElement from parquet file metadata is 'schema'
		// which is always skipped, hence colIndex + 1 is valid.
		nameIndexMap[columnName] = colIndex + 1
	}

	// All columns share the same name-to-index map.
	for name := range nameColumnMap {
		nameColumnMap[name].nameIndexMap = nameIndexMap
	}

	return nameColumnMap, nil
}
|
||||
|
||||
// column holds the state needed to read one parquet column chunk
// incrementally: its metadata, the reader positioned at the chunk, and
// the values decoded so far.
type column struct {
	name           string                   // dotted column name in the schema
	endOfValues    bool                     // set once readPage fails; no more values
	valueIndex     int                      // next index to return from dataTable.Values
	valueType      parquet.Type             // physical type from column metadata
	metadata       *parquet.ColumnMetaData  // chunk metadata (offsets, sizes, codec)
	schemaElements []*parquet.SchemaElement // full file schema, shared across columns
	nameIndexMap   map[string]int           // column name -> schema element index, shared
	dictPage       *page                    // dictionary page, if the chunk has one
	dataTable      *table                   // decoded values of the current page(s)
	rc             io.ReadCloser            // underlying reader; closed by close()
	thriftReader   *thrift.TBufferedTransport
}
|
||||
|
||||
func (column *column) close() (err error) {
|
||||
if column.rc != nil {
|
||||
err = column.rc.Close()
|
||||
column.rc = nil
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// readPage reads and decodes the next page of this column chunk, merging its
// values into column.dataTable. A read error ends the value stream by setting
// endOfValues instead of propagating the error.
func (column *column) readPage() {
	page, _, _, err := readPage(
		column.thriftReader,
		column.metadata,
		column.nameIndexMap,
		column.schemaElements,
	)

	if err != nil {
		// NOTE(review): any error (presumably including EOF at the end of the
		// chunk — confirm against readPage) is mapped to end-of-values.
		column.endOfValues = true
		return
	}

	if page.Header.GetType() == parquet.PageType_DICTIONARY_PAGE {
		// Remember the dictionary page and recurse once to fetch the
		// following data page, which is decoded against it.
		column.dictPage = page
		column.readPage()
		return
	}

	page.decode(column.dictPage)

	if column.dataTable == nil {
		column.dataTable = newTableFromTable(page.DataTable)
	}

	column.dataTable.Merge(page.DataTable)
}
|
||||
|
||||
func (column *column) read() (value interface{}, valueType parquet.Type) {
|
||||
if column.dataTable == nil {
|
||||
column.readPage()
|
||||
column.valueIndex = 0
|
||||
}
|
||||
|
||||
if column.endOfValues {
|
||||
return nil, column.metadata.GetType()
|
||||
}
|
||||
|
||||
value = column.dataTable.Values[column.valueIndex]
|
||||
column.valueIndex++
|
||||
if len(column.dataTable.Values) == column.valueIndex {
|
||||
column.dataTable = nil
|
||||
}
|
||||
|
||||
return value, column.metadata.GetType()
|
||||
}
|
95
pkg/s3select/internal/parquet-go/common.go
Normal file
95
pkg/s3select/internal/parquet-go/common.go
Normal file
@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func valuesToInterfaces(values interface{}, valueType parquet.Type) (tableValues []interface{}) {
|
||||
switch valueType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
for _, v := range values.([]bool) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_INT32:
|
||||
for _, v := range values.([]int32) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_INT64:
|
||||
for _, v := range values.([]int64) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_FLOAT:
|
||||
for _, v := range values.([]float32) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_DOUBLE:
|
||||
for _, v := range values.([]float64) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
for _, v := range values.([][]byte) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
}
|
||||
|
||||
return tableValues
|
||||
}
|
||||
|
||||
func interfacesToValues(values []interface{}, valueType parquet.Type) interface{} {
|
||||
switch valueType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs := make([]bool, len(values))
|
||||
for i := range values {
|
||||
bs[i] = values[i].(bool)
|
||||
}
|
||||
return bs
|
||||
case parquet.Type_INT32:
|
||||
i32s := make([]int32, len(values))
|
||||
for i := range values {
|
||||
i32s[i] = values[i].(int32)
|
||||
}
|
||||
return i32s
|
||||
case parquet.Type_INT64:
|
||||
i64s := make([]int64, len(values))
|
||||
for i := range values {
|
||||
i64s[i] = values[i].(int64)
|
||||
}
|
||||
return i64s
|
||||
case parquet.Type_FLOAT:
|
||||
f32s := make([]float32, len(values))
|
||||
for i := range values {
|
||||
f32s[i] = values[i].(float32)
|
||||
}
|
||||
return f32s
|
||||
case parquet.Type_DOUBLE:
|
||||
f64s := make([]float64, len(values))
|
||||
for i := range values {
|
||||
f64s[i] = values[i].(float64)
|
||||
}
|
||||
return f64s
|
||||
case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
array := make([][]byte, len(values))
|
||||
for i := range values {
|
||||
array[i] = values[i].([]byte)
|
||||
}
|
||||
return array
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
144
pkg/s3select/internal/parquet-go/common/common.go
Normal file
144
pkg/s3select/internal/parquet-go/common/common.go
Normal file
@ -0,0 +1,144 @@
|
||||
package common
|
||||
|
||||
import (
	"bytes"
	"compress/gzip"
	"fmt"
	"io/ioutil"
	"math/bits"

	"github.com/klauspost/compress/snappy"
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
	"github.com/pierrec/lz4"
)
|
||||
|
||||
// ToSliceValue converts values to a slice value.
|
||||
func ToSliceValue(values []interface{}, parquetType parquet.Type) interface{} {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs := make([]bool, len(values))
|
||||
for i := range values {
|
||||
bs[i] = values[i].(bool)
|
||||
}
|
||||
return bs
|
||||
case parquet.Type_INT32:
|
||||
i32s := make([]int32, len(values))
|
||||
for i := range values {
|
||||
i32s[i] = values[i].(int32)
|
||||
}
|
||||
return i32s
|
||||
case parquet.Type_INT64:
|
||||
i64s := make([]int64, len(values))
|
||||
for i := range values {
|
||||
i64s[i] = values[i].(int64)
|
||||
}
|
||||
return i64s
|
||||
case parquet.Type_FLOAT:
|
||||
f32s := make([]float32, len(values))
|
||||
for i := range values {
|
||||
f32s[i] = values[i].(float32)
|
||||
}
|
||||
return f32s
|
||||
case parquet.Type_DOUBLE:
|
||||
f64s := make([]float64, len(values))
|
||||
for i := range values {
|
||||
f64s[i] = values[i].(float64)
|
||||
}
|
||||
return f64s
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
array := make([][]byte, len(values))
|
||||
for i := range values {
|
||||
array[i] = values[i].([]byte)
|
||||
}
|
||||
return array
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// BitWidth returns bits count required to accommodate given value, i.e. the
// position of its highest set bit. BitWidth(0) is 0.
func BitWidth(ui64 uint64) (width int32) {
	// bits.Len64 is the standard-library equivalent of the shift-and-count
	// loop: the minimum number of bits needed to represent ui64.
	return int32(bits.Len64(ui64))
}
|
||||
|
||||
// Compress compresses given data.
|
||||
func Compress(compressionType parquet.CompressionCodec, data []byte) ([]byte, error) {
|
||||
switch compressionType {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return data, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Encode(nil, data), nil
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
buf := new(bytes.Buffer)
|
||||
writer := gzip.NewWriter(buf)
|
||||
n, err := writer.Write(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(data) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buf.Bytes(), nil
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
buf := new(bytes.Buffer)
|
||||
writer := lz4.NewWriter(buf)
|
||||
n, err := writer.Write(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(data) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported compression codec %v", compressionType)
|
||||
}
|
||||
|
||||
// Uncompress uncompresses given data.
|
||||
func Uncompress(compressionType parquet.CompressionCodec, data []byte) ([]byte, error) {
|
||||
switch compressionType {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return data, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Decode(nil, data)
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
reader, err := gzip.NewReader(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer reader.Close()
|
||||
return ioutil.ReadAll(reader)
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
return ioutil.ReadAll(lz4.NewReader(bytes.NewReader(data)))
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported compression codec %v", compressionType)
|
||||
}
|
127
pkg/s3select/internal/parquet-go/compression.go
Normal file
127
pkg/s3select/internal/parquet-go/compression.go
Normal file
@ -0,0 +1,127 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"sync"
|
||||
|
||||
"github.com/klauspost/compress/gzip"
|
||||
"github.com/klauspost/compress/snappy"
|
||||
"github.com/klauspost/compress/zstd"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/pierrec/lz4"
|
||||
)
|
||||
|
||||
type compressionCodec parquet.CompressionCodec
|
||||
|
||||
// Shared zstd encoder/decoder, created lazily by initZstd and guarded by
// zstdOnce.
var zstdOnce sync.Once
var zstdEnc *zstd.Encoder
var zstdDec *zstd.Decoder

// initZstd creates the shared zstd encoder and decoder exactly once.
// NOTE(review): errors from zstd.NewWriter/NewReader are discarded here —
// presumably they cannot fail with these options; confirm against the
// klauspost/compress/zstd documentation.
func initZstd() {
	zstdOnce.Do(func() {
		zstdEnc, _ = zstd.NewWriter(nil, zstd.WithZeroFrames(true))
		zstdDec, _ = zstd.NewReader(nil)
	})
}
|
||||
|
||||
func (c compressionCodec) compress(buf []byte) ([]byte, error) {
|
||||
switch parquet.CompressionCodec(c) {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return buf, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Encode(nil, buf), nil
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
byteBuf := new(bytes.Buffer)
|
||||
writer := gzip.NewWriter(byteBuf)
|
||||
n, err := writer.Write(buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(buf) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return byteBuf.Bytes(), nil
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
byteBuf := new(bytes.Buffer)
|
||||
writer := lz4.NewWriter(byteBuf)
|
||||
n, err := writer.Write(buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(buf) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return byteBuf.Bytes(), nil
|
||||
case parquet.CompressionCodec_ZSTD:
|
||||
initZstd()
|
||||
return zstdEnc.EncodeAll(buf, nil), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("invalid compression codec %v", c)
|
||||
}
|
||||
|
||||
func (c compressionCodec) uncompress(buf []byte) ([]byte, error) {
|
||||
switch parquet.CompressionCodec(c) {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return buf, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Decode(nil, buf)
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
reader, err := gzip.NewReader(bytes.NewReader(buf))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer reader.Close()
|
||||
return ioutil.ReadAll(reader)
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
return ioutil.ReadAll(lz4.NewReader(bytes.NewReader(buf)))
|
||||
|
||||
case parquet.CompressionCodec_ZSTD:
|
||||
initZstd()
|
||||
return zstdDec.DecodeAll(buf, nil)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("invalid compression codec %v", c)
|
||||
}
|
618
pkg/s3select/internal/parquet-go/data/column-grouplist_test.go
Normal file
618
pkg/s3select/internal/parquet-go/data/column-grouplist_test.go
Normal file
@ -0,0 +1,618 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateGroupList(t *testing.T) {
|
||||
requiredList1 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList1.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList2 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList2.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList3 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList3.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList4 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList4.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList1 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList1.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList2 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList2.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList3 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList3.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList4 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList4.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{1, 1},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{2, 2},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result8 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{3, 3},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
result9 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result10 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result11 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{4},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result12 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{4, 4},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredList1, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList1, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{requiredList1, `{"group": [{"col": 10}]}`, result1, false},
|
||||
{requiredList1, `{"group": [{"col": 10}, {"col": 20}]}`, result2, false},
|
||||
{requiredList2, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList2, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList2, `{"group": [{"col": null}]}`, result3, false},
|
||||
{requiredList2, `{"group": [{"col": 10}]}`, result4, false},
|
||||
{requiredList2, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
|
||||
{requiredList3, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList3, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{requiredList3, `{"group": [{"col": 10}]}`, result4, false},
|
||||
{requiredList3, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
|
||||
{requiredList4, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList4, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList4, `{"group": [{"col": null}]}`, result6, false},
|
||||
{requiredList4, `{"group": [{"col": 10}]}`, result7, false},
|
||||
{requiredList4, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
|
||||
{optionalList1, `{}`, result9, false},
|
||||
{optionalList1, `{"group": null}`, result9, false},
|
||||
{optionalList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{optionalList1, `{"group": [{"col": 10}]}`, result4, false},
|
||||
{optionalList1, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
|
||||
{optionalList2, `{}`, result9, false},
|
||||
{optionalList2, `{"group": null}`, result9, false},
|
||||
{optionalList2, `{"group": [{"col": null}]}`, result6, false},
|
||||
{optionalList2, `{"group": [{"col": 10}]}`, result7, false},
|
||||
{optionalList2, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
|
||||
{optionalList3, `{}`, result9, false},
|
||||
{optionalList3, `{"group": null}`, result9, false},
|
||||
{optionalList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{optionalList3, `{"group": [{"col": 10}]}`, result7, false},
|
||||
{optionalList3, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
|
||||
{optionalList4, `{}`, result9, false},
|
||||
{optionalList4, `{"group": null}`, result9, false},
|
||||
{optionalList4, `{"group": [{"col": null}]}`, result10, false},
|
||||
{optionalList4, `{"group": [{"col": 10}]}`, result11, false},
|
||||
{optionalList4, `{"group": [{"col": 10}, {"col": 20}]}`, result12, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
237
pkg/s3select/internal/parquet-go/data/column-grouptype_test.go
Normal file
237
pkg/s3select/internal/parquet-go/data/column-grouptype_test.go
Normal file
@ -0,0 +1,237 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateGroupType(t *testing.T) {
|
||||
requiredGroup1 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredGroup1.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredGroup1.Set("group.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredGroup1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredGroup2 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredGroup2.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredGroup2.Set("group.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredGroup2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalGroup1 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalGroup1.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalGroup1.Set("group.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalGroup1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalGroup2 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalGroup2.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalGroup2.Set("group.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalGroup2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredGroup1, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup1, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field
|
||||
{requiredGroup1, `{"group": {"col": 10}}`, result1, false},
|
||||
{requiredGroup2, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup2, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup2, `{"group": {"col": null}}`, result2, false},
|
||||
{requiredGroup2, `{"group": {"col": 10}}`, result3, false},
|
||||
{optionalGroup1, `{}`, result2, false},
|
||||
{optionalGroup1, `{"group": null}`, result2, false},
|
||||
{optionalGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field
|
||||
{optionalGroup1, `{"group": {"col": 10}}`, result3, false},
|
||||
{optionalGroup2, `{}`, result2, false},
|
||||
{optionalGroup2, `{"group": null}`, result2, false},
|
||||
{optionalGroup2, `{"group": {"col": null}}`, result4, false},
|
||||
{optionalGroup2, `{"group": {"col": 10}}`, result5, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
698
pkg/s3select/internal/parquet-go/data/column-listoflist_test.go
Normal file
698
pkg/s3select/internal/parquet-go/data/column-listoflist_test.go
Normal file
@ -0,0 +1,698 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateListOfList(t *testing.T) {
|
||||
requiredList1 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList1.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList2 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList2.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList3 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList3.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList4 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList4.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList1 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList1.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList2 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList2.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList3 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList3.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList4 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList4.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{2, 2, 2, 2, 2, 2, 2},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{3, 3, 3, 3, 3, 3, 3},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{4},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result8 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{4, 4, 4, 4, 4, 4, 4},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result9 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result10 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{4},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result11 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{5},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result12 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{5, 5, 5, 5, 5, 5, 5},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredList1, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{requiredList1, `{"col": [[10]]}`, result1, false},
|
||||
{requiredList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result2, false},
|
||||
{requiredList2, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": [[null]]}`, result3, false},
|
||||
{requiredList2, `{"col": [[10]]}`, result4, false},
|
||||
{requiredList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
|
||||
{requiredList3, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList3, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{requiredList3, `{"col": [[10]]}`, result4, false},
|
||||
{requiredList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
|
||||
{requiredList4, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList4, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList4, `{"col": [[null]]}`, result6, false},
|
||||
{requiredList4, `{"col": [[10]]}`, result7, false},
|
||||
{requiredList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
|
||||
{optionalList1, `{}`, result9, false},
|
||||
{optionalList1, `{"col": null}`, result9, false},
|
||||
{optionalList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{optionalList1, `{"col": [[10]]}`, result4, false},
|
||||
{optionalList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
|
||||
{optionalList2, `{}`, result9, false},
|
||||
{optionalList2, `{"col": null}`, result9, false},
|
||||
{optionalList2, `{"col": [[null]]}`, result6, false},
|
||||
{optionalList2, `{"col": [[10]]}`, result7, false},
|
||||
{optionalList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
|
||||
{optionalList3, `{}`, result9, false},
|
||||
{optionalList3, `{"col": null}`, result9, false},
|
||||
{optionalList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{optionalList3, `{"col": [[10]]}`, result7, false},
|
||||
{optionalList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
|
||||
{optionalList4, `{}`, result9, false},
|
||||
{optionalList4, `{"col": null}`, result9, false},
|
||||
{optionalList4, `{"col": [[null]]}`, result10, false},
|
||||
{optionalList4, `{"col": [[10]]}`, result11, false},
|
||||
{optionalList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result12, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
370
pkg/s3select/internal/parquet-go/data/column-map_test.go
Normal file
370
pkg/s3select/internal/parquet-go/data/column-map_test.go
Normal file
@ -0,0 +1,370 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateMap(t *testing.T) {
|
||||
t.Skip("Broken")
|
||||
requiredMap1 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredValue, err := schema.NewElement("value", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map.key_value.value", requiredValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredMap1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredMap2 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalValue, err := schema.NewElement("value", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map.key_value.value", optionalValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredMap2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalMap1 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredValue, err := schema.NewElement("value", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map.key_value.value", requiredValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalMap1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalMap2 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalValue, err := schema.NewElement("value", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map.key_value.value", optionalValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalMap2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredMap1, `{}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap1, `{"map": null}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field
|
||||
{requiredMap1, `{"map": {"ten": 10}}`, result1, false},
|
||||
{requiredMap2, `{}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap2, `{"map": null}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap2, `{"map": {"ten": null}}`, result2, false},
|
||||
{requiredMap2, `{"map": {"ten": 10}}`, result3, false},
|
||||
{optionalMap1, `{}`, result4, false},
|
||||
{optionalMap1, `{"map": null}`, result4, false},
|
||||
{optionalMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field
|
||||
{optionalMap1, `{"map": {"ten": 10}}`, result5, false},
|
||||
{optionalMap2, `{}`, result4, false},
|
||||
{optionalMap2, `{"map": null}`, result4, false},
|
||||
{optionalMap2, `{"map": {"ten": null}}`, result6, false},
|
||||
{optionalMap2, `{"map": {"ten": 10}}`, result7, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Errorf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,330 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulatePrimitiveList(t *testing.T) {
|
||||
requiredList1 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList1.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList2 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList2.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList1 := schema.NewTree()
|
||||
{
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList1.Set("col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList2 := schema.NewTree()
|
||||
{
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList2.Set("col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30},
|
||||
definitionLevels: []int64{1, 1, 1},
|
||||
repetitionLevels: []int64{0, 1, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30},
|
||||
definitionLevels: []int64{2, 2, 2},
|
||||
repetitionLevels: []int64{0, 1, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result8 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result9 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30},
|
||||
definitionLevels: []int64{3, 3, 3},
|
||||
repetitionLevels: []int64{0, 1, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredList1, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field
|
||||
{requiredList1, `{"col": [10]}`, result1, false},
|
||||
{requiredList1, `{"col": [10, 20, 30]}`, result2, false},
|
||||
{requiredList2, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": [null]}`, result3, false},
|
||||
{requiredList2, `{"col": [10]}`, result4, false},
|
||||
{requiredList2, `{"col": [10, 20, 30]}`, result5, false},
|
||||
{optionalList1, `{}`, result6, false},
|
||||
{optionalList1, `{"col": null}`, result6, false},
|
||||
{optionalList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field
|
||||
{optionalList1, `{"col": [10]}`, result4, false},
|
||||
{optionalList1, `{"col": [10, 20, 30]}`, result5, false},
|
||||
{optionalList2, `{}`, result6, false},
|
||||
{optionalList2, `{"col": null}`, result6, false},
|
||||
{optionalList2, `{"col": [null]}`, result7, false},
|
||||
{optionalList2, `{"col": [10]}`, result8, false},
|
||||
{optionalList2, `{"col": [10, 20, 30]}`, result9, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulatePrimitiveType(t *testing.T) {
|
||||
requiredField := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredField.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredField.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalField := schema.NewTree()
|
||||
{
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalField.Set("col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalField.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredField, `{}`, nil, true},
|
||||
{requiredField, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredField, `{"col": 10}`, result1, false},
|
||||
{optionalField, `{}`, result2, false},
|
||||
{optionalField, `{"col": null}`, result2, false},
|
||||
{optionalField, `{"col": 10}`, result3, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
680
pkg/s3select/internal/parquet-go/data/column.go
Normal file
680
pkg/s3select/internal/parquet-go/data/column.go
Normal file
@ -0,0 +1,680 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/encoding"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
"github.com/tidwall/gjson"
|
||||
"github.com/tidwall/sjson"
|
||||
)
|
||||
|
||||
func getDefaultEncoding(parquetType parquet.Type) parquet.Encoding {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
return parquet.Encoding_PLAIN
|
||||
case parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE:
|
||||
return parquet.Encoding_RLE_DICTIONARY
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
return parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY
|
||||
}
|
||||
|
||||
return parquet.Encoding_PLAIN
|
||||
}
|
||||
|
||||
func getFirstValueElement(tree *schema.Tree) (valueElement *schema.Element) {
|
||||
tree.Range(func(name string, element *schema.Element) bool {
|
||||
if element.Children == nil {
|
||||
valueElement = element
|
||||
} else {
|
||||
valueElement = getFirstValueElement(element.Children)
|
||||
}
|
||||
|
||||
return false
|
||||
})
|
||||
|
||||
return valueElement
|
||||
}
|
||||
|
||||
func populate(columnDataMap map[string]*Column, input *jsonValue, tree *schema.Tree, firstValueRL int64) (map[string]*Column, error) {
|
||||
var err error
|
||||
|
||||
pos := 0
|
||||
handleElement := func(name string, element *schema.Element) bool {
|
||||
pos++
|
||||
|
||||
dataPath := element.PathInTree
|
||||
|
||||
if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
|
||||
panic(fmt.Errorf("%v: repetition type must be REQUIRED or OPTIONAL type", dataPath))
|
||||
}
|
||||
|
||||
inputValue := input.Get(name)
|
||||
if *element.RepetitionType == parquet.FieldRepetitionType_REQUIRED && inputValue.IsNull() {
|
||||
err = fmt.Errorf("%v: nil value for required field", dataPath)
|
||||
return false
|
||||
}
|
||||
|
||||
add := func(element *schema.Element, value interface{}, DL, RL int64) {
|
||||
columnData := columnDataMap[element.PathInSchema]
|
||||
if columnData == nil {
|
||||
columnData = NewColumn(*element.Type)
|
||||
}
|
||||
columnData.add(value, DL, RL)
|
||||
columnDataMap[element.PathInSchema] = columnData
|
||||
}
|
||||
|
||||
// Handle primitive type element.
|
||||
if element.Type != nil {
|
||||
var value interface{}
|
||||
if value, err = inputValue.GetValue(*element.Type, element.ConvertedType); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
DL := element.MaxDefinitionLevel
|
||||
if value == nil && DL > 0 {
|
||||
DL--
|
||||
}
|
||||
|
||||
RL := element.MaxRepetitionLevel
|
||||
if pos == 1 {
|
||||
RL = firstValueRL
|
||||
}
|
||||
|
||||
add(element, value, DL, RL)
|
||||
return true
|
||||
}
|
||||
|
||||
addNull := func() {
|
||||
valueElement := getFirstValueElement(element.Children)
|
||||
|
||||
DL := element.MaxDefinitionLevel
|
||||
if DL > 0 {
|
||||
DL--
|
||||
}
|
||||
|
||||
RL := element.MaxRepetitionLevel
|
||||
if RL > 0 {
|
||||
RL--
|
||||
}
|
||||
|
||||
add(valueElement, nil, DL, RL)
|
||||
}
|
||||
|
||||
// Handle group type element.
|
||||
if element.ConvertedType == nil {
|
||||
if inputValue.IsNull() {
|
||||
addNull()
|
||||
return true
|
||||
}
|
||||
|
||||
columnDataMap, err = populate(columnDataMap, inputValue, element.Children, firstValueRL)
|
||||
return (err == nil)
|
||||
}
|
||||
|
||||
// Handle list type element.
|
||||
if *element.ConvertedType == parquet.ConvertedType_LIST {
|
||||
if inputValue.IsNull() {
|
||||
addNull()
|
||||
return true
|
||||
}
|
||||
|
||||
var results []gjson.Result
|
||||
if results, err = inputValue.GetArray(); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
listElement, _ := element.Children.Get("list")
|
||||
valueElement, _ := listElement.Children.Get("element")
|
||||
for i := range results {
|
||||
rl := valueElement.MaxRepetitionLevel
|
||||
if i == 0 {
|
||||
rl = firstValueRL
|
||||
}
|
||||
|
||||
var jsonData []byte
|
||||
if jsonData, err = sjson.SetBytes([]byte{}, "element", results[i].Value()); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var jv *jsonValue
|
||||
if jv, err = bytesToJSONValue(jsonData); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if columnDataMap, err = populate(columnDataMap, jv, listElement.Children, rl); err != nil {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
if *element.ConvertedType == parquet.ConvertedType_MAP {
|
||||
if inputValue.IsNull() {
|
||||
addNull()
|
||||
return true
|
||||
}
|
||||
|
||||
keyValueElement, _ := element.Children.Get("key_value")
|
||||
var rerr error
|
||||
err = inputValue.Range(func(key, value gjson.Result) bool {
|
||||
if !key.Exists() || key.Type == gjson.Null {
|
||||
rerr = fmt.Errorf("%v.key_value.key: not found or null", dataPath)
|
||||
return false
|
||||
}
|
||||
|
||||
var jsonData []byte
|
||||
if jsonData, rerr = sjson.SetBytes([]byte{}, "key", key.Value()); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if jsonData, rerr = sjson.SetBytes(jsonData, "value", value.Value()); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var jv *jsonValue
|
||||
if jv, rerr = bytesToJSONValue(jsonData); rerr != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if columnDataMap, rerr = populate(columnDataMap, jv, keyValueElement.Children, firstValueRL); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
err = rerr
|
||||
return (err == nil)
|
||||
}
|
||||
|
||||
err = fmt.Errorf("%v: unsupported converted type %v in %v field type", dataPath, *element.ConvertedType, *element.RepetitionType)
|
||||
return false
|
||||
}
|
||||
|
||||
tree.Range(handleElement)
|
||||
return columnDataMap, err
|
||||
}
|
||||
|
||||
// Column - denotes values of a column.
type Column struct {
	parquetType      parquet.Type  // value type.
	values           []interface{} // must be a slice of parquet typed values.
	definitionLevels []int64       // exactly same length of values.
	repetitionLevels []int64       // exactly same length of values.
	rowCount         int32         // number of rows; incremented on each value with repetition level zero.
	maxBitWidth      int32         // widest bit width observed among non-nil values.
	minValue         interface{}   // smallest non-nil value seen; nil until a value is added.
	maxValue         interface{}   // largest non-nil value seen; nil until a value is added.
}
|
||||
|
||||
func (column *Column) updateMinMaxValue(value interface{}) {
|
||||
if column.minValue == nil && column.maxValue == nil {
|
||||
column.minValue = value
|
||||
column.maxValue = value
|
||||
return
|
||||
}
|
||||
|
||||
switch column.parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
if column.minValue.(bool) && !value.(bool) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if !column.maxValue.(bool) && value.(bool) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_INT32:
|
||||
if column.minValue.(int32) > value.(int32) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(int32) < value.(int32) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_INT64:
|
||||
if column.minValue.(int64) > value.(int64) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(int64) < value.(int64) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_FLOAT:
|
||||
if column.minValue.(float32) > value.(float32) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(float32) < value.(float32) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_DOUBLE:
|
||||
if column.minValue.(float64) > value.(float64) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(float64) < value.(float64) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
if bytes.Compare(column.minValue.([]byte), value.([]byte)) > 0 {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if bytes.Compare(column.minValue.([]byte), value.([]byte)) < 0 {
|
||||
column.maxValue = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (column *Column) updateStats(value interface{}, DL, RL int64) {
|
||||
if RL == 0 {
|
||||
column.rowCount++
|
||||
}
|
||||
|
||||
if value == nil {
|
||||
return
|
||||
}
|
||||
|
||||
var bitWidth int32
|
||||
switch column.parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bitWidth = 1
|
||||
case parquet.Type_INT32:
|
||||
bitWidth = common.BitWidth(uint64(value.(int32)))
|
||||
case parquet.Type_INT64:
|
||||
bitWidth = common.BitWidth(uint64(value.(int64)))
|
||||
case parquet.Type_FLOAT:
|
||||
bitWidth = 32
|
||||
case parquet.Type_DOUBLE:
|
||||
bitWidth = 64
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
bitWidth = int32(len(value.([]byte)))
|
||||
}
|
||||
if column.maxBitWidth < bitWidth {
|
||||
column.maxBitWidth = bitWidth
|
||||
}
|
||||
|
||||
column.updateMinMaxValue(value)
|
||||
}
|
||||
|
||||
func (column *Column) add(value interface{}, DL, RL int64) {
|
||||
column.values = append(column.values, value)
|
||||
column.definitionLevels = append(column.definitionLevels, DL)
|
||||
column.repetitionLevels = append(column.repetitionLevels, RL)
|
||||
column.updateStats(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddNull - adds nil value.
func (column *Column) AddNull(DL, RL int64) {
	// nil is stored as a regular entry so the level slices stay aligned;
	// updateStats skips nil for bit-width and min/max tracking.
	column.add(nil, DL, RL)
}
|
||||
|
||||
// AddBoolean - adds boolean value.
|
||||
func (column *Column) AddBoolean(value bool, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_BOOLEAN {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddInt32 - adds int32 value.
|
||||
func (column *Column) AddInt32(value int32, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_INT32 {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddInt64 - adds int64 value.
|
||||
func (column *Column) AddInt64(value int64, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_INT64 {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddFloat - adds float32 value.
|
||||
func (column *Column) AddFloat(value float32, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_FLOAT {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddDouble - adds float64 value.
|
||||
func (column *Column) AddDouble(value float64, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_DOUBLE {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddByteArray - adds byte array value.
|
||||
func (column *Column) AddByteArray(value []byte, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_BYTE_ARRAY {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// Merge - merges columns.
|
||||
func (column *Column) Merge(column2 *Column) {
|
||||
if column.parquetType != column2.parquetType {
|
||||
panic(fmt.Errorf("merge differs in parquet type"))
|
||||
}
|
||||
|
||||
column.values = append(column.values, column2.values...)
|
||||
column.definitionLevels = append(column.definitionLevels, column2.definitionLevels...)
|
||||
column.repetitionLevels = append(column.repetitionLevels, column2.repetitionLevels...)
|
||||
|
||||
column.rowCount += column2.rowCount
|
||||
if column.maxBitWidth < column2.maxBitWidth {
|
||||
column.maxBitWidth = column2.maxBitWidth
|
||||
}
|
||||
|
||||
column.updateMinMaxValue(column2.minValue)
|
||||
column.updateMinMaxValue(column2.maxValue)
|
||||
}
|
||||
|
||||
func (column *Column) String() string {
|
||||
var strs []string
|
||||
strs = append(strs, fmt.Sprintf("parquetType: %v", column.parquetType))
|
||||
strs = append(strs, fmt.Sprintf("values: %v", column.values))
|
||||
strs = append(strs, fmt.Sprintf("definitionLevels: %v", column.definitionLevels))
|
||||
strs = append(strs, fmt.Sprintf("repetitionLevels: %v", column.repetitionLevels))
|
||||
strs = append(strs, fmt.Sprintf("rowCount: %v", column.rowCount))
|
||||
strs = append(strs, fmt.Sprintf("maxBitWidth: %v", column.maxBitWidth))
|
||||
strs = append(strs, fmt.Sprintf("minValue: %v", column.minValue))
|
||||
strs = append(strs, fmt.Sprintf("maxValue: %v", column.maxValue))
|
||||
return "{" + strings.Join(strs, ", ") + "}"
|
||||
}
|
||||
|
||||
func (column *Column) encodeValue(value interface{}, element *schema.Element) []byte {
|
||||
if value == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
valueData := encoding.PlainEncode(common.ToSliceValue([]interface{}{value}, column.parquetType), column.parquetType)
|
||||
if column.parquetType == parquet.Type_BYTE_ARRAY && element.ConvertedType != nil {
|
||||
switch *element.ConvertedType {
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
|
||||
valueData = valueData[4:]
|
||||
}
|
||||
}
|
||||
|
||||
return valueData
|
||||
}
|
||||
|
||||
func (column *Column) toDataPageV2(element *schema.Element, parquetEncoding parquet.Encoding) *ColumnChunk {
|
||||
var definedValues []interface{}
|
||||
for _, value := range column.values {
|
||||
if value != nil {
|
||||
definedValues = append(definedValues, value)
|
||||
}
|
||||
}
|
||||
|
||||
var encodedData []byte
|
||||
switch parquetEncoding {
|
||||
case parquet.Encoding_PLAIN:
|
||||
encodedData = encoding.PlainEncode(common.ToSliceValue(definedValues, column.parquetType), column.parquetType)
|
||||
|
||||
case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
|
||||
var bytesSlices [][]byte
|
||||
for _, value := range column.values {
|
||||
bytesSlices = append(bytesSlices, value.([]byte))
|
||||
}
|
||||
encodedData = encoding.DeltaLengthByteArrayEncode(bytesSlices)
|
||||
}
|
||||
|
||||
compressionType := parquet.CompressionCodec_SNAPPY
|
||||
if element.CompressionType != nil {
|
||||
compressionType = *element.CompressionType
|
||||
}
|
||||
|
||||
compressedData, err := common.Compress(compressionType, encodedData)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
DLData := encoding.RLEBitPackedHybridEncode(
|
||||
column.definitionLevels,
|
||||
common.BitWidth(uint64(element.MaxDefinitionLevel)),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
|
||||
RLData := encoding.RLEBitPackedHybridEncode(
|
||||
column.repetitionLevels,
|
||||
common.BitWidth(uint64(element.MaxRepetitionLevel)),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
|
||||
pageHeader := parquet.NewPageHeader()
|
||||
pageHeader.Type = parquet.PageType_DATA_PAGE_V2
|
||||
pageHeader.CompressedPageSize = int32(len(compressedData) + len(DLData) + len(RLData))
|
||||
pageHeader.UncompressedPageSize = int32(len(encodedData) + len(DLData) + len(RLData))
|
||||
pageHeader.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
|
||||
pageHeader.DataPageHeaderV2.NumValues = int32(len(column.values))
|
||||
pageHeader.DataPageHeaderV2.NumNulls = int32(len(column.values) - len(definedValues))
|
||||
pageHeader.DataPageHeaderV2.NumRows = column.rowCount
|
||||
pageHeader.DataPageHeaderV2.Encoding = parquetEncoding
|
||||
pageHeader.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(DLData))
|
||||
pageHeader.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(RLData))
|
||||
pageHeader.DataPageHeaderV2.IsCompressed = true
|
||||
pageHeader.DataPageHeaderV2.Statistics = parquet.NewStatistics()
|
||||
pageHeader.DataPageHeaderV2.Statistics.Min = column.encodeValue(column.minValue, element)
|
||||
pageHeader.DataPageHeaderV2.Statistics.Max = column.encodeValue(column.maxValue, element)
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
rawData, err := ts.Write(context.TODO(), pageHeader)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
rawData = append(rawData, RLData...)
|
||||
rawData = append(rawData, DLData...)
|
||||
rawData = append(rawData, compressedData...)
|
||||
|
||||
metadata := parquet.NewColumnMetaData()
|
||||
metadata.Type = column.parquetType
|
||||
metadata.Encodings = []parquet.Encoding{
|
||||
parquet.Encoding_PLAIN,
|
||||
parquet.Encoding_RLE,
|
||||
parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
|
||||
}
|
||||
metadata.Codec = compressionType
|
||||
metadata.NumValues = int64(pageHeader.DataPageHeaderV2.NumValues)
|
||||
metadata.TotalCompressedSize = int64(len(rawData))
|
||||
metadata.TotalUncompressedSize = int64(pageHeader.UncompressedPageSize) + int64(len(rawData)) - int64(pageHeader.CompressedPageSize)
|
||||
metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
|
||||
metadata.Statistics = parquet.NewStatistics()
|
||||
metadata.Statistics.Min = pageHeader.DataPageHeaderV2.Statistics.Min
|
||||
metadata.Statistics.Max = pageHeader.DataPageHeaderV2.Statistics.Max
|
||||
|
||||
chunk := new(ColumnChunk)
|
||||
chunk.ColumnChunk.MetaData = metadata
|
||||
chunk.dataPageLen = int64(len(rawData))
|
||||
chunk.dataLen = int64(len(rawData))
|
||||
chunk.data = rawData
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// toRLEDictPage encodes the column as a dictionary page followed by an
// RLE_DICTIONARY data page and returns both, serialized, in one ColumnChunk.
//
// Layout produced (and reflected in chunk.data):
//   [thrift dict-page header][compressed dictionary values]
//   [thrift data-page header][compressed: RL levels + DL levels + index bit width + dictionary indices]
//
// Any compression or thrift-serialization failure panics, mirroring the
// sibling toDataPageV2.
func (column *Column) toRLEDictPage(element *schema.Element) *ColumnChunk {
	// Split values into a dictionary (unique values) and per-row indices.
	dictPageData, dataPageData, dictValueCount, indexBitWidth := encoding.RLEDictEncode(column.values, column.parquetType, column.maxBitWidth)

	// SNAPPY is the default codec; the schema element may override it.
	compressionType := parquet.CompressionCodec_SNAPPY
	if element.CompressionType != nil {
		compressionType = *element.CompressionType
	}

	compressedData, err := common.Compress(compressionType, dictPageData)
	if err != nil {
		panic(err)
	}

	// Dictionary page header: sizes refer to the dictionary payload only.
	dictPageHeader := parquet.NewPageHeader()
	dictPageHeader.Type = parquet.PageType_DICTIONARY_PAGE
	dictPageHeader.CompressedPageSize = int32(len(compressedData))
	dictPageHeader.UncompressedPageSize = int32(len(dictPageData))
	dictPageHeader.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
	dictPageHeader.DictionaryPageHeader.NumValues = dictValueCount
	dictPageHeader.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN

	// Serialize the dict-page header with thrift compact protocol and
	// append the compressed dictionary payload after it.
	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	dictPageRawData, err := ts.Write(context.TODO(), dictPageHeader)
	if err != nil {
		panic(err)
	}
	dictPageRawData = append(dictPageRawData, compressedData...)

	// Data page payload: repetition levels, then definition levels, then
	// the dictionary index bit width byte, then the encoded indices.
	RLData := encoding.RLEBitPackedHybridEncode(
		column.repetitionLevels,
		common.BitWidth(uint64(element.MaxRepetitionLevel)),
		parquet.Type_INT64,
	)
	encodedData := RLData

	DLData := encoding.RLEBitPackedHybridEncode(
		column.definitionLevels,
		common.BitWidth(uint64(element.MaxDefinitionLevel)),
		parquet.Type_INT64,
	)
	encodedData = append(encodedData, DLData...)

	encodedData = append(encodedData, indexBitWidth)
	encodedData = append(encodedData, dataPageData...)

	// Unlike V2 pages, the levels are compressed together with the data.
	compressedData, err = common.Compress(compressionType, encodedData)
	if err != nil {
		panic(err)
	}

	dataPageHeader := parquet.NewPageHeader()
	dataPageHeader.Type = parquet.PageType_DATA_PAGE
	dataPageHeader.CompressedPageSize = int32(len(compressedData))
	dataPageHeader.UncompressedPageSize = int32(len(encodedData))
	dataPageHeader.DataPageHeader = parquet.NewDataPageHeader()
	dataPageHeader.DataPageHeader.NumValues = int32(len(column.values))
	dataPageHeader.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
	dataPageHeader.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
	dataPageHeader.DataPageHeader.Encoding = parquet.Encoding_RLE_DICTIONARY

	// A fresh serializer for the data-page header (ts buffers internally).
	ts = thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	dataPageRawData, err := ts.Write(context.TODO(), dataPageHeader)
	if err != nil {
		panic(err)
	}
	dataPageRawData = append(dataPageRawData, compressedData...)

	// Column-chunk metadata covering both pages.
	metadata := parquet.NewColumnMetaData()
	metadata.Type = column.parquetType
	metadata.Encodings = []parquet.Encoding{
		parquet.Encoding_PLAIN,
		parquet.Encoding_RLE,
		parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
		parquet.Encoding_RLE_DICTIONARY,
	}
	metadata.Codec = compressionType
	metadata.NumValues = int64(dataPageHeader.DataPageHeader.NumValues)
	metadata.TotalCompressedSize = int64(len(dictPageRawData)) + int64(len(dataPageRawData))
	// NOTE(review): the sibling toDataPageV2 computes this as
	// UncompressedPageSize + len(rawData) - CompressedPageSize (i.e. header
	// overhead uses the full serialized page). Here len(dictPageData) /
	// len(dataPageData) are the pre-serialization payloads, which looks
	// inconsistent — confirm against a reference parquet writer.
	uncompressedSize := int64(dictPageHeader.UncompressedPageSize) + int64(len(dictPageData)) - int64(dictPageHeader.CompressedPageSize)
	uncompressedSize += int64(dataPageHeader.UncompressedPageSize) + int64(len(dataPageData)) - int64(dataPageHeader.CompressedPageSize)
	metadata.TotalUncompressedSize = uncompressedSize
	metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
	metadata.Statistics = parquet.NewStatistics()
	metadata.Statistics.Min = column.encodeValue(column.minValue, element)
	metadata.Statistics.Max = column.encodeValue(column.maxValue, element)

	chunk := new(ColumnChunk)
	chunk.ColumnChunk.MetaData = metadata
	chunk.isDictPage = true
	chunk.dictPageLen = int64(len(dictPageRawData))
	chunk.dataPageLen = int64(len(dataPageRawData))
	chunk.dataLen = chunk.dictPageLen + chunk.dataPageLen
	chunk.data = append(dictPageRawData, dataPageRawData...)

	return chunk
}
|
||||
|
||||
// Encode an element.
|
||||
func (column *Column) Encode(element *schema.Element) *ColumnChunk {
|
||||
parquetEncoding := getDefaultEncoding(column.parquetType)
|
||||
if element.Encoding != nil {
|
||||
parquetEncoding = *element.Encoding
|
||||
}
|
||||
|
||||
switch parquetEncoding {
|
||||
case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
|
||||
return column.toDataPageV2(element, parquetEncoding)
|
||||
}
|
||||
|
||||
return column.toRLEDictPage(element)
|
||||
}
|
||||
|
||||
// NewColumn - creates new column data
|
||||
func NewColumn(parquetType parquet.Type) *Column {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN, parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE, parquet.Type_BYTE_ARRAY:
|
||||
default:
|
||||
panic(fmt.Errorf("unsupported parquet type %v", parquetType))
|
||||
}
|
||||
|
||||
return &Column{
|
||||
parquetType: parquetType,
|
||||
}
|
||||
}
|
||||
|
||||
// UnmarshalJSON - decodes JSON data into map of Column.
|
||||
func UnmarshalJSON(data []byte, tree *schema.Tree) (map[string]*Column, error) {
|
||||
if !tree.ReadOnly() {
|
||||
return nil, fmt.Errorf("tree must be read only")
|
||||
}
|
||||
|
||||
inputValue, err := bytesToJSONValue(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
columnDataMap := make(map[string]*Column)
|
||||
return populate(columnDataMap, inputValue, tree, 0)
|
||||
}
|
369
pkg/s3select/internal/parquet-go/data/column_test.go
Normal file
369
pkg/s3select/internal/parquet-go/data/column_test.go
Normal file
@ -0,0 +1,369 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
// Shared fixtures for the tests in this file: sample INT32 values,
// BYTE_ARRAY values, and phone-number strings used in expected results.
var (
	v10    = int32(10)
	v20    = int32(20)
	v30    = int32(30)
	ten    = []byte("ten")
	foo    = []byte("foo")
	bar    = []byte("bar")
	phone1 = []byte("1-234-567-8901")
	phone2 = []byte("1-234-567-1098")
	phone3 = []byte("1-111-222-3333")
)
|
||||
|
||||
// TestAddressBookExample builds the classic Dremel/Parquet "AddressBook"
// schema and checks that UnmarshalJSON produces the expected columns
// (values plus definition/repetition levels) for a range of documents.
//
// NOTE(review): the test is skipped ("Broken") right at the top, so the
// body below never executes; it is retained for reference.
func TestAddressBookExample(t *testing.T) {
	// message AddressBook {
	//   required string owner;
	//   repeated string ownerPhoneNumbers;
	//   repeated group contacts {
	//     required string name;
	//     optional string phoneNumber;
	//   }
	// }
	t.Skip("Broken")

	addressBook := schema.NewTree()
	{
		// Each LIST field follows the three-level parquet list encoding:
		// <field> (optional) -> list (repeated) -> element.
		owner, err := schema.NewElement("owner", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		ownerPhoneNumbers, err := schema.NewElement("ownerPhoneNumbers", parquet.FieldRepetitionType_OPTIONAL,
			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		ownerPhoneNumbersList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
			nil, nil,
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		ownerPhoneNumbersElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		contacts, err := schema.NewElement("contacts", parquet.FieldRepetitionType_OPTIONAL,
			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		contactsList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
			nil, nil,
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		contactsElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
			nil, nil,
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		contactName, err := schema.NewElement("name", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		contactPhoneNumber, err := schema.NewElement("phoneNumber", parquet.FieldRepetitionType_OPTIONAL,
			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}
		// Wire the elements into the tree by dotted path.
		if err = addressBook.Set("owner", owner); err != nil {
			t.Fatal(err)
		}

		if err = addressBook.Set("ownerPhoneNumbers", ownerPhoneNumbers); err != nil {
			t.Fatal(err)
		}
		if err = addressBook.Set("ownerPhoneNumbers.list", ownerPhoneNumbersList); err != nil {
			t.Fatal(err)
		}
		if err = addressBook.Set("ownerPhoneNumbers.list.element", ownerPhoneNumbersElement); err != nil {
			t.Fatal(err)
		}

		if err = addressBook.Set("contacts", contacts); err != nil {
			t.Fatal(err)
		}
		if err = addressBook.Set("contacts.list", contactsList); err != nil {
			t.Fatal(err)
		}
		if err = addressBook.Set("contacts.list.element", contactsElement); err != nil {
			t.Fatal(err)
		}
		if err = addressBook.Set("contacts.list.element.name", contactName); err != nil {
			t.Fatal(err)
		}
		if err = addressBook.Set("contacts.list.element.phoneNumber", contactPhoneNumber); err != nil {
			t.Fatal(err)
		}
	}

	// Sanity check: the tree converts to a valid parquet schema.
	if _, _, err := addressBook.ToParquetSchema(); err != nil {
		t.Fatal(err)
	}

	// Case 2: only the required field present; missing lists become a
	// single nil value at definition level 0.
	case2Data := `{
"owner": "foo"
}`
	result2 := map[string]*Column{
		"owner": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{foo},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"ownerPhoneNumbers.list.element": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{nil},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"contacts.list.element.name": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{nil},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
	}

	// Case 3: one-element phone list; present leaf has definition level 2.
	case3Data := `{
"owner": "foo",
"ownerPhoneNumbers": [
"1-234-567-8901"
]
}
`
	result3 := map[string]*Column{
		"owner": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{foo},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"ownerPhoneNumbers.list.element": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{phone1},
			definitionLevels: []int64{2},
			repetitionLevels: []int64{0},
		},
		"contacts.list.element.name": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{nil},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
	}

	// Case 4: two-element list; second element repeats at level 1.
	case4Data := `{
"owner": "foo",
"ownerPhoneNumbers": [
"1-234-567-8901",
"1-234-567-1098"
]
}
`
	result4 := map[string]*Column{
		"owner": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{foo},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"ownerPhoneNumbers.list.element": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{phone1, phone2},
			definitionLevels: []int64{2, 2},
			repetitionLevels: []int64{0, 1},
		},
		"contacts.list.element.name": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{nil},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
	}

	// Case 5: contact group with name only; missing optional phoneNumber
	// still produces a nil entry for its column.
	case5Data := `{
"contacts": [
{
"name": "bar"
}
],
"owner": "foo"
}`
	result5 := map[string]*Column{
		"owner": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{foo},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"ownerPhoneNumbers.list.element": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{nil},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"contacts.list.element.name": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{bar},
			definitionLevels: []int64{2},
			repetitionLevels: []int64{0},
		},
		"contacts.list.element.phoneNumber": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{nil},
			definitionLevels: []int64{2},
			repetitionLevels: []int64{1},
		},
	}

	// Case 6: contact with both name and phoneNumber.
	case6Data := `{
"contacts": [
{
"name": "bar",
"phoneNumber": "1-111-222-3333"
}
],
"owner": "foo"
}`
	result6 := map[string]*Column{
		"owner": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{foo},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"ownerPhoneNumbers.list.element": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{nil},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"contacts.list.element.name": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{bar},
			definitionLevels: []int64{2},
			repetitionLevels: []int64{0},
		},
		"contacts.list.element.phoneNumber": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{phone3},
			definitionLevels: []int64{3},
			repetitionLevels: []int64{1},
		},
	}

	// Case 7: every field populated.
	case7Data := `{
"contacts": [
{
"name": "bar",
"phoneNumber": "1-111-222-3333"
}
],
"owner": "foo",
"ownerPhoneNumbers": [
"1-234-567-8901",
"1-234-567-1098"
]
}`
	result7 := map[string]*Column{
		"owner": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{foo},
			definitionLevels: []int64{0},
			repetitionLevels: []int64{0},
		},
		"ownerPhoneNumbers.list.element": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{phone1, phone2},
			definitionLevels: []int64{2, 2},
			repetitionLevels: []int64{0, 1},
		},
		"contacts.list.element.name": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{bar},
			definitionLevels: []int64{2},
			repetitionLevels: []int64{0},
		},
		"contacts.list.element.phoneNumber": {
			parquetType:      parquet.Type_BYTE_ARRAY,
			values:           []interface{}{phone3},
			definitionLevels: []int64{3},
			repetitionLevels: []int64{1},
		},
	}

	testCases := []struct {
		data           string
		expectedResult map[string]*Column
		expectErr      bool
	}{
		{`{}`, nil, true}, // err: owner: nil value for required field
		{case2Data, result2, false},
		{case3Data, result3, false},
		{case4Data, result4, false},
		{case5Data, result5, false},
		{case6Data, result6, false},
		{case7Data, result7, false},
	}

	for i, testCase := range testCases {
		result, err := UnmarshalJSON([]byte(testCase.data), addressBook)
		expectErr := (err != nil)

		if testCase.expectErr != expectErr {
			t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
		}

		if !testCase.expectErr {
			if !reflect.DeepEqual(result, testCase.expectedResult) {
				t.Errorf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
			}
		}
	}
}
|
65
pkg/s3select/internal/parquet-go/data/data.go
Normal file
65
pkg/s3select/internal/parquet-go/data/data.go
Normal file
@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// ColumnChunk wraps parquet.ColumnChunk together with the serialized page
// bytes it describes and their layout within those bytes.
type ColumnChunk struct {
	parquet.ColumnChunk
	isDictPage  bool   // true when data begins with a dictionary page
	dictPageLen int64  // serialized length of the dictionary page (0 if none)
	dataPageLen int64  // serialized length of the data page
	dataLen     int64  // total serialized length (dictPageLen + dataPageLen)
	data        []byte // serialized page header(s) followed by page payloads
}
|
||||
|
||||
// Data returns the serialized page bytes of this chunk.
func (chunk *ColumnChunk) Data() []byte {
	return chunk.data
}
|
||||
|
||||
// DataLen returns the total serialized length of the chunk in bytes.
func (chunk *ColumnChunk) DataLen() int64 {
	return chunk.dataLen
}
|
||||
|
||||
// NewRowGroup creates a new row group.
|
||||
func NewRowGroup(chunks []*ColumnChunk, numRows, offset int64) *parquet.RowGroup {
|
||||
rows := parquet.NewRowGroup()
|
||||
rows.NumRows = numRows
|
||||
|
||||
for _, chunk := range chunks {
|
||||
rows.Columns = append(rows.Columns, &chunk.ColumnChunk)
|
||||
rows.TotalByteSize += chunk.dataLen
|
||||
|
||||
chunk.ColumnChunk.FileOffset = offset
|
||||
|
||||
if chunk.isDictPage {
|
||||
dictPageOffset := offset
|
||||
chunk.ColumnChunk.MetaData.DictionaryPageOffset = &dictPageOffset
|
||||
offset += chunk.dictPageLen
|
||||
}
|
||||
|
||||
chunk.ColumnChunk.MetaData.DataPageOffset = offset
|
||||
offset += chunk.dataPageLen
|
||||
}
|
||||
|
||||
return rows
|
||||
}
|
107
pkg/s3select/internal/parquet-go/data/jsonvalue.go
Normal file
107
pkg/s3select/internal/parquet-go/data/jsonvalue.go
Normal file
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
|
||||
// jsonValue wraps a gjson.Result, optionally scoped to a single path.
type jsonValue struct {
	result *gjson.Result // nil represents a missing / null value
	path   *string       // when set, Get matches only this exact path
}
|
||||
|
||||
func (v *jsonValue) String() string {
|
||||
if v.result == nil {
|
||||
return "<nil>"
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%v", *v.result)
|
||||
}
|
||||
|
||||
// IsNull reports whether the wrapped value is absent or JSON null.
func (v *jsonValue) IsNull() bool {
	return v.result == nil || v.result.Type == gjson.Null
}
|
||||
|
||||
func (v *jsonValue) Get(path string) *jsonValue {
|
||||
if v.path != nil {
|
||||
var result *gjson.Result
|
||||
if *v.path == path {
|
||||
result = v.result
|
||||
}
|
||||
|
||||
return resultToJSONValue(result)
|
||||
}
|
||||
|
||||
if v.result == nil {
|
||||
return resultToJSONValue(nil)
|
||||
}
|
||||
|
||||
result := v.result.Get(path)
|
||||
if !result.Exists() {
|
||||
return resultToJSONValue(nil)
|
||||
}
|
||||
|
||||
return resultToJSONValue(&result)
|
||||
}
|
||||
|
||||
// GetValue converts the wrapped JSON value to a Go value for the given
// parquet (and optional converted) type. An absent value yields (nil, nil).
func (v *jsonValue) GetValue(parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) {
	if v.result == nil {
		return nil, nil
	}

	return resultToParquetValue(*v.result, parquetType, convertedType)
}
|
||||
|
||||
// GetArray returns the wrapped value as a JSON array. An absent value
// yields (nil, nil).
func (v *jsonValue) GetArray() ([]gjson.Result, error) {
	if v.result == nil {
		return nil, nil
	}

	return resultToArray(*v.result)
}
|
||||
|
||||
func (v *jsonValue) Range(iterator func(key, value gjson.Result) bool) error {
|
||||
if v.result == nil || v.result.Type == gjson.Null {
|
||||
return nil
|
||||
}
|
||||
|
||||
if v.result.Type != gjson.JSON || !v.result.IsObject() {
|
||||
return fmt.Errorf("result is not Map but %v", v.result.Type)
|
||||
}
|
||||
|
||||
v.result.ForEach(iterator)
|
||||
return nil
|
||||
}
|
||||
|
||||
// resultToJSONValue wraps a gjson result (possibly nil) in a jsonValue.
func resultToJSONValue(result *gjson.Result) *jsonValue {
	return &jsonValue{
		result: result,
	}
}
|
||||
|
||||
func bytesToJSONValue(data []byte) (*jsonValue, error) {
|
||||
if !gjson.ValidBytes(data) {
|
||||
return nil, fmt.Errorf("invalid JSON data")
|
||||
}
|
||||
|
||||
result := gjson.ParseBytes(data)
|
||||
return resultToJSONValue(&result), nil
|
||||
}
|
360
pkg/s3select/internal/parquet-go/data/result.go
Normal file
360
pkg/s3select/internal/parquet-go/data/result.go
Normal file
@ -0,0 +1,360 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
|
||||
func resultToBool(result gjson.Result) (value interface{}, err error) {
|
||||
switch result.Type {
|
||||
case gjson.False, gjson.True:
|
||||
return result.Bool(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not Bool but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToInt32(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToInt64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(int64) < math.MinInt32 || value.(int64) > math.MaxInt32 {
|
||||
return nil, fmt.Errorf("int32 overflow")
|
||||
}
|
||||
|
||||
return int32(value.(int64)), nil
|
||||
}
|
||||
|
||||
func resultToInt64(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return result.Int(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not Number but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToFloat(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return float32(result.Float()), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not float32 but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToDouble(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return result.Float(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not float64 but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToBytes(result gjson.Result) (interface{}, error) {
|
||||
if result.Type != gjson.JSON || !result.IsArray() {
|
||||
return nil, fmt.Errorf("result is not byte array but %v", result.Type)
|
||||
}
|
||||
|
||||
data := []byte{}
|
||||
for i, r := range result.Array() {
|
||||
if r.Type != gjson.Number {
|
||||
return nil, fmt.Errorf("result[%v] is not byte but %v", i, r.Type)
|
||||
}
|
||||
|
||||
value := r.Uint()
|
||||
if value > math.MaxUint8 {
|
||||
return nil, fmt.Errorf("byte overflow in result[%v]", i)
|
||||
}
|
||||
|
||||
data = append(data, byte(value))
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func resultToString(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.String {
|
||||
return result.String(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not String but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToUint8(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToUint64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(uint64) > math.MaxUint8 {
|
||||
return nil, fmt.Errorf("uint8 overflow")
|
||||
}
|
||||
|
||||
return uint8(value.(uint64)), nil
|
||||
}
|
||||
|
||||
func resultToUint16(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToUint64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(uint64) > math.MaxUint16 {
|
||||
return nil, fmt.Errorf("uint16 overflow")
|
||||
}
|
||||
|
||||
return uint16(value.(uint64)), nil
|
||||
}
|
||||
|
||||
func resultToUint32(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToUint64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(uint64) > math.MaxUint32 {
|
||||
return nil, fmt.Errorf("uint32 overflow")
|
||||
}
|
||||
|
||||
return uint32(value.(uint64)), nil
|
||||
}
|
||||
|
||||
func resultToUint64(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return result.Uint(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not Number but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToInt8(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToInt64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(int64) < math.MinInt8 || value.(int64) > math.MaxInt8 {
|
||||
return nil, fmt.Errorf("int8 overflow")
|
||||
}
|
||||
|
||||
return int8(value.(int64)), nil
|
||||
}
|
||||
|
||||
func resultToInt16(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToInt64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(int64) < math.MinInt16 || value.(int64) > math.MaxInt16 {
|
||||
return nil, fmt.Errorf("int16 overflow")
|
||||
}
|
||||
|
||||
return int16(value.(int64)), nil
|
||||
}
|
||||
|
||||
func stringToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
return []byte(value.(string)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("string cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint8)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint8)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint8 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint16)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint16)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint16 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint32)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint32)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint32 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint64)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint64)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint64 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(int8)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(int8)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int8 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(int16)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(int16)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int16 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return value.(int32), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(int32)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int32 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(int64)), nil
|
||||
case parquet.Type_INT64:
|
||||
return value.(int64), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int64 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
// resultToParquetValueByConvertedValue converts a gjson result according to
// the parquet logical (converted) type: the result is first parsed into the
// matching Go type, then narrowed to the physical parquetType by the
// corresponding *ToParquetValue helper. A JSON null yields a nil value.
func resultToParquetValueByConvertedValue(result gjson.Result, convertedType parquet.ConvertedType, parquetType parquet.Type) (value interface{}, err error) {
	if result.Type == gjson.Null {
		return nil, nil
	}

	switch convertedType {
	case parquet.ConvertedType_UTF8:
		if value, err = resultToString(result); err != nil {
			return nil, err
		}
		return stringToParquetValue(value, parquetType)
	case parquet.ConvertedType_UINT_8:
		if value, err = resultToUint8(result); err != nil {
			return nil, err
		}
		return uint8ToParquetValue(value, parquetType)
	case parquet.ConvertedType_UINT_16:
		if value, err = resultToUint16(result); err != nil {
			return nil, err
		}
		return uint16ToParquetValue(value, parquetType)
	case parquet.ConvertedType_UINT_32:
		if value, err = resultToUint32(result); err != nil {
			return nil, err
		}
		return uint32ToParquetValue(value, parquetType)
	case parquet.ConvertedType_UINT_64:
		if value, err = resultToUint64(result); err != nil {
			return nil, err
		}
		return uint64ToParquetValue(value, parquetType)
	case parquet.ConvertedType_INT_8:
		if value, err = resultToInt8(result); err != nil {
			return nil, err
		}
		return int8ToParquetValue(value, parquetType)
	case parquet.ConvertedType_INT_16:
		if value, err = resultToInt16(result); err != nil {
			return nil, err
		}
		return int16ToParquetValue(value, parquetType)
	case parquet.ConvertedType_INT_32:
		if value, err = resultToInt32(result); err != nil {
			return nil, err
		}
		return int32ToParquetValue(value, parquetType)
	case parquet.ConvertedType_INT_64:
		if value, err = resultToInt64(result); err != nil {
			return nil, err
		}
		return int64ToParquetValue(value, parquetType)
	}

	return nil, fmt.Errorf("unsupported converted type %v", convertedType)
}
|
||||
|
||||
// resultToParquetValue converts a gjson result to the Go value for the given
// parquet physical type. When a converted (logical) type is present it takes
// precedence over the raw physical type. A JSON null yields a nil value.
func resultToParquetValue(result gjson.Result, parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) {
	if convertedType != nil {
		return resultToParquetValueByConvertedValue(result, *convertedType, parquetType)
	}

	if result.Type == gjson.Null {
		return nil, nil
	}

	switch parquetType {
	case parquet.Type_BOOLEAN:
		return resultToBool(result)
	case parquet.Type_INT32:
		return resultToInt32(result)
	case parquet.Type_INT64:
		return resultToInt64(result)
	case parquet.Type_FLOAT:
		return resultToFloat(result)
	case parquet.Type_DOUBLE:
		return resultToDouble(result)
	case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
		// INT96 and both byte-array shapes are carried as raw bytes.
		return resultToBytes(result)
	}

	return nil, fmt.Errorf("unknown parquet type %v", parquetType)
}
|
||||
|
||||
func resultToArray(result gjson.Result) ([]gjson.Result, error) {
|
||||
if result.Type == gjson.Null {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if result.Type != gjson.JSON || !result.IsArray() {
|
||||
return nil, fmt.Errorf("result is not Array but %v", result.Type)
|
||||
}
|
||||
|
||||
return result.Array(), nil
|
||||
}
|
490
pkg/s3select/internal/parquet-go/decode.go
Normal file
490
pkg/s3select/internal/parquet-go/decode.go
Normal file
@ -0,0 +1,490 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"math"

	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)
|
||||
|
||||
// i64sToi32s narrows each element of i64s to int32. Values outside the
// int32 range truncate (Go conversion semantics).
func i64sToi32s(i64s []int64) (i32s []int32) {
	i32s = make([]int32, len(i64s))
	for i, v := range i64s {
		i32s[i] = int32(v)
	}
	return i32s
}
|
||||
|
||||
func readBitPacked(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) {
|
||||
count := header * 8
|
||||
|
||||
if count == 0 {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
if bitWidth == 0 {
|
||||
return make([]int64, count), nil
|
||||
}
|
||||
|
||||
data := make([]byte, header*bitWidth)
|
||||
if _, err = reader.Read(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var val, used, left, b uint64
|
||||
|
||||
valNeedBits := bitWidth
|
||||
i := -1
|
||||
for {
|
||||
if left <= 0 {
|
||||
i++
|
||||
if i >= len(data) {
|
||||
break
|
||||
}
|
||||
|
||||
b = uint64(data[i])
|
||||
left = 8
|
||||
used = 0
|
||||
}
|
||||
|
||||
if left >= valNeedBits {
|
||||
val |= ((b >> used) & ((1 << valNeedBits) - 1)) << (bitWidth - valNeedBits)
|
||||
result = append(result, int64(val))
|
||||
val = 0
|
||||
left -= valNeedBits
|
||||
used += valNeedBits
|
||||
valNeedBits = bitWidth
|
||||
} else {
|
||||
val |= (b >> used) << (bitWidth - valNeedBits)
|
||||
valNeedBits -= left
|
||||
left = 0
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readBools(reader *bytes.Reader, count uint64) (result []bool, err error) {
|
||||
i64s, err := readBitPacked(reader, count, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
result = append(result, i64s[i] > 0)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readInt32s(reader *bytes.Reader, count uint64) (result []int32, err error) {
|
||||
buf := make([]byte, 4)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, int32(bytesToUint32(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readInt64s(reader *bytes.Reader, count uint64) (result []int64, err error) {
|
||||
buf := make([]byte, 8)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, int64(bytesToUint64(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readInt96s(reader *bytes.Reader, count uint64) (result [][]byte, err error) {
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
buf := make([]byte, 12)
|
||||
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, buf)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readFloats(reader *bytes.Reader, count uint64) (result []float32, err error) {
|
||||
buf := make([]byte, 4)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, math.Float32frombits(bytesToUint32(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readDoubles(reader *bytes.Reader, count uint64) (result []float64, err error) {
|
||||
buf := make([]byte, 8)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, math.Float64frombits(bytesToUint64(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readByteArrays(reader *bytes.Reader, count uint64) (result [][]byte, err error) {
|
||||
buf := make([]byte, 4)
|
||||
var length uint32
|
||||
var data []byte
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
length = bytesToUint32(buf)
|
||||
data = make([]byte, length)
|
||||
if length > 0 {
|
||||
if _, err = reader.Read(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
result = append(result, data)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readFixedLenByteArrays(reader *bytes.Reader, count, length uint64) (result [][]byte, err error) {
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
data := make([]byte, length)
|
||||
if _, err = reader.Read(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, data)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// readValues dispatches PLAIN decoding of count values by physical type,
// returning a typed slice ([]bool, []int32, ...). length is used only for
// FIXED_LEN_BYTE_ARRAY, where it gives each value's byte size.
func readValues(reader *bytes.Reader, dataType parquet.Type, count, length uint64) (interface{}, error) {
	switch dataType {
	case parquet.Type_BOOLEAN:
		return readBools(reader, count)
	case parquet.Type_INT32:
		return readInt32s(reader, count)
	case parquet.Type_INT64:
		return readInt64s(reader, count)
	case parquet.Type_INT96:
		return readInt96s(reader, count)
	case parquet.Type_FLOAT:
		return readFloats(reader, count)
	case parquet.Type_DOUBLE:
		return readDoubles(reader, count)
	case parquet.Type_BYTE_ARRAY:
		return readByteArrays(reader, count)
	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
		return readFixedLenByteArrays(reader, count, length)
	}

	return nil, fmt.Errorf("unknown parquet type %v", dataType)
}
|
||||
|
||||
func readUnsignedVarInt(reader *bytes.Reader) (v uint64, err error) {
|
||||
var b byte
|
||||
var shift uint64
|
||||
|
||||
for {
|
||||
if b, err = reader.ReadByte(); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if v |= ((uint64(b) & 0x7F) << shift); b&0x80 == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
shift += 7
|
||||
}
|
||||
|
||||
return v, nil
|
||||
}
|
||||
|
||||
// readRLE decodes one RLE run of the hybrid encoding: a single value stored
// in ceil(bitWidth/8) bytes, repeated (header >> 1) times. The low bit of
// header is the run-type flag already examined by the caller.
func readRLE(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) {
	width := (bitWidth + 7) / 8
	data := make([]byte, width)
	if width > 0 {
		if _, err = reader.Read(data); err != nil {
			return nil, err
		}
	}

	// Pad to 4 bytes so the run value can be read as a little-endian uint32.
	if width < 4 {
		data = append(data, make([]byte, 4-width)...)
	}

	val := int64(bytesToUint32(data))

	count := header >> 1
	result = make([]int64, count)
	for i := range result {
		result[i] = val
	}

	return result, nil
}
|
||||
|
||||
// readRLEBitPackedHybrid decodes the parquet RLE/bit-packed hybrid
// encoding. length is the payload size in bytes; when 0, it is read from a
// leading little-endian int32. The payload is a sequence of runs, each
// introduced by a varint header whose low bit selects the run type:
// 0 = RLE run, 1 = bit-packed run.
func readRLEBitPackedHybrid(reader *bytes.Reader, length, bitWidth uint64) (result []int64, err error) {
	if length <= 0 {
		var i32s []int32
		i32s, err = readInt32s(reader, 1)
		if err != nil {
			return nil, err
		}
		length = uint64(i32s[0])
	}

	buf := make([]byte, length)
	if _, err = reader.Read(buf); err != nil {
		return nil, err
	}

	// Re-bound the reader to the payload so runs cannot overrun it.
	reader = bytes.NewReader(buf)
	for reader.Len() > 0 {
		header, err := readUnsignedVarInt(reader)
		if err != nil {
			return nil, err
		}

		var i64s []int64
		if header&1 == 0 {
			i64s, err = readRLE(reader, header, bitWidth)
		} else {
			i64s, err = readBitPacked(reader, header>>1, bitWidth)
		}

		if err != nil {
			return nil, err
		}

		result = append(result, i64s...)
	}

	return result, nil
}
|
||||
|
||||
// readDeltaBinaryPackedInt decodes parquet DELTA_BINARY_PACKED integers.
// Layout: varints <block size> <miniblocks per block> <total value count>
// <zigzag first value>, then per block a <zigzag min delta>, one bit-width
// byte per miniblock, and each miniblock's bit-packed deltas.
func readDeltaBinaryPackedInt(reader *bytes.Reader) (result []int64, err error) {
	blockSize, err := readUnsignedVarInt(reader)
	if err != nil {
		return nil, err
	}

	numMiniblocksInBlock, err := readUnsignedVarInt(reader)
	if err != nil {
		return nil, err
	}

	numValues, err := readUnsignedVarInt(reader)
	if err != nil {
		return nil, err
	}

	firstValueZigZag, err := readUnsignedVarInt(reader)
	if err != nil {
		return nil, err
	}

	// ZigZag decode: (n >> 1) ^ -(n & 1).
	v := int64(firstValueZigZag>>1) ^ (-int64(firstValueZigZag & 1))
	result = append(result, v)

	numValuesInMiniBlock := blockSize / numMiniblocksInBlock

	bitWidths := make([]uint64, numMiniblocksInBlock)
	for uint64(len(result)) < numValues {
		minDeltaZigZag, err := readUnsignedVarInt(reader)
		if err != nil {
			return nil, err
		}

		// One bit-width byte per miniblock precedes the packed deltas.
		for i := 0; uint64(i) < numMiniblocksInBlock; i++ {
			b, err := reader.ReadByte()
			if err != nil {
				return nil, err
			}
			bitWidths[i] = uint64(b)
		}

		minDelta := int64(minDeltaZigZag>>1) ^ (-int64(minDeltaZigZag & 1))
		for i := 0; uint64(i) < numMiniblocksInBlock; i++ {
			i64s, err := readBitPacked(reader, numValuesInMiniBlock/8, bitWidths[i])
			if err != nil {
				return nil, err
			}

			// Stored deltas are relative to minDelta; accumulate to
			// reconstruct the original values.
			for j := range i64s {
				v += i64s[j] + minDelta
				result = append(result, v)
			}
		}
	}

	// The last block may be padded; trim to the declared count.
	return result[:numValues], nil
}
|
||||
|
||||
// readDeltaLengthByteArrays decodes DELTA_LENGTH_BYTE_ARRAY data: a
// delta-binary-packed list of lengths followed by the concatenated value
// bytes, sliced per length.
func readDeltaLengthByteArrays(reader *bytes.Reader) (result [][]byte, err error) {
	i64s, err := readDeltaBinaryPackedInt(reader)
	if err != nil {
		return nil, err
	}

	for i := 0; i < len(i64s); i++ {
		arrays, err := readFixedLenByteArrays(reader, 1, uint64(i64s[i]))
		if err != nil {
			return nil, err
		}

		result = append(result, arrays[0])
	}

	return result, nil
}
|
||||
|
||||
// readDeltaByteArrays decodes DELTA_BYTE_ARRAY data: delta-packed prefix
// lengths plus delta-length-encoded suffixes. Each value reuses
// prefixLength bytes of its predecessor followed by its own suffix.
func readDeltaByteArrays(reader *bytes.Reader) (result [][]byte, err error) {
	i64s, err := readDeltaBinaryPackedInt(reader)
	if err != nil {
		return nil, err
	}

	suffixes, err := readDeltaLengthByteArrays(reader)
	if err != nil {
		return nil, err
	}

	// The first value has no prefix; it is its suffix verbatim.
	result = append(result, suffixes[0])
	for i := 1; i < len(i64s); i++ {
		prefixLength := i64s[i]
		// Copy the shared prefix so values do not alias one another.
		val := append([]byte{}, result[i-1][:prefixLength]...)
		val = append(val, suffixes[i]...)
		result = append(result, val)
	}

	return result, nil
}
|
||||
|
||||
// readDataPageValues decodes the values section of a data page and returns
// the decoded values together with the physical type they were materialized
// as: dictionary/RLE/delta-int paths yield INT64 (or INT32) indexes or
// values regardless of the column's declared type, and the byte-array delta
// encodings come back as FIXED_LEN_BYTE_ARRAY.
// convertedType is currently unused by every branch.
func readDataPageValues(
	bytesReader *bytes.Reader,
	encoding parquet.Encoding,
	dataType parquet.Type,
	convertedType parquet.ConvertedType,
	count, bitWidth uint64,
) (result interface{}, resultDataType parquet.Type, err error) {
	switch encoding {
	case parquet.Encoding_PLAIN:
		result, err = readValues(bytesReader, dataType, count, bitWidth)
		return result, dataType, err

	case parquet.Encoding_PLAIN_DICTIONARY:
		// The first byte is the bit width of the dictionary indexes.
		b, err := bytesReader.ReadByte()
		if err != nil {
			return nil, -1, err
		}

		i64s, err := readRLEBitPackedHybrid(bytesReader, uint64(bytesReader.Len()), uint64(b))
		if err != nil {
			return nil, -1, err
		}

		return i64s[:count], parquet.Type_INT64, nil

	case parquet.Encoding_RLE:
		i64s, err := readRLEBitPackedHybrid(bytesReader, 0, bitWidth)
		if err != nil {
			return nil, -1, err
		}

		i64s = i64s[:count]

		if dataType == parquet.Type_INT32 {
			return i64sToi32s(i64s), parquet.Type_INT32, nil
		}

		return i64s, parquet.Type_INT64, nil

	case parquet.Encoding_BIT_PACKED:
		return nil, -1, fmt.Errorf("deprecated parquet encoding %v", parquet.Encoding_BIT_PACKED)

	case parquet.Encoding_DELTA_BINARY_PACKED:
		i64s, err := readDeltaBinaryPackedInt(bytesReader)
		if err != nil {
			return nil, -1, err
		}

		i64s = i64s[:count]

		if dataType == parquet.Type_INT32 {
			return i64sToi32s(i64s), parquet.Type_INT32, nil
		}

		return i64s, parquet.Type_INT64, nil

	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
		byteSlices, err := readDeltaLengthByteArrays(bytesReader)
		if err != nil {
			return nil, -1, err
		}

		return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil

	case parquet.Encoding_DELTA_BYTE_ARRAY:
		byteSlices, err := readDeltaByteArrays(bytesReader)
		if err != nil {
			return nil, -1, err
		}

		return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil
	}

	return nil, -1, fmt.Errorf("unsupported parquet encoding %v", encoding)
}
|
450
pkg/s3select/internal/parquet-go/encode.go
Normal file
450
pkg/s3select/internal/parquet-go/encode.go
Normal file
@ -0,0 +1,450 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// boolsToBytes packs bs one bit per value, LSB-first within each byte
// (parquet PLAIN boolean layout). The final byte is zero-padded.
func boolsToBytes(bs []bool) []byte {
	result := make([]byte, (len(bs)+7)/8)
	for i, b := range bs {
		if b {
			result[i/8] |= 1 << uint32(i%8)
		}
	}

	return result
}
|
||||
|
||||
func int32sToBytes(i32s []int32) []byte {
|
||||
buf := make([]byte, 4*len(i32s))
|
||||
for i, i32 := range i32s {
|
||||
binary.LittleEndian.PutUint32(buf[i*4:], uint32(i32))
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func int64sToBytes(i64s []int64) []byte {
|
||||
buf := make([]byte, 8*len(i64s))
|
||||
for i, i64 := range i64s {
|
||||
binary.LittleEndian.PutUint64(buf[i*8:], uint64(i64))
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func float32sToBytes(f32s []float32) []byte {
|
||||
buf := make([]byte, 4*len(f32s))
|
||||
for i, f32 := range f32s {
|
||||
binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(f32))
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func float64sToBytes(f64s []float64) []byte {
|
||||
buf := make([]byte, 8*len(f64s))
|
||||
for i, f64 := range f64s {
|
||||
binary.LittleEndian.PutUint64(buf[i*8:], math.Float64bits(f64))
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func byteSlicesToBytes(byteSlices [][]byte) []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
for _, s := range byteSlices {
|
||||
if err := binary.Write(buf, binary.LittleEndian, uint32(len(s))); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if _, err := buf.Write(s); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
func byteArraysToBytes(arrayList [][]byte) []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
arrayLen := -1
|
||||
for _, array := range arrayList {
|
||||
if arrayLen != -1 && len(array) != arrayLen {
|
||||
panic(errors.New("array list does not have same length"))
|
||||
}
|
||||
|
||||
arrayLen = len(array)
|
||||
if _, err := buf.Write(array); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// int96sToBytes serializes INT96 values; each is a fixed 12-byte array, so
// the fixed-length byte array layout applies directly.
func int96sToBytes(i96s [][]byte) []byte {
	return byteArraysToBytes(i96s)
}
|
||||
|
||||
// valuesToBytes serializes a typed slice to parquet PLAIN bytes, dispatched
// on the physical type. An unrecognized type yields an empty slice.
func valuesToBytes(values interface{}, dataType parquet.Type) []byte {
	switch dataType {
	case parquet.Type_BOOLEAN:
		return boolsToBytes(values.([]bool))
	case parquet.Type_INT32:
		return int32sToBytes(values.([]int32))
	case parquet.Type_INT64:
		return int64sToBytes(values.([]int64))
	case parquet.Type_INT96:
		return int96sToBytes(values.([][]byte))
	case parquet.Type_FLOAT:
		return float32sToBytes(values.([]float32))
	case parquet.Type_DOUBLE:
		return float64sToBytes(values.([]float64))
	case parquet.Type_BYTE_ARRAY:
		// Variable length: each value gets a 4-byte length prefix.
		return byteSlicesToBytes(values.([][]byte))
	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
		// Fixed length: values are concatenated with no prefixes.
		return byteArraysToBytes(values.([][]byte))
	}

	return []byte{}
}
|
||||
|
||||
// valueToBytes serializes a single value by wrapping it in a one-element
// slice of the matching type and reusing the PLAIN slice encoder. An
// unrecognized type leaves values nil, so valuesToBytes returns []byte{}.
func valueToBytes(value interface{}, dataType parquet.Type) []byte {
	var values interface{}
	switch dataType {
	case parquet.Type_BOOLEAN:
		values = []bool{value.(bool)}
	case parquet.Type_INT32:
		values = []int32{value.(int32)}
	case parquet.Type_INT64:
		values = []int64{value.(int64)}
	case parquet.Type_INT96:
		values = [][]byte{value.([]byte)}
	case parquet.Type_FLOAT:
		values = []float32{value.(float32)}
	case parquet.Type_DOUBLE:
		values = []float64{value.(float64)}
	case parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
		values = [][]byte{value.([]byte)}
	}

	return valuesToBytes(values, dataType)
}
|
||||
|
||||
// unsignedVarIntToBytes encodes ui64 as a ULEB128 varint: 7 payload bits
// per byte, least-significant group first, continuation flag in the high
// bit of every byte except the last.
func unsignedVarIntToBytes(ui64 uint64) []byte {
	size := (getBitWidth(ui64) + 6) / 7
	if size == 0 {
		// Zero still occupies one byte.
		return []byte{0}
	}

	buf := make([]byte, size)
	for i := uint64(0); i < size; i++ {
		buf[i] = byte(ui64&0x7F) | 0x80
		ui64 >>= 7
	}
	// Clear the continuation bit on the final byte.
	buf[size-1] &= 0x7F

	return buf
}
|
||||
|
||||
// valuesToRLEBytes RLE-encodes values: each maximal run of equal values
// becomes a varint header (runLength << 1, low bit 0 marking an RLE run)
// followed by the run value in ceil(bitWidth/8) bytes.
func valuesToRLEBytes(values interface{}, bitWidth int32, valueType parquet.Type) []byte {
	vals := valuesToInterfaces(values, valueType)
	result := []byte{}
	j := 0
	for i := 0; i < len(vals); i = j {
		// Extend the run while values repeat. NOTE(review): interface
		// equality panics for non-comparable values such as []byte —
		// presumably only scalar index/level types reach here; confirm.
		for j = i + 1; j < len(vals) && vals[i] == vals[j]; j++ {
		}
		headerBytes := unsignedVarIntToBytes(uint64((j - i) << 1))
		result = append(result, headerBytes...)

		// Keep only the low ceil(bitWidth/8) bytes of the PLAIN encoding.
		valBytes := valueToBytes(vals[i], valueType)
		byteCount := (bitWidth + 7) / 8
		result = append(result, valBytes[:byteCount]...)
	}

	return result
}
|
||||
|
||||
// valuesToRLEBitPackedHybridBytes prefixes the RLE payload with its 4-byte
// little-endian length, as the hybrid encoding requires in data pages.
func valuesToRLEBitPackedHybridBytes(values interface{}, bitWidth int32, dataType parquet.Type) []byte {
	rleBytes := valuesToRLEBytes(values, bitWidth, dataType)
	lenBytes := valueToBytes(int32(len(rleBytes)), parquet.Type_INT32)
	return append(lenBytes, rleBytes...)
}
|
||||
|
||||
// valuesToBitPackedBytes bit-packs values at bitWidth bits per value,
// LSB-first within each output byte. Only BOOLEAN, INT32 and INT64 inputs
// are supported; anything else panics. When withHeader is true, the hybrid
// encoding's bit-packed run header ((groupCount << 1) | 1) is prepended.
func valuesToBitPackedBytes(values interface{}, bitWidth int64, withHeader bool, dataType parquet.Type) []byte {
	// Normalize every supported input to []int64.
	var i64s []int64
	switch dataType {
	case parquet.Type_BOOLEAN:
		bs := values.([]bool)
		i64s = make([]int64, len(bs))
		for i := range bs {
			if bs[i] {
				i64s[i] = 1
			}
		}
	case parquet.Type_INT32:
		i32s := values.([]int32)
		i64s = make([]int64, len(i32s))
		for i := range i32s {
			i64s[i] = int64(i32s[i])
		}
	case parquet.Type_INT64:
		i64s = values.([]int64)
	default:
		panic(fmt.Errorf("data type %v is not supported for bit packing", dataType))
	}

	if len(i64s) == 0 {
		return nil
	}

	// Bit-cursor state: bitsSet bits of the current value are consumed,
	// bitsNeeded bits remain free in the current output byte, and
	// bitsToSet bits of the current value still need writing.
	var valueByte byte
	bitsSet := uint64(0)
	bitsNeeded := uint64(8)
	bitsToSet := uint64(bitWidth)
	value := i64s[0]

	valueBytes := []byte{}
	for i := 0; i < len(i64s); {
		if bitsToSet >= bitsNeeded {
			// The current value fills the rest of this output byte.
			valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded))
			valueBytes = append(valueBytes, valueByte)
			bitsToSet -= bitsNeeded
			bitsSet += bitsNeeded

			bitsNeeded = 8
			valueByte = 0

			if bitsToSet <= 0 && (i+1) < len(i64s) {
				// Value fully written; advance to the next input value.
				i++
				value = i64s[i]
				bitsToSet = uint64(bitWidth)
				bitsSet = 0
			}
		} else {
			// The value's remaining bits fit with room to spare; start
			// packing the next value into the same byte.
			valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded))
			i++

			if i < len(i64s) {
				value = i64s[i]
			}

			bitsNeeded -= bitsToSet
			bitsToSet = uint64(bitWidth)
			bitsSet = 0
		}
	}

	if withHeader {
		header := uint64(((len(i64s) / 8) << 1) | 1)
		headerBytes := unsignedVarIntToBytes(header)
		return append(headerBytes, valueBytes...)
	}

	return valueBytes
}
|
||||
|
||||
// Delta-encoding layout parameters: each block of 128 values is split into
// 4 miniblocks of 32 values each.
const (
	blockSize     = 128
	subBlockSize  = 32
	subBlockCount = blockSize / subBlockSize
)
|
||||
|
||||
// Pre-encoded varint forms of the layout constants, reused in every
// delta-encoded header.
var (
	blockSizeBytes     = unsignedVarIntToBytes(blockSize)
	subBlockCountBytes = unsignedVarIntToBytes(subBlockCount)
)
|
||||
|
||||
// int32ToDeltaBytes encodes i32s as parquet DELTA_BINARY_PACKED: a header
// (block size, miniblock count, value count, zigzag first value) followed,
// per block, by the zigzag min delta, one bit-width byte per miniblock, and
// each miniblock's bit-packed (delta - minDelta) values.
// Panics on an empty slice (i32s[0] is read unconditionally).
func int32ToDeltaBytes(i32s []int32) []byte {
	// ZigZag encode so small negative values stay small.
	getValue := func(i32 int32) uint64 {
		return uint64((i32 >> 31) ^ (i32 << 1))
	}

	result := append([]byte{}, blockSizeBytes...)
	result = append(result, subBlockCountBytes...)
	result = append(result, unsignedVarIntToBytes(uint64(len(i32s)))...)
	result = append(result, unsignedVarIntToBytes(getValue(i32s[0]))...)

	for i := 1; i < len(i32s); {
		block := []int32{}
		minDelta := int32(0x7FFFFFFF)

		for ; i < len(i32s) && len(block) < blockSize; i++ {
			delta := i32s[i] - i32s[i-1]
			block = append(block, delta)
			if delta < minDelta {
				minDelta = delta
			}
		}

		// Pad the final block with minDelta so padding packs as zeros
		// after the subtraction below.
		for len(block) < blockSize {
			block = append(block, minDelta)
		}

		bitWidths := make([]byte, subBlockCount)
		for j := 0; j < subBlockCount; j++ {
			// Rebase each miniblock on minDelta and record the width
			// needed for its largest value.
			maxValue := int32(0)
			for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
				block[k] -= minDelta
				if block[k] > maxValue {
					maxValue = block[k]
				}
			}

			bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
		}

		minDeltaZigZag := getValue(minDelta)
		result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
		result = append(result, bitWidths...)

		for j := 0; j < subBlockCount; j++ {
			bitPacked := valuesToBitPackedBytes(
				block[j*subBlockSize:(j+1)*subBlockSize],
				int64(bitWidths[j]),
				false,
				parquet.Type_INT32,
			)
			result = append(result, bitPacked...)
		}
	}

	return result
}
|
||||
|
||||
// int64ToDeltaBytes encodes i64s as parquet DELTA_BINARY_PACKED; identical
// scheme to int32ToDeltaBytes but with 64-bit zigzag and deltas.
// Panics on an empty slice (i64s[0] is read unconditionally).
func int64ToDeltaBytes(i64s []int64) []byte {
	// ZigZag encode so small negative values stay small.
	getValue := func(i64 int64) uint64 {
		return uint64((i64 >> 63) ^ (i64 << 1))
	}

	result := append([]byte{}, blockSizeBytes...)
	result = append(result, subBlockCountBytes...)
	result = append(result, unsignedVarIntToBytes(uint64(len(i64s)))...)
	result = append(result, unsignedVarIntToBytes(getValue(i64s[0]))...)

	for i := 1; i < len(i64s); {
		block := []int64{}
		minDelta := int64(0x7FFFFFFFFFFFFFFF)

		for ; i < len(i64s) && len(block) < blockSize; i++ {
			delta := i64s[i] - i64s[i-1]
			block = append(block, delta)
			if delta < minDelta {
				minDelta = delta
			}
		}

		// Pad the final block with minDelta so padding packs as zeros
		// after the subtraction below.
		for len(block) < blockSize {
			block = append(block, minDelta)
		}

		bitWidths := make([]byte, subBlockCount)
		for j := 0; j < subBlockCount; j++ {
			// Rebase each miniblock on minDelta and record the width
			// needed for its largest value.
			maxValue := int64(0)
			for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
				block[k] -= minDelta
				if block[k] > maxValue {
					maxValue = block[k]
				}
			}

			bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
		}

		minDeltaZigZag := getValue(minDelta)
		result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
		result = append(result, bitWidths...)

		for j := 0; j < subBlockCount; j++ {
			bitPacked := valuesToBitPackedBytes(
				block[j*subBlockSize:(j+1)*subBlockSize],
				int64(bitWidths[j]),
				false,
				parquet.Type_INT64,
			)
			result = append(result, bitPacked...)
		}
	}

	return result
}
|
||||
|
||||
// valuesToDeltaBytes dispatches DELTA_BINARY_PACKED encoding; only the two
// physical integer types are supported, anything else yields nil.
func valuesToDeltaBytes(values interface{}, dataType parquet.Type) []byte {
	switch dataType {
	case parquet.Type_INT32:
		return int32ToDeltaBytes(values.([]int32))
	case parquet.Type_INT64:
		return int64ToDeltaBytes(values.([]int64))
	}

	return nil
}
|
||||
|
||||
func stringsToDeltaLengthByteArrayBytes(strs []string) []byte {
|
||||
lengths := make([]int32, len(strs))
|
||||
for i, s := range strs {
|
||||
lengths[i] = int32(len(s))
|
||||
}
|
||||
|
||||
result := int32ToDeltaBytes(lengths)
|
||||
for _, s := range strs {
|
||||
result = append(result, []byte(s)...)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func stringsToDeltaByteArrayBytes(strs []string) []byte {
|
||||
prefixLengths := make([]int32, len(strs))
|
||||
suffixes := make([]string, len(strs))
|
||||
|
||||
var i, j int
|
||||
for i = 1; i < len(strs); i++ {
|
||||
for j = 0; j < len(strs[i-1]) && j < len(strs[i]); j++ {
|
||||
if strs[i-1][j] != strs[i][j] {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
prefixLengths[i] = int32(j)
|
||||
suffixes[i] = strs[i][j:]
|
||||
}
|
||||
|
||||
result := int32ToDeltaBytes(prefixLengths)
|
||||
return append(result, stringsToDeltaLengthByteArrayBytes(suffixes)...)
|
||||
}
|
||||
|
||||
// encodeValues serializes values with the requested parquet encoding,
// falling back to the PLAIN layout for any encoding without a dedicated
// path. The delta byte-array encodings require values to be []string.
func encodeValues(values interface{}, dataType parquet.Type, encoding parquet.Encoding, bitWidth int32) []byte {
	switch encoding {
	case parquet.Encoding_RLE:
		return valuesToRLEBitPackedHybridBytes(values, bitWidth, dataType)
	case parquet.Encoding_DELTA_BINARY_PACKED:
		return valuesToDeltaBytes(values, dataType)
	case parquet.Encoding_DELTA_BYTE_ARRAY:
		return stringsToDeltaByteArrayBytes(values.([]string))
	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
		return stringsToDeltaLengthByteArrayBytes(values.([]string))
	}

	return valuesToBytes(values, dataType)
}
|
189
pkg/s3select/internal/parquet-go/encode_test.go
Normal file
189
pkg/s3select/internal/parquet-go/encode_test.go
Normal file
@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// TestBoolsToBytes verifies LSB-first, one-bit-per-value packing of
// booleans, including nil/empty input and multi-byte output.
func TestBoolsToBytes(t *testing.T) {
	testCases := []struct {
		bs             []bool
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]bool{}, []byte{}},
		{[]bool{true}, []byte{1}},
		{[]bool{false}, []byte{0}},
		{[]bool{true, true}, []byte{3}},
		{[]bool{false, false}, []byte{0}},
		{[]bool{false, true}, []byte{2}},
		{[]bool{true, false}, []byte{1}},
		{[]bool{false, false, false, false, false, false, false, true, true}, []byte{128, 1}},
	}

	for i, testCase := range testCases {
		result := boolsToBytes(testCase.bs)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestInt32sToBytes verifies 4-byte little-endian encoding of int32 values,
// covering zero-length, negative, and boundary (MinInt32/MaxInt32) inputs.
func TestInt32sToBytes(t *testing.T) {
	testCases := []struct {
		i32s           []int32
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]int32{}, []byte{}},
		{[]int32{1}, []byte{1, 0, 0, 0}},
		{[]int32{-1}, []byte{255, 255, 255, 255}},
		{[]int32{256}, []byte{0, 1, 0, 0}},
		{[]int32{math.MinInt32}, []byte{0, 0, 0, 128}},
		{[]int32{math.MaxInt32}, []byte{255, 255, 255, 127}},
		{[]int32{257, -2}, []byte{1, 1, 0, 0, 254, 255, 255, 255}},
	}

	for i, testCase := range testCases {
		result := int32sToBytes(testCase.i32s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestInt64sToBytes verifies 8-byte little-endian encoding of int64 values,
// covering zero-length, negative, and boundary (MinInt64/MaxInt64) inputs.
func TestInt64sToBytes(t *testing.T) {
	testCases := []struct {
		i64s           []int64
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]int64{}, []byte{}},
		{[]int64{1}, []byte{1, 0, 0, 0, 0, 0, 0, 0}},
		{[]int64{-1}, []byte{255, 255, 255, 255, 255, 255, 255, 255}},
		{[]int64{256}, []byte{0, 1, 0, 0, 0, 0, 0, 0}},
		{[]int64{math.MinInt64}, []byte{0, 0, 0, 0, 0, 0, 0, 128}},
		{[]int64{math.MaxInt64}, []byte{255, 255, 255, 255, 255, 255, 255, 127}},
		{[]int64{257, -2}, []byte{1, 1, 0, 0, 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255}},
	}

	for i, testCase := range testCases {
		result := int64sToBytes(testCase.i64s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestFloat32sToBytes verifies IEEE-754 single-precision little-endian
// encoding of float32 values (expected bytes match math.Float32bits output).
func TestFloat32sToBytes(t *testing.T) {
	testCases := []struct {
		f32s           []float32
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]float32{}, []byte{}},
		{[]float32{1}, []byte{0, 0, 128, 63}},
		{[]float32{1.0}, []byte{0, 0, 128, 63}},
		{[]float32{-1}, []byte{0, 0, 128, 191}},
		{[]float32{-1.0}, []byte{0, 0, 128, 191}},
		{[]float32{256}, []byte{0, 0, 128, 67}},
		{[]float32{1.1}, []byte{205, 204, 140, 63}},
		{[]float32{-1.1}, []byte{205, 204, 140, 191}},
		{[]float32{math.Pi}, []byte{219, 15, 73, 64}},
		{[]float32{257, -2}, []byte{0, 128, 128, 67, 0, 0, 0, 192}},
		{[]float32{257.1, -2.1}, []byte{205, 140, 128, 67, 102, 102, 6, 192}},
	}

	for i, testCase := range testCases {
		result := float32sToBytes(testCase.f32s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestFloat64sToBytes verifies IEEE-754 double-precision little-endian
// encoding of float64 values (expected bytes match math.Float64bits output).
func TestFloat64sToBytes(t *testing.T) {
	testCases := []struct {
		f64s           []float64
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]float64{}, []byte{}},
		{[]float64{1}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
		{[]float64{1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
		{[]float64{-1}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
		{[]float64{-1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
		{[]float64{256}, []byte{0, 0, 0, 0, 0, 0, 112, 64}},
		{[]float64{1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 63}},
		{[]float64{-1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 191}},
		{[]float64{math.Pi}, []byte{24, 45, 68, 84, 251, 33, 9, 64}},
		{[]float64{257, -2}, []byte{0, 0, 0, 0, 0, 16, 112, 64, 0, 0, 0, 0, 0, 0, 0, 192}},
		{[]float64{257.1, -2.1}, []byte{154, 153, 153, 153, 153, 17, 112, 64, 205, 204, 204, 204, 204, 204, 0, 192}},
	}

	for i, testCase := range testCases {
		result := float64sToBytes(testCase.f64s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestUnsignedVarIntToBytes verifies unsigned LEB128 varint encoding,
// including the 1-byte/2-byte boundary (0x7F vs 0x80) and MaxUint64
// (10 bytes, the longest possible encoding).
func TestUnsignedVarIntToBytes(t *testing.T) {
	testCases := []struct {
		ui64           uint64
		expectedResult []byte
	}{
		{0, []byte{0}},
		{1, []byte{1}},
		{0x7F, []byte{127}},
		{0x80, []byte{128, 1}},
		{uint64(math.MaxUint64), []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 1}},
	}

	for i, testCase := range testCases {
		result := unsignedVarIntToBytes(testCase.ui64)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestValuesToRLEBytes verifies run-length encoding of int32 runs: each run
// is emitted as a varint header (runLength << 1) followed by the value
// truncated to the byte count implied by bitWidth.
func TestValuesToRLEBytes(t *testing.T) {
	testCases := []struct {
		values         interface{}
		bitWidth       int32
		dataType       parquet.Type
		expectedResult []byte
	}{
		// Three runs of length 1: headers are (1<<1)=2.
		{[]int32{3, 5, 7}, 1, parquet.Type_INT32, []byte{2, 3, 2, 5, 2, 7}},
		// One run of length 3: header is (3<<1)=6.
		{[]int32{3, 3, 3}, 1, parquet.Type_INT32, []byte{6, 3}},
		{[]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32, []byte{4, 2, 6, 3}},
	}

	for i, testCase := range testCases {
		result := valuesToRLEBytes(testCase.values, testCase.bitWidth, testCase.dataType)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
38
pkg/s3select/internal/parquet-go/encoding/common.go
Normal file
38
pkg/s3select/internal/parquet-go/encoding/common.go
Normal file
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
)
|
||||
|
||||
// Refer https://en.wikipedia.org/wiki/LEB128#Unsigned_LEB128
|
||||
func varIntEncode(ui64 uint64) []byte {
|
||||
if ui64 == 0 {
|
||||
return []byte{0}
|
||||
}
|
||||
|
||||
length := int(common.BitWidth(ui64)+6) / 7
|
||||
data := make([]byte, length)
|
||||
for i := 0; i < length; i++ {
|
||||
data[i] = byte(ui64&0x7F) | 0x80
|
||||
ui64 >>= 7
|
||||
}
|
||||
data[length-1] &= 0x7F
|
||||
|
||||
return data
|
||||
}
|
43
pkg/s3select/internal/parquet-go/encoding/common_test.go
Normal file
43
pkg/s3select/internal/parquet-go/encoding/common_test.go
Normal file
@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestVarIntToBytes verifies varIntEncode's unsigned LEB128 output,
// including the 1-byte/2-byte boundary (0x7F vs 0x80) and the 10-byte
// MaxUint64 encoding.
func TestVarIntToBytes(t *testing.T) {
	testCases := []struct {
		ui64           uint64
		expectedResult []byte
	}{
		{0, []byte{0}},
		{1, []byte{1}},
		{0x7F, []byte{127}},
		{0x80, []byte{128, 1}},
		{uint64(math.MaxUint64), []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 1}},
	}

	for i, testCase := range testCases {
		result := varIntEncode(testCase.ui64)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
296
pkg/s3select/internal/parquet-go/encoding/delta-encode.go
Normal file
296
pkg/s3select/internal/parquet-go/encoding/delta-encode.go
Normal file
@ -0,0 +1,296 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// Block geometry for DELTA_BINARY_PACKED encoding: each block holds 128
// deltas split into 4 mini-blocks of 32 values each.
const (
	blockSize      = 128
	miniBlockSize  = 32
	miniBlockCount = blockSize / miniBlockSize
)

// deltaEncodeHeaderBytes is the constant page header <block size>
// <mini-block count>, both varint-encoded; built once at package init.
var deltaEncodeHeaderBytes []byte

func init() {
	deltaEncodeHeaderBytes = varIntEncode(blockSize)
	deltaEncodeHeaderBytes = append(deltaEncodeHeaderBytes, varIntEncode(miniBlockCount)...)
}
|
||||
|
||||
// Supported Types: BOOLEAN, INT32, INT64
|
||||
func bitPackedEncode(values interface{}, bitWidth uint64, withHeader bool, parquetType parquet.Type) []byte {
|
||||
var i64s []int64
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs, ok := values.([]bool)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of bool"))
|
||||
}
|
||||
|
||||
i64s = make([]int64, len(bs))
|
||||
for i := range bs {
|
||||
if bs[i] {
|
||||
i64s[i] = 1
|
||||
}
|
||||
}
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
|
||||
for i := range i32s {
|
||||
i64s[i] = int64(i32s[i])
|
||||
}
|
||||
case parquet.Type_INT64:
|
||||
var ok bool
|
||||
i64s, ok = values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
if len(i64s) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var valueByte byte
|
||||
bitsSet := uint64(0)
|
||||
bitsNeeded := uint64(8)
|
||||
bitsToSet := bitWidth
|
||||
value := i64s[0]
|
||||
|
||||
valueBytes := []byte{}
|
||||
for i := 0; i < len(i64s); {
|
||||
if bitsToSet >= bitsNeeded {
|
||||
valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded))
|
||||
valueBytes = append(valueBytes, valueByte)
|
||||
bitsToSet -= bitsNeeded
|
||||
bitsSet += bitsNeeded
|
||||
|
||||
bitsNeeded = 8
|
||||
valueByte = 0
|
||||
|
||||
if bitsToSet <= 0 && (i+1) < len(i64s) {
|
||||
i++
|
||||
value = i64s[i]
|
||||
bitsToSet = bitWidth
|
||||
bitsSet = 0
|
||||
}
|
||||
} else {
|
||||
valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded))
|
||||
i++
|
||||
|
||||
if i < len(i64s) {
|
||||
value = i64s[i]
|
||||
}
|
||||
|
||||
bitsNeeded -= bitsToSet
|
||||
bitsToSet = bitWidth
|
||||
bitsSet = 0
|
||||
}
|
||||
}
|
||||
|
||||
if withHeader {
|
||||
header := uint64(((len(i64s) / 8) << 1) | 1)
|
||||
headerBytes := varIntEncode(header)
|
||||
return append(headerBytes, valueBytes...)
|
||||
}
|
||||
|
||||
return valueBytes
|
||||
}
|
||||
|
||||
// deltaEncodeInt32s emits the DELTA_BINARY_PACKED layout for int32 values:
// <block size> <mini-block count> <value count> <zig-zag first value>,
// then per block: <zig-zag min delta> <per-mini-block bit widths>
// <bit-packed (delta - minDelta) values>.
// NOTE(review): indexing i32s[0] means an empty slice panics — callers
// appear to guarantee at least one value; confirm before hardening.
func deltaEncodeInt32s(i32s []int32) (data []byte) {
	// Zig-zag encode so negative values map to small unsigned integers.
	getValue := func(i32 int32) uint64 {
		return uint64((i32 >> 31) ^ (i32 << 1))
	}

	data = append(data, deltaEncodeHeaderBytes...)
	data = append(data, varIntEncode(uint64(len(i32s)))...)
	data = append(data, varIntEncode(getValue(i32s[0]))...)

	for i := 1; i < len(i32s); {
		block := []int32{}
		// Start at MaxInt32 so the first delta always lowers it.
		minDelta := int32(0x7FFFFFFF)

		for ; i < len(i32s) && len(block) < blockSize; i++ {
			delta := i32s[i] - i32s[i-1]
			block = append(block, delta)
			if delta < minDelta {
				minDelta = delta
			}
		}

		// Pad a short final block with minDelta: after the minDelta
		// subtraction below the padding becomes zeros.
		for len(block) < blockSize {
			block = append(block, minDelta)
		}

		bitWidths := make([]byte, miniBlockCount)
		for j := 0; j < miniBlockCount; j++ {
			maxValue := int32(0)
			for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
				// Rebase deltas on minDelta so all values are non-negative.
				block[k] -= minDelta
				if block[k] > maxValue {
					maxValue = block[k]
				}
			}

			bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
		}

		minDeltaZigZag := getValue(minDelta)
		data = append(data, varIntEncode(minDeltaZigZag)...)
		data = append(data, bitWidths...)

		// Bit-pack each mini-block at its own width (no per-run header).
		for j := 0; j < miniBlockCount; j++ {
			bitPacked := bitPackedEncode(
				block[j*miniBlockSize:(j+1)*miniBlockSize],
				uint64(bitWidths[j]),
				false,
				parquet.Type_INT32,
			)
			data = append(data, bitPacked...)
		}
	}

	return data
}
|
||||
|
||||
// deltaEncodeInt64s emits the DELTA_BINARY_PACKED layout for int64 values;
// structurally identical to deltaEncodeInt32s but with 64-bit zig-zag.
// NOTE(review): indexing i64s[0] means an empty slice panics — callers
// appear to guarantee at least one value; confirm before hardening.
func deltaEncodeInt64s(i64s []int64) (data []byte) {
	// Zig-zag encode so negative values map to small unsigned integers.
	getValue := func(i64 int64) uint64 {
		return uint64((i64 >> 63) ^ (i64 << 1))
	}

	data = append(data, deltaEncodeHeaderBytes...)
	data = append(data, varIntEncode(uint64(len(i64s)))...)
	data = append(data, varIntEncode(getValue(i64s[0]))...)

	for i := 1; i < len(i64s); {
		block := []int64{}
		// Start at MaxInt64 so the first delta always lowers it.
		minDelta := int64(0x7FFFFFFFFFFFFFFF)

		for ; i < len(i64s) && len(block) < blockSize; i++ {
			delta := i64s[i] - i64s[i-1]
			block = append(block, delta)
			if delta < minDelta {
				minDelta = delta
			}
		}

		// Pad a short final block with minDelta (zeros after rebasing).
		for len(block) < blockSize {
			block = append(block, minDelta)
		}

		bitWidths := make([]byte, miniBlockCount)
		for j := 0; j < miniBlockCount; j++ {
			maxValue := int64(0)
			for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
				// Rebase deltas on minDelta so all values are non-negative.
				block[k] -= minDelta
				if block[k] > maxValue {
					maxValue = block[k]
				}
			}

			bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
		}

		minDeltaZigZag := getValue(minDelta)
		data = append(data, varIntEncode(minDeltaZigZag)...)
		data = append(data, bitWidths...)

		// Bit-pack each mini-block at its own width (no per-run header).
		for j := 0; j < miniBlockCount; j++ {
			bitPacked := bitPackedEncode(
				block[j*miniBlockSize:(j+1)*miniBlockSize],
				uint64(bitWidths[j]),
				false,
				parquet.Type_INT64,
			)
			data = append(data, bitPacked...)
		}
	}

	return data
}
|
||||
|
||||
// DeltaEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5
|
||||
//
|
||||
// Supported Types: INT32, INT64.
|
||||
func DeltaEncode(values interface{}, parquetType parquet.Type) []byte {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
return deltaEncodeInt32s(i32s)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
return deltaEncodeInt64s(i64s)
|
||||
}
|
||||
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
// DeltaLengthByteArrayEncode encodes bytes slices specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
|
||||
//
|
||||
// Supported Types: BYTE_ARRAY
|
||||
func DeltaLengthByteArrayEncode(bytesSlices [][]byte) (data []byte) {
|
||||
lengths := make([]int32, len(bytesSlices))
|
||||
for i, bytes := range bytesSlices {
|
||||
lengths[i] = int32(len(bytes))
|
||||
}
|
||||
|
||||
data = deltaEncodeInt32s(lengths)
|
||||
for _, bytes := range bytesSlices {
|
||||
data = append(data, []byte(bytes)...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// DeltaByteArrayEncode encodes sequence of strings values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7
|
||||
//
|
||||
// Supported Types: BYTE_ARRAY
|
||||
func DeltaByteArrayEncode(bytesSlices [][]byte) (data []byte) {
|
||||
prefixLengths := make([]int32, len(bytesSlices))
|
||||
suffixes := make([][]byte, len(bytesSlices))
|
||||
|
||||
var i, j int
|
||||
for i = 1; i < len(bytesSlices); i++ {
|
||||
for j = 0; j < len(bytesSlices[i-1]) && j < len(bytesSlices[i]); j++ {
|
||||
if bytesSlices[i-1][j] != bytesSlices[i][j] {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
prefixLengths[i] = int32(j)
|
||||
suffixes[i] = bytesSlices[i][j:]
|
||||
}
|
||||
|
||||
data = deltaEncodeInt32s(prefixLengths)
|
||||
return append(data, DeltaLengthByteArrayEncode(suffixes)...)
|
||||
}
|
140
pkg/s3select/internal/parquet-go/encoding/plain-encode.go
Normal file
140
pkg/s3select/internal/parquet-go/encoding/plain-encode.go
Normal file
@ -0,0 +1,140 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// plainEncodeBools bit-packs booleans one bit per value, least-significant
// bit first, padding the final byte with zeros.
func plainEncodeBools(bs []bool) []byte {
	data := make([]byte, (len(bs)+7)/8)

	for i, b := range bs {
		if b {
			data[i/8] |= 1 << uint(i%8)
		}
	}

	return data
}
|
||||
|
||||
// plainEncodeInt32s writes each int32 as 4 little-endian bytes.
func plainEncodeInt32s(i32s []int32) []byte {
	data := make([]byte, 4*len(i32s))

	for i := range i32s {
		binary.LittleEndian.PutUint32(data[4*i:], uint32(i32s[i]))
	}

	return data
}
|
||||
|
||||
// plainEncodeInt64s writes each int64 as 8 little-endian bytes.
func plainEncodeInt64s(i64s []int64) []byte {
	data := make([]byte, 8*len(i64s))

	for i := range i64s {
		binary.LittleEndian.PutUint64(data[8*i:], uint64(i64s[i]))
	}

	return data
}
|
||||
|
||||
// plainEncodeFloat32s writes each float32 as its 4-byte IEEE-754
// little-endian representation.
func plainEncodeFloat32s(f32s []float32) []byte {
	data := make([]byte, 4*len(f32s))

	for i := range f32s {
		binary.LittleEndian.PutUint32(data[4*i:], math.Float32bits(f32s[i]))
	}

	return data
}
|
||||
|
||||
// plainEncodeFloat64s writes each float64 as its 8-byte IEEE-754
// little-endian representation.
func plainEncodeFloat64s(f64s []float64) []byte {
	data := make([]byte, 8*len(f64s))

	for i := range f64s {
		binary.LittleEndian.PutUint64(data[8*i:], math.Float64bits(f64s[i]))
	}

	return data
}
|
||||
|
||||
// plainEncodeBytesSlices writes each slice as a 4-byte little-endian length
// prefix followed by the slice contents.
func plainEncodeBytesSlices(bytesSlices [][]byte) []byte {
	var buf bytes.Buffer
	var lenBytes [4]byte

	for _, s := range bytesSlices {
		binary.LittleEndian.PutUint32(lenBytes[:], uint32(len(s)))
		buf.Write(lenBytes[:])
		buf.Write(s)
	}

	return buf.Bytes()
}
|
||||
|
||||
// PlainEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0
|
||||
//
|
||||
// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
|
||||
func PlainEncode(values interface{}, parquetType parquet.Type) []byte {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs, ok := values.([]bool)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of bool"))
|
||||
}
|
||||
return plainEncodeBools(bs)
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
return plainEncodeInt32s(i32s)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
return plainEncodeInt64s(i64s)
|
||||
case parquet.Type_FLOAT:
|
||||
f32s, ok := values.([]float32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of float32"))
|
||||
}
|
||||
return plainEncodeFloat32s(f32s)
|
||||
case parquet.Type_DOUBLE:
|
||||
f64s, ok := values.([]float64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of float64"))
|
||||
}
|
||||
return plainEncodeFloat64s(f64s)
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
bytesSlices, ok := values.([][]byte)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of byte array"))
|
||||
}
|
||||
return plainEncodeBytesSlices(bytesSlices)
|
||||
}
|
||||
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
147
pkg/s3select/internal/parquet-go/encoding/plain-encode_test.go
Normal file
147
pkg/s3select/internal/parquet-go/encoding/plain-encode_test.go
Normal file
@ -0,0 +1,147 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestPlainEncodeBools verifies LSB-first bit packing of booleans,
// including nil/empty inputs and a case spanning a byte boundary.
func TestPlainEncodeBools(t *testing.T) {
	testCases := []struct {
		bs             []bool
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]bool{}, []byte{}},
		{[]bool{true}, []byte{1}},
		{[]bool{false}, []byte{0}},
		{[]bool{true, true}, []byte{3}},
		{[]bool{false, false}, []byte{0}},
		{[]bool{false, true}, []byte{2}},
		{[]bool{true, false}, []byte{1}},
		// bits 7 and 8 set -> 0x80 in byte 0, 0x01 in byte 1.
		{[]bool{false, false, false, false, false, false, false, true, true}, []byte{128, 1}},
	}

	for i, testCase := range testCases {
		result := plainEncodeBools(testCase.bs)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestPlainEncodeInt32s verifies 4-byte little-endian PLAIN encoding of
// int32 values, including negative and MinInt32/MaxInt32 boundaries.
func TestPlainEncodeInt32s(t *testing.T) {
	testCases := []struct {
		i32s           []int32
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]int32{}, []byte{}},
		{[]int32{1}, []byte{1, 0, 0, 0}},
		{[]int32{-1}, []byte{255, 255, 255, 255}},
		{[]int32{256}, []byte{0, 1, 0, 0}},
		{[]int32{math.MinInt32}, []byte{0, 0, 0, 128}},
		{[]int32{math.MaxInt32}, []byte{255, 255, 255, 127}},
		{[]int32{257, -2}, []byte{1, 1, 0, 0, 254, 255, 255, 255}},
	}

	for i, testCase := range testCases {
		result := plainEncodeInt32s(testCase.i32s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestPlainEncodeInt64s verifies 8-byte little-endian PLAIN encoding of
// int64 values, including negative and MinInt64/MaxInt64 boundaries.
func TestPlainEncodeInt64s(t *testing.T) {
	testCases := []struct {
		i64s           []int64
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]int64{}, []byte{}},
		{[]int64{1}, []byte{1, 0, 0, 0, 0, 0, 0, 0}},
		{[]int64{-1}, []byte{255, 255, 255, 255, 255, 255, 255, 255}},
		{[]int64{256}, []byte{0, 1, 0, 0, 0, 0, 0, 0}},
		{[]int64{math.MinInt64}, []byte{0, 0, 0, 0, 0, 0, 0, 128}},
		{[]int64{math.MaxInt64}, []byte{255, 255, 255, 255, 255, 255, 255, 127}},
		{[]int64{257, -2}, []byte{1, 1, 0, 0, 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255}},
	}

	for i, testCase := range testCases {
		result := plainEncodeInt64s(testCase.i64s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestPlainEncodeFloat32s verifies IEEE-754 single-precision little-endian
// PLAIN encoding of float32 values (bytes match math.Float32bits).
func TestPlainEncodeFloat32s(t *testing.T) {
	testCases := []struct {
		f32s           []float32
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]float32{}, []byte{}},
		{[]float32{1}, []byte{0, 0, 128, 63}},
		{[]float32{1.0}, []byte{0, 0, 128, 63}},
		{[]float32{-1}, []byte{0, 0, 128, 191}},
		{[]float32{-1.0}, []byte{0, 0, 128, 191}},
		{[]float32{256}, []byte{0, 0, 128, 67}},
		{[]float32{1.1}, []byte{205, 204, 140, 63}},
		{[]float32{-1.1}, []byte{205, 204, 140, 191}},
		{[]float32{math.Pi}, []byte{219, 15, 73, 64}},
		{[]float32{257, -2}, []byte{0, 128, 128, 67, 0, 0, 0, 192}},
		{[]float32{257.1, -2.1}, []byte{205, 140, 128, 67, 102, 102, 6, 192}},
	}

	for i, testCase := range testCases {
		result := plainEncodeFloat32s(testCase.f32s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
||||
|
||||
// TestPlainEncodeFloat64s verifies IEEE-754 double-precision little-endian
// PLAIN encoding of float64 values (bytes match math.Float64bits).
func TestPlainEncodeFloat64s(t *testing.T) {
	testCases := []struct {
		f64s           []float64
		expectedResult []byte
	}{
		{nil, []byte{}},
		{[]float64{}, []byte{}},
		{[]float64{1}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
		{[]float64{1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
		{[]float64{-1}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
		{[]float64{-1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
		{[]float64{256}, []byte{0, 0, 0, 0, 0, 0, 112, 64}},
		{[]float64{1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 63}},
		{[]float64{-1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 191}},
		{[]float64{math.Pi}, []byte{24, 45, 68, 84, 251, 33, 9, 64}},
		{[]float64{257, -2}, []byte{0, 0, 0, 0, 0, 16, 112, 64, 0, 0, 0, 0, 0, 0, 0, 192}},
		{[]float64{257.1, -2.1}, []byte{154, 153, 153, 153, 153, 17, 112, 64, 205, 204, 204, 204, 204, 204, 0, 192}},
	}

	for i, testCase := range testCases {
		result := plainEncodeFloat64s(testCase.f64s)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
84
pkg/s3select/internal/parquet-go/encoding/rle-encode.go
Normal file
84
pkg/s3select/internal/parquet-go/encoding/rle-encode.go
Normal file
@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func rleEncodeInt32s(i32s []int32, bitWidth int32) (data []byte) {
|
||||
j := 0
|
||||
for i := 0; i < len(i32s); i = j {
|
||||
for j = i + 1; j < len(i32s) && i32s[i] == i32s[j]; j++ {
|
||||
}
|
||||
|
||||
headerBytes := varIntEncode(uint64((j - i) << 1))
|
||||
data = append(data, headerBytes...)
|
||||
|
||||
valBytes := plainEncodeInt32s([]int32{i32s[i]})
|
||||
byteCount := (bitWidth + 7) / 8
|
||||
data = append(data, valBytes[:byteCount]...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
func rleEncodeInt64s(i64s []int64, bitWidth int32) (data []byte) {
|
||||
j := 0
|
||||
for i := 0; i < len(i64s); i = j {
|
||||
for j = i + 1; j < len(i64s) && i64s[i] == i64s[j]; j++ {
|
||||
}
|
||||
|
||||
headerBytes := varIntEncode(uint64((j - i) << 1))
|
||||
data = append(data, headerBytes...)
|
||||
|
||||
valBytes := plainEncodeInt64s([]int64{i64s[i]})
|
||||
byteCount := (bitWidth + 7) / 8
|
||||
data = append(data, valBytes[:byteCount]...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// RLEBitPackedHybridEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3
|
||||
//
|
||||
// Supported Types: INT32, INT64
|
||||
func RLEBitPackedHybridEncode(values interface{}, bitWidth int32, parquetType parquet.Type) []byte {
|
||||
var rleBytes []byte
|
||||
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
rleBytes = rleEncodeInt32s(i32s, bitWidth)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
rleBytes = rleEncodeInt64s(i64s, bitWidth)
|
||||
default:
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
lenBytes := plainEncodeInt32s([]int32{int32(len(rleBytes))})
|
||||
return append(lenBytes, rleBytes...)
|
||||
}
|
44
pkg/s3select/internal/parquet-go/encoding/rle-encode_test.go
Normal file
44
pkg/s3select/internal/parquet-go/encoding/rle-encode_test.go
Normal file
@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// TestRLEEncodeInt32s verifies RLE run encoding of int32 slices at bit
// width 1: each run is emitted as an unsigned-varint header (runLength<<1)
// followed by the run's value bytes.
func TestRLEEncodeInt32s(t *testing.T) {
	testCases := []struct {
		values   []int32
		bitWidth int32
		// NOTE(review): dataType is never read by this test; removing it
		// would also orphan the parquet import — confirm before cleanup.
		dataType       parquet.Type
		expectedResult []byte
	}{
		// Three distinct values: three runs of length 1 (header 1<<1 = 2).
		{[]int32{3, 5, 7}, 1, parquet.Type_INT32, []byte{2, 3, 2, 5, 2, 7}},
		// One run of length 3 (header 3<<1 = 6).
		{[]int32{3, 3, 3}, 1, parquet.Type_INT32, []byte{6, 3}},
		// Two runs: length 2 (header 4) then length 3 (header 6).
		{[]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32, []byte{4, 2, 6, 3}},
	}

	for i, testCase := range testCases {
		result := rleEncodeInt32s(testCase.values, testCase.bitWidth)
		if !reflect.DeepEqual(result, testCase.expectedResult) {
			t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
		}
	}
}
|
60
pkg/s3select/internal/parquet-go/encoding/rledict-encode.go
Normal file
60
pkg/s3select/internal/parquet-go/encoding/rledict-encode.go
Normal file
@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// RLEDictEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 and returns dictionary page data and data page data.
|
||||
//
|
||||
// Dictionary page data contains PLAIN encodeed slice of uniquely fully defined non-nil values.
|
||||
// Data page data contains RLE/Bit-Packed Hybrid encoded indices of fully defined non-nil values.
|
||||
//
|
||||
// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
|
||||
func RLEDictEncode(values []interface{}, parquetType parquet.Type, bitWidth int32) (dictPageData, dataPageData []byte, dictValueCount int32, indexBitWidth uint8) {
|
||||
var definedValues []interface{}
|
||||
var indices []int32
|
||||
|
||||
valueIndexMap := make(map[interface{}]int32)
|
||||
j := 0
|
||||
for i := 0; i < len(values); i = j {
|
||||
for j = i; j < len(values); j++ {
|
||||
value := values[j]
|
||||
if value == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
index, found := valueIndexMap[value]
|
||||
if !found {
|
||||
index = int32(len(definedValues))
|
||||
definedValues = append(definedValues, value)
|
||||
valueIndexMap[value] = index
|
||||
}
|
||||
|
||||
indices = append(indices, index)
|
||||
}
|
||||
}
|
||||
|
||||
indexBitWidth = uint8(common.BitWidth(uint64(indices[len(indices)-1])))
|
||||
|
||||
dictPageData = PlainEncode(common.ToSliceValue(definedValues, parquetType), parquetType)
|
||||
dataPageData = RLEBitPackedHybridEncode(indices, int32(indexBitWidth), parquet.Type_INT32)
|
||||
|
||||
return dictPageData, dataPageData, int32(len(definedValues)), indexBitWidth
|
||||
}
|
35
pkg/s3select/internal/parquet-go/endian.go
Normal file
35
pkg/s3select/internal/parquet-go/endian.go
Normal file
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
)
|
||||
|
||||
// uint32ToBytes returns the 4-byte little-endian encoding of v.
func uint32ToBytes(v uint32) []byte {
	b := []byte{0, 0, 0, 0}
	binary.LittleEndian.PutUint32(b, v)
	return b
}
|
||||
|
||||
// bytesToUint32 decodes the first four bytes of buf as a little-endian
// uint32. Panics if buf is shorter than four bytes.
func bytesToUint32(buf []byte) uint32 {
	result := binary.LittleEndian.Uint32(buf)
	return result
}
|
||||
|
||||
// bytesToUint64 decodes the first eight bytes of buf as a little-endian
// uint64. Panics if buf is shorter than eight bytes.
func bytesToUint64(buf []byte) uint64 {
	result := binary.LittleEndian.Uint64(buf)
	return result
}
|
BIN
pkg/s3select/internal/parquet-go/example.parquet
Normal file
BIN
pkg/s3select/internal/parquet-go/example.parquet
Normal file
Binary file not shown.
@ -0,0 +1,6 @@
|
||||
// Autogenerated by Thrift Compiler (0.10.0)
|
||||
// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
||||
|
||||
package parquet
|
||||
|
||||
var GoUnusedProtection__ int
|
@ -0,0 +1,18 @@
|
||||
// Autogenerated by Thrift Compiler (0.10.0)
|
||||
// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
)
|
||||
|
||||
// (needed to ensure safety because of naive import list construction.)
|
||||
var _ = thrift.ZERO
|
||||
var _ = fmt.Printf
|
||||
var _ = bytes.Equal
|
||||
|
||||
// init is an empty placeholder emitted by the Thrift compiler; it exists so
// the generated file is a valid Go compilation unit.
func init() {
}
|
8191
pkg/s3select/internal/parquet-go/gen-go/parquet/parquet.go
Normal file
8191
pkg/s3select/internal/parquet-go/gen-go/parquet/parquet.go
Normal file
File diff suppressed because it is too large
Load Diff
23
pkg/s3select/internal/parquet-go/gen-parquet-format-pkg.sh
Normal file
23
pkg/s3select/internal/parquet-go/gen-parquet-format-pkg.sh
Normal file
@ -0,0 +1,23 @@
|
||||
#!/bin/bash
#
# Minio Cloud Storage, (C) 2018 Minio, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Regenerates the gen-go/parquet package from the parquet-format Thrift IDL.
# Requires: wget, the Apache Thrift compiler ("thrift"), and gofmt on PATH.
set -e

# Fetch the IDL pinned to a specific parquet-format commit so regeneration
# is reproducible.
rm -f parquet.thrift
wget -q https://github.com/apache/parquet-format/raw/df6132b94f273521a418a74442085fdd5a0aa009/src/main/thrift/parquet.thrift
thrift --gen go parquet.thrift
gofmt -w -s gen-go/parquet
765
pkg/s3select/internal/parquet-go/page.go
Normal file
765
pkg/s3select/internal/parquet-go/page.go
Normal file
@ -0,0 +1,765 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// getBitWidth - returns the number of bits required to represent num, i.e.
// the position of its highest set bit (0 for num == 0). Examples:
//
//	num | width
//	-----|-------
//	  0  |  0
//	  1  |  1
//	  2  |  2
//	  3  |  2
//	  4  |  3
//	  5  |  3
//
func getBitWidth(num uint64) (width uint64) {
	for num != 0 {
		width++
		num >>= 1
	}

	return width
}
|
||||
|
||||
// getMaxDefLevel - get maximum definition level.
|
||||
func getMaxDefLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
|
||||
for i := 1; i <= len(path); i++ {
|
||||
name := strings.Join(path[:i], ".")
|
||||
if index, ok := nameIndexMap[name]; ok {
|
||||
if schemaElements[index].GetRepetitionType() != parquet.FieldRepetitionType_REQUIRED {
|
||||
v++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return v
|
||||
}
|
||||
|
||||
// getMaxRepLevel - get maximum repetition level.
|
||||
func getMaxRepLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
|
||||
for i := 1; i <= len(path); i++ {
|
||||
name := strings.Join(path[:i], ".")
|
||||
if index, ok := nameIndexMap[name]; ok {
|
||||
if schemaElements[index].GetRepetitionType() == parquet.FieldRepetitionType_REPEATED {
|
||||
v++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return v
|
||||
}
|
||||
|
||||
func readPageHeader(reader *thrift.TBufferedTransport) (*parquet.PageHeader, error) {
|
||||
pageHeader := parquet.NewPageHeader()
|
||||
if err := pageHeader.Read(thrift.NewTCompactProtocol(reader)); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return pageHeader, nil
|
||||
}
|
||||
|
||||
// readPage reads the next page (header + body) from thriftReader for the
// column described by metadata, decompresses it, and decodes it into a
// *page. It returns the decoded page, the number of definition-level slots
// consumed, and the number of top-level rows represented.
//
// Only DICTIONARY, DATA_PAGE and DATA_PAGE_V2 pages are supported;
// INDEX_PAGE and unknown types return an error.
func readPage(
	thriftReader *thrift.TBufferedTransport,
	metadata *parquet.ColumnMetaData,
	columnNameIndexMap map[string]int,
	schemaElements []*parquet.SchemaElement,
) (page *page, definitionLevels, numRows int64, err error) {

	pageHeader, err := readPageHeader(thriftReader)
	if err != nil {
		return nil, 0, 0, err
	}

	// read consumes the page body. For DATA_PAGE_V2 the repetition/definition
	// level sections precede the (separately compressed) data section; they
	// are re-prefixed with their lengths so downstream decoding can find them.
	read := func() (data []byte, err error) {
		var repLevelsLen, defLevelsLen int32
		var repLevelsBuf, defLevelsBuf []byte

		if pageHeader.GetType() == parquet.PageType_DATA_PAGE_V2 {
			repLevelsLen = pageHeader.DataPageHeaderV2.GetRepetitionLevelsByteLength()
			repLevelsBuf = make([]byte, repLevelsLen)
			if _, err = thriftReader.Read(repLevelsBuf); err != nil {
				return nil, err
			}

			defLevelsLen = pageHeader.DataPageHeaderV2.GetDefinitionLevelsByteLength()
			defLevelsBuf = make([]byte, defLevelsLen)
			if _, err = thriftReader.Read(defLevelsBuf); err != nil {
				return nil, err
			}
		}

		// Remaining bytes of the page are the (compressed) values section.
		dataBuf := make([]byte, pageHeader.GetCompressedPageSize()-repLevelsLen-defLevelsLen)
		if _, err = thriftReader.Read(dataBuf); err != nil {
			return nil, err
		}

		if dataBuf, err = compressionCodec(metadata.GetCodec()).uncompress(dataBuf); err != nil {
			return nil, err
		}

		if repLevelsLen == 0 && defLevelsLen == 0 {
			return dataBuf, nil
		}

		// Re-attach level sections with 4-byte length prefixes (the layout
		// the RLE decoder expects).
		if repLevelsLen > 0 {
			data = append(data, uint32ToBytes(uint32(repLevelsLen))...)
			data = append(data, repLevelsBuf...)
		}

		if defLevelsLen > 0 {
			data = append(data, uint32ToBytes(uint32(defLevelsLen))...)
			data = append(data, defLevelsBuf...)
		}

		data = append(data, dataBuf...)

		return data, nil
	}

	buf, err := read()
	if err != nil {
		return nil, 0, 0, err
	}

	// Copy the schema path so later mutations cannot alias metadata.
	path := append([]string{}, metadata.GetPathInSchema()...)

	bytesReader := bytes.NewReader(buf)
	pageType := pageHeader.GetType()
	switch pageType {
	case parquet.PageType_INDEX_PAGE:
		return nil, 0, 0, fmt.Errorf("page type %v is not supported", parquet.PageType_INDEX_PAGE)

	case parquet.PageType_DICTIONARY_PAGE:
		// Dictionary pages are PLAIN-encoded value lists; they carry no
		// levels and contribute no rows.
		page = newDictPage()
		page.Header = pageHeader
		table := new(table)
		table.Path = path
		values, err := readValues(bytesReader, metadata.GetType(),
			uint64(pageHeader.DictionaryPageHeader.GetNumValues()), 0)
		if err != nil {
			return nil, 0, 0, err
		}
		table.Values = getTableValues(values, metadata.GetType())
		page.DataTable = table

		return page, 0, 0, nil

	case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
		name := strings.Join(path, ".")

		page = newDataPage()
		page.Header = pageHeader

		maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, path)
		maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, path)

		var numValues uint64
		var encodingType parquet.Encoding

		// V1 and V2 headers store the count/encoding in different structs.
		if pageHeader.GetType() == parquet.PageType_DATA_PAGE {
			numValues = uint64(pageHeader.DataPageHeader.GetNumValues())
			encodingType = pageHeader.DataPageHeader.GetEncoding()
		} else {
			numValues = uint64(pageHeader.DataPageHeaderV2.GetNumValues())
			encodingType = pageHeader.DataPageHeaderV2.GetEncoding()
		}

		// Repetition levels: RLE-decoded when the column can repeat,
		// otherwise all zero.
		var repetitionLevels []int64
		if maxRepetitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
			if err != nil {
				return nil, 0, 0, err
			}

			// The decoder may overshoot; clamp to the declared count.
			if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues {
				repetitionLevels = repetitionLevels[:numValues]
			}
		} else {
			repetitionLevels = make([]int64, numValues)
		}

		// Definition levels: RLE-decoded when the column can be undefined,
		// otherwise all zero.
		var definitionLevels []int64
		if maxDefinitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
			if err != nil {
				return nil, 0, 0, err
			}
			if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues {
				definitionLevels = definitionLevels[:numValues]
			}
		} else {
			definitionLevels = make([]int64, numValues)
		}

		// Slots below the max definition level hold no stored value (null).
		var numNulls uint64
		for i := 0; i < len(definitionLevels); i++ {
			if definitionLevels[i] != int64(maxDefinitionLevel) {
				numNulls++
			}
		}

		var convertedType parquet.ConvertedType = -1
		if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
			convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
		}
		// Decode only the non-null values that are physically present.
		values, valueType, err := readDataPageValues(bytesReader, encodingType, metadata.GetType(),
			convertedType, uint64(len(definitionLevels))-numNulls,
			uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
		if err != nil {
			return nil, 0, 0, err
		}
		tableValues := getTableValues(values, valueType)

		table := new(table)
		table.Path = path
		table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
		table.MaxRepetitionLevel = int32(maxRepetitionLevel)
		table.MaxDefinitionLevel = int32(maxDefinitionLevel)
		table.Values = make([]interface{}, len(definitionLevels))
		table.RepetitionLevels = make([]int32, len(definitionLevels))
		table.DefinitionLevels = make([]int32, len(definitionLevels))

		// Scatter decoded values into their level slots; nulls stay nil.
		// A repetition level of 0 starts a new top-level row.
		j := 0
		numRows := int64(0)
		for i := 0; i < len(definitionLevels); i++ {
			table.RepetitionLevels[i] = int32(repetitionLevels[i])
			table.DefinitionLevels[i] = int32(definitionLevels[i])
			if int(table.DefinitionLevels[i]) == maxDefinitionLevel {
				table.Values[i] = tableValues[j]
				j++
			}
			if table.RepetitionLevels[i] == 0 {
				numRows++
			}
		}
		page.DataTable = table

		return page, int64(len(definitionLevels)), numRows, nil
	}

	return nil, 0, 0, fmt.Errorf("unknown page type %v", pageType)
}
|
||||
|
||||
// page represents a single in-memory parquet page — either a dictionary
// page or a (V1/V2) data page — together with the metadata needed to read
// it from, or write it to, a parquet file.
type page struct {
	Header       *parquet.PageHeader      // Header of a page
	DataTable    *table                   // Table to store values
	RawData      []byte                   // Compressed data of the page, which is written in parquet file
	CompressType parquet.CompressionCodec // Compress type: gzip/snappy/none
	DataType     parquet.Type             // Parquet type of the values in the page
	Path         []string                 // Path in schema(include the root)
	MaxVal       interface{}              // Maximum of the values
	MinVal       interface{}              // Minimum of the values
	PageSize     int32                    // Page size; initialized to defaultPageSize by newPage
}
|
||||
|
||||
func newPage() *page {
|
||||
return &page{
|
||||
Header: parquet.NewPageHeader(),
|
||||
PageSize: defaultPageSize,
|
||||
}
|
||||
}
|
||||
|
||||
func newDictPage() *page {
|
||||
page := newPage()
|
||||
page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
|
||||
return page
|
||||
}
|
||||
|
||||
func newDataPage() *page {
|
||||
page := newPage()
|
||||
page.Header.DataPageHeader = parquet.NewDataPageHeader()
|
||||
return page
|
||||
}
|
||||
|
||||
func (page *page) decode(dictPage *page) {
|
||||
if dictPage == nil || page == nil || page.Header.DataPageHeader == nil ||
|
||||
(page.Header.DataPageHeader.Encoding != parquet.Encoding_RLE_DICTIONARY &&
|
||||
page.Header.DataPageHeader.Encoding != parquet.Encoding_PLAIN_DICTIONARY) {
|
||||
return
|
||||
}
|
||||
|
||||
for i := 0; i < len(page.DataTable.Values); i++ {
|
||||
if page.DataTable.Values[i] != nil {
|
||||
index := page.DataTable.Values[i].(int64)
|
||||
page.DataTable.Values[i] = dictPage.DataTable.Values[index]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getRLDLFromRawData decodes the repetition and definition levels from
// page.RawData into page.DataTable, leaving the (still encoded) values
// section behind in page.RawData for getValueFromRawData to consume later.
// It returns the number of value slots and the number of top-level rows.
func (page *page) getRLDLFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (numValues int64, numRows int64, err error) {
	bytesReader := bytes.NewReader(page.RawData)

	pageType := page.Header.GetType()

	// Normalize the page body into buf: for V2 pages the level sections are
	// stored uncompressed ahead of the data and get re-prefixed with their
	// lengths; for other pages the whole body is decompressed.
	var buf []byte
	if pageType == parquet.PageType_DATA_PAGE_V2 {
		var repLevelsLen, defLevelsLen int32
		var repLevelsBuf, defLevelsBuf []byte

		repLevelsLen = page.Header.DataPageHeaderV2.GetRepetitionLevelsByteLength()
		repLevelsBuf = make([]byte, repLevelsLen)
		if _, err = bytesReader.Read(repLevelsBuf); err != nil {
			return 0, 0, err
		}

		defLevelsLen = page.Header.DataPageHeaderV2.GetDefinitionLevelsByteLength()
		defLevelsBuf = make([]byte, defLevelsLen)
		if _, err = bytesReader.Read(defLevelsBuf); err != nil {
			return 0, 0, err
		}

		dataBuf := make([]byte, len(page.RawData)-int(repLevelsLen)-int(defLevelsLen))
		if _, err = bytesReader.Read(dataBuf); err != nil {
			return 0, 0, err
		}

		if repLevelsLen == 0 && defLevelsLen == 0 {
			buf = dataBuf
		} else {
			// 4-byte length prefixes make the level sections self-describing
			// for the RLE decoder.
			if repLevelsLen > 0 {
				buf = append(buf, uint32ToBytes(uint32(repLevelsLen))...)
				buf = append(buf, repLevelsBuf...)
			}

			if defLevelsLen > 0 {
				buf = append(buf, uint32ToBytes(uint32(defLevelsLen))...)
				buf = append(buf, defLevelsBuf...)
			}

			buf = append(buf, dataBuf...)
		}
	} else {
		if buf, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
			return 0, 0, err
		}
	}

	bytesReader = bytes.NewReader(buf)

	switch pageType {
	case parquet.PageType_DICTIONARY_PAGE:
		// Dictionary pages carry no levels; just attach an empty table.
		table := new(table)
		table.Path = page.Path
		page.DataTable = table
		return 0, 0, nil

	case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
		var numValues uint64
		if pageType == parquet.PageType_DATA_PAGE {
			numValues = uint64(page.Header.DataPageHeader.GetNumValues())
		} else {
			numValues = uint64(page.Header.DataPageHeaderV2.GetNumValues())
		}

		maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, page.Path)
		maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, page.Path)

		// Repetition levels: RLE-decoded when the column can repeat,
		// otherwise all zero.
		var repetitionLevels []int64
		if maxRepetitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
			if err != nil {
				return 0, 0, err
			}

			// The decoder may overshoot; clamp to the declared count.
			if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues {
				repetitionLevels = repetitionLevels[:numValues]
			}
		} else {
			repetitionLevels = make([]int64, numValues)
		}

		// Definition levels: RLE-decoded when the column can be undefined,
		// otherwise all zero.
		var definitionLevels []int64
		if maxDefinitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
			if err != nil {
				return 0, 0, err
			}
			if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues {
				definitionLevels = definitionLevels[:numValues]
			}
		} else {
			definitionLevels = make([]int64, numValues)
		}

		table := new(table)
		table.Path = page.Path
		name := strings.Join(page.Path, ".")
		table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
		table.MaxRepetitionLevel = int32(maxRepetitionLevel)
		table.MaxDefinitionLevel = int32(maxDefinitionLevel)
		table.Values = make([]interface{}, len(definitionLevels))
		table.RepetitionLevels = make([]int32, len(definitionLevels))
		table.DefinitionLevels = make([]int32, len(definitionLevels))

		// A repetition level of 0 starts a new top-level row.
		numRows := int64(0)
		for i := 0; i < len(definitionLevels); i++ {
			table.RepetitionLevels[i] = int32(repetitionLevels[i])
			table.DefinitionLevels[i] = int32(definitionLevels[i])
			if table.RepetitionLevels[i] == 0 {
				numRows++
			}
		}
		page.DataTable = table
		// Keep only the unread tail (the encoded values section) for later.
		page.RawData = buf[len(buf)-bytesReader.Len():]

		return int64(numValues), numRows, nil
	}

	return 0, 0, fmt.Errorf("Unsupported page type %v", pageType)
}
|
||||
|
||||
func (page *page) getValueFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (err error) {
|
||||
pageType := page.Header.GetType()
|
||||
switch pageType {
|
||||
case parquet.PageType_DICTIONARY_PAGE:
|
||||
bytesReader := bytes.NewReader(page.RawData)
|
||||
var values interface{}
|
||||
values, err = readValues(bytesReader, page.DataType,
|
||||
uint64(page.Header.DictionaryPageHeader.GetNumValues()), 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
page.DataTable.Values = getTableValues(values, page.DataType)
|
||||
return nil
|
||||
|
||||
case parquet.PageType_DATA_PAGE_V2:
|
||||
if page.RawData, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
|
||||
return err
|
||||
}
|
||||
fallthrough
|
||||
case parquet.PageType_DATA_PAGE:
|
||||
encodingType := page.Header.DataPageHeader.GetEncoding()
|
||||
bytesReader := bytes.NewReader(page.RawData)
|
||||
|
||||
var numNulls uint64
|
||||
for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
|
||||
if page.DataTable.DefinitionLevels[i] != page.DataTable.MaxDefinitionLevel {
|
||||
numNulls++
|
||||
}
|
||||
}
|
||||
|
||||
name := strings.Join(page.DataTable.Path, ".")
|
||||
var convertedType parquet.ConvertedType = -1
|
||||
|
||||
if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
|
||||
convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
|
||||
}
|
||||
|
||||
values, _, err := readDataPageValues(bytesReader, encodingType, page.DataType,
|
||||
convertedType, uint64(len(page.DataTable.DefinitionLevels))-numNulls,
|
||||
uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tableValues := getTableValues(values, page.DataType)
|
||||
|
||||
j := 0
|
||||
for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
|
||||
if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
|
||||
page.DataTable.Values[i] = tableValues[j]
|
||||
j++
|
||||
}
|
||||
}
|
||||
|
||||
page.RawData = []byte{}
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("unsupported page type %v", pageType)
|
||||
}
|
||||
|
||||
// toDataPage serializes the page as a parquet V1 data page: repetition
// levels, definition levels and encoded values are concatenated, compressed
// together with compressType, and prefixed with a thrift compact-protocol
// PageHeader. The result is stored in page.RawData and returned.
// Serialization failures panic (writer-side invariant violation).
func (page *page) toDataPage(compressType parquet.CompressionCodec) []byte {
	// Collect only the non-null values (definition level == max).
	values := []interface{}{}
	for i := range page.DataTable.DefinitionLevels {
		if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
			values = append(values, page.DataTable.Values[i])
		}
	}
	valuesBytes := encodeValues(interfacesToValues(values, page.DataTable.Type), page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)

	// Definition levels, RLE/Bit-Packed Hybrid encoded (omitted when the
	// column is always defined).
	var defLevelBytes []byte
	if page.DataTable.MaxDefinitionLevel > 0 {
		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
		}
		defLevelBytes = valuesToRLEBitPackedHybridBytes(
			defLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
			parquet.Type_INT64,
		)
	}

	// Repetition levels, likewise (omitted for non-repeated columns).
	var repLevelBytes []byte
	if page.DataTable.MaxRepetitionLevel > 0 {
		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
		}
		repLevelBytes = valuesToRLEBitPackedHybridBytes(
			repLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
			parquet.Type_INT64,
		)
	}

	// V1 layout: rep levels, def levels and values are compressed as one unit.
	data := repLevelBytes
	data = append(data, defLevelBytes...)
	data = append(data, valuesBytes...)

	compressedData, err := compressionCodec(compressType).compress(data)
	if err != nil {
		panic(err)
	}

	page.Header = parquet.NewPageHeader()
	page.Header.Type = parquet.PageType_DATA_PAGE
	page.Header.CompressedPageSize = int32(len(compressedData))
	page.Header.UncompressedPageSize = int32(len(data))
	page.Header.DataPageHeader = parquet.NewDataPageHeader()
	page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
	page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
	page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
	page.Header.DataPageHeader.Encoding = page.DataTable.Encoding
	page.Header.DataPageHeader.Statistics = parquet.NewStatistics()
	if page.MaxVal != nil {
		tmpBuf := valueToBytes(page.MaxVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			// Strip the 4-byte length prefix for string/decimal statistics.
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeader.Statistics.Max = tmpBuf
	}
	if page.MinVal != nil {
		tmpBuf := valueToBytes(page.MinVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			// Strip the 4-byte length prefix for string/decimal statistics.
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeader.Statistics.Min = tmpBuf
	}

	// Serialize the page header with the thrift compact protocol.
	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
	if err != nil {
		panic(err)
	}

	page.RawData = append(pageHeaderBytes, compressedData...)
	return page.RawData
}
|
||||
|
||||
// toDataPageV2 serializes the page as a parquet V2 data page. Unlike V1,
// the repetition/definition level sections are written UNCOMPRESSED after
// the header; only the values section is compressed with compressType.
// The result is stored in page.RawData and returned. Serialization
// failures panic (writer-side invariant violation).
func (page *page) toDataPageV2(compressType parquet.CompressionCodec) []byte {
	// Collect only the non-null values (definition level == max).
	values := []interface{}{}
	for i := range page.DataTable.DefinitionLevels {
		if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
			values = append(values, page.DataTable.Values[i])
		}
	}
	// NOTE(review): unlike toDataPage, values are passed directly without
	// interfacesToValues — confirm the asymmetry is intentional.
	valuesBytes := encodeValues(values, page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)

	// Definition levels, RLE encoded (omitted when always defined).
	var defLevelBytes []byte
	if page.DataTable.MaxDefinitionLevel > 0 {
		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
		}
		defLevelBytes = valuesToRLEBytes(
			defLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
			parquet.Type_INT64,
		)
	}

	// Repetition levels; a level of 0 marks the start of a top-level row.
	var repLevelBytes []byte
	numRows := int32(0)
	if page.DataTable.MaxRepetitionLevel > 0 {
		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
			if page.DataTable.RepetitionLevels[i] == 0 {
				numRows++
			}
		}
		repLevelBytes = valuesToRLEBytes(
			repLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
			parquet.Type_INT64,
		)
	}

	// V2: only the values section is compressed.
	compressedData, err := compressionCodec(compressType).compress(valuesBytes)
	if err != nil {
		panic(err)
	}

	page.Header = parquet.NewPageHeader()
	page.Header.Type = parquet.PageType_DATA_PAGE_V2
	page.Header.CompressedPageSize = int32(len(compressedData) + len(defLevelBytes) + len(repLevelBytes))
	page.Header.UncompressedPageSize = int32(len(valuesBytes) + len(defLevelBytes) + len(repLevelBytes))
	page.Header.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
	page.Header.DataPageHeaderV2.NumValues = int32(len(page.DataTable.Values))
	page.Header.DataPageHeaderV2.NumNulls = page.Header.DataPageHeaderV2.NumValues - int32(len(values))
	page.Header.DataPageHeaderV2.NumRows = numRows
	page.Header.DataPageHeaderV2.Encoding = page.DataTable.Encoding
	page.Header.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(defLevelBytes))
	page.Header.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(repLevelBytes))
	page.Header.DataPageHeaderV2.IsCompressed = true

	page.Header.DataPageHeaderV2.Statistics = parquet.NewStatistics()
	if page.MaxVal != nil {
		tmpBuf := valueToBytes(page.MaxVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			// Strip the 4-byte length prefix for string/decimal statistics.
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeaderV2.Statistics.Max = tmpBuf
	}
	if page.MinVal != nil {
		tmpBuf := valueToBytes(page.MinVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			// Strip the 4-byte length prefix for string/decimal statistics.
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeaderV2.Statistics.Min = tmpBuf
	}

	// Serialize the page header with the thrift compact protocol.
	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
	if err != nil {
		panic(err)
	}

	// V2 layout: header, then uncompressed levels, then compressed values.
	page.RawData = append(pageHeaderBytes, repLevelBytes...)
	page.RawData = append(page.RawData, defLevelBytes...)
	page.RawData = append(page.RawData, compressedData...)

	return page.RawData
}
|
||||
|
||||
func (page *page) toDictPage(compressType parquet.CompressionCodec, dataType parquet.Type) []byte {
|
||||
valuesBytes := valuesToBytes(page.DataTable.Values, dataType)
|
||||
compressedData, err := compressionCodec(compressType).compress(valuesBytes)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.Header = parquet.NewPageHeader()
|
||||
page.Header.Type = parquet.PageType_DICTIONARY_PAGE
|
||||
page.Header.CompressedPageSize = int32(len(compressedData))
|
||||
page.Header.UncompressedPageSize = int32(len(valuesBytes))
|
||||
page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
|
||||
page.Header.DictionaryPageHeader.NumValues = int32(len(page.DataTable.Values))
|
||||
page.Header.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.RawData = append(pageHeaderBytes, compressedData...)
|
||||
return page.RawData
|
||||
}
|
||||
|
||||
func (page *page) toDictDataPage(compressType parquet.CompressionCodec, bitWidth int32) []byte {
|
||||
valuesBytes := append([]byte{byte(bitWidth)}, valuesToRLEBytes(page.DataTable.Values, bitWidth, parquet.Type_INT32)...)
|
||||
|
||||
var defLevelBytes []byte
|
||||
if page.DataTable.MaxDefinitionLevel > 0 {
|
||||
defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
|
||||
}
|
||||
defLevelBytes = valuesToRLEBitPackedHybridBytes(
|
||||
defLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
var repLevelBytes []byte
|
||||
if page.DataTable.MaxRepetitionLevel > 0 {
|
||||
repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
|
||||
}
|
||||
repLevelBytes = valuesToRLEBitPackedHybridBytes(
|
||||
repLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
data := append(repLevelBytes, defLevelBytes...)
|
||||
data = append(data, valuesBytes...)
|
||||
|
||||
compressedData, err := compressionCodec(compressType).compress(data)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.Header = parquet.NewPageHeader()
|
||||
page.Header.Type = parquet.PageType_DATA_PAGE
|
||||
page.Header.CompressedPageSize = int32(len(compressedData))
|
||||
page.Header.UncompressedPageSize = int32(len(data))
|
||||
page.Header.DataPageHeader = parquet.NewDataPageHeader()
|
||||
page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
|
||||
page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
|
||||
page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
|
||||
page.Header.DataPageHeader.Encoding = parquet.Encoding_PLAIN_DICTIONARY
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.RawData = append(pageHeaderBytes, compressedData...)
|
||||
return page.RawData
|
||||
}
|
881
pkg/s3select/internal/parquet-go/parquet.thrift
Normal file
881
pkg/s3select/internal/parquet-go/parquet.thrift
Normal file
@ -0,0 +1,881 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* File format description for the parquet file format
|
||||
*/
|
||||
namespace cpp parquet
|
||||
namespace java org.apache.parquet.format
|
||||
|
||||
/**
|
||||
* Types supported by Parquet. These types are intended to be used in combination
|
||||
* with the encodings to control the on disk storage format.
|
||||
* For example INT16 is not included as a type since a good encoding of INT32
|
||||
* would handle this.
|
||||
*/
|
||||
enum Type {
|
||||
BOOLEAN = 0;
|
||||
INT32 = 1;
|
||||
INT64 = 2;
|
||||
INT96 = 3; // deprecated, only used by legacy implementations.
|
||||
FLOAT = 4;
|
||||
DOUBLE = 5;
|
||||
BYTE_ARRAY = 6;
|
||||
FIXED_LEN_BYTE_ARRAY = 7;
|
||||
}
|
||||
|
||||
/**
|
||||
* Common types used by frameworks(e.g. hive, pig) using parquet. This helps map
|
||||
* between types in those frameworks to the base types in parquet. This is only
|
||||
* metadata and not needed to read or write the data.
|
||||
*/
|
||||
enum ConvertedType {
|
||||
/** a BYTE_ARRAY actually contains UTF8 encoded chars */
|
||||
UTF8 = 0;
|
||||
|
||||
/** a map is converted as an optional field containing a repeated key/value pair */
|
||||
MAP = 1;
|
||||
|
||||
/** a key/value pair is converted into a group of two fields */
|
||||
MAP_KEY_VALUE = 2;
|
||||
|
||||
/** a list is converted into an optional field containing a repeated field for its
|
||||
* values */
|
||||
LIST = 3;
|
||||
|
||||
/** an enum is converted into a binary field */
|
||||
ENUM = 4;
|
||||
|
||||
/**
|
||||
* A decimal value.
|
||||
*
|
||||
* This may be used to annotate binary or fixed primitive types. The
|
||||
* underlying byte array stores the unscaled value encoded as two's
|
||||
* complement using big-endian byte order (the most significant byte is the
|
||||
* zeroth element). The value of the decimal is the value * 10^{-scale}.
|
||||
*
|
||||
* This must be accompanied by a (maximum) precision and a scale in the
|
||||
* SchemaElement. The precision specifies the number of digits in the decimal
|
||||
* and the scale stores the location of the decimal point. For example 1.23
|
||||
* would have precision 3 (3 total digits) and scale 2 (the decimal point is
|
||||
* 2 digits over).
|
||||
*/
|
||||
DECIMAL = 5;
|
||||
|
||||
/**
|
||||
* A Date
|
||||
*
|
||||
* Stored as days since Unix epoch, encoded as the INT32 physical type.
|
||||
*
|
||||
*/
|
||||
DATE = 6;
|
||||
|
||||
/**
|
||||
* A time
|
||||
*
|
||||
* The total number of milliseconds since midnight. The value is stored
|
||||
* as an INT32 physical type.
|
||||
*/
|
||||
TIME_MILLIS = 7;
|
||||
|
||||
/**
|
||||
* A time.
|
||||
*
|
||||
* The total number of microseconds since midnight. The value is stored as
|
||||
* an INT64 physical type.
|
||||
*/
|
||||
TIME_MICROS = 8;
|
||||
|
||||
/**
|
||||
* A date/time combination
|
||||
*
|
||||
* Date and time recorded as milliseconds since the Unix epoch. Recorded as
|
||||
* a physical type of INT64.
|
||||
*/
|
||||
TIMESTAMP_MILLIS = 9;
|
||||
|
||||
/**
|
||||
* A date/time combination
|
||||
*
|
||||
* Date and time recorded as microseconds since the Unix epoch. The value is
|
||||
* stored as an INT64 physical type.
|
||||
*/
|
||||
TIMESTAMP_MICROS = 10;
|
||||
|
||||
|
||||
/**
|
||||
* An unsigned integer value.
|
||||
*
|
||||
 * The number describes the maximum number of meaningful data bits in
|
||||
* the stored value. 8, 16 and 32 bit values are stored using the
|
||||
* INT32 physical type. 64 bit values are stored using the INT64
|
||||
* physical type.
|
||||
*
|
||||
*/
|
||||
UINT_8 = 11;
|
||||
UINT_16 = 12;
|
||||
UINT_32 = 13;
|
||||
UINT_64 = 14;
|
||||
|
||||
/**
|
||||
* A signed integer value.
|
||||
*
|
||||
 * The number describes the maximum number of meaningful data bits in
|
||||
* the stored value. 8, 16 and 32 bit values are stored using the
|
||||
* INT32 physical type. 64 bit values are stored using the INT64
|
||||
* physical type.
|
||||
*
|
||||
*/
|
||||
INT_8 = 15;
|
||||
INT_16 = 16;
|
||||
INT_32 = 17;
|
||||
INT_64 = 18;
|
||||
|
||||
/**
|
||||
* An embedded JSON document
|
||||
*
|
||||
* A JSON document embedded within a single UTF8 column.
|
||||
*/
|
||||
JSON = 19;
|
||||
|
||||
/**
|
||||
* An embedded BSON document
|
||||
*
|
||||
* A BSON document embedded within a single BINARY column.
|
||||
*/
|
||||
BSON = 20;
|
||||
|
||||
/**
|
||||
* An interval of time
|
||||
*
|
||||
* This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12
|
||||
* This data is composed of three separate little endian unsigned
|
||||
* integers. Each stores a component of a duration of time. The first
|
||||
* integer identifies the number of months associated with the duration,
|
||||
* the second identifies the number of days associated with the duration
|
||||
* and the third identifies the number of milliseconds associated with
|
||||
* the provided duration. This duration of time is independent of any
|
||||
* particular timezone or date.
|
||||
*/
|
||||
INTERVAL = 21;
|
||||
}
|
||||
|
||||
/**
|
||||
* Representation of Schemas
|
||||
*/
|
||||
enum FieldRepetitionType {
|
||||
/** This field is required (can not be null) and each record has exactly 1 value. */
|
||||
REQUIRED = 0;
|
||||
|
||||
/** The field is optional (can be null) and each record has 0 or 1 values. */
|
||||
OPTIONAL = 1;
|
||||
|
||||
/** The field is repeated and can contain 0 or more values */
|
||||
REPEATED = 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Statistics per row group and per page
|
||||
* All fields are optional.
|
||||
*/
|
||||
struct Statistics {
|
||||
/**
|
||||
* DEPRECATED: min and max value of the column. Use min_value and max_value.
|
||||
*
|
||||
* Values are encoded using PLAIN encoding, except that variable-length byte
|
||||
* arrays do not include a length prefix.
|
||||
*
|
||||
* These fields encode min and max values determined by signed comparison
|
||||
* only. New files should use the correct order for a column's logical type
|
||||
* and store the values in the min_value and max_value fields.
|
||||
*
|
||||
* To support older readers, these may be set when the column order is
|
||||
* signed.
|
||||
*/
|
||||
1: optional binary max;
|
||||
2: optional binary min;
|
||||
/** count of null value in the column */
|
||||
3: optional i64 null_count;
|
||||
/** count of distinct values occurring */
|
||||
4: optional i64 distinct_count;
|
||||
/**
|
||||
* Min and max values for the column, determined by its ColumnOrder.
|
||||
*
|
||||
* Values are encoded using PLAIN encoding, except that variable-length byte
|
||||
* arrays do not include a length prefix.
|
||||
*/
|
||||
5: optional binary max_value;
|
||||
6: optional binary min_value;
|
||||
}
|
||||
|
||||
/** Empty structs to use as logical type annotations */
|
||||
struct StringType {} // allowed for BINARY, must be encoded with UTF-8
|
||||
struct UUIDType {} // allowed for FIXED[16], must encoded raw UUID bytes
|
||||
struct MapType {} // see LogicalTypes.md
|
||||
struct ListType {} // see LogicalTypes.md
|
||||
struct EnumType {} // allowed for BINARY, must be encoded with UTF-8
|
||||
struct DateType {} // allowed for INT32
|
||||
|
||||
/**
|
||||
* Logical type to annotate a column that is always null.
|
||||
*
|
||||
* Sometimes when discovering the schema of existing data, values are always
|
||||
* null and the physical type can't be determined. This annotation signals
|
||||
* the case where the physical type was guessed from all null values.
|
||||
*/
|
||||
struct NullType {} // allowed for any physical type, only null values stored
|
||||
|
||||
/**
|
||||
* Decimal logical type annotation
|
||||
*
|
||||
* To maintain forward-compatibility in v1, implementations using this logical
|
||||
* type must also set scale and precision on the annotated SchemaElement.
|
||||
*
|
||||
* Allowed for physical types: INT32, INT64, FIXED, and BINARY
|
||||
*/
|
||||
struct DecimalType {
|
||||
1: required i32 scale
|
||||
2: required i32 precision
|
||||
}
|
||||
|
||||
/** Time units for logical types */
|
||||
struct MilliSeconds {}
|
||||
struct MicroSeconds {}
|
||||
struct NanoSeconds {}
|
||||
union TimeUnit {
|
||||
1: MilliSeconds MILLIS
|
||||
2: MicroSeconds MICROS
|
||||
3: NanoSeconds NANOS
|
||||
}
|
||||
|
||||
/**
|
||||
* Timestamp logical type annotation
|
||||
*
|
||||
* Allowed for physical types: INT64
|
||||
*/
|
||||
struct TimestampType {
|
||||
1: required bool isAdjustedToUTC
|
||||
2: required TimeUnit unit
|
||||
}
|
||||
|
||||
/**
|
||||
* Time logical type annotation
|
||||
*
|
||||
* Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
|
||||
*/
|
||||
struct TimeType {
|
||||
1: required bool isAdjustedToUTC
|
||||
2: required TimeUnit unit
|
||||
}
|
||||
|
||||
/**
|
||||
* Integer logical type annotation
|
||||
*
|
||||
* bitWidth must be 8, 16, 32, or 64.
|
||||
*
|
||||
* Allowed for physical types: INT32, INT64
|
||||
*/
|
||||
struct IntType {
|
||||
1: required byte bitWidth
|
||||
2: required bool isSigned
|
||||
}
|
||||
|
||||
/**
|
||||
* Embedded JSON logical type annotation
|
||||
*
|
||||
* Allowed for physical types: BINARY
|
||||
*/
|
||||
struct JsonType {
|
||||
}
|
||||
|
||||
/**
|
||||
* Embedded BSON logical type annotation
|
||||
*
|
||||
* Allowed for physical types: BINARY
|
||||
*/
|
||||
struct BsonType {
|
||||
}
|
||||
|
||||
/**
|
||||
* LogicalType annotations to replace ConvertedType.
|
||||
*
|
||||
* To maintain compatibility, implementations using LogicalType for a
|
||||
* SchemaElement must also set the corresponding ConvertedType from the
|
||||
* following table.
|
||||
*/
|
||||
union LogicalType {
|
||||
1: StringType STRING // use ConvertedType UTF8
|
||||
2: MapType MAP // use ConvertedType MAP
|
||||
3: ListType LIST // use ConvertedType LIST
|
||||
4: EnumType ENUM // use ConvertedType ENUM
|
||||
5: DecimalType DECIMAL // use ConvertedType DECIMAL
|
||||
6: DateType DATE // use ConvertedType DATE
|
||||
7: TimeType TIME // use ConvertedType TIME_MICROS or TIME_MILLIS
|
||||
8: TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
|
||||
// 9: reserved for INTERVAL
|
||||
10: IntType INTEGER // use ConvertedType INT_* or UINT_*
|
||||
11: NullType UNKNOWN // no compatible ConvertedType
|
||||
12: JsonType JSON // use ConvertedType JSON
|
||||
13: BsonType BSON // use ConvertedType BSON
|
||||
14: UUIDType UUID
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a element inside a schema definition.
|
||||
* - if it is a group (inner node) then type is undefined and num_children is defined
|
||||
* - if it is a primitive type (leaf) then type is defined and num_children is undefined
|
||||
* the nodes are listed in depth first traversal order.
|
||||
*/
|
||||
struct SchemaElement {
|
||||
/** Data type for this field. Not set if the current element is a non-leaf node */
|
||||
1: optional Type type;
|
||||
|
||||
  /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
|
||||
* Otherwise, if specified, this is the maximum bit length to store any of the values.
|
||||
* (e.g. a low cardinality INT col could have this set to 3). Note that this is
|
||||
* in the schema, and therefore fixed for the entire file.
|
||||
*/
|
||||
2: optional i32 type_length;
|
||||
|
||||
/** repetition of the field. The root of the schema does not have a repetition_type.
|
||||
* All other nodes must have one */
|
||||
3: optional FieldRepetitionType repetition_type;
|
||||
|
||||
/** Name of the field in the schema */
|
||||
4: required string name;
|
||||
|
||||
/** Nested fields. Since thrift does not support nested fields,
|
||||
* the nesting is flattened to a single list by a depth-first traversal.
|
||||
* The children count is used to construct the nested relationship.
|
||||
* This field is not set when the element is a primitive type
|
||||
*/
|
||||
5: optional i32 num_children;
|
||||
|
||||
/** When the schema is the result of a conversion from another model
|
||||
* Used to record the original type to help with cross conversion.
|
||||
*/
|
||||
6: optional ConvertedType converted_type;
|
||||
|
||||
/** Used when this column contains decimal data.
|
||||
* See the DECIMAL converted type for more details.
|
||||
*/
|
||||
7: optional i32 scale
|
||||
8: optional i32 precision
|
||||
|
||||
/** When the original schema supports field ids, this will save the
|
||||
* original field id in the parquet schema
|
||||
*/
|
||||
9: optional i32 field_id;
|
||||
|
||||
/**
|
||||
* The logical type of this SchemaElement
|
||||
*
|
||||
* LogicalType replaces ConvertedType, but ConvertedType is still required
|
||||
* for some logical types to ensure forward-compatibility in format v1.
|
||||
*/
|
||||
10: optional LogicalType logicalType
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodings supported by Parquet. Not all encodings are valid for all types. These
|
||||
* enums are also used to specify the encoding of definition and repetition levels.
|
||||
* See the accompanying doc for the details of the more complicated encodings.
|
||||
*/
|
||||
enum Encoding {
|
||||
/** Default encoding.
|
||||
* BOOLEAN - 1 bit per value. 0 is false; 1 is true.
|
||||
* INT32 - 4 bytes per value. Stored as little-endian.
|
||||
* INT64 - 8 bytes per value. Stored as little-endian.
|
||||
* FLOAT - 4 bytes per value. IEEE. Stored as little-endian.
|
||||
* DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.
|
||||
* BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
|
||||
* FIXED_LEN_BYTE_ARRAY - Just the bytes.
|
||||
*/
|
||||
PLAIN = 0;
|
||||
|
||||
/** Group VarInt encoding for INT32/INT64.
|
||||
* This encoding is deprecated. It was never used
|
||||
*/
|
||||
// GROUP_VAR_INT = 1;
|
||||
|
||||
/**
|
||||
* Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
|
||||
* plain type.
|
||||
* in a data page use RLE_DICTIONARY instead.
|
||||
* in a Dictionary page use PLAIN instead
|
||||
*/
|
||||
PLAIN_DICTIONARY = 2;
|
||||
|
||||
/** Group packed run length encoding. Usable for definition/repetition levels
|
||||
* encoding and Booleans (on one bit: 0 is false; 1 is true.)
|
||||
*/
|
||||
RLE = 3;
|
||||
|
||||
/** Bit packed encoding. This can only be used if the data has a known max
|
||||
* width. Usable for definition/repetition levels encoding.
|
||||
*/
|
||||
BIT_PACKED = 4;
|
||||
|
||||
/** Delta encoding for integers. This can be used for int columns and works best
|
||||
* on sorted data
|
||||
*/
|
||||
DELTA_BINARY_PACKED = 5;
|
||||
|
||||
/** Encoding for byte arrays to separate the length values and the data. The lengths
|
||||
* are encoded using DELTA_BINARY_PACKED
|
||||
*/
|
||||
DELTA_LENGTH_BYTE_ARRAY = 6;
|
||||
|
||||
/** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED.
|
||||
* Suffixes are stored as delta length byte arrays.
|
||||
*/
|
||||
DELTA_BYTE_ARRAY = 7;
|
||||
|
||||
/** Dictionary encoding: the ids are encoded using the RLE encoding
|
||||
*/
|
||||
RLE_DICTIONARY = 8;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported compression algorithms.
|
||||
*
|
||||
* Codecs added in 2.4 can be read by readers based on 2.4 and later.
|
||||
* Codec support may vary between readers based on the format version and
|
||||
* libraries available at runtime. Gzip, Snappy, and LZ4 codecs are
|
||||
* widely available, while Zstd and Brotli require additional libraries.
|
||||
*/
|
||||
enum CompressionCodec {
|
||||
UNCOMPRESSED = 0;
|
||||
SNAPPY = 1;
|
||||
GZIP = 2;
|
||||
LZO = 3;
|
||||
BROTLI = 4; // Added in 2.4
|
||||
LZ4 = 5; // Added in 2.4
|
||||
ZSTD = 6; // Added in 2.4
|
||||
}
|
||||
|
||||
enum PageType {
|
||||
DATA_PAGE = 0;
|
||||
INDEX_PAGE = 1;
|
||||
DICTIONARY_PAGE = 2;
|
||||
DATA_PAGE_V2 = 3;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enum to annotate whether lists of min/max elements inside ColumnIndex
|
||||
* are ordered and if so, in which direction.
|
||||
*/
|
||||
enum BoundaryOrder {
|
||||
UNORDERED = 0;
|
||||
ASCENDING = 1;
|
||||
DESCENDING = 2;
|
||||
}
|
||||
|
||||
/** Data page header */
|
||||
struct DataPageHeader {
|
||||
/** Number of values, including NULLs, in this data page. **/
|
||||
1: required i32 num_values
|
||||
|
||||
/** Encoding used for this data page **/
|
||||
2: required Encoding encoding
|
||||
|
||||
/** Encoding used for definition levels **/
|
||||
3: required Encoding definition_level_encoding;
|
||||
|
||||
/** Encoding used for repetition levels **/
|
||||
4: required Encoding repetition_level_encoding;
|
||||
|
||||
/** Optional statistics for the data in this page**/
|
||||
5: optional Statistics statistics;
|
||||
}
|
||||
|
||||
struct IndexPageHeader {
|
||||
/** TODO: **/
|
||||
}
|
||||
|
||||
struct DictionaryPageHeader {
|
||||
/** Number of values in the dictionary **/
|
||||
1: required i32 num_values;
|
||||
|
||||
/** Encoding using this dictionary page **/
|
||||
2: required Encoding encoding
|
||||
|
||||
/** If true, the entries in the dictionary are sorted in ascending order **/
|
||||
3: optional bool is_sorted;
|
||||
}
|
||||
|
||||
/**
|
||||
* New page format allowing reading levels without decompressing the data
|
||||
* Repetition and definition levels are uncompressed
|
||||
* The remaining section containing the data is compressed if is_compressed is true
|
||||
**/
|
||||
struct DataPageHeaderV2 {
|
||||
/** Number of values, including NULLs, in this data page. **/
|
||||
1: required i32 num_values
|
||||
/** Number of NULL values, in this data page.
|
||||
Number of non-null = num_values - num_nulls which is also the number of values in the data section **/
|
||||
2: required i32 num_nulls
|
||||
/** Number of rows in this data page. which means pages change on record boundaries (r = 0) **/
|
||||
3: required i32 num_rows
|
||||
/** Encoding used for data in this page **/
|
||||
4: required Encoding encoding
|
||||
|
||||
// repetition levels and definition levels are always using RLE (without size in it)
|
||||
|
||||
/** length of the definition levels */
|
||||
5: required i32 definition_levels_byte_length;
|
||||
/** length of the repetition levels */
|
||||
6: required i32 repetition_levels_byte_length;
|
||||
|
||||
/** whether the values are compressed.
|
||||
Which means the section of the page between
|
||||
definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
|
||||
is compressed with the compression_codec.
|
||||
If missing it is considered compressed */
|
||||
7: optional bool is_compressed = 1;
|
||||
|
||||
/** optional statistics for this column chunk */
|
||||
8: optional Statistics statistics;
|
||||
}
|
||||
|
||||
struct PageHeader {
|
||||
/** the type of the page: indicates which of the *_header fields is set **/
|
||||
1: required PageType type
|
||||
|
||||
/** Uncompressed page size in bytes (not including this header) **/
|
||||
2: required i32 uncompressed_page_size
|
||||
|
||||
/** Compressed page size in bytes (not including this header) **/
|
||||
3: required i32 compressed_page_size
|
||||
|
||||
/** 32bit crc for the data below. This allows for disabling checksumming in HDFS
|
||||
* if only a few pages needs to be read
|
||||
**/
|
||||
4: optional i32 crc
|
||||
|
||||
// Headers for page specific data. One only will be set.
|
||||
5: optional DataPageHeader data_page_header;
|
||||
6: optional IndexPageHeader index_page_header;
|
||||
7: optional DictionaryPageHeader dictionary_page_header;
|
||||
8: optional DataPageHeaderV2 data_page_header_v2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper struct to store key values
|
||||
*/
|
||||
struct KeyValue {
|
||||
1: required string key
|
||||
2: optional string value
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper struct to specify sort order
|
||||
*/
|
||||
struct SortingColumn {
|
||||
/** The column index (in this row group) **/
|
||||
1: required i32 column_idx
|
||||
|
||||
/** If true, indicates this column is sorted in descending order. **/
|
||||
2: required bool descending
|
||||
|
||||
/** If true, nulls will come before non-null values, otherwise,
|
||||
* nulls go at the end. */
|
||||
3: required bool nulls_first
|
||||
}
|
||||
|
||||
/**
|
||||
* statistics of a given page type and encoding
|
||||
*/
|
||||
struct PageEncodingStats {
|
||||
|
||||
/** the page type (data/dic/...) **/
|
||||
1: required PageType page_type;
|
||||
|
||||
/** encoding of the page **/
|
||||
2: required Encoding encoding;
|
||||
|
||||
/** number of pages of this type with this encoding **/
|
||||
3: required i32 count;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Description for column metadata
|
||||
*/
|
||||
struct ColumnMetaData {
|
||||
/** Type of this column **/
|
||||
1: required Type type
|
||||
|
||||
/** Set of all encodings used for this column. The purpose is to validate
|
||||
* whether we can decode those pages. **/
|
||||
2: required list<Encoding> encodings
|
||||
|
||||
/** Path in schema **/
|
||||
3: required list<string> path_in_schema
|
||||
|
||||
/** Compression codec **/
|
||||
4: required CompressionCodec codec
|
||||
|
||||
/** Number of values in this column **/
|
||||
5: required i64 num_values
|
||||
|
||||
/** total byte size of all uncompressed pages in this column chunk (including the headers) **/
|
||||
6: required i64 total_uncompressed_size
|
||||
|
||||
/** total byte size of all compressed pages in this column chunk (including the headers) **/
|
||||
7: required i64 total_compressed_size
|
||||
|
||||
/** Optional key/value metadata **/
|
||||
8: optional list<KeyValue> key_value_metadata
|
||||
|
||||
/** Byte offset from beginning of file to first data page **/
|
||||
9: required i64 data_page_offset
|
||||
|
||||
/** Byte offset from beginning of file to root index page **/
|
||||
10: optional i64 index_page_offset
|
||||
|
||||
/** Byte offset from the beginning of file to first (only) dictionary page **/
|
||||
11: optional i64 dictionary_page_offset
|
||||
|
||||
/** optional statistics for this column chunk */
|
||||
12: optional Statistics statistics;
|
||||
|
||||
/** Set of all encodings used for pages in this column chunk.
|
||||
* This information can be used to determine if all data pages are
|
||||
* dictionary encoded for example **/
|
||||
13: optional list<PageEncodingStats> encoding_stats;
|
||||
}
|
||||
|
||||
struct ColumnChunk {
|
||||
/** File where column data is stored. If not set, assumed to be same file as
|
||||
* metadata. This path is relative to the current file.
|
||||
**/
|
||||
1: optional string file_path
|
||||
|
||||
/** Byte offset in file_path to the ColumnMetaData **/
|
||||
2: required i64 file_offset
|
||||
|
||||
/** Column metadata for this chunk. This is the same content as what is at
|
||||
* file_path/file_offset. Having it here has it replicated in the file
|
||||
* metadata.
|
||||
**/
|
||||
3: optional ColumnMetaData meta_data
|
||||
|
||||
/** File offset of ColumnChunk's OffsetIndex **/
|
||||
4: optional i64 offset_index_offset
|
||||
|
||||
/** Size of ColumnChunk's OffsetIndex, in bytes **/
|
||||
5: optional i32 offset_index_length
|
||||
|
||||
/** File offset of ColumnChunk's ColumnIndex **/
|
||||
6: optional i64 column_index_offset
|
||||
|
||||
/** Size of ColumnChunk's ColumnIndex, in bytes **/
|
||||
7: optional i32 column_index_length
|
||||
}
|
||||
|
||||
struct RowGroup {
|
||||
/** Metadata for each column chunk in this row group.
|
||||
* This list must have the same order as the SchemaElement list in FileMetaData.
|
||||
**/
|
||||
1: required list<ColumnChunk> columns
|
||||
|
||||
/** Total byte size of all the uncompressed column data in this row group **/
|
||||
2: required i64 total_byte_size
|
||||
|
||||
/** Number of rows in this row group **/
|
||||
3: required i64 num_rows
|
||||
|
||||
/** If set, specifies a sort ordering of the rows in this RowGroup.
|
||||
* The sorting columns can be a subset of all the columns.
|
||||
*/
|
||||
4: optional list<SortingColumn> sorting_columns
|
||||
}
|
||||
|
||||
/** Empty struct to signal the order defined by the physical or logical type */
|
||||
struct TypeDefinedOrder {}
|
||||
|
||||
/**
|
||||
* Union to specify the order used for the min_value and max_value fields for a
|
||||
* column. This union takes the role of an enhanced enum that allows rich
|
||||
* elements (which will be needed for a collation-based ordering in the future).
|
||||
*
|
||||
* Possible values are:
|
||||
* * TypeDefinedOrder - the column uses the order defined by its logical or
|
||||
* physical type (if there is no logical type).
|
||||
*
|
||||
* If the reader does not support the value of this union, min and max stats
|
||||
* for this column should be ignored.
|
||||
*/
|
||||
union ColumnOrder {
|
||||
|
||||
/**
|
||||
* The sort orders for logical types are:
|
||||
* UTF8 - unsigned byte-wise comparison
|
||||
* INT8 - signed comparison
|
||||
* INT16 - signed comparison
|
||||
* INT32 - signed comparison
|
||||
* INT64 - signed comparison
|
||||
* UINT8 - unsigned comparison
|
||||
* UINT16 - unsigned comparison
|
||||
* UINT32 - unsigned comparison
|
||||
* UINT64 - unsigned comparison
|
||||
* DECIMAL - signed comparison of the represented value
|
||||
* DATE - signed comparison
|
||||
* TIME_MILLIS - signed comparison
|
||||
* TIME_MICROS - signed comparison
|
||||
* TIMESTAMP_MILLIS - signed comparison
|
||||
* TIMESTAMP_MICROS - signed comparison
|
||||
* INTERVAL - unsigned comparison
|
||||
* JSON - unsigned byte-wise comparison
|
||||
* BSON - unsigned byte-wise comparison
|
||||
* ENUM - unsigned byte-wise comparison
|
||||
* LIST - undefined
|
||||
* MAP - undefined
|
||||
*
|
||||
* In the absence of logical types, the sort order is determined by the physical type:
|
||||
* BOOLEAN - false, true
|
||||
* INT32 - signed comparison
|
||||
* INT64 - signed comparison
|
||||
* INT96 (only used for legacy timestamps) - undefined
|
||||
* FLOAT - signed comparison of the represented value (*)
|
||||
* DOUBLE - signed comparison of the represented value (*)
|
||||
* BYTE_ARRAY - unsigned byte-wise comparison
|
||||
* FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
|
||||
*
|
||||
* (*) Because the sorting order is not specified properly for floating
|
||||
* point values (relations vs. total ordering) the following
|
||||
* compatibility rules should be applied when reading statistics:
|
||||
* - If the min is a NaN, it should be ignored.
|
||||
* - If the max is a NaN, it should be ignored.
|
||||
* - If the min is +0, the row group may contain -0 values as well.
|
||||
* - If the max is -0, the row group may contain +0 values as well.
|
||||
* - When looking for NaN values, min and max should be ignored.
|
||||
*/
|
||||
1: TypeDefinedOrder TYPE_ORDER;
|
||||
}
|
||||
|
||||
struct PageLocation {
|
||||
/** Offset of the page in the file **/
|
||||
1: required i64 offset
|
||||
|
||||
/**
|
||||
* Size of the page, including header. Sum of compressed_page_size and header
|
||||
* length
|
||||
*/
|
||||
2: required i32 compressed_page_size
|
||||
|
||||
/**
|
||||
* Index within the RowGroup of the first row of the page; this means pages
|
||||
* change on record boundaries (r = 0).
|
||||
*/
|
||||
3: required i64 first_row_index
|
||||
}
|
||||
|
||||
struct OffsetIndex {
|
||||
/**
|
||||
* PageLocations, ordered by increasing PageLocation.offset. It is required
|
||||
* that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
|
||||
*/
|
||||
1: required list<PageLocation> page_locations
|
||||
}
|
||||
|
||||
/**
|
||||
* Description for ColumnIndex.
|
||||
* Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]
|
||||
*/
|
||||
struct ColumnIndex {
|
||||
/**
|
||||
* A list of Boolean values to determine the validity of the corresponding
|
||||
* min and max values. If true, a page contains only null values, and writers
|
||||
* have to set the corresponding entries in min_values and max_values to
|
||||
* byte[0], so that all lists have the same length. If false, the
|
||||
* corresponding entries in min_values and max_values must be valid.
|
||||
*/
|
||||
1: required list<bool> null_pages
|
||||
|
||||
/**
|
||||
* Two lists containing lower and upper bounds for the values of each page.
|
||||
* These may be the actual minimum and maximum values found on a page, but
|
||||
* can also be (more compact) values that do not exist on a page. For
|
||||
 * example, instead of storing "Blart Versenwald III", a writer may set
|
||||
* min_values[i]="B", max_values[i]="C". Such more compact values must still
|
||||
* be valid values within the column's logical type. Readers must make sure
|
||||
* that list entries are populated before using them by inspecting null_pages.
|
||||
*/
|
||||
2: required list<binary> min_values
|
||||
3: required list<binary> max_values
|
||||
|
||||
/**
|
||||
 * Stores whether both min_values and max_values are ordered and if so, in
|
||||
* which direction. This allows readers to perform binary searches in both
|
||||
* lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
|
||||
* if the lists are ordered.
|
||||
*/
|
||||
4: required BoundaryOrder boundary_order
|
||||
|
||||
/** A list containing the number of null values for each page **/
|
||||
5: optional list<i64> null_counts
|
||||
}
|
||||
|
||||
/**
|
||||
* Description for file metadata
|
||||
*/
|
||||
struct FileMetaData {
|
||||
/** Version of this file **/
|
||||
1: required i32 version
|
||||
|
||||
/** Parquet schema for this file. This schema contains metadata for all the columns.
|
||||
* The schema is represented as a tree with a single root. The nodes of the tree
|
||||
* are flattened to a list by doing a depth-first traversal.
|
||||
* The column metadata contains the path in the schema for that column which can be
|
||||
* used to map columns to nodes in the schema.
|
||||
* The first element is the root **/
|
||||
2: required list<SchemaElement> schema;
|
||||
|
||||
/** Number of rows in this file **/
|
||||
3: required i64 num_rows
|
||||
|
||||
/** Row groups in this file **/
|
||||
4: required list<RowGroup> row_groups
|
||||
|
||||
/** Optional key/value metadata **/
|
||||
5: optional list<KeyValue> key_value_metadata
|
||||
|
||||
/** String for application that wrote this file. This should be in the format
|
||||
* <Application> version <App Version> (build <App Build Hash>).
|
||||
* e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
|
||||
**/
|
||||
6: optional string created_by
|
||||
|
||||
/**
|
||||
* Sort order used for the min_value and max_value fields of each column in
|
||||
* this file. Each sort order corresponds to one column, determined by its
|
||||
* position in the list, matching the position of the column in the schema.
|
||||
*
|
||||
* Without column_orders, the meaning of the min_value and max_value fields is
|
||||
* undefined. To ensure well-defined behaviour, if min_value and max_value are
|
||||
* written to a Parquet file, column_orders must be written as well.
|
||||
*
|
||||
* The obsolete min and max fields are always sorted by signed comparison
|
||||
* regardless of column_orders.
|
||||
*/
|
||||
7: optional list<ColumnOrder> column_orders;
|
||||
}
|
||||
|
166
pkg/s3select/internal/parquet-go/reader.go
Normal file
166
pkg/s3select/internal/parquet-go/reader.go
Normal file
@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"io"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio-go/v6/pkg/set"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// GetReaderFunc - function type returning io.ReadCloser for requested offset/length.
|
||||
type GetReaderFunc func(offset, length int64) (io.ReadCloser, error)
|
||||
|
||||
func footerSize(getReaderFunc GetReaderFunc) (size int64, err error) {
|
||||
rc, err := getReaderFunc(-8, 4)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer rc.Close()
|
||||
|
||||
buf := make([]byte, 4)
|
||||
if _, err = io.ReadFull(rc, buf); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
size = int64(binary.LittleEndian.Uint32(buf))
|
||||
|
||||
return size, nil
|
||||
}
|
||||
|
||||
// fileMetadata reads and deserializes the thrift-encoded FileMetaData stored
// at the end of the parquet file, immediately before the trailing 8 bytes
// (4-byte footer length + 4-byte magic).
func fileMetadata(getReaderFunc GetReaderFunc) (*parquet.FileMetaData, error) {
	size, err := footerSize(getReaderFunc)
	if err != nil {
		return nil, err
	}

	// The footer occupies `size` bytes just before the trailing length+magic,
	// hence the end-relative offset -(8 + size).
	rc, err := getReaderFunc(-(8 + size), size)
	if err != nil {
		return nil, err
	}
	defer rc.Close()

	fileMeta := parquet.NewFileMetaData()

	// Parquet footers are serialized with the thrift compact protocol.
	pf := thrift.NewTCompactProtocolFactory()
	protocol := pf.GetProtocol(thrift.NewStreamTransportR(rc))
	err = fileMeta.Read(protocol)
	if err != nil {
		return nil, err
	}

	return fileMeta, nil
}
|
||||
|
||||
// Value - denotes column value
type Value struct {
	Value interface{}  // Decoded Go value; nil denotes a parquet NULL.
	Type  parquet.Type // Physical parquet type of the column this value came from.
}

// MarshalJSON - encodes to JSON data
func (value Value) MarshalJSON() (data []byte, err error) {
	// Only the payload is serialized; the parquet type is metadata for callers.
	return json.Marshal(value.Value)
}
|
||||
|
||||
// Reader - denotes parquet file.
type Reader struct {
	getReaderFunc  GetReaderFunc             // Supplies data ranges of the underlying file.
	schemaElements []*parquet.SchemaElement  // Flattened schema from the file footer.
	rowGroups      []*parquet.RowGroup       // Row groups from the file footer.
	rowGroupIndex  int                       // Index of the row group currently being read.

	nameList    []string           // Column names in schema order; fixes record field order.
	columnNames set.StringSet      // Requested columns; empty means all columns.
	columns     map[string]*column // Open column readers for the current row group; nil until Read opens them.
	rowIndex    int64              // Next row to read within the current row group.
}
|
||||
|
||||
// NewReader - creates new parquet reader. Reader calls getReaderFunc to get required data range for given columnNames. If columnNames is empty, all columns are used.
|
||||
func NewReader(getReaderFunc GetReaderFunc, columnNames set.StringSet) (*Reader, error) {
|
||||
fileMeta, err := fileMetadata(getReaderFunc)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nameList := []string{}
|
||||
schemaElements := fileMeta.GetSchema()
|
||||
for _, element := range schemaElements {
|
||||
nameList = append(nameList, element.Name)
|
||||
}
|
||||
|
||||
return &Reader{
|
||||
getReaderFunc: getReaderFunc,
|
||||
rowGroups: fileMeta.GetRowGroups(),
|
||||
schemaElements: schemaElements,
|
||||
nameList: nameList,
|
||||
columnNames: columnNames,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Read - reads single record.
|
||||
func (reader *Reader) Read() (record *Record, err error) {
|
||||
if reader.rowGroupIndex >= len(reader.rowGroups) {
|
||||
return nil, io.EOF
|
||||
}
|
||||
|
||||
if reader.columns == nil {
|
||||
reader.columns, err = getColumns(
|
||||
reader.rowGroups[reader.rowGroupIndex],
|
||||
reader.columnNames,
|
||||
reader.schemaElements,
|
||||
reader.getReaderFunc,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reader.rowIndex = 0
|
||||
}
|
||||
|
||||
if reader.rowIndex >= reader.rowGroups[reader.rowGroupIndex].GetNumRows() {
|
||||
reader.rowGroupIndex++
|
||||
reader.Close()
|
||||
return reader.Read()
|
||||
}
|
||||
|
||||
record = newRecord(reader.nameList)
|
||||
for name := range reader.columns {
|
||||
value, valueType := reader.columns[name].read()
|
||||
record.set(name, Value{value, valueType})
|
||||
}
|
||||
|
||||
reader.rowIndex++
|
||||
|
||||
return record, nil
|
||||
}
|
||||
|
||||
// Close - closes underneath readers.
|
||||
func (reader *Reader) Close() (err error) {
|
||||
for _, column := range reader.columns {
|
||||
column.close()
|
||||
}
|
||||
|
||||
reader.columns = nil
|
||||
reader.rowIndex = 0
|
||||
|
||||
return nil
|
||||
}
|
90
pkg/s3select/internal/parquet-go/reader_test.go
Normal file
90
pkg/s3select/internal/parquet-go/reader_test.go
Normal file
@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio-go/v6/pkg/set"
|
||||
)
|
||||
|
||||
func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
|
||||
file, err := os.Open(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fi, err := file.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if offset < 0 {
|
||||
offset = fi.Size() + offset
|
||||
}
|
||||
|
||||
if _, err = file.Seek(offset, os.SEEK_SET); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return file, nil
|
||||
}
|
||||
|
||||
// TestReader reads the checked-in example.parquet fixture through NewReader
// and verifies the stringified records match the expected values, including
// a NULL value ("<nil>") in the second record.
func TestReader(t *testing.T) {
	name := "example.parquet"
	reader, err := NewReader(
		func(offset, length int64) (io.ReadCloser, error) {
			return getReader(name, offset, length)
		},
		set.CreateStringSet("one", "two", "three"),
	)
	if err != nil {
		t.Fatal(err)
	}

	// Expected Record.String() output; BYTE_ARRAY values print as byte slices.
	expectedRecords := []string{
		`map[one:{-1 DOUBLE} three:{true BOOLEAN} two:{[102 111 111] BYTE_ARRAY}]`,
		`map[one:{<nil> DOUBLE} three:{false BOOLEAN} two:{[98 97 114] BYTE_ARRAY}]`,
		`map[one:{2.5 DOUBLE} three:{true BOOLEAN} two:{[98 97 122] BYTE_ARRAY}]`,
	}

	i := 0
	for {
		record, err := reader.Read()
		if err != nil {
			// io.EOF is the normal termination; anything else is a failure.
			if err != io.EOF {
				t.Error(err)
			}

			break
		}

		if i == len(expectedRecords) {
			t.Fatalf("read more than expected record count %v", len(expectedRecords))
		}

		if record.String() != expectedRecords[i] {
			t.Fatalf("record%v: expected: %v, got: %v", i+1, expectedRecords[i], record.String())
		}

		i++
	}

	reader.Close()
}
|
70
pkg/s3select/internal/parquet-go/record.go
Normal file
70
pkg/s3select/internal/parquet-go/record.go
Normal file
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Record - ordered parquet record.
type Record struct {
	nameList     []string         // Column names in schema order; fixes iteration order.
	nameValueMap map[string]Value // Values keyed by column name; may lack entries for columns not read.
}
|
||||
|
||||
// String - returns string representation of this record.
|
||||
func (r *Record) String() string {
|
||||
values := []string{}
|
||||
r.Range(func(name string, value Value) bool {
|
||||
values = append(values, fmt.Sprintf("%v:%v", name, value))
|
||||
return true
|
||||
})
|
||||
|
||||
return "map[" + strings.Join(values, " ") + "]"
|
||||
}
|
||||
|
||||
// set stores value under name, overwriting any previous entry. Iteration
// order is governed by nameList, not by insertion order.
func (r *Record) set(name string, value Value) {
	r.nameValueMap[name] = value
}
|
||||
|
||||
// Get - returns Value of name.
|
||||
func (r *Record) Get(name string) (Value, bool) {
|
||||
value, ok := r.nameValueMap[name]
|
||||
return value, ok
|
||||
}
|
||||
|
||||
// Range - calls f sequentially for each name and value present in the record. If f returns false, range stops the iteration.
|
||||
func (r *Record) Range(f func(name string, value Value) bool) {
|
||||
for _, name := range r.nameList {
|
||||
value, ok := r.nameValueMap[name]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
if !f(name, value) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func newRecord(nameList []string) *Record {
|
||||
return &Record{
|
||||
nameList: nameList,
|
||||
nameValueMap: make(map[string]Value),
|
||||
}
|
||||
}
|
126
pkg/s3select/internal/parquet-go/schema/element.go
Normal file
126
pkg/s3select/internal/parquet-go/schema/element.go
Normal file
@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package schema
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// nameRegexp matches valid schema element names: alphanumeric plus underscore.
var nameRegexp = regexp.MustCompile("^[a-zA-Z0-9_]+$")

// validataPathSegments checks that every dot-separated path segment is a valid
// schema element name; the error reports the full joined path.
// NOTE(review): name is a typo for "validatePathSegments"; kept as-is because
// other code in this package calls it by this name.
func validataPathSegments(pathSegments []string) error {
	for _, pathSegment := range pathSegments {
		if !nameRegexp.MatchString(pathSegment) {
			return fmt.Errorf("unsupported name %v", strings.Join(pathSegments, "."))
		}
	}

	return nil
}
|
||||
|
||||
// Element - represents schema element and its children. Any element must have Name and RepetitionType fields set.
type Element struct {
	parquet.SchemaElement
	numChildren        int32                     // Backing storage for SchemaElement.NumChildren pointer.
	Encoding           *parquet.Encoding         // Optional; defaults is computed.
	CompressionType    *parquet.CompressionCodec // Optional; defaults to SNAPPY.
	Children           *Tree                     // Child elements; nil for leaf (value) elements.
	MaxDefinitionLevel int64                     // Computed from OPTIONAL/REPEATED ancestors; see updateMaxDLRL.
	MaxRepetitionLevel int64                     // Computed from REPEATED ancestors; see updateMaxDLRL.
	PathInTree         string                    // Dotted path of this element within the Tree.
	PathInSchema       string                    // Dotted path within the flattened parquet schema.
}
|
||||
|
||||
// String - stringify this element.
|
||||
func (element *Element) String() string {
|
||||
var s []string
|
||||
s = append(s, "Name:"+element.Name)
|
||||
s = append(s, "RepetitionType:"+element.RepetitionType.String())
|
||||
if element.Type != nil {
|
||||
s = append(s, "Type:"+element.Type.String())
|
||||
}
|
||||
if element.ConvertedType != nil {
|
||||
s = append(s, "ConvertedType:"+element.ConvertedType.String())
|
||||
}
|
||||
if element.Encoding != nil {
|
||||
s = append(s, "Encoding:"+element.Encoding.String())
|
||||
}
|
||||
if element.CompressionType != nil {
|
||||
s = append(s, "CompressionType:"+element.CompressionType.String())
|
||||
}
|
||||
if element.Children != nil && element.Children.Length() > 0 {
|
||||
s = append(s, "Children:"+element.Children.String())
|
||||
}
|
||||
s = append(s, fmt.Sprintf("MaxDefinitionLevel:%v", element.MaxDefinitionLevel))
|
||||
s = append(s, fmt.Sprintf("MaxRepetitionLevel:%v", element.MaxRepetitionLevel))
|
||||
if element.PathInTree != "" {
|
||||
s = append(s, "PathInTree:"+element.PathInTree)
|
||||
}
|
||||
if element.PathInSchema != "" {
|
||||
s = append(s, "PathInSchema:"+element.PathInSchema)
|
||||
}
|
||||
|
||||
return "{" + strings.Join(s, ", ") + "}"
|
||||
}
|
||||
|
||||
// NewElement - creates new element.
// It validates name, repetition type and the leaf/group invariants before
// assembling the element: a REPEATED element must be a group (no type or
// converted type), and a group element (non-empty children) must not carry a
// physical type. NumChildren is wired to the internal counter so it tracks
// the children tree length.
func NewElement(name string, repetitionType parquet.FieldRepetitionType,
	elementType *parquet.Type, convertedType *parquet.ConvertedType,
	encoding *parquet.Encoding, compressionType *parquet.CompressionCodec,
	children *Tree) (*Element, error) {

	if !nameRegexp.MatchString(name) {
		return nil, fmt.Errorf("unsupported name %v", name)
	}

	// Only the three known repetition types are accepted.
	switch repetitionType {
	case parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_REPEATED:
	default:
		return nil, fmt.Errorf("unknown repetition type %v", repetitionType)
	}

	// A REPEATED element is expected to be a group wrapper, not a typed leaf.
	if repetitionType == parquet.FieldRepetitionType_REPEATED && (elementType != nil || convertedType != nil) {
		return nil, fmt.Errorf("repetition type REPEATED should be used in group element")
	}

	// Group elements (with children) must not have a physical type.
	if children != nil && children.Length() != 0 {
		if elementType != nil {
			return nil, fmt.Errorf("type should be nil for group element")
		}
	}

	element := Element{
		Encoding:        encoding,
		CompressionType: compressionType,
		Children:        children,
	}

	element.Name = name
	element.RepetitionType = &repetitionType
	element.Type = elementType
	element.ConvertedType = convertedType
	// Point the embedded SchemaElement at the internal counter; it is updated
	// below and again in toParquetSchema.
	element.NumChildren = &element.numChildren
	if element.Children != nil {
		element.numChildren = int32(element.Children.Length())
	}

	return &element, nil
}
|
388
pkg/s3select/internal/parquet-go/schema/tree.go
Normal file
388
pkg/s3select/internal/parquet-go/schema/tree.go
Normal file
@ -0,0 +1,388 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package schema
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// updateMaxDLRL recursively assigns maximum definition and repetition levels
// to every element, starting from the parent's levels: an OPTIONAL or
// REPEATED element adds one definition level, and a REPEATED element
// additionally adds one repetition level.
func updateMaxDLRL(schemaMap map[string]*Element, maxDL, maxRL int64) {
	for _, element := range schemaMap {
		element.MaxDefinitionLevel = maxDL
		element.MaxRepetitionLevel = maxRL
		if *element.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
			element.MaxDefinitionLevel++
			if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
				element.MaxRepetitionLevel++
			}
		}

		// Descend with this element's levels as the new baseline.
		if element.Children != nil {
			updateMaxDLRL(element.Children.schemaMap, element.MaxDefinitionLevel, element.MaxRepetitionLevel)
		}
	}
}
|
||||
|
||||
// toParquetSchema flattens tree depth-first into schemaList, validating the
// LIST/MAP group shapes and integral converted types along the way. Leaf
// elements (those with a physical Type) are also collected into
// valueElements. The callback records the first validation error in the
// captured err and returns false to stop the Range early.
func toParquetSchema(tree *Tree, treePrefix string, schemaPrefix string, schemaList *[]*parquet.SchemaElement, valueElements *[]*Element) (err error) {
	tree.Range(func(name string, element *Element) bool {
		pathInTree := name
		if treePrefix != "" {
			pathInTree = treePrefix + "." + name
		}

		if element.Type == nil && element.ConvertedType == nil && element.Children == nil {
			err = fmt.Errorf("%v: group element must have children", pathInTree)
			return false
		}

		if element.ConvertedType != nil {
			switch *element.ConvertedType {
			case parquet.ConvertedType_LIST:
				// Supported structure.
				// <REQUIRED|OPTIONAL> group <name> (LIST) {
				//   REPEATED group list {
				//     <REQUIRED|OPTIONAL> <element-type> element;
				//   }
				// }

				if element.Type != nil {
					err = fmt.Errorf("%v: type must be nil for LIST ConvertedType", pathInTree)
					return false
				}

				if element.Children == nil || element.Children.Length() != 1 {
					err = fmt.Errorf("%v: children must have one element only for LIST ConvertedType", pathInTree)
					return false
				}

				listElement, ok := element.Children.Get("list")
				if !ok {
					err = fmt.Errorf("%v: missing group element 'list' for LIST ConvertedType", pathInTree)
					return false
				}

				if listElement.Name != "list" {
					err = fmt.Errorf("%v.list: name must be 'list'", pathInTree)
					return false
				}

				if *listElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
					err = fmt.Errorf("%v.list: repetition type must be REPEATED type", pathInTree)
					return false
				}

				if listElement.Type != nil || listElement.ConvertedType != nil {
					err = fmt.Errorf("%v.list: type and converted type must be nil", pathInTree)
					return false
				}

				if listElement.Children == nil || listElement.Children.Length() != 1 {
					err = fmt.Errorf("%v.list.element: not found", pathInTree)
					return false
				}

				valueElement, ok := listElement.Children.Get("element")
				if !ok {
					err = fmt.Errorf("%v.list.element: not found", pathInTree)
					return false
				}

				if valueElement.Name != "element" {
					err = fmt.Errorf("%v.list.element: name must be 'element'", pathInTree)
					return false
				}

			case parquet.ConvertedType_MAP:
				// Supported structure:
				// <REQUIRED|OPTIONAL> group <name> (MAP) {
				//   REPEATED group key_value {
				//     REQUIRED <key-type> key;
				//     <REQUIRED|OPTIONAL> <value-type> value;
				//   }
				// }

				if element.Type != nil {
					err = fmt.Errorf("%v: type must be nil for MAP ConvertedType", pathInTree)
					return false
				}

				if element.Children == nil || element.Children.Length() != 1 {
					err = fmt.Errorf("%v: children must have one element only for MAP ConvertedType", pathInTree)
					return false
				}

				keyValueElement, ok := element.Children.Get("key_value")
				if !ok {
					err = fmt.Errorf("%v: missing group element 'key_value' for MAP ConvertedType", pathInTree)
					return false
				}

				if keyValueElement.Name != "key_value" {
					err = fmt.Errorf("%v.key_value: name must be 'key_value'", pathInTree)
					return false
				}

				if *keyValueElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
					err = fmt.Errorf("%v.key_value: repetition type must be REPEATED type", pathInTree)
					return false
				}

				// 'value' is optional; 'key' is mandatory.
				if keyValueElement.Children == nil || keyValueElement.Children.Length() < 1 || keyValueElement.Children.Length() > 2 {
					err = fmt.Errorf("%v.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType", pathInTree)
					return false
				}

				keyElement, ok := keyValueElement.Children.Get("key")
				if !ok {
					err = fmt.Errorf("%v.key_value: missing 'key' element for MAP ConvertedType", pathInTree)
					return false
				}

				if keyElement.Name != "key" {
					err = fmt.Errorf("%v.key_value.key: name must be 'key'", pathInTree)
					return false
				}

				if *keyElement.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
					err = fmt.Errorf("%v.key_value: repetition type must be REQUIRED type", pathInTree)
					return false
				}

				if keyValueElement.Children.Length() == 2 {
					valueElement, ok := keyValueElement.Children.Get("value")
					if !ok {
						err = fmt.Errorf("%v.key_value: second element must be 'value' element for MAP ConvertedType", pathInTree)
						return false
					}

					if valueElement.Name != "value" {
						err = fmt.Errorf("%v.key_value.value: name must be 'value'", pathInTree)
						return false
					}
				}

			case parquet.ConvertedType_UTF8, parquet.ConvertedType_UINT_8, parquet.ConvertedType_UINT_16:
				fallthrough
			case parquet.ConvertedType_UINT_32, parquet.ConvertedType_UINT_64, parquet.ConvertedType_INT_8:
				fallthrough
			case parquet.ConvertedType_INT_16, parquet.ConvertedType_INT_32, parquet.ConvertedType_INT_64:
				// Scalar converted types annotate a physical type, so one is required.
				if element.Type == nil {
					err = fmt.Errorf("%v: ConvertedType %v must have Type value", pathInTree, element.ConvertedType)
					return false
				}

			default:
				err = fmt.Errorf("%v: unsupported ConvertedType %v", pathInTree, element.ConvertedType)
				return false
			}
		}

		element.PathInTree = pathInTree
		element.PathInSchema = element.Name
		if schemaPrefix != "" {
			element.PathInSchema = schemaPrefix + "." + element.Name
		}

		// Leaf elements carry actual values.
		if element.Type != nil {
			*valueElements = append(*valueElements, element)
		}

		// Append this element before its children: depth-first, parent-first order.
		*schemaList = append(*schemaList, &element.SchemaElement)
		if element.Children != nil {
			element.numChildren = int32(element.Children.Length())
			err = toParquetSchema(element.Children, element.PathInTree, element.PathInSchema, schemaList, valueElements)
		}

		return (err == nil)
	})

	return err
}
|
||||
|
||||
// Tree - represents tree of schema. Tree preserves order in which elements are added.
type Tree struct {
	schemaMap map[string]*Element // Direct children by name.
	keys      []string            // Child names in insertion order; drives Range.
	readOnly  bool                // Set by ToParquetSchema; blocks further mutation.
}
|
||||
|
||||
// String - stringify this tree.
|
||||
func (tree *Tree) String() string {
|
||||
var s []string
|
||||
tree.Range(func(name string, element *Element) bool {
|
||||
s = append(s, fmt.Sprintf("%v: %v", name, element))
|
||||
return true
|
||||
})
|
||||
|
||||
return "{" + strings.Join(s, ", ") + "}"
|
||||
}
|
||||
|
||||
// Length - returns length of tree.
// Only direct children at this level are counted, not descendants.
func (tree *Tree) Length() int {
	return len(tree.keys)
}
|
||||
|
||||
// travel walks pathSegments down the tree. On return, pathSegmentIndex and
// pathSegment identify the last segment visited (the first missing one on a
// miss), currElement is the deepest element reached, parentTree is the tree
// that contains (or would contain) that segment, and found reports whether
// the full path resolved. Note the named results are assigned by the range
// loop itself.
func (tree *Tree) travel(pathSegments []string) (pathSegmentIndex int, pathSegment string, currElement *Element, parentTree *Tree, found bool) {
	parentTree = tree
	for pathSegmentIndex, pathSegment = range pathSegments {
		if tree == nil {
			// Previous element was a leaf; the path descends below it.
			found = false
			break
		}

		var tmpCurrElement *Element
		if tmpCurrElement, found = tree.schemaMap[pathSegment]; !found {
			break
		}
		// Only overwrite currElement on a hit, so a miss preserves the
		// deepest element actually reached.
		currElement = tmpCurrElement

		parentTree = tree
		tree = currElement.Children
	}

	return
}
|
||||
|
||||
// ReadOnly - returns whether this tree is read only or not.
// A tree becomes read only after ToParquetSchema succeeds.
func (tree *Tree) ReadOnly() bool {
	return tree.readOnly
}
|
||||
|
||||
// Get - returns the element stored for name.
|
||||
func (tree *Tree) Get(name string) (element *Element, ok bool) {
|
||||
pathSegments := strings.Split(name, ".")
|
||||
for _, pathSegment := range pathSegments {
|
||||
if tree == nil {
|
||||
element = nil
|
||||
ok = false
|
||||
break
|
||||
}
|
||||
|
||||
if element, ok = tree.schemaMap[pathSegment]; !ok {
|
||||
break
|
||||
}
|
||||
|
||||
tree = element.Children
|
||||
}
|
||||
|
||||
return element, ok
|
||||
}
|
||||
|
||||
// Set - adds or sets element to name.
// name may be a dotted path; all parent segments must already exist and must
// be group elements. Setting an existing name replaces its element; setting a
// new name appends it, preserving insertion order. Returns an error on a
// read-only tree, an invalid name, or a missing/non-group parent.
func (tree *Tree) Set(name string, element *Element) error {
	if tree.readOnly {
		return fmt.Errorf("read only tree")
	}

	pathSegments := strings.Split(name, ".")
	if err := validataPathSegments(pathSegments); err != nil {
		return err
	}

	i, pathSegment, currElement, parentTree, found := tree.travel(pathSegments)

	if !found {
		// Only the final segment may be new; a miss earlier means a
		// missing parent.
		if i != len(pathSegments)-1 {
			return fmt.Errorf("parent %v does not exist", strings.Join(pathSegments[:i+1], "."))
		}

		if currElement == nil {
			// Single-segment path: insert at the root level.
			parentTree = tree
		} else {
			// Parent exists; it must be a group (no physical type).
			if currElement.Type != nil {
				return fmt.Errorf("parent %v is not group element", strings.Join(pathSegments[:i], "."))
			}

			if currElement.Children == nil {
				currElement.Children = NewTree()
			}
			parentTree = currElement.Children
		}

		// Record insertion order only for brand-new names.
		parentTree.keys = append(parentTree.keys, pathSegment)
	}

	parentTree.schemaMap[pathSegment] = element
	return nil
}
|
||||
|
||||
// Delete - deletes name and its element.
|
||||
func (tree *Tree) Delete(name string) {
|
||||
if tree.readOnly {
|
||||
panic(fmt.Errorf("read only tree"))
|
||||
}
|
||||
|
||||
pathSegments := strings.Split(name, ".")
|
||||
|
||||
_, pathSegment, _, parentTree, found := tree.travel(pathSegments)
|
||||
|
||||
if found {
|
||||
for i := range parentTree.keys {
|
||||
if parentTree.keys[i] == pathSegment {
|
||||
copy(parentTree.keys[i:], parentTree.keys[i+1:])
|
||||
parentTree.keys = parentTree.keys[:len(parentTree.keys)-1]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
delete(parentTree.schemaMap, pathSegment)
|
||||
}
|
||||
}
|
||||
|
||||
// Range - calls f sequentially for each name and its element. If f returns false, range stops the iteration.
|
||||
func (tree *Tree) Range(f func(name string, element *Element) bool) {
|
||||
for _, name := range tree.keys {
|
||||
if !f(name, tree.schemaMap[name]) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ToParquetSchema - returns list of parquet SchemaElement and list of elements those stores values.
// It computes definition/repetition levels, flattens the tree depth-first,
// prepends the synthetic REQUIRED root element named "schema", and marks the
// tree read only on success (so it can be called at most once per tree).
func (tree *Tree) ToParquetSchema() (schemaList []*parquet.SchemaElement, valueElements []*Element, err error) {
	if tree.readOnly {
		return nil, nil, fmt.Errorf("read only tree")
	}

	// Levels start at 0/0 at the root.
	updateMaxDLRL(tree.schemaMap, 0, 0)

	var schemaElements []*parquet.SchemaElement
	if err = toParquetSchema(tree, "", "", &schemaElements, &valueElements); err != nil {
		return nil, nil, err
	}

	// Freeze the tree; the flattened schema now aliases its elements.
	tree.readOnly = true

	numChildren := int32(len(tree.keys))
	schemaList = append(schemaList, &parquet.SchemaElement{
		Name:           "schema",
		RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
		NumChildren:    &numChildren,
	})
	schemaList = append(schemaList, schemaElements...)
	return schemaList, valueElements, nil
}
|
||||
|
||||
// NewTree - creates new schema tree.
|
||||
func NewTree() *Tree {
|
||||
return &Tree{
|
||||
schemaMap: make(map[string]*Element),
|
||||
}
|
||||
}
|
1092
pkg/s3select/internal/parquet-go/schema/tree_test.go
Normal file
1092
pkg/s3select/internal/parquet-go/schema/tree_test.go
Normal file
File diff suppressed because it is too large
Load Diff
100
pkg/s3select/internal/parquet-go/table.go
Normal file
100
pkg/s3select/internal/parquet-go/table.go
Normal file
@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
|
||||
func getTableValues(values interface{}, valueType parquet.Type) (tableValues []interface{}) {
|
||||
return valuesToInterfaces(values, valueType)
|
||||
}
|
||||
|
||||
// table holds one column's flattened parquet representation: the raw
// values alongside their definition and repetition level slices.
type table struct {
	RepetitionType     parquet.FieldRepetitionType
	Type               parquet.Type // physical parquet type of Values
	MaxDefinitionLevel int32
	MaxRepetitionLevel int32
	Path               []string      // Path of this column
	Values             []interface{} // Parquet values
	DefinitionLevels   []int32       // Definition Levels slice
	RepetitionLevels   []int32       // Repetition Levels slice
	ConvertedType      parquet.ConvertedType
	Encoding           parquet.Encoding
	BitWidth           int32
}
|
||||
|
||||
func newTableFromTable(srcTable *table) *table {
|
||||
if srcTable == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &table{
|
||||
Type: srcTable.Type,
|
||||
Path: append([]string{}, srcTable.Path...),
|
||||
}
|
||||
}
|
||||
|
||||
func (table *table) Merge(tables ...*table) {
|
||||
for i := 0; i < len(tables); i++ {
|
||||
if tables[i] == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
table.Values = append(table.Values, tables[i].Values...)
|
||||
table.RepetitionLevels = append(table.RepetitionLevels, tables[i].RepetitionLevels...)
|
||||
table.DefinitionLevels = append(table.DefinitionLevels, tables[i].DefinitionLevels...)
|
||||
|
||||
if table.MaxDefinitionLevel < tables[i].MaxDefinitionLevel {
|
||||
table.MaxDefinitionLevel = tables[i].MaxDefinitionLevel
|
||||
}
|
||||
|
||||
if table.MaxRepetitionLevel < tables[i].MaxRepetitionLevel {
|
||||
table.MaxRepetitionLevel = tables[i].MaxRepetitionLevel
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pop - removes up to numRows rows from the front of this table and
// returns them as a new table. A row boundary is an entry whose
// repetition level is 0.
//
// NOTE(review): the returned table's Values/levels are sub-slices sharing
// the original backing arrays with the retained suffix — callers must not
// append to or mutate them in place; confirm against call sites.
func (table *table) Pop(numRows int64) *table {
	result := newTableFromTable(table)
	var i, num int64
	for i = int64(0); i < int64(len(table.Values)); i++ {
		// Repetition level 0 marks the start of a new row.
		if table.RepetitionLevels[i] == 0 {
			if num >= numRows {
				break
			}

			num++
		}

		// Track the maximum levels observed within the popped prefix.
		if result.MaxRepetitionLevel < table.RepetitionLevels[i] {
			result.MaxRepetitionLevel = table.RepetitionLevels[i]
		}

		if result.MaxDefinitionLevel < table.DefinitionLevels[i] {
			result.MaxDefinitionLevel = table.DefinitionLevels[i]
		}
	}

	// Split all three parallel slices at index i: prefix to result,
	// suffix stays in this table.
	result.RepetitionLevels = table.RepetitionLevels[:i]
	result.DefinitionLevels = table.DefinitionLevels[:i]
	result.Values = table.Values[:i]

	table.RepetitionLevels = table.RepetitionLevels[i:]
	table.DefinitionLevels = table.DefinitionLevels[i:]
	table.Values = table.Values[i:]

	return result
}
|
BIN
pkg/s3select/internal/parquet-go/test.parquet
Normal file
BIN
pkg/s3select/internal/parquet-go/test.parquet
Normal file
Binary file not shown.
@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio-go/v6/pkg/set"
|
||||
parquet "github.com/minio/minio/pkg/s3select/internal/parquet-go"
|
||||
)
|
||||
|
||||
func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
|
||||
file, err := os.Open(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fi, err := file.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if offset < 0 {
|
||||
offset = fi.Size() + offset
|
||||
}
|
||||
|
||||
if _, err = file.Seek(offset, os.SEEK_SET); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return file, nil
|
||||
}
|
||||
|
||||
// printUsage - prints the tool's usage line and CSV conversion examples
// to stdout.
func printUsage() {
	prog := path.Base(os.Args[0])

	fmt.Printf("usage: %v PARQUET-FILE [COLUMN...]\n", prog)
	fmt.Println()
	fmt.Printf("examples:\n")
	fmt.Printf("# Convert all columns to CSV\n")
	fmt.Printf("$ %v example.parquet\n", prog)
	fmt.Println()
	fmt.Printf("# Convert specific columns to CSV\n")
	fmt.Printf("$ %v example.par firstname dob\n", prog)
	fmt.Println()
}
|
||||
|
||||
// main - converts the parquet file named on the command line to CSV,
// writing the output next to the input as <name>.csv. Optional trailing
// arguments select a subset of columns.
//
// NOTE(review): os.Exit on the error paths bypasses the deferred
// Close/Flush calls, so a partially written CSV may remain on failure.
func main() {
	if len(os.Args) < 2 {
		printUsage()
		os.Exit(-1)
	}

	name := os.Args[1]
	ext := path.Ext(name)
	csvFilename := name + ".csv"
	// Replace a recognized parquet extension rather than appending to it.
	if ext == ".parquet" || ext == ".par" {
		csvFilename = strings.TrimSuffix(name, ext) + ".csv"
	}

	// Remaining arguments select columns; nil selects all columns.
	columns := set.CreateStringSet(os.Args[2:]...)
	if len(columns) == 0 {
		columns = nil
	}

	file, err := parquet.NewReader(
		func(offset, length int64) (io.ReadCloser, error) {
			return getReader(name, offset, length)
		},
		columns,
	)
	if err != nil {
		fmt.Printf("%v: %v\n", name, err)
		os.Exit(1)
	}

	defer file.Close()

	csvFile, err := os.OpenFile(csvFilename, os.O_RDWR|os.O_CREATE, 0755)
	if err != nil {
		fmt.Printf("%v: %v\n", csvFilename, err)
		os.Exit(1)
	}

	defer csvFile.Close()

	csvWriter := csv.NewWriter(csvFile)
	defer csvWriter.Flush()

	// The header row is emitted once, using the first record's column
	// names in Range order.
	headerWritten := false
	for {
		record, err := file.Read()
		if err != nil {
			if err != io.EOF {
				fmt.Printf("%v: %v\n", name, err)
				os.Exit(1)
			}

			break
		}

		if !headerWritten {
			var csvRecord []string
			record.Range(func(name string, value parquet.Value) bool {
				csvRecord = append(csvRecord, name)
				return true
			})

			if err = csvWriter.Write(csvRecord); err != nil {
				fmt.Printf("%v: %v\n", csvFilename, err)
				os.Exit(1)
			}

			headerWritten = true
		}

		// Emit the record's values in the same Range order as the header.
		var csvRecord []string
		record.Range(func(name string, value parquet.Value) bool {
			csvRecord = append(csvRecord, fmt.Sprintf("%v", value.Value))
			return true
		})

		if err = csvWriter.Write(csvRecord); err != nil {
			fmt.Printf("%v: %v\n", csvFilename, err)
			os.Exit(1)
		}
	}
}
|
@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio-go/v6/pkg/set"
|
||||
parquet "github.com/minio/minio/pkg/s3select/internal/parquet-go"
|
||||
)
|
||||
|
||||
func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
|
||||
file, err := os.Open(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fi, err := file.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if offset < 0 {
|
||||
offset = fi.Size() + offset
|
||||
}
|
||||
|
||||
if _, err = file.Seek(offset, os.SEEK_SET); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return file, nil
|
||||
}
|
||||
|
||||
// printUsage - prints the tool's usage line and JSON conversion examples
// to stdout.
func printUsage() {
	prog := path.Base(os.Args[0])

	fmt.Printf("Usage: %v PARQUET-FILE [COLUMN...]\n", prog)
	fmt.Println()
	fmt.Printf("Examples:\n")
	fmt.Printf("# Convert all columns to JSON\n")
	fmt.Printf("$ %v example.parquet\n", prog)
	fmt.Println()
	fmt.Printf("# Convert specific columns to JSON\n")
	fmt.Printf("$ %v example.par firstname dob\n", prog)
	fmt.Println()
}
|
||||
|
||||
// main - converts the parquet file named on the command line to
// newline-delimited JSON, writing the output next to the input as
// <name>.json. Optional trailing arguments select a subset of columns.
//
// NOTE(review): os.Exit on the error paths bypasses the deferred Close
// calls, so a partially written JSON file may remain on failure.
func main() {
	if len(os.Args) < 2 {
		printUsage()
		os.Exit(-1)
	}

	name := os.Args[1]
	ext := path.Ext(name)
	jsonFilename := name + ".json"
	// Replace a recognized parquet extension rather than appending to it.
	if ext == ".parquet" || ext == ".par" {
		jsonFilename = strings.TrimSuffix(name, ext) + ".json"
	}

	// Remaining arguments select columns; nil selects all columns.
	columns := set.CreateStringSet(os.Args[2:]...)
	if len(columns) == 0 {
		columns = nil
	}

	file, err := parquet.NewReader(
		func(offset, length int64) (io.ReadCloser, error) {
			return getReader(name, offset, length)
		},
		columns,
	)
	if err != nil {
		fmt.Printf("%v: %v\n", name, err)
		os.Exit(1)
	}

	defer file.Close()

	jsonFile, err := os.OpenFile(jsonFilename, os.O_RDWR|os.O_CREATE, 0755)
	if err != nil {
		fmt.Printf("%v: %v\n", jsonFilename, err)
		os.Exit(1)
	}

	defer jsonFile.Close()

	// One JSON object per record, newline-terminated.
	for {
		record, err := file.Read()
		if err != nil {
			if err != io.EOF {
				fmt.Printf("%v: %v\n", name, err)
				os.Exit(1)
			}

			break
		}

		data, err := json.Marshal(record)
		if err != nil {
			fmt.Printf("%v: %v\n", name, err)
			os.Exit(1)
		}
		data = append(data, byte('\n'))

		if _, err = jsonFile.Write(data); err != nil {
			fmt.Printf("%v: %v\n", jsonFilename, err)
			os.Exit(1)
		}
	}
}
|
191
pkg/s3select/internal/parquet-go/writer.go
Normal file
191
pkg/s3select/internal/parquet-go/writer.go
Normal file
@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/data"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
// Default sizing knobs assigned to a new Writer by NewWriter.
const (
	defaultPageSize     = 8 * 1024          // 8 KiB
	defaultRowGroupSize = 128 * 1024 * 1024 // 128 MiB
)
|
||||
|
||||
// Writer - represents parquet writer.
type Writer struct {
	PageSize        int64                    // initialized to defaultPageSize; consumer not visible here — TODO confirm
	RowGroupSize    int64                    // initialized to defaultRowGroupSize; consumer not visible here — TODO confirm
	CompressionType parquet.CompressionCodec // initialized to SNAPPY by NewWriter

	writeCloser   io.WriteCloser          // destination of the encoded parquet bytes
	numRows       int64                   // records buffered since the last row-group flush
	offset        int64                   // absolute file offset; starts at 4, after the "PAR1" magic
	footer        *parquet.FileMetaData   // accumulated file metadata, serialized on Close
	schemaTree    *schema.Tree            // schema used to interpret incoming records
	valueElements []*schema.Element       // value-bearing schema elements, keyed by PathInTree
	columnDataMap map[string]*data.Column // buffered column data for the current row group
	rowGroupCount int                     // number of records that triggers a row-group flush
}
|
||||
|
||||
// writeData - encodes the buffered rows into a single parquet row group,
// writes it to the underlying writer, records it in the footer, and
// resets the row buffer. No-op when nothing is buffered.
func (writer *Writer) writeData() (err error) {
	if writer.numRows == 0 {
		return nil
	}

	// Encode each schema value column that has buffered data; columns
	// absent from the buffer are skipped.
	var chunks []*data.ColumnChunk
	for _, element := range writer.valueElements {
		name := element.PathInTree
		columnData, found := writer.columnDataMap[name]
		if !found {
			continue
		}

		columnChunk := columnData.Encode(element)
		chunks = append(chunks, columnChunk)
	}

	// Row-group metadata is built against the current file offset, i.e.
	// before the chunk bytes are written.
	rowGroup := data.NewRowGroup(chunks, writer.numRows, writer.offset)

	for _, chunk := range chunks {
		if _, err = writer.writeCloser.Write(chunk.Data()); err != nil {
			return err
		}

		// Advance the absolute offset for subsequent row groups.
		writer.offset += chunk.DataLen()
	}

	writer.footer.RowGroups = append(writer.footer.RowGroups, rowGroup)
	writer.footer.NumRows += writer.numRows

	// Reset the buffer for the next row group.
	writer.numRows = 0
	writer.columnDataMap = nil
	return nil
}
|
||||
|
||||
// WriteJSON - writes a record represented in JSON.
|
||||
func (writer *Writer) WriteJSON(recordData []byte) (err error) {
|
||||
columnDataMap, err := data.UnmarshalJSON(recordData, writer.schemaTree)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return writer.Write(columnDataMap)
|
||||
}
|
||||
|
||||
// Write - writes a record represented in map.
|
||||
func (writer *Writer) Write(record map[string]*data.Column) (err error) {
|
||||
if writer.columnDataMap == nil {
|
||||
writer.columnDataMap = record
|
||||
} else {
|
||||
for name, columnData := range record {
|
||||
var found bool
|
||||
var element *schema.Element
|
||||
for _, element = range writer.valueElements {
|
||||
if element.PathInTree == name {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
return fmt.Errorf("%v is not value column", name)
|
||||
}
|
||||
|
||||
writer.columnDataMap[name].Merge(columnData)
|
||||
}
|
||||
}
|
||||
|
||||
writer.numRows++
|
||||
if writer.numRows == int64(writer.rowGroupCount) {
|
||||
return writer.writeData()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// finalize - flushes any pending rows, then writes the parquet file
// trailer: the thrift-compact-serialized footer, its length as a 4-byte
// little-endian integer, and the closing "PAR1" magic bytes.
func (writer *Writer) finalize() (err error) {
	if err = writer.writeData(); err != nil {
		return err
	}

	// Serialize the footer with the thrift compact protocol.
	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	footerBuf, err := ts.Write(context.TODO(), writer.footer)
	if err != nil {
		return err
	}

	if _, err = writer.writeCloser.Write(footerBuf); err != nil {
		return err
	}

	// The footer length is written AFTER the footer itself, so a reader
	// can locate the footer from the end of the file.
	footerSizeBuf := make([]byte, 4)
	binary.LittleEndian.PutUint32(footerSizeBuf, uint32(len(footerBuf)))

	if _, err = writer.writeCloser.Write(footerSizeBuf); err != nil {
		return err
	}

	_, err = writer.writeCloser.Write([]byte("PAR1"))
	return err
}
|
||||
|
||||
// Close - finalizes and closes writer. If any pending records are available, they are written here.
|
||||
func (writer *Writer) Close() (err error) {
|
||||
if err = writer.finalize(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return writer.writeCloser.Close()
|
||||
}
|
||||
|
||||
// NewWriter - creates new parquet writer. Binary data of rowGroupCount records are written to writeCloser.
|
||||
func NewWriter(writeCloser io.WriteCloser, schemaTree *schema.Tree, rowGroupCount int) (*Writer, error) {
|
||||
if _, err := writeCloser.Write([]byte("PAR1")); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
schemaList, valueElements, err := schemaTree.ToParquetSchema()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
footer := parquet.NewFileMetaData()
|
||||
footer.Version = 1
|
||||
footer.Schema = schemaList
|
||||
|
||||
return &Writer{
|
||||
PageSize: defaultPageSize,
|
||||
RowGroupSize: defaultRowGroupSize,
|
||||
CompressionType: parquet.CompressionCodec_SNAPPY,
|
||||
|
||||
writeCloser: writeCloser,
|
||||
offset: 4,
|
||||
footer: footer,
|
||||
schemaTree: schemaTree,
|
||||
valueElements: valueElements,
|
||||
rowGroupCount: rowGroupCount,
|
||||
}, nil
|
||||
}
|
152
pkg/s3select/internal/parquet-go/writer_test.go
Normal file
152
pkg/s3select/internal/parquet-go/writer_test.go
Normal file
@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/data"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
// TestWriterWrite exercises Writer.Write with a single record of three
// REQUIRED columns (INT32/INT_16, BYTE_ARRAY/UTF8, BOOLEAN).
//
// NOTE(review): this writes "test.parquet" into the working directory
// and never removes it — consider t.TempDir(); first confirm no other
// test relies on the committed test.parquet fixture being regenerated.
func TestWriterWrite(t *testing.T) {
	// Build a flat three-column schema.
	schemaTree := schema.NewTree()
	{
		one, err := schema.NewElement("one", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		two, err := schema.NewElement("two", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		three, err := schema.NewElement("three", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_BOOLEAN), nil, nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		if err := schemaTree.Set("one", one); err != nil {
			t.Fatal(err)
		}
		if err := schemaTree.Set("two", two); err != nil {
			t.Fatal(err)
		}
		if err := schemaTree.Set("three", three); err != nil {
			t.Fatal(err)
		}
	}

	file, err := os.Create("test.parquet")
	if err != nil {
		t.Fatal(err)
	}

	writer, err := NewWriter(file, schemaTree, 100)
	if err != nil {
		t.Fatal(err)
	}

	// One row: (100, "foo", true), all at definition/repetition level 0.
	oneColumn := data.NewColumn(parquet.Type_INT32)
	oneColumn.AddInt32(100, 0, 0)

	twoColumn := data.NewColumn(parquet.Type_BYTE_ARRAY)
	twoColumn.AddByteArray([]byte("foo"), 0, 0)

	threeColumn := data.NewColumn(parquet.Type_BOOLEAN)
	threeColumn.AddBoolean(true, 0, 0)

	record := map[string]*data.Column{
		"one":   oneColumn,
		"two":   twoColumn,
		"three": threeColumn,
	}

	err = writer.Write(record)
	if err != nil {
		t.Fatal(err)
	}

	// Close flushes the pending row and writes the footer; it also
	// closes the underlying file.
	err = writer.Close()
	if err != nil {
		t.Fatal(err)
	}
}
|
||||
|
||||
// TestWriterWriteJSON exercises Writer.WriteJSON with a single JSON
// record against the same three-column REQUIRED schema as TestWriterWrite.
//
// NOTE(review): writes "test.parquet" into the working directory without
// cleanup, and overwrites whatever TestWriterWrite produced — consider
// distinct names under t.TempDir().
func TestWriterWriteJSON(t *testing.T) {
	// Build a flat three-column schema.
	schemaTree := schema.NewTree()
	{
		one, err := schema.NewElement("one", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		two, err := schema.NewElement("two", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
			nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		three, err := schema.NewElement("three", parquet.FieldRepetitionType_REQUIRED,
			parquet.TypePtr(parquet.Type_BOOLEAN), nil, nil, nil, nil)
		if err != nil {
			t.Fatal(err)
		}

		if err := schemaTree.Set("one", one); err != nil {
			t.Fatal(err)
		}
		if err := schemaTree.Set("two", two); err != nil {
			t.Fatal(err)
		}
		if err := schemaTree.Set("three", three); err != nil {
			t.Fatal(err)
		}
	}

	file, err := os.Create("test.parquet")
	if err != nil {
		t.Fatal(err)
	}

	writer, err := NewWriter(file, schemaTree, 100)
	if err != nil {
		t.Fatal(err)
	}

	// One row supplied as JSON; values must match the schema types.
	record := `{"one": 100, "two": "foo", "three": true}`
	err = writer.WriteJSON([]byte(record))
	if err != nil {
		t.Fatal(err)
	}

	// Close flushes the pending row, writes the footer, and closes the file.
	err = writer.Close()
	if err != nil {
		t.Fatal(err)
	}
}
|
@ -20,10 +20,10 @@ import (
|
||||
"io"
|
||||
|
||||
"github.com/bcicen/jstream"
|
||||
parquetgo "github.com/minio/minio/pkg/s3select/internal/parquet-go"
|
||||
parquetgen "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
jsonfmt "github.com/minio/minio/pkg/s3select/json"
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
parquetgo "github.com/minio/parquet-go"
|
||||
parquetgen "github.com/minio/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// Reader - Parquet record reader for S3Select.
|
||||
|
Loading…
Reference in New Issue
Block a user