diff --git a/go.mod b/go.mod index beebe94d2..cf1487973 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.13 require ( cloud.google.com/go v0.39.0 contrib.go.opencensus.io/exporter/ocagent v0.5.0 // indirect + git.apache.org/thrift.git v0.13.0 github.com/Azure/azure-pipeline-go v0.2.1 github.com/Azure/azure-storage-blob-go v0.8.0 github.com/Azure/go-autorest v11.7.1+incompatible // indirect @@ -69,7 +70,6 @@ require ( github.com/minio/cli v1.22.0 github.com/minio/highwayhash v1.0.0 github.com/minio/minio-go/v6 v6.0.58-0.20200612001654-a57fec8037ec - github.com/minio/parquet-go v0.0.0-20200414234858-838cfa8aae61 github.com/minio/sha256-simd v0.1.1 github.com/minio/simdjson-go v0.1.5-0.20200303142138-b17fe061ea37 github.com/minio/sio v0.2.0 @@ -90,6 +90,7 @@ require ( github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect github.com/nsqio/go-nsq v1.0.7 github.com/philhofer/fwd v1.0.0 // indirect + github.com/pierrec/lz4 v2.4.0+incompatible github.com/pkg/errors v0.8.1 github.com/prometheus/client_golang v0.9.3 github.com/rcrowley/go-metrics v0.0.0-20190704165056-9c2d0518ed81 // indirect @@ -103,6 +104,8 @@ require ( github.com/soheilhy/cmux v0.1.4 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/streadway/amqp v0.0.0-20190404075320-75d898a42a94 + github.com/tidwall/gjson v1.3.5 + github.com/tidwall/sjson v1.0.4 github.com/tinylib/msgp v1.1.1 github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5 // indirect github.com/ugorji/go v1.1.5-pre // indirect diff --git a/pkg/s3select/internal/parquet-go/LICENSE b/pkg/s3select/internal/parquet-go/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/pkg/s3select/internal/parquet-go/Makefile b/pkg/s3select/internal/parquet-go/Makefile new file mode 100644 index 000000000..3f9a95084 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/Makefile @@ -0,0 +1,36 @@ +GOPATH := $(shell go env GOPATH) + +all: check + +getdeps: + @if [ ! -f ${GOPATH}/bin/golint ]; then echo "Installing golint" && go get -u golang.org/x/lint/golint; fi + @if [ ! -f ${GOPATH}/bin/gocyclo ]; then echo "Installing gocyclo" && go get -u github.com/fzipp/gocyclo; fi + @if [ ! -f ${GOPATH}/bin/misspell ]; then echo "Installing misspell" && go get -u github.com/client9/misspell/cmd/misspell; fi + @if [ ! -f ${GOPATH}/bin/ineffassign ]; then echo "Installing ineffassign" && go get -u github.com/gordonklaus/ineffassign; fi + +vet: + @echo "Running $@" + @go vet *.go + +fmt: + @echo "Running $@" + @gofmt -d *.go + +lint: + @echo "Running $@" + @${GOPATH}/bin/golint -set_exit_status + +cyclo: + @echo "Running $@" + @${GOPATH}/bin/gocyclo -over 200 . 
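+# gocyclo -over 200 flags functions whose cyclomatic complexity exceeds 200.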
+ +spelling: + @${GOPATH}/bin/misspell -locale US -error *.go README.md + +ineffassign: + @echo "Running $@" + @${GOPATH}/bin/ineffassign . + +check: getdeps vet fmt lint cyclo spelling ineffassign + @echo "Running unit tests" + @go test -tags kqueue ./... diff --git a/pkg/s3select/internal/parquet-go/README.md b/pkg/s3select/internal/parquet-go/README.md new file mode 100644 index 000000000..f38f9c78e --- /dev/null +++ b/pkg/s3select/internal/parquet-go/README.md @@ -0,0 +1,3 @@ +# parquet-go + +Modified version of https://github.com/xitongsys/parquet-go diff --git a/pkg/s3select/internal/parquet-go/column.go b/pkg/s3select/internal/parquet-go/column.go new file mode 100644 index 000000000..5ee5ef5b0 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/column.go @@ -0,0 +1,154 @@ +/* + * Minio Cloud Storage, (C) 2018 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package parquet + +import ( + "io" + "strings" + + "git.apache.org/thrift.git/lib/go/thrift" + "github.com/minio/minio-go/v6/pkg/set" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func getColumns( + rowGroup *parquet.RowGroup, + columnNames set.StringSet, + schemaElements []*parquet.SchemaElement, + getReaderFunc GetReaderFunc, +) (nameColumnMap map[string]*column, err error) { + nameIndexMap := make(map[string]int) + for colIndex, columnChunk := range rowGroup.GetColumns() { + meta := columnChunk.GetMetaData() + columnName := strings.Join(meta.GetPathInSchema(), ".") + if columnNames != nil && !columnNames.Contains(columnName) { + continue + } + + // Ignore column spanning into another file. + if columnChunk.GetFilePath() != "" { + continue + } + + offset := meta.GetDataPageOffset() + if meta.DictionaryPageOffset != nil { + offset = meta.GetDictionaryPageOffset() + } + + size := meta.GetTotalCompressedSize() + + rc, err := getReaderFunc(offset, size) + if err != nil { + return nil, err + } + + thriftReader := thrift.NewTBufferedTransport(thrift.NewStreamTransportR(rc), int(size)) + + if nameColumnMap == nil { + nameColumnMap = make(map[string]*column) + } + + nameColumnMap[columnName] = &column{ + name: columnName, + metadata: meta, + schemaElements: schemaElements, + rc: rc, + thriftReader: thriftReader, + valueType: meta.GetType(), + } + + // First element of []*parquet.SchemaElement from parquet file metadata is 'schema' + // which is always skipped, hence colIndex + 1 is valid. 
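+		// For example, with schema elements [schema, a, b], column "a" at
+		// colIndex 0 maps to nameIndexMap["a"] == 1.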
+ nameIndexMap[columnName] = colIndex + 1 + } + + for name := range nameColumnMap { + nameColumnMap[name].nameIndexMap = nameIndexMap + } + + return nameColumnMap, nil +} + +type column struct { + name string + endOfValues bool + valueIndex int + valueType parquet.Type + metadata *parquet.ColumnMetaData + schemaElements []*parquet.SchemaElement + nameIndexMap map[string]int + dictPage *page + dataTable *table + rc io.ReadCloser + thriftReader *thrift.TBufferedTransport +} + +func (column *column) close() (err error) { + if column.rc != nil { + err = column.rc.Close() + column.rc = nil + } + + return err +} + +func (column *column) readPage() { + page, _, _, err := readPage( + column.thriftReader, + column.metadata, + column.nameIndexMap, + column.schemaElements, + ) + + if err != nil { + column.endOfValues = true + return + } + + if page.Header.GetType() == parquet.PageType_DICTIONARY_PAGE { + column.dictPage = page + column.readPage() + return + } + + page.decode(column.dictPage) + + if column.dataTable == nil { + column.dataTable = newTableFromTable(page.DataTable) + } + + column.dataTable.Merge(page.DataTable) +} + +func (column *column) read() (value interface{}, valueType parquet.Type) { + if column.dataTable == nil { + column.readPage() + column.valueIndex = 0 + } + + if column.endOfValues { + return nil, column.metadata.GetType() + } + + value = column.dataTable.Values[column.valueIndex] + column.valueIndex++ + if len(column.dataTable.Values) == column.valueIndex { + column.dataTable = nil + } + + return value, column.metadata.GetType() +} diff --git a/pkg/s3select/internal/parquet-go/common.go b/pkg/s3select/internal/parquet-go/common.go new file mode 100644 index 000000000..e1d825bb7 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/common.go @@ -0,0 +1,95 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package parquet + +import ( + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func valuesToInterfaces(values interface{}, valueType parquet.Type) (tableValues []interface{}) { + switch valueType { + case parquet.Type_BOOLEAN: + for _, v := range values.([]bool) { + tableValues = append(tableValues, v) + } + case parquet.Type_INT32: + for _, v := range values.([]int32) { + tableValues = append(tableValues, v) + } + case parquet.Type_INT64: + for _, v := range values.([]int64) { + tableValues = append(tableValues, v) + } + case parquet.Type_FLOAT: + for _, v := range values.([]float32) { + tableValues = append(tableValues, v) + } + case parquet.Type_DOUBLE: + for _, v := range values.([]float64) { + tableValues = append(tableValues, v) + } + case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY: + for _, v := range values.([][]byte) { + tableValues = append(tableValues, v) + } + } + + return tableValues +} + +func interfacesToValues(values []interface{}, valueType parquet.Type) interface{} { + switch valueType { + case parquet.Type_BOOLEAN: + bs := make([]bool, len(values)) + for i := range values { + bs[i] = values[i].(bool) + } + return bs + case parquet.Type_INT32: + i32s := make([]int32, len(values)) + for i := range values { + i32s[i] = values[i].(int32) + } + return i32s + case parquet.Type_INT64: + i64s := make([]int64, len(values)) + for i := range values { + i64s[i] = values[i].(int64) + } + return i64s + case parquet.Type_FLOAT: + f32s := make([]float32, len(values)) + for i := range values { + f32s[i] = values[i].(float32) + } + return f32s + case parquet.Type_DOUBLE: + f64s := make([]float64, len(values)) + for i := range values { + f64s[i] = values[i].(float64) + } + return f64s + case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY: + array := make([][]byte, len(values)) + for i := range values { + array[i] = values[i].([]byte) + } + return array + } + + return nil +} diff --git a/pkg/s3select/internal/parquet-go/common/common.go b/pkg/s3select/internal/parquet-go/common/common.go new file mode 100644 index 000000000..8df338f0f --- /dev/null +++ b/pkg/s3select/internal/parquet-go/common/common.go @@ -0,0 +1,144 @@ +package common + +import ( + "bytes" + "compress/gzip" + "fmt" + "io/ioutil" + + "github.com/klauspost/compress/snappy" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/pierrec/lz4" +) + +// ToSliceValue converts values to a slice value. 
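+// For example, ToSliceValue([]interface{}{int32(1), int32(2)}, parquet.Type_INT32)
+// returns []int32{1, 2}; parquet types without a case below (e.g. INT96 and
+// FIXED_LEN_BYTE_ARRAY) yield nil.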
+func ToSliceValue(values []interface{}, parquetType parquet.Type) interface{} {
+	switch parquetType {
+	case parquet.Type_BOOLEAN:
+		bs := make([]bool, len(values))
+		for i := range values {
+			bs[i] = values[i].(bool)
+		}
+		return bs
+	case parquet.Type_INT32:
+		i32s := make([]int32, len(values))
+		for i := range values {
+			i32s[i] = values[i].(int32)
+		}
+		return i32s
+	case parquet.Type_INT64:
+		i64s := make([]int64, len(values))
+		for i := range values {
+			i64s[i] = values[i].(int64)
+		}
+		return i64s
+	case parquet.Type_FLOAT:
+		f32s := make([]float32, len(values))
+		for i := range values {
+			f32s[i] = values[i].(float32)
+		}
+		return f32s
+	case parquet.Type_DOUBLE:
+		f64s := make([]float64, len(values))
+		for i := range values {
+			f64s[i] = values[i].(float64)
+		}
+		return f64s
+	case parquet.Type_BYTE_ARRAY:
+		array := make([][]byte, len(values))
+		for i := range values {
+			array[i] = values[i].([]byte)
+		}
+		return array
+	}
+
+	return nil
+}
+
+// BitWidth returns the number of bits required to represent the given value, e.g. BitWidth(5) == 3.
+func BitWidth(ui64 uint64) (width int32) {
+	for ; ui64 != 0; ui64 >>= 1 {
+		width++
+	}
+
+	return width
+}
+
+// Compress compresses the given data.
+func Compress(compressionType parquet.CompressionCodec, data []byte) ([]byte, error) {
+	switch compressionType {
+	case parquet.CompressionCodec_UNCOMPRESSED:
+		return data, nil
+
+	case parquet.CompressionCodec_SNAPPY:
+		return snappy.Encode(nil, data), nil
+
+	case parquet.CompressionCodec_GZIP:
+		buf := new(bytes.Buffer)
+		writer := gzip.NewWriter(buf)
+		n, err := writer.Write(data)
+		if err != nil {
+			return nil, err
+		}
+		if n != len(data) {
+			return nil, fmt.Errorf("short write")
+		}
+
+		if err = writer.Flush(); err != nil {
+			return nil, err
+		}
+
+		if err = writer.Close(); err != nil {
+			return nil, err
+		}
+
+		return buf.Bytes(), nil
+
+	case parquet.CompressionCodec_LZ4:
+		buf := new(bytes.Buffer)
+		writer := lz4.NewWriter(buf)
+		n, err := writer.Write(data)
+		if err != nil {
+			return nil, err
+		}
+		if n != len(data) {
+			return nil, fmt.Errorf("short write")
+		}
+
+		if err = writer.Flush(); err != nil {
+			return nil, err
+		}
+
+		if err = writer.Close(); err != nil {
+			return nil, err
+		}
+
+		return buf.Bytes(), nil
+	}
+
+	return nil, fmt.Errorf("unsupported compression codec %v", compressionType)
+}
+
+// Uncompress decompresses the given data.
+func Uncompress(compressionType parquet.CompressionCodec, data []byte) ([]byte, error) {
+	switch compressionType {
+	case parquet.CompressionCodec_UNCOMPRESSED:
+		return data, nil
+
+	case parquet.CompressionCodec_SNAPPY:
+		return snappy.Decode(nil, data)
+
+	case parquet.CompressionCodec_GZIP:
+		reader, err := gzip.NewReader(bytes.NewReader(data))
+		if err != nil {
+			return nil, err
+		}
+		defer reader.Close()
+		return ioutil.ReadAll(reader)
+
+	case parquet.CompressionCodec_LZ4:
+		return ioutil.ReadAll(lz4.NewReader(bytes.NewReader(data)))
+	}
+
+	return nil, fmt.Errorf("unsupported compression codec %v", compressionType)
+}
diff --git a/pkg/s3select/internal/parquet-go/compression.go b/pkg/s3select/internal/parquet-go/compression.go
new file mode 100644
index 000000000..f9a1e5e52
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/compression.go
@@ -0,0 +1,127 @@
+/*
+ * Minio Cloud Storage, (C) 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package parquet
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"sync"
+
+	"github.com/klauspost/compress/gzip"
+	"github.com/klauspost/compress/snappy"
+	"github.com/klauspost/compress/zstd"
+	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
+	"github.com/pierrec/lz4"
+)
+
+type compressionCodec parquet.CompressionCodec
+
+var zstdOnce sync.Once
+var zstdEnc *zstd.Encoder
+var zstdDec *zstd.Decoder
+
+func initZstd() {
+	zstdOnce.Do(func() {
+		zstdEnc, _ = zstd.NewWriter(nil, zstd.WithZeroFrames(true))
+		zstdDec, _ = zstd.NewReader(nil)
+	})
+}
+
+func (c compressionCodec) compress(buf []byte) ([]byte, error) {
+	switch parquet.CompressionCodec(c) {
+	case parquet.CompressionCodec_UNCOMPRESSED:
+		return buf, nil
+
+	case parquet.CompressionCodec_SNAPPY:
+		return snappy.Encode(nil, buf), nil
+
+	case parquet.CompressionCodec_GZIP:
+		byteBuf := new(bytes.Buffer)
+		writer := gzip.NewWriter(byteBuf)
+		n, err := writer.Write(buf)
+		if err != nil {
+			return nil, err
+		}
+		if n != len(buf) {
+			return nil, fmt.Errorf("short write")
+		}
+
+		if err = writer.Flush(); err != nil {
+			return nil, err
+		}
+
+		if err = writer.Close(); err != nil {
+			return nil, err
+		}
+
+		return byteBuf.Bytes(), nil
+
+	case parquet.CompressionCodec_LZ4:
+		byteBuf := new(bytes.Buffer)
+		writer := lz4.NewWriter(byteBuf)
+		n, err := writer.Write(buf)
+		if err != nil {
+			return nil, err
+		}
+		if n != len(buf) {
+			return nil, fmt.Errorf("short write")
+		}
+
+		if err = writer.Flush(); err != nil {
+			return nil, err
+		}
+
+		if err = writer.Close(); err != nil {
+			return nil, err
+		}
+
+		return byteBuf.Bytes(), nil
+	case parquet.CompressionCodec_ZSTD:
+		initZstd()
+		return zstdEnc.EncodeAll(buf, nil), nil
+	}
+
+	return nil, fmt.Errorf("invalid compression codec %v", c)
+}
+
+func (c compressionCodec) uncompress(buf []byte) ([]byte, error) {
+	switch parquet.CompressionCodec(c) {
+	case parquet.CompressionCodec_UNCOMPRESSED:
+		return buf, nil
+
+	case parquet.CompressionCodec_SNAPPY:
+		return snappy.Decode(nil, buf)
+
+	case parquet.CompressionCodec_GZIP:
+		reader, err := gzip.NewReader(bytes.NewReader(buf))
+		if err != nil {
+			return nil, err
+		}
+		defer reader.Close()
+		return ioutil.ReadAll(reader)
+
+	case parquet.CompressionCodec_LZ4:
+		return ioutil.ReadAll(lz4.NewReader(bytes.NewReader(buf)))
+
+	case parquet.CompressionCodec_ZSTD:
+		initZstd()
+		return zstdDec.DecodeAll(buf, nil)
+	}
+
+	return nil, fmt.Errorf("invalid compression codec %v", c)
+}
diff --git a/pkg/s3select/internal/parquet-go/data/column-grouplist_test.go b/pkg/s3select/internal/parquet-go/data/column-grouplist_test.go
new file mode 100644
index 000000000..cda76013a
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/data/column-grouplist_test.go
@@ -0,0 +1,618 @@
+/*
+ * Minio Cloud Storage, (C) 2019 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package data + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" +) + +func TestPopulateGroupList(t *testing.T) { + requiredList1 := schema.NewTree() + { + requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList1.Set("group", requiredGroup); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("group.list.element", requiredElement); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("group.list.element.col", requiredCol); err != nil { + t.Fatal(err) + } + + if _, _, err := requiredList1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredList2 := schema.NewTree() + { + requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList2.Set("group", requiredGroup); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("group.list.element", requiredElement); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("group.list.element.col", optionalCol); err != nil { + t.Fatal(err) + } + + if _, _, err := requiredList2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredList3 := schema.NewTree() + { + requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalElement, err := 
schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList3.Set("group", requiredGroup); err != nil { + t.Fatal(err) + } + if err = requiredList3.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList3.Set("group.list.element", optionalElement); err != nil { + t.Fatal(err) + } + if err = requiredList3.Set("group.list.element.col", requiredCol); err != nil { + t.Fatal(err) + } + + if _, _, err := requiredList3.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredList4 := schema.NewTree() + { + requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList4.Set("group", requiredGroup); err != nil { + t.Fatal(err) + } + if err = requiredList4.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList4.Set("group.list.element", optionalElement); err != nil { + t.Fatal(err) + } + if err = requiredList4.Set("group.list.element.col", optionalCol); err != nil { + t.Fatal(err) + } + + if _, _, err := requiredList4.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalList1 := schema.NewTree() + { + optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalList1.Set("group", optionalGroup); err != nil { + t.Fatal(err) + } + if err = optionalList1.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = optionalList1.Set("group.list.element", requiredElement); err != nil { + t.Fatal(err) + } + if err = optionalList1.Set("group.list.element.col", requiredCol); err != nil { + t.Fatal(err) + } + + if _, _, err := optionalList1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalList2 := schema.NewTree() + { + optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", 
parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalList2.Set("group", optionalGroup); err != nil { + t.Fatal(err) + } + if err = optionalList2.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = optionalList2.Set("group.list.element", requiredElement); err != nil { + t.Fatal(err) + } + if err = optionalList2.Set("group.list.element.col", optionalCol); err != nil { + t.Fatal(err) + } + + if _, _, err := optionalList2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalList3 := schema.NewTree() + { + optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalList3.Set("group", optionalGroup); err != nil { + t.Fatal(err) + } + if err = optionalList3.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = optionalList3.Set("group.list.element", optionalElement); err != nil { + t.Fatal(err) + } + if err = optionalList3.Set("group.list.element.col", requiredCol); err != nil { + t.Fatal(err) + } + + if _, _, err := optionalList3.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalList4 := schema.NewTree() + { + optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalList4.Set("group", optionalGroup); err != nil { + t.Fatal(err) + } + if err = optionalList4.Set("group.list", list); err != nil { + t.Fatal(err) + } + if err = optionalList4.Set("group.list.element", optionalElement); err != nil { + t.Fatal(err) + } + if err = optionalList4.Set("group.list.element.col", optionalCol); err != nil { + t.Fatal(err) + } + + if _, _, err := optionalList4.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + result1 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + 
definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result2 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20}, + definitionLevels: []int64{1, 1}, + repetitionLevels: []int64{0, 1}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v20, + }, + } + + result3 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result4 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result5 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20}, + definitionLevels: []int64{2, 2}, + repetitionLevels: []int64{0, 1}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v20, + }, + } + + result6 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result7 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{3}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result8 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20}, + definitionLevels: []int64{3, 3}, + repetitionLevels: []int64{0, 1}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v20, + }, + } + + result9 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result10 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{3}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result11 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{4}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result12 := map[string]*Column{ + "group.list.element.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20}, + definitionLevels: []int64{4, 4}, + repetitionLevels: []int64{0, 1}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v20, + }, + } + + testCases := []struct { + schemaTree *schema.Tree + data string + expectedResult map[string]*Column + expectErr bool + }{ + {requiredList1, `{}`, nil, true}, // err: group: nil value for required field + {requiredList1, `{"group": null}`, nil, true}, // err: group: nil value for required field + {requiredList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field + {requiredList1, `{"group": [{"col": 10}]}`, result1, false}, + {requiredList1, `{"group": [{"col": 10}, {"col": 20}]}`, result2, false}, + {requiredList2, `{}`, nil, true}, // err: 
group: nil value for required field + {requiredList2, `{"group": null}`, nil, true}, // err: group: nil value for required field + {requiredList2, `{"group": [{"col": null}]}`, result3, false}, + {requiredList2, `{"group": [{"col": 10}]}`, result4, false}, + {requiredList2, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false}, + {requiredList3, `{}`, nil, true}, // err: group: nil value for required field + {requiredList3, `{"group": null}`, nil, true}, // err: group: nil value for required field + {requiredList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field + {requiredList3, `{"group": [{"col": 10}]}`, result4, false}, + {requiredList3, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false}, + {requiredList4, `{}`, nil, true}, // err: group: nil value for required field + {requiredList4, `{"group": null}`, nil, true}, // err: group: nil value for required field + {requiredList4, `{"group": [{"col": null}]}`, result6, false}, + {requiredList4, `{"group": [{"col": 10}]}`, result7, false}, + {requiredList4, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false}, + {optionalList1, `{}`, result9, false}, + {optionalList1, `{"group": null}`, result9, false}, + {optionalList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field + {optionalList1, `{"group": [{"col": 10}]}`, result4, false}, + {optionalList1, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false}, + {optionalList2, `{}`, result9, false}, + {optionalList2, `{"group": null}`, result9, false}, + {optionalList2, `{"group": [{"col": null}]}`, result6, false}, + {optionalList2, `{"group": [{"col": 10}]}`, result7, false}, + {optionalList2, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false}, + {optionalList3, `{}`, result9, false}, + {optionalList3, `{"group": null}`, result9, false}, + {optionalList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field + {optionalList3, `{"group": [{"col": 10}]}`, result7, false}, + {optionalList3, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false}, + {optionalList4, `{}`, result9, false}, + {optionalList4, `{"group": null}`, result9, false}, + {optionalList4, `{"group": [{"col": null}]}`, result10, false}, + {optionalList4, `{"group": [{"col": 10}]}`, result11, false}, + {optionalList4, `{"group": [{"col": 10}, {"col": 20}]}`, result12, false}, + } + + for i, testCase := range testCases { + result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree) + expectErr := (err != nil) + + if testCase.expectErr != expectErr { + t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr) + } + + if !testCase.expectErr { + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } + } +} diff --git a/pkg/s3select/internal/parquet-go/data/column-grouptype_test.go b/pkg/s3select/internal/parquet-go/data/column-grouptype_test.go new file mode 100644 index 000000000..b1f957382 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/column-grouptype_test.go @@ -0,0 +1,237 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package data + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" +) + +func TestPopulateGroupType(t *testing.T) { + requiredGroup1 := schema.NewTree() + { + requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredGroup1.Set("group", requiredGroup); err != nil { + t.Fatal(err) + } + if err = requiredGroup1.Set("group.col", requiredCol); err != nil { + t.Fatal(err) + } + + if _, _, err := requiredGroup1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredGroup2 := schema.NewTree() + { + requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredGroup2.Set("group", requiredGroup); err != nil { + t.Fatal(err) + } + if err = requiredGroup2.Set("group.col", optionalCol); err != nil { + t.Fatal(err) + } + + if _, _, err := requiredGroup2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalGroup1 := schema.NewTree() + { + optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalGroup1.Set("group", optionalGroup); err != nil { + t.Fatal(err) + } + if err = optionalGroup1.Set("group.col", requiredCol); err != nil { + t.Fatal(err) + } + + if _, _, err := optionalGroup1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalGroup2 := schema.NewTree() + { + optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalGroup2.Set("group", optionalGroup); err != nil { + t.Fatal(err) + } + if err = optionalGroup2.Set("group.col", optionalCol); err != nil { + t.Fatal(err) + } + + if _, _, err := optionalGroup2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + result1 := map[string]*Column{ + "group.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{0}, + repetitionLevels: 
[]int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result2 := map[string]*Column{ + "group.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result3 := map[string]*Column{ + "group.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result4 := map[string]*Column{ + "group.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result5 := map[string]*Column{ + "group.col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + testCases := []struct { + schemaTree *schema.Tree + data string + expectedResult map[string]*Column + expectErr bool + }{ + {requiredGroup1, `{}`, nil, true}, // err: group: nil value for required field + {requiredGroup1, `{"group": null}`, nil, true}, // err: group: nil value for required field + {requiredGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field + {requiredGroup1, `{"group": {"col": 10}}`, result1, false}, + {requiredGroup2, `{}`, nil, true}, // err: group: nil value for required field + {requiredGroup2, `{"group": null}`, nil, true}, // err: group: nil value for required field + {requiredGroup2, `{"group": {"col": null}}`, result2, false}, + {requiredGroup2, `{"group": {"col": 10}}`, result3, false}, + {optionalGroup1, `{}`, result2, false}, + {optionalGroup1, `{"group": null}`, result2, false}, + {optionalGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field + {optionalGroup1, `{"group": {"col": 10}}`, result3, false}, + {optionalGroup2, `{}`, result2, false}, + {optionalGroup2, `{"group": null}`, result2, false}, + {optionalGroup2, `{"group": {"col": null}}`, result4, false}, + {optionalGroup2, `{"group": {"col": 10}}`, result5, false}, + } + + for i, testCase := range testCases { + result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree) + expectErr := (err != nil) + + if testCase.expectErr != expectErr { + t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr) + } + + if !testCase.expectErr { + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } + } +} diff --git a/pkg/s3select/internal/parquet-go/data/column-listoflist_test.go b/pkg/s3select/internal/parquet-go/data/column-listoflist_test.go new file mode 100644 index 000000000..f990f8969 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/column-listoflist_test.go @@ -0,0 +1,698 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package data + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" +) + +func TestPopulateListOfList(t *testing.T) { + requiredList1 := schema.NewTree() + { + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList1.Set("col", requiredCol); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("col.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("col.list.element", requiredElement); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("col.list.element.list", subList); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("col.list.element.list.element", requiredSubElement); err != nil { + t.Fatal(err) + } + + if _, _, err = requiredList1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredList2 := schema.NewTree() + { + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList2.Set("col", requiredCol); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("col.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("col.list.element", requiredElement); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("col.list.element.list", subList); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("col.list.element.list.element", optionalSubElement); err != nil { + t.Fatal(err) + } + + if _, _, err = requiredList2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredList3 := schema.NewTree() + { + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + nil, 
parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err = requiredList3.Set("col", requiredCol); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList3.Set("col.list", list); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList3.Set("col.list.element", optionalElement); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList3.Set("col.list.element.list", subList); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList3.Set("col.list.element.list.element", requiredSubElement); err != nil {
+			t.Fatal(err)
+		}
+
+		if _, _, err = requiredList3.ToParquetSchema(); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	requiredList4 := schema.NewTree()
+	{
+		requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
+			parquet.TypePtr(parquet.Type_INT32), nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err = requiredList4.Set("col", requiredCol); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList4.Set("col.list", list); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList4.Set("col.list.element", optionalElement); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList4.Set("col.list.element.list", subList); err != nil {
+			t.Fatal(err)
+		}
+		if err = requiredList4.Set("col.list.element.list.element", optionalSubElement); err != nil {
+			t.Fatal(err)
+		}
+
+		if _, _, err = requiredList4.ToParquetSchema(); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	optionalList1 := schema.NewTree()
+	{
+		optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err = optionalList1.Set("col", optionalCol); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList1.Set("col.list", list); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList1.Set("col.list.element", requiredElement); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList1.Set("col.list.element.list", subList); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList1.Set("col.list.element.list.element", requiredSubElement); err != nil {
+			t.Fatal(err)
+		}
+
+		if _, _, err = optionalList1.ToParquetSchema(); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	optionalList2 := schema.NewTree()
+	{
+		optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
+			parquet.TypePtr(parquet.Type_INT32), nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err = optionalList2.Set("col", optionalCol); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList2.Set("col.list", list); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList2.Set("col.list.element", requiredElement); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList2.Set("col.list.element.list", subList); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList2.Set("col.list.element.list.element", optionalSubElement); err != nil {
+			t.Fatal(err)
+		}
+
+		if _, _, err = optionalList2.ToParquetSchema(); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	optionalList3 := schema.NewTree()
+	{
+		optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err = optionalList3.Set("col", optionalCol); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList3.Set("col.list", list); err != nil {
+			t.Fatal(err)
+		}
+		if err = optionalList3.Set("col.list.element", optionalElement); err !=
nil { + t.Fatal(err) + } + if err = optionalList3.Set("col.list.element.list", subList); err != nil { + t.Fatal(err) + } + if err = optionalList3.Set("col.list.element.list.element", requiredSubElement); err != nil { + t.Fatal(err) + } + + if _, _, err = optionalList3.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalList4 := schema.NewTree() + { + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalList4.Set("col", requiredCol); err != nil { + t.Fatal(err) + } + if err = optionalList4.Set("col.list", list); err != nil { + t.Fatal(err) + } + if err = optionalList4.Set("col.list.element", optioonalElement); err != nil { + t.Fatal(err) + } + if err = optionalList4.Set("col.list.element.list", subList); err != nil { + t.Fatal(err) + } + if err = optionalList4.Set("col.list.element.list.element", optionalSubElement); err != nil { + t.Fatal(err) + } + + if _, _, err = optionalList4.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + result1 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result2 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20, v30, v10, v20, v10, v30}, + definitionLevels: []int64{2, 2, 2, 2, 2, 2, 2}, + repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v30, + }, + } + + result3 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result4 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{3}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result5 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20, v30, v10, v20, v10, v30}, + definitionLevels: []int64{3, 3, 3, 3, 3, 3, 3}, + repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v30, + }, + } + + result6 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{3}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result7 := 
map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{4}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result8 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20, v30, v10, v20, v10, v30}, + definitionLevels: []int64{4, 4, 4, 4, 4, 4, 4}, + repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v30, + }, + } + + result9 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result10 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{4}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result11 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{5}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result12 := map[string]*Column{ + "col.list.element.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20, v30, v10, v20, v10, v30}, + definitionLevels: []int64{5, 5, 5, 5, 5, 5, 5}, + repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v30, + }, + } + + testCases := []struct { + schemaTree *schema.Tree + data string + expectedResult map[string]*Column + expectErr bool + }{ + {requiredList1, `{}`, nil, true}, // err: col: nil value for required field + {requiredList1, `{"col": null}`, nil, true}, // err: col: nil value for required field + {requiredList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field + {requiredList1, `{"col": [[10]]}`, result1, false}, + {requiredList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result2, false}, + {requiredList2, `{}`, nil, true}, // err: col: nil value for required field + {requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field + {requiredList2, `{"col": [[null]]}`, result3, false}, + {requiredList2, `{"col": [[10]]}`, result4, false}, + {requiredList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false}, + {requiredList3, `{}`, nil, true}, // err: col: nil value for required field + {requiredList3, `{"col": null}`, nil, true}, // err: col: nil value for required field + {requiredList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field + {requiredList3, `{"col": [[10]]}`, result4, false}, + {requiredList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false}, + {requiredList4, `{}`, nil, true}, // err: col: nil value for required field + {requiredList4, `{"col": null}`, nil, true}, // err: col: nil value for required field + {requiredList4, `{"col": [[null]]}`, result6, false}, + {requiredList4, `{"col": [[10]]}`, result7, false}, + {requiredList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false}, + {optionalList1, `{}`, result9, false}, + {optionalList1, `{"col": null}`, result9, false}, + {optionalList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil 
value for required field + {optionalList1, `{"col": [[10]]}`, result4, false}, + {optionalList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false}, + {optionalList2, `{}`, result9, false}, + {optionalList2, `{"col": null}`, result9, false}, + {optionalList2, `{"col": [[null]]}`, result6, false}, + {optionalList2, `{"col": [[10]]}`, result7, false}, + {optionalList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false}, + {optionalList3, `{}`, result9, false}, + {optionalList3, `{"col": null}`, result9, false}, + {optionalList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field + {optionalList3, `{"col": [[10]]}`, result7, false}, + {optionalList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false}, + {optionalList4, `{}`, result9, false}, + {optionalList4, `{"col": null}`, result9, false}, + {optionalList4, `{"col": [[null]]}`, result10, false}, + {optionalList4, `{"col": [[10]]}`, result11, false}, + {optionalList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result12, false}, + } + + for i, testCase := range testCases { + result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree) + expectErr := (err != nil) + + if testCase.expectErr != expectErr { + t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr) + } + + if !testCase.expectErr { + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } + } +} diff --git a/pkg/s3select/internal/parquet-go/data/column-map_test.go b/pkg/s3select/internal/parquet-go/data/column-map_test.go new file mode 100644 index 000000000..30be01c8c --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/column-map_test.go @@ -0,0 +1,370 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package data + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" +) + +func TestPopulateMap(t *testing.T) { + t.Skip("Broken") + requiredMap1 := schema.NewTree() + { + mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredValue, err := schema.NewElement("value", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredMap1.Set("map", mapElement); err != nil { + t.Fatal(err) + } + + if err = requiredMap1.Set("map.key_value", keyValue); err != nil { + t.Fatal(err) + } + + if err = requiredMap1.Set("map.key_value.key", requiredKey); err != nil { + t.Fatal(err) + } + + if err = requiredMap1.Set("map.key_value.value", requiredValue); err != nil { + t.Fatal(err) + } + + if _, _, err = requiredMap1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredMap2 := schema.NewTree() + { + mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalValue, err := schema.NewElement("value", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredMap2.Set("map", mapElement); err != nil { + t.Fatal(err) + } + + if err = requiredMap2.Set("map.key_value", keyValue); err != nil { + t.Fatal(err) + } + + if err = requiredMap2.Set("map.key_value.key", requiredKey); err != nil { + t.Fatal(err) + } + + if err = requiredMap2.Set("map.key_value.value", optionalValue); err != nil { + t.Fatal(err) + } + + if _, _, err = requiredMap2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalMap1 := schema.NewTree() + { + mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredValue, err := schema.NewElement("value", parquet.FieldRepetitionType_REQUIRED, + 
parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalMap1.Set("map", mapElement); err != nil { + t.Fatal(err) + } + + if err = optionalMap1.Set("map.key_value", keyValue); err != nil { + t.Fatal(err) + } + + if err = optionalMap1.Set("map.key_value.key", requiredKey); err != nil { + t.Fatal(err) + } + + if err = optionalMap1.Set("map.key_value.value", requiredValue); err != nil { + t.Fatal(err) + } + + if _, _, err = optionalMap1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalMap2 := schema.NewTree() + { + mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalValue, err := schema.NewElement("value", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalMap2.Set("map", mapElement); err != nil { + t.Fatal(err) + } + + if err = optionalMap2.Set("map.key_value", keyValue); err != nil { + t.Fatal(err) + } + + if err = optionalMap2.Set("map.key_value.key", requiredKey); err != nil { + t.Fatal(err) + } + + if err = optionalMap2.Set("map.key_value.value", optionalValue); err != nil { + t.Fatal(err) + } + + if _, _, err = optionalMap2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + result1 := map[string]*Column{ + "map.key_value.key": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{ten}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + }, + "map.key_value.value": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{1}, + }, + } + + result2 := map[string]*Column{ + "map.key_value.key": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{ten}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + }, + "map.key_value.value": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{1}, + }, + } + + result3 := map[string]*Column{ + "map.key_value.key": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{ten}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + }, + "map.key_value.value": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{1}, + }, + } + + result4 := map[string]*Column{ + "map.key_value.key": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + } + + result5 := map[string]*Column{ + "map.key_value.key": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{ten}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + }, + "map.key_value.value": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{1}, + }, + } + + result6 := map[string]*Column{ + "map.key_value.key": { + 
parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{ten}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + }, + "map.key_value.value": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{1}, + }, + } + + result7 := map[string]*Column{ + "map.key_value.key": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{ten}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + }, + "map.key_value.value": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{3}, + repetitionLevels: []int64{1}, + }, + } + + testCases := []struct { + schemaTree *schema.Tree + data string + expectedResult map[string]*Column + expectErr bool + }{ + {requiredMap1, `{}`, nil, true}, // err: map: nil value for required field + {requiredMap1, `{"map": null}`, nil, true}, // err: map: nil value for required field + {requiredMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field + {requiredMap1, `{"map": {"ten": 10}}`, result1, false}, + {requiredMap2, `{}`, nil, true}, // err: map: nil value for required field + {requiredMap2, `{"map": null}`, nil, true}, // err: map: nil value for required field + {requiredMap2, `{"map": {"ten": null}}`, result2, false}, + {requiredMap2, `{"map": {"ten": 10}}`, result3, false}, + {optionalMap1, `{}`, result4, false}, + {optionalMap1, `{"map": null}`, result4, false}, + {optionalMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field + {optionalMap1, `{"map": {"ten": 10}}`, result5, false}, + {optionalMap2, `{}`, result4, false}, + {optionalMap2, `{"map": null}`, result4, false}, + {optionalMap2, `{"map": {"ten": null}}`, result6, false}, + {optionalMap2, `{"map": {"ten": 10}}`, result7, false}, + } + + for i, testCase := range testCases { + result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree) + expectErr := (err != nil) + + if testCase.expectErr != expectErr { + t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr) + } + + if !testCase.expectErr { + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Errorf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } + } +} diff --git a/pkg/s3select/internal/parquet-go/data/column-primitivelist_test.go b/pkg/s3select/internal/parquet-go/data/column-primitivelist_test.go new file mode 100644 index 000000000..3fb7b0c98 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/column-primitivelist_test.go @@ -0,0 +1,330 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package data + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" +) + +func TestPopulatePrimitiveList(t *testing.T) { + requiredList1 := schema.NewTree() + { + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList1.Set("col", requiredCol); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("col.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList1.Set("col.list.element", requiredElement); err != nil { + t.Fatal(err) + } + + if _, _, err = requiredList1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + requiredList2 := schema.NewTree() + { + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredList2.Set("col", requiredCol); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("col.list", list); err != nil { + t.Fatal(err) + } + if err = requiredList2.Set("col.list.element", optionalElement); err != nil { + t.Fatal(err) + } + + if _, _, err = requiredList2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalList1 := schema.NewTree() + { + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalList1.Set("col", optionalCol); err != nil { + t.Fatal(err) + } + if err = optionalList1.Set("col.list", list); err != nil { + t.Fatal(err) + } + if err = optionalList1.Set("col.list.element", requiredElement); err != nil { + t.Fatal(err) + } + + if _, _, err = optionalList1.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalList2 := schema.NewTree() + { + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + optionalElement, err := schema.NewElement("element", 
parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalList2.Set("col", optionalCol); err != nil { + t.Fatal(err) + } + if err = optionalList2.Set("col.list", list); err != nil { + t.Fatal(err) + } + if err = optionalList2.Set("col.list.element", optionalElement); err != nil { + t.Fatal(err) + } + + if _, _, err = optionalList2.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + result1 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result2 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20, v30}, + definitionLevels: []int64{1, 1, 1}, + repetitionLevels: []int64{0, 1, 1}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v30, + }, + } + + result3 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result4 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result5 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20, v30}, + definitionLevels: []int64{2, 2, 2}, + repetitionLevels: []int64{0, 1, 1}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v30, + }, + } + + result6 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result7 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result8 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{3}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result9 := map[string]*Column{ + "col.list.element": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10, v20, v30}, + definitionLevels: []int64{3, 3, 3}, + repetitionLevels: []int64{0, 1, 1}, + rowCount: 1, + maxBitWidth: 5, + minValue: v10, + maxValue: v30, + }, + } + + testCases := []struct { + schemaTree *schema.Tree + data string + expectedResult map[string]*Column + expectErr bool + }{ + {requiredList1, `{}`, nil, true}, // err: col: nil value for required field + {requiredList1, `{"col": null}`, nil, true}, // err: col: nil value for required field + {requiredList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field + {requiredList1, `{"col": [10]}`, result1, false}, + {requiredList1, `{"col": [10, 20, 30]}`, result2, false}, + {requiredList2, `{}`, nil, true}, // err: col: nil value for required field + {requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field + {requiredList2, `{"col": [null]}`, result3, false}, 
+ {requiredList2, `{"col": [10]}`, result4, false}, + {requiredList2, `{"col": [10, 20, 30]}`, result5, false}, + {optionalList1, `{}`, result6, false}, + {optionalList1, `{"col": null}`, result6, false}, + {optionalList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field + {optionalList1, `{"col": [10]}`, result4, false}, + {optionalList1, `{"col": [10, 20, 30]}`, result5, false}, + {optionalList2, `{}`, result6, false}, + {optionalList2, `{"col": null}`, result6, false}, + {optionalList2, `{"col": [null]}`, result7, false}, + {optionalList2, `{"col": [10]}`, result8, false}, + {optionalList2, `{"col": [10, 20, 30]}`, result9, false}, + } + + for i, testCase := range testCases { + result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree) + expectErr := (err != nil) + + if testCase.expectErr != expectErr { + t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr) + } + + if !testCase.expectErr { + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } + } +} diff --git a/pkg/s3select/internal/parquet-go/data/column-primitivetype_test.go b/pkg/s3select/internal/parquet-go/data/column-primitivetype_test.go new file mode 100644 index 000000000..d66829980 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/column-primitivetype_test.go @@ -0,0 +1,128 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package data + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" +) + +func TestPopulatePrimitiveType(t *testing.T) { + requiredField := schema.NewTree() + { + requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = requiredField.Set("col", requiredCol); err != nil { + t.Fatal(err) + } + + if _, _, err = requiredField.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + optionalField := schema.NewTree() + { + optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err = optionalField.Set("col", optionalCol); err != nil { + t.Fatal(err) + } + + if _, _, err = optionalField.ToParquetSchema(); err != nil { + t.Fatal(err) + } + } + + result1 := map[string]*Column{ + "col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + result2 := map[string]*Column{ + "col": { + parquetType: parquet.Type_INT32, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + rowCount: 1, + }, + } + + result3 := map[string]*Column{ + "col": { + parquetType: parquet.Type_INT32, + values: []interface{}{v10}, + definitionLevels: []int64{1}, + repetitionLevels: []int64{0}, + rowCount: 1, + maxBitWidth: 4, + minValue: v10, + maxValue: v10, + }, + } + + testCases := []struct { + schemaTree *schema.Tree + data string + expectedResult map[string]*Column + expectErr bool + }{ + {requiredField, `{}`, nil, true}, + {requiredField, `{"col": null}`, nil, true}, // err: col: nil value for required field + {requiredField, `{"col": 10}`, result1, false}, + {optionalField, `{}`, result2, false}, + {optionalField, `{"col": null}`, result2, false}, + {optionalField, `{"col": 10}`, result3, false}, + } + + for i, testCase := range testCases { + result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree) + expectErr := (err != nil) + + if testCase.expectErr != expectErr { + t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr) + } + + if !testCase.expectErr { + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } + } +} diff --git a/pkg/s3select/internal/parquet-go/data/column.go b/pkg/s3select/internal/parquet-go/data/column.go new file mode 100644 index 000000000..f1af9a672 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/column.go @@ -0,0 +1,680 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package data + +import ( + "bytes" + "context" + "fmt" + "strings" + + "git.apache.org/thrift.git/lib/go/thrift" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/common" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/encoding" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" + "github.com/tidwall/gjson" + "github.com/tidwall/sjson" +) + +func getDefaultEncoding(parquetType parquet.Type) parquet.Encoding { + switch parquetType { + case parquet.Type_BOOLEAN: + return parquet.Encoding_PLAIN + case parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE: + return parquet.Encoding_RLE_DICTIONARY + case parquet.Type_BYTE_ARRAY: + return parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY + } + + return parquet.Encoding_PLAIN +} + +func getFirstValueElement(tree *schema.Tree) (valueElement *schema.Element) { + tree.Range(func(name string, element *schema.Element) bool { + if element.Children == nil { + valueElement = element + } else { + valueElement = getFirstValueElement(element.Children) + } + + return false + }) + + return valueElement +} + +func populate(columnDataMap map[string]*Column, input *jsonValue, tree *schema.Tree, firstValueRL int64) (map[string]*Column, error) { + var err error + + pos := 0 + handleElement := func(name string, element *schema.Element) bool { + pos++ + + dataPath := element.PathInTree + + if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED { + panic(fmt.Errorf("%v: repetition type must be REQUIRED or OPTIONAL type", dataPath)) + } + + inputValue := input.Get(name) + if *element.RepetitionType == parquet.FieldRepetitionType_REQUIRED && inputValue.IsNull() { + err = fmt.Errorf("%v: nil value for required field", dataPath) + return false + } + + add := func(element *schema.Element, value interface{}, DL, RL int64) { + columnData := columnDataMap[element.PathInSchema] + if columnData == nil { + columnData = NewColumn(*element.Type) + } + columnData.add(value, DL, RL) + columnDataMap[element.PathInSchema] = columnData + } + + // Handle primitive type element. + if element.Type != nil { + var value interface{} + if value, err = inputValue.GetValue(*element.Type, element.ConvertedType); err != nil { + return false + } + + DL := element.MaxDefinitionLevel + if value == nil && DL > 0 { + DL-- + } + + RL := element.MaxRepetitionLevel + if pos == 1 { + RL = firstValueRL + } + + add(element, value, DL, RL) + return true + } + + addNull := func() { + valueElement := getFirstValueElement(element.Children) + + DL := element.MaxDefinitionLevel + if DL > 0 { + DL-- + } + + RL := element.MaxRepetitionLevel + if RL > 0 { + RL-- + } + + add(valueElement, nil, DL, RL) + } + + // Handle group type element. + if element.ConvertedType == nil { + if inputValue.IsNull() { + addNull() + return true + } + + columnDataMap, err = populate(columnDataMap, inputValue, element.Children, firstValueRL) + return (err == nil) + } + + // Handle list type element. 
+ if *element.ConvertedType == parquet.ConvertedType_LIST { + if inputValue.IsNull() { + addNull() + return true + } + + var results []gjson.Result + if results, err = inputValue.GetArray(); err != nil { + return false + } + + listElement, _ := element.Children.Get("list") + valueElement, _ := listElement.Children.Get("element") + for i := range results { + rl := valueElement.MaxRepetitionLevel + if i == 0 { + rl = firstValueRL + } + + var jsonData []byte + if jsonData, err = sjson.SetBytes([]byte{}, "element", results[i].Value()); err != nil { + return false + } + + var jv *jsonValue + if jv, err = bytesToJSONValue(jsonData); err != nil { + return false + } + + if columnDataMap, err = populate(columnDataMap, jv, listElement.Children, rl); err != nil { + return false + } + } + return true + } + + if *element.ConvertedType == parquet.ConvertedType_MAP { + if inputValue.IsNull() { + addNull() + return true + } + + keyValueElement, _ := element.Children.Get("key_value") + var rerr error + err = inputValue.Range(func(key, value gjson.Result) bool { + if !key.Exists() || key.Type == gjson.Null { + rerr = fmt.Errorf("%v.key_value.key: not found or null", dataPath) + return false + } + + var jsonData []byte + if jsonData, rerr = sjson.SetBytes([]byte{}, "key", key.Value()); err != nil { + return false + } + + if jsonData, rerr = sjson.SetBytes(jsonData, "value", value.Value()); err != nil { + return false + } + + var jv *jsonValue + if jv, rerr = bytesToJSONValue(jsonData); rerr != nil { + return false + } + + if columnDataMap, rerr = populate(columnDataMap, jv, keyValueElement.Children, firstValueRL); err != nil { + return false + } + + return true + }) + + if err != nil { + return false + } + + err = rerr + return (err == nil) + } + + err = fmt.Errorf("%v: unsupported converted type %v in %v field type", dataPath, *element.ConvertedType, *element.RepetitionType) + return false + } + + tree.Range(handleElement) + return columnDataMap, err +} + +// Column - denotes values of a column. +type Column struct { + parquetType parquet.Type // value type. + values []interface{} // must be a slice of parquet typed values. + definitionLevels []int64 // exactly same length of values. + repetitionLevels []int64 // exactly same length of values. 
+ rowCount int32 + maxBitWidth int32 + minValue interface{} + maxValue interface{} +} + +func (column *Column) updateMinMaxValue(value interface{}) { + if column.minValue == nil && column.maxValue == nil { + column.minValue = value + column.maxValue = value + return + } + + switch column.parquetType { + case parquet.Type_BOOLEAN: + if column.minValue.(bool) && !value.(bool) { + column.minValue = value + } + + if !column.maxValue.(bool) && value.(bool) { + column.maxValue = value + } + + case parquet.Type_INT32: + if column.minValue.(int32) > value.(int32) { + column.minValue = value + } + + if column.maxValue.(int32) < value.(int32) { + column.maxValue = value + } + + case parquet.Type_INT64: + if column.minValue.(int64) > value.(int64) { + column.minValue = value + } + + if column.maxValue.(int64) < value.(int64) { + column.maxValue = value + } + + case parquet.Type_FLOAT: + if column.minValue.(float32) > value.(float32) { + column.minValue = value + } + + if column.maxValue.(float32) < value.(float32) { + column.maxValue = value + } + + case parquet.Type_DOUBLE: + if column.minValue.(float64) > value.(float64) { + column.minValue = value + } + + if column.maxValue.(float64) < value.(float64) { + column.maxValue = value + } + + case parquet.Type_BYTE_ARRAY: + if bytes.Compare(column.minValue.([]byte), value.([]byte)) > 0 { + column.minValue = value + } + + if bytes.Compare(column.minValue.([]byte), value.([]byte)) < 0 { + column.maxValue = value + } + } +} + +func (column *Column) updateStats(value interface{}, DL, RL int64) { + if RL == 0 { + column.rowCount++ + } + + if value == nil { + return + } + + var bitWidth int32 + switch column.parquetType { + case parquet.Type_BOOLEAN: + bitWidth = 1 + case parquet.Type_INT32: + bitWidth = common.BitWidth(uint64(value.(int32))) + case parquet.Type_INT64: + bitWidth = common.BitWidth(uint64(value.(int64))) + case parquet.Type_FLOAT: + bitWidth = 32 + case parquet.Type_DOUBLE: + bitWidth = 64 + case parquet.Type_BYTE_ARRAY: + bitWidth = int32(len(value.([]byte))) + } + if column.maxBitWidth < bitWidth { + column.maxBitWidth = bitWidth + } + + column.updateMinMaxValue(value) +} + +func (column *Column) add(value interface{}, DL, RL int64) { + column.values = append(column.values, value) + column.definitionLevels = append(column.definitionLevels, DL) + column.repetitionLevels = append(column.repetitionLevels, RL) + column.updateStats(value, DL, RL) +} + +// AddNull - adds nil value. +func (column *Column) AddNull(DL, RL int64) { + column.add(nil, DL, RL) +} + +// AddBoolean - adds boolean value. +func (column *Column) AddBoolean(value bool, DL, RL int64) { + if column.parquetType != parquet.Type_BOOLEAN { + panic(fmt.Errorf("expected %v value", column.parquetType)) + } + + column.add(value, DL, RL) +} + +// AddInt32 - adds int32 value. +func (column *Column) AddInt32(value int32, DL, RL int64) { + if column.parquetType != parquet.Type_INT32 { + panic(fmt.Errorf("expected %v value", column.parquetType)) + } + + column.add(value, DL, RL) +} + +// AddInt64 - adds int64 value. +func (column *Column) AddInt64(value int64, DL, RL int64) { + if column.parquetType != parquet.Type_INT64 { + panic(fmt.Errorf("expected %v value", column.parquetType)) + } + + column.add(value, DL, RL) +} + +// AddFloat - adds float32 value. 
+func (column *Column) AddFloat(value float32, DL, RL int64) { + if column.parquetType != parquet.Type_FLOAT { + panic(fmt.Errorf("expected %v value", column.parquetType)) + } + + column.add(value, DL, RL) +} + +// AddDouble - adds float64 value. +func (column *Column) AddDouble(value float64, DL, RL int64) { + if column.parquetType != parquet.Type_DOUBLE { + panic(fmt.Errorf("expected %v value", column.parquetType)) + } + + column.add(value, DL, RL) +} + +// AddByteArray - adds byte array value. +func (column *Column) AddByteArray(value []byte, DL, RL int64) { + if column.parquetType != parquet.Type_BYTE_ARRAY { + panic(fmt.Errorf("expected %v value", column.parquetType)) + } + + column.add(value, DL, RL) +} + +// Merge - merges columns. +func (column *Column) Merge(column2 *Column) { + if column.parquetType != column2.parquetType { + panic(fmt.Errorf("merge differs in parquet type")) + } + + column.values = append(column.values, column2.values...) + column.definitionLevels = append(column.definitionLevels, column2.definitionLevels...) + column.repetitionLevels = append(column.repetitionLevels, column2.repetitionLevels...) + + column.rowCount += column2.rowCount + if column.maxBitWidth < column2.maxBitWidth { + column.maxBitWidth = column2.maxBitWidth + } + + column.updateMinMaxValue(column2.minValue) + column.updateMinMaxValue(column2.maxValue) +} + +func (column *Column) String() string { + var strs []string + strs = append(strs, fmt.Sprintf("parquetType: %v", column.parquetType)) + strs = append(strs, fmt.Sprintf("values: %v", column.values)) + strs = append(strs, fmt.Sprintf("definitionLevels: %v", column.definitionLevels)) + strs = append(strs, fmt.Sprintf("repetitionLevels: %v", column.repetitionLevels)) + strs = append(strs, fmt.Sprintf("rowCount: %v", column.rowCount)) + strs = append(strs, fmt.Sprintf("maxBitWidth: %v", column.maxBitWidth)) + strs = append(strs, fmt.Sprintf("minValue: %v", column.minValue)) + strs = append(strs, fmt.Sprintf("maxValue: %v", column.maxValue)) + return "{" + strings.Join(strs, ", ") + "}" +} + +func (column *Column) encodeValue(value interface{}, element *schema.Element) []byte { + if value == nil { + return nil + } + + valueData := encoding.PlainEncode(common.ToSliceValue([]interface{}{value}, column.parquetType), column.parquetType) + if column.parquetType == parquet.Type_BYTE_ARRAY && element.ConvertedType != nil { + switch *element.ConvertedType { + case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL: + valueData = valueData[4:] + } + } + + return valueData +} + +func (column *Column) toDataPageV2(element *schema.Element, parquetEncoding parquet.Encoding) *ColumnChunk { + var definedValues []interface{} + for _, value := range column.values { + if value != nil { + definedValues = append(definedValues, value) + } + } + + var encodedData []byte + switch parquetEncoding { + case parquet.Encoding_PLAIN: + encodedData = encoding.PlainEncode(common.ToSliceValue(definedValues, column.parquetType), column.parquetType) + + case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: + var bytesSlices [][]byte + for _, value := range column.values { + bytesSlices = append(bytesSlices, value.([]byte)) + } + encodedData = encoding.DeltaLengthByteArrayEncode(bytesSlices) + } + + compressionType := parquet.CompressionCodec_SNAPPY + if element.CompressionType != nil { + compressionType = *element.CompressionType + } + + compressedData, err := common.Compress(compressionType, encodedData) + if err != nil { + panic(err) + } + + DLData := 
encoding.RLEBitPackedHybridEncode( + column.definitionLevels, + common.BitWidth(uint64(element.MaxDefinitionLevel)), + parquet.Type_INT64, + ) + + RLData := encoding.RLEBitPackedHybridEncode( + column.repetitionLevels, + common.BitWidth(uint64(element.MaxRepetitionLevel)), + parquet.Type_INT64, + ) + + pageHeader := parquet.NewPageHeader() + pageHeader.Type = parquet.PageType_DATA_PAGE_V2 + pageHeader.CompressedPageSize = int32(len(compressedData) + len(DLData) + len(RLData)) + pageHeader.UncompressedPageSize = int32(len(encodedData) + len(DLData) + len(RLData)) + pageHeader.DataPageHeaderV2 = parquet.NewDataPageHeaderV2() + pageHeader.DataPageHeaderV2.NumValues = int32(len(column.values)) + pageHeader.DataPageHeaderV2.NumNulls = int32(len(column.values) - len(definedValues)) + pageHeader.DataPageHeaderV2.NumRows = column.rowCount + pageHeader.DataPageHeaderV2.Encoding = parquetEncoding + pageHeader.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(DLData)) + pageHeader.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(RLData)) + pageHeader.DataPageHeaderV2.IsCompressed = true + pageHeader.DataPageHeaderV2.Statistics = parquet.NewStatistics() + pageHeader.DataPageHeaderV2.Statistics.Min = column.encodeValue(column.minValue, element) + pageHeader.DataPageHeaderV2.Statistics.Max = column.encodeValue(column.maxValue, element) + + ts := thrift.NewTSerializer() + ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport) + rawData, err := ts.Write(context.TODO(), pageHeader) + if err != nil { + panic(err) + } + rawData = append(rawData, RLData...) + rawData = append(rawData, DLData...) + rawData = append(rawData, compressedData...) + + metadata := parquet.NewColumnMetaData() + metadata.Type = column.parquetType + metadata.Encodings = []parquet.Encoding{ + parquet.Encoding_PLAIN, + parquet.Encoding_RLE, + parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, + } + metadata.Codec = compressionType + metadata.NumValues = int64(pageHeader.DataPageHeaderV2.NumValues) + metadata.TotalCompressedSize = int64(len(rawData)) + metadata.TotalUncompressedSize = int64(pageHeader.UncompressedPageSize) + int64(len(rawData)) - int64(pageHeader.CompressedPageSize) + metadata.PathInSchema = strings.Split(element.PathInSchema, ".") + metadata.Statistics = parquet.NewStatistics() + metadata.Statistics.Min = pageHeader.DataPageHeaderV2.Statistics.Min + metadata.Statistics.Max = pageHeader.DataPageHeaderV2.Statistics.Max + + chunk := new(ColumnChunk) + chunk.ColumnChunk.MetaData = metadata + chunk.dataPageLen = int64(len(rawData)) + chunk.dataLen = int64(len(rawData)) + chunk.data = rawData + + return chunk +} + +func (column *Column) toRLEDictPage(element *schema.Element) *ColumnChunk { + dictPageData, dataPageData, dictValueCount, indexBitWidth := encoding.RLEDictEncode(column.values, column.parquetType, column.maxBitWidth) + + compressionType := parquet.CompressionCodec_SNAPPY + if element.CompressionType != nil { + compressionType = *element.CompressionType + } + + compressedData, err := common.Compress(compressionType, dictPageData) + if err != nil { + panic(err) + } + + dictPageHeader := parquet.NewPageHeader() + dictPageHeader.Type = parquet.PageType_DICTIONARY_PAGE + dictPageHeader.CompressedPageSize = int32(len(compressedData)) + dictPageHeader.UncompressedPageSize = int32(len(dictPageData)) + dictPageHeader.DictionaryPageHeader = parquet.NewDictionaryPageHeader() + dictPageHeader.DictionaryPageHeader.NumValues = dictValueCount + dictPageHeader.DictionaryPageHeader.Encoding = 
parquet.Encoding_PLAIN + + ts := thrift.NewTSerializer() + ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport) + dictPageRawData, err := ts.Write(context.TODO(), dictPageHeader) + if err != nil { + panic(err) + } + dictPageRawData = append(dictPageRawData, compressedData...) + + RLData := encoding.RLEBitPackedHybridEncode( + column.repetitionLevels, + common.BitWidth(uint64(element.MaxRepetitionLevel)), + parquet.Type_INT64, + ) + encodedData := RLData + + DLData := encoding.RLEBitPackedHybridEncode( + column.definitionLevels, + common.BitWidth(uint64(element.MaxDefinitionLevel)), + parquet.Type_INT64, + ) + encodedData = append(encodedData, DLData...) + + encodedData = append(encodedData, indexBitWidth) + encodedData = append(encodedData, dataPageData...) + + compressedData, err = common.Compress(compressionType, encodedData) + if err != nil { + panic(err) + } + + dataPageHeader := parquet.NewPageHeader() + dataPageHeader.Type = parquet.PageType_DATA_PAGE + dataPageHeader.CompressedPageSize = int32(len(compressedData)) + dataPageHeader.UncompressedPageSize = int32(len(encodedData)) + dataPageHeader.DataPageHeader = parquet.NewDataPageHeader() + dataPageHeader.DataPageHeader.NumValues = int32(len(column.values)) + dataPageHeader.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE + dataPageHeader.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE + dataPageHeader.DataPageHeader.Encoding = parquet.Encoding_RLE_DICTIONARY + + ts = thrift.NewTSerializer() + ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport) + dataPageRawData, err := ts.Write(context.TODO(), dataPageHeader) + if err != nil { + panic(err) + } + dataPageRawData = append(dataPageRawData, compressedData...) + + metadata := parquet.NewColumnMetaData() + metadata.Type = column.parquetType + metadata.Encodings = []parquet.Encoding{ + parquet.Encoding_PLAIN, + parquet.Encoding_RLE, + parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, + parquet.Encoding_RLE_DICTIONARY, + } + metadata.Codec = compressionType + metadata.NumValues = int64(dataPageHeader.DataPageHeader.NumValues) + metadata.TotalCompressedSize = int64(len(dictPageRawData)) + int64(len(dataPageRawData)) + uncompressedSize := int64(dictPageHeader.UncompressedPageSize) + int64(len(dictPageData)) - int64(dictPageHeader.CompressedPageSize) + uncompressedSize += int64(dataPageHeader.UncompressedPageSize) + int64(len(dataPageData)) - int64(dataPageHeader.CompressedPageSize) + metadata.TotalUncompressedSize = uncompressedSize + metadata.PathInSchema = strings.Split(element.PathInSchema, ".") + metadata.Statistics = parquet.NewStatistics() + metadata.Statistics.Min = column.encodeValue(column.minValue, element) + metadata.Statistics.Max = column.encodeValue(column.maxValue, element) + + chunk := new(ColumnChunk) + chunk.ColumnChunk.MetaData = metadata + chunk.isDictPage = true + chunk.dictPageLen = int64(len(dictPageRawData)) + chunk.dataPageLen = int64(len(dataPageRawData)) + chunk.dataLen = chunk.dictPageLen + chunk.dataPageLen + chunk.data = append(dictPageRawData, dataPageRawData...) + + return chunk +} + +// Encode an element. 
+func (column *Column) Encode(element *schema.Element) *ColumnChunk { + parquetEncoding := getDefaultEncoding(column.parquetType) + if element.Encoding != nil { + parquetEncoding = *element.Encoding + } + + switch parquetEncoding { + case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: + return column.toDataPageV2(element, parquetEncoding) + } + + return column.toRLEDictPage(element) +} + +// NewColumn - creates new column data +func NewColumn(parquetType parquet.Type) *Column { + switch parquetType { + case parquet.Type_BOOLEAN, parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE, parquet.Type_BYTE_ARRAY: + default: + panic(fmt.Errorf("unsupported parquet type %v", parquetType)) + } + + return &Column{ + parquetType: parquetType, + } +} + +// UnmarshalJSON - decodes JSON data into map of Column. +func UnmarshalJSON(data []byte, tree *schema.Tree) (map[string]*Column, error) { + if !tree.ReadOnly() { + return nil, fmt.Errorf("tree must be read only") + } + + inputValue, err := bytesToJSONValue(data) + if err != nil { + return nil, err + } + + columnDataMap := make(map[string]*Column) + return populate(columnDataMap, inputValue, tree, 0) +} diff --git a/pkg/s3select/internal/parquet-go/data/column_test.go b/pkg/s3select/internal/parquet-go/data/column_test.go new file mode 100644 index 000000000..a8d35d886 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/column_test.go @@ -0,0 +1,369 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package data + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/schema" +) + +var ( + v10 = int32(10) + v20 = int32(20) + v30 = int32(30) + ten = []byte("ten") + foo = []byte("foo") + bar = []byte("bar") + phone1 = []byte("1-234-567-8901") + phone2 = []byte("1-234-567-1098") + phone3 = []byte("1-111-222-3333") +) + +func TestAddressBookExample(t *testing.T) { + // message AddressBook { + // required string owner; + // repeated string ownerPhoneNumbers; + // repeated group contacts { + // required string name; + // optional string phoneNumber; + // } + // } + t.Skip("Broken") + + addressBook := schema.NewTree() + { + owner, err := schema.NewElement("owner", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + ownerPhoneNumbers, err := schema.NewElement("ownerPhoneNumbers", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + ownerPhoneNumbersList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + ownerPhoneNumbersElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + contacts, err := schema.NewElement("contacts", parquet.FieldRepetitionType_OPTIONAL, + nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + contactsList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + contactsElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED, + nil, nil, + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + contactName, err := schema.NewElement("name", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + contactPhoneNumber, err := schema.NewElement("phoneNumber", parquet.FieldRepetitionType_OPTIONAL, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + if err = addressBook.Set("owner", owner); err != nil { + t.Fatal(err) + } + + if err = addressBook.Set("ownerPhoneNumbers", ownerPhoneNumbers); err != nil { + t.Fatal(err) + } + if err = addressBook.Set("ownerPhoneNumbers.list", ownerPhoneNumbersList); err != nil { + t.Fatal(err) + } + if err = addressBook.Set("ownerPhoneNumbers.list.element", ownerPhoneNumbersElement); err != nil { + t.Fatal(err) + } + + if err = addressBook.Set("contacts", contacts); err != nil { + t.Fatal(err) + } + if err = addressBook.Set("contacts.list", contactsList); err != nil { + t.Fatal(err) + } + if err = addressBook.Set("contacts.list.element", contactsElement); err != nil { + t.Fatal(err) + } + if err = addressBook.Set("contacts.list.element.name", contactName); err != nil { + t.Fatal(err) + } + if err = addressBook.Set("contacts.list.element.phoneNumber", contactPhoneNumber); err != nil { + t.Fatal(err) + } + } + + if 
_, _, err := addressBook.ToParquetSchema(); err != nil { + t.Fatal(err) + } + + case2Data := `{ + "owner": "foo" +}` + result2 := map[string]*Column{ + "owner": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{foo}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "ownerPhoneNumbers.list.element": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "contacts.list.element.name": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + } + + case3Data := `{ + "owner": "foo", + "ownerPhoneNumbers": [ + "1-234-567-8901" + ] +} +` + result3 := map[string]*Column{ + "owner": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{foo}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "ownerPhoneNumbers.list.element": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{phone1}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + }, + "contacts.list.element.name": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + } + + case4Data := `{ + "owner": "foo", + "ownerPhoneNumbers": [ + "1-234-567-8901", + "1-234-567-1098" + ] +} +` + result4 := map[string]*Column{ + "owner": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{foo}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "ownerPhoneNumbers.list.element": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{phone1, phone2}, + definitionLevels: []int64{2, 2}, + repetitionLevels: []int64{0, 1}, + }, + "contacts.list.element.name": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + } + + case5Data := `{ + "contacts": [ + { + "name": "bar" + } + ], + "owner": "foo" +}` + result5 := map[string]*Column{ + "owner": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{foo}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "ownerPhoneNumbers.list.element": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "contacts.list.element.name": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{bar}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + }, + "contacts.list.element.phoneNumber": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{1}, + }, + } + + case6Data := `{ + "contacts": [ + { + "name": "bar", + "phoneNumber": "1-111-222-3333" + } + ], + "owner": "foo" +}` + result6 := map[string]*Column{ + "owner": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{foo}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "ownerPhoneNumbers.list.element": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{nil}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "contacts.list.element.name": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{bar}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + }, + "contacts.list.element.phoneNumber": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{phone3}, + definitionLevels: 
[]int64{3}, + repetitionLevels: []int64{1}, + }, + } + + case7Data := `{ + "contacts": [ + { + "name": "bar", + "phoneNumber": "1-111-222-3333" + } + ], + "owner": "foo", + "ownerPhoneNumbers": [ + "1-234-567-8901", + "1-234-567-1098" + ] +}` + result7 := map[string]*Column{ + "owner": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{foo}, + definitionLevels: []int64{0}, + repetitionLevels: []int64{0}, + }, + "ownerPhoneNumbers.list.element": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{phone1, phone2}, + definitionLevels: []int64{2, 2}, + repetitionLevels: []int64{0, 1}, + }, + "contacts.list.element.name": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{bar}, + definitionLevels: []int64{2}, + repetitionLevels: []int64{0}, + }, + "contacts.list.element.phoneNumber": { + parquetType: parquet.Type_BYTE_ARRAY, + values: []interface{}{phone3}, + definitionLevels: []int64{3}, + repetitionLevels: []int64{1}, + }, + } + + testCases := []struct { + data string + expectedResult map[string]*Column + expectErr bool + }{ + {`{}`, nil, true}, // err: owner: nil value for required field + {case2Data, result2, false}, + {case3Data, result3, false}, + {case4Data, result4, false}, + {case5Data, result5, false}, + {case6Data, result6, false}, + {case7Data, result7, false}, + } + + for i, testCase := range testCases { + result, err := UnmarshalJSON([]byte(testCase.data), addressBook) + expectErr := (err != nil) + + if testCase.expectErr != expectErr { + t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr) + } + + if !testCase.expectErr { + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Errorf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } + } +} diff --git a/pkg/s3select/internal/parquet-go/data/data.go b/pkg/s3select/internal/parquet-go/data/data.go new file mode 100644 index 000000000..9d9245a45 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/data.go @@ -0,0 +1,65 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package data + +import ( + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +// ColumnChunk ... +type ColumnChunk struct { + parquet.ColumnChunk + isDictPage bool + dictPageLen int64 + dataPageLen int64 + dataLen int64 + data []byte +} + +// Data returns the data. +func (chunk *ColumnChunk) Data() []byte { + return chunk.data +} + +// DataLen returns the length of the data. +func (chunk *ColumnChunk) DataLen() int64 { + return chunk.dataLen +} + +// NewRowGroup creates a new row group. 
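+// File offsets are assigned sequentially starting at offset; a chunk's
+// dictionary page (if present) is placed immediately before its data page.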
+func NewRowGroup(chunks []*ColumnChunk, numRows, offset int64) *parquet.RowGroup { + rows := parquet.NewRowGroup() + rows.NumRows = numRows + + for _, chunk := range chunks { + rows.Columns = append(rows.Columns, &chunk.ColumnChunk) + rows.TotalByteSize += chunk.dataLen + + chunk.ColumnChunk.FileOffset = offset + + if chunk.isDictPage { + dictPageOffset := offset + chunk.ColumnChunk.MetaData.DictionaryPageOffset = &dictPageOffset + offset += chunk.dictPageLen + } + + chunk.ColumnChunk.MetaData.DataPageOffset = offset + offset += chunk.dataPageLen + } + + return rows +} diff --git a/pkg/s3select/internal/parquet-go/data/jsonvalue.go b/pkg/s3select/internal/parquet-go/data/jsonvalue.go new file mode 100644 index 000000000..bd3c6a179 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/jsonvalue.go @@ -0,0 +1,107 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package data + +import ( + "fmt" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/tidwall/gjson" +) + +type jsonValue struct { + result *gjson.Result + path *string +} + +func (v *jsonValue) String() string { + if v.result == nil { + return "" + } + + return fmt.Sprintf("%v", *v.result) +} + +func (v *jsonValue) IsNull() bool { + return v.result == nil || v.result.Type == gjson.Null +} + +func (v *jsonValue) Get(path string) *jsonValue { + if v.path != nil { + var result *gjson.Result + if *v.path == path { + result = v.result + } + + return resultToJSONValue(result) + } + + if v.result == nil { + return resultToJSONValue(nil) + } + + result := v.result.Get(path) + if !result.Exists() { + return resultToJSONValue(nil) + } + + return resultToJSONValue(&result) +} + +func (v *jsonValue) GetValue(parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) { + if v.result == nil { + return nil, nil + } + + return resultToParquetValue(*v.result, parquetType, convertedType) +} + +func (v *jsonValue) GetArray() ([]gjson.Result, error) { + if v.result == nil { + return nil, nil + } + + return resultToArray(*v.result) +} + +func (v *jsonValue) Range(iterator func(key, value gjson.Result) bool) error { + if v.result == nil || v.result.Type == gjson.Null { + return nil + } + + if v.result.Type != gjson.JSON || !v.result.IsObject() { + return fmt.Errorf("result is not Map but %v", v.result.Type) + } + + v.result.ForEach(iterator) + return nil +} + +func resultToJSONValue(result *gjson.Result) *jsonValue { + return &jsonValue{ + result: result, + } +} + +func bytesToJSONValue(data []byte) (*jsonValue, error) { + if !gjson.ValidBytes(data) { + return nil, fmt.Errorf("invalid JSON data") + } + + result := gjson.ParseBytes(data) + return resultToJSONValue(&result), nil +} diff --git a/pkg/s3select/internal/parquet-go/data/result.go b/pkg/s3select/internal/parquet-go/data/result.go new file mode 100644 index 000000000..2385b9bcf --- /dev/null +++ b/pkg/s3select/internal/parquet-go/data/result.go @@ -0,0 +1,360 @@ 
+/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package data + +import ( + "fmt" + "math" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" + "github.com/tidwall/gjson" +) + +func resultToBool(result gjson.Result) (value interface{}, err error) { + switch result.Type { + case gjson.False, gjson.True: + return result.Bool(), nil + } + + return nil, fmt.Errorf("result is not Bool but %v", result.Type) +} + +func resultToInt32(result gjson.Result) (value interface{}, err error) { + if value, err = resultToInt64(result); err != nil { + return nil, err + } + + if value.(int64) < math.MinInt32 || value.(int64) > math.MaxInt32 { + return nil, fmt.Errorf("int32 overflow") + } + + return int32(value.(int64)), nil +} + +func resultToInt64(result gjson.Result) (value interface{}, err error) { + if result.Type == gjson.Number { + return result.Int(), nil + } + + return nil, fmt.Errorf("result is not Number but %v", result.Type) +} + +func resultToFloat(result gjson.Result) (value interface{}, err error) { + if result.Type == gjson.Number { + return float32(result.Float()), nil + } + + return nil, fmt.Errorf("result is not float32 but %v", result.Type) +} + +func resultToDouble(result gjson.Result) (value interface{}, err error) { + if result.Type == gjson.Number { + return result.Float(), nil + } + + return nil, fmt.Errorf("result is not float64 but %v", result.Type) +} + +func resultToBytes(result gjson.Result) (interface{}, error) { + if result.Type != gjson.JSON || !result.IsArray() { + return nil, fmt.Errorf("result is not byte array but %v", result.Type) + } + + data := []byte{} + for i, r := range result.Array() { + if r.Type != gjson.Number { + return nil, fmt.Errorf("result[%v] is not byte but %v", i, r.Type) + } + + value := r.Uint() + if value > math.MaxUint8 { + return nil, fmt.Errorf("byte overflow in result[%v]", i) + } + + data = append(data, byte(value)) + } + + return data, nil +} + +func resultToString(result gjson.Result) (value interface{}, err error) { + if result.Type == gjson.String { + return result.String(), nil + } + + return nil, fmt.Errorf("result is not String but %v", result.Type) +} + +func resultToUint8(result gjson.Result) (value interface{}, err error) { + if value, err = resultToUint64(result); err != nil { + return nil, err + } + + if value.(uint64) > math.MaxUint8 { + return nil, fmt.Errorf("uint8 overflow") + } + + return uint8(value.(uint64)), nil +} + +func resultToUint16(result gjson.Result) (value interface{}, err error) { + if value, err = resultToUint64(result); err != nil { + return nil, err + } + + if value.(uint64) > math.MaxUint16 { + return nil, fmt.Errorf("uint16 overflow") + } + + return uint16(value.(uint64)), nil +} + +func resultToUint32(result gjson.Result) (value interface{}, err error) { + if value, err = resultToUint64(result); err != nil { + return nil, err + } + + if value.(uint64) > math.MaxUint32 { + return nil, fmt.Errorf("uint32 overflow") + } + + 
return uint32(value.(uint64)), nil +} + +func resultToUint64(result gjson.Result) (value interface{}, err error) { + if result.Type == gjson.Number { + return result.Uint(), nil + } + + return nil, fmt.Errorf("result is not Number but %v", result.Type) +} + +func resultToInt8(result gjson.Result) (value interface{}, err error) { + if value, err = resultToInt64(result); err != nil { + return nil, err + } + + if value.(int64) < math.MinInt8 || value.(int64) > math.MaxInt8 { + return nil, fmt.Errorf("int8 overflow") + } + + return int8(value.(int64)), nil +} + +func resultToInt16(result gjson.Result) (value interface{}, err error) { + if value, err = resultToInt64(result); err != nil { + return nil, err + } + + if value.(int64) < math.MinInt16 || value.(int64) > math.MaxInt16 { + return nil, fmt.Errorf("int16 overflow") + } + + return int16(value.(int64)), nil +} + +func stringToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY: + return []byte(value.(string)), nil + } + + return nil, fmt.Errorf("string cannot be converted to parquet type %v", parquetType) +} + +func uint8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return int32(value.(uint8)), nil + case parquet.Type_INT64: + return int64(value.(uint8)), nil + } + + return nil, fmt.Errorf("uint8 cannot be converted to parquet type %v", parquetType) +} + +func uint16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return int32(value.(uint16)), nil + case parquet.Type_INT64: + return int64(value.(uint16)), nil + } + + return nil, fmt.Errorf("uint16 cannot be converted to parquet type %v", parquetType) +} + +func uint32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return int32(value.(uint32)), nil + case parquet.Type_INT64: + return int64(value.(uint32)), nil + } + + return nil, fmt.Errorf("uint32 cannot be converted to parquet type %v", parquetType) +} + +func uint64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return int32(value.(uint64)), nil + case parquet.Type_INT64: + return int64(value.(uint64)), nil + } + + return nil, fmt.Errorf("uint64 cannot be converted to parquet type %v", parquetType) +} + +func int8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return int32(value.(int8)), nil + case parquet.Type_INT64: + return int64(value.(int8)), nil + } + + return nil, fmt.Errorf("int8 cannot be converted to parquet type %v", parquetType) +} + +func int16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return int32(value.(int16)), nil + case parquet.Type_INT64: + return int64(value.(int16)), nil + } + + return nil, fmt.Errorf("int16 cannot be converted to parquet type %v", parquetType) +} + +func int32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return value.(int32), nil + case parquet.Type_INT64: + return int64(value.(int32)), nil + } + + return nil, fmt.Errorf("int32 cannot be converted to parquet 
type %v", parquetType) +} + +func int64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) { + switch parquetType { + case parquet.Type_INT32: + return int32(value.(int64)), nil + case parquet.Type_INT64: + return value.(int64), nil + } + + return nil, fmt.Errorf("int64 cannot be converted to parquet type %v", parquetType) +} + +func resultToParquetValueByConvertedValue(result gjson.Result, convertedType parquet.ConvertedType, parquetType parquet.Type) (value interface{}, err error) { + if result.Type == gjson.Null { + return nil, nil + } + + switch convertedType { + case parquet.ConvertedType_UTF8: + if value, err = resultToString(result); err != nil { + return nil, err + } + return stringToParquetValue(value, parquetType) + case parquet.ConvertedType_UINT_8: + if value, err = resultToUint8(result); err != nil { + return nil, err + } + return uint8ToParquetValue(value, parquetType) + case parquet.ConvertedType_UINT_16: + if value, err = resultToUint16(result); err != nil { + return nil, err + } + return uint16ToParquetValue(value, parquetType) + case parquet.ConvertedType_UINT_32: + if value, err = resultToUint32(result); err != nil { + return nil, err + } + return uint32ToParquetValue(value, parquetType) + case parquet.ConvertedType_UINT_64: + if value, err = resultToUint64(result); err != nil { + return nil, err + } + return uint64ToParquetValue(value, parquetType) + case parquet.ConvertedType_INT_8: + if value, err = resultToInt8(result); err != nil { + return nil, err + } + return int8ToParquetValue(value, parquetType) + case parquet.ConvertedType_INT_16: + if value, err = resultToInt16(result); err != nil { + return nil, err + } + return int16ToParquetValue(value, parquetType) + case parquet.ConvertedType_INT_32: + if value, err = resultToInt32(result); err != nil { + return nil, err + } + return int32ToParquetValue(value, parquetType) + case parquet.ConvertedType_INT_64: + if value, err = resultToInt64(result); err != nil { + return nil, err + } + return int64ToParquetValue(value, parquetType) + } + + return nil, fmt.Errorf("unsupported converted type %v", convertedType) +} + +func resultToParquetValue(result gjson.Result, parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) { + if convertedType != nil { + return resultToParquetValueByConvertedValue(result, *convertedType, parquetType) + } + + if result.Type == gjson.Null { + return nil, nil + } + + switch parquetType { + case parquet.Type_BOOLEAN: + return resultToBool(result) + case parquet.Type_INT32: + return resultToInt32(result) + case parquet.Type_INT64: + return resultToInt64(result) + case parquet.Type_FLOAT: + return resultToFloat(result) + case parquet.Type_DOUBLE: + return resultToDouble(result) + case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY: + return resultToBytes(result) + } + + return nil, fmt.Errorf("unknown parquet type %v", parquetType) +} + +func resultToArray(result gjson.Result) ([]gjson.Result, error) { + if result.Type == gjson.Null { + return nil, nil + } + + if result.Type != gjson.JSON || !result.IsArray() { + return nil, fmt.Errorf("result is not Array but %v", result.Type) + } + + return result.Array(), nil +} diff --git a/pkg/s3select/internal/parquet-go/decode.go b/pkg/s3select/internal/parquet-go/decode.go new file mode 100644 index 000000000..fbb3bcc65 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/decode.go @@ -0,0 +1,490 @@ +/* + * Minio Cloud Storage, (C) 2018 Minio, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package parquet + +import ( + "bytes" + "fmt" + "math" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func i64sToi32s(i64s []int64) (i32s []int32) { + i32s = make([]int32, len(i64s)) + for i := range i64s { + i32s[i] = int32(i64s[i]) + } + + return i32s +} + +func readBitPacked(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) { + count := header * 8 + + if count == 0 { + return result, nil + } + + if bitWidth == 0 { + return make([]int64, count), nil + } + + data := make([]byte, header*bitWidth) + if _, err = reader.Read(data); err != nil { + return nil, err + } + + var val, used, left, b uint64 + + valNeedBits := bitWidth + i := -1 + for { + if left <= 0 { + i++ + if i >= len(data) { + break + } + + b = uint64(data[i]) + left = 8 + used = 0 + } + + if left >= valNeedBits { + val |= ((b >> used) & ((1 << valNeedBits) - 1)) << (bitWidth - valNeedBits) + result = append(result, int64(val)) + val = 0 + left -= valNeedBits + used += valNeedBits + valNeedBits = bitWidth + } else { + val |= (b >> used) << (bitWidth - valNeedBits) + valNeedBits -= left + left = 0 + } + } + + return result, nil +} + +func readBools(reader *bytes.Reader, count uint64) (result []bool, err error) { + i64s, err := readBitPacked(reader, count, 1) + if err != nil { + return nil, err + } + + var i uint64 + for i = 0; i < count; i++ { + result = append(result, i64s[i] > 0) + } + + return result, nil +} + +func readInt32s(reader *bytes.Reader, count uint64) (result []int32, err error) { + buf := make([]byte, 4) + + var i uint64 + for i = 0; i < count; i++ { + if _, err = reader.Read(buf); err != nil { + return nil, err + } + + result = append(result, int32(bytesToUint32(buf))) + } + + return result, nil +} + +func readInt64s(reader *bytes.Reader, count uint64) (result []int64, err error) { + buf := make([]byte, 8) + + var i uint64 + for i = 0; i < count; i++ { + if _, err = reader.Read(buf); err != nil { + return nil, err + } + + result = append(result, int64(bytesToUint64(buf))) + } + + return result, nil +} + +func readInt96s(reader *bytes.Reader, count uint64) (result [][]byte, err error) { + var i uint64 + for i = 0; i < count; i++ { + buf := make([]byte, 12) + + if _, err = reader.Read(buf); err != nil { + return nil, err + } + + result = append(result, buf) + } + + return result, nil +} + +func readFloats(reader *bytes.Reader, count uint64) (result []float32, err error) { + buf := make([]byte, 4) + + var i uint64 + for i = 0; i < count; i++ { + if _, err = reader.Read(buf); err != nil { + return nil, err + } + + result = append(result, math.Float32frombits(bytesToUint32(buf))) + } + + return result, nil +} + +func readDoubles(reader *bytes.Reader, count uint64) (result []float64, err error) { + buf := make([]byte, 8) + + var i uint64 + for i = 0; i < count; i++ { + if _, err = reader.Read(buf); err != nil { + return nil, err + } + + result = append(result, 
math.Float64frombits(bytesToUint64(buf))) + } + + return result, nil +} + +func readByteArrays(reader *bytes.Reader, count uint64) (result [][]byte, err error) { + buf := make([]byte, 4) + var length uint32 + var data []byte + + var i uint64 + for i = 0; i < count; i++ { + if _, err = reader.Read(buf); err != nil { + return nil, err + } + + length = bytesToUint32(buf) + data = make([]byte, length) + if length > 0 { + if _, err = reader.Read(data); err != nil { + return nil, err + } + } + + result = append(result, data) + } + + return result, nil +} + +func readFixedLenByteArrays(reader *bytes.Reader, count, length uint64) (result [][]byte, err error) { + var i uint64 + for i = 0; i < count; i++ { + data := make([]byte, length) + if _, err = reader.Read(data); err != nil { + return nil, err + } + + result = append(result, data) + } + + return result, nil +} + +func readValues(reader *bytes.Reader, dataType parquet.Type, count, length uint64) (interface{}, error) { + switch dataType { + case parquet.Type_BOOLEAN: + return readBools(reader, count) + case parquet.Type_INT32: + return readInt32s(reader, count) + case parquet.Type_INT64: + return readInt64s(reader, count) + case parquet.Type_INT96: + return readInt96s(reader, count) + case parquet.Type_FLOAT: + return readFloats(reader, count) + case parquet.Type_DOUBLE: + return readDoubles(reader, count) + case parquet.Type_BYTE_ARRAY: + return readByteArrays(reader, count) + case parquet.Type_FIXED_LEN_BYTE_ARRAY: + return readFixedLenByteArrays(reader, count, length) + } + + return nil, fmt.Errorf("unknown parquet type %v", dataType) +} + +func readUnsignedVarInt(reader *bytes.Reader) (v uint64, err error) { + var b byte + var shift uint64 + + for { + if b, err = reader.ReadByte(); err != nil { + return 0, err + } + + if v |= ((uint64(b) & 0x7F) << shift); b&0x80 == 0 { + break + } + + shift += 7 + } + + return v, nil +} + +func readRLE(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) { + width := (bitWidth + 7) / 8 + data := make([]byte, width) + if width > 0 { + if _, err = reader.Read(data); err != nil { + return nil, err + } + } + + if width < 4 { + data = append(data, make([]byte, 4-width)...) + } + + val := int64(bytesToUint32(data)) + + count := header >> 1 + result = make([]int64, count) + for i := range result { + result[i] = val + } + + return result, nil +} + +func readRLEBitPackedHybrid(reader *bytes.Reader, length, bitWidth uint64) (result []int64, err error) { + if length <= 0 { + var i32s []int32 + i32s, err = readInt32s(reader, 1) + if err != nil { + return nil, err + } + length = uint64(i32s[0]) + } + + buf := make([]byte, length) + if _, err = reader.Read(buf); err != nil { + return nil, err + } + + reader = bytes.NewReader(buf) + for reader.Len() > 0 { + header, err := readUnsignedVarInt(reader) + if err != nil { + return nil, err + } + + var i64s []int64 + if header&1 == 0 { + i64s, err = readRLE(reader, header, bitWidth) + } else { + i64s, err = readBitPacked(reader, header>>1, bitWidth) + } + + if err != nil { + return nil, err + } + + result = append(result, i64s...) 
+ } + + return result, nil +} + +func readDeltaBinaryPackedInt(reader *bytes.Reader) (result []int64, err error) { + blockSize, err := readUnsignedVarInt(reader) + if err != nil { + return nil, err + } + + numMiniblocksInBlock, err := readUnsignedVarInt(reader) + if err != nil { + return nil, err + } + + numValues, err := readUnsignedVarInt(reader) + if err != nil { + return nil, err + } + + firstValueZigZag, err := readUnsignedVarInt(reader) + if err != nil { + return nil, err + } + + v := int64(firstValueZigZag>>1) ^ (-int64(firstValueZigZag & 1)) + result = append(result, v) + + numValuesInMiniBlock := blockSize / numMiniblocksInBlock + + bitWidths := make([]uint64, numMiniblocksInBlock) + for uint64(len(result)) < numValues { + minDeltaZigZag, err := readUnsignedVarInt(reader) + if err != nil { + return nil, err + } + + for i := 0; uint64(i) < numMiniblocksInBlock; i++ { + b, err := reader.ReadByte() + if err != nil { + return nil, err + } + bitWidths[i] = uint64(b) + } + + minDelta := int64(minDeltaZigZag>>1) ^ (-int64(minDeltaZigZag & 1)) + for i := 0; uint64(i) < numMiniblocksInBlock; i++ { + i64s, err := readBitPacked(reader, numValuesInMiniBlock/8, bitWidths[i]) + if err != nil { + return nil, err + } + + for j := range i64s { + v += i64s[j] + minDelta + result = append(result, v) + } + } + } + + return result[:numValues], nil +} + +func readDeltaLengthByteArrays(reader *bytes.Reader) (result [][]byte, err error) { + i64s, err := readDeltaBinaryPackedInt(reader) + if err != nil { + return nil, err + } + + for i := 0; i < len(i64s); i++ { + arrays, err := readFixedLenByteArrays(reader, 1, uint64(i64s[i])) + if err != nil { + return nil, err + } + + result = append(result, arrays[0]) + } + + return result, nil +} + +func readDeltaByteArrays(reader *bytes.Reader) (result [][]byte, err error) { + i64s, err := readDeltaBinaryPackedInt(reader) + if err != nil { + return nil, err + } + + suffixes, err := readDeltaLengthByteArrays(reader) + if err != nil { + return nil, err + } + + result = append(result, suffixes[0]) + for i := 1; i < len(i64s); i++ { + prefixLength := i64s[i] + val := append([]byte{}, result[i-1][:prefixLength]...) + val = append(val, suffixes[i]...) 
+ result = append(result, val) + } + + return result, nil +} + +func readDataPageValues( + bytesReader *bytes.Reader, + encoding parquet.Encoding, + dataType parquet.Type, + convertedType parquet.ConvertedType, + count, bitWidth uint64, +) (result interface{}, resultDataType parquet.Type, err error) { + switch encoding { + case parquet.Encoding_PLAIN: + result, err = readValues(bytesReader, dataType, count, bitWidth) + return result, dataType, err + + case parquet.Encoding_PLAIN_DICTIONARY: + b, err := bytesReader.ReadByte() + if err != nil { + return nil, -1, err + } + + i64s, err := readRLEBitPackedHybrid(bytesReader, uint64(bytesReader.Len()), uint64(b)) + if err != nil { + return nil, -1, err + } + + return i64s[:count], parquet.Type_INT64, nil + + case parquet.Encoding_RLE: + i64s, err := readRLEBitPackedHybrid(bytesReader, 0, bitWidth) + if err != nil { + return nil, -1, err + } + + i64s = i64s[:count] + + if dataType == parquet.Type_INT32 { + return i64sToi32s(i64s), parquet.Type_INT32, nil + } + + return i64s, parquet.Type_INT64, nil + + case parquet.Encoding_BIT_PACKED: + return nil, -1, fmt.Errorf("deprecated parquet encoding %v", parquet.Encoding_BIT_PACKED) + + case parquet.Encoding_DELTA_BINARY_PACKED: + i64s, err := readDeltaBinaryPackedInt(bytesReader) + if err != nil { + return nil, -1, err + } + + i64s = i64s[:count] + + if dataType == parquet.Type_INT32 { + return i64sToi32s(i64s), parquet.Type_INT32, nil + } + + return i64s, parquet.Type_INT64, nil + + case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: + byteSlices, err := readDeltaLengthByteArrays(bytesReader) + if err != nil { + return nil, -1, err + } + + return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil + + case parquet.Encoding_DELTA_BYTE_ARRAY: + byteSlices, err := readDeltaByteArrays(bytesReader) + if err != nil { + return nil, -1, err + } + + return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil + } + + return nil, -1, fmt.Errorf("unsupported parquet encoding %v", encoding) +} diff --git a/pkg/s3select/internal/parquet-go/encode.go b/pkg/s3select/internal/parquet-go/encode.go new file mode 100644 index 000000000..b165a2bd9 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encode.go @@ -0,0 +1,450 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package parquet + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "math" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func boolsToBytes(bs []bool) []byte { + size := (len(bs) + 7) / 8 + result := make([]byte, size) + for i := range bs { + if bs[i] { + result[i/8] |= 1 << uint32(i%8) + } + } + + return result +} + +func int32sToBytes(i32s []int32) []byte { + buf := make([]byte, 4*len(i32s)) + for i, i32 := range i32s { + binary.LittleEndian.PutUint32(buf[i*4:], uint32(i32)) + } + return buf +} + +func int64sToBytes(i64s []int64) []byte { + buf := make([]byte, 8*len(i64s)) + for i, i64 := range i64s { + binary.LittleEndian.PutUint64(buf[i*8:], uint64(i64)) + } + return buf +} + +func float32sToBytes(f32s []float32) []byte { + buf := make([]byte, 4*len(f32s)) + for i, f32 := range f32s { + binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(f32)) + } + return buf +} + +func float64sToBytes(f64s []float64) []byte { + buf := make([]byte, 8*len(f64s)) + for i, f64 := range f64s { + binary.LittleEndian.PutUint64(buf[i*8:], math.Float64bits(f64)) + } + return buf +} + +func byteSlicesToBytes(byteSlices [][]byte) []byte { + buf := new(bytes.Buffer) + for _, s := range byteSlices { + if err := binary.Write(buf, binary.LittleEndian, uint32(len(s))); err != nil { + panic(err) + } + + if _, err := buf.Write(s); err != nil { + panic(err) + } + } + + return buf.Bytes() +} + +func byteArraysToBytes(arrayList [][]byte) []byte { + buf := new(bytes.Buffer) + arrayLen := -1 + for _, array := range arrayList { + if arrayLen != -1 && len(array) != arrayLen { + panic(errors.New("array list does not have same length")) + } + + arrayLen = len(array) + if _, err := buf.Write(array); err != nil { + panic(err) + } + } + + return buf.Bytes() +} + +func int96sToBytes(i96s [][]byte) []byte { + return byteArraysToBytes(i96s) +} + +func valuesToBytes(values interface{}, dataType parquet.Type) []byte { + switch dataType { + case parquet.Type_BOOLEAN: + return boolsToBytes(values.([]bool)) + case parquet.Type_INT32: + return int32sToBytes(values.([]int32)) + case parquet.Type_INT64: + return int64sToBytes(values.([]int64)) + case parquet.Type_INT96: + return int96sToBytes(values.([][]byte)) + case parquet.Type_FLOAT: + return float32sToBytes(values.([]float32)) + case parquet.Type_DOUBLE: + return float64sToBytes(values.([]float64)) + case parquet.Type_BYTE_ARRAY: + return byteSlicesToBytes(values.([][]byte)) + case parquet.Type_FIXED_LEN_BYTE_ARRAY: + return byteArraysToBytes(values.([][]byte)) + } + + return []byte{} +} + +func valueToBytes(value interface{}, dataType parquet.Type) []byte { + var values interface{} + switch dataType { + case parquet.Type_BOOLEAN: + values = []bool{value.(bool)} + case parquet.Type_INT32: + values = []int32{value.(int32)} + case parquet.Type_INT64: + values = []int64{value.(int64)} + case parquet.Type_INT96: + values = [][]byte{value.([]byte)} + case parquet.Type_FLOAT: + values = []float32{value.(float32)} + case parquet.Type_DOUBLE: + values = []float64{value.(float64)} + case parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY: + values = [][]byte{value.([]byte)} + } + + return valuesToBytes(values, dataType) +} + +func unsignedVarIntToBytes(ui64 uint64) []byte { + size := (getBitWidth(ui64) + 6) / 7 + if size == 0 { + return []byte{0} + } + + buf := make([]byte, size) + for i := uint64(0); i < size; i++ { + buf[i] = byte(ui64&0x7F) | 0x80 + ui64 >>= 7 + } + buf[size-1] &= 0x7F + + return buf +} + +func 
valuesToRLEBytes(values interface{}, bitWidth int32, valueType parquet.Type) []byte { + vals := valuesToInterfaces(values, valueType) + result := []byte{} + j := 0 + for i := 0; i < len(vals); i = j { + for j = i + 1; j < len(vals) && vals[i] == vals[j]; j++ { + } + headerBytes := unsignedVarIntToBytes(uint64((j - i) << 1)) + result = append(result, headerBytes...) + + valBytes := valueToBytes(vals[i], valueType) + byteCount := (bitWidth + 7) / 8 + result = append(result, valBytes[:byteCount]...) + } + + return result +} + +func valuesToRLEBitPackedHybridBytes(values interface{}, bitWidth int32, dataType parquet.Type) []byte { + rleBytes := valuesToRLEBytes(values, bitWidth, dataType) + lenBytes := valueToBytes(int32(len(rleBytes)), parquet.Type_INT32) + return append(lenBytes, rleBytes...) +} + +func valuesToBitPackedBytes(values interface{}, bitWidth int64, withHeader bool, dataType parquet.Type) []byte { + var i64s []int64 + switch dataType { + case parquet.Type_BOOLEAN: + bs := values.([]bool) + i64s = make([]int64, len(bs)) + for i := range bs { + if bs[i] { + i64s[i] = 1 + } + } + case parquet.Type_INT32: + i32s := values.([]int32) + i64s = make([]int64, len(i32s)) + for i := range i32s { + i64s[i] = int64(i32s[i]) + } + case parquet.Type_INT64: + i64s = values.([]int64) + default: + panic(fmt.Errorf("data type %v is not supported for bit packing", dataType)) + } + + if len(i64s) == 0 { + return nil + } + + var valueByte byte + bitsSet := uint64(0) + bitsNeeded := uint64(8) + bitsToSet := uint64(bitWidth) + value := i64s[0] + + valueBytes := []byte{} + for i := 0; i < len(i64s); { + if bitsToSet >= bitsNeeded { + valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded)) + valueBytes = append(valueBytes, valueByte) + bitsToSet -= bitsNeeded + bitsSet += bitsNeeded + + bitsNeeded = 8 + valueByte = 0 + + if bitsToSet <= 0 && (i+1) < len(i64s) { + i++ + value = i64s[i] + bitsToSet = uint64(bitWidth) + bitsSet = 0 + } + } else { + valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded)) + i++ + + if i < len(i64s) { + value = i64s[i] + } + + bitsNeeded -= bitsToSet + bitsToSet = uint64(bitWidth) + bitsSet = 0 + } + } + + if withHeader { + header := uint64(((len(i64s) / 8) << 1) | 1) + headerBytes := unsignedVarIntToBytes(header) + return append(headerBytes, valueBytes...) + } + + return valueBytes +} + +const ( + blockSize = 128 + subBlockSize = 32 + subBlockCount = blockSize / subBlockSize +) + +var ( + blockSizeBytes = unsignedVarIntToBytes(blockSize) + subBlockCountBytes = unsignedVarIntToBytes(subBlockCount) +) + +func int32ToDeltaBytes(i32s []int32) []byte { + getValue := func(i32 int32) uint64 { + return uint64((i32 >> 31) ^ (i32 << 1)) + } + + result := append([]byte{}, blockSizeBytes...) + result = append(result, subBlockCountBytes...) + result = append(result, unsignedVarIntToBytes(uint64(len(i32s)))...) + result = append(result, unsignedVarIntToBytes(getValue(i32s[0]))...) 
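+	// Header written so far: block size, sub-block count, total value count
+	// and the zigzag-encoded first value; blocks of deltas follow.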
+
+	for i := 1; i < len(i32s); {
+		block := []int32{}
+		minDelta := int32(0x7FFFFFFF)
+
+		for ; i < len(i32s) && len(block) < blockSize; i++ {
+			delta := i32s[i] - i32s[i-1]
+			block = append(block, delta)
+			if delta < minDelta {
+				minDelta = delta
+			}
+		}
+
+		for len(block) < blockSize {
+			block = append(block, minDelta)
+		}
+
+		bitWidths := make([]byte, subBlockCount)
+		for j := 0; j < subBlockCount; j++ {
+			maxValue := int32(0)
+			for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
+				block[k] -= minDelta
+				if block[k] > maxValue {
+					maxValue = block[k]
+				}
+			}
+
+			bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
+		}
+
+		minDeltaZigZag := getValue(minDelta)
+		result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
+		result = append(result, bitWidths...)
+
+		for j := 0; j < subBlockCount; j++ {
+			bitPacked := valuesToBitPackedBytes(
+				block[j*subBlockSize:(j+1)*subBlockSize],
+				int64(bitWidths[j]),
+				false,
+				parquet.Type_INT32,
+			)
+			result = append(result, bitPacked...)
+		}
+	}
+
+	return result
+}
+
+func int64ToDeltaBytes(i64s []int64) []byte {
+	getValue := func(i64 int64) uint64 {
+		return uint64((i64 >> 63) ^ (i64 << 1))
+	}
+
+	result := append([]byte{}, blockSizeBytes...)
+	result = append(result, subBlockCountBytes...)
+	result = append(result, unsignedVarIntToBytes(uint64(len(i64s)))...)
+	result = append(result, unsignedVarIntToBytes(getValue(i64s[0]))...)
+
+	for i := 1; i < len(i64s); {
+		block := []int64{}
+		minDelta := int64(0x7FFFFFFFFFFFFFFF)
+
+		for ; i < len(i64s) && len(block) < blockSize; i++ {
+			delta := i64s[i] - i64s[i-1]
+			block = append(block, delta)
+			if delta < minDelta {
+				minDelta = delta
+			}
+		}
+
+		for len(block) < blockSize {
+			block = append(block, minDelta)
+		}
+
+		bitWidths := make([]byte, subBlockCount)
+		for j := 0; j < subBlockCount; j++ {
+			maxValue := int64(0)
+			for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
+				block[k] -= minDelta
+				if block[k] > maxValue {
+					maxValue = block[k]
+				}
+			}
+
+			bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
+		}
+
+		minDeltaZigZag := getValue(minDelta)
+		result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
+		result = append(result, bitWidths...)
+
+		for j := 0; j < subBlockCount; j++ {
+			bitPacked := valuesToBitPackedBytes(
+				block[j*subBlockSize:(j+1)*subBlockSize],
+				int64(bitWidths[j]),
+				false,
+				parquet.Type_INT64,
+			)
+			result = append(result, bitPacked...)
+		}
+	}
+
+	return result
+}
+
+func valuesToDeltaBytes(values interface{}, dataType parquet.Type) []byte {
+	switch dataType {
+	case parquet.Type_INT32:
+		return int32ToDeltaBytes(values.([]int32))
+	case parquet.Type_INT64:
+		return int64ToDeltaBytes(values.([]int64))
+	}
+
+	return nil
+}
+
+func stringsToDeltaLengthByteArrayBytes(strs []string) []byte {
+	lengths := make([]int32, len(strs))
+	for i, s := range strs {
+		lengths[i] = int32(len(s))
+	}
+
+	result := int32ToDeltaBytes(lengths)
+	for _, s := range strs {
+		result = append(result, []byte(s)...)
+	}
+
+	return result
+}
+
+func stringsToDeltaByteArrayBytes(strs []string) []byte {
+	prefixLengths := make([]int32, len(strs))
+	suffixes := make([]string, len(strs))
+
+	// The first string is stored whole; each later string stores only its
+	// suffix after the longest common prefix with the previous string.
+	if len(strs) > 0 {
+		suffixes[0] = strs[0]
+	}
+
+	var i, j int
+	for i = 1; i < len(strs); i++ {
+		for j = 0; j < len(strs[i-1]) && j < len(strs[i]); j++ {
+			if strs[i-1][j] != strs[i][j] {
+				break
+			}
+		}
+
+		prefixLengths[i] = int32(j)
+		suffixes[i] = strs[i][j:]
+	}
+
+	result := int32ToDeltaBytes(prefixLengths)
+	return append(result, stringsToDeltaLengthByteArrayBytes(suffixes)...)
+} + +func encodeValues(values interface{}, dataType parquet.Type, encoding parquet.Encoding, bitWidth int32) []byte { + switch encoding { + case parquet.Encoding_RLE: + return valuesToRLEBitPackedHybridBytes(values, bitWidth, dataType) + case parquet.Encoding_DELTA_BINARY_PACKED: + return valuesToDeltaBytes(values, dataType) + case parquet.Encoding_DELTA_BYTE_ARRAY: + return stringsToDeltaByteArrayBytes(values.([]string)) + case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: + return stringsToDeltaLengthByteArrayBytes(values.([]string)) + } + + return valuesToBytes(values, dataType) +} diff --git a/pkg/s3select/internal/parquet-go/encode_test.go b/pkg/s3select/internal/parquet-go/encode_test.go new file mode 100644 index 000000000..8379c190c --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encode_test.go @@ -0,0 +1,189 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package parquet + +import ( + "math" + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func TestBoolsToBytes(t *testing.T) { + testCases := []struct { + bs []bool + expectedResult []byte + }{ + {nil, []byte{}}, + {[]bool{}, []byte{}}, + {[]bool{true}, []byte{1}}, + {[]bool{false}, []byte{0}}, + {[]bool{true, true}, []byte{3}}, + {[]bool{false, false}, []byte{0}}, + {[]bool{false, true}, []byte{2}}, + {[]bool{true, false}, []byte{1}}, + {[]bool{false, false, false, false, false, false, false, true, true}, []byte{128, 1}}, + } + + for i, testCase := range testCases { + result := boolsToBytes(testCase.bs) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestInt32sToBytes(t *testing.T) { + testCases := []struct { + i32s []int32 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]int32{}, []byte{}}, + {[]int32{1}, []byte{1, 0, 0, 0}}, + {[]int32{-1}, []byte{255, 255, 255, 255}}, + {[]int32{256}, []byte{0, 1, 0, 0}}, + {[]int32{math.MinInt32}, []byte{0, 0, 0, 128}}, + {[]int32{math.MaxInt32}, []byte{255, 255, 255, 127}}, + {[]int32{257, -2}, []byte{1, 1, 0, 0, 254, 255, 255, 255}}, + } + + for i, testCase := range testCases { + result := int32sToBytes(testCase.i32s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestInt64sToBytes(t *testing.T) { + testCases := []struct { + i64s []int64 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]int64{}, []byte{}}, + {[]int64{1}, []byte{1, 0, 0, 0, 0, 0, 0, 0}}, + {[]int64{-1}, []byte{255, 255, 255, 255, 255, 255, 255, 255}}, + {[]int64{256}, []byte{0, 1, 0, 0, 0, 0, 0, 0}}, + {[]int64{math.MinInt64}, []byte{0, 0, 0, 0, 0, 0, 0, 128}}, + {[]int64{math.MaxInt64}, []byte{255, 255, 255, 255, 255, 255, 255, 127}}, + {[]int64{257, -2}, []byte{1, 1, 0, 0, 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255}}, + } + + for i, testCase := range testCases 
{ + result := int64sToBytes(testCase.i64s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestFloat32sToBytes(t *testing.T) { + testCases := []struct { + f32s []float32 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]float32{}, []byte{}}, + {[]float32{1}, []byte{0, 0, 128, 63}}, + {[]float32{1.0}, []byte{0, 0, 128, 63}}, + {[]float32{-1}, []byte{0, 0, 128, 191}}, + {[]float32{-1.0}, []byte{0, 0, 128, 191}}, + {[]float32{256}, []byte{0, 0, 128, 67}}, + {[]float32{1.1}, []byte{205, 204, 140, 63}}, + {[]float32{-1.1}, []byte{205, 204, 140, 191}}, + {[]float32{math.Pi}, []byte{219, 15, 73, 64}}, + {[]float32{257, -2}, []byte{0, 128, 128, 67, 0, 0, 0, 192}}, + {[]float32{257.1, -2.1}, []byte{205, 140, 128, 67, 102, 102, 6, 192}}, + } + + for i, testCase := range testCases { + result := float32sToBytes(testCase.f32s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestFloat64sToBytes(t *testing.T) { + testCases := []struct { + f64s []float64 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]float64{}, []byte{}}, + {[]float64{1}, []byte{0, 0, 0, 0, 0, 0, 240, 63}}, + {[]float64{1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 63}}, + {[]float64{-1}, []byte{0, 0, 0, 0, 0, 0, 240, 191}}, + {[]float64{-1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 191}}, + {[]float64{256}, []byte{0, 0, 0, 0, 0, 0, 112, 64}}, + {[]float64{1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 63}}, + {[]float64{-1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 191}}, + {[]float64{math.Pi}, []byte{24, 45, 68, 84, 251, 33, 9, 64}}, + {[]float64{257, -2}, []byte{0, 0, 0, 0, 0, 16, 112, 64, 0, 0, 0, 0, 0, 0, 0, 192}}, + {[]float64{257.1, -2.1}, []byte{154, 153, 153, 153, 153, 17, 112, 64, 205, 204, 204, 204, 204, 204, 0, 192}}, + } + + for i, testCase := range testCases { + result := float64sToBytes(testCase.f64s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestUnsignedVarIntToBytes(t *testing.T) { + testCases := []struct { + ui64 uint64 + expectedResult []byte + }{ + {0, []byte{0}}, + {1, []byte{1}}, + {0x7F, []byte{127}}, + {0x80, []byte{128, 1}}, + {uint64(math.MaxUint64), []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 1}}, + } + + for i, testCase := range testCases { + result := unsignedVarIntToBytes(testCase.ui64) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestValuesToRLEBytes(t *testing.T) { + testCases := []struct { + values interface{} + bitWidth int32 + dataType parquet.Type + expectedResult []byte + }{ + {[]int32{3, 5, 7}, 1, parquet.Type_INT32, []byte{2, 3, 2, 5, 2, 7}}, + {[]int32{3, 3, 3}, 1, parquet.Type_INT32, []byte{6, 3}}, + {[]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32, []byte{4, 2, 6, 3}}, + } + + for i, testCase := range testCases { + result := valuesToRLEBytes(testCase.values, testCase.bitWidth, testCase.dataType) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} diff --git a/pkg/s3select/internal/parquet-go/encoding/common.go b/pkg/s3select/internal/parquet-go/encoding/common.go new file mode 100644 index 000000000..f26619fa6 --- /dev/null +++ 
b/pkg/s3select/internal/parquet-go/encoding/common.go @@ -0,0 +1,38 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package encoding + +import ( + "github.com/minio/minio/pkg/s3select/internal/parquet-go/common" +) + +// Refer https://en.wikipedia.org/wiki/LEB128#Unsigned_LEB128 +func varIntEncode(ui64 uint64) []byte { + if ui64 == 0 { + return []byte{0} + } + + length := int(common.BitWidth(ui64)+6) / 7 + data := make([]byte, length) + for i := 0; i < length; i++ { + data[i] = byte(ui64&0x7F) | 0x80 + ui64 >>= 7 + } + data[length-1] &= 0x7F + + return data +} diff --git a/pkg/s3select/internal/parquet-go/encoding/common_test.go b/pkg/s3select/internal/parquet-go/encoding/common_test.go new file mode 100644 index 000000000..f37bc7b35 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encoding/common_test.go @@ -0,0 +1,43 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package encoding + +import ( + "math" + "reflect" + "testing" +) + +func TestVarIntToBytes(t *testing.T) { + testCases := []struct { + ui64 uint64 + expectedResult []byte + }{ + {0, []byte{0}}, + {1, []byte{1}}, + {0x7F, []byte{127}}, + {0x80, []byte{128, 1}}, + {uint64(math.MaxUint64), []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 1}}, + } + + for i, testCase := range testCases { + result := varIntEncode(testCase.ui64) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} diff --git a/pkg/s3select/internal/parquet-go/encoding/delta-encode.go b/pkg/s3select/internal/parquet-go/encoding/delta-encode.go new file mode 100644 index 000000000..3d14318f8 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encoding/delta-encode.go @@ -0,0 +1,296 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package encoding + +import ( + "fmt" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/common" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +const ( + blockSize = 128 + miniBlockSize = 32 + miniBlockCount = blockSize / miniBlockSize +) + +var deltaEncodeHeaderBytes []byte + +func init() { + deltaEncodeHeaderBytes = varIntEncode(blockSize) + deltaEncodeHeaderBytes = append(deltaEncodeHeaderBytes, varIntEncode(miniBlockCount)...) +} + +// Supported Types: BOOLEAN, INT32, INT64 +func bitPackedEncode(values interface{}, bitWidth uint64, withHeader bool, parquetType parquet.Type) []byte { + var i64s []int64 + switch parquetType { + case parquet.Type_BOOLEAN: + bs, ok := values.([]bool) + if !ok { + panic(fmt.Errorf("expected slice of bool")) + } + + i64s = make([]int64, len(bs)) + for i := range bs { + if bs[i] { + i64s[i] = 1 + } + } + case parquet.Type_INT32: + i32s, ok := values.([]int32) + if !ok { + panic(fmt.Errorf("expected slice of int32")) + } + + for i := range i32s { + i64s[i] = int64(i32s[i]) + } + case parquet.Type_INT64: + var ok bool + i64s, ok = values.([]int64) + if !ok { + panic(fmt.Errorf("expected slice of int64")) + } + default: + panic(fmt.Errorf("%v parquet type unsupported", parquetType)) + } + + if len(i64s) == 0 { + return nil + } + + var valueByte byte + bitsSet := uint64(0) + bitsNeeded := uint64(8) + bitsToSet := bitWidth + value := i64s[0] + + valueBytes := []byte{} + for i := 0; i < len(i64s); { + if bitsToSet >= bitsNeeded { + valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded)) + valueBytes = append(valueBytes, valueByte) + bitsToSet -= bitsNeeded + bitsSet += bitsNeeded + + bitsNeeded = 8 + valueByte = 0 + + if bitsToSet <= 0 && (i+1) < len(i64s) { + i++ + value = i64s[i] + bitsToSet = bitWidth + bitsSet = 0 + } + } else { + valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded)) + i++ + + if i < len(i64s) { + value = i64s[i] + } + + bitsNeeded -= bitsToSet + bitsToSet = bitWidth + bitsSet = 0 + } + } + + if withHeader { + header := uint64(((len(i64s) / 8) << 1) | 1) + headerBytes := varIntEncode(header) + return append(headerBytes, valueBytes...) + } + + return valueBytes +} + +func deltaEncodeInt32s(i32s []int32) (data []byte) { + getValue := func(i32 int32) uint64 { + return uint64((i32 >> 31) ^ (i32 << 1)) + } + + data = append(data, deltaEncodeHeaderBytes...) + data = append(data, varIntEncode(uint64(len(i32s)))...) + data = append(data, varIntEncode(getValue(i32s[0]))...) + + for i := 1; i < len(i32s); { + block := []int32{} + minDelta := int32(0x7FFFFFFF) + + for ; i < len(i32s) && len(block) < blockSize; i++ { + delta := i32s[i] - i32s[i-1] + block = append(block, delta) + if delta < minDelta { + minDelta = delta + } + } + + for len(block) < blockSize { + block = append(block, minDelta) + } + + bitWidths := make([]byte, miniBlockCount) + for j := 0; j < miniBlockCount; j++ { + maxValue := int32(0) + for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ { + block[k] -= minDelta + if block[k] > maxValue { + maxValue = block[k] + } + } + + bitWidths[j] = byte(common.BitWidth(uint64(maxValue))) + } + + minDeltaZigZag := getValue(minDelta) + data = append(data, varIntEncode(minDeltaZigZag)...) + data = append(data, bitWidths...) + + for j := 0; j < miniBlockCount; j++ { + bitPacked := bitPackedEncode( + block[j*miniBlockSize:(j+1)*miniBlockSize], + uint64(bitWidths[j]), + false, + parquet.Type_INT32, + ) + data = append(data, bitPacked...) 
+		}
+	}
+
+	return data
+}
+
+func deltaEncodeInt64s(i64s []int64) (data []byte) {
+	getValue := func(i64 int64) uint64 {
+		return uint64((i64 >> 63) ^ (i64 << 1))
+	}
+
+	data = append(data, deltaEncodeHeaderBytes...)
+	data = append(data, varIntEncode(uint64(len(i64s)))...)
+	data = append(data, varIntEncode(getValue(i64s[0]))...)
+
+	for i := 1; i < len(i64s); {
+		block := []int64{}
+		minDelta := int64(0x7FFFFFFFFFFFFFFF)
+
+		for ; i < len(i64s) && len(block) < blockSize; i++ {
+			delta := i64s[i] - i64s[i-1]
+			block = append(block, delta)
+			if delta < minDelta {
+				minDelta = delta
+			}
+		}
+
+		for len(block) < blockSize {
+			block = append(block, minDelta)
+		}
+
+		bitWidths := make([]byte, miniBlockCount)
+		for j := 0; j < miniBlockCount; j++ {
+			maxValue := int64(0)
+			for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
+				block[k] -= minDelta
+				if block[k] > maxValue {
+					maxValue = block[k]
+				}
+			}
+
+			bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
+		}
+
+		minDeltaZigZag := getValue(minDelta)
+		data = append(data, varIntEncode(minDeltaZigZag)...)
+		data = append(data, bitWidths...)
+
+		for j := 0; j < miniBlockCount; j++ {
+			bitPacked := bitPackedEncode(
+				block[j*miniBlockSize:(j+1)*miniBlockSize],
+				uint64(bitWidths[j]),
+				false,
+				parquet.Type_INT64,
+			)
+			data = append(data, bitPacked...)
+		}
+	}
+
+	return data
+}
+
+// DeltaEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5
+//
+// Supported Types: INT32, INT64.
+func DeltaEncode(values interface{}, parquetType parquet.Type) []byte {
+	switch parquetType {
+	case parquet.Type_INT32:
+		i32s, ok := values.([]int32)
+		if !ok {
+			panic(fmt.Errorf("expected slice of int32"))
+		}
+		return deltaEncodeInt32s(i32s)
+	case parquet.Type_INT64:
+		i64s, ok := values.([]int64)
+		if !ok {
+			panic(fmt.Errorf("expected slice of int64"))
+		}
+		return deltaEncodeInt64s(i64s)
+	}
+
+	panic(fmt.Errorf("%v parquet type unsupported", parquetType))
+}
+
+// DeltaLengthByteArrayEncode encodes byte slices specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
+//
+// Supported Types: BYTE_ARRAY
+func DeltaLengthByteArrayEncode(bytesSlices [][]byte) (data []byte) {
+	lengths := make([]int32, len(bytesSlices))
+	for i, bytes := range bytesSlices {
+		lengths[i] = int32(len(bytes))
+	}
+
+	data = deltaEncodeInt32s(lengths)
+	for _, bytes := range bytesSlices {
+		data = append(data, bytes...)
+	}
+
+	return data
+}
+
+// DeltaByteArrayEncode encodes a sequence of string values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7
+//
+// Supported Types: BYTE_ARRAY
+func DeltaByteArrayEncode(bytesSlices [][]byte) (data []byte) {
+	prefixLengths := make([]int32, len(bytesSlices))
+	suffixes := make([][]byte, len(bytesSlices))
+
+	if len(bytesSlices) > 0 {
+		// The first value shares no prefix; its suffix is the whole value.
+		suffixes[0] = bytesSlices[0]
+	}
+
+	var i, j int
+	for i = 1; i < len(bytesSlices); i++ {
+		for j = 0; j < len(bytesSlices[i-1]) && j < len(bytesSlices[i]); j++ {
+			if bytesSlices[i-1][j] != bytesSlices[i][j] {
+				break
+			}
+		}
+
+		prefixLengths[i] = int32(j)
+		suffixes[i] = bytesSlices[i][j:]
+	}
+
+	data = deltaEncodeInt32s(prefixLengths)
+	return append(data, DeltaLengthByteArrayEncode(suffixes)...)
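+	// Worked example: inputs "hello", "help", "helm" produce
+	// prefixLengths {0, 3, 3} and suffixes {"hello", "p", "m"}; both
+	// sequences are then encoded by the two calls above.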
+} diff --git a/pkg/s3select/internal/parquet-go/encoding/plain-encode.go b/pkg/s3select/internal/parquet-go/encoding/plain-encode.go new file mode 100644 index 000000000..babeb47e7 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encoding/plain-encode.go @@ -0,0 +1,140 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package encoding + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func plainEncodeBools(bs []bool) []byte { + data := make([]byte, (len(bs)+7)/8) + + for i := range bs { + if bs[i] { + data[i/8] |= 1 << uint(i%8) + } + } + + return data +} + +func plainEncodeInt32s(i32s []int32) []byte { + data := make([]byte, len(i32s)*4) + + for i, i32 := range i32s { + binary.LittleEndian.PutUint32(data[i*4:], uint32(i32)) + } + + return data +} + +func plainEncodeInt64s(i64s []int64) []byte { + data := make([]byte, len(i64s)*8) + + for i, i64 := range i64s { + binary.LittleEndian.PutUint64(data[i*8:], uint64(i64)) + } + + return data +} + +func plainEncodeFloat32s(f32s []float32) []byte { + data := make([]byte, len(f32s)*4) + + for i, f32 := range f32s { + binary.LittleEndian.PutUint32(data[i*4:], math.Float32bits(f32)) + } + + return data +} + +func plainEncodeFloat64s(f64s []float64) []byte { + data := make([]byte, len(f64s)*8) + + for i, f64 := range f64s { + binary.LittleEndian.PutUint64(data[i*8:], math.Float64bits(f64)) + } + + return data +} + +func plainEncodeBytesSlices(bytesSlices [][]byte) []byte { + buf := new(bytes.Buffer) + + for _, s := range bytesSlices { + if err := binary.Write(buf, binary.LittleEndian, uint32(len(s))); err != nil { + panic(err) + } + + if _, err := buf.Write(s); err != nil { + panic(err) + } + } + + return buf.Bytes() +} + +// PlainEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 +// +// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY +func PlainEncode(values interface{}, parquetType parquet.Type) []byte { + switch parquetType { + case parquet.Type_BOOLEAN: + bs, ok := values.([]bool) + if !ok { + panic(fmt.Errorf("expected slice of bool")) + } + return plainEncodeBools(bs) + case parquet.Type_INT32: + i32s, ok := values.([]int32) + if !ok { + panic(fmt.Errorf("expected slice of int32")) + } + return plainEncodeInt32s(i32s) + case parquet.Type_INT64: + i64s, ok := values.([]int64) + if !ok { + panic(fmt.Errorf("expected slice of int64")) + } + return plainEncodeInt64s(i64s) + case parquet.Type_FLOAT: + f32s, ok := values.([]float32) + if !ok { + panic(fmt.Errorf("expected slice of float32")) + } + return plainEncodeFloat32s(f32s) + case parquet.Type_DOUBLE: + f64s, ok := values.([]float64) + if !ok { + panic(fmt.Errorf("expected slice of float64")) + } + return plainEncodeFloat64s(f64s) + case parquet.Type_BYTE_ARRAY: + bytesSlices, ok := values.([][]byte) + if !ok { + panic(fmt.Errorf("expected slice of 
byte array")) + } + return plainEncodeBytesSlices(bytesSlices) + } + + panic(fmt.Errorf("%v parquet type unsupported", parquetType)) +} diff --git a/pkg/s3select/internal/parquet-go/encoding/plain-encode_test.go b/pkg/s3select/internal/parquet-go/encoding/plain-encode_test.go new file mode 100644 index 000000000..f7c63b298 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encoding/plain-encode_test.go @@ -0,0 +1,147 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package encoding + +import ( + "math" + "reflect" + "testing" +) + +func TestPlainEncodeBools(t *testing.T) { + testCases := []struct { + bs []bool + expectedResult []byte + }{ + {nil, []byte{}}, + {[]bool{}, []byte{}}, + {[]bool{true}, []byte{1}}, + {[]bool{false}, []byte{0}}, + {[]bool{true, true}, []byte{3}}, + {[]bool{false, false}, []byte{0}}, + {[]bool{false, true}, []byte{2}}, + {[]bool{true, false}, []byte{1}}, + {[]bool{false, false, false, false, false, false, false, true, true}, []byte{128, 1}}, + } + + for i, testCase := range testCases { + result := plainEncodeBools(testCase.bs) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestPlainEncodeInt32s(t *testing.T) { + testCases := []struct { + i32s []int32 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]int32{}, []byte{}}, + {[]int32{1}, []byte{1, 0, 0, 0}}, + {[]int32{-1}, []byte{255, 255, 255, 255}}, + {[]int32{256}, []byte{0, 1, 0, 0}}, + {[]int32{math.MinInt32}, []byte{0, 0, 0, 128}}, + {[]int32{math.MaxInt32}, []byte{255, 255, 255, 127}}, + {[]int32{257, -2}, []byte{1, 1, 0, 0, 254, 255, 255, 255}}, + } + + for i, testCase := range testCases { + result := plainEncodeInt32s(testCase.i32s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestPlainEncodeInt64s(t *testing.T) { + testCases := []struct { + i64s []int64 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]int64{}, []byte{}}, + {[]int64{1}, []byte{1, 0, 0, 0, 0, 0, 0, 0}}, + {[]int64{-1}, []byte{255, 255, 255, 255, 255, 255, 255, 255}}, + {[]int64{256}, []byte{0, 1, 0, 0, 0, 0, 0, 0}}, + {[]int64{math.MinInt64}, []byte{0, 0, 0, 0, 0, 0, 0, 128}}, + {[]int64{math.MaxInt64}, []byte{255, 255, 255, 255, 255, 255, 255, 127}}, + {[]int64{257, -2}, []byte{1, 1, 0, 0, 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255}}, + } + + for i, testCase := range testCases { + result := plainEncodeInt64s(testCase.i64s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestPlainEncodeFloat32s(t *testing.T) { + testCases := []struct { + f32s []float32 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]float32{}, []byte{}}, + {[]float32{1}, []byte{0, 0, 128, 63}}, + {[]float32{1.0}, []byte{0, 0, 128, 63}}, + 
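+		// IEEE 754 single precision, little endian: -1.0 is 0xBF800000,
+		// i.e. bytes {0, 0, 128, 191}.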
{[]float32{-1}, []byte{0, 0, 128, 191}}, + {[]float32{-1.0}, []byte{0, 0, 128, 191}}, + {[]float32{256}, []byte{0, 0, 128, 67}}, + {[]float32{1.1}, []byte{205, 204, 140, 63}}, + {[]float32{-1.1}, []byte{205, 204, 140, 191}}, + {[]float32{math.Pi}, []byte{219, 15, 73, 64}}, + {[]float32{257, -2}, []byte{0, 128, 128, 67, 0, 0, 0, 192}}, + {[]float32{257.1, -2.1}, []byte{205, 140, 128, 67, 102, 102, 6, 192}}, + } + + for i, testCase := range testCases { + result := plainEncodeFloat32s(testCase.f32s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} + +func TestPlainEncodeFloat64s(t *testing.T) { + testCases := []struct { + f64s []float64 + expectedResult []byte + }{ + {nil, []byte{}}, + {[]float64{}, []byte{}}, + {[]float64{1}, []byte{0, 0, 0, 0, 0, 0, 240, 63}}, + {[]float64{1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 63}}, + {[]float64{-1}, []byte{0, 0, 0, 0, 0, 0, 240, 191}}, + {[]float64{-1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 191}}, + {[]float64{256}, []byte{0, 0, 0, 0, 0, 0, 112, 64}}, + {[]float64{1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 63}}, + {[]float64{-1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 191}}, + {[]float64{math.Pi}, []byte{24, 45, 68, 84, 251, 33, 9, 64}}, + {[]float64{257, -2}, []byte{0, 0, 0, 0, 0, 16, 112, 64, 0, 0, 0, 0, 0, 0, 0, 192}}, + {[]float64{257.1, -2.1}, []byte{154, 153, 153, 153, 153, 17, 112, 64, 205, 204, 204, 204, 204, 204, 0, 192}}, + } + + for i, testCase := range testCases { + result := plainEncodeFloat64s(testCase.f64s) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} diff --git a/pkg/s3select/internal/parquet-go/encoding/rle-encode.go b/pkg/s3select/internal/parquet-go/encoding/rle-encode.go new file mode 100644 index 000000000..518e11d1b --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encoding/rle-encode.go @@ -0,0 +1,84 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package encoding + +import ( + "fmt" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func rleEncodeInt32s(i32s []int32, bitWidth int32) (data []byte) { + j := 0 + for i := 0; i < len(i32s); i = j { + for j = i + 1; j < len(i32s) && i32s[i] == i32s[j]; j++ { + } + + headerBytes := varIntEncode(uint64((j - i) << 1)) + data = append(data, headerBytes...) + + valBytes := plainEncodeInt32s([]int32{i32s[i]}) + byteCount := (bitWidth + 7) / 8 + data = append(data, valBytes[:byteCount]...) + } + + return data +} + +func rleEncodeInt64s(i64s []int64, bitWidth int32) (data []byte) { + j := 0 + for i := 0; i < len(i64s); i = j { + for j = i + 1; j < len(i64s) && i64s[i] == i64s[j]; j++ { + } + + headerBytes := varIntEncode(uint64((j - i) << 1)) + data = append(data, headerBytes...) 
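+		// The varint header's low bit is 0, marking an RLE run; the
+		// upper bits hold the run length (j - i). The repeated value
+		// follows, truncated to (bitWidth+7)/8 bytes.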
+ + valBytes := plainEncodeInt64s([]int64{i64s[i]}) + byteCount := (bitWidth + 7) / 8 + data = append(data, valBytes[:byteCount]...) + } + + return data +} + +// RLEBitPackedHybridEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 +// +// Supported Types: INT32, INT64 +func RLEBitPackedHybridEncode(values interface{}, bitWidth int32, parquetType parquet.Type) []byte { + var rleBytes []byte + + switch parquetType { + case parquet.Type_INT32: + i32s, ok := values.([]int32) + if !ok { + panic(fmt.Errorf("expected slice of int32")) + } + rleBytes = rleEncodeInt32s(i32s, bitWidth) + case parquet.Type_INT64: + i64s, ok := values.([]int64) + if !ok { + panic(fmt.Errorf("expected slice of int64")) + } + rleBytes = rleEncodeInt64s(i64s, bitWidth) + default: + panic(fmt.Errorf("%v parquet type unsupported", parquetType)) + } + + lenBytes := plainEncodeInt32s([]int32{int32(len(rleBytes))}) + return append(lenBytes, rleBytes...) +} diff --git a/pkg/s3select/internal/parquet-go/encoding/rle-encode_test.go b/pkg/s3select/internal/parquet-go/encoding/rle-encode_test.go new file mode 100644 index 000000000..e0773c866 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encoding/rle-encode_test.go @@ -0,0 +1,44 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package encoding + +import ( + "reflect" + "testing" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func TestRLEEncodeInt32s(t *testing.T) { + testCases := []struct { + values []int32 + bitWidth int32 + dataType parquet.Type + expectedResult []byte + }{ + {[]int32{3, 5, 7}, 1, parquet.Type_INT32, []byte{2, 3, 2, 5, 2, 7}}, + {[]int32{3, 3, 3}, 1, parquet.Type_INT32, []byte{6, 3}}, + {[]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32, []byte{4, 2, 6, 3}}, + } + + for i, testCase := range testCases { + result := rleEncodeInt32s(testCase.values, testCase.bitWidth) + if !reflect.DeepEqual(result, testCase.expectedResult) { + t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result) + } + } +} diff --git a/pkg/s3select/internal/parquet-go/encoding/rledict-encode.go b/pkg/s3select/internal/parquet-go/encoding/rledict-encode.go new file mode 100644 index 000000000..3ed799ae4 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/encoding/rledict-encode.go @@ -0,0 +1,60 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
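A quick usage sketch for RLEBitPackedHybridEncode above. It is runnable only from within the minio/minio module, since the package sits under internal/; the expected bytes match the second and third test vectors.

package main

import (
	"fmt"

	"github.com/minio/minio/pkg/s3select/internal/parquet-go/encoding"
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)

func main() {
	// Two runs: 2 x value 2 (header 4) and 3 x value 3 (header 6),
	// framed by a 4-byte little-endian length prefix.
	data := encoding.RLEBitPackedHybridEncode([]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32)
	fmt.Println(data) // [4 0 0 0 4 2 6 3]
}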
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package encoding
+
+import (
+	"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
+	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
+)
+
+// RLEDictEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 and returns dictionary page data and data page data.
+//
+// Dictionary page data contains a PLAIN encoded slice of the unique, fully defined (non-nil) values.
+// Data page data contains RLE/Bit-Packed Hybrid encoded indices of fully defined non-nil values.
+//
+// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
+func RLEDictEncode(values []interface{}, parquetType parquet.Type, bitWidth int32) (dictPageData, dataPageData []byte, dictValueCount int32, indexBitWidth uint8) {
+	var definedValues []interface{}
+	var indices []int32
+
+	valueIndexMap := make(map[interface{}]int32)
+	for _, value := range values {
+		if value == nil {
+			continue
+		}
+
+		index, found := valueIndexMap[value]
+		if !found {
+			index = int32(len(definedValues))
+			definedValues = append(definedValues, value)
+			valueIndexMap[value] = index
+		}
+
+		indices = append(indices, index)
+	}
+
+	// The widest index is len(definedValues)-1; size the index bit width
+	// from it rather than from whichever index happens to come last.
+	indexBitWidth = uint8(common.BitWidth(uint64(len(definedValues) - 1)))
+
+	dictPageData = PlainEncode(common.ToSliceValue(definedValues, parquetType), parquetType)
+	dataPageData = RLEBitPackedHybridEncode(indices, int32(indexBitWidth), parquet.Type_INT32)
+
+	return dictPageData, dataPageData, int32(len(definedValues)), indexBitWidth
+}
diff --git a/pkg/s3select/internal/parquet-go/endian.go b/pkg/s3select/internal/parquet-go/endian.go
new file mode 100644
index 000000000..9506fb0d2
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/endian.go
@@ -0,0 +1,35 @@
+/*
+ * Minio Cloud Storage, (C) 2019 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
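And a sketch of RLEDictEncode above, which splits a column's values into the two page payloads; nil entries are skipped as nulls. Same caveat as before: the package is internal, so this only compiles inside the minio/minio module.

package main

import (
	"fmt"

	"github.com/minio/minio/pkg/s3select/internal/parquet-go/encoding"
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)

func main() {
	values := []interface{}{int64(10), int64(20), int64(10), nil, int64(20)}
	dictPage, dataPage, count, width := encoding.RLEDictEncode(values, parquet.Type_INT64, 0)
	fmt.Println(count, width)      // 2 distinct values, 1-bit indices
	fmt.Println(len(dictPage) > 0) // PLAIN-encoded {10, 20}
	fmt.Println(len(dataPage) > 0) // RLE-encoded indices {0, 1, 0, 1}
}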
+ */ + +package parquet + +import ( + "encoding/binary" +) + +func uint32ToBytes(v uint32) []byte { + buf := make([]byte, 4) + binary.LittleEndian.PutUint32(buf, v) + return buf +} + +func bytesToUint32(buf []byte) uint32 { + return binary.LittleEndian.Uint32(buf) +} + +func bytesToUint64(buf []byte) uint64 { + return binary.LittleEndian.Uint64(buf) +} diff --git a/pkg/s3select/internal/parquet-go/example.parquet b/pkg/s3select/internal/parquet-go/example.parquet new file mode 100644 index 000000000..05cd61aea Binary files /dev/null and b/pkg/s3select/internal/parquet-go/example.parquet differ diff --git a/pkg/s3select/internal/parquet-go/gen-go/parquet/GoUnusedProtection__.go b/pkg/s3select/internal/parquet-go/gen-go/parquet/GoUnusedProtection__.go new file mode 100644 index 000000000..7323152d4 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/gen-go/parquet/GoUnusedProtection__.go @@ -0,0 +1,6 @@ +// Autogenerated by Thrift Compiler (0.10.0) +// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING + +package parquet + +var GoUnusedProtection__ int diff --git a/pkg/s3select/internal/parquet-go/gen-go/parquet/parquet-consts.go b/pkg/s3select/internal/parquet-go/gen-go/parquet/parquet-consts.go new file mode 100644 index 000000000..95d12c627 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/gen-go/parquet/parquet-consts.go @@ -0,0 +1,18 @@ +// Autogenerated by Thrift Compiler (0.10.0) +// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING + +package parquet + +import ( + "bytes" + "fmt" + "git.apache.org/thrift.git/lib/go/thrift" +) + +// (needed to ensure safety because of naive import list construction.) +var _ = thrift.ZERO +var _ = fmt.Printf +var _ = bytes.Equal + +func init() { +} diff --git a/pkg/s3select/internal/parquet-go/gen-go/parquet/parquet.go b/pkg/s3select/internal/parquet-go/gen-go/parquet/parquet.go new file mode 100644 index 000000000..22711db46 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/gen-go/parquet/parquet.go @@ -0,0 +1,8191 @@ +// Autogenerated by Thrift Compiler (0.10.0) +// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING + +package parquet + +import ( + "bytes" + "database/sql/driver" + "errors" + "fmt" + "git.apache.org/thrift.git/lib/go/thrift" +) + +// (needed to ensure safety because of naive import list construction.) +var _ = thrift.ZERO +var _ = fmt.Printf +var _ = bytes.Equal + +//Types supported by Parquet. These types are intended to be used in combination +//with the encodings to control the on disk storage format. +//For example INT16 is not included as a type since a good encoding of INT32 +//would handle this. 
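Every enum in this generated file follows the same pattern: integer constants, String/FromString, text marshalling, and database/sql Scan/Value. A usage sketch against the Type enum defined next (the same calls work for ConvertedType, Encoding, and the rest):

package main

import (
	"fmt"

	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)

func main() {
	t, err := parquet.TypeFromString("INT64")
	if err != nil {
		panic(err)
	}
	fmt.Println(t == parquet.Type_INT64) // true

	text, _ := t.MarshalText()
	fmt.Println(string(text)) // INT64

	var u parquet.Type
	if err := u.Scan(int64(parquet.Type_DOUBLE)); err != nil {
		panic(err)
	}
	fmt.Println(u) // DOUBLE
}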
+type Type int64 + +const ( + Type_BOOLEAN Type = 0 + Type_INT32 Type = 1 + Type_INT64 Type = 2 + Type_INT96 Type = 3 + Type_FLOAT Type = 4 + Type_DOUBLE Type = 5 + Type_BYTE_ARRAY Type = 6 + Type_FIXED_LEN_BYTE_ARRAY Type = 7 +) + +func (p Type) String() string { + switch p { + case Type_BOOLEAN: + return "BOOLEAN" + case Type_INT32: + return "INT32" + case Type_INT64: + return "INT64" + case Type_INT96: + return "INT96" + case Type_FLOAT: + return "FLOAT" + case Type_DOUBLE: + return "DOUBLE" + case Type_BYTE_ARRAY: + return "BYTE_ARRAY" + case Type_FIXED_LEN_BYTE_ARRAY: + return "FIXED_LEN_BYTE_ARRAY" + } + return "" +} + +func TypeFromString(s string) (Type, error) { + switch s { + case "BOOLEAN": + return Type_BOOLEAN, nil + case "INT32": + return Type_INT32, nil + case "INT64": + return Type_INT64, nil + case "INT96": + return Type_INT96, nil + case "FLOAT": + return Type_FLOAT, nil + case "DOUBLE": + return Type_DOUBLE, nil + case "BYTE_ARRAY": + return Type_BYTE_ARRAY, nil + case "FIXED_LEN_BYTE_ARRAY": + return Type_FIXED_LEN_BYTE_ARRAY, nil + } + return Type(0), fmt.Errorf("not a valid Type string") +} + +func TypePtr(v Type) *Type { return &v } + +func (p Type) MarshalText() ([]byte, error) { + return []byte(p.String()), nil +} + +func (p *Type) UnmarshalText(text []byte) error { + q, err := TypeFromString(string(text)) + if err != nil { + return err + } + *p = q + return nil +} + +func (p *Type) Scan(value interface{}) error { + v, ok := value.(int64) + if !ok { + return errors.New("Scan value is not int64") + } + *p = Type(v) + return nil +} + +func (p *Type) Value() (driver.Value, error) { + if p == nil { + return nil, nil + } + return int64(*p), nil +} + +//Common types used by frameworks(e.g. hive, pig) using parquet. This helps map +//between types in those frameworks to the base types in parquet. This is only +//metadata and not needed to read or write the data. 
+type ConvertedType int64 + +const ( + ConvertedType_UTF8 ConvertedType = 0 + ConvertedType_MAP ConvertedType = 1 + ConvertedType_MAP_KEY_VALUE ConvertedType = 2 + ConvertedType_LIST ConvertedType = 3 + ConvertedType_ENUM ConvertedType = 4 + ConvertedType_DECIMAL ConvertedType = 5 + ConvertedType_DATE ConvertedType = 6 + ConvertedType_TIME_MILLIS ConvertedType = 7 + ConvertedType_TIME_MICROS ConvertedType = 8 + ConvertedType_TIMESTAMP_MILLIS ConvertedType = 9 + ConvertedType_TIMESTAMP_MICROS ConvertedType = 10 + ConvertedType_UINT_8 ConvertedType = 11 + ConvertedType_UINT_16 ConvertedType = 12 + ConvertedType_UINT_32 ConvertedType = 13 + ConvertedType_UINT_64 ConvertedType = 14 + ConvertedType_INT_8 ConvertedType = 15 + ConvertedType_INT_16 ConvertedType = 16 + ConvertedType_INT_32 ConvertedType = 17 + ConvertedType_INT_64 ConvertedType = 18 + ConvertedType_JSON ConvertedType = 19 + ConvertedType_BSON ConvertedType = 20 + ConvertedType_INTERVAL ConvertedType = 21 +) + +func (p ConvertedType) String() string { + switch p { + case ConvertedType_UTF8: + return "UTF8" + case ConvertedType_MAP: + return "MAP" + case ConvertedType_MAP_KEY_VALUE: + return "MAP_KEY_VALUE" + case ConvertedType_LIST: + return "LIST" + case ConvertedType_ENUM: + return "ENUM" + case ConvertedType_DECIMAL: + return "DECIMAL" + case ConvertedType_DATE: + return "DATE" + case ConvertedType_TIME_MILLIS: + return "TIME_MILLIS" + case ConvertedType_TIME_MICROS: + return "TIME_MICROS" + case ConvertedType_TIMESTAMP_MILLIS: + return "TIMESTAMP_MILLIS" + case ConvertedType_TIMESTAMP_MICROS: + return "TIMESTAMP_MICROS" + case ConvertedType_UINT_8: + return "UINT_8" + case ConvertedType_UINT_16: + return "UINT_16" + case ConvertedType_UINT_32: + return "UINT_32" + case ConvertedType_UINT_64: + return "UINT_64" + case ConvertedType_INT_8: + return "INT_8" + case ConvertedType_INT_16: + return "INT_16" + case ConvertedType_INT_32: + return "INT_32" + case ConvertedType_INT_64: + return "INT_64" + case ConvertedType_JSON: + return "JSON" + case ConvertedType_BSON: + return "BSON" + case ConvertedType_INTERVAL: + return "INTERVAL" + } + return "" +} + +func ConvertedTypeFromString(s string) (ConvertedType, error) { + switch s { + case "UTF8": + return ConvertedType_UTF8, nil + case "MAP": + return ConvertedType_MAP, nil + case "MAP_KEY_VALUE": + return ConvertedType_MAP_KEY_VALUE, nil + case "LIST": + return ConvertedType_LIST, nil + case "ENUM": + return ConvertedType_ENUM, nil + case "DECIMAL": + return ConvertedType_DECIMAL, nil + case "DATE": + return ConvertedType_DATE, nil + case "TIME_MILLIS": + return ConvertedType_TIME_MILLIS, nil + case "TIME_MICROS": + return ConvertedType_TIME_MICROS, nil + case "TIMESTAMP_MILLIS": + return ConvertedType_TIMESTAMP_MILLIS, nil + case "TIMESTAMP_MICROS": + return ConvertedType_TIMESTAMP_MICROS, nil + case "UINT_8": + return ConvertedType_UINT_8, nil + case "UINT_16": + return ConvertedType_UINT_16, nil + case "UINT_32": + return ConvertedType_UINT_32, nil + case "UINT_64": + return ConvertedType_UINT_64, nil + case "INT_8": + return ConvertedType_INT_8, nil + case "INT_16": + return ConvertedType_INT_16, nil + case "INT_32": + return ConvertedType_INT_32, nil + case "INT_64": + return ConvertedType_INT_64, nil + case "JSON": + return ConvertedType_JSON, nil + case "BSON": + return ConvertedType_BSON, nil + case "INTERVAL": + return ConvertedType_INTERVAL, nil + } + return ConvertedType(0), fmt.Errorf("not a valid ConvertedType string") +} + +func ConvertedTypePtr(v ConvertedType) 
*ConvertedType { return &v } + +func (p ConvertedType) MarshalText() ([]byte, error) { + return []byte(p.String()), nil +} + +func (p *ConvertedType) UnmarshalText(text []byte) error { + q, err := ConvertedTypeFromString(string(text)) + if err != nil { + return err + } + *p = q + return nil +} + +func (p *ConvertedType) Scan(value interface{}) error { + v, ok := value.(int64) + if !ok { + return errors.New("Scan value is not int64") + } + *p = ConvertedType(v) + return nil +} + +func (p *ConvertedType) Value() (driver.Value, error) { + if p == nil { + return nil, nil + } + return int64(*p), nil +} + +//Representation of Schemas +type FieldRepetitionType int64 + +const ( + FieldRepetitionType_REQUIRED FieldRepetitionType = 0 + FieldRepetitionType_OPTIONAL FieldRepetitionType = 1 + FieldRepetitionType_REPEATED FieldRepetitionType = 2 +) + +func (p FieldRepetitionType) String() string { + switch p { + case FieldRepetitionType_REQUIRED: + return "REQUIRED" + case FieldRepetitionType_OPTIONAL: + return "OPTIONAL" + case FieldRepetitionType_REPEATED: + return "REPEATED" + } + return "" +} + +func FieldRepetitionTypeFromString(s string) (FieldRepetitionType, error) { + switch s { + case "REQUIRED": + return FieldRepetitionType_REQUIRED, nil + case "OPTIONAL": + return FieldRepetitionType_OPTIONAL, nil + case "REPEATED": + return FieldRepetitionType_REPEATED, nil + } + return FieldRepetitionType(0), fmt.Errorf("not a valid FieldRepetitionType string") +} + +func FieldRepetitionTypePtr(v FieldRepetitionType) *FieldRepetitionType { return &v } + +func (p FieldRepetitionType) MarshalText() ([]byte, error) { + return []byte(p.String()), nil +} + +func (p *FieldRepetitionType) UnmarshalText(text []byte) error { + q, err := FieldRepetitionTypeFromString(string(text)) + if err != nil { + return err + } + *p = q + return nil +} + +func (p *FieldRepetitionType) Scan(value interface{}) error { + v, ok := value.(int64) + if !ok { + return errors.New("Scan value is not int64") + } + *p = FieldRepetitionType(v) + return nil +} + +func (p *FieldRepetitionType) Value() (driver.Value, error) { + if p == nil { + return nil, nil + } + return int64(*p), nil +} + +//Encodings supported by Parquet. Not all encodings are valid for all types. These +//enums are also used to specify the encoding of definition and repetition levels. +//See the accompanying doc for the details of the more complicated encodings. 
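The Encoding values defined next line up with the encoders added earlier in this diff. A hypothetical dispatch helper makes the mapping explicit; encodeWith and the 32-bit width passed to the RLE case are illustrative assumptions, not part of this change.

package main

import (
	"fmt"

	"github.com/minio/minio/pkg/s3select/internal/parquet-go/encoding"
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)

// encodeWith is a hypothetical helper tying Encoding values to the
// functions the encoding package introduces.
func encodeWith(e parquet.Encoding, values interface{}, t parquet.Type) []byte {
	switch e {
	case parquet.Encoding_PLAIN:
		return encoding.PlainEncode(values, t)
	case parquet.Encoding_DELTA_BINARY_PACKED:
		return encoding.DeltaEncode(values, t)
	case parquet.Encoding_RLE:
		// 32 bits per value is an illustrative choice.
		return encoding.RLEBitPackedHybridEncode(values, 32, t)
	default:
		panic(fmt.Errorf("encoding %v not wired up in this sketch", e))
	}
}

func main() {
	fmt.Println(encodeWith(parquet.Encoding_PLAIN, []int32{1, 2}, parquet.Type_INT32))
	// [1 0 0 0 2 0 0 0]
}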
+type Encoding int64 + +const ( + Encoding_PLAIN Encoding = 0 + Encoding_PLAIN_DICTIONARY Encoding = 2 + Encoding_RLE Encoding = 3 + Encoding_BIT_PACKED Encoding = 4 + Encoding_DELTA_BINARY_PACKED Encoding = 5 + Encoding_DELTA_LENGTH_BYTE_ARRAY Encoding = 6 + Encoding_DELTA_BYTE_ARRAY Encoding = 7 + Encoding_RLE_DICTIONARY Encoding = 8 +) + +func (p Encoding) String() string { + switch p { + case Encoding_PLAIN: + return "PLAIN" + case Encoding_PLAIN_DICTIONARY: + return "PLAIN_DICTIONARY" + case Encoding_RLE: + return "RLE" + case Encoding_BIT_PACKED: + return "BIT_PACKED" + case Encoding_DELTA_BINARY_PACKED: + return "DELTA_BINARY_PACKED" + case Encoding_DELTA_LENGTH_BYTE_ARRAY: + return "DELTA_LENGTH_BYTE_ARRAY" + case Encoding_DELTA_BYTE_ARRAY: + return "DELTA_BYTE_ARRAY" + case Encoding_RLE_DICTIONARY: + return "RLE_DICTIONARY" + } + return "" +} + +func EncodingFromString(s string) (Encoding, error) { + switch s { + case "PLAIN": + return Encoding_PLAIN, nil + case "PLAIN_DICTIONARY": + return Encoding_PLAIN_DICTIONARY, nil + case "RLE": + return Encoding_RLE, nil + case "BIT_PACKED": + return Encoding_BIT_PACKED, nil + case "DELTA_BINARY_PACKED": + return Encoding_DELTA_BINARY_PACKED, nil + case "DELTA_LENGTH_BYTE_ARRAY": + return Encoding_DELTA_LENGTH_BYTE_ARRAY, nil + case "DELTA_BYTE_ARRAY": + return Encoding_DELTA_BYTE_ARRAY, nil + case "RLE_DICTIONARY": + return Encoding_RLE_DICTIONARY, nil + } + return Encoding(0), fmt.Errorf("not a valid Encoding string") +} + +func EncodingPtr(v Encoding) *Encoding { return &v } + +func (p Encoding) MarshalText() ([]byte, error) { + return []byte(p.String()), nil +} + +func (p *Encoding) UnmarshalText(text []byte) error { + q, err := EncodingFromString(string(text)) + if err != nil { + return err + } + *p = q + return nil +} + +func (p *Encoding) Scan(value interface{}) error { + v, ok := value.(int64) + if !ok { + return errors.New("Scan value is not int64") + } + *p = Encoding(v) + return nil +} + +func (p *Encoding) Value() (driver.Value, error) { + if p == nil { + return nil, nil + } + return int64(*p), nil +} + +//Supported compression algorithms. +// +//Codecs added in 2.4 can be read by readers based on 2.4 and later. +//Codec support may vary between readers based on the format version and +//libraries available at runtime. Gzip, Snappy, and LZ4 codecs are +//widely available, while Zstd and Brotli require additional libraries. 
+type CompressionCodec int64 + +const ( + CompressionCodec_UNCOMPRESSED CompressionCodec = 0 + CompressionCodec_SNAPPY CompressionCodec = 1 + CompressionCodec_GZIP CompressionCodec = 2 + CompressionCodec_LZO CompressionCodec = 3 + CompressionCodec_BROTLI CompressionCodec = 4 + CompressionCodec_LZ4 CompressionCodec = 5 + CompressionCodec_ZSTD CompressionCodec = 6 +) + +func (p CompressionCodec) String() string { + switch p { + case CompressionCodec_UNCOMPRESSED: + return "UNCOMPRESSED" + case CompressionCodec_SNAPPY: + return "SNAPPY" + case CompressionCodec_GZIP: + return "GZIP" + case CompressionCodec_LZO: + return "LZO" + case CompressionCodec_BROTLI: + return "BROTLI" + case CompressionCodec_LZ4: + return "LZ4" + case CompressionCodec_ZSTD: + return "ZSTD" + } + return "" +} + +func CompressionCodecFromString(s string) (CompressionCodec, error) { + switch s { + case "UNCOMPRESSED": + return CompressionCodec_UNCOMPRESSED, nil + case "SNAPPY": + return CompressionCodec_SNAPPY, nil + case "GZIP": + return CompressionCodec_GZIP, nil + case "LZO": + return CompressionCodec_LZO, nil + case "BROTLI": + return CompressionCodec_BROTLI, nil + case "LZ4": + return CompressionCodec_LZ4, nil + case "ZSTD": + return CompressionCodec_ZSTD, nil + } + return CompressionCodec(0), fmt.Errorf("not a valid CompressionCodec string") +} + +func CompressionCodecPtr(v CompressionCodec) *CompressionCodec { return &v } + +func (p CompressionCodec) MarshalText() ([]byte, error) { + return []byte(p.String()), nil +} + +func (p *CompressionCodec) UnmarshalText(text []byte) error { + q, err := CompressionCodecFromString(string(text)) + if err != nil { + return err + } + *p = q + return nil +} + +func (p *CompressionCodec) Scan(value interface{}) error { + v, ok := value.(int64) + if !ok { + return errors.New("Scan value is not int64") + } + *p = CompressionCodec(v) + return nil +} + +func (p *CompressionCodec) Value() (driver.Value, error) { + if p == nil { + return nil, nil + } + return int64(*p), nil +} + +type PageType int64 + +const ( + PageType_DATA_PAGE PageType = 0 + PageType_INDEX_PAGE PageType = 1 + PageType_DICTIONARY_PAGE PageType = 2 + PageType_DATA_PAGE_V2 PageType = 3 +) + +func (p PageType) String() string { + switch p { + case PageType_DATA_PAGE: + return "DATA_PAGE" + case PageType_INDEX_PAGE: + return "INDEX_PAGE" + case PageType_DICTIONARY_PAGE: + return "DICTIONARY_PAGE" + case PageType_DATA_PAGE_V2: + return "DATA_PAGE_V2" + } + return "" +} + +func PageTypeFromString(s string) (PageType, error) { + switch s { + case "DATA_PAGE": + return PageType_DATA_PAGE, nil + case "INDEX_PAGE": + return PageType_INDEX_PAGE, nil + case "DICTIONARY_PAGE": + return PageType_DICTIONARY_PAGE, nil + case "DATA_PAGE_V2": + return PageType_DATA_PAGE_V2, nil + } + return PageType(0), fmt.Errorf("not a valid PageType string") +} + +func PageTypePtr(v PageType) *PageType { return &v } + +func (p PageType) MarshalText() ([]byte, error) { + return []byte(p.String()), nil +} + +func (p *PageType) UnmarshalText(text []byte) error { + q, err := PageTypeFromString(string(text)) + if err != nil { + return err + } + *p = q + return nil +} + +func (p *PageType) Scan(value interface{}) error { + v, ok := value.(int64) + if !ok { + return errors.New("Scan value is not int64") + } + *p = PageType(v) + return nil +} + +func (p *PageType) Value() (driver.Value, error) { + if p == nil { + return nil, nil + } + return int64(*p), nil +} + +//Enum to annotate whether lists of min/max elements inside ColumnIndex +//are ordered and if so, 
in which direction. +type BoundaryOrder int64 + +const ( + BoundaryOrder_UNORDERED BoundaryOrder = 0 + BoundaryOrder_ASCENDING BoundaryOrder = 1 + BoundaryOrder_DESCENDING BoundaryOrder = 2 +) + +func (p BoundaryOrder) String() string { + switch p { + case BoundaryOrder_UNORDERED: + return "UNORDERED" + case BoundaryOrder_ASCENDING: + return "ASCENDING" + case BoundaryOrder_DESCENDING: + return "DESCENDING" + } + return "" +} + +func BoundaryOrderFromString(s string) (BoundaryOrder, error) { + switch s { + case "UNORDERED": + return BoundaryOrder_UNORDERED, nil + case "ASCENDING": + return BoundaryOrder_ASCENDING, nil + case "DESCENDING": + return BoundaryOrder_DESCENDING, nil + } + return BoundaryOrder(0), fmt.Errorf("not a valid BoundaryOrder string") +} + +func BoundaryOrderPtr(v BoundaryOrder) *BoundaryOrder { return &v } + +func (p BoundaryOrder) MarshalText() ([]byte, error) { + return []byte(p.String()), nil +} + +func (p *BoundaryOrder) UnmarshalText(text []byte) error { + q, err := BoundaryOrderFromString(string(text)) + if err != nil { + return err + } + *p = q + return nil +} + +func (p *BoundaryOrder) Scan(value interface{}) error { + v, ok := value.(int64) + if !ok { + return errors.New("Scan value is not int64") + } + *p = BoundaryOrder(v) + return nil +} + +func (p *BoundaryOrder) Value() (driver.Value, error) { + if p == nil { + return nil, nil + } + return int64(*p), nil +} + +// Statistics per row group and per page +// All fields are optional. +// +// Attributes: +// - Max: DEPRECATED: min and max value of the column. Use min_value and max_value. +// +// Values are encoded using PLAIN encoding, except that variable-length byte +// arrays do not include a length prefix. +// +// These fields encode min and max values determined by signed comparison +// only. New files should use the correct order for a column's logical type +// and store the values in the min_value and max_value fields. +// +// To support older readers, these may be set when the column order is +// signed. +// - Min +// - NullCount: count of null value in the column +// - DistinctCount: count of distinct values occurring +// - MaxValue: Min and max values for the column, determined by its ColumnOrder. +// +// Values are encoded using PLAIN encoding, except that variable-length byte +// arrays do not include a length prefix. 
+// - MinValue +type Statistics struct { + Max []byte `thrift:"max,1" db:"max" json:"max,omitempty"` + Min []byte `thrift:"min,2" db:"min" json:"min,omitempty"` + NullCount *int64 `thrift:"null_count,3" db:"null_count" json:"null_count,omitempty"` + DistinctCount *int64 `thrift:"distinct_count,4" db:"distinct_count" json:"distinct_count,omitempty"` + MaxValue []byte `thrift:"max_value,5" db:"max_value" json:"max_value,omitempty"` + MinValue []byte `thrift:"min_value,6" db:"min_value" json:"min_value,omitempty"` +} + +func NewStatistics() *Statistics { + return &Statistics{} +} + +var Statistics_Max_DEFAULT []byte + +func (p *Statistics) GetMax() []byte { + return p.Max +} + +var Statistics_Min_DEFAULT []byte + +func (p *Statistics) GetMin() []byte { + return p.Min +} + +var Statistics_NullCount_DEFAULT int64 + +func (p *Statistics) GetNullCount() int64 { + if !p.IsSetNullCount() { + return Statistics_NullCount_DEFAULT + } + return *p.NullCount +} + +var Statistics_DistinctCount_DEFAULT int64 + +func (p *Statistics) GetDistinctCount() int64 { + if !p.IsSetDistinctCount() { + return Statistics_DistinctCount_DEFAULT + } + return *p.DistinctCount +} + +var Statistics_MaxValue_DEFAULT []byte + +func (p *Statistics) GetMaxValue() []byte { + return p.MaxValue +} + +var Statistics_MinValue_DEFAULT []byte + +func (p *Statistics) GetMinValue() []byte { + return p.MinValue +} +func (p *Statistics) IsSetMax() bool { + return p.Max != nil +} + +func (p *Statistics) IsSetMin() bool { + return p.Min != nil +} + +func (p *Statistics) IsSetNullCount() bool { + return p.NullCount != nil +} + +func (p *Statistics) IsSetDistinctCount() bool { + return p.DistinctCount != nil +} + +func (p *Statistics) IsSetMaxValue() bool { + return p.MaxValue != nil +} + +func (p *Statistics) IsSetMinValue() bool { + return p.MinValue != nil +} + +func (p *Statistics) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + case 6: + if err := p.ReadField6(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *Statistics) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBinary(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.Max = v + } + return nil +} + +func (p *Statistics) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBinary(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.Min = v + } + return nil +} + +func (p *Statistics) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { 
+ return thrift.PrependError("error reading field 3: ", err) + } else { + p.NullCount = &v + } + return nil +} + +func (p *Statistics) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + p.DistinctCount = &v + } + return nil +} + +func (p *Statistics) ReadField5(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBinary(); err != nil { + return thrift.PrependError("error reading field 5: ", err) + } else { + p.MaxValue = v + } + return nil +} + +func (p *Statistics) ReadField6(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBinary(); err != nil { + return thrift.PrependError("error reading field 6: ", err) + } else { + p.MinValue = v + } + return nil +} + +func (p *Statistics) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("Statistics"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *Statistics) writeField1(oprot thrift.TProtocol) (err error) { + if p.IsSetMax() { + if err := oprot.WriteFieldBegin("max", thrift.STRING, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:max: ", p), err) + } + if err := oprot.WriteBinary(p.Max); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.max (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:max: ", p), err) + } + } + return err +} + +func (p *Statistics) writeField2(oprot thrift.TProtocol) (err error) { + if p.IsSetMin() { + if err := oprot.WriteFieldBegin("min", thrift.STRING, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:min: ", p), err) + } + if err := oprot.WriteBinary(p.Min); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.min (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:min: ", p), err) + } + } + return err +} + +func (p *Statistics) writeField3(oprot thrift.TProtocol) (err error) { + if p.IsSetNullCount() { + if err := oprot.WriteFieldBegin("null_count", thrift.I64, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:null_count: ", p), err) + } + if err := oprot.WriteI64(int64(*p.NullCount)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.null_count (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:null_count: ", p), err) + } + } + return err +} + +func (p *Statistics) writeField4(oprot thrift.TProtocol) (err error) { + if p.IsSetDistinctCount() { + if err := oprot.WriteFieldBegin("distinct_count", thrift.I64, 4); err != nil { + return 
thrift.PrependError(fmt.Sprintf("%T write field begin error 4:distinct_count: ", p), err) + } + if err := oprot.WriteI64(int64(*p.DistinctCount)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.distinct_count (4) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:distinct_count: ", p), err) + } + } + return err +} + +func (p *Statistics) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetMaxValue() { + if err := oprot.WriteFieldBegin("max_value", thrift.STRING, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:max_value: ", p), err) + } + if err := oprot.WriteBinary(p.MaxValue); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.max_value (5) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:max_value: ", p), err) + } + } + return err +} + +func (p *Statistics) writeField6(oprot thrift.TProtocol) (err error) { + if p.IsSetMinValue() { + if err := oprot.WriteFieldBegin("min_value", thrift.STRING, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:min_value: ", p), err) + } + if err := oprot.WriteBinary(p.MinValue); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.min_value (6) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:min_value: ", p), err) + } + } + return err +} + +func (p *Statistics) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("Statistics(%+v)", *p) +} + +// Empty structs to use as logical type annotations +type StringType struct { +} + +func NewStringType() *StringType { + return &StringType{} +} + +func (p *StringType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *StringType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("StringType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *StringType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("StringType(%+v)", *p) +} + +type UUIDType struct { +} + +func NewUUIDType() *UUIDType { + return &UUIDType{} +} + +func (p *UUIDType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return 
thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *UUIDType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("UUIDType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *UUIDType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("UUIDType(%+v)", *p) +} + +type MapType struct { +} + +func NewMapType() *MapType { + return &MapType{} +} + +func (p *MapType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *MapType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("MapType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *MapType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("MapType(%+v)", *p) +} + +type ListType struct { +} + +func NewListType() *ListType { + return &ListType{} +} + +func (p *ListType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *ListType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("ListType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return 
thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *ListType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("ListType(%+v)", *p) +} + +type EnumType struct { +} + +func NewEnumType() *EnumType { + return &EnumType{} +} + +func (p *EnumType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *EnumType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("EnumType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *EnumType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("EnumType(%+v)", *p) +} + +type DateType struct { +} + +func NewDateType() *DateType { + return &DateType{} +} + +func (p *DateType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *DateType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("DateType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *DateType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("DateType(%+v)", *p) +} + +// Logical type to annotate a column that is always null. +// +// Sometimes when discovering the schema of existing data, values are always +// null and the physical type can't be determined. This annotation signals +// the case where the physical type was guessed from all null values. 
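A little further down, DecimalType annotates a physical INT32, INT64, FIXED, or BINARY column with scale and precision; the column stores unscaled integers. A sketch of the interpretation, not code from this diff:

package main

import (
	"fmt"
	"math"
)

func main() {
	// A DECIMAL(precision=9, scale=2) column stores unscaled integers:
	// 12345 with scale 2 represents 123.45.
	unscaled := int64(12345)
	scale := 2
	fmt.Println(float64(unscaled) / math.Pow10(scale)) // 123.45
}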
+type NullType struct { +} + +func NewNullType() *NullType { + return &NullType{} +} + +func (p *NullType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *NullType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("NullType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *NullType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("NullType(%+v)", *p) +} + +// Decimal logical type annotation +// +// To maintain forward-compatibility in v1, implementations using this logical +// type must also set scale and precision on the annotated SchemaElement. +// +// Allowed for physical types: INT32, INT64, FIXED, and BINARY +// +// Attributes: +// - Scale +// - Precision +type DecimalType struct { + Scale int32 `thrift:"scale,1,required" db:"scale" json:"scale"` + Precision int32 `thrift:"precision,2,required" db:"precision" json:"precision"` +} + +func NewDecimalType() *DecimalType { + return &DecimalType{} +} + +func (p *DecimalType) GetScale() int32 { + return p.Scale +} + +func (p *DecimalType) GetPrecision() int32 { + return p.Precision +} +func (p *DecimalType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetScale bool = false + var issetPrecision bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetScale = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetPrecision = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetScale { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Scale is not set")) + } + if !issetPrecision { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Precision is not set")) + } + return nil +} + +func (p *DecimalType) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.Scale = v + } + return nil +} + +func (p 
*DecimalType) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.Precision = v + } + return nil +} + +func (p *DecimalType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("DecimalType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *DecimalType) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("scale", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:scale: ", p), err) + } + if err := oprot.WriteI32(int32(p.Scale)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.scale (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:scale: ", p), err) + } + return err +} + +func (p *DecimalType) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("precision", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:precision: ", p), err) + } + if err := oprot.WriteI32(int32(p.Precision)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.precision (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:precision: ", p), err) + } + return err +} + +func (p *DecimalType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("DecimalType(%+v)", *p) +} + +// Time units for logical types +type MilliSeconds struct { +} + +func NewMilliSeconds() *MilliSeconds { + return &MilliSeconds{} +} + +func (p *MilliSeconds) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *MilliSeconds) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("MilliSeconds"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *MilliSeconds) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("MilliSeconds(%+v)", *p) +} + +type MicroSeconds struct { +} + +func NewMicroSeconds() 
*MicroSeconds { + return &MicroSeconds{} +} + +func (p *MicroSeconds) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *MicroSeconds) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("MicroSeconds"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *MicroSeconds) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("MicroSeconds(%+v)", *p) +} + +type NanoSeconds struct { +} + +func NewNanoSeconds() *NanoSeconds { + return &NanoSeconds{} +} + +func (p *NanoSeconds) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *NanoSeconds) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("NanoSeconds"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *NanoSeconds) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("NanoSeconds(%+v)", *p) +} + +// Attributes: +// - MILLIS +// - MICROS +// - NANOS +type TimeUnit struct { + MILLIS *MilliSeconds `thrift:"MILLIS,1" db:"MILLIS" json:"MILLIS,omitempty"` + MICROS *MicroSeconds `thrift:"MICROS,2" db:"MICROS" json:"MICROS,omitempty"` + NANOS *NanoSeconds `thrift:"NANOS,3" db:"NANOS" json:"NANOS,omitempty"` +} + +func NewTimeUnit() *TimeUnit { + return &TimeUnit{} +} + +var TimeUnit_MILLIS_DEFAULT *MilliSeconds + +func (p *TimeUnit) GetMILLIS() *MilliSeconds { + if !p.IsSetMILLIS() { + return TimeUnit_MILLIS_DEFAULT + } + return p.MILLIS +} + +var TimeUnit_MICROS_DEFAULT *MicroSeconds + +func (p *TimeUnit) GetMICROS() *MicroSeconds { + if !p.IsSetMICROS() { + return TimeUnit_MICROS_DEFAULT + } + return p.MICROS +} + +var TimeUnit_NANOS_DEFAULT *NanoSeconds + +func (p *TimeUnit) GetNANOS() 
*NanoSeconds { + if !p.IsSetNANOS() { + return TimeUnit_NANOS_DEFAULT + } + return p.NANOS +} +func (p *TimeUnit) CountSetFieldsTimeUnit() int { + count := 0 + if p.IsSetMILLIS() { + count++ + } + if p.IsSetMICROS() { + count++ + } + if p.IsSetNANOS() { + count++ + } + return count + +} + +func (p *TimeUnit) IsSetMILLIS() bool { + return p.MILLIS != nil +} + +func (p *TimeUnit) IsSetMICROS() bool { + return p.MICROS != nil +} + +func (p *TimeUnit) IsSetNANOS() bool { + return p.NANOS != nil +} + +func (p *TimeUnit) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *TimeUnit) ReadField1(iprot thrift.TProtocol) error { + p.MILLIS = &MilliSeconds{} + if err := p.MILLIS.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.MILLIS), err) + } + return nil +} + +func (p *TimeUnit) ReadField2(iprot thrift.TProtocol) error { + p.MICROS = &MicroSeconds{} + if err := p.MICROS.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.MICROS), err) + } + return nil +} + +func (p *TimeUnit) ReadField3(iprot thrift.TProtocol) error { + p.NANOS = &NanoSeconds{} + if err := p.NANOS.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.NANOS), err) + } + return nil +} + +func (p *TimeUnit) Write(oprot thrift.TProtocol) error { + if c := p.CountSetFieldsTimeUnit(); c != 1 { + return fmt.Errorf("%T write union: exactly one field must be set (%d set).", p, c) + } + if err := oprot.WriteStructBegin("TimeUnit"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *TimeUnit) writeField1(oprot thrift.TProtocol) (err error) { + if p.IsSetMILLIS() { + if err := oprot.WriteFieldBegin("MILLIS", thrift.STRUCT, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:MILLIS: ", p), err) + } + if err := p.MILLIS.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.MILLIS), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:MILLIS: ", p), err) + } + } + return err +} + +func 
(p *TimeUnit) writeField2(oprot thrift.TProtocol) (err error) { + if p.IsSetMICROS() { + if err := oprot.WriteFieldBegin("MICROS", thrift.STRUCT, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:MICROS: ", p), err) + } + if err := p.MICROS.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.MICROS), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:MICROS: ", p), err) + } + } + return err +} + +func (p *TimeUnit) writeField3(oprot thrift.TProtocol) (err error) { + if p.IsSetNANOS() { + if err := oprot.WriteFieldBegin("NANOS", thrift.STRUCT, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:NANOS: ", p), err) + } + if err := p.NANOS.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.NANOS), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:NANOS: ", p), err) + } + } + return err +} + +func (p *TimeUnit) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("TimeUnit(%+v)", *p) +} + +// Timestamp logical type annotation +// +// Allowed for physical types: INT64 +// +// Attributes: +// - IsAdjustedToUTC +// - Unit +type TimestampType struct { + IsAdjustedToUTC bool `thrift:"isAdjustedToUTC,1,required" db:"isAdjustedToUTC" json:"isAdjustedToUTC"` + Unit *TimeUnit `thrift:"unit,2,required" db:"unit" json:"unit"` +} + +func NewTimestampType() *TimestampType { + return &TimestampType{} +} + +func (p *TimestampType) GetIsAdjustedToUTC() bool { + return p.IsAdjustedToUTC +} + +var TimestampType_Unit_DEFAULT *TimeUnit + +func (p *TimestampType) GetUnit() *TimeUnit { + if !p.IsSetUnit() { + return TimestampType_Unit_DEFAULT + } + return p.Unit +} +func (p *TimestampType) IsSetUnit() bool { + return p.Unit != nil +} + +func (p *TimestampType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetIsAdjustedToUTC bool = false + var issetUnit bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetIsAdjustedToUTC = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetUnit = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetIsAdjustedToUTC { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field IsAdjustedToUTC is not set")) + } + if !issetUnit { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Unit is not set")) + } + return nil +} + +func (p *TimestampType) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.IsAdjustedToUTC = v + } + return nil +} + +func (p *TimestampType) ReadField2(iprot thrift.TProtocol) 
error { + p.Unit = &TimeUnit{} + if err := p.Unit.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.Unit), err) + } + return nil +} + +func (p *TimestampType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("TimestampType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *TimestampType) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("isAdjustedToUTC", thrift.BOOL, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:isAdjustedToUTC: ", p), err) + } + if err := oprot.WriteBool(bool(p.IsAdjustedToUTC)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.isAdjustedToUTC (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:isAdjustedToUTC: ", p), err) + } + return err +} + +func (p *TimestampType) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("unit", thrift.STRUCT, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:unit: ", p), err) + } + if err := p.Unit.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.Unit), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:unit: ", p), err) + } + return err +} + +func (p *TimestampType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("TimestampType(%+v)", *p) +} + +// Time logical type annotation +// +// Allowed for physical types: INT32 (millis), INT64 (micros, nanos) +// +// Attributes: +// - IsAdjustedToUTC +// - Unit +type TimeType struct { + IsAdjustedToUTC bool `thrift:"isAdjustedToUTC,1,required" db:"isAdjustedToUTC" json:"isAdjustedToUTC"` + Unit *TimeUnit `thrift:"unit,2,required" db:"unit" json:"unit"` +} + +func NewTimeType() *TimeType { + return &TimeType{} +} + +func (p *TimeType) GetIsAdjustedToUTC() bool { + return p.IsAdjustedToUTC +} + +var TimeType_Unit_DEFAULT *TimeUnit + +func (p *TimeType) GetUnit() *TimeUnit { + if !p.IsSetUnit() { + return TimeType_Unit_DEFAULT + } + return p.Unit +} +func (p *TimeType) IsSetUnit() bool { + return p.Unit != nil +} + +func (p *TimeType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetIsAdjustedToUTC bool = false + var issetUnit bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetIsAdjustedToUTC = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetUnit = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err 
:= iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetIsAdjustedToUTC { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field IsAdjustedToUTC is not set")) + } + if !issetUnit { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Unit is not set")) + } + return nil +} + +func (p *TimeType) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.IsAdjustedToUTC = v + } + return nil +} + +func (p *TimeType) ReadField2(iprot thrift.TProtocol) error { + p.Unit = &TimeUnit{} + if err := p.Unit.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.Unit), err) + } + return nil +} + +func (p *TimeType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("TimeType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *TimeType) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("isAdjustedToUTC", thrift.BOOL, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:isAdjustedToUTC: ", p), err) + } + if err := oprot.WriteBool(bool(p.IsAdjustedToUTC)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.isAdjustedToUTC (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:isAdjustedToUTC: ", p), err) + } + return err +} + +func (p *TimeType) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("unit", thrift.STRUCT, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:unit: ", p), err) + } + if err := p.Unit.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.Unit), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:unit: ", p), err) + } + return err +} + +func (p *TimeType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("TimeType(%+v)", *p) +} + +// Integer logical type annotation +// +// bitWidth must be 8, 16, 32, or 64. 
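+// For example, a column of signed 16-bit values is annotated with
+// bitWidth=16 and isSigned=true. A minimal sketch using the generated
+// constructor below (editor's illustration, not part of the generated code):
+//
+//	it := NewIntType()
+//	it.BitWidth = 16
+//	it.IsSigned = true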
+// +// Allowed for physical types: INT32, INT64 +// +// Attributes: +// - BitWidth +// - IsSigned +type IntType struct { + BitWidth int8 `thrift:"bitWidth,1,required" db:"bitWidth" json:"bitWidth"` + IsSigned bool `thrift:"isSigned,2,required" db:"isSigned" json:"isSigned"` +} + +func NewIntType() *IntType { + return &IntType{} +} + +func (p *IntType) GetBitWidth() int8 { + return p.BitWidth +} + +func (p *IntType) GetIsSigned() bool { + return p.IsSigned +} +func (p *IntType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetBitWidth bool = false + var issetIsSigned bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetBitWidth = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetIsSigned = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetBitWidth { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field BitWidth is not set")) + } + if !issetIsSigned { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field IsSigned is not set")) + } + return nil +} + +func (p *IntType) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadByte(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + temp := int8(v) + p.BitWidth = temp + } + return nil +} + +func (p *IntType) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.IsSigned = v + } + return nil +} + +func (p *IntType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("IntType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *IntType) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("bitWidth", thrift.BYTE, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:bitWidth: ", p), err) + } + if err := oprot.WriteByte(int8(p.BitWidth)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.bitWidth (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:bitWidth: ", p), err) + } + return err +} + +func (p *IntType) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("isSigned", thrift.BOOL, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field 
begin error 2:isSigned: ", p), err) + } + if err := oprot.WriteBool(bool(p.IsSigned)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.isSigned (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:isSigned: ", p), err) + } + return err +} + +func (p *IntType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("IntType(%+v)", *p) +} + +// Embedded JSON logical type annotation +// +// Allowed for physical types: BINARY +type JsonType struct { +} + +func NewJsonType() *JsonType { + return &JsonType{} +} + +func (p *JsonType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *JsonType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("JsonType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *JsonType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("JsonType(%+v)", *p) +} + +// Embedded BSON logical type annotation +// +// Allowed for physical types: BINARY +type BsonType struct { +} + +func NewBsonType() *BsonType { + return &BsonType{} +} + +func (p *BsonType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *BsonType) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("BsonType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *BsonType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("BsonType(%+v)", *p) +} + +// LogicalType annotations to replace ConvertedType. 
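+// LogicalType is a Thrift union: exactly one of the member fields below must
+// be set before Write is called (CountSetFieldsLogicalType enforces this).
+// A minimal sketch, reusing the StringType defined earlier in this file
+// (editor's illustration, not part of the generated code):
+//
+//	lt := NewLogicalType()
+//	lt.STRING = &StringType{} // annotates a BINARY column as a UTF-8 string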
+// +// To maintain compatibility, implementations using LogicalType for a +// SchemaElement must also set the corresponding ConvertedType from the +// following table. +// +// Attributes: +// - STRING +// - MAP +// - LIST +// - ENUM +// - DECIMAL +// - DATE +// - TIME +// - TIMESTAMP +// - INTEGER +// - UNKNOWN +// - JSON +// - BSON +// - UUID +type LogicalType struct { + STRING *StringType `thrift:"STRING,1" db:"STRING" json:"STRING,omitempty"` + MAP *MapType `thrift:"MAP,2" db:"MAP" json:"MAP,omitempty"` + LIST *ListType `thrift:"LIST,3" db:"LIST" json:"LIST,omitempty"` + ENUM *EnumType `thrift:"ENUM,4" db:"ENUM" json:"ENUM,omitempty"` + DECIMAL *DecimalType `thrift:"DECIMAL,5" db:"DECIMAL" json:"DECIMAL,omitempty"` + DATE *DateType `thrift:"DATE,6" db:"DATE" json:"DATE,omitempty"` + TIME *TimeType `thrift:"TIME,7" db:"TIME" json:"TIME,omitempty"` + TIMESTAMP *TimestampType `thrift:"TIMESTAMP,8" db:"TIMESTAMP" json:"TIMESTAMP,omitempty"` + // unused field # 9 + INTEGER *IntType `thrift:"INTEGER,10" db:"INTEGER" json:"INTEGER,omitempty"` + UNKNOWN *NullType `thrift:"UNKNOWN,11" db:"UNKNOWN" json:"UNKNOWN,omitempty"` + JSON *JsonType `thrift:"JSON,12" db:"JSON" json:"JSON,omitempty"` + BSON *BsonType `thrift:"BSON,13" db:"BSON" json:"BSON,omitempty"` + UUID *UUIDType `thrift:"UUID,14" db:"UUID" json:"UUID,omitempty"` +} + +func NewLogicalType() *LogicalType { + return &LogicalType{} +} + +var LogicalType_STRING_DEFAULT *StringType + +func (p *LogicalType) GetSTRING() *StringType { + if !p.IsSetSTRING() { + return LogicalType_STRING_DEFAULT + } + return p.STRING +} + +var LogicalType_MAP_DEFAULT *MapType + +func (p *LogicalType) GetMAP() *MapType { + if !p.IsSetMAP() { + return LogicalType_MAP_DEFAULT + } + return p.MAP +} + +var LogicalType_LIST_DEFAULT *ListType + +func (p *LogicalType) GetLIST() *ListType { + if !p.IsSetLIST() { + return LogicalType_LIST_DEFAULT + } + return p.LIST +} + +var LogicalType_ENUM_DEFAULT *EnumType + +func (p *LogicalType) GetENUM() *EnumType { + if !p.IsSetENUM() { + return LogicalType_ENUM_DEFAULT + } + return p.ENUM +} + +var LogicalType_DECIMAL_DEFAULT *DecimalType + +func (p *LogicalType) GetDECIMAL() *DecimalType { + if !p.IsSetDECIMAL() { + return LogicalType_DECIMAL_DEFAULT + } + return p.DECIMAL +} + +var LogicalType_DATE_DEFAULT *DateType + +func (p *LogicalType) GetDATE() *DateType { + if !p.IsSetDATE() { + return LogicalType_DATE_DEFAULT + } + return p.DATE +} + +var LogicalType_TIME_DEFAULT *TimeType + +func (p *LogicalType) GetTIME() *TimeType { + if !p.IsSetTIME() { + return LogicalType_TIME_DEFAULT + } + return p.TIME +} + +var LogicalType_TIMESTAMP_DEFAULT *TimestampType + +func (p *LogicalType) GetTIMESTAMP() *TimestampType { + if !p.IsSetTIMESTAMP() { + return LogicalType_TIMESTAMP_DEFAULT + } + return p.TIMESTAMP +} + +var LogicalType_INTEGER_DEFAULT *IntType + +func (p *LogicalType) GetINTEGER() *IntType { + if !p.IsSetINTEGER() { + return LogicalType_INTEGER_DEFAULT + } + return p.INTEGER +} + +var LogicalType_UNKNOWN_DEFAULT *NullType + +func (p *LogicalType) GetUNKNOWN() *NullType { + if !p.IsSetUNKNOWN() { + return LogicalType_UNKNOWN_DEFAULT + } + return p.UNKNOWN +} + +var LogicalType_JSON_DEFAULT *JsonType + +func (p *LogicalType) GetJSON() *JsonType { + if !p.IsSetJSON() { + return LogicalType_JSON_DEFAULT + } + return p.JSON +} + +var LogicalType_BSON_DEFAULT *BsonType + +func (p *LogicalType) GetBSON() *BsonType { + if !p.IsSetBSON() { + return LogicalType_BSON_DEFAULT + } + return p.BSON +} + +var LogicalType_UUID_DEFAULT 
*UUIDType + +func (p *LogicalType) GetUUID() *UUIDType { + if !p.IsSetUUID() { + return LogicalType_UUID_DEFAULT + } + return p.UUID +} +func (p *LogicalType) CountSetFieldsLogicalType() int { + count := 0 + if p.IsSetSTRING() { + count++ + } + if p.IsSetMAP() { + count++ + } + if p.IsSetLIST() { + count++ + } + if p.IsSetENUM() { + count++ + } + if p.IsSetDECIMAL() { + count++ + } + if p.IsSetDATE() { + count++ + } + if p.IsSetTIME() { + count++ + } + if p.IsSetTIMESTAMP() { + count++ + } + if p.IsSetINTEGER() { + count++ + } + if p.IsSetUNKNOWN() { + count++ + } + if p.IsSetJSON() { + count++ + } + if p.IsSetBSON() { + count++ + } + if p.IsSetUUID() { + count++ + } + return count + +} + +func (p *LogicalType) IsSetSTRING() bool { + return p.STRING != nil +} + +func (p *LogicalType) IsSetMAP() bool { + return p.MAP != nil +} + +func (p *LogicalType) IsSetLIST() bool { + return p.LIST != nil +} + +func (p *LogicalType) IsSetENUM() bool { + return p.ENUM != nil +} + +func (p *LogicalType) IsSetDECIMAL() bool { + return p.DECIMAL != nil +} + +func (p *LogicalType) IsSetDATE() bool { + return p.DATE != nil +} + +func (p *LogicalType) IsSetTIME() bool { + return p.TIME != nil +} + +func (p *LogicalType) IsSetTIMESTAMP() bool { + return p.TIMESTAMP != nil +} + +func (p *LogicalType) IsSetINTEGER() bool { + return p.INTEGER != nil +} + +func (p *LogicalType) IsSetUNKNOWN() bool { + return p.UNKNOWN != nil +} + +func (p *LogicalType) IsSetJSON() bool { + return p.JSON != nil +} + +func (p *LogicalType) IsSetBSON() bool { + return p.BSON != nil +} + +func (p *LogicalType) IsSetUUID() bool { + return p.UUID != nil +} + +func (p *LogicalType) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + case 6: + if err := p.ReadField6(iprot); err != nil { + return err + } + case 7: + if err := p.ReadField7(iprot); err != nil { + return err + } + case 8: + if err := p.ReadField8(iprot); err != nil { + return err + } + case 10: + if err := p.ReadField10(iprot); err != nil { + return err + } + case 11: + if err := p.ReadField11(iprot); err != nil { + return err + } + case 12: + if err := p.ReadField12(iprot); err != nil { + return err + } + case 13: + if err := p.ReadField13(iprot); err != nil { + return err + } + case 14: + if err := p.ReadField14(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *LogicalType) ReadField1(iprot thrift.TProtocol) error { + p.STRING = &StringType{} + if err := p.STRING.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.STRING), err) + } 
+ return nil +} + +func (p *LogicalType) ReadField2(iprot thrift.TProtocol) error { + p.MAP = &MapType{} + if err := p.MAP.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.MAP), err) + } + return nil +} + +func (p *LogicalType) ReadField3(iprot thrift.TProtocol) error { + p.LIST = &ListType{} + if err := p.LIST.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.LIST), err) + } + return nil +} + +func (p *LogicalType) ReadField4(iprot thrift.TProtocol) error { + p.ENUM = &EnumType{} + if err := p.ENUM.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.ENUM), err) + } + return nil +} + +func (p *LogicalType) ReadField5(iprot thrift.TProtocol) error { + p.DECIMAL = &DecimalType{} + if err := p.DECIMAL.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.DECIMAL), err) + } + return nil +} + +func (p *LogicalType) ReadField6(iprot thrift.TProtocol) error { + p.DATE = &DateType{} + if err := p.DATE.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.DATE), err) + } + return nil +} + +func (p *LogicalType) ReadField7(iprot thrift.TProtocol) error { + p.TIME = &TimeType{} + if err := p.TIME.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.TIME), err) + } + return nil +} + +func (p *LogicalType) ReadField8(iprot thrift.TProtocol) error { + p.TIMESTAMP = &TimestampType{} + if err := p.TIMESTAMP.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.TIMESTAMP), err) + } + return nil +} + +func (p *LogicalType) ReadField10(iprot thrift.TProtocol) error { + p.INTEGER = &IntType{} + if err := p.INTEGER.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.INTEGER), err) + } + return nil +} + +func (p *LogicalType) ReadField11(iprot thrift.TProtocol) error { + p.UNKNOWN = &NullType{} + if err := p.UNKNOWN.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.UNKNOWN), err) + } + return nil +} + +func (p *LogicalType) ReadField12(iprot thrift.TProtocol) error { + p.JSON = &JsonType{} + if err := p.JSON.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.JSON), err) + } + return nil +} + +func (p *LogicalType) ReadField13(iprot thrift.TProtocol) error { + p.BSON = &BsonType{} + if err := p.BSON.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.BSON), err) + } + return nil +} + +func (p *LogicalType) ReadField14(iprot thrift.TProtocol) error { + p.UUID = &UUIDType{} + if err := p.UUID.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.UUID), err) + } + return nil +} + +func (p *LogicalType) Write(oprot thrift.TProtocol) error { + if c := p.CountSetFieldsLogicalType(); c != 1 { + return fmt.Errorf("%T write union: exactly one field must be set (%d set).", p, c) + } + if err := oprot.WriteStructBegin("LogicalType"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { 
+ return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + if err := p.writeField7(oprot); err != nil { + return err + } + if err := p.writeField8(oprot); err != nil { + return err + } + if err := p.writeField10(oprot); err != nil { + return err + } + if err := p.writeField11(oprot); err != nil { + return err + } + if err := p.writeField12(oprot); err != nil { + return err + } + if err := p.writeField13(oprot); err != nil { + return err + } + if err := p.writeField14(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *LogicalType) writeField1(oprot thrift.TProtocol) (err error) { + if p.IsSetSTRING() { + if err := oprot.WriteFieldBegin("STRING", thrift.STRUCT, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:STRING: ", p), err) + } + if err := p.STRING.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.STRING), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:STRING: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField2(oprot thrift.TProtocol) (err error) { + if p.IsSetMAP() { + if err := oprot.WriteFieldBegin("MAP", thrift.STRUCT, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:MAP: ", p), err) + } + if err := p.MAP.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.MAP), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:MAP: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField3(oprot thrift.TProtocol) (err error) { + if p.IsSetLIST() { + if err := oprot.WriteFieldBegin("LIST", thrift.STRUCT, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:LIST: ", p), err) + } + if err := p.LIST.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.LIST), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:LIST: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField4(oprot thrift.TProtocol) (err error) { + if p.IsSetENUM() { + if err := oprot.WriteFieldBegin("ENUM", thrift.STRUCT, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:ENUM: ", p), err) + } + if err := p.ENUM.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.ENUM), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:ENUM: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetDECIMAL() { + if err := oprot.WriteFieldBegin("DECIMAL", thrift.STRUCT, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:DECIMAL: ", p), err) + } + if err := p.DECIMAL.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.DECIMAL), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return 
thrift.PrependError(fmt.Sprintf("%T write field end error 5:DECIMAL: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField6(oprot thrift.TProtocol) (err error) { + if p.IsSetDATE() { + if err := oprot.WriteFieldBegin("DATE", thrift.STRUCT, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:DATE: ", p), err) + } + if err := p.DATE.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.DATE), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:DATE: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField7(oprot thrift.TProtocol) (err error) { + if p.IsSetTIME() { + if err := oprot.WriteFieldBegin("TIME", thrift.STRUCT, 7); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 7:TIME: ", p), err) + } + if err := p.TIME.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.TIME), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 7:TIME: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField8(oprot thrift.TProtocol) (err error) { + if p.IsSetTIMESTAMP() { + if err := oprot.WriteFieldBegin("TIMESTAMP", thrift.STRUCT, 8); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 8:TIMESTAMP: ", p), err) + } + if err := p.TIMESTAMP.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.TIMESTAMP), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 8:TIMESTAMP: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField10(oprot thrift.TProtocol) (err error) { + if p.IsSetINTEGER() { + if err := oprot.WriteFieldBegin("INTEGER", thrift.STRUCT, 10); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 10:INTEGER: ", p), err) + } + if err := p.INTEGER.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.INTEGER), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 10:INTEGER: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField11(oprot thrift.TProtocol) (err error) { + if p.IsSetUNKNOWN() { + if err := oprot.WriteFieldBegin("UNKNOWN", thrift.STRUCT, 11); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 11:UNKNOWN: ", p), err) + } + if err := p.UNKNOWN.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.UNKNOWN), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 11:UNKNOWN: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField12(oprot thrift.TProtocol) (err error) { + if p.IsSetJSON() { + if err := oprot.WriteFieldBegin("JSON", thrift.STRUCT, 12); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 12:JSON: ", p), err) + } + if err := p.JSON.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.JSON), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 12:JSON: ", p), err) + } + } + return err +} + +func (p *LogicalType) 
writeField13(oprot thrift.TProtocol) (err error) { + if p.IsSetBSON() { + if err := oprot.WriteFieldBegin("BSON", thrift.STRUCT, 13); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 13:BSON: ", p), err) + } + if err := p.BSON.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.BSON), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 13:BSON: ", p), err) + } + } + return err +} + +func (p *LogicalType) writeField14(oprot thrift.TProtocol) (err error) { + if p.IsSetUUID() { + if err := oprot.WriteFieldBegin("UUID", thrift.STRUCT, 14); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 14:UUID: ", p), err) + } + if err := p.UUID.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.UUID), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 14:UUID: ", p), err) + } + } + return err +} + +func (p *LogicalType) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("LogicalType(%+v)", *p) +} + +// Represents an element inside a schema definition. +// - if it is a group (inner node) then type is undefined and num_children is defined +// - if it is a primitive type (leaf) then type is defined and num_children is undefined +// the nodes are listed in depth-first traversal order. +// +// Attributes: +// - Type: Data type for this field. Not set if the current element is a non-leaf node +// - TypeLength: If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. +// Otherwise, if specified, this is the maximum bit length to store any of the values. +// (e.g. a low cardinality INT col could have this set to 3). Note that this is +// in the schema, and therefore fixed for the entire file. +// - RepetitionType: repetition of the field. The root of the schema does not have a repetition_type. +// All other nodes must have one. +// - Name: Name of the field in the schema +// - NumChildren: Nested fields. Since thrift does not support nested fields, +// the nesting is flattened to a single list by a depth-first traversal. +// The children count is used to construct the nested relationship. +// This field is not set when the element is a primitive type +// - ConvertedType: When the schema is the result of a conversion from another model +// Used to record the original type to help with cross conversion. +// - Scale: Used when this column contains decimal data. +// See the DECIMAL converted type for more details. +// - Precision +// - FieldID: When the original schema supports field ids, this will save the +// original field id in the parquet schema +// - LogicalType: The logical type of this SchemaElement +// +// LogicalType replaces ConvertedType, but ConvertedType is still required +// for some logical types to ensure forward-compatibility in format v1. 
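+//
+// For a leaf element holding DECIMAL(precision=9, scale=2) data, the legacy
+// Scale/Precision/ConvertedType fields and the new LogicalType are set
+// together, per the compatibility note above. A minimal sketch using the
+// constructors in this file (editor's illustration; the matching
+// ConvertedType enum value is defined earlier in the file and omitted here):
+//
+//	se := NewSchemaElement()
+//	se.Name = "price"
+//	scale, precision := int32(2), int32(9)
+//	se.Scale, se.Precision = &scale, &precision
+//	se.LogicalType = NewLogicalType()
+//	se.LogicalType.DECIMAL = NewDecimalType()
+//	se.LogicalType.DECIMAL.Scale = scale
+//	se.LogicalType.DECIMAL.Precision = precision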
+type SchemaElement struct { + Type *Type `thrift:"type,1" db:"type" json:"type,omitempty"` + TypeLength *int32 `thrift:"type_length,2" db:"type_length" json:"type_length,omitempty"` + RepetitionType *FieldRepetitionType `thrift:"repetition_type,3" db:"repetition_type" json:"repetition_type,omitempty"` + Name string `thrift:"name,4,required" db:"name" json:"name"` + NumChildren *int32 `thrift:"num_children,5" db:"num_children" json:"num_children,omitempty"` + ConvertedType *ConvertedType `thrift:"converted_type,6" db:"converted_type" json:"converted_type,omitempty"` + Scale *int32 `thrift:"scale,7" db:"scale" json:"scale,omitempty"` + Precision *int32 `thrift:"precision,8" db:"precision" json:"precision,omitempty"` + FieldID *int32 `thrift:"field_id,9" db:"field_id" json:"field_id,omitempty"` + LogicalType *LogicalType `thrift:"logicalType,10" db:"logicalType" json:"logicalType,omitempty"` +} + +func NewSchemaElement() *SchemaElement { + return &SchemaElement{} +} + +var SchemaElement_Type_DEFAULT Type + +func (p *SchemaElement) GetType() Type { + if !p.IsSetType() { + return SchemaElement_Type_DEFAULT + } + return *p.Type +} + +var SchemaElement_TypeLength_DEFAULT int32 + +func (p *SchemaElement) GetTypeLength() int32 { + if !p.IsSetTypeLength() { + return SchemaElement_TypeLength_DEFAULT + } + return *p.TypeLength +} + +var SchemaElement_RepetitionType_DEFAULT FieldRepetitionType + +func (p *SchemaElement) GetRepetitionType() FieldRepetitionType { + if !p.IsSetRepetitionType() { + return SchemaElement_RepetitionType_DEFAULT + } + return *p.RepetitionType +} + +func (p *SchemaElement) GetName() string { + return p.Name +} + +var SchemaElement_NumChildren_DEFAULT int32 + +func (p *SchemaElement) GetNumChildren() int32 { + if !p.IsSetNumChildren() { + return SchemaElement_NumChildren_DEFAULT + } + return *p.NumChildren +} + +var SchemaElement_ConvertedType_DEFAULT ConvertedType + +func (p *SchemaElement) GetConvertedType() ConvertedType { + if !p.IsSetConvertedType() { + return SchemaElement_ConvertedType_DEFAULT + } + return *p.ConvertedType +} + +var SchemaElement_Scale_DEFAULT int32 + +func (p *SchemaElement) GetScale() int32 { + if !p.IsSetScale() { + return SchemaElement_Scale_DEFAULT + } + return *p.Scale +} + +var SchemaElement_Precision_DEFAULT int32 + +func (p *SchemaElement) GetPrecision() int32 { + if !p.IsSetPrecision() { + return SchemaElement_Precision_DEFAULT + } + return *p.Precision +} + +var SchemaElement_FieldID_DEFAULT int32 + +func (p *SchemaElement) GetFieldID() int32 { + if !p.IsSetFieldID() { + return SchemaElement_FieldID_DEFAULT + } + return *p.FieldID +} + +var SchemaElement_LogicalType_DEFAULT *LogicalType + +func (p *SchemaElement) GetLogicalType() *LogicalType { + if !p.IsSetLogicalType() { + return SchemaElement_LogicalType_DEFAULT + } + return p.LogicalType +} +func (p *SchemaElement) IsSetType() bool { + return p.Type != nil +} + +func (p *SchemaElement) IsSetTypeLength() bool { + return p.TypeLength != nil +} + +func (p *SchemaElement) IsSetRepetitionType() bool { + return p.RepetitionType != nil +} + +func (p *SchemaElement) IsSetNumChildren() bool { + return p.NumChildren != nil +} + +func (p *SchemaElement) IsSetConvertedType() bool { + return p.ConvertedType != nil +} + +func (p *SchemaElement) IsSetScale() bool { + return p.Scale != nil +} + +func (p *SchemaElement) IsSetPrecision() bool { + return p.Precision != nil +} + +func (p *SchemaElement) IsSetFieldID() bool { + return p.FieldID != nil +} + +func (p *SchemaElement) IsSetLogicalType() bool { + 
return p.LogicalType != nil +} + +func (p *SchemaElement) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetName bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + issetName = true + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + case 6: + if err := p.ReadField6(iprot); err != nil { + return err + } + case 7: + if err := p.ReadField7(iprot); err != nil { + return err + } + case 8: + if err := p.ReadField8(iprot); err != nil { + return err + } + case 9: + if err := p.ReadField9(iprot); err != nil { + return err + } + case 10: + if err := p.ReadField10(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetName { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Name is not set")) + } + return nil +} + +func (p *SchemaElement) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + temp := Type(v) + p.Type = &temp + } + return nil +} + +func (p *SchemaElement) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.TypeLength = &v + } + return nil +} + +func (p *SchemaElement) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + temp := FieldRepetitionType(v) + p.RepetitionType = &temp + } + return nil +} + +func (p *SchemaElement) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadString(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + p.Name = v + } + return nil +} + +func (p *SchemaElement) ReadField5(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 5: ", err) + } else { + p.NumChildren = &v + } + return nil +} + +func (p *SchemaElement) ReadField6(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 6: ", err) + } else { + temp := ConvertedType(v) + p.ConvertedType = &temp + } + return nil +} + +func (p *SchemaElement) ReadField7(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 7: ", err) + } else { + p.Scale = &v + } + return nil +} + +func (p *SchemaElement) ReadField8(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 8: ", err) + } else { + p.Precision = &v + 
} + return nil +} + +func (p *SchemaElement) ReadField9(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 9: ", err) + } else { + p.FieldID = &v + } + return nil +} + +func (p *SchemaElement) ReadField10(iprot thrift.TProtocol) error { + p.LogicalType = &LogicalType{} + if err := p.LogicalType.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.LogicalType), err) + } + return nil +} + +func (p *SchemaElement) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("SchemaElement"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + if err := p.writeField7(oprot); err != nil { + return err + } + if err := p.writeField8(oprot); err != nil { + return err + } + if err := p.writeField9(oprot); err != nil { + return err + } + if err := p.writeField10(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *SchemaElement) writeField1(oprot thrift.TProtocol) (err error) { + if p.IsSetType() { + if err := oprot.WriteFieldBegin("type", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:type: ", p), err) + } + if err := oprot.WriteI32(int32(*p.Type)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.type (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:type: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField2(oprot thrift.TProtocol) (err error) { + if p.IsSetTypeLength() { + if err := oprot.WriteFieldBegin("type_length", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:type_length: ", p), err) + } + if err := oprot.WriteI32(int32(*p.TypeLength)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.type_length (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:type_length: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField3(oprot thrift.TProtocol) (err error) { + if p.IsSetRepetitionType() { + if err := oprot.WriteFieldBegin("repetition_type", thrift.I32, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:repetition_type: ", p), err) + } + if err := oprot.WriteI32(int32(*p.RepetitionType)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.repetition_type (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:repetition_type: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField4(oprot thrift.TProtocol) (err error) { + if err := 
oprot.WriteFieldBegin("name", thrift.STRING, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:name: ", p), err) + } + if err := oprot.WriteString(string(p.Name)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.name (4) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:name: ", p), err) + } + return err +} + +func (p *SchemaElement) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetNumChildren() { + if err := oprot.WriteFieldBegin("num_children", thrift.I32, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:num_children: ", p), err) + } + if err := oprot.WriteI32(int32(*p.NumChildren)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_children (5) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:num_children: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField6(oprot thrift.TProtocol) (err error) { + if p.IsSetConvertedType() { + if err := oprot.WriteFieldBegin("converted_type", thrift.I32, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:converted_type: ", p), err) + } + if err := oprot.WriteI32(int32(*p.ConvertedType)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.converted_type (6) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:converted_type: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField7(oprot thrift.TProtocol) (err error) { + if p.IsSetScale() { + if err := oprot.WriteFieldBegin("scale", thrift.I32, 7); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 7:scale: ", p), err) + } + if err := oprot.WriteI32(int32(*p.Scale)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.scale (7) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 7:scale: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField8(oprot thrift.TProtocol) (err error) { + if p.IsSetPrecision() { + if err := oprot.WriteFieldBegin("precision", thrift.I32, 8); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 8:precision: ", p), err) + } + if err := oprot.WriteI32(int32(*p.Precision)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.precision (8) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 8:precision: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField9(oprot thrift.TProtocol) (err error) { + if p.IsSetFieldID() { + if err := oprot.WriteFieldBegin("field_id", thrift.I32, 9); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 9:field_id: ", p), err) + } + if err := oprot.WriteI32(int32(*p.FieldID)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.field_id (9) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 9:field_id: ", p), err) + } + } + return err +} + +func (p *SchemaElement) writeField10(oprot thrift.TProtocol) (err error) { + if 
p.IsSetLogicalType() { + if err := oprot.WriteFieldBegin("logicalType", thrift.STRUCT, 10); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 10:logicalType: ", p), err) + } + if err := p.LogicalType.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.LogicalType), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 10:logicalType: ", p), err) + } + } + return err +} + +func (p *SchemaElement) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("SchemaElement(%+v)", *p) +} + +// Data page header +// +// Attributes: +// - NumValues: Number of values, including NULLs, in this data page. * +// - Encoding: Encoding used for this data page * +// - DefinitionLevelEncoding: Encoding used for definition levels * +// - RepetitionLevelEncoding: Encoding used for repetition levels * +// - Statistics: Optional statistics for the data in this page* +type DataPageHeader struct { + NumValues int32 `thrift:"num_values,1,required" db:"num_values" json:"num_values"` + Encoding Encoding `thrift:"encoding,2,required" db:"encoding" json:"encoding"` + DefinitionLevelEncoding Encoding `thrift:"definition_level_encoding,3,required" db:"definition_level_encoding" json:"definition_level_encoding"` + RepetitionLevelEncoding Encoding `thrift:"repetition_level_encoding,4,required" db:"repetition_level_encoding" json:"repetition_level_encoding"` + Statistics *Statistics `thrift:"statistics,5" db:"statistics" json:"statistics,omitempty"` +} + +func NewDataPageHeader() *DataPageHeader { + return &DataPageHeader{} +} + +func (p *DataPageHeader) GetNumValues() int32 { + return p.NumValues +} + +func (p *DataPageHeader) GetEncoding() Encoding { + return p.Encoding +} + +func (p *DataPageHeader) GetDefinitionLevelEncoding() Encoding { + return p.DefinitionLevelEncoding +} + +func (p *DataPageHeader) GetRepetitionLevelEncoding() Encoding { + return p.RepetitionLevelEncoding +} + +var DataPageHeader_Statistics_DEFAULT *Statistics + +func (p *DataPageHeader) GetStatistics() *Statistics { + if !p.IsSetStatistics() { + return DataPageHeader_Statistics_DEFAULT + } + return p.Statistics +} +func (p *DataPageHeader) IsSetStatistics() bool { + return p.Statistics != nil +} + +func (p *DataPageHeader) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetNumValues bool = false + var issetEncoding bool = false + var issetDefinitionLevelEncoding bool = false + var issetRepetitionLevelEncoding bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetNumValues = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetEncoding = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetDefinitionLevelEncoding = true + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + issetRepetitionLevelEncoding = true + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + 
return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetNumValues { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumValues is not set")) + } + if !issetEncoding { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Encoding is not set")) + } + if !issetDefinitionLevelEncoding { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field DefinitionLevelEncoding is not set")) + } + if !issetRepetitionLevelEncoding { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field RepetitionLevelEncoding is not set")) + } + return nil +} + +func (p *DataPageHeader) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.NumValues = v + } + return nil +} + +func (p *DataPageHeader) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + temp := Encoding(v) + p.Encoding = temp + } + return nil +} + +func (p *DataPageHeader) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + temp := Encoding(v) + p.DefinitionLevelEncoding = temp + } + return nil +} + +func (p *DataPageHeader) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + temp := Encoding(v) + p.RepetitionLevelEncoding = temp + } + return nil +} + +func (p *DataPageHeader) ReadField5(iprot thrift.TProtocol) error { + p.Statistics = &Statistics{} + if err := p.Statistics.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.Statistics), err) + } + return nil +} + +func (p *DataPageHeader) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("DataPageHeader"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *DataPageHeader) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_values", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:num_values: ", p), err) + } + if err := oprot.WriteI32(int32(p.NumValues)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_values (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:num_values: ", p), err) + } + return err +} + +func (p *DataPageHeader) writeField2(oprot thrift.TProtocol) (err error) { + if err := 
oprot.WriteFieldBegin("encoding", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:encoding: ", p), err) + } + if err := oprot.WriteI32(int32(p.Encoding)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.encoding (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:encoding: ", p), err) + } + return err +} + +func (p *DataPageHeader) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("definition_level_encoding", thrift.I32, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:definition_level_encoding: ", p), err) + } + if err := oprot.WriteI32(int32(p.DefinitionLevelEncoding)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.definition_level_encoding (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:definition_level_encoding: ", p), err) + } + return err +} + +func (p *DataPageHeader) writeField4(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("repetition_level_encoding", thrift.I32, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:repetition_level_encoding: ", p), err) + } + if err := oprot.WriteI32(int32(p.RepetitionLevelEncoding)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.repetition_level_encoding (4) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:repetition_level_encoding: ", p), err) + } + return err +} + +func (p *DataPageHeader) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetStatistics() { + if err := oprot.WriteFieldBegin("statistics", thrift.STRUCT, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:statistics: ", p), err) + } + if err := p.Statistics.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.Statistics), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:statistics: ", p), err) + } + } + return err +} + +func (p *DataPageHeader) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("DataPageHeader(%+v)", *p) +} + +type IndexPageHeader struct { +} + +func NewIndexPageHeader() *IndexPageHeader { + return &IndexPageHeader{} +} + +func (p *IndexPageHeader) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *IndexPageHeader) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("IndexPageHeader"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := 
oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *IndexPageHeader) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("IndexPageHeader(%+v)", *p) +} + +// TODO: * +// +// Attributes: +// - NumValues: Number of values in the dictionary * +// - Encoding: Encoding using this dictionary page * +// - IsSorted: If true, the entries in the dictionary are sorted in ascending order * +type DictionaryPageHeader struct { + NumValues int32 `thrift:"num_values,1,required" db:"num_values" json:"num_values"` + Encoding Encoding `thrift:"encoding,2,required" db:"encoding" json:"encoding"` + IsSorted *bool `thrift:"is_sorted,3" db:"is_sorted" json:"is_sorted,omitempty"` +} + +func NewDictionaryPageHeader() *DictionaryPageHeader { + return &DictionaryPageHeader{} +} + +func (p *DictionaryPageHeader) GetNumValues() int32 { + return p.NumValues +} + +func (p *DictionaryPageHeader) GetEncoding() Encoding { + return p.Encoding +} + +var DictionaryPageHeader_IsSorted_DEFAULT bool + +func (p *DictionaryPageHeader) GetIsSorted() bool { + if !p.IsSetIsSorted() { + return DictionaryPageHeader_IsSorted_DEFAULT + } + return *p.IsSorted +} +func (p *DictionaryPageHeader) IsSetIsSorted() bool { + return p.IsSorted != nil +} + +func (p *DictionaryPageHeader) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetNumValues bool = false + var issetEncoding bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetNumValues = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetEncoding = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetNumValues { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumValues is not set")) + } + if !issetEncoding { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Encoding is not set")) + } + return nil +} + +func (p *DictionaryPageHeader) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.NumValues = v + } + return nil +} + +func (p *DictionaryPageHeader) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + temp := Encoding(v) + p.Encoding = temp + } + return nil +} + +func (p *DictionaryPageHeader) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + p.IsSorted = &v + } + return nil +} + +func (p *DictionaryPageHeader) 
Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("DictionaryPageHeader"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *DictionaryPageHeader) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_values", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:num_values: ", p), err) + } + if err := oprot.WriteI32(int32(p.NumValues)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_values (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:num_values: ", p), err) + } + return err +} + +func (p *DictionaryPageHeader) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("encoding", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:encoding: ", p), err) + } + if err := oprot.WriteI32(int32(p.Encoding)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.encoding (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:encoding: ", p), err) + } + return err +} + +func (p *DictionaryPageHeader) writeField3(oprot thrift.TProtocol) (err error) { + if p.IsSetIsSorted() { + if err := oprot.WriteFieldBegin("is_sorted", thrift.BOOL, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:is_sorted: ", p), err) + } + if err := oprot.WriteBool(bool(*p.IsSorted)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.is_sorted (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:is_sorted: ", p), err) + } + } + return err +} + +func (p *DictionaryPageHeader) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("DictionaryPageHeader(%+v)", *p) +} + +// New page format allowing reading levels without decompressing the data +// Repetition and definition levels are uncompressed +// The remaining section containing the data is compressed if is_compressed is true +// +// +// Attributes: +// - NumValues: Number of values, including NULLs, in this data page. * +// - NumNulls: Number of NULL values, in this data page. +// Number of non-null = num_values - num_nulls which is also the number of values in the data section * +// - NumRows: Number of rows in this data page. which means pages change on record boundaries (r = 0) * +// - Encoding: Encoding used for data in this page * +// - DefinitionLevelsByteLength: length of the definition levels +// - RepetitionLevelsByteLength: length of the repetition levels +// - IsCompressed: whether the values are compressed. 
+// Which means the section of the page between +// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) +// is compressed with the compression_codec. +// If missing it is considered compressed +// - Statistics: optional statistics for this column chunk +type DataPageHeaderV2 struct { + NumValues int32 `thrift:"num_values,1,required" db:"num_values" json:"num_values"` + NumNulls int32 `thrift:"num_nulls,2,required" db:"num_nulls" json:"num_nulls"` + NumRows int32 `thrift:"num_rows,3,required" db:"num_rows" json:"num_rows"` + Encoding Encoding `thrift:"encoding,4,required" db:"encoding" json:"encoding"` + DefinitionLevelsByteLength int32 `thrift:"definition_levels_byte_length,5,required" db:"definition_levels_byte_length" json:"definition_levels_byte_length"` + RepetitionLevelsByteLength int32 `thrift:"repetition_levels_byte_length,6,required" db:"repetition_levels_byte_length" json:"repetition_levels_byte_length"` + IsCompressed bool `thrift:"is_compressed,7" db:"is_compressed" json:"is_compressed,omitempty"` + Statistics *Statistics `thrift:"statistics,8" db:"statistics" json:"statistics,omitempty"` +} + +func NewDataPageHeaderV2() *DataPageHeaderV2 { + return &DataPageHeaderV2{ + IsCompressed: true, + } +} + +func (p *DataPageHeaderV2) GetNumValues() int32 { + return p.NumValues +} + +func (p *DataPageHeaderV2) GetNumNulls() int32 { + return p.NumNulls +} + +func (p *DataPageHeaderV2) GetNumRows() int32 { + return p.NumRows +} + +func (p *DataPageHeaderV2) GetEncoding() Encoding { + return p.Encoding +} + +func (p *DataPageHeaderV2) GetDefinitionLevelsByteLength() int32 { + return p.DefinitionLevelsByteLength +} + +func (p *DataPageHeaderV2) GetRepetitionLevelsByteLength() int32 { + return p.RepetitionLevelsByteLength +} + +var DataPageHeaderV2_IsCompressed_DEFAULT bool = true + +func (p *DataPageHeaderV2) GetIsCompressed() bool { + return p.IsCompressed +} + +var DataPageHeaderV2_Statistics_DEFAULT *Statistics + +func (p *DataPageHeaderV2) GetStatistics() *Statistics { + if !p.IsSetStatistics() { + return DataPageHeaderV2_Statistics_DEFAULT + } + return p.Statistics +} +func (p *DataPageHeaderV2) IsSetIsCompressed() bool { + return p.IsCompressed != DataPageHeaderV2_IsCompressed_DEFAULT +} + +func (p *DataPageHeaderV2) IsSetStatistics() bool { + return p.Statistics != nil +} + +func (p *DataPageHeaderV2) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetNumValues bool = false + var issetNumNulls bool = false + var issetNumRows bool = false + var issetEncoding bool = false + var issetDefinitionLevelsByteLength bool = false + var issetRepetitionLevelsByteLength bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetNumValues = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetNumNulls = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetNumRows = true + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + issetEncoding = true + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + issetDefinitionLevelsByteLength = true + case 6: + if err 
:= p.ReadField6(iprot); err != nil { + return err + } + issetRepetitionLevelsByteLength = true + case 7: + if err := p.ReadField7(iprot); err != nil { + return err + } + case 8: + if err := p.ReadField8(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetNumValues { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumValues is not set")) + } + if !issetNumNulls { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumNulls is not set")) + } + if !issetNumRows { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumRows is not set")) + } + if !issetEncoding { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Encoding is not set")) + } + if !issetDefinitionLevelsByteLength { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field DefinitionLevelsByteLength is not set")) + } + if !issetRepetitionLevelsByteLength { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field RepetitionLevelsByteLength is not set")) + } + return nil +} + +func (p *DataPageHeaderV2) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.NumValues = v + } + return nil +} + +func (p *DataPageHeaderV2) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.NumNulls = v + } + return nil +} + +func (p *DataPageHeaderV2) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + p.NumRows = v + } + return nil +} + +func (p *DataPageHeaderV2) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + temp := Encoding(v) + p.Encoding = temp + } + return nil +} + +func (p *DataPageHeaderV2) ReadField5(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 5: ", err) + } else { + p.DefinitionLevelsByteLength = v + } + return nil +} + +func (p *DataPageHeaderV2) ReadField6(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 6: ", err) + } else { + p.RepetitionLevelsByteLength = v + } + return nil +} + +func (p *DataPageHeaderV2) ReadField7(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 7: ", err) + } else { + p.IsCompressed = v + } + return nil +} + +func (p *DataPageHeaderV2) ReadField8(iprot thrift.TProtocol) error { + p.Statistics = &Statistics{} + if err := p.Statistics.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.Statistics), err) + } + return nil +} + +func (p *DataPageHeaderV2) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("DataPageHeaderV2"); err != nil { + return 
thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + if err := p.writeField7(oprot); err != nil { + return err + } + if err := p.writeField8(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *DataPageHeaderV2) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_values", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:num_values: ", p), err) + } + if err := oprot.WriteI32(int32(p.NumValues)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_values (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:num_values: ", p), err) + } + return err +} + +func (p *DataPageHeaderV2) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_nulls", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:num_nulls: ", p), err) + } + if err := oprot.WriteI32(int32(p.NumNulls)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_nulls (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:num_nulls: ", p), err) + } + return err +} + +func (p *DataPageHeaderV2) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_rows", thrift.I32, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:num_rows: ", p), err) + } + if err := oprot.WriteI32(int32(p.NumRows)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_rows (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:num_rows: ", p), err) + } + return err +} + +func (p *DataPageHeaderV2) writeField4(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("encoding", thrift.I32, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:encoding: ", p), err) + } + if err := oprot.WriteI32(int32(p.Encoding)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.encoding (4) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:encoding: ", p), err) + } + return err +} + +func (p *DataPageHeaderV2) writeField5(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("definition_levels_byte_length", thrift.I32, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:definition_levels_byte_length: ", p), err) + } + if err := oprot.WriteI32(int32(p.DefinitionLevelsByteLength)); err != nil { + return 
thrift.PrependError(fmt.Sprintf("%T.definition_levels_byte_length (5) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:definition_levels_byte_length: ", p), err) + } + return err +} + +func (p *DataPageHeaderV2) writeField6(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("repetition_levels_byte_length", thrift.I32, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:repetition_levels_byte_length: ", p), err) + } + if err := oprot.WriteI32(int32(p.RepetitionLevelsByteLength)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.repetition_levels_byte_length (6) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:repetition_levels_byte_length: ", p), err) + } + return err +} + +func (p *DataPageHeaderV2) writeField7(oprot thrift.TProtocol) (err error) { + if p.IsSetIsCompressed() { + if err := oprot.WriteFieldBegin("is_compressed", thrift.BOOL, 7); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 7:is_compressed: ", p), err) + } + if err := oprot.WriteBool(bool(p.IsCompressed)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.is_compressed (7) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 7:is_compressed: ", p), err) + } + } + return err +} + +func (p *DataPageHeaderV2) writeField8(oprot thrift.TProtocol) (err error) { + if p.IsSetStatistics() { + if err := oprot.WriteFieldBegin("statistics", thrift.STRUCT, 8); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 8:statistics: ", p), err) + } + if err := p.Statistics.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.Statistics), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 8:statistics: ", p), err) + } + } + return err +} + +func (p *DataPageHeaderV2) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("DataPageHeaderV2(%+v)", *p) +} + +// Attributes: +// - Type: the type of the page: indicates which of the *_header fields is set * +// - UncompressedPageSize: Uncompressed page size in bytes (not including this header) * +// - CompressedPageSize: Compressed page size in bytes (not including this header) * +// - Crc: 32bit crc for the data below. 
This allows for disabling checksumming in HDFS +// if only a few pages needs to be read +// +// - DataPageHeader +// - IndexPageHeader +// - DictionaryPageHeader +// - DataPageHeaderV2 +type PageHeader struct { + Type PageType `thrift:"type,1,required" db:"type" json:"type"` + UncompressedPageSize int32 `thrift:"uncompressed_page_size,2,required" db:"uncompressed_page_size" json:"uncompressed_page_size"` + CompressedPageSize int32 `thrift:"compressed_page_size,3,required" db:"compressed_page_size" json:"compressed_page_size"` + Crc *int32 `thrift:"crc,4" db:"crc" json:"crc,omitempty"` + DataPageHeader *DataPageHeader `thrift:"data_page_header,5" db:"data_page_header" json:"data_page_header,omitempty"` + IndexPageHeader *IndexPageHeader `thrift:"index_page_header,6" db:"index_page_header" json:"index_page_header,omitempty"` + DictionaryPageHeader *DictionaryPageHeader `thrift:"dictionary_page_header,7" db:"dictionary_page_header" json:"dictionary_page_header,omitempty"` + DataPageHeaderV2 *DataPageHeaderV2 `thrift:"data_page_header_v2,8" db:"data_page_header_v2" json:"data_page_header_v2,omitempty"` +} + +func NewPageHeader() *PageHeader { + return &PageHeader{} +} + +func (p *PageHeader) GetType() PageType { + return p.Type +} + +func (p *PageHeader) GetUncompressedPageSize() int32 { + return p.UncompressedPageSize +} + +func (p *PageHeader) GetCompressedPageSize() int32 { + return p.CompressedPageSize +} + +var PageHeader_Crc_DEFAULT int32 + +func (p *PageHeader) GetCrc() int32 { + if !p.IsSetCrc() { + return PageHeader_Crc_DEFAULT + } + return *p.Crc +} + +var PageHeader_DataPageHeader_DEFAULT *DataPageHeader + +func (p *PageHeader) GetDataPageHeader() *DataPageHeader { + if !p.IsSetDataPageHeader() { + return PageHeader_DataPageHeader_DEFAULT + } + return p.DataPageHeader +} + +var PageHeader_IndexPageHeader_DEFAULT *IndexPageHeader + +func (p *PageHeader) GetIndexPageHeader() *IndexPageHeader { + if !p.IsSetIndexPageHeader() { + return PageHeader_IndexPageHeader_DEFAULT + } + return p.IndexPageHeader +} + +var PageHeader_DictionaryPageHeader_DEFAULT *DictionaryPageHeader + +func (p *PageHeader) GetDictionaryPageHeader() *DictionaryPageHeader { + if !p.IsSetDictionaryPageHeader() { + return PageHeader_DictionaryPageHeader_DEFAULT + } + return p.DictionaryPageHeader +} + +var PageHeader_DataPageHeaderV2_DEFAULT *DataPageHeaderV2 + +func (p *PageHeader) GetDataPageHeaderV2() *DataPageHeaderV2 { + if !p.IsSetDataPageHeaderV2() { + return PageHeader_DataPageHeaderV2_DEFAULT + } + return p.DataPageHeaderV2 +} +func (p *PageHeader) IsSetCrc() bool { + return p.Crc != nil +} + +func (p *PageHeader) IsSetDataPageHeader() bool { + return p.DataPageHeader != nil +} + +func (p *PageHeader) IsSetIndexPageHeader() bool { + return p.IndexPageHeader != nil +} + +func (p *PageHeader) IsSetDictionaryPageHeader() bool { + return p.DictionaryPageHeader != nil +} + +func (p *PageHeader) IsSetDataPageHeaderV2() bool { + return p.DataPageHeaderV2 != nil +} + +func (p *PageHeader) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetType bool = false + var issetUncompressedPageSize bool = false + var issetCompressedPageSize bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if 
err := p.ReadField1(iprot); err != nil { + return err + } + issetType = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetUncompressedPageSize = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetCompressedPageSize = true + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + case 6: + if err := p.ReadField6(iprot); err != nil { + return err + } + case 7: + if err := p.ReadField7(iprot); err != nil { + return err + } + case 8: + if err := p.ReadField8(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetType { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Type is not set")) + } + if !issetUncompressedPageSize { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field UncompressedPageSize is not set")) + } + if !issetCompressedPageSize { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field CompressedPageSize is not set")) + } + return nil +} + +func (p *PageHeader) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + temp := PageType(v) + p.Type = temp + } + return nil +} + +func (p *PageHeader) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.UncompressedPageSize = v + } + return nil +} + +func (p *PageHeader) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + p.CompressedPageSize = v + } + return nil +} + +func (p *PageHeader) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + p.Crc = &v + } + return nil +} + +func (p *PageHeader) ReadField5(iprot thrift.TProtocol) error { + p.DataPageHeader = &DataPageHeader{} + if err := p.DataPageHeader.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.DataPageHeader), err) + } + return nil +} + +func (p *PageHeader) ReadField6(iprot thrift.TProtocol) error { + p.IndexPageHeader = &IndexPageHeader{} + if err := p.IndexPageHeader.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.IndexPageHeader), err) + } + return nil +} + +func (p *PageHeader) ReadField7(iprot thrift.TProtocol) error { + p.DictionaryPageHeader = &DictionaryPageHeader{} + if err := p.DictionaryPageHeader.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.DictionaryPageHeader), err) + } + return nil +} + +func (p *PageHeader) ReadField8(iprot thrift.TProtocol) error { + p.DataPageHeaderV2 = &DataPageHeaderV2{ + IsCompressed: true, + } + if err := p.DataPageHeaderV2.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.DataPageHeaderV2), err) + } + return nil +} + +func (p *PageHeader) Write(oprot 
thrift.TProtocol) error { + if err := oprot.WriteStructBegin("PageHeader"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + if err := p.writeField7(oprot); err != nil { + return err + } + if err := p.writeField8(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *PageHeader) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("type", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:type: ", p), err) + } + if err := oprot.WriteI32(int32(p.Type)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.type (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:type: ", p), err) + } + return err +} + +func (p *PageHeader) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("uncompressed_page_size", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:uncompressed_page_size: ", p), err) + } + if err := oprot.WriteI32(int32(p.UncompressedPageSize)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.uncompressed_page_size (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:uncompressed_page_size: ", p), err) + } + return err +} + +func (p *PageHeader) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("compressed_page_size", thrift.I32, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:compressed_page_size: ", p), err) + } + if err := oprot.WriteI32(int32(p.CompressedPageSize)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.compressed_page_size (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:compressed_page_size: ", p), err) + } + return err +} + +func (p *PageHeader) writeField4(oprot thrift.TProtocol) (err error) { + if p.IsSetCrc() { + if err := oprot.WriteFieldBegin("crc", thrift.I32, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:crc: ", p), err) + } + if err := oprot.WriteI32(int32(*p.Crc)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.crc (4) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:crc: ", p), err) + } + } + return err +} + +func (p *PageHeader) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetDataPageHeader() { + if err := oprot.WriteFieldBegin("data_page_header", thrift.STRUCT, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 
5:data_page_header: ", p), err) + } + if err := p.DataPageHeader.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.DataPageHeader), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:data_page_header: ", p), err) + } + } + return err +} + +func (p *PageHeader) writeField6(oprot thrift.TProtocol) (err error) { + if p.IsSetIndexPageHeader() { + if err := oprot.WriteFieldBegin("index_page_header", thrift.STRUCT, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:index_page_header: ", p), err) + } + if err := p.IndexPageHeader.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.IndexPageHeader), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:index_page_header: ", p), err) + } + } + return err +} + +func (p *PageHeader) writeField7(oprot thrift.TProtocol) (err error) { + if p.IsSetDictionaryPageHeader() { + if err := oprot.WriteFieldBegin("dictionary_page_header", thrift.STRUCT, 7); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 7:dictionary_page_header: ", p), err) + } + if err := p.DictionaryPageHeader.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.DictionaryPageHeader), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 7:dictionary_page_header: ", p), err) + } + } + return err +} + +func (p *PageHeader) writeField8(oprot thrift.TProtocol) (err error) { + if p.IsSetDataPageHeaderV2() { + if err := oprot.WriteFieldBegin("data_page_header_v2", thrift.STRUCT, 8); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 8:data_page_header_v2: ", p), err) + } + if err := p.DataPageHeaderV2.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.DataPageHeaderV2), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 8:data_page_header_v2: ", p), err) + } + } + return err +} + +func (p *PageHeader) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("PageHeader(%+v)", *p) +} + +// Wrapper struct to store key values +// +// Attributes: +// - Key +// - Value +type KeyValue struct { + Key string `thrift:"key,1,required" db:"key" json:"key"` + Value *string `thrift:"value,2" db:"value" json:"value,omitempty"` +} + +func NewKeyValue() *KeyValue { + return &KeyValue{} +} + +func (p *KeyValue) GetKey() string { + return p.Key +} + +var KeyValue_Value_DEFAULT string + +func (p *KeyValue) GetValue() string { + if !p.IsSetValue() { + return KeyValue_Value_DEFAULT + } + return *p.Value +} +func (p *KeyValue) IsSetValue() bool { + return p.Value != nil +} + +func (p *KeyValue) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetKey bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetKey = true + case 2: + if err := 
p.ReadField2(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetKey { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Key is not set")) + } + return nil +} + +func (p *KeyValue) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadString(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.Key = v + } + return nil +} + +func (p *KeyValue) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadString(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.Value = &v + } + return nil +} + +func (p *KeyValue) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("KeyValue"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *KeyValue) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("key", thrift.STRING, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:key: ", p), err) + } + if err := oprot.WriteString(string(p.Key)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.key (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:key: ", p), err) + } + return err +} + +func (p *KeyValue) writeField2(oprot thrift.TProtocol) (err error) { + if p.IsSetValue() { + if err := oprot.WriteFieldBegin("value", thrift.STRING, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:value: ", p), err) + } + if err := oprot.WriteString(string(*p.Value)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.value (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:value: ", p), err) + } + } + return err +} + +func (p *KeyValue) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("KeyValue(%+v)", *p) +} + +// Wrapper struct to specify sort order +// +// Attributes: +// - ColumnIdx: The column index (in this row group) * +// - Descending: If true, indicates this column is sorted in descending order. * +// - NullsFirst: If true, nulls will come before non-null values, otherwise, +// nulls go at the end. 
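+//
+// Editorial sketch (not part of the generated parquet.thrift output): a
+// minimal, hypothetical example of recording that a row group is sorted
+// ascending by its first column with nulls placed last; the field values
+// below are illustrative only:
+//
+//	sc := NewSortingColumn()
+//	sc.ColumnIdx = 0      // index of the sorting column within this row group
+//	sc.Descending = false // ascending order
+//	sc.NullsFirst = false // nulls sort after non-null values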
+type SortingColumn struct { + ColumnIdx int32 `thrift:"column_idx,1,required" db:"column_idx" json:"column_idx"` + Descending bool `thrift:"descending,2,required" db:"descending" json:"descending"` + NullsFirst bool `thrift:"nulls_first,3,required" db:"nulls_first" json:"nulls_first"` +} + +func NewSortingColumn() *SortingColumn { + return &SortingColumn{} +} + +func (p *SortingColumn) GetColumnIdx() int32 { + return p.ColumnIdx +} + +func (p *SortingColumn) GetDescending() bool { + return p.Descending +} + +func (p *SortingColumn) GetNullsFirst() bool { + return p.NullsFirst +} +func (p *SortingColumn) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetColumnIdx bool = false + var issetDescending bool = false + var issetNullsFirst bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetColumnIdx = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetDescending = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetNullsFirst = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetColumnIdx { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field ColumnIdx is not set")) + } + if !issetDescending { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Descending is not set")) + } + if !issetNullsFirst { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NullsFirst is not set")) + } + return nil +} + +func (p *SortingColumn) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.ColumnIdx = v + } + return nil +} + +func (p *SortingColumn) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.Descending = v + } + return nil +} + +func (p *SortingColumn) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + p.NullsFirst = v + } + return nil +} + +func (p *SortingColumn) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("SortingColumn"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *SortingColumn) writeField1(oprot 
thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("column_idx", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:column_idx: ", p), err) + } + if err := oprot.WriteI32(int32(p.ColumnIdx)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.column_idx (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:column_idx: ", p), err) + } + return err +} + +func (p *SortingColumn) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("descending", thrift.BOOL, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:descending: ", p), err) + } + if err := oprot.WriteBool(bool(p.Descending)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.descending (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:descending: ", p), err) + } + return err +} + +func (p *SortingColumn) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("nulls_first", thrift.BOOL, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:nulls_first: ", p), err) + } + if err := oprot.WriteBool(bool(p.NullsFirst)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.nulls_first (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:nulls_first: ", p), err) + } + return err +} + +func (p *SortingColumn) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("SortingColumn(%+v)", *p) +} + +// statistics of a given page type and encoding +// +// Attributes: +// - PageType: the page type (data/dic/...) 
* +// - Encoding: encoding of the page * +// - Count: number of pages of this type with this encoding * +type PageEncodingStats struct { + PageType PageType `thrift:"page_type,1,required" db:"page_type" json:"page_type"` + Encoding Encoding `thrift:"encoding,2,required" db:"encoding" json:"encoding"` + Count int32 `thrift:"count,3,required" db:"count" json:"count"` +} + +func NewPageEncodingStats() *PageEncodingStats { + return &PageEncodingStats{} +} + +func (p *PageEncodingStats) GetPageType() PageType { + return p.PageType +} + +func (p *PageEncodingStats) GetEncoding() Encoding { + return p.Encoding +} + +func (p *PageEncodingStats) GetCount() int32 { + return p.Count +} +func (p *PageEncodingStats) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetPageType bool = false + var issetEncoding bool = false + var issetCount bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetPageType = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetEncoding = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetCount = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetPageType { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field PageType is not set")) + } + if !issetEncoding { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Encoding is not set")) + } + if !issetCount { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Count is not set")) + } + return nil +} + +func (p *PageEncodingStats) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + temp := PageType(v) + p.PageType = temp + } + return nil +} + +func (p *PageEncodingStats) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + temp := Encoding(v) + p.Encoding = temp + } + return nil +} + +func (p *PageEncodingStats) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + p.Count = v + } + return nil +} + +func (p *PageEncodingStats) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("PageEncodingStats"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { 
+ return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *PageEncodingStats) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("page_type", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:page_type: ", p), err) + } + if err := oprot.WriteI32(int32(p.PageType)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.page_type (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:page_type: ", p), err) + } + return err +} + +func (p *PageEncodingStats) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("encoding", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:encoding: ", p), err) + } + if err := oprot.WriteI32(int32(p.Encoding)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.encoding (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:encoding: ", p), err) + } + return err +} + +func (p *PageEncodingStats) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("count", thrift.I32, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:count: ", p), err) + } + if err := oprot.WriteI32(int32(p.Count)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.count (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:count: ", p), err) + } + return err +} + +func (p *PageEncodingStats) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("PageEncodingStats(%+v)", *p) +} + +// Description for column metadata +// +// Attributes: +// - Type: Type of this column * +// - Encodings: Set of all encodings used for this column. The purpose is to validate +// whether we can decode those pages. * +// - PathInSchema: Path in schema * +// - Codec: Compression codec * +// - NumValues: Number of values in this column * +// - TotalUncompressedSize: total byte size of all uncompressed pages in this column chunk (including the headers) * +// - TotalCompressedSize: total byte size of all compressed pages in this column chunk (including the headers) * +// - KeyValueMetadata: Optional key/value metadata * +// - DataPageOffset: Byte offset from beginning of file to first data page * +// - IndexPageOffset: Byte offset from beginning of file to root index page * +// - DictionaryPageOffset: Byte offset from the beginning of file to first (only) dictionary page * +// - Statistics: optional statistics for this column chunk +// - EncodingStats: Set of all encodings used for pages in this column chunk. 
+// This information can be used to determine if all data pages are +// dictionary encoded for example * +type ColumnMetaData struct { + Type Type `thrift:"type,1,required" db:"type" json:"type"` + Encodings []Encoding `thrift:"encodings,2,required" db:"encodings" json:"encodings"` + PathInSchema []string `thrift:"path_in_schema,3,required" db:"path_in_schema" json:"path_in_schema"` + Codec CompressionCodec `thrift:"codec,4,required" db:"codec" json:"codec"` + NumValues int64 `thrift:"num_values,5,required" db:"num_values" json:"num_values"` + TotalUncompressedSize int64 `thrift:"total_uncompressed_size,6,required" db:"total_uncompressed_size" json:"total_uncompressed_size"` + TotalCompressedSize int64 `thrift:"total_compressed_size,7,required" db:"total_compressed_size" json:"total_compressed_size"` + KeyValueMetadata []*KeyValue `thrift:"key_value_metadata,8" db:"key_value_metadata" json:"key_value_metadata,omitempty"` + DataPageOffset int64 `thrift:"data_page_offset,9,required" db:"data_page_offset" json:"data_page_offset"` + IndexPageOffset *int64 `thrift:"index_page_offset,10" db:"index_page_offset" json:"index_page_offset,omitempty"` + DictionaryPageOffset *int64 `thrift:"dictionary_page_offset,11" db:"dictionary_page_offset" json:"dictionary_page_offset,omitempty"` + Statistics *Statistics `thrift:"statistics,12" db:"statistics" json:"statistics,omitempty"` + EncodingStats []*PageEncodingStats `thrift:"encoding_stats,13" db:"encoding_stats" json:"encoding_stats,omitempty"` +} + +func NewColumnMetaData() *ColumnMetaData { + return &ColumnMetaData{} +} + +func (p *ColumnMetaData) GetType() Type { + return p.Type +} + +func (p *ColumnMetaData) GetEncodings() []Encoding { + return p.Encodings +} + +func (p *ColumnMetaData) GetPathInSchema() []string { + return p.PathInSchema +} + +func (p *ColumnMetaData) GetCodec() CompressionCodec { + return p.Codec +} + +func (p *ColumnMetaData) GetNumValues() int64 { + return p.NumValues +} + +func (p *ColumnMetaData) GetTotalUncompressedSize() int64 { + return p.TotalUncompressedSize +} + +func (p *ColumnMetaData) GetTotalCompressedSize() int64 { + return p.TotalCompressedSize +} + +var ColumnMetaData_KeyValueMetadata_DEFAULT []*KeyValue + +func (p *ColumnMetaData) GetKeyValueMetadata() []*KeyValue { + return p.KeyValueMetadata +} + +func (p *ColumnMetaData) GetDataPageOffset() int64 { + return p.DataPageOffset +} + +var ColumnMetaData_IndexPageOffset_DEFAULT int64 + +func (p *ColumnMetaData) GetIndexPageOffset() int64 { + if !p.IsSetIndexPageOffset() { + return ColumnMetaData_IndexPageOffset_DEFAULT + } + return *p.IndexPageOffset +} + +var ColumnMetaData_DictionaryPageOffset_DEFAULT int64 + +func (p *ColumnMetaData) GetDictionaryPageOffset() int64 { + if !p.IsSetDictionaryPageOffset() { + return ColumnMetaData_DictionaryPageOffset_DEFAULT + } + return *p.DictionaryPageOffset +} + +var ColumnMetaData_Statistics_DEFAULT *Statistics + +func (p *ColumnMetaData) GetStatistics() *Statistics { + if !p.IsSetStatistics() { + return ColumnMetaData_Statistics_DEFAULT + } + return p.Statistics +} + +var ColumnMetaData_EncodingStats_DEFAULT []*PageEncodingStats + +func (p *ColumnMetaData) GetEncodingStats() []*PageEncodingStats { + return p.EncodingStats +} +func (p *ColumnMetaData) IsSetKeyValueMetadata() bool { + return p.KeyValueMetadata != nil +} + +func (p *ColumnMetaData) IsSetIndexPageOffset() bool { + return p.IndexPageOffset != nil +} + +func (p *ColumnMetaData) IsSetDictionaryPageOffset() bool { + return p.DictionaryPageOffset != nil +} + +func (p 
*ColumnMetaData) IsSetStatistics() bool { + return p.Statistics != nil +} + +func (p *ColumnMetaData) IsSetEncodingStats() bool { + return p.EncodingStats != nil +} + +func (p *ColumnMetaData) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetType bool = false + var issetEncodings bool = false + var issetPathInSchema bool = false + var issetCodec bool = false + var issetNumValues bool = false + var issetTotalUncompressedSize bool = false + var issetTotalCompressedSize bool = false + var issetDataPageOffset bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetType = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetEncodings = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetPathInSchema = true + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + issetCodec = true + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + issetNumValues = true + case 6: + if err := p.ReadField6(iprot); err != nil { + return err + } + issetTotalUncompressedSize = true + case 7: + if err := p.ReadField7(iprot); err != nil { + return err + } + issetTotalCompressedSize = true + case 8: + if err := p.ReadField8(iprot); err != nil { + return err + } + case 9: + if err := p.ReadField9(iprot); err != nil { + return err + } + issetDataPageOffset = true + case 10: + if err := p.ReadField10(iprot); err != nil { + return err + } + case 11: + if err := p.ReadField11(iprot); err != nil { + return err + } + case 12: + if err := p.ReadField12(iprot); err != nil { + return err + } + case 13: + if err := p.ReadField13(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetType { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Type is not set")) + } + if !issetEncodings { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Encodings is not set")) + } + if !issetPathInSchema { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field PathInSchema is not set")) + } + if !issetCodec { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Codec is not set")) + } + if !issetNumValues { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumValues is not set")) + } + if !issetTotalUncompressedSize { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field TotalUncompressedSize is not set")) + } + if !issetTotalCompressedSize { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field TotalCompressedSize is not set")) + } + if !issetDataPageOffset { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field DataPageOffset is not set")) + } + return 
nil +} + +func (p *ColumnMetaData) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + temp := Type(v) + p.Type = temp + } + return nil +} + +func (p *ColumnMetaData) ReadField2(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]Encoding, 0, size) + p.Encodings = tSlice + for i := 0; i < size; i++ { + var _elem0 Encoding + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 0: ", err) + } else { + temp := Encoding(v) + _elem0 = temp + } + p.Encodings = append(p.Encodings, _elem0) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnMetaData) ReadField3(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]string, 0, size) + p.PathInSchema = tSlice + for i := 0; i < size; i++ { + var _elem1 string + if v, err := iprot.ReadString(); err != nil { + return thrift.PrependError("error reading field 0: ", err) + } else { + _elem1 = v + } + p.PathInSchema = append(p.PathInSchema, _elem1) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnMetaData) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + temp := CompressionCodec(v) + p.Codec = temp + } + return nil +} + +func (p *ColumnMetaData) ReadField5(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 5: ", err) + } else { + p.NumValues = v + } + return nil +} + +func (p *ColumnMetaData) ReadField6(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 6: ", err) + } else { + p.TotalUncompressedSize = v + } + return nil +} + +func (p *ColumnMetaData) ReadField7(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 7: ", err) + } else { + p.TotalCompressedSize = v + } + return nil +} + +func (p *ColumnMetaData) ReadField8(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*KeyValue, 0, size) + p.KeyValueMetadata = tSlice + for i := 0; i < size; i++ { + _elem2 := &KeyValue{} + if err := _elem2.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem2), err) + } + p.KeyValueMetadata = append(p.KeyValueMetadata, _elem2) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnMetaData) ReadField9(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 9: ", err) + } else { + p.DataPageOffset = v + } + return nil +} + +func (p *ColumnMetaData) ReadField10(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 10: ", err) + } else { + 
p.IndexPageOffset = &v + } + return nil +} + +func (p *ColumnMetaData) ReadField11(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 11: ", err) + } else { + p.DictionaryPageOffset = &v + } + return nil +} + +func (p *ColumnMetaData) ReadField12(iprot thrift.TProtocol) error { + p.Statistics = &Statistics{} + if err := p.Statistics.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.Statistics), err) + } + return nil +} + +func (p *ColumnMetaData) ReadField13(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*PageEncodingStats, 0, size) + p.EncodingStats = tSlice + for i := 0; i < size; i++ { + _elem3 := &PageEncodingStats{} + if err := _elem3.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem3), err) + } + p.EncodingStats = append(p.EncodingStats, _elem3) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnMetaData) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("ColumnMetaData"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + if err := p.writeField7(oprot); err != nil { + return err + } + if err := p.writeField8(oprot); err != nil { + return err + } + if err := p.writeField9(oprot); err != nil { + return err + } + if err := p.writeField10(oprot); err != nil { + return err + } + if err := p.writeField11(oprot); err != nil { + return err + } + if err := p.writeField12(oprot); err != nil { + return err + } + if err := p.writeField13(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *ColumnMetaData) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("type", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:type: ", p), err) + } + if err := oprot.WriteI32(int32(p.Type)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.type (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:type: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("encodings", thrift.LIST, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:encodings: ", p), err) + } + if err := oprot.WriteListBegin(thrift.I32, len(p.Encodings)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.Encodings { + if err := oprot.WriteI32(int32(v)); err != nil { + 
return thrift.PrependError(fmt.Sprintf("%T. (0) field write error: ", p), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:encodings: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("path_in_schema", thrift.LIST, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:path_in_schema: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRING, len(p.PathInSchema)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.PathInSchema { + if err := oprot.WriteString(string(v)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T. (0) field write error: ", p), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:path_in_schema: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField4(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("codec", thrift.I32, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:codec: ", p), err) + } + if err := oprot.WriteI32(int32(p.Codec)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.codec (4) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:codec: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField5(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_values", thrift.I64, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:num_values: ", p), err) + } + if err := oprot.WriteI64(int64(p.NumValues)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_values (5) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:num_values: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField6(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("total_uncompressed_size", thrift.I64, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:total_uncompressed_size: ", p), err) + } + if err := oprot.WriteI64(int64(p.TotalUncompressedSize)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.total_uncompressed_size (6) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:total_uncompressed_size: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField7(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("total_compressed_size", thrift.I64, 7); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 7:total_compressed_size: ", p), err) + } + if err := oprot.WriteI64(int64(p.TotalCompressedSize)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.total_compressed_size (7) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 
7:total_compressed_size: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField8(oprot thrift.TProtocol) (err error) { + if p.IsSetKeyValueMetadata() { + if err := oprot.WriteFieldBegin("key_value_metadata", thrift.LIST, 8); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 8:key_value_metadata: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.KeyValueMetadata)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.KeyValueMetadata { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 8:key_value_metadata: ", p), err) + } + } + return err +} + +func (p *ColumnMetaData) writeField9(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("data_page_offset", thrift.I64, 9); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 9:data_page_offset: ", p), err) + } + if err := oprot.WriteI64(int64(p.DataPageOffset)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.data_page_offset (9) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 9:data_page_offset: ", p), err) + } + return err +} + +func (p *ColumnMetaData) writeField10(oprot thrift.TProtocol) (err error) { + if p.IsSetIndexPageOffset() { + if err := oprot.WriteFieldBegin("index_page_offset", thrift.I64, 10); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 10:index_page_offset: ", p), err) + } + if err := oprot.WriteI64(int64(*p.IndexPageOffset)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.index_page_offset (10) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 10:index_page_offset: ", p), err) + } + } + return err +} + +func (p *ColumnMetaData) writeField11(oprot thrift.TProtocol) (err error) { + if p.IsSetDictionaryPageOffset() { + if err := oprot.WriteFieldBegin("dictionary_page_offset", thrift.I64, 11); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 11:dictionary_page_offset: ", p), err) + } + if err := oprot.WriteI64(int64(*p.DictionaryPageOffset)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.dictionary_page_offset (11) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 11:dictionary_page_offset: ", p), err) + } + } + return err +} + +func (p *ColumnMetaData) writeField12(oprot thrift.TProtocol) (err error) { + if p.IsSetStatistics() { + if err := oprot.WriteFieldBegin("statistics", thrift.STRUCT, 12); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 12:statistics: ", p), err) + } + if err := p.Statistics.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.Statistics), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 12:statistics: ", p), err) + } + } + return err +} + +func (p *ColumnMetaData) writeField13(oprot 
thrift.TProtocol) (err error) { + if p.IsSetEncodingStats() { + if err := oprot.WriteFieldBegin("encoding_stats", thrift.LIST, 13); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 13:encoding_stats: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.EncodingStats)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.EncodingStats { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 13:encoding_stats: ", p), err) + } + } + return err +} + +func (p *ColumnMetaData) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("ColumnMetaData(%+v)", *p) +} + +// Attributes: +// - FilePath: File where column data is stored. If not set, assumed to be same file as +// metadata. This path is relative to the current file. +// +// - FileOffset: Byte offset in file_path to the ColumnMetaData * +// - MetaData: Column metadata for this chunk. This is the same content as what is at +// file_path/file_offset. Having it here has it replicated in the file +// metadata. +// +// - OffsetIndexOffset: File offset of ColumnChunk's OffsetIndex * +// - OffsetIndexLength: Size of ColumnChunk's OffsetIndex, in bytes * +// - ColumnIndexOffset: File offset of ColumnChunk's ColumnIndex * +// - ColumnIndexLength: Size of ColumnChunk's ColumnIndex, in bytes * +type ColumnChunk struct { + FilePath *string `thrift:"file_path,1" db:"file_path" json:"file_path,omitempty"` + FileOffset int64 `thrift:"file_offset,2,required" db:"file_offset" json:"file_offset"` + MetaData *ColumnMetaData `thrift:"meta_data,3" db:"meta_data" json:"meta_data,omitempty"` + OffsetIndexOffset *int64 `thrift:"offset_index_offset,4" db:"offset_index_offset" json:"offset_index_offset,omitempty"` + OffsetIndexLength *int32 `thrift:"offset_index_length,5" db:"offset_index_length" json:"offset_index_length,omitempty"` + ColumnIndexOffset *int64 `thrift:"column_index_offset,6" db:"column_index_offset" json:"column_index_offset,omitempty"` + ColumnIndexLength *int32 `thrift:"column_index_length,7" db:"column_index_length" json:"column_index_length,omitempty"` +} + +func NewColumnChunk() *ColumnChunk { + return &ColumnChunk{} +} + +var ColumnChunk_FilePath_DEFAULT string + +func (p *ColumnChunk) GetFilePath() string { + if !p.IsSetFilePath() { + return ColumnChunk_FilePath_DEFAULT + } + return *p.FilePath +} + +func (p *ColumnChunk) GetFileOffset() int64 { + return p.FileOffset +} + +var ColumnChunk_MetaData_DEFAULT *ColumnMetaData + +func (p *ColumnChunk) GetMetaData() *ColumnMetaData { + if !p.IsSetMetaData() { + return ColumnChunk_MetaData_DEFAULT + } + return p.MetaData +} + +var ColumnChunk_OffsetIndexOffset_DEFAULT int64 + +func (p *ColumnChunk) GetOffsetIndexOffset() int64 { + if !p.IsSetOffsetIndexOffset() { + return ColumnChunk_OffsetIndexOffset_DEFAULT + } + return *p.OffsetIndexOffset +} + +var ColumnChunk_OffsetIndexLength_DEFAULT int32 + +func (p *ColumnChunk) GetOffsetIndexLength() int32 { + if !p.IsSetOffsetIndexLength() { + return ColumnChunk_OffsetIndexLength_DEFAULT + } + return *p.OffsetIndexLength +} + +var ColumnChunk_ColumnIndexOffset_DEFAULT int64 + +func (p *ColumnChunk) GetColumnIndexOffset() int64 { + if 
!p.IsSetColumnIndexOffset() { + return ColumnChunk_ColumnIndexOffset_DEFAULT + } + return *p.ColumnIndexOffset +} + +var ColumnChunk_ColumnIndexLength_DEFAULT int32 + +func (p *ColumnChunk) GetColumnIndexLength() int32 { + if !p.IsSetColumnIndexLength() { + return ColumnChunk_ColumnIndexLength_DEFAULT + } + return *p.ColumnIndexLength +} +func (p *ColumnChunk) IsSetFilePath() bool { + return p.FilePath != nil +} + +func (p *ColumnChunk) IsSetMetaData() bool { + return p.MetaData != nil +} + +func (p *ColumnChunk) IsSetOffsetIndexOffset() bool { + return p.OffsetIndexOffset != nil +} + +func (p *ColumnChunk) IsSetOffsetIndexLength() bool { + return p.OffsetIndexLength != nil +} + +func (p *ColumnChunk) IsSetColumnIndexOffset() bool { + return p.ColumnIndexOffset != nil +} + +func (p *ColumnChunk) IsSetColumnIndexLength() bool { + return p.ColumnIndexLength != nil +} + +func (p *ColumnChunk) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetFileOffset bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetFileOffset = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + case 6: + if err := p.ReadField6(iprot); err != nil { + return err + } + case 7: + if err := p.ReadField7(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetFileOffset { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field FileOffset is not set")) + } + return nil +} + +func (p *ColumnChunk) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadString(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.FilePath = &v + } + return nil +} + +func (p *ColumnChunk) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.FileOffset = v + } + return nil +} + +func (p *ColumnChunk) ReadField3(iprot thrift.TProtocol) error { + p.MetaData = &ColumnMetaData{} + if err := p.MetaData.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.MetaData), err) + } + return nil +} + +func (p *ColumnChunk) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + p.OffsetIndexOffset = &v + } + return nil +} + +func (p *ColumnChunk) ReadField5(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 5: ", err) + } else { + p.OffsetIndexLength = &v + } + return nil +} + +func (p *ColumnChunk) ReadField6(iprot 
thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 6: ", err) + } else { + p.ColumnIndexOffset = &v + } + return nil +} + +func (p *ColumnChunk) ReadField7(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 7: ", err) + } else { + p.ColumnIndexLength = &v + } + return nil +} + +func (p *ColumnChunk) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("ColumnChunk"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + if err := p.writeField7(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *ColumnChunk) writeField1(oprot thrift.TProtocol) (err error) { + if p.IsSetFilePath() { + if err := oprot.WriteFieldBegin("file_path", thrift.STRING, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:file_path: ", p), err) + } + if err := oprot.WriteString(string(*p.FilePath)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.file_path (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:file_path: ", p), err) + } + } + return err +} + +func (p *ColumnChunk) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("file_offset", thrift.I64, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:file_offset: ", p), err) + } + if err := oprot.WriteI64(int64(p.FileOffset)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.file_offset (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:file_offset: ", p), err) + } + return err +} + +func (p *ColumnChunk) writeField3(oprot thrift.TProtocol) (err error) { + if p.IsSetMetaData() { + if err := oprot.WriteFieldBegin("meta_data", thrift.STRUCT, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:meta_data: ", p), err) + } + if err := p.MetaData.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.MetaData), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:meta_data: ", p), err) + } + } + return err +} + +func (p *ColumnChunk) writeField4(oprot thrift.TProtocol) (err error) { + if p.IsSetOffsetIndexOffset() { + if err := oprot.WriteFieldBegin("offset_index_offset", thrift.I64, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:offset_index_offset: ", p), err) + } + if err := oprot.WriteI64(int64(*p.OffsetIndexOffset)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.offset_index_offset (4) field 
write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:offset_index_offset: ", p), err) + } + } + return err +} + +func (p *ColumnChunk) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetOffsetIndexLength() { + if err := oprot.WriteFieldBegin("offset_index_length", thrift.I32, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:offset_index_length: ", p), err) + } + if err := oprot.WriteI32(int32(*p.OffsetIndexLength)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.offset_index_length (5) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:offset_index_length: ", p), err) + } + } + return err +} + +func (p *ColumnChunk) writeField6(oprot thrift.TProtocol) (err error) { + if p.IsSetColumnIndexOffset() { + if err := oprot.WriteFieldBegin("column_index_offset", thrift.I64, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:column_index_offset: ", p), err) + } + if err := oprot.WriteI64(int64(*p.ColumnIndexOffset)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.column_index_offset (6) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:column_index_offset: ", p), err) + } + } + return err +} + +func (p *ColumnChunk) writeField7(oprot thrift.TProtocol) (err error) { + if p.IsSetColumnIndexLength() { + if err := oprot.WriteFieldBegin("column_index_length", thrift.I32, 7); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 7:column_index_length: ", p), err) + } + if err := oprot.WriteI32(int32(*p.ColumnIndexLength)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.column_index_length (7) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 7:column_index_length: ", p), err) + } + } + return err +} + +func (p *ColumnChunk) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("ColumnChunk(%+v)", *p) +} + +// Attributes: +// - Columns: Metadata for each column chunk in this row group. +// This list must have the same order as the SchemaElement list in FileMetaData. +// +// - TotalByteSize: Total byte size of all the uncompressed column data in this row group * +// - NumRows: Number of rows in this row group * +// - SortingColumns: If set, specifies a sort ordering of the rows in this RowGroup. +// The sorting columns can be a subset of all the columns. 
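+//
+// Illustrative sketch only (not part of the generated code): given a RowGroup
+// already populated via Read, its declared sort order could be summarized as
+// below. The helper name sortOrderOf is hypothetical; a strings import and a
+// non-nil MetaData on each ColumnChunk (MetaData is optional in the schema)
+// are assumed:
+//
+//	func sortOrderOf(rg *RowGroup) string {
+//		if !rg.IsSetSortingColumns() {
+//			return "unsorted" // sorting_columns is optional field 4
+//		}
+//		parts := make([]string, 0, len(rg.GetSortingColumns()))
+//		for _, sc := range rg.GetSortingColumns() {
+//			// ColumnIdx indexes into this RowGroup's Columns list.
+//			col := rg.GetColumns()[int(sc.GetColumnIdx())]
+//			dir := "asc"
+//			if sc.GetDescending() {
+//				dir = "desc"
+//			}
+//			name := strings.Join(col.GetMetaData().GetPathInSchema(), ".")
+//			parts = append(parts, name+" "+dir)
+//		}
+//		return strings.Join(parts, ", ")
+//	}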
+type RowGroup struct { + Columns []*ColumnChunk `thrift:"columns,1,required" db:"columns" json:"columns"` + TotalByteSize int64 `thrift:"total_byte_size,2,required" db:"total_byte_size" json:"total_byte_size"` + NumRows int64 `thrift:"num_rows,3,required" db:"num_rows" json:"num_rows"` + SortingColumns []*SortingColumn `thrift:"sorting_columns,4" db:"sorting_columns" json:"sorting_columns,omitempty"` +} + +func NewRowGroup() *RowGroup { + return &RowGroup{} +} + +func (p *RowGroup) GetColumns() []*ColumnChunk { + return p.Columns +} + +func (p *RowGroup) GetTotalByteSize() int64 { + return p.TotalByteSize +} + +func (p *RowGroup) GetNumRows() int64 { + return p.NumRows +} + +var RowGroup_SortingColumns_DEFAULT []*SortingColumn + +func (p *RowGroup) GetSortingColumns() []*SortingColumn { + return p.SortingColumns +} +func (p *RowGroup) IsSetSortingColumns() bool { + return p.SortingColumns != nil +} + +func (p *RowGroup) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetColumns bool = false + var issetTotalByteSize bool = false + var issetNumRows bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetColumns = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetTotalByteSize = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetNumRows = true + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetColumns { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Columns is not set")) + } + if !issetTotalByteSize { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field TotalByteSize is not set")) + } + if !issetNumRows { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumRows is not set")) + } + return nil +} + +func (p *RowGroup) ReadField1(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*ColumnChunk, 0, size) + p.Columns = tSlice + for i := 0; i < size; i++ { + _elem4 := &ColumnChunk{} + if err := _elem4.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem4), err) + } + p.Columns = append(p.Columns, _elem4) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *RowGroup) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.TotalByteSize = v + } + return nil +} + +func (p *RowGroup) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } 
else { + p.NumRows = v + } + return nil +} + +func (p *RowGroup) ReadField4(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*SortingColumn, 0, size) + p.SortingColumns = tSlice + for i := 0; i < size; i++ { + _elem5 := &SortingColumn{} + if err := _elem5.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem5), err) + } + p.SortingColumns = append(p.SortingColumns, _elem5) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *RowGroup) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("RowGroup"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *RowGroup) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("columns", thrift.LIST, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:columns: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.Columns)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.Columns { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:columns: ", p), err) + } + return err +} + +func (p *RowGroup) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("total_byte_size", thrift.I64, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:total_byte_size: ", p), err) + } + if err := oprot.WriteI64(int64(p.TotalByteSize)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.total_byte_size (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:total_byte_size: ", p), err) + } + return err +} + +func (p *RowGroup) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_rows", thrift.I64, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:num_rows: ", p), err) + } + if err := oprot.WriteI64(int64(p.NumRows)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_rows (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:num_rows: ", p), err) + } + return err +} + +func (p *RowGroup) writeField4(oprot thrift.TProtocol) (err error) { + if p.IsSetSortingColumns() { + if err := oprot.WriteFieldBegin("sorting_columns", thrift.LIST, 4); 
err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:sorting_columns: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.SortingColumns)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.SortingColumns { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:sorting_columns: ", p), err) + } + } + return err +} + +func (p *RowGroup) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("RowGroup(%+v)", *p) +} + +// Empty struct to signal the order defined by the physical or logical type +type TypeDefinedOrder struct { +} + +func NewTypeDefinedOrder() *TypeDefinedOrder { + return &TypeDefinedOrder{} +} + +func (p *TypeDefinedOrder) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *TypeDefinedOrder) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("TypeDefinedOrder"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *TypeDefinedOrder) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("TypeDefinedOrder(%+v)", *p) +} + +// Union to specify the order used for the min_value and max_value fields for a +// column. This union takes the role of an enhanced enum that allows rich +// elements (which will be needed for a collation-based ordering in the future). +// +// Possible values are: +// * TypeDefinedOrder - the column uses the order defined by its logical or +// physical type (if there is no logical type). +// +// If the reader does not support the value of this union, min and max stats +// for this column should be ignored. 
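+//
+// Because ColumnOrder is generated as a thrift union, Write refuses to
+// serialize it unless exactly one field is set; CountSetFieldsColumnOrder
+// below performs that check. A minimal usage sketch, assuming an oprot of
+// type thrift.TProtocol is in scope:
+//
+//	order := NewColumnOrder()
+//	order.TYPE_ORDER = NewTypeDefinedOrder()
+//	// Exactly one field is now set, so this serializes successfully;
+//	// with no field set, Write returns an error instead.
+//	err := order.Write(oprot)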
+// +// Attributes: +// - TYPE_ORDER: The sort orders for logical types are: +// UTF8 - unsigned byte-wise comparison +// INT8 - signed comparison +// INT16 - signed comparison +// INT32 - signed comparison +// INT64 - signed comparison +// UINT8 - unsigned comparison +// UINT16 - unsigned comparison +// UINT32 - unsigned comparison +// UINT64 - unsigned comparison +// DECIMAL - signed comparison of the represented value +// DATE - signed comparison +// TIME_MILLIS - signed comparison +// TIME_MICROS - signed comparison +// TIMESTAMP_MILLIS - signed comparison +// TIMESTAMP_MICROS - signed comparison +// INTERVAL - unsigned comparison +// JSON - unsigned byte-wise comparison +// BSON - unsigned byte-wise comparison +// ENUM - unsigned byte-wise comparison +// LIST - undefined +// MAP - undefined +// +// In the absence of logical types, the sort order is determined by the physical type: +// BOOLEAN - false, true +// INT32 - signed comparison +// INT64 - signed comparison +// INT96 (only used for legacy timestamps) - undefined +// FLOAT - signed comparison of the represented value (*) +// DOUBLE - signed comparison of the represented value (*) +// BYTE_ARRAY - unsigned byte-wise comparison +// FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison +// +// (*) Because the sorting order is not specified properly for floating +// point values (relations vs. total ordering) the following +// compatibility rules should be applied when reading statistics: +// - If the min is a NaN, it should be ignored. +// - If the max is a NaN, it should be ignored. +// - If the min is +0, the row group may contain -0 values as well. +// - If the max is -0, the row group may contain +0 values as well. +// - When looking for NaN values, min and max should be ignored. +type ColumnOrder struct { + TYPE_ORDER *TypeDefinedOrder `thrift:"TYPE_ORDER,1" db:"TYPE_ORDER" json:"TYPE_ORDER,omitempty"` +} + +func NewColumnOrder() *ColumnOrder { + return &ColumnOrder{} +} + +var ColumnOrder_TYPE_ORDER_DEFAULT *TypeDefinedOrder + +func (p *ColumnOrder) GetTYPE_ORDER() *TypeDefinedOrder { + if !p.IsSetTYPE_ORDER() { + return ColumnOrder_TYPE_ORDER_DEFAULT + } + return p.TYPE_ORDER +} +func (p *ColumnOrder) CountSetFieldsColumnOrder() int { + count := 0 + if p.IsSetTYPE_ORDER() { + count++ + } + return count + +} + +func (p *ColumnOrder) IsSetTYPE_ORDER() bool { + return p.TYPE_ORDER != nil +} + +func (p *ColumnOrder) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + return nil +} + +func (p *ColumnOrder) ReadField1(iprot thrift.TProtocol) error { + p.TYPE_ORDER = &TypeDefinedOrder{} + if err := p.TYPE_ORDER.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", p.TYPE_ORDER), err) + } + return nil +} + +func (p *ColumnOrder) Write(oprot thrift.TProtocol) error { + if c := 
p.CountSetFieldsColumnOrder(); c != 1 { + return fmt.Errorf("%T write union: exactly one field must be set (%d set).", p, c) + } + if err := oprot.WriteStructBegin("ColumnOrder"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *ColumnOrder) writeField1(oprot thrift.TProtocol) (err error) { + if p.IsSetTYPE_ORDER() { + if err := oprot.WriteFieldBegin("TYPE_ORDER", thrift.STRUCT, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:TYPE_ORDER: ", p), err) + } + if err := p.TYPE_ORDER.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", p.TYPE_ORDER), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:TYPE_ORDER: ", p), err) + } + } + return err +} + +func (p *ColumnOrder) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("ColumnOrder(%+v)", *p) +} + +// Attributes: +// - Offset: Offset of the page in the file * +// - CompressedPageSize: Size of the page, including header. Sum of compressed_page_size and header +// length +// - FirstRowIndex: Index within the RowGroup of the first row of the page; this means pages +// change on record boundaries (r = 0). +type PageLocation struct { + Offset int64 `thrift:"offset,1,required" db:"offset" json:"offset"` + CompressedPageSize int32 `thrift:"compressed_page_size,2,required" db:"compressed_page_size" json:"compressed_page_size"` + FirstRowIndex int64 `thrift:"first_row_index,3,required" db:"first_row_index" json:"first_row_index"` +} + +func NewPageLocation() *PageLocation { + return &PageLocation{} +} + +func (p *PageLocation) GetOffset() int64 { + return p.Offset +} + +func (p *PageLocation) GetCompressedPageSize() int32 { + return p.CompressedPageSize +} + +func (p *PageLocation) GetFirstRowIndex() int64 { + return p.FirstRowIndex +} +func (p *PageLocation) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetOffset bool = false + var issetCompressedPageSize bool = false + var issetFirstRowIndex bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetOffset = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetCompressedPageSize = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetFirstRowIndex = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetOffset { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Offset is not set")) + } + if 
!issetCompressedPageSize { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field CompressedPageSize is not set")) + } + if !issetFirstRowIndex { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field FirstRowIndex is not set")) + } + return nil +} + +func (p *PageLocation) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.Offset = v + } + return nil +} + +func (p *PageLocation) ReadField2(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 2: ", err) + } else { + p.CompressedPageSize = v + } + return nil +} + +func (p *PageLocation) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + p.FirstRowIndex = v + } + return nil +} + +func (p *PageLocation) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("PageLocation"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *PageLocation) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("offset", thrift.I64, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:offset: ", p), err) + } + if err := oprot.WriteI64(int64(p.Offset)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.offset (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:offset: ", p), err) + } + return err +} + +func (p *PageLocation) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("compressed_page_size", thrift.I32, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:compressed_page_size: ", p), err) + } + if err := oprot.WriteI32(int32(p.CompressedPageSize)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.compressed_page_size (2) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:compressed_page_size: ", p), err) + } + return err +} + +func (p *PageLocation) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("first_row_index", thrift.I64, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:first_row_index: ", p), err) + } + if err := oprot.WriteI64(int64(p.FirstRowIndex)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.first_row_index (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:first_row_index: ", p), err) + } + return err +} + +func (p *PageLocation) String() string { + if p == nil { + return "" + } + return 
fmt.Sprintf("PageLocation(%+v)", *p) +} + +// Attributes: +// - PageLocations: PageLocations, ordered by increasing PageLocation.offset. It is required +// that page_locations[i].first_row_index < page_locations[i+1].first_row_index. +type OffsetIndex struct { + PageLocations []*PageLocation `thrift:"page_locations,1,required" db:"page_locations" json:"page_locations"` +} + +func NewOffsetIndex() *OffsetIndex { + return &OffsetIndex{} +} + +func (p *OffsetIndex) GetPageLocations() []*PageLocation { + return p.PageLocations +} +func (p *OffsetIndex) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetPageLocations bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetPageLocations = true + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetPageLocations { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field PageLocations is not set")) + } + return nil +} + +func (p *OffsetIndex) ReadField1(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*PageLocation, 0, size) + p.PageLocations = tSlice + for i := 0; i < size; i++ { + _elem6 := &PageLocation{} + if err := _elem6.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem6), err) + } + p.PageLocations = append(p.PageLocations, _elem6) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *OffsetIndex) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("OffsetIndex"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *OffsetIndex) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("page_locations", thrift.LIST, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:page_locations: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.PageLocations)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.PageLocations { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 
1:page_locations: ", p), err) + } + return err +} + +func (p *OffsetIndex) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("OffsetIndex(%+v)", *p) +} + +// Description for ColumnIndex. +// Each [i] refers to the page at OffsetIndex.page_locations[i] +// +// Attributes: +// - NullPages: A list of Boolean values to determine the validity of the corresponding +// min and max values. If true, a page contains only null values, and writers +// have to set the corresponding entries in min_values and max_values to +// byte[0], so that all lists have the same length. If false, the +// corresponding entries in min_values and max_values must be valid. +// - MinValues: Two lists containing lower and upper bounds for the values of each page. +// These may be the actual minimum and maximum values found on a page, but +// can also be (more compact) values that do not exist on a page. For +// example, instead of storing "Blart Versenwald III", a writer may set +// min_values[i]="B", max_values[i]="C". Such more compact values must still +// be valid values within the column's logical type. Readers must make sure +// that list entries are populated before using them by inspecting null_pages. +// - MaxValues +// - BoundaryOrder: Stores whether both min_values and max_values are ordered and if so, in +// which direction. This allows readers to perform binary searches in both +// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even +// if the lists are ordered. +// - NullCounts: A list containing the number of null values for each page * +type ColumnIndex struct { + NullPages []bool `thrift:"null_pages,1,required" db:"null_pages" json:"null_pages"` + MinValues [][]byte `thrift:"min_values,2,required" db:"min_values" json:"min_values"` + MaxValues [][]byte `thrift:"max_values,3,required" db:"max_values" json:"max_values"` + BoundaryOrder BoundaryOrder `thrift:"boundary_order,4,required" db:"boundary_order" json:"boundary_order"` + NullCounts []int64 `thrift:"null_counts,5" db:"null_counts" json:"null_counts,omitempty"` +} + +func NewColumnIndex() *ColumnIndex { + return &ColumnIndex{} +} + +func (p *ColumnIndex) GetNullPages() []bool { + return p.NullPages +} + +func (p *ColumnIndex) GetMinValues() [][]byte { + return p.MinValues +} + +func (p *ColumnIndex) GetMaxValues() [][]byte { + return p.MaxValues +} + +func (p *ColumnIndex) GetBoundaryOrder() BoundaryOrder { + return p.BoundaryOrder +} + +var ColumnIndex_NullCounts_DEFAULT []int64 + +func (p *ColumnIndex) GetNullCounts() []int64 { + return p.NullCounts +} +func (p *ColumnIndex) IsSetNullCounts() bool { + return p.NullCounts != nil +} + +func (p *ColumnIndex) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetNullPages bool = false + var issetMinValues bool = false + var issetMaxValues bool = false + var issetBoundaryOrder bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetNullPages = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetMinValues = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetMaxValues = true + case 4: + if err :=
p.ReadField4(iprot); err != nil { + return err + } + issetBoundaryOrder = true + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetNullPages { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NullPages is not set")) + } + if !issetMinValues { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field MinValues is not set")) + } + if !issetMaxValues { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field MaxValues is not set")) + } + if !issetBoundaryOrder { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field BoundaryOrder is not set")) + } + return nil +} + +func (p *ColumnIndex) ReadField1(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]bool, 0, size) + p.NullPages = tSlice + for i := 0; i < size; i++ { + var _elem7 bool + if v, err := iprot.ReadBool(); err != nil { + return thrift.PrependError("error reading field 0: ", err) + } else { + _elem7 = v + } + p.NullPages = append(p.NullPages, _elem7) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnIndex) ReadField2(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([][]byte, 0, size) + p.MinValues = tSlice + for i := 0; i < size; i++ { + var _elem8 []byte + if v, err := iprot.ReadBinary(); err != nil { + return thrift.PrependError("error reading field 0: ", err) + } else { + _elem8 = v + } + p.MinValues = append(p.MinValues, _elem8) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnIndex) ReadField3(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([][]byte, 0, size) + p.MaxValues = tSlice + for i := 0; i < size; i++ { + var _elem9 []byte + if v, err := iprot.ReadBinary(); err != nil { + return thrift.PrependError("error reading field 0: ", err) + } else { + _elem9 = v + } + p.MaxValues = append(p.MaxValues, _elem9) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnIndex) ReadField4(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 4: ", err) + } else { + temp := BoundaryOrder(v) + p.BoundaryOrder = temp + } + return nil +} + +func (p *ColumnIndex) ReadField5(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]int64, 0, size) + p.NullCounts = tSlice + for i := 0; i < size; i++ { + var _elem10 int64 + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 0: ", err) + } else 
{ + _elem10 = v + } + p.NullCounts = append(p.NullCounts, _elem10) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *ColumnIndex) Write(oprot thrift.TProtocol) error { + if err := oprot.WriteStructBegin("ColumnIndex"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *ColumnIndex) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("null_pages", thrift.LIST, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:null_pages: ", p), err) + } + if err := oprot.WriteListBegin(thrift.BOOL, len(p.NullPages)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.NullPages { + if err := oprot.WriteBool(bool(v)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T. (0) field write error: ", p), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:null_pages: ", p), err) + } + return err +} + +func (p *ColumnIndex) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("min_values", thrift.LIST, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:min_values: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRING, len(p.MinValues)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.MinValues { + if err := oprot.WriteBinary(v); err != nil { + return thrift.PrependError(fmt.Sprintf("%T. (0) field write error: ", p), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:min_values: ", p), err) + } + return err +} + +func (p *ColumnIndex) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("max_values", thrift.LIST, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:max_values: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRING, len(p.MaxValues)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.MaxValues { + if err := oprot.WriteBinary(v); err != nil { + return thrift.PrependError(fmt.Sprintf("%T. 
(0) field write error: ", p), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:max_values: ", p), err) + } + return err +} + +func (p *ColumnIndex) writeField4(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("boundary_order", thrift.I32, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:boundary_order: ", p), err) + } + if err := oprot.WriteI32(int32(p.BoundaryOrder)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.boundary_order (4) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 4:boundary_order: ", p), err) + } + return err +} + +func (p *ColumnIndex) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetNullCounts() { + if err := oprot.WriteFieldBegin("null_counts", thrift.LIST, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:null_counts: ", p), err) + } + if err := oprot.WriteListBegin(thrift.I64, len(p.NullCounts)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.NullCounts { + if err := oprot.WriteI64(int64(v)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T. (0) field write error: ", p), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:null_counts: ", p), err) + } + } + return err +} + +func (p *ColumnIndex) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("ColumnIndex(%+v)", *p) +} + +// Description for file metadata +// +// Attributes: +// - Version: Version of this file * +// - Schema: Parquet schema for this file. This schema contains metadata for all the columns. +// The schema is represented as a tree with a single root. The nodes of the tree +// are flattened to a list by doing a depth-first traversal. +// The column metadata contains the path in the schema for that column which can be +// used to map columns to nodes in the schema. +// The first element is the root * +// - NumRows: Number of rows in this file * +// - RowGroups: Row groups in this file * +// - KeyValueMetadata: Optional key/value metadata * +// - CreatedBy: String for application that wrote this file. This should be in the format +// <Application> version <App Version> (build <App Build Hash>). +// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) +// +// - ColumnOrders: Sort order used for the min_value and max_value fields of each column in +// this file. Each sort order corresponds to one column, determined by its +// position in the list, matching the position of the column in the schema. +// +// Without column_orders, the meaning of the min_value and max_value fields is +// undefined. To ensure well-defined behaviour, if min_value and max_value are +// written to a Parquet file, column_orders must be written as well. +// +// The obsolete min and max fields are always sorted by signed comparison +// regardless of column_orders.
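+// +// In a Parquet file the serialized FileMetaData is stored in the footer, +// immediately followed by its 4-byte little-endian length and the 4-byte +// magic "PAR1"; readers seek to the end of the file to locate and decode +// it.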
+type FileMetaData struct { + Version int32 `thrift:"version,1,required" db:"version" json:"version"` + Schema []*SchemaElement `thrift:"schema,2,required" db:"schema" json:"schema"` + NumRows int64 `thrift:"num_rows,3,required" db:"num_rows" json:"num_rows"` + RowGroups []*RowGroup `thrift:"row_groups,4,required" db:"row_groups" json:"row_groups"` + KeyValueMetadata []*KeyValue `thrift:"key_value_metadata,5" db:"key_value_metadata" json:"key_value_metadata,omitempty"` + CreatedBy *string `thrift:"created_by,6" db:"created_by" json:"created_by,omitempty"` + ColumnOrders []*ColumnOrder `thrift:"column_orders,7" db:"column_orders" json:"column_orders,omitempty"` +} + +func NewFileMetaData() *FileMetaData { + return &FileMetaData{} +} + +func (p *FileMetaData) GetVersion() int32 { + return p.Version +} + +func (p *FileMetaData) GetSchema() []*SchemaElement { + return p.Schema +} + +func (p *FileMetaData) GetNumRows() int64 { + return p.NumRows +} + +func (p *FileMetaData) GetRowGroups() []*RowGroup { + return p.RowGroups +} + +var FileMetaData_KeyValueMetadata_DEFAULT []*KeyValue + +func (p *FileMetaData) GetKeyValueMetadata() []*KeyValue { + return p.KeyValueMetadata +} + +var FileMetaData_CreatedBy_DEFAULT string + +func (p *FileMetaData) GetCreatedBy() string { + if !p.IsSetCreatedBy() { + return FileMetaData_CreatedBy_DEFAULT + } + return *p.CreatedBy +} + +var FileMetaData_ColumnOrders_DEFAULT []*ColumnOrder + +func (p *FileMetaData) GetColumnOrders() []*ColumnOrder { + return p.ColumnOrders +} +func (p *FileMetaData) IsSetKeyValueMetadata() bool { + return p.KeyValueMetadata != nil +} + +func (p *FileMetaData) IsSetCreatedBy() bool { + return p.CreatedBy != nil +} + +func (p *FileMetaData) IsSetColumnOrders() bool { + return p.ColumnOrders != nil +} + +func (p *FileMetaData) Read(iprot thrift.TProtocol) error { + if _, err := iprot.ReadStructBegin(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read error: ", p), err) + } + + var issetVersion bool = false + var issetSchema bool = false + var issetNumRows bool = false + var issetRowGroups bool = false + + for { + _, fieldTypeId, fieldId, err := iprot.ReadFieldBegin() + if err != nil { + return thrift.PrependError(fmt.Sprintf("%T field %d read error: ", p, fieldId), err) + } + if fieldTypeId == thrift.STOP { + break + } + switch fieldId { + case 1: + if err := p.ReadField1(iprot); err != nil { + return err + } + issetVersion = true + case 2: + if err := p.ReadField2(iprot); err != nil { + return err + } + issetSchema = true + case 3: + if err := p.ReadField3(iprot); err != nil { + return err + } + issetNumRows = true + case 4: + if err := p.ReadField4(iprot); err != nil { + return err + } + issetRowGroups = true + case 5: + if err := p.ReadField5(iprot); err != nil { + return err + } + case 6: + if err := p.ReadField6(iprot); err != nil { + return err + } + case 7: + if err := p.ReadField7(iprot); err != nil { + return err + } + default: + if err := iprot.Skip(fieldTypeId); err != nil { + return err + } + } + if err := iprot.ReadFieldEnd(); err != nil { + return err + } + } + if err := iprot.ReadStructEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T read struct end error: ", p), err) + } + if !issetVersion { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Version is not set")) + } + if !issetSchema { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field Schema is not set")) + } + if !issetNumRows { + return 
thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field NumRows is not set")) + } + if !issetRowGroups { + return thrift.NewTProtocolExceptionWithType(thrift.INVALID_DATA, fmt.Errorf("Required field RowGroups is not set")) + } + return nil +} + +func (p *FileMetaData) ReadField1(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI32(); err != nil { + return thrift.PrependError("error reading field 1: ", err) + } else { + p.Version = v + } + return nil +} + +func (p *FileMetaData) ReadField2(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*SchemaElement, 0, size) + p.Schema = tSlice + for i := 0; i < size; i++ { + _elem11 := &SchemaElement{} + if err := _elem11.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem11), err) + } + p.Schema = append(p.Schema, _elem11) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *FileMetaData) ReadField3(iprot thrift.TProtocol) error { + if v, err := iprot.ReadI64(); err != nil { + return thrift.PrependError("error reading field 3: ", err) + } else { + p.NumRows = v + } + return nil +} + +func (p *FileMetaData) ReadField4(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*RowGroup, 0, size) + p.RowGroups = tSlice + for i := 0; i < size; i++ { + _elem12 := &RowGroup{} + if err := _elem12.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem12), err) + } + p.RowGroups = append(p.RowGroups, _elem12) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *FileMetaData) ReadField5(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*KeyValue, 0, size) + p.KeyValueMetadata = tSlice + for i := 0; i < size; i++ { + _elem13 := &KeyValue{} + if err := _elem13.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem13), err) + } + p.KeyValueMetadata = append(p.KeyValueMetadata, _elem13) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *FileMetaData) ReadField6(iprot thrift.TProtocol) error { + if v, err := iprot.ReadString(); err != nil { + return thrift.PrependError("error reading field 6: ", err) + } else { + p.CreatedBy = &v + } + return nil +} + +func (p *FileMetaData) ReadField7(iprot thrift.TProtocol) error { + _, size, err := iprot.ReadListBegin() + if err != nil { + return thrift.PrependError("error reading list begin: ", err) + } + tSlice := make([]*ColumnOrder, 0, size) + p.ColumnOrders = tSlice + for i := 0; i < size; i++ { + _elem14 := &ColumnOrder{} + if err := _elem14.Read(iprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error reading struct: ", _elem14), err) + } + p.ColumnOrders = append(p.ColumnOrders, _elem14) + } + if err := iprot.ReadListEnd(); err != nil { + return thrift.PrependError("error reading list end: ", err) + } + return nil +} + +func (p *FileMetaData) Write(oprot thrift.TProtocol) error { + if err := 
oprot.WriteStructBegin("FileMetaData"); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write struct begin error: ", p), err) + } + if p != nil { + if err := p.writeField1(oprot); err != nil { + return err + } + if err := p.writeField2(oprot); err != nil { + return err + } + if err := p.writeField3(oprot); err != nil { + return err + } + if err := p.writeField4(oprot); err != nil { + return err + } + if err := p.writeField5(oprot); err != nil { + return err + } + if err := p.writeField6(oprot); err != nil { + return err + } + if err := p.writeField7(oprot); err != nil { + return err + } + } + if err := oprot.WriteFieldStop(); err != nil { + return thrift.PrependError("write field stop error: ", err) + } + if err := oprot.WriteStructEnd(); err != nil { + return thrift.PrependError("write struct stop error: ", err) + } + return nil +} + +func (p *FileMetaData) writeField1(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("version", thrift.I32, 1); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 1:version: ", p), err) + } + if err := oprot.WriteI32(int32(p.Version)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.version (1) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 1:version: ", p), err) + } + return err +} + +func (p *FileMetaData) writeField2(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("schema", thrift.LIST, 2); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 2:schema: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.Schema)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.Schema { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 2:schema: ", p), err) + } + return err +} + +func (p *FileMetaData) writeField3(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("num_rows", thrift.I64, 3); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 3:num_rows: ", p), err) + } + if err := oprot.WriteI64(int64(p.NumRows)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.num_rows (3) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 3:num_rows: ", p), err) + } + return err +} + +func (p *FileMetaData) writeField4(oprot thrift.TProtocol) (err error) { + if err := oprot.WriteFieldBegin("row_groups", thrift.LIST, 4); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 4:row_groups: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.RowGroups)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.RowGroups { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return 
thrift.PrependError(fmt.Sprintf("%T write field end error 4:row_groups: ", p), err) + } + return err +} + +func (p *FileMetaData) writeField5(oprot thrift.TProtocol) (err error) { + if p.IsSetKeyValueMetadata() { + if err := oprot.WriteFieldBegin("key_value_metadata", thrift.LIST, 5); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 5:key_value_metadata: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.KeyValueMetadata)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.KeyValueMetadata { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 5:key_value_metadata: ", p), err) + } + } + return err +} + +func (p *FileMetaData) writeField6(oprot thrift.TProtocol) (err error) { + if p.IsSetCreatedBy() { + if err := oprot.WriteFieldBegin("created_by", thrift.STRING, 6); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 6:created_by: ", p), err) + } + if err := oprot.WriteString(string(*p.CreatedBy)); err != nil { + return thrift.PrependError(fmt.Sprintf("%T.created_by (6) field write error: ", p), err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 6:created_by: ", p), err) + } + } + return err +} + +func (p *FileMetaData) writeField7(oprot thrift.TProtocol) (err error) { + if p.IsSetColumnOrders() { + if err := oprot.WriteFieldBegin("column_orders", thrift.LIST, 7); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field begin error 7:column_orders: ", p), err) + } + if err := oprot.WriteListBegin(thrift.STRUCT, len(p.ColumnOrders)); err != nil { + return thrift.PrependError("error writing list begin: ", err) + } + for _, v := range p.ColumnOrders { + if err := v.Write(oprot); err != nil { + return thrift.PrependError(fmt.Sprintf("%T error writing struct: ", v), err) + } + } + if err := oprot.WriteListEnd(); err != nil { + return thrift.PrependError("error writing list end: ", err) + } + if err := oprot.WriteFieldEnd(); err != nil { + return thrift.PrependError(fmt.Sprintf("%T write field end error 7:column_orders: ", p), err) + } + } + return err +} + +func (p *FileMetaData) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("FileMetaData(%+v)", *p) +} diff --git a/pkg/s3select/internal/parquet-go/gen-parquet-format-pkg.sh b/pkg/s3select/internal/parquet-go/gen-parquet-format-pkg.sh new file mode 100644 index 000000000..f8147c684 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/gen-parquet-format-pkg.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# +# Minio Cloud Storage, (C) 2018 Minio, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
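+# +# Regenerates the gen-go/parquet bindings from the pinned upstream +# parquet-format thrift definition fetched below; assumes wget and the +# Apache thrift compiler are available on PATH.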
+# + +set -e + +rm -f parquet.thrift +wget -q https://github.com/apache/parquet-format/raw/df6132b94f273521a418a74442085fdd5a0aa009/src/main/thrift/parquet.thrift +thrift --gen go parquet.thrift +gofmt -w -s gen-go/parquet diff --git a/pkg/s3select/internal/parquet-go/page.go b/pkg/s3select/internal/parquet-go/page.go new file mode 100644 index 000000000..e44899bee --- /dev/null +++ b/pkg/s3select/internal/parquet-go/page.go @@ -0,0 +1,765 @@ +/* + * Minio Cloud Storage, (C) 2018 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package parquet + +import ( + "bytes" + "context" + "fmt" + "strings" + + "git.apache.org/thrift.git/lib/go/thrift" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +// getBitWidth - returns bits required to place num e.g. +// +// num | width +// -----|------- +// 0 | 0 +// 1 | 1 +// 2 | 2 +// 3 | 2 +// 4 | 3 +// 5 | 3 +// ... | ... +// ... | ... +// +func getBitWidth(num uint64) (width uint64) { + for ; num != 0; num >>= 1 { + width++ + } + + return width +} + +// getMaxDefLevel - get maximum definition level. +func getMaxDefLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) { + for i := 1; i <= len(path); i++ { + name := strings.Join(path[:i], ".") + if index, ok := nameIndexMap[name]; ok { + if schemaElements[index].GetRepetitionType() != parquet.FieldRepetitionType_REQUIRED { + v++ + } + } + } + + return v +} + +// getMaxRepLevel - get maximum repetition level. 
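+// The result counts the REPEATED fields on the column's path from the +// schema root (the Dremel model): a column nested in exactly one +// repeated group has maximum repetition level 1, and a repetition +// level of 0 in page data marks the start of a new record.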
+func getMaxRepLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) { + for i := 1; i <= len(path); i++ { + name := strings.Join(path[:i], ".") + if index, ok := nameIndexMap[name]; ok { + if schemaElements[index].GetRepetitionType() == parquet.FieldRepetitionType_REPEATED { + v++ + } + } + } + + return v +} + +func readPageHeader(reader *thrift.TBufferedTransport) (*parquet.PageHeader, error) { + pageHeader := parquet.NewPageHeader() + if err := pageHeader.Read(thrift.NewTCompactProtocol(reader)); err != nil { + return nil, err + } + + return pageHeader, nil +} + +func readPage( + thriftReader *thrift.TBufferedTransport, + metadata *parquet.ColumnMetaData, + columnNameIndexMap map[string]int, + schemaElements []*parquet.SchemaElement, +) (page *page, definitionLevels, numRows int64, err error) { + + pageHeader, err := readPageHeader(thriftReader) + if err != nil { + return nil, 0, 0, err + } + + read := func() (data []byte, err error) { + var repLevelsLen, defLevelsLen int32 + var repLevelsBuf, defLevelsBuf []byte + + if pageHeader.GetType() == parquet.PageType_DATA_PAGE_V2 { + repLevelsLen = pageHeader.DataPageHeaderV2.GetRepetitionLevelsByteLength() + repLevelsBuf = make([]byte, repLevelsLen) + if _, err = thriftReader.Read(repLevelsBuf); err != nil { + return nil, err + } + + defLevelsLen = pageHeader.DataPageHeaderV2.GetDefinitionLevelsByteLength() + defLevelsBuf = make([]byte, defLevelsLen) + if _, err = thriftReader.Read(defLevelsBuf); err != nil { + return nil, err + } + } + + dataBuf := make([]byte, pageHeader.GetCompressedPageSize()-repLevelsLen-defLevelsLen) + if _, err = thriftReader.Read(dataBuf); err != nil { + return nil, err + } + + if dataBuf, err = compressionCodec(metadata.GetCodec()).uncompress(dataBuf); err != nil { + return nil, err + } + + if repLevelsLen == 0 && defLevelsLen == 0 { + return dataBuf, nil + } + + if repLevelsLen > 0 { + data = append(data, uint32ToBytes(uint32(repLevelsLen))...) + data = append(data, repLevelsBuf...) + } + + if defLevelsLen > 0 { + data = append(data, uint32ToBytes(uint32(defLevelsLen))...) + data = append(data, defLevelsBuf...) + } + + data = append(data, dataBuf...) + + return data, nil + } + + buf, err := read() + if err != nil { + return nil, 0, 0, err + } + + path := append([]string{}, metadata.GetPathInSchema()...) 
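+ + // Decode the page body according to its type: a dictionary page + // yields a table of raw values, while data pages (v1 and v2) also + // carry RLE/bit-packed repetition and definition levels; index + // pages are not supported.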
+ + bytesReader := bytes.NewReader(buf) + pageType := pageHeader.GetType() + switch pageType { + case parquet.PageType_INDEX_PAGE: + return nil, 0, 0, fmt.Errorf("page type %v is not supported", parquet.PageType_INDEX_PAGE) + + case parquet.PageType_DICTIONARY_PAGE: + page = newDictPage() + page.Header = pageHeader + table := new(table) + table.Path = path + values, err := readValues(bytesReader, metadata.GetType(), + uint64(pageHeader.DictionaryPageHeader.GetNumValues()), 0) + if err != nil { + return nil, 0, 0, err + } + table.Values = getTableValues(values, metadata.GetType()) + page.DataTable = table + + return page, 0, 0, nil + + case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2: + name := strings.Join(path, ".") + + page = newDataPage() + page.Header = pageHeader + + maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, path) + maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, path) + + var numValues uint64 + var encodingType parquet.Encoding + + if pageHeader.GetType() == parquet.PageType_DATA_PAGE { + numValues = uint64(pageHeader.DataPageHeader.GetNumValues()) + encodingType = pageHeader.DataPageHeader.GetEncoding() + } else { + numValues = uint64(pageHeader.DataPageHeaderV2.GetNumValues()) + encodingType = pageHeader.DataPageHeaderV2.GetEncoding() + } + + var repetitionLevels []int64 + if maxRepetitionLevel > 0 { + values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64, + -1, numValues, getBitWidth(uint64(maxRepetitionLevel))) + if err != nil { + return nil, 0, 0, err + } + + if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues { + repetitionLevels = repetitionLevels[:numValues] + } + } else { + repetitionLevels = make([]int64, numValues) + } + + var definitionLevels []int64 + if maxDefinitionLevel > 0 { + values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64, + -1, numValues, getBitWidth(uint64(maxDefinitionLevel))) + if err != nil { + return nil, 0, 0, err + } + if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues { + definitionLevels = definitionLevels[:numValues] + } + } else { + definitionLevels = make([]int64, numValues) + } + + var numNulls uint64 + for i := 0; i < len(definitionLevels); i++ { + if definitionLevels[i] != int64(maxDefinitionLevel) { + numNulls++ + } + } + + var convertedType parquet.ConvertedType = -1 + if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() { + convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType() + } + values, valueType, err := readDataPageValues(bytesReader, encodingType, metadata.GetType(), + convertedType, uint64(len(definitionLevels))-numNulls, + uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength())) + if err != nil { + return nil, 0, 0, err + } + tableValues := getTableValues(values, valueType) + + table := new(table) + table.Path = path + table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType() + table.MaxRepetitionLevel = int32(maxRepetitionLevel) + table.MaxDefinitionLevel = int32(maxDefinitionLevel) + table.Values = make([]interface{}, len(definitionLevels)) + table.RepetitionLevels = make([]int32, len(definitionLevels)) + table.DefinitionLevels = make([]int32, len(definitionLevels)) + + j := 0 + numRows := int64(0) + for i := 0; i < len(definitionLevels); i++ { + table.RepetitionLevels[i] = int32(repetitionLevels[i]) + table.DefinitionLevels[i] = int32(definitionLevels[i]) + if 
int(table.DefinitionLevels[i]) == maxDefinitionLevel { + table.Values[i] = tableValues[j] + j++ + } + if table.RepetitionLevels[i] == 0 { + numRows++ + } + } + page.DataTable = table + + return page, int64(len(definitionLevels)), numRows, nil + } + + return nil, 0, 0, fmt.Errorf("unknown page type %v", pageType) +} + +type page struct { + Header *parquet.PageHeader // Header of a page + DataTable *table // Table to store values + RawData []byte // Compressed data of the page, which is written in parquet file + CompressType parquet.CompressionCodec // Compress type: gzip/snappy/none + DataType parquet.Type // Parquet type of the values in the page + Path []string // Path in schema (including the root) + MaxVal interface{} // Maximum of the values + MinVal interface{} // Minimum of the values + PageSize int32 +} + +func newPage() *page { + return &page{ + Header: parquet.NewPageHeader(), + PageSize: defaultPageSize, + } +} + +func newDictPage() *page { + page := newPage() + page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader() + return page +} + +func newDataPage() *page { + page := newPage() + page.Header.DataPageHeader = parquet.NewDataPageHeader() + return page +} + +func (page *page) decode(dictPage *page) { + if dictPage == nil || page == nil || page.Header.DataPageHeader == nil || + (page.Header.DataPageHeader.Encoding != parquet.Encoding_RLE_DICTIONARY && + page.Header.DataPageHeader.Encoding != parquet.Encoding_PLAIN_DICTIONARY) { + return + } + + for i := 0; i < len(page.DataTable.Values); i++ { + if page.DataTable.Values[i] != nil { + index := page.DataTable.Values[i].(int64) + page.DataTable.Values[i] = dictPage.DataTable.Values[index] + } + } +} + +// Get RepetitionLevels and DefinitionLevels from RawData +func (page *page) getRLDLFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (numValues int64, numRows int64, err error) { + bytesReader := bytes.NewReader(page.RawData) + + pageType := page.Header.GetType() + + var buf []byte + if pageType == parquet.PageType_DATA_PAGE_V2 { + var repLevelsLen, defLevelsLen int32 + var repLevelsBuf, defLevelsBuf []byte + + repLevelsLen = page.Header.DataPageHeaderV2.GetRepetitionLevelsByteLength() + repLevelsBuf = make([]byte, repLevelsLen) + if _, err = bytesReader.Read(repLevelsBuf); err != nil { + return 0, 0, err + } + + defLevelsLen = page.Header.DataPageHeaderV2.GetDefinitionLevelsByteLength() + defLevelsBuf = make([]byte, defLevelsLen) + if _, err = bytesReader.Read(defLevelsBuf); err != nil { + return 0, 0, err + } + + dataBuf := make([]byte, len(page.RawData)-int(repLevelsLen)-int(defLevelsLen)) + if _, err = bytesReader.Read(dataBuf); err != nil { + return 0, 0, err + } + + if repLevelsLen == 0 && defLevelsLen == 0 { + buf = dataBuf + } else { + if repLevelsLen > 0 { + buf = append(buf, uint32ToBytes(uint32(repLevelsLen))...) + buf = append(buf, repLevelsBuf...) + } + + if defLevelsLen > 0 { + buf = append(buf, uint32ToBytes(uint32(defLevelsLen))...) + buf = append(buf, defLevelsBuf...) + } + + buf = append(buf, dataBuf...)
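+ // Data page v2 keeps its level bytes uncompressed ahead of the + // values, so their lengths are re-prefixed above; any other page + // type is compressed as a whole and is uncompressed below.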
+ } + } else { + if buf, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil { + return 0, 0, err + } + } + + bytesReader = bytes.NewReader(buf) + + switch pageType { + case parquet.PageType_DICTIONARY_PAGE: + table := new(table) + table.Path = page.Path + page.DataTable = table + return 0, 0, nil + + case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2: + var numValues uint64 + if pageType == parquet.PageType_DATA_PAGE { + numValues = uint64(page.Header.DataPageHeader.GetNumValues()) + } else { + numValues = uint64(page.Header.DataPageHeaderV2.GetNumValues()) + } + + maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, page.Path) + maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, page.Path) + + var repetitionLevels []int64 + if maxRepetitionLevel > 0 { + values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64, + -1, numValues, getBitWidth(uint64(maxRepetitionLevel))) + if err != nil { + return 0, 0, err + } + + if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues { + repetitionLevels = repetitionLevels[:numValues] + } + } else { + repetitionLevels = make([]int64, numValues) + } + + var definitionLevels []int64 + if maxDefinitionLevel > 0 { + values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64, + -1, numValues, getBitWidth(uint64(maxDefinitionLevel))) + if err != nil { + return 0, 0, err + } + if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues { + definitionLevels = definitionLevels[:numValues] + } + } else { + definitionLevels = make([]int64, numValues) + } + + table := new(table) + table.Path = page.Path + name := strings.Join(page.Path, ".") + table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType() + table.MaxRepetitionLevel = int32(maxRepetitionLevel) + table.MaxDefinitionLevel = int32(maxDefinitionLevel) + table.Values = make([]interface{}, len(definitionLevels)) + table.RepetitionLevels = make([]int32, len(definitionLevels)) + table.DefinitionLevels = make([]int32, len(definitionLevels)) + + numRows := int64(0) + for i := 0; i < len(definitionLevels); i++ { + table.RepetitionLevels[i] = int32(repetitionLevels[i]) + table.DefinitionLevels[i] = int32(definitionLevels[i]) + if table.RepetitionLevels[i] == 0 { + numRows++ + } + } + page.DataTable = table + page.RawData = buf[len(buf)-bytesReader.Len():] + + return int64(numValues), numRows, nil + } + + return 0, 0, fmt.Errorf("Unsupported page type %v", pageType) +} + +func (page *page) getValueFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (err error) { + pageType := page.Header.GetType() + switch pageType { + case parquet.PageType_DICTIONARY_PAGE: + bytesReader := bytes.NewReader(page.RawData) + var values interface{} + values, err = readValues(bytesReader, page.DataType, + uint64(page.Header.DictionaryPageHeader.GetNumValues()), 0) + if err != nil { + return err + } + + page.DataTable.Values = getTableValues(values, page.DataType) + return nil + + case parquet.PageType_DATA_PAGE_V2: + if page.RawData, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil { + return err + } + fallthrough + case parquet.PageType_DATA_PAGE: + encodingType := page.Header.DataPageHeader.GetEncoding() + bytesReader := bytes.NewReader(page.RawData) + + var numNulls uint64 + for i := 0; i < len(page.DataTable.DefinitionLevels); i++ { + if 
page.DataTable.DefinitionLevels[i] != page.DataTable.MaxDefinitionLevel { + numNulls++ + } + } + + name := strings.Join(page.DataTable.Path, ".") + var convertedType parquet.ConvertedType = -1 + + if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() { + convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType() + } + + values, _, err := readDataPageValues(bytesReader, encodingType, page.DataType, + convertedType, uint64(len(page.DataTable.DefinitionLevels))-numNulls, + uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength())) + if err != nil { + return err + } + + tableValues := getTableValues(values, page.DataType) + + j := 0 + for i := 0; i < len(page.DataTable.DefinitionLevels); i++ { + if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel { + page.DataTable.Values[i] = tableValues[j] + j++ + } + } + + page.RawData = []byte{} + return nil + } + + return fmt.Errorf("unsupported page type %v", pageType) +} + +func (page *page) toDataPage(compressType parquet.CompressionCodec) []byte { + values := []interface{}{} + for i := range page.DataTable.DefinitionLevels { + if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel { + values = append(values, page.DataTable.Values[i]) + } + } + valuesBytes := encodeValues(interfacesToValues(values, page.DataTable.Type), page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth) + + var defLevelBytes []byte + if page.DataTable.MaxDefinitionLevel > 0 { + defLevels := make([]int64, len(page.DataTable.DefinitionLevels)) + for i := range page.DataTable.DefinitionLevels { + defLevels[i] = int64(page.DataTable.DefinitionLevels[i]) + } + defLevelBytes = valuesToRLEBitPackedHybridBytes( + defLevels, + int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))), + parquet.Type_INT64, + ) + } + + var repLevelBytes []byte + if page.DataTable.MaxRepetitionLevel > 0 { + repLevels := make([]int64, len(page.DataTable.DefinitionLevels)) + for i := range page.DataTable.DefinitionLevels { + repLevels[i] = int64(page.DataTable.RepetitionLevels[i]) + } + repLevelBytes = valuesToRLEBitPackedHybridBytes( + repLevels, + int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))), + parquet.Type_INT64, + ) + } + + data := repLevelBytes + data = append(data, defLevelBytes...) + data = append(data, valuesBytes...) 
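+ + // A v1 data page body is compressed as a single unit: RLE/bit-packed + // repetition levels, then definition levels, then the encoded values.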
+ + compressedData, err := compressionCodec(compressType).compress(data) + if err != nil { + panic(err) + } + + page.Header = parquet.NewPageHeader() + page.Header.Type = parquet.PageType_DATA_PAGE + page.Header.CompressedPageSize = int32(len(compressedData)) + page.Header.UncompressedPageSize = int32(len(data)) + page.Header.DataPageHeader = parquet.NewDataPageHeader() + page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels)) + page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE + page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE + page.Header.DataPageHeader.Encoding = page.DataTable.Encoding + page.Header.DataPageHeader.Statistics = parquet.NewStatistics() + if page.MaxVal != nil { + tmpBuf := valueToBytes(page.MaxVal, page.DataType) + if page.DataType == parquet.Type_BYTE_ARRAY { + switch page.DataTable.ConvertedType { + case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL: + tmpBuf = tmpBuf[4:] + } + } + page.Header.DataPageHeader.Statistics.Max = tmpBuf + } + if page.MinVal != nil { + tmpBuf := valueToBytes(page.MinVal, page.DataType) + if page.DataType == parquet.Type_BYTE_ARRAY { + switch page.DataTable.ConvertedType { + case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL: + tmpBuf = tmpBuf[4:] + } + } + page.Header.DataPageHeader.Statistics.Min = tmpBuf + } + + ts := thrift.NewTSerializer() + ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport) + pageHeaderBytes, err := ts.Write(context.TODO(), page.Header) + if err != nil { + panic(err) + } + + page.RawData = append(pageHeaderBytes, compressedData...) + return page.RawData +} + +func (page *page) toDataPageV2(compressType parquet.CompressionCodec) []byte { + values := []interface{}{} + for i := range page.DataTable.DefinitionLevels { + if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel { + values = append(values, page.DataTable.Values[i]) + } + } + valuesBytes := encodeValues(values, page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth) + + var defLevelBytes []byte + if page.DataTable.MaxDefinitionLevel > 0 { + defLevels := make([]int64, len(page.DataTable.DefinitionLevels)) + for i := range page.DataTable.DefinitionLevels { + defLevels[i] = int64(page.DataTable.DefinitionLevels[i]) + } + defLevelBytes = valuesToRLEBytes( + defLevels, + int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))), + parquet.Type_INT64, + ) + } + + var repLevelBytes []byte + numRows := int32(0) + if page.DataTable.MaxRepetitionLevel > 0 { + repLevels := make([]int64, len(page.DataTable.DefinitionLevels)) + for i := range page.DataTable.DefinitionLevels { + repLevels[i] = int64(page.DataTable.RepetitionLevels[i]) + if page.DataTable.RepetitionLevels[i] == 0 { + numRows++ + } + } + repLevelBytes = valuesToRLEBytes( + repLevels, + int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))), + parquet.Type_INT64, + ) + } + + compressedData, err := compressionCodec(compressType).compress(valuesBytes) + if err != nil { + panic(err) + } + + page.Header = parquet.NewPageHeader() + page.Header.Type = parquet.PageType_DATA_PAGE_V2 + page.Header.CompressedPageSize = int32(len(compressedData) + len(defLevelBytes) + len(repLevelBytes)) + page.Header.UncompressedPageSize = int32(len(valuesBytes) + len(defLevelBytes) + len(repLevelBytes)) + page.Header.DataPageHeaderV2 = parquet.NewDataPageHeaderV2() + page.Header.DataPageHeaderV2.NumValues = int32(len(page.DataTable.Values)) + 
page.Header.DataPageHeaderV2.NumNulls = page.Header.DataPageHeaderV2.NumValues - int32(len(values)) + page.Header.DataPageHeaderV2.NumRows = numRows + page.Header.DataPageHeaderV2.Encoding = page.DataTable.Encoding + page.Header.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(defLevelBytes)) + page.Header.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(repLevelBytes)) + page.Header.DataPageHeaderV2.IsCompressed = true + + page.Header.DataPageHeaderV2.Statistics = parquet.NewStatistics() + if page.MaxVal != nil { + tmpBuf := valueToBytes(page.MaxVal, page.DataType) + if page.DataType == parquet.Type_BYTE_ARRAY { + switch page.DataTable.ConvertedType { + case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL: + tmpBuf = tmpBuf[4:] + } + } + page.Header.DataPageHeaderV2.Statistics.Max = tmpBuf + } + if page.MinVal != nil { + tmpBuf := valueToBytes(page.MinVal, page.DataType) + if page.DataType == parquet.Type_BYTE_ARRAY { + switch page.DataTable.ConvertedType { + case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL: + tmpBuf = tmpBuf[4:] + } + } + page.Header.DataPageHeaderV2.Statistics.Min = tmpBuf + } + + ts := thrift.NewTSerializer() + ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport) + pageHeaderBytes, err := ts.Write(context.TODO(), page.Header) + if err != nil { + panic(err) + } + + page.RawData = append(pageHeaderBytes, repLevelBytes...) + page.RawData = append(page.RawData, defLevelBytes...) + page.RawData = append(page.RawData, compressedData...) + + return page.RawData +} + +func (page *page) toDictPage(compressType parquet.CompressionCodec, dataType parquet.Type) []byte { + valuesBytes := valuesToBytes(page.DataTable.Values, dataType) + compressedData, err := compressionCodec(compressType).compress(valuesBytes) + if err != nil { + panic(err) + } + + page.Header = parquet.NewPageHeader() + page.Header.Type = parquet.PageType_DICTIONARY_PAGE + page.Header.CompressedPageSize = int32(len(compressedData)) + page.Header.UncompressedPageSize = int32(len(valuesBytes)) + page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader() + page.Header.DictionaryPageHeader.NumValues = int32(len(page.DataTable.Values)) + page.Header.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN + + ts := thrift.NewTSerializer() + ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport) + pageHeaderBytes, err := ts.Write(context.TODO(), page.Header) + if err != nil { + panic(err) + } + + page.RawData = append(pageHeaderBytes, compressedData...) + return page.RawData +} + +func (page *page) toDictDataPage(compressType parquet.CompressionCodec, bitWidth int32) []byte { + valuesBytes := append([]byte{byte(bitWidth)}, valuesToRLEBytes(page.DataTable.Values, bitWidth, parquet.Type_INT32)...) 
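+ + // Dictionary-encoded page data begins with a single byte giving the + // bit width of the dictionary indexes, followed by the RLE/bit-packed + // indexes themselves.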
+ + var defLevelBytes []byte + if page.DataTable.MaxDefinitionLevel > 0 { + defLevels := make([]int64, len(page.DataTable.DefinitionLevels)) + for i := range page.DataTable.DefinitionLevels { + defLevels[i] = int64(page.DataTable.DefinitionLevels[i]) + } + defLevelBytes = valuesToRLEBitPackedHybridBytes( + defLevels, + int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))), + parquet.Type_INT64, + ) + } + + var repLevelBytes []byte + if page.DataTable.MaxRepetitionLevel > 0 { + repLevels := make([]int64, len(page.DataTable.DefinitionLevels)) + for i := range page.DataTable.DefinitionLevels { + repLevels[i] = int64(page.DataTable.RepetitionLevels[i]) + } + repLevelBytes = valuesToRLEBitPackedHybridBytes( + repLevels, + int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))), + parquet.Type_INT64, + ) + } + + data := append(repLevelBytes, defLevelBytes...) + data = append(data, valuesBytes...) + + compressedData, err := compressionCodec(compressType).compress(data) + if err != nil { + panic(err) + } + + page.Header = parquet.NewPageHeader() + page.Header.Type = parquet.PageType_DATA_PAGE + page.Header.CompressedPageSize = int32(len(compressedData)) + page.Header.UncompressedPageSize = int32(len(data)) + page.Header.DataPageHeader = parquet.NewDataPageHeader() + page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels)) + page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE + page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE + page.Header.DataPageHeader.Encoding = parquet.Encoding_PLAIN_DICTIONARY + + ts := thrift.NewTSerializer() + ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport) + pageHeaderBytes, err := ts.Write(context.TODO(), page.Header) + if err != nil { + panic(err) + } + + page.RawData = append(pageHeaderBytes, compressedData...) + return page.RawData +} diff --git a/pkg/s3select/internal/parquet-go/parquet.thrift b/pkg/s3select/internal/parquet-go/parquet.thrift new file mode 100644 index 000000000..6c9011b9a --- /dev/null +++ b/pkg/s3select/internal/parquet-go/parquet.thrift @@ -0,0 +1,881 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * File format description for the parquet file format + */ +namespace cpp parquet +namespace java org.apache.parquet.format + +/** + * Types supported by Parquet. These types are intended to be used in combination + * with the encodings to control the on disk storage format. + * For example INT16 is not included as a type since a good encoding of INT32 + * would handle this. + */ +enum Type { + BOOLEAN = 0; + INT32 = 1; + INT64 = 2; + INT96 = 3; // deprecated, only used by legacy implementations. 
+ FLOAT = 4; + DOUBLE = 5; + BYTE_ARRAY = 6; + FIXED_LEN_BYTE_ARRAY = 7; +} + +/** + * Common types used by frameworks(e.g. hive, pig) using parquet. This helps map + * between types in those frameworks to the base types in parquet. This is only + * metadata and not needed to read or write the data. + */ +enum ConvertedType { + /** a BYTE_ARRAY actually contains UTF8 encoded chars */ + UTF8 = 0; + + /** a map is converted as an optional field containing a repeated key/value pair */ + MAP = 1; + + /** a key/value pair is converted into a group of two fields */ + MAP_KEY_VALUE = 2; + + /** a list is converted into an optional field containing a repeated field for its + * values */ + LIST = 3; + + /** an enum is converted into a binary field */ + ENUM = 4; + + /** + * A decimal value. + * + * This may be used to annotate binary or fixed primitive types. The + * underlying byte array stores the unscaled value encoded as two's + * complement using big-endian byte order (the most significant byte is the + * zeroth element). The value of the decimal is the value * 10^{-scale}. + * + * This must be accompanied by a (maximum) precision and a scale in the + * SchemaElement. The precision specifies the number of digits in the decimal + * and the scale stores the location of the decimal point. For example 1.23 + * would have precision 3 (3 total digits) and scale 2 (the decimal point is + * 2 digits over). + */ + DECIMAL = 5; + + /** + * A Date + * + * Stored as days since Unix epoch, encoded as the INT32 physical type. + * + */ + DATE = 6; + + /** + * A time + * + * The total number of milliseconds since midnight. The value is stored + * as an INT32 physical type. + */ + TIME_MILLIS = 7; + + /** + * A time. + * + * The total number of microseconds since midnight. The value is stored as + * an INT64 physical type. + */ + TIME_MICROS = 8; + + /** + * A date/time combination + * + * Date and time recorded as milliseconds since the Unix epoch. Recorded as + * a physical type of INT64. + */ + TIMESTAMP_MILLIS = 9; + + /** + * A date/time combination + * + * Date and time recorded as microseconds since the Unix epoch. The value is + * stored as an INT64 physical type. + */ + TIMESTAMP_MICROS = 10; + + + /** + * An unsigned integer value. + * + * The number describes the maximum number of meainful data bits in + * the stored value. 8, 16 and 32 bit values are stored using the + * INT32 physical type. 64 bit values are stored using the INT64 + * physical type. + * + */ + UINT_8 = 11; + UINT_16 = 12; + UINT_32 = 13; + UINT_64 = 14; + + /** + * A signed integer value. + * + * The number describes the maximum number of meainful data bits in + * the stored value. 8, 16 and 32 bit values are stored using the + * INT32 physical type. 64 bit values are stored using the INT64 + * physical type. + * + */ + INT_8 = 15; + INT_16 = 16; + INT_32 = 17; + INT_64 = 18; + + /** + * An embedded JSON document + * + * A JSON document embedded within a single UTF8 column. + */ + JSON = 19; + + /** + * An embedded BSON document + * + * A BSON document embedded within a single BINARY column. + */ + BSON = 20; + + /** + * An interval of time + * + * This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 + * This data is composed of three separate little endian unsigned + * integers. Each stores a component of a duration of time. 
The first + * integer identifies the number of months associated with the duration, + * the second identifies the number of days associated with the duration + * and the third identifies the number of milliseconds associated with + * the provided duration. This duration of time is independent of any + * particular timezone or date. + */ + INTERVAL = 21; +} + +/** + * Representation of Schemas + */ +enum FieldRepetitionType { + /** This field is required (can not be null) and each record has exactly 1 value. */ + REQUIRED = 0; + + /** The field is optional (can be null) and each record has 0 or 1 values. */ + OPTIONAL = 1; + + /** The field is repeated and can contain 0 or more values */ + REPEATED = 2; +} + +/** + * Statistics per row group and per page + * All fields are optional. + */ +struct Statistics { + /** + * DEPRECATED: min and max value of the column. Use min_value and max_value. + * + * Values are encoded using PLAIN encoding, except that variable-length byte + * arrays do not include a length prefix. + * + * These fields encode min and max values determined by signed comparison + * only. New files should use the correct order for a column's logical type + * and store the values in the min_value and max_value fields. + * + * To support older readers, these may be set when the column order is + * signed. + */ + 1: optional binary max; + 2: optional binary min; + /** count of null value in the column */ + 3: optional i64 null_count; + /** count of distinct values occurring */ + 4: optional i64 distinct_count; + /** + * Min and max values for the column, determined by its ColumnOrder. + * + * Values are encoded using PLAIN encoding, except that variable-length byte + * arrays do not include a length prefix. + */ + 5: optional binary max_value; + 6: optional binary min_value; +} + +/** Empty structs to use as logical type annotations */ +struct StringType {} // allowed for BINARY, must be encoded with UTF-8 +struct UUIDType {} // allowed for FIXED[16], must encoded raw UUID bytes +struct MapType {} // see LogicalTypes.md +struct ListType {} // see LogicalTypes.md +struct EnumType {} // allowed for BINARY, must be encoded with UTF-8 +struct DateType {} // allowed for INT32 + +/** + * Logical type to annotate a column that is always null. + * + * Sometimes when discovering the schema of existing data, values are always + * null and the physical type can't be determined. This annotation signals + * the case where the physical type was guessed from all null values. + */ +struct NullType {} // allowed for any physical type, only null values stored + +/** + * Decimal logical type annotation + * + * To maintain forward-compatibility in v1, implementations using this logical + * type must also set scale and precision on the annotated SchemaElement. 
+ * + * Allowed for physical types: INT32, INT64, FIXED, and BINARY + */ +struct DecimalType { + 1: required i32 scale + 2: required i32 precision +} + +/** Time units for logical types */ +struct MilliSeconds {} +struct MicroSeconds {} +struct NanoSeconds {} +union TimeUnit { + 1: MilliSeconds MILLIS + 2: MicroSeconds MICROS + 3: NanoSeconds NANOS +} + +/** + * Timestamp logical type annotation + * + * Allowed for physical types: INT64 + */ +struct TimestampType { + 1: required bool isAdjustedToUTC + 2: required TimeUnit unit +} + +/** + * Time logical type annotation + * + * Allowed for physical types: INT32 (millis), INT64 (micros, nanos) + */ +struct TimeType { + 1: required bool isAdjustedToUTC + 2: required TimeUnit unit +} + +/** + * Integer logical type annotation + * + * bitWidth must be 8, 16, 32, or 64. + * + * Allowed for physical types: INT32, INT64 + */ +struct IntType { + 1: required byte bitWidth + 2: required bool isSigned +} + +/** + * Embedded JSON logical type annotation + * + * Allowed for physical types: BINARY + */ +struct JsonType { +} + +/** + * Embedded BSON logical type annotation + * + * Allowed for physical types: BINARY + */ +struct BsonType { +} + +/** + * LogicalType annotations to replace ConvertedType. + * + * To maintain compatibility, implementations using LogicalType for a + * SchemaElement must also set the corresponding ConvertedType from the + * following table. + */ +union LogicalType { + 1: StringType STRING // use ConvertedType UTF8 + 2: MapType MAP // use ConvertedType MAP + 3: ListType LIST // use ConvertedType LIST + 4: EnumType ENUM // use ConvertedType ENUM + 5: DecimalType DECIMAL // use ConvertedType DECIMAL + 6: DateType DATE // use ConvertedType DATE + 7: TimeType TIME // use ConvertedType TIME_MICROS or TIME_MILLIS + 8: TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS + // 9: reserved for INTERVAL + 10: IntType INTEGER // use ConvertedType INT_* or UINT_* + 11: NullType UNKNOWN // no compatible ConvertedType + 12: JsonType JSON // use ConvertedType JSON + 13: BsonType BSON // use ConvertedType BSON + 14: UUIDType UUID +} + +/** + * Represents a element inside a schema definition. + * - if it is a group (inner node) then type is undefined and num_children is defined + * - if it is a primitive type (leaf) then type is defined and num_children is undefined + * the nodes are listed in depth first traversal order. + */ +struct SchemaElement { + /** Data type for this field. Not set if the current element is a non-leaf node */ + 1: optional Type type; + + /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the vales. + * Otherwise, if specified, this is the maximum bit length to store any of the values. + * (e.g. a low cardinality INT col could have this set to 3). Note that this is + * in the schema, and therefore fixed for the entire file. + */ + 2: optional i32 type_length; + + /** repetition of the field. The root of the schema does not have a repetition_type. + * All other nodes must have one */ + 3: optional FieldRepetitionType repetition_type; + + /** Name of the field in the schema */ + 4: required string name; + + /** Nested fields. Since thrift does not support nested fields, + * the nesting is flattened to a single list by a depth-first traversal. + * The children count is used to construct the nested relationship. 
+ * This field is not set when the element is a primitive type + */ + 5: optional i32 num_children; + + /** When the schema is the result of a conversion from another model + * Used to record the original type to help with cross conversion. + */ + 6: optional ConvertedType converted_type; + + /** Used when this column contains decimal data. + * See the DECIMAL converted type for more details. + */ + 7: optional i32 scale + 8: optional i32 precision + + /** When the original schema supports field ids, this will save the + * original field id in the parquet schema + */ + 9: optional i32 field_id; + + /** + * The logical type of this SchemaElement + * + * LogicalType replaces ConvertedType, but ConvertedType is still required + * for some logical types to ensure forward-compatibility in format v1. + */ + 10: optional LogicalType logicalType +} + +/** + * Encodings supported by Parquet. Not all encodings are valid for all types. These + * enums are also used to specify the encoding of definition and repetition levels. + * See the accompanying doc for the details of the more complicated encodings. + */ +enum Encoding { + /** Default encoding. + * BOOLEAN - 1 bit per value. 0 is false; 1 is true. + * INT32 - 4 bytes per value. Stored as little-endian. + * INT64 - 8 bytes per value. Stored as little-endian. + * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. + * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. + * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + * FIXED_LEN_BYTE_ARRAY - Just the bytes. + */ + PLAIN = 0; + + /** Group VarInt encoding for INT32/INT64. + * This encoding is deprecated. It was never used + */ + // GROUP_VAR_INT = 1; + + /** + * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the + * plain type. + * in a data page use RLE_DICTIONARY instead. + * in a Dictionary page use PLAIN instead + */ + PLAIN_DICTIONARY = 2; + + /** Group packed run length encoding. Usable for definition/repetition levels + * encoding and Booleans (on one bit: 0 is false; 1 is true.) + */ + RLE = 3; + + /** Bit packed encoding. This can only be used if the data has a known max + * width. Usable for definition/repetition levels encoding. + */ + BIT_PACKED = 4; + + /** Delta encoding for integers. This can be used for int columns and works best + * on sorted data + */ + DELTA_BINARY_PACKED = 5; + + /** Encoding for byte arrays to separate the length values and the data. The lengths + * are encoded using DELTA_BINARY_PACKED + */ + DELTA_LENGTH_BYTE_ARRAY = 6; + + /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. + * Suffixes are stored as delta length byte arrays. + */ + DELTA_BYTE_ARRAY = 7; + + /** Dictionary encoding: the ids are encoded using the RLE encoding + */ + RLE_DICTIONARY = 8; +} + +/** + * Supported compression algorithms. + * + * Codecs added in 2.4 can be read by readers based on 2.4 and later. + * Codec support may vary between readers based on the format version and + * libraries available at runtime. Gzip, Snappy, and LZ4 codecs are + * widely available, while Zstd and Brotli require additional libraries. 
+ */ +enum CompressionCodec { + UNCOMPRESSED = 0; + SNAPPY = 1; + GZIP = 2; + LZO = 3; + BROTLI = 4; // Added in 2.4 + LZ4 = 5; // Added in 2.4 + ZSTD = 6; // Added in 2.4 +} + +enum PageType { + DATA_PAGE = 0; + INDEX_PAGE = 1; + DICTIONARY_PAGE = 2; + DATA_PAGE_V2 = 3; +} + +/** + * Enum to annotate whether lists of min/max elements inside ColumnIndex + * are ordered and if so, in which direction. + */ +enum BoundaryOrder { + UNORDERED = 0; + ASCENDING = 1; + DESCENDING = 2; +} + +/** Data page header */ +struct DataPageHeader { + /** Number of values, including NULLs, in this data page. **/ + 1: required i32 num_values + + /** Encoding used for this data page **/ + 2: required Encoding encoding + + /** Encoding used for definition levels **/ + 3: required Encoding definition_level_encoding; + + /** Encoding used for repetition levels **/ + 4: required Encoding repetition_level_encoding; + + /** Optional statistics for the data in this page**/ + 5: optional Statistics statistics; +} + +struct IndexPageHeader { + /** TODO: **/ +} + +struct DictionaryPageHeader { + /** Number of values in the dictionary **/ + 1: required i32 num_values; + + /** Encoding using this dictionary page **/ + 2: required Encoding encoding + + /** If true, the entries in the dictionary are sorted in ascending order **/ + 3: optional bool is_sorted; +} + +/** + * New page format allowing reading levels without decompressing the data + * Repetition and definition levels are uncompressed + * The remaining section containing the data is compressed if is_compressed is true + **/ +struct DataPageHeaderV2 { + /** Number of values, including NULLs, in this data page. **/ + 1: required i32 num_values + /** Number of NULL values, in this data page. + Number of non-null = num_values - num_nulls which is also the number of values in the data section **/ + 2: required i32 num_nulls + /** Number of rows in this data page. which means pages change on record boundaries (r = 0) **/ + 3: required i32 num_rows + /** Encoding used for data in this page **/ + 4: required Encoding encoding + + // repetition levels and definition levels are always using RLE (without size in it) + + /** length of the definition levels */ + 5: required i32 definition_levels_byte_length; + /** length of the repetition levels */ + 6: required i32 repetition_levels_byte_length; + + /** whether the values are compressed. + Which means the section of the page between + definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) + is compressed with the compression_codec. + If missing it is considered compressed */ + 7: optional bool is_compressed = 1; + + /** optional statistics for this column chunk */ + 8: optional Statistics statistics; +} + +struct PageHeader { + /** the type of the page: indicates which of the *_header fields is set **/ + 1: required PageType type + + /** Uncompressed page size in bytes (not including this header) **/ + 2: required i32 uncompressed_page_size + + /** Compressed page size in bytes (not including this header) **/ + 3: required i32 compressed_page_size + + /** 32bit crc for the data below. This allows for disabling checksumming in HDFS + * if only a few pages needs to be read + **/ + 4: optional i32 crc + + // Headers for page specific data. One only will be set. 
+ 5: optional DataPageHeader data_page_header; + 6: optional IndexPageHeader index_page_header; + 7: optional DictionaryPageHeader dictionary_page_header; + 8: optional DataPageHeaderV2 data_page_header_v2; +} + +/** + * Wrapper struct to store key values + */ + struct KeyValue { + 1: required string key + 2: optional string value +} + +/** + * Wrapper struct to specify sort order + */ +struct SortingColumn { + /** The column index (in this row group) **/ + 1: required i32 column_idx + + /** If true, indicates this column is sorted in descending order. **/ + 2: required bool descending + + /** If true, nulls will come before non-null values, otherwise, + * nulls go at the end. */ + 3: required bool nulls_first +} + +/** + * statistics of a given page type and encoding + */ +struct PageEncodingStats { + + /** the page type (data/dic/...) **/ + 1: required PageType page_type; + + /** encoding of the page **/ + 2: required Encoding encoding; + + /** number of pages of this type with this encoding **/ + 3: required i32 count; + +} + +/** + * Description for column metadata + */ +struct ColumnMetaData { + /** Type of this column **/ + 1: required Type type + + /** Set of all encodings used for this column. The purpose is to validate + * whether we can decode those pages. **/ + 2: required list encodings + + /** Path in schema **/ + 3: required list path_in_schema + + /** Compression codec **/ + 4: required CompressionCodec codec + + /** Number of values in this column **/ + 5: required i64 num_values + + /** total byte size of all uncompressed pages in this column chunk (including the headers) **/ + 6: required i64 total_uncompressed_size + + /** total byte size of all compressed pages in this column chunk (including the headers) **/ + 7: required i64 total_compressed_size + + /** Optional key/value metadata **/ + 8: optional list key_value_metadata + + /** Byte offset from beginning of file to first data page **/ + 9: required i64 data_page_offset + + /** Byte offset from beginning of file to root index page **/ + 10: optional i64 index_page_offset + + /** Byte offset from the beginning of file to first (only) dictionary page **/ + 11: optional i64 dictionary_page_offset + + /** optional statistics for this column chunk */ + 12: optional Statistics statistics; + + /** Set of all encodings used for pages in this column chunk. + * This information can be used to determine if all data pages are + * dictionary encoded for example **/ + 13: optional list encoding_stats; +} + +struct ColumnChunk { + /** File where column data is stored. If not set, assumed to be same file as + * metadata. This path is relative to the current file. + **/ + 1: optional string file_path + + /** Byte offset in file_path to the ColumnMetaData **/ + 2: required i64 file_offset + + /** Column metadata for this chunk. This is the same content as what is at + * file_path/file_offset. Having it here has it replicated in the file + * metadata. + **/ + 3: optional ColumnMetaData meta_data + + /** File offset of ColumnChunk's OffsetIndex **/ + 4: optional i64 offset_index_offset + + /** Size of ColumnChunk's OffsetIndex, in bytes **/ + 5: optional i32 offset_index_length + + /** File offset of ColumnChunk's ColumnIndex **/ + 6: optional i64 column_index_offset + + /** Size of ColumnChunk's ColumnIndex, in bytes **/ + 7: optional i32 column_index_length +} + +struct RowGroup { + /** Metadata for each column chunk in this row group. + * This list must have the same order as the SchemaElement list in FileMetaData. 
+ **/ + 1: required list columns + + /** Total byte size of all the uncompressed column data in this row group **/ + 2: required i64 total_byte_size + + /** Number of rows in this row group **/ + 3: required i64 num_rows + + /** If set, specifies a sort ordering of the rows in this RowGroup. + * The sorting columns can be a subset of all the columns. + */ + 4: optional list sorting_columns +} + +/** Empty struct to signal the order defined by the physical or logical type */ +struct TypeDefinedOrder {} + +/** + * Union to specify the order used for the min_value and max_value fields for a + * column. This union takes the role of an enhanced enum that allows rich + * elements (which will be needed for a collation-based ordering in the future). + * + * Possible values are: + * * TypeDefinedOrder - the column uses the order defined by its logical or + * physical type (if there is no logical type). + * + * If the reader does not support the value of this union, min and max stats + * for this column should be ignored. + */ +union ColumnOrder { + + /** + * The sort orders for logical types are: + * UTF8 - unsigned byte-wise comparison + * INT8 - signed comparison + * INT16 - signed comparison + * INT32 - signed comparison + * INT64 - signed comparison + * UINT8 - unsigned comparison + * UINT16 - unsigned comparison + * UINT32 - unsigned comparison + * UINT64 - unsigned comparison + * DECIMAL - signed comparison of the represented value + * DATE - signed comparison + * TIME_MILLIS - signed comparison + * TIME_MICROS - signed comparison + * TIMESTAMP_MILLIS - signed comparison + * TIMESTAMP_MICROS - signed comparison + * INTERVAL - unsigned comparison + * JSON - unsigned byte-wise comparison + * BSON - unsigned byte-wise comparison + * ENUM - unsigned byte-wise comparison + * LIST - undefined + * MAP - undefined + * + * In the absence of logical types, the sort order is determined by the physical type: + * BOOLEAN - false, true + * INT32 - signed comparison + * INT64 - signed comparison + * INT96 (only used for legacy timestamps) - undefined + * FLOAT - signed comparison of the represented value (*) + * DOUBLE - signed comparison of the represented value (*) + * BYTE_ARRAY - unsigned byte-wise comparison + * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison + * + * (*) Because the sorting order is not specified properly for floating + * point values (relations vs. total ordering) the following + * compatibility rules should be applied when reading statistics: + * - If the min is a NaN, it should be ignored. + * - If the max is a NaN, it should be ignored. + * - If the min is +0, the row group may contain -0 values as well. + * - If the max is -0, the row group may contain +0 values as well. + * - When looking for NaN values, min and max should be ignored. + */ + 1: TypeDefinedOrder TYPE_ORDER; +} + +struct PageLocation { + /** Offset of the page in the file **/ + 1: required i64 offset + + /** + * Size of the page, including header. Sum of compressed_page_size and header + * length + */ + 2: required i32 compressed_page_size + + /** + * Index within the RowGroup of the first row of the page; this means pages + * change on record boundaries (r = 0). + */ + 3: required i64 first_row_index +} + +struct OffsetIndex { + /** + * PageLocations, ordered by increasing PageLocation.offset. It is required + * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. + */ + 1: required list page_locations +} + +/** + * Description for ColumnIndex. 
+ * Each [i] refers to the page at OffsetIndex.page_locations[i] + */ +struct ColumnIndex { + /** + * A list of Boolean values to determine the validity of the corresponding + * min and max values. If true, a page contains only null values, and writers + * have to set the corresponding entries in min_values and max_values to + * byte[0], so that all lists have the same length. If false, the + * corresponding entries in min_values and max_values must be valid. + */ + 1: required list null_pages + + /** + * Two lists containing lower and upper bounds for the values of each page. + * These may be the actual minimum and maximum values found on a page, but + * can also be (more compact) values that do not exist on a page. For + * example, instead of storing ""Blart Versenwald III", a writer may set + * min_values[i]="B", max_values[i]="C". Such more compact values must still + * be valid values within the column's logical type. Readers must make sure + * that list entries are populated before using them by inspecting null_pages. + */ + 2: required list min_values + 3: required list max_values + + /** + * Stores whether both min_values and max_values are orderd and if so, in + * which direction. This allows readers to perform binary searches in both + * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even + * if the lists are ordered. + */ + 4: required BoundaryOrder boundary_order + + /** A list containing the number of null values for each page **/ + 5: optional list null_counts +} + +/** + * Description for file metadata + */ +struct FileMetaData { + /** Version of this file **/ + 1: required i32 version + + /** Parquet schema for this file. This schema contains metadata for all the columns. + * The schema is represented as a tree with a single root. The nodes of the tree + * are flattened to a list by doing a depth-first traversal. + * The column metadata contains the path in the schema for that column which can be + * used to map columns to nodes in the schema. + * The first element is the root **/ + 2: required list schema; + + /** Number of rows in this file **/ + 3: required i64 num_rows + + /** Row groups in this file **/ + 4: required list row_groups + + /** Optional key/value metadata **/ + 5: optional list key_value_metadata + + /** String for application that wrote this file. This should be in the format + * version (build ). + * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) + **/ + 6: optional string created_by + + /** + * Sort order used for the min_value and max_value fields of each column in + * this file. Each sort order corresponds to one column, determined by its + * position in the list, matching the position of the column in the schema. + * + * Without column_orders, the meaning of the min_value and max_value fields is + * undefined. To ensure well-defined behaviour, if min_value and max_value are + * written to a Parquet file, column_orders must be written as well. + * + * The obsolete min and max fields are always sorted by signed comparison + * regardless of column_orders. + */ + 7: optional list column_orders; +} + diff --git a/pkg/s3select/internal/parquet-go/reader.go b/pkg/s3select/internal/parquet-go/reader.go new file mode 100644 index 000000000..74ff5e202 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/reader.go @@ -0,0 +1,166 @@ +/* + * Minio Cloud Storage, (C) 2018 Minio, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package parquet + +import ( + "encoding/binary" + "encoding/json" + "io" + + "git.apache.org/thrift.git/lib/go/thrift" + "github.com/minio/minio-go/v6/pkg/set" + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +// GetReaderFunc - function type returning io.ReadCloser for requested offset/length. +type GetReaderFunc func(offset, length int64) (io.ReadCloser, error) + +func footerSize(getReaderFunc GetReaderFunc) (size int64, err error) { + rc, err := getReaderFunc(-8, 4) + if err != nil { + return 0, err + } + defer rc.Close() + + buf := make([]byte, 4) + if _, err = io.ReadFull(rc, buf); err != nil { + return 0, err + } + + size = int64(binary.LittleEndian.Uint32(buf)) + + return size, nil +} + +func fileMetadata(getReaderFunc GetReaderFunc) (*parquet.FileMetaData, error) { + size, err := footerSize(getReaderFunc) + if err != nil { + return nil, err + } + + rc, err := getReaderFunc(-(8 + size), size) + if err != nil { + return nil, err + } + defer rc.Close() + + fileMeta := parquet.NewFileMetaData() + + pf := thrift.NewTCompactProtocolFactory() + protocol := pf.GetProtocol(thrift.NewStreamTransportR(rc)) + err = fileMeta.Read(protocol) + if err != nil { + return nil, err + } + + return fileMeta, nil +} + +// Value - denotes column value +type Value struct { + Value interface{} + Type parquet.Type +} + +// MarshalJSON - encodes to JSON data +func (value Value) MarshalJSON() (data []byte, err error) { + return json.Marshal(value.Value) +} + +// Reader - denotes parquet file. +type Reader struct { + getReaderFunc GetReaderFunc + schemaElements []*parquet.SchemaElement + rowGroups []*parquet.RowGroup + rowGroupIndex int + + nameList []string + columnNames set.StringSet + columns map[string]*column + rowIndex int64 +} + +// NewReader - creates new parquet reader. Reader calls getReaderFunc to get required data range for given columnNames. If columnNames is empty, all columns are used. +func NewReader(getReaderFunc GetReaderFunc, columnNames set.StringSet) (*Reader, error) { + fileMeta, err := fileMetadata(getReaderFunc) + if err != nil { + return nil, err + } + + nameList := []string{} + schemaElements := fileMeta.GetSchema() + for _, element := range schemaElements { + nameList = append(nameList, element.Name) + } + + return &Reader{ + getReaderFunc: getReaderFunc, + rowGroups: fileMeta.GetRowGroups(), + schemaElements: schemaElements, + nameList: nameList, + columnNames: columnNames, + }, nil +} + +// Read - reads single record. 
+func (reader *Reader) Read() (record *Record, err error) { + if reader.rowGroupIndex >= len(reader.rowGroups) { + return nil, io.EOF + } + + if reader.columns == nil { + reader.columns, err = getColumns( + reader.rowGroups[reader.rowGroupIndex], + reader.columnNames, + reader.schemaElements, + reader.getReaderFunc, + ) + if err != nil { + return nil, err + } + + reader.rowIndex = 0 + } + + if reader.rowIndex >= reader.rowGroups[reader.rowGroupIndex].GetNumRows() { + reader.rowGroupIndex++ + reader.Close() + return reader.Read() + } + + record = newRecord(reader.nameList) + for name := range reader.columns { + value, valueType := reader.columns[name].read() + record.set(name, Value{value, valueType}) + } + + reader.rowIndex++ + + return record, nil +} + +// Close - closes underneath readers. +func (reader *Reader) Close() (err error) { + for _, column := range reader.columns { + column.close() + } + + reader.columns = nil + reader.rowIndex = 0 + + return nil +} diff --git a/pkg/s3select/internal/parquet-go/reader_test.go b/pkg/s3select/internal/parquet-go/reader_test.go new file mode 100644 index 000000000..15d02c1e5 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/reader_test.go @@ -0,0 +1,90 @@ +/* + * Minio Cloud Storage, (C) 2018 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package parquet + +import ( + "io" + "os" + "testing" + + "github.com/minio/minio-go/v6/pkg/set" +) + +func getReader(name string, offset int64, length int64) (io.ReadCloser, error) { + file, err := os.Open(name) + if err != nil { + return nil, err + } + + fi, err := file.Stat() + if err != nil { + return nil, err + } + + if offset < 0 { + offset = fi.Size() + offset + } + + if _, err = file.Seek(offset, os.SEEK_SET); err != nil { + return nil, err + } + + return file, nil +} + +func TestReader(t *testing.T) { + name := "example.parquet" + reader, err := NewReader( + func(offset, length int64) (io.ReadCloser, error) { + return getReader(name, offset, length) + }, + set.CreateStringSet("one", "two", "three"), + ) + if err != nil { + t.Fatal(err) + } + + expectedRecords := []string{ + `map[one:{-1 DOUBLE} three:{true BOOLEAN} two:{[102 111 111] BYTE_ARRAY}]`, + `map[one:{ DOUBLE} three:{false BOOLEAN} two:{[98 97 114] BYTE_ARRAY}]`, + `map[one:{2.5 DOUBLE} three:{true BOOLEAN} two:{[98 97 122] BYTE_ARRAY}]`, + } + + i := 0 + for { + record, err := reader.Read() + if err != nil { + if err != io.EOF { + t.Error(err) + } + + break + } + + if i == len(expectedRecords) { + t.Fatalf("read more than expected record count %v", len(expectedRecords)) + } + + if record.String() != expectedRecords[i] { + t.Fatalf("record%v: expected: %v, got: %v", i+1, expectedRecords[i], record.String()) + } + + i++ + } + + reader.Close() +} diff --git a/pkg/s3select/internal/parquet-go/record.go b/pkg/s3select/internal/parquet-go/record.go new file mode 100644 index 000000000..6d25632f3 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/record.go @@ -0,0 +1,70 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package parquet + +import ( + "fmt" + "strings" +) + +// Record - ordered parquet record. +type Record struct { + nameList []string + nameValueMap map[string]Value +} + +// String - returns string representation of this record. +func (r *Record) String() string { + values := []string{} + r.Range(func(name string, value Value) bool { + values = append(values, fmt.Sprintf("%v:%v", name, value)) + return true + }) + + return "map[" + strings.Join(values, " ") + "]" +} + +func (r *Record) set(name string, value Value) { + r.nameValueMap[name] = value +} + +// Get - returns Value of name. +func (r *Record) Get(name string) (Value, bool) { + value, ok := r.nameValueMap[name] + return value, ok +} + +// Range - calls f sequentially for each name and value present in the record. If f returns false, range stops the iteration. 
+func (r *Record) Range(f func(name string, value Value) bool) { + for _, name := range r.nameList { + value, ok := r.nameValueMap[name] + if !ok { + continue + } + + if !f(name, value) { + break + } + } +} + +func newRecord(nameList []string) *Record { + return &Record{ + nameList: nameList, + nameValueMap: make(map[string]Value), + } +} diff --git a/pkg/s3select/internal/parquet-go/schema/element.go b/pkg/s3select/internal/parquet-go/schema/element.go new file mode 100644 index 000000000..df191f48b --- /dev/null +++ b/pkg/s3select/internal/parquet-go/schema/element.go @@ -0,0 +1,126 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package schema + +import ( + "fmt" + "regexp" + "strings" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +var nameRegexp = regexp.MustCompile("^[a-zA-Z0-9_]+$") + +func validataPathSegments(pathSegments []string) error { + for _, pathSegment := range pathSegments { + if !nameRegexp.MatchString(pathSegment) { + return fmt.Errorf("unsupported name %v", strings.Join(pathSegments, ".")) + } + } + + return nil +} + +// Element - represents schema element and its children. Any element must have Name and RepetitionType fields set. +type Element struct { + parquet.SchemaElement + numChildren int32 + Encoding *parquet.Encoding // Optional; defaults is computed. + CompressionType *parquet.CompressionCodec // Optional; defaults to SNAPPY. + Children *Tree + MaxDefinitionLevel int64 + MaxRepetitionLevel int64 + PathInTree string + PathInSchema string +} + +// String - stringify this element. +func (element *Element) String() string { + var s []string + s = append(s, "Name:"+element.Name) + s = append(s, "RepetitionType:"+element.RepetitionType.String()) + if element.Type != nil { + s = append(s, "Type:"+element.Type.String()) + } + if element.ConvertedType != nil { + s = append(s, "ConvertedType:"+element.ConvertedType.String()) + } + if element.Encoding != nil { + s = append(s, "Encoding:"+element.Encoding.String()) + } + if element.CompressionType != nil { + s = append(s, "CompressionType:"+element.CompressionType.String()) + } + if element.Children != nil && element.Children.Length() > 0 { + s = append(s, "Children:"+element.Children.String()) + } + s = append(s, fmt.Sprintf("MaxDefinitionLevel:%v", element.MaxDefinitionLevel)) + s = append(s, fmt.Sprintf("MaxRepetitionLevel:%v", element.MaxRepetitionLevel)) + if element.PathInTree != "" { + s = append(s, "PathInTree:"+element.PathInTree) + } + if element.PathInSchema != "" { + s = append(s, "PathInSchema:"+element.PathInSchema) + } + + return "{" + strings.Join(s, ", ") + "}" +} + +// NewElement - creates new element. 
+func NewElement(name string, repetitionType parquet.FieldRepetitionType, + elementType *parquet.Type, convertedType *parquet.ConvertedType, + encoding *parquet.Encoding, compressionType *parquet.CompressionCodec, + children *Tree) (*Element, error) { + + if !nameRegexp.MatchString(name) { + return nil, fmt.Errorf("unsupported name %v", name) + } + + switch repetitionType { + case parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_REPEATED: + default: + return nil, fmt.Errorf("unknown repetition type %v", repetitionType) + } + + if repetitionType == parquet.FieldRepetitionType_REPEATED && (elementType != nil || convertedType != nil) { + return nil, fmt.Errorf("repetition type REPEATED should be used in group element") + } + + if children != nil && children.Length() != 0 { + if elementType != nil { + return nil, fmt.Errorf("type should be nil for group element") + } + } + + element := Element{ + Encoding: encoding, + CompressionType: compressionType, + Children: children, + } + + element.Name = name + element.RepetitionType = &repetitionType + element.Type = elementType + element.ConvertedType = convertedType + element.NumChildren = &element.numChildren + if element.Children != nil { + element.numChildren = int32(element.Children.Length()) + } + + return &element, nil +} diff --git a/pkg/s3select/internal/parquet-go/schema/tree.go b/pkg/s3select/internal/parquet-go/schema/tree.go new file mode 100644 index 000000000..6fb3e52a5 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/schema/tree.go @@ -0,0 +1,388 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package schema + +import ( + "fmt" + "strings" + + "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" +) + +func updateMaxDLRL(schemaMap map[string]*Element, maxDL, maxRL int64) { + for _, element := range schemaMap { + element.MaxDefinitionLevel = maxDL + element.MaxRepetitionLevel = maxRL + if *element.RepetitionType != parquet.FieldRepetitionType_REQUIRED { + element.MaxDefinitionLevel++ + if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED { + element.MaxRepetitionLevel++ + } + } + + if element.Children != nil { + updateMaxDLRL(element.Children.schemaMap, element.MaxDefinitionLevel, element.MaxRepetitionLevel) + } + } +} + +func toParquetSchema(tree *Tree, treePrefix string, schemaPrefix string, schemaList *[]*parquet.SchemaElement, valueElements *[]*Element) (err error) { + tree.Range(func(name string, element *Element) bool { + pathInTree := name + if treePrefix != "" { + pathInTree = treePrefix + "." + name + } + + if element.Type == nil && element.ConvertedType == nil && element.Children == nil { + err = fmt.Errorf("%v: group element must have children", pathInTree) + return false + } + + if element.ConvertedType != nil { + switch *element.ConvertedType { + case parquet.ConvertedType_LIST: + // Supported structure. 
+ // group (LIST) { + // REPEATED group list { + // element; + // } + // } + + if element.Type != nil { + err = fmt.Errorf("%v: type must be nil for LIST ConvertedType", pathInTree) + return false + } + + if element.Children == nil || element.Children.Length() != 1 { + err = fmt.Errorf("%v: children must have one element only for LIST ConvertedType", pathInTree) + return false + } + + listElement, ok := element.Children.Get("list") + if !ok { + err = fmt.Errorf("%v: missing group element 'list' for LIST ConvertedType", pathInTree) + return false + } + + if listElement.Name != "list" { + err = fmt.Errorf("%v.list: name must be 'list'", pathInTree) + return false + } + + if *listElement.RepetitionType != parquet.FieldRepetitionType_REPEATED { + err = fmt.Errorf("%v.list: repetition type must be REPEATED type", pathInTree) + return false + } + + if listElement.Type != nil || listElement.ConvertedType != nil { + err = fmt.Errorf("%v.list: type and converted type must be nil", pathInTree) + return false + } + + if listElement.Children == nil || listElement.Children.Length() != 1 { + err = fmt.Errorf("%v.list.element: not found", pathInTree) + return false + } + + valueElement, ok := listElement.Children.Get("element") + if !ok { + err = fmt.Errorf("%v.list.element: not found", pathInTree) + return false + } + + if valueElement.Name != "element" { + err = fmt.Errorf("%v.list.element: name must be 'element'", pathInTree) + return false + } + + case parquet.ConvertedType_MAP: + // Supported structure: + // group (MAP) { + // REPEATED group key_value { + // REQUIRED key; + // value; + // } + // } + + if element.Type != nil { + err = fmt.Errorf("%v: type must be nil for MAP ConvertedType", pathInTree) + return false + } + + if element.Children == nil || element.Children.Length() != 1 { + err = fmt.Errorf("%v: children must have one element only for MAP ConvertedType", pathInTree) + return false + } + + keyValueElement, ok := element.Children.Get("key_value") + if !ok { + err = fmt.Errorf("%v: missing group element 'key_value' for MAP ConvertedType", pathInTree) + return false + } + + if keyValueElement.Name != "key_value" { + err = fmt.Errorf("%v.key_value: name must be 'key_value'", pathInTree) + return false + } + + if *keyValueElement.RepetitionType != parquet.FieldRepetitionType_REPEATED { + err = fmt.Errorf("%v.key_value: repetition type must be REPEATED type", pathInTree) + return false + } + + if keyValueElement.Children == nil || keyValueElement.Children.Length() < 1 || keyValueElement.Children.Length() > 2 { + err = fmt.Errorf("%v.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType", pathInTree) + return false + } + + keyElement, ok := keyValueElement.Children.Get("key") + if !ok { + err = fmt.Errorf("%v.key_value: missing 'key' element for MAP ConvertedType", pathInTree) + return false + } + + if keyElement.Name != "key" { + err = fmt.Errorf("%v.key_value.key: name must be 'key'", pathInTree) + return false + } + + if *keyElement.RepetitionType != parquet.FieldRepetitionType_REQUIRED { + err = fmt.Errorf("%v.key_value: repetition type must be REQUIRED type", pathInTree) + return false + } + + if keyValueElement.Children.Length() == 2 { + valueElement, ok := keyValueElement.Children.Get("value") + if !ok { + err = fmt.Errorf("%v.key_value: second element must be 'value' element for MAP ConvertedType", pathInTree) + return false + } + + if valueElement.Name != "value" { + err = fmt.Errorf("%v.key_value.value: name must be 'value'", pathInTree) + return 
false + } + } + + case parquet.ConvertedType_UTF8, parquet.ConvertedType_UINT_8, parquet.ConvertedType_UINT_16: + fallthrough + case parquet.ConvertedType_UINT_32, parquet.ConvertedType_UINT_64, parquet.ConvertedType_INT_8: + fallthrough + case parquet.ConvertedType_INT_16, parquet.ConvertedType_INT_32, parquet.ConvertedType_INT_64: + if element.Type == nil { + err = fmt.Errorf("%v: ConvertedType %v must have Type value", pathInTree, element.ConvertedType) + return false + } + + default: + err = fmt.Errorf("%v: unsupported ConvertedType %v", pathInTree, element.ConvertedType) + return false + } + } + + element.PathInTree = pathInTree + element.PathInSchema = element.Name + if schemaPrefix != "" { + element.PathInSchema = schemaPrefix + "." + element.Name + } + + if element.Type != nil { + *valueElements = append(*valueElements, element) + } + + *schemaList = append(*schemaList, &element.SchemaElement) + if element.Children != nil { + element.numChildren = int32(element.Children.Length()) + err = toParquetSchema(element.Children, element.PathInTree, element.PathInSchema, schemaList, valueElements) + } + + return (err == nil) + }) + + return err +} + +// Tree - represents tree of schema. Tree preserves order in which elements are added. +type Tree struct { + schemaMap map[string]*Element + keys []string + readOnly bool +} + +// String - stringify this tree. +func (tree *Tree) String() string { + var s []string + tree.Range(func(name string, element *Element) bool { + s = append(s, fmt.Sprintf("%v: %v", name, element)) + return true + }) + + return "{" + strings.Join(s, ", ") + "}" +} + +// Length - returns length of tree. +func (tree *Tree) Length() int { + return len(tree.keys) +} + +func (tree *Tree) travel(pathSegments []string) (pathSegmentIndex int, pathSegment string, currElement *Element, parentTree *Tree, found bool) { + parentTree = tree + for pathSegmentIndex, pathSegment = range pathSegments { + if tree == nil { + found = false + break + } + + var tmpCurrElement *Element + if tmpCurrElement, found = tree.schemaMap[pathSegment]; !found { + break + } + currElement = tmpCurrElement + + parentTree = tree + tree = currElement.Children + } + + return +} + +// ReadOnly - returns whether this tree is read only or not. +func (tree *Tree) ReadOnly() bool { + return tree.readOnly +} + +// Get - returns the element stored for name. +func (tree *Tree) Get(name string) (element *Element, ok bool) { + pathSegments := strings.Split(name, ".") + for _, pathSegment := range pathSegments { + if tree == nil { + element = nil + ok = false + break + } + + if element, ok = tree.schemaMap[pathSegment]; !ok { + break + } + + tree = element.Children + } + + return element, ok +} + +// Set - adds or sets element to name. 
+func (tree *Tree) Set(name string, element *Element) error { + if tree.readOnly { + return fmt.Errorf("read only tree") + } + + pathSegments := strings.Split(name, ".") + if err := validataPathSegments(pathSegments); err != nil { + return err + } + + i, pathSegment, currElement, parentTree, found := tree.travel(pathSegments) + + if !found { + if i != len(pathSegments)-1 { + return fmt.Errorf("parent %v does not exist", strings.Join(pathSegments[:i+1], ".")) + } + + if currElement == nil { + parentTree = tree + } else { + if currElement.Type != nil { + return fmt.Errorf("parent %v is not group element", strings.Join(pathSegments[:i], ".")) + } + + if currElement.Children == nil { + currElement.Children = NewTree() + } + parentTree = currElement.Children + } + + parentTree.keys = append(parentTree.keys, pathSegment) + } + + parentTree.schemaMap[pathSegment] = element + return nil +} + +// Delete - deletes name and its element. +func (tree *Tree) Delete(name string) { + if tree.readOnly { + panic(fmt.Errorf("read only tree")) + } + + pathSegments := strings.Split(name, ".") + + _, pathSegment, _, parentTree, found := tree.travel(pathSegments) + + if found { + for i := range parentTree.keys { + if parentTree.keys[i] == pathSegment { + copy(parentTree.keys[i:], parentTree.keys[i+1:]) + parentTree.keys = parentTree.keys[:len(parentTree.keys)-1] + break + } + } + + delete(parentTree.schemaMap, pathSegment) + } +} + +// Range - calls f sequentially for each name and its element. If f returns false, range stops the iteration. +func (tree *Tree) Range(f func(name string, element *Element) bool) { + for _, name := range tree.keys { + if !f(name, tree.schemaMap[name]) { + break + } + } +} + +// ToParquetSchema - returns list of parquet SchemaElement and list of elements those stores values. +func (tree *Tree) ToParquetSchema() (schemaList []*parquet.SchemaElement, valueElements []*Element, err error) { + if tree.readOnly { + return nil, nil, fmt.Errorf("read only tree") + } + + updateMaxDLRL(tree.schemaMap, 0, 0) + + var schemaElements []*parquet.SchemaElement + if err = toParquetSchema(tree, "", "", &schemaElements, &valueElements); err != nil { + return nil, nil, err + } + + tree.readOnly = true + + numChildren := int32(len(tree.keys)) + schemaList = append(schemaList, &parquet.SchemaElement{ + Name: "schema", + RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), + NumChildren: &numChildren, + }) + schemaList = append(schemaList, schemaElements...) + return schemaList, valueElements, nil +} + +// NewTree - creates new schema tree. +func NewTree() *Tree { + return &Tree{ + schemaMap: make(map[string]*Element), + } +} diff --git a/pkg/s3select/internal/parquet-go/schema/tree_test.go b/pkg/s3select/internal/parquet-go/schema/tree_test.go new file mode 100644 index 000000000..ede461156 --- /dev/null +++ b/pkg/s3select/internal/parquet-go/schema/tree_test.go @@ -0,0 +1,1092 @@ +/* + * Minio Cloud Storage, (C) 2019 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/pkg/s3select/internal/parquet-go/schema/tree_test.go b/pkg/s3select/internal/parquet-go/schema/tree_test.go
new file mode 100644
index 000000000..ede461156
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/schema/tree_test.go
@@ -0,0 +1,1092 @@
+/*
+ * Minio Cloud Storage, (C) 2019 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package schema
+
+import (
+	"testing"
+
+	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
+)
+
+func TestTreeSet(t *testing.T) {
+	a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	b, err := NewElement("b", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, err := NewElement("c", parquet.FieldRepetitionType_OPTIONAL,
+		parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
+		nil, nil, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	testCases := []struct {
+		name      string
+		element   *Element
+		expectErr bool
+	}{
+		{"A", a, false},
+		{"A.B", b, false},
+		{"A.B.C", c, false},
+		{"B.C", nil, true},      // error: parent B does not exist
+		{"A.B.C.AA", nil, true}, // error: parent A.B.C is not a group element
+	}
+
+	root := NewTree()
+	for i, testCase := range testCases {
+		err := root.Set(testCase.name, testCase.element)
+		expectErr := (err != nil)
+
+		if expectErr != testCase.expectErr {
+			if testCase.expectErr {
+				t.Fatalf("case %v: err: expected: <error>, got: <nil>", i+1)
+			} else {
+				t.Fatalf("case %v: err: expected: <nil>, got: %v", i+1, err)
+			}
+		}
+	}
+}
+
+func TestTreeGet(t *testing.T) {
+	a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	b, err := NewElement("b", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, err := NewElement("c", parquet.FieldRepetitionType_OPTIONAL,
+		parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
+		nil, nil, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	root := NewTree()
+	if err := root.Set("A", a); err != nil {
+		t.Fatal(err)
+	}
+	if err := root.Set("A.B", b); err != nil {
+		t.Fatal(err)
+	}
+	if err := root.Set("A.B.C", c); err != nil {
+		t.Fatal(err)
+	}
+
+	testCases := []struct {
+		name            string
+		expectedElement *Element
+		expectedFound   bool
+	}{
+		{"A", a, true},
+		{"A.B", b, true},
+		{"A.B.C", c, true},
+		{"B", nil, false},
+		{"A.B.C.AA", nil, false},
+	}
+
+	for i, testCase := range testCases {
+		element, found := root.Get(testCase.name)
+
+		if element != testCase.expectedElement {
+			t.Fatalf("case %v: element: expected: %v, got: %v", i+1, testCase.expectedElement, element)
+		}
+
+		if found != testCase.expectedFound {
+			t.Fatalf("case %v: found: expected: %v, got: %v", i+1, testCase.expectedFound, found)
+		}
+	}
+}
+
+func TestTreeDelete(t *testing.T) {
+	testCases := []struct {
+		name          string
+		expectedFound bool
+	}{
+		{"A", false},
+		{"A.B", false},
+		{"A.B.C", false},
+	}
+
+	for i, testCase := range testCases {
+		a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+		if err != nil {
+			t.Fatalf("case %v: %v", i+1, err)
+		}
+
+		b, err := NewElement("b", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+		if err != nil {
+			t.Fatalf("case %v: %v", i+1, err)
+		}
+
+		c, err := NewElement("c", parquet.FieldRepetitionType_OPTIONAL,
+			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatalf("case %v: %v", i+1, err)
+		}
+
+		root := NewTree()
+		if err := root.Set("A", a); err != nil {
+			t.Fatalf("case %v: %v", i+1, err)
+		}
+		if err := root.Set("A.B", b); err != nil {
+			t.Fatalf("case %v: %v", i+1, err)
+		}
+		if err := root.Set("A.B.C", c); err != nil {
+			t.Fatalf("case %v: %v", i+1, err)
+		}
+
+		root.Delete(testCase.name)
+		_, found := root.Get(testCase.name)
+
+		if found != testCase.expectedFound {
+			t.Fatalf("case %v: found: expected: %v, got: %v", i+1, testCase.expectedFound, found)
+		}
+	}
+}
+
+func TestTreeToParquetSchema(t *testing.T) {
+	case1Root := NewTree()
+	{
+		a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case1Root.Set("A", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case2Root := NewTree()
+	{
+		a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL, nil, parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8), nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case2Root.Set("A", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case3Root := NewTree()
+	{
+		a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL, nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP_KEY_VALUE), nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case3Root.Set("A", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case4Root := NewTree()
+	{
+		a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if err := case4Root.Set("A", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case5Root := NewTree()
+	{
+		a, err := NewElement("a", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+		b, err := NewElement("b", parquet.FieldRepetitionType_OPTIONAL, nil, nil, nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+		c, err := NewElement("c", parquet.FieldRepetitionType_OPTIONAL,
+			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if err := case5Root.Set("A", a); err != nil {
+			t.Fatal(err)
+		}
+		if err := case5Root.Set("A.B", b); err != nil {
+			t.Fatal(err)
+		}
+		if err := case5Root.Set("A.B.C", c); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	testCases := []struct {
+		tree      *Tree
+		expectErr bool
+	}{
+		{case1Root, true}, // err: A: group element must have children
+		{case2Root, true}, // err: A: ConvertedType INT_8 must have Type value
+		{case3Root, true}, // err: A: unsupported ConvertedType MAP_KEY_VALUE
+		{case4Root, false},
+		{case5Root, false},
+	}
+
+	for i, testCase := range testCases {
+		_, _, err := testCase.tree.ToParquetSchema()
+		expectErr := (err != nil)
+
+		if expectErr != testCase.expectErr {
+			if testCase.expectErr {
+				t.Fatalf("case %v: err: expected: <error>, got: <nil>", i+1)
+			} else {
+				t.Fatalf("case %v: err: expected: <nil>, got: %v", i+1, err)
+			}
+		}
+	}
+}
+
+func TestTreeToParquetSchemaOfList(t *testing.T) {
+	case1Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case1Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case2Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case2Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case3Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		a, err := NewElement("a", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case3Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case3Root.Set("Names.a", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case4Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := NewElement("LIST", parquet.FieldRepetitionType_REQUIRED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case4Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case4Root.Set("Names.list", list); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case5Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := NewElement("list", parquet.FieldRepetitionType_REQUIRED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case5Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case5Root.Set("Names.list", list); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case6Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case6Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case6Root.Set("Names.list", list); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case7Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		a, err := NewElement("a", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("Names.list", list); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("Names.list.a", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case8Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		element, err := NewElement("element", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		a, err := NewElement("a", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case8Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case8Root.Set("Names.list", list); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case8Root.Set("Names.list.element", element); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case8Root.Set("Names.list.a", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case9Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		element, err := NewElement("ELEMENT", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case9Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case9Root.Set("Names.list", list); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case9Root.Set("Names.list.element", element); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case10Root := NewTree()
+	{
+		names, err := NewElement("names", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		list, err := NewElement("list", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		element, err := NewElement("element", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case10Root.Set("Names", names); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case10Root.Set("Names.list", list); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case10Root.Set("Names.list.element", element); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	testCases := []struct {
+		tree      *Tree
+		expectErr bool
+	}{
+		{case1Root, true}, // err: Names: type must be nil for LIST ConvertedType
+		{case2Root, true}, // err: Names: children must have one element only for LIST ConvertedType
+		{case3Root, true}, // err: Names: missing group element 'list' for LIST ConvertedType
+		{case4Root, true}, // err: Names.list: name must be 'list'
+		{case5Root, true}, // err: Names.list: repetition type must be REPEATED type
+		{case6Root, true}, // err: Names.list.element: not found
+		{case7Root, true}, // err: Names.list.element: not found
+		{case8Root, true}, // err: Names.list.element: not found
+		{case9Root, true}, // err: Names.list.element: name must be 'element'
+		{case10Root, false},
+	}
+
+	for i, testCase := range testCases {
+		_, _, err := testCase.tree.ToParquetSchema()
+		expectErr := (err != nil)
+
+		if expectErr != testCase.expectErr {
+			if testCase.expectErr {
+				t.Fatalf("case %v: err: expected: <error>, got: <nil>", i+1)
+			} else {
+				t.Fatalf("case %v: err: expected: <nil>, got: %v", i+1, err)
+			}
+		}
+	}
+}
+
+func TestTreeToParquetSchemaOfMap(t *testing.T) {
+	case1Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case1Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case2Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case2Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case3Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		a, err := NewElement("a", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case3Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case3Root.Set("NameMap.a", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case4Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("keyValue", parquet.FieldRepetitionType_REQUIRED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case4Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case4Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case5Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REQUIRED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case5Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case5Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case6Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case6Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case6Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case7Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		a, err := NewElement("a", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		b, err := NewElement("b", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		c, err := NewElement("c", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("NameMap.key_value.a", a); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("NameMap.key_value.b", b); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case7Root.Set("NameMap.key_value.c", c); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case8Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		a, err := NewElement("a", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case8Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case8Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case8Root.Set("NameMap.key_value.a", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case9Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		key, err := NewElement("KEY", parquet.FieldRepetitionType_OPTIONAL,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case9Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case9Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case9Root.Set("NameMap.key_value.key", key); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case10Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		key, err := NewElement("key", parquet.FieldRepetitionType_OPTIONAL,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case10Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case10Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case10Root.Set("NameMap.key_value.key", key); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case11Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		key, err := NewElement("key", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		a, err := NewElement("a", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case11Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case11Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case11Root.Set("NameMap.key_value.key", key); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case11Root.Set("NameMap.key_value.a", a); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case12Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		key, err := NewElement("key", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		value, err := NewElement("VALUE", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case12Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case12Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case12Root.Set("NameMap.key_value.key", key); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case12Root.Set("NameMap.key_value.value", value); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case13Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		key, err := NewElement("key", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case13Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case13Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case13Root.Set("NameMap.key_value.key", key); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	case14Root := NewTree()
+	{
+		nameMap, err := NewElement("nameMap", parquet.FieldRepetitionType_REQUIRED,
+			nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		keyValue, err := NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
+			nil, nil,
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		key, err := NewElement("key", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		value, err := NewElement("value", parquet.FieldRepetitionType_REQUIRED,
+			parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
+			nil, nil, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case14Root.Set("NameMap", nameMap); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case14Root.Set("NameMap.key_value", keyValue); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case14Root.Set("NameMap.key_value.key", key); err != nil {
+			t.Fatal(err)
+		}
+
+		if err := case14Root.Set("NameMap.key_value.value", value); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	testCases := []struct {
+		tree      *Tree
+		expectErr bool
+	}{
+		{case1Root, true},  // err: NameMap: type must be nil for MAP ConvertedType
+		{case2Root, true},  // err: NameMap: children must have one element only for MAP ConvertedType
+		{case3Root, true},  // err: NameMap: missing group element 'key_value' for MAP ConvertedType
+		{case4Root, true},  // err: NameMap.key_value: name must be 'key_value'
+		{case5Root, true},  // err: NameMap.key_value: repetition type must be REPEATED type
+		{case6Root, true},  // err: NameMap.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType
+		{case7Root, true},  // err: NameMap.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType
+		{case8Root, true},  // err: NameMap.key_value: missing 'key' element for MAP ConvertedType
+		{case9Root, true},  // err: NameMap.key_value.key: name must be 'key'
+		{case10Root, true}, // err: NameMap.key_value.key: repetition type must be REQUIRED type
+		{case11Root, true}, // err: NameMap.key_value: second element must be 'value' element for MAP ConvertedType
+		{case12Root, true}, // err: NameMap.key_value.value: name must be 'value'
+		{case13Root, false},
+		{case14Root, false},
+	}
+
+	for i, testCase := range testCases {
+		_, _, err := testCase.tree.ToParquetSchema()
+		expectErr := (err != nil)
+
+		if expectErr != testCase.expectErr {
+			if testCase.expectErr {
+				t.Fatalf("case %v: err: expected: <error>, got: <nil>", i+1)
+			} else {
+				t.Fatalf("case %v: err: expected: <nil>, got: %v", i+1, err)
+			}
+		}
+	}
+}
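The LIST and MAP cases above enumerate the shapes that `ToParquetSchema` rejects. As a quick reference, this sketch builds the one LIST layout that validates (mirroring case10 of the list tests); any deviation in names, repetition types, or child counts fails with the errors noted in the test comments:

```go
package main

import (
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
)

func main() {
	// Three-level LIST shape: Names (LIST) > list (REPEATED) > element (leaf).
	names, err := schema.NewElement("names", parquet.FieldRepetitionType_REQUIRED,
		nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
		nil, nil, nil)
	if err != nil {
		panic(err)
	}
	list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
		nil, nil, nil, nil, nil)
	if err != nil {
		panic(err)
	}
	element, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
		parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_8),
		nil, nil, nil)
	if err != nil {
		panic(err)
	}

	root := schema.NewTree()
	if err = root.Set("Names", names); err != nil {
		panic(err)
	}
	if err = root.Set("Names.list", list); err != nil {
		panic(err)
	}
	if err = root.Set("Names.list.element", element); err != nil {
		panic(err)
	}

	// Succeeds: the group under a LIST must be named "list", be REPEATED,
	// and contain exactly one child named "element".
	if _, _, err = root.ToParquetSchema(); err != nil {
		panic(err)
	}
}
```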
diff --git a/pkg/s3select/internal/parquet-go/table.go b/pkg/s3select/internal/parquet-go/table.go
new file mode 100644
index 000000000..52158aa50
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/table.go
@@ -0,0 +1,100 @@
+/*
+ * Minio Cloud Storage, (C) 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package parquet
+
+import "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
+
+func getTableValues(values interface{}, valueType parquet.Type) (tableValues []interface{}) {
+	return valuesToInterfaces(values, valueType)
+}
+
+type table struct {
+	RepetitionType     parquet.FieldRepetitionType
+	Type               parquet.Type
+	MaxDefinitionLevel int32
+	MaxRepetitionLevel int32
+	Path               []string      // Path of this column
+	Values             []interface{} // Parquet values
+	DefinitionLevels   []int32       // Definition Levels slice
+	RepetitionLevels   []int32       // Repetition Levels slice
+	ConvertedType      parquet.ConvertedType
+	Encoding           parquet.Encoding
+	BitWidth           int32
+}
+
+func newTableFromTable(srcTable *table) *table {
+	if srcTable == nil {
+		return nil
+	}
+
+	return &table{
+		Type: srcTable.Type,
+		Path: append([]string{}, srcTable.Path...),
+	}
+}
+
+func (table *table) Merge(tables ...*table) {
+	for i := 0; i < len(tables); i++ {
+		if tables[i] == nil {
+			continue
+		}
+
+		table.Values = append(table.Values, tables[i].Values...)
+		table.RepetitionLevels = append(table.RepetitionLevels, tables[i].RepetitionLevels...)
+		table.DefinitionLevels = append(table.DefinitionLevels, tables[i].DefinitionLevels...)
+
+		if table.MaxDefinitionLevel < tables[i].MaxDefinitionLevel {
+			table.MaxDefinitionLevel = tables[i].MaxDefinitionLevel
+		}
+
+		if table.MaxRepetitionLevel < tables[i].MaxRepetitionLevel {
+			table.MaxRepetitionLevel = tables[i].MaxRepetitionLevel
+		}
+	}
+}
+
+func (table *table) Pop(numRows int64) *table {
+	result := newTableFromTable(table)
+	var i, num int64
+	for i = int64(0); i < int64(len(table.Values)); i++ {
+		if table.RepetitionLevels[i] == 0 {
+			if num >= numRows {
+				break
+			}
+
+			num++
+		}
+
+		if result.MaxRepetitionLevel < table.RepetitionLevels[i] {
+			result.MaxRepetitionLevel = table.RepetitionLevels[i]
+		}
+
+		if result.MaxDefinitionLevel < table.DefinitionLevels[i] {
+			result.MaxDefinitionLevel = table.DefinitionLevels[i]
+		}
+	}
+
+	result.RepetitionLevels = table.RepetitionLevels[:i]
+	result.DefinitionLevels = table.DefinitionLevels[:i]
+	result.Values = table.Values[:i]
+
+	table.RepetitionLevels = table.RepetitionLevels[i:]
+	table.DefinitionLevels = table.DefinitionLevels[i:]
+	table.Values = table.Values[i:]
+
+	return result
+}
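`Pop` splits a column at record boundaries rather than value boundaries: a repetition level of 0 marks the start of a new record, so popping one row can take several values. A sketch of that behavior (this would have to live inside `package parquet`, since `table` is unexported):

```go
package parquet

// popOneRecord illustrates table.Pop: with values ["a","b","c"] and
// repetition levels [0,1,0], the first record is ["a","b"] and the
// second is ["c"]. Pop(1) therefore takes two values, not one.
func popOneRecord() (first, rest []interface{}) {
	t := &table{
		Values:           []interface{}{"a", "b", "c"},
		RepetitionLevels: []int32{0, 1, 0}, // 0 marks the start of a record
		DefinitionLevels: []int32{1, 1, 1},
	}

	popped := t.Pop(1)
	return popped.Values, t.Values // ["a" "b"], ["c"]
}
```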
diff --git a/pkg/s3select/internal/parquet-go/test.parquet b/pkg/s3select/internal/parquet-go/test.parquet
new file mode 100644
index 000000000..f9e319f39
Binary files /dev/null and b/pkg/s3select/internal/parquet-go/test.parquet differ
diff --git a/pkg/s3select/internal/parquet-go/tools/parquet2csv/parquet2csv.go b/pkg/s3select/internal/parquet-go/tools/parquet2csv/parquet2csv.go
new file mode 100644
index 000000000..b112d9310
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/tools/parquet2csv/parquet2csv.go
@@ -0,0 +1,146 @@
+/*
+ * Minio Cloud Storage, (C) 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"encoding/csv"
+	"fmt"
+	"io"
+	"os"
+	"path"
+	"strings"
+
+	"github.com/minio/minio-go/v6/pkg/set"
+	parquet "github.com/minio/minio/pkg/s3select/internal/parquet-go"
+)
+
+func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
+	file, err := os.Open(name)
+	if err != nil {
+		return nil, err
+	}
+
+	fi, err := file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	if offset < 0 {
+		offset = fi.Size() + offset
+	}
+
+	if _, err = file.Seek(offset, io.SeekStart); err != nil {
+		return nil, err
+	}
+
+	return file, nil
+}
+
+func printUsage() {
+	progName := path.Base(os.Args[0])
+	fmt.Printf("usage: %v PARQUET-FILE [COLUMN...]\n", progName)
+	fmt.Println()
+	fmt.Printf("examples:\n")
+	fmt.Printf("# Convert all columns to CSV\n")
+	fmt.Printf("$ %v example.parquet\n", progName)
+	fmt.Println()
+	fmt.Printf("# Convert specific columns to CSV\n")
+	fmt.Printf("$ %v example.par firstname dob\n", progName)
+	fmt.Println()
+}
+
+func main() {
+	if len(os.Args) < 2 {
+		printUsage()
+		os.Exit(-1)
+	}
+
+	name := os.Args[1]
+	ext := path.Ext(name)
+	csvFilename := name + ".csv"
+	if ext == ".parquet" || ext == ".par" {
+		csvFilename = strings.TrimSuffix(name, ext) + ".csv"
+	}
+
+	columns := set.CreateStringSet(os.Args[2:]...)
+	if len(columns) == 0 {
+		columns = nil
+	}
+
+	file, err := parquet.NewReader(
+		func(offset, length int64) (io.ReadCloser, error) {
+			return getReader(name, offset, length)
+		},
+		columns,
+	)
+	if err != nil {
+		fmt.Printf("%v: %v\n", name, err)
+		os.Exit(1)
+	}
+
+	defer file.Close()
+
+	csvFile, err := os.OpenFile(csvFilename, os.O_RDWR|os.O_CREATE, 0755)
+	if err != nil {
+		fmt.Printf("%v: %v\n", csvFilename, err)
+		os.Exit(1)
+	}
+
+	defer csvFile.Close()
+
+	csvWriter := csv.NewWriter(csvFile)
+	defer csvWriter.Flush()
+
+	headerWritten := false
+	for {
+		record, err := file.Read()
+		if err != nil {
+			if err != io.EOF {
+				fmt.Printf("%v: %v\n", name, err)
+				os.Exit(1)
+			}
+
+			break
+		}
+
+		if !headerWritten {
+			var csvRecord []string
+			record.Range(func(name string, value parquet.Value) bool {
+				csvRecord = append(csvRecord, name)
+				return true
+			})
+
+			if err = csvWriter.Write(csvRecord); err != nil {
+				fmt.Printf("%v: %v\n", csvFilename, err)
+				os.Exit(1)
+			}
+
+			headerWritten = true
+		}
+
+		var csvRecord []string
+		record.Range(func(name string, value parquet.Value) bool {
+			csvRecord = append(csvRecord, fmt.Sprintf("%v", value.Value))
+			return true
+		})
+
+		if err = csvWriter.Write(csvRecord); err != nil {
+			fmt.Printf("%v: %v\n", csvFilename, err)
+			os.Exit(1)
+		}
+	}
+}
diff --git a/pkg/s3select/internal/parquet-go/tools/parquet2json/parquet2json.go b/pkg/s3select/internal/parquet-go/tools/parquet2json/parquet2json.go
new file mode 100644
index 000000000..bdd0fb941
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/tools/parquet2json/parquet2json.go
@@ -0,0 +1,128 @@
+/*
+ * Minio Cloud Storage, (C) 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"path"
+	"strings"
+
+	"github.com/minio/minio-go/v6/pkg/set"
+	parquet "github.com/minio/minio/pkg/s3select/internal/parquet-go"
+)
+
+func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
+	file, err := os.Open(name)
+	if err != nil {
+		return nil, err
+	}
+
+	fi, err := file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	if offset < 0 {
+		offset = fi.Size() + offset
+	}
+
+	if _, err = file.Seek(offset, io.SeekStart); err != nil {
+		return nil, err
+	}
+
+	return file, nil
+}
+
+func printUsage() {
+	progName := path.Base(os.Args[0])
+	fmt.Printf("Usage: %v PARQUET-FILE [COLUMN...]\n", progName)
+	fmt.Println()
+	fmt.Printf("Examples:\n")
+	fmt.Printf("# Convert all columns to JSON\n")
+	fmt.Printf("$ %v example.parquet\n", progName)
+	fmt.Println()
+	fmt.Printf("# Convert specific columns to JSON\n")
+	fmt.Printf("$ %v example.par firstname dob\n", progName)
+	fmt.Println()
+}
+
+func main() {
+	if len(os.Args) < 2 {
+		printUsage()
+		os.Exit(-1)
+	}
+
+	name := os.Args[1]
+	ext := path.Ext(name)
+	jsonFilename := name + ".json"
+	if ext == ".parquet" || ext == ".par" {
+		jsonFilename = strings.TrimSuffix(name, ext) + ".json"
+	}
+
+	columns := set.CreateStringSet(os.Args[2:]...)
+	if len(columns) == 0 {
+		columns = nil
+	}
+
+	file, err := parquet.NewReader(
+		func(offset, length int64) (io.ReadCloser, error) {
+			return getReader(name, offset, length)
+		},
+		columns,
+	)
+	if err != nil {
+		fmt.Printf("%v: %v\n", name, err)
+		os.Exit(1)
+	}
+
+	defer file.Close()
+
+	jsonFile, err := os.OpenFile(jsonFilename, os.O_RDWR|os.O_CREATE, 0755)
+	if err != nil {
+		fmt.Printf("%v: %v\n", jsonFilename, err)
+		os.Exit(1)
+	}
+
+	defer jsonFile.Close()
+
+	for {
+		record, err := file.Read()
+		if err != nil {
+			if err != io.EOF {
+				fmt.Printf("%v: %v\n", name, err)
+				os.Exit(1)
+			}
+
+			break
+		}
+
+		data, err := json.Marshal(record)
+		if err != nil {
+			fmt.Printf("%v: %v\n", name, err)
+			os.Exit(1)
+		}
+		data = append(data, byte('\n'))
+
+		if _, err = jsonFile.Write(data); err != nil {
+			fmt.Printf("%v: %v\n", jsonFilename, err)
+			os.Exit(1)
+		}
+	}
+}
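Both converters hand `parquet.NewReader` the same `getReader` callback. The contract worth noting is the negative offset: the reader requests bytes relative to the end of the file when locating the footer, and `getReader` maps that to `fi.Size() + offset`. An in-memory adapter honoring the same contract, as a sketch (`memReader` is a name introduced here; `length` is ignored, just as in `getReader`):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
)

// memReader adapts an in-memory parquet file to the callback contract used
// above: a negative offset is interpreted as relative to the end of the data.
func memReader(data []byte) func(offset, length int64) (io.ReadCloser, error) {
	return func(offset, length int64) (io.ReadCloser, error) {
		if offset < 0 {
			offset = int64(len(data)) + offset
		}
		return ioutil.NopCloser(bytes.NewReader(data[offset:])), nil
	}
}

func main() {
	read := memReader([]byte("PAR1 ... body and footer ... PAR1"))
	rc, _ := read(-4, 4) // negative offsets are how the trailer is located
	magic, _ := ioutil.ReadAll(rc)
	rc.Close()
	fmt.Println(string(magic)) // PAR1
}
```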
diff --git a/pkg/s3select/internal/parquet-go/writer.go b/pkg/s3select/internal/parquet-go/writer.go
new file mode 100644
index 000000000..7d1fbdd4b
--- /dev/null
+++ b/pkg/s3select/internal/parquet-go/writer.go
@@ -0,0 +1,191 @@
+/*
+ * Minio Cloud Storage, (C) 2019 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package parquet
+
+import (
+	"context"
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"git.apache.org/thrift.git/lib/go/thrift"
+	"github.com/minio/minio/pkg/s3select/internal/parquet-go/data"
+	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
+	"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
+)
+
+const (
+	defaultPageSize     = 8 * 1024          // 8 KiB
+	defaultRowGroupSize = 128 * 1024 * 1024 // 128 MiB
+)
+
+// Writer - represents a parquet writer.
+type Writer struct {
+	PageSize        int64
+	RowGroupSize    int64
+	CompressionType parquet.CompressionCodec
+
+	writeCloser   io.WriteCloser
+	numRows       int64
+	offset        int64
+	footer        *parquet.FileMetaData
+	schemaTree    *schema.Tree
+	valueElements []*schema.Element
+	columnDataMap map[string]*data.Column
+	rowGroupCount int
+}
+
+func (writer *Writer) writeData() (err error) {
+	if writer.numRows == 0 {
+		return nil
+	}
+
+	var chunks []*data.ColumnChunk
+	for _, element := range writer.valueElements {
+		name := element.PathInTree
+		columnData, found := writer.columnDataMap[name]
+		if !found {
+			continue
+		}
+
+		columnChunk := columnData.Encode(element)
+		chunks = append(chunks, columnChunk)
+	}
+
+	rowGroup := data.NewRowGroup(chunks, writer.numRows, writer.offset)
+
+	for _, chunk := range chunks {
+		if _, err = writer.writeCloser.Write(chunk.Data()); err != nil {
+			return err
+		}
+
+		writer.offset += chunk.DataLen()
+	}
+
+	writer.footer.RowGroups = append(writer.footer.RowGroups, rowGroup)
+	writer.footer.NumRows += writer.numRows
+
+	writer.numRows = 0
+	writer.columnDataMap = nil
+	return nil
+}
+
+// WriteJSON - writes a record represented in JSON.
+func (writer *Writer) WriteJSON(recordData []byte) (err error) {
+	columnDataMap, err := data.UnmarshalJSON(recordData, writer.schemaTree)
+	if err != nil {
+		return err
+	}
+
+	return writer.Write(columnDataMap)
+}
+
+// Write - writes a record represented as a map.
+func (writer *Writer) Write(record map[string]*data.Column) (err error) {
+	if writer.columnDataMap == nil {
+		writer.columnDataMap = record
+	} else {
+		for name, columnData := range record {
+			var found bool
+			var element *schema.Element
+			for _, element = range writer.valueElements {
+				if element.PathInTree == name {
+					found = true
+					break
+				}
+			}
+
+			if !found {
+				return fmt.Errorf("%v is not a value column", name)
+			}
+
+			writer.columnDataMap[name].Merge(columnData)
+		}
+	}
+
+	writer.numRows++
+	if writer.numRows == int64(writer.rowGroupCount) {
+		return writer.writeData()
+	}
+
+	return nil
+}
+
+func (writer *Writer) finalize() (err error) {
+	if err = writer.writeData(); err != nil {
+		return err
+	}
+
+	ts := thrift.NewTSerializer()
+	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
+	footerBuf, err := ts.Write(context.TODO(), writer.footer)
+	if err != nil {
+		return err
+	}
+
+	if _, err = writer.writeCloser.Write(footerBuf); err != nil {
+		return err
+	}
+
+	footerSizeBuf := make([]byte, 4)
+	binary.LittleEndian.PutUint32(footerSizeBuf, uint32(len(footerBuf)))
+
+	if _, err = writer.writeCloser.Write(footerSizeBuf); err != nil {
+		return err
+	}
+
+	_, err = writer.writeCloser.Write([]byte("PAR1"))
+	return err
+}
+
+// Close - finalizes and closes the writer. If any pending records are available, they are written here.
+func (writer *Writer) Close() (err error) {
+	if err = writer.finalize(); err != nil {
+		return err
+	}
+
+	return writer.writeCloser.Close()
+}
+
+// NewWriter - creates a new parquet writer. Binary data of rowGroupCount records is written to writeCloser.
+func NewWriter(writeCloser io.WriteCloser, schemaTree *schema.Tree, rowGroupCount int) (*Writer, error) {
+	if _, err := writeCloser.Write([]byte("PAR1")); err != nil {
+		return nil, err
+	}
+
+	schemaList, valueElements, err := schemaTree.ToParquetSchema()
+	if err != nil {
+		return nil, err
+	}
+
+	footer := parquet.NewFileMetaData()
+	footer.Version = 1
+	footer.Schema = schemaList
+
+	return &Writer{
+		PageSize:        defaultPageSize,
+		RowGroupSize:    defaultRowGroupSize,
+		CompressionType: parquet.CompressionCodec_SNAPPY,
+
+		writeCloser:   writeCloser,
+		offset:        4,
+		footer:        footer,
+		schemaTree:    schemaTree,
+		valueElements: valueElements,
+		rowGroupCount: rowGroupCount,
+	}, nil
+}
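`NewWriter` writes the 4-byte "PAR1" magic up front, and `finalize` ends the file with the thrift-serialized footer, its 4-byte little-endian length, and the magic again, which is the standard parquet trailer layout. A small sketch that checks a written file's trailer and reports the footer size (`footerSize` is a name introduced here for illustration):

```go
package main

import (
	"encoding/binary"
	"fmt"
	"io"
	"os"
)

// footerSize reads the last 8 bytes of a parquet file: a 4-byte
// little-endian footer length followed by the "PAR1" magic.
func footerSize(name string) (uint32, error) {
	file, err := os.Open(name)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	trailer := make([]byte, 8)
	if _, err = file.Seek(-8, io.SeekEnd); err != nil {
		return 0, err
	}
	if _, err = io.ReadFull(file, trailer); err != nil {
		return 0, err
	}

	if string(trailer[4:]) != "PAR1" {
		return 0, fmt.Errorf("%v: missing parquet magic", name)
	}
	return binary.LittleEndian.Uint32(trailer[:4]), nil
}

func main() {
	size, err := footerSize("test.parquet")
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	fmt.Println("footer size:", size, "bytes")
}
```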
schema.NewElement("one", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + two, err := schema.NewElement("two", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), + nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + three, err := schema.NewElement("three", parquet.FieldRepetitionType_REQUIRED, + parquet.TypePtr(parquet.Type_BOOLEAN), nil, nil, nil, nil) + if err != nil { + t.Fatal(err) + } + + if err := schemaTree.Set("one", one); err != nil { + t.Fatal(err) + } + if err := schemaTree.Set("two", two); err != nil { + t.Fatal(err) + } + if err := schemaTree.Set("three", three); err != nil { + t.Fatal(err) + } + } + + file, err := os.Create("test.parquet") + if err != nil { + t.Fatal(err) + } + + writer, err := NewWriter(file, schemaTree, 100) + if err != nil { + t.Fatal(err) + } + + record := `{"one": 100, "two": "foo", "three": true}` + err = writer.WriteJSON([]byte(record)) + if err != nil { + t.Fatal(err) + } + + err = writer.Close() + if err != nil { + t.Fatal(err) + } +} diff --git a/pkg/s3select/parquet/reader.go b/pkg/s3select/parquet/reader.go index c63967bc3..fd7ae1e6b 100644 --- a/pkg/s3select/parquet/reader.go +++ b/pkg/s3select/parquet/reader.go @@ -20,10 +20,10 @@ import ( "io" "github.com/bcicen/jstream" + parquetgo "github.com/minio/minio/pkg/s3select/internal/parquet-go" + parquetgen "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" jsonfmt "github.com/minio/minio/pkg/s3select/json" "github.com/minio/minio/pkg/s3select/sql" - parquetgo "github.com/minio/parquet-go" - parquetgen "github.com/minio/parquet-go/gen-go/parquet" ) // Reader - Parquet record reader for S3Select.