mirror of
https://github.com/minio/minio.git
synced 2025-01-25 21:53:16 -05:00
move parquet-go to github.com/minio/parquet-go repo
This commit is contained in:
parent
6c8fddb70f
commit
e948e7cdf6
208
CREDITS
208
CREDITS
@ -10733,6 +10733,214 @@ https://github.com/minio/minio-go/v7
|
||||
|
||||
================================================================
|
||||
|
||||
github.com/minio/parquet-go
|
||||
https://github.com/minio/parquet-go
|
||||
----------------------------------------------------------------
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
================================================================
|
||||
|
||||
github.com/minio/rpc
|
||||
https://github.com/minio/rpc
|
||||
----------------------------------------------------------------
|
||||
|
7
go.mod
7
go.mod
@ -4,7 +4,6 @@ go 1.16
|
||||
|
||||
require (
|
||||
cloud.google.com/go/storage v1.8.0
|
||||
git.apache.org/thrift.git v0.13.0
|
||||
github.com/Azure/azure-pipeline-go v0.2.2
|
||||
github.com/Azure/azure-storage-blob-go v0.10.0
|
||||
github.com/Azure/go-autorest/autorest/adal v0.9.1 // indirect
|
||||
@ -30,7 +29,6 @@ require (
|
||||
github.com/fatih/structs v1.1.0
|
||||
github.com/go-ldap/ldap/v3 v3.2.4
|
||||
github.com/go-sql-driver/mysql v1.5.0
|
||||
github.com/golang/snappy v0.0.3
|
||||
github.com/gomodule/redigo v2.0.0+incompatible
|
||||
github.com/google/martian v2.1.1-0.20190517191504-25dcb96d9e51+incompatible // indirect
|
||||
github.com/google/uuid v1.1.2
|
||||
@ -55,6 +53,7 @@ require (
|
||||
github.com/minio/highwayhash v1.0.2
|
||||
github.com/minio/md5-simd v1.1.1 // indirect
|
||||
github.com/minio/minio-go/v7 v7.0.11-0.20210302210017-6ae69c73ce78
|
||||
github.com/minio/parquet-go v1.0.0
|
||||
github.com/minio/rpc v1.0.0
|
||||
github.com/minio/selfupdate v0.3.1
|
||||
github.com/minio/sha256-simd v1.0.0
|
||||
@ -71,7 +70,7 @@ require (
|
||||
github.com/nsqio/go-nsq v1.0.8
|
||||
github.com/olivere/elastic/v7 v7.0.22
|
||||
github.com/philhofer/fwd v1.1.1
|
||||
github.com/pierrec/lz4 v2.5.2+incompatible
|
||||
github.com/pierrec/lz4 v2.6.0+incompatible
|
||||
github.com/pkg/errors v0.9.1
|
||||
github.com/prometheus/client_golang v1.8.0
|
||||
github.com/prometheus/client_model v0.2.0
|
||||
@ -83,8 +82,6 @@ require (
|
||||
github.com/spaolacci/murmur3 v1.1.0 // indirect
|
||||
github.com/spf13/pflag v1.0.5 // indirect
|
||||
github.com/streadway/amqp v1.0.0
|
||||
github.com/tidwall/gjson v1.6.8
|
||||
github.com/tidwall/sjson v1.0.4
|
||||
github.com/tinylib/msgp v1.1.3
|
||||
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a
|
||||
github.com/willf/bitset v1.1.11 // indirect
|
||||
|
25
go.sum
25
go.sum
@ -180,8 +180,9 @@ github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8
|
||||
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
|
||||
github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4=
|
||||
github.com/franela/goreq v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20=
|
||||
github.com/frankban/quicktest v1.10.2 h1:19ARM85nVi4xH7xPXuc5eM/udya5ieh7b/Sv+d844Tk=
|
||||
github.com/frankban/quicktest v1.10.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
||||
github.com/frankban/quicktest v1.12.1 h1:P6vQcHwZYgVGIpUzKB5DXzkEeYJppJOStPLuh9aB89c=
|
||||
github.com/frankban/quicktest v1.12.1/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
||||
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
|
||||
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
|
||||
github.com/go-asn1-ber/asn1-ber v1.5.1 h1:pDbRAunXzIUXfx4CB2QJFv5IuPiuoW+sWvr/Us009o8=
|
||||
@ -251,8 +252,9 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw
|
||||
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
|
||||
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M=
|
||||
github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
|
||||
github.com/google/martian v2.1.1-0.20190517191504-25dcb96d9e51+incompatible h1:xmapqc1AyLoB+ddYT6r04bD9lIjlOqGaREovi0SzFaE=
|
||||
@ -443,8 +445,11 @@ github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLT
|
||||
github.com/minio/md5-simd v1.1.0/go.mod h1:XpBqgZULrMYD3R+M28PcmP0CkI7PEMzB3U77ZrKZ0Gw=
|
||||
github.com/minio/md5-simd v1.1.1 h1:9ojcLbuZ4gXbB2sX53MKn8JUZ0sB/2wfwsEcRw+I08U=
|
||||
github.com/minio/md5-simd v1.1.1/go.mod h1:XpBqgZULrMYD3R+M28PcmP0CkI7PEMzB3U77ZrKZ0Gw=
|
||||
github.com/minio/minio-go/v7 v7.0.10/go.mod h1:td4gW1ldOsj1PbSNS+WYK43j+P1XVhX/8W8awaYlBFo=
|
||||
github.com/minio/minio-go/v7 v7.0.11-0.20210302210017-6ae69c73ce78 h1:v7OMbUnWkyRlO2MZ5AuYioELhwXF/BgZEznrQ1drBEM=
|
||||
github.com/minio/minio-go/v7 v7.0.11-0.20210302210017-6ae69c73ce78/go.mod h1:mTh2uJuAbEqdhMVl6CMIIZLUeiMiWtJR4JB8/5g2skw=
|
||||
github.com/minio/parquet-go v1.0.0 h1:fcWsEvub04Nsl/4hiRBDWlbqd6jhacQieV07a+nhiIk=
|
||||
github.com/minio/parquet-go v1.0.0/go.mod h1:aQlkSOfOq2AtQKkuou3mosNVMwNokd+faTacxxk/oHA=
|
||||
github.com/minio/rpc v1.0.0 h1:tJCHyLfQF6k6HlMQFpKy2FO/7lc2WP8gLDGMZp18E70=
|
||||
github.com/minio/rpc v1.0.0/go.mod h1:b9xqF7J0xeMXr0cM4pnBlP7Te7PDsG5JrRxl5dG6Ldk=
|
||||
github.com/minio/selfupdate v0.3.1 h1:BWEFSNnrZVMUWXbXIgLDNDjbejkmpAmZvy/nCz1HlEs=
|
||||
@ -531,8 +536,9 @@ github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
|
||||
github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
|
||||
github.com/pierrec/lz4 v1.0.2-0.20190131084431-473cd7ce01a1/go.mod h1:3/3N9NVKO0jef7pBehbT1qWhCMrIgbYNnFAZCqQ5LRc=
|
||||
github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
|
||||
github.com/pierrec/lz4 v2.5.2+incompatible h1:WCjObylUIOlKy/+7Abdn34TLIkXiA4UWUMhxq9m9ZXI=
|
||||
github.com/pierrec/lz4 v2.5.2+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
|
||||
github.com/pierrec/lz4 v2.6.0+incompatible h1:Ix9yFKn1nSPBLFl/yZknTp8TU5G4Ps0JDmguYK6iH1A=
|
||||
github.com/pierrec/lz4 v2.6.0+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
|
||||
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
@ -626,14 +632,15 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P
|
||||
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
|
||||
github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
|
||||
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/tidwall/gjson v1.6.8 h1:CTmXMClGYPAmln7652e69B7OLXfTi5ABcPPwjIWUv7w=
|
||||
github.com/tidwall/gjson v1.6.8/go.mod h1:zeFuBCIqD4sN/gmqBzZ4j7Jd6UcA2Fc56x7QFsv+8fI=
|
||||
github.com/tidwall/gjson v1.7.4/go.mod h1:5/xDoumyyDNerp2U36lyolv46b3uF/9Bu6OfyQ9GImk=
|
||||
github.com/tidwall/gjson v1.7.5 h1:zmAN/xmX7OtpAkv4Ovfso60r/BiCi5IErCDYGNJu+uc=
|
||||
github.com/tidwall/gjson v1.7.5/go.mod h1:5/xDoumyyDNerp2U36lyolv46b3uF/9Bu6OfyQ9GImk=
|
||||
github.com/tidwall/match v1.0.3 h1:FQUVvBImDutD8wJLN6c5eMzWtjgONK9MwIBCOrUJKeE=
|
||||
github.com/tidwall/match v1.0.3/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
|
||||
github.com/tidwall/pretty v1.0.2 h1:Z7S3cePv9Jwm1KwS0513MRaoUe3S01WPbLNV40pwWZU=
|
||||
github.com/tidwall/pretty v1.0.2/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
|
||||
github.com/tidwall/sjson v1.0.4 h1:UcdIRXff12Lpnu3OLtZvnc03g4vH2suXDXhBwBqmzYg=
|
||||
github.com/tidwall/sjson v1.0.4/go.mod h1:bURseu1nuBkFpIES5cz6zBtjmYeOQmEESshn7VpF15Y=
|
||||
github.com/tidwall/pretty v1.1.0 h1:K3hMW5epkdAVwibsQEfR/7Zj0Qgt4DxtNumTq/VloO8=
|
||||
github.com/tidwall/pretty v1.1.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
|
||||
github.com/tidwall/sjson v1.1.6 h1:8fDdlahON04OZBlTQCIatW8FstSFJz8oxidj5h0rmSQ=
|
||||
github.com/tidwall/sjson v1.1.6/go.mod h1:KN3FZ7odvXIHPbJdhNorK/M9lWweVUbXsXXhrJ/kGOA=
|
||||
github.com/tinylib/msgp v1.1.3 h1:3giwAkmtaEDLSV0MdO1lDLuPgklgPzmk8H9+So2BVfA=
|
||||
github.com/tinylib/msgp v1.1.3/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
|
||||
github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8 h1:ndzgwNDnKIqyCvHTXaCqh9KlOWKvBry6nuXMJmonVsE=
|
||||
|
@ -1,661 +0,0 @@
|
||||
GNU AFFERO GENERAL PUBLIC LICENSE
|
||||
Version 3, 19 November 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU Affero General Public License is a free, copyleft license for
|
||||
software and other kinds of works, specifically designed to ensure
|
||||
cooperation with the community in the case of network server software.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
our General Public Licenses are intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
Developers that use our General Public Licenses protect your rights
|
||||
with two steps: (1) assert copyright on the software, and (2) offer
|
||||
you this License which gives you legal permission to copy, distribute
|
||||
and/or modify the software.
|
||||
|
||||
A secondary benefit of defending all users' freedom is that
|
||||
improvements made in alternate versions of the program, if they
|
||||
receive widespread use, become available for other developers to
|
||||
incorporate. Many developers of free software are heartened and
|
||||
encouraged by the resulting cooperation. However, in the case of
|
||||
software used on network servers, this result may fail to come about.
|
||||
The GNU General Public License permits making a modified version and
|
||||
letting the public access it on a server without ever releasing its
|
||||
source code to the public.
|
||||
|
||||
The GNU Affero General Public License is designed specifically to
|
||||
ensure that, in such cases, the modified source code becomes available
|
||||
to the community. It requires the operator of a network server to
|
||||
provide the source code of the modified version running there to the
|
||||
users of that server. Therefore, public use of a modified version, on
|
||||
a publicly accessible server, gives the public access to the source
|
||||
code of the modified version.
|
||||
|
||||
An older license, called the Affero General Public License and
|
||||
published by Affero, was designed to accomplish similar goals. This is
|
||||
a different license, not a version of the Affero GPL, but Affero has
|
||||
released a new version of the Affero GPL which permits relicensing under
|
||||
this license.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU Affero General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
@ -1,36 +0,0 @@
|
||||
# Developer checks for the parquet-go package: static analysis followed by
# the unit tests. `make` (or `make check`) runs everything.

GOPATH := $(shell go env GOPATH)

# None of these targets produce a file named after themselves. Declaring them
# phony keeps them runnable even if a file or directory with the same name
# (e.g. "lint" or "check") appears in the tree.
.PHONY: all getdeps vet fmt lint cyclo spelling ineffassign check

all: check

# Install each lint tool only when its binary is missing from GOPATH/bin.
getdeps:
	@if [ ! -f ${GOPATH}/bin/golint ]; then echo "Installing golint" && go get -u golang.org/x/lint/golint; fi
	@if [ ! -f ${GOPATH}/bin/gocyclo ]; then echo "Installing gocyclo" && go get -u github.com/fzipp/gocyclo; fi
	@if [ ! -f ${GOPATH}/bin/misspell ]; then echo "Installing misspell" && go get -u github.com/client9/misspell/cmd/misspell; fi
	@if [ ! -f ${GOPATH}/bin/ineffassign ]; then echo "Installing ineffassign" && go get -u github.com/gordonklaus/ineffassign; fi

vet:
	@echo "Running $@"
	@go vet *.go

# Prints a diff of formatting changes; does not rewrite files.
fmt:
	@echo "Running $@"
	@gofmt -d *.go

lint:
	@echo "Running $@"
	@${GOPATH}/bin/golint -set_exit_status

cyclo:
	@echo "Running $@"
	@${GOPATH}/bin/gocyclo -over 200 .

spelling:
	@echo "Running $@"
	@${GOPATH}/bin/misspell -locale US -error *.go README.md

ineffassign:
	@echo "Running $@"
	@${GOPATH}/bin/ineffassign .

check: getdeps vet fmt lint cyclo spelling ineffassign
	@echo "Running unit tests"
	@go test -tags kqueue ./...
|
@ -1,3 +0,0 @@
|
||||
# parquet-go
|
||||
|
||||
Modified version of https://github.com/xitongsys/parquet-go
|
@ -1,170 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio-go/v7/pkg/set"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func getColumns(
|
||||
rowGroup *parquet.RowGroup,
|
||||
columnNames set.StringSet,
|
||||
schemaElements []*parquet.SchemaElement,
|
||||
getReaderFunc GetReaderFunc,
|
||||
) (nameColumnMap map[string]*column, err error) {
|
||||
nameIndexMap := make(map[string]int)
|
||||
for colIndex, columnChunk := range rowGroup.GetColumns() {
|
||||
meta := columnChunk.GetMetaData()
|
||||
if meta == nil {
|
||||
return nil, errors.New("parquet: column metadata missing")
|
||||
}
|
||||
columnName := strings.Join(meta.GetPathInSchema(), ".")
|
||||
if columnNames != nil && !columnNames.Contains(columnName) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Ignore column spanning into another file.
|
||||
if columnChunk.GetFilePath() != "" {
|
||||
continue
|
||||
}
|
||||
|
||||
offset := meta.GetDataPageOffset()
|
||||
if meta.DictionaryPageOffset != nil {
|
||||
offset = meta.GetDictionaryPageOffset()
|
||||
}
|
||||
|
||||
size := meta.GetTotalCompressedSize()
|
||||
if size < 0 {
|
||||
return nil, errors.New("parquet: negative compressed size")
|
||||
}
|
||||
rc, err := getReaderFunc(offset, size)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
thriftReader := thrift.NewTBufferedTransport(thrift.NewStreamTransportR(rc), int(size))
|
||||
|
||||
if nameColumnMap == nil {
|
||||
nameColumnMap = make(map[string]*column)
|
||||
}
|
||||
var se *parquet.SchemaElement
|
||||
for _, schema := range schemaElements {
|
||||
if schema != nil && schema.Name == columnName {
|
||||
se = schema
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
nameColumnMap[columnName] = &column{
|
||||
name: columnName,
|
||||
metadata: meta,
|
||||
schema: se,
|
||||
schemaElements: schemaElements,
|
||||
rc: rc,
|
||||
thriftReader: thriftReader,
|
||||
valueType: meta.GetType(),
|
||||
}
|
||||
|
||||
// First element of []*parquet.SchemaElement from parquet file metadata is 'schema'
|
||||
// which is always skipped, hence colIndex + 1 is valid.
|
||||
nameIndexMap[columnName] = colIndex + 1
|
||||
}
|
||||
|
||||
for name := range nameColumnMap {
|
||||
nameColumnMap[name].nameIndexMap = nameIndexMap
|
||||
}
|
||||
|
||||
return nameColumnMap, nil
|
||||
}
|
||||
|
||||
type column struct {
|
||||
name string
|
||||
endOfValues bool
|
||||
valueIndex int
|
||||
valueType parquet.Type
|
||||
metadata *parquet.ColumnMetaData
|
||||
schema *parquet.SchemaElement
|
||||
schemaElements []*parquet.SchemaElement
|
||||
nameIndexMap map[string]int
|
||||
dictPage *page
|
||||
dataTable *table
|
||||
rc io.ReadCloser
|
||||
thriftReader *thrift.TBufferedTransport
|
||||
}
|
||||
|
||||
func (column *column) close() (err error) {
|
||||
if column.rc != nil {
|
||||
err = column.rc.Close()
|
||||
column.rc = nil
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (column *column) readPage() {
|
||||
page, _, _, err := readPage(
|
||||
column.thriftReader,
|
||||
column.metadata,
|
||||
column.nameIndexMap,
|
||||
column.schemaElements,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
column.endOfValues = true
|
||||
return
|
||||
}
|
||||
|
||||
if page.Header.GetType() == parquet.PageType_DICTIONARY_PAGE {
|
||||
column.dictPage = page
|
||||
column.readPage()
|
||||
return
|
||||
}
|
||||
|
||||
page.decode(column.dictPage)
|
||||
|
||||
if column.dataTable == nil {
|
||||
column.dataTable = newTableFromTable(page.DataTable)
|
||||
}
|
||||
|
||||
column.dataTable.Merge(page.DataTable)
|
||||
}
|
||||
|
||||
func (column *column) read() (value interface{}, valueType parquet.Type, cnv *parquet.SchemaElement) {
|
||||
if column.dataTable == nil {
|
||||
column.readPage()
|
||||
column.valueIndex = 0
|
||||
}
|
||||
|
||||
if column.endOfValues {
|
||||
return nil, column.metadata.GetType(), column.schema
|
||||
}
|
||||
|
||||
value = column.dataTable.Values[column.valueIndex]
|
||||
column.valueIndex++
|
||||
if len(column.dataTable.Values) == column.valueIndex {
|
||||
column.dataTable = nil
|
||||
}
|
||||
|
||||
return value, column.metadata.GetType(), column.schema
|
||||
}
|
@ -1,96 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func valuesToInterfaces(values interface{}, valueType parquet.Type) (tableValues []interface{}) {
|
||||
switch valueType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
for _, v := range values.([]bool) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_INT32:
|
||||
for _, v := range values.([]int32) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_INT64:
|
||||
for _, v := range values.([]int64) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_FLOAT:
|
||||
for _, v := range values.([]float32) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_DOUBLE:
|
||||
for _, v := range values.([]float64) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
for _, v := range values.([][]byte) {
|
||||
tableValues = append(tableValues, v)
|
||||
}
|
||||
}
|
||||
|
||||
return tableValues
|
||||
}
|
||||
|
||||
func interfacesToValues(values []interface{}, valueType parquet.Type) interface{} {
|
||||
switch valueType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs := make([]bool, len(values))
|
||||
for i := range values {
|
||||
bs[i] = values[i].(bool)
|
||||
}
|
||||
return bs
|
||||
case parquet.Type_INT32:
|
||||
i32s := make([]int32, len(values))
|
||||
for i := range values {
|
||||
i32s[i] = values[i].(int32)
|
||||
}
|
||||
return i32s
|
||||
case parquet.Type_INT64:
|
||||
i64s := make([]int64, len(values))
|
||||
for i := range values {
|
||||
i64s[i] = values[i].(int64)
|
||||
}
|
||||
return i64s
|
||||
case parquet.Type_FLOAT:
|
||||
f32s := make([]float32, len(values))
|
||||
for i := range values {
|
||||
f32s[i] = values[i].(float32)
|
||||
}
|
||||
return f32s
|
||||
case parquet.Type_DOUBLE:
|
||||
f64s := make([]float64, len(values))
|
||||
for i := range values {
|
||||
f64s[i] = values[i].(float64)
|
||||
}
|
||||
return f64s
|
||||
case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
array := make([][]byte, len(values))
|
||||
for i := range values {
|
||||
array[i] = values[i].([]byte)
|
||||
}
|
||||
return array
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
@ -1,161 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package common
|
||||
|
||||
import (
	"bytes"
	"compress/gzip"
	"fmt"
	"io/ioutil"
	"math/bits"

	"github.com/golang/snappy"
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
	"github.com/pierrec/lz4"
)
|
||||
|
||||
// ToSliceValue converts values to a slice value.
|
||||
func ToSliceValue(values []interface{}, parquetType parquet.Type) interface{} {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs := make([]bool, len(values))
|
||||
for i := range values {
|
||||
bs[i] = values[i].(bool)
|
||||
}
|
||||
return bs
|
||||
case parquet.Type_INT32:
|
||||
i32s := make([]int32, len(values))
|
||||
for i := range values {
|
||||
i32s[i] = values[i].(int32)
|
||||
}
|
||||
return i32s
|
||||
case parquet.Type_INT64:
|
||||
i64s := make([]int64, len(values))
|
||||
for i := range values {
|
||||
i64s[i] = values[i].(int64)
|
||||
}
|
||||
return i64s
|
||||
case parquet.Type_FLOAT:
|
||||
f32s := make([]float32, len(values))
|
||||
for i := range values {
|
||||
f32s[i] = values[i].(float32)
|
||||
}
|
||||
return f32s
|
||||
case parquet.Type_DOUBLE:
|
||||
f64s := make([]float64, len(values))
|
||||
for i := range values {
|
||||
f64s[i] = values[i].(float64)
|
||||
}
|
||||
return f64s
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
array := make([][]byte, len(values))
|
||||
for i := range values {
|
||||
array[i] = values[i].([]byte)
|
||||
}
|
||||
return array
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// BitWidth returns the number of bits required to represent ui64, i.e. the
// position of its highest set bit counted from one. The result is 0 for 0
// and 64 for the largest uint64.
func BitWidth(ui64 uint64) (width int32) {
	// bits.Len64 computes the same value as shifting right until zero.
	return int32(bits.Len64(ui64))
}
|
||||
|
||||
// Compress compresses given data.
|
||||
func Compress(compressionType parquet.CompressionCodec, data []byte) ([]byte, error) {
|
||||
switch compressionType {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return data, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Encode(nil, data), nil
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
buf := new(bytes.Buffer)
|
||||
writer := gzip.NewWriter(buf)
|
||||
n, err := writer.Write(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(data) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buf.Bytes(), nil
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
buf := new(bytes.Buffer)
|
||||
writer := lz4.NewWriter(buf)
|
||||
n, err := writer.Write(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(data) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported compression codec %v", compressionType)
|
||||
}
|
||||
|
||||
// Uncompress uncompresses given data.
|
||||
func Uncompress(compressionType parquet.CompressionCodec, data []byte) ([]byte, error) {
|
||||
switch compressionType {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return data, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Decode(nil, data)
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
reader, err := gzip.NewReader(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer reader.Close()
|
||||
return ioutil.ReadAll(reader)
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
return ioutil.ReadAll(lz4.NewReader(bytes.NewReader(data)))
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported compression codec %v", compressionType)
|
||||
}
|
@ -1,128 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
"github.com/klauspost/compress/gzip"
|
||||
"github.com/klauspost/compress/zstd"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/pierrec/lz4"
|
||||
)
|
||||
|
||||
type compressionCodec parquet.CompressionCodec
|
||||
|
||||
var zstdOnce sync.Once
|
||||
var zstdEnc *zstd.Encoder
|
||||
var zstdDec *zstd.Decoder
|
||||
|
||||
func initZstd() {
|
||||
zstdOnce.Do(func() {
|
||||
zstdEnc, _ = zstd.NewWriter(nil, zstd.WithZeroFrames(true))
|
||||
zstdDec, _ = zstd.NewReader(nil)
|
||||
})
|
||||
}
|
||||
|
||||
func (c compressionCodec) compress(buf []byte) ([]byte, error) {
|
||||
switch parquet.CompressionCodec(c) {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return buf, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Encode(nil, buf), nil
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
byteBuf := new(bytes.Buffer)
|
||||
writer := gzip.NewWriter(byteBuf)
|
||||
n, err := writer.Write(buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(buf) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return byteBuf.Bytes(), nil
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
byteBuf := new(bytes.Buffer)
|
||||
writer := lz4.NewWriter(byteBuf)
|
||||
n, err := writer.Write(buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != len(buf) {
|
||||
return nil, fmt.Errorf("short writes")
|
||||
}
|
||||
|
||||
if err = writer.Flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return byteBuf.Bytes(), nil
|
||||
case parquet.CompressionCodec_ZSTD:
|
||||
initZstd()
|
||||
return zstdEnc.EncodeAll(buf, nil), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("invalid compression codec %v", c)
|
||||
}
|
||||
|
||||
func (c compressionCodec) uncompress(buf []byte) ([]byte, error) {
|
||||
switch parquet.CompressionCodec(c) {
|
||||
case parquet.CompressionCodec_UNCOMPRESSED:
|
||||
return buf, nil
|
||||
|
||||
case parquet.CompressionCodec_SNAPPY:
|
||||
return snappy.Decode(nil, buf)
|
||||
|
||||
case parquet.CompressionCodec_GZIP:
|
||||
reader, err := gzip.NewReader(bytes.NewReader(buf))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer reader.Close()
|
||||
return ioutil.ReadAll(reader)
|
||||
|
||||
case parquet.CompressionCodec_LZ4:
|
||||
return ioutil.ReadAll(lz4.NewReader(bytes.NewReader(buf)))
|
||||
|
||||
case parquet.CompressionCodec_ZSTD:
|
||||
initZstd()
|
||||
return zstdDec.DecodeAll(buf, nil)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("invalid compression codec %v", c)
|
||||
}
|
@ -1,619 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateGroupList(t *testing.T) {
|
||||
requiredList1 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList1.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList2 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList2.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList3 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList3.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList4 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList4.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList1 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList1.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList2 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList2.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("group.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList3 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList3.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("group.list.element.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList4 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList4.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("group.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("group.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("group.list.element.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{1, 1},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{2, 2},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result8 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{3, 3},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
result9 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result10 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result11 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{4},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result12 := map[string]*Column{
|
||||
"group.list.element.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20},
|
||||
definitionLevels: []int64{4, 4},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v20,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredList1, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList1, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{requiredList1, `{"group": [{"col": 10}]}`, result1, false},
|
||||
{requiredList1, `{"group": [{"col": 10}, {"col": 20}]}`, result2, false},
|
||||
{requiredList2, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList2, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList2, `{"group": [{"col": null}]}`, result3, false},
|
||||
{requiredList2, `{"group": [{"col": 10}]}`, result4, false},
|
||||
{requiredList2, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
|
||||
{requiredList3, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList3, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{requiredList3, `{"group": [{"col": 10}]}`, result4, false},
|
||||
{requiredList3, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
|
||||
{requiredList4, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList4, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredList4, `{"group": [{"col": null}]}`, result6, false},
|
||||
{requiredList4, `{"group": [{"col": 10}]}`, result7, false},
|
||||
{requiredList4, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
|
||||
{optionalList1, `{}`, result9, false},
|
||||
{optionalList1, `{"group": null}`, result9, false},
|
||||
{optionalList1, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{optionalList1, `{"group": [{"col": 10}]}`, result4, false},
|
||||
{optionalList1, `{"group": [{"col": 10}, {"col": 20}]}`, result5, false},
|
||||
{optionalList2, `{}`, result9, false},
|
||||
{optionalList2, `{"group": null}`, result9, false},
|
||||
{optionalList2, `{"group": [{"col": null}]}`, result6, false},
|
||||
{optionalList2, `{"group": [{"col": 10}]}`, result7, false},
|
||||
{optionalList2, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
|
||||
{optionalList3, `{}`, result9, false},
|
||||
{optionalList3, `{"group": null}`, result9, false},
|
||||
{optionalList3, `{"group": [{"col": null}]}`, nil, true}, // err: group.list.element.col: nil value for required field
|
||||
{optionalList3, `{"group": [{"col": 10}]}`, result7, false},
|
||||
{optionalList3, `{"group": [{"col": 10}, {"col": 20}]}`, result8, false},
|
||||
{optionalList4, `{}`, result9, false},
|
||||
{optionalList4, `{"group": null}`, result9, false},
|
||||
{optionalList4, `{"group": [{"col": null}]}`, result10, false},
|
||||
{optionalList4, `{"group": [{"col": 10}]}`, result11, false},
|
||||
{optionalList4, `{"group": [{"col": 10}, {"col": 20}]}`, result12, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,238 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateGroupType(t *testing.T) {
|
||||
requiredGroup1 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredGroup1.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredGroup1.Set("group.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredGroup1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredGroup2 := schema.NewTree()
|
||||
{
|
||||
requiredGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredGroup2.Set("group", requiredGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredGroup2.Set("group.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := requiredGroup2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalGroup1 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalGroup1.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalGroup1.Set("group.col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalGroup1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalGroup2 := schema.NewTree()
|
||||
{
|
||||
optionalGroup, err := schema.NewElement("group", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalGroup2.Set("group", optionalGroup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalGroup2.Set("group.col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err := optionalGroup2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"group.col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredGroup1, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup1, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field
|
||||
{requiredGroup1, `{"group": {"col": 10}}`, result1, false},
|
||||
{requiredGroup2, `{}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup2, `{"group": null}`, nil, true}, // err: group: nil value for required field
|
||||
{requiredGroup2, `{"group": {"col": null}}`, result2, false},
|
||||
{requiredGroup2, `{"group": {"col": 10}}`, result3, false},
|
||||
{optionalGroup1, `{}`, result2, false},
|
||||
{optionalGroup1, `{"group": null}`, result2, false},
|
||||
{optionalGroup1, `{"group": {"col": null}}`, nil, true}, // err: group.col: nil value for required field
|
||||
{optionalGroup1, `{"group": {"col": 10}}`, result3, false},
|
||||
{optionalGroup2, `{}`, result2, false},
|
||||
{optionalGroup2, `{"group": null}`, result2, false},
|
||||
{optionalGroup2, `{"group": {"col": null}}`, result4, false},
|
||||
{optionalGroup2, `{"group": {"col": 10}}`, result5, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,699 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateListOfList(t *testing.T) {
|
||||
requiredList1 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList1.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList2 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList2.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList3 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList3.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList3.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList4 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList4.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList4.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList1 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList1.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList2 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList2.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList3 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList3.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList3.Set("col.list.element.list.element", requiredSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList3.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList4 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optioonalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
subList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalSubElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList4.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list.element", optioonalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list.element.list", subList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList4.Set("col.list.element.list.element", optionalSubElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList4.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{2, 2, 2, 2, 2, 2, 2},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{3, 3, 3, 3, 3, 3, 3},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{4},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result8 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{4, 4, 4, 4, 4, 4, 4},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result9 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result10 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{4},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result11 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{5},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result12 := map[string]*Column{
|
||||
"col.list.element.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30, v10, v20, v10, v30},
|
||||
definitionLevels: []int64{5, 5, 5, 5, 5, 5, 5},
|
||||
repetitionLevels: []int64{0, 2, 1, 2, 1, 2, 2},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredList1, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{requiredList1, `{"col": [[10]]}`, result1, false},
|
||||
{requiredList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result2, false},
|
||||
{requiredList2, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": [[null]]}`, result3, false},
|
||||
{requiredList2, `{"col": [[10]]}`, result4, false},
|
||||
{requiredList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
|
||||
{requiredList3, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList3, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{requiredList3, `{"col": [[10]]}`, result4, false},
|
||||
{requiredList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
|
||||
{requiredList4, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList4, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList4, `{"col": [[null]]}`, result6, false},
|
||||
{requiredList4, `{"col": [[10]]}`, result7, false},
|
||||
{requiredList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
|
||||
{optionalList1, `{}`, result9, false},
|
||||
{optionalList1, `{"col": null}`, result9, false},
|
||||
{optionalList1, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{optionalList1, `{"col": [[10]]}`, result4, false},
|
||||
{optionalList1, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result5, false},
|
||||
{optionalList2, `{}`, result9, false},
|
||||
{optionalList2, `{"col": null}`, result9, false},
|
||||
{optionalList2, `{"col": [[null]]}`, result6, false},
|
||||
{optionalList2, `{"col": [[10]]}`, result7, false},
|
||||
{optionalList2, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
|
||||
{optionalList3, `{}`, result9, false},
|
||||
{optionalList3, `{"col": null}`, result9, false},
|
||||
{optionalList3, `{"col": [[null]]}`, nil, true}, // err: col.list.element.list.element: nil value for required field
|
||||
{optionalList3, `{"col": [[10]]}`, result7, false},
|
||||
{optionalList3, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result8, false},
|
||||
{optionalList4, `{}`, result9, false},
|
||||
{optionalList4, `{"col": null}`, result9, false},
|
||||
{optionalList4, `{"col": [[null]]}`, result10, false},
|
||||
{optionalList4, `{"col": [[10]]}`, result11, false},
|
||||
{optionalList4, `{"col": [[10, 20], [30, 10], [20, 10, 30]]}`, result12, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,371 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulateMap(t *testing.T) {
|
||||
t.Skip("Broken")
|
||||
requiredMap1 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredValue, err := schema.NewElement("value", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap1.Set("map.key_value.value", requiredValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredMap1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredMap2 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalValue, err := schema.NewElement("value", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredMap2.Set("map.key_value.value", optionalValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredMap2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalMap1 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredValue, err := schema.NewElement("value", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap1.Set("map.key_value.value", requiredValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalMap1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalMap2 := schema.NewTree()
|
||||
{
|
||||
mapElement, err := schema.NewElement("map", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keyValue, err := schema.NewElement("key_value", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredKey, err := schema.NewElement("key", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalValue, err := schema.NewElement("value", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map", mapElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map.key_value", keyValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map.key_value.key", requiredKey); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalMap2.Set("map.key_value.value", optionalValue); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalMap2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"map.key_value.key": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{ten},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"map.key_value.value": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredMap1, `{}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap1, `{"map": null}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field
|
||||
{requiredMap1, `{"map": {"ten": 10}}`, result1, false},
|
||||
{requiredMap2, `{}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap2, `{"map": null}`, nil, true}, // err: map: nil value for required field
|
||||
{requiredMap2, `{"map": {"ten": null}}`, result2, false},
|
||||
{requiredMap2, `{"map": {"ten": 10}}`, result3, false},
|
||||
{optionalMap1, `{}`, result4, false},
|
||||
{optionalMap1, `{"map": null}`, result4, false},
|
||||
{optionalMap1, `{"map": {"ten": null}}`, nil, true}, // err: map.key_value.value: nil value for required field
|
||||
{optionalMap1, `{"map": {"ten": 10}}`, result5, false},
|
||||
{optionalMap2, `{}`, result4, false},
|
||||
{optionalMap2, `{"map": null}`, result4, false},
|
||||
{optionalMap2, `{"map": {"ten": null}}`, result6, false},
|
||||
{optionalMap2, `{"map": {"ten": 10}}`, result7, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Errorf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,331 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulatePrimitiveList(t *testing.T) {
|
||||
requiredList1 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList1.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
requiredList2 := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredList2.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = requiredList2.Set("col.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList1 := schema.NewTree()
|
||||
{
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
requiredElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList1.Set("col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList1.Set("col.list.element", requiredElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList1.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalList2 := schema.NewTree()
|
||||
{
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
list, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
optionalElement, err := schema.NewElement("element", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalList2.Set("col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list", list); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = optionalList2.Set("col.list.element", optionalElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalList2.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30},
|
||||
definitionLevels: []int64{1, 1, 1},
|
||||
repetitionLevels: []int64{0, 1, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result4 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result5 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30},
|
||||
definitionLevels: []int64{2, 2, 2},
|
||||
repetitionLevels: []int64{0, 1, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
result6 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result7 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result8 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result9 := map[string]*Column{
|
||||
"col.list.element": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10, v20, v30},
|
||||
definitionLevels: []int64{3, 3, 3},
|
||||
repetitionLevels: []int64{0, 1, 1},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 5,
|
||||
minValue: v10,
|
||||
maxValue: v30,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredList1, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field
|
||||
{requiredList1, `{"col": [10]}`, result1, false},
|
||||
{requiredList1, `{"col": [10, 20, 30]}`, result2, false},
|
||||
{requiredList2, `{}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredList2, `{"col": [null]}`, result3, false},
|
||||
{requiredList2, `{"col": [10]}`, result4, false},
|
||||
{requiredList2, `{"col": [10, 20, 30]}`, result5, false},
|
||||
{optionalList1, `{}`, result6, false},
|
||||
{optionalList1, `{"col": null}`, result6, false},
|
||||
{optionalList1, `{"col": [null]}`, nil, true}, // err: col.list.element: nil value for required field
|
||||
{optionalList1, `{"col": [10]}`, result4, false},
|
||||
{optionalList1, `{"col": [10, 20, 30]}`, result5, false},
|
||||
{optionalList2, `{}`, result6, false},
|
||||
{optionalList2, `{"col": null}`, result6, false},
|
||||
{optionalList2, `{"col": [null]}`, result7, false},
|
||||
{optionalList2, `{"col": [10]}`, result8, false},
|
||||
{optionalList2, `{"col": [10, 20, 30]}`, result9, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,129 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestPopulatePrimitiveType(t *testing.T) {
|
||||
requiredField := schema.NewTree()
|
||||
{
|
||||
requiredCol, err := schema.NewElement("col", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = requiredField.Set("col", requiredCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = requiredField.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
optionalField := schema.NewTree()
|
||||
{
|
||||
optionalCol, err := schema.NewElement("col", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = optionalField.Set("col", optionalCol); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, _, err = optionalField.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
result1 := map[string]*Column{
|
||||
"col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
result2 := map[string]*Column{
|
||||
"col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
},
|
||||
}
|
||||
|
||||
result3 := map[string]*Column{
|
||||
"col": {
|
||||
parquetType: parquet.Type_INT32,
|
||||
values: []interface{}{v10},
|
||||
definitionLevels: []int64{1},
|
||||
repetitionLevels: []int64{0},
|
||||
rowCount: 1,
|
||||
maxBitWidth: 4,
|
||||
minValue: v10,
|
||||
maxValue: v10,
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
schemaTree *schema.Tree
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{requiredField, `{}`, nil, true},
|
||||
{requiredField, `{"col": null}`, nil, true}, // err: col: nil value for required field
|
||||
{requiredField, `{"col": 10}`, result1, false},
|
||||
{optionalField, `{}`, result2, false},
|
||||
{optionalField, `{"col": null}`, result2, false},
|
||||
{optionalField, `{"col": 10}`, result3, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), testCase.schemaTree)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,681 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/encoding"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
"github.com/tidwall/gjson"
|
||||
"github.com/tidwall/sjson"
|
||||
)
|
||||
|
||||
func getDefaultEncoding(parquetType parquet.Type) parquet.Encoding {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
return parquet.Encoding_PLAIN
|
||||
case parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE:
|
||||
return parquet.Encoding_RLE_DICTIONARY
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
return parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY
|
||||
}
|
||||
|
||||
return parquet.Encoding_PLAIN
|
||||
}
|
||||
|
||||
func getFirstValueElement(tree *schema.Tree) (valueElement *schema.Element) {
|
||||
tree.Range(func(name string, element *schema.Element) bool {
|
||||
if element.Children == nil {
|
||||
valueElement = element
|
||||
} else {
|
||||
valueElement = getFirstValueElement(element.Children)
|
||||
}
|
||||
|
||||
return false
|
||||
})
|
||||
|
||||
return valueElement
|
||||
}
|
||||
|
||||
func populate(columnDataMap map[string]*Column, input *jsonValue, tree *schema.Tree, firstValueRL int64) (map[string]*Column, error) {
|
||||
var err error
|
||||
|
||||
pos := 0
|
||||
handleElement := func(name string, element *schema.Element) bool {
|
||||
pos++
|
||||
|
||||
dataPath := element.PathInTree
|
||||
|
||||
if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
|
||||
panic(fmt.Errorf("%v: repetition type must be REQUIRED or OPTIONAL type", dataPath))
|
||||
}
|
||||
|
||||
inputValue := input.Get(name)
|
||||
if *element.RepetitionType == parquet.FieldRepetitionType_REQUIRED && inputValue.IsNull() {
|
||||
err = fmt.Errorf("%v: nil value for required field", dataPath)
|
||||
return false
|
||||
}
|
||||
|
||||
add := func(element *schema.Element, value interface{}, DL, RL int64) {
|
||||
columnData := columnDataMap[element.PathInSchema]
|
||||
if columnData == nil {
|
||||
columnData = NewColumn(*element.Type)
|
||||
}
|
||||
columnData.add(value, DL, RL)
|
||||
columnDataMap[element.PathInSchema] = columnData
|
||||
}
|
||||
|
||||
// Handle primitive type element.
|
||||
if element.Type != nil {
|
||||
var value interface{}
|
||||
if value, err = inputValue.GetValue(*element.Type, element.ConvertedType); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
DL := element.MaxDefinitionLevel
|
||||
if value == nil && DL > 0 {
|
||||
DL--
|
||||
}
|
||||
|
||||
RL := element.MaxRepetitionLevel
|
||||
if pos == 1 {
|
||||
RL = firstValueRL
|
||||
}
|
||||
|
||||
add(element, value, DL, RL)
|
||||
return true
|
||||
}
|
||||
|
||||
addNull := func() {
|
||||
valueElement := getFirstValueElement(element.Children)
|
||||
|
||||
DL := element.MaxDefinitionLevel
|
||||
if DL > 0 {
|
||||
DL--
|
||||
}
|
||||
|
||||
RL := element.MaxRepetitionLevel
|
||||
if RL > 0 {
|
||||
RL--
|
||||
}
|
||||
|
||||
add(valueElement, nil, DL, RL)
|
||||
}
|
||||
|
||||
// Handle group type element.
|
||||
if element.ConvertedType == nil {
|
||||
if inputValue.IsNull() {
|
||||
addNull()
|
||||
return true
|
||||
}
|
||||
|
||||
columnDataMap, err = populate(columnDataMap, inputValue, element.Children, firstValueRL)
|
||||
return (err == nil)
|
||||
}
|
||||
|
||||
// Handle list type element.
|
||||
if *element.ConvertedType == parquet.ConvertedType_LIST {
|
||||
if inputValue.IsNull() {
|
||||
addNull()
|
||||
return true
|
||||
}
|
||||
|
||||
var results []gjson.Result
|
||||
if results, err = inputValue.GetArray(); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
listElement, _ := element.Children.Get("list")
|
||||
valueElement, _ := listElement.Children.Get("element")
|
||||
for i := range results {
|
||||
rl := valueElement.MaxRepetitionLevel
|
||||
if i == 0 {
|
||||
rl = firstValueRL
|
||||
}
|
||||
|
||||
var jsonData []byte
|
||||
if jsonData, err = sjson.SetBytes([]byte{}, "element", results[i].Value()); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var jv *jsonValue
|
||||
if jv, err = bytesToJSONValue(jsonData); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if columnDataMap, err = populate(columnDataMap, jv, listElement.Children, rl); err != nil {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
if *element.ConvertedType == parquet.ConvertedType_MAP {
|
||||
if inputValue.IsNull() {
|
||||
addNull()
|
||||
return true
|
||||
}
|
||||
|
||||
keyValueElement, _ := element.Children.Get("key_value")
|
||||
var rerr error
|
||||
err = inputValue.Range(func(key, value gjson.Result) bool {
|
||||
if !key.Exists() || key.Type == gjson.Null {
|
||||
rerr = fmt.Errorf("%v.key_value.key: not found or null", dataPath)
|
||||
return false
|
||||
}
|
||||
|
||||
var jsonData []byte
|
||||
if jsonData, rerr = sjson.SetBytes([]byte{}, "key", key.Value()); rerr != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if jsonData, rerr = sjson.SetBytes(jsonData, "value", value.Value()); rerr != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var jv *jsonValue
|
||||
if jv, rerr = bytesToJSONValue(jsonData); rerr != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if columnDataMap, rerr = populate(columnDataMap, jv, keyValueElement.Children, firstValueRL); rerr != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
err = rerr
|
||||
return (err == nil)
|
||||
}
|
||||
|
||||
err = fmt.Errorf("%v: unsupported converted type %v in %v field type", dataPath, *element.ConvertedType, *element.RepetitionType)
|
||||
return false
|
||||
}
|
||||
|
||||
tree.Range(handleElement)
|
||||
return columnDataMap, err
|
||||
}
|
||||
|
||||
// Column - denotes values of a column.
|
||||
type Column struct {
|
||||
parquetType parquet.Type // value type.
|
||||
values []interface{} // must be a slice of parquet typed values.
|
||||
definitionLevels []int64 // exactly same length of values.
|
||||
repetitionLevels []int64 // exactly same length of values.
|
||||
rowCount int32
|
||||
maxBitWidth int32
|
||||
minValue interface{}
|
||||
maxValue interface{}
|
||||
}
|
||||
|
||||
func (column *Column) updateMinMaxValue(value interface{}) {
|
||||
if column.minValue == nil && column.maxValue == nil {
|
||||
column.minValue = value
|
||||
column.maxValue = value
|
||||
return
|
||||
}
|
||||
|
||||
switch column.parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
if column.minValue.(bool) && !value.(bool) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if !column.maxValue.(bool) && value.(bool) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_INT32:
|
||||
if column.minValue.(int32) > value.(int32) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(int32) < value.(int32) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_INT64:
|
||||
if column.minValue.(int64) > value.(int64) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(int64) < value.(int64) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_FLOAT:
|
||||
if column.minValue.(float32) > value.(float32) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(float32) < value.(float32) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_DOUBLE:
|
||||
if column.minValue.(float64) > value.(float64) {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if column.maxValue.(float64) < value.(float64) {
|
||||
column.maxValue = value
|
||||
}
|
||||
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
if bytes.Compare(column.minValue.([]byte), value.([]byte)) > 0 {
|
||||
column.minValue = value
|
||||
}
|
||||
|
||||
if bytes.Compare(column.minValue.([]byte), value.([]byte)) < 0 {
|
||||
column.maxValue = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (column *Column) updateStats(value interface{}, DL, RL int64) {
|
||||
if RL == 0 {
|
||||
column.rowCount++
|
||||
}
|
||||
|
||||
if value == nil {
|
||||
return
|
||||
}
|
||||
|
||||
var bitWidth int32
|
||||
switch column.parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bitWidth = 1
|
||||
case parquet.Type_INT32:
|
||||
bitWidth = common.BitWidth(uint64(value.(int32)))
|
||||
case parquet.Type_INT64:
|
||||
bitWidth = common.BitWidth(uint64(value.(int64)))
|
||||
case parquet.Type_FLOAT:
|
||||
bitWidth = 32
|
||||
case parquet.Type_DOUBLE:
|
||||
bitWidth = 64
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
bitWidth = int32(len(value.([]byte)))
|
||||
}
|
||||
if column.maxBitWidth < bitWidth {
|
||||
column.maxBitWidth = bitWidth
|
||||
}
|
||||
|
||||
column.updateMinMaxValue(value)
|
||||
}
|
||||
|
||||
func (column *Column) add(value interface{}, DL, RL int64) {
|
||||
column.values = append(column.values, value)
|
||||
column.definitionLevels = append(column.definitionLevels, DL)
|
||||
column.repetitionLevels = append(column.repetitionLevels, RL)
|
||||
column.updateStats(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddNull - adds nil value.
|
||||
func (column *Column) AddNull(DL, RL int64) {
|
||||
column.add(nil, DL, RL)
|
||||
}
|
||||
|
||||
// AddBoolean - adds boolean value.
|
||||
func (column *Column) AddBoolean(value bool, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_BOOLEAN {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddInt32 - adds int32 value.
|
||||
func (column *Column) AddInt32(value int32, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_INT32 {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddInt64 - adds int64 value.
|
||||
func (column *Column) AddInt64(value int64, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_INT64 {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddFloat - adds float32 value.
|
||||
func (column *Column) AddFloat(value float32, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_FLOAT {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddDouble - adds float64 value.
|
||||
func (column *Column) AddDouble(value float64, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_DOUBLE {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// AddByteArray - adds byte array value.
|
||||
func (column *Column) AddByteArray(value []byte, DL, RL int64) {
|
||||
if column.parquetType != parquet.Type_BYTE_ARRAY {
|
||||
panic(fmt.Errorf("expected %v value", column.parquetType))
|
||||
}
|
||||
|
||||
column.add(value, DL, RL)
|
||||
}
|
||||
|
||||
// Merge - merges columns.
|
||||
func (column *Column) Merge(column2 *Column) {
|
||||
if column.parquetType != column2.parquetType {
|
||||
panic(fmt.Errorf("merge differs in parquet type"))
|
||||
}
|
||||
|
||||
column.values = append(column.values, column2.values...)
|
||||
column.definitionLevels = append(column.definitionLevels, column2.definitionLevels...)
|
||||
column.repetitionLevels = append(column.repetitionLevels, column2.repetitionLevels...)
|
||||
|
||||
column.rowCount += column2.rowCount
|
||||
if column.maxBitWidth < column2.maxBitWidth {
|
||||
column.maxBitWidth = column2.maxBitWidth
|
||||
}
|
||||
|
||||
column.updateMinMaxValue(column2.minValue)
|
||||
column.updateMinMaxValue(column2.maxValue)
|
||||
}
|
||||
|
||||
func (column *Column) String() string {
|
||||
var strs []string
|
||||
strs = append(strs, fmt.Sprintf("parquetType: %v", column.parquetType))
|
||||
strs = append(strs, fmt.Sprintf("values: %v", column.values))
|
||||
strs = append(strs, fmt.Sprintf("definitionLevels: %v", column.definitionLevels))
|
||||
strs = append(strs, fmt.Sprintf("repetitionLevels: %v", column.repetitionLevels))
|
||||
strs = append(strs, fmt.Sprintf("rowCount: %v", column.rowCount))
|
||||
strs = append(strs, fmt.Sprintf("maxBitWidth: %v", column.maxBitWidth))
|
||||
strs = append(strs, fmt.Sprintf("minValue: %v", column.minValue))
|
||||
strs = append(strs, fmt.Sprintf("maxValue: %v", column.maxValue))
|
||||
return "{" + strings.Join(strs, ", ") + "}"
|
||||
}
|
||||
|
||||
func (column *Column) encodeValue(value interface{}, element *schema.Element) []byte {
|
||||
if value == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
valueData := encoding.PlainEncode(common.ToSliceValue([]interface{}{value}, column.parquetType), column.parquetType)
|
||||
if column.parquetType == parquet.Type_BYTE_ARRAY && element.ConvertedType != nil {
|
||||
switch *element.ConvertedType {
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
|
||||
valueData = valueData[4:]
|
||||
}
|
||||
}
|
||||
|
||||
return valueData
|
||||
}
|
||||
|
||||
func (column *Column) toDataPageV2(element *schema.Element, parquetEncoding parquet.Encoding) *ColumnChunk {
|
||||
var definedValues []interface{}
|
||||
for _, value := range column.values {
|
||||
if value != nil {
|
||||
definedValues = append(definedValues, value)
|
||||
}
|
||||
}
|
||||
|
||||
var encodedData []byte
|
||||
switch parquetEncoding {
|
||||
case parquet.Encoding_PLAIN:
|
||||
encodedData = encoding.PlainEncode(common.ToSliceValue(definedValues, column.parquetType), column.parquetType)
|
||||
|
||||
case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
|
||||
var bytesSlices [][]byte
|
||||
for _, value := range column.values {
|
||||
bytesSlices = append(bytesSlices, value.([]byte))
|
||||
}
|
||||
encodedData = encoding.DeltaLengthByteArrayEncode(bytesSlices)
|
||||
}
|
||||
|
||||
compressionType := parquet.CompressionCodec_SNAPPY
|
||||
if element.CompressionType != nil {
|
||||
compressionType = *element.CompressionType
|
||||
}
|
||||
|
||||
compressedData, err := common.Compress(compressionType, encodedData)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
DLData := encoding.RLEBitPackedHybridEncode(
|
||||
column.definitionLevels,
|
||||
common.BitWidth(uint64(element.MaxDefinitionLevel)),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
|
||||
RLData := encoding.RLEBitPackedHybridEncode(
|
||||
column.repetitionLevels,
|
||||
common.BitWidth(uint64(element.MaxRepetitionLevel)),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
|
||||
pageHeader := parquet.NewPageHeader()
|
||||
pageHeader.Type = parquet.PageType_DATA_PAGE_V2
|
||||
pageHeader.CompressedPageSize = int32(len(compressedData) + len(DLData) + len(RLData))
|
||||
pageHeader.UncompressedPageSize = int32(len(encodedData) + len(DLData) + len(RLData))
|
||||
pageHeader.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
|
||||
pageHeader.DataPageHeaderV2.NumValues = int32(len(column.values))
|
||||
pageHeader.DataPageHeaderV2.NumNulls = int32(len(column.values) - len(definedValues))
|
||||
pageHeader.DataPageHeaderV2.NumRows = column.rowCount
|
||||
pageHeader.DataPageHeaderV2.Encoding = parquetEncoding
|
||||
pageHeader.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(DLData))
|
||||
pageHeader.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(RLData))
|
||||
pageHeader.DataPageHeaderV2.IsCompressed = true
|
||||
pageHeader.DataPageHeaderV2.Statistics = parquet.NewStatistics()
|
||||
pageHeader.DataPageHeaderV2.Statistics.Min = column.encodeValue(column.minValue, element)
|
||||
pageHeader.DataPageHeaderV2.Statistics.Max = column.encodeValue(column.maxValue, element)
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
rawData, err := ts.Write(context.TODO(), pageHeader)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
rawData = append(rawData, RLData...)
|
||||
rawData = append(rawData, DLData...)
|
||||
rawData = append(rawData, compressedData...)
|
||||
|
||||
metadata := parquet.NewColumnMetaData()
|
||||
metadata.Type = column.parquetType
|
||||
metadata.Encodings = []parquet.Encoding{
|
||||
parquet.Encoding_PLAIN,
|
||||
parquet.Encoding_RLE,
|
||||
parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
|
||||
}
|
||||
metadata.Codec = compressionType
|
||||
metadata.NumValues = int64(pageHeader.DataPageHeaderV2.NumValues)
|
||||
metadata.TotalCompressedSize = int64(len(rawData))
|
||||
metadata.TotalUncompressedSize = int64(pageHeader.UncompressedPageSize) + int64(len(rawData)) - int64(pageHeader.CompressedPageSize)
|
||||
metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
|
||||
metadata.Statistics = parquet.NewStatistics()
|
||||
metadata.Statistics.Min = pageHeader.DataPageHeaderV2.Statistics.Min
|
||||
metadata.Statistics.Max = pageHeader.DataPageHeaderV2.Statistics.Max
|
||||
|
||||
chunk := new(ColumnChunk)
|
||||
chunk.ColumnChunk.MetaData = metadata
|
||||
chunk.dataPageLen = int64(len(rawData))
|
||||
chunk.dataLen = int64(len(rawData))
|
||||
chunk.data = rawData
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
func (column *Column) toRLEDictPage(element *schema.Element) *ColumnChunk {
|
||||
dictPageData, dataPageData, dictValueCount, indexBitWidth := encoding.RLEDictEncode(column.values, column.parquetType, column.maxBitWidth)
|
||||
|
||||
compressionType := parquet.CompressionCodec_SNAPPY
|
||||
if element.CompressionType != nil {
|
||||
compressionType = *element.CompressionType
|
||||
}
|
||||
|
||||
compressedData, err := common.Compress(compressionType, dictPageData)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
dictPageHeader := parquet.NewPageHeader()
|
||||
dictPageHeader.Type = parquet.PageType_DICTIONARY_PAGE
|
||||
dictPageHeader.CompressedPageSize = int32(len(compressedData))
|
||||
dictPageHeader.UncompressedPageSize = int32(len(dictPageData))
|
||||
dictPageHeader.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
|
||||
dictPageHeader.DictionaryPageHeader.NumValues = dictValueCount
|
||||
dictPageHeader.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
dictPageRawData, err := ts.Write(context.TODO(), dictPageHeader)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
dictPageRawData = append(dictPageRawData, compressedData...)
|
||||
|
||||
RLData := encoding.RLEBitPackedHybridEncode(
|
||||
column.repetitionLevels,
|
||||
common.BitWidth(uint64(element.MaxRepetitionLevel)),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
encodedData := RLData
|
||||
|
||||
DLData := encoding.RLEBitPackedHybridEncode(
|
||||
column.definitionLevels,
|
||||
common.BitWidth(uint64(element.MaxDefinitionLevel)),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
encodedData = append(encodedData, DLData...)
|
||||
|
||||
encodedData = append(encodedData, indexBitWidth)
|
||||
encodedData = append(encodedData, dataPageData...)
|
||||
|
||||
compressedData, err = common.Compress(compressionType, encodedData)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
dataPageHeader := parquet.NewPageHeader()
|
||||
dataPageHeader.Type = parquet.PageType_DATA_PAGE
|
||||
dataPageHeader.CompressedPageSize = int32(len(compressedData))
|
||||
dataPageHeader.UncompressedPageSize = int32(len(encodedData))
|
||||
dataPageHeader.DataPageHeader = parquet.NewDataPageHeader()
|
||||
dataPageHeader.DataPageHeader.NumValues = int32(len(column.values))
|
||||
dataPageHeader.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
|
||||
dataPageHeader.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
|
||||
dataPageHeader.DataPageHeader.Encoding = parquet.Encoding_RLE_DICTIONARY
|
||||
|
||||
ts = thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
dataPageRawData, err := ts.Write(context.TODO(), dataPageHeader)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
dataPageRawData = append(dataPageRawData, compressedData...)
|
||||
|
||||
metadata := parquet.NewColumnMetaData()
|
||||
metadata.Type = column.parquetType
|
||||
metadata.Encodings = []parquet.Encoding{
|
||||
parquet.Encoding_PLAIN,
|
||||
parquet.Encoding_RLE,
|
||||
parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
|
||||
parquet.Encoding_RLE_DICTIONARY,
|
||||
}
|
||||
metadata.Codec = compressionType
|
||||
metadata.NumValues = int64(dataPageHeader.DataPageHeader.NumValues)
|
||||
metadata.TotalCompressedSize = int64(len(dictPageRawData)) + int64(len(dataPageRawData))
|
||||
uncompressedSize := int64(dictPageHeader.UncompressedPageSize) + int64(len(dictPageData)) - int64(dictPageHeader.CompressedPageSize)
|
||||
uncompressedSize += int64(dataPageHeader.UncompressedPageSize) + int64(len(dataPageData)) - int64(dataPageHeader.CompressedPageSize)
|
||||
metadata.TotalUncompressedSize = uncompressedSize
|
||||
metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
|
||||
metadata.Statistics = parquet.NewStatistics()
|
||||
metadata.Statistics.Min = column.encodeValue(column.minValue, element)
|
||||
metadata.Statistics.Max = column.encodeValue(column.maxValue, element)
|
||||
|
||||
chunk := new(ColumnChunk)
|
||||
chunk.ColumnChunk.MetaData = metadata
|
||||
chunk.isDictPage = true
|
||||
chunk.dictPageLen = int64(len(dictPageRawData))
|
||||
chunk.dataPageLen = int64(len(dataPageRawData))
|
||||
chunk.dataLen = chunk.dictPageLen + chunk.dataPageLen
|
||||
chunk.data = append(dictPageRawData, dataPageRawData...)
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// Encode an element.
|
||||
func (column *Column) Encode(element *schema.Element) *ColumnChunk {
|
||||
parquetEncoding := getDefaultEncoding(column.parquetType)
|
||||
if element.Encoding != nil {
|
||||
parquetEncoding = *element.Encoding
|
||||
}
|
||||
|
||||
switch parquetEncoding {
|
||||
case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
|
||||
return column.toDataPageV2(element, parquetEncoding)
|
||||
}
|
||||
|
||||
return column.toRLEDictPage(element)
|
||||
}
|
||||
|
||||
// NewColumn - creates new column data
|
||||
func NewColumn(parquetType parquet.Type) *Column {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN, parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE, parquet.Type_BYTE_ARRAY:
|
||||
default:
|
||||
panic(fmt.Errorf("unsupported parquet type %v", parquetType))
|
||||
}
|
||||
|
||||
return &Column{
|
||||
parquetType: parquetType,
|
||||
}
|
||||
}
|
||||
|
||||
// UnmarshalJSON - decodes JSON data into map of Column.
|
||||
func UnmarshalJSON(data []byte, tree *schema.Tree) (map[string]*Column, error) {
|
||||
if !tree.ReadOnly() {
|
||||
return nil, fmt.Errorf("tree must be read only")
|
||||
}
|
||||
|
||||
inputValue, err := bytesToJSONValue(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
columnDataMap := make(map[string]*Column)
|
||||
return populate(columnDataMap, inputValue, tree, 0)
|
||||
}
|
@ -1,370 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
// Shared fixtures for the tests of this package.
var (
	v10 int32 = 10
	v20 int32 = 20
	v30 int32 = 30

	ten = []byte("ten")
	foo = []byte("foo")
	bar = []byte("bar")

	phone1 = []byte("1-234-567-8901")
	phone2 = []byte("1-234-567-1098")
	phone3 = []byte("1-111-222-3333")
)
|
||||
|
||||
func TestAddressBookExample(t *testing.T) {
|
||||
// message AddressBook {
|
||||
// required string owner;
|
||||
// repeated string ownerPhoneNumbers;
|
||||
// repeated group contacts {
|
||||
// required string name;
|
||||
// optional string phoneNumber;
|
||||
// }
|
||||
// }
|
||||
t.Skip("Broken")
|
||||
|
||||
addressBook := schema.NewTree()
|
||||
{
|
||||
owner, err := schema.NewElement("owner", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ownerPhoneNumbers, err := schema.NewElement("ownerPhoneNumbers", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ownerPhoneNumbersList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ownerPhoneNumbersElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
contacts, err := schema.NewElement("contacts", parquet.FieldRepetitionType_OPTIONAL,
|
||||
nil, parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
contactsList, err := schema.NewElement("list", parquet.FieldRepetitionType_REPEATED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
contactsElement, err := schema.NewElement("element", parquet.FieldRepetitionType_REQUIRED,
|
||||
nil, nil,
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
contactName, err := schema.NewElement("name", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
contactPhoneNumber, err := schema.NewElement("phoneNumber", parquet.FieldRepetitionType_OPTIONAL,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = addressBook.Set("owner", owner); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = addressBook.Set("ownerPhoneNumbers", ownerPhoneNumbers); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = addressBook.Set("ownerPhoneNumbers.list", ownerPhoneNumbersList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = addressBook.Set("ownerPhoneNumbers.list.element", ownerPhoneNumbersElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err = addressBook.Set("contacts", contacts); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = addressBook.Set("contacts.list", contactsList); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = addressBook.Set("contacts.list.element", contactsElement); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = addressBook.Set("contacts.list.element.name", contactName); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err = addressBook.Set("contacts.list.element.phoneNumber", contactPhoneNumber); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
if _, _, err := addressBook.ToParquetSchema(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
case2Data := `{
|
||||
"owner": "foo"
|
||||
}`
|
||||
result2 := map[string]*Column{
|
||||
"owner": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{foo},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"ownerPhoneNumbers.list.element": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"contacts.list.element.name": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
}
|
||||
|
||||
case3Data := `{
|
||||
"owner": "foo",
|
||||
"ownerPhoneNumbers": [
|
||||
"1-234-567-8901"
|
||||
]
|
||||
}
|
||||
`
|
||||
result3 := map[string]*Column{
|
||||
"owner": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{foo},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"ownerPhoneNumbers.list.element": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{phone1},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"contacts.list.element.name": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
}
|
||||
|
||||
case4Data := `{
|
||||
"owner": "foo",
|
||||
"ownerPhoneNumbers": [
|
||||
"1-234-567-8901",
|
||||
"1-234-567-1098"
|
||||
]
|
||||
}
|
||||
`
|
||||
result4 := map[string]*Column{
|
||||
"owner": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{foo},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"ownerPhoneNumbers.list.element": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{phone1, phone2},
|
||||
definitionLevels: []int64{2, 2},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
},
|
||||
"contacts.list.element.name": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
}
|
||||
|
||||
case5Data := `{
|
||||
"contacts": [
|
||||
{
|
||||
"name": "bar"
|
||||
}
|
||||
],
|
||||
"owner": "foo"
|
||||
}`
|
||||
result5 := map[string]*Column{
|
||||
"owner": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{foo},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"ownerPhoneNumbers.list.element": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"contacts.list.element.name": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{bar},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"contacts.list.element.phoneNumber": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
case6Data := `{
|
||||
"contacts": [
|
||||
{
|
||||
"name": "bar",
|
||||
"phoneNumber": "1-111-222-3333"
|
||||
}
|
||||
],
|
||||
"owner": "foo"
|
||||
}`
|
||||
result6 := map[string]*Column{
|
||||
"owner": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{foo},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"ownerPhoneNumbers.list.element": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{nil},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"contacts.list.element.name": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{bar},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"contacts.list.element.phoneNumber": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{phone3},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
case7Data := `{
|
||||
"contacts": [
|
||||
{
|
||||
"name": "bar",
|
||||
"phoneNumber": "1-111-222-3333"
|
||||
}
|
||||
],
|
||||
"owner": "foo",
|
||||
"ownerPhoneNumbers": [
|
||||
"1-234-567-8901",
|
||||
"1-234-567-1098"
|
||||
]
|
||||
}`
|
||||
result7 := map[string]*Column{
|
||||
"owner": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{foo},
|
||||
definitionLevels: []int64{0},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"ownerPhoneNumbers.list.element": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{phone1, phone2},
|
||||
definitionLevels: []int64{2, 2},
|
||||
repetitionLevels: []int64{0, 1},
|
||||
},
|
||||
"contacts.list.element.name": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{bar},
|
||||
definitionLevels: []int64{2},
|
||||
repetitionLevels: []int64{0},
|
||||
},
|
||||
"contacts.list.element.phoneNumber": {
|
||||
parquetType: parquet.Type_BYTE_ARRAY,
|
||||
values: []interface{}{phone3},
|
||||
definitionLevels: []int64{3},
|
||||
repetitionLevels: []int64{1},
|
||||
},
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
data string
|
||||
expectedResult map[string]*Column
|
||||
expectErr bool
|
||||
}{
|
||||
{`{}`, nil, true}, // err: owner: nil value for required field
|
||||
{case2Data, result2, false},
|
||||
{case3Data, result3, false},
|
||||
{case4Data, result4, false},
|
||||
{case5Data, result5, false},
|
||||
{case6Data, result6, false},
|
||||
{case7Data, result7, false},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result, err := UnmarshalJSON([]byte(testCase.data), addressBook)
|
||||
expectErr := (err != nil)
|
||||
|
||||
if testCase.expectErr != expectErr {
|
||||
t.Fatalf("case %v: error: expected: %v, got: %v", i+1, testCase.expectErr, expectErr)
|
||||
}
|
||||
|
||||
if !testCase.expectErr {
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Errorf("case %v: result: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,66 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// ColumnChunk ...
|
||||
type ColumnChunk struct {
|
||||
parquet.ColumnChunk
|
||||
isDictPage bool
|
||||
dictPageLen int64
|
||||
dataPageLen int64
|
||||
dataLen int64
|
||||
data []byte
|
||||
}
|
||||
|
||||
// Data returns the data.
|
||||
func (chunk *ColumnChunk) Data() []byte {
|
||||
return chunk.data
|
||||
}
|
||||
|
||||
// DataLen returns the length of the data.
|
||||
func (chunk *ColumnChunk) DataLen() int64 {
|
||||
return chunk.dataLen
|
||||
}
|
||||
|
||||
// NewRowGroup creates a new row group.
|
||||
func NewRowGroup(chunks []*ColumnChunk, numRows, offset int64) *parquet.RowGroup {
|
||||
rows := parquet.NewRowGroup()
|
||||
rows.NumRows = numRows
|
||||
|
||||
for _, chunk := range chunks {
|
||||
rows.Columns = append(rows.Columns, &chunk.ColumnChunk)
|
||||
rows.TotalByteSize += chunk.dataLen
|
||||
|
||||
chunk.ColumnChunk.FileOffset = offset
|
||||
|
||||
if chunk.isDictPage {
|
||||
dictPageOffset := offset
|
||||
chunk.ColumnChunk.MetaData.DictionaryPageOffset = &dictPageOffset
|
||||
offset += chunk.dictPageLen
|
||||
}
|
||||
|
||||
chunk.ColumnChunk.MetaData.DataPageOffset = offset
|
||||
offset += chunk.dataPageLen
|
||||
}
|
||||
|
||||
return rows
|
||||
}
|
@ -1,108 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
|
||||
type jsonValue struct {
|
||||
result *gjson.Result
|
||||
path *string
|
||||
}
|
||||
|
||||
func (v *jsonValue) String() string {
|
||||
if v.result == nil {
|
||||
return "<nil>"
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%v", *v.result)
|
||||
}
|
||||
|
||||
func (v *jsonValue) IsNull() bool {
|
||||
return v.result == nil || v.result.Type == gjson.Null
|
||||
}
|
||||
|
||||
func (v *jsonValue) Get(path string) *jsonValue {
|
||||
if v.path != nil {
|
||||
var result *gjson.Result
|
||||
if *v.path == path {
|
||||
result = v.result
|
||||
}
|
||||
|
||||
return resultToJSONValue(result)
|
||||
}
|
||||
|
||||
if v.result == nil {
|
||||
return resultToJSONValue(nil)
|
||||
}
|
||||
|
||||
result := v.result.Get(path)
|
||||
if !result.Exists() {
|
||||
return resultToJSONValue(nil)
|
||||
}
|
||||
|
||||
return resultToJSONValue(&result)
|
||||
}
|
||||
|
||||
func (v *jsonValue) GetValue(parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) {
|
||||
if v.result == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return resultToParquetValue(*v.result, parquetType, convertedType)
|
||||
}
|
||||
|
||||
func (v *jsonValue) GetArray() ([]gjson.Result, error) {
|
||||
if v.result == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return resultToArray(*v.result)
|
||||
}
|
||||
|
||||
func (v *jsonValue) Range(iterator func(key, value gjson.Result) bool) error {
|
||||
if v.result == nil || v.result.Type == gjson.Null {
|
||||
return nil
|
||||
}
|
||||
|
||||
if v.result.Type != gjson.JSON || !v.result.IsObject() {
|
||||
return fmt.Errorf("result is not Map but %v", v.result.Type)
|
||||
}
|
||||
|
||||
v.result.ForEach(iterator)
|
||||
return nil
|
||||
}
|
||||
|
||||
func resultToJSONValue(result *gjson.Result) *jsonValue {
|
||||
return &jsonValue{
|
||||
result: result,
|
||||
}
|
||||
}
|
||||
|
||||
func bytesToJSONValue(data []byte) (*jsonValue, error) {
|
||||
if !gjson.ValidBytes(data) {
|
||||
return nil, fmt.Errorf("invalid JSON data")
|
||||
}
|
||||
|
||||
result := gjson.ParseBytes(data)
|
||||
return resultToJSONValue(&result), nil
|
||||
}
|
@ -1,361 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
|
||||
func resultToBool(result gjson.Result) (value interface{}, err error) {
|
||||
switch result.Type {
|
||||
case gjson.False, gjson.True:
|
||||
return result.Bool(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not Bool but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToInt32(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToInt64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(int64) < math.MinInt32 || value.(int64) > math.MaxInt32 {
|
||||
return nil, fmt.Errorf("int32 overflow")
|
||||
}
|
||||
|
||||
return int32(value.(int64)), nil
|
||||
}
|
||||
|
||||
func resultToInt64(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return result.Int(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not Number but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToFloat(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return float32(result.Float()), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not float32 but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToDouble(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return result.Float(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not float64 but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToBytes(result gjson.Result) (interface{}, error) {
|
||||
if result.Type != gjson.JSON || !result.IsArray() {
|
||||
return nil, fmt.Errorf("result is not byte array but %v", result.Type)
|
||||
}
|
||||
|
||||
data := []byte{}
|
||||
for i, r := range result.Array() {
|
||||
if r.Type != gjson.Number {
|
||||
return nil, fmt.Errorf("result[%v] is not byte but %v", i, r.Type)
|
||||
}
|
||||
|
||||
value := r.Uint()
|
||||
if value > math.MaxUint8 {
|
||||
return nil, fmt.Errorf("byte overflow in result[%v]", i)
|
||||
}
|
||||
|
||||
data = append(data, byte(value))
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func resultToString(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.String {
|
||||
return result.String(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not String but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToUint8(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToUint64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(uint64) > math.MaxUint8 {
|
||||
return nil, fmt.Errorf("uint8 overflow")
|
||||
}
|
||||
|
||||
return uint8(value.(uint64)), nil
|
||||
}
|
||||
|
||||
func resultToUint16(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToUint64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(uint64) > math.MaxUint16 {
|
||||
return nil, fmt.Errorf("uint16 overflow")
|
||||
}
|
||||
|
||||
return uint16(value.(uint64)), nil
|
||||
}
|
||||
|
||||
func resultToUint32(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToUint64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(uint64) > math.MaxUint32 {
|
||||
return nil, fmt.Errorf("uint32 overflow")
|
||||
}
|
||||
|
||||
return uint32(value.(uint64)), nil
|
||||
}
|
||||
|
||||
func resultToUint64(result gjson.Result) (value interface{}, err error) {
|
||||
if result.Type == gjson.Number {
|
||||
return result.Uint(), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("result is not Number but %v", result.Type)
|
||||
}
|
||||
|
||||
func resultToInt8(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToInt64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(int64) < math.MinInt8 || value.(int64) > math.MaxInt8 {
|
||||
return nil, fmt.Errorf("int8 overflow")
|
||||
}
|
||||
|
||||
return int8(value.(int64)), nil
|
||||
}
|
||||
|
||||
func resultToInt16(result gjson.Result) (value interface{}, err error) {
|
||||
if value, err = resultToInt64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.(int64) < math.MinInt16 || value.(int64) > math.MaxInt16 {
|
||||
return nil, fmt.Errorf("int16 overflow")
|
||||
}
|
||||
|
||||
return int16(value.(int64)), nil
|
||||
}
|
||||
|
||||
func stringToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
return []byte(value.(string)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("string cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint8)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint8)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint8 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint16)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint16)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint16 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint32)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint32)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint32 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func uint64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(uint64)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(uint64)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("uint64 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int8ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(int8)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(int8)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int8 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int16ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(int16)), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(int16)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int16 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int32ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return value.(int32), nil
|
||||
case parquet.Type_INT64:
|
||||
return int64(value.(int32)), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int32 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func int64ToParquetValue(value interface{}, parquetType parquet.Type) (interface{}, error) {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
return int32(value.(int64)), nil
|
||||
case parquet.Type_INT64:
|
||||
return value.(int64), nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("int64 cannot be converted to parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func resultToParquetValueByConvertedValue(result gjson.Result, convertedType parquet.ConvertedType, parquetType parquet.Type) (value interface{}, err error) {
|
||||
if result.Type == gjson.Null {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch convertedType {
|
||||
case parquet.ConvertedType_UTF8:
|
||||
if value, err = resultToString(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stringToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_UINT_8:
|
||||
if value, err = resultToUint8(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return uint8ToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_UINT_16:
|
||||
if value, err = resultToUint16(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return uint16ToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_UINT_32:
|
||||
if value, err = resultToUint32(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return uint32ToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_UINT_64:
|
||||
if value, err = resultToUint64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return uint64ToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_INT_8:
|
||||
if value, err = resultToInt8(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return int8ToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_INT_16:
|
||||
if value, err = resultToInt16(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return int16ToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_INT_32:
|
||||
if value, err = resultToInt32(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return int32ToParquetValue(value, parquetType)
|
||||
case parquet.ConvertedType_INT_64:
|
||||
if value, err = resultToInt64(result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return int64ToParquetValue(value, parquetType)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported converted type %v", convertedType)
|
||||
}
|
||||
|
||||
func resultToParquetValue(result gjson.Result, parquetType parquet.Type, convertedType *parquet.ConvertedType) (interface{}, error) {
|
||||
if convertedType != nil {
|
||||
return resultToParquetValueByConvertedValue(result, *convertedType, parquetType)
|
||||
}
|
||||
|
||||
if result.Type == gjson.Null {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
return resultToBool(result)
|
||||
case parquet.Type_INT32:
|
||||
return resultToInt32(result)
|
||||
case parquet.Type_INT64:
|
||||
return resultToInt64(result)
|
||||
case parquet.Type_FLOAT:
|
||||
return resultToFloat(result)
|
||||
case parquet.Type_DOUBLE:
|
||||
return resultToDouble(result)
|
||||
case parquet.Type_INT96, parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
return resultToBytes(result)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unknown parquet type %v", parquetType)
|
||||
}
|
||||
|
||||
func resultToArray(result gjson.Result) ([]gjson.Result, error) {
|
||||
if result.Type == gjson.Null {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if result.Type != gjson.JSON || !result.IsArray() {
|
||||
return nil, fmt.Errorf("result is not Array but %v", result.Type)
|
||||
}
|
||||
|
||||
return result.Array(), nil
|
||||
}
|
@ -1,514 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// i64sToi32s narrows every element of i64s to int32 with a plain cast (high
// bits are discarded). The result is always non-nil and has the same length
// as the input.
func i64sToi32s(i64s []int64) (i32s []int32) {
	narrowed := make([]int32, 0, len(i64s))
	for _, wide := range i64s {
		narrowed = append(narrowed, int32(wide))
	}

	return narrowed
}
|
||||
|
||||
// readBitPacked decodes header groups of eight bit-packed values, each
// bitWidth bits wide, from reader. Values are packed least-significant bit
// first and may straddle byte boundaries.
//
// It returns nil when header is zero and header*8 zeros when bitWidth is
// zero (nothing is read in either case). Otherwise header*bitWidth bytes are
// requested from reader; if fewer are available the missing bytes decode as
// zeros, and an error is returned only when reader is already exhausted.
func readBitPacked(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) {
	count := header * 8
	switch {
	case count == 0:
		return result, nil
	case bitWidth == 0:
		return make([]int64, count), nil
	}

	data := make([]byte, header*bitWidth)
	if _, err = reader.Read(data); err != nil {
		return nil, err
	}

	// val accumulates the value being assembled; need is the number of bits
	// still missing from it.
	var val uint64
	need := bitWidth
	for _, raw := range data {
		b := uint64(raw)
		left, used := uint64(8), uint64(0)
		for left > 0 {
			if left < need {
				// The rest of this byte supplies only part of the value.
				val |= (b >> used) << (bitWidth - need)
				need -= left
				left = 0
				continue
			}

			// This byte completes the value.
			val |= ((b >> used) & ((1 << need) - 1)) << (bitWidth - need)
			result = append(result, int64(val))
			val = 0
			left -= need
			used += need
			need = bitWidth
		}
	}

	return result, nil
}
|
||||
|
||||
func readBools(reader *bytes.Reader, count uint64) (result []bool, err error) {
|
||||
i64s, err := readBitPacked(reader, count, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
result = append(result, i64s[i] > 0)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readInt32s(reader *bytes.Reader, count uint64) (result []int32, err error) {
|
||||
buf := make([]byte, 4)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, int32(bytesToUint32(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readInt64s(reader *bytes.Reader, count uint64) (result []int64, err error) {
|
||||
buf := make([]byte, 8)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, int64(bytesToUint64(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// readInt96s reads count 12-byte values from reader, returning each as its
// own freshly allocated slice.
//
// It returns an error when reader is exhausted or when fewer than twelve
// bytes remain for a value, instead of returning a partially filled element.
func readInt96s(reader *bytes.Reader, count uint64) (result [][]byte, err error) {
	for i := uint64(0); i < count; i++ {
		buf := make([]byte, 12)

		n, err := reader.Read(buf)
		if err != nil {
			return nil, err
		}
		// bytes.Reader.Read returns whatever is left without an error, so a
		// short read must be detected explicitly.
		if n != len(buf) {
			return nil, errors.New("parquet: truncated int96 value")
		}

		result = append(result, buf)
	}

	return result, nil
}
|
||||
|
||||
func readFloats(reader *bytes.Reader, count uint64) (result []float32, err error) {
|
||||
buf := make([]byte, 4)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, math.Float32frombits(bytesToUint32(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readDoubles(reader *bytes.Reader, count uint64) (result []float64, err error) {
|
||||
buf := make([]byte, 8)
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, math.Float64frombits(bytesToUint64(buf)))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readByteArrays(reader *bytes.Reader, count uint64) (result [][]byte, err error) {
|
||||
buf := make([]byte, 4)
|
||||
var length uint32
|
||||
var data []byte
|
||||
|
||||
var i uint64
|
||||
for i = 0; i < count; i++ {
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
length = bytesToUint32(buf)
|
||||
data = make([]byte, length)
|
||||
if length > 0 {
|
||||
if _, err = reader.Read(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
result = append(result, data)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// readFixedLenByteArrays reads count byte arrays of exactly length bytes each
// from reader, returning every array as its own freshly allocated slice.
//
// A length of zero yields empty, non-nil slices and consumes nothing, even
// when reader is already exhausted. An error is returned when fewer than
// length bytes remain for an element; this is checked before allocation so
// that a corrupt (for example negative-turned-huge) length cannot panic or
// exhaust memory.
func readFixedLenByteArrays(reader *bytes.Reader, count, length uint64) (result [][]byte, err error) {
	for i := uint64(0); i < count; i++ {
		if length > uint64(reader.Len()) {
			return nil, errors.New("parquet: fixed length byte array exceeds remaining data")
		}

		data := make([]byte, length)
		// bytes.Reader.Read reports io.EOF for an empty buffer at end of
		// data, so skip the read entirely for zero-length elements.
		if length > 0 {
			if _, err = reader.Read(data); err != nil {
				return nil, err
			}
		}

		result = append(result, data)
	}

	return result, nil
}
|
||||
|
||||
func readValues(reader *bytes.Reader, dataType parquet.Type, count, length uint64) (interface{}, error) {
|
||||
switch dataType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
return readBools(reader, count)
|
||||
case parquet.Type_INT32:
|
||||
return readInt32s(reader, count)
|
||||
case parquet.Type_INT64:
|
||||
return readInt64s(reader, count)
|
||||
case parquet.Type_INT96:
|
||||
return readInt96s(reader, count)
|
||||
case parquet.Type_FLOAT:
|
||||
return readFloats(reader, count)
|
||||
case parquet.Type_DOUBLE:
|
||||
return readDoubles(reader, count)
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
return readByteArrays(reader, count)
|
||||
case parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
return readFixedLenByteArrays(reader, count, length)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unknown parquet type %v", dataType)
|
||||
}
|
||||
|
||||
// readUnsignedVarInt decodes an unsigned variable-length integer from
// reader: seven payload bits per byte, least-significant group first, with
// the high bit of each byte marking continuation.
//
// It returns the reader's error when the data ends before the final byte,
// and an error when the encoding runs past ten bytes (more than 64 bits)
// instead of silently discarding the excess groups.
func readUnsignedVarInt(reader *bytes.Reader) (v uint64, err error) {
	for shift := uint(0); ; shift += 7 {
		b, err := reader.ReadByte()
		if err != nil {
			return 0, err
		}

		// The tenth byte is shifted by 63; anything further cannot fit.
		if shift >= 64 {
			return 0, errors.New("parquet: varint overflows 64 bits")
		}

		v |= uint64(b&0x7F) << shift
		if b&0x80 == 0 {
			return v, nil
		}
	}
}
|
||||
|
||||
func readRLE(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) {
|
||||
width := (bitWidth + 7) / 8
|
||||
data := make([]byte, width)
|
||||
if width > 0 {
|
||||
if _, err = reader.Read(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if width < 4 {
|
||||
data = append(data, make([]byte, 4-width)...)
|
||||
}
|
||||
|
||||
val := int64(bytesToUint32(data))
|
||||
count := header >> 1
|
||||
if count > math.MaxInt64/8 {
|
||||
// 8 bytes/element.
|
||||
return nil, errors.New("parquet: size too large")
|
||||
}
|
||||
result = make([]int64, count)
|
||||
for i := range result {
|
||||
result[i] = val
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readRLEBitPackedHybrid(reader *bytes.Reader, length, bitWidth uint64) (result []int64, err error) {
|
||||
if length <= 0 {
|
||||
var i32s []int32
|
||||
i32s, err = readInt32s(reader, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if i32s[0] < 0 {
|
||||
return nil, errors.New("parquet: negative RLEBitPackedHybrid length")
|
||||
}
|
||||
length = uint64(i32s[0])
|
||||
}
|
||||
|
||||
buf := make([]byte, length)
|
||||
if _, err = reader.Read(buf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reader = bytes.NewReader(buf)
|
||||
for reader.Len() > 0 {
|
||||
header, err := readUnsignedVarInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var i64s []int64
|
||||
if header&1 == 0 {
|
||||
i64s, err = readRLE(reader, header, bitWidth)
|
||||
} else {
|
||||
i64s, err = readBitPacked(reader, header>>1, bitWidth)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, i64s...)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readDeltaBinaryPackedInt(reader *bytes.Reader) (result []int64, err error) {
|
||||
blockSize, err := readUnsignedVarInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
numMiniblocksInBlock, err := readUnsignedVarInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
numValues, err := readUnsignedVarInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
firstValueZigZag, err := readUnsignedVarInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
v := int64(firstValueZigZag>>1) ^ (-int64(firstValueZigZag & 1))
|
||||
result = append(result, v)
|
||||
if numMiniblocksInBlock == 0 {
|
||||
return nil, errors.New("parquet: zero mini blocks in block")
|
||||
}
|
||||
numValuesInMiniBlock := blockSize / numMiniblocksInBlock
|
||||
|
||||
bitWidths := make([]uint64, numMiniblocksInBlock)
|
||||
for uint64(len(result)) < numValues {
|
||||
minDeltaZigZag, err := readUnsignedVarInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := 0; uint64(i) < numMiniblocksInBlock; i++ {
|
||||
b, err := reader.ReadByte()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
bitWidths[i] = uint64(b)
|
||||
}
|
||||
|
||||
minDelta := int64(minDeltaZigZag>>1) ^ (-int64(minDeltaZigZag & 1))
|
||||
for i := 0; uint64(i) < numMiniblocksInBlock; i++ {
|
||||
i64s, err := readBitPacked(reader, numValuesInMiniBlock/8, bitWidths[i])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for j := range i64s {
|
||||
v += i64s[j] + minDelta
|
||||
result = append(result, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result[:numValues], nil
|
||||
}
|
||||
|
||||
func readDeltaLengthByteArrays(reader *bytes.Reader) (result [][]byte, err error) {
|
||||
i64s, err := readDeltaBinaryPackedInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := 0; i < len(i64s); i++ {
|
||||
arrays, err := readFixedLenByteArrays(reader, 1, uint64(i64s[i]))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, arrays[0])
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readDeltaByteArrays(reader *bytes.Reader) (result [][]byte, err error) {
|
||||
i64s, err := readDeltaBinaryPackedInt(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
suffixes, err := readDeltaLengthByteArrays(reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result = append(result, suffixes[0])
|
||||
for i := 1; i < len(i64s); i++ {
|
||||
prefixLength := i64s[i]
|
||||
val := append([]byte{}, result[i-1][:prefixLength]...)
|
||||
val = append(val, suffixes[i]...)
|
||||
result = append(result, val)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func readDataPageValues(
|
||||
bytesReader *bytes.Reader,
|
||||
encoding parquet.Encoding,
|
||||
dataType parquet.Type,
|
||||
convertedType parquet.ConvertedType,
|
||||
count, bitWidth uint64,
|
||||
) (result interface{}, resultDataType parquet.Type, err error) {
|
||||
switch encoding {
|
||||
case parquet.Encoding_PLAIN:
|
||||
result, err = readValues(bytesReader, dataType, count, bitWidth)
|
||||
return result, dataType, err
|
||||
|
||||
case parquet.Encoding_PLAIN_DICTIONARY:
|
||||
b, err := bytesReader.ReadByte()
|
||||
if err != nil {
|
||||
return nil, -1, err
|
||||
}
|
||||
|
||||
i64s, err := readRLEBitPackedHybrid(bytesReader, uint64(bytesReader.Len()), uint64(b))
|
||||
if err != nil {
|
||||
return nil, -1, err
|
||||
}
|
||||
if len(i64s) < int(count) || count > math.MaxInt64/8 {
|
||||
return nil, -1, errors.New("parquet: value out of range")
|
||||
}
|
||||
return i64s[:count], parquet.Type_INT64, nil
|
||||
|
||||
case parquet.Encoding_RLE:
|
||||
i64s, err := readRLEBitPackedHybrid(bytesReader, 0, bitWidth)
|
||||
if err != nil {
|
||||
return nil, -1, err
|
||||
}
|
||||
|
||||
if len(i64s) < int(count) || count > math.MaxInt64/8 {
|
||||
return nil, -1, errors.New("parquet: value out of range")
|
||||
}
|
||||
i64s = i64s[:count]
|
||||
|
||||
if dataType == parquet.Type_INT32 {
|
||||
return i64sToi32s(i64s), parquet.Type_INT32, nil
|
||||
}
|
||||
|
||||
return i64s, parquet.Type_INT64, nil
|
||||
|
||||
case parquet.Encoding_BIT_PACKED:
|
||||
return nil, -1, fmt.Errorf("deprecated parquet encoding %v", parquet.Encoding_BIT_PACKED)
|
||||
|
||||
case parquet.Encoding_DELTA_BINARY_PACKED:
|
||||
i64s, err := readDeltaBinaryPackedInt(bytesReader)
|
||||
if err != nil {
|
||||
return nil, -1, err
|
||||
}
|
||||
|
||||
if len(i64s) < int(count) || count > math.MaxInt64/8 {
|
||||
return nil, -1, errors.New("parquet: value out of range")
|
||||
}
|
||||
i64s = i64s[:count]
|
||||
|
||||
if dataType == parquet.Type_INT32 {
|
||||
return i64sToi32s(i64s), parquet.Type_INT32, nil
|
||||
}
|
||||
|
||||
return i64s, parquet.Type_INT64, nil
|
||||
|
||||
case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
|
||||
byteSlices, err := readDeltaLengthByteArrays(bytesReader)
|
||||
if err != nil {
|
||||
return nil, -1, err
|
||||
}
|
||||
if len(byteSlices) < int(count) || count > math.MaxInt64/24 {
|
||||
return nil, -1, errors.New("parquet: value out of range")
|
||||
}
|
||||
|
||||
return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil
|
||||
|
||||
case parquet.Encoding_DELTA_BYTE_ARRAY:
|
||||
byteSlices, err := readDeltaByteArrays(bytesReader)
|
||||
if err != nil {
|
||||
return nil, -1, err
|
||||
}
|
||||
if len(byteSlices) < int(count) || count > math.MaxInt64/24 {
|
||||
return nil, -1, errors.New("parquet: value out of range")
|
||||
}
|
||||
|
||||
return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil
|
||||
}
|
||||
|
||||
return nil, -1, fmt.Errorf("unsupported parquet encoding %v", encoding)
|
||||
}
|
@ -1,451 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// boolsToBytes packs booleans into bytes, least-significant bit first:
// bs[i] is stored in bit i%8 of byte i/8. The result holds
// (len(bs)+7)/8 bytes; unused high bits of the last byte stay zero.
func boolsToBytes(bs []bool) []byte {
	packed := make([]byte, (len(bs)+7)/8)
	for idx, set := range bs {
		if !set {
			continue
		}
		packed[idx/8] |= byte(1) << uint32(idx%8)
	}

	return packed
}
|
||||
|
||||
// int32sToBytes serializes each int32 as four little-endian bytes,
// concatenated in input order.
func int32sToBytes(i32s []int32) []byte {
	const width = 4

	out := make([]byte, width*len(i32s))
	for pos := 0; pos < len(i32s); pos++ {
		binary.LittleEndian.PutUint32(out[pos*width:], uint32(i32s[pos]))
	}
	return out
}
|
||||
|
||||
// int64sToBytes serializes each int64 as eight little-endian bytes,
// concatenated in input order.
func int64sToBytes(i64s []int64) []byte {
	const width = 8

	out := make([]byte, width*len(i64s))
	for pos := 0; pos < len(i64s); pos++ {
		binary.LittleEndian.PutUint64(out[pos*width:], uint64(i64s[pos]))
	}
	return out
}
|
||||
|
||||
// float32sToBytes writes the IEEE 754 bit pattern of each float32 as four
// little-endian bytes, concatenated in input order.
func float32sToBytes(f32s []float32) []byte {
	const width = 4

	out := make([]byte, width*len(f32s))
	for pos := 0; pos < len(f32s); pos++ {
		bits := math.Float32bits(f32s[pos])
		binary.LittleEndian.PutUint32(out[pos*width:], bits)
	}
	return out
}
|
||||
|
||||
// float64sToBytes writes the IEEE 754 bit pattern of each float64 as eight
// little-endian bytes, concatenated in input order.
func float64sToBytes(f64s []float64) []byte {
	const width = 8

	out := make([]byte, width*len(f64s))
	for pos := 0; pos < len(f64s); pos++ {
		bits := math.Float64bits(f64s[pos])
		binary.LittleEndian.PutUint64(out[pos*width:], bits)
	}
	return out
}
|
||||
|
||||
// byteSlicesToBytes concatenates the slices, prefixing each one with its
// length as a 4-byte little-endian unsigned integer.
func byteSlicesToBytes(byteSlices [][]byte) []byte {
	var out bytes.Buffer
	var prefix [4]byte

	for _, chunk := range byteSlices {
		binary.LittleEndian.PutUint32(prefix[:], uint32(len(chunk)))
		// bytes.Buffer.Write is documented to always return a nil error.
		_, _ = out.Write(prefix[:])
		_, _ = out.Write(chunk)
	}

	return out.Bytes()
}
|
||||
|
||||
// byteArraysToBytes concatenates equally sized byte arrays without any
// length prefix. It panics if two consecutive arrays differ in length.
func byteArraysToBytes(arrayList [][]byte) []byte {
	var out bytes.Buffer

	for idx, array := range arrayList {
		// Each array is checked against its predecessor; since the first
		// mismatch panics, this amounts to checking against one length.
		if idx > 0 && len(array) != len(arrayList[idx-1]) {
			panic(errors.New("array list does not have same length"))
		}

		if _, err := out.Write(array); err != nil {
			panic(err)
		}
	}

	return out.Bytes()
}
|
||||
|
||||
func int96sToBytes(i96s [][]byte) []byte {
|
||||
return byteArraysToBytes(i96s)
|
||||
}
|
||||
|
||||
func valuesToBytes(values interface{}, dataType parquet.Type) []byte {
|
||||
switch dataType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
return boolsToBytes(values.([]bool))
|
||||
case parquet.Type_INT32:
|
||||
return int32sToBytes(values.([]int32))
|
||||
case parquet.Type_INT64:
|
||||
return int64sToBytes(values.([]int64))
|
||||
case parquet.Type_INT96:
|
||||
return int96sToBytes(values.([][]byte))
|
||||
case parquet.Type_FLOAT:
|
||||
return float32sToBytes(values.([]float32))
|
||||
case parquet.Type_DOUBLE:
|
||||
return float64sToBytes(values.([]float64))
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
return byteSlicesToBytes(values.([][]byte))
|
||||
case parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
return byteArraysToBytes(values.([][]byte))
|
||||
}
|
||||
|
||||
return []byte{}
|
||||
}
|
||||
|
||||
func valueToBytes(value interface{}, dataType parquet.Type) []byte {
|
||||
var values interface{}
|
||||
switch dataType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
values = []bool{value.(bool)}
|
||||
case parquet.Type_INT32:
|
||||
values = []int32{value.(int32)}
|
||||
case parquet.Type_INT64:
|
||||
values = []int64{value.(int64)}
|
||||
case parquet.Type_INT96:
|
||||
values = [][]byte{value.([]byte)}
|
||||
case parquet.Type_FLOAT:
|
||||
values = []float32{value.(float32)}
|
||||
case parquet.Type_DOUBLE:
|
||||
values = []float64{value.(float64)}
|
||||
case parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
|
||||
values = [][]byte{value.([]byte)}
|
||||
}
|
||||
|
||||
return valuesToBytes(values, dataType)
|
||||
}
|
||||
|
||||
func unsignedVarIntToBytes(ui64 uint64) []byte {
|
||||
size := (getBitWidth(ui64) + 6) / 7
|
||||
if size == 0 {
|
||||
return []byte{0}
|
||||
}
|
||||
|
||||
buf := make([]byte, size)
|
||||
for i := uint64(0); i < size; i++ {
|
||||
buf[i] = byte(ui64&0x7F) | 0x80
|
||||
ui64 >>= 7
|
||||
}
|
||||
buf[size-1] &= 0x7F
|
||||
|
||||
return buf
|
||||
}
|
||||
|
||||
func valuesToRLEBytes(values interface{}, bitWidth int32, valueType parquet.Type) []byte {
|
||||
vals := valuesToInterfaces(values, valueType)
|
||||
result := []byte{}
|
||||
j := 0
|
||||
for i := 0; i < len(vals); i = j {
|
||||
for j = i + 1; j < len(vals) && vals[i] == vals[j]; j++ {
|
||||
}
|
||||
headerBytes := unsignedVarIntToBytes(uint64((j - i) << 1))
|
||||
result = append(result, headerBytes...)
|
||||
|
||||
valBytes := valueToBytes(vals[i], valueType)
|
||||
byteCount := (bitWidth + 7) / 8
|
||||
result = append(result, valBytes[:byteCount]...)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func valuesToRLEBitPackedHybridBytes(values interface{}, bitWidth int32, dataType parquet.Type) []byte {
|
||||
rleBytes := valuesToRLEBytes(values, bitWidth, dataType)
|
||||
lenBytes := valueToBytes(int32(len(rleBytes)), parquet.Type_INT32)
|
||||
return append(lenBytes, rleBytes...)
|
||||
}
|
||||
|
||||
func valuesToBitPackedBytes(values interface{}, bitWidth int64, withHeader bool, dataType parquet.Type) []byte {
|
||||
var i64s []int64
|
||||
switch dataType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs := values.([]bool)
|
||||
i64s = make([]int64, len(bs))
|
||||
for i := range bs {
|
||||
if bs[i] {
|
||||
i64s[i] = 1
|
||||
}
|
||||
}
|
||||
case parquet.Type_INT32:
|
||||
i32s := values.([]int32)
|
||||
i64s = make([]int64, len(i32s))
|
||||
for i := range i32s {
|
||||
i64s[i] = int64(i32s[i])
|
||||
}
|
||||
case parquet.Type_INT64:
|
||||
i64s = values.([]int64)
|
||||
default:
|
||||
panic(fmt.Errorf("data type %v is not supported for bit packing", dataType))
|
||||
}
|
||||
|
||||
if len(i64s) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var valueByte byte
|
||||
bitsSet := uint64(0)
|
||||
bitsNeeded := uint64(8)
|
||||
bitsToSet := uint64(bitWidth)
|
||||
value := i64s[0]
|
||||
|
||||
valueBytes := []byte{}
|
||||
for i := 0; i < len(i64s); {
|
||||
if bitsToSet >= bitsNeeded {
|
||||
valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded))
|
||||
valueBytes = append(valueBytes, valueByte)
|
||||
bitsToSet -= bitsNeeded
|
||||
bitsSet += bitsNeeded
|
||||
|
||||
bitsNeeded = 8
|
||||
valueByte = 0
|
||||
|
||||
if bitsToSet <= 0 && (i+1) < len(i64s) {
|
||||
i++
|
||||
value = i64s[i]
|
||||
bitsToSet = uint64(bitWidth)
|
||||
bitsSet = 0
|
||||
}
|
||||
} else {
|
||||
valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded))
|
||||
i++
|
||||
|
||||
if i < len(i64s) {
|
||||
value = i64s[i]
|
||||
}
|
||||
|
||||
bitsNeeded -= bitsToSet
|
||||
bitsToSet = uint64(bitWidth)
|
||||
bitsSet = 0
|
||||
}
|
||||
}
|
||||
|
||||
if withHeader {
|
||||
header := uint64(((len(i64s) / 8) << 1) | 1)
|
||||
headerBytes := unsignedVarIntToBytes(header)
|
||||
return append(headerBytes, valueBytes...)
|
||||
}
|
||||
|
||||
return valueBytes
|
||||
}
|
||||
|
||||
const (
|
||||
blockSize = 128
|
||||
subBlockSize = 32
|
||||
subBlockCount = blockSize / subBlockSize
|
||||
)
|
||||
|
||||
var (
|
||||
blockSizeBytes = unsignedVarIntToBytes(blockSize)
|
||||
subBlockCountBytes = unsignedVarIntToBytes(subBlockCount)
|
||||
)
|
||||
|
||||
func int32ToDeltaBytes(i32s []int32) []byte {
|
||||
getValue := func(i32 int32) uint64 {
|
||||
return uint64((i32 >> 31) ^ (i32 << 1))
|
||||
}
|
||||
|
||||
result := append([]byte{}, blockSizeBytes...)
|
||||
result = append(result, subBlockCountBytes...)
|
||||
result = append(result, unsignedVarIntToBytes(uint64(len(i32s)))...)
|
||||
result = append(result, unsignedVarIntToBytes(getValue(i32s[0]))...)
|
||||
|
||||
for i := 1; i < len(i32s); {
|
||||
block := []int32{}
|
||||
minDelta := int32(0x7FFFFFFF)
|
||||
|
||||
for ; i < len(i32s) && len(block) < blockSize; i++ {
|
||||
delta := i32s[i] - i32s[i-1]
|
||||
block = append(block, delta)
|
||||
if delta < minDelta {
|
||||
minDelta = delta
|
||||
}
|
||||
}
|
||||
|
||||
for len(block) < blockSize {
|
||||
block = append(block, minDelta)
|
||||
}
|
||||
|
||||
bitWidths := make([]byte, subBlockCount)
|
||||
for j := 0; j < subBlockCount; j++ {
|
||||
maxValue := int32(0)
|
||||
for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
|
||||
block[k] -= minDelta
|
||||
if block[k] > maxValue {
|
||||
maxValue = block[k]
|
||||
}
|
||||
}
|
||||
|
||||
bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
|
||||
}
|
||||
|
||||
minDeltaZigZag := getValue(minDelta)
|
||||
result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
|
||||
result = append(result, bitWidths...)
|
||||
|
||||
for j := 0; j < subBlockCount; j++ {
|
||||
bitPacked := valuesToBitPackedBytes(
|
||||
block[j*subBlockSize:(j+1)*subBlockSize],
|
||||
int64(bitWidths[j]),
|
||||
false,
|
||||
parquet.Type_INT32,
|
||||
)
|
||||
result = append(result, bitPacked...)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func int64ToDeltaBytes(i64s []int64) []byte {
|
||||
getValue := func(i64 int64) uint64 {
|
||||
return uint64((i64 >> 63) ^ (i64 << 1))
|
||||
}
|
||||
|
||||
result := append([]byte{}, blockSizeBytes...)
|
||||
result = append(result, subBlockCountBytes...)
|
||||
result = append(result, unsignedVarIntToBytes(uint64(len(i64s)))...)
|
||||
result = append(result, unsignedVarIntToBytes(getValue(i64s[0]))...)
|
||||
|
||||
for i := 1; i < len(i64s); {
|
||||
block := []int64{}
|
||||
minDelta := int64(0x7FFFFFFFFFFFFFFF)
|
||||
|
||||
for ; i < len(i64s) && len(block) < blockSize; i++ {
|
||||
delta := i64s[i] - i64s[i-1]
|
||||
block = append(block, delta)
|
||||
if delta < minDelta {
|
||||
minDelta = delta
|
||||
}
|
||||
}
|
||||
|
||||
for len(block) < blockSize {
|
||||
block = append(block, minDelta)
|
||||
}
|
||||
|
||||
bitWidths := make([]byte, subBlockCount)
|
||||
for j := 0; j < subBlockCount; j++ {
|
||||
maxValue := int64(0)
|
||||
for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
|
||||
block[k] -= minDelta
|
||||
if block[k] > maxValue {
|
||||
maxValue = block[k]
|
||||
}
|
||||
}
|
||||
|
||||
bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
|
||||
}
|
||||
|
||||
minDeltaZigZag := getValue(minDelta)
|
||||
result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
|
||||
result = append(result, bitWidths...)
|
||||
|
||||
for j := 0; j < subBlockCount; j++ {
|
||||
bitPacked := valuesToBitPackedBytes(
|
||||
block[j*subBlockSize:(j+1)*subBlockSize],
|
||||
int64(bitWidths[j]),
|
||||
false,
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
result = append(result, bitPacked...)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func valuesToDeltaBytes(values interface{}, dataType parquet.Type) []byte {
|
||||
switch dataType {
|
||||
case parquet.Type_INT32:
|
||||
return int32ToDeltaBytes(values.([]int32))
|
||||
case parquet.Type_INT64:
|
||||
return int64ToDeltaBytes(values.([]int64))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func stringsToDeltaLengthByteArrayBytes(strs []string) []byte {
|
||||
lengths := make([]int32, len(strs))
|
||||
for i, s := range strs {
|
||||
lengths[i] = int32(len(s))
|
||||
}
|
||||
|
||||
result := int32ToDeltaBytes(lengths)
|
||||
for _, s := range strs {
|
||||
result = append(result, []byte(s)...)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func stringsToDeltaByteArrayBytes(strs []string) []byte {
|
||||
prefixLengths := make([]int32, len(strs))
|
||||
suffixes := make([]string, len(strs))
|
||||
|
||||
var i, j int
|
||||
for i = 1; i < len(strs); i++ {
|
||||
for j = 0; j < len(strs[i-1]) && j < len(strs[i]); j++ {
|
||||
if strs[i-1][j] != strs[i][j] {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
prefixLengths[i] = int32(j)
|
||||
suffixes[i] = strs[i][j:]
|
||||
}
|
||||
|
||||
result := int32ToDeltaBytes(prefixLengths)
|
||||
return append(result, stringsToDeltaLengthByteArrayBytes(suffixes)...)
|
||||
}
|
||||
|
||||
func encodeValues(values interface{}, dataType parquet.Type, encoding parquet.Encoding, bitWidth int32) []byte {
|
||||
switch encoding {
|
||||
case parquet.Encoding_RLE:
|
||||
return valuesToRLEBitPackedHybridBytes(values, bitWidth, dataType)
|
||||
case parquet.Encoding_DELTA_BINARY_PACKED:
|
||||
return valuesToDeltaBytes(values, dataType)
|
||||
case parquet.Encoding_DELTA_BYTE_ARRAY:
|
||||
return stringsToDeltaByteArrayBytes(values.([]string))
|
||||
case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
|
||||
return stringsToDeltaLengthByteArrayBytes(values.([]string))
|
||||
}
|
||||
|
||||
return valuesToBytes(values, dataType)
|
||||
}
|
@ -1,190 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func TestBoolsToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
bs []bool
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]bool{}, []byte{}},
|
||||
{[]bool{true}, []byte{1}},
|
||||
{[]bool{false}, []byte{0}},
|
||||
{[]bool{true, true}, []byte{3}},
|
||||
{[]bool{false, false}, []byte{0}},
|
||||
{[]bool{false, true}, []byte{2}},
|
||||
{[]bool{true, false}, []byte{1}},
|
||||
{[]bool{false, false, false, false, false, false, false, true, true}, []byte{128, 1}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := boolsToBytes(testCase.bs)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestInt32sToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
i32s []int32
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]int32{}, []byte{}},
|
||||
{[]int32{1}, []byte{1, 0, 0, 0}},
|
||||
{[]int32{-1}, []byte{255, 255, 255, 255}},
|
||||
{[]int32{256}, []byte{0, 1, 0, 0}},
|
||||
{[]int32{math.MinInt32}, []byte{0, 0, 0, 128}},
|
||||
{[]int32{math.MaxInt32}, []byte{255, 255, 255, 127}},
|
||||
{[]int32{257, -2}, []byte{1, 1, 0, 0, 254, 255, 255, 255}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := int32sToBytes(testCase.i32s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestInt64sToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
i64s []int64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]int64{}, []byte{}},
|
||||
{[]int64{1}, []byte{1, 0, 0, 0, 0, 0, 0, 0}},
|
||||
{[]int64{-1}, []byte{255, 255, 255, 255, 255, 255, 255, 255}},
|
||||
{[]int64{256}, []byte{0, 1, 0, 0, 0, 0, 0, 0}},
|
||||
{[]int64{math.MinInt64}, []byte{0, 0, 0, 0, 0, 0, 0, 128}},
|
||||
{[]int64{math.MaxInt64}, []byte{255, 255, 255, 255, 255, 255, 255, 127}},
|
||||
{[]int64{257, -2}, []byte{1, 1, 0, 0, 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := int64sToBytes(testCase.i64s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFloat32sToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
f32s []float32
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]float32{}, []byte{}},
|
||||
{[]float32{1}, []byte{0, 0, 128, 63}},
|
||||
{[]float32{1.0}, []byte{0, 0, 128, 63}},
|
||||
{[]float32{-1}, []byte{0, 0, 128, 191}},
|
||||
{[]float32{-1.0}, []byte{0, 0, 128, 191}},
|
||||
{[]float32{256}, []byte{0, 0, 128, 67}},
|
||||
{[]float32{1.1}, []byte{205, 204, 140, 63}},
|
||||
{[]float32{-1.1}, []byte{205, 204, 140, 191}},
|
||||
{[]float32{math.Pi}, []byte{219, 15, 73, 64}},
|
||||
{[]float32{257, -2}, []byte{0, 128, 128, 67, 0, 0, 0, 192}},
|
||||
{[]float32{257.1, -2.1}, []byte{205, 140, 128, 67, 102, 102, 6, 192}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := float32sToBytes(testCase.f32s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFloat64sToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
f64s []float64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]float64{}, []byte{}},
|
||||
{[]float64{1}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
|
||||
{[]float64{1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
|
||||
{[]float64{-1}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
|
||||
{[]float64{-1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
|
||||
{[]float64{256}, []byte{0, 0, 0, 0, 0, 0, 112, 64}},
|
||||
{[]float64{1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 63}},
|
||||
{[]float64{-1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 191}},
|
||||
{[]float64{math.Pi}, []byte{24, 45, 68, 84, 251, 33, 9, 64}},
|
||||
{[]float64{257, -2}, []byte{0, 0, 0, 0, 0, 16, 112, 64, 0, 0, 0, 0, 0, 0, 0, 192}},
|
||||
{[]float64{257.1, -2.1}, []byte{154, 153, 153, 153, 153, 17, 112, 64, 205, 204, 204, 204, 204, 204, 0, 192}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := float64sToBytes(testCase.f64s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnsignedVarIntToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ui64 uint64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{0, []byte{0}},
|
||||
{1, []byte{1}},
|
||||
{0x7F, []byte{127}},
|
||||
{0x80, []byte{128, 1}},
|
||||
{uint64(math.MaxUint64), []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 1}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := unsignedVarIntToBytes(testCase.ui64)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestValuesToRLEBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
values interface{}
|
||||
bitWidth int32
|
||||
dataType parquet.Type
|
||||
expectedResult []byte
|
||||
}{
|
||||
{[]int32{3, 5, 7}, 1, parquet.Type_INT32, []byte{2, 3, 2, 5, 2, 7}},
|
||||
{[]int32{3, 3, 3}, 1, parquet.Type_INT32, []byte{6, 3}},
|
||||
{[]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32, []byte{4, 2, 6, 3}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := valuesToRLEBytes(testCase.values, testCase.bitWidth, testCase.dataType)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
@ -1,39 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
)
|
||||
|
||||
// Refer https://en.wikipedia.org/wiki/LEB128#Unsigned_LEB128
|
||||
func varIntEncode(ui64 uint64) []byte {
|
||||
if ui64 == 0 {
|
||||
return []byte{0}
|
||||
}
|
||||
|
||||
length := int(common.BitWidth(ui64)+6) / 7
|
||||
data := make([]byte, length)
|
||||
for i := 0; i < length; i++ {
|
||||
data[i] = byte(ui64&0x7F) | 0x80
|
||||
ui64 >>= 7
|
||||
}
|
||||
data[length-1] &= 0x7F
|
||||
|
||||
return data
|
||||
}
|
@ -1,44 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestVarIntToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ui64 uint64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{0, []byte{0}},
|
||||
{1, []byte{1}},
|
||||
{0x7F, []byte{127}},
|
||||
{0x80, []byte{128, 1}},
|
||||
{uint64(math.MaxUint64), []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 1}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := varIntEncode(testCase.ui64)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
@ -1,297 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
const (
|
||||
blockSize = 128
|
||||
miniBlockSize = 32
|
||||
miniBlockCount = blockSize / miniBlockSize
|
||||
)
|
||||
|
||||
var deltaEncodeHeaderBytes []byte
|
||||
|
||||
func init() {
|
||||
deltaEncodeHeaderBytes = varIntEncode(blockSize)
|
||||
deltaEncodeHeaderBytes = append(deltaEncodeHeaderBytes, varIntEncode(miniBlockCount)...)
|
||||
}
|
||||
|
||||
// Supported Types: BOOLEAN, INT32, INT64
|
||||
func bitPackedEncode(values interface{}, bitWidth uint64, withHeader bool, parquetType parquet.Type) []byte {
|
||||
var i64s []int64
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs, ok := values.([]bool)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of bool"))
|
||||
}
|
||||
|
||||
i64s = make([]int64, len(bs))
|
||||
for i := range bs {
|
||||
if bs[i] {
|
||||
i64s[i] = 1
|
||||
}
|
||||
}
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
|
||||
for i := range i32s {
|
||||
i64s[i] = int64(i32s[i])
|
||||
}
|
||||
case parquet.Type_INT64:
|
||||
var ok bool
|
||||
i64s, ok = values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
if len(i64s) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var valueByte byte
|
||||
bitsSet := uint64(0)
|
||||
bitsNeeded := uint64(8)
|
||||
bitsToSet := bitWidth
|
||||
value := i64s[0]
|
||||
|
||||
valueBytes := []byte{}
|
||||
for i := 0; i < len(i64s); {
|
||||
if bitsToSet >= bitsNeeded {
|
||||
valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded))
|
||||
valueBytes = append(valueBytes, valueByte)
|
||||
bitsToSet -= bitsNeeded
|
||||
bitsSet += bitsNeeded
|
||||
|
||||
bitsNeeded = 8
|
||||
valueByte = 0
|
||||
|
||||
if bitsToSet <= 0 && (i+1) < len(i64s) {
|
||||
i++
|
||||
value = i64s[i]
|
||||
bitsToSet = bitWidth
|
||||
bitsSet = 0
|
||||
}
|
||||
} else {
|
||||
valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded))
|
||||
i++
|
||||
|
||||
if i < len(i64s) {
|
||||
value = i64s[i]
|
||||
}
|
||||
|
||||
bitsNeeded -= bitsToSet
|
||||
bitsToSet = bitWidth
|
||||
bitsSet = 0
|
||||
}
|
||||
}
|
||||
|
||||
if withHeader {
|
||||
header := uint64(((len(i64s) / 8) << 1) | 1)
|
||||
headerBytes := varIntEncode(header)
|
||||
return append(headerBytes, valueBytes...)
|
||||
}
|
||||
|
||||
return valueBytes
|
||||
}
|
||||
|
||||
func deltaEncodeInt32s(i32s []int32) (data []byte) {
|
||||
getValue := func(i32 int32) uint64 {
|
||||
return uint64((i32 >> 31) ^ (i32 << 1))
|
||||
}
|
||||
|
||||
data = append(data, deltaEncodeHeaderBytes...)
|
||||
data = append(data, varIntEncode(uint64(len(i32s)))...)
|
||||
data = append(data, varIntEncode(getValue(i32s[0]))...)
|
||||
|
||||
for i := 1; i < len(i32s); {
|
||||
block := []int32{}
|
||||
minDelta := int32(0x7FFFFFFF)
|
||||
|
||||
for ; i < len(i32s) && len(block) < blockSize; i++ {
|
||||
delta := i32s[i] - i32s[i-1]
|
||||
block = append(block, delta)
|
||||
if delta < minDelta {
|
||||
minDelta = delta
|
||||
}
|
||||
}
|
||||
|
||||
for len(block) < blockSize {
|
||||
block = append(block, minDelta)
|
||||
}
|
||||
|
||||
bitWidths := make([]byte, miniBlockCount)
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
maxValue := int32(0)
|
||||
for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
|
||||
block[k] -= minDelta
|
||||
if block[k] > maxValue {
|
||||
maxValue = block[k]
|
||||
}
|
||||
}
|
||||
|
||||
bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
|
||||
}
|
||||
|
||||
minDeltaZigZag := getValue(minDelta)
|
||||
data = append(data, varIntEncode(minDeltaZigZag)...)
|
||||
data = append(data, bitWidths...)
|
||||
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
bitPacked := bitPackedEncode(
|
||||
block[j*miniBlockSize:(j+1)*miniBlockSize],
|
||||
uint64(bitWidths[j]),
|
||||
false,
|
||||
parquet.Type_INT32,
|
||||
)
|
||||
data = append(data, bitPacked...)
|
||||
}
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
func deltaEncodeInt64s(i64s []int64) (data []byte) {
|
||||
getValue := func(i64 int64) uint64 {
|
||||
return uint64((i64 >> 63) ^ (i64 << 1))
|
||||
}
|
||||
|
||||
data = append(data, deltaEncodeHeaderBytes...)
|
||||
data = append(data, varIntEncode(uint64(len(i64s)))...)
|
||||
data = append(data, varIntEncode(getValue(i64s[0]))...)
|
||||
|
||||
for i := 1; i < len(i64s); {
|
||||
block := []int64{}
|
||||
minDelta := int64(0x7FFFFFFFFFFFFFFF)
|
||||
|
||||
for ; i < len(i64s) && len(block) < blockSize; i++ {
|
||||
delta := i64s[i] - i64s[i-1]
|
||||
block = append(block, delta)
|
||||
if delta < minDelta {
|
||||
minDelta = delta
|
||||
}
|
||||
}
|
||||
|
||||
for len(block) < blockSize {
|
||||
block = append(block, minDelta)
|
||||
}
|
||||
|
||||
bitWidths := make([]byte, miniBlockCount)
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
maxValue := int64(0)
|
||||
for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
|
||||
block[k] -= minDelta
|
||||
if block[k] > maxValue {
|
||||
maxValue = block[k]
|
||||
}
|
||||
}
|
||||
|
||||
bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
|
||||
}
|
||||
|
||||
minDeltaZigZag := getValue(minDelta)
|
||||
data = append(data, varIntEncode(minDeltaZigZag)...)
|
||||
data = append(data, bitWidths...)
|
||||
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
bitPacked := bitPackedEncode(
|
||||
block[j*miniBlockSize:(j+1)*miniBlockSize],
|
||||
uint64(bitWidths[j]),
|
||||
false,
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
data = append(data, bitPacked...)
|
||||
}
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// DeltaEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5
|
||||
//
|
||||
// Supported Types: INT32, INT64.
|
||||
func DeltaEncode(values interface{}, parquetType parquet.Type) []byte {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
return deltaEncodeInt32s(i32s)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
return deltaEncodeInt64s(i64s)
|
||||
}
|
||||
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
// DeltaLengthByteArrayEncode encodes bytes slices specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
|
||||
//
|
||||
// Supported Types: BYTE_ARRAY
|
||||
func DeltaLengthByteArrayEncode(bytesSlices [][]byte) (data []byte) {
|
||||
lengths := make([]int32, len(bytesSlices))
|
||||
for i, bytes := range bytesSlices {
|
||||
lengths[i] = int32(len(bytes))
|
||||
}
|
||||
|
||||
data = deltaEncodeInt32s(lengths)
|
||||
for _, bytes := range bytesSlices {
|
||||
data = append(data, []byte(bytes)...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// DeltaByteArrayEncode encodes sequence of strings values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7
|
||||
//
|
||||
// Supported Types: BYTE_ARRAY
|
||||
func DeltaByteArrayEncode(bytesSlices [][]byte) (data []byte) {
|
||||
prefixLengths := make([]int32, len(bytesSlices))
|
||||
suffixes := make([][]byte, len(bytesSlices))
|
||||
|
||||
var i, j int
|
||||
for i = 1; i < len(bytesSlices); i++ {
|
||||
for j = 0; j < len(bytesSlices[i-1]) && j < len(bytesSlices[i]); j++ {
|
||||
if bytesSlices[i-1][j] != bytesSlices[i][j] {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
prefixLengths[i] = int32(j)
|
||||
suffixes[i] = bytesSlices[i][j:]
|
||||
}
|
||||
|
||||
data = deltaEncodeInt32s(prefixLengths)
|
||||
return append(data, DeltaLengthByteArrayEncode(suffixes)...)
|
||||
}
|
@ -1,141 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// plainEncodeBools packs booleans into bytes, eight per byte, with the first
// value of each group stored in the least significant bit. The result always
// has (len(bs)+7)/8 bytes; unused high bits of the last byte are zero.
func plainEncodeBools(bs []bool) []byte {
	packed := make([]byte, (len(bs)+7)/8)

	for idx, set := range bs {
		if !set {
			continue
		}
		// idx>>3 selects the byte, idx&7 the bit inside that byte.
		packed[idx>>3] |= 1 << uint(idx&7)
	}

	return packed
}
|
||||
|
||||
// plainEncodeInt32s writes every int32 as four little-endian bytes, in input
// order, and returns the concatenation.
func plainEncodeInt32s(i32s []int32) []byte {
	const width = 4
	out := make([]byte, width*len(i32s))

	for pos := range i32s {
		binary.LittleEndian.PutUint32(out[pos*width:], uint32(i32s[pos]))
	}

	return out
}
|
||||
|
||||
// plainEncodeInt64s writes every int64 as eight little-endian bytes, in input
// order, and returns the concatenation.
func plainEncodeInt64s(i64s []int64) []byte {
	const width = 8
	out := make([]byte, width*len(i64s))

	for pos := range i64s {
		binary.LittleEndian.PutUint64(out[pos*width:], uint64(i64s[pos]))
	}

	return out
}
|
||||
|
||||
// plainEncodeFloat32s writes the IEEE 754 bit pattern of every float32 as
// four little-endian bytes, in input order.
func plainEncodeFloat32s(f32s []float32) []byte {
	const width = 4
	out := make([]byte, width*len(f32s))

	for pos := range f32s {
		bits := math.Float32bits(f32s[pos])
		binary.LittleEndian.PutUint32(out[pos*width:], bits)
	}

	return out
}
|
||||
|
||||
// plainEncodeFloat64s writes the IEEE 754 bit pattern of every float64 as
// eight little-endian bytes, in input order.
func plainEncodeFloat64s(f64s []float64) []byte {
	const width = 8
	out := make([]byte, width*len(f64s))

	for pos := range f64s {
		bits := math.Float64bits(f64s[pos])
		binary.LittleEndian.PutUint64(out[pos*width:], bits)
	}

	return out
}
|
||||
|
||||
// plainEncodeBytesSlices encodes every slice as a four-byte little-endian
// length followed by the slice's bytes, and returns the concatenation.
func plainEncodeBytesSlices(bytesSlices [][]byte) []byte {
	var out bytes.Buffer
	var lenPrefix [4]byte

	for _, chunk := range bytesSlices {
		binary.LittleEndian.PutUint32(lenPrefix[:], uint32(len(chunk)))
		// bytes.Buffer.Write always returns a nil error, so there is
		// nothing to check here.
		out.Write(lenPrefix[:])
		out.Write(chunk)
	}

	return out.Bytes()
}
|
||||
|
||||
// PlainEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0
|
||||
//
|
||||
// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
|
||||
func PlainEncode(values interface{}, parquetType parquet.Type) []byte {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs, ok := values.([]bool)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of bool"))
|
||||
}
|
||||
return plainEncodeBools(bs)
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
return plainEncodeInt32s(i32s)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
return plainEncodeInt64s(i64s)
|
||||
case parquet.Type_FLOAT:
|
||||
f32s, ok := values.([]float32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of float32"))
|
||||
}
|
||||
return plainEncodeFloat32s(f32s)
|
||||
case parquet.Type_DOUBLE:
|
||||
f64s, ok := values.([]float64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of float64"))
|
||||
}
|
||||
return plainEncodeFloat64s(f64s)
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
bytesSlices, ok := values.([][]byte)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of byte array"))
|
||||
}
|
||||
return plainEncodeBytesSlices(bytesSlices)
|
||||
}
|
||||
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
@ -1,148 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlainEncodeBools(t *testing.T) {
|
||||
testCases := []struct {
|
||||
bs []bool
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]bool{}, []byte{}},
|
||||
{[]bool{true}, []byte{1}},
|
||||
{[]bool{false}, []byte{0}},
|
||||
{[]bool{true, true}, []byte{3}},
|
||||
{[]bool{false, false}, []byte{0}},
|
||||
{[]bool{false, true}, []byte{2}},
|
||||
{[]bool{true, false}, []byte{1}},
|
||||
{[]bool{false, false, false, false, false, false, false, true, true}, []byte{128, 1}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeBools(testCase.bs)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeInt32s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
i32s []int32
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]int32{}, []byte{}},
|
||||
{[]int32{1}, []byte{1, 0, 0, 0}},
|
||||
{[]int32{-1}, []byte{255, 255, 255, 255}},
|
||||
{[]int32{256}, []byte{0, 1, 0, 0}},
|
||||
{[]int32{math.MinInt32}, []byte{0, 0, 0, 128}},
|
||||
{[]int32{math.MaxInt32}, []byte{255, 255, 255, 127}},
|
||||
{[]int32{257, -2}, []byte{1, 1, 0, 0, 254, 255, 255, 255}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeInt32s(testCase.i32s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeInt64s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
i64s []int64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]int64{}, []byte{}},
|
||||
{[]int64{1}, []byte{1, 0, 0, 0, 0, 0, 0, 0}},
|
||||
{[]int64{-1}, []byte{255, 255, 255, 255, 255, 255, 255, 255}},
|
||||
{[]int64{256}, []byte{0, 1, 0, 0, 0, 0, 0, 0}},
|
||||
{[]int64{math.MinInt64}, []byte{0, 0, 0, 0, 0, 0, 0, 128}},
|
||||
{[]int64{math.MaxInt64}, []byte{255, 255, 255, 255, 255, 255, 255, 127}},
|
||||
{[]int64{257, -2}, []byte{1, 1, 0, 0, 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeInt64s(testCase.i64s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeFloat32s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
f32s []float32
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]float32{}, []byte{}},
|
||||
{[]float32{1}, []byte{0, 0, 128, 63}},
|
||||
{[]float32{1.0}, []byte{0, 0, 128, 63}},
|
||||
{[]float32{-1}, []byte{0, 0, 128, 191}},
|
||||
{[]float32{-1.0}, []byte{0, 0, 128, 191}},
|
||||
{[]float32{256}, []byte{0, 0, 128, 67}},
|
||||
{[]float32{1.1}, []byte{205, 204, 140, 63}},
|
||||
{[]float32{-1.1}, []byte{205, 204, 140, 191}},
|
||||
{[]float32{math.Pi}, []byte{219, 15, 73, 64}},
|
||||
{[]float32{257, -2}, []byte{0, 128, 128, 67, 0, 0, 0, 192}},
|
||||
{[]float32{257.1, -2.1}, []byte{205, 140, 128, 67, 102, 102, 6, 192}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeFloat32s(testCase.f32s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeFloat64s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
f64s []float64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]float64{}, []byte{}},
|
||||
{[]float64{1}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
|
||||
{[]float64{1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
|
||||
{[]float64{-1}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
|
||||
{[]float64{-1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
|
||||
{[]float64{256}, []byte{0, 0, 0, 0, 0, 0, 112, 64}},
|
||||
{[]float64{1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 63}},
|
||||
{[]float64{-1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 191}},
|
||||
{[]float64{math.Pi}, []byte{24, 45, 68, 84, 251, 33, 9, 64}},
|
||||
{[]float64{257, -2}, []byte{0, 0, 0, 0, 0, 16, 112, 64, 0, 0, 0, 0, 0, 0, 0, 192}},
|
||||
{[]float64{257.1, -2.1}, []byte{154, 153, 153, 153, 153, 17, 112, 64, 205, 204, 204, 204, 204, 204, 0, 192}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeFloat64s(testCase.f64s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
@ -1,85 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func rleEncodeInt32s(i32s []int32, bitWidth int32) (data []byte) {
|
||||
j := 0
|
||||
for i := 0; i < len(i32s); i = j {
|
||||
for j = i + 1; j < len(i32s) && i32s[i] == i32s[j]; j++ {
|
||||
}
|
||||
|
||||
headerBytes := varIntEncode(uint64((j - i) << 1))
|
||||
data = append(data, headerBytes...)
|
||||
|
||||
valBytes := plainEncodeInt32s([]int32{i32s[i]})
|
||||
byteCount := (bitWidth + 7) / 8
|
||||
data = append(data, valBytes[:byteCount]...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
func rleEncodeInt64s(i64s []int64, bitWidth int32) (data []byte) {
|
||||
j := 0
|
||||
for i := 0; i < len(i64s); i = j {
|
||||
for j = i + 1; j < len(i64s) && i64s[i] == i64s[j]; j++ {
|
||||
}
|
||||
|
||||
headerBytes := varIntEncode(uint64((j - i) << 1))
|
||||
data = append(data, headerBytes...)
|
||||
|
||||
valBytes := plainEncodeInt64s([]int64{i64s[i]})
|
||||
byteCount := (bitWidth + 7) / 8
|
||||
data = append(data, valBytes[:byteCount]...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// RLEBitPackedHybridEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3
|
||||
//
|
||||
// Supported Types: INT32, INT64
|
||||
func RLEBitPackedHybridEncode(values interface{}, bitWidth int32, parquetType parquet.Type) []byte {
|
||||
var rleBytes []byte
|
||||
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
rleBytes = rleEncodeInt32s(i32s, bitWidth)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
rleBytes = rleEncodeInt64s(i64s, bitWidth)
|
||||
default:
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
lenBytes := plainEncodeInt32s([]int32{int32(len(rleBytes))})
|
||||
return append(lenBytes, rleBytes...)
|
||||
}
|
@ -1,45 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func TestRLEEncodeInt32s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
values []int32
|
||||
bitWidth int32
|
||||
dataType parquet.Type
|
||||
expectedResult []byte
|
||||
}{
|
||||
{[]int32{3, 5, 7}, 1, parquet.Type_INT32, []byte{2, 3, 2, 5, 2, 7}},
|
||||
{[]int32{3, 3, 3}, 1, parquet.Type_INT32, []byte{6, 3}},
|
||||
{[]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32, []byte{4, 2, 6, 3}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := rleEncodeInt32s(testCase.values, testCase.bitWidth)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// RLEDictEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 and returns dictionary page data and data page data.
|
||||
//
|
||||
// Dictionary page data contains PLAIN encodeed slice of uniquely fully defined non-nil values.
|
||||
// Data page data contains RLE/Bit-Packed Hybrid encoded indices of fully defined non-nil values.
|
||||
//
|
||||
// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
|
||||
func RLEDictEncode(values []interface{}, parquetType parquet.Type, bitWidth int32) (dictPageData, dataPageData []byte, dictValueCount int32, indexBitWidth uint8) {
|
||||
var definedValues []interface{}
|
||||
var indices []int32
|
||||
|
||||
valueIndexMap := make(map[interface{}]int32)
|
||||
j := 0
|
||||
for i := 0; i < len(values); i = j {
|
||||
for j = i; j < len(values); j++ {
|
||||
value := values[j]
|
||||
if value == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
index, found := valueIndexMap[value]
|
||||
if !found {
|
||||
index = int32(len(definedValues))
|
||||
definedValues = append(definedValues, value)
|
||||
valueIndexMap[value] = index
|
||||
}
|
||||
|
||||
indices = append(indices, index)
|
||||
}
|
||||
}
|
||||
|
||||
indexBitWidth = uint8(common.BitWidth(uint64(indices[len(indices)-1])))
|
||||
|
||||
dictPageData = PlainEncode(common.ToSliceValue(definedValues, parquetType), parquetType)
|
||||
dataPageData = RLEBitPackedHybridEncode(indices, int32(indexBitWidth), parquet.Type_INT32)
|
||||
|
||||
return dictPageData, dataPageData, int32(len(definedValues)), indexBitWidth
|
||||
}
|
@ -1,36 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
)
|
||||
|
||||
// uint32ToBytes returns v as a freshly allocated four-byte little-endian slice.
func uint32ToBytes(v uint32) []byte {
	encoded := make([]byte, 4)
	order := binary.LittleEndian
	order.PutUint32(encoded, v)

	return encoded
}
|
||||
|
||||
// bytesToUint32 decodes the first four bytes of buf as a little-endian uint32.
func bytesToUint32(buf []byte) uint32 {
	value := binary.LittleEndian.Uint32(buf)
	return value
}
|
||||
|
||||
// bytesToUint64 decodes the first eight bytes of buf as a little-endian uint64.
func bytesToUint64(buf []byte) uint64 {
	value := binary.LittleEndian.Uint64(buf)
	return value
}
|
Binary file not shown.
@ -1,20 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
// GoUnusedProtection__ is an exported package-level int left at its zero value.
// NOTE(review): presumably a placeholder emitted by the Thrift Go generator —
// confirm nothing depends on it before removing.
var GoUnusedProtection__ int
|
@ -1,33 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
)
|
||||
|
||||
// (needed to ensure safety because of naive import list construction.)
|
||||
var _ = thrift.ZERO
|
||||
var _ = fmt.Printf
|
||||
var _ = bytes.Equal
|
||||
|
||||
func init() {
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,10 +0,0 @@
|
||||
#!/bin/bash
#
# Downloads a pinned revision of parquet.thrift from apache/parquet-format,
# generates Go code from it with the thrift compiler, and runs gofmt over the
# generated gen-go/parquet directory.
#

# Stop at the first failing command.
set -e

thrift_file=parquet.thrift

# Remove any stale copy so wget saves the download under the plain file name.
rm -f "${thrift_file}"
wget -q https://github.com/apache/parquet-format/raw/df6132b94f273521a418a74442085fdd5a0aa009/src/main/thrift/parquet.thrift
thrift --gen go "${thrift_file}"
gofmt -w -s gen-go/parquet
|
@ -1,824 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// getBitWidth - returns the number of bits required to represent num, e.g.
//
//	num | width
//	----|------
//	  0 |   0
//	  1 |   1
//	  2 |   2
//	  3 |   2
//	  4 |   3
//	  5 |   3
//	... | ...
func getBitWidth(num uint64) (width uint64) {
	// Shift num right until nothing is left, counting the shifts.
	for num > 0 {
		num >>= 1
		width++
	}

	return width
}
|
||||
|
||||
// getMaxDefLevel - get maximum definition level.
|
||||
func getMaxDefLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
|
||||
for i := 1; i <= len(path); i++ {
|
||||
name := strings.Join(path[:i], ".")
|
||||
if index, ok := nameIndexMap[name]; ok {
|
||||
if schemaElements[index].GetRepetitionType() != parquet.FieldRepetitionType_REQUIRED {
|
||||
v++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return v
|
||||
}
|
||||
|
||||
// getMaxRepLevel - get maximum repetition level.
|
||||
func getMaxRepLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
|
||||
for i := 1; i <= len(path); i++ {
|
||||
name := strings.Join(path[:i], ".")
|
||||
if index, ok := nameIndexMap[name]; ok {
|
||||
if schemaElements[index].GetRepetitionType() == parquet.FieldRepetitionType_REPEATED {
|
||||
v++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return v
|
||||
}
|
||||
|
||||
func readPageHeader(reader *thrift.TBufferedTransport) (*parquet.PageHeader, error) {
|
||||
pageHeader := parquet.NewPageHeader()
|
||||
if err := pageHeader.Read(thrift.NewTCompactProtocol(reader)); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return pageHeader, nil
|
||||
}
|
||||
|
||||
func readPage(
|
||||
thriftReader *thrift.TBufferedTransport,
|
||||
metadata *parquet.ColumnMetaData,
|
||||
columnNameIndexMap map[string]int,
|
||||
schemaElements []*parquet.SchemaElement,
|
||||
) (page *page, definitionLevels, numRows int64, err error) {
|
||||
|
||||
pageHeader, err := readPageHeader(thriftReader)
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
|
||||
read := func() (data []byte, err error) {
|
||||
var repLevelsLen, defLevelsLen int32
|
||||
var repLevelsBuf, defLevelsBuf []byte
|
||||
|
||||
if pageHeader.GetType() == parquet.PageType_DATA_PAGE_V2 {
|
||||
if pageHeader.DataPageHeaderV2 == nil {
|
||||
return nil, errors.New("parquet: Header not set")
|
||||
}
|
||||
repLevelsLen = pageHeader.DataPageHeaderV2.GetRepetitionLevelsByteLength()
|
||||
repLevelsBuf = make([]byte, repLevelsLen)
|
||||
|
||||
n, err := io.ReadFull(thriftReader, repLevelsBuf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != int(repLevelsLen) {
|
||||
return nil, fmt.Errorf("expected parquet header repetition levels %d, got %d", repLevelsLen, n)
|
||||
}
|
||||
|
||||
defLevelsLen = pageHeader.DataPageHeaderV2.GetDefinitionLevelsByteLength()
|
||||
defLevelsBuf = make([]byte, defLevelsLen)
|
||||
|
||||
n, err = io.ReadFull(thriftReader, defLevelsBuf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != int(defLevelsLen) {
|
||||
return nil, fmt.Errorf("expected parquet header definition levels %d, got %d", defLevelsLen, n)
|
||||
}
|
||||
}
|
||||
dbLen := pageHeader.GetCompressedPageSize() - repLevelsLen - defLevelsLen
|
||||
if dbLen < 0 {
|
||||
return nil, errors.New("parquet: negative data length")
|
||||
}
|
||||
|
||||
dataBuf := make([]byte, dbLen)
|
||||
n, err := io.ReadFull(thriftReader, dataBuf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n != int(dbLen) {
|
||||
return nil, fmt.Errorf("expected parquet data buffer %d, got %d", dbLen, n)
|
||||
}
|
||||
|
||||
if dataBuf, err = compressionCodec(metadata.GetCodec()).uncompress(dataBuf); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if repLevelsLen == 0 && defLevelsLen == 0 {
|
||||
return dataBuf, nil
|
||||
}
|
||||
|
||||
if repLevelsLen > 0 {
|
||||
data = append(data, uint32ToBytes(uint32(repLevelsLen))...)
|
||||
data = append(data, repLevelsBuf...)
|
||||
}
|
||||
|
||||
if defLevelsLen > 0 {
|
||||
data = append(data, uint32ToBytes(uint32(defLevelsLen))...)
|
||||
data = append(data, defLevelsBuf...)
|
||||
}
|
||||
|
||||
data = append(data, dataBuf...)
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
buf, err := read()
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
if metadata == nil {
|
||||
return nil, 0, 0, errors.New("parquet: metadata not set")
|
||||
}
|
||||
path := append([]string{}, metadata.GetPathInSchema()...)
|
||||
|
||||
bytesReader := bytes.NewReader(buf)
|
||||
pageType := pageHeader.GetType()
|
||||
switch pageType {
|
||||
case parquet.PageType_INDEX_PAGE:
|
||||
return nil, 0, 0, fmt.Errorf("page type %v is not supported", parquet.PageType_INDEX_PAGE)
|
||||
|
||||
case parquet.PageType_DICTIONARY_PAGE:
|
||||
page = newDictPage()
|
||||
page.Header = pageHeader
|
||||
table := new(table)
|
||||
table.Path = path
|
||||
if pageHeader.DictionaryPageHeader == nil {
|
||||
return nil, 0, 0, errors.New("parquet: dictionary not set")
|
||||
}
|
||||
values, err := readValues(bytesReader, metadata.GetType(),
|
||||
uint64(pageHeader.DictionaryPageHeader.GetNumValues()), 0)
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
table.Values = getTableValues(values, metadata.GetType())
|
||||
page.DataTable = table
|
||||
|
||||
return page, 0, 0, nil
|
||||
|
||||
case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
|
||||
name := strings.Join(path, ".")
|
||||
|
||||
page = newDataPage()
|
||||
page.Header = pageHeader
|
||||
|
||||
maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, path)
|
||||
maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, path)
|
||||
|
||||
var numValues uint64
|
||||
var encodingType parquet.Encoding
|
||||
|
||||
if pageHeader.GetType() == parquet.PageType_DATA_PAGE {
|
||||
if pageHeader.DataPageHeader == nil {
|
||||
return nil, 0, 0, errors.New("parquet: Header not set")
|
||||
}
|
||||
numValues = uint64(pageHeader.DataPageHeader.GetNumValues())
|
||||
encodingType = pageHeader.DataPageHeader.GetEncoding()
|
||||
} else {
|
||||
if pageHeader.DataPageHeaderV2 == nil {
|
||||
return nil, 0, 0, errors.New("parquet: Header not set")
|
||||
}
|
||||
numValues = uint64(pageHeader.DataPageHeaderV2.GetNumValues())
|
||||
encodingType = pageHeader.DataPageHeaderV2.GetEncoding()
|
||||
}
|
||||
|
||||
var repetitionLevels []int64
|
||||
if maxRepetitionLevel > 0 {
|
||||
values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
|
||||
-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
|
||||
if repetitionLevels = values.([]int64); len(repetitionLevels) > int(numValues) && int(numValues) >= 0 {
|
||||
repetitionLevels = repetitionLevels[:numValues]
|
||||
}
|
||||
} else {
|
||||
if numValues > math.MaxInt64/8 {
|
||||
return nil, 0, 0, errors.New("parquet: numvalues too large")
|
||||
}
|
||||
repetitionLevels = make([]int64, numValues)
|
||||
}
|
||||
|
||||
var definitionLevels []int64
|
||||
if maxDefinitionLevel > 0 {
|
||||
values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
|
||||
-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
if numValues > math.MaxInt64/8 {
|
||||
return nil, 0, 0, errors.New("parquet: numvalues too large")
|
||||
}
|
||||
if definitionLevels = values.([]int64); len(definitionLevels) > int(numValues) {
|
||||
definitionLevels = definitionLevels[:numValues]
|
||||
}
|
||||
} else {
|
||||
if numValues > math.MaxInt64/8 {
|
||||
return nil, 0, 0, errors.New("parquet: numvalues too large")
|
||||
}
|
||||
definitionLevels = make([]int64, numValues)
|
||||
}
|
||||
|
||||
var numNulls uint64
|
||||
for i := 0; i < len(definitionLevels); i++ {
|
||||
if definitionLevels[i] != int64(maxDefinitionLevel) {
|
||||
numNulls++
|
||||
}
|
||||
}
|
||||
|
||||
var convertedType parquet.ConvertedType = -1
|
||||
if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
|
||||
convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
|
||||
}
|
||||
values, valueType, err := readDataPageValues(bytesReader, encodingType, metadata.GetType(),
|
||||
convertedType, uint64(len(definitionLevels))-numNulls,
|
||||
uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
tableValues := getTableValues(values, valueType)
|
||||
|
||||
table := new(table)
|
||||
table.Path = path
|
||||
table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
|
||||
table.MaxRepetitionLevel = int32(maxRepetitionLevel)
|
||||
table.MaxDefinitionLevel = int32(maxDefinitionLevel)
|
||||
table.Values = make([]interface{}, len(definitionLevels))
|
||||
table.RepetitionLevels = make([]int32, len(definitionLevels))
|
||||
table.DefinitionLevels = make([]int32, len(definitionLevels))
|
||||
|
||||
j := 0
|
||||
numRows := int64(0)
|
||||
for i := 0; i < len(definitionLevels); i++ {
|
||||
table.RepetitionLevels[i] = int32(repetitionLevels[i])
|
||||
table.DefinitionLevels[i] = int32(definitionLevels[i])
|
||||
if int(table.DefinitionLevels[i]) == maxDefinitionLevel {
|
||||
table.Values[i] = tableValues[j]
|
||||
j++
|
||||
}
|
||||
if table.RepetitionLevels[i] == 0 {
|
||||
numRows++
|
||||
}
|
||||
}
|
||||
page.DataTable = table
|
||||
|
||||
return page, int64(len(definitionLevels)), numRows, nil
|
||||
}
|
||||
|
||||
return nil, 0, 0, fmt.Errorf("unknown page type %v", pageType)
|
||||
}
|
||||
|
||||
type page struct {
|
||||
Header *parquet.PageHeader // Header of a page
|
||||
DataTable *table // Table to store values
|
||||
RawData []byte // Compressed data of the page, which is written in parquet file
|
||||
CompressType parquet.CompressionCodec // Compress type: gzip/snappy/none
|
||||
DataType parquet.Type // Parquet type of the values in the page
|
||||
Path []string // Path in schema(include the root)
|
||||
MaxVal interface{} // Maximum of the values
|
||||
MinVal interface{} // Minimum of the values
|
||||
PageSize int32
|
||||
}
|
||||
|
||||
func newPage() *page {
|
||||
return &page{
|
||||
Header: parquet.NewPageHeader(),
|
||||
PageSize: defaultPageSize,
|
||||
}
|
||||
}
|
||||
|
||||
func newDictPage() *page {
|
||||
page := newPage()
|
||||
page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
|
||||
return page
|
||||
}
|
||||
|
||||
func newDataPage() *page {
|
||||
page := newPage()
|
||||
page.Header.DataPageHeader = parquet.NewDataPageHeader()
|
||||
return page
|
||||
}
|
||||
|
||||
func (page *page) decode(dictPage *page) {
|
||||
if dictPage == nil || page == nil || page.Header.DataPageHeader == nil ||
|
||||
(page.Header.DataPageHeader.Encoding != parquet.Encoding_RLE_DICTIONARY &&
|
||||
page.Header.DataPageHeader.Encoding != parquet.Encoding_PLAIN_DICTIONARY) {
|
||||
return
|
||||
}
|
||||
|
||||
for i := 0; i < len(page.DataTable.Values); i++ {
|
||||
if page.DataTable.Values[i] != nil {
|
||||
index, ok := page.DataTable.Values[i].(int64)
|
||||
if !ok || int(index) >= len(dictPage.DataTable.Values) {
|
||||
return
|
||||
}
|
||||
page.DataTable.Values[i] = dictPage.DataTable.Values[index]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get RepetitionLevels and Definitions from RawData
|
||||
func (page *page) getRLDLFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (numValues int64, numRows int64, err error) {
|
||||
bytesReader := bytes.NewReader(page.RawData)
|
||||
|
||||
pageType := page.Header.GetType()
|
||||
|
||||
var buf []byte
|
||||
if pageType == parquet.PageType_DATA_PAGE_V2 {
|
||||
var repLevelsLen, defLevelsLen int32
|
||||
var repLevelsBuf, defLevelsBuf []byte
|
||||
if page.Header.DataPageHeaderV2 == nil {
|
||||
return 0, 0, errors.New("parquet: Header not set")
|
||||
}
|
||||
repLevelsLen = page.Header.DataPageHeaderV2.GetRepetitionLevelsByteLength()
|
||||
repLevelsBuf = make([]byte, repLevelsLen)
|
||||
if _, err = bytesReader.Read(repLevelsBuf); err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
|
||||
defLevelsLen = page.Header.DataPageHeaderV2.GetDefinitionLevelsByteLength()
|
||||
defLevelsBuf = make([]byte, defLevelsLen)
|
||||
if _, err = bytesReader.Read(defLevelsBuf); err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
|
||||
dataBuf := make([]byte, len(page.RawData)-int(repLevelsLen)-int(defLevelsLen))
|
||||
if _, err = bytesReader.Read(dataBuf); err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
|
||||
if repLevelsLen == 0 && defLevelsLen == 0 {
|
||||
buf = dataBuf
|
||||
} else {
|
||||
if repLevelsLen > 0 {
|
||||
buf = append(buf, uint32ToBytes(uint32(repLevelsLen))...)
|
||||
buf = append(buf, repLevelsBuf...)
|
||||
}
|
||||
|
||||
if defLevelsLen > 0 {
|
||||
buf = append(buf, uint32ToBytes(uint32(defLevelsLen))...)
|
||||
buf = append(buf, defLevelsBuf...)
|
||||
}
|
||||
|
||||
buf = append(buf, dataBuf...)
|
||||
}
|
||||
} else {
|
||||
if buf, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
bytesReader = bytes.NewReader(buf)
|
||||
|
||||
switch pageType {
|
||||
case parquet.PageType_DICTIONARY_PAGE:
|
||||
table := new(table)
|
||||
table.Path = page.Path
|
||||
page.DataTable = table
|
||||
return 0, 0, nil
|
||||
|
||||
case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
|
||||
var numValues uint64
|
||||
if pageType == parquet.PageType_DATA_PAGE {
|
||||
if page.Header.DataPageHeader == nil {
|
||||
return 0, 0, errors.New("parquet: Header not set")
|
||||
}
|
||||
numValues = uint64(page.Header.DataPageHeader.GetNumValues())
|
||||
} else {
|
||||
if page.Header.DataPageHeaderV2 == nil {
|
||||
return 0, 0, errors.New("parquet: Header not set")
|
||||
}
|
||||
numValues = uint64(page.Header.DataPageHeaderV2.GetNumValues())
|
||||
}
|
||||
|
||||
maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, page.Path)
|
||||
maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, page.Path)
|
||||
|
||||
var repetitionLevels []int64
|
||||
if maxRepetitionLevel > 0 {
|
||||
values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
|
||||
-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
|
||||
if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues {
|
||||
repetitionLevels = repetitionLevels[:numValues]
|
||||
}
|
||||
} else {
|
||||
repetitionLevels = make([]int64, numValues)
|
||||
}
|
||||
|
||||
var definitionLevels []int64
|
||||
if maxDefinitionLevel > 0 {
|
||||
values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
|
||||
-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues {
|
||||
definitionLevels = definitionLevels[:numValues]
|
||||
}
|
||||
} else {
|
||||
definitionLevels = make([]int64, numValues)
|
||||
}
|
||||
|
||||
table := new(table)
|
||||
table.Path = page.Path
|
||||
name := strings.Join(page.Path, ".")
|
||||
table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
|
||||
table.MaxRepetitionLevel = int32(maxRepetitionLevel)
|
||||
table.MaxDefinitionLevel = int32(maxDefinitionLevel)
|
||||
table.Values = make([]interface{}, len(definitionLevels))
|
||||
table.RepetitionLevels = make([]int32, len(definitionLevels))
|
||||
table.DefinitionLevels = make([]int32, len(definitionLevels))
|
||||
|
||||
numRows := int64(0)
|
||||
for i := 0; i < len(definitionLevels); i++ {
|
||||
table.RepetitionLevels[i] = int32(repetitionLevels[i])
|
||||
table.DefinitionLevels[i] = int32(definitionLevels[i])
|
||||
if table.RepetitionLevels[i] == 0 {
|
||||
numRows++
|
||||
}
|
||||
}
|
||||
page.DataTable = table
|
||||
page.RawData = buf[len(buf)-bytesReader.Len():]
|
||||
|
||||
return int64(numValues), numRows, nil
|
||||
}
|
||||
|
||||
return 0, 0, fmt.Errorf("Unsupported page type %v", pageType)
|
||||
}
|
||||
|
||||
func (page *page) getValueFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (err error) {
|
||||
pageType := page.Header.GetType()
|
||||
switch pageType {
|
||||
case parquet.PageType_DICTIONARY_PAGE:
|
||||
bytesReader := bytes.NewReader(page.RawData)
|
||||
var values interface{}
|
||||
if page.Header.DictionaryPageHeader == nil {
|
||||
return errors.New("parquet: dictionary not set")
|
||||
}
|
||||
values, err = readValues(bytesReader, page.DataType,
|
||||
uint64(page.Header.DictionaryPageHeader.GetNumValues()), 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
page.DataTable.Values = getTableValues(values, page.DataType)
|
||||
return nil
|
||||
|
||||
case parquet.PageType_DATA_PAGE_V2:
|
||||
if page.RawData, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
|
||||
return err
|
||||
}
|
||||
fallthrough
|
||||
case parquet.PageType_DATA_PAGE:
|
||||
encodingType := page.Header.DataPageHeader.GetEncoding()
|
||||
bytesReader := bytes.NewReader(page.RawData)
|
||||
|
||||
var numNulls uint64
|
||||
for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
|
||||
if page.DataTable.DefinitionLevels[i] != page.DataTable.MaxDefinitionLevel {
|
||||
numNulls++
|
||||
}
|
||||
}
|
||||
|
||||
name := strings.Join(page.DataTable.Path, ".")
|
||||
var convertedType parquet.ConvertedType = -1
|
||||
|
||||
if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
|
||||
convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
|
||||
}
|
||||
|
||||
values, _, err := readDataPageValues(bytesReader, encodingType, page.DataType,
|
||||
convertedType, uint64(len(page.DataTable.DefinitionLevels))-numNulls,
|
||||
uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tableValues := getTableValues(values, page.DataType)
|
||||
|
||||
j := 0
|
||||
for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
|
||||
if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
|
||||
page.DataTable.Values[i] = tableValues[j]
|
||||
j++
|
||||
}
|
||||
}
|
||||
|
||||
page.RawData = []byte{}
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("unsupported page type %v", pageType)
|
||||
}
|
||||
|
||||
func (page *page) toDataPage(compressType parquet.CompressionCodec) []byte {
|
||||
values := []interface{}{}
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
|
||||
values = append(values, page.DataTable.Values[i])
|
||||
}
|
||||
}
|
||||
valuesBytes := encodeValues(interfacesToValues(values, page.DataTable.Type), page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)
|
||||
|
||||
var defLevelBytes []byte
|
||||
if page.DataTable.MaxDefinitionLevel > 0 {
|
||||
defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
|
||||
}
|
||||
defLevelBytes = valuesToRLEBitPackedHybridBytes(
|
||||
defLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
var repLevelBytes []byte
|
||||
if page.DataTable.MaxRepetitionLevel > 0 {
|
||||
repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
|
||||
}
|
||||
repLevelBytes = valuesToRLEBitPackedHybridBytes(
|
||||
repLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
data := repLevelBytes
|
||||
data = append(data, defLevelBytes...)
|
||||
data = append(data, valuesBytes...)
|
||||
|
||||
compressedData, err := compressionCodec(compressType).compress(data)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.Header = parquet.NewPageHeader()
|
||||
page.Header.Type = parquet.PageType_DATA_PAGE
|
||||
page.Header.CompressedPageSize = int32(len(compressedData))
|
||||
page.Header.UncompressedPageSize = int32(len(data))
|
||||
page.Header.DataPageHeader = parquet.NewDataPageHeader()
|
||||
page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
|
||||
page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
|
||||
page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
|
||||
page.Header.DataPageHeader.Encoding = page.DataTable.Encoding
|
||||
page.Header.DataPageHeader.Statistics = parquet.NewStatistics()
|
||||
if page.MaxVal != nil {
|
||||
tmpBuf := valueToBytes(page.MaxVal, page.DataType)
|
||||
if page.DataType == parquet.Type_BYTE_ARRAY {
|
||||
switch page.DataTable.ConvertedType {
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
|
||||
tmpBuf = tmpBuf[4:]
|
||||
}
|
||||
}
|
||||
page.Header.DataPageHeader.Statistics.Max = tmpBuf
|
||||
}
|
||||
if page.MinVal != nil {
|
||||
tmpBuf := valueToBytes(page.MinVal, page.DataType)
|
||||
if page.DataType == parquet.Type_BYTE_ARRAY {
|
||||
switch page.DataTable.ConvertedType {
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
|
||||
tmpBuf = tmpBuf[4:]
|
||||
}
|
||||
}
|
||||
page.Header.DataPageHeader.Statistics.Min = tmpBuf
|
||||
}
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.RawData = append(pageHeaderBytes, compressedData...)
|
||||
return page.RawData
|
||||
}
|
||||
|
||||
func (page *page) toDataPageV2(compressType parquet.CompressionCodec) []byte {
|
||||
values := []interface{}{}
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
|
||||
values = append(values, page.DataTable.Values[i])
|
||||
}
|
||||
}
|
||||
valuesBytes := encodeValues(values, page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)
|
||||
|
||||
var defLevelBytes []byte
|
||||
if page.DataTable.MaxDefinitionLevel > 0 {
|
||||
defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
|
||||
}
|
||||
defLevelBytes = valuesToRLEBytes(
|
||||
defLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
var repLevelBytes []byte
|
||||
numRows := int32(0)
|
||||
if page.DataTable.MaxRepetitionLevel > 0 {
|
||||
repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
|
||||
if page.DataTable.RepetitionLevels[i] == 0 {
|
||||
numRows++
|
||||
}
|
||||
}
|
||||
repLevelBytes = valuesToRLEBytes(
|
||||
repLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
compressedData, err := compressionCodec(compressType).compress(valuesBytes)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.Header = parquet.NewPageHeader()
|
||||
page.Header.Type = parquet.PageType_DATA_PAGE_V2
|
||||
page.Header.CompressedPageSize = int32(len(compressedData) + len(defLevelBytes) + len(repLevelBytes))
|
||||
page.Header.UncompressedPageSize = int32(len(valuesBytes) + len(defLevelBytes) + len(repLevelBytes))
|
||||
page.Header.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
|
||||
page.Header.DataPageHeaderV2.NumValues = int32(len(page.DataTable.Values))
|
||||
page.Header.DataPageHeaderV2.NumNulls = page.Header.DataPageHeaderV2.NumValues - int32(len(values))
|
||||
page.Header.DataPageHeaderV2.NumRows = numRows
|
||||
page.Header.DataPageHeaderV2.Encoding = page.DataTable.Encoding
|
||||
page.Header.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(defLevelBytes))
|
||||
page.Header.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(repLevelBytes))
|
||||
page.Header.DataPageHeaderV2.IsCompressed = true
|
||||
|
||||
page.Header.DataPageHeaderV2.Statistics = parquet.NewStatistics()
|
||||
if page.MaxVal != nil {
|
||||
tmpBuf := valueToBytes(page.MaxVal, page.DataType)
|
||||
if page.DataType == parquet.Type_BYTE_ARRAY {
|
||||
switch page.DataTable.ConvertedType {
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
|
||||
tmpBuf = tmpBuf[4:]
|
||||
}
|
||||
}
|
||||
page.Header.DataPageHeaderV2.Statistics.Max = tmpBuf
|
||||
}
|
||||
if page.MinVal != nil {
|
||||
tmpBuf := valueToBytes(page.MinVal, page.DataType)
|
||||
if page.DataType == parquet.Type_BYTE_ARRAY {
|
||||
switch page.DataTable.ConvertedType {
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
|
||||
tmpBuf = tmpBuf[4:]
|
||||
}
|
||||
}
|
||||
page.Header.DataPageHeaderV2.Statistics.Min = tmpBuf
|
||||
}
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.RawData = append(pageHeaderBytes, repLevelBytes...)
|
||||
page.RawData = append(page.RawData, defLevelBytes...)
|
||||
page.RawData = append(page.RawData, compressedData...)
|
||||
|
||||
return page.RawData
|
||||
}
|
||||
|
||||
func (page *page) toDictPage(compressType parquet.CompressionCodec, dataType parquet.Type) []byte {
|
||||
valuesBytes := valuesToBytes(page.DataTable.Values, dataType)
|
||||
compressedData, err := compressionCodec(compressType).compress(valuesBytes)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.Header = parquet.NewPageHeader()
|
||||
page.Header.Type = parquet.PageType_DICTIONARY_PAGE
|
||||
page.Header.CompressedPageSize = int32(len(compressedData))
|
||||
page.Header.UncompressedPageSize = int32(len(valuesBytes))
|
||||
page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
|
||||
page.Header.DictionaryPageHeader.NumValues = int32(len(page.DataTable.Values))
|
||||
page.Header.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.RawData = append(pageHeaderBytes, compressedData...)
|
||||
return page.RawData
|
||||
}
|
||||
|
||||
func (page *page) toDictDataPage(compressType parquet.CompressionCodec, bitWidth int32) []byte {
|
||||
valuesBytes := append([]byte{byte(bitWidth)}, valuesToRLEBytes(page.DataTable.Values, bitWidth, parquet.Type_INT32)...)
|
||||
|
||||
var defLevelBytes []byte
|
||||
if page.DataTable.MaxDefinitionLevel > 0 {
|
||||
defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
|
||||
}
|
||||
defLevelBytes = valuesToRLEBitPackedHybridBytes(
|
||||
defLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
var repLevelBytes []byte
|
||||
if page.DataTable.MaxRepetitionLevel > 0 {
|
||||
repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
|
||||
for i := range page.DataTable.DefinitionLevels {
|
||||
repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
|
||||
}
|
||||
repLevelBytes = valuesToRLEBitPackedHybridBytes(
|
||||
repLevels,
|
||||
int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
}
|
||||
|
||||
data := append(repLevelBytes, defLevelBytes...)
|
||||
data = append(data, valuesBytes...)
|
||||
|
||||
compressedData, err := compressionCodec(compressType).compress(data)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.Header = parquet.NewPageHeader()
|
||||
page.Header.Type = parquet.PageType_DATA_PAGE
|
||||
page.Header.CompressedPageSize = int32(len(compressedData))
|
||||
page.Header.UncompressedPageSize = int32(len(data))
|
||||
page.Header.DataPageHeader = parquet.NewDataPageHeader()
|
||||
page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
|
||||
page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
|
||||
page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
|
||||
page.Header.DataPageHeader.Encoding = parquet.Encoding_PLAIN_DICTIONARY
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
page.RawData = append(pageHeaderBytes, compressedData...)
|
||||
return page.RawData
|
||||
}
|
@ -1,881 +0,0 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* File format description for the parquet file format
|
||||
*/
|
||||
namespace cpp parquet
|
||||
namespace java org.apache.parquet.format
|
||||
|
||||
/**
|
||||
* Types supported by Parquet. These types are intended to be used in combination
|
||||
* with the encodings to control the on disk storage format.
|
||||
* For example INT16 is not included as a type since a good encoding of INT32
|
||||
* would handle this.
|
||||
*/
|
||||
enum Type {
|
||||
BOOLEAN = 0;
|
||||
INT32 = 1;
|
||||
INT64 = 2;
|
||||
INT96 = 3; // deprecated, only used by legacy implementations.
|
||||
FLOAT = 4;
|
||||
DOUBLE = 5;
|
||||
BYTE_ARRAY = 6;
|
||||
FIXED_LEN_BYTE_ARRAY = 7;
|
||||
}
|
||||
|
||||
/**
|
||||
* Common types used by frameworks(e.g. hive, pig) using parquet. This helps map
|
||||
* between types in those frameworks to the base types in parquet. This is only
|
||||
* metadata and not needed to read or write the data.
|
||||
*/
|
||||
enum ConvertedType {
|
||||
/** a BYTE_ARRAY actually contains UTF8 encoded chars */
|
||||
UTF8 = 0;
|
||||
|
||||
/** a map is converted as an optional field containing a repeated key/value pair */
|
||||
MAP = 1;
|
||||
|
||||
/** a key/value pair is converted into a group of two fields */
|
||||
MAP_KEY_VALUE = 2;
|
||||
|
||||
/** a list is converted into an optional field containing a repeated field for its
|
||||
* values */
|
||||
LIST = 3;
|
||||
|
||||
/** an enum is converted into a binary field */
|
||||
ENUM = 4;
|
||||
|
||||
/**
|
||||
* A decimal value.
|
||||
*
|
||||
* This may be used to annotate binary or fixed primitive types. The
|
||||
* underlying byte array stores the unscaled value encoded as two's
|
||||
* complement using big-endian byte order (the most significant byte is the
|
||||
* zeroth element). The value of the decimal is the value * 10^{-scale}.
|
||||
*
|
||||
* This must be accompanied by a (maximum) precision and a scale in the
|
||||
* SchemaElement. The precision specifies the number of digits in the decimal
|
||||
* and the scale stores the location of the decimal point. For example 1.23
|
||||
* would have precision 3 (3 total digits) and scale 2 (the decimal point is
|
||||
* 2 digits over).
|
||||
*/
|
||||
DECIMAL = 5;
|
||||
|
||||
/**
|
||||
* A Date
|
||||
*
|
||||
* Stored as days since Unix epoch, encoded as the INT32 physical type.
|
||||
*
|
||||
*/
|
||||
DATE = 6;
|
||||
|
||||
/**
|
||||
* A time
|
||||
*
|
||||
* The total number of milliseconds since midnight. The value is stored
|
||||
* as an INT32 physical type.
|
||||
*/
|
||||
TIME_MILLIS = 7;
|
||||
|
||||
/**
|
||||
* A time.
|
||||
*
|
||||
* The total number of microseconds since midnight. The value is stored as
|
||||
* an INT64 physical type.
|
||||
*/
|
||||
TIME_MICROS = 8;
|
||||
|
||||
/**
|
||||
* A date/time combination
|
||||
*
|
||||
* Date and time recorded as milliseconds since the Unix epoch. Recorded as
|
||||
* a physical type of INT64.
|
||||
*/
|
||||
TIMESTAMP_MILLIS = 9;
|
||||
|
||||
/**
|
||||
* A date/time combination
|
||||
*
|
||||
* Date and time recorded as microseconds since the Unix epoch. The value is
|
||||
* stored as an INT64 physical type.
|
||||
*/
|
||||
TIMESTAMP_MICROS = 10;
|
||||
|
||||
|
||||
/**
|
||||
* An unsigned integer value.
|
||||
*
|
||||
* The number describes the maximum number of meaningful data bits in
|
||||
* the stored value. 8, 16 and 32 bit values are stored using the
|
||||
* INT32 physical type. 64 bit values are stored using the INT64
|
||||
* physical type.
|
||||
*
|
||||
*/
|
||||
UINT_8 = 11;
|
||||
UINT_16 = 12;
|
||||
UINT_32 = 13;
|
||||
UINT_64 = 14;
|
||||
|
||||
/**
|
||||
* A signed integer value.
|
||||
*
|
||||
* The number describes the maximum number of meaningful data bits in
|
||||
* the stored value. 8, 16 and 32 bit values are stored using the
|
||||
* INT32 physical type. 64 bit values are stored using the INT64
|
||||
* physical type.
|
||||
*
|
||||
*/
|
||||
INT_8 = 15;
|
||||
INT_16 = 16;
|
||||
INT_32 = 17;
|
||||
INT_64 = 18;
|
||||
|
||||
/**
|
||||
* An embedded JSON document
|
||||
*
|
||||
* A JSON document embedded within a single UTF8 column.
|
||||
*/
|
||||
JSON = 19;
|
||||
|
||||
/**
|
||||
* An embedded BSON document
|
||||
*
|
||||
* A BSON document embedded within a single BINARY column.
|
||||
*/
|
||||
BSON = 20;
|
||||
|
||||
/**
|
||||
* An interval of time
|
||||
*
|
||||
* This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12
|
||||
* This data is composed of three separate little endian unsigned
|
||||
* integers. Each stores a component of a duration of time. The first
|
||||
* integer identifies the number of months associated with the duration,
|
||||
* the second identifies the number of days associated with the duration
|
||||
* and the third identifies the number of milliseconds associated with
|
||||
* the provided duration. This duration of time is independent of any
|
||||
* particular timezone or date.
|
||||
*/
|
||||
INTERVAL = 21;
|
||||
}
|
||||
|
||||
/**
|
||||
* Representation of Schemas
|
||||
*/
|
||||
enum FieldRepetitionType {
|
||||
/** This field is required (can not be null) and each record has exactly 1 value. */
|
||||
REQUIRED = 0;
|
||||
|
||||
/** The field is optional (can be null) and each record has 0 or 1 values. */
|
||||
OPTIONAL = 1;
|
||||
|
||||
/** The field is repeated and can contain 0 or more values */
|
||||
REPEATED = 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Statistics per row group and per page
|
||||
* All fields are optional.
|
||||
*/
|
||||
struct Statistics {
|
||||
/**
|
||||
* DEPRECATED: min and max value of the column. Use min_value and max_value.
|
||||
*
|
||||
* Values are encoded using PLAIN encoding, except that variable-length byte
|
||||
* arrays do not include a length prefix.
|
||||
*
|
||||
* These fields encode min and max values determined by signed comparison
|
||||
* only. New files should use the correct order for a column's logical type
|
||||
* and store the values in the min_value and max_value fields.
|
||||
*
|
||||
* To support older readers, these may be set when the column order is
|
||||
* signed.
|
||||
*/
|
||||
1: optional binary max;
|
||||
2: optional binary min;
|
||||
/** count of null value in the column */
|
||||
3: optional i64 null_count;
|
||||
/** count of distinct values occurring */
|
||||
4: optional i64 distinct_count;
|
||||
/**
|
||||
* Min and max values for the column, determined by its ColumnOrder.
|
||||
*
|
||||
* Values are encoded using PLAIN encoding, except that variable-length byte
|
||||
* arrays do not include a length prefix.
|
||||
*/
|
||||
5: optional binary max_value;
|
||||
6: optional binary min_value;
|
||||
}
|
||||
|
||||
/** Empty structs to use as logical type annotations */
|
||||
struct StringType {} // allowed for BINARY, must be encoded with UTF-8
|
||||
struct UUIDType {} // allowed for FIXED[16], must encode raw UUID bytes
|
||||
struct MapType {} // see LogicalTypes.md
|
||||
struct ListType {} // see LogicalTypes.md
|
||||
struct EnumType {} // allowed for BINARY, must be encoded with UTF-8
|
||||
struct DateType {} // allowed for INT32
|
||||
|
||||
/**
|
||||
* Logical type to annotate a column that is always null.
|
||||
*
|
||||
* Sometimes when discovering the schema of existing data, values are always
|
||||
* null and the physical type can't be determined. This annotation signals
|
||||
* the case where the physical type was guessed from all null values.
|
||||
*/
|
||||
struct NullType {} // allowed for any physical type, only null values stored
|
||||
|
||||
/**
|
||||
* Decimal logical type annotation
|
||||
*
|
||||
* To maintain forward-compatibility in v1, implementations using this logical
|
||||
* type must also set scale and precision on the annotated SchemaElement.
|
||||
*
|
||||
* Allowed for physical types: INT32, INT64, FIXED, and BINARY
|
||||
*/
|
||||
struct DecimalType {
|
||||
1: required i32 scale
|
||||
2: required i32 precision
|
||||
}
|
||||
|
||||
/** Time units for logical types */
|
||||
struct MilliSeconds {}
|
||||
struct MicroSeconds {}
|
||||
struct NanoSeconds {}
|
||||
union TimeUnit {
|
||||
1: MilliSeconds MILLIS
|
||||
2: MicroSeconds MICROS
|
||||
3: NanoSeconds NANOS
|
||||
}
|
||||
|
||||
/**
|
||||
* Timestamp logical type annotation
|
||||
*
|
||||
* Allowed for physical types: INT64
|
||||
*/
|
||||
struct TimestampType {
|
||||
1: required bool isAdjustedToUTC
|
||||
2: required TimeUnit unit
|
||||
}
|
||||
|
||||
/**
|
||||
* Time logical type annotation
|
||||
*
|
||||
* Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
|
||||
*/
|
||||
struct TimeType {
|
||||
1: required bool isAdjustedToUTC
|
||||
2: required TimeUnit unit
|
||||
}
|
||||
|
||||
/**
|
||||
* Integer logical type annotation
|
||||
*
|
||||
* bitWidth must be 8, 16, 32, or 64.
|
||||
*
|
||||
* Allowed for physical types: INT32, INT64
|
||||
*/
|
||||
struct IntType {
|
||||
1: required byte bitWidth
|
||||
2: required bool isSigned
|
||||
}
|
||||
|
||||
/**
|
||||
* Embedded JSON logical type annotation
|
||||
*
|
||||
* Allowed for physical types: BINARY
|
||||
*/
|
||||
struct JsonType {
|
||||
}
|
||||
|
||||
/**
|
||||
* Embedded BSON logical type annotation
|
||||
*
|
||||
* Allowed for physical types: BINARY
|
||||
*/
|
||||
struct BsonType {
|
||||
}
|
||||
|
||||
/**
|
||||
* LogicalType annotations to replace ConvertedType.
|
||||
*
|
||||
* To maintain compatibility, implementations using LogicalType for a
|
||||
* SchemaElement must also set the corresponding ConvertedType from the
|
||||
* following table.
|
||||
*/
|
||||
union LogicalType {
|
||||
1: StringType STRING // use ConvertedType UTF8
|
||||
2: MapType MAP // use ConvertedType MAP
|
||||
3: ListType LIST // use ConvertedType LIST
|
||||
4: EnumType ENUM // use ConvertedType ENUM
|
||||
5: DecimalType DECIMAL // use ConvertedType DECIMAL
|
||||
6: DateType DATE // use ConvertedType DATE
|
||||
7: TimeType TIME // use ConvertedType TIME_MICROS or TIME_MILLIS
|
||||
8: TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
|
||||
// 9: reserved for INTERVAL
|
||||
10: IntType INTEGER // use ConvertedType INT_* or UINT_*
|
||||
11: NullType UNKNOWN // no compatible ConvertedType
|
||||
12: JsonType JSON // use ConvertedType JSON
|
||||
13: BsonType BSON // use ConvertedType BSON
|
||||
14: UUIDType UUID
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents an element inside a schema definition.
|
||||
* - if it is a group (inner node) then type is undefined and num_children is defined
|
||||
* - if it is a primitive type (leaf) then type is defined and num_children is undefined
|
||||
* the nodes are listed in depth first traversal order.
|
||||
*/
|
||||
struct SchemaElement {
|
||||
/** Data type for this field. Not set if the current element is a non-leaf node */
|
||||
1: optional Type type;
|
||||
|
||||
/** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
|
||||
* Otherwise, if specified, this is the maximum bit length to store any of the values.
|
||||
* (e.g. a low cardinality INT col could have this set to 3). Note that this is
|
||||
* in the schema, and therefore fixed for the entire file.
|
||||
*/
|
||||
2: optional i32 type_length;
|
||||
|
||||
/** repetition of the field. The root of the schema does not have a repetition_type.
|
||||
* All other nodes must have one */
|
||||
3: optional FieldRepetitionType repetition_type;
|
||||
|
||||
/** Name of the field in the schema */
|
||||
4: required string name;
|
||||
|
||||
/** Nested fields. Since thrift does not support nested fields,
|
||||
* the nesting is flattened to a single list by a depth-first traversal.
|
||||
* The children count is used to construct the nested relationship.
|
||||
* This field is not set when the element is a primitive type
|
||||
*/
|
||||
5: optional i32 num_children;
|
||||
|
||||
/** When the schema is the result of a conversion from another model
|
||||
* Used to record the original type to help with cross conversion.
|
||||
*/
|
||||
6: optional ConvertedType converted_type;
|
||||
|
||||
/** Used when this column contains decimal data.
|
||||
* See the DECIMAL converted type for more details.
|
||||
*/
|
||||
7: optional i32 scale
|
||||
8: optional i32 precision
|
||||
|
||||
/** When the original schema supports field ids, this will save the
|
||||
* original field id in the parquet schema
|
||||
*/
|
||||
9: optional i32 field_id;
|
||||
|
||||
/**
|
||||
* The logical type of this SchemaElement
|
||||
*
|
||||
* LogicalType replaces ConvertedType, but ConvertedType is still required
|
||||
* for some logical types to ensure forward-compatibility in format v1.
|
||||
*/
|
||||
10: optional LogicalType logicalType
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodings supported by Parquet. Not all encodings are valid for all types. These
|
||||
* enums are also used to specify the encoding of definition and repetition levels.
|
||||
* See the accompanying doc for the details of the more complicated encodings.
|
||||
*/
|
||||
enum Encoding {
|
||||
/** Default encoding.
|
||||
* BOOLEAN - 1 bit per value. 0 is false; 1 is true.
|
||||
* INT32 - 4 bytes per value. Stored as little-endian.
|
||||
* INT64 - 8 bytes per value. Stored as little-endian.
|
||||
* FLOAT - 4 bytes per value. IEEE. Stored as little-endian.
|
||||
* DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.
|
||||
* BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
|
||||
* FIXED_LEN_BYTE_ARRAY - Just the bytes.
|
||||
*/
|
||||
PLAIN = 0;
|
||||
|
||||
/** Group VarInt encoding for INT32/INT64.
|
||||
* This encoding is deprecated. It was never used
|
||||
*/
|
||||
// GROUP_VAR_INT = 1;
|
||||
|
||||
/**
|
||||
* Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
|
||||
* plain type.
|
||||
* in a data page use RLE_DICTIONARY instead.
|
||||
* in a Dictionary page use PLAIN instead
|
||||
*/
|
||||
PLAIN_DICTIONARY = 2;
|
||||
|
||||
/** Group packed run length encoding. Usable for definition/repetition levels
|
||||
* encoding and Booleans (on one bit: 0 is false; 1 is true.)
|
||||
*/
|
||||
RLE = 3;
|
||||
|
||||
/** Bit packed encoding. This can only be used if the data has a known max
|
||||
* width. Usable for definition/repetition levels encoding.
|
||||
*/
|
||||
BIT_PACKED = 4;
|
||||
|
||||
/** Delta encoding for integers. This can be used for int columns and works best
|
||||
* on sorted data
|
||||
*/
|
||||
DELTA_BINARY_PACKED = 5;
|
||||
|
||||
/** Encoding for byte arrays to separate the length values and the data. The lengths
|
||||
* are encoded using DELTA_BINARY_PACKED
|
||||
*/
|
||||
DELTA_LENGTH_BYTE_ARRAY = 6;
|
||||
|
||||
/** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED.
|
||||
* Suffixes are stored as delta length byte arrays.
|
||||
*/
|
||||
DELTA_BYTE_ARRAY = 7;
|
||||
|
||||
/** Dictionary encoding: the ids are encoded using the RLE encoding
|
||||
*/
|
||||
RLE_DICTIONARY = 8;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported compression algorithms.
|
||||
*
|
||||
* Codecs added in 2.4 can be read by readers based on 2.4 and later.
|
||||
* Codec support may vary between readers based on the format version and
|
||||
* libraries available at runtime. Gzip, Snappy, and LZ4 codecs are
|
||||
* widely available, while Zstd and Brotli require additional libraries.
|
||||
*/
|
||||
enum CompressionCodec {
|
||||
UNCOMPRESSED = 0;
|
||||
SNAPPY = 1;
|
||||
GZIP = 2;
|
||||
LZO = 3;
|
||||
BROTLI = 4; // Added in 2.4
|
||||
LZ4 = 5; // Added in 2.4
|
||||
ZSTD = 6; // Added in 2.4
|
||||
}
|
||||
|
||||
enum PageType {
|
||||
DATA_PAGE = 0;
|
||||
INDEX_PAGE = 1;
|
||||
DICTIONARY_PAGE = 2;
|
||||
DATA_PAGE_V2 = 3;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enum to annotate whether lists of min/max elements inside ColumnIndex
|
||||
* are ordered and if so, in which direction.
|
||||
*/
|
||||
enum BoundaryOrder {
|
||||
UNORDERED = 0;
|
||||
ASCENDING = 1;
|
||||
DESCENDING = 2;
|
||||
}
|
||||
|
||||
/** Data page header */
|
||||
struct DataPageHeader {
|
||||
/** Number of values, including NULLs, in this data page. **/
|
||||
1: required i32 num_values
|
||||
|
||||
/** Encoding used for this data page **/
|
||||
2: required Encoding encoding
|
||||
|
||||
/** Encoding used for definition levels **/
|
||||
3: required Encoding definition_level_encoding;
|
||||
|
||||
/** Encoding used for repetition levels **/
|
||||
4: required Encoding repetition_level_encoding;
|
||||
|
||||
/** Optional statistics for the data in this page**/
|
||||
5: optional Statistics statistics;
|
||||
}
|
||||
|
||||
struct IndexPageHeader {
|
||||
/** TODO: **/
|
||||
}
|
||||
|
||||
struct DictionaryPageHeader {
|
||||
/** Number of values in the dictionary **/
|
||||
1: required i32 num_values;
|
||||
|
||||
/** Encoding using this dictionary page **/
|
||||
2: required Encoding encoding
|
||||
|
||||
/** If true, the entries in the dictionary are sorted in ascending order **/
|
||||
3: optional bool is_sorted;
|
||||
}
|
||||
|
||||
/**
|
||||
* New page format allowing reading levels without decompressing the data
|
||||
* Repetition and definition levels are uncompressed
|
||||
* The remaining section containing the data is compressed if is_compressed is true
|
||||
**/
|
||||
struct DataPageHeaderV2 {
|
||||
/** Number of values, including NULLs, in this data page. **/
|
||||
1: required i32 num_values
|
||||
/** Number of NULL values, in this data page.
|
||||
Number of non-null = num_values - num_nulls which is also the number of values in the data section **/
|
||||
2: required i32 num_nulls
|
||||
/** Number of rows in this data page, which means pages change on record boundaries (r = 0) **/
|
||||
3: required i32 num_rows
|
||||
/** Encoding used for data in this page **/
|
||||
4: required Encoding encoding
|
||||
|
||||
// repetition levels and definition levels are always using RLE (without size in it)
|
||||
|
||||
/** length of the definition levels */
|
||||
5: required i32 definition_levels_byte_length;
|
||||
/** length of the repetition levels */
|
||||
6: required i32 repetition_levels_byte_length;
|
||||
|
||||
/** whether the values are compressed.
|
||||
Which means the section of the page between
|
||||
definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
|
||||
is compressed with the compression_codec.
|
||||
If missing it is considered compressed */
|
||||
7: optional bool is_compressed = 1;
|
||||
|
||||
/** optional statistics for this column chunk */
|
||||
8: optional Statistics statistics;
|
||||
}
|
||||
|
||||
struct PageHeader {
|
||||
/** the type of the page: indicates which of the *_header fields is set **/
|
||||
1: required PageType type
|
||||
|
||||
/** Uncompressed page size in bytes (not including this header) **/
|
||||
2: required i32 uncompressed_page_size
|
||||
|
||||
/** Compressed page size in bytes (not including this header) **/
|
||||
3: required i32 compressed_page_size
|
||||
|
||||
/** 32bit crc for the data below. This allows for disabling checksumming in HDFS
|
||||
* if only a few pages need to be read
|
||||
**/
|
||||
4: optional i32 crc
|
||||
|
||||
// Headers for page specific data. One only will be set.
|
||||
5: optional DataPageHeader data_page_header;
|
||||
6: optional IndexPageHeader index_page_header;
|
||||
7: optional DictionaryPageHeader dictionary_page_header;
|
||||
8: optional DataPageHeaderV2 data_page_header_v2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper struct to store key values
|
||||
*/
|
||||
struct KeyValue {
|
||||
1: required string key
|
||||
2: optional string value
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper struct to specify sort order
|
||||
*/
|
||||
struct SortingColumn {
|
||||
/** The column index (in this row group) **/
|
||||
1: required i32 column_idx
|
||||
|
||||
/** If true, indicates this column is sorted in descending order. **/
|
||||
2: required bool descending
|
||||
|
||||
/** If true, nulls will come before non-null values, otherwise,
|
||||
* nulls go at the end. */
|
||||
3: required bool nulls_first
|
||||
}
|
||||
|
||||
/**
|
||||
* statistics of a given page type and encoding
|
||||
*/
|
||||
struct PageEncodingStats {
|
||||
|
||||
/** the page type (data/dic/...) **/
|
||||
1: required PageType page_type;
|
||||
|
||||
/** encoding of the page **/
|
||||
2: required Encoding encoding;
|
||||
|
||||
/** number of pages of this type with this encoding **/
|
||||
3: required i32 count;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Description for column metadata
|
||||
*/
|
||||
struct ColumnMetaData {
|
||||
/** Type of this column **/
|
||||
1: required Type type
|
||||
|
||||
/** Set of all encodings used for this column. The purpose is to validate
|
||||
* whether we can decode those pages. **/
|
||||
2: required list<Encoding> encodings
|
||||
|
||||
/** Path in schema **/
|
||||
3: required list<string> path_in_schema
|
||||
|
||||
/** Compression codec **/
|
||||
4: required CompressionCodec codec
|
||||
|
||||
/** Number of values in this column **/
|
||||
5: required i64 num_values
|
||||
|
||||
/** total byte size of all uncompressed pages in this column chunk (including the headers) **/
|
||||
6: required i64 total_uncompressed_size
|
||||
|
||||
/** total byte size of all compressed pages in this column chunk (including the headers) **/
|
||||
7: required i64 total_compressed_size
|
||||
|
||||
/** Optional key/value metadata **/
|
||||
8: optional list<KeyValue> key_value_metadata
|
||||
|
||||
/** Byte offset from beginning of file to first data page **/
|
||||
9: required i64 data_page_offset
|
||||
|
||||
/** Byte offset from beginning of file to root index page **/
|
||||
10: optional i64 index_page_offset
|
||||
|
||||
/** Byte offset from the beginning of file to first (only) dictionary page **/
|
||||
11: optional i64 dictionary_page_offset
|
||||
|
||||
/** optional statistics for this column chunk */
|
||||
12: optional Statistics statistics;
|
||||
|
||||
/** Set of all encodings used for pages in this column chunk.
|
||||
* This information can be used to determine if all data pages are
|
||||
* dictionary encoded for example **/
|
||||
13: optional list<PageEncodingStats> encoding_stats;
|
||||
}
|
||||
|
||||
struct ColumnChunk {
|
||||
/** File where column data is stored. If not set, assumed to be same file as
|
||||
* metadata. This path is relative to the current file.
|
||||
**/
|
||||
1: optional string file_path
|
||||
|
||||
/** Byte offset in file_path to the ColumnMetaData **/
|
||||
2: required i64 file_offset
|
||||
|
||||
/** Column metadata for this chunk. This is the same content as what is at
|
||||
* file_path/file_offset. Having it here has it replicated in the file
|
||||
* metadata.
|
||||
**/
|
||||
3: optional ColumnMetaData meta_data
|
||||
|
||||
/** File offset of ColumnChunk's OffsetIndex **/
|
||||
4: optional i64 offset_index_offset
|
||||
|
||||
/** Size of ColumnChunk's OffsetIndex, in bytes **/
|
||||
5: optional i32 offset_index_length
|
||||
|
||||
/** File offset of ColumnChunk's ColumnIndex **/
|
||||
6: optional i64 column_index_offset
|
||||
|
||||
/** Size of ColumnChunk's ColumnIndex, in bytes **/
|
||||
7: optional i32 column_index_length
|
||||
}
|
||||
|
||||
struct RowGroup {
|
||||
/** Metadata for each column chunk in this row group.
|
||||
* This list must have the same order as the SchemaElement list in FileMetaData.
|
||||
**/
|
||||
1: required list<ColumnChunk> columns
|
||||
|
||||
/** Total byte size of all the uncompressed column data in this row group **/
|
||||
2: required i64 total_byte_size
|
||||
|
||||
/** Number of rows in this row group **/
|
||||
3: required i64 num_rows
|
||||
|
||||
/** If set, specifies a sort ordering of the rows in this RowGroup.
|
||||
* The sorting columns can be a subset of all the columns.
|
||||
*/
|
||||
4: optional list<SortingColumn> sorting_columns
|
||||
}
|
||||
|
||||
/** Empty struct to signal the order defined by the physical or logical type */
|
||||
struct TypeDefinedOrder {}
|
||||
|
||||
/**
|
||||
* Union to specify the order used for the min_value and max_value fields for a
|
||||
* column. This union takes the role of an enhanced enum that allows rich
|
||||
* elements (which will be needed for a collation-based ordering in the future).
|
||||
*
|
||||
* Possible values are:
|
||||
* * TypeDefinedOrder - the column uses the order defined by its logical or
|
||||
* physical type (if there is no logical type).
|
||||
*
|
||||
* If the reader does not support the value of this union, min and max stats
|
||||
* for this column should be ignored.
|
||||
*/
|
||||
union ColumnOrder {
|
||||
|
||||
/**
|
||||
* The sort orders for logical types are:
|
||||
* UTF8 - unsigned byte-wise comparison
|
||||
* INT8 - signed comparison
|
||||
* INT16 - signed comparison
|
||||
* INT32 - signed comparison
|
||||
* INT64 - signed comparison
|
||||
* UINT8 - unsigned comparison
|
||||
* UINT16 - unsigned comparison
|
||||
* UINT32 - unsigned comparison
|
||||
* UINT64 - unsigned comparison
|
||||
* DECIMAL - signed comparison of the represented value
|
||||
* DATE - signed comparison
|
||||
* TIME_MILLIS - signed comparison
|
||||
* TIME_MICROS - signed comparison
|
||||
* TIMESTAMP_MILLIS - signed comparison
|
||||
* TIMESTAMP_MICROS - signed comparison
|
||||
* INTERVAL - unsigned comparison
|
||||
* JSON - unsigned byte-wise comparison
|
||||
* BSON - unsigned byte-wise comparison
|
||||
* ENUM - unsigned byte-wise comparison
|
||||
* LIST - undefined
|
||||
* MAP - undefined
|
||||
*
|
||||
* In the absence of logical types, the sort order is determined by the physical type:
|
||||
* BOOLEAN - false, true
|
||||
* INT32 - signed comparison
|
||||
* INT64 - signed comparison
|
||||
* INT96 (only used for legacy timestamps) - undefined
|
||||
* FLOAT - signed comparison of the represented value (*)
|
||||
* DOUBLE - signed comparison of the represented value (*)
|
||||
* BYTE_ARRAY - unsigned byte-wise comparison
|
||||
* FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
|
||||
*
|
||||
* (*) Because the sorting order is not specified properly for floating
|
||||
* point values (relations vs. total ordering) the following
|
||||
* compatibility rules should be applied when reading statistics:
|
||||
* - If the min is a NaN, it should be ignored.
|
||||
* - If the max is a NaN, it should be ignored.
|
||||
* - If the min is +0, the row group may contain -0 values as well.
|
||||
* - If the max is -0, the row group may contain +0 values as well.
|
||||
* - When looking for NaN values, min and max should be ignored.
|
||||
*/
|
||||
1: TypeDefinedOrder TYPE_ORDER;
|
||||
}
|
||||
|
||||
struct PageLocation {
|
||||
/** Offset of the page in the file **/
|
||||
1: required i64 offset
|
||||
|
||||
/**
|
||||
* Size of the page, including header. Sum of compressed_page_size and header
|
||||
* length
|
||||
*/
|
||||
2: required i32 compressed_page_size
|
||||
|
||||
/**
|
||||
* Index within the RowGroup of the first row of the page; this means pages
|
||||
* change on record boundaries (r = 0).
|
||||
*/
|
||||
3: required i64 first_row_index
|
||||
}
|
||||
|
||||
struct OffsetIndex {
|
||||
/**
|
||||
* PageLocations, ordered by increasing PageLocation.offset. It is required
|
||||
* that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
|
||||
*/
|
||||
1: required list<PageLocation> page_locations
|
||||
}
|
||||
|
||||
/**
|
||||
* Description for ColumnIndex.
|
||||
* Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]
|
||||
*/
|
||||
struct ColumnIndex {
|
||||
/**
|
||||
* A list of Boolean values to determine the validity of the corresponding
|
||||
* min and max values. If true, a page contains only null values, and writers
|
||||
* have to set the corresponding entries in min_values and max_values to
|
||||
* byte[0], so that all lists have the same length. If false, the
|
||||
* corresponding entries in min_values and max_values must be valid.
|
||||
*/
|
||||
1: required list<bool> null_pages
|
||||
|
||||
/**
|
||||
* Two lists containing lower and upper bounds for the values of each page.
|
||||
* These may be the actual minimum and maximum values found on a page, but
|
||||
* can also be (more compact) values that do not exist on a page. For
|
||||
* example, instead of storing "Blart Versenwald III", a writer may set
|
||||
* min_values[i]="B", max_values[i]="C". Such more compact values must still
|
||||
* be valid values within the column's logical type. Readers must make sure
|
||||
* that list entries are populated before using them by inspecting null_pages.
|
||||
*/
|
||||
2: required list<binary> min_values
|
||||
3: required list<binary> max_values
|
||||
|
||||
/**
|
||||
* Stores whether both min_values and max_values are ordered and if so, in
|
||||
* which direction. This allows readers to perform binary searches in both
|
||||
* lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
|
||||
* if the lists are ordered.
|
||||
*/
|
||||
4: required BoundaryOrder boundary_order
|
||||
|
||||
/** A list containing the number of null values for each page **/
|
||||
5: optional list<i64> null_counts
|
||||
}
|
||||
|
||||
/**
|
||||
* Description for file metadata
|
||||
*/
|
||||
struct FileMetaData {
|
||||
/** Version of this file **/
|
||||
1: required i32 version
|
||||
|
||||
/** Parquet schema for this file. This schema contains metadata for all the columns.
|
||||
* The schema is represented as a tree with a single root. The nodes of the tree
|
||||
* are flattened to a list by doing a depth-first traversal.
|
||||
* The column metadata contains the path in the schema for that column which can be
|
||||
* used to map columns to nodes in the schema.
|
||||
* The first element is the root **/
|
||||
2: required list<SchemaElement> schema;
|
||||
|
||||
/** Number of rows in this file **/
|
||||
3: required i64 num_rows
|
||||
|
||||
/** Row groups in this file **/
|
||||
4: required list<RowGroup> row_groups
|
||||
|
||||
/** Optional key/value metadata **/
|
||||
5: optional list<KeyValue> key_value_metadata
|
||||
|
||||
/** String for application that wrote this file. This should be in the format
|
||||
* <Application> version <App Version> (build <App Build Hash>).
|
||||
* e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
|
||||
**/
|
||||
6: optional string created_by
|
||||
|
||||
/**
|
||||
* Sort order used for the min_value and max_value fields of each column in
|
||||
* this file. Each sort order corresponds to one column, determined by its
|
||||
* position in the list, matching the position of the column in the schema.
|
||||
*
|
||||
* Without column_orders, the meaning of the min_value and max_value fields is
|
||||
* undefined. To ensure well-defined behavior, if min_value and max_value are
|
||||
* written to a Parquet file, column_orders must be written as well.
|
||||
*
|
||||
* The obsolete min and max fields are always sorted by signed comparison
|
||||
* regardless of column_orders.
|
||||
*/
|
||||
7: optional list<ColumnOrder> column_orders;
|
||||
}
|
||||
|
@ -1,169 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"io"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio-go/v7/pkg/set"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// GetReaderFunc - function type returning io.ReadCloser for requested offset/length.
// Callers in this package pass negative offsets (footerSize uses -8 and
// fileMetadata uses -(8 + footer size)); the test helper getReader treats a
// negative offset as relative to the end of the file.
|
||||
type GetReaderFunc func(offset, length int64) (io.ReadCloser, error)
|
||||
|
||||
func footerSize(getReaderFunc GetReaderFunc) (size int64, err error) {
|
||||
rc, err := getReaderFunc(-8, 4)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer rc.Close()
|
||||
|
||||
buf := make([]byte, 4)
|
||||
if _, err = io.ReadFull(rc, buf); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
size = int64(binary.LittleEndian.Uint32(buf))
|
||||
|
||||
return size, nil
|
||||
}
|
||||
|
||||
func fileMetadata(getReaderFunc GetReaderFunc) (*parquet.FileMetaData, error) {
|
||||
size, err := footerSize(getReaderFunc)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
rc, err := getReaderFunc(-(8 + size), size)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rc.Close()
|
||||
|
||||
fileMeta := parquet.NewFileMetaData()
|
||||
|
||||
pf := thrift.NewTCompactProtocolFactory()
|
||||
protocol := pf.GetProtocol(thrift.NewStreamTransportR(rc))
|
||||
err = fileMeta.Read(protocol)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return fileMeta, nil
|
||||
}
|
||||
|
||||
// Value - denotes column value
// Reader.Read fills all three fields from the results of a column read.
|
||||
type Value struct {
|
||||
// Value is the decoded cell; MarshalJSON serializes only this field.
Value interface{}
|
||||
// Type is the parquet type reported alongside the cell.
Type parquet.Type
|
||||
// Schema is the schema element reported alongside the cell.
Schema *parquet.SchemaElement
|
||||
}
|
||||
|
||||
// MarshalJSON - encodes to JSON data
|
||||
func (value Value) MarshalJSON() (data []byte, err error) {
|
||||
return json.Marshal(value.Value)
|
||||
}
|
||||
|
||||
// Reader - denotes parquet file.
// It keeps the file-level metadata obtained by NewReader plus the cursor
// state (current row group, its open columns and the row position) that
// Read advances.
|
||||
type Reader struct {
|
||||
// getReaderFunc supplies byte ranges of the underlying file.
getReaderFunc GetReaderFunc
|
||||
// schemaElements is the schema list taken from the file metadata.
schemaElements []*parquet.SchemaElement
|
||||
// rowGroups is the row group list taken from the file metadata.
rowGroups []*parquet.RowGroup
|
||||
// rowGroupIndex is the index into rowGroups currently being read.
rowGroupIndex int
|
||||
|
||||
// nameList holds the name of every schema element, in schema order.
nameList []string
|
||||
// columnNames is the column selection passed to NewReader.
columnNames set.StringSet
|
||||
// columns holds the open columns of the current row group; nil until
// Read loads them and again after Close.
columns map[string]*column
|
||||
// rowIndex counts the rows already returned from the current row group.
rowIndex int64
|
||||
}
|
||||
|
||||
// NewReader - creates new parquet reader. Reader calls getReaderFunc to get required data range for given columnNames. If columnNames is empty, all columns are used.
|
||||
func NewReader(getReaderFunc GetReaderFunc, columnNames set.StringSet) (*Reader, error) {
|
||||
fileMeta, err := fileMetadata(getReaderFunc)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nameList := []string{}
|
||||
schemaElements := fileMeta.GetSchema()
|
||||
for _, element := range schemaElements {
|
||||
nameList = append(nameList, element.Name)
|
||||
}
|
||||
|
||||
return &Reader{
|
||||
getReaderFunc: getReaderFunc,
|
||||
rowGroups: fileMeta.GetRowGroups(),
|
||||
schemaElements: schemaElements,
|
||||
nameList: nameList,
|
||||
columnNames: columnNames,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Read - reads single record.
|
||||
func (reader *Reader) Read() (record *Record, err error) {
|
||||
if reader.rowGroupIndex >= len(reader.rowGroups) {
|
||||
return nil, io.EOF
|
||||
}
|
||||
|
||||
if reader.columns == nil {
|
||||
reader.columns, err = getColumns(
|
||||
reader.rowGroups[reader.rowGroupIndex],
|
||||
reader.columnNames,
|
||||
reader.schemaElements,
|
||||
reader.getReaderFunc,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reader.rowIndex = 0
|
||||
}
|
||||
|
||||
if reader.rowIndex >= reader.rowGroups[reader.rowGroupIndex].GetNumRows() {
|
||||
reader.rowGroupIndex++
|
||||
reader.Close()
|
||||
return reader.Read()
|
||||
}
|
||||
|
||||
record = newRecord(reader.nameList)
|
||||
for name := range reader.columns {
|
||||
col := reader.columns[name]
|
||||
value, valueType, schema := col.read()
|
||||
record.set(name, Value{Value: value, Type: valueType, Schema: schema})
|
||||
}
|
||||
|
||||
reader.rowIndex++
|
||||
|
||||
return record, nil
|
||||
}
|
||||
|
||||
// Close - closes underneath readers.
|
||||
func (reader *Reader) Close() (err error) {
|
||||
for _, column := range reader.columns {
|
||||
column.close()
|
||||
}
|
||||
|
||||
reader.columns = nil
|
||||
reader.rowIndex = 0
|
||||
|
||||
return nil
|
||||
}
|
@ -1,91 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio-go/v7/pkg/set"
|
||||
)
|
||||
|
||||
// getReader opens the named file and returns it positioned at offset.
// A negative offset is interpreted relative to the end of the file.
// The length argument is accepted for signature compatibility but is not
// enforced: the returned reader yields everything from offset to EOF.
// The caller owns the returned file and must close it. On any error the
// file opened here is closed before returning, so no descriptor leaks.
func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
	file, err := os.Open(name)
	if err != nil {
		return nil, err
	}

	fi, err := file.Stat()
	if err != nil {
		file.Close() // best effort; the Stat error is the one worth reporting
		return nil, err
	}

	if offset < 0 {
		offset = fi.Size() + offset
	}

	if _, err = file.Seek(offset, io.SeekStart); err != nil {
		file.Close() // best effort; the Seek error is the one worth reporting
		return nil, err
	}

	return file, nil
}
|
||||
|
||||
func TestReader(t *testing.T) {
|
||||
name := "example.parquet"
|
||||
reader, err := NewReader(
|
||||
func(offset, length int64) (io.ReadCloser, error) {
|
||||
return getReader(name, offset, length)
|
||||
},
|
||||
set.CreateStringSet("one", "two", "three"),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expectedRecords := []string{
|
||||
`map[one:{-1 DOUBLE SchemaElement({Type:DOUBLE TypeLength:<nil> RepetitionType:OPTIONAL Name:one NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})} three:{true BOOLEAN SchemaElement({Type:BOOLEAN TypeLength:<nil> RepetitionType:OPTIONAL Name:three NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})} two:{[102 111 111] BYTE_ARRAY SchemaElement({Type:BYTE_ARRAY TypeLength:<nil> RepetitionType:OPTIONAL Name:two NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})}]`,
|
||||
`map[one:{<nil> DOUBLE SchemaElement({Type:DOUBLE TypeLength:<nil> RepetitionType:OPTIONAL Name:one NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})} three:{false BOOLEAN SchemaElement({Type:BOOLEAN TypeLength:<nil> RepetitionType:OPTIONAL Name:three NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})} two:{[98 97 114] BYTE_ARRAY SchemaElement({Type:BYTE_ARRAY TypeLength:<nil> RepetitionType:OPTIONAL Name:two NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})}]`,
|
||||
`map[one:{2.5 DOUBLE SchemaElement({Type:DOUBLE TypeLength:<nil> RepetitionType:OPTIONAL Name:one NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})} three:{true BOOLEAN SchemaElement({Type:BOOLEAN TypeLength:<nil> RepetitionType:OPTIONAL Name:three NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})} two:{[98 97 122] BYTE_ARRAY SchemaElement({Type:BYTE_ARRAY TypeLength:<nil> RepetitionType:OPTIONAL Name:two NumChildren:<nil> ConvertedType:<nil> Scale:<nil> Precision:<nil> FieldID:<nil> LogicalType:<nil>})}]`,
|
||||
}
|
||||
|
||||
i := 0
|
||||
for {
|
||||
record, err := reader.Read()
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
if i == len(expectedRecords) {
|
||||
t.Errorf("read more than expected record count %v", len(expectedRecords))
|
||||
}
|
||||
|
||||
if record.String() != expectedRecords[i] {
|
||||
t.Errorf("record%v: expected: %v, got: %v", i+1, expectedRecords[i], record.String())
|
||||
}
|
||||
|
||||
i++
|
||||
}
|
||||
|
||||
reader.Close()
|
||||
}
|
@ -1,71 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Record - ordered parquet record.
|
||||
type Record struct {
|
||||
nameList []string
|
||||
nameValueMap map[string]Value
|
||||
}
|
||||
|
||||
// String - returns string representation of this record.
|
||||
func (r *Record) String() string {
|
||||
values := []string{}
|
||||
r.Range(func(name string, value Value) bool {
|
||||
values = append(values, fmt.Sprintf("%v:%v", name, value))
|
||||
return true
|
||||
})
|
||||
|
||||
return "map[" + strings.Join(values, " ") + "]"
|
||||
}
|
||||
|
||||
func (r *Record) set(name string, value Value) {
|
||||
r.nameValueMap[name] = value
|
||||
}
|
||||
|
||||
// Get - returns Value of name.
|
||||
func (r *Record) Get(name string) (Value, bool) {
|
||||
value, ok := r.nameValueMap[name]
|
||||
return value, ok
|
||||
}
|
||||
|
||||
// Range - calls f sequentially for each name and value present in the record. If f returns false, range stops the iteration.
|
||||
func (r *Record) Range(f func(name string, value Value) bool) {
|
||||
for _, name := range r.nameList {
|
||||
value, ok := r.nameValueMap[name]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
if !f(name, value) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func newRecord(nameList []string) *Record {
|
||||
return &Record{
|
||||
nameList: nameList,
|
||||
nameValueMap: make(map[string]Value),
|
||||
}
|
||||
}
|
@ -1,127 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package schema
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// nameRegexp matches names made only of ASCII letters, digits and underscores.
var nameRegexp = regexp.MustCompile("^[a-zA-Z0-9_]+$")

// validataPathSegments - returns nil when every segment is matched by
// nameRegexp; otherwise returns an error naming the whole dotted path.
func validataPathSegments(pathSegments []string) error {
	for i := range pathSegments {
		if nameRegexp.MatchString(pathSegments[i]) {
			continue
		}

		return fmt.Errorf("unsupported name %v", strings.Join(pathSegments, "."))
	}

	return nil
}
|
||||
|
||||
// Element - represents schema element and its children. Any element must have Name and RepetitionType fields set.
|
||||
type Element struct {
|
||||
parquet.SchemaElement
|
||||
numChildren int32
|
||||
Encoding *parquet.Encoding // Optional; defaults is computed.
|
||||
CompressionType *parquet.CompressionCodec // Optional; defaults to SNAPPY.
|
||||
Children *Tree
|
||||
MaxDefinitionLevel int64
|
||||
MaxRepetitionLevel int64
|
||||
PathInTree string
|
||||
PathInSchema string
|
||||
}
|
||||
|
||||
// String - stringify this element.
|
||||
func (element *Element) String() string {
|
||||
var s []string
|
||||
s = append(s, "Name:"+element.Name)
|
||||
s = append(s, "RepetitionType:"+element.RepetitionType.String())
|
||||
if element.Type != nil {
|
||||
s = append(s, "Type:"+element.Type.String())
|
||||
}
|
||||
if element.ConvertedType != nil {
|
||||
s = append(s, "ConvertedType:"+element.ConvertedType.String())
|
||||
}
|
||||
if element.Encoding != nil {
|
||||
s = append(s, "Encoding:"+element.Encoding.String())
|
||||
}
|
||||
if element.CompressionType != nil {
|
||||
s = append(s, "CompressionType:"+element.CompressionType.String())
|
||||
}
|
||||
if element.Children != nil && element.Children.Length() > 0 {
|
||||
s = append(s, "Children:"+element.Children.String())
|
||||
}
|
||||
s = append(s, fmt.Sprintf("MaxDefinitionLevel:%v", element.MaxDefinitionLevel))
|
||||
s = append(s, fmt.Sprintf("MaxRepetitionLevel:%v", element.MaxRepetitionLevel))
|
||||
if element.PathInTree != "" {
|
||||
s = append(s, "PathInTree:"+element.PathInTree)
|
||||
}
|
||||
if element.PathInSchema != "" {
|
||||
s = append(s, "PathInSchema:"+element.PathInSchema)
|
||||
}
|
||||
|
||||
return "{" + strings.Join(s, ", ") + "}"
|
||||
}
|
||||
|
||||
// NewElement - creates new element.
|
||||
func NewElement(name string, repetitionType parquet.FieldRepetitionType,
|
||||
elementType *parquet.Type, convertedType *parquet.ConvertedType,
|
||||
encoding *parquet.Encoding, compressionType *parquet.CompressionCodec,
|
||||
children *Tree) (*Element, error) {
|
||||
|
||||
if !nameRegexp.MatchString(name) {
|
||||
return nil, fmt.Errorf("unsupported name %v", name)
|
||||
}
|
||||
|
||||
switch repetitionType {
|
||||
case parquet.FieldRepetitionType_REQUIRED, parquet.FieldRepetitionType_OPTIONAL, parquet.FieldRepetitionType_REPEATED:
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown repetition type %v", repetitionType)
|
||||
}
|
||||
|
||||
if repetitionType == parquet.FieldRepetitionType_REPEATED && (elementType != nil || convertedType != nil) {
|
||||
return nil, fmt.Errorf("repetition type REPEATED should be used in group element")
|
||||
}
|
||||
|
||||
if children != nil && children.Length() != 0 {
|
||||
if elementType != nil {
|
||||
return nil, fmt.Errorf("type should be nil for group element")
|
||||
}
|
||||
}
|
||||
|
||||
element := Element{
|
||||
Encoding: encoding,
|
||||
CompressionType: compressionType,
|
||||
Children: children,
|
||||
}
|
||||
|
||||
element.Name = name
|
||||
element.RepetitionType = &repetitionType
|
||||
element.Type = elementType
|
||||
element.ConvertedType = convertedType
|
||||
element.NumChildren = &element.numChildren
|
||||
if element.Children != nil {
|
||||
element.numChildren = int32(element.Children.Length())
|
||||
}
|
||||
|
||||
return &element, nil
|
||||
}
|
@ -1,389 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package schema
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func updateMaxDLRL(schemaMap map[string]*Element, maxDL, maxRL int64) {
|
||||
for _, element := range schemaMap {
|
||||
element.MaxDefinitionLevel = maxDL
|
||||
element.MaxRepetitionLevel = maxRL
|
||||
if *element.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
|
||||
element.MaxDefinitionLevel++
|
||||
if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
|
||||
element.MaxRepetitionLevel++
|
||||
}
|
||||
}
|
||||
|
||||
if element.Children != nil {
|
||||
updateMaxDLRL(element.Children.schemaMap, element.MaxDefinitionLevel, element.MaxRepetitionLevel)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func toParquetSchema(tree *Tree, treePrefix string, schemaPrefix string, schemaList *[]*parquet.SchemaElement, valueElements *[]*Element) (err error) {
|
||||
tree.Range(func(name string, element *Element) bool {
|
||||
pathInTree := name
|
||||
if treePrefix != "" {
|
||||
pathInTree = treePrefix + "." + name
|
||||
}
|
||||
|
||||
if element.Type == nil && element.ConvertedType == nil && element.Children == nil {
|
||||
err = fmt.Errorf("%v: group element must have children", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if element.ConvertedType != nil {
|
||||
switch *element.ConvertedType {
|
||||
case parquet.ConvertedType_LIST:
|
||||
// Supported structure.
|
||||
// <REQUIRED|OPTIONAL> group <name> (LIST) {
|
||||
// REPEATED group list {
|
||||
// <REQUIRED|OPTIONAL> <element-type> element;
|
||||
// }
|
||||
// }
|
||||
|
||||
if element.Type != nil {
|
||||
err = fmt.Errorf("%v: type must be nil for LIST ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if element.Children == nil || element.Children.Length() != 1 {
|
||||
err = fmt.Errorf("%v: children must have one element only for LIST ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
listElement, ok := element.Children.Get("list")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v: missing group element 'list' for LIST ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if listElement.Name != "list" {
|
||||
err = fmt.Errorf("%v.list: name must be 'list'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if *listElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
|
||||
err = fmt.Errorf("%v.list: repetition type must be REPEATED type", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if listElement.Type != nil || listElement.ConvertedType != nil {
|
||||
err = fmt.Errorf("%v.list: type and converted type must be nil", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if listElement.Children == nil || listElement.Children.Length() != 1 {
|
||||
err = fmt.Errorf("%v.list.element: not found", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
valueElement, ok := listElement.Children.Get("element")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v.list.element: not found", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if valueElement.Name != "element" {
|
||||
err = fmt.Errorf("%v.list.element: name must be 'element'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
case parquet.ConvertedType_MAP:
|
||||
// Supported structure:
|
||||
// <REQUIRED|OPTIONAL> group <name> (MAP) {
|
||||
// REPEATED group key_value {
|
||||
// REQUIRED <key-type> key;
|
||||
// <REQUIRED|OPTIONAL> <value-type> value;
|
||||
// }
|
||||
// }
|
||||
|
||||
if element.Type != nil {
|
||||
err = fmt.Errorf("%v: type must be nil for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if element.Children == nil || element.Children.Length() != 1 {
|
||||
err = fmt.Errorf("%v: children must have one element only for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
keyValueElement, ok := element.Children.Get("key_value")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v: missing group element 'key_value' for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyValueElement.Name != "key_value" {
|
||||
err = fmt.Errorf("%v.key_value: name must be 'key_value'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if *keyValueElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
|
||||
err = fmt.Errorf("%v.key_value: repetition type must be REPEATED type", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyValueElement.Children == nil || keyValueElement.Children.Length() < 1 || keyValueElement.Children.Length() > 2 {
|
||||
err = fmt.Errorf("%v.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
keyElement, ok := keyValueElement.Children.Get("key")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v.key_value: missing 'key' element for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyElement.Name != "key" {
|
||||
err = fmt.Errorf("%v.key_value.key: name must be 'key'", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if *keyElement.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
|
||||
err = fmt.Errorf("%v.key_value: repetition type must be REQUIRED type", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if keyValueElement.Children.Length() == 2 {
|
||||
valueElement, ok := keyValueElement.Children.Get("value")
|
||||
if !ok {
|
||||
err = fmt.Errorf("%v.key_value: second element must be 'value' element for MAP ConvertedType", pathInTree)
|
||||
return false
|
||||
}
|
||||
|
||||
if valueElement.Name != "value" {
|
||||
err = fmt.Errorf("%v.key_value.value: name must be 'value'", pathInTree)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
case parquet.ConvertedType_UTF8, parquet.ConvertedType_UINT_8, parquet.ConvertedType_UINT_16:
|
||||
fallthrough
|
||||
case parquet.ConvertedType_UINT_32, parquet.ConvertedType_UINT_64, parquet.ConvertedType_INT_8:
|
||||
fallthrough
|
||||
case parquet.ConvertedType_INT_16, parquet.ConvertedType_INT_32, parquet.ConvertedType_INT_64:
|
||||
if element.Type == nil {
|
||||
err = fmt.Errorf("%v: ConvertedType %v must have Type value", pathInTree, element.ConvertedType)
|
||||
return false
|
||||
}
|
||||
|
||||
default:
|
||||
err = fmt.Errorf("%v: unsupported ConvertedType %v", pathInTree, element.ConvertedType)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
element.PathInTree = pathInTree
|
||||
element.PathInSchema = element.Name
|
||||
if schemaPrefix != "" {
|
||||
element.PathInSchema = schemaPrefix + "." + element.Name
|
||||
}
|
||||
|
||||
if element.Type != nil {
|
||||
*valueElements = append(*valueElements, element)
|
||||
}
|
||||
|
||||
*schemaList = append(*schemaList, &element.SchemaElement)
|
||||
if element.Children != nil {
|
||||
element.numChildren = int32(element.Children.Length())
|
||||
err = toParquetSchema(element.Children, element.PathInTree, element.PathInSchema, schemaList, valueElements)
|
||||
}
|
||||
|
||||
return (err == nil)
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// Tree - represents tree of schema. Tree preserves order in which elements are added.
|
||||
type Tree struct {
|
||||
schemaMap map[string]*Element
|
||||
keys []string
|
||||
readOnly bool
|
||||
}
|
||||
|
||||
// String - stringify this tree.
|
||||
func (tree *Tree) String() string {
|
||||
var s []string
|
||||
tree.Range(func(name string, element *Element) bool {
|
||||
s = append(s, fmt.Sprintf("%v: %v", name, element))
|
||||
return true
|
||||
})
|
||||
|
||||
return "{" + strings.Join(s, ", ") + "}"
|
||||
}
|
||||
|
||||
// Length - returns length of tree.
|
||||
func (tree *Tree) Length() int {
|
||||
return len(tree.keys)
|
||||
}
|
||||
|
||||
func (tree *Tree) travel(pathSegments []string) (pathSegmentIndex int, pathSegment string, currElement *Element, parentTree *Tree, found bool) {
|
||||
parentTree = tree
|
||||
for pathSegmentIndex, pathSegment = range pathSegments {
|
||||
if tree == nil {
|
||||
found = false
|
||||
break
|
||||
}
|
||||
|
||||
var tmpCurrElement *Element
|
||||
if tmpCurrElement, found = tree.schemaMap[pathSegment]; !found {
|
||||
break
|
||||
}
|
||||
currElement = tmpCurrElement
|
||||
|
||||
parentTree = tree
|
||||
tree = currElement.Children
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// ReadOnly - returns whether this tree is read only or not.
|
||||
func (tree *Tree) ReadOnly() bool {
|
||||
return tree.readOnly
|
||||
}
|
||||
|
||||
// Get - returns the element stored for name.
|
||||
func (tree *Tree) Get(name string) (element *Element, ok bool) {
|
||||
pathSegments := strings.Split(name, ".")
|
||||
for _, pathSegment := range pathSegments {
|
||||
if tree == nil {
|
||||
element = nil
|
||||
ok = false
|
||||
break
|
||||
}
|
||||
|
||||
if element, ok = tree.schemaMap[pathSegment]; !ok {
|
||||
break
|
||||
}
|
||||
|
||||
tree = element.Children
|
||||
}
|
||||
|
||||
return element, ok
|
||||
}
|
||||
|
||||
// Set - adds or sets element to name.
|
||||
func (tree *Tree) Set(name string, element *Element) error {
|
||||
if tree.readOnly {
|
||||
return fmt.Errorf("read only tree")
|
||||
}
|
||||
|
||||
pathSegments := strings.Split(name, ".")
|
||||
if err := validataPathSegments(pathSegments); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
i, pathSegment, currElement, parentTree, found := tree.travel(pathSegments)
|
||||
|
||||
if !found {
|
||||
if i != len(pathSegments)-1 {
|
||||
return fmt.Errorf("parent %v does not exist", strings.Join(pathSegments[:i+1], "."))
|
||||
}
|
||||
|
||||
if currElement == nil {
|
||||
parentTree = tree
|
||||
} else {
|
||||
if currElement.Type != nil {
|
||||
return fmt.Errorf("parent %v is not group element", strings.Join(pathSegments[:i], "."))
|
||||
}
|
||||
|
||||
if currElement.Children == nil {
|
||||
currElement.Children = NewTree()
|
||||
}
|
||||
parentTree = currElement.Children
|
||||
}
|
||||
|
||||
parentTree.keys = append(parentTree.keys, pathSegment)
|
||||
}
|
||||
|
||||
parentTree.schemaMap[pathSegment] = element
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete - deletes name and its element.
|
||||
func (tree *Tree) Delete(name string) {
|
||||
if tree.readOnly {
|
||||
panic(fmt.Errorf("read only tree"))
|
||||
}
|
||||
|
||||
pathSegments := strings.Split(name, ".")
|
||||
|
||||
_, pathSegment, _, parentTree, found := tree.travel(pathSegments)
|
||||
|
||||
if found {
|
||||
for i := range parentTree.keys {
|
||||
if parentTree.keys[i] == pathSegment {
|
||||
copy(parentTree.keys[i:], parentTree.keys[i+1:])
|
||||
parentTree.keys = parentTree.keys[:len(parentTree.keys)-1]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
delete(parentTree.schemaMap, pathSegment)
|
||||
}
|
||||
}
|
||||
|
||||
// Range - calls f sequentially for each name and its element. If f returns false, range stops the iteration.
|
||||
func (tree *Tree) Range(f func(name string, element *Element) bool) {
|
||||
for _, name := range tree.keys {
|
||||
if !f(name, tree.schemaMap[name]) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ToParquetSchema - returns list of parquet SchemaElement and list of elements those stores values.
|
||||
func (tree *Tree) ToParquetSchema() (schemaList []*parquet.SchemaElement, valueElements []*Element, err error) {
|
||||
if tree.readOnly {
|
||||
return nil, nil, fmt.Errorf("read only tree")
|
||||
}
|
||||
|
||||
updateMaxDLRL(tree.schemaMap, 0, 0)
|
||||
|
||||
var schemaElements []*parquet.SchemaElement
|
||||
if err = toParquetSchema(tree, "", "", &schemaElements, &valueElements); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
tree.readOnly = true
|
||||
|
||||
numChildren := int32(len(tree.keys))
|
||||
schemaList = append(schemaList, &parquet.SchemaElement{
|
||||
Name: "schema",
|
||||
RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
|
||||
NumChildren: &numChildren,
|
||||
})
|
||||
schemaList = append(schemaList, schemaElements...)
|
||||
return schemaList, valueElements, nil
|
||||
}
|
||||
|
||||
// NewTree - creates new schema tree.
|
||||
func NewTree() *Tree {
|
||||
return &Tree{
|
||||
schemaMap: make(map[string]*Element),
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,101 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
|
||||
func getTableValues(values interface{}, valueType parquet.Type) (tableValues []interface{}) {
|
||||
return valuesToInterfaces(values, valueType)
|
||||
}
|
||||
|
||||
type table struct {
|
||||
RepetitionType parquet.FieldRepetitionType
|
||||
Type parquet.Type
|
||||
MaxDefinitionLevel int32
|
||||
MaxRepetitionLevel int32
|
||||
Path []string // Path of this column
|
||||
Values []interface{} // Parquet values
|
||||
DefinitionLevels []int32 // Definition Levels slice
|
||||
RepetitionLevels []int32 // Repetition Levels slice
|
||||
ConvertedType parquet.ConvertedType
|
||||
Encoding parquet.Encoding
|
||||
BitWidth int32
|
||||
}
|
||||
|
||||
func newTableFromTable(srcTable *table) *table {
|
||||
if srcTable == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &table{
|
||||
Type: srcTable.Type,
|
||||
Path: append([]string{}, srcTable.Path...),
|
||||
}
|
||||
}
|
||||
|
||||
func (table *table) Merge(tables ...*table) {
|
||||
for i := 0; i < len(tables); i++ {
|
||||
if tables[i] == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
table.Values = append(table.Values, tables[i].Values...)
|
||||
table.RepetitionLevels = append(table.RepetitionLevels, tables[i].RepetitionLevels...)
|
||||
table.DefinitionLevels = append(table.DefinitionLevels, tables[i].DefinitionLevels...)
|
||||
|
||||
if table.MaxDefinitionLevel < tables[i].MaxDefinitionLevel {
|
||||
table.MaxDefinitionLevel = tables[i].MaxDefinitionLevel
|
||||
}
|
||||
|
||||
if table.MaxRepetitionLevel < tables[i].MaxRepetitionLevel {
|
||||
table.MaxRepetitionLevel = tables[i].MaxRepetitionLevel
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (table *table) Pop(numRows int64) *table {
|
||||
result := newTableFromTable(table)
|
||||
var i, num int64
|
||||
for i = int64(0); i < int64(len(table.Values)); i++ {
|
||||
if table.RepetitionLevels[i] == 0 {
|
||||
if num >= numRows {
|
||||
break
|
||||
}
|
||||
|
||||
num++
|
||||
}
|
||||
|
||||
if result.MaxRepetitionLevel < table.RepetitionLevels[i] {
|
||||
result.MaxRepetitionLevel = table.RepetitionLevels[i]
|
||||
}
|
||||
|
||||
if result.MaxDefinitionLevel < table.DefinitionLevels[i] {
|
||||
result.MaxDefinitionLevel = table.DefinitionLevels[i]
|
||||
}
|
||||
}
|
||||
|
||||
result.RepetitionLevels = table.RepetitionLevels[:i]
|
||||
result.DefinitionLevels = table.DefinitionLevels[:i]
|
||||
result.Values = table.Values[:i]
|
||||
|
||||
table.RepetitionLevels = table.RepetitionLevels[i:]
|
||||
table.DefinitionLevels = table.DefinitionLevels[i:]
|
||||
table.Values = table.Values[i:]
|
||||
|
||||
return result
|
||||
}
|
Binary file not shown.
@ -1,147 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio-go/v7/pkg/set"
|
||||
parquet "github.com/minio/minio/pkg/s3select/internal/parquet-go"
|
||||
)
|
||||
|
||||
// getReader - opens the file called name and positions it at offset.
//
// A negative offset is counted back from the end of the file. length is
// accepted for the caller's callback signature but is not used; the returned
// reader runs to the end of the file. The caller must close the reader. The
// file is closed here when positioning fails.
func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
	file, err := os.Open(name)
	if err != nil {
		return nil, err
	}

	fi, err := file.Stat()
	if err != nil {
		file.Close() // do not leak the descriptor on failure
		return nil, err
	}

	if offset < 0 {
		offset = fi.Size() + offset
	}

	if _, err = file.Seek(offset, io.SeekStart); err != nil {
		file.Close() // do not leak the descriptor on failure
		return nil, err
	}

	return file, nil
}
|
||||
|
||||
// printUsage - writes the command line synopsis and two examples to
// standard output, using the base name of os.Args[0] as the program name.
func printUsage() {
	prog := path.Base(os.Args[0])

	fmt.Printf("usage: %v PARQUET-FILE [COLUMN...]\n", prog)
	fmt.Println()

	fmt.Print("examples:\n")
	fmt.Print("# Convert all columns to CSV\n")
	fmt.Printf("$ %v example.parquet\n", prog)
	fmt.Println()

	fmt.Print("# Convert specific columns to CSV\n")
	fmt.Printf("$ %v example.par firstname dob\n", prog)
	fmt.Println()
}
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
printUsage()
|
||||
os.Exit(-1)
|
||||
}
|
||||
|
||||
name := os.Args[1]
|
||||
ext := path.Ext(name)
|
||||
csvFilename := name + ".csv"
|
||||
if ext == ".parquet" || ext == ".par" {
|
||||
csvFilename = strings.TrimSuffix(name, ext) + ".csv"
|
||||
}
|
||||
|
||||
columns := set.CreateStringSet(os.Args[2:]...)
|
||||
if len(columns) == 0 {
|
||||
columns = nil
|
||||
}
|
||||
|
||||
file, err := parquet.NewReader(
|
||||
func(offset, length int64) (io.ReadCloser, error) {
|
||||
return getReader(name, offset, length)
|
||||
},
|
||||
columns,
|
||||
)
|
||||
if err != nil {
|
||||
fmt.Printf("%v: %v\n", name, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
defer file.Close()
|
||||
|
||||
csvFile, err := os.OpenFile(csvFilename, os.O_RDWR|os.O_CREATE, 0755)
|
||||
if err != nil {
|
||||
fmt.Printf("%v: %v\n", csvFilename, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
defer csvFile.Close()
|
||||
|
||||
csvWriter := csv.NewWriter(csvFile)
|
||||
defer csvWriter.Flush()
|
||||
|
||||
headerWritten := false
|
||||
for {
|
||||
record, err := file.Read()
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
fmt.Printf("%v: %v\n", name, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
if !headerWritten {
|
||||
var csvRecord []string
|
||||
record.Range(func(name string, value parquet.Value) bool {
|
||||
csvRecord = append(csvRecord, name)
|
||||
return true
|
||||
})
|
||||
|
||||
if err = csvWriter.Write(csvRecord); err != nil {
|
||||
fmt.Printf("%v: %v\n", csvFilename, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
headerWritten = true
|
||||
}
|
||||
|
||||
var csvRecord []string
|
||||
record.Range(func(name string, value parquet.Value) bool {
|
||||
csvRecord = append(csvRecord, fmt.Sprintf("%v", value.Value))
|
||||
return true
|
||||
})
|
||||
|
||||
if err = csvWriter.Write(csvRecord); err != nil {
|
||||
fmt.Printf("%v: %v\n", csvFilename, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
}
|
@ -1,129 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/minio/minio-go/v7/pkg/set"
|
||||
parquet "github.com/minio/minio/pkg/s3select/internal/parquet-go"
|
||||
)
|
||||
|
||||
// getReader - opens the file called name and positions it at offset.
//
// A negative offset is counted back from the end of the file. length is
// accepted for the caller's callback signature but is not used; the returned
// reader runs to the end of the file. The caller must close the reader. The
// file is closed here when positioning fails.
func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
	file, err := os.Open(name)
	if err != nil {
		return nil, err
	}

	fi, err := file.Stat()
	if err != nil {
		file.Close() // do not leak the descriptor on failure
		return nil, err
	}

	if offset < 0 {
		offset = fi.Size() + offset
	}

	if _, err = file.Seek(offset, io.SeekStart); err != nil {
		file.Close() // do not leak the descriptor on failure
		return nil, err
	}

	return file, nil
}
|
||||
|
||||
// printUsage - writes the command line synopsis and two examples to
// standard output, using the base name of os.Args[0] as the program name.
func printUsage() {
	prog := path.Base(os.Args[0])

	fmt.Printf("Usage: %v PARQUET-FILE [COLUMN...]\n", prog)
	fmt.Println()

	fmt.Print("Examples:\n")
	fmt.Print("# Convert all columns to JSON\n")
	fmt.Printf("$ %v example.parquet\n", prog)
	fmt.Println()

	fmt.Print("# Convert specific columns to JSON\n")
	fmt.Printf("$ %v example.par firstname dob\n", prog)
	fmt.Println()
}
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
printUsage()
|
||||
os.Exit(-1)
|
||||
}
|
||||
|
||||
name := os.Args[1]
|
||||
ext := path.Ext(name)
|
||||
jsonFilename := name + ".json"
|
||||
if ext == ".parquet" || ext == ".par" {
|
||||
jsonFilename = strings.TrimSuffix(name, ext) + ".json"
|
||||
}
|
||||
|
||||
columns := set.CreateStringSet(os.Args[2:]...)
|
||||
if len(columns) == 0 {
|
||||
columns = nil
|
||||
}
|
||||
|
||||
file, err := parquet.NewReader(
|
||||
func(offset, length int64) (io.ReadCloser, error) {
|
||||
return getReader(name, offset, length)
|
||||
},
|
||||
columns,
|
||||
)
|
||||
if err != nil {
|
||||
fmt.Printf("%v: %v\n", name, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
defer file.Close()
|
||||
|
||||
jsonFile, err := os.OpenFile(jsonFilename, os.O_RDWR|os.O_CREATE, 0755)
|
||||
if err != nil {
|
||||
fmt.Printf("%v: %v\n", jsonFilename, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
defer jsonFile.Close()
|
||||
|
||||
for {
|
||||
record, err := file.Read()
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
fmt.Printf("%v: %v\n", name, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
data, err := json.Marshal(record)
|
||||
if err != nil {
|
||||
fmt.Printf("%v: %v\n", name, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
data = append(data, byte('\n'))
|
||||
|
||||
if _, err = jsonFile.Write(data); err != nil {
|
||||
fmt.Printf("%v: %v\n", jsonFilename, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
}
|
@ -1,192 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/data"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
// Default sizes that NewWriter assigns to Writer.PageSize and
// Writer.RowGroupSize.
const (
	defaultPageSize     = 8 << 10   // 8 KiB
	defaultRowGroupSize = 128 << 20 // 128 MiB
)
|
||||
|
||||
// Writer - represents parquet writer.
|
||||
type Writer struct {
|
||||
PageSize int64
|
||||
RowGroupSize int64
|
||||
CompressionType parquet.CompressionCodec
|
||||
|
||||
writeCloser io.WriteCloser
|
||||
numRows int64
|
||||
offset int64
|
||||
footer *parquet.FileMetaData
|
||||
schemaTree *schema.Tree
|
||||
valueElements []*schema.Element
|
||||
columnDataMap map[string]*data.Column
|
||||
rowGroupCount int
|
||||
}
|
||||
|
||||
func (writer *Writer) writeData() (err error) {
|
||||
if writer.numRows == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var chunks []*data.ColumnChunk
|
||||
for _, element := range writer.valueElements {
|
||||
name := element.PathInTree
|
||||
columnData, found := writer.columnDataMap[name]
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
|
||||
columnChunk := columnData.Encode(element)
|
||||
chunks = append(chunks, columnChunk)
|
||||
}
|
||||
|
||||
rowGroup := data.NewRowGroup(chunks, writer.numRows, writer.offset)
|
||||
|
||||
for _, chunk := range chunks {
|
||||
if _, err = writer.writeCloser.Write(chunk.Data()); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
writer.offset += chunk.DataLen()
|
||||
}
|
||||
|
||||
writer.footer.RowGroups = append(writer.footer.RowGroups, rowGroup)
|
||||
writer.footer.NumRows += writer.numRows
|
||||
|
||||
writer.numRows = 0
|
||||
writer.columnDataMap = nil
|
||||
return nil
|
||||
}
|
||||
|
||||
// WriteJSON - writes a record represented in JSON.
|
||||
func (writer *Writer) WriteJSON(recordData []byte) (err error) {
|
||||
columnDataMap, err := data.UnmarshalJSON(recordData, writer.schemaTree)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return writer.Write(columnDataMap)
|
||||
}
|
||||
|
||||
// Write - writes a record represented in map.
|
||||
func (writer *Writer) Write(record map[string]*data.Column) (err error) {
|
||||
if writer.columnDataMap == nil {
|
||||
writer.columnDataMap = record
|
||||
} else {
|
||||
for name, columnData := range record {
|
||||
var found bool
|
||||
var element *schema.Element
|
||||
for _, element = range writer.valueElements {
|
||||
if element.PathInTree == name {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
return fmt.Errorf("%v is not value column", name)
|
||||
}
|
||||
|
||||
writer.columnDataMap[name].Merge(columnData)
|
||||
}
|
||||
}
|
||||
|
||||
writer.numRows++
|
||||
if writer.numRows == int64(writer.rowGroupCount) {
|
||||
return writer.writeData()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (writer *Writer) finalize() (err error) {
|
||||
if err = writer.writeData(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
footerBuf, err := ts.Write(context.TODO(), writer.footer)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err = writer.writeCloser.Write(footerBuf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
footerSizeBuf := make([]byte, 4)
|
||||
binary.LittleEndian.PutUint32(footerSizeBuf, uint32(len(footerBuf)))
|
||||
|
||||
if _, err = writer.writeCloser.Write(footerSizeBuf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = writer.writeCloser.Write([]byte("PAR1"))
|
||||
return err
|
||||
}
|
||||
|
||||
// Close - finalizes and closes writer. If any pending records are available, they are written here.
|
||||
func (writer *Writer) Close() (err error) {
|
||||
if err = writer.finalize(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return writer.writeCloser.Close()
|
||||
}
|
||||
|
||||
// NewWriter - creates new parquet writer. Binary data of rowGroupCount records are written to writeCloser.
|
||||
func NewWriter(writeCloser io.WriteCloser, schemaTree *schema.Tree, rowGroupCount int) (*Writer, error) {
|
||||
if _, err := writeCloser.Write([]byte("PAR1")); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
schemaList, valueElements, err := schemaTree.ToParquetSchema()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
footer := parquet.NewFileMetaData()
|
||||
footer.Version = 1
|
||||
footer.Schema = schemaList
|
||||
|
||||
return &Writer{
|
||||
PageSize: defaultPageSize,
|
||||
RowGroupSize: defaultRowGroupSize,
|
||||
CompressionType: parquet.CompressionCodec_SNAPPY,
|
||||
|
||||
writeCloser: writeCloser,
|
||||
offset: 4,
|
||||
footer: footer,
|
||||
schemaTree: schemaTree,
|
||||
valueElements: valueElements,
|
||||
rowGroupCount: rowGroupCount,
|
||||
}, nil
|
||||
}
|
@ -1,153 +0,0 @@
|
||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
||||
//
|
||||
// This file is part of MinIO Object Storage stack
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/data"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
func TestWriterWrite(t *testing.T) {
|
||||
schemaTree := schema.NewTree()
|
||||
{
|
||||
one, err := schema.NewElement("one", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
two, err := schema.NewElement("two", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
three, err := schema.NewElement("three", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BOOLEAN), nil, nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := schemaTree.Set("one", one); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := schemaTree.Set("two", two); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := schemaTree.Set("three", three); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
file, err := os.Create("test.parquet")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
writer, err := NewWriter(file, schemaTree, 100)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
oneColumn := data.NewColumn(parquet.Type_INT32)
|
||||
oneColumn.AddInt32(100, 0, 0)
|
||||
|
||||
twoColumn := data.NewColumn(parquet.Type_BYTE_ARRAY)
|
||||
twoColumn.AddByteArray([]byte("foo"), 0, 0)
|
||||
|
||||
threeColumn := data.NewColumn(parquet.Type_BOOLEAN)
|
||||
threeColumn.AddBoolean(true, 0, 0)
|
||||
|
||||
record := map[string]*data.Column{
|
||||
"one": oneColumn,
|
||||
"two": twoColumn,
|
||||
"three": threeColumn,
|
||||
}
|
||||
|
||||
err = writer.Write(record)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
err = writer.Close()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriterWriteJSON(t *testing.T) {
|
||||
schemaTree := schema.NewTree()
|
||||
{
|
||||
one, err := schema.NewElement("one", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_INT32), parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
two, err := schema.NewElement("two", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BYTE_ARRAY), parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
|
||||
nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
three, err := schema.NewElement("three", parquet.FieldRepetitionType_REQUIRED,
|
||||
parquet.TypePtr(parquet.Type_BOOLEAN), nil, nil, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := schemaTree.Set("one", one); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := schemaTree.Set("two", two); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := schemaTree.Set("three", three); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
file, err := os.Create("test.parquet")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
writer, err := NewWriter(file, schemaTree, 100)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
record := `{"one": 100, "two": "foo", "three": true}`
|
||||
err = writer.WriteJSON([]byte(record))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
err = writer.Close()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
@ -23,10 +23,10 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/bcicen/jstream"
|
||||
parquetgo "github.com/minio/minio/pkg/s3select/internal/parquet-go"
|
||||
parquetgen "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
jsonfmt "github.com/minio/minio/pkg/s3select/json"
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
parquetgo "github.com/minio/parquet-go"
|
||||
parquetgen "github.com/minio/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// Reader - Parquet record reader for S3Select.
|
||||
|
Loading…
x
Reference in New Issue
Block a user