mirror of
https://github.com/minio/minio.git
synced 2025-11-11 06:20:14 -05:00
Add archived parquet as int. package (#9912)
Since github.com/minio/parquet-go is archived add it as internal package.
This commit is contained in:
38
pkg/s3select/internal/parquet-go/encoding/common.go
Normal file
38
pkg/s3select/internal/parquet-go/encoding/common.go
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
)
|
||||
|
||||
// Refer https://en.wikipedia.org/wiki/LEB128#Unsigned_LEB128
|
||||
func varIntEncode(ui64 uint64) []byte {
|
||||
if ui64 == 0 {
|
||||
return []byte{0}
|
||||
}
|
||||
|
||||
length := int(common.BitWidth(ui64)+6) / 7
|
||||
data := make([]byte, length)
|
||||
for i := 0; i < length; i++ {
|
||||
data[i] = byte(ui64&0x7F) | 0x80
|
||||
ui64 >>= 7
|
||||
}
|
||||
data[length-1] &= 0x7F
|
||||
|
||||
return data
|
||||
}
|
||||
43
pkg/s3select/internal/parquet-go/encoding/common_test.go
Normal file
43
pkg/s3select/internal/parquet-go/encoding/common_test.go
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestVarIntToBytes(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ui64 uint64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{0, []byte{0}},
|
||||
{1, []byte{1}},
|
||||
{0x7F, []byte{127}},
|
||||
{0x80, []byte{128, 1}},
|
||||
{uint64(math.MaxUint64), []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 1}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := varIntEncode(testCase.ui64)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
296
pkg/s3select/internal/parquet-go/encoding/delta-encode.go
Normal file
296
pkg/s3select/internal/parquet-go/encoding/delta-encode.go
Normal file
@@ -0,0 +1,296 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
const (
|
||||
blockSize = 128
|
||||
miniBlockSize = 32
|
||||
miniBlockCount = blockSize / miniBlockSize
|
||||
)
|
||||
|
||||
var deltaEncodeHeaderBytes []byte
|
||||
|
||||
func init() {
|
||||
deltaEncodeHeaderBytes = varIntEncode(blockSize)
|
||||
deltaEncodeHeaderBytes = append(deltaEncodeHeaderBytes, varIntEncode(miniBlockCount)...)
|
||||
}
|
||||
|
||||
// Supported Types: BOOLEAN, INT32, INT64
|
||||
func bitPackedEncode(values interface{}, bitWidth uint64, withHeader bool, parquetType parquet.Type) []byte {
|
||||
var i64s []int64
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs, ok := values.([]bool)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of bool"))
|
||||
}
|
||||
|
||||
i64s = make([]int64, len(bs))
|
||||
for i := range bs {
|
||||
if bs[i] {
|
||||
i64s[i] = 1
|
||||
}
|
||||
}
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
|
||||
for i := range i32s {
|
||||
i64s[i] = int64(i32s[i])
|
||||
}
|
||||
case parquet.Type_INT64:
|
||||
var ok bool
|
||||
i64s, ok = values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
if len(i64s) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var valueByte byte
|
||||
bitsSet := uint64(0)
|
||||
bitsNeeded := uint64(8)
|
||||
bitsToSet := bitWidth
|
||||
value := i64s[0]
|
||||
|
||||
valueBytes := []byte{}
|
||||
for i := 0; i < len(i64s); {
|
||||
if bitsToSet >= bitsNeeded {
|
||||
valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded))
|
||||
valueBytes = append(valueBytes, valueByte)
|
||||
bitsToSet -= bitsNeeded
|
||||
bitsSet += bitsNeeded
|
||||
|
||||
bitsNeeded = 8
|
||||
valueByte = 0
|
||||
|
||||
if bitsToSet <= 0 && (i+1) < len(i64s) {
|
||||
i++
|
||||
value = i64s[i]
|
||||
bitsToSet = bitWidth
|
||||
bitsSet = 0
|
||||
}
|
||||
} else {
|
||||
valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded))
|
||||
i++
|
||||
|
||||
if i < len(i64s) {
|
||||
value = i64s[i]
|
||||
}
|
||||
|
||||
bitsNeeded -= bitsToSet
|
||||
bitsToSet = bitWidth
|
||||
bitsSet = 0
|
||||
}
|
||||
}
|
||||
|
||||
if withHeader {
|
||||
header := uint64(((len(i64s) / 8) << 1) | 1)
|
||||
headerBytes := varIntEncode(header)
|
||||
return append(headerBytes, valueBytes...)
|
||||
}
|
||||
|
||||
return valueBytes
|
||||
}
|
||||
|
||||
func deltaEncodeInt32s(i32s []int32) (data []byte) {
|
||||
getValue := func(i32 int32) uint64 {
|
||||
return uint64((i32 >> 31) ^ (i32 << 1))
|
||||
}
|
||||
|
||||
data = append(data, deltaEncodeHeaderBytes...)
|
||||
data = append(data, varIntEncode(uint64(len(i32s)))...)
|
||||
data = append(data, varIntEncode(getValue(i32s[0]))...)
|
||||
|
||||
for i := 1; i < len(i32s); {
|
||||
block := []int32{}
|
||||
minDelta := int32(0x7FFFFFFF)
|
||||
|
||||
for ; i < len(i32s) && len(block) < blockSize; i++ {
|
||||
delta := i32s[i] - i32s[i-1]
|
||||
block = append(block, delta)
|
||||
if delta < minDelta {
|
||||
minDelta = delta
|
||||
}
|
||||
}
|
||||
|
||||
for len(block) < blockSize {
|
||||
block = append(block, minDelta)
|
||||
}
|
||||
|
||||
bitWidths := make([]byte, miniBlockCount)
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
maxValue := int32(0)
|
||||
for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
|
||||
block[k] -= minDelta
|
||||
if block[k] > maxValue {
|
||||
maxValue = block[k]
|
||||
}
|
||||
}
|
||||
|
||||
bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
|
||||
}
|
||||
|
||||
minDeltaZigZag := getValue(minDelta)
|
||||
data = append(data, varIntEncode(minDeltaZigZag)...)
|
||||
data = append(data, bitWidths...)
|
||||
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
bitPacked := bitPackedEncode(
|
||||
block[j*miniBlockSize:(j+1)*miniBlockSize],
|
||||
uint64(bitWidths[j]),
|
||||
false,
|
||||
parquet.Type_INT32,
|
||||
)
|
||||
data = append(data, bitPacked...)
|
||||
}
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
func deltaEncodeInt64s(i64s []int64) (data []byte) {
|
||||
getValue := func(i64 int64) uint64 {
|
||||
return uint64((i64 >> 63) ^ (i64 << 1))
|
||||
}
|
||||
|
||||
data = append(data, deltaEncodeHeaderBytes...)
|
||||
data = append(data, varIntEncode(uint64(len(i64s)))...)
|
||||
data = append(data, varIntEncode(getValue(i64s[0]))...)
|
||||
|
||||
for i := 1; i < len(i64s); {
|
||||
block := []int64{}
|
||||
minDelta := int64(0x7FFFFFFFFFFFFFFF)
|
||||
|
||||
for ; i < len(i64s) && len(block) < blockSize; i++ {
|
||||
delta := i64s[i] - i64s[i-1]
|
||||
block = append(block, delta)
|
||||
if delta < minDelta {
|
||||
minDelta = delta
|
||||
}
|
||||
}
|
||||
|
||||
for len(block) < blockSize {
|
||||
block = append(block, minDelta)
|
||||
}
|
||||
|
||||
bitWidths := make([]byte, miniBlockCount)
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
maxValue := int64(0)
|
||||
for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
|
||||
block[k] -= minDelta
|
||||
if block[k] > maxValue {
|
||||
maxValue = block[k]
|
||||
}
|
||||
}
|
||||
|
||||
bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
|
||||
}
|
||||
|
||||
minDeltaZigZag := getValue(minDelta)
|
||||
data = append(data, varIntEncode(minDeltaZigZag)...)
|
||||
data = append(data, bitWidths...)
|
||||
|
||||
for j := 0; j < miniBlockCount; j++ {
|
||||
bitPacked := bitPackedEncode(
|
||||
block[j*miniBlockSize:(j+1)*miniBlockSize],
|
||||
uint64(bitWidths[j]),
|
||||
false,
|
||||
parquet.Type_INT64,
|
||||
)
|
||||
data = append(data, bitPacked...)
|
||||
}
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// DeltaEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5
|
||||
//
|
||||
// Supported Types: INT32, INT64.
|
||||
func DeltaEncode(values interface{}, parquetType parquet.Type) []byte {
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
return deltaEncodeInt32s(i32s)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
return deltaEncodeInt64s(i64s)
|
||||
}
|
||||
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
// DeltaLengthByteArrayEncode encodes bytes slices specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
|
||||
//
|
||||
// Supported Types: BYTE_ARRAY
|
||||
func DeltaLengthByteArrayEncode(bytesSlices [][]byte) (data []byte) {
|
||||
lengths := make([]int32, len(bytesSlices))
|
||||
for i, bytes := range bytesSlices {
|
||||
lengths[i] = int32(len(bytes))
|
||||
}
|
||||
|
||||
data = deltaEncodeInt32s(lengths)
|
||||
for _, bytes := range bytesSlices {
|
||||
data = append(data, []byte(bytes)...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// DeltaByteArrayEncode encodes sequence of strings values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7
|
||||
//
|
||||
// Supported Types: BYTE_ARRAY
|
||||
func DeltaByteArrayEncode(bytesSlices [][]byte) (data []byte) {
|
||||
prefixLengths := make([]int32, len(bytesSlices))
|
||||
suffixes := make([][]byte, len(bytesSlices))
|
||||
|
||||
var i, j int
|
||||
for i = 1; i < len(bytesSlices); i++ {
|
||||
for j = 0; j < len(bytesSlices[i-1]) && j < len(bytesSlices[i]); j++ {
|
||||
if bytesSlices[i-1][j] != bytesSlices[i][j] {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
prefixLengths[i] = int32(j)
|
||||
suffixes[i] = bytesSlices[i][j:]
|
||||
}
|
||||
|
||||
data = deltaEncodeInt32s(prefixLengths)
|
||||
return append(data, DeltaLengthByteArrayEncode(suffixes)...)
|
||||
}
|
||||
140
pkg/s3select/internal/parquet-go/encoding/plain-encode.go
Normal file
140
pkg/s3select/internal/parquet-go/encoding/plain-encode.go
Normal file
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// plainEncodeBools packs bs into bytes, one bit per value, with the first
// value stored in the least-significant bit of the first byte. Unused high
// bits of the last byte are zero.
func plainEncodeBools(bs []bool) []byte {
	packed := make([]byte, (len(bs)+7)/8)

	for idx, set := range bs {
		if !set {
			continue
		}
		packed[idx>>3] |= 1 << uint(idx&7)
	}

	return packed
}
|
||||
|
||||
// plainEncodeInt32s serializes i32s as consecutive 4-byte little-endian
// values.
func plainEncodeInt32s(i32s []int32) []byte {
	const width = 4
	out := make([]byte, width*len(i32s))

	for pos := 0; pos < len(i32s); pos++ {
		offset := pos * width
		binary.LittleEndian.PutUint32(out[offset:offset+width], uint32(i32s[pos]))
	}

	return out
}
|
||||
|
||||
// plainEncodeInt64s serializes i64s as consecutive 8-byte little-endian
// values.
func plainEncodeInt64s(i64s []int64) []byte {
	const width = 8
	out := make([]byte, width*len(i64s))

	for pos := 0; pos < len(i64s); pos++ {
		offset := pos * width
		binary.LittleEndian.PutUint64(out[offset:offset+width], uint64(i64s[pos]))
	}

	return out
}
|
||||
|
||||
// plainEncodeFloat32s serializes f32s as consecutive 4-byte little-endian
// IEEE 754 bit patterns.
func plainEncodeFloat32s(f32s []float32) []byte {
	const width = 4
	out := make([]byte, width*len(f32s))

	for pos := range f32s {
		bits := math.Float32bits(f32s[pos])
		binary.LittleEndian.PutUint32(out[pos*width:], bits)
	}

	return out
}
|
||||
|
||||
// plainEncodeFloat64s serializes f64s as consecutive 8-byte little-endian
// IEEE 754 bit patterns.
func plainEncodeFloat64s(f64s []float64) []byte {
	const width = 8
	out := make([]byte, width*len(f64s))

	for pos := range f64s {
		bits := math.Float64bits(f64s[pos])
		binary.LittleEndian.PutUint64(out[pos*width:], bits)
	}

	return out
}
|
||||
|
||||
// plainEncodeBytesSlices serializes each slice as a 4-byte little-endian
// length followed by the slice contents, concatenated in input order. It
// panics if writing to the in-memory buffer reports an error.
func plainEncodeBytesSlices(bytesSlices [][]byte) []byte {
	var out bytes.Buffer

	for idx := range bytesSlices {
		chunk := bytesSlices[idx]

		if err := binary.Write(&out, binary.LittleEndian, uint32(len(chunk))); err != nil {
			panic(err)
		}

		if _, err := out.Write(chunk); err != nil {
			panic(err)
		}
	}

	return out.Bytes()
}
|
||||
|
||||
// PlainEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0
|
||||
//
|
||||
// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
|
||||
func PlainEncode(values interface{}, parquetType parquet.Type) []byte {
|
||||
switch parquetType {
|
||||
case parquet.Type_BOOLEAN:
|
||||
bs, ok := values.([]bool)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of bool"))
|
||||
}
|
||||
return plainEncodeBools(bs)
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
return plainEncodeInt32s(i32s)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
return plainEncodeInt64s(i64s)
|
||||
case parquet.Type_FLOAT:
|
||||
f32s, ok := values.([]float32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of float32"))
|
||||
}
|
||||
return plainEncodeFloat32s(f32s)
|
||||
case parquet.Type_DOUBLE:
|
||||
f64s, ok := values.([]float64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of float64"))
|
||||
}
|
||||
return plainEncodeFloat64s(f64s)
|
||||
case parquet.Type_BYTE_ARRAY:
|
||||
bytesSlices, ok := values.([][]byte)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of byte array"))
|
||||
}
|
||||
return plainEncodeBytesSlices(bytesSlices)
|
||||
}
|
||||
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
147
pkg/s3select/internal/parquet-go/encoding/plain-encode_test.go
Normal file
147
pkg/s3select/internal/parquet-go/encoding/plain-encode_test.go
Normal file
@@ -0,0 +1,147 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlainEncodeBools(t *testing.T) {
|
||||
testCases := []struct {
|
||||
bs []bool
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]bool{}, []byte{}},
|
||||
{[]bool{true}, []byte{1}},
|
||||
{[]bool{false}, []byte{0}},
|
||||
{[]bool{true, true}, []byte{3}},
|
||||
{[]bool{false, false}, []byte{0}},
|
||||
{[]bool{false, true}, []byte{2}},
|
||||
{[]bool{true, false}, []byte{1}},
|
||||
{[]bool{false, false, false, false, false, false, false, true, true}, []byte{128, 1}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeBools(testCase.bs)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeInt32s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
i32s []int32
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]int32{}, []byte{}},
|
||||
{[]int32{1}, []byte{1, 0, 0, 0}},
|
||||
{[]int32{-1}, []byte{255, 255, 255, 255}},
|
||||
{[]int32{256}, []byte{0, 1, 0, 0}},
|
||||
{[]int32{math.MinInt32}, []byte{0, 0, 0, 128}},
|
||||
{[]int32{math.MaxInt32}, []byte{255, 255, 255, 127}},
|
||||
{[]int32{257, -2}, []byte{1, 1, 0, 0, 254, 255, 255, 255}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeInt32s(testCase.i32s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeInt64s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
i64s []int64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]int64{}, []byte{}},
|
||||
{[]int64{1}, []byte{1, 0, 0, 0, 0, 0, 0, 0}},
|
||||
{[]int64{-1}, []byte{255, 255, 255, 255, 255, 255, 255, 255}},
|
||||
{[]int64{256}, []byte{0, 1, 0, 0, 0, 0, 0, 0}},
|
||||
{[]int64{math.MinInt64}, []byte{0, 0, 0, 0, 0, 0, 0, 128}},
|
||||
{[]int64{math.MaxInt64}, []byte{255, 255, 255, 255, 255, 255, 255, 127}},
|
||||
{[]int64{257, -2}, []byte{1, 1, 0, 0, 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeInt64s(testCase.i64s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeFloat32s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
f32s []float32
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]float32{}, []byte{}},
|
||||
{[]float32{1}, []byte{0, 0, 128, 63}},
|
||||
{[]float32{1.0}, []byte{0, 0, 128, 63}},
|
||||
{[]float32{-1}, []byte{0, 0, 128, 191}},
|
||||
{[]float32{-1.0}, []byte{0, 0, 128, 191}},
|
||||
{[]float32{256}, []byte{0, 0, 128, 67}},
|
||||
{[]float32{1.1}, []byte{205, 204, 140, 63}},
|
||||
{[]float32{-1.1}, []byte{205, 204, 140, 191}},
|
||||
{[]float32{math.Pi}, []byte{219, 15, 73, 64}},
|
||||
{[]float32{257, -2}, []byte{0, 128, 128, 67, 0, 0, 0, 192}},
|
||||
{[]float32{257.1, -2.1}, []byte{205, 140, 128, 67, 102, 102, 6, 192}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeFloat32s(testCase.f32s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainEncodeFloat64s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
f64s []float64
|
||||
expectedResult []byte
|
||||
}{
|
||||
{nil, []byte{}},
|
||||
{[]float64{}, []byte{}},
|
||||
{[]float64{1}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
|
||||
{[]float64{1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 63}},
|
||||
{[]float64{-1}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
|
||||
{[]float64{-1.0}, []byte{0, 0, 0, 0, 0, 0, 240, 191}},
|
||||
{[]float64{256}, []byte{0, 0, 0, 0, 0, 0, 112, 64}},
|
||||
{[]float64{1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 63}},
|
||||
{[]float64{-1.1}, []byte{154, 153, 153, 153, 153, 153, 241, 191}},
|
||||
{[]float64{math.Pi}, []byte{24, 45, 68, 84, 251, 33, 9, 64}},
|
||||
{[]float64{257, -2}, []byte{0, 0, 0, 0, 0, 16, 112, 64, 0, 0, 0, 0, 0, 0, 0, 192}},
|
||||
{[]float64{257.1, -2.1}, []byte{154, 153, 153, 153, 153, 17, 112, 64, 205, 204, 204, 204, 204, 204, 0, 192}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := plainEncodeFloat64s(testCase.f64s)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
84
pkg/s3select/internal/parquet-go/encoding/rle-encode.go
Normal file
84
pkg/s3select/internal/parquet-go/encoding/rle-encode.go
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func rleEncodeInt32s(i32s []int32, bitWidth int32) (data []byte) {
|
||||
j := 0
|
||||
for i := 0; i < len(i32s); i = j {
|
||||
for j = i + 1; j < len(i32s) && i32s[i] == i32s[j]; j++ {
|
||||
}
|
||||
|
||||
headerBytes := varIntEncode(uint64((j - i) << 1))
|
||||
data = append(data, headerBytes...)
|
||||
|
||||
valBytes := plainEncodeInt32s([]int32{i32s[i]})
|
||||
byteCount := (bitWidth + 7) / 8
|
||||
data = append(data, valBytes[:byteCount]...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
func rleEncodeInt64s(i64s []int64, bitWidth int32) (data []byte) {
|
||||
j := 0
|
||||
for i := 0; i < len(i64s); i = j {
|
||||
for j = i + 1; j < len(i64s) && i64s[i] == i64s[j]; j++ {
|
||||
}
|
||||
|
||||
headerBytes := varIntEncode(uint64((j - i) << 1))
|
||||
data = append(data, headerBytes...)
|
||||
|
||||
valBytes := plainEncodeInt64s([]int64{i64s[i]})
|
||||
byteCount := (bitWidth + 7) / 8
|
||||
data = append(data, valBytes[:byteCount]...)
|
||||
}
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// RLEBitPackedHybridEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3
|
||||
//
|
||||
// Supported Types: INT32, INT64
|
||||
func RLEBitPackedHybridEncode(values interface{}, bitWidth int32, parquetType parquet.Type) []byte {
|
||||
var rleBytes []byte
|
||||
|
||||
switch parquetType {
|
||||
case parquet.Type_INT32:
|
||||
i32s, ok := values.([]int32)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int32"))
|
||||
}
|
||||
rleBytes = rleEncodeInt32s(i32s, bitWidth)
|
||||
case parquet.Type_INT64:
|
||||
i64s, ok := values.([]int64)
|
||||
if !ok {
|
||||
panic(fmt.Errorf("expected slice of int64"))
|
||||
}
|
||||
rleBytes = rleEncodeInt64s(i64s, bitWidth)
|
||||
default:
|
||||
panic(fmt.Errorf("%v parquet type unsupported", parquetType))
|
||||
}
|
||||
|
||||
lenBytes := plainEncodeInt32s([]int32{int32(len(rleBytes))})
|
||||
return append(lenBytes, rleBytes...)
|
||||
}
|
||||
44
pkg/s3select/internal/parquet-go/encoding/rle-encode_test.go
Normal file
44
pkg/s3select/internal/parquet-go/encoding/rle-encode_test.go
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
func TestRLEEncodeInt32s(t *testing.T) {
|
||||
testCases := []struct {
|
||||
values []int32
|
||||
bitWidth int32
|
||||
dataType parquet.Type
|
||||
expectedResult []byte
|
||||
}{
|
||||
{[]int32{3, 5, 7}, 1, parquet.Type_INT32, []byte{2, 3, 2, 5, 2, 7}},
|
||||
{[]int32{3, 3, 3}, 1, parquet.Type_INT32, []byte{6, 3}},
|
||||
{[]int32{2, 2, 3, 3, 3}, 1, parquet.Type_INT32, []byte{4, 2, 6, 3}},
|
||||
}
|
||||
|
||||
for i, testCase := range testCases {
|
||||
result := rleEncodeInt32s(testCase.values, testCase.bitWidth)
|
||||
if !reflect.DeepEqual(result, testCase.expectedResult) {
|
||||
t.Fatalf("case %v: expected: %v, got: %v", i+1, testCase.expectedResult, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
60
pkg/s3select/internal/parquet-go/encoding/rledict-encode.go
Normal file
60
pkg/s3select/internal/parquet-go/encoding/rledict-encode.go
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package encoding
|
||||
|
||||
import (
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/common"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
)
|
||||
|
||||
// RLEDictEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 and returns dictionary page data and data page data.
|
||||
//
|
||||
// Dictionary page data contains PLAIN encodeed slice of uniquely fully defined non-nil values.
|
||||
// Data page data contains RLE/Bit-Packed Hybrid encoded indices of fully defined non-nil values.
|
||||
//
|
||||
// Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
|
||||
func RLEDictEncode(values []interface{}, parquetType parquet.Type, bitWidth int32) (dictPageData, dataPageData []byte, dictValueCount int32, indexBitWidth uint8) {
|
||||
var definedValues []interface{}
|
||||
var indices []int32
|
||||
|
||||
valueIndexMap := make(map[interface{}]int32)
|
||||
j := 0
|
||||
for i := 0; i < len(values); i = j {
|
||||
for j = i; j < len(values); j++ {
|
||||
value := values[j]
|
||||
if value == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
index, found := valueIndexMap[value]
|
||||
if !found {
|
||||
index = int32(len(definedValues))
|
||||
definedValues = append(definedValues, value)
|
||||
valueIndexMap[value] = index
|
||||
}
|
||||
|
||||
indices = append(indices, index)
|
||||
}
|
||||
}
|
||||
|
||||
indexBitWidth = uint8(common.BitWidth(uint64(indices[len(indices)-1])))
|
||||
|
||||
dictPageData = PlainEncode(common.ToSliceValue(definedValues, parquetType), parquetType)
|
||||
dataPageData = RLEBitPackedHybridEncode(indices, int32(indexBitWidth), parquet.Type_INT32)
|
||||
|
||||
return dictPageData, dataPageData, int32(len(definedValues)), indexBitWidth
|
||||
}
|
||||
Reference in New Issue
Block a user