// Copyright (c) 2015-2021 MinIO, Inc. // // This file is part of MinIO Object Storage stack // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . package parquet import ( "bytes" "encoding/binary" "errors" "fmt" "math" "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" ) func boolsToBytes(bs []bool) []byte { size := (len(bs) + 7) / 8 result := make([]byte, size) for i := range bs { if bs[i] { result[i/8] |= 1 << uint32(i%8) } } return result } func int32sToBytes(i32s []int32) []byte { buf := make([]byte, 4*len(i32s)) for i, i32 := range i32s { binary.LittleEndian.PutUint32(buf[i*4:], uint32(i32)) } return buf } func int64sToBytes(i64s []int64) []byte { buf := make([]byte, 8*len(i64s)) for i, i64 := range i64s { binary.LittleEndian.PutUint64(buf[i*8:], uint64(i64)) } return buf } func float32sToBytes(f32s []float32) []byte { buf := make([]byte, 4*len(f32s)) for i, f32 := range f32s { binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(f32)) } return buf } func float64sToBytes(f64s []float64) []byte { buf := make([]byte, 8*len(f64s)) for i, f64 := range f64s { binary.LittleEndian.PutUint64(buf[i*8:], math.Float64bits(f64)) } return buf } func byteSlicesToBytes(byteSlices [][]byte) []byte { buf := new(bytes.Buffer) for _, s := range byteSlices { if err := binary.Write(buf, binary.LittleEndian, uint32(len(s))); err != nil { panic(err) } if _, err := buf.Write(s); err != nil { panic(err) } } return buf.Bytes() } func byteArraysToBytes(arrayList [][]byte) []byte { buf := new(bytes.Buffer) arrayLen := -1 for _, array := range arrayList { if arrayLen != -1 && len(array) != arrayLen { panic(errors.New("array list does not have same length")) } arrayLen = len(array) if _, err := buf.Write(array); err != nil { panic(err) } } return buf.Bytes() } func int96sToBytes(i96s [][]byte) []byte { return byteArraysToBytes(i96s) } func valuesToBytes(values interface{}, dataType parquet.Type) []byte { switch dataType { case parquet.Type_BOOLEAN: return boolsToBytes(values.([]bool)) case parquet.Type_INT32: return int32sToBytes(values.([]int32)) case parquet.Type_INT64: return int64sToBytes(values.([]int64)) case parquet.Type_INT96: return int96sToBytes(values.([][]byte)) case parquet.Type_FLOAT: return float32sToBytes(values.([]float32)) case parquet.Type_DOUBLE: return float64sToBytes(values.([]float64)) case parquet.Type_BYTE_ARRAY: return byteSlicesToBytes(values.([][]byte)) case parquet.Type_FIXED_LEN_BYTE_ARRAY: return byteArraysToBytes(values.([][]byte)) } return []byte{} } func valueToBytes(value interface{}, dataType parquet.Type) []byte { var values interface{} switch dataType { case parquet.Type_BOOLEAN: values = []bool{value.(bool)} case parquet.Type_INT32: values = []int32{value.(int32)} case parquet.Type_INT64: values = []int64{value.(int64)} case parquet.Type_INT96: values = [][]byte{value.([]byte)} case parquet.Type_FLOAT: values = []float32{value.(float32)} case parquet.Type_DOUBLE: values = []float64{value.(float64)} case parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY: values = [][]byte{value.([]byte)} } return valuesToBytes(values, dataType) } func unsignedVarIntToBytes(ui64 uint64) []byte { size := (getBitWidth(ui64) + 6) / 7 if size == 0 { return []byte{0} } buf := make([]byte, size) for i := uint64(0); i < size; i++ { buf[i] = byte(ui64&0x7F) | 0x80 ui64 >>= 7 } buf[size-1] &= 0x7F return buf } func valuesToRLEBytes(values interface{}, bitWidth int32, valueType parquet.Type) []byte { vals := valuesToInterfaces(values, valueType) result := []byte{} j := 0 for i := 0; i < len(vals); i = j { for j = i + 1; j < len(vals) && vals[i] == vals[j]; j++ { } headerBytes := unsignedVarIntToBytes(uint64((j - i) << 1)) result = append(result, headerBytes...) valBytes := valueToBytes(vals[i], valueType) byteCount := (bitWidth + 7) / 8 result = append(result, valBytes[:byteCount]...) } return result } func valuesToRLEBitPackedHybridBytes(values interface{}, bitWidth int32, dataType parquet.Type) []byte { rleBytes := valuesToRLEBytes(values, bitWidth, dataType) lenBytes := valueToBytes(int32(len(rleBytes)), parquet.Type_INT32) return append(lenBytes, rleBytes...) } func valuesToBitPackedBytes(values interface{}, bitWidth int64, withHeader bool, dataType parquet.Type) []byte { var i64s []int64 switch dataType { case parquet.Type_BOOLEAN: bs := values.([]bool) i64s = make([]int64, len(bs)) for i := range bs { if bs[i] { i64s[i] = 1 } } case parquet.Type_INT32: i32s := values.([]int32) i64s = make([]int64, len(i32s)) for i := range i32s { i64s[i] = int64(i32s[i]) } case parquet.Type_INT64: i64s = values.([]int64) default: panic(fmt.Errorf("data type %v is not supported for bit packing", dataType)) } if len(i64s) == 0 { return nil } var valueByte byte bitsSet := uint64(0) bitsNeeded := uint64(8) bitsToSet := uint64(bitWidth) value := i64s[0] valueBytes := []byte{} for i := 0; i < len(i64s); { if bitsToSet >= bitsNeeded { valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded)) valueBytes = append(valueBytes, valueByte) bitsToSet -= bitsNeeded bitsSet += bitsNeeded bitsNeeded = 8 valueByte = 0 if bitsToSet <= 0 && (i+1) < len(i64s) { i++ value = i64s[i] bitsToSet = uint64(bitWidth) bitsSet = 0 } } else { valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded)) i++ if i < len(i64s) { value = i64s[i] } bitsNeeded -= bitsToSet bitsToSet = uint64(bitWidth) bitsSet = 0 } } if withHeader { header := uint64(((len(i64s) / 8) << 1) | 1) headerBytes := unsignedVarIntToBytes(header) return append(headerBytes, valueBytes...) } return valueBytes } const ( blockSize = 128 subBlockSize = 32 subBlockCount = blockSize / subBlockSize ) var ( blockSizeBytes = unsignedVarIntToBytes(blockSize) subBlockCountBytes = unsignedVarIntToBytes(subBlockCount) ) func int32ToDeltaBytes(i32s []int32) []byte { getValue := func(i32 int32) uint64 { return uint64((i32 >> 31) ^ (i32 << 1)) } result := append([]byte{}, blockSizeBytes...) result = append(result, subBlockCountBytes...) result = append(result, unsignedVarIntToBytes(uint64(len(i32s)))...) result = append(result, unsignedVarIntToBytes(getValue(i32s[0]))...) for i := 1; i < len(i32s); { block := []int32{} minDelta := int32(0x7FFFFFFF) for ; i < len(i32s) && len(block) < blockSize; i++ { delta := i32s[i] - i32s[i-1] block = append(block, delta) if delta < minDelta { minDelta = delta } } for len(block) < blockSize { block = append(block, minDelta) } bitWidths := make([]byte, subBlockCount) for j := 0; j < subBlockCount; j++ { maxValue := int32(0) for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ { block[k] -= minDelta if block[k] > maxValue { maxValue = block[k] } } bitWidths[j] = byte(getBitWidth(uint64(maxValue))) } minDeltaZigZag := getValue(minDelta) result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...) result = append(result, bitWidths...) for j := 0; j < subBlockCount; j++ { bitPacked := valuesToBitPackedBytes( block[j*subBlockSize:(j+1)*subBlockSize], int64(bitWidths[j]), false, parquet.Type_INT32, ) result = append(result, bitPacked...) } } return result } func int64ToDeltaBytes(i64s []int64) []byte { getValue := func(i64 int64) uint64 { return uint64((i64 >> 63) ^ (i64 << 1)) } result := append([]byte{}, blockSizeBytes...) result = append(result, subBlockCountBytes...) result = append(result, unsignedVarIntToBytes(uint64(len(i64s)))...) result = append(result, unsignedVarIntToBytes(getValue(i64s[0]))...) for i := 1; i < len(i64s); { block := []int64{} minDelta := int64(0x7FFFFFFFFFFFFFFF) for ; i < len(i64s) && len(block) < blockSize; i++ { delta := i64s[i] - i64s[i-1] block = append(block, delta) if delta < minDelta { minDelta = delta } } for len(block) < blockSize { block = append(block, minDelta) } bitWidths := make([]byte, subBlockCount) for j := 0; j < subBlockCount; j++ { maxValue := int64(0) for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ { block[k] -= minDelta if block[k] > maxValue { maxValue = block[k] } } bitWidths[j] = byte(getBitWidth(uint64(maxValue))) } minDeltaZigZag := getValue(minDelta) result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...) result = append(result, bitWidths...) for j := 0; j < subBlockCount; j++ { bitPacked := valuesToBitPackedBytes( block[j*subBlockSize:(j+1)*subBlockSize], int64(bitWidths[j]), false, parquet.Type_INT64, ) result = append(result, bitPacked...) } } return result } func valuesToDeltaBytes(values interface{}, dataType parquet.Type) []byte { switch dataType { case parquet.Type_INT32: return int32ToDeltaBytes(values.([]int32)) case parquet.Type_INT64: return int64ToDeltaBytes(values.([]int64)) } return nil } func stringsToDeltaLengthByteArrayBytes(strs []string) []byte { lengths := make([]int32, len(strs)) for i, s := range strs { lengths[i] = int32(len(s)) } result := int32ToDeltaBytes(lengths) for _, s := range strs { result = append(result, []byte(s)...) } return result } func stringsToDeltaByteArrayBytes(strs []string) []byte { prefixLengths := make([]int32, len(strs)) suffixes := make([]string, len(strs)) var i, j int for i = 1; i < len(strs); i++ { for j = 0; j < len(strs[i-1]) && j < len(strs[i]); j++ { if strs[i-1][j] != strs[i][j] { break } } prefixLengths[i] = int32(j) suffixes[i] = strs[i][j:] } result := int32ToDeltaBytes(prefixLengths) return append(result, stringsToDeltaLengthByteArrayBytes(suffixes)...) } func encodeValues(values interface{}, dataType parquet.Type, encoding parquet.Encoding, bitWidth int32) []byte { switch encoding { case parquet.Encoding_RLE: return valuesToRLEBitPackedHybridBytes(values, bitWidth, dataType) case parquet.Encoding_DELTA_BINARY_PACKED: return valuesToDeltaBytes(values, dataType) case parquet.Encoding_DELTA_BYTE_ARRAY: return stringsToDeltaByteArrayBytes(values.([]string)) case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: return stringsToDeltaLengthByteArrayBytes(values.([]string)) } return valuesToBytes(values, dataType) }