// Copyright (c) 2015-2021 MinIO, Inc. // // This file is part of MinIO Object Storage stack // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . package parquet import ( "bytes" "errors" "fmt" "math" "github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" ) func i64sToi32s(i64s []int64) (i32s []int32) { i32s = make([]int32, len(i64s)) for i := range i64s { i32s[i] = int32(i64s[i]) } return i32s } func readBitPacked(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) { count := header * 8 if count == 0 { return result, nil } if bitWidth == 0 { return make([]int64, count), nil } data := make([]byte, header*bitWidth) if _, err = reader.Read(data); err != nil { return nil, err } var val, used, left, b uint64 valNeedBits := bitWidth i := -1 for { if left <= 0 { i++ if i >= len(data) { break } b = uint64(data[i]) left = 8 used = 0 } if left >= valNeedBits { val |= ((b >> used) & ((1 << valNeedBits) - 1)) << (bitWidth - valNeedBits) result = append(result, int64(val)) val = 0 left -= valNeedBits used += valNeedBits valNeedBits = bitWidth } else { val |= (b >> used) << (bitWidth - valNeedBits) valNeedBits -= left left = 0 } } return result, nil } func readBools(reader *bytes.Reader, count uint64) (result []bool, err error) { i64s, err := readBitPacked(reader, count, 1) if err != nil { return nil, err } var i uint64 for i = 0; i < count; i++ { result = append(result, i64s[i] > 0) } return result, nil } func readInt32s(reader *bytes.Reader, count uint64) (result []int32, err error) { buf := make([]byte, 4) var i uint64 for i = 0; i < count; i++ { if _, err = reader.Read(buf); err != nil { return nil, err } result = append(result, int32(bytesToUint32(buf))) } return result, nil } func readInt64s(reader *bytes.Reader, count uint64) (result []int64, err error) { buf := make([]byte, 8) var i uint64 for i = 0; i < count; i++ { if _, err = reader.Read(buf); err != nil { return nil, err } result = append(result, int64(bytesToUint64(buf))) } return result, nil } func readInt96s(reader *bytes.Reader, count uint64) (result [][]byte, err error) { var i uint64 for i = 0; i < count; i++ { buf := make([]byte, 12) if _, err = reader.Read(buf); err != nil { return nil, err } result = append(result, buf) } return result, nil } func readFloats(reader *bytes.Reader, count uint64) (result []float32, err error) { buf := make([]byte, 4) var i uint64 for i = 0; i < count; i++ { if _, err = reader.Read(buf); err != nil { return nil, err } result = append(result, math.Float32frombits(bytesToUint32(buf))) } return result, nil } func readDoubles(reader *bytes.Reader, count uint64) (result []float64, err error) { buf := make([]byte, 8) var i uint64 for i = 0; i < count; i++ { if _, err = reader.Read(buf); err != nil { return nil, err } result = append(result, math.Float64frombits(bytesToUint64(buf))) } return result, nil } func readByteArrays(reader *bytes.Reader, count uint64) (result [][]byte, err error) { buf := make([]byte, 4) var length uint32 var data []byte var i uint64 for i = 0; i < count; i++ { if _, err = reader.Read(buf); err != nil { return nil, err } length = bytesToUint32(buf) data = make([]byte, length) if length > 0 { if _, err = reader.Read(data); err != nil { return nil, err } } result = append(result, data) } return result, nil } func readFixedLenByteArrays(reader *bytes.Reader, count, length uint64) (result [][]byte, err error) { var i uint64 for i = 0; i < count; i++ { data := make([]byte, length) if _, err = reader.Read(data); err != nil { return nil, err } result = append(result, data) } return result, nil } func readValues(reader *bytes.Reader, dataType parquet.Type, count, length uint64) (interface{}, error) { switch dataType { case parquet.Type_BOOLEAN: return readBools(reader, count) case parquet.Type_INT32: return readInt32s(reader, count) case parquet.Type_INT64: return readInt64s(reader, count) case parquet.Type_INT96: return readInt96s(reader, count) case parquet.Type_FLOAT: return readFloats(reader, count) case parquet.Type_DOUBLE: return readDoubles(reader, count) case parquet.Type_BYTE_ARRAY: return readByteArrays(reader, count) case parquet.Type_FIXED_LEN_BYTE_ARRAY: return readFixedLenByteArrays(reader, count, length) } return nil, fmt.Errorf("unknown parquet type %v", dataType) } func readUnsignedVarInt(reader *bytes.Reader) (v uint64, err error) { var b byte var shift uint64 for { if b, err = reader.ReadByte(); err != nil { return 0, err } if v |= ((uint64(b) & 0x7F) << shift); b&0x80 == 0 { break } shift += 7 } return v, nil } func readRLE(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) { width := (bitWidth + 7) / 8 data := make([]byte, width) if width > 0 { if _, err = reader.Read(data); err != nil { return nil, err } } if width < 4 { data = append(data, make([]byte, 4-width)...) } val := int64(bytesToUint32(data)) count := header >> 1 if count > math.MaxInt64/8 { // 8 bytes/element. return nil, errors.New("parquet: size too large") } result = make([]int64, count) for i := range result { result[i] = val } return result, nil } func readRLEBitPackedHybrid(reader *bytes.Reader, length, bitWidth uint64) (result []int64, err error) { if length <= 0 { var i32s []int32 i32s, err = readInt32s(reader, 1) if err != nil { return nil, err } if i32s[0] < 0 { return nil, errors.New("parquet: negative RLEBitPackedHybrid length") } length = uint64(i32s[0]) } buf := make([]byte, length) if _, err = reader.Read(buf); err != nil { return nil, err } reader = bytes.NewReader(buf) for reader.Len() > 0 { header, err := readUnsignedVarInt(reader) if err != nil { return nil, err } var i64s []int64 if header&1 == 0 { i64s, err = readRLE(reader, header, bitWidth) } else { i64s, err = readBitPacked(reader, header>>1, bitWidth) } if err != nil { return nil, err } result = append(result, i64s...) } return result, nil } func readDeltaBinaryPackedInt(reader *bytes.Reader) (result []int64, err error) { blockSize, err := readUnsignedVarInt(reader) if err != nil { return nil, err } numMiniblocksInBlock, err := readUnsignedVarInt(reader) if err != nil { return nil, err } numValues, err := readUnsignedVarInt(reader) if err != nil { return nil, err } firstValueZigZag, err := readUnsignedVarInt(reader) if err != nil { return nil, err } v := int64(firstValueZigZag>>1) ^ (-int64(firstValueZigZag & 1)) result = append(result, v) if numMiniblocksInBlock == 0 { return nil, errors.New("parquet: zero mini blocks in block") } numValuesInMiniBlock := blockSize / numMiniblocksInBlock bitWidths := make([]uint64, numMiniblocksInBlock) for uint64(len(result)) < numValues { minDeltaZigZag, err := readUnsignedVarInt(reader) if err != nil { return nil, err } for i := 0; uint64(i) < numMiniblocksInBlock; i++ { b, err := reader.ReadByte() if err != nil { return nil, err } bitWidths[i] = uint64(b) } minDelta := int64(minDeltaZigZag>>1) ^ (-int64(minDeltaZigZag & 1)) for i := 0; uint64(i) < numMiniblocksInBlock; i++ { i64s, err := readBitPacked(reader, numValuesInMiniBlock/8, bitWidths[i]) if err != nil { return nil, err } for j := range i64s { v += i64s[j] + minDelta result = append(result, v) } } } return result[:numValues], nil } func readDeltaLengthByteArrays(reader *bytes.Reader) (result [][]byte, err error) { i64s, err := readDeltaBinaryPackedInt(reader) if err != nil { return nil, err } for i := 0; i < len(i64s); i++ { arrays, err := readFixedLenByteArrays(reader, 1, uint64(i64s[i])) if err != nil { return nil, err } result = append(result, arrays[0]) } return result, nil } func readDeltaByteArrays(reader *bytes.Reader) (result [][]byte, err error) { i64s, err := readDeltaBinaryPackedInt(reader) if err != nil { return nil, err } suffixes, err := readDeltaLengthByteArrays(reader) if err != nil { return nil, err } result = append(result, suffixes[0]) for i := 1; i < len(i64s); i++ { prefixLength := i64s[i] val := append([]byte{}, result[i-1][:prefixLength]...) val = append(val, suffixes[i]...) result = append(result, val) } return result, nil } func readDataPageValues( bytesReader *bytes.Reader, encoding parquet.Encoding, dataType parquet.Type, convertedType parquet.ConvertedType, count, bitWidth uint64, ) (result interface{}, resultDataType parquet.Type, err error) { switch encoding { case parquet.Encoding_PLAIN: result, err = readValues(bytesReader, dataType, count, bitWidth) return result, dataType, err case parquet.Encoding_PLAIN_DICTIONARY: b, err := bytesReader.ReadByte() if err != nil { return nil, -1, err } i64s, err := readRLEBitPackedHybrid(bytesReader, uint64(bytesReader.Len()), uint64(b)) if err != nil { return nil, -1, err } if len(i64s) < int(count) || count > math.MaxInt64/8 { return nil, -1, errors.New("parquet: value out of range") } return i64s[:count], parquet.Type_INT64, nil case parquet.Encoding_RLE: i64s, err := readRLEBitPackedHybrid(bytesReader, 0, bitWidth) if err != nil { return nil, -1, err } if len(i64s) < int(count) || count > math.MaxInt64/8 { return nil, -1, errors.New("parquet: value out of range") } i64s = i64s[:count] if dataType == parquet.Type_INT32 { return i64sToi32s(i64s), parquet.Type_INT32, nil } return i64s, parquet.Type_INT64, nil case parquet.Encoding_BIT_PACKED: return nil, -1, fmt.Errorf("deprecated parquet encoding %v", parquet.Encoding_BIT_PACKED) case parquet.Encoding_DELTA_BINARY_PACKED: i64s, err := readDeltaBinaryPackedInt(bytesReader) if err != nil { return nil, -1, err } if len(i64s) < int(count) || count > math.MaxInt64/8 { return nil, -1, errors.New("parquet: value out of range") } i64s = i64s[:count] if dataType == parquet.Type_INT32 { return i64sToi32s(i64s), parquet.Type_INT32, nil } return i64s, parquet.Type_INT64, nil case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: byteSlices, err := readDeltaLengthByteArrays(bytesReader) if err != nil { return nil, -1, err } if len(byteSlices) < int(count) || count > math.MaxInt64/24 { return nil, -1, errors.New("parquet: value out of range") } return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil case parquet.Encoding_DELTA_BYTE_ARRAY: byteSlices, err := readDeltaByteArrays(bytesReader) if err != nil { return nil, -1, err } if len(byteSlices) < int(count) || count > math.MaxInt64/24 { return nil, -1, errors.New("parquet: value out of range") } return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil } return nil, -1, fmt.Errorf("unsupported parquet encoding %v", encoding) }