2021-04-18 12:41:13 -07:00
// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
2020-06-25 07:31:16 -07:00
package parquet
import (
"encoding/binary"
"encoding/json"
"io"
"git.apache.org/thrift.git/lib/go/thrift"
2020-07-14 17:38:05 +01:00
"github.com/minio/minio-go/v7/pkg/set"
2020-06-25 07:31:16 -07:00
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)
// GetReaderFunc - function type returning io.ReadCloser for requested offset/length.
//
// footerSize and fileMetadata close the returned io.ReadCloser once they are
// done reading from it. Both of them pass negative offsets (-8 and -(8+size)),
// so implementations presumably resolve a negative offset relative to the end
// of the underlying object - confirm against the concrete implementations.
type GetReaderFunc func ( offset , length int64 ) ( io . ReadCloser , error )
// footerSize - returns the length, in bytes, of the serialized file metadata.
//
// It requests 4 bytes at offset -8 through getReaderFunc and decodes them as
// an unsigned 32-bit little-endian integer. In the Parquet layout those bytes
// are the footer length that immediately precedes the trailing 4-byte magic.
//
// A failure to obtain the reader, or a short read, is returned unchanged
// together with a zero size. The reader is always closed before returning.
func footerSize(getReaderFunc GetReaderFunc) (size int64, err error) {
	const lengthFieldSize = 4

	reader, err := getReaderFunc(-8, lengthFieldSize)
	if err != nil {
		return 0, err
	}
	defer reader.Close()

	var raw [lengthFieldSize]byte
	if _, err = io.ReadFull(reader, raw[:]); err != nil {
		return 0, err
	}

	// Widen from uint32 so the largest encodable length stays positive.
	return int64(binary.LittleEndian.Uint32(raw[:])), nil
}
// fileMetadata - reads and decodes the file-level metadata.
//
// The metadata length comes from footerSize. That many bytes are then
// requested at offset -(8 + length) through getReaderFunc and decoded with the
// thrift compact protocol into a parquet.FileMetaData.
//
// Any error from footerSize, getReaderFunc or the thrift decoder is returned
// unchanged with a nil result. The reader is closed before returning.
func fileMetadata(getReaderFunc GetReaderFunc) (*parquet.FileMetaData, error) {
	footerLen, err := footerSize(getReaderFunc)
	if err != nil {
		return nil, err
	}

	footer, err := getReaderFunc(-(8 + footerLen), footerLen)
	if err != nil {
		return nil, err
	}
	defer footer.Close()

	transport := thrift.NewStreamTransportR(footer)
	decoder := thrift.NewTCompactProtocolFactory().GetProtocol(transport)

	meta := parquet.NewFileMetaData()
	if err = meta.Read(decoder); err != nil {
		return nil, err
	}

	return meta, nil
}
// Value - denotes column value
type Value struct {
2021-04-03 17:25:19 +02:00
Value interface { }
Type parquet . Type
Schema * parquet . SchemaElement
2020-06-25 07:31:16 -07:00
}
// MarshalJSON - encodes to JSON data
//
// Only the wrapped Value field is serialized; the result and error are
// exactly those of json.Marshal on that field.
func (value Value) MarshalJSON() (data []byte, err error) {
	data, err = json.Marshal(value.Value)
	return data, err
}
// Reader - denotes parquet file.
//
// A Reader is created by NewReader and returns one record per Read call,
// walking the row groups in order.
type Reader struct {
// getReaderFunc supplies byte ranges of the underlying file; Read hands it to
// getColumns when a row group is opened.
getReaderFunc GetReaderFunc
// schemaElements is the schema taken from the file metadata.
schemaElements [ ] * parquet . SchemaElement
// rowGroups is the list of row groups taken from the file metadata.
rowGroups [ ] * parquet . RowGroup
// rowGroupIndex is the index into rowGroups of the group currently being
// read; Read returns io.EOF once it reaches len(rowGroups).
rowGroupIndex int
// nameList holds the name of every schema element, in schema order; it is
// passed to newRecord for each record.
nameList [ ] string
// columnNames is the column selection given to NewReader and forwarded to
// getColumns.
columnNames set . StringSet
// columns maps a column name to its reader for the current row group. It is
// nil until Read loads a row group and is reset to nil by Close.
columns map [ string ] * column
// rowIndex counts the rows already returned from the current row group.
rowIndex int64
}
// NewReader - creates new parquet reader.
//
// The file metadata is read eagerly through getReaderFunc; column data is not
// touched until Read is called. columnNames is stored and later forwarded to
// getColumns for every row group. Per the original documentation, an empty
// columnNames selects all columns.
//
// An error from reading the file metadata is returned unchanged with a nil
// Reader.
func NewReader(getReaderFunc GetReaderFunc, columnNames set.StringSet) (*Reader, error) {
	meta, err := fileMetadata(getReaderFunc)
	if err != nil {
		return nil, err
	}

	// Collect the name of every schema element, in schema order.
	schema := meta.GetSchema()
	names := make([]string, len(schema))
	for i := range schema {
		names[i] = schema[i].Name
	}

	reader := &Reader{
		getReaderFunc:  getReaderFunc,
		schemaElements: schema,
		rowGroups:      meta.GetRowGroups(),
		nameList:       names,
		columnNames:    columnNames,
	}
	return reader, nil
}
// Read - reads single record.
func ( reader * Reader ) Read ( ) ( record * Record , err error ) {
if reader . rowGroupIndex >= len ( reader . rowGroups ) {
return nil , io . EOF
}
if reader . columns == nil {
reader . columns , err = getColumns (
reader . rowGroups [ reader . rowGroupIndex ] ,
reader . columnNames ,
reader . schemaElements ,
reader . getReaderFunc ,
)
if err != nil {
return nil , err
}
reader . rowIndex = 0
}
if reader . rowIndex >= reader . rowGroups [ reader . rowGroupIndex ] . GetNumRows ( ) {
reader . rowGroupIndex ++
reader . Close ( )
return reader . Read ( )
}
record = newRecord ( reader . nameList )
for name := range reader . columns {
2021-04-03 17:25:19 +02:00
col := reader . columns [ name ]
value , valueType , schema := col . read ( )
record . set ( name , Value { Value : value , Type : valueType , Schema : schema } )
2020-06-25 07:31:16 -07:00
}
reader . rowIndex ++
return record , nil
}
// Close - closes underneath readers.
//
// Every column reader of the current row group is closed, after which the
// column map and the row counter are reset. The row group position is left
// untouched. The returned error is always nil.
func (reader *Reader) Close() (err error) {
	for _, col := range reader.columns {
		col.close()
	}

	reader.columns, reader.rowIndex = nil, 0
	return nil
}