2021-04-18 15:41:13 -04:00
|
|
|
// Copyright (c) 2015-2021 MinIO, Inc.
|
|
|
|
//
|
|
|
|
// This file is part of MinIO Object Storage stack
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Affero General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2019-01-08 19:53:04 -05:00
|
|
|
|
|
|
|
package parquet
|
|
|
|
|
|
|
|
import (
|
2020-08-18 13:23:28 -04:00
|
|
|
"fmt"
|
2019-01-08 19:53:04 -05:00
|
|
|
"io"
|
2021-04-03 11:25:19 -04:00
|
|
|
"time"
|
2019-01-08 19:53:04 -05:00
|
|
|
|
2019-03-07 03:20:10 -05:00
|
|
|
"github.com/bcicen/jstream"
|
|
|
|
jsonfmt "github.com/minio/minio/pkg/s3select/json"
|
2019-01-08 19:53:04 -05:00
|
|
|
"github.com/minio/minio/pkg/s3select/sql"
|
2021-05-03 11:51:43 -04:00
|
|
|
parquetgo "github.com/minio/parquet-go"
|
|
|
|
parquetgen "github.com/minio/parquet-go/gen-go/parquet"
|
2019-01-08 19:53:04 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
// Reader - Parquet record reader for S3Select.
|
|
|
|
type Reader struct {
|
2019-03-13 23:33:18 -04:00
|
|
|
args *ReaderArgs
|
|
|
|
reader *parquetgo.Reader
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// Read - reads single record.
|
2019-09-13 17:18:35 -04:00
|
|
|
func (r *Reader) Read(dst sql.Record) (rec sql.Record, rerr error) {
|
2020-08-18 13:23:28 -04:00
|
|
|
defer func() {
|
|
|
|
if rec := recover(); rec != nil {
|
|
|
|
rerr = fmt.Errorf("panic reading parquet record: %v", rec)
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2019-03-13 23:33:18 -04:00
|
|
|
parquetRecord, err := r.reader.Read()
|
2019-01-08 19:53:04 -05:00
|
|
|
if err != nil {
|
|
|
|
if err != io.EOF {
|
|
|
|
return nil, errParquetParsingError(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2019-03-07 03:20:10 -05:00
|
|
|
kvs := jstream.KVS{}
|
2019-01-28 20:59:48 -05:00
|
|
|
f := func(name string, v parquetgo.Value) bool {
|
2019-01-16 11:22:04 -05:00
|
|
|
if v.Value == nil {
|
2019-03-07 03:20:10 -05:00
|
|
|
kvs = append(kvs, jstream.KV{Key: name, Value: nil})
|
|
|
|
return true
|
2019-01-16 11:22:04 -05:00
|
|
|
}
|
|
|
|
|
2019-03-07 03:20:10 -05:00
|
|
|
var value interface{}
|
2019-01-08 19:53:04 -05:00
|
|
|
switch v.Type {
|
|
|
|
case parquetgen.Type_BOOLEAN:
|
2019-03-07 03:20:10 -05:00
|
|
|
value = v.Value.(bool)
|
2019-01-08 19:53:04 -05:00
|
|
|
case parquetgen.Type_INT32:
|
2019-03-07 03:20:10 -05:00
|
|
|
value = int64(v.Value.(int32))
|
2021-04-03 11:25:19 -04:00
|
|
|
if v.Schema != nil && v.Schema.ConvertedType != nil {
|
|
|
|
switch *v.Schema.ConvertedType {
|
|
|
|
case parquetgen.ConvertedType_DATE:
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(v.Value.(int32)), 0).UTC())
|
|
|
|
}
|
|
|
|
}
|
2019-01-08 19:53:04 -05:00
|
|
|
case parquetgen.Type_INT64:
|
2020-08-24 15:11:20 -04:00
|
|
|
value = v.Value.(int64)
|
2021-04-03 11:25:19 -04:00
|
|
|
if v.Schema != nil && v.Schema.ConvertedType != nil {
|
|
|
|
switch *v.Schema.ConvertedType {
|
|
|
|
// Only UTC supported, add one NS to never be exactly midnight.
|
|
|
|
case parquetgen.ConvertedType_TIMESTAMP_MILLIS:
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(time.Duration(v.Value.(int64)) * time.Millisecond).UTC())
|
|
|
|
case parquetgen.ConvertedType_TIMESTAMP_MICROS:
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(time.Duration(v.Value.(int64)) * time.Microsecond).UTC())
|
|
|
|
}
|
|
|
|
}
|
2019-01-08 19:53:04 -05:00
|
|
|
case parquetgen.Type_FLOAT:
|
2019-03-07 03:20:10 -05:00
|
|
|
value = float64(v.Value.(float32))
|
2019-01-08 19:53:04 -05:00
|
|
|
case parquetgen.Type_DOUBLE:
|
2019-03-07 03:20:10 -05:00
|
|
|
value = v.Value.(float64)
|
2019-01-08 19:53:04 -05:00
|
|
|
case parquetgen.Type_INT96, parquetgen.Type_BYTE_ARRAY, parquetgen.Type_FIXED_LEN_BYTE_ARRAY:
|
2019-03-07 03:20:10 -05:00
|
|
|
value = string(v.Value.([]byte))
|
2019-01-08 19:53:04 -05:00
|
|
|
default:
|
2019-01-28 20:59:48 -05:00
|
|
|
rerr = errParquetParsingError(nil)
|
|
|
|
return false
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
2019-03-07 03:20:10 -05:00
|
|
|
kvs = append(kvs, jstream.KV{Key: name, Value: value})
|
|
|
|
return true
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
2019-09-13 17:18:35 -04:00
|
|
|
// Apply our range
|
2019-01-28 20:59:48 -05:00
|
|
|
parquetRecord.Range(f)
|
2019-09-13 17:18:35 -04:00
|
|
|
|
|
|
|
// Reuse destination if we can.
|
|
|
|
dstRec, ok := dst.(*jsonfmt.Record)
|
|
|
|
if !ok {
|
|
|
|
dstRec = &jsonfmt.Record{}
|
|
|
|
}
|
|
|
|
dstRec.SelectFormat = sql.SelectFmtParquet
|
|
|
|
dstRec.KVS = kvs
|
|
|
|
return dstRec, nil
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
2019-09-13 17:18:35 -04:00
|
|
|
// Close - closes underlying readers.
|
2019-01-08 19:53:04 -05:00
|
|
|
func (r *Reader) Close() error {
|
2019-03-13 23:33:18 -04:00
|
|
|
return r.reader.Close()
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewReader - creates new Parquet reader using readerFunc callback.
|
2020-08-18 13:23:28 -04:00
|
|
|
func NewReader(getReaderFunc func(offset, length int64) (io.ReadCloser, error), args *ReaderArgs) (r *Reader, err error) {
|
|
|
|
defer func() {
|
|
|
|
if rec := recover(); rec != nil {
|
|
|
|
err = fmt.Errorf("panic reading parquet header: %v", rec)
|
|
|
|
}
|
|
|
|
}()
|
2019-03-13 23:33:18 -04:00
|
|
|
reader, err := parquetgo.NewReader(getReaderFunc, nil)
|
2019-01-08 19:53:04 -05:00
|
|
|
if err != nil {
|
|
|
|
if err != io.EOF {
|
|
|
|
return nil, errParquetParsingError(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return &Reader{
|
2019-03-13 23:33:18 -04:00
|
|
|
args: args,
|
|
|
|
reader: reader,
|
2019-01-08 19:53:04 -05:00
|
|
|
}, nil
|
|
|
|
}
|