2021-04-18 15:41:13 -04:00
|
|
|
// Copyright (c) 2015-2021 MinIO, Inc.
|
|
|
|
//
|
|
|
|
// This file is part of MinIO Object Storage stack
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Affero General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2019-01-08 19:53:04 -05:00
|
|
|
|
|
|
|
package parquet
|
|
|
|
|
|
|
|
import (
|
2022-04-14 09:54:47 -04:00
|
|
|
"errors"
|
2019-01-08 19:53:04 -05:00
|
|
|
"io"
|
2021-04-03 11:25:19 -04:00
|
|
|
"time"
|
2019-01-08 19:53:04 -05:00
|
|
|
|
2019-03-07 03:20:10 -05:00
|
|
|
"github.com/bcicen/jstream"
|
2022-04-14 09:54:47 -04:00
|
|
|
parquetgo "github.com/fraugster/parquet-go"
|
|
|
|
parquettypes "github.com/fraugster/parquet-go/parquet"
|
2021-06-01 17:59:40 -04:00
|
|
|
jsonfmt "github.com/minio/minio/internal/s3select/json"
|
|
|
|
"github.com/minio/minio/internal/s3select/sql"
|
2019-01-08 19:53:04 -05:00
|
|
|
)
|
|
|
|
|
2022-04-14 09:54:47 -04:00
|
|
|
// Reader implements reading records from parquet input.
|
2019-01-08 19:53:04 -05:00
|
|
|
type Reader struct {
|
2022-04-14 09:54:47 -04:00
|
|
|
io.Closer
|
|
|
|
r *parquetgo.FileReader
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
2022-04-14 09:54:47 -04:00
|
|
|
// NewParquetReader creates a Reader2 from a io.ReadSeekCloser.
|
|
|
|
func NewParquetReader(rsc io.ReadSeekCloser, _ *ReaderArgs) (r *Reader, err error) {
|
|
|
|
fr, err := parquetgo.NewFileReader(rsc)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errParquetParsingError(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return &Reader{Closer: rsc, r: fr}, nil
|
|
|
|
}
|
2020-08-18 13:23:28 -04:00
|
|
|
|
2022-04-14 09:54:47 -04:00
|
|
|
func (pr *Reader) Read(dst sql.Record) (rec sql.Record, rerr error) {
|
|
|
|
nextRow, err := pr.r.NextRow()
|
2019-01-08 19:53:04 -05:00
|
|
|
if err != nil {
|
2022-04-14 09:54:47 -04:00
|
|
|
if err == io.EOF {
|
|
|
|
return nil, err
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
2022-04-14 09:54:47 -04:00
|
|
|
return nil, errParquetParsingError(err)
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
2019-03-07 03:20:10 -05:00
|
|
|
kvs := jstream.KVS{}
|
2022-04-14 09:54:47 -04:00
|
|
|
for _, col := range pr.r.Columns() {
|
2019-01-16 11:22:04 -05:00
|
|
|
|
2019-03-07 03:20:10 -05:00
|
|
|
var value interface{}
|
2022-04-14 09:54:47 -04:00
|
|
|
if v, ok := nextRow[col.FlatName()]; ok {
|
|
|
|
value, err = convertFromAnnotation(col.Element(), v)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errParquetParsingError(err)
|
2021-04-03 11:25:19 -04:00
|
|
|
}
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
2022-04-14 09:54:47 -04:00
|
|
|
kvs = append(kvs, jstream.KV{Key: col.FlatName(), Value: value})
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
2019-09-13 17:18:35 -04:00
|
|
|
// Reuse destination if we can.
|
|
|
|
dstRec, ok := dst.(*jsonfmt.Record)
|
|
|
|
if !ok {
|
|
|
|
dstRec = &jsonfmt.Record{}
|
|
|
|
}
|
|
|
|
dstRec.SelectFormat = sql.SelectFmtParquet
|
|
|
|
dstRec.KVS = kvs
|
|
|
|
return dstRec, nil
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
|
|
|
|
2022-04-14 09:54:47 -04:00
|
|
|
// convertFromAnnotation - converts values based on the Parquet column's type
|
|
|
|
// annotations. LogicalType annotations if present override the deprecated
|
|
|
|
// ConvertedType annotations. Ref:
|
|
|
|
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
|
|
|
|
func convertFromAnnotation(se *parquettypes.SchemaElement, v interface{}) (interface{}, error) {
|
|
|
|
if se == nil {
|
|
|
|
return v, nil
|
|
|
|
}
|
2019-01-08 19:53:04 -05:00
|
|
|
|
2022-04-14 09:54:47 -04:00
|
|
|
var value interface{}
|
|
|
|
switch val := v.(type) {
|
|
|
|
case []byte:
|
|
|
|
// TODO: only strings are supported in s3select output (not
|
|
|
|
// binary arrays) - perhaps we need to check the annotation to
|
|
|
|
// ensure it's UTF8 encoded.
|
|
|
|
value = string(val)
|
|
|
|
case [12]byte:
|
|
|
|
// TODO: This is returned for the parquet INT96 type. We just
|
|
|
|
// treat it same as []byte (but AWS S3 treats it as a large int)
|
|
|
|
// - fix this later.
|
|
|
|
value = string(val[:])
|
|
|
|
case int32:
|
|
|
|
value = int64(val)
|
|
|
|
if logicalType := se.GetLogicalType(); logicalType != nil {
|
|
|
|
if logicalType.IsSetDATE() {
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(val), 0).UTC())
|
|
|
|
}
|
|
|
|
} else if se.GetConvertedType() == parquettypes.ConvertedType_DATE {
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(val), 0).UTC())
|
2020-08-18 13:23:28 -04:00
|
|
|
}
|
2022-04-14 09:54:47 -04:00
|
|
|
case int64:
|
|
|
|
value = val
|
|
|
|
if logicalType := se.GetLogicalType(); logicalType != nil {
|
|
|
|
if ts := logicalType.GetTIMESTAMP(); ts != nil {
|
|
|
|
var duration time.Duration
|
|
|
|
// Only support UTC normalized timestamps.
|
|
|
|
if ts.IsAdjustedToUTC {
|
|
|
|
switch {
|
|
|
|
case ts.Unit.IsSetNANOS():
|
|
|
|
duration = time.Duration(val) * time.Nanosecond
|
|
|
|
case ts.Unit.IsSetMILLIS():
|
|
|
|
duration = time.Duration(val) * time.Millisecond
|
|
|
|
case ts.Unit.IsSetMICROS():
|
|
|
|
duration = time.Duration(val) * time.Microsecond
|
|
|
|
default:
|
|
|
|
return nil, errors.New("Invalid LogicalType annotation found")
|
|
|
|
}
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration))
|
|
|
|
}
|
|
|
|
} else if se.GetConvertedType() == parquettypes.ConvertedType_TIMESTAMP_MILLIS {
|
|
|
|
duration := time.Duration(val) * time.Millisecond
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration))
|
|
|
|
} else if se.GetConvertedType() == parquettypes.ConvertedType_TIMESTAMP_MICROS {
|
|
|
|
duration := time.Duration(val) * time.Microsecond
|
|
|
|
value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration))
|
|
|
|
}
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
2022-04-14 09:54:47 -04:00
|
|
|
case float32:
|
|
|
|
value = float64(val)
|
|
|
|
default:
|
|
|
|
value = v
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|
2022-04-14 09:54:47 -04:00
|
|
|
return value, nil
|
2019-01-08 19:53:04 -05:00
|
|
|
}
|