// minio/pkg/s3select/internal/parquet-go/column.go

/*
* Minio Cloud Storage, (C) 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package parquet

import (
	"errors"
	"io"
	"strings"

	"git.apache.org/thrift.git/lib/go/thrift"
	"github.com/minio/minio-go/v7/pkg/set"
	"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)
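
// getColumns builds a name-to-column map for the columns of rowGroup listed in
// columnNames (or for all columns when columnNames is nil). Column chunks stored
// in another file are skipped. For each selected chunk, a reader is obtained from
// getReaderFunc at the chunk's dictionary-page offset (when a dictionary page
// exists) or data-page offset, and wrapped in a buffered Thrift transport sized
// to the chunk's compressed size. All returned columns share one name-to-index
// map that is later used when reading pages.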
func getColumns(
	rowGroup *parquet.RowGroup,
	columnNames set.StringSet,
	schemaElements []*parquet.SchemaElement,
	getReaderFunc GetReaderFunc,
) (nameColumnMap map[string]*column, err error) {
	nameIndexMap := make(map[string]int)
	for colIndex, columnChunk := range rowGroup.GetColumns() {
		meta := columnChunk.GetMetaData()
		if meta == nil {
			return nil, errors.New("parquet: column metadata missing")
		}

		columnName := strings.Join(meta.GetPathInSchema(), ".")
		if columnNames != nil && !columnNames.Contains(columnName) {
			continue
		}

		// Ignore column spanning into another file.
		if columnChunk.GetFilePath() != "" {
			continue
		}

		// A dictionary page, if present, precedes the data pages of the chunk,
		// so start reading from its offset to cover the whole column chunk.
		offset := meta.GetDataPageOffset()
		if meta.DictionaryPageOffset != nil {
			offset = meta.GetDictionaryPageOffset()
		}

		size := meta.GetTotalCompressedSize()
		if size < 0 {
			return nil, errors.New("parquet: negative compressed size")
		}

		rc, err := getReaderFunc(offset, size)
		if err != nil {
			return nil, err
		}

		thriftReader := thrift.NewTBufferedTransport(thrift.NewStreamTransportR(rc), int(size))

		if nameColumnMap == nil {
			nameColumnMap = make(map[string]*column)
		}

		// Find the schema element matching this column, if any.
		var se *parquet.SchemaElement
		for _, schema := range schemaElements {
			if schema != nil && schema.Name == columnName {
				se = schema
				break
			}
		}

		nameColumnMap[columnName] = &column{
			name:           columnName,
			metadata:       meta,
			schema:         se,
			schemaElements: schemaElements,
			rc:             rc,
			thriftReader:   thriftReader,
			valueType:      meta.GetType(),
		}

		// First element of []*parquet.SchemaElement from parquet file metadata is 'schema'
		// which is always skipped, hence colIndex + 1 is valid.
		nameIndexMap[columnName] = colIndex + 1
	}

	for name := range nameColumnMap {
		nameColumnMap[name].nameIndexMap = nameIndexMap
	}

	return nameColumnMap, nil
}
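
// column tracks the read state of a single column chunk: the underlying reader,
// its buffered Thrift transport, the most recently decoded dictionary page and
// data table, and the index of the next value to return.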
type column struct {
	name           string
	endOfValues    bool
	valueIndex     int
	valueType      parquet.Type
	metadata       *parquet.ColumnMetaData
	schema         *parquet.SchemaElement
	schemaElements []*parquet.SchemaElement
	nameIndexMap   map[string]int
	dictPage       *page
	dataTable      *table
	rc             io.ReadCloser
	thriftReader   *thrift.TBufferedTransport
}
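
// close releases the column's underlying reader. It is safe to call more than
// once; subsequent calls are no-ops.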
func (column *column) close() (err error) {
	if column.rc != nil {
		err = column.rc.Close()
		column.rc = nil
	}

	return err
}
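
// readPage reads and decodes the next page of the column chunk. A dictionary
// page is remembered and the following page is read immediately; a data page is
// decoded against the current dictionary (if any) and merged into the column's
// data table. Any read error, including reaching the end of the chunk, marks
// the column as exhausted.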
func (column *column) readPage() {
	page, _, _, err := readPage(
		column.thriftReader,
		column.metadata,
		column.nameIndexMap,
		column.schemaElements,
	)
	if err != nil {
		column.endOfValues = true
		return
	}

	if page.Header.GetType() == parquet.PageType_DICTIONARY_PAGE {
		column.dictPage = page
		column.readPage()
		return
	}

	page.decode(column.dictPage)

	if column.dataTable == nil {
		column.dataTable = newTableFromTable(page.DataTable)
	}

	column.dataTable.Merge(page.DataTable)
}
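
// read returns the next value of the column along with its parquet type and
// matching schema element, reading the next page on demand. Once the column
// chunk is exhausted, read returns a nil value.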
func (column *column) read() (value interface{}, valueType parquet.Type, cnv *parquet.SchemaElement) {
	if column.dataTable == nil {
		column.readPage()
		column.valueIndex = 0
	}

	if column.endOfValues {
		return nil, column.metadata.GetType(), column.schema
	}

	value = column.dataTable.Values[column.valueIndex]
	column.valueIndex++
	if len(column.dataTable.Values) == column.valueIndex {
		column.dataTable = nil
	}

	return value, column.metadata.GetType(), column.schema
}
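
// A minimal usage sketch, assuming a GetReaderFunc backed by the parquet file
// and a rowGroup plus schemaElements already decoded from the file footer
// (names here are illustrative, not part of this file):
//
//	columns, err := getColumns(rowGroup, nil, schemaElements, getReaderFunc)
//	if err != nil {
//		return err
//	}
//	defer func() {
//		for _, col := range columns {
//			col.close()
//		}
//	}()
//	for name, col := range columns {
//		value, valueType, _ := col.read()
//		fmt.Println(name, valueType, value)
//	}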