mirror of
https://github.com/minio/minio.git
synced 2025-11-25 03:56:17 -05:00
Add archived parquet as int. package (#9912)
Since github.com/minio/parquet-go is archived add it as internal package.
This commit is contained in:
191
pkg/s3select/internal/parquet-go/writer.go
Normal file
191
pkg/s3select/internal/parquet-go/writer.go
Normal file
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package parquet
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"git.apache.org/thrift.git/lib/go/thrift"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/data"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
|
||||
"github.com/minio/minio/pkg/s3select/internal/parquet-go/schema"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultPageSize = 8 * 1024 // 8 KiB
|
||||
defaultRowGroupSize = 128 * 1024 * 1024 // 128 MiB
|
||||
)
|
||||
|
||||
// Writer - represents parquet writer.
|
||||
type Writer struct {
|
||||
PageSize int64
|
||||
RowGroupSize int64
|
||||
CompressionType parquet.CompressionCodec
|
||||
|
||||
writeCloser io.WriteCloser
|
||||
numRows int64
|
||||
offset int64
|
||||
footer *parquet.FileMetaData
|
||||
schemaTree *schema.Tree
|
||||
valueElements []*schema.Element
|
||||
columnDataMap map[string]*data.Column
|
||||
rowGroupCount int
|
||||
}
|
||||
|
||||
func (writer *Writer) writeData() (err error) {
|
||||
if writer.numRows == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var chunks []*data.ColumnChunk
|
||||
for _, element := range writer.valueElements {
|
||||
name := element.PathInTree
|
||||
columnData, found := writer.columnDataMap[name]
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
|
||||
columnChunk := columnData.Encode(element)
|
||||
chunks = append(chunks, columnChunk)
|
||||
}
|
||||
|
||||
rowGroup := data.NewRowGroup(chunks, writer.numRows, writer.offset)
|
||||
|
||||
for _, chunk := range chunks {
|
||||
if _, err = writer.writeCloser.Write(chunk.Data()); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
writer.offset += chunk.DataLen()
|
||||
}
|
||||
|
||||
writer.footer.RowGroups = append(writer.footer.RowGroups, rowGroup)
|
||||
writer.footer.NumRows += writer.numRows
|
||||
|
||||
writer.numRows = 0
|
||||
writer.columnDataMap = nil
|
||||
return nil
|
||||
}
|
||||
|
||||
// WriteJSON - writes a record represented in JSON.
|
||||
func (writer *Writer) WriteJSON(recordData []byte) (err error) {
|
||||
columnDataMap, err := data.UnmarshalJSON(recordData, writer.schemaTree)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return writer.Write(columnDataMap)
|
||||
}
|
||||
|
||||
// Write - writes a record represented in map.
|
||||
func (writer *Writer) Write(record map[string]*data.Column) (err error) {
|
||||
if writer.columnDataMap == nil {
|
||||
writer.columnDataMap = record
|
||||
} else {
|
||||
for name, columnData := range record {
|
||||
var found bool
|
||||
var element *schema.Element
|
||||
for _, element = range writer.valueElements {
|
||||
if element.PathInTree == name {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
return fmt.Errorf("%v is not value column", name)
|
||||
}
|
||||
|
||||
writer.columnDataMap[name].Merge(columnData)
|
||||
}
|
||||
}
|
||||
|
||||
writer.numRows++
|
||||
if writer.numRows == int64(writer.rowGroupCount) {
|
||||
return writer.writeData()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (writer *Writer) finalize() (err error) {
|
||||
if err = writer.writeData(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ts := thrift.NewTSerializer()
|
||||
ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
|
||||
footerBuf, err := ts.Write(context.TODO(), writer.footer)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err = writer.writeCloser.Write(footerBuf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
footerSizeBuf := make([]byte, 4)
|
||||
binary.LittleEndian.PutUint32(footerSizeBuf, uint32(len(footerBuf)))
|
||||
|
||||
if _, err = writer.writeCloser.Write(footerSizeBuf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = writer.writeCloser.Write([]byte("PAR1"))
|
||||
return err
|
||||
}
|
||||
|
||||
// Close - finalizes and closes writer. If any pending records are available, they are written here.
|
||||
func (writer *Writer) Close() (err error) {
|
||||
if err = writer.finalize(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return writer.writeCloser.Close()
|
||||
}
|
||||
|
||||
// NewWriter - creates new parquet writer. Binary data of rowGroupCount records are written to writeCloser.
|
||||
func NewWriter(writeCloser io.WriteCloser, schemaTree *schema.Tree, rowGroupCount int) (*Writer, error) {
|
||||
if _, err := writeCloser.Write([]byte("PAR1")); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
schemaList, valueElements, err := schemaTree.ToParquetSchema()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
footer := parquet.NewFileMetaData()
|
||||
footer.Version = 1
|
||||
footer.Schema = schemaList
|
||||
|
||||
return &Writer{
|
||||
PageSize: defaultPageSize,
|
||||
RowGroupSize: defaultRowGroupSize,
|
||||
CompressionType: parquet.CompressionCodec_SNAPPY,
|
||||
|
||||
writeCloser: writeCloser,
|
||||
offset: 4,
|
||||
footer: footer,
|
||||
schemaTree: schemaTree,
|
||||
valueElements: valueElements,
|
||||
rowGroupCount: rowGroupCount,
|
||||
}, nil
|
||||
}
|
||||
Reference in New Issue
Block a user