Refactor s3select to support parquet. (#7023)

Also handle pretty formatted JSON documents.
This commit is contained in:
Bala FA
2019-01-09 06:23:04 +05:30
committed by kannappanr
parent e98d89274f
commit b0deea27df
124 changed files with 27376 additions and 4152 deletions

190
pkg/s3select/csv/args.go Normal file
View File

@@ -0,0 +1,190 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package csv
import (
"encoding/xml"
"fmt"
"strings"
)
// Accepted FileHeaderInfo values; input is lower-cased before comparison.
const (
	none   = "none"
	use    = "use"
	ignore = "ignore"
)

// Defaults substituted when the request leaves a CSV format element empty.
const (
	defaultRecordDelimiter      = "\n"
	defaultFieldDelimiter       = ","
	defaultQuoteCharacter       = `"`
	defaultQuoteEscapeCharacter = `"`
	defaultCommentCharacter     = "#"
)

// Accepted QuoteFields values; input is lower-cased before comparison.
const (
	always   = "always"
	asneeded = "asneeded"
)
// ReaderArgs - CSV input options carried by the <InputSerialization><CSV>
// element of the request XML.
type ReaderArgs struct {
	FileHeaderInfo             string `xml:"FileHeaderInfo"`
	RecordDelimiter            string `xml:"RecordDelimiter"`
	FieldDelimiter             string `xml:"FieldDelimiter"`
	QuoteCharacter             string `xml:"QuoteCharacter"`
	QuoteEscapeCharacter       string `xml:"QuoteEscapeCharacter"`
	CommentCharacter           string `xml:"Comments"`
	AllowQuotedRecordDelimiter bool   `xml:"AllowQuotedRecordDelimiter"`

	// unmarshaled records that UnmarshalXML accepted a CSV element.
	unmarshaled bool
}

// IsEmpty - reports true as long as the args have not been populated
// by a successful UnmarshalXML call.
func (ra *ReaderArgs) IsEmpty() bool {
	populated := ra.unmarshaled
	return !populated
}
// UnmarshalXML - decodes the <CSV> input serialization element, validates
// every option and fills in defaults for the ones left empty.
//
// Rules applied after decoding:
//   - FileHeaderInfo is lower-cased; empty means "none", otherwise it must be
//     one of none/use/ignore or an errInvalidFileHeaderInfo error is returned.
//   - RecordDelimiter may be 1 or 2 bytes long; empty means "\n".
//   - FieldDelimiter must be exactly 1 byte long; empty means ",".
//   - QuoteCharacter and QuoteEscapeCharacter only support `"`.
//   - Comments only supports "#".
//   - AllowQuotedRecordDelimiter set to true is rejected as unsupported.
//
// On any error args is left untouched; on success args is replaced by the
// normalized values and marked as populated (IsEmpty() returns false).
func (args *ReaderArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	// Decode into a method-less subtype to avoid recursive UnmarshalXML().
	type subReaderArgs ReaderArgs
	parsedArgs := subReaderArgs{}
	if err := d.DecodeElement(&parsedArgs, &start); err != nil {
		return err
	}

	parsedArgs.FileHeaderInfo = strings.ToLower(parsedArgs.FileHeaderInfo)
	switch parsedArgs.FileHeaderInfo {
	case "":
		// FileHeaderInfo is optional in the request; like the other optional
		// elements below, fall back to a default instead of failing.
		parsedArgs.FileHeaderInfo = none
	case none, use, ignore:
	default:
		return errInvalidFileHeaderInfo(fmt.Errorf("invalid FileHeaderInfo '%v'", parsedArgs.FileHeaderInfo))
	}

	// Lengths below are byte lengths, not rune counts.
	switch len(parsedArgs.RecordDelimiter) {
	case 0:
		parsedArgs.RecordDelimiter = defaultRecordDelimiter
	case 1, 2:
	default:
		return fmt.Errorf("invalid RecordDelimiter '%v'", parsedArgs.RecordDelimiter)
	}

	switch len(parsedArgs.FieldDelimiter) {
	case 0:
		parsedArgs.FieldDelimiter = defaultFieldDelimiter
	case 1:
	default:
		return fmt.Errorf("invalid FieldDelimiter '%v'", parsedArgs.FieldDelimiter)
	}

	switch parsedArgs.QuoteCharacter {
	case "":
		parsedArgs.QuoteCharacter = defaultQuoteCharacter
	case defaultQuoteCharacter:
	default:
		return fmt.Errorf("unsupported QuoteCharacter '%v'", parsedArgs.QuoteCharacter)
	}

	switch parsedArgs.QuoteEscapeCharacter {
	case "":
		parsedArgs.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
	case defaultQuoteEscapeCharacter:
	default:
		return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", parsedArgs.QuoteEscapeCharacter)
	}

	switch parsedArgs.CommentCharacter {
	case "":
		parsedArgs.CommentCharacter = defaultCommentCharacter
	case defaultCommentCharacter:
	default:
		return fmt.Errorf("unsupported Comments '%v'", parsedArgs.CommentCharacter)
	}

	if parsedArgs.AllowQuotedRecordDelimiter {
		return fmt.Errorf("flag AllowQuotedRecordDelimiter is unsupported at the moment")
	}

	*args = ReaderArgs(parsedArgs)
	args.unmarshaled = true
	return nil
}
// WriterArgs - CSV output options carried by the <OutputSerialization><CSV/>
// element of the request XML.
type WriterArgs struct {
	QuoteFields          string `xml:"QuoteFields"`
	RecordDelimiter      string `xml:"RecordDelimiter"`
	FieldDelimiter       string `xml:"FieldDelimiter"`
	QuoteCharacter       string `xml:"QuoteCharacter"`
	QuoteEscapeCharacter string `xml:"QuoteEscapeCharacter"`

	// unmarshaled records that UnmarshalXML accepted a CSV element.
	unmarshaled bool
}

// IsEmpty - reports true as long as the args have not been populated
// by a successful UnmarshalXML call.
func (wa *WriterArgs) IsEmpty() bool {
	populated := wa.unmarshaled
	return !populated
}
// UnmarshalXML - decodes the <CSV> output serialization element, checks each
// option and substitutes defaults for the empty ones. args is only modified
// when every check passes.
func (args *WriterArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	// A locally defined type has no methods, so decoding into it does not
	// re-enter this UnmarshalXML.
	type plainWriterArgs WriterArgs

	var decoded plainWriterArgs
	if err := d.DecodeElement(&decoded, &start); err != nil {
		return err
	}

	// QuoteFields: case-insensitive, empty falls back to "asneeded".
	decoded.QuoteFields = strings.ToLower(decoded.QuoteFields)
	if decoded.QuoteFields == "" {
		decoded.QuoteFields = asneeded
	} else if decoded.QuoteFields != always && decoded.QuoteFields != asneeded {
		return errInvalidQuoteFields(fmt.Errorf("invalid QuoteFields '%v'", decoded.QuoteFields))
	}

	// RecordDelimiter: at most two bytes.
	if size := len(decoded.RecordDelimiter); size == 0 {
		decoded.RecordDelimiter = defaultRecordDelimiter
	} else if size > 2 {
		return fmt.Errorf("invalid RecordDelimiter '%v'", decoded.RecordDelimiter)
	}

	// FieldDelimiter: exactly one byte.
	if size := len(decoded.FieldDelimiter); size == 0 {
		decoded.FieldDelimiter = defaultFieldDelimiter
	} else if size > 1 {
		return fmt.Errorf("invalid FieldDelimiter '%v'", decoded.FieldDelimiter)
	}

	// QuoteCharacter: only the default is supported.
	if decoded.QuoteCharacter == "" {
		decoded.QuoteCharacter = defaultQuoteCharacter
	} else if decoded.QuoteCharacter != defaultQuoteCharacter {
		return fmt.Errorf("unsupported QuoteCharacter '%v'", decoded.QuoteCharacter)
	}

	// QuoteEscapeCharacter: only the default is supported.
	if decoded.QuoteEscapeCharacter == "" {
		decoded.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
	} else if decoded.QuoteEscapeCharacter != defaultQuoteEscapeCharacter {
		return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", decoded.QuoteEscapeCharacter)
	}

	decoded.unmarshaled = true
	*args = WriterArgs(decoded)
	return nil
}

View File

@@ -0,0 +1,71 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package csv
// s3Error - error value carrying an error code, a fixed message,
// an HTTP status code and the underlying cause.
type s3Error struct {
	code       string
	message    string
	statusCode int
	cause      error
}

// Cause - returns the wrapped error that triggered this one.
func (e *s3Error) Cause() error {
	return e.cause
}

// ErrorCode - returns the error code string.
func (e *s3Error) ErrorCode() string {
	return e.code
}

// ErrorMessage - returns the error message.
func (e *s3Error) ErrorMessage() string {
	return e.message
}

// HTTPStatusCode - returns the HTTP status code stored in the error.
func (e *s3Error) HTTPStatusCode() int {
	return e.statusCode
}

// Error - implements the error interface; the text is the message only,
// the cause is not included.
func (e *s3Error) Error() string {
	return e.message
}
// errInvalidFileHeaderInfo - builds the "InvalidFileHeaderInfo" error
// (HTTP status 400) with err recorded as its cause.
func errInvalidFileHeaderInfo(err error) *s3Error {
	s3Err := new(s3Error)
	s3Err.code = "InvalidFileHeaderInfo"
	s3Err.message = "The FileHeaderInfo is invalid. Only NONE, USE, and IGNORE are supported."
	s3Err.statusCode = 400
	s3Err.cause = err
	return s3Err
}
// errInvalidQuoteFields - builds the "InvalidQuoteFields" error
// (HTTP status 400) with err recorded as its cause.
func errInvalidQuoteFields(err error) *s3Error {
	s3Err := new(s3Error)
	s3Err.code = "InvalidQuoteFields"
	s3Err.message = "The QuoteFields is invalid. Only ALWAYS and ASNEEDED are supported."
	s3Err.statusCode = 400
	s3Err.cause = err
	return s3Err
}
// errCSVParsingError - builds the "CSVParsingError" error
// (HTTP status 400) with err recorded as its cause.
func errCSVParsingError(err error) *s3Error {
	s3Err := new(s3Error)
	s3Err.code = "CSVParsingError"
	s3Err.message = "Encountered an error parsing the CSV file. Check the file and try again."
	s3Err.statusCode = 400
	s3Err.cause = err
	return s3Err
}

166
pkg/s3select/csv/reader.go Normal file
View File

@@ -0,0 +1,166 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package csv
import (
"bytes"
"encoding/csv"
"fmt"
"io"
"github.com/minio/minio/pkg/s3select/sql"
)
// recordReader - io.Reader adapter that rewrites every occurrence of
// recordDelimiter in the wrapped stream to a single '\n', so that the stream
// can be parsed by encoding/csv. ReaderArgs validation limits the delimiter
// to one or two bytes, which is what the look-ahead logic below supports.
type recordReader struct {
	reader          io.Reader // source of raw CSV bytes
	recordDelimiter []byte    // delimiter to translate to '\n'
	oneByte         []byte    // one-byte scratch buffer for the look-ahead read
	useOneByte      bool      // oneByte[0] must be emitted first on the next Read
}

// Read - fills p with data from the wrapped reader in which each record
// delimiter has been replaced by '\n'.
//
// It returns the number of translated bytes stored in p[:n]. Because a
// two-byte delimiter collapses to one byte, n may be smaller than the number
// of bytes consumed from the wrapped reader. As allowed by the io.Reader
// contract, data and an error (including io.EOF) can be returned together;
// bytes already read are never discarded because of an error.
func (rr *recordReader) Read(p []byte) (n int, err error) {
	if len(p) == 0 {
		return 0, nil
	}

	if rr.useOneByte {
		// Emit the byte consumed by the previous look-ahead before new data.
		p[0] = rr.oneByte[0]
		rr.useOneByte = false
		n, err = rr.reader.Read(p[1:])
		n++
	} else {
		n, err = rr.reader.Read(p)
	}

	delim := rr.recordDelimiter
	if n == 0 || len(delim) == 0 || string(delim) == "\n" {
		// Nothing to translate.
		return n, err
	}

	// Translate delimiters inside p[:n] only (bytes beyond n are stale) and
	// compact the data in place; dst never overtakes src.
	tailIsRaw := false // whether p[n-1] is an untranslated input byte
	src, dst := 0, 0
	for src < n {
		i := bytes.Index(p[src:n], delim)
		if i < 0 {
			dst += copy(p[dst:], p[src:n])
			tailIsRaw = true
			break
		}
		dst += copy(p[dst:], p[src:src+i])
		p[dst] = '\n'
		dst++
		src += i + len(delim)
	}
	n = dst

	// A two-byte delimiter may straddle two reads: that is only possible when
	// the last raw byte equals its first byte and the source is still healthy.
	if err != nil || len(delim) != 2 || !tailIsRaw || p[n-1] != delim[0] {
		return n, err
	}

	if len(rr.oneByte) == 0 {
		rr.oneByte = make([]byte, 1)
	}
	if _, err = io.ReadFull(rr.reader, rr.oneByte[:1]); err != nil {
		// No following byte: p[n-1] is ordinary data, hand it out with the error.
		return n, err
	}

	if rr.oneByte[0] == delim[1] {
		// The delimiter was split across reads; complete the translation.
		p[n-1] = '\n'
		return n, nil
	}

	// Not a delimiter; replay the look-ahead byte on the next call.
	rr.useOneByte = true
	return n, nil
}
// Reader - reads CSV records for S3Select.
type Reader struct {
	// args are the CSV input options this reader was created with.
	args *ReaderArgs
	// readCloser is the stream handed to NewReader; Close() closes it.
	readCloser io.ReadCloser
	// csvReader parses records out of the delimiter-translated stream.
	csvReader *csv.Reader
	// columnNames holds the header row when FileHeaderInfo is "use";
	// nil otherwise, in which case positional names are generated per record.
	columnNames []string
}
// Read - returns the next CSV record.
//
// io.EOF from the underlying CSV reader is passed through unchanged; any
// other read error is wrapped with errCSVParsingError. When no header names
// are available, columns are named "_1", "_2", ... by position.
func (r *Reader) Read() (sql.Record, error) {
	fields, err := r.csvReader.Read()
	switch {
	case err == io.EOF:
		return nil, err
	case err != nil:
		return nil, errCSVParsingError(err)
	}

	names := r.columnNames
	if names == nil {
		names = make([]string, 0, len(fields))
		for pos := 1; pos <= len(fields); pos++ {
			names = append(names, fmt.Sprintf("_%v", pos))
		}
	}

	// Column name -> position lookup used by Record.Get().
	positions := make(map[string]int64)
	for pos, name := range names {
		positions[name] = int64(pos)
	}

	record := &Record{
		columnNames:  names,
		csvRecord:    fields,
		nameIndexMap: positions,
	}
	return record, nil
}
// Close - closes the underlying reader and reports its error, if any.
func (r *Reader) Close() error {
	closeErr := r.readCloser.Close()
	return closeErr
}
// NewReader - builds a CSV Reader on top of readCloser using args.
//
// It panics when args is nil or was not populated through UnmarshalXML.
// Unless FileHeaderInfo is "none", the first record is consumed here: it
// becomes the column names for "use" and is dropped for "ignore". A failure
// to read that first record is returned as io.EOF (unchanged) or wrapped
// with errCSVParsingError.
func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
	if args == nil || args.IsEmpty() {
		panic(fmt.Errorf("empty args passed %v", args))
	}

	// Translate the configured record delimiter to '\n' before CSV parsing.
	translated := &recordReader{
		reader:          readCloser,
		recordDelimiter: []byte(args.RecordDelimiter),
		oneByte:         []byte{0},
	}

	parser := csv.NewReader(translated)
	parser.Comma = []rune(args.FieldDelimiter)[0]
	parser.Comment = []rune(args.CommentCharacter)[0]
	// Records are allowed to have a varying number of fields.
	parser.FieldsPerRecord = -1

	result := &Reader{
		args:       args,
		readCloser: readCloser,
		csvReader:  parser,
	}

	switch args.FileHeaderInfo {
	case none:
		return result, nil
	}

	header, err := parser.Read()
	switch {
	case err == io.EOF:
		return nil, err
	case err != nil:
		return nil, errCSVParsingError(err)
	}

	if args.FileHeaderInfo == use {
		result.columnNames = header
	}
	return result, nil
}

View File

@@ -0,0 +1,95 @@
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package csv
import (
"bytes"
"encoding/csv"
"fmt"
"github.com/minio/minio/pkg/s3select/sql"
"github.com/tidwall/sjson"
)
// Record - a single CSV record.
type Record struct {
	// columnNames lists the column names by position.
	columnNames []string
	// csvRecord holds the field values by position.
	csvRecord []string
	// nameIndexMap maps a column name to its position; consulted by Get().
	nameIndexMap map[string]int64
}
// Get - looks up the value stored under the given column name.
//
// An unknown name yields an error. A known name whose position lies beyond
// the values present in this record yields an empty string for compatibility.
func (r *Record) Get(name string) (*sql.Value, error) {
	pos, known := r.nameIndexMap[name]
	if !known {
		return nil, fmt.Errorf("column %v not found", name)
	}

	field := ""
	if pos < int64(len(r.csvRecord)) {
		field = r.csvRecord[pos]
	}
	return sql.NewString(field), nil
}
// Set - appends a column with the given name and the CSV string form of
// value to the record. It always returns nil. The name lookup map used by
// Get() is not updated here.
func (r *Record) Set(name string, value *sql.Value) error {
	r.columnNames = append(r.columnNames, name)
	encoded := value.CSVString()
	r.csvRecord = append(r.csvRecord, encoded)
	return nil
}
// MarshalCSV - encodes the record values as one CSV line using
// fieldDelimiter as the separator. The final byte written by the CSV
// writer (the line terminator) is removed from the result.
func (r *Record) MarshalCSV(fieldDelimiter rune) ([]byte, error) {
	var out bytes.Buffer

	writer := csv.NewWriter(&out)
	writer.Comma = fieldDelimiter
	if err := writer.Write(r.csvRecord); err != nil {
		return nil, err
	}

	// Write() only buffers; Flush() pushes the line into out and Error()
	// reports any failure that happened on the way.
	writer.Flush()
	if err := writer.Error(); err != nil {
		return nil, err
	}

	out.Truncate(out.Len() - 1)
	return out.Bytes(), nil
}
// MarshalJSON - encodes the record as a JSON object keyed by column name.
//
// Columns are set from the last position to the first; column names that
// have no corresponding value in the record are skipped.
// NOTE(review): the column name is handed to sjson.Set as its path argument,
// so names containing sjson path syntax may not be stored verbatim - confirm
// against the sjson documentation.
func (r *Record) MarshalJSON() ([]byte, error) {
	// Only positions that have both a name and a value take part.
	count := len(r.columnNames)
	if len(r.csvRecord) < count {
		count = len(r.csvRecord)
	}

	doc := "{}"
	for pos := count - 1; pos >= 0; pos-- {
		updated, err := sjson.Set(doc, r.columnNames[pos], r.csvRecord[pos])
		if err != nil {
			return nil, err
		}
		doc = updated
	}

	return []byte(doc), nil
}
// NewRecord - returns a pointer to an empty CSV record.
func NewRecord() *Record {
	return new(Record)
}