mirror of
https://github.com/minio/minio.git
synced 2025-11-10 05:59:43 -05:00
Refactor s3select to support parquet. (#7023)
Also handle pretty formatted JSON documents.
This commit is contained in:
190
pkg/s3select/csv/args.go
Normal file
190
pkg/s3select/csv/args.go
Normal file
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package csv
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Accepted FileHeaderInfo values; requests are lower-cased before comparison.
const (
	none   = "none"
	use    = "use"
	ignore = "ignore"
)

// Values substituted when the request leaves the corresponding element empty.
const (
	defaultRecordDelimiter      = "\n"
	defaultFieldDelimiter       = ","
	defaultQuoteCharacter       = "\""
	defaultQuoteEscapeCharacter = "\""
	defaultCommentCharacter     = "#"
)

// Accepted QuoteFields values; requests are lower-cased before comparison.
const (
	always   = "always"
	asneeded = "asneeded"
)
|
||||
|
||||
// ReaderArgs - represents elements inside <InputSerialization><CSV> in request XML.
type ReaderArgs struct {
	// FileHeaderInfo - how the first line of input is treated; lower-cased
	// and validated by UnmarshalXML.
	FileHeaderInfo string `xml:"FileHeaderInfo"`

	// RecordDelimiter - separator between records.
	RecordDelimiter string `xml:"RecordDelimiter"`

	// FieldDelimiter - separator between fields of a record.
	FieldDelimiter string `xml:"FieldDelimiter"`

	// QuoteCharacter - character used to quote fields.
	QuoteCharacter string `xml:"QuoteCharacter"`

	// QuoteEscapeCharacter - character used to escape a quote inside a quoted field.
	QuoteEscapeCharacter string `xml:"QuoteEscapeCharacter"`

	// CommentCharacter - taken from the <Comments> element.
	CommentCharacter string `xml:"Comments"`

	// AllowQuotedRecordDelimiter - rejected by UnmarshalXML when true.
	AllowQuotedRecordDelimiter bool `xml:"AllowQuotedRecordDelimiter"`

	// unmarshaled - set once UnmarshalXML has succeeded; see IsEmpty.
	unmarshaled bool
}
|
||||
|
||||
// IsEmpty - returns whether reader args is empty or not.
|
||||
func (args *ReaderArgs) IsEmpty() bool {
|
||||
return !args.unmarshaled
|
||||
}
|
||||
|
||||
// UnmarshalXML - decodes XML data.
|
||||
func (args *ReaderArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
||||
// Make subtype to avoid recursive UnmarshalXML().
|
||||
type subReaderArgs ReaderArgs
|
||||
parsedArgs := subReaderArgs{}
|
||||
if err := d.DecodeElement(&parsedArgs, &start); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
parsedArgs.FileHeaderInfo = strings.ToLower(parsedArgs.FileHeaderInfo)
|
||||
switch parsedArgs.FileHeaderInfo {
|
||||
case none, use, ignore:
|
||||
default:
|
||||
return errInvalidFileHeaderInfo(fmt.Errorf("invalid FileHeaderInfo '%v'", parsedArgs.FileHeaderInfo))
|
||||
}
|
||||
|
||||
switch len(parsedArgs.RecordDelimiter) {
|
||||
case 0:
|
||||
parsedArgs.RecordDelimiter = defaultRecordDelimiter
|
||||
case 1, 2:
|
||||
default:
|
||||
return fmt.Errorf("invalid RecordDelimiter '%v'", parsedArgs.RecordDelimiter)
|
||||
}
|
||||
|
||||
switch len(parsedArgs.FieldDelimiter) {
|
||||
case 0:
|
||||
parsedArgs.FieldDelimiter = defaultFieldDelimiter
|
||||
case 1:
|
||||
default:
|
||||
return fmt.Errorf("invalid FieldDelimiter '%v'", parsedArgs.FieldDelimiter)
|
||||
}
|
||||
|
||||
switch parsedArgs.QuoteCharacter {
|
||||
case "":
|
||||
parsedArgs.QuoteCharacter = defaultQuoteCharacter
|
||||
case defaultQuoteCharacter:
|
||||
default:
|
||||
return fmt.Errorf("unsupported QuoteCharacter '%v'", parsedArgs.QuoteCharacter)
|
||||
}
|
||||
|
||||
switch parsedArgs.QuoteEscapeCharacter {
|
||||
case "":
|
||||
parsedArgs.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
|
||||
case defaultQuoteEscapeCharacter:
|
||||
default:
|
||||
return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", parsedArgs.QuoteEscapeCharacter)
|
||||
}
|
||||
|
||||
switch parsedArgs.CommentCharacter {
|
||||
case "":
|
||||
parsedArgs.CommentCharacter = defaultCommentCharacter
|
||||
case defaultCommentCharacter:
|
||||
default:
|
||||
return fmt.Errorf("unsupported Comments '%v'", parsedArgs.CommentCharacter)
|
||||
}
|
||||
|
||||
if parsedArgs.AllowQuotedRecordDelimiter {
|
||||
return fmt.Errorf("flag AllowQuotedRecordDelimiter is unsupported at the moment")
|
||||
}
|
||||
|
||||
*args = ReaderArgs(parsedArgs)
|
||||
args.unmarshaled = true
|
||||
return nil
|
||||
}
|
||||
|
||||
// WriterArgs - represents elements inside <OutputSerialization><CSV/> in request XML.
type WriterArgs struct {
	// QuoteFields - when output fields are quoted; lower-cased and
	// validated by UnmarshalXML.
	QuoteFields string `xml:"QuoteFields"`

	// RecordDelimiter - separator between records.
	RecordDelimiter string `xml:"RecordDelimiter"`

	// FieldDelimiter - separator between fields of a record.
	FieldDelimiter string `xml:"FieldDelimiter"`

	// QuoteCharacter - character used to quote fields.
	QuoteCharacter string `xml:"QuoteCharacter"`

	// QuoteEscapeCharacter - character used to escape a quote inside a quoted field.
	QuoteEscapeCharacter string `xml:"QuoteEscapeCharacter"`

	// unmarshaled - set once UnmarshalXML has succeeded; see IsEmpty.
	unmarshaled bool
}
|
||||
|
||||
// IsEmpty - returns whether writer args is empty or not.
|
||||
func (args *WriterArgs) IsEmpty() bool {
|
||||
return !args.unmarshaled
|
||||
}
|
||||
|
||||
// UnmarshalXML - decodes XML data.
|
||||
func (args *WriterArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
||||
// Make subtype to avoid recursive UnmarshalXML().
|
||||
type subWriterArgs WriterArgs
|
||||
parsedArgs := subWriterArgs{}
|
||||
if err := d.DecodeElement(&parsedArgs, &start); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
parsedArgs.QuoteFields = strings.ToLower(parsedArgs.QuoteFields)
|
||||
switch parsedArgs.QuoteFields {
|
||||
case "":
|
||||
parsedArgs.QuoteFields = asneeded
|
||||
case always, asneeded:
|
||||
default:
|
||||
return errInvalidQuoteFields(fmt.Errorf("invalid QuoteFields '%v'", parsedArgs.QuoteFields))
|
||||
}
|
||||
|
||||
switch len(parsedArgs.RecordDelimiter) {
|
||||
case 0:
|
||||
parsedArgs.RecordDelimiter = defaultRecordDelimiter
|
||||
case 1, 2:
|
||||
default:
|
||||
return fmt.Errorf("invalid RecordDelimiter '%v'", parsedArgs.RecordDelimiter)
|
||||
}
|
||||
|
||||
switch len(parsedArgs.FieldDelimiter) {
|
||||
case 0:
|
||||
parsedArgs.FieldDelimiter = defaultFieldDelimiter
|
||||
case 1:
|
||||
default:
|
||||
return fmt.Errorf("invalid FieldDelimiter '%v'", parsedArgs.FieldDelimiter)
|
||||
}
|
||||
|
||||
switch parsedArgs.QuoteCharacter {
|
||||
case "":
|
||||
parsedArgs.QuoteCharacter = defaultQuoteCharacter
|
||||
case defaultQuoteCharacter:
|
||||
default:
|
||||
return fmt.Errorf("unsupported QuoteCharacter '%v'", parsedArgs.QuoteCharacter)
|
||||
}
|
||||
|
||||
switch parsedArgs.QuoteEscapeCharacter {
|
||||
case "":
|
||||
parsedArgs.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
|
||||
case defaultQuoteEscapeCharacter:
|
||||
default:
|
||||
return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", parsedArgs.QuoteEscapeCharacter)
|
||||
}
|
||||
|
||||
*args = WriterArgs(parsedArgs)
|
||||
args.unmarshaled = true
|
||||
return nil
|
||||
}
|
||||
71
pkg/s3select/csv/errors.go
Normal file
71
pkg/s3select/csv/errors.go
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package csv
|
||||
|
||||
// s3Error - error value carrying an error code, a message, an HTTP status
// code and the underlying error that triggered it.
type s3Error struct {
	code       string // returned by ErrorCode
	message    string // returned by ErrorMessage and Error
	statusCode int    // returned by HTTPStatusCode
	cause      error  // returned by Cause
}

// Cause - returns the underlying error.
func (e *s3Error) Cause() error { return e.cause }

// ErrorCode - returns the error code.
func (e *s3Error) ErrorCode() string { return e.code }

// ErrorMessage - returns the error message.
func (e *s3Error) ErrorMessage() string { return e.message }

// HTTPStatusCode - returns the HTTP status code.
func (e *s3Error) HTTPStatusCode() int { return e.statusCode }

// Error - implements the error interface; it returns the message only and
// does not include the cause.
func (e *s3Error) Error() string { return e.message }
|
||||
|
||||
func errInvalidFileHeaderInfo(err error) *s3Error {
|
||||
return &s3Error{
|
||||
code: "InvalidFileHeaderInfo",
|
||||
message: "The FileHeaderInfo is invalid. Only NONE, USE, and IGNORE are supported.",
|
||||
statusCode: 400,
|
||||
cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
func errInvalidQuoteFields(err error) *s3Error {
|
||||
return &s3Error{
|
||||
code: "InvalidQuoteFields",
|
||||
message: "The QuoteFields is invalid. Only ALWAYS and ASNEEDED are supported.",
|
||||
statusCode: 400,
|
||||
cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
func errCSVParsingError(err error) *s3Error {
|
||||
return &s3Error{
|
||||
code: "CSVParsingError",
|
||||
message: "Encountered an error parsing the CSV file. Check the file and try again.",
|
||||
statusCode: 400,
|
||||
cause: err,
|
||||
}
|
||||
}
|
||||
166
pkg/s3select/csv/reader.go
Normal file
166
pkg/s3select/csv/reader.go
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package csv
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
)
|
||||
|
||||
// recordReader - wraps an io.Reader and rewrites every occurrence of the
// record delimiter into a single '\n', so the stream can be handed to
// encoding/csv, which terminates records on newlines.
//
// Delimiters of one or two bytes are supported. A two byte delimiter may be
// split across two reads of the underlying reader; oneByte/useOneByte hold
// the look-ahead byte needed to resolve that case.
type recordReader struct {
	reader          io.Reader // underlying data source
	recordDelimiter []byte    // delimiter to translate into '\n'
	oneByte         []byte    // single byte look-ahead buffer
	useOneByte      bool      // oneByte holds a byte not yet handed to the caller
}

// Read - fills p from the underlying reader with record delimiters replaced
// by '\n'. It returns the number of bytes of p that are valid together with
// any error from the underlying reader; as allowed by the io.Reader
// contract, n may be greater than zero when err is non-nil.
func (rr *recordReader) Read(p []byte) (n int, err error) {
	if len(p) == 0 {
		// Nothing can be stored; do not touch the pending look-ahead byte.
		return 0, nil
	}

	if rr.useOneByte {
		// Replay the look-ahead byte consumed by the previous call.
		p[0] = rr.oneByte[0]
		rr.useOneByte = false
		n = 1
		if len(p) > 1 {
			var m int
			m, err = rr.reader.Read(p[1:])
			n += m
		}
	} else {
		n, err = rr.reader.Read(p)
	}

	delim := rr.recordDelimiter
	if n == 0 || len(delim) == 0 || string(delim) == "\n" {
		return n, err
	}

	// Translate only the bytes that were actually read. Searching resumes
	// after each replacement so an inserted '\n' is never matched again.
	data := p[:n]
	off := 0
	for {
		i := bytes.Index(data[off:], delim)
		if i < 0 {
			break
		}
		i += off

		data[i] = '\n'
		if len(delim) > 1 {
			// Close the gap left by the rest of the delimiter; this stays
			// within p because the result is shorter than data.
			data = append(data[:i+1], data[i+len(delim):]...)
		}
		off = i + 1
	}
	n = len(data)

	// Look-ahead is needed only when a multi-byte delimiter may have been
	// cut in half: the last byte is an untranslated first delimiter byte
	// and the underlying reader has not reported an error.
	if err != nil || len(delim) == 1 || off == n || data[n-1] != delim[0] {
		return n, err
	}

	if len(rr.oneByte) == 0 {
		rr.oneByte = make([]byte, 1)
	}
	if _, err = io.ReadFull(rr.reader, rr.oneByte[:1]); err != nil {
		// No byte follows; deliver what was read together with the error.
		return n, err
	}

	if rr.oneByte[0] == delim[1] {
		// The delimiter straddled two reads; its second byte is dropped.
		data[n-1] = '\n'
		return n, nil
	}

	// Not a delimiter after all: keep the byte for the next call.
	rr.useOneByte = true
	return n, nil
}
|
||||
|
||||
// Reader - CSV record reader for S3Select.
|
||||
type Reader struct {
|
||||
args *ReaderArgs
|
||||
readCloser io.ReadCloser
|
||||
csvReader *csv.Reader
|
||||
columnNames []string
|
||||
}
|
||||
|
||||
// Read - reads single record.
|
||||
func (r *Reader) Read() (sql.Record, error) {
|
||||
csvRecord, err := r.csvReader.Read()
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
return nil, errCSVParsingError(err)
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
columnNames := r.columnNames
|
||||
if columnNames == nil {
|
||||
columnNames = make([]string, len(csvRecord))
|
||||
for i := range csvRecord {
|
||||
columnNames[i] = fmt.Sprintf("_%v", i+1)
|
||||
}
|
||||
}
|
||||
|
||||
nameIndexMap := make(map[string]int64)
|
||||
for i := range columnNames {
|
||||
nameIndexMap[columnNames[i]] = int64(i)
|
||||
}
|
||||
|
||||
return &Record{
|
||||
columnNames: columnNames,
|
||||
csvRecord: csvRecord,
|
||||
nameIndexMap: nameIndexMap,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Close - closes underlaying reader.
|
||||
func (r *Reader) Close() error {
|
||||
return r.readCloser.Close()
|
||||
}
|
||||
|
||||
// NewReader - creates new CSV reader using readCloser.
|
||||
func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
|
||||
if args == nil || args.IsEmpty() {
|
||||
panic(fmt.Errorf("empty args passed %v", args))
|
||||
}
|
||||
|
||||
csvReader := csv.NewReader(&recordReader{
|
||||
reader: readCloser,
|
||||
recordDelimiter: []byte(args.RecordDelimiter),
|
||||
oneByte: []byte{0},
|
||||
})
|
||||
csvReader.Comma = []rune(args.FieldDelimiter)[0]
|
||||
csvReader.Comment = []rune(args.CommentCharacter)[0]
|
||||
csvReader.FieldsPerRecord = -1
|
||||
|
||||
r := &Reader{
|
||||
args: args,
|
||||
readCloser: readCloser,
|
||||
csvReader: csvReader,
|
||||
}
|
||||
|
||||
if args.FileHeaderInfo == none {
|
||||
return r, nil
|
||||
}
|
||||
|
||||
record, err := csvReader.Read()
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
return nil, errCSVParsingError(err)
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if args.FileHeaderInfo == use {
|
||||
r.columnNames = record
|
||||
}
|
||||
|
||||
return r, nil
|
||||
}
|
||||
95
pkg/s3select/csv/record.go
Normal file
95
pkg/s3select/csv/record.go
Normal file
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package csv
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
"github.com/tidwall/sjson"
|
||||
)
|
||||
|
||||
// Record - is CSV record.
type Record struct {
	// columnNames - column names, by position.
	columnNames []string

	// csvRecord - field values, by position.
	csvRecord []string

	// nameIndexMap - column name to position, used by Get.
	nameIndexMap map[string]int64
}
|
||||
|
||||
// Get - gets the value for a column name.
|
||||
func (r *Record) Get(name string) (*sql.Value, error) {
|
||||
index, found := r.nameIndexMap[name]
|
||||
if !found {
|
||||
return nil, fmt.Errorf("column %v not found", name)
|
||||
}
|
||||
|
||||
if index >= int64(len(r.csvRecord)) {
|
||||
// No value found for column 'name', hence return empty string for compatibility.
|
||||
return sql.NewString(""), nil
|
||||
}
|
||||
|
||||
return sql.NewString(r.csvRecord[index]), nil
|
||||
}
|
||||
|
||||
// Set - sets the value for a column name.
|
||||
func (r *Record) Set(name string, value *sql.Value) error {
|
||||
r.columnNames = append(r.columnNames, name)
|
||||
r.csvRecord = append(r.csvRecord, value.CSVString())
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarshalCSV - encodes to CSV data.
|
||||
func (r *Record) MarshalCSV(fieldDelimiter rune) ([]byte, error) {
|
||||
buf := new(bytes.Buffer)
|
||||
w := csv.NewWriter(buf)
|
||||
w.Comma = fieldDelimiter
|
||||
if err := w.Write(r.csvRecord); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
w.Flush()
|
||||
if err := w.Error(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data := buf.Bytes()
|
||||
return data[:len(data)-1], nil
|
||||
}
|
||||
|
||||
// MarshalJSON - encodes to JSON data.
|
||||
func (r *Record) MarshalJSON() ([]byte, error) {
|
||||
data := "{}"
|
||||
|
||||
var err error
|
||||
for i := len(r.columnNames) - 1; i >= 0; i-- {
|
||||
if i >= len(r.csvRecord) {
|
||||
continue
|
||||
}
|
||||
|
||||
if data, err = sjson.Set(data, r.columnNames[i], r.csvRecord[i]); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return []byte(data), nil
|
||||
}
|
||||
|
||||
// NewRecord - creates new CSV record.
|
||||
func NewRecord() *Record {
|
||||
return &Record{}
|
||||
}
|
||||
Reference in New Issue
Block a user